def test_get_upload_content():
    """get_upload_content returns (None, None) for an unknown PMID and the
    stored (content, content_type) pair after an abstract or full text has
    been uploaded."""
    # Nothing has been stored under this PMID
    content, content_type = s3_client.get_upload_content('PMID000foobar')
    assert content is None
    assert content_type is None

    # Abstract only
    pmid_abstract = 'PMID000test4'
    s3_client.put_abstract(pmid_abstract, 'foo')
    content, content_type = s3_client.get_upload_content(pmid_abstract)
    assert content == 'foo'
    assert content_type == 'abstract'

    # Plain-text full text
    pmid_fulltext = 'PMID000test5'
    s3_client.put_full_text(pmid_fulltext, 'foo', full_text_type='txt')
    content, content_type = s3_client.get_upload_content(pmid_fulltext)
    assert content == 'foo'
    assert content_type == 'txt'
def test_get_upload_content():
    """get_upload_content returns (None, None) for an unknown PMID and the
    stored (content, content_type) pair after an abstract or full text has
    been uploaded."""
    # Nothing stored under this PMID: both values should be None.
    # Fixed: compare to None with `is`, not `==` (PEP 8 E711); this also
    # matches the sibling copy of this test elsewhere in the file.
    pmid_s3_no_content = 'PMID000foobar'
    (ct, ct_type) = s3_client.get_upload_content(pmid_s3_no_content)
    assert ct is None
    assert ct_type is None
    # Abstract only
    pmid_s3_abstract_only = 'PMID000test4'
    s3_client.put_abstract(pmid_s3_abstract_only, 'foo')
    (ct, ct_type) = s3_client.get_upload_content(pmid_s3_abstract_only)
    assert ct == 'foo'
    assert ct_type == 'abstract'
    # Plain-text full text
    pmid_s3_fulltext = 'PMID000test5'
    s3_client.put_full_text(pmid_s3_fulltext, 'foo', full_text_type='txt')
    (ct, ct_type) = s3_client.get_upload_content(pmid_s3_fulltext)
    assert ct == 'foo'
    assert ct_type == 'txt'
def get_text():
    """Fetch content for `pmid` from S3, write it into `input_dir`, and
    return {pmid: {'content_source': ..., 'content_path': ...}}.

    Relies on enclosing-scope names: pmid, force_fulltext, input_dir,
    s3_client, elsevier_client. content_path stays None when nothing
    usable was retrieved.
    """
    # Brief delay to avoid hammering PubMed
    time.sleep(0.5)
    content, content_type = s3_client.get_upload_content(
        pmid, force_fulltext_lookup=force_fulltext)
    content_path = None
    # File extension for each content type written to disk as-is
    ext_by_type = {'pmc_oa_xml': 'nxml',
                   'pmc_auth_xml': 'nxml',
                   'pmc_oa_txt': 'txt',
                   'txt': 'txt',
                   'abstract': 'txt'}
    if content is None or content_type is None:
        # No content found on S3, skipping
        content_source = 'content_not_found'
    elif content_type == 'elsevier_xml':
        # Elsevier XML must be reduced to plain text before writing
        content = elsevier_client.extract_text(content)
        if content is None:
            # Couldn't get text from Elsevier XML
            content_source = 'elsevier_extract_text_failure'
        else:
            content_source = 'elsevier_xml'
            content_path = os.path.join(input_dir, '%s.txt' % pmid)
    elif content_type in ext_by_type:
        content_source = content_type
        content_path = os.path.join(
            input_dir, '%s.%s' % (pmid, ext_by_type[content_type]))
    else:
        # Unhandled content type, skipping
        content_source = 'unhandled_content_type_%s' % content_type
    # If we got usable content, write it out UTF-8 encoded
    if content_path:
        with open(content_path, 'wb') as f:
            f.write(content.encode('utf-8'))
    return {pmid: {'content_source': content_source,
                   'content_path': content_path}}
print(usage) sys.exit() if len(sys.argv) == 5 and sys.argv[4] != 'force_fulltext': print(usage) sys.exit() elif len(sys.argv) == 5: force_fulltext = True else: force_fulltext = False logger = logging.getLogger('indra_reading.scripts.starcluster_reading.' 'upload_content') pmid_list = sys.argv[1] start_index = int(sys.argv[2]) end_index = int(sys.argv[3]) with open(pmid_list) as f: pmids = [line.strip('\n') for line in f.readlines()] # Search for full text and abstract on S3 and store info about what needs # to be uploaded (fulltext, abstract) in a list of tuples. pmids_to_upload = [] logger.info("-- Checking for %d full texts --" % len(pmids)) if end_index > len(pmids): end_index = len(pmids) for ix, pmid in enumerate(pmids[start_index:end_index]): logger.info("--- %d: %s ---" % ((start_index + ix), pmid)) s3_client.get_upload_content(pmid, force_fulltext_lookup=force_fulltext)
# Now iterate over the pmids to read and download from S3 to the input # directory num_pmc_oa_xml = 0 num_pmc_auth_xml = 0 num_txt = 0 num_elsevier_xml = 0 num_abstract = 0 num_not_found = 0 num_elsevier_xml_fail = 0 # Keep a map of the content type we've downloaded for each PMID text_sources = {} content_not_found = [] for pmid in pmids_to_read: full_pmid = s3_client.check_pmid(pmid) # Look for the full text (content, content_type) = s3_client.get_upload_content( pmid, force_fulltext_lookup=force_fulltext) # If we don't find the XML on S3, look for it using the PMC client #if xml: # num_found_s3 += 1 #else: # logger.info('No content for %s from S3' % pmid) # (content, content_type) = get_full_text(pmid, 'pmid') # if content_type == 'nxml': # logger.info('Found nxml for %s from PMC web service' % pmid) # xml = content # num_found_not_s3 += 1 # # Upload the xml to S3 for next time # logger.info('Uploading full text for %s to S3' % pmid) # s3_client.put_full_text(pmid, xml, full_text_type='pmc_oa_xml') # #elif content_type == 'abstract': # # logger.info('Found abstract for %s' % pmid)
# Command-line handling: <pmid_list> <start_index> <end_index>
# [force_fulltext]. Anything else prints `usage` and exits.
argc = len(sys.argv)
if argc < 4 or argc > 5:
    print(usage)
    sys.exit()
if argc == 5 and sys.argv[4] != 'force_fulltext':
    print(usage)
    sys.exit()
# A valid fifth argument can only be the literal 'force_fulltext'
force_fulltext = (argc == 5)

logger = logging.getLogger('upload_content')

pmid_list = sys.argv[1]
start_index = int(sys.argv[2])
end_index = int(sys.argv[3])

# One PMID per line in the input file
with open(pmid_list) as f:
    pmids = [line.strip('\n') for line in f]

# Search for full text and abstract on S3 and store info about what needs
# to be uploaded (fulltext, abstract) in a list of tuples.
pmids_to_upload = []
logger.info("-- Checking for %d full texts --" % len(pmids))
end_index = min(end_index, len(pmids))
for ix, pmid in enumerate(pmids[start_index:end_index], start=start_index):
    logger.info("--- %d: %s ---" % (ix, pmid))
    s3_client.get_upload_content(pmid,
                                 force_fulltext_lookup=force_fulltext)
# Download each PMID's content from S3 into the input directory.
# Per-content-source tallies; ints, so chained init is safe.
num_pmc_oa_xml = num_pmc_auth_xml = num_txt = 0
num_elsevier_xml = num_abstract = 0
num_not_found = num_elsevier_xml_fail = 0
# Map each PMID to the content type we've downloaded for it
text_sources = dict()
content_not_found = list()
for pmid in pmids_to_read:
    full_pmid = s3_client.check_pmid(pmid)
    # Look for the full text
    content, content_type = s3_client.get_upload_content(
        pmid, force_fulltext_lookup=force_fulltext)
    # If we don't find the XML on S3, look for it using the PMC client
    #if xml:
    #    num_found_s3 += 1
    #else:
    #    logger.info('No content for %s from S3' % pmid)
    #    (content, content_type) = get_full_text(pmid, 'pmid')
    #    if content_type == 'nxml':
    #        logger.info('Found nxml for %s from PMC web service' % pmid)
    #        xml = content
    #        num_found_not_s3 += 1
    #        # Upload the xml to S3 for next time
    #        logger.info('Uploading full text for %s to S3' % pmid)
    #        s3_client.put_full_text(pmid, xml, full_text_type='pmc_oa_xml')
    #    #elif content_type == 'abstract':
    #    #    logger.info('Found abstract for %s' % pmid)