Example no. 1
0
def test_get_upload_content():
    """get_upload_content returns a (content, content_type) pair per case."""
    # Case 1: nothing stored under this PMID -> both values come back None.
    missing_pmid = 'PMID000foobar'
    content, content_type = s3_client.get_upload_content(missing_pmid)
    assert content is None
    assert content_type is None

    # Case 2: only an abstract was uploaded -> tagged as 'abstract'.
    abstract_pmid = 'PMID000test4'
    s3_client.put_abstract(abstract_pmid, 'foo')
    content, content_type = s3_client.get_upload_content(abstract_pmid)
    assert content == 'foo'
    assert content_type == 'abstract'

    # Case 3: plain-text full text was uploaded -> tagged as 'txt'.
    fulltext_pmid = 'PMID000test5'
    s3_client.put_full_text(fulltext_pmid, 'foo', full_text_type='txt')
    content, content_type = s3_client.get_upload_content(fulltext_pmid)
    assert content == 'foo'
    assert content_type == 'txt'
Example no. 2
0
def test_get_upload_content():
    """Test get_upload_content for missing, abstract-only and full-text PMIDs."""
    # No content stored for this PMID: both return values should be None.
    pmid_s3_no_content = 'PMID000foobar'
    (ct, ct_type) = s3_client.get_upload_content(pmid_s3_no_content)
    # Fix: compare against None with `is`, not `==` (PEP 8, pycodestyle E711).
    assert ct is None
    assert ct_type is None

    # Only an abstract uploaded: content is tagged as 'abstract'.
    pmid_s3_abstract_only = 'PMID000test4'
    s3_client.put_abstract(pmid_s3_abstract_only, 'foo')
    (ct, ct_type) = s3_client.get_upload_content(pmid_s3_abstract_only)
    assert ct == 'foo'
    assert ct_type == 'abstract'

    # Plain-text full text uploaded: content is tagged as 'txt'.
    pmid_s3_fulltext = 'PMID000test5'
    s3_client.put_full_text(pmid_s3_fulltext, 'foo', full_text_type='txt')
    (ct, ct_type) = s3_client.get_upload_content(pmid_s3_fulltext)
    assert ct == 'foo'
    assert ct_type == 'txt'
Example no. 3
0
 def get_text():
     """Fetch content for `pmid` from S3 and stage it under `input_dir`.

     Returns {pmid: {'content_source': ..., 'content_path': ...}} where
     content_path is None whenever no usable content was written out.
     """
     # Add timeout here for PubMed
     time.sleep(0.5)
     # full_pmid = s3_client.check_pmid(pmid)
     # Look for the full text
     content, content_type = s3_client.get_upload_content(
         pmid, force_fulltext_lookup=force_fulltext)
     # File extension used for each content type that needs no conversion.
     ext_by_type = {
         'pmc_oa_xml': 'nxml',
         'pmc_auth_xml': 'nxml',
         'pmc_oa_txt': 'txt',
         'txt': 'txt',
         'abstract': 'txt',
     }
     content_path = None
     if content is None or content_type is None:
         # No content found on S3, skipping
         content_source = 'content_not_found'
     elif content_type == 'elsevier_xml':
         # Elsevier XML needs a text-extraction pass before it is usable.
         content = elsevier_client.extract_text(content)
         if content is None:
             # Couldn't get text from Elsevier XML
             content_source = 'elsevier_extract_text_failure'
         else:
             content_source = 'elsevier_xml'
             content_path = os.path.join(input_dir, '%s.txt' % pmid)
     elif content_type in ext_by_type:
         # Directly usable content: the source label is the type itself.
         content_source = content_type
         content_path = os.path.join(
             input_dir, '%s.%s' % (pmid, ext_by_type[content_type]))
     else:
         # Unhandled content type, skipping
         content_source = 'unhandled_content_type_%s' % content_type
     # If we got usable content, write it to a file with the chosen extension.
     if content_path:
         with open(content_path, 'wb') as f:
             # The XML string is Unicode
             f.write(content.encode('utf-8'))
     # Return dict of results for this PMID
     return {
         pmid: {
             'content_source': content_source,
             'content_path': content_path
         }
     }
Example no. 4
0
        print(usage)
        sys.exit()
    # The optional 5th CLI argument, when present, must be the literal
    # string 'force_fulltext'; anything else is a usage error.
    if len(sys.argv) == 5 and sys.argv[4] != 'force_fulltext':
        print(usage)
        sys.exit()
    elif len(sys.argv) == 5:
        force_fulltext = True
    else:
        force_fulltext = False

    logger = logging.getLogger('indra_reading.scripts.starcluster_reading.'
                               'upload_content')

    # CLI args: path to a file of PMIDs, then the [start, end) slice to process.
    pmid_list = sys.argv[1]
    start_index = int(sys.argv[2])
    end_index = int(sys.argv[3])

    # One PMID per line; strip only the trailing newline.
    with open(pmid_list) as f:
        pmids = [line.strip('\n') for line in f.readlines()]

    # Search for full text and abstract on S3 and store info about what needs
    # to be uploaded (fulltext, abstract) in a list of tuples.
    pmids_to_upload = []
    logger.info("-- Checking for %d full texts --" % len(pmids))
    # Clamp the slice end to the number of PMIDs actually read.
    if end_index > len(pmids):
        end_index = len(pmids)
    for ix, pmid in enumerate(pmids[start_index:end_index]):
        logger.info("--- %d: %s ---" % ((start_index + ix), pmid))
        # NOTE(review): the (content, content_type) return value is discarded
        # here — presumably only the lookup's side effects matter; confirm.
        s3_client.get_upload_content(pmid,
                                     force_fulltext_lookup=force_fulltext)
Example no. 5
0
 # Now iterate over the pmids to read and download from S3 to the input
 # directory
 # Per-content-type tallies, for a summary of what was retrieved.
 num_pmc_oa_xml = 0
 num_pmc_auth_xml = 0
 num_txt = 0
 num_elsevier_xml = 0
 num_abstract = 0
 num_not_found = 0
 num_elsevier_xml_fail = 0
 # Keep a map of the content type we've downloaded for each PMID
 text_sources = {}
 # PMIDs for which no content of any kind could be retrieved.
 content_not_found = []
 for pmid in pmids_to_read:
     full_pmid = s3_client.check_pmid(pmid)
     # Look for the full text
     (content, content_type) = s3_client.get_upload_content(
         pmid, force_fulltext_lookup=force_fulltext)
     # If we don't find the XML on S3, look for it using the PMC client
     # NOTE(review): the fallback below is disabled; PMC web-service lookup
     # and re-upload to S3 are kept here for reference only.
     #if xml:
     #    num_found_s3 += 1
     #else:
     #    logger.info('No content for %s from S3' % pmid)
     #    (content, content_type) = get_full_text(pmid, 'pmid')
     #    if content_type == 'nxml':
     #        logger.info('Found nxml for %s from PMC web service' % pmid)
     #        xml = content
     #        num_found_not_s3 += 1
     #        # Upload the xml to S3 for next time
     #        logger.info('Uploading full text for %s to S3' % pmid)
     #        s3_client.put_full_text(pmid, xml, full_text_type='pmc_oa_xml')
     #    #elif content_type == 'abstract':
     #    #    logger.info('Found abstract for %s' % pmid)
Example no. 6
0
    # The script takes 3 required args plus an optional 'force_fulltext' flag.
    if len(sys.argv) < 4 or len(sys.argv) > 5:
        print(usage)
        sys.exit()
    # The optional 5th argument, when present, must be exactly 'force_fulltext'.
    if len(sys.argv) == 5 and sys.argv[4] != 'force_fulltext':
        print(usage)
        sys.exit()
    elif len(sys.argv) == 5:
        force_fulltext = True
    else:
        force_fulltext = False

    logger = logging.getLogger('upload_content')

    # CLI args: path to a file of PMIDs, then the [start, end) slice to process.
    pmid_list = sys.argv[1]
    start_index = int(sys.argv[2])
    end_index = int(sys.argv[3])

    # One PMID per line; strip only the trailing newline.
    with open(pmid_list) as f:
        pmids = [line.strip('\n') for line in f.readlines()]

    # Search for full text and abstract on S3 and store info about what needs
    # to be uploaded (fulltext, abstract) in a list of tuples.
    pmids_to_upload = []
    logger.info("-- Checking for %d full texts --" % len(pmids))
    # Clamp the slice end to the number of PMIDs actually read.
    if end_index > len(pmids):
        end_index = len(pmids)
    for ix, pmid in enumerate(pmids[start_index:end_index]):
        logger.info("--- %d: %s ---" % ((start_index + ix), pmid))
        # NOTE(review): the (content, content_type) return value is discarded
        # here — presumably only the lookup's side effects matter; confirm.
        s3_client.get_upload_content(pmid,
                                     force_fulltext_lookup=force_fulltext)
Example no. 7
0
 # Now iterate over the pmids to read and download from S3 to the input
 # directory
 # Per-content-type tallies, for a summary of what was retrieved.
 num_pmc_oa_xml = 0
 num_pmc_auth_xml = 0
 num_txt = 0
 num_elsevier_xml = 0
 num_abstract = 0
 num_not_found = 0
 num_elsevier_xml_fail = 0
 # Keep a map of the content type we've downloaded for each PMID
 text_sources = {}
 # PMIDs for which no content of any kind could be retrieved.
 content_not_found = []
 for pmid in pmids_to_read:
     full_pmid = s3_client.check_pmid(pmid)
     # Look for the full text
     (content, content_type) = s3_client.get_upload_content(pmid, force_fulltext_lookup=force_fulltext)
     # If we don't find the XML on S3, look for it using the PMC client
     # NOTE(review): the fallback below is disabled; PMC web-service lookup
     # and re-upload to S3 are kept here for reference only.
     #if xml:
     #    num_found_s3 += 1
     #else:
     #    logger.info('No content for %s from S3' % pmid)
     #    (content, content_type) = get_full_text(pmid, 'pmid')
     #    if content_type == 'nxml':
     #        logger.info('Found nxml for %s from PMC web service' % pmid)
     #        xml = content
     #        num_found_not_s3 += 1
     #        # Upload the xml to S3 for next time
     #        logger.info('Uploading full text for %s to S3' % pmid)
     #        s3_client.put_full_text(pmid, xml, full_text_type='pmc_oa_xml')
     #    #elif content_type == 'abstract':
     #    #    logger.info('Found abstract for %s' % pmid)