Example #1
0
def test_full_upload():
    """Test whether we can perform a targeted upload to a test db.

    This uses a specially curated sample directory designed to access most
    code paths that the real system might experience, but on a much smaller
    (thus faster) scale. Errors in the ftp service will not be caught by
    this test.
    """
    # --- Medline/pubmed upload ---
    db, pm = get_test_db_with_pubmed_content(with_pm=True)
    text_refs = db.select_all('text_ref')
    assert len(text_refs), "No text refs were added..."
    assert all(hasattr(ref, 'pmid') for ref in text_refs),\
        'All text_refs MUST have pmids by now.'

    # One db row is expected per annotation held by the pubmed manager.
    mesh_annotations = db.select_all(db.MeshRefAnnotations)
    num_mra_exp = sum(map(len, pm.annotations.values()))
    num_mra_got = len(mesh_annotations)
    assert num_mra_got == num_mra_exp,\
        "Only %s/%s annotations added" % (num_mra_got, num_mra_exp)
    assert all(hasattr(ann, 'mesh_num') for ann in mesh_annotations), \
        'All MESH annotations should have a mesh ID Number.'

    # --- PMC OA upload ---
    PmcOA(ftp_url=get_test_ftp_url(), local=True).populate(db)
    pmc_content_query = db.filter_query(
        db.TextContent, db.TextContent.source == PmcOA.my_source)
    tcs_pmc = pmc_content_query.count()
    assert tcs_pmc, "No pmc oa fulltext was added."
    pmcid_ref_query = db.filter_query(db.TextRef,
                                      db.TextRef.pmcid.isnot(None))
    trs_w_pmcids = pmcid_ref_query.count()
    assert trs_w_pmcids >= tcs_pmc,\
        "Only %d of at least %d pmcids added." % (trs_w_pmcids, tcs_pmc)

    # --- Manuscripts upload ---
    Manuscripts(ftp_url=get_test_ftp_url(), local=True).populate(db)
    manu_content_query = db.filter_query(
        db.TextContent,
        db.TextContent.source == Manuscripts.my_source)
    tcs_manu = manu_content_query.count()
    assert tcs_manu, "No manuscripts uploaded."
    mid_ref_query = db.filter_query(db.TextRef,
                                    db.TextRef.manuscript_id.isnot(None))
    trs_w_mids = mid_ref_query.count()
    assert trs_w_mids >= tcs_manu,\
        "Only %d of at least %d manuscript ids added." % (trs_w_mids, tcs_manu)

    # --- Some overall checks ---
    # Every (source, format, text_type) combination must appear, and no
    # others.
    set_exp = {
        ('manuscripts', 'xml', 'fulltext'),
        ('pmc_oa', 'xml', 'fulltext'),
        ('pubmed', 'text', 'abstract'),
        ('pubmed', 'text', 'title'),
    }
    set_got = {(tc.source, tc.format, tc.text_type)
               for tc in db.select_all(db.TextContent)}
    assert set_exp == set_got,\
        "Expected %s, got %s for content layout." % (set_exp, set_got)

    # --- Careful upload of medline ---
    # Very shallow test: checks only for critical failures.
    careful_pubmed = Pubmed(ftp_url=get_test_ftp_url(), local=True)
    careful_pubmed.load_files(db, 'baseline', carefully=True)
Example #2
0
def get_test_db_with_pubmed_content(with_pm=False):
    """Populate a cleared temp database with sample content from pubmed.

    Parameters
    ----------
    with_pm : bool
        If True, also return the Pubmed manager used for the upload.

    Returns
    -------
    The populated db, or a ``(db, pm)`` tuple when `with_pm` is True.
    """
    db = get_temp_db(clear=True)
    pubmed = Pubmed(ftp_url=get_test_ftp_url(), local=True)
    pubmed.populate(db)
    return (db, pubmed) if with_pm else db
Example #3
0
def test_multible_pmc_oa_content():
    """Test to make sure repeated content is handled correctly.

    The pmc oa content is populated twice; the second pass must not change
    the number of text content entries in the db.
    """
    db = get_temp_db()
    loader = PmcOA(ftp_url=get_test_ftp_url(), local=True)

    loader.populate(db)
    count_after_first = len(db.select_all('text_content'))

    loader.populate(db)
    count_after_second = len(db.select_all('text_content'))

    assert count_after_second == count_after_first,\
        "Duplicate text content allowed to be submitted."
Example #4
0
def test_multiple_pmids():
    """Test that pre-existing pmids are correctly handled.

    Pubmed content is populated twice; the second pass must not change the
    number of text refs in the db.
    """
    db = get_temp_db()
    loader = Pubmed(ftp_url=get_test_ftp_url(), local=True)

    loader.populate(db)
    count_after_first = len(db.select_all('text_ref'))

    loader.populate(db)
    count_after_second = len(db.select_all('text_ref'))

    assert count_after_second == count_after_first,\
        "Duplicate pmids allowed to be submitted.."
Example #5
0
def test_multiple_text_ref_pmc_oa():
    """Test whether a duplicate text ref in pmc oa is handled correctly.

    The same text ref (identified by pmcid and doi) is uploaded twice; the
    number of text refs in the db must be unchanged by the second upload.
    The review file named on the manager is removed afterwards, whether or
    not the checks pass.
    """
    db = get_temp_db()
    pmc = PmcOA(ftp_url=get_test_ftp_url(), local=True)
    pmc.review_fname = 'test_review_multiple_text_ref_pmc_oa.txt'

    # Start from a ref with every column unset, then fill in the two ids.
    inp = dict.fromkeys(pmc.tr_cols)
    inp.update(pmcid='PMC5579538', doi='10.1021/acsomega.7b00205')

    try:
        pmc.upload_batch(db, [inp], [])
        num_refs = len(db.select_all('text_ref'))
        pmc.upload_batch(db, [inp], [])
        assert len(db.select_all('text_ref')) == num_refs,\
            "Duplicate refs allowed to be submitted.."
    finally:
        # Clean up even when the test fails, so a failed run does not leave
        # a stale review file behind for later runs.
        remove(pmc.review_fname)
Example #6
0
def get_test_db_with_ftp_content():
    """Populate database with content from all the ftp services.

    Starts from the pubmed-populated test db, then adds pmc oa and
    manuscripts content, in that order.
    """
    db = get_test_db_with_pubmed_content()
    for manager_cls in (PmcOA, Manuscripts):
        manager_cls(ftp_url=get_test_ftp_url(), local=True).populate(db)
    return db
Example #7
0
def test_medline_ref_checks():
    """Test the text ref checks used by medline.

    Three rounds of ``load_text_refs`` are run against a cleared temp db
    (two careless, then one careful). After each round the (pmid, pmcid)
    pairs found in the db are compared with the expected pairs. The careful
    round is also expected to append exactly two lines to the review file.
    The review file is removed afterwards, whether or not the checks pass.
    """
    db = get_temp_db(clear=True)
    med = Pubmed(ftp_url=get_test_ftp_url(), local=True)

    def check_input(input_pairs, expected_pairs, carefully, num):
        """Load `input_pairs` as text refs and compare the db to expectation.

        Both pair lists are sorted in place (by string form) before being
        compared.
        """
        # Keyed by pmid, so a later pair with a repeated pmid replaces an
        # earlier one.
        article_info = {
            pmid: {'pmid': pmid, 'pmcid': pmcid}
            for pmid, pmcid in input_pairs
        }
        med.load_text_refs(db, article_info, carefully)
        actual_pairs = [(tr.pmid, tr.pmcid)
                        for tr in db.select_all(db.TextRef)]
        desc = 'careful' if carefully else 'careless'
        msg = 'DB text refs mismatch after upload %d (%s)' % (num, desc)
        actual_pairs.sort(key=str)
        expected_pairs.sort(key=str)
        assert_contents_equal(actual_pairs, expected_pairs, msg)

    expected_pairs = [('CASEA', None), ('CASEB', 'PMCIDCASEB'),
                      ('CASEC', None), ('CASED', 'PMCIDCASED')]

    # Upload round 1
    check_input([('CASEA', None), ('CASEB', 'PMCIDCASEB'), ('CASEC', None),
                 ('CASEC', None), ('CASED', None), ('CASED', 'PMCIDCASED')],
                expected_pairs, False, 1)

    # Upload round 2
    expected_pairs += [('CASEE', None)]
    check_input([('CASEE', None), ('CASEC', 'PMCIDCASEC'),
                 ('CASEH1', 'PMCIDCASEH'), ('CASEK', 'PMCIDCASEK1')],
                expected_pairs + [('CASEH1', 'PMCIDCASEH'),
                                  ('CASEK', 'PMCIDCASEK1')], False, 2)

    # Interlude: add a text ref that has only a pmcid.
    db.insert_many('text_ref', [
        {
            'pmcid': 'PMCIDCASEG'
        },
    ])

    # Upload round 3
    input_pairs = expected_pairs + [
        ('CASEF', None),
        ('CASEC', 'PMCIDCASEC'),
        ('CASEG', 'PMCIDCASEG'),
        ('CASEH2', 'PMCIDCASEH'),  # this should trigger a review.
        ('CASEK', 'PMCIDCASEK2')  # and so should this
    ]
    expected_pairs.remove(('CASEC', None))
    expected_pairs += [('CASEF', None), ('CASEC', 'PMCIDCASEC'),
                       ('CASEG', 'PMCIDCASEG'), ('CASEH1', 'PMCIDCASEH'),
                       ('CASEK', 'PMCIDCASEK1')]
    med.review_fname = 'test_review_%s.txt' % med.my_source
    # Make sure the review file exists so its starting length can be read.
    open(med.review_fname, 'a+').close()
    try:
        with open(med.review_fname, 'r') as f:
            num_orig_lines = len(f.readlines())
        check_input(input_pairs, expected_pairs, True, 3)
        with open(med.review_fname, 'r') as f:
            lines = f.readlines()
        assert len(lines) == num_orig_lines + 2, \
            "Not all new reviews added: %d / %d" % (len(lines),
                                                    num_orig_lines + 2)
    finally:
        # Clean up even when a check fails, so a failed run does not leave
        # a review file behind that would skew the line count next time.
        remove(med.review_fname)
Example #8
0
def test_id_handling_pmc_oa():
    """Test every conceivable combination pmid/pmcid presence.

    The db is seeded with text refs in the states medline could have left
    them in, a pmc oa batch is uploaded on top, and then the resulting
    (pmid, pmcid) pairs, the review file contents and the amount of text
    content are checked. The review file is removed afterwards, whether or
    not the checks pass.
    """
    db = get_temp_db(clear=True)
    pmc = PmcOA(ftp_url=get_test_ftp_url(), local=True)

    # Initialize with all possible states we could have gotten from medline.
    pm_inp_tpl_list = capitalize_list_of_tpls(
        [('caseA%d' % i, 'PMCcaseA%d' % i) for i in range(2)]
        + [('caseB%d' % i, None) for i in range(2)]
        + [(None, 'PMCcaseC%d' % i) for i in range(2)]
        + [
            ('caseMisMatchA', 'PMCcaseMisMatchB'),
            ('caseMisMatchB', 'PMCcaseMisiMatchB'),
            ('caseMultiMatch', 'PMCcaseMultiMatch'),
            ('28884161', None),
            ('26977217', 'PMC4771487'),
        ])
    db.insert_many('text_ref',
                   [dict(zip(('pmid', 'pmcid'), d)) for d in pm_inp_tpl_list])

    # Prepare the 'batch' to be submitted for pmc oa, and try it.
    case_letters = ['A', 'B', 'C']
    oa_inp_tpl_list = capitalize_list_of_tpls(
        [('case%s0' % letter, 'PMCcase%s0' % letter)
         for letter in case_letters]
        + [(None, 'PMCcase%s1' % letter) for letter in case_letters]
        + [
            (None, 'PMC5579538'),  # lookup pmid in db
            (None, 'PMC4238023'),  # lookup no pmid in db
            ('26977217', 'PMC5142709'),  # conflicting pmcid
            ('caseMisMatchB', 'PMCcaseMisMatchA'),  # multiple matches
            ('caseMultiMatch', 'PMCnotmatching'),
            ('notmatching', 'PMCcaseMultiMatch'),
        ])
    tr_inp = []
    for pmid, pmcid in oa_inp_tpl_list:
        # Every text ref column starts unset; only the two ids are filled.
        inp_dict = dict.fromkeys(pmc.tr_cols)
        inp_dict.update(pmcid=pmcid, pmid=pmid)
        tr_inp.append(inp_dict)
    tc_inp = [{
        'pmcid': pmcid,
        'text_type': 'txt',
        'content': b'content'
    } for _, pmcid in oa_inp_tpl_list]
    pmc.review_fname = 'test_review_%s.txt' % pmc.my_source

    try:
        pmc.upload_batch(db, tr_inp, tc_inp)

        # Check the text refs.
        expected_pairs = capitalize_list_of_tpls([
            ('caseA0', 'PMCcaseA0'),
            ('caseA1', 'PMCcaseA1'),
            ('caseB0', 'PMCcaseB0'),
            # in practice this should be resolved with id_lookup
            ('caseB1', None),
            ('caseC0', 'PMCcaseC0'),
            (None, 'PMCcaseC1'),
            ('28884161', 'PMC5579538'),
            ('26977217', 'PMC4771487'),
            (None, 'PMCcaseB1'),
            ('25409783', 'PMC4238023'),
            ('caseMisMatchA', 'PMCcaseMisMatchB'),
            ('caseMisMatchB', 'PMCcaseMisiMatchB'),
            ('caseMultiMatch', 'PMCcaseMultiMatch'),
        ])
        actual_pairs = [(tr.pmid, tr.pmcid)
                        for tr in db.select_all('text_ref')]
        assert_contents_equal(actual_pairs, expected_pairs,
                              'DB text refs incorrect.')

        # The conflicting pmcid must have been written up for review: some
        # single line has to mention both pmcids and the conflict.
        conflict_words = ['PMC4771487', 'PMC5142709', 'conflicting pmcid']
        with open(pmc.review_fname, 'r') as f:
            review_lines = f.read().splitlines()
        found_conflict_msg = any(
            all(word in line for word in conflict_words)
            for line in review_lines
        )
        assert found_conflict_msg

        # Check the text content. Compare by value: `is` on an int literal
        # tests object identity, which is not guaranteed to match equality.
        assert len(db.select_all('text_content')) == 8, \
            'Too much DB text content.'
    finally:
        # Clean up even when a check fails, so a failed run does not leave
        # a stale review file behind for later runs.
        remove(pmc.review_fname)