Beispiel #1
0
def get_test_db_with_pubmed_content(with_pm=False):
    "Populate the database with sample content from pubmed."
    # Start from a cleared temp database and fill it from the test ftp mirror.
    database = get_temp_db(clear=True)
    pubmed_manager = Pubmed(ftp_url=get_test_ftp_url(), local=True)
    pubmed_manager.populate(database)
    # Optionally hand back the manager too, for tests that inspect its state.
    return (database, pubmed_manager) if with_pm else database
Beispiel #2
0
def test_multiple_pmids():
    "Test that pre-existing pmids are correctly handled."
    db = get_temp_db()
    uploader = Pubmed(ftp_url=get_test_ftp_url(), local=True)
    # First pass loads the sample refs; a second identical pass must not
    # create any duplicate text_ref rows.
    uploader.populate(db)
    count_before = len(db.select_all('text_ref'))
    uploader.populate(db)
    count_after = len(db.select_all('text_ref'))
    assert count_after == count_before,\
        "Duplicate pmids allowed to be submitted.."
Beispiel #3
0
def test_full_upload():
    "Test whether we can perform a targeted upload to a test db."
    # This uses a specially curated sample directory designed to access most
    # code paths that the real system might experience, but on a much smaller
    # (thus faster) scale. Errors in the ftp service will not be caught by
    # this test.

    # Test the medline/pubmed upload.
    db, pm = get_test_db_with_pubmed_content(with_pm=True)
    text_refs = db.select_all('text_ref')
    assert len(text_refs), "No text refs were added..."
    assert all(hasattr(tr, 'pmid') for tr in text_refs),\
        'All text_refs MUST have pmids by now.'

    mesh_annotations = db.select_all(db.MeshRefAnnotations)
    expected_count = sum(len(ann) for ann in pm.annotations.values())
    assert len(mesh_annotations) == expected_count,\
        "Only %s/%s annotations added" % (len(mesh_annotations),
                                          expected_count)
    assert all(hasattr(mra, 'mesh_num') for mra in mesh_annotations), \
        'All MESH annotations should have a mesh ID Number.'

    # Test the pmc oa upload.
    PmcOA(ftp_url=get_test_ftp_url(), local=True).populate(db)
    n_pmc_content = db.filter_query(
        db.TextContent, db.TextContent.source == PmcOA.my_source).count()
    assert n_pmc_content, "No pmc oa fulltext was added."
    n_refs_with_pmcid = db.filter_query(
        db.TextRef, db.TextRef.pmcid.isnot(None)).count()
    assert n_refs_with_pmcid >= n_pmc_content,\
        "Only %d of at least %d pmcids added." % (n_refs_with_pmcid,
                                                  n_pmc_content)

    # Test the manuscripts upload.
    Manuscripts(ftp_url=get_test_ftp_url(), local=True).populate(db)
    n_manu_content = db.filter_query(
        db.TextContent,
        db.TextContent.source == Manuscripts.my_source).count()
    assert n_manu_content, "No manuscripts uploaded."
    n_refs_with_mid = db.filter_query(
        db.TextRef, db.TextRef.manuscript_id.isnot(None)).count()
    assert n_refs_with_mid >= n_manu_content,\
        "Only %d of at least %d manuscript ids added." % (n_refs_with_mid,
                                                          n_manu_content)

    # Some overall checks: the exact (source, format, text_type) layout of
    # the uploaded content must match what the sample set is built to yield.
    expected_layout = {('manuscripts', 'xml', 'fulltext'),
                       ('pmc_oa', 'xml', 'fulltext'),
                       ('pubmed', 'text', 'abstract'),
                       ('pubmed', 'text', 'title')}
    actual_layout = {(tc.source, tc.format, tc.text_type)
                     for tc in db.select_all(db.TextContent)}
    assert expected_layout == actual_layout,\
        "Expected %s, got %s for content layout." % (expected_layout,
                                                     actual_layout)

    # Test careful upload of medline (very shallow test...checks only for
    # critical failures)
    m = Pubmed(ftp_url=get_test_ftp_url(), local=True)
    m.load_files(db, 'baseline', carefully=True)
Beispiel #4
0
def build_set(n, parent_dir):
    """Create the nastiest set of content we're willing/able to handle.

    We create a small local representation of the entirety of the NLM
    repositories we use, including all the nasty corner cases we can manage.
    This allows for rapid development and testing.

    Parameters
    ----------
    n : int
        The number of instances (distinct articles) of each test case to be
        included. Examples are chosen as randomly as possible. Multiple samples
        generally increase the reliability of the test.
    parent_dir : str
        The head of the tree that stands in place of the url to the nih ftp
        directory.

    Returns
    -------
    list of tuple
        The (pmid, pmcid, manuscript_id) triples used to build the sample
        repositories; an empty string marks an id a case does not include.
    """

    # Create the necessary directories, starting from a clean slate.
    def get_path(sub_path):
        return os.path.join(parent_dir, sub_path)

    if os.path.exists(parent_dir):
        shutil.rmtree(parent_dir)
    os.makedirs(parent_dir)
    os.makedirs(get_path('pub/pmc'))
    os.makedirs(get_path('pubmed/baseline'))
    os.makedirs(get_path('pub/pmc/manuscript'))

    # Get the pmid data from medline.
    print("Getting medline lists...")
    med = Pubmed()
    # NOTE: The MuId-PmId-*.zip listing this section once downloaded from the
    # medline ftp server appears to have disappeared, sadly, so the
    # hand-picked pmid lists below are used instead.

    # Pmids of articles that are likely to yield statements.
    statementful_pmids = [
        '20949557', '23898069', '19801969', '21042724', '14675752', '25897078',
        '25486481', '12890751', '11251186', '20622853', '25616414', '21878640',
        '23295773', '19747910', '25778309', '25939761', '11871856', '16580132',
        '24730770', '23921085', '22018470', '19405127', '21464949', '18321309',
        '7907095', '12048232', '23751074', '18711136', '13679391', '22193543',
        '26645886', '27086966', '14570914', '20538416', '9417079', '23200589',
        '15146469', '18084123', '19265534', '19449221', '27381626', '14976202',
        '22445724', '20040392', '26039245', '17881156', '15902258', '1745350',
        '18276758', '22764095', '20652941', '25834816', '23068100', '16407218',
        '18830263', '24265318', '19752028', '8589722', '22671588', '14745431',
        '25042645', '19403642', '14707024', '23536437', '21167476', '22801439',
        '25726184', '19723643', '17409824', '28679432', '26908611', '20164468',
        '15189946', '12086229', '21900397', '12324477', '15545228', '23376846',
        '21719749', '20608972', '23583295', '23236067', '9705962', '20068183',
        '19437340', '14534726', '25731731', '15337767', '28067895', '25092803',
        '19261749', '22272295', '27121230', '23302038', '17410335', '17399955',
        '16254247', '21685363', '26598524', '25645929', '1386335', '20606534',
        '22492281', '22158902', '22022427', '24775712', '21298412', '24753544',
        '12553064', '19681600', '17912454', '17597401', '20672986', '21362231',
        '17999917', '21470928', '27334922', '16159962', '21079653', '15125833',
        '27617579', '19048115', '18687691', '27797218', '26413934', '16684954',
        '20501406', '27515963', '22784503', '25941399', '12473120', '17891137',
        '16733295', '23826126', '21427728', '8900182', '26234677', '24648515',
        '25786138', '12958678', '16998791', '19061835', '11283269', '18258923',
        '11839584', '20132317', '19158374', '23245941', '23352210', '15465819',
        '15386433', '22575647', '15966238', '23633483', '25131797', '17102080',
        '19956840', '18506362', '17961162', '1607067', '24770328', '19825990',
        '22365656', '19720761', '24435975', '26882953', '17292826', '25119113',
        '26044620', '20717925', '15316008', '16619041', '19893488', '26999786',
        '26103054', '17331464', '20022966', '24189165', '19059939', '25474223',
        '20507346', '20976540', '2810532', '15685397', '27562587', '18538673',
        '15712349', '15448517', '27467210', '7584044', '21330319', '18381962',
        '24789704', '19058873', '10523313'
    ]

    # Pmids of articles whose fulltext is held by elsevier.
    elsevier_pmids = [
        "140233", "126700", "138421", "131864", "122916", "127363", "130834",
        "135691", "147139", "142190", "124378", "132969", "127549", "131583",
        "148910", "140686", "126304", "124909", "145863", "127687", "143909",
        "134286", "144524", "145955", "125088", "122895", "144611", "152202",
        "140767", "139895", "152644", "140057", "149561", "143963", "136992",
        "137557", "144535", "148891", "145321", "133684", "126386", "148890",
        "124210", "131711", "124967", "138753", "132192", "142510", "130244",
        "123485", "126883", "151536", "126948", "137419", "141952", "130051",
        "122816", "150450", "133686", "126866", "138748", "149542", "144038",
        "145957", "136213", "148513", "141931", "140056", "139935", "123177",
        "124593", "141942", "133729", "124598", "124252", "126303", "152671",
        "141908", "124625", "152721", "150335", "133685", "150977", "124154",
        "140713", "146095", "123742", "140478", "143938", "140806", "124600",
        "123729", "127548", "145041", "139938", "143289", "131554", "125206",
        "142661", "122933"
    ]

    # Get the data from pmc oa (pmc_dicts)
    print("Getting pmc oa lists....")
    pmc = PmcOA()
    pmc_dicts = pmc.ftp.get_csv_as_dict('oa_file_list.csv', header=0)

    # Get the data for the manuscripts (man_dicts)
    print("Getting manuscript lists...")
    man = Manuscripts()
    man_dicts = man.ftp.get_csv_as_dict('filelist.csv', header=0)

    # Get pmid, pmcid, mid tuples for the examples that we will use. Each
    # case tuple flags which of (pmid, pmcid, manuscript id) an example must
    # carry.
    print("Generating example sets...")
    examples = []
    for case in [(1, 0, 0), (1, 1, 0), (0, 1, 0), (1, 1, 1), (1, 0, 1)]:
        for _ in range(n):
            example = _get_example(case, statementful_pmids + elsevier_pmids,
                                   pmc_dicts, man_dicts)
            examples.append(example)

    # Add a few pmids that probably include some statements.
    for pmid in random.sample(statementful_pmids, n):
        examples.append((pmid, '', ''))

    # Add a few pmids that link to elsevier content
    for pmid in random.sample(elsevier_pmids, n):
        examples.append((pmid, '', ''))

    # Add a special article to check article info: pick a pmid whose doi
    # entry is abnormally long (a known nasty corner case).
    year_nums = str(datetime.now().year)[-2:]
    double_doi_info = med.get_article_info('baseline/pubmed%sn0343.xml.gz' %
                                           year_nums)
    pmids_w_double_doi = [
        k for k, v in double_doi_info.items()
        if v['doi'] is not None and len(v['doi']) > 100
    ]
    assert len(pmids_w_double_doi), "No double dois found."
    examples.append((
        random.choice(pmids_w_double_doi),
        '',
        '',
    ))

    # Create the test medline file by fetching each pmid record from entrez
    # and merging the results into one xml tree.
    print("Creating medline test file...")
    pmid_list = [pmid for pmid, _, _ in examples if pmid != '']
    tree = None
    for pmid in pmid_list:
        params = {'db': 'pubmed', 'retmode': 'xml', 'id': pmid}
        if tree is None:
            tree = pub.send_request(pub.pubmed_fetch, params)
        else:
            resp = pub.send_request(pub.pubmed_fetch, params)
            # Retry a few times; the entrez service is flaky under load.
            attempts = 1
            while not resp and attempts <= 3:
                resp = pub.send_request(pub.pubmed_fetch, params)
                attempts += 1
                sleep(1)
            child = resp.getchildren()[0]
            tree.append(child)
        sleep(0.5)  # be polite to the entrez rate limiter.
    if tree is not None:
        f_bts = b''
        f_bts += b"<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
        f_bts += ET.tostring(tree)
        f_path = get_path('pubmed/baseline/pubmed18nTEST.xml.gz')
        with open(f_path, 'wb') as gzf:
            gzf.write(gzip.compress(f_bts))

    # Create the test pmc oa article directory, one sub-directory per
    # journal title (as the real repository is laid out).
    print("Getting pmc oa xmls...")
    art_dirname = get_path('pub/pmc/articles.TEST.xml')
    if os.path.exists(art_dirname):
        shutil.rmtree(art_dirname)
    os.mkdir(art_dirname)
    pmcid_list = [pmcid for _, pmcid, _ in examples if pmcid != '']
    ex_pmc_dicts = [d for d in pmc_dicts if d['Accession ID'] in pmcid_list]
    for d in ex_pmc_dicts:
        fname = pmc.ftp.download_file(d['File'])
        with tarfile.open(fname, 'r:gz') as tar:
            mems = tar.getmembers()
            mem = next(m for m in mems if m.name.endswith('.nxml'))
            f_str = tar.extractfile(mem).read()
        fname = d['Accession ID'] + '.nxml'
        re_ret = re.findall('<journal-title>(.*?)</journal-title>',
                            f_str.decode('utf8'))
        if len(re_ret):
            sub_dir = os.path.join(
                art_dirname, re_ret[0].replace(' ', '_').replace('&', ''))
        else:
            sub_dir = os.path.join(art_dirname, 'unknown')
        if not os.path.exists(sub_dir):
            os.mkdir(sub_dir)
        path = os.path.join(sub_dir, fname)
        with open(path, 'wb') as f:
            f.write(f_str)
    with tarfile.open(art_dirname + '.tar.gz', 'w:gz') as tar:
        for dirname in os.listdir(art_dirname):
            tar.add(os.path.join(art_dirname, dirname), arcname=dirname)
    shutil.rmtree(art_dirname)

    # Create deleted pmids file (just make an empty file, for now).
    # TODO: Add test case to touch this.
    with open(get_path('pubmed/deleted.pmids.gz'), 'wb') as gzf:
        gzf.write(gzip.compress(b''))

    # Create the test manuscripts file.
    print('Adding manuscript directories...')
    dirfmt = get_path('pub/pmc/manuscript/%s')
    dirnames = [dirfmt % ('PMC00%dXXXXXX.xml' % i) for i in range(2, 6)]
    for dirname in dirnames:
        if os.path.exists(dirname):
            shutil.rmtree(dirname)
        os.mkdir(dirname)
    ex_man_dicts = [d for d in man_dicts if d['PMCID'] in pmcid_list]
    for d in ex_man_dicts:
        d['Tarfile'] = man.get_tarname_from_filename(d['File'])
    tar_members = dict.fromkeys(set([d['Tarfile'] for d in ex_man_dicts]))
    for tarname in tar_members.keys():
        if not os.path.exists(tarname):
            print("\tDownloading %s..." % tarname)
            man.ftp.download_file(tarname)
    for d in ex_man_dicts:
        # BUGFIX: the destination must be derived from this record's own
        # tarfile, not from the leftover `tarname` of the download loop
        # above (which always pointed at the last tarfile downloaded). The
        # old code also shadowed the `parent_dir` argument that get_path
        # closes over; use a distinct local name instead.
        tar_root = dirfmt % d['Tarfile'].replace('.tar.gz', '')
        member_dir = os.path.join(tar_root, os.path.dirname(d['File']))
        test_fname = os.path.join(tar_root, d['File'])
        try:
            with tarfile.open(d['Tarfile'], 'r:gz') as tar:
                print('\tExtracting %s from %s...' % (d['File'], d['Tarfile']))
                tar.extract(d['File'])
        except KeyError:
            print("WARNING: Could not extract %s from %s" %
                  (d['File'], d['Tarfile']))
            continue
        if not os.path.exists(member_dir):
            os.makedirs(member_dir)
        os.rename(d['File'], test_fname)
    for dirname in dirnames:
        with tarfile.open(dirname + '.tar.gz', 'w:gz') as tar:
            for sub_dirname in os.listdir(dirname):
                tar.add(os.path.join(dirname, sub_dirname),
                        arcname=sub_dirname)
        shutil.rmtree(dirname)

    return examples
Beispiel #5
0
def get_db_with_pubmed_content():
    "Populate the database with sample content from pubmed."
    from indra_db.managers.content_manager import Pubmed
    # Build a cleared temp database and load it from the local test ftp.
    db = get_temp_db(clear=True)
    pubmed = Pubmed(ftp_url=get_test_ftp_url(), local=True)
    pubmed.populate(db)
    return db
Beispiel #6
0
def test_medline_ref_checks():
    "Test the text ref checks used by medline."
    db = get_temp_db(clear=True)
    med = Pubmed(ftp_url=get_test_ftp_url(), local=True)

    def check_input(input_pairs, expected_pairs, carefully, num):
        # Build the minimal article-info dicts the loader expects, then
        # compare the resulting db contents against the expected pairs.
        info = {pmid: {'pmid': pmid, 'pmcid': pmcid}
                for pmid, pmcid in input_pairs}
        med.load_text_refs(db, info, carefully)
        found = [(tr.pmid, tr.pmcid) for tr in db.select_all(db.TextRef)]
        desc = 'careful' if carefully else 'careless'
        msg = 'DB text refs mismatch after upload %d (%s)' % (num, desc)
        found.sort(key=str)
        expected_pairs.sort(key=str)
        assert_contents_equal(found, expected_pairs, msg)

    expected = [('CASEA', None), ('CASEB', 'PMCIDCASEB'),
                ('CASEC', None), ('CASED', 'PMCIDCASED')]

    # Upload round 1: in-batch duplicates and pmcid mismatches.
    check_input([('CASEA', None), ('CASEB', 'PMCIDCASEB'), ('CASEC', None),
                 ('CASEC', None), ('CASED', None), ('CASED', 'PMCIDCASED')],
                expected, False, 1)

    # Upload round 2: new pmid, plus conflicts with existing refs.
    expected.append(('CASEE', None))
    check_input([('CASEE', None), ('CASEC', 'PMCIDCASEC'),
                 ('CASEH1', 'PMCIDCASEH'), ('CASEK', 'PMCIDCASEK1')],
                expected + [('CASEH1', 'PMCIDCASEH'),
                            ('CASEK', 'PMCIDCASEK1')], False, 2)

    # Interlude: plant a ref with only a pmcid.
    db.insert_many('text_ref', [
        {
            'pmcid': 'PMCIDCASEG'
        },
    ])

    # Upload round 3: the careful path should update/fill refs and write
    # review entries for irreconcilable conflicts.
    input_pairs = expected + [
        ('CASEF', None),
        ('CASEC', 'PMCIDCASEC'),
        ('CASEG', 'PMCIDCASEG'),
        ('CASEH2', 'PMCIDCASEH'),  # this should trigger a review.
        ('CASEK', 'PMCIDCASEK2')  # and so should this
    ]
    expected.remove(('CASEC', None))
    expected.extend([('CASEF', None), ('CASEC', 'PMCIDCASEC'),
                     ('CASEG', 'PMCIDCASEG'), ('CASEH1', 'PMCIDCASEH'),
                     ('CASEK', 'PMCIDCASEK1')])
    med.review_fname = 'test_review_%s.txt' % med.my_source
    open(med.review_fname, 'a+').close()
    with open(med.review_fname, 'r') as f:
        num_orig_lines = len(f.readlines())
    check_input(input_pairs, expected, True, 3)
    with open(med.review_fname, 'r') as f:
        num_new_lines = len(f.readlines())
    assert num_new_lines == num_orig_lines + 2, \
        "Not all new reviews added: %d / %d" % (num_new_lines,
                                                num_orig_lines + 2)
    remove(med.review_fname)
Beispiel #7
0
def get_db_with_pubmed_content():
    "Populate the database with sample content from pubmed."
    # Fill the test database from the local test ftp mirror.
    database = get_db()
    pubmed = Pubmed(ftp_url=TEST_FTP, local=True)
    pubmed.populate(database)
    return database