def test_full_upload():
    "Test whether we can perform a targeted upload to a test db."
    # This uses a specially curated sample directory designed to access most
    # code paths that the real system might experience, but on a much smaller
    # (thus faster) scale. Errors in the ftp service will not be caught by
    # this test.

    # Test the medline/pubmed upload.
    db = get_db_with_pubmed_content()
    tr_list = db.select_all('text_ref')
    assert len(tr_list), "No text refs were added..."
    # hasattr is always True for a mapped column, so check the value itself.
    assert all([tr.pmid is not None for tr in tr_list]), \
        'All text_refs MUST have pmids by now.'

    # Test the pmc oa upload.
    PmcOA(ftp_url=TEST_FTP, local=True).populate(db)
    tcs_pmc = db.filter_query(
        db.TextContent,
        db.TextContent.source == PmcOA.my_source
        ).count()
    assert tcs_pmc, "No pmc oa fulltext was added."
    trs_w_pmcids = db.filter_query(
        db.TextRef,
        db.TextRef.pmcid.isnot(None)
        ).count()
    assert trs_w_pmcids >= tcs_pmc, \
        "Only %d of at least %d pmcids added." % (trs_w_pmcids, tcs_pmc)

    # Test the manuscripts upload.
    Manuscripts(ftp_url=TEST_FTP, local=True).populate(db)
    tcs_manu = db.filter_query(
        db.TextContent,
        db.TextContent.source == Manuscripts.my_source
        ).count()
    assert tcs_manu, "No manuscripts uploaded."
    trs_w_mids = db.filter_query(
        db.TextRef,
        db.TextRef.manuscript_id.isnot(None)
        ).count()
    assert trs_w_mids >= tcs_manu, \
        "Only %d of at least %d manuscript ids added." % (trs_w_mids, tcs_manu)

    # Some overall checks.
    tc_list = db.select_all(db.TextContent)
    set_exp = {('manuscripts', 'xml', 'fulltext'),
               ('pmc_oa', 'xml', 'fulltext'),
               ('pubmed', 'text', 'abstract')}
    set_got = {(tc.source, tc.format, tc.text_type) for tc in tc_list}
    assert set_exp == set_got, \
        "Expected %s, got %s for content layout." % (set_exp, set_got)
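# The two source-count checks in test_full_upload share a pattern; the helper
# below makes that pattern explicit. It is a minimal sketch for reuse in new
# tests, relying only on db.filter_query and db.TextContent exactly as they
# are used above; it is not called by the existing tests.
def count_content_by_source(db, source):
    """Count the text_content rows that were loaded from the given source."""
    return db.filter_query(
        db.TextContent,
        db.TextContent.source == source
        ).count()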
def get_db_with_ftp_content():
    "Populate a test database with content from all the ftp services."
    db = get_db_with_pubmed_content()
    PmcOA(ftp_url=TEST_FTP, local=True).populate(db)
    Manuscripts(ftp_url=TEST_FTP, local=True).populate(db)
    return db
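# For reference, a minimal sketch of what the pubmed-only fixture called
# above is assumed to do; the real get_db_with_pubmed_content is defined
# elsewhere in this test module. Both get_test_db (a hypothetical stand-in
# for whatever yields a clean test database handle) and the assumption that
# Pubmed accepts the same ftp_url/local arguments as PmcOA and Manuscripts
# are illustrative only; this sketch is not called by the tests.
def _sketch_get_db_with_pubmed_content():
    db = get_test_db()  # hypothetical helper: returns an empty test db
    # Assumed to mirror the PmcOA/Manuscripts call pattern used above.
    Pubmed(ftp_url=TEST_FTP, local=True).populate(db)
    return db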
def build_set(n, parent_dir):
    """Create the nastiest set of content we're willing/able to handle.

    We create a small local representation of the entirety of the NLM
    repositories we use, including all the nasty corner cases we can manage.
    This allows for rapid development and testing.

    Parameters
    ----------
    n : int
        The number of instances (distinct articles) of each test case to be
        included. Examples are chosen as randomly as possible. Multiple
        samples generally increase the reliability of the test.
    parent_dir : str
        The head of the tree that stands in place of the url to the nih
        ftp directory.
    """
    # Create the necessary directories.
    def get_path(sub_path):
        return os.path.join(parent_dir, sub_path)

    if os.path.exists(parent_dir):
        shutil.rmtree(parent_dir)
    os.makedirs(parent_dir)
    os.makedirs(get_path('pub/pmc'))
    os.makedirs(get_path('pubmed/baseline'))
    os.makedirs(get_path('pub/pmc/manuscript'))

    # Get the pmid data from medline (med_pmid_list).
    print("Getting medline lists...")
    med_pmid_list = []
    med = Pubmed()
    for i in range(1, 7):
        buf = BytesIO()
        med.ftp.ret_file("MuId-PmId-%d.zip" % i, buf)
        zf = zipfile.ZipFile(buf)
        with zf.open(zf.namelist()[0]) as id_f:
            id_str = id_f.read().decode('utf8')
        med_pmid_list += [line.split('\t')[1]
                          for line in id_str.splitlines()]

    # Pmids whose abstracts are likely to yield statements.
    statementful_pmids = [
        '20949557', '23898069', '19801969', '21042724', '14675752',
        '25897078', '25486481', '12890751', '11251186', '20622853',
        '25616414', '21878640', '23295773', '19747910', '25778309',
        '25939761', '11871856', '16580132', '24730770', '23921085',
        '22018470', '19405127', '21464949', '18321309', '7907095',
        '12048232', '23751074', '18711136', '13679391', '22193543',
        '26645886', '27086966', '14570914', '20538416', '9417079',
        '23200589', '15146469', '18084123', '19265534', '19449221',
        '27381626', '14976202', '22445724', '20040392', '26039245',
        '17881156', '15902258', '1745350', '18276758', '22764095',
        '20652941', '25834816', '23068100', '16407218', '18830263',
        '24265318', '19752028', '8589722', '22671588', '14745431',
        '25042645', '19403642', '14707024', '23536437', '21167476',
        '22801439', '25726184', '19723643', '17409824', '28679432',
        '26908611', '20164468', '15189946', '12086229', '21900397',
        '12324477', '15545228', '23376846', '21719749', '20608972',
        '23583295', '23236067', '9705962', '20068183', '19437340',
        '14534726', '25731731', '15337767', '28067895', '25092803',
        '19261749', '22272295', '27121230', '23302038', '17410335',
        '17399955', '16254247', '21685363', '26598524', '25645929',
        '1386335', '20606534', '22492281', '22158902', '22022427',
        '24775712', '21298412', '24753544', '12553064', '19681600',
        '17912454', '17597401', '20672986', '21362231', '17999917',
        '21470928', '27334922', '16159962', '21079653', '15125833',
        '27617579', '19048115', '18687691', '27797218', '26413934',
        '16684954', '20501406', '27515963', '22784503', '25941399',
        '12473120', '17891137', '16733295', '23826126', '21427728',
        '8900182', '26234677', '24648515', '25786138', '12958678',
        '16998791', '19061835', '11283269', '18258923', '11839584',
        '20132317', '19158374', '23245941', '23352210', '15465819',
        '15386433', '22575647', '15966238', '23633483', '25131797',
        '17102080', '19956840', '18506362', '17961162', '1607067',
        '24770328', '19825990', '22365656', '19720761', '24435975',
        '26882953', '17292826', '25119113', '26044620', '20717925',
        '15316008', '16619041', '19893488', '26999786', '26103054',
        '17331464', '20022966', '24189165', '19059939', '25474223',
        '20507346', '20976540', '2810532', '15685397', '27562587',
        '18538673', '15712349', '15448517', '27467210',
        '7584044', '21330319', '18381962', '24789704', '19058873',
        '10523313'
        ]

    # Pmids whose content is hosted by elsevier.
    elsevier_pmids = [
        "140233", "126700", "138421", "131864", "122916",
        "127363", "130834", "135691", "147139", "142190",
        "124378", "132969", "127549", "131583", "148910",
        "140686", "126304", "124909", "145863", "127687",
        "143909", "134286", "144524", "145955", "125088",
        "122895", "144611", "152202", "140767", "139895",
        "152644", "140057", "149561", "143963", "136992",
        "137557", "144535", "148891", "145321", "133684",
        "126386", "148890", "124210", "131711", "124967",
        "138753", "132192", "142510", "130244", "123485",
        "126883", "151536", "126948", "137419", "141952",
        "130051", "122816", "150450", "133686", "126866",
        "138748", "149542", "144038", "145957", "136213",
        "148513", "141931", "140056", "139935", "123177",
        "124593", "141942", "133729", "124598", "124252",
        "126303", "152671", "141908", "124625", "152721",
        "150335", "133685", "150977", "124154", "140713",
        "146095", "123742", "140478", "143938", "140806",
        "124600", "123729", "127548", "145041", "139938",
        "143289", "131554", "125206", "142661", "122933"
        ]

    # Get the data from pmc oa (pmc_dicts).
    print("Getting pmc oa lists...")
    pmc = PmcOA()
    pmc_dicts = pmc.ftp.get_csv_as_dict('oa_file_list.csv', header=0)

    # Get the data for the manuscripts (man_dicts).
    print("Getting manuscript lists...")
    man = Manuscripts()
    man_dicts = man.ftp.get_csv_as_dict('filelist.csv', header=0)

    # Get pmid, pmcid, mid tuples for the examples that we will use.
    print("Generating example sets...")
    examples = []
    for case in [(1, 0, 0), (1, 1, 0), (0, 1, 0), (1, 1, 1), (1, 0, 1)]:
        for _ in range(n):
            example = _get_example(case, med_pmid_list, pmc_dicts, man_dicts)
            examples.append(example)

    # Add a few pmids that probably include some statements.
    for pmid in random.sample(statementful_pmids, n):
        examples.append((pmid, '', ''))

    # Add a few pmids that link to elsevier content.
    for pmid in random.sample(elsevier_pmids, n):
        examples.append((pmid, '', ''))

    # Add a special article to check article info.
    double_doi_info = med.get_article_info('baseline/pubmed18n0343.xml.gz')
    pmids_w_double_doi = [
        k for k, v in double_doi_info.items()
        if v['doi'] is not None and len(v['doi']) > 100
        ]
    assert len(pmids_w_double_doi), "No double dois found."
    examples.append((random.choice(pmids_w_double_doi), '', ''))

    # Create the test medline file.
    print("Creating medline test file...")
    pmid_list = [pmid for pmid, _, _ in examples if pmid != '']
    tree = None
    for pmid in pmid_list:
        params = {'db': 'pubmed', 'retmode': 'xml', 'id': pmid}
        if tree is None:
            tree = pub.send_request(pub.pubmed_fetch, params)
        else:
            child = pub.send_request(pub.pubmed_fetch,
                                     params).getchildren()[0]
            tree.append(child)
    if tree is not None:
        f_bts = b"<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
        f_bts += ET.tostring(tree)
        f_path = get_path('pubmed/baseline/pubmed18nTEST.xml.gz')
        with open(f_path, 'wb') as gzf:
            gzf.write(gzip.compress(f_bts))

    # Create the test pmc oa article directory.
print("Getting pmc oa xmls...") art_dirname = get_path('pub/pmc/articles.TEST.xml') if os.path.exists(art_dirname): shutil.rmtree(art_dirname) os.mkdir(art_dirname) pmcid_list = [pmcid for _, pmcid, _ in examples if pmcid != ''] ex_pmc_dicts = [d for d in pmc_dicts if d['Accession ID'] in pmcid_list] for d in ex_pmc_dicts: fname = pmc.ftp.download_file(d['File']) with tarfile.open(fname, 'r:gz') as tar: mems = tar.getmembers() mem = [mem for mem in mems if mem.name.endswith('.nxml')][0] f_str = tar.extractfile(mem).read() fname = d['Accession ID'] + '.nxml' re_ret = re.findall('<journal-title>(.*?)</journal-title>', f_str.decode('utf8')) if len(re_ret): sub_dir = os.path.join( art_dirname, re_ret[0].replace(' ', '_').replace('&', '')) else: sub_dir = os.path.join(art_dirname, 'unknown') if not os.path.exists(sub_dir): os.mkdir(sub_dir) path = os.path.join(sub_dir, fname) with open(path, 'wb') as f: f.write(f_str) with tarfile.open(art_dirname + '.tar.gz', 'w:gz') as tar: for dirname in os.listdir(art_dirname): tar.add(os.path.join(art_dirname, dirname), arcname=dirname) shutil.rmtree(art_dirname) # Create deleted pmids file (just make an empty file,for now. # TODO: Add test case to touch this. with open(get_path('pubmed/deleted.pmids.gz'), 'wb') as gzf: gzf.write(gzip.compress(b'')) # Create the test manuscripts file. print('Adding manuscript directories...') dirfmt = get_path('pub/pmc/manuscript/%s') dirnames = [dirfmt % ('PMC00%dXXXXXX.xml' % i) for i in range(2, 6)] for dirname in dirnames: if os.path.exists(dirname): shutil.rmtree(dirname) os.mkdir(dirname) ex_man_dicts = [d for d in man_dicts if d['PMCID'] in pmcid_list] for d in ex_man_dicts: d['Tarfile'] = man.get_tarname_from_filename(d['File']) tar_members = dict.fromkeys(set([d['Tarfile'] for d in ex_man_dicts])) for tarname in tar_members.keys(): if not os.path.exists(tarname): print("\tDownloading %s..." % tarname) man.ftp.download_file(tarname) for d in ex_man_dicts: parent_dir = os.path.join(dirfmt % tarname.replace('.tar.gz', ''), os.path.dirname(d['File'])) test_fname = os.path.join(dirfmt % tarname.replace('.tar.gz', ''), d['File']) with tarfile.open(d['Tarfile'], 'r:gz') as tar: print('\tExtracting %s from %s...' % (d['File'], d['Tarfile'])) tar.extract(d['File']) if not os.path.exists(parent_dir): os.makedirs(parent_dir) os.rename(d['File'], test_fname) for dirname in dirnames: with tarfile.open(dirname + '.tar.gz', 'w:gz') as tar: for sub_dirname in os.listdir(dirname): tar.add(os.path.join(dirname, sub_dirname), arcname=sub_dirname) shutil.rmtree(dirname) return examples