def test_pubmed_importer(pubmed_importer):
    """Import the PubMed sample file twice and check the second run is a no-op.

    The first pass runs with ``bezerk_mode`` enabled and must insert all 176
    records; the changelog entry following the starting index is then checked
    for editgroup metadata. The second pass runs with ``bezerk_mode`` disabled
    after ``reset()`` and must report every record as existing without adding
    a changelog entry.
    """
    last_index = pubmed_importer.api.get_changelog(limit=1)[0].index
    with open('tests/files/pubmedsample_2019.xml', 'r') as f:
        pubmed_importer.bezerk_mode = True
        # Record tags are passed as a list, like the other
        # Bs4XmlLargeFilePusher call sites; a bare string would make any
        # membership test against it a substring match.
        counts = Bs4XmlLargeFilePusher(pubmed_importer, f, ["PubmedArticle"]).run()
    assert counts['insert'] == 176
    assert counts['exists'] == 0
    assert counts['skip'] == 0

    # fetch most recent editgroup
    change = pubmed_importer.api.get_changelog_entry(index=last_index + 1)
    eg = change.editgroup
    assert eg.description
    assert "pubmed" in eg.description.lower()
    assert eg.extra['git_rev']
    assert "fatcat_tools.PubmedImporter" in eg.extra['agent']

    # second pass over the same file: nothing new may be inserted
    last_index = pubmed_importer.api.get_changelog(limit=1)[0].index
    with open('tests/files/pubmedsample_2019.xml', 'r') as f:
        pubmed_importer.bezerk_mode = False
        pubmed_importer.reset()
        counts = Bs4XmlLargeFilePusher(pubmed_importer, f, ["PubmedArticle"]).run()
    assert counts['insert'] == 0
    assert counts['exists'] == 176
    assert counts['skip'] == 0
    # no new changelog entry was created by the second pass
    assert last_index == pubmed_importer.api.get_changelog(limit=1)[0].index
def test_dblp_importer(dblp_importer):
    """Import the example dblp file twice and check the second run is a no-op.

    The first pass runs with ``bezerk_mode`` enabled and must insert 3 records
    and skip 1; the resulting editgroup metadata and the contributor names of
    one known release are then verified. The second pass runs with
    ``bezerk_mode`` disabled after ``reset()`` and must report the same 3
    records as existing without adding a changelog entry.
    """
    last_index = dblp_importer.api.get_changelog(limit=1)[0].index
    with open("tests/files/example_dblp.xml", "rb") as f:
        dblp_importer.bezerk_mode = True
        counts = Bs4XmlLargeFilePusher(
            dblp_importer, f, dblp_importer.ELEMENT_TYPES, use_lxml=True
        ).run()
    assert counts["insert"] == 3
    assert counts["exists"] == 0
    assert counts["skip"] == 1

    # fetch most recent editgroup
    change = dblp_importer.api.get_changelog_entry(index=last_index + 1)
    eg = change.editgroup
    assert eg.description
    assert "dblp" in eg.description.lower()
    assert eg.extra["git_rev"]
    assert "fatcat_tools.DblpReleaseImporter" in eg.extra["agent"]

    # check that entity name mangling was fixed on import
    eg = dblp_importer.api.get_editgroup(eg.editgroup_id)
    # Fetch releases lazily and stop at the first match, so no more API calls
    # are made than necessary.
    fetched = (
        dblp_importer.api.get_release(r_edit.ident) for r_edit in eg.edits.releases
    )
    release = next(
        (rel for rel in fetched if rel.ext_ids.dblp == "conf/er/Norrie08"),
        None,
    )
    assert release is not None, "release conf/er/Norrie08 not found in editgroup"
    assert release.contribs[0].raw_name == "Moira C. Norrie"
    assert release.contribs[1].raw_name == "Michael H. Böhlen"

    # second pass over the same file: nothing new may be inserted
    last_index = dblp_importer.api.get_changelog(limit=1)[0].index
    with open("tests/files/example_dblp.xml", "rb") as f:
        dblp_importer.bezerk_mode = False
        dblp_importer.reset()
        counts = Bs4XmlLargeFilePusher(
            dblp_importer, f, dblp_importer.ELEMENT_TYPES, use_lxml=True
        ).run()
    assert counts["insert"] == 0
    assert counts["exists"] == 3
    assert counts["skip"] == 1
    # no new changelog entry was created by the second pass
    assert last_index == dblp_importer.api.get_changelog(limit=1)[0].index
def run_dblp_release(args: argparse.Namespace) -> None:
    """Build a DblpReleaseImporter from CLI arguments and run it on an XML file.

    The importer is configured from ``args.api``,
    ``args.dblp_container_map_file``, ``args.batch_size``, ``args.do_updates``
    and ``args.dump_json_mode``. ``args.xml_file`` is then processed by
    ``Bs4XmlLargeFilePusher`` with ``DblpReleaseImporter.ELEMENT_TYPES`` and
    ``use_lxml=True``.
    """
    importer = DblpReleaseImporter(
        args.api,
        dblp_container_map_file=args.dblp_container_map_file,
        edit_batch_size=args.batch_size,
        do_updates=args.do_updates,
        dump_json_mode=args.dump_json_mode,
    )
    pusher = Bs4XmlLargeFilePusher(
        importer,
        args.xml_file,
        DblpReleaseImporter.ELEMENT_TYPES,
        use_lxml=True,
    )
    pusher.run()
def run_pubmed(args: argparse.Namespace) -> None:
    """Build a PubmedImporter from CLI arguments and feed it records.

    When ``args.kafka_mode`` is truthy the importer is driven by
    ``KafkaBs4XmlPusher``; otherwise ``args.xml_file`` is processed by
    ``Bs4XmlLargeFilePusher`` with ``["PubmedArticle"]`` as the tag list.
    """
    importer = PubmedImporter(
        args.api,
        args.issn_map_file,
        edit_batch_size=args.batch_size,
        do_updates=args.do_updates,
        lookup_refs=(not args.no_lookup_refs),
    )

    # File-based import is the default path when Kafka mode is not requested.
    if not args.kafka_mode:
        file_pusher = Bs4XmlLargeFilePusher(
            importer,
            args.xml_file,
            ["PubmedArticle"],
        )
        file_pusher.run()
        return

    kafka_pusher = KafkaBs4XmlPusher(
        importer,
        args.kafka_hosts,
        args.kafka_env,
        "ftp-pubmed",
        "fatcat-{}-import-pubmed".format(args.kafka_env),
    )
    kafka_pusher.run()