Exemple #1
0
def test_pubmed_importer(pubmed_importer):
    """Push the PubMed sample file through the importer twice.

    The first pass runs with ``bezerk_mode`` enabled and expects 176 inserts
    plus an editgroup whose metadata mentions the importer. The second pass
    disables ``bezerk_mode``, resets the importer and expects all 176 records
    to be counted as existing, with the latest changelog index unchanged.
    """
    sample_path = 'tests/files/pubmedsample_2019.xml'

    # --- first pass: everything should be inserted ---
    index_before = pubmed_importer.api.get_changelog(limit=1)[0].index
    pubmed_importer.bezerk_mode = True
    with open(sample_path, 'r') as xml_file:
        pusher = Bs4XmlLargeFilePusher(pubmed_importer, xml_file, "PubmedArticle")
        stats = pusher.run()
    assert (stats['insert'], stats['exists'], stats['skip']) == (176, 0, 0)

    # The changelog entry right after the recorded index carries the
    # editgroup produced by the import above.
    entry = pubmed_importer.api.get_changelog_entry(index=index_before + 1)
    editgroup = entry.editgroup
    assert editgroup.description
    assert "pubmed" in editgroup.description.lower()
    assert editgroup.extra['git_rev']
    assert "fatcat_tools.PubmedImporter" in editgroup.extra['agent']

    # --- second pass: same file, nothing new should be created ---
    index_before = pubmed_importer.api.get_changelog(limit=1)[0].index
    pubmed_importer.bezerk_mode = False
    pubmed_importer.reset()
    with open(sample_path, 'r') as xml_file:
        pusher = Bs4XmlLargeFilePusher(pubmed_importer, xml_file, "PubmedArticle")
        stats = pusher.run()
    assert (stats['insert'], stats['exists'], stats['skip']) == (0, 176, 0)

    # No new changelog entry may have been written by the second pass.
    index_after = pubmed_importer.api.get_changelog(limit=1)[0].index
    assert index_before == index_after
Exemple #2
0
def test_dblp_importer(dblp_importer):
    """Import the dblp example file twice and verify counts and metadata.

    First pass (``bezerk_mode`` on): expects 3 inserts and 1 skip, checks the
    editgroup metadata, and checks the contributor names of the release with
    dblp id ``conf/er/Norrie08``. Second pass (``bezerk_mode`` off, importer
    reset): expects the 3 records to be counted as existing and the latest
    changelog index to be unchanged.
    """
    last_index = dblp_importer.api.get_changelog(limit=1)[0].index
    with open("tests/files/example_dblp.xml", "rb") as f:
        dblp_importer.bezerk_mode = True
        counts = Bs4XmlLargeFilePusher(dblp_importer,
                                       f,
                                       dblp_importer.ELEMENT_TYPES,
                                       use_lxml=True).run()
    assert counts["insert"] == 3
    assert counts["exists"] == 0
    assert counts["skip"] == 1

    # fetch most recent editgroup
    change = dblp_importer.api.get_changelog_entry(index=last_index + 1)
    eg = change.editgroup
    assert eg.description
    assert "dblp" in eg.description.lower()
    assert eg.extra["git_rev"]
    assert "fatcat_tools.DblpReleaseImporter" in eg.extra["agent"]

    # check that entity name mangling was fixed on import
    eg = dblp_importer.api.get_editgroup(eg.editgroup_id)
    # Search the editgroup's release edits for the one release we want to
    # inspect. Starting from None (instead of pre-fetching releases[0]) avoids
    # a redundant API call and makes a missing release fail with a clear
    # message rather than a mismatch against some unrelated release.
    release = None
    for r_edit in eg.edits.releases:
        candidate = dblp_importer.api.get_release(r_edit.ident)
        if candidate.ext_ids.dblp == "conf/er/Norrie08":
            release = candidate
            break
    assert release is not None, "release conf/er/Norrie08 not found in editgroup"
    assert release.ext_ids.dblp == "conf/er/Norrie08"
    assert release.contribs[0].raw_name == "Moira C. Norrie"
    assert release.contribs[1].raw_name == "Michael H. Böhlen"

    # Second pass over the same file: nothing new should be inserted and no
    # new changelog entry should appear.
    last_index = dblp_importer.api.get_changelog(limit=1)[0].index
    with open("tests/files/example_dblp.xml", "rb") as f:
        dblp_importer.bezerk_mode = False
        dblp_importer.reset()
        counts = Bs4XmlLargeFilePusher(dblp_importer,
                                       f,
                                       dblp_importer.ELEMENT_TYPES,
                                       use_lxml=True).run()
    assert counts["insert"] == 0
    assert counts["exists"] == 3
    assert counts["skip"] == 1
    assert last_index == dblp_importer.api.get_changelog(limit=1)[0].index
Exemple #3
0
def run_dblp_release(args: argparse.Namespace) -> None:
    """Run a dblp release import driven by parsed command-line arguments.

    Builds a ``DblpReleaseImporter`` from ``args.api``,
    ``args.dblp_container_map_file``, ``args.batch_size``,
    ``args.do_updates`` and ``args.dump_json_mode``, then pushes
    ``args.xml_file`` through ``Bs4XmlLargeFilePusher`` using the importer
    class's ``ELEMENT_TYPES`` with ``use_lxml=True``. The pusher's result is
    discarded.
    """
    importer = DblpReleaseImporter(
        args.api,
        dblp_container_map_file=args.dblp_container_map_file,
        edit_batch_size=args.batch_size,
        do_updates=args.do_updates,
        dump_json_mode=args.dump_json_mode,
    )
    pusher = Bs4XmlLargeFilePusher(
        importer,
        args.xml_file,
        DblpReleaseImporter.ELEMENT_TYPES,
        use_lxml=True,
    )
    pusher.run()
Exemple #4
0
def run_pubmed(args: argparse.Namespace) -> None:
    """Run a PubMed import driven by parsed command-line arguments.

    Builds a ``PubmedImporter`` from ``args.api``, ``args.issn_map_file``,
    ``args.batch_size``, ``args.do_updates`` and the negation of
    ``args.no_lookup_refs``. When ``args.kafka_mode`` is truthy the importer
    is fed by a ``KafkaBs4XmlPusher``; otherwise ``args.xml_file`` is pushed
    through ``Bs4XmlLargeFilePusher`` for ``PubmedArticle`` elements. The
    pusher's result is discarded.
    """
    importer = PubmedImporter(
        args.api,
        args.issn_map_file,
        edit_batch_size=args.batch_size,
        do_updates=args.do_updates,
        lookup_refs=(not args.no_lookup_refs),
    )

    # Select the record source, then run whichever pusher was built.
    if args.kafka_mode:
        pusher = KafkaBs4XmlPusher(
            importer,
            args.kafka_hosts,
            args.kafka_env,
            "ftp-pubmed",
            "fatcat-{}-import-pubmed".format(args.kafka_env),
        )
    else:
        pusher = Bs4XmlLargeFilePusher(
            importer,
            args.xml_file,
            ["PubmedArticle"],
        )
    pusher.run()