Example No. 1
def test_ingest_importer_web(ingest_web_importer):
    last_index = ingest_web_importer.api.get_changelog(limit=1)[0].index
    with open("tests/files/example_ingest_html.json", "r") as f:
        ingest_web_importer.bezerk_mode = True
        counts = JsonLinePusher(ingest_web_importer, f).run()
    assert counts["insert"] == 1
    assert counts["exists"] == 0
    assert counts["skip"] == 0

    # fetch most recent editgroup
    change = ingest_web_importer.api.get_changelog_entry(index=last_index + 1)
    eg = change.editgroup
    assert eg.description
    assert "crawled from web" in eg.description.lower()
    assert eg.extra["git_rev"]
    assert "fatcat_tools.IngestWebResultImporter" in eg.extra["agent"]

    # re-import should skip
    with open("tests/files/example_ingest_html.json", "r") as f:
        ingest_web_importer.reset()
        ingest_web_importer.bezerk_mode = False
        counts = JsonLinePusher(ingest_web_importer, f).run()
    assert counts["insert"] == 0
    assert counts["exists"] == 1
    assert counts["skip"] == 0
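All of these tests drive an importer through JsonLinePusher. As a minimal sketch of what that pusher pattern does (assuming the fatcat_tools importer interface of push_record() and finish(); the class name here is illustrative, not the library's exact code):

import json

class SimpleJsonLinePusher:
    """Illustrative stand-in: feed newline-delimited JSON into an importer."""

    def __init__(self, importer, json_source):
        self.importer = importer
        self.json_source = json_source

    def run(self):
        for line in self.json_source:
            if not line.strip():
                continue
            self.importer.push_record(json.loads(line))
        # finish() flushes any batched edits and returns the counts
        # dict asserted on above ("insert", "exists", "skip", ...)
        return self.importer.finish()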
Example No. 2
def test_matched_importer(matched_importer):
    last_index = matched_importer.api.get_changelog(limit=1)[0].index
    with open("tests/files/example_matched.json", "r") as f:
        matched_importer.bezerk_mode = True
        counts = JsonLinePusher(matched_importer, f).run()
    assert counts["insert"] == 2
    assert counts["exists"] == 0
    assert counts["skip"] == 11

    # fetch most recent editgroup
    change = matched_importer.api.get_changelog_entry(index=last_index + 1)
    eg = change.editgroup
    assert eg.description
    assert "file-to-release" in eg.description.lower()
    assert eg.extra["git_rev"]
    assert "fatcat_tools.MatchedImporter" in eg.extra["agent"]

    # re-insert; should skip
    with open("tests/files/example_matched.json", "r") as f:
        matched_importer.reset()
        matched_importer.bezerk_mode = False
        counts = JsonLinePusher(matched_importer, f).run()
    assert counts["insert"] == 0
    assert counts["exists"] == 2
    assert counts["skip"] == 11
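The insert/exists/skip counts asserted in these tests come from how the importer base class consumes pushed records; roughly (a sketch of the fatcat EntityImporter flow, with want(), parse_record(), try_update(), and push_entity() assumed from that interface):

def push_record(self, raw_record):
    # records the importer doesn't want (filtered input) count as "skip"
    if (not raw_record) or (not self.want(raw_record)):
        self.counts["skip"] += 1
        return
    entity = self.parse_record(raw_record)
    if not entity:
        self.counts["skip"] += 1
        return
    if self.bezerk_mode:
        # bezerk_mode bypasses the existence lookup and always inserts,
        # which is why the first pass above reports only "insert"
        self.push_entity(entity)
        return
    # try_update() checks for an existing entity; duplicates are
    # counted as "exists" rather than re-inserted on the second pass
    if self.try_update(entity):
        self.push_entity(entity)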
Example No. 3
def test_datacite_importer(datacite_importer):
    last_index = datacite_importer.api.get_changelog(limit=1)[0].index
    with open("tests/files/datacite_sample.jsonl", "r") as f:
        datacite_importer.bezerk_mode = True
        counts = JsonLinePusher(datacite_importer, f).run()
    assert counts["insert"] == 1
    assert counts["exists"] == 0
    assert counts["skip"] == 0

    # fetch most recent editgroup
    change = datacite_importer.api.get_changelog_entry(index=last_index + 1)
    eg = change.editgroup
    assert eg.description
    assert "datacite" in eg.description.lower()
    assert eg.extra["git_rev"]
    assert "fatcat_tools.DataciteImporter" in eg.extra["agent"]

    last_index = datacite_importer.api.get_changelog(limit=1)[0].index
    with open("tests/files/datacite_sample.jsonl", "r") as f:
        datacite_importer.bezerk_mode = False
        datacite_importer.reset()
        counts = JsonLinePusher(datacite_importer, f).run()
    assert counts["insert"] == 0
    assert counts["exists"] == 1
    assert counts["skip"] == 0
    assert last_index == datacite_importer.api.get_changelog(limit=1)[0].index
Example No. 4
def test_crossref_importer(crossref_importer):
    last_index = crossref_importer.api.get_changelog(limit=1)[0].index
    with open("tests/files/crossref-works.2018-01-21.badsample.json",
              "r") as f:
        crossref_importer.bezerk_mode = True
        counts = JsonLinePusher(crossref_importer, f).run()
    assert counts["insert"] == 14
    assert counts["exists"] == 0
    assert counts["skip"] == 0

    # fetch most recent editgroup
    change = crossref_importer.api.get_changelog_entry(index=last_index + 1)
    eg = change.editgroup
    assert eg.description
    assert "crossref" in eg.description.lower()
    assert eg.extra["git_rev"]
    assert "fatcat_tools.CrossrefImporter" in eg.extra["agent"]

    last_index = crossref_importer.api.get_changelog(limit=1)[0].index
    with open("tests/files/crossref-works.2018-01-21.badsample.json",
              "r") as f:
        crossref_importer.bezerk_mode = False
        crossref_importer.reset()
        counts = JsonLinePusher(crossref_importer, f).run()
    assert counts["insert"] == 0
    assert counts["exists"] == 14
    assert counts["skip"] == 0
    assert last_index == crossref_importer.api.get_changelog(limit=1)[0].index
Example No. 5
def test_ingest_importer(ingest_importer):
    last_index = ingest_importer.api.get_changelog(limit=1)[0].index
    with open('tests/files/example_ingest.json', 'r') as f:
        ingest_importer.bezerk_mode = True
        counts = JsonLinePusher(ingest_importer, f).run()
    assert counts['insert'] == 2
    assert counts['exists'] == 0
    assert counts['skip'] == 11

    # fetch most recent editgroup
    change = ingest_importer.api.get_changelog_entry(index=last_index + 1)
    eg = change.editgroup
    assert eg.description
    assert "crawled from web" in eg.description.lower()
    assert eg.extra['git_rev']
    assert "fatcat_tools.IngestFileResultImporter" in eg.extra['agent']

    # re-insert; should skip
    with open('tests/files/example_ingest.json', 'r') as f:
        ingest_importer.reset()
        ingest_importer.bezerk_mode = False
        counts = JsonLinePusher(ingest_importer, f).run()
    assert counts['insert'] == 0
    assert counts['exists'] == 2
    assert counts['skip'] == 11
Example No. 6
def test_shadow_importer(shadow_importer):
    last_index = shadow_importer.api.get_changelog(limit=1)[0].index
    with open('tests/files/example_shadow.json', 'r') as f:
        shadow_importer.bezerk_mode = True
        counts = JsonLinePusher(shadow_importer, f).run()
    assert counts['insert'] == 2
    assert counts['exists'] == 0
    assert counts['skip'] == 8

    # fetch most recent editgroup
    change = shadow_importer.api.get_changelog_entry(index=last_index + 1)
    eg = change.editgroup
    assert eg.description
    assert "shadow library" in eg.description.lower()
    assert eg.extra['git_rev']
    assert "fatcat_tools.ShadowLibraryImporter" in eg.extra['agent']

    # re-insert; should skip
    with open('tests/files/example_shadow.json', 'r') as f:
        shadow_importer.reset()
        shadow_importer.bezerk_mode = False
        counts = JsonLinePusher(shadow_importer, f).run()
    assert counts['insert'] == 0
    assert counts['exists'] == 2
    assert counts['skip'] == 8
Example No. 7
def test_fileset_importer(fileset_importer):
    last_index = fileset_importer.api.get_changelog(limit=1)[0].index
    with open("tests/files/fileset_ltjp7k2nrbes3or5h4na5qgxlu.json", "r") as f:
        fileset_importer.bezerk_mode = True
        counts = JsonLinePusher(fileset_importer, f).run()
    assert counts["insert"] == 1
    assert counts["exists"] == 0
    assert counts["skip"] == 0

    # fetch most recent editgroup
    change = fileset_importer.api.get_changelog_entry(index=last_index + 1)
    eg = change.editgroup
    assert eg.description
    assert "generic fileset" in eg.description.lower()
    assert eg.extra["git_rev"]
    assert "fatcat_tools.FilesetImporter" in eg.extra["agent"]

    # re-insert; should skip
    with open("tests/files/fileset_ltjp7k2nrbes3or5h4na5qgxlu.json", "r") as f:
        fileset_importer.reset()
        fileset_importer.bezerk_mode = False
        counts = JsonLinePusher(fileset_importer, f).run()
    assert counts["insert"] == 0
    assert counts["exists"] == 1
    assert counts["skip"] == 0
Example No. 8
def test_ingest_fileset_file_importer(ingest_fileset_file_importer):
    """
    Similar to the above, but specifically tests the 'file'/'success-file' import pathway
    """
    last_index = ingest_fileset_file_importer.api.get_changelog(
        limit=1)[0].index
    with open("tests/files/example_fileset_file_ingest_result.json", "r") as f:
        ingest_fileset_file_importer.bezerk_mode = True
        counts = JsonLinePusher(ingest_fileset_file_importer, f).run()
    assert counts["insert"] == 16
    assert counts["exists"] == 0
    assert counts["skip"] == 4
    assert counts["skip-bad-hashes"] == 4

    # fetch most recent editgroup
    change = ingest_fileset_file_importer.api.get_changelog_entry(
        index=last_index + 1)
    eg = change.editgroup
    assert eg.description
    assert "crawled from web" in eg.description.lower()
    assert eg.extra["git_rev"]
    assert "fatcat_tools.IngestFilesetFileResultImporter" in eg.extra["agent"]

    # re-insert; should skip
    with open("tests/files/example_fileset_file_ingest_result.json", "r") as f:
        ingest_fileset_file_importer.reset()
        ingest_fileset_file_importer.bezerk_mode = False
        counts = JsonLinePusher(ingest_fileset_file_importer, f).run()
    assert counts["insert"] == 0
    assert counts["exists"] == 16
    assert counts["skip"] == 4
    assert counts["skip-bad-hashes"] == 4
Example No. 9
def test_journal_metadata_importer(journal_metadata_importer):
    last_index = journal_metadata_importer.api.get_changelog(limit=1)[0].index
    with open('tests/files/journal_metadata.sample.json', 'r') as f:
        journal_metadata_importer.bezerk_mode = True
        counts = JsonLinePusher(journal_metadata_importer, f).run()
    assert counts['insert'] == 20
    assert counts['exists'] == 0
    assert counts['skip'] == 0

    # fetch most recent editgroup
    change = journal_metadata_importer.api.get_changelog_entry(
        index=last_index + 1)
    eg = change.editgroup
    assert eg.description
    assert "container" in eg.description.lower()
    assert eg.extra['git_rev']
    assert "fatcat_tools.JournalMetadataImporter" in eg.extra['agent']

    with open('tests/files/journal_metadata.sample.json', 'r') as f:
        journal_metadata_importer.reset()
        journal_metadata_importer.bezerk_mode = False
        counts = JsonLinePusher(journal_metadata_importer, f).run()
    assert counts['insert'] == 0
    assert counts['exists'] == 20
    assert counts['skip'] == 0
Example No. 10
def test_doaj_importer_existing_doi(doaj_importer):
    """
    One of the DOAJ test entities has a dummy DOI (10.123/abc); this test
    ensures that it isn't clobbered, and then that it gets updated.
    """
    with open("tests/files/example_doaj_articles.json", "r") as f:
        doaj_importer.reset()
        doaj_importer.bezerk_mode = False
        doaj_importer.do_updates = False
        doaj_importer.do_fuzzy_match = False
        counts = JsonLinePusher(doaj_importer, f).run()
    print(counts)
    assert counts["insert"] == 4
    assert counts["exists"] == 1
    assert counts["skip"] == 0
    success_changelog = doaj_importer.api.get_changelog(limit=1)[0]
    success_editgroup = doaj_importer.api.get_editgroup(
        success_changelog.editgroup_id)

    with open("tests/files/example_doaj_articles.json", "r") as f:
        doaj_importer.reset()
        doaj_importer.bezerk_mode = False
        doaj_importer.do_updates = True
        doaj_importer.do_fuzzy_match = False
        counts = JsonLinePusher(doaj_importer, f).run()
    print(counts)
    assert counts["insert"] == 0
    assert counts["exists"] == 4
    assert counts["update"] == 1
    update_changelog = doaj_importer.api.get_changelog(limit=1)[0]
    update_editgroup = doaj_importer.api.get_editgroup(
        update_changelog.editgroup_id)

    with open("tests/files/example_doaj_articles.json", "r") as f:
        doaj_importer.reset()
        doaj_importer.bezerk_mode = False
        doaj_importer.do_updates = True
        doaj_importer.do_fuzzy_match = False
        counts = JsonLinePusher(doaj_importer, f).run()
    print(counts)
    assert counts["insert"] == 0
    assert counts["exists"] == 5
    assert counts["update"] == 0

    # cleanup release entities (so other import tests work)
    eg = quick_eg(doaj_importer.api)
    for release_edit in success_editgroup.edits.releases:
        doaj_importer.api.delete_release(eg.editgroup_id, release_edit.ident)
    for release_edit in update_editgroup.edits.releases:
        print(release_edit)
        doaj_importer.api.update_release(
            eg.editgroup_id,
            release_edit.ident,
            ReleaseEntity(
                revision=release_edit.prev_revision,
                ext_ids=ReleaseExtIds(),
            ),
        )
    doaj_importer.api.accept_editgroup(eg.editgroup_id)
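The quick_eg() helper used for cleanup is not shown in these examples; a plausible minimal version (assuming the fatcat_openapi_client Editgroup model):

from fatcat_openapi_client import Editgroup

def quick_eg(api):
    # create and return a fresh, empty editgroup for follow-up edits
    return api.create_editgroup(Editgroup())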
Example No. 11
def test_dblp_container_importer(dblp_container_importer):
    last_index = dblp_container_importer.api.get_changelog(limit=1)[0].index
    output_tsv_map = io.StringIO()
    with open("tests/files/example_dblp_containers.json", "r") as f:
        dblp_container_importer.bezerk_mode = True
        dblp_container_importer.dblp_container_map_output = output_tsv_map
        counts = JsonLinePusher(dblp_container_importer, f).run()
    assert counts["insert"] == 10
    assert counts["exists"] == 0
    assert counts["skip"] == 0

    # fetch most recent editgroup
    change = dblp_container_importer.api.get_changelog_entry(
        index=last_index + 1)
    eg = change.editgroup
    assert eg.description
    assert "dblp" in eg.description.lower()
    assert eg.extra["git_rev"]
    assert "fatcat_tools.DblpContainerImporter" in eg.extra["agent"]

    # check that entity name mangling was fixed on import
    eg = dblp_container_importer.api.get_editgroup(eg.editgroup_id)
    for c_edit in eg.edits.containers:
        container = dblp_container_importer.api.get_container(c_edit.ident)
        if container.issnl == "1877-3273":
            break
    assert container.name == "Atlantis Thinking Machines"
    assert container.issnl == "1877-3273"
    assert container.container_type == "book-series"
    assert container.extra["dblp"]["prefix"] == "series/atlantis"
    assert container.extra["urls"] == [
        "http://link.springer.com/bookseries/10077"
    ]

    last_index = dblp_container_importer.api.get_changelog(limit=1)[0].index
    output_tsv_map.seek(0)
    # print(output_tsv_map.read())
    # output_tsv_map.seek(0)
    with open("tests/files/example_dblp_containers.json", "r") as f:
        dblp_container_importer.reset()
        dblp_container_importer.bezerk_mode = False
        dblp_container_importer.dblp_container_map_output = io.StringIO()
        dblp_container_importer.read_dblp_container_map_file(output_tsv_map)
        counts = JsonLinePusher(dblp_container_importer, f).run()
    print(counts)
    assert counts["insert"] == 0
    assert counts["exists"] == 10
    assert counts["skip"] == 0
    assert last_index == dblp_container_importer.api.get_changelog(
        limit=1)[0].index
Example No. 12
def test_datacite_dict_parse(datacite_importer):
    with open('tests/files/datacite_sample.jsonl', 'r') as f:
        raw = json.load(f)
        r = datacite_importer.parse_record(raw)
        # ensure the API server accepts the format
        JsonLinePusher(datacite_importer, [json.dumps(raw)]).run()

        print(r.extra)
        assert r.title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 97090"
        assert r.publisher == "International Centre for Agricultural Research in Dry Areas"
        assert r.release_type == "article"
        assert r.release_stage == "published"
        assert r.license_slug is None
        assert r.original_title is None
        assert r.ext_ids.doi == "10.18730/8dym9"
        assert r.ext_ids.isbn13 is None
        assert r.language == "en"
        assert r.subtitle is None
        assert r.release_date is None
        assert r.release_year == 1986
        assert 'subtitle' not in r.extra
        assert 'subtitle' not in r.extra['datacite']
        assert 'funder' not in r.extra
        assert 'funder' not in r.extra['datacite']
        # matched by ISSN, so shouldn't be in there
        #assert extra['container_name'] == "International Journal of Quantum Chemistry"
        assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}]
        assert len(r.abstracts) == 1
        assert len(r.abstracts[0].content) == 421
        assert len(r.contribs) == 2
        assert r.contribs[0].raw_name == "GLIS Of The ITPGRFA"
        assert r.contribs[0].given_name is None
        assert r.contribs[0].surname is None
        assert len(r.refs) == 0
Example No. 13
def run_fileset(args: argparse.Namespace) -> None:
    fmi = FilesetImporter(
        args.api,
        edit_batch_size=100,
        skip_release_fileset_check=args.skip_release_fileset_check,
    )
    JsonLinePusher(fmi, args.json_file).run()
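Handlers like run_fileset() are typically dispatched from an argparse CLI; hypothetical wiring for this one (subcommand name, flags, and API construction are illustrative, not the actual fatcat_import.py):

import argparse

from fatcat_tools import public_api  # assumed client factory

def main() -> None:
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers()
    sub = subparsers.add_parser("fileset")
    sub.set_defaults(func=run_fileset)
    sub.add_argument("json_file", type=argparse.FileType("r"))
    sub.add_argument("--skip-release-fileset-check", action="store_true")
    args = parser.parse_args()
    args.api = public_api("http://localhost:9411/v0")  # hypothetical default host
    args.func(args)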
Example No. 14
def run_files(args: argparse.Namespace) -> None:
    fmi = FileCleaner(
        args.api,
        dry_run_mode=args.dry_run,
        edit_batch_size=args.batch_size,
        editgroup_description=args.editgroup_description_override,
    )
    JsonLinePusher(fmi, args.json_file).run()
Example No. 15
def run_file_meta(args: argparse.Namespace) -> None:
    # do_updates defaults to true for this importer
    fmi = FileMetaImporter(
        args.api,
        edit_batch_size=100,
        editgroup_description=args.editgroup_description_override,
    )
    JsonLinePusher(fmi, args.json_file).run()
Example No. 16
def run_merge_files(args: argparse.Namespace) -> None:
    em = FileMerger(
        args.api,
        edit_batch_size=args.batch_size,
        dry_run_mode=args.dry_run,
        editgroup_description=args.editgroup_description_override,
    )
    JsonLinePusher(em, args.json_file).run()
Example No. 17
def test_crossref_dict_parse(crossref_importer):
    with open('tests/files/crossref-works.single.json', 'r') as f:
        # one pretty-printed JSON record, not a single line
        raw = json.loads(f.read())
        r = crossref_importer.parse_record(raw)
        # ensure the API server accepts the format
        JsonLinePusher(crossref_importer, [json.dumps(raw)]).run()

        print(r.extra)
        assert r.title == "Renormalized perturbation theory by the moment method for degenerate states: Anharmonic oscillators"
        assert r.publisher == "Wiley-Blackwell"
        assert r.release_type == "article-journal"
        assert r.release_stage == "published"
        assert r.license_slug == "CC-BY-NC-ND"
        assert r.original_title == "Renormalized perturbation theory auf deutsch"
        assert r.ext_ids.doi == "10.1002/(sici)1097-461x(1998)66:4<261::aid-qua1>3.0.co;2-t"
        assert r.ext_ids.isbn13 == "978-3-16-148410-0"
        assert r.language == "fr"
        assert r.subtitle is None
        assert 'subtitle' not in r.extra
        assert 'subtitle' not in r.extra['crossref']
        assert 'funder' not in r.extra
        assert 'funder' not in r.extra['crossref']
        # matched by ISSN, so shouldn't be in there
        #assert extra['container_name'] == "International Journal of Quantum Chemistry"
        assert r.extra['aliases'] == ["some other title"]
        assert r.extra['crossref']['archive'] == ['Portico', 'LOCKSS']
        assert len(r.contribs) == 6
        assert r.contribs[0].raw_name == "Marcelo D. Radicioni"
        assert r.contribs[0].given_name == "Marcelo D."
        assert r.contribs[0].surname == "Radicioni"
        assert r.contribs[0].index == 0
        assert r.contribs[0].extra['seq'] == "first"
        assert r.contribs[1].raw_affiliation == "Some University"
        assert r.contribs[1].extra['more_affiliations'] == ["Some Department"]
        assert r.contribs[1].role == "author"
        assert r.contribs[4].role == "editor"
        assert r.contribs[4].index is None
        assert r.contribs[4].extra is None
        assert r.contribs[5].role == "translator"
        assert r.contribs[5].index is None
        assert len(r.refs) == 25
        assert r.refs[0].key == "BIB1"
        assert r.refs[0].year == 1972
        assert r.refs[0].locator == "1734"
        assert r.refs[0].container_name == "J. Chem. Phys."
        assert r.refs[0].extra == {
            "volume": "57",
            "authors": ["Swenson"],
            "doi": "10.1063/1.1678462",
            "medium": "DVD"
        }
        assert r.refs[2].key == 'BIB3'
        assert r.refs[2].extra.get('author') is None
        assert (r.refs[2].container_name ==
                "Hypervirial Theorem's, Lecture Notes in Chemistry <3")
        assert (r.refs[3].container_name ==
                "Large Order Perturbation Theory and Summation Methods in Quantum Mechanics, Lecture Notes in Chemistry")
Example No. 18
def run_matched(args: argparse.Namespace) -> None:
    fmi = MatchedImporter(
        args.api,
        edit_batch_size=args.batch_size,
        editgroup_description=args.editgroup_description_override,
        default_link_rel=args.default_link_rel,
        default_mimetype=args.default_mimetype,
    )
    JsonLinePusher(fmi, args.json_file).run()
Example No. 19
def test_datacite_importer_huge(datacite_importer):
    last_index = datacite_importer.api.get_changelog(limit=1)[0].index
    with gzip.open('tests/files/datacite_1k_records.jsonl.gz', 'rt') as f:
        datacite_importer.bezerk_mode = True
        counts = JsonLinePusher(datacite_importer, f).run()
    assert counts['insert'] == 998
    change = datacite_importer.api.get_changelog_entry(index=last_index + 1)
    release = datacite_importer.api.get_release(change.editgroup.edits.releases[0].ident)
    assert len(release.contribs) == 3
Example No. 20
def run_merge_containers(args: argparse.Namespace) -> None:
    em = ContainerMerger(
        args.api,
        edit_batch_size=args.batch_size,
        dry_run_mode=args.dry_run,
        max_container_releases=args.max_container_releases,
        clobber_human_edited=args.clobber_human_edited,
        editgroup_description=args.editgroup_description_override,
    )
    JsonLinePusher(em, args.json_file).run()
Example No. 21
def run_dblp_container(args: argparse.Namespace) -> None:
    dci = DblpContainerImporter(
        args.api,
        args.issn_map_file,
        dblp_container_map_file=args.dblp_container_map_file,
        dblp_container_map_output=args.dblp_container_map_output,
        edit_batch_size=args.batch_size,
        do_updates=args.do_updates,
    )
    JsonLinePusher(dci, args.json_file).run()
Example No. 22
def test_crossref_importer_huge(crossref_importer):
    last_index = crossref_importer.api.get_changelog(limit=1)[0].index
    with gzip.open('tests/files/huge_crossref_doi.json.gz', 'rt') as f:
        crossref_importer.bezerk_mode = True
        line = f.readline()
        mega_blob = [line] * 95
        counts = JsonLinePusher(crossref_importer, mega_blob).run()
    assert counts['insert'] == 95
    change = crossref_importer.api.get_changelog_entry(index=last_index + 1)
    release = crossref_importer.api.get_release(change.editgroup.edits.releases[0].ident)
    assert len(release.contribs) == 1014
Example No. 23
def test_doaj_importer(doaj_importer):
    last_index = doaj_importer.api.get_changelog(limit=1)[0].index
    with open("tests/files/example_doaj_articles.json", "r") as f:
        doaj_importer.bezerk_mode = True
        doaj_importer.do_fuzzy_match = False
        counts = JsonLinePusher(doaj_importer, f).run()
    assert counts["insert"] == 5
    assert counts["exists"] == 0
    assert counts["skip"] == 0
    success_changelog = doaj_importer.api.get_changelog(limit=1)[0]
    assert last_index + 1 == success_changelog.index

    # fetch most recent editgroup
    change = doaj_importer.api.get_changelog_entry(index=last_index + 1)
    eg = change.editgroup
    assert eg.description
    assert "doaj" in eg.description.lower()
    assert eg.extra["git_rev"]
    assert "fatcat_tools.DoajArticleImporter" in eg.extra["agent"]

    last_index = doaj_importer.api.get_changelog(limit=1)[0].index
    with open("tests/files/example_doaj_articles.json", "r") as f:
        doaj_importer.bezerk_mode = False
        doaj_importer.reset()
        counts = JsonLinePusher(doaj_importer, f).run()
    assert counts["insert"] == 0
    assert counts["exists"] == 5
    assert counts["skip"] == 0
    assert last_index == doaj_importer.api.get_changelog(limit=1)[0].index

    # cleanup release entities (so other import tests work)
    success_editgroup = doaj_importer.api.get_editgroup(
        success_changelog.editgroup_id)
    eg = quick_eg(doaj_importer.api)
    for release_edit in success_editgroup.edits.releases:
        doaj_importer.api.delete_release(eg.editgroup_id, release_edit.ident)
    doaj_importer.api.accept_editgroup(eg.editgroup_id)
Example No. 24
def test_orcid_importer(orcid_importer):
    last_index = orcid_importer.api.get_changelog(limit=1)[0].index
    with open('tests/files/0000-0001-8254-7103.json', 'r') as f:
        orcid_importer.bezerk_mode = True
        counts = JsonLinePusher(orcid_importer, f).run()
    assert counts['insert'] == 1
    assert counts['exists'] == 0
    assert counts['skip'] == 0

    # fetch most recent editgroup
    change = orcid_importer.api.get_changelog_entry(index=last_index + 1)
    eg = change.editgroup
    assert eg.description
    assert "orcid" in eg.description.lower()
    assert eg.extra['git_rev']
    assert "fatcat_tools.OrcidImporter" in eg.extra['agent']

    with open('tests/files/0000-0001-8254-7103.json', 'r') as f:
        orcid_importer.reset()
        orcid_importer.bezerk_mode = False
        counts = JsonLinePusher(orcid_importer, f).run()
    assert counts['insert'] == 0
    assert counts['exists'] == 1
    assert counts['skip'] == 0
Example No. 25
def test_crossref_subtitle(crossref_importer):
    """
    Tests new subtitle field, explicitly
    """
    with open('tests/files/crossref-works.single.json', 'r') as f:
        # one pretty-printed JSON record, not a single line
        raw = json.loads(f.read())
        raw['subtitle'] = ["some bogus subtitle", "blah"]
        r = crossref_importer.parse_record(raw)
        # ensure the API server accepts the format
        JsonLinePusher(crossref_importer, [json.dumps(raw)]).run()

        print(r.extra)
        assert r.title == "Renormalized perturbation theory by the moment method for degenerate states: Anharmonic oscillators"
        assert r.subtitle == "some bogus subtitle"
        assert 'subtitle' not in r.extra
        assert 'subtitle' not in r.extra['crossref']
Example No. 26
def run_doaj_article(args: argparse.Namespace) -> None:
    dai = DoajArticleImporter(
        args.api,
        args.issn_map_file,
        edit_batch_size=args.batch_size,
        do_updates=args.do_updates,
    )
    if args.kafka_mode:
        KafkaJsonPusher(
            dai,
            args.kafka_hosts,
            args.kafka_env,
            "api-doaj",
            "fatcat-{}-import-doaj".format(args.kafka_env),
            consume_batch_size=args.batch_size,
        ).run()
    else:
        JsonLinePusher(dai, args.json_file).run()
Example No. 27
def run_crossref(args: argparse.Namespace) -> None:
    fci = CrossrefImporter(
        args.api,
        args.issn_map_file,
        edit_batch_size=args.batch_size,
        bezerk_mode=args.bezerk_mode,
    )
    if args.kafka_mode:
        KafkaJsonPusher(
            fci,
            args.kafka_hosts,
            args.kafka_env,
            "api-crossref",
            "fatcat-{}-import-crossref".format(args.kafka_env),
            consume_batch_size=args.batch_size,
        ).run()
    else:
        JsonLinePusher(fci, args.json_file).run()
Example No. 28
def run_arabesque_match(args: argparse.Namespace) -> None:
    if (args.sqlite_file
            and args.json_file) or not (args.sqlite_file or args.json_file):
        print("Supply one of --sqlite-file or --json-file")
        return
    ami = ArabesqueMatchImporter(
        args.api,
        editgroup_description=args.editgroup_description_override,
        do_updates=args.do_updates,
        require_grobid=(not args.no_require_grobid),
        extid_type=args.extid_type,
        crawl_id=args.crawl_id,
        default_link_rel=args.default_link_rel,
        edit_batch_size=args.batch_size,
    )
    if args.sqlite_file:
        SqlitePusher(ami, args.sqlite_file, "crawl_result",
                     ARABESQUE_MATCH_WHERE_CLAUSE).run()
    elif args.json_file:
        JsonLinePusher(ami, args.json_file).run()
Example No. 29
def run_savepapernow_fileset(args: argparse.Namespace) -> None:
    ifri = SavePaperNowFilesetImporter(
        args.api,
        editgroup_description=args.editgroup_description_override,
        edit_batch_size=args.batch_size,
    )
    if args.kafka_mode:
        KafkaJsonPusher(
            ifri,
            args.kafka_hosts,
            args.kafka_env,
            "ingest-file-results",
            "fatcat-{}-savepapernow-fileset-result".format(args.kafka_env),
            kafka_namespace="sandcrawler",
            consume_batch_size=args.batch_size,
            force_flush=True,
        ).run()
    else:
        JsonLinePusher(ifri, args.json_file).run()
Example No. 30
def test_file_meta_importer_basic(file_meta_importer):
    # insert two file entities
    api = file_meta_importer.api
    eg = quick_eg(file_meta_importer.api)
    # with full metadata
    f1edit = api.create_file(
        eg.editgroup_id,
        FileEntity(
            size=372121,
            md5="e1fd97475c8aa102568f5d70a1bd0c07",
            sha1="0000045687dad717ed6512e395b04ec9c00995b7",
            sha256="51bdc9e40cc175089fcb60b0b188e6cbcdcddb1ff8acbe6b039b8f8fff0afff0",
            mimetype="application/pdf",
        ))
    # partial/stub metadata
    f2edit = api.create_file(
        eg.editgroup_id,
        FileEntity(
            sha1="00000376ad49f56145721503f1eb5e6e49e779fd",
            mimetype="application/pdf",
        ))
    api.accept_editgroup(eg.editgroup_id)

    last_index = file_meta_importer.api.get_changelog(limit=1)[0].index

    with open('tests/files/example_file_meta.json', 'r') as f:
        counts = JsonLinePusher(file_meta_importer, f).run()

    assert counts['insert'] == 0
    assert counts['exists'] == 0
    assert counts['update'] == 1
    assert counts['skip-no-match'] == 4
    assert counts['skip-missing-field'] == 1
    assert counts['skip-existing-complete'] == 1

    # cleanup file entities
    eg = quick_eg(file_meta_importer.api)
    api.delete_file(eg.editgroup_id, f1edit.ident)
    api.delete_file(eg.editgroup_id, f2edit.ident)
    api.accept_editgroup(eg.editgroup_id)
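The *_importer arguments throughout these tests are pytest fixtures; a minimal sketch of how one might be wired up (API host and fixture details assumed, not the project's actual conftest.py):

import pytest
from fatcat_openapi_client import ApiClient, Configuration, DefaultApi
from fatcat_tools.importers import MatchedImporter

@pytest.fixture
def api():
    # point at a local fatcat API server used for testing
    conf = Configuration()
    conf.host = "http://localhost:9411/v0"
    return DefaultApi(ApiClient(conf))

@pytest.fixture
def matched_importer(api):
    yield MatchedImporter(api)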