Exemple #1
0
def test_arabesque_importer(arabesque_importer):
    """Push the example crawl database twice and check the reported counters.

    The first pass runs with ``bezerk_mode`` enabled and is expected to
    report one insert; the second pass runs after ``reset()`` with
    ``bezerk_mode`` disabled and is expected to report that same record as
    already existing. The editgroup recorded by the first pass is also
    checked for its description and ``extra`` metadata.
    """

    def push_example_db():
        # Both passes read the same fixture file and the same table.
        pusher = SqlitePusher(
            arabesque_importer,
            'tests/files/arabesque_example.sqlite3',
            "crawl_result",
        )
        return pusher.run()

    # Remember where the changelog stood before importing anything.
    newest_entries = arabesque_importer.api.get_changelog(limit=1)
    last_index = newest_entries[0].index

    arabesque_importer.bezerk_mode = True
    counts = push_example_db()
    for counter, expected in (('insert', 1), ('exists', 0), ('skip', 490)):
        assert counts[counter] == expected

    # fetch most recent editgroup: the changelog entry right after the one
    # seen before the import
    entry = arabesque_importer.api.get_changelog_entry(index=last_index + 1)
    editgroup = entry.editgroup
    assert editgroup.description
    assert "identifier/url seedlist" in editgroup.description.lower()
    extra = editgroup.extra
    assert extra['git_rev']
    assert extra['crawl_id'] == "DUMMY123"
    assert "fatcat_tools.ArabesqueMatchImporter" in extra['agent']

    # re-insert; should skip
    arabesque_importer.reset()
    arabesque_importer.bezerk_mode = False
    counts = push_example_db()
    for counter, expected in (('insert', 0), ('exists', 1), ('skip', 490)):
        assert counts[counter] == expected
Exemple #2
0
def run_arabesque_match(args: argparse.Namespace) -> None:
    """Run an Arabesque match import from a SQLite file or a JSON-lines file.

    Exactly one of ``args.sqlite_file`` and ``args.json_file`` must be set.
    The importer is configured from the remaining attributes of ``args``
    (``api``, ``editgroup_description_override``, ``do_updates``,
    ``no_require_grobid``, ``extid_type``, ``crawl_id``,
    ``default_link_rel``, ``batch_size``).

    Args:
        args: Parsed command-line arguments.

    Raises:
        SystemExit: If both or neither of the two input files are supplied.
    """
    # Exactly one input source is required. Previously the message was
    # printed but execution continued: with both options the JSON file was
    # silently ignored, with neither the command did nothing. Abort before
    # building the importer instead; SystemExit with a string argument
    # writes the message to stderr and exits with status 1.
    if bool(args.sqlite_file) == bool(args.json_file):
        raise SystemExit("Supply one of --sqlite-file or --json-file")

    ami = ArabesqueMatchImporter(
        args.api,
        editgroup_description=args.editgroup_description_override,
        do_updates=args.do_updates,
        require_grobid=(not args.no_require_grobid),
        extid_type=args.extid_type,
        crawl_id=args.crawl_id,
        default_link_rel=args.default_link_rel,
        edit_batch_size=args.batch_size,
    )
    if args.sqlite_file:
        SqlitePusher(ami, args.sqlite_file, "crawl_result",
                     ARABESQUE_MATCH_WHERE_CLAUSE).run()
    else:
        # The guard above guarantees json_file is set on this branch.
        JsonLinePusher(ami, args.json_file).run()
Exemple #3
0
def test_arabesque_importer_basic(arabesque_importer):
    """Smoke test: push the example crawl database through the importer.

    Makes no assertions; the test passes as long as the run raises nothing.
    """
    pusher = SqlitePusher(
        arabesque_importer,
        'tests/files/arabesque_example.sqlite3',
        "crawl_result",
    )
    pusher.run()