def test_arabesque_importer(arabesque_importer):
    """Import the example sqlite crawl twice and check counts and editgroup metadata.

    The first pass runs with ``bezerk_mode`` enabled and is expected to insert
    one entity; the second pass (after ``reset()``, with ``bezerk_mode``
    disabled) is expected to report that same entity as already existing.
    """
    sqlite_path = 'tests/files/arabesque_example.sqlite3'
    table_name = "crawl_result"

    # Remember where the changelog stood before importing anything.
    newest_entry = arabesque_importer.api.get_changelog(limit=1)[0]
    index_before = newest_entry.index

    arabesque_importer.bezerk_mode = True
    first_pass = SqlitePusher(arabesque_importer, sqlite_path, table_name).run()
    for key, expected in (('insert', 1), ('exists', 0), ('skip', 490)):
        assert first_pass[key] == expected

    # fetch most recent editgroup
    entry = arabesque_importer.api.get_changelog_entry(index=index_before + 1)
    editgroup = entry.editgroup
    assert editgroup.description
    assert "identifier/url seedlist" in editgroup.description.lower()
    extra = editgroup.extra
    assert extra['git_rev']
    assert extra['crawl_id'] == "DUMMY123"
    assert "fatcat_tools.ArabesqueMatchImporter" in extra['agent']

    # re-insert; should skip
    arabesque_importer.reset()
    arabesque_importer.bezerk_mode = False
    second_pass = SqlitePusher(arabesque_importer, sqlite_path, table_name).run()
    for key, expected in (('insert', 0), ('exists', 1), ('skip', 490)):
        assert second_pass[key] == expected
def run_arabesque_match(args: argparse.Namespace) -> None:
    """Run the arabesque match importer over a sqlite or JSON-lines input.

    Exactly one of ``args.sqlite_file`` and ``args.json_file`` must be set.
    A sqlite input is read from the ``crawl_result`` table via
    ``SqlitePusher`` (with ``ARABESQUE_MATCH_WHERE_CLAUSE`` passed along);
    a JSON input is fed through ``JsonLinePusher``.

    Args:
        args: Parsed command-line namespace. Reads ``api``,
            ``editgroup_description_override``, ``do_updates``,
            ``no_require_grobid``, ``extid_type``, ``crawl_id``,
            ``default_link_rel``, ``batch_size``, ``sqlite_file`` and
            ``json_file``.

    Raises:
        SystemExit: If both or neither of ``--sqlite-file`` /
            ``--json-file`` were supplied. The usage message is carried as
            the exit value, so the interpreter prints it to stderr and exits
            with a non-zero status.
    """
    # Exactly one input source is required. Abort instead of merely printing
    # the message: continuing would silently prefer the sqlite file when both
    # are given, or do nothing (and exit 0) when neither is given.
    if bool(args.sqlite_file) == bool(args.json_file):
        raise SystemExit("Supply one of --sqlite-file or --json-file")

    ami = ArabesqueMatchImporter(
        args.api,
        editgroup_description=args.editgroup_description_override,
        do_updates=args.do_updates,
        # The CLI flag is negative ("no require"); the importer takes the positive form.
        require_grobid=(not args.no_require_grobid),
        extid_type=args.extid_type,
        crawl_id=args.crawl_id,
        default_link_rel=args.default_link_rel,
        edit_batch_size=args.batch_size,
    )
    if args.sqlite_file:
        SqlitePusher(ami, args.sqlite_file, "crawl_result", ARABESQUE_MATCH_WHERE_CLAUSE).run()
    elif args.json_file:
        JsonLinePusher(ami, args.json_file).run()
def test_arabesque_importer_basic(arabesque_importer):
    """Smoke test: push the example sqlite ``crawl_result`` table through the importer."""
    pusher = SqlitePusher(
        arabesque_importer,
        'tests/files/arabesque_example.sqlite3',
        "crawl_result",
    )
    pusher.run()