def test_exports_complete(td_tmpdir, env_setup, capsys, pytestconfig):
    arg_sets = [
        {"export-type": "ht-bib-full", "merge-version": "v2", "name": "full"},
        {"export-type": "ht-bib-incr", "merge-version": "v3", "name": "incr"},
    ]
    for arg_set in arg_sets:
        with pytest.raises(SystemExit) as pytest_e:
            sys.argv = [
                "",
                arg_set["export-type"],
                "-mv",
                arg_set["merge-version"],
                "--force",
                "--verbosity",
                pytestconfig.getoption("verbose"),
            ]
            generate_cli()
        assert [pytest_e.type, pytest_e.value.code] == [SystemExit, 0]
        # compare cache created to reference cache
        new_cache = ExportCache(
            td_tmpdir,
            "cache-{}-{}".format(
                arg_set["merge-version"],
                datetime.datetime.today().strftime("%Y-%m-%d"),
            ),
        )
        ref_cache = ExportCache(
            td_tmpdir, "cache-{}-ref".format(arg_set["merge-version"])
        )
        assert new_cache.size() == ref_cache.size()
        assert hash(new_cache.frozen_content_set()) == hash(
            ref_cache.frozen_content_set()
        )
        export_filename = "ht_bib_export_{}_{}.json".format(
            arg_set["name"], datetime.datetime.today().strftime("%Y-%m-%d")
        )
        assert filecmp.cmp(
            os.path.join(td_tmpdir, export_filename),
            os.path.join(
                td_tmpdir,
                "{}-ht_bib_export_{}_ref.json".format(
                    arg_set["merge-version"], arg_set["name"]
                ),
            ),
        )
def main():
    htmm_db = config["database"][config["env"]]

    HTMM_DB_CONNECT_STR = str(
        URL(
            htmm_db.get("drivername", None),
            htmm_db.get("username", None),
            htmm_db.get("password", None),
            htmm_db.get("host", None),
            htmm_db.get("port", None),
            htmm_db.get("database", None),
        )
    )
    htmm_engine = create_engine(HTMM_DB_CONNECT_STR)

    live_statement = (
        "select cid as cache_id, "
        "crc32(CONCAT(count(id), max(db_updated_at))) as cache_key "
        "from zephir_records "
        "where attr_ingest_date is not null "
        "group by cache_id"
    )

    print(datetime.datetime.time(datetime.datetime.now()))

    cache = ExportCache(os.path.abspath("cache"), "quick-complete")
    live_index = {}
    with htmm_engine.connect() as con:
        result = con.execute(live_statement)
        for row in result:
            live_index[row.cache_id] = row.cache_key

    comparison = cache.compare(live_index)
    print("uncached")
    print(len(comparison.uncached))
    print("verified")
    print(len(comparison.verified))
    print("stale")
    print(len(comparison.stale))
    print("unexamined")
    print(len(comparison.unexamined))

    print(datetime.datetime.time(datetime.datetime.now()))
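The quick comparison above only works if the live cache_key computed in SQL matches the key stored when the cache was built. In the cache-building functions further down, the key is zlib.crc32 over the cluster's record count concatenated with its latest db_updated_at, which mirrors the SQL expression crc32(CONCAT(count(id), max(db_updated_at))). A minimal sketch of the Python side of that key, with illustrative values (not from the original code):

    import datetime
    import zlib

    # illustrative cluster values: 3 records, latest update timestamp
    record_count = 3
    max_date = datetime.datetime(2016, 6, 29, 11, 9, 4)

    # Python-side key, as computed when the cache is built; this assumes
    # str(max_date) renders the same "YYYY-MM-DD HH:MM:SS" text that MySQL's
    # max(db_updated_at) produces inside CONCAT() -- if the textual forms
    # differ, the live and cached keys will not line up.
    cache_key = zlib.crc32("{}{}".format(record_count, max_date).encode("utf8"))
    print(cache_key)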
def test_add_to_cache(td_tmpdir):
    cache = ExportCache(td_tmpdir, "empty-cache")
    assert cache.size() == 0
    new_entry = {
        "cache_id": "012345",
        "cache_key": "C7EE1838",
        "cache_data": '{"leader":"01158nam a22003491 4500"}',
        "cache_date": "2016-06-29 11:09:04",
    }
    cache.add(**new_entry)
    assert cache.size() == 1
def compare_cache_cli(ctx, files, verbosity):
    """Compare export caches for content differences.
    Ignores datetime of cache creation."""
    console = ConsoleMessenger(app="ZEPHIR-EXPORT", verbosity=verbosity)
    f1_cache = ExportCache(path=set_abs_filepath(files[0]))
    f1_set = f1_cache.frozen_content_set()
    f2_cache = ExportCache(path=set_abs_filepath(files[1]))
    f2_set = f2_cache.frozen_content_set()
    if hash(f1_set) != hash(f2_set):
        for line in f1_set - f2_set:
            console.out("-(cid:{},key:{})".format(line[0], line[1]))
        for line in f2_set - f1_set:
            console.out("+(cid:{},key:{})".format(line[0], line[1]))
        console.info("Differences found between cache files")
    else:
        console.info("No differences found between cache files")
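A small, hypothetical illustration of the diff logic in compare_cache_cli, using plain frozensets in place of the cache files; the (cache_id, cache_key) tuple shape is an assumption inferred from how line[0] and line[1] are printed above:

    # two illustrative content sets; only the key for cid 67891 differs
    f1_set = frozenset([("012345", "C7EE1838"), ("67891", "R9PE2815")])
    f2_set = frozenset([("012345", "C7EE1838"), ("67891", "NEWKEY")])

    if hash(f1_set) != hash(f2_set):
        for cid, key in f1_set - f2_set:  # entries only in the first cache
            print("-(cid:{},key:{})".format(cid, key))
        for cid, key in f2_set - f1_set:  # entries only in the second cache
            print("+(cid:{},key:{})".format(cid, key))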
def test_export_with_alternate_cache_and_output(
    td_tmpdir, env_setup, capsys, pytestconfig
):
    # SETUP TODO (cscollett: there may be a better place to put this)
    # set temp current working directory
    real_cwd = os.getcwd()
    os.chdir(td_tmpdir)

    with pytest.raises(SystemExit) as pytest_e:
        sys.argv = [
            "",
            "ht-bib-full",
            "-mv",
            "v3",
            "--cache-path",
            "my_custom_cache.db",
            "--output-path",
            "my_custom_output.json",
            "--force",
            "--verbosity",
            pytestconfig.getoption("verbose"),
        ]
        generate_cli()
    assert [pytest_e.type, pytest_e.value.code] == [SystemExit, 0]
    # compare cache created to reference cache
    new_cache = ExportCache(td_tmpdir, "my_custom_cache")
    ref_cache = ExportCache(td_tmpdir, "cache-v3-ref")
    assert new_cache.size() == ref_cache.size()
    assert hash(new_cache.frozen_content_set()) == hash(
        ref_cache.frozen_content_set()
    )
    assert filecmp.cmp(
        os.path.join(td_tmpdir, "my_custom_output.json"),
        os.path.join(td_tmpdir, "v3-ht_bib_export_full_ref.json"),
    )
    # CLEANUP
    # unset temp current working directory
    os.chdir(real_cwd)
def test_loaded_true_when_cache_table_does_exists(td_tmpdir):
    exists = ExportCache(td_tmpdir, "empty-cache")
    assert exists.cache_schema_exists_on_load is True
def test_remove_set(td_tmpdir):
    cache = ExportCache(td_tmpdir, "empty-cache")
    first_entry = {
        "cache_id": "012345",
        "cache_key": "C7EE1838",
        "cache_data": '{"leader":"01158nam a22003491 4500"}',
        "cache_date": "2016-06-29 11:09:04",
    }
    second_entry = {
        "cache_id": "67891",
        "cache_key": "R9PE2815",
        "cache_data": '{"leader":"02258nam a22003491 4500"}',
        "cache_date": "2018-06-29 11:09:04",
    }
    third_entry = {
        "cache_id": "11111",
        "cache_key": "2222222",
        "cache_data": '{"follower":"02258nam a22003491 4500"}',
        "cache_date": "2017-06-29 11:09:04",
    }
    cache.add(**first_entry)
    cache.add(**second_entry)
    cache.add(**third_entry)
    assert cache.size() == 3
    cache.remove_set(["012345", "67891"])
    assert cache.size() == 1
def test_update_to_cache_key_only_without_data_update(td_tmpdir):
    cache = ExportCache(td_tmpdir, "empty-cache")
    first_entry = {
        "cache_id": "012345",
        "cache_key": "C7EE1838",
        "cache_data": '{"leader":"this is marc data"}',
        "cache_date": "2016-06-29 11:09:04",
    }
    cache.add(**first_entry)
    first_result = cache.get("012345")
    second_entry = {
        "cache_id": "012345",
        "cache_key": "NEWKEY",
        "cache_data": '{"leader":"this is marc data"}',
        "cache_date": "NEWDATE",
    }
    cache.update(**second_entry)
    assert cache.get("012345")["cache_key"] == "NEWKEY"
    assert cache.get("012345")["cache_date"] == "NEWDATE"
    assert cache.get("012345")["data_date"] == first_result["data_date"]
    assert cache.get("012345")["data_key"] == first_result["data_key"]
    assert cache.get("012345")["cache_data"] == first_result["cache_data"]
def test_loaded_false_when_cache_table_does_not_exists(td_tmpdir):
    does_not_exist = ExportCache(td_tmpdir, "does-not-exist")
    assert does_not_exist.cache_schema_exists_on_load is False
def test_cache_hash_consistent(td_tmpdir):
    cache = ExportCache(td_tmpdir, "cache")
    same_cache = ExportCache(td_tmpdir, "same-cache")
    different_cache = ExportCache(td_tmpdir, "different-cache")
    new_entry = {
        "cache_id": "012345",
        "cache_key": "C7EE1838",
        "cache_data": '{"leader":"01158nam a22003491 4500"}',
        "cache_date": "2016-06-29 11:09:04",
    }
    diff_entry = {
        "cache_id": "543210",
        "cache_key": "RB87WZ38",
        "cache_data": '{"leader":"01158nam a22003491 6500"}',
        "cache_date": "2017-12-19 12:11:02",
    }
    cache.add(**new_entry)
    same_cache.add(**new_entry)
    different_cache.add(**diff_entry)
    cache_set = cache.frozen_content_set()
    same_cache_set = same_cache.frozen_content_set()
    different_cache_set = different_cache.frozen_content_set()
    assert hash(cache_set) == hash(same_cache_set)
    assert hash(cache_set) != hash(different_cache_set)
    different_cache.add(**new_entry)
    cache.add(**diff_entry)
    cache_set = cache.frozen_content_set()
    different_cache_set = different_cache.frozen_content_set()
    assert hash(cache_set) == hash(different_cache_set)
def main(argv=None):
    # Command line argument configuration
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-s",
        "--selection",
        action="store",
        help="Selection algorithm used for export",
    )
    args = parser.parse_args()
    selection = args.selection
    if selection is None:
        # raising a bare string is invalid in Python 3; raise a proper exception
        raise ValueError("Must pass a selection algorithm to use. See --help")

    htmm_db = config["database"][config["env"]]

    HTMM_DB_CONNECT_STR = str(
        URL(
            htmm_db.get("drivername", None),
            htmm_db.get("username", None),
            htmm_db.get("password", None),
            htmm_db.get("host", None),
            htmm_db.get("port", None),
            htmm_db.get("database", None),
        )
    )
    htmm_engine = create_engine(HTMM_DB_CONNECT_STR)

    sql_select = {
        "v2": "select cid, db_updated_at, metadata_json, "
        "var_usfeddoc, var_score, concat(cid,'_',zr.autoid) as vufind_sort "
        "from zephir_records as zr "
        "inner join zephir_filedata as zf on zr.id = zf.id "
        "where attr_ingest_date is not null "
        "order by cid, var_score DESC, vufind_sort ASC",
        "v3": "select cid, db_updated_at, metadata_json, "
        "var_usfeddoc, var_score, concat(cid,'_',zr.autoid) as vufind_sort "
        "from zephir_records as zr "
        "inner join zephir_filedata as zf on zr.id = zf.id "
        "where attr_ingest_date is not null "
        "order by cid, var_usfeddoc DESC, var_score DESC, vufind_sort ASC",
    }

    start_time = datetime.datetime.now()

    live_index = {}
    max_date = None
    record_count = 0
    records = []
    htid = None
    current_cid = None

    cache = ExportCache(
        os.path.abspath(os.path.join(os.path.dirname(__file__), "cache")),
        "cache-{}-{}".format(
            selection, datetime.datetime.today().strftime("%Y-%m-%d")
        ),
    )

    try:
        bulk_session = cache.session()

        conn = mysql.connector.connect(
            user=htmm_db.get("username", None),
            password=htmm_db.get("password", None),
            host=htmm_db.get("host", None),
            database=htmm_db.get("database", None),
        )
        cursor = conn.cursor()
        cursor.execute(sql_select[selection])

        curr_cid = None
        records = []
        entries = []
        max_date = None
        for idx, row in enumerate(cursor):
            cid, db_date, record, var_usfeddoc, var_score, vufind_sort = row
            if cid != curr_cid or curr_cid is None:
                # write last cluster
                if curr_cid:
                    cache_id = curr_cid
                    cache_data = json.dumps(
                        VufindFormatter.create_record(curr_cid, records).as_dict(),
                        separators=(",", ":"),
                    )
                    cache_key = zlib.crc32(
                        "{}{}".format(len(records), max_date).encode("utf8")
                    )
                    cache_date = max_date
                    entry = cache.entry(cache_id, cache_key, cache_data, cache_date)
                    entries.append(entry)

                # prepare next cluster
                curr_cid = cid
                records = [record]
                max_date = db_date
            else:
                if db_date > max_date:
                    max_date = db_date
                records.append(record)

            # periodic save to chunk work
            if idx % 5000 == 0:
                bulk_session.bulk_save_objects(entries)
                entries = []

        # finish processing on the last cluster
        cache_id = curr_cid
        cache_data = json.dumps(
            VufindFormatter.create_record(curr_cid, records).as_dict(),
            separators=(",", ":"),
        )
        cache_key = zlib.crc32("{}{}".format(len(records), max_date).encode("utf8"))
        cache_date = max_date
        entry = cache.entry(cache_id, cache_key, cache_data, cache_date)
        entries.append(entry)

        bulk_session.bulk_save_objects(entries)
        bulk_session.commit()
        bulk_session.close()
        print(
            "Finished: {} (Elapsed: {})".format(
                selection, str(datetime.datetime.now() - start_time)
            )
        )
    finally:
        # note: cursor and conn are undefined here if the connection failed
        cursor.close()
        conn.close()
def ht_bib_cache(console, input_path=None, cache_path=None, merge_version=None, force=False):
    debug_start_time = datetime.datetime.now()

    # LOAD: environment, configuration
    default_root_dir = os.path.join(os.path.dirname(__file__), "..")
    APP = utils.AppEnv(name="ZEPHIR", root_dir=default_root_dir)
    console.debug("Loading application environment and configuration")
    console.debug("Environment: {}".format(APP.ENV))
    console.debug("Configuration: {}".format(APP.CONFIG_PATH))

    # CACHE (store for calculated/merged records for exports)
    cache_path = cache_path or APP.CACHE_PATH

    # use working directory if relative path given
    if not os.path.isabs(cache_path):
        cache_path = os.path.join(os.getcwd(), cache_path)

    # create a template file name if only a directory is given
    if os.path.isdir(cache_path):
        cache_template = "cache-{}-{}.db".format(
            merge_version, datetime.datetime.today().strftime("%Y-%m-%d")
        )
        cache_path = "{}/{}".format(cache_path, cache_template)
        console.debug("Cache template for file: {}".format(cache_template))

    # if the directory doesn't exist, fail
    if not os.path.exists(os.path.dirname(cache_path)):
        console.error("Cache path invalid")
        raise SystemExit(2)

    # handle existing files
    if os.path.exists(cache_path):
        if force:
            console.debug("Cache file exists. Forcing overwrite")
        else:
            console.debug("Using existing cache: {}".format(cache_path))
            return cache_path

    # create temporary cache
    tmp_cache_template = "tmp-cache-{}-{}".format(
        merge_version, datetime.datetime.today().strftime("%Y-%m-%d_%H%M%S.%f")
    )
    tmp_cache_path = os.path.join(os.path.dirname(cache_path), tmp_cache_template)
    console.debug("Tmp cache location: {}".format(tmp_cache_path))
    console.debug("Cache location: {}".format(cache_path))

    # create cache
    console.debug("Creating cache file, session")
    cache = ExportCache(path=tmp_cache_path, force=force)
    bulk_session = cache.session()

    try:
        # DATABASE: Access to current records
        # Load database settings
        db_settings = APP.CONFIG.get("database", {}).get(APP.ENV)
        db_config = utils.DatabaseHelper(config=db_settings, env_prefix="ZEPHIR")
        db_connection = mysql.connector.connect(**db_config.connection_args())
        db_cursor = db_connection.cursor()

        # Load merge version queries
        sql_select = {
            "v2": "select cid, db_updated_at, metadata_json, "
            "var_usfeddoc, var_score, concat(cid,'_',zr.autoid) as vufind_sort "
            "from zephir_records as zr "
            "inner join zephir_filedata as zf on zr.id = zf.id "
            "where attr_ingest_date is not null "
            "order by cid, var_score DESC, vufind_sort ASC",
            "v3": "select cid, db_updated_at, metadata_json, "
            "var_usfeddoc, var_score, concat(cid,'_',zr.autoid) as vufind_sort "
            "from zephir_records as zr "
            "inner join zephir_filedata as zf on zr.id = zf.id "
            "where attr_ingest_date is not null "
            "order by cid, var_usfeddoc DESC, var_score DESC, vufind_sort ASC",
        }

        # Execute query
        db_cursor.execute(sql_select[merge_version])

        # PROCESS: calculate/merge records from database into cache datastore
        console.debug("Processing records...")
        curr_cid = None
        records = []
        entries = []
        max_date = None
        for idx, row in enumerate(db_cursor):
            cid, db_date, record, var_usfeddoc, var_score, vufind_sort = row
            if cid != curr_cid or curr_cid is None:
                # write last cluster
                if curr_cid:
                    cache_id = curr_cid
                    cache_data = json.dumps(
                        VufindFormatter.create_record(curr_cid, records).as_dict(),
                        separators=(",", ":"),
                    )
                    cache_key = zlib.crc32(
                        "{}{}".format(len(records), max_date).encode("utf8")
                    )
                    cache_date = max_date
                    entry = cache.entry(cache_id, cache_key, cache_data, cache_date)
                    entries.append(entry)

                # prepare next cluster
                curr_cid = cid
                records = [record]
                max_date = db_date
            else:
                if db_date > max_date:
                    max_date = db_date
                records.append(record)

            # periodic save records to datastore to chunk work
            if idx % 5000 == 0:
                bulk_session.bulk_save_objects(entries)
                entries = []

        # finish processing on last chunk of work
        cache_id = curr_cid
        cache_data = json.dumps(
            VufindFormatter.create_record(curr_cid, records).as_dict(),
            separators=(",", ":"),
        )
        cache_key = zlib.crc32("{}{}".format(len(records), max_date).encode("utf8"))
        cache_date = max_date
        entry = cache.entry(cache_id, cache_key, cache_data, cache_date)
        entries.append(entry)
        bulk_session.bulk_save_objects(entries)

        console.debug("Finished processing, final commit to cache datastore")
        bulk_session.commit()
        bulk_session.close()
        os.rename(tmp_cache_path, cache_path)
    finally:
        # TODO(cc): This will fail if cursor not defined
        db_cursor.close()
        db_connection.close()

    console.debug(
        "Completed Cache: {}".format(str(datetime.datetime.now() - debug_start_time))
    )
    return cache_path
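A brief usage sketch for ht_bib_cache, assuming it is driven by a CLI wrapper along the lines of generate_cli in the tests above; the verbosity value and argument choices are illustrative assumptions, not the project's actual invocation:

    # integer verbosity mirrors how the tests pass pytest's --verbose count
    console = ConsoleMessenger(app="ZEPHIR-EXPORT", verbosity=2)
    cache_path = ht_bib_cache(
        console,
        cache_path="cache",   # a directory: a dated cache-v3-YYYY-MM-DD.db name is generated
        merge_version="v3",   # selects the v3 ordering of the merge query
        force=False,          # reuse today's existing cache file if present
    )
    console.info("Cache ready at {}".format(cache_path))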