Example #1
def test_exports_complete(td_tmpdir, env_setup, capsys, pytestconfig):
    arg_sets = [
        {
            "export-type": "ht-bib-full",
            "merge-version": "v2",
            "name": "full"
        },
        {
            "export-type": "ht-bib-incr",
            "merge-version": "v3",
            "name": "incr"
        },
    ]
    for arg_set in arg_sets:
        with pytest.raises(SystemExit) as pytest_e:
            sys.argv = [
                "",
                arg_set["export-type"],
                "-mv",
                arg_set["merge-version"],
                "--force",
                "--verbosity",
                str(pytestconfig.getoption("verbose")),
            ]

            generate_cli()

        assert [pytest_e.type, pytest_e.value.code] == [SystemExit, 0]
        # compare cache created to reference cache
        new_cache = ExportCache(
            td_tmpdir,
            "cache-{}-{}".format(
                arg_set["merge-version"],
                datetime.datetime.today().strftime("%Y-%m-%d")),
        )
        ref_cache = ExportCache(
            td_tmpdir, "cache-{}-ref".format(arg_set["merge-version"]))
        assert new_cache.size() == ref_cache.size()
        assert hash(new_cache.frozen_content_set()) == hash(
            ref_cache.frozen_content_set())
        export_filename = "ht_bib_export_{}_{}.json".format(
            arg_set["name"],
            datetime.datetime.today().strftime("%Y-%m-%d"))
        assert filecmp.cmp(
            os.path.join(td_tmpdir, export_filename),
            os.path.join(
                td_tmpdir,
                "{}-ht_bib_export_{}_ref.json".format(arg_set["merge-version"],
                                                      arg_set["name"]),
            ),
        )
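These tests depend on the `td_tmpdir` and `env_setup` fixtures, which are not shown on this page. A minimal sketch of what a `td_tmpdir`-style fixture could look like; the data layout and use of `request.module` are assumptions, not the project's actual conftest:

import os
import shutil

import pytest


@pytest.fixture
def td_tmpdir(tmp_path, request):
    # Hypothetical fixture: copy per-module reference data (caches, reference
    # exports) into an isolated temporary directory for each test.
    data_dir = os.path.join(os.path.dirname(__file__), "data",
                            request.module.__name__)
    if os.path.isdir(data_dir):
        shutil.copytree(data_dir, str(tmp_path), dirs_exist_ok=True)
    return str(tmp_path)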
Example #2
def main():
    htmm_db = config["database"][config["env"]]

    HTMM_DB_CONNECT_STR = str(
        URL(
            htmm_db.get("drivername", None),
            htmm_db.get("username", None),
            htmm_db.get("password", None),
            htmm_db.get("host", None),
            htmm_db.get("port", None),
            htmm_db.get("database", None),
        ))
    htmm_engine = create_engine(HTMM_DB_CONNECT_STR)
    live_statement = (
        "select cid as cache_id, "
        "crc32(CONCAT(count(id), max(db_updated_at))) as cache_key "
        "from zephir_records "
        "where attr_ingest_date is not null "
        "group by cache_id")

    print(datetime.datetime.time(datetime.datetime.now()))
    cache = ExportCache(os.path.abspath("cache"), "quick-complete")
    live_index = {}
    with htmm_engine.connect() as con:
        result = con.execute(live_statement)
        for row in result:
            live_index[row.cache_id] = row.cache_key
    comparison = cache.compare(live_index)
    print("uncached")
    print(len(comparison.uncached))
    print("verified")
    print(len(comparison.verified))
    print("stale")
    print(len(comparison.stale))
    print("unexamined")
    print(len(comparison.unexamined))
    print(datetime.datetime.time(datetime.datetime.now()))
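`main()` assumes a module-level `config` mapping with an `env` key and per-environment database settings. A minimal sketch of loading such a mapping from a JSON file; the file name and exact layout are assumptions:

import json
import os

# Hypothetical loader; the real project may use a different format/location.
config_file = os.path.join(os.path.dirname(__file__), "config.json")
with open(config_file) as f:
    config = json.load(f)

# Assumed shape:
# {
#   "env": "dev",
#   "database": {
#     "dev": {"drivername": "mysql+mysqlconnector", "username": "...",
#             "password": "...", "host": "localhost", "port": 3306,
#             "database": "htmm"}
#   }
# }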
def test_add_to_cache(td_tmpdir):
    cache = ExportCache(td_tmpdir, "empty-cache")
    assert cache.size() == 0
    new_entry = {
        "cache_id": "012345",
        "cache_key": "C7EE1838",
        "cache_data": '{"leader":"01158nam a22003491  4500"}',
        "cache_date": "2016-06-29 11:09:04",
    }
    cache.add(**new_entry)
    assert cache.size() == 1
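Other tests on this page read entries back with `cache.get()`; a short follow-up check in the same style (an illustration, not part of the original test):

entry = cache.get("012345")
assert entry["cache_key"] == "C7EE1838"
assert entry["cache_data"] == '{"leader":"01158nam a22003491  4500"}'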
Example #4
def compare_cache_cli(ctx, files, verbosity):
    """Compare export caches for content differences. Ignores datetime of cache creation."""
    console = ConsoleMessenger(app="ZEPHIR-EXPORT", verbosity=verbosity)
    f1_cache = ExportCache(path=set_abs_filepath(files[0]))
    f1_set = f1_cache.frozen_content_set()
    f2_cache = ExportCache(path=set_abs_filepath(files[1]))
    f2_set = f2_cache.frozen_content_set()
    if hash(f1_set) != hash(f2_set):
        for line in f1_set - f2_set:
            console.out("-(cid:{},key:{})".format(line[0], line[1]))
        for line in f2_set - f1_set:
            console.out("+(cid:{},key:{})".format(line[0], line[1]))
        console.info("Differences found between cache files")
    else:
        console.info("No differences found between cache files")
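The decorator wiring for this command is not shown here. The `(ctx, files, verbosity)` signature suggests Click; a plausible setup would look roughly like the following, though the argument and option names are assumptions, not the project's actual CLI definition:

import click

# Hypothetical wiring for the command above.
@click.command()
@click.argument("files", nargs=2, type=click.Path(exists=True))
@click.option("--verbosity", default=1, type=int)
@click.pass_context
def compare_cache_cli(ctx, files, verbosity):
    ...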
Example #5
def test_export_with_alternate_cache_and_output(td_tmpdir, env_setup, capsys,
                                                pytestconfig):
    # SETUP TODO (cscollett: there may be a better place to put this)
    # set temp current working directory
    real_cwd = os.getcwd()
    os.chdir(td_tmpdir)

    with pytest.raises(SystemExit) as pytest_e:
        sys.argv = [
            "",
            "ht-bib-full",
            "-mv",
            "v3",
            "--cache-path",
            "my_custom_cache.db",
            "--output-path",
            "my_custom_output.json",
            "--force",
            "--verbosity",
            str(pytestconfig.getoption("verbose")),
        ]

        generate_cli()

    assert [pytest_e.type, pytest_e.value.code] == [SystemExit, 0]
    # compare cache created to reference cache
    new_cache = ExportCache(td_tmpdir, "my_custom_cache")
    ref_cache = ExportCache(td_tmpdir, "cache-v3-ref")
    assert new_cache.size() == ref_cache.size()
    assert hash(new_cache.frozen_content_set()) == hash(
        ref_cache.frozen_content_set())
    assert filecmp.cmp(
        os.path.join(td_tmpdir, "my_custom_output.json"),
        os.path.join(td_tmpdir, "v3-ht_bib_export_full_ref.json"),
    )
    # CLEANUP
    # unset temp current working directory
    os.chdir(real_cwd)
def test_loaded_true_when_cache_table_does_exists(td_tmpdir):
    exists = ExportCache(td_tmpdir, "empty-cache")
    assert exists.cache_schema_exists_on_load is True
def test_remove_set(td_tmpdir):
    cache = ExportCache(td_tmpdir, "empty-cache")
    first_entry = {
        "cache_id": "012345",
        "cache_key": "C7EE1838",
        "cache_data": '{"leader":"01158nam a22003491  4500"}',
        "cache_date": "2016-06-29 11:09:04",
    }
    second_entry = {
        "cache_id": "67891",
        "cache_key": "R9PE2815",
        "cache_data": '{"leader":"02258nam a22003491  4500"}',
        "cache_date": "2018-06-29 11:09:04",
    }
    third_entry = {
        "cache_id": "11111",
        "cache_key": "2222222",
        "cache_data": '{"follower":"02258nam a22003491  4500"}',
        "cache_date": "2017-06-29 11:09:04",
    }
    cache.add(**first_entry)
    cache.add(**second_entry)
    cache.add(**third_entry)
    assert cache.size() == 3
    cache.remove_set(["012345", "67891"])
    assert cache.size() == 1
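    # A natural follow-up check, using the cache.get() API shown in the other
    # tests (an illustration only; the third entry is the one that remains):
    remaining = cache.get("11111")
    assert remaining["cache_key"] == "2222222"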
def test_update_to_cache_key_only_without_data_update(td_tmpdir):
    cache = ExportCache(td_tmpdir, "empty-cache")
    first_entry = {
        "cache_id": "012345",
        "cache_key": "C7EE1838",
        "cache_data": '{"leader":"this is marc data"}',
        "cache_date": "2016-06-29 11:09:04",
    }
    cache.add(**first_entry)
    first_result = cache.get("012345")
    second_entry = {
        "cache_id": "012345",
        "cache_key": "NEWKEY",
        "cache_data": '{"leader":"this is marc data"}',
        "cache_date": "NEWDATE",
    }
    cache.update(**second_entry)
    assert cache.get("012345")["cache_key"] == "NEWKEY"
    assert cache.get("012345")["cache_date"] == "NEWDATE"
    assert cache.get("012345")["data_date"] == first_result["data_date"]
    assert cache.get("012345")["data_key"] == first_result["data_key"]
    assert cache.get("012345")["cache_data"] == first_result["cache_data"]
def test_loaded_false_when_cache_table_does_not_exists(td_tmpdir):
    does_not_exist = ExportCache(td_tmpdir, "does-not-exist")
    assert does_not_exist.cache_schema_exists_on_load is False
def test_cache_hash_consistent(td_tmpdir):
    cache = ExportCache(td_tmpdir, "cache")
    same_cache = ExportCache(td_tmpdir, "same-cache")
    different_cache = ExportCache(td_tmpdir, "different-cache")
    new_entry = {
        "cache_id": "012345",
        "cache_key": "C7EE1838",
        "cache_data": '{"leader":"01158nam a22003491  4500"}',
        "cache_date": "2016-06-29 11:09:04",
    }
    diff_entry = {
        "cache_id": "543210",
        "cache_key": "RB87WZ38",
        "cache_data": '{"leader":"01158nam a22003491  6500"}',
        "cache_date": "2017-12-19 12:11:02",
    }
    cache.add(**new_entry)
    same_cache.add(**new_entry)
    different_cache.add(**diff_entry)
    cache_set = cache.frozen_content_set()
    same_cache_set = same_cache.frozen_content_set()
    different_cache_set = different_cache.frozen_content_set()
    assert hash(cache_set) == hash(same_cache_set)
    assert hash(cache_set) != hash(different_cache_set)
    different_cache.add(**new_entry)
    cache.add(**diff_entry)
    cache_set = cache.frozen_content_set()
    different_cache_set = different_cache.frozen_content_set()
    assert hash(cache_set) == hash(different_cache_set)
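These hash comparisons rely on `frozen_content_set()` returning a hashable, order-independent snapshot of the cache contents; judging from `compare_cache_cli` above, each element indexes as `(cache_id, cache_key)`. A minimal stand-alone illustration of that property (the tuple layout is an assumption):

# Two snapshots with the same members hash equally regardless of insert order.
snapshot_a = frozenset([("012345", "C7EE1838"), ("543210", "RB87WZ38")])
snapshot_b = frozenset([("543210", "RB87WZ38"), ("012345", "C7EE1838")])
assert hash(snapshot_a) == hash(snapshot_b)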
def main(argv=None):
    # Command line argument configuration
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-s",
        "--selection",
        action="store",
        help="Selection algorithm used for export",
    )
    args = parser.parse_args()
    selection = args.selection
    if selection is None:
        raise ValueError("Must pass a selection algorithm to use. See --help")

    htmm_db = config["database"][config["env"]]

    HTMM_DB_CONNECT_STR = str(
        URL(
            htmm_db.get("drivername", None),
            htmm_db.get("username", None),
            htmm_db.get("password", None),
            htmm_db.get("host", None),
            htmm_db.get("port", None),
            htmm_db.get("database", None),
        )
    )
    htmm_engine = create_engine(HTMM_DB_CONNECT_STR)

    sql_select = {
        "v2": "select cid, db_updated_at, metadata_json, "
        "var_usfeddoc, var_score, concat(cid,'_',zr.autoid) as vufind_sort  "
        "from zephir_records as zr "
        "inner join zephir_filedata as zf on zr.id = zf.id "
        "where attr_ingest_date is not null "
        "order by cid, var_score DESC, vufind_sort ASC",
        "v3": "select cid, db_updated_at, metadata_json, "
        "var_usfeddoc, var_score, concat(cid,'_',zr.autoid) as vufind_sort  "
        "from zephir_records as zr "
        "inner join zephir_filedata as zf on zr.id = zf.id "
        "where attr_ingest_date is not null "
        "order by cid, var_usfeddoc DESC, var_score DESC, vufind_sort ASC",
    }
    start_time = datetime.datetime.now()
    live_index = {}
    max_date = None
    record_count = 0
    records = []
    htid = None
    current_cid = None

    cache = ExportCache(
        os.path.abspath(os.path.join(os.path.dirname(__file__), "cache")),
        "cache-{}-{}".format(selection, datetime.datetime.today().strftime("%Y-%m-%d")),
    )

    try:
        bulk_session = cache.session()
        conn = mysql.connector.connect(
            user=htmm_db.get("username", None),
            password=htmm_db.get("password", None),
            host=htmm_db.get("host", None),
            database=htmm_db.get("database", None),
        )

        cursor = conn.cursor()
        cursor.execute(sql_select[selection])

        curr_cid = None
        records = []
        entries = []
        max_date = None
        for idx, row in enumerate(cursor):
            cid, db_date, record, var_usfeddoc, var_score, vufind_sort = row
            if cid != curr_cid or curr_cid is None:
                # write last cluster
                if curr_cid:
                    cache_id = curr_cid
                    cache_data = json.dumps(
                        VufindFormatter.create_record(curr_cid, records).as_dict(),
                        separators=(",", ":"),
                    )
                    cache_key = zlib.crc32(
                        "{}{}".format(len(records), max_date).encode("utf8")
                    )
                    cache_date = max_date
                    entry = cache.entry(cache_id, cache_key, cache_data, cache_date)
                    entries.append(entry)

                # prepare next cluster
                curr_cid = cid
                records = [record]
                max_date = db_date
            else:
                if db_date > max_date:
                    max_date = db_date
                records.append(record)

            # periodic save to chunk work
            if idx % 5000 == 0:
                bulk_session.bulk_save_objects(entries)
                entries = []

        cache_id = curr_cid
        cache_data = json.dumps(
            VufindFormatter.create_record(curr_cid, records).as_dict(),
            separators=(",", ":"),
        )
        cache_key = zlib.crc32("{}{}".format(len(records), max_date).encode("utf8"))
        cache_date = max_date
        entry = cache.entry(cache_id, cache_key, cache_data, cache_date)
        entries.append(entry)
        bulk_session.bulk_save_objects(entries)
        bulk_session.commit()
        bulk_session.close()
        print(
            "Finished: {} (Elapsed: {})".format(
                selection, str(datetime.datetime.now() - start_time)
            )
        )
    finally:
        cursor.close()
        conn.close()
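The per-cluster cache key is CRC32 over the record count concatenated with the latest `db_updated_at`, mirroring the SQL expression `crc32(CONCAT(count(id), max(db_updated_at)))` used in Example #2. A minimal worked illustration with made-up values:

import zlib

records = ["rec_a", "rec_b", "rec_c"]        # hypothetical cluster contents
max_date = "2016-06-29 11:09:04"             # hypothetical latest db_updated_at
cache_key = zlib.crc32("{}{}".format(len(records), max_date).encode("utf8"))
print(cache_key)  # unsigned 32-bit checksum; changes if the count or max date changes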
Example #12
def ht_bib_cache(console,
                 input_path=None,
                 cache_path=None,
                 merge_version=None,
                 force=False):
    debug_start_time = datetime.datetime.now()

    # LOAD: environment, configuration
    default_root_dir = os.path.join(os.path.dirname(__file__), "..")
    APP = utils.AppEnv(name="ZEPHIR", root_dir=default_root_dir)
    console.debug("Loading application environment and configuration")
    console.debug("Environment: {}".format(APP.ENV))
    console.debug("Configuration: {}".format(APP.CONFIG_PATH))

    # CACHE (store for calculated/merged records for exports)
    cache_path = cache_path or APP.CACHE_PATH
    # use working directory if relative path given
    if not os.path.isabs(cache_path):
        cache_path = os.path.join(os.getcwd(), cache_path)

    # create a template file name if only a directory is given
    if os.path.isdir(cache_path):
        cache_template = "cache-{}-{}.db".format(
            merge_version,
            datetime.datetime.today().strftime("%Y-%m-%d"))
        cache_path = "{}/{}".format(cache_path, cache_template)
        console.debug("Cache template for file:{}".format(cache_template))

    # if the directory doesn't exist, fail
    if not os.path.exists(os.path.dirname(cache_path)):
        console.error("Cache path invalid")
        raise SystemExit(2)

    # handle existing files
    if os.path.exists(cache_path):
        if force:
            console.debug("Cache file exist. Forcing overwrite")
        else:
            console.debug("Using existing cache: {}".format(cache_path))
            return cache_path

    #  create temporary cache
    tmp_cache_template = "tmp-cache-{}-{}".format(
        merge_version,
        datetime.datetime.today().strftime("%Y-%m-%d_%H%M%S.%f"))
    tmp_cache_path = os.path.join(os.path.dirname(cache_path),
                                  tmp_cache_template)
    console.debug("Tmp cache location: {}".format(tmp_cache_path))
    console.debug("Cache location: {}".format(cache_path))

    # create cache
    console.debug("Creating cache file, session")
    cache = ExportCache(path=tmp_cache_path, force=force)
    bulk_session = cache.session()

    try:
        # DATABASE: Access to current records
        # Load database settings
        db_settings = APP.CONFIG.get("database", {}).get(APP.ENV)
        db_config = utils.DatabaseHelper(config=db_settings,
                                         env_prefix="ZEPHIR")
        db_connection = mysql.connector.connect(**db_config.connection_args())
        db_cursor = db_connection.cursor()

        # Load merge version queries
        sql_select = {
            "v2":
            "select cid, db_updated_at, metadata_json, "
            "var_usfeddoc, var_score, concat(cid,'_',zr.autoid) as vufind_sort  "
            "from zephir_records as zr "
            "inner join zephir_filedata as zf on zr.id = zf.id "
            "where attr_ingest_date is not null "
            "order by cid, var_score DESC, vufind_sort ASC",
            "v3":
            "select cid, db_updated_at, metadata_json, "
            "var_usfeddoc, var_score, concat(cid,'_',zr.autoid) as vufind_sort  "
            "from zephir_records as zr "
            "inner join zephir_filedata as zf on zr.id = zf.id "
            "where attr_ingest_date is not null "
            "order by cid, var_usfeddoc DESC, var_score DESC, vufind_sort ASC",
        }

        # Execute query
        db_cursor.execute(sql_select[merge_version])

        # PROCESS: calculate/merge records from database into cache datastore
        console.debug("Processing records...")
        curr_cid = None
        records = []
        entries = []
        max_date = None
        for idx, row in enumerate(db_cursor):
            cid, db_date, record, var_usfeddoc, var_score, vufind_sort = row
            if cid != curr_cid or curr_cid is None:
                # write last cluster
                if curr_cid:
                    cache_id = curr_cid
                    cache_data = json.dumps(
                        VufindFormatter.create_record(curr_cid,
                                                      records).as_dict(),
                        separators=(",", ":"),
                    )
                    cache_key = zlib.crc32("{}{}".format(
                        len(records), max_date).encode("utf8"))
                    cache_date = max_date
                    entry = cache.entry(cache_id, cache_key, cache_data,
                                        cache_date)
                    entries.append(entry)

                # prepare next cluster
                curr_cid = cid
                records = [record]
                max_date = db_date
            else:
                if db_date > max_date:
                    max_date = db_date
                records.append(record)

            # periodic save records to datastore to chunk work
            if idx % 5000 == 0:
                bulk_session.bulk_save_objects(entries)
                entries = []

        # finish processing on last chunk of work
        cache_id = curr_cid
        cache_data = json.dumps(
            VufindFormatter.create_record(curr_cid, records).as_dict(),
            separators=(",", ":"),
        )
        cache_key = zlib.crc32("{}{}".format(len(records),
                                             max_date).encode("utf8"))
        cache_date = max_date
        entry = cache.entry(cache_id, cache_key, cache_data, cache_date)
        entries.append(entry)
        bulk_session.bulk_save_objects(entries)

        console.debug("Finished processing, final commit to cache datastore")
        bulk_session.commit()
        bulk_session.close()
        os.rename(tmp_cache_path, cache_path)
    finally:
        # TODO(cc): This will fail if cursor not defined
        db_cursor.close()
        db_connection.close()

    console.debug("Completed Cache: {}".format(
        str(datetime.datetime.now() - debug_start_time)))

    return cache_path
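A hedged usage sketch for the function above; the `ConsoleMessenger` construction matches Example #4, but the argument values here are assumptions:

# Hypothetical invocation; paths and merge version are placeholders.
console = ConsoleMessenger(app="ZEPHIR-EXPORT", verbosity=2)
cache_file = ht_bib_cache(
    console,
    cache_path="cache",     # a directory: a dated cache-<version>-<date>.db name is generated
    merge_version="v3",
    force=True,
)
console.info("Cache written to {}".format(cache_file))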