# Consolidated imports for the examples in this listing. The project-local
# names used below (ConsoleMessenger, ExportCache, ht_bib_cache, ht_bib_full,
# ht_bib_incr, set_abs_filepath, utils) are assumed to be importable from the
# surrounding package; their exact module paths are not shown in the source.
import datetime
import filecmp
import json
import os
import shutil
import sys
from collections import defaultdict

import environs
import iso8601
import jsonschema
import sqlalchemy as sqla
import sqlalchemy.orm  # makes sqla.orm available for sessionmaker below
import sqlalchemy.ext.automap as sqla_automap


# Example 1
def generate_cli(
    ctx, export_type, verbosity, merge_version, output_path, cache_path, force
):
    """Generate Zephir export files for HathiTrust."""
    console = ConsoleMessenger(app="ZEPHIR-EXPORT", verbosity=verbosity)

    if cache_path and os.path.exists(cache_path):
        console.debug("Using existing cache {}".format(cache_path))
        cache = cache_path
    else:
        cache = ht_bib_cache(
            console=console,
            cache_path=cache_path,
            merge_version=merge_version,
            force=force,
        )
    if export_type == "ht-bib-full":
        ht_bib_full(
            console=console,
            cache_path=cache,
            output_path=output_path,
            merge_version=merge_version,
            force=force,
        )
    elif export_type == "ht-bib-incr":
        ht_bib_incr(
            console=console, cache_path=cache, merge_version=merge_version, force=force
        )
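
# The ctx argument and flag-style parameters suggest generate_cli is wired up
# as a click command. A minimal sketch of that wiring, assuming click; the
# decorator and option names are illustrative, not taken from the source.
import click


@click.command(name="generate")
@click.argument(
    "export_type", type=click.Choice(["ht-bib-full", "ht-bib-incr"]))
@click.option("--verbosity", default=0)
@click.option("--merge-version", default="v3")
@click.option("--output-path", default=None)
@click.option("--cache-path", default=None)
@click.option("--force", is_flag=True)
@click.pass_context
def generate(ctx, export_type, verbosity, merge_version, output_path,
             cache_path, force):
    generate_cli(ctx, export_type, verbosity, merge_version,
                 output_path, cache_path, force)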

# Example 2

def compare_cache_cli(ctx, files, verbosity):
    """Compare export caches for content differences. Ignores datetime of cache creation."""
    console = ConsoleMessenger(app="ZEPHIR-EXPORT", verbosity=verbosity)
    f1_cache = ExportCache(path=set_abs_filepath(files[0]))
    f1_set = f1_cache.frozen_content_set()
    f2_cache = ExportCache(path=set_abs_filepath(files[1]))
    f2_set = f2_cache.frozen_content_set()
    # Compare content directly; frozensets support equality, and equal
    # hashes alone would not guarantee equal content.
    if f1_set != f2_set:
        for line in f1_set - f2_set:
            console.out("-(cid:{},key:{})".format(line[0], line[1]))
        for line in f2_set - f1_set:
            console.out("+(cid:{},key:{})".format(line[0], line[1]))
        console.info("Differences found between cache files")
    else:
        console.info("No differences found between cache files")

# Example 3

def test_create_with_custom_cache_output(td_tmpdir, env_setup, capsys,
                                         pytestconfig):
    # SETUP TODO (cscollett: there may be a better place to put this)
    # set temp current working directory
    real_cwd = os.getcwd()
    os.chdir(td_tmpdir)

    for merge_version in ["v2", "v3"]:
        shutil.copyfile(
            os.path.join(td_tmpdir, "cache-{}-ref.db".format(merge_version)),
            os.path.join(td_tmpdir, "my_custom_cache.db"),
        )
        console = ConsoleMessenger(verbosity=pytestconfig.getoption("verbose"))
        ht_bib_incr(
            console=console,
            cache_path="my_custom_cache.db",
            output_path="my_custom_output.json",
            merge_version=merge_version,
            force=True,
        )

        assert filecmp.cmp(
            os.path.join(td_tmpdir, "my_custom_output.json"),
            os.path.join(
                td_tmpdir,
                "{}-ht_bib_export_incr_ref.json".format(merge_version)),
        )
        # clean up to avoid a name conflict on the next merge-version pass
        os.remove(os.path.join(td_tmpdir, "my_custom_output.json"))
    os.chdir(real_cwd)
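
# Examples 3 and 4 lean on td_tmpdir and env_setup fixtures that are not
# shown. A purely hypothetical sketch of what td_tmpdir might look like,
# for orientation only (the real fixture lives in the project's conftest.py):
import pytest


@pytest.fixture
def td_tmpdir(request, tmp_path):
    # Hypothetical: copy the test module's data directory into a tmp dir
    # and hand back its path.
    data_dir = os.path.join(os.path.dirname(str(request.fspath)), "data")
    for name in os.listdir(data_dir):
        shutil.copy(os.path.join(data_dir, name), str(tmp_path))
    return str(tmp_path)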

# Example 4

def test_create_bib_export_incr(td_tmpdir, env_setup, capsys, pytestconfig):
    for merge_version in ["v2", "v3"]:
        os.rename(
            os.path.join(td_tmpdir, "cache-{}-ref.db".format(merge_version)),
            os.path.join(
                td_tmpdir,
                "cache-{}-{}.db".format(
                    merge_version,
                    datetime.datetime.today().strftime("%Y-%m-%d")),
            ),
        )
        console = ConsoleMessenger(verbosity=pytestconfig.getoption("verbose"))
        ht_bib_incr(console=console, merge_version=merge_version, force=True)

        export_filename = "ht_bib_export_incr_{}.json".format(
            datetime.datetime.today().strftime("%Y-%m-%d"))

        assert filecmp.cmp(
            os.path.join(td_tmpdir, export_filename),
            os.path.join(
                td_tmpdir,
                "{}-ht_bib_export_incr_ref.json".format(merge_version)),
        )
        # clean up to avoid a name conflict on the next merge-version pass
        os.remove(os.path.join(td_tmpdir, export_filename))

# Example 5

def compare_file_cli(ctx, files, verbosity):
    """Compare export files for content differences."""
    console = ConsoleMessenger(app="ZEPHIR-EXPORT", verbosity=verbosity)
    count = 0
    with open(files[0]) as a, open(files[1]) as b:
        for line_a in a:
            count += 1
            if line_a != b.readline():
                console.info("Differences start on line: {}".format(count))
                raise SystemExit(0)
        # The loop above never sees trailing lines when the second file is
        # longer; one extra read catches that case.
        if b.readline():
            console.info("Differences start on line: {}".format(count + 1))
            raise SystemExit(0)

    console.info("No differences found between files")

# Example 6

def audit(filepath, quiet, verbose, dry_run, suffix):
    """Audit.py: Audit ZED log file to ensure all the data is represented in
    the database"""
    # Print handler to manage when and how messages should print
    console = ConsoleMessenger(quiet, verbose)

    # REQUIREMENTS
    if len(filepath) == 0:
        console.error("No files given to process.")
        sys.exit(1)

    # APPLICATION SETUP
    # load environment
    env = environs.Env()
    env.read_env()

    ROOT_PATH = os.environ.get("ZED_ROOT_PATH") or os.path.dirname(__file__)
    ENV = os.environ.get("ZED_ENV")
    CONFIG_PATH = os.environ.get("ZED_CONFIG_PATH") or os.path.join(
        ROOT_PATH, "config")
    OVERRIDE_CONFIG_PATH = os.environ.get("ZED_OVERRIDE_CONFIG_PATH")

    # load all configuration files in directory
    config = utils.load_config(CONFIG_PATH)

    # used in testing, config files in test data will override local config files
    if OVERRIDE_CONFIG_PATH is not None:
        config = utils.load_config(OVERRIDE_CONFIG_PATH, config)


    # DATABASE SETUP
    # Create database client, connection manager.
    db = config.get("zed_db", {}).get(ENV)

    DB_CONNECT_STR = str(utils.db_connect_url(db))

    engine = sqla.create_engine(DB_CONNECT_STR)

    # Create classes through reflection
    Base = sqla_automap.automap_base()
    Base.prepare(engine, reflect=True)
    Event = Base.classes.events

    # Configure a session factory; a fresh session is opened per file below.
    Session = sqla.orm.sessionmaker()
    Session.configure(bind=engine)

    if dry_run:
        console.diagnostic("DRY RUN")

    # Iterate over the json log files to process
    for file in filepath:

        if not os.path.isfile(file):
            console.error(
                "File path '{0}' does not exist. Exiting...".format(file))
            break

        # Build the destination (renamed) file path
        renamed_file = "{0}.{1}".format(file, suffix)

        if os.path.isfile(renamed_file):
            console.error(
                "Audit file '{0}' already exists.".format(renamed_file))
            break

        log_events = []
        db_events = set()
        file_pass = True  # Assume valid until line found invalid
        # Open file and process
        with open(file) as f_io:
            ln_cnt = 0
            console.diagnostic("Auditing: " + file)
            for line in f_io:
                ln_cnt += 1
                try:
                    log_events.append(json.loads(line.strip()))
                except json.decoder.JSONDecodeError:
                    file_pass = False
                    console.error(
                        "ERROR: Invalid JSON on line {0}".format(ln_cnt))
                    break  # invalid json, stop successive validation routines

        if file_pass and len(log_events) > 0:
            query_params = {
                "event_type": log_events[0]["type"],
                "first_timestamp": (
                    iso8601.parse_date(log_events[0]["timestamp"])
                    - datetime.timedelta(seconds=60)
                ).isoformat("T"),
                "last_timestamp": (
                    iso8601.parse_date(log_events[-1]["timestamp"])
                    + datetime.timedelta(seconds=60)
                ).isoformat("T"),
            }

            session = Session()
            try:
                query = (
                    session.query(Event.event_key)
                    .filter(Event.timestamp >= query_params["first_timestamp"])
                    .filter(Event.timestamp <= query_params["last_timestamp"])
                    .filter(Event.type == query_params["event_type"])
                )

                for event in query.all():
                    db_events.add(event.event_key)
            except Exception as e:
                session.rollback()
                raise e
            finally:
                session.close()

            for event in log_events:
                if not event["event"] in db_events:
                    file_pass = False
                    console.error(
                        "ERROR: Missing event {0} in database.".format(
                            event["event"]))

        # Report results
        if file_pass is False:
            console.error("File {0}: fail.".format(file))
        else:
            if not dry_run:
                os.rename(file, renamed_file)
            console.report("File {0}: pass. {1} event(s) audited.\
            ".format(file, len(log_events)))

    console.report("Done!")
    sys.exit(0)
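
# The audit query above brackets the log's first and last event timestamps
# with a 60-second buffer before filtering by event type. A small
# illustration of how that window is built (the timestamp is made up):
first = iso8601.parse_date("2020-01-01T12:00:00Z")
window_start = (first - datetime.timedelta(seconds=60)).isoformat("T")
window_end = (first + datetime.timedelta(seconds=60)).isoformat("T")
print(window_start, window_end)  # 2020-01-01T11:59:00+00:00 ...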

# Example 7

def validate(filepath, quiet, verbose, dry_run, suffix):
    """Validate a ZED log file to ensure every line is JSON and conforms to
    the schema."""

    # Print handler to manage when and how messages should print
    console = ConsoleMessenger(quiet, verbose)

    # REQUIREMENTS
    if len(filepath) == 0:
        console.error("No files given to process.")
        sys.exit(1)

    # APPLICATION SETUP
    # load environment
    env = environs.Env()
    env.read_env()

    schema_file = os.path.join(os.path.dirname(__file__),
                               "config/zed_schema.json")

    with open(schema_file, "r") as f:
        schema_data = f.read()
        schema = json.loads(schema_data)

    if dry_run:
        console.diagnostic("DRY RUN")

    # Iterate over the json log files to process
    for file in filepath:

        if not os.path.isfile(file):
            console.error("File path '{0}' does not exist.".format(file))
            break

        # Build the destination (renamed) file path
        renamed_file = "{0}.{1}".format(file, suffix)

        if os.path.isfile(renamed_file):
            console.error(
                "Validated file '{0}' already exists.".format(renamed_file))
            break

        # Open file and validate
        with open(file) as f_io:
            event_counter = defaultdict(int)
            file_valid = True  # Assume valid until line found invalid
            ln_cnt = 0
            console.diagnostic("Validating: {}".format(file))
            for line in f_io:
                ln_cnt += 1

                # JSON VALIDATION BLOCK
                try:
                    event = json.loads(line.strip())
                    jsonschema.validate(event, schema)
                except json.decoder.JSONDecodeError:
                    file_valid = False
                    console.error("Invalid JSON on line {0}".format(ln_cnt))
                    break
                except jsonschema.exceptions.ValidationError:
                    file_valid = False
                    console.error(
                        "JSON Validation error on line {0}".format(ln_cnt))
                    break

                # DUPE-DETECTION BLOCK
                event_counter[event["event"]] += 1
                if event_counter[event["event"]] > 1:
                    file_valid = False
                    console.error("Duplicate ID ({0}) found on line {1} \
                        ".format(event["event"], ln_cnt))
                    break
            # Report results
            if file_valid is False:
                console.error("File {0}: invalid.".format(file))
            else:
                if not dry_run:
                    os.rename(file, renamed_file)
                console.report(
                    "File {0}: valid. {1} event(s) validated.".format(
                        file, ln_cnt))
    console.report("Done!")
    print(filepath)
    sys.exit(0)
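
# A tiny self-contained illustration of the two per-line checks in the
# validate loop, using a stand-in schema (the real schema is loaded from
# config/zed_schema.json):
stub_schema = {"type": "object", "required": ["event", "type", "timestamp"]}
counter = defaultdict(int)
sample = '{"event": "abc123", "type": "ingest", "timestamp": "2020-01-01T00:00:00Z"}'

record = json.loads(sample)               # JSONDecodeError on malformed JSON
jsonschema.validate(record, stub_schema)  # ValidationError on schema failure

counter[record["event"]] += 1
if counter[record["event"]] > 1:
    raise ValueError("Duplicate ID ({0})".format(record["event"]))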