Example #1
# Standard-library and BigQuery client imports used by the code below. The
# remaining names (get_config, get_table, BigQueryOutput, Table,
# TableDefinitions, run_query_job, compose_warmup_query, should_warm_up,
# should_cool_down, rewrite_object, BoundedThreadPoolExecutor, LOG) come
# from elsewhere in this project.
from atexit import register
from queue import Queue
from threading import Thread

from google.cloud.bigquery.table import Row

def warmup_command() -> None:
    """
    Evaluate objects in the audit log to see if they should be moved to a
    warmer storage class.
    """
    config = get_config()
    moved_output = BigQueryOutput(get_table(TableDefinitions.OBJECTS_MOVED))
    excluded_output = BigQueryOutput(
        get_table(TableDefinitions.OBJECTS_EXCLUDED))
    rows_read = 0

    # Create temp table object. Doesn't need to be initialized, as the
    # query job will do that.
    temp_table = Table(
        config.get('BIGQUERY',
                   'TEMP_TABLE',
                   fallback='smart_archiver_temp_warmup'))

    # Register cleanup as shutdown hook
    def cleanup():
        # Flush any remaining output
        moved_output.flush()
        excluded_output.flush()
        # Delete temp table
        temp_table.drop()
        # Print statistics
        LOG.info("%s rows read.", rows_read)
        LOG.info(moved_output.stats())
        LOG.info(excluded_output.stats())

    register(cleanup)

    # Run query job
    job = run_query_job(compose_warmup_query(),
                        temp_table.get_fully_qualified_name())

    # evaluate, archive and record
    def archive_worker(row: Row) -> None:
        if should_warm_up(row):
            rewrite_object(row, 'STANDARD', moved_output, excluded_output)

    workers = config.getint('RUNTIME', 'WORKERS')
    queue_size = config.getint('RUNTIME', 'WORK_QUEUE_SIZE') // 2
    with BoundedThreadPoolExecutor(max_workers=workers,
                                   queue_size=queue_size) as executor:
        # Get the total row count in the result and report it
        result = job.result()
        total_rows = result.total_rows
        percentage_reported = 0
        LOG.info("Total rows: %s", total_rows)
        # Submit one archive task per row
        for row in result:
            rows_read += 1
            executor.submit(archive_worker, row)
            # Report progress at each new multiple of 10%
            percentage = int(rows_read / total_rows * 100)
            if percentage > percentage_reported and percentage % 10 == 0:
                LOG.info("%s percent complete.", percentage)
                percentage_reported = percentage
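
BoundedThreadPoolExecutor is not a standard-library class. A minimal sketch of what it might look like, assuming the semantics the code above relies on (submit() blocks once queue_size tasks are in flight, so the BigQuery result iterator cannot race arbitrarily far ahead of the workers); this is an illustration, not the project's actual implementation:

from concurrent.futures import ThreadPoolExecutor
from threading import BoundedSemaphore


class BoundedThreadPoolExecutor(ThreadPoolExecutor):
    """ThreadPoolExecutor whose submit() blocks when too much work is queued."""

    def __init__(self, max_workers=None, queue_size=1000):
        super().__init__(max_workers=max_workers)
        # One slot per in-flight task: acquired on submit, released on completion.
        self._slots = BoundedSemaphore(queue_size)

    def submit(self, fn, *args, **kwargs):
        self._slots.acquire()  # blocks while queue_size tasks are pending
        try:
            future = super().submit(fn, *args, **kwargs)
        except Exception:
            self._slots.release()
            raise
        future.add_done_callback(lambda _: self._slots.release())
        return future
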
def evaluate_objects() -> None:
    """
    Evaluate objects in the audit log to see if they should be moved to a
    new storage class.
    """
    config = get_config()
    cold_storage_class = config.get('RULES', 'COLD_STORAGE_CLASS')
    moved_output = BigQueryOutput(get_table(TableDefinitions.OBJECTS_MOVED))
    excluded_output = BigQueryOutput(
        get_table(TableDefinitions.OBJECTS_EXCLUDED))
    work_queue = Queue(maxsize=3000)

    # evaluate, archive and record
    def archive_worker():
        while True:
            row = work_queue.get()
            if row is None:  # None is the shutdown sentinel
                break
            if should_warm_up(row):
                rewrite_object(row, 'STANDARD', moved_output, excluded_output)
            elif should_cool_down(row):
                rewrite_object(row, cold_storage_class, moved_output,
                               excluded_output)
            work_queue.task_done()

    # Start all worker threads
    num_workers = 32
    worker_threads = []
    for _ in range(num_workers):
        thread = Thread(target=archive_worker)
        thread.start()
        worker_threads.append(thread)

    # Create temp table object. Doesn't need to be initialized, as the
    # query job will do that.
    temp_table = Table("smart_archiver_temp")

    # Define the counter before the shutdown hook below closes over it.
    rows_read = 0

    # Register cleanup as shutdown hook
    def cleanup():
        # Flush any remaining output
        moved_output.flush()
        excluded_output.flush()
        # Delete temp table
        temp_table.drop()
        # Print statistics
        LOG.info("%s rows read.", rows_read)
        LOG.info(moved_output.stats())
        LOG.info(excluded_output.stats())

    register(cleanup)

    # Run query job
    job = run_query_job(compose_access_query(),
                        temp_table.get_fully_qualified_name())
    # Enqueue all work
    for row in job.result():
        rows_read += 1
        work_queue.put(row)

    # Wait for all of the row jobs to complete
    LOG.info("All work enqueued. Waiting for last jobs to complete.")
    work_queue.join()

    # Shut down workers by sending one None sentinel per thread
    for _ in range(num_workers):
        work_queue.put(None)
    for thread in worker_threads:
        thread.join()
def compose_access_query() -> str:
    """Compose the query to get access information for all objects.

    Returns:
        str -- The query text.
    """
    access_log = get_table(TableDefinitions.DATA_ACCESS_LOGS)
    moved_objects = get_table(TableDefinitions.OBJECTS_MOVED)
    excluded_objects = get_table(TableDefinitions.OBJECTS_EXCLUDED)

    # First, find the most recent move for each object. Then, join the full
    # move records on that aggregated data to find full move info for the most
    # recent move.
    # TODO: Eliminate the JOIN?
    most_recent_moves = """
        SELECT full_move_info.*
            FROM `{0}` AS full_move_info
        INNER JOIN (
            SELECT
                resourceName,
                MAX(moveTimestamp) as timestamp
            FROM `{0}`
            GROUP BY resourceName
        ) AS most_recent
        ON most_recent.timestamp       = full_move_info.moveTimestamp
        AND most_recent.resourceName   = full_move_info.resourceName
    """.format(moved_objects.get_fully_qualified_name())

    # Perform a bounded query of n days of access logs, possibly with a UNION
    # of a catch-up table. REGEXP_REPLACE is to unify the representation of
    # resourceName between create and get events, which differ slightly.
    raw_access_records = """
    SELECT
        REGEXP_REPLACE(protopayload_auditlog.resourceName, "gs://.*/", "") AS resourceName,
        timestamp
    FROM `{0}`
    WHERE
        _TABLE_SUFFIX BETWEEN FORMAT_DATE("%Y%m%d", DATE_SUB(CURRENT_DATE(), INTERVAL {1} DAY))
        AND FORMAT_DATE("%Y%m%d", CURRENT_DATE())
    {2}
    """.format(access_log.get_fully_qualified_name(),
               _calculate_day_partitions(), _compose_catch_up_union())

    # Aggregate the raw access records, in order to calculate most
    # recent access (coldness) as well as the count of accesses within a
    # specified period (hotness).
    aggregated_access_records = """
    SELECT 
        resourceName,
        MAX(timestamp) AS lastAccess,
        COUNTIF(TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), timestamp, DAY) <= {0}) AS recent_access_count
    FROM raw_access_records
    GROUP BY resourceName
    """.format(_get_warm_threshold_days())

    # Final query text. Joins most_recent_moves in order to determine
    # the latest known storage class (avoiding a GET per object to find this
    # out from GCS), and joins excluded_objects to remove them from the results.
    querytext = """
    WITH most_recent_moves AS ({0}), raw_access_records AS ({1}), aggregated_access_records AS ({2})

    SELECT 
        access_records.resourceName, 
        most_recent_moves.storageClass, 
        access_records.lastAccess, 
        access_records.recent_access_count 
    FROM aggregated_access_records as access_records

    LEFT JOIN most_recent_moves ON access_records.resourceName = most_recent_moves.resourceName

    LEFT JOIN `{3}` as excluded ON access_records.resourceName = excluded.resourceName
    WHERE excluded.resourceName IS NULL
    """.format(most_recent_moves, raw_access_records, aggregated_access_records,
               excluded_objects.get_fully_qualified_name())

    return querytext
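
The three helpers interpolated into the query above are defined elsewhere in the module. Hypothetical sketches showing only the shape of what each must return; the config keys and fallbacks are assumptions:

def _calculate_day_partitions() -> int:
    """How many days of _TABLE_SUFFIX partitions of access logs to scan."""
    # Hypothetical key; only the returned integer matters to the query.
    return get_config().getint('RULES', 'DAYS_TO_SCAN', fallback=30)


def _get_warm_threshold_days() -> int:
    """Window, in days, for counting recent accesses ("hotness")."""
    # Hypothetical key; feeds the COUNTIF(...) in aggregated_access_records.
    return get_config().getint('RULES', 'WARM_THRESHOLD_DAYS', fallback=30)


def _compose_catch_up_union() -> str:
    """UNION ALL over an optional catch-up table, or '' when none is set."""
    # Hypothetical key; returning '' makes the UNION optional in the query.
    catch_up_table = get_config().get('BIGQUERY', 'CATCH_UP_TABLE', fallback='')
    if not catch_up_table:
        return ''
    return """
    UNION ALL
    SELECT
        REGEXP_REPLACE(protopayload_auditlog.resourceName, "gs://.*/", "") AS resourceName,
        timestamp
    FROM `{}`
    """.format(catch_up_table)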