def warmup_command() -> None:
    """Evaluate objects in the audit log to see if they should be moved to a
    warmer storage class.

    Runs the warmup query into a temporary table, then streams each result
    row into a bounded thread pool; qualifying objects are rewritten to the
    STANDARD storage class. Progress is logged at each new 10% milestone. A
    shutdown hook flushes outputs, drops the temp table and logs statistics.
    """
    config = get_config()
    moved_output = BigQueryOutput(get_table(TableDefinitions.OBJECTS_MOVED))
    excluded_output = BigQueryOutput(
        get_table(TableDefinitions.OBJECTS_EXCLUDED))
    rows_read = 0

    # Create temp table object. Doesn't need to be initialized, as the
    # query job will do that.
    temp_table = Table(
        config.get('BIGQUERY', 'TEMP_TABLE',
                   fallback='smart_archiver_temp_warmup'))

    # Register cleanup as shutdown hook
    def cleanup():
        # Flush any remaining output
        moved_output.flush()
        excluded_output.flush()
        # Delete temp table
        temp_table.drop()
        # Print statistics (closure reads the final value of rows_read)
        LOG.info("%s rows read.", rows_read)
        LOG.info(moved_output.stats())
        LOG.info(excluded_output.stats())
    register(cleanup)

    # Run query job
    job = run_query_job(compose_warmup_query(),
                        temp_table.get_fully_qualified_name())

    # evaluate, archive and record
    def archive_worker(row: Row) -> None:
        if should_warm_up(row):
            rewrite_object(row, 'STANDARD', moved_output, excluded_output)

    workers = config.getint('RUNTIME', 'WORKERS')
    # getint() already returns an int; use floor division instead of the
    # original int(x / 2) float round-trip.
    size = config.getint('RUNTIME', 'WORK_QUEUE_SIZE') // 2
    with BoundedThreadPoolExecutor(max_workers=workers,
                                   queue_size=size) as executor:
        # get total rows in result, report it
        result = job.result()
        total_rows = result.total_rows
        percentage_reported = 0
        LOG.info("Total rows: %s", total_rows)

        # Submit one worker task per row
        for row in result:
            rows_read += 1
            executor.submit(archive_worker, row)
            # NOTE(review): total_rows is presumed to be a positive int here,
            # but be defensive — a falsy/None value would make the division
            # below raise; skip progress reporting in that case.
            if not total_rows:
                continue
            # calculate the percentage and show it if it's a new 10%ile
            percentage = int(rows_read / total_rows * 100)
            if percentage > percentage_reported and not percentage % 10:
                LOG.info("%s percent complete.", percentage)
                percentage_reported = percentage
def evaluate_objects() -> None:
    """Evaluate objects in the audit log to see if they should be moved to a
    new storage class.

    Runs the access query into a temporary table and fans each result row
    out to a pool of worker threads, which rewrite objects to STANDARD
    (warm-up) or to the configured cold storage class (cool-down). A
    shutdown hook flushes outputs, drops the temp table and logs statistics.
    """
    config = get_config()
    cold_storage_class = config.get('RULES', 'COLD_STORAGE_CLASS')
    moved_output = BigQueryOutput(get_table(TableDefinitions.OBJECTS_MOVED))
    excluded_output = BigQueryOutput(
        get_table(TableDefinitions.OBJECTS_EXCLUDED))
    work_queue = Queue(maxsize=3000)
    # Single source of truth for pool size; the spawn and shutdown loops
    # below must enqueue exactly one sentinel per worker.
    worker_count = 32

    # evaluate, archive and record
    def archive_worker():
        while True:
            row = work_queue.get()
            # Explicit sentinel test: a falsy-but-valid row (e.g. an empty
            # Row) must not terminate the worker early. Only the producer's
            # None sentinel stops the loop.
            if row is None:
                break
            try:
                if should_warm_up(row):
                    rewrite_object(row, 'STANDARD', moved_output,
                                   excluded_output)
                elif should_cool_down(row):
                    rewrite_object(row, cold_storage_class, moved_output,
                                   excluded_output)
            except Exception:
                # An exception must not kill the worker silently: if
                # task_done() were skipped, work_queue.join() below would
                # deadlock forever.
                LOG.exception("Error evaluating row: %s", row)
            finally:
                work_queue.task_done()

    # Start all worker threads
    worker_threads = []
    for _ in range(worker_count):
        thread = Thread(target=archive_worker)
        thread.start()
        worker_threads.append(thread)

    # Create temp table object. Doesn't need to be initialized.
    temp_table = Table("smart_archiver_temp")

    # Register cleanup as shutdown hook
    def cleanup():
        # Flush any remaining output
        moved_output.flush()
        excluded_output.flush()
        # Delete temp table
        temp_table.drop()
        # Print statistics (closure reads the final value of rows_read)
        LOG.info("%s rows read.", rows_read)
        LOG.info(moved_output.stats())
        LOG.info(excluded_output.stats())
    register(cleanup)

    rows_read = 0

    # Run query job
    job = run_query_job(compose_access_query(),
                        temp_table.get_fully_qualified_name())

    # Enqueue all work
    for row in job.result():
        rows_read += 1
        work_queue.put(row)

    # wait for all of the row jobs to complete
    LOG.info("All work enqueued. Waiting for last jobs to complete.")
    work_queue.join()

    # shutdown workers: one None sentinel per worker, then join the threads
    for _ in range(worker_count):
        work_queue.put(None)
    for thread in worker_threads:
        thread.join()
def compose_access_query() -> str:
    """Compose the query to get access information for all objects.

    Returns:
        str -- The query text.
    """
    access_log = get_table(TableDefinitions.DATA_ACCESS_LOGS)
    moved_objects = get_table(TableDefinitions.OBJECTS_MOVED)
    excluded_objects = get_table(TableDefinitions.OBJECTS_EXCLUDED)

    # CTE 1: the latest known move per object. Aggregate to the newest
    # moveTimestamp per resourceName, then join back onto the full move
    # records to recover the complete row for that latest move.
    # TODO: Eliminate the JOIN?
    latest_moves_sql = """
    SELECT full_move_info.*
    FROM `{0}` AS full_move_info
    INNER JOIN (
        SELECT resourceName, MAX(moveTimestamp) as timestamp
        FROM `{0}`
        GROUP BY resourceName
    ) AS most_recent
    ON most_recent.timestamp = full_move_info.moveTimestamp
    AND most_recent.resourceName = full_move_info.resourceName
    """.format(moved_objects.get_fully_qualified_name())

    # CTE 2: a bounded scan over n days of access-log shards, optionally
    # UNIONed with a catch-up table. The REGEXP_REPLACE normalizes
    # resourceName, whose representation differs slightly between create
    # and get events.
    raw_access_sql = """
    SELECT
        REGEXP_REPLACE(protopayload_auditlog.resourceName,
                       "gs://.*/", "") AS resourceName,
        timestamp
    FROM `{0}`
    WHERE _TABLE_SUFFIX BETWEEN
        FORMAT_DATE("%Y%m%d",
                    DATE_SUB(CURRENT_DATE(), INTERVAL {1} DAY))
        AND FORMAT_DATE("%Y%m%d", CURRENT_DATE())
    {2}
    """.format(access_log.get_fully_qualified_name(),
               _calculate_day_partitions(),
               _compose_catch_up_union())

    # CTE 3: per-object aggregates — most recent access (coldness) and the
    # count of accesses inside the warm-threshold window (hotness).
    aggregated_sql = """
    SELECT
        resourceName,
        MAX(timestamp) AS lastAccess,
        COUNTIF(TIMESTAMP_DIFF(CURRENT_TIMESTAMP(),
                               timestamp, DAY) <= {0})
            AS recent_access_count
    FROM raw_access_records
    GROUP BY resourceName
    """.format(_get_warm_threshold_days())

    # Final query. LEFT JOIN the latest-move CTE to learn each object's
    # last known storage class (avoiding a GET per object against GCS),
    # and LEFT JOIN the exclusions table to filter excluded objects out.
    return """
    WITH
        most_recent_moves AS ({0}),
        raw_access_records AS ({1}),
        aggregated_access_records AS ({2})
    SELECT
        access_records.resourceName,
        most_recent_moves.storageClass,
        access_records.lastAccess,
        access_records.recent_access_count
    FROM aggregated_access_records as access_records
    LEFT JOIN most_recent_moves
        ON access_records.resourceName = most_recent_moves.resourceName
    LEFT JOIN `{3}` as excluded
        ON access_records.resourceName = excluded.resourceName
    WHERE excluded.resourceName IS NULL
    """.format(latest_moves_sql, raw_access_sql, aggregated_sql,
               excluded_objects.get_fully_qualified_name())