# Standard-library and third-party imports inferred from the calls below; util,
# c (constants), schema, create_file_path, write, and handoff_event_to_emitter
# are project-internal and assumed to be provided by this module's package.
import gc
import json
import traceback

import pandas as pd
from six import iteritems


def save(context, metric_sets, partition, schema_hash):
    util.debug_print("\t{}:".format(partition))
    paths = []
    # Note: the schema_hash parameter is immediately shadowed by the loop variable.
    for schema_hash, metric_set in iteritems(metric_sets[partition]):
        # Stop early so the CloudWatch delete step still has time to run before
        # the Lambda times out.
        if util.time_remaining(context) <= (context[c.CW_ATTR_DELETE_DURATION] + 20):
            break
        columns = metric_set[c.KEY_SET_COLUMNS]
        if len(metric_set[c.KEY_SET]) == 0:
            continue
        values = list(metric_set[c.KEY_SET].values())
        data_set = pd.DataFrame(values, columns=columns)
        util.debug_print("\t\t{}:".format(schema_hash))
        path = create_file_path(partition, schema_hash,
                                context[c.KEY_SEPERATOR_PARTITION])
        paths.append(path)
        util.debug_print("Writing to path '{}' with set:\n {}".format(path, data_set))
        try:
            util.debug_print("Writing to partition '{}'".format(partition))
            write(context[c.KEY_METRIC_BUCKET], path, data_set,
                  context[c.KEY_SEPERATOR_PARTITION], schema.object_encoding(columns))
            handoff_event_to_emitter(context, context[c.KEY_METRIC_BUCKET], path,
                                     data_set)
            context[c.KEY_SUCCEEDED_MSG_IDS] += metric_set[c.KEY_MSG_IDS]
            util.debug_print("Save complete to path '{}'".format(path))
        except Exception as e:
            print("[{}]An error occurred writing to path '{}'.\nSet: {}\nError: \n{}"
                  .format(context[c.KEY_REQUEST_ID], path, data_set,
                          traceback.format_exc()))
            raise e
        finally:
            # Track the running row total on the aggregator for reporting.
            number_of_rows = len(values)
            if c.INFO_TOTAL_ROWS not in context[c.KEY_AGGREGATOR].info:
                context[c.KEY_AGGREGATOR].info[c.INFO_TOTAL_ROWS] = 0
            context[c.KEY_AGGREGATOR].info[c.INFO_TOTAL_ROWS] += number_of_rows
            # Release the large intermediates before processing the next schema set.
            del data_set
            del values
            del columns
            gc.collect()
    try:
        util.debug_print("Adding amoeba message to SQS queue '{}'".format(
            context[c.KEY_SQS_AMOEBA].queue_url))
        context[c.KEY_SQS_AMOEBA].send_generic_message(json.dumps({"paths": paths}))
    except Exception as e:
        print("[{}]An error occurred writing messages to the Amoeba SQS queue. \nError: \n{}"
              .format(context[c.KEY_REQUEST_ID], traceback.format_exc()))
        raise e
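# create_file_path is called above but not defined in this section. The sketch
# below is a guess at the kind of key it builds, assuming a
# <partition><sep><schema_hash><sep><uuid> layout with a parquet extension; the
# name and layout are illustrative only, not the pipeline's actual implementation.
import uuid

def create_file_path_sketch(partition, schema_hash, sep):
    # e.g. ("table/p_year=2018", "abc123", "/") -> "table/p_year=2018/abc123/<uuid4>.parquet"
    return "{}{}{}{}{}.parquet".format(partition, sep, schema_hash, sep, uuid.uuid4())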
def __excret(self, key, filename, data, sep):
    # Write the consolidated DataFrame back to S3 under the original key prefix.
    # writer and m_schema are project-internal modules.
    output_path = "{}{}{}".format(key, sep, filename)
    print("\t\tSaving ingested file to '{}{}'".format(self.bucket, output_path))
    writer.write(self.bucket, output_path, data, sep,
                 m_schema.object_encoding(data.columns.values))
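# m_schema.object_encoding is project-internal and not shown here. A plausible
# minimal version is sketched below, assuming it maps every column to the 'utf8'
# object encoding that fastparquet's write() accepts for object columns; the real
# module may be more selective about types.
def object_encoding_sketch(columns):
    # fastparquet accepts a {column_name: encoding} dict for object columns.
    return {column: "utf8" for column in columns}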
import os
import time

import s3fs
from botocore.exceptions import ClientError


def ingest(event, lambdacontext):
    starttime = time.time()
    gc.collect()
    root = event.get("root", None)
    print("Initial memory size: {}".format(mutil.get_memory_object()))
    print("Started amoeba with root {}".format(root))
    context = event.get("context", {})
    context[c.KEY_LAMBDA_FUNCTION] = lambdacontext.function_name if hasattr(
        lambdacontext, 'function_name') else None
    context[c.KEY_START_TIME] = starttime
    is_lambda = context[c.KEY_LAMBDA_FUNCTION] is not None
    bucket = os.environ[c.ENV_S3_STORAGE]
    crawler = Crawler(context, bucket)
    roots = crawler.crawl_from_relative(root)
    s3_fs = s3fs.S3FileSystem()
    s3 = S3(context, bucket)
    timeout = calculate_aggregate_window_timeout(context[c.KEY_MAX_LAMBDA_TIME])
    target_excretion_size = context[c.KEY_TARGET_AGGREGATION_FILE_SIZE_IN_MB]
    compression_ratio = context[c.KEY_CSV_PARQUET_COMPRESSION_RATIO]
    sep = context[c.KEY_SEPERATOR_PARTITION]
    memory_used = mutil.get_memory_usage()
    projected_compressed_file_size_in_mb = 0
    print("Hunting for {} seconds in bucket '{}'".format(timeout, bucket))
    for path in roots:
        # The Glue crawler does not work well when a single key in S3 contains
        # varying data schemas, so only consolidate prefixes holding multiple files.
        files = roots[path]
        if len(files) == 1:
            continue
        debug_print("\t\tIngesting path {}".format(path))
        df = {}
        keys_ingested = []
        data = None
        for file in files:
            debug_print("\t\t\t{}".format(file))
            key = "{}/{}".format(path, file)
            try:
                size_in_megabytes = s3.size_in_megabytes(key)
            except ClientError as e:
                if str(e.response['Error']['Code']) == '404':
                    continue
                else:
                    print("Error: {} {}".format(e.response['Error']['Code'], key))
                    raise e
            if size_in_megabytes > target_excretion_size:
                debug_print("Skipping file '{}'. It has reached the targeted file size."
                            .format(key))
                continue
            size_in_bytes = size_in_megabytes * 1024 * 1024
            try:
                data = reader.read(s3_fs, bucket, key)
                keys_ingested.append(key)
            except ClientError as e:
                print("{} key=>{}".format(e.response['Error']['Code'], key))
                # Handle corrupt files; these can occur when a write did not
                # finish correctly.
                if "Seek before start of file" in str(e):
                    print("Deleting corrupt file {}".format(key))
                    s3.delete([key])
                elif e.response['Error']['Code'] == 'NoSuchKey':
                    print('{}: for key {}'.format(e.response['Error']['Code'], key))
                else:
                    util.logger.error(e)
                continue
            for row in data.itertuples(index=True):
                row = row._asdict()
                del row['Index']
                key_parts = KeyParts(key, sep)
                uuid_key = "{}{}{}".format(row[c.PRIMARY_KEY], key_parts.event,
                                           row[c.TERTIARY_KEY])
                df_size = len(row)
                debug_print("\t\t\tSize on S3 in MB: {} Size as Dataframe: {} Ratio: {}"
                            .format(size_in_megabytes, df_size, compression_ratio))
                # A dictionary is the fastest way to build a unique set of rows.
                if uuid_key in df:
                    debug_print("\t\t\tFound duplication in key '{}'. Keeping the "
                                "first occurrence.".format(key))
                else:
                    df[uuid_key] = row
                current_dataframe_size = len(df)
                # Break conditions:
                #   1. Memory limit exceeded.
                #   2. Time window exceeded.
                #   3. Target excretion size hit.
                projected_compressed_file_size_in_mb = (
                    compression_ratio * current_dataframe_size) / 1048576.0
                memory_used = mutil.get_memory_usage()
                debug_print("\t\t\t{} seconds have elapsed. {} kilobytes of memory have "
                            "been used. The projected compressed file size is {} MB. We "
                            "are targeting an excretion file size of {} MB.".format(
                                util.elapsed(context), memory_used / 1024,
                                projected_compressed_file_size_in_mb,
                                target_excretion_size))
                if (util.elapsed(context) > timeout
                        or memory_used > context[c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER]
                        or projected_compressed_file_size_in_mb > target_excretion_size):
                    break
            if (util.elapsed(context) > timeout
                    or memory_used > context[c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER]
                    or projected_compressed_file_size_in_mb > target_excretion_size):
                print("Elapsed: {} Start: {} Timeout: {} Has timed out: {} "
                      "Mem used: {} Memory flush trigger: {}".format(
                          util.elapsed(context), context[c.KEY_START_TIME], timeout,
                          util.elapsed(context) > timeout, memory_used,
                          context[c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER]))
                break
        if len(keys_ingested) > 0 and util.time_remaining(context) > 45:
            values = list(df.values())
            # The schema hashes of the event columns are identical, so the first
            # row's keys carry the same column headers as the rest.
            columns = list(values[0].keys())
            data_set = pd.DataFrame(values, columns=columns)
            excret(s3, bucket, path, data_set, keys_ingested, sep,
                   m_schema.object_encoding(columns))
            del data_set
        elif util.time_remaining(context) <= 45:
            return
        del data
        del df
        del keys_ingested
        if (util.elapsed(context) > timeout
                or mutil.get_process_memory_usage_bytes() >= c.ONE_GB_IN_BYTES):
            print("\tThe elapsed time threshold of {} seconds or the memory threshold "
                  "of {} megabytes has been hit. Time: {}s, Memory: {}MB".format(
                      timeout, c.ONE_GB_IN_BYTES / 1048576.0, util.elapsed(context),
                      mutil.get_process_memory_usage_megabytes()))
            return
    print("I've consumed everything I can in bucket '{}'".format(bucket))
    return
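# A hypothetical local harness for ingest. It assumes AWS credentials and the
# c.ENV_S3_STORAGE environment variable are configured, and the context values
# below are placeholder guesses for the keys ingest reads; none of these values
# come from this section.
class FakeLambdaContext(object):
    function_name = "amoeba_local_run"

if __name__ == "__main__":
    ingest({
        "root": None,
        "context": {
            c.KEY_MAX_LAMBDA_TIME: 300,                           # seconds (assumed)
            c.KEY_TARGET_AGGREGATION_FILE_SIZE_IN_MB: 128,        # MB (assumed)
            c.KEY_CSV_PARQUET_COMPRESSION_RATIO: 13,              # ratio (assumed)
            c.KEY_SEPERATOR_PARTITION: "/",                       # separator (assumed)
            c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER: 75 * 1024 * 1024   # bytes (assumed)
        }
    }, FakeLambdaContext())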