Example #1
def handoff_event_to_emitter(context, bucket, key, events):
    bucket = os.environ["ProjectConfigurationBucket"]
    lmdclient = Lambda(context)
    s3client = S3(context, bucket)

    parts = KeyParts(key, context[c.KEY_SEPERATOR_PARTITION])
    key = "deployment/share/emitted_event_payloads/{}/{}/{}/{}".format(
        parts.source, parts.event, parts.datetime,
        parts.filename.replace(parts.extension, 'json'))

    payload = {
        'emitted': {
            'key': key,
            'bucket': bucket,
            'type': parts.event,
            'source': parts.source,
            'buildid': parts.buildid,
            'filename': parts.filename.replace(parts.extension, 'json'),
            'datetime': parts.datetime,
            'datetimeformat': util.partition_date_format(),
            'sensitivitylevel': parts.sensitivity_level
        }
    }

    # Write a short-lived S3 object (expires in 30 minutes) for the event emitter to read
    expires = datetime.datetime.utcnow() + datetime.timedelta(minutes=30)
    s3client.put_object(key, events.to_json(orient='records'), expires)

    resp = lmdclient.invoke(os.environ[c.ENV_EVENT_EMITTER], payload)
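
A minimal invocation sketch, assuming events is a pandas DataFrame and context is the project settings dict used throughout these examples; the key layout and values below are purely illustrative (KeyParts defines the real schema), and the bucket argument is superseded by the ProjectConfigurationBucket environment variable:

import pandas as pd

# Hypothetical call; the environment and context are assumed to be configured
# the way the function expects.
events = pd.DataFrame([{"pkey": "player-1", "value": 1}])
handoff_event_to_emitter(context, None, "sessionstart/low/2021/01/01/00/data.parquet", events)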
Example #2
 def ingest(self):
     debug_print("Ingesting directory {}".format(self.directory))               
     debug_print("Ingesting the files \n{}".format(self.files))                    
     is_lambda =  self.context[c.KEY_LAMBDA_FUNCTION] is not None
     timeout = self.__calculate_aggregate_window_timeout(self.context[c.KEY_MAX_LAMBDA_TIME])
     target_excretion_size = self.context[c.KEY_TARGET_AGGREGATION_FILE_SIZE_IN_MB]
     compression_ratio = self.context[c.KEY_CSV_PARQUET_COMPRESSION_RATIO]
     sep = self.context[c.KEY_SEPERATOR_PARTITION]
     memory_trigger = self.context[c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER]             
     memory_used = mutil.get_memory_usage()             
     main_filename, main_file_data, main_file_size_mb = self.__get_main_aggregate_file(self.directory, sep, target_excretion_size)        
     main_file_data = self.__append(None, main_file_data)           
     keys_ingested = []
     for file in self.files:
         debug_print("\tProcessing file {}".format(file))
         key_parts = KeyParts(file, sep)
         duration = datetime.datetime.utcnow() - key_parts.filename_timestamp            
         if duration.total_seconds() < 300:
             debug_print("The file '{}' is {}s old.  It is too new and will be processed later to allow for S3 propagation.".format(file, duration.total_seconds()))
             continue
         keys_ingested.append(file)            
         data = self.__open(file, main_file_data)
         if data is None:
             continue            
         size_in_megabytes = self.__size(file)            
         main_file_data = self.__append(main_file_data, data) 
         del data
         gc.collect()
         current_dataframe_size = sys.getsizeof(main_file_data)        
         #break conditions
         #1. Memory limit exceeded
         #2. Time window exceeded
         #3. Target excretion size hit
         main_file_size_mb += size_in_megabytes
         memory_used = mutil.get_memory_usage()           
         debug_print("\t\tSize on S3: {}MB Size of new dataset: {}bytes Estimated Compression Ratio: {} Memory Used: {}% Project Compression Size {}MB  Target Excretion Size {}MB".format(size_in_megabytes, current_dataframe_size, compression_ratio, memory_used, main_file_size_mb, target_excretion_size))
         if util.elapsed(self.context) > timeout or memory_used > memory_trigger or main_file_size_mb > target_excretion_size :
             print "Elapsed", util.elapsed(self.context), "Start:", self.starttime, "Timeout:", timeout, "Has timed out:", util.elapsed(self.context) > timeout, "Mem Used %:", memory_used, "Max Memory %:", memory_trigger
             break                
     
     #only save the files if we have a reasonable amount of time remaining before the lambda timeout.
     debug_print("Time remaining: {}s".format(util.time_remaining(self.context)))    
     debug_print("There were {} keys ingested.  The keys ingested are: \n {}".format(len(keys_ingested), keys_ingested))
     if len(keys_ingested)>0 and util.time_remaining(self.context) > c.SAVE_WINDOW_IN_SECONDS and not main_file_data.empty:            
         main_file_data = self.__convert_to_submission_df(main_file_data)
         gc.collect()
         self.__excret(self.directory, main_filename, main_file_data, sep)            
         self.__delete_keys(keys_ingested)
     elif util.time_remaining(self.context) <= c.SAVE_WINDOW_IN_SECONDS:            
         print "Time has run out!  We have less than {} seconds remaining before this lambda times out.  Abandoning the S3 commit to avoid file corruption.".format(c.SAVE_WINDOW_IN_SECONDS)
         print "Aggregation window (Max Lambda Execution Time * {}): {} seconds".format(c.RATIO_OF_MAX_LAMBDA_TIME, timeout) 
         print "S3 Save window: {} seconds".format(c.SAVE_WINDOW_IN_SECONDS) 
         print "Lambda time remaining: {} seconds".format(util.time_remaining(self.context))                        
        
     remaining_files = list(set(self.files) - set(keys_ingested))
     if len(remaining_files) > 0:        
         debug_print("Re-adding the {} paths to SQS to attempt again. The paths are \n{}".format(len(remaining_files), remaining_files))               
         self.__add_to_sqs(remaining_files)        
     print "I've consumed everything I can in bucket '{}'".format(self.directory)
     return
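
The private __calculate_aggregate_window_timeout helper is not shown in this excerpt. Based on the RATIO_OF_MAX_LAMBDA_TIME constant referenced in the log messages above, a plausible sketch (an assumption, not the verified implementation) is:

def calculate_aggregate_window_timeout(maximum_lambda_duration_in_seconds):
    # Reserve a fraction of the Lambda's maximum execution time for aggregation;
    # the remainder is left for the S3 save window.
    return maximum_lambda_duration_in_seconds * c.RATIO_OF_MAX_LAMBDA_TIME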
Example #3
def write(bucket, key, data, sep, object_encoding, append=False):   
    if data.empty:        
        raise RuntimeError("[{}]An attempt to write an empty dataset has occurred.  The requested dataset was: {}".format(error.Error.empty_dataframe(), data))
    sensitivity_type = KeyParts(key, sep).sensitivity_level.lower()   
    s3 = s3fsmap[sensitivity_type]    
    s3_open = s3.open    
    size_before_dup_drop = len(data)
    data.drop_duplicates(inplace=True)        
    size_after_dup_drop = len(data)        
    if size_before_dup_drop - size_after_dup_drop > 0:
        print "{} duplicates have been dropped".format(size_before_dup_drop - size_after_dup_drop) 
    util.debug_print("Using object encoding {}".format(object_encoding))
    path='{}{}'.format(bucket,key)          
    pwrite(path, data, open_with=s3_open, compression='GZIP', append=append, has_nulls=True, object_encoding=object_encoding)        
    return path
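
A hypothetical call, assuming data is a pandas DataFrame and object_encoding is a fastparquet column-encoding map; the bucket prefix, key layout, and sensitivity level are illustrative only (s3fsmap and KeyParts define what is actually valid):

import pandas as pd

df = pd.DataFrame({"pkey": ["a", "b", "a"], "value": [1, 2, 1]})
path = write("metric-bucket/", "sessionstart/low/2021/01/01/00/data.parquet", df, "/", {"pkey": "utf8", "value": "int"}, append=False)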
Example #4
def write(bucket, key, data, sep, object_encoding):
    if data.empty:
        raise RuntimeError(
            "[{}]An attempt to write an empty dataset has occurred.  The request dataset was: {}"
            .format(error.Error.empty_dataframe(), data))
    sensitivity_type = KeyParts(key, sep).sensitivity_level.lower()
    s3 = s3fsmap[sensitivity_type]
    s3_open = s3.open
    path = '{}{}'.format(bucket, key)
    pwrite(path,
           data,
           open_with=s3_open,
           compression='GZIP',
           append=False,
           has_nulls=True,
           object_encoding=object_encoding)
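
Both write() variants pass open_with, compression, append, has_nulls, and object_encoding to pwrite, which matches the signature of fastparquet's writer; assuming that is what pwrite aliases, the import (not shown in these excerpts) would look like:

# Assumed import; the examples above do not show it.
from fastparquet import write as pwrite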
Example #5
 def crawl_from_relative(self, prefix):        
     #combine only files for the past two days.  Older files should already be fully aggregated
     start = datetime.datetime.utcnow() - datetime.timedelta(days=2)
     crawl_paths = dict({})
     for page in self.__s3.list(prefix):            
         if "Contents" in page:
             for obj in page[ "Contents" ]:
                 key = obj['Key']                                                    
                 parts = KeyParts(key, self.__sep)                    
                 event_date = datetime.datetime(parts.year, parts.month, parts.day, parts.hour)
                 if event_date >= start:
                     path = self.__sep.join(parts.path.split(self.__sep)[:-1])                          
                     if path not in crawl_paths: 
                         crawl_paths[path] = []
                     crawl_paths[path].append(parts.filename)                        
                     
     #assign an amoeba generator per identified path
     return crawl_paths     
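
The return value maps each partition path (the key with its filename stripped) to the filenames found beneath it over the past two days; a hypothetical result, with illustrative keys and filenames:

# {
#     "sessionstart/low/2021/01/01/00": ["data1.parquet", "data2.parquet"],
#     "sessionend/low/2021/01/01/01": ["data3.parquet"]
# }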
Example #6
 def crawl(self, prefix, lambda_pool, func, depth=10):        
     #combine only files for the past X.  Older files should already be fully aggregated        
     print "Locating paths to crawl on bucket '{}' with prefix '{}'".format(self.__bucket, prefix)
     crawled_paths = {}
     idx = 0
     count = 0        
     for page in self.__s3.list(prefix=prefix):                        
         if "Contents" in page:
             for obj in page[ "Contents" ]:
                 key = obj['Key']                                                                        
                 parts = KeyParts(key, self.__sep)                                      
                 path = self.__sep.join(parts.path.split(self.__sep)[:depth])                     
                 if path not in crawled_paths:                       
                     crawled_paths[path]=True
                     func(self.__context, path, lambda_pool[idx])
                     count += 1
                     idx += 1                        
                     if idx >= len(lambda_pool):
                         idx = 0
     
     print "Path scouting complete on bucket '{}'".format(self.__bucket)
     return count  
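
crawl() round-robins each newly discovered path across lambda_pool by calling func(context, path, lambda_name); a minimal, hypothetical callback (names are illustrative):

def dispatch_path(context, path, lambda_name):
    # Purely illustrative: hand the path to one of the pooled aggregation Lambdas.
    print("Dispatching path '{}' to lambda '{}'".format(path, lambda_name))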
Example #7
def ingest(event, lambdacontext):
    starttime = time.time()
    gc.collect()
    root = event.get("root", None)
    print "Initial memory size:", mutil.get_memory_object()
    print "Started amoeba with root {}".format(root)
    context = event.get("context", {})
    context[c.KEY_LAMBDA_FUNCTION] = lambdacontext.function_name if hasattr(
        lambdacontext, 'function_name') else None
    context[c.KEY_START_TIME] = starttime
    is_lambda = context[c.KEY_LAMBDA_FUNCTION] is not None
    bucket = os.environ[c.ENV_S3_STORAGE]
    crawler = Crawler(context, bucket)
    roots = crawler.crawl_from_relative(root)
    s3_fs = s3fs.S3FileSystem()
    s3 = S3(context, bucket)
    timeout = calculate_aggregate_window_timeout(
        context[c.KEY_MAX_LAMBDA_TIME])
    target_excretion_size = context[c.KEY_TARGET_AGGREGATION_FILE_SIZE_IN_MB]
    compression_ratio = context[c.KEY_CSV_PARQUET_COMPRESSION_RATIO]
    sep = context[c.KEY_SEPERATOR_PARTITION]
    memory_used = mutil.get_memory_usage()
    projected_compressed_file_size_in_mb = 0
    print "Hunting for {} seconds in bucket '{}'".format(timeout, bucket)
    for path in roots:
        #The GLUE Crawler does not work well when a single key in S3 contains varying data schemas.
        files = roots[path]
        if len(files) == 1:
            continue
        debug_print("\t\tIngesting path {}".format(path))
        df = {}
        keys_ingested = []
        data = None

        for file in files:
            debug_print("\t\t\t{}".format(file))
            key = "{}/{}".format(path, file)
            try:
                size_in_megabytes = s3.size_in_megabytes(key)
            except ClientError as e:
                if str(e.response['Error']['Code']) == '404':
                    continue
                else:
                    print "Error: ", e.response['Error']['Code'], key
                    raise e

            if size_in_megabytes > target_excretion_size:
                debug_print(
                    "Skipping file '{}'.  It has reached the targetted file size"
                    .format(key))
                continue
            size_in_bytes = size_in_megabytes * 1024 * 1024
            try:
                data = reader.read(s3_fs, bucket, key)
                keys_ingested.append(key)
            except ClientError as e:
                print e.response['Error']['Code'], "key=>", key
                #handle corrupt files, this can happen if a write did not finish correctly
                if e.message == "Seek before start of file":
                    print "Deleting corrupt file %s", key
                    s3.delete([key])
                elif e.response['Error']['Code'] == 'NoSuchKey':
                    print '{}: for key {}'.format(e.response['Error']['Code'],
                                                  key)
                else:
                    util.logger.error(e)
                continue
            for row in data.itertuples(index=True):
                row = row.__dict__
                del row['Index']
                key_parts = KeyParts(key, sep)
                uuid_key = "{}{}{}".format(row[c.PRIMARY_KEY], key_parts.event,
                                           row[c.TERTIARY_KEY])
                df_size = len(row)

                debug_print(
                    "\t\t\tSize on S3 in MB: {} Size as Dataframe: {} Ratio: {}"
                    .format(size_in_megabytes, df_size, compression_ratio))

                #a dictionary is the fastest way to create a unique set.
                if uuid_key in df:
                    debug_print(
                        "\t\t\tFound duplication in key '{}'.  Keeping the first occurrence."
                        .format(key))
                else:
                    df[uuid_key] = row

                current_dataframe_size = len(df)
                #break conditions
                #1. Memory limit exceeded
                #2. Time window exceeded
                #3. Target excretion size hit
                projected_compressed_file_size_in_mb = (
                    compression_ratio * current_dataframe_size) / 1048576.0
                memory_used = mutil.get_memory_usage()
                debug_print(
                    "\t\t\t{} seconds have elapsed.  {} kilobytes of memory have been used. The projected compressed file size is {} MB.  We are targetting an excretion file size of {} MB."
                    .format(util.elapsed(context), memory_used / 1024,
                            projected_compressed_file_size_in_mb,
                            target_excretion_size))
                if (util.elapsed(context) > timeout
                        or memory_used > context[c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER]
                        or projected_compressed_file_size_in_mb > target_excretion_size):
                    break
            if (util.elapsed(context) > timeout
                    or memory_used > context[c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER]
                    or projected_compressed_file_size_in_mb > target_excretion_size):
                print "Elapsed", util.elapsed(context), "Start:", context[c.KEY_START_TIME], \
                    "Timeout:", timeout, "Has timed out:", util.elapsed(context) > timeout, \
                    "Mem Used:", memory_used, "Max Memory %:", context[c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER]
                break
        if len(keys_ingested) > 0 and util.time_remaining(context) > 45:
            values = df.values()
            # Since the schema hash of the event columns is the same for every row, the first entry's keys can be assumed to match the column headers of the rest.
            columns = values[0].keys()
            set = pd.DataFrame(values, columns=columns)
            excret(s3, bucket, path, set, keys_ingested, sep,
                   m_schema.object_encoding(columns))
            del set
        elif util.time_remaining(context) <= 45:
            return
        del data
        del df
        del keys_ingested

        if (util.elapsed(context) > timeout
                or mutil.get_process_memory_usage_bytes() >= c.ONE_GB_IN_BYTES):
            print "\tThe elapsed time threshold of {} seconds has been hit or the memory threshold of {} megabytes has been hit. Time: {}s, Memory: {}MB".format(
                timeout, c.ONE_GB_IN_BYTES / 1048576.0, util.elapsed(context),
                mutil.get_process_memory_usage_megabytes())
            return
    print "I've consumed everything I can in bucket '{}'".format(bucket)
    return
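
The handler reads "root" and "context" from the incoming event; a hypothetical invocation payload is sketched below. The key names inside "context" come from the c module and are not spelled out in these excerpts, so the values shown are illustrative only:

# {
#     "root": "sessionstart/low",        # S3 prefix to crawl (illustrative)
#     "context": {}                      # populated with the c.KEY_* settings read above
# }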