# Setting longer timeout for reading lambda results and larger connections pool lambda_config = Config(read_timeout=lambda_read_timeout, max_pool_connections=boto_max_connections) lambda_client = boto3.client('lambda', config=lambda_config) # Fetch all the keys that match the prefix all_keys = [] for obj in s3.Bucket(bucket).objects.filter(Prefix=config["prefix"]).all(): all_keys.append(obj) print(all_keys) #exit(0) bsize = lambdautils.compute_batch_size(all_keys, lambda_memory, concurrent_lambdas) #print(bsize) #batches = lambdautils.batch_creator(all_keys, bsize) #print(batches) batches = lambdautils.batch_creator_for_ny_trip(all_keys, lambda_memory, concurrent_lambdas) print(batches) #exit(0) n_mappers = len(batches) document = xray_recorder.current_subsegment() document.put_metadata("Batch size: ", bsize, "Processing initialization") document.put_metadata("Mappers: ", n_mappers, "Processing initialization") xray_recorder.end_subsegment() #Get all keys to be processed
def get_reducer_batch_size(keys, lambda_memory=1536):
    """Return the batch size to use for the reducer stage.

    Resolves the old ``TODO: Paramertize memory size`` — the Lambda memory
    size is now a parameter, defaulting to the previously hard-coded 1536 MB
    so existing callers are unaffected.

    Args:
        keys: S3 object summaries still to be reduced.
        lambda_memory: Lambda memory size (MB) used to size batches.

    Returns:
        The computed batch size, floored at 2 — at least 2 keys per batch
        is the condition for the reduce pipeline to terminate.
    """
    batch_size = lambdautils.compute_batch_size(keys, lambda_memory)
    return max(batch_size, 2)  # At least 2 in a batch - Condition for termination
# 1. Get all keys to be processed # init bucket = config["bucket"] job_bucket = config["jobBucket"] region = config["region"] lambda_memory = config["lambdaMemory"] concurrent_lambdas = config["concurrentLambdas"] #all_keys = s3_client.list_objects(Bucket=bucket, Prefix=config["prefix"])["Contents"] # Fetch all the keys that match the prefix all_keys = [] for obj in s3.Bucket(bucket).objects.filter(Prefix=config["prefix"]).all(): all_keys.append(obj) bsize = lambdautils.compute_batch_size(all_keys, lambda_memory) batches = lambdautils.batch_creator(all_keys, bsize) n_mappers = len(batches) # 2. Create the lambda functions L_PREFIX = "BL" # Lambda functions mapper_lambda_name = L_PREFIX + "-mapper-" + job_id reducer_lambda_name = L_PREFIX + "-reducer-" + job_id rc_lambda_name = L_PREFIX + "-rc-" + job_id # write job config write_job_config(job_id, job_bucket, n_mappers, reducer_lambda_name, config["reducer"]["handler"])
def get_reducer_batch_size(keys, lambda_memory=1536, concurrent_lambdas=1000):
    """Return the batch size to use for the reducer stage.

    Resolves the old ``TODO: Paramertize memory size`` — the memory size and
    concurrency are now parameters, defaulting to the previously hard-coded
    1536 MB / 1000 so existing callers are unaffected.

    Args:
        keys: S3 object summaries still to be reduced.
        lambda_memory: Lambda memory size (MB) used to size batches.
        concurrent_lambdas: maximum number of concurrent Lambda invocations.

    Returns:
        The computed batch size, floored at 2.
    """
    batch_size = lambdautils.compute_batch_size(keys, lambda_memory, concurrent_lambdas)
    # At least 2 per batch — termination condition for the reduce pipeline.
    # (Comment translated from the original Korean.)
    return max(batch_size, 2)
def handler(event, context):
    """Driver entry point: batch S3 input keys, fan out mapper Lambdas,
    optionally wait for the reduce pipeline, and return a timing summary.

    Args:
        event: dict with required keys 'mapper', 'reducer', 'bucket',
            'prefix', 'job_id', 'jobBucket', 'region'; optional 'endearly'
            (cap on mapper count), 'dryrun' (stop after batching), and
            'full_async' (fire-and-forget mapper invocation, no wait).
        context: Lambda context object; falsy when invoked from main, in
            which case a local AWS profile is used.

    Returns:
        A 'TIMER:CALL:...' summary string, or None if required arguments
        are missing.
    """
    entry = time.time() * 1000
    logger = logging.getLogger()
    logger.setLevel(logging.WARN)

    if not event:  # requires arguments
        print('No event was passed to the handler, exiting...')
        return
    if 'mapper' not in event or 'reducer' not in event:
        print('No mapper or reducer function names given, unable to proceed, exiting...')
        return

    # create an S3 session
    if not context:  # calling from main
        boto3.setup_default_session(profile_name='cjk1')
    s3 = boto3.resource('s3')
    config = botocore.client.Config(connect_timeout=50, read_timeout=200)
    s3_client = boto3.client('s3', config=config)
    JOB_INFO = 'jobinfo.json'

    # 1. Get all keys to be processed
    # init
    endearly = int(event['endearly']) if 'endearly' in event else 0
    bucket = event["bucket"]
    dryrun = "dryrun" in event
    lambda_memory = 1536  # assumed mapper memory size (MB) — TODO parameterize

    # Fetch all the keys that match the prefix
    all_keys = []
    for obj in s3.Bucket(bucket).objects.filter(Prefix=event["prefix"]).all():
        all_keys.append(obj)

    bsize = lambdautils.compute_batch_size(all_keys, lambda_memory)
    batches = lambdautils.batch_creator(all_keys, bsize)
    n_mappers = len(batches)
    if endearly > 0 and endearly < n_mappers:
        n_mappers = endearly
    print("Num. of Mappers (and Reducers) ", n_mappers)

    if dryrun:  # don't go any further
        delta = (time.time() * 1000) - entry
        me_str = 'TIMER:CALL:{}:dryrun:0'.format(delta)
        logger.warning(me_str)  # .warn is a deprecated alias of .warning
        return me_str

    # process the remaining arguments
    job_id = event["job_id"]
    job_bucket = event["jobBucket"]
    region = event["region"]
    # BUG FIX: the local was named `async`, a reserved keyword since
    # Python 3.7 (SyntaxError); renamed to run_async.
    run_async = "full_async" in event
    reducer_lambda_name = event["reducer"]
    mapper_lambda_name = event["mapper"]

    # Write Jobdata to S3
    j_key = job_id + "/jobdata"
    data = json.dumps({
        "mapCount": n_mappers,
        "totalS3Files": len(all_keys),
        "startTime": time.time()
    })
    write_to_s3(s3, job_bucket, j_key, data, {})

    data = json.dumps({
        "jobId": job_id,
        "jobBucket": job_bucket,
        "mapCount": n_mappers,
        "reducerFunction": reducer_lambda_name,
        "reducerHandler": "{}.handler".format(reducer_lambda_name)
    }, indent=4)
    j_key = job_id + "/jobinfo.json"
    write_to_s3(s3, job_bucket, j_key, data, {})

    ### Execute ###
    total_lambda_secs = 0
    reducer_lambda_time = 0
    mapper_outputs = []

    if run_async:
        # asynchronous invocation of mappers
        for i in range(n_mappers):
            invoke_lambda(mapper_lambda_name, batches, bucket, job_bucket, job_id, i)
    else:
        # synchronous invocation of mappers on parallel threads
        pool = ThreadPool(n_mappers)
        Ids = [i + 1 for i in range(n_mappers)]
        invoke_lambda_partial = partial(invoke_lambda_sync, mapper_lambda_name,
                                        batches, mapper_outputs, bucket,
                                        job_bucket, job_id)

        # Burst request handling — only used by the synchronous run
        # (use --dryrun to see how many actual mappers are needed)
        mappers_executed = 0
        concurrent_lambdas = 100
        while mappers_executed < n_mappers:
            # BUG FIX: size each burst by the REMAINING mappers, not the
            # total; min(concurrent_lambdas, n_mappers) only worked by
            # accident of Python slice clamping.
            nm = min(concurrent_lambdas, n_mappers - mappers_executed)
            results = pool.map(invoke_lambda_partial, Ids[mappers_executed:mappers_executed + nm])
            mappers_executed += nm
        pool.close()
        pool.join()

        for output in mapper_outputs:
            # outputs from direct invocation carry the timing in 'body'
            if 'body' in output:
                total_lambda_secs += float(output['body'][2])
            else:
                total_lambda_secs += float(output[2])

    if not run_async:
        # Note: Wait for the job to complete so that we can compute total
        # cost; poll S3 for the "<job_id>/result" marker object.
        reducer_keys = []  # Get all reducer keys
        while True:
            job_keys = s3_client.list_objects(Bucket=job_bucket, Prefix=job_id)["Contents"]
            keys = [jk["Key"] for jk in job_keys]
            total_s3_size = sum(jk["Size"] for jk in job_keys)
            logger.info("checking if job is done")

            # check job done: result marker + per-reducer timing metadata
            if job_id + "/result" in keys:
                reducer_lambda_time += float(
                    s3.Object(job_bucket, job_id + "/result").metadata['processingtime'])
                for key in keys:
                    if "task/reducer" in key:
                        reducer_lambda_time += float(
                            s3.Object(job_bucket, key).metadata['processingtime'])
                        reducer_keys.append(key)
                break
            time.sleep(5)

    delta = (time.time() * 1000) - entry
    me_str = 'TIMER:CALL:{}:mappers:{}:reducer:{}'.format(delta, total_lambda_secs, reducer_lambda_time)
    logger.warning(me_str)
    return me_str
def handler(event, context):
    """Driver entry point (formatted copy): batch S3 input keys, fan out
    mapper Lambdas, optionally wait for the reduce pipeline, and return a
    timing summary.

    Args:
        event: dict with required keys 'mapper', 'reducer', 'bucket',
            'prefix', 'job_id', 'jobBucket', 'region'; optional 'endearly'
            (cap on mapper count), 'dryrun' (stop after batching), and
            'full_async' (fire-and-forget mapper invocation, no wait).
        context: Lambda context object; falsy when invoked from main, in
            which case a local AWS profile is used.

    Returns:
        A 'TIMER:CALL:...' summary string, or None if required arguments
        are missing.
    """
    entry = time.time() * 1000
    logger = logging.getLogger()
    logger.setLevel(logging.WARN)

    if not event:  # requires arguments
        print('No event was passed to the handler, exiting...')
        return
    if 'mapper' not in event or 'reducer' not in event:
        print(
            'No mapper or reducer function names given, unable to proceed, exiting...'
        )
        return

    # create an S3 session
    if not context:  # calling from main
        boto3.setup_default_session(profile_name='cjk1')
    s3 = boto3.resource('s3')
    config = botocore.client.Config(connect_timeout=50, read_timeout=200)
    s3_client = boto3.client('s3', config=config)
    JOB_INFO = 'jobinfo.json'

    # 1. Get all keys to be processed
    # init
    endearly = int(event['endearly']) if 'endearly' in event else 0
    bucket = event["bucket"]
    dryrun = "dryrun" in event
    lambda_memory = 1536  # assumed mapper memory size (MB) — TODO parameterize

    # Fetch all the keys that match the prefix
    all_keys = []
    for obj in s3.Bucket(bucket).objects.filter(Prefix=event["prefix"]).all():
        all_keys.append(obj)

    bsize = lambdautils.compute_batch_size(all_keys, lambda_memory)
    batches = lambdautils.batch_creator(all_keys, bsize)
    n_mappers = len(batches)
    if endearly > 0 and endearly < n_mappers:
        n_mappers = endearly
    print("Num. of Mappers (and Reducers) ", n_mappers)

    if dryrun:  # don't go any further
        delta = (time.time() * 1000) - entry
        me_str = 'TIMER:CALL:{}:dryrun:0'.format(delta)
        logger.warning(me_str)  # .warn is a deprecated alias of .warning
        return me_str

    # process the remaining arguments
    job_id = event["job_id"]
    job_bucket = event["jobBucket"]
    region = event["region"]
    # BUG FIX: the local was named `async`, a reserved keyword since
    # Python 3.7 (SyntaxError); renamed to run_async.
    run_async = "full_async" in event
    reducer_lambda_name = event["reducer"]
    mapper_lambda_name = event["mapper"]

    # Write Jobdata to S3
    j_key = job_id + "/jobdata"
    data = json.dumps({
        "mapCount": n_mappers,
        "totalS3Files": len(all_keys),
        "startTime": time.time()
    })
    write_to_s3(s3, job_bucket, j_key, data, {})

    data = json.dumps(
        {
            "jobId": job_id,
            "jobBucket": job_bucket,
            "mapCount": n_mappers,
            "reducerFunction": reducer_lambda_name,
            "reducerHandler": "{}.handler".format(reducer_lambda_name)
        },
        indent=4)
    j_key = job_id + "/jobinfo.json"
    write_to_s3(s3, job_bucket, j_key, data, {})

    ### Execute ###
    total_lambda_secs = 0
    reducer_lambda_time = 0
    mapper_outputs = []

    if run_async:
        # asynchronous invocation of mappers
        for i in range(n_mappers):
            invoke_lambda(mapper_lambda_name, batches, bucket, job_bucket,
                          job_id, i)
    else:
        # synchronous invocation of mappers on parallel threads
        pool = ThreadPool(n_mappers)
        Ids = [i + 1 for i in range(n_mappers)]
        invoke_lambda_partial = partial(invoke_lambda_sync, mapper_lambda_name,
                                        batches, mapper_outputs, bucket,
                                        job_bucket, job_id)

        # Burst request handling — only used by the synchronous run
        # (use --dryrun to see how many actual mappers are needed)
        mappers_executed = 0
        concurrent_lambdas = 100
        while mappers_executed < n_mappers:
            # BUG FIX: size each burst by the REMAINING mappers, not the
            # total; min(concurrent_lambdas, n_mappers) only worked by
            # accident of Python slice clamping.
            nm = min(concurrent_lambdas, n_mappers - mappers_executed)
            results = pool.map(invoke_lambda_partial,
                               Ids[mappers_executed:mappers_executed + nm])
            mappers_executed += nm
        pool.close()
        pool.join()

        for output in mapper_outputs:
            # outputs from direct invocation carry the timing in 'body'
            if 'body' in output:
                total_lambda_secs += float(output['body'][2])
            else:
                total_lambda_secs += float(output[2])

    if not run_async:
        # Note: Wait for the job to complete so that we can compute total
        # cost; poll S3 for the "<job_id>/result" marker object.
        reducer_keys = []  # Get all reducer keys
        while True:
            job_keys = s3_client.list_objects(Bucket=job_bucket,
                                              Prefix=job_id)["Contents"]
            keys = [jk["Key"] for jk in job_keys]
            total_s3_size = sum(jk["Size"] for jk in job_keys)
            logger.info("checking if job is done")

            # check job done: result marker + per-reducer timing metadata
            if job_id + "/result" in keys:
                reducer_lambda_time += float(
                    s3.Object(job_bucket,
                              job_id + "/result").metadata['processingtime'])
                for key in keys:
                    if "task/reducer" in key:
                        reducer_lambda_time += float(
                            s3.Object(job_bucket,
                                      key).metadata['processingtime'])
                        reducer_keys.append(key)
                break
            time.sleep(5)

    delta = (time.time() * 1000) - entry
    me_str = 'TIMER:CALL:{}:mappers:{}:reducer:{}'.format(
        delta, total_lambda_secs, reducer_lambda_time)
    logger.warning(me_str)
    return me_str
def get_reducer_batch_size(keys, lambda_memory, concurrent_lambdas):
    """Compute the batch size for the reducer stage, floored at 2.

    A batch of fewer than 2 keys would never let the reduce phase
    terminate, so the computed size is clamped from below.

    Args:
        keys: S3 object summaries still to be reduced.
        lambda_memory: Lambda memory size used for batch sizing.
        concurrent_lambdas: maximum concurrent Lambda invocations.

    Returns:
        The computed batch size, never less than 2.
    """
    computed = lambdautils.compute_batch_size(keys, lambda_memory, concurrent_lambdas)
    return computed if computed >= 2 else 2