# init
bucket = config["bucket"]
job_bucket = config["jobBucket"]
region = config["region"]
lambda_memory = config["lambdaMemory"]
concurrent_lambdas = config["concurrentLambdas"]

#all_keys = s3_client.list_objects(Bucket=bucket, Prefix=config["prefix"])["Contents"]

# Fetch all the keys that match the prefix
all_keys = []
for obj in s3.Bucket(bucket).objects.filter(Prefix=config["prefix"]).all():
    all_keys.append(obj)

bsize = lambdautils.compute_batch_size(all_keys, lambda_memory)
batches = lambdautils.batch_creator(all_keys, bsize)
n_mappers = len(batches)

# 2. Create the lambda functions
L_PREFIX = "BL"

# Lambda functions
mapper_lambda_name = L_PREFIX + "-mapper-" + job_id
reducer_lambda_name = L_PREFIX + "-reducer-" + job_id
rc_lambda_name = L_PREFIX + "-rc-" + job_id

# write job config
write_job_config(job_id, job_bucket, n_mappers, reducer_lambda_name, config["reducer"]["handler"])
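
# NOTE: compute_batch_size and batch_creator come from lambdautils, which is not
# shown in this file. The functions below are a minimal sketch of what they might
# look like, assuming batches are sized so that one batch of S3 objects fits in a
# mapper's memory; the actual lambdautils implementation may differ.

def compute_batch_size(keys, lambda_memory, usable_fraction=0.6):
    # Assume only a fraction of the Lambda memory (given in MB) is usable for input data.
    usable_bytes = lambda_memory * 1024 * 1024 * usable_fraction
    total_bytes = sum(k.size for k in keys)  # boto3 ObjectSummary exposes .size
    avg_object_bytes = max(total_bytes / max(len(keys), 1), 1)
    return max(int(usable_bytes / avg_object_bytes), 1)

def batch_creator(keys, batch_size):
    # Split the key list into consecutive chunks of at most batch_size objects.
    return [keys[i:i + batch_size] for i in range(0, len(keys), batch_size)]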
def lambda_handler(event, context):
    print("Received event: " + json.dumps(event, indent=2))

    start_time = time.time()

    # Job Bucket. We just got a notification from this bucket
    bucket = event['Records'][0]['s3']['bucket']['name']
    #key = urllib.unquote_plus(event['Records'][0]['s3']['object']['key'].encode('utf8'))

    config = json.loads(open('./jobinfo.json', "r").read())

    job_id = config["jobId"]
    map_count = config["mapCount"]
    r_function_name = config["reducerFunction"]
    r_handler = config["reducerHandler"]

    ### Get Mapper Finished Count ###

    # Get job files
    files = s3_client.list_objects(Bucket=bucket, Prefix=job_id)["Contents"]

    if check_job_done(files) == True:
        print("Job done!!! Check the result file")
        # TODO: Delete reducer and coordinator lambdas
        return
    else:
        ### Stateless Coordinator logic
        mapper_keys = get_mapper_files(files)
        print("Mappers Done so far ", len(mapper_keys))

        if map_count == len(mapper_keys):

            # All the mappers have finished, time to schedule the reducers
            stepInfo = get_reducer_state_info(files, job_id, bucket)

            print("stepInfo", stepInfo)

            step_number = stepInfo[0]
            reducer_keys = stepInfo[1]

            if len(reducer_keys) == 0:
                print("Still waiting to finish Reducer step ", step_number)
                return

            # Compute this based on metadata of files
            r_batch_size = get_reducer_batch_size(reducer_keys)

            print("Starting the reducer step", step_number)
            print("Batch Size", r_batch_size)

            # Create Batch params for the Lambda function
            r_batch_params = lambdautils.batch_creator(reducer_keys, r_batch_size)

            # Build the lambda parameters
            n_reducers = len(r_batch_params)
            n_s3 = n_reducers * len(r_batch_params[0])
            step_id = step_number + 1

            for i in range(len(r_batch_params)):
                batch = [b['Key'] for b in r_batch_params[i]]

                # invoke the reducers asynchronously
                resp = lambda_client.invoke(
                    FunctionName=r_function_name,
                    InvocationType='Event',
                    Payload=json.dumps({
                        "bucket": bucket,
                        "keys": batch,
                        "jobBucket": bucket,
                        "jobId": job_id,
                        "nReducers": n_reducers,
                        "stepId": step_id,
                        "reducerId": i
                    })
                )
                print(resp)

            # Now write the reducer state
            fname = "%s/reducerstate.%s" % (job_id, step_id)
            write_reducer_state(n_reducers, n_s3, bucket, fname)
        else:
            print("Still waiting for all the mappers to finish ..")
def handler(event, context):
    start_time = time.time()

    # Job Bucket. We just got a notification from this bucket
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = event['Records'][0]['s3']['object']['key']
    print("Received event: {}:{}".format(bucket, key))

    idx = key.find('/')
    tmpdir = key[:idx]
    obj = s3.Object(bucket, '{}/jobinfo.json'.format(tmpdir))
    file_content = obj.get()['Body'].read().decode('utf-8')
    config = json.loads(file_content)

    job_id = config["jobId"]
    map_count = config["mapCount"]
    r_function_name = config["reducerFunction"]
    r_handler = config["reducerHandler"]

    ### Get Mapper Finished Count ###

    # Get job files
    files = s3_client.list_objects(Bucket=bucket, Prefix=job_id)["Contents"]

    if check_job_done(files) == True:
        print("Job done!!! Check the result file")
        return
    else:
        ### Stateless Coordinator logic
        mapper_keys = get_mapper_files(files)
        print("Mappers Done so far ", len(mapper_keys))

        if map_count == len(mapper_keys):

            # All the mappers have finished, time to schedule the reducers
            stepInfo = get_reducer_state_info(files, job_id, bucket)
            #print("stepInfo", stepInfo)

            step_number = stepInfo[0]
            reducer_keys = stepInfo[1]

            if len(reducer_keys) == 0:
                print("Waiting to finish Reducer step ", step_number)
                return

            # Compute this based on metadata of files
            r_batch_size = get_reducer_batch_size(reducer_keys)
            #print("Starting the reducer step", step_number)

            # Create Batch params for the Lambda function
            r_batch_params = lambdautils.batch_creator(reducer_keys, r_batch_size)
            print("Batch Size {}, Spawning this many reducers: {}".format(r_batch_size, len(r_batch_params)))

            # Build the lambda parameters
            n_reducers = len(r_batch_params)
            n_s3 = n_reducers * len(r_batch_params[0])
            step_id = step_number + 1

            for i in range(len(r_batch_params)):
                batch = [b['Key'] for b in r_batch_params[i]]

                # invoke the reducers asynchronously
                resp = lambda_client.invoke(
                    FunctionName=r_function_name,
                    InvocationType='Event',
                    Payload=json.dumps({
                        "bucket": bucket,
                        "keys": batch,
                        "jobBucket": bucket,
                        "jobId": job_id,
                        "nReducers": n_reducers,
                        "stepId": step_id,
                        "reducerId": i
                    })
                )
                #print('Reducer: {}'.format(resp))

            # Now write the reducer state
            fname = "%s/reducerstate.%s" % (job_id, step_id)
            write_reducer_state(n_reducers, n_s3, bucket, fname)
        else:
            print("Still waiting for all the mappers or reducers (if count > total_jobs (Num. of Mappers reported by driver)) to finish ..")
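
# The coordinator above relies on a few S3-layout helpers that are defined
# elsewhere in the project. The sketches below show one plausible shape for
# check_job_done, get_mapper_files and write_reducer_state, assuming mapper
# outputs live under <jobId>/task/mapper/* and the final output lands at
# <jobId>/result, and assuming the module-level json/time/s3_client used by the
# handler. The real helpers may use a different layout or metadata.

def check_job_done(files):
    # files is the "Contents" list returned by s3_client.list_objects
    return any(f["Key"].endswith("/result") for f in files)

def get_mapper_files(files):
    # Keep only the objects written by mapper tasks.
    return [f for f in files if "task/mapper" in f["Key"]]

def write_reducer_state(n_reducers, n_s3, bucket, fname):
    # Persist how many reducers were launched for this step so the next
    # coordinator invocation can tell when the step has finished.
    data = json.dumps({
        "reducerCount": n_reducers,
        "totalS3Files": n_s3,
        "startTime": time.time()
    })
    s3_client.put_object(Bucket=bucket, Key=fname, Body=data)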
def lambda_handler(event, context):
    print("Received event: " + json.dumps(event, indent=2))

    start_time = time.time()

    # Job Bucket. We just got a notification from this bucket
    bucket = event['Records'][0]['s3']['bucket']['name']
    #key = urllib.unquote_plus(event['Records'][0]['s3']['object']['key'].encode('utf8'))

    config = json.loads(open('./jobinfo.json', "r").read())

    job_id = config["jobId"]
    map_count = config["mapCount"]
    r_function_name = config["reducerFunction"]
    r_handler = config["reducerHandler"]

    ### Get Mapper Finished Count ###

    # Get job files
    files = s3_client.list_objects(Bucket=bucket, Prefix=job_id)["Contents"]

    if check_job_done(files) == True:
        print("Job done!!! Check the result file")
        # TODO: Delete reducer and coordinator lambdas
        return
    else:
        ### Stateless Coordinator logic
        mapper_keys = get_mapper_files(files)
        print("Mappers Done so far ", len(mapper_keys))

        if map_count == len(mapper_keys):

            # All the mappers have finished, time to schedule the reducers
            stepInfo = get_reducer_state_info(files, job_id, bucket)

            print("stepInfo", stepInfo)

            step_number = stepInfo[0]
            reducer_keys = stepInfo[1]

            if len(reducer_keys) == 0:
                print("Still waiting to finish Reducer step ", step_number)
                return

            # Compute this based on metadata of files
            r_batch_size = get_reducer_batch_size(reducer_keys)

            print("Starting the reducer step", step_number)
            print("Batch Size", r_batch_size)

            # Create Batch params for the Lambda function
            r_batch_params = lambdautils.batch_creator(reducer_keys, r_batch_size)

            # Build the lambda parameters
            n_reducers = len(r_batch_params)
            n_s3 = n_reducers * len(r_batch_params[0])
            step_id = step_number + 1

            for i in range(len(r_batch_params)):
                batch = [b['Key'] for b in r_batch_params[i]]

                # invoke the reducers asynchronously
                resp = lambda_client.invoke(
                    FunctionName=r_function_name,
                    InvocationType='Event',
                    Payload=json.dumps({
                        "bucket": bucket,
                        "keys": batch,
                        "jobBucket": bucket,
                        "jobId": job_id,
                        "nReducers": n_reducers,
                        "stepId": step_id,
                        "reducerId": i
                    })
                )
                print(resp)

            # Now write the reducer state
            fname = "%s/reducerstate.%s" % (job_id, step_id)
            write_reducer_state(n_reducers, n_s3, bucket, fname)
        else:
            print("Still waiting for all the mappers to finish ..")
def handler(event, context):
    start_time = time.time()

    # Job Bucket. We just got a notification from this bucket
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = event['Records'][0]['s3']['object']['key']
    print("Received event: {}:{}".format(bucket, key))

    idx = key.find('/')
    tmpdir = key[:idx]
    obj = s3.Object(bucket, '{}/jobinfo.json'.format(tmpdir))
    file_content = obj.get()['Body'].read().decode('utf-8')
    config = json.loads(file_content)

    job_id = config["jobId"]
    map_count = config["mapCount"]
    r_function_name = config["reducerFunction"]
    r_handler = config["reducerHandler"]

    ### Get Mapper Finished Count ###

    # Get job files
    files = s3_client.list_objects(Bucket=bucket, Prefix=job_id)["Contents"]

    if check_job_done(files) == True:
        print("Job done!!! Check the result file")
        return
    else:
        ### Stateless Coordinator logic
        mapper_keys = get_mapper_files(files)
        print("Mappers Done so far ", len(mapper_keys))

        if map_count == len(mapper_keys):

            # All the mappers have finished, time to schedule the reducers
            stepInfo = get_reducer_state_info(files, job_id, bucket)
            #print("stepInfo", stepInfo)

            step_number = stepInfo[0]
            reducer_keys = stepInfo[1]

            if len(reducer_keys) == 0:
                print("Waiting to finish Reducer step ", step_number)
                return

            # Compute this based on metadata of files
            r_batch_size = get_reducer_batch_size(reducer_keys)
            #print("Starting the reducer step", step_number)

            # Create Batch params for the Lambda function
            r_batch_params = lambdautils.batch_creator(reducer_keys, r_batch_size)
            print("Batch Size {}, Spawning this many reducers: {}".format(r_batch_size, len(r_batch_params)))

            # Build the lambda parameters
            n_reducers = len(r_batch_params)
            n_s3 = n_reducers * len(r_batch_params[0])
            step_id = step_number + 1

            for i in range(len(r_batch_params)):
                batch = [b['Key'] for b in r_batch_params[i]]

                # invoke the reducers asynchronously
                resp = lambda_client.invoke(
                    FunctionName=r_function_name,
                    InvocationType='Event',
                    Payload=json.dumps({
                        "bucket": bucket,
                        "keys": batch,
                        "jobBucket": bucket,
                        "jobId": job_id,
                        "nReducers": n_reducers,
                        "stepId": step_id,
                        "reducerId": i
                    })
                )
                #print('Reducer: {}'.format(resp))

            # Now write the reducer state
            fname = "%s/reducerstate.%s" % (job_id, step_id)
            write_reducer_state(n_reducers, n_s3, bucket, fname)
        else:
            print("Still waiting for all the mappers or reducers (if count > total_jobs (Num. of Mappers reported by driver)) to finish ..")
def lambda_handler(event, context):
    print("Received event: " + json.dumps(event, indent=2))

    start_time = time.time()

    # Job Bucket. We just got a notification from this bucket
    bucket = event['Records'][0]['s3']['bucket']['name']

    config = json.loads(open('./jobinfo.json', "r").read())

    job_id = config["jobId"]
    map_count = config["mapCount"]
    r_function_name = config["reducerFunction"]
    r_handler = config["reducerHandler"]

    ### Count the mappers that have finished ###

    # Get job files
    files = s3_client.list_objects(Bucket=bucket, Prefix=job_id)["Contents"]

    if check_job_done(files) == True:
        print("Job done!!! Check the result file")
        return
    else:
        mapper_keys = get_mapper_files(files)
        print("Mappers Done so far ", len(mapper_keys))

        if map_count == len(mapper_keys):

            # All the mappers have finished, so start the reducers
            stepInfo = get_reducer_state_info(files, job_id, bucket)

            print("stepInfo", stepInfo)

            step_number = stepInfo[0]
            reducer_keys = stepInfo[1]

            if len(reducer_keys) == 0:
                print("Still waiting to finish Reducer step ", step_number)
                return

            # Compute the reducer batch size based on the files' metadata
            r_batch_size = get_reducer_batch_size(reducer_keys)

            print("Starting the reducer step", step_number)
            print("Batch Size", r_batch_size)

            r_batch_params = lambdautils.batch_creator(reducer_keys, r_batch_size)

            n_reducers = len(r_batch_params)
            n_s3 = n_reducers * len(r_batch_params[0])
            step_id = step_number + 1

            for i in range(len(r_batch_params)):
                batch = [b['Key'] for b in r_batch_params[i]]

                # Invoke the reducer Lambdas asynchronously
                resp = lambda_client.invoke(
                    FunctionName=r_function_name,
                    InvocationType='Event',
                    Payload=json.dumps({
                        "bucket": bucket,
                        "keys": batch,
                        "jobBucket": bucket,
                        "jobId": job_id,
                        "nReducers": n_reducers,
                        "stepId": step_id,
                        "reducerId": i
                    })
                )
                print(resp)

            # Write the reducer state to S3
            fname = "%s/reducerstate.%s" % (job_id, step_id)
            write_reducer_state(n_reducers, n_s3, bucket, fname)
        else:
            print("Still waiting for all the mappers to finish ..")
def handler(event, context):
    entry = time.time() * 1000
    logger = logging.getLogger()
    logger.setLevel(logging.WARN)

    if not event:  # requires arguments
        print('No event was passed to the handler, exiting...')
        return
    if 'mapper' not in event or 'reducer' not in event:
        print('No mapper or reducer function names given, unable to proceed, exiting...')
        return

    # create an S3 session
    if not context:  # calling from main
        boto3.setup_default_session(profile_name='cjk1')
    s3 = boto3.resource('s3')
    config = botocore.client.Config(connect_timeout=50, read_timeout=200)
    s3_client = boto3.client('s3', config=config)

    JOB_INFO = 'jobinfo.json'

    # 1. Get all keys to be processed
    # init
    endearly = 0
    if 'endearly' in event:
        endearly = int(event['endearly'])
    bucket = event["bucket"]
    dryrun = True if "dryrun" in event else False
    lambda_memory = 1536

    # Fetch all the keys that match the prefix
    all_keys = []
    for obj in s3.Bucket(bucket).objects.filter(Prefix=event["prefix"]).all():
        all_keys.append(obj)

    bsize = lambdautils.compute_batch_size(all_keys, lambda_memory)
    batches = lambdautils.batch_creator(all_keys, bsize)
    n_mappers = len(batches)
    if endearly > 0 and endearly < n_mappers:
        n_mappers = endearly
    print("Num. of Mappers (and Reducers) ", n_mappers)

    if dryrun:  # don't go any further
        delta = (time.time() * 1000) - entry
        me_str = 'TIMER:CALL:{}:dryrun:0'.format(delta)
        logger.warn(me_str)
        return me_str

    # process the remaining arguments
    job_id = event["job_id"]
    job_bucket = event["jobBucket"]
    region = event["region"]
    full_async = True if "full_async" in event else False  # "async" is a reserved word in Python 3.7+
    reducer_lambda_name = event["reducer"]
    mapper_lambda_name = event["mapper"]

    # Write Jobdata to S3
    j_key = job_id + "/jobdata"
    data = json.dumps({
        "mapCount": n_mappers,
        "totalS3Files": len(all_keys),
        "startTime": time.time()
    })
    write_to_s3(s3, job_bucket, j_key, data, {})

    data = json.dumps({
        "jobId": job_id,
        "jobBucket": job_bucket,
        "mapCount": n_mappers,
        "reducerFunction": reducer_lambda_name,
        "reducerHandler": "{}.handler".format(reducer_lambda_name)
    }, indent=4)
    j_key = job_id + "/jobinfo.json"
    write_to_s3(s3, job_bucket, j_key, data, {})

    ### Execute ###
    total_lambda_secs = 0
    reducer_lambda_time = 0
    mapper_outputs = []

    if full_async:  # asynchronous invocation of mappers
        for i in range(n_mappers):
            invoke_lambda(mapper_lambda_name, batches, bucket, job_bucket, job_id, i)
    else:  # synchronous invocation of mappers on parallel threads
        pool = ThreadPool(n_mappers)
        Ids = [i + 1 for i in range(n_mappers)]
        invoke_lambda_partial = partial(invoke_lambda_sync, mapper_lambda_name, batches,
                                        mapper_outputs, bucket, job_bucket, job_id)

        # Burst request handling
        mappers_executed = 0
        concurrent_lambdas = 100  # only used by the synchronous run (use --dryrun to see how many mappers are actually needed)
        while mappers_executed < n_mappers:
            nm = min(concurrent_lambdas, n_mappers)
            results = pool.map(invoke_lambda_partial, Ids[mappers_executed:mappers_executed + nm])
            mappers_executed += nm

        pool.close()
        pool.join()

        for output in mapper_outputs:
            if 'body' in output:
                total_lambda_secs += float(output['body'][2])
            else:
                total_lambda_secs += float(output[2])

    if not full_async:
        # Note: wait for the job to complete so that we can compute the total cost; poll every few seconds

        # Get all reducer keys
        reducer_keys = []

        # Total execution time for reducers
        while True:
            job_keys = s3_client.list_objects(Bucket=job_bucket, Prefix=job_id)["Contents"]
            keys = [jk["Key"] for jk in job_keys]
            total_s3_size = sum([jk["Size"] for jk in job_keys])
            logger.info("checking if job is done")

            # check job done
            if job_id + "/result" in keys:
                reducer_lambda_time += float(
                    s3.Object(job_bucket, job_id + "/result").metadata['processingtime'])
                for key in keys:
                    if "task/reducer" in key:
                        reducer_lambda_time += float(
                            s3.Object(job_bucket, key).metadata['processingtime'])
                        reducer_keys.append(key)
                break
            time.sleep(5)

    delta = (time.time() * 1000) - entry
    me_str = 'TIMER:CALL:{}:mappers:{}:reducer:{}'.format(delta, total_lambda_secs, reducer_lambda_time)
    logger.warn(me_str)
    return me_str
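
# write_to_s3, invoke_lambda and invoke_lambda_sync are imported from elsewhere in
# the project. The sketches below show one plausible shape for them, matching the
# call sites in the driver above; the payload fields and the module-level
# lambda_client they assume are illustrative, not the project's actual code.

def write_to_s3(s3, bucket, key, data, metadata):
    # Store a blob in S3 together with optional user metadata.
    s3.Bucket(bucket).put_object(Key=key, Body=data, Metadata=metadata)

def invoke_lambda(mapper_lambda_name, batches, bucket, job_bucket, job_id, mapper_id):
    # Fire-and-forget invocation of one mapper with its batch of S3 keys.
    batch = [b.key for b in batches[mapper_id]]  # ObjectSummary -> key string
    lambda_client.invoke(
        FunctionName=mapper_lambda_name,
        InvocationType='Event',
        Payload=json.dumps({
            "bucket": bucket,
            "keys": batch,
            "jobBucket": job_bucket,
            "jobId": job_id,
            "mapperId": mapper_id
        }))

def invoke_lambda_sync(mapper_lambda_name, batches, mapper_outputs, bucket,
                       job_bucket, job_id, mapper_id):
    # Synchronous variant used by the thread pool; collects each mapper's response.
    batch = [b.key for b in batches[mapper_id - 1]]  # Ids passed to the pool are 1-based
    resp = lambda_client.invoke(
        FunctionName=mapper_lambda_name,
        InvocationType='RequestResponse',
        Payload=json.dumps({
            "bucket": bucket,
            "keys": batch,
            "jobBucket": job_bucket,
            "jobId": job_id,
            "mapperId": mapper_id
        }))
    out = json.loads(resp['Payload'].read())
    mapper_outputs.append(out)
    return out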
def handler(event, context):
    entry = time.time() * 1000
    logger = logging.getLogger()
    logger.setLevel(logging.WARN)

    if not event:  # requires arguments
        print('No event was passed to the handler, exiting...')
        return
    if 'mapper' not in event or 'reducer' not in event:
        print('No mapper or reducer function names given, unable to proceed, exiting...')
        return

    # create an S3 session
    if not context:  # calling from main
        boto3.setup_default_session(profile_name='cjk1')
    s3 = boto3.resource('s3')
    config = botocore.client.Config(connect_timeout=50, read_timeout=200)
    s3_client = boto3.client('s3', config=config)

    JOB_INFO = 'jobinfo.json'

    # 1. Get all keys to be processed
    # init
    endearly = 0
    if 'endearly' in event:
        endearly = int(event['endearly'])
    bucket = event["bucket"]
    dryrun = True if "dryrun" in event else False
    lambda_memory = 1536

    # Fetch all the keys that match the prefix
    all_keys = []
    for obj in s3.Bucket(bucket).objects.filter(Prefix=event["prefix"]).all():
        all_keys.append(obj)

    bsize = lambdautils.compute_batch_size(all_keys, lambda_memory)
    batches = lambdautils.batch_creator(all_keys, bsize)
    n_mappers = len(batches)
    if endearly > 0 and endearly < n_mappers:
        n_mappers = endearly
    print("Num. of Mappers (and Reducers) ", n_mappers)

    if dryrun:  # don't go any further
        delta = (time.time() * 1000) - entry
        me_str = 'TIMER:CALL:{}:dryrun:0'.format(delta)
        logger.warn(me_str)
        return me_str

    # process the remaining arguments
    job_id = event["job_id"]
    job_bucket = event["jobBucket"]
    region = event["region"]
    full_async = True if "full_async" in event else False  # "async" is a reserved word in Python 3.7+
    reducer_lambda_name = event["reducer"]
    mapper_lambda_name = event["mapper"]

    # Write Jobdata to S3
    j_key = job_id + "/jobdata"
    data = json.dumps({
        "mapCount": n_mappers,
        "totalS3Files": len(all_keys),
        "startTime": time.time()
    })
    write_to_s3(s3, job_bucket, j_key, data, {})

    data = json.dumps({
        "jobId": job_id,
        "jobBucket": job_bucket,
        "mapCount": n_mappers,
        "reducerFunction": reducer_lambda_name,
        "reducerHandler": "{}.handler".format(reducer_lambda_name)
    }, indent=4)
    j_key = job_id + "/jobinfo.json"
    write_to_s3(s3, job_bucket, j_key, data, {})

    ### Execute ###
    total_lambda_secs = 0
    reducer_lambda_time = 0
    mapper_outputs = []

    if full_async:  # asynchronous invocation of mappers
        for i in range(n_mappers):
            invoke_lambda(mapper_lambda_name, batches, bucket, job_bucket, job_id, i)
    else:  # synchronous invocation of mappers on parallel threads
        pool = ThreadPool(n_mappers)
        Ids = [i + 1 for i in range(n_mappers)]
        invoke_lambda_partial = partial(invoke_lambda_sync, mapper_lambda_name, batches,
                                        mapper_outputs, bucket, job_bucket, job_id)

        # Burst request handling
        mappers_executed = 0
        concurrent_lambdas = 100  # only used by the synchronous run (use --dryrun to see how many mappers are actually needed)
        while mappers_executed < n_mappers:
            nm = min(concurrent_lambdas, n_mappers)
            results = pool.map(invoke_lambda_partial, Ids[mappers_executed:mappers_executed + nm])
            mappers_executed += nm

        pool.close()
        pool.join()

        for output in mapper_outputs:
            if 'body' in output:
                total_lambda_secs += float(output['body'][2])
            else:
                total_lambda_secs += float(output[2])

    if not full_async:
        # Note: wait for the job to complete so that we can compute the total cost; poll every few seconds

        # Get all reducer keys
        reducer_keys = []

        # Total execution time for reducers
        while True:
            job_keys = s3_client.list_objects(Bucket=job_bucket, Prefix=job_id)["Contents"]
            keys = [jk["Key"] for jk in job_keys]
            total_s3_size = sum([jk["Size"] for jk in job_keys])
            logger.info("checking if job is done")

            # check job done
            if job_id + "/result" in keys:
                reducer_lambda_time += float(
                    s3.Object(job_bucket, job_id + "/result").metadata['processingtime'])
                for key in keys:
                    if "task/reducer" in key:
                        reducer_lambda_time += float(
                            s3.Object(job_bucket, key).metadata['processingtime'])
                        reducer_keys.append(key)
                break
            time.sleep(5)

    delta = (time.time() * 1000) - entry
    me_str = 'TIMER:CALL:{}:mappers:{}:reducer:{}'.format(delta, total_lambda_secs, reducer_lambda_time)
    logger.warn(me_str)
    return me_str