def schedule_same_pipeline_next_stage(stage_configuration, stage_id, shuffling_bucket, job_name, submission_time):
    cur_stage_config = stage_configuration[str(stage_id)]
    next_stage_config = stage_configuration[str(stage_id + 1)]
    invoking_lambda_name = next_stage_config["invoking_lambda_name"]
    next_stage_num_operators = next_stage_config["num_operators"]

    if cur_stage_config["stage_type"] == 1:
        keys_bins = get_map_shuffle_outputs(next_stage_num_operators, shuffling_bucket, job_name, stage_id)
    else:
        keys_bins = get_map_reduce_outputs(shuffling_bucket, job_name, [stage_id])

    # keys_bin_size = len(keys_bins[0])
    # for i in range(1, len(keys_bins)):
    #     assert keys_bin_size == len(keys_bins[i])

    if StaticVariables.OPTIMISATION_FN not in static_job_info \
            or not static_job_info[StaticVariables.OPTIMISATION_FN]:
        stage_progress_obj = stage_progress.StageProgress(
            in_lambda=True,
            is_local_testing=static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN])
        stage_progress_table_name = StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % (job_name, submission_time)
        total_num_jobs = sum([len(keys_bin) for keys_bin in keys_bins])
        stage_progress_obj.update_total_num_keys(stage_progress_table_name, stage_id + 1, total_num_jobs)

    if next_stage_config["stage_type"] == 1:
        for i in range(len(keys_bins)):
            response = lambda_client.invoke(
                FunctionName=invoking_lambda_name,
                InvocationType='Event',
                Payload=json.dumps({
                    "keys": keys_bins[i],
                    "id": i + 1,
                    "load_data_from_input": False,
                    "function_pickle_path": next_stage_config["function_pickle_path"],
                    "combiner_function_pickle_path": next_stage_config["combiner_function_pickle_path"],
                    "partition_function_pickle_path": next_stage_config["partition_function_pickle_path"]
                })
            )
    else:
        for i in range(len(keys_bins)):
            response = lambda_client.invoke(
                FunctionName=invoking_lambda_name,
                InvocationType='Event',
                Payload=json.dumps({
                    "keys": keys_bins[i],
                    "id": i + 1,
                    "load_data_from_input": False,
                    "function_pickle_path": next_stage_config["function_pickle_path"]
                })
            )

    logger.info("All operators finished in stage %s, next stage: number of operators scheduled: %s"
                % (stage_id, next_stage_num_operators))
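# Illustrative only (not part of the source): a sketch of the shape of one stage_configuration
# entry consumed above, inferred from the keys accessed in this module and from
# create_stage_config_file's call sites. All concrete values are made-up placeholders.
example_stage_configuration = {
    "2": {                                                  # stage ids are string keys
        "stage_type": 1,                                    # 1 => map-shuffle (needs combiner/partition pickles),
                                                            # 2 => map or reduce (inferred)
        "invoking_lambda_name": "wordcount-prefix-map_shuffle-2",
        "num_operators": 4,
        "function_pickle_path": "job/map_shuffle-2.pkl",
        "combiner_function_pickle_path": "job/combiner-2.pkl",
        "partition_function_pickle_path": "job/partition-2.pkl",
        "dependent_last_stage_ids": []
    }
}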
def get_stage_progress():
    job_name = request.args.get('job-name')
    submission_time = request.args.get('submission-time')
    logger.info("WebUI: Received request for path /stage-progress with parameters: %s, %s"
                % (job_name, submission_time))
    is_local_testing = os.environ.get("local_testing") in ('True', 'true')
    stage_progress_obj = stage_progress.StageProgress(in_lambda=False, is_local_testing=is_local_testing)
    stages_progress = stage_progress_obj.read_progress_table(
        StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % (job_name, submission_time))
    return jsonify(stages_progress)
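# Illustrative only (not part of the source): a minimal sketch of how a client might poll the
# /stage-progress endpoint defined above. The host, port, and parameter values are assumptions;
# only the query parameter names ('job-name', 'submission-time') come from the handler itself.
import requests

response = requests.get(
    "http://localhost:5000/stage-progress",
    params={"job-name": "wordcount", "submission-time": "2020-01-01-00-00-00"},
)
print(response.json())  # the per-stage progress counters read from the DynamoDB progress table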
def _invoke_pipelines(self, invoking_pipelines_info):
    job_name = self.static_job_info[StaticVariables.JOB_NAME_FN]
    if StaticVariables.OPTIMISATION_FN not in self.static_job_info \
            or not self.static_job_info[StaticVariables.OPTIMISATION_FN]:
        stage_progress_obj = stage_progress.StageProgress(
            in_lambda=self.is_serverless,
            is_local_testing=self.static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN])
        stage_progress_table_name = StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % (job_name,
                                                                                          self.submission_time)

    for pipeline_id, invoking_pipeline_info in invoking_pipelines_info.items():
        logger.info("Scheduling pipeline %s" % pipeline_id)
        # invoking_pipeline_info layout: [all_keys, num_operators, batches, first_function, stage_id]
        num_mappers = invoking_pipeline_info[1]
        batches = invoking_pipeline_info[2]
        first_function = invoking_pipeline_info[3]
        stage_id = invoking_pipeline_info[4]
        concurrent_lambdas = self.config[StaticVariables.NUM_CONCURRENT_LAMBDAS_FN] \
            if StaticVariables.NUM_CONCURRENT_LAMBDAS_FN in self.config \
            else StaticVariables.DEFAULT_NUM_CONCURRENT_LAMBDAS

        if StaticVariables.OPTIMISATION_FN not in self.static_job_info \
                or not self.static_job_info[StaticVariables.OPTIMISATION_FN]:
            total_num_jobs = sum([len(batch) for batch in batches])
            stage_progress_obj.update_total_num_keys(stage_progress_table_name, stage_id, total_num_jobs)

        # Exec Parallel
        logger.info("Number of Mappers: %s" % num_mappers)
        pool = ThreadPool(num_mappers)
        ids = [i + 1 for i in range(num_mappers)]
        invoke_lambda_partial = partial(self.invoke_lambda, batches, first_function, stage_id)

        # Burst request handling
        mappers_executed = 0
        while mappers_executed < num_mappers:
            nm = min(concurrent_lambdas, num_mappers)
            results = pool.map(invoke_lambda_partial, ids[mappers_executed:mappers_executed + nm])
            mappers_executed += nm

        pool.close()
        pool.join()
        logger.info("Pipeline %s scheduled successfully" % pipeline_id)
def lambda_handler(event, _): logger.info("**************Map****************") start_time = time.time() io_time = 0 src_keys = event['keys'] load_data_from_input = event['load_data_from_input'] mapper_id = event['id'] map_function_pickle_path = event['function_pickle_path'] with open(map_function_pickle_path, 'rb') as f: map_function = pickle.load(f) # create an S3 session if static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN]: s3_client = boto3.client('s3', aws_access_key_id='', aws_secret_access_key='', region_name=StaticVariables.DEFAULT_REGION, endpoint_url='http://%s:4572' % os.environ['LOCALSTACK_HOSTNAME']) lambda_client = boto3.client( 'lambda', aws_access_key_id='', aws_secret_access_key='', region_name=StaticVariables.DEFAULT_REGION, endpoint_url='http://%s:4574' % os.environ['LOCALSTACK_HOSTNAME']) else: s3_client = boto3.client('s3') lambda_client = boto3.client('lambda') shuffling_bucket = static_job_info[StaticVariables.SHUFFLING_BUCKET_FN] job_name = static_job_info[StaticVariables.JOB_NAME_FN] stage_id = int(os.environ.get("stage_id")) total_num_stages = int(os.environ.get("total_num_stages")) coordinator_lambda_name = os.environ.get("coordinator_lambda_name") submission_time = os.environ.get("submission_time") logger.info("Stage: %s" % stage_id) if StaticVariables.OPTIMISATION_FN not in static_job_info \ or not static_job_info[StaticVariables.OPTIMISATION_FN]: stage_progress_obj = stage_progress.StageProgress( in_lambda=True, is_local_testing=static_job_info[ StaticVariables.LOCAL_TESTING_FLAG_FN]) stage_progress_table_name = StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % ( job_name, submission_time) # aggr line_count = 0 # INPUT CSV => OUTPUT JSON begin_time = time.time() interval_time = random.randint(1, 3) interval_num_keys_processed = 0 outputs = [] start_overhead = time.time() - start_time logger.info("Start overhead: %s" % str(start_overhead)) if load_data_from_input: cur_input_handler = input_handler.get_input_handler( static_job_info[StaticVariables.INPUT_SOURCE_TYPE_FN], static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN], in_lambda=True) input_source = static_job_info[StaticVariables.INPUT_SOURCE_FN] for input_key in src_keys: io_start_time = time.time() input_value = cur_input_handler.read_value(input_source, input_key, static_job_info) io_time += time.time() - io_start_time input_pair = (input_key, input_value) map_function(outputs, input_pair) # TODO: Line count can be used to verify correctness of the job. Can be removed if needed in the future. 
if StaticVariables.OPTIMISATION_FN not in static_job_info \ or not static_job_info[StaticVariables.OPTIMISATION_FN]: if static_job_info[ StaticVariables.INPUT_SOURCE_TYPE_FN] == "s3": line_count += len(input_value.split('\n')) - 1 elif static_job_info[ StaticVariables.INPUT_SOURCE_TYPE_FN] == "dynamodb": line_count += 1 interval_num_keys_processed += 1 current_time = time.time() if int(current_time - begin_time) > interval_time: begin_time = current_time interval_time = random.randint(1, 3) stage_progress_obj.increase_num_processed_keys( stage_progress_table_name, stage_id, interval_num_keys_processed) interval_num_keys_processed = 0 else: for input_key in src_keys: io_start_time = time.time() response = s3_client.get_object(Bucket=shuffling_bucket, Key=input_key) contents = response['Body'].read() input_value = json.loads(contents) io_time += time.time() - io_start_time input_pair = (input_key, input_value) map_function(outputs, input_pair) if StaticVariables.OPTIMISATION_FN not in static_job_info \ or not static_job_info[StaticVariables.OPTIMISATION_FN]: line_count += len(input_value) interval_num_keys_processed += 1 current_time = time.time() if int(current_time - begin_time) > interval_time: begin_time = current_time interval_time = random.randint(1, 3) stage_progress_obj.increase_num_processed_keys( stage_progress_table_name, stage_id, interval_num_keys_processed) interval_num_keys_processed = 0 if StaticVariables.OPTIMISATION_FN not in static_job_info \ or not static_job_info[StaticVariables.OPTIMISATION_FN]: stage_progress_obj.increase_num_processed_keys( stage_progress_table_name, stage_id, interval_num_keys_processed) # timeTaken = time_in_secs * 1000000000 # in 10^9 # s3DownloadTime = 0 # totalProcessingTime = 0 logger.info("Map sample outputs: %s" % str(outputs[0:10])) if stage_id == total_num_stages: cur_output_handler = output_handler.get_output_handler( static_job_info[StaticVariables.OUTPUT_SOURCE_TYPE_FN], static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN], in_lambda=True) # cur_output_handler.write_output(mapper_id, outputs, metadata, submission_time, static_job_info) io_start_time = time.time() cur_output_handler.write_output(mapper_id, outputs, {}, static_job_info, submission_time) io_time += time.time() - io_start_time else: mapper_filename = "%s/%s-%s/%s" % ( job_name, StaticVariables.OUTPUT_PREFIX, stage_id, mapper_id) # s3_client.put_object(Bucket=shuffling_bucket, Key=mapper_filename, # Body=json.dumps(outputs), Metadata=metadata) io_start_time = time.time() s3_client.put_object(Bucket=shuffling_bucket, Key=mapper_filename, Body=json.dumps(outputs)) io_time += time.time() - io_start_time lambda_client.invoke(FunctionName=coordinator_lambda_name, InvocationType='Event', Payload=json.dumps({'stage_id': stage_id})) execution_time = time.time() - start_time metadata = { "lineCount": '%s' % line_count, "processingTime": '%s' % execution_time, "memoryUsage": '%s' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss, "numKeys": '%s' % len(src_keys), "ioTime": '%s' % io_time, "computeTime": '%s' % str(execution_time - io_time) } info_write_start_time = time.time() metrics_bucket = StaticVariables.METRICS_BUCKET % job_name execution_info_s3_key = "%s/stage-%s/%s" % (job_name, stage_id, mapper_id) s3_client.put_object(Bucket=metrics_bucket, Key=execution_info_s3_key, Body=json.dumps({}), Metadata=metadata) logger.info("Info write time: %s" % str(time.time() - info_write_start_time)) logger.info("Mapper %s finishes execution" % str(mapper_id)) logger.info("Execution time: 
%s" % str(time.time() - start_time))
def schedule_different_pipeline_next_stage(is_serverless_driver, stage_configuration, cur_pipeline_id,
                                           shuffling_bucket, job_name, submission_time):
    if not is_serverless_driver:
        with open(StaticVariables.PIPELINE_DEPENDENCIES_PATH) as json_file:
            adj_list = json.load(json_file)
    else:
        response = s3_client.get_object(Bucket=shuffling_bucket,
                                        Key=StaticVariables.PIPELINE_DEPENDENCIES_PATH)
        contents = response['Body'].read()
        adj_list = json.loads(contents)

    if not is_serverless_driver:
        with open(StaticVariables.PIPELINE_TO_FIRST_LAST_STAGE_PATH) as json_file:
            pipeline_first_last_stage_ids = json.load(json_file)
    else:
        response = s3_client.get_object(Bucket=shuffling_bucket,
                                        Key=StaticVariables.PIPELINE_TO_FIRST_LAST_STAGE_PATH)
        contents = response['Body'].read()
        pipeline_first_last_stage_ids = json.loads(contents)

    in_degree_obj = in_degree.InDegree(
        in_lambda=True,
        is_local_testing=static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN])
    stage_progress_obj = stage_progress.StageProgress(
        in_lambda=True,
        is_local_testing=static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN])
    stage_progress_table_name = StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % (job_name, submission_time)

    for dependent_pipeline_id in adj_list[str(cur_pipeline_id)]:
        response = in_degree_obj.decrement_in_degree_table(
            StaticVariables.IN_DEGREE_DYNAMODB_TABLE_NAME % (job_name, submission_time),
            dependent_pipeline_id)
        dependent_in_degree = int(response["Attributes"]["in_degree"]["N"])
        if dependent_in_degree == 0:
            next_pipeline_first_stage_id = pipeline_first_last_stage_ids[str(dependent_pipeline_id)][0]
            next_stage_config = stage_configuration[str(next_pipeline_first_stage_id)]
            invoking_lambda_name = next_stage_config["invoking_lambda_name"]
            dependent_stage_ids = next_stage_config["dependent_last_stage_ids"]

            # The last stage of a pipeline is assumed to always be either a map or a reduce.
            keys_bins = get_map_reduce_outputs(shuffling_bucket, job_name, dependent_stage_ids)
            total_num_jobs = sum([len(keys_bin) for keys_bin in keys_bins])
            stage_progress_obj.update_total_num_keys(stage_progress_table_name,
                                                     next_pipeline_first_stage_id, total_num_jobs)

            if next_stage_config["stage_type"] == 1:
                for i in range(len(keys_bins)):
                    response = lambda_client.invoke(
                        FunctionName=invoking_lambda_name,
                        InvocationType='Event',
                        Payload=json.dumps({
                            "keys": keys_bins[i],
                            "id": i + 1,
                            "load_data_from_input": False,
                            "function_pickle_path": next_stage_config["function_pickle_path"],
                            "combiner_function_pickle_path": next_stage_config["combiner_function_pickle_path"],
                            "partition_function_pickle_path": next_stage_config["partition_function_pickle_path"]
                        }))
            else:
                for i in range(len(keys_bins)):
                    response = lambda_client.invoke(
                        FunctionName=invoking_lambda_name,
                        InvocationType='Event',
                        Payload=json.dumps({
                            "keys": keys_bins[i],
                            "id": i + 1,
                            "load_data_from_input": False,
                            "function_pickle_path": next_stage_config["function_pickle_path"]
                        }))

            logger.info("All operators finished in pipeline %s, next pipeline: number of operators scheduled: %s"
                        % (cur_pipeline_id, len(keys_bins)))
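# Illustrative only (not part of the source): sketches of the two JSON documents read above,
# inferred from how they are indexed here and built in _create_lambdas. Pipeline ids and stage
# ids are placeholders; pipeline 2 is assumed to depend on pipeline 1.
example_adj_list = {
    "1": [2]          # pipelines that depend on pipeline 1
}
example_pipeline_first_last_stage_ids = {
    "1": [1, 2],      # [first stage id, last stage id] of pipeline 1
    "2": [3, 4]
}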
def lambda_handler(event, _): logger.info("**************Map-Shuffle****************") start_time = time.time() io_time = 0 src_keys = event['keys'] mapper_id = event['id'] load_data_from_input = event['load_data_from_input'] map_function_pickle_path = event['function_pickle_path'] combiner_function_pickle_path = event['combiner_function_pickle_path'] partition_function_pickle_path = event['partition_function_pickle_path'] with open(map_function_pickle_path, 'rb') as f: map_function = pickle.load(f) with open(combiner_function_pickle_path, 'rb') as f: combiner_function = pickle.load(f) with open(partition_function_pickle_path, 'rb') as f: partition_function = pickle.load(f) # create an S3 session if static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN]: s3_client = boto3.client('s3', aws_access_key_id='', aws_secret_access_key='', region_name=StaticVariables.DEFAULT_REGION, endpoint_url='http://%s:4572' % os.environ['LOCALSTACK_HOSTNAME']) lambda_client = boto3.client('lambda', aws_access_key_id='', aws_secret_access_key='', region_name=StaticVariables.DEFAULT_REGION, endpoint_url='http://%s:4574' % os.environ['LOCALSTACK_HOSTNAME']) else: s3_client = boto3.client('s3') lambda_client = boto3.client('lambda') shuffling_bucket = static_job_info[StaticVariables.SHUFFLING_BUCKET_FN] job_name = static_job_info[StaticVariables.JOB_NAME_FN] use_combine = static_job_info[StaticVariables.USE_COMBINE_FLAG_FN] stage_id = int(os.environ.get("stage_id")) num_bins = int(os.environ.get("num_reducers")) coordinator_lambda_name = os.environ.get("coordinator_lambda_name") submission_time = os.environ.get("submission_time") logger.info("Stage: %s" % stage_id) if StaticVariables.OPTIMISATION_FN not in static_job_info \ or not static_job_info[StaticVariables.OPTIMISATION_FN]: stage_progress_obj = stage_progress.StageProgress(in_lambda=True, is_local_testing=static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN]) stage_progress_table_name = StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % (job_name, submission_time) # aggr line_count = 0 err = '' # INPUT CSV => OUTPUT JSON begin_time = time.time() interval_time = random.randint(1, 3) interval_num_keys_processed = 0 intermediate_data = [] if load_data_from_input: cur_input_handler = input_handler.get_input_handler(static_job_info[StaticVariables.INPUT_SOURCE_TYPE_FN], static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN], in_lambda=True) input_source = static_job_info[StaticVariables.INPUT_SOURCE_FN] for input_key in src_keys: io_start_time = time.time() input_value = cur_input_handler.read_value(input_source, input_key, static_job_info) io_time += time.time() - io_start_time input_pair = (input_key, input_value) # logger.info("Before calling map function") # logger.info("The input key: %s" % input_key) map_function(intermediate_data, input_pair) # logger.info("After calling map function") if StaticVariables.OPTIMISATION_FN not in static_job_info \ or not static_job_info[StaticVariables.OPTIMISATION_FN]: # TODO: Line count can be used to verify correctness of the job. Can be removed if needed in the future. 
if static_job_info[StaticVariables.INPUT_SOURCE_TYPE_FN] == "s3": line_count += len(input_value.split('\n')) - 1 elif static_job_info[StaticVariables.INPUT_SOURCE_TYPE_FN] == "dynamodb": line_count += 1 interval_num_keys_processed += 1 current_time = time.time() if int(current_time - begin_time) > interval_time: begin_time = current_time interval_time = random.randint(1, 3) stage_progress_obj.increase_num_processed_keys(stage_progress_table_name, stage_id, interval_num_keys_processed) interval_num_keys_processed = 0 else: for input_key in src_keys: io_start_time = time.time() response = s3_client.get_object(Bucket=shuffling_bucket, Key=input_key) contents = response['Body'].read() input_value = json.loads(contents) io_time += time.time() - io_start_time input_pair = (input_key, input_value) map_function(intermediate_data, input_pair) if StaticVariables.OPTIMISATION_FN not in static_job_info \ or not static_job_info[StaticVariables.OPTIMISATION_FN]: line_count += len(input_value) interval_num_keys_processed += 1 current_time = time.time() if int(current_time - begin_time) > interval_time: begin_time = current_time interval_time = random.randint(1, 3) stage_progress_obj.increase_num_processed_keys(stage_progress_table_name, stage_id, interval_num_keys_processed) interval_num_keys_processed = 0 if StaticVariables.OPTIMISATION_FN not in static_job_info \ or not static_job_info[StaticVariables.OPTIMISATION_FN]: stage_progress_obj.increase_num_processed_keys(stage_progress_table_name, stage_id, interval_num_keys_processed) if use_combine: intermediate_data.sort(key=lambda x: x[0]) cur_key = None cur_values = [] outputs = [] for input_key, value in intermediate_data: if cur_key == input_key: cur_values.append(value) else: if cur_key is not None: combiner_function(outputs, (cur_key, cur_values)) cur_key = input_key cur_values = [value] if cur_key is not None: combiner_function(outputs, (cur_key, cur_values)) else: outputs = intermediate_data # timeTaken = time_in_secs * 1000000000 # in 10^9 # s3DownloadTime = 0 # totalProcessingTime = 0 # Partition ids are from 1 to n (inclusive). 
output_partitions = [[] for _ in range(num_bins + 1)] logger.info("MapShuffle sample outputs: %s" % str(outputs[0:10])) for input_key, value in outputs: partition_id = partition_function(input_key, num_bins) + 1 cur_partition = output_partitions[partition_id] cur_partition.append(tuple((input_key, value))) for i in range(1, num_bins + 1): partition_id = "bin-%s" % i mapper_filename = "%s/%s-%s/%s/%s" % (job_name, StaticVariables.OUTPUT_PREFIX, stage_id, partition_id, mapper_id) io_start_time = time.time() s3_client.put_object(Bucket=shuffling_bucket, Key=mapper_filename, Body=json.dumps(output_partitions[i])) io_time += time.time() - io_start_time lambda_client.invoke( FunctionName=coordinator_lambda_name, InvocationType='Event', Payload=json.dumps({ 'stage_id': stage_id }) ) time_in_secs = time.time() - start_time metadata = { "lineCount": '%s' % line_count, "processingTime": '%s' % time_in_secs, "memoryUsage": '%s' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss, "numKeys": '%s' % len(src_keys), "ioTime": '%s' % io_time, "computeTime": '%s' % str(time_in_secs - io_time) } info_write_start_time = time.time() metrics_bucket = StaticVariables.METRICS_BUCKET % job_name execution_info_s3_key = "%s/stage-%s/%s" % (job_name, stage_id, mapper_id) s3_client.put_object(Bucket=metrics_bucket, Key=execution_info_s3_key, Body=json.dumps({}), Metadata=metadata) logger.info("Info write time: %s" % str(time.time() - info_write_start_time)) logger.info("MapShuffler %s finishes execution" % str(mapper_id)) logger.info("Execution time: %s" % str(time.time() - start_time))
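# Illustrative only (not part of the source): user-supplied combiner and partition functions
# matching the calling conventions used by the handler above. combiner_function(outputs, (key, values))
# folds the grouped values of one key into `outputs`; partition_function(key, num_bins) returns an
# integer in [0, num_bins). Word-count-style aggregation is assumed for the example.
import hashlib


def word_count_combiner_function(outputs, key_values):
    key, values = key_values
    outputs.append((key, sum(values)))


def stable_hash_partition_function(key, num_bins):
    # A stable digest is used instead of the built-in hash(), which is salted per process in
    # Python 3 and would route the same key to different bins on different mapper Lambdas.
    digest = hashlib.md5(str(key).encode('utf-8')).hexdigest()
    return int(digest, 16) % num_bins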
def lambda_handler(event, _):
    print("**************Reduce****************")

    start_time = time.time()

    reduce_keys = event['keys']
    reducer_id = event['id']
    reduce_function_pickle_path = event['function_pickle_path']

    with open(reduce_function_pickle_path, 'rb') as f:
        reduce_function = pickle.load(f)

    # create S3 and Lambda clients (LocalStack endpoints when local testing)
    static_job_info = json.loads(open(StaticVariables.STATIC_JOB_INFO_PATH, 'r').read())
    if static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN]:
        s3_client = boto3.client('s3', aws_access_key_id='', aws_secret_access_key='',
                                 region_name=StaticVariables.DEFAULT_REGION,
                                 endpoint_url='http://%s:4572' % os.environ['LOCALSTACK_HOSTNAME'])
        lambda_client = boto3.client('lambda', aws_access_key_id='', aws_secret_access_key='',
                                     region_name=StaticVariables.DEFAULT_REGION,
                                     endpoint_url='http://%s:4574' % os.environ['LOCALSTACK_HOSTNAME'])
    else:
        s3_client = boto3.client('s3')
        lambda_client = boto3.client('lambda')

    shuffling_bucket = static_job_info[StaticVariables.SHUFFLING_BUCKET_FN]
    # use_combine = static_job_info[StaticVariables.USE_COMBINE_FLAG_FN]
    job_name = static_job_info[StaticVariables.JOB_NAME_FN]

    stage_id = int(os.environ.get("stage_id"))
    total_num_stages = int(os.environ.get("total_num_stages"))
    coordinator_lambda_name = os.environ.get("coordinator_lambda_name")
    submission_time = os.environ.get("submission_time")

    print("Stage:", stage_id)

    stage_progress_obj = stage_progress.StageProgress(
        in_lambda=True,
        is_local_testing=static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN])
    stage_progress_table_name = StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % (job_name, submission_time)

    # aggr
    line_count = 0
    intermediate_data = []

    # INPUT JSON => OUTPUT JSON

    # Download and process all keys
    for key in reduce_keys:
        response = s3_client.get_object(Bucket=shuffling_bucket, Key=key)
        contents = response['Body'].read()

        for key_value in json.loads(contents):
            line_count += 1
            intermediate_data.append(key_value)

    intermediate_data.sort(key=lambda x: x[0])

    begin_time = time.time()
    interval_time = random.randint(60, 180)
    interval_num_keys_processed = 0
    average_num_keys = float(len(intermediate_data) / len(reduce_keys))

    cur_key = None
    cur_values = []
    outputs = []
    for key, value in intermediate_data:
        if cur_key == key:
            cur_values.append(value)
        else:
            if cur_key is not None:
                cur_key_outputs = []
                reduce_function(cur_key_outputs, (cur_key, cur_values))
                outputs += cur_key_outputs

            cur_key = key
            cur_values = [value]

        interval_num_keys_processed += 1
        current_time = time.time()
        if int(current_time - begin_time) > interval_time:
            begin_time = current_time
            interval_time = random.randint(1, 3)
            interval_num_files_processed = int(interval_num_keys_processed / average_num_keys)
            stage_progress_obj.increase_num_processed_keys(stage_progress_table_name, stage_id,
                                                           interval_num_files_processed)
            interval_num_keys_processed = interval_num_keys_processed % average_num_keys

    if cur_key is not None:
        cur_key_outputs = []
        reduce_function(cur_key_outputs, (cur_key, cur_values))
        outputs += cur_key_outputs

    interval_num_files_processed = int(interval_num_keys_processed / average_num_keys)
    stage_progress_obj.increase_num_processed_keys(stage_progress_table_name, stage_id,
                                                   interval_num_files_processed)

    time_in_secs = time.time() - start_time
    # timeTaken = time_in_secs * 1000000000  # in 10^9
    # s3DownloadTime = 0
    # totalProcessingTime = 0
    metadata = {
        "lineCount": '%s' % line_count,
        "processingTime": '%s' % time_in_secs,
        "memoryUsage": '%s' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss,
        "numKeys": '%s' % len(reduce_keys)
    }

    if stage_id == total_num_stages:
        cur_output_handler = output_handler.get_output_handler(
            static_job_info[StaticVariables.OUTPUT_SOURCE_TYPE_FN],
            static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN],
            in_lambda=True)
        cur_output_handler.write_output(reducer_id, outputs, metadata, static_job_info, submission_time)
    else:
        mapper_filename = "%s/%s-%s/%s" % (job_name, StaticVariables.OUTPUT_PREFIX, stage_id, reducer_id)
        s3_client.put_object(Bucket=shuffling_bucket, Key=mapper_filename,
                             Body=json.dumps(outputs), Metadata=metadata)

    lambda_client.invoke(FunctionName=coordinator_lambda_name, InvocationType='Event',
                         Payload=json.dumps({'stage_id': stage_id}))
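# Illustrative only (not part of the source): a minimal user-defined reduce function matching the
# calling convention used above, i.e. reduce_function(outputs, (key, values)) appends result pairs
# for one key and its grouped values. Word count is assumed as the example.
def word_count_reduce_function(outputs, key_values):
    key, values = key_values
    outputs.append((key, sum(values)))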
def _create_lambdas(self):
    job_name = self.static_job_info[StaticVariables.JOB_NAME_FN]
    lambda_name_prefix = self.static_job_info[StaticVariables.LAMBDA_NAME_PREFIX_FN] \
        if StaticVariables.LAMBDA_NAME_PREFIX_FN in self.static_job_info \
        else StaticVariables.DEFAULT_LAMBDA_NAME_PREFIX
    shuffling_bucket = self.static_job_info[StaticVariables.SHUFFLING_BUCKET_FN]
    region = self.config[StaticVariables.REGION_FN] \
        if StaticVariables.REGION_FN in self.config else StaticVariables.DEFAULT_REGION

    stage_id = 1
    num_operators = 0
    function_lambdas = []
    stage_config = {}
    mapping_stage_id_pipeline_id = {}
    adj_list = defaultdict(list)
    self.in_degrees = {}
    invoking_pipelines_info = {}
    pipelines_last_stage_num_operators = {}
    pipelines_first_last_stage_ids = {}
    stage_type_of_operations = {}
    cur_coordinator_lambda_name = "%s-%s-%s" % (job_name, lambda_name_prefix, "coordinator")

    # The first function of a pipeline should be a map/map_shuffle function.
    for pipeline_id, pipeline in self.pipelines.items():
        functions = pipeline.get_functions()
        pipeline_static_job_info = self._overwrite_existing_job_info(pipeline.get_config())
        # TODO: Is the next line correct?
        self.static_job_info = pipeline_static_job_info
        dependent_pipeline_ids = pipeline.get_dependent_pipeline_ids()
        for dependent_pipeline_id in dependent_pipeline_ids:
            adj_list[dependent_pipeline_id].append(pipeline_id)
            self.in_degrees[pipeline_id] = self.in_degrees.get(pipeline_id, 0) + 1

        if len(dependent_pipeline_ids) == 0:
            if not self.is_serverless:
                set_up_local_input_data(pipeline_static_job_info)
            all_keys, num_operators, batches = self._get_all_keys(pipeline_static_job_info)
            first_function = functions[0]
            invoking_pipelines_info[pipeline_id] = [all_keys, num_operators, batches, first_function, stage_id]
        else:
            num_operators = 0
            for dependent_pipeline_id in dependent_pipeline_ids:
                num_operators += pipelines_last_stage_num_operators[dependent_pipeline_id]

        pipelines_first_last_stage_ids[pipeline_id] = [stage_id]

        for i in range(len(functions)):
            mapping_stage_id_pipeline_id[stage_id] = pipeline_id
            cur_function = functions[i]
            cur_function_zip_path = "%s-%s.zip" % (cur_function.get_string(), stage_id)
            stage_type_of_operations[stage_id] = cur_function.get_string()

            # Prepare Lambda functions if driver running in local machine
            if not self.is_serverless:
                pickle_functions_and_zip_stage(cur_function_zip_path, cur_function, stage_id)

            cur_function_lambda_name = "%s-%s-%s-%s" % (job_name, lambda_name_prefix,
                                                        cur_function.get_string(), stage_id)
            cur_function_lambda = lambda_manager.LambdaManager(self.lambda_client, self.s3_client, region,
                                                               cur_function_zip_path, job_name,
                                                               cur_function_lambda_name,
                                                               cur_function.get_handler_function_path())
            if isinstance(cur_function, MapShuffleFunction):
                assert i + 1 < len(functions) and isinstance(functions[i + 1], ReduceFunction)
                cur_function_lambda.update_code_or_create_on_no_exist(
                    self.total_num_functions,
                    submission_time=self.submission_time,
                    coordinator_lambda_name=cur_coordinator_lambda_name,
                    stage_id=stage_id,
                    num_reducers=functions[i + 1].get_num_reducers())
            else:
                cur_function_lambda.update_code_or_create_on_no_exist(
                    self.total_num_functions,
                    submission_time=self.submission_time,
                    coordinator_lambda_name=cur_coordinator_lambda_name,
                    stage_id=stage_id)
            function_lambdas.append(cur_function_lambda)

            # Coordinator
            cur_function_pickle_path = 'job/%s-%s.pkl' % (cur_function.get_string(), stage_id)
            dependent_last_stage_ids = []
            for dependent_pipeline_id in dependent_pipeline_ids:
                dependent_last_stage_ids.append(pipelines_first_last_stage_ids[dependent_pipeline_id][1])

            if isinstance(cur_function, MapShuffleFunction):
                partition_function_pickle_path = 'job/%s-%s.pkl' % ("partition", stage_id)
                combiner_function_pickle_path = 'job/%s-%s.pkl' % ("combiner", stage_id)
                stage_config[stage_id] = \
                    create_stage_config_file(num_operators, 1, cur_function_lambda_name,
                                             cur_function_pickle_path, dependent_last_stage_ids,
                                             partition_function_pickle_path,
                                             combiner_function_pickle_path)
            else:
                if isinstance(cur_function, ReduceFunction):
                    num_operators = cur_function.get_num_reducers()

                stage_config[stage_id] = \
                    create_stage_config_file(num_operators, 2, cur_function_lambda_name,
                                             cur_function_pickle_path, dependent_last_stage_ids)

            stage_id += 1

        pipelines_first_last_stage_ids[pipeline_id].append(stage_id - 1)
        pipelines_last_stage_num_operators[pipeline_id] = num_operators

    coordinator_zip_path = StaticVariables.COORDINATOR_ZIP_PATH
    if not self.is_serverless:
        self._write_config_to_local(adj_list, mapping_stage_id_pipeline_id,
                                    pipelines_first_last_stage_ids, stage_config)
        zip.zip_lambda([StaticVariables.COORDINATOR_HANDLER_PATH], coordinator_zip_path)
    else:
        self._write_config_to_s3(adj_list, mapping_stage_id_pipeline_id,
                                 pipelines_first_last_stage_ids, stage_config, shuffling_bucket)

    # Web UI information
    if StaticVariables.OPTIMISATION_FN not in self.static_job_info \
            or not self.static_job_info[StaticVariables.OPTIMISATION_FN]:
        dag_information = construct_dag_information(adj_list, mapping_stage_id_pipeline_id,
                                                    pipelines_first_last_stage_ids, stage_type_of_operations)
        populate_static_job_info(self.static_job_info, len(pipelines_first_last_stage_ids),
                                 len(stage_type_of_operations), self.submission_time)
        self._write_web_ui_info(dag_information, stage_config, self.static_job_info,
                                StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME, job_name)

    cur_coordinator_lambda = lambda_manager.LambdaManager(self.lambda_client, self.s3_client, region,
                                                          coordinator_zip_path, job_name,
                                                          cur_coordinator_lambda_name,
                                                          StaticVariables.COORDINATOR_HANDLER_FUNCTION_PATH)
    cur_coordinator_lambda.update_code_or_create_on_no_exist(self.total_num_functions,
                                                             submission_time=self.submission_time)
    # cur_coordinator_lambda.add_lambda_permission(random.randint(1, 1000), shuffling_bucket)
    # shuffling_s3_path_prefix = "%s/" % job_name
    # cur_coordinator_lambda.create_s3_event_source_notification(shuffling_bucket, shuffling_s3_path_prefix)
    # time.sleep(1)
    function_lambdas.append(cur_coordinator_lambda)

    if len(self.pipelines) > 1:
        in_degree_obj = in_degree.InDegree(
            in_lambda=self.is_serverless,
            is_local_testing=self.static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN])
        in_degree_table_name = StaticVariables.IN_DEGREE_DYNAMODB_TABLE_NAME % (job_name, self.submission_time)
        in_degree_obj.delete_in_degree_table(in_degree_table_name)
        in_degree_obj.create_in_degree_table(in_degree_table_name)
        in_degree_obj.initialise_in_degree_table(in_degree_table_name, self.in_degrees)

    if StaticVariables.OPTIMISATION_FN not in self.static_job_info \
            or not self.static_job_info[StaticVariables.OPTIMISATION_FN]:
        stage_progress_obj = stage_progress.StageProgress(
            in_lambda=self.is_serverless,
            is_local_testing=self.static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN])
        stage_progress_table_name = StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % (job_name,
                                                                                          self.submission_time)
        stage_progress_obj.delete_progress_table(stage_progress_table_name)
        stage_progress_obj.create_progress_table(stage_progress_table_name)
        stage_progress_obj.initialise_progress_table(stage_progress_table_name, stage_id - 1)

    if not self.is_serverless:
        delete_files(glob.glob(StaticVariables.LAMBDA_ZIP_GLOB_PATH))
        delete_files(glob.glob(StaticVariables.FUNCTIONS_PICKLE_GLOB_PATH))

    return function_lambdas, invoking_pipelines_info, num_operators
def lambda_handler(event, _): logger.info("**************Reduce****************") start_time = time.time() io_time = 0 reduce_keys = event['keys'] reducer_id = event['id'] reduce_function_pickle_path = event['function_pickle_path'] with open(reduce_function_pickle_path, 'rb') as f: reduce_function = pickle.load(f) # create an S3 & Dynamo session if static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN]: s3_client = boto3.client('s3', aws_access_key_id='', aws_secret_access_key='', region_name=StaticVariables.DEFAULT_REGION, endpoint_url='http://%s:4572' % os.environ['LOCALSTACK_HOSTNAME']) lambda_client = boto3.client( 'lambda', aws_access_key_id='', aws_secret_access_key='', region_name=StaticVariables.DEFAULT_REGION, endpoint_url='http://%s:4574' % os.environ['LOCALSTACK_HOSTNAME']) else: s3_client = boto3.client('s3') lambda_client = boto3.client('lambda') shuffling_bucket = static_job_info[StaticVariables.SHUFFLING_BUCKET_FN] # use_combine = static_job_info[StaticVariables.USE_COMBINE_FLAG_FN] job_name = static_job_info[StaticVariables.JOB_NAME_FN] stage_id = int(os.environ.get("stage_id")) total_num_stages = int(os.environ.get("total_num_stages")) coordinator_lambda_name = os.environ.get("coordinator_lambda_name") submission_time = os.environ.get("submission_time") logger.info("Stage: %s" % str(stage_id)) logger.info("Reducer id: %s" % str(reducer_id)) if StaticVariables.OPTIMISATION_FN not in static_job_info \ or not static_job_info[StaticVariables.OPTIMISATION_FN]: stage_progress_obj = stage_progress.StageProgress( in_lambda=True, is_local_testing=static_job_info[ StaticVariables.LOCAL_TESTING_FLAG_FN]) stage_progress_table_name = StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % ( job_name, submission_time) # aggr line_count = 0 intermediate_data = [] retry_reduce_keys = [] # INPUT JSON => OUTPUT JSON # Download and process all keys for key in reduce_keys: try: io_start_time = time.time() response = s3_client.get_object(Bucket=shuffling_bucket, Key=key) contents = response['Body'].read() io_time += time.time() - io_start_time for key_value in json.loads(contents): line_count += 1 intermediate_data.append(key_value) except Exception as e: logger.info("Key: %s" % key) logger.info("First time Error: %s" % str(e)) retry_reduce_keys.append(key) # time.sleep(1) # second_retry_reduce_keys = [] # for key in retry_reduce_keys: # try: # io_start_time = time.time() # response = s3_client.get_object(Bucket=shuffling_bucket, Key=key) # contents = response['Body'].read() # io_time += time.time() - io_start_time # # for key_value in json.loads(contents): # line_count += 1 # intermediate_data.append(key_value) # except Exception as e: # logger.info("Key: %s" % key) # logger.info("Second time Error: %s" % str(e)) # second_retry_reduce_keys.append(key) # time.sleep(2) # for key in second_retry_reduce_keys: # try: # io_start_time = time.time() # response = s3_client.get_object(Bucket=shuffling_bucket, Key=key) # contents = response['Body'].read() # io_time += time.time() - io_start_time # # for key_value in json.loads(contents): # line_count += 1 # intermediate_data.append(key_value) # except Exception as e: # logger.info("Key: %s" % key) # logger.info("Third time Error: %s" % str(e)) # raise RuntimeError("%s" % str(e)) intermediate_data.sort(key=lambda x: x[0]) begin_time = time.time() interval_time = random.randint(60, 180) interval_num_keys_processed = 0 average_num_keys = float(len(intermediate_data) / len(reduce_keys)) cur_key = None cur_values = [] outputs = [] for key, value in 
intermediate_data: if cur_key == key: cur_values.append(value) else: if cur_key is not None: reduce_function(outputs, (cur_key, cur_values)) cur_key = key cur_values = [value] if StaticVariables.OPTIMISATION_FN not in static_job_info \ or not static_job_info[StaticVariables.OPTIMISATION_FN]: interval_num_keys_processed += 1 current_time = time.time() if int(current_time - begin_time) > interval_time: begin_time = current_time interval_time = random.randint(1, 3) interval_num_files_processed = int( interval_num_keys_processed / average_num_keys) stage_progress_obj.increase_num_processed_keys( stage_progress_table_name, stage_id, interval_num_files_processed) interval_num_keys_processed = interval_num_keys_processed % average_num_keys if cur_key is not None: reduce_function(outputs, (cur_key, cur_values)) if StaticVariables.OPTIMISATION_FN not in static_job_info \ or not static_job_info[StaticVariables.OPTIMISATION_FN]: interval_num_files_processed = int(interval_num_keys_processed / average_num_keys) stage_progress_obj.increase_num_processed_keys( stage_progress_table_name, stage_id, interval_num_files_processed) logger.info("Reduce sample outputs: %s" % str(outputs[0:10])) if stage_id == total_num_stages: cur_output_handler = output_handler.get_output_handler( static_job_info[StaticVariables.OUTPUT_SOURCE_TYPE_FN], static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN], in_lambda=True) io_start_time = time.time() cur_output_handler.write_output(reducer_id, outputs, {}, static_job_info, submission_time) io_time += time.time() - io_start_time logger.info("Finished writing the output") else: mapper_filename = "%s/%s-%s/%s" % ( job_name, StaticVariables.OUTPUT_PREFIX, stage_id, reducer_id) io_start_time = time.time() s3_client.put_object(Bucket=shuffling_bucket, Key=mapper_filename, Body=json.dumps(outputs)) io_time += time.time() - io_start_time logger.info("Finished writing the output") lambda_client.invoke(FunctionName=coordinator_lambda_name, InvocationType='Event', Payload=json.dumps({'stage_id': stage_id})) logger.info("Finished scheduling the coordinator Lambda function") time_in_secs = time.time() - start_time metadata = { "lineCount": '%s' % line_count, "processingTime": '%s' % time_in_secs, "memoryUsage": '%s' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss, "numKeys": '%s' % len(reduce_keys), "ioTime": '%s' % io_time, "computeTime": '%s' % str(time_in_secs - io_time) } info_write_start_time = time.time() metrics_bucket = StaticVariables.METRICS_BUCKET % job_name execution_info_s3_key = "%s/stage-%s/%s" % (job_name, stage_id, reducer_id) s3_client.put_object(Bucket=metrics_bucket, Key=execution_info_s3_key, Body=json.dumps({}), Metadata=metadata) logger.info("Info write time: %s" % str(time.time() - info_write_start_time)) logger.info("Reducer %s finishes execution" % str(reducer_id)) logger.info("Execution time: %s" % str(time.time() - start_time))
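# Illustrative only (not part of the source): a sketch of the event payload this reduce handler
# expects, mirroring the fields read from `event` above and the payload built by the coordinator's
# scheduling functions. Field names are taken from the code; all values are made-up placeholders
# (the intermediate key layout follows "<job>/<output-prefix>-<stage>/bin-<n>/<mapper-id>").
example_reduce_event = {
    "keys": ["wordcount/output-2/bin-1/1", "wordcount/output-2/bin-1/2"],
    "id": 1,
    "load_data_from_input": False,   # sent by the coordinator, though not read by this handler
    "function_pickle_path": "job/reduce-3.pkl"
}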