def invoke(self):
    response = self.lambda_client.invoke(
        FunctionName=self.driver_lambda_name,
        InvocationType='RequestResponse',
        Payload=json.dumps({}))
    logger.info("Finished executing this job: %s" % response)
def __init__(self, args):
    logger.info('Initializing')
    self.args = args

    # Load config
    self.config = load_config()

    # Get selected detector
    if self.config['object_detection']['use_tensorflow']:
        self.detector = TensorFlowDetector()
        self.detector_class = TensorFlowDetector
        self.detector_config = self.config['object_detection']['tensorflow']

    # Load label maps
    self.categories = json.load(
        open(self.config['data']['label_map'], 'r')).get('classes')
    self.categories = [{'id': cat['id'], 'name': cat['name']}
                       for cat in self.categories]
    self.road_condition_label = TensorFlowRecognizer.load_recognizer_label()

    # Create dicts for simple translation between name and id
    self.cat2id = {cat['name']: cat['id'] for cat in self.categories}
    self.id2cat = {cat['id']: cat['name'] for cat in self.categories}

    # Initialize vars
    self.per_class_detections = None
    self.all_detections = None
    self.all_recognitions = None
    self.prepare_dicts()
def get_all_input_keys(self, static_job_info):
    # Returns all input keys to be processed: a list of format obj
    # where obj is a map of {'Key': ..., 'Size': ...}
    all_keys = []
    input_table_name = static_job_info[StaticVariables.INPUT_SOURCE_FN]
    response = self.client.describe_table(TableName=input_table_name)
    number_of_records = response['Table']['ItemCount']
    if number_of_records == 0:
        logger.warning("Number of live records in DynamoDB is 0. "
                       "Note that DynamoDB updates this number every 6 hours.")
        logger.info("The average size of one record will be assumed to be maximum")
        # The maximum size of a DynamoDB item is 400KB.
        one_record_avg_size = 400 * 1024
    else:
        one_record_avg_size = response['Table']['TableSizeBytes'] / number_of_records
    input_partition_key = static_job_info[StaticVariables.INPUT_PARTITION_KEY_DYNAMODB]
    input_sort_key = static_job_info[StaticVariables.INPUT_SORT_KEY_DYNAMODB] \
        if StaticVariables.INPUT_SORT_KEY_DYNAMODB in static_job_info else None
    projection_expression = input_partition_key[0]
    if input_sort_key is not None:
        projection_expression += ",%s" % input_sort_key[0]
    response = self.client.scan(TableName=input_table_name,
                                ProjectionExpression=projection_expression)
    for record in response['Items']:
        record_key = {input_partition_key[0]:
                      record[input_partition_key[0]][input_partition_key[1]]}
        if input_sort_key is not None:
            record_key[input_sort_key[0]] = record[input_sort_key[0]][input_sort_key[1]]
        all_keys.append({'Key': record_key, 'Size': int(one_record_avg_size)})
    return all_keys
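# Minimal sketch (not part of the original source) of how the returned list can
# be consumed; each entry has the shape {'Key': {...}, 'Size': <estimated bytes>},
# e.g. {'Key': {'record_id': '42'}, 'Size': 409600} with hypothetical attribute names.
def total_input_size_bytes(all_keys):
    """Sum the estimated sizes of the entries returned by get_all_input_keys()."""
    return sum(entry['Size'] for entry in all_keys)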
def get_source_files():
    logger.info("WebUI: Received request for path /get-source-files")
    job_name = request.args.get('job-name')
    is_local_testing = os.environ.get("local_testing") == 'True' \
        or os.environ.get("local_testing") == 'true'
    if is_local_testing:
        local_endpoint_url = 'http://localhost:4572'
        client = boto3.client('s3',
                              aws_access_key_id='',
                              aws_secret_access_key='',
                              region_name=StaticVariables.DEFAULT_REGION,
                              endpoint_url=local_endpoint_url)
    else:
        client = boto3.client('s3')
    response = client.get_object(
        Bucket=StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME,
        Key=(StaticVariables.S3_UI_REGISTERED_JOB_SOURCE_INFO_PATH % job_name))
    contents = response['Body'].read()
    contents_json = json.loads(contents)
    job_source_info = contents_json['sourceInfo']
    main = contents_json['main']
    source_files = {}
    for source_file_info in job_source_info:
        response = client.get_object(
            Bucket=StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME,
            Key=source_file_info["location"])
        contents = response['Body'].read()
        string_content = contents.decode("utf-8")
        source_files[source_file_info["filePath"]] = string_content
    response = {'main': main, 'sourceFiles': source_files}
    return jsonify(response)
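# Example request against this endpoint (host and port are illustrative, not
# taken from the original source):
#   GET http://localhost:5000/get-source-files?job-name=<registered-job-name>
# The JSON response has the shape:
#   {"main": "<entry file path>", "sourceFiles": {"<file path>": "<file contents>", ...}}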
def create_s3_event_source_notification(self, bucket, prefix):
    response = self.s3.put_bucket_notification_configuration(
        Bucket=bucket,
        NotificationConfiguration={
            'LambdaFunctionConfigurations': [{
                'Events': ['s3:ObjectCreated:*'],
                'LambdaFunctionArn': self.function_arn,
                'Filter': {
                    'Key': {
                        'FilterRules': [
                            {'Name': 'prefix', 'Value': prefix},
                        ]
                    }
                }
            }],
            # 'TopicConfigurations' : [],
            # 'QueueConfigurations' : []
        })
    logger.info("Creating S3 event notification to Lambda function %s - response: %s"
                % (self.function_name, response))
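# Ordering note (sketch, not from the original source): S3 rejects a Lambda
# notification target unless that function has already granted s3.amazonaws.com
# invoke permission, so add_lambda_permission(), shown elsewhere in this section,
# is expected to run first. Illustrative call sequence with a hypothetical
# `lambda_manager` instance:
# lambda_manager.add_lambda_permission(s_id=1, bucket='example-input-bucket')
# lambda_manager.create_s3_event_source_notification('example-input-bucket', 'input/')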
def get_num_completed_operators():
    job_name = request.args.get('job-name')
    submission_time = request.args.get('submission-time')
    logger.info("WebUI: Received request for path /num-completed-operators "
                "with parameters: %s, %s" % (job_name, submission_time))
    is_local_testing = os.environ.get("local_testing") == 'True' \
        or os.environ.get("local_testing") == 'true'
    input_handler_s3_obj = input_handler_s3.InputHandlerS3(
        in_lambda=False, is_local_testing=is_local_testing)
    s3_stage_conf_path = StaticVariables.S3_UI_STAGE_CONFIGURATION_PATH % (
        job_name, submission_time)
    stage_config = json.loads(
        input_handler_s3_obj.read_value(
            StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME,
            s3_stage_conf_path, None))
    stage_state_obj = stage_state.StageState(
        in_lambda=False, is_local_testing=is_local_testing)
    stage_states = stage_state_obj.read_state_table(
        StaticVariables.STAGE_STATE_DYNAMODB_TABLE_NAME % (job_name, submission_time))
    result = {}
    for stage_id, stage_num_completed in stage_states.items():
        result[stage_id] = [stage_num_completed,
                            stage_config[stage_id]["num_operators"]]
    return jsonify(result)
def set_up_local_input_data(self, input_file_paths, static_job_info):
    input_bucket = static_job_info[StaticVariables.INPUT_SOURCE_FN]
    self.client.create_bucket(Bucket=input_bucket)
    s3_bucket_exists_waiter = self.client.get_waiter('bucket_exists')
    s3_bucket_exists_waiter.wait(Bucket=input_bucket)
    self.client.put_bucket_acl(
        ACL='public-read-write',
        Bucket=input_bucket,
    )
    for i in range(len(input_file_paths)):
        input_file_path = input_file_paths[i]
        if os.path.isdir(input_file_path):
            continue
        if StaticVariables.INPUT_PREFIX_FN in static_job_info:
            prefix = static_job_info[StaticVariables.INPUT_PREFIX_FN]
            key = '%s/input-%s' % (prefix, str(i + 1))
        else:
            key = 'input-%s' % (str(i + 1))
        self.client.upload_file(Filename=input_file_path,
                                Bucket=input_bucket, Key=key)
    logger.info("Set up local input data successfully")
def create_lambda_function(self, stage_id, total_num_stages, submission_time,
                           coordinator_lambda_name, num_reducers):
    runtime = 'python3.7'
    response = self.lambda_client.create_function(
        FunctionName=self.function_name,
        Code={"ZipFile": open(self.code_file, 'rb').read()},
        Handler=self.handler,
        Role=self.role,
        Runtime=runtime,
        Description=self.function_name,
        Environment={
            'Variables': {
                "serverless_mapreduce_role": self.role,
                "stage_id": str(stage_id),
                "total_num_stages": str(total_num_stages),
                'submission_time': str(submission_time),
                'coordinator_lambda_name': str(coordinator_lambda_name),
                "num_reducers": str(num_reducers)
            }
        },
        MemorySize=self.memory,
        Timeout=self.timeout,
        TracingConfig={'Mode': 'PassThrough'})
    self.function_arn = response['FunctionArn']
    logger.info("Creation of Lambda function %s - response: %s"
                % (self.function_name, response))
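# Sketch (not the project's actual handler code) of how the environment
# variables injected above can be read back inside the deployed Lambda function:
import os

def read_stage_environment():
    """Collect the stage settings that create_lambda_function() sets."""
    return {
        'stage_id': int(os.environ['stage_id']),
        'total_num_stages': int(os.environ['total_num_stages']),
        'submission_time': os.environ['submission_time'],
        'coordinator_lambda_name': os.environ['coordinator_lambda_name'],
        'num_reducers': int(os.environ['num_reducers']),
    }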
def load_model(self):
    """ Load the TensorFlow model into memory and get tensor names """
    logger.info('Loading model')

    # Set gpu device for process
    os.environ["CUDA_VISIBLE_DEVICES"] = str(self.config['gpu_id'])
    # with tf.device('/gpu:{}'.format(self.config['gpu_id'])):
    recognition_graph = tf.Graph()
    with recognition_graph.as_default():
        rc_graph_def = tf.GraphDef()
        with tf.gfile.GFile(self.config['graph_path'], 'rb') as fid:
            serialized_graph = fid.read()
            rc_graph_def.ParseFromString(serialized_graph)
            tf.import_graph_def(rc_graph_def, name='')

    # Save session for later access
    self.sess = tf.Session(graph=recognition_graph)
    self.input_operation = recognition_graph.get_operation_by_name(
        self.config['input_layer'])
    self.output_operation = recognition_graph.get_operation_by_name(
        self.config['output_layer'])
    logger.info('Model loaded')
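# Minimal inference sketch (an assumption, not a method from the original
# class): feeds an already preprocessed image batch through the graph loaded
# above and returns the raw output scores.
def run_inference(self, image_tensor):
    results = self.sess.run(
        self.output_operation.outputs[0],
        {self.input_operation.outputs[0]: image_tensor})
    return results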
def create_progress_table(self, table_name):
    waiter = self.client.get_waiter('table_not_exists')
    waiter.wait(TableName=table_name)
    self.client.create_table(
        AttributeDefinitions=[{
            'AttributeName': 'stage_id',
            'AttributeType': 'N'
        }],
        TableName=table_name,
        KeySchema=[{
            'AttributeName': 'stage_id',
            'KeyType': 'HASH'
        }],
        ProvisionedThroughput={
            'ReadCapacityUnits': 100,
            'WriteCapacityUnits': 100
        }
    )

    # Wait until the created table becomes active
    response = self.client.describe_table(TableName=table_name)['Table']['TableStatus']
    while response != 'ACTIVE':
        time.sleep(1)
        response = self.client.describe_table(TableName=table_name)['Table']['TableStatus']
    logger.info("Stage progress table created successfully")
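# Usage sketch (assumption: this method and initialise_progress_table(), shown
# elsewhere in this section, belong to the same helper instance, here called
# `progress`). The table is typically created and then seeded with one row per stage:
# progress.create_progress_table(progress_table_name)   # blocks until ACTIVE
# progress.initialise_progress_table(progress_table_name, num_stages)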
def __init__(self):
    logger.info('[ApiSncf] - Instantiation of ApiSncf class')
    load_dotenv(dotenv_path='utils/.env')
    self.url_api = 'https://api.sncf.com/v1/coverage/sncf/stop_areas?count=1000'
    self.token_auth = os.environ.get('TOKEN_AUTH')
    self.filename_export = 'stop_areas'
    self.request_api_sncf = None
    self.list_gares = []
def get_map_reduce_outputs(bucket, job_name, stage_ids):
    keys_bins = []
    for stage_id in stage_ids:
        prefix = "%s/%s-%s/" % (job_name, StaticVariables.OUTPUT_PREFIX, stage_id)
        for obj in s3_client.list_objects(Bucket=bucket, Prefix=prefix)["Contents"]:
            if not obj["Key"].endswith('/'):
                keys_bins.append([obj["Key"]])
    logger.info("Key Batches for stage %s is: %s" % (stage_ids[0], str(keys_bins)))
    return keys_bins
def initialise_in_degree_table(self, table_name, in_degrees):
    for pipeline_id, in_degree in in_degrees.items():
        self.client.put_item(
            TableName=table_name,
            Item={
                'pipeline_id': {'N': str(pipeline_id)},
                'in_degree': {'N': str(in_degree)}
            }
        )
    logger.info("In degree table initialised successfully")
def delete_state_table(self, table_name):
    existing_tables = self.client.list_tables()['TableNames']
    if table_name in existing_tables:
        self.client.delete_table(TableName=table_name)
        logger.info("Stage state table deleted successfully")
        return
    logger.info("Stage state table has not been created")
def load_label(self):
    """ Load road condition label map """
    logger.info('Loading label map')
    proto_as_ascii_lines = tf.gfile.GFile(self.config['label_map']).readlines()
    for line in proto_as_ascii_lines:
        self.label.append(line.rstrip())
def set_up_bucket(self, bucket_name):
    self.s3_client.create_bucket(Bucket=bucket_name)
    s3_bucket_exists_waiter = self.s3_client.get_waiter('bucket_exists')
    s3_bucket_exists_waiter.wait(Bucket=bucket_name)
    if self.static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN]:
        self.s3_client.put_bucket_acl(
            ACL='public-read-write',
            Bucket=bucket_name,
        )
    logger.info("%s Bucket created successfully" % bucket_name)
def run(self):
    # 1. Create the aws_lambda functions
    function_lambdas, invoking_pipelines_info, num_outputs = self._create_lambdas()
    cur_output_handler = output_handler.get_output_handler(
        self.static_job_info[StaticVariables.OUTPUT_SOURCE_TYPE_FN],
        self.static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN],
        self.is_serverless)
    cur_output_handler.create_output_storage(self.static_job_info,
                                             self.submission_time)

    # Execute
    # 2. Invoke Mappers asynchronously
    self._invoke_pipelines(invoking_pipelines_info)

    # 3. Calculate costs - Approx (since we are using exec time reported
    #    by our func and not billed ms)
    StaticVariables.JOB_START_TIME = time.time()
    logger.info("PERFORMANCE INFO: Job setup time: %s"
                % (StaticVariables.JOB_START_TIME - StaticVariables.SETUP_START_TIME))
    self._calculate_cost(num_outputs, cur_output_handler, invoking_pipelines_info)

    # 4. Delete the function lambdas
    for function_lambda in function_lambdas:
        function_lambda.delete_function()

    if StaticVariables.OPTIMISATION_FN not in self.static_job_info \
            or not self.static_job_info[StaticVariables.OPTIMISATION_FN]:
        self._update_duration()
        # 5. View one of the last stage executor's outputs
        # logger.info(cur_output_handler.get_output(3, self.static_job_info, self.submission_time))
    else:
        job_name = self.static_job_info[StaticVariables.JOB_NAME_FN]
        table_name = StaticVariables.STAGE_STATE_DYNAMODB_TABLE_NAME % (
            job_name, self.submission_time)
        self.map_phase_state.delete_state_table(table_name)
        in_degree_obj = in_degree.InDegree(
            in_lambda=self.is_serverless,
            is_local_testing=self.static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN])
        in_degree_table_name = StaticVariables.IN_DEGREE_DYNAMODB_TABLE_NAME % (
            job_name, self.submission_time)
        in_degree_obj.delete_in_degree_table(in_degree_table_name)
        # metrics_bucket = StaticVariables.METRICS_BUCKET % job_name
        # self.delete_s3_objects(metrics_bucket, "")
        # self.s3_client.delete_bucket(Bucket=metrics_bucket)

    tear_down_time = time.time() - StaticVariables.TEAR_DOWN_START_TIME
    logger.info("PERFORMANCE INFO - Job tear down time: %s seconds" % str(tear_down_time))
    return self.submission_time
def delete_files(dirname, filenames):
    logger.info("At delete_files, the current working directory is %s" % str(os.getcwd()))
    for filename in filenames:
        if dirname == "":
            dst_file = filename
        else:
            dst_file = "%s/%s" % (dirname, filename)
        logger.info("The file to delete: %s" % str(dst_file))
        if os.path.exists(dst_file):
            os.remove(dst_file)
def get_map_shuffle_outputs(num_bins, bucket, job_name, stage_id):
    keys_bins = [[] for _ in range(num_bins)]
    for bin_id in range(1, num_bins + 1):
        prefix = "%s/%s-%s/bin-%s/" % (job_name, StaticVariables.OUTPUT_PREFIX,
                                       stage_id, bin_id)
        objs = s3_client.list_objects(Bucket=bucket, Prefix=prefix)["Contents"]
        # logger.info("********************The number of items is: %s********************" % str(len(objs)))
        keys_bins[bin_id - 1] = [obj["Key"] for obj in objs]
    logger.info("Key Batches for stage %s is: %s" % (stage_id, str(keys_bins)))
    return keys_bins
def initialise_progress_table(self, table_name, num_stages):
    for i in range(1, num_stages + 1):
        self.client.put_item(
            TableName=table_name,
            Item={
                'stage_id': {'N': str(i)},
                'num_processed_keys': {'N': str(0)},
                'total_num_keys': {'N': str(0)}
            }
        )
    logger.info("Stage progress table initialised successfully")
def set_up_local_input_data(static_job_info):
    if static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN]:
        cur_input_handler = input_handler.get_input_handler(
            static_job_info[StaticVariables.INPUT_SOURCE_TYPE_FN],
            static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN])
        os.chdir(StaticVariables.PROJECT_WORKING_DIRECTORY)
        local_testing_input_path = static_job_info[StaticVariables.LOCAL_TESTING_INPUT_PATH]
        local_file_paths = glob.glob(local_testing_input_path + "**", recursive=True)
        logger.info("The current working directory for local file paths is: %s" % os.getcwd())
        logger.info("The list of local file paths: %s" % local_file_paths)
        cur_input_handler.set_up_local_input_data(local_file_paths, static_job_info)
        os.chdir(StaticVariables.LIBRARY_WORKING_DIRECTORY)
def schedule_same_pipeline_next_stage(stage_configuration, stage_id,
                                      shuffling_bucket, job_name, submission_time):
    cur_stage_config = stage_configuration[str(stage_id)]
    next_stage_config = stage_configuration[str(stage_id + 1)]
    invoking_lambda_name = next_stage_config["invoking_lambda_name"]
    next_stage_num_operators = stage_configuration[str(stage_id + 1)]["num_operators"]
    if cur_stage_config["stage_type"] == 1:
        keys_bins = get_map_shuffle_outputs(next_stage_num_operators,
                                            shuffling_bucket, job_name, stage_id)
    else:
        keys_bins = get_map_reduce_outputs(shuffling_bucket, job_name, [stage_id])

    # keys_bin_size = len(keys_bins[0])
    # for i in range(1, len(keys_bins)):
    #     assert keys_bin_size == len(keys_bins[i])

    if StaticVariables.OPTIMISATION_FN not in static_job_info \
            or not static_job_info[StaticVariables.OPTIMISATION_FN]:
        stage_progress_obj = stage_progress.StageProgress(
            in_lambda=True,
            is_local_testing=static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN])
        stage_progress_table_name = StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % (
            job_name, submission_time)
        total_num_jobs = sum([len(keys_bin) for keys_bin in keys_bins])
        stage_progress_obj.update_total_num_keys(stage_progress_table_name,
                                                 stage_id + 1, total_num_jobs)
    if next_stage_config["stage_type"] == 1:
        for i in range(len(keys_bins)):
            response = lambda_client.invoke(
                FunctionName=invoking_lambda_name,
                InvocationType='Event',
                Payload=json.dumps({
                    "keys": keys_bins[i],
                    "id": i + 1,
                    "load_data_from_input": False,
                    "function_pickle_path": next_stage_config["function_pickle_path"],
                    "combiner_function_pickle_path": next_stage_config["combiner_function_pickle_path"],
                    "partition_function_pickle_path": next_stage_config["partition_function_pickle_path"]
                })
            )
    else:
        for i in range(len(keys_bins)):
            response = lambda_client.invoke(
                FunctionName=invoking_lambda_name,
                InvocationType='Event',
                Payload=json.dumps({
                    "keys": keys_bins[i],
                    "id": i + 1,
                    "load_data_from_input": False,
                    "function_pickle_path": next_stage_config["function_pickle_path"]
                })
            )
    logger.info("All operators finished in stage %s, next stage: number of operators scheduled: %s"
                % (stage_id, next_stage_num_operators))
def __init__(self):
    logger.info('Initializing')
    self.config = load_config()['road_condition']

    # Create empty attribute fields
    self.sess = None
    self.input_operation = None
    self.output_operation = None

    # Load label
    self.label = []
    self.load_label()
def copy_job_function(function):
    logger.info("Library working directory is %s" % library_working_dir)
    inspect_object = inspect.getfile(function)
    rel_filepath = os.path.relpath(inspect_object)
    logger.info("The path of the function is %s" % rel_filepath)
    # dst_file = "%s/%s/%s" % (library_working_dir, "user_job_3", "map.py")
    dst_file = "%s/%s" % (library_working_dir, rel_filepath)
    if os.path.normpath(inspect_object) != os.path.normpath(dst_file):
        os.makedirs(os.path.dirname(dst_file), exist_ok=True)
        shutil.copy2(rel_filepath, dst_file)
    return rel_filepath
def create_output_storage(self, static_job_info, submission_time):
    output_bucket = static_job_info[StaticVariables.SHUFFLING_BUCKET_FN] \
        if StaticVariables.OUTPUT_SOURCE_FN not in static_job_info \
        else static_job_info[StaticVariables.OUTPUT_SOURCE_FN]
    self.client.create_bucket(Bucket=output_bucket)
    s3_bucket_exists_waiter = self.client.get_waiter('bucket_exists')
    s3_bucket_exists_waiter.wait(Bucket=output_bucket)
    if static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN]:
        self.client.put_bucket_acl(
            ACL='public-read-write',
            Bucket=output_bucket,
        )
    logger.info("Finished setting up output bucket")
def load_label_map(self):
    """
    Load the label map.

    :return: TensorFlow specific label map
    """
    logger.info('Loading label map')
    label_map = label_map_util.load_labelmap(self.config_tf['label_map'])
    categories = label_map_util.convert_label_map_to_categories(
        label_map,
        max_num_classes=self.config_tf['max_class_id'],
        use_display_name=True)
    return label_map_util.create_category_index(categories)
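# Usage sketch (an assumption, not part of the original source): the category
# index returned above maps each class id to a record such as {'id': 3, 'name': 'car'},
# so a detected class id can be resolved to a display name like this:
def class_id_to_name(category_index, class_id):
    return category_index[class_id]['name'] if class_id in category_index else 'unknown'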
def check_job_finish(self, completed_executors, num_final_dst_operators,
                     static_job_info, submission_time):
    last_stage_keys = []
    reducer_metadata = []
    lambda_time = 0
    for record in completed_executors:
        last_stage_keys.append(record[OutputHandlerDynamoDB.METADATA_TABLE_KEY_NAME]['S'])
        reducer_metadata.append(
            json.loads(record[OutputHandlerDynamoDB.METADATA_TABLE_COLUMN_NAME]['S']))
    if len(last_stage_keys) == num_final_dst_operators:
        output_table_name_prefix = static_job_info[StaticVariables.SHUFFLING_BUCKET_FN] \
            if StaticVariables.OUTPUT_SOURCE_FN not in static_job_info \
            else static_job_info[StaticVariables.OUTPUT_SOURCE_FN]
        output_table_name = "%s-%s" % (output_table_name_prefix, submission_time)
        job_name = static_job_info[StaticVariables.JOB_NAME_FN]
        metadata_table_name = "%s-%s-metadata" % (job_name, submission_time)
        metadata_table_size = self.client.describe_table(
            TableName=metadata_table_name)['Table']['TableSizeBytes']
        output_table_info = self.client.describe_table(
            TableName=output_table_name)['Table']
        output_table_item_count = output_table_info['ItemCount']
        output_table_size = output_table_info['TableSizeBytes']
        dynamodb_size = output_table_size + metadata_table_size
        for data in reducer_metadata:
            # Even though the metadata processing time is written as processingTime,
            # AWS does not accept uppercase letters in metadata keys
            lambda_time += float(data['processingTime'])
        num_write_ops = len(last_stage_keys) + output_table_item_count
        num_read_ops = 0

        # DynamoDB costs $0.25/GB/month; if approximated by 3 cents/GB/month,
        # then per hour it is $0.000052/GB
        storage_cost = 1 * 0.0000521574022522109 * (dynamodb_size / 1024.0 / 1024.0 / 1024.0)
        # DynamoDB write: $1.25/1000000
        write_cost = num_write_ops * 1.25 / 1000000
        # DynamoDB read: $0.25/1000000
        read_cost = num_read_ops * 0.25 / 1000000
        logger.info("Last stage number of write ops: %s" % num_write_ops)
        logger.info("Last stage number of read ops: %s" % num_read_ops)
        return lambda_time, storage_cost, write_cost, read_cost
    return -1, -1, -1, -1
def increment_current_stage_id(self, table_name):
    response = self.client.update_item(
        TableName=table_name,
        Key={'stage_id': {'N': str(-1)}},
        UpdateExpression="set current_stage_id = current_stage_id + :val",
        ExpressionAttributeValues={':val': {'N': str(1)}},
        ReturnValues="UPDATED_NEW")
    logger.info("Current stage id incremented successfully")
    return response
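# Sketch helper (not in the original source): with ReturnValues="UPDATED_NEW",
# the low-level DynamoDB client returns the incremented counter as a string in
# the attribute map, so the new stage id can be extracted from the response:
def extract_new_stage_id(update_response):
    return int(update_response['Attributes']['current_stage_id']['N'])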
def add_lambda_permission(self, s_id, bucket):
    try:
        response = self.lambda_client.add_permission(
            Action='lambda:InvokeFunction',
            FunctionName=self.function_name,
            Principal='s3.amazonaws.com',
            StatementId='%s' % s_id,
            SourceArn='arn:aws:s3:::' + bucket)
        logger.info("Adding permission to Lambda function %s - response: %s"
                    % (self.function_name, response))
    except Exception as e:
        logger.info("Failed to add permission to Lambda function %s: %s"
                    % (self.function_name, e))
def update_function(self):
    """ Update aws_lambda function """
    response = self.lambda_client.update_function_code(
        FunctionName=self.function_name,
        ZipFile=open(self.code_file, 'rb').read(),
        Publish=True)
    updated_arn = response['FunctionArn']
    # parse arn and remove the release number (:n)
    arn = ":".join(updated_arn.split(':')[:-1])
    self.function_arn = arn
    logger.info("Update of Lambda function %s - response: %s"
                % (self.function_name, response))