def _send_status_rabbitmq(self):
    """
    Send the status event to RabbitMQ.

    The response dict is JSON-encoded and published to a fanout exchange
    named ``cloudbutton-<executor_id>-<job_id>``. At most 5 attempts are
    made, 0.2 seconds apart. Errors raised while connecting, declaring the
    exchange or publishing are logged and not propagated.

    :return: None
    """
    dmpd_response_status = json.dumps(self.response)
    drs = sizeof_fmt(len(dmpd_response_status))

    executor_id = self.response['executor_id']
    job_id = self.response['job_id']

    rabbit_amqp_url = self.config['rabbitmq'].get('amqp_url')
    params = pika.URLParameters(rabbit_amqp_url)
    exchange = 'cloudbutton-{}-{}'.format(executor_id, job_id)

    max_attempts = 5
    for attempt in range(1, max_attempts + 1):
        connection = None
        try:
            connection = pika.BlockingConnection(params)
            channel = connection.channel()
            channel.exchange_declare(exchange=exchange, exchange_type='fanout',
                                     auto_delete=True)
            channel.basic_publish(exchange=exchange, routing_key='',
                                  body=dmpd_response_status)
            connection.close()
            logger.info("Execution status sent to rabbitmq - Size: {}".format(drs))
            return
        except Exception as e:
            logger.error("Unable to send status to rabbitmq")
            logger.error(str(e))
        finally:
            # A failure after the connection was opened must not leak it:
            # every retry would otherwise leave one more open connection.
            if connection is not None and connection.is_open:
                try:
                    connection.close()
                except Exception as close_error:
                    logger.debug("Unable to close rabbitmq connection: {}"
                                 .format(close_error))

        # Only reached when the attempt failed (success returns above).
        if attempt < max_attempts:
            logger.info('Retrying to send status to rabbitmq...')
            time.sleep(0.2)
        else:
            logger.error("Giving up sending status to rabbitmq after {} attempts"
                         .format(max_attempts))
def put_object(self, bucket_name, key, data):
    """
    Put an object in COS. Override the object if the key already exists.

    A read timeout is retried until OBJ_REQ_RETRIES retries have been made,
    after which the timeout error is re-raised.

    :param bucket_name: name of the bucket that receives the object.
    :param key: key of the object.
    :param data: data of the object
    :type data: str/bytes
    :return: True once the request has completed (whatever the HTTP status)
    :raises StorageNoSuchKeyError: when the client error code is "NoSuchKey"
    """
    timeouts_seen = 0
    while True:
        try:
            response = self.cos_client.put_object(Bucket=bucket_name, Key=key, Body=data)
            http_code = response['ResponseMetadata']['HTTPStatusCode']
            if http_code == 200:
                status = 'OK'
            else:
                status = 'Error'
            try:
                size_msg = 'PUT Object {} - Size: {} - {}'.format(
                    key, sizeof_fmt(len(data)), status)
                logger.debug(size_msg)
            except Exception:
                # Fall back to a message without the size (e.g. data has no len()).
                logger.debug('PUT Object {} {}'.format(key, status))
        except ibm_botocore.exceptions.ClientError as e:
            error_code = e.response['Error']['Code']
            if error_code == "NoSuchKey":
                raise StorageNoSuchKeyError(bucket_name, key)
            raise e
        except ibm_botocore.exceptions.ReadTimeoutError as e:
            if timeouts_seen == OBJ_REQ_RETRIES:
                raise e
            logger.debug('PUT Object timeout. Retrying request')
            timeouts_seen += 1
        else:
            # The request went through: the status was only needed for logging.
            return True
def _send_status_os(self):
    """
    Send the status event to the Object Storage.

    An '__init__' event stores an empty object under the init key; an
    '__end__' event stores the JSON-encoded response under the status key.
    Any other event type stores nothing.
    """
    response = self.response
    executor_id = response['executor_id']
    job_id = response['job_id']
    call_id = response['call_id']
    act_id = response['activation_id']
    event_type = response['type']

    if event_type == '__init__':
        init_key = create_init_key(JOBS_PREFIX, executor_id, job_id, call_id, act_id)
        self.internal_storage.put_data(init_key, '')
        return

    if event_type != '__end__':
        return

    status_key = create_status_key(JOBS_PREFIX, executor_id, job_id, call_id)
    dmpd_response_status = json.dumps(self.response)
    drs = sizeof_fmt(len(dmpd_response_status))
    logger.info("Storing execution stats - Size: {}".format(drs))
    self.internal_storage.put_data(status_key, dmpd_response_status)
def get_memory_usage(formatted=True):
    """
    Gets the current memory usage of the runtime.
    To be used only in the action code.

    :param formatted: when true, the value is passed through sizeof_fmt;
                      otherwise the plain integer is returned.
    :return: the memory usage, or None when not running on a unix system.
    """
    if not is_unix_system():
        return None

    # Arguments for ps_mem: no pid filter, no argument splitting,
    # no per-pid discrimination.
    pids_to_show = None
    split_args = False
    discriminate_by_pid = False

    ps_mem.verify_environment(pids_to_show)
    _cmds, _shareds, _count, total, _swaps, _total_swap = ps_mem.get_memory_usage(
        pids_to_show, split_args, discriminate_by_pid,
        include_self=True, only_self=False)

    usage = int(ps_mem.human(total, units=1))
    return sizeof_fmt(usage) if formatted else usage
def put_object(self, container_name, key, data):
    """
    Put an object in Swift. Override the object if the key already exists.

    This is a best-effort operation: any exception raised while sending the
    request is logged and swallowed, it is not propagated to the caller.

    :param container_name: name of the container that receives the object.
    :param key: key of the object.
    :param data: data of the object
    :type data: str/bytes
    :return: None
    """
    url = '/'.join([self.endpoint, container_name, key])
    try:
        res = self.session.put(url, data=data)
        # The request is reported as 'OK' only on HTTP 201.
        status = 'OK' if res.status_code == 201 else 'Error'
        try:
            logger.debug('PUT Object {} - Size: {} - {}'.format(
                key, sizeof_fmt(len(data)), status))
        except Exception:
            # Size could not be computed/formatted; log without it.
            logger.debug('PUT Object {} - {}'.format(key, status))
    except Exception as e:
        # Report through the module logger, with the key and URL, instead of
        # a bare print() that carries no context. Still not re-raised so
        # existing callers keep their best-effort behavior.
        logger.error('PUT Object {} failed ({}): {}'.format(key, url, e))
def _create_job(config, internal_storage, executor_id, job_id, func, data,
                runtime_meta, runtime_memory=None, extra_env=None,
                invoke_pool_threads=128, include_modules=[], exclude_modules=[],
                execution_timeout=None, job_created_tstamp=None):
    """
    Serialize a function and its input data, upload both through the
    internal storage and build the description of the job.

    :param config: configuration dict; its 'cloudbutton' section is read.
    :param internal_storage: object used to upload the data (put_data) and
        the function plus modules (put_func).
    :param executor_id: executor identifier, used in keys and log messages.
    :param job_id: job identifier, used in keys and log messages.
    :param func: the function to serialize.
    :param data: list of input data, one entry per call.
    :param runtime_meta: runtime metadata; its 'preinstalls' entry is given
        to the serializer.
    :param runtime_memory: runtime memory. Default: value from config.
    :param extra_env: Additional environment variables. Default None.
    :param invoke_pool_threads: Number of threads to use to invoke.
    :param include_modules: modules to include, merged with the config ones.
        None disables the include set altogether.
    :param exclude_modules: modules to exclude, merged with the config ones.
    :param execution_timeout: timeout of the execution. Default: the config
        'runtime_timeout' minus 5.
    :param job_created_tstamp: stored as-is in the job metadata.
    :return: the job description dict, or an empty list if `data` is empty.
    :raises Exception: if the serialized data exceeds the data limit.
    """
    log_level = os.getenv('CLOUDBUTTON_LOGLEVEL')
    cb_config = config['cloudbutton']

    runtime_name = cb_config['runtime']
    if runtime_memory is None:
        runtime_memory = cb_config['runtime_memory']

    if extra_env is None:
        ext_env = {}
    else:
        ext_env = extra_env.copy()
    if ext_env:
        ext_env = utils.convert_bools_to_string(ext_env)
        logger.debug("Extra environment vars {}".format(ext_env))

    if not data:
        return []

    if execution_timeout is None:
        execution_timeout = cb_config['runtime_timeout'] - 5

    job_description = {
        'runtime_name': runtime_name,
        'runtime_memory': runtime_memory,
        'execution_timeout': execution_timeout,
        'function_name': func.__name__,
        'extra_env': ext_env,
        'total_calls': len(data),
        'invoke_pool_threads': invoke_pool_threads,
        'executor_id': executor_id,
        'job_id': job_id,
    }

    exclude_modules_cfg = cb_config.get('exclude_modules', [])
    include_modules_cfg = cb_config.get('include_modules', [])

    # Excluded modules: union of the configured and the explicit ones.
    exc_modules = set()
    for excluded in (exclude_modules_cfg, exclude_modules):
        if excluded:
            exc_modules.update(excluded)

    # Included modules: union of the configured and the explicit ones; the
    # set is replaced by None when include_modules is None, or when the
    # configured value is None and no explicit module was given.
    inc_modules = set()
    if include_modules_cfg is not None:
        inc_modules.update(include_modules_cfg)
    elif not include_modules:
        inc_modules = None
    if include_modules:
        inc_modules.update(include_modules)
    elif include_modules is None:
        inc_modules = None

    host_job_meta = {'job_created_tstamp': job_created_tstamp}

    logger.debug('ExecutorID {} | JobID {} - Serializing function and data'
                 .format(executor_id, job_id))
    serializer = SerializeIndependent(runtime_meta['preinstalls'])
    func_and_data_ser, mod_paths = serializer([func] + data, inc_modules, exc_modules)

    # First serialized item is the function, the remaining ones are the data.
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(item) for item in data_strs)

    module_data = create_module_data(mod_paths)
    func_str = func_and_data_ser[0]
    func_module_payload = {'func': func_str, 'module_data': module_data}
    func_module_str = pickle.dumps(func_module_payload, -1)
    func_module_size_bytes = len(func_module_str)
    total_size = utils.sizeof_fmt(data_size_bytes + func_module_size_bytes)

    host_job_meta['data_size_bytes'] = data_size_bytes
    host_job_meta['func_module_size_bytes'] = func_module_size_bytes

    data_limit = cb_config['data_limit'] if 'data_limit' in cb_config else MAX_AGG_DATA_SIZE
    # data_limit is multiplied by 1024**2 before being compared with the byte count.
    if data_limit and data_size_bytes > data_limit * 1024**2:
        log_msg = ('ExecutorID {} | JobID {} - Total data exceeded maximum size '
                   'of {}'.format(executor_id, job_id,
                                  utils.sizeof_fmt(data_limit * 1024**2)))
        raise Exception(log_msg)

    log_msg = ('ExecutorID {} | JobID {} - Uploading function and data '
               '- Total: {}'.format(executor_id, job_id, total_size))
    if not log_level:
        print(log_msg)
    else:
        logger.info(log_msg)

    # Upload data
    data_key = create_agg_data_key(JOBS_PREFIX, executor_id, job_id)
    job_description['data_key'] = data_key
    data_bytes, data_ranges = utils.agg_data(data_strs)
    job_description['data_ranges'] = data_ranges
    upload_started = time.time()
    internal_storage.put_data(data_key, data_bytes)
    upload_finished = time.time()
    host_job_meta['data_upload_time'] = round(upload_finished - upload_started, 6)

    # Upload function and modules
    upload_started = time.time()
    func_key = create_func_key(JOBS_PREFIX, executor_id, job_id)
    job_description['func_key'] = func_key
    internal_storage.put_func(func_key, func_module_str)
    upload_finished = time.time()
    host_job_meta['func_upload_time'] = round(upload_finished - upload_started, 6)

    job_description['metadata'] = host_job_meta

    return job_description
def run(self):
    """
    Runs the function.

    Loads the function, its modules and the input data, calls the function
    with the data as keyword arguments, records timing/result/exception
    information through ``self.stats`` and, when there is a result to keep,
    stores the pickled result under ``self.output_key``. A "Finished"
    message is always sent through ``self.jobrunner_conn`` at the end.

    Exceptions raised by the function (or by the preparation steps) are not
    propagated: they are printed, pickled and recorded in the stats.
    """
    # self.stats.write('jobrunner_start', time.time())
    logger.info("Started")
    result = None
    exception = False
    try:
        loaded_func_all = self._get_function_and_modules()
        self._save_modules(loaded_func_all['module_data'])
        function = self._unpickle_function(loaded_func_all['func'])
        data = self._load_data()

        # The '__PW_REDUCE_JOB' environment flag takes precedence over the
        # object-processing check: only one of the two preparations runs.
        if strtobool(os.environ.get('__PW_REDUCE_JOB', 'False')):
            self._wait_futures(data)
        elif is_object_processing_function(function):
            self._load_object(data)

        self._fill_optional_args(function, data)

        logger.info("Going to execute '{}()'".format(str(
            function.__name__)))
        print('---------------------- FUNCTION LOG ----------------------',
              flush=True)
        function_start_tstamp = time.time()
        # data is expanded as keyword arguments of the function.
        result = function(**data)
        function_end_tstamp = time.time()
        print('----------------------------------------------------------',
              flush=True)
        logger.info("Success function execution")

        self.stats.write('function_start_tstamp', function_start_tstamp)
        self.stats.write('function_end_tstamp', function_end_tstamp)
        self.stats.write(
            'function_exec_time',
            round(function_end_tstamp - function_start_tstamp, 8))

        # Check for new futures
        if result is not None:
            self.stats.write("result", True)
            # A result is flagged as "new futures" when it is a
            # ResponseFuture, or a non-empty list whose first element is one
            # (only the first element is inspected).
            if isinstance(result, ResponseFuture) or \
               (type(result) == list and len(result) > 0 and isinstance(result[0], ResponseFuture)):
                self.stats.write('new_futures', True)

            logger.debug("Pickling result")
            output_dict = {'result': result}
            # pickled_output is only bound on this path; the finally block
            # reads it only when result is not None and no exception occurred.
            pickled_output = pickle.dumps(output_dict)

        else:
            logger.debug("No result to store")
            self.stats.write("result", False)

    except Exception:
        exception = True
        self.stats.write("exception", True)
        exc_type, exc_value, exc_traceback = sys.exc_info()
        print('----------------------- EXCEPTION !-----------------------',
              flush=True)
        traceback.print_exc(file=sys.stdout)
        print('----------------------------------------------------------',
              flush=True)

        try:
            logger.debug("Pickling exception")
            pickled_exc = pickle.dumps(
                (exc_type, exc_value, exc_traceback))
            pickle.loads(
                pickled_exc
            )  # this is just to make sure they can be unpickled
            # The pickled bytes are recorded as their str() representation.
            self.stats.write("exc_info", str(pickled_exc))

        except Exception as pickle_exception:
            # Shockingly often, modules like subprocess don't properly
            # call the base Exception.__init__, which results in them
            # being unpickleable. As a result, we actually wrap this in a try/catch block
            # and more-carefully handle the exceptions if any part of this save / test-reload
            # fails
            # NOTE(review): this fallback still pickles exc_traceback and
            # pickle_exception themselves; if either cannot be pickled the
            # error escapes this handler (after the finally block runs).
            # Presumably traceback pickling support is installed elsewhere
            # - TODO confirm.
            self.stats.write("exc_pickle_fail", True)
            pickled_exc = pickle.dumps({
                'exc_type': str(exc_type),
                'exc_value': str(exc_value),
                'exc_traceback': exc_traceback,
                'pickle_exception': pickle_exception
            })
            pickle.loads(pickled_exc
                         )  # this is just to make sure it can be unpickled
            self.stats.write("exc_info", str(pickled_exc))
    finally:
        # The result is uploaded only if there is one, the 'STORE_RESULT'
        # environment flag is true (default) and no exception was recorded.
        store_result = strtobool(os.environ.get('STORE_RESULT', 'True'))
        if result is not None and store_result and not exception:
            output_upload_start_tstamp = time.time()
            logger.info("Storing function result - Size: {}".format(
                sizeof_fmt(len(pickled_output))))
            self.internal_storage.put_data(self.output_key, pickled_output)
            output_upload_end_tstamp = time.time()
            self.stats.write(
                "output_upload_time",
                round(
                    output_upload_end_tstamp - output_upload_start_tstamp,
                    8))
        # Completion message, sent whether or not the function succeeded.
        self.jobrunner_conn.send("Finished")
        logger.info("Finished")
def function_handler(event):
    """
    Handle one function call described by `event`.

    Sets up logging and environment variables, sends an '__init__' status,
    runs a JobRunner in a separate process (or in a thread when the
    '__PW_LOCAL_EXECUTION' environment flag is true), waits for it up to
    the execution timeout, merges the jobrunner stats file into the call
    status and finally sends an '__end__' status.

    Exceptions raised inside the handler are not propagated: they are
    printed, pickled and recorded in the call status response.

    :param event: dict read for the keys 'log_level', 'config', 'call_id',
        'job_id', 'executor_id', 'runtime_name', 'runtime_memory',
        'execution_timeout', 'func_key', 'data_key', 'data_byte_range',
        'host_submit_tstamp', 'cloudbutton_version' and, optionally,
        'extra_env'.
    :return: None
    """
    start_tstamp = time.time()

    log_level = event['log_level']
    cloud_logging_config(log_level)
    logger.debug("Action handler started")

    # Variables from 'extra_env' are exported now and removed again in the
    # finally block at the end of the handler.
    extra_env = event.get('extra_env', {})
    os.environ.update(extra_env)

    os.environ.update({'CLOUDBUTTON_FUNCTION': 'True',
                       'PYTHONUNBUFFERED': 'True'})

    config = event['config']

    call_id = event['call_id']
    job_id = event['job_id']
    executor_id = event['executor_id']
    exec_id = "{}/{}/{}".format(executor_id, job_id, call_id)
    logger.info("Execution-ID: {}".format(exec_id))

    runtime_name = event['runtime_name']
    runtime_memory = event['runtime_memory']
    execution_timeout = event['execution_timeout']
    logger.debug("Runtime name: {}".format(runtime_name))
    logger.debug("Runtime memory: {}MB".format(runtime_memory))
    logger.debug("Function timeout: {}s".format(execution_timeout))

    func_key = event['func_key']
    data_key = event['data_key']
    data_byte_range = event['data_byte_range']

    storage_config = extract_storage_config(config)
    internal_storage = InternalStorage(storage_config)

    call_status = CallStatus(config, internal_storage)
    call_status.response['host_submit_tstamp'] = event['host_submit_tstamp']
    call_status.response['start_tstamp'] = start_tstamp
    context_dict = {
        'cloudbutton_version': os.environ.get("CLOUDBUTTON_VERSION"),
        'call_id': call_id,
        'job_id': job_id,
        'executor_id': executor_id,
        'activation_id': os.environ.get('__PW_ACTIVATION_ID')
    }
    call_status.response.update(context_dict)

    show_memory_peak = strtobool(os.environ.get('SHOW_MEMORY_PEAK', 'False'))

    try:
        # Host and runtime must run the same version; a mismatch is reported
        # through the generic exception path below.
        if version.__version__ != event['cloudbutton_version']:
            msg = ("Cloudbutton version mismatch. Host version: {} - Runtime version: {}"
                   .format(event['cloudbutton_version'], version.__version__))
            raise RuntimeError('HANDLER', msg)

        # send init status event
        call_status.send('__init__')

        # call_status.response['free_disk_bytes'] = free_disk_space("/tmp")
        custom_env = {'CLOUDBUTTON_CONFIG': json.dumps(config),
                      'CLOUDBUTTON_EXECUTION_ID': exec_id,
                      'PYTHONPATH': "{}:{}".format(os.getcwd(), LIBS_PATH)}
        os.environ.update(custom_env)

        jobrunner_stats_dir = os.path.join(STORAGE_FOLDER,
                                           storage_config['bucket'],
                                           JOBS_PREFIX, executor_id, job_id, call_id)
        os.makedirs(jobrunner_stats_dir, exist_ok=True)
        jobrunner_stats_filename = os.path.join(jobrunner_stats_dir, 'jobrunner.stats.txt')

        jobrunner_config = {'cloudbutton_config': config,
                            'call_id': call_id,
                            'job_id': job_id,
                            'executor_id': executor_id,
                            'func_key': func_key,
                            'data_key': data_key,
                            'log_level': log_level,
                            'data_byte_range': data_byte_range,
                            'output_key': create_output_key(JOBS_PREFIX, executor_id, job_id, call_id),
                            'stats_filename': jobrunner_stats_filename}

        if show_memory_peak:
            mm_handler_conn, mm_conn = Pipe()
            memory_monitor = Thread(target=memory_monitor_worker, args=(mm_conn, ))
            memory_monitor.start()

        handler_conn, jobrunner_conn = Pipe()
        jobrunner = JobRunner(jobrunner_config, jobrunner_conn, internal_storage)
        logger.debug('Starting JobRunner process')
        local_execution = strtobool(os.environ.get('__PW_LOCAL_EXECUTION', 'False'))
        # A Thread is used for local execution, a separate Process otherwise.
        jrp = Thread(target=jobrunner.run) if local_execution else Process(target=jobrunner.run)
        jrp.start()

        # Wait for the jobrunner, but no longer than the execution timeout.
        jrp.join(execution_timeout)
        logger.debug('JobRunner process finished')

        if jrp.is_alive():
            # If the jobrunner is still alive after the join timeout, kill it
            try:
                jrp.terminate()
            except Exception:
                # thread does not have terminate method
                pass
            # NOTE(review): when show_memory_peak is set, this path raises
            # before 'STOP' is sent to the memory monitor thread - confirm
            # the worker terminates on its own in that case.
            msg = ('Function exceeded maximum time of {} seconds and was '
                   'killed'.format(execution_timeout))
            raise TimeoutError('HANDLER', msg)

        if show_memory_peak:
            mm_handler_conn.send('STOP')
            memory_monitor.join()
            peak_memory_usage = int(mm_handler_conn.recv())
            logger.info("Peak memory usage: {}".format(sizeof_fmt(peak_memory_usage)))
            call_status.response['peak_memory_usage'] = peak_memory_usage

        if not handler_conn.poll():
            logger.error('No completion message received from JobRunner process')
            logger.debug('Assuming memory overflow...')
            # Only 1 message is returned by jobrunner when it finishes.
            # If no message, this means that the jobrunner process was killed.
            # 99% of times the jobrunner is killed due an OOM, so we assume here an OOM.
            msg = 'Function exceeded maximum memory and was killed'
            raise MemoryError('HANDLER', msg)

        if os.path.exists(jobrunner_stats_filename):
            with open(jobrunner_stats_filename, 'r') as fid:
                # Each line is split on the first space into a key and a value.
                for l in fid.readlines():
                    key, value = l.strip().split(" ", 1)
                    # Values are stored as float when they parse as one,
                    # otherwise as the raw string.
                    try:
                        call_status.response[key] = float(value)
                    except Exception:
                        call_status.response[key] = value
                    # NOTE(review): eval() executes whatever text the stats
                    # file holds for these keys. The file is presumably
                    # written by the jobrunner, which also runs user code -
                    # consider a safer parse; TODO confirm.
                    if key in ['exception', 'exc_pickle_fail', 'result', 'new_futures']:
                        call_status.response[key] = eval(value)

    except Exception:
        # internal runtime exceptions
        print('----------------------- EXCEPTION !-----------------------', flush=True)
        traceback.print_exc(file=sys.stdout)
        print('----------------------------------------------------------', flush=True)
        call_status.response['exception'] = True

        pickled_exc = pickle.dumps(sys.exc_info())
        pickle.loads(pickled_exc)  # this is just to make sure they can be unpickled
        # The pickled bytes are recorded as their str() representation.
        call_status.response['exc_info'] = str(pickled_exc)

    finally:
        # The end status is always sent, on success and on failure.
        call_status.response['end_tstamp'] = time.time()
        call_status.send('__end__')

        # Remove the variables that were exported from 'extra_env'; the
        # other variables set by this handler are left in os.environ.
        for key in extra_env:
            os.environ.pop(key)

        logger.info("Finished")