def map(self, map_function, iterdata, obj_chunk_size=None, extra_env=None,
        extra_meta=None, remote_invocation=False, remote_invocation_groups=None,
        invoke_pool_threads=128, data_all_as_one=True,
        job_max_runtime=wrenconfig.RUNTIME_TIMEOUT, overwrite_invoke_args=None,
        exclude_modules=None):
    """
    Wrapper to launch the map() method. It integrates COS logic to process objects.
    """
    data = wrenutil.iterdata_as_list(iterdata)
    map_func = map_function
    map_iterdata = data
    new_invoke_pool_threads = invoke_pool_threads
    parts_per_object = None

    if wrenutil.is_object_processing(map_function):
        # If it is an object processing function, create partitions according
        # to the chunk size
        logger.debug("Calling map on partitions from object storage flow")
        arg_data = wrenutil.verify_args(map_function, data, object_processing=True)
        storage = COSBackend(self.config['ibm_cos'])
        map_iterdata, parts_per_object = create_partitions(arg_data, obj_chunk_size, storage)
        map_func = partition_processor(map_function)

    # Remote invocation functionality
    original_iterdata_len = len(iterdata)
    if original_iterdata_len > 1 and remote_invocation:
        runtime_name = self.runtime_name
        runtime_memory = self.runtime_memory
        rabbitmq_monitor = "PYWREN_RABBITMQ_MONITOR" in os.environ

        def remote_invoker(input_data):
            pw = pywren.ibm_cf_executor(runtime=runtime_name,
                                        runtime_memory=runtime_memory,
                                        rabbitmq_monitor=rabbitmq_monitor)
            return pw.map(map_function, input_data,
                          invoke_pool_threads=invoke_pool_threads,
                          extra_env=extra_env,
                          extra_meta=extra_meta)

        map_func = remote_invoker
        if remote_invocation_groups:
            map_iterdata = [[iterdata[x:x + remote_invocation_groups]]
                            for x in range(0, original_iterdata_len, remote_invocation_groups)]
        else:
            map_iterdata = [iterdata]
        new_invoke_pool_threads = 1

    map_futures = self._map(map_func, map_iterdata,
                            extra_env=extra_env,
                            extra_meta=extra_meta,
                            invoke_pool_threads=new_invoke_pool_threads,
                            data_all_as_one=data_all_as_one,
                            overwrite_invoke_args=overwrite_invoke_args,
                            exclude_modules=exclude_modules,
                            original_func_name=map_function.__name__,
                            remote_invocation=remote_invocation,
                            original_iterdata_len=original_iterdata_len,
                            job_max_runtime=job_max_runtime)

    return map_futures, parts_per_object
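# Illustrative usage sketch (an assumption, not project code): calling the map() wrapper
# defined above. `executor` stands for an instance of the class that owns this method and
# `double` is a hypothetical user function; both names are placeholders. The return tuple
# (futures, parts_per_object) matches the signature above.
#
# def double(x):
#     return 2 * x
#
# futures, parts_per_object = executor.map(double, [1, 2, 3], invoke_pool_threads=64)
# # parts_per_object stays None here because `double` is not an object-processing function.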
def _map(self, func, iterdata, extra_env=None, extra_meta=None, invoke_pool_threads=128,
         data_all_as_one=True, overwrite_invoke_args=None, exclude_modules=None,
         original_func_name=None, remote_invocation=False, original_iterdata_len=None,
         job_max_runtime=wrenconfig.RUNTIME_TIMEOUT):
    """
    :param func: the function to map over the data
    :param iterdata: An iterable of input data
    :param extra_env: Additional environment variables for the CF environment. Default None.
    :param extra_meta: Additional metadata to pass to CF. Default None.
    :param remote_invocation: Enable remote invocation. Default False.
    :param invoke_pool_threads: Number of threads to use for invocation.
    :param data_all_as_one: Upload the data as a single object. Default True.
    :param overwrite_invoke_args: Overwrite other args. Mainly used for testing.
    :param exclude_modules: Explicitly exclude these modules from the pickled dependencies.
    :param original_func_name: Name of the function to invoke.
    :return: A list with size `len(iterdata)` of futures for each job
    :rtype: list of futures.
    """
    if original_func_name:
        func_name = original_func_name
    else:
        func_name = func.__name__

    data = wrenutil.iterdata_as_list(iterdata)

    if extra_env is not None:
        extra_env = wrenutil.convert_bools_to_string(extra_env)

    if not data:
        return []

    if self.map_item_limit is not None and len(data) > self.map_item_limit:
        raise ValueError("len(data) = {}, exceeding map item limit of {}. "
                         "Consider mapping over a smaller "
                         "number of items".format(len(data), self.map_item_limit))

    # This allows multiple parameters in functions
    data = wrenutil.verify_args(func, data)

    callgroup_id = wrenutil.create_callgroup_id()

    host_job_meta = {}

    log_msg = 'Executor ID {} Serializing function and data'.format(self.executor_id)
    logger.debug(log_msg)
    # pickle func and all data (to capture module dependencies)
    func_and_data_ser, mod_paths = self.serializer([func] + data)

    func_str = func_and_data_ser[0]
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)
    agg_data_key = None
    host_job_meta['agg_data'] = False
    host_job_meta['data_size_bytes'] = data_size_bytes

    log_msg = 'Executor ID {} Uploading function and data'.format(self.executor_id)
    logger.info(log_msg)
    if not self.log_level:
        print(log_msg, end=' ')

    if data_size_bytes < wrenconfig.MAX_AGG_DATA_SIZE and data_all_as_one:
        agg_data_key = create_agg_data_key(self.internal_storage.prefix,
                                           self.executor_id, callgroup_id)
        agg_data_bytes, agg_data_ranges = self.agg_data(data_strs)
        agg_upload_time = time.time()
        self.internal_storage.put_data(agg_data_key, agg_data_bytes)
        host_job_meta['agg_data'] = True
        host_job_meta['data_upload_time'] = time.time() - agg_upload_time
        host_job_meta['data_upload_timestamp'] = time.time()
    else:
        log_msg = ('Executor ID {} Total data exceeded '
                   'maximum size of {} bytes'.format(self.executor_id,
                                                     wrenconfig.MAX_AGG_DATA_SIZE))
        logger.warning(log_msg)

    if exclude_modules:
        for module in exclude_modules:
            for mod_path in list(mod_paths):
                if module in mod_path and mod_path in mod_paths:
                    mod_paths.remove(mod_path)

    module_data = create_mod_data(mod_paths)
    # Create func and upload
    func_module_str = pickle.dumps({'func': func_str, 'module_data': module_data}, -1)
    host_job_meta['func_module_bytes'] = len(func_module_str)

    func_upload_time = time.time()
    func_key = create_func_key(self.internal_storage.prefix, self.executor_id, callgroup_id)
    self.internal_storage.put_func(func_key, func_module_str)
    host_job_meta['func_upload_time'] = time.time() - func_upload_time
    host_job_meta['func_upload_timestamp'] = time.time()
    if not self.log_level:
        func_and_data_size = wrenutil.sizeof_fmt(host_job_meta['func_module_bytes']
                                                 + host_job_meta['data_size_bytes'])
        log_msg = '- Total: {}'.format(func_and_data_size)
        print(log_msg)

    def invoke(data_str, executor_id, callgroup_id, call_id, func_key,
               host_job_meta, agg_data_key=None, data_byte_range=None):
        data_key, output_key, status_key = create_keys(self.internal_storage.prefix,
                                                       executor_id, callgroup_id, call_id)
        host_job_meta['job_invoke_timestamp'] = time.time()

        if agg_data_key is None:
            data_upload_time = time.time()
            self.internal_storage.put_data(data_key, data_str)
            data_upload_time = time.time() - data_upload_time
            host_job_meta['data_upload_time'] = data_upload_time
            host_job_meta['data_upload_timestamp'] = time.time()
        else:
            data_key = agg_data_key

        return self.invoke_with_keys(func_key, data_key, output_key, status_key,
                                     executor_id, callgroup_id, call_id, extra_env,
                                     extra_meta, data_byte_range, host_job_meta.copy(),
                                     job_max_runtime,
                                     overwrite_invoke_args=overwrite_invoke_args)

    N = len(data)
    call_futures = []

    if remote_invocation and original_iterdata_len > 1:
        log_msg = ('Executor ID {} Starting {} remote invocation function: '
                   'Spawning {}() - Total: {} activations'
                   .format(self.executor_id, N, func_name, original_iterdata_len))
    else:
        log_msg = ('Executor ID {} Starting function invocation: {}() '
                   '- Total: {} activations'.format(self.executor_id, func_name, N))
    logger.info(log_msg)
    if not self.log_level:
        print(log_msg)

    with ThreadPoolExecutor(max_workers=invoke_pool_threads) as executor:
        for i in range(N):
            call_id = "{:05d}".format(i)
            data_byte_range = None
            if agg_data_key is not None:
                data_byte_range = agg_data_ranges[i]
            future = executor.submit(invoke, data_strs[i], self.executor_id,
                                     callgroup_id, call_id, func_key,
                                     host_job_meta.copy(), agg_data_key,
                                     data_byte_range)
            call_futures.append(future)

    res = [ft.result() for ft in call_futures]

    return res
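# Minimal sketch of the data aggregation scheme the code above relies on: the pickled
# payloads are concatenated into a single object and each call receives a (start, end)
# byte range so it can fetch only its own slice. The helper below is illustrative of what
# self.agg_data() / _agg_data() are expected to return (agg_data_bytes, agg_data_ranges);
# it is an assumption, not the project's actual implementation.
#
# def _agg_data_sketch(data_strs):
#     ranges = []
#     pos = 0
#     for datum in data_strs:
#         ranges.append((pos, pos + len(datum) - 1))   # inclusive byte range for one call
#         pos += len(datum)
#     return b"".join(data_strs), ranges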
def create_map_job(config, internal_storage, executor_id, job_id, map_function, iterdata,
                   obj_chunk_size=None, extra_env=None, extra_meta=None, runtime_memory=None,
                   remote_invocation=False, remote_invocation_groups=None,
                   invoke_pool_threads=128, exclude_modules=None, is_cf_cluster=False,
                   execution_timeout=EXECUTION_TIMEOUT, overwrite_invoke_args=None):
    """
    Wrapper to create a map job. It integrates COS logic to process objects.
    """
    map_job_id = f'M{job_id}'

    data = utils.iterdata_as_list(iterdata)
    map_func = map_function
    map_iterdata = data
    new_invoke_pool_threads = invoke_pool_threads
    new_runtime_memory = runtime_memory

    # Object processing functionality
    parts_per_object = None
    if utils.is_object_processing_function(map_function):
        # If it is an object processing function, create partitions according
        # to the chunk size
        logger.debug('ExecutorID {} | JobID {} - Calling map on partitions '
                     'from object storage flow'.format(executor_id, job_id))
        arg_data = utils.verify_args(map_function, data, object_processing=True)
        map_iterdata, parts_per_object = create_partitions(config, arg_data, obj_chunk_size)
        map_func = partition_processor(map_function)

    # ########
    # Remote invocation functionality
    original_total_tasks = len(map_iterdata)
    if original_total_tasks == 1 or is_cf_cluster:
        remote_invocation = False
    if remote_invocation:
        rabbitmq_monitor = "CB_RABBITMQ_MONITOR" in os.environ

        def remote_invoker(input_data):
            pw = pywren.ibm_cf_executor(rabbitmq_monitor=rabbitmq_monitor)
            return pw.map(map_function, input_data,
                          runtime_memory=runtime_memory,
                          invoke_pool_threads=invoke_pool_threads,
                          extra_env=extra_env,
                          extra_meta=extra_meta)

        map_func = remote_invoker
        if remote_invocation_groups:
            map_iterdata = [[iterdata[x:x + remote_invocation_groups]]
                            for x in range(0, original_total_tasks, remote_invocation_groups)]
        else:
            map_iterdata = [iterdata]
        new_invoke_pool_threads = 1
        new_runtime_memory = runtime_memory
    # ########

    job_description = _create_job(config, internal_storage, executor_id,
                                  map_job_id, map_func, map_iterdata,
                                  extra_env=extra_env,
                                  extra_meta=extra_meta,
                                  runtime_memory=new_runtime_memory,
                                  invoke_pool_threads=new_invoke_pool_threads,
                                  overwrite_invoke_args=overwrite_invoke_args,
                                  exclude_modules=exclude_modules,
                                  original_func_name=map_function.__name__,
                                  remote_invocation=remote_invocation,
                                  original_total_tasks=original_total_tasks,
                                  execution_timeout=execution_timeout)

    return job_description, parts_per_object
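# Illustrative sketch (an assumption, not project code) of how a caller might drive
# create_map_job(): the returned job_description is handed to whatever component actually
# invokes the job, and parts_per_object tells the caller how many partitions each object
# was split into. `my_map_function`, the COS URIs, and `invoker.run()` are hypothetical
# placeholders.
#
# job, parts_per_object = create_map_job(config, internal_storage, executor_id,
#                                        job_id='0', map_function=my_map_function,
#                                        iterdata=['cos://bucket/key1', 'cos://bucket/key2'],
#                                        obj_chunk_size=64 * 1024 ** 2)
# futures = invoker.run(job)   # hypothetical invoker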
def _create_job(config, internal_storage, executor_id, job_id, func, iterdata,
                extra_env=None, extra_meta=None, runtime_memory=None,
                invoke_pool_threads=128, overwrite_invoke_args=None, exclude_modules=None,
                original_func_name=None, remote_invocation=False, original_total_tasks=None,
                execution_timeout=EXECUTION_TIMEOUT):
    """
    :param func: the function to map over the data
    :param iterdata: An iterable of input data
    :param extra_env: Additional environment variables for the CF environment. Default None.
    :param extra_meta: Additional metadata to pass to CF. Default None.
    :param remote_invocation: Enable remote invocation. Default False.
    :param invoke_pool_threads: Number of threads to use for invocation.
    :param overwrite_invoke_args: Overwrite other args. Mainly used for testing.
    :param exclude_modules: Explicitly exclude these modules from the pickled dependencies.
    :param original_func_name: Name of the function to invoke.
    :return: A dictionary with the job description.
    :rtype: dict
    """
    log_level = os.getenv('CB_LOG_LEVEL')
    runtime_name = config['pywren']['runtime']
    if runtime_memory is None:
        runtime_memory = config['pywren']['runtime_memory']
    runtime_memory = int(runtime_memory)

    runtime_preinstalls = select_runtime(config, internal_storage, executor_id,
                                         job_id, runtime_name, runtime_memory)
    serializer = SerializeIndependent(runtime_preinstalls)

    if original_func_name:
        func_name = original_func_name
    else:
        func_name = func.__name__

    data = utils.iterdata_as_list(iterdata)

    if extra_env is not None:
        extra_env = utils.convert_bools_to_string(extra_env)

    if not data:
        return []

    # This allows multiple parameters in functions
    data = utils.verify_args(func, data)

    host_job_meta = {}
    job_description = {}

    job_description['runtime_name'] = runtime_name
    job_description['runtime_memory'] = runtime_memory
    job_description['task_execution_timeout'] = execution_timeout
    job_description['func_name'] = func_name
    job_description['extra_env'] = extra_env
    job_description['extra_meta'] = extra_meta
    job_description['total_calls'] = len(data)
    job_description['invoke_pool_threads'] = invoke_pool_threads
    job_description['overwrite_invoke_args'] = overwrite_invoke_args
    job_description['job_id'] = job_id
    job_description['remote_invocation'] = remote_invocation
    job_description['original_total_calls'] = original_total_tasks

    log_msg = 'ExecutorID {} | JobID {} - Serializing function and data'.format(executor_id, job_id)
    logger.debug(log_msg)
    # pickle func and all data (to capture module dependencies)
    func_and_data_ser, mod_paths = serializer([func] + data)

    func_str = func_and_data_ser[0]
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)

    host_job_meta['agg_data'] = False
    host_job_meta['data_size_bytes'] = data_size_bytes

    log_msg = 'ExecutorID {} | JobID {} - Uploading function and data'.format(executor_id, job_id)
    logger.info(log_msg)
    if not log_level:
        print(log_msg, end=' ')

    if data_size_bytes < MAX_AGG_DATA_SIZE:
        agg_data_key = create_agg_data_key(internal_storage.prefix, executor_id, job_id)
        job_description['data_key'] = agg_data_key
        agg_data_bytes, agg_data_ranges = _agg_data(data_strs)
        job_description['data_ranges'] = agg_data_ranges
        agg_upload_time = time.time()
        internal_storage.put_data(agg_data_key, agg_data_bytes)
        host_job_meta['agg_data'] = True
        host_job_meta['data_upload_time'] = time.time() - agg_upload_time
        host_job_meta['data_upload_timestamp'] = time.time()
    else:
        log_msg = ('ExecutorID {} | JobID {} - Total data exceeded '
                   'maximum size of {} bytes'.format(executor_id, job_id, MAX_AGG_DATA_SIZE))
        raise Exception(log_msg)

    if exclude_modules:
        for module in exclude_modules:
            for mod_path in list(mod_paths):
                if module in mod_path and mod_path in mod_paths:
                    mod_paths.remove(mod_path)

    module_data = create_module_data(mod_paths)
    # Create func and upload
    host_job_meta['func_name'] = func_name
    func_module_str = pickle.dumps({'func': func_str, 'module_data': module_data}, -1)
    host_job_meta['func_module_bytes'] = len(func_module_str)

    func_upload_time = time.time()
    func_key = create_func_key(internal_storage.prefix, executor_id, job_id)
    job_description['func_key'] = func_key
    internal_storage.put_func(func_key, func_module_str)
    host_job_meta['func_upload_time'] = time.time() - func_upload_time
    host_job_meta['func_upload_timestamp'] = time.time()

    if not log_level:
        func_and_data_size = utils.sizeof_fmt(host_job_meta['func_module_bytes']
                                              + host_job_meta['data_size_bytes'])
        log_msg = '- Total: {}'.format(func_and_data_size)
        print(log_msg)

    job_description['host_job_meta'] = host_job_meta

    return job_description
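# For reference, the job_description returned above carries the following keys (taken from
# the assignments in _create_job); the example values are placeholders, not real output:
#
# {
#     'runtime_name': 'python3.6',            # config['pywren']['runtime']
#     'runtime_memory': 256,
#     'task_execution_timeout': EXECUTION_TIMEOUT,
#     'func_name': 'my_map_function',         # hypothetical function name
#     'extra_env': None,
#     'extra_meta': None,
#     'total_calls': 2,                        # len(data)
#     'invoke_pool_threads': 128,
#     'overwrite_invoke_args': None,
#     'job_id': 'M0',
#     'remote_invocation': False,
#     'original_total_calls': 2,
#     'data_key': '<agg data key>',            # plus 'data_ranges' when data is aggregated
#     'func_key': '<func key>',
#     'host_job_meta': {...},
# }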