def create_map_job(config, internal_storage, executor_id, map_job_id, map_function, iterdata, runtime_meta,
                   runtime_memory=None, extra_params=None, extra_env=None, obj_chunk_size=None,
                   obj_chunk_number=None, remote_invocation=False, remote_invocation_groups=None,
                   invoke_pool_threads=128, include_modules=[], exclude_modules=[],
                   is_remote_cluster=False, execution_timeout=EXECUTION_TIMEOUT):
    """
    Wrapper to create a map job. It integrates COS logic to process objects.

    The input is first passed through ``utils.verify_args``. When
    ``map_function`` is an object processing function, the input is replaced
    by the partitions returned by ``create_partitions``. If remote invocation
    remains enabled, the job is rewritten so that every task runs
    ``remote_invoker``, which calls ``pw.map`` on its share of the original
    ``iterdata``.

    :return: the job description built by ``_create_job`` with an additional
             ``'parts_per_object'`` entry (``None`` when no partitioning ran).
    """
    job_func = map_function
    pool_threads = invoke_pool_threads
    parts_per_object = None

    job_iterdata = utils.verify_args(map_function, iterdata, extra_params)

    # Object processing: split the input objects by chunk size or chunk number
    if utils.is_object_processing_function(map_function):
        logger.debug('ExecutorID {} | JobID {} - Calling map on partitions from object storage flow'
                     .format(executor_id, map_job_id))
        job_iterdata, parts_per_object = create_partitions(config, job_iterdata,
                                                           obj_chunk_size, obj_chunk_number)

    # Remote invocation is pointless for a single task and disabled on a remote cluster
    original_total_tasks = len(job_iterdata)
    if is_remote_cluster or original_total_tasks == 1:
        remote_invocation = False

    if remote_invocation:
        def remote_invoker(input_data):
            pw = pywren.ibm_cf_executor()
            return pw.map(map_function, input_data,
                          runtime_memory=runtime_memory,
                          invoke_pool_threads=invoke_pool_threads,
                          extra_env=extra_env)

        job_func = remote_invoker

        # Each task receives one argument: a slice of the original iterdata
        if remote_invocation_groups:
            grouped = []
            for start in range(0, original_total_tasks, remote_invocation_groups):
                grouped.append([iterdata[start:start + remote_invocation_groups]])
        else:
            grouped = [iterdata]

        job_iterdata = utils.verify_args(remote_invoker, grouped, extra_params)
        pool_threads = 1

    job_description = _create_job(config, internal_storage, executor_id,
                                  map_job_id, job_func, job_iterdata,
                                  runtime_meta=runtime_meta,
                                  runtime_memory=runtime_memory,
                                  extra_env=extra_env,
                                  invoke_pool_threads=pool_threads,
                                  include_modules=include_modules,
                                  exclude_modules=exclude_modules,
                                  remote_invocation=remote_invocation,
                                  original_total_tasks=original_total_tasks,
                                  execution_timeout=execution_timeout)

    job_description['parts_per_object'] = parts_per_object

    return job_description
def map(self, map_function, iterdata, obj_chunk_size=None, extra_env=None, extra_meta=None,
        remote_invocation=False, remote_invocation_groups=None, invoke_pool_threads=128,
        data_all_as_one=True, job_max_runtime=wrenconfig.RUNTIME_TIMEOUT, overwrite_invoke_args=None,
        exclude_modules=None):
    """
    Wrapper to launch map() method. It integrates COS logic to process objects.

    For an object processing function the input is partitioned with
    ``create_partitions`` and the function is wrapped by
    ``partition_processor``. With remote invocation enabled and more than one
    input item, the work is delegated to ``remote_invoker`` tasks that call
    ``pw.map`` themselves.

    :return: tuple ``(map_futures, parts_per_object)``; ``parts_per_object``
             is ``None`` unless the object processing branch ran.
    """
    data = wrenutil.iterdata_as_list(iterdata)
    target_func = map_function
    target_iterdata = data
    pool_threads = invoke_pool_threads
    parts_per_object = None

    if wrenutil.is_object_processing(map_function):
        # Object processing function: create partitions according to chunk_size
        logger.debug("Calling map on partitions from object storage flow")
        arg_data = wrenutil.verify_args(map_function, data, object_processing=True)
        target_iterdata, parts_per_object = create_partitions(arg_data, obj_chunk_size,
                                                              COSBackend(self.config['ibm_cos']))
        target_func = partition_processor(map_function)

    # Remote invocation functionality
    # NOTE(review): the length and the slices below are taken from the raw
    # ``iterdata`` argument, not from the normalised ``data`` list — confirm
    # that callers always pass a sized, sliceable sequence.
    original_iterdata_len = len(iterdata)
    use_remote = original_iterdata_len > 1 and remote_invocation

    if use_remote:
        # Copy the values into locals so the closure does not capture ``self``
        runtime_name = self.runtime_name
        runtime_memory = self.runtime_memory
        rabbitmq_monitor = "PYWREN_RABBITMQ_MONITOR" in os.environ

        def remote_invoker(input_data):
            pw = pywren.ibm_cf_executor(runtime=runtime_name,
                                        runtime_memory=runtime_memory,
                                        rabbitmq_monitor=rabbitmq_monitor)
            return pw.map(map_function, input_data,
                          invoke_pool_threads=invoke_pool_threads,
                          extra_env=extra_env,
                          extra_meta=extra_meta)

        target_func = remote_invoker
        pool_threads = 1

        if remote_invocation_groups:
            target_iterdata = []
            for start in range(0, original_iterdata_len, remote_invocation_groups):
                target_iterdata.append([iterdata[start:start + remote_invocation_groups]])
        else:
            target_iterdata = [iterdata]

    map_futures = self._map(target_func, target_iterdata,
                            extra_env=extra_env,
                            extra_meta=extra_meta,
                            invoke_pool_threads=pool_threads,
                            data_all_as_one=data_all_as_one,
                            overwrite_invoke_args=overwrite_invoke_args,
                            exclude_modules=exclude_modules,
                            original_func_name=map_function.__name__,
                            remote_invocation=remote_invocation,
                            original_iterdata_len=original_iterdata_len,
                            job_max_runtime=job_max_runtime)

    return map_futures, parts_per_object
def create_call_async_job(config, internal_storage, executor_id, async_job_id, func, data, runtime_meta,
                          extra_env=None, runtime_memory=None, include_modules=[], exclude_modules=[],
                          execution_timeout=EXECUTION_TIMEOUT):
    """
    Wrapper to create call_async job that contains only one function invocation.

    ``data`` is wrapped in a one-element list, passed through
    ``utils.verify_args`` and handed to ``_create_job``.

    :return: whatever ``_create_job`` returns for the single-call job.
    """
    single_call_data = utils.verify_args(func, [data], None)

    job_options = {
        'runtime_memory': runtime_memory,
        'extra_env': extra_env,
        'execution_timeout': execution_timeout,
        'exclude_modules': exclude_modules,
        'include_modules': include_modules,
    }

    return _create_job(config, internal_storage, executor_id, async_job_id,
                       func, single_call_data, runtime_meta, **job_options)
def create_reduce_job(config, internal_storage, executor_id, reduce_job_id, reduce_function, map_job, map_futures,
                      runtime_meta, reducer_one_per_object=False, runtime_memory=None, extra_env=None,
                      include_modules=[], exclude_modules=[], execution_timeout=None):
    """
    Wrapper to create a reduce job. Apply a function across all map futures.

    By default a single reducer receives the complete ``map_futures`` list.
    When ``map_job`` carries ``'parts_per_object'`` and
    ``reducer_one_per_object`` is set, one reducer is created per object and
    receives only the consecutive futures that belong to that object.

    The environment passed to ``_create_job`` always contains
    ``'__PW_REDUCE_JOB': True``; the caller's ``extra_env`` is copied, never
    modified.
    """
    job_created_tstamp = time.time()

    if 'parts_per_object' in map_job and reducer_one_per_object:
        # Consecutive slices of map_futures, one slice per object
        iterdata = []
        first = 0
        for n_parts in map_job['parts_per_object']:
            last = first + n_parts
            iterdata.append([map_futures[first:last]])
            first = last
    else:
        iterdata = [[map_futures]]

    ext_env = {} if extra_env is None else extra_env.copy()
    ext_env.update({'__PW_REDUCE_JOB': True})

    iterdata = utils.verify_args(reduce_function, iterdata, None)

    return _create_job(config, internal_storage, executor_id,
                       reduce_job_id, reduce_function, iterdata,
                       runtime_meta=runtime_meta,
                       runtime_memory=runtime_memory,
                       extra_env=ext_env,
                       include_modules=include_modules,
                       exclude_modules=exclude_modules,
                       execution_timeout=execution_timeout,
                       job_created_tstamp=job_created_tstamp)
def create_map_job(config, internal_storage, executor_id, job_id, map_function, iterdata, runtime_meta,
                   runtime_memory=None, extra_params=None, extra_env=None, obj_chunk_size=None,
                   obj_chunk_number=None, invoke_pool_threads=128, include_modules=[], exclude_modules=[],
                   execution_timeout=None):
    """
    Wrapper to create a map job. It integrates COS logic to process objects.

    Steps, in order: record the creation timestamp, pass the input through
    ``utils.verify_args``, create the RabbitMQ resources when
    ``config['pywren']['rabbitmq_monitor']`` is enabled, partition the input
    for object processing functions, and build the job with ``_create_job``.

    :return: the job description from ``_create_job``; a
             ``'parts_per_object'`` entry is added only when partitioning
             produced a non-empty value.
    """
    job_created_timestamp = time.time()

    job_iterdata = utils.verify_args(map_function, iterdata, extra_params)

    monitor_enabled = config['pywren'].get('rabbitmq_monitor', False)
    if monitor_enabled:
        utils.create_rabbitmq_resources(config['rabbitmq'].get('amqp_url'),
                                        executor_id, job_id)

    # Object processing: split the input objects by chunk size or chunk number
    parts_per_object = None
    if is_object_processing_function(map_function):
        logger.debug('ExecutorID {} | JobID {} - Calling map on partitions from object storage flow'
                     .format(executor_id, job_id))
        job_iterdata, parts_per_object = create_partitions(config, job_iterdata,
                                                           obj_chunk_size, obj_chunk_number)

    job_description = _create_job(config, internal_storage, executor_id,
                                  job_id, map_function, job_iterdata,
                                  runtime_meta=runtime_meta,
                                  runtime_memory=runtime_memory,
                                  extra_env=extra_env,
                                  invoke_pool_threads=invoke_pool_threads,
                                  include_modules=include_modules,
                                  exclude_modules=exclude_modules,
                                  execution_timeout=execution_timeout,
                                  job_created_timestamp=job_created_timestamp)

    if parts_per_object:
        job_description['parts_per_object'] = parts_per_object

    return job_description
def create_map_job(config, internal_storage, executor_id, map_job_id, map_function, iterdata, runtime_meta,
                   runtime_memory=None, extra_params=None, extra_env=None, obj_chunk_size=None,
                   obj_chunk_number=None, invoke_pool_threads=128, include_modules=[], exclude_modules=[],
                   execution_timeout=EXECUTION_TIMEOUT):
    """
    Wrapper to create a map job. It integrates COS logic to process objects.

    The input is passed through ``utils.verify_args``; for object processing
    functions it is then replaced by the partitions returned by
    ``create_partitions``.

    :return: the job description from ``_create_job`` with a
             ``'parts_per_object'`` entry that is always set (``None`` when
             no partitioning ran).
    """
    job_iterdata = utils.verify_args(map_function, iterdata, extra_params)
    parts_per_object = None

    # Object processing: split the input objects by chunk size or chunk number
    if utils.is_object_processing_function(map_function):
        logger.debug('ExecutorID {} | JobID {} - Calling map on partitions from object storage flow'
                     .format(executor_id, map_job_id))
        job_iterdata, parts_per_object = create_partitions(config, job_iterdata,
                                                           obj_chunk_size, obj_chunk_number)

    job_description = _create_job(config, internal_storage, executor_id,
                                  map_job_id, map_function, job_iterdata,
                                  runtime_meta=runtime_meta,
                                  runtime_memory=runtime_memory,
                                  extra_env=extra_env,
                                  invoke_pool_threads=invoke_pool_threads,
                                  include_modules=include_modules,
                                  exclude_modules=exclude_modules,
                                  execution_timeout=execution_timeout)

    job_description['parts_per_object'] = parts_per_object

    return job_description
def create_reduce_job(config, internal_storage, executor_id, reduce_job_id, reduce_function, map_job, map_futures,
                      runtime_meta, reducer_one_per_object=False, runtime_memory=None, extra_env=None,
                      include_modules=[], exclude_modules=[]):
    """
    Wrapper to create a reduce job. Apply a function across all map futures.

    By default one reducer receives every map future. When
    ``map_job['parts_per_object']`` is non-empty and
    ``reducer_one_per_object`` is set, one reducer is created per object with
    the consecutive futures that belong to it.

    The job does not run ``reduce_function`` directly: it runs
    ``reduce_function_wrapper``, which waits for the futures, collects their
    results and then calls ``reduce_function`` with ``results`` plus
    ``ibm_cos`` / ``internal_storage`` if its signature names them. The job
    is still registered under ``reduce_function.__name__``.
    """
    if map_job['parts_per_object'] and reducer_one_per_object:
        # Consecutive slices of map_futures, one slice per object
        iterdata = []
        first = 0
        for n_parts in map_job['parts_per_object']:
            last = first + n_parts
            iterdata.append([map_futures[first:last]])
            first = last
    else:
        iterdata = [[map_futures]]

    def reduce_function_wrapper(fut_list, internal_storage, ibm_cos):
        logger.info('Waiting for results')

        # NOTE(review): eval() executes whatever string is stored in the
        # SHOW_MEMORY_USAGE environment variable, and raises for values such
        # as "true". Kept unchanged to preserve behaviour; consider parsing
        # the variable as a plain boolean flag instead.
        show_memory = False
        if 'SHOW_MEMORY_USAGE' in os.environ:
            show_memory = eval(os.environ['SHOW_MEMORY_USAGE'])

        # Wait for all results
        wait_storage(fut_list, internal_storage, download_results=True)

        results = []
        for fut in fut_list:
            if fut.done and not fut.futures:
                results.append(fut.result())
        # Empty the caller's list in place once the results are collected
        fut_list.clear()

        if show_memory:
            logger.debug("Memory usage after getting the results: {}".format(utils.get_current_memory_usage()))

        # Run reduce function, passing only the optional arguments it declares
        reduce_func_args = {'results': results}
        accepted = inspect.signature(reduce_function).parameters
        if 'ibm_cos' in accepted:
            reduce_func_args['ibm_cos'] = ibm_cos
        if 'internal_storage' in accepted:
            reduce_func_args['internal_storage'] = internal_storage

        return reduce_function(**reduce_func_args)

    iterdata = utils.verify_args(reduce_function_wrapper, iterdata, None)

    return _create_job(config, internal_storage, executor_id,
                       reduce_job_id, reduce_function_wrapper, iterdata,
                       runtime_meta=runtime_meta,
                       runtime_memory=runtime_memory,
                       extra_env=extra_env,
                       include_modules=include_modules,
                       exclude_modules=exclude_modules,
                       original_func_name=reduce_function.__name__)
def _map(self, func, iterdata, extra_env=None, extra_meta=None, invoke_pool_threads=128, data_all_as_one=True,
         overwrite_invoke_args=None, exclude_modules=None, original_func_name=None, remote_invocation=False,
         original_iterdata_len=None, job_max_runtime=wrenconfig.RUNTIME_TIMEOUT):
    """
    Serialize ``func`` and the input data, upload them to the internal
    storage and invoke one call per data item through a thread pool.

    :param func: the function to map over the data
    :param iterdata: An iterable of input data
    :param extra_env: Additional environment variables for CF environment. Default None.
    :param extra_meta: Additional metadata to pass to CF. Default None.
    :param invoke_pool_threads: Number of threads to use to invoke.
    :param data_all_as_one: upload the data as a single object. Default True
    :param overwrite_invoke_args: Overwrite other args. Mainly used for testing.
    :param exclude_modules: Module names whose paths are removed from the
                            pickled module dependencies.
    :param original_func_name: Name to report for the function; defaults to ``func.__name__``.
    :param remote_invocation: Enable remote invocation. Default False. Only affects the log message here.
    :param original_iterdata_len: Number of items of the original map call; only used in the
                                  remote invocation log message.
    :param job_max_runtime: Forwarded unchanged to ``self.invoke_with_keys``.

    :return: A list with one entry per data item, each being the value returned by
             ``self.invoke_with_keys`` for that call. An empty list when there is no data.
    :raises ValueError: if ``self.map_item_limit`` is set and ``len(data)`` exceeds it.
    """
    func_name = original_func_name if original_func_name else func.__name__

    data = wrenutil.iterdata_as_list(iterdata)

    if extra_env is not None:
        extra_env = wrenutil.convert_bools_to_string(extra_env)

    if not data:
        return []

    if self.map_item_limit is not None and len(data) > self.map_item_limit:
        # The message used to be built from adjacent literals without
        # separators, which glued the words together.
        raise ValueError("len(data) = {}, exceeding map item limit of {}; "
                         "consider mapping over a smaller "
                         "number of items".format(len(data), self.map_item_limit))

    # This allows multiple parameters in functions
    data = wrenutil.verify_args(func, data)

    callgroup_id = wrenutil.create_callgroup_id()

    host_job_meta = {}

    log_msg = 'Executor ID {} Serializing function and data'.format(self.executor_id)
    logger.debug(log_msg)
    # pickle func and all data (to capture module dependencies)
    func_and_data_ser, mod_paths = self.serializer([func] + data)

    func_str = func_and_data_ser[0]
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)

    agg_data_key = None
    host_job_meta['agg_data'] = False
    host_job_meta['data_size_bytes'] = data_size_bytes

    log_msg = 'Executor ID {} Uploading function and data'.format(self.executor_id)
    logger.info(log_msg)
    if not self.log_level:
        print(log_msg, end=' ')

    if data_size_bytes < wrenconfig.MAX_AGG_DATA_SIZE and data_all_as_one:
        # Upload every data item as one aggregated object; each call later
        # receives the byte range of its own item.
        agg_data_key = create_agg_data_key(self.internal_storage.prefix, self.executor_id, callgroup_id)
        agg_data_bytes, agg_data_ranges = self.agg_data(data_strs)
        agg_upload_time = time.time()
        self.internal_storage.put_data(agg_data_key, agg_data_bytes)
        host_job_meta['agg_data'] = True
        host_job_meta['data_upload_time'] = time.time() - agg_upload_time
        host_job_meta['data_upload_timestamp'] = time.time()
    elif data_size_bytes >= wrenconfig.MAX_AGG_DATA_SIZE:
        # Warn only when the size limit was really exceeded; previously the
        # warning also fired when aggregation was just disabled by the caller.
        log_msg = ('Executor ID {} Total data exceeded '
                   'maximum size of {} bytes'.format(self.executor_id,
                                                     wrenconfig.MAX_AGG_DATA_SIZE))
        logger.warning(log_msg)

    if exclude_modules:
        for module in exclude_modules:
            # Iterate over a copy because mod_paths is modified in the loop
            for mod_path in list(mod_paths):
                if module in mod_path and mod_path in mod_paths:
                    mod_paths.remove(mod_path)

    module_data = create_mod_data(mod_paths)
    # Create func and upload
    func_module_str = pickle.dumps({'func': func_str,
                                    'module_data': module_data}, -1)
    host_job_meta['func_module_bytes'] = len(func_module_str)

    func_upload_time = time.time()
    func_key = create_func_key(self.internal_storage.prefix, self.executor_id, callgroup_id)
    self.internal_storage.put_func(func_key, func_module_str)
    host_job_meta['func_upload_time'] = time.time() - func_upload_time
    host_job_meta['func_upload_timestamp'] = time.time()

    if not self.log_level:
        func_and_data_size = wrenutil.sizeof_fmt(host_job_meta['func_module_bytes'] +
                                                 host_job_meta['data_size_bytes'])
        log_msg = '- Total: {}'.format(func_and_data_size)
        print(log_msg)

    def invoke(data_str, executor_id, callgroup_id, call_id, func_key,
               host_job_meta, agg_data_key=None, data_byte_range=None):
        data_key, output_key, status_key = create_keys(self.internal_storage.prefix,
                                                       executor_id, callgroup_id, call_id)
        host_job_meta['job_invoke_timestamp'] = time.time()

        if agg_data_key is None:
            # No aggregated object: upload this call's data under its own key
            data_upload_time = time.time()
            self.internal_storage.put_data(data_key, data_str)
            data_upload_time = time.time() - data_upload_time
            host_job_meta['data_upload_time'] = data_upload_time
            host_job_meta['data_upload_timestamp'] = time.time()
        else:
            data_key = agg_data_key

        return self.invoke_with_keys(func_key, data_key, output_key, status_key,
                                     executor_id, callgroup_id, call_id, extra_env,
                                     extra_meta, data_byte_range, host_job_meta.copy(),
                                     job_max_runtime,
                                     overwrite_invoke_args=overwrite_invoke_args)

    N = len(data)
    call_futures = []

    # original_iterdata_len defaults to None; comparing None with an int
    # raises TypeError, so guard the comparison.
    spawning_remote = (remote_invocation and original_iterdata_len is not None
                       and original_iterdata_len > 1)
    if spawning_remote:
        log_msg = 'Executor ID {} Starting {} remote invocation function: Spawning {}() - Total: {} activations'.format(self.executor_id, N, func_name, original_iterdata_len)
    else:
        log_msg = 'Executor ID {} Starting function invocation: {}() - Total: {} activations'.format(self.executor_id, func_name, N)
    logger.info(log_msg)
    if not self.log_level:
        print(log_msg)

    with ThreadPoolExecutor(max_workers=invoke_pool_threads) as executor:
        for i in range(N):
            call_id = "{:05d}".format(i)

            data_byte_range = None
            if agg_data_key is not None:
                data_byte_range = agg_data_ranges[i]

            # Every call gets its own copy of the metadata dict
            future = executor.submit(invoke, data_strs[i], self.executor_id,
                                     callgroup_id, call_id, func_key,
                                     host_job_meta.copy(),
                                     agg_data_key,
                                     data_byte_range)
            call_futures.append(future)

    return [ft.result() for ft in call_futures]
def create_map_job(config, internal_storage, executor_id, job_id, map_function, iterdata, obj_chunk_size=None,
                   extra_env=None, extra_meta=None, runtime_memory=None, remote_invocation=False,
                   remote_invocation_groups=None, invoke_pool_threads=128, exclude_modules=None,
                   is_cf_cluster=False, execution_timeout=EXECUTION_TIMEOUT, overwrite_invoke_args=None):
    """
    Wrapper to create a map job. It integrates COS logic to process objects.

    The job is registered under the id ``'M' + job_id``. For an object
    processing function the input is partitioned with ``create_partitions``
    and the function is wrapped by ``partition_processor``. If remote
    invocation stays enabled, every task runs ``remote_invoker``, which calls
    ``pw.map`` on its share of the original ``iterdata``.

    :return: tuple ``(job_description, parts_per_object)``;
             ``parts_per_object`` is ``None`` unless partitioning ran.
    """
    map_job_id = f'M{job_id}'
    data = utils.iterdata_as_list(iterdata)

    job_func = map_function
    job_iterdata = data
    pool_threads = invoke_pool_threads
    parts_per_object = None

    # Object processing functionality
    if utils.is_object_processing_function(map_function):
        # Object processing function: create partitions according to chunk_size
        logger.debug('ExecutorID {} | JobID {} - Calling map on partitions from object storage flow'
                     .format(executor_id, job_id))
        arg_data = utils.verify_args(map_function, data, object_processing=True)
        job_iterdata, parts_per_object = create_partitions(config, arg_data, obj_chunk_size)
        job_func = partition_processor(map_function)

    # Remote invocation is skipped for a single task or inside a CF cluster
    original_total_tasks = len(job_iterdata)
    if is_cf_cluster or original_total_tasks == 1:
        remote_invocation = False

    if remote_invocation:
        rabbitmq_monitor = "CB_RABBITMQ_MONITOR" in os.environ

        def remote_invoker(input_data):
            pw = pywren.ibm_cf_executor(rabbitmq_monitor=rabbitmq_monitor)
            return pw.map(map_function, input_data,
                          runtime_memory=runtime_memory,
                          invoke_pool_threads=invoke_pool_threads,
                          extra_env=extra_env,
                          extra_meta=extra_meta)

        job_func = remote_invoker
        pool_threads = 1

        # NOTE(review): the slices are taken from the raw ``iterdata`` while
        # the range is bounded by the task count computed above — confirm the
        # two always describe the same sequence.
        if remote_invocation_groups:
            job_iterdata = []
            for start in range(0, original_total_tasks, remote_invocation_groups):
                job_iterdata.append([iterdata[start:start + remote_invocation_groups]])
        else:
            job_iterdata = [iterdata]

    job_description = _create_job(config, internal_storage, executor_id,
                                  map_job_id, job_func, job_iterdata,
                                  extra_env=extra_env,
                                  extra_meta=extra_meta,
                                  runtime_memory=runtime_memory,
                                  invoke_pool_threads=pool_threads,
                                  overwrite_invoke_args=overwrite_invoke_args,
                                  exclude_modules=exclude_modules,
                                  original_func_name=map_function.__name__,
                                  remote_invocation=remote_invocation,
                                  original_total_tasks=original_total_tasks,
                                  execution_timeout=execution_timeout)

    return job_description, parts_per_object
def _create_job(config, internal_storage, executor_id, job_id, func, iterdata, extra_env=None, extra_meta=None,
                runtime_memory=None, invoke_pool_threads=128, overwrite_invoke_args=None, exclude_modules=None,
                original_func_name=None, remote_invocation=False, original_total_tasks=None,
                execution_timeout=EXECUTION_TIMEOUT):
    """
    Serialize the function and its data, upload both to the internal storage
    and return a description of the job.

    :param func: the function to map over the data
    :param iterdata: An iterable of input data
    :param extra_env: Additional environment variables for CF environment. Default None.
    :param extra_meta: Additional metadata to pass to CF. Default None.
    :param runtime_memory: Memory of the runtime; read from ``config['pywren']['runtime_memory']`` when None.
    :param invoke_pool_threads: Number of threads to use to invoke.
    :param overwrite_invoke_args: Overwrite other args. Mainly used for testing.
    :param exclude_modules: Module names whose paths are removed from the pickled module dependencies.
    :param original_func_name: Name to record for the function; defaults to ``func.__name__``.
    :param remote_invocation: Enable remote invocation. Default False. Stored in the job description.
    :param original_total_tasks: Stored in the job description as ``'original_total_calls'``.
    :param execution_timeout: Stored in the job description as ``'task_execution_timeout'``.

    :return: an empty list when there is no input data, otherwise the job description dict.
    :raises Exception: when the serialized data is not smaller than ``MAX_AGG_DATA_SIZE``.
    """
    log_level = os.getenv('CB_LOG_LEVEL')

    pywren_config = config['pywren']
    runtime_name = pywren_config['runtime']
    if runtime_memory is None:
        runtime_memory = pywren_config['runtime_memory']
    runtime_memory = int(runtime_memory)

    runtime_preinstalls = select_runtime(config, internal_storage, executor_id,
                                         job_id, runtime_name, runtime_memory)
    serializer = SerializeIndependent(runtime_preinstalls)

    func_name = original_func_name or func.__name__

    data = utils.iterdata_as_list(iterdata)

    if extra_env is not None:
        extra_env = utils.convert_bools_to_string(extra_env)

    if not data:
        return []

    # This allows multiple parameters in functions
    data = utils.verify_args(func, data)

    job_description = {
        'runtime_name': runtime_name,
        'runtime_memory': runtime_memory,
        'task_execution_timeout': execution_timeout,
        'func_name': func_name,
        'extra_env': extra_env,
        'extra_meta': extra_meta,
        'total_calls': len(data),
        'invoke_pool_threads': invoke_pool_threads,
        'overwrite_invoke_args': overwrite_invoke_args,
        'job_id': job_id,
        'remote_invocation': remote_invocation,
        'original_total_calls': original_total_tasks,
    }

    logger.debug('ExecutorID {} | JobID {} - Serializing function and data'.format(
        executor_id, job_id))

    # pickle func and all data (to capture module dependencies)
    serialized, mod_paths = serializer([func] + data)
    func_str, data_strs = serialized[0], serialized[1:]
    data_size_bytes = sum(len(item) for item in data_strs)

    host_job_meta = {'agg_data': False, 'data_size_bytes': data_size_bytes}

    upload_msg = 'ExecutorID {} | JobID {} - Uploading function and data'.format(
        executor_id, job_id)
    logger.info(upload_msg)
    if not log_level:
        print(upload_msg, end=' ')

    if data_size_bytes >= MAX_AGG_DATA_SIZE:
        raise Exception('ExecutorID {} | JobID {} - Total data exceeded '
                        'maximum size of {} bytes'.format(executor_id, job_id, MAX_AGG_DATA_SIZE))

    # Upload all data items as one aggregated object plus their byte ranges
    agg_data_key = create_agg_data_key(internal_storage.prefix, executor_id, job_id)
    job_description['data_key'] = agg_data_key
    agg_data_bytes, agg_data_ranges = _agg_data(data_strs)
    job_description['data_ranges'] = agg_data_ranges
    upload_start = time.time()
    internal_storage.put_data(agg_data_key, agg_data_bytes)
    host_job_meta['agg_data'] = True
    host_job_meta['data_upload_time'] = time.time() - upload_start
    host_job_meta['data_upload_timestamp'] = time.time()

    for module in exclude_modules or ():
        # Iterate over a copy because mod_paths is modified in the loop
        for mod_path in list(mod_paths):
            if module in mod_path and mod_path in mod_paths:
                mod_paths.remove(mod_path)

    module_data = create_module_data(mod_paths)

    # Create func and upload
    host_job_meta['func_name'] = func_name
    func_module_str = pickle.dumps({'func': func_str, 'module_data': module_data}, -1)
    host_job_meta['func_module_bytes'] = len(func_module_str)

    upload_start = time.time()
    func_key = create_func_key(internal_storage.prefix, executor_id, job_id)
    job_description['func_key'] = func_key
    internal_storage.put_func(func_key, func_module_str)
    host_job_meta['func_upload_time'] = time.time() - upload_start
    host_job_meta['func_upload_timestamp'] = time.time()

    if not log_level:
        total_size = utils.sizeof_fmt(host_job_meta['func_module_bytes'] +
                                      host_job_meta['data_size_bytes'])
        print('- Total: {}'.format(total_size))

    job_description['host_job_meta'] = host_job_meta

    return job_description