def map(self, func, iterdata, extra_env=None, extra_meta=None,
        invoke_pool_threads=64, data_all_as_one=True,
        use_cached_runtime=True):
    """
    # FIXME work with an actual iterable instead of just a list

    data_all_as_one : upload the data as a single s3 object; fewer tcp
        transactions (good) but potentially higher latency for workers (bad)

    use_cached_runtime : if the runtime has been cached, use that. When set
        to False, re-downloads the runtime.
    """

    host_job_meta = {}

    pool = ThreadPool(invoke_pool_threads)
    callset_id = s3util.create_callset_id()
    data = list(iterdata)

    ### pickle func and all data (to capture module dependencies)
    serializer = serialize.SerializeIndependent()
    func_and_data_ser, mod_paths = serializer([func] + data)

    func_str = func_and_data_ser[0]
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)
    s3_agg_data_key = None
    host_job_meta['agg_data_in_s3'] = False
    host_job_meta['data_size_bytes'] = data_size_bytes

    if data_size_bytes < wrenconfig.MAX_AGG_DATA_SIZE and data_all_as_one:
        s3_agg_data_key = s3util.create_agg_data_key(self.s3_bucket,
                                                     self.s3_prefix,
                                                     callset_id)
        agg_data_bytes, agg_data_ranges = self.agg_data(data_strs)
        agg_upload_time = time.time()
        self.s3client.put_object(Bucket=s3_agg_data_key[0],
                                 Key=s3_agg_data_key[1],
                                 Body=agg_data_bytes)
        host_job_meta['agg_data_in_s3'] = True
        host_job_meta['data_upload_time'] = time.time() - agg_upload_time
        host_job_meta['data_upload_timestamp'] = time.time()
    else:
        # FIXME add warning that you wanted data all as one but
        # it exceeded max data size
        pass

    module_data = self.create_mod_data(mod_paths)
    func_str_encoded = wrenutil.bytes_to_b64str(func_str)
    #debug_foo = {'func' : func_str_encoded,
    #             'module_data' : module_data}
    #pickle.dump(debug_foo, open("/tmp/py35.debug.pickle", 'wb'))

    ### Create func and upload
    func_module_str = json.dumps({'func' : func_str_encoded,
                                  'module_data' : module_data})
    host_job_meta['func_module_str_len'] = len(func_module_str)

    func_upload_time = time.time()
    s3_func_key = s3util.create_func_key(self.s3_bucket, self.s3_prefix,
                                         callset_id)
    self.s3client.put_object(Bucket=s3_func_key[0],
                             Key=s3_func_key[1],
                             Body=func_module_str)
    host_job_meta['func_upload_time'] = time.time() - func_upload_time
    host_job_meta['func_upload_timestamp'] = time.time()

    def invoke(data_str, callset_id, call_id, s3_func_key,
               host_job_meta,
               s3_agg_data_key=None, data_byte_range=None):
        s3_data_key, s3_output_key, s3_status_key \
            = s3util.create_keys(self.s3_bucket, self.s3_prefix,
                                 callset_id, call_id)

        host_job_meta['job_invoke_timestamp'] = time.time()

        if s3_agg_data_key is None:
            data_upload_time = time.time()
            self.put_data(s3_data_key, data_str, callset_id, call_id)
            data_upload_time = time.time() - data_upload_time
            host_job_meta['data_upload_time'] = data_upload_time
            host_job_meta['data_upload_timestamp'] = time.time()

            data_key = s3_data_key
        else:
            data_key = s3_agg_data_key

        return self.invoke_with_keys(s3_func_key, data_key,
                                     s3_output_key, s3_status_key,
                                     callset_id, call_id, extra_env,
                                     extra_meta, data_byte_range,
                                     use_cached_runtime, host_job_meta.copy(),
                                     self.job_max_runtime)

    N = len(data)
    call_result_objs = []
    for i in range(N):
        call_id = "{:05d}".format(i)

        data_byte_range = None
        if s3_agg_data_key is not None:
            data_byte_range = agg_data_ranges[i]

        cb = pool.apply_async(invoke, (data_strs[i], callset_id, call_id,
                                       s3_func_key, host_job_meta.copy(),
                                       s3_agg_data_key, data_byte_range))

        logger.info("map {} {} apply async".format(callset_id, call_id))
        call_result_objs.append(cb)

    res = [c.get() for c in call_result_objs]
    pool.close()
    pool.join()

    logger.info("map invoked {} {} pool join".format(callset_id, call_id))

    # FIXME take advantage of the callset to return a lot of these
    # note these are just the invocation futures
    return res
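# Usage sketch (illustrative addition, not part of the original module).
# It assumes the public pywren entry points pywren.default_executor() and
# pywren.get_all_results(); substitute whatever executor constructor this
# codebase actually exposes. The point is that map() returns one invocation
# future per element of iterdata, keyed by call ids "00000", "00001", ...
#
#     import pywren
#
#     def increment(x):
#         return x + 1
#
#     wrenexec = pywren.default_executor()
#     futures = wrenexec.map(increment, range(10),
#                            data_all_as_one=True,     # one aggregated data object
#                            use_cached_runtime=True)  # reuse cached runtime if present
#     results = pywren.get_all_results(futures)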
def map(self, func, iterdata, extra_env=None, extra_meta=None,
        invoke_pool_threads=64, data_all_as_one=True,
        use_cached_runtime=True, overwrite_invoke_args=None,
        exclude_modules=None):
    """
    # FIXME work with an actual iterable instead of just a list

    data_all_as_one : upload the data as a single object; fewer tcp
        transactions (good) but potentially higher latency for workers (bad)

    use_cached_runtime : if the runtime has been cached, use that. When set
        to False, re-downloads the runtime.
    """

    data = list(iterdata)
    if not data:
        return []

    if self.map_item_limit is not None and len(data) > self.map_item_limit:
        raise ValueError("len(data) = {}, exceeding map item limit of {}; "
                         "consider mapping over a smaller number of "
                         "items".format(len(data), self.map_item_limit))

    host_job_meta = {}

    pool = ThreadPool(invoke_pool_threads)
    callset_id = wrenutil.create_callset_id()

    ### pickle func and all data (to capture module dependencies)
    func_and_data_ser, mod_paths = self.serializer([func] + data)

    func_str = func_and_data_ser[0]
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)
    agg_data_key = None
    host_job_meta['agg_data'] = False
    host_job_meta['data_size_bytes'] = data_size_bytes

    if data_size_bytes < wrenconfig.MAX_AGG_DATA_SIZE and data_all_as_one:
        agg_data_key = storage_utils.create_agg_data_key(self.storage.prefix,
                                                         callset_id)
        agg_data_bytes, agg_data_ranges = self.agg_data(data_strs)
        agg_upload_time = time.time()
        self.storage.put_data(agg_data_key, agg_data_bytes)
        host_job_meta['agg_data'] = True
        host_job_meta['data_upload_time'] = time.time() - agg_upload_time
        host_job_meta['data_upload_timestamp'] = time.time()
    else:
        # FIXME add warning that you wanted data all as one but
        # it exceeded max data size
        pass

    if exclude_modules:
        for module in exclude_modules:
            for mod_path in list(mod_paths):
                if module in mod_path and mod_path in mod_paths:
                    mod_paths.remove(mod_path)

    module_data = create_mod_data(mod_paths)
    func_str_encoded = wrenutil.bytes_to_b64str(func_str)
    #debug_foo = {'func' : func_str_encoded,
    #             'module_data' : module_data}
    #pickle.dump(debug_foo, open("/tmp/py35.debug.pickle", 'wb'))

    ### Create func and upload
    func_module_str = json.dumps({'func' : func_str_encoded,
                                  'module_data' : module_data})
    host_job_meta['func_module_str_len'] = len(func_module_str)

    func_upload_time = time.time()
    func_key = create_func_key(self.storage.prefix, callset_id)
    self.storage.put_func(func_key, func_module_str)
    host_job_meta['func_upload_time'] = time.time() - func_upload_time
    host_job_meta['func_upload_timestamp'] = time.time()

    def invoke(data_str, callset_id, call_id, func_key,
               host_job_meta,
               agg_data_key=None, data_byte_range=None):
        data_key, output_key, status_key \
            = storage_utils.create_keys(self.storage.prefix,
                                        callset_id, call_id)

        host_job_meta['job_invoke_timestamp'] = time.time()

        if agg_data_key is None:
            data_upload_time = time.time()
            self.put_data(data_key, data_str, callset_id, call_id)
            data_upload_time = time.time() - data_upload_time
            host_job_meta['data_upload_time'] = data_upload_time
            host_job_meta['data_upload_timestamp'] = time.time()
        else:
            data_key = agg_data_key

        return self.invoke_with_keys(func_key, data_key,
                                     output_key, status_key,
                                     callset_id, call_id, extra_env,
                                     extra_meta, data_byte_range,
                                     use_cached_runtime, host_job_meta.copy(),
                                     self.job_max_runtime,
                                     overwrite_invoke_args=overwrite_invoke_args)

    N = len(data)
    call_result_objs = []
    for i in range(N):
        call_id = "{:05d}".format(i)

        data_byte_range = None
        if agg_data_key is not None:
            data_byte_range = agg_data_ranges[i]

        cb = pool.apply_async(invoke, (data_strs[i], callset_id, call_id,
                                       func_key, host_job_meta.copy(),
                                       agg_data_key, data_byte_range))

        logger.info("map {} {} apply async".format(callset_id, call_id))
        call_result_objs.append(cb)

    res = [c.get() for c in call_result_objs]
    pool.close()
    pool.join()

    logger.info("map invoked {} {} pool join".format(callset_id, call_id))

    # FIXME take advantage of the callset to return a lot of these
    # note these are just the invocation futures
    return res
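# Sketch of the aggregation helper assumed by the data_all_as_one path above
# (illustrative addition; the real self.agg_data() may differ in detail).
# Serialized items are concatenated into a single blob and each item's
# inclusive (start, end) byte range is recorded, so a worker can fetch just
# its slice of the one aggregated object with a ranged GET instead of
# requiring a separate per-call data object.
def agg_data_sketch(data_strs):
    ranges = []
    pos = 0
    for datum in data_strs:
        ranges.append((pos, pos + len(datum) - 1))  # inclusive byte range for this item
        pos += len(datum)
    return b"".join(data_strs), ranges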