def test_simple(x):
    """Smoke-test the pywren serializer and runtime-info lookup.

    Serializes a trivial closure plus a small list of ints, prints the
    module paths the serializer discovered, then fetches and prints the
    runtime metadata for the bucket/key named in the default config.
    """
    def increment(n):
        return n + 1

    payload = list(range(5))
    independent = serialize.SerializeIndependent()
    serialized, module_paths = independent([increment] + payload)

    # Show which module files the serializer decided must ship with the job.
    for path in module_paths:
        print(path)

    cfg = pywren.wrenconfig.default()
    bucket = cfg['runtime']['s3_bucket']
    key = cfg['runtime']['s3_key']

    runtime_info = runtime.get_runtime_info(bucket, key)
    print(runtime_info.keys())
    for pkg in runtime_info['pkg_ver_list']:
        print(pkg[0])
def map(self, func, iterdata, extra_env=None, extra_meta=None,
        invoke_pool_threads=64, data_all_as_one=True,
        use_cached_runtime=True):
    """Serialize ``func`` and each element of ``iterdata``, upload them to S3,
    and asynchronously invoke one lambda call per data element.

    Returns the list of invocation results produced by
    ``self.invoke_with_keys`` (one per element of ``iterdata``) — these are
    the invocation futures, not the job outputs.

    # FIXME work with an actual iterable instead of just a list

    data_all_as_one : upload the data as a single s3 object; fewer
       tcp transactions (good) but potentially higher latency for workers (bad)

    use_cached_runtime : if runtime has been cached, use that. When set
    to False, redownloads runtime.
    """
    # Host-side timing/size metadata, copied into every per-call invocation.
    host_job_meta = {}

    pool = ThreadPool(invoke_pool_threads)
    callset_id = s3util.create_callset_id()
    data = list(iterdata)

    ### pickle func and all data (to capture module dependencies
    serializer = serialize.SerializeIndependent()
    func_and_data_ser, mod_paths = serializer([func] + data)

    # First serialized object is the function, the rest are the data items.
    func_str = func_and_data_ser[0]
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)
    s3_agg_data_key = None
    # NOTE(review): this key is 'aggregated_data_in_s3' here but the
    # aggregated branch below sets 'agg_data_in_s3' — the False default is
    # never overwritten under its own name. Confirm which key consumers read.
    host_job_meta['aggregated_data_in_s3'] = False
    host_job_meta['data_size_bytes'] = data_size_bytes

    # Small-enough payloads are uploaded as one aggregate S3 object and
    # sliced per-call via byte ranges; otherwise each item is uploaded
    # individually inside invoke().
    if data_size_bytes < wrenconfig.MAX_AGG_DATA_SIZE and data_all_as_one:
        s3_agg_data_key = s3util.create_agg_data_key(self.s3_bucket,
                                                     self.s3_prefix,
                                                     callset_id)
        agg_data_bytes, agg_data_ranges = self.agg_data(data_strs)
        agg_upload_time = time.time()
        self.s3client.put_object(Bucket=s3_agg_data_key[0],
                                 Key=s3_agg_data_key[1],
                                 Body=agg_data_bytes)
        host_job_meta['agg_data_in_s3'] = True
        host_job_meta['data_upload_time'] = time.time() - agg_upload_time
        host_job_meta['data_upload_timestamp'] = time.time()
    else:
        # FIXME add warning that you wanted data all as one but
        # it exceeded max data size
        pass

    module_data = self.create_mod_data(mod_paths)

    # b64-encode the pickled function so it can travel inside a JSON document.
    func_str_encoded = wrenutil.bytes_to_b64str(func_str)
    #debug_foo = {'func' : func_str_encoded,
    #             'module_data' : module_data}

    #pickle.dump(debug_foo, open("/tmp/py35.debug.pickle", 'wb'))

    ### Create func and upload
    func_module_str = json.dumps({'func' : func_str_encoded,
                                  'module_data' : module_data})
    host_job_meta['func_module_str_len'] = len(func_module_str)

    func_upload_time = time.time()
    s3_func_key = s3util.create_func_key(self.s3_bucket, self.s3_prefix,
                                         callset_id)
    self.s3client.put_object(Bucket=s3_func_key[0], Key=s3_func_key[1],
                             Body=func_module_str)
    host_job_meta['func_upload_time'] = time.time() - func_upload_time
    host_job_meta['func_upload_timestamp'] = time.time()

    def invoke(data_str, callset_id, call_id, s3_func_key,
               host_job_meta,
               s3_agg_data_key=None, data_byte_range=None):
        # Per-call closure run on the thread pool: upload this call's data
        # (unless it lives in the aggregate object) and fire the invocation.
        s3_data_key, s3_output_key, s3_status_key \
            = s3util.create_keys(self.s3_bucket,
                                 self.s3_prefix,
                                 callset_id, call_id)

        host_job_meta['job_invoke_timestamp'] = time.time()

        if s3_agg_data_key is None:
            data_upload_time = time.time()
            self.put_data(s3_data_key, data_str,
                          callset_id, call_id)
            data_upload_time = time.time() - data_upload_time
            host_job_meta['data_upload_time'] = data_upload_time
            host_job_meta['data_upload_timestamp'] = time.time()

            data_key = s3_data_key
        else:
            data_key = s3_agg_data_key

        return self.invoke_with_keys(s3_func_key, data_key,
                                     s3_output_key,
                                     s3_status_key,
                                     callset_id, call_id, extra_env,
                                     extra_meta, data_byte_range,
                                     use_cached_runtime,
                                     host_job_meta.copy(),
                                     self.job_max_runtime)

    N = len(data)
    call_result_objs = []
    for i in range(N):
        # Zero-padded id keeps S3 key listings lexicographically ordered.
        call_id = "{:05d}".format(i)

        data_byte_range = None
        if s3_agg_data_key is not None:
            data_byte_range = agg_data_ranges[i]

        # host_job_meta is copied per call so the closures don't share
        # (and race on) one mutable dict.
        cb = pool.apply_async(invoke, (data_strs[i], callset_id,
                                       call_id, s3_func_key,
                                       host_job_meta.copy(),
                                       s3_agg_data_key,
                                       data_byte_range))

        logger.info("map {} {} apply async".format(callset_id, call_id))

        call_result_objs.append(cb)

    # Block until every invocation has been issued.
    res = [c.get() for c in call_result_objs]
    pool.close()
    pool.join()

    # NOTE(review): call_id here is the value left by the last loop
    # iteration, and is unbound if iterdata was empty — confirm intended.
    logger.info("map invoked {} {} pool join".format(callset_id, call_id))

    # FIXME take advantage of the callset to return a lot of these

    # note these are just the invocation futures

    return res
# NOTE(review): Python 2 experiment script — `print` statements and the
# StringIO-based cloudpickle.CloudPickler API below are Python-2-only.
# The `def` header of the function these first two statements belong to
# (referenced later as `f`) was lost when this file was mangled; only its
# tail is visible here.
    m = sklearn.linear_model.Lasso()
    return str(x)

args = [pd.Series([1, 2, 3])]

# Pickle the function alone and record how many modules cloudpickle
# tracked and how large the pickle stream is.
s = StringIO()
cp = cloudpickle.CloudPickler(s, 2)
cp.dump(f)
print len(cp.modules), len(s.getvalue())

# Pickle the args alone, same measurement.
s = StringIO()
cp = cloudpickle.CloudPickler(s, 2)
cp.dump(args)
print len(cp.modules), len(s.getvalue())

# Pickle both into one stream to compare against the sum of the two.
s = StringIO()
cp = cloudpickle.CloudPickler(s, 2)
cp.dump(f)
cp.dump(args)
print len(cp.modules), len(s.getvalue())

# Unpickling a stream with two dumps only yields the first object.
a = pickle.loads(s.getvalue())
# THIS ONLY RESTORES THE ORIGINAL FUNCTION

# now try with serializer
ser = serialize.SerializeIndependent()
list_of_strs, mod_paths = ser([f, args])
print [len(a) for a in list_of_strs]