Example 1
    # Imports assumed from the surrounding test module (not shown in the snippet).
    import pywren
    import pywren.wrenconfig
    from pywren import runtime, serialize

    def test_simple(x):

        def func(x):
            return x + 1
        data = list(range(5))

        serializer = serialize.SerializeIndependent()
        func_and_data_ser, mod_paths = serializer([func] + data)
        for m in mod_paths:
            print(m)

        config = pywren.wrenconfig.default()

        runtime_bucket = config['runtime']['s3_bucket']
        runtime_key = config['runtime']['s3_key']
        info = runtime.get_runtime_info(runtime_bucket, runtime_key)
        print(info.keys())
        for f in info['pkg_ver_list']:
            print(f[0])
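For context, a minimal sketch of the round trip this test relies on: SerializeIndependent pickles each object in the list separately (cloudpickle under the hood) and also reports the local module files it detected, so each serialized entry can be restored on its own with the standard pickle module. The snippet below is an illustration based on that behaviour, not part of the original test suite, and the function name is made up.

    import pickle

    from pywren import serialize

    def add_one(x):
        return x + 1

    serializer = serialize.SerializeIndependent()
    serialized, mod_paths = serializer([add_one] + list(range(5)))

    restored = pickle.loads(serialized[0])   # cloudpickle output loads with plain pickle
    assert restored(1) == 2                  # behaves like the original function
    print(mod_paths)                         # module files that would be shipped to the workers
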
Example 2
    def map(self,
            func,
            iterdata,
            extra_env=None,
            extra_meta=None,
            invoke_pool_threads=64,
            data_all_as_one=True,
            use_cached_runtime=True):
        """
        # FIXME work with an actual iterable instead of just a list

        data_all_as_one : upload the data as a single s3 object; fewer
        tcp transactions (good) but potentially higher latency for workers (bad)

        use_cached_runtime : if runtime has been cached, use that. When set
        to False, redownloads runtime.
        """

        host_job_meta = {}

        pool = ThreadPool(invoke_pool_threads)
        callset_id = s3util.create_callset_id()
        data = list(iterdata)

        ### pickle func and all data (to capture module dependencies)
        serializer = serialize.SerializeIndependent()
        func_and_data_ser, mod_paths = serializer([func] + data)

        func_str = func_and_data_ser[0]
        data_strs = func_and_data_ser[1:]
        data_size_bytes = sum(len(x) for x in data_strs)
        s3_agg_data_key = None
        host_job_meta['agg_data_in_s3'] = False
        host_job_meta['data_size_bytes'] = data_size_bytes

        if data_size_bytes < wrenconfig.MAX_AGG_DATA_SIZE and data_all_as_one:
            s3_agg_data_key = s3util.create_agg_data_key(
                self.s3_bucket, self.s3_prefix, callset_id)
            agg_data_bytes, agg_data_ranges = self.agg_data(data_strs)
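            # agg_data (not shown here) concatenates the per-call pickles and
            # records (start, end) byte offsets, so each worker can later fetch
            # just its own slice of the single S3 object with a ranged GET.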
            agg_upload_time = time.time()
            self.s3client.put_object(Bucket=s3_agg_data_key[0],
                                     Key=s3_agg_data_key[1],
                                     Body=agg_data_bytes)
            host_job_meta['agg_data_in_s3'] = True
            host_job_meta['data_upload_time'] = time.time() - agg_upload_time
            host_job_meta['data_upload_timestamp'] = time.time()
        else:
            # FIXME add warning that you wanted data all as one but
            # it exceeded max data size
            pass

        module_data = self.create_mod_data(mod_paths)
        func_str_encoded = wrenutil.bytes_to_b64str(func_str)
        #debug_foo = {'func' : func_str_encoded,
        #             'module_data' : module_data}

        #pickle.dump(debug_foo, open("/tmp/py35.debug.pickle", 'wb'))
        ### Create func and upload
        func_module_str = json.dumps({
            'func': func_str_encoded,
            'module_data': module_data
        })
        host_job_meta['func_module_str_len'] = len(func_module_str)

        func_upload_time = time.time()
        s3_func_key = s3util.create_func_key(self.s3_bucket, self.s3_prefix,
                                             callset_id)
        self.s3client.put_object(Bucket=s3_func_key[0],
                                 Key=s3_func_key[1],
                                 Body=func_module_str)
        host_job_meta['func_upload_time'] = time.time() - func_upload_time
        host_job_meta['func_upload_timestamp'] = time.time()

        def invoke(data_str,
                   callset_id,
                   call_id,
                   s3_func_key,
                   host_job_meta,
                   s3_agg_data_key=None,
                   data_byte_range=None):
            s3_data_key, s3_output_key, s3_status_key \
                = s3util.create_keys(self.s3_bucket,
                                     self.s3_prefix,
                                     callset_id, call_id)

            host_job_meta['job_invoke_timestamp'] = time.time()

            if s3_agg_data_key is None:
                data_upload_time = time.time()
                self.put_data(s3_data_key, data_str, callset_id, call_id)
                data_upload_time = time.time() - data_upload_time
                host_job_meta['data_upload_time'] = data_upload_time
                host_job_meta['data_upload_timestamp'] = time.time()

                data_key = s3_data_key
            else:
                data_key = s3_agg_data_key

            return self.invoke_with_keys(s3_func_key, data_key, s3_output_key,
                                         s3_status_key, callset_id, call_id,
                                         extra_env, extra_meta,
                                         data_byte_range, use_cached_runtime,
                                         host_job_meta.copy(),
                                         self.job_max_runtime)

        N = len(data)
        call_result_objs = []
        for i in range(N):
            call_id = "{:05d}".format(i)

            data_byte_range = None
            if s3_agg_data_key is not None:
                data_byte_range = agg_data_ranges[i]

            cb = pool.apply_async(
                invoke,
                (data_strs[i], callset_id, call_id, s3_func_key,
                 host_job_meta.copy(), s3_agg_data_key, data_byte_range))

            logger.info("map {} {} apply async".format(callset_id, call_id))

            call_result_objs.append(cb)

        res = [c.get() for c in call_result_objs]
        pool.close()
        pool.join()
        logger.info("map invoked {} {} pool join".format(callset_id, call_id))

        # FIXME take advantage of the callset to return a lot of these

        # note these are just the invocation futures

        return res
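For orientation, the sketch below shows how this `map` is normally reached through pywren's public API; it follows the usage from pywren's README, and the exact result-retrieval details may differ between versions.

    import pywren

    def add_one(x):
        return x + 1

    wrenexec = pywren.default_executor()        # executor configured from ~/.pywren_config
    futures = wrenexec.map(add_one, range(10))  # one Lambda invocation per element
    print([f.result() for f in futures])        # blocks until each call's output lands in S3
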
Example 3
import pickle
from StringIO import StringIO

import cloudpickle
import pandas as pd
import sklearn.linear_model

from pywren import serialize


# Function to be serialized. The unused Lasso instance pulls in the sklearn
# dependency so the picklers below have a nontrivial module set to capture.
# (Imports and the def line are assumed; the original snippet starts mid-function.)
def f(x):
    m = sklearn.linear_model.Lasso()
    return str(x)


args = [pd.Series([1, 2, 3])]

s = StringIO()
cp = cloudpickle.CloudPickler(s, 2)
cp.dump(f)
print len(cp.modules), len(s.getvalue())

s = StringIO()
cp = cloudpickle.CloudPickler(s, 2)

cp.dump(args)
print len(cp.modules), len(s.getvalue())

s = StringIO()
cp = cloudpickle.CloudPickler(s, 2)

cp.dump(f)
cp.dump(args)
print len(cp.modules), len(s.getvalue())
a = pickle.loads(s.getvalue())
# THIS ONLY RESTORES THE ORIGINAL FUNCTION
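# Aside (not part of the original experiment): pickle stops reading at the first
# STOP opcode, which is why loads() above returns only the function. Rewinding
# the buffer and calling pickle.load repeatedly recovers each object in turn.
s.seek(0)
restored_f = pickle.load(s)      # first object in the stream: the function f
restored_args = pickle.load(s)   # second object: the args list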

# now try with serializer
ser = serialize.SerializeIndependent()
list_of_strs, mod_paths = ser([f, args])
print [len(a) for a in list_of_strs]