def get_feature_vals_by_cand_split(pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df, candsplit, show_progress):
    """Compute feature vectors for one split of the candidate set.

    Args:
        pickled_obj: cloudpickle-serialized feature table (pickled so it can
            cross process boundaries for parallel execution).
        fk_ltable_idx: positional index of the left foreign-key column in
            each candidate row tuple.
        fk_rtable_idx: positional index of the right foreign-key column.
        l_df: left table, indexed by its key (rows fetched via ``.loc``).
        r_df: right table, indexed by its key.
        candsplit: DataFrame holding this split's candidate pairs.
        show_progress: if True, render a pyprind progress bar.

    Returns:
        list: one feature-value mapping (output of ``apply_feat_fns``) per
        candidate row, in candsplit order.
    """
    feature_table = cloudpickle.loads(pickled_obj)
    if show_progress:
        prog_bar = pyprind.ProgBar(len(candsplit))

    # Per-split memo caches so each referenced tuple is fetched from the
    # DataFrame at most once, even when a key appears in many candidates.
    l_dict = {}
    r_dict = {}

    feat_vals = []
    for row in candsplit.itertuples(index=False):
        if show_progress:
            prog_bar.update()

        fk_ltable_val = row[fk_ltable_idx]
        fk_rtable_val = row[fk_rtable_idx]

        # ``.loc`` replaces ``DataFrame.ix``, which was deprecated in
        # pandas 0.20 and removed in 1.0; the foreign-key values are labels
        # in the table index, so label-based lookup is the correct form.
        if fk_ltable_val not in l_dict:
            l_dict[fk_ltable_val] = l_df.loc[fk_ltable_val]
        l_tuple = l_dict[fk_ltable_val]

        if fk_rtable_val not in r_dict:
            r_dict[fk_rtable_val] = r_df.loc[fk_rtable_val]
        r_tuple = r_dict[fk_rtable_val]

        f = apply_feat_fns(l_tuple, r_tuple, feature_table)
        feat_vals.append(f)

    return feat_vals
# Example #2
 def _local_execute_func(exec_func, write_func, pickle_func, python_path):
     """Run a pickled function through a local PyFlink batch job and return
     its deserialized result.

     Args:
         exec_func: name used for the registered UDF, the temporary sink
             table, and the job (one string reused for all three).
         write_func: filesystem path of the CSV table sink; the result is
             read back from this file and the file is then deleted.
         pickle_func: base64-encoded, cloudpickle-serialized payload that
             the UDF emits as its (string) output.
         python_path: Python executable Flink should use for Python UDFs.

     Returns:
         The object decoded from the sink file (base64 -> cloudpickle).
     """
     # Batch-mode table environment on the Blink planner, single-threaded
     # so exactly one sink file is produced.
     table_env = BatchTableEnvironment.create(
         environment_settings=EnvironmentSettings.new_instance(
         ).use_blink_planner().in_batch_mode().build())
     table_env.get_config().get_configuration().set_string(
         'parallelism.default', '1')
     table_env.get_config().set_python_executable(python_path)
     # UDF ignores its input and always returns the serialized payload.
     table_env.register_function(
         exec_func,
         udf(lambda _: pickle_func, DataTypes.BIGINT(), DataTypes.STRING()))
     # CSV sink with a single string column 'func' backed by write_func.
     table_env.connect(FileSystem().path(write_func)) \
         .with_format(OldCsv().field('func', DataTypes.STRING())) \
         .with_schema(Schema().field('func', DataTypes.STRING())) \
         .create_temporary_table(exec_func)
     # One-row source table drives a single UDF invocation into the sink.
     table = table_env.from_elements([(1, 'Joblib')])
     table.select('{}(_1)'.format(exec_func)).insert_into(exec_func)
     table_env.execute(exec_func)
     # decode execution result from table sink file.
     # NOTE(review): the single value lands in the CSV header position,
     # hence reading it via .columns[0] rather than a data row.
     execute_result = cloudpickle.loads(
         codecs.decode(
             pd.DataFrame(pd.read_csv(write_func))[0:].columns[0].encode(),
             'base64'))
     # remove table sink file to clear ineffective files.
     os.remove(write_func)
     return execute_result
# Example #3
 def _remote_execute_func(exec_func, write_func, exec_dict, jm, py):
     """Submit a function-execution batch job to a remote Flink cluster via
     the ``flink run`` CLI and return the deserialized result.

     Args:
         exec_func: job/function identifier, also used to name log files.
         write_func: path of the CSV sink file the job writes its result to;
             deleted after the result is read back.
         exec_dict: dict of execution parameters, passed to the job as JSON.
         jm: JobManager address for ``flink run -m``.
         py: Python executable for ``-pyexec`` on the cluster side.

     Returns:
         The object decoded from the sink file (base64 -> cloudpickle).
     """
     # Per-function stdout/stderr logs, appended next to this module.
     func_stdout = '{}/exec_{}_stdout.log'.format(get_file_dir(__file__),
                                                  exec_func)
     func_stderr = '{}/exec_{}_stderr.log'.format(get_file_dir(__file__),
                                                  exec_func)
     with open(func_stdout, 'a') as out, open(func_stderr, 'a') as err:
         # execute `flink run -m <remote> -py function.py` to submit batch job
         # NOTE(review): shell=True with interpolated arguments; inputs are
         # assumed to be trusted internal values, not user-supplied strings.
         submitted_process = Popen(
             args=
             "{}/bin/flink run -m {} -py {}/exec_function.py -pyexec {} {} {} '{}'"
             .format(_find_flink_home(), jm, get_file_dir(__file__), py,
                     exec_func, write_func, json.dumps(exec_dict)),
             shell=True,
             stdout=out,
             stderr=err)
         # Block until the submitted job finishes so the sink file exists.
         submitted_process.wait()
     # decode execution result from table sink file.
     execute_result = cloudpickle.loads(
         codecs.decode(
             pd.DataFrame(
                 pd.read_csv(write_func))['func'].values[0].encode(),
             'base64'))
     # remove table sink file to clear ineffective files.
     os.remove(write_func)
     return execute_result
def get_feature_vals_by_cand_split(pickled_obj, fk_ltable_idx, fk_rtable_idx,
                                   l_df, r_df, candsplit, show_progress):
    """Compute feature vectors for one split of the candidate set.

    Args:
        pickled_obj: cloudpickle-serialized feature table.
        fk_ltable_idx: positional index of the left foreign-key column in
            each candidate row tuple.
        fk_rtable_idx: positional index of the right foreign-key column.
        l_df: left table, indexed by its key (rows fetched via ``.loc``).
        r_df: right table, indexed by its key.
        candsplit: DataFrame holding this split's candidate pairs.
        show_progress: if True, render a pyprind progress bar.

    Returns:
        list: one feature-value mapping (output of ``apply_feat_fns``) per
        candidate row, in candsplit order.
    """
    feature_table = cloudpickle.loads(pickled_obj)
    if show_progress:
        prog_bar = pyprind.ProgBar(len(candsplit))

    # Memoize fetched tuples so each key hits the DataFrame only once.
    l_dict = {}
    r_dict = {}

    feat_vals = []
    for row in candsplit.itertuples(index=False):
        if show_progress:
            prog_bar.update()

        fk_ltable_val = row[fk_ltable_idx]
        fk_rtable_val = row[fk_rtable_idx]

        # ``.loc`` replaces ``DataFrame.ix``, which was deprecated in
        # pandas 0.20 and removed in 1.0; foreign-key values are index
        # labels, so label-based lookup is the correct form.
        if fk_ltable_val not in l_dict:
            l_dict[fk_ltable_val] = l_df.loc[fk_ltable_val]
        l_tuple = l_dict[fk_ltable_val]

        if fk_rtable_val not in r_dict:
            r_dict[fk_rtable_val] = r_df.loc[fk_rtable_val]
        r_tuple = r_dict[fk_rtable_val]

        f = apply_feat_fns(l_tuple, r_tuple, feature_table)
        feat_vals.append(f)

    return feat_vals
# Example #5
def loads_fn(fn_bytes: bytes) -> Callable:
    """Deserialize a cloudpickled callable, memoizing by its serialized bytes.

    Args:
        fn_bytes: cloudpickle-serialized function payload.

    Returns:
        The deserialized callable (cached across calls with the same bytes).
    """
    try:
        # Key the cache on the bytes themselves rather than hash(fn_bytes):
        # two distinct payloads can share a hash value, and a collision
        # would silently return the wrong cached function. bytes objects
        # are hashable, so they work directly as dict keys.
        fn = _fn_load_cache[fn_bytes]
    except KeyError:
        fn = cloudpickle.loads(fn_bytes)
        _fn_load_cache[fn_bytes] = fn
    return fn
# Example #6
def deserialize(code):
    """Decode a base64 string and unpickle it back into a Python object."""
    raw = base64.b64decode(code)
    return cloudpickle.loads(raw)
# Example #7
def deserialize(code):
    """Decode base64 text and unpickle the resulting payload.

    TODO: add better error handling for <h1>Internal Server Error</h1>
    responses arriving in place of a real payload.
    """
    payload = base64.b64decode(code)
    return cloudpickle.loads(payload)
# Example #8
def deserialize(code):
    """Unpickle a base64-encoded task descriptor.

    Returns a ``(thunk, args, kwargs)`` triple; a falsy ``args``/``kwargs``
    entry falls back to an empty tuple/dict respectively.
    """
    task = cloudpickle.loads(base64.b64decode(code))
    thunk = task['thunk']
    args = task['args'] or ()
    kwargs = task['kwargs'] or {}
    return thunk, args, kwargs