コード例 #1
0
def execute(h2o, params, config):
    """Train an H2O Word2vec model on the configured frame.

    Args:
        h2o: H2O client object; ``get_frame`` and ``assign`` are used.
        params: Mapping of step parameters. The hyper-parameters listed
            below are read by name and cast to ``int``/``float`` here.
        config: Mapping carrying the ``frame_id`` of the training frame.

    Returns:
        dict: ``frame_id`` (the transformed frame when ``is_transform`` is
        enabled, otherwise the input frame id) and the trained ``model_id``.
    """
    source_id = config.get('frame_id')
    frame = h2o.get_frame(source_id)

    from h2o.estimators import H2OWord2vecEstimator

    # (parameter name, cast) pairs, converted in this order.
    hyper_param_casts = (
        ('epochs', int),
        ('init_learning_rate', float),
        ('max_runtime_secs', float),
        ('min_word_freq', int),
        ('sent_sample_rate', float),
        ('vec_size', int),
        ('window_size', int),
    )
    hyper_params = {
        name: cast(params.get(name)) for name, cast in hyper_param_casts
    }
    estimator = H2OWord2vecEstimator(**hyper_params)
    estimator.train(training_frame=frame)

    save_model(params, estimator.model_id)

    transform_flag = params.get("is_transform")
    if transform_flag is None or not to_bool(transform_flag):
        # No transform requested: hand the input frame straight through.
        result_id = source_id
    else:
        vectors = estimator.transform(
            frame, aggregate_method=params.get('aggregate_method'))
        result_id = append_frame_id(source_id,
                                    params.get('transform_suffix'))
        h2o.assign(vectors, result_id)

    return {'frame_id': result_id, 'model_id': estimator.model_id}
コード例 #2
0
def execute(h2o, params, config):
    """Tokenize one column of a frame and store the filtered tokens.

    Args:
        h2o: H2O client object; ``get_frame`` and ``assign`` are used.
        params: Mapping with ``target_column``, ``analyzer``, ``url``,
            ``regex``, ``lower_case``, ``min_word_len``, ``use_stop_words``
            and ``suffix``.
        config: Mapping carrying the ``frame_id`` of the input frame.

    Returns:
        dict: ``frame_id`` of the newly assigned token frame.
    """
    source_id = config.get('frame_id')
    frame = h2o.get_frame(source_id)

    target_column = params.get("target_column")
    analyzer = params.get("analyzer")
    if len(analyzer) == 0:
        # No analyzer configured: split with the plain regex parameter.
        split_spec = params.get('regex')
    else:
        url = params.get("url")
        split_spec = (
            f'tokenize:elasticsearch:{url}?analyzer={analyzer}_analyzer')
    tokens = frame[target_column].tokenize(split_spec)

    if to_bool(params.get('lower_case')):
        tokens = tokens.tolower()

    # Both filters below always keep NA rows alongside the surviving tokens.
    shortest_allowed = int(params.get('min_word_len'))
    if shortest_allowed > 0:
        long_enough = tokens.nchar() >= shortest_allowed
        missing = tokens.isna()
        tokens = tokens[long_enough | missing, :]

    if to_bool(params.get('use_stop_words')):
        missing = tokens.isna()
        not_stop_word = ~tokens.isin(STOP_WORDS)
        tokens = tokens[missing | not_stop_word, :]

    result_id = append_frame_id(source_id, params.get('suffix'))
    h2o.assign(tokens, result_id)

    return {'frame_id': result_id}
コード例 #3
0
def execute(h2o, params, config):
    """Score a frame with a stored model and save the prediction frame.

    Args:
        h2o: H2O client object; ``get_frame``, ``get_model``, ``assign`` and
            ``remove`` are used.
        params: Mapping with ``column_header`` (optional count used to slice
            off a header part of the frame before scoring), ``suffix``,
            ``topn_output`` and ``topn_percent``.
        config: Mapping carrying ``frame_id`` and ``model_id``.

    Returns:
        dict: ``frame_id`` of the newly assigned result frame.
    """
    frame_id = config.get('frame_id')
    model_id = config.get('model_id')

    df = h2o.get_frame(frame_id)

    # Must be bound on every path: the top-N branch below inspects it, and
    # leaving it undefined raised UnboundLocalError when no header was set.
    df_head = None
    column_header = params.get('column_header')
    if column_header is not None and len(column_header) > 0:
        header_width = int(column_header)
        # NOTE(review): presumably the first `header_width` columns are
        # identifiers kept out of scoring — confirm against callers.
        df_head = df[:header_width]
        df = df[header_width:]

    pred_model = h2o.get_model(model_id)

    df_pred = pred_model.predict(df)
    # Drop the first len('reconstr_') characters of every output column name.
    df_pred.columns = [x[len('reconstr_'):] for x in df_pred.columns]

    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))

    if to_bool(params.get('topn_output')):
        df_topn = get_topN(df_pred, int(params.get('topn_percent')))
        if df_head is not None:
            df_topn = df_head.cbind(df_topn)
        h2o.assign(df_topn, dest_frame_id)
        # The raw prediction frame is no longer needed once top-N is stored.
        h2o.remove(str(df_pred.frame_id))
    else:
        h2o.assign(df_pred, dest_frame_id)

    return {'frame_id': dest_frame_id}
コード例 #4
0
def execute(h2o, params, config):
    """Predict with a stored model and copy selected source columns across.

    Args:
        h2o: H2O client object; ``get_frame``, ``get_model`` and ``assign``
            are used.
        params: Mapping with ``input_columns`` and ``output_columns`` (JSON
            text; ``None`` or text of at most two characters such as ``[]``
            counts as "not set") and ``suffix``.
        config: Mapping carrying ``frame_id`` and ``model_id``.

    Returns:
        dict: ``frame_id`` of the newly assigned prediction frame.
    """
    source_id = config.get('frame_id')
    model_id = config.get('model_id')

    frame = h2o.get_frame(source_id)

    raw_inputs = params.get("input_columns")
    if raw_inputs is not None and len(raw_inputs) > 2:
        feature_names = json.loads(raw_inputs)
    else:
        # Nothing selected: feed every column of the frame to the model.
        feature_names = frame.col_names

    raw_outputs = params.get("output_columns")
    if raw_outputs is not None and len(raw_outputs) > 2:
        passthrough_names = json.loads(raw_outputs)
    else:
        passthrough_names = []

    model = h2o.get_model(model_id)
    predictions = model.predict(frame[feature_names])

    # Attach the requested source columns to the prediction frame.
    for name in passthrough_names:
        predictions[name] = frame[name]

    result_id = append_frame_id(source_id, params.get('suffix'))
    h2o.assign(predictions, result_id)

    return {'frame_id': result_id}
コード例 #5
0
ファイル: frame_replace.py プロジェクト: codelibs/fione
def execute(h2o, params, config):
    """Overwrite a column (or only the rows matching a condition) with a value.

    Args:
        h2o: H2O client object; ``get_frame`` and ``assign`` are used.
        params: Mapping with ``column``, ``value``, optional
            ``row_conditions`` and ``suffix``.
        config: Mapping carrying the ``frame_id`` of the input frame.

    Returns:
        dict: ``frame_id`` of the newly assigned frame.
    """
    source_id = config.get('frame_id')
    frame = h2o.get_frame(source_id)

    column = params.get('column')
    value = params.get('value')
    column_type = frame.types[column]

    # Coerce the incoming value to match numeric column types.
    numeric_casts = {'real': float, 'int': int}
    if column_type in numeric_casts:
        value = numeric_casts[column_type](value)
    elif column_type == 'enum':
        # Rebind to the matching category when one exists; otherwise the
        # value is left exactly as supplied.
        for category in frame[column].categories():
            if value == category:
                value = category
                break

    conditions = params.get('row_conditions')
    if conditions is None or len(conditions) == 0:
        frame[column] = value
    else:
        row_mask = parse_row_condition(frame, conditions)
        frame[row_mask, column] = value

    result_id = append_frame_id(source_id, params.get('suffix'))
    h2o.assign(frame, result_id)

    return {'frame_id': result_id}
コード例 #6
0
def execute(h2o, params, config):
    """Pivot the configured frame and store the result as a new frame.

    Args:
        h2o: H2O client object; ``get_frame`` and ``assign`` are used.
        params: Mapping with ``index``, ``column``, ``value`` and ``suffix``.
        config: Mapping carrying the ``frame_id`` of the input frame.

    Returns:
        dict: ``frame_id`` of the newly assigned pivoted frame.
    """
    source_id = config.get('frame_id')
    frame = h2o.get_frame(source_id)

    pivot_options = {
        'index': params.get('index'),
        'column': params.get('column'),
        'value': params.get('value'),
    }
    pivoted = frame.pivot(**pivot_options)

    result_id = append_frame_id(source_id, params.get('suffix'))
    h2o.assign(pivoted, result_id)

    return {'frame_id': result_id}
コード例 #7
0
ファイル: frame_fillna.py プロジェクト: codelibs/fione
def execute(h2o, params, config):
    """Fill missing values in the configured frame and store the result.

    Args:
        h2o: H2O client object; ``get_frame`` and ``assign`` are used.
        params: Mapping with ``method``, ``axis`` and ``maxlen`` (the latter
            two are cast to ``int``) and ``suffix``.
        config: Mapping carrying the ``frame_id`` of the input frame.

    Returns:
        dict: ``frame_id`` of the newly assigned filled frame.
    """
    source_id = config.get('frame_id')
    frame = h2o.get_frame(source_id)

    fill_options = {
        'method': params.get('method'),
        'axis': int(params.get('axis')),
        'maxlen': int(params.get('maxlen')),
    }
    filled = frame.fillna(**fill_options)

    result_id = append_frame_id(source_id, params.get('suffix'))
    h2o.assign(filled, result_id)

    return {'frame_id': result_id}
コード例 #8
0
ファイル: frame_rslice.py プロジェクト: codelibs/fione
def execute(h2o, params, config):
    """Keep only the rows matching ``row_conditions`` and store the result.

    Args:
        h2o: H2O client object; ``get_frame`` and ``assign`` are used.
        params: Mapping with optional ``row_conditions`` and ``suffix``.
        config: Mapping carrying the ``frame_id`` of the input frame.

    Returns:
        dict: ``frame_id`` of the newly assigned frame. When no condition is
        given, the unfiltered frame is assigned under the new id.
    """
    source_id = config.get('frame_id')
    frame = h2o.get_frame(source_id)

    conditions = params.get('row_conditions')
    has_conditions = conditions is not None and len(conditions) > 0
    if has_conditions:
        row_mask = parse_row_condition(frame, conditions)
        frame = frame[row_mask, :]

    result_id = append_frame_id(source_id, params.get('suffix'))
    h2o.assign(frame, result_id)

    return {'frame_id': result_id}
コード例 #9
0
def execute(h2o, params, config):
    """Sort the configured frame by one column and store the result.

    Args:
        h2o: H2O client object; ``get_frame`` and ``assign`` are used.
        params: Mapping with ``column``, ``ascending`` (converted with
            ``to_bool``) and ``suffix``.
        config: Mapping carrying the ``frame_id`` of the input frame.

    Returns:
        dict: ``frame_id`` of the newly assigned sorted frame.
    """
    source_id = config.get('frame_id')
    frame = h2o.get_frame(source_id)

    sort_keys = [params.get('column')]
    directions = [to_bool(params.get('ascending'))]
    ordered = frame.sort(by=sort_keys, ascending=directions)

    result_id = append_frame_id(source_id, params.get('suffix'))
    h2o.assign(ordered, result_id)

    return {'frame_id': result_id}
コード例 #10
0
def execute(h2o, params, config):
    """Column-bind a second frame onto the configured frame.

    Args:
        h2o: H2O client object; ``get_frame`` and ``assign`` are used.
        params: Mapping with ``bind_frame_id`` (id of the frame to append)
            and ``suffix``.
        config: Mapping carrying the ``frame_id`` of the base frame.

    Returns:
        dict: ``frame_id`` of the newly assigned combined frame.
    """
    source_id = config.get('frame_id')
    base = h2o.get_frame(source_id)

    extra = h2o.get_frame(params.get('bind_frame_id'))
    combined = base.cbind(extra)

    result_id = append_frame_id(source_id, params.get('suffix'))
    h2o.assign(combined, result_id)

    return {'frame_id': result_id}
コード例 #11
0
ファイル: frame_floor.py プロジェクト: codelibs/fione
def execute(h2o, params, config):
    """Apply ``floor`` to a frame (optionally to selected columns only).

    Args:
        h2o: H2O client object; ``get_frame`` and ``assign`` are used.
        params: Mapping with optional ``columns`` (JSON list text; ``None``
            or text of at most two characters such as ``[]`` means "all
            columns") and ``suffix``.
        config: Mapping carrying the ``frame_id`` of the input frame.

    Returns:
        dict: ``frame_id`` of the newly assigned floored frame.
    """
    frame_id = config.get('frame_id')

    df = h2o.get_frame(frame_id)

    columns = params.get('columns')
    # Both conditions must hold. With `or`, a missing parameter crashed on
    # len(None) and an empty selection was still pushed through json.loads.
    if columns is not None and len(columns) > 2:
        columns = json.loads(columns)
        df = df[columns]

    df_floor = df.floor()

    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(df_floor, dest_frame_id)

    return {'frame_id': dest_frame_id}
コード例 #12
0
ファイル: frame_cslice.py プロジェクト: codelibs/fione
def execute(h2o, params, config):
    """Select a subset of columns from the frame and store the result.

    Args:
        h2o: H2O client object; ``get_frame`` and ``assign`` are used.
        params: Mapping with optional ``columns`` (JSON list text; ``None``
            or text of at most two characters such as ``[]`` selects every
            column) and ``suffix``.
        config: Mapping carrying the ``frame_id`` of the input frame.

    Returns:
        dict: ``frame_id`` of the newly assigned column-sliced frame.
    """
    source_id = config.get('frame_id')
    frame = h2o.get_frame(source_id)

    raw_columns = params.get('columns')
    if raw_columns is not None and len(raw_columns) > 2:
        selected = json.loads(raw_columns)
    else:
        selected = frame.columns

    subset = frame[selected]

    result_id = append_frame_id(source_id, params.get('suffix'))
    h2o.assign(subset, result_id)

    return {'frame_id': result_id}
コード例 #13
0
def execute(h2o, params, config):
    """Concatenate other frames onto the configured frame.

    Args:
        h2o: H2O client object; ``get_frame`` and ``assign`` are used.
        params: Mapping with ``frames`` (JSON list text of frame ids),
            ``axis`` (cast to ``int``) and ``suffix``.
        config: Mapping carrying the ``frame_id`` of the base frame.

    Returns:
        dict: ``frame_id`` of the newly assigned concatenated frame.

    Raises:
        SystemExit: With code 1 when ``frames`` is ``None`` or its text is at
            most two characters long (e.g. ``[]``).
    """
    source_id = config.get('frame_id')
    base = h2o.get_frame(source_id)

    raw_frames = params.get('frames')
    if not (raw_frames is not None and len(raw_frames) > 2):
        print("frames are empty.")
        sys.exit(1)

    other_ids = json.loads(raw_frames)
    others = [h2o.get_frame(other_id) for other_id in other_ids]
    axis = int(params.get('axis'))
    combined = base.concat(others, axis=axis)

    result_id = append_frame_id(source_id, params.get('suffix'))
    h2o.assign(combined, result_id)

    return {'frame_id': result_id}
コード例 #14
0
ファイル: frame_cor.py プロジェクト: codelibs/fione
def execute(h2o, params, config):
    """Compute the correlation frame of the (optionally sliced) input frame.

    Args:
        h2o: H2O client object; ``get_frame`` and ``assign`` are used.
        params: Mapping with optional ``columns`` (JSON list text), ``na_rm``
            (converted with ``to_bool``), ``use`` (an empty value is passed
            on as ``None``), ``method`` and ``suffix``.
        config: Mapping carrying the ``frame_id`` of the input frame.

    Returns:
        dict: ``frame_id`` of the newly assigned correlation frame.
    """
    source_id = config.get('frame_id')
    frame = h2o.get_frame(source_id)

    raw_columns = params.get('columns')
    if raw_columns is not None and len(raw_columns) > 2:
        frame = frame[json.loads(raw_columns)]

    raw_use = params.get('use')
    # Normalise an empty `use` value to None before handing it to cor().
    use_mode = None if (raw_use is None or len(raw_use) == 0) else raw_use

    cor_options = {
        'na_rm': to_bool(params.get('na_rm')),
        'use': use_mode,
        'method': params.get('method'),
    }
    correlations = frame.cor(**cor_options)

    result_id = append_frame_id(source_id, params.get('suffix'))
    h2o.assign(correlations, result_id)

    return {'frame_id': result_id}
コード例 #15
0
ファイル: frame_split.py プロジェクト: codelibs/fione
def execute(h2o, params, config):
    """Split a frame into train / test / validation frames by relative weight.

    The three ``*_ratio`` parameters are integer weights; each split's share
    is its weight divided by the sum of all three.

    Args:
        h2o: H2O client object; ``get_frame`` and ``assign`` are used.
        params: Mapping with ``train_ratio`` (required), optional
            ``test_ratio``, ``valid_ratio`` and ``seed`` (missing or empty
            means 0 / no seed), plus ``train_suffix``, ``test_suffix`` and
            ``valid_suffix``.
        config: Mapping carrying the ``frame_id`` of the input frame.

    Returns:
        dict: When neither a test nor a validation weight is given, just
        ``{'frame_id': <input frame id>}``. Otherwise ``frame_id`` and
        ``train_frame_id`` (both the training frame), ``test_frame_id`` and
        ``valid_frame_id`` (``None`` for a split that was not requested).
    """

    def _int_or(raw, default):
        """Parse *raw* as an int, or return *default* when missing/empty."""
        if raw is None or len(raw) == 0:
            return default
        return int(raw)

    frame_id = config.get('frame_id')

    df = h2o.get_frame(frame_id)

    train = int(params.get('train_ratio'))
    test = _int_or(params.get('test_ratio'), 0)
    valid = _int_or(params.get('valid_ratio'), 0)
    seed = _int_or(params.get('seed'), None)

    if valid == 0 and test == 0:
        # No split requested. Return before any division so that all-zero
        # weights no longer raise ZeroDivisionError.
        return {'frame_id': frame_id}

    total = train + test + valid
    train_ratio = train / total
    test_ratio = test / total

    if valid == 0:
        df_train, df_test = df.split_frame(ratios=[train_ratio], seed=seed)
        df_valid = None
    elif test == 0:
        df_train, df_valid = df.split_frame(ratios=[train_ratio], seed=seed)
        df_test = None
    else:
        # Two ratios yield three frames; the remainder is the validation set.
        df_train, df_test, df_valid = df.split_frame(
            ratios=[train_ratio, test_ratio], seed=seed)

    train_frame_id = append_frame_id(frame_id, params.get('train_suffix'))
    h2o.assign(df_train, train_frame_id)

    if df_test is None:
        test_frame_id = None
    else:
        test_frame_id = append_frame_id(frame_id, params.get('test_suffix'))
        h2o.assign(df_test, test_frame_id)

    if df_valid is None:
        valid_frame_id = None
    else:
        valid_frame_id = append_frame_id(frame_id, params.get('valid_suffix'))
        h2o.assign(df_valid, valid_frame_id)

    return {
        'frame_id': train_frame_id,
        'train_frame_id': train_frame_id,
        'test_frame_id': test_frame_id,
        'valid_frame_id': valid_frame_id,
    }