def execute(h2o, params, config):
    """Train an H2O Word2vec model on the configured frame.

    Hyper-parameters are read from ``params`` and converted to ``int`` or
    ``float`` before being handed to ``H2OWord2vecEstimator``. After training,
    the model id is passed to ``save_model``. When ``params['is_transform']``
    is set and ``to_bool`` accepts it, the training frame is additionally run
    through ``model.transform`` and the result is assigned to a new frame id.

    Args:
        h2o: Object exposing ``get_frame`` and ``assign``.
        params: Step parameters (hyper-parameters, ``is_transform``,
            ``aggregate_method``, ``transform_suffix``).
        config: Pipeline state; ``frame_id`` is read from it.

    Returns:
        dict with ``frame_id`` (the transformed frame id, or the input frame id
        when no transform was requested) and ``model_id``.
    """
    frame_id = config.get('frame_id')
    df = h2o.get_frame(frame_id)

    from h2o.estimators import H2OWord2vecEstimator

    # Built in the same order the estimator keywords were originally written.
    hyper_params = {
        'epochs': int(params.get('epochs')),
        'init_learning_rate': float(params.get('init_learning_rate')),
        'max_runtime_secs': float(params.get('max_runtime_secs')),
        'min_word_freq': int(params.get('min_word_freq')),
        'sent_sample_rate': float(params.get('sent_sample_rate')),
        'vec_size': int(params.get('vec_size')),
        'window_size': int(params.get('window_size')),
    }
    estimator = H2OWord2vecEstimator(**hyper_params)
    estimator.train(training_frame=df)
    save_model(params, estimator.model_id)

    transform_flag = params.get("is_transform")
    wants_transform = transform_flag is not None and to_bool(transform_flag)
    if not wants_transform:
        return {'frame_id': frame_id, 'model_id': estimator.model_id}

    vectors = estimator.transform(
        df, aggregate_method=params.get('aggregate_method'))
    dest_frame_id = append_frame_id(frame_id, params.get('transform_suffix'))
    h2o.assign(vectors, dest_frame_id)
    return {'frame_id': dest_frame_id, 'model_id': estimator.model_id}
def execute(h2o, params, config):
    """Tokenize one column of the configured frame and store the tokens.

    The split specification is either an Elasticsearch analyzer URL (when
    ``params['analyzer']`` is non-empty) or the plain ``params['regex']``.
    The tokens can then be lower-cased, filtered by minimum length and
    filtered against ``STOP_WORDS``; both filters keep rows that are NA.

    Args:
        h2o: Object exposing ``get_frame`` and ``assign``.
        params: Step parameters (``target_column``, ``analyzer``, ``url``,
            ``regex``, ``lower_case``, ``min_word_len``, ``use_stop_words``,
            ``suffix``).
        config: Pipeline state; ``frame_id`` is read from it.

    Returns:
        dict with the ``frame_id`` of the token frame.
    """
    frame_id = config.get('frame_id')
    df = h2o.get_frame(frame_id)
    target_column = params.get("target_column")

    analyzer = params.get("analyzer")
    if len(analyzer) > 0:
        url = params.get("url")
        split_spec = f'tokenize:elasticsearch:{url}?analyzer={analyzer}_analyzer'
    else:
        split_spec = params.get('regex')
    tokens = df[target_column].tokenize(split_spec)

    if to_bool(params.get('lower_case')):
        tokens = tokens.tolower()

    min_len = int(params.get('min_word_len'))
    if min_len > 0:
        long_enough = tokens.nchar() >= min_len
        # NA rows are kept on purpose by the mask
        # (presumably they delimit documents in the token frame -- confirm).
        tokens = tokens[long_enough | tokens.isna(), :]

    if to_bool(params.get('use_stop_words')):
        is_missing = tokens.isna()
        not_stop_word = ~tokens.isin(STOP_WORDS)
        tokens = tokens[is_missing | not_stop_word, :]

    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(tokens, dest_frame_id)
    return {'frame_id': dest_frame_id}
def execute(h2o, params, config):
    """Run the configured model on a frame and store its predictions.

    When ``params['column_header']`` is set, the leading slice
    ``df[:column_header]`` is split off before prediction and bound back onto
    the top-N output with ``cbind``. Prediction column names have the first
    ``len('reconstr_')`` characters removed.

    Args:
        h2o: Object exposing ``get_frame``, ``get_model``, ``assign`` and
            ``remove``.
        params: Step parameters (``column_header``, ``suffix``,
            ``topn_output``, ``topn_percent``).
        config: Pipeline state; ``frame_id`` and ``model_id`` are read from it.

    Returns:
        dict with the ``frame_id`` of the stored result.
    """
    frame_id = config.get('frame_id')
    model_id = config.get('model_id')
    df = h2o.get_frame(frame_id)

    # df_head must be bound on every path: the top-N branch below inspects it
    # even when no header columns were requested.
    df_head = None
    column_header = params.get('column_header')
    if column_header:  # None and '' both mean "no header columns"
        header_width = int(column_header)
        df_head = df[:header_width]
        df = df[header_width:]

    pred_model = h2o.get_model(model_id)
    df_pred = pred_model.predict(df)

    # Drop the fixed-length 'reconstr_' prefix from every prediction column.
    prefix_len = len('reconstr_')
    df_pred.columns = [name[prefix_len:] for name in df_pred.columns]

    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    if to_bool(params.get('topn_output')):
        df_topn = get_topN(df_pred, int(params.get('topn_percent')))
        if df_head is not None:
            df_topn = df_head.cbind(df_topn)
        h2o.assign(df_topn, dest_frame_id)
        # The raw prediction frame is only an intermediate in this branch.
        h2o.remove(str(df_pred.frame_id))
    else:
        h2o.assign(df_pred, dest_frame_id)
    return {'frame_id': dest_frame_id}
def execute(h2o, params, config):
    """Predict on selected input columns and copy chosen columns to the result.

    ``input_columns`` and ``output_columns`` are JSON-encoded lists. A missing
    value, or a string of two characters or fewer (e.g. ``"[]"``), means
    "all columns" for inputs and "none" for outputs.

    Args:
        h2o: Object exposing ``get_frame``, ``get_model`` and ``assign``.
        params: Step parameters (``input_columns``, ``output_columns``,
            ``suffix``).
        config: Pipeline state; ``frame_id`` and ``model_id`` are read from it.

    Returns:
        dict with the ``frame_id`` of the prediction frame.
    """
    frame_id = config.get('frame_id')
    model_id = config.get('model_id')
    df = h2o.get_frame(frame_id)

    raw_inputs = params.get("input_columns")
    if raw_inputs is not None and len(raw_inputs) > 2:
        input_columns = json.loads(raw_inputs)
    else:
        input_columns = df.col_names

    raw_outputs = params.get("output_columns")
    has_outputs = raw_outputs is not None and len(raw_outputs) > 2
    output_columns = json.loads(raw_outputs) if has_outputs else []

    model = h2o.get_model(model_id)
    predictions = model.predict(df[input_columns])
    # Carry the requested source columns over next to the predictions.
    for name in output_columns:
        predictions[name] = df[name]

    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(predictions, dest_frame_id)
    return {'frame_id': dest_frame_id}
def execute(h2o, params, config):
    """Write a constant value into a column, optionally only on matching rows.

    The value is converted according to ``df.types[column]``: ``float`` for
    ``'real'``, ``int`` for ``'int'``; for ``'enum'`` it is replaced by the
    first category that compares equal to it, if any. With ``row_conditions``
    the write is limited to the mask from ``parse_row_condition``; otherwise
    the whole column is overwritten. The fetched frame is modified through
    item assignment and then assigned to the destination id.

    Args:
        h2o: Object exposing ``get_frame`` and ``assign``.
        params: Step parameters (``column``, ``value``, ``row_conditions``,
            ``suffix``).
        config: Pipeline state; ``frame_id`` is read from it.

    Returns:
        dict with the ``frame_id`` of the stored frame.
    """
    frame_id = config.get('frame_id')
    df = h2o.get_frame(frame_id)
    column = params.get('column')
    value = params.get('value')

    column_type = df.types[column]
    numeric_cast = {'real': float, 'int': int}.get(column_type)
    if numeric_cast is not None:
        value = numeric_cast(value)
    elif column_type == 'enum':
        # Swap in the matching category object; keep the raw value if none match.
        value = next(
            (level for level in df[column].categories() if value == level),
            value)

    row_conditions = params.get('row_conditions')
    if row_conditions is None or len(row_conditions) == 0:
        df[column] = value
    else:
        mask = parse_row_condition(df, row_conditions)
        df[mask, column] = value

    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(df, dest_frame_id)
    return {'frame_id': dest_frame_id}
def execute(h2o, params, config):
    """Pivot the configured frame and store the result under a new frame id.

    Args:
        h2o: Object exposing ``get_frame`` and ``assign``.
        params: Step parameters; ``index``, ``column`` and ``value`` are passed
            straight to ``df.pivot``, ``suffix`` names the output frame.
        config: Pipeline state; ``frame_id`` is read from it.

    Returns:
        dict with the ``frame_id`` of the pivoted frame.
    """
    source_id = config.get('frame_id')
    frame = h2o.get_frame(source_id)

    pivot_args = {
        'index': params.get('index'),
        'column': params.get('column'),
        'value': params.get('value'),
    }
    pivoted = frame.pivot(**pivot_args)

    dest_frame_id = append_frame_id(source_id, params.get('suffix'))
    h2o.assign(pivoted, dest_frame_id)
    return {'frame_id': dest_frame_id}
def execute(h2o, params, config):
    """Fill missing values in the configured frame via ``df.fillna``.

    Args:
        h2o: Object exposing ``get_frame`` and ``assign``.
        params: Step parameters; ``method`` is passed through unchanged,
            ``axis`` and ``maxlen`` are converted to ``int``, ``suffix`` names
            the output frame.
        config: Pipeline state; ``frame_id`` is read from it.

    Returns:
        dict with the ``frame_id`` of the filled frame.
    """
    source_id = config.get('frame_id')
    frame = h2o.get_frame(source_id)

    method = params.get('method')
    axis = int(params.get('axis'))
    maxlen = int(params.get('maxlen'))
    filled = frame.fillna(method=method, axis=axis, maxlen=maxlen)

    dest_frame_id = append_frame_id(source_id, params.get('suffix'))
    h2o.assign(filled, dest_frame_id)
    return {'frame_id': dest_frame_id}
def execute(h2o, params, config):
    """Keep only the rows selected by ``row_conditions`` and store the frame.

    Without ``row_conditions`` (missing or empty) the frame is stored
    unfiltered under the new id.

    Args:
        h2o: Object exposing ``get_frame`` and ``assign``.
        params: Step parameters (``row_conditions``, ``suffix``).
        config: Pipeline state; ``frame_id`` is read from it.

    Returns:
        dict with the ``frame_id`` of the stored frame.
    """
    source_id = config.get('frame_id')
    frame = h2o.get_frame(source_id)

    conditions = params.get('row_conditions')
    has_conditions = conditions is not None and len(conditions) > 0
    if has_conditions:
        row_mask = parse_row_condition(frame, conditions)
        frame = frame[row_mask, :]

    dest_frame_id = append_frame_id(source_id, params.get('suffix'))
    h2o.assign(frame, dest_frame_id)
    return {'frame_id': dest_frame_id}
def execute(h2o, params, config):
    """Sort the configured frame by a single column and store the result.

    Args:
        h2o: Object exposing ``get_frame`` and ``assign``.
        params: Step parameters; ``column`` is the sort key, ``ascending`` is
            interpreted through ``to_bool``, ``suffix`` names the output frame.
        config: Pipeline state; ``frame_id`` is read from it.

    Returns:
        dict with the ``frame_id`` of the sorted frame.
    """
    source_id = config.get('frame_id')
    frame = h2o.get_frame(source_id)

    sort_key = params.get('column')
    is_ascending = to_bool(params.get('ascending'))
    # df.sort receives one-element lists for the key and its direction.
    ordered = frame.sort(by=[sort_key], ascending=[is_ascending])

    dest_frame_id = append_frame_id(source_id, params.get('suffix'))
    h2o.assign(ordered, dest_frame_id)
    return {'frame_id': dest_frame_id}
def execute(h2o, params, config):
    """Bind a second frame onto the configured frame with ``cbind``.

    Args:
        h2o: Object exposing ``get_frame`` and ``assign``.
        params: Step parameters; ``bind_frame_id`` identifies the frame passed
            to ``cbind``, ``suffix`` names the output frame.
        config: Pipeline state; ``frame_id`` is read from it.

    Returns:
        dict with the ``frame_id`` of the combined frame.
    """
    source_id = config.get('frame_id')
    left = h2o.get_frame(source_id)
    right = h2o.get_frame(params.get('bind_frame_id'))

    combined = left.cbind(right)

    dest_frame_id = append_frame_id(source_id, params.get('suffix'))
    h2o.assign(combined, dest_frame_id)
    return {'frame_id': dest_frame_id}
def execute(h2o, params, config):
    """Apply ``floor`` to the configured frame, optionally on selected columns.

    ``columns`` is a JSON-encoded list. A missing value, or a string of two
    characters or fewer (e.g. ``"[]"``), means the whole frame is used.

    Args:
        h2o: Object exposing ``get_frame`` and ``assign``.
        params: Step parameters (``columns``, ``suffix``).
        config: Pipeline state; ``frame_id`` is read from it.

    Returns:
        dict with the ``frame_id`` of the floored frame.
    """
    frame_id = config.get('frame_id')
    df = h2o.get_frame(frame_id)

    columns = params.get('columns')
    # Both checks must hold: with `or`, a missing value reached len(None) and
    # the "[]" placeholder selected an empty column list.
    if columns is not None and len(columns) > 2:
        df = df[json.loads(columns)]

    df_floor = df.floor()
    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(df_floor, dest_frame_id)
    return {'frame_id': dest_frame_id}
def execute(h2o, params, config):
    """Select columns from the configured frame and store the selection.

    ``columns`` is a JSON-encoded list. A missing value, or a string of two
    characters or fewer (e.g. ``"[]"``), selects ``df.columns``.

    Args:
        h2o: Object exposing ``get_frame`` and ``assign``.
        params: Step parameters (``columns``, ``suffix``).
        config: Pipeline state; ``frame_id`` is read from it.

    Returns:
        dict with the ``frame_id`` of the selected frame.
    """
    source_id = config.get('frame_id')
    frame = h2o.get_frame(source_id)

    raw_columns = params.get('columns')
    use_all = raw_columns is None or len(raw_columns) <= 2
    selected = frame.columns if use_all else json.loads(raw_columns)
    subset = frame[selected]

    dest_frame_id = append_frame_id(source_id, params.get('suffix'))
    h2o.assign(subset, dest_frame_id)
    return {'frame_id': dest_frame_id}
def execute(h2o, params, config):
    """Concatenate other frames onto the configured frame.

    ``frames`` is a JSON-encoded list of frame ids. When it is missing or is a
    string of two characters or fewer (e.g. ``"[]"``), a message is printed
    and the process exits with status 1.

    Args:
        h2o: Object exposing ``get_frame`` and ``assign``.
        params: Step parameters (``frames``, ``axis``, ``suffix``).
        config: Pipeline state; ``frame_id`` is read from it.

    Returns:
        dict with the ``frame_id`` of the concatenated frame.
    """
    source_id = config.get('frame_id')
    base = h2o.get_frame(source_id)

    raw_frames = params.get('frames')
    nothing_to_concat = raw_frames is None or len(raw_frames) <= 2
    if nothing_to_concat:
        print("frames are empty.")
        sys.exit(1)

    others = []
    for other_id in json.loads(raw_frames):
        others.append(h2o.get_frame(other_id))
    combined = base.concat(others, axis=int(params.get('axis')))

    dest_frame_id = append_frame_id(source_id, params.get('suffix'))
    h2o.assign(combined, dest_frame_id)
    return {'frame_id': dest_frame_id}
def execute(h2o, params, config):
    """Compute ``df.cor`` on the configured frame and store the result.

    ``columns`` is an optional JSON-encoded list restricting the frame first;
    it is ignored when missing or when it is a string of two characters or
    fewer. An empty ``use`` value is passed to ``cor`` as ``None``.

    Args:
        h2o: Object exposing ``get_frame`` and ``assign``.
        params: Step parameters (``columns``, ``use``, ``na_rm``, ``method``,
            ``suffix``).
        config: Pipeline state; ``frame_id`` is read from it.

    Returns:
        dict with the ``frame_id`` of the correlation frame.
    """
    source_id = config.get('frame_id')
    frame = h2o.get_frame(source_id)

    raw_columns = params.get('columns')
    if raw_columns is not None and len(raw_columns) > 2:
        frame = frame[json.loads(raw_columns)]

    raw_use = params.get('use')
    is_blank_use = raw_use is not None and len(raw_use) == 0
    use_arg = None if is_blank_use else raw_use

    correlation = frame.cor(
        na_rm=to_bool(params.get('na_rm')),
        use=use_arg,
        method=params.get('method'),
    )

    dest_frame_id = append_frame_id(source_id, params.get('suffix'))
    h2o.assign(correlation, dest_frame_id)
    return {'frame_id': dest_frame_id}
def execute(h2o, params, config):
    """Split the configured frame into train / test / validation frames.

    ``train_ratio``, ``test_ratio`` and ``valid_ratio`` are integer weights
    that are normalised by their sum before being passed to
    ``df.split_frame``. Missing or empty ``test_ratio`` / ``valid_ratio``
    count as 0; a missing or empty ``seed`` is passed as ``None``.

    When both the test and validation weights are 0 there is nothing to split
    and the input frame id is returned unchanged. That check runs before the
    weights are normalised, so an all-zero configuration no longer raises
    ``ZeroDivisionError``.

    Args:
        h2o: Object exposing ``get_frame`` and ``assign``.
        params: Step parameters (the three ratios, ``seed``, ``train_suffix``,
            ``test_suffix``, ``valid_suffix``).
        config: Pipeline state; ``frame_id`` is read from it.

    Returns:
        ``{'frame_id': <input id>}`` when no split is requested; otherwise a
        dict with ``frame_id`` and ``train_frame_id`` (both the training
        frame id), ``test_frame_id`` and ``valid_frame_id`` (``None`` for a
        part that was not produced).
    """

    def _int_or(raw, default):
        # Missing and empty parameters fall back to the default.
        if raw is None or len(raw) == 0:
            return default
        return int(raw)

    frame_id = config.get('frame_id')
    df = h2o.get_frame(frame_id)

    train = int(params.get('train_ratio'))
    test = _int_or(params.get('test_ratio'), 0)
    valid = _int_or(params.get('valid_ratio'), 0)
    seed = _int_or(params.get('seed'), None)

    # Nothing to split off: decide this before dividing by the weight sum.
    if valid == 0 and test == 0:
        return {'frame_id': frame_id}

    total = train + test + valid
    train_ratio = train / total
    test_ratio = test / total

    df_test = None
    df_valid = None
    if valid == 0:
        df_train, df_test = df.split_frame(ratios=[train_ratio], seed=seed)
    elif test == 0:
        df_train, df_valid = df.split_frame(ratios=[train_ratio], seed=seed)
    else:
        # Two ratios yield three frames; the remainder is the validation part.
        df_train, df_test, df_valid = df.split_frame(
            ratios=[train_ratio, test_ratio], seed=seed)

    train_frame_id = append_frame_id(frame_id, params.get('train_suffix'))
    h2o.assign(df_train, train_frame_id)

    test_frame_id = None
    if df_test is not None:
        test_frame_id = append_frame_id(frame_id, params.get('test_suffix'))
        h2o.assign(df_test, test_frame_id)

    valid_frame_id = None
    if df_valid is not None:
        valid_frame_id = append_frame_id(frame_id, params.get('valid_suffix'))
        h2o.assign(df_valid, valid_frame_id)

    return {
        'frame_id': train_frame_id,
        'train_frame_id': train_frame_id,
        'test_frame_id': test_frame_id,
        'valid_frame_id': valid_frame_id,
    }