def create_fold_column_if_not_exist(h2o_base_table: h2o.H2OFrame, fold_column: str, nfolds: int = None) -> h2o.H2OFrame:
    """Append a k-fold assignment column to the table unless it already exists.

    :param h2o_base_table: frame to extend.
    :param fold_column: name of the fold column; when empty/None the table is
        returned untouched.
    :param nfolds: number of folds; when None, ``kfold_column`` is called
        without ``n_folds`` so the library default applies.
    :return: the original table, or the table with the fold column bound on.
    """
    if not fold_column or fold_column in h2o_base_table.col_names:
        return h2o_base_table

    # Only forward n_folds when the caller supplied one: the signature default
    # is None, and passing None through explicitly is not a valid fold count.
    fold_kwargs = {} if nfolds is None else {"n_folds": nfolds}
    h2o_fold_col = h2o_base_table.kfold_column(**fold_kwargs)
    h2o_fold_col.set_names([fold_column])
    return h2o_base_table.cbind(h2o_fold_col)
def _train_test_split_as_frames(x, y, is_str=False, is_classifier=False):
    """Split ``x``/``y`` 70/30 and return the training part as one H2OFrame.

    :param x: feature array (must support ``astype``).
    :param y: label array; cast to ``str`` when ``is_str`` else to ``np.int64``.
    :param is_str: cast labels to strings instead of 64-bit integers.
    :param is_classifier: convert the label column (last column of the
        training frame) to a factor.
    :return: ``(f_train, x_test)`` where ``f_train`` is features + label as an
        H2OFrame and ``x_test`` is the held-out features cast to float32.
    """
    # ``np.str`` was a deprecated alias of the builtin ``str`` and has been
    # removed from NumPy; the builtin gives the same dtype.
    y = y.astype(str) if is_str else y.astype(np.int64)
    x_train, x_test, y_train, _ = train_test_split(x, y, test_size=0.3, random_state=42)

    f_train = H2OFrame(x_train).cbind(H2OFrame(y_train))
    if is_classifier:
        label_idx = f_train.ncol - 1
        f_train[label_idx] = f_train[label_idx].asfactor()
    return f_train, x_test.astype(np.float32)
def bernoulli_synthetic_data_gbm_medium():
    """Check that H2O GBM matches scikit-learn GBM on synthetic binary data.

    Features are i.i.d. standard normal; the label is +1 when the row's sum of
    squares exceeds the chi-square (10 df) median and -1 otherwise. Both
    libraries are trained with the same hyper-parameters and the test AUCs
    must agree to within 1e-2.

    Adaptation of http://www.stat.missouri.edu/~speckman/stat461/boost.R
    """
    train_rows = 10000
    train_cols = 10
    test_rows = 2000
    test_cols = 10

    # Loop-invariant threshold: median of a chi-square with 10 df.
    chisq_median = scipy.stats.chi2.ppf(0.5, 10)

    def _labels(features):
        # y = +1 if sum_i x_{ij}^2 > chisq median on 10 df, else -1
        row_sum_sq = np.sum(np.multiply(features, features), axis=1)
        return np.where(row_sum_sq > chisq_median, 1, -1)

    # Generate training dataset: variables V1, ... V10
    X_train = np.random.randn(train_rows, train_cols)
    y_train = _labels(X_train)

    # Train scikit gbm
    # TODO: grid-search
    distribution = "bernoulli"
    ntrees = 150
    min_rows = 1
    max_depth = 2
    learn_rate = .01
    nbins = 20

    gbm_sci = ensemble.GradientBoostingClassifier(learning_rate=learn_rate, n_estimators=ntrees,
                                                  max_depth=max_depth, min_samples_leaf=min_rows,
                                                  max_features=None)
    gbm_sci.fit(X_train, y_train)

    # Generate testing dataset (drawn after fitting, as before, so the global
    # random stream is consumed in the same order).
    X_test = np.random.randn(test_rows, test_cols)
    y_test = _labels(X_test)

    # Score (AUC) the scikit gbm model on the test data
    auc_sci = roc_auc_score(y_test, gbm_sci.predict_proba(X_test)[:, 1])

    # Compare this result to H2O; the response is placed in the first column.
    train_h2o = H2OFrame.fromPython(zip(*np.column_stack((y_train, X_train)).tolist()))
    test_h2o = H2OFrame.fromPython(zip(*np.column_stack((y_test, X_test)).tolist()))

    gbm_h2o = h2o.gbm(x=train_h2o[1:], y=train_h2o["C1"].asfactor(), distribution=distribution,
                      ntrees=ntrees, min_rows=min_rows, max_depth=max_depth,
                      learn_rate=learn_rate, nbins=nbins)
    gbm_perf = gbm_h2o.model_performance(test_h2o)
    auc_h2o = gbm_perf.auc()

    assert abs(auc_h2o - auc_sci) < 1e-2, "h2o (auc) performance degradation, with respect to scikit. h2o auc: {0} " \
                                          "scickit auc: {1}".format(auc_h2o, auc_sci)
def fit_h2o(x_train, y_train, estimator):
    """Train a fresh H2O estimator of the same class and parameters.

    :param x_train: training features; ``x_train.columns`` is used to derive
        the H2O column types via ``get_h2o_column_types``.
    :param y_train: iterable of training labels (uploaded as an ``enum``
        column), or None to train on the features only.
    :param estimator: H2O estimator whose class and ``_parms`` are reused.
    :return: the newly trained estimator; ``estimator`` itself is not trained.
    """
    current_estimator = estimator.__class__()
    # NOTE(review): the parameter object is shared with ``estimator``, not copied.
    current_estimator._parms = estimator._parms

    column_types_x = get_h2o_column_types(x_train.columns)
    x_frame = H2OFrame(x_train, column_types=column_types_x)
    predictors = x_frame.names

    # The None check has to happen before the labels are converted: previously
    # it ran after ``H2OFrame(list(y_train))`` and could therefore never be true.
    if y_train is None:
        training_frame = x_frame
        response = None
    else:
        y_frame = H2OFrame(list(y_train), column_types=['enum'])
        training_frame = x_frame.cbind(y_frame)
        response = y_frame.names[0]

    current_estimator.train(predictors, response, training_frame)
    return current_estimator
def test1():
    """Exercise ``H2OFrame.asfactor`` on a whole frame.

    A frame containing a real-valued column must raise ``H2OValueError`` and
    stay unchanged; a frame with only integer/string columns must convert, and
    the source frame must stay unchanged as well.
    """
    def mixed_columns():
        # Fresh dict per frame; column "three" holds a real number.
        return {"one": [4, 6, 1], "two": ["a", "b", "cde"], "three": [0, 5.2, 14]}

    def convertible_columns():
        return {"one": [4, 6, 1], "two": ["a", "b", "cde"]}

    bad_frame = H2OFrame(mixed_columns())
    bad_clone = H2OFrame(mixed_columns())
    compare_frames(bad_frame, bad_clone)

    try:
        bad_frame.asfactor()
    except H2OValueError:
        # as designed
        pass
    else:
        assert False, "The frame contaied a real number, an error should be thrown"

    # The failed conversion must not have altered the frame, locally or remotely.
    compare_frames(bad_frame, bad_clone)
    bad_after_op = H2OFrame.get_frame(bad_frame.frame_id)
    compare_frames(bad_frame, bad_after_op)

    good_frame = H2OFrame(convertible_columns())
    good_clone = H2OFrame(convertible_columns())
    compare_frames(good_frame, good_clone)

    factored_frame = good_frame.asfactor()
    good_after_op = H2OFrame.get_frame(good_frame.frame_id)
    compare_frames(good_frame, good_after_op)

    expected_factored = H2OFrame(convertible_columns(),
                                 column_types={"one": "categorical", "two": "enum"})
    compare_frames(expected_factored, factored_frame)

    # Converting an already-categorical frame again must leave it as it is.
    refactored_frame = expected_factored.asfactor()
    factored_after_op = H2OFrame.get_frame(refactored_frame.frame_id)
    compare_frames(expected_factored, factored_after_op)
def predict_dataframe(data, node):
    """Route every row of ``data`` down the binary classifier tree at ``node``.

    :param data: frame of rows to classify; ``data.index`` and ``data.iloc``
        are used, so a pandas DataFrame is expected.
    :param node: tree node with ``classifier``, ``classes``, ``left_node`` and
        ``right_node``; a node with a falsy ``classifier`` is a leaf.
    :return: list of ``(index, class)`` tuples; sorted for inner nodes, in
        ``data.index`` order for a leaf.
    """
    if not node.classifier:
        # Leaf: every remaining row gets the first class of the leaf.
        return [(i, list(node.classes)[0]) for i in list(data.index)]

    if isinstance(node.classifier, h2o.estimators.H2OEstimator):
        if isinstance(data, H2OFrame):
            # Previously ``data_h2o`` stayed unbound in this case.
            data_h2o = data
        else:
            column_types = get_h2o_column_types(data.columns)
            data_h2o = H2OFrame(data, column_types=column_types)
        # Convert the prediction column once instead of twice.
        predicted = node.classifier.predict(data_h2o)['predict'].as_data_frame().values
        prediction = np.array([]) if len(predicted) == 0 else np.concatenate(predicted)
    else:
        prediction = node.classifier.predict(data)

    # Label 1 goes to the right child, label 0 to the left child.
    right_rows = [pos for pos, label in enumerate(prediction) if label == 1]
    left_rows = [pos for pos, label in enumerate(prediction) if label == 0]
    data_right = data.iloc[right_rows]
    data_left = data.iloc[left_rows]

    prediction_left = predict_dataframe(data_left, node.left_node)
    prediction_right = predict_dataframe(data_right, node.right_node)
    return sorted(prediction_left + prediction_right)
def retrieve_h2o_base_table_predictors(self, h2o_base_table: h2o.H2OFrame):
    """Return the column names left after dropping the non-predictor columns.

    Dropped columns are ``row_id``, the configured label and origin columns,
    and every configured categorical variable.
    """
    data_config = self.auto_ml_config[AutoMLConfig.DATA]
    fixed_cols = [
        'row_id',
        data_config[AutoMLConfig.LABEL_COL],
        data_config[AutoMLConfig.ORIGIN_COL],
    ]
    cols_to_drop = fixed_cols + data_config[AutoMLConfig.CATEGORICAL_VARIABLES]
    remaining = h2o_base_table.drop(cols_to_drop)
    return remaining.col_names
def build_auto_h2o(regressor, name):
    """Fit an H2O regressor on the auto data set and store its artifacts.

    Writes ``<name>.zip`` (MOJO), ``<name>.pkl`` (pipeline) and ``<name>.csv``
    (predicted ``mpg``) through the ``store_*`` helpers.
    """
    categorical_cols = ["cylinders", "model_year", "origin"]
    continuous_cols = ["displacement", "horsepower", "weight", "acceleration"]

    categorical_steps = [(col, CategoricalDomain(), [col]) for col in categorical_cols]
    continuous_steps = [(col, ContinuousDomain(), [col]) for col in continuous_cols]
    transformer = ColumnTransformer(categorical_steps + continuous_steps)

    # The uploader declares the columns in the order the transformer emits them.
    uploader = H2OFrameCreator(
        column_names=categorical_cols + continuous_cols,
        column_types=["enum"] * len(categorical_cols) + ["numeric"] * len(continuous_cols),
    )
    pipeline = PMMLPipeline([
        ("transformer", transformer),
        ("uploader", uploader),
        ("regressor", regressor),
    ])
    pipeline.fit(auto_X, H2OFrame(auto_y.to_frame()))
    pipeline.verify(auto_X.sample(frac=0.05, random_state=13))

    regressor = pipeline._final_estimator
    store_mojo(regressor, name + ".zip")
    store_pkl(pipeline, name + ".pkl")

    mpg = pipeline.predict(auto_X)
    mpg.set_names(["mpg"])
    store_csv(mpg.as_data_frame(), name + ".csv")
def get_expected_output_frame(out_doc_ids, out_tokens, out_TFs, out_IDFs, out_TFIDFs):
    """Build the expected TF-IDF output frame from its five column value lists.

    Columns are, in order: ``DocID``, ``Token``, ``TF``, ``IDF``, ``TF_IDF``;
    ``Token`` is typed as string, all others as numeric.
    """
    columns = OrderedDict()
    columns['DocID'] = out_doc_ids
    columns['Token'] = out_tokens
    columns['TF'] = out_TFs
    columns['IDF'] = out_IDFs
    columns['TF_IDF'] = out_TFIDFs

    column_types = ['numeric', 'string', 'numeric', 'numeric', 'numeric']
    return H2OFrame(columns, column_types=column_types)
def optimum_threshold(self, hf: h2o.H2OFrame, model: H2OGenericEstimator) -> float:
    """
    Select the lowest-cost decision threshold for ``model`` on ``hf``.

    Args:
        hf (H2OFrame): Data used for evaluation. Must contain the ground truth
            column named ``fraud``.
        model (H2OModel): Model to evaluate; its ``p1`` prediction column is
            used as the score.

    Returns:
        optimum_threshold (float): A ``p1`` value below this number is
            predicted as 0 (not fraud), a value above it as 1 (fraud).
    """
    # Score the data once; the positive-class probability is the model score.
    scored = hf.as_data_frame()
    scored['model_score'] = model.predict(test_data=hf).as_data_frame()['p1']

    thresholds = []
    costs = []
    # Candidate thresholds are 0.01, 0.02, ..., 0.99.
    for step in range(1, 100):
        threshold = step / 100
        scored['prediction'] = predict(scored, threshold, 1, 'model_score')
        scored = reconcile(scored, 'prediction', 'fraud', f"CM_{threshold}")
        cost, scored = outcome(scored, self.inverse_costs, f"CM_{threshold}", f"costs_{threshold}")
        thresholds.append(threshold)
        costs.append(cost)

    # First threshold that reaches the minimum cost.
    optimum_threshold = thresholds[costs.index(min(costs))]
    print(f"optimum_threshold: {optimum_threshold}")
    return optimum_threshold
def upload_file():
    """Upload a CSV and build frames from lists, tuples, dicts and OrderedDicts.

    Smoke test: every frame is built through ``H2OFrame.fromPython`` and
    described. ``print`` is used in call form so the function parses on both
    Python 2 and Python 3.
    """
    a = h2o.upload_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    print(a.describe())

    from h2o import H2OFrame

    # using lists []
    py_list_to_h2o = H2OFrame.fromPython(zip(*[[0, 1, 2, 3, 4]]))
    print(py_list_to_h2o.describe())

    py_list_to_h2o_2 = H2OFrame.fromPython(zip(*[[0, 1, 2, 3], [5, 6, "hi", "dog"]]))
    print(py_list_to_h2o_2.describe())

    # using tuples ()
    py_tuple_to_h2o = H2OFrame.fromPython(zip(*[(0, 1, 2, 3, 4)]))
    print(py_tuple_to_h2o.describe())

    py_tuple_to_h2o_2 = H2OFrame.fromPython(zip(*((0, 1, 2, 3), (5, 6, "hi", "dog"))))
    print(py_tuple_to_h2o_2.describe())

    # using dicts {}
    py_dict_to_h2o = H2OFrame.fromPython({"column1": [5, 4, 3, 2, 1],
                                          "column2": (1, 2, 3, 4, 5)})
    py_dict_to_h2o.describe()

    py_dict_to_h2o_2 = H2OFrame.fromPython({"colA": ["bilbo", "baggins"], "colB": ["meow"]})
    print(py_dict_to_h2o_2.describe())

    # using collections.OrderedDict
    import collections

    d = {"colA": ["bilbo", "baggins"], "colB": ["meow"]}  # still unordered!
    py_ordered_dict_to_h2o = H2OFrame.fromPython(collections.OrderedDict(d))
    py_ordered_dict_to_h2o.describe()

    # make an ordered dictionary!
    d2 = collections.OrderedDict()
    d2["colA"] = ["bilbo", "baggins"]
    d2["colB"] = ["meow"]
    py_ordered_dict_to_h2o_2 = H2OFrame.fromPython(collections.OrderedDict(d2))
    py_ordered_dict_to_h2o_2.describe()
def upload_file(ip, port):
    """Connect to H2O, upload a CSV and build frames from Python containers.

    :param ip: address passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.

    Smoke test: frames are built from lists, tuples, dicts and OrderedDicts
    via ``H2OFrame(python_obj=...)`` and described. ``print`` is used in call
    form so the function parses on both Python 2 and Python 3.
    """
    h2o.init(ip, port)
    a = h2o.upload_file("../../smalldata/logreg/prostate.csv")
    print(a.describe())

    from h2o import H2OFrame

    # using lists []
    py_list_to_h2o = H2OFrame(python_obj=[0, 1, 2, 3, 4])
    print(py_list_to_h2o.describe())

    py_list_to_h2o_2 = H2OFrame(python_obj=[[0, 1, 2, 3], [5, 6, "hi", "dog"]])
    print(py_list_to_h2o_2.describe())

    # using tuples ()
    py_tuple_to_h2o = H2OFrame(python_obj=(0, 1, 2, 3, 4))
    print(py_tuple_to_h2o.describe())

    py_tuple_to_h2o_2 = H2OFrame(python_obj=((0, 1, 2, 3), (5, 6, "hi", "dog")))
    print(py_tuple_to_h2o_2.describe())

    # using dicts {}
    py_dict_to_h2o = H2OFrame(python_obj={"column1": [5, 4, 3, 2, 1],
                                          "column2": (1, 2, 3, 4, 5)})
    py_dict_to_h2o.describe()

    py_dict_to_h2o_2 = H2OFrame(python_obj={"colA": ["bilbo", "baggins"], "colB": ["meow"]})
    print(py_dict_to_h2o_2.describe())

    # using collections.OrderedDict
    import collections

    d = {"colA": ["bilbo", "baggins"], "colB": ["meow"]}  # still unordered!
    py_ordered_dict_to_h2o = H2OFrame(python_obj=collections.OrderedDict(d))
    py_ordered_dict_to_h2o.describe()

    # make an ordered dictionary!
    d2 = collections.OrderedDict()
    d2["colA"] = ["bilbo", "baggins"]
    d2["colB"] = ["meow"]
    py_ordered_dict_to_h2o_2 = H2OFrame(python_obj=collections.OrderedDict(d2))
    py_ordered_dict_to_h2o_2.describe()
def upload_file():
    """Upload a CSV and build H2OFrames from several Python container shapes.

    Smoke test only: each frame is constructed and described; nothing is
    asserted about the contents.
    """
    prostate = h2o.upload_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    print(prostate.describe())

    from h2o import H2OFrame

    # --- lists ---
    single_list_frame = H2OFrame([[0, 1, 2, 3, 4]])
    print(single_list_frame.describe())

    two_list_frame = H2OFrame([[0, 1, 2, 3], [5, 6, "hi", "dog"]])
    print(two_list_frame.describe())

    # --- tuples ---
    single_tuple_frame = H2OFrame([(0, 1, 2, 3, 4)])
    print(single_tuple_frame.describe())

    two_tuple_frame = H2OFrame(((0, 1, 2, 3), (5, 6, "hi", "dog")))
    print(two_tuple_frame.describe())

    # --- dicts ---
    numeric_columns = {"column1": [5, 4, 3, 2, 1], "column2": (1, 2, 3, 4, 5)}
    numeric_dict_frame = H2OFrame(numeric_columns)
    numeric_dict_frame.describe()

    ragged_columns = {"colA": ["bilbo", "baggins"], "colB": ["meow"]}
    ragged_dict_frame = H2OFrame(ragged_columns)
    print(ragged_dict_frame.describe())

    # --- collections.OrderedDict ---
    import collections

    # Wrapping a plain dict: the order is whatever the dict already had.
    plain = {"colA": ["bilbo", "baggins"], "colB": ["meow"]}
    wrapped_dict_frame = H2OFrame(collections.OrderedDict(plain))
    wrapped_dict_frame.describe()

    # An OrderedDict filled key by key, so the insertion order is explicit.
    ordered = collections.OrderedDict()
    ordered["colA"] = ["bilbo", "baggins"]
    ordered["colB"] = ["meow"]
    ordered_dict_frame = H2OFrame(collections.OrderedDict(ordered))
    ordered_dict_frame.describe()
def _assert_expr_results_eq(expr_provider, skip_expr_assert=False):
    """Evaluate an expression with and without optimizations and compare.

    :param expr_provider: zero-argument callable building the expression; it
        is called once per optimization setting.
    :param skip_expr_assert: when true, do not require the optimized and the
        full expression to differ.
    :return: ``(optimized_expr, unoptimized_expr)``.

    The optimization flag that was active on entry is restored on exit.
    """
    def _build_and_evaluate(optimize):
        h2o.enable_expr_optimizations(optimize)
        expression = expr_provider()
        return expression, H2OFrame._expr(expression)

    previous_setting = h2o.is_expr_optimizations_enabled()
    try:
        # Optimized expression first, then the full one.
        opt_expr, opt_result = _build_and_evaluate(True)
        noopt_expr, noopt_result = _build_and_evaluate(False)

        if not skip_expr_assert:
            assert opt_expr._debug_print() != noopt_expr._debug_print(), "The optimization should simplify expression!"

        full_rows = noopt_result.as_data_frame(use_pandas=False)
        optimized_rows = opt_result.as_data_frame(use_pandas=False)
        assert full_rows == optimized_rows, "Results with/without expression optimization should match!"
        return opt_expr, noopt_expr
    finally:
        h2o.enable_expr_optimizations(previous_setting)
def add_unique_row_id(h2o_base_table: h2o.H2OFrame):
    """Return the table with a ``row_id`` column holding 0..nrows-1 bound on.

    :param h2o_base_table: frame to extend; its row count is ``shape[0]``.
    :return: result of ``cbind``-ing the id column to ``h2o_base_table``.
    """
    num_rows = h2o_base_table.shape[0]
    # ``list(range(...))`` replaces the manual append loop, which also
    # shadowed the builtin ``id``.
    h2o_id_frame = h2o.H2OFrame(list(range(num_rows)))
    # Rename as a separate statement so the code does not depend on
    # ``set_names`` returning the frame.
    h2o_id_frame.set_names(['row_id'])
    return h2o_base_table.cbind(h2o_id_frame)
def predict_with_probabilities(self, data):
    """Score ``data`` with the MOJO model and split labels from probabilities.

    :param data: rows to score; uploaded with ``self._column_names``.
    :return: ``[predictions]`` when the model emits a single column, otherwise
        ``[labels_as_str, remaining_columns]`` as NumPy arrays.
    """
    data_frame = H2OFrame(data, column_names=self._column_names)
    preds = self._mojo_model.predict(data_frame).as_data_frame(use_pandas=True)
    if len(preds.columns) == 1:
        return [preds.to_numpy()]

    # ``np.str`` was a deprecated alias of the builtin ``str`` and has been
    # removed from NumPy; the builtin gives the same dtype.
    labels = preds.iloc[:, 0].to_numpy().astype(str)
    probabilities = preds.iloc[:, 1:].to_numpy()
    return [labels, probabilities]
def fold_prediction_result(x_train, y_train, x_test, y_test, classification_types, basic_classifier):
    """
    The training and prediction for one fold for all the types of classifiers
    indicated in classification_types

    :param x_train: the training data
    :param y_train: the training classes
    :param x_test: the testing data
    :param y_test: the testing classes
    :param classification_types: the classification types to be considered
    :param basic_classifier: the basic classifier to be used either
        independently or for the meta classifiers
    :return: metrics_dict - dictionary containing a dictionary for every
                metric with data for every classifier
             training_time - dictionary with training time in seconds for
                every classification type
             test_time - dictionary with testing time in seconds for every
                classification type
    :raises Exception: if a classification type is not recognised
    """
    metrics_dict = {metric: {} for metric in METRICS}
    training_time = {}
    test_time = {}

    for classification in classification_types:
        logger.info(classification)
        if classification in ENCODING_TYPES:
            classifier = EncodedClassifier(basic_classifier, encoding_type=classification)
        elif classification == "meta_binary_tree_classifier":
            classifier = MetaBinaryTreeClassifier(basic_classifier)
        elif classification == "standard-classifier":
            classifier = basic_classifier
        else:
            raise Exception("The Classification Method is not a valid one")

        start_time = time.time()
        if isinstance(classifier, h2o.estimators.H2OEstimator):
            classifier = fit_h2o(x_train, y_train, classifier)
        else:
            classifier.fit(x_train, y_train)
        train_time = time.time() - start_time

        if isinstance(classifier, h2o.estimators.H2OEstimator):
            column_types = get_h2o_column_types(x_test.columns)
            # Keep the H2O copy in its own variable: rebinding ``x_test`` here
            # handed an H2OFrame to every later classification type.
            x_test_h2o = H2OFrame(x_test, column_types=column_types)
            prediction = classifier.predict(x_test_h2o)
            y_pred = np.concatenate(prediction['predict'].as_data_frame().values)
        else:
            y_pred = classifier.predict(x_test)
        # Elapsed time since start minus the training share = prediction time.
        prediction_time = time.time() - train_time - start_time

        # Calculate metrics
        for metric, f in METRICS.items():
            metrics_dict[metric][classification] = f(y_test, y_pred)
        training_time[classification] = train_time
        test_time[classification] = prediction_time

    return metrics_dict, training_time, test_time
def test_pav(y, X, w):
    """Check that H2O's PAVA returns the same thresholds as scikit-learn.

    :param y: target values.
    :param X: feature values; flattened to one dimension before use.
    :param w: sample weights.
    """
    flat_X = X.reshape(-1)

    # Reference thresholds from scikit-learn's isotonic regression.
    fitted = IsotonicRegression().fit(flat_X, y, w)
    thresholds_scikit = H2OFrame(np.column_stack(get_thresholds(fitted)))
    print(thresholds_scikit.as_data_frame())

    # Thresholds from H2O's pool-adjacent-violators implementation.
    thresholds_h2o = pav(y, flat_X, w)
    print(thresholds_h2o.as_data_frame())

    expected = thresholds_scikit.as_data_frame()
    actual = thresholds_h2o.as_data_frame()
    assert_frame_equal(expected, actual)
def pubdev_6394():
    """Check ``gsub`` on an enum column, with and without a cardinality change.

    JUnit tests are to be found in RapidsTest class.
    """
    def check_gsub(rows, original_categories, pattern, expected_categories):
        # Build a single-column enum frame, strip ``pattern`` and verify the result.
        original_frame = H2OFrame(rows, header=True, column_types=['enum'])
        assert original_frame.type('location') == 'enum'
        assert original_frame.categories() == original_categories

        transformed_frame = original_frame['location'].gsub(pattern, '')
        print(transformed_frame)
        assert transformed_frame.ncols == 1
        assert transformed_frame.nrows == original_frame.nrows
        assert transformed_frame.type('C1') == 'enum'
        assert transformed_frame['C1'].categories() == expected_categories

    # Reduce cardinality of 'location' column to 2 by reducing existing
    # categorical values to ['X県','Y県']
    check_gsub(
        [['location'], ['X県 A市'], ['X県 B市'], ['X県 B市'], ['Y県 C市'], ['Y県 C市']],
        [u'X県 A市', u'X県 B市', u'Y県 C市'],
        ' .*',
        [u'X県', u'Y県'],
    )

    # Test gsub without changing the cardinality
    check_gsub(
        [['location'], ['ab'], ['ac'], ['ad'], ['ae'], ['af']],
        ['ab', 'ac', 'ad', 'ae', 'af'],
        'a',
        ['b', 'c', 'd', 'e', 'f'],
    )
def process_w2v(df, w2v_model):
    """Return a new frame whose text columns are replaced by word2vec features.

    The text columns are those recorded on ``w2v_model.text_columns``; the
    input ``df`` is not modified.
    """
    print("processind data with word2vec ...")
    working = df.copy()
    text_columns = w2v_model.text_columns

    # Upload only the text columns and force them to string type.
    text_frame = H2OFrame(working[text_columns])
    for column in text_columns:
        text_frame[column] = text_frame[column].ascharacter()

    tokens = text_frame.tokenize(" ")
    embedded = w2v_model.transform(tokens, aggregate_method = "AVERAGE")
    text_feats = embedded.as_data_frame()

    working.drop(columns=text_columns, inplace=True)
    combined = pd.concat([working, text_feats], axis=1)
    return combined.reset_index()
def predict(self, x):
    """Predict with every fitted estimator and decode the combined output.

    :param x: samples to classify; uploaded to H2O when ``self.estimator`` is
        an H2O estimator.
    :return: NumPy array built from ``decode_users`` applied to the
        per-estimator predictions (one column per estimator).
    """
    if isinstance(self.estimator, h2o.estimators.H2OEstimator):
        x = H2OFrame(x, column_types=get_h2o_column_types(x.columns))
        results = pd.DataFrame(index=range(len(x)))
        for position, fitted in enumerate(self.estimators_):
            predictions = fitted.predict(x)
            results[position] = predictions['predict'].as_data_frame().values
    else:
        per_estimator = [fitted.predict(x) for fitted in self.estimators_]
        # Transpose so that rows are samples and columns are estimators.
        results = pd.DataFrame(np.array(per_estimator).T)

    return np.array(decode_users(results, self.dict_code_user))
def Predict(self, request: IrisRequest, context):
    """gRPC handler: predict the iris species for one measurement.

    :param request: message carrying ``SepalLength``, ``SepalWidth``,
        ``PetalLength`` and ``PetalWidth``.
    :param context: gRPC servicer context used to report errors.
    :return: ``IrisReply`` with the predicted species; an empty ``IrisReply``
        with status INVALID_ARGUMENT when a field is missing.
    """
    required_fields = ('SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth')
    if not all(hasattr(request, field) for field in required_fields):
        msg = 'wrong arguments for IrisRequest'
        context.set_details(msg)
        context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
        # Stop here: previously execution fell through and tried to read the
        # missing attributes while building the frame.
        return IrisReply()

    test_data = H2OFrame({
        "SepalLength": request.SepalLength,
        "SepalWidth": request.SepalWidth,
        "PetalLength": request.PetalLength,
        "PetalWidth": request.PetalWidth
    })
    prediction = self.model.predict(test_data).getrow()
    species = SPECIES.get(np.argmax(prediction))
    return IrisReply(species=species)
def merge_ages(frame, ages):
    """Fill missing ``Age`` values in ``frame`` from the ``predict`` column of ``ages``.

    :param frame: H2O frame with ``PassengerId``, ``Age`` and ``Survived_factor``.
    :param ages: H2O frame merged onto ``frame`` (left join) supplying ``predict``.
    :return: H2O frame with ``Age`` filled in and ``predict`` dropped.
    """
    joined = frame.merge(ages, all_x=True)
    df = joined.sort('PassengerId').as_data_frame()

    age_missing = df['Age'].isna()
    df.loc[age_missing, 'Age'] = df.loc[age_missing, 'predict']

    # The response variable must be turned back into a category right here:
    # once the pandas frame is converted to an H2O frame the column becomes a
    # real number, and H2O can convert integers to factors but not reals.
    df['Survived_factor'] = df['Survived_factor'].astype('category')
    merged_frame = H2OFrame(df)

    # Some columns come out corrupted by the merge, so work on a deep copy and
    # re-attach Age from the merged frame.
    copy_df = h2o.deep_copy(merged_frame, 'copy_df')
    copy_df['Age'] = merged_frame.pop('Age')
    return copy_df.drop('predict')
def grouped_kfold(frame: H2OFrame, n_folds: int, src_col_name: str, dest_col_name='_kfold', seed=-1, remove_frame=True):
    """Assign folds per distinct value of ``src_col_name`` and merge them onto ``frame``.

    :param frame: input frame.
    :param n_folds: number of folds passed to ``kfold_column``.
    :param src_col_name: grouping column; rows sharing a value share a fold.
    :param dest_col_name: name of the fold column in the result.
    :param seed: seed passed to ``kfold_column``.
    :param remove_frame: when true, ``frame`` is removed from H2O afterwards.
    :return: the merged frame carrying the fold column.
    """
    group_values = frame[src_col_name]
    distinct_groups = group_values.unique()
    print(f"kfold group unique val count:: {src_col_name}, {distinct_groups.nrows}")

    fold_assignment = distinct_groups.kfold_column(n_folds, seed)
    groups_with_folds = distinct_groups.cbind(fold_assignment)
    named_groups = groups_with_folds.set_names([src_col_name, dest_col_name])
    kfold_frame = frame.merge(named_groups)

    # force eval...
    print(f"merged frame id: {kfold_frame.frame_id}")

    intermediates = [group_values, distinct_groups, groups_with_folds, fold_assignment, named_groups]
    remove_frames(intermediates)
    if remove_frame:
        h2o.remove(frame.frame_id)
    return kfold_frame
def build_audit_h2o(classifier, name):
    """Fit an H2O classifier on the audit data set and store its artifacts.

    The MOJO, the pickled pipeline and the prediction CSV are all written
    under ``name`` through the ``store_*`` helpers.
    """
    continuous_cols = ["Age", "Hours", "Income"]
    categorical_cols = ["Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]

    continuous_features = [([col], ContinuousDomain()) for col in continuous_cols]
    categorical_features = [([col], CategoricalDomain()) for col in categorical_cols]
    mapper = DataFrameMapper(continuous_features + categorical_features)

    steps = [
        ("mapper", mapper),
        ("uploader", H2OFrameCreator()),
        ("classifier", classifier),
    ]
    pipeline = PMMLPipeline(steps)
    target = H2OFrame(audit_y.to_frame(), column_types = ["categorical"])
    pipeline.fit(audit_X, target)
    pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13))

    classifier = pipeline._final_estimator
    store_mojo(classifier, name)
    store_pkl(pipeline, name)

    adjusted = pipeline.predict(audit_X)
    adjusted.set_names(["h2o(Adjusted)", "probability(0)", "probability(1)"])
    store_csv(adjusted.as_data_frame(), name)
def predict_row(row, node):
    """Classify a single row by walking the binary classifier tree from ``node``.

    :param row: the sample; uploaded to H2O when the node classifier is an H2O
        estimator and the row is not already an H2OFrame.
    :param node: tree node with ``classifier``, ``classes``, ``left_node`` and
        ``right_node``; a node with a falsy ``classifier`` is a leaf.
    :return: the first class of the leaf that is reached.
    """
    if not node.classifier:
        return list(node.classes)[0]

    if isinstance(node.classifier, h2o.estimators.H2OEstimator):
        if not isinstance(row, H2OFrame):
            row = H2OFrame(row, column_types=get_h2o_column_types(row.columns))
        h2o_prediction = node.classifier.predict(row)
        decision = np.concatenate(h2o_prediction['predict'].as_data_frame().values)
    else:
        decision = node.classifier.predict(row)

    # Label 0 descends to the left child, anything else to the right child.
    next_node = node.left_node if decision[0] == 0 else node.right_node
    return predict_row(row, next_node)
def pubdev_6534():
    """Check that all-NA columns can be converted to enum and to string."""
    rows = [["D", "E", "NA", "NA"],
            ["1", "A", "NA", "NA"]]
    df = H2OFrame.from_python(rows, column_types=['factor'] * 4, na_strings=["NA"])

    # C3 and C4 contain only NAs and are expected to come back as int.
    initial_types = (("C1", "enum"), ("C2", "enum"), ("C3", "int"), ("C4", "int"))
    for column, expected_type in initial_types:
        assert df.type(column) == expected_type

    # convert empty col to enum
    df['C3'] = df['C3'].asfactor()
    # convert empty cols to char
    df['C4'] = df['C4'].ascharacter()
    print(df)

    for column, expected_type in (("C3", "enum"), ("C4", "string")):
        assert df.type(column) == expected_type
def _prepare_one_hot(file, y, exclude_cols=None):
    """Load a data set, split it 95/5 and one-hot encode its enum columns.

    :param file: file name relative to this module's directory.
    :param y: response column; kept un-encoded in train, left out of test.
    :param exclude_cols: column names left out of both outputs.
    :return: ``(train, test)`` where ``train`` is an H2OFrame and ``test`` is
        a pandas DataFrame.
    """
    exclude_cols = [] if exclude_cols is None else exclude_cols

    dir_path = os.path.dirname(os.path.realpath(__file__))
    frame = h2o.import_file(dir_path + "/" + file)
    train, test = frame.split_frame([0.95], seed=42)

    # Partition the usable columns into enum columns and everything else.
    usable = [(name, ctype) for name, ctype in test.types.items()
              if name != y and name not in exclude_cols]
    cols_to_encode = [name for name, ctype in usable if ctype == "enum"]
    other_cols = [name for name, ctype in usable if ctype != "enum"]

    train_frame = train.as_data_frame()
    train_encode = train_frame.loc[:, cols_to_encode]
    train_other = train_frame.loc[:, other_cols + [y]]

    enc = OneHotEncoder(categories='auto', handle_unknown='ignore')
    enc.fit(train_encode)

    # One output column per (source column, category) pair: "<column>.<category>".
    colnames = [column + "." + val
                for column, categories in zip(cols_to_encode, enc.categories_)
                for val in categories]

    train_encoded = pd.DataFrame(enc.transform(train_encode.values).toarray())
    train_encoded.columns = colnames
    train = H2OFrame(train_other.join(train_encoded))

    test_frame = test.as_data_frame()
    test_encode = test_frame.loc[:, cols_to_encode]
    test_other = test_frame.loc[:, other_cols]
    test_encoded = pd.DataFrame(enc.transform(test_encode.values).toarray())
    test_encoded.columns = colnames
    test = test_other.join(test_encoded)

    return train, test
def model_performance(self, test_data=None):
    """
    Compute the binary classifier model metrics on `test_data`

    :param test_data: An H2OFrame
    :return: A H2OBinomialMetrics object; prints model metrics summary
    :raises ValueError: if `test_data` is missing or is not an H2OFrame
    """
    # Test for None explicitly rather than relying on the truthiness of a frame.
    if test_data is None:
        raise ValueError("Missing`test_data`.")
    if not isinstance(test_data, H2OFrame):
        # ``str + type`` raised TypeError and hid the intended ValueError.
        raise ValueError("`test_data` must be of type H2OFrame. Got: " + str(type(test_data)))

    fr_key = H2OFrame.send_frame(test_data)
    url_suffix = "ModelMetrics/models/" + self._key + "/frames/" + fr_key
    res = H2OConnection.post_json(url_suffix=url_suffix)
    raw_metrics = res["model_metrics"][0]
    return H2OBinomialModelMetrics(raw_metrics)
def test_fold_optimization_rbind_expr():
    """Nested ``rbind`` calls must fold into a single five-argument ``rbind``."""
    data0 = square_matrix(3, 0)
    data1 = square_matrix(3, 1)
    data2 = square_matrix(3, 2)

    def get_expr():
        innermost = ExprNode("rbind", data0, data1)
        middle = ExprNode("rbind", innermost, data0, data1)
        return ExprNode("rbind", middle, data2)

    expr, _ = _assert_expr_results_eq(get_expr)
    assert expr._op == "rbind", "Result operator is still cbind"
    assert len(expr._children) == 5, "Results has 5 arguments"

    fr = H2OFrame._expr(expr)
    assert fr.dim == [15, 3]

    # Matrices are stacked as 0, 1, 0, 1, 2; each contributes three identical rows.
    expected_rows = [[str(value)] * 3 for value in (0, 1, 0, 1, 2) for _ in range(3)]
    assert fr.as_data_frame(use_pandas=False, header=False) == expected_rows
def classify(x_train, y_train, estimator, x_test):
    """
    Train a copy of the estimator and predict the test set.
    For H2O estimators the conversion of the data to H2OFrame is done here.

    :param x_train: the dataset for training
    :param y_train: the classes for training
    :param estimator: the estimator to be considered
    :param x_test: the dataset for testing
    :return:
        - the prediction for the x_test
        - the trained estimator
    """
    if not isinstance(estimator, h2o.estimators.H2OEstimator):
        # Non-H2O path: fit a clone so the passed-in estimator stays unfitted.
        current_estimator = clone(estimator)
        current_estimator.fit(x_train, y_train)
        return current_estimator.predict(x_test), current_estimator

    current_estimator = fit_h2o(x_train, y_train, estimator)
    test_frame = H2OFrame(x_test, column_types=get_h2o_column_types(x_test.columns))
    prediction = current_estimator.predict(test_frame)
    labels = np.concatenate(prediction['predict'].as_data_frame().values)
    return labels, current_estimator
def pubdev_6393():
    """Categories must be identical whether parsed as enum or converted via asfactor."""
    locations = [
        ['location'],
        ['�X県 A市 '],  # First observation contains replacement character for unknown char
        ['X県 B市'],
    ]
    expected_rows = len(locations) - 1  # first entry is the header

    frame = H2OFrame(locations, header=True, column_types=['enum'])
    assert frame.ncols == 1
    assert frame.nrows == expected_rows
    frame_categories = frame['location'].categories()
    print(frame_categories)

    frame_converted = frame['location'].ascharacter().asfactor()
    assert frame_converted.ncols == 1
    assert frame_converted.nrows == expected_rows
    frame_converted_categories = frame_converted.categories()
    print(frame_converted_categories)

    # Check for the representation of categoricals to be exactly the same.
    # No explicit check for any specific behavior, the behavior of Categorical
    # and asFactor should be the same.
    for position, converted_category in enumerate(frame_converted_categories):
        assert frame_categories[position] == converted_category
def pubdev_6439():
    """Categorical levels must round-trip non-ASCII text unchanged."""
    data = [
        ['C1'],
        ['X県 A市 '],  # First observation contains replacement character for unknown char
        ['X県 B市'],
    ]
    frame = H2OFrame(data, header=True, column_types=['enum'])

    frame_categories = frame['C1'].categories()
    print(frame_categories)

    # Two observations
    assert len(frame_categories) == 2
    # First observation has six characters (space at the end)
    assert len(frame_categories[0]) == 6
    # Second observation has 5 characters (missing space at the end)
    assert len(frame_categories[1]) == 5

    first_observation = ''.join(data[1])
    second_observation = ''.join(data[2])

    # Python 2 and 3 handle strings differently
    major_version = sys.version_info[0]
    if major_version == 3:
        # Each categorical level equals the matching observation.
        assert first_observation == frame_categories[0]
        assert second_observation == frame_categories[1]
    elif major_version == 2:
        # Byte strings have to be decoded before comparing.
        assert first_observation.decode("utf-8") == frame_categories[0]
        assert second_observation.decode("utf-8") == frame_categories[1]
    else:
        assert False
def train_w2v(df, epochs=None, save_dir=None):
    """Train a word2vec model on all text columns of ``df``.

    :param df: input data; the text columns are chosen by ``get_text_cols``.
    :param epochs: optional number of epochs (cast to int).
    :param save_dir: optional directory; checkpoints are exported to its
        ``h2o_model/`` sub-directory.
    :return: trained ``H2OWord2vecEstimator`` with a ``text_columns``
        attribute listing the columns it was trained on.
    """
    print("training word2vec model ...")

    extra_params = {}
    if epochs is not None:
        extra_params['epochs'] = int(epochs)
    if save_dir is not None:
        extra_params['export_checkpoints_dir'] = os.path.join(save_dir,"h2o_model/")

    working = df.copy()
    text_columns = get_text_cols(working)
    print("Text columns are: ", text_columns)

    # Upload only the text columns and force them to string type.
    text_frame = H2OFrame(working[text_columns])
    for column in text_columns:
        text_frame[column] = text_frame[column].ascharacter()
    tokens = text_frame.tokenize(" ")

    w2v_model = H2OWord2vecEstimator(sent_sample_rate = 0.0, **extra_params)
    w2v_model.train(training_frame=tokens)
    # Remember the columns so the same ones can be picked at transform time.
    w2v_model.text_columns = text_columns
    return w2v_model
#!/usr/bin/env python
# Airlines example: load the data, split it randomly into train/validation/test
# partitions, train a GBM and predict on the held-out partition.
from h2o import H2OFrame
import h2o as h2o

localH2O = h2o.init()

air = H2OFrame.from_csv(localH2O, "allyears_tiny.csv", index_col = False)
print(air.head())

# A uniform random column drives the 80/10/10 split below.
air['RandNum'] = air.random.uniform()
print(air.head())

air_train = air.ix[air['RandNum'] <= 0.8]
air_valid = air.ix[(air['RandNum'] > 0.8) & (air['RandNum'] <= 0.9)]
air_test = air.ix[air['RandNum'] > 0.9]

# Predictors and response.
myX = [
    "Origin",
    "Dest",
    "Distance",
    "UniqueCarrier",
    "Month",
    "DayofMonth",
    "DayOfWeek",
]
myY = "IsDepDelayed"

air_gbm = h2o.gbm(
    x = myX,
    y = myY,
    data = air_train,
    validation = air_valid,
    distribution = "multinomial",
    n_trees = 10,
    interaction_depth = 3,
    shrinkage = 0.01,
    importance = True,
)
print(air_gbm)

pred = h2o.predict(air_gbm, air_test)
print(pred.head())
#!/usr/bin/env python
# Airlines example written against a fit/predict style API: split features and
# response into train/validation/test, fit a GBM and predict on the test part.
from h2o import H2OFrame, H2OModel
import h2o as h2o

localH2O = h2o.init()

air = H2OFrame.from_csv(localH2O, "allyears_tiny.csv", index_col = False)
air.head().print()

# Features and response.
X_air = air['Origin', 'Dest', 'Distance', 'UniqueCarrier', 'Month', 'DayofMonth', 'DayOfWeek']
y_air = air['IsDepDelayed']

# 10% validation, 10% test; the remainder is used for training.
(X_air_train, X_air_valid, X_air_test,
 y_air_train, y_air_valid, y_air_test) = H2OFrame.train_valid_test(
    X_air, y_air, valid_size = 0.1, test_size = 0.1)

my_gbm = H2OModel.GBM(
    distribution = "multinomial",
    n_trees = 10,
    interaction_depth = 3,
    shrinkage = 0.01,
    importance = True,
)
air_gbm = my_gbm.fit(
    x=X_air_train,
    y=y_air_train,
    x_valid=X_air_valid,
    y_valid=y_air_valid,
)
air_gbm.print()

pred = air_gbm.predict(X_air_test)
pred.head().print()
def temp_ctr():
    """Return whatever ``H2OFrame.temp_ctr()`` reports."""
    counter = H2OFrame.temp_ctr()
    return counter


def rest_ctr():
    """Return whatever ``h2o.H2OConnection.rest_ctr()`` reports."""
    counter = h2o.H2OConnection.rest_ctr()
    return counter