def leaderboard(self):
    """Return the stored leaderboard, or an empty H2OFrame when none is stored."""
    board = self._leaderboard
    if board is not None:
        return board
    # Nothing stored yet: hand back an empty frame instead of None.
    return H2OFrame([])
def event_log(self):
    """Return the stored event log, or an empty H2OFrame when none is stored."""
    log = self._event_log
    if log is not None:
        return log
    # Nothing stored yet: hand back an empty frame instead of None.
    return H2OFrame([])
# NOTE(review): this excerpt was collapsed onto a single line; the line breaks
# below are reconstructed. The statements up to the `append` call presumably
# form the body of a per-row loop whose header is outside this excerpt --
# confirm the indentation against the full script.
x = row.geometry
y = row.type
# Keep only the rows of source_df that intersect this geometry, and label them
# with the row's type.
df_temp = source_df[source_df.intersects(x)].copy()
df_temp['type'] = y
# NOTE(review): if training_data is a pandas DataFrame, DataFrame.append was
# removed in pandas 2.0 (pd.concat is the replacement) -- confirm the pinned
# pandas version.
training_data = training_data.append(df_temp)
# Pass the 'type' column through class_map.
training_data['type'] = training_data['type'].map(class_map)

### Model training here ---------------------------------------------------------------------
h2o.init()

# Convert the training data to an h2o frame.
# NOTE that this process will be inefficient if the original data has many NaNs.
hf = H2OFrame(training_data)

# This block of code is fairly h2o standard. It trains 20 models on this data,
# limiting the runtime to 1 hour. At the end of an hour or training 20 models,
# whichever is first, it returns a DataFrame of predictions as preds, ordered by the quality of their predictions.

# Split 'hf' into a training frame and validation frame.
train, valid = hf.split_frame(ratios=[.8], seed=10)

# Identify predictors and response
# (x and y are rebound here; they no longer hold the geometry/type from above.)
x = predictors
y = response

## For binary classification, response should be a factor
train[y] = train[y].asfactor()
valid[y] = valid[y].asfactor()
def _to_frame(X, feature_names):
    """Build an H2OFrame from the received features.

    ``X`` is passed to ``H2OFrame`` as the data and ``feature_names`` as its
    ``column_names``; the resulting frame is returned.
    """
    frame = H2OFrame(X, column_names=feature_names)
    return frame
# NOTE(review): this excerpt was collapsed onto a single line; the line breaks
# below are reconstructed and the enclosing loop/function (if any) is not
# visible -- confirm the indentation against the full script.

# Grid-search `pipeline` over `check_params` with `cv`-fold cross-validation,
# scored by ROC AUC.
create_grid = GridSearchCV(pipeline, param_grid=check_params, cv=cv, scoring='roc_auc')
create_grid.fit(X_train, y_train)
print("score for %d fold CV := %3.2f" % (cv, create_grid.score(X_test, y_test)))
print("!!!!!!!! Best-Fit Parameters From Training Data !!!!!!!!!!!!!!")
print(create_grid.best_params_)
print("out of the loop")
print("grid best params: ", create_grid.best_params_)

##############################
# Evaluation
##############################
# Refit the best estimator on the training data; the target is wrapped in an
# H2OFrame here, unlike the grid-search fit above.
final_model = create_grid.best_estimator_
final_model.fit(X_train, H2OFrame(y_train.to_frame()))

# classification evaluation
from sklearn.metrics import roc_auc_score
# Column 1 of predict_proba is used as the score (presumably the
# positive-class probability -- confirm the class ordering).
y_pred = final_model.predict_proba(X_test)[:, 1]
auc_test = roc_auc_score(y_test, y_pred)
print(auc_test)

# adjust the recall
from sklearn.metrics import precision_recall_curve
from sklearn import metrics
precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
# Area under the precision-recall curve.
pr_auc = metrics.auc(recall, precision)
plt.title("Precision-Recall vs Threshold Chart")
# precision/recall have one more entry than thresholds (per scikit-learn's
# precision_recall_curve), hence the [:-1] when plotting against thresholds.
plt.plot(thresholds, precision[:-1], "b--", label="Precision")
plt.plot(thresholds, recall[:-1], "r--", label="Recall")
#Audience 별 벡터화 된 hashed app을 저장한 DataFrame app = pd.DataFrame() for value in unique_hashed_app_str: app[value] = [0] * len(train_ap) for i in range(len(app)): app.iloc[i, :] = dict_hashed_app[device_ifa[i]] app = app.fillna(0).astype(int) app.insert(loc=0, column='device_ifa', value=device_ifa) #app kmeans clustering app_train = H2OFrame(app) app_cols = app_train.columns app_cols.remove("device_ifa") app_kmeans = H2OKMeansEstimator(k=10, init="Random", standardize=True) app_kmeans.train(x=app_cols, training_frame=app_train) app_predicted = app_kmeans.predict(app_train) pd_app_predicted = app_predicted['predict'].as_data_frame(use_pandas=True, header=True) app_clust = pd.DataFrame() app_clust['device_ifa'] = app['device_ifa'] app_clust['app_clusters'] = pd_app_predicted['predict'] app_kmeans_model_path = h2o.save_model(model=app_kmeans,
# NOTE(review): this excerpt was collapsed onto a single line; the line breaks
# and the single-statement loop bodies below are reconstructed -- confirm
# against the full script.

# DataFrame holding the vectorized hashed apps for each audience
app = pd.DataFrame()
# One zero-filled column per hashed-app string, with len(test_ap) rows.
for value in unique_hashed_app_str:
    app[value] = [0] * len(test_ap)
# Overwrite row i with the entry stored in dict_hashed_app for device i.
for i in range(len(app)):
    app.iloc[i, :] = dict_hashed_app[device_ifa[i]]
app = app.fillna(0).astype(int)
# Put the device identifier in the first column.
app.insert(loc=0, column='device_ifa', value=device_ifa)

#app kmeans clustering
app_train = H2OFrame(app)
# Drop the identifier from the column list taken from the frame.
app_cols = app_train.columns
app_cols.remove("device_ifa")

# Read the pickled model path. The context manager closes the file even if
# unpickling raises, which the previous open()/close() pair did not.
# NOTE(review): pickle.load can execute arbitrary code -- only use this on a
# file this project wrote itself.
with open("app_kmeans_model_path.txt", "rb") as rf:
    app_kmeans_model_path = pickle.load(rf)

# Load the saved k-means model and predict on the frame built above.
app_kmeans = h2o.load_model(app_kmeans_model_path)
app_predicted = app_kmeans.predict(app_train)
pd_app_predicted = app_predicted['predict'].as_data_frame(use_pandas=True, header=True)
app_clust = pd.DataFrame()
'time_diff', 'device_num', 'ip_num', 'class' ]
# NOTE(review): this excerpt was collapsed onto a single line and begins in the
# middle of a list literal (presumably the `features` list used just below);
# the line breaks and the loop-body indentation are reconstructed -- confirm
# against the full script.
# Keep only the selected feature columns.
data = data[features]
data.head()

# In[16]:

# build the model
# Initialize H2O cluster
h2o.init()
h2o.remove_all()

# In[17]:

# convert to h2o frame
h2o_df = H2OFrame(data)

# convert features to categories
for f in [ 'signup_day', 'purchase_day', 'source', 'browser', 'sex', 'country', 'class' ]:
    h2o_df[f] = h2o_df[f].asfactor()

h2o_df.summary()

# In[18]:

# Split training and test sets (70/30)
# binary feature - class: use stratified split method to avoid imbalance
data_split = h2o_df['class'].stratified_split(test_frac=0.3, seed=42)
model_data.head() # In[34]: # Initialize H2O cluster import h2o h2o.init() # In[36]: #model data summary model_data = H2OFrame(model_data) for name in ['fac_type', 'risk','service_code','grade','month','day','year','code_islarge']: model_data[name] = model_data[name].asfactor() model_data.summary() # In[37]: # Split into 70% training and 30% test dataset strat_split = model_data['score'].stratified_split(test_frac=0.3, seed=42) #train, test, valid = model_data.split_frame([0.6, 0.2]) #0.6,0.2,0.2 train = model_data[strat_split == 'train'] test = model_data[strat_split == 'test']