Code Example #1
 def leaderboard(self):
     return H2OFrame([]) if self._leaderboard is None else self._leaderboard
Code Example #2
 def event_log(self):
     return H2OFrame([]) if self._event_log is None else self._event_log
Code Example #3
File: STEP2.py  Project: worldbank/GOST_Urban
    x = row.geometry
    y = row.type

    df_temp = source_df[source_df.intersects(x)].copy()
    df_temp['type'] = y

    training_data = training_data.append(df_temp)

training_data['type'] = training_data['type'].map(class_map)

### Model training here ---------------------------------------------------------------------
h2o.init()

# Convert the training data to an h2o frame.
# NOTE that this process will be inefficient if the original data has many NaNs.
hf = H2OFrame(training_data)

# This block of code is fairly standard H2O usage. It trains up to 20 models on this data,
# limiting the total runtime to 1 hour. After one hour or 20 trained models, whichever
# comes first, it returns a DataFrame of predictions as preds, ordered by prediction quality.

# Split 'hf' into a training frame and a validation frame.
train, valid = hf.split_frame(ratios=[.8], seed=10)

# Identify predictors and response
x = predictors
y = response

## For binary classification, response should be a factor
train[y] = train[y].asfactor()
valid[y] = valid[y].asfactor()
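
The comments above describe an AutoML run (up to 20 models, one-hour cap) that the excerpt cuts off; the following is only a minimal sketch of that step, assuming the standard H2OAutoML API and the x, y, train, and valid names defined above.

from h2o.automl import H2OAutoML

# Assumed continuation of the excerpt: train up to 20 models within one hour.
aml = H2OAutoML(max_models=20, max_runtime_secs=3600, seed=10)
aml.train(x=x, y=y, training_frame=train)

# The leaderboard is sorted by model quality; predictions come from the leading model.
print(aml.leaderboard.head())
preds = aml.leader.predict(valid)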
Code Example #4
def _to_frame(X, feature_names):
    """Create H2OFrame object from received features.
    """
    return H2OFrame(X, column_names=feature_names)
Code Example #5
    create_grid = GridSearchCV(pipeline,
                               param_grid=check_params,
                               cv=cv,
                               scoring='roc_auc')
    create_grid.fit(X_train, y_train)
    print("score for %d fold CV := %3.2f" %
          (cv, create_grid.score(X_test, y_test)))
    print("!!!!!!!! Best-Fit Parameters From Training Data !!!!!!!!!!!!!!")
    print(create_grid.best_params_)
print("out of the loop")
print("grid best params: ", create_grid.best_params_)
##############################
# Evaluation
##############################
final_model = create_grid.best_estimator_
final_model.fit(X_train, H2OFrame(y_train.to_frame()))

# classification evaluation
from sklearn.metrics import roc_auc_score
y_pred = final_model.predict_proba(X_test)[:, 1]
auc_test = roc_auc_score(y_test, y_pred)
print(auc_test)

# adjust the recall
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
from sklearn import metrics
precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
pr_auc = metrics.auc(recall, precision)
plt.title("Precision-Recall vs Threshold Chart")
plt.plot(thresholds, precision[:-1], "b--", label="Precision")
plt.plot(thresholds, recall[:-1], "r--", label="Recall")
plt.legend()
plt.show()
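
To "adjust the recall" as the comment suggests, one option is to pick the decision threshold directly from the curve computed above; a minimal sketch, assuming a hypothetical target recall of 0.80 (the target value is not from the source).

# Hypothetical threshold tuning: keep recall at or above an assumed target while
# choosing the largest such threshold (and hence the best precision).
target_recall = 0.80
eligible = thresholds[recall[:-1] >= target_recall]
chosen_threshold = eligible.max() if eligible.size else thresholds.min()
y_pred_label = (y_pred >= chosen_threshold).astype(int)
print("chosen threshold:", chosen_threshold)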
Code Example #6
# DataFrame storing the vectorized hashed apps for each audience
app = pd.DataFrame()

for value in unique_hashed_app_str:
    app[value] = [0] * len(train_ap)

for i in range(len(app)):
    app.iloc[i, :] = dict_hashed_app[device_ifa[i]]

app = app.fillna(0).astype(int)

app.insert(loc=0, column='device_ifa', value=device_ifa)

#app kmeans clustering
app_train = H2OFrame(app)
app_cols = app_train.columns
app_cols.remove("device_ifa")

app_kmeans = H2OKMeansEstimator(k=10, init="Random", standardize=True)
app_kmeans.train(x=app_cols, training_frame=app_train)
app_predicted = app_kmeans.predict(app_train)

pd_app_predicted = app_predicted['predict'].as_data_frame(use_pandas=True,
                                                          header=True)

app_clust = pd.DataFrame()
app_clust['device_ifa'] = app['device_ifa']
app_clust['app_clusters'] = pd_app_predicted['predict']

app_kmeans_model_path = h2o.save_model(model=app_kmeans,
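
The h2o.save_model call above is cut off in the excerpt; this is only a hedged sketch of how its return value might be pickled so the companion snippet in Code Example #7 can reload it (the ./models directory is a placeholder, not from the source).

import pickle

# Assumed completion: save the KMeans model to a placeholder directory and
# pickle the returned path for later reuse (see Code Example #7).
app_kmeans_model_path = h2o.save_model(model=app_kmeans,
                                       path="./models", force=True)
with open("app_kmeans_model_path.txt", "wb") as wf:
    pickle.dump(app_kmeans_model_path, wf)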
Code Example #7
# DataFrame storing the vectorized hashed apps for each audience
app = pd.DataFrame()

for value in unique_hashed_app_str:
    app[value] = [0] * len(test_ap)

for i in range(len(app)):
    app.iloc[i, :] = dict_hashed_app[device_ifa[i]]

app = app.fillna(0).astype(int)

app.insert(loc=0, column='device_ifa', value=device_ifa)

#app kmeans clustering
app_train = H2OFrame(app)
app_cols = app_train.columns
app_cols.remove("device_ifa")

rf = open("app_kmeans_model_path.txt", "rb")
app_kmeans_model_path = pickle.load(rf)
rf.close()

app_kmeans = h2o.load_model(app_kmeans_model_path)

app_predicted = app_kmeans.predict(app_train)

pd_app_predicted = app_predicted['predict'].as_data_frame(use_pandas=True,
                                                          header=True)

app_clust = pd.DataFrame()
Code Example #8
features = [
    'time_diff', 'device_num', 'ip_num', 'class'
]
data = data[features]
data.head()

# In[16]:

# build the model
# Initialize H2O cluster
h2o.init()
h2o.remove_all()

# In[17]:

# convert to h2o frame
h2o_df = H2OFrame(data)

# convert features to categories
for f in [
        'signup_day', 'purchase_day', 'source', 'browser', 'sex', 'country',
        'class'
]:
    h2o_df[f] = h2o_df[f].asfactor()

h2o_df.summary()

# In[18]:

# Split training and test sets (70/30)
# 'class' is a binary target: use a stratified split to avoid class imbalance across the splits
data_split = h2o_df['class'].stratified_split(test_frac=0.3, seed=42)
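
The split column is then typically used to slice the frame into train and test sets, as Code Example #9 below does with the same pattern; a minimal sketch:

# Slice the H2OFrame by the stratified-split labels (same pattern as Code Example #9).
train = h2o_df[data_split == 'train']
test = h2o_df[data_split == 'test']

# Quick check that the class distribution is preserved in both slices.
print(train['class'].table())
print(test['class'].table())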
Code Example #9
model_data.head()


# In[34]:


# Initialize H2O cluster
import h2o
h2o.init()


# In[36]:


#model data summary
model_data = H2OFrame(model_data)

for name in ['fac_type', 'risk', 'service_code', 'grade', 'month', 'day', 'year', 'code_islarge']:
    model_data[name] = model_data[name].asfactor()

model_data.summary()


# In[37]:


# Split into 70% training and 30% test dataset
strat_split = model_data['score'].stratified_split(test_frac=0.3, seed=42)
#train, test, valid = model_data.split_frame([0.6, 0.2]) #0.6,0.2,0.2
train = model_data[strat_split == 'train']
test = model_data[strat_split == 'test']
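
The excerpt ends at the split; purely as an illustration (the estimator choice and its parameters are assumptions, not from the source), a follow-on step could fit an H2O model with 'score' as the response:

from h2o.estimators.gbm import H2OGradientBoostingEstimator

# Illustrative only: the source snippet stops before any model is trained.
predictors = [c for c in model_data.columns if c != 'score']
gbm = H2OGradientBoostingEstimator(ntrees=50, seed=42)
gbm.train(x=predictors, y='score', training_frame=train, validation_frame=test)
print(gbm.model_performance(test_data=test))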