Example #1
def run_h2o(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task = params.task

    config_data = get_models_hyperparameters()['H2O']
    max_models = config_data['MAX_MODELS']
    max_runtime_secs = config_data['MAX_RUNTIME_SECS']

    result_filename = f'{case_label}_m{max_models}_rs{max_runtime_secs}_{task.name}'
    exported_model_path = os.path.join(CURRENT_PATH, result_filename)

    # TODO Regression
    if result_filename not in os.listdir(CURRENT_PATH):
        train_data = InputData.from_csv(train_file_path)
        best_model = fit_h2o(train_data, round(max_runtime_secs / 60))
        temp_exported_model_path = h2o.save_model(model=best_model,
                                                  path=CURRENT_PATH)

        os.renames(temp_exported_model_path, exported_model_path)

    ip, port = get_h2o_connect_config()
    h2o.init(ip=ip, port=port, name='h2o_server')

    imported_model = h2o.load_model(exported_model_path)

    test_frame = InputData.from_csv(test_file_path)
    true_target = test_frame.target

    predicted = predict_h2o(imported_model, test_frame)

    h2o.shutdown(prompt=False)

    return true_target, predicted
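The helper types above (ExecutionParams, InputData, fit_h2o, predict_h2o) are project-specific. A minimal invocation sketch, stubbing ExecutionParams with a dataclass inferred from the attribute accesses in run_h2o:

from dataclasses import dataclass
from enum import Enum

class Task(Enum):  # hypothetical stand-in for the project's task enum
    classification = 'classification'

@dataclass
class ExecutionParams:  # stub; the real class lives in the project
    train_file: str
    test_file: str
    case_label: str
    task: Task

params = ExecutionParams(train_file='train.csv', test_file='test.csv',
                         case_label='credit_case', task=Task.classification)
true_target, predicted = run_h2o(params)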
Example #2
def PredictionH2o():
    best_model = request.json['best_model']
    best_model_id = request.json['best_model_id']
    file = request.json['test_file_path']
    target_var = request.json['target_var']
    ip = request.json['ip']
    port = request.json['port']
    nthreads = request.json['nthreads']
    max_mem_size = request.json['max_mem_size']
    cluster_name = request.json['cluster_name']

    # Existing / new workspace

    h2o.init(ip=ip,
             port=port,
             name=cluster_name,
             nthreads=nthreads,
             max_mem_size=max_mem_size)
    # load the model
    import os
    try:
        cwd = 'D:\\DCSAIAUTOML\\BestModels\\h2o'
        model_path = os.path.join(cwd, best_model, best_model_id)
        print(model_path)
        saved_model = h2o.load_model(model_path)

        print(file)
        test_data = h2o.import_file(file)

        preds = saved_model.predict(test_data)
        pred_df = preds.as_data_frame()
        test_df = test_data.as_data_frame()
        result = pd.concat([test_df, pred_df], axis=1)
        result.to_csv('D:/PredictionResult/H20/Prediction_h2o.csv',
                      index=False,
                      date_format='%Y%m%d')

        pred_json = result.to_json(orient='records')
        h2o.shutdown()
        return pred_json
    except Exception as e:
        error_statement = str(e)
        print("Error statement: ", error_statement)
        return error_statement
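PredictionH2o reads request.json, so it is presumably a Flask view function whose route decorator was lost in the excerpt. A minimal registration sketch (the app object and route path are assumptions):

from flask import Flask, request

app = Flask(__name__)
# Route path is hypothetical; PredictionH2o is the handler defined above.
app.add_url_rule('/prediction_h2o', view_func=PredictionH2o, methods=['POST'])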
Example #3
def main(params):
    print("let's get started!")
    if params['model_train']:
        print("let's train CTR, CPC and reach")
    else:
        print("trained models are detected!")
    if params['predict']:
        print("let's predict")
    else:
        print("predicted values are detected!")

    # Gather data from .csv.
    data = data_access.get_data_from_csv(params)
    # Parse keywords and calculate total cost and revenue for each row.
    data = data_manipulation.calculate_total_cost_revenue_conversion(data)
    # For the ~8M-row sample, gather combinations of each categorical feature.
    params['pred_data'], params['dashboard_filters'], iters = \
        prediction.combination_data_preparation(data, params)

    # Start the prediction or model-training process for each metric
    for y in params['output']:
        # If a model path is given, load the model from there; otherwise use
        # the default path from constants.
        _path = (params['output'][y]['model_path']
                 if params['output'][y]['model_path'] is not None
                 else constants.model_save_path)
        if params['model_train']:
            # inputs for training process
            if params['output'][y]['model_features']['num'] is None:
                X_decoded = constants.model_features[y]
            else:
                X_decoded = params['output'][y]['model_features']
            # this is for one-hot encoding for categorical features.
            _data, X_encoded = data_manipulation.converting_numeric_encoder(
                data, X_decoded, y)
            # Training runs are set up for GBM, DRF, DNN and GLM models; at the
            # end, H2O's Stacked Ensemble is used to find the best model.
            _model = model_train.best_prediction_model(
                _data, constants.search_criteria, constants.hyper_p_gbm,
                constants.hyper_p_drf, constants.hyper_p_dnn,
                constants.hyper_p_glm, y, X_encoded, constants.split_ratio)
            _model.compute_train_process()
            _model.compute_best_model()
            params['output'][y]['best_model'] = _model.best_model
            h2o.save_model(model=params['output'][y]['best_model'],
                           path=_path,
                           force=True)
            # Shut down the H2O instance; it runs on all available cores of
            # your server or local machine.
            h2o.shutdown(prompt=False)

        if params['predict']:
            # Prediction is initialized with the trained model; the batch size
            # is the crucial parameter here.
            params['output'][y]['predicted'] = prediction.get_prediction(
                params['output'][y]['model_path'], iters, params['pred_data'],
                params['prediction_batch_size'])
            # Write the predicted values to a pickle as an array
            data_access.pred_write_reader(_path, y, True, params)
        else:
            # Otherwise, check whether the existing path already has predictions
            params['output'][y]['predicted'] = data_access.pred_write_reader(
                _path, y, False, [])
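A sketch of the `params` dictionary that main() expects, inferred from the key accesses above; all values are illustrative assumptions:

params = {
    'model_train': True,
    'predict': False,
    'prediction_batch_size': 100000,
    'output': {
        'ctr': {                      # one entry per metric to model
            'model_path': None,       # None falls back to constants.model_save_path
            'model_features': {'num': None},
            'best_model': None,
            'predicted': None,
        },
    },
    # plus whatever keys data_access.get_data_from_csv(params) consumes
}
main(params)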
Example #4
def run_h2o(train_file_path: str,
            test_file_path: str,
            task: MachineLearningTasksEnum,
            case_name='h2o_default'):
    config_data = get_models_hyperparameters()['H2O']
    max_models = config_data['MAX_MODELS']
    max_runtime_secs = config_data['MAX_RUNTIME_SECS']

    result_filename = f'{case_name}_m{max_models}_rs{max_runtime_secs}_{task.name}'
    exported_model_path = os.path.join(CURRENT_PATH, result_filename)

    # TODO Regression
    if result_filename not in os.listdir(CURRENT_PATH):
        train_data = InputData.from_csv(train_file_path)
        best_model = fit_h2o(train_data)
        temp_exported_model_path = h2o.save_model(model=best_model,
                                                  path=CURRENT_PATH)

        os.renames(temp_exported_model_path, exported_model_path)

    ip, port = get_h2o_connect_config()
    h2o.init(ip=ip, port=port, name='h2o_server')

    imported_model = h2o.load_model(exported_model_path)

    test_frame = InputData.from_csv(test_file_path)
    true_target = test_frame.target

    predictions = predict_h2o(imported_model, test_frame)

    if task is MachineLearningTasksEnum.classification:
        train_roc_auc_value = round(imported_model.auc(train=True), 3)
        valid_roc_auc_value = round(imported_model.auc(valid=True), 3)
        test_roc_auc_value = round(roc_auc_score(true_target, predictions), 3)

        metrics = {
            'H2O_ROC_AUC_train': train_roc_auc_value,
            'H2O_ROC_AUC_valid': valid_roc_auc_value,
            'H2O_ROC_AUC_test': test_roc_auc_value
        }

        print(f"H2O_ROC_AUC_train: {metrics['H2O_ROC_AUC_train']}")
        print(f"H2O_ROC_AUC_valid: {metrics['H2O_ROC_AUC_valid']}")
        print(f"H2O_ROC_AUC_test: {metrics['H2O_ROC_AUC_test']}")
    else:
        mse_train = imported_model.mse()
        rmse_train = imported_model.rmse()

        metrics = {'H2O_MSE_train': mse_train, 'H2O_RMSE_train': rmse_train}

        print(f"H2O_MSE_train: {metrics['H2O_MSE_train']}")
        print(f"H2O_RMSE_train: {metrics['H2O_RMSE_train']}")

    h2o.shutdown(prompt=False)

    return metrics
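Beyond the shared project helpers, this example assumes roc_auc_score from scikit-learn; roughly these imports:

import os
import h2o
from sklearn.metrics import roc_auc_score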
Example #5
def get_prediction(model_path, iters, data, batch_size):
    h2o.init(nthreads=-1)
    model = h2o.load_model(model_path)
    prediction = []
    warnings.filterwarnings("ignore")
    t1 = datetime.datetime.now()
    for i in range(0, iters + 1):
        # .ix is removed in modern pandas; .iloc does the positional slice
        _data = data.iloc[i * batch_size:(i + 1) * batch_size]
        _data_h2o = h2o.H2OFrame(_data)
        pred = model.predict(_data_h2o)
        prediction += list(pred.as_data_frame(use_pandas=True)['predict'])
    t2 = datetime.datetime.now()
    print("total run time (minutes):", round((t2 - t1).total_seconds() / 60, 2))
    h2o.shutdown(prompt=False)
    return prediction
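A usage sketch; computing iters as the number of full batches matches the range(iters + 1) loop above, which also covers the trailing partial batch (the file name is hypothetical):

import pandas as pd

data = pd.read_csv('pred_data.csv')   # hypothetical input
batch_size = 50000
iters = len(data) // batch_size       # full batches; the loop adds the remainder
preds = get_prediction('models/best_model', iters, data, batch_size)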
Example #6
def imputation_waves(df: pd.DataFrame):

    df_train_0 = process_raw(TRAIN_PATH).pipe(take_difference).pipe(
        take_population_rates)

    h2o.init(nthreads=-1, min_mem_size='5G', max_mem_size='10G')

    df, df_train_1 = loose_correlated_vars(df, df_train_0)
    df, df_train_2 = df.pipe(gam_wave_0, df_train_1)
    df, df_train_3 = h2o_gbm(df,
                             wave_1_gbm,
                             predictors_wave_1,
                             df_train=df_train_2)
    df, df_train_4 = h2o_drf(df,
                             wave_1_rf,
                             predictors_wave_1,
                             df_train=df_train_3)
    df, df_train_5 = gam_wave_1(df, df_train=df_train_4)
    df, df_train_6 = random_forest(df,
                                   wave_1_cv_rf,
                                   predictors_wave_1,
                                   df_train=df_train_5)
    df, df_train_7 = h2o_gbm(df,
                             wave_2_gbm_population,
                             predictors_wave_2_population,
                             df_train=df_train_6)
    df, df_train_8 = h2o_gbm(df,
                             wave_2_gbm_macroeconmic,
                             predictors_wave_2_macroeconomic,
                             df_train=df_train_7)
    df, df_train_9 = h2o_gbm(df,
                             wave_2_gbm_health,
                             predictors_wave_2_health,
                             df_train=df_train_8)
    df, df_train_10 = h2o_drf(df,
                              wave_2_drf_macroeconomic,
                              predictors_wave_2_macroeconomic,
                              df_train=df_train_9)
    df, df_train_11 = random_forest(df,
                                    wave_2_cvrf_macroeconomic,
                                    predictors_wave_2_macroeconomic,
                                    df_train=df_train_10)
    df = df.pipe(last_imputation, df_train_11)
    df_train_11 = df_train_11.pipe(last_imputation, df_train_11)

    h2o.shutdown()

    return df, df_train_11
Example #7
def train(cfg):
    # Load data
    messages = load_data(cfg.datafile)
    # Prepare tf-idf to feature vectorization and also transform input data
    (vectorizer, train) = tf_idf(messages['message'])
    h2o.init()
    train_table = h2o.H2OFrame(
        np.column_stack((messages['label'], train.toarray()))
    ).set_names(['label'] + vectorizer.get_feature_names())
    gbm_model = H2OGradientBoostingEstimator(ntrees=10, learn_rate=0.01,
                                             max_depth=6, min_rows=10,
                                             distribution="bernoulli")
    # Column 0 is the label; all remaining columns are tf-idf features
    gbm_model.train(x=list(range(1, train_table.shape[1])), y=0,
                    training_frame=train_table)
    if cfg.verbose:
        print("GBM Model", gbm_model)
    # Save models
    if not os.path.exists(cfg.models_dir):
        os.makedirs(cfg.models_dir)
    saveModel(vectorizer, '{}/vectorizer.pickle'.format(cfg.models_dir))
    h2o.download_pojo(gbm_model, "{}/".format(cfg.models_dir))
    h2o.shutdown()
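tf_idf() is a project helper; a minimal sketch assuming it wraps scikit-learn's TfidfVectorizer and returns both the fitted vectorizer and the sparse matrix:

from sklearn.feature_extraction.text import TfidfVectorizer

def tf_idf(texts):
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(texts)  # sparse document-term matrix
    return vectorizer, matrix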
Example #8
    def run_example(self):

        h2o.init()

        # Import a sample binary outcome train/test set into H2O
        train = h2o.import_file("./data/churn-train.csv")
        test = h2o.import_file("./data/churn-test.csv")
        #df = h2o.import_file("./data/churn.csv")
        #train, test = df.split_frame(ratios=[.75])

        # Identify predictors and response
        x = train.columns
        y = "churn_probability"
        x.remove(y)

        # For binary classification, response should be a factor
        #train[y] = train[y].asfactor()
        #test[y] = test[y].asfactor()

        # Run AutoML for up to 20 seconds, ranking models by MAE
        aml = H2OAutoML(max_runtime_secs=20, seed=1, sort_metric="mae")
        aml.train(x=x, y=y, training_frame=train)

        # View the AutoML Leaderboard
        lb = aml.leaderboard
        lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

        # The leader model is stored here
        print(aml.leader.model_performance(test))

        # If you need to generate predictions on a test set, you can make
        # predictions directly on the H2OAutoML object, or on the leader
        # model object directly

        preds = aml.predict(test)

        # or:
        preds = aml.leader.predict(test)
        print(preds)

        resp = [aml, aml.leader, preds.as_data_frame()]

        h2o.shutdown()

        return resp
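A common follow-up (standard h2o-py calls; the output path is an assumption) is to pull the leaderboard into pandas and persist the leader before shutting down:

lb_df = aml.leaderboard.as_data_frame()
print(lb_df.head())
saved_path = h2o.save_model(model=aml.leader, path='./models', force=True)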
Example #9
    def crear_arbol(self):
        # Restart H2O from a clean state
        try:
            h2o.shutdown(prompt=False)
        except Exception:
            pass
        json = {}  # note: shadows the stdlib json module
        h2o.init(max_mem_size="2G")
        h2o.remove_all()
        df = pd.read_excel('Excel_Corregido_Final.xlsx', encoding="ISO-8859-1")
        df = self.variables(df)
        umbrales = np.linspace(0, 1, 7)  # discretization thresholds
        df = df.replace(np.nan, -1)
        # Strip accents from the category column
        df = df.replace({'Categoría': {'á': 'a', 'é': 'e', 'í': 'i',
                                       'ó': 'o', 'ú': 'u'}}, regex=True)
        print(df["Categoría"])
        covtype_df = h2o.H2OFrame(
            self.discretizador.discretizar(self.listaVarDis, umbrales, df))
        covtype_df = covtype_df.drop([0], axis=0)
        print("watch out here")
        print(self.discretizador.listaDeIntervalos)
        covtype_df["T"] = covtype_df["T"].asfactor()
        df = covtype_df
        # Move the target column "T" to the last position
        t = covtype_df["T"]
        covtype_df = covtype_df.drop(["T"], axis=1)
        covtype_df["T"] = t["T"]
        self.test = covtype_df.drop(["T"], axis=1)
        train, x = covtype_df.split_frame([0.8], seed=56478)
        valid, x = covtype_df.split_frame([0.5], seed=56478)
        covtype_X = covtype_df.col_names[:-1]
        covtype_y = covtype_df.col_names[-1]
        self.rf_v2.train(x=covtype_X, y=covtype_y,
                         training_frame=train, validation_frame=valid)
        json["val"] = 1 - self.rf_v2.mean_per_class_error(valid=True)
        print(self.rf_v2.confusion_matrix(valid))
        print(json["val"])
Example #10
def run(data):
    h2o.init()
    try:
        model = h2o.load_model(model_path +
                               '/KMeans_model_python_1619773255297_1')
        print("input_data....")
        print(data.columns)
        print(type(data))
        data_h2o = h2o.H2OFrame(data)
        result = model.predict(data_h2o).as_data_frame()['predict']
        print("result.....")
        print(result)
        # You can return any data type, as long as it is JSON serializable.
        return json.dumps(result.tolist())
    except Exception as e:
        return str(e)
    finally:
        # The original h2o.shutdown() sat after the return and never ran;
        # a finally block guarantees it executes.
        h2o.shutdown()
def KMeans_ClusteringH2O(data, metric, parameters):
    try:
        h2o.init()
        rfm_data = h2o.H2OFrame(data)
        train, valid = rfm_data.split_frame(
            ratios=[constants.clustering_parameters['split_ratio']],
            seed=constants.clustering_parameters['seed'])
        rfm_kmeans = H2OKMeansEstimator(
            k=constants.clustering_parameters['k'],
            seed=constants.clustering_parameters['seed'],
            max_iterations=int(len(data) / 2))
        rfm_kmeans.train(x=metric,
                         training_frame=train,
                         validation_frame=valid)
        grid = H2OGridSearch(
            model=rfm_kmeans,
            hyper_params=constants.clustering_parameters['hyper_params'],
            search_criteria=constants.clustering_parameters['search_criteria'])
        # train using the grid
        grid.train(x=metric, training_frame=train, validation_frame=valid)

        # sort the grid models by total within cluster sum-of-square error.
        sorted_grid = grid.get_grid(sort_by='tot_withinss', decreasing=False)
        prediction = sorted_grid[0].predict(rfm_data)
        data = rfm_data.concat(prediction, axis=1)[[metric, 'predict']] \
                       .as_data_frame(use_pandas=True)
        data = data.rename(columns={'predict': metric + '_segment'})
        # Shift cluster labels from 0-based to 1-based
        data[metric + '_segment'] = data[metric + '_segment'].apply(lambda x: x + 1)
        if parameters['is_h2o_cluster_shut_down']:
            h2o.shutdown(prompt=False)
    except Exception:
        # On failure, still shut the cluster down if requested; `data` is
        # returned unmodified below.
        if parameters['is_h2o_cluster_shut_down']:
            h2o.shutdown(prompt=False)
    return data
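A usage sketch for KMeans_ClusteringH2O; the constants module is assumed to define k, seed, split_ratio, hyper_params and search_criteria, and the input frame here is made up:

import pandas as pd

rfm = pd.DataFrame({'recency': [10, 40, 5, 90, 7, 63]})
segmented = KMeans_ClusteringH2O(rfm, 'recency',
                                 {'is_h2o_cluster_shut_down': True})
print(segmented)  # adds a 1-based 'recency_segment' column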


# Calculate the total amount of money earned or lost per loan
valid["expected_earned"] = valid["term"] * valid["installment"] - valid["loan_amnt"]
valid["earned"] = valid["total_pymnt"] - valid["loan_amnt"]



# Calculate how much money will be lost to false negatives vs. how much will be saved due to true positives
valid["pred"] = pred_gbm["predict"]

grouped = valid.group_by(["bad_loan","pred"])
net = grouped.sum(col = "earned").get_frame()

n1 = net[(net["bad_loan"] == "0") & (net["pred"] == "0")]["sum_earned"].round(digits = 0).max()
n2 = net[(net["bad_loan"] == "0") & (net["pred"] == "1")]["sum_earned"].round(digits = 0).max()
n3 = (-1)*net[(net["bad_loan"] == "1") & (net["pred"] == "1")]["sum_earned"].round(digits = 0).max()
n4 = (-1)*net[(net["bad_loan"] == "1") & (net["pred"] == "0")]["sum_earned"].round(digits = 0).max()

# Report the earnings breakdown
print("Total amount of profit still earned using the model : %s" % '${:0,.0f}'.format(n1))
print("Total amount of profit forfeited using the model : %s" % '${:0,.0f}'.format(n2))
print("Total amount of loss that could have been prevented : %s" % '${:0,.0f}'.format(n3))
print("Total amount of loss that still would've accrued : %s" % '${:0,.0f}'.format(n4))



h2o.shutdown()
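As an illustration (not in the original), the net value of using the model versus approving every loan is the loss prevented minus the profit forfeited:

print("Net value of the model : %s" % '${:0,.0f}'.format(n3 - n2))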

def call_badshutdown():
    # Added per Pasha's request: verify that an error raised in one thread
    # propagates as an exception.
    h2o.shutdown(badparam=1, prompt=True)
def call_shutdown():
    h2o.shutdown(prompt=True)   # call shutdown but do not actually shut anything down.
Example #16
 def close():
     h2o.shutdown(prompt=True)
Example #17
# For a given crime and model, returns the probability of arrest.
def score_event(crime, model, censusTable):
    srdd = spark.createDataFrame([crime])
    # Join table with census data
    df_row = censusTable.join(srdd).where("Community_Area = Community_Area_Number")
    row = h2oContext.as_h2o_frame(df_row)
    row["Season"] = row["Season"].asfactor()
    row["WeekDay"] = row["WeekDay"].asfactor()
    row["Primary_Type"] = row["Primary_Type"].asfactor()
    row["Location_Description"] = row["Location_Description"].asfactor()
    row["Domestic"] = row["Domestic"].asfactor()

    predictTable = model.predict(row)
    probOfArrest = predictTable["true"][0,0]
    return probOfArrest

for i in crime_examples:
    arrestProbGBM = 100*score_event(i, gbm_model, df_census)
    arrestProbDLM = 100*score_event(i, dl_model, df_census)

    print("""
       |Crime: """+str(i)+"""
       |  Probability of arrest best on DeepLearning: """+str(arrestProbDLM)+"""
       |  Probability of arrest best on GBM: """+str(arrestProbGBM)+"""
        """)

# stop H2O and Spark services
h2o.shutdown(prompt=False)
spark.stop()
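The snippet assumes a Sparkling Water session; a minimal setup sketch for the spark and h2oContext objects it references (API details vary across pysparkling versions):

from pyspark.sql import SparkSession
from pysparkling import H2OContext

spark = SparkSession.builder.appName('ChicagoCrime').getOrCreate()
h2oContext = H2OContext.getOrCreate()  # older releases took the spark session as an argument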

Example #18
    def run_example(self, train_path, test_path, target):

        metrics = {}

        train = pd.read_csv(train_path)

        # Auto-keras
        regressor = ak.StructuredDataRegressor(max_trials=10,
                                               loss="mean_absolute_error")
        regressor.fit(x=train, y=target)
        metrics["auto-keras"] = regressor.evaluate(x=train, y=target)[0]

        # Auto-gluon
        train_data = task.Dataset(file_path=train_path)
        label_column = target
        predictor = task.fit(train_data=train_data,
                             label=label_column,
                             eval_metric="mean_absolute_error")
        test_data = task.Dataset(file_path=test_path)
        y_test = test_data[label_column]  # values to predict
        # delete label column to prove we're not cheating
        test_data_nolab = test_data.drop(labels=[label_column], axis=1)
        y_pred = predictor.predict(test_data_nolab)
        metrics["auto-gluon"] = predictor.evaluate_predictions(
            y_true=y_test, y_pred=y_pred,
            auxiliary_metrics=True)["mean_absolute_error"]

        # Auto-sklearn
        categorical_feature_mask = train.dtypes == object
        categorical_cols = train.columns[categorical_feature_mask].tolist()
        le = LabelEncoder()
        # Note: the encoder is refit for every column (and again on the test
        # set below), so train/test category codes may not line up.
        train[categorical_cols] = train[categorical_cols].apply(
            lambda col: le.fit_transform(col))
        X_train = train.drop(columns=[target]).to_numpy()
        y_train = train[target].to_numpy()
        test = pd.read_csv(test_path)
        test[categorical_cols] = test[categorical_cols].apply(
            lambda col: le.fit_transform(col))
        X_test = test.drop(columns=[target]).to_numpy()
        y_test = test[target].to_numpy()
        automl = autosklearn.regression.AutoSklearnRegressor(
            time_left_for_this_task=120,
            per_run_time_limit=30,
            resampling_strategy='cv',
            resampling_strategy_arguments={'folds': 5},
        )
        automl.fit(X_train.copy(),
                   y_train.copy(),
                   metric=autosklearn.metrics.mean_absolute_error)
        automl.refit(X_train.copy(), y_train.copy())
        predictions = automl.predict(X_test)
        metrics["auto-sklearn"] = sklearn.metrics.mean_absolute_error(
            y_test, predictions)

        # H2O AutoML
        h2o.init()

        train = h2o.import_file(train_path)
        test = h2o.import_file(test_path)
        x = train.columns
        y = target
        x.remove(y)
        aml = H2OAutoML(max_runtime_secs=20, seed=1, sort_metric="mae")
        aml.train(x=x, y=y, training_frame=train)
        metrics["h2o-automl"] = aml.leader.model_performance(test).mae()

        h2o.shutdown()

        # TPOT
        tpot = TPOTRegressor(generations=5,
                             population_size=50,
                             verbosity=2,
                             random_state=42,
                             scoring='neg_mean_absolute_error',
                             cv=5)
        tpot.fit(X_train, y_train)
        metrics["tpot"] = -tpot.score(X_test, y_test)

        best_metric = float("inf")
        best_model = "MODEL"
        for metric in metrics:
            if metrics[metric] < best_metric:
                best_metric = metrics[metric]
                best_model = metric

        print("THE BEST AUTOML TOOL IS " + str(best_model) +
              ", WITH A MAE OF " + str(best_metric) +
              " ACHIEVED BY THE BEST MODEL.")

        return metrics
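The closing scan over metrics can be written more compactly with min(); this is equivalent to the loop above:

best_model = min(metrics, key=metrics.get)
best_metric = metrics[best_model]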
Example #19
from __future__ import print_function
# Currently, our R/Python test suite runs against an established H2O cluster
# (run.py sets up the cluster), so we never exercise the mode of operation
# where the cluster is created by the client. Consequently, we may not catch
# bugs in h2o.init() for that mode. For this ticket, we should create a set of
# tests that check that h2o.init() succeeds for each OS/client interface
# combination.

# Below is the test that will be implemented:

import h2o

# Call h2o.init() in case no instance is running
h2o.init(strict_version_check=False)

# First, shut down any running H2O instance
h2o.shutdown(prompt=False)

# Start a fresh instance
h2o.init(strict_version_check=False)

# Get an H2OConnection() instance
conn = h2o.H2OConnection(ip="localhost", port=54321, start_h2o=True,
                         enable_assertions=True, license=None, nthreads=-1,
                         max_mem_size=None, min_mem_size=None, ice_root=None,
                         strict_version_check=False, proxy=None, https=False,
                         insecure=False, username=None, password=None,
                         max_mem_size_GB=None, min_mem_size_GB=None)


# Check whether the cluster is up (True) or not (False)
cluster_up = conn.cluster_is_up(conn)

# Hacky way to check whether the cluster is healthy; may need a fix in the
# cluster_status() function in h2o.py...
conn.json = h2o.H2OConnection.get_json("Cloud?skip_ticks=true")
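The "hacky" status check above predates the current client API; in recent h2o-py versions the equivalent is roughly:

import h2o
h2o.init(strict_version_check=False)
h2o.cluster().show_status()  # prints cluster health, node count and memory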
Example #20
def RunAutoML():
    file = request.json['file_path']
    max_models = request.json['max_models']
    max_runtime_secs = request.json['max_runtime_secs']
    seed = request.json['seed']
    ip = request.json['ip']
    port = request.json['port']
    nthreads = request.json['nthreads']
    max_mem_size = request.json['max_mem_size']
    target_var = request.json['target_var']
    best_model = request.json['best_model']
    cluster_name = request.json['cluster_name']
    # Existing / new workspace
    h2o.init(ip=ip,
             port=port,
             name=cluster_name,
             nthreads=nthreads,
             max_mem_size=max_mem_size)
    print('Found existing Workspace.')
    data = h2o.import_file(file)
    predictors = list(data.columns)
    predictors.remove(target_var)  # the target must not be used as a predictor
    print(predictors)
    try:
        aml = H2OAutoML(max_models=max_models,
                        max_runtime_secs=max_runtime_secs,
                        seed=seed,
                        exclude_algos=["XGBoost", "DeepLearning"])
        aml.train(x=predictors, y=target_var, training_frame=data)
        #print(aml.leaderboard)
        aml_lb = aml.leaderboard
        print(aml_lb)
        dff = aml_lb.as_data_frame()
        #print(dff)
        # changing index cols with rename()
        dff.rename(index={
            0: "one",
            1: "two",
            2: "three",
            3: "four",
            4: "five"
        },
                   inplace=True)
        dff_json = dff.to_json(orient='index')
        print(dff_json)
        best_model_id = aml.leader.model_id
        var2 = "@" + best_model_id
        # Save the model under the best-model directory
        cwd = 'D:\\DCSAIAUTOML\\BestModels\\h2o'
        model_path = os.path.join(cwd, best_model)
        print(model_path)
        my_model = h2o.save_model(model=aml.leader,
                                  path=model_path,
                                  force=True)
        modelfile = aml.download_mojo(path=model_path, get_genmodel_jar=True)
        print("Model saved to " + modelfile)
        h2o.shutdown(prompt=False)
        return '{} {}'.format(dff_json, var2)

    except Exception as e:
        error_statement = str(e)
        print("Error statement: ", error_statement)
        return error_statement
Example #21
# Note: this snippet assumes as_data_frame() returns rows as lists of strings
# (use_pandas=False in older h2o-py), so test_np[0] is the header row of file
# names and x[i] indexes a column within each prediction row.
test_np = test.as_data_frame()
test_n = len(test_np[0])

pred_np = pred.as_data_frame()

for i in range(1, test_n):
    figure_array = [int(float(x[i])) for x in pred_np]
    W = np.reshape(figure_array, (128, 128))
    data = np.array(W, dtype=np.uint8)
    img = Image.fromarray(data)
    img.save("autoencoded/" + test_np[0][i])

end_time = time.time()
total_time = round(end_time - start_time, 2)
print(total_time)

h2o.shutdown()

#200x400x200            epoch = 10 -> 0.204861111111
#200x400x200            epoch = 100 -> [0.684027777778,0.739583333333] time = [,292.06]
#200x400x200x400        epoch = 100 -> [0.881944444444,0.715277777778] time = [,277.55]
#200,400,200,400,200    epoch = 100 -> 0.524305555556 time = 256.48

#300x500x300            epoch = 100 -> 0.9375           time = 409.3
#400x600x400            epoch = 100 -> 0.944444444444   time = 483.1

#400x600x400x600        epoch = 100 -> [0.989583333333,0.986111111111]   time = [552.86,460.22]
#400x600x400x600x600    epoch = 100 -> 0.913194444444   time = 488.3

#500x600x500x600        epoch = 100 -> 0.986111111111   time = 481.0