def mojo_predict_pandas_test(sandbox_dir):
    data = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    input_csv = "%s/in.csv" % sandbox_dir
    pdf = data[1, 2:]
    h2o.export_file(pdf, input_csv)

    data[1] = data[1].asfactor()
    model = H2OGradientBoostingEstimator(distribution="bernoulli")
    model.train(x=[2, 3, 4, 5, 6, 7, 8], y=1, training_frame=data)
    h2o_prediction = model.predict(pdf)

    # download mojo
    model_zip_path = os.path.join(sandbox_dir, 'model.zip')
    genmodel_path = os.path.join(sandbox_dir, 'h2o-genmodel.jar')
    download_mojo(model, model_zip_path)
    assert os.path.isfile(model_zip_path)
    assert os.path.isfile(genmodel_path)

    pandas_frame = pandas.read_csv(input_csv)
    mojo_prediction = h2o.mojo_predict_pandas(dataframe=pandas_frame, mojo_zip_path=model_zip_path,
                                              genmodel_jar_path=genmodel_path)

    print("Binomial Prediction (Binary) - p0: %f" % h2o_prediction[0, 1])
    print("Binomial Prediction (Binary) - p1: %f" % h2o_prediction[0, 2])
    print("Binomial Prediction (MOJO) - p0: %f" % mojo_prediction['0'].iloc[0])
    print("Binomial Prediction (MOJO) - p1: %f" % mojo_prediction['1'].iloc[0])

    assert h2o_prediction[0, 1] == mojo_prediction['0'].iloc[0], "expected predictions to be the same for binary and MOJO model - p0"
    assert h2o_prediction[0, 2] == mojo_prediction['1'].iloc[0], "expected predictions to be the same for binary and MOJO model - p1"
def mojo_predict_pandas_test(sandbox_dir):
    data = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    input_csv = "%s/in.csv" % sandbox_dir
    pdf = data[1, 2:]
    h2o.export_file(pdf, input_csv)

    data[1] = data[1].asfactor()
    model = H2OGradientBoostingEstimator(distribution="bernoulli")
    model.train(x=[2, 3, 4, 5, 6, 7, 8], y=1, training_frame=data)
    h2o_prediction = model.predict(pdf)

    # download mojo
    model_zip_path = os.path.join(sandbox_dir, 'model.zip')
    genmodel_path = os.path.join(sandbox_dir, 'h2o-genmodel.jar')
    download_mojo(model, model_zip_path)
    assert os.path.isfile(model_zip_path)
    assert os.path.isfile(genmodel_path)

    pandas_frame = pandas.read_csv(input_csv)
    mojo_prediction = h2o_utils.mojo_predict_pandas(dataframe=pandas_frame, mojo_zip_path=model_zip_path,
                                                    genmodel_jar_path=genmodel_path)

    print("Binomial Prediction (Binary) - p0: %f" % h2o_prediction[0, 1])
    print("Binomial Prediction (Binary) - p1: %f" % h2o_prediction[0, 2])
    print("Binomial Prediction (MOJO) - p0: %f" % mojo_prediction['0'].iloc[0])
    print("Binomial Prediction (MOJO) - p1: %f" % mojo_prediction['1'].iloc[0])

    assert h2o_prediction[0, 1] == mojo_prediction['0'].iloc[0], "expected predictions to be the same for binary and MOJO model - p0"
    assert h2o_prediction[0, 2] == mojo_prediction['1'].iloc[0], "expected predictions to be the same for binary and MOJO model - p1"
def test_hdfs_io():
    '''
    Test H2O read and write to hdfs
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    print("Importing hdfs data")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/airlines/airlines_all.05p.csv")

    print("Splitting data")
    for c in ["Month", "DayofMonth", "IsArrDelayed"]:
        h2o_data[c] = h2o_data[c].asfactor()
    myX = ["Month", "DayofMonth", "Distance"]
    train, test = h2o_data.split_frame(ratios=[0.9])

    print("Exporting file to hdfs")
    h2o.export_file(test[:, ["Year", "DayOfWeek"]], "hdfs://" + hdfs_name_node + "/datasets/exported.csv")

    print("Reading file back in and comparing if data is the same")
    new_test = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/exported.csv")
    assert (test[:, "DayOfWeek"] - new_test[:, "DayOfWeek"]).sum() == 0

    print("Training")
    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    h2o_glm.train(x=myX, y="IsArrDelayed", training_frame=train)  # don't need to train on all features

    hdfs_model_path = os.getenv("MODEL_PATH")
    print("Saving model")
    new_model_path = h2o.save_model(h2o_glm, "hdfs://" + hdfs_name_node + "/" + hdfs_model_path)
    print("Loading back model")
    new_model = h2o.load_model(new_model_path)
    print("Running predictions")
    preds = new_model.predict(test)
def export_file():
    pros_hex = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    pros_hex[1] = pros_hex[1].asfactor()
    pros_hex[3] = pros_hex[3].asfactor()
    pros_hex[4] = pros_hex[4].asfactor()
    pros_hex[5] = pros_hex[5].asfactor()
    pros_hex[8] = pros_hex[8].asfactor()

    p_sid = pros_hex.runif()
    pros_train = pros_hex[p_sid > 0.2, :]
    pros_test = pros_hex[p_sid <= 0.2, :]

    glm = H2OGeneralizedLinearEstimator(family="binomial")
    myglm = glm.train(x=list(range(2, pros_hex.ncol)), y=1, training_frame=pros_train)
    mypred = glm.predict(pros_test)

    def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
        return "".join(random.choice(chars) for _ in range(size))

    fname = id_generator() + "_prediction.csv"
    path = pyunit_utils.locate("results")
    dname = path + "/" + fname

    h2o.export_file(mypred, dname)

    py_pred = pd.read_csv(dname)
    print(py_pred.head())
    h_pred = mypred.as_data_frame(True)
    print(h_pred.head())

    # Test to check if py_pred & h_pred are identical
    assert_frame_equal(py_pred, h_pred)
def h2oexport_file():
    """
    Python API test: h2o.export_file(frame, path, force=False, parts=1).
    Note that force=True is only honored if parts=1.  Otherwise, an error will be thrown.
    """
    training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
    try:
        results_dir = pyunit_utils.locate("results")  # find directory path to results folder
        final_path = os.path.join(results_dir, 'frameData')
        h2o.export_file(training_data, final_path, force=True, parts=1)  # save data
        assert os.path.isfile(final_path), "h2o.export_file() command is not working."
        final_dir_path = os.path.join(results_dir, 'multiFrame')
        h2o.export_file(training_data, final_dir_path, force=True, parts=-1)
        assert len(os.listdir(final_dir_path)) > 0, "h2o.export_file() command is not working."
    except Exception as e:
        if e.__class__.__name__ == 'ValueError' and 'File not found' in e.args[0]:
            print("Directory is not writable. h2o.export_file() command is not tested.")
        else:
            assert e.__class__.__name__ == 'H2OResponseError' and \
                   'exportFrame: Cannot use path' in e.args[0]._props['dev_msg'], \
                   "h2o.export_file() command is not working."
            print("Directory: {0} is not empty. Delete or empty it before re-run. h2o.export_file() "
                  "is not tested with multi-part export.".format(final_dir_path))
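The docstring above spells out the force/parts contract of h2o.export_file. A minimal sketch of the two modes, assuming a running cluster and an existing H2OFrame named df (the paths are placeholders; the part-m-* names match the pattern the multipart test below globs for):

import os
import h2o

# assumes h2o.init() has already been called and df is an H2OFrame
h2o.export_file(df, "/tmp/frame.csv", force=True)   # parts=1 (default): one CSV; force=True overwrites it
h2o.export_file(df, "/tmp/frame_parts", parts=-1)   # parts=-1: a directory of part files, written in parallel
print(os.listdir("/tmp/frame_parts"))               # e.g. ['part-m-00000', 'part-m-00001', ...]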
def export_file_multipart():
    pros_hex = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    pros_hex[1] = pros_hex[1].asfactor()
    pros_hex[3] = pros_hex[3].asfactor()
    pros_hex[4] = pros_hex[4].asfactor()
    pros_hex[5] = pros_hex[5].asfactor()
    pros_hex[8] = pros_hex[8].asfactor()

    p_sid = pros_hex.runif()
    pros_train = pros_hex[p_sid > .2, :]
    pros_test = pros_hex[p_sid <= .2, :]

    glm = H2OGeneralizedLinearEstimator(family="binomial")
    myglm = glm.train(x=list(range(2, pros_hex.ncol)), y=1, training_frame=pros_train)
    mypred = glm.predict(pros_test)

    def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
        return ''.join(random.choice(chars) for _ in range(size))

    path = pyunit_utils.locate("results")
    dname = os.path.join(path, id_generator() + "_prediction")

    h2o.export_file(mypred, dname, parts=-1)

    assert os.path.isdir(dname)
    part_files = glob.glob(os.path.join(dname, "part-m-?????"))
    print(part_files)
    py_pred = pd.concat((pd.read_csv(f) for f in part_files))
    print(py_pred.head())
    h_pred = mypred.as_data_frame(True)
    print(h_pred.head())

    # Test to check if py_pred & h_pred are identical
    assert_frame_equal(py_pred, h_pred)
def s3_import_export():
    local_frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    for scheme in ["s3a"]:  # s3n is deprecated since HDP3/CDH6
        timestamp = datetime.today().utcnow().strftime("%Y%m%d-%H%M%S.%f")
        unique_suffix = str(uuid.uuid4())
        s3_path = scheme + "://test.0xdata.com/h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
                  timestamp + "." + unique_suffix + ".csv.zip"
        h2o.export_file(local_frame, s3_path)

        s3 = boto3.resource('s3')
        client = boto3.client('s3')
        # S3 might have a delay in indexing the file (usually milliseconds or hundreds of milliseconds).
        # Wait for the file to be available; if it is not available in the beginning, retry every 2 seconds, up to 10 times.
        client.get_waiter('object_exists').wait(Bucket='test.0xdata.com',
                                                Key="h2o-hadoop-tests/test-export/" + scheme + "/exported." +
                                                    timestamp + "." + unique_suffix + ".csv.zip",
                                                WaiterConfig={
                                                    'Delay': 2,
                                                    'MaxAttempts': 10
                                                })
        s3_frame = h2o.import_file(s3_path)
        assert_frame_equal(local_frame.as_data_frame(), s3_frame.as_data_frame())

        s3.Object(bucket_name='test.0xdata.com',
                  key="h2o-hadoop-tests/test-export/" + scheme + "/exported." +
                      timestamp + "." + unique_suffix + ".csv.zip").delete()
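Several of the S3 round-trip tests in this collection repeat the same consistency wait inline; a small helper could factor it out. This is only a sketch built on boto3's object_exists waiter; the function name and default retry settings are assumptions, not part of any of the tests above:

import boto3

def wait_for_s3_object(bucket, key, delay=2, max_attempts=10):
    """Block until the object is visible in S3 (covers eventual-consistency lag)."""
    client = boto3.client('s3')
    client.get_waiter('object_exists').wait(
        Bucket=bucket,
        Key=key,
        WaiterConfig={'Delay': delay, 'MaxAttempts': max_attempts})

# usage: wait_for_s3_object('test.0xdata.com', 'h2o-hadoop-tests/test-export/s3a/exported.<timestamp>.csv.zip')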
def local_and_hdfs_frame_equality():
    local_frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    hdfs_path = 'hdfs:///user/jenkins/tests/prostate_export'
    h2o.export_file(local_frame, hdfs_path, force=True)
    hdfs_frame = h2o.import_file(hdfs_path)
    assert_frame_equal(local_frame.as_data_frame(), hdfs_frame.as_data_frame())
def save_all_frames(self, path, overwrite=False):
    """Save all frames to a directory.

    :param path: String path, where to save your frames.
    :param overwrite: boolean, overwrite an existing export file
    """
    models = []
    for f in h2o.ls()['key']:
        if 'modelmetrics' not in f:
            try:
                fh = h2o.get_frame(f)
            except (h2o.exceptions.H2OResponseError, h2o.exceptions.H2OServerError):
                pass
            else:
                try:  # quick and dirty solution for NoneType
                    fh.frame_id
                except:
                    pass
                else:
                    print(fh.frame_id)
                    print("Save frame " + fh.frame_id + " to " + path + "/" + fh.frame_id)
                    h2o.export_file(fh, path=path + os.sep + fh.frame_id, force=overwrite)
def mojo_predict_pandas_test(sandbox_dir):
    data = h2o.import_file(path=pyunit_utils.locate("smalldata/coxph_test/heart.csv"))
    input_csv = "%s/in.csv" % sandbox_dir
    output_csv = "%s/prediction.csv" % sandbox_dir
    h2o.export_file(data, input_csv)

    data['transplant'] = data['transplant'].asfactor()
    model = H2OCoxProportionalHazardsEstimator(stratify_by=["transplant"], start_column="start", stop_column="stop")
    model.train(x=["age", "surgery", "transplant"], y="event", training_frame=data)
    h2o_prediction = model.predict(data)

    # download mojo
    model_zip_path = os.path.join(sandbox_dir, 'model.zip')
    genmodel_path = os.path.join(sandbox_dir, 'h2o-genmodel.jar')
    download_mojo(model, model_zip_path)
    assert os.path.isfile(model_zip_path)
    assert os.path.isfile(genmodel_path)

    pandas_frame = pandas.read_csv(input_csv)
    mojo_prediction = h2o.mojo_predict_pandas(dataframe=pandas_frame, mojo_zip_path=model_zip_path,
                                              genmodel_jar_path=genmodel_path)

    assert len(mojo_prediction) == h2o_prediction.nrow
    assert_frame_equal(h2o_prediction.as_data_frame(use_pandas=True), mojo_prediction, check_dtype=False)
def test_hadoop():
    '''
    Test H2O read and write to hdfs
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    print("Importing hdfs data")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/100k.csv")

    print("Splitting data")
    train, test = h2o_data.split_frame(ratios=[0.9])

    print("Exporting file to hdfs")
    h2o.export_file(test[:, 0:2], "hdfs://" + hdfs_name_node + "/datasets/exported.csv")

    print("Reading file back in and comparing if data is the same")
    new_test = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/exported.csv")
    assert (test[:, 1] - new_test[:, 1]).sum() == 0

    print("Training")
    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    h2o_glm.train(x=list(range(1, 10)), y=0, training_frame=train)  # don't need to train on all features

    hdfs_model_path = os.getenv("MODEL_PATH")
    print("Saving model")
    new_model_path = h2o.save_model(h2o_glm, "hdfs://" + hdfs_name_node + "/" + hdfs_model_path)
    print("Loading back model")
    new_model = h2o.load_model(new_model_path)
    print("Running predictions")
    preds = new_model.predict(test)
def s3_import_export():
    local_frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    for scheme in ["s3n", "s3a"]:
        timestamp = datetime.today().utcnow().strftime("%Y%m%d-%H%M%S")
        s3_path = scheme + "://test.0xdata.com/h2o-hadoop-tests/test-export/" + scheme + "/exported." + timestamp + ".csv.zip"
        h2o.export_file(local_frame, s3_path)
        s3_frame = h2o.import_file(s3_path)
        assert_frame_equal(local_frame.as_data_frame(), s3_frame.as_data_frame())
def test_stacked_ensemble_is_able_to_use_imported_base_models():
    import tempfile, shutil, glob

    train = h2o.import_file(pu.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(pu.locate("smalldata/iris/iris_test.csv"))
    x = train.columns
    y = "species"
    x.remove(y)

    nfolds = 2
    gbm = H2OGradientBoostingEstimator(nfolds=nfolds, fold_assignment="Modulo", keep_cross_validation_predictions=True)
    gbm.train(x=x, y=y, training_frame=train)
    drf = H2ORandomForestEstimator(nfolds=nfolds, fold_assignment="Modulo", keep_cross_validation_predictions=True)
    drf.train(x=x, y=y, training_frame=train)

    se = H2OStackedEnsembleEstimator(training_frame=train, validation_frame=test,
                                     base_models=[gbm.model_id, drf.model_id])
    se.train(x=x, y=y, training_frame=train)

    assert len(se.base_models) == 2

    TMP_DIR = tempfile.mkdtemp()
    try:
        h2o.save_model(gbm, TMP_DIR + "/gbm.model")
        h2o.save_model(drf, TMP_DIR + "/drf.model")

        gbm_holdout_id = gbm.cross_validation_holdout_predictions().frame_id
        drf_holdout_id = drf.cross_validation_holdout_predictions().frame_id
        h2o.export_file(gbm.cross_validation_holdout_predictions(), TMP_DIR + "/gbm.holdout")
        h2o.export_file(drf.cross_validation_holdout_predictions(), TMP_DIR + "/drf.holdout")

        h2o.remove_all()

        h2o.import_file(TMP_DIR + "/gbm.holdout", gbm_holdout_id)
        h2o.import_file(TMP_DIR + "/drf.holdout", drf_holdout_id)

        gbm = h2o.upload_model(glob.glob(TMP_DIR + "/gbm.model/*")[0])
        drf = h2o.upload_model(glob.glob(TMP_DIR + "/drf.model/*")[0])

        train = h2o.import_file(pu.locate("smalldata/iris/iris_train.csv"), "some_other_name_of_training_frame")
        test = h2o.import_file(pu.locate("smalldata/iris/iris_test.csv"), "some_other_name_of_test_frame")
        x = train.columns
        y = "species"
        x.remove(y)

        se_loaded = H2OStackedEnsembleEstimator(training_frame=train, validation_frame=test,
                                                base_models=[gbm.model_id, drf.model_id])
        se_loaded.train(x=x, y=y, training_frame=train)

        assert len(se_loaded.base_models) == 2
    finally:
        shutil.rmtree(TMP_DIR)
def s3_import_export(scheme):
    local_frame = h2o.import_file("/home/0xdiag/smalldata/logreg/prostate.csv")
    timestamp = datetime.today().utcnow().strftime("%Y%m%d-%H%M%S")
    unique_suffix = str(uuid.uuid4())
    s3_path = scheme + "://test.0xdata.com/h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
              timestamp + "." + unique_suffix + ".csv.zip"
    h2o.export_file(local_frame, s3_path)
    s3_frame = h2o.import_file(s3_path)
    assert_frame_equal(local_frame.as_data_frame(), s3_frame.as_data_frame())
def s3_import_export(self, scheme):
    local_frame = h2o.import_file("/home/0xdiag/smalldata/logreg/prostate.csv")
    timestamp = datetime.today().utcnow().strftime("%Y%m%d-%H%M%S")
    unique_suffix = str(uuid.uuid4())
    s3_path = scheme + "://test.0xdata.com/h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
              timestamp + "." + unique_suffix + ".csv.zip"
    h2o.export_file(local_frame, s3_path)
    s3_frame = h2o.import_file(s3_path)
    assert_frame_equal(local_frame.as_data_frame(), s3_frame.as_data_frame())
def predict(self):
    pr_big = self.model2.predict(self.data)
    pr_big = pr_big.sort('p1', ascending=False)
    df_short_ = pr_big[:1000000, :]
    df_short_ = df_short_['msisdn']
    h2o.export_file(df_short_, 'hdfs://T2-HDFS-HA-PROD/user/andrey.lukyanenko/exported.csv')
def mojo_predict_api_test(sandbox_dir):
    data = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    input_csv = "%s/in.csv" % sandbox_dir
    output_csv = "%s/prediction.csv" % sandbox_dir
    h2o.export_file(data[1, 2:], input_csv)

    data[1] = data[1].asfactor()
    model = H2OGradientBoostingEstimator(distribution="bernoulli")
    model.train(x=[2, 3, 4, 5, 6, 7, 8], y=1, training_frame=data)

    # download mojo
    model_zip_path = os.path.join(sandbox_dir, 'model.zip')
    genmodel_path = os.path.join(sandbox_dir, 'h2o-genmodel.jar')
    download_mojo(model, model_zip_path)
    assert os.path.isfile(model_zip_path)
    assert os.path.isfile(genmodel_path)

    # test that we can predict using default paths
    h2o.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path, verbose=True)
    h2o.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path,
                         genmodel_jar_path=genmodel_path, verbose=True)
    assert os.path.isfile(output_csv)
    os.remove(model_zip_path)
    os.remove(genmodel_path)
    os.remove(output_csv)

    # test that we can predict using custom genmodel path
    other_sandbox_dir = tempfile.mkdtemp()
    try:
        genmodel_path = os.path.join(other_sandbox_dir, 'h2o-genmodel-custom.jar')
        download_mojo(model, model_zip_path, genmodel_path)
        assert os.path.isfile(model_zip_path)
        assert os.path.isfile(genmodel_path)
        try:
            h2o.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path, verbose=True)
            assert False, "There should be no h2o-genmodel.jar at %s" % sandbox_dir
        except RuntimeError:
            pass
        assert not os.path.isfile(output_csv)
        h2o.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path,
                             genmodel_jar_path=genmodel_path, verbose=True)
        assert os.path.isfile(output_csv)
        os.remove(output_csv)

        output_csv = "%s/out.prediction" % other_sandbox_dir

        # test that we can predict using a custom output path
        h2o.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path,
                             genmodel_jar_path=genmodel_path, verbose=True, output_csv_path=output_csv)
        assert os.path.isfile(output_csv)
        os.remove(model_zip_path)
        os.remove(genmodel_path)
        os.remove(output_csv)
    finally:
        shutil.rmtree(other_sandbox_dir)
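The default-path behavior the test above exercises, reduced to its minimal form. A sketch only, assuming a MOJO zip and h2o-genmodel.jar already sit on disk under the placeholder names below; by default the prediction CSV lands next to the input file, which is what the output_csv assertions above check:

import h2o

# returns the prediction rows as a list of dicts and, with no
# output_csv_path given, writes prediction.csv into the input's directory
rows = h2o.mojo_predict_csv(input_csv_path="in.csv",
                            mojo_zip_path="model.zip",
                            genmodel_jar_path="h2o-genmodel.jar")
print(rows[0])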
def mojo_predict_api_test(sandbox_dir):
    data = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    input_csv = "%s/in.csv" % sandbox_dir
    output_csv = "%s/prediction.csv" % sandbox_dir
    h2o.export_file(data[1, 2:], input_csv)

    data[1] = data[1].asfactor()
    model = H2OGradientBoostingEstimator(distribution="bernoulli")
    model.train(x=[2, 3, 4, 5, 6, 7, 8], y=1, training_frame=data)

    # download mojo
    model_zip_path = os.path.join(sandbox_dir, 'model.zip')
    genmodel_path = os.path.join(sandbox_dir, 'h2o-genmodel.jar')
    download_mojo(model, model_zip_path)
    assert os.path.isfile(model_zip_path)
    assert os.path.isfile(genmodel_path)

    # test that we can predict using default paths
    h2o_utils.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path, verbose=True)
    h2o_utils.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path,
                               genmodel_jar_path=genmodel_path, verbose=True)
    assert os.path.isfile(output_csv)
    os.remove(model_zip_path)
    os.remove(genmodel_path)
    os.remove(output_csv)

    # test that we can predict using custom genmodel path
    other_sandbox_dir = tempfile.mkdtemp()
    try:
        genmodel_path = os.path.join(other_sandbox_dir, 'h2o-genmodel-custom.jar')
        download_mojo(model, model_zip_path, genmodel_path)
        assert os.path.isfile(model_zip_path)
        assert os.path.isfile(genmodel_path)
        try:
            h2o_utils.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path, verbose=True)
            assert False, "There should be no h2o-genmodel.jar at %s" % sandbox_dir
        except RuntimeError:
            pass
        assert not os.path.isfile(output_csv)
        h2o_utils.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path,
                                   genmodel_jar_path=genmodel_path, verbose=True)
        assert os.path.isfile(output_csv)
        os.remove(output_csv)

        output_csv = "%s/out.prediction" % other_sandbox_dir

        # test that we can predict using a custom output path
        h2o_utils.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path,
                                   genmodel_jar_path=genmodel_path, verbose=True, output_csv_path=output_csv)
        assert os.path.isfile(output_csv)
        os.remove(model_zip_path)
        os.remove(genmodel_path)
        os.remove(output_csv)
    finally:
        shutil.rmtree(other_sandbox_dir)
def s3_import_export():
    local_frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    for scheme in ["s3n", "s3a"]:
        timestamp = datetime.today().utcnow().strftime("%Y%m%d-%H%M%S")
        unique_suffix = str(uuid.uuid4())
        s3_path = scheme + "://test.0xdata.com/h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
                  timestamp + "." + unique_suffix + ".csv.zip"
        h2o.export_file(local_frame, s3_path)
        s3_frame = h2o.import_file(s3_path)
        assert_frame_equal(local_frame.as_data_frame(), s3_frame.as_data_frame())
def impute_data(method = "mean", to_impute = to_impute, predictors = predictors): if method == "mean": print "Mean imputing missing data for predictors:", to_impute # find mean for each time period in data for each predictor, save them in a matrix with a col for the mean values of each predictor # then on holdout use this table to fill in all missing values based on the time period (row) and the variable (col) of this matrix #if using python module h2o-3.1.0.3131: grouped = data.group_by(["time_period"]) # gm = [grouped.mean(predictor, na="rm").get_frame() for predictor in to_impute] gm = d["time_period"].unique() print "Finding means..." for predictor in to_impute: gm = gm.cbind(d.group_by(["time_period"], {predictor:["mean", d.names().index(predictor), "rm"]}, order_by = 0)) gm.show() print "Saving the imputation means to disk..." h2o.download_csv(gm, filename = saving_means_fp) # df_py = h2o.as_list(gm) # Now that's stored for the holdout data, do this a faster way in java for the training data: for predictor in to_impute: d.impute(predictor, method='mean', by = ['time_period'], inplace = True) print "Done imputing", predictor print "Saving the final mean imputed data to disk..." h2o.export_file(frame = d, path =saving_meanImputed_fp, force=True) if method == "model": # sequentially impute 'newdata', not 'data', so the order of the predictor variables in the loop does not matter # otherwise, you would be using increasingly imputed data to make predictions as the loop progresses. newdata = d # With training data, build a model for each col and predict missing data, save the models, use them on the holdout data to predict all missing data. for predictor in to_impute: print "Building model for imputing " + predictor print "Subsetting the data into missing values for predictor and no missing values for predictor" na_ind = d[predictor].isna() not_na_ind = na_ind != 1.0 to_train = d[not_na_ind] to_predict = d[na_ind] these_var = [var for var in predictors if var != predictor] trained = h2o.gbm(x = to_train[these_var], y = to_train[[predictor]], ntrees=300, max_depth=6, learn_rate=0.2) print "Saving the imputation tree model for " + predictor h2o.save_model(trained, dir = saving_models_fp, name = "dl_imputation_model_" + predictor) print "Imputing the missing " + predictor + " data by predicting with the model..." predicted = trained.predict(to_predict[these_var]) tofillin = newdata[predictor] assert len(predicted) == len(tofillin[na_ind]) tofillin[na_ind] = predicted # mutate the column in place newdata[predictor] = tofillin print "Saving the final model-imputed data to disk..." h2o.export_file(frame = d, path =saving_modelImputed_fp, force=True)
def s3_import_export():
    local_frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    for scheme in ["s3a"]:  # s3n is deprecated since HDP3/CDH6
        timestamp = datetime.today().utcnow().strftime("%Y%m%d-%H%M%S")
        unique_suffix = str(uuid.uuid4())
        s3_path = scheme + "://test.0xdata.com/h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
                  timestamp + "." + unique_suffix + ".csv.zip"
        h2o.export_file(local_frame, s3_path)
        s3_frame = h2o.import_file(s3_path)
        assert_frame_equal(local_frame.as_data_frame(), s3_frame.as_data_frame())
def main():
    os.environ['NO_PROXY'] = 'localhost'
    # Start H2O on your local machine
    h2o.init()
    recall = 10
    for i in range(10):
        new_recall, test_frame = model_build()
        if new_recall > recall:
            recall = new_recall
            h2o.export_file(test_frame, "/home/wso2123/My Work/Datasets/Breast cancer wisconsin/test.csv", force=True)
    print(recall)
def dump_results(self):
    model_ids = list(self.aml.leaderboard['model_id'].as_data_frame().iloc[:, 0])
    for m_id in model_ids:
        mdl = h2o.get_model(m_id)
        h2o.save_model(model=mdl, path=self.logdir, force=True)
    h2o.export_file(self.aml.leaderboard, osp.join(self.logdir, 'aml_leaderboard.h2o'), force=True)
def test_export_orc_hdfs(self):
    fr = h2o.import_file(path="hdfs://127.0.0.1/user/jenkins/prostate_NA.orc", header=1)
    export_path = "hdfs://127.0.0.1/user/jenkins/prostate_NA_export.orc"
    failure = False
    try:
        h2o.export_file(frame=fr, path=export_path, force=True)
    except:
        failure = True
    assert not failure

    imported = h2o.import_file(path=export_path, header=1)
    assert imported.ncol == fr.ncol
    assert imported.nrow == fr.nrow
def test_export_parquet_hdfs(self):
    fr = h2o.import_file(path="hdfs://127.0.0.1/user/jenkins/airlines-simple.snappy.parquet", header=1)
    export_path = "hdfs://127.0.0.1/user/jenkins/airlines-simple.snappy_export.parquet"
    failure = False
    try:
        h2o.export_file(frame=fr, path=export_path, force=True)
    except:
        failure = True
    assert not failure

    imported = h2o.import_file(path=export_path, header=1)
    assert imported.ncol == fr.ncol
    assert imported.nrow == fr.nrow
def export_data_to_csv(cls, h2o_frame, export_path=""):
    try:
        h2o.export_file(h2o_frame, export_path)
        return export_path
    except H2OError as e:
        print("Error: {}".format(e))
        print("Trying to fall back to pandas")
        try:
            df = h2o_frame.as_data_frame(use_pandas=True, header=True)
            df.to_csv(export_path, header=True)
            return export_path
        except Exception as e:
            raise Exception("Error: {}, could not export data".format(e))
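The pandas fallback above is effectively a client-side download of the frame; h2o exposes that path directly, so the fallback branch could just as well be a single call. A sketch, assuming the same h2o_frame and export_path as above:

import h2o

# client-side alternative to the pandas fallback: stream the frame
# through the REST API and write it locally in one call
h2o.download_csv(h2o_frame, export_path)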
def mojo_predict_pandas_test(sandbox_dir, stratify_by=None):
    if not os.path.exists(sandbox_dir):
        os.makedirs(sandbox_dir)

    # bunch of random columns to be added to the dataset
    random_cols = ["c1", "c2", "c3", "c4"]

    data = h2o.import_file(pyunit_utils.locate("smalldata/coxph_test/heart.csv"))
    if stratify_by:
        for strat_col in stratify_by:
            data[strat_col] = data[strat_col].asfactor()
    data['surgery'] = data['surgery'].asfactor()
    data_random_local = pandas.DataFrame(np.random.random(size=(data.nrow, len(random_cols))), columns=random_cols)
    data = data.cbind(h2o.H2OFrame(data_random_local))

    model = H2OCoxProportionalHazardsEstimator(stratify_by=stratify_by, start_column="start", stop_column="stop",
                                               interaction_pairs=[("age", "c1"), ("c1", "c2"), ("c3", "age")])
    model.train(x=["age", "surgery", "transplant"] + random_cols, y="event", training_frame=data)
    print(model)

    # reference predictions
    h2o_prediction = model.predict(data)

    assert pyunit_utils.test_java_scoring(model, data, h2o_prediction, 1e-8)

    # download mojo
    mojo = pyunit_utils.download_mojo(model)

    # export new file (including the random columns)
    input_csv = "%s/in.csv" % sandbox_dir
    h2o.export_file(data, input_csv)
    pandas_frame = pandas.read_csv(input_csv)

    mojo_prediction = h2o.mojo_predict_pandas(dataframe=pandas_frame, **mojo)

    assert len(mojo_prediction) == h2o_prediction.nrow
    assert_frame_equal(h2o_prediction.as_data_frame(use_pandas=True), mojo_prediction, check_dtype=False)
def test_export_orc_hdfs(self):
    fr = h2o.import_file(path="hdfs://127.0.0.1/user/jenkins/prostate_NA.orc", header=1)
    export_path = "hdfs://127.0.0.1/user/jenkins/prostate_NA_export.orc"
    failure = False
    try:
        h2o.export_file(frame=fr, path=export_path, force=True)
    except:
        failure = True
    assert not failure

    imported = h2o.import_file(path=export_path, header=1)
    assert imported.ncol == fr.ncol
    assert imported.nrow == fr.nrow
def test_export_xls_hdfs():
    fr = h2o.import_file(path="hdfs://127.0.0.1/user/jenkins/iris.xls", header=1)
    export_path = "hdfs://127.0.0.1/user/jenkins/iris_export.xls"
    failure = False
    try:
        h2o.export_file(frame=fr, path=export_path, force=True)
    except:
        failure = True
    assert not failure

    imported = h2o.import_file(path=export_path, header=1)
    assert imported.ncol == fr.ncol
    assert imported.nrow == fr.nrow
def test_export_parquet_hdfs(self):
    fr = h2o.import_file(path="hdfs://127.0.0.1/user/jenkins/airlines-simple.snappy.parquet", header=1)
    export_path = "hdfs://127.0.0.1/user/jenkins/airlines-simple.snappy_export.parquet"
    failure = False
    try:
        h2o.export_file(frame=fr, path=export_path, force=True)
    except:
        failure = True
    assert not failure

    imported = h2o.import_file(path=export_path, header=1)
    assert imported.ncol == fr.ncol
    assert imported.nrow == fr.nrow
def train(data_path, max_models, model_name):
    train_data, test_data, train_cols = prepare_data(data_path)
    target_col = "quality"
    with mlflow.start_run() as run:
        print("run_id:", run.info.run_id)
        model = H2OAutoML(max_models=max_models, max_runtime_secs=300, seed=24, nfolds=6)
        model.train(x=train_cols, y=target_col, training_frame=train_data, validation_frame=test_data)

        mlflow.log_param("max_models", max_models)
        mlflow.log_metric("rmse", model.leader.rmse())

        mlflow.set_tag("mlflow_version", mlflow.__version__)
        mlflow.set_tag("h2o_version", h2o.__version__)
        mlflow.set_tag("model.leader.class", qname(model.leader.__class__))
        mlflow.set_tag("model.leader.estimator_type", model.leader._estimator_type)
        mlflow.set_tag("num_leaderboard_models", model.leaderboard.nrows)

        lb = get_leaderboard(model, extra_columns='ALL')
        print(lb)

        path = "leaderboard.csv"
        h2o.export_file(lb, path=path, force=True)
        mlflow.log_artifact(path)

        from tabulate import tabulate
        df = lb.as_data_frame()
        table = tabulate(df, headers="keys", tablefmt="psql", showindex=False)
        path = "leaderboard.txt"
        with open(path, "w") as f:
            f.write(table)
        mlflow.log_artifact(path)

        df = df[["model_id"]]
        with open("models.csv", "w") as f:
            df.to_csv(f, index=False, header=False)
        mlflow.log_artifact("models.csv")

        mlflow.h2o.log_model(model.leader, "h2o-model", registered_model_name=model_name)
def predictions(mod, test, test_X, run_id, allV, slice_no=0):
    # test = h2o.import_file(data)
    if slice_no > 0:
        test = test[0:slice_no, :]
        test_X = test_X[0:slice_no, :]
    if allV is not None:
        ivd = get_all_variables_csv(allV)
        X = check_all_variables(test, ivd, y)

    mod_perf = mod.model_performance(test)
    stats_test = {}
    stats_test = model_performance_stats(mod_perf)
    n = run_id + '_test_stats.json'
    dict_to_json(stats_test, n)

    try:
        cf = mod_perf.confusion_matrix(metrics=["f1", "f2", "f0point5", "accuracy", "precision", "recall",
                                                "specificity", "absolute_mcc", "min_per_class_accuracy",
                                                "mean_per_class_accuracy"])
        cf_df = cf[0].table.as_data_frame()
        cf_df.to_csv(run_id + '_test_confusion_matrix.csv')
    except:
        pass

    predictions = mod.predict(test_X)
    predictions_df = None
    try:
        seq = h2o.H2OFrame.from_python(np.arange(1, (test.shape[0] + 1)).tolist(), column_names=['Seer Row ID'])
        test_id = seq.cbind(test)
        predictions_df = test_id.cbind(predictions)
    except:
        try:
            seq = h2o.H2OFrame.from_python(np.arange(1, (test_X.shape[0] + 1)).tolist(), column_names=['Seer Row ID'])
            test_id = seq.cbind(test_X)
            predictions_df = test_id.cbind(predictions)
        except:
            pass

    n = run_id + '_predictions.csv'
    h2o.export_file(predictions_df, n)  # Karan's changes
    return
def s3_import_export():
    local_frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    for scheme in ["s3n", "s3a"]:
        timestamp = datetime.today().utcnow().strftime("%Y%m%d-%H%M%S")
        unique_suffix = str(uuid.uuid4())
        s3_path = scheme + "://test.0xdata.com/h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
                  timestamp + "." + unique_suffix + ".csv.zip"
        h2o.export_file(local_frame, s3_path)
        s3_frame = h2o.import_file(s3_path)
        assert_frame_equal(local_frame.as_data_frame(), s3_frame.as_data_frame())

        # Delete the file afterwards
        s3 = boto3.resource('s3')
        s3.Object(bucket_name='test.0xdata.com',
                  key="h2o-hadoop-tests/test-export/" + scheme + "/exported." +
                      timestamp + "." + unique_suffix + ".csv.zip").delete()
def export_time():
    pros_hex = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/citibike-nyc/2013-07.csv"))

    def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
        return ''.join(random.choice(chars) for _ in range(size))

    fname = id_generator() + "_prediction.csv"
    path = pyunit_utils.locate("results")
    dname = path + "/" + fname

    start = time.time()
    h2o.export_file(pros_hex, dname)
    end = time.time()
    time_to_export = end - start
    print("Time to export is", time_to_export, "seconds.")
    assert time_to_export > 2, "File export happened instantly (less than 2 seconds). Please check if h2o.export_file() properly exported the file"
def export_custom_separator():
    data = {'col1': [1, 2], 'col2': [3, 4]}
    expected = pd.DataFrame(data=data)
    prostate = h2o.H2OFrame(expected)

    target_default = path.join(pyunit_utils.locate("results"), "export_file_default_sep.csv")
    target_custom = path.join(pyunit_utils.locate("results"), "export_file_custom_sep.csv")

    h2o.export_file(prostate, target_default)
    h2o.export_file(prostate, target_custom, sep="|")

    parsed_default = pd.read_csv(target_default, sep=",")
    parsed_custom = pd.read_csv(target_custom, sep="|")

    assert expected.equals(parsed_default)
    assert expected.equals(parsed_custom)
def gbm_on_hadoop():
    local_frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    hdfs_path = 'hdfs:///user/jenkins/tests/prostate_export'
    h2o.export_file(local_frame, hdfs_path, force=True)
    df = h2o.import_file(hdfs_path)

    train = df.drop("ID")
    vol = train['VOL']
    vol[vol == 0] = None
    gle = train['GLEASON']
    gle[gle == 0] = None
    train['CAPSULE'] = train['CAPSULE'].asfactor()

    my_gbm = H2OGradientBoostingEstimator(ntrees=50, learn_rate=0.1, distribution="bernoulli")
    my_gbm.train(x=list(range(1, train.ncol)), y="CAPSULE", training_frame=train, validation_frame=train)
    my_gbm.predict(train)
def calculate(self):
    # Get the values entered by the user into the
    # ntrees, min_rows and max_depth entry widgets.
    ntrees_loc = int(self.ntrees_entry.get())
    min_rows_loc = int(self.min_rows_entry.get())
    max_depth_loc = int(self.max_depth_entry.get())
    self.ntrees.set(ntrees_loc)
    self.min_rows.set(min_rows_loc)
    self.max_depth.set(max_depth_loc)

    # Import the titanic dataset into H2O:
    titanic = h2o.import_file("train.csv")
    test = h2o.import_file("test.csv")

    # Set the predictors and response;
    # set the response as a factor:
    titanic["Survived"] = titanic["Survived"].asfactor()
    predictors = ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
    response = "Survived"

    # Split the dataset into a train and valid set:
    ratio = float(self.slider.get())
    train, valid = titanic.split_frame(ratios=[ratio], seed=1234)

    # Build and train the model:
    titanic_drf = H2ORandomForestEstimator(ntrees=ntrees_loc, max_depth=max_depth_loc, min_rows=min_rows_loc,
                                           calibrate_model=True, calibration_frame=valid, binomial_double_trees=True)
    titanic_drf.train(x=predictors, y=response, training_frame=train, validation_frame=valid)

    # Eval performance:
    perf = titanic_drf.model_performance()

    # Generate predictions on a validation set (if necessary):
    pred = titanic_drf.predict(valid)
    pred2 = titanic_drf.predict(test)

    self.auc.set(perf.auc())
    self.logloss.set(perf.logloss())
    # print(pred2)
    h2o.export_file(pred, 'result_pred.csv')
    h2o.export_file(pred2, 'result_pred2.csv')
def gbm_on_hadoop():
    local_frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    hdfs_path = 'hdfs:///user/jenkins/tests/prostate_export'
    h2o.export_file(local_frame, hdfs_path, force=True)
    df = h2o.import_file(hdfs_path)

    train = df.drop("ID")
    vol = train['VOL']
    vol[vol == 0] = None
    gle = train['GLEASON']
    gle[gle == 0] = None
    train['CAPSULE'] = train['CAPSULE'].asfactor()

    my_gbm = H2OGradientBoostingEstimator(ntrees=50, learn_rate=0.1, distribution="bernoulli")
    my_gbm.train(x=list(range(1, train.ncol)), y="CAPSULE", training_frame=train, validation_frame=train)
    my_gbm.predict(train)
def auto(self):
    ntrees_loc = 400
    min_rows_loc = 10
    max_depth_loc = 9
    self.ntrees.set(ntrees_loc)
    self.min_rows.set(min_rows_loc)
    self.max_depth.set(max_depth_loc)

    # Import the titanic dataset into H2O:
    titanic = h2o.import_file("train.csv")
    test = h2o.import_file("test.csv")

    # Set the predictors and response;
    # set the response as a factor:
    titanic["Survived"] = titanic["Survived"].asfactor()
    predictors = ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
    response = "Survived"

    # Split the dataset into a train and valid set:
    train, valid = titanic.split_frame(ratios=[.8], seed=1234)

    # Build and train the model:
    titanic_drf = H2ORandomForestEstimator(ntrees=ntrees_loc, max_depth=max_depth_loc, min_rows=min_rows_loc,
                                           calibrate_model=True, calibration_frame=valid, binomial_double_trees=True)
    titanic_drf.train(x=predictors, y=response, training_frame=train, validation_frame=valid)

    # Eval performance:
    perf = titanic_drf.model_performance()

    # Generate predictions on a validation set (if necessary):
    pred = titanic_drf.predict(valid)  # TODO: also add this to the test data; the same exists in calculate()
    pred2 = titanic_drf.predict(test)
    pred2.describe()
    test.describe()

    self.auc.set(perf.auc())
    self.logloss.set(perf.logloss())
    h2o.export_file(pred, 'result_pred.csv')
    h2o.export_file(pred2, 'result_pred2.csv')
def export_file():
    pros_hex = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    pros_hex[1] = pros_hex[1].asfactor()
    pros_hex[3] = pros_hex[3].asfactor()
    pros_hex[4] = pros_hex[4].asfactor()
    pros_hex[5] = pros_hex[5].asfactor()
    pros_hex[8] = pros_hex[8].asfactor()

    p_sid = pros_hex.runif()
    pros_train = pros_hex[p_sid > .2, :]
    pros_test = pros_hex[p_sid <= .2, :]

    glm = H2OGeneralizedLinearEstimator(family="binomial")
    myglm = glm.train(x=list(range(2, pros_hex.ncol)), y=1, training_frame=pros_train)
    mypred = glm.predict(pros_test)

    def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
        return ''.join(random.choice(chars) for _ in range(size))

    fname = id_generator() + "_prediction.csv"
    path = pyunit_utils.locate("results")
    dname = path + "/" + fname

    h2o.export_file(mypred, dname)

    py_pred = pd.read_csv(dname)
    print(py_pred.head())
    h_pred = mypred.as_data_frame(True)
    print(h_pred.head())

    # Test to check if py_pred & h_pred are identical
    try:
        assert_frame_equal(py_pred, h_pred)
        return True
    except:
        return False
def coxph_mojo_predict_with_interactions(sandbox_dir):
    np.random.seed(1234)
    n_rows = 10
    start = np.random.choice([0, 1, 2, 3, 4], size=10)
    delta = np.random.choice([1, 2, 3], size=10)
    data = {
        "start": start,
        "stop": start + delta,
        "X1": np.random.randn(n_rows),
        "X2": np.random.randn(n_rows),
        "age": np.random.choice(["young", "old"], 10),
        "W": np.random.choice([10, 20], size=n_rows),
        "Offset": np.random.uniform(0, 1, 10),
        "Y": np.random.choice([0, 1], size=n_rows)
    }
    train = h2o.H2OFrame(pandas.DataFrame(data))
    train["age"] = train["age"].asfactor()

    h2o_model = H2OCoxProportionalHazardsEstimator(start_column="start", stop_column="stop", weights_column="W",
                                                   offset_column="Offset", interactions=["X1", "X2"],
                                                   stratify_by=["age"])
    h2o_model.train(x=["X1", "X2", "age"], y="Y", training_frame=train)
    mojo = pyunit_utils.download_mojo(h2o_model)

    # export new file (including the random columns)
    input_csv = "%s/in.csv" % sandbox_dir
    h2o.export_file(train, input_csv)
    pandas_frame = pandas.read_csv(input_csv)

    h2o_prediction = h2o_model.predict(train)
    mojo_prediction = h2o.mojo_predict_pandas(dataframe=pandas_frame, **mojo)

    assert len(mojo_prediction) == h2o_prediction.nrow
    assert_frame_equal(h2o_prediction.as_data_frame(use_pandas=True), mojo_prediction, check_dtype=False)
def h2oexport_file():
    """
    Python API test: h2o.export_file(frame, path, force=False, parts=1).
    Note that force=True is only honored if parts=1.  Otherwise, an error will be thrown.
    """
    training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
    try:
        results_dir = pyunit_utils.locate("results")  # find directory path to results folder
        final_path = os.path.join(results_dir, 'frameData')
        h2o.export_file(training_data, final_path, force=True, parts=1)  # save data
        assert os.path.isfile(final_path), "h2o.export_file() command is not working."
        final_dir_path = os.path.join(results_dir, 'multiFrame')
        h2o.export_file(training_data, final_dir_path, force=True, parts=-1)
        assert len(os.listdir(final_dir_path)) > 0, "h2o.export_file() command is not working."
    except Exception as e:
        if e.__class__.__name__ == 'ValueError' and 'File not found' in e.args[0]:
            print("Directory is not writable. h2o.export_file() command is not tested.")
        else:
            assert e.__class__.__name__ == 'H2OResponseError' and \
                   'exportFrame: Cannot use path' in e.args[0]._props['dev_msg'], \
                   "h2o.export_file() command is not working."
            print("Directory: {0} is not empty. Delete or empty it before re-run. h2o.export_file() "
                  "is not tested with multi-part export.".format(final_dir_path))
""" import os import glob import pandas import uyulala import h2o h2o.init() transformed = h2o.import_file(path=os.path.join(uyulala.dataDir, "transformed"), col_types={"DateCol": "enum"}) # predictionDF = h2o.as_list(transformed[['Symbol','DateCol']], use_pandas=True) for modelFile in glob.glob(os.path.join(uyulala.modelsDir, "model|*")): fileName = modelFile.split("/")[-1] junk, label, modelType = fileName.split("|") model = h2o.h2o.load_model(path=modelFile) prediction = model.predict(transformed) # predictionDF = predictionDF.merge(h2o.as_list(prediction, use_pandas=True),left_index=True,right_index=True).rename(columns={'predict':label+'|'+modelType}) transformed = transformed.cbind(prediction) # predictionDF.to_csv(path=os.path.join(uyulala.dataDir,'predictions.csv'),index=False) h2o.export_file(transformed, path=os.path.join(uyulala.dataDir, "predictions"), force=True, parts=-1) # labels = h2o.import_file(path=os.path.join(uyulala.dataDir,'labeled'),col_types={'DateCol':'enum'})
def mojo_predict_csv_test(target_dir):
    mojo_file_name = "prostate_gbm_model.zip"
    mojo_zip_path = os.path.join(target_dir, mojo_file_name)

    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    r = prostate[0].runif()
    train = prostate[r < 0.70]
    test = prostate[r >= 0.70]

    # Getting first row from test data frame
    pdf = test[1, 2:]
    input_csv = "%s/in.csv" % target_dir
    output_csv = "%s/output.csv" % target_dir
    h2o.export_file(pdf, input_csv)

    # =================================================================
    # Regression
    # =================================================================
    regression_gbm1 = H2OGradientBoostingEstimator(distribution="gaussian")
    regression_gbm1.train(x=[2, 3, 4, 5, 6, 7, 8], y=1, training_frame=train)
    pred_reg = regression_gbm1.predict(pdf)
    p1 = pred_reg[0, 0]
    print("Regression prediction: " + str(p1))

    download_mojo(regression_gbm1, mojo_zip_path)

    print("\nPerforming Regression Prediction using MOJO @... " + target_dir)
    prediction_result = h2o_utils.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=mojo_zip_path,
                                                   output_csv_path=output_csv)
    print("Prediction result: " + str(prediction_result))
    assert p1 == float(prediction_result[0]['predict']), "expected predictions to be the same for binary and MOJO model for regression"

    # =================================================================
    # Binomial
    # =================================================================
    train[1] = train[1].asfactor()
    bernoulli_gbm1 = H2OGradientBoostingEstimator(distribution="bernoulli")
    bernoulli_gbm1.train(x=[2, 3, 4, 5, 6, 7, 8], y=1, training_frame=train)
    pred_bin = bernoulli_gbm1.predict(pdf)

    binary_prediction_0 = pred_bin[0, 1]
    binary_prediction_1 = pred_bin[0, 2]
    print("Binomial prediction: p0: " + str(binary_prediction_0))
    print("Binomial prediction: p1: " + str(binary_prediction_1))

    download_mojo(bernoulli_gbm1, mojo_zip_path)

    print("\nPerforming Binomial Prediction using MOJO @... " + target_dir)
    prediction_result = h2o_utils.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=mojo_zip_path,
                                                   output_csv_path=output_csv)

    mojo_prediction_0 = float(prediction_result[0]['0'])
    mojo_prediction_1 = float(prediction_result[0]['1'])
    print("Binomial prediction: p0: " + str(mojo_prediction_0))
    print("Binomial prediction: p1: " + str(mojo_prediction_1))

    assert binary_prediction_0 == mojo_prediction_0, "expected predictions to be the same for binary and MOJO model for Binomial - p0"
    assert binary_prediction_1 == mojo_prediction_1, "expected predictions to be the same for binary and MOJO model for Binomial - p1"

    # =================================================================
    # Multinomial
    # =================================================================
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    r = iris[0].runif()
    train = iris[r < 0.90]
    test = iris[r >= 0.10]

    # Getting first row from test data frame
    pdf = test[1, 0:4]
    input_csv = "%s/in-multi.csv" % target_dir
    output_csv = "%s/output.csv" % target_dir
    h2o.export_file(pdf, input_csv)

    multi_gbm = H2OGradientBoostingEstimator()
    multi_gbm.train(x=['C1', 'C2', 'C3', 'C4'], y='C5', training_frame=train)
    pred_multi = multi_gbm.predict(pdf)

    multinomial_prediction_1 = pred_multi[0, 1]
    multinomial_prediction_2 = pred_multi[0, 2]
    multinomial_prediction_3 = pred_multi[0, 3]
    print("Multinomial prediction (Binary): p0: " + str(multinomial_prediction_1))
    print("Multinomial prediction (Binary): p1: " + str(multinomial_prediction_2))
    print("Multinomial prediction (Binary): p2: " + str(multinomial_prediction_3))

    download_mojo(multi_gbm, mojo_zip_path)

    print("\nPerforming Multinomial Prediction using MOJO @... " + target_dir)
    prediction_result = h2o_utils.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=mojo_zip_path,
                                                   output_csv_path=output_csv)

    mojo_prediction_1 = float(prediction_result[0]['Iris-setosa'])
    mojo_prediction_2 = float(prediction_result[0]['Iris-versicolor'])
    mojo_prediction_3 = float(prediction_result[0]['Iris-virginica'])
    print("Multinomial prediction (MOJO): p0: " + str(mojo_prediction_1))
    print("Multinomial prediction (MOJO): p1: " + str(mojo_prediction_2))
    print("Multinomial prediction (MOJO): p2: " + str(mojo_prediction_3))

    assert multinomial_prediction_1 == mojo_prediction_1, "expected predictions to be the same for binary and MOJO model for Multinomial - p0"
    assert multinomial_prediction_2 == mojo_prediction_2, "expected predictions to be the same for binary and MOJO model for Multinomial - p1"
    assert multinomial_prediction_3 == mojo_prediction_3, "expected predictions to be the same for binary and MOJO model for Multinomial - p2"
def local_and_hdfs_frame_equality():
    local_frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    hdfs_path = 'hdfs:///user/jenkins/tests/prostate_export'
    h2o.export_file(local_frame, hdfs_path, force=True)
    hdfs_frame = h2o.import_file(hdfs_path)
    assert_frame_equal(local_frame.as_data_frame(), hdfs_frame.as_data_frame())
assert all([x not in testing_grids for x in training_grids])

# create vector allocating every obs to training or testing:
training = np.array([x in training_grids for x in autocor])
print("Proportion of data in training", sum(training) / len(training), "and prop_train =", prop_train)
# assert round(sum(training)/len(training), 2) == prop_train or round(sum(training)/len(training), 2) == prop_train + 1 or round(sum(training)/len(training), 2) == prop_train - 1
print("len(training):", len(training))
print("d.dim[0]:", d.dim[0])
# assert len(training) == d.dim[0]

# Save to csv to then load into h2o later:
print("Starting to save to csv format...")
training = pd.DataFrame(training)
training.to_csv(save_training_ind_fp, header=False, index=False)
print("Done with saving training and testing sets for training data.")

h2o.export_file(frame=d, path=save_training_data_fp, force=True)
h2o.export_file(frame=holdout, path=save_holdout_data_fp, force=True)

# Send email
email = False
if email:
    import smtplib
    GMAIL_USERNAME = None
    GMAIL_PW = None
    RECIP = None
    SMTP_NUM = None
    session = smtplib.SMTP('smtp.gmail.com', SMTP_NUM)
    session.ehlo()
    session.starttls()
    session.login(GMAIL_USERNAME, GMAIL_PW)
    headers = "\r\n".join(["from: " + GMAIL_USERNAME,
GP_lag = data['GP_lag']
GP_lag[GP_lag == 9999] = None
data['GP_lag'] = GP_lag

PSN_lag = data['PSN_lag']
PSN_lag[PSN_lag == 9999] = None
data['PSN_lag'] = PSN_lag

nino34_lag = data['nino34_lag']
nino34_lag[nino34_lag == 9999] = None
data['nino34_lag'] = nino34_lag

# h2o.remove([LST_lag, NDVI_lag, EVI_lag, EVI, PixelReliability, FPAR_lag, LAI_lag, GP_lag, PSN_lag, nino34_lag])
# del LST_lag, NDVI_lag, EVI_lag, EVI, PixelReliability, FPAR_lag, LAI_lag, GP_lag, PSN_lag, nino34_lag

h2o.export_file(frame=data, path=save_data_fp, force=True)

# Send email
email = False
if email:
    import smtplib
    GMAIL_USERNAME = None
    GMAIL_PW = None
    RECIP = None
    SMTP_NUM = None
    session = smtplib.SMTP('smtp.gmail.com', SMTP_NUM)
    session.ehlo()
    session.starttls()
    session.login(GMAIL_USERNAME, GMAIL_PW)
    headers = "\r\n".join(["from: " + GMAIL_USERNAME,
                           "subject: " + "Finished running script: " + __file__,