def plot_mlp(fig_name):
    """
    plot MLP training result
    :param fig_name: saved figure name
    :return: figure
    """
    dir = "log/heatmap"
    pattern = r'(\d{1,2}).csv$'
    utils.construct_heatmap_set(dir, pattern)
    df = pd.read_csv('data/df.csv', sep=';')
    Px, Py = utils.point2XY(df['COUNTER FLAG'])
    Px.name = 'Px'
    Py.name = 'Py'
    X = df.iloc[:, 0:8]
    X_train, X_valid, Px_train, Px_valid, Py_train, Py_valid = train_test_split(X, Px, Py)
    h2o.init()
    mlp_px, mlp_py = utils.train_mlp(X_train, X_valid, Px_train, Px_valid, Py_train, Py_valid)
    if not os.path.exists(dir_fig):
        os.makedirs(dir_fig)
    utils.plot_mlp_result(mlp_px)
    plt.title('Train result for Px')
    plt.savefig(dir_fig + '/' + fig_name + '4px.png')
    utils.plot_mlp_result(mlp_py)
    plt.title('Train result for Py')
    plt.savefig(dir_fig + '/' + fig_name + '4py.png')
    h2o.cluster().shutdown()

def predict_h2o(ext_df, features_to_include):
    # in case another instance of h2o is currently running
    flag_file = CODE_PATH + "H2O_currently_running.txt"
    while os.path.exists(flag_file):
        sleep(15)
    fpr = open(flag_file, "w")
    fpr.flush()
    fpr.close()
    drf_model_path = MODEL_PATH
    try:
        h2o.init(max_mem_size="8G")
        covtype_df = h2o.H2OFrame(ext_df[features_to_include])
        drf_model = h2o.load_model(drf_model_path)
        ext_df["pred_Bs"] = drf_model.predict(covtype_df).as_data_frame()
    except:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        traceback.print_exception(exc_type, exc_value, exc_traceback, file=sys.stdout)
    finally:
        h2o.cluster().shutdown(prompt=False)
        os.remove(flag_file)
    return

def __stop(self):
    try:
        if not (self._conf.is_manual_cluster_start_used() and self._conf.runs_in_external_cluster_mode()):
            h2o.cluster().shutdown()
    except:
        pass

def h2oinitname():
    """
    Python API test for h2o.init
    :return:
    """
    try:
        h2o.init(strict_version_check=False, name="test")  # Should initialize
        h2o.init(strict_version_check=False, name="test")  # Should just connect
        assert h2o.cluster().cloud_name == "test"
    except H2OConnectionError as e:  # some errors are okay like version mismatch
        print("error message type is {0} and the error message is {1}\n".format(e.__class__.__name__, e.args[0]))

    try:
        h2o.init(strict_version_check=False, port=54321, name="test2", as_port=True)
        assert False, "Should fail to connect and the port should be used by previous invocation."
    except H2OServerError as e:
        print("error message type is {0} and the error message is {1}\n".format(e.__class__.__name__, e.args[0]))

    try:
        h2o.init(strict_version_check=False, port=54321, name="test2")  # Should bump the port to next one
        assert h2o.cluster().cloud_name == "test2"
    except H2OConnectionError as e:
        print("error message type is {0} and the error message is {1}\n".format(e.__class__.__name__, e.args[0]))

    try:
        h2o.init(strict_version_check=False, port=60000, name="test3", as_port=True)
        assert h2o.cluster().cloud_name == "test3"
    except H2OConnectionError as e:
        print("error message type is {0} and the error message is {1}\n".format(e.__class__.__name__, e.args[0]))
        assert_is_type(e, H2OConnectionError)

    h2o.cluster().shutdown()

def h2oinit_with_extra_classpath():
    try:
        h2o.init(strict_version_check=False, extra_classpath=[os.path.realpath(__file__)], port=40000)
    finally:
        h2o.cluster().shutdown()

def pubdev_6431_deleted_key():
    jobs_count_start = len(h2o.cluster().list_jobs()[0])

    input_frame = h2o.import_file(pyunit_utils.locate("smalldata/flow_examples/abalone.csv.gz"))
    model = H2OGeneralizedLinearEstimator(alpha=0.3)
    model.train(y="C9", training_frame=input_frame, validation_frame=input_frame)
    assert (jobs_count_start + 2) == len(h2o.cluster().list_jobs()[0]), \
        "(after train) expected {0} jobs but found {1} - {2}".format(
            (jobs_count_start + 2), len(h2o.cluster().list_jobs()[0]), h2o.cluster().list_jobs())

    h2o.remove(model)
    assert (jobs_count_start + 2) == len(h2o.cluster().list_jobs()[0]), \
        "(after rm) expected {0} jobs but found {1} - {2}".format(
            (jobs_count_start + 2), len(h2o.cluster().list_jobs()[0]), h2o.cluster().list_jobs())

    hyper_parameters = {
        'alpha': [0, 0.3],
    }
    grid_search = H2OGridSearch(H2OGeneralizedLinearEstimator, hyper_params=hyper_parameters)
    grid_search.train(y="C9", training_frame=input_frame, validation_frame=input_frame)
    assert (jobs_count_start + 3) == len(h2o.cluster().list_jobs()[0]), \
        "(after grid) expected {0} jobs but found {1} - {2}".format(
            (jobs_count_start + 3), len(h2o.cluster().list_jobs()[0]), h2o.cluster().list_jobs())

def stop_h2o(self):
    """Shuts down the H2O instance."""
    # Turn off H2O cluster
    try:
        h2o.cluster().shutdown(prompt=False)
    except BaseException as e:
        print("Unknown error occurred during shutdown of H2O cluster")
        print(e)

def h2ocluster_get_status():
    """
    Python API test: h2o.cluster().get_status(), h2o.cluster().get_status_details()
    """
    table = h2o.cluster().get_status()
    details = h2o.cluster().get_status_details()
    table.show()
    details.show()

def h2onetwork_test():
    """
    Python API test: h2o.cluster().network_test()
    """
    try:
        h2o.cluster().network_test()  # no return type
    except Exception as e:
        assert False, "h2o.cluster().network_test() command is not working."

def machine_learning_process(X, extra_crispr_df, y, train_index, test_index):
    try:
        __ml_train(X, extra_crispr_df, y, train_index, test_index)
    except:
        logger.debug("Fail to use random forest")
    finally:
        h2o.cluster().shutdown()

def h2ocluster():
    """
    Python API test: h2o.cluster()
    """
    try:
        h2o.cluster()
    except Exception as e:
        assert False, "h2o.cluster() command is not working."

def h2ocluster_status():
    """
    Python API test: h2o.cluster().show_status(True) and h2o.cluster().show_status()
    """
    try:
        h2o.cluster().show_status(True)  # no return type
        h2o.cluster().show_status()
    except Exception as e:
        assert False, "h2o.cluster().show_status() command is not working."

def h2oinit_default_log_dir():
    tmpdir = tempfile.mkdtemp()
    try:
        h2o.init(strict_version_check=False, name="default_log", ice_root=tmpdir)
    except H2OConnectionError as e:  # some errors are okay like version mismatch
        print("error message type is {0} and the error message is {1}\n".format(e.__class__.__name__, e.args[0]))
    finally:
        assert os.path.exists(os.path.join(tmpdir, "h2ologs")) == True
        shutil.rmtree(tmpdir)
        h2o.cluster().shutdown()

def main():
    frame = d_frame()
    frame = variables(frame)
    frame = wait_convert(frame)
    dummies_df = dummies(frame)
    train = get_frame(frame, dummies_df)
    test = test_set(train).reset_index(drop=True)
    train, test = data_fix(train, test)
    model(train, test)
    h2o.cluster().shutdown()

def h2oinit_fail_invalid_log_level():
    try:
        h2o.init(strict_version_check=False, log_level="BAD_LOG_LEVEL")
        assert False, "Should fail to start an h2o instance with an invalid log level."
    except H2OConnectionError as e:  # some errors are okay like version mismatch
        assert False, "Should fail to start an h2o instance with an invalid log level but H2OConnectionError was thrown."
    except H2OValueError:
        print("H2OValueError properly thrown")
        return
    finally:
        h2o.cluster().shutdown()

def __getattr__(self, item):
    if h2o.cluster():
        raise AttributeError((
            "Unknown attribute `{prop}` on object of type `{cls}`, "
            "this property is not available for this H2O backend [version={version}]."
        ).format(prop=item, cls=self.__class__.__name__, version=h2o.cluster().version))
    else:
        raise H2OConnectionError(
            "Not connected to a cluster. Did you run `h2o.init()` or `h2o.connect()`?"
        )

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_timestamp_date_orc.py"))
            pass
        else:
            origTZ = h2o.cluster().timezone
            newZone = 'America/Los_Angeles'
            h2o.cluster().timezone = newZone

            tol_time = 200              # comparing in ms or ns
            tol_numeric = 1e-5          # tolerance for comparing other numeric fields
            numElements2Compare = 100   # choose number of elements per column to compare. Save test time.

            allOrcFiles = ["/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc",
                           "/datasets/orc_parser/orc/orc_split_elim.orc"]
            allCsvFiles = ["/datasets/orc_parser/csv/TestOrcFile.testDate1900.csv",
                           "/datasets/orc_parser/csv/TestOrcFile.testDate2038.csv",
                           "/datasets/orc_parser/csv/orc_split_elim.csv"]

            for fIndex in range(len(allOrcFiles)):
                url_orc = "hdfs://{0}{1}".format(hdfs_name_node, allOrcFiles[fIndex])
                url_csv = "hdfs://{0}{1}".format(hdfs_name_node, allCsvFiles[fIndex])
                h2oOrc = h2o.import_file(url_orc)
                h2oCsv = h2o.import_file(url_csv)

                # compare the two frames
                assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
                    "H2O frame parsed from orc and csv files are different!"

            h2o.cluster().timezone = origTZ
    else:
        raise EnvironmentError

def run(config_path, work_station):
    config = import_module(config_path)

    # Import directories
    paths = get_paths(station=work_station)
    data_dir, pkl_dir = paths['data_dir'], paths['pkl_dir']
    h2o_rand_dir, log_dir = config.SAVE_DIR, paths['logs']

    # Get new logger
    logger = get_logger('H2oRandSearch', log_dir)

    meta = pd.read_pickle(pkl_dir + '/meta_df.pkl')

    h2o.init(**config.H2O_INIT_SETTINGS)
    logger.info("Started new H2o session " + str(h2o.cluster().cloud_name))
    credit_data = h2o.upload_file(pkl_dir + "/train_imp_na_df.csv")
    logger.info("Loaded data into cluster")

    # Grid searching parameters
    X = set(credit_data.columns) - {'TARGET'} - set(meta.columns)
    Y = 'TARGET'
    credit_data[Y] = credit_data[Y].asfactor()

    data_info = {
        'X': X,
        'Y': Y,
        'training_frame': credit_data,
        'model_directory': h2o_rand_dir,
        'logger': logger,
        'configuration': config
    }
    del meta

    if config.INCLUDE_GBM:
        config.GBM_SETTINGS.update(data_info)
        random_h2o_model_search(**config.GBM_SETTINGS)
    if config.INCLUDE_XGB:
        config.XGB_SETTINGS.update(data_info)
        random_h2o_model_search(**config.XGB_SETTINGS)
    if config.INCLUDE_DEEP:
        config.DEEP_SETTINGS.update(data_info)
        random_h2o_model_search(**config.DEEP_SETTINGS)
    if config.INCLUDE_RF:
        config.RF_SETTINGS.update(data_info)
        random_h2o_model_search(**config.RF_SETTINGS)
    if config.INCLUDE_NAIVE_BAYES:
        config.NAI_BAYES_SETTINGS.update(data_info)
        random_h2o_model_search(**config.NAI_BAYES_SETTINGS)
    if config.INCLUDE_GLM:
        config.GLM_SETTINGS.update(data_info)
        random_h2o_model_search(**config.GLM_SETTINGS)

    logger.info("Completed search. Shutting down cluster " + str(h2o.cluster().cloud_name))
    h2o.cluster().shutdown()

def get_all_variables_csv(i):
    ivd = {}
    try:
        iv = pd.read_csv(i, header=None)
    except:
        logging.critical('read csv error')
        h2o.download_all_logs(dirname=logs_path, filename=logfile)
        h2o.cluster().shutdown()
        sys.exit(10)
    col = iv.values.tolist()[0]
    dt = iv.values.tolist()[1]
    i = 0
    for c in col:
        ivd[c.strip()] = dt[i].strip()
        i += 1
    return ivd

def xgboost_estimation():
    if "XGBoost" not in h2o.cluster().list_all_extensions():
        print("XGBoost extension is not present. Skipping test. . .")
        return

    # Check if we are running inside the H2O network by seeing if we can touch the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if not hadoop_namenode_is_accessible:
        raise EnvironmentError("Hadoop namenode is not accessible")

    hdfs_name_node = pyunit_utils.hadoop_namenode()

    full_data = createData(500000, 500)

    myX = list(full_data.col_names)
    myX.remove("IsDepDelayed")

    xgb = H2OXGBoostEstimator(seed=42, tree_method="approx")
    xgboost_model = xgb.train(y="IsDepDelayed", x=myX[0:480], training_frame=full_data, model_id="xgboost")
    print(xgboost_model)
    pred = predict(xgboost_model, full_data)
    perf = h2o.performance(xgboost_model, full_data)
    return perf

def pubdev_6339():
    cluster = h2o.cluster()
    # number of nodes
    cloud_size = cluster.cloud_size
    # number of CPUs
    cores = sum(node["num_cpus"] for node in cluster.nodes)

    # path to file
    file_paths = [
        pyunit_utils.locate("smalldata/arcene/arcene_train.data"),
        pyunit_utils.locate("smalldata/census_income/adult_data.csv"),
        pyunit_utils.locate("smalldata/chicago/chicagoAllWeather.csv"),
        pyunit_utils.locate("smalldata/gbm_test/alphabet_cattest.csv"),
        pyunit_utils.locate("smalldata/wa_cannabis/raw/Dashboard_Usable_Sales_w_Weight_Daily.csv")
    ]

    for file_path in file_paths:
        # read data and parse setup to get number of columns
        data_raw = h2o.import_file(path=file_path, parse=False)
        setup = h2o.parse_setup(data_raw)

        # get number of columns from setup
        num_cols = setup['number_columns']
        # get the chunk size
        chunk_size = calculate_chunk_size(file_path, num_cols, cores, cloud_size)

        # get chunk size to compare if calculation is correct
        result_size = setup['chunk_size']
        assert chunk_size == result_size, "Calculated chunk size is incorrect!"
        print("chunk size for file", file_path, "is:", chunk_size)

    data_raw = h2o.import_file(path=file_paths[1], parse=False)
    setup = h2o.parse_setup(data_raw)

def train_model(file, max_mod, max_time, max_mem_size, directory_name):
    h2o.init(nthreads=-1, max_mem_size=max_mem_size)
    print("Import and Parse data")
    df = h2o.import_file(path=_locate(file))

    # Initialisation
    for cat in ["taxID", "phosphorylation_site", "metazoa"]:
        df[cat] = df[cat].asfactor()
    col_with_nan = []
    for column in df.columns:
        if "freq" in column or "ACH" in column or "IC" in column or "shanon_entropy" in column:
            col_with_nan.append(column)
    for cat in [["geneID", "taxID", "clusterID", "metazoa"] + col_with_nan]:
        df[df[cat] == "nan", cat] = float('nan')

    # Variable selection
    df_names_x = df.names[:]
    for col in ["phosphorylation_site", "uniprotID", "geneID", "sequence", "position",
                "seq_in_window", "metazoa", "clusterID", "nb_orthologs", "taxID",
                "nb_orthologs_metazoa", "nb_orthologs_nonmetazoa"]:
        df_names_x.remove(col)

    # Model creation
    aml = H2OAutoML(max_models=max_mod, max_runtime_secs=int(max_time), seed=1)
    aml.train(x=df_names_x, y="phosphorylation_site", training_frame=df)
    lb = aml.leaderboard

    # Save models
    for id in list(lb['model_id'].as_data_frame().iloc[:, 0]):
        model = h2o.get_model(id)
        h2o.save_model(model, path=directory_name, force=True)
    with open('%s/info.txt' % directory_name, 'w', newline='') as g:
        orig_stdout = sys.stdout
        sys.stdout = g
        print(lb.head(rows=lb.nrows))
        sys.stdout = orig_stdout
    h2o.cluster().shutdown()

def available():
    """
    Returns True if an XGBoost model can be built, or False otherwise.
    """
    if "XGBoost" not in h2o.cluster().list_core_extensions():
        print("Cannot build an XGBoost model - no backend found.")
        return False
    else:
        return True

def h2oget_timezone():
    """
    Python API test: h2o.cluster().timezone to get the time zone,
    h2o.cluster().timezone = "UTC" to test setting the time zone.

    Copied from pyunit_get_set_list_timezones.py
    """
    try:
        origTZ = h2o.cluster().timezone
        print("Original timezone: {0}".format(origTZ))

        newZone = 'America/Los_Angeles'
        h2o.cluster().timezone = newZone
        assert str(h2o.cluster().timezone) == newZone, "Time zone was not set correctly."

        h2o.cluster().timezone = origTZ  # reset timezone back to original one
    except Exception as e:
        assert False, "h2o.cluster().timezone command is not working."

def available():
    """
    Returns True if MOJO Pipelines can be used, or False otherwise.
    """
    if "MojoPipeline" not in h2o.cluster().list_core_extensions():
        print("Cannot use MOJO Pipelines - runtime was not found.")
        return False
    else:
        return True

def print_res(self):
    date = datetime.now()
    if not self.res_class_acc.empty and not self.res_class_f1.empty:
        pathcla = './results/' + self.t + '/' + str(date).replace(' ', '-') + '/classification'
        os.makedirs(pathcla)
        print('---------------------------------CLASSIFICATION RESULTS ' + self.t + '---------------------------------')
        print(self.res_class_acc)
        print(self.res_class_f1)
        # print(self.pipelines_class)
        self.res_class_acc.to_csv(pathcla + '/acc.csv', index=False)
        self.res_class_f1.to_csv(pathcla + '/f1_score.csv', index=False)
        self.pipelines_class.to_csv(pathcla + '/pipelines.csv', sep='@', index=False)
    if not self.res_reg_rmse.empty and not self.res_reg_r2.empty:
        pathreg = './results/' + self.t + '/' + str(date).replace(' ', '-') + '/regression'
        os.makedirs(pathreg)
        print('\n\n---------------------------------REGRESSION RESULTS ' + self.t + '---------------------------------')
        print(self.res_reg_rmse)
        print(self.res_reg_r2)
        # print(self.pipelines_reg)
        self.res_reg_rmse.to_csv(pathreg + '/rmse.csv', index=False)
        self.res_reg_r2.to_csv(pathreg + '/r2_score.csv', index=False)
        self.pipelines_reg.to_csv(pathreg + '/pipelines.csv', sep='@', index=False)
    self.options.to_csv('./results/' + self.t + '/' + str(date).replace(' ', '-') + '/options.csv', index=False)
    h2o.cluster().shutdown()
    # Return the timestamp string used for the results directory
    return str(date).replace(' ', '-')

def available():
    """
    Ask the H2O server whether an XGBoost model can be built
    (depends on the availability of native backends).

    :return: True if an XGBoost model can be built, or False otherwise.
    """
    if "XGBoost" not in h2o.cluster().list_core_extensions():
        print("Cannot build an XGBoost model - no backend found.")
        return False
    else:
        return True

def __init__(self, *args, **kwargs):
    super(H2OGBMForecaster, self).__init__(*args, **kwargs)

    # init the cluster if it is not already up
    if h2o.cluster() is None:
        h2o.init(nthreads=-1)

    model_params = kwargs["model_params"] if "model_params" in kwargs else dict()
    self.model = H2OGBMRegressor(model_params)

    for feature, encoding in self.categorical_features.items():
        if encoding == "default":
            self.categorical_features[feature] = ("y", ce.TargetEncoder, dict())

def test_frame_chunks():
    hf = h2o.H2OFrame({'A': [1, 'NA', 2], 'B': [1, 2, 3], 'C': [4, 5, 6]})
    result = h2o.api("GET /3/FrameChunks/%s" % urllib.parse.quote(hf.frame_id))
    assert result["frame_id"]["name"] == hf.frame_id
    chunks = result["chunks"]
    assert len(chunks) > 0
    for chunk in result["chunks"]:
        assert chunk["node_idx"] >= 0
        assert chunk["node_idx"] < h2o.cluster().cloud_size
    assert sum(map(lambda c: c["row_count"], chunks)) == 3

def h2o_train(X_train, X_test, y_train, y_test, seed=2020):
    X_train_c = X_train.copy()
    X_test_c = X_test.copy()
    target_name = y_train.name
    cols = list(X_train_c.columns)
    cat_cols = [col for col in X_train_c.columns if X_train_c[col].dtype == 'O']

    Train = h2o.H2OFrame.from_python(pd.concat([X_train_c, y_train], axis=1))
    Train[target_name] = Train[target_name].asfactor()
    for col in cat_cols:
        Train[col] = Train[col].asfactor()

    model = H2OAutoML(seed=seed, max_runtime_secs=3600 * 2)
    model.train(x=cols, y=target_name, training_frame=Train)
    print('modeling steps: ', model.modeling_steps)
    print('modeling leaderboard', model.leaderboard)
    print('modeling log', model.event_log)
    print('modeling leader', model.leader)

    Test = h2o.H2OFrame.from_python(pd.concat([X_test_c, y_test], axis=1))
    Test[target_name] = Test[target_name].asfactor()
    for col in cat_cols:
        Test[col] = Test[col].asfactor()  # would the encoding be affected here?

    pred = model.predict(Test).as_data_frame().values[:, 2]

    from sklearn.metrics import roc_auc_score
    h2o_auc = roc_auc_score(y_test, pred)
    h2o.cluster().shutdown()
    print('result auc:', h2o_auc)
    return h2o_auc

def h2olist_timezones():
    """
    Python API test: h2o.cluster().list_timezones()
    """
    try:
        timezones = h2o.cluster().list_timezones()
        assert_is_type(timezones, H2OFrame)
        # change the assert nrow from == to >= in case more timezones are introduced in the future.
        assert timezones.nrow >= 460, "h2o.cluster().list_timezones() returns frame with wrong row number."
        assert timezones.ncol == 1, "h2o.cluster().list_timezones() returns frame with wrong column number."
    except Exception as e:
        assert False, "h2o.cluster().list_timezones() command is not working."

def h2o_H2OFrame_as_date():
    """
    Python API test: h2o.frame.H2OFrame.as_date(format)

    Copied from pyunit_as_date.py
    """
    hdf = h2o.import_file(path=pyunit_utils.locate("smalldata/jira/v-11-eurodate.csv"))
    temp = hdf['ds5'].as_date("%d.%m.%y %H:%M")
    assert_is_type(temp, H2OFrame)

    # choose one element from new timestamp frame and compare it with conversion by python. Should equal.
    row_ind = randrange(0, temp.nrows)
    s = hdf[row_ind, 'ds5']
    tz = h2o.cluster().timezone

    # set python timezone to be the same as H2O timezone
    os.environ['TZ'] = tz
    time.tzset()
    pythonTime = (time.mktime(datetime.datetime.strptime(s, "%d.%m.%y %H:%M").timetuple())) * 1000.0
    assert abs(pythonTime - temp[row_ind, 0]) < 1e-10, "h2o.H2OFrame.as_date() command is not working."

def test_show_time():
    h2o.cluster().timezone = "UTC"
    df = h2o.H2OFrame.from_python(
        {"A": [1, 2, 3],
         "B": ["a", "a", "b"],
         "C": ["hello", "all", "world"],
         "D": ["12MAR2015:11:00:00", "13MAR2015:12:00:00", "14MAR2015:13:00:00"]},
        column_types={"A": "numeric", "B": "enum", "C": "string", "D": "time"}
    )
    out = df.__unicode__()
    print(out)
    assert "2015-03-12 11:00:00" in out
    assert "2015-03-13 12:00:00" in out
    assert "2015-03-14 13:00:00" in out

    df2 = h2o.create_frame(cols=6, rows=10, time_fraction=1, missing_fraction=0.1)
    out2 = df2.__unicode__()
    print(out2)
    assert "e+" not in out2
    assert "E+" not in out2
    lines = out2.splitlines()[2:-2]  # skip header (first 2 lines) + footer (last 2 lines)
    regex = re.compile(r"(\d+)-(\d+)-(\d+) (\d+):(\d+):(\d+)")
    for l in lines:
        for entry in l.split(" "):
            entry = entry.strip()
            if entry == "":
                continue  # skip missing entries
            m = re.match(regex, entry)
            assert m is not None, "Failed to recognize time expression '%s'" % entry
            year = int(m.group(1))
            month = int(m.group(2))
            day = int(m.group(3))
            assert 1970 <= year <= 2020
            assert 1 <= month <= 12
            assert 1 <= day <= 31

def call_shutdown():
    h2o.cluster().shutdown(prompt=True)  # call shutdown but do not actually shut anything down.

from pysparkling import *
from pyspark.sql import SparkSession
import h2o

# Initiate SparkSession
spark = SparkSession.builder.appName("App name").getOrCreate()

# Initiate H2OContext
hc = H2OContext.getOrCreate(spark)

# Stop H2O and Spark services
h2o.cluster().shutdown()
spark.stop()

def stop_with_jvm(self):
    h2o.cluster().shutdown()
    self.stop()

# Calculation input
LSIcutoff = 4.50  # (Ang)

# Header for output file
header = " q Sk LSI(Ang^2)"
#print header

# Loop over all snapshots in file
for isnap in range(0, nsnap):
    snap = h2o.snapshot(nat[isnap], A, txyz[isnap], coordtype=ct)
    allmol = h2o.cluster(nat[isnap]/napm, A, snap.tx)
    allmol.H2Oindx
    allmol.FindH2Os(snap.A, snap.Ainv, cutoff=LSIcutoff)
    # allmol.printsnap('L1: ' + str(A.T[0][:]) + ' L2: ' + str(A.T[1][:]) + ' L3: ' + str(A.T[2][:]))
    for imol in range(0, allmol.nmol):
        q, Sk = allmol.getTOPs(snap.A, snap.Ainv, imol)
        LSI = allmol.LSI(snap.A, snap.Ainv, imol, rcutoff=3.7)
        print q, Sk, LSI

def tearDown(self):
    h2o.cluster().shutdown(False)

# Define and print file header
header = "rOO \t phi \t theta \t rOH11 \t rOH12 \t rOH21 \t rOH22 \t HOH1 \t HOH2 \t alpha \t beta \t gamma \t nu \t mu \t OdHOa"
print header

# Loop over all snapshots in file
for icl in range(0, nsnap):
    # Create instance of the class snapshot
    snap = h2o.snapshot(nat[icl], A, txyz[icl], coordtype=ct)

    # Create instance of the class cluster, in this case containing all molecules in the snapshot
    mycluster = h2o.cluster(nat[icl]/napm, A, snap.tx)
    mycluster.H2Oindx
    mycluster.wrap()
    mycluster.FindH2Os(snap.A, snap.Ainv, cutoff=4.5)

    # Find molecule roles in the dimer:
    # - centralmol labels the donor molecule (can take values 0,1)
    # - secondarymol labels the acceptor molecule (can take values 0,1)
    # - primaryh labels the donated H
    centralmol, secondarymol, primaryh, secondaryh = mycluster.findHBandsort()

    # Center and orient with respect to molecule with index centralmol
    mycluster.CaO(centralmol)

def call_badshutdown():
    h2o.cluster().shutdown(badparam=1, prompt=True)  # call shutdown but do not actually shut anything down.

# Transform census table
# Remove all spaces from column names (causing problems in Spark SQL)
col_names = map(lambda s: s.strip().replace(' ', '_').replace('+', '_'), f_census.col_names)

# Update column names in the table
# f_weather.names = col_names
f_census.names = col_names

# Transform crimes table
# Drop useless columns
f_crimes = f_crimes[2:]

# Set time zone to UTC for date manipulation
h2o.cluster().timezone = "Etc/UTC"

# Replace ' ' by '_' in column names
col_names = map(lambda s: s.replace(' ', '_'), f_crimes.col_names)
f_crimes.names = col_names
refine_date_col(f_crimes, "Date", "%m/%d/%Y %I:%M:%S %p")
f_crimes = f_crimes.drop("Date")

# Expose H2O frames as Spark DataFrames
df_weather = h2oContext.as_spark_frame(f_weather)
df_census = h2oContext.as_spark_frame(f_census)
df_crimes = h2oContext.as_spark_frame(f_crimes)

# Register DataFrames as tables
df_weather.createOrReplaceTempView("chicagoWeather")

def tearDownClass(cls):
    h2o.cluster().shutdown()
    unit_test_utils.tear_down_class(cls)