Example 1
def plot_mlp(fig_name):
    """
    plot MLP training result
    :param fig_name: saved figure name
    :return: figure
    """
    dir = "log/heatmap"
    pattern = r'(\d{1,2}).csv$'
    utils.construct_heatmap_set(dir, pattern)
    df = pd.read_csv('data/df.csv', sep=';')
    Px, Py = utils.point2XY(df['COUNTER FLAG'])
    Px.name = 'Px'
    Py.name = 'Py'
    X = df.iloc[:, 0:8]
    X_train, X_valid, Px_train, Px_valid, Py_train, Py_valid = train_test_split(
        X, Px, Py)
    h2o.init()
    mlp_px, mlp_py = utils.train_mlp(X_train, X_valid, Px_train, Px_valid,
                                     Py_train, Py_valid)
    if not os.path.exists(dir_fig):
        os.makedirs(dir_fig)
    utils.plot_mlp_result(mlp_px)
    plt.title('Train result for Px')
    plt.savefig(dir_fig + '/' + fig_name + '4px.png')
    utils.plot_mlp_result(mlp_py)
    plt.title('Train result for Py')
    plt.savefig(dir_fig + '/' + fig_name + '4py.png')
    h2o.cluster().shutdown()
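Example 1 delegates training to a project-specific utils.train_mlp helper that is not shown on this page. A minimal sketch of what such a helper might look like (hypothetical name train_mlp_sketch; assumes pandas inputs and H2O's deep learning estimator, the real helper may differ):

from h2o.estimators.deeplearning import H2ODeepLearningEstimator

def train_mlp_sketch(X_train, X_valid, y_train, y_valid):
    # Combine features and target into H2O training/validation frames.
    train = h2o.H2OFrame(pd.concat([X_train, y_train], axis=1))
    valid = h2o.H2OFrame(pd.concat([X_valid, y_valid], axis=1))
    # A small MLP: two hidden layers, scored against the validation frame.
    model = H2ODeepLearningEstimator(hidden=[64, 64], epochs=50)
    model.train(x=list(X_train.columns), y=y_train.name,
                training_frame=train, validation_frame=valid)
    return model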
Example 2
def predict_h2o(ext_df, features_to_include):
    # in case another instance of h2o is currently running
    flag_file = CODE_PATH + "H2O_currently_running.txt"
    while os.path.exists(flag_file):
        sleep(15)
    fpr = open(flag_file, "w")
    fpr.flush()
    fpr.close()

    drf_model_path = MODEL_PATH

    try:
        h2o.init(max_mem_size="8G")
        covtype_df = h2o.H2OFrame(ext_df[features_to_include])
        drf_model = h2o.load_model(drf_model_path)

        ext_df["pred_Bs"] = drf_model.predict(covtype_df).as_data_frame()
    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        traceback.print_exception(exc_type,
                                  exc_value,
                                  exc_traceback,
                                  file=sys.stdout)
    finally:
        h2o.cluster().shutdown(prompt=False)
        os.remove(flag_file)
    return
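The flag file above is a crude inter-process lock. The same idea reads more cleanly as a context manager; a sketch (hypothetical helper, and still racy, since the exists-then-create step is not atomic):

import os
from time import sleep
from contextlib import contextmanager

@contextmanager
def h2o_flag_lock(flag_file, poll_seconds=15):
    # Wait for any other H2O run to finish, then claim the flag.
    while os.path.exists(flag_file):
        sleep(poll_seconds)
    open(flag_file, "w").close()
    try:
        yield
    finally:
        os.remove(flag_file)

Used as: with h2o_flag_lock(CODE_PATH + "H2O_currently_running.txt"): around the init/predict/shutdown block.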
Example 3
 def __stop(self):
     try:
         if not (self._conf.is_manual_cluster_start_used()
                 and self._conf.runs_in_external_cluster_mode()):
             h2o.cluster().shutdown()
     except:
         pass
Example 4
def h2oinitname():
    """
    Python API test for h2o.init
    :return:
    """
    try:
        h2o.init(strict_version_check=False, name="test")  # Should initialize
        h2o.init(strict_version_check=False, name="test")  # Should just connect
        assert h2o.cluster().cloud_name == "test"
    except H2OConnectionError as e:  # some errors are okay like version mismatch
        print("error message type is {0} and the error message is {1}\n".format(e.__class__.__name__, e.args[0]))

    try:
        h2o.init(strict_version_check=False, port=54321, name="test2", as_port=True)
        assert False, "Should fail to connect and the port should be used by previous invocation."
    except H2OServerError as e:
        print("error message type is {0} and the error message is {1}\n".format(e.__class__.__name__, e.args[0]))

    try:
        h2o.init(strict_version_check=False, port=54321, name="test2")  # Should bump the port to next one
        assert h2o.cluster().cloud_name == "test2"
    except H2OConnectionError as e:
        print("error message type is {0} and the error message is {1}\n".format(e.__class__.__name__, e.args[0]))

    try:
        h2o.init(strict_version_check=False, port=60000, name="test3", as_port=True)
        assert h2o.cluster().cloud_name == "test3"
    except H2OConnectionError as e:
        print("error message type is {0} and the error message is {1}\n".format(e.__class__.__name__, e.args[0]))
        assert_is_type(e, H2OConnectionError)
        h2o.cluster().shutdown()
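The test depends on h2o.init() starting a cloud on the first call and merely connecting on the second. When you only want to attach to an already-running cluster and fail otherwise (never spawn a new JVM), h2o.connect is the more direct call:

import h2o

# Raises H2OConnectionError if nothing is listening on that URL.
h2o.connect(url="http://localhost:54321")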
Example 5
def h2oinit_with_extra_classpath():
    try:
        h2o.init(strict_version_check=False,
                 extra_classpath=[os.path.realpath(__file__)],
                 port=40000)
    finally:
        h2o.cluster().shutdown()
Example 6
def pubdev_6431_deleted_key():
    jobs_count_start = len(h2o.cluster().list_jobs()[0])

    input_frame = h2o.import_file(
        pyunit_utils.locate("smalldata/flow_examples/abalone.csv.gz"))
    model = H2OGeneralizedLinearEstimator(alpha=0.3)
    model.train(y="C9",
                training_frame=input_frame,
                validation_frame=input_frame)
    assert (jobs_count_start+2) == len(h2o.cluster().list_jobs()[0]), \
        "(after train) expected {0} jobs but found {1} - {2}".format((jobs_count_start+2), len(h2o.cluster().list_jobs()[0]), h2o.cluster().list_jobs())

    h2o.remove(model)
    assert (jobs_count_start+2) == len(h2o.cluster().list_jobs()[0]), \
        "(after rm) expected {0} jobs but found {1} - {2}".format((jobs_count_start+2), len(h2o.cluster().list_jobs()[0]), h2o.cluster().list_jobs())

    hyper_parameters = {
        'alpha': [0, 0.3],
    }
    grid_search = H2OGridSearch(H2OGeneralizedLinearEstimator,
                                hyper_params=hyper_parameters)
    grid_search.train(y="C9",
                      training_frame=input_frame,
                      validation_frame=input_frame)
    assert (jobs_count_start+3) == len(h2o.cluster().list_jobs()[0]), \
        "(after grid) expected {0} jobs but found {1}- {2}".format((jobs_count_start+3), len(h2o.cluster().list_jobs()[0]), h2o.cluster().list_jobs())
Example 8
def h2ocluster_get_status():
    """
    Python API test: h2o.cluster().get_status(), h2o.cluster().get_status_details()
    """
    table = h2o.cluster().get_status()
    details = h2o.cluster().get_status_details()
    table.show()
    details.show()
Example 9
 def stop_h2o(self):
     """Shuts down the H2O instance. """
     # Turn off H2O cluster
     try:
         h2o.cluster().shutdown(prompt=False)
     except BaseException as e:
         print("Unknown error occurred during shutdown of H2O cluster")
         print(e)
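Since h2o.cluster() returns None when no connection exists (Example 30 below relies on exactly that), a variant of stop_h2o can test for a cluster instead of swallowing exceptions; a sketch:

def safe_stop_h2o():
    # Shut down only if we are actually connected to a cluster.
    cluster = h2o.cluster()
    if cluster is not None:
        cluster.shutdown(prompt=False)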
Example 10
def h2onetwork_test():
    """
    Python API test: h2o.cluster().network_test()
    """
    try:
        h2o.cluster().network_test()    # no return type
    except Exception as e:
        assert False, "h2o.cluster().network_test() command is not working."
Example 11
def machine_learning_process(X, extra_crispr_df, y, train_index, test_index):

    try:
        __ml_train(X, extra_crispr_df, y, train_index, test_index)
    except Exception:
        logger.debug("Failed to use random forest")
    finally:
        h2o.cluster().shutdown()
Example 12
def h2ocluster():
    """
    Python API test: h2o.cluster()
    """

    try:
        h2o.cluster()
    except Exception as e:
        assert False, "h2o.cluster() command not is working."
Example 13
def h2ocluster_status():
    """
    Python API test: h2o.cluster().show_status(True) and h2o.cluster().show_status()
    """
    try:
        h2o.cluster().show_status(True)  # no return type
        h2o.cluster().show_status()
    except Exception as e:
        assert False, "h2o.cluster().show_status() command is not working."
Example 14
def h2oinit_default_log_dir():
    tmpdir = tempfile.mkdtemp()
    try:
        h2o.init(strict_version_check=False, name="default_log", ice_root=tmpdir)
    except H2OConnectionError as e:  # some errors are okay like version mismatch
        print("error message type is {0} and the error message is {1}\n".format(e.__class__.__name__, e.args[0]))
    finally:
        assert os.path.exists(os.path.join(tmpdir, "h2ologs"))
        shutil.rmtree(tmpdir)
        h2o.cluster().shutdown()
Example 15
def main():
    frame = d_frame()
    frame = variables(frame)
    frame = wait_convert(frame)
    dummies_df = dummies(frame)
    train = get_frame(frame, dummies_df)
    test = test_set(train).reset_index(drop=True)
    train, test = data_fix(train, test)
    model(train, test)
    
    h2o.cluster().shutdown()
Example 16
def h2oinit_fail_invalid_log_level():
    try:
        h2o.init(strict_version_check=False, log_level="BAD_LOG_LEVEL")
        assert False, "Should fail to start an h2o instance with an invalid log level."
    except H2OConnectionError as e:  # some errors are okay like version mismatch
        assert False, "Should fail to start an h2o instance with an invalid log level but H2OConnectionError was thrown."
    except H2OValueError:
        print("H2OValueError properly thrown")
        return
    finally:
        h2o.cluster().shutdown()
Example 17
 def __getattr__(self, item):
     if h2o.cluster():
         raise AttributeError((
             "Unknown attribute `{prop}` on object of type `{cls}`, "
             "this property is not available for this H2O backend [version={version}]."
         ).format(prop=item,
                  cls=self.__class__.__name__,
                  version=h2o.cluster().version))
     else:
         raise H2OConnectionError(
             "Not connected to a cluster. Did you run `h2o.init()` or `h2o.connect()`?"
         )
Example 18
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(
                hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format(
                      "pyunit_INTERNAL_HDFS_timestamp_date_orc.py"))
            pass
        else:
            origTZ = h2o.cluster().timezone
            newZone = 'America/Los_Angeles'
            h2o.cluster().timezone = newZone
            tol_time = 200  # comparing in ms or ns
            tol_numeric = 1e-5  # tolerance for comparing other numeric fields
            numElements2Compare = 100  # choose number of elements per column to compare.  Save test time.

            allOrcFiles = [
                "/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc",
                "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc",
                "/datasets/orc_parser/orc/orc_split_elim.orc"
            ]

            allCsvFiles = [
                "/datasets/orc_parser/csv/TestOrcFile.testDate1900.csv",
                "/datasets/orc_parser/csv/TestOrcFile.testDate2038.csv",
                "/datasets/orc_parser/csv/orc_split_elim.csv"
            ]

            for fIndex in range(len(allOrcFiles)):
                url_orc = "hdfs://{0}{1}".format(hdfs_name_node,
                                                 allOrcFiles[fIndex])
                url_csv = "hdfs://{0}{1}".format(hdfs_name_node,
                                                 allCsvFiles[fIndex])
                h2oOrc = h2o.import_file(url_orc)
                h2oCsv = h2o.import_file(url_csv)

                # compare the two frames
                assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
                    "H2O frame parsed from orc and csv files are different!"

            h2o.cluster().timezone = origTZ
    else:
        raise EnvironmentError("Hadoop namenode is not accessible")
Example 19
def run(config_path, work_station):
    config = import_module(config_path)

    # Import directories
    paths = get_paths(station=work_station)
    data_dir, pkl_dir = paths['data_dir'], paths['pkl_dir']
    h2o_rand_dir, log_dir = config.SAVE_DIR, paths['logs']
    # Get new logger
    logger = get_logger('H2oRandSearch', log_dir)

    meta = pd.read_pickle(pkl_dir + '/meta_df.pkl')
    h2o.init(**config.H2O_INIT_SETTINGS)
    logger.info("Started new H2o session " + str(h2o.cluster().cloud_name))
    credit_data = h2o.upload_file(pkl_dir + "/train_imp_na_df.csv")
    logger.info("Loaded data into cluster")
    # Grid searching parameters
    X = set(credit_data.columns) - {'TARGET'} - set(meta.columns)
    Y = 'TARGET'
    credit_data[Y] = credit_data[Y].asfactor()

    data_info = {
        'X': X,
        'Y': Y,
        'training_frame': credit_data,
        'model_directory': h2o_rand_dir,
        'logger': logger,
        'configuration': config
    }
    del meta

    if config.INCLUDE_GBM:
        config.GBM_SETTINGS.update(data_info)
        random_h2o_model_search(**config.GBM_SETTINGS)
    if config.INCLUDE_XGB:
        config.XGB_SETTINGS.update(data_info)
        random_h2o_model_search(**config.XGB_SETTINGS)
    if config.INCLUDE_DEEP:
        config.DEEP_SETTINGS.update(data_info)
        random_h2o_model_search(**config.DEEP_SETTINGS)
    if config.INCLUDE_RF:
        config.RF_SETTINGS.update(data_info)
        random_h2o_model_search(**config.RF_SETTINGS)
    if config.INCLUDE_NAIVE_BAYES:
        config.NAI_BAYES_SETTINGS.update(data_info)
        random_h2o_model_search(**config.NAI_BAYES_SETTINGS)
    if config.INCLUDE_GLM:
        config.GLM_SETTINGS.update(data_info)
        random_h2o_model_search(**config.GLM_SETTINGS)
    logger.info("Completed search. Shutting down cluster " + str(h2o.cluster().cloud_name))
    h2o.cluster().shutdown()
Example 20
def get_all_variables_csv(path):
    ivd = {}
    try:
        iv = pd.read_csv(path, header=None)
    except Exception:
        logging.critical('read csv error')
        h2o.download_all_logs(dirname=logs_path, filename=logfile)
        h2o.cluster().shutdown()
        sys.exit(10)
    # First row holds the column names, second row their data types.
    cols = iv.values.tolist()[0]
    dtypes = iv.values.tolist()[1]
    for c, d in zip(cols, dtypes):
        ivd[c.strip()] = d.strip()
    return ivd
Example 22
def pubdev_6339():
    
    cluster = h2o.cluster()
    # number of nodes
    cloud_size = cluster.cloud_size
    # number of CPUs
    cores = sum(node["num_cpus"] for node in cluster.nodes)


    # path to file
    file_paths = [
        pyunit_utils.locate("smalldata/arcene/arcene_train.data"),
        pyunit_utils.locate("smalldata/census_income/adult_data.csv"),
        pyunit_utils.locate("smalldata/chicago/chicagoAllWeather.csv"),
        pyunit_utils.locate("smalldata/gbm_test/alphabet_cattest.csv"),
        pyunit_utils.locate("smalldata/wa_cannabis/raw/Dashboard_Usable_Sales_w_Weight_Daily.csv")
    ]

    for file_path in file_paths:
        # read data and parse setup to get number of columns 
        data_raw = h2o.import_file(path=file_path, parse=False)
        setup = h2o.parse_setup(data_raw)

        # get number of columns from setup
        num_cols = setup['number_columns']
        # get the chunk size
        chunk_size = calculate_chunk_size(file_path, num_cols, cores, cloud_size)
    
        # get chunk size to compare if calculation is correct
        result_size = setup['chunk_size']
        assert chunk_size == result_size, "Calculated chunk size is incorrect!"
        print("chunk size for file", file_path, "is:", chunk_size)

    data_raw = h2o.import_file(path=file_paths[1], parse=False)
    setup = h2o.parse_setup(data_raw)
Example 23
def xgboost_estimation():
    if ("XGBoost" not in h2o.cluster().list_all_extensions()):
        print("XGBoost extension is not present.  Skipping test. . .")
        return

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if not hadoop_namenode_is_accessible:
        raise EnvironmentError("Hadoop namenode is not accessible")

    hdfs_name_node = pyunit_utils.hadoop_namenode()

    full_data = createData(500000, 500)

    myX = list(full_data.col_names)
    myX.remove("IsDepDelayed")

    xgb = H2OXGBoostEstimator(seed=42, tree_method="approx")
    xgboost_model = xgb.train(y="IsDepDelayed",
                              x=myX[0:480],
                              training_frame=full_data,
                              model_id="xgboost")

    print(xgboost_model)

    pred = predict(xgboost_model, full_data)
    perf = h2o.performance(xgboost_model, full_data)
    return perf
Example 24
def train_model(file, max_mod, max_time, max_mem_size, directory_name):
    h2o.init(nthreads=-1, max_mem_size=max_mem_size)

    print("Import and Parse data")
    df = h2o.import_file(path=_locate(file))

    # Initialisation

    for cat in ["taxID", "phosphorylation_site", "metazoa"]:
        df[cat] = df[cat].asfactor()
    col_with_nan = []
    for column in df.columns:
        if "freq" in column or "ACH" in column or "IC" in column or "shanon_entropy" in column:
            col_with_nan.append(column)
    for cat in [["geneID", "taxID", "clusterID", "metazoa"] + col_with_nan]:
        df[df[cat] == "nan", cat] = float('nan')

    # Variable selection

    df_names_x = df.names[:]
    for col in [
            "phosphorylation_site", "uniprotID", "geneID", "sequence",
            "position", "seq_in_window", "metazoa", "clusterID",
            "nb_orthologs", "taxID", "nb_orthologs_metazoa",
            "nb_orthologs_nonmetazoa"
    ]:
        df_names_x.remove(col)

    # Model creation

    aml = H2OAutoML(max_models=max_mod, max_runtime_secs=int(max_time), seed=1)
    aml.train(x=df_names_x, y="phosphorylation_site", training_frame=df)
    lb = aml.leaderboard

    # Save models

    for id in list(lb['model_id'].as_data_frame().iloc[:, 0]):
        model = h2o.get_model(id)
        h2o.save_model(model, path=directory_name, force=True)
    with open('%s/info.txt' % directory_name, 'w', newline='') as g:
        orig_stdout = sys.stdout
        sys.stdout = g
        print(lb.head(rows=lb.nrows))
        sys.stdout = orig_stdout

    h2o.cluster().shutdown()
Example 25
 def available():
     """
     Returns True if an XGBoost model can be built, or False otherwise.
     """
     if "XGBoost" not in h2o.cluster().list_core_extensions():
         print("Cannot build an XGBoost model - no backend found.")
         return False
     else:
         return True
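A typical call site guards model construction with this check. A sketch (full_data and the IsDepDelayed response are borrowed from Example 23):

from h2o.estimators.xgboost import H2OXGBoostEstimator

if H2OXGBoostEstimator.available():
    xgb = H2OXGBoostEstimator(ntrees=50, seed=42)
    xgb.train(y="IsDepDelayed", training_frame=full_data)
else:
    print("No XGBoost backend - skipping.")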
Example 26
def h2oget_timezone():
    """
    Python API test: h2o.cluster().timezone to get the time zone, and h2o.cluster().timezone = "UTC" to
    test setting the time zone.

    Copy from pyunit_get_set_list_timezones.py
    """
    try:
        origTZ = h2o.cluster().timezone
        print("Original timezone: {0}".format(origTZ))

        newZone = 'America/Los_Angeles'
        h2o.cluster().timezone = newZone
        assert str(h2o.cluster().timezone) == newZone, "Time zone was not set correctly."
        h2o.cluster().timezone = origTZ  # reset timezone back to original one
    except Exception as e:
        assert False, "h2o.cluster().timezone command is not working."
Example 27
 def available():
     """
     Returns True if MOJO Pipelines can be used, or False otherwise.
     """
     if "MojoPipeline" not in h2o.cluster().list_core_extensions():
         print("Cannot use MOJO Pipelines - runtime was not found.")
         return False
     else:
         return True
Example 28
    def print_res(self):
        date = datetime.now()
        if (not self.res_class_acc.empty and not self.res_class_f1.empty):
            pathcla = './results/' + self.t + '/' + str(date).replace(
                ' ', '-') + '/classification'
            os.makedirs(pathcla)
            print(
                '---------------------------------CLASSIFICATION RESULTS '
                + self.t + '---------------------------------')
            print(self.res_class_acc)
            print(self.res_class_f1)
            #print(self.pipelines_class)

            self.res_class_acc.to_csv(pathcla + '/acc.csv', index=False)
            self.res_class_f1.to_csv(pathcla + '/f1_score.csv', index=False)
            self.pipelines_class.to_csv(pathcla + '/pipelines.csv',
                                        sep='@',
                                        index=False)

        if (not self.res_reg_rmse.empty and not self.res_reg_r2.empty):
            pathreg = './results/' + self.t + '/' + str(date).replace(
                ' ', '-') + '/regression'
            os.makedirs(pathreg)
            print(
                '\n\n---------------------------------REGRESSION RESULTS '
                + self.t + '---------------------------------')
            print(self.res_reg_rmse)
            print(self.res_reg_r2)
            #print(self.pipelines_reg)

            self.res_reg_rmse.to_csv(pathreg + '/rmse.csv', index=False)
            self.res_reg_r2.to_csv(pathreg + '/r2_score.csv', index=False)
            self.pipelines_reg.to_csv(pathreg + '/pipelines.csv',
                                      sep='@',
                                      index=False)

        self.options.to_csv('./results/' + self.t + '/' +
                            str(date).replace(' ', '-') + '/options.csv',
                            index=False)

        h2o.cluster().shutdown()

        # Return the timestamp string used to name the results directory
        return (str(date).replace(' ', '-'))
Example 29
 def available():
     """
     Ask the H2O server whether an XGBoost model can be built (depends on availability of native backends).
     :return: True if an XGBoost model can be built, or False otherwise.
     """
     if "XGBoost" not in h2o.cluster().list_core_extensions():
         print("Cannot build an XGBoost model - no backend found.")
         return False
     else:
         return True
Example 30
 def __init__(self, *args, **kwargs):
     super(H2OGBMForecaster, self).__init__(*args, **kwargs)
     # init the cluster if is not already up
     if h2o.cluster() is None: h2o.init(nthreads=-1)
     model_params = kwargs.get("model_params", dict())
     self.model = H2OGBMRegressor(model_params)
     for feature, encoding in self.categorical_features.items():
         if encoding == "default":
             self.categorical_features[feature] = ("y", ce.TargetEncoder,
                                                   dict())
Example 31
def test_frame_chunks():
    hf = h2o.H2OFrame({'A': [1, 'NA', 2], 'B': [1, 2, 3], 'C': [4, 5, 6]})
    result = h2o.api("GET /3/FrameChunks/%s" % urllib.parse.quote(hf.frame_id))

    assert result["frame_id"]["name"] == hf.frame_id
    chunks = result["chunks"]
    assert len(chunks) > 0
    for chunk in result["chunks"]:
        assert chunk["node_idx"] >= 0
        assert chunk["node_idx"] < h2o.cluster().cloud_size
    assert sum(map(lambda c: c["row_count"], chunks)) == 3
Example 32
def h2o_train(X_train, X_test, y_train, y_test, seed=2020):

    X_train_c = X_train.copy()
    X_test_c = X_test.copy()

    target_name = y_train.name

    cols = list(X_train_c.columns)
    cat_cols = [
        col for col in X_train_c.columns if X_train_c[col].dtype == 'O'
    ]

    Train = h2o.H2OFrame.from_python(pd.concat([X_train_c, y_train], axis=1))

    Train[target_name] = Train[target_name].asfactor()

    for col in cat_cols:
        Train[col] = Train[col].asfactor()

    model = H2OAutoML(seed=seed, max_runtime_secs=3600 * 2)

    model.train(x=cols, y=target_name, training_frame=Train)

    print('modeling steps: ', model.modeling_steps)
    print('modeling leaderboard', model.leaderboard)
    print('modeling log', model.event_log)
    print('modeling leader', model.leader)

    Test = h2o.H2OFrame.from_python(pd.concat([X_test_c, y_test], axis=1))
    Test[target_name] = Test[target_name].asfactor()

    for col in cat_cols:
        Test[col] = Test[col].asfactor()  # would the encoding be influenced here?
    pred = model.predict(Test).as_data_frame().values[:, 2]

    from sklearn.metrics import roc_auc_score
    h2o_auc = roc_auc_score(y_test, pred)
    h2o.cluster().shutdown()
    print('result auc:', h2o_auc)

    return h2o_auc
Example 33
def h2olist_timezones():
    """
    Python API test: h2o.cluster().list_timezones()
    """
    try:
        timezones = h2o.cluster().list_timezones()
        assert_is_type(timezones, H2OFrame)

        # change the assert nrow from == to >= in case more timezones are introduced in the future.
        assert timezones.nrow >= 460, "h2o.cluster().list_timezones() returns frame with wrong row number."
        assert timezones.ncol == 1, "h2o.cluster().list_timezones() returns frame with wrong column number."
    except Exception as e:
        assert False, "h2o.cluster().list_timezones() command is not working."
Example 34
def h2o_H2OFrame_as_date():
    """
    Python API test: h2o.frame.H2OFrame.as_date(format)

    Copied from pyunit_as_date.py
    """
    hdf = h2o.import_file(path=pyunit_utils.locate("smalldata/jira/v-11-eurodate.csv"))
    temp = hdf['ds5'].as_date("%d.%m.%y %H:%M")
    assert_is_type(temp, H2OFrame)

    # choose one element from new timestamp frame and compare it with conversion by python.  Should equal.
    row_ind = randrange(0, temp.nrows)
    s = hdf[row_ind,'ds5']

    tz = h2o.cluster().timezone     # set python timezone to be the same as H2O timezone
    os.environ['TZ']=tz
    time.tzset()
    pythonTime = (time.mktime(datetime.datetime.strptime(s, "%d.%m.%y %H:%M").timetuple()))*1000.0

    assert abs(pythonTime-temp[row_ind,0]) < 1e-10, "h2o.H2OFrame.as_date() command is not working."
Example 35
def test_show_time():

    h2o.cluster().timezone = "UTC"
    df = h2o.H2OFrame.from_python(
        {"A": [1, 2, 3],
         "B": ["a", "a", "b"],
         "C": ["hello", "all", "world"],
         "D": ["12MAR2015:11:00:00", "13MAR2015:12:00:00", "14MAR2015:13:00:00"]},
        column_types={"A": "numeric", "B": "enum", "C": "string", "D": "time"}
    )
    out = df.__unicode__()
    print(out)
    assert "2015-03-12 11:00:00" in out
    assert "2015-03-13 12:00:00" in out
    assert "2015-03-14 13:00:00" in out

    df2 = h2o.create_frame(cols=6, rows=10, time_fraction=1, missing_fraction=0.1)
    out2 = df2.__unicode__()
    print(out2)
    assert "e+" not in out2
    assert "E+" not in out2

    lines = out2.splitlines()[2:-2]  # skip header (first 2 lines) + footer (last 2 lines)
    regex = re.compile(r"(\d+)-(\d+)-(\d+) (\d+):(\d+):(\d+)")
    for l in lines:
        for entry in l.split("  "):
            entry = entry.strip()
            if entry == "": continue  # skip missing entries
            m = re.match(regex, entry)
            assert m is not None, "Failed to recognize time expression '%s'" % entry
            year = int(m.group(1))
            month = int(m.group(2))
            day = int(m.group(3))
            assert 1970 <= year <= 2020
            assert 1 <= month <= 12
            assert 1 <= day <= 31
Example 36
def call_shutdown():
    h2o.cluster().shutdown(prompt=True)   # call shutdown but do not actually shut anything down.
Example 37
from pysparkling import *
from pyspark.sql import SparkSession
import h2o

# Initialize the SparkSession
spark = SparkSession.builder.appName("App name").getOrCreate()

# Initialize the H2OContext
hc = H2OContext.getOrCreate(spark)

# Stop H2O and Spark services
h2o.cluster().shutdown()
spark.stop()
Example 38
 def stop_with_jvm(self):
     h2o.cluster().shutdown()
     self.stop()
Example 39

# Calculation input
LSIcutoff=4.50 # (Ang)


# Header for output file
header="      q              Sk             LSI(Ang^2)"
#print header




# Loop over all snapshots in file
for isnap in range(0,nsnap):
    
    snap=h2o.snapshot(nat[isnap], A, txyz[isnap], coordtype=ct)
    allmol=h2o.cluster(nat[isnap]/napm, A, snap.tx)
    allmol.H2Oindx
    allmol.FindH2Os(snap.A, snap.Ainv, cutoff=LSIcutoff)


#    allmol.printsnap( 'L1: '+ str(A.T[0][:])+' L2: '+ str(A.T[1][:]) +' L3: '+ str(A.T[2][:]) )        
    for imol in range(0, allmol.nmol):

        q, Sk=allmol.getTOPs(snap.A, snap.Ainv, imol)                  
        LSI=allmol.LSI(snap.A, snap.Ainv, imol, rcutoff=3.7) 

        print(q, Sk, LSI)

Example 40
 def tearDown(self):
     h2o.cluster().shutdown(prompt=False)
Example 41

# Define and print file header
header="rOO \t phi \t theta \t rOH11 \t rOH12 \t rOH21 \t rOH22 \t HOH1 \t HOH2 \t alpha \t beta \t gamma \t nu \t mu \t OdHOa"    
print(header)


# Loop over all snapshots in file
for icl in range(0,nsnap):
    
    # Create instance of the class snapshot
    snap=h2o.snapshot(nat[icl], A, txyz[icl], coordtype=ct)

    # Create instance of the class cluster, in this
    # case containing all molecules in the snapshot
    mycluster=h2o.cluster(nat[icl]/napm, A, snap.tx)
    mycluster.H2Oindx
    mycluster.wrap()    
    mycluster.FindH2Os(snap.A, snap.Ainv, cutoff=4.5)    


    # Find molecule roles in the dimer:
    # - centralmol labels the donor molecule (can take values 0,1) 
    # - secondarymol labels the acceptor molecule (can take values 0,1) 
    # - primaryh labels the donated H 
    centralmol, secondarymol, primaryh, secondaryh = mycluster.findHBandsort()


    # Center and orient with respect to molecule with index centralmol
    mycluster.CaO(centralmol)
Example 43
def call_badshutdown():
    h2o.cluster().shutdown(badparam=1, prompt=True)   # call shutdown but do not actually shut anything down.
Example 44
# Transform census table
# Remove all spaces from column names (causing problems in Spark SQL)
col_names = list(map(lambda s: s.strip().replace(' ', '_').replace('+', '_'), f_census.col_names))

# Update column names in the table
# f_weather.names = col_names
f_census.names = col_names


# Transform crimes table
# Drop useless columns
f_crimes = f_crimes[2:]

# Set time zone to UTC for date manipulation
h2o.cluster().timezone = "Etc/UTC"

# Replace ' ' by '_' in column names
col_names = list(map(lambda s: s.replace(' ', '_'), f_crimes.col_names))
f_crimes.names = col_names
refine_date_col(f_crimes, "Date", "%m/%d/%Y %I:%M:%S %p")
f_crimes = f_crimes.drop("Date")

# Expose H2O frames as Spark DataFrame

df_weather = h2oContext.as_spark_frame(f_weather)
df_census = h2oContext.as_spark_frame(f_census)
df_crimes = h2oContext.as_spark_frame(f_crimes)

# Register DataFrames as tables
df_weather.createOrReplaceTempView("chicagoWeather")
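Once the frames are registered as temp views, they can be queried with plain Spark SQL; a minimal follow-up (assumes the SparkSession from Example 37 is in scope as spark):

# Count the rows of the weather table through the registered view.
spark.sql("SELECT COUNT(*) AS n FROM chicagoWeather").show()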
Example 45
 def tearDownClass(cls):
     h2o.cluster().shutdown()
     unit_test_utils.tear_down_class(cls)