Example #1
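The snippets below are pyunit-style tests and helpers whose import headers were stripped by the listing. A minimal sketch of the imports most of them assume (pyunit_utils is the h2o-3 test helper; individual examples additionally use numpy, pandas, random, subprocess, and scikit-learn's KMeans and SimpleImputer):

import h2o
from h2o.estimators.kmeans import H2OKMeansEstimator
from h2o.grid.grid_search import H2OGridSearch
from tests import pyunit_utils  # h2o-3 test helper, assumed to be on sys.path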
def hdfs_kmeans():
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_iris_file = "/datasets/runit/iris_wheader.csv"
    hdfs_covtype_file = "/datasets/runit/covtype.data"

    print("Import iris_wheader.csv from HDFS")
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_iris_file)
    iris_h2o = h2o.import_file(url)
    n = iris_h2o.nrow
    print("rows: {0}".format(n))
    assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 150)

    print("Running KMeans on iris")
    iris_km = H2OKMeansEstimator(k=3, training_frame=iris_h2o[0:4], max_iterations=10)
    iris_km.train()
    print(iris_km)

    print("Importing covtype.data from HDFS")
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_covtype_file)
    covtype_h2o = h2o.import_file(url)
    n = covtype_h2o.nrow
    print("rows: {0}".format(n))
    assert n == 581012, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 581012)

    print("Running KMeans on covtype")
    covtype_km = H2OKMeansEstimator(training_frame=covtype_h2o[0:55], k=8, max_iterations=10)
    covtype_km.train()
    print(covtype_km)
def parametersKmeans():

    print("Getting data...")
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    print("Create and and duplicate...")

    iris_km = H2OKMeansEstimator(k=3, seed=1234)
    iris_km.train(x=list(range(4)), training_frame=iris)
    parameters = iris_km._model_json['parameters']
    param_dict = {}
    for p in range(len(parameters)):
        param_dict[parameters[p]['name']] = parameters[p]['actual_value']

    fold_column = param_dict['fold_column']
    del param_dict['fold_column']
    del param_dict['training_frame']
    del param_dict['validation_frame']
    del param_dict['max_runtime_secs']
    iris_km_again = H2OKMeansEstimator(
        **param_dict)  # not all parameters go here - invalid test
    iris_km_again.train(x=list(range(4)),
                        training_frame=iris,
                        fold_column=fold_column)

    print("wss")
    # list.sort() sorts in place and returns None, so use sorted() to compare values
    wss = sorted(iris_km.withinss())
    wss_again = sorted(iris_km_again.withinss())
    assert wss == wss_again, "expected wss to be equal"

    print("centers")
    centers = iris_km.centers()
    centers_again = iris_km_again.centers()
    assert centers == centers_again, "expected centers to be the same"
def parametersKmeans():

    print("Getting data...")
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    print("Create and and duplicate...")

    iris_km = H2OKMeansEstimator(k=3, seed=1234)
    iris_km.train(x=list(range(4)), training_frame=iris)
    parameters = iris_km._model_json['parameters']
    param_dict = {pp['name']: pp['actual_value'] for pp in parameters}
    fold_column = param_dict.pop('fold_column')
    del param_dict["model_id"]
    del param_dict['training_frame']
    del param_dict['validation_frame']
    del param_dict['max_runtime_secs']
    iris_km_again = H2OKMeansEstimator(
        **param_dict)  # not all parameters go here - invalid test
    # remove assigning the x parameter to prevent H2OValueError: Properties x and ignored_columns cannot be specified simultaneously
    iris_km_again.train(training_frame=iris, fold_column=fold_column)

    print("wss")
    # list.sort() sorts in place and returns None, so use sorted() to compare values
    wss = sorted(iris_km.withinss())
    wss_again = sorted(iris_km_again.withinss())
    assert wss == wss_again, "expected wss to be equal"

    print("centers")
    centers = iris_km.centers()
    centers_again = iris_km_again.centers()
    assert centers == centers_again, "expected centers to be the same"
Example #4
    def test_kmeans_fields(self):
        """
        test_kmeans_grid_search_over_validation_datasets performs the following:
        a. build H2O kmeans models using grid search.  Count and make sure models
           are only built for hyper-parameters set to legal values.  No model is built for bad hyper-parameters
           values.  We should instead get a warning/error message printed out.
        b. For each model built using grid search, we will extract the parameters used in building
           that model and manually build a H2O kmeans model.  Training metrics are calculated from the
           gridsearch model and the manually built model.  If their metrics
           differ by too much, print a warning message but don't fail the test.
        c. we will check and make sure the models are built within the max_runtime_secs time limit that was set
           for it as well.  If max_runtime_secs was exceeded, declare test failure.
        """

        print(
            "*******************************************************************************************"
        )
        h2o.cluster_info()

        good_params_list = {
            'max_iterations': 20,
            'k': 6,
            'init': 'Furthest',
            'seed': 1464891169
        }
        good_model_params = {'max_runtime_secs': 0.014673351}
        good_model = H2OKMeansEstimator(**good_params_list)
        good_model.train(x=self.x_indices,
                         training_frame=self.training1_data,
                         **good_model_params)

        bad_params_list = {
            'init': 'Random',
            'seed': 1464888628,
            'k': 6,
            'max_iterations': 0
        }
        bad_model_params = {'max_runtime_secs': 0.007948218600000001}
        bad_model = H2OKMeansEstimator(**bad_params_list)
        bad_model.train(x=self.x_indices,
                        training_frame=self.training1_data,
                        **bad_model_params)

        good_model_type = type(
            good_model._model_json['output']['model_summary'])
        bad_model_type = type(bad_model._model_json['output']['model_summary'])
        print(
            "good_model._model_json['output']['model_summary'] type is {0}.  "
            "bad_model._model_json['output']['model_summary'] type is "
            "{1}".format(good_model_type, bad_model_type))

        if good_model_type != bad_model_type:
            print("They are not equal for some reason....")
            self.test_failed = 1
        else:
            print("The fields are of the same type.")
def convergeKmeans():

    # Connect to a pre-existing cluster
    # connect to localhost:54321

    # Log.info("Importing ozone.csv data...\n")
    ozone_h2o = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/ozone.csv"))
    #ozone_h2o.summary()

    miters = 5
    ncent = 10

    # Log.info(paste("Run k-means in a loop of", miters, "iterations with max_iter = 1"))
    start = ozone_h2o[0:10, 0:4]

    # expect an error for 0 iterations
    try:
        H2OKMeansEstimator(max_iterations=0).train(
            x=list(range(ozone_h2o.ncol)), training_frame=ozone_h2o)
        assert False, "expected an error"
    except EnvironmentError:
        pass

    centers = start
    for i in range(miters):
        rep_fit = H2OKMeansEstimator(k=ncent,
                                     user_points=centers,
                                     max_iterations=1)
        rep_fit.train(x=list(range(ozone_h2o.ncol)), training_frame=ozone_h2o)
        centers = h2o.H2OFrame(rep_fit.centers())

    # Log.info(paste("Run k-means with max_iter=miters"))
    all_fit = H2OKMeansEstimator(k=ncent,
                                 user_points=start,
                                 max_iterations=miters)
    all_fit.train(x=list(range(ozone_h2o.ncol)), training_frame=ozone_h2o)
    assert rep_fit.centers() == all_fit.centers(
    ), "expected the centers to be the same"

    # Log.info("Check cluster centers have converged")
    all_fit2 = H2OKMeansEstimator(k=ncent,
                                  user_points=h2o.H2OFrame(all_fit.centers()),
                                  max_iterations=1)
    all_fit2.train(x=list(range(ozone_h2o.ncol)), training_frame=ozone_h2o)
    avg_change = sum(
        sum((e1 - e2) ** 2 for e1, e2 in zip(c1, c2))
        for c1, c2 in zip(all_fit.centers(), all_fit2.centers())
    ) / ncent
    assert avg_change < 1e-6 or all_fit._model_json['output']['iterations'] == miters
Example #6
def get_modelKmeans():
  # Connect to a pre-existing cluster
  # connect to localhost:54321

  #Log.info("Importing benign.csv data...\n")
  benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
  #benign_h2o.summary()

  benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
  # Impute missing values with column mean
  # sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; use
  # sklearn.impute.SimpleImputer instead (as the later examples below do)
  imp = SimpleImputer(missing_values=np.nan, strategy='mean')
  benign_sci = imp.fit_transform(benign_sci)


  for i in range(2,7):
    # Log.info("H2O K-Means")
    km_h2o = H2OKMeansEstimator(k=i)
    km_h2o.train(x=list(range(benign_h2o.ncol)), training_frame=benign_h2o)
    km_h2o.show()
    model = h2o.get_model(km_h2o._id)
    model.show()

    km_sci = KMeans(n_clusters=i, init='k-means++', n_init=1)
    km_sci.fit(benign_sci)
    print "sckit centers"
    print km_sci.cluster_centers_
Example #7
def test_kmeans_cv():
    data = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    km_model = H2OKMeansEstimator(k=3, nfolds=3, estimate_k=True)
    km_model.train(x=list(range(4)), training_frame=data)
    centers = km_model.centers()
    print(centers)

    # test cross validation model 1 has centroid stats
    cv_model1 = h2o.get_model(
        km_model._model_json['output']['cross_validation_models'][0]['name'])
    print(cv_model1)
    assert cv_model1._model_json['output']['training_metrics'][
        'centroid_stats'] is not None

    # test cross validation model 2 has centroid stats
    cv_model2 = h2o.get_model(
        km_model._model_json['output']['cross_validation_models'][1]['name'])
    print(cv_model2)
    assert cv_model2._model_json['output']['training_metrics'][
        'centroid_stats'] is not None

    # test cross validation model 3 has centroid stats
    cv_model3 = h2o.get_model(
        km_model._model_json['output']['cross_validation_models'][2]['name'])
    print(cv_model3)
    assert cv_model3._model_json['output']['training_metrics'][
        'centroid_stats'] is not None

    # test cross validation metrics do not have centroid stats
    print(km_model._model_json['output']['cross_validation_metrics'])
    assert km_model._model_json['output']['cross_validation_metrics'][
        'centroid_stats'] is None
Example #8
    def test_kmeans_grid_search_over_validation_datasets(self):
        """
        test_kmeans_grid_search_over_validation_datasets performs the following:
        a. build H2O kmeans models using grid search.
        b. For each model built using grid search, print out the total_sum_squares errors.
        c. If an exception was thrown, mark the test as failed.
        """
        print(
            "*******************************************************************************************"
        )
        print("test_kmeans_grid_search_over_validation_datasets for kmeans ")
        h2o.cluster_info()

        print("Hyper-parameters used here is {0}".format(self.hyper_params))

        # start grid search
        grid_model = H2OGridSearch(H2OKMeansEstimator(),
                                   hyper_params=self.hyper_params)
        grid_model.train(x=self.x_indices, training_frame=self.training1_data)

        for each_model in grid_model:
            summary_list = each_model._model_json["output"][
                "validation_metrics"]
            if (summary_list is not None) and (summary_list._metric_json
                                               is not None):
                grid_model_metrics = summary_list._metric_json['totss']
                print("total sum of squares of a model is: {0}".format(
                    grid_model_metrics))
            else:
                print(
                    'model._model_json["output"]["validation_metrics"] of a model is None for some reason....'
                )
Example #9
def prostateKmeans():
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    #Log.info("Importing prostate.csv data...\n")
    prostate_h2o = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    #prostate.summary()

    prostate_sci = np.loadtxt(
        pyunit_utils.locate("smalldata/logreg/prostate_train.csv"),
        delimiter=',',
        skiprows=1)
    prostate_sci = prostate_sci[:, 1:]

    from h2o.estimators.kmeans import H2OKMeansEstimator

    for i in range(5, 9):
        #Log.info(paste("H2O K-Means with ", i, " clusters:\n", sep = ""))
        #Log.info(paste( "Using these columns: ", colnames(prostate.hex)[-1]) )
        prostate_km_h2o = H2OKMeansEstimator(k=i)
        prostate_km_h2o.train(x=list(range(1, prostate_h2o.ncol)),
                              training_frame=prostate_h2o)
        prostate_km_h2o.show()

        prostate_km_sci = KMeans(n_clusters=i, init='k-means++', n_init=1)
        prostate_km_sci.fit(prostate_sci)
        print(prostate_km_sci.cluster_centers_)
Example #10
def iris_h2o_vs_sciKmeans():
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    iris_h2o = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    iris_sci = np.genfromtxt(pyunit_utils.locate("smalldata/iris/iris.csv"),
                             delimiter=',')
    iris_sci = iris_sci[:, 0:4]

    s = [[4.9, 3.0, 1.4, 0.2], [5.6, 2.5, 3.9, 1.1], [6.5, 3.0, 5.2, 2.0]]

    start = h2o.H2OFrame(list(zip(*s)))  # materialize the zip for Python 3

    h2o_km = H2OKMeansEstimator(k=3, user_points=start, standardize=False)
    h2o_km.train(x=list(range(4)), training_frame=iris_h2o)

    sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1)
    sci_km.fit(iris_sci)

    # Log.info("Cluster centers from H2O:")
    print "Cluster centers from H2O:"
    h2o_centers = h2o_km.centers()
    print h2o_centers

    # Log.info("Cluster centers from scikit:")
    print "Cluster centers from scikit:"
    sci_centers = sci_km.cluster_centers_.tolist()
    sci_centers = zip(*sci_centers)

    for hcenter, scenter in zip(h2o_centers, sci_centers):
        for hpoint, spoint in zip(hcenter, scenter):
            # use abs(): a negative difference would pass the raw check trivially
            assert abs(hpoint - spoint) < 1e-10, "expected centers to be the same"
def benignKmeans():
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    #  Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    #benign_h2o.summary()

    benign_sci = np.genfromtxt(
        pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    # sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; use
    # sklearn.impute.SimpleImputer instead
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    benign_sci = imp.fit_transform(benign_sci)

    # Log.info(paste("H2O K-Means with ", i, " clusters:\n", sep = ""))

    from h2o.estimators.kmeans import H2OKMeansEstimator

    for i in range(1, 7):
        benign_h2o_km = H2OKMeansEstimator(k=i)
        benign_h2o_km.train(x=list(range(benign_h2o.ncol)),
                            training_frame=benign_h2o)
        print("H2O centers")
        print(benign_h2o_km.centers())

        benign_sci_km = KMeans(n_clusters=i, init='k-means++', n_init=1)
        benign_sci_km.fit(benign_sci)
        print "sckit centers"
        print benign_sci_km.cluster_centers_
    def attack(train, x):
        kwargs = {}

        # randomly select parameters and their corresponding values
        kwargs['k'] = random.randint(1, 20)
        if random.randint(0, 1): kwargs['model_id'] = "my_model"
        if random.randint(0, 1): kwargs['max_iterations'] = random.randint(1, 1000)
        if random.randint(0, 1): kwargs['standardize'] = [True, False][random.randint(0, 1)]
        if random.randint(0, 1):
            method = random.randint(0, 3)
            if method == 3:
                # Can be simplified to: train[x].mean() + (train[x].runif() - 0.5)*200
                # once .runif() is fixed
                s = [[train[c].mean().getrow()[0] + random.uniform(-100, 100)
                      for p in range(kwargs['k'])] for c in x]
                print("s: {0}".format(s))
                start = h2o.H2OFrame(list(zip(*s)))
                kwargs['user_points'] = start
            else:
                kwargs['init'] = ["Furthest", "Random", "PlusPlus"][method]
        if random.randint(0, 1): kwargs['seed'] = random.randint(1, 10000)

        # display the parameters and their corresponding values
        print("-----------------------")
        print("x: {0}".format(x))
        for k, v in kwargs.items():
            if k == 'user_points':
                print(k + ": ")
                start.show()
            else:
                print(k + ": {0}".format(v))


        H2OKMeansEstimator(**kwargs).train(x=x, training_frame=train)
        print("-----------------------")
Example #13
def pyunit_model_params():
    pros = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    m = H2OKMeansEstimator(k=4)
    m.train(x=list(range(pros.ncol)), training_frame=pros)
    print(m.params)
    print(m.full_parameters)
def _get_kmeans_model(predictor_col, response_col, train_f, val_f):
    from h2o.estimators.kmeans import H2OKMeansEstimator
    kmeans_model = H2OKMeansEstimator(k=2, max_iterations=1000000)
    kmeans_model.train(x=predictor_col,
                       training_frame=train_f,
                       validation_frame=val_f)
    return kmeans_model
Example #15
def kmeans_start(grid_id, export_dir, train, params, hyper_parameters):
    grid = H2OGridSearch(H2OKMeansEstimator(),
                         grid_id=grid_id,
                         hyper_params=hyper_parameters,
                         recovery_dir=export_dir)
    grid.start(x=list(range(4)), training_frame=train, **params)
    return grid
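grid.start() launches the grid search asynchronously and returns immediately, so a caller typically blocks on it before inspecting results. A minimal usage sketch (the grid id, recovery directory, and hyper-parameter values are illustrative):

grid = kmeans_start("kmeans_grid", "/tmp/kmeans_recovery", train,
                    params={}, hyper_parameters={"k": [2, 3, 4]})
grid.join()  # wait for the asynchronous training to finish
print(grid.get_grid(sort_by="tot_withinss", decreasing=False))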
def k_means_export():
    print("###### K MEANS ######")
    frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    model = H2OKMeansEstimator(k=1)
    model.train(x=list(range(frame.ncol)), training_frame=frame)
    h2o.download_pojo(model, path=RESULT_DIR)
    model.download_mojo(path=RESULT_DIR)
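Once exported, the MOJO can be scored without the training cluster. A hedged sketch reusing the helper that also appears in build_mojo_pipeline below (RESULT_DIR and the benign.csv path come from the example above; get_genmodel_jar=True keeps the required h2o-genmodel.jar next to the MOJO):

mojo_path = model.download_mojo(path=RESULT_DIR, get_genmodel_jar=True)
preds = h2o.mojo_predict_csv(
    input_csv_path=pyunit_utils.locate("smalldata/logreg/benign.csv"),
    mojo_zip_path=mojo_path)
print(preds[:3])  # a list of per-row prediction dicts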
Example #17
def kmeans_model(df, xValues):
    
    hf = h2o.H2OFrame(df)
    train, valid, test = hf.split_frame(ratios=[.8, .1])    
    # kmeans model
    kmeans = H2OKMeansEstimator(k=3, max_iterations=5, seed=10,
                                categorical_encoding="AUTO", max_runtime_secs=10)
    kmeans.train(xValues, training_frame=hf)
    
    # pca model, generate Principal Components for further modelling or plotting
    
    pca = H2OPrincipalComponentAnalysisEstimator(k=4)
#    pca.train(xValues, training_frame= hf)
    pca.train(list(df.columns), training_frame= hf)
    pca_features = pca.predict(hf).as_data_frame()
    pca_metric = pca.summary().as_data_frame()
    
    # model metrics
    cluster_column = kmeans.predict(hf).as_data_frame()
    # The Between Cluster Sum-of-Square Error
    inter_cluster_error = kmeans.betweenss()
    # Within Cluster Sum-of-Square Error
    intra_cluster_error = kmeans.withinss()
    # Centroids
    centroids = kmeans.centers()
    # Size of clusters
    cluster_size = kmeans.size()
    cluster_column.columns = ['cluster']
    frames = [df, cluster_column]
    transformed_data = pd.concat(frames, axis=1)

    output = [transformed_data, pca_features, pca_metric, centroids,
              inter_cluster_error, intra_cluster_error, cluster_size]
    return output
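The function returns its results positionally; a short usage sketch unpacking them (df and xValues as in the example above):

(transformed, pca_feats, pca_metric, centroids,
 betweenss, withinss, sizes) = kmeans_model(df, xValues)
print("cluster sizes:", sizes)
print(transformed.head())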
Example #18
def k_means(xval=None, sample_size=None, nfolds=None, hparams=None):
    """
    create a k-means algorithm estimator
    :param xval: if for cross-validation
    :param sample_size: training set sample amount
    :param nfolds: k value for k-fold cross-validation
    :param hparams: hyper parameters for grid search
    :return: a constructed k-means estimator, a parameters' dict for grid search 
    """

    if sample_size <= 10000:
        if sample_size < 5000:
            default_nfolds = 3
        else:
            default_nfolds = 5
        k_opts = [3, 5, 10]
        max_iterations_opts = [5, 10, 20]
        standardize_opts = [0.1, 0.6, 0.8]

    elif 10000 < sample_size <= 100000:
        default_nfolds = 3
        k_opts = [3, 5, 10]
        max_iterations_opts = [5, 10, 20]
        standardize_opts = [0.1, 0.6]

    else:
        default_nfolds = 2
        k_opts = [3, 5, 10]
        max_iterations_opts = [5, 10]
        standardize_opts = [0.1, 0.6]

    default_hparams = {'k': k_opts,
                       'max_iterations': max_iterations_opts,
                       'standardize': standardize_opts}

    if nfolds is None:
        nfolds = default_nfolds
    if hparams is None:
        hparams = default_hparams

    if xval:
        km_estimator = H2OKMeansEstimator(nfolds=nfolds)
    else:
        km_estimator = H2OKMeansEstimator()

    return km_estimator, hparams
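A sketch of how the returned pair might feed H2OGridSearch (the frame is an assumption; note also that H2O's standardize parameter is boolean, so the float standardize_opts above would need replacing before the grid will run):

estimator, hparams = k_means(xval=True, sample_size=frame.nrow)
hparams['standardize'] = [True, False]  # H2O expects booleans here
grid = H2OGridSearch(estimator, hyper_params=hparams)
grid.train(x=frame.columns, training_frame=frame)
best = grid.get_grid(sort_by='tot_withinss', decreasing=False).models[0]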
def km_num_iterations():
  # Connect to a pre-existing cluster
  # connect to localhost:54321

  prostate_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

  from h2o.estimators.kmeans import H2OKMeansEstimator
  prostate_km_h2o = H2OKMeansEstimator(k=3, max_iterations=4)
  prostate_km_h2o.train(training_frame=prostate_h2o, x=list(range(1, prostate_h2o.ncol)))
  num_iterations = prostate_km_h2o.num_iterations()
  assert num_iterations <= 4, "Expected at most 4 iterations, but got {0}".format(num_iterations)
def emptyclusKmeans():
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    #Log.info("Importing ozone.csv data...\n")
    ozone_sci = np.loadtxt(pyunit_utils.locate("smalldata/glm_test/ozone.csv"),
                           delimiter=',',
                           skiprows=1)
    ozone_h2o = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/ozone.csv"))

    ncent = 10
    nempty = random.randint(1, ncent // 2)  # integer division for Python 3
    initial_centers = [[41, 190, 67, 7.4], [36, 118, 72, 8], [12, 149, 74, 12.6],
                       [18, 313, 62, 11.5], [23, 299, 65, 8.6], [19, 99, 59, 13.8],
                       [8, 19, 61, 20.1], [16, 256, 69, 9.7], [11, 290, 66, 9.2],
                       [14, 274, 68, 10.9]]
    for i in random.sample(range(ncent - 1), nempty):
        initial_centers[i] = [100 * i] * len(initial_centers[0])

    initial_centers_sci = np.asarray(initial_centers)
    initial_centers = list(zip(*initial_centers))

    initial_centers_h2o = h2o.H2OFrame(initial_centers)

    #Log.info("Initial cluster centers:")
    print "H2O initial centers:"
    initial_centers_h2o.show()
    print "scikit initial centers:"
    print initial_centers_sci

    # H2O can handle empty clusters and so can scikit
    #Log.info("Check that H2O can handle badly initialized centers")
    km_sci = KMeans(n_clusters=ncent, init=initial_centers_sci, n_init=1)
    km_sci.fit(preprocessing.scale(ozone_sci))
    print "scikit final centers"
    print km_sci.cluster_centers_

    from h2o.estimators.kmeans import H2OKMeansEstimator

    km_h2o = H2OKMeansEstimator(k=ncent,
                                user_points=initial_centers_h2o,
                                standardize=True)
    km_h2o.train(x=list(range(ozone_h2o.ncol)), training_frame=ozone_h2o)
    print("H2O final centers")
    print(km_h2o.centers())
def kmeans_mllib():
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_cross_file = "/datasets/runit/BigCross.data"

    print("Import BigCross.data from HDFS")
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
    cross_h2o = h2o.import_file(url)
    n = cross_h2o.nrow

    err_mllib = np.genfromtxt(
        pyunit_utils.locate("smalldata/mllib_bench/bigcross_wcsse.csv"),
        delimiter=",",
        skip_header=1)
    ncent = [int(err_mllib[r][0]) for r in range(len(err_mllib))]

    for k in ncent:
        print("Run k-means++ with k = {0} and max_iterations = 10".format(k))
        cross_km = H2OKMeansEstimator(training_frame=cross_h2o,
                                      k=k,
                                      init="PlusPlus",
                                      max_iterations=10,
                                      standardize=False)
        cross_km.train()

        clust_mllib = np.genfromtxt(
            pyunit_utils.locate("smalldata/mllib_bench/bigcross_centers_" +
                                str(k) + ".csv"),
            delimiter=",").tolist()
        clust_h2o = cross_km.centers()

        # Sort in ascending order by first dimension for comparison purposes
        clust_mllib.sort(key=lambda x: x[0])
        clust_h2o.sort(key=lambda x: x[0])

        print("\nMLlib Cluster Centers:\n")
        print(clust_mllib)
        print("\nH2O Cluster Centers:\n")
        print(clust_h2o)

        wcsse_mllib = err_mllib[err_mllib[0:4, 0].tolist().index(k)][1]
        wcsse_h2o = cross_km.tot_withinss() / n
        print("\nMLlib Average Within-Cluster SSE: {0}\n".format(wcsse_mllib))
        print("H2O Average Within-Cluster SSE: {0}\n".format(wcsse_h2o))
        assert wcsse_h2o == wcsse_mllib, "Expected mllib and h2o to get the same wcsse. Mllib got {0}, and H2O " \
                                         "got {1}".format(wcsse_mllib, wcsse_h2o)
def tuneAndTrain(trainDataFrame):
    h2o.init()
    #trainData=trainDataFrame
    trainDataHex = h2o.H2OFrame(trainDataFrame)
    # to consider categorical columns uncomment all the comments
    dc = DataCollection()
    categoricalColumns = dc.findCategorical(trainDataFrame)
    trainDataHex[categoricalColumns] = trainDataHex[categoricalColumns].asfactor()
    #
    # note: hyper_params values are normally lists, as in the commented range above
    #k = range(1,len(trainDataFrame))
    k = len(trainDataFrame) - 1
    hyperParameters = {"k": k}
    modelGrid = H2OGridSearch(H2OKMeansEstimator(ignore_const_cols=False),
                              hyper_params=hyperParameters)
    modelGrid.train(x=list(range(len(trainDataFrame.columns))),
                    training_frame=trainDataHex)
    # lower MSE is better, so sort ascending to put the best model first
    gridperf1 = modelGrid.get_grid(sort_by='mse', decreasing=False)
    bestModel = gridperf1.models[0]
    return bestModel
    #
    """model = H2OKMeansEstimator(k=5, estimate_k=True, ignore_const_cols=False)"""
Example #23
def benign_kmeans():
    print("Importing benign.csv data...")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))

    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    imp = SimpleImputer(missing_values=np.nan, strategy="mean")
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(1,7):
        print("H2O K-Means with " + str(i) + " clusters:")
        benign_h2o_km = H2OKMeansEstimator(k=i)
        benign_h2o_km.train(x=list(range(benign_h2o.ncol)), training_frame=benign_h2o)
        print("H2O centers")
        print(benign_h2o_km.centers())

        benign_sci_km = KMeans(n_clusters=i, init='k-means++', n_init=1)
        benign_sci_km.fit(benign_sci)
        print("sckit centers")
        print(benign_sci_km.cluster_centers_)
def ozoneKM():
    # Connect to a pre-existing cluster
    # connect to localhost:54321

    train = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/ozone.csv"))

    # See that the data is ready (describe() prints its summary and returns None)
    train.describe()

    # Run KMeans

    from h2o.estimators.kmeans import H2OKMeansEstimator
    my_km = H2OKMeansEstimator(k=10, init="PlusPlus", max_iterations=100)
    my_km.train(x=list(range(train.ncol)), training_frame=train)
    my_km.show()
    my_km.summary()

    my_pred = my_km.predict(train)
    my_pred.describe()
def get_model_kmeans():
    print("Importing benign.csv data...")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))

    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    imp = SimpleImputer(missing_values=np.nan, strategy="mean")
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(2,7):
        km_h2o = H2OKMeansEstimator(k=i)
        km_h2o.train(x=list(range(benign_h2o.ncol)), training_frame=benign_h2o)
        km_h2o.show()
        model = h2o.get_model(km_h2o._id)
        model.show()

        km_sci = KMeans(n_clusters=i, init='k-means++', n_init=1)
        km_sci.fit(benign_sci)
        print("sckit centers")
        print(km_sci.cluster_centers_)
def build_mojo_pipeline():
    results_dir = pyunit_utils.locate("results")
    iris_csv = pyunit_utils.locate('smalldata/iris/iris_train.csv')
    iris = h2o.import_file(iris_csv)

    pca = H2OPrincipalComponentAnalysisEstimator(k=2)
    pca.train(training_frame=iris)

    principal_components = pca.predict(iris)

    km = H2OKMeansEstimator(k=3)
    km.train(training_frame=principal_components)

    pca_mojo_path = pca.download_mojo(path=results_dir)
    km_mojo_path = km.download_mojo(get_genmodel_jar=True, path=results_dir)

    java_cmd = [
        "java", "-cp",
        os.path.join(results_dir, "h2o-genmodel.jar"),
        "hex.genmodel.tools.BuildPipeline", "--mapping"
    ]
    pca_mojo_name = os.path.basename(pca_mojo_path).split('.')[0]
    for i, pc in enumerate(principal_components.columns):
        mapping = pc + '=' + pca_mojo_name + ':' + str(i)
        java_cmd += [mapping]
    java_cmd += [
        "--output",
        os.path.join(results_dir, "pipe.zip"), "--input", km_mojo_path,
        pca_mojo_path
    ]

    subprocess.Popen(java_cmd, stdout=PIPE, stderr=STDOUT).communicate()

    h2o_preds = km.predict(principal_components)
    mojo_preds_raw = h2o.mojo_predict_csv(input_csv_path=iris_csv,
                                          mojo_zip_path=os.path.join(
                                              results_dir, "pipe.zip"))
    mojo_preds = h2o.H2OFrame([c['cluster'] for c in mojo_preds_raw],
                              column_names=['predict'])

    assert (mojo_preds == h2o_preds).mean()[0, "predict"] == 1
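The Popen call above discards the Java exit status; a small defensive variant (a sketch, not part of the original test):

proc = subprocess.Popen(java_cmd, stdout=PIPE, stderr=STDOUT)
out, _ = proc.communicate()
assert proc.returncode == 0, out  # surface the BuildPipeline error output on failure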
Example #27
    def test_kmeans_hangup(self):
        """
        train a kmeans model with some parameters that will make the system hang.
        """

        print(
            "*******************************************************************************************"
        )
        h2o.cluster_info()

        good_params_list = {
            'seed': 1464837706,
            'max_iterations': 50,
            'init': 'Furthest',
            'k': 5
        }
        good_model_params = {'max_runtime_secs': 0.001}
        good_model = H2OKMeansEstimator(**good_params_list)
        good_model.train(x=self.x_indices,
                         training_frame=self.training1_data,
                         **good_model_params)

        print("Finished.")
    def attack(train, x):
        kwargs = {}

        # randomly select parameters and their corresponding values
        kwargs['k'] = random.randint(1, 20)
        if random.randint(0, 1): kwargs['model_id'] = "my_model"
        if random.randint(0, 1):
            kwargs['max_iterations'] = random.randint(1, 1000)
        if random.randint(0, 1):
            kwargs['standardize'] = [True, False][random.randint(0, 1)]
        if random.randint(0, 1):
            method = random.randint(0, 3)
            if method == 3:
                s = [[
                    random.uniform(train[c].mean()[0] - 100,
                                   train[c].mean()[0] + 100)
                    for p in range(kwargs['k'])
                ] for c in x]
                print "s: {0}".format(s)
                start = h2o.H2OFrame(s)
                kwargs['user_points'] = start
            else:
                kwargs['init'] = ["Furthest", "Random", "PlusPlus"][method]
        if random.randint(0, 1): kwargs['seed'] = random.randint(1, 10000)

        # display the parameters and their corresponding values
        print "-----------------------"
        print "x: {0}".format(x)
        for k, v in zip(kwargs.keys(), kwargs.values()):
            if k == 'user_points':
                print k + ": "
                start.show()
            else:
                print k + ": {0}".format(v)

        H2OKMeansEstimator(**kwargs).train(x=x, training_frame=train)
        print "-----------------------"
def KMeans_ClusteringH2O(data, metric, parameters):
    try:
        h2o.init()
        rfm_data = h2o.H2OFrame(data)
        train, valid = rfm_data.split_frame(
            ratios=[constants.clustering_parameters['split_ratio']],
            seed=constants.clustering_parameters['seed'])
        rfm_kmeans = H2OKMeansEstimator(
            k=constants.clustering_parameters['k'],
            seed=constants.clustering_parameters['seed'],
            max_iterations=int(len(data) / 2))
        rfm_kmeans.train(x=metric,
                         training_frame=train,
                         validation_frame=valid)
        grid = H2OGridSearch(
            model=rfm_kmeans,
            hyper_params=constants.clustering_parameters['hyper_params'],
            search_criteria=constants.clustering_parameters['search_criteria'])
        # train using the grid
        grid.train(x=metric, training_frame=train, validation_frame=valid)

        # sort the grid models by total within cluster sum-of-square error.
        sorted_grid = grid.get_grid(sort_by='tot_withinss', decreasing=False)
        prediction = sorted_grid[0].predict(rfm_data)
        data = rfm_data.concat(prediction, axis=1)[[metric, 'predict']] \
            .as_data_frame(use_pandas=True)
        data = data.rename(columns={'predict': metric + '_segment'})
        data[metric + '_segment'] = data[metric + '_segment'].apply(lambda x: x + 1)
        if parameters['is_h2o_cluster_shut_down']:
            h2o.shutdown(prompt=False)
    except Exception:
        # on failure the original input is returned unchanged
        if parameters['is_h2o_cluster_shut_down']:
            h2o.shutdown(prompt=False)
    return data
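The example assumes a project-local constants module; a hypothetical shape for reference (all values illustrative, not from the original source):

# constants.py (hypothetical)
clustering_parameters = {
    'split_ratio': 0.8,
    'seed': 1234,
    'k': 4,
    'hyper_params': {'k': [2, 3, 4, 5]},
    'search_criteria': {'strategy': 'Cartesian'},
}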
def test_constrained_kmeans():

    iris_h2o = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    k = 3

    start = h2o.H2OFrame([[4.9, 3.0, 1.4, 0.2], [5.6, 2.5, 3.9, 1.1],
                          [6.5, 3.0, 5.2, 2.0]])

    constraints = [[100, 40, 1], [100, 1, 1], [1, 100, 1], [1, 40, 100],
                   [1, 1, 1], [1, 1, 148], [147, 1, 1], [1, 148, 1], [1, 1, 1],
                   [50, 50, 50]]

    for i in range(len(constraints)):
        for standardize in [True, False]:
            print("===== Train KMeans model with constraints: ======")
            print(constraints[i])
            kmm = H2OKMeansEstimator(k=k,
                                     user_points=start,
                                     standardize=standardize,
                                     cluster_size_constraints=constraints[i],
                                     score_each_iteration=True)
            kmm.train(x=list(range(4)), training_frame=iris_h2o)

            kmm.show()

            for j in range(k):
                number_points = kmm._model_json['output']['training_metrics'] \
                    ._metric_json['centroid_stats']._cell_values[j][2]
                # clusters are numbered 1..k, so report j + 1 (not the constraint-set index i)
                assert number_points >= constraints[i][j], \
                    "Number of points ({0}) in cluster {1} should be >= constraint value ({2})".format(
                        number_points, j + 1, constraints[i][j])