def test_load_glrm():
  print("Importing iris_wheader.csv data...")
  irisH2O = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
  irisH2O.describe()

  g_model = H2OGeneralizedLowRankEstimator(k=3)
  g_model.train(x=irisH2O.names, training_frame=irisH2O)
  yarch_old = g_model.archetypes()
  x_old = h2o.get_frame(g_model._model_json["output"]["representation_name"])
  predOld = g_model.predict(irisH2O)
  TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "../..", "results"))

  try:
    TMPDIR = pyunit_utils.locate("results")    # find directory path to results folder
  except Exception:
    os.makedirs(TMPDIR)
  h2o.save_model(g_model, path=TMPDIR, force=True)       # save model
  full_path_filename = os.path.join(TMPDIR, g_model._id)

  h2o.remove(g_model)
  model_reloaded = h2o.load_model(full_path_filename)
  pred = model_reloaded.predict(irisH2O)
  yarch = model_reloaded.archetypes()
  x = h2o.get_frame(model_reloaded._model_json["output"]["representation_name"])

  # assert that old and new results are close; the archetypes should be the same
  pyunit_utils.compare_frames_local(x, x_old, tol=1e-6)
  pyunit_utils.compare_frames_local(pred[0], predOld[0], tol=1)
  for k in range(3):
    pyunit_utils.equal_two_arrays(yarch_old[k], yarch[k], eps=1e-4, tolerance=1e-10)

  print("glrm model successfully loaded...")
def svd_1_golden():

    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    print("Compare with SVD")
    from h2o.transforms.decomposition import H2OSVD

    fitH2O = H2OSVD(nv=4, transform="NONE", max_iterations=2000)
    fitH2O.train(x=list(range(4)), training_frame=arrestsH2O)

    print("Compare singular values (D)")
    h2o_d = fitH2O._model_json["output"]["d"]
    r_d = [1419.06139509772, 194.825846110138, 45.6613376308754, 18.0695566224677]
    print("R Singular Values: {0}".format(r_d))
    print("H2O Singular Values: {0}".format(h2o_d))
    for r, h in zip(r_d, h2o_d):
        assert abs(r - h) < 1e-6, "H2O got {0}, but R got {1}".format(h, r)

    print("Compare right singular vectors (V)")
    h2o_v = h2o.as_list(h2o.get_frame(fitH2O._model_json["output"]["v_key"]["name"]), use_pandas=False)
    h2o_v.pop(0)
    r_v = [
        [-0.04239181, 0.01616262, -0.06588426, 0.99679535],
        [-0.94395706, 0.32068580, 0.06655170, -0.04094568],
        [-0.30842767, -0.93845891, 0.15496743, 0.01234261],
        [-0.10963744, -0.12725666, -0.98347101, -0.06760284],
    ]
    print("R Right Singular Vectors: {0}".format(r_v))
    print("H2O Right Singular Vectors: {0}".format(h2o_v))
    for rl, hl in zip(r_v, h2o_v):
        for r, h in zip(rl, hl):
            assert abs(abs(r) - abs(float(h))) < 1e-5, "H2O got {0}, but R got {1}".format(h, r)

    print("Compare left singular vectors (U)")
    h2o_u = h2o.as_list(h2o.get_frame(fitH2O._model_json["output"]["u_key"]["name"]), use_pandas=False)
    h2o_u.pop(0)
    r_u = [
        [-0.1716251, 0.096325710, 0.06515480, 0.15369551],
        [-0.1891166, 0.173452566, -0.42665785, -0.17801438],
        [-0.2155930, 0.078998111, 0.02063740, -0.28070784],
        [-0.1390244, 0.059889811, 0.01392269, 0.01610418],
        [-0.2067788, -0.009812026, -0.17633244, -0.21867425],
        [-0.1558794, -0.064555293, -0.28288280, -0.11797419],
    ]
    print("R Left Singular Vectors: {0}".format(r_u))
    print("H2O Left Singular Vectors: {0}".format(h2o_u))
    for rl, hl in zip(r_u, h2o_u):
        for r, h in zip(rl, hl):
            assert abs(abs(r) - abs(float(h))) < 1e-5, "H2O got {0}, but R got {1}".format(h, r)
Example 3
def h2oget_frame():
    """
    Python API test: h2o.get_frame(frame_id)
    """
    frame1 = h2o.import_file(pyunit_utils.locate("smalldata/jira/hexdev_29.csv"))
    frame2 = h2o.get_frame(frame1.frame_id)
    assert_is_type(frame2, H2OFrame)
def glrm_subset():
  acs_orig = h2o.upload_file(path=pyunit_utils.locate("bigdata/laptop/census/ACS_13_5YR_DP02_cleaned.zip"), col_types = (['enum'] + ['numeric']*149))
  
  acs_full = acs_orig.drop("ZCTA5")
  acs_model = H2OGeneralizedLowRankEstimator(k = 10,
                                             transform = 'STANDARDIZE',
                                             loss = 'Quadratic',
                                             regularization_x = 'Quadratic',
                                             regularization_y = 'L1',
                                             gamma_x = 0.25,
                                             gamma_y = 0.5,
                                             max_iterations = 1)
  
  acs_model.train(x = acs_full.names, training_frame= acs_full)
  zcta_arch_x = h2o.get_frame(acs_model._model_json['output']['representation_name'])
  print(zcta_arch_x)
  
  acs_zcta_col = acs_orig["ZCTA5"].asfactor()
  
  idx = ((acs_zcta_col == '10065') |   # Manhattan, NY (Upper East Side)
         (acs_zcta_col == '11219') |   # Manhattan, NY (East Harlem)
         (acs_zcta_col == '66753') |   # McCune, KS
         (acs_zcta_col == '84104') |   # Salt Lake City, UT
         (acs_zcta_col == '94086') |   # Sunnyvale, CA
         (acs_zcta_col == '95014'))    # Cupertino, CA
  
  print(zcta_arch_x[idx,[0,1]])
def glrm_set_loss_by_col():
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsPy = np.array(h2o.as_list(arrestsH2O))
    arrestsH2O.describe()
    
    print("H2O GLRM with loss by column = Absolute, Quadratic, Quadratic, Huber")
    glrm_h2o = h2o.glrm(x=arrestsH2O, k=3, loss="Quadratic", loss_by_col=["Absolute","Huber"], loss_by_col_idx=[0,3], regularization_x="None", regularization_y="None")
    glrm_h2o.show()
    
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    
    print("Check final objective function value")
    fit_xy = np.dot(fit_x_np, fit_y_np)
    fit_diff = arrestsPy.__sub__(fit_xy)
    obj_val = np.absolute(fit_diff[:,0]) + np.square(fit_diff[:,1]) + np.square(fit_diff[:,2])
    def huber(a):
        return a*a/2 if abs(a) <= 1 else abs(a)-0.5
    huber = np.vectorize(huber)
    obj_val = obj_val + huber(fit_diff[:,3])
    obj_val = np.sum(obj_val)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    assert abs(glrm_obj - obj_val) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(obj_val)
Example 6
def pca_prostate(ip, port):
    h2o.init(ip, port)

    print("Importing prostate.csv data...\n")
    prostate = h2o.upload_file(h2o.locate("smalldata/logreg/prostate.csv"))

    print("Converting CAPSULE, RACE, DPROS and DCAPS columns to factors")
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate["RACE"] = prostate["RACE"].asfactor()
    prostate["DPROS"] = prostate["DPROS"].asfactor()
    prostate["DCAPS"] = prostate["DCAPS"].asfactor()
    prostate.describe()

    print("PCA on columns 3 to 9 with k = 3, retx = FALSE, transform = 'NONE'")
    fitPCA = h2o.prcomp(x=prostate[2:9], k=3, transform="NONE", pca_method="Power")
    pred1 = fitPCA.predict(prostate)
    pred2 = h2o.get_frame(fitPCA._model_json['output']['loading_key']['name'])

    print("Compare dimensions of projection and loading matrix")
    print("Projection matrix:\n")
    print(pred1.head())
    print("Loading matrix:\n")
    print(pred2.head())
    assert pred1.nrow() == pred2.nrow(), \
        "Expected same number of rows, but got {0} and {1}".format(pred1.nrow(), pred2.nrow())
    assert pred1.ncol() == pred2.ncol(), \
        "Expected same number of columns, but got {0} and {1}".format(pred1.ncol(), pred2.ncol())
Example 7
 def cross_validation_fold_assignment(self):
   """
   Obtain the cross-validation fold assignment for all rows in the training data.
   :return: H2OFrame
   """
   fid = self._model_json["output"]["cross_validation_fold_assignment_frame_id"]
   if fid is None: return None
   return h2o.get_frame(fid["name"])
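A minimal usage sketch for this accessor (hedged: the GBM settings and iris file below are illustrative, not from the source; the keep_cross_validation_fold_assignment flag must be set at training time or the frame id in the model JSON stays None):

import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

h2o.init()
iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_wheader.csv")

# enable the fold-assignment frame, otherwise cross_validation_fold_assignment() returns None
gbm = H2OGradientBoostingEstimator(ntrees=5, nfolds=3,
                                   keep_cross_validation_fold_assignment=True)
gbm.train(x=iris.names[:-1], y="class", training_frame=iris)

folds = gbm.cross_validation_fold_assignment()  # one fold index per training row
print(folds.head())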
Example 8
 def fit(self, fr, **fit_params):
     res = []
     for step in self.steps:
         res.append(step[1].to_rest(step[0]))
     res = "[" + ",".join([_quoted(r.replace('"', "'")) for r in res]) + "]"
     j = H2OConnection.post_json(url_suffix="Assembly", steps=res, frame=fr.frame_id, _rest_version=99)
     self.id = j["assembly"]["name"]
     return get_frame(j["result"]["name"])
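For context, a hedged sketch of how this fit is usually driven through H2OAssembly (the step classes and import paths follow the h2o munging docs as I recall them; treat them as assumptions, not a verified API):

import h2o
from h2o.frame import H2OFrame
from h2o.assembly import H2OAssembly
from h2o.transforms.preprocessing import H2OColSelect, H2OColOp

h2o.init()
iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_wheader.csv")

# each step is a (name, transform) pair; fit() POSTs the serialized steps to the Assembly endpoint
assembly = H2OAssembly(steps=[
    ("col_select", H2OColSelect(["sepal_len", "petal_len", "class"])),
    ("cos_sep_len", H2OColOp(op=H2OFrame.cos, col="sepal_len", inplace=True)),
])
result = assembly.fit(iris)  # the transformed H2OFrame fetched via get_frame
print(result.head())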
Example 9
 def cross_validation_holdout_predictions(self):
   """
   Obtain the (out-of-sample) holdout predictions of all cross-validation models on the training data.
   This is equivalent to summing up all H2OFrames returned by cross_validation_predictions.
   :return: H2OFrame
   """
   preds = self._model_json["output"]["cross_validation_holdout_predictions_frame_id"]
   if preds is None: return None
   return h2o.get_frame(preds["name"])
Example 10
  def deepfeatures(self, test_data, layer):
    """
    Return hidden layer details

    :param test_data: Data to create a feature space on
    :param layer: 0-based index of the hidden layer
    """
    if test_data is None: raise ValueError("Must specify test data")
    j = H2OConnection.post_json("Predictions/models/" + self._id + "/frames/" + test_data._id, deep_features_hidden_layer=layer)
    return h2o.get_frame(j["predictions_frame"]["name"])
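A short hedged sketch of calling deepfeatures (the deep learning model and iris frame are illustrative assumptions):

import h2o
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

h2o.init()
iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_wheader.csv")

dl = H2ODeepLearningEstimator(hidden=[10, 10], epochs=1)
dl.train(x=iris.names[:-1], y="class", training_frame=iris)

# project the input rows into the feature space of hidden layer 0
layer0 = dl.deepfeatures(iris, 0)
print(layer0.ncols)  # expected to match the layer width (10 here)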
Example 11
  def predict(self, test_data):
    """
    Predict on a dataset.

    :param test_data: Data to be predicted on.
    :return: A new H2OFrame filled with predictions.
    """
    if not test_data: raise ValueError("Must specify test data")
    j = H2OConnection.post_json("Predictions/models/" + self._id + "/frames/" + test_data._id)
    prediction_frame_id = j["model_metrics"][0]["predictions"]["frame_id"]["name"]
    return h2o.get_frame(prediction_frame_id)
Example 12
  def deepfeatures(self, test_data, layer):
    """
    Return hidden layer details

    :param test_data: Data to create a feature space on
    :param layer: 0-based index of the hidden layer
    """
    if test_data is None: raise ValueError("Must specify test data")
    j = H2OJob(h2o.H2OConnection.post_json("Predictions/models/" + self._id + "/frames/" + test_data.frame_id, deep_features_hidden_layer=layer, _rest_version=4), "deepfeatures")
    j.poll()
    return h2o.get_frame(j.dest_key)
Example 13
 def weights(self, matrix_id=0):
   """
   Return the frame for the respective weight matrix
   :param matrix_id: an integer, ranging from 0 to number of layers, that specifies the weight matrix to return.
   :return: an H2OFrame which represents the weight matrix identified by matrix_id
   """
   num_weight_matrices = len(self._model_json['output']['weights'])
   if matrix_id not in list(range(num_weight_matrices)):
     raise ValueError("Weight matrix does not exist. Model has {0} weight matrices (0-based indexing), but matrix {1} "
                      "was requested.".format(num_weight_matrices, matrix_id))
   return h2o.get_frame(self._model_json['output']['weights'][matrix_id]['URL'].split('/')[3])
Example 14
 def biases(self, vector_id=0):
   """
   Return the frame for the respective bias vector
   :param vector_id: an integer, ranging from 0 to number of layers, that specifies the bias vector to return.
   :return: an H2OFrame which represents the bias vector identified by vector_id
   """
   num_bias_vectors = len(self._model_json['output']['biases'])
   if vector_id not in list(range(num_bias_vectors)):
     raise ValueError("Bias vector does not exist. Model has {0} bias vectors (0-based indexing), but vector {1} "
                      "was requested.".format(num_bias_vectors, vector_id))
   return h2o.get_frame(self._model_json['output']['biases'][vector_id]['URL'].split('/')[3])
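A hedged sketch exercising both weights() and biases() above (the model settings are illustrative; export_weights_and_biases=True is what populates the 'weights'/'biases' entries in the model JSON):

import h2o
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

h2o.init()
iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_wheader.csv")

# without export_weights_and_biases=True the weight/bias frames are not kept
dl = H2ODeepLearningEstimator(hidden=[5], epochs=1, export_weights_and_biases=True)
dl.train(x=iris.names[:-1], y="class", training_frame=iris)

w0 = dl.weights(0)   # weight matrix between the input layer and hidden layer 0
b0 = dl.biases(0)    # bias vector of hidden layer 0
print(w0.dim, b0.dim)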
Example 15
 def cross_validation_predictions(self):
   """
   Obtain the (out-of-sample) holdout predictions of all cross-validation models on their holdout data.
   Note that the predictions are expanded to the full number of rows of the training data, with 0 fill-in.
   :return: list of H2OFrame objects
   """
   preds = self._model_json["output"]["cross_validation_predictions"]
   if preds is None: return None
   m = []
   for p in preds: m.append(h2o.get_frame(p["name"]))
   return m
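A hedged sketch tying this together with cross_validation_holdout_predictions above (settings are illustrative; both accessors require keep_cross_validation_predictions=True at training time):

import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

h2o.init()
iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_wheader.csv")

gbm = H2OGradientBoostingEstimator(ntrees=5, nfolds=3,
                                   keep_cross_validation_predictions=True)
gbm.train(x=iris.names[:-1], y="class", training_frame=iris)

holdout = gbm.cross_validation_holdout_predictions()  # one combined frame
per_model = gbm.cross_validation_predictions()        # list with one frame per fold model
assert holdout.nrows == iris.nrows
assert len(per_model) == 3  # each frame is expanded to the full training length, 0-filled outside its fold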
Example 16
  def predict(self, test_data):
    """
    Predict on a dataset.

    :param test_data: Data to be predicted on.
    :return: A new H2OFrame filled with predictions.
    """
    if not isinstance(test_data, H2OFrame): raise ValueError("test_data must be an instance of H2OFrame")
    j = H2OConnection.post_json("Predictions/models/" + self.model_id + "/frames/" + test_data.frame_id)
    return h2o.get_frame(j["predictions_frame"]["name"])
Example 17
    def predict_leaf_node_assignment(self, test_data):
        """
        Predict on a dataset and return the leaf node assignment (only for tree-based models).

        :param H2OFrame test_data: Data on which to make predictions.

        :returns: A new H2OFrame of predictions.
        """
        if not isinstance(test_data, h2o.H2OFrame): raise ValueError("test_data must be an instance of H2OFrame")
        j = h2o.api("POST /3/Predictions/models/%s/frames/%s" % (self.model_id, test_data.frame_id),
                    data={"leaf_node_assignment": True})
        return h2o.get_frame(j["predictions_frame"]["name"])
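A hedged usage sketch for leaf node assignment (the tree model and data are illustrative; the returned frame holds one path column per tree, with cells encoding left/right splits such as 'RLL'):

import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

h2o.init()
iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_wheader.csv")

gbm = H2OGradientBoostingEstimator(ntrees=3, max_depth=3)
gbm.train(x=iris.names[:-1], y="class", training_frame=iris)

# each cell is the path an observation takes through one tree
leaf_paths = gbm.predict_leaf_node_assignment(iris)
print(leaf_paths.head())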
Example 18
def glrm_nnmf():
    m = 1000
    n = 100
    k = 10

    print "Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n)
    Y = np.random.rand(k, n)
    X = np.random.rand(m, k)
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame.fromPython(zip(*train.tolist()))

    print "Run GLRM with non-negative regularization"
    initial_y = np.random.rand(n, k)
    initial_y_h2o = h2o.H2OFrame.fromPython(initial_y.tolist())

    glrm_h2o = H2OGeneralizedLowRankEstimator(
        k=k,
        init="User",
        user_y=initial_y_h2o,
        loss="Quadratic",
        regularization_x="NonNegative",
        regularization_y="NonNegative",
        gamma_x=1,
        gamma_y=1,
    )
    glrm_h2o.train(x=train_h2o.names, training_frame=train_h2o)
    glrm_h2o.show()

    print "Check that X and Y matrices are non-negative"
    fit_y = glrm_h2o._model_json["output"]["archetypes"].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_x = h2o.get_frame(glrm_h2o._model_json["output"]["representation_name"])
    fit_x_np = np.array(h2o.as_list(fit_x))
    assert np.all(fit_y_np >= 0), "Y must contain only non-negative elements"
    assert np.all(fit_x_np >= 0), "X must contain only non-negative elements"

    print "Check final objective function value"
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json["output"]["objective"]
    sse = np.sum(np.square(train.__sub__(fit_xy)))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print "Impute XY and check error metrics"
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json["output"]["training_metrics"]._metric_json["numerr"]
    glrm_caterr = glrm_h2o._model_json["output"]["training_metrics"]._metric_json["caterr"]
    assert abs(glrm_numerr - glrm_obj) < 1e-3, (
        "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    )
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
Example 19
    def deepfeatures(self, test_data, layer):
        """
        Return hidden layer details.

        :param test_data: Data to create a feature space on
        :param layer: 0-based index of the hidden layer
        """
        if test_data is None: raise ValueError("Must specify test data")
        j = H2OJob(h2o.api("POST /4/Predictions/models/%s/frames/%s" % (self._id, test_data.frame_id),
                           data={"deep_features_hidden_layer": layer}), "deepfeatures")
        j.poll()
        return h2o.get_frame(j.dest_key)
def glrm_mojo():
    h2o.remove_all()
    NTESTROWS = 200    # number of test dataset rows
    df = pyunit_utils.random_dataset("regression", seed=1234)       # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = df.names

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types)-1)]

    # build a GLRM model with random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10, seed=1234)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])

    assert glrmTrainFactor.nrows==train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel)  # save mojo model

    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results", MOJONAME))
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file; h2o predict and mojo predict use the same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=True)  # save mojo predict

    h2o.save_model(glrmModel, TMPDIR)   # save GLRM model
    glrmModel2 = h2o.load_model(os.path.join(TMPDIR, MOJONAME))
    predict_model = glrmModel2.predict(test)
    for col in range(pred_h2o.ncols):
        if pred_h2o[col].isfactor():
            pred_h2o[col] = pred_h2o[col].asnumeric()
            predict_model[col] = predict_model[col].asnumeric()
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
    print("Comparing mojo predict and h2o predict from saved model...")
    pyunit_utils.compare_frames_local(pred_mojo, predict_model, 1, tol=1e-10)
    frameID, mojoXFactor = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=False) # save mojo XFactor
    glrmTestFactor = h2o.get_frame("GLRMLoading_"+frameID)   # store the x Factor for new test dataset
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(glrmTestFactor, mojoXFactor, 1, tol=1e-10)
def glrm_unitonesparse():
    m = 1000
    n = 100
    k = 10

    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k,n)
    def ind_list(k):
        tmp = [0] * k
        tmp[np.random.randint(0,k)] = 1
        return tmp
    X = [ind_list(k) for x in range(m)]
    X = np.array(X)
    train = np.dot(X,Y)
    train_h2o = h2o.H2OFrame(list(zip(*train.tolist())))

    print("Run GLRM with unit one-sparse regularization on X")
    initial_y = np.random.rand(k,n)
    initial_y_h2o = h2o.H2OFrame(list(zip(*initial_y.tolist())))
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=k, init="User", user_y=initial_y_h2o, loss="Quadratic", regularization_x="UnitOneSparse", regularization_y="None", gamma_x=1, gamma_y=0)
    glrm_h2o.train(x=train_h2o.names,training_frame=train_h2o)
    glrm_h2o.show()

    print("Check that X matrix consists of rows of basis vectors")
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    def is_basis(a):
        zeros = np.where(a == 0)[0].size
        ones = np.where(a == 1)[0].size
        basis = ones == 1 and (zeros + ones) == k
        assert basis, "Got " + str(ones) + " ones and " + str(zeros) + " zeros, but expected all zeros except a single 1"
        return basis
    np.apply_along_axis(is_basis, 1, fit_x_np)

    print("Check final objective function value")
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train.__sub__(fit_xy)))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
def test_levelone_frame_has_expected_dimensions():
    ds = prepare_data(blending)
    models = train_base_models(ds)
    se = train_stacked_ensemble(ds, models, keep_levelone_frame=True)
    level_one_frame = h2o.get_frame(se.levelone_frame_id()["name"])

    se_training_frame = ds.blend if blending else ds.train

    num_col_level_one_frame = (se_training_frame[ds.y].unique().nrow) * len(models) + 1  # count_classes(probabilities) * count_models + 1 (target)
    assert level_one_frame.ncols == num_col_level_one_frame, \
        "The number of columns in a level one frame should be numClasses * numBaseModels + 1."
    assert level_one_frame.nrows == se_training_frame.nrows, \
        "The number of rows in the level one frame should match train number of rows."
Example 23
    def predict(self, test_data):
        """
        Predict on a dataset.

        :param H2OFrame test_data: Data on which to make predictions.

        :returns: A new H2OFrame of predictions.
        """
        if not isinstance(test_data, h2o.H2OFrame): raise ValueError("test_data must be an instance of H2OFrame")
        j = H2OJob(h2o.api("POST /4/Predictions/models/%s/frames/%s" % (self.model_id, test_data.frame_id)),
                   self._model_json['algo'] + " prediction")
        j.poll()
        return h2o.get_frame(j.dest_key)
def glrm_simplex():
    m = 1000
    n = 100
    k = 10
    
    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k,n)
    def ind_list(k):
        tmp = [0] * k
        tmp[np.random.randint(0,k)] = 1
        return tmp
    X = [ind_list(k) for x in range(m)]
    X = np.array(X)
    train = np.dot(X,Y)
    train_h2o = h2o.H2OFrame(train.tolist())
    
    print("Run GLRM with quadratic mixtures (simplex) regularization on X")
    initial_y = np.random.rand(k,n)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=k, init="User", user_y=initial_y_h2o, loss="Quadratic", regularization_x="Simplex", regularization_y="None", gamma_x=1, gamma_y=0)
    glrm_h2o.train(x=train_h2o.names,training_frame=train_h2o)
    glrm_h2o.show()
    
    print("Check that X matrix consists of rows within standard probability simplex")
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    def is_simplex(a):
        row_sum = sum(a)
        simplex = abs(row_sum - 1) < 1e-6
        assert simplex, "Got sum over row = " + row_sum + ", but expected 1"
        return simplex
    np.apply_along_axis(is_simplex, 1, fit_x_np)
    
    print("Check final objective function value")
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train.__sub__(fit_xy)))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)
    
    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
Example 25
    def proj_archetypes(self, test_data, reverse_transform=False):
        """
        Convert archetypes of the model into original feature space.

        :param H2OFrame test_data: The dataset upon which the model was trained.
        :param bool reverse_transform: Whether the transformation of the training data during model-building
            should be reversed on the projected archetypes.

        :returns: model archetypes projected back into the original training data's feature space.
        """
        if test_data is None or test_data.nrow == 0: raise ValueError("Must specify test data")
        j = h2o.api("POST /3/Predictions/models/%s/frames/%s" % (self.model_id, test_data.frame_id),
                    data={"project_archetypes": True, "reverse_transform": reverse_transform})
        return h2o.get_frame(j["model_metrics"][0]["predictions"]["frame_id"]["name"])
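A hedged sketch of proj_archetypes on a GLRM model (the settings are illustrative and follow the USArrests examples elsewhere on this page):

import h2o
from h2o.estimators.glrm import H2OGeneralizedLowRankEstimator

h2o.init()
arrests = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/USArrests.csv")

glrm = H2OGeneralizedLowRankEstimator(k=3, transform="STANDARDIZE", max_iterations=100)
glrm.train(x=arrests.names, training_frame=arrests)

# archetypes (Y) mapped back to the original, de-standardized feature space
proj = glrm.proj_archetypes(arrests, reverse_transform=True)
print(proj.head())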
Example 26
    def anomaly(self, test_data, per_feature=False):
        """
        Obtain the reconstruction error for the input test_data.

        :param H2OFrame test_data: The dataset upon which the reconstruction error is computed.
        :param bool per_feature: Whether to return the square reconstruction error per feature.
            Otherwise, return the mean square error.

        :returns: the reconstruction error.
        """
        if test_data is None or test_data.nrow == 0: raise ValueError("Must specify test data")
        j = h2o.api("POST /3/Predictions/models/%s/frames/%s" % (self.model_id, test_data.frame_id),
                    data={"reconstruction_error": True, "reconstruction_error_per_feature": per_feature})
        return h2o.get_frame(j["model_metrics"][0]["predictions"]["frame_id"]["name"])
Example 27
    def reconstruct(self, test_data, reverse_transform=False):
        """
        Reconstruct the training data from the model and impute all missing values.

        :param H2OFrame test_data: The dataset upon which the model was trained.
        :param bool reverse_transform: Whether the transformation of the training data during model-building
            should be reversed on the reconstructed frame.

        :returns: the approximate reconstruction of the training data.
        """
        if test_data is None or test_data.nrow == 0: raise ValueError("Must specify test data")
        j = h2o.api("POST /3/Predictions/models/%s/frames/%s" % (self.model_id, test_data.frame_id),
                    data={"reconstruct_train": True, "reverse_transform": reverse_transform})
        return h2o.get_frame(j["model_metrics"][0]["predictions"]["frame_id"]["name"])
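And a matching hedged sketch for reconstruct (same illustrative GLRM setup; the result approximates the training frame by the low-rank product XY):

import h2o
from h2o.estimators.glrm import H2OGeneralizedLowRankEstimator

h2o.init()
arrests = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/USArrests.csv")

glrm = H2OGeneralizedLowRankEstimator(k=3, transform="STANDARDIZE", max_iterations=100)
glrm.train(x=arrests.names, training_frame=arrests)

# approximate reconstruction of the training data, undoing the standardization
recon = glrm.reconstruct(arrests, reverse_transform=True)
print(recon.head())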
Example 28
  def predict_leaf_node_assignment(self, test_data):
    """
    Predict on a dataset and return the leaf node assignment (only for tree-based models)

    Parameters
    ----------
    test_data: H2OFrame
      Data on which to make predictions.

    Returns
    -------
      A new H2OFrame of predictions.
    """
    if not isinstance(test_data, h2o.H2OFrame): raise ValueError("test_data must be an instance of H2OFrame")
    j = h2o.H2OConnection.post_json("Predictions/models/" + self.model_id + "/frames/" + test_data.frame_id, leaf_node_assignment=True)
    return h2o.get_frame(j["predictions_frame"]["name"])
Example 29
 def predict(self, test_data):
   """
   Predict on a dataset.
   
   Parameters
   ----------    
   test_data: H2OFrame
     Data on which to make predictions.
   
   Returns
   -------
     A new H2OFrame of predictions.
   """
   if not isinstance(test_data, h2o.H2OFrame): raise ValueError("test_data must be an instance of H2OFrame")
   j = H2OJob(h2o.H2OConnection.post_json("Predictions/models/" + self.model_id + "/frames/" + test_data.frame_id, _rest_version=4), self._model_json['algo'] + " prediction")
   j.poll()
   return h2o.get_frame(j.dest_key)
Example 30
 def predict(self, test_data):
   """
   Predict on a dataset.
   
   Parameters
   ----------    
   test_data: H2OFrame
     Data on which to make predictions.
   
   Returns
   -------
     A new H2OFrame of predictions.
   """
   if not isinstance(test_data, H2OFrame): raise ValueError("test_data must be an instance of H2OFrame")
   j = H2OConnection.post_json("Predictions/models/" + self.model_id + "/frames/" + test_data.frame_id)
   return h2o.get_frame(j["predictions_frame"]["name"])
Example 31
 def predict(self, test_data):
     """
 Predict on a dataset.
 
 Parameters
 ----------    
 test_data: H2OFrame
   Data on which to make predictions.
 
 Returns
 -------
   A new H2OFrame of predictions.
 """
     if not isinstance(test_data, h2o.H2OFrame):
         raise ValueError("test_data must be an instance of H2OFrame")
     j = h2o.H2OConnection.post_json("Predictions/models/" + self.model_id +
                                     "/frames/" + test_data.frame_id)
     return h2o.get_frame(j["predictions_frame"]["name"])
def glrm_set_loss_by_col():
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsPy = np.array(h2o.as_list(arrestsH2O))
    arrestsH2O.describe()

    print(
        "H2O GLRM with loss by column = Absolute, Quadratic, Quadratic, Huber")
    glrm_h2o = H2OGeneralizedLowRankEstimator(
        k=3,
        loss="Quadratic",
        loss_by_col=["Absolute", "Huber"],
        loss_by_col_idx=[0, 3],
        regularization_x="None",
        regularization_y="None")
    glrm_h2o.train(x=arrestsH2O.names, training_frame=arrestsH2O)
    glrm_h2o.show()

    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_x = h2o.get_frame(
        glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))

    print("Check final objective function value")
    fit_xy = np.dot(fit_x_np, fit_y_np)
    fit_diff = arrestsPy.__sub__(fit_xy)
    obj_val = np.absolute(fit_diff[:, 0]) + np.square(
        fit_diff[:, 1]) + np.square(fit_diff[:, 2])

    def huber(a):
        return a * a / 2 if abs(a) <= 1 else abs(a) - 0.5

    huber = np.vectorize(huber)
    obj_val = obj_val + huber(fit_diff[:, 3])
    obj_val = np.sum(obj_val)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    assert abs(glrm_obj - obj_val) < 1e-6, "Final objective was " + str(
        glrm_obj) + " but should equal " + str(obj_val)
Example 33
def glrm_nnmf():
  m = 1000
  n = 100
  k = 10

  print "Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n)
  Y = np.random.rand(k,n)
  X = np.random.rand(m, k)
  train = np.dot(X,Y)
  train_h2o = h2o.H2OFrame(zip(*train.tolist()))

  print "Run GLRM with non-negative regularization"
  initial_y = np.random.rand(n,k)
  initial_y_h2o = h2o.H2OFrame(initial_y.tolist())

  glrm_h2o = H2OGeneralizedLowRankEstimator(k=k, init="User", user_y=initial_y_h2o, loss="Quadratic", regularization_x="NonNegative", regularization_y="NonNegative", gamma_x=1, gamma_y=1)
  glrm_h2o.train(x=train_h2o.names, training_frame=train_h2o)
  glrm_h2o.show()

  print "Check that X and Y matrices are non-negative"
  fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
  fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
  fit_y_np = np.array(fit_y_np)
  fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
  fit_x_np = np.array(h2o.as_list(fit_x))
  assert np.all(fit_y_np >= 0), "Y must contain only non-negative elements"
  assert np.all(fit_x_np >= 0), "X must contain only non-negative elements"

  print "Check final objective function value"
  fit_xy = np.dot(fit_x_np, fit_y_np)
  glrm_obj = glrm_h2o._model_json['output']['objective']
  sse = np.sum(np.square(train.__sub__(fit_xy)))
  assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

  print "Impute XY and check error metrics"
  pred_h2o = glrm_h2o.predict(train_h2o)
  pred_np = np.array(h2o.as_list(pred_h2o))
  assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
  glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
  glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
  assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
  assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
Example 34
    def reconstruct(self, test_data, reverse_transform=False):
        """
        Reconstruct the training data from the model and impute all missing values.

        :param H2OFrame test_data: The dataset upon which the model was trained.
        :param bool reverse_transform: Whether the transformation of the training data during model-building
            should be reversed on the reconstructed frame.

        :returns: the approximate reconstruction of the training data.
        """
        if test_data is None or test_data.nrow == 0:
            raise ValueError("Must specify test data")
        j = h2o.api("POST /3/Predictions/models/%s/frames/%s" %
                    (self.model_id, test_data.frame_id),
                    data={
                        "reconstruct_train": True,
                        "reverse_transform": reverse_transform
                    })
        return h2o.get_frame(
            j["model_metrics"][0]["predictions"]["frame_id"]["name"])
Example 35
    def transform(self, words, aggregate_method):
        """
        Transform words (or sequences of words) to vectors using a word2vec model.

        :param str words: An H2OFrame made of a single column containing source words.
        :param str aggregate_method: Specifies how to aggregate sequences of words. If method is `NONE`
               then no aggregation is performed and each input word is mapped to a single word-vector.
               If method is 'AVERAGE' then input is treated as sequences of words delimited by NA.
               Each word of a sequence is internally mapped to a vector, and vectors belonging to
               the same sentence are averaged and returned in the result.

        :returns: an H2OFrame of word vectors (one vector per word, or per aggregated sequence).
        """
        j = h2o.api("GET /3/Word2VecTransform",
                    data={
                        'model': self.model_id,
                        'words_frame': words.frame_id,
                        'aggregate_method': aggregate_method
                    })
        return h2o.get_frame(j["vectors_frame"]["name"])
Example 36
    def proj_archetypes(self, test_data, reverse_transform=False):
        """
        Convert archetypes of the model into original feature space.

        :param H2OFrame test_data: The dataset upon which the model was trained.
        :param bool reverse_transform: Whether the transformation of the training data during model-building
            should be reversed on the projected archetypes.

        :returns: model archetypes projected back into the original training data's feature space.
        """
        if test_data is None or test_data.nrow == 0:
            raise ValueError("Must specify test data")
        j = h2o.api("POST /3/Predictions/models/%s/frames/%s" %
                    (self.model_id, test_data.frame_id),
                    data={
                        "project_archetypes": True,
                        "reverse_transform": reverse_transform
                    })
        return h2o.get_frame(
            j["model_metrics"][0]["predictions"]["frame_id"]["name"])
Example 37
    def anomaly(self, test_data, per_feature=False):
        """Obtain the reconstruction error for the input test_data.

        Parameters
        ----------
          test_data : H2OFrame
            The dataset upon which the reconstruction error is computed.

          per_feature : bool
            Whether to return the square reconstruction error per feature. Otherwise, return
            the mean square error.

        Returns
        -------
          Return the reconstruction error.
        """
        if test_data is None or test_data.nrow == 0: raise ValueError("Must specify test data")
        j = h2o.api("POST /3/Predictions/models/%s/frames/%s" % (self.model_id, test_data.frame_id),
                    data={"reconstruction_error": True, "reconstruction_error_per_feature": per_feature})
        return h2o.get_frame(j["model_metrics"][0]["predictions"]["frame_id"]["name"])
    def predict(self, test_data):
        """
        Predict on a dataset.

        Parameters
        ----------
        test_data: H2OFrame
          Data on which to make predictions.

        Returns
        -------
          A new H2OFrame of predictions.
        """
        if not isinstance(test_data, h2o.H2OFrame):
            raise ValueError("test_data must be an instance of H2OFrame")
        j = H2OJob(
            h2o.api("POST /4/Predictions/models/%s/frames/%s" %
                    (self.model_id, test_data.frame_id)),
            self._model_json['algo'] + " prediction")
        j.poll()
        return h2o.get_frame(j.dest_key)
Example 39
def execute(h2o, params, config):
    frame_id = config.get('frame_id')

    df = h2o.get_frame(frame_id)

    columns = params.get('columns')
    if columns is not None and len(columns) > 2:
        columns = json.loads(columns)
        df = df[columns]

    use_value = params.get('use')
    if use_value is not None and len(use_value) == 0:
        use_value = None
    df_cor = df.cor(na_rm=to_bool(params.get('na_rm')),
                    use=use_value,
                    method=params.get('method'))

    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(df_cor, dest_frame_id)

    return {'frame_id': dest_frame_id}
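A hedged sketch of invoking this hook; to_bool and append_frame_id come from the hosting plugin and are not shown in the source, so stand-ins are defined here (the execute function also relies on a module-level import json for json.loads):

import h2o
import json  # used by execute via json.loads

def to_bool(v):  # stand-in for the plugin helper
    return str(v).lower() in ("true", "1", "yes")

def append_frame_id(frame_id, suffix):  # stand-in for the plugin helper
    return "{0}_{1}".format(frame_id, suffix or "out")

h2o.init()
iris = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_wheader.csv")

params = {'columns': '["sepal_len", "petal_len"]', 'use': '',
          'na_rm': 'true', 'method': 'Pearson', 'suffix': 'cor'}
config = {'frame_id': iris.frame_id}
print(execute(h2o, params, config))  # e.g. {'frame_id': '<frame>_cor'}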
def mojo_model_test():

    # GBM
    airlines = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    gbm = H2OGradientBoostingEstimator(ntrees=1)
    gbm.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    original_model_filename = tempfile.mkdtemp()
    original_model_filename = gbm.download_mojo(original_model_filename)

    key = h2o.lazy_import(original_model_filename)
    fr = h2o.get_frame(key[0])
    model = H2OGenericEstimator(model_key=fr)
    model.train()
    predictions = model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421
    assert model._model_json["output"]["variable_importances"] is not None
    assert len(
        model._model_json["output"]["variable_importances"]._cell_values) > 0
    assert model._model_json["output"]["model_summary"] is not None
    assert len(model._model_json["output"]["model_summary"]._cell_values) > 0

    # Test constructor generating the model from existing MOJO file
    model = H2OGenericEstimator.from_file(original_model_filename)
    assert model is not None
    predictions = model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421
    assert model._model_json["output"]["variable_importances"] is not None
    assert len(
        model._model_json["output"]["variable_importances"]._cell_values) > 0
    assert model._model_json["output"]["model_summary"] is not None
    assert len(model._model_json["output"]["model_summary"]._cell_values) > 0

    generic_mojo_filename = tempfile.mkdtemp("zip", "genericMojo")
    generic_mojo_filename = model.download_mojo(path=generic_mojo_filename)
    assert os.path.getsize(generic_mojo_filename) == os.path.getsize(
        original_model_filename)
Example 41
    def anomaly(self, test_data, per_feature=False):
        """
        Obtain the reconstruction error for the input test_data.

        :param H2OFrame test_data: The dataset upon which the reconstruction error is computed.
        :param bool per_feature: Whether to return the square reconstruction error per feature.
            Otherwise, return the mean square error.

        :returns: the reconstruction error.

        :examples:

        >>> from h2o.estimators.deeplearning import H2OAutoEncoderEstimator
        >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/mnist/train.csv.gz")
        >>> test = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/mnist/test.csv.gz")
        >>> predictors = list(range(0,784))
        >>> resp = 784
        >>> train = train[predictors]
        >>> test = test[predictors]
        >>> ae_model = H2OAutoEncoderEstimator(activation="Tanh",
        ...                                    hidden=[2],
        ...                                    l1=1e-5,
        ...                                    ignore_const_cols=False,
        ...                                    epochs=1)
        >>> ae_model.train(x=predictors,training_frame=train)
        >>> test_rec_error = ae_model.anomaly(test)
        >>> test_rec_error
        >>> test_rec_error_features = ae_model.anomaly(test, per_feature=True)
        >>> test_rec_error_features
        """
        if test_data is None or test_data.nrow == 0:
            raise ValueError("Must specify test data")
        j = h2o.api("POST /3/Predictions/models/%s/frames/%s" %
                    (self.model_id, test_data.frame_id),
                    data={
                        "reconstruction_error": True,
                        "reconstruction_error_per_feature": per_feature
                    })
        return h2o.get_frame(
            j["model_metrics"][0]["predictions"]["frame_id"]["name"])
Example 42
def retain_keys_test():
    airlines = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    gbm = H2OGradientBoostingEstimator(ntrees=1)
    gbm.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    h2o.remove_all([airlines.frame_id, gbm.model_id])

    assert h2o.get_frame(airlines.frame_id) is not None
    assert h2o.get_model(gbm.model_id) is not None

    ## Test key not being retained when unspecified
    gbm = H2OGradientBoostingEstimator(ntrees=1)
    gbm.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    h2o.remove_all([airlines.frame_id])
    h2o.ls()
    try:
        h2o.get_model(gbm.model_id)
        assert False
    except h2o.exceptions.H2OResponseError as e:
        assert e.args[0].dev_msg.find("not found for argument: key") != -1
Example 43
    def transform(self, frame, data_leakage_handling="None", noise=-1, seed=-1):
        """

        Apply transformation to `te_columns` based on the encoding maps generated during the `train()` method call.

        :param H2OFrame frame: to which frame we are applying target encoding transformations.
        :param str data_leakage_handling: Supported options:

        1) "k_fold" - encodings for a fold are generated based on out-of-fold data.
        2) "leave_one_out" - leave one out. Current row's response value is subtracted from the pre-calculated per-level frequencies.
        3) "none" - we do not holdout anything. Using whole frame for training

        :param float noise: the amount of random noise added to the target encoding.  This helps prevent overfitting. Defaults to 0.01 * range of y.
        :param int seed: a random seed used to generate draws from the uniform distribution for random noise. Defaults to -1.

        :example:
        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> predictors = ["home.dest", "cabin", "embarked"]
        >>> response = "survived"
        >>> titanic[response] = titanic[response].asfactor()
        >>> fold_col = "kfold_column"
        >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
        >>> titanic_te = H2OTargetEncoderEstimator(k=35,
        ...                                        f=25,
        ...                                        data_leakage_handling="leave_one_out",
        ...                                        blending=True)
        >>> titanic_te.train(x=predictors,
        ...                  y=response,
        ...                  training_frame=titanic)
        >>> transformed = titanic_te.transform(frame=titanic,
        ...                                    data_leakage_handling="leave_one_out",
        ...                                    seed=1234)
        """
        output = h2o.api("GET /3/TargetEncoderTransform", data={'model': self.model_id, 'frame': frame.key,
                                                                'data_leakage_handling': data_leakage_handling,
                                                                'noise': noise,
                                                                'seed': seed})
        return h2o.get_frame(output["name"])
Example 44
def execute(h2o, params, config):
    frame_id = config.get('frame_id')

    df = h2o.get_frame(frame_id)

    input_columns = params.get("input_columns")
    if input_columns is None or len(input_columns) == 0:
        input_columns = df.col_names
    else:
        import json
        input_columns = json.loads(input_columns)

    from h2o.estimators import H2OKMeansEstimator
    kmeans_model = H2OKMeansEstimator(
        categorical_encoding=params.get("categorical_encoding"),
        estimate_k=to_bool(params.get("estimate_k")),
        fold_assignment=params.get("fold_assignment"),
        ignore_const_cols=to_bool(params.get("ignore_const_cols")),
        init=params.get("init"),
        k=int(params.get("k")),
        keep_cross_validation_fold_assignment=to_bool(
            params.get("keep_cross_validation_fold_assignment")),
        keep_cross_validation_models=to_bool(
            params.get("keep_cross_validation_models")),
        keep_cross_validation_predictions=to_bool(
            params.get("keep_cross_validation_predictions")),
        max_iterations=int(params.get("max_iterations")),
        max_runtime_secs=float(params.get("max_runtime_secs")),
        nfolds=int(params.get("nfolds")),
        score_each_iteration=to_bool(params.get("score_each_iteration")),
        seed=int(params.get("seed")),
        standardize=to_bool(params.get("standardize")))
    kmeans_model.train(x=input_columns, training_frame=df)
    kmeans_model.show()

    save_model(params, kmeans_model.model_id)

    return {'frame_id': frame_id, 'model_id': kmeans_model.model_id}
Example 45
def save_artifacts(automl, dataset, config):
    artifacts = config.framework_params.get('_save_artifacts', ['leaderboard'])
    try:
        lb = automl.leaderboard.as_data_frame()
        log.debug("Leaderboard:\n%s", lb.to_string())
        if 'leaderboard' in artifacts:
            models_dir = make_subdir("models", config)
            write_csv(lb, os.path.join(models_dir, "leaderboard.csv"))
        if 'models' in artifacts:
            models_dir = make_subdir("models", config)
            all_models_se = next(
                (mid for mid in lb['model_id']
                 if mid.startswith("StackedEnsemble_AllModels")), None)
            mformat = 'mojo' if 'mojos' in artifacts else 'json'
            if all_models_se:
                save_model(all_models_se, dest_dir=models_dir, mformat=mformat)
            else:
                for mid in lb['model_id']:
                    save_model(mid, dest_dir=models_dir, mformat=mformat)

        if 'models_predictions' in artifacts:
            predictions_dir = make_subdir("predictions", config)
            test = h2o.get_frame(frame_name('test', config))
            for mid in lb['model_id']:
                model = h2o.get_model(mid)
                save_predictions(model,
                                 test,
                                 dataset=dataset,
                                 config=config,
                                 predictions_file=os.path.join(
                                     predictions_dir, mid, 'predictions.csv'))

        if 'logs' in artifacts:
            logs_dir = make_subdir("logs", config)
            h2o.download_all_logs(dirname=logs_dir)
    except Exception:
        log.debug("Error when saving artifacts.", exc_info=True)
Example 46
def execute(h2o, params, config):
    frame_id = config.get('frame_id')

    df = h2o.get_frame(frame_id)
    column_header = params.get('column_header')
    if len(column_header) > 0:
        df = df[int(column_header):]

    from h2o.estimators.glrm import H2OGeneralizedLowRankEstimator
    glrm_model = H2OGeneralizedLowRankEstimator(
        expand_user_y=to_bool(params.get('expand_user_y')),
        gamma_x=float(params.get('gamma_x')),
        gamma_y=float(params.get('gamma_y')),
        ignore_const_cols=to_bool(params.get('ignore_const_cols')),
        impute_original=to_bool(params.get('impute_original')),
        init=str(params.get('init')),
        init_step_size=float(params.get('init_step_size')),
        k=int(params.get('k')),
        loss=str(params.get('loss')),
        max_iterations=int(params.get('max_iterations')),
        max_runtime_secs=float(params.get('max_runtime_secs')),
        max_updates=int(params.get('max_updates')),
        min_step_size=float(params.get('min_step_size')),
        multi_loss=str(params.get('multi_loss')),
        period=int(params.get('period')),
        recover_svd=to_bool(params.get('recover_svd')),
        regularization_x=str(params.get('regularization_x')),
        regularization_y=str(params.get('regularization_y')),
        score_each_iteration=to_bool(params.get('score_each_iteration')),
        seed=int(params.get('seed')),
        svd_method=str(params.get('svd_method')))
    glrm_model.train(training_frame=df)
    glrm_model.show()
    save_model(params, glrm_model.model_id)

    return {'frame_id': frame_id, 'model_id': glrm_model.model_id}
Example 47
    def transform(self, words, aggregate_method):
        """
        Transform words (or sequences of words) to vectors using a word2vec model.

        :param str words: An H2OFrame made of a single column containing source words.
        :param str aggregate_method: Specifies how to aggregate sequences of words. If method is `NONE`
               then no aggregation is performed and each input word is mapped to a single word-vector.
               If method is 'AVERAGE' then input is treated as sequences of words delimited by NA.
               Each word of a sequence is internally mapped to a vector, and vectors belonging to
               the same sentence are averaged and returned in the result.

        :returns: an H2OFrame of word vectors (one vector per word, or per aggregated sequence).

        :examples:

        >>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"), 
        ...                               col_names = ["category", "jobtitle"], 
        ...                               col_types = ["string", "string"], 
        ...                               header = 1)
        >>> STOP_WORDS = ["ax","i","you","edu","s","t","m","subject","can","lines","re","what",
        ...               "there","all","we","one","the","a","an","of","or","in","for","by","on",
        ...               "but","is","in","a","not","with","as","was","if","they","are","this","and","it","have",
        ...               "from","at","my","be","by","not","that","to","from","com","org","like","likes","so"]
        >>> words = job_titles.tokenize(" ")
        >>> words = words[(words.isna()) | (~ words.isin(STOP_WORDS)),:] 
        >>> w2v_model = H2OWord2vecEstimator(epochs = 10)
        >>> w2v_model.train(training_frame=words)
        >>> job_title_vecs = w2v_model.transform(words, aggregate_method = "AVERAGE")
        """
        j = h2o.api("GET /3/Word2VecTransform",
                    data={
                        'model': self.model_id,
                        'words_frame': words.frame_id,
                        'aggregate_method': aggregate_method
                    })
        return h2o.get_frame(j["vectors_frame"]["name"])
Example 48
    def anomaly(self, test_data, per_feature=False):
        """Obtain the reconstruction error for the input test_data.

    Parameters
    ----------
      test_data : H2OFrame
        The dataset upon which the reconstruction error is computed.
      per_feature : bool
        Whether to return the square reconstruction error per feature. Otherwise, return
        the mean square error.

    Returns
    -------
      Return the reconstruction error.
    """
        if test_data is None or test_data.nrow == 0:
            raise ValueError("Must specify test data")
        j = H2OConnection.post_json(
            "Predictions/models/" + self.model_id + "/frames/" +
            test_data.frame_id,
            reconstruction_error=True,
            reconstruction_error_per_feature=per_feature)
        return h2o.get_frame(
            j["model_metrics"][0]["predictions"]["frame_id"]["name"])
Example 49
    def transform(self, frame, data_leakage_handling="None", noise=-1, seed=-1):
        """

        Apply transformation to `te_columns` based on the encoding maps generated during the `train()` method call.

        :param H2OFrame frame: to which frame we are applying target encoding transformations.
        :param str data_leakage_handling: Supported options:

        1) "KFold" - encodings for a fold are generated based on out-of-fold data.
        2) "LeaveOneOut" - leave one out. Current row's response value is subtracted from the pre-calculated per-level frequencies.
        3) "None" - we do not holdout anything. Using whole frame for training

        :param float noise: the amount of random noise added to the target encoding.  This helps prevent overfitting. Defaults to 0.01 * range of y.
        :param int seed: a random seed used to generate draws from the uniform distribution for random noise. Defaults to -1.

        :example:
        >>> targetEncoder = TargetEncoder(encoded_columns=te_columns, target_column=responseColumnName, blended_avg=True, inflection_point=10, smoothing=20)
        >>> encodedTrain = targetEncoder.transform(frame=trainFrame, data_leakage_handling="None", seed=1234, is_train_or_valid=True)
        """
        output = h2o.api("GET /3/TargetEncoderTransform", data={'model': self.model_id, 'frame': frame.key,
                                                                'data_leakage_handling': data_leakage_handling,
                                                                'noise': noise,
                                                                'seed': seed})
        return h2o.get_frame(output["name"])
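# A hedged sketch of the "KFold" mode described above (an assumption:
# `te_columns`, `responseColumnName`, and the "kfold" column are illustrative
# names; the encoder must have been fit with a matching fold column):
#
#     targetEncoder = TargetEncoder(encoded_columns=te_columns,
#                                   target_column=responseColumnName,
#                                   fold_column="kfold")
#     targetEncoder.fit(trainFrame)
#     encodedTrain = targetEncoder.transform(frame=trainFrame,
#                                            data_leakage_handling="KFold",
#                                            seed=1234)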
Example 50
def save_artifacts(automl, dataset, config):
    artifacts = config.framework_params.get('_save_artifacts', ['leaderboard'])
    try:
        lb = automl.leaderboard.as_data_frame()
        log.debug("Leaderboard:\n%s", lb.to_string())
        if 'leaderboard' in artifacts:
            models_dir = output_subdir("models", config)
            write_csv(lb, os.path.join(models_dir, "leaderboard.csv"))
        if 'models' in artifacts:
            models_dir = output_subdir("models", config)
            all_models_se = next(
                (mid for mid in lb['model_id']
                 if mid.startswith("StackedEnsemble_AllModels")), None)
            mformat = 'mojo' if 'mojos' in artifacts else 'json'
            if all_models_se and mformat == 'mojo':
                save_model(all_models_se, dest_dir=models_dir, mformat=mformat)
            else:
                for mid in lb['model_id']:
                    save_model(mid, dest_dir=models_dir, mformat=mformat)
                models_archive = os.path.join(models_dir, "models.zip")
                utils.zip_path(models_dir, models_archive)

                def delete(path, isdir):
                    if path != models_archive and os.path.splitext(
                            path)[1] in ['.json', '.zip']:
                        os.remove(path)

                utils.walk_apply(models_dir, delete, max_depth=0)

        if 'models_predictions' in artifacts:
            predictions_dir = output_subdir("predictions", config)
            test = h2o.get_frame(frame_name('test', config))
            for mid in lb['model_id']:
                model = h2o.get_model(mid)
                h2o_preds = model.predict(test)
                preds = extract_preds(h2o_preds, test, dataset=dataset)
                if preds.probabilities_labels is None:
                    preds.probabilities_labels = preds.h2o_labels
                write_preds(
                    preds, os.path.join(predictions_dir, mid,
                                        'predictions.csv'))
            utils.zip_path(
                predictions_dir,
                os.path.join(predictions_dir, "models_predictions.zip"))

            def delete(path, isdir):
                if isdir:
                    shutil.rmtree(path, ignore_errors=True)

            utils.walk_apply(predictions_dir, delete, max_depth=0)

        if 'logs' in artifacts:
            logs_dir = output_subdir("logs", config)
            logs_zip = os.path.join(logs_dir, "h2o_logs.zip")
            utils.zip_path(logs_dir, logs_zip)

            # h2o.download_all_logs(dirname=logs_dir)

            def delete(path, isdir):
                if isdir:
                    shutil.rmtree(path, ignore_errors=True)
                elif path != logs_zip:
                    os.remove(path)

            utils.walk_apply(logs_dir, delete, max_depth=0)
    except Exception:
        log.debug("Error when saving artifacts.", exc_info=True)
Example 51
def execute(h2o, params, config):
    frame_id = config.get('frame_id')

    df = h2o.get_frame(frame_id)

    train = int(params.get('train_ratio'))

    test = params.get('test_ratio')
    if test is None or len(test) == 0:
        test = 0
    else:
        test = int(test)

    valid = params.get('valid_ratio')
    if valid is None or len(valid) == 0:
        valid = 0
    else:
        valid = int(valid)

    seed = params.get('seed')
    if seed is None or len(seed) == 0:
        seed = None
    else:
        seed = int(seed)

    train_ratio = train / (train + test + valid)
    test_ratio = test / (train + test + valid)
    valid_ratio = valid / (train + test + valid)

    if valid == 0 and test == 0:
        return {'frame_id': frame_id}
    elif valid == 0:
        df_train, df_test = df.split_frame(ratios=[train_ratio], seed=seed)
        df_valid = None
    elif test == 0:
        df_train, df_valid = df.split_frame(ratios=[train_ratio], seed=seed)
        df_test = None
    else:
        df_train, df_test, df_valid = df.split_frame(
            ratios=[train_ratio, test_ratio], seed=seed)

    train_frame_id = append_frame_id(frame_id, params.get('train_suffix'))
    h2o.assign(df_train, train_frame_id)

    if df_test is None:
        test_frame_id = None
    else:
        test_frame_id = append_frame_id(frame_id, params.get('test_suffix'))
        h2o.assign(df_test, test_frame_id)

    if df_valid is None:
        valid_frame_id = None
    else:
        valid_frame_id = append_frame_id(frame_id, params.get('valid_suffix'))
        h2o.assign(df_valid, valid_frame_id)

    return {
        'frame_id': train_frame_id,
        'train_frame_id': train_frame_id,
        'test_frame_id': test_frame_id,
        'valid_frame_id': valid_frame_id,
    }
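# Worked example of the ratio arithmetic above (illustrative values): with
# train=70, test=15, valid=15 the counts normalize to fractions of 1.0, and
# split_frame() only needs the first two ratios; the remainder is implied.
#
#     train, test, valid = 70, 15, 15
#     total = train + test + valid
#     ratios = [train / total, test / total]   # [0.70, 0.15]; valid gets the rest
#     df_train, df_test, df_valid = df.split_frame(ratios=ratios, seed=42)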
Example 52
def glrm_orthonnmf():
    m = 1000
    n = 100
    k = 10

    print "Uploading random uniform matrix with rows = " + str(
        m) + " and cols = " + str(n)
    Y = np.random.rand(k, n)
    X = np.random.rand(m, k)
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(zip(*train.tolist()))

    print "Run GLRM with orthogonal non-negative regularization on X, non-negative regularization on Y"
    initial_y = np.random.rand(k, n)
    initial_y_h2o = h2o.H2OFrame(zip(*initial_y.tolist()))
    glrm_h2o = h2o.glrm(x=train_h2o,
                        k=k,
                        init="User",
                        user_y=initial_y_h2o,
                        loss="Quadratic",
                        regularization_x="OneSparse",
                        regularization_y="NonNegative",
                        gamma_x=1,
                        gamma_y=1)
    glrm_h2o.show()

    print "Check that X and Y matrices are non-negative"
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_x = h2o.get_frame(
        glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    assert np.all(fit_y_np >= 0), "Y must contain only non-negative elements"
    assert np.all(fit_x_np >= 0), "X must contain only non-negative elements"

    print "Check that columns of X are orthogonal"
    xtx = np.dot(np.transpose(fit_x_np), fit_x_np)
    offdiag = np.extract(1 - np.eye(k), xtx)
    assert np.all(
        offdiag == 0), "All off diagonal elements of X'X must equal zero"

    print "Check final objective function value"
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train.__sub__(fit_xy)))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(
        glrm_obj) + " but should equal " + str(sse)

    print "Impute XY and check error metrics"
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(
        pred_np, fit_xy
    ), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output'][
        'training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output'][
        'training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(
        glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(
        glrm_caterr) + " but should be zero"

    print "Run GLRM with orthogonal non-negative regularization on both X and Y"
    initial_y = np.random.rand(n, k)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())
    glrm_h2o = h2o.glrm(x=train_h2o,
                        k=k,
                        init="User",
                        user_y=initial_y_h2o,
                        loss="Quadratic",
                        regularization_x="OneSparse",
                        regularization_y="OneSparse",
                        gamma_x=1,
                        gamma_y=1)
    glrm_h2o.show()

    print "Check that X and Y matrices are non-negative"
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_x = h2o.get_frame(
        glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    assert np.all(fit_y_np >= 0), "Y must contain only non-negative elements"
    assert np.all(fit_x_np >= 0), "X must contain only non-negative elements"

    print "Check that columns of X are orthogonal"
    xtx = np.dot(np.transpose(fit_x_np), fit_x_np)
    offdiag_x = np.extract(1 - np.eye(k), xtx)
    assert np.all(
        offdiag_x == 0), "All off diagonal elements of X'X must equal zero"

    print "Check that rows of Y are orthogonal"
    yyt = np.dot(fit_y_np, np.transpose(fit_y_np))
    offdiag_y = np.extract(1 - np.eye(k), yyt)
    assert np.all(
        offdiag_y == 0), "All off diagonal elements of YY' must equal zero"

    print "Check final objective function value"
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train.__sub__(fit_xy)))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(
        glrm_obj) + " but should equal " + str(sse)

    print "Impute XY and check error metrics"
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(
        pred_np, fit_xy
    ), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output'][
        'training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output'][
        'training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(
        glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(
        glrm_caterr) + " but should be zero"
Example 53
print(acs_model)

# In[ ]:

# Plot objective function value each iteration
acs_model_score = acs_model.score_history()
plt.xlabel("Iteration")
plt.ylabel("Objective")
plt.title("Objective Function Value per Iteration")
plt.plot(acs_model_score["iteration"], acs_model_score["objective"])
plt.show()

# In[ ]:

# Embedding of ZCTAs into archetypes (X)
zcta_arch_x = h2o.get_frame(
    acs_model._model_json["output"]["representation_name"])
zcta_arch_x.head()

# In[ ]:

# Plot a few ZCTAs on the first two archetypes
idx = ((acs_zcta_col == "10065") |  # Manhattan, NY (Upper East Side)
       (acs_zcta_col == "11219") |  # Manhattan, NY (East Harlem)
       (acs_zcta_col == "66753") |  # McCune, KS
       (acs_zcta_col == "84104") |  # Salt Lake City, UT
       (acs_zcta_col == "94086") |  # Sunnyvale, CA
       (acs_zcta_col == "95014"))  # Cupertino, CA

city_arch = np.array(h2o.as_list(zcta_arch_x[idx, [0, 1]]))
plt.xlabel("First Archetype")
plt.ylabel("Second Archetype")
Example 54
    def encoding_map_frames(self):
        return list(
            map(lambda x: get_frame(x['key']['name']),
                self._encodingMap.frames))
Example 55
def glrm_simplex():
    m = 1000
    n = 100
    k = 10

    print "Uploading random uniform matrix with rows = " + str(
        m) + " and cols = " + str(n)
    Y = np.random.rand(k, n)

    def ind_list(k):
        tmp = [0] * k
        tmp[np.random.randint(0, k)] = 1
        return tmp

    X = [ind_list(k) for _ in range(m)]
    X = np.array(X)
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(train.tolist())

    print "Run GLRM with quadratic mixtures (simplex) regularization on X"
    initial_y = np.random.rand(k, n)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=k,
                                              init="User",
                                              user_y=initial_y_h2o,
                                              loss="Quadratic",
                                              regularization_x="Simplex",
                                              regularization_y="None",
                                              gamma_x=1,
                                              gamma_y=0)
    glrm_h2o.train(x=train_h2o.names, training_frame=train_h2o)
    #    glrm_h2o = h2o.glrm(x=train_h2o, k=k, init="User", user_y=initial_y_h2o, loss="Quadratic", regularization_x="Simplex", regularization_y="None", gamma_x=1, gamma_y=0)
    glrm_h2o.show()

    print "Check that X matrix consists of rows within standard probability simplex"
    fit_x = h2o.get_frame(
        glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))

    def is_simplex(a):
        row_sum = sum(a)
        simplex = abs(row_sum - 1) < 1e-6
        assert simplex, "Got sum over row = " + str(row_sum) + ", but expected 1"
        return simplex

    np.apply_along_axis(is_simplex, 1, fit_x_np)

    print "Check final objective function value"
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train.__sub__(fit_xy)))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(
        glrm_obj) + " but should equal " + str(sse)

    print "Impute XY and check error metrics"
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(
        pred_np, fit_xy
    ), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output'][
        'training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output'][
        'training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(
        glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(
        glrm_caterr) + " but should be zero"
Example 56
# OPTIONAL: plot the objective function score at each iteration
model_score = model.score_history()
plt.xlabel("Iteration")
plt.ylabel("Objective")
plt.title("Objective Function Value per Iteration")
print(model_score)
plt.plot(model_score["iterations"], model_score["objective"])
plt.savefig('modelScore.jpg')  # save before show(); show() can leave a blank figure
plt.show()

# STEP 5: Recover the X and Y factors and save them to CSV files
# Recovering X and Y is not entirely direct, but the following works.
# The outputs are X, a numRows x rank array, and Y, a rank x numCols array.

Y = model.proj_archetypes(Data)
x_key = model._model_json["output"]["representation_name"]
X = h2o.get_frame(x_key)
     
Y = h2o.as_list(Y)
X = h2o.as_list(X)
     
Y.to_csv('outputY.csv', index = False)
X.to_csv('outputX.csv', index = False)
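# Alternative (an assumption, not in the original): skip the pandas round-trip
# and let the cluster write the frame directly:
#     h2o.export_file(h2o.get_frame(x_key), path="outputX.csv", force=True)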
  
# Shut down the cluster after use    
h2o.shutdown(prompt=False)


# In[ ]:
Example 57
    def transform(self,
                  frame,
                  blending=None,
                  inflection_point=None,
                  smoothing=None,
                  noise=None,
                  as_training=False,
                  **kwargs):
        """
        Apply transformation to `te_columns` based on the encoding maps generated during `train()` method call.

        :param H2OFrame frame: the frame on which to apply the target encoding transformations.
        :param boolean blending: If provided, this overrides the `blending` parameter on the model.
        :param float inflection_point: If provided, this overrides the `inflection_point` parameter on the model.
        :param float smoothing: If provided, this overrides the `smoothing` parameter on the model.
        :param float noise: If provided, this overrides the amount of random noise added to the target encoding defined on the model, this helps prevent overfitting.
        :param boolean as_training: Must be set to True when encoding the training frame. Defaults to False.

        :example:
        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> predictors = ["home.dest", "cabin", "embarked"]
        >>> response = "survived"
        >>> titanic[response] = titanic[response].asfactor()
        >>> fold_col = "kfold_column"
        >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
        >>> titanic_te = H2OTargetEncoderEstimator(data_leakage_handling="leave_one_out",
        ...                                        inflection_point=35,
        ...                                        smoothing=25,
        ...                                        blending=True,
        ...                                        seed=1234)
        >>> titanic_te.train(x=predictors,
        ...                  y=response,
        ...                  training_frame=titanic)
        >>> transformed = titanic_te.transform(frame=titanic)
        """
        for k in kwargs:
            if k in ['seed', 'data_leakage_handling']:
                warnings.warn(
                    "`%s` is deprecated in `transform` method and will be ignored. "
                    "Instead, please ensure that it was set before training on the H2OTargetEncoderEstimator model."
                    % k, H2ODeprecationWarning)
            else:
                raise TypeError(
                    "transform() got an unexpected keyword argument '%s'" % k)

        if 'data_leakage_handling' in kwargs:
            dlh = kwargs['data_leakage_handling']
            assert_is_type(dlh, None, Enum("leave_one_out", "k_fold", "none"))
            if dlh is not None and dlh.lower() != "none":
                warnings.warn(
                    "Deprecated `data_leakage_handling=%s` is replaced by `as_training=True`. "
                    "Please update your code." % dlh, H2ODeprecationWarning)
                as_training = True

        params = dict(
            model=self.model_id,
            frame=frame.key,
            # always need to provide blending here as we can't represent an unset value
            blending=blending if blending is not None else self.blending,
            inflection_point=inflection_point,
            smoothing=smoothing,
            noise=noise,
            as_training=as_training,
        )

        output = h2o.api("GET /3/TargetEncoderTransform", data=params)
        return h2o.get_frame(output["name"])
    def execute(self, name, x, y, training_frame, validation_frame, test_frame,
                subset_coef):
        params = grid.ParameterGrid(self.params_grid)
        if self.params_grid is None or len(self.params_grid) == 0:
            params = ["default"]
        results = []
        dt = datetime.datetime

        # R stuff
        ri.initr()
        h2or = importr("h2o")
        h2o_ensemble = importr("h2oEnsemble")
        base = importr("base")
        stats = importr("stats")
        cvauc = importr("cvAUC")

        h2or.h2o_init(ip=config.hostname, port=config.port, startH2O=False)

        # Add some base learners
        with open("{}/R/wrappers.r".format(os.path.dirname(__file__)),
                  "r") as f:
            ro.r("\n".join(f.readlines()))

        keep_frames = re.compile("|".join([
            training_frame.frame_id, validation_frame.frame_id,
            test_frame.frame_id
        ]) + "|.*\\.hex|py_.*")

        for p in params:
            row = [
                config.cluster, config.nthreads, name, subset_coef, self.name,
                str(p)
            ]

            # Initialize the model
            init_time = dt.now()
            # get frame names
            # load it in R
            train = h2or.h2o_getFrame(training_frame.frame_id)
            valid = h2or.h2o_getFrame(validation_frame.frame_id)
            test = h2or.h2o_getFrame(test_frame.frame_id)
            init_time = dt.now() - init_time

            # Train the model
            train_time = dt.now()
            if p == "default":
                model = h2o_ensemble.h2o_ensemble(x=toR(x),
                                                  y=y,
                                                  training_frame=train,
                                                  validation_frame=valid)
            else:
                p = {k: toR(v) for k, v in p.items()}
                model = h2o_ensemble.h2o_ensemble(x=toR(x),
                                                  y=y,
                                                  training_frame=train,
                                                  validation_frame=valid,
                                                  **p)
            train_time = dt.now() - train_time

            # Model metrics
            metrics_time = dt.now()
            RpredTrain = stats.predict(model, train)
            RpredValid = stats.predict(model, valid)
            RpredTest = stats.predict(model, test)
            predTrain = h2o.get_frame(
                h2or.h2o_getId(RpredTrain.rx2("pred"))[0])
            predValid = h2o.get_frame(
                h2or.h2o_getId(RpredValid.rx2("pred"))[0])
            predTest = h2o.get_frame(h2or.h2o_getId(RpredTest.rx2("pred"))[0])
            metrics_time = dt.now() - metrics_time

            row.append(init_time.total_seconds())
            row.append(train_time.total_seconds())
            row.append(metrics_time.total_seconds())
            row.append((init_time + train_time + metrics_time).total_seconds())

            datasets = [(RpredTrain, predTrain, train, training_frame),
                        (RpredValid, predValid, valid, validation_frame),
                        (RpredTest, predTest, test, test_frame)]

            append = row.append
            for pred_r_ptr, pred_py_ptr, data_r_ptr, data_py_ptr in datasets:
                acc = None
                err = None
                mse = ((pred_py_ptr - data_py_ptr[y])**2).mean()[0]
                if training_frame[y].isfactor()[0]:
                    acc = (pred_py_ptr == data_py_ptr[y]).mean()[0]
                    err = 1.0 - acc

                auc = cvauc.AUC(
                    base.attr(pred_r_ptr.rx2("pred"), "data")[2],
                    base.attr(data_r_ptr, "data").rx2(y))[0]

                # TODO: Add more metrics
                append(acc)
                append(err)
                append(None)  # F1()
                append(None)  # fnr()
                append(None)  # fpr()
                append(None)  # tnr()
                append(None)  # tpr()
                append(None)  # precision()
                append(None)  # recall()
                append(None)  # sensitivity()
                append(None)  # specificity()
                append(None)  # aic()
                append((auc))  # auc()
                append(None)  # logloss()
                append(None)  # mean_residual_deviance()
                append(mse)  # mse()
                append(None)  # null_degrees_of_freedom()
                append(None)  # null_deviance()
                append(None)  # r2()
                append(None)  # residual_degrees_of_freedom()
                append(None)  # residual_deviance()

                h2o.remove(pred_py_ptr)

            # replace NaNs (numeric or the string "NaN") with None
            row = [None if ((isinstance(x, numbers.Number) and np.isnan(x))
                            or x in (u"NaN", "NaN")) else x
                   for x in row]
            persist(row)
            results.append(row)
            for [frame] in h2o.ls().as_matrix():
                if not keep_frames.match(frame):
                    h2o.remove(frame)

        df = pd.DataFrame(results, columns=config.Names)
        return df
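# Hedged sketch (an assumption): `toR` converts Python values into rpy2 vectors
# for the R h2oEnsemble calls above; a minimal version might look like:
#
#     import rpy2.robjects as ro
#     def toR(v):
#         if isinstance(v, (list, tuple)):
#             if all(isinstance(e, str) for e in v):
#                 return ro.StrVector(v)
#             return ro.FloatVector(v)
#         return v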
def glrm_unitonesparse():
    m = 1000
    n = 100
    k = 10

    print "Uploading random uniform matrix with rows = " + str(
        m) + " and cols = " + str(n)
    Y = np.random.rand(k, n)

    def ind_list(k):
        tmp = [0] * k
        tmp[np.random.randint(0, k)] = 1
        return tmp

    X = [ind_list(k) for _ in range(m)]
    X = np.array(X)
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(list(zip(*train.tolist())))

    print "Run GLRM with unit one-sparse regularization on X"
    initial_y = np.random.rand(k, n)
    initial_y_h2o = h2o.H2OFrame(zip(*initial_y.tolist()))
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=k,
                                              init="User",
                                              user_y=initial_y_h2o,
                                              loss="Quadratic",
                                              regularization_x="UnitOneSparse",
                                              regularization_y="None",
                                              gamma_x=1,
                                              gamma_y=0)
    glrm_h2o.train(x=train_h2o.names, training_frame=train_h2o)
    #   glrm_h2o = h2o.glrm(x=train_h2o, k=k, init="User", user_y=initial_y_h2o, loss="Quadratic", regularization_x="UnitOneSparse", regularization_y="None", gamma_x=1, gamma_y=0)
    glrm_h2o.show()

    print "Check that X matrix consists of rows of basis vectors"
    fit_x = h2o.get_frame(
        glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))

    def is_basis(a):
        zeros = np.where(a == 0)[0].size
        ones = np.where(a == 1)[0].size
        basis = ones == 1 and (zeros + ones) == k
        assert basis, "Got " + str(ones) + " ones and " + str(
            zeros) + " zeros, but expected all zeros except a single 1"
        return basis

    np.apply_along_axis(is_basis, 1, fit_x_np)

    print "Check final objective function value"
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train.__sub__(fit_xy)))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(
        glrm_obj) + " but should equal " + str(sse)

    print "Impute XY and check error metrics"
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(
        pred_np, fit_xy
    ), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output'][
        'training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output'][
        'training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(
        glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(
        glrm_caterr) + " but should be zero"
Example 60
                                       regularization_y = "None",
                                       max_iterations = 1000,
                                       min_step_size = 1e-6)
model_c1.train(training_frame=dfh2o_c1)
model_c1.show()

# Print importance of each component of GLRM model
model_c1._model_json["output"]["importance"]

# Split the feature matrix into product of two matrices X and Y
# The matrix X has the same number of rows as the original feature matrix
# but a reduced number of columns representing the original features
# GLRM matrix factors X and Y

# X and Y matrices for cluster 1
X_matrix_c1 = h2o.get_frame(model_c1._model_json["output"]["representation_name"])
print(X_matrix_c1)
Y_matrix_c1 = model_c1._model_json["output"]["archetypes"]
print(Y_matrix_c1)

# model generated for cluster 2: age and limit balance
model_c2 = H2OGeneralizedLowRankEstimator(k = 1,
                                       loss = "Absolute", multi_loss = "Categorical",
                                       transform = "Standardize",
                                       regularization_x = "None",
                                       regularization_y = "None",
                                       max_iterations = 1000,
                                       min_step_size = 1e-6)
model_c2.train(training_frame=dfh2o_c2)
model_c2.show()
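
# Mirroring the cluster-1 extraction above (a natural continuation; an
# assumption, since the snippet ends here):
X_matrix_c2 = h2o.get_frame(model_c2._model_json["output"]["representation_name"])
print(X_matrix_c2)
Y_matrix_c2 = model_c2._model_json["output"]["archetypes"]
print(Y_matrix_c2)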