# Imports assumed by these example snippets (standard H2O pyunit test harness):
import sys
sys.path.insert(1, "../../")
import h2o
import numpy as np
import random
import random as rd  # a few examples refer to the random module as `rd`
from tests import pyunit_utils


def glrm_cancar():
    print("Importing cancar.csv data...")
    cancarH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/glrm_test/cancar.csv"))
    cancarH2O.describe()
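    # Fit the same rank-4, quadratic-loss model with two different initializations (PlusPlus vs. SVD) for comparison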

    print("Building GLRM model with init = PlusPlus:\n")
    glrm_pp = h2o.glrm(x=cancarH2O,
                       k=4,
                       transform="NONE",
                       init="PlusPlus",
                       loss="Quadratic",
                       regularization_x="None",
                       regularization_y="None",
                       max_iterations=1000)
    glrm_pp.show()

    print("Building GLRM model with init = SVD:\n")
    glrm_svd = h2o.glrm(x=cancarH2O,
                        k=4,
                        transform="NONE",
                        init="SVD",
                        loss="Quadratic",
                        regularization_x="None",
                        regularization_y="None",
                        max_iterations=1000)
    glrm_svd.show()
def glrm_cancar():
    print "Importing cancar.csv data..."
    cancarH2O = h2o.upload_file(tests.locate("smalldata/glrm_test/cancar.csv"))
    cancarH2O.describe()
    
    print "Building GLRM model with init = PlusPlus:\n"
    glrm_pp = h2o.glrm(x=cancarH2O, k=4, transform="NONE", init="PlusPlus", loss="Quadratic", regularization_x="None", regularization_y="None", max_iterations=1000)
    glrm_pp.show()

    print "Building GLRM model with init = SVD:\n"
    glrm_svd = h2o.glrm(x=cancarH2O, k=4, transform="NONE", init="SVD", loss="Quadratic", regularization_x="None", regularization_y="None", max_iterations=1000)
    glrm_svd.show()
def glrm_arrests():
    print "Importing USArrests.csv data..."
    arrestsH2O = h2o.upload_file(h2o.locate("smalldata/pca_test/USArrests.csv"))
    arrestsH2O.describe()

    print "H2O initial Y matrix:\n"
    initial_y = [
        [5.412, 65.24, -7.54, -0.032],
        [2.212, 92.24, -17.54, 23.268],
        [0.312, 123.24, 14.46, 9.768],
        [1.012, 19.24, -15.54, -1.732],
    ]
    initial_y_h2o = h2o.H2OFrame(initial_y)
    initial_y_h2o.show()

    print "H2O GLRM on de-meaned data with quadratic loss:\n"
    glrm_h2o = h2o.glrm(
        x=arrestsH2O,
        k=4,
        transform="DEMEAN",
        loss="Quadratic",
        gamma_x=0,
        gamma_y=0,
        init="User",
        user_points=initial_y_h2o,
        recover_svd=True,
    )
    glrm_h2o.show()
def glrm_iris():
    print("Importing iris_wheader.csv data...")
    irisH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    irisH2O.describe()

    for trans in ["NONE", "DEMEAN", "DESCALE", "STANDARDIZE"]:
        rank = random.randint(1, 7)
        gx = random.uniform(0, 1)
        gy = random.uniform(0, 1)

        print("H2O GLRM with rank k = " + str(rank) + ", gamma_x = " +
              str(gx) + ", gamma_y = " + str(gy) + ", transform = " + trans)
        glrm_h2o = h2o.glrm(x=irisH2O,
                            k=rank,
                            loss="Quadratic",
                            gamma_x=gx,
                            gamma_y=gy,
                            transform=trans)
        glrm_h2o.show()

        print("Impute original data from XY decomposition")
        pred_h2o = glrm_h2o.predict(irisH2O)
        pred_h2o.describe()
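        # Remove the low-rank representation (X) frame from the cluster before the next iteration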
        h2o.remove(glrm_h2o._model_json['output']['representation_name'])
def glrm_iris():
    print("Importing iris_wheader.csv data...")
    irisH2O = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    irisH2O.describe()

    for trans in ["NONE", "DEMEAN", "DESCALE", "STANDARDIZE"]:
        rank = random.randint(1, 7)
        gx = random.uniform(0, 1)
        gy = random.uniform(0, 1)

        print(
            "H2O GLRM with rank k = "
            + str(rank)
            + ", gamma_x = "
            + str(gx)
            + ", gamma_y = "
            + str(gy)
            + ", transform = "
            + trans
        )
        glrm_h2o = h2o.glrm(x=irisH2O, k=rank, loss="Quadratic", gamma_x=gx, gamma_y=gy, transform=trans)
        glrm_h2o.show()

        print("Impute original data from XY decomposition")
        pred_h2o = glrm_h2o.predict(irisH2O)
        pred_h2o.describe()
        h2o.remove(glrm_h2o._model_json["output"]["representation_name"])
def glrm_set_loss_by_col():
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsPy = np.array(h2o.as_list(arrestsH2O))
    arrestsH2O.describe()
    
    print("H2O GLRM with loss by column = Absolute, Quadratic, Quadratic, Huber")
    glrm_h2o = h2o.glrm(x=arrestsH2O, k=3, loss="Quadratic", loss_by_col=["Absolute","Huber"], loss_by_col_idx=[0,3], regularization_x="None", regularization_y="None")
    glrm_h2o.show()
    
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    
    print("Check final objective function value")
    fit_xy = np.dot(fit_x_np, fit_y_np)
    fit_diff = arrestsPy.__sub__(fit_xy)
    obj_val = np.absolute(fit_diff[:,0]) + np.square(fit_diff[:,1]) + np.square(fit_diff[:,2])
    def huber(a):
        return a*a/2 if abs(a) <= 1 else abs(a)-0.5
    huber = np.vectorize(huber)
    obj_val = obj_val + huber(fit_diff[:,3])
    obj_val = np.sum(obj_val)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    assert abs(glrm_obj - obj_val) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(obj_val)
def glrm_benign():
    print "Importing benign.csv data..."
    benignH2O = h2o.upload_file(tests.locate("smalldata/logreg/benign.csv"))
    benignH2O.describe()

    for i in range(8, 16, 2):
        print "H2O GLRM with rank " + str(i) + " decomposition:\n"
        glrm_h2o = h2o.glrm(x=benignH2O, k=i, init="SVD", recover_svd=True)
        glrm_h2o.show()
def glrm_benign():
    print "Importing benign.csv data..."
    benignH2O = h2o.upload_file(tests.locate("smalldata/logreg/benign.csv"))
    benignH2O.describe()
    
    for i in range(8,16,2):
        print "H2O GLRM with rank " + str(i) + " decomposition:\n"
        glrm_h2o = h2o.glrm(x=benignH2O, k=i, init="SVD", recover_svd=True)
        glrm_h2o.show()
def glrm_nnmf():
    m = 1000
    n = 100
    k = 10

    print "Uploading random uniform matrix with rows = " + str(
        m) + " and cols = " + str(n)
    Y = np.random.rand(k, n)
    X = np.random.rand(m, k)
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(train.tolist())

    print "Run GLRM with non-negative regularization"
    initial_y = np.random.rand(k, n)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())
    glrm_h2o = h2o.glrm(x=train_h2o,
                        k=k,
                        init="User",
                        user_y=initial_y_h2o,
                        loss="Quadratic",
                        regularization_x="NonNegative",
                        regularization_y="NonNegative",
                        gamma_x=1,
                        gamma_y=1)
    glrm_h2o.show()

    print "Check that X and Y matrices are non-negative"
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_x = h2o.get_frame(
        glrm_h2o._model_json['output']['loading_key']['name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    assert np.all(fit_y_np >= 0), "Y must contain only non-negative elements"
    assert np.all(fit_x_np >= 0), "X must contain only non-negative elements"

    print "Check final objective function value"
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train.__sub__(fit_xy)))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(
        glrm_obj) + " but should equal " + str(sse)

    print "Impute XY and check error metrics"
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(
        pred_np, fit_xy
    ), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output'][
        'training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output'][
        'training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(
        glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(
        glrm_caterr) + " but should be zero"
def glrm_unitonesparse():
    m = 1000
    n = 100
    k = 10

    print "Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n)
    Y = np.random.rand(k,n)
    def ind_list(k):
        tmp = [0] * k
        tmp[np.random.randint(0,k)] = 1
        return tmp
    X = [ind_list(k) for x in xrange(m)]
    X = np.array(X)
    train = np.dot(X,Y)
    train_h2o = h2o.H2OFrame(zip(*train.tolist()))

    print "Run GLRM with unit one-sparse regularization on X"
    initial_y = np.random.rand(k,n)
    initial_y_h2o = h2o.H2OFrame(zip(*initial_y.tolist()))
    glrm_h2o = h2o.glrm(x=train_h2o, k=k, init="User", user_y=initial_y_h2o, loss="Quadratic", regularization_x="UnitOneSparse", regularization_y="None", gamma_x=1, gamma_y=0)
    glrm_h2o.show()

    print "Check that X matrix consists of rows of basis vectors"
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    def is_basis(a):
        zeros = np.where(a == 0)[0].size
        ones = np.where(a == 1)[0].size
        basis = ones == 1 and (zeros + ones) == k
        assert basis, "Got " + str(ones) + " ones and " + str(zeros) + " zeros, but expected all zeros except a single 1"
        return basis
    np.apply_along_axis(is_basis, 1, fit_x_np)

    print "Check final objective function value"
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train.__sub__(fit_xy)))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print "Impute XY and check error metrics"
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
def glrm_simplex():
    m = 1000
    n = 100
    k = 10
    
    print "Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n)
    Y = np.random.rand(k,n)
    def ind_list(k):
        tmp = [0] * k
        tmp[np.random.randint(0,k)] = 1
        return tmp
    X = [ind_list(k) for _ in range(m)]
    X = np.array(X)
    train = np.dot(X,Y)
    train_h2o = h2o.H2OFrame(train.tolist())
    
    print "Run GLRM with quadratic mixtures (simplex) regularization on X"
    initial_y = np.random.rand(k,n)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())
    glrm_h2o = h2o.glrm(x=train_h2o, k=k, init="User", user_y=initial_y_h2o, loss="Quadratic", regularization_x="Simplex", regularization_y="None", gamma_x=1, gamma_y=0)
    glrm_h2o.show()
    
    print "Check that X matrix consists of rows within standard probability simplex"
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['loading_key']['name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    def is_simplex(a):
        row_sum = sum(a)
        simplex = abs(row_sum - 1) < 1e-6
        assert simplex, "Got sum over row = " + row_sum + ", but expected 1"
        return simplex
    np.apply_along_axis(is_simplex, 1, fit_x_np)
    
    print "Check final objective function value"
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train.__sub__(fit_xy)))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)
    
    print "Impute XY and check error metrics"
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
def glrm_prostate_miss():
    missing_ratios = np.arange(0.1, 1, 0.1).tolist()
    
    print("Importing prostate_cat.csv data and saving for validation...")
    prostate_full = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"), na_strings=["NA"]*8)
    prostate_full.describe()
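    # Count the non-missing entries in the full frame; used below to sanity-check the metric counts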
    totnas = 0
    for i in range(prostate_full.ncol):
        totnas = totnas + prostate_full[i].isna().sum()
    totobs = prostate_full.nrow * prostate_full.ncol - totnas
    
    train_numerr = [0]*len(missing_ratios)
    valid_numerr = [0]*len(missing_ratios)
    train_caterr = [0]*len(missing_ratios)
    valid_caterr = [0]*len(missing_ratios)
    
    for i in range(len(missing_ratios)):
        ratio = missing_ratios[i]
        print("Importing prostate_cat.csv and inserting {0}% missing entries".format(100*ratio))
        prostate_miss = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
        prostate_miss = prostate_miss.insert_missing_values(fraction=ratio)
        prostate_miss.describe()
        
        print("H2O GLRM with {0}% missing entries".format(100*ratio))
        prostate_glrm = h2o.glrm(x=prostate_miss, validation_frame=prostate_full, k=8, ignore_const_cols=False, loss="Quadratic", gamma_x=0.5, gamma_y=0.5, regularization_x="L1", regularization_y="L1", init="SVD", max_iterations=2000, min_step_size=1e-6)
        prostate_glrm.show()
        
        # Check imputed data and error metrics
        train_numcnt = prostate_glrm._model_json['output']['training_metrics']._metric_json['numcnt']
        valid_numcnt = prostate_glrm._model_json['output']['validation_metrics']._metric_json['numcnt']
        train_catcnt = prostate_glrm._model_json['output']['training_metrics']._metric_json['catcnt']
        valid_catcnt = prostate_glrm._model_json['output']['validation_metrics']._metric_json['catcnt']
        assert valid_numcnt >= train_numcnt, "Number of non-missing numeric entries in training data should be less than or equal to validation data"
        assert valid_catcnt >= train_catcnt, "Number of non-missing categorical entries in training data should be less than or equal to validation data"
        assert (train_numcnt + valid_numcnt) < totobs, "Total non-missing numeric entries in training and validation data was {0}, but should be less than {1}".format(train_numcnt + valid_numcnt, totobs)
        assert (valid_numcnt + valid_catcnt) == totobs, "Number of non-missing entries in validation data was {0}, but should be {1}".format(valid_numcnt + valid_catcnt, totobs)

        train_numerr[i] = prostate_glrm._model_json['output']['training_metrics']._metric_json['numerr']
        valid_numerr[i] = prostate_glrm._model_json['output']['validation_metrics']._metric_json['numerr']
        train_caterr[i] = prostate_glrm._model_json['output']['training_metrics']._metric_json['caterr']
        valid_caterr[i] = prostate_glrm._model_json['output']['validation_metrics']._metric_json['caterr']
        h2o.remove(prostate_glrm._model_json['output']['representation_name'])
    
    for i in range(len(missing_ratios)):
        print("Missing ratio: {0}% --> Training numeric error: {1}\tValidation numeric error: {2}".format(missing_ratios[i]*100, train_numerr[i], valid_numerr[i]))
        
    for i in range(len(missing_ratios)):
        print("Missing ratio: {0}% --> Training categorical error: {1}\tValidation categorical error: {2}".format(missing_ratios[i]*100, train_caterr[i], valid_caterr[i]))
def glrm_prostate_miss():
    missing_ratios = np.arange(0.1, 1, 0.1).tolist()
    
    print "Importing prostate_cat.csv data and saving for validation..."
    prostate_full = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"), na_strings=["NA"]*8)
    prostate_full.describe()
    totnas = 0
    for i in range(prostate_full.ncol):
        totnas = totnas + prostate_full[i].isna().sum()
    totobs = prostate_full.nrow * prostate_full.ncol - totnas
    
    train_numerr = [0]*len(missing_ratios)
    valid_numerr = [0]*len(missing_ratios)
    train_caterr = [0]*len(missing_ratios)
    valid_caterr = [0]*len(missing_ratios)
    
    for i in range(len(missing_ratios)):
        ratio = missing_ratios[i]
        print "Importing prostate_cat.csv and inserting {0}% missing entries".format(100*ratio)
        prostate_miss = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
        prostate_miss = prostate_miss.insert_missing_values(fraction=ratio)
        prostate_miss.describe()
        
        print "H2O GLRM with {0}% missing entries".format(100*ratio)
        prostate_glrm = h2o.glrm(x=prostate_miss, validation_frame=prostate_full, k=8, ignore_const_cols=False, loss="Quadratic", gamma_x=0.5, gamma_y=0.5, regularization_x="L1", regularization_y="L1", init="SVD", max_iterations=2000, min_step_size=1e-6)
        prostate_glrm.show()
        
        # Check imputed data and error metrics
        train_numcnt = prostate_glrm._model_json['output']['training_metrics']._metric_json['numcnt']
        valid_numcnt = prostate_glrm._model_json['output']['validation_metrics']._metric_json['numcnt']
        train_catcnt = prostate_glrm._model_json['output']['training_metrics']._metric_json['catcnt']
        valid_catcnt = prostate_glrm._model_json['output']['validation_metrics']._metric_json['catcnt']
        assert valid_numcnt >= train_numcnt, "Number of non-missing numeric entries in training data should be less than or equal to validation data"
        assert valid_catcnt >= train_catcnt, "Number of non-missing categorical entries in training data should be less than or equal to validation data"
        assert (train_numcnt + valid_numcnt) < totobs, "Total non-missing numeric entries in training and validation data was {0}, but should be less than {1}".format(train_numcnt + valid_numcnt, totobs)
        assert (valid_numcnt + valid_catcnt) == totobs, "Number of non-missing entries in validation data was {0}, but should be {1}".format(valid_numcnt + valid_catcnt, totobs)

        train_numerr[i] = prostate_glrm._model_json['output']['training_metrics']._metric_json['numerr']
        valid_numerr[i] = prostate_glrm._model_json['output']['validation_metrics']._metric_json['numerr']
        train_caterr[i] = prostate_glrm._model_json['output']['training_metrics']._metric_json['caterr']
        valid_caterr[i] = prostate_glrm._model_json['output']['validation_metrics']._metric_json['caterr']
        h2o.remove(prostate_glrm._model_json['output']['representation_name'])
    
    for i in range(len(missing_ratios)):
        print "Missing ratio: {0}% --> Training numeric error: {1}\tValidation numeric error: {2}".format(missing_ratios[i]*100, train_numerr[i], valid_numerr[i])
        
    for i in range(len(missing_ratios)):
        print "Missing ratio: {0}% --> Training categorical error: {1}\tValidation categorical error: {2}".format(missing_ratios[i]*100, train_caterr[i], valid_caterr[i])
def glrm_arrests():
    print "Importing USArrests.csv data..."
    arrestsH2O = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsH2O.describe()
    
    print "H2O initial Y matrix:\n"
    initial_y = [[5.412,  65.24,  -7.54, -0.032],
                 [2.212,  92.24, -17.54, 23.268],
                 [0.312, 123.24,  14.46,  9.768],
                 [1.012,  19.24, -15.54, -1.732]]
    initial_y_h2o = h2o.H2OFrame(initial_y)
    initial_y_h2o.show()

    print "H2O GLRM on de-meaned data with quadratic loss:\n"
    glrm_h2o = h2o.glrm(x=arrestsH2O, k=4, transform="DEMEAN", loss="Quadratic", gamma_x=0, gamma_y=0, init="User", user_y=initial_y_h2o, recover_svd=True)
    glrm_h2o.show()
def glrm_set_loss_by_col():
    print "Importing USArrests.csv data..."
    arrestsH2O = h2o.upload_file(h2o.locate("smalldata/pca_test/USArrests.csv"))
    arrestsH2O.describe()

    print "H2O GLRM with loss by column = L1, Quadratic, Quadratic, Huber"
    glrm_h2o = h2o.glrm(
        x=arrestsH2O,
        k=3,
        loss="Quadratic",
        loss_by_col=["L1", "Huber"],
        loss_by_col_idx=[0, 3],
        regularization_x="None",
        regularization_y="None",
    )
    glrm_h2o.show()
def glrm_arrests_miss():
    missing_ratios = np.arange(0.1, 1, 0.1).tolist()
    
    print "Importing USArrests.csv data and saving for validation..."
    arrests_full = h2o.upload_file(h2o.locate("smalldata/pca_test/USArrests.csv"))
    arrests_full.describe()
    totobs = arrests_full.nrow * arrests_full.ncol
    train_err = [0]*len(missing_ratios)
    valid_err = [0]*len(missing_ratios)
    
    for i in range(len(missing_ratios)):
        ratio = missing_ratios[i]
        print "Importing USArrests.csv and inserting {0}% missing entries".format(100*ratio)
        arrests_miss = h2o.upload_file(h2o.locate("smalldata/pca_test/USArrests.csv"))
        arrests_miss = arrests_miss.insert_missing_values(fraction=ratio)
        arrests_miss.describe()
        
        print "H2O GLRM with {0}% missing entries".format(100*ratio)
        arrests_glrm = h2o.glrm(x=arrests_miss, validation_frame=arrests_full, k=4, ignore_const_cols=False, loss="Quadratic", regularization_x="None", regularization_y="None", init="PlusPlus", max_iterations=10, min_step_size=1e-6)
        arrests_glrm.show()
        
        # Check imputed data and error metrics
        glrm_obj = arrests_glrm._model_json['output']['objective']
        train_numerr = arrests_glrm._model_json['output']['training_metrics']._metric_json['numerr']
        train_caterr = arrests_glrm._model_json['output']['training_metrics']._metric_json['caterr']
        valid_numerr = arrests_glrm._model_json['output']['validation_metrics']._metric_json['numerr']
        valid_caterr = arrests_glrm._model_json['output']['validation_metrics']._metric_json['caterr']
        assert abs(train_numerr - glrm_obj) < 1e-3, "Numeric error on training data was " + str(train_numerr) + " but should equal final objective " + str(glrm_obj)
        assert train_caterr == 0, "Categorical error on training data was " + str(train_caterr) + " but should be zero"
        assert valid_caterr == 0, "Categorical error on validation data was " + str(valid_caterr) + " but should be zero"
        
        train_numcnt = arrests_glrm._model_json['output']['training_metrics']._metric_json['numcnt']
        valid_numcnt = arrests_glrm._model_json['output']['validation_metrics']._metric_json['numcnt']
        assert valid_numcnt > train_numcnt, "Number of non-missing numerical entries in training data should be less than validation data"
        assert valid_numcnt == totobs, "Number of non-missing numerical entries in validation data was " + str(valid_numcnt) + " but should be " + str(totobs)
        
        train_err[i] = train_numerr
        valid_err[i] = valid_numerr
        h2o.remove(arrests_glrm._model_json['output']['loading_key']['name'])
    
    for i in range(len(missing_ratios)):
        print "Missing ratio: {0}% --> Training error: {1}\tValidation error: {2}".format(missing_ratios[i]*100, train_err[i], valid_err[i])
def glrm_arrests_miss():
    missing_ratios = np.arange(0.1, 1, 0.1).tolist()
    
    print("Importing USArrests.csv data and saving for validation...")
    arrests_full = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrests_full.describe()
    totobs = arrests_full.nrow * arrests_full.ncol
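    # The full frame serves as the validation set, so validation metrics measure how well GLRM imputes the held-out entries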
    train_err = [0]*len(missing_ratios)
    valid_err = [0]*len(missing_ratios)
    
    for i in range(len(missing_ratios)):
        ratio = missing_ratios[i]
        print("Importing USArrests.csv and inserting {0}% missing entries".format(100*ratio))
        arrests_miss = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
        arrests_miss = arrests_miss.insert_missing_values(fraction=ratio)
        arrests_miss.describe()
        
        print("H2O GLRM with {0}% missing entries".format(100*ratio))
        arrests_glrm = h2o.glrm(x=arrests_miss, validation_frame=arrests_full, k=4, ignore_const_cols=False, loss="Quadratic", regularization_x="None", regularization_y="None", init="PlusPlus", max_iterations=10, min_step_size=1e-6)
        arrests_glrm.show()
        
        # Check imputed data and error metrics
        glrm_obj = arrests_glrm._model_json['output']['objective']
        train_numerr = arrests_glrm._model_json['output']['training_metrics']._metric_json['numerr']
        train_caterr = arrests_glrm._model_json['output']['training_metrics']._metric_json['caterr']
        valid_numerr = arrests_glrm._model_json['output']['validation_metrics']._metric_json['numerr']
        valid_caterr = arrests_glrm._model_json['output']['validation_metrics']._metric_json['caterr']
        assert abs(train_numerr - glrm_obj) < 1e-3, "Numeric error on training data was " + str(train_numerr) + " but should equal final objective " + str(glrm_obj)
        assert train_caterr == 0, "Categorical error on training data was " + str(train_caterr) + " but should be zero"
        assert valid_caterr == 0, "Categorical error on validation data was " + str(valid_caterr) + " but should be zero"
        
        train_numcnt = arrests_glrm._model_json['output']['training_metrics']._metric_json['numcnt']
        valid_numcnt = arrests_glrm._model_json['output']['validation_metrics']._metric_json['numcnt']
        assert valid_numcnt > train_numcnt, "Number of non-missing numerical entries in training data should be less than validation data"
        assert valid_numcnt == totobs, "Number of non-missing numerical entries in validation data was " + str(valid_numcnt) + " but should be " + str(totobs)
        
        train_err[i] = train_numerr
        valid_err[i] = valid_numerr
        h2o.remove(arrests_glrm._model_json['output']['representation_name'])
    
    for i in range(len(missing_ratios)):
        print("Missing ratio: {0}% --> Training error: {1}\tValidation error: {2}".format(missing_ratios[i]*100, train_err[i], valid_err[i]))
def glrm_set_loss_by_col():
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsPy = np.array(h2o.as_list(arrestsH2O))
    arrestsH2O.describe()

    print(
        "H2O GLRM with loss by column = Absolute, Quadratic, Quadratic, Huber")
    glrm_h2o = h2o.glrm(x=arrestsH2O,
                        k=3,
                        loss="Quadratic",
                        loss_by_col=["Absolute", "Huber"],
                        loss_by_col_idx=[0, 3],
                        regularization_x="None",
                        regularization_y="None")
    glrm_h2o.show()

    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_x = h2o.get_frame(
        glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))

    print("Check final objective function value")
    fit_xy = np.dot(fit_x_np, fit_y_np)
    fit_diff = arrestsPy.__sub__(fit_xy)
    obj_val = np.absolute(fit_diff[:, 0]) + np.square(
        fit_diff[:, 1]) + np.square(fit_diff[:, 2])

    def huber(a):
        return a * a / 2 if abs(a) <= 1 else abs(a) - 0.5

    huber = np.vectorize(huber)
    obj_val = obj_val + huber(fit_diff[:, 3])
    obj_val = np.sum(obj_val)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    assert abs(glrm_obj - obj_val) < 1e-6, "Final objective was " + str(
        glrm_obj) + " but should equal " + str(obj_val)
def glrm_nnmf():
    m = 1000
    n = 100
    k = 10

    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k,n)
    X = np.random.rand(m, k)
    train = np.dot(X,Y)
    train_h2o = h2o.H2OFrame(list(zip(*train.tolist())))

    print("Run GLRM with non-negative regularization")
    initial_y = np.random.rand(n,k)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())
    glrm_h2o = h2o.glrm(x=train_h2o, k=k, init="User", user_y=initial_y_h2o, loss="Quadratic", regularization_x="NonNegative", regularization_y="NonNegative", gamma_x=1, gamma_y=1)
    glrm_h2o.show()

    print("Check that X and Y matrices are non-negative")
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    assert np.all(fit_y_np >= 0), "Y must contain only non-negative elements"
    assert np.all(fit_x_np >= 0), "X must contain only non-negative elements"

    print("Check final objective function value")
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train.__sub__(fit_xy)))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
def glrm_orthonnmf():
    m = 1000
    n = 100
    k = 10

    print("Uploading random uniform matrix with rows = " + str(m) +
          " and cols = " + str(n))
    Y = np.random.rand(k, n)
    X = np.random.rand(m, k)
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(train.tolist())

    print(
        "Run GLRM with orthogonal non-negative regularization on X, non-negative regularization on Y"
    )
    initial_y = np.random.rand(k, n)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())
    glrm_h2o = h2o.glrm(x=train_h2o,
                        k=k,
                        init="User",
                        user_y=initial_y_h2o,
                        loss="Quadratic",
                        regularization_x="OneSparse",
                        regularization_y="NonNegative",
                        gamma_x=1,
                        gamma_y=1)
    glrm_h2o.show()

    print("Check that X and Y matrices are non-negative")
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_x = h2o.get_frame(
        glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    assert np.all(fit_y_np >= 0), "Y must contain only non-negative elements"
    assert np.all(fit_x_np >= 0), "X must contain only non-negative elements"

    print("Check that columns of X are orthogonal")
    xtx = np.dot(np.transpose(fit_x_np), fit_x_np)
    offdiag = np.extract(1 - np.eye(k), xtx)
    assert np.all(
        offdiag == 0), "All off diagonal elements of X'X must equal zero"

    print("Check final objective function value")
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train.__sub__(fit_xy)))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(
        glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(
        pred_np, fit_xy
    ), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output'][
        'training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output'][
        'training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(
        glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(
        glrm_caterr) + " but should be zero"

    print(
        "Run GLRM with orthogonal non-negative regularization on both X and Y")
    initial_y = np.random.rand(k, n)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())
    glrm_h2o = h2o.glrm(x=train_h2o,
                        k=k,
                        init="User",
                        user_y=initial_y_h2o,
                        loss="Quadratic",
                        regularization_x="OneSparse",
                        regularization_y="OneSparse",
                        gamma_x=1,
                        gamma_y=1)
    glrm_h2o.show()

    print("Check that X and Y matrices are non-negative")
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_x = h2o.get_frame(
        glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    assert np.all(fit_y_np >= 0), "Y must contain only non-negative elements"
    assert np.all(fit_x_np >= 0), "X must contain only non-negative elements"

    print("Check that columns of X are orthogonal")
    xtx = np.dot(np.transpose(fit_x_np), fit_x_np)
    offdiag_x = np.extract(1 - np.eye(k), xtx)
    assert np.all(
        offdiag_x == 0), "All off diagonal elements of X'X must equal zero"

    print("Check that rows of Y are orthogonal")
    yyt = np.dot(fit_y_np, np.transpose(fit_y_np))
    offdiag_y = np.extract(1 - np.eye(k), yyt)
    assert np.all(
        offdiag_y == 0), "All off diagonal elements of YY' must equal zero"

    print("Check final objective function value")
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train.__sub__(fit_xy)))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(
        glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(
        pred_np, fit_xy
    ), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output'][
        'training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output'][
        'training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(
        glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(
        glrm_caterr) + " but should be zero"
def glrm_set_loss_by_col_rand():
    NUM_LOSS = ["Quadratic", "Absolute", "Huber", "Poisson", "Periodic"]
    CAT_LOSS = ["Categorical", "Ordinal"]
    NUM_COLS = [1, 5, 6, 7]
    CAT_COLS = [0, 2, 3, 4]

    print "Importing prostate_cat.csv data..."
    prostateH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"),
        na_strings=["NA"] * 8)
    prostateH2O.describe()

    # Fully specify every column's loss function (no need for loss_by_col_idx)
    loss_all = [rd.sample(NUM_LOSS, k=1)[0] if x in NUM_COLS else rd.sample(CAT_LOSS, k=1)[0]
                for x in range(0, 8)]
    print "Run GLRM with loss_by_col = [" + ', '.join(loss_all) + "]"
    glrm_h2o = h2o.glrm(x=prostateH2O, k=5, loss_by_col=loss_all)
    glrm_h2o.show()

    # Randomly set columns and loss functions
    cat_size = rd.sample(range(1, 5), 1)
    num_size = rd.sample(range(1, 5), 1)
    cat_idx = np.random.choice(CAT_COLS, size=cat_size, replace=False)
    num_idx = np.random.choice(NUM_COLS, size=num_size, replace=False)
    loss_by_col_cat = np.random.choice(CAT_LOSS, size=cat_size, replace=True)
    loss_by_col_num = np.random.choice(NUM_LOSS, size=num_size, replace=True)

    loss_idx_all = cat_idx.tolist() + num_idx.tolist()
    loss_all = loss_by_col_cat.tolist() + loss_by_col_num.tolist()
    loss_combined = list(zip(loss_all, loss_idx_all))  # Permute losses and indices in the same way for testing
    rd.shuffle(loss_combined)
    loss_all[:], loss_idx_all[:] = zip(*loss_combined)
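    # Each of the following calls passes an invalid loss_by_col / loss_by_col_idx combination
    # (mismatched lengths, an out-of-range index, or a loss incompatible with the column type) and is expected to raise an error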

    if (len(loss_all) < prostateH2O.ncol):
        try:
            h2o.glrm(x=prostateH2O, k=5, loss_by_col=loss_all)
            assert False, "Expected GLRM to throw error since column indices not specified"
        except:
            pass

    try:
        h2o.glrm(x=prostateH2O, k=5, loss_by_col_idx=loss_idx_all)
        assert False, "Expected GLRM to throw error since losses for columns not specified"
    except:
        pass

    try:
        h2o.glrm(x=prostateH2O,
                 k=5,
                 loss_by_col=["Absolute", "Ordinal", "Huber"],
                 loss_by_col_idx=[1, 2])
        assert False, "Expected GLRM to throw error since not all column indices specified"
    except:
        pass

    try:
        h2o.glrm(x=prostateH2O,
                 k=5,
                 loss_by_col=["Absolute", "Ordinal"],
                 loss_by_col_idx=[1, 2, 5])
        assert False, "Expected GLRM to throw error since not all losses for columns specified"
    except:
        pass

    try:
        h2o.glrm(x=prostateH2O, k=5, loss_by_col="Absolute", loss_by_col_idx=8)
        assert False, "Expected GLRM to throw error since column index 8 is out of bounds (zero indexing)"
    except:
        pass

    try:
        h2o.glrm(x=prostateH2O,
                 k=5,
                 loss_by_col=rd.sample(NUM_LOSS, 1),
                 loss_by_col_idx=rd.sample(CAT_COLS, 1))
        assert False, "Expected GLRM to throw error since numeric loss cannot apply to categorical column"
    except:
        pass

    try:
        h2o.glrm(x=prostateH2O,
                 k=5,
                 loss_by_col=rd.sample(CAT_LOSS, 1),
                 loss_by_col_idx=rd.sample(NUM_COLS, 1))
        assert False, "Expected GLRM to throw error since categorical loss cannot apply to numeric column"
    except:
        pass

    print "Run GLRM with loss_by_col = [" + ', '.join(
        loss_all) + "] and loss_by_col_idx = [" + ', '.join(
            [str(a) for a in loss_idx_all]) + "]"
    glrm_h2o = h2o.glrm(x=prostateH2O,
                        k=5,
                        loss_by_col=loss_all,
                        loss_by_col_idx=loss_idx_all)
    glrm_h2o.show()
def glrm_set_loss_by_col_rand():
    NUM_LOSS = ["Quadratic", "Absolute", "Huber", "Poisson", "Periodic"]
    CAT_LOSS = ["Categorical", "Ordinal"]
    NUM_COLS = [1, 5, 6, 7]
    CAT_COLS = [0, 2, 3, 4]
    
    print("Importing prostate_cat.csv data...")
    prostateH2O = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"), na_strings = ["NA"]*8)
    prostateH2O.describe()
    
    # Fully specify every column's loss function (no need for loss_by_col_idx)
    loss_all = [rd.sample(NUM_LOSS, k=1)[0] if x in NUM_COLS else rd.sample(CAT_LOSS, k=1)[0] for x in range(0,8)]
    print("Run GLRM with loss_by_col = [" + ', '.join(loss_all) + "]")
    glrm_h2o = h2o.glrm(x=prostateH2O, k=5, loss_by_col=loss_all)
    glrm_h2o.show()
    
    # Randomly set columns and loss functions
    cat_size = rd.sample(range(1,5), 1)
    num_size = rd.sample(range(1,5), 1)
    cat_idx = np.random.choice(CAT_COLS, size=cat_size, replace=False)
    num_idx = np.random.choice(NUM_COLS, size=num_size, replace=False)
    loss_by_col_cat = np.random.choice(CAT_LOSS, size=cat_size, replace=True)
    loss_by_col_num = np.random.choice(NUM_LOSS, size=num_size, replace=True)
    
    loss_idx_all = cat_idx.tolist() + num_idx.tolist()
    loss_all = loss_by_col_cat.tolist() + loss_by_col_num.tolist()
    loss_combined = list(zip(loss_all, loss_idx_all))   # Permute losses and indices in same way for testing
    rd.shuffle(loss_combined)
    loss_all[:], loss_idx_all[:] = list(zip(*loss_combined))
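    # Each of the following calls passes an invalid loss_by_col / loss_by_col_idx combination
    # (mismatched lengths, an out-of-range index, or a loss incompatible with the column type) and is expected to raise an error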
    
    if(len(loss_all) < prostateH2O.ncol):
        try:
            h2o.glrm(x=prostateH2O, k=5, loss_by_col=loss_all)
            assert False, "Expected GLRM to throw error since column indices not specified"
        except:
            pass
    
    try:
        h2o.glrm(x=prostateH2O, k=5, loss_by_col_idx=loss_idx_all)
        assert False, "Expected GLRM to throw error since losses for columns not specified"
    except:
        pass
        
    try:
        h2o.glrm(x=prostateH2O, k=5, loss_by_col=["Absolute", "Ordinal", "Huber"], loss_by_col_idx = [1,2])
        assert False, "Expected GLRM to throw error since not all column indices specified"
    except:
        pass
        
    try:
        h2o.glrm(x=prostateH2O, k=5, loss_by_col=["Absolute", "Ordinal"], loss_by_col_idx=[1,2,5])
        assert False, "Expected GLRM to throw error since not all losses for columns specified"
    except:
        pass
    
    try:
        h2o.glrm(x=prostateH2O, k=5, loss_by_col="Absolute", loss_by_col_idx=8)
        assert False, "Expected GLRM to throw error since column index 8 is out of bounds (zero indexing)"
    except:
        pass
    
    try:
        h2o.glrm(x=prostateH2O, k=5, loss_by_col=rd.sample(NUM_LOSS,1), loss_by_col_idx=rd.sample(CAT_COLS,1))
        assert False, "Expected GLRM to throw error since numeric loss cannot apply to categorical column"
    except:
        pass
    
    try:
        h2o.glrm(x=prostateH2O, k=5, loss_by_col=rd.sample(CAT_LOSS,1), loss_by_col_idx=rd.sample(NUM_COLS,1))
        assert False, "Expected GLRM to throw error since categorical loss cannot apply to numeric column"
    except:
        pass
    
    print("Run GLRM with loss_by_col = [" + ', '.join(loss_all) + "] and loss_by_col_idx = [" + ', '.join([str(a) for a in loss_idx_all]) + "]")
    glrm_h2o = h2o.glrm(x=prostateH2O, k=5, loss_by_col=loss_all, loss_by_col_idx=loss_idx_all)
    glrm_h2o.show()
def glrm_simplex():
    m = 1000
    n = 100
    k = 10

    print "Uploading random uniform matrix with rows = " + str(
        m) + " and cols = " + str(n)
    Y = np.random.rand(k, n)

    def ind_list(k):
        tmp = [0] * k
        tmp[np.random.randint(0, k)] = 1
        return tmp

    X = [ind_list(k) for _ in range(m)]
    X = np.array(X)
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(train.tolist())

    print "Run GLRM with quadratic mixtures (simplex) regularization on X"
    initial_y = np.random.rand(k, n)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())
    glrm_h2o = h2o.glrm(x=train_h2o,
                        k=k,
                        init="User",
                        user_y=initial_y_h2o,
                        loss="Quadratic",
                        regularization_x="Simplex",
                        regularization_y="None",
                        gamma_x=1,
                        gamma_y=0)
    glrm_h2o.show()

    print "Check that X matrix consists of rows within standard probability simplex"
    fit_x = h2o.get_frame(
        glrm_h2o._model_json['output']['loading_key']['name'])
    fit_x_np = np.array(h2o.as_list(fit_x))

    def is_simplex(a):
        row_sum = sum(a)
        simplex = abs(row_sum - 1) < 1e-6
        assert simplex, "Got sum over row = " + row_sum + ", but expected 1"
        return simplex

    np.apply_along_axis(is_simplex, 1, fit_x_np)

    print "Check final objective function value"
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train.__sub__(fit_xy)))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(
        glrm_obj) + " but should equal " + str(sse)

    print "Impute XY and check error metrics"
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(
        pred_np, fit_xy
    ), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output'][
        'training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output'][
        'training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(
        glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(
        glrm_caterr) + " but should be zero"
def glrm_orthonnmf():
    m = 1000
    n = 100
    k = 10
    
    print "Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n)
    Y = np.random.rand(k,n)
    X = np.random.rand(m, k)
    train = np.dot(X,Y)
    train_h2o = h2o.H2OFrame(train.tolist())
    
    print "Run GLRM with orthogonal non-negative regularization on X, non-negative regularization on Y"
    initial_y = np.random.rand(k,n)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())
    glrm_h2o = h2o.glrm(x=train_h2o, k=k, init="User", user_y=initial_y_h2o, loss="Quadratic", regularization_x="OneSparse", regularization_y="NonNegative", gamma_x=1, gamma_y=1)
    glrm_h2o.show()
    
    print "Check that X and Y matrices are non-negative"
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['loading_key']['name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    assert np.all(fit_y_np >= 0), "Y must contain only non-negative elements"
    assert np.all(fit_x_np >= 0), "X must contain only non-negative elements"
    
    print "Check that columns of X are orthogonal"
    xtx = np.dot(np.transpose(fit_x_np), fit_x_np)
    offdiag = np.extract(1-np.eye(k), xtx)
    assert np.all(offdiag == 0), "All off diagonal elements of X'X must equal zero"
    
    print "Check final objective function value"
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train.__sub__(fit_xy)))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)
    
    print "Impute XY and check error metrics"
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
    
    print "Run GLRM with orthogonal non-negative regularization on both X and Y"
    initial_y = np.random.rand(k,n)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())
    glrm_h2o = h2o.glrm(x=train_h2o, k=k, init="User", user_y=initial_y_h2o, loss="Quadratic", regularization_x="OneSparse", regularization_y="OneSparse", gamma_x=1, gamma_y=1)
    glrm_h2o.show()
    
    print "Check that X and Y matrices are non-negative"
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['loading_key']['name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    assert np.all(fit_y_np >= 0), "Y must contain only non-negative elements"
    assert np.all(fit_x_np >= 0), "X must contain only non-negative elements"
    
    print "Check that columns of X are orthogonal"
    xtx = np.dot(np.transpose(fit_x_np), fit_x_np)
    offdiag_x = np.extract(1-np.eye(k), xtx)
    assert np.all(offdiag_x == 0), "All off diagonal elements of X'X must equal zero"
    
    print "Check that rows of Y are orthogonal"
    yyt = np.dot(fit_y_np, np.transpose(fit_y_np))
    offdiag_y = np.extract(1-np.eye(k), yyt)
    assert np.all(offdiag_y == 0), "All off diagonal elements of YY' must equal zero"
    
    print "Check final objective function value"
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train.__sub__(fit_xy)))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)
    
    print "Impute XY and check error metrics"
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"