Example #1
0
def func():
    """Benchmark lasso fitting over a grid of synthetic dataset sizes.

    For each (rows, cols) combination: generate data, make an 80/20
    train/test split, print the training-matrix size in bytes, and time
    an h2o-vs-sklearn lasso fit via ``fit_model``. Results are printed
    only; the old commented-out CSV result collection was dead code and
    has been removed.

    Relies on module-level ``generate_data``, ``train_test_split``,
    ``fit_model`` and ``time`` being in scope.
    """
    n_rows = [600000, 800000]
    n_cols = [400, 600, 800, 1000, 1200]
    for rows in n_rows:
        for cols in n_cols:
            X, y = generate_data(rows, cols)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42)
            print(X_train.nbytes, 'bytes')

            time_h2o, time_sklearn, r2_h2o, r2_sklearn = fit_model(
                X_train, y_train, X_test, y_test, reg_type='lasso')

            # Short pause between runs — presumably to let resources
            # settle between benchmark iterations; TODO confirm.
            time.sleep(0.1)

        time.sleep(0.1)
        print('DONE!')
Example #2
0
    def fun(nGPUs=1,
            nFolds=1,
            nLambdas=100,
            nAlphas=8,
            validFraction=0.2,
            whichdata=0,
            double_precision=False):
        """Fetch a benchmark dataset, label-encode it, and run ElasticNetH2O.

        Args:
            nGPUs: number of GPUs passed to ``run_glm``.
            nFolds: number of cross-validation folds.
            nLambdas: number of lambdas on the regularization path.
            nAlphas: number of alpha values.
            validFraction: validation fraction passed to ``run_glm``.
            whichdata: selects one of the bundled fetch_* datasets (0..12).
            double_precision: if True, cast X_test to float64, else float32.

        Raises:
            ValueError: if ``whichdata`` is not in 0..12.
        """
        name = str(sys._getframe().f_code.co_name)
        t = time.time()

        print("cwd: %s" % (os.getcwd()))
        sys.stdout.flush()

        # Overwritten with the caller's frame name; used to label the run.
        name = sys._getframe(1).f_code.co_name
        #    pipes = startfunnel(os.path.join(os.getcwd(), "tmp/"), name)

        print("Getting Data")
        from h2o4gpu.datasets import fetch_20newsgroups, fetch_20newsgroups_vectorized, fetch_california_housing, \
            fetch_covtype, fetch_kddcup99, fetch_lfw_pairs, fetch_lfw_people, fetch_mldata, fetch_olivetti_faces, \
            fetch_rcv1, fetch_species_distributions
        from h2o4gpu.model_selection import train_test_split

        # Fetch dataset; sizetokeep caps the number of rows used.
        if whichdata == 0:
            data = fetch_20newsgroups()  # runs
            sizetokeep = 1000  # 1k rows for now
        elif whichdata == 1:
            data = fetch_20newsgroups_vectorized()  # sparse
            sizetokeep = 1000  # 1k rows for now
        elif whichdata == 2:
            data = fetch_california_housing()  # runs
            sizetokeep = 1000
        elif whichdata == 3:
            data = fetch_covtype()
            sizetokeep = 1000  # 1k rows for now
        elif whichdata == 4:
            data = fetch_kddcup99()  # strings -> numeric
            sizetokeep = 1000  # 1k rows for now
        elif whichdata == 5:
            data = fetch_lfw_pairs()
            sizetokeep = 1000  # 1k rows for now
        elif whichdata == 6:
            data = fetch_lfw_people()
            sizetokeep = 1000  # 1k rows for now
        elif whichdata == 7:
            data = fetch_mldata('iris')
            sizetokeep = 1000  # 1k rows for now
        elif whichdata == 8:
            data = fetch_mldata('leukemia')  # runs
            sizetokeep = 1000  # 1k rows for now
        elif whichdata == 9:
            data = fetch_mldata('Whistler Daily Snowfall')
            sizetokeep = 1000  # 1k rows for now
        elif whichdata == 10:
            data = fetch_olivetti_faces()  # runs
            sizetokeep = 100
        elif whichdata == 11:
            data = fetch_rcv1()
            sizetokeep = 1000  # 1k rows for now
            # data = data.todense() # FIXME: glm and kmeans h2o4gpu currently only supports dense matrices
        elif whichdata == 12:
            data = fetch_species_distributions()
            sizetokeep = 1000  # 1k rows for now
        else:
            # BUG FIX: the exception was constructed but never raised,
            # letting bad input fall through to a NameError below.
            raise ValueError("No such whichdata")

        # EAFP: 2-D data supports [:, 0]; 1-D arrays raise IndexError and
        # list-like data raises TypeError, in which case slice as 1-D.
        try:
            sizetokeep = min(sizetokeep, len(data.data[:, 0]))
            X = data.data[0:sizetokeep, :]
        except (IndexError, TypeError):
            sizetokeep = min(sizetokeep, len(data.data[:]))
            X = data.data[0:sizetokeep]
        y = data.target[0:sizetokeep]
        print("Got Data")

        import numpy as np

        # Create 0.8/0.2 train/test split
        print("Split Data")
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            train_size=0.8,
                                                            random_state=42)

        print("Encode Data")
        # from h2o4gpu.preprocessing import Imputer
        # imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        # imp.fit(X, y)
        # Xencoded = imp.transform(X)
        # yencoded = imp.transform(y)

        import pandas as pd
        X_test_pd = pd.DataFrame(X_test)
        X_train_pd = pd.DataFrame(X_train)

        # Importing LabelEncoder and initializing it
        from h2o4gpu.preprocessing import LabelEncoder
        le = LabelEncoder()
        # Iterating over all the common columns in train and test
        for col in X_test_pd.columns.values:
            # Encoding only categorical variables
            if X_test_pd[col].dtypes == 'object' or X_test_pd[
                    col].dtypes == 'bool':
                # Using whole data to form an exhaustive list of levels
                data = X_train_pd[col].append(X_test_pd[col])
                le.fit(data.values)
                X_train_pd[col] = le.transform(X_train_pd[col])
                X_test_pd[col] = le.transform(X_test_pd[col])

        X_train_pd = pd.get_dummies(X_train_pd).fillna(0.0)
        X_test_pd = pd.get_dummies(X_test_pd).fillna(0.0)
        y_train_pd = pd.Series(y_train).fillna(0.0)
        y_test_pd = pd.Series(y_test).fillna(0.0)

        # get back numpy
        X_test = X_test_pd.values
        X_train = X_train_pd.values
        y_test = y_test_pd.values
        y_train = y_train_pd.values

        if double_precision:
            mynptype = np.float64
        else:
            mynptype = np.float32
        # NOTE(review): only X_test honors double_precision; X_train/y are
        # always float64 — looks inconsistent, TODO confirm intent before
        # changing (run_glm may require float64 inputs).
        X_test = X_test.astype(mynptype)
        X_train = X_train.astype(np.float64)
        y_test = y_test.astype(np.float64)
        y_train = y_train.astype(np.float64)

        # TODO: Should write this to file and avoid doing encoding if already exists

        t1 = time.time()
        print("Start ElasticNetH2O")
        rmse_train, rmse_test = run_glm(X_train,
                                        y_train,
                                        X_test,
                                        y_test,
                                        nGPUs=nGPUs,
                                        nlambda=nLambdas,
                                        nfolds=nFolds,
                                        nalpha=nAlphas,
                                        validFraction=validFraction,
                                        verbose=0,
                                        name=name,
                                        tolerance=0.2,
                                        tol=1E-2,
                                        tol_seek_factor=1.0)
        print("End ElasticNetH2O")

        # check rmse
        print(rmse_train[0, 0])
        print(rmse_train[0, 1])
        print(rmse_train[0, 2])
        print(rmse_test[0, 2])
        sys.stdout.flush()

        # BUG FIX: '/n' was a typo for the '\n' newline escape.
        print('\n Total execution time:%d' % (time.time() - t1))

        print("TEST PASSED")
        sys.stdout.flush()

        print("Time taken: {}".format(time.time() - t))
        #    endfunnel(pipes)
        print("DONE.")
        sys.stdout.flush()
Example #3
0
    def fun(nGPUs=1, nFolds=1, nLambdas=100, nAlphas=8, validFraction=0.2, whichdata=0, double_precision=False):
        """Fetch a benchmark dataset, label-encode it, and run ElasticNetH2O.

        Args:
            nGPUs: number of GPUs passed to ``run_glm``.
            nFolds: number of cross-validation folds.
            nLambdas: number of lambdas on the regularization path.
            nAlphas: number of alpha values.
            validFraction: validation fraction passed to ``run_glm``.
            whichdata: selects one of the bundled fetch_* datasets (0..12).
            double_precision: if True, cast X_test to float64, else float32.

        Raises:
            ValueError: if ``whichdata`` is not in 0..12.
        """
        name = str(sys._getframe().f_code.co_name)
        t = time.time()

        print("cwd: %s" % (os.getcwd()))
        sys.stdout.flush()

        # Overwritten with the caller's frame name; used to label the run.
        name = sys._getframe(1).f_code.co_name
        #    pipes = startfunnel(os.path.join(os.getcwd(), "tmp/"), name)

        print("Getting Data")
        from h2o4gpu.datasets import fetch_20newsgroups, fetch_20newsgroups_vectorized, fetch_california_housing, \
            fetch_covtype, fetch_kddcup99, fetch_lfw_pairs, fetch_lfw_people, fetch_mldata, fetch_olivetti_faces, \
            fetch_rcv1, fetch_species_distributions
        from h2o4gpu.model_selection import train_test_split

        # Fetch dataset; sizetokeep caps the number of rows used.
        if whichdata == 0:
            data = fetch_20newsgroups() # runs
            sizetokeep = 1000  # 1k rows for now
        elif whichdata == 1:
            data = fetch_20newsgroups_vectorized() # sparse
            sizetokeep = 1000  # 1k rows for now
        elif whichdata == 2:
            data = fetch_california_housing() # runs
            sizetokeep = 1000
        elif whichdata == 3:
            data = fetch_covtype()
            sizetokeep = 1000  # 1k rows for now
        elif whichdata == 4:
            data = fetch_kddcup99() # strings -> numeric
            sizetokeep = 1000  # 1k rows for now
        elif whichdata == 5:
            data = fetch_lfw_pairs()
            sizetokeep = 1000  # 1k rows for now
        elif whichdata == 6:
            data = fetch_lfw_people()
            sizetokeep = 1000  # 1k rows for now
        elif whichdata == 7:
            data = fetch_mldata('iris')
            sizetokeep = 1000  # 1k rows for now
        elif whichdata == 8:
            data = fetch_mldata('leukemia') # runs
            sizetokeep = 1000  # 1k rows for now
        elif whichdata == 9:
            data = fetch_mldata('Whistler Daily Snowfall')
            sizetokeep = 1000  # 1k rows for now
        elif whichdata == 10:
            data = fetch_olivetti_faces() # runs
            sizetokeep = 100
        elif whichdata == 11:
            data = fetch_rcv1()
            sizetokeep = 1000  # 1k rows for now
            #data = data.todense() # FIXME: glm and kmeans h2o4gpu currently only supports dense matrices
        elif whichdata == 12:
            data = fetch_species_distributions()
            sizetokeep = 1000  # 1k rows for now
        else:
            # BUG FIX: the exception was constructed but never raised,
            # letting bad input fall through to a NameError below.
            raise ValueError("No such whichdata")

        # EAFP: 2-D data supports [:, 0]; 1-D arrays raise IndexError and
        # list-like data raises TypeError, in which case slice as 1-D.
        try:
            sizetokeep = min(sizetokeep, len(data.data[:, 0]))
            X = data.data[0:sizetokeep, :]
        except (IndexError, TypeError):
            sizetokeep = min(sizetokeep, len(data.data[:]))
            X = data.data[0:sizetokeep]
        y = data.target[0:sizetokeep]
        print("Got Data")

        import numpy as np

        # Create 0.8/0.2 train/test split
        print("Split Data")
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.8,
                                                            random_state=42)

        print("Encode Data")
        # from h2o4gpu.preprocessing import Imputer
        # imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        # imp.fit(X, y)
        # Xencoded = imp.transform(X)
        # yencoded = imp.transform(y)

        import pandas as pd
        X_test_pd = pd.DataFrame(X_test)
        X_train_pd = pd.DataFrame(X_train)

        # Importing LabelEncoder and initializing it
        from h2o4gpu.preprocessing import LabelEncoder
        le = LabelEncoder()
        # Iterating over all the common columns in train and test
        for col in X_test_pd.columns.values:
            # Encoding only categorical variables
            if X_test_pd[col].dtypes == 'object' or X_test_pd[col].dtypes == 'bool':
                # Using whole data to form an exhaustive list of levels
                data = X_train_pd[col].append(X_test_pd[col])
                le.fit(data.values)
                X_train_pd[col] = le.transform(X_train_pd[col])
                X_test_pd[col] = le.transform(X_test_pd[col])

        X_train_pd = pd.get_dummies(X_train_pd).fillna(0.0)
        X_test_pd = pd.get_dummies(X_test_pd).fillna(0.0)
        y_train_pd = pd.Series(y_train).fillna(0.0)
        y_test_pd = pd.Series(y_test).fillna(0.0)

        # get back numpy
        X_test = X_test_pd.values
        X_train = X_train_pd.values
        y_test = y_test_pd.values
        y_train = y_train_pd.values

        if double_precision:
            mynptype = np.float64
        else:
            mynptype = np.float32
        # NOTE(review): only X_test honors double_precision; X_train/y are
        # always float64 — looks inconsistent, TODO confirm intent before
        # changing (run_glm may require float64 inputs).
        X_test = X_test.astype(mynptype)
        X_train = X_train.astype(np.float64)
        y_test = y_test.astype(np.float64)
        y_train = y_train.astype(np.float64)

        # TODO: Should write this to file and avoid doing encoding if already exists

        t1 = time.time()
        print("Start ElasticNetH2O")
        rmse_train, rmse_test = run_glm(X_train, y_train, X_test, y_test, nGPUs=nGPUs, nlambda=nLambdas, nfolds=nFolds,
                                        nalpha=nAlphas,
                                        validFraction=validFraction, verbose=10, name=name, tolerance=0.2, tol=1E-2, tol_seek_factor=1.0)
        print("End ElasticNetH2O")

        # check rmse
        print(rmse_train[0, 0])
        print(rmse_train[0, 1])
        print(rmse_train[0, 2])
        print(rmse_test[0, 2])
        sys.stdout.flush()

        # BUG FIX: '/n' was a typo for the '\n' newline escape.
        print('\n Total execution time:%d' % (time.time() - t1))

        print("TEST PASSED")
        sys.stdout.flush()

        print("Time taken: {}".format(time.time() - t))
        #    endfunnel(pipes)
        print("DONE.")
        sys.stdout.flush()