seed = 0 Xtrain, ytrain, Xtest, ytest = covtype(train_size=4000000, seed=seed) covtype_size = [ 100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000 ] new_arr = np.repeat(Xtrain, 3, axis=0) #increase dataset size print(len(new_arr)) total_time = [] #nr_tree = 32 #Change to run script for different number of trees file_name = './models/model_' + str(nr_tree) + 'tree_4jobs.data' model = WoodClassifier.load(file_name) nr_classes = len(np.unique(ytrain)) + 1 model.compile_store_v2(new_arr, nr_classes, 10) print("Number of estimators: \t\t%i" % model.n_estimators) forest_time = [] for i in xrange(len(covtype_size)): times = np.zeros(8, np.float32) X_temp = new_arr[:covtype_size[i]] print("Number of training patterns:\t%i" % X_temp.shape[0]) start_time = time.time() cpu_test = model.predict(X_temp) cpu_test = model.predict(X_temp) cpu_test = model.predict(X_temp)
def single_run(dkey, train_size, n_bottom, param, seed, profile=False):
    """Run one HugeWood experiment (75K-patterns-per-leaf, in-memory store).

    Trains a HugeWoodClassifier (24 trees total, split into top/bottom
    ensembles of ``n_bottom`` trees each) on the requested data set, times
    fitting and test prediction, evaluates train/test error, and dumps the
    results to a JSON file under ``params.odir``.

    Parameters
    ----------
    dkey : str
        Data set key: "covtype", "higgs", or "susy".
    train_size : int
        Number of training patterns requested from the generator.
    n_bottom : int
        Number of bottom-level trees per top tree; total trees stay 24.
    param : dict
        Experiment parameters; ``param['param_wood']`` holds the wrapped
        WoodClassifier settings (max_features, n_jobs, bootstrap, tree_type,
        n_estimators).
    seed : int
        Random seed for data generation and both classifiers.
    profile : bool, optional
        If True, profile training via ``yep`` (requires n_jobs == 1).

    Raises
    ------
    Exception
        If ``dkey`` is not one of the known data sets.
    """
    print(
        "Processing data set %s with train_size %s, n_bottom %s, seed %s, and parameters %s ..."
        % (str(dkey), str(train_size), str(n_bottom), str(seed), str(param)))

    # Data generators are kept entirely in memory for these data sets.
    if dkey == "covtype":
        traingen, testgen = covtype_generators(train_size=train_size,
                                               store="mem",
                                               seed=seed)
    elif dkey == "higgs":
        traingen, testgen = higgs_generators(train_size=train_size,
                                             store="mem",
                                             seed=seed)
    elif dkey == "susy":
        traingen, testgen = susy_generators(train_size=train_size,
                                            store="mem",
                                            seed=seed)
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])

    param_wood = param['param_wood']

    # Template instance that HugeWood clones for each bottom tree.
    wood = WoodClassifier(n_estimators=1,
                          criterion="gini",
                          max_features=param_wood['max_features'],
                          min_samples_split=2,
                          n_jobs=param_wood['n_jobs'],
                          seed=seed,
                          bootstrap=param_wood['bootstrap'],
                          tree_traversal_mode="dfs",
                          tree_type=param_wood['tree_type'],
                          min_samples_leaf=1,
                          float_type="double",
                          max_depth=None,
                          verbose=0)

    model = HugeWoodClassifier(
        # 24 trees in total, distributed over top trees of n_bottom each.
        n_estimators=int(24 / n_bottom),
        n_estimators_bottom=int(n_bottom),
        n_top="auto",
        n_patterns_leaf=75000,
        balanced_top_tree=True,
        top_tree_lambda=1.0,
        top_tree_max_depth=None,
        top_tree_type="standard",
        top_tree_leaf_stopping_mode="ignore_impurity",
        n_jobs=param_wood['n_jobs'],
        seed=seed,
        verbose=1,
        plot_intermediate={},
        chunk_max_megabytes=2048,
        wrapped_instance=wood,
        store=MemoryStore(),
    )

    # training (optionally under the yep profiler; profiling requires a
    # single worker so the profile is not interleaved across processes)
    if profile:
        import yep
        assert param_wood['n_jobs'] == 1
        yep.start("train.prof")

    fit_start_time = time.time()
    model.fit(traingen)
    fit_end_time = time.time()

    if profile:
        yep.stop()

    # Train-set predictions are not timed; only the test pass is.
    ypreds_train = model.predict(generator=traingen)

    # testing
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])

    evaluate(ypreds_train, traingen.get_all_target(), results, "training")
    evaluate(ypred_test, testgen.get_all_target(), results, "testing")

    # One JSON file per parameter combination, nested by experiment setup.
    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param_wood['n_estimators']),
        str(param_wood['max_features']),
        str(param_wood['n_jobs']),
        str(param_wood['bootstrap']),
        str(param_wood['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size),
                         str(n_bottom), "hugewood_75K", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)

    # Release the (potentially huge) in-memory generators before the next run.
    del testgen
    del traingen

    model.cleanup()
    time.sleep(1)
from sklearn.metrics import accuracy_score from woody import WoodClassifier from woody.data import * seed = 0 #Xtrain, ytrain, Xtest, ytest = covtype(train_size=400000, seed=seed) Xtrain, ytrain, Xtest, ytest = susy(train_size=4000000, seed=seed) if Xtrain.dtype != np.float32: Xtrain = Xtrain.astype(np.float32) ytrain = ytrain.astype(np.float32) Xtest = Xtest.astype(np.float32) ytest = ytest.astype(np.float32) model = WoodClassifier.load('./model_susy8tree.data') nr_classes = len(np.unique(ytrain)) + 1 #not sure if accurate model.compile_and_Store(Xtrain, nr_classes) cpu_train = model.predict(Xtrain) cpu_test = model.predict(Xtest) #print(cpu_train) assert np.allclose( cpu_train, model.cuda_predict(Xtrain)) == True, "cuda_predict failed for train set" assert np.allclose(cpu_train, model.cuda_pred_tree_mult( Xtrain, 10)) == True, "cuda_pred_tree_mult failed for train set" assert np.allclose(cpu_train, model.cuda_pred_forest( Xtrain)) == True, "cuda_pred_forest failed for train set" assert np.allclose(cpu_train, model.cuda_pred_forest_mult(
def single_run(dkey, train_size, param, seed, profile=False):
    """Run one plain WoodClassifier experiment on an in-memory data set.

    Loads the requested data set, fits a WoodClassifier with the given
    parameters, times fitting and test prediction, evaluates train/test
    error, and dumps the results to a JSON file under ``params.odir``.

    Parameters
    ----------
    dkey : str
        Data set key: "covtype", "higgs", or "susy".
    train_size : int
        Number of training patterns to load.
    param : dict
        Classifier settings: n_estimators, max_features, n_jobs, bootstrap,
        tree_type.
    seed : int
        Random seed for data loading and the classifier.
    profile : bool, optional
        If True, profile training via ``yep`` (requires n_jobs == 1).

    Raises
    ------
    Exception
        If ``dkey`` is not one of the known data sets.
    """
    print(
        "Processing data set %s with train_size %s, seed %s, and parameters %s ..."
        % (str(dkey), str(train_size), str(seed), str(param)))

    if dkey == "covtype":
        Xtrain, ytrain, Xtest, ytest = covtype(train_size=train_size,
                                               seed=seed)
    elif dkey == "higgs":
        Xtrain, ytrain, Xtest, ytest = higgs(train_size=train_size, seed=seed)
    elif dkey == "susy":
        Xtrain, ytrain, Xtest, ytest = susy(train_size=train_size, seed=seed)
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % Xtrain.shape[0])
    print("Number of test patterns:\t%i" % Xtest.shape[0])
    print("Dimensionality of the data:\t%i\n" % Xtrain.shape[1])

    model = WoodClassifier(n_estimators=param['n_estimators'],
                           criterion="gini",
                           max_features=param['max_features'],
                           min_samples_split=2,
                           n_jobs=param['n_jobs'],
                           seed=seed,
                           bootstrap=param['bootstrap'],
                           tree_traversal_mode="dfs",
                           tree_type=param['tree_type'],
                           min_samples_leaf=1,
                           float_type="double",
                           max_depth=None,
                           verbose=0)

    # Profiling requires a single worker so the profile is not interleaved
    # across processes.
    if profile:
        import yep
        assert param['n_jobs'] == 1
        yep.start("train.prof")

    # training
    fit_start_time = time.time()
    model.fit(Xtrain, ytrain)
    fit_end_time = time.time()

    if profile:
        yep.stop()

    # Train-set predictions are not timed; only the test pass is.
    ypreds_train = model.predict(Xtrain)

    # testing
    test_start_time = time.time()
    ypred_test = model.predict(Xtest)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time: %f" % results['training_time'])
    print("Testing time: %f" % results['testing_time'])

    evaluate(ypreds_train, ytrain, results, "training")
    evaluate(ypred_test, ytest, results, "testing")

    # One JSON file per parameter combination, nested by experiment setup.
    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "wood",
                         fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)
Xtrain = Xtrain.astype(np.float32) ytrain = ytrain.astype(np.float32) Xtest = Xtest.astype(np.float32) ytest = ytest.astype(np.float32) print("") print("Number of training patterns:\t%i" % Xtrain.shape[0]) print("Number of test patterns:\t%i" % Xtest.shape[0]) print("Dimensionality of the data:\t%i" % Xtrain.shape[1]) model = WoodClassifier(n_estimators=nr_tree, criterion="gini", max_features=None, min_samples_split=2, n_jobs=4, seed=seed, bootstrap=True, tree_traversal_mode="dfs", tree_type="standard", min_samples_leaf=1, float_type="double", max_depth=None, verbose=1) fit_start_time = time.time() model.fit(Xtrain, ytrain) fit_end_time = time.time() if (f_type == 0): file_name = "./models/model_susy" + str(nr_tree) + "tree_4jobs.data" elif (f_type == 1): file_name = "./models/model_" + str(nr_tree) + "tree_4jobs.data"
def single_run(dkey, train_size, param, seed, profile=False):
    """Run one HugeWood experiment on the landsat data set (disk store).

    Streams the landsat training/test data through HDF5-backed generators,
    fits a HugeWoodClassifier, times fitting and test prediction, records
    the phase-wise training-time breakdown, evaluates test error, and dumps
    the results to a JSON file under ``params.odir``.

    Parameters
    ----------
    dkey : str
        Data set key; only "landsat" is supported.
    train_size : int
        Maximum number of training lines read from the HDF5 file.
    param : dict
        HugeWood settings (n_estimators, n_estimators_bottom) plus
        ``param['param_wood']`` with the wrapped WoodClassifier settings.
    seed : int
        Random seed for data generation and both classifiers.
    profile : bool, optional
        If True, profile training via ``yep`` (requires n_jobs == 1).

    Raises
    ------
    Exception
        If ``dkey`` is not "landsat".
    """
    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))

    tmp_dir = "tmp/hugewood"

    if dkey == "landsat":
        # TODO: Download file manually if needed (255GB and 524MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        fname_train = "data/landsat_train.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"

        traingen = DataGenerator(fname=fname_train,
                                 seed=seed,
                                 patterns=True,
                                 target=True,
                                 chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test,
                                seed=seed,
                                patterns=True,
                                target=True,
                                chunksize=1000000,
                                n_lines_max=10000000)
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])

    param_wood = param['param_wood']

    # Template instance that HugeWood clones for each bottom tree.
    # NOTE(review): the original passed the global ``params.seed`` here and
    # below, silently ignoring this function's ``seed`` argument; the sibling
    # single_run variants use ``seed=seed``, so this now does too.
    wood = WoodClassifier(n_estimators=1,
                          criterion="gini",
                          max_features=param_wood['max_features'],
                          min_samples_split=2,
                          n_jobs=param_wood['n_jobs'],
                          seed=seed,
                          bootstrap=param_wood['bootstrap'],
                          tree_traversal_mode="dfs",
                          tree_type=param_wood['tree_type'],
                          min_samples_leaf=1,
                          float_type="double",
                          max_depth=None,
                          verbose=0)

    model = HugeWoodClassifier(
        n_estimators=param['n_estimators'],
        n_estimators_bottom=param['n_estimators_bottom'],
        n_top="auto",
        n_patterns_leaf="auto",
        balanced_top_tree=True,
        top_tree_max_depth=None,
        top_tree_type="standard",
        top_tree_leaf_stopping_mode="ignore_impurity",
        n_jobs=param_wood['n_jobs'],
        seed=seed,
        verbose=1,
        plot_intermediate={},
        chunk_max_megabytes=2048,
        wrapped_instance=wood,
        odir=tmp_dir,
        store=DiskStore(),
    )

    # training (optionally under the yep profiler; profiling requires a
    # single worker so the profile is not interleaved across processes)
    if profile:
        import yep
        assert param_wood['n_jobs'] == 1
        yep.start("train.prof")

    fit_start_time = time.time()
    model.fit(traingen)
    fit_end_time = time.time()

    if profile:
        yep.stop()

    # testing
    print("Computing predictions ...")
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    # Phase-wise breakdown of the HugeWood training pipeline.
    results['total'] = model.get_training_times()['total']
    results['retrieve'] = model.get_training_times()['retrieve']
    results['top'] = model.get_training_times()['top']
    results['distribute'] = model.get_training_times()['distribute']
    results['bottom'] = model.get_training_times()['bottom']
    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])

    print("Evaluating test error ...")
    ytest = testgen.get_all_target()
    evaluate(ypred_test, ytest, results, "testing")

    # One JSON file per parameter combination, nested by experiment setup.
    fname = '%s_%s_%s_%s_%s.json' % (
        str(param_wood['n_estimators']),
        str(param_wood['max_features']),
        str(param_wood['n_jobs']),
        str(param_wood['bootstrap']),
        str(param_wood['tree_type']),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "hugewood",
                         fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)

    # Release the generators before the next run.
    del testgen
    del traingen

    model.cleanup()
    time.sleep(1)
Xtrain = Xtrain.astype(np.float32) ytrain = ytrain.astype(np.float32) Xtest = Xtest.astype(np.float32) ytest = ytest.astype(np.float32) print("") print("Number of training patterns:\t%i" % Xtrain.shape[0]) print("Number of test patterns:\t%i" % Xtest.shape[0]) print("Dimensionality of the data:\t%i" % Xtrain.shape[1]) model = WoodClassifier( n_estimators=2, criterion="gini", max_features=None, min_samples_split=2, n_jobs=1, seed=seed, bootstrap=True, tree_traversal_mode="dfs", tree_type="standard", min_samples_leaf=1, float_type="double", max_depth=None, verbose=1) fit_start_time = time.time() #model.fit(Xtrain, ytrain) fit_end_time = time.time() #model.save("./model_susy2tree.data") model = WoodClassifier.load('./model_8tree.data') print("Number of estimators: \t\t%i\n" % model.n_estimators) nr_classes = len(np.unique(ytrain)) +1 #not sure if accurate