def create_dataset(self, container_key, dkey, data):
    """Create a new chunked, row-resizable dataset ``dkey`` inside the HDF5
    file ``container_key`` and fill it with ``data``.

    The dataset is created with ``maxshape=(None, data.shape[1])`` so rows can
    be appended later (see ``append_to_dataset``), and LZF compression.

    Parameters
    ----------
    container_key : str
        Path of the HDF5 container file (parent directories are created).
    dkey : str
        Name of the dataset inside the container.
    data : ndarray
        2D array used both as the initial shape and the initial content.
    """
    ensure_dir_for_file(container_key)
    # Context manager guarantees the file handle is closed even if the write
    # below raises (the original left the handle open on error).
    with h5py.File(container_key, 'a', driver="sec2", libver='latest') as s:
        dset = s.create_dataset(dkey, data.shape,
                                maxshape=(None, data.shape[1]),
                                compression="lzf")
        dset[:, :] = data
def append_to_dataset(self, container_key, dkey, data):
    """Append the rows of ``data`` to dataset ``dkey`` in ``container_key``,
    creating the dataset first if it does not exist yet.

    Parameters
    ----------
    container_key : str
        Path of the HDF5 container file (parent directories are created).
    dkey : str
        Name of the dataset inside the container.
    data : ndarray
        2D array of rows to append; its column count must match the dataset's.
    """
    ensure_dir_for_file(container_key)
    # Context manager closes the file even if resizing/writing raises.
    with h5py.File(container_key, 'a', driver="sec2", libver='latest') as s:
        if dkey not in s:
            # First write: create a fresh resizable dataset and fill from row 0.
            dset = s.create_dataset(dkey, data.shape,
                                    maxshape=(None, data.shape[1]),
                                    compression="lzf")
            offset = 0
        else:
            # Subsequent writes: grow along axis 0 and write past the old end.
            dset = s.get(dkey)
            offset = dset.shape[0]
            dset.resize(dset.shape[0] + data.shape[0], axis=0)
        dset[offset:, :] = data
def single_run(dkey, train_size, param, seed, profile=False):
    """Train and evaluate a scikit-learn forest on one in-memory data set.

    Parameters
    ----------
    dkey : str
        Data set key: "covtype", "higgs", or "susy".
    train_size : int
        Number of training patterns to load.
    param : dict
        Forest parameters: 'n_estimators', 'max_features', 'n_jobs',
        'bootstrap', and 'tree_type' ("randomized" or "standard").
    seed : int
        Random seed for data loading and the forest.
    profile : bool, optional
        If True, profile training with yep (requires n_jobs == 1).

    Side effects: writes a JSON results file under ``params.odir``.
    """
    print(
        "Processing data set %s with train_size %s, seed %s, and parameters %s ..." %
        (str(dkey), str(train_size), str(seed), str(param)))

    if dkey == "covtype":
        Xtrain, ytrain, Xtest, ytest = covtype(train_size=train_size, seed=seed)
    elif dkey == "higgs":
        Xtrain, ytrain, Xtest, ytest = higgs(train_size=train_size, seed=seed)
    elif dkey == "susy":
        Xtrain, ytrain, Xtest, ytest = susy(train_size=train_size, seed=seed)
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % Xtrain.shape[0])
    print("Number of test patterns:\t%i" % Xtest.shape[0])
    print("Dimensionality of the data:\t%i\n" % Xtrain.shape[1])

    if param['tree_type'] == "randomized":
        from sklearn.ensemble import ExtraTreesClassifier as RF
    elif param['tree_type'] == "standard":
        from sklearn.ensemble import RandomForestClassifier as RF
    else:
        # Fail fast: the original fell through and crashed later with a
        # NameError on RF for any other tree_type value.
        raise Exception("Unknown tree_type!")

    model = RF(n_estimators=param['n_estimators'],
               criterion="gini",
               max_features=param['max_features'],
               min_samples_split=2,
               n_jobs=param['n_jobs'],
               random_state=seed,
               bootstrap=param['bootstrap'],
               min_samples_leaf=1,
               max_depth=None,
               verbose=0)

    if profile:
        import yep
        # yep cannot attribute samples across sklearn worker threads/processes.
        assert param['n_jobs'] == 1
        yep.start("train.prof")

    # training
    fit_start_time = time.time()
    model.fit(Xtrain, ytrain)
    fit_end_time = time.time()

    if profile:
        yep.stop()

    ypreds_train = model.predict(Xtrain)

    # testing
    test_start_time = time.time()
    ypred_test = model.predict(Xtest)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time: %f" % results['training_time'])
    print("Testing time: %f" % results['testing_time'])

    evaluate(ypreds_train, ytrain, results, "training")
    evaluate(ypred_test, ytest, results, "testing")

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "sk", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)
def single_run(dkey, train_size, param, seed, profile=False):
    """Train and evaluate a scikit-learn forest on the landsat data set,
    loading train/test patterns through ``DataGenerator``.

    Parameters
    ----------
    dkey : str
        Data set key; only "landsat" is supported.
    train_size : int
        Maximum number of training lines read from the HDF5 file.
    param : dict
        Forest parameters: 'n_estimators', 'max_features', 'n_jobs',
        'bootstrap', and 'tree_type' ("randomized" or "standard").
    seed : int
        Random seed for the generators and the forest.
    profile : bool, optional
        If True, profile training with yep (requires n_jobs == 1).

    Side effects: writes a JSON results file under ``params.odir``.
    """
    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))

    if dkey == "landsat":
        # TODO: Download file manually if needed (9,7GB and 524MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # TODO: Adapt paths accordingly
        fname_train = "data/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        traingen = DataGenerator(fname=fname_train, seed=seed, patterns=True,
                                 target=True, chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test, seed=seed, patterns=True,
                                target=True, chunksize=1000000,
                                n_lines_max=20000000)
    else:
        raise Exception("Unknown data set!")

    Xtrain, ytrain = traingen.get_all()
    Xtest, ytest = testgen.get_all()

    print("")
    print("Number of training patterns:\t%i" % Xtrain.shape[0])
    print("Number of test patterns:\t%i" % Xtest.shape[0])
    print("Dimensionality of the data:\t%i\n" % Xtrain.shape[1])

    if param['tree_type'] == "randomized":
        from sklearn.ensemble import ExtraTreesClassifier as RF
    elif param['tree_type'] == "standard":
        from sklearn.ensemble import RandomForestClassifier as RF
    else:
        # Fail fast: the original fell through and crashed later with a
        # NameError on RF for any other tree_type value.
        raise Exception("Unknown tree_type!")

    model = RF(n_estimators=param['n_estimators'],
               criterion="gini",
               max_features=param['max_features'],
               min_samples_split=2,
               n_jobs=param['n_jobs'],
               random_state=seed,
               bootstrap=param['bootstrap'],
               min_samples_leaf=1,
               max_depth=None,
               verbose=0)

    if profile:
        import yep
        # yep cannot attribute samples across sklearn worker threads/processes.
        assert param['n_jobs'] == 1
        yep.start("train.prof")

    # training
    fit_start_time = time.time()
    model.fit(Xtrain, ytrain)
    fit_end_time = time.time()

    if profile:
        yep.stop()

    ypreds_train = model.predict(Xtrain)

    # testing
    test_start_time = time.time()
    ypred_test = model.predict(Xtest)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time: %f" % results['training_time'])
    print("Testing time: %f" % results['testing_time'])

    evaluate(ypreds_train, ytrain, results, "training")
    evaluate(ypred_test, ytest, results, "testing")

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "sk", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)
def single_run(dkey, train_size, n_bottom, param, seed, profile=False):
    """Train and evaluate a HugeWoodClassifier (in-memory store) on one
    benchmark data set, varying the number of bottom trees per top tree.

    Parameters
    ----------
    dkey : str
        Data set key: "covtype", "higgs", or "susy".
    train_size : int
        Number of training patterns to load.
    n_bottom : int
        Bottom trees per estimator; the total of 24 trees is split into
        24 / n_bottom top-level estimators.
    param : dict
        Must contain 'param_wood', the wrapped WoodClassifier parameters.
    seed : int
        Random seed for generators and both classifiers.
    profile : bool, optional
        If True, profile training with yep (requires n_jobs == 1).

    Side effects: writes a JSON results file under ``params.odir`` and
    cleans up the model's temporary state afterwards.
    """
    print(
        "Processing data set %s with train_size %s, n_bottom %s, seed %s, and parameters %s ..." %
        (str(dkey), str(train_size), str(n_bottom), str(seed), str(param)))

    if dkey == "covtype":
        traingen, testgen = covtype_generators(train_size=train_size, store="mem", seed=seed)
    elif dkey == "higgs":
        traingen, testgen = higgs_generators(train_size=train_size, store="mem", seed=seed)
    elif dkey == "susy":
        traingen, testgen = susy_generators(train_size=train_size, store="mem", seed=seed)
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])

    param_wood = param['param_wood']

    # Single-tree wrapped instance; HugeWood replicates it n_estimators_bottom
    # times per top-tree leaf.
    wood = WoodClassifier(n_estimators=1,
                          criterion="gini",
                          max_features=param_wood['max_features'],
                          min_samples_split=2,
                          n_jobs=param_wood['n_jobs'],
                          seed=seed,
                          bootstrap=param_wood['bootstrap'],
                          tree_traversal_mode="dfs",
                          tree_type=param_wood['tree_type'],
                          min_samples_leaf=1,
                          float_type="double",
                          max_depth=None,
                          verbose=0)

    model = HugeWoodClassifier(
        n_estimators=int(24 / n_bottom),
        n_estimators_bottom=int(n_bottom),
        n_top="auto",
        n_patterns_leaf=75000,
        balanced_top_tree=True,
        top_tree_lambda=1.0,
        top_tree_max_depth=None,
        top_tree_type="standard",
        top_tree_leaf_stopping_mode="ignore_impurity",
        n_jobs=param_wood['n_jobs'],
        seed=seed,
        verbose=1,
        plot_intermediate={},
        chunk_max_megabytes=2048,
        wrapped_instance=wood,
        store=MemoryStore(),
    )

    # training
    if profile:
        import yep
        # yep cannot attribute samples across worker threads/processes.
        assert param_wood['n_jobs'] == 1
        yep.start("train.prof")

    fit_start_time = time.time()
    model.fit(traingen)
    fit_end_time = time.time()

    if profile:
        yep.stop()

    ypreds_train = model.predict(generator=traingen)

    # testing
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])

    evaluate(ypreds_train, traingen.get_all_target(), results, "training")
    evaluate(ypred_test, testgen.get_all_target(), results, "testing")

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param_wood['n_estimators']),
        str(param_wood['max_features']),
        str(param_wood['n_jobs']),
        str(param_wood['bootstrap']),
        str(param_wood['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size),
                         str(n_bottom), "hugewood_75K", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)

    # Drop generators and model state before the next run to release memory.
    del testgen
    del traingen
    model.cleanup()
    time.sleep(1)
def single_run(dkey, train_size, param, seed, profile=False):
    """Train and evaluate a SubsetWoodClassifier (disk store) on the landsat
    data set, training each tree on a random subset of the data.

    Parameters
    ----------
    dkey : str
        Data set key; only "landsat" is supported.
    train_size : int
        Maximum number of training lines read from the HDF5 file.
    param : dict
        Forest parameters: 'n_estimators', 'max_features', 'n_jobs',
        'bootstrap', 'tree_type'.
    seed : int
        Random seed for the generators and the classifier.
    profile : bool, optional
        If True, profile training with yep (requires n_jobs == 1).

    Side effects: writes temporary state to tmp/subsetwood, writes a JSON
    results file under ``params.odir``, and cleans up afterwards.
    """
    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))

    tmp_dir = "tmp/subsetwood"

    if dkey == "landsat":
        # TODO: Download file manually if needed (9,7GB and 524MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # TODO: Adapt paths accordingly
        fname_train = "data/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        traingen = DataGenerator(fname=fname_train, seed=seed, patterns=True,
                                 target=True, chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test, seed=seed, patterns=True,
                                target=True, chunksize=1000000,
                                n_lines_max=20000000)
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])

    # set to top trees size
    n_subset = 500000

    model = SubsetWoodClassifier(
        n_estimators=param['n_estimators'],
        criterion="gini",
        max_features=param['max_features'],
        min_samples_split=2,
        n_jobs=param['n_jobs'],
        seed=seed,
        bootstrap=param['bootstrap'],
        tree_traversal_mode="dfs",
        tree_type=param['tree_type'],
        min_samples_leaf=1,
        float_type="double",
        max_depth=None,
        verbose=1,
        odir=tmp_dir,
        store=DiskStore())

    # training
    if profile:
        import yep
        # yep cannot attribute samples across worker threads/processes.
        assert param['n_jobs'] == 1
        yep.start("train.prof")

    fit_start_time = time.time()
    model.fit(traingen, n_subset=n_subset)
    fit_end_time = time.time()

    if profile:
        yep.stop()

    # testing
    print("Computing predictions ...")
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    # Hoisted: the original queried model.get_training_times() three times.
    training_times = model.get_training_times()
    results['total'] = training_times['total']
    results['retrieve'] = training_times['retrieve']
    results['subset'] = training_times['subset']
    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])

    print("Evaluating test error ...")
    ytest = testgen.get_all_target()
    ytrain = traingen.get_all_target()
    ytrain = ytrain.astype(numpy.int64)
    ytest = ytest.astype(numpy.int64)
    ypred_test = ypred_test.astype(numpy.int64)
    evaluate(ypred_test, ytest, results, "testing")

    print("Training distribution")
    print(numpy.bincount(ytrain))
    print("Test distribution")
    print(numpy.bincount(ytest))
    print("Predict distribution")
    print(numpy.bincount(ypred_test))

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size),
                         "subsetwood_" + str(n_subset), fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)

    # Drop generators and model state before the next run to release memory.
    del testgen
    del traingen
    model.cleanup()
    time.sleep(1)
def single_run(dkey, train_size, param, seed, profile=False):
    """Train and evaluate an H2O random forest on the landsat data set,
    converting the HDF5 input to CSV for H2O's importer.

    Parameters
    ----------
    dkey : str
        Data set key; only "landsat" is handled for data conversion.
    train_size : int
        Maximum number of training lines read from the HDF5 file.
    param : dict
        Forest parameters: 'n_estimators', 'max_features' (None or "sqrt"),
        'n_jobs', 'bootstrap', 'tree_type'.
    seed : int
        Random seed passed to H2O.
    profile : bool, optional
        Unused here; kept for signature parity with the other runners.

    Side effects: starts an H2O cluster, writes CSV files to tmp/, and writes
    a JSON results file under ``params.odir``.
    """
    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))

    if dkey == "landsat":
        # TODO: Download file manually if needed (9,7GB and 524MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # TODO: Adapt paths accordingly
        fname_train = "data/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        traingen = DataGenerator(fname=fname_train, seed=seed, patterns=True,
                                 target=True, chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test, seed=seed, patterns=True,
                                target=True, chunksize=1000000,
                                n_lines_max=20000000)
        # TODO: Adapt paths accordingly
        fname_train_csv = "tmp/landsat_train_small_%lu.csv" % train_size
        fname_test_csv = "tmp/landsat_test.csv"
        traingen.to_csv(fname_train_csv, cache=False, remove=True)
        testgen.to_csv(fname_test_csv, cache=False, remove=True)

    import h2o
    from skutil.h2o import h2o_col_to_numpy
    h2o.init(max_mem_size="12G", nthreads=param['n_jobs'])
    h2o.remove_all()
    from h2o.estimators.random_forest import H2ORandomForestEstimator

    if dkey == "landsat_small" or dkey == "landsat":
        train_df = h2o.import_file(fname_train_csv)
        test_df = h2o.import_file(fname_test_csv)
        # Last column is the target, all others are features.
        Xcols, ycol = train_df.col_names[:-1], train_df.col_names[-1]
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % train_df.shape[0])
    print("Number of test patterns:\t%i" % test_df.shape[0])
    print("Dimensionality of the data:\t%i\n" % train_df.shape[1])

    if param['max_features'] is None:
        mtries = train_df.shape[1] - 2
    elif param['max_features'] == "sqrt":
        mtries = int(math.sqrt(train_df.shape[1] - 2))
    else:
        # Fail fast: the original fell through and crashed later with a
        # NameError on mtries for any other max_features value.
        raise Exception("Unknown max_features!")

    if not param['bootstrap']:
        sample_rate = 1.0
    else:
        sample_rate = 0.632

    model = H2ORandomForestEstimator(
        mtries=mtries,
        sample_rate=sample_rate,
        #nbins=1000, #crash
        min_rows=1,
        build_tree_one_node=True,
        max_depth=20,
        balance_classes=False,
        ntrees=param['n_estimators'],
        seed=seed)

    # training
    fit_start_time = time.time()
    model.train(Xcols, ycol, training_frame=train_df)
    fit_end_time = time.time()

    # testing
    test_start_time = time.time()
    ypreds_test = model.predict(test_df)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time: %f" % results['training_time'])
    print("Testing time: %f" % results['testing_time'])

    evaluate(numpy.rint(ypreds_test.as_data_frame().values),
             test_df[ycol].as_data_frame().values,
             results, "testing")

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "h2", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)
def single_run(dkey, train_size, param, seed, profile=False):
    """Train and evaluate a HugeWoodClassifier (disk store) on the full
    landsat data set.

    Parameters
    ----------
    dkey : str
        Data set key; only "landsat" is supported.
    train_size : int
        Maximum number of training lines read from the HDF5 file.
    param : dict
        Must contain 'param_wood' (wrapped WoodClassifier parameters),
        'n_estimators', and 'n_estimators_bottom'.
    seed : int
        Random seed for the generators and both classifiers.
    profile : bool, optional
        If True, profile training with yep (requires n_jobs == 1).

    Side effects: writes temporary state to tmp/hugewood, writes a JSON
    results file under ``params.odir``, and cleans up afterwards.
    """
    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))

    tmp_dir = "tmp/hugewood"

    if dkey == "landsat":
        # TODO: Download file manually if needed (255GB and 524MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        fname_train = "data/landsat_train.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        traingen = DataGenerator(fname=fname_train, seed=seed, patterns=True,
                                 target=True, chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test, seed=seed, patterns=True,
                                target=True, chunksize=1000000,
                                n_lines_max=10000000)
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])

    param_wood = param['param_wood']

    # NOTE(review): the original passed params.seed here, ignoring the `seed`
    # argument; every sibling runner uses `seed`, so that looked like a bug.
    # Using `seed` now — confirm callers always pass seed == params.seed.
    wood = WoodClassifier(n_estimators=1,
                          criterion="gini",
                          max_features=param_wood['max_features'],
                          min_samples_split=2,
                          n_jobs=param_wood['n_jobs'],
                          seed=seed,
                          bootstrap=param_wood['bootstrap'],
                          tree_traversal_mode="dfs",
                          tree_type=param_wood['tree_type'],
                          min_samples_leaf=1,
                          float_type="double",
                          max_depth=None,
                          verbose=0)

    model = HugeWoodClassifier(
        n_estimators=param['n_estimators'],
        n_estimators_bottom=param['n_estimators_bottom'],
        n_top="auto",
        n_patterns_leaf="auto",
        balanced_top_tree=True,
        top_tree_max_depth=None,
        top_tree_type="standard",
        top_tree_leaf_stopping_mode="ignore_impurity",
        n_jobs=param_wood['n_jobs'],
        seed=seed,
        verbose=1,
        plot_intermediate={},
        chunk_max_megabytes=2048,
        wrapped_instance=wood,
        odir=tmp_dir,
        store=DiskStore(),
    )

    # training
    if profile:
        import yep
        # yep cannot attribute samples across worker threads/processes.
        assert param_wood['n_jobs'] == 1
        yep.start("train.prof")

    fit_start_time = time.time()
    model.fit(traingen)
    fit_end_time = time.time()

    if profile:
        yep.stop()

    # testing
    print("Computing predictions ...")
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    # Hoisted: the original queried model.get_training_times() five times.
    training_times = model.get_training_times()
    results['total'] = training_times['total']
    results['retrieve'] = training_times['retrieve']
    results['top'] = training_times['top']
    results['distribute'] = training_times['distribute']
    results['bottom'] = training_times['bottom']
    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])

    print("Evaluating test error ...")
    ytest = testgen.get_all_target()
    evaluate(ypred_test, ytest, results, "testing")

    fname = '%s_%s_%s_%s_%s.json' % (
        str(param_wood['n_estimators']),
        str(param_wood['max_features']),
        str(param_wood['n_jobs']),
        str(param_wood['bootstrap']),
        str(param_wood['tree_type']),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "hugewood", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)

    # Drop generators and model state before the next run to release memory.
    del testgen
    del traingen
    model.cleanup()
    time.sleep(1)
def single_run(dkey, train_size, param, seed, profile=False):
    """Train and evaluate a SubsetWoodClassifier (in-memory store) on one
    benchmark data set, with a per-data-set training subset size.

    Parameters
    ----------
    dkey : str
        Data set key: "covtype", "higgs", or "susy".
    train_size : int
        Number of training patterns to load.
    param : dict
        Forest parameters: 'n_estimators', 'max_features', 'n_jobs',
        'bootstrap', 'tree_type'.
    seed : int
        Random seed for the generators and the classifier.
    profile : bool, optional
        If True, profile training with yep (requires n_jobs == 1).

    Side effects: writes a JSON results file under ``params.odir`` and
    cleans up the model's temporary state afterwards.
    """
    print(
        "Processing data set %s with train_size %s, seed %s, and parameters %s ..." %
        (str(dkey), str(train_size), str(seed), str(param)))

    if dkey == "covtype":
        traingen, testgen = covtype_generators(train_size=train_size, store="mem", seed=seed)
        n_subset = 50000
    elif dkey == "higgs":
        traingen, testgen = higgs_generators(train_size=train_size, store="mem", seed=seed)
        n_subset = 500000
    elif dkey == "susy":
        traingen, testgen = susy_generators(train_size=train_size, store="mem", seed=seed)
        n_subset = 500000
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])

    model = SubsetWoodClassifier(n_estimators=param['n_estimators'],
                                 criterion="gini",
                                 max_features=param['max_features'],
                                 min_samples_split=2,
                                 n_jobs=param['n_jobs'],
                                 seed=seed,
                                 bootstrap=param['bootstrap'],
                                 tree_traversal_mode="dfs",
                                 tree_type=param['tree_type'],
                                 min_samples_leaf=1,
                                 float_type="double",
                                 max_depth=None,
                                 verbose=1,
                                 store=MemoryStore())

    # training
    if profile:
        import yep
        # yep cannot attribute samples across worker threads/processes.
        assert param['n_jobs'] == 1
        yep.start("train.prof")

    fit_start_time = time.time()
    model.fit(traingen, n_subset=n_subset)
    fit_end_time = time.time()

    if profile:
        yep.stop()

    ypreds_train = model.predict(generator=traingen)

    # testing
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])

    evaluate(ypreds_train, traingen.get_all_target(), results, "training")
    evaluate(ypred_test, testgen.get_all_target(), results, "testing")

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "subsetwood", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)

    # Drop generators and model state before the next run to release memory.
    del testgen
    del traingen
    model.cleanup()
    time.sleep(1)
def single_run(dkey, train_size, param, seed, profile=False):
    """Train and evaluate an H2O random forest on one benchmark data set.

    Parameters
    ----------
    dkey : str
        Data set key: "covtype", "higgs", or "susy".
    train_size : int
        Number of training patterns (selects the pre-generated CSV files).
    param : dict
        Forest parameters: 'n_estimators', 'max_features' (None or "sqrt"),
        'n_jobs', 'bootstrap', 'tree_type'.
    seed : int
        Random seed passed to H2O.
    profile : bool, optional
        Unused here; kept for signature parity with the other runners.

    Side effects: starts an H2O cluster and writes a JSON results file
    under ``params.odir``.
    """
    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))

    import h2o
    from skutil.h2o import h2o_col_to_numpy
    h2o.init(max_mem_size="12G", nthreads=param['n_jobs'])
    h2o.remove_all()
    from h2o.estimators.random_forest import H2ORandomForestEstimator

    # get and convert data; covtype stores the label in the last column,
    # higgs/susy store it in the first.
    if dkey == "covtype":
        fname_train, fname_test = covtype_files(train_size=train_size)
        train_df = h2o.import_file(fname_train)
        test_df = h2o.import_file(fname_test)
        Xcols, ycol = train_df.col_names[:-1], train_df.col_names[-1]
    elif dkey == "higgs":
        fname_train, fname_test = higgs_files(train_size=train_size)
        train_df = h2o.import_file(fname_train)
        test_df = h2o.import_file(fname_test)
        Xcols, ycol = train_df.col_names[1:], train_df.col_names[0]
    elif dkey == "susy":
        fname_train, fname_test = susy_files(train_size=train_size)
        train_df = h2o.import_file(fname_train)
        test_df = h2o.import_file(fname_test)
        Xcols, ycol = train_df.col_names[1:], train_df.col_names[0]
    else:
        # Fail fast, consistent with the other runners: the original fell
        # through and crashed later with a NameError on train_df.
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % train_df.shape[0])
    print("Number of test patterns:\t%i" % test_df.shape[0])
    print("Dimensionality of the data:\t%i\n" % train_df.shape[1])

    if param['max_features'] is None:
        mtries = train_df.shape[1] - 2
    elif param['max_features'] == "sqrt":
        mtries = int(math.sqrt(train_df.shape[1] - 2))
    else:
        # Fail fast: the original fell through and crashed later with a
        # NameError on mtries for any other max_features value.
        raise Exception("Unknown max_features!")

    if not param['bootstrap']:
        sample_rate = 1.0
    else:
        sample_rate = 0.632

    model = H2ORandomForestEstimator(
        mtries=mtries,
        sample_rate=sample_rate,
        #nbins=1000, #crash
        min_rows=1,
        build_tree_one_node=True,
        max_depth=20,
        balance_classes=False,
        ntrees=param['n_estimators'],
        seed=seed)

    # training
    fit_start_time = time.time()
    model.train(Xcols, ycol, training_frame=train_df)
    fit_end_time = time.time()

    ypreds_train = model.predict(train_df)

    # testing
    test_start_time = time.time()
    ypreds_test = model.predict(test_df)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time: %f" % results['training_time'])
    print("Testing time: %f" % results['testing_time'])

    evaluate(numpy.rint(ypreds_train.as_data_frame().values),
             train_df[ycol].as_data_frame().values,
             results, "training")
    evaluate(numpy.rint(ypreds_test.as_data_frame().values),
             test_df[ycol].as_data_frame().values,
             results, "testing")

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "h2", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)