def setUp(self):
    self._sp = pcs_parser.read(file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                                 "Testdata/nips2011.pcs")))
    # Read data from csv
    header, self._data = read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                               "Testdata/hpnnet_nocv_convex_all_fastrf_results.csv"),
                                  has_header=True, num_header_rows=3)
    self._para_header = header[0][:-2]
    self._checkpoint = hash(numpy.array_repr(self._data))
def setUp(self):
    self._sp = pcs_parser.read(file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                                 "Testdata/nips2011.pcs")))
    # Read data from csv
    header, self._data = read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                               "Testdata/hpnnet_nocv_convex_all_fastrf_results.csv"),
                                  has_header=True, num_header_rows=3)
    self._para_header = header[0][:-2]
    self._checkpoint = hash(numpy.array_repr(self._data))
    self.assertEqual(246450380584980815, self._checkpoint)
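# Illustrative addition (not part of the original test file): the checkpoint
# hash stored above guards against the test data being mutated in place.
# Assuming this class is a unittest.TestCase, a matching tearDown could
# re-check the hash after every test:
def tearDown(self):
    # A mismatch means some test modified self._data after setUp
    self.assertEqual(self._checkpoint, hash(numpy.array_repr(self._data)))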
def setUp(self):
    self._sp = pcs_parser.read(file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                                 "Testdata/nips2011.pcs")))
    # Read data from csv
    header, self._data = read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                               "Testdata/hpnnet_nocv_convex_all_fastrf_results.csv"),
                                  has_header=True, num_header_rows=3)
    self._para_header = header[0][:-2]
    self._data = self._data[:1000, :-2]
    self._data = Surrogates.RegressionModels.model_util.replace_cat_variables(
        catdict=get_cat_val_map(sp=self._sp), x=self._data,
        param_names=self._para_header)
def test():
    from sklearn.metrics import mean_squared_error
    import Surrogates.DataExtraction.pcs_parser as pcs_parser
    sp = pcs_parser.read(file("/home/eggenspk/Surrogates/Data_extraction/Experiments2014/hpnnet/smac_2_06_01-dev/nips2011.pcs"))
    # Read data from csv
    header, data = read_csv("/home/eggenspk/Surrogates/Data_extraction/hpnnet_nocv_convex_all/hpnnet_nocv_convex_all_fastrf_results.csv",
                            has_header=True, num_header_rows=3)
    para_header = header[0][:-2]
    type_header = header[1]
    cond_header = header[2]
    #print data.shape
    checkpoint = hash(numpy.array_repr(data))
    assert checkpoint == 246450380584980815

    model = GradientBoosting(sp=sp, encode=False, debug=True)
    x_train_data = data[:1000, :-2]
    y_train_data = data[:1000, -1]
    x_test_data = data[1000:, :-2]
    y_test_data = data[1000:, -1]

    model.train(x=x_train_data, y=y_train_data, param_names=para_header, rng=1)

    y = model.predict(x=x_train_data[1, :])
    print "Is: %100.70f, Should: %f" % (y, y_train_data[1])
    assert y[0] == 0.45366000254662230961599789225147105753421783447265625

    print "Predict whole data"
    y_whole = model.predict(x=x_test_data)
    mse = mean_squared_error(y_true=y_test_data, y_pred=y_whole)
    print "MSE: %100.70f" % mse
    assert mse == 0.00188246958253847243396073007914992558653466403484344482421875

    print "Soweit so gut"

    # Try the same with encoded features
    model = GradientBoosting(sp=sp, encode=True, debug=True)
    #print data[:10, :-2]
    model.train(x=x_train_data, y=y_train_data, param_names=para_header, rng=1)

    y = model.predict(x=x_train_data[1, :])
    print "Is: %100.70f, Should: %f" % (y, y_train_data[1])
    assert y[0] == 0.460818965082699205648708584703854285180568695068359375

    print "Predict whole data"
    y_whole = model.predict(x=x_test_data)
    mse = mean_squared_error(y_true=y_test_data, y_pred=y_whole)
    print "MSE: %100.70f" % mse
    assert mse == 0.002064362783199560034963493393433964229188859462738037109375

    assert hash(numpy.array_repr(data)) == checkpoint
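# For reference (sketch, not part of the original test): mean_squared_error
# above computes the plain mean of squared residuals, so the asserted constant
# can be reproduced manually with the hypothetical helper below:
def _manual_mse(y_true, y_pred):
    # MSE = mean((y_true - y_pred)^2)
    return numpy.mean((numpy.asarray(y_true) - numpy.asarray(y_pred)) ** 2)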
def test():
    from sklearn.metrics import mean_squared_error
    import Surrogates.DataExtraction.pcs_parser as pcs_parser
    sp = pcs_parser.read(file("/home/eggenspk/Surrogates/Data_extraction/Experiments2014/hpnnet/smac_2_06_01-dev/nips2011.pcs"))
    # Read data from csv
    header, data = read_csv("/home/eggenspk/Surrogates/Data_extraction/hpnnet_nocv_convex_all/hpnnet_nocv_convex_all_fastrf_results.csv",
                            has_header=True, num_header_rows=3)
    para_header = header[0][:-2]
    type_header = header[1]
    cond_header = header[2]
    #print data.shape
    checkpoint = hash(numpy.array_repr(data))
    assert checkpoint == 246450380584980815

    model = GaussianProcess(sp=sp, encode=False, rng=1, debug=True)
    x_train_data = data[:100, :-2]
    y_train_data = data[:100, -1]
    x_test_data = data[100:, :-2]
    y_test_data = data[100:, -1]

    model.train(x=x_train_data, y=y_train_data, param_names=para_header)

    y = model.predict(x=x_train_data[1, :])
    print "Is: %100.70f, Should: %f" % (y, y_train_data[1])
    assert y[0] == 0.470745153514900149804844886602950282394886016845703125

    print "Predict whole data"
    y_whole = model.predict(x=x_test_data)
    mse = mean_squared_error(y_true=y_test_data, y_pred=y_whole)
    print "MSE: %100.70f" % mse
    assert mse == 0.006257598609004190459703664828339242376387119293212890625

    print "Soweit so gut"

    # Try the same with encoded features
    model = GaussianProcess(sp=sp, encode=True, rng=1, debug=True)
    #print data[:10, :-2]
    model.train(x=x_train_data, y=y_train_data, param_names=para_header)

    y = model.predict(x=x_train_data[1, :])
    print "Is: %100.70f, Should: %f" % (y, y_train_data[1])
    assert y[0] == 0.464671665294324409689608046392095275223255157470703125

    print "Predict whole data"
    y_whole = model.predict(x=x_test_data)
    mse = mean_squared_error(y_true=y_test_data, y_pred=y_whole)
    print "MSE: %100.70f" % mse
    assert mse == 0.00919265128042330570412588031103950925171375274658203125

    assert hash(numpy.array_repr(data)) == checkpoint
def main(): prog = "python whole_training.py" parser = ArgumentParser(description="", prog=prog) parser.add_argument("-d", dest="data", required=True) parser.add_argument("-s", dest="save", required=True) parser.add_argument("-r", dest="num_random", default=100, type=int, help="If randomsearch is available, how many runs?") parser.add_argument("-m", "--model", dest="model", default="all", help="Train only one model?", choices=[#"RFstruct", "ArcGP", "GaussianProcess", "GradientBoosting", "KNN", "LassoRegression", "LinearRegression", "NuSupportVectorRegression", "RandomForest", "RidgeRegression", "SupportVectorRegression"]) parser.add_argument("-t", "--time", dest="time", default=False, action="store_true", help="Train on duration?") parser.add_argument("--pcs", dest="pcs", default=None, required=True, help="PCS file") parser.add_argument("--encode", dest="encode", default=False, action="store_true") args, unknown = parser.parse_known_args() if args.model == "Fastrf" and args.encode: raise ValueError("This cannot work") sp = pcs_parser.read(file(args.pcs)) model_type = args.model if args.encode: model_type += "_onehot" # Read data from csv header, data = read_csv(args.data, has_header=True, num_header_rows=3) para_header = header[0] type_header = header[1] cond_header = header[2] # Hardcoded number of crossvalidations num_cv = 5 # Cut out the objective data_x = data[:, :-2] if args.time: print "TRAINING ON TIME" data_y = data[:, -2] # -1 -> perf, -2 -> duration else: data_y = data[:, -1] # -1 -> perf, -2 -> duration # Split into num_cv folds cv_idx = cross_validation.KFold(data_x.shape[0], n_folds=num_cv, indices=True, random_state=RNG, shuffle=True) # Get subsample idx ct = int(data_x.shape[0] / num_cv) * (num_cv-1) train_idx_list = list() test_idx_list = list() # For largest training set, we simply take all indices train_idx_list.append([train_idx for train_idx, _n in cv_idx]) test_idx_list.extend([test_idx for _none, test_idx in cv_idx]) # Prepare new csv tmp_result_header = list() tmp_result_header.extend([str(len(i)) for i in train_idx_list[0]]) ct = 2000 if ct < int(data_x.shape[0] / num_cv) * (num_cv-1): train_idx_list.append(list()) for train_idx, test_idx in cv_idx: # NOTE: We have to change seed, otherwise trainingsamples # will always be the same for different ct subsample_cv = cross_validation.ShuffleSplit(len(train_idx), n_iter=1, train_size=ct, test_size=None, random_state=RNG) for sub_train_idx, _none in subsample_cv: train_idx_list[-1].append(train_idx[sub_train_idx]) tmp_result_header.append(str(len(sub_train_idx))) """ # # Right now we don't need this # # Now reduce in each step training set by half and subsample # save_ct = None # ct /= 2 # if ct < 1500: # save_ct = ct # ct = 1500 # # seed = numpy.random.randint(100, size=[num_cv]) # while ct > 10: # train_idx_list.append(list()) # idx = 0 # for train_idx, test_idx in cv_idx: # # NOTE: We have to change seed, otherwise trainingsamples will # # always be the same for different ct # subsample_cv = cross_validation.ShuffleSplit(len(train_idx), # n_iter=1, # train_size=ct, # test_size=None, # random_state=seed[idx]*ct) # for sub_train_idx, _none in subsample_cv: # train_idx_list[-1].append(train_idx[sub_train_idx]) # tmp_result_header.append(str(len(sub_train_idx))) # idx += 1 # # if ct > 2000 and ct/2 < 2000 and save_ct is None: # # Trick to evaluate 2000 in any case # save_ct = ct/2 # ct = 2000 # elif ct > 1500 and ct/2 < 1500 and save_ct is None: # # Trick to evaluate 1500 in any case # save_ct = ct/2 # ct = 1500 # elif 
save_ct is not None: # ct = save_ct # save_ct = None # else: # ct /= 2 """ # Reverse train_idx to start with small dataset sizes train_idx_list = train_idx_list[::-1] result_header = ['model'] result_header.extend(tmp_result_header[::-1]) # print result_header # print [[len(j) for j in i] for i in train_idx_list] # print [len(i) for i in test_idx_list] # We could write the ground truth for this experiment ground_truth_fn = args.save + "ground_truth_" if not os.path.exists(ground_truth_fn + "training.csv") or \ not os.path.exists(ground_truth_fn + "test.csv"): write_truth(train_idx=train_idx_list, test_idx=test_idx_list, data=data_y, fn=ground_truth_fn, save_dir=args.save) # Now init the csv init_csv(args.save + '/train_duration.csv', result_header, override=False) init_csv(args.save + '/predict_duration.csv', result_header, override=False) # We need one csv containing the raw predictions # Just in case we already trained this model, create random filename if not os.path.exists(os.path.join(args.save + "prediction")): os.mkdir(os.path.join(args.save + "prediction")) _none, model_test_fn = \ tempfile.mkstemp(suffix=".csv_running", prefix="%s_test_prediction_" % model_type, dir=os.path.join(args.save + "prediction")) _none, model_train_fn = \ tempfile.mkstemp(suffix=".csv_running", prefix="%s_train_prediction_" % model_type, dir=os.path.join(args.save + "prediction")) # Now fill the array with zeros, which is fine if training failed train_duration_array = numpy.zeros(len(train_idx_list)*num_cv) predict_duration_array = numpy.zeros(len(train_idx_list)*num_cv) # Some variables train_duration = sys.maxint predict_duration = sys.maxint # Save hash to check whether we changed something during training data_x_hash = hash(numpy.array_repr(data_x)) data_y_hash = hash(data_y.tostring()) print "Train %s\n" % model_type, # Do all subsamples for train_idx_idx, train_idx_index in enumerate(train_idx_list): # Start training for this dataset size fold = -1 for _none, test_idx in cv_idx: fold += 1 current_idx = train_idx_idx*num_cv+fold # Start training for this fold sys.stdout.write("\r\t[%d | %d ]: %d" % (current_idx, len(train_idx_list)*num_cv, len(train_idx_index[fold]))) sys.stdout.flush() train_data_x = numpy.array(data_x[train_idx_index[fold], :], copy=True) train_data_y = numpy.array(data_y[train_idx_index[fold]], copy=True) #num_folds = max(1, max(train_data_x[:, 0])) #print " Found %s folds" % num_folds model = fetch_model(args.model) model = model(rng=model_RNG, sp=sp, encode=args.encode, debug=False) if model.maximum_number_train_data() < train_data_x.shape[0]: model = None train_duration = numpy.nan predict_duration = numpy.nan train_predictions = numpy.zeros(train_data_x.shape[0]) * \ numpy.nan test_predictions = numpy.zeros(len(test_idx)) * numpy.nan else: train_duration = model.train(x=train_data_x, y=train_data_y, param_names=para_header[:-2]) test_data_x = numpy.array(data_x[test_idx, :], copy=True) train_predictions = model.predict(x=train_data_x, tol=10) start = time.time() test_predictions = model.predict(x=test_data_x, tol=10) dur = time.time() - start predict_duration = dur # Also check hashes assert(data_y_hash == hash(data_y.tostring()) and data_x_hash == hash(numpy.array_repr(data_x))) del test_data_x del train_data_x del train_data_y del model train_duration_array[current_idx] = max(0, train_duration) predict_duration_array[current_idx] = max(0, predict_duration) save_one_line_to_csv(model_test_fn, test_predictions, len(train_predictions)) save_one_line_to_csv(model_train_fn, 
train_predictions, len(train_predictions)) # We're done, so remove the running from filename os.rename(model_train_fn, os.path.join(args.save + "prediction", "%s_train_prediction.csv" % model_type)) os.rename(model_test_fn, os.path.join(args.save + "prediction", "%s_test_prediction.csv" % model_type)) print "\nSaved to %s" % os.path.join(args.save + "prediction", "%s_test_prediction.csv" % model_type) # And save before proceeding to next model_type save_one_line_to_csv(args.save + '/train_duration.csv', train_duration_array, model_type) save_one_line_to_csv(args.save + '/predict_duration.csv', predict_duration_array, model_type)
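# Example invocation (illustrative only; the file names below are
# placeholders, derived from the argparse options defined above):
#
#   python whole_training.py -d fastrf_results.csv -s ./results/ \
#       -m GradientBoosting --pcs nips2011.pcs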
def main(): prog = "python train.py" parser = ArgumentParser(description="", prog=prog) # Data stuff for training surrogate parser.add_argument("-m", "--model", dest="model", default=None, required=True, help="What model?", choices=[#"ArcGP", "RFstruct", "GaussianProcess", "GradientBoosting", "KNN", "LassoRegression", "LinearRegression", "SupportVectorRegression", "RidgeRegression", "NuSupportVectorRegression", "RandomForest"]) parser.add_argument("--data", dest="data_fn", default=None, required=True, help="Where is the csv with training data?") parser.add_argument("--pcs", dest="pcs", default=None, required=False, help="Smac pcs file for this experiment") parser.add_argument("--encode", dest="encode", default=False, action="store_true") parser.add_argument("--saveto", dest="saveto", required=True) args, unknown = parser.parse_known_args() if os.path.exists(args.saveto): raise ValueError("%s already exists" % args.saveto) if not os.path.isdir(os.path.dirname(args.saveto)): raise ValueError("%s, directory does not exist") saveto = os.path.abspath(args.saveto) if args.model == "Fastrf" and args.encode: raise ValueError("This cannot work") sp = pcs_parser.read(file(args.pcs)) model_type = args.model if args.encode: model_type += "_onehot" # Read data from csv header, data = read_csv(args.data_fn, has_header=True, num_header_rows=3) para_header = header[0][:-2] #type_header = header[1][:-2] #cond_header = header[2][:-2] # Cut out the objective data_x = data[:, :-2] data_y = data[:, -1] # -1 -> perf, -2 -> duration # Save hash to check whether we changed something during training data_x_hash = hash(numpy.array_repr(data_x)) data_y_hash = hash(data_y.tostring()) print "Train %s\n" % model_type, train_data_x = numpy.array(data_x, copy=True) train_data_y = numpy.array(data_y, copy=True) model = fetch_model(args.model) model = model(rng=RNG, sp=sp, encode=args.encode, debug=False) if model.maximum_number_train_data() < train_data_x.shape[0]: max_n = model.maximum_number_train_data() print "Limited model, reducing #data from %d" % train_data_x.shape[0] train_data_x, _n_x, train_data_y, _n_y = \ cross_validation.train_test_split(train_data_x, train_data_y, train_size=max_n, random_state=RNG) print "to %d" % train_data_x.shape[0] else: print "Reducing data not neccessary" dur = model.train(x=train_data_x, y=train_data_y, param_names=para_header) print "Training took %fsec" % dur if args.model == "Fastrf" or "RFstruct": # We need to save the forest print "Saved forest to %s" % saveto model.save_forest(fn=saveto + "_forest") assert data_x_hash, hash(numpy.array_repr(data_x)) assert data_y_hash, hash(data_y.tostring()) fn = open(saveto, "wb") cPickle.dump(obj=model, file=fn, protocol=cPickle.HIGHEST_PROTOCOL) fn.close() print "Saved to %s" % saveto
def main():
    prog = "python whole_training.py"
    parser = ArgumentParser(description="", prog=prog)

    parser.add_argument("--traindata", dest="traindata", required=True)
    parser.add_argument("--testdata", dest="testdata", required=True)
    parser.add_argument("-s", dest="save", required=True)
    parser.add_argument("-r", dest="num_random", default=100, type=int,
                        help="If randomsearch is available, how many runs?")
    parser.add_argument("-m", "--model", dest="model", default="all",
                        help="Train only one model?",
                        choices=[#"ArcGP",
                                 "RFstruct", "Fastrf", "GaussianProcess",
                                 "GradientBoosting", "KNN", "LassoRegression",
                                 "LinearRegression",
                                 "NuSupportVectorRegression",
                                 "RidgeRegression", "SupportVectorRegression",
                                 "RandomForest"])
    parser.add_argument("--pcs", dest="pcs", default=None, required=True,
                        help="PCS file")
    parser.add_argument("--encode", dest="encode", default=False,
                        action="store_true")

    args, unknown = parser.parse_known_args()

    if args.model == "Fastrf" and args.encode:
        raise ValueError("This cannot work")

    sp = pcs_parser.read(file(args.pcs))
    model_type = args.model
    if args.encode:
        model_type += "_onehot"

    # Read data from csv
    header, data = read_csv(args.traindata, has_header=True, num_header_rows=3)
    para_header = header[0]
    type_header = header[1]
    cond_header = header[2]

    # Cut out the objective
    data_x = data[:, :-2]
    data_y = data[:, -1]  # -1 -> perf, -2 -> duration

    # Save hash to check whether we changed something during training
    data_x_hash = hash(numpy.array_repr(data_x))
    data_y_hash = hash(data_y.tostring())

    print "Train %s\n" % model_type,
    train_data_x = numpy.array(data_x, copy=True)
    train_data_y = numpy.array(data_y, copy=True)

    model = fetch_model(args.model)
    model = model(rng=RNG, sp=sp, encode=args.encode, debug=False)
    if model.maximum_number_train_data() < train_data_x.shape[0]:
        max_n = model.maximum_number_train_data()
        print "Limited model, reducing #data from %d" % train_data_x.shape[0]
        train_data_x, _n_x, train_data_y, _n_y = \
            cross_validation.train_test_split(train_data_x, train_data_y,
                                              train_size=max_n,
                                              random_state=RNG)
        print "to %d" % train_data_x.shape[0]
    else:
        print "Reducing data not necessary"

    dur = model.train(x=train_data_x, y=train_data_y,
                      param_names=para_header[:-2])
    print "Training took %fsec" % dur

    _header, test_data = read_csv(args.testdata, has_header=True,
                                  num_header_rows=3)
    assert para_header == _header[0]
    assert type_header == _header[1]
    assert cond_header == _header[2]
    assert (data_y_hash == hash(data_y.tostring()) and
            data_x_hash == hash(numpy.array_repr(data_x)))

    # Cut out the objective
    test_data_x = test_data[:, :-2]
    test_predictions = model.predict(x=test_data_x, tol=10)

    model_test_fn = os.path.join(args.save, "test_prediction.csv")
    # Dirty hack to initialize, because it's quite late
    if not os.path.isfile(model_test_fn):
        fh = open(model_test_fn, "w")
        fh.close()
    print test_predictions.shape
    save_one_line_to_csv(model_test_fn, test_predictions, model_type)
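# Example invocation (illustrative only; placeholder paths; predictions are
# appended to <save>/test_prediction.csv under the model's name):
#
#   python whole_training.py --traindata train_results.csv \
#       --testdata test_results.csv -s ./results -m KNN --pcs nips2011.pcs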
def main(): prog = "python train.py" parser = ArgumentParser(description="", prog=prog) # Data stuff for training surrogate parser.add_argument( "-m", "--model", dest="model", default=None, required=True, help="What model?", choices=[ #"ArcGP", "RFstruct", "GaussianProcess", "GradientBoosting", "KNN", "LassoRegression", "LinearRegression", "SupportVectorRegression", "RidgeRegression", "NuSupportVectorRegression", "RandomForest" ]) parser.add_argument("--data", dest="data_fn", default=None, required=True, help="Where is the csv with training data?") parser.add_argument("--pcs", dest="pcs", default=None, required=False, help="Smac pcs file for this experiment") parser.add_argument("--encode", dest="encode", default=False, action="store_true") parser.add_argument("--saveto", dest="saveto", required=True) args, unknown = parser.parse_known_args() if os.path.exists(args.saveto): raise ValueError("%s already exists" % args.saveto) if not os.path.isdir(os.path.dirname(args.saveto)): raise ValueError("%s, directory does not exist") saveto = os.path.abspath(args.saveto) if args.model == "Fastrf" and args.encode: raise ValueError("This cannot work") sp = pcs_parser.read(file(args.pcs)) model_type = args.model if args.encode: model_type += "_onehot" # Read data from csv header, data = read_csv(args.data_fn, has_header=True, num_header_rows=3) para_header = header[0][:-2] #type_header = header[1][:-2] #cond_header = header[2][:-2] # Cut out the objective data_x = data[:, :-2] data_y = data[:, -1] # -1 -> perf, -2 -> duration # Save hash to check whether we changed something during training data_x_hash = hash(numpy.array_repr(data_x)) data_y_hash = hash(data_y.tostring()) print "Train %s\n" % model_type, train_data_x = numpy.array(data_x, copy=True) train_data_y = numpy.array(data_y, copy=True) model = fetch_model(args.model) model = model(rng=RNG, sp=sp, encode=args.encode, debug=False) if model.maximum_number_train_data() < train_data_x.shape[0]: max_n = model.maximum_number_train_data() print "Limited model, reducing #data from %d" % train_data_x.shape[0] train_data_x, _n_x, train_data_y, _n_y = \ cross_validation.train_test_split(train_data_x, train_data_y, train_size=max_n, random_state=RNG) print "to %d" % train_data_x.shape[0] else: print "Reducing data not neccessary" dur = model.train(x=train_data_x, y=train_data_y, param_names=para_header) print "Training took %fsec" % dur if args.model == "Fastrf" or "RFstruct": # We need to save the forest print "Saved forest to %s" % saveto model.save_forest(fn=saveto + "_forest") assert data_x_hash, hash(numpy.array_repr(data_x)) assert data_y_hash, hash(data_y.tostring()) fn = open(saveto, "wb") cPickle.dump(obj=model, file=fn, protocol=cPickle.HIGHEST_PROTOCOL) fn.close() print "Saved to %s" % saveto
def main(): prog = "python whole_training.py" parser = ArgumentParser(description="", prog=prog) parser.add_argument("-d", dest="data", required=True) parser.add_argument("-s", dest="save", required=True) parser.add_argument("-r", dest="num_random", default=100, type=int, help="If randomsearch is available, how many runs?") parser.add_argument( "-m", "--model", dest="model", default="all", help="Train only one model?", choices=[ #"RFstruct", "ArcGP", "GaussianProcess", "GradientBoosting", "KNN", "LassoRegression", "LinearRegression", "NuSupportVectorRegression", "RandomForest", "RidgeRegression", "SupportVectorRegression" ]) parser.add_argument("-t", "--time", dest="time", default=False, action="store_true", help="Train on duration?") parser.add_argument("--pcs", dest="pcs", default=None, required=True, help="PCS file") parser.add_argument("--encode", dest="encode", default=False, action="store_true") args, unknown = parser.parse_known_args() if args.model == "Fastrf" and args.encode: raise ValueError("This cannot work") sp = pcs_parser.read(file(args.pcs)) model_type = args.model if args.encode: model_type += "_onehot" # Read data from csv header, data = read_csv(args.data, has_header=True, num_header_rows=3) para_header = header[0] type_header = header[1] cond_header = header[2] # Hardcoded number of crossvalidations num_cv = 5 # Cut out the objective data_x = data[:, :-2] if args.time: print "TRAINING ON TIME" data_y = data[:, -2] # -1 -> perf, -2 -> duration else: data_y = data[:, -1] # -1 -> perf, -2 -> duration # Split into num_cv folds cv_idx = cross_validation.KFold(data_x.shape[0], n_folds=num_cv, indices=True, random_state=RNG, shuffle=True) # Get subsample idx ct = int(data_x.shape[0] / num_cv) * (num_cv - 1) train_idx_list = list() test_idx_list = list() # For largest training set, we simply take all indices train_idx_list.append([train_idx for train_idx, _n in cv_idx]) test_idx_list.extend([test_idx for _none, test_idx in cv_idx]) # Prepare new csv tmp_result_header = list() tmp_result_header.extend([str(len(i)) for i in train_idx_list[0]]) ct = 2000 if ct < int(data_x.shape[0] / num_cv) * (num_cv - 1): train_idx_list.append(list()) for train_idx, test_idx in cv_idx: # NOTE: We have to change seed, otherwise trainingsamples # will always be the same for different ct subsample_cv = cross_validation.ShuffleSplit(len(train_idx), n_iter=1, train_size=ct, test_size=None, random_state=RNG) for sub_train_idx, _none in subsample_cv: train_idx_list[-1].append(train_idx[sub_train_idx]) tmp_result_header.append(str(len(sub_train_idx))) """ # # Right now we don't need this # # Now reduce in each step training set by half and subsample # save_ct = None # ct /= 2 # if ct < 1500: # save_ct = ct # ct = 1500 # # seed = numpy.random.randint(100, size=[num_cv]) # while ct > 10: # train_idx_list.append(list()) # idx = 0 # for train_idx, test_idx in cv_idx: # # NOTE: We have to change seed, otherwise trainingsamples will # # always be the same for different ct # subsample_cv = cross_validation.ShuffleSplit(len(train_idx), # n_iter=1, # train_size=ct, # test_size=None, # random_state=seed[idx]*ct) # for sub_train_idx, _none in subsample_cv: # train_idx_list[-1].append(train_idx[sub_train_idx]) # tmp_result_header.append(str(len(sub_train_idx))) # idx += 1 # # if ct > 2000 and ct/2 < 2000 and save_ct is None: # # Trick to evaluate 2000 in any case # save_ct = ct/2 # ct = 2000 # elif ct > 1500 and ct/2 < 1500 and save_ct is None: # # Trick to evaluate 1500 in any case # save_ct = ct/2 # ct = 1500 # 
elif save_ct is not None: # ct = save_ct # save_ct = None # else: # ct /= 2 """ # Reverse train_idx to start with small dataset sizes train_idx_list = train_idx_list[::-1] result_header = ['model'] result_header.extend(tmp_result_header[::-1]) # print result_header # print [[len(j) for j in i] for i in train_idx_list] # print [len(i) for i in test_idx_list] # We could write the ground truth for this experiment ground_truth_fn = args.save + "ground_truth_" if not os.path.exists(ground_truth_fn + "training.csv") or \ not os.path.exists(ground_truth_fn + "test.csv"): write_truth(train_idx=train_idx_list, test_idx=test_idx_list, data=data_y, fn=ground_truth_fn, save_dir=args.save) # Now init the csv init_csv(args.save + '/train_duration.csv', result_header, override=False) init_csv(args.save + '/predict_duration.csv', result_header, override=False) # We need one csv containing the raw predictions # Just in case we already trained this model, create random filename if not os.path.exists(os.path.join(args.save + "prediction")): os.mkdir(os.path.join(args.save + "prediction")) _none, model_test_fn = \ tempfile.mkstemp(suffix=".csv_running", prefix="%s_test_prediction_" % model_type, dir=os.path.join(args.save + "prediction")) _none, model_train_fn = \ tempfile.mkstemp(suffix=".csv_running", prefix="%s_train_prediction_" % model_type, dir=os.path.join(args.save + "prediction")) # Now fill the array with zeros, which is fine if training failed train_duration_array = numpy.zeros(len(train_idx_list) * num_cv) predict_duration_array = numpy.zeros(len(train_idx_list) * num_cv) # Some variables train_duration = sys.maxint predict_duration = sys.maxint # Save hash to check whether we changed something during training data_x_hash = hash(numpy.array_repr(data_x)) data_y_hash = hash(data_y.tostring()) print "Train %s\n" % model_type, # Do all subsamples for train_idx_idx, train_idx_index in enumerate(train_idx_list): # Start training for this dataset size fold = -1 for _none, test_idx in cv_idx: fold += 1 current_idx = train_idx_idx * num_cv + fold # Start training for this fold sys.stdout.write("\r\t[%d | %d ]: %d" % (current_idx, len(train_idx_list) * num_cv, len(train_idx_index[fold]))) sys.stdout.flush() train_data_x = numpy.array(data_x[train_idx_index[fold], :], copy=True) train_data_y = numpy.array(data_y[train_idx_index[fold]], copy=True) #num_folds = max(1, max(train_data_x[:, 0])) #print " Found %s folds" % num_folds model = fetch_model(args.model) model = model(rng=model_RNG, sp=sp, encode=args.encode, debug=False) if model.maximum_number_train_data() < train_data_x.shape[0]: model = None train_duration = numpy.nan predict_duration = numpy.nan train_predictions = numpy.zeros(train_data_x.shape[0]) * \ numpy.nan test_predictions = numpy.zeros(len(test_idx)) * numpy.nan else: train_duration = model.train(x=train_data_x, y=train_data_y, param_names=para_header[:-2]) test_data_x = numpy.array(data_x[test_idx, :], copy=True) train_predictions = model.predict(x=train_data_x, tol=10) start = time.time() test_predictions = model.predict(x=test_data_x, tol=10) dur = time.time() - start predict_duration = dur # Also check hashes assert (data_y_hash == hash(data_y.tostring()) and data_x_hash == hash(numpy.array_repr(data_x))) del test_data_x del train_data_x del train_data_y del model train_duration_array[current_idx] = max(0, train_duration) predict_duration_array[current_idx] = max(0, predict_duration) save_one_line_to_csv(model_test_fn, test_predictions, len(train_predictions)) 
save_one_line_to_csv(model_train_fn, train_predictions, len(train_predictions)) # We're done, so remove the running from filename os.rename( model_train_fn, os.path.join(args.save + "prediction", "%s_train_prediction.csv" % model_type)) os.rename( model_test_fn, os.path.join(args.save + "prediction", "%s_test_prediction.csv" % model_type)) print "\nSaved to %s" % os.path.join(args.save + "prediction", "%s_test_prediction.csv" % model_type) # And save before proceeding to next model_type save_one_line_to_csv(args.save + '/train_duration.csv', train_duration_array, model_type) save_one_line_to_csv(args.save + '/predict_duration.csv', predict_duration_array, model_type)