def model_diagnoser_test():
    """Smoke-test ModelDiagnoser on the Boston housing data.

    Fits a random-forest regression model on an 80/20 train/test split,
    then renders every diagnostic the diagnoser offers.
    """
    features, target = datasets.load_boston(return_X_y=True)
    features = pd.DataFrame(features)

    model_name = "random_forest"
    random_seed = 1001
    obj_func_name = "mse"
    eval_func_names = ["r_squared", "rmse"]
    n_estimators = 400

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=random_seed)

    model = RegressionModel(X_train=X_train,
                            y_train=y_train,
                            model_type=model_name,
                            obj_func_name=obj_func_name,
                            random_seed=random_seed)

    # Only the random forest takes extra fit-time parameters here.
    model_params = ({"n_estimators": n_estimators}
                    if model_name == "random_forest" else {})
    model.fit(model_params=model_params)

    diagnoser = ModelDiagnoser(model,
                               train_valid_folds=10,
                               eval_func_names=eval_func_names,
                               X_test=X_test,
                               y_test=y_test)
    diagnoser.show_all_diagnostics()
def main():
    """Batch-train and evaluate over proper subsets of the distributions.

    For each combination size r and each r-combination of the known
    distributions, shells out to main.py to train a random forest, then
    evaluates on the fixed combined test set and appends one CSV row of
    metrics (r, name, mae, mape, mse, msle) per combination.
    """
    print('Train and test in batch')
    distributions = ['uniform', 'diagonal', 'gauss', 'parcel', 'bit']
    test_path = 'data/train_and_test_all_features_split/test_join_results_combined_data.csv'
    # Context manager guarantees the results file is closed even when a
    # training run raises (the original leaked the handle on error).
    with open('data/temp/train_results.csv', 'w') as train_results_f:
        # NOTE(review): range(1, len(distributions)) stops at r = 4, so the
        # combination of all five distributions is never trained — confirm
        # that exclusion is intentional.
        for r in range(1, len(distributions)):
            print(r)
            for g in combinations(distributions, r):
                name = '_'.join(g)
                output_name = '{}distribution.{}'.format(r, name)
                train_path = ('data/train_and_test_all_features_split/'
                              'train_join_results_combined_data.{}.csv'.format(output_name))
                print(train_path)
                # Train in a subprocess; main.py persists the model to disk.
                os.system('python main.py --model random_forest --tab {} --hist data/histograms/ --result data/join_results/train/join_results_small_x_small_uniform.csv --path trained_models/model_uniform.h5 --weights trained_models/model_weights_uniform.h5 --train'.format(train_path))
                # Re-load the persisted model and score it on the shared test set.
                model = RegressionModel('random_forest')
                mae, mape, mse, msle = model.test(
                    test_path,
                    '',
                    'trained_models/model_uniform.h5',
                    'trained_models/model_weights_uniform.h5',
                    '')
                # write(), not writelines(): we emit exactly one string per row.
                train_results_f.write('{},{},{},{},{},{}\n'.format(r, name, mae, mape, mse, msle))
def evaluate(self, vectors, vector_file, vector_size, results_folder, log_dictionary, scores_dictionary):
    """Run the regression gold-standard evaluation for one vector file.

    For every gold-standard dataset: intersect the supplied vectors with
    the gold standard, train each regression model ("LR", "KNN", "M5") on
    ten shuffles of the merged data, store per-file scores, and finally
    put the aggregated results DataFrame into ``scores_dictionary`` and
    the accumulated error log into ``log_dictionary``.

    NOTE(review): ``task_name`` is read as a free variable — presumably a
    module-level constant of this task module; confirm it is defined there.
    """
    log_errors = ""
    gold_standard_filenames = self.get_gold_standard_file()
    # gold-standard filename -> {model name -> list of per-shuffle results}
    totalscores = defaultdict(dict)
    for gold_standard_filename in gold_standard_filenames:
        # Gold-standard TSVs live in a data/ folder next to this script.
        script_dir = os.path.dirname(__file__)
        rel_path = "data/" + gold_standard_filename + '.tsv'
        gold_standard_file = os.path.join(script_dir, rel_path)

        regression_model_names = ["LR", "KNN", "M5"]
        scores = defaultdict(list)
        totalscores_element = defaultdict(list)

        # Keep only vectors that also appear in the gold standard;
        # `ignored` holds the entries that could not be matched.
        data, ignored = self.data_manager.intersect_vectors_goldStandard(
            vectors, vector_file, vector_size, gold_standard_file)
        self.storeIgnored(results_folder, gold_standard_filename, ignored)

        if data.size == 0:
            # Nothing matched: log the problem and move on to the next file.
            log_errors += 'Regression : Problems in merging vector with gold standard ' + gold_standard_file + '\n'
            if self.debugging_mode:
                print(
                    'Regression : Problems in merging vector with gold standard ' + gold_standard_file)
        else:
            # Ten deterministic shuffles (random_state=i): varied orderings,
            # but fully reproducible across runs.
            for i in range(10):
                data = data.sample(frac=1, random_state=i).reset_index(drop=True)
                for model_name in regression_model_names:
                    # initialize the model
                    model = Model(task_name, model_name, self.debugging_mode)
                    # train and print score
                    try:
                        result = model.train(data)
                        result[
                            'gold_standard_file'] = gold_standard_filename
                        scores[model_name].append(result)
                        totalscores_element[model_name].append(result)
                    except Exception as e:
                        # Record the failure but keep evaluating the
                        # remaining models/shuffles.
                        log_errors += 'File used as gold standard: ' + gold_standard_filename + '\n'
                        log_errors += 'Regression method: ' + model_name + '\n'
                        log_errors += str(e) + '\n'
            self.storeResults(results_folder, gold_standard_filename, scores)
            totalscores[gold_standard_filename] = totalscores_element
    # Aggregate everything collected above and hand it back via the two
    # caller-supplied dictionaries (this method returns nothing).
    results_df = self.resultsAsDataFrame(totalscores)
    scores_dictionary[task_name] = results_df
    log_dictionary[task_name] = log_errors
def hyperparameter_optimizer_test():
    """Smoke-test HyperParameterOptimizer on the Boston housing data.

    Builds a base random-forest regression model, tunes it for a fixed
    number of iterations with 10-fold cross-validation, and returns the
    tuned model.
    """
    features, target = datasets.load_boston(return_X_y=True)
    features = pd.DataFrame(features)

    model_name = "random_forest"
    random_seed = 1001
    obj_func_name = "mse"
    n_estimators = 400
    total_n_iterations = 50

    base_model = RegressionModel(X_train=features,
                                 y_train=target,
                                 model_type=model_name,
                                 obj_func_name=obj_func_name,
                                 random_seed=random_seed)
    hpo = HyperParameterOptimizer(verbosity=1)

    # Only the random forest pins n_estimators during tuning.
    override_params = ({"n_estimators": n_estimators}
                       if model_name == "random_forest" else {})

    hpo.tune_and_fit(model=base_model,
                     total_n_iterations=total_n_iterations,
                     train_valid_folds=10,
                     override_params=override_params,
                     use_model_copy=True)
    return hpo.model
def main():
    """CLI entry point: train or evaluate a join-result model.

    Parses command-line options, instantiates the requested model type,
    then either trains it (--train, the default) or evaluates it
    (--no-train) and prints MAE/MAPE/MSE/MSLE.

    Fixes vs. original: the metric print statements ran unconditionally,
    so the training path raised NameError on the undefined metrics (and
    NameError is not caught by `except RuntimeError`); they now run only
    on the evaluation path. The 'mlse' output label also mislabeled msle.
    """
    parser = OptionParser()
    parser.add_option('-m', '--model', type='string', help='Model name: {linear, dnn}')
    parser.add_option('-t', '--tab', type='string', help='Path to the tabular data file(CSV)')
    parser.add_option('-g', '--hist', type='string', help='Path to the histograms of input datasets')
    parser.add_option('-r', '--result', type='string', help='Path to the join result (CSV)')
    parser.add_option('-p', '--path', type='string', help='Path to the model to be saved')
    parser.add_option('-w', '--weights', type='string', help='Path to the model weights to be saved')
    parser.add_option('--train', action="store_true", dest="train", default=True)
    parser.add_option('--no-train', action="store_false", dest="train")
    (options, args) = parser.parse_args()
    options_dict = vars(options)

    model_names = ['linear', 'decision_tree', 'random_forest', 'dnn', 'hist_dnn',
                   'clf_decision_tree', 'clf_random_forest', 'rnk_random_forest']
    try:
        model_name = options_dict['model']
        if model_name not in model_names:
            print('Available model are {}'.format(', '.join(model_names)))
            return
        else:
            # Dispatch on the model family; each branch builds the matching wrapper.
            if model_name in ['linear', 'decision_tree', 'random_forest']:
                model = RegressionModel(model_name)
            elif model_name == 'dnn':
                model = DNNModel()
            elif model_name == 'hist_dnn':
                model = HistogramDNNModel()
            elif model_name in ['clf_decision_tree', 'clf_random_forest']:
                model = ClassificationModel(model_name)
            elif model_name in ['rnk_random_forest']:
                model = RankingModel(model_name)

            tabular_path = options_dict['tab']
            histogram_path = options_dict['hist']
            join_result_path = options_dict['result']
            model_path = options_dict['path']
            model_weights_path = options_dict['weights']
            is_train = options_dict['train']

            if is_train:
                model.train(tabular_path, join_result_path, model_path,
                            model_weights_path, histogram_path)
            else:
                mae, mape, mse, msle = model.test(tabular_path, join_result_path,
                                                  model_path, model_weights_path,
                                                  histogram_path)
                # Classifiers report no regression metrics; bail out before printing.
                if model_name in ['clf_decision_tree', 'clf_random_forest']:
                    exit(1)
                print('mae: {}\nmape: {}\nmse: {}\nmsle: {}'.format(mae, mape, mse, msle))
                print('{}\t{}\t{}\t{}'.format(mae, mape, mse, msle))
    except RuntimeError:
        print('Please check your arguments')
def regression_model_test():
    """Smoke-test RegressionModel: fit on Boston, predict, cross-validate."""
    features, target = datasets.load_boston(return_X_y=True)
    features = pd.DataFrame(features)

    model_name = "random_forest"
    random_seed = 1001
    obj_func_name = "mse"
    eval_func_names = ["r_squared", "rmse"]
    n_estimators = 400

    model = RegressionModel(X_train=features,
                            y_train=target,
                            model_type=model_name,
                            obj_func_name=obj_func_name,
                            random_seed=random_seed)

    # Only the random forest takes extra fit-time parameters here.
    fit_params = ({"n_estimators": n_estimators}
                  if model_name == "random_forest" else {})
    model.fit(model_params=fit_params)

    # Predictions on the training set itself — a smoke check, not a benchmark.
    training_set_preds = model.predict(features)
    print(training_set_preds)

    # 10-fold CV metrics computed with the same fit parameters.
    cv_metrics = model.cross_validate(train_valid_folds=10,
                                      eval_func_names=eval_func_names,
                                      model_params=fit_params)
    print(cv_metrics)
y_valid_data = np.asarray(pickle.load( open( os.path.join(data_serialization_dir, train_configs["y_valid_data"]), 'rb')), dtype=np.float32) xArray, yArray = iterator(x_train_data, y_train_data, train_configs["batch_size"]) # train with tf.Graph().as_default(), tf.Session() as session: initializer = tf.random_uniform_initializer( -train_configs["init_scale"], train_configs["init_scale"]) with tf.variable_scope("text_classification", reuse=None, initializer=initializer): #CNN model TextModel = RegressionModel(model_configs) # optimizer op global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = create_optimizer( configs=train_configs["optimizer_conf"]) grads_and_vars = optimizer.compute_gradients(TextModel.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # CheckPoint State check_point_dir = os.path.join(DataDirPath, train_configs["check_point_path"]) if not os.path.exists(check_point_dir): os.mkdir(check_point_dir) else: for _file in os.listdir(check_point_dir):
minimal_dates = minimal_dates & dates new_dfs = [] for df in dfs: new_df = df[df['Date'].map(lambda x: x.timestamp() in minimal_dates)] new_dfs.append(new_df) dfs = new_dfs print("Generating Models") for i, df in enumerate(dfs): data = get_data(df, data_start, valid_start, test_start, data_end, features, label) model = RegressionModel(1) model.train(data['Xtrain'], data['Ytrain']) y_pred = model.predict(data['Xtest']) predicted_returns[str(i)] = y_pred actual_returns[str(i)] = data['Ytest'] train_returns[str(i)] = data['Ytrain'] cov = np.cov(pd.DataFrame(train_returns).values.T) n = len(train_returns.keys()) opt = SimplePortfolio(n) desired_variance = (0.001)**2 pred_df = pd.DataFrame(predicted_returns) actual_df = pd.DataFrame(actual_returns)