Example #1
def model_diagnoser_test():
    # Boston housing data as DataFrame features plus a target vector.
    # Note: load_boston was removed in scikit-learn 1.2 (see the sketch below).
    X, y = datasets.load_boston(return_X_y=True)
    X = pd.DataFrame(X)

    model_name = "random_forest"
    random_seed = 1001
    obj_func_name = "mse"
    eval_func_names = ["r_squared", "rmse"]
    n_estimators = 400

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_seed)

    model = RegressionModel(X_train=X_train,
                            y_train=y_train,
                            model_type=model_name,
                            obj_func_name=obj_func_name,
                            random_seed=random_seed)

    if model_name == "random_forest":
        model_params = {"n_estimators": n_estimators}
    else:
        model_params = {}

    model.fit(model_params=model_params)

    # cross-validated diagnostics plus evaluation on the held-out test set
    model_diagnoser = ModelDiagnoser(model,
                                     train_valid_folds=10,
                                     eval_func_names=eval_func_names,
                                     X_test=X_test,
                                     y_test=y_test)
    model_diagnoser.show_all_diagnostics()
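
A note on the data loader: load_boston was removed in scikit-learn 1.2. A minimal, drop-in replacement for the loading step, assuming a current scikit-learn, is the California housing dataset:

from sklearn import datasets
import pandas as pd

# load_boston() is gone in scikit-learn >= 1.2; fetch_california_housing
# returns (X, y) in the same shape and works as a stand-in here.
X, y = datasets.fetch_california_housing(return_X_y=True)
X = pd.DataFrame(X)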
Example #2
def main():
    print('Train and test in batch')

    train_results_f = open('data/temp/train_results.csv', 'w')

    distributions = ['uniform', 'diagonal', 'gauss', 'parcel', 'bit']

    test_path = 'data/train_and_test_all_features_split/test_join_results_combined_data.csv'

    # Iterate over subset sizes 1..len(distributions)-1; note that the
    # full set of all 5 distributions is never included.
    for r in range(1, len(distributions)):
        print(r)
        groups = combinations(distributions, r)
        for g in groups:
            name = '_'.join(g)
            output_name = '{}distribution.{}'.format(r, name)
            train_path = 'data/train_and_test_all_features_split/train_join_results_combined_data.{}.csv'.format(output_name)
            print(train_path)
            os.system('python main.py --model random_forest --tab {} --hist data/histograms/ --result data/join_results/train/join_results_small_x_small_uniform.csv --path trained_models/model_uniform.h5 --weights trained_models/model_weights_uniform.h5 --train'.format(train_path))
            # os.system('python main.py --model random_forest --tab data/train_and_test_all_features_split/test_join_results_combined_data.csv --hist data/histograms/ --result data/join_results/train/join_results_small_x_small_uniform.csv --path trained_models/model_uniform.h5 --weights trained_models/model_weights_uniform.h5 --no-train')
            model = RegressionModel('random_forest')
            mae, mape, mse, msle = model.test(
                'data/train_and_test_all_features_split/test_join_results_combined_data.csv',
                '', 'trained_models/model_uniform.h5',
                'trained_models/model_weights_uniform.h5', '')
            train_results_f.write('{},{},{},{},{},{}\n'.format(r, name, mae, mape, mse, msle))

    train_results_f.close()
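
The long os.system string above is easy to misquote; the same call via the standard-library subprocess module avoids shell quoting entirely. A sketch, reusing train_path from the loop:

import subprocess

# Equivalent call without building one long shell string.
subprocess.run([
    'python', 'main.py',
    '--model', 'random_forest',
    '--tab', train_path,
    '--hist', 'data/histograms/',
    '--result', 'data/join_results/train/join_results_small_x_small_uniform.csv',
    '--path', 'trained_models/model_uniform.h5',
    '--weights', 'trained_models/model_weights_uniform.h5',
    '--train',
], check=True)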
Example #3
    def evaluate(self, vectors, vector_file, vector_size, results_folder,
                 log_dictionary, scores_dictionary):
        log_errors = ""

        gold_standard_filenames = self.get_gold_standard_file()

        totalscores = defaultdict(dict)

        for gold_standard_filename in gold_standard_filenames:
            script_dir = os.path.dirname(__file__)
            rel_path = "data/" + gold_standard_filename + '.tsv'
            gold_standard_file = os.path.join(script_dir, rel_path)

            regression_model_names = ["LR", "KNN", "M5"]

            scores = defaultdict(list)
            totalscores_element = defaultdict(list)

            data, ignored = self.data_manager.intersect_vectors_goldStandard(
                vectors, vector_file, vector_size, gold_standard_file)

            self.storeIgnored(results_folder, gold_standard_filename, ignored)

            if data.size == 0:
                log_errors += 'Regression : Problems in merging vector with gold standard ' + gold_standard_file + '\n'
                if self.debugging_mode:
                    print(
                        'Regression : Problems in merging vector with gold standard '
                        + gold_standard_file)
            else:
                for i in range(10):
                    data = data.sample(frac=1,
                                       random_state=i).reset_index(drop=True)

                    for model_name in regression_model_names:
                        # initialize the model (task_name is assumed to be
                        # defined at module scope in the original source)
                        model = Model(task_name, model_name,
                                      self.debugging_mode)
                        # train and print score
                        try:
                            result = model.train(data)
                            result['gold_standard_file'] = gold_standard_filename
                            scores[model_name].append(result)
                            totalscores_element[model_name].append(result)
                        except Exception as e:
                            log_errors += 'File used as gold standard: ' + gold_standard_filename + '\n'
                            log_errors += 'Regression method: ' + model_name + '\n'
                            log_errors += str(e) + '\n'

                self.storeResults(results_folder, gold_standard_filename,
                                  scores)
                totalscores[gold_standard_filename] = totalscores_element

            results_df = self.resultsAsDataFrame(totalscores)
            scores_dictionary[task_name] = results_df

        log_dictionary[task_name] = log_errors
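
Model and DataManager here are project-specific. As a self-contained reference, the reshuffle-and-retrain pattern of the inner loop looks like this with a plain scikit-learn regressor (the 'label' column name is an assumption):

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

def reshuffle_and_score(data: pd.DataFrame, n_rounds: int = 10) -> list:
    # data: feature columns plus a 'label' column, as in the merged
    # gold-standard frame above (the column name is assumed).
    scores = []
    for i in range(n_rounds):
        shuffled = data.sample(frac=1, random_state=i).reset_index(drop=True)
        X = shuffled.drop(columns=['label'])
        y = shuffled['label']
        split = int(0.8 * len(shuffled))
        model = LinearRegression().fit(X[:split], y[:split])
        scores.append(r2_score(y[split:], model.predict(X[split:])))
    return scores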
Example #4
def hyperparameter_optimizer_test():
    X, y = datasets.load_boston(return_X_y=True)
    X = pd.DataFrame(X)

    model_name = "random_forest"
    random_seed = 1001
    obj_func_name = "mse"
    n_estimators = 400
    total_n_iterations = 50

    base_model = RegressionModel(X_train=X,
                                 y_train=y,
                                 model_type=model_name,
                                 obj_func_name=obj_func_name,
                                 random_seed=random_seed)

    hpo = HyperParameterOptimizer(verbosity=1)
    if model_name == "random_forest":
        override_params = {"n_estimators": n_estimators}
    else:
        override_params = {}
    # 50 search iterations scored by 10-fold cross-validation;
    # n_estimators stays fixed through override_params.
    hpo.tune_and_fit(model=base_model,
                     total_n_iterations=total_n_iterations,
                     train_valid_folds=10,
                     override_params=override_params,
                     use_model_copy=True)
    tuned_model = hpo.model
    return tuned_model
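
HyperParameterOptimizer is a project class; a rough scikit-learn equivalent of "50 iterations, 10 folds, n_estimators pinned" is RandomizedSearchCV. A sketch with an assumed search space, reusing X and y from above:

from scipy.stats import randint
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# The search space below is illustrative; n_estimators is fixed as in the example.
search = RandomizedSearchCV(
    estimator=RandomForestRegressor(n_estimators=400, random_state=1001),
    param_distributions={
        'max_depth': randint(2, 20),
        'min_samples_leaf': randint(1, 10),
    },
    n_iter=50,
    cv=10,
    scoring='neg_mean_squared_error',
    random_state=1001,
)
search.fit(X, y)
tuned_model = search.best_estimator_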
Example #5
def main():
    parser = OptionParser()
    parser.add_option('-m', '--model', type='string', help='Model name: {linear, decision_tree, random_forest, dnn, hist_dnn, clf_decision_tree, clf_random_forest, rnk_random_forest}')
    parser.add_option('-t', '--tab', type='string', help='Path to the tabular data file (CSV)')
    parser.add_option('-g', '--hist', type='string', help='Path to the histograms of input datasets')
    parser.add_option('-r', '--result', type='string', help='Path to the join result (CSV)')
    parser.add_option('-p', '--path', type='string', help='Path to the model to be saved')
    parser.add_option('-w', '--weights', type='string', help='Path to the model weights to be saved')
    parser.add_option('--train', action="store_true", dest="train", default=True)
    parser.add_option('--no-train', action="store_false", dest="train")

    (options, args) = parser.parse_args()
    options_dict = vars(options)

    model_names = ['linear', 'decision_tree', 'random_forest', 'dnn', 'hist_dnn', 'clf_decision_tree', 'clf_random_forest', 'rnk_random_forest']

    try:
        model_name = options_dict['model']
        if model_name not in model_names:
            print('Available models are {}'.format(', '.join(model_names)))
            return
        else:
            if model_name in ['linear', 'decision_tree', 'random_forest']:
                model = RegressionModel(model_name)
            elif model_name == 'dnn':
                model = DNNModel()
            elif model_name == 'hist_dnn':
                model = HistogramDNNModel()
            elif model_name in ['clf_decision_tree', 'clf_random_forest']:
                model = ClassificationModel(model_name)
            elif model_name in ['rnk_random_forest']:
                model = RankingModel(model_name)

        tabular_path = options_dict['tab']
        histogram_path = options_dict['hist']
        join_result_path = options_dict['result']
        model_path = options_dict['path']
        model_weights_path = options_dict['weights']
        is_train = options_dict['train']

        if is_train:
            model.train(tabular_path, join_result_path, model_path, model_weights_path, histogram_path)
        else:
            mae, mape, mse, msle = model.test(tabular_path, join_result_path, model_path, model_weights_path, histogram_path)
            if model_name in ['clf_decision_tree', 'clf_random_forest']:
                # classification models do not report the regression metrics
                exit(1)
            print('mae: {}\nmape: {}\nmse: {}\nmsle: {}'.format(mae, mape, mse, msle))
            print('{}\t{}\t{}\t{}'.format(mae, mape, mse, msle))

    except RuntimeError:
        print('Please check your arguments')
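
optparse has been deprecated since Python 3.2. The same command-line interface expressed with argparse, keeping the flag names above, would look roughly like:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-m', '--model', help='Model name')
parser.add_argument('-t', '--tab', help='Path to the tabular data file (CSV)')
parser.add_argument('-g', '--hist', help='Path to the histograms of input datasets')
parser.add_argument('-r', '--result', help='Path to the join result (CSV)')
parser.add_argument('-p', '--path', help='Path to the model to be saved')
parser.add_argument('-w', '--weights', help='Path to the model weights to be saved')
parser.add_argument('--train', dest='train', action='store_true', default=True)
parser.add_argument('--no-train', dest='train', action='store_false')
options_dict = vars(parser.parse_args())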
Example #6
def regression_model_test():
    X, y = datasets.load_boston(return_X_y=True)
    X = pd.DataFrame(X)

    model_name = "random_forest"
    random_seed = 1001
    obj_func_name = "mse"
    eval_func_names = ["r_squared", "rmse"]
    n_estimators = 400

    model = RegressionModel(X_train=X,
                            y_train=y,
                            model_type=model_name,
                            obj_func_name=obj_func_name,
                            random_seed=random_seed)

    if model_name == "random_forest":
        model_params = {"n_estimators": n_estimators}
    else:
        model_params = {}

    model.fit(model_params=model_params)

    training_set_preds = model.predict(X)
    print(training_set_preds)

    cv_metrics = model.cross_validate(train_valid_folds=10,
                                      eval_func_names=eval_func_names,
                                      model_params=model_params)
    print(cv_metrics)
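
For comparison, the cross-validation step maps onto scikit-learn's own cross_validate, assuming r_squared and rmse correspond to the stock 'r2' and RMSE scorers:

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate

cv_metrics = cross_validate(
    RandomForestRegressor(n_estimators=400, random_state=1001),
    X, y,
    cv=10,
    scoring={'r_squared': 'r2', 'rmse': 'neg_root_mean_squared_error'},
)
# Note: scikit-learn reports RMSE negated (greater-is-better convention).
print(cv_metrics['test_r_squared'], cv_metrics['test_rmse'])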
Example #7
    y_valid_data = np.asarray(pickle.load(
        open(
            os.path.join(data_serialization_dir,
                         train_configs["y_valid_data"]), 'rb')),
                              dtype=np.float32)
    xArray, yArray = iterator(x_train_data, y_train_data,
                              train_configs["batch_size"])
    # train
    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(
            -train_configs["init_scale"], train_configs["init_scale"])
        with tf.variable_scope("text_classification",
                               reuse=None,
                               initializer=initializer):
            # CNN model
            TextModel = RegressionModel(model_configs)
            # optimizer op
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = create_optimizer(
                configs=train_configs["optimizer_conf"])
            grads_and_vars = optimizer.compute_gradients(TextModel.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # CheckPoint State
            check_point_dir = os.path.join(DataDirPath,
                                           train_configs["check_point_path"])
            if not os.path.exists(check_point_dir):
                os.mkdir(check_point_dir)
            else:
                for _file in os.listdir(check_point_dir):
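
The snippet is cut off in the checkpoint-handling branch. Its core training setup is the classic TF1 compute_gradients/apply_gradients pattern, condensed into a self-contained sketch (the dummy loss stands in for TextModel.loss, and plain gradient descent stands in for the project's create_optimizer):

import tensorflow as tf  # TF1-style API, matching the snippet above

w = tf.Variable([1.0])                # dummy parameter
loss = tf.reduce_mean(tf.square(w))   # stand-in for TextModel.loss
global_step = tf.Variable(0, name='global_step', trainable=False)
optimizer = tf.train.GradientDescentOptimizer(0.1)  # stand-in for create_optimizer(...)
grads_and_vars = optimizer.compute_gradients(loss)
train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)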
Example #8
        minimal_dates = minimal_dates & dates

new_dfs = []

for df in dfs:
    new_df = df[df['Date'].map(lambda x: x.timestamp() in minimal_dates)]
    new_dfs.append(new_df)

dfs = new_dfs

print("Generating Models")

for i, df in enumerate(dfs):
    data = get_data(df, data_start, valid_start, test_start, data_end,
                    features, label)
    model = RegressionModel(1)
    model.train(data['Xtrain'], data['Ytrain'])
    y_pred = model.predict(data['Xtest'])
    predicted_returns[str(i)] = y_pred
    actual_returns[str(i)] = data['Ytest']
    train_returns[str(i)] = data['Ytrain']

cov = np.cov(pd.DataFrame(train_returns).values.T)

n = len(train_returns)

opt = SimplePortfolio(n)
desired_variance = (0.001)**2

pred_df = pd.DataFrame(predicted_returns)
actual_df = pd.DataFrame(actual_returns)
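
One detail worth flagging in the covariance step above: np.cov treats each row as one variable, which is why the returns matrix is transposed before the call. A quick shape check:

import numpy as np
import pandas as pd

returns = pd.DataFrame({'a': [0.01, -0.02, 0.03], 'b': [0.00, 0.01, -0.01]})
# Columns are assets, rows are dates; np.cov wants one variable per row,
# hence the transpose. The result is an n_assets x n_assets matrix.
cov = np.cov(returns.values.T)
assert cov.shape == (2, 2)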