コード例 #1
0
def get_baselines(save=False):
    '''Fit default-parameter ensemble regressors on every dataset and collect metrics.

    Each model is trained with library defaults so the scores can serve as
    baselines for the tuned models in this module.  With ``save=True`` the
    combined table is also written to models/ensemble/baseline_all.csv.
    '''

    datasets = ['dataset_1', 'dataset_2', 'dataset_3']
    # Map each estimator class to the short label used in the results index.
    regressors = {
        RandomForestRegressor: 'random_forest',
        AdaBoostRegressor: 'adaboost',
        GradientBoostingRegressor: 'gradient_boosting',
        ExtraTreesRegressor: 'extra_trees',
    }
    display_names = ds.get_names()
    frames = []
    for dataset in datasets:
        X_train, X_test, y_train, y_test, train = split.split_subset(dataset)
        label = display_names[dataset]
        for constructor, short_name in regressors.items():
            predictions = constructor().fit(X_train, y_train).predict(X_test)
            frames.append(
                metrics.apply_metrics('{} {}'.format(label, short_name),
                                      y_test, predictions))
    results = pd.concat(frames, axis=0) if frames else pd.DataFrame()

    if save:
        to_save = Path().resolve().joinpath('models', 'ensemble',
                                            'baseline_all.csv')
        results.to_csv(to_save)

    return results
コード例 #2
0
def single_layer_network_randomized_cv(name, n_iter=20, cv=5, save=True):
    """Randomized CV search over a one-hidden-layer Keras regression network.

    Standardizes the features, samples ``n_iter`` combinations of batch size,
    epoch count, and learning rate, refits the best configuration, and scores
    it on the held-out test set.  When ``save`` is True the CV table and the
    performance row are written under the models directory.
    """

    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train, X_test = split.standardize(X_train, X_test)
    search_space = {
        'shape': [(X_train.shape[1], )],
        'batch_size': [256, 512, 1028, 2056],
        'epochs': [25, 50],
        'learning_rate': [1e-4, 1e-3, 1e-2, 1e-1, 1, 10]
    }

    def build_network(shape, learning_rate):
        """Build a 45-unit single hidden layer network compiled with Adam/MSE."""
        network = Sequential()
        network.add(Dense(45, activation='relu', input_shape=shape))
        network.add(Dense(1, activation='linear'))
        network.compile(optimizer=Adam(learning_rate=learning_rate),
                        loss='mean_squared_error')
        return network

    regressor = KerasRegressor(build_fn=build_network,
                               verbose=0,
                               workers=8,
                               use_multiprocessing=True)
    net_cv = RandomizedSearchCV(estimator=regressor,
                                param_distributions=search_space,
                                n_jobs=-1,
                                pre_dispatch=16,
                                refit=True,
                                cv=cv,
                                scoring='neg_mean_absolute_error',
                                n_iter=n_iter,
                                random_state=18).fit(X_train, y_train)

    label = ds.get_names()[name]
    performance = metrics.apply_metrics(
        '{} Single Layer Neural Network'.format(label), y_test,
        net_cv.predict(X_test), y_train)
    performance['Tuning Parameters'] = [net_cv.best_params_]

    if save:
        root = Path().resolve()
        cv_path = root.joinpath('models', 'cross_validation_outcomes',
                                'neural_network', '{}.csv'.format(name))
        pd.DataFrame.from_dict(net_cv.cv_results_).to_csv(cv_path)
        perf_path = root.joinpath('models', 'neural_network',
                                  '{}_performance.csv'.format(name))
        performance.to_csv(perf_path)

    return net_cv, performance
コード例 #3
0
def adaboost_randomized_cv(name, n_iter=30, cv=5):
    """Conducts a randomized search of cross validation for given parameters of AdaBoost and returns results.

    Samples base-estimator depth, estimator count, loss, and learning rate,
    scoring by negative MAE.  ``refit=False`` — only the search object is
    returned; no final model is fitted.
    """

    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    to_score = metrics.create_metrics()[0]

    search_space = {
        'base_estimator': [
            DecisionTreeRegressor(max_depth=depth) for depth in (2, 3, 4, 5)
        ],
        'n_estimators': np.linspace(50, 500, 20, dtype=int),
        'loss': ['linear', 'square', 'exponential'],
        # NOTE(review): the distribution includes learning_rate == 0 —
        # sklearn requires a positive learning rate for AdaBoost; confirm
        # sampled zeros don't error out individual fits.
        'learning_rate': np.append(np.array([0]), np.geomspace(1e-3, 5, 10))
    }

    adaboost_cv = RandomizedSearchCV(estimator=AdaBoostRegressor(),
                                     param_distributions=search_space,
                                     n_iter=n_iter,
                                     n_jobs=-1,
                                     pre_dispatch=16,
                                     cv=cv,
                                     refit=False,
                                     random_state=18,
                                     scoring='neg_mean_absolute_error')
    return adaboost_cv.fit(X_train, y_train)
コード例 #4
0
def elastic_net(name, cv=5):
    '''Outputs a fitted Elastic Net Regression Model with tuning parameters found through cross validation.

    Inputs must be standardized.
    l1_ratios are spread out on a log scale as recommended by package authors.
    cv: number of folds in cross validation (default 5).
    n_jobs = -1 allows for all local processors to be utilized.
    '''

    display_name = ds.get_names()[name]
    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train, X_test = split.standardize(X_train, X_test)
    # Log spacing concentrates candidate ratios near 0 (mostly-ridge mixes).
    l1_ratios = np.geomspace(1e-8, 1, 50)
    # BUGFIX: cv was hard-coded to 5 here, silently ignoring the cv argument.
    model = ElasticNetCV(l1_ratio=l1_ratios,
                         n_alphas=50,
                         cv=cv,
                         verbose=0,
                         n_jobs=-1,
                         random_state=18).fit(X_train, y_train)

    performance = metrics.apply_metrics('{} Elastic Net'.format(display_name),
                                        y_test, model.predict(X_test), y_train)
    performance['Tuning Parameters'] = [{
        'Alpha': model.alpha_,
        'L1 Ratio': model.l1_ratio_
    }]
    return model, performance
コード例 #5
0
def huber(name, cv=5):
    '''Grid-search a Huber regressor over epsilon/alpha and report test metrics.

    Features are standardized first.  The best model (by negative MAE) is
    refit on the full training set.  Returns the fitted GridSearchCV object
    and a one-row performance DataFrame.
    '''

    display_name = ds.get_names()[name]
    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train, X_test = split.standardize(X_train, X_test)
    # Removed dead code: metrics.create_metrics() result was fetched but
    # never used (scoring below is the literal 'neg_mean_absolute_error').
    param_grid = {
        # epsilon must be > 1 for HuberRegressor; start just above 1.
        'epsilon': np.linspace(1 + 1e-15, 1.2, 10),
        'alpha': np.linspace(1e-8, 2, 10)
    }
    model = HuberRegressor()
    model_cv = GridSearchCV(model,
                            param_grid=param_grid,
                            scoring='neg_mean_absolute_error',
                            n_jobs=-1,
                            pre_dispatch=16,
                            cv=cv,
                            refit=True).fit(X_train, y_train)
    performance = metrics.apply_metrics('{} Huber'.format(display_name),
                                        y_test, model_cv.predict(X_test),
                                        y_train)
    performance['Tuning Parameters'] = [model_cv.best_params_]

    return model_cv, performance
コード例 #6
0
ファイル: other.py プロジェクト: wyattowalsh/ieor_142_project
def k_neighbors_randomized_cv(name, n_iter=50, cv=5):
    """Randomized CV search over k-nearest-neighbors hyperparameters.

    Standardizes train and test features, then samples ``n_iter`` combinations
    of neighbor count, weighting scheme, and leaf size.  ``refit=False`` —
    the search object with its cv_results_ is returned unfitted.
    """

    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train = split.standardize(name, X_train)
    X_test = split.standardize(name, X_test)
    scorers = metrics.create_metrics()[0]
    search_space = {
        'n_neighbors': np.arange(2, 50, 2, dtype=int),
        'weights': ['uniform', 'distance'],
        'leaf_size': [2, 4, 8, 16, 32, 64, 128, 256]
    }

    search = RandomizedSearchCV(estimator=KNeighborsRegressor(n_jobs=-1),
                                param_distributions=search_space,
                                n_jobs=-1,
                                pre_dispatch=16,
                                n_iter=n_iter,
                                cv=cv,
                                scoring=scorers,
                                refit=False,
                                random_state=18)
    return search.fit(X_train, y_train)
コード例 #7
0
def single_layer_network_grid_cv(name, cv=5, save=True):
    """Grid-search a single hidden-layer Keras network over width and L2 penalty.

    Fixes relative to the previous version:
    - the grid supplied 'neurons' but the build function neither accepted it
      nor used it (hidden width was hard-coded to 45) — it is now a real
      parameter;
    - the build function required 'learning_rate' which the grid never
      supplied, so every fit would fail — it now has a default;
    - the neuron range started at 0, which is an invalid Dense width — it
      now starts at 1;
    - the 'save' argument was silently ignored — cv results are now written
      like the randomized counterpart.
    Returns the fitted GridSearchCV object (interface unchanged).
    """

    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train, X_test = split.standardize(X_train, X_test)

    param_grid = {
        'shape': [(X_train.shape[1], )],
        'neurons': np.arange(1, 230),
        'batch_size': [1028],
        'epochs': [50],
        'reg': np.geomspace(1e-4, 2, 25),
        'if_reg': [True],
        'shuffle': [True]
    }

    def single_layer_network(shape, neurons=45, learning_rate=1e-3,
                             reg=1e-4, if_reg=False):
        """Build a one-hidden-layer regression net compiled with Adam/MSE."""
        net = Sequential()
        if if_reg:
            net.add(
                Dense(neurons,
                      activation='relu',
                      input_shape=shape,
                      kernel_regularizer=regularizers.l2(reg)))
        else:
            net.add(Dense(neurons, activation='relu', input_shape=shape))
        net.add(Dense(1, activation='linear'))
        optimizer = Adam(learning_rate=learning_rate)
        net.compile(optimizer=optimizer, loss='mean_squared_error')
        return net

    net = KerasRegressor(build_fn=single_layer_network, verbose=0)
    net_cv = GridSearchCV(estimator=net,
                          param_grid=param_grid,
                          n_jobs=-1,
                          pre_dispatch=16,
                          refit=True,
                          cv=cv,
                          scoring='neg_mean_absolute_error').fit(
                              X_train, y_train)

    display_name = ds.get_names()[name]
    performance = metrics.apply_metrics(
        '{} Single Layer Neural Network'.format(display_name), y_test,
        net_cv.predict(X_test), y_train)
    performance['Tuning Parameters'] = [net_cv.best_params_]

    if save:
        to_save = Path().resolve().joinpath('models',
                                            'cross_validation_outcomes',
                                            'neural_network',
                                            '{}_grid.csv'.format(name))
        pd.DataFrame.from_dict(net_cv.cv_results_).to_csv(to_save)

    return net_cv
コード例 #8
0
def elbow_method_kmeans(name, max_clusters = 30, min_clusters = 2, save = False):
    '''Creates elbow method plot for varying number of clusters.

    Y-axis is the sum of squared distances of samples to their closest
    cluster center (inertia).  Set ``save=True`` to also write the figure to
    visualizations/clustering_elbow_<name>.png.
    '''

    display_name = ds.get_names()[name]
    # BUGFIX: the slice was [0:1] (one element) unpacked into two names,
    # which raises ValueError; the first two elements are X_train, X_test.
    X_train, X_test = split.split_subset(name)[0:2]
    num_numerical = ds.get_number_numerical()[name]
    # Cluster only the numerical columns (leading block of the frame).
    X_train_numerical = X_train.iloc[:, 0:num_numerical]
    distortions = []
    cluster_range = range(min_clusters, max_clusters)
    for clusters in cluster_range:
        kmean = kmeans(name, clusters, X_train_numerical)
        distortions.append(kmean.inertia_)

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.plot(cluster_range, distortions, marker='o')
    ax.set_title('Elbow Method for KMeans for {}'.format(display_name), size=25)
    ax.set_xlabel('Number of Clusters', fontsize=20)
    ax.set_ylabel('Sum of Squared Distances', fontsize=20)
    plt.show()

    if save:
        to_save = Path().resolve().joinpath(
            'visualizations', 'clustering_elbow_{}.png'.format(name))
        fig.savefig(to_save, dpi=300)
コード例 #9
0
ファイル: other.py プロジェクト: wyattowalsh/ieor_142_project
def svr_randomized_cv(name, n_iter=25, cv=5):
    """Randomized CV search over support-vector-regression hyperparameters.

    Standardizes train and test features, then samples ``n_iter`` kernel /
    regularization / epsilon combinations.  ``refit=False`` — only the
    search object with cv_results_ is returned.
    """

    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train = split.standardize(name, X_train)
    X_test = split.standardize(name, X_test)
    scorers = metrics.create_metrics()[0]
    search_space = {
        'kernel': ['poly', 'rbf'],
        'degree': np.arange(2, 6),
        'gamma': ['scale', 'auto'],
        'C': np.linspace(1e-5, 5, 20),
        'epsilon': np.linspace(0, 1, 20),
        'shrinking': [True, False]
    }

    search = RandomizedSearchCV(estimator=SVR(),
                                param_distributions=search_space,
                                n_jobs=-1,
                                pre_dispatch=16,
                                n_iter=n_iter,
                                cv=cv,
                                scoring=scorers,
                                random_state=18,
                                refit=False)
    return search.fit(X_train, y_train)
コード例 #10
0
def random_forest_randomized_cv(name, n_iter=30, cv=5):
    '''Conducts a randomized search of cross validation for given parameters of the random forest and returns results.

    Scores by negative mean absolute error and refits the best estimator.
    '''

    X_train, X_test, y_train, y_test, train = split.split_subset(name)

    search_space = {
        'n_estimators': np.linspace(start=100, stop=500, num=20, dtype=int),
        'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
        'bootstrap': [True, False],
        # Candidate feature counts span 2 .. total number of columns.
        'max_features': np.linspace(2, len(X_train.columns), num=20,
                                    dtype=int),
        'min_samples_split': [2, 4, 8, 16, 32, 64, 128],
        'min_samples_leaf': [1, 2, 4, 8, 16, 32, 64]
    }

    search = RandomizedSearchCV(estimator=RandomForestRegressor(),
                                param_distributions=search_space,
                                n_jobs=-1,
                                n_iter=n_iter,
                                cv=cv,
                                pre_dispatch=16,
                                scoring='neg_mean_absolute_error',
                                random_state=18,
                                refit=True)
    return search.fit(X_train, y_train)
コード例 #11
0
def extra_trees_randomized_cv(name, n_iter=30, cv=5):
    """Randomized CV search over ExtraTrees hyperparameters (negative MAE).

    Fixes relative to the previous version:
    - 'n_estimators' was wrapped in an extra list, so the whole array was
      sampled as a single (invalid) parameter value instead of one int per
      draw;
    - added random_state=18 to the search for reproducibility, matching
      every other *_randomized_cv in this module.
    ``refit=False`` — only the search object is returned.
    """

    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    to_score = metrics.create_metrics()[0]
    # Grid values for bootstrap/max_features override these constructor
    # settings during the search.
    extra_trees = ExtraTreesRegressor(random_state=18,
                                      n_jobs=-1,
                                      max_features=None,
                                      bootstrap=False)

    random_grid = {
        'n_estimators': np.linspace(start=100, stop=500, num=20, dtype=int),
        'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
        'bootstrap': [True, False],
        'max_features': np.linspace(2, len(X_train.columns), num=20,
                                    dtype=int),
        'min_samples_split': [2, 4, 8, 16, 32, 64, 128],
        'min_samples_leaf': [1, 2, 4, 8, 16, 32, 64]
    }

    extra_trees_cv = RandomizedSearchCV(estimator=extra_trees,
                                        param_distributions=random_grid,
                                        n_iter=n_iter,
                                        cv=cv,
                                        n_jobs=-1,
                                        pre_dispatch=16,
                                        refit=False,
                                        random_state=18,
                                        scoring='neg_mean_absolute_error').fit(
                                            X_train, y_train)

    return extra_trees_cv
コード例 #12
0
def support_vector_machine(name, cv=5):
    """Grid-search a linear SVR over epsilon/C/loss and report test metrics.

    Features are standardized first; the best model (by negative MAE) is
    refit and scored on the held-out test set.
    """

    label = ds.get_names()[name]
    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train, X_test = split.standardize(X_train, X_test)
    to_score, scoring = metrics.create_metrics()
    search_space = {
        # NOTE(review): this range includes negative epsilon values —
        # confirm LinearSVR accepts them; epsilon is normally >= 0.
        'epsilon': np.linspace(-2, 2, 4),
        'fit_intercept': [True],
        'C': np.linspace(1e6, 1e10, 50),
        'loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'],
        'dual': [False],
        'random_state': [18]
    }
    model_cv = GridSearchCV(LinearSVR(),
                            param_grid=search_space,
                            scoring='neg_mean_absolute_error',
                            n_jobs=-1,
                            pre_dispatch=16,
                            cv=cv,
                            refit=True).fit(X_train, y_train)
    performance = metrics.apply_metrics(
        '{} Linear Support Vector Machine'.format(label), y_test,
        model_cv.predict(X_test), y_train)
    performance['Tuning Parameters'] = [model_cv.best_params_]
    return model_cv, performance
コード例 #13
0
def linear(name):
    '''Outputs a fitted Linear Regression Model.

    Inputs can be standardized or not.
    '''

    label = ds.get_names()[name]
    X_train, X_test, y_train, y_test, train = split.split_subset(name, True)
    ols = LinearRegression().fit(X_train, y_train)
    predictions = ols.predict(X_test)
    performance = metrics.apply_metrics('{} OLS'.format(label), y_test,
                                        predictions, y_train)
    # OLS has no hyperparameters; keep the column for schema consistency.
    performance['Tuning Parameters'] = ""
    return ols, performance
コード例 #14
0
def ridge(name, cv=5):
    '''Outputs a fitted Ridge Regression Model with a penalty term tuned through cross validation.

    Features are standardized before fitting; the alpha grid spans 500-750.
    '''

    label = ds.get_names()[name]
    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train, X_test = split.standardize(X_train, X_test)
    penalty_grid = np.linspace(500, 750, 50)
    model = RidgeCV(alphas=penalty_grid,
                    fit_intercept=True,
                    cv=cv).fit(X_train, y_train)
    performance = metrics.apply_metrics('{} Ridge'.format(label), y_test,
                                        model.predict(X_test), y_train)
    performance['Tuning Parameters'] = [{'Alpha': model.alpha_}]
    return model, performance
コード例 #15
0
def collect_tests(name, save=False):
    '''Aggregates statistical test data for a dataset into a single DataFrame for notebook presentation.

    Joins variance-inflation factors and numerical-significance results,
    both sorted by index.  Set ``save=True`` to also write the table to
    features/statistical_tests/<name>.csv.
    '''
    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    # FIX: sort_index(0) passed axis positionally, which pandas 2.0 removed;
    # use the explicit keyword instead.
    vif = find_vifs(name, X_train).sort_index(axis=0)
    sig = find_numerical_significance(name, X_train, y_train).sort_index(axis=0)
    df = pd.concat([vif, sig], axis=1)

    if save:
        to_save = Path().resolve().joinpath('features', 'statistical_tests',
                                            '{}.csv'.format(name))
        df.to_csv(to_save)

    return df
コード例 #16
0
ファイル: other.py プロジェクト: wyattowalsh/ieor_142_project
def k_neighbors_grid_cv(name, cv=5, save=True):
    '''Conducts a grid search over all possible combinations of given parameters and returns result.

    Uses parameters closely clustered around the best randomized search results.
    Also returns back best fitted model by specified criteria (MAE).
    '''

    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train, X_test = split.standardize(name, X_train, X_test)
    scorers = metrics.create_metrics()[0]
    search_space = {
        'n_neighbors': np.arange(20, 51, 2, dtype=int),
        'weights': ['distance'],
        'leaf_size': [8, 16, 128, 256]
    }

    model_cv = GridSearchCV(n_jobs=-1,
                            estimator=KNeighborsRegressor(n_jobs=-1),
                            param_grid=search_space,
                            scoring=scorers,
                            pre_dispatch=16,
                            refit=False,
                            cv=cv).fit(X_train, y_train)

    label = ds.get_names()[name]
    performance = pd.DataFrame()
    for variation in linear.get_model_variants(KNeighborsRegressor, model_cv):
        # Fit each candidate variant and append its test metrics.
        predictions = variation.fit(X_train, y_train).predict(X_test)
        row = metrics.apply_metrics('{} K Neighbors'.format(label), y_test,
                                    predictions)
        performance = pd.concat([performance, row], axis=0)

    if save:
        to_save = Path().resolve().joinpath('models',
                                            'cross_validation_outcomes',
                                            'other', 'k_neighbors',
                                            'grid.csv')
        pd.DataFrame.from_dict(model_cv.cv_results_).to_csv(to_save)

    return model_cv, performance
コード例 #17
0
ファイル: other.py プロジェクト: wyattowalsh/ieor_142_project
def svr_grid_cv(name, standardize=False, cv=5):
    """Grid-search SVR hyperparameters and score each model variant on the test set.

    Fixes relative to the previous version:
    - GridSearchCV was called with `param_distributions=` (a
      RandomizedSearchCV keyword) instead of `param_grid=`, a TypeError;
    - SVR was constructed with `n_jobs=-1`, which SVR does not accept;
    - the model variants were requested for KNeighborsRegressor (copy-paste
      from the k-neighbors search) instead of SVR.
    Returns the (unrefit) search object and a DataFrame of per-variant metrics.
    """

    # cv label for the metrics index: 5 folds is plain K-Fold, anything
    # else is assumed to be a time-series splitter.
    if cv == 5:
        cv_type = 'K-Fold'
    else:
        cv_type = "Time Series Split"

    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    if standardize:
        X_train = split.standardize(name, X_train)
        X_test = split.standardize(name, X_test)
    to_score = metrics.create_metrics()[0]
    param_grid = {
        'kernel': ['poly', 'rbf', 'sigmoid'],
        'degree': np.arange(3, 9),
        'gamma': ['scale', 'auto'],
        'C': [2e-5, 2e-3, 2e-1, 2e1, 2e3, 2e5, 2e7, 2e9, 2e11]
    }

    model = SVR()
    model_cv = GridSearchCV(estimator=model,
                            param_grid=param_grid,
                            n_jobs=-1,
                            pre_dispatch=16,
                            refit=False,
                            cv=cv,
                            scoring=to_score).fit(X_train, y_train)

    display_name = ds.get_names()[name]
    performance = pd.DataFrame()
    variations = linear.get_model_variants(SVR, model_cv)
    for variation in variations:
        preds = variation.fit(X_train, y_train).predict(X_test)
        performance = pd.concat([
            performance,
            metrics.apply_metrics(
                '{} {} Support Vector Machine'.format(display_name, cv_type),
                y_test, preds)
        ],
                                axis=0)

    return model_cv, performance
コード例 #18
0
def gradient_boosting_randomized_cv(name, n_iter=50, cv=5, save=True):
    """Conducts a randomized search of cross validation for given parameters of Gradient Boosting and returns results.

    Fix: added random_state=18 to the search so runs are reproducible,
    matching every other *_randomized_cv in this module.  The best model
    (by negative MAE) is refit and scored on the test set; with
    ``save=True`` the cv results table is written to disk.
    """

    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    param_grid = {
        'loss': ['ls', 'lad', 'huber'],
        # NOTE(review): the distribution includes learning_rate == 0 —
        # confirm GradientBoostingRegressor accepts a zero learning rate.
        'learning_rate': np.append(np.array([0]), np.geomspace(1e-6, 1, 50)),
        'n_estimators': np.linspace(500, 1000, 50, dtype=int),
        'min_samples_split': [2, 4, 8, 16, 32, 64, 128, 256],
        'min_samples_leaf': [1, 2, 4, 8, 16, 32, 64, 128],
        'max_depth': [2, 3, 4, 5, 10, 15],
        'alpha': np.linspace(1e-6, 1, 25),
        'max_features': np.linspace(2, len(X_train.columns), num=50,
                                    dtype=int),
    }

    gradient_boosting = GradientBoostingRegressor()
    gradient_boosting_cv = RandomizedSearchCV(
        estimator=gradient_boosting,
        n_jobs=-1,
        pre_dispatch=16,
        param_distributions=param_grid,
        n_iter=n_iter,
        cv=cv,
        refit=True,
        random_state=18,
        scoring='neg_mean_absolute_error').fit(X_train, y_train)

    display_name = ds.get_names()[name]
    performance = metrics.apply_metrics(
        '{} Gradient Boosting'.format(display_name), y_test,
        gradient_boosting_cv.predict(X_test), y_train)
    performance['Tuning Parameters'] = [gradient_boosting_cv.best_params_]

    if save:
        to_save = Path().resolve().joinpath('models',
                                            'cross_validation_outcomes',
                                            'ensemble', 'gradient_boosting',
                                            '{}.csv'.format('randomized'))
        results = pd.DataFrame.from_dict(gradient_boosting_cv.cv_results_)
        results.to_csv(to_save)

    return gradient_boosting_cv, performance
コード例 #19
0
def gradient_boosting_grid_cv(name, cv=5, save=True):
    """Conducts a grid search over all possible combinations of given parameters and returns the result

    Uses parameters closely clustered around the best randomized search results.
    Also returns back best fitted model by specified criteria (MAE).
    """

    X_train, X_test, y_train, y_test, train = split.split_subset(name)

    search_space = {
        'loss': ['ls', 'lad', 'huber'],
        'learning_rate': np.geomspace(1e-6, 0.1, 5),
        'n_estimators': [900],
        'min_samples_split': [4, 64, 128, 256],
        'min_samples_leaf': [8, 128],
        'max_depth': [4, 5, 15],
        'alpha': np.linspace(0.1, 1, 5),
        'max_features': [3, 40, 60]
    }

    estimator = GradientBoostingRegressor(random_state=18)
    gradient_boosting_cv = GridSearchCV(n_jobs=-1,
                                        estimator=estimator,
                                        param_grid=search_space,
                                        cv=cv,
                                        refit=True,
                                        scoring='neg_mean_absolute_error',
                                        pre_dispatch=16).fit(X_train, y_train)

    label = ds.get_names()[name]
    performance = metrics.apply_metrics(
        '{} Gradient Boosting'.format(label), y_test,
        gradient_boosting_cv.predict(X_test), y_train)
    performance['Tuning Parameters'] = [gradient_boosting_cv.best_params_]

    if save:
        to_save = Path().resolve().joinpath('models',
                                            'cross_validation_outcomes',
                                            'ensemble', 'gradient_boosting',
                                            'grid.csv')
        pd.DataFrame.from_dict(gradient_boosting_cv.cv_results_).to_csv(to_save)

    return gradient_boosting_cv, performance
コード例 #20
0
def extra_trees_grid_cv(name, cv=5, save=True):
    """Grid-search ExtraTrees around the best randomized-search parameters.

    The best model (by negative MAE) is refit and scored on the test set.
    With ``save=True`` the cv results table is written under the models
    directory.
    """

    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    # Grid values for bootstrap/max_features override these constructor
    # settings during the search.
    base_estimator = ExtraTreesRegressor(n_jobs=-1,
                                         random_state=18,
                                         max_features=None,
                                         bootstrap=False)

    search_space = {
        'n_estimators': [250],
        'max_depth': [20, 35],
        'bootstrap': [True, False],
        'max_features': [30, 45, 80],
        'min_samples_split': [2, 8, 16],
        'min_samples_leaf': [1, 2]
    }

    extra_trees_cv = GridSearchCV(n_jobs=-1,
                                  estimator=base_estimator,
                                  param_grid=search_space,
                                  pre_dispatch=16,
                                  cv=cv,
                                  refit=True,
                                  scoring='neg_mean_absolute_error').fit(
                                      X_train, y_train)

    label = ds.get_names()[name]
    performance = metrics.apply_metrics('{} Extra Trees'.format(label),
                                        y_test, extra_trees_cv.predict(X_test),
                                        y_train)
    performance['Tuning Parameters'] = [extra_trees_cv.best_params_]

    if save:
        to_save = Path().resolve().joinpath('models',
                                            'cross_validation_outcomes',
                                            'ensemble', 'extra_trees',
                                            'grid.csv')
        pd.DataFrame.from_dict(extra_trees_cv.cv_results_).to_csv(to_save)

    return extra_trees_cv, performance
コード例 #21
0
def pca_cv(name, save=False):
    '''Score OLS on PCA-reduced numerical features for every dimension count.

    For each number of retained components (1 .. num_numerical-1) the
    numerical columns are PCA-reduced, re-joined with the categorical
    columns, and an OLS fit is scored on the test set.  Returns the metrics
    DataFrame and a list of the unexplained variance fraction per run.

    Fix: the unexplained-variance figure used `explained_variance_`
    (raw eigenvalues, which do not sum to 1), producing meaningless
    values; it now uses `explained_variance_ratio_`.
    '''

    display_name = ds.get_names()[name]
    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    num_numerical = ds.get_number_numerical()[name]
    X_train_s, X_test_s = split.standardize(name, X_train, X_test)
    # Numerical columns occupy the leading block of the frame.
    X_train_s_numerical = X_train_s.iloc[:, 0:num_numerical]
    X_train_s_categorical = X_train_s.iloc[:, num_numerical:]
    X_test_s_numerical = X_test_s.iloc[:, 0:num_numerical]
    X_test_s_categorical = X_test_s.iloc[:, num_numerical:]
    df = pd.DataFrame()
    ols = LinearRegression()
    ev = []
    for i in np.arange(1, num_numerical):
        pca = PCA(i, random_state=18)
        X_train_s_numerical_reduced = pd.DataFrame(
            pca.fit_transform(X_train_s_numerical),
            index=X_train_s_categorical.index)
        X_test_s_numerical_reduced = pd.DataFrame(
            pca.transform(X_test_s_numerical),
            index=X_test_s_categorical.index)
        X_train_s = pd.concat(
            [X_train_s_numerical_reduced, X_train_s_categorical], axis=1)
        X_test_s = pd.concat(
            [X_test_s_numerical_reduced, X_test_s_categorical], axis=1)

        model = ols.fit(X_train_s, y_train)
        preds = model.predict(X_test_s)
        preds = metrics.apply_metrics(
            '{}: {} dimensions'.format(display_name, i), y_test, preds.ravel(),
            y_train)
        df = pd.concat([df, preds], axis=0)
        # Fraction of total variance NOT captured by the retained components.
        ev.append(1 - sum(pca.explained_variance_ratio_))

    if save:
        to_save = Path().resolve().joinpath('features', 'pca',
                                            '{}.csv'.format(name))
        df.to_csv(to_save)

    return df, ev
コード例 #22
0
def random_forest_grid_cv(name, cv=5, save=True):
    '''Conducts a grid search over all possible combinations of given parameters and returns result.

    Uses parameters closely clustered around the best randomized search results.
    Also returns back best fitted model by specified criteria (MAE).
    '''

    label = ds.get_names()[name]
    X_train, X_test, y_train, y_test, train = split.split_subset(name)

    search_space = {
        'n_estimators': [400],
        'max_depth': [30, 50, 90],
        'bootstrap': [False],
        'max_features': [30, 40, 50],
        'min_samples_split': [4],
        'min_samples_leaf': [1, 2, 8]
    }

    rf_cv = GridSearchCV(n_jobs=-1,
                         estimator=RandomForestRegressor(),
                         param_grid=search_space,
                         scoring='neg_mean_absolute_error',
                         pre_dispatch=16,
                         refit=True,
                         cv=cv).fit(X_train, y_train)

    performance = metrics.apply_metrics('{} Random Forest'.format(label),
                                        y_test, rf_cv.predict(X_test),
                                        y_train)
    performance['Tuning Parameters'] = [rf_cv.best_params_]

    if save:
        to_save = Path().resolve().joinpath('models',
                                            'cross_validation_outcomes',
                                            'ensemble', 'random_forest',
                                            'grid.csv')
        pd.DataFrame.from_dict(rf_cv.cv_results_).to_csv(to_save)

    return rf_cv, performance
コード例 #23
0
def lasso(name, cv=5):
    '''Outputs a fitted Lasso Regression Model with a penalty term tuned through cross validation.

    Inputs must be standardized.
    cv: number of folds in cross validation (default 5).
    n_jobs = -1 allows for all local processors to be utilized.
    Returns the fitted model, a one-row performance DataFrame, and the
    fitted coefficient array.
    '''
    display_name = ds.get_names()[name]
    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train, X_test = split.standardize(X_train, X_test)
    # BUGFIX: cv was hard-coded to 5 here, silently ignoring the cv argument.
    model = LassoCV(n_alphas=50,
                    verbose=0,
                    cv=cv,
                    n_jobs=-1,
                    copy_X=True,
                    tol=1e-3,
                    random_state=18).fit(X_train, y_train)
    performance = metrics.apply_metrics('{} Lasso'.format(display_name),
                                        y_test, model.predict(X_test), y_train)
    performance['Tuning Parameters'] = [{'Alpha': model.alpha_}]
    params = model.coef_
    return model, performance, params
コード例 #24
0
def adaboost_grid_cv(name, cv=5, save=True):
    '''Conducts a grid search over all possible combinations of given parameters and returns the result.

	Uses parameters closely clustered around the best randomized search results.
	Returns the fitted GridSearchCV object and a one-row performance frame.
	'''

    X_train, X_test, y_train, y_test, train = split.split_subset(name)

    # Candidate values centred on the randomized-search winners.
    grid = {
        'base_estimator': [DecisionTreeRegressor(max_depth=5)],
        'n_estimators': [250],
        'loss': ['linear', 'exponential'],
        'learning_rate': np.geomspace(1e-6, 0.2, 20)
    }

    searcher = GridSearchCV(estimator=AdaBoostRegressor(),
                            param_grid=grid,
                            scoring='neg_mean_absolute_error',
                            refit=True,
                            cv=cv,
                            n_jobs=-1,
                            pre_dispatch=16)
    searcher.fit(X_train, y_train)

    display_name = ds.get_names()[name]
    performance = metrics.apply_metrics('{} AdaBoost'.format(display_name),
                                        y_test, searcher.predict(X_test),
                                        y_train)
    performance['Tuning Parameters'] = [searcher.best_params_]

    if save:
        out_path = Path().resolve().joinpath('models',
                                             'cross_validation_outcomes',
                                             'ensemble', 'adaboost',
                                             '{}.csv'.format('grid'))
        pd.DataFrame.from_dict(searcher.cv_results_).to_csv(out_path)

    return searcher, performance
コード例 #25
0
def multi_layer_network_performance(name, save=True):
    """Refit the best multi-layer networks found by randomized search and score them.

	Reads the saved randomized-search CV results for ``name``, takes the
	parameter sets ranked first on R^2, MAE and RMSE (de-duplicated), refits
	one network per unique set and evaluates it on the held-out test split.
	Set ``save=True`` to also write the metrics table to models/neural_network/.

	Returns
	-------
	pandas.DataFrame
		One row of test-set metrics per unique best-parameter set.
	"""

    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    # NOTE(review): other helpers in this file call split.standardize WITHOUT
    # the leading ``name`` argument -- confirm which signature is current.
    X_train, X_test = split.standardize(name, X_train, X_test)

    def multi_layer_network(neurons_l1,
                            neurons_ol,
                            num_layers,
                            shape,
                            learning_rate,
                            reg_l1,
                            reg_ol,
                            epochs,
                            batch_size,
                            X_train=X_train,
                            y_train=y_train):
        """Build, compile and fit one fully-connected regression network."""

        net = Sequential()
        # First hidden layer (L2-regularized ReLU).
        net.add(
            Dense(neurons_l1,
                  activation='relu',
                  input_shape=shape,
                  kernel_regularizer=regularizers.l2(reg_l1)))
        # ``num_layers`` extra hidden layers; width/penalty chosen per layer.
        for i in np.arange(num_layers):
            net.add(
                Dense(neurons_ol[i],
                      activation='relu',
                      input_shape=shape,
                      kernel_regularizer=regularizers.l2(reg_ol[i])))
        net.add(Dense(1, activation='linear'))
        optimizer = Adam(learning_rate=learning_rate)
        net.compile(optimizer=optimizer, loss='mean_squared_error')
        net.fit(X_train,
                y_train,
                epochs=epochs,
                batch_size=batch_size,
                workers=7,
                use_multiprocessing=True)
        return net

    results = pd.read_csv(Path().resolve().joinpath(
        'models', 'cross_validation_outcomes', 'neural_network',
        '{}_{}_{}.csv'.format(name, 'multi_layer_network', 'randomized')),
                          index_col=0)

    # Parameter sets ranked first under each scoring metric.
    # NOTE(review): after a to_csv/read_csv round trip the 'params' column may
    # hold string reprs rather than dicts -- verify before relying on **item.
    bestr2 = results.loc[results['rank_test_$R^2$'] == 1, 'params'].values[0]
    bestmae = results.loc[results['rank_test_Mean Absolute Error'] == 1,
                          'params'].values[0]
    bestrmse = results.loc[results['rank_test_Root Mean Square Error'] == 1,
                           'params'].values[0]

    display_name = ds.get_names()[name]
    dict_list = [bestr2, bestmae, bestrmse]
    # De-duplicate identical parameter dicts before refitting.
    unique_dict_list = [
        dict(t) for t in {tuple(sorted(d.items()))
                          for d in dict_list}
    ]
    performance = pd.DataFrame()
    for item in unique_dict_list:
        preds = multi_layer_network(**item).predict(X_test)
        performance = pd.concat([
            performance,
            # Bug fix: label previously said 'Single Layer NN' (copy/paste
            # from the single-layer helper); this evaluates the multi-layer net.
            metrics.apply_metrics('{} Multi Layer NN'.format(display_name),
                                  y_test, preds)
        ],
                                axis=0)

    if save:
        to_save = Path().resolve().joinpath(
            'models', 'neural_network', '{}.csv'.format('multi_layer_network'))
        performance.to_csv(to_save)

    # Bug fix: previously nothing was returned, so the metrics were lost
    # whenever save=False.
    return performance
コード例 #26
0
def multi_layer_network_randomized_cv(name, n_iter=30, cv=5, save=True):
    """Randomized cross-validated hyper-parameter search for the multi-layer network.

	Samples ``n_iter`` configurations with ``cv`` folds on the training split
	of ``name`` and optionally saves the full ``cv_results_`` table.
	Returns the fitted (unrefit) RandomizedSearchCV object.
	"""

    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train, X_test = split.standardize(name, X_train, X_test)
    scorers = metrics.create_metrics()[0]

    # Shared candidate pools for layer widths and L2 penalties.
    layer_widths = range(5, 275, 5)
    reg = list(np.geomspace(1e-6, 5, 49))
    search_space = {
        'shape': [(X_train.shape[1], )],
        'neurons_l1': np.arange(5, 275, 5),
        'neurons_ol': [layer_widths] * 5,
        'batch_size': [4, 8, 16, 32, 64, 128, 256, 512, 1028],
        'epochs': [25, 50, 100],
        'num_layers': [0, 1, 2, 3, 4, 5],
        'learning_rate': np.linspace(0.1, 1, 20),
        'reg_l1': np.append(np.array([0]), np.geomspace(1e-6, 5, 49)),
        'reg_ol': [[0] + reg] * 6,
        'shuffle': [False, True]
    }

    # Parameter names must match the search-space keys: KerasRegressor routes
    # matching entries to this builder and the rest (epochs, batch_size,
    # shuffle) to fit().
    def build_network(neurons_l1, neurons_ol, num_layers, shape, learning_rate,
                      reg_l1, reg_ol):
        """Construct and compile one fully-connected regression network."""

        model = Sequential()
        model.add(
            Dense(neurons_l1,
                  activation='relu',
                  input_shape=shape,
                  kernel_regularizer=regularizers.l2(reg_l1)))
        for layer in np.arange(num_layers):
            model.add(
                Dense(neurons_ol[layer],
                      activation='relu',
                      input_shape=shape,
                      kernel_regularizer=regularizers.l2(reg_ol[layer])))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer=Adam(learning_rate=learning_rate),
                      loss='mean_squared_error')
        return model

    regressor = KerasRegressor(build_fn=build_network, verbose=0)
    searcher = RandomizedSearchCV(estimator=regressor,
                                  param_distributions=search_space,
                                  n_jobs=-1,
                                  pre_dispatch=16,
                                  refit=False,
                                  cv=cv,
                                  scoring=scorers,
                                  n_iter=n_iter)
    searcher.fit(X_train, y_train)

    if save:
        out_path = Path().resolve().joinpath(
            'models', 'cross_validation_outcomes', 'neural_network',
            '{}_{}_{}.csv'.format(name, 'multi_layer_network', 'randomized'))
        pd.DataFrame.from_dict(searcher.cv_results_).to_csv(out_path)

    return searcher
コード例 #27
0
def create_cluster_plots(name, save = False): 
	'''Draw a KMeans scree plot and a silhouette plot side by side for one dataset.

	Left panel: inertia (sum of squared distances) for 2-30 clusters, with
	each point annotated by its cluster count.  Right panel: per-sample
	silhouette widths for a fixed cluster count (7, or 9 for dataset_3),
	with the average silhouette score marked by a dashed red line.
	Set ``save=True`` to also write the figure to visualizations/clustering/.
	'''

	X_train,X_test = split.split_subset(name)[0:2]
	# Numerical features are assumed to occupy the first ``num_numerical``
	# columns -- TODO(review): confirm this column ordering holds.
	num_numerical = ds.get_number_numerical()[name]
	X_train_numerical = X_train.iloc[:,0:num_numerical]
	X_test_numerical = X_test.iloc[:,0:num_numerical]
	distortions = []
	cluster_range = range(2,31)
	# Fit one KMeans model per candidate cluster count for the scree plot.
	for clusters in cluster_range:
		kmean = kmeans(name, clusters, X_train, X_test)
		distortions.append(kmean.inertia_)

	fig, ax = plt.subplots(ncols = 2, figsize=(40, 12))
	ax[0].plot(cluster_range, distortions, marker = 'o')
	ax[0].set_title('KMeans Scree Plot', fontsize = 40)
	ax[0].set_xlabel('Number of Clusters', fontsize = 30)
	ax[0].set_ylabel('Sum of Squared Distances', fontsize = 30)
	ax[0].tick_params(labelsize=20)
	# Label every scree point with its cluster count.
	for i, txt in enumerate(cluster_range):
		annot = ax[0].annotate('{}'.format(txt), (cluster_range[i],distortions[i]))
		annot.set_fontsize(25)

	X_train_s_numerical = split.standardize(name, X_train_numerical, X_test_numerical)[0]
	# Cluster count chosen per dataset (presumably from earlier scree analysis).
	clusters = 7
	if name == 'dataset_3':
		clusters = 9
	ax[1].set_xlim([-0.3, 0.8])
	# Vertical extent leaves a 10-sample gap between consecutive clusters.
	ax[1].set_ylim([0, len(X_train_s_numerical) + (clusters + 1) * 10])

	# NOTE(review): the model is built via kmeans(name, clusters, X_train,
	# X_test) but predicts on the standardized numerical columns -- confirm
	# it is trained on the same feature space it predicts on.
	cluster_labels = kmeans(name, clusters, X_train, X_test).predict(X_train_s_numerical)
	silhouette_avg = silhouette_score(X_train_s_numerical, cluster_labels)

	cluster_silhouette = silhouette_samples(X_train_s_numerical, cluster_labels)

	# Stack each cluster's sorted silhouette values into its own band.
	y_lower = 10
	for i in range(clusters):
		ith_cluster_silhouette_values = cluster_silhouette[cluster_labels == i]
		ith_cluster_silhouette_values.sort()

		size_cluster_i = ith_cluster_silhouette_values.shape[0]
		y_upper = y_lower + size_cluster_i

		color = cm.nipy_spectral(float(i) / clusters)
		ax[1].fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values, 
							facecolor=color, edgecolor=color, alpha=0.7)

		# Cluster index written at the vertical middle of its band.
		ax[1].text(-0.05, y_lower + 0.5 * size_cluster_i, str(i), fontsize = 25)

		y_lower = y_upper + 10  

	ax[1].set_title("Silhouette plot for {} clusters.".format(clusters) , fontsize = 40)
	ax[1].set_xlabel("Silhouette Coefficient Values",fontsize = 30)
	ax[1].set_ylabel("Cluster label",fontsize = 30)

	# Dashed red line marks the mean silhouette score across all samples.
	ax[1].axvline(x=silhouette_avg, color="red", linestyle="--")

	ax[1].set_yticks([]) 
	ax[1].set_xticks(np.arange(-0.3,0.9,0.1))
	ax[1].tick_params(labelsize=20)

	plt.tight_layout()
	plt.show()

	if save:
		to_save = Path().resolve().joinpath('visualizations', 'clustering', '{}.png'.format(name))
		fig.savefig(to_save, dpi = 300)