Python standardizeの例、src.data.train_test_split.standardize Pythonの例

コード例 #1

0

ファイルを表示

ファイル: other.py プロジェクト: wyattowalsh/ieor_142_project

def k_neighbors_randomized_cv(name, n_iter=50, cv=5):
    """

	"""

    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train = split.standardize(name, X_train)
    X_test = split.standardize(name, X_test)
    to_score = metrics.create_metrics()[0]
    param_grid = {
        'n_neighbors': np.arange(2, 50, 2, dtype=int),
        'weights': ['uniform', 'distance'],
        'leaf_size': [2, 4, 8, 16, 32, 64, 128, 256]
    }

    model = KNeighborsRegressor(n_jobs=-1)
    model_cv = RandomizedSearchCV(estimator=model,
                                  param_distributions=param_grid,
                                  n_jobs=-1,
                                  pre_dispatch=16,
                                  n_iter=n_iter,
                                  cv=cv,
                                  scoring=to_score,
                                  refit=False,
                                  random_state=18).fit(X_train, y_train)

    return model_cv

コード例 #2

0

ファイルを表示

ファイル: other.py プロジェクト: wyattowalsh/ieor_142_project

def svr_randomized_cv(name, n_iter=25, cv=5):
    """

	"""

    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train = split.standardize(name, X_train)
    X_test = split.standardize(name, X_test)
    to_score = metrics.create_metrics()[0]
    param_grid = {
        'kernel': ['poly', 'rbf'],
        'degree': np.arange(2, 6),
        'gamma': ['scale', 'auto'],
        'C': np.linspace(1e-5, 5, 20),
        'epsilon': np.linspace(0, 1, 20),
        'shrinking': [True, False]
    }

    model = SVR()
    model_cv = RandomizedSearchCV(estimator=model,
                                  param_distributions=param_grid,
                                  n_jobs=-1,
                                  pre_dispatch=16,
                                  n_iter=n_iter,
                                  cv=cv,
                                  scoring=to_score,
                                  random_state=18,
                                  refit=False).fit(X_train, y_train)

    return model_cv

コード例 #3

0

ファイルを表示

ファイル: decomposition.py プロジェクト: wyattowalsh/ieor_142_project

def pca(name, X_train, X_test, dimension):
    """

	"""

    X_train = X_train.copy()
    X_test = X_test.copy()
    num_numerical = ds.get_number_numerical(name)
    X_train_s = split.standardize(name, X_train)
    X_test_s = split.standardize(name, X_test)
    X_train_s_numerical = X_train_s.iloc[:, 0:num_numerical]
    X_train_s_categorical = X_train_s.iloc[:, num_numerical:]
    X_test_s_numerical = X_test_s.iloc[:, 0:num_numerical]
    X_test_s_categorical = X_test_s.iloc[:, num_numerical:]
    estimator = PCA(dimension)
    X_train_s_numerical_reduced = pd.DataFrame(
        estimator.fit_transform(X_train_s_numerical),
        index=X_train_s_categorical.index)
    X_test_s_numerical_reduced = pd.DataFrame(
        estimator.transform(X_test_s_numerical),
        index=X_test_s_categorical.index)
    X_train_s = pd.concat([X_train_s_numerical_reduced, X_train_s_categorical],
                          axis=1)
    X_test_s = pd.concat([X_test_s_numerical_reduced, X_test_s_categorical],
                         axis=1)
    return X_train_s, X_test_s

コード例 #4

0

ファイルを表示

ファイル: neural_networks.py プロジェクト: wyattowalsh/ieor_142_project

def single_layer_network_grid_cv(name, cv=5, save=True):

    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train, X_test = split.standardize(X_train, X_test)

    param_grid = {
        'shape': [(X_train.shape[1], )],
        'neurons': np.arange(230),
        'batch_size': [1028],
        'epochs': [50],
        'reg': np.geomspace(1e-4, 2, 25),
        'if_reg': [True],
        'shuffle': [True]
    }

    # param_grid = {'shape' : [(X_train.shape[1],)],
    # 'neurons': np.arange(20,275,5),
    # 'batch_size': [1028],
    # 'epochs': [50],
    # 'learning_rate': np.linspace(0.1,1,50),
    # 'reg': np.geomspace(1e-8, 2, 50),
    # 'if_reg': [True],
    # 'shuffle': [False, True]}

    def single_layer_network(shape, learning_rate, reg, if_reg):
        """

		"""

        net = Sequential()
        if if_reg:
            net.add(
                Dense(45,
                      activation='relu',
                      input_shape=shape,
                      kernel_regularizer=regularizers.l2(reg)))
        else:
            net.add(Dense(45, activation='relu', input_shape=shape))
        net.add(Dense(1, activation='linear'))
        optimizer = Adam(learning_rate=learning_rate)
        net.compile(optimizer=optimizer, loss='mean_squared_error')
        return net

    net = KerasRegressor(build_fn=single_layer_network, verbose=0)
    net_cv = GridSearchCV(estimator=net,
                          param_grid=param_grid,
                          n_jobs=-1,
                          pre_dispatch=16,
                          refit=True,
                          cv=cv,
                          scoring='neg_mean_absolute_error').fit(
                              X_train, y_train)

    display_name = ds.get_names()[name]
    performance = metrics.apply_metrics(
        '{} Single Layer Neural Network'.format(display_name), y_test,
        net_cv.predict(X_test), y_train)
    performance['Tuning Parameters'] = [net_cv.best_params_]

    return net_cv

コード例 #5

0

ファイルを表示

ファイル: linear.py プロジェクト: wyattowalsh/ieor_142_project

def elastic_net(name, cv=5):
    '''Outputs a fitted Elastic Net Regression Model with tuning parameters found through cross validation.
	
	Inputs must be standardized.
	l1_ratios are spread out on a log scale as recommended by package authors.
	Number of folds in cross validation is by default 5.
	n_jobs = -1 allows for all local processors to be utilized.
	# '''
    # if np.any(X_train.mean(axis = 0) > 1):
    # 	raise ValueError('Numerical features must be standardized')

    display_name = ds.get_names()[name]
    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train, X_test = split.standardize(X_train, X_test)
    l1_ratios = np.geomspace(1e-8, 1, 50)
    model = ElasticNetCV(l1_ratio=l1_ratios,
                         n_alphas=50,
                         cv=5,
                         verbose=0,
                         n_jobs=-1,
                         random_state=18).fit(X_train, y_train)

    performance = metrics.apply_metrics('{} Elastic Net'.format(display_name),
                                        y_test, model.predict(X_test), y_train)
    performance['Tuning Parameters'] = [{
        'Alpha': model.alpha_,
        'L1 Ratio': model.l1_ratio_
    }]
    return model, performance

コード例 #6

0

ファイルを表示

ファイル: clustering.py プロジェクト: wyattowalsh/ieor_142_project

def silhouettes(name, X_train, max_clusters = 15, min_clusters = 5, save = False):
	'''

	'''

	X_train = X_train.copy()
	num_numerical = ds.get_number_numerical(name)
	X_train_s_numerical = split.standardize(name, X_train).iloc[:,0:num_numerical]
	cluster_range = range(min_clusters,max_clusters+1)
	for clusters in cluster_range:
		fig, ax = plt.subplots()

		fig.set_size_inches(18, 7)

		# The 1st subplot is the silhouette plot
		# The silhouette coefficient can range from -1, 1 but in this example all
		# lie within [-0.1, 1]
		ax.set_xlim([-0.7, 1])
		# The (n_clusters+1)*10 is for inserting blank space between silhouette
		# plots of individual clusters, to demarcate them clearly.
		ax.set_ylim([0, len(X_train_s_numerical) + (clusters + 1) * 10])

		cluster_labels = kmeans(name, clusters, X_train_s_numerical).predict(X_train_s_numerical)
		silhouette_avg = silhouette_score(X_train_s_numerical, cluster_labels)
		print("For n_clusters =", clusters, "The average silhouette_score is :", silhouette_avg)

		cluster_silhouette = silhouette_samples(X_train_s_numerical, cluster_labels)

		y_lower = 10
		for i in range(clusters):
			ith_cluster_silhouette_values = cluster_silhouette[cluster_labels == i]
			ith_cluster_silhouette_values.sort()

			size_cluster_i = ith_cluster_silhouette_values.shape[0]
			y_upper = y_lower + size_cluster_i

			color = cm.nipy_spectral(float(i) / clusters)
			ax.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values, 
								facecolor=color, edgecolor=color, alpha=0.7)

			# Label the silhouette plots with their cluster numbers at the middle
			ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

			# Compute the new y_lower for next plot
			y_lower = y_upper + 10  # 10 for the 0 samples

		ax.set_title("The silhouette plot for the various clusters.")
		ax.set_xlabel("The silhouette coefficient values")
		ax.set_ylabel("Cluster label")

		# The vertical line for average silhouette score of all the values
		ax.axvline(x=silhouette_avg, color="red", linestyle="--")

		ax.set_yticks([])  # Clear the yaxis labels / ticks
		ax.set_xticks(np.arange(-0.6,1.1,0.2))
		plt.show()

	if save:
		to_save = Path().resolve().joinpath('data', 'visualizations', '{}_elbow.png'.format(name))
		fig.savefig(to_save)

コード例 #7

0

ファイルを表示

ファイル: neural_networks.py プロジェクト: wyattowalsh/ieor_142_project

def single_layer_network_randomized_cv(name, n_iter=20, cv=5, save=True):

    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train, X_test = split.standardize(X_train, X_test)
    param_grid = {
        'shape': [(X_train.shape[1], )],
        'batch_size': [256, 512, 1028, 2056],
        'epochs': [25, 50],
        'learning_rate': [1e-4, 1e-3, 1e-2, 1e-1, 1, 10]
    }

    # 'reg': np.linspace(1e-4, 750, 500),
    # 'if_reg': [True],
    # 'shuffle': [False,True]}
    #kernel_regularizer=regularizers.l2(reg))

    def single_layer_network(shape, learning_rate):
        """

		"""

        net = Sequential()
        net.add(Dense(45, activation='relu', input_shape=shape))
        net.add(Dense(1, activation='linear'))
        optimizer = Adam(learning_rate=learning_rate)
        net.compile(optimizer=optimizer, loss='mean_squared_error')
        return net

    net = KerasRegressor(build_fn=single_layer_network,
                         verbose=0,
                         workers=8,
                         use_multiprocessing=True)
    net_cv = RandomizedSearchCV(estimator=net,
                                param_distributions=param_grid,
                                n_jobs=-1,
                                pre_dispatch=16,
                                refit=True,
                                cv=cv,
                                scoring='neg_mean_absolute_error',
                                n_iter=n_iter,
                                random_state=18).fit(X_train, y_train)

    display_name = ds.get_names()[name]
    performance = metrics.apply_metrics(
        '{} Single Layer Neural Network'.format(display_name), y_test,
        net_cv.predict(X_test), y_train)
    performance['Tuning Parameters'] = [net_cv.best_params_]

    if save:
        to_save_cv = Path().resolve().joinpath('models',
                                               'cross_validation_outcomes',
                                               'neural_network',
                                               '{}.csv'.format(name))
        results = pd.DataFrame.from_dict(net_cv.cv_results_)
        results.to_csv(to_save_cv)
        to_save_perf = Path().resolve().joinpath(
            'models', 'neural_network', '{}_performance.csv'.format(name))
        performance.to_csv(to_save_perf)

    return net_cv, performance

コード例 #8

0

ファイルを表示

ファイル: linear.py プロジェクト: wyattowalsh/ieor_142_project

def support_vector_machine(name, cv=5):
    """

	"""

    display_name = ds.get_names()[name]
    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train, X_test = split.standardize(X_train, X_test)
    to_score, scoring = metrics.create_metrics()
    param_grid = {
        'epsilon': np.linspace(-2, 2, 4),
        'fit_intercept': [True],
        'C': np.linspace(1e6, 1e10, 50),
        'loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'],
        'dual': [False],
        'random_state': [18]
    }
    model = LinearSVR()
    model_cv = GridSearchCV(model,
                            param_grid=param_grid,
                            scoring='neg_mean_absolute_error',
                            n_jobs=-1,
                            pre_dispatch=16,
                            cv=cv,
                            refit=True).fit(X_train, y_train)
    performance = metrics.apply_metrics(
        '{} Linear Support Vector Machine'.format(display_name), y_test,
        model_cv.predict(X_test), y_train)
    performance['Tuning Parameters'] = [model_cv.best_params_]
    return model_cv, performance

コード例 #9

0

ファイルを表示

ファイル: linear.py プロジェクト: wyattowalsh/ieor_142_project

def huber(name, cv=5):
    '''

	'''

    display_name = ds.get_names()[name]
    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train, X_test = split.standardize(X_train, X_test)
    to_score, scoring = metrics.create_metrics()
    param_grid = {
        'epsilon': np.linspace(1 + 1e-15, 1.2, 10),
        'alpha': np.linspace(1e-8, 2, 10)
    }
    model = HuberRegressor()
    model_cv = GridSearchCV(model,
                            param_grid=param_grid,
                            scoring='neg_mean_absolute_error',
                            n_jobs=-1,
                            pre_dispatch=16,
                            cv=cv,
                            refit=True).fit(X_train, y_train)
    performance = metrics.apply_metrics('{} Huber'.format(display_name),
                                        y_test, model_cv.predict(X_test),
                                        y_train)
    performance['Tuning Parameters'] = [model_cv.best_params_]

    return model_cv, performance

コード例 #10

0

ファイルを表示

ファイル: other.py プロジェクト: wyattowalsh/ieor_142_project

def svr_grid_cv(name, standardize=False, cv=5):
    """

	"""

    if cv == 5:
        cv_type = 'K-Fold'
    else:
        cv_type = "Time Series Split"

    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    if standardize:
        X_train = split.standardize(name, X_train)
        X_test = split.standardize(name, X_test)
    to_score = metrics.create_metrics()[0]
    param_grid = {
        'kernel': ['poly', 'rbf', 'sigmoid'],
        'degree': np.arange(3, 9),
        'gamma': ['scale', 'auto'],
        'C': [2e-5, 2e-3, 2e-1, 2e1, 2e3, 2e5, 2e7, 2e9, 2e11]
    }

    model = SVR(n_jobs=-1)
    model_cv = GridSearchCV(estimator=model,
                            param_distributions=param_grid,
                            n_jobs=-1,
                            pre_dispatch=16,
                            refit=False,
                            cv=cv,
                            scoring=to_score).fit(X_train, y_train)

    display_name = ds.get_names()[name]
    performance = pd.DataFrame()
    variations = linear.get_model_variants(KNeighborsRegressor, model_cv)
    for variation in variations:
        model = variation.fit(X_train, y_train).predict(X_test)
        performance = pd.concat([
            performance,
            metrics.apply_metrics(
                '{} {} Support Vector Machine'.format(display_name, cv_type),
                y_test, model)
        ],
                                axis=0)

    return model_cv, performance

コード例 #11

0

ファイルを表示

ファイル: clustering.py プロジェクト: wyattowalsh/ieor_142_project

def kmeans(name, n_clusters, X_train, X_test):
	'''

	'''

	X_train = X_train.copy()
	X_test = X_test.copy()
	num_numerical = ds.get_number_numerical()[name]
	X_train_s_numerical = split.standardize(name, X_train, X_test)[0].iloc[:,0:num_numerical]
	return KMeans(n_clusters= n_clusters, random_state=18, n_jobs = -1).fit(X_train_s_numerical)

コード例 #12

0

ファイルを表示

ファイル: linear.py プロジェクト: wyattowalsh/ieor_142_project

def ridge(name, cv=5):
    '''Outputs a fitted Ridge Regression Model with a penalty term tuned through cross validation.

	'''

    display_name = ds.get_names()[name]
    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train, X_test = split.standardize(X_train, X_test)
    alphas = np.linspace(500, 750, 50)
    model = RidgeCV(alphas=alphas, fit_intercept=True,
                    cv=cv).fit(X_train, y_train)
    performance = metrics.apply_metrics('{} Ridge'.format(display_name),
                                        y_test, model.predict(X_test), y_train)
    performance['Tuning Parameters'] = [{'Alpha': model.alpha_}]
    return model, performance

コード例 #13

0

ファイルを表示

ファイル: other.py プロジェクト: wyattowalsh/ieor_142_project

def k_neighbors_grid_cv(name, cv=5, save=True):
    '''Conducts a grid search over all possible combinations of given parameters and returns result.

	Uses parameters closely clustered around the best randomized search results.
	Also returns back best fitted model by specified criteria (MAE).
	'''

    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train, X_test = split.standardize(name, X_train, X_test)
    to_score = metrics.create_metrics()[0]
    param_grid = {
        'n_neighbors': np.arange(20, 51, 2, dtype=int),
        'weights': ['distance'],
        'leaf_size': [8, 16, 128, 256]
    }

    model = KNeighborsRegressor(n_jobs=-1)
    model_cv = GridSearchCV(n_jobs=-1,
                            estimator=model,
                            param_grid=param_grid,
                            scoring=to_score,
                            pre_dispatch=16,
                            refit=False,
                            cv=cv).fit(X_train, y_train)

    display_name = ds.get_names()[name]
    performance = pd.DataFrame()
    variations = linear.get_model_variants(KNeighborsRegressor, model_cv)
    for variation in variations:
        model = variation.fit(X_train, y_train).predict(X_test)
        performance = pd.concat([
            performance,
            metrics.apply_metrics('{} K Neighbors'.format(display_name),
                                  y_test, model)
        ],
                                axis=0)

    if save:
        to_save = Path().resolve().joinpath('models',
                                            'cross_validation_outcomes',
                                            'other', 'k_neighbors',
                                            '{}.csv'.format('grid'))
        results = pd.DataFrame.from_dict(model_cv.cv_results_)
        results.to_csv(to_save)

    return model_cv, performance

コード例 #14

0

ファイルを表示

ファイル: decomposition.py プロジェクト: wyattowalsh/ieor_142_project

def pca_cv(name, save=False):
    '''

	'''

    display_name = ds.get_names()[name]
    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    num_numerical = ds.get_number_numerical()[name]
    X_train_s, X_test_s = split.standardize(name, X_train, X_test)
    X_train_s_numerical = X_train_s.iloc[:, 0:num_numerical]
    X_train_s_categorical = X_train_s.iloc[:, num_numerical:]
    X_test_s_numerical = X_test_s.iloc[:, 0:num_numerical]
    X_test_s_categorical = X_test_s.iloc[:, num_numerical:]
    df = pd.DataFrame()
    ols = LinearRegression()
    ev = []
    for i in np.arange(1, num_numerical):
        pca = PCA(i, random_state=18)
        X_train_s_numerical_reduced = pd.DataFrame(
            pca.fit_transform(X_train_s_numerical),
            index=X_train_s_categorical.index)
        X_test_s_numerical_reduced = pd.DataFrame(
            pca.transform(X_test_s_numerical),
            index=X_test_s_categorical.index)
        X_train_s = pd.concat(
            [X_train_s_numerical_reduced, X_train_s_categorical], axis=1)
        X_test_s = pd.concat(
            [X_test_s_numerical_reduced, X_test_s_categorical], axis=1)

        model = ols.fit(X_train_s, y_train)
        preds = model.predict(X_test_s)
        preds = metrics.apply_metrics(
            '{}: {} dimensions'.format(display_name, i), y_test, preds.ravel(),
            y_train)
        df = pd.concat([df, preds], axis=0)
        ev.append(1 - sum(pca.explained_variance_))

    if save:
        to_save = Path().resolve().joinpath('features', 'pca',
                                            '{}.csv'.format(name))
        df.to_csv(to_save)

    return df, ev

コード例 #15

0

ファイルを表示

ファイル: linear.py プロジェクト: wyattowalsh/ieor_142_project

def lasso(name, cv=5):
    '''Outputs a fitted Lasso Regression Model with a penalty term tuned through cross validation.

	Inputs must be standardized.
	Number of folds in cross validation is by default 5.
	n_jobs = -1 allows for all local processors to be utilized.
	'''
    display_name = ds.get_names()[name]
    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train, X_test = split.standardize(X_train, X_test)
    model = LassoCV(n_alphas=50,
                    verbose=0,
                    cv=5,
                    n_jobs=-1,
                    copy_X=True,
                    tol=1e-3,
                    random_state=18).fit(X_train, y_train)
    performance = metrics.apply_metrics('{} Lasso'.format(display_name),
                                        y_test, model.predict(X_test), y_train)
    performance['Tuning Parameters'] = [{'Alpha': model.alpha_}]
    params = model.coef_
    return model, performance, params

コード例 #16

0

ファイルを表示

ファイル: clustering.py プロジェクト: wyattowalsh/ieor_142_project

def create_cluster_plots(name, save = False): 
	'''

	'''

	X_train,X_test = split.split_subset(name)[0:2]
	num_numerical = ds.get_number_numerical()[name]
	X_train_numerical = X_train.iloc[:,0:num_numerical]
	X_test_numerical = X_test.iloc[:,0:num_numerical]
	distortions = []
	cluster_range = range(2,31)
	for clusters in cluster_range:
		kmean = kmeans(name, clusters, X_train, X_test)
		distortions.append(kmean.inertia_)

	fig, ax = plt.subplots(ncols = 2, figsize=(40, 12))
	ax[0].plot(cluster_range, distortions, marker = 'o')
	ax[0].set_title('KMeans Scree Plot', fontsize = 40)
	ax[0].set_xlabel('Number of Clusters', fontsize = 30)
	ax[0].set_ylabel('Sum of Squared Distances', fontsize = 30)
	ax[0].tick_params(labelsize=20)
	for i, txt in enumerate(cluster_range):
		annot = ax[0].annotate('{}'.format(txt), (cluster_range[i],distortions[i]))
		annot.set_fontsize(25)

	X_train_s_numerical = split.standardize(name, X_train_numerical, X_test_numerical)[0]
	clusters = 7
	if name == 'dataset_3':
		clusters = 9
	ax[1].set_xlim([-0.3, 0.8])
	ax[1].set_ylim([0, len(X_train_s_numerical) + (clusters + 1) * 10])

	cluster_labels = kmeans(name, clusters, X_train, X_test).predict(X_train_s_numerical)
	silhouette_avg = silhouette_score(X_train_s_numerical, cluster_labels)

	cluster_silhouette = silhouette_samples(X_train_s_numerical, cluster_labels)

	y_lower = 10
	for i in range(clusters):
		ith_cluster_silhouette_values = cluster_silhouette[cluster_labels == i]
		ith_cluster_silhouette_values.sort()

		size_cluster_i = ith_cluster_silhouette_values.shape[0]
		y_upper = y_lower + size_cluster_i

		color = cm.nipy_spectral(float(i) / clusters)
		ax[1].fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values, 
							facecolor=color, edgecolor=color, alpha=0.7)

		ax[1].text(-0.05, y_lower + 0.5 * size_cluster_i, str(i), fontsize = 25)

		y_lower = y_upper + 10  

	ax[1].set_title("Silhouette plot for {} clusters.".format(clusters) , fontsize = 40)
	ax[1].set_xlabel("Silhouette Coefficient Values",fontsize = 30)
	ax[1].set_ylabel("Cluster label",fontsize = 30)

	ax[1].axvline(x=silhouette_avg, color="red", linestyle="--")

	ax[1].set_yticks([]) 
	ax[1].set_xticks(np.arange(-0.3,0.9,0.1))
	ax[1].tick_params(labelsize=20)

	plt.tight_layout()
	plt.show()

	if save:
		to_save = Path().resolve().joinpath('visualizations', 'clustering', '{}.png'.format(name))
		fig.savefig(to_save, dpi = 300)

コード例 #17

0

ファイルを表示

ファイル: neural_networks.py プロジェクト: wyattowalsh/ieor_142_project

def multi_layer_network_performance(name, save=True):
    """

	"""

    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train, X_test = split.standardize(name, X_train, X_test)

    def multi_layer_network(neurons_l1,
                            neurons_ol,
                            num_layers,
                            shape,
                            learning_rate,
                            reg_l1,
                            reg_ol,
                            epochs,
                            batch_size,
                            X_train=X_train,
                            y_train=y_train):
        """

		"""

        net = Sequential()
        net.add(
            Dense(neurons_l1,
                  activation='relu',
                  input_shape=shape,
                  kernel_regularizer=regularizers.l2(reg_l1)))
        for i in np.arange(num_layers):
            net.add(
                Dense(neurons_ol[i],
                      activation='relu',
                      input_shape=shape,
                      kernel_regularizer=regularizers.l2(reg_ol[i])))
        net.add(Dense(1, activation='linear'))
        optimizer = Adam(learning_rate=learning_rate)
        net.compile(optimizer=optimizer, loss='mean_squared_error')
        net.fit(X_train,
                y_train,
                epochs=epochs,
                batch_size=batch_size,
                workers=7,
                use_multiprocessing=True)
        return net

    results = pd.read_csv(Path().resolve().joinpath(
        'models', 'cross_validation_outcomes', 'neural_network',
        '{}_{}_{}.csv'.format(name, 'multi_layer_network', 'randomized')),
                          index_col=0)

    bestr2 = results.loc[results['rank_test_$R^2$'] == 1, 'params'].values[0]
    bestmae = results.loc[results['rank_test_Mean Absolute Error'] == 1,
                          'params'].values[0]
    bestrmse = results.loc[results['rank_test_Root Mean Square Error'] == 1,
                           'params'].values[0]

    display_name = ds.get_names()[name]
    dict_list = [bestr2, bestmae, bestrmse]
    unique_dict_list = [
        dict(t) for t in {tuple(sorted(d.items()))
                          for d in dict_list}
    ]
    performance = pd.DataFrame()
    for item in unique_dict_list:
        preds = multi_layer_network(**item).predict(X_test)
        performance = pd.concat([
            performance,
            metrics.apply_metrics('{} Single Layer NN'.format(display_name),
                                  y_test, preds)
        ],
                                axis=0)

    if save:
        to_save = Path().resolve().joinpath(
            'models', 'neural_network', '{}.csv'.format('multi_layer_network'))
        performance.to_csv(to_save)

コード例 #18

0

ファイルを表示

ファイル: neural_networks.py プロジェクト: wyattowalsh/ieor_142_project

def multi_layer_network_randomized_cv(name, n_iter=30, cv=5, save=True):

    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    X_train, X_test = split.standardize(name, X_train, X_test)
    to_score = metrics.create_metrics()[0]
    reg = list(np.geomspace(1e-6, 5, 49))
    param_grid = {
        'shape': [(X_train.shape[1], )],
        'neurons_l1':
        np.arange(5, 275, 5),
        'neurons_ol': [
            range(5, 275, 5),
            range(5, 275, 5),
            range(5, 275, 5),
            range(5, 275, 5),
            range(5, 275, 5)
        ],
        'batch_size': [4, 8, 16, 32, 64, 128, 256, 512, 1028],
        'epochs': [25, 50, 100],
        'num_layers': [0, 1, 2, 3, 4, 5],
        'learning_rate':
        np.linspace(0.1, 1, 20),
        'reg_l1':
        np.append(np.array([0]), np.geomspace(1e-6, 5, 49)),
        'reg_ol': [[0] + reg, [0] + reg, [0] + reg, [0] + reg, [0] + reg,
                   [0] + reg],
        'shuffle': [False, True]
    }

    def multi_layer_network(neurons_l1, neurons_ol, num_layers, shape,
                            learning_rate, reg_l1, reg_ol):
        """

		"""

        net = Sequential()
        net.add(
            Dense(neurons_l1,
                  activation='relu',
                  input_shape=shape,
                  kernel_regularizer=regularizers.l2(reg_l1)))
        for i in np.arange(num_layers):
            net.add(
                Dense(neurons_ol[i],
                      activation='relu',
                      input_shape=shape,
                      kernel_regularizer=regularizers.l2(reg_ol[i])))
        net.add(Dense(1, activation='linear'))
        optimizer = Adam(learning_rate=learning_rate)
        net.compile(optimizer=optimizer, loss='mean_squared_error')
        return net

    net = KerasRegressor(build_fn=multi_layer_network, verbose=0)
    net_cv = RandomizedSearchCV(estimator=net,
                                param_distributions=param_grid,
                                n_jobs=-1,
                                pre_dispatch=16,
                                refit=False,
                                cv=cv,
                                scoring=to_score,
                                n_iter=n_iter).fit(X_train, y_train)

    if save:
        to_save_cv = Path().resolve().joinpath(
            'models', 'cross_validation_outcomes', 'neural_network',
            '{}_{}_{}.csv'.format(name, 'multi_layer_network', 'randomized'))
        results = pd.DataFrame.from_dict(net_cv.cv_results_)
        results.to_csv(to_save_cv)

    return net_cv