Esempi in Python per get_number_numerical, esempi in Python per src.data.datasets.get_number_numerical

Esempio n. 1

0

Mostra file

File: clustering.py Progetto: wyattowalsh/ieor_142_project

def silhouettes(name, X_train, max_clusters = 15, min_clusters = 5, save = False):
    '''Show one silhouette plot per candidate number of KMeans clusters.

    The training frame is standardized with ``split.standardize`` and only its
    first ``ds.get_number_numerical(name)`` columns are clustered. For every
    cluster count in ``[min_clusters, max_clusters]`` (both inclusive) a figure
    is drawn, the average silhouette score is printed and the figure is shown.

    Args:
        name: Dataset key passed to ``ds``, ``split`` and ``kmeans``.
        X_train: Training features; a copy is taken so the caller's frame is
            not modified.
        max_clusters: Largest number of clusters to try (inclusive).
        min_clusters: Smallest number of clusters to try (inclusive).
        save: When true, the LAST figure produced is written to
            ``data/visualizations/<name>_elbow.png`` under the current
            working directory.
    '''

    X_train = X_train.copy()
    num_numerical = ds.get_number_numerical(name)
    X_train_s_numerical = split.standardize(name, X_train).iloc[:, 0:num_numerical]
    n_samples = len(X_train_s_numerical)

    # Stays None when the cluster range is empty, so the save step below can
    # be skipped instead of failing on an unbound name.
    fig = None
    for clusters in range(min_clusters, max_clusters + 1):
        fig, ax = plt.subplots()
        fig.set_size_inches(18, 7)

        # Silhouette coefficients lie in [-1, 1]; the axis is clipped to
        # [-0.7, 1] here.
        ax.set_xlim([-0.7, 1])
        # The (clusters + 1) * 10 term leaves blank space between the
        # silhouette bands of individual clusters.
        ax.set_ylim([0, n_samples + (clusters + 1) * 10])

        cluster_labels = kmeans(name, clusters, X_train_s_numerical).predict(X_train_s_numerical)
        silhouette_avg = silhouette_score(X_train_s_numerical, cluster_labels)
        print("For n_clusters =", clusters, "The average silhouette_score is :", silhouette_avg)

        cluster_silhouette = silhouette_samples(X_train_s_numerical, cluster_labels)

        y_lower = 10
        for i in range(clusters):
            # Boolean indexing returns a copy, so sorting it does not touch
            # cluster_silhouette.
            ith_cluster_silhouette_values = cluster_silhouette[cluster_labels == i]
            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / clusters)
            ax.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values,
                             facecolor=color, edgecolor=color, alpha=0.7)

            # Label each band with its cluster number at its vertical middle.
            ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Leave a gap of 10 rows before the next band.
            y_lower = y_upper + 10

        ax.set_title("The silhouette plot for the various clusters.")
        ax.set_xlabel("The silhouette coefficient values")
        ax.set_ylabel("Cluster label")

        # Vertical line marking the average silhouette score over all samples.
        ax.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax.set_yticks([])  # Clear the y-axis labels / ticks
        ax.set_xticks(np.arange(-0.6, 1.1, 0.2))
        plt.show()

    if save and fig is not None:
        # NOTE(review): the file is named "<name>_elbow.png" although it holds
        # a silhouette plot; kept as-is because other code may expect the path.
        to_save = Path().resolve().joinpath('data', 'visualizations', '{}_elbow.png'.format(name))
        fig.savefig(to_save)

Esempio n. 2

0

Mostra file

File: statistical_tests.py Progetto: wyattowalsh/ieor_142_project

def find_vifs(name, X_train, tolerance=5):
    '''Iteratively drop numerical features based on their VIF scores.

    Only the first ``ds.get_number_numerical()[name]`` columns of ``X_train``
    are considered. In each round the variance inflation factor of every
    remaining column is computed; if the largest VIF exceeds ``tolerance``
    that column is removed and another round starts, otherwise the procedure
    stops.

    Args:
        name: Dataset key used to look up the number of numerical columns.
        X_train: Training features; it is not modified.
        tolerance: Largest acceptable VIF.

    Returns:
        A DataFrame indexed by the numerical column names with one
        ``"VIF Round <k>"`` column per round. Features already dropped in an
        earlier round have no value in later columns.
    '''

    number_numerical = ds.get_number_numerical()[name]
    train_num = X_train.iloc[:, 0:number_numerical]
    vif = pd.DataFrame(index=train_num.columns)
    vif_x = train_num.copy()
    cols = vif_x.columns.values
    for round_idx in range(train_num.shape[1]):
        try:
            vifs = [
                variance_inflation_factor(vif_x.values, j)
                for j in range(vif_x.shape[1])
            ]
        except Exception:
            # Best effort: if the VIFs cannot be computed for the remaining
            # columns, return what has been collected so far.
            break

        vifs_df = pd.DataFrame(vifs,
                               columns=["VIF Round {}".format(round_idx)],
                               index=cols)
        vif = pd.concat([vif, vifs_df], axis=1)

        # argmax returns the first position of the maximum and does not rely
        # on comparing a Python list with a scalar.
        loc = int(np.argmax(vifs))
        if not (vifs[loc] > tolerance):
            break
        vif_x = vif_x.drop([cols[loc]], axis=1)
        cols = np.delete(cols, loc, axis=0)
    return vif

Esempio n. 3

0

Mostra file

File: decomposition.py Progetto: wyattowalsh/ieor_142_project

def pca(name, X_train, X_test, dimension):
    """Replace the numerical features by their first PCA components.

    Both frames are copied and standardized with ``split.standardize``. The
    first ``ds.get_number_numerical(name)`` columns are treated as numerical
    and the remaining ones as categorical. A ``PCA(dimension)`` estimator is
    fitted on the numerical training columns and applied to both sets; the
    reduced columns are then concatenated with the untouched categorical ones.

    Returns:
        A ``(train, test)`` pair of DataFrames holding the reduced numerical
        columns followed by the categorical columns.
    """

    train_copy = X_train.copy()
    test_copy = X_test.copy()
    n_num = ds.get_number_numerical(name)
    train_std = split.standardize(name, train_copy)
    test_std = split.standardize(name, test_copy)

    train_num, train_cat = train_std.iloc[:, 0:n_num], train_std.iloc[:, n_num:]
    test_num, test_cat = test_std.iloc[:, 0:n_num], test_std.iloc[:, n_num:]

    # Fit on the training columns only, then reuse the projection for test.
    reducer = PCA(dimension)
    train_reduced = pd.DataFrame(reducer.fit_transform(train_num),
                                 index=train_cat.index)
    test_reduced = pd.DataFrame(reducer.transform(test_num),
                                index=test_cat.index)

    train_out = pd.concat([train_reduced, train_cat], axis=1)
    test_out = pd.concat([test_reduced, test_cat], axis=1)
    return train_out, test_out

Esempio n. 4

0

Mostra file

File: clustering.py Progetto: wyattowalsh/ieor_142_project

def elbow_method_kmeans(name, max_clusters = 30, min_clusters = 2, save = False):
    '''Create an elbow-method plot for a varying number of KMeans clusters.

    The y-axis is the sum of squared distances of samples to their closest
    cluster center (the fitted model's ``inertia_``).

    Args:
        name: Dataset key passed to ``ds``, ``split`` and ``kmeans``.
        max_clusters: Upper bound of the cluster counts tried; it is
            EXCLUSIVE because the counts come from
            ``range(min_clusters, max_clusters)``.
        min_clusters: Smallest number of clusters tried (inclusive).
        save: When true, the figure is written to
            ``visualizations/clustering_elbow_<name>.png`` under the current
            working directory.
    '''

    display_name = ds.get_names()[name]
    # Only the training features are needed. The previous code sliced [0:1]
    # and unpacked the single result into two names, which always raised.
    X_train = split.split_subset(name)[0]
    num_numerical = ds.get_number_numerical()[name]
    X_train_numerical = X_train.iloc[:, 0:num_numerical]

    cluster_range = range(min_clusters, max_clusters)
    distortions = [
        kmeans(name, clusters, X_train_numerical).inertia_
        for clusters in cluster_range
    ]

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.plot(cluster_range, distortions, marker = 'o')
    ax.set_title('Elbow Method for KMeans for {}'.format(display_name), size=25)
    ax.set_xlabel('Number of Clusters', fontsize = 20)
    ax.set_ylabel('Sum of Squared Distances', fontsize = 20)
    plt.show()

    if save:
        to_save = Path().resolve().joinpath('visualizations', 'clustering_elbow_{}.png'.format(name))
        fig.savefig(to_save, dpi=300)

Esempio n. 5

0

Mostra file

def main_datasets():
    '''Display the three main datasets, each under a summary heading.

    For ``dataset_1``, ``dataset_2`` and ``dataset_3`` a Markdown heading
    gives the number of features (all columns minus one response column),
    how many of them are numerical according to ``ds.get_number_numerical()``
    and how many are therefore categorical. The frame itself and a
    horizontal rule follow.

    Side effect: sets the pandas option ``display.max_rows`` to 2.
    '''

    pd.set_option('display.max_rows', 2)
    number_numerical = ds.get_number_numerical()
    dataset_keys = ['dataset_1', 'dataset_2', 'dataset_3']
    dsets = ds.load_datasets(names = list(dataset_keys))

    heading = '### `Dataset {}:` {} features: {} numerical, {} categorical, 1 response'
    for position, key in enumerate(dataset_keys, start=1):
        frame = dsets[key]
        # One column is the response, so it is not counted as a feature.
        n_features = len(frame.columns) - 1
        n_numerical = number_numerical[key]
        display(Markdown(heading.format(position, n_features, n_numerical,
                                        n_features - n_numerical)))
        display(frame)
        display(Markdown('---'))

Esempio n. 6

0

Mostra file

def main_datasets_split():
    '''Display the training split of the three main datasets with headings.

    For each dataset, element ``[4]`` of its entry in
    ``split.split_subsets(...)`` is shown (presumably the full training
    frame; confirm against ``split``). The heading reports its column count,
    the number of numerical features from ``ds.get_number_numerical()`` and
    the difference between the two as the categorical count.

    Side effect: sets the pandas option ``display.max_rows`` to 2.
    '''

    number_numerical = ds.get_number_numerical()
    pd.set_option('display.max_rows', 2)
    dataset_keys = ['dataset_1', 'dataset_2', 'dataset_3']
    dsets = split.split_subsets(list(dataset_keys))

    heading = ('### `Dataset {} Training Set:` {} features: {} numerical, '
               '{} categorical, 1 response')
    for position, key in enumerate(dataset_keys, start=1):
        train = dsets[key][4]
        # NOTE(review): unlike main_datasets, no column is subtracted for the
        # response here. Confirm whether this frame includes the response.
        n_columns = len(train.columns)
        n_numerical = number_numerical[key]
        display(Markdown(heading.format(position, n_columns, n_numerical,
                                        n_columns - n_numerical)))
        display(train)
        display(Markdown('---'))

Esempio n. 7

0

Mostra file

File: clustering.py Progetto: wyattowalsh/ieor_142_project

def kmeans(name, n_clusters, X_train, X_test):
    '''Fit a KMeans model on the standardized numerical training features.

    ``split.standardize(name, X_train, X_test)`` is called on copies of the
    inputs. The first element of its result, restricted to the first
    ``ds.get_number_numerical()[name]`` columns, is what the model is fitted
    on.

    Args:
        name: Dataset key passed to ``ds`` and ``split``.
        n_clusters: Number of clusters to form.
        X_train: Training features (copied before use).
        X_test: Test features (copied before use); only passed to
            ``split.standardize``.

    Returns:
        The fitted ``KMeans`` estimator (``random_state=18``).
    '''

    X_train = X_train.copy()
    X_test = X_test.copy()
    num_numerical = ds.get_number_numerical()[name]
    X_train_s_numerical = split.standardize(name, X_train, X_test)[0].iloc[:, 0:num_numerical]
    # The n_jobs argument was deprecated in scikit-learn 0.23 and removed in
    # 1.0, where passing it raises TypeError. It is omitted here.
    return KMeans(n_clusters=n_clusters, random_state=18).fit(X_train_s_numerical)

Esempio n. 8

0

Mostra file

def create_heatmap(name, train, save=False):
    '''Show, and optionally save, a correlation heatmap of the numerical variables.

    The first ``ds.get_number_numerical(name)`` columns of ``train`` are
    treated as numerical. Their correlation matrix is drawn as an annotated
    heatmap. Annotations whose rounded absolute correlation is above 0.2 are
    coloured white and enlarged, with the strongest correlations largest.

    Args:
        name: Dataset key; also used in the title and the output file name.
        train: Frame whose leading columns are the numerical variables; it is
            not modified.
        save: When true, the figure is written to
            ``data/visualizations/<name>_heatmap.png`` under the current
            working directory.
    '''
    num_numerical = ds.get_number_numerical(name)
    train_num = train.iloc[:, 0:num_numerical]
    # Computed once and reused for both the drawing and the ranking below.
    corr = train_num.corr()

    fig, ax = plt.subplots(figsize=(15, 15))
    heat = sns.heatmap(corr,
                       annot=True,
                       ax=ax,
                       fmt='.2f',
                       cbar=True,
                       square=True,
                       xticklabels=True,
                       yticklabels=True,
                       annot_kws={'size': 16},
                       cmap='coolwarm',
                       center=0,
                       vmin=-1,
                       vmax=1,
                       cbar_kws={"shrink": .82})
    ax.set_title(
        'Heatmap of Numerical Variable Correlation for {}'.format(name),
        size=25)
    plt.yticks(rotation=0, size=15)
    plt.xticks(rotation=30, size=15)
    ax.collections[0].colorbar.ax.tick_params(labelsize=15)

    # Make annotations larger if abs(correlation) is above 0.2.
    # Absolute values of the unique correlations, ascending. The largest
    # entry (the diagonal's 1.0) is skipped by starting from the
    # second-to-last element and walking downwards.
    abs_sorted = np.sort(np.abs(np.unique(corr.values.flatten())))
    bigs = np.array([
        val for val in (round(v, 2) for v in abs_sorted[-2::-1])
        if val > 0.2
    ])
    for text in heat.texts:
        num = pd.to_numeric(text.get_text())
        # Position in bigs is the rank: 0 is the strongest correlation.
        rank = np.where(bigs == abs(num))[0]
        if rank.size > 0:
            text.set_color('white')
            text.set_size(40 - (rank[0] * 3))
    plt.show()

    if save:
        to_save = Path().resolve().joinpath('data', 'visualizations',
                                            '{}_heatmap.png'.format(name))
        fig.savefig(to_save)

Esempio n. 9

0

Mostra file

File: decomposition.py Progetto: wyattowalsh/ieor_142_project

def pca_cv(name, save=False):
    '''Score a linear regression for every PCA dimension of the numerical features.

    The dataset is split with ``split.split_subset`` and standardized with
    ``split.standardize``. For each dimension ``i`` in
    ``1 .. num_numerical - 1`` the numerical columns are replaced by their
    first ``i`` principal components, a ``LinearRegression`` is fitted on the
    training set and its test predictions are scored with
    ``metrics.apply_metrics``.

    Args:
        name: Dataset key passed to ``ds`` and ``split``.
        save: When true, the metrics table is written to
            ``features/pca/<name>.csv`` under the current working directory.

    Returns:
        ``(df, ev)`` where ``df`` stacks the metric rows of every dimension
        and ``ev`` holds ``1 - sum(explained_variance_)`` per dimension.
    '''

    display_name = ds.get_names()[name]
    X_train, X_test, y_train, y_test, train = split.split_subset(name)
    num_numerical = ds.get_number_numerical()[name]
    train_std, test_std = split.standardize(name, X_train, X_test)

    # Leading columns are numerical, the rest categorical.
    train_num = train_std.iloc[:, 0:num_numerical]
    train_cat = train_std.iloc[:, num_numerical:]
    test_num = test_std.iloc[:, 0:num_numerical]
    test_cat = test_std.iloc[:, num_numerical:]

    results = pd.DataFrame()
    regressor = LinearRegression()
    unexplained = []
    for n_components in np.arange(1, num_numerical):
        reducer = PCA(n_components, random_state=18)
        train_reduced = pd.DataFrame(reducer.fit_transform(train_num),
                                     index=train_cat.index)
        test_reduced = pd.DataFrame(reducer.transform(test_num),
                                    index=test_cat.index)
        train_features = pd.concat([train_reduced, train_cat], axis=1)
        test_features = pd.concat([test_reduced, test_cat], axis=1)

        regressor.fit(train_features, y_train)
        predictions = regressor.predict(test_features)
        label = '{}: {} dimensions'.format(display_name, n_components)
        scores = metrics.apply_metrics(label, y_test, predictions.ravel(),
                                       y_train)
        results = pd.concat([results, scores], axis=0)
        # NOTE(review): this uses explained_variance_, not
        # explained_variance_ratio_, so the value is not a fraction of the
        # total variance. Confirm which one is intended.
        unexplained.append(1 - sum(reducer.explained_variance_))

    if save:
        to_save = Path().resolve().joinpath('features', 'pca',
                                            '{}.csv'.format(name))
        results.to_csv(to_save)

    return results, unexplained

Esempio n. 10

0

Mostra file

File: statistical_tests.py Progetto: wyattowalsh/ieor_142_project

def find_numerical_significance(name, X_train, y_train):
    '''Score each numerical feature against the response with two tests.

    For the first ``ds.get_number_numerical()[name]`` columns of ``X_train``
    this returns the p-value of the F-test whose null hypothesis is that the
    feature has no effect on the outcome, and an estimate of the mutual
    information between the feature and the outcome.

    Returns:
        A DataFrame indexed by feature name with the columns
        ``"F-test (P-Value)"`` and ``"Estimated Mutual Information"``.
    '''

    features = X_train.copy()
    target = y_train.copy()
    n_numerical = ds.get_number_numerical()[name]
    numeric = features.iloc[:, 0:n_numerical]
    feature_names = numeric.columns.values

    # f_regression returns (F statistics, p-values); only p-values are kept.
    p_values = f_regression(numeric, target)[1]
    mutual_info = mutual_info_regression(numeric, target, random_state=18)

    p_value_frame = pd.DataFrame(p_values,
                                 index=feature_names,
                                 columns=["F-test (P-Value)"])
    mutual_info_frame = pd.DataFrame(mutual_info,
                                     index=feature_names,
                                     columns=["Estimated Mutual Information"])
    return pd.concat([p_value_frame, mutual_info_frame], axis=1)

Esempio n. 11

0

Mostra file

File: clustering.py Progetto: wyattowalsh/ieor_142_project

def create_cluster_plots(name, save = False):
    '''Draw a KMeans scree plot next to a silhouette plot for one dataset.

    Left panel: inertia of the model returned by ``kmeans`` for 2 to 30
    clusters, with every point annotated by its cluster count.
    Right panel: silhouette plot for a fixed number of clusters (9 for
    ``'dataset_3'``, otherwise 7), computed on the standardized numerical
    training columns, with a dashed red line at the average silhouette score.

    Args:
        name: Dataset key passed to ``ds``, ``split`` and ``kmeans``.
        save: When true, the figure is written to
            ``visualizations/clustering/<name>.png`` under the current
            working directory.
    '''

    X_train, X_test = split.split_subset(name)[0:2]
    num_numerical = ds.get_number_numerical()[name]
    train_numeric = X_train.iloc[:, 0:num_numerical]
    test_numeric = X_test.iloc[:, 0:num_numerical]

    candidate_counts = range(2, 31)
    inertias = [
        kmeans(name, count, X_train, X_test).inertia_
        for count in candidate_counts
    ]

    fig, ax = plt.subplots(ncols = 2, figsize=(40, 12))
    scree_ax = ax[0]
    sil_ax = ax[1]

    # Left panel: scree plot.
    scree_ax.plot(candidate_counts, inertias, marker = 'o')
    scree_ax.set_title('KMeans Scree Plot', fontsize = 40)
    scree_ax.set_xlabel('Number of Clusters', fontsize = 30)
    scree_ax.set_ylabel('Sum of Squared Distances', fontsize = 30)
    scree_ax.tick_params(labelsize=20)
    for count, inertia in zip(candidate_counts, inertias):
        label = scree_ax.annotate('{}'.format(count), (count, inertia))
        label.set_fontsize(25)

    # Right panel: silhouette plot for the chosen cluster count.
    train_std_numeric = split.standardize(name, train_numeric, test_numeric)[0]
    clusters = 9 if name == 'dataset_3' else 7
    sil_ax.set_xlim([-0.3, 0.8])
    # (clusters + 1) * 10 leaves room for the gaps between the bands.
    sil_ax.set_ylim([0, len(train_std_numeric) + (clusters + 1) * 10])

    labels = kmeans(name, clusters, X_train, X_test).predict(train_std_numeric)
    silhouette_avg = silhouette_score(train_std_numeric, labels)
    sample_scores = silhouette_samples(train_std_numeric, labels)

    band_bottom = 10
    for cluster_id in range(clusters):
        band = np.sort(sample_scores[labels == cluster_id])
        band_size = band.shape[0]
        band_top = band_bottom + band_size

        shade = cm.nipy_spectral(float(cluster_id) / clusters)
        sil_ax.fill_betweenx(np.arange(band_bottom, band_top), 0, band,
                             facecolor=shade, edgecolor=shade, alpha=0.7)
        # Cluster number at the vertical middle of its band.
        sil_ax.text(-0.05, band_bottom + 0.5 * band_size, str(cluster_id), fontsize = 25)

        # Gap of 10 rows before the next band.
        band_bottom = band_top + 10

    sil_ax.set_title("Silhouette plot for {} clusters.".format(clusters), fontsize = 40)
    sil_ax.set_xlabel("Silhouette Coefficient Values", fontsize = 30)
    sil_ax.set_ylabel("Cluster label", fontsize = 30)
    sil_ax.axvline(x=silhouette_avg, color="red", linestyle="--")
    sil_ax.set_yticks([])
    sil_ax.set_xticks(np.arange(-0.3, 0.9, 0.1))
    sil_ax.tick_params(labelsize=20)

    plt.tight_layout()
    plt.show()

    if save:
        to_save = Path().resolve().joinpath('visualizations', 'clustering', '{}.png'.format(name))
        fig.savefig(to_save, dpi = 300)

Esempio n. 12

0

Mostra file

def create_plots(name, save=False):
    '''Draw a 4x4 grid of exploratory attendance plots for one dataset.

    The dataset is loaded with ``ds.load_dataset(name)`` and must provide the
    columns 'Attendance', 'Day of Week', 'Month', 'Playoffs?', 'Curr Win %'
    and 'Last Five', plus an index exposing ``.year``. The figure contains
    attendance histograms (overall, per day, month, year, game type, win
    percentage and last-five record), bar charts of average attendance for
    the same groupings, and a correlation heatmap of the first
    ``ds.get_number_numerical()[name]`` columns.

    The loaded frame is modified in place while plotting: a 'Year' column is
    added, 'Playoffs?' is recoded to text labels and 'Curr Win %' is rounded
    and rescaled. Statement order therefore matters.

    Args:
        name: Dataset key passed to ``ds``; also used in the output file name.
        save: When true, the figure is written to
            ``visualizations/all_plots_<name>.png`` under the current
            working directory.
    '''

    dset = ds.load_dataset(name)
    # Alias, not a copy: every mutation of `train` below also changes `dset`.
    train = dset
    sns.set_style("whitegrid")
    sns.set_palette(sns.color_palette('bright', 12))

    # Layout: rows 0-2 hold four plots each, row 3 holds two double-width plots.
    fig = plt.figure(figsize=(40, 27))
    gs = gridspec.GridSpec(4, 4)
    ax00 = plt.subplot(gs[0, 0])
    ax01 = plt.subplot(gs[0, 1])
    ax02 = plt.subplot(gs[0, 2])
    ax03 = plt.subplot(gs[0, 3])
    ax10 = plt.subplot(gs[1, 0])
    ax11 = plt.subplot(gs[1, 1])
    ax12 = plt.subplot(gs[1, 2])
    ax13 = plt.subplot(gs[1, 3])
    ax20 = plt.subplot(gs[2, 0])
    ax21 = plt.subplot(gs[2, 1])
    ax22 = plt.subplot(gs[2, 2])
    ax23 = plt.subplot(gs[2, 3])
    ax30 = plt.subplot(gs[3, 0:2])
    ax31 = plt.subplot(gs[3, 2:4])

    # Shared histogram bins: 4000 to 26000 in steps of 500.
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # confirm the seaborn version this project pins.
    bins = np.arange(4000, 26001, 500)
    # fig, ax = plt.subplots(nrows = 4, ncols = 4, figsize=(40, 20))
    # [0, 0] Overall attendance histogram.
    sns.distplot(train['Attendance'],
                 ax=ax00,
                 kde=False,
                 norm_hist=True,
                 bins=bins)
    ax00.set_xlabel('Attendance (# of people)', fontsize=20)
    ax00.set_ylabel('Percent per person', fontsize=20)
    ax00.tick_params(labelsize=15)
    ax00.set_title(label="Overall Attendance", fontsize=25)

    # [0, 1] One overlaid histogram per day of the week.
    days = train['Day of Week'].unique()
    for day in days:
        sns.distplot(train.loc[train['Day of Week'] == day]['Attendance'],
                     ax=ax01,
                     kde=False,
                     norm_hist=True,
                     bins=bins)
    ax01.set_title(label="Attendance per Day", fontsize=25)
    ax01.set_xlabel('Attendance (# of people)', fontsize=20)
    ax01.set_ylabel('Percent per person', fontsize=20)
    ax01.tick_params(labelsize=15)
    ax01.legend(days, loc="upper right", fontsize=15)

    # [2, 0] Average attendance per day of the week, Monday first.
    grouped = train[['Day of Week',
                     'Attendance']].groupby('Day of Week').mean()
    order = [
        'Monday', "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday",
        "Sunday"
    ]
    sns.barplot(x='Attendance',
                y=grouped.index,
                data=grouped,
                order=order,
                ci=None,
                orient='h',
                saturation=1,
                ax=ax20,
                palette=sns.color_palette("cubehelix", 7))
    ax20.set_xlim(16500, 19000)
    ax20.set_xticks(range(16500, 19001, 500))
    ax20.set_xlabel('Average Attendance (# of people)', fontsize=20)
    ax20.set_ylabel('Day of the Week', fontsize=20)
    ax20.tick_params(labelsize=15)
    ax20.set_title(label="Average Attendance per Day", fontsize=25)

    # [0, 2] One overlaid histogram per month.
    months = train['Month'].unique()
    for month in months:
        sns.distplot(train.loc[train['Month'] == month]['Attendance'],
                     ax=ax02,
                     kde=False,
                     norm_hist=True,
                     bins=bins)
    ax02.set_title(label="Attendance per Month", fontsize=25)
    ax02.set_xlabel('Attendance (# of people)', fontsize=20)
    ax02.set_ylabel('Percent per person', fontsize=20)
    ax02.tick_params(labelsize=15)
    ax02.legend(months, loc="upper right", fontsize=15)

    # [2, 1] Average attendance per month, ordered October through June.
    grouped = train[['Month', 'Attendance']].groupby('Month').mean()
    order = [
        'October', 'November', 'December', 'January', 'February', 'March',
        'April', 'May', 'June'
    ]
    sns.barplot(x='Attendance',
                y=grouped.index,
                data=grouped,
                order=order,
                ci=None,
                orient='h',
                saturation=1,
                ax=ax21,
                palette=sns.color_palette("cubehelix", 9))
    ax21.set_xlim(16500, 20000)
    ax21.set_xticks(range(16500, 20001, 500))
    ax21.set_xlabel('Average Attendance (# of people)', fontsize=20)
    ax21.set_ylabel('Month', fontsize=20)
    ax21.tick_params(labelsize=15)
    ax21.set_title(label="Average Attendance per Month", fontsize=25)

    # [2, 2] Average attendance per year. This adds a 'Year' column to the
    # frame (taken from the index).
    train['Year'] = train.index.year
    grouped = train[['Year', 'Attendance']].groupby('Year').mean()
    order = np.sort(train.index.year.unique())
    sns.barplot(x='Attendance',
                y=grouped.index,
                data=grouped,
                order=order,
                ci=None,
                orient='h',
                saturation=1,
                ax=ax22,
                palette=sns.color_palette("cubehelix", 13))
    ax22.set_xlim(16500, 18500)
    ax22.set_xticks(range(16500, 18501, 500))
    ax22.set_xlabel('Average Attendance (# of people)', fontsize=20)
    ax22.set_ylabel('Year', fontsize=20)
    ax22.tick_params(labelsize=15)
    ax22.set_title(label="Average Attendance per Year", fontsize=25)

    # [0, 3] Overlaid histograms for the five most recent years in the data.
    years = np.arange(
        max(train['Year'].values) - 4,
        max(train['Year'].values) + 1)
    for year in years:
        sns.distplot(train.loc[train.index.year == year]['Attendance'],
                     ax=ax03,
                     kde=False,
                     norm_hist=True,
                     bins=bins)
    ax03.set_title(label="Attendance per Year", fontsize=25)
    ax03.set_xlabel('Attendance (# of people)', fontsize=20)
    ax03.set_ylabel('Percent per person', fontsize=20)
    ax03.tick_params(labelsize=15)
    ax03.legend(years, loc="upper right", fontsize=20)

    # [1, 0] Regular season (Playoffs? == 0) versus playoffs (== 1).
    for j in [0, 1]:
        sns.distplot(train.loc[train['Playoffs?'] == j]['Attendance'],
                     ax=ax10,
                     kde=False,
                     norm_hist=True,
                     bins=bins)
    ax10.set_title(label="Attendance for Regular and Playoff Games",
                   fontsize=25)
    ax10.set_xlabel('Attendance (# of people)', fontsize=20)
    ax10.set_ylabel('Percent per person', fontsize=20)
    ax10.tick_params(labelsize=15)
    ax10.legend(['Regular Season', 'Playoffs'], loc="upper right", fontsize=15)

    # [1, 1] One histogram per 10-point band of the current win percentage
    # (half-open bands [i, i + 0.1) for i = 0.0 .. 0.9).
    win_percent = np.arange(0, 1, 0.10)
    for i in win_percent:
        sns.distplot(
            train.loc[(train['Curr Win %'] >= i)
                      & (train['Curr Win %'] < i + 0.1)]['Attendance'],
            ax=ax11,
            kde=False,
            norm_hist=True,
            bins=bins)
    ax11.set_title(label="Attendance for Different Win Percentages",
                   fontsize=25)
    ax11.set_xlabel('Attendance (# of people)', fontsize=20)
    ax11.set_ylabel('Percent per person', fontsize=20)
    ax11.tick_params(labelsize=15)
    # NOTE(review): eleven labels are supplied for the ten bands drawn above.
    ax11.legend([
        '0 %', '10 %', '20 %', '30 %', '40 %', '50 %', '60 %', '70 %', '80 %',
        '90 %', '100 %'
    ],
                loc="upper right",
                fontsize=17)

    # [1, 2] One histogram per number of wins in the last five games (0-5).
    last_five = np.arange(0, 6)
    for i in last_five:
        sns.distplot(train.loc[train['Last Five'] == i]['Attendance'],
                     ax=ax12,
                     kde=False,
                     norm_hist=True,
                     bins=bins)
    ax12.set_title(label="Attendance by Last Five Record", fontsize=25)
    ax12.set_xlabel('Attendance (# of people)', fontsize=20)
    ax12.set_ylabel('Percent per person', fontsize=20)
    ax12.tick_params(labelsize=15)
    ax12.legend(['0 Wins', '1 Win', '2 Wins', "3 Wins", "4 Wins", "5 Wins"],
                loc="upper right",
                fontsize=20)

    # [3, 2:4] Average attendance per last-five record.
    grouped = train[['Last Five', 'Attendance']].groupby('Last Five').mean()
    order = np.arange(0, 6)
    sns.barplot(x='Attendance',
                y=grouped.index,
                data=grouped,
                order=order,
                ci=None,
                orient='h',
                saturation=1,
                ax=ax31,
                palette=sns.color_palette("cubehelix", 5))
    ax31.set_xlim(16500, 18500)
    ax31.set_xticks(range(16500, 18501, 500))
    ax31.set_xlabel('Average Attendance (# of people)', fontsize=20)
    ax31.set_ylabel('Record Over Last Five Games', fontsize=20)
    ax31.tick_params(labelsize=15)
    ax31.set_title(label="Average Attendance per Last Five Record",
                   fontsize=25)

    # [1, 3] Correlation heatmap of the leading numerical columns.
    num_numerical = ds.get_number_numerical()[name]
    train_num = train.iloc[:, 0:num_numerical]
    heat = sns.heatmap(train_num.corr(),
                       annot=True,
                       ax=ax13,
                       fmt='.2f',
                       cbar=True,
                       square=True,
                       xticklabels=True,
                       yticklabels=True,
                       annot_kws={'size': 16},
                       cmap='coolwarm',
                       center=0,
                       vmin=-1,
                       vmax=1,
                       cbar_kws={"shrink": 1})
    ax13.set_title('Heatmap of Numerical Variable Correlation', size=25)
    ax13.set_xticklabels(ax13.xaxis.get_majorticklabels(),
                         rotation=60,
                         size=15)
    ax13.set_yticklabels(ax13.yaxis.get_majorticklabels(), rotation=0, size=15)
    ax13.collections[0].colorbar.ax.tick_params(labelsize=15)

    # Make annotations larger if abs(correlation) above 0.2.
    # `bigs` collects the rounded absolute correlations above 0.2 from
    # strongest to weakest, skipping the single largest value (index -1).
    num_corrs = len(np.unique(train_num.corr().values.flatten()))
    bigs = []
    for i in np.arange(2, num_corrs + 1):
        val = round(
            np.sort(np.abs(np.unique(train_num.corr().values.flatten())))[-i],
            2)
        if val > 0.2:
            bigs = np.append(bigs, val)
    for text in heat.texts:
        num = pd.to_numeric(text.get_text())
        # Position within `bigs` is the rank; a higher rank gets a smaller font.
        i = np.where(bigs == abs(num))[0]
        if i.size > 0:
            text.set_color('white')
            text.set_size(27 - (i[0] * 3))

    # [2, 3] Average attendance per game type. 'Playoffs?' is recoded to text
    # labels in place, so this must run after the 0/1 comparisons above.
    train.loc[train['Playoffs?'] == 0, "Playoffs?"] = 'Regular Season'
    train.loc[train['Playoffs?'] == 1, "Playoffs?"] = 'Playoffs'
    grouped = train[['Playoffs?', 'Attendance']].groupby('Playoffs?').mean()
    order = ['Regular Season', 'Playoffs']
    sns.barplot(x='Attendance',
                y=grouped.index,
                data=grouped,
                order=order,
                ci=None,
                orient='h',
                saturation=1,
                ax=ax23,
                palette=sns.color_palette("cubehelix", 5))
    ax23.set_xlim(16500, 19500)
    ax23.set_xticks(range(16500, 19501, 500))
    ax23.set_xlabel('Average Attendance (# of people)', fontsize=20)
    ax23.set_ylabel('Game Type', fontsize=20)
    ax23.tick_params(labelsize=15)
    ax23.set_title(label="Average Attendance per Game Type", fontsize=25)

    # [3, 0:2] Average attendance per current win percentage. The column is
    # rounded to one decimal and multiplied by 100 in place, so this must run
    # after the fractional comparisons above.
    train[['Curr Win %']] = np.round(train[['Curr Win %']], 1) * 100
    grouped = train[['Curr Win %', 'Attendance']].groupby('Curr Win %').mean()
    order = np.arange(0, 101, 10)
    sns.barplot(x='Attendance',
                y=grouped.index,
                data=grouped,
                order=order,
                ci=None,
                orient='h',
                saturation=1,
                ax=ax30,
                palette=sns.color_palette("cubehelix", 5))
    ax30.set_xlim(16500, 19500)
    ax30.set_xticks(range(16500, 19501, 500))
    ax30.set_xlabel('Average Attendance (# of people)', fontsize=20)
    ax30.set_ylabel('Current Win %', fontsize=20)
    ax30.tick_params(labelsize=15)
    ax30.set_title(label="Average Attendance per Current Win %", fontsize=25)

    # ax[3][2] = sns.pairplot(data = train_num)
    # ax[3][2].set_title('Pairplot of Numerical Variable Correlation', size=25)
    # ax[3][2].set_xticklabels(ax[3][2].xaxis.get_majorticklabels(), rotation=60, size = 15)
    # ax[3][2].set_yticklabels(ax[3][2].yaxis.get_majorticklabels(), rotation=0, size = 15)

    # Let the GridSpec lay out the subplots within the figure.
    gs.tight_layout(fig)
    # plt.tight_layout()
    plt.show()

    if save:
        to_save = Path().resolve().joinpath('visualizations',
                                            'all_plots_{}.png'.format(name))
        fig.savefig(to_save, dpi=300)