def silhouettes(name, X_train, max_clusters = 15, min_clusters = 5, save = False):
    '''Draw one silhouette plot per cluster count in [min_clusters, max_clusters].

    For every cluster count a new figure is created, KMeans labels are
    predicted for the standardized numerical training columns, the average
    silhouette score is printed, and one horizontal band per cluster is drawn.

    Parameters:
        name: dataset key passed to ``ds.get_number_numerical``,
            ``split.standardize`` and ``kmeans``.
        X_train: training features; the leading ``num_numerical`` columns
            (after standardization) are the ones that get clustered.
        max_clusters: largest cluster count plotted (inclusive).
        min_clusters: smallest cluster count plotted.
        save: when true, the most recently created figure is written to
            ``data/visualizations/<name>_elbow.png`` under the current
            working directory.

    NOTE(review): ``kmeans`` is called here with three arguments -- confirm
    that this matches its current signature.
    NOTE(review): only the figure of the last cluster count is saved, and it
    is saved under an ``_elbow`` file name -- confirm this is intended.
    '''
    features = X_train.copy()
    numerical_count = ds.get_number_numerical(name)
    standardized = split.standardize(name, features)
    scaled_numerical = standardized.iloc[:, 0:numerical_count]

    # Vertical blank space (in samples) separating two cluster bands.
    band_gap = 10

    for n_clusters in range(min_clusters, max_clusters + 1):
        fig, ax = plt.subplots()
        fig.set_size_inches(18, 7)

        # Silhouette coefficients live in [-1, 1]; only this window is shown.
        ax.set_xlim([-0.7, 1])
        # Leave room for the gaps inserted between the individual bands.
        ax.set_ylim([0, len(scaled_numerical) + (n_clusters + 1) * band_gap])

        model = kmeans(name, n_clusters, scaled_numerical)
        labels = model.predict(scaled_numerical)

        average_score = silhouette_score(scaled_numerical, labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", average_score)

        sample_scores = silhouette_samples(scaled_numerical, labels)

        band_bottom = band_gap
        for cluster_id in range(n_clusters):
            # Boolean indexing yields a copy, so sorting never touches
            # ``sample_scores`` itself.
            band_values = np.sort(sample_scores[labels == cluster_id])
            band_height = len(band_values)
            band_top = band_bottom + band_height

            band_color = cm.nipy_spectral(cluster_id / n_clusters)
            ax.fill_betweenx(np.arange(band_bottom, band_top),
                             0,
                             band_values,
                             facecolor=band_color,
                             edgecolor=band_color,
                             alpha=0.7)

            # Cluster number, placed at the vertical middle of its band.
            ax.text(-0.05, band_bottom + 0.5 * band_height, str(cluster_id))

            band_bottom = band_top + band_gap

        ax.set_title("The silhouette plot for the various clusters.")
        ax.set_xlabel("The silhouette coefficient values")
        ax.set_ylabel("Cluster label")

        # Dashed red line: average silhouette score over all samples.
        ax.axvline(x=average_score, color="red", linestyle="--")

        ax.set_yticks([])  # no y tick labels; bands are labelled inline
        ax.set_xticks(np.arange(-0.6, 1.1, 0.2))

    plt.show()

    if save:
        target = Path().resolve().joinpath('data', 'visualizations',
                                           '{}_elbow.png'.format(name))
        fig.savefig(target)
def find_vifs(name, X_train, tolerance=5):
    '''Iteratively drop numerical training features based on VIF scores.

    In each round the variance inflation factor of every remaining numerical
    column is computed and recorded. If the largest VIF exceeds ``tolerance``
    that column is dropped and another round starts; otherwise the search
    stops.

    Parameters:
        name: dataset key used to look up the number of numerical columns.
        X_train: training features; the leading numerical columns are used.
        tolerance: largest VIF a feature may have without being dropped.

    Returns:
        DataFrame indexed by the original numerical column names with one
        ``"VIF Round <k>"`` column per round; features already dropped in an
        earlier round hold NaN in later rounds.
    '''
    X_train = X_train.copy()
    number_numerical = ds.get_number_numerical()[name]
    train_num = X_train.iloc[:, 0:number_numerical]

    vif = pd.DataFrame(index=train_num.columns)
    remaining = train_num.copy()

    # At most one feature is dropped per round, so one round per column
    # is always enough.
    for round_number in range(train_num.shape[1]):
        try:
            scores = [
                variance_inflation_factor(remaining.values, position)
                for position in range(remaining.shape[1])
            ]
        except Exception:
            # Best effort, as before: if the scores cannot be computed the
            # search simply ends with the rounds collected so far. Only the
            # VIF computation is guarded now, so unrelated errors (and
            # KeyboardInterrupt/SystemExit) are no longer swallowed.
            # NOTE(review): which exception types this call raises on
            # degenerate input was not verified -- narrow further if known.
            break

        if not scores:
            break

        round_scores = pd.DataFrame(
            scores,
            columns=["VIF Round {}".format(round_number)],
            index=remaining.columns)
        vif = pd.concat([vif, round_scores], axis=1)

        # First position holding the largest score.
        worst = int(np.argmax(scores))
        if not scores[worst] > tolerance:
            break
        remaining = remaining.drop([remaining.columns[worst]], axis=1)

    return vif
def pca(name, X_train, X_test, dimension):
    """Replace the numerical columns of train and test with PCA components.

    Both frames are standardized, the leading numerical columns are projected
    onto ``dimension`` principal components fitted on the training data only,
    and the components are concatenated with the untouched remaining
    (categorical) columns.

    Train and test are standardized through the joint three-argument
    ``split.standardize(name, X_train, X_test)`` call that ``pca_cv`` and
    ``kmeans`` use, instead of two independent calls, so both frames go
    through the same standardization step.

    Parameters:
        name: dataset key used to look up the number of numerical columns.
        X_train: training features, numerical columns first.
        X_test: test features with the same column layout.
        dimension: forwarded to ``PCA`` as its first argument.

    Returns:
        Tuple ``(X_train_s, X_test_s)`` of frames whose leading columns are
        the principal components (integer column labels) followed by the
        standardized categorical columns.
    """
    X_train = X_train.copy()
    X_test = X_test.copy()
    num_numerical = ds.get_number_numerical()[name]

    X_train_s, X_test_s = split.standardize(name, X_train, X_test)

    X_train_s_numerical = X_train_s.iloc[:, 0:num_numerical]
    X_train_s_categorical = X_train_s.iloc[:, num_numerical:]
    X_test_s_numerical = X_test_s.iloc[:, 0:num_numerical]
    X_test_s_categorical = X_test_s.iloc[:, num_numerical:]

    # Fixed seed, matching pca_cv(), so results are reproducible when a
    # randomized SVD solver is selected.
    estimator = PCA(dimension, random_state=18)

    # Fit on train only; the test set is merely projected.
    X_train_s_numerical_reduced = pd.DataFrame(
        estimator.fit_transform(X_train_s_numerical),
        index=X_train_s_categorical.index)
    X_test_s_numerical_reduced = pd.DataFrame(
        estimator.transform(X_test_s_numerical),
        index=X_test_s_categorical.index)

    X_train_s = pd.concat(
        [X_train_s_numerical_reduced, X_train_s_categorical], axis=1)
    X_test_s = pd.concat(
        [X_test_s_numerical_reduced, X_test_s_categorical], axis=1)
    return X_train_s, X_test_s
def elbow_method_kmeans(name, max_clusters = 30, min_clusters = 2, save = False):
    '''Create the elbow-method plot for a varying number of clusters.

    The y-axis is the sum of squared distances of samples to their closest
    cluster center (``inertia_`` of the fitted KMeans model).

    Parameters:
        name: dataset key used for the split, the display name and ``kmeans``.
        max_clusters: largest cluster count evaluated (inclusive).
        min_clusters: smallest cluster count evaluated.
        save: when true, the figure is written to
            ``visualizations/clustering_elbow_<name>.png`` under the current
            working directory.
    '''
    display_name = ds.get_names()[name]

    # The first two elements of the split are the train and test features.
    # The previous ``[0:1]`` slice held a single element and could not be
    # unpacked into two names.
    X_train, X_test = split.split_subset(name)[0:2]

    # Inclusive upper bound so that ``max_clusters`` itself is evaluated.
    cluster_range = range(min_clusters, max_clusters + 1)

    # kmeans() needs both feature frames; it selects and standardizes the
    # numerical columns itself.
    distortions = [
        kmeans(name, clusters, X_train, X_test).inertia_
        for clusters in cluster_range
    ]

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.plot(cluster_range, distortions, marker = 'o')
    ax.set_title('Elbow Method for KMeans for {}'.format(display_name), size=25)
    ax.set_xlabel('Number of Clusters', fontsize = 20)
    ax.set_ylabel('Sum of Squared Distances', fontsize = 20)
    plt.show()

    if save:
        to_save = Path().resolve().joinpath(
            'visualizations', 'clustering_elbow_{}.png'.format(name))
        fig.savefig(to_save, dpi=300)
def main_datasets():
    '''Display a Markdown summary and a preview of the three main datasets.

    For each of ``dataset_1``..``dataset_3`` a heading with the feature
    counts (total, numerical, categorical) is shown, followed by the frame
    itself and a horizontal rule. One column per frame is counted as the
    response and excluded from the feature count.

    Side effect: sets the pandas option ``display.max_rows`` to 2.
    '''
    pd.set_option('display.max_rows', 2)
    number_numerical = ds.get_number_numerical()
    names = ['dataset_1', 'dataset_2', 'dataset_3']
    dsets = ds.load_datasets(names = names)

    heading = ('### `Dataset {}:` {} features: {} numerical, '
               '{} categorical, 1 response')
    for position, key in enumerate(names, start=1):
        frame = dsets[key]
        n_features = len(frame.columns) - 1  # all columns but the response
        n_numerical = number_numerical[key]
        display(Markdown(heading.format(
            position, n_features, n_numerical, n_features - n_numerical)))
        display(frame)
        display(Markdown('---'))
def main_datasets_split():
    '''Display a Markdown summary and a preview of each dataset's training set.

    For each of ``dataset_1``..``dataset_3`` the element at index 4 of its
    split is shown under a heading with the column counts (total, numerical,
    categorical), followed by a horizontal rule.

    Side effect: sets the pandas option ``display.max_rows`` to 2.

    NOTE(review): unlike ``main_datasets``, no column is subtracted for the
    response here although the heading still says "1 response" -- confirm
    whether the frame at index 4 contains the response column.
    '''
    number_numerical = ds.get_number_numerical()
    pd.set_option('display.max_rows', 2)
    names = ['dataset_1', 'dataset_2', 'dataset_3']
    dsets = split.split_subsets(names)

    heading = ('### `Dataset {} Training Set:` {} features: {} numerical, '
               '{} categorical, 1 response')
    for position, key in enumerate(names, start=1):
        training_frame = dsets[key][4]
        n_columns = len(training_frame.columns)
        n_numerical = number_numerical[key]
        display(Markdown(heading.format(
            position, n_columns, n_numerical, n_columns - n_numerical)))
        display(training_frame)
        display(Markdown('---'))
def kmeans(name, n_clusters, X_train, X_test):
    '''Fit KMeans on the standardized numerical training columns.

    Parameters:
        name: dataset key used to look up the number of numerical columns
            and passed to ``split.standardize``.
        n_clusters: number of clusters to fit.
        X_train: training features, numerical columns first.
        X_test: test features; only handed to ``split.standardize``
            alongside ``X_train``.

    Returns:
        The fitted ``KMeans`` estimator (``random_state=18``).
    '''
    X_train = X_train.copy()
    X_test = X_test.copy()
    num_numerical = ds.get_number_numerical()[name]
    X_train_s_numerical = split.standardize(
        name, X_train, X_test)[0].iloc[:, 0:num_numerical]

    try:
        # Older scikit-learn releases accept ``n_jobs``; keep using all
        # cores there.
        model = KMeans(n_clusters=n_clusters, random_state=18, n_jobs=-1)
    except TypeError:
        # ``n_jobs`` was deprecated in scikit-learn 0.23 and removed in 1.0,
        # where passing it raises TypeError. The clustering result does not
        # depend on it.
        model = KMeans(n_clusters=n_clusters, random_state=18)
    return model.fit(X_train_s_numerical)
def create_heatmap(name, train, save=False):
    '''Create and optionally save a heatmap of numerical-variable correlations.

    The annotations of correlations whose absolute (two-decimal) value is
    above 0.2 are drawn in white and enlarged, the strongest ones the most.

    Parameters:
        name: dataset key; used to look up the number of numerical columns,
            in the plot title and in the output file name.
        train: frame whose leading numerical columns are correlated.
        save: when true, the figure is written to
            ``data/visualizations/<name>_heatmap.png`` under the current
            working directory.
    '''
    train = train.copy()
    num_numerical = ds.get_number_numerical()[name]
    train_num = train.iloc[:, 0:num_numerical]

    # Computed once and reused for both the plot and the ranking below.
    corr = train_num.corr()

    fig, ax = plt.subplots(figsize=(15, 15))
    base_size = 16
    heat = sns.heatmap(corr,
                       annot=True,
                       ax=ax,
                       fmt='.2f',
                       cbar=True,
                       square=True,
                       xticklabels=True,
                       yticklabels=True,
                       annot_kws={'size': base_size},
                       cmap='coolwarm',
                       center=0,
                       vmin=-1,
                       vmax=1,
                       cbar_kws={"shrink": .82})
    ax.set_title(
        'Heatmap of Numerical Variable Correlation for {}'.format(name),
        size=25)
    plt.yticks(rotation=0, size=15)
    plt.xticks(rotation=30, size=15)
    ax.collections[0].colorbar.ax.tick_params(labelsize=15)

    # Make annotations larger if abs(correlation) is above 0.2.
    # Absolute values of the distinct correlations in ascending order; the
    # last entry (the diagonal's 1.0) is skipped and the rest is walked from
    # strongest to weakest.
    abs_sorted = np.sort(np.abs(np.unique(corr.values.flatten())))
    bigs = np.array([
        rounded
        for rounded in (round(value, 2) for value in abs_sorted[-2::-1])
        if rounded > 0.2
    ])

    for text in heat.texts:
        num = pd.to_numeric(text.get_text())
        ranks = np.where(bigs == abs(num))[0]
        if ranks.size > 0:
            text.set_color('white')
            # Shrink by 3pt per rank, but never below the regular annotation
            # size: with many strong correlations the size would otherwise
            # drop under the non-highlighted text or become non-positive.
            text.set_size(max(40 - (ranks[0] * 3), base_size))

    plt.show()

    if save:
        to_save = Path().resolve().joinpath('data', 'visualizations',
                                            '{}_heatmap.png'.format(name))
        fig.savefig(to_save)
def pca_cv(name, save=False):
    '''Evaluate a linear regression for every PCA dimension of a dataset.

    For each number of components from 1 to ``num_numerical - 1`` the
    standardized numerical columns are reduced with PCA (fitted on the
    training data), recombined with the categorical columns, and a
    ``LinearRegression`` is fitted and scored on the test set through
    ``metrics.apply_metrics``.

    Parameters:
        name: dataset key used for the split, the display name and the
            number of numerical columns.
        save: when true, the metrics table is written to
            ``features/pca/<name>.csv`` under the current working directory.

    Returns:
        Tuple ``(df, ev)``: ``df`` stacks the result of
        ``metrics.apply_metrics`` for every dimension row-wise (an empty
        DataFrame when no dimension is evaluated); ``ev`` lists, per
        dimension, the fraction of variance NOT explained by the retained
        components, i.e. ``1 - sum(explained_variance_ratio_)``.
    '''
    display_name = ds.get_names()[name]
    X_train, X_test, y_train, y_test, _ = split.split_subset(name)
    num_numerical = ds.get_number_numerical()[name]

    X_train_s, X_test_s = split.standardize(name, X_train, X_test)
    train_numerical = X_train_s.iloc[:, 0:num_numerical]
    train_categorical = X_train_s.iloc[:, num_numerical:]
    test_numerical = X_test_s.iloc[:, 0:num_numerical]
    test_categorical = X_test_s.iloc[:, num_numerical:]

    ols = LinearRegression()
    results = []
    ev = []

    for n_components in range(1, num_numerical):
        # Named ``reducer`` so the module-level pca() is not shadowed.
        reducer = PCA(n_components, random_state=18)
        train_reduced = pd.DataFrame(
            reducer.fit_transform(train_numerical),
            index=train_categorical.index)
        test_reduced = pd.DataFrame(
            reducer.transform(test_numerical),
            index=test_categorical.index)

        train_design = pd.concat([train_reduced, train_categorical], axis=1)
        test_design = pd.concat([test_reduced, test_categorical], axis=1)

        model = ols.fit(train_design, y_train)
        preds = model.predict(test_design)
        results.append(metrics.apply_metrics(
            '{}: {} dimensions'.format(display_name, n_components),
            y_test, preds.ravel(), y_train))

        # The *ratio* sums to at most 1, so this is the unexplained share.
        # ``explained_variance_`` holds absolute variances, and subtracting
        # their sum from 1 does not give a share.
        ev.append(1 - reducer.explained_variance_ratio_.sum())

    # Concatenate once instead of growing a frame inside the loop.
    df = pd.concat(results, axis=0) if results else pd.DataFrame()

    if save:
        to_save = Path().resolve().joinpath('features', 'pca',
                                            '{}.csv'.format(name))
        df.to_csv(to_save)

    return df, ev
def find_numerical_significance(name, X_train, y_train):
    '''Score each numerical training feature against the response.

    Two statistics are reported per feature: the p-value returned by
    ``f_regression`` (H0: the feature has no effect on the outcome) and the
    estimate returned by ``mutual_info_regression`` (seeded with 18).

    Parameters:
        name: dataset key used to look up the number of numerical columns.
        X_train: training features, numerical columns first.
        y_train: training response.

    Returns:
        DataFrame indexed by numerical feature name with the columns
        ``"F-test (P-Value)"`` and ``"Estimated Mutual Information"``.
    '''
    features = X_train.copy()
    target = y_train.copy()

    numerical_counts = ds.get_number_numerical()
    numerical = features.iloc[:, 0:numerical_counts[name]]
    feature_names = numerical.columns.values

    # f_regression returns (F statistics, p-values); only the latter is kept.
    _, p_values = f_regression(numerical, target)
    mutual_info = mutual_info_regression(numerical, target, random_state=18)

    p_value_frame = pd.DataFrame(p_values,
                                 index=feature_names,
                                 columns=["F-test (P-Value)"])
    mutual_info_frame = pd.DataFrame(mutual_info,
                                     index=feature_names,
                                     columns=["Estimated Mutual Information"])
    return pd.concat([p_value_frame, mutual_info_frame], axis=1)
def create_cluster_plots(name, save = False):
    '''Plot a KMeans scree (elbow) plot next to a silhouette plot.

    The left axes shows the KMeans inertia for 2 to 30 clusters, each point
    annotated with its cluster count. The right axes shows the silhouette
    plot for a fixed cluster count: 9 for ``'dataset_3'``, 7 otherwise.

    Parameters:
        name: dataset key used for the split, ``kmeans`` and the lookup of
            the number of numerical columns.
        save: when true, the figure is written to
            ``visualizations/clustering/<name>.png`` under the current
            working directory.
    '''
    X_train, X_test = split.split_subset(name)[0:2]
    num_numerical = ds.get_number_numerical()[name]
    X_train_numerical = X_train.iloc[:, 0:num_numerical]
    X_test_numerical = X_test.iloc[:, 0:num_numerical]

    # Cluster count used for the silhouette plot.
    clusters = 9 if name == 'dataset_3' else 7

    cluster_range = range(2, 31)
    distortions = []
    chosen_model = None
    for candidate in cluster_range:
        model = kmeans(name, candidate, X_train, X_test)
        distortions.append(model.inertia_)
        if candidate == clusters:
            # Keep this fit for the silhouette plot; kmeans() uses a fixed
            # random_state, so fitting it a second time would only repeat
            # the same work.
            chosen_model = model

    fig, ax = plt.subplots(ncols = 2, figsize=(40, 12))
    scree_ax, silhouette_ax = ax

    scree_ax.plot(cluster_range, distortions, marker = 'o')
    scree_ax.set_title('KMeans Scree Plot', fontsize = 40)
    scree_ax.set_xlabel('Number of Clusters', fontsize = 30)
    scree_ax.set_ylabel('Sum of Squared Distances', fontsize = 30)
    scree_ax.tick_params(labelsize=20)
    for candidate, distortion in zip(cluster_range, distortions):
        annot = scree_ax.annotate('{}'.format(candidate),
                                  (candidate, distortion))
        annot.set_fontsize(25)

    X_train_s_numerical = split.standardize(
        name, X_train_numerical, X_test_numerical)[0]

    silhouette_ax.set_xlim([-0.3, 0.8])
    # Leave room for the 10-sample gaps inserted between cluster bands.
    silhouette_ax.set_ylim(
        [0, len(X_train_s_numerical) + (clusters + 1) * 10])

    cluster_labels = chosen_model.predict(X_train_s_numerical)
    silhouette_avg = silhouette_score(X_train_s_numerical, cluster_labels)
    cluster_silhouette = silhouette_samples(X_train_s_numerical,
                                            cluster_labels)

    y_lower = 10
    for i in range(clusters):
        # Boolean indexing yields a copy, so sorting leaves
        # ``cluster_silhouette`` untouched.
        ith_cluster_silhouette_values = np.sort(
            cluster_silhouette[cluster_labels == i])
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / clusters)
        silhouette_ax.fill_betweenx(np.arange(y_lower, y_upper),
                                    0,
                                    ith_cluster_silhouette_values,
                                    facecolor=color,
                                    edgecolor=color,
                                    alpha=0.7)
        # Cluster number at the vertical middle of its band.
        silhouette_ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i),
                           fontsize = 25)
        y_lower = y_upper + 10

    silhouette_ax.set_title(
        "Silhouette plot for {} clusters.".format(clusters), fontsize = 40)
    silhouette_ax.set_xlabel("Silhouette Coefficient Values", fontsize = 30)
    silhouette_ax.set_ylabel("Cluster label", fontsize = 30)
    # Dashed red line: average silhouette score over all samples.
    silhouette_ax.axvline(x=silhouette_avg, color="red", linestyle="--")
    silhouette_ax.set_yticks([])
    silhouette_ax.set_xticks(np.arange(-0.3, 0.9, 0.1))
    silhouette_ax.tick_params(labelsize=20)

    plt.tight_layout()
    plt.show()

    if save:
        to_save = Path().resolve().joinpath('visualizations', 'clustering',
                                            '{}.png'.format(name))
        fig.savefig(to_save, dpi = 300)
def create_plots(name, save=False):
    '''Draw a 4x4 grid of exploratory attendance plots for one dataset.

    The dataset is loaded with ``ds.load_dataset(name)`` and plotted as
    attendance histograms (overall, per day of week, per month, per year,
    regular season vs. playoffs, per current win percentage, per last-five
    record), horizontal bar charts of average attendance for the same
    groupings, and a correlation heatmap of the leading numerical columns.

    Parameters:
        name: dataset key passed to ``ds.load_dataset``, used to look up the
            number of numerical columns and in the output file name.
        save: when true, the figure is written to
            ``visualizations/all_plots_<name>.png`` under the current working
            directory.

    The loaded frame is modified in place along the way (a ``'Year'`` column
    is added, ``'Playoffs?'`` is relabelled and ``'Curr Win %'`` is rescaled),
    so the order of the sections below matters.

    NOTE(review): ``sns.distplot`` is deprecated in recent seaborn releases;
    confirm the seaborn version this project pins.
    '''
    dset = ds.load_dataset(name)
    # Alias, not a copy: every mutation of ``train`` below also changes
    # ``dset``.
    train = dset
    sns.set_style("whitegrid")
    sns.set_palette(sns.color_palette('bright', 12))
    fig = plt.figure(figsize=(40, 27))
    # Rows 0-2 hold four plots each; row 3 holds two double-width plots.
    gs = gridspec.GridSpec(4, 4)
    ax00 = plt.subplot(gs[0, 0])
    ax01 = plt.subplot(gs[0, 1])
    ax02 = plt.subplot(gs[0, 2])
    ax03 = plt.subplot(gs[0, 3])
    ax10 = plt.subplot(gs[1, 0])
    ax11 = plt.subplot(gs[1, 1])
    ax12 = plt.subplot(gs[1, 2])
    ax13 = plt.subplot(gs[1, 3])
    ax20 = plt.subplot(gs[2, 0])
    ax21 = plt.subplot(gs[2, 1])
    ax22 = plt.subplot(gs[2, 2])
    ax23 = plt.subplot(gs[2, 3])
    ax30 = plt.subplot(gs[3, 0:2])
    ax31 = plt.subplot(gs[3, 2:4])
    # Shared histogram bin edges: 4000 to 26000 in steps of 500.
    bins = np.arange(4000, 26001, 500)
    # fig, ax = plt.subplots(nrows = 4, ncols = 4, figsize=(40, 20))

    # --- ax00: histogram of overall attendance ---
    sns.distplot(train['Attendance'],
                 ax=ax00,
                 kde=False,
                 norm_hist=True,
                 bins=bins)
    ax00.set_xlabel('Attendance (# of people)', fontsize=20)
    ax00.set_ylabel('Percent per person', fontsize=20)
    ax00.tick_params(labelsize=15)
    ax00.set_title(label="Overall Attendance", fontsize=25)

    # --- ax01: one overlaid histogram per day of the week ---
    days = train['Day of Week'].unique()
    for day in days:
        sns.distplot(train.loc[train['Day of Week'] == day]['Attendance'],
                     ax=ax01,
                     kde=False,
                     norm_hist=True,
                     bins=bins)
    ax01.set_title(label="Attendance per Day", fontsize=25)
    ax01.set_xlabel('Attendance (# of people)', fontsize=20)
    ax01.set_ylabel('Percent per person', fontsize=20)
    ax01.tick_params(labelsize=15)
    ax01.legend(days, loc="upper right", fontsize=15)

    # --- ax20: average attendance per day of the week ---
    grouped = train[['Day of Week',
                     'Attendance']].groupby('Day of Week').mean()
    order = [
        'Monday', "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday",
        "Sunday"
    ]
    sns.barplot(x='Attendance',
                y=grouped.index,
                data=grouped,
                order=order,
                ci=None,
                orient='h',
                saturation=1,
                ax=ax20,
                palette=sns.color_palette("cubehelix", 7))
    ax20.set_xlim(16500, 19000)
    ax20.set_xticks(range(16500, 19001, 500))
    ax20.set_xlabel('Average Attendance (# of people)', fontsize=20)
    ax20.set_ylabel('Day of the Week', fontsize=20)
    ax20.tick_params(labelsize=15)
    ax20.set_title(label="Average Attendance per Day", fontsize=25)

    # --- ax02: one overlaid histogram per month ---
    months = train['Month'].unique()
    for month in months:
        sns.distplot(train.loc[train['Month'] == month]['Attendance'],
                     ax=ax02,
                     kde=False,
                     norm_hist=True,
                     bins=bins)
    ax02.set_title(label="Attendance per Month", fontsize=25)
    ax02.set_xlabel('Attendance (# of people)', fontsize=20)
    ax02.set_ylabel('Percent per person', fontsize=20)
    ax02.tick_params(labelsize=15)
    ax02.legend(months, loc="upper right", fontsize=15)

    # --- ax21: average attendance per month, October through June ---
    grouped = train[['Month', 'Attendance']].groupby('Month').mean()
    order = [
        'October', 'November', 'December', 'January', 'February', 'March',
        'April', 'May', 'June'
    ]
    sns.barplot(x='Attendance',
                y=grouped.index,
                data=grouped,
                order=order,
                ci=None,
                orient='h',
                saturation=1,
                ax=ax21,
                palette=sns.color_palette("cubehelix", 9))
    ax21.set_xlim(16500, 20000)
    ax21.set_xticks(range(16500, 20001, 500))
    ax21.set_xlabel('Average Attendance (# of people)', fontsize=20)
    ax21.set_ylabel('Month', fontsize=20)
    ax21.tick_params(labelsize=15)
    ax21.set_title(label="Average Attendance per Month", fontsize=25)

    # --- ax22: average attendance per year ---
    # Adds a 'Year' column taken from the index (which therefore has to
    # expose ``.year``).
    train['Year'] = train.index.year
    grouped = train[['Year', 'Attendance']].groupby('Year').mean()
    order = np.sort(train.index.year.unique())
    sns.barplot(x='Attendance',
                y=grouped.index,
                data=grouped,
                order=order,
                ci=None,
                orient='h',
                saturation=1,
                ax=ax22,
                palette=sns.color_palette("cubehelix", 13))
    ax22.set_xlim(16500, 18500)
    ax22.set_xticks(range(16500, 18501, 500))
    ax22.set_xlabel('Average Attendance (# of people)', fontsize=20)
    ax22.set_ylabel('Year', fontsize=20)
    ax22.tick_params(labelsize=15)
    ax22.set_title(label="Average Attendance per Year", fontsize=25)

    # --- ax03: overlaid histograms for the five most recent years ---
    years = np.arange(
        max(train['Year'].values) - 4,
        max(train['Year'].values) + 1)
    for year in years:
        sns.distplot(train.loc[train.index.year == year]['Attendance'],
                     ax=ax03,
                     kde=False,
                     norm_hist=True,
                     bins=bins)
    ax03.set_title(label="Attendance per Year", fontsize=25)
    ax03.set_xlabel('Attendance (# of people)', fontsize=20)
    ax03.set_ylabel('Percent per person', fontsize=20)
    ax03.tick_params(labelsize=15)
    ax03.legend(years, loc="upper right", fontsize=20)

    # --- ax10: regular season (0) vs. playoffs (1) ---
    for j in [0, 1]:
        sns.distplot(train.loc[train['Playoffs?'] == j]['Attendance'],
                     ax=ax10,
                     kde=False,
                     norm_hist=True,
                     bins=bins)
    ax10.set_title(label="Attendance for Regular and Playoff Games",
                   fontsize=25)
    ax10.set_xlabel('Attendance (# of people)', fontsize=20)
    ax10.set_ylabel('Percent per person', fontsize=20)
    ax10.tick_params(labelsize=15)
    ax10.legend(['Regular Season', 'Playoffs'],
                loc="upper right",
                fontsize=15)

    # --- ax11: one histogram per 0.1-wide band of current win percentage ---
    # Bands are half-open [i, i + 0.1).
    # NOTE(review): the loop draws 10 bands (0.0 .. 0.9) while the legend
    # below lists 11 labels up to '100 %' -- confirm which is intended.
    win_percent = np.arange(0, 1, 0.10)
    for i in win_percent:
        sns.distplot(
            train.loc[(train['Curr Win %'] >= i)
                      & (train['Curr Win %'] < i + 0.1)]['Attendance'],
            ax=ax11,
            kde=False,
            norm_hist=True,
            bins=bins)
    ax11.set_title(label="Attendance for Different Win Percentages",
                   fontsize=25)
    ax11.set_xlabel('Attendance (# of people)', fontsize=20)
    ax11.set_ylabel('Percent per person', fontsize=20)
    ax11.tick_params(labelsize=15)
    ax11.legend([
        '0 %', '10 %', '20 %', '30 %', '40 %', '50 %', '60 %', '70 %', '80 %',
        '90 %', '100 %'
    ],
                loc="upper right",
                fontsize=17)

    # --- ax12: one histogram per number of wins in the last five games ---
    last_five = np.arange(0, 6)
    for i in last_five:
        sns.distplot(train.loc[train['Last Five'] == i]['Attendance'],
                     ax=ax12,
                     kde=False,
                     norm_hist=True,
                     bins=bins)
    ax12.set_title(label="Attendance by Last Five Record", fontsize=25)
    ax12.set_xlabel('Attendance (# of people)', fontsize=20)
    ax12.set_ylabel('Percent per person', fontsize=20)
    ax12.tick_params(labelsize=15)
    ax12.legend(['0 Wins', '1 Win', '2 Wins', "3 Wins", "4 Wins", "5 Wins"],
                loc="upper right",
                fontsize=20)

    # --- ax31: average attendance per last-five record ---
    # NOTE(review): six categories (0..5) but a five-colour palette is
    # requested -- confirm.
    grouped = train[['Last Five', 'Attendance']].groupby('Last Five').mean()
    order = np.arange(0, 6)
    sns.barplot(x='Attendance',
                y=grouped.index,
                data=grouped,
                order=order,
                ci=None,
                orient='h',
                saturation=1,
                ax=ax31,
                palette=sns.color_palette("cubehelix", 5))
    ax31.set_xlim(16500, 18500)
    ax31.set_xticks(range(16500, 18501, 500))
    ax31.set_xlabel('Average Attendance (# of people)', fontsize=20)
    ax31.set_ylabel('Record Over Last Five Games', fontsize=20)
    ax31.tick_params(labelsize=15)
    ax31.set_title(label="Average Attendance per Last Five Record",
                   fontsize=25)

    # --- ax13: correlation heatmap of the leading numerical columns ---
    # Must run before 'Curr Win %' is rescaled further down.
    num_numerical = ds.get_number_numerical()[name]
    train_num = train.iloc[:, 0:num_numerical]
    heat = sns.heatmap(train_num.corr(),
                       annot=True,
                       ax=ax13,
                       fmt='.2f',
                       cbar=True,
                       square=True,
                       xticklabels=True,
                       yticklabels=True,
                       annot_kws={'size': 16},
                       cmap='coolwarm',
                       center=0,
                       vmin=-1,
                       vmax=1,
                       cbar_kws={"shrink": 1})
    ax13.set_title('Heatmap of Numerical Variable Correlation', size=25)
    ax13.set_xticklabels(ax13.xaxis.get_majorticklabels(),
                         rotation=60,
                         size=15)
    ax13.set_yticklabels(ax13.yaxis.get_majorticklabels(),
                         rotation=0,
                         size=15)
    ax13.collections[0].colorbar.ax.tick_params(labelsize=15)

    # Make annotations larger if abs(correlation) above 0.2
    # ``bigs`` collects the rounded absolute correlations above 0.2 from
    # strongest to weakest, skipping the single largest value (index -1).
    num_corrs = len(np.unique(train_num.corr().values.flatten()))
    bigs = []
    for i in np.arange(2, num_corrs + 1):
        val = round(
            np.sort(np.abs(np.unique(train_num.corr().values.flatten())))[-i],
            2)
        if val > 0.2:
            bigs = np.append(bigs, val)

    # Annotations whose value appears in ``bigs`` turn white and get a font
    # size that shrinks by 3pt per rank.
    for text in heat.texts:
        num = pd.to_numeric(text.get_text())
        i = np.where(bigs == abs(num))[0]
        if i.size > 0:
            text.set_color('white')
            text.set_size(27 - (i[0] * 3))

    # --- ax23: average attendance per game type ---
    # Replaces the 0/1 flags with display labels in place.
    train.loc[train['Playoffs?'] == 0, "Playoffs?"] = 'Regular Season'
    train.loc[train['Playoffs?'] == 1, "Playoffs?"] = 'Playoffs'
    grouped = train[['Playoffs?', 'Attendance']].groupby('Playoffs?').mean()
    order = ['Regular Season', 'Playoffs']
    sns.barplot(x='Attendance',
                y=grouped.index,
                data=grouped,
                order=order,
                ci=None,
                orient='h',
                saturation=1,
                ax=ax23,
                palette=sns.color_palette("cubehelix", 5))
    ax23.set_xlim(16500, 19500)
    ax23.set_xticks(range(16500, 19501, 500))
    ax23.set_xlabel('Average Attendance (# of people)', fontsize=20)
    ax23.set_ylabel('Game Type', fontsize=20)
    ax23.tick_params(labelsize=15)
    ax23.set_title(label="Average Attendance per Game Type", fontsize=25)

    # --- ax30: average attendance per current win percentage ---
    # Rounds 'Curr Win %' to one decimal and rescales it by 100, in place.
    train[['Curr Win %']] = np.round(train[['Curr Win %']], 1) * 100
    grouped = train[['Curr Win %', 'Attendance']].groupby('Curr Win %').mean()
    order = np.arange(0, 101, 10)
    sns.barplot(x='Attendance',
                y=grouped.index,
                data=grouped,
                order=order,
                ci=None,
                orient='h',
                saturation=1,
                ax=ax30,
                palette=sns.color_palette("cubehelix", 5))
    ax30.set_xlim(16500, 19500)
    ax30.set_xticks(range(16500, 19501, 500))
    ax30.set_xlabel('Average Attendance (# of people)', fontsize=20)
    ax30.set_ylabel('Current Win %', fontsize=20)
    ax30.tick_params(labelsize=15)
    ax30.set_title(label="Average Attendance per Current Win %", fontsize=25)

    # Disabled pairplot experiment, kept for reference:
    # ax[3][2] = sns.pairplot(data = train_num)
    # ax[3][2].set_title('Pairplot of Numerical Variable Correlation', size=25)
    # ax[3][2].set_xticklabels(ax[3][2].xaxis.get_majorticklabels(), rotation=60, size = 15)
    # ax[3][2].set_yticklabels(ax[3][2].yaxis.get_majorticklabels(), rotation=0, size = 15)

    gs.tight_layout(fig)
    # plt.tight_layout()
    plt.show()

    if save:
        to_save = Path().resolve().joinpath('visualizations',
                                            'all_plots_{}.png'.format(name))
        fig.savefig(to_save, dpi=300)