def on_draw(self):
    """Redraw the canvas with a correlation plot of ``self.df``.

    The current figure is cleared and a fresh axes is created and stored in
    ``self.ax``.  When ``self.df`` is ``None`` a centred hint is written on
    the axes instead of a plot.
    """
    # NOTE(review): this block arrived with its indentation collapsed; placing
    # tight_layout()/draw() after the if/else is the most plausible reading --
    # confirm against the original file.
    plt.sca(self.ax)
    plt.clf()
    self.ax = plt.axes()

    if self.df is not None:
        plt.sca(self.ax)
        sns.corrplot(
            self.df,
            annot=False,
            sig_stars=True,
            cmap_range="full",
            diag_names=False,
            sig_corr=False,
            cmap=self.cmap,
            ax=self.ax,
            cbar=True,
        )
    else:
        hint = "Select two or more variables from list"
        self.ax.text(0.5, 0.5, hint,
                     horizontalalignment='center',
                     verticalalignment='center',
                     fontsize=16)

    plt.tight_layout()
    self.draw()
def res_matrix(mark, state, cut_off=40):
    """Plot and export per-locus signal vectors for one mark/state pair.

    Reads the tab-separated file ``"<mark> in <state>-<cut_off>.csv"`` from the
    ``tmp`` folder of the data directory, builds one row per run of
    consecutive records that share ``chromMiddle`` (each row holds the
    record's ``signalValue`` at the position of its ``EID`` in the full EID
    list and ``0.`` elsewhere), shows a correlation plot of those rows and
    writes them, transposed and tab-terminated, to the matching
    ``*_diff.csv`` file.

    Raises:
        ValueError: if the input yields no rows at all.
    """
    tmp_dir = os.path.join(get_data_dir(), "tmp")
    path = os.path.join(tmp_dir, "{0} in {1}-{2}.csv".format(mark, state, cut_off))
    DF = pd.read_csv(path, sep='\t')

    full_eids = get_full_EID_list()
    # First-occurrence position of every EID: same answer as list.index(),
    # without rescanning the list for each record.
    column_of = {}
    for position, eid in enumerate(full_eids):
        column_of.setdefault(eid, position)

    chrom = DF.chromMiddle.values
    eids = DF.EID.values
    signals = DF.signalValue.values

    rows = []
    current = [0.] * len(full_eids)
    # NOTE(review): behaviour kept from the original loop -- record 0 is never
    # stored, the record that opens a new chromMiddle run is not stored, and
    # the last run is never appended to the result.  These look unintended;
    # confirm before relying on the output.
    for i in range(1, len(chrom)):
        if chrom[i - 1] == chrom[i]:
            column = column_of.get(eids[i])
            if column is not None:  # records with an unknown EID are skipped
                current[column] = signals[i]
        else:
            rows.append(current)
            current = [0.] * len(full_eids)

    if not rows:
        raise ValueError("no complete chromMiddle group found in {0}".format(path))

    f, ax = plt.subplots(figsize=(15, 15))
    cmap = sns.diverging_palette(210, 10, as_cmap=True)
    # NOTE(review): the original carried a ".T??" remark here -- it is unclear
    # whether the matrix should be transposed before plotting.
    sns.corrplot(np.array(rows), annot=False, sig_stars=False,
                 diag_names=False, cmap=cmap, ax=ax)
    f.tight_layout()
    plt.show()

    path2 = os.path.join(tmp_dir, "{0} in {1}-{2}_diff.csv".format(mark, state, cut_off))
    with open(path2, 'w') as out:
        # One output line per EID column; every value is followed by a tab.
        for column in range(len(rows[0])):
            out.write("".join(str(row[column]) + "\t" for row in rows))
            out.write("\n")
def make_plot(X_train, y_train, X, y, test_data, model, model_name, features, response):
    """Save diagnostic figures for *model*, all file names prefixed by *model_name*.

    Three figures are written with ``plt.savefig``:

    * a 2x2 panel built around the fifth column of *X*: regression plot,
      box plot, squared-error residual plot (after fitting *model* on the
      training data) and, for ``'linear'``/``'logistic'`` models, an
      interaction plot of the fourth and fifth columns;
    * a correlation plot of *test_data*;
    * a coefficient plot of ``diagnosis ~ <features>`` on *test_data*.

    *response* is accepted for interface compatibility and is not used.
    """
    feature = X.columns
    f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharey=False)

    sns.regplot(X[feature[4]], y, test_data, ax=ax1)
    sns.boxplot(X[feature[4]], y, color="Blues_r", ax=ax2)

    model.fit(X_train, y_train)
    sns.residplot(X[feature[4]], (model.predict(X) - y) ** 2,
                  color="indianred", lowess=True, ax=ax3)

    # Compare by value: ``is`` on string literals only works by interning luck.
    if model_name == 'linear':
        sns.interactplot(X[feature[3]], X[feature[4]], y, ax=ax4, filled=True,
                         scatter_kws={"color": "dimgray"},
                         contour_kws={"alpha": .5})
    elif model_name == 'logistic':
        pal = sns.blend_palette(["#4169E1", "#DFAAEF", "#E16941"], as_cmap=True)
        levels = np.linspace(0, 1, 11)
        # Target the fourth panel explicitly instead of relying on whichever
        # axes happens to be current.
        sns.interactplot(X[feature[3]], X[feature[4]], y, levels=levels,
                         cmap=pal, logistic=True, ax=ax4)

    ax1.set_title('Regression')
    ax2.set_title(feature[4] + ' Value')
    ax3.set_title(feature[4] + ' Residuals')
    ax4.set_title('Two-value Interaction')
    f.tight_layout()
    plt.savefig(model_name + '_' + feature[4], bbox_inches='tight')

    # Multi-variable correlation significance level
    f, ax = plt.subplots(figsize=(10, 10))
    cmap = sns.blend_palette(["#00008B", "#6A5ACD", "#F0F8FF",
                              "#FFE6F8", "#C71585", "#8B0000"], as_cmap=True)
    sns.corrplot(test_data, annot=False, diag_names=False, cmap=cmap)
    ax.grid(False)
    ax.set_title('Multi-variable correlation significance level')
    plt.savefig(model_name + '_multi-variable_correlation', bbox_inches='tight')

    # Complete coefficient plot -- believed to be meaningful only for linear
    # regression.
    sns.coefplot("diagnosis ~ " + ' + '.join(features), test_data, intercept=True)
    plt.xticks(rotation='vertical')
    plt.savefig(model_name + '_coefficient_effects', bbox_inches='tight')
def make_corr_plot(d, title="plot"):
    """Draw a correlation plot of *d* on a 9x9 inch figure and save it as *title*."""
    fig, axis = plt.subplots(figsize=(9, 9))
    palette = sns.diverging_palette(220, 10, as_cmap=True)
    options = dict(annot=False, sig_stars=False, diag_names=False,
                   cmap=palette, ax=axis)
    sns.corrplot(d, **options)
    fig.tight_layout()
    # The title is added after tight_layout(), exactly as before.
    plt.title(title)
    fig.savefig(title)
def computeCorrelation(dataFrame, candidatesList, name):
    """Plot the correlation matrix of the *candidatesList* columns of *dataFrame*.

    The figure is titled ``"Correlation Matrix - <name>"`` and saved as
    ``Correlation_<first candidate>_.png``.
    """
    fig, axes = plt.subplots(figsize=(12, 12))
    subset = dataFrame[candidatesList]
    palette = sb.blend_palette(
        ["#6B229F", "#FD3232", "#F66433", "#E78520", "#FFBB39"],
        as_cmap=True,
    )
    sb.corrplot(subset, annot=False, sig_stars=False, diag_names=False,
                cmap=palette)
    axes.set_title("Correlation Matrix - " + name)
    out_file = 'Correlation_' + candidatesList[0] + '_.png'
    plt.savefig(out_file)
def corrplot(mod_dis):
    """Correlation plot of the strict upper triangles of several square matrices.

    *mod_dis* maps a label to a 2-D array; each array contributes one column
    (its entries above the diagonal) to the plotted frame.
    """
    upper_triangles = [
        matrix[np.triu_indices(matrix.shape[0], k=1)]
        for matrix in mod_dis.values()
    ]
    stacked = np.array(upper_triangles).T
    frame = pandas.DataFrame(stacked, columns=mod_dis.keys())
    sns.corrplot(frame)
def correlateRDMs(allrsasimspaces, models):
    """Plot and return the correlations between the models' similarity matrices.

    For every name in *models* the ``'simmat_across'`` entry of
    ``allrsasimspaces[name]`` is flattened into one column; the columns are
    ordered as in *models*, drawn with ``sns.corrplot`` and their
    ``DataFrame.corr()`` is returned.
    """
    flattened = np.array([
        np.array(allrsasimspaces[m]['simmat_across']).flatten()
        for m in models
    ])
    frame = pd.DataFrame(data=dict(zip(models, flattened)))
    frame = frame[models]

    f, ax = plt.subplots(figsize=[12, 12])
    sns.corrplot(frame, diag_names=False, sig_stars=False)
    return frame.corr()
def correlations(data, X):
    """Save a correlation plot of *data* under ``visuals/``.

    The file name is the column names of *X* joined by underscores plus the
    suffix ``_correlation``; the path is printed after saving.
    """
    out_path = 'visuals/' + "_".join(X.columns.tolist()) + '_correlation'

    f, ax = plt.subplots(figsize=(10, 10))
    palette = sns.blend_palette(
        ["#00008B", "#6A5ACD", "#F0F8FF", "#FFE6F8", "#C71585", "#8B0000"],
        as_cmap=True,
    )
    sns.corrplot(data, annot=False, diag_names=False, cmap=palette)
    ax.grid(False)

    plt.savefig(out_path)
    print(out_path)
    plt.close()
def seaborn_plot(df, plot_type='pairplot', columns=False):
    """Show a seaborn pair plot or correlation plot of *df*.

    *columns*, when truthy, restricts the plot to those columns.  Any
    *plot_type* other than ``'pairplot'`` or ``'corr_plot'`` draws nothing
    before the figure is shown.
    """
    sns.set()
    mpl.rc("figure", figsize=(16, 8.65))

    plotting_df = df
    if columns:
        plotting_df = df[columns]

    if plot_type == 'corr_plot':
        sns.corrplot(plotting_df)
    elif plot_type == 'pairplot':
        sns.pairplot(plotting_df)

    sns.plt.show()
def seaborn_plot(df, columns, plot_type='pairplot'):
    """Show a seaborn pair plot or correlation plot of ``df[columns]``.

    Any *plot_type* other than ``'pairplot'`` or ``'corr_plot'`` draws
    nothing before the figure is shown.
    """
    sns.set()
    mpl.rc("figure", figsize=(16, 8.65))
    selection = df[columns]

    if plot_type == 'corr_plot':
        sns.corrplot(selection)
    elif plot_type == 'pairplot':
        sns.pairplot(selection)

    sns.plt.show()
def visualize_correlations(training_data):
    """Generate a correlation matrix heat map.

    Coefficient annotations and diagonal names are drawn only when the frame
    has fewer than 30 columns.
    """
    fig, ax = plt.subplots(figsize=(16, 10))
    colormap = sb.blend_palette(sb.color_palette('coolwarm'), as_cmap=True)

    # One switch drives both options: small frames get the detailed rendering.
    detailed = len(training_data.columns) < 30
    sb.corrplot(training_data, annot=detailed, sig_stars=False,
                diag_names=detailed, cmap=colormap, ax=ax)
    fig.tight_layout()
def l_reg(input_path):
    """Save a correlation plot of the rows of the CSV at *input_path*.

    The ``gene_id`` column is dropped, the remaining values are transposed so
    that each CSV row becomes one plotted variable, and the figure (axes
    hidden) is written to ``H3K27me3_corrplot.png`` in the ``tmp`` folder of
    the data directory.
    """
    DF = pd.read_csv(input_path)
    DF.drop('gene_id', axis=1, inplace=True)

    f, ax = plt.subplots(figsize=(20, 20))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    # ``.values`` instead of ``.as_matrix()``: the latter no longer exists in
    # pandas >= 1.0, while ``.values`` works on old and new versions alike.
    sns.corrplot(DF.values.T, annot=False, sig_stars=False,
                 diag_names=False, cmap=cmap, ax=ax)
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    plt.savefig(os.path.join(get_data_dir(), "tmp", "H3K27me3_corrplot.png"))
def corrplot(ax):
    """Draw a demo correlation plot of five synthetic variables on *ax*.

    The data come from a ``RandomState`` seeded with 0, so the picture is
    reproducible.
    """
    rng = np.random.RandomState(0)
    n_obs = 60

    # The order of the draws matters for reproducibility -- keep it.
    x0, x1 = rng.randn(2, n_obs)
    covariance = [(1, -.5), (-.5, 1)]
    x2, x3 = rng.multivariate_normal([0, 0], covariance, n_obs).T
    x2 += x0 / 8
    x4 = x1 + rng.randn(n_obs) * 2

    columns = np.c_[x0, x1, x2, x3, x4]
    sns.corrplot(columns, ax=ax)
    ax.set_title("corrplot()", verticalalignment="top")
def plot_pt_corr(df):
    """Plot the Spearman correlation matrix of the parameter posteriors in *df*.

    Coefficients, significance stars and diagonal names are all drawn on a
    9x9 inch figure.
    """
    blend = ["#00008B", "#6A5ACD", "#F0F8FF", "#FFE6F8", "#C71585", "#8B0000"]
    fig, axis = P.subplots(figsize=(9, 9))
    colormap = sns.blend_palette(blend, as_cmap=True)
    sns.corrplot(df, annot=True, sig_stars=True, method='spearman',
                 diag_names=True, cmap=colormap, ax=axis)
    fig.tight_layout()
def get_cor_matrix(self, method="pearson"):
    """Compute the column correlation matrix of ``self.mat`` and save it as a PDF.

    Stores *method* in ``self.method`` and the correlation values in
    ``self.cor_mat``.  The PDF is named after ``self.infile`` with its last
    two dot-separated parts replaced by ``corMat.<method>.pdf``.
    """
    self.method = method
    stem = ".".join(self.infile.split(".")[:-2])
    out_cor_file = "%s.corMat.%s.pdf" % (stem, self.method)

    frame = pd.DataFrame(self.mat.matrix)
    frame.columns = self.mat.colname
    frame.index = self.mat.rowname
    self.cor_mat = frame.corr(self.method).values

    sns.set(style="darkgrid")
    fig, axis = plt.subplots(figsize=(9, 9))
    palette = sns.diverging_palette(220, 10, as_cmap=True)
    sns.corrplot(frame, annot=False, sig_stars=False, diag_names=False,
                 cmap=palette, ax=axis, cmap_range=(0.0, 1.0),
                 method=self.method)
    fig.savefig(out_cor_file, format="pdf")
def plot_correlations(df, include_numbers, filename):
    """Save a cross-correlation map of the input and output variables in *df*.

    *include_numbers* is forwarded as ``annot`` to ``sns.corrplot``; the
    figure is written to *filename* and then closed.
    """
    plt.figure()
    sns.set()
    sns.corrplot(df, annot=include_numbers)
    plt.savefig(filename)
    plt.close()


#=====================================================================
# EOF
#=====================================================================
def corrplot_example():
    """Bird's-eye view of two example datasets as correlation heat maps.

    The plot also runs a permutation test to obtain p-values; on a huge
    dataset that takes a while and the p-values are not relevant.
    """
    titanic = sns.load_dataset("titanic").dropna()
    attention = sns.load_dataset("attention")

    sns.set_context(rc={"figure.figsize": (8, 8)})  # figure size

    # Optional corrplot arguments worth knowing: ``sig_tail="upper"`` keeps
    # only one tail, ``cmap_range=(-.3, 0)`` restricts the colormap range.
    for dataset in (titanic, attention):
        sns.corrplot(dataset)
    plt.show()
def corrplot_example():
    """Bird's-eye view of two example datasets as correlation heat maps.

    The plot also runs a permutation test to obtain p-values; on a huge
    dataset that takes a while and the p-values are not relevant.
    """
    titanic = sns.load_dataset("titanic").dropna()
    attention = sns.load_dataset("attention")

    sns.set_context(rc={"figure.figsize": (8, 8)})  # figure size

    # Optional corrplot arguments worth knowing: ``sig_tail="upper"`` keeps
    # only one tail, ``cmap_range=(-.3, 0)`` restricts the colormap range.
    for dataset in (titanic, attention):
        sns.corrplot(dataset)
    plt.show()
def plot_corr_matrix(X):
    """Plot the correlation matrix of the radar features and the output variable.

    Columns 2 onward of the 2-D array *X* are copied into a DataFrame
    labelled with the 22 names below, drawn with ``sns.corrplot`` and shown.
    """
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt

    sns.set(style="white")

    labels = ["radar dist",
              "Ref", "Ref 5x5 10th", "Ref 5x5 50th", "Ref 5x5 90th",
              "RefComposite", "RefComposite 5x5 10th",
              "RefComposite 5x5 50th", "RefComposite 5x5 90th",
              "Rho_HV", "Rho_HV 5x5 10th", "Rho_HV 5x5 50th", "Rho_HV 5x5 90th",
              "Zdr", "Zdr 5x5 10th", "Zdr 5x5 50th", "Zdr 5x5 90th",
              "Kdp", "Kdp 5x5 10th", "Kdp 5x5 50th", "Kdp 5x5 90th",
              "Expected"]
    d = pd.DataFrame(data=X[:, 2:].copy(), columns=labels)

    # corrplot computes the correlations itself; the separate ``d.corr()``
    # call whose result was thrown away has been removed.
    f, ax = plt.subplots(figsize=(11, 9))
    sns.corrplot(d)
    ax.set_title('Correlation Matrix for Radar Features and Output Variable')
    ax.set_xlabel('Features (along diagonal)')
    ax.set_ylabel('Correlation Values (upper triangle)')
    f.tight_layout()
    plt.show()
def preplot(df):
    """Show a pair plot of five columns of *df*, then a correlation plot of all of it.

    The pair plot uses only the first 50 rows and is coloured by ``'class'``.
    """
    pair_columns = [8, 11, 12, 14, 19]
    sns.pairplot(df[:50], vars=pair_columns, hue='class', size=1.5)
    plt.show()

    plt.figure(figsize=(12, 10))
    sns.corrplot(df, annot=False)
    plt.show()
def build_corrmatrix_dashboard(train_pre):
    """Save the correlation matrix of *train_pre* to ``figures/corr_matrix.png``."""
    # corrplot hands back the Axes it drew on.  An Axes has no savefig(), so
    # the file is written through the owning figure -- and the result is no
    # longer bound to the name ``plt``, which hid the pyplot module.
    ax = sns.corrplot(train_pre, annot=False)
    print('Saving correlation matrix in figures/.')
    ax.figure.savefig("figures/corr_matrix.png")
def build_corrmatrix_dashboard(train_pre):
    """Save the correlation matrix of *train_pre* to ``figures/corr_matrix.png``."""
    # corrplot hands back the Axes it drew on.  An Axes has no savefig(), so
    # the file is written through the owning figure -- and the result is no
    # longer bound to the name ``plt``, which hid the pyplot module.
    ax = sns.corrplot(train_pre, annot=False)
    print('Saving correlation matrix in figures/.')
    ax.figure.savefig("figures/corr_matrix.png")
def attribute_correlations(df, img_file='attr_correlations.png'):
    """Save a pairwise attribute correlation plot of *df* to *img_file*.

    With corrplot's defaults the attribute names sit on the diagonal,
    colour-coded correlations fill the lower triangle, and values with
    significance marks fill the upper triangle, next to a colour bar.  For
    frames with many attributes the annotations can be switched off with
    ``annot=False, sig_stars=False, diag_names=False``.
    """
    logging.debug('Plotting attribute pairwise correlations')

    # Red-blue diverging colormap with a white centre.
    palette = sns.diverging_palette(250, 10, n=3, as_cmap=True)

    # Explicit size in inches controls the relative font size.
    figure, axis = plt.subplots(figsize=(10, 10))
    sns.corrplot(df, ax=axis, cmap=palette)

    figure.tight_layout()  # trims the white borders
    figure.savefig(img_file)
    plt.close(figure)
def display_corr_matrix():
    """Show a correlation matrix heat map of the module-level ``DF``.

    Intended for spotting collinearity between columns.
    """
    # DF is only read here, so no ``global`` declaration is needed.
    sns.set(color_codes=True)
    fig, axis = plt.subplots(figsize=(9, 9))
    palette = sns.blend_palette(
        ["#00008B", "#6A5ACD", "#F0F8FF", "#FFE6F8", "#C71585", "#8B0000"],
        as_cmap=True,
    )
    sns.corrplot(DF, annot=False, sig_stars=False, diag_names=False,
                 cmap=palette, ax=axis)
    sns.plt.title('Figure 1: Correlation Matrix Heatmap')
    fig.tight_layout()
    sns.despine()
    sns.plt.show()
def _save_network_barplot(df, column, xlabel, filename, ascending=False, **axis_settings):
    """Save a horizontal bar plot of *column* per social network, split by user type."""
    df.sort_values(column, ascending=ascending, inplace=True)
    g = sns.barplot(y='socialNetwork', x=column, hue='userType', data=df)
    g.set(xlabel=xlabel, ylabel='', **axis_settings)
    plt.subplots_adjust(left=.17)
    sns.despine(left=True, bottom=True)
    plt.savefig(filename)
    plt.close()


def plotSocialMedia(df):
    """Save the social-network overview figures for *df*.

    Columns are converted to numeric where possible, ``df.info()`` is
    printed, then five PNG files are written to the working directory: a
    correlation plot and one bar plot each for average session duration,
    bounce rate, average time on page and goal completions.
    """
    df = df.apply(pd.to_numeric, errors='ignore')
    print(df.info())

    fig, ax = plt.subplots(figsize=(10, 10))
    sns.corrplot(df, ax=ax)
    # ``tight_layout`` is a figure method, not a savefig() option; apply it
    # to the figure before saving.
    fig.tight_layout()
    fig.savefig('socialNetworkCorrelations.png')
    plt.close()

    # mean session duration
    _save_network_barplot(df, 'avgSessionDuration', 'Average Session Duration',
                          'socialNetworkAvgDuration.png')
    # bounce rate (the only plot sorted ascending)
    _save_network_barplot(df, 'bounceRate', 'Bounce Rate',
                          'socialNetworkBounceRate.png', ascending=True)
    # average time on page
    _save_network_barplot(df, 'avgTimeOnPage', 'Average Time on Page',
                          'socialNetworkavgTimeOnPage.png')
    # goals completed, on a logarithmic x axis
    _save_network_barplot(df, 'goalCompletionsAll', 'Goal Completions',
                          'socialNetworkgoalCompletionsAll.png', xscale='log')
def get_feature_corr(df_k, keepers, scale_it=True):
    """Return a figure with the correlation plot of the training features.

    *df_k* is converted to a float array, passed through ``scale`` when
    *scale_it* is true, and labelled with the column names in *keepers*.
    """
    values = np.array(df_k, dtype=float)
    if scale_it:
        values = scale(values)

    features = pd.DataFrame(values)
    features.columns = keepers

    sns.set(style="darkgrid")
    figure, axis = plt.subplots(figsize=(15, 15))
    palette = sns.diverging_palette(220, 10, as_cmap=True)
    sns.corrplot(features, annot=True, sig_stars=True, diag_names=False,
                 cmap=palette, ax=axis)
    axis.set_title('Correlation between training features')
    figure.tight_layout()
    return figure
def fig_correlations(data, aly_title, fig_save=True):
    """Plot correlations.

    Parameters
    ----------
    data : pd.DataFrame
        Data whose correlations are drawn.
    aly_title : str
        Figure title; also passed to ``_save_fig`` when saving.
    fig_save : bool, optional
        False if the figure should not be saved.
    """
    folders = file_folder_specs()

    plt.figure()
    sns.corrplot(data, diag_names=False)
    plt.title(aly_title)

    if fig_save:
        _save_fig(aly_title, folders['fig'])

    plt.show()
    plt.close()
def tech_summary():
    """Draw comparison plots for four tech stocks and return their daily returns.

    Adjusted closing prices of AAPL, GOOG, MSFT and AMZN are fetched from
    Yahoo between the module-level ``start`` and ``end`` and turned into
    percentage changes, which are returned after plotting.
    """
    tickers = ['AAPL', 'GOOG', 'MSFT', 'AMZN']
    closing_df = DataReader(tickers, 'yahoo', start, end)['Adj Close']
    tech_rets = closing_df.pct_change()

    # Reference picture of correlation examples:
    # http://upload.wikimedia.org/wikipedia/commons/d/d4/Correlation_examples2.svg

    # A stock against itself, then against another stock.
    sns.jointplot('GOOG', 'GOOG', tech_rets, kind='scatter', color='seagreen')
    sns.jointplot('GOOG', 'MSFT', tech_rets, kind='scatter')

    sns.pairplot(tech_rets.dropna())

    # Same grid layout twice: daily returns first, closing prices second.
    for frame in (tech_rets.dropna(), closing_df):
        returns_fig = sns.PairGrid(frame)
        # upper triangle: scatter plots
        returns_fig.map_upper(plt.scatter, color='purple')
        # lower triangle: kernel density estimates with the 'cool_d' colormap
        returns_fig.map_lower(sns.kdeplot, cmap='cool_d')
        # diagonal: histograms
        returns_fig.map_diag(plt.hist, bins=30)

    # Quick correlation plot of the daily returns.
    sns.corrplot(tech_rets.dropna(), annot=True)
    return tech_rets
def get_cor_matrix(self, method="pearson"):
    """Compute the column correlation matrix of ``self.mat`` and save it as a PDF.

    Stores *method* in ``self.method`` and the correlation values in
    ``self.cor_mat``.  The PDF is named after ``self.infile`` with its last
    two dot-separated parts replaced by ``corMat.<method>.pdf``.
    """
    self.method = method
    stem = ".".join(self.infile.split(".")[:-2])
    out_cor_file = "%s.corMat.%s.pdf" % (stem, self.method)

    frame = pd.DataFrame(self.mat.matrix)
    frame.columns = self.mat.colname
    frame.index = self.mat.rowname
    self.cor_mat = frame.corr(self.method).values

    sns.set(style="darkgrid")
    fig, axis = plt.subplots(figsize=(9, 9))
    palette = sns.diverging_palette(220, 10, as_cmap=True)
    sns.corrplot(frame, annot=False, sig_stars=False, diag_names=False,
                 cmap=palette, ax=axis, cmap_range=(0.0, 1.0),
                 method=self.method)
    fig.savefig(out_cor_file, format="pdf")
def explorer(data, name, hue=None, trel=True, corr=True):
    """Draw and save a Trellis plot and a correlation map of *data*.

    The Trellis plot has scatter plots in the upper triangle, kernel
    densities in the lower triangle and histograms on the diagonal.  Both
    plots can take a long time on big data.

    Args:
        data: DataFrame with the input data.
        name: str. Name of the output figure file; the Trellis plot is saved
            as ``'trel_' + name`` and the correlation map as
            ``'corr_' + name``.  A name ending in ``.pdf`` selects the PDF
            backend.
        hue: str, optional. Name of the variable used as hue.
        trel: bool, optional. Draw the Trellis plot.
        corr: bool, optional. Draw the correlation map.

    Returns:
        None. The figures are only written to disk.
    """
    if name.endswith('.pdf'):
        mpl.use('PDF')
    # Imported here so that the backend choice above is made first.
    import matplotlib.pyplot as plt

    if trel:
        print('Plotting Trellis plots.')
        g = sns.PairGrid(data, hue=hue)
        g.map_lower(sns.kdeplot, cmap="Purples", shade=True)
        g.map_diag(plt.hist)
        g.map_upper(plt.scatter, s=10, alpha=.05)
        g.savefig('trel_' + name, dpi=300)
        plt.close()

    if corr:
        print('Plotting correlation map.')
        plt.figure()
        ax = sns.corrplot(data)
        ax.figure.savefig('corr_' + name, dpi=300)
        plt.close()
def explorer(data, name, hue=None, trel=True, corr=True):
    """Draw and save a Trellis plot and a correlation map of *data*.

    The Trellis plot has scatter plots in the upper triangle, kernel
    densities in the lower triangle and histograms on the diagonal.  Both
    plots can take a long time on big data.

    Args:
        data: DataFrame with the input data.
        name: str. Name of the output figure file; the Trellis plot is saved
            as ``'trel_' + name`` and the correlation map as
            ``'corr_' + name``.  A name ending in ``.pdf`` selects the PDF
            backend.
        hue: str, optional. Name of the variable used as hue.
        trel: bool, optional. Draw the Trellis plot.
        corr: bool, optional. Draw the correlation map.

    Returns:
        None. The figures are only written to disk.
    """
    if name.endswith('.pdf'):
        mpl.use('PDF')
    # Imported here so that the backend choice above is made first.
    import matplotlib.pyplot as plt

    if trel:
        print('Plotting Trellis plots.')
        g = sns.PairGrid(data, hue=hue)
        g.map_lower(sns.kdeplot, cmap="Purples", shade=True)
        g.map_diag(plt.hist)
        g.map_upper(plt.scatter, s=10, alpha=.05)
        g.savefig('trel_' + name, dpi=300)
        plt.close()

    if corr:
        print('Plotting correlation map.')
        plt.figure()
        ax = sns.corrplot(data)
        ax.figure.savefig('corr_' + name, dpi=300)
        plt.close()
def plotpair(df):
    """Draw an un-annotated correlation plot of *df*.

    An earlier pair plot over 'codeFragNum', 'liNum', 'popTagsNum',
    'bodyLength' and 'titleLength' (first 500 rows, hue='class', size=1.5)
    is disabled.
    """
    sns.corrplot(df, annot=False)
print rawdata.cov() print rawdata[['Age', 'Ca']].corr() pd.DataFrame.corr(rawdata) plt.show() # define colors list, to be used to plot survived either red (=0) or green (=1) colors = ['red', 'green'] # make a scatter plot # rawdata.info() from scipy import stats import seaborn as sns # just a conventional alias, don't know why sns.corrplot(rawdata) # compute and plot the pair-wise correlations # save to file, remove the big white borders #plt.savefig('attribute_correlations.png', tight_layout=True) plt.show() attr = rawdata['Age'] sns.distplot(attr) plt.show() sns.distplot(attr, kde=False, fit=stats.gamma) plt.show() # Two subplots, the axes array is 1-d plt.figure(1) plt.title('Histogram of Age') plt.subplot(211) # 21,1 means first one of 2 rows, 1 col
def make_correlation_plot(df):
    """Draw an annotated correlation plot of *df* on a 12x12 inch figure."""
    figure, axis = plt.subplots(figsize=(12, 12))
    options = dict(annot=True, sig_stars=False, diag_names=False, ax=axis)
    sns.corrplot(df, **options)
sns.interactplot(x1, x2, y, colorbar=False, ax=ax) # Correlation matrix # ------------------ ax = plt.subplot(gs[4:, 0]) rs = np.random.RandomState(0) x0, x1 = rs.randn(2, 60) x2, x3 = rs.multivariate_normal([0, 0], [(1, -.5), (-.5, 1)], 60).T x2 += x0 / 8 x4 = x1 + rs.randn(60) * 2 data = np.c_[x0, x1, x2, x3, x4] sns.corrplot(data, ax=ax) ax.set_title("corrplot()", verticalalignment="top") # Beta distributions # ------------------ sns.set(style="nogrid") ax = plt.subplot(gs[4, 1]) plt.title("distplot()") plt.xlim(0, 1) ax.set_xticklabels([]) g, _, p = sns.color_palette("Set2", 3, desat=.75) n = 1000
sns.pairplot(tech_rets.dropna()) plt.show() returns_fig = sns.PairGrid(tech_rets.dropna()) returns_fig.map_upper(plt.scatter, color='purple') returns_fig.map_lower(sns.kdeplot, cmap='cool_d') returns_fig.map_diag(plt.hist, bins=30) plt.show() returns_fig = sns.PairGrid(closing_df) returns_fig.map_upper(plt.scatter, color='purple') returns_fig.map_lower(sns.kdeplot, cmap='cool_d') returns_fig.map_diag(plt.hist, bins=30) plt.show() sns.corrplot(tech_rets.dropna(), annot=True) plt.show() sns.corrplot(closing_df, annot=True) plt.show() ''' analyze the risk of a stock ''' rets = tech_rets.dropna() area = np.pi * 20 plt.scatter(rets.mean(), rets.std(), s = area) plt.xlabel('Expected Return') plt.ylabel('Risk') for label, x, y in zip(rets.columns, rets.mean(), rets.std()): plt.annotate( label,
dataset = load_boston() df = pd.DataFrame(dataset.data, columns=dataset.feature_names) df['target'] = dataset.target # correlation corr = df.corr(method='pearson') corr.sort_values(by = 'target', inplace = True) # find pairs with high correlations import seaborn as sns # just a conventional alias, don't know why fig, ax = plt.subplots(figsize=(10, 10)) sns.corrplot(df, ax = ax) fig, ax = plt.subplots(figsize=(10, 10)) sns.distplot(attr) import matplotlib.pyplot as plt attr = df.target plt.hist(attr) plt.scatter(df.target, df['LSTAT']) sns.jointplot(df.target, df['LSTAT'], kind='scatter') sns.jointplot(df.target, df['LSTAT'], kind='hex') ### explore some diagnostic plots: QQ
# In[16]: sns.pairplot(tech_rets.dropna()) # In[17]: returns_fig = sns.PairGrid(closing_df) returns_fig.map_upper(plt.scatter,color='purple') # the lower triangle in the figure, inclufing the plot type (kde) or the color map (BluePurple) returns_fig.map_lower(sns.kdeplot,cmap='cool_d') # a series of histogram plots of the closing price returns_fig.map_diag(plt.hist,bins=30) # In[18]: sns.corrplot(tech_rets.dropna(),annot=True) # In[ ]:
i+=1 #Dem most anticorrelated sns_plot = sns.jointplot('Hillary Clinton','Bernie Sanders',pr_piv,kind='scatter') sns_plot.savefig(OutputFolder+'HillaryClinton_BernieSanders_joinplot.png') #Primary results assume a choice between Democrats candidates only or #Republican candidates only #So comparing Democrats to Republicans based on these results #does not have a lot of sense #However let's look on the picture as a whole heatmap(rvalue,'rvalue.png') #seabron for a quick correlation plot which is pandas.DataFrame.corr('pearson') f, ax = plt.subplots(figsize=(15, 15)) sns_plot = sns.corrplot(pr_piv,annot=True, ax=ax) plt.savefig(OutputFolder+'corrplot.png') #Let's look now how high is the possibility of the correlation #between democrat and republican candidates #we can not trust such results heatmap(pvalue,'pvalue.png') #You can take a look at the StdError of the correlation as well #heatmap(stderr,'stderr.png') #Hillary Clinton to Republican sns_plot = sns.jointplot('Hillary Clinton','Donald Trump',pr_piv,kind='scatter') sns_plot.savefig(OutputFolder+'HillaryClinton_DonaldTrump_joinplot.png')
def extra_viz(loansData):
    """Save a correlation plot of *loansData* to ``../figs/loan_corr_matrix.png``."""
    figure, axis = plt.subplots(figsize=(10, 10))
    sns.corrplot(loansData, ax=axis)
    out_file = '../figs/loan_corr_matrix.png'
    plt.savefig(out_file)
##Try Correlation and Corrplot to see what features popout more than others: df.corr(method='pearson') pearson = df.corr(method='pearson') #print pearson # assume target attr is the last, then remove corr with itself corr_with_target = pearson.ix[-7][:-1] #print pearson.ix print corr_with_target # correlations by the absolute value: corr_with_target[abs(corr_with_target).argsort()[::-1]] # Set up the matplotlib figure f, ax = plt.subplots(figsize=(20, 15)) sns.corrplot(df) # compute and plot the pair-wise correlations # save to file, remove the big white borders plt.savefig('attribute_correlations.png', tight_layout=True) ##Use scikit-learn's SelectKBest feature selection: def get_k_best(enron_data, features_list, k): """ runs scikit-learn's SelectKBest feature selection returns dict where keys=features, values=scores """ data = featureFormat(enron_data, features_list) labels, features = targetFeatureSplit(data) k_best = SelectKBest(k=k) k_best.fit(features, labels) scores = k_best.scores_
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# NOTE(review): pandas.io.data is not shipped with current pandas releases
# (the readers moved to the separate pandas-datareader package); this script
# needs a pandas version that still provides it.
import pandas.io.data as pdweb
import seaborn as sns
from numpy.random import randn
from pandas import DataFrame, Series

# Small demo frame and its summary statistics.
array = np.array([[1, 3, 4, 4], [1, 4, 5, 5]])
dframe1 = DataFrame(array, index=list('AB'), columns=list('abcd'))
print(dframe1)
print(dframe1.describe())

# Download the oil-company quotes once and take both fields from the same
# result instead of issuing two identical requests.
oil_quotes = pdweb.get_data_yahoo(['CVX', 'XOM', 'BP'],
                                  start=datetime.datetime(2013, 1, 1),
                                  end=datetime.datetime(2016, 1, 1))
prices_oils = oil_quotes['Adj Close']
print(prices_oils.head())
prices_volume = oil_quotes['Volume']
prices_volume.head()

rets = prices_oils.pct_change()
# Call the method: the original bound ``rets.corr`` itself, not its result.
corr = rets.corr()

prices_oils.plot()
sns.corrplot(rets, annot=False, diag_names=False)
# We can simply call pairplot on our DataFrame for an automatic visual analysis of all the comparisons sns.pairplot(tech_rets.dropna()) # In[38]: returns_fig = sns.PairGrid(tech_rets.dropna()) returns_fig.map_upper(plt.scatter,color='purple') returns_fig.map_lower(sns.kdeplot,cmap='cool_d') returns_fig.map_diag(plt.hist,bins=30) # In[42]: sns.corrplot(tech_rets.dropna(),annot=True) # In[43]: sns.corrplot(closing_df,annot=True) # ## Risk Analysis # In[48]: # Let's start by defining a new DataFrame as a clenaed version of the oriignal tech_rets DataFrame rets = tech_rets.dropna() area = np.pi*20 plt.scatter(rets.mean(), rets.std(),alpha = 0.5,s =area)
returns_fig = sns.PairGrid(tech_rets.dropna()) returns_fig.map_upper(plt.scatter, color='purple') returns_fig.map_lower(sns.kdeplot, cmap='cool_d') returns_fig.map_diag(plt.hist, bins=30) # In[31]: returns_fig = sns.PairGrid(closing_df) returns_fig.map_upper(plt.scatter, color='purple') returns_fig.map_lower(sns.kdeplot, cmap='cool_d') returns_fig.map_diag(plt.hist, bins=30) # In[32]: sns.corrplot(tech_rets.dropna(), annot=True) # In[ ]: #risk analysis # In[36]: # Let's start by defining a new DataFrame as a clenaed version of the oriignal tech_rets DataFrame rets = tech_rets.dropna() area = np.pi * 20 plt.scatter(rets.mean(), rets.std(), alpha=0.5, s=area) # Set the x and y limits of the plot (optional, remove this if you don't see anything in your plot)
''' FacetGrid. ''' # FacetGrid is used to draw plots with multiple Axes where each Axes shows the same relationship conditioned on different levels of some variable myimg = sns.FacetGrid(dframe,hue = 'Stories', col = 'zone', row = 'homebr') # set the grid myimg = myimg.map(sns.pointplot, 'pricePerSqft') # set the plot type myimg = sns.FacetGrid(dframe,hue = 'Stories', row = 'zone', aspect = 4) # set the grid myimg = myimg.map(sns.kdeplot, 'homeprice', shade = True).add_legend().set_axis_labels("Home Prices") # set the plot type ''' Correlation Visualization ''' # aka correlation plots sns.pairplot(dlyReturns_df) dlyReturns_fig = sns.PairGrid(dlyReturns_df, size = 5, aspect = 2) dlyReturns_fig.map_upper(plt.scatter, color = 'darkblue') dlyReturns_fig.map_lower(sns.kdeplot, cmap = 'cool_d') dlyReturns_fig.map_diag(plt.hist, bins = 30) sns.corrplot(dlyReturns_df, annot = True) ############################################################### ### ### ### ### ### Importing stock prices ### ### ### ### ### ############################################################### import pandas.io.data as pdweb import datetime from pandas.io.data import DataReader from datetime import datetime from __future__ import division # dont have to worry about division complications with python 2.7
groupby('Stock').resample('M', how='sum') #convert back to dataframe AEV_vol_comb = AEV_vol_comb.reset_index().reindex(columns=['Date','Volume']) AEV_vol_comb['Month'] = AEV_vol_comb['Date'].dt.month#create col for Month sns.barplot("Month", y="Volume",data=AEV_vol_comb, palette="BuGn_d") #loop through different stocks to compare each other using seaborn sns.pairplot(myport_rets.dropna()) #correlation bet. closing prices of all stock tickers returns_fig = sns.PairGrid(myport_Close.dropna()) returns_fig.map_upper(plt.scatter,color='purple') returns_fig.map_lower(sns.kdeplot,cmap='cool_d') returns_fig.map_diag(plt.hist,bins=30) #correlation plot bet. daily returns of all stock tickers sns.corrplot(myport_rets.dropna(),annot=True) #sns.jointplot(myport_rets['URC'],myport_rets['CEB'])#joint plot of both datasets #sns.jointplot(myport_rets['URC'],myport_rets['CEB'],kind='hex')#plot using hex #sns.jointplot(myport_rets['URC'],myport_rets['JGS'])#joint plot of both datasets #sns.jointplot(myport_rets['URC'],myport_rets['JGS'],kind='hex')#plot using hex #sns.jointplot(myport_rets['JGS'],myport_rets['CEB'])#joint plot of both datasets #sns.jointplot(myport_rets['JGS'],myport_rets['CEB'],kind='hex')#plot using hex #correlation plot bet. closing prices of all stock tickers sns.corrplot(myport_Close.dropna(),annot=True) ####################################### # RISK ANALYSIS # (A) There are many ways we can quantify risk, one of the most basic ways # using the info. we've gathered on daily percentage returns is by # comparing the expected return with the standard deviation of the
data[col] = data[col].astype('float') # plot the distribution of the predictors using histograms for col in columns: fig, ax = plt.subplots() data[col].hist() plt.title(col + 'distribution') plt.savefig(col) # plot the correlation matrix of the dataset to see if some predictors are more correlated to the response seaborn.corrplot(data, sig_stars=True, annot=False, sig_tail='both', sig_corr=False, cmap=None, cmap_range=None, cbar=True, diag_names=True, method='spearman', ax=None) plt.savefig('correlation_spearman_matrix.png') # separate the response from the features y = data['loan_status'] data.drop('loan_status', inplace=True, axis=1) def Logistic_Regression(X, y, fold): # shuffle and split training and test sets gini = 0
# -*- coding: UTF-8 -*-
# numpy: scientific computing toolbox
import numpy as np
# Build 1000 samples with 20 features each using make_classification
from sklearn.datasets import make_classification
X, y = make_classification(1000, n_features=20, n_informative=2,
                           n_redundant=2, n_classes=2, random_state=0)

# Store the samples as a DataFrame
from pandas import DataFrame
# list(range(...)) so that the concatenation also works on Python 3, where
# range() is no longer a list.
df = DataFrame(np.hstack((X, y[:, None])),
               columns=list(range(20)) + ["class"])
print(df[:6])

import matplotlib.pyplot as plt
import seaborn as sns
# Use pairplot to look at how the data are distributed for pairs of features
_ = sns.pairplot(df[:50], vars=[8, 11, 12, 14, 19], hue="class", size=1.5)
plt.show()

import matplotlib.pyplot as plt
plt.figure(figsize=(12, 10))
_ = sns.corrplot(df, annot=False)
plt.show()
def inspect_correlations(ModelTrains, filedir='data/plots', FIGWIDTH=FIGWIDTH,
                         FIGHEIGHT=FIGHEIGHT):
    '''Produce Correlation Matrices with Nonmissing Trainer Objects

    For every trainer in ``ModelTrains.trainers`` a correlation plot of
    ``trainer.now`` is written to
    ``<filedir>/<trainer.name>/correlations/matrix_<name>_<id>.png``.
    When ``trainer.now`` contains no null values a second plot is written to
    ``<filedir>/<trainer.name>/corrplot_<name>_<id>.png``.

    Parameters
    ----------
    ModelTrains : object exposing a ``trainers`` iterable; each trainer is
        read for ``name``, ``id``, ``shape`` and ``now``.
    filedir : root directory for the generated images.
    FIGWIDTH, FIGHEIGHT : figure size passed to ``plt.subplots``.

    Returns
    -------
    None
    '''
    import warnings

    plt.close('all')
    for trainer in ModelTrains.trainers:
        save_this_directory = '{}/{}'.format(filedir, trainer.name)
        save_this_here = save_this_directory + '/correlations'
        # Creates filedir and the per-trainer directory on the way.
        _ensure_directory(save_this_here)

        # Best effort: a trainer whose data cannot be plotted is reported and
        # skipped instead of aborting the whole run.
        try:
            plt.close('all')
            fig, axs = plt.subplots(figsize=(FIGWIDTH, FIGHEIGHT))
            sns.corrplot(trainer.now, annot=False, diag_names=False)
            doc = save_this_here + '/matrix_{}_{}.png'.format(trainer.name,
                                                              trainer.id)
            t = '\n{}\n(Anti)Correlation Matrix\n{}'.format(trainer.name,
                                                            trainer.shape)
            plt.title(t, fontsize=12)
            plt.tight_layout()
            fig.savefig(doc)
        except Exception as err:
            warnings.warn('correlation matrix for {} not written: {}'.format(
                trainer.name, err))
        finally:
            plt.close('all')

        if trainer.now.isnull().sum().sum() == 0:
            plt.close('all')
            # NOTE(review): this figure is square (FIGWIDTH for both sides)
            # and ignores FIGHEIGHT -- confirm that is intentional.
            fig, axs = plt.subplots(figsize=(FIGWIDTH, FIGWIDTH))
            sns.corrplot(trainer.now, annot=False, diag_names=False)
            t = "\n{}\nNon-Missing Only Correlation Matrix,\n{}".format(
                trainer.name, trainer.shape)
            doc = '{}/corrplot_{}_{}.png'.format(save_this_directory,
                                                 trainer.name, trainer.id)
            plt.title(t)
            plt.tight_layout()
            fig.savefig(doc)
            plt.close('all')
    plt.close('all')


def _ensure_directory(path):
    '''Create ``path`` and any missing parents; warn instead of raising.'''
    import warnings

    if os.path.isdir(path):
        return
    try:
        os.makedirs(path)
    except OSError as err:
        # The directory may have been created concurrently; only report the
        # failure when it still does not exist.
        if not os.path.isdir(path):
            warnings.warn('could not create {}: {}'.format(path, err))
import numpy
import pandas
import seaborn
from matplotlib import pyplot

# Load the data and set the annual hurricane counts aside so that only the
# remaining columns enter the correlation plot.
df = pandas.read_csv("df.csv")
counts = df.pop('hur.count')
df = df.rename(columns=lambda label: label.replace(".data", ""))

seaborn.set(style="darkgrid")
rs = numpy.random.RandomState(33)

# Correlation plot of the remaining columns, written to corr.png.
cmap = seaborn.diverging_palette(220, 10, as_cmap=True)
f, ax = pyplot.subplots(figsize=(9, 9))
corrplot_options = dict(annot=False, sig_stars=False, diag_names=False,
                        cmap=cmap, ax=ax)
seaborn.corrplot(df, **corrplot_options)
f.tight_layout()
f.savefig("corr.png")
pyplot.close(f)

# Histogram of the counts on a fresh figure, written to histogram.png.
histogram_figure = pyplot.figure(figsize=(9, 9))
histogram_axes = histogram_figure.gca()
seaborn.distplot(counts, ax=histogram_axes)
histogram_axes.set_xlim([0, 20])
histogram_axes.set_xlabel("Hurricanes")
histogram_axes.set_title("Histogram of Annual Hurricane Counts")
histogram_figure.savefig("histogram.png")
def visualize(training_data, X, y, pca):
    """
    Computes statistics describing the data and creates some visualizations
    that attempt to highlight the underlying structure.

    Three groups of figures are produced:

    1. Histograms of every column, 16 per figure on a 4x4 grid. The first
       panel shows ``y`` and panel ``k`` (k >= 1) shows ``X[:, k - 1]``;
       titles come from ``training_data.columns``.
       (Presumably column 0 of ``training_data`` is the label -- verify
       against the caller.)
    2. A correlation plot of ``training_data``.
    3. If ``pca`` is not None, scatter plots of consecutive principal
       component pairs (1-2, 2-3, 3-4) of ``pca.transform(X)``, one colour
       per class.

    Parameters: ``training_data`` is a frame with a ``columns`` attribute,
    ``X`` a 2-D array indexed as ``X[:, k]``, ``y`` the label array, and
    ``pca`` either None or an object with a ``transform`` method.

    Returns None; the figures are left open for the active backend.

    Note: Use '%matplotlib inline' and '%matplotlib qt' at the IPython
    console to switch between display modes.
    """
    print('Generating individual feature histograms...')
    grid_side = 4
    per_figure = grid_side * grid_side
    num_features = len(training_data.columns)
    # Ceiling division with integers only, so the result is a valid range()
    # argument on Python 3 as well as Python 2.
    num_plots = (num_features + per_figure - 1) // per_figure

    for i in range(num_plots):
        fig, ax = plt.subplots(grid_side, grid_side, figsize=(20, 10))
        for j in range(per_figure):
            index = (i * per_figure) + j
            if index >= num_features:
                # Remaining panels of the last figure stay empty.
                break
            row, col = divmod(j, grid_side)
            values = y if index == 0 else X[:, index - 1]
            panel = ax[row, col]
            panel.hist(values, bins=30)
            panel.set_title(training_data.columns[index])
            panel.set_xlim((min(values), max(values)))
        fig.tight_layout()

    print('Generating correlation matrix...')
    fig2, ax2 = plt.subplots(figsize=(16, 10))
    colormap = sb.blend_palette(
        ["#00008B", "#6A5ACD", "#F0F8FF", "#FFE6F8", "#C71585", "#8B0000"],
        as_cmap=True)
    sb.corrplot(training_data, annot=False, sig_stars=False,
                diag_names=False, cmap=colormap, ax=ax2)
    fig2.tight_layout()

    if pca is not None:
        print('Generating principal component plots...')
        X = pca.transform(X)
        # Number of distinct non-zero labels; classes are then looked up as
        # 1..class_count, so a label of 0 is never plotted.
        class_count = np.count_nonzero(np.unique(y))
        component_pairs = [
            (0, 1, 'First & Second Principal Components'),
            (1, 2, 'Second & Third Principal Components'),
            (2, 3, 'Third & Fourth Principal Components'),
        ]
        for first, second, title in component_pairs:
            if second >= X.shape[1]:
                # The transform produced fewer components than this pair
                # needs; stop instead of failing on an out-of-range column.
                break
            _scatter_components(X, y, class_count, first, second, title)


def _scatter_components(X, y, class_count, first, second, title):
    """Scatter component `first` against `second`, one colour per class."""
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
    fig, ax = plt.subplots(figsize=(16, 10))
    for i in range(class_count):
        class_idx = i + 1  # add 1 if class labels start at 1 instead of 0
        members = y == class_idx
        # Cycle through the palette so more than len(colors) classes do not
        # raise an IndexError.
        ax.scatter(X[members, first], X[members, second],
                   c=colors[i % len(colors)], label=class_idx)
    ax.set_title(title)
    ax.legend()
    fig.tight_layout()
# -*- coding: utf-8 -*-
"""
Correlation plot of 30 random normal variables (100 draws each).

Created on Fri Jan 9 12:43:14 2015

@author: davekensinger
"""
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

N_DRAWS = 100
N_VARIABLES = 30

sns.set(style="darkgrid")

# Fixed seed so the sampled matrix is the same on every run.
rs = np.random.RandomState(33)
d = rs.normal(size=(N_DRAWS, N_VARIABLES))

cmap = sns.diverging_palette(220, 10, as_cmap=True)
f, ax = plt.subplots(figsize=(9, 9))
corrplot_options = dict(annot=False, sig_stars=False, diag_names=False,
                        cmap=cmap, ax=ax)
sns.corrplot(d, **corrplot_options)
f.tight_layout()
# Two-column frame built from the x and y sequences defined earlier.
df = pd.DataFrame(np.transpose([x, y]), columns=["X", "Y"])
sns.regplot("X", 'Y', df)
sns.regplot("X", 'Y', df, ci=None, color='slategray')


def r2(x, y):
    """Return the squared Pearson correlation coefficient of x and y."""
    # The SciPy function is `pearsonr` (there is no `stats.pearson`); it
    # returns (coefficient, p-value), so element 0 is the coefficient.
    # Assumes `stats` is scipy.stats -- TODO confirm against the imports.
    return stats.pearsonr(x, y)[0] ** 2


sns.regplot('X', 'Y', df, corr_func=r2, func_name='$R^2$', color='seagreen')

# NOTE(review): this URL points into the seaborn source tree and may no
# longer resolve -- confirm before relying on it.
tips = pd.read_csv("https://raw.github.com/mwaskom/seaborn/master/examples/tips.csv")
# Derive boolean columns so they take part in the plots below.
tips["big_tip"] = tips.tip > (.2 * tips.total_bill)
tips["smoker"] = tips["smoker"] == "Yes"
tips["female"] = tips["sex"] == "Female"

# Correlation plots on a larger canvas.
mpl.rc("figure", figsize=(7, 7))
sns.corrplot(tips)
sns.corrplot(tips, sig_stars=False)
sns.corrplot(tips, sig_tail='upper', cmap='PuRd', cmap_range=(-.2, .8))

# Linear-model plots on a smaller canvas.
mpl.rc('figure', figsize=(5, 5))
sns.lmplot('total_bill', 'tip', tips)
sns.lmplot('total_bill', 'tip', tips, color='time')
sns.lmplot('total_bill', 'tip', tips, color='day', palette='muted', ci=None)

tips['tip_sqr'] = tips.tip ** 2
sns.lmplot('total_bill', 'tip_sqr', tips, order=2)

sns.lmplot('size', 'big_tip', tips)
sns.lmplot('size', 'big_tip', tips, x_jitter=0.3, y_jitter=0.075)
sns.lmplot('size', 'big_tip', tips, x_jitter=0.3, y_jitter=0.075,
           logistic=True, n_boot=1000)

sns.lmplot('total_bill', 'tip', tips, col='sex')