def plotCorrMatrix(data, target, data_columns):
    """Show a correlation-matrix plot of ``target`` together with ``data``.

    ``target`` and the rows of ``data.T`` are handed to ``np.corrcoef`` as
    the variables to correlate, so ``target`` occupies the first row/column
    of the resulting matrix. The matrix is drawn with statsmodels'
    ``plot_corr`` using ``data_columns`` as axis labels and displayed with
    ``plt.show()``.

    NOTE(review): the matrix has one more row/column than ``data`` has
    columns (the extra one is ``target``) -- confirm that ``data_columns``
    carries a label for it.
    """
    variables_as_rows = data.T
    correlations = np.corrcoef(target, variables_as_rows)

    # Render the matrix as an image and display it.
    smg.plot_corr(correlations, xnames=data_columns)
    plt.show()
def plot_correlations():
    """
    Plot one correlation matrix per wine data set.

    Reads the module-level ``red_wines`` and ``white_wines`` data (columns
    are treated as variables), draws each correlation matrix with
    ``variables`` as the labels on both axes, and shows the figures.
    """
    print("plotting correlation matrix")

    # Compute both matrices up front, then draw them in the same order.
    matrices = [np.corrcoef(wines, rowvar=False) for wines in (red_wines, white_wines)]
    for matrix, palette in zip(matrices, ("Reds", "YlGn")):
        smg.plot_corr(matrix, xnames=variables, ynames=variables, cmap=palette, normcolor=True)

    plt.show()
def main():
    """Prune highly correlated features, then fit and report a logistic regression.

    Steps:
      1. Load features and target from ``Feature_Selection.Feature_select()``
         and back-fill missing values.
      2. Show the correlation matrix and a scatter matrix of the features.
      3. Drop every column whose absolute correlation with a later column
         exceeds 0.9.
      4. Split 80/20 (``random_state=1``), fit ``LogisticRegression`` and
         print the confusion matrix, classification report, accuracies and
         model parameters.
    """
    data_fs, target = Feature_Selection.Feature_select()

    # DataFrame.bfill() is the non-deprecated spelling of
    # fillna(method='bfill').
    data_fs = data_fs.bfill()

    # Correlation analysis
    columns_names = pd.Series(data_fs.columns)
    smg.plot_corr(data_fs.corr(), xnames=columns_names)
    plt.show()

    # 'o' (circle) is the valid matplotlib marker; an uppercase 'O' is not a
    # recognised marker style and makes the scatter call raise.
    pd.plotting.scatter_matrix(data_fs, marker='o')
    plt.show()

    # Drop one of the 2 features whose correlation is above 0.9.
    # Masking the upper triangle (diagonal included) leaves each pair once,
    # in the strictly lower triangle, so only one feature of a pair is dropped.
    abs_corr = data_fs.corr().abs()
    upper_mask = np.triu(np.ones_like(abs_corr, dtype=bool))
    lower_triangle = abs_corr.mask(upper_mask)
    to_drop = [c for c in lower_triangle.columns if any(lower_triangle[c] > 0.9)]
    data_fs = data_fs.drop(to_drop, axis=1)

    X = data_fs
    y = target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=1)

    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)

    # Probability of the second class for every test sample. The previous
    # slice ``[:1]`` returned only the first row of the probability matrix.
    y_prob = lr.predict_proba(X_test)[:, 1]
    # ROC analysis is currently disabled:
    # fpr, tpr, thresholds = roc_curve(y_test, y_prob)

    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    # print(roc_auc_score(y_test, y_prob))

    print(f"\n train accuracy: {lr.score(X_train, y_train)}\n")
    print(f"test accuracy: {lr.score(X_test, y_test)}\n")
    print(f"Intercept per class: {lr.intercept_}\n")
    print(f"Coeficients per class: {lr.coef_}\n")
    print(f"Available classes: {lr.classes_}\n")
    print(f"Named Coeficients for class 1: {pd.DataFrame(lr.coef_[0], data_fs.columns)}\n")
    print(f"Number of iterations generating model: {lr.n_iter_}")
def corr_matrix(data: pd.DataFrame):
    """
    Plot the correlation matrix of the numeric columns of ``data`` and save it.

    The figure is written to ``figs/corr_matrix.png`` (transparent
    background, 200 dpi) and returned to the caller. Titles, ticks and
    spines are coloured with ``cNoFocus``, which is expected to be defined
    at module level.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame; non-numeric columns are ignored.

    Returns
    -------
    The matplotlib figure produced by ``smg.plot_corr``.
    """
    numeric = data.select_dtypes(np.number)
    correlations = np.corrcoef(numeric.T)

    # plot_corr creates its own figure when no ``ax`` is passed. The old code
    # opened a separate figure with plt.subplots() and saved *that* one, so
    # the PNG was blank and an extra figure stayed open. Work on the figure
    # plot_corr returns instead and give it the intended 8x8 inch size.
    fig = smg.plot_corr(correlations, xnames=numeric.columns)
    fig.set_size_inches(8, 8)

    fig.axes[0].set_title('Correlation Matrix', color=cNoFocus)
    for axis in fig.axes:
        axis.tick_params(colors=cNoFocus)
        for side in ('bottom', 'top', 'left', 'right'):
            axis.spines[side].set_color(cNoFocus)
        axis.set_xticklabels(axis.get_xticklabels(), size='small')
        axis.set_yticklabels(axis.get_yticklabels(), size='small')

    fig.savefig("figs/corr_matrix.png", bbox_inches='tight', transparent=True, dpi=200)
    return fig
# -*- coding: utf-8 -*-
"""
Draw a grid showing the pairwise correlations of the variables in the
RAND Health Insurance Experiment data set.
"""
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import statsmodels.graphics.api as smg

hie_data = sm.datasets.randhie.load_pandas()

# np.corrcoef expects one variable per row, hence the transpose.
variables_as_rows = hie_data.data.T
corr_matrix = np.corrcoef(variables_as_rows)

smg.plot_corr(corr_matrix, xnames=hie_data.names)
plt.show()
# EDA: let's look at our data
# (What are the variables?)
print(faa_df)

# Later calculations cannot handle strings, so encode the manufacturer
# numerically: "airbus" -> 0, "boeing" -> 1.
faa_df = faa_df.replace(to_replace="airbus", value=0).replace(to_replace="boeing", value=1)

# Check results
print(faa_df)

# Correlation matrix on the full data
# (Note the results for speed_air)
# np.corrcoef treats each row as a variable, so the frame is transposed first.
corr_matrix = np.corrcoef(faa_df.transpose())
smg.plot_corr(corr_matrix, xnames=faa_df.columns.tolist())
plt.show()

# Drop rows containing NaN values and repeat the correlation analysis
faa_df_no_na = faa_df.dropna()

# Check results
# (How much data is left?)
print(faa_df_no_na)

# Correlation matrix on the NaN-free data
corr_matrix = np.corrcoef(faa_df_no_na.transpose())
smg.plot_corr(corr_matrix, xnames=faa_df_no_na.columns.tolist())
plt.show()

# Let's remove "speed_air" from our original data set
def correlationMatrix(self, corr):
    """Display ``corr`` as a correlation plot labelled Open/Close Price.

    The colour scale is normalised (``normcolor=True``) and the figure is
    shown immediately with ``plt.show()``.
    """
    price_labels = ["Open Price", "Close Price"]
    smg.plot_corr(corr, xnames=price_labels, normcolor=True)
    plt.show()
'''
analyze baskets using statsmodels
will be put somewhere else later. a file should not depend on the package
but rather the analysis being made.
'''
import statsmodels.graphics.api as smg
import statsmodels.graphics.tsaplots as tsa

# NOTE(review): `sm`, `np`, `plt`, `retmat1`, `r1` and `ret_vcc_mat` are not
# defined in this snippet -- presumably they come from earlier code or an
# interactive session; confirm before running this as a standalone script.

# Loaded but not used anywhere in the visible code.
hie_data = sm.datasets.randhie.load_pandas()

# Bare expression: only evaluates the name (useful for display in an
# interactive session, no effect in a script).
retmat1

# Correlation matrix with the rows of retmat1.T as the variables.
corr_matrix = np.corrcoef(retmat1.T)
# Bare expression again; the shape is not stored or printed.
corr_matrix.shape
smg.plot_corr(corr_matrix, cmap='viridis')

# Signature kept for reference:
#tsa.plot_acf(x, ax=None, lags=None, alpha=0.05, use_vlines=True, unbiased=False, fft=False, title='Autocorrelation', zero=True, vlines_kwargs=None, **kwargs)

# plot: autocorrelation of r1 for 30 lags; the pyplot calls below act on the
# current figure, so they must directly follow plot_acf.
tsa.plot_acf(r1, lags=30, alpha=0.05, use_vlines=False)
plt.title('Autocorrelation ' + r1.name)
# Axis labels are reused by the second plot further down.
ylabel1 = 'corr of day t with day t minus lag'
xlabel1 = 'Lag'
plt.xlabel(xlabel1)
plt.ylabel(ylabel1)
plt.savefig('output/bsk/ret/ACF_bsk1.png')

# plot: autocorrelation of the BTC series; this figure is labelled but, in
# the visible code, not saved.
tsa.plot_acf(ret_vcc_mat.BTC, use_vlines=False, lags=30, alpha=0.05)
plt.title('Autocorrelation BTC')
plt.xlabel(xlabel1)
plt.ylabel(ylabel1)
# Future calculations won't work on strings, so let's convert "airbus" and "boeing" to 0 nd 1, respectively faa_df = faa_df.replace(to_replace="airbus", value=0) faa_df = faa_df.replace(to_replace="boeing", value=1) # Lets remove "speed_air" from our original data set del faa_df["speed_air"] # Create a new data frame for boeing data boeing_df = faa_df[faa_df.aircraft == 1] # Examine data print(boeing_df) # Create our correlation matrix corr_matrix = np.corrcoef(boeing_df.T) smg.plot_corr(corr_matrix, xnames=list(boeing_df)) plt.show() # Pairwise plot using seaborn sns.pairplot(boeing_df) plt.show() # Build model with distance as dependant variable and all other variables as independent variables model = smf.ols( formula="distance ~ duration + no_pasg + speed_ground + height + pitch", data=boeing_df) results = model.fit() print(results.summary()) # Build revised model with distance as dependant variable and significant variables as independent variables revised_model = smf.ols(formula="distance ~ speed_ground + height",
# Future calculations won't work on strings, so let's convert "airbus" and "boeing" to 0 nd 1, respectively faa_df = faa_df.replace(to_replace="airbus", value=0) faa_df = faa_df.replace(to_replace="boeing", value=1) # Lets remove "speed_air" from our original data set del faa_df["speed_air"] # Create a new data frame for boeing data airbus_df = faa_df[faa_df.aircraft == 0] # Examine data print(airbus_df) # Create our correlation matrix corr_matrix = np.corrcoef(airbus_df.T) smg.plot_corr(corr_matrix, xnames=list(airbus_df)) plt.show() # Pairwise plot using seaborn sns.pairplot(airbus_df) plt.show() # Build model with distance as dependant variable and all other variables as independent variables model = smf.ols( formula="distance ~ duration + no_pasg + speed_ground + height + pitch", data=airbus_df) results = model.fit() print(results.summary()) # Build revised model with distance as dependant variable and significant variables as independent variables revised_model = smf.ols(formula="distance ~ speed_ground + height + pitch",
#     (bc_data['Single Epithelial Cell Size'] > stats_desc.loc['Single Epithelial Cell Size'].rwhisker) |
#     (bc_data['Bare Nuclei'] > stats_desc.loc['Bare Nuclei'].rwhisker) |
#     (bc_data['Normal Nucleoli'] > stats_desc.loc['Normal Nucleoli'].rwhisker) |
#     (bc_data['Bland Chromatin'] > stats_desc.loc['Bland Chromatin'].rwhisker) |
#     (bc_data['Mitoses'] > stats_desc.loc['Mitoses'].rwhisker)].index)
bc_data_new.info()


# In[332]:


# Pairwise correlations between the columns
corr = bc_data_new.corr()
corr


# In[333]:


import statsmodels.graphics.api as smg

# Visualise the correlation matrix, labelled with the column names
smg.plot_corr(corr, xnames=corr.columns.tolist())
plt.show()


# In[334]:


# Per-column variance
bc_data_new.var()


# In[335]:


# Mitoses has zero variance, so remove it
bc_data_new.drop(columns='Mitoses', inplace=True)


# In[336]:


# Cell Size and Cell Shape are very strongly correlated, so remove Cell Shape
bc_data_new.drop(columns='Cell Shape', inplace=True)