def plot_correlation_map(train, sensor_columns):
    """Show a heatmap of the pairwise correlations between sensor columns.

    ``train`` is indexed with ``sensor_columns``, the correlation matrix of
    that selection is drawn with ``plot_corr`` (the column names label both
    axes), and the result is displayed with ``plt.show()``.  Nothing is
    returned.
    """
    sensor_frame = train[sensor_columns]
    plot_corr(
        sensor_frame.corr(),
        xnames=sensor_columns,
        ynames=sensor_columns,
        title='Sensor Correlations',
    )
    plt.show()
def test_plot_corr():
    """Smoke-test ``plot_corr`` with three keyword combinations.

    The correlation matrix is built once from the ``randhie`` data; the
    figure returned by each call is closed immediately so no figures are
    left open.
    """
    hie_data = randhie.load_pandas()
    corr_matrix = np.corrcoef(hie_data.data.values.T)

    keyword_sets = (
        {'xnames': hie_data.names},
        {'xnames': [], 'ynames': hie_data.names},
        {'normcolor': True, 'title': '', 'cmap': 'jet'},
    )
    for plot_kwargs in keyword_sets:
        fig = plot_corr(corr_matrix, **plot_kwargs)
        plt.close(fig)
def azureml_main(BikeShare):
    """Draw exploratory plots of the bike-share data and return the frame.

    In order, the function shows: a correlation matrix of every column
    except ``dteday``; line plots of ``cnt`` against ``dayCount`` for a few
    hours of the day; box plots of ``cnt`` grouped by several columns;
    scatter plots of ``cnt`` against several columns with a LOWESS curve
    overlaid; and box plots of ``cnt`` by ``isWorking`` for two fixed hours.

    Parameters
    ----------
    BikeShare : pandas.DataFrame
        Needs the columns ``dteday``, ``hr``, ``dayCount`` and ``cnt`` plus
        every column named in the ``xAxes`` lists below.  Some of those are
        presumably added by ``set_day`` -- confirm against that helper.

    Returns
    -------
    pandas.DataFrame
        Whatever ``set_day(BikeShare)`` returned.
    """
    from sklearn import preprocessing
    import numpy as np
    import matplotlib.pyplot as plt
    import statsmodels.graphics.correlation as pltcor
    import statsmodels.nonparametric.smoothers_lowess as lw

    ## Set the font size for the plots
    import matplotlib
    matplotlib.rcParams.update({'font.size': 20})

    # Figures are written to disk only when this flag is switched on.
    Azure = False

    ## Correlation matrix of every column except the date.
    features = BikeShare.drop('dteday', axis=1)
    # `.values` replaces DataFrame.as_matrix(), which pandas 1.0 removed.
    arry = features.values
    # NOTE(review): axis=1 standardizes each row, not each column, before
    # the column-wise correlation below -- confirm this is intended.
    arry = preprocessing.scale(arry, axis=1)
    corrs = np.corrcoef(arry, rowvar=False)
    # Blank the diagonal, presumably so the trivial self-correlations do not
    # dominate the colour scale.
    np.fill_diagonal(corrs, 0)
    # Take the labels from the frame that produced the matrix, so they line
    # up with its columns even when 'dteday' is not the first column.
    col_nms = list(features)
    fig = plt.figure(figsize=(9, 9))
    ax = fig.gca()
    pltcor.plot_corr(corrs, xnames=col_nms, ax=ax)
    plt.show()
    if Azure:
        fig.savefig('cor1.png')

    ## Make time series plots of bike demand by times of the day.
    times = [7, 9, 12, 15, 18, 20, 22]
    for tm in times:
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare[BikeShare.hr == tm].plot(kind='line', x='dayCount',
                                           y='cnt', ax=ax)
        plt.xlabel("Days from start of plot")
        plt.ylabel("Count of bikes rented")
        plt.title("Bikes rented by days for hour = " + str(tm))
        plt.show()

    ## Box plots of the predictor values vs the demand for bikes.
    BikeShare = set_day(BikeShare)
    labels = [
        "Box plots of hourly bike demand",
        "Box plots of monthly bike demand",
        "Box plots of bike demand by weather factor",
        "Box plots of bike demand by workday vs. holiday",
        "Box plots of bike demand by day of the week",
    ]
    xAxes = ["hr", "mnth", "weathersit", "isWorking", "dayWeek"]
    # NOTE(review): `lab` is never drawn in this loop; the plots keep the
    # titles pandas generates.  Confirm whether the labels were meant to be
    # used as titles.
    for lab, xaxs in zip(labels, xAxes):
        fig = plt.figure(figsize=(10, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare.boxplot(column=['cnt'], by=[xaxs], ax=ax)
        plt.xlabel('')
        plt.ylabel('Number of bikes')
        plt.show()

    ## Make scatter plots of bike demand vs. various features.
    labels = [
        "Bike demand vs temperature",
        "Bike demand vs humidity",
        "Bike demand vs windspeed",
        "Bike demand vs hr",
        "Bike demand vs xformHr",
        "Bike demand vs xformWorkHr",
    ]
    xAxes = ["temp", "hum", "windspeed", "hr", "xformHr", "xformWorkHr"]
    # NOTE(review): `lab` is unused here as well.
    for lab, xaxs in zip(labels, xAxes):
        ## first compute a lowess fit to the data
        los = lw.lowess(BikeShare['cnt'], BikeShare[xaxs], frac=0.2)

        ## Now make the plots
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare.plot(kind='scatter', x=xaxs, y='cnt', ax=ax, alpha=0.05)
        # Draw the fitted curve on the same axes as the scatter explicitly,
        # instead of going through pyplot's current-axes state.
        ax.plot(los[:, 0], los[:, 1], color='red')
        plt.show()

    ## Explore bike demand for certain times on working and nonworking days
    # NOTE(review): the titles say 0900/1800 while the filter selects
    # hr == 8 and hr == 17 -- confirm which is intended.
    labels = [
        "Boxplots of bike demand at 0900 \n\n",
        "Boxplots of bike demand at 1800 \n\n",
    ]
    times = [8, 17]
    for lab, tms in zip(labels, times):
        temp = BikeShare[BikeShare.hr == tms]
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        temp.boxplot(column=['cnt'], by=['isWorking'], ax=ax)
        plt.xlabel('')
        plt.ylabel('Number of bikes')
        plt.title(lab)
        plt.show()

    return BikeShare
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.correlation import plot_corr

df = pd.read_csv('https://raw.githubusercontent.com/Explore-AI/Public-Data/master/bootcamps/Personal_Loans.csv')
df.head()

# Column names: swap every space for an underscore
df.columns = [name.replace(" ", "_") for name in df.columns]

# Keep only the customers who have already taken out a loan
took_loan = df['Personal_Loan'] == 1
df = df[took_loan]

# Split the (now constant) loan flag off from the remaining columns
y = df['Personal_Loan']
df = df.drop(["Personal_Loan"], axis=1)

# One-hot encode the remaining columns
df_dummies = pd.get_dummies(df)

# Reorder the columns so that the response, Loan_Size, comes last
col_titles = [col for col in df_dummies.columns if col != 'Loan_Size']
col_titles.append("Loan_Size")
df_dummies = df_dummies.reindex(columns=col_titles)

# Correlation heatmap drawn with statsmodels; the matrix is computed once
# and reused for both the values and the tick labels
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111)
dummy_corr = df_dummies.corr()
plot_corr(dummy_corr, xnames=dummy_corr.columns, ax=ax)
import statsmodels.sandbox.tools as sbtools
from statsmodels.graphics.correlation import plot_corr, plot_corr_grid

# Load the pickled return data.
# NOTE: unpickling can execute arbitrary code -- only point this at a
# trusted, locally generated 'dj30rr' file.
try:
    # open() in a `with` block replaces the Python-2-only file() builtin and
    # guarantees the handle is closed, also when unpickling fails.
    with open('dj30rr', 'rb') as pickle_file:
        rrdm = pickle.load(pickle_file)
except Exception:  # blanket for any unpickling error
    print("Error with unpickling, a new pickle file can be created with findow_1")
    raise

ticksym = rrdm.columns.tolist()
# Rows 1..399 of the data; columns are treated as the variables below.
rr = rrdm.values[1:400]
rrcorr = np.corrcoef(rr, rowvar=0)

plot_corr(rrcorr, xnames=ticksym)
nvars = rrcorr.shape[0]
plt.figure()
# Strict upper triangle only, so every pair of variables is counted once.
plt.hist(rrcorr[np.triu_indices(nvars, 1)])
plt.title('Correlation Coefficients')

xreda, facta, evaa, evea = sbtools.pcasvd(rr)
# Running total of `evaa`, printed as a fraction of its final value
# (presumably the cumulative share explained per component -- confirm
# against pcasvd).
evallcs = (evaa).cumsum()
print(evallcs / evallcs[-1])

xred, fact, eva, eve = sbtools.pcasvd(rr, keepdim=4)
pcacorr = np.corrcoef(xred, rowvar=0)
plot_corr(pcacorr, xnames=ticksym, title='Correlation PCA')

# What is left of the data after subtracting the reduced version.
resid = rr - xred
residcorr = np.corrcoef(resid, rowvar=0)
# ### Correlazione # Si indaga sulla correlazione tra le variabili di interesse # ##### Pearson Correlation Coefficient # In[13]: from statsmodels.graphics.correlation import plot_corr columns = [ 'k_avg', 'street_length_avg', 'ratio_nodi_interesse', 'ratio_edges', 'avg_betweenness', 'avg_closeness', 'avg_clustering', 'prezzo_medio' ] fig = plot_corr(df[columns].corr(), xnames=columns) fig.set_size_inches(12, 9) # In[16]: df[columns].corr() # ##### Kendall Rank Correlation Coefficient # In[14]: from scipy.stats import kendalltau kendalltau_matrix = df[columns].corr() for col1 in columns: for col2 in columns:
# now model our data
# Feature and target are each kept as a one-column DataFrame, then split
# into training and test parts with a fixed seed
x_set = pd.DataFrame(df, columns=['SQ_Home_Passer_Rating'])
y_set = pd.DataFrame(df, columns=['Home_Score'])
X_train, X_test, y_train, y_test = train_test_split(
    x_set, y_set, random_state=1
)

# Create the linear regression model and fit it on the training part
lin_reg_mod = LinearRegression().fit(X_train, y_train)

# Predict on the test part
pred = lin_reg_mod.predict(X_test)

# Print the fitted intercept, then the fitted coefficient(s)
for fitted_value in (lin_reg_mod.intercept_, lin_reg_mod.coef_):
    print(fitted_value)

# R^2 (coefficient of determination) between actual and predicted values;
# the closer to 1, the better the fit
test_set_r2 = r2_score(y_test, pred)
print(test_set_r2)

# Correlation plot of the score against the two passer-rating columns
df2 = df[['Home_Score', 'Home_Passer_Rating', 'SQ_Home_Passer_Rating']]
corr2 = df2.corr()
fig = plot_corr(corr2, xnames=corr2.columns)
plt.show()
], axis=1)
# (The line above closes an expression that starts before this excerpt.)

# Name the six summary columns, then write the table to disk.
metabolite_difference.columns = [ 'Missing Case Mean', 'Complete Case Mean', 'Difference in Mean', 'Missing Case Variance', 'Complete Case Variance', 'Difference in Variance' ]
metabolite_difference.to_csv('metabolite_difference_table.csv')

# https://stanford.edu/~mwaskom/software/seaborn/examples/many_pairwise_correlations.html
# Correlation between variables, plot heatmap
corr = ld.data_df.corr()
N, M = 12, 12
fig, ax = plt.subplots(figsize=(N, M))
stat.plot_corr(corr, ax=ax)

# CAD/noCAD split
# NOTE(review): `.ix` was removed in pandas 1.0.  Whether 225 is meant as a
# position (`.iloc`) or a label (`.loc`) depends on the column index, which
# is not visible here -- confirm before migrating.
# Column 225 is summed to count CAD cases, so it is presumably a 0/1
# indicator -- verify against the data loader.
CAD_missing_case = sum(ld.data_withmissing_df.ix[:, 225])
total_missing_case = len(ld.data_withmissing_df.ix[:, 225])
noCAD_missing_case = total_missing_case - CAD_missing_case
# NOTE(review): under Python 2 without `from __future__ import division`
# this is integer division when both operands are ints -- confirm.
propCAD_missing_case = CAD_missing_case / total_missing_case
CAD_complete_case = sum(ld.data_withoutmissing_df.ix[:, 225])
total_complete_case = len(ld.data_withoutmissing_df.ix[:, 225])
noCAD_complete_case = total_complete_case - CAD_complete_case
propCAD_complete_case = CAD_complete_case / total_complete_case

# CAD/noCAD split in training and test sets of missing data case
CAD_missing_train = sum(ldmi.train1.ix[:, 225])
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import statsmodels.graphics.correlation as sm

df = pd.read_csv('../Data/data_3/Housing_Modified.csv')

# Convert the binary fields to numeric boolean fields, one after another,
# with a single shared LabelBinarizer
lb = preprocessing.LabelBinarizer()
for binary_field in ('driveway', 'recroom', 'fullbase',
                     'gashw', 'airco', 'prefarea'):
    df[binary_field] = lb.fit_transform(df[binary_field])

# Dummy-encode `stories` (first level dropped), append the dummies to the
# main dataframe and remove the original column
df_stories = pd.get_dummies(df['stories'], prefix='stories', drop_first=True)
df = pd.concat([df, df_stories], axis=1)
del df['stories']

# Correlation matrix, drawn with plot_corr from the statsmodels graphics
# package
corr = df.corr()
sm.plot_corr(corr, xnames=list(corr.columns))
plt.show()
def azureml_main(BikeShare):
    """De-trend bike demand, draw exploratory plots and return the frame.

    The frame is first sorted by ``dayCount`` **in place** and ``cnt`` is
    replaced by the residual of a linear fit of ``cnt`` on ``dayCount``, so
    the caller's frame is modified.  The function then shows: two
    correlation matrices (all columns except ``dteday``, and a fixed subset
    of columns); line plots of ``cnt`` against ``dayCount`` for a few hours
    of the day; box plots of ``cnt`` grouped by several columns; scatter
    plots of ``cnt`` against several columns with a LOWESS curve overlaid;
    and box plots of ``cnt`` by ``isWorking`` for two fixed hours.

    Parameters
    ----------
    BikeShare : pandas.DataFrame
        Needs the columns ``dteday``, ``hr``, ``dayCount`` and ``cnt`` plus
        every column named in ``cols`` and the ``xAxes`` lists below.  Some
        of those are presumably added by ``set_day`` -- confirm against
        that helper.

    Returns
    -------
    pandas.DataFrame
        Whatever ``set_day(BikeShare)`` returned.
    """
    import matplotlib
    matplotlib.use('agg')  # Set backend
    matplotlib.rcParams.update({'font.size': 20})

    from sklearn import preprocessing
    from sklearn import linear_model
    import numpy as np
    import matplotlib.pyplot as plt
    import statsmodels.graphics.correlation as pltcor
    import statsmodels.nonparametric.smoothers_lowess as lw

    # Figures are written to disk only when this flag is switched on.
    Azure = False

    ## Sort the data frame based on the dayCount
    # sort_values() replaces DataFrame.sort(), which pandas 0.20 removed.
    BikeShare.sort_values('dayCount', axis=0, inplace=True)

    ## De-trend the bike demand with time.
    nrow = BikeShare.shape[0]
    # `.values` replaces as_matrix(), which pandas 1.0 removed.
    X = BikeShare.dayCount.values.reshape((nrow, 1))
    Y = BikeShare.cnt.values

    ## Compute the linear model.
    clf = linear_model.LinearRegression()
    bike_lm = clf.fit(X, Y)

    ## Remove the trend
    BikeShare.cnt = BikeShare.cnt - bike_lm.predict(X)

    ## Compute the correlation matrix and set the diagonal
    ## elements to 0.
    features = BikeShare.drop('dteday', axis=1)
    arry = features.values
    # NOTE(review): axis=1 standardizes each row, not each column, before
    # the column-wise correlation below -- confirm this is intended.
    arry = preprocessing.scale(arry, axis=1)
    corrs = np.corrcoef(arry, rowvar=False)
    np.fill_diagonal(corrs, 0)
    # Take the labels from the frame that produced the matrix, so they line
    # up with its columns even when 'dteday' is not the first column.
    col_nms = list(features)
    fig = plt.figure(figsize=(9, 9))
    ax = fig.gca()
    pltcor.plot_corr(corrs, xnames=col_nms, ax=ax)
    plt.show()
    if Azure:
        fig.savefig('cor1.png')

    ## Compute and plot the correlation matrix with
    ## a smaller subset of columns.
    cols = [
        'yr', 'mnth', 'isWorking', 'xformWorkHr', 'dayCount', 'temp', 'hum',
        'windspeed', 'cnt'
    ]
    arry = BikeShare[cols].values
    arry = preprocessing.scale(arry, axis=1)
    corrs = np.corrcoef(arry, rowvar=False)
    np.fill_diagonal(corrs, 0)
    fig = plt.figure(figsize=(9, 9))
    ax = fig.gca()
    pltcor.plot_corr(corrs, xnames=cols, ax=ax)
    plt.show()
    if Azure:
        fig.savefig('cor2.png')

    ## Make time series plots of bike demand by times of the day.
    times = [7, 9, 12, 15, 18, 20, 22]
    for tm in times:
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare[BikeShare.hr == tm].plot(kind='line', x='dayCount',
                                           y='cnt', ax=ax)
        plt.xlabel("Days from start of plot")
        plt.ylabel("Count of bikes rented")
        plt.title("Bikes rented by days for hour = " + str(tm))
        plt.show()
        if Azure:
            fig.savefig('tsplot' + str(tm) + '.png')

    ## Box plots of the predictor values vs the demand for bikes.
    BikeShare = set_day(BikeShare)
    labels = [
        "Box plots of hourly bike demand",
        "Box plots of monthly bike demand",
        "Box plots of bike demand by weather factor",
        "Box plots of bike demand by workday vs. holiday",
        "Box plots of bike demand by day of the week",
        "Box plots by transformed work hour of the day",
    ]
    xAxes = ["hr", "mnth", "weathersit", "isWorking", "dayWeek", "xformWorkHr"]
    # NOTE(review): `lab` is never drawn in this loop; the plots keep the
    # titles pandas generates.  Confirm whether the labels were meant to be
    # used as titles.
    for lab, xaxs in zip(labels, xAxes):
        fig = plt.figure(figsize=(10, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare.boxplot(column=['cnt'], by=[xaxs], ax=ax)
        plt.xlabel('')
        plt.ylabel('Number of bikes')
        plt.show()
        if Azure:
            fig.savefig('boxplot' + xaxs + '.png')

    ## Make scatter plots of bike demand vs. various features.
    labels = [
        "Bike demand vs temperature",
        "Bike demand vs humidity",
        "Bike demand vs windspeed",
        "Bike demand vs hr",
        "Bike demand vs xformHr",
        "Bike demand vs xformWorkHr",
    ]
    xAxes = ["temp", "hum", "windspeed", "hr", "xformHr", "xformWorkHr"]
    # NOTE(review): `lab` is unused here as well.
    for lab, xaxs in zip(labels, xAxes):
        ## first compute a lowess fit to the data
        los = lw.lowess(BikeShare['cnt'], BikeShare[xaxs], frac=0.2)

        ## Now make the plots
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare.plot(kind='scatter', x=xaxs, y='cnt', ax=ax, alpha=0.05)
        # Draw the fitted curve on the same axes as the scatter explicitly,
        # instead of going through pyplot's current-axes state.
        ax.plot(los[:, 0], los[:, 1], color='red')
        plt.show()
        if Azure:
            fig.savefig('scatterplot' + xaxs + '.png')

    ## Explore bike demand for certain times on working and nonworking days
    # NOTE(review): the titles say 0900/1800 while the filter selects
    # hr == 8 and hr == 17 -- confirm which is intended.
    labels = [
        "Boxplots of bike demand at 0900 \n\n",
        "Boxplots of bike demand at 1800 \n\n",
    ]
    times = [8, 17]
    for lab, tms in zip(labels, times):
        temp = BikeShare[BikeShare.hr == tms]
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        temp.boxplot(column=['cnt'], by=['isWorking'], ax=ax)
        plt.xlabel('')
        plt.ylabel('Number of bikes')
        plt.title(lab)
        plt.show()
        if Azure:
            fig.savefig('timeplot' + str(tms) + '.png')

    return BikeShare
# Mean of each chain's log-likelihood samples.
means = [lls.mean() for lls in ll_samp_chains]

# One trace per chain; entries equal to 0 are left out of the plot.
for i, lls in enumerate(ll_samp_chains):
    plt.plot(lls[lls != 0], label="chain %d" % i)
plt.legend()
plt.show()
plt.close("all")

### compare thetas
#fig, axarr = plt.subplots(4, 1)
#for i in range(len(axarr)):
#    axarr[i].hist(src_samp_chains[i]['theta'], bins=20)
#plt.show()

# Written as a call so the line parses on Python 3 as well as Python 2.
print(" comparing autocorrelation")
from statsmodels.graphics.correlation import plot_corr
plot_corr(np.corrcoef(samps))

# Stack each chain's 'theta' field into one array, then plot the
# autocorrelation of the chain at index 1.
theta_mat = np.array([schain['theta'] for schain in src_samp_chains])
plot_acf(theta_mat[1])
plt.show()

#def plot_samps(src_samps, num_burnin=500):
#    """ plots source samples """
#    samp_dict = load_samples("experiment_cache/samp_cache/gal_samps_stamp_5.0026-0.1581_chain_0.bin")
#    src_samps = samp_dict['srcs'][2500:,0]
#
#    #fig = plt.figure()
#    with sns.axes_style("white"):
#
#        ## plot ANGLE vs RATIO
#        jgrid = sns.jointplot(src_samps['phi'], src_samps['rho'], kind = "kde")
# NOTE(review): `i` and `lls` are not defined in this excerpt; this call
# looks like the body of a loop whose header lies above it -- confirm its
# indentation when merging.
plt.plot(lls[lls != 0], label="chain %d"%i)
plt.legend()
plt.show()
plt.close("all")

### compare thetas
#fig, axarr = plt.subplots(4, 1)
#for i in range(len(axarr)):
#    axarr[i].hist(src_samp_chains[i]['theta'], bins=20)
#plt.show()

# Written as a call so the line parses on Python 3 as well as Python 2.
print(" comparing autocorrelation")
from statsmodels.graphics.correlation import plot_corr
plot_corr(np.corrcoef(samps))

# Stack each chain's 'theta' field into one array, then plot the
# autocorrelation of the chain at index 1.
theta_mat = np.array([schain['theta'] for schain in src_samp_chains])
plot_acf(theta_mat[1])
plt.show()

#def plot_samps(src_samps, num_burnin=500):
#    """ plots source samples """
#    samp_dict = load_samples("experiment_cache/samp_cache/gal_samps_stamp_5.0026-0.1581_chain_0.bin")
#    src_samps = samp_dict['srcs'][2500:,0]
#
#    #fig = plt.figure()
#    with sns.axes_style("white"):
#
#        ## plot ANGLE vs RATIO
#        jgrid = sns.jointplot(src_samps['phi'], src_samps['rho'], kind = "kde")
def azureml_main(BikeShare):
    """Draw exploratory plots of the bike-share data and return the frame.

    In order, the function shows: a correlation matrix of every column
    except ``dteday``; line plots of ``cnt`` against ``dayCount`` for a few
    hours of the day; box plots of ``cnt`` grouped by several columns;
    scatter plots of ``cnt`` against several columns with a LOWESS curve
    overlaid; and box plots of ``cnt`` by ``isWorking`` for two fixed hours.

    Parameters
    ----------
    BikeShare : pandas.DataFrame
        Needs the columns ``dteday``, ``hr``, ``dayCount`` and ``cnt`` plus
        every column named in the ``xAxes`` lists below.  Some of those are
        presumably added by ``set_day`` -- confirm against that helper.

    Returns
    -------
    pandas.DataFrame
        Whatever ``set_day(BikeShare)`` returned.
    """
    from sklearn import preprocessing
    import numpy as np
    import matplotlib.pyplot as plt
    import statsmodels.graphics.correlation as pltcor
    import statsmodels.nonparametric.smoothers_lowess as lw

    ## Set the font size for the plots
    import matplotlib
    matplotlib.rcParams.update({'font.size': 20})

    # Figures are written to disk only when this flag is switched on.
    Azure = False

    ## Correlation matrix of every column except the date.
    features = BikeShare.drop('dteday', axis = 1)
    # `.values` replaces DataFrame.as_matrix(), which pandas 1.0 removed.
    arry = features.values
    # NOTE(review): axis = 1 standardizes each row, not each column, before
    # the column-wise correlation below -- confirm this is intended.
    arry = preprocessing.scale(arry, axis = 1)
    corrs = np.corrcoef(arry, rowvar = False)
    # Blank the diagonal, presumably so the trivial self-correlations do not
    # dominate the colour scale.
    np.fill_diagonal(corrs, 0)
    # Take the labels from the frame that produced the matrix, so they line
    # up with its columns even when 'dteday' is not the first column.
    col_nms = list(features)
    fig = plt.figure(figsize = (9,9))
    ax = fig.gca()
    pltcor.plot_corr(corrs, xnames = col_nms, ax = ax)
    plt.show()
    if Azure:
        fig.savefig('cor1.png')

    ## Make time series plots of bike demand by times of the day.
    times = [7, 9, 12, 15, 18, 20, 22]
    for tm in times:
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare[BikeShare.hr == tm].plot(kind = 'line', x = 'dayCount',
                                           y = 'cnt', ax = ax)
        plt.xlabel("Days from start of plot")
        plt.ylabel("Count of bikes rented")
        plt.title("Bikes rented by days for hour = " + str(tm))
        plt.show()

    ## Box plots of the predictor values vs the demand for bikes.
    BikeShare = set_day(BikeShare)
    labels = ["Box plots of hourly bike demand",
              "Box plots of monthly bike demand",
              "Box plots of bike demand by weather factor",
              "Box plots of bike demand by workday vs. holiday",
              "Box plots of bike demand by day of the week"]
    xAxes = ["hr", "mnth", "weathersit", "isWorking", "dayWeek"]
    # NOTE(review): `lab` is never drawn in this loop; the plots keep the
    # titles pandas generates.  Confirm whether the labels were meant to be
    # used as titles.
    for lab, xaxs in zip(labels, xAxes):
        fig = plt.figure(figsize=(10, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare.boxplot(column = ['cnt'], by = [xaxs], ax = ax)
        plt.xlabel('')
        plt.ylabel('Number of bikes')
        plt.show()

    ## Make scatter plots of bike demand vs. various features.
    labels = ["Bike demand vs temperature",
              "Bike demand vs humidity",
              "Bike demand vs windspeed",
              "Bike demand vs hr",
              "Bike demand vs xformHr",
              "Bike demand vs xformWorkHr"]
    xAxes = ["temp", "hum", "windspeed", "hr", "xformHr", "xformWorkHr"]
    # NOTE(review): `lab` is unused here as well.
    for lab, xaxs in zip(labels, xAxes):
        ## first compute a lowess fit to the data
        los = lw.lowess(BikeShare['cnt'], BikeShare[xaxs], frac = 0.2)

        ## Now make the plots
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare.plot(kind = 'scatter', x = xaxs, y = 'cnt', ax = ax,
                       alpha = 0.05)
        # Draw the fitted curve on the same axes as the scatter explicitly,
        # instead of going through pyplot's current-axes state.
        ax.plot(los[:, 0], los[:, 1], color = 'red')
        plt.show()

    ## Explore bike demand for certain times on working and nonworking days
    # NOTE(review): the titles say 0900/1800 while the filter selects
    # hr == 8 and hr == 17 -- confirm which is intended.
    labels = ["Boxplots of bike demand at 0900 \n\n",
              "Boxplots of bike demand at 1800 \n\n"]
    times = [8, 17]
    for lab, tms in zip(labels, times):
        temp = BikeShare[BikeShare.hr == tms]
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        temp.boxplot(column = ['cnt'], by = ['isWorking'], ax = ax)
        plt.xlabel('')
        plt.ylabel('Number of bikes')
        plt.title(lab)
        plt.show()

    return BikeShare
def azureml_main(BikeShare):
    """De-trend bike demand, draw exploratory plots and return the frame.

    The frame is first sorted by ``dayCount`` **in place** and ``cnt`` is
    replaced by the residual of a linear fit of ``cnt`` on ``dayCount``, so
    the caller's frame is modified.  The function then shows: two
    correlation matrices (all columns except ``dteday``, and a fixed subset
    of columns); line plots of ``cnt`` against ``dayCount`` for a few hours
    of the day; box plots of ``cnt`` grouped by several columns; scatter
    plots of ``cnt`` against several columns with a LOWESS curve overlaid;
    and box plots of ``cnt`` by ``isWorking`` for two fixed hours.

    Parameters
    ----------
    BikeShare : pandas.DataFrame
        Needs the columns ``dteday``, ``hr``, ``dayCount`` and ``cnt`` plus
        every column named in ``cols`` and the ``xAxes`` lists below.  Some
        of those are presumably added by ``set_day`` -- confirm against
        that helper.

    Returns
    -------
    pandas.DataFrame
        Whatever ``set_day(BikeShare)`` returned.
    """
    import matplotlib

    matplotlib.use("agg")  # Set backend
    matplotlib.rcParams.update({"font.size": 20})

    from sklearn import preprocessing
    from sklearn import linear_model
    import numpy as np
    import matplotlib.pyplot as plt
    import statsmodels.graphics.correlation as pltcor
    import statsmodels.nonparametric.smoothers_lowess as lw

    # Figures are written to disk only when this flag is switched on.
    Azure = False

    ## Sort the data frame based on the dayCount
    # sort_values() replaces DataFrame.sort(), which pandas 0.20 removed.
    BikeShare.sort_values("dayCount", axis=0, inplace=True)

    ## De-trend the bike demand with time.
    nrow = BikeShare.shape[0]
    # `.values` replaces as_matrix(), which pandas 1.0 removed.
    X = BikeShare.dayCount.values.reshape((nrow, 1))
    Y = BikeShare.cnt.values

    ## Compute the linear model.
    clf = linear_model.LinearRegression()
    bike_lm = clf.fit(X, Y)

    ## Remove the trend
    BikeShare.cnt = BikeShare.cnt - bike_lm.predict(X)

    ## Compute the correlation matrix and set the diagonal
    ## elements to 0.
    features = BikeShare.drop("dteday", axis=1)
    arry = features.values
    # NOTE(review): axis=1 standardizes each row, not each column, before
    # the column-wise correlation below -- confirm this is intended.
    arry = preprocessing.scale(arry, axis=1)
    corrs = np.corrcoef(arry, rowvar=False)
    np.fill_diagonal(corrs, 0)
    # Take the labels from the frame that produced the matrix, so they line
    # up with its columns even when "dteday" is not the first column.
    col_nms = list(features)
    fig = plt.figure(figsize=(9, 9))
    ax = fig.gca()
    pltcor.plot_corr(corrs, xnames=col_nms, ax=ax)
    plt.show()
    if Azure:
        fig.savefig("cor1.png")

    ## Compute and plot the correlation matrix with
    ## a smaller subset of columns.
    cols = ["yr", "mnth", "isWorking", "xformWorkHr", "dayCount", "temp", "hum", "windspeed", "cnt"]
    arry = BikeShare[cols].values
    arry = preprocessing.scale(arry, axis=1)
    corrs = np.corrcoef(arry, rowvar=False)
    np.fill_diagonal(corrs, 0)
    fig = plt.figure(figsize=(9, 9))
    ax = fig.gca()
    pltcor.plot_corr(corrs, xnames=cols, ax=ax)
    plt.show()
    if Azure:
        fig.savefig("cor2.png")

    ## Make time series plots of bike demand by times of the day.
    times = [7, 9, 12, 15, 18, 20, 22]
    for tm in times:
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare[BikeShare.hr == tm].plot(kind="line", x="dayCount", y="cnt", ax=ax)
        plt.xlabel("Days from start of plot")
        plt.ylabel("Count of bikes rented")
        plt.title("Bikes rented by days for hour = " + str(tm))
        plt.show()
        if Azure:
            fig.savefig("tsplot" + str(tm) + ".png")

    ## Box plots of the predictor values vs the demand for bikes.
    BikeShare = set_day(BikeShare)
    labels = [
        "Box plots of hourly bike demand",
        "Box plots of monthly bike demand",
        "Box plots of bike demand by weather factor",
        "Box plots of bike demand by workday vs. holiday",
        "Box plots of bike demand by day of the week",
        "Box plots by transformed work hour of the day",
    ]
    xAxes = ["hr", "mnth", "weathersit", "isWorking", "dayWeek", "xformWorkHr"]
    # NOTE(review): `lab` is never drawn in this loop; the plots keep the
    # titles pandas generates.  Confirm whether the labels were meant to be
    # used as titles.
    for lab, xaxs in zip(labels, xAxes):
        fig = plt.figure(figsize=(10, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare.boxplot(column=["cnt"], by=[xaxs], ax=ax)
        plt.xlabel("")
        plt.ylabel("Number of bikes")
        plt.show()
        if Azure:
            fig.savefig("boxplot" + xaxs + ".png")

    ## Make scatter plots of bike demand vs. various features.
    labels = [
        "Bike demand vs temperature",
        "Bike demand vs humidity",
        "Bike demand vs windspeed",
        "Bike demand vs hr",
        "Bike demand vs xformHr",
        "Bike demand vs xformWorkHr",
    ]
    xAxes = ["temp", "hum", "windspeed", "hr", "xformHr", "xformWorkHr"]
    # NOTE(review): `lab` is unused here as well.
    for lab, xaxs in zip(labels, xAxes):
        ## first compute a lowess fit to the data
        los = lw.lowess(BikeShare["cnt"], BikeShare[xaxs], frac=0.2)

        ## Now make the plots
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare.plot(kind="scatter", x=xaxs, y="cnt", ax=ax, alpha=0.05)
        # Draw the fitted curve on the same axes as the scatter explicitly,
        # instead of going through pyplot's current-axes state.
        ax.plot(los[:, 0], los[:, 1], color="red")
        plt.show()
        if Azure:
            fig.savefig("scatterplot" + xaxs + ".png")

    ## Explore bike demand for certain times on working and nonworking days
    # NOTE(review): the titles say 0900/1800 while the filter selects
    # hr == 8 and hr == 17 -- confirm which is intended.
    labels = ["Boxplots of bike demand at 0900 \n\n", "Boxplots of bike demand at 1800 \n\n"]
    times = [8, 17]
    for lab, tms in zip(labels, times):
        temp = BikeShare[BikeShare.hr == tms]
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        temp.boxplot(column=["cnt"], by=["isWorking"], ax=ax)
        plt.xlabel("")
        plt.ylabel("Number of bikes")
        plt.title(lab)
        plt.show()
        if Azure:
            fig.savefig("timeplot" + str(tms) + ".png")

    return BikeShare
from statsmodels.graphics.correlation import plot_corr, plot_corr_grid

# Load the pickled return data.
# NOTE: unpickling can execute arbitrary code -- only point this at a
# trusted, locally generated 'dj30rr' file.
try:
    # open() in a `with` block replaces the Python-2-only file() builtin and
    # guarantees the handle is closed, also when unpickling fails.
    with open('dj30rr', 'rb') as pickle_file:
        rrdm = cPickle.load(pickle_file)
except Exception:  # blanket for any unpickling error
    print("Error with unpickling, a new pickle file can be created with findow_1")
    raise

ticksym = rrdm.columns.tolist()
# Rows 1..399 of the data; columns are treated as the variables below.
rr = rrdm.values[1:400]
rrcorr = np.corrcoef(rr, rowvar=0)

plot_corr(rrcorr, xnames=ticksym)
nvars = rrcorr.shape[0]
plt.figure()
# Strict upper triangle only, so every pair of variables is counted once.
plt.hist(rrcorr[np.triu_indices(nvars,1)])
plt.title('Correlation Coefficients')

xreda, facta, evaa, evea = sbtools.pcasvd(rr)
# Running total of `evaa`, printed as a fraction of its final value
# (presumably the cumulative share explained per component -- confirm
# against pcasvd).
evallcs = (evaa).cumsum()
print(evallcs/evallcs[-1])

xred, fact, eva, eve = sbtools.pcasvd(rr, keepdim=4)
pcacorr = np.corrcoef(xred, rowvar=0)
plot_corr(pcacorr, xnames=ticksym, title='Correlation PCA')

# What is left of the data after subtracting the reduced version.
resid = rr-xred
residcorr = np.corrcoef(resid, rowvar=0)
# NOTE(review): `c` is not defined in this excerpt; the statements up to
# plt.show() look like the body of a loop over column names whose header
# lies above -- confirm their indentation when merging.
# Scatter plot of column `c` against the compressive-strength column.
plt.figure(figsize=(8,5))
plt.title("{} vs. \nConcrete Compressive Strength".format(c),fontsize=16)
plt.scatter(x=df[c],y=df['Concrete compressive strength(MPa, megapascals) '],color='blue',edgecolor='k')
plt.grid(True)
plt.xlabel(c,fontsize=14)
plt.ylabel('Concrete compressive strength\n(MPa, megapascals)',fontsize=14)
plt.show()

# Create a copy with short column names suitable for statsmodels.OLS():
# Component1..Component7, Age, and y as the last column.
df1 = df.copy()
df1.columns=['Component'+str(i) for i in range(1,8)]+['Age']+['y']
df1.head()

# Pairwise scatter plots
from seaborn import pairplot
pairplot(df1)

# Correlation matrix and heatmap to visually check for multicollinearity.
# In statistics, multicollinearity (also collinearity) is a phenomenon in
# which one predictor variable in a multiple regression model can be
# linearly predicted from the others with a substantial degree of accuracy.
# NOTE(review): `df1[:-1]` slices rows (it drops the last observation), so
# `y` is still part of the matrix.  If the aim was to leave out the response
# column, a column selection would be needed -- confirm intent.
corr = df1[:-1].corr()
corr

from statsmodels.graphics.correlation import plot_corr
fig=plot_corr(corr,xnames=corr.columns)