Example #1
0
def plot_correlation_map(train, sensor_columns):
    """Show a heatmap of the pairwise correlations between sensor columns.

    ``train`` is indexed with ``sensor_columns`` and ``.corr()`` is called on
    the result; the same names label both axes of the heatmap. Nothing is
    returned -- the figure is displayed through ``plt.show()``.
    """
    sensor_frame = train[sensor_columns]
    axis_labels = {'xnames': sensor_columns, 'ynames': sensor_columns}
    plot_corr(sensor_frame.corr(), title='Sensor Correlations', **axis_labels)
    plt.show()
Example #2
0
def test_plot_corr():
    """Smoke-test ``plot_corr`` with several label and colour option sets.

    Each figure is closed straight after it is created so the test does not
    accumulate open figures.
    """
    hie_data = randhie.load_pandas()
    corr_matrix = np.corrcoef(hie_data.data.values.T)

    option_sets = (
        # x labels only
        {'xnames': hie_data.names},
        # empty x labels, explicit y labels
        {'xnames': [], 'ynames': hie_data.names},
        # normalised colours, empty title, non-default colormap
        {'normcolor': True, 'title': '', 'cmap': 'jet'},
    )
    for options in option_sets:
        fig = plot_corr(corr_matrix, **options)
        plt.close(fig)
Example #3
0
def test_plot_corr():
    """Smoke-test ``plot_corr`` with several label and colour option sets.

    Each figure is closed straight after it is created so the test does not
    accumulate open figures.
    """
    hie_data = randhie.load_pandas()
    corr_matrix = np.corrcoef(hie_data.data.values.T)

    option_sets = (
        # x labels only
        {'xnames': hie_data.names},
        # empty x labels, explicit y labels
        {'xnames': [], 'ynames': hie_data.names},
        # normalised colours, empty title, non-default colormap
        {'normcolor': True, 'title': '', 'cmap': 'jet'},
    )
    for options in option_sets:
        fig = plot_corr(corr_matrix, **options)
        plt.close(fig)
def azureml_main(BikeShare):
    """Draw exploratory plots of the bike-share data and return the frame.

    Plots produced, in order: a correlation heatmap of every column except
    ``dteday``; line plots of ``cnt`` against ``dayCount`` for selected
    hours; box plots of ``cnt`` grouped by several factors; scatter plots of
    ``cnt`` against several features with a lowess trend line; and box plots
    of ``cnt`` by ``isWorking`` for two specific hours.

    Parameters
    ----------
    BikeShare : pandas.DataFrame
        Must contain the columns referenced in the body. Columns that are
        only read after the ``set_day`` call may be added by that helper
        (defined elsewhere) -- verify against its implementation.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``set_day(BikeShare)``.
    """
    from sklearn import preprocessing
    import numpy as np
    import matplotlib.pyplot as plt
    import statsmodels.graphics.correlation as pltcor
    import statsmodels.nonparametric.smoothers_lowess as lw

    ## Set the font size for the plots
    import matplotlib
    matplotlib.rcParams.update({'font.size': 20})

    ## When True the correlation heatmap is also written to 'cor1.png'.
    Azure = False

    ## Correlate every column except the date. The labels are taken from the
    ## same frame as the values so they cannot get out of step when 'dteday'
    ## is not the first column.
    features = BikeShare.drop('dteday', axis=1)
    ## .values replaces DataFrame.as_matrix(), which pandas 1.0 removed.
    ## NOTE(review): axis=1 standardises each row rather than each column --
    ## confirm that this is intended.
    arry = preprocessing.scale(features.values, axis=1)
    corrs = np.corrcoef(arry, rowvar=False)
    ## Blank out the trivial self-correlations on the diagonal.
    np.fill_diagonal(corrs, 0)

    col_nms = list(features)
    fig = plt.figure(figsize=(9, 9))
    ax = fig.gca()
    pltcor.plot_corr(corrs, xnames=col_nms, ax=ax)
    plt.show()
    if Azure:
        fig.savefig('cor1.png')

    ## Make time series plots of bike demand by times of the day.
    times = [7, 9, 12, 15, 18, 20, 22]
    for tm in times:
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare[BikeShare.hr == tm].plot(kind='line',
                                           x='dayCount',
                                           y='cnt',
                                           ax=ax)
        plt.xlabel("Days from start of plot")
        plt.ylabel("Count of bikes rented")
        plt.title("Bikes rented by days for hour = " + str(tm))
        plt.show()

    ## Box plots of the predictor values vs the demand for bikes.
    ## set_day is defined elsewhere in the project.
    BikeShare = set_day(BikeShare)
    ## NOTE(review): these labels are paired with the columns below but are
    ## not applied to the plots; left as-is to keep the figures unchanged.
    labels = [
        "Box plots of hourly bike demand", "Box plots of monthly bike demand",
        "Box plots of bike demand by weather factor",
        "Box plots of bike demand by workday vs. holiday",
        "Box plots of bike demand by day of the week"
    ]
    xAxes = ["hr", "mnth", "weathersit", "isWorking", "dayWeek"]
    for lab, xaxs in zip(labels, xAxes):
        fig = plt.figure(figsize=(10, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare.boxplot(column=['cnt'], by=[xaxs], ax=ax)
        plt.xlabel('')
        plt.ylabel('Number of bikes')
        plt.show()

    ## Make scatter plots of bike demand vs. various features.
    ## NOTE(review): as above, the labels are not applied to the plots.
    labels = [
        "Bike demand vs temperature", "Bike demand vs humidity",
        "Bike demand vs windspeed", "Bike demand vs hr",
        "Bike demand vs xformHr", "Bike demand vs xformWorkHr"
    ]
    xAxes = ["temp", "hum", "windspeed", "hr", "xformHr", "xformWorkHr"]
    for lab, xaxs in zip(labels, xAxes):
        ## First compute a lowess fit to the data.
        los = lw.lowess(BikeShare['cnt'], BikeShare[xaxs], frac=0.2)

        ## Now make the plots; the trend line is drawn on the same axes as
        ## the scatter plot.
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare.plot(kind='scatter', x=xaxs, y='cnt', ax=ax, alpha=0.05)
        ax.plot(los[:, 0], los[:, 1], color='red')
        plt.show()

    ## Explore bike demand for certain times on working and nonworking days
    labels = [
        "Boxplots of bike demand at 0900 \n\n",
        "Boxplots of bike demand at 1800 \n\n"
    ]
    times = [8, 17]
    for lab, tms in zip(labels, times):
        temp = BikeShare[BikeShare.hr == tms]
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        temp.boxplot(column=['cnt'], by=['isWorking'], ax=ax)
        plt.xlabel('')
        plt.ylabel('Number of bikes')
        plt.title(lab)
        plt.show()

    return BikeShare
Example #5
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.correlation import plot_corr

# Load the personal-loans data set from the course repository.
df = pd.read_csv('https://raw.githubusercontent.com/Explore-AI/Public-Data/master/bootcamps/Personal_Loans.csv')
df.head()

# Insert underscores on column names
df.columns = [col.replace(" ", "_") for col in df.columns]

# Look at customers who've already taken out a loan
df = df[df['Personal_Loan'] == 1]  # keep rows where a personal loan has been taken out
y = df['Personal_Loan']
df = df.drop(["Personal_Loan"], axis=1)

# get dummies
df_dummies = pd.get_dummies(df)

# Move response to end of df
col_titles = [col for col in df_dummies.columns if col != 'Loan_Size'] + ["Loan_Size"]
df_dummies = df_dummies.reindex(columns=col_titles)

# Plot the correlation heatmap using statsmodels. The correlation matrix is
# computed once and reused for both the values and the axis labels.
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111)
corr_matrix = df_dummies.corr()
plot_corr(corr_matrix, xnames=corr_matrix.columns, ax=ax)
Example #6
0
import statsmodels.sandbox.tools as sbtools

from statsmodels.graphics.correlation import plot_corr, plot_corr_grid

# Load the pickled returns frame. The file is opened with a context manager
# so the handle is closed even when unpickling fails.
# NOTE: pickle must only ever be used on trusted files.
try:
    with open('dj30rr', 'rb') as pickle_file:
        rrdm = pickle.load(pickle_file)
except Exception:  # blanket for any unpickling error
    print("Error with unpickling, a new pickle file can be created with findow_1")
    raise

ticksym = rrdm.columns.tolist()
rr = rrdm.values[1:400]

# Correlation between columns (rowvar=0: each column is a variable).
rrcorr = np.corrcoef(rr, rowvar=0)

plot_corr(rrcorr, xnames=ticksym)
nvars = rrcorr.shape[0]
plt.figure()
# Histogram of the strictly-upper-triangular entries only, so each pair of
# variables is counted once and the diagonal is excluded.
plt.hist(rrcorr[np.triu_indices(nvars, 1)])
plt.title('Correlation Coefficients')

xreda, facta, evaa, evea = sbtools.pcasvd(rr)
# Cumulative sum of the third pcasvd result, normalised by its last entry.
evallcs = (evaa).cumsum()
print(evallcs / evallcs[-1])
xred, fact, eva, eve = sbtools.pcasvd(rr, keepdim=4)
pcacorr = np.corrcoef(xred, rowvar=0)

plot_corr(pcacorr, xnames=ticksym, title='Correlation PCA')

# Correlation of what is left after subtracting the keepdim=4 reduction.
resid = rr - xred
residcorr = np.corrcoef(resid, rowvar=0)
Example #7
0
# ### Correlation

# Investigate the correlation between the variables of interest

# ##### Pearson Correlation Coefficient

# In[13]:

from statsmodels.graphics.correlation import plot_corr

# Columns of `df` whose pairwise correlations are examined in the cells below.
columns = [
    'k_avg', 'street_length_avg', 'ratio_nodi_interesse', 'ratio_edges',
    'avg_betweenness', 'avg_closeness', 'avg_clustering', 'prezzo_medio'
]

# Heatmap of the correlation matrix; the same names label the axes.
fig = plot_corr(df[columns].corr(), xnames=columns)
fig.set_size_inches(12, 9)

# In[16]:

# Bare expression: displays the correlation table when run as a notebook cell.
df[columns].corr()

# ##### Kendall Rank Correlation Coefficient

# In[14]:

from scipy.stats import kendalltau

# Starts out as the same correlation table as above; presumably the loop over
# column pairs that follows overwrites it with Kendall's tau -- the loop body
# is not visible here, so confirm.
kendalltau_matrix = df[columns].corr()
for col1 in columns:
    for col2 in columns:
Example #8
0
# Model the data: regress the home score on the squared home passer rating.

# Build single-column predictor and response frames from df.
x_set = pd.DataFrame(df, columns=['SQ_Home_Passer_Rating'])
y_set = pd.DataFrame(df, columns=['Home_Score'])

# Split into train and test partitions with a fixed seed.
splits = train_test_split(x_set, y_set, random_state=1)
X_train, X_test, y_train, y_test = splits

# Create the linear regression model and fit it on the training partition.
lin_reg_mod = LinearRegression().fit(X_train, y_train)
# Predict on the held-out test partition.
pred = lin_reg_mod.predict(X_test)

# Report the fitted intercept, then the coefficient(s).
for fitted_value in (lin_reg_mod.intercept_, lin_reg_mod.coef_):
    print(fitted_value)

# R^2 (coefficient of determination) between actual and predicted values;
# the closer to 1, the better the fit.
test_set_r2 = r2_score(y_test, pred)
print(test_set_r2)

# Correlation heatmap of the score and the two passer-rating columns.
heatmap_columns = ['Home_Score', 'Home_Passer_Rating', 'SQ_Home_Passer_Rating']
df2 = df[heatmap_columns]

corr2 = df2.corr()
fig = plot_corr(corr2, xnames=corr2.columns)

plt.show()
],
                                  axis=1)
# Name the six summary columns of the comparison table, then write it to disk.
metabolite_difference.columns = [
    'Missing Case Mean', 'Complete Case Mean', 'Difference in Mean',
    'Missing Case Variance', 'Complete Case Variance', 'Difference in Variance'
]

metabolite_difference.to_csv('metabolite_difference_table.csv')

# https://stanford.edu/~mwaskom/software/seaborn/examples/many_pairwise_correlations.html
# Correlation between variables, plot heatmap
corr = ld.data_df.corr()

# Figure size passed to plt.subplots (figsize=(N, M)).
N, M = 12, 12
fig, ax = plt.subplots(figsize=(N, M))
stat.plot_corr(corr, ax=ax)

# CAD/noCAD split
# NOTE(review): column 225 is presumably a 0/1 CAD indicator, since its sum is
# used as the CAD count and its length as the total -- confirm against the
# data set.
# NOTE(review): `.ix` was removed in pandas 1.0. Whether `.ix[:, 225]` means
# label 225 or position 225 depends on the column index type, so check that
# before porting to `.loc` / `.iloc`.
# NOTE(review): if this runs under Python 2 and both operands are ints, the
# `/` in the proportions below truncates -- confirm the interpreter version.

CAD_missing_case = sum(ld.data_withmissing_df.ix[:, 225])
total_missing_case = len(ld.data_withmissing_df.ix[:, 225])
noCAD_missing_case = total_missing_case - CAD_missing_case
propCAD_missing_case = CAD_missing_case / total_missing_case

CAD_complete_case = sum(ld.data_withoutmissing_df.ix[:, 225])
total_complete_case = len(ld.data_withoutmissing_df.ix[:, 225])
noCAD_complete_case = total_complete_case - CAD_complete_case
propCAD_complete_case = CAD_complete_case / total_complete_case

# CAD/noCAD split in training and test sets of missing data case
CAD_missing_train = sum(ldmi.train1.ix[:, 225])
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import statsmodels.graphics.correlation as sm

# Load the modified housing data set.
df = pd.read_csv('../Data/data_3/Housing_Modified.csv')

# Binarize the two-valued fields in place, reusing one LabelBinarizer that is
# refitted for each column.
lb = preprocessing.LabelBinarizer()

for binary_field in ('driveway', 'recroom', 'fullbase', 'gashw', 'airco',
                     'prefarea'):
    df[binary_field] = lb.fit_transform(df[binary_field])

# Replace 'stories' with dummy columns: build them (dropping the first level),
# append them to the frame, then remove the original column.
df_stories = pd.get_dummies(df['stories'], prefix='stories', drop_first=True)
df = pd.concat([df, df_stories], axis=1)
del df['stories']

# Plot the correlation matrix with the statsmodels graphics helper plot_corr,
# labelling the axes with the matrix's own column names.
corr = df.corr()
sm.plot_corr(corr, xnames=list(corr))
plt.show()
def azureml_main(BikeShare):
    """De-trend bike demand, draw exploratory plots and return the frame.

    The frame passed in is sorted by ``dayCount`` in place and its ``cnt``
    column is replaced by the residual of a linear fit of ``cnt`` on
    ``dayCount``, so the caller's object is modified. Plots produced, in
    order: a correlation heatmap of every column except ``dteday``; a
    heatmap for a smaller column subset; line plots of ``cnt`` against
    ``dayCount`` for selected hours; box plots of ``cnt`` grouped by several
    factors; scatter plots with a lowess trend line; and box plots of
    ``cnt`` by ``isWorking`` for two specific hours.

    Parameters
    ----------
    BikeShare : pandas.DataFrame
        Must contain the columns referenced in the body. Columns that are
        only read after the ``set_day`` call may be added by that helper
        (defined elsewhere) -- verify against its implementation.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``set_day(BikeShare)``.
    """
    import matplotlib
    matplotlib.use('agg')  # Set backend
    matplotlib.rcParams.update({'font.size': 20})

    from sklearn import preprocessing
    from sklearn import linear_model
    import numpy as np
    import matplotlib.pyplot as plt
    import statsmodels.graphics.correlation as pltcor
    import statsmodels.nonparametric.smoothers_lowess as lw

    ## When True every figure is also written to a PNG file.
    Azure = False

    ## Sort the data frame based on the dayCount.
    ## sort_values replaces DataFrame.sort, which pandas 0.20 removed.
    BikeShare.sort_values('dayCount', axis=0, inplace=True)

    ## De-trend the bike demand with time.
    ## .values replaces as_matrix(), which pandas 1.0 removed.
    nrow = BikeShare.shape[0]
    X = BikeShare.dayCount.values.reshape((nrow, 1))
    Y = BikeShare.cnt.values
    ## Compute the linear model.
    clf = linear_model.LinearRegression()
    bike_lm = clf.fit(X, Y)
    ## Remove the trend
    BikeShare.cnt = BikeShare.cnt - bike_lm.predict(X)

    ## Compute the correlation matrix and set the diagonal elements to 0.
    ## The labels are taken from the same frame as the values so they cannot
    ## get out of step when 'dteday' is not the first column.
    features = BikeShare.drop('dteday', axis=1)
    ## NOTE(review): axis=1 standardises each row rather than each column --
    ## confirm that this is intended.
    arry = preprocessing.scale(features.values, axis=1)
    corrs = np.corrcoef(arry, rowvar=False)
    np.fill_diagonal(corrs, 0)

    col_nms = list(features)
    fig = plt.figure(figsize=(9, 9))
    ax = fig.gca()
    pltcor.plot_corr(corrs, xnames=col_nms, ax=ax)
    plt.show()
    if Azure:
        fig.savefig('cor1.png')

    ## Compute and plot the correlation matrix with
    ## a smaller subset of columns.
    cols = [
        'yr', 'mnth', 'isWorking', 'xformWorkHr', 'dayCount', 'temp', 'hum',
        'windspeed', 'cnt'
    ]
    arry = BikeShare[cols].values
    arry = preprocessing.scale(arry, axis=1)
    corrs = np.corrcoef(arry, rowvar=False)
    np.fill_diagonal(corrs, 0)

    fig = plt.figure(figsize=(9, 9))
    ax = fig.gca()
    pltcor.plot_corr(corrs, xnames=cols, ax=ax)
    plt.show()
    if Azure:
        fig.savefig('cor2.png')

    ## Make time series plots of bike demand by times of the day.
    times = [7, 9, 12, 15, 18, 20, 22]
    for tm in times:
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare[BikeShare.hr == tm].plot(kind='line',
                                           x='dayCount',
                                           y='cnt',
                                           ax=ax)
        plt.xlabel("Days from start of plot")
        plt.ylabel("Count of bikes rented")
        plt.title("Bikes rented by days for hour = " + str(tm))
        plt.show()
        if Azure:
            fig.savefig('tsplot' + str(tm) + '.png')

    ## Box plots of the predictor values vs the demand for bikes.
    ## set_day is defined elsewhere in the project.
    BikeShare = set_day(BikeShare)
    ## NOTE(review): these labels are paired with the columns below but are
    ## not applied to the plots; left as-is to keep the figures unchanged.
    labels = [
        "Box plots of hourly bike demand", "Box plots of monthly bike demand",
        "Box plots of bike demand by weather factor",
        "Box plots of bike demand by workday vs. holiday",
        "Box plots of bike demand by day of the week",
        "Box plots by transformed work hour of the day"
    ]
    xAxes = ["hr", "mnth", "weathersit", "isWorking", "dayWeek", "xformWorkHr"]
    for lab, xaxs in zip(labels, xAxes):
        fig = plt.figure(figsize=(10, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare.boxplot(column=['cnt'], by=[xaxs], ax=ax)
        plt.xlabel('')
        plt.ylabel('Number of bikes')
        plt.show()
        if Azure:
            fig.savefig('boxplot' + xaxs + '.png')

    ## Make scatter plots of bike demand vs. various features.
    ## NOTE(review): as above, the labels are not applied to the plots.
    labels = [
        "Bike demand vs temperature", "Bike demand vs humidity",
        "Bike demand vs windspeed", "Bike demand vs hr",
        "Bike demand vs xformHr", "Bike demand vs xformWorkHr"
    ]
    xAxes = ["temp", "hum", "windspeed", "hr", "xformHr", "xformWorkHr"]
    for lab, xaxs in zip(labels, xAxes):
        ## First compute a lowess fit to the data.
        los = lw.lowess(BikeShare['cnt'], BikeShare[xaxs], frac=0.2)

        ## Now make the plots; the trend line is drawn on the same axes as
        ## the scatter plot.
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare.plot(kind='scatter', x=xaxs, y='cnt', ax=ax, alpha=0.05)
        ax.plot(los[:, 0], los[:, 1], color='red')
        plt.show()
        if Azure:
            fig.savefig('scatterplot' + xaxs + '.png')

    ## Explore bike demand for certain times on working and nonworking days
    labels = [
        "Boxplots of bike demand at 0900 \n\n",
        "Boxplots of bike demand at 1800 \n\n"
    ]
    times = [8, 17]
    for lab, tms in zip(labels, times):
        temp = BikeShare[BikeShare.hr == tms]
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        temp.boxplot(column=['cnt'], by=['isWorking'], ax=ax)
        plt.xlabel('')
        plt.ylabel('Number of bikes')
        plt.title(lab)
        plt.show()
        if Azure:
            fig.savefig('timeplot' + str(tms) + '.png')

    return BikeShare
Example #12
0
    means = [lls.mean() for lls in ll_samp_chains]
    for i, lls in enumerate(ll_samp_chains):
        plt.plot(lls[lls != 0], label="chain %d" % i)
    plt.legend()
    plt.show()
    plt.close("all")

    ### compare thetas
    #fig, axarr = plt.subplots(4, 1)
    #for i in range(len(axarr)):
    #    axarr[i].hist(src_samp_chains[i]['theta'], bins=20)
    #plt.show()

    print " comparing autocorrelation"
    from statsmodels.graphics.correlation import plot_corr
    plot_corr(np.corrcoef(samps))

    theta_mat = np.array([schain['theta'] for schain in src_samp_chains])
    plot_acf(theta_mat[1])
    plt.show()

#def plot_samps(src_samps, num_burnin=500):
#    """ plots source samples """
#    samp_dict = load_samples("experiment_cache/samp_cache/gal_samps_stamp_5.0026-0.1581_chain_0.bin")
#    src_samps = samp_dict['srcs'][2500:,0]
#
#    #fig = plt.figure()
#    with sns.axes_style("white"):
#
#        ## plot ANGLE vs RATIO
#        jgrid = sns.jointplot(src_samps['phi'], src_samps['rho'], kind = "kde")
Example #13
0
        plt.plot(lls[lls != 0], label="chain %d"%i)
    plt.legend()
    plt.show()
    plt.close("all")

    ### compare thetas
    #fig, axarr = plt.subplots(4, 1)
    #for i in range(len(axarr)):
    #    axarr[i].hist(src_samp_chains[i]['theta'], bins=20)
    #plt.show()



    print " comparing autocorrelation"
    from statsmodels.graphics.correlation import plot_corr
    plot_corr(np.corrcoef(samps))

    theta_mat = np.array([schain['theta'] for schain in src_samp_chains])
    plot_acf(theta_mat[1])
    plt.show()

#def plot_samps(src_samps, num_burnin=500):
#    """ plots source samples """
#    samp_dict = load_samples("experiment_cache/samp_cache/gal_samps_stamp_5.0026-0.1581_chain_0.bin")
#    src_samps = samp_dict['srcs'][2500:,0]
#
#    #fig = plt.figure()
#    with sns.axes_style("white"):
#
#        ## plot ANGLE vs RATIO
#        jgrid = sns.jointplot(src_samps['phi'], src_samps['rho'], kind = "kde")
def azureml_main(BikeShare):
    """Draw exploratory plots of the bike-share data and return the frame.

    Plots produced, in order: a correlation heatmap of every column except
    ``dteday``; line plots of ``cnt`` against ``dayCount`` for selected
    hours; box plots of ``cnt`` grouped by several factors; scatter plots of
    ``cnt`` against several features with a lowess trend line; and box plots
    of ``cnt`` by ``isWorking`` for two specific hours.

    Parameters
    ----------
    BikeShare : pandas.DataFrame
        Must contain the columns referenced in the body. Columns that are
        only read after the ``set_day`` call may be added by that helper
        (defined elsewhere) -- verify against its implementation.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``set_day(BikeShare)``.
    """
    from sklearn import preprocessing
    import numpy as np
    import matplotlib.pyplot as plt
    import statsmodels.graphics.correlation as pltcor
    import statsmodels.nonparametric.smoothers_lowess as lw

    ## Set the font size for the plots
    import matplotlib
    matplotlib.rcParams.update({'font.size': 20})

    ## When True the correlation heatmap is also written to 'cor1.png'.
    Azure = False

    ## Correlate every column except the date. The labels are taken from the
    ## same frame as the values so they cannot get out of step when 'dteday'
    ## is not the first column.
    features = BikeShare.drop('dteday', axis=1)
    ## .values replaces DataFrame.as_matrix(), which pandas 1.0 removed.
    ## NOTE(review): axis=1 standardises each row rather than each column --
    ## confirm that this is intended.
    arry = preprocessing.scale(features.values, axis=1)
    corrs = np.corrcoef(arry, rowvar=False)
    ## Blank out the trivial self-correlations on the diagonal.
    np.fill_diagonal(corrs, 0)

    col_nms = list(features)
    fig = plt.figure(figsize=(9, 9))
    ax = fig.gca()
    pltcor.plot_corr(corrs, xnames=col_nms, ax=ax)
    plt.show()
    if Azure:
        fig.savefig('cor1.png')

    ## Make time series plots of bike demand by times of the day.
    times = [7, 9, 12, 15, 18, 20, 22]
    for tm in times:
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare[BikeShare.hr == tm].plot(kind='line',
                                           x='dayCount', y='cnt',
                                           ax=ax)
        plt.xlabel("Days from start of plot")
        plt.ylabel("Count of bikes rented")
        plt.title("Bikes rented by days for hour = " + str(tm))
        plt.show()

    ## Box plots of the predictor values vs the demand for bikes.
    ## set_day is defined elsewhere in the project.
    BikeShare = set_day(BikeShare)
    ## NOTE(review): these labels are paired with the columns below but are
    ## not applied to the plots; left as-is to keep the figures unchanged.
    labels = ["Box plots of hourly bike demand",
              "Box plots of monthly bike demand",
              "Box plots of bike demand by weather factor",
              "Box plots of bike demand by workday vs. holiday",
              "Box plots of bike demand by day of the week"]
    xAxes = ["hr", "mnth", "weathersit",
             "isWorking", "dayWeek"]
    for lab, xaxs in zip(labels, xAxes):
        fig = plt.figure(figsize=(10, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare.boxplot(column=['cnt'], by=[xaxs], ax=ax)
        plt.xlabel('')
        plt.ylabel('Number of bikes')
        plt.show()

    ## Make scatter plots of bike demand vs. various features.
    ## NOTE(review): as above, the labels are not applied to the plots.
    labels = ["Bike demand vs temperature",
              "Bike demand vs humidity",
              "Bike demand vs windspeed",
              "Bike demand vs hr",
              "Bike demand vs xformHr",
              "Bike demand vs xformWorkHr"]
    xAxes = ["temp", "hum", "windspeed", "hr",
             "xformHr", "xformWorkHr"]
    for lab, xaxs in zip(labels, xAxes):
        ## First compute a lowess fit to the data.
        los = lw.lowess(BikeShare['cnt'], BikeShare[xaxs], frac=0.2)

        ## Now make the plots; the trend line is drawn on the same axes as
        ## the scatter plot.
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare.plot(kind='scatter', x=xaxs, y='cnt', ax=ax, alpha=0.05)
        ax.plot(los[:, 0], los[:, 1], color='red')
        plt.show()

    ## Explore bike demand for certain times on working and nonworking days
    labels = ["Boxplots of bike demand at 0900 \n\n",
              "Boxplots of bike demand at 1800 \n\n"]
    times = [8, 17]
    for lab, tms in zip(labels, times):
        temp = BikeShare[BikeShare.hr == tms]
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        temp.boxplot(column=['cnt'], by=['isWorking'], ax=ax)
        plt.xlabel('')
        plt.ylabel('Number of bikes')
        plt.title(lab)
        plt.show()

    return BikeShare

      
        
def azureml_main(BikeShare):
    """De-trend bike demand, draw exploratory plots and return the frame.

    The frame passed in is sorted by ``dayCount`` in place and its ``cnt``
    column is replaced by the residual of a linear fit of ``cnt`` on
    ``dayCount``, so the caller's object is modified. Plots produced, in
    order: a correlation heatmap of every column except ``dteday``; a
    heatmap for a smaller column subset; line plots of ``cnt`` against
    ``dayCount`` for selected hours; box plots of ``cnt`` grouped by several
    factors; scatter plots with a lowess trend line; and box plots of
    ``cnt`` by ``isWorking`` for two specific hours.

    Parameters
    ----------
    BikeShare : pandas.DataFrame
        Must contain the columns referenced in the body. Columns that are
        only read after the ``set_day`` call may be added by that helper
        (defined elsewhere) -- verify against its implementation.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``set_day(BikeShare)``.
    """
    import matplotlib

    matplotlib.use("agg")  # Set backend
    matplotlib.rcParams.update({"font.size": 20})

    from sklearn import preprocessing
    from sklearn import linear_model
    import numpy as np
    import matplotlib.pyplot as plt
    import statsmodels.graphics.correlation as pltcor
    import statsmodels.nonparametric.smoothers_lowess as lw

    ## When True every figure is also written to a PNG file.
    Azure = False

    ## Sort the data frame based on the dayCount.
    ## sort_values replaces DataFrame.sort, which pandas 0.20 removed.
    BikeShare.sort_values("dayCount", axis=0, inplace=True)

    ## De-trend the bike demand with time.
    ## .values replaces as_matrix(), which pandas 1.0 removed.
    nrow = BikeShare.shape[0]
    X = BikeShare.dayCount.values.reshape((nrow, 1))
    Y = BikeShare.cnt.values
    ## Compute the linear model.
    clf = linear_model.LinearRegression()
    bike_lm = clf.fit(X, Y)
    ## Remove the trend
    BikeShare.cnt = BikeShare.cnt - bike_lm.predict(X)

    ## Compute the correlation matrix and set the diagonal elements to 0.
    ## The labels are taken from the same frame as the values so they cannot
    ## get out of step when "dteday" is not the first column.
    features = BikeShare.drop("dteday", axis=1)
    ## NOTE(review): axis=1 standardises each row rather than each column --
    ## confirm that this is intended.
    arry = preprocessing.scale(features.values, axis=1)
    corrs = np.corrcoef(arry, rowvar=False)
    np.fill_diagonal(corrs, 0)

    col_nms = list(features)
    fig = plt.figure(figsize=(9, 9))
    ax = fig.gca()
    pltcor.plot_corr(corrs, xnames=col_nms, ax=ax)
    plt.show()
    if Azure:
        fig.savefig("cor1.png")

    ## Compute and plot the correlation matrix with
    ## a smaller subset of columns.
    cols = ["yr", "mnth", "isWorking", "xformWorkHr", "dayCount", "temp", "hum", "windspeed", "cnt"]
    arry = BikeShare[cols].values
    arry = preprocessing.scale(arry, axis=1)
    corrs = np.corrcoef(arry, rowvar=False)
    np.fill_diagonal(corrs, 0)

    fig = plt.figure(figsize=(9, 9))
    ax = fig.gca()
    pltcor.plot_corr(corrs, xnames=cols, ax=ax)
    plt.show()
    if Azure:
        fig.savefig("cor2.png")

    ## Make time series plots of bike demand by times of the day.
    times = [7, 9, 12, 15, 18, 20, 22]
    for tm in times:
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare[BikeShare.hr == tm].plot(kind="line", x="dayCount", y="cnt", ax=ax)
        plt.xlabel("Days from start of plot")
        plt.ylabel("Count of bikes rented")
        plt.title("Bikes rented by days for hour = " + str(tm))
        plt.show()
        if Azure:
            fig.savefig("tsplot" + str(tm) + ".png")

    ## Box plots of the predictor values vs the demand for bikes.
    ## set_day is defined elsewhere in the project.
    BikeShare = set_day(BikeShare)
    ## NOTE(review): these labels are paired with the columns below but are
    ## not applied to the plots; left as-is to keep the figures unchanged.
    labels = [
        "Box plots of hourly bike demand",
        "Box plots of monthly bike demand",
        "Box plots of bike demand by weather factor",
        "Box plots of bike demand by workday vs. holiday",
        "Box plots of bike demand by day of the week",
        "Box plots by transformed work hour of the day",
    ]
    xAxes = ["hr", "mnth", "weathersit", "isWorking", "dayWeek", "xformWorkHr"]
    for lab, xaxs in zip(labels, xAxes):
        fig = plt.figure(figsize=(10, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare.boxplot(column=["cnt"], by=[xaxs], ax=ax)
        plt.xlabel("")
        plt.ylabel("Number of bikes")
        plt.show()
        if Azure:
            fig.savefig("boxplot" + xaxs + ".png")

    ## Make scatter plots of bike demand vs. various features.
    ## NOTE(review): as above, the labels are not applied to the plots.
    labels = [
        "Bike demand vs temperature",
        "Bike demand vs humidity",
        "Bike demand vs windspeed",
        "Bike demand vs hr",
        "Bike demand vs xformHr",
        "Bike demand vs xformWorkHr",
    ]
    xAxes = ["temp", "hum", "windspeed", "hr", "xformHr", "xformWorkHr"]
    for lab, xaxs in zip(labels, xAxes):
        ## First compute a lowess fit to the data.
        los = lw.lowess(BikeShare["cnt"], BikeShare[xaxs], frac=0.2)

        ## Now make the plots; the trend line is drawn on the same axes as
        ## the scatter plot.
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        BikeShare.plot(kind="scatter", x=xaxs, y="cnt", ax=ax, alpha=0.05)
        ax.plot(los[:, 0], los[:, 1], color="red")
        plt.show()
        if Azure:
            fig.savefig("scatterplot" + xaxs + ".png")

    ## Explore bike demand for certain times on working and nonworking days
    labels = ["Boxplots of bike demand at 0900 \n\n", "Boxplots of bike demand at 1800 \n\n"]
    times = [8, 17]
    for lab, tms in zip(labels, times):
        temp = BikeShare[BikeShare.hr == tms]
        fig = plt.figure(figsize=(8, 6))
        fig.clf()
        ax = fig.gca()
        temp.boxplot(column=["cnt"], by=["isWorking"], ax=ax)
        plt.xlabel("")
        plt.ylabel("Number of bikes")
        plt.title(lab)
        plt.show()
        if Azure:
            fig.savefig("timeplot" + str(tms) + ".png")

    return BikeShare
Example #16
0
from statsmodels.graphics.correlation import plot_corr, plot_corr_grid

# Load the pickled returns frame. The file is opened with a context manager
# so the handle is closed even when unpickling fails; `open` also replaces
# the `file` builtin, which does not exist on Python 3.
# NOTE: pickle must only ever be used on trusted files.
try:
    with open('dj30rr', 'rb') as pickle_file:
        rrdm = cPickle.load(pickle_file)
except Exception:  # blanket for any unpickling error
    print("Error with unpickling, a new pickle file can be created with findow_1")
    raise

ticksym = rrdm.columns.tolist()
rr = rrdm.values[1:400]

# Correlation between columns (rowvar=0: each column is a variable).
rrcorr = np.corrcoef(rr, rowvar=0)


plot_corr(rrcorr, xnames=ticksym)
nvars = rrcorr.shape[0]
plt.figure()
# Histogram of the strictly-upper-triangular entries only, so each pair of
# variables is counted once and the diagonal is excluded.
plt.hist(rrcorr[np.triu_indices(nvars, 1)])
plt.title('Correlation Coefficients')

xreda, facta, evaa, evea = sbtools.pcasvd(rr)
# Cumulative sum of the third pcasvd result, normalised by its last entry.
evallcs = (evaa).cumsum()
print(evallcs / evallcs[-1])
xred, fact, eva, eve = sbtools.pcasvd(rr, keepdim=4)
pcacorr = np.corrcoef(xred, rowvar=0)

plot_corr(pcacorr, xnames=ticksym, title='Correlation PCA')

# Correlation of what is left after subtracting the keepdim=4 reduction.
resid = rr - xred
residcorr = np.corrcoef(resid, rowvar=0)
Example #17
0
    plt.figure(figsize=(8,5))
    plt.title("{} vs. \nConcrete Compressive Strength".format(c),fontsize=16)
    plt.scatter(x=df[c],y=df['Concrete compressive strength(MPa, megapascals) '],color='blue',edgecolor='k')
    plt.grid(True)
    plt.xlabel(c,fontsize=14)
    plt.ylabel('Concrete compressive strength\n(MPa, megapascals)',fontsize=14)
    plt.show()


# Create a copy with short column names suitable for processing with
# statsmodels.OLS(): seven components, then 'Age', then the response 'y'.

df1 = df.copy()
df1.columns = ['Component' + str(i) for i in range(1, 8)] + ['Age'] + ['y']
df1.head()


# Pairwise scatter plots
from seaborn import pairplot
pairplot(df1)


# Correlation matrix and heatmap to visually check for multicollinearity.
# In statistics, multicollinearity (also collinearity) is a phenomenon in
# which one predictor variable in a multiple regression model can be linearly
# predicted from the others with a substantial degree of accuracy.
# Only the predictors are correlated here: .iloc[:, :-1] drops the last
# column ('y'). The earlier df1[:-1] sliced off the last *row* instead, so
# the response stayed in the matrix and one observation was discarded.
corr = df1.iloc[:, :-1].corr()
corr


from statsmodels.graphics.correlation import plot_corr
fig = plot_corr(corr, xnames=corr.columns)