def plotCorrMatrix(data, target, data_columns):
    """Plot the correlation matrix of ``target`` together with ``data``.

    ``target`` and the transpose of ``data`` are handed to ``np.corrcoef``,
    the resulting matrix is drawn with statsmodels' ``plot_corr`` and the
    figure is shown with ``plt.show()``.

    Args:
        data: Object exposing ``.T``; presumably observations in rows and
            features in columns -- TODO confirm against callers.
        target: Values correlated alongside the transposed ``data``.
        data_columns: Labels passed to ``plot_corr`` as ``xnames``.
    """
    # NOTE(review): the matrix also contains ``target`` as its first
    # variable, so ``data_columns`` presumably needs a label for it too --
    # confirm against callers.
    transposed = data.T
    correlations = np.corrcoef(target, transposed)

    # Render through statsmodels.graphics.api and display the figure.
    smg.plot_corr(correlations, xnames=data_columns)
    plt.show()
Exemple #2
0
def plot_correlations():
    """
    Plot one correlation matrix each for the red and the white wine data.

    Reads the module-level ``red_wines``, ``white_wines`` and ``variables``;
    both matrices are computed first, then drawn, then shown together.
    """
    print("plotting correlation matrix")

    # rowvar=False: every column (not row) is treated as one variable.
    red_corr = np.corrcoef(red_wines, rowvar=False)
    white_corr = np.corrcoef(white_wines, rowvar=False)

    # Same plot settings for both data sets; only the colour map differs.
    for matrix, colour_map in ((red_corr, "Reds"), (white_corr, "YlGn")):
        smg.plot_corr(matrix,
                      xnames=variables,
                      ynames=variables,
                      cmap=colour_map,
                      normcolor=True)

    plt.show()
Exemple #3
0
def _drop_highly_correlated(frame, threshold=0.9):
    """Return ``frame`` without columns that correlate too strongly with another.

    For every pair of columns whose absolute correlation exceeds
    ``threshold``, the column that comes first in ``frame`` is dropped.
    """
    abs_corr = frame.corr().abs()
    # Hide the diagonal and the upper triangle so each pair is looked at once.
    hidden = np.triu(np.ones_like(abs_corr, dtype=bool))
    lower = abs_corr.mask(hidden)
    to_drop = [col for col in lower.columns if (lower[col] > threshold).any()]
    return frame.drop(to_drop, axis=1)


def main():
    """Prune correlated features, fit a logistic regression and print a report.

    Steps: load features and target from ``Feature_Selection.Feature_select()``,
    back-fill missing values, plot the correlation matrix and scatter matrix,
    drop one feature of every pair correlated above 0.9, train a
    ``LogisticRegression`` on an 80/20 split and print evaluation metrics and
    the fitted model parameters.
    """
    data_fs, target = Feature_Selection.Feature_select()
    # DataFrame.bfill() is the non-deprecated spelling of fillna(method='bfill').
    data_fs = data_fs.bfill()

    # Correlation analysis
    smg.plot_corr(data_fs.corr(), xnames=list(data_fs.columns))
    plt.show()
    # 'o' (lower case) is the circle marker; 'O' is not a valid marker style.
    pd.plotting.scatter_matrix(data_fs, marker='o')
    plt.show()

    # Drop one of the 2 features whose correlation is above 0.9
    data_fs = _drop_highly_correlated(data_fs, threshold=0.9)

    X = data_fs
    y = target

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=1)
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    # Column 1 is the probability of lr.classes_[1] for every test sample
    # (the previous ``[:1]`` sliced out the first *row* instead).
    # TODO: feed y_prob to roc_curve / roc_auc_score once ROC reporting is enabled.
    y_prob = lr.predict_proba(X_test)[:, 1]

    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(f"\n train accuracy: {lr.score(X_train, y_train)}\n")
    print(f"test accuracy: {lr.score(X_test, y_test)}\n")
    print(f"Intercept per class: {lr.intercept_}\n")
    print(f"Coeficients per class: {lr.coef_}\n")
    print(f"Available classes: {lr.classes_}\n")
    print(f"Named Coeficients for class 1: {pd.DataFrame(lr.coef_[0], data_fs.columns)}\n")
    print(f"Number of iterations generating model: {lr.n_iter_}")
def corr_matrix(data: pd.DataFrame):
    """Plot the correlation matrix of the numeric columns of ``data``.

    The plot is styled with the module-level colour ``cNoFocus`` and written
    to ``figs/corr_matrix.png`` (the ``figs`` directory must already exist).

    Args:
        data: Frame whose numeric columns are correlated with each other.

    Returns:
        The matplotlib figure created by ``smg.plot_corr``.
    """
    numeric = data.select_dtypes(np.number)
    # np.corrcoef treats rows as variables, hence the transpose.
    correlations = np.corrcoef(numeric.T)

    # plot_corr opens its own figure when no ``ax`` is passed, so that figure
    # is the one to size and save. Creating a separate figure beforehand (as
    # the previous version did) only produced -- and saved -- an empty plot.
    fig = smg.plot_corr(correlations, xnames=list(numeric.columns))
    fig.set_size_inches(8, 8)
    fig.axes[0].set_title('Correlation Matrix', color=cNoFocus)

    for axis in fig.axes:
        axis.tick_params(colors=cNoFocus)
        for side in ('bottom', 'top', 'left', 'right'):
            axis.spines[side].set_color(cNoFocus)
        axis.set_xticklabels(axis.get_xticklabels(), size='small')
        axis.set_yticklabels(axis.get_yticklabels(), size='small')

    fig.savefig("figs/corr_matrix.png", bbox_inches='tight', transparent=True, dpi=200)
    return fig
Exemple #5
0
# -*- coding: utf-8 -*-
"""
Create a plot of correlation among many variables in a grid

"""

import matplotlib.pyplot as plt
import numpy as np

import statsmodels.api as sm
import statsmodels.graphics.api as smg

# Load the `randhie` dataset that ships with statsmodels as pandas objects.
hie_data = sm.datasets.randhie.load_pandas()
# np.corrcoef treats each row as a variable, so transpose to correlate the columns.
corr_matrix = np.corrcoef(hie_data.data.T)
# Draw the correlation grid, labelling it with the dataset's variable names.
smg.plot_corr(corr_matrix, xnames=hie_data.names)
plt.show()
Exemple #6
0
# EDA: let's look at our data
# (What are the variables?)
print(faa_df)

# Future calculations won't work on strings, so let's convert "airbus" and "boeing" to 0 and 1, respectively
faa_df = faa_df.replace(to_replace="airbus", value=0)
faa_df = faa_df.replace(to_replace="boeing", value=1)

# Check results
print(faa_df)

# Create a correlation matrix
# (Note the results for speed_air)
# np.corrcoef treats each row as a variable, so we use .T to transpose our data frame
corr_matrix = np.corrcoef(faa_df.T)
# list(faa_df) yields the column names, used here as axis labels
smg.plot_corr(corr_matrix, xnames=list(faa_df))
plt.show()

# Remove rows containing NaN values and run the correlation matrix again
faa_df_no_na = faa_df.dropna()

# Check results
# (How much data is left?)
print(faa_df_no_na)

# Create our correlation matrix again, this time on the NaN-free data
corr_matrix = np.corrcoef(faa_df_no_na.T)
smg.plot_corr(corr_matrix, xnames=list(faa_df_no_na))
plt.show()

# Lets remove "speed_air" from our original data set
Exemple #7
0
 def correlationMatrix(self, corr):
     """Show ``corr`` as a correlation plot labelled "Open Price" / "Close Price"."""
     price_labels = ["Open Price", "Close Price"]
     smg.plot_corr(corr, xnames=price_labels, normcolor=True)
     plt.show()
Exemple #8
0
'''
analyze baskets using statsmodels

will be put somewhere else later. a file should not depend on the package but rather the analysis being made.
'''

import statsmodels.graphics.api as smg
import statsmodels.graphics.tsaplots as tsa

# NOTE(review): `sm`, `np`, `plt`, `retmat1`, `r1` and `ret_vcc_mat` are not
# defined or imported in the visible part of this script -- confirm they are
# provided elsewhere (e.g. by an interactive session).
hie_data = sm.datasets.randhie.load_pandas()
# Bare expression: has no effect outside an interactive session.
retmat1
# np.corrcoef treats rows as variables, so transpose to correlate the columns.
corr_matrix = np.corrcoef(retmat1.T)
# Bare expression: has no effect outside an interactive session.
corr_matrix.shape
smg.plot_corr(corr_matrix, cmap='viridis')

# Signature of tsa.plot_acf, kept for reference:
#tsa.plot_acf(x, ax=None, lags=None, alpha=0.05, use_vlines=True, unbiased=False, fft=False, title='Autocorrelation', zero=True, vlines_kwargs=None, **kwargs)

# Autocorrelation plot of r1 for lags up to 30, saved to disk
tsa.plot_acf(r1, lags=30, alpha=0.05, use_vlines=False)
plt.title('Autocorrelation ' + r1.name)
# Axis labels shared by both autocorrelation plots
ylabel1 = 'corr of day t with day t minus lag'
xlabel1 = 'Lag'
plt.xlabel(xlabel1)
plt.ylabel(ylabel1)
plt.savefig('output/bsk/ret/ACF_bsk1.png')

# Autocorrelation plot of the BTC column for lags up to 30 (not saved here)
tsa.plot_acf(ret_vcc_mat.BTC, use_vlines=False, lags=30, alpha=0.05)
plt.title('Autocorrelation BTC')
plt.xlabel(xlabel1)
plt.ylabel(ylabel1)
Exemple #9
0
# Future calculations won't work on strings, so let's convert "airbus" and "boeing" to 0 and 1, respectively
faa_df = faa_df.replace(to_replace="airbus", value=0)
faa_df = faa_df.replace(to_replace="boeing", value=1)

# Let's remove "speed_air" from our original data set
del faa_df["speed_air"]

# Create a new data frame for boeing data (aircraft == 1 after the replacement above)
boeing_df = faa_df[faa_df.aircraft == 1]

# Examine data
print(boeing_df)

# Create our correlation matrix
# (.T because np.corrcoef correlates rows; list(boeing_df) gives the column names)
corr_matrix = np.corrcoef(boeing_df.T)
smg.plot_corr(corr_matrix, xnames=list(boeing_df))
plt.show()

# Pairwise plot using seaborn
sns.pairplot(boeing_df)
plt.show()

# Build model with distance as dependent variable and all other variables as independent variables
model = smf.ols(
    formula="distance ~ duration + no_pasg + speed_ground + height + pitch",
    data=boeing_df)
results = model.fit()
print(results.summary())

# Build revised model with distance as dependant variable and significant variables as independent variables
revised_model = smf.ols(formula="distance ~ speed_ground + height",
Exemple #10
0
# Future calculations won't work on strings, so let's convert "airbus" and "boeing" to 0 and 1, respectively
faa_df = faa_df.replace(to_replace="airbus", value=0)
faa_df = faa_df.replace(to_replace="boeing", value=1)

# Let's remove "speed_air" from our original data set
del faa_df["speed_air"]

# Create a new data frame for airbus data (aircraft == 0 after the replacement above)
airbus_df = faa_df[faa_df.aircraft == 0]

# Examine data
print(airbus_df)

# Create our correlation matrix
# (.T because np.corrcoef correlates rows; list(airbus_df) gives the column names)
corr_matrix = np.corrcoef(airbus_df.T)
smg.plot_corr(corr_matrix, xnames=list(airbus_df))
plt.show()

# Pairwise plot using seaborn
sns.pairplot(airbus_df)
plt.show()

# Build model with distance as dependent variable and all other variables as independent variables
model = smf.ols(
    formula="distance ~ duration + no_pasg + speed_ground + height + pitch",
    data=airbus_df)
results = model.fit()
print(results.summary())

# Build revised model with distance as dependant variable and significant variables as independent variables
revised_model = smf.ols(formula="distance ~ speed_ground + height + pitch",
Exemple #11
0
#                       (bc_data['Single Epithelial Cell Size'] > stats_desc.loc['Single Epithelial Cell Size'].rwhisker) |
#                       (bc_data['Bare Nuclei'] > stats_desc.loc['Bare Nuclei'].rwhisker) |
#                       (bc_data['Normal Nucleoli'] > stats_desc.loc['Normal Nucleoli'].rwhisker) |
#                       (bc_data['Bland Chromatin'] > stats_desc.loc['Bland Chromatin'].rwhisker) |
#                       (bc_data['Mitoses'] > stats_desc.loc['Mitoses'].rwhisker)].index)
# Print a summary of the remaining data
bc_data_new.info()

# In[332]:

# Pairwise correlation of the columns
corr = bc_data_new.corr()
# Bare expression: only displays the matrix when run as a notebook cell
corr

# In[333]:

# Plot the correlation matrix, labelled with the column names
import statsmodels.graphics.api as smg
smg.plot_corr(corr, xnames=list(corr.columns))
plt.show()

# In[334]:

# Variance per column (displayed only when run as a notebook cell)
bc_data_new.var()

# In[335]:

# Drop Mitoses as it has zero variance
bc_data_new.drop('Mitoses', inplace=True, axis=1)

# In[336]:

# Drop Cell Shape as Cell Size and Cell Shape have a very strong correlation
bc_data_new.drop('Cell Shape', inplace=True, axis=1)