Example 1
def test_rank2d(seaborn=False, outpath=None):
    """
    Runs the radviz visualizer on the dataset.

    Parameters
    ----------
    pandas : bool
        Run the pandas version of the function
    outpath : path or None
        Save the figure to disk rather than show (if None)
    """
    data = load_data('occupancy')  # Load the data
    features = ['temp', 'humid', 'light', 'co2', 'hratio']
    classes = ['unoccupied', 'occupied']
    X = data[features].to_numpy()  # as_matrix() was removed in pandas 1.0
    y = data.occupied.to_numpy()

    if seaborn:
        raise NotImplementedError("Not yet!")

    else:
        visualizer = Rank2D(features=features, algorithm='covariance')
        visualizer.fit(X, y)  # Fit the data to the visualizer
        visualizer.transform(X)  # Transform the data
        visualizer.poof(outpath=outpath)  # Draw/show/poof the data
Example 2
    def generate_rank_2d(self, X, algorithm='pearson', **kwargs):
        """
        Given the entire (train+test) input features, returns a plotly
        Heatmap figure showing the feature x feature correlation.
        
        :param X: the input features to the model
        :param algorithm: the ranking algorithm to use
                          (pearson, covariance, spearman, kendalltau)
        """

        visualizer = Rank2D(algorithm=algorithm)
        visualizer.fit_transform(X)

        # correlation values computed by the visualizer
        ranks_ = visualizer.ranks_

        # number of features (ranks_ is a feats x feats matrix)
        feats = ranks_.shape[0]

        # optionally zero out the upper triangle so each pair appears only once
        # iu = np.triu_indices(feats)
        # ranks_[iu] = 0

        fig = go.Figure([
            go.Heatmap(z=ranks_,
                       x=self.feature_names,
                       y=self.feature_names,
                       **kwargs)
        ])

        return fig
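For reference, a self-contained sketch of the same pattern, using a hypothetical rank2d_heatmap helper and toy random data (the method above assumes an enclosing object that carries feature_names):

import numpy as np
import plotly.graph_objects as go
from yellowbrick.features import Rank2D

def rank2d_heatmap(X, feature_names, algorithm='pearson', **kwargs):
    # Yellowbrick computes the pairwise ranks; plotly renders them
    visualizer = Rank2D(algorithm=algorithm)
    visualizer.fit_transform(X)
    return go.Figure([
        go.Heatmap(z=visualizer.ranks_,
                   x=feature_names,
                   y=feature_names,
                   **kwargs)
    ])

# Toy usage: four random features
X = np.random.rand(100, 4)
fig = rank2d_heatmap(X, ['f1', 'f2', 'f3', 'f4'])
fig.show()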
Example 3
def rank_2d(features, algorithm, X, y):
    from yellowbrick.features import Rank2D

    # Instantiate the visualizer with the requested ranking algorithm
    visualizer = Rank2D(features=features, algorithm=algorithm)

    visualizer.fit(X, y)  # Fit the data to the visualizer
    visualizer.transform(X)  # Transform the data
    visualizer.poof()  # Draw/show/poof the data
Example 4
def visualizeFeatureImportance(features, labels):

    # Instantiate the visualizer with the Covariance ranking algorithm
    visualizer = Rank2D(algorithm='covariance')

    visualizer.fit(features.drop(["appid", "name"], axis=1),
                   list(map(convertLabelToNumber,
                            labels)))  # Fit the data to the visualizer
    visualizer.transform(features.drop(["appid", "name"],
                                       axis=1))  # Transform the data
    visualizer.poof()  # Draw/show/poof the data
Example 5
def explore_features(df):
    df_copy = df.copy()

    # for some reason, the visualizer doesn't accept categorical
    # variables; they have to be converted to strings first
    for col in df_copy.columns:  # iteritems() was removed in pandas 2.0
        if df_copy[col].dtype.name == "category":
            df_copy[col] = df_copy[col].astype(str)

    numeric_df = autoclean(df_copy)
    visualizer = Rank2D(algorithm="pearson")
    visualizer.fit_transform(numeric_df)
    visualizer.poof()
Example 6
def testFunc5(savepath='Results/bikeshare_Rank2D.png'):
    '''
    Bike share dataset prediction
    '''
    data = pd.read_csv('fixtures/bikeshare/bikeshare.csv')
    X = data[[
        "season", "month", "hour", "holiday", "weekday", "workingday",
        "weather", "temp", "feelslike", "humidity", "windspeed"
    ]]
    Y = data["riders"]

    visualizer = Rank2D(algorithm="pearson")
    visualizer.fit_transform(X)
    visualizer.poof(outpath=savepath)
Example 7
def Corr_vision(X):
    """ Correlation visualization according to Pearson

    Parameters
    ----------
    X: matrix of features

    Returns
    -------
    A plot of the pairwise Pearson correlations between the features

    """

    fig, ax = plt.subplots(figsize=(20, 20))
    visualizer = Rank2D(algorithm="pearson", ax=ax)
    visualizer.fit_transform(X)
    # visualizer.show(outpath='corr_matrix.png')  # to save the figure as a png
    visualizer.show()
Example 8
def rank2d(ax, algorithm='pearson'):
    from yellowbrick.features import Rank2D

    # Specify the features of interest
    features = [
        'limit',
        'sex',
        'edu',
        'married',
        'age',
        'apr_delay',
        'may_delay',
        'jun_delay',
        'jul_delay',
        'aug_delay',
        'sep_delay',
        'apr_bill',
        'may_bill',
        'jun_bill',
        'jul_bill',
        'aug_bill',
        'sep_bill',
        'apr_pay',
        'may_pay',
        'jun_pay',
        'jul_pay',
        'aug_pay',
        'sep_pay',
    ]

    # Load the data
    X, y = load_data('credit', cols=features, target='default')

    # Instantiate and fit the visualizer
    visualizer = Rank2D(features=features, algorithm=algorithm)
    visualizer.title = "2D Ranking of Pairs of Features by {}".format(
        algorithm.title())
    visualizer.fit(X, y)
    visualizer.transform(X)
    return visualizer
Example 9
def rank2(data,
          name=name,
          location=location,
          dcol=dcol,
          algorithm=algorithm,
          colormap=colormap,
          show=show):
    df_data = data.drop(dcol, axis=1)
    df_data = df_data.astype(float)
    ax = plt.axes()
    rk2d2 = Rank2D(ax=ax,
                   algorithm=algorithm,
                   show_feature_names=show,
                   size=(1080, 720),
                   colormap=colormap)
    ax.set_title(name)
    rk2d2.fit(df_data)
    rk2d2.transform(df_data)
    rk2d2.show(outpath=os.path.join(location,
                                    f"Correlation_{algorithm}_{name}.png"))
    plt.close()
    return name
Example 10
def feature_analysis(fname="feature_analysis.png"):
    """
    Create figures for feature analysis
    """

    # Create side-by-side axes grid
    _, axes = plt.subplots(ncols=2, figsize=(18, 6))

    # Draw RadViz on the left
    data = load_occupancy(split=False)
    oz = RadViz(ax=axes[0], classes=["unoccupied", "occupied"])
    oz.fit(data.X, data.y)
    oz.finalize()

    # Draw Rank2D on the right
    data = load_concrete(split=False)
    oz = Rank2D(ax=axes[1])
    oz.fit_transform(data.X, data.y)
    oz.finalize()

    # Save figure
    path = os.path.join(FIGURES, fname)
    plt.tight_layout()
    plt.savefig(path)
Example 11
# Unpickle model
lasso = joblib.load('../lasso_total.pkl')
"""
Visualizations to create:
1. Rank2d Pearson Ranking of Features
2. Feature Importance
3. Residuals plot
4. Actual vs. Predicted with prediction error
5. Alpha Selection (see the sketch at the end of this example)
"""

# Rank2d (naive, 18 variable case)
fig = plt.figure()
ax = fig.add_subplot()
rank = Rank2D(features=feature_cols, algorithm='pearson', ax=ax)
Xt = Xtrain[feature_cols]
rank.fit(Xt, ytrain)
rank.transform(Xt)
rank.poof(outpath="lasso_rank2d.png")

# Feature Importances (naive, 18 variable case)
fig = plt.figure()
ax = fig.add_subplot()
featimp = FeatureImportances(lasso, ax=ax)
featimp.fit(Xt, ytrain)
featimp.poof(outpath="lasso_featureimportances18.png")

# Residuals Plot
fig = plt.figure()
ax = fig.add_subplot()
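The excerpt cuts off mid-way through the residuals setup, before items 4 and 5 of the list above. A hedged sketch of the alpha selection step with yellowbrick's AlphaSelection, assuming the same Xt/ytrain split (LassoCV is swapped in because AlphaSelection expects a cross-validated regularized regressor):

from sklearn.linear_model import LassoCV
from yellowbrick.regressor import AlphaSelection

# Alpha Selection (naive, 18 variable case)
fig = plt.figure()
ax = fig.add_subplot()
alphas = AlphaSelection(LassoCV(), ax=ax)
alphas.fit(Xt, ytrain)
alphas.poof(outpath="lasso_alphaselection.png")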
Example 12
visualizer.fit(y)
visualizer.poof()

# %%
visualizer = ParallelCoordinates()
visualizer.fit_transform(X, y)
visualizer.poof()

# %%
visualizer = Rank1D()
visualizer.fit(X, y)
visualizer.transform(X)
visualizer.poof()

# %%
visualizer = Rank2D()
visualizer.fit_transform(X)
visualizer.poof()

# %%
visualizer = FeatureCorrelation()
visualizer.fit(X, y)
visualizer.poof()

# %%
visualizer = FeatureCorrelation(method='mutual_info-classification')
visualizer.fit(X, y)
visualizer.poof()

# %%
visualizer = RadViz(classes=class_names)
Example 13
def rank2d():
    X, y = load_credit()
    oz = Rank2D(algorithm="covariance", ax=newfig())
    oz.fit_transform(X, y)
    savefig(oz, "rank2d_covariance")
Example 14
                plt.ylabel('lambda_sigma', fontsize=14)
                plt.xlabel('lambda_weight', fontsize=14)
                locationFileNameJPV = os.path.join(
                    '/home/ak/Documents/Research/Papers/figures',
                    str(symbols[symbolIdx]) + '_idx_' + str(idx) + 'date' +
                    str(dateIdx) + '_label' + str(labelName) +
                    '_jointplotViz.png')
                visualizerJPV.show(outpath=locationFileNameJPV)
                plt.show()

                # # Instantiate the visualizer with the Covariance ranking algorithm

                set_palette('sns_dark')
                plt.figure()
                visualizerR2D = Rank2D(features=features,
                                       algorithm='pearson',
                                       title='  ')

                visualizerR2D.fit(X, y)  # Fit the data to the visualizer
                visualizerR2D.transform(X)  # Transform the data
                plt.xticks(fontsize=12)
                plt.yticks(fontsize=12)
                locationFileNameR2D = os.path.join(
                    '/home/ak/Documents/Research/Papers/figures',
                    str(symbols[symbolIdx]) + '_idx_' + str(idx) + '_label' +
                    str(labelName) + '_date_' + str(dateIdx) +
                    '_pearsonCorrel.png')
                visualizerR2D.show(outpath=locationFileNameR2D)
                plt.show()

                my_title = " "
Example 15
dataset = pd.DataFrame({'Lexical_Diversity': hh[:, 0],
                        'Brunet_Index': hh[:, 1],
                        'Honore_Satistic': hh[:, 2],
                        'Flesch Reading': hh[:, 3],
                        'Flesch-Kincaid': hh[:, 4]})

SZ_Type = ['Incoherence', 'Incoherence', 'Incoherence', 'Incoherence', 'Incoherence',
           'Tangentiality', 'Tangentiality', 'Tangentiality', 'Tangentiality', 'Tangentiality',
           'Tangentiality', 'Tangentiality', 'Tangentiality', 'Tangentiality', 'Tangentiality']
dataset['SZ_Type'] = SZ_Type

import seaborn as sns
sns.set(style="ticks")
sns.pairplot(dataset, hue='SZ_Type', height=1.75)  # `size` was renamed to `height` in seaborn 0.9

from yellowbrick.features import Rank2D
features = ['Lexical_Diversity', 'Brunet_Index', 'Honore_Satistic',
            'Flesch Reading', 'Flesch-Kincaid']

# Instantiate the visualizer with the Pearson ranking algorithm
visualizer = Rank2D(features=features, algorithm='pearson')
X = np.transpose(h)
Y = np.asarray([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

visualizer.fit(X, Y)                # Fit the data to the visualizer
visualizer.transform(X)             # Transform the data
visualizer.poof()



from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
Example 16
frames = [morg_2_train_strs_broken, activ_inact_train]
import pandas as pd
dfrad = pd.concat(frames, axis=1)
dfrad = dfrad.dropna()
#dfrad.iloc[:,[2048]]
#dfrad.iloc[:,:100]

# CLASS BALANCE - not balanced
from yellowbrick.target import ClassBalance
visCB = ClassBalance(labels=[1, 0])
visCB.fit(dfrad['activities'])  #Fit the data to the visualizer
visCB.show()  #Finalize and render the figure

#RANK 2D "Pearson correlation" -No balanced
from yellowbrick.features import Rank2D
visualizer = Rank2D(algorithm='pearson')
visualizer.fit(dfrad.iloc[:, :50],
               dfrad['activities'])  # Fit the data to the visualizer
visualizer.transform(dfrad.iloc[:, :50])  # Transform the data
visualizer.show()  # Finalize and render the figure

#MANIFOLD - No balanced
from yellowbrick.features import Manifold
classes = [1, 0]
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()  # maps word labels to integers
dfrad['activities'] = label_encoder.fit_transform(dfrad['activities'])  # encode labels
dfrad['activities'].unique()
viz = Manifold(manifold="tsne", classes=classes)  # Instantiate the visualizer
Example 17
features = [
    'price', 'rating', 'review_count', 'high_risk_1', 'medium_risk_2',
    'low_risk_2', 'is_pickup', 'is_delivery', 'is_restaurant_reservation',
    'Canvass', 'Complaint', 'reinspection', 'License', 'FoodPoison',
]
X = data[features]
y = data['pass']

visualizer = Rank1D(features=features, algorithm='shapiro')

visualizer.fit(X, y)                # Fit the data to the visualizer
visualizer.transform(X)             # Transform the data
visualizer.poof(outpath="1D_features.png")                   # Draw/show/poof the data

#2D
visualizer = Rank2D(features=features, algorithm='covariance')

visualizer.fit(X, y)                # Fit the data to the visualizer
visualizer.transform(X)             # Transform the data
visualizer.poof(outpath="2D_features.png")                   # Draw/show/poof the data


#1D with other features but including rating
features = ['rating',
        'is_african', 'is_asian_fusion', 'is_bakeries', 'is_bars',
       'is_breakfast_brunch', 'is_buffets', 'is_cafes', 'is_caribbean',
       'is_chinese', 'is_deli', 'is_eastern_european', 'is_european',
       'is_fast_food', 'is_hawaiian', 'is_health_food', 'is_icecream',
    ]
X = data[features]
y = data['pass']
Example 18
def pearson_features(ml_array_):
    feat_visualizer = Rank2D(algorithm="pearson")
    feat_visualizer.fit_transform(ml_array_)
    feat_visualizer.show()
Example 19
# Create an Explainer instance from the training data
explainer = lime.lime_tabular.LimeTabularExplainer(X_train.values, feature_names=X_train.columns.tolist(), class_names=y_train.unique())

# Create a lambda function that uses the model to predict the data
predict_fn = lambda x: model.predict_proba(x).astype(float)

# Use the explainer to explain a prediction
exp = explainer.explain_instance(X_test.values[0], predict_fn, num_features=6)
exp.show_in_notebook(show_all=False)

"""## Yellowbrick"""

# Heatmap for correlation

visualizer = Rank2D(algorithm="pearson", size=(1080, 720))
visualizer.fit_transform(X_train)
visualizer.poof()


# Evaluation Metrics
visualizer = ClassificationReport(model, size=(1080, 720))
visualizer.fit(X_train, y_train)
visualizer.score(X_train, y_train)
visualizer.poof()

"""# Using API in WebApp (Flask)"""

# Commented out IPython magic to ensure Python compatibility.
# %%writefile server.py
# 
Example 20
       'Complaint', 'reinspection', 'License', 'FoodPoison', 'high_risk_1',
       'medium_risk_2', 'low_risk_2', 'grocery', 'Bakery', 'Mobile']

X = data[cols]
y = data['pass']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state = 42)

clf = linear_model.Lasso(alpha=0.5)
clf.fit(X_train, y_train)
clf.predict(X_test)

visualizer = PredictionError(Lasso())
visualizer.fit(X_train, y_train)

oz = Rank2D(features=cols)
oz.fit_transform(X, y)
oz.poof()

oz = Rank2D(features=cols, algorithm='covariance')
oz.fit_transform(X, y)
oz.poof()

g = sns.jointplot(x='review_count', y='rating', kind='hex', data=data)

h = sns.jointplot(x='price', y='rating', kind='hex', data=data)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
oz = RadViz(classes=label_encoder.classes_, features=cols)
oz.fit(X, y)
Example 21
import pandas as pd
from yellowbrick.features import Rank2D

data = pd.read_csv('../CSV/bikeshare.csv')
X = data[[
    "season", "month", "hour", "holiday", "weekday", "workingday", "weather",
    "temp", "feelslike", "humidity", "windspeed"
]]
y = data["riders"]

visualizer = Rank2D(algorithm="pearson")
visualizer.fit_transform(X)
visualizer.poof()
'''
This figure shows the Pearson correlation between pairs of features: each cell in the grid represents
two features, identified in order on the x and y axes, and its color displays the magnitude of the
correlation. A Pearson correlation of 1.0 means there is a strong positive, linear relationship between
the pair of variables, a value of -1.0 indicates a strong negative, linear relationship, and a value of
zero indicates no relationship. We are therefore looking for dark red and dark blue boxes to
investigate further.

In this chart, we see that the features temp and feelslike have a strong correlation, and that the
feature season has a strong correlation with the feature month. This makes sense: the apparent
temperature we feel outside depends on the actual temperature and other air quality factors, and the
season of the year is described by the month!
'''
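As a quick sanity check (a sketch, not part of the original script), the same Pearson matrix can be computed directly from the X frame defined above with pandas:

# pandas computes the same pairwise Pearson matrix that Rank2D draws
corr = X.corr(method='pearson')
print(corr.loc['temp', 'feelslike'])  # expected to be strongly positive
print(corr.loc['season', 'month'])    # also strongly correlated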
Example 22
def visualize_features(classes, problem_type, curdir, default_features,
                       balance_data, test_size):

    # make features into label encoder here
    features, feature_labels, class_labels = get_features(
        classes, problem_type, default_features, balance_data)

    # now preprocess features for all the other plots
    os.chdir(curdir)
    le = preprocessing.LabelEncoder()
    le.fit(class_labels)
    tclass_labels = le.transform(class_labels)

    # process features to help with clustering
    se = preprocessing.StandardScaler()
    t_features = se.fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        tclass_labels,
                                                        test_size=test_size,
                                                        random_state=42)

    # print(len(features))
    # print(len(feature_labels))
    # print(len(class_labels))
    # print(class_labels)

    # GET TRAINING DATA DURING MODELING PROCESS
    ##################################
    # get filename
    # csvfile=''
    # print(classes)
    # for i in range(len(classes)):
    # 	csvfile=csvfile+classes[i]+'_'

    # get training and testing data for later
    # try:
    # print('loading training files...')
    # X_train=pd.read_csv(prev_dir(curdir)+'/models/'+csvfile+'train.csv')
    # y_train=X_train['class_']
    # X_train.drop(['class_'], axis=1)
    # X_test=pd.read_csv(prev_dir(curdir)+'/models/'+csvfile+'test.csv')
    # y_test=X_test['class_']
    # X_test.drop(['class_'], axis=1)
    # y_train=le.inverse_transform(y_train)
    # y_test=le.inverse_transform(y_test)
    # except:
    # print('error loading in training files, making new test data')

    # Visualize each class (quick plot)
    ##################################
    visualization_dir = 'visualization_session'
    try:
        os.mkdir(visualization_dir)
        os.chdir(visualization_dir)
    except FileExistsError:
        # clear out any directory left over from a previous session
        shutil.rmtree(visualization_dir)
        os.mkdir(visualization_dir)
        os.chdir(visualization_dir)

    objects = tuple(set(class_labels))
    y_pos = np.arange(len(objects))
    performance = list()
    for i in range(len(objects)):
        performance.append(class_labels.count(objects[i]))

    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.xticks(rotation=90)
    plt.title('Counts per class')
    plt.ylabel('Count')
    plt.xlabel('Class')
    plt.tight_layout()
    plt.savefig('classes.png')
    plt.close()

    # set current directory
    curdir = os.getcwd()

    # ##################################
    # # CLUSTERING!!!
    # ##################################

    ##################################
    # Manifold type options
    ##################################
    '''
    "lle"
        Locally Linear Embedding (LLE) uses many local linear decompositions to preserve globally non-linear structures.
    "ltsa"
        LTSA LLE: local tangent space alignment is similar to LLE in that it uses locality to preserve neighborhood distances.
    "hessian"
        Hessian LLE is an LLE regularization method that applies a hessian-based quadratic form at each neighborhood.
    "modified"
        Modified LLE applies a regularization parameter to LLE.
    "isomap"
        Isomap seeks a lower dimensional embedding that maintains geometric distances between each instance.
    "mds"
        MDS: multi-dimensional scaling uses similarity to plot points that are near to each other close in the embedding.
    "spectral"
        Spectral Embedding is a discrete approximation of the low dimensional manifold using a graph representation.
    "tsne" (default)
        t-SNE: converts the similarity of points into probabilities, then uses those probabilities to create an embedding.
    '''
    os.mkdir('clustering')
    os.chdir('clustering')

    # tSNE
    plt.figure()
    viz = Manifold(manifold="tsne", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="tsne.png")
    plt.close()
    # os.system('open tsne.png')
    # viz.show()

    # PCA
    plt.figure()
    visualizer = PCADecomposition(scale=True, classes=set(classes))
    visualizer.fit_transform(np.array(features), tclass_labels)
    visualizer.poof(outpath="pca.png")
    plt.close()
    # os.system('open pca.png')

    # spectral embedding
    plt.figure()
    viz = Manifold(manifold="spectral", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="spectral.png")
    plt.close()

    # lle embedding
    plt.figure()
    viz = Manifold(manifold="lle", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="lle.png")
    plt.close()

    # ltsa
    # plt.figure()
    # viz = Manifold(manifold="ltsa", classes=set(classes))
    # viz.fit_transform(np.array(features), tclass_labels)
    # viz.poof(outpath="ltsa.png")
    # plt.close()

    # hessian
    # plt.figure()
    # viz = Manifold(manifold="hessian", method='dense', classes=set(classes))
    # viz.fit_transform(np.array(features), tclass_labels)
    # viz.poof(outpath="hessian.png")
    # plt.close()

    # modified
    plt.figure()
    viz = Manifold(manifold="modified", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="modified.png")
    plt.close()

    # isomap
    plt.figure()
    viz = Manifold(manifold="isomap", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="isomap.png")
    plt.close()

    # mds
    plt.figure()
    viz = Manifold(manifold="mds", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="mds.png")
    plt.close()

    # UMAP embedding
    plt.figure()
    umap = UMAPVisualizer(metric='cosine',
                          classes=set(classes),
                          title="UMAP embedding")
    umap.fit_transform(np.array(features), class_labels)
    umap.poof(outpath="umap.png")
    plt.close()

    # alternative UMAP
    # import umap.plot
    # plt.figure()
    # mapper = umap.UMAP().fit(np.array(features))
    # fig=umap.plot.points(mapper, labels=np.array(tclass_labels))
    # fig = fig.get_figure()
    # fig.tight_layout()
    # fig.savefig('umap2.png')
    # plt.close(fig)

    #################################
    # 	  FEATURE RANKING!!
    #################################
    os.chdir(curdir)
    os.mkdir('feature_ranking')
    os.chdir('feature_ranking')

    # You can get the feature importance of each feature of your dataset
    # by using the feature importance property of the model.
    plt.figure(figsize=(12, 12))
    model = ExtraTreesClassifier()
    model.fit(np.array(features), tclass_labels)
    # print(model.feature_importances_)
    feat_importances = pd.Series(model.feature_importances_,
                                 index=feature_labels[0])
    feat_importances.nlargest(20).plot(kind='barh')
    plt.title('Feature importances (ExtraTrees, %s features)' % str(len(features[0])), size=16)
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()
    # os.system('open feature_importance.png')

    # get selected labels for top 20 features
    selectedlabels = list(dict(feat_importances.nlargest(20)))
    new_features, new_labels = restructure_features(selectedlabels, t_features,
                                                    feature_labels[0])
    new_features_, new_labels_ = restructure_features(selectedlabels, features,
                                                      feature_labels[0])

    # Shapiro rank algorithm (1D)
    plt.figure(figsize=(28, 12))
    visualizer = Rank1D(algorithm='shapiro',
                        classes=set(classes),
                        features=new_labels,
                        title='Shapiro plot (top 20 features)')
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    # plt.tight_layout()
    visualizer.poof(outpath="shapiro.png")
    plt.close()
    # os.system('open shapiro.png')
    # visualizer.show()

    # pearson ranking algorithm (2D)
    plt.figure(figsize=(12, 12))
    visualizer = Rank2D(algorithm='pearson',
                        classes=set(classes),
                        features=new_labels,
                        title='Pearson ranking plot (top 20 features)')
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    plt.tight_layout()
    visualizer.poof(outpath="pearson.png")
    plt.close()
    # os.system('open pearson.png')
    # visualizer.show()

    # feature importances with top 20 features for Lasso
    plt.figure(figsize=(12, 12))
    viz = FeatureImportances(Lasso(), labels=new_labels_)
    viz.fit(np.array(new_features_), tclass_labels)
    plt.tight_layout()
    viz.poof(outpath="lasso.png")
    plt.close()

    # correlation plots with feature removal if corr > 0.90
    # https://towardsdatascience.com/feature-selection-correlation-and-p-value-da8921bfb3cf

    # now remove correlated features
    # --> p values
    # --> https://towardsdatascience.com/the-next-level-of-data-visualization-in-python-dd6e99039d5e / https://github.com/WillKoehrsen/Data-Analysis/blob/master/plotly/Plotly%20Whirlwind%20Introduction.ipynb- plotly for correlation heatmap and scatterplot matrix
    # --> https://seaborn.pydata.org/tutorial/distributions.html
    data = new_features
    corr = data.corr()

    plt.figure(figsize=(12, 12))
    fig = sns.heatmap(corr)
    fig = fig.get_figure()
    plt.title('Heatmap with correlated features (top 20 features)', size=16)
    fig.tight_layout()
    fig.savefig('heatmap.png')
    plt.close(fig)

    columns = np.full((corr.shape[0], ), True, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i + 1, corr.shape[0]):
            if corr.iloc[i, j] >= 0.9:
                if columns[j]:
                    columns[j] = False
    selected_columns = data.columns[columns]
    data = data[selected_columns]
    corr = data.corr()

    plt.figure(figsize=(12, 12))
    fig = sns.heatmap(corr)
    fig = fig.get_figure()
    plt.title('Heatmap without correlated features (top 20 features)', size=16)
    fig.tight_layout()
    fig.savefig('heatmap_clean.png')
    plt.close(fig)

    # radviz
    # Instantiate the visualizer
    plt.figure(figsize=(12, 12))
    visualizer = RadViz(classes=classes, features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    visualizer.poof(outpath="radviz.png")
    visualizer.show()
    plt.close()

    # feature correlation plot
    plt.figure(figsize=(28, 12))
    visualizer = feature_correlation(np.array(new_features),
                                     tclass_labels,
                                     labels=new_labels)
    visualizer.poof(outpath="correlation.png")
    visualizer.show()
    plt.tight_layout()
    plt.close()

    os.mkdir('feature_plots')
    os.chdir('feature_plots')

    newdata = new_features_
    newdata['classes'] = class_labels

    for j in range(len(new_labels_)):
        fig = sns.violinplot(x=newdata['classes'], y=newdata[new_labels_[j]])
        fig = fig.get_figure()
        fig.tight_layout()
        fig.savefig('%s_%s.png' % (str(j), new_labels_[j]))
        plt.close(fig)

    os.mkdir('feature_plots_transformed')
    os.chdir('feature_plots_transformed')

    newdata = new_features
    newdata['classes'] = class_labels

    for j in range(len(new_labels)):
        fig = sns.violinplot(x=newdata['classes'], y=newdata[new_labels[j]])
        fig = fig.get_figure()
        fig.tight_layout()
        fig.savefig('%s_%s.png' % (str(j), new_labels[j]))
        plt.close(fig)

    ##################################################
    # PRECISION-RECALL CURVES
    ##################################################

    os.chdir(curdir)
    os.mkdir('model_selection')
    os.chdir('model_selection')

    plt.figure()
    visualizer = precision_recall_curve(GaussianNB(), np.array(features),
                                        tclass_labels)
    visualizer.poof(outpath="precision-recall.png")
    plt.close()

    plt.figure()
    visualizer = roc_auc(LogisticRegression(), np.array(features),
                         tclass_labels)
    visualizer.poof(outpath="roc_curve_train.png")
    plt.close()

    plt.figure()
    visualizer = discrimination_threshold(
        LogisticRegression(multi_class="auto", solver="liblinear"),
        np.array(features), tclass_labels)
    visualizer.poof(outpath="thresholds.png")
    plt.close()

    plt.figure()
    visualizer = residuals_plot(Ridge(),
                                np.array(features),
                                tclass_labels,
                                train_color="maroon",
                                test_color="gold")
    visualizer.poof(outpath="residuals.png")
    plt.close()

    plt.figure()
    visualizer = prediction_error(Lasso(), np.array(features), tclass_labels)
    visualizer.poof(outpath='prediction_error.png')
    plt.close()

    # outlier detection
    plt.figure()
    visualizer = cooks_distance(np.array(features),
                                tclass_labels,
                                draw_threshold=True,
                                linefmt="C0-",
                                markerfmt=",")
    visualizer.poof(outpath='outliers.png')
    plt.close()

    # cluster numbers
    plt.figure()
    visualizer = silhouette_visualizer(
        KMeans(len(set(tclass_labels)), random_state=42), np.array(features))
    visualizer.poof(outpath='silhouette.png')
    plt.close()

    # cluster distance
    plt.figure()
    visualizer = intercluster_distance(
        KMeans(len(set(tclass_labels)), random_state=777), np.array(features))
    visualizer.poof(outpath='cluster_distance.png')
    plt.close()

    # plot percentile of features plot with SVM to see which percentile for features is optimal
    features = preprocessing.MinMaxScaler().fit_transform(features)
    clf = Pipeline([('anova', SelectPercentile(chi2)),
                    ('scaler', StandardScaler()),
                    ('logr', LogisticRegression())])
    score_means = list()
    score_stds = list()
    percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100)

    for percentile in percentiles:
        clf.set_params(anova__percentile=percentile)
        this_scores = cross_val_score(clf, np.array(features), class_labels)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())

    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title(
        'Performance of the LogisticRegression-Anova varying the percent features selected'
    )
    plt.xticks(np.linspace(0, 100, 11, endpoint=True))
    plt.xlabel('Percentile')
    plt.ylabel('Accuracy Score')
    plt.axis('tight')
    plt.savefig('logr_percentile_plot.png')
    plt.close()

    # get PCA
    pca = PCA(random_state=1)
    pca.fit(X_train)
    skplt.decomposition.plot_pca_component_variance(pca)
    plt.savefig('pca_explained_variance.png')
    plt.close()

    # estimators
    rf = RandomForestClassifier()
    skplt.estimators.plot_learning_curve(rf, X_train, y_train)
    plt.title('Learning Curve (Random Forest)')
    plt.savefig('learning_curve.png')
    plt.close()

    # elbow plot
    kmeans = KMeans(random_state=1)
    skplt.cluster.plot_elbow_curve(kmeans,
                                   X_train,
                                   cluster_ranges=range(1, 30),
                                   title='Elbow plot (KMeans clustering)')
    plt.savefig('elbow.png')
    plt.close()

    # KS statistic (only if 2 classes)
    lr = LogisticRegression()
    lr = lr.fit(X_train, y_train)
    y_probas = lr.predict_proba(X_test)
    skplt.metrics.plot_ks_statistic(y_test, y_probas)
    plt.savefig('ks.png')
    plt.close()

    # precision-recall
    nb = GaussianNB()
    nb.fit(X_train, y_train)
    y_probas = nb.predict_proba(X_test)
    skplt.metrics.plot_precision_recall(y_test, y_probas)
    plt.tight_layout()
    plt.savefig('precision-recall.png')
    plt.close()

    ## plot calibration curve
    rf = RandomForestClassifier()
    lr = LogisticRegression()
    nb = GaussianNB()
    svm = LinearSVC()
    dt = DecisionTreeClassifier(random_state=0)
    ab = AdaBoostClassifier(n_estimators=100)
    gb = GradientBoostingClassifier(n_estimators=100,
                                    learning_rate=1.0,
                                    max_depth=1,
                                    random_state=0)
    knn = KNeighborsClassifier(n_neighbors=7)

    rf_probas = rf.fit(X_train, y_train).predict_proba(X_test)
    lr_probas = lr.fit(X_train, y_train).predict_proba(X_test)
    nb_probas = nb.fit(X_train, y_train).predict_proba(X_test)
    # svm_scores = svm.fit(X_train, y_train).predict_proba(X_test)
    dt_scores = dt.fit(X_train, y_train).predict_proba(X_test)
    ab_scores = ab.fit(X_train, y_train).predict_proba(X_test)
    gb_scores = gb.fit(X_train, y_train).predict_proba(X_test)
    knn_scores = knn.fit(X_train, y_train).predict_proba(X_test)

    probas_list = [
        rf_probas,
        lr_probas,
        nb_probas,  # svm_scores,
        dt_scores,
        ab_scores,
        gb_scores,
        knn_scores
    ]

    clf_names = [
        'Random Forest',
        'Logistic Regression',
        'Gaussian NB',  # 'SVM',
        'Decision Tree',
        'Adaboost',
        'Gradient Boost',
        'KNN'
    ]

    skplt.metrics.plot_calibration_curve(y_test, probas_list, clf_names)
    plt.savefig('calibration.png')
    plt.tight_layout()
    plt.close()

    # pick classifier type by ROC (without optimization)
    probs = [
        rf_probas[:, 1],
        lr_probas[:, 1],
        nb_probas[:, 1],  # svm_scores[:, 1],
        dt_scores[:, 1],
        ab_scores[:, 1],
        gb_scores[:, 1],
        knn_scores[:, 1]
    ]

    plot_roc_curve(y_test, probs, clf_names)
    # more elaborate ROC example with CV = 5 fold
    # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html#sphx-glr-auto-examples-model-selection-plot-roc-crossval-py

    os.chdir(curdir)

    return ''
Example 23
# Display the first 10 rows of the data
print(CSV_data.head(10))
# Check the summary statistics
print(CSV_data.describe())

# Split into explanatory variables X and target variables y
# Explanatory variables X = all rows (:), 1st and 2nd columns ([0,1])
print("Explanatory variables")
X = CSV_data.loc[:, ['right', 'left']]
print(X)
# Target variables y = all rows (:), columns 3, 4, 5
print("Target variables")
y = CSV_data.loc[:, ['wa', 'sa', 'seki']]
print(y)

visualiser = Rank2D(algorithm='pearson')
visualiser.fit(X, y)
visualiser.transform(X)
visualiser.poof()

# Split into training and test data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)


# Build the neural network
def createModel():
    print("Building")
    model = Sequential()
Example 24

if os.environ.get('VIZ', '0') == '1':
    from yellowbrick.features import Rank1D, Rank2D, RadViz, ParallelCoordinates
    import matplotlib.pyplot as plt
    print("Rank1D...")
    features = train_columns
    visualizer = Rank1D(features=features, algorithm='shapiro')
    visualizer.fit(X, Y)  # Fit the data to the visualizer
    visualizer.transform(X)  # Transform the data
    visualizer.poof(outpath='viz_feature_rank1d.pdf', bbox_inches='tight')
    plt.close('all')
    feature_diversity = visualizer.ranks_
    # Instantiate the visualizer with the Covariance ranking algorithm
    print("Rank2D...")
    visualizer = Rank2D(features=features, algorithm='spearman')
    visualizer.fit(X, Y)  # Fit the data to the visualizer
    visualizer.transform(X)  # Transform the data
    visualizer.poof(outpath='viz_feature_rank2d.pdf', bbox_inches='tight')
    plt.close('all')
    """
	# reorder the features so similar ones are together
	features_to_handle = list(range(len(features)))
	print(features_to_handle)
	features_ordered = []
	last_feature = 0
	numpy.random.seed(1)
	while features_to_handle:
		print("%d ..." % last_feature, feature_distance.shape)
		invdists = 1. / (visualizer.ranks_[last_feature,features_to_handle]**2 + 1e-5)
		invdists /= invdists.sum()
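The commented reordering block above is cut off. A minimal sketch of one plausible completion, mirroring its inverse-squared-distance weighting over visualizer.ranks_ (an assumption, since the loop body is truncated):

import numpy as np

def reorder_features(ranks_, seed=1):
    # Walk the features, sampling each next one with probability
    # proportional to 1 / (rank_value**2 + 1e-5) against the last pick,
    # so features "near" the last placed feature tend to come next
    rng = np.random.default_rng(seed)
    features_to_handle = list(range(ranks_.shape[0]))
    features_ordered = []
    last_feature = 0
    while features_to_handle:
        invdists = 1.0 / (ranks_[last_feature, features_to_handle] ** 2 + 1e-5)
        invdists /= invdists.sum()
        choice = rng.choice(len(features_to_handle), p=invdists)
        last_feature = features_to_handle.pop(int(choice))
        features_ordered.append(last_feature)
    return features_ordered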