def get_scatter_matrix(name, normalized, relationship):
    """
    Obtains the scatter matrix of the data set.

    Parameters
    ----------
    name: str
        Name of the image which is going to be saved.
    normalized: pd.DataFrame
        DataFrame which is going to be studied.
    relationship : pd.DataFrame
        DataFrame of the relationship type of each message.

    Returns
    -------
    None.

    """
    X = replace_nan(normalized, relationship)

    # Colour each point in the scatter matrix by its relationship type.
    dic_colors = generate_colors()
    colors = relationship.map(dic_colors)
    scatter_matrix(pd.DataFrame(X),
                   figsize=(100, 100),
                   diagonal='hist',
                   color=colors)
    plt.savefig(name + 'scatter_matrix.png')
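
# Example usage (hypothetical; `features` and `relationship` are assumed to
# be the pandas objects produced by the project's preprocessing step):
#
#     get_scatter_matrix('messages_', features, relationship)
#
# This writes 'messages_scatter_matrix.png' to the working directory.
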
def get_correlation_matrix(df, relationship):
    """
    Obtains the correlation matrix of the given DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        Data which is going to be studied.
    relationship : pd.DataFrame
        DataFrame of the relationship type of each message.

    Returns
    -------
    None.

    """
    X = replace_nan(df, relationship)
    X = pd.DataFrame(data=X, columns=df.columns)
    plt.figure(figsize=(20, 20))
    cor = X.corr()
    sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
    plt.savefig('correlation_matrix.png')
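
# Example usage (hypothetical; `features` is assumed to be the feature
# DataFrame produced by the project's preprocessing step):
#
#     get_correlation_matrix(features, relationship)
#
# This writes 'correlation_matrix.png' to the working directory.
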
def classify_with_decission_tree(df, relationship, criteria, name):
    """
    Generates the decision tree of the given data.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame which is going to be studied.
    relationship : pd.DataFrame
        DataFrame of the relationship type of each message.
    criteria : str
        Criteria for generating it ('gini' or 'entropy').
    name: str
        Name of the image which is going to be saved.

    Returns
    -------
    None.

    """
    X = replace_nan(df, relationship)
    # Select the tree depth that scored best for this criterion before
    # fitting the final classifier.
    depth = calculate_best_depth(X, relationship, name, criteria)

    clf = DecisionTreeClassifier(criterion=criteria,
                                 splitter="best",
                                 max_depth=depth)
    clf = clf.fit(X, relationship)
    export_graphviz(clf,
                    out_file=name + criteria + '_tree.dot',
                    feature_names=df.columns,
                    class_names=[t.name for t in RelationshipType],
                    filled=True,
                    rounded=True,
                    special_characters=True)

    tree_to_json(clf, df.columns, name + criteria)
    calculate_features_importance(clf, df.columns, name + criteria)
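
# Example usage (hypothetical prefix). The exported Graphviz file can then be
# rendered with the standard `dot` CLI:
#
#     classify_with_decission_tree(features, relationship, 'gini', 'messages_')
#
# and, from a shell:
#
#     dot -Tpng messages_gini_tree.dot -o messages_gini_tree.png
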
def pca_analysis(df, relationship, name):
    """
    Executes a PCA analysis with the given DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame which is going to be studied.
    relationship : pd.DataFrame
        DataFrame of the relationship type of each message.
    name: str
        Name of the image which is going to be saved.

    Returns
    -------
    None.

    """
    X = replace_nan(df, relationship)
    X = StandardScaler().fit_transform(X)
    X = pd.DataFrame(data=X, columns=df.columns)
    labels = df.columns

    for n in range(1, cf.N_COMPONENTS + 1):
        pca = PCA(n_components=n)
        pca.fit(X)

        # Each component count gets its own output directory.
        outdir = name + 'PCA' + str(n) + '/'
        os.makedirs(outdir, exist_ok=True)
        os.chdir(outdir)
        # Store the explained variance ratio of every component as JSON.
        with open('expvar.json', 'w') as f:
            json.dump(pca.explained_variance_ratio_.tolist(), f)
        pd.DataFrame(pca.components_, columns=labels).to_csv(f'pca{n}.csv',
                                                             index=False)
        for i in range(n):
            comi = pca.components_[i]
            # Express each feature's absolute loading as a percentage of the
            # component's total absolute weight.
            comiabs = np.abs(comi)
            comiabs = (comiabs / comiabs.sum()) * 100

            plt.figure()
            patches, _ = plt.pie(comiabs,
                                 colors=cf.COLORS[:len(labels)],
                                 shadow=True,
                                 startangle=90)

            # Hatch the slices whose loadings are negative so the sign
            # information is not lost in the pie chart.
            for j in range(len(comi)):
                if comi[j] < 0:
                    patches[j].set_hatch('/')
            plt.legend(patches,
                       prop={'size': 6},
                       labels=[
                           '%s, %1.1f %%' % (l, s)
                           for l, s in zip(labels, comiabs)
                       ],
                       loc="best")
            plt.axis('equal')
            plt.title('Component with ' + '%.4f' %
                      (pca.explained_variance_ratio_[i]) +
                      ' of explained variance ratio')
            plt.tight_layout()
            plt.savefig('comp' + str(i) + '_pie.png')

        os.chdir('../')
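
# Example usage (hypothetical; assumes `cf.N_COMPONENTS` and `cf.COLORS` come
# from the project's configuration module):
#
#     pca_analysis(features, relationship, 'messages_')
#
# This creates one 'messages_PCA<n>/' directory per component count, each
# holding the explained-variance JSON, the loadings CSV, and one pie chart
# per component.
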
def study_dbscan_silhouette_score(name, normalized, relationship):
    """
    Obtains the Silhouette score (by using DBSCAN algorithm) of the data and 
    save it as an image.
    
    Parameters
    ----------
    name: str
        Name of the image which is going to be saved.
    normalized: pd.DataFrame
        DataFrame which is going to be studied.
    relationship : pd.DataFrame
        DataFrame of the relationship type of each message.

    Returns
    -------
    None.

    """
    X = replace_nan(normalized, relationship)

    # Encode the relationship labels as their enum values so they can be
    # compared against the cluster labels with adjusted_rand_score.
    relationship = relationship.map(lambda x: RelationshipType[x].value)

    metrics = ['euclidean', 'manhattan']

    for m in metrics:
        maxS = -1
        epsiOpt = 0
        silhouettes = []
        num_clust = []
        ari = []
        interval = np.arange(0.01, 1, 0.01)
        
        for epsilon in interval:
            s, n, labels = dbanalysis(X, epsilon, m)
            silhouettes.append(s)
            num_clust.append(n)
            ari.append(adjusted_rand_score(relationship, labels))
            if maxS < s:
                maxS = s
                epsiOpt = epsilon

        plt.figure(figsize=(13, 4))
        plt.subplot(1, 2, 1)
        plt.plot(interval, silhouettes)
        plt.plot([epsiOpt, epsiOpt], [-1.1, maxS + 0.1], linestyle="--")
        plt.title(r'Silhouette score for different $\varepsilon$')
        plt.xlabel(r"$\varepsilon$")
        plt.ylabel("Silhouette score")
        plt.grid()

        plt.subplot(1, 2, 2)
        plt.plot(interval, num_clust)
        plt.plot([epsiOpt, epsiOpt], [0, max(num_clust)], linestyle="--")
        plt.title(r'Number of clusters for different $\varepsilon$')
        plt.xlabel(r"$\varepsilon$")
        plt.ylabel("Number of clusters")
        plt.grid()

        plt.tight_layout()
        plt.savefig(name + 'dbscan_' + m + '_silhouette_score.png')

        plt.figure()
        plt.plot(interval, ari)
        plt.title(r'Adjusted Rand Score with DBSCAN for different $\varepsilon$')
        plt.xlabel(r"$\varepsilon$")
        plt.ylabel("Adjusted Rand Score")
        plt.grid()
        plt.savefig(name + m + '_dbscan_ari.png')
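
# `dbanalysis` is assumed to be defined elsewhere in the project. A minimal,
# hypothetical sketch of a compatible helper (not necessarily the project's
# actual implementation) built on scikit-learn could look like this:
#
#     from sklearn.cluster import DBSCAN
#     from sklearn.metrics import silhouette_score
#
#     def dbanalysis(X, epsilon, metric):
#         # Cluster with DBSCAN at the given radius and distance metric.
#         labels = DBSCAN(eps=epsilon, metric=metric).fit_predict(X)
#         # Noise points are labelled -1 and do not count as a cluster.
#         n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
#         # The silhouette score is only defined when there are 2+ clusters.
#         s = (silhouette_score(X, labels, metric=metric)
#              if n_clusters > 1 else -1)
#         return s, n_clusters, labels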