def get_scatter_matrix(name, normalized, relationship):
    """Plot the scatter matrix of the data set and save it as a PNG.

    Parameters
    ----------
    name : str
        Prefix for the saved image file.
    normalized : pd.DataFrame
        DataFrame which is going to be studied.
    relationship : pd.DataFrame
        DataFrame of the relationship type of each message.

    Returns
    -------
    None.
    """
    # Fill missing values before plotting.
    cleaned = replace_nan(normalized, relationship)
    # One color per relationship type, applied point-wise.
    color_by_type = generate_colors()
    point_colors = relationship.map(color_by_type)
    scatter_matrix(pd.DataFrame(cleaned),
                   figsize=(100, 100),
                   diagonal='hist',
                   color=point_colors)
    plt.savefig(name + 'scatter_matrix.png')
def get_correlation_matrix(df, relationship):
    """Plot the correlation matrix of the given DataFrame as a heatmap.

    The figure is written to ``correlation_matrix.png`` in the current
    working directory.

    Parameters
    ----------
    df : pd.DataFrame
        Data which is going to be studied.
    relationship : pd.DataFrame
        DataFrame of the relationship type of each message.

    Returns
    -------
    None.
    """
    # Fill missing values, then restore the original column names.
    filled = replace_nan(df, relationship)
    frame = pd.DataFrame(data=filled, columns=df.columns)
    plt.figure(figsize=(20, 20))
    correlations = frame.corr()
    sns.heatmap(correlations, annot=True, cmap=plt.cm.Reds)
    plt.savefig('correlation_matrix.png')
def classify_with_decission_tree(df, relationship, criteria, name):
    """Train a decision tree on the data and export it in several formats.

    Writes a Graphviz ``.dot`` file, a JSON representation of the tree and
    a feature-importance summary, all prefixed with ``name + criteria``.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame which is going to be studied.
    relationship : pd.DataFrame
        DataFrame of the relationship type of each message.
    criteria : str
        Criteria for generating it ('gini' or 'entropy').
    name : str
        Name of the image which is going to be saved.

    Returns
    -------
    None.
    """
    features = replace_nan(df, relationship)
    # Pick the depth that performed best for this criterion.
    best_depth = calculate_best_depth(features, relationship, name, criteria)
    classifier = DecisionTreeClassifier(criterion=criteria,
                                        splitter="best",
                                        max_depth=best_depth)
    classifier = classifier.fit(features, relationship)
    class_labels = [t.name for t in RelationshipType]
    export_graphviz(classifier,
                    out_file=name + criteria + '_tree.dot',
                    feature_names=df.columns,
                    class_names=class_labels,
                    filled=True,
                    rounded=True,
                    special_characters=True)
    tree_to_json(classifier, df.columns, name + criteria)
    calculate_features_importance(classifier, df.columns, name + criteria)
def pca_analysis(df, relationship, name):
    """Run PCA with 1..cf.N_COMPONENTS components and save the results.

    For each component count ``n`` a sub-directory ``<name>PCA<n>/`` is
    created containing the explained-variance ratios (``expvar.json``),
    the component loadings (``pca<n>.csv``) and one pie chart per
    component showing each feature's percentage contribution (wedges with
    a negative loading are hatched).

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame which is going to be studied.
    relationship : pd.DataFrame
        DataFrame of the relationship type of each message.
    name : str
        Prefix for the output directories.

    Returns
    -------
    None.
    """
    X = replace_nan(df, relationship)
    X = StandardScaler().fit_transform(X)
    X = pd.DataFrame(data=X, columns=df.columns)
    labels = df.columns
    for n in range(1, cf.N_COMPONENTS + 1):
        pca = PCA(n_components=n)
        pca.fit(X)
        out_dir = name + 'PCA' + str(n) + '/'
        # FIX: makedirs(..., exist_ok=True) instead of mkdir — re-running
        # the analysis no longer crashes with FileExistsError.
        os.makedirs(out_dir, exist_ok=True)
        os.chdir(out_dir)
        # FIX: open in 'w' rather than 'a' so a re-run overwrites stale
        # results instead of appending duplicate JSON documents.
        with open('expvar.json', 'w') as f:
            f.write(json.dumps(pca.explained_variance_ratio_.tolist()))
        pd.DataFrame(pca.components_, columns=labels).to_csv(f'pca{n}.csv',
                                                             index=False)
        for i in range(n):
            component = pca.components_[i]
            magnitudes = [abs(c) for c in component]
            total = sum(magnitudes)
            # Percentage contribution of each feature to this component.
            percentages = [(m / total) * 100 for m in magnitudes]
            plt.figure()
            patches, _ = plt.pie(percentages,
                                 colors=cf.COLORS[:len(labels)],
                                 shadow=True, startangle=90)
            # Hatch the wedges whose loading is negative, since the pie
            # itself only shows absolute magnitudes.
            for j in range(len(component)):
                if component[j] < 0:
                    patches[j].set_hatch('/')
            plt.legend(patches, prop={'size': 6},
                       labels=['%s, %1.1f %%' % (l, s)
                               for l, s in zip(labels, percentages)],
                       loc="best")
            plt.axis('equal')
            plt.title('Component with '
                      + '%.4f' % (pca.explained_variance_ratio_[i])
                      + ' of explained variance ratio')
            plt.tight_layout()
            plt.savefig('comp' + str(i) + '_pie.png')
            # FIX: close the figure; the loop otherwise accumulates open
            # figures and matplotlib warns / leaks memory on long runs.
            plt.close()
        os.chdir('../')
def study_dbscan_silhouette_score(name, normalized, relationship):
    """Sweep DBSCAN's epsilon and save silhouette / cluster-count / ARI plots.

    For each metric ('euclidean', 'manhattan') the silhouette score and
    number of clusters over epsilon in [0.01, 1) are plotted side by side
    (with the best epsilon marked), plus a separate Adjusted Rand Score
    plot, all saved as PNGs prefixed with ``name``.

    Parameters
    ----------
    name : str
        Name of the image which is going to be saved.
    normalized : pd.DataFrame
        DataFrame which is going to be studied.
    relationship : pd.DataFrame
        DataFrame of the relationship type of each message.

    Returns
    -------
    None.
    """
    X = replace_nan(normalized, relationship)
    # FIX: the original computed this mapping and discarded the result.
    # The encoded labels are now actually used for the ARI computation;
    # ARI is invariant to label renaming, so the scores are unchanged.
    encoded = relationship.map(lambda x: RelationshipType[x].value)
    metrics = ['euclidean', 'manhattan']
    for m in metrics:
        maxS = -1          # best silhouette seen so far
        epsiOpt = 0        # epsilon achieving maxS
        silhouettes = []
        num_clust = []
        ari = []
        interval = np.arange(0.01, 1, 0.01)
        for epsilon in interval:
            s, n, labels = dbanalysis(X, epsilon, m)
            silhouettes.append(s)
            num_clust.append(n)
            ari.append(adjusted_rand_score(encoded, labels))
            if maxS < s:
                maxS = s
                epsiOpt = epsilon
        plt.figure(figsize=(13, 4))
        plt.subplot(1, 2, 1)
        plt.plot(interval, silhouettes)
        # Vertical dashed marker at the optimal epsilon.
        plt.plot([epsiOpt, epsiOpt], [-1.1, maxS + 0.1], linestyle="--")
        plt.title(r'Silhouette score for different $\varepsilon$')
        plt.xlabel(r"$\varepsilon$")
        plt.ylabel("Silhouette score")
        plt.grid()
        plt.subplot(1, 2, 2)
        plt.plot(interval, num_clust)
        plt.plot([epsiOpt, epsiOpt], [0, max(num_clust)], linestyle="--")
        plt.title(r'Number of clusters for different $\varepsilon$')
        plt.xlabel(r"$\varepsilon$")
        plt.ylabel("Number of clusters")
        plt.grid()
        plt.tight_layout()
        plt.savefig(name + 'dbscan_' + m + '_silhouette_score.png')
        # FIX: close figures after saving so the two metrics' sweeps do
        # not accumulate open matplotlib figures.
        plt.close()
        plt.figure()
        plt.plot(interval, ari)
        plt.title(r'Adjusted Rand Score with DBSCAN for different $\varepsilon$')
        plt.savefig(name + m + '_dbscan_ari.png')
        plt.close()