import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.neighbors
from scipy import cluster
from scipy.stats import pearsonr
from sklearn import metrics
from sklearn.cluster import KMeans
from tabulate import tabulate

import norm  # project-local helpers: normalization_with_minmax, pca, pca_plots


def pearson_correlation(data, verbose=True):
    plt.rcdefaults()

    # 1. Normalization (the last column of `data` is assumed to be the target,
    #    e.g. 'total_cases').
    norm_data = norm.normalization_with_minmax(data)

    # 2. Principal Component Analysis (the result is not used by the
    #    correlation itself).
    estimator, X_pca = norm.pca(norm_data)

    # 3. Pearson correlation of every feature against the target column.
    features = data.columns.tolist()[:-1]
    norm_values = np.asarray(norm_data)
    target = norm_values[:, -1]
    corr = [pearsonr(norm_values[:, i], target)[0] for i in range(len(features))]
    features_corr = zip(features, corr)

    if verbose:
        y_pos = np.arange(len(features))
        plt.bar(y_pos, corr, align='center')
        plt.xticks(y_pos, features, rotation=90)
        plt.ylabel('Correlation')
        plt.title('Correlation features vs \'total_cases\'')
        plt.show()
        print ''
        print tabulate(features_corr, headers=['Feature', 'R value'])

    return features_corr

def hierarchical_clustering_features(data, verbose=False):
    names = list(data)
    data_dropped = data.dropna()
    data_transpose = data_dropped.transpose()

    # 1. Normalization of the transposed data, so that rows are features.
    data_transpose_norm = norm.normalization_with_minmax(data_transpose)

    # 1.2. Principal Component Analysis.
    estimator, X_pca = norm.pca(data_transpose_norm)

    if verbose:
        plt.plot(X_pca[:, 0], X_pca[:, 1], 'x')
        print "Variance Ratio: ", estimator.explained_variance_ratio_
        fig, ax = plt.subplots()
        for i in range(len(data_transpose)):
            plt.text(X_pca[i][0], X_pca[i][1], str(i + 1))
        plt.xlim(min(X_pca[:, 0]) - 0.2, max(X_pca[:, 0]) + 0.2)
        plt.ylim(min(X_pca[:, 1]) - 0.2, max(X_pca[:, 1]) + 0.2)
        ax.grid(True)
        fig.tight_layout()
        plt.show()
        data_transpose_name = zip(range(1, len(names) + 1), names)
        print tabulate(data_transpose_name, headers=['# ', 'Feature name'])

    # 2. Compute the similarity (euclidean distance) matrix between features.
    dist = sklearn.neighbors.DistanceMetric.get_metric('euclidean')
    matsim = dist.pairwise(data_transpose_norm)
    avSim = np.average(matsim)

    if verbose:
        print "%s\t%6.2f" % ('Average Distance', avSim)
        plt.figure(figsize=(7, 7))
        # 3. Building the dendrogram.
        # http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
        clusters = cluster.hierarchy.linkage(matsim, method='complete')
        # http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.dendrogram.html
        cluster.hierarchy.dendrogram(clusters, color_threshold=5, labels=names,
                                     leaf_rotation=90)
        plt.show()

def pearson_correlation(data, verbose=False):
    plt.rcdefaults()

    # 1. Normalization (the last column of `data` is assumed to be the target).
    norm_data = norm.normalization_with_minmax(data)

    # 2. Principal Component Analysis (result not used by the correlation).
    estimator, X_pca = norm.pca(norm_data)

    # 3. Pearson correlation of every feature against the target column.
    features = data.columns.tolist()[:-1]
    norm_values = np.asarray(norm_data)
    target = norm_values[:, -1]
    corr_SanJuan = [pearsonr(norm_values[:, i], target)[0]
                    for i in range(len(features))]
    features_corr = zip(features, corr_SanJuan)

    if verbose:
        y_pos = np.arange(len(features))
        plt.bar(y_pos, corr_SanJuan, align='center')
        plt.xticks(y_pos, features, rotation=90)
        plt.ylabel('Correlation')
        plt.title('Correlation features vs target')
        plt.show()
        print ''
        print tabulate(features_corr, headers=['Feature', 'R value'])

    # 4. Keep the features whose absolute correlation is greater than 0.7;
    #    if none qualify, fall back to the lower threshold |r| >= 0.41.
    features_selected = [f for f, r in features_corr if abs(r) > 0.7]
    if len(features_selected) == 0:
        features_selected = [f for f, r in features_corr if abs(r) >= 0.41]

    return features_selected

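# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the pipeline above).
# It shows how pearson_correlation might be called; the CSV path below is a
# hypothetical placeholder, and the last numeric column of the file is assumed
# to be the target (e.g. 'total_cases').
def _demo_pearson_correlation(csv_path='features_with_target.csv'):
    # Keep numeric columns only and drop incomplete rows before correlating.
    example = pd.read_csv(csv_path).select_dtypes(include=[np.number])
    example = example.dropna(how='any')
    selected = pearson_correlation(example, verbose=False)
    print 'Features kept by the correlation filter:'
    print selected
    return selected
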
def kMeans_clustering(data, verbose=False):
    # 1. Data normalization (rows with missing values are discarded first so
    #    that cluster labels line up with the `no_nan_data` indices).
    no_nan_data = data.dropna(how='any')
    norm_data = norm.normalization_with_minmax(no_nan_data)

    # 2. Principal Component Analysis (only used to plot the clusters in 2-D).
    estimator, X_pca = norm.pca(norm_data)
    #plt.plot(X_pca[:, 0], X_pca[:, 1], 'x')

    # 3. Setting parameters (ad hoc).
    init = 'k-means++'  # initialization method
    iterations = 10     # run 10 times with different random centroids and keep the model with the lowest SSE
    max_iter = 300      # maximum number of iterations for each single run
    tol = 1e-04         # tolerance on the change of the within-cluster SSE used to declare convergence
    random_state = 0    # seed for reproducibility

    distortions = []
    silhouettes = []
    for i in range(2, 11):
        km = KMeans(n_clusters=i, init=init, n_init=iterations,
                    max_iter=max_iter, tol=tol, random_state=random_state)
        labels = km.fit_predict(norm_data)
        distortions.append(km.inertia_)
        silhouettes.append(metrics.silhouette_score(norm_data, labels))

    if verbose:
        # 4. Plot the results to decide which K to use.
        # Distortions (elbow plot).
        plt.plot(range(2, 11), distortions, marker='o')
        plt.xlabel('Number of clusters')
        plt.ylabel('Distortion')
        plt.show()
        # Silhouette scores.
        plt.plot(range(2, 11), silhouettes, marker='o')
        plt.xlabel('Number of clusters')
        plt.ylabel('Silhouette')
        plt.show()

    # Set K to the number of clusters with the best silhouette score
    # (the offset of 2 accounts for the loop starting at k = 2).
    k = silhouettes.index(max(silhouettes)) + 2

    # 5. Execute the clustering with the chosen K.
    km = KMeans(n_clusters=k, init=init, n_init=iterations,
                max_iter=max_iter, tol=tol, random_state=random_state)
    labels = km.fit_predict(norm_data)
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    # 6. Plot the results.
    if verbose:
        print '\n\n K value is: ' + str(k) + '\n\n'
        plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels)
        plt.grid()
        plt.show()
        print('Estimated number of clusters: %d' % n_clusters_)

    # 7. Characterize the groups; clusters holding at most 2% of the rows are
    #    reported as outliers.
    numbers = no_nan_data.index.values
    n_total = len(numbers)
    groups = {}
    outliers = []
    for c in range(1, n_clusters_ + 1):
        elements = []
        for i in range(len(norm_data)):
            if labels[i] == c - 1:
                elements.append(numbers[i])
        groups[c] = elements
        n_elements = len(elements)
        percent = (float(n_elements) / float(n_total)) * 100.0
        if percent <= 2.0:
            outliers.append(elements)
        if verbose:
            print '\nGroup %2d, length: %d, total %d, percent %5.2f ' % (
                c, n_elements, n_total, percent)

    if len(outliers) == 0:
        outliers = None

    return groups, outliers

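# ---------------------------------------------------------------------------
# Minimal usage sketch for kMeans_clustering (illustrative only). The CSV path
# is a hypothetical placeholder; the sketch drops the rows that the clustering
# flags as outliers (clusters holding <= 2% of the rows).
def _demo_kmeans_outlier_removal(csv_path='features_with_target.csv'):
    example = pd.read_csv(csv_path).select_dtypes(include=[np.number])
    example = example.dropna(how='any')
    groups, outliers = kMeans_clustering(example, verbose=False)
    if outliers:
        # `outliers` is a list of clusters, each a list of row indices.
        flagged = [idx for group in outliers for idx in group]
        example = example.drop(flagged)
    return example
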
def hierarchical_clustering(data, cut=None, first_total=None, verbose=False):
    # Normalization of the data (rows with missing values are dropped first so
    # that cluster labels line up with the `no_nan_data` indices).
    no_nan_data = data.dropna(how='any')
    norm_data = norm.normalization_with_minmax(no_nan_data)
    estimator, X_pca = norm.pca(norm_data)
    if verbose:
        norm.pca_plots(estimator, X_pca, no_nan_data.index.values)

    # 1. Hierarchical clustering.
    # 1.1. Compute the similarity (euclidean distance) matrix.
    dist = sklearn.neighbors.DistanceMetric.get_metric('euclidean')
    matsim = dist.pairwise(X_pca)
    avSim = np.average(matsim)
    if verbose:
        print "%s\t%6.2f" % ('Average Distance', avSim)

    # 1.2. Building the dendrogram.
    methods = ["single", "complete", "average", "weighted", "centroid",
               "median", "ward"]
    selec_meth = methods[0]
    criterions = ["inconsistent", "distance", "maxclust"]
    sel_crit = criterions[1]
    #cut = 3.7  # ad hoc
    clusters = cluster.hierarchy.linkage(matsim, method=selec_meth)

    # http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.dendrogram.html
    if verbose:
        plt.figure(figsize=(10, 10))
    dendrogram_data = cluster.hierarchy.dendrogram(clusters,
                                                   no_plot=(not verbose),
                                                   color_threshold=7)
    flatten = lambda l: [item for sublist in l for item in sublist]
    dendrogram_flat = flatten(dendrogram_data.get('dcoord'))
    if not cut:
        # No cut given: use the largest merge distance in the dendrogram.
        cut = max(dendrogram_flat)
        dendrogram_flat.remove(cut)
        max_distance = cut - 0.1
    else:
        max_distance = max(dendrogram_flat) - 0.1
    if verbose:
        plt.title('%s color_threshold: %d' % (selec_meth, 7))
        plt.show()

    labels = cluster.hierarchy.fcluster(clusters, max_distance,
                                        criterion=sel_crit)

    # 4. Plot the clusters in the PCA plane.
    numbers = no_nan_data.index.values
    n_total = len(numbers)
    if not first_total:
        first_total = n_total
    if verbose:
        colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
        colors = np.hstack([colors] * 20)
        fig, ax = plt.subplots()
        for i in range(n_total):
            plt.text(X_pca[i][0], X_pca[i][1], str(numbers[i]),
                     color=colors[labels[i]])
        plt.xlim(min(X_pca[:, 0]) - 0.2, max(X_pca[:, 0]) + 0.2)
        plt.ylim(min(X_pca[:, 1]) - 0.2, max(X_pca[:, 1]) + 0.2)
        ax.grid(True)
        fig.tight_layout()
        plt.title('Method: %s, Cut: %5.2f, Criterion: %s' %
                  (selec_meth, max_distance, sel_crit))
        plt.show()

    # 5. Characterization: very small clusters (<= 2%) are treated as outliers;
    #    medium-sized clusters (<= 10%) are re-clustered recursively.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    if verbose:
        print('Estimated number of clusters: %d' % n_clusters_)
        #print 'cut: %.2f' % cut
    groups = {}
    outliers = []
    cut_percent = cut * (1 - avSim + 0.2)
    max_cut = cut - cut_percent
    for c in range(1, n_clusters_ + 1):
        elements = []
        for i in range(len(norm_data)):
            if labels[i] == c:
                elements.append(numbers[i])
        groups[c] = elements
        n_elements = len(elements)
        percent = (float(n_elements + (first_total - n_total)) /
                   float(n_total)) * 100.0
        if percent <= 2.0 and max_cut <= max_distance:
            outliers.append(elements)
            del groups[c]
            if verbose:
                print 'To remove (2% threshold): \t'
                print elements
        elif percent <= 10.0 and max_cut <= max_distance:
            df = pd.DataFrame(data, index=elements)
            sub_elements, sub_outliers, cut = hierarchical_clustering(
                df, cut=cut, first_total=first_total, verbose=False)
            if verbose and sub_outliers:
                print 'To remove (10% threshold): \t'
                print sub_outliers
            if sub_outliers:
                outliers.append(sub_outliers)

    if outliers:
        final_outliers = flatten(outliers)
    else:
        final_outliers = None

    return groups, final_outliers, cut

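# ---------------------------------------------------------------------------
# Minimal usage sketch for hierarchical_clustering (illustrative only). The
# CSV path is a hypothetical placeholder. `final_outliers` is either None or a
# flat list of row indices, and `cut` can be passed back in to reuse the same
# dendrogram cut on a later run.
def _demo_hierarchical_outlier_removal(csv_path='features_with_target.csv'):
    example = pd.read_csv(csv_path).select_dtypes(include=[np.number])
    example = example.dropna(how='any')
    groups, final_outliers, cut = hierarchical_clustering(example, verbose=False)
    if final_outliers:
        example = example.drop(final_outliers)
    return example, cut
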
def hierarchical_clustering_features(data, verbose=True):
    names = list(data)
    data_transpose = data.transpose()

    # 1. Normalization of the transposed data, so that rows are features.
    data_transpose_norm = norm.normalization_with_minmax(data_transpose)

    # 1.2. Principal Component Analysis.
    estimator, X_pca = norm.pca(data_transpose_norm)

    if verbose:
        plt.plot(X_pca[:, 0], X_pca[:, 1], 'x')
        print "Variance Ratio: ", estimator.explained_variance_ratio_
        fig, ax = plt.subplots()
        for i in range(len(data_transpose)):
            plt.text(X_pca[i][0], X_pca[i][1], str(i + 1))
        plt.xlim(min(X_pca[:, 0]) - 0.2, max(X_pca[:, 0]) + 0.2)
        plt.ylim(min(X_pca[:, 1]) - 0.2, max(X_pca[:, 1]) + 0.2)
        ax.grid(True)
        fig.tight_layout()
        plt.show()
        data_transpose_name = zip(range(1, len(names) + 1), names)
        print tabulate(data_transpose_name, headers=['# ', 'Feature name'])

    # 2. Compute the similarity (euclidean distance) matrix between features.
    dist = sklearn.neighbors.DistanceMetric.get_metric('euclidean')
    matsim = dist.pairwise(data_transpose_norm)
    avSim = np.average(matsim)

    # Cut the tree into at most 6 clusters of features.
    clusters = cluster.hierarchy.linkage(matsim, method='complete')
    labels = cluster.hierarchy.fcluster(clusters, 6, criterion='maxclust')
    feature_name = data_transpose.index.values
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    groups = {}
    for c in range(1, n_clusters_ + 1):
        elements = []
        for i in range(len(data_transpose_norm)):
            if labels[i] == c:
                elements.append(feature_name[i])
        groups[c] = elements

    if verbose:
        print "%s\t%6.2f" % ('Average Distance', avSim)
        plt.figure(figsize=(7, 7))
        # 3. Building the dendrogram.
        # http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
        # http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.dendrogram.html
        cluster.hierarchy.dendrogram(clusters, color_threshold=5, labels=names,
                                     leaf_rotation=90)
        print('Estimated number of clusters: %d' % n_clusters_)
        plt.show()

    return groups

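# ---------------------------------------------------------------------------
# Minimal usage sketch for hierarchical_clustering_features (illustrative
# only). The CSV path is a hypothetical placeholder; keeping one arbitrary
# representative column per cluster of similar features is just one possible
# way to use the returned groups, not the project's prescribed method.
def _demo_feature_group_selection(csv_path='features_with_target.csv'):
    example = pd.read_csv(csv_path).select_dtypes(include=[np.number])
    example = example.dropna(how='any')
    groups = hierarchical_clustering_features(example, verbose=False)
    representatives = [members[0] for members in groups.values() if members]
    return example[representatives]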