def main(argv):
    """Cluster the packets of a pcap file and print a data model per cluster.

    Steps, in order: parse the command-line options, read the TCP/UDP
    payloads of every packet, build a pairwise distance matrix with
    ``sequencealignment``, embed it in 2-D (MDS followed by PCA), cluster
    the points with MeanShift, and for each cluster derive a generic
    message sequence (GMS) and print it as a ``<DataModel>`` element.
    The clusters are finally shown in a pylab figure.

    argv -- option list handed to ``getopt`` (``-h``, ``-i``, ``-o``, ``-t``).

    NOTE(review): the source this was reconstructed from had lost its
    indentation; the nesting below is inferred and should be confirmed.
    """
    inputfile = ''
    # outputfile and hookfile are parsed below but not used afterwards in
    # this function.
    outputfile = ''
    hookfile = ''
    try:
        opts, args = getopt.getopt(argv,"hi:o:t:",["ifile=","ofile=","transformation="])
    except getopt.GetoptError:
        # Unknown option or missing argument: print usage and exit with 2.
        print sys.argv[0] + ' -i inputfile -o outputfile -t [hook script]'
        print sys.argv[0] + ' -i sample.pcap -o result.pcap -t example'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'test.py -i <inputfile> -o <outputfile>'
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
        elif opt in ("-t", "--transformation"):
            hookfile = arg

    # Copy packet data to a dictionary keyed by packet index.
    pktdict={}
    pkts=rdpcap(inputfile)
    i=0
    for pkt in pkts:
        # NOTE(review): this try/except only re-raises, so it has no effect.
        try:
            my_array = []
            if pkt.haslayer(TCP):
                for d in str(pkt.getlayer(TCP).payload):
                    my_array.append(d)
            if pkt.haslayer(UDP):
                for d in str(pkt.getlayer(UDP).payload):
                    my_array.append(d)
            # Payload is stored reversed (original comment: for the
            # backtrace of Needleman-Wunsch).
            pktdict[i] = list("".join(reversed(my_array)))
            i=i+1
        except:
            raise

    # Create distance matrix.
    # dictSize is computed but never read; the loops below are bounded by
    # packetsToSample, which is not defined in this function.
    # NOTE(review): presumably a module-level setting; if it exceeds the
    # number of packets read, pktdict[x] raises KeyError -- confirm.
    dictSize = len(pktdict)
    diffMatrix = zeros((packetsToSample,packetsToSample))
    x=0
    while x < packetsToSample:
        y=0
        print ""
        print "Packet " + str(x) + ": " + str(pktdict[x])
        while y < packetsToSample:
            #calculate common substring length between packets
            #similarity = lcs(pktdict[x], pktdict[y])
            gms, similarity, distance, alignedseq1Discard, alignedseq2Discard = sequencealignment(pktdict[x], pktdict[y])
            #distance = 1 - (similarity + 1)/2
            print "Packet " + str(x) + " similarity to packet " + str(y) + " = " + str(similarity)
            print "Packet " + str(x) + " distance from packet " + str(y) + " = " + str(distance)
            # Assign the value to both symmetric cells. Every (x, y) pair is
            # visited, so each cell is written twice; the later visit wins.
            # (Original comment: Smith-Waterman score follows the triangle
            # equality rule.)
            diffMatrix[x][y]=distance
            diffMatrix[y][x]=distance
            y=y+1
        x=x+1
    print " "
    print "Distance Matrix:"
    print diffMatrix
    print ""

    # Multi-Dimensional Scaling from distances to XY points
    #
    # Source: http://scikit-learn.org/stable/auto_examples/manifold/plot_mds.html#example-manifold-plot-mds-py
    #
    seed = np.random.RandomState(seed=3)
    mds = manifold.MDS(n_components=2, max_iter=1000, eps=0.8, random_state=seed, dissimilarity="precomputed", n_jobs=1)
    pos = mds.fit(diffMatrix).embedding_
    # NOTE(review): the seeded embedding above is overwritten here by a
    # second MDS that is given no random_state -- confirm this is intended.
    pos = manifold.MDS(dissimilarity="precomputed").fit_transform(diffMatrix)
    # Rescale the coordinates, then rotate them with PCA.
    pos *= np.sqrt(100000) / np.sqrt((pos ** 2).sum())
    clf = PCA(n_components=2)
    pos = clf.fit_transform(pos)

    # Display the embedded coordinates.
    print "Coordinates of plotted packets: "
    for p in pos:
        print p.astype(int)

    # Calculate number of clusters
    #
    # Source: http://scikit-learn.org/stable/auto_examples/cluster/plot_mean_shift.html
    #
    print ""
    bandwidth = estimate_bandwidth(pos, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(pos)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    print("Estimated number of clusters (k): %d" % n_clusters_)

    # Plot on graph: one colour per cluster.
    pl.figure(1)
    pl.clf()
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        my_members = labels == k
        cluster_center = cluster_centers[k]
        pl.plot(pos[my_members, 0], pos[my_members, 1], col + '.')
        print ""
        print "Cluster: " + str(k)
        #print str(my_members)

        # Create GMS for cluster using Needleman-Wunsch
        #
        # This section is not part of the clustering code
        #
        clusterPackets = [];
        origionalClusterPackets = [];
        offset = 0;
        # Extract the packets that belong to this cluster; offset tracks the
        # packet index that corresponds to each membership flag.
        for val in my_members:
            if(str(val) == "True"):
                clusterPackets.append(pktdict[offset]);
            offset += 1;
        origionalClusterPackets = copy.deepcopy(clusterPackets);
        print 'Compressing GMS .',
        # Compress all GMS pairs to a single GMS for the cluster: each pass
        # aligns neighbouring entries, so the list shrinks by one per pass.
        while len(clusterPackets) > 1:
            print '.',
            gmsList1 = [];
            # Calculate the generic message sequence for each adjacent pair.
            for i in xrange(len(clusterPackets) - 1):
                current_item, next_item = clusterPackets[i], clusterPackets[i + 1]
                gms, totalMatch, totalDifference, alignedseq1Discard, alignedseq2Discard = sequencealignment(current_item, next_item)
                gmsList1.append(gms)
            clusterPackets = copy.deepcopy(gmsList1)
        print ""
        # Undo the reversal applied when the payloads were stored.
        gmspkt = list(reversed(clusterPackets[0]))
        #gmsbin = str("".join(gmspkt))
        # Compress runs of the substitution character "-" to a single "-";
        # repeat until a pass removes nothing.
        beforeGmsLen = len(gmspkt)+1
        afterGmsLen = len(gmspkt)
        while(beforeGmsLen > afterGmsLen):
            beforeGmsLen = len(gmspkt)
            # Walk backwards so deletions do not shift unvisited indices.
            for i in xrange(len(gmspkt) - 1, 0, -1):
                if(gmspkt[i] == "-" and gmspkt[i-1] == "-"):
                    del gmspkt[i]
            afterGmsLen = len(gmspkt)
        print str(gmspkt)
        #print list(reversed(gmspkt))
        print ""
        # Enumerate the variable data (tokens) of every packet in the cluster.
        clusterTokens = [];
        for clusPkt in origionalClusterPackets:
            gmsDiscard, similarityDiscard, distanceDiscard, alignedseq1Keep, alignedseq2Keep = sequencealignment(clusPkt, list(reversed(gmspkt)))
            pktAlignedGMS1 = list(reversed(alignedseq1Keep))
            GMSAlignedData = list(reversed(alignedseq2Keep))
            tmpData = copy.deepcopy(GMSAlignedData)
            packettokens = [];
            # packettoken is assigned but never read.
            packettoken = [];
            gmsoffset = 0
            # Mask with "-" every position where pktAlignedGMS1 holds a
            # non-"-" value, leaving only the remaining positions of tmpData.
            while gmsoffset < len(GMSAlignedData):
                if pktAlignedGMS1[gmsoffset]:
                    if(pktAlignedGMS1[gmsoffset] != "-"):
                        tmpData[gmsoffset] = "-"
                gmsoffset+=1;
            packettokens = copy.deepcopy(tmpData)
            splittoken = [];
            splittokens = [];
            gmsoffset = 0
            # Split the masked data into runs of consecutive non-"-" values.
            while gmsoffset < len(packettokens):
                if packettokens[gmsoffset]:
                    if(packettokens[gmsoffset] != "-"):
                        splittoken.append(copy.deepcopy(packettokens[gmsoffset]))
                    if(gmsoffset+1 < len(packettokens)):
                        if(packettokens[gmsoffset+1] == "-"):
                            if(len(splittoken) > 0):
                                #TODO:
                                # having problems with passing by reference
                                # the beginning of the list vanishes.
                                splittokens.append(copy.deepcopy(splittoken))
                                del splittoken[:]
                gmsoffset+=1
            # NOTE(review): a run that reaches the very end of packettokens
            # is never appended, because the flush only happens when the
            # next element is "-" -- confirm whether that is intended.
            clusterTokens.append(splittokens)
            print
            print "GMS: " + str(pktAlignedGMS1)
            print "Data: " + str(GMSAlignedData)
            print "Masked Data: " + str(tmpData)
            print "Tokens: " + str(packettokens)
            print "Split Tokens: " + str(splittokens)
            print
        print ""
        # Infer token data type (Flag, Number or Blob).
        # fieldtype is overwritten for every token, so the value used below
        # is the one inferred from the last token examined.
        fieldtype = "Blob"
        for tokens in clusterTokens:
            for token in tokens:
                singleToken = ''.join(token)
                newToken = str(singleToken)
                if newToken == 'True' or newToken == 'False':
                    fieldtype = "Flag"
                else:
                    try:
                        int(newToken)
                        fieldtype = "Number"
                    except ValueError:
                        try:
                            float(newToken)
                            fieldtype = "Number"
                        except ValueError:
                            fieldtype = "Blob"
        chunkOffset = 0;
        staticFieldBuff = [];
        fieldSwitch = 0;
        # fieldLength is updated below but never read.
        fieldLength = 1;
        print '<DataModel name="cluster' + str(k) + '">'
        # Walk the GMS: static bytes are collected and printed as immutable
        # hex Blobs, each "-" placeholder becomes one mutable field.
        while chunkOffset < len(gmspkt):
            #print conditions
            if gmspkt[chunkOffset]:
                if(gmspkt[chunkOffset] != "-"):
                    staticFieldBuff.append(gmspkt[chunkOffset])
                if(chunkOffset+1 < len(gmspkt)):
                    # A static run ends here: flush it as a hex Blob.
                    if(gmspkt[chunkOffset] != "-" and gmspkt[chunkOffset+1] == "-"):
                        #print '<Blob valueType="hex" value="' + str(staticFieldBuff).replace("[", "").replace("]", "").replace("'", "").replace("\\x", "") + '" mutable="false"/>'
                        print '<Blob valueType="hex" value="',
                        for c in staticFieldBuff:
                            print binascii.hexlify(c),
                        print '" mutable="false"/>'
                        del staticFieldBuff[:]
                if(chunkOffset == 0 and gmspkt[chunkOffset] == "-"):
                    fieldSwitch = 1
                # NOTE(review): this test is false for chunkOffset == 1, so a
                # "-" at index 1 that follows a static byte at index 0 does
                # not emit a mutable field -- possibly meant ">= 0"; confirm.
                if(chunkOffset-1 > 0):
                    if(gmspkt[chunkOffset] == "-" and gmspkt[chunkOffset-1] != "-"):
                        fieldSwitch = 1
                if(fieldSwitch == 1):
                    print '<' + str(fieldtype) + ' mutable="true"/>'
                    fieldLength=0
                    fieldSwitch = 0
            chunkOffset+=1
            fieldLength+=1
        # NOTE(review): static bytes still buffered at the end of the GMS
        # are not printed -- confirm whether a final flush is missing.
        print '</DataModel>'
        pl.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=8)
    pl.title('Estimated number of clusters: %d' % n_clusters_)
    pl.show()
ax.yaxis.set_major_formatter(NullFormatter()) plt.axis('tight') t0 = time() Y = manifold.Isomap(n_neighbors, n_components).fit_transform(X) t1 = time() print("Isomap: %.2g sec" % (t1 - t0)) ax = fig.add_subplot(257) plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral) plt.title("Isomap (%.2g sec)" % (t1 - t0)) ax.xaxis.set_major_formatter(NullFormatter()) ax.yaxis.set_major_formatter(NullFormatter()) plt.axis('tight') t0 = time() mds = manifold.MDS(n_components, max_iter=100, n_init=1) Y = mds.fit_transform(X) t1 = time() print("MDS: %.2g sec" % (t1 - t0)) ax = fig.add_subplot(258) plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral) plt.title("MDS (%.2g sec)" % (t1 - t0)) ax.xaxis.set_major_formatter(NullFormatter()) ax.yaxis.set_major_formatter(NullFormatter()) plt.axis('tight') t0 = time() se = manifold.SpectralEmbedding(n_components=n_components, n_neighbors=n_neighbors) Y = se.fit_transform(X) t1 = time()
def perform(self):
    """Run MDS on ``self.data`` and return the embedded coordinates.

    The number of output dimensions and the seed are taken from
    ``self.n_components`` and ``self.random_state``.
    """
    embedder = skm.MDS(
        n_components=self.n_components,
        random_state=self.random_state,
    )
    return embedder.fit_transform(self.data)
# Making Rips simplicial complex rc = gudhi.RipsComplex(distance_matrix=df, max_edge_length=0.005) st = rc.create_simplex_tree(max_dimension=2) # We are only going to plot the triangles, edges and points triangles = np.array([s[0] for s in st.get_skeleton(2) if len(s[0]) == 3]) duzi = np.array([s[0] for s in st.get_skeleton(1) if len(s[0]) == 2]) tacke = np.array([s[0] for s in st.get_skeleton(0) if len(s[0]) == 1]) print(triangles) print() print(duzi) print() print(tacke) # Making 3D coordinates out of distance matrix mds = manifold.MDS(n_components=3, dissimilarity="precomputed", random_state=6) results = mds.fit(df) coords = results.embedding_ fig = plt.figure() ax = fig.gca(projection='3d') # Ploting points and naming them plt.scatter(coords[:, 0], coords[:, 1], coords[:, 2]) for label, x, y, z in zip(likovi, coords[:, 0], coords[:, 1], coords[:, 2]): ax.text(x, y, z, label) # Ploting edges if 0 != len(duzi): points = np.array(coords) edges = np.array(duzi)
files += dir_files # for file in files: # vectors.append(dict_from_file(file)) diss = np.ndarray(shape=(sum(lengths.values()), sum(lengths.values())), dtype=np.float32) queries = [] for index, file in enumerate(files): diss[index, index] = 0 for index2 in range(index + 1, len(files)): queries.append([index, index2, file, files[index2], args.z]) pool = Pool(os.cpu_count()) results = pool.starmap(dist_between_files, queries, chunksize=1) for result in results: diss[result[0], result[1]] = result[2] diss[result[1], result[0]] = result[2] mds = manifold.MDS(dissimilarity='precomputed') coords = mds.fit(diss).embedding_ if args.no_draw: # Data exist, but need to be dumped, not stored with open(args.data_filename + '.lengths', mode='w') as lenfile: for x in lengths: print(x + '\t' + str(lengths[x]), file=lenfile) np.savetxt(args.data_filename + '.coords', coords) print('Data written to {}'.format(args.data_filename)) if not args.no_draw: # If something is to be drawn, `lengths` and `coords` should be set. if not lengths and not coords: lengths = OrderedDict() for line in open(args.data_filename + '.lengths'): l = line.split('\t')
def do_scree_plot(A, max_components=9):
    """Show a scree plot of the leading components of ``A``.

    The values plotted are the squared singular values of ``A`` divided by
    the sum of its singular values.  The first component is dropped, so
    components ``2 .. max_components`` are shown.

    A -- matrix handed to ``np.linalg.svd``.
    max_components -- highest component number to show (default 9, the
        value that used to be hard-coded).  Fewer are shown when ``A`` has
        fewer singular values, which previously produced mismatched x/y
        lengths in ``plt.plot``.
    """
    _, S, _ = np.linalg.svd(A)
    # NOTE(review): normalised by sum(S), not sum(S**2); kept as it was.
    eigvals = S**2 / np.cumsum(S)[-1]

    plt.figure(figsize=(8,5))

    # Derive the x axis from the data instead of a fixed variable count.
    n_shown = min(max_components, len(eigvals))
    # getting rid of the first one
    sing_vals = np.arange(2, n_shown + 1)
    eigvals = eigvals[1:n_shown]

    plt.plot(sing_vals, eigvals, 'ro-', linewidth=2)
    plt.title('Scree Plot')
    plt.xlabel('Principal Component')
    plt.ylabel('Eigenvalue')
    leg = plt.legend(['Eigenvalues from SVD'], loc='best', borderpad=0.3,
                     shadow=False,
                     prop=matplotlib.font_manager.FontProperties(size='small'),
                     markerscale=0.4)
    leg.get_frame().set_alpha(0.4)
    # Legend.draggable() was replaced by set_draggable(); use whichever the
    # installed matplotlib provides.
    if hasattr(leg, 'set_draggable'):
        leg.set_draggable(True)
    else:
        leg.draggable(state=True)
    plt.show()


fold = manifold.MDS(n_components=2, dissimilarity='precomputed')
fold.fit_transform(A)
do_scree_plot(A)
#print fold.stress_
def silhouette(self, range_n_clusters, cluster_labelss):
    """Plot silhouette charts and a 2-D MDS scatter for several clusterings.

    For every ``n_cluster`` in ``range_n_clusters`` a figure with two axes
    is drawn: the per-sample silhouette values grouped by cluster (left)
    and an MDS embedding of ``self.ndf`` coloured by cluster (right).

    range_n_clusters -- iterable of cluster counts to visualise.
    cluster_labelss -- sequence of label arrays; the labels for
        ``n_cluster`` are read from index ``n_cluster - 2``.

    NOTE(review): ``random_state`` is not defined in this function and is
    presumably a module-level name -- confirm.
    """
    X = self.ndf
    # ``cm.spectral`` was renamed ``cm.nipy_spectral`` and then removed from
    # matplotlib; prefer the new name, fall back to the old one.
    spectral = getattr(cm, "nipy_spectral", None) or cm.spectral

    for n_cluster in range_n_clusters:
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(12, 6)

        ax1.set_xlim([-0.1, 1])
        # Leave a gap of 10 rows between the silhouettes of two clusters.
        ax1.set_ylim([0, len(X) + (n_cluster + 1) * 10])

        cluster_labels = cluster_labelss[n_cluster - 2]
        # categories, cluster_labels, cluster_centers_, summary = self.kmeans_fit_predict(n_cluster, preproc)
        silhouette_avg = silhouette_score(X, cluster_labels)
        print("For n_clusters =", n_cluster,
              "The average silhouette_score is :", silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        y_lower = 10
        for i in range(n_cluster):
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = spectral(float(i) / n_cluster)
            ax1.fill_betweenx(np.arange(y_lower, y_upper), 0,
                              ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            y_lower = y_upper + 10  # 10 for the 0 samples

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # mds: embed the pairwise Euclidean distances in two dimensions
        similarities = euclidean_distances(X)
        mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9,
                           random_state=random_state,
                           dissimilarity="precomputed", n_jobs=1)
        pos = mds.fit(similarities).embedding_

        df_pos = pd.DataFrame(pos, columns=["comp1", "comp2"])
        df_pos["pred"] = cluster_labels

        for i in range(n_cluster):
            color = spectral(float(i) / n_cluster)
            ax2.scatter(df_pos[df_pos["pred"] == i].iloc[:, 0],
                        df_pos[df_pos["pred"] == i].iloc[:, 1], c=color)

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st MDS feature")
        ax2.set_ylabel("Feature space for the 2nd MDS feature")

        plt.suptitle(
            ("Silhouette analysis for KMeans clustering on sample data "
             "with n_clusters = %d" % n_cluster),
            fontsize=14, fontweight='bold')
        # end mds

        plt.show()
def visualize(df, cluster_labels, n_clusters, n_iterations):
    """Plot the silhouette of each cluster next to a 2-D MDS scatter.

    df -- data passed both to ``manifold.MDS.fit_transform`` and to
        ``silhouette_samples``.
    cluster_labels -- one cluster index per row of ``df``.
    n_clusters -- number of clusters; indices ``0 .. n_clusters - 1`` are
        plotted.
    n_iterations -- shown in the figure title as ``n_iterations + 1``.
    """
    # Dimension for visualization
    target_dimension = 2

    cluster_labels = np.array(cluster_labels)
    # ``cm.spectral`` was renamed ``cm.nipy_spectral`` and then removed from
    # matplotlib; prefer the new name, fall back to the old one.
    spectral = getattr(cm, "nipy_spectral", None) or cm.spectral

    mds = manifold.MDS(target_dimension, max_iter=100, n_init=1)
    X = mds.fit_transform(df)

    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    ax1.set_xlim([-0.1, 1])
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(df, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Select the scores of the samples in cluster i with a boolean mask
        # (this yields a copy, so the in-place sort is safe), then sort them.
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper), 0,
                          ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    # ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed
    colors = spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='k')

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = {} and n_iterations = {}").format(
                      n_clusters, n_iterations + 1),
                 fontsize=14, fontweight='bold')

    plt.show()
def clusters(master, model_name, fld_save, D=2, use_bias=True, n_batch=1):
    """Encode test data and save three diagnostic plots of the latent space.

    Saves to ``fld_save``: a histogram of latent values per data group, the
    normalised pairwise distance matrix, and a ``D``-dimensional embedding
    (MDS by default) coloured by group.  The last figure is also shown.

    master -- object providing ``dataset.feed_data``, ``model_encoder`` and
        ``prefix``.
    model_name -- title of the embedding plot.
    fld_save -- output folder for the PNG files.
    D -- number of MDS components.
    use_bias -- also encode the 'bias_nonc' group when an 'AE' encoder is
        available, and tag the file names with ``_wbias``.
    n_batch -- number of batches; ``BATCH_SIZE * n_batch`` samples are
        requested per group.
    """
    n_sample = BATCH_SIZE * n_batch
    # Other supported values: 'tSNE', 'isomap'.
    method = 'MDS'

    colors = {
        'base_conv': 'y',
        'base_resp': 'r',
        'bias_conv': 'k',
        'bias_nonc': 'b',
    }
    latent_d = {}

    print('building data...')
    encoder_inputs = master.dataset.feed_data(
        'test', max_n=n_sample, check_src=True,
        mix_ratio=(0., 1.))['inp_enc']
    latent_d['base_conv'] = master.model_encoder['S2S'].predict(
        encoder_inputs['ctxt'])
    if use_bias and 'AE' in master.prefix:
        latent_d['bias_nonc'] = master.model_encoder['AE'].predict(
            encoder_inputs['nonc'])

    # Second pass with mix_ratio (0., 0.) for the 'base_resp' group.
    encoder_inputs = master.dataset.feed_data(
        'test', max_n=n_sample, check_src=True,
        mix_ratio=(0., 0.))['inp_enc']
    if 'AE' in master.prefix:
        latent_d['base_resp'] = master.model_encoder['AE'].predict(
            encoder_inputs['resp'])

    labels = sorted(latent_d.keys())
    fname_suffix = args.restore.split('/')[-1].replace('.npz', '')
    if use_bias:
        fname_suffix += '_wbias'

    latent = np.concatenate([latent_d[name] for name in labels], axis=0)
    print('latent.shape', latent.shape)

    print('plotting bit hist...')
    bins = np.linspace(-1, 1, 31)
    for name, values in latent_d.items():
        # The white histogram is only used to obtain the bin counts; the
        # visible curve is the percentage per bin.
        freq, _, _ = plt.hist(values.ravel(), bins=bins, color='w')
        plt.plot(bins[:-1], 100. * freq / sum(freq), colors[name] + '.-')
    plt.ylim([0, 50])
    plt.savefig(fld_save + '/hist_%s.png' % fname_suffix)
    plt.close()

    print('plotting dist mat...')
    d_norm = np.sqrt(latent.shape[1])
    f, ax = plt.subplots()
    cax = ax.imshow(dist_mat(latent) / d_norm, cmap='bwr')
    f.colorbar(cax)

    # Two ticks per group: the group name at its middle, the running sample
    # count at its end.
    ticks = []
    ticklabels = []
    for idx, name in enumerate(labels):
        start = idx * n_sample
        ticks.extend([start + n_sample / 2, start + n_sample])
        ticklabels.extend([name + '\n', '%i' % (n_sample * (idx + 1))])

    ax.set_xticks(ticks)
    ax.set_xticklabels(ticklabels)
    ax.xaxis.tick_top()
    ax.set_yticks(ticks)
    ax.set_yticklabels([s.strip('\n') for s in ticklabels])

    plt.savefig(fld_save + '/dist_%s.png' % fname_suffix)
    plt.close()

    if method == 'tSNE':
        approx = manifold.TSNE(init='pca', verbose=1).fit_transform(latent)
    elif method == 'MDS':
        approx = manifold.MDS(D, verbose=1, max_iter=500,
                              n_init=1).fit_transform(latent)
    elif method == 'isomap':
        approx = manifold.Isomap().fit_transform(latent)
    else:
        raise ValueError

    f, ax = plt.subplots()
    # Empty (NaN) points that only create one legend entry per group.
    for name in labels:
        ax.plot(np.nan, np.nan, colors[name] + '.', label=name)

    # Draw the points in random order so no group is systematically on top.
    order = list(range(approx.shape[0]))
    np.random.shuffle(order)
    for j in order:
        group = labels[int(j / n_sample)]
        ax.plot(approx[j, 0], approx[j, 1], colors[group] + '.')

    plt.title(model_name)
    plt.savefig(fld_save + '/%s_%s.png' % (method, fname_suffix))
    plt.show()
def plot_iris_mds():
    """Save 3-D/2-D MDS and PCA scatter plots of the Iris data set.

    Two figures are written to ``CHART_DIR``: ``mds_demo_iris.png`` and
    ``pca_demo_iris.png``.  Each has a 3-D embedding on the left and a 2-D
    embedding on the right, with one colour/marker per class.
    """
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    def scatter_classes(axes, embedded, n_dims):
        # One scatter call per class, using the first n_dims columns.
        for cl, color, marker in zip(np.unique(y), colors, markers):
            members = embedded[y == cl]
            columns = [members[:, j] for j in range(n_dims)]
            axes.scatter(*columns, c=color, marker=marker, edgecolor='black')

    # MDS
    fig = pylab.figure(figsize=(10, 4))
    ax = fig.add_subplot(121, projection='3d')
    Xtrans = manifold.MDS(n_components=3).fit_transform(X)
    scatter_classes(ax, Xtrans, 3)
    pylab.title("MDS on Iris data set in 3 dimensions")
    ax.view_init(10, -15)

    Xtrans = manifold.MDS(n_components=2).fit_transform(X)
    ax = fig.add_subplot(122)
    scatter_classes(ax, Xtrans, 2)
    pylab.title("MDS on Iris data set in 2 dimensions")

    pylab.savefig(os.path.join(CHART_DIR, "mds_demo_iris.png"),
                  bbox_inches="tight")

    # PCA
    fig = pylab.figure(figsize=(10, 4))
    ax = fig.add_subplot(121, projection='3d')
    Xtrans = decomposition.PCA(n_components=3).fit(X).transform(X)
    scatter_classes(ax, Xtrans, 3)
    pylab.title("PCA on Iris data set in 3 dimensions")
    ax.view_init(50, -35)

    Xtrans = decomposition.PCA(n_components=2).fit_transform(X)
    ax = fig.add_subplot(122)
    scatter_classes(ax, Xtrans, 2)
    pylab.title("PCA on Iris data set in 2 dimensions")

    pylab.savefig(os.path.join(CHART_DIR, "pca_demo_iris.png"),
                  bbox_inches="tight")
def renderD3(self, enc_email_addr):
    """Cluster the domains loaded for an e-mail address into K = 3 groups.

    The base64-encoded address is decoded and passed to ``self.load``; the
    returned domains are compared pairwise with an edit distance, embedded
    in 2-D with MDS and grouped with a small k-means loop.

    enc_email_addr -- base64-encoded e-mail address.

    Returns a dict with keys ``"data"`` (JSON text mapping ``"0"``..``"2"``
    to lists of ``[x, y, domain]`` with the point nearest the centroid
    first) and ``"email"``.  On any ordinary error the fallback
    ``{"data": "null", "email": "Something's wrong with the URL!"}`` is
    returned; ``KeyboardInterrupt`` and ``SystemExit`` now propagate.
    """
    try:
        email = base64.b64decode(enc_email_addr)
        domain_list = self.load(email)
        count = len(domain_list)
        K = 3
        # distance matrix for all domains, filled row by row below
        similarity = []

        # edit distance between two domains (row-by-row dynamic programme)
        def domain_similarity(s1, s2):
            if len(s1) > len(s2):
                s1, s2 = s2, s1
            distances = range(len(s1) + 1)
            for i2, c2 in enumerate(s2):
                distances_ = [i2 + 1]
                for i1, c1 in enumerate(s1):
                    if c1 == c2:
                        distances_.append(distances[i1])
                    else:
                        distances_.append(1 + min((distances[i1],
                                                   distances[i1 + 1],
                                                   distances_[-1])))
                distances = distances_
            return distances[-1]

        # assign every point to its nearest centroid
        def cluster_points(X, mu):
            clusters = {}
            for x in X:
                bestmukey = min(
                    [(i[0], np.linalg.norm(x - mu[i[0]]))
                     for i in enumerate(mu)],
                    key=lambda t: t[1])[0]
                clusters.setdefault(bestmukey, []).append(x)
            return clusters

        # relocate centroids to the mean of their cluster
        def reevaluate_centers(mu, clusters):
            newmu = []
            for key in sorted(clusters.keys()):
                newmu.append(np.mean(clusters[key], axis=0))
            return newmu

        # centroids have converged when the set of centres is unchanged
        def has_converged(mu, oldmu):
            return (set([tuple(a) for a in mu]) ==
                    set([tuple(a) for a in oldmu]))

        # iterate until the centroids are stable
        def find_centroids(X, k):
            # random.sample needs a real sequence on Python 3, so sample
            # from a list of the rows rather than from the array itself.
            points = list(X)
            oldmu = random.sample(points, k)
            mu = random.sample(points, k)
            while not has_converged(mu, oldmu):
                oldmu = mu
                # Assign all points in X to clusters
                clusters = cluster_points(X, mu)
                # Reevaluate centers
                mu = reevaluate_centers(oldmu, clusters)
            return (mu, clusters)

        # Euclidean distance in the plane
        def Eu_distance(P1, P2):
            return np.sqrt(pow((P1[0] - P2[0]), 2) + pow((P1[1] - P2[1]), 2))

        # find the domain whose embedded coordinates equal `coordinates`
        def find_domain(coordinates):
            for m in range(0, count):
                if (coordinates[0] == coords[m][0]
                        and coordinates[1] == coords[m][1]):
                    return str(domain_list[m])

        def find_result(X, k):
            (M, C) = find_centroids(X, k)
            # NOTE(review): if a cluster ends up empty, C has fewer than k
            # keys and the lookups below raise KeyError, which the outer
            # handler turns into the fallback response.
            # change to integer coordinates and attach the domain name
            for l in range(0, k):
                for point_index in range(0, len(C[l])):
                    C[l][point_index] = [
                        int(C[l][point_index][0]),
                        int(C[l][point_index][1]),
                        find_domain(C[l][point_index])
                    ]
            # move the point closest to each centroid to the front
            for i in range(0, k):
                dis_array = []
                for point in C[i]:
                    dis_array.append(Eu_distance(point, M[i]))
                index = dis_array.index(min(dis_array))
                center_point = C[i].pop(index)
                C[i].insert(0, center_point)
                # JSON object keys are strings
                C[str(i)] = C.pop(i)
            return C

        # Build the symmetric matrix, computing each pair only once.
        for count_index1 in range(0, count):
            tmp = []
            for count_index2 in range(0, count):
                if count_index1 == count_index2:
                    simi = 0
                elif count_index1 < count_index2:
                    simi = domain_similarity(domain_list[count_index1],
                                             domain_list[count_index2])
                else:
                    simi = similarity[count_index2][count_index1]
                tmp.append(simi)
            similarity.append(tmp)

        # scale the distance matrix
        adist = np.array(similarity)
        adist = adist * 10

        # compute coordinates matrix
        mds = manifold.MDS(n_components=2, dissimilarity="precomputed",
                           random_state=6)
        results = mds.fit(adist)
        coords = results.embedding_

        output_data = find_result(coords, K)
        return {"data": json.dumps(output_data), "email": email}
    except Exception:
        # Best-effort endpoint: any ordinary failure yields the fallback.
        return {"data": "null", "email": "Something's wrong with the URL!"}
total_pt = [] total_data = [] for i in range(len(GPARAMS.Esoinn_setting.Model.learn_history_node)): total_pt.append([]) total_pt[-1].append(len(total_data)) total_data = total_data + list( GPARAMS.Esoinn_setting.Model.learn_history_node[i]) total_pt[-1].append(len(total_data)) total_pt.append([]) total_pt[-1].append(len(total_data)) total_data += list(GPARAMS.Esoinn_setting.Model.nodes) total_pt[-1].append(len(total_data)) similarities = euclidean_distances(np.array(total_data)) mds = manifold.MDS(n_components=2, max_iter=500, eps=1e-7, dissimilarity="precomputed", n_jobs=GPARAMS.Compute_setting.Ncoresperthreads) pos = mds.fit(similarities).embedding_ total_2D_data = [] for i in range(len(GPARAMS.Esoinn_setting.Model.learn_history_node)): total_2D_data.append([]) total_2D_data[-1] = pos[total_pt[i][0]:total_pt[i][1]] total_2D_data.append([]) total_2D_data[-1] = pos[total_pt[-1][0]:total_pt[-1][1]] with open("ESOI-Layer.History", 'wb') as file: pickle.dump(total_2D_data, file) else: with open("ESOI-Layer.History", 'rb') as file: total_2D_data = pickle.load(file)
return 1 # create Input class instance and vectorize quantizer (note: vectorizing # quantizer is default behavior but can be set to False if quantizer already vectorized) data_class = Input(data=X, is_categorical=True, is_synchronized=True, preproc=q) # decide on number of dimensions to use (default is 2) num_dim = 2 # instantiate another embedding class if desired # e.g. sklearn.manifold.MDS mds_emb = manifold.MDS(n_components=num_dim, dissimilarity="precomputed") # create SmashEmbedding class to run methods (require 2 dimensions in embedding) sec = SmashEmbedding(bin_path=bin_path, input_class=data_class, n_dim=num_dim, embed_class=mds_emb) # return distance matrix of input timeseries data (repeat calculation 3 times) # NOTE: fits both default Sippl Embedding and user-defined custom embedding class print(sec.fit(nr=3)) # return embedded coordinates using Sippl embedding (default) on distance matrix print(sec.fit_transform(nr=3, embedder='default')) # return embedded coordinates using Sippl embedding (default) on distance matrix
import matplotlib.pyplot as plt import mpl_toolkits.mplot3d as plt3d #dotpath='../dataset/total_graph.dot' dotpath = '../dataset/game_of_thrones_consistent.dot' similarities, G, nodes_index = gsm.get_similarity_matrix(dotpath) seed = np.random.RandomState(seed=3) print(nx.info(G)) #similarities, G, nodes_index = GetSimlarityMatrix(dotpath) mds = manifold.MDS(n_components=3, max_iter=300, eps=1e-9, random_state=seed, dissimilarity="precomputed", n_jobs=1) pos = mds.fit(similarities).embedding_ fig = plt.figure() ax = plt.axes(projection='3d') X, Y, Z = pos.T[0], pos.T[1], pos.T[2] color = [] ax.scatter3D(X, Y, Z, c='r', cmap='Greens') #for e in G.edges():
def mds():
    """Announce the choice and return an unfitted MDS embedder.

    The number of components is read from the name ``n_components`` in the
    enclosing scope.
    """
    print("MDS embedding is selected")
    return manifold.MDS(max_iter=100, n_init=1, n_components=n_components)
def __init__(self, dimensions=2, metric=True, clusters=2, **kwargs):
    """Create the MDS embedder and the Gaussian-mixture clusterer.

    ``dimensions`` and ``metric`` are forwarded to ``manifold.MDS`` together
    with any extra keyword arguments; the embedder is always configured for
    a precomputed dissimilarity matrix.  ``clusters`` is passed to
    ``GaussianMixture``.
    """
    embedder = manifold.MDS(
        dimensions,
        metric=metric,
        dissimilarity='precomputed',
        **kwargs
    )
    self.graph_to_points = embedder
    self.cluster_engine = GaussianMixture(clusters)
X_lle = clf.fit_transform(X) plot_embedding(X_lle, "LLE") clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=level, method='modified') X_mlle = clf.fit_transform(X) plot_embedding(X_mlle, "LLE modifiée") clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=level, method='hessian') X_hlle = clf.fit_transform(X) plot_embedding(X_hlle, "LLE Hessian") clf = manifold.MDS(n_components=level, n_init=20, max_iter=100) X_mds = clf.fit_transform(X) plot_embedding(X_mds, "MDS") hasher = ensemble.RandomTreesEmbedding(n_estimators=100) X_transformed = hasher.fit_transform(X) pca = decomposition.TruncatedSVD(n_components=level) X_reduced = pca.fit_transform(X_transformed) plot_embedding(X_reduced, "Random forest") embedder = manifold.SpectralEmbedding(n_components=level) X_se = embedder.fit_transform(X) plot_embedding(X_se, "Spectral embedding") plotly_embedding(X_se, "Spectral embedding") tsne = manifold.TSNE(n_components=level, init='pca', random_state=0)
corrFrame = pd.DataFrame(data=corrmatrix, columns=[ 'Number', 'Team', 'Age', 'Height', 'Weight', 'College', 'Country', 'Draft Year', 'Draft Round', 'Draft Number', 'GP', 'PTS', 'REB', 'AST', 'NetRtg', 'OREB%', 'DREB%', 'USG%', 'TS%', 'AST%' ]) correlationMatrixOfficial = corrFrame.as_matrix() origData = origDataFrame.as_matrix() correlationMatrixOfficial = abs(1 - correlationMatrixOfficial) mds = manifold.MDS(dissimilarity="precomputed") attrmds = mds.fit(correlationMatrixOfficial).embedding_ np.savetxt("AttributeMds.csv", attrmds, delimiter=',') mds = manifold.MDS(dissimilarity="euclidean") origmds = mds.fit(origData).embedding_ np.savetxt("EuclideanMds.csv", origmds, delimiter=',') corrFrame.to_csv(path_or_buf='correlationMatrix.csv') pcaPlotXY = [[0.0 for x in range(2)] for y in range(numpyArray.shape[0])]
pylab.scatter(x,y_test8[:n,1],marker='*',s=200, color='darkgreen',label='Real data 2') pylab.plot(x,y_test825[:n,0],lw=2, color='steelblue',label='Kernel Ridge 1') pylab.plot(x,y_test825[:n,1],lw=2, color='seagreen',label='Kernel Ridge 2') pylab.xlabel('Observations'); pylab.ylabel('Targets') pylab.title('Kernel Ridge Regressor. Test Results. Toy Regression 2') pylab.legend(loc=2,fontsize=10); pylab.show() """# Unsupervised Learning""" usl=[mixture.GaussianMixture(n_components=4,n_init=4), mixture.BayesianGaussianMixture(n_components=4,n_init=4), manifold.Isomap(),manifold.LocallyLinearEmbedding(), manifold.SpectralEmbedding(),manifold.MDS(),manifold.TSNE()] # Gaussian Mixture; Toy blobs usl[0].fit(X_train9,y_train9); y_test91=usl[0].predict(X_test9) usl[1].fit(X_train9,y_train9); y_test92=usl[1].predict(X_test9) pylab.figure(figsize=(12,12)) pylab.scatter(X_test9[:,0],X_test9[:,1],c=y_test9,cmap=pylab.cm.tab10) pylab.scatter(X_test9[:,0]+0.03,X_test9[:,1]+0.03, c=y_test91,alpha=0.4,cmap=pylab.cm.autumn) pylab.scatter(X_test9[:,0]+0.06,X_test9[:,1]+0.06, c=y_test92,alpha=0.4,cmap=pylab.cm.winter) pylab.scatter([1,-1,1,-1],[1,-1,-1,1],c='black',marker='*',s=150) pylab.show() """# Neural Networks supervised
def plot_2D_distance_projection(rmsd_m, clusters_list, colors, logname):
    """
    DESCRIPTION
    Create a 2D distance projection graph of the clusters with the MDS
    method and save it as ``<logname>/<last component of logname>-dist.png``.

    Args:
        rmsd_m (np.array) : rmsd matrix (between clusters)
        clusters_list (list of Cluster): list of Clusters; each one must
            expose a ``spread`` attribute
        colors : one color per cluster, indexable by cluster position. The
            legend sets the last component of two entries to 0.5, so RGBA
            sequences are presumably expected (TODO confirm with callers).
        logname (str) : output directory, also used to build the file name
    Return:
        None
    """
    labels = range(1, len(clusters_list) + 1)

    # 1 - value normalisation (make value between 0 and 1) of RMSD matrix
    rmsd_norm = rmsd_m / np.max(rmsd_m)
    # NOTE(review): the first call's return value is discarded. It is kept
    # because symmetrize_matrix is not visible here and may work in place.
    symmetrize_matrix(rmsd_norm)
    rmsd_norm = symmetrize_matrix(rmsd_norm)

    # 2 - create the MDS method and project the precomputed distances
    mds = manifold.MDS(n_components=2, dissimilarity="precomputed")
    rmsd_mds = mds.fit(rmsd_norm)

    # 3 - get X/Y coords
    coords = rmsd_mds.embedding_
    xs = coords[:, 0]
    ys = coords[:, 1]

    # 4 - marker area grows with the square of each cluster's spread
    spreads = np.array([clust.spread for clust in clusters_list])
    radii = np.pi * (25 * (spreads)**2)

    # 5 - plot graph (axes shrunk to 80% width to leave room for the legend)
    plt.figure()
    ax = plt.subplot(111)
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    ax.scatter(xs, ys, s=radii, c=colors, alpha=0.5)
    # Distinct loop names so the coordinate arrays are not shadowed.
    for label, px, py in zip(labels, xs, ys):
        plt.annotate(label, xy=(px, py), ha='left', va='bottom', fontsize=8)

    # set the same axis for X and Y
    lims = []
    lims.extend(ax.get_xlim())
    lims.extend(ax.get_ylim())
    ax.set_ylim((min(lims), max(lims)))
    ax.set_xlim((min(lims), max(lims)))

    plt.title("Relative distance between clusters")
    # Booleans, not 'off' strings: a non-empty string is truthy, so 'off'
    # leaves the ticks visible on current Matplotlib releases.
    plt.tick_params(
        axis='x',           # changes apply to the x-axis
        which='both',       # both major and minor ticks are affected
        bottom=False,       # ticks along the bottom edge are off
        top=False,          # ticks along the top edge are off
        labelbottom=False)  # labels along the bottom edge are off
    plt.tick_params(
        axis='y',           # changes apply to the y-axis
        which='both',       # both major and minor ticks are affected
        left=False,
        right=False,
        labelleft=False)    # labels along the left edge are off

    # 6 - legend showing the smallest and largest spread circles
    smallest = np.argmin(radii)
    largest = np.argmax(radii)
    # Work on copies so the caller's color container is not modified.
    min_color = np.array(colors[smallest], dtype=float)
    max_color = np.array(colors[largest], dtype=float)
    # add transparency
    min_color[-1] = 0.5
    max_color[-1] = 0.5
    leg_min = plt.scatter([], [], s=radii[smallest], edgecolor='black',
                          color=min_color)
    leg_max = plt.scatter([], [], s=radii[largest], edgecolor='black',
                          color=max_color)
    legend_labels = ["{:.2f}".format(min(spreads)),
                     "{:.2f}".format(max(spreads))]
    legend = ax.legend([leg_min, leg_max], legend_labels,
                       ncol=1, frameon=False, fontsize=8,
                       handlelength=2, loc="upper right", borderpad=1.8,
                       handletextpad=1, scatterpoints=1,
                       bbox_to_anchor=(1.3, 0.9))
    legend.set_title('Spread radius', prop={"size": "small"})

    # 7 - text for distance information (zero entries are ignored)
    nonzero_rmsd = rmsd_m[np.nonzero(rmsd_m)]
    min_rmsd = np.min(nonzero_rmsd)
    max_rmsd = np.max(nonzero_rmsd)
    # Backslashes are escaped explicitly; the resulting text is unchanged.
    text_distance = (
        "RMSD\n min : {:.2f}$ \\AA$\n max : {:.2f} $\\AA$".format(
            min_rmsd, max_rmsd))
    ax.annotate(text_distance, xy=(1.05, 0.5), xycoords="axes fraction",
                fontsize="small")

    plt.savefig("{0}/{1}-dist.png".format(logname, logname.split(os.sep)[-1]),
                format="png", dpi=DPI, transparent=True)
    plt.close()
# Reducer classes; this list is not referenced by the selection code below.
algorithms = [
    decomposition.TruncatedSVD, manifold.MDS, manifold.Isomap,
    manifold.LocallyLinearEmbedding, manifold.TSNE
]

# Command line: <data file> <algorithm index> <number of output components>
fname, algorithm, n_comps = sys.argv[1], int(sys.argv[2]), int(sys.argv[3])
x = np.loadtxt(fname)

if 0 <= algorithm <= 3:
    # Indices 0-3 all follow the same construct-then-fit_transform recipe.
    reducer_cls = (decomposition.TruncatedSVD, manifold.MDS,
                   manifold.Isomap, manifold.TSNE)[algorithm]
    model = reducer_cls(n_components=n_comps)
    X = model.fit_transform(x)
elif algorithm == 4:
    # Self-organising map on a square grid of side sqrt(n_points) / 2.
    n_points, input_size = x.shape
    som_size = int(np.sqrt(n_points) / 2)
    model = MiniSom(som_size, som_size, input_size, sigma=0.9,
                    learning_rate=0.5)
def k_means(weights, word):
    """Cluster ``weights`` with K-means (k=5), print diagnostics and plot
    the clusters on a 2-D MDS projection.

    Args:
        weights: feature matrix passed to ``KMeans.fit`` and
            ``MDS.fit_transform`` (presumably documents x terms - TODO
            confirm against the caller).
        word: sequence indexed by feature column, giving the term printed
            for each column.

    Returns:
        Always 1.
    """
    print('Start Kmeans:')
    true_k = 5  # number of clusters; still hard-coded
    clf = KMeans(n_clusters=true_k, max_iter=500, n_init=50)
    fitted = clf.fit(weights)
    print(fitted)

    # print centroid points
    print(clf.cluster_centers_)

    # print the cluster assigned to each sample (1-based sample number)
    print(clf.labels_)
    for sample_no, cluster_id in enumerate(clf.labels_, start=1):
        print(sample_no, cluster_id)

    # evaluate the number of clusters
    print(clf.inertia_)

    # print the ten highest-weighted terms of each centroid
    print("Top terms:")
    order_centroids = clf.cluster_centers_.argsort()[:, ::-1]
    for i in range(true_k):
        print("Cluster %d:" % i, )
        for ind in order_centroids[i, :10]:
            print(' %s' % word[ind], )
            # The original read the undefined name `weight`; the parameter
            # is `weights`.
            print(weights[i][ind])
        print()

    # project the samples to 2-D with MDS for plotting
    mds = manifold.MDS(n_components=2, dissimilarity='euclidean')
    newData = mds.fit_transform(weights)

    # produce the diagram
    # NOTE(review): the title says PCA but the projection above is MDS; the
    # string is left unchanged pending confirmation.
    plt.title('K-means Clustering with PCA', fontsize=20)
    plt.xlabel('Dimension 1', fontsize=15)
    plt.ylabel('Dimension 2', fontsize=15)
    # One plot call per cluster, in the original color order.
    for cluster_id, style in enumerate(('or', 'og', 'ob', 'ok', 'oy')):
        members = clf.labels_ == cluster_id
        plt.plot(newData[members, 0], newData[members, 1], style)
    plt.show()
    plt.close()
    return 1
#Operate on random 2.5% sample of headlines
seeder = 1918
random.seed(seeder)
nsamp = 6000
index = random.sample(range(0,len(headlines)),nsamp)
sample = [headlines[i] for i in index]

#Get transformed sparse matrix
sparse = tfidf.fit_transform(sample)

#Calculate matrix of pairwise Euclidean distances (dissimilarities)
similarities = euclidean_distances(sparse)

tic = timeit.default_timer()
#Project via multi-dimensional scaling
mds = manifold.MDS(n_components=2, max_iter=1000, eps=1e-6,
                   random_state=seeder, dissimilarity="precomputed",
                   n_jobs=1)
project = mds.fit(similarities).embedding_

#Re-express the embedding on its principal axes (PCA, not a varimax rotation)
pca = PCA(n_components=2)
project = pca.fit_transform(project)
toc = timeit.default_timer()
timer = '%.2f' %((toc - tic)/60)
logging.info("Time elapsed for MDS projection, %d sample size: %s mins",
             nsamp, timer)

test = []
#Row layout: [headline index, utf-8 headline, dim 1, dim 2].
#The original iterated over `project2`, which is never defined in this
#code; the projection computed above is `project`.
for (i,samplehead,row) in zip(index,sample,project):
    w = [i,samplehead.encode('utf-8')]
    for dim in row:
        w.append(dim)
def myfunction():
    """Run PCA and MDS on the three datasets returned by ``Task1.task1()``
    and return the collected results as a JSON string.

    The three datasets are referred to below as stratified, random and
    original. The loops hard-code 10 attributes, 250 rows for the
    stratified/random data and 999 rows for the original data.
    NOTE(review): those sizes are assumed to match what Task1.task1()
    returns - TODO confirm.

    Returns:
        str: ``json.dumps`` of the result dictionary built at the end.
    """
    randomData, stratifiedData, anotherX, targetForStrat, targetForRand, targetForOrg, attributeNames, latitude, longitude, stratLat, stratLong, numberInEachState, avArray = Task1.task1(
    )
    #randData_std, stratData_std, orgData_std, targetForStrat, targetForRand, targetForOrg, attributeNames = Task1.task1()

    # Standardize each dataset with StandardScaler.
    stratData_std = StandardScaler().fit_transform(stratifiedData)
    randData_std = StandardScaler().fit_transform(randomData)
    orgData_std = StandardScaler().fit_transform(anotherX)

    #pca = decomposition.PCA(n_components=3)
    pcaStrat = decomposition.PCA()
    pcaRand = decomposition.PCA()
    pcaOrg = decomposition.PCA()

    # Fit PCA with all components and read the eigenvalues
    # (explained_variance_) of each dataset.
    sklearn_pcaStrat = pcaStrat.fit_transform(stratData_std)
    sklearn_pcaRand = pcaRand.fit_transform(randData_std)
    sklearn_pcaOrg = pcaOrg.fit_transform(orgData_std)

    stratEigVal = pcaStrat.explained_variance_
    randEigVal = pcaRand.explained_variance_
    orgEigVal = pcaOrg.explained_variance_

    sumOfStratEig = 0
    sumOfRandEig = 0
    sumOfOrgEig = 0

    contSumOfStratEig = [None] * 10
    contSumOfRandEig = [None] * 10
    contSumOfOrgEig = [None] * 10

    # Running (cumulative) sum of the first 10 eigenvalues; the final value
    # is the total used for normalisation below.
    for i in range(0, 10):
        sumOfStratEig = stratEigVal[i] + sumOfStratEig
        sumOfRandEig = randEigVal[i] + sumOfRandEig
        sumOfOrgEig = orgEigVal[i] + sumOfOrgEig
        contSumOfStratEig[i] = sumOfStratEig
        contSumOfRandEig[i] = sumOfRandEig
        contSumOfOrgEig[i] = sumOfOrgEig

    stratVarArray = [None] * 10
    randVarArray = [None] * 10
    orgVarArray = [None] * 10

    # Share of the 10-eigenvalue total contributed by each eigenvalue.
    for i in range(0, 10):
        stratVarArray[i] = stratEigVal[i] / sumOfStratEig
        randVarArray[i] = randEigVal[i] / sumOfRandEig
        orgVarArray[i] = orgEigVal[i] / sumOfOrgEig

    sumStratVar = 0
    sumRandVar = 0
    sumOrgVar = 0

    # Sum of the shares (total variance).
    for i in range(0, 10):
        sumStratVar = sumStratVar + stratVarArray[i]
        sumRandVar = sumRandVar + randVarArray[i]
        sumOrgVar = sumOrgVar + orgVarArray[i]

    tempStratVarSum = 0
    tempRandVarSum = 0
    tempOrgVarSum = 0

    stratIntrDimCount = 0
    randIntrDimCount = 0
    orgIntrDimCount = 0

    # Record the index i at which the cumulative share first exceeds 75% of
    # the total. 0 doubles as the "not set yet" marker, so a hit at i == 0
    # is overwritten on the next iteration.
    for i in range(0, 10):
        tempStratVarSum = tempStratVarSum + stratVarArray[i]
        tempRandVarSum = tempRandVarSum + randVarArray[i]
        tempOrgVarSum = tempOrgVarSum + orgVarArray[i]
        if ((tempStratVarSum > (sumStratVar * .75))
                and (stratIntrDimCount == 0)):
            stratIntrDimCount = i
        if ((tempRandVarSum > (sumRandVar * .75))
                and (randIntrDimCount == 0)):
            randIntrDimCount = i
        if ((tempOrgVarSum > (sumOrgVar * .75)) and (orgIntrDimCount == 0)):
            orgIntrDimCount = i

    # Refit PCA with the recorded component counts and compute loading
    # factors: components scaled by sqrt(eigenvalue).
    pcaStratNew = decomposition.PCA(n_components=stratIntrDimCount)
    pcaRandNew = decomposition.PCA(n_components=randIntrDimCount)
    pcaOrgNew = decomposition.PCA(n_components=orgIntrDimCount)

    sklearn_pcaStrat = pcaStratNew.fit_transform(stratData_std)
    sklearn_pcaRand = pcaRandNew.fit_transform(randData_std)
    sklearn_pcaOrg = pcaOrgNew.fit_transform(orgData_std)

    stratLoadFact = pcaStratNew.components_.T * np.sqrt(
        pcaStratNew.explained_variance_)
    randLoadFact = pcaRandNew.components_.T * np.sqrt(
        pcaRandNew.explained_variance_)
    orgLoadFact = pcaOrgNew.components_.T * np.sqrt(
        pcaOrgNew.explained_variance_)

    # One [sum of squared loadings, attribute index] pair per attribute.
    stratSumOfSquaredLoad = [[0 for i in range(0, 2)] for j in range(0, 10)]
    randSumOfSquaredLoad = [[0 for i in range(0, 2)] for j in range(0, 10)]
    orgSumOfSquaredLoad = [[0 for i in range(0, 2)] for j in range(0, 10)]

    # Accumulate squared loadings per attribute, then store the attribute
    # index in slot 1.
    # NOTE(review): the left operand starts as a 2-element list; adding a
    # NumPy scalar presumably broadcasts over both slots and rebinds the
    # entry to an ndarray - confirm before restructuring.
    for i in range(0, 10):
        for j in range(0, stratIntrDimCount):
            stratSumOfSquaredLoad[i] = stratSumOfSquaredLoad[i] + (
                stratLoadFact[i][j])**2
        stratSumOfSquaredLoad[i][1] = i

    for i in range(0, 10):
        for j in range(0, randIntrDimCount):
            randSumOfSquaredLoad[i] = randSumOfSquaredLoad[i] + (
                randLoadFact[i][j])**2
        randSumOfSquaredLoad[i][1] = i

    for i in range(0, 10):
        for j in range(0, orgIntrDimCount):
            orgSumOfSquaredLoad[i] = orgSumOfSquaredLoad[i] + (
                orgLoadFact[i][j])**2
        orgSumOfSquaredLoad[i][1] = i

    # Sort ascending by summed squared loading (slot 0).
    stratSumOfSquaredLoad.sort(key=lambda x: x[0])
    randSumOfSquaredLoad.sort(key=lambda x: x[0])
    orgSumOfSquaredLoad.sort(key=lambda x: x[0])

    # The last three entries are the three highest-loading attributes.
    stratThreeHighAttr = np.array(stratSumOfSquaredLoad[-3:])
    randThreeHighAttr = np.array(randSumOfSquaredLoad[-3:])
    orgThreeHighAttr = np.array(orgSumOfSquaredLoad[-3:])

    stratThreeHighAttrData = [[0 for i in range(0, 3)]
                              for j in range(0, 250)]
    randThreeHighAttrData = [[0 for i in range(0, 3)] for j in range(0, 250)]
    orgThreeHighAttrData = [[0 for i in range(0, 3)] for j in range(0, 999)]

    # Copy the standardized columns of the three selected attributes.
    for j in range(0, 250):
        for i in range(0, 3):
            stratThreeHighAttrData[j][i] = stratData_std[j][int(
                stratThreeHighAttr[i][1])]
            randThreeHighAttrData[j][i] = randData_std[j][int(
                randThreeHighAttr[i][1])]

    for j in range(0, 999):
        for i in range(0, 3):
            orgThreeHighAttrData[j][i] = orgData_std[j][int(
                orgThreeHighAttr[i][1])]

    # Names of the three highest attributes.
    stratColumns = [None] * 3
    randColumns = [None] * 3
    orgColumns = [None] * 3

    for i in range(0, 3):
        stratColumns[i] = (attributeNames[int(stratThreeHighAttr[i][1])])
        randColumns[i] = (attributeNames[int(randThreeHighAttr[i][1])])
        orgColumns[i] = (attributeNames[int(orgThreeHighAttr[i][1])])

    strat3Data = pd.DataFrame(data=stratThreeHighAttrData,
                              columns=stratColumns)
    rand3Data = pd.DataFrame(data=randThreeHighAttrData, columns=randColumns)
    org3Data = pd.DataFrame(data=orgThreeHighAttrData, columns=orgColumns)

    targetForStrat2 = pd.DataFrame(data=targetForStrat, columns=['Target'])
    targetForRand2 = pd.DataFrame(data=targetForRand, columns=['Target'])
    targetForOrg2 = pd.DataFrame(data=targetForOrg, columns=['Target'])

    # Three selected attribute columns plus the Target column.
    strat3DataFinal = pd.concat([strat3Data, targetForStrat2[['Target']]],
                                axis=1)
    rand3DataFinal = pd.concat([rand3Data, targetForRand2[['Target']]],
                               axis=1)
    org3DataFinal = pd.concat([org3Data, targetForOrg2[['Target']]], axis=1)

    # For every row, the 9 (attribute i, attribute j) value pairs used by the
    # 3-attribute scatter plot.
    bigStrat3Array = [[0 for i in range(0, 9)] for j in range(0, 250)]
    bigRand3Array = [[0 for i in range(0, 9)] for j in range(0, 250)]
    bigOrg3Array = [[0 for i in range(0, 9)] for j in range(0, 999)]

    for m in range(0, 250):
        count = 0
        for j in range(0, 3):
            for i in range(0, 3):
                bigStrat3Array[m][count] = ([
                    strat3DataFinal.values[m][i], strat3DataFinal.values[m][j]
                ])
                bigRand3Array[m][count] = ([
                    rand3DataFinal.values[m][i], rand3DataFinal.values[m][j]
                ])
                count = count + 1

    for m in range(0, 999):
        count = 0
        for j in range(0, 3):
            for i in range(0, 3):
                bigOrg3Array[m][count] = ([
                    org3DataFinal.values[m][i], org3DataFinal.values[m][j]
                ])
                count = count + 1

    # Projection of each dataset onto its top 2 principal components.
    pcaVisStrat = decomposition.PCA(n_components=2)
    pcaVisRand = decomposition.PCA(n_components=2)
    pcaVisOrg = decomposition.PCA(n_components=2)

    principalDFStrat = pd.DataFrame(
        data=pcaVisStrat.fit_transform(stratData_std),
        columns=['Principal Component 1', 'Principal Component 2'])
    principalDFRand = pd.DataFrame(
        data=pcaVisRand.fit_transform(randData_std),
        columns=['Principal Component 1', 'Principal Component 2'])
    principalDFOrg = pd.DataFrame(
        data=pcaVisOrg.fit_transform(orgData_std),
        columns=['Principal Component 1', 'Principal Component 2'])

    # targetForStrat2 = pd.DataFrame(data=targetForStrat, columns = ['Target'])
    # targetForRand2 = pd.DataFrame(data=targetForRand, columns = ['Target'])
    # targetForOrg2 = pd.DataFrame(data=targetForOrg, columns = ['Target'])
    #print(targetForStrat)

    # The last column holds the Target value of each data point.
    finalDFStrat = pd.concat([principalDFStrat, targetForStrat2[['Target']]],
                             axis=1)
    finalDFRand = pd.concat([principalDFRand, targetForRand2[['Target']]],
                            axis=1)
    finalDFOrg = pd.concat([principalDFOrg, targetForOrg2[['Target']]],
                           axis=1)

    # MDS estimators working on precomputed distance matrices; each one is
    # reused for the Euclidean and the correlation run below.
    mds_dataStrat = manifold.MDS(n_components=2, dissimilarity='precomputed')
    mds_dataRand = manifold.MDS(n_components=2, dissimilarity='precomputed')
    mds_dataOrg = manifold.MDS(n_components=2, dissimilarity='precomputed')

    # MDS on Euclidean pairwise distances.
    stratSimEuc = pairwise_distances(stratData_std, metric='euclidean')
    randSimEuc = pairwise_distances(randData_std, metric='euclidean')
    orgSimEuc = pairwise_distances(orgData_std, metric='euclidean')

    stratDEuc = mds_dataStrat.fit_transform(stratSimEuc)
    randDEuc = mds_dataRand.fit_transform(randSimEuc)
    orgDEuc = mds_dataOrg.fit_transform(orgSimEuc)

    stratMDSdatEuc = pd.DataFrame(stratDEuc)
    randMDSdatEuc = pd.DataFrame(randDEuc)
    orgMDSdatEuc = pd.DataFrame(orgDEuc)

    finalMDSStratDataEuc = pd.concat(
        [stratMDSdatEuc, targetForStrat2[['Target']]], axis=1)
    finalMDSRandDataEuc = pd.concat(
        [randMDSdatEuc, targetForRand2[['Target']]], axis=1)
    finalMDSOrgDataEuc = pd.concat([orgMDSdatEuc, targetForOrg2[['Target']]],
                                   axis=1)

    # MDS on correlation pairwise distances.
    stratSimCor = pairwise_distances(stratData_std, metric='correlation')
    randSimCor = pairwise_distances(randData_std, metric='correlation')
    orgSimCor = pairwise_distances(orgData_std, metric='correlation')

    stratDCor = mds_dataStrat.fit_transform(stratSimCor)
    randDCor = mds_dataRand.fit_transform(randSimCor)
    orgDCor = mds_dataOrg.fit_transform(orgSimCor)

    stratMDSdatCor = pd.DataFrame(stratDCor)
    randMDSdatCor = pd.DataFrame(randDCor)
    orgMDSdatCor = pd.DataFrame(orgDCor)

    finalMDSStratDataCor = pd.concat(
        [stratMDSdatCor, targetForStrat2[['Target']]], axis=1)
    finalMDSRandDataCor = pd.concat(
        [randMDSdatCor, targetForRand2[['Target']]], axis=1)
    finalMDSOrgDataCor = pd.concat([orgMDSdatCor, targetForOrg2[['Target']]],
                                   axis=1)

    # Collect everything into a plain dict and serialise it to JSON
    # (the original comment says this is exported to the front end).
    data = {}
    # data["randData"] = randomData.tolist()
    # data["stratData"] = stratifiedData.tolist()
    # data["originalData"] = anotherX.tolist()
    data['stratEigVal'] = stratEigVal.tolist()
    data['randEigVal'] = randEigVal.tolist()
    data['orgEigVal'] = orgEigVal.tolist()
    data['stratLoadFact'] = stratLoadFact.tolist()
    data['randLoadFact'] = randLoadFact.tolist()
    data['orgLoadFact'] = orgLoadFact.tolist()
    data['stratSigNum'] = stratIntrDimCount
    data['randSigNum'] = randIntrDimCount
    data['orgSigNum'] = orgIntrDimCount
    data['sumOfStratEig'] = contSumOfStratEig
    data['sumOfRandEig'] = contSumOfRandEig
    data['sumOfOrgEig'] = contSumOfOrgEig
    # data['strat3HighAttr'] = stratThreeHighAttr.tolist()
    # data['rand3HighAttr'] = randThreeHighAttr.tolist()
    # data['org3HighAttr'] = orgThreeHighAttr.tolist()
    data['pca2StratValues'] = np.array(finalDFStrat).tolist()
    data['pca2RandValues'] = np.array(finalDFRand).tolist()
    data['pca2OrgValues'] = np.array(finalDFOrg).tolist()
    data['stratMDSDataEuc'] = np.array(finalMDSStratDataEuc).tolist()
    data['randMDSDataEuc'] = np.array(finalMDSRandDataEuc).tolist()
    data['orgMDSDataEuc'] = np.array(finalMDSOrgDataEuc).tolist()
    data['stratMDSDataCor'] = np.array(finalMDSStratDataCor).tolist()
    data['randMDSDataCor'] = np.array(finalMDSRandDataCor).tolist()
    data['orgMDSDataCor'] = np.array(finalMDSOrgDataCor).tolist()
    data['strat3LoadData'] = np.array(strat3DataFinal).tolist()
    data['rand3LoadData'] = np.array(rand3DataFinal).tolist()
    data['org3LoadData'] = np.array(org3DataFinal).tolist()
    data['strat3AttrNames'] = stratColumns
    data['rand3AttrNames'] = randColumns
    data['org3AttrNames'] = orgColumns
    data['bigStrat3Array'] = bigStrat3Array
    data['bigRand3Array'] = bigRand3Array
    data['bigOrg3Array'] = bigOrg3Array
    data['lat'] = latitude
    data['long'] = longitude
    data['stratLat'] = stratLat
    data['stratLong'] = stratLong
    data['numberInEachState'] = numberInEachState
    data['origData'] = anotherX.tolist()
    data['avArray'] = avArray
    json_data = json.dumps(data)
    return json_data
def features(self, sampleRate = 0.05):
    """Embed user coordinates in 2-D via MDS and extend the embedding to
    every sanitized row with a K-nearest-neighbours regressor.

    Steps: load the data, drop rows rejected by ``self._isSanitized``, run
    MDS on the pairwise distance matrix of a random subsample, fit KNN
    regressors (K in 5, 7, 11) from (latitude, longitude) to each MDS axis,
    keep the K with the best mean validation R-squared, then predict and
    min-max normalise positions for all sanitized rows.

    Side effects: sets ``self.stress`` to the MDS stress and prints progress.

    Args:
        sampleRate: fraction of the sanitized rows used for the MDS fit.

    Returns:
        The result of ``to_sparse()`` on a DataFrame with columns ``id``
        (as str), ``x`` and ``y``.
        NOTE(review): ``DataFrame.to_sparse`` no longer exists in pandas
        >= 1.0; it is kept because changing it would alter the return type.
    """
    path = self.path
    print("Importing Geographic Dataset")
    readGeoData = self._loadData(path,self.sample)
    userCoordinates = readGeoData[["id","latitude","longitude"]]
    del(readGeoData)

    print("Sanitizing Data")
    latitudeFilter = [self._isSanitized(lat) for lat in userCoordinates.latitude]
    longitudeFilter = [self._isSanitized(lon) for lon in userCoordinates.longitude]
    finalFilter = [lat and lon for lat, lon in zip(latitudeFilter, longitudeFilter)]
    sanitizedData = userCoordinates[finalFilter]

    print("Collecting Random Subsample")
    userCoordinateSample = sanitizedData.sample(int(len(sanitizedData)*sampleRate))
    userCoordinateSample = userCoordinateSample[["latitude","longitude"]]

    print("Generating Geographic Distance Matrix")
    # The np.float alias was removed in NumPy 1.24; builtin float is the
    # same dtype.
    transformedCoordinates = np.array(userCoordinateSample).astype(float)
    # NOTE(review): coordinates are converted to radians before being passed
    # to geodesic(); confirm that this geodesic expects radians.
    geoDistanceMatrix = pdist(
        transformedCoordinates,
        lambda a,b: geodesic((math.radians(a[0]),math.radians(a[1])),
                             (math.radians(b[0]),math.radians(b[1]))).meters)
    del(transformedCoordinates)

    print("Executing Multidimensional Scaling Procedure")
    reshapedGeoDistMatrix = squareform(geoDistanceMatrix)
    del(geoDistanceMatrix)
    seed = np.random.RandomState(seed=3)
    mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9,
                       random_state=seed, dissimilarity="precomputed",
                       n_jobs=1)
    fittedMds = mds.fit(reshapedGeoDistMatrix)
    del(reshapedGeoDistMatrix)
    self.stress = fittedMds.stress_
    pos = fittedMds.embedding_

    print("Initiating Embedding Estimation for Entire Database")
    dataset = pd.DataFrame({"latitude": userCoordinateSample["latitude"],
                            "longitude": userCoordinateSample["longitude"],
                            "Y1": pos[:, 0].tolist(),
                            "Y2": pos[:, 1].tolist()})

    print("Training Model")
    training, validation, test = self._train_validate_test_split(
        dataset, train_percent=0.70, validate_percent=0.15)
    trainX = training[["latitude","longitude"]]
    trainY1 = training["Y1"]
    trainY2 = training["Y2"]
    validationX = validation[["latitude","longitude"]]
    validationY1 = validation["Y1"]
    validationY2 = validation["Y2"]
    testX = test[["latitude","longitude"]]
    testY1 = test["Y1"]
    testY2 = test["Y2"]

    print("Validating Model")
    # One regressor per MDS axis for every candidate K, scored by the mean
    # validation R-squared of the two axes.
    candidates = []
    for n_neighbors in (5, 7, 11):
        knnY1 = neighbors.KNeighborsRegressor(n_neighbors, weights="distance")
        knnY2 = neighbors.KNeighborsRegressor(n_neighbors, weights="distance")
        predictedY1 = knnY1.fit(trainX, trainY1).predict(validationX)
        predictedY2 = knnY2.fit(trainX, trainY2).predict(validationX)
        # Each axis is scored against its own target. The original scored
        # the K=11 Y2 model against validationY1.
        score = np.mean([self._rSquared(predictedY1, validationY1),
                         self._rSquared(predictedY2, validationY2)])
        candidates.append((score, n_neighbors, knnY1, knnY2))

    # The first candidate equal to the maximum wins (5, then 7). If none
    # compares equal, the loop falls through to K=11, as the original
    # if/elif/else chain did.
    bestScore = max([candidate[0] for candidate in candidates])
    for score, bestK, knn1, knn2 in candidates:
        if score == bestScore:
            break
    print("Best K=%d" % bestK)
    del(candidates)

    print("Testing Model")
    # knn1/knn2 are already fitted on the training split, so the original's
    # identical refits are skipped.
    testPredictedY1 = knn1.predict(testX)
    testPredictedY2 = knn2.predict(testX)
    finalRSquared1 = self._rSquared(testPredictedY1,testY1)
    finalRSquared2 = self._rSquared(testPredictedY2,testY2)
    finalRSquared = np.mean([finalRSquared1, finalRSquared2])
    print("Final R-Squared: "+str(finalRSquared))
    del(testPredictedY1)
    del(testPredictedY2)

    print("Deploying Model")
    finalPos1 = knn1.predict(sanitizedData[['latitude','longitude']])
    finalPos2 = knn2.predict(sanitizedData[['latitude','longitude']])

    print("Normalizing Position Vectors")
    # Min-max scale each axis to [0, 1].
    normalizedPos1 = (finalPos1-finalPos1.min())/(finalPos1.max()-finalPos1.min())
    normalizedPos2 = (finalPos2-finalPos2.min())/(finalPos2.max()-finalPos2.min())
    normalizedIdPos = pd.DataFrame({"id": [str(row) for row in sanitizedData['id']],
                                    "x": normalizedPos1,
                                    "y": normalizedPos2})
    return normalizedIdPos.to_sparse()
t0 = time()
# n_neighbors is passed by keyword: Isomap's constructor arguments are
# keyword-only in scikit-learn >= 1.0.
trans_data = manifold.Isomap(n_neighbors=n_neighbors, n_components=2)\
    .fit_transform(sphere_data).T
t1 = time()
print("%s: %.2g sec" % ('ISO', t1 - t0))

ax = fig.add_subplot(257)
plt.scatter(trans_data[0], trans_data[1], c=colors, cmap=plt.cm.rainbow)
plt.title("%s (%.2g sec)" % ('Isomap', t1 - t0))
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
plt.axis('tight')

# Perform Multi-dimensional scaling.
t0 = time()
mds = manifold.MDS(n_components=2, max_iter=100, n_init=1)
trans_data = mds.fit_transform(sphere_data).T
t1 = time()
print("MDS: %.2g sec" % (t1 - t0))

ax = fig.add_subplot(258)
plt.scatter(trans_data[0], trans_data[1], c=colors, cmap=plt.cm.rainbow)
plt.title("MDS (%.2g sec)" % (t1 - t0))
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
plt.axis('tight')

# Perform Spectral Embedding.
t0 = time()
se = manifold.SpectralEmbedding(n_components=2,
                                n_neighbors=n_neighbors)
# LTSA embedding of the digits dataset
print("Computing LTSA embedding")
# n_neighbors is passed by keyword: LocallyLinearEmbedding's constructor
# arguments are keyword-only in scikit-learn >= 1.0.
clf = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                      n_components=2, method='ltsa')
t0 = time()
X_ltsa = clf.fit_transform(X)
print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
plot_embedding(
    X_ltsa,
    "Local Tangent Space Alignment of the digits (time %.2fs)" %
    (time() - t0))

#----------------------------------------------------------------------
# MDS embedding of the digits dataset
print("Computing MDS embedding")
clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
t0 = time()
X_mds = clf.fit_transform(X)
print("Done. Stress: %f" % clf.stress_)
plot_embedding(X_mds,
               "MDS embedding of the digits (time %.2fs)" % (time() - t0))

#----------------------------------------------------------------------
# Random Trees embedding of the digits dataset
print("Computing Totally Random Trees embedding")
hasher = ensemble.RandomTreesEmbedding(n_estimators=200, random_state=0,
                                       max_depth=5)
t0 = time()
X_transformed = hasher.fit_transform(X)
# decomposition.RandomizedPCA was removed in scikit-learn 0.20. TruncatedSVD
# is used in its place because it also accepts sparse input such as the
# RandomTreesEmbedding output.
pca = decomposition.TruncatedSVD(n_components=2)
def layout(self, layoutType='graph'):
    """Compute 2-D node positions and store them in ``self.pos``.

    With ``layoutType='graph'`` the nodes and edges of ``self.graph`` are
    laid out with graphviz ``neato`` (edge length = Euclidean distance
    between the endpoints' ``self.loc`` entries). Cached positions in
    ``self.previousPos`` are passed as starting positions. The method
    returns early, leaving positions untouched, when the node count equals
    the previous call's.

    The other layout types ('PCA', 'MDS', 'tSNE', anything else = PCA)
    project ``domain`` to 2-D.
    NOTE(review): ``domain``, ``nodeIndicesList`` and ``recomputePCA`` are
    not assigned anywhere in this method (the ``domain`` assignment is
    commented out), so those branches rely on names defined outside it -
    TODO confirm.

    Args:
        layoutType: 'graph', 'PCA', 'MDS' or 'tSNE'.

    Returns:
        None.
    """
    ###### need domain information for generating layout using DR
    # domain = self.f[:,:-1]
    if layoutType == 'graph':
        g = Graph('ER', engine='neato')
        seenNodes = set()

        def addNode(node):
            # Pass the cached position to neato when one exists.
            if node in self.previousPos:
                ppos = self.previousPos[node]
                g.node('%d' % node, pos="%f,%f" % (ppos[0], ppos[1]))
            else:
                g.node('%d' % node)
            seenNodes.add(node)

        # Targets first, then sources, as in the original declaration order.
        for s in self.graph:
            for ext in self.graph[s]:
                addNode(ext)
        for s in self.graph:
            addNode(s)

        for s in self.graph:
            for ext in self.graph[s]:
                g.edge(str(s), str(ext),
                       len=str(linalg.norm(self.loc[s] - self.loc[ext])))

        # Nothing to recompute when the number of distinct nodes is unchanged.
        self.currentNodeSize = len(seenNodes)
        if self.currentNodeSize == self.previousNodeSize:
            return
        self.previousNodeSize = self.currentNodeSize

        # Render to graphviz "plain" text and read the node coordinates back.
        g.format = 'plain'
        g.render('spine')
        # Context manager: the original never closed this file and shadowed
        # the builtin `input`.
        with open('spine.plain') as plainFile:
            for line in plainFile:
                fields = line.split()
                if fields and fields[0] == 'node':
                    self.pos[int(fields[1])] = [float(fields[2]),
                                                float(fields[3])]
                    ## FIXME update children extrema to the parent location
    else:
        if layoutType == 'PCA':
            # Reuse the cached PCA model unless a refit was requested.
            if recomputePCA:
                self.pca = decomposition.PCA(n_components=2)
                self.pca.fit(domain)
            pos2D = self.pca.transform(domain).tolist()
        elif layoutType == 'MDS':
            mds = manifold.MDS(n_components=2, max_iter=100, n_init=1)
            pos2D = mds.fit_transform(domain).tolist()
        elif layoutType == 'tSNE':
            tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
            pos2D = tsne.fit_transform(domain).tolist()
        else:
            # PCA is the default
            pca = decomposition.PCA(n_components=2)
            pos2D = pca.fit_transform(domain).tolist()
        for index, val in enumerate(pos2D):
            self.pos[nodeIndicesList[index]] = val

    # store position
    # NOTE(review): this aliases the same dict rather than copying it, so
    # later writes to self.pos also show up in self.previousPos.
    self.previousPos = self.pos
    # if only layout one point clear the position cache
    if len(self.graph.keys()) <= 1:
        self.previousPos = dict()
[587, 0, 920, 940, 1745, 1188, 713, 1858, 1737, 597], [1212, 920, 0, 878, 831, 1726, 1631, 949, 1021, 1494], [701, 940, 878, 0, 1374, 968, 1420, 1645, 1891, 1220], [1936, 1745, 831, 1374, 0, 2339, 2451, 347, 959, 2300], [604, 1188, 1726, 968, 2339, 0, 1092, 2594, 2734, 923], [748, 713, 1631, 1420, 2451, 1092, 0, 2571, 2408, 205], [2139, 1858, 949, 1645, 347, 2594, 2571, 0, 678, 2442], [2182, 1737, 1021, 1891, 959, 2734, 2408, 678, 0, 2329], [543, 597, 1494, 1220, 2300, 923, 205, 2442, 2329, 0]]) # check to see that the distance structure has been entered correctly print(distance_matrix) print(type(distance_matrix)) # apply the multidimensional scaling algorithm and plot the map mds_method = manifold.MDS(n_components = 2, random_state = 9999,\ dissimilarity = 'precomputed') mds_fit = mds_method.fit(distance_matrix) mds_coordinates = mds_method.fit_transform(distance_matrix) city_label = [ 'Atlanta', 'Chicago', 'Denver', 'Houston', 'Los Angeles', 'Miami', 'New York', 'San Francisco', 'Seattle', 'Washington D.C.' ] # plot mds solution in two dimensions using city labels # defined by multidimensional scaling plt.figure() plt.scatter(mds_coordinates[:,0],mds_coordinates[:,1],\ facecolors = 'none', edgecolors = 'none') # points in white (invisible) labels = city_label for label, x, y in zip(labels, mds_coordinates[:, 0], mds_coordinates[:, 1]):
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean, pdist, squareform
from matplotlib import pyplot as plt
from sklearn import manifold

data = pd.read_excel("matrix.xlSx")


def similarity_func(u, v):
    """Return a similarity in (0, 1] that shrinks as the Euclidean distance
    between ``u`` and ``v`` grows."""
    return 1 / (1 + euclidean(u, v))


def dissimilarity_func(u, v):
    """Return ``1 - similarity_func(u, v)``: 0 for identical rows, rising
    towards 1 with distance."""
    return 1 - similarity_func(u, v)


# MDS with dissimilarity="precomputed" expects dissimilarities (larger means
# further apart, zero on the diagonal). The original passed the similarity
# matrix itself, so the similarities are converted first.
# The first column is skipped; only the remaining columns are compared.
dists = pdist(data[data.columns[1:]], dissimilarity_func)
dissimilarities = pd.DataFrame(squareform(dists))

mds = manifold.MDS(n_components=2, max_iter=200, eps=1e-9,
                   dissimilarity="precomputed", n_jobs=1)
pos = mds.fit(dissimilarities).embedding_

plt.scatter(pos[:, 0], pos[:, 1], color='turquoise', s=111, lw=0,
            label='MDS')
plt.savefig('plot.png')
plt.close()