def create_2dprojection(distmat):
    # uses metric MDS (not isomap; despite the function name, the embedding is 3D)
    # to return a species distance map based on the topological distmat of all
    # species in the tree
    print('map to 3d space')
    mapper = MDS(n_components=3, metric=True, n_init=4, max_iter=300, verbose=0,
                 eps=0.001, n_jobs=-1, random_state=0, dissimilarity='precomputed')
    projmat = mapper.fit_transform(distmat)
    print('DONE')
    return projmat
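A minimal usage sketch for the function above, assuming a small synthetic distance matrix (the symmetric toy_dist array below is illustrative, not from the original code):

import numpy as np
from sklearn.manifold import MDS

# hypothetical 4-species topological distance matrix (symmetric, zero diagonal)
toy_dist = np.array([[0, 2, 4, 6],
                     [2, 0, 3, 5],
                     [4, 3, 0, 2],
                     [6, 5, 2, 0]], dtype=float)

proj = create_2dprojection(toy_dist)
print(proj.shape)  # (4, 3): one 3D coordinate per species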
def main():
    # load sample data
    data = np.loadtxt("distmat799.txt", delimiter=",")
    dists = data / np.amax(data)

    # load images
    img_files = [img for img in os.listdir("799_patch") if re.search(r"\.png", img)]

    # mds
    mds = MDS(n_components=2, dissimilarity="precomputed")
    results = mds.fit(dists)

    # plot
    fig, ax = plt.subplots()
    for i, img_file in enumerate(img_files):
        img_file = os.path.join("799_patch", img_file)
        img = read_png(img_file)
        imagebox = OffsetImage(img, zoom=2.0)
        coords = results.embedding_[i, :]
        xy = tuple(coords)
        ab = AnnotationBbox(imagebox, xy)
        ax.add_artist(ab)
    ax.set_xlim(-1.0, 1.0)
    ax.set_ylim(-1.0, 1.0)
    plt.show()
def plotFlatClusterGraph(tf_idf_matrix, clusters, headlines_utf):
    dist = 1 - cosine_similarity(tf_idf_matrix)
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(dist)
    xs, ys = pos[:, 0], pos[:, 1]

    cluster_colors = {0: '#FE642E', 1: '#B40404', 2: '#D7DF01', 3: '#01DF01',
                      4: '#00FFBF', 5: '#2E64FE', 6: '#8904B1', 7: '#FA58F4',
                      8: '#FE2E9A', 9: '#A4A4A4'}

    # create data frame that has the result of the MDS plus the cluster numbers and titles
    df = pandas.DataFrame(dict(x=xs, y=ys, label=clusters, title=headlines_utf))
    groups = df.groupby('label')

    # set up plots
    fig, ax = plt.subplots(figsize=(17, 9))  # set size

    # iterate through groups to layer the plots
    for name, group in groups:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
                color=cluster_colors[name], mec='none')
        ax.set_aspect('auto')
        ax.tick_params(axis='x', which='both', bottom=False, top=False,
                       labelbottom=False)
        ax.tick_params(axis='y', which='both', left=False, right=False,
                       labelleft=False)

    ax.legend(numpoints=1)  # show legend with only 1 point

    # add label in x,y position with the label as the film title
    for t_n in range(len(df)):
        ax.text(df.iloc[t_n]['x'], df.iloc[t_n]['y'], df.iloc[t_n]['title'], size=8)

    plt.savefig('../plots/flat_clusters.png', dpi=400)
def reorder_channels_by_xyz_coord(data, channel_names=None):
    """
    :param data: 2-d array in the format [n_samples, n_channels]
    :param channel_names: names of the EEG channels
    :return: data, channel_names permuted accordingly
    """
    # work on transposed view, i.e. [channel, samples]
    data = data.T

    # map channels to 1-d coordinates through MDS
    from sklearn.manifold import MDS
    distances = compute_electrode_distance_matrix()
    mds = MDS(n_components=1, dissimilarity='precomputed')
    projection = mds.fit_transform(distances).reshape(data.shape[0])
    order = np.argsort(projection)
    print(mds.stress_)
    print(order)

    # re-order channels
    data = data[order]
    # restore initial axes layout
    data = data.T
    # re-order channel_names
    channel_names = reorder_channel_names(channel_names, order)

    return data, channel_names
def plot_cities():
    #distance_matrix = get_distances()
    cities = 'BOS CHI DC DEN LA MIA NY SEA SF'.split()
    distance_matrix = np.array([
        [0   , 963 , 429 , 1949, 2979, 1504, 206 , 2976, 3095],
        [963 , 0   , 671 , 996 , 2054, 1329, 802 , 2013, 2142],
        [429 , 671 , 0   , 1616, 2631, 1075, 233 , 2684, 2799],
        [1949, 996 , 1616, 0   , 1059, 2037, 1771, 1307, 1235],
        [2979, 2054, 2631, 1059, 0   , 2687, 2786, 1131, 379 ],
        [1504, 1329, 1075, 2037, 2687, 0   , 1308, 3273, 3053],
        [206 , 802 , 233 , 1771, 2786, 1308, 0   , 2815, 2934],
        [2976, 2013, 2684, 1307, 1131, 3273, 2815, 0   , 808 ],
        [3095, 2142, 2799, 1235, 379 , 3053, 2934, 808 , 0   ]
    ])
    # assert symmetric (range(9) so the last row/column is checked as well)
    for (i, j) in [(i, j) for i in range(9) for j in range(9)]:
        try:
            assert(distance_matrix[i][j] == distance_matrix[j][i])
        except AssertionError:
            print((i, j))
    print(distance_matrix)

    mds = MDS(dissimilarity='precomputed')
    mds.fit(distance_matrix)
    print(mds.embedding_)

    for idx, points in enumerate(mds.embedding_):
        plt.plot(points[0], points[1], 'r.')
        plt.text(points[0], points[1], cities[idx])
    plt.show()
    return
def scale_plot(input_data, data_colors=None, cluster_colors=None,
               cluster_sizes=None, dissimilarity='euclidean', filey=None):
    """ Plot MDS of data and clusters.

    Note: relies on a module-level `n_clusters`; the first n_clusters rows of
    input_data are treated as the cluster centers.
    """
    if data_colors is None:
        data_colors = 'r'
    if cluster_colors is None:
        cluster_colors = 'b'
    if cluster_sizes is None:
        cluster_sizes = 2200

    # scale
    mds = MDS(dissimilarity=dissimilarity)
    mds_out = mds.fit_transform(input_data)

    with sns.axes_style('white'):
        f = plt.figure(figsize=(14, 14))
        plt.scatter(mds_out[n_clusters:, 0], mds_out[n_clusters:, 1],
                    s=75, color=data_colors)
        plt.scatter(mds_out[:n_clusters, 0], mds_out[:n_clusters, 1],
                    marker='*', s=cluster_sizes, color=cluster_colors,
                    edgecolor='black', linewidth=2)
        # plot cluster number
        offset = .011
        font_dict = {'fontsize': 17, 'color': 'white'}
        for i, (x, y) in enumerate(mds_out[:n_clusters]):
            if i < 9:
                plt.text(x - offset, y - offset, i + 1, font_dict)
            else:
                plt.text(x - offset * 2, y - offset, i + 1, font_dict)
    if filey is not None:
        plt.title(path.basename(filey)[:-4], fontsize=20)
        save_figure(f, filey)
        plt.close()
def embed_two_dimensions(data, vectorizer, size=10, n_components=5, colormap='YlOrRd'):
    if hasattr(data, '__iter__'):
        iterable = data
    else:
        raise Exception('ERROR: Input must be iterable')
    import itertools
    iterable_1, iterable_2 = itertools.tee(iterable)

    # get labels
    labels = []
    for graph in iterable_2:
        label = graph.graph.get('id', None)
        if label:
            labels.append(label)

    # transform iterable into sparse vectors
    data_matrix = vectorizer.transform(iterable_1)

    # embed high dimensional sparse vectors in 2D
    from sklearn import metrics
    distance_matrix = metrics.pairwise.pairwise_distances(data_matrix)

    from sklearn.manifold import MDS
    feature_map = MDS(n_components=n_components, dissimilarity='precomputed')
    explicit_data_matrix = feature_map.fit_transform(distance_matrix)

    from sklearn.decomposition import TruncatedSVD
    pca = TruncatedSVD(n_components=2)
    low_dimension_data_matrix = pca.fit_transform(explicit_data_matrix)

    plt.figure(figsize=(size, size))
    embed_dat_matrix_two_dimensions(low_dimension_data_matrix, labels=labels,
                                    density_colormap=colormap)
    plt.show()
def main():
    digits = load_digits()
    X = digits.data
    y = digits.target
    mds = MDS()
    X_mds = mds.fit_transform(X)
    plot_embedding(X_mds, y)
def labtest_MDS(PID):
    data = [patients[pid]['tests'] for pid in PID]
    X = pp.scale(data)
    mds = MDS(n_components=2, metric=True, n_init=4, max_iter=300, verbose=0,
              eps=0.001, n_jobs=1, dissimilarity='euclidean')
    pos = mds.fit(X).embedding_
    return pos
def main():
    args = docopt(__doc__)
    is_mds = args['--mds']

    # load datasets
    digits = load_digits()
    X = digits.data
    y = digits.target
    labels = digits.target_names

    # dimension reduction
    if is_mds:
        model = MDS(n_components=2)
    else:
        model = PCA(n_components=2)
    X_fit = model.fit_transform(X)

    for i in range(labels.shape[0]):
        plt.scatter(X_fit[y == i, 0], X_fit[y == i, 1],
                    color=COLORS[i], label=str(i))
    plt.legend(loc='upper left')
    plt.autoscale()
    plt.grid()
    plt.show()
def plotMap(maparr, freq, nest, seqs, dbfile, map2d, outfile, plotm='T'):
    # multi-dimensional scaling
    similarities = euclidean_distances(np.matrix(maparr))
    mds = MDS(n_components=2, max_iter=3000, eps=1e-9,
              random_state=np.random.RandomState(seed=3),
              dissimilarity="precomputed", n_jobs=1)
    pos = mds.fit(similarities).embedding_

    # plot attributes
    N = len(pos)
    #size = [20*n for n in freq]
    size = 8000
    color = np.array(range(N))

    if str(plotm) == 'T':
        # plot MDS
        fig, ax = plt.subplots(figsize=(10, 10))
        warnings.filterwarnings("ignore")
        scatter = ax.scatter(np.array(pos[:, 0]), np.array(pos[:, 1]), c=color,
                             s=size, alpha=0.3, cmap=plt.cm.viridis, marker='s')
        plt.xlabel('Dimension 1', fontsize=20, labelpad=20)
        plt.ylabel('Dimension 2', fontsize=20, labelpad=20)
        #plt.axis([xmin, xmax, ymin, ymax])
        plt.tick_params(labelsize=15, length=14, direction='out', pad=15,
                        top=False, right=False)
        # save figures
        fig.savefig(outfile + '.png', bbox_inches='tight', format='png')
        fig.savefig(outfile + '.pdf', bbox_inches='tight', format='pdf')
        plt.close(fig)
        warnings.resetwarnings()

    # write csv file
    writePlotMDS(freq, nest, seqs, dbfile, pos, maparr, map2d, outfile)
    return pos
def project_in_2D(distance_mat, method='mds'):
    """
    Project SDRs onto a 2D space using manifold learning algorithms
    :param distance_mat: A square matrix with pairwise distances
    :param method: Select method from 'mds' and 'tSNE'
    :return: an array with dimension (numSDRs, 2). It contains the 2D
        projections of each SDR
    """
    seed = np.random.RandomState(seed=3)

    if method == 'mds':
        mds = MDS(n_components=2, max_iter=3000, eps=1e-9,
                  random_state=seed, dissimilarity="precomputed", n_jobs=1)
        pos = mds.fit(distance_mat).embedding_

        nmds = MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12,
                   dissimilarity="precomputed", random_state=seed, n_jobs=1,
                   n_init=1)
        pos = nmds.fit_transform(distance_mat, init=pos)
    elif method == 'tSNE':
        tsne = TSNE(n_components=2, init='pca', random_state=0)
        pos = tsne.fit_transform(distance_mat)
    else:
        raise NotImplementedError
    return pos
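A hedged usage sketch for project_in_2D, using a random symmetric distance matrix as stand-in input (the data below is illustrative, not from the original code):

import numpy as np

rng = np.random.RandomState(0)
points = rng.rand(50, 5)
# pairwise Euclidean distances give a valid square, symmetric distance matrix
dist = np.linalg.norm(points[:, None, :] - points[None, :, :], axis=-1)

pos_mds = project_in_2D(dist, method='mds')    # (50, 2): metric then non-metric MDS
pos_tsne = project_in_2D(dist, method='tSNE')  # (50, 2): t-SNE projection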
def visualize_clusters(tfidf_matrix, vocabulary, km):
    # calculate the cosine distance between each document;
    # this will be used for plotting on a euclidean (2-dimensional) plane.
    dist = 1 - cosine_similarity(tfidf_matrix)
    clusters = km.labels_.tolist()

    # two components as we are plotting points in a two-dimensional plane
    # 'precomputed' because we provide a distance matrix
    # we will also specify 'random_state' so the plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]

    # set up colors per clusters using a dict
    cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3',
                      3: '#e7298a', 4: '#66a61e', 5: '#99cc00'}

    # set up cluster names using a dict (perhaps using the top terms of each cluster)
    cluster_names = {0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5'}

    # create data frame that has the result of the MDS plus the cluster numbers
    df = pd.DataFrame(dict(x=xs, y=ys, label=clusters))

    # group by cluster
    groups = df.groupby('label')

    # set up plot
    fig, ax = plt.subplots(figsize=(17, 9))  # set size
    ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling

    # iterate through groups to layer the plot
    # note that I use the cluster_name and cluster_color dicts with the 'name'
    # lookup to return the appropriate color/label
    for name, group in groups:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
                label=cluster_names[name], color=cluster_colors[name],
                mec='none')
        ax.set_aspect('auto')
        ax.tick_params(
            axis='x',          # changes apply to the x-axis
            which='both',      # both major and minor ticks are affected
            bottom=False,      # ticks along the bottom edge are off
            top=False,         # ticks along the top edge are off
            labelbottom=False)
        ax.tick_params(
            axis='y',          # changes apply to the y-axis
            which='both',      # both major and minor ticks are affected
            left=False,        # ticks along the left edge are off
            right=False,       # ticks along the right edge are off
            labelleft=False)

    ax.legend(numpoints=1)  # show legend with only 1 point

    plt.show()  # show the plot
def generate_cluster_plot_frame(self):
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    dist = 1 - cosine_similarity(self.tfidf_matrix)
    pos = mds.fit_transform(dist)
    xs, ys = pos[:, 0], pos[:, 1]
    self.cluster_plot_frame = pd.DataFrame(dict(x=xs, y=ys, label=self.clusters,
                                                chapter=self.chapter_list,
                                                book=self.book_list))
def non_param_multi_dim_scaling(dists, n_dims=3, n_threads=None, metric=True):
    mds = MDS(n_components=n_dims, metric=metric, n_jobs=n_threads,
              dissimilarity='precomputed')
    mds.fit(squareform(dists))
    projs = mds.embedding_
    res = {'stress': mds.stress_,
           'projections': projs}
    return res
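The function above expects a condensed distance vector (it calls squareform before fitting). A small sketch of how it might be invoked, assuming scipy's pdist output (the random data is illustrative):

import numpy as np
from scipy.spatial.distance import pdist

X = np.random.rand(10, 4)
condensed = pdist(X)  # length n*(n-1)/2 condensed distance vector
res = non_param_multi_dim_scaling(condensed, n_dims=2, n_threads=1)
print(res['stress'], res['projections'].shape)  # stress value, (10, 2)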
def plot_clusters(num_clusters, feature_matrix, cluster_data, movie_data,
                  plot_size=(16, 8)):
    # generate random color for clusters
    def generate_random_color():
        color = '#%06x' % random.randint(0, 0xFFFFFF)
        return color

    # define markers for clusters
    markers = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd']

    # build cosine distance matrix
    cosine_distance = 1 - cosine_similarity(feature_matrix)

    # dimensionality reduction using MDS
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    # get coordinates of clusters in new low-dimensional space
    plot_positions = mds.fit_transform(cosine_distance)
    x_pos, y_pos = plot_positions[:, 0], plot_positions[:, 1]

    # build cluster plotting data
    cluster_color_map = {}
    cluster_name_map = {}
    for cluster_num, cluster_details in cluster_data.items():
        # assign cluster features to unique label
        cluster_color_map[cluster_num] = generate_random_color()
        cluster_name_map[cluster_num] = ', '.join(cluster_details['key_features'][:5]).strip()

    # map each unique cluster label with its coordinates and movies
    cluster_plot_frame = pd.DataFrame({'x': x_pos,
                                       'y': y_pos,
                                       'label': movie_data['Cluster'].values.tolist(),
                                       'title': movie_data['Title'].values.tolist()})
    grouped_plot_frame = cluster_plot_frame.groupby('label')

    # set plot figure size and axes
    fig, ax = plt.subplots(figsize=plot_size)
    ax.margins(0.05)

    # plot each cluster using co-ordinates and movie titles
    for cluster_num, cluster_frame in grouped_plot_frame:
        marker = markers[cluster_num] if cluster_num < len(markers) \
            else np.random.choice(markers, size=1)[0]
        ax.plot(cluster_frame['x'], cluster_frame['y'],
                marker=marker, linestyle='', ms=12,
                label=cluster_name_map[cluster_num],
                color=cluster_color_map[cluster_num], mec='none')
    ax.set_aspect('auto')
    ax.tick_params(axis='x', which='both', bottom=False, top=False,
                   labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, right=False,
                   labelleft=False)
    fontP = FontProperties()
    fontP.set_size('small')
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.01), fancybox=True,
              shadow=True, ncol=5, numpoints=1, prop=fontP)

    # add labels as the film titles
    for index in range(len(cluster_plot_frame)):
        ax.text(cluster_plot_frame.iloc[index]['x'],
                cluster_plot_frame.iloc[index]['y'],
                cluster_plot_frame.iloc[index]['title'], size=8)

    # show the plot
    plt.show()
def md_scaling(co_matrix, is_distance_matrix=False):
    if not is_distance_matrix:
        distance_matrix = -np.log(co_matrix.matrix)
    else:
        distance_matrix = co_matrix
    mds = MDS(dissimilarity='precomputed')
    mds.fit(distance_matrix)
    return mds.embedding_
def mds_embed(graph):
    sorted_node_list = sorted(list(graph.nodes()), key=len)
    dmat = nx.floyd_warshall_numpy(graph, nodelist=sorted_node_list)
    gmds = MDS(n_jobs=-2, dissimilarity="precomputed")
    embed_pts = gmds.fit_transform(dmat)
    return (embed_pts, dmat, sorted_node_list)
def mds(similarity, euclid=False):
    if euclid:
        model = MDS(max_iter=1000)
        result = model.fit_transform(similarity)
    else:
        model = MDS(max_iter=1000, dissimilarity='precomputed')
        result = model.fit_transform(1 - similarity)
    return result.T
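A small usage sketch for the mds() helper above, assuming a similarity matrix with values in [0, 1] so that 1 - similarity is a valid dissimilarity (the cosine-similarity input below is illustrative):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

docs = np.random.RandomState(0).rand(6, 12)  # 6 "documents", 12 features
sim = cosine_similarity(docs)                # similarities in [0, 1], diagonal 1
coords = mds(sim)                            # fits on 1 - sim as precomputed distances
print(coords.shape)                          # (2, 6): note the returned transpose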
def compute_2d_mapping(layout):
    sphere_coords = layout.sphere_coords()
    radius = layout.sphere_radius()

    from sklearn.manifold import MDS
    distances = compute_electrode_distance_matrix(sphere_coords, radius)
    mds = MDS(n_components=2, dissimilarity='precomputed')
    projection = mds.fit_transform(distances)
    # print(projection.shape)
    return projection
def cluster(D, k=3, verbose=False):
    """Cluster LDS's via Multi-Dimensional Scaling and KMeans.

    Strategy:
        1. Build NxN matrix of pairwise similarities
        2. Run MDS to embed data in R^2
        3. Run KMeans with k cluster centers
        4. Find samples closest to the k centers

    Parameters:
    -----------
    D: numpy.ndarray, shape = (N, N)
        Precomputed distance matrix.

    k: int (default: 3)
        Number of desired cluster centers.

    verbose: boolean
        Enable verbose output.

    Returns:
    --------
    eData: numpy.ndarray, shape = (N, 2)
        N samples embedded in R^2.

    ids: numpy.ndarray, shape = (k,)
        List of indices identifying the k representatives.
    """
    assert D.shape[0] == D.shape[1], "Oops (distance matrix not square)!"

    # build MDS for precomputed similarity matrix
    mds = MDS(metric=True, n_components=2, verbose=True,
              dissimilarity="precomputed")

    def __symmetrize(A):
        # mirrors the filled triangle; assumes only one triangle of A is populated
        return A + A.T - np.diag(A.diagonal())

    # run MDS on symmetrized similarity matrix
    eData = mds.fit(__symmetrize(D)).embedding_

    kmObj = KMeans(k)
    kmObj.fit_predict(eData)

    ids = np.zeros((k,), dtype=int)
    for i in range(k):
        # sanity check
        cDat = eData[np.where(kmObj.labels_ == i)[0], :]
        assert len(cDat) > 0, "Oops, empty cluster ..."
        kCen = kmObj.cluster_centers_[i, :]
        # reshape to 2D as required by euclidean_distances
        x = euclidean_distances(eData, kCen.reshape(1, -1))
        ids[i] = int(np.argsort(x.ravel())[0])

    # return embedding and IDs of representative LDS's
    return (eData, ids)
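A usage sketch for cluster(), assuming a small synthetic distance matrix (illustrative only). Since the internal __symmetrize helper mirrors the upper triangle, an already-symmetric matrix is first reduced to its upper triangle here:

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

pts = np.random.RandomState(1).rand(30, 3)
D_full = euclidean_distances(pts)
D = np.triu(D_full)  # keep only the upper triangle; __symmetrize restores the rest

eData, ids = cluster(D, k=3)
print(eData.shape, ids)  # (30, 2) embedding, indices of 3 representatives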
def get_mds(similarities):
    seed = np.random.RandomState(seed=3)
    print(np.amax(similarities))
    print(np.amin(similarities))
    nmds = MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12,
               dissimilarity="precomputed", random_state=seed, n_jobs=1,
               n_init=1)
    pos = nmds.fit(similarities).embedding_
    X = np.array(pos)
    return X
def convert_matrix_to_coordinates(sym_matrix, components):
    """
    :param sym_matrix: array, [n_samples, n_samples]
    :param components: int: 2 or 3 for MDS
    :return: Output of MDS, xy or xyz coordinates as 2d numpy array with
        shape [n_samples, components]
    """
    # Create coordinates based on multi dimensional scaling
    mds = MDS(n_components=components, dissimilarity="precomputed", random_state=1)
    coordinates = mds.fit_transform(sym_matrix)
    return coordinates
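A short sketch of calling convert_matrix_to_coordinates on a toy symmetric distance matrix (values illustrative):

import numpy as np

sym = np.array([[0.0, 1.0, 2.0],
                [1.0, 0.0, 1.5],
                [2.0, 1.5, 0.0]])
xy = convert_matrix_to_coordinates(sym, components=2)   # shape (3, 2)
xyz = convert_matrix_to_coordinates(sym, components=3)  # shape (3, 3)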
def plotMDS(X, Y):
    # computes and plots MDS (measure for how well data separates)
    D = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(X))
    tmodel = MDS(n_components=2, dissimilarity='precomputed')
    X2D = tmodel.fit_transform(D)
    plt.figure()
    plt.title('MDS')
    plt.xlabel('MDS1')
    plt.ylabel('MDS2')
    plt.scatter(X2D[:, 0], X2D[:, 1], c=Y)
    plt.show()
def embedDistanceMatrix(dmatDf, method='kpca', n_components=2, **kwargs):
    """Two-dimensional embedding of sequence distances in dmatDf,
    returning Nx2 x,y-coords: tsne, isomap, pca, mds, kpca, sklearn-tsne"""
    if isinstance(dmatDf, pd.DataFrame):
        dmat = dmatDf.values
    else:
        dmat = dmatDf

    if method == 'tsne':
        xy = tsne.run_tsne(dmat, no_dims=n_components, perplexity=kwargs['perplexity'])
    elif method == 'isomap':
        isoObj = Isomap(n_neighbors=10, n_components=n_components)
        xy = isoObj.fit_transform(dmat)
    elif method == 'mds':
        mds = MDS(n_components=n_components, max_iter=3000, eps=1e-9,
                  random_state=15, dissimilarity="precomputed", n_jobs=1)
        xy = mds.fit(dmat).embedding_
        rot = PCA(n_components=n_components)
        xy = rot.fit_transform(xy)
    elif method == 'pca':
        pcaObj = PCA(n_components=None)
        xy = pcaObj.fit_transform(dmat)[:, :n_components]
    elif method == 'kpca':
        pcaObj = KernelPCA(n_components=dmat.shape[0], kernel='precomputed',
                           eigen_solver='dense')
        try:
            gram = dist2kernel(dmat)
        except Exception:
            print('Could not convert dmat to kernel for KernelPCA; using 1 - dmat/dmat.max() instead')
            gram = 1 - dmat / dmat.max()
        xy = pcaObj.fit_transform(gram)[:, :n_components]
    elif method == 'lle':
        lle = manifold.LocallyLinearEmbedding(n_neighbors=30,
                                              n_components=n_components,
                                              method='standard')
        xy = lle.fit_transform(dmat)  # was `dist`, an undefined name
    elif method == 'sklearn-tsne':
        tsneObj = TSNE(n_components=n_components, metric='precomputed',
                       random_state=0, perplexity=kwargs['perplexity'])
        xy = tsneObj.fit_transform(dmat)
    elif method == 'umap':
        umapObj = umap.UMAP(n_components=n_components, metric='precomputed', **kwargs)
        xy = umapObj.fit_transform(dmat)
    else:
        print('Method unknown: %s' % method)
        return

    assert xy.shape[0] == dmatDf.shape[0]
    xyDf = pd.DataFrame(xy[:, :n_components], index=dmatDf.index,
                        columns=np.arange(n_components))
    if method == 'kpca':
        """Not sure how negative eigenvalues should be handled here, but they
        are usually small so it shouldn't make a big difference"""
        setattr(xyDf, 'explained_variance_',
                pcaObj.lambdas_[:n_components] / pcaObj.lambdas_[pcaObj.lambdas_ > 0].sum())
    return xyDf
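A hedged example of driving embedDistanceMatrix with the 'mds' method, assuming a pandas DataFrame of pairwise distances (the sequence names below are illustrative):

import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist

seqs = ['s%d' % i for i in range(8)]
dmatDf = pd.DataFrame(squareform(pdist(np.random.rand(8, 3))),
                      index=seqs, columns=seqs)
xyDf = embedDistanceMatrix(dmatDf, method='mds', n_components=2)
print(xyDf.shape)  # (8, 2), indexed by sequence name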
def mds(cos_simil_mtr):
    # two components as we're plotting points in a two-dimensional plane
    # "precomputed" because we provide a distance matrix
    # we will also specify `random_state` so the plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(cos_simil_mtr)  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]
    return xs, ys
def generate_cluster_plot_frame(self):
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    dist = 1 - cosine_similarity(self.tfidf_matrix)
    pos = mds.fit_transform(dist)
    xs, ys = pos[:, 0], pos[:, 1]

    cluster_data = dict()
    cluster_data["x"] = xs
    cluster_data["y"] = ys
    cluster_data["label"] = self.clusters
    cluster_data["presentation"] = self.presentation_list
    cluster_data["innovation_list"] = self.innovation_list
    self.cluster_plot_frame = pd.DataFrame(cluster_data)
def transform_and_plot_data(seed, distance_matrix, dim_x, dim_y, title, plot3D, ax):
    if plot3D:
        n_components = 3
    else:
        n_components = 2
    mds = MDS(n_components=n_components, max_iter=3000, eps=1e-9,
              random_state=seed, dissimilarity="precomputed", n_jobs=1)
    transformed_data = mds.fit_transform(distance_matrix)
    corner_points, pair_list = create_pairs_to_plot_from_list(transformed_data,
                                                              dim_x, dim_y)
    if plot3D:
        my_plot3D(corner_points, pair_list, False, title, ax)
    else:
        my_plot2D(corner_points, pair_list, False, title, ax)
def make_mds_image(m, filename, labels=None, colour=None):
    """Given a matrix of distances, project into 2D space using
    multi-dimensional scaling and produce an image."""
    mds_data_filename = filename + ".dat"
    try:
        # if we've previously computed, load it
        p = np.genfromtxt(mds_data_filename)
    except OSError:
        # else, compute it now (and save)

        # Construct MDS object with various defaults including 2d
        mds = MDS(dissimilarity="precomputed")

        # Fit
        try:
            f = mds.fit(m)
        except ValueError as e:
            print("Can't run MDS for " + filename + ": " + str(e))
            return

        # Get the embedding in 2d space
        p = f.embedding_

        # save
        np.savetxt(mds_data_filename, p)

    # Make an image
    fig, ax = plt.subplots(figsize=(5, 5))

    # x- and y-coordinates
    ax.set_aspect('equal')
    ax.scatter(p[:, 0], p[:, 1], edgecolors='none')

    if labels is not None:
        print(filename)
        # hard-coded for GP depth-2
        indices = [0, 2, 50, 52]
        for i in indices:
            print(labels[i], p[i, 0], p[i, 1])
            # can print some labels directly on the graph as follows,
            # but maybe it's better done manually, after printing
            # their locations to terminal?
            # plt.text(p[i,0], p[i,1], labels[i], style='italic',
            #          bbox={'facecolor':'red', 'alpha':0.5, 'pad':10})

    fig.savefig(filename + ".pdf")
    fig.savefig(filename + ".eps")
    fig.savefig(filename + ".png")
    plt.close(fig)
def mds_bib_data_with_sklearn(fname):
    bib_data = get_bib_data()
    mat, years, term_list, years_cnt = get_year_by_term_mat(bib_data, freq=5)

    # Euclidean-based MDS
    aMDS = MDS(n_components=2, dissimilarity='euclidean')
    coords = aMDS.fit_transform(mat)

    fig = plt.figure()
    fig.clf()
    for label, x, y in zip(years, coords[:, 0], coords[:, 1]):
        plt.annotate(label, xy=(x, y))
    plt.savefig(fname)
for kmean in range(6):
    for x in range(len(datamat2)):
        if results2[x] == kmean:
            kmeanSums2[kmean] += cosine_distance(datamat2[x], means2[kmean])**2

for kmean in range(6):
    for x in range(len(datamat2)):
        if results3[x] == kmean:
            kmeanSums3[kmean] += spatial.distance.jaccard(datamat2[x], means3[kmean])**2

print(sum(kmeanSums1))
print(sum(kmeanSums2))
print(sum(kmeanSums3))

mds = MDS(n_components=2, dissimilarity="euclidean", random_state=1)
#mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(datamat2)
#pos = mds.fit_transform(cosmat)
#pos = mds.fit_transform(jacmat)
xs, ys = pos[:, 0], pos[:, 1]

cluster_colors = {
    0: '#1b9e77',
    1: '#d95f02',
    2: '#7570b3',
import numpy as np
from scipy import sparse
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS

# get num comics
from my_utils import get_latest_comic_num

if __name__ == "__main__":
    tfidf_vectors = sparse.load_npz("../data/text_vectors/tfidf_vectors.npz")
    cosine_array = cosine_similarity(tfidf_vectors, tfidf_vectors)
    dissimilarities = 1 - cosine_array

    # compute the embedding
    embedded = MDS(dissimilarity='precomputed').fit_transform(dissimilarities)

    # save coords
    np.save("../data/document_relations/mds.npy", embedded)
    print("MDS SHAPE: ", embedded.shape)

    # save coords as dataframe
    num_comics = get_latest_comic_num() + 1
    comic_serial_numbers = [str(i) for i in range(1, num_comics)]
    df = pd.DataFrame(embedded, columns=['x', 'y'], index=comic_serial_numbers)
    pd.to_pickle(df, "../data/document_relations/mds_df.pkl")
clusterCounts = np.empty((nDifferentDataSet, ))
dist = DistanceMetric.get_metric(metric)
print("MDS Metric: {}".format(metric))

for i in range(nDifferentDataSet):
    data = generateOneClusterData(DEFAULT_NUMBER_OF_FEATURES,
                                  DEFAULT_NUMBER_OF_RECORDS_PER_CLASS,
                                  DEFAULT_FEATURE_MEAN_RANGE, i,
                                  distribution="normal")
    precomputedMetricData = dist.pairwise(data)
    mds = MDS(n_components=8, n_jobs=-1, dissimilarity="precomputed")
    mdsData = mds.fit_transform(precomputedMetricData)

    optimalK = OptimalK(parallel_backend='joblib', n_jobs=-1)
    clusterCount = optimalK(mdsData, n_refs=3, cluster_array=np.arange(1, 10))
    clusterCounts[i] = clusterCount
    stress[i, j] = mds.stress_  # j is the index of an enclosing loop (not shown)

meanClusterCount[j] = np.mean(clusterCounts)
stdClusterCount[j] = np.std(clusterCounts)
meanStress[j] = np.mean(stress[:, j])
stdStress[j] = np.std(stress[:, j])
COV_X_PD = pd.DataFrame(data=COV_X, index=index_PD, columns=Columns_PD)
Mu = np.repeat(0.3, p)

#%% Init MDS
import Toolbox
from Toolbox import two_d_eq, Assign_features_to_pixels, Random_Image_Gen, REFINED_Im_Gen
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import euclidean_distances
import pickle

#%% MDS
nn = math.ceil(np.sqrt(p))  # Image dimension
Nn = p                      # Number of features

Euc_Dist = COV_X  # Making the Euclidean distance matrix symmetric

embedding = MDS(n_components=2)  # Reduce the dimensionality by MDS into 2 components
mds_xy = embedding.fit_transform(COV_X)  # Apply MDS
print(">>>> MDS dimensionality reduction is done")

eq_xy = two_d_eq(mds_xy, Nn)
Img = Assign_features_to_pixels(eq_xy, nn, verbose=1)  # Img is the non-overlapping coordinates generated by MDS

Desc = Columns_PD  # Drug descriptor names
Dist = pd.DataFrame(data=Euc_Dist, columns=Desc, index=Desc)  # Distance matrix with the Euclidean distance between each pair of descriptors
data = (Desc, Dist, Img)  # Preparing the hill climbing inputs
#plt.scatter(X[:, 0], X[:, 1], **colorize)
#plt.axis('equal');

# MDS
def rotate(X, angle):
    theta = np.deg2rad(angle)
    R = [[np.cos(theta), np.sin(theta)],
         [-np.sin(theta), np.cos(theta)]]
    return np.dot(X, R)

X2 = rotate(X, 20) + 5
#plt.scatter(X2[:, 0], X2[:, 1], **colorize)
#plt.axis('equal')

from sklearn.metrics import pairwise_distances
D = pairwise_distances(X)
D.shape
#plt.imshow(D, zorder=2, cmap='Blues', interpolation='nearest')
#plt.colorbar();

D2 = pairwise_distances(X2)
#plt.imshow(D2, zorder=2, cmap='Greens', interpolation='nearest')
#plt.colorbar();

np.allclose(D, D2)  # True: distances are invariant to rotation and translation

from sklearn.manifold import MDS
model = MDS(n_components=2, dissimilarity='precomputed', random_state=1)
out = model.fit_transform(D)
plt.scatter(out[:, 0], out[:, 1], **colorize)
plt.axis('equal');
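Since MDS recovers the configuration only up to rotation, reflection, and translation, a quick sanity check (a sketch added here, not part of the original) is to compare the pairwise distances of the embedding against the input matrix; the tolerance below is illustrative:

D_out = pairwise_distances(out)
# distances should be approximately preserved (stress is small but nonzero)
print(np.allclose(D, D_out, atol=0.1))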
count = count + 1

# Mean Centering
n1 = np.ones([len(K), len(K)]) * 1.0 / len(K)
K2 = K - np.dot(n1, K) - np.dot(K, n1) + np.linalg.multi_dot([n1, K, n1])
[t1, t2] = np.linalg.eig(K2)
print(np.min(np.real(t1)))
K3 = K2 - np.min(np.real(t1)) * np.eye(len(K2))
[t1, t2] = np.linalg.eig(K3)
print(np.min(np.real(t1)))
U = np.real(np.matmul(t2, np.diag(np.sqrt(t1))))

# Apply MDS
q = 50
Ksym = (K + np.transpose(K)) / 2
mds = MDS(n_components=q, metric=True, dissimilarity='precomputed')
U = mds.fit_transform(Ksym)
#pca = PCA(n_components=q)
#U = U[:,:q]

# For Drugs
temp = (compSim[trainInds, :])[:, trainInds] + 0.1 * np.eye(nd)
A = np.linalg.multi_dot([
    np.linalg.inv(temp),
    U[:nd, :],
    np.transpose(U[:nd, :]),
    np.linalg.inv(temp)
])  # A = UU^T
[t1, t2] = np.linalg.eig(A)
W = np.real(np.matmul(t2, np.diag(np.sqrt(t1))))[:, :q]
'''
# Dimensionality reduction: (1) projection (e.g. principal component analysis), (2) manifold learning
# Manifold methods:
# 1) LLE (Locally Linear Embedding):
#    measures how linearly each training sample is related to its nearest neighbors.
X, y = make_swiss_roll(n_samples=1000, noise=0.2, random_state=41)
lle = LocallyLinearEmbedding(n_neighbors=10, n_components=2, random_state=1)
# X_reduced = lle.fit_transform(X)
# plt.scatter(X_reduced[:, 0], X_reduced[:, 1],
#             c=y, cmap=plt.cm.hot)
# plt.show()

# MDS (Multi-Dimensional Scaling):
# reduces dimensionality while preserving the distances between samples.
mds = MDS(n_components=2, random_state=1)

# Isomap (Isometric Mapping):
# builds a graph connecting each sample to its nearest neighbors, then reduces
# dimensionality while preserving graph (geodesic) distances.
isomap = Isomap(n_components=2)

# t-SNE (t-distributed Stochastic Neighbor Embedding):
# reduces dimensionality so that similar samples stay close together and
# dissimilar samples are pushed far apart.
tsne = TSNE(n_components=2, random_state=1)

titles = ['LLE', 'MDS', 'Isomap', 't-SNE']
manifold_reducers = [lle, mds, isomap, tsne]
for title, reducer in zip(titles, manifold_reducers):
    plt.title(title)  # reducer.__class__.__name__
    # reduce the original data with each manifold method
# PCA + LLE time and visualizations
pca_lle = Pipeline([
    ("pca", PCA(n_components=0.95, random_state=42)),
    ("lle", LocallyLinearEmbedding(n_components=2, random_state=42)),
])
t0 = time.time()
X_pca_lle_reduced = pca_lle.fit_transform(X)
t1 = time.time()
print("PCA+LLE took {:.1f}s.".format(t1 - t0))
plot_digits(X_pca_lle_reduced, y)

# MDS time and visualizations
m = 2000
t0 = time.time()
X_mds_reduced = MDS(n_components=2, random_state=42).fit_transform(X[:m])
t1 = time.time()
print("MDS took {:.1f}s (on just 2,000 MNIST images instead of 10,000).".format(t1 - t0))
plot_digits(X_mds_reduced, y[:m])

# PCA + MDS time and visualizations
pca_mds = Pipeline([
    ("pca", PCA(n_components=0.95, random_state=42)),
    ("mds", MDS(n_components=2, random_state=42)),
])
t0 = time.time()
X_pca_mds_reduced = pca_mds.fit_transform(X[:2000])
t1 = time.time()
print("PCA+MDS took {:.1f}s (on 2,000 MNIST images).".format(t1 - t0))
plot_digits(X_pca_mds_reduced, y[:2000])
    representation_func=lambda m: pd.Series(np.random.random(100)),
    metadata="""Uniformly distributed random feature vector of length 100, """
    """implemented using the <a href="http://www.numpy.org">numpy</a> """
    """<a href="http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.random.html#numpy.random.random">random</a> module"""
)

DEFAULT_REPRESENTATION_TYPES = [morg2, targets, random]

pca = ReductionMethod(
    name='PCA',
    model=PCA(n_components=2),
    metadata=
    """<a href="http://en.wikipedia.org/wiki/Principal_component_analysis">Principal component analysis</a> implemented in <a href="http://scikit-learn.org/stable/" target="_blank">scikit-learn</a>\n"""
    """<br/>Default parameters used.""")

mds = ReductionMethod(
    name='MDS',
    model=MDS(),
    metadata=
    """<a href="http://en.wikipedia.org/wiki/Multidimensional_scaling" target="_blank">Multidimensional Scaling</a> implemented in <a href="http://scikit-learn.org/stable/" target="_blank">scikit-learn</a>"""
    """<br/>Default parameters used.""")

tsne = ReductionMethod(
    name='t-SNE',
    model=TSNE(perplexity=10),
    metadata=
    """<a href="http://lvdmaaten.github.io/tsne/">Student's t-distributed stochastic neighbour embedding</a>, """
    """implemented according to <a href="http://lvdmaaten.github.io/publications/papers/JMLR_2008.pdf">van der Maaten et al. 2008</a>\n"""
    """<br/>Parameters used: Perplexity = 10, theta=0""")

DEFAULT_REDUCTION_METHODS = [pca, mds, tsne]
            axis=1).min()
        if closest_distance > min_distance:
            neighbors = np.r_[neighbors, [image_coord]]
            if images is None:
                plt.text(image_coord[0], image_coord[1], str(int(y[index])),
                         color=cmap(y[index] / 9),
                         fontdict={"weight": "bold", "size": 16})
            else:
                image = images[index].reshape(28, 28)
                imagebox = AnnotationBbox(OffsetImage(image, cmap="binary"),
                                          image_coord)
                ax.add_artist(imagebox)

from sklearn.manifold import MDS
import time

startTime = time.time()
X_mds_reduced = MDS(n_components=2).fit_transform(x_subset)
endTime = time.time()
print("MDS took {:.1f}s (on just 2,000 MNIST images instead of 10,000).".format(
    endTime - startTime))
plot_digits(X_mds_reduced, y_subset)
plt.show()
def plot_repr_trajectories(res, snap_type, dims=2, title_label='', epochs_to_mark=()):
    """
    Plot trajectories of each item or context representation over training using MDS.
    Can plot in 3D by setting dims to 3.
    Returns figure and axes.
    """
    embedding = MDS(n_components=dims, dissimilarity='precomputed')
    reprs_embedded = embedding.fit_transform(res['repr_dists'][snap_type]['all'])

    # reshape and permute to aid plotting
    n_snaps = len(res['snap_epochs'])
    n_domains = res['net_params']['n_train_domains']
    reprs_embedded = reprs_embedded.reshape((n_snaps, n_domains, -1, dims))
    reprs_embedded = reprs_embedded.transpose((1, 2, 3, 0))

    fig = plt.figure()
    ax = fig.add_subplot(111, projection=('3d' if dims == 3 else None))

    input_names = _get_names_for_snapshots(snap_type, **res['net_params'])

    if 'item' in snap_type:
        if 'item_clusters' in res['net_params']:
            input_groups = dd.item_group(clusters=res['net_params']['item_clusters'])
        elif 'cluster_info' in res['net_params']:
            input_groups = dd.item_group(clusters=res['net_params']['cluster_info'])
        else:
            input_groups = dd.item_group()
    elif 'context' in snap_type:
        # No "groups," but use symbols for individual contexts (per domain) instead.
        input_groups = np.arange(4)
    else:
        raise ValueError('Unrecognized snapshot type')

    input_names = np.array(input_names).reshape((n_domains, -1))
    colors = dd.get_domain_colors()
    markers = ['o', 's', '*', '^']

    for dom_reprs, dom_labels, color in zip(reprs_embedded, input_names, colors):
        for reprs, label, group in zip(dom_reprs, dom_labels, input_groups):
            linestyle = markers[group] + '-'
            ax.plot(*reprs, linestyle, label=label, markersize=4,
                    color=color, linewidth=0.5)

    # add start and end markers on top of everything else
    inds_to_mark = []
    if len(epochs_to_mark) > 0:
        snap_epochs = res['snap_epochs']
        for epoch in epochs_to_mark:
            if epoch in snap_epochs:
                inds_to_mark.append(snap_epochs.index(epoch))

    for dom_reprs, dom_labels, color in zip(reprs_embedded, input_names, colors):
        for reprs, label, group in zip(dom_reprs, dom_labels, input_groups):
            marker = markers[group]

            def mark_epoch(epoch_ind, bordercolor):
                ax.plot(*reprs[:, epoch_ind], marker, markersize=8, color=bordercolor)
                ax.plot(*reprs[:, epoch_ind], marker, markersize=5, color=color)

            mark_epoch(0, 'g')
            mark_epoch(-1, 'r')
            for ind in inds_to_mark:
                mark_epoch(ind, 'k')

    ax.set_title(f'{title_label} {snap_type} representations over training\n' +
                 'color = domain, marker = type within domain')
    return fig, ax
    return pair.distance

M = np.zeros((100, 100))
for i in range(100):
    print(i, end=" ")
    for j in range(i, 100):
        M[i][j] = distance(getpart(i), getpart(j))

pickle.dump(M, open("NCtransMat.p", "wb"))

# plot
M = M + M.T  # mirror the upper triangle (diagonal assumed zero) to get a symmetric matrix

from sklearn.manifold import MDS
mds = MDS(n_components=2, dissimilarity='precomputed', max_iter=50000, n_init=100)
pos = mds.fit(M).embedding_

X = []
Y = []
for i in range(100):
    X.append(pos[i][0])
    Y.append(pos[i][1])

plt.scatter(X, Y)
plt.scatter(X[:3], Y[:3], color='red')
plt.annotate("judge", (X[0], Y[0]))
plt.annotate("2012", (X[1], Y[1]))
plt.annotate("2016", (X[2], Y[2]))
plt.savefig("NCtransplot.png")
analysis_group.CALC.hist()
plt.show()

analysis_group.NObeyesdad.hist()
plt.show()

analysis_group.Age.hist()
plt.show()

analysis_group.hist()
plt.show()

from sklearn.manifold import MDS

embedding = MDS(n_components=2, verbose=1, max_iter=100, n_init=2)
data_emb = embedding.fit_transform(data_dummies[0:])

for cluster in data['clusters'].unique():
    _ = plt.scatter(data_emb[0:][data.clusters[0:] == cluster][:, 0],
                    data_emb[0:][data.clusters[0:] == cluster][:, 1],
                    cmap=plt.cm.Spectral,
                    label='Cluster' + str(cluster))
plt.legend()
plt.show()

from sklearn.decomposition import PCA

pca = PCA(n_components=38)
data_pca = pca.fit_transform(data_dummies)
        temp.append('nan')
    mdsData.iloc[i] = temp

mdsData = mdsData.loc[pd.isnull(mdsData["ID"]) == False]  # drop rows whose ID is 'nan'

'''
Fill missing values using df.interpolate
'''
mdsfillnan = mdsData[["Dac_23", "Dac_34", "Dac_45", "Dac_56", "Dac_67", "Dac_78"]]
#mdsfillnan = mdsData[["Acc_23","Acc_34","Acc_45","Acc_56","Acc_67","Acc_78",
#                      "Dac_23","Dac_34","Dac_45","Dac_56","Dac_67","Dac_78"]]
mdsfillnan = mdsfillnan.apply(lambda x: pd.to_numeric(x, errors='coerce'))
#a = a.interpolate(method='spline', order=2)
mdsfillnan = mdsfillnan.interpolate(method='values', axis=0, limit=testNum,
                                    limit_direction='both')

'''
sklearn
'''
mds = MDS()
mds.fit(mdsfillnan)
mdsResult = mds.embedding_
plt.scatter(mdsResult[0:128, 0], mdsResult[0:128, 1], color='turquoise')

# =============================================================================
# Use K-means clustering to screen for drivers with high-risk abnormal driving behavior
# =============================================================================
    X3t = emb.transform(X3)
    listDistance = distancias(X2, X3t)
    listClase = clasificador(yy, X3t, listDistance)
    graphPoints(X3t, ntoc(int(listClase)), False, False)
    graphPoints(X, listClasificador)
    graphPoints(X2, yy, False)
    plt.show()


def llamarPronostico():
    pronostico(X2, yy, X, listClasificador)


df = pd.read_csv("data.csv")
data = df.values
X = data[:, 0:]
y = np.zeros((len(X)))

pca = PCA(n_components=2)
emb1 = MDS(n_components=2)
emb = Isomap(n_components=2)
emb.fit(X)

X = reduccion(X)
obtenerK(X)
K = silhouette(X)
#print("K = ", K)
[X2, yy, X, listClasificador] = KMeans2D(K)
def cluster_us(movie_list=None):
    """This method gets a list of movies and clusters them by their summaries."""
    # if not movie_list:
    summary_list = []
    films = []
    ranks = []
    genres = []
    years = []
    foreigns = []
    languages = []
    countries = []
    for movie in movie_list:
        summary_list.append(movie.summary)
        films.append(movie.name)
        ranks.append(movie.metascore)
        genres.append(str(movie.genre))
        years.append(int(movie.year))
        foreigns.append(1 if movie.competition_category == 'FOREIGN LANGUAGE FILM' else 0)
        languages.append(movie.languages)
        countries.append(movie.countries)

    totalvocab_stemmed = []
    totalvocab_tokenized = []
    for i in range(len(summary_list)):
        # for each item in 'synopses', tokenize/stem
        allwords_stemmed = tokenize_and_stem(summary_list[i])
        # extend the 'totalvocab_stemmed' list
        totalvocab_stemmed.extend(allwords_stemmed)
        # allwords_tokenized = tokenize_only(summary_list[i])
        # totalvocab_tokenized.extend(allwords_tokenized)

    # define vectorizer parameters (the second definition below overrides the first)
    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                       min_df=0.2, stop_words='english',
                                       use_idf=True, tokenizer=tokenize_and_stem,
                                       ngram_range=(1, 1))
    # tfidf_vectorizered = TfidfVectorizer(max_df=0.8, max_features=200000,
    #                                      min_df=0.05, stop_words='english',
    #                                      use_idf=True, tokenizer=tokenize_only,
    #                                      ngram_range=(1, 3))
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', lowercase=True,
                                       tokenizer=tokenize_and_stem,
                                       ngram_range=(1, 1), max_df=0.8, min_df=0.01)

    tfidf_matrix = tfidf_vectorizer.fit_transform(summary_list)  # fit the vectorizer to synopses
    print(tfidf_matrix.shape)

    terms = tfidf_vectorizer.get_feature_names()
    dist = 1 - cosine_similarity(tfidf_matrix)

    num_clusters = 5
    km = KMeans(n_clusters=num_clusters)
    km.fit(tfidf_matrix)
    clusters = km.labels_.tolist()

    # uncomment the below to save your model
    # since I've already run my model I am loading from the pickle
    # joblib.dump(km, 'doc_cluster.pkl')
    # km = joblib.load('doc_cluster.pkl')
    clusters = km.labels_.tolist()

    vocab_frame = pd.DataFrame({'words': totalvocab_stemmed}, index=totalvocab_stemmed)

    films = {'title': films, 'rank': ranks, 'synopsis': summary_list,
             'cluster': clusters, 'genre': genres, 'year': years,
             'foreign': foreigns, 'language': languages, 'country': countries}
    frame = pd.DataFrame(films, index=clusters,
                         columns=['title', 'rank', 'cluster', 'genre', 'year',
                                  'foreign', 'language', 'country'])
    print(frame['cluster'].value_counts())

    grouped = frame['rank'].groupby(frame['cluster'])  # groupby cluster for aggregation purposes
    print(grouped.mean())  # average rank (1 to 100) per cluster

    print("Top terms per cluster:")
    frame.to_csv("finished_output.csv")
    # fig = px.scatter(frame, x='cluster', y='rank', color='cluster', hover_name='title',
    #                  custom_data=['foreign'], symbol='foreign')
    # chart_studio.plotly.plot(fig, filename='interactive_clustering', auto_open=True)
    # fig.show()
    print()

    # sort cluster centers by proximity to centroid
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    with open("nimni", "w") as f:
        for i in range(num_clusters):
            f.write("Cluster %d words:" % i)
            print("Cluster %d words:" % i, end='')
            for ind in order_centroids[i, :5]:  # top 5 words per cluster (adjust the slice for more)
                print(terms[ind])
                f.write(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'))
                # print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'),
                #       end=',')
            print()  # add whitespace
            print()  # add whitespace
            print("Cluster %d titles:" % i, end='')
            f.write("Cluster %d titles:" % i)
            titles = frame.loc[i]['title']
            f.write(str(titles))
            print(titles)
            print()  # add whitespace
            print()  # add whitespace

    print()
    print()

    # two components as we're plotting points in a two-dimensional plane
    # "precomputed" because we provide a distance matrix
    # we will also specify `random_state` so the plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]

    frame = pd.DataFrame(dict(x=xs, y=ys, title=frame['title'],
                              foreign=foreigns, cluster=clusters))
    # groups = frame.groupby('label')
    fig = px.scatter(frame, x='x', y='y', color='cluster', symbol='foreign',
                     hover_name='title')
    chart_studio.plotly.plot(fig, filename='interactive_clustering_mds', auto_open=True)
def make_map():
    # representation types
    morg2 = RepresentationType(
        name='morg2',
        representation_func=skchemize(morg, radius=2, nBits=2048),
        metadata=
        """Hashed Circular fingerprint generated by the Morgan algorithm, """
        """implemented in <a href="http://www.rdkit.org">RDKit</a>. <br/>"""
        """Parameters used: Radius = 2, Bit length = 2048""")

    targets = RepresentationType(
        name='targets',
        representation_func=PIDGIN(),
        metadata=
        """Bayes affinity fingerprint for 1080 human targets, produced """
        """using the <a href="https://github.com/lhm30/PIDGIN">PIDGIN (Prediction of targets IncluDinG INactives)</a> """
        """Target Prediction algorithm, implemented in <a href="https://github.com/richlewis42/scikit-chem">scikit-chem</a>."""
    )

    random = RepresentationType(
        name='random',
        representation_func=lambda m: pd.Series(np.random.random(100)),
        metadata="""Uniformly distributed random feature vector of length 100, """
        """implemented using the <a href="http://www.numpy.org">numpy</a> <a href="http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.random.html#numpy.random.random">random</a> module"""
    )

    representation_types = [morg2, targets, random]

    # reduction types
    pca = ReductionMethod(
        name='PCA',
        model=PCA(n_components=2),
        metadata=
        """<a href="http://en.wikipedia.org/wiki/Principal_component_analysis">Principal component analysis</a> implemented in <a href="http://scikit-learn.org/stable/" target="_blank">scikit-learn</a>\n"""
        """<br/>Default parameters used.""")

    mds = ReductionMethod(
        name='MDS',
        model=MDS(),
        metadata=
        """<a href="http://en.wikipedia.org/wiki/Multidimensional_scaling" target="_blank">Multidimensional Scaling</a> implemented in <a href="http://scikit-learn.org/stable/" target="_blank">scikit-learn</a>"""
        """<br/>Default parameters used.""")

    tsne = ReductionMethod(
        name='t-SNE',
        model=TSNE(perplexity=1),
        metadata=
        """<a href="http://lvdmaaten.github.io/tsne/">Student's t-distributed stochastic neighbour embedding</a>, """
        """implemented according to <a href="http://lvdmaaten.github.io/publications/papers/JMLR_2008.pdf">van der Maaten et al. 2008</a>\n"""
        """<br/>Parameters used: Perplexity = 1, theta=0""")

    reduction_types = [pca, mds, tsne]

    # activity types
    pIC20 = ActivityType(
        name='pIC20',
        metadata=
        """negative base-10 logarithm of the <a href="http://en.wikipedia.org/wiki/IC50">IC20</a>, the concentration of """
        """compound required for 20% inhibition of growth of Lymphoma cells""")

    IC20 = ActivityType(
        name='IC20',
        metadata=
        """<a href="http://en.wikipedia.org/wiki/IC50">IC20</a>, the concentration of """
        """compound required for 20% inhibition of growth of Lymphoma cells""")

    activity_types = [pIC20, IC20]

    # synergy types
    excessOverBliss = SynergyType(
        name='ExcessOverBliss',
        metadata=
        """Difference in observed vs expected activity of the component compounds, """
        """each at the IC20 concentration (when known), assuming the <a href="http://doi.wiley.com/10.1111/j.1744-7348.1939.tb06990.x">Bliss Independence model</a>"""
    )

    synergy_types = [excessOverBliss]

    # data
    compound_df = skc.read_smiles(os.path.join(DIRNAME, 'compounds.smiles'),
                                  name_column=1, title_line=True)
    compound_df['pIC20'] = -np.log10(compound_df['IC20'])

    combination_df = pd.read_csv(os.path.join(DIRNAME, 'combinations.csv'))
    combination_df.set_index('id', inplace=True)

    synergy_map = SynergyMap(compound_df=compound_df,
                             combination_df=combination_df,
                             representation_types=representation_types,
                             reduction_types=reduction_types,
                             activity_types=activity_types,
                             synergy_types=synergy_types,
                             metadata='DREAM Drug Combination Challenge Data')
    return synergy_map
        text += s + ' '
    text = text.translate(table_p)
    text = nltk.word_tokenize(text)
    for word in text:
        if word not in stopwords and len(word) > 1:
            cleandoc += word + ' '
    twtall.append(cleandoc)
    cleandoc = ''
    tweetcount += 1
    twtname.append('{}-{}'.format(tweetcount * 100 - 99, tweetcount * 100))

vectorizer = TfidfVectorizer()
twt_matrix = vectorizer.fit_transform(twtall)
cos_dist = cosine_distances(twt_matrix)

mds = MDS(n_components=2, dissimilarity='precomputed', random_state=1)
pos = mds.fit_transform(cos_dist)
xs, ys = pos[:, 0], pos[:, 1]

for x, y, name in zip(xs, ys, twtname):
    plt.scatter(x, y)
    plt.text(x, y, name)
plt.title('tweet MDS')
plt.savefig(path.join("factor_analysis.png"), dpi=600)
plt.show()

linkage_matrix = ward(cos_dist)
dendrogram(linkage_matrix, orientation='left', labels=twtname)
plt1 = plt.tight_layout()
# reference: https://stackoverflow.com/questions/9622163/save-plot-to-image-file-instead-of-displaying-it-using-matplotlib
    # Compute the new y_lower for next plot
    y_lower = y_upper + 10  # 10 for the 0 samples

ax1.set_title("The silhouette plot for the various clusters.")
ax1.set_xlabel("The silhouette coefficient values")
ax1.set_ylabel("Cluster label")

# The vertical line for average silhouette score of all the values
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
ax1.set_yticks([])  # Clear the yaxis labels / ticks
ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

# Multi-dimensional Scaling
from sklearn.manifold import MDS

# two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
xs, ys = pos[:, 0], pos[:, 1]

# 2nd Plot showing the actual clusters formed
# ('spectral' was removed from matplotlib; nipy_spectral is the current name)
colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
ax2.scatter(pos[:, 0], pos[:, 1], marker='.', s=120, lw=0,
def img(start, end, investement_type, sharpe_ratio, std, beta, treynor_ratio,
        revenue, btest_time, money, buy_ratio, strategy, frequency):
    profit = pd.DataFrame()
    hold = np.zeros((4), dtype=float)
    response_data = {}
    response_data['start'] = start.strftime('%Y-%m')
    response_data['mean_similarity'] = 0
    distance = []
    length = 12 * (end.year - start.year) + (end.month - start.month) + 1
    choose = np.asarray([" ", " ", " ", " "], dtype='<U32')
    choose = selection(start, btest_time, investement_type, 0, sharpe_ratio,
                       std, beta, treynor_ratio, revenue, choose)
    if strategy == 2:
        money /= (12 * (end.year - start.year) + (end.month - start.month) + 1) / frequency
    total_money = money

    for i in range(length):
        start_unix = time.mktime((start + relativedelta(months=i)).timetuple())
        end_unix = time.mktime((start + relativedelta(months=i+1, days=-1)).timetuple())
        data_df = pd.read_sql(sql='select * from price where date between ? and ? order by date asc',
                              con=engine, params=[start_unix, end_unix])
        data_df = data_df.pivot(index='date', columns='fund_id', values='nav')
        data_df = data_df.fillna(method="ffill")
        data_df = data_df.fillna(method="bfill")

        if i == 0:
            hold = (buy_ratio * money / data_df[choose].iloc[0].T).values
        elif strategy != 0 and i % frequency == frequency - 1:
            if strategy == 2:
                hold += (buy_ratio * money / data_df[choose].iloc[0].T).values
                total_money += money
            else:
                temp = (hold * data_df[choose].iloc[0]).sum()
                if strategy == 3:
                    choose = selection(start, btest_time, investement_type, i,
                                       sharpe_ratio, std, beta, treynor_ratio,
                                       revenue, choose)
                hold = (buy_ratio * temp / data_df[choose].iloc[0].T).values

        if strategy == 2:
            profit = pd.concat([profit, (data_df[choose] * hold).T.sum() / total_money], axis=0)
        else:
            profit = pd.concat([profit, (data_df[choose] * hold).T.sum() / money], axis=0)

        for j, ch in enumerate(choose):
            interest = pd.read_sql(sql='select sum(interest) from interest where date between ? and ? and fund_id == ? order by date asc',
                                   con=engine, params=[start_unix, end_unix, ch])
            hold[j] += (interest * hold[j] / data_df[ch].iloc[-1]).fillna(0).loc[0][0]

        if i == length - 1:
            response_data['money'] = (hold * data_df.iloc[-1][choose]).sum()
            price = data_df[choose].iloc[-1].mean()

        data_df = pd.concat([data_df[choose],
                             data_df.T.sample(n=296).T], axis=1).T.drop_duplicates().T
        data_df = data_df.pct_change()
        data_df_std = data_df.std()
        data_df = data_df.drop(data_df_std[data_df_std == 0].index.values, axis=1)
        data_df = data_df.corr()
        data_df = 1 - data_df * 0.5 - 0.5
        response_data['mean_similarity'] += np.square(data_df[choose].T[choose].sum().sum() / 2)
        distance.append(np.square(data_df[choose].T[choose].sum().sum() / 2))

        color = np.asarray(["yellow" for i in range(len(data_df))])
        color[0:4] = "purple"
        mds = MDS(n_components=2, dissimilarity='precomputed').fit(data_df).embedding_
        source = ColumnDataSource(data=dict(x=mds[:, 0], y=mds[:, 1],
                                            name=data_df.index, color=color))
        TOOLTIPS = [("fund_id", "@name")]
        p = figure(plot_width=500, plot_height=500, tooltips=TOOLTIPS,
                   title="MDS", toolbar_location=None, tools="")
        p.x_range = Range1d(-0.6, 0.6)
        p.y_range = Range1d(-0.6, 0.6)
        p.circle(x='x', y='y', color='color', size=8, source=source)
        script, div = components(p, CDN)
        response_data[(start + relativedelta(months=i)).strftime('%Y-%m')] = {'script': script, 'div': div}

    profit = profit.rename(columns={0: "profit"})
    profit["profit"] = (profit["profit"] - 1)
    profit_indicator(profit, start, end, response_data)
    profit["profit"] = profit["profit"] * 100
    profit.index.name = "date"
    response_data['profit'] = profit.iloc[-1][0]
    response_data['mean_similarity'] /= length
    # price_simulation(end, price, response_data)

    profit.index = profit.index + 28800
    profit.index = pd.to_datetime(profit.index, unit='s')
    totalStock = pd.read_csv("totalStock.csv")
    totalStock.date = totalStock.date.astype('datetime64')
    totalStock = totalStock[(totalStock.date < end + relativedelta(months=1)) &
                            (totalStock.date >= start)]
    totalStock.profit = ((totalStock.profit / totalStock.iloc[0].profit) - 1) * 100

    p = figure(x_axis_type="datetime", plot_width=940, plot_height=300,
               title="Profit", toolbar_location=None, tools="")
    p.line(x='date', y='profit', line_width=3, source=profit, color='red',
           legend='Choose')
    p.add_tools(HoverTool(tooltips=[("date", "@date{%F}"), ("profit", "@profit%")],
                          formatters={'date': 'datetime'}, mode='vline'))
    p.line(x='date', y='profit', line_width=3, source=totalStock, color='blue',
           legend='Compare')
    script, div = components(p, CDN)
    response_data['profit_img'] = {'script': script, 'div': div}

    p = figure(plot_width=940, plot_height=300, title="Distance",
               toolbar_location=None, tools="")
    p.line([i + 1 for i in range(len(distance))], distance, line_width=2)
    script, div = components(p, CDN)
    response_data['distance'] = {'script': script, 'div': div}

    response_data['sharpe_ratio'] = round(response_data['sharpe_ratio'], 3)
    response_data['market_sharpe'] = round(response_data['market_sharpe'], 3)
    response_data['std'] = round(response_data['std'], 3)
    response_data['market_std'] = round(response_data['market_std'], 3)
    response_data['beta'] = round(response_data['beta'], 3)
    response_data['treynor_ratio'] = round(response_data['treynor_ratio'], 3)
    response_data['money'] = round(response_data['money'], 3)
    response_data['profit'] = round(response_data['profit'], 3)
    response_data['market_revenue'] = round(response_data['market_revenue'], 3)
    return response_data
result = tsne.fit_transform(X)
df['D1'] = result[:, 0]
df['D2'] = result[:, 1]

plt.figure(figsize=(12, 9), dpi=300)
sns.scatterplot(x='D1', y='D2', hue='Cluster9',
                palette=sns.color_palette(n_colors=df['Cluster9'].nunique()),
                data=df)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
plt.savefig('tsne.png')

'''
MDS
'''
from sklearn.manifold import MDS

mds = MDS(n_components=2)
result = mds.fit_transform(X)
print(mds.embedding_)
print(mds.stress_)

df['D1'] = result[:, 0]
df['D2'] = result[:, 1]

plt.figure(figsize=(12, 9), dpi=300)
sns.scatterplot(x='D1', y='D2', hue='Cluster9',
                palette=sns.color_palette(n_colors=df['Cluster9'].nunique()),
                data=df)
def calculate_MDS():
    embeddings = MDS(n_components=2)
    transformed = embeddings.fit_transform(df)
    return transformed
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.manifold import MDS
from sklearn.metrics import silhouette_score

df = pd.read_excel('Cricket_inc_data_orig.xlsx', sheet_name='Test_data')
data = []
inc_list = []
inc_links = []
for i in df.index:
    data.append(df['Inc Summary'][i])
    inc_list.append(df['Inc ID'][i])
    inc_links.append(df['Inc Uts Link'][i])
print("reading finished")

stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords.add("please")
stopwords.add("Please")
# Standardization
sc = StandardScaler()
X_std = sc.fit_transform(X)
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# Training and evaluation with Logistic Regression
kpca_list = [
    KernelPCA(n_components=i + 1, kernel="rbf")
    for i in range(num_features - 1)
]
lle_list = [
    LocallyLinearEmbedding(n_components=i + 1)
    for i in range(num_features - 1)
]
mds_list = [MDS(n_components=i + 1) for i in range(num_features - 1)]
ism_list = [Isomap(n_components=i + 1) for i in range(num_features - 1)]

sample_dimensions = [i for i in range(num_features)]
random.shuffle(sample_dimensions)

# Store the data reduced to each dimensionality from 1 to (num_features-1)
X_kpca_train = [
    kpca_list[i].fit_transform(X_train_std) for i in range(num_features - 1)
]
X_kpca_test = [
    kpca_list[i].transform(X_test_std) for i in range(num_features - 1)
]
X_lle_train = [
    lle_list[i].fit_transform(X_train_std) for i in range(num_features - 1)
]
An___ = n.arccos(c)
# An___ = n.arccos((G / (n.array(n.dot(sc, u.T)) * n.array(n.dot(u, sc.T)))) ** .5)
An__ = n.degrees(An___)
min_angle = __mangle
An_ = An__ + min_angle - n.identity(N) * min_angle
An = n.real(n.maximum(An_, An_.T))  # communicability angles matrix
print('communicability calculations', t.time() - tt, 'net size', N)
tt = t.time()
# E_original = n.linalg.eigvals(An)

if __dimred == 'MDS':
    embedding = MDS(n_components=__dim, n_init=__inits, max_iter=__iters,
                    n_jobs=-1, dissimilarity='precomputed')
elif __dimred == 'PCA':
    embedding = PCA(n_components=__dim)
else:
    embedding = TSNE(n_components=__dim, n_iter=__iters, metric='precomputed',
                     learning_rate=__lrate, perplexity=__perplexity)

p = positions = embedding.fit_transform(An)
# p = positions = embedding.fit_transform(X)
print('embedding', t.time() - tt)
tt = t.time()
def calculate_and_cluster():
    # Variables for storing the data
    data_list = {}
    tag_list = {}
    tag_map = {}
    data_tag_map = {}
    counter = 0
    index = 0
    ptr = ""

    # Parse the CSV file (this will be denoted by a string variable)
    with open('../../data/sets/complete_set.csv', 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            data_list[counter] = ''.join(row)
            counter += 1

    counter = 0

    # Loop through data in range
    for data in range(0, len(data_list)):
        # Split the last token in the string
        split = data_list[data].split(" ")[-1:]
        # print(split[0], "Tag set: ", get_tag_set(split[0]))
        data_tag_map[split[0]] = get_tag_set(split[0])

    od = OrderedDict(sorted(data_tag_map.items()))
    names = []
    data_tagged_list = {}
    counter = 0

    for key, value in od.items():
        # Maintain old file name
        file_old = str(counter) + '.txt'
        tag = ''
        if len(value) == 1:
            tag = 'Tagged'
            names.append(str(counter) + "_" + tag)
            data_tagged_list[str(counter)] = True
        else:
            tag = 'Untagged'
            names.append(str(counter) + "_" + tag)
            data_tagged_list[str(counter)] = False
        # Create new file name with tagged / untagged appended
        file_new = str(counter) + '_' + tag + '.txt'
        # Rename the file for later use in color co-ordination
        rename_file(file_old, file_new)
        counter += 1

    dataNodes = []
    for x in range(0, len(data_list)):
        dataNodes.append(data_list[x])

    vect = TfidfVectorizer(min_df=1)
    tfidf = vect.fit_transform(dataNodes)

    X = genfromtxt('../semantic_similarity_algorithms/semantic_similarity_matrix/matrix.csv',
                   delimiter=',')
    X = symmetrize(X)
    print((X.transpose() == X).all())

    # N Components: plotting points in a two-dimensional plane
    # Dissimilarity: "precomputed" because of the Distance Matrix
    # Random state is fixed so we can reproduce the plot.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(X.astype(np.float64))  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]

    # Set figure size to have dimensions of at least 15 inches for the width.
    # Height can be scaled accordingly.
    plt.figure(figsize=(15, 8))
    plt.subplot(211)

    # Loop through the points, label appropriately and scatter.
    # Ensure figure size has enough room for legend plotting. Each plot must
    # have a label; in this case, label is the split value denoting the POI tag.
    for x, y, name in zip(xs, ys, names):
        plt.scatter(x, y, s=100, c=get_colour_tag(name.split('_', 1)[1]),
                    label=name.split('_', 1)[1])
        #plt.text(x, y, name.split('_', 1)[0])

    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = OrderedDict(zip(labels, handles))
    legend = plt.legend(by_label.values(), by_label.keys(), loc='lower center',
                        ncol=4, bbox_to_anchor=(0.5, -0.6))
    plt.show()
# ISOMAP
from sklearn.manifold import Isomap
iso = Isomap(n_components=3, n_neighbors=15)
fdata = iso.fit_transform(digits['data'])
plot_figure(fdata, 'ISOMAP')

# LLE
from sklearn.manifold import LocallyLinearEmbedding
lle = LocallyLinearEmbedding(n_neighbors=15, n_components=3, method='modified')
fdata = lle.fit_transform(digits['data'])
plot_figure(fdata, 'LLE')

# MDS
from sklearn.manifold import MDS
mds = MDS(n_components=3)
fdata = mds.fit_transform(digits['data'])
plot_figure(fdata, 'MDS')

# TSNE
from sklearn.manifold import TSNE
tsne = TSNE(n_components=3, perplexity=25, early_exaggeration=100)
fdata = tsne.fit_transform(digits['data'])
plot_figure(fdata, 't-SNE')
input_csv = "/home/li/torch/data/Data_Input_164_nakamura_20190605.csv"
output_csv = "/home/li/torch/data/Data_Output_164_nakamura_20190605.csv"
plot_path = "/home/li/torch/normal_net/figure/output/object_8_nakamura_output_mds_figure.png"
csv_path = "/home/li/torch/normal_net/figure/output/object_8_nakamura_output_distance.csv"
item_name_path = "/home/li/torch/normal_net/figure/output/item_name_nakamura.txt"

model = torch.load(model_path)
model.eval()
item_list = model.item_list

dataset = GlobalModelDataset(input_csv, output_csv)
embedding = MDS(n_components=2, dissimilarity="precomputed")

input_sample = random.sample(range(64), OBJECT_NUM)
#input_sample = [4,14,45,62,35,22,54,23]
input_name_list = []
for i in input_sample:
    input_name_list.append(item_list[i])

with open(item_name_path, 'w') as item_f:
    for item in input_name_list:
        item_f.write(str(item) + "\r\n")

input_test = []
for item in item_list:
    columns, MyList, MyList2, MyList3, MyList4, MyList5, MyList6, MyList7,
    MyList8, MyList9, MyList10, MyList11, MyList12, MyList13
])
print(df2)

# calculate distance between documents

# Euclidean Distance
dist = euclidean_distances(dtm)
print(np.round(dist, 0))

# Cosine Similarity
cosdist = 1 - cosine_similarity(dtm)
print(np.round(cosdist, 3))

# Visualizations (three methods)

# visualize in 2D
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)  # "precomputed" -> cosine distance matrix
pos = mds.fit_transform(cosdist)  # shape (n_samples, n_components)
xs, ys = pos[:, 0], pos[:, 1]

names = [
    'Austen_Emma', 'Austen_Pride', 'Austen_Sense', 'CBronte_Jane',
    'CBronte_Professor', 'CBronte_Villette', 'Dickens_Bleak', 'Dickens_David',
    'Dickens_Hard', 'EBronte_Wuthering', 'Eliot_Adam', 'Eliot_Middlemarch',
    'Eliot_Mill'
]
for x, y, name in zip(xs, ys, names):
    plt.scatter(x, y, color="blue")
    plt.text(x, y, name, fontsize=10)
plt.title("Visualization in 2D")
#fig = plt.figure()
#fig.savefig('2D.png')
plt.show()
def plot_clusters(num_clusters, feature_matrix, cluster_data, movie_data,
                  plot_size=(16, 8)):
    # generate random color for clusters
    def generate_random_color():
        color = '#%06x' % random.randint(0, 0xFFFFFF)
        return color

    # define markers for clusters
    markers = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd']

    # build cosine distance matrix
    cosine_distance = 1 - cosine_similarity(feature_matrix)

    # dimensionality reduction using MDS
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    # get coordinates of clusters in new low-dimensional space
    plot_positions = mds.fit_transform(cosine_distance)
    x_pos, y_pos = plot_positions[:, 0], plot_positions[:, 1]

    # build cluster plotting data
    cluster_color_map = {}
    cluster_name_map = {}
    for cluster_num, cluster_details in cluster_data.items():
        # assign cluster features to unique label
        cluster_color_map[cluster_num] = generate_random_color()
        cluster_name_map[cluster_num] = ', '.join(
            cluster_details['key_features'][:5]).strip()

    # map each unique cluster label with its coordinates and movies
    cluster_plot_frame = pd.DataFrame({
        'x': x_pos,
        'y': y_pos,
        'label': movie_data['Cluster'].values.tolist(),
        'title': movie_data['Title'].values.tolist()
    })
    grouped_plot_frame = cluster_plot_frame.groupby('label')

    # set plot figure size and axes
    fig, ax = plt.subplots(figsize=plot_size)
    ax.margins(0.05)

    # plot each cluster using co-ordinates and movie titles
    for cluster_num, cluster_frame in grouped_plot_frame:
        marker = markers[cluster_num] if cluster_num < len(markers) \
            else np.random.choice(markers, size=1)[0]
        ax.plot(cluster_frame['x'], cluster_frame['y'],
                marker=marker, linestyle='', ms=12,
                label=cluster_name_map[cluster_num],
                color=cluster_color_map[cluster_num], mec='none')
    ax.set_aspect('auto')
    ax.tick_params(axis='x', which='both', bottom=False, top=False,
                   labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, right=False,
                   labelleft=False)
    fontP = FontProperties()
    fontP.set_size('small')
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.01), fancybox=True,
              shadow=True, ncol=5, numpoints=1, prop=fontP)

    # add labels as the film titles
    for index in range(len(cluster_plot_frame)):
        ax.text(cluster_plot_frame.iloc[index]['x'],
                cluster_plot_frame.iloc[index]['y'],
                cluster_plot_frame.iloc[index]['title'], size=8)

    plt.savefig('clusters_data.png', dpi=200)
    # show the plot
    plt.show()