def create_2dprojection(distmat):
    """Embed a precomputed distance matrix with metric MDS.

    Despite the function name, the embedding has three components
    (``n_components=3``); the name is kept so existing callers still work.

    :param distmat: square matrix of pairwise distances; it is handed to MDS
        with ``dissimilarity='precomputed'``
    :return: the array produced by ``MDS.fit_transform`` (one row per sample,
        three columns)
    """
    # print() with a single argument behaves the same on Python 2 and 3,
    # whereas the former print statement is a SyntaxError on Python 3.
    print('map to 3d space')
    mapper = MDS(n_components=3, metric=True, n_init=4, max_iter=300,
                 verbose=0, eps=0.001, n_jobs=-1, random_state=0,
                 dissimilarity='precomputed')
    projmat = mapper.fit_transform(distmat)
    print('DONE')
    return projmat
def main():
    """Plot image patches at the 2-D MDS coordinates of a distance matrix.

    Reads ``distmat799.txt`` (comma separated), divides it by its maximum,
    embeds it in two dimensions with MDS and draws every PNG found in the
    ``799_patch`` directory at the embedded position that has the same index
    as the file in the directory listing.
    """
    # load sample data
    data = np.loadtxt("distmat799.txt", delimiter=",")
    dists = data / np.amax(data)

    # load images; only accept names that END in ".png" -- the previous
    # unanchored r"\.png" search also matched names such as "a.png.bak"
    # NOTE(review): image i is paired with row i of the matrix, so this relies
    # on the os.listdir() order matching the matrix order -- confirm.
    img_files = [img for img in os.listdir("799_patch") if img.endswith(".png")]

    # mds
    mds = MDS(n_components=2, dissimilarity="precomputed")
    results = mds.fit(dists)

    # plot
    fig, ax = plt.subplots()
    for i, img_file in enumerate(img_files):
        img_file = os.path.join("799_patch", img_file)
        img = read_png(img_file)
        imagebox = OffsetImage(img, zoom=2.0)
        # row i of the embedding is the (x, y) position of image i
        xy = tuple(results.embedding_[i, :])
        ax.add_artist(AnnotationBbox(imagebox, xy))
    ax.set_xlim(-1.0, 1.0)
    ax.set_ylim(-1.0, 1.0)
    plt.show()
def plotFlatClusterGraph(tf_idf_matrix, clusters, headlines_utf):
    """Save a 2-D MDS scatter plot of documents coloured by cluster.

    :param tf_idf_matrix: document matrix handed to ``cosine_similarity``
    :param clusters: one cluster label per document; every label must be a
        key of the ten-entry colour table below (0-9)
    :param headlines_utf: one text per document, drawn next to its point
    :return: None; the figure is written to ``../plots/flat_clusters.png``
    """
    dist = 1 - cosine_similarity(tf_idf_matrix)
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(dist)
    xs, ys = pos[:, 0], pos[:, 1]
    cluster_colors = {0: '#FE642E', 1: '#B40404', 2: '#D7DF01', 3: '#01DF01', 4: '#00FFBF', 5: '#2E64FE', 6:'#8904B1', 7:'#FA58F4', 8:'#FE2E9A', 9:'#A4A4A4'}

    # data frame holding the MDS result plus the cluster numbers and titles
    df = pandas.DataFrame(dict(x=xs, y=ys, label=clusters, title=headlines_utf))

    # set up plots
    fig, ax = plt.subplots(figsize=(17, 9))  # set size

    # one plot call per cluster so that each cluster gets its own colour;
    # the label gives the legend below something to show
    for name, group in df.groupby('label'):
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
                color=cluster_colors[name], mec='none', label=str(name))

    # Axis cosmetics do not depend on the cluster, so they are applied once.
    # Booleans are used because recent matplotlib no longer reads the string
    # 'off' as False.
    ax.set_aspect('auto')
    ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, top=False, labelleft=False)
    ax.legend(numpoints=1)  # show legend with only 1 point

    # write each title at its point; iterating the columns replaces the
    # DataFrame.ix accessor, which was removed in pandas 1.0
    for x, y, title in zip(df['x'], df['y'], df['title']):
        ax.text(x, y, title, size=8)

    plt.savefig('../plots/flat_clusters.png', dpi=400)
Example #4
0
def reorder_channels_by_xyz_coord(data, channel_names=None):
    """
    Re-order channels along a 1-d MDS projection of the electrode distances.

    :param data: 2-d array in the format [n_samples, n_channels]
    :param channel_names: names of the EEG channels
    :return: data, channel_names permutated accordingly
    """
    # work on transposed view, i.e. [channel, samples]
    data = data.T

    # map channels to 1-d coordinates through MDS
    from sklearn.manifold import MDS
    distances = compute_electrode_distance_matrix()
    mds = MDS(n_components=1, dissimilarity='precomputed')
    # one coordinate per channel; the reshape fails if the distance matrix
    # does not have exactly one row per channel of `data`
    projection = mds.fit_transform(distances).reshape(data.shape[0])
    order = np.argsort(projection)
    # print() calls instead of Python 2 print statements (a SyntaxError on
    # Python 3); single-argument output is unchanged on Python 2
    print(mds.stress_)
    print(order)

    # re-order channels
    data = data[order]
    # restore initial axes layout
    data = data.T

    # re-order channel_names
    channel_names = reorder_channel_names(channel_names, order)

    return data, channel_names
Example #5
0
def plot_cities():
    """Embed a hard-coded table of nine inter-city distances with MDS and plot it.

    Prints every asymmetric index pair of the table, the table itself and the
    resulting embedding, then shows a labelled scatter plot.
    """
    #distance_matrix = get_distances()
    cities = 'BOS     CHI     DC      DEN     LA      MIA     NY      SEA     SF'.split()
    distance_matrix = np.array([
        [0   , 963 , 429 , 1949, 2979, 1504, 206 , 2976, 3095],
        [963 , 0   , 671 , 996 , 2054, 1329, 802 , 2013, 2142],
        [429 , 671 , 0   , 1616, 2631, 1075, 233 , 2684, 2799],
        [1949, 996 , 1616, 0   , 1059, 2037, 1771, 1307, 1235],
        [2979, 2054, 2631, 1059, 0   , 2687, 2786, 1131, 379],
        [1504, 1329, 1075, 2037, 2687, 0   , 1308, 3273, 3053],
        [206 , 802 , 233 , 1771, 2786, 1308, 0   , 2815, 2934],
        [2976, 2013, 2684, 1307, 1131, 3273, 2815, 0   , 808],
        [3095, 2142, 2799, 1235, 379 , 3053, 2934, 808 , 0]
        ])

    # Report every (i, j) whose mirror entry differs.  The former check looped
    # over range(0, 8) and so never looked at the last row/column of this 9x9
    # table; it also relied on `assert`, which is stripped under `python -O`.
    for i, j in np.argwhere(distance_matrix != distance_matrix.T):
        print((int(i), int(j)))

    print(distance_matrix)
    mds = MDS(dissimilarity='precomputed')
    mds.fit(distance_matrix)
    print(mds.embedding_)
    # embedding rows are in the same order as `cities`
    for idx, points in enumerate(mds.embedding_):
        plt.plot(points[0], points[1], 'r.')
        plt.text(points[0], points[1], cities[idx])
    plt.show()
 def scale_plot(input_data, data_colors=None, cluster_colors=None,
                cluster_sizes=None, dissimilarity='euclidean', filey=None):
     """Plot a 2-D MDS embedding of cluster centres and data points.

     The first ``n_clusters`` rows of the embedding are drawn as numbered
     stars, the remaining rows as plain dots.
     NOTE(review): ``n_clusters`` is not a parameter; it is read from an
     enclosing/global scope that is not visible here -- confirm it is defined.

     :param input_data: data handed to ``MDS.fit_transform``
     :param data_colors: colour(s) of the data points (default ``'r'``)
     :param cluster_colors: colour(s) of the cluster stars (default ``'b'``)
     :param cluster_sizes: marker size(s) of the stars (default ``2200``)
     :param dissimilarity: forwarded to ``MDS``
     :param filey: if given, the figure is titled with the file's base name
         minus its last four characters, saved via ``save_figure`` and closed
     """
     data_colors = 'r' if data_colors is None else data_colors
     cluster_colors = 'b' if cluster_colors is None else cluster_colors
     cluster_sizes = 2200 if cluster_sizes is None else cluster_sizes

     # scale
     embedding = MDS(dissimilarity=dissimilarity).fit_transform(input_data)

     with sns.axes_style('white'):
         f = plt.figure(figsize=(14, 14))
         points = embedding[n_clusters:]
         plt.scatter(points[:, 0], points[:, 1], s=75, color=data_colors)
         centres = embedding[:n_clusters]
         plt.scatter(centres[:, 0], centres[:, 1], marker='*',
                     s=cluster_sizes, color=cluster_colors,
                     edgecolor='black', linewidth=2)
         # write the 1-based cluster number onto each star; numbers with two
         # digits are shifted left by twice the offset
         offset = .011
         font_dict = {'fontsize': 17, 'color':'white'}
         for i, (x, y) in enumerate(embedding[:n_clusters]):
             x_shift = offset if i < 9 else offset*2
             plt.text(x - x_shift, y - offset, i+1, font_dict)

     if filey is None:
         return
     plt.title(path.basename(filey)[:-4], fontsize=20)
     save_figure(f, filey)
     plt.close()
Example #7
0
def embed_two_dimensions(data, vectorizer, size=10, n_components=5, colormap='YlOrRd'):
    """Vectorize graphs, embed them with MDS + TruncatedSVD and plot in 2-D.

    :param data: iterable of graph objects; each must expose a ``graph``
        mapping whose optional ``'id'`` entry is used as the plot label
    :param vectorizer: object whose ``transform`` turns the iterable into a
        data matrix
    :param size: side length of the (square) matplotlib figure
    :param n_components: number of MDS components computed before the final
        reduction to two dimensions
    :param colormap: passed to ``embed_dat_matrix_two_dimensions`` as
        ``density_colormap``
    :raises Exception: if ``data`` has no ``__iter__``
    """
    if not hasattr(data, '__iter__'):
        raise Exception('ERROR: Input must be iterable')
    import itertools
    # the iterable is consumed twice: once for the vectors, once for labels
    graphs_for_vectors, graphs_for_labels = itertools.tee(data)

    # get labels; only a MISSING id is skipped -- the former truthiness test
    # also dropped valid falsy ids such as 0, which shifted all later labels
    labels = []
    for graph in graphs_for_labels:
        label = graph.graph.get('id', None)
        if label is not None:
            labels.append(label)

    # transform iterable into sparse vectors
    data_matrix = vectorizer.transform(graphs_for_vectors)
    # embed high dimensional sparse vectors in 2D
    from sklearn import metrics
    distance_matrix = metrics.pairwise.pairwise_distances(data_matrix)

    from sklearn.manifold import MDS
    feature_map = MDS(n_components=n_components, dissimilarity='precomputed')
    explicit_data_matrix = feature_map.fit_transform(distance_matrix)

    from sklearn.decomposition import TruncatedSVD
    pca = TruncatedSVD(n_components=2)
    low_dimension_data_matrix = pca.fit_transform(explicit_data_matrix)

    plt.figure(figsize=(size, size))
    embed_dat_matrix_two_dimensions(low_dimension_data_matrix, labels=labels, density_colormap=colormap)
    plt.show()
def main():
    """Embed the ``load_digits`` data with a default MDS and plot it by target."""
    dataset = load_digits()
    features, targets = dataset.data, dataset.target
    embedded = MDS().fit_transform(features)
    plot_embedding(embedded, targets)
Example #9
0
def labtest_MDS(PID):
    """Return the 2-D metric-MDS embedding of the scaled tests of some patients.

    :param PID: iterable of keys into the module-level ``patients`` mapping
    :return: ``embedding_`` of the fitted MDS model, one row per entry of PID
    """
    tests = []
    for pid in PID:
        tests.append(patients[pid]['tests'])
    scaled = pp.scale(tests)
    model = MDS(n_components=2, metric=True, n_init=4, max_iter=300,
                verbose=0, eps=0.001, n_jobs=1, dissimilarity='euclidean')
    model.fit(scaled)
    return model.embedding_
def main():
    """Scatter-plot the digits dataset reduced to 2-D by MDS or PCA.

    The ``--mds`` command-line flag (parsed by docopt from the module
    docstring) selects MDS; otherwise PCA is used.
    """
    args = docopt(__doc__)

    # load datasets
    digits = load_digits()
    X, y = digits.data, digits.target
    labels = digits.target_names

    # dimension reduction
    model = MDS(n_components=2) if args['--mds'] else PCA(n_components=2)
    X_fit = model.fit_transform(X)

    # one scatter call per class so each class gets its own colour and label
    for i in range(labels.shape[0]):
        members = y == i
        plt.scatter(X_fit[members, 0], X_fit[members, 1],
                    color=COLORS[i], label=str(i))

    plt.legend(loc='upper left')
    plt.autoscale()
    plt.grid()
    plt.show()
Example #11
0
def plotMap(maparr, freq, nest, seqs, dbfile, map2d, outfile, plotm='T'):
    """Embed ``maparr`` in 2-D with MDS and optionally plot and export it.

    :param maparr: 2-d array-like; Euclidean distances between its rows are
        the dissimilarities given to MDS
    :param freq, nest, seqs, dbfile, map2d: only forwarded to ``writePlotMDS``
    :param outfile: path prefix; ``.png`` and ``.pdf`` are appended
    :param plotm: plotting and the CSV export happen only when
        ``str(plotm) == 'T'``
    :return: the MDS embedding, one row per row of ``maparr``
    """
    # multi-dimensional scaling
    # a plain 2-d ndarray replaces np.matrix, which NumPy discourages and
    # recent scikit-learn releases refuse as input
    similarities = euclidean_distances(np.atleast_2d(np.asarray(maparr)))
    mds = MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=np.random.RandomState(seed=3), dissimilarity="precomputed", n_jobs=1)
    pos = mds.fit(similarities).embedding_

    # plot attributes
    N = len(pos)
    size = 8000
    color = np.arange(N)

    if str(plotm) == 'T':

        # plot MDS
        fig, ax = plt.subplots(figsize=(10,10))
        # Silence warnings only for the duration of the plotting code.  The
        # former filterwarnings("ignore") / resetwarnings() pair discarded
        # every warning filter of the process and left warnings disabled if
        # plotting raised.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            ax.scatter(pos[:, 0], pos[:, 1], c=color, s=size, alpha=0.3, cmap=plt.cm.viridis, marker='s')
            plt.xlabel('Dimension 1', fontsize=20, labelpad=20)
            plt.ylabel('Dimension 2', fontsize=20, labelpad=20)
            # booleans: recent matplotlib no longer reads 'off' as False
            plt.tick_params(labelsize=15, length=14, direction='out', pad=15, top=False, right=False)

            # save figures
            fig.savefig(outfile + '.png', bbox_inches='tight', format='png')
            fig.savefig(outfile + '.pdf', bbox_inches='tight', format='pdf')
            plt.close(fig)

        # write csv file
        writePlotMDS(freq, nest, seqs, dbfile, pos, maparr, map2d, outfile)

    return pos
Example #12
0
def project_in_2D(distance_mat, method='mds'):
  """
  Project SDRs onto a 2D space using manifold learning algorithms

  For 'mds' a metric MDS is fitted first and its embedding is used as the
  starting configuration of a non-metric MDS; both share one RandomState.

  :param distance_mat: A square matrix with pairwise distances
  :param method: Select method from 'mds' and 'tSNE'
  :return: an array with dimension (numSDRs, 2). It contains the 2D projections
     of each SDR
  :raises NotImplementedError: for any other ``method``
  """
  seed = np.random.RandomState(seed=3)

  if method not in ('mds', 'tSNE'):
    raise NotImplementedError

  if method == 'tSNE':
    embedder = TSNE(n_components=2, init='pca', random_state=0)
    return embedder.fit_transform(distance_mat)

  metric_mds = MDS(n_components=2, max_iter=3000, eps=1e-9,
                   random_state=seed,
                   dissimilarity="precomputed", n_jobs=1)
  initial = metric_mds.fit(distance_mat).embedding_

  nonmetric_mds = MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12,
                      dissimilarity="precomputed", random_state=seed,
                      n_jobs=1, n_init=1)
  return nonmetric_mds.fit_transform(distance_mat, init=initial)
def visualize_clusters(tfidf_matrix, vocabulary, km):
    """Show a 2-D MDS scatter plot of documents coloured by cluster.

    :param tfidf_matrix: document matrix handed to ``cosine_similarity``
    :param vocabulary: not used by this function
    :param km: fitted clustering object; its ``labels_`` give one cluster id
        per document and must lie in 0-5 (the keys of the tables below)
    """
    # cosine distance between each pair of documents; this is what gets
    # projected onto the two-dimensional plane
    dist = 1 - cosine_similarity(tfidf_matrix)
    clusters = km.labels_.tolist()

    # two components because we plot in a plane, 'precomputed' because we
    # provide a distance matrix, fixed random_state so the plot is reproducible
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]

    # colour per cluster
    cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a', 4: '#66a61e', 5: '#99cc00'}

    # legend name per cluster (perhaps the top terms of each cluster later)
    cluster_names = {0: '0',
                     1: '1',
                     2: '2',
                     3: '3',
                     4: '4',
                     5: '5'}

    # data frame holding the MDS result plus the cluster numbers
    df = pd.DataFrame(dict(x=xs, y=ys, label=clusters))

    # set up plot
    fig, ax = plt.subplots(figsize=(17, 9))  # set size
    ax.margins(0.05)  # optional, just adds 5% padding to the autoscaling

    # one plot call per cluster; 'name' is the cluster id used to look up
    # the matching colour and legend label
    for name, group in df.groupby('label'):
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
                label=cluster_names[name], color=cluster_colors[name],
                mec='none')

    # Axis cosmetics are the same for every cluster, so they are applied once
    # instead of once per loop iteration.  Booleans are used because recent
    # matplotlib no longer reads the string 'off' as False.
    ax.set_aspect('auto')
    ax.tick_params(
        axis='x',           # changes apply to the x-axis
        which='both',       # both major and minor ticks are affected
        bottom=False,       # ticks along the bottom edge are off
        top=False,          # ticks along the top edge are off
        labelbottom=False)
    ax.tick_params(
        axis='y',           # changes apply to the y-axis
        which='both',       # both major and minor ticks are affected
        left=False,         # ticks along the left edge are off
        top=False,
        labelleft=False)

    ax.legend(numpoints=1)  # show legend with only 1 point

    plt.show()  # show the plot
    def generate_cluster_plot_frame(self):
        """Build ``self.cluster_plot_frame`` from a 2-D MDS of the TF-IDF matrix.

        Reads ``self.tfidf_matrix``, ``self.clusters``, ``self.chapter_list``
        and ``self.book_list``; stores a DataFrame with the columns ``x``,
        ``y``, ``label``, ``chapter`` and ``book``.
        """
        # (a stray, unused ``MDS()`` instantiation was removed here)
        mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
        # cosine distance = 1 - cosine similarity
        dist = 1 - cosine_similarity(self.tfidf_matrix)
        pos = mds.fit_transform(dist)
        xs, ys = pos[:, 0], pos[:, 1]

        self.cluster_plot_frame = pd.DataFrame(dict(x=xs, y=ys, label=self.clusters, chapter=self.chapter_list, book=self.book_list))
Example #15
0
def non_param_multi_dim_scaling(dists, n_dims=3, n_threads=None, metric=True):
    """Run MDS on a distance vector/matrix and report stress and coordinates.

    :param dists: distances in a form accepted by ``squareform``, which
        converts them before they are given to MDS as precomputed
        dissimilarities
    :param n_dims: number of MDS components
    :param n_threads: forwarded to MDS as ``n_jobs``
    :param metric: forwarded to MDS (True by default, despite the name of
        this function)
    :return: dict with the keys ``'stress'`` and ``'projections'``
    """
    model = MDS(n_components=n_dims, metric=metric, n_jobs=n_threads,
                dissimilarity='precomputed')
    model.fit(squareform(dists))
    projections = model.embedding_
    return {'stress': model.stress_, 'projections': projections}
def plot_clusters(num_clusters, feature_matrix,
                  cluster_data, movie_data,
                  plot_size=(16,8)):
    """Show a 2-D MDS scatter plot of movies, styled and labelled by cluster.

    :param num_clusters: not used by this function
    :param feature_matrix: matrix handed to ``cosine_similarity``
    :param cluster_data: mapping cluster number -> details; the first five
        entries of ``details['key_features']`` become the legend text
    :param movie_data: object with ``'Cluster'`` and ``'Title'`` columns, one
        row per row of ``feature_matrix``
    :param plot_size: matplotlib figure size
    """
    # generate random color for clusters
    def generate_random_color():
        return '#%06x' % random.randint(0, 0xFFFFFF)

    # define markers for clusters
    markers = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd']
    # build cosine distance matrix
    cosine_distance = 1 - cosine_similarity(feature_matrix)
    # dimensionality reduction using MDS
    mds = MDS(n_components=2, dissimilarity="precomputed",
              random_state=1)
    # get coordinates of clusters in new low-dimensional space
    plot_positions = mds.fit_transform(cosine_distance)
    x_pos, y_pos = plot_positions[:, 0], plot_positions[:, 1]
    # build cluster plotting data: a random colour and a legend name each
    cluster_color_map = {}
    cluster_name_map = {}
    for cluster_num, cluster_details in cluster_data.items():
        cluster_color_map[cluster_num] = generate_random_color()
        cluster_name_map[cluster_num] = ', '.join(cluster_details['key_features'][:5]).strip()
    # map each unique cluster label with its coordinates and movies
    cluster_plot_frame = pd.DataFrame({'x': x_pos,
                                       'y': y_pos,
                                       'label': movie_data['Cluster'].values.tolist(),
                                       'title': movie_data['Title'].values.tolist()
                                        })
    # set plot figure size and axes
    fig, ax = plt.subplots(figsize=plot_size)
    ax.margins(0.05)
    # plot each cluster using co-ordinates; clusters beyond the marker list
    # fall back to a randomly chosen marker
    for cluster_num, cluster_frame in cluster_plot_frame.groupby('label'):
        marker = markers[cluster_num] if cluster_num < len(markers) \
                 else np.random.choice(markers, size=1)[0]
        ax.plot(cluster_frame['x'], cluster_frame['y'],
                marker=marker, linestyle='', ms=12,
                label=cluster_name_map[cluster_num],
                color=cluster_color_map[cluster_num], mec='none')
    # Axis cosmetics do not depend on the cluster, so they are applied once.
    # Booleans are used because recent matplotlib no longer reads the string
    # 'off' as False.
    ax.set_aspect('auto')
    ax.tick_params(axis='x', which='both', bottom=False, top=False,
                   labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, top=False,
                   labelleft=False)
    fontP = FontProperties()
    fontP.set_size('small')
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.01), fancybox=True,
              shadow=True, ncol=5, numpoints=1, prop=fontP)
    # add labels as the film titles; iterating the columns replaces the
    # DataFrame.ix accessor, which was removed in pandas 1.0
    for x, y, title in zip(cluster_plot_frame['x'],
                           cluster_plot_frame['y'],
                           cluster_plot_frame['title']):
        ax.text(x, y, title, size=8)
    # show the plot
    plt.show()
Example #17
0
def md_scaling(co_matrix, is_distance_matrix=False):
    """Return the default MDS embedding of a co-occurrence or distance matrix.

    :param co_matrix: when ``is_distance_matrix`` is false, an object whose
        ``matrix`` attribute is turned into distances via ``-log``;
        otherwise the distance matrix itself
    :param is_distance_matrix: treat ``co_matrix`` as ready-made distances
    :return: ``embedding_`` of the fitted MDS model
    """
    distance_matrix = co_matrix if is_distance_matrix else -np.log(co_matrix.matrix)
    model = MDS(dissimilarity='precomputed')
    return model.fit(distance_matrix).embedding_
Example #18
0
def mds_embed(graph):
    """Embed the nodes of a graph with MDS on their shortest-path distances.

    Nodes are ordered by ``len`` of the node object before the all-pairs
    distance matrix is computed with ``nx.floyd_warshall_numpy``.

    :return: tuple ``(embedded points, distance matrix, ordered node list)``
    """
    nodes_by_length = sorted(graph.nodes(), key=len)
    path_lengths = nx.floyd_warshall_numpy(graph, nodelist=nodes_by_length)

    embedder = MDS(n_jobs=-2, dissimilarity="precomputed")
    points = embedder.fit_transform(path_lengths)

    return (points, path_lengths, nodes_by_length)
Example #19
0
def mds(similarity, euclid=False):
    """Return the transposed MDS embedding of ``similarity``.

    :param similarity: with ``euclid`` set it is given to MDS as raw data;
        otherwise ``1 - similarity`` is given as precomputed dissimilarities
    :param euclid: selects between the two modes above
    :return: ``fit_transform`` result transposed (components along axis 0)
    """
    if euclid:
        embedder = MDS(max_iter=1000)
        coords = embedder.fit_transform(similarity)
        return coords.T

    embedder = MDS(max_iter=1000, dissimilarity='precomputed')
    coords = embedder.fit_transform(1 - similarity)
    return coords.T
Example #20
0
def compute_2d_mapping(layout):
    """Project the electrodes of ``layout`` into 2-D with MDS.

    The distance matrix comes from ``compute_electrode_distance_matrix``,
    called with the layout's sphere coordinates and sphere radius.

    :return: result of ``MDS.fit_transform`` with two components
    """
    coords = layout.sphere_coords()
    sphere_radius = layout.sphere_radius()
    from sklearn.manifold import MDS
    electrode_distances = compute_electrode_distance_matrix(coords, sphere_radius)
    embedder = MDS(n_components=2, dissimilarity='precomputed')
    return embedder.fit_transform(electrode_distances)
Example #21
0
    def cluster(D, k=3, verbose=False):
        """Cluster LDS's via Multi-Dimensional Scaling and KMeans.

        Strategy:
            1. Symmetrize the NxN matrix of pairwise distances
            2. Run MDS to embed data in R^2
            3. Run KMeans with k cluster centers
            4. Find samples closest to the k centers

        Parameters:
        -----------
        D: numpy.ndarray, shape = (N, N)
            Precomputed distance matrix. It is symmetrized as
            ``D + D.T - diag(D)``; presumably only one triangle is expected
            to be filled in -- verify against the caller.

        k: int (default: 3)
            Number of desired cluster centers.

        verbose: boolean
            Enable verbose output of the MDS solver.

        Returns:
        --------
        eData: numpy.ndarray, shape (N, 2)
            The N samples embedded in R^2.

        ids: numpy.ndarray, shape = (k,)
            List of indices identifying the k representatives.
        """

        assert D.shape[0] == D.shape[1], "OOps (distance matrix not square)!"

        # build MDS for precomputed distance matrix; `verbose` is now
        # honoured (it used to be ignored in favour of a hard-coded True)
        mds = MDS(metric=True, n_components=2, verbose=int(bool(verbose)),
                  dissimilarity="precomputed")

        def __symmetrize(A):
            return A + A.T - np.diag(A.diagonal())

        # run MDS on symmetrized distance matrix
        eData = mds.fit(__symmetrize(D)).embedding_

        kmObj = KMeans(k)
        kmObj.fit_predict(eData)

        # builtin int: the np.int alias was removed in NumPy 1.24
        ids = np.zeros((k,), dtype=int)
        for i in range(k):
            # sanity check
            cDat = eData[np.where(kmObj.labels_ == i)[0],:]
            assert len(cDat) > 0, "Oops, empty cluster ..."

            # euclidean_distances needs 2-d input, so the centre is passed
            # as a single-row matrix
            kCen = kmObj.cluster_centers_[i,:].reshape(1, -1)
            x = euclidean_distances(eData, kCen)
            # index of the sample closest to the centre
            ids[i] = int(np.argmin(x))

        # return embedded data and ID's of representative LDS's
        return (eData, ids)
Example #22
0
def get_mds(similarities):
    """Return a 2-D non-metric MDS embedding of a precomputed matrix.

    The maximum and minimum of the input are printed first.

    :param similarities: matrix given to MDS as precomputed dissimilarities
    :return: the embedding as a numpy array
    """
    rng = np.random.RandomState(seed=3)
    for extreme in (np.amax, np.amin):
        print(extreme(similarities))
    model = MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12,
                dissimilarity="precomputed", random_state=rng, n_jobs=1,
                n_init=1)
    model.fit(similarities)
    return np.array(model.embedding_)
def convert_matrix_to_coordinates(sym_matrix, components):
    """
    Turn a precomputed dissimilarity matrix into MDS coordinates.

    :param sym_matrix: array, [n_samples, n_samples]
    :param components: int: 2 or 3 for MDS
    :return: Output of MDS, xy or xyz coordinates as 2d numpy array
             with shape [n_samples, components]
    """
    # fixed random_state so repeated calls give the same layout
    scaler = MDS(n_components=components, dissimilarity="precomputed", random_state=1)
    return scaler.fit_transform(sym_matrix)
def plotMDS(X, Y):
    """Compute and show a 2-D MDS scatter plot (a view of how well data separates).

    :param X: observations; their pairwise ``pdist`` distances are embedded
    :param Y: per-observation values used to colour the points
    """
    D = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(X))
    tmodel = MDS(n_components=2, dissimilarity='precomputed')
    X2D = tmodel.fit_transform(D)
    plt.figure()
    plt.title('MDS')
    # the first component is drawn on the x-axis and the second on the
    # y-axis, so the labels follow that order (they used to be swapped)
    plt.xlabel('MDS1')
    plt.ylabel('MDS2')
    plt.scatter(X2D[:, 0], X2D[:, 1], c=Y)
    plt.show()
Example #25
0
def embedDistanceMatrix(dmatDf, method='kpca', n_components=2, **kwargs):
    """Low-dimensional embedding of the distances in dmatDf.

    :param dmatDf: square distance matrix, either a pandas DataFrame (its
        index is re-used for the result) or a plain array
    :param method: one of 'tsne', 'isomap', 'mds', 'pca', 'kpca', 'lle',
        'sklearn-tsne', 'umap'
    :param n_components: number of output coordinates per sample
    :param kwargs: 'tsne' and 'sklearn-tsne' require ``perplexity``; for
        'umap' all keyword arguments are forwarded to ``umap.UMAP``
    :return: DataFrame with one row per sample and ``n_components`` columns,
        or None (after printing a message) for an unknown method. For 'kpca'
        the frame additionally carries an ``explained_variance_`` attribute.
    """
    if isinstance(dmatDf, pd.DataFrame):
        dmat = dmatDf.values
        index = dmatDf.index
    else:
        # plain arrays have no .index; pandas then falls back to 0..N-1
        dmat = dmatDf
        index = None

    if method == 'tsne':
        xy = tsne.run_tsne(dmat, no_dims=n_components, perplexity=kwargs['perplexity'])
    elif method == 'isomap':
        isoObj = Isomap(n_neighbors=10, n_components=n_components)
        xy = isoObj.fit_transform(dmat)
    elif method == 'mds':
        mds = MDS(n_components=n_components,
                  max_iter=3000,
                  eps=1e-9,
                  random_state=15,
                  dissimilarity="precomputed",
                  n_jobs=1)
        xy = mds.fit(dmat).embedding_
        # re-express the MDS solution along its principal axes
        rot = PCA(n_components=n_components)
        xy = rot.fit_transform(xy)
    elif method == 'pca':
        pcaObj = PCA(n_components=None)
        xy = pcaObj.fit_transform(dmat)[:, :n_components]
    elif method == 'kpca':
        pcaObj = KernelPCA(n_components=dmat.shape[0], kernel='precomputed', eigen_solver='dense')
        try:
            gram = dist2kernel(dmat)
        except Exception:
            # deliberate best-effort fallback; narrowed from a bare except so
            # that KeyboardInterrupt/SystemExit are no longer swallowed
            print('Could not convert dmat to kernel for KernelPCA; using 1 - dmat/dmat.max() instead')
            gram = 1 - dmat / dmat.max()
        xy = pcaObj.fit_transform(gram)[:, :n_components]
    elif method == 'lle':
        lle = manifold.LocallyLinearEmbedding(n_neighbors=30, n_components=n_components, method='standard')
        # was fit_transform(dist): `dist` is not defined in this function
        xy = lle.fit_transform(dmat)
    elif method == 'sklearn-tsne':
        tsneObj = TSNE(n_components=n_components, metric='precomputed', random_state=0, perplexity=kwargs['perplexity'])
        xy = tsneObj.fit_transform(dmat)
    elif method == 'umap':
        umapObj = umap.UMAP(n_components=n_components, metric='precomputed', **kwargs)
        xy = umapObj.fit_transform(dmat)
    else:
        print('Method unknown: %s' % method)
        return

    assert xy.shape[0] == dmat.shape[0]
    xyDf = pd.DataFrame(xy[:, :n_components], index=index, columns=np.arange(n_components))
    if method == 'kpca':
        # Not sure how negative eigenvalues should be handled here, but they
        # are usually small so it shouldn't make a big difference.
        # scikit-learn renamed KernelPCA.lambdas_ to eigenvalues_; support both.
        eigvals = getattr(pcaObj, 'eigenvalues_', None)
        if eigvals is None:
            eigvals = pcaObj.lambdas_
        setattr(xyDf, 'explained_variance_', eigvals[:n_components]/eigvals[eigvals>0].sum())
    return xyDf
Example #26
0
def mds(cos_simil_mtr):
    """Return the x and y coordinates of a 2-D MDS of a precomputed matrix.

    NOTE(review): the argument name suggests a similarity matrix, but it is
    given to MDS as precomputed *dissimilarities* -- confirm the caller passes
    distances.

    :return: tuple ``(xs, ys)`` -- first and second column of the embedding
    """
    # two components for a planar plot; fixed random_state for reproducibility
    embedder = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

    coords = embedder.fit_transform(cos_simil_mtr)  # shape (n_samples, n_components)
    first, second = coords[:, 0], coords[:, 1]
    print()

    return first, second
Example #27
0
 def generate_cluster_plot_frame(self):
     """Build ``self.cluster_plot_frame`` from a 2-D MDS of the TF-IDF matrix.

     Reads ``self.tfidf_matrix``, ``self.clusters``, ``self.presentation_list``
     and ``self.innovation_list``; stores a DataFrame with the columns ``x``,
     ``y``, ``label``, ``presentation`` and ``innovation_list``.
     """
     embedder = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
     # cosine distance = 1 - cosine similarity
     distances = 1 - cosine_similarity(self.tfidf_matrix)
     coords = embedder.fit_transform(distances)
     self.cluster_plot_frame = pd.DataFrame({
         "x": coords[:, 0],
         "y": coords[:, 1],
         "label": self.clusters,
         "presentation": self.presentation_list,
         "innovation_list": self.innovation_list,
     })
Example #28
0
def transform_and_plot_data(seed, distance_matrix, dim_x, dim_y, title, plot3D, ax):
    """Embed a precomputed distance matrix with MDS and draw it in 2-D or 3-D.

    :param seed: forwarded to MDS as ``random_state``
    :param distance_matrix: precomputed dissimilarities
    :param dim_x, dim_y: forwarded to ``create_pairs_to_plot_from_list``
    :param title, ax: forwarded to the plotting helper
    :param plot3D: truthy -> three components and ``my_plot3D``; otherwise
        two components and ``my_plot2D``
    """
    n_components = 3 if plot3D else 2
    embedder = MDS(n_components=n_components, max_iter=3000, eps=1e-9,
                   random_state=seed, dissimilarity="precomputed", n_jobs=1)
    embedded = embedder.fit_transform(distance_matrix)

    corner_points, pair_list = create_pairs_to_plot_from_list(embedded, dim_x, dim_y)
    draw = my_plot3D if plot3D else my_plot2D
    draw(corner_points, pair_list, False, title, ax)
Example #29
0
def make_mds_image(m, filename, labels=None, colour=None):
    """Given a matrix of distances, project into 2D space using
    multi-dimensional scaling and produce an image.

    The embedding is cached in ``filename + ".dat"``; the figure is written
    to ``filename`` + ``.pdf`` / ``.eps`` / ``.png``.

    :param m: precomputed distance matrix
    :param filename: path prefix for the cache and the image files
    :param labels: optional sequence; when given, the labels and coordinates
        at the hard-coded indices 0, 2, 50 and 52 are printed
    :param colour: not used by this function
    :return: None (also when MDS fails with a ValueError, in which case a
        message is printed and no image is written)
    """

    mds_data_filename = filename + ".dat"

    try:
        # if we've previously computed, load it
        p = np.genfromtxt(mds_data_filename)
    except Exception:
        # else, compute it now (and save).  Narrowed from a bare except so
        # that KeyboardInterrupt/SystemExit are no longer swallowed.

        # Construct MDS object with various defaults including 2d
        mds = MDS(dissimilarity="precomputed")
        # Fit
        try:
            f = mds.fit(m)
        except ValueError as e:
            print("Can't run MDS for " + filename + ": " + str(e))
            return

        # Get the embedding in 2d space
        p = f.embedding_

        # save
        np.savetxt(mds_data_filename, p)

    # Make an image
    fig, ax = plt.subplots(figsize=(5, 5))
    # x- and y-coordinates
    ax.set_aspect('equal')

    ax.scatter(p[:,0], p[:,1], edgecolors='none')

    # `is not None`: comparing an array of labels with != is elementwise and
    # cannot be used as a condition
    if labels is not None:
        # print() calls replace Python 2 print statements; the %-format
        # reproduces the space-separated output on both Python 2 and 3
        print(filename)
        # hard-coded for GP depth-2
        indices = [0, 2, 50, 52]
        for i in indices:
            print("%s %s %s" % (labels[i], p[i,0], p[i,1]))
            # can print some labels directly on the graph as follows,
            # but maybe it's better done manually, after printing
            # their locations to terminal?

            # plt.text(p[i,0], p[i,1], labels[i], style='italic',
            #         bbox={'facecolor':'red', 'alpha':0.5, 'pad':10})

    fig.savefig(filename + ".pdf")
    fig.savefig(filename + ".eps")
    fig.savefig(filename + ".png")
    plt.close(fig)
def mds_bib_data_with_sklearn(fname):
    """Plot a Euclidean 2-D MDS of the year-by-term matrix and save it.

    The matrix and year labels come from ``get_year_by_term_mat`` applied to
    ``get_bib_data()``; each year is annotated at its embedded position.

    :param fname: path the figure is saved to
    """
    bib_data = get_bib_data()
    mat, years, term_list, years_cnt = get_year_by_term_mat(bib_data, freq=5)

    # Euclidean-based MDS
    embedder = MDS(n_components=2, dissimilarity='euclidean')
    coords = embedder.fit_transform(mat)

    fig = plt.figure()
    fig.clf()
    for year, (x, y) in zip(years, coords[:, :2]):
        plt.annotate(year, xy=(x, y))

    plt.savefig(fname)
Example #31
0
# Script fragment: the names used below (datamat2, results2/3, means2/3,
# kmeanSums1/2/3, cosine_distance, spatial) are defined earlier in the
# original script, outside this excerpt.

# For each of the 6 cluster ids, add up the squared cosine distance between
# every row whose entry in results2 equals that id and means2[id].
for kmean in range(6):
    for x in range(len(datamat2)):
        if results2[x] == kmean:
            kmeanSums2[kmean] += cosine_distance(datamat2[x], means2[kmean])**2

# Same accumulation with the Jaccard distance, results3 and means3.
for kmean in range(6):
    for x in range(len(datamat2)):
        if results3[x] == kmean:
            kmeanSums3[kmean] += spatial.distance.jaccard(
                datamat2[x], means3[kmean])**2

# Totals over all clusters for the three runs.
print(sum(kmeanSums1))
print(sum(kmeanSums2))
print(sum(kmeanSums3))

# NOTE(review): this instance is created and immediately discarded.
MDS()

# Two components; the data itself is given, so MDS uses Euclidean distances.
mds = MDS(n_components=2, dissimilarity="euclidean", random_state=1)

#mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(datamat2)
#pos = mds.fit_transform(cosmat)
#pos = mds.fit_transform(jacmat)

# first and second embedding coordinate of every row
xs, ys = pos[:, 0], pos[:, 1]

cluster_colors = {
    0: '#1b9e77',
    1: '#d95f02',
    2: '#7570b3',
Example #32
0
import numpy as np
from scipy import sparse
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS

# get num comics
from my_utils import get_latest_comic_num

if __name__ == "__main__":

    # Load the stored sparse TF-IDF vectors.
    tfidf_vectors = sparse.load_npz("../data/text_vectors/tfidf_vectors.npz")

    # Pairwise cosine similarity of all vectors, turned into a dissimilarity.
    cosine_array = cosine_similarity(tfidf_vectors, tfidf_vectors)
    dissimilarities = 1 - cosine_array

    # compute the embedding (MDS defaults, on the precomputed dissimilarities)
    embedded = MDS(dissimilarity='precomputed').fit_transform(dissimilarities)

    # save coord
    np.save("../data/document_relations/mds.npy", embedded)
    print("MDS SHAPE: ", embedded.shape)

    # save coords as dataframe, indexed by the serial numbers "1" .. latest
    # NOTE(review): this requires exactly one embedded row per serial number
    # -- confirm the vectors file covers every comic.
    num_comics = get_latest_comic_num() + 1
    comic_serial_numbers = [str(i) for i in range(1, num_comics)]
    df = pd.DataFrame(embedded, columns=['x', 'y'], index=comic_serial_numbers)
    pd.to_pickle(df, "../data/document_relations/mds_df.pkl")
    # Fragment of a larger routine: `j`, `metric`, `nDifferentDataSet`,
    # `stress` and the mean*/std* arrays are defined outside this excerpt
    # (presumably `j` indexes the metric being evaluated -- TODO confirm).
    # One estimated cluster count per generated data set.
    clusterCounts = np.empty((nDifferentDataSet, ))

    dist = DistanceMetric.get_metric(metric)
    print("MDS Metric: {}".format(metric))

    for i in range(nDifferentDataSet):

        # generate data set number i and its pairwise distance matrix
        data = generateOneClusterData(DEFAULT_NUMBER_OF_FEATURES,
                                      DEFAULT_NUMBER_OF_RECORDS_PER_CLASS,
                                      DEFAULT_FEATURE_MEAN_RANGE,
                                      i,
                                      distribution="normal")
        precomputedMetricData = dist.pairwise(data)

        # embed the precomputed distances into 8 dimensions
        mds = MDS(n_components=8, n_jobs=-1, dissimilarity="precomputed")
        mdsData = mds.fit_transform(precomputedMetricData)

        # estimate the cluster count of the embedded data, trying k = 1..9
        optimalK = OptimalK(parallel_backend='joblib', n_jobs=-1)
        clusterCount = optimalK(mdsData,
                                n_refs=3,
                                cluster_array=np.arange(1, 10))
        clusterCounts[i] = clusterCount
        stress[i, j] = mds.stress_

    # summary statistics over all data sets, stored at position j
    meanClusterCount[j] = np.mean(clusterCounts)
    stdClusterCount[j] = np.std(clusterCounts)

    meanStress[j] = np.mean(stress[:, j])
    stdStress[j] = np.std(stress[:, j])
Example #34
0
    # Fragment of a larger routine: COV_X, index_PD, Columns_PD, p and math
    # are defined outside this excerpt.
    COV_X_PD = pd.DataFrame(data=COV_X, index=index_PD, columns=Columns_PD)
    Mu = np.repeat(0.3, p)

    #%% Init MDS
    import Toolbox
    from Toolbox import two_d_eq, Assign_features_to_pixels, Random_Image_Gen, REFINED_Im_Gen
    from sklearn.manifold import MDS
    from sklearn.metrics.pairwise import euclidean_distances
    import pickle

    #%% MDS
    nn = math.ceil(np.sqrt(p))  # Image dimension
    Nn = p  # Number of features
    # NOTE(review): this only aliases COV_X; no distance is computed and
    # nothing is symmetrized here.
    Euc_Dist = COV_X

    # MDS with default dissimilarity, i.e. COV_X is treated as data rows
    # rather than as a precomputed distance matrix.
    embedding = MDS(
        n_components=2)  # Reduce the dimensionality by MDS into 2 components
    mds_xy = embedding.fit_transform(COV_X)  # Apply MDS

    print(">>>> MDS dimensionality reduction is done")

    eq_xy = two_d_eq(mds_xy, Nn)
    Img = Assign_features_to_pixels(
        eq_xy, nn,
        verbose=1)  # Img is the non-overlapping coordinates generated by MDS

    Desc = Columns_PD  # Drug descriptors name
    Dist = pd.DataFrame(
        data=Euc_Dist, columns=Desc, index=Desc
    )  # Euc_Dist (= COV_X) labelled with the descriptor names on both axes
    data = (Desc, Dist, Img)  # Preparing the hill climbing inputs
Example #35
0
#plt.scatter(X[:, 0], X[:, 1], **colorize)
#plt.axis('equal');

#MDS
def rotate(X, angle):
    """Multiply the rows of ``X`` by the 2x2 rotation matrix for ``angle`` degrees."""
    radians = np.deg2rad(angle)
    cos_t, sin_t = np.cos(radians), np.sin(radians)
    rotation = [[cos_t, sin_t],
                [-sin_t, cos_t]]
    return np.dot(X, rotation)
    
# Notebook-style script: X, colorize and plt come from earlier cells that
# are not part of this excerpt.

# Rotated (20 degrees) and shifted (+5) copy of X.
X2 = rotate(X, 20) + 5
#plt.scatter(X2[:, 0], X2[:, 1], **colorize)
#plt.axis('equal')

from sklearn.metrics import pairwise_distances
# Pairwise distance matrix of the original points.
D = pairwise_distances(X)
# Bare expression: only displays a value in an interactive session.
D.shape

#plt.imshow(D, zorder=2, cmap='Blues', interpolation='nearest')
#%plt.colorbar();

# Pairwise distance matrix of the rotated/shifted points.
D2 = pairwise_distances(X2)
#plt.imshow(D, zorder=2, cmap='Greens', interpolation='nearest')
#plt.colorbar();
# Compares the two distance matrices; the result is not stored or printed.
np.allclose(D, D2)

from sklearn.manifold import MDS
# Recover a 2-D configuration from the distance matrix alone.
model = MDS(n_components=2, dissimilarity='precomputed', random_state=1)
out = model.fit_transform(D)
plt.scatter(out[:, 0], out[:, 1], **colorize)
plt.axis('equal');
Example #36
0
    count = count + 1

# Mean Centering
n1 = np.ones([len(K), len(K)]) * 1.0 / len(K)
K2 = K - np.dot(n1, K) - np.dot(K, n1) + np.linalg.multi_dot([n1, K, n1])
[t1, t2] = np.linalg.eig(K2)
print(np.min(np.real(t1)))
K3 = K2 - np.min(np.real(t1)) * np.eye(len(K2))
[t1, t2] = np.linalg.eig(K3)
print(np.min(np.real(t1)))
U = np.real(np.matmul(t2, np.diag(np.sqrt(t1))))

# Apply MDS
q = 50
Ksym = (K + np.transpose(K)) / 2
mds = MDS(n_components=q, metric=True, dissimilarity='precomputed')
U = mds.fit_transform(Ksym)

#pca = PCA(n_components=q)
#U = U[:,:q]

#For Drugs
temp = (compSim[trainInds, :])[:, trainInds] + 0.1 * np.eye(nd)
A = np.linalg.multi_dot([
    np.linalg.inv(temp), U[:nd, :],
    np.transpose(U[:nd, :]),
    np.linalg.inv(temp)
])  #A = UU^T
[t1, t2] = np.linalg.eig(A)
W = np.real(np.matmul(t2, np.diag(np.sqrt(t1))))[:, :q]
'''
Example #37
0
    # Dimensionality reduction: (1) projection - principal component analysis,
    # (2) manifold learning.
    # Manifold methods:
    # 1) LLE (Locally Linear Embedding):
    #   Measures how each training sample is linearly related to its
    #   nearest neighbours.
    X, y = make_swiss_roll(n_samples=1000, noise=0.2, random_state=41)
    lle = LocallyLinearEmbedding(n_neighbors=10,
                                 n_components=2,
                                 random_state=1)
    # X_reduced = lle.fit_transform(X)
    # plt.scatter(X_reduced[:, 0], X_reduced[:, 1],
    #             c=y, cmap=plt.cm.hot)
    # plt.show()

    # MDS (Multi-Dimensional Scaling):
    #   Reduces dimensionality while preserving the distances between samples.
    mds = MDS(n_components=2, random_state=1)

    # Isomap (Isometric Mapping):
    #   Builds a graph connecting each sample to its nearest neighbours.
    #   Reduces dimensionality so that graph (geodesic) distances are preserved.
    isomap = Isomap(n_components=2)

    # t-SNE (t-distributed Stochastic Neighbor Embedding):
    #   Reduces dimensionality so that similar samples stay close together and
    #   dissimilar samples end up far apart.
    tsne = TSNE(n_components=2, random_state=1)

    # Titles and reducers are paired by position, so the two lists must stay
    # in the same order.
    titles = ['LLE', 'MDS', 'Isomap', 't-SNE']
    manifold_reducers = [lle, mds, isomap, tsne]
    for title, reducer in zip(titles, manifold_reducers):
        plt.title(title)  # reducer.__class__.__name__
        # Reduce the original data with the current manifold method
Example #38
0
    # PCA + LLE time and visualizations
    pca_lle = Pipeline([
        ("pca", PCA(n_components=0.95, random_state=42)),
        ("lle", LocallyLinearEmbedding(n_components=2, random_state=42)),
    ])
    t0 = time.time()
    X_pca_lle_reduced = pca_lle.fit_transform(X)
    t1 = time.time()
    print("PCA+LLE took {:.1f}s.".format(t1 - t0))
    plot_digits(X_pca_lle_reduced, y)

    # MDS time and visualizations
    m = 2000
    t0 = time.time()
    X_mds_reduced = MDS(n_components=2, random_state=42).fit_transform(X[:m])
    t1 = time.time()
    print("MDS took {:.1f}s (on just 2,000 MNIST images instead of 10,000).".format(t1 - t0))
    plot_digits(X_mds_reduced, y[:m])

    # PCA + MDS time and visualizations
    pca_mds = Pipeline([
        ("pca", PCA(n_components=0.95, random_state=42)),
        ("mds", MDS(n_components=2, random_state=42)),
    ])
    t0 = time.time()
    X_pca_mds_reduced = pca_mds.fit_transform(X[:2000])
    t1 = time.time()
    print("PCA+MDS took {:.1f}s (on 2,000 MNIST images).".format(t1 - t0))
    plot_digits(X_pca_mds_reduced, y[:2000])
Example #39
0
    representation_func=lambda m: pd.Series(np.random.random(100)),
    metadata="""Uniformly distributed random feature vector of length 100"""
    """implemented using <a href="http://www.numpy.org">numpy</a> <a href="http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.random.html#numpy.random.random">random</a> module"""
)

# Representation types offered by default, in display order.
DEFAULT_REPRESENTATION_TYPES = [morg2, targets, random]

# Each ReductionMethod bundles a display name, an estimator instance and an
# HTML description (the metadata strings below are shown verbatim).

# PCA projection to two components.
pca = ReductionMethod(
    name='PCA',
    model=PCA(n_components=2),
    metadata=
    """<a href="http://en.wikipedia.org/wiki/Principal_component_analysis">Principal component analysis</a> implemented in <a href="http://scikit-learn.org/stable/" target="_blank">scikit-learn</a>\n"""
    """<br/>Default parameters used.""")

# MDS with the estimator's default arguments.
mds = ReductionMethod(
    name='MDS',
    model=MDS(),
    metadata=
    """<a href="http://en.wikipedia.org/wiki/Multidimensional_scaling" target="_blank">Multidimensional Scaling</a> implemented in <a href="http://scikit-learn.org/stable/" target="_blank">scikit-learn</a>"""
    """<br/>Default parameters used.""")

# t-SNE with perplexity 10.
# NOTE(review): the metadata advertises "theta=0", but only ``perplexity`` is
# passed to TSNE here - confirm the description matches the estimator used.
tsne = ReductionMethod(
    name='t-SNE',
    model=TSNE(perplexity=10),
    metadata=
    """<a href="http://lvdmaaten.github.io/tsne/">Student's t-distributed stochastic neighbour embedding</a>, """
    """implemented according to <a href="http://lvdmaaten.github.io/publications/papers/JMLR_2008.pdf">van der Maartin et al. 2008</a>\n"""
    """<br/>Parameters used: Perplexity = 10, theta=0""")

# Reduction methods offered by default, in display order.
DEFAULT_REDUCTION_METHODS = [pca, mds, tsne]
Example #40
0
                                          axis=1).min()
        if closest_distance > min_distance:
            neighbors = np.r_[neighbors, [image_coord]]
            if images is None:
                plt.text(image_coord[0],
                         image_coord[1],
                         str(int(y[index])),
                         color=cmap(y[index] / 9),
                         fontdict={
                             "weight": "bold",
                             "size": 16
                         })
            else:
                image = images[index].reshape(28, 28)
                imagebox = AnnotationBbox(OffsetImage(image, cmap="binary"),
                                          image_coord)
                ax.add_artist(imagebox)


from sklearn.manifold import MDS
import time

# Time a 2-component MDS embedding of the MNIST subset and plot the result.
startTime = time.time()
X_mds_reduced = MDS(n_components=2).fit_transform(x_subset)
endTime = time.time()
# Elapsed time is end minus start; the reverse order reported a negative
# duration.
print(
    "MDS took {:.1f}s (on just 2,000 MNIST images instead of 10,000).".format(
        endTime - startTime))
plot_digits(X_mds_reduced, y_subset)
plt.show()
Example #41
0
def plot_repr_trajectories(res,
                           snap_type,
                           dims=2,
                           title_label='',
                           epochs_to_mark=()):
    """
    Embed the stored representation distances with MDS and draw, for every
    item or context, the path its representation follows over training.
    Pass dims=3 for a 3D plot. The first snapshot is marked in green, the last
    in red, and any snapshot whose epoch is listed in epochs_to_mark in black.
    Returns (figure, axes).
    """
    mds = MDS(n_components=dims, dissimilarity='precomputed')
    embedded = mds.fit_transform(res['repr_dists'][snap_type]['all'])

    # (snapshot, domain, input, dim) -> (domain, input, dim, snapshot) so that
    # unpacking one input yields one coordinate series per dimension
    snap_epochs = res['snap_epochs']
    net_params = res['net_params']
    n_domains = net_params['n_train_domains']
    embedded = embedded.reshape((len(snap_epochs), n_domains, -1, dims))
    embedded = embedded.transpose((1, 2, 3, 0))

    fig = plt.figure()
    projection = '3d' if dims == 3 else None
    ax = fig.add_subplot(111, projection=projection)

    names = _get_names_for_snapshots(snap_type, **net_params)

    if 'item' in snap_type:
        # the first cluster specification present in the parameters wins
        cluster_key = next(
            (key for key in ('item_clusters', 'cluster_info')
             if key in net_params), None)
        if cluster_key is None:
            groups = dd.item_group()
        else:
            groups = dd.item_group(clusters=net_params[cluster_key])
    elif 'context' in snap_type:
        # No "groups," but use symbols for individual contexts (per domain) instead.
        groups = np.arange(4)
    else:
        raise ValueError('Unrecognized snapshot type')

    names = np.array(names).reshape((n_domains, -1))

    domain_colors = dd.get_domain_colors()
    markers = ['o', 's', '*', '^']

    def trajectories():
        """Yield (coords, label, marker, color) for every plotted input."""
        for dom_reprs, dom_labels, dom_color in zip(embedded, names,
                                                    domain_colors):
            for coords, label, group in zip(dom_reprs, dom_labels, groups):
                yield coords, label, markers[group], dom_color

    def mark_epoch(coords, marker, fill, epoch_ind, bordercolor):
        """Draw one bordered marker at the given snapshot index."""
        point = coords[:, epoch_ind]
        ax.plot(*point, marker, markersize=8, color=bordercolor)
        ax.plot(*point, marker, markersize=5, color=fill)

    # first pass: the trajectories themselves
    for coords, label, marker, dom_color in trajectories():
        ax.plot(*coords,
                marker + '-',
                label=label,
                markersize=4,
                color=dom_color,
                linewidth=0.5)

    # second pass: start, end and requested epoch markers on top of the lines
    if len(epochs_to_mark) > 0:
        marked_inds = [snap_epochs.index(epoch) for epoch in epochs_to_mark
                       if epoch in snap_epochs]
    else:
        marked_inds = []

    for coords, _label, marker, dom_color in trajectories():
        mark_epoch(coords, marker, dom_color, 0, 'g')
        mark_epoch(coords, marker, dom_color, -1, 'r')
        for ind in marked_inds:
            mark_epoch(coords, marker, dom_color, ind, 'k')

    title = (f'{title_label} {snap_type} representations over training\n' +
             'color = domain, marker = type within domain')
    ax.set_title(title)

    return fig, ax
Example #42
0
    return pair.distance


# Distances between the first 100 parts. Only the upper triangle (diagonal
# included) is computed here; the matrix is mirrored further down.
M = np.zeros((100, 100))
for i in range(100):
    print(i, end=" ")
    for j in range(i, 100):
        M[i, j] = distance(getpart(i), getpart(j))

# Cache the (still upper-triangular) matrix. The context manager makes sure
# the file is flushed and closed instead of leaking the handle.
with open("NCtransMat.p", "wb") as matrix_file:
    pickle.dump(M, matrix_file)

#plot
# Mirror the upper triangle into the lower one.
# NOTE(review): this also doubles the diagonal, which is harmless only if
# distance(x, x) is 0 - confirm.
M = M + M.T
from sklearn.manifold import MDS
mds = MDS(n_components=2,
          dissimilarity='precomputed',
          max_iter=50000,
          n_init=100)
pos = mds.fit(M).embedding_
X = [point[0] for point in pos[:100]]
Y = [point[1] for point in pos[:100]]

plt.scatter(X, Y)
# Highlight and label the first three parts.
plt.scatter(X[:3], Y[:3], color='red')
plt.annotate("judge", (X[0], Y[0]))
plt.annotate("2012", (X[1], Y[1]))
plt.annotate("2016", (X[2], Y[2]))
plt.savefig("NCtransplot.png")
Example #43
0
# Histograms of selected columns, then of every column, of the group under
# analysis.
analysis_group.CALC.hist()
plt.show()

analysis_group.NObeyesdad.hist()
plt.show()

analysis_group.Age.hist()
plt.show()

analysis_group.hist()
plt.show()

from sklearn.manifold import MDS

# 2-D MDS embedding of the dummy-encoded data.
embedding = MDS(n_components=2,verbose=1,max_iter=100,n_init=2)

data_emb = embedding.fit_transform(data_dummies)

# One scatter call per cluster so every cluster gets its own legend entry and
# its own colour from the default property cycle. No ``cmap`` is passed:
# without ``c`` matplotlib ignores it.
for cluster in data['clusters'].unique():
    in_cluster = (data['clusters'] == cluster).values
    cluster_points = data_emb[in_cluster]
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1],
                label='Cluster'+str(cluster))
plt.legend()
plt.show()

from sklearn.decomposition import PCA

# PCA projection of the same dummy-encoded data onto 38 components.
pca = PCA(n_components=38)
data_pca = pca.fit_transform(data_dummies)
Example #44
0
                temp.append('nan')
            mdsData.iloc[i]=temp

# Keep only the rows whose ID is not null (original note: drop the rows whose
# ID is 'nan').
mdsData = mdsData.loc[pd.isnull(mdsData["ID"]) == False]
# The string below reads: "Fill missing values with df.interpolate".
'''
利用df.interpolate方法填充缺失值
'''
# Only the six "Dac_*" columns are used for the embedding.
mdsfillnan = mdsData[["Dac_23","Dac_34","Dac_45","Dac_56","Dac_67","Dac_78"]]

#mdsfillnan = mdsData[["Acc_23","Acc_34","Acc_45","Acc_56","Acc_67","Acc_78",
#                   "Dac_23","Dac_34","Dac_45","Dac_56","Dac_67","Dac_78"]]

# Coerce every column to numeric; values that cannot be parsed become NaN.
mdsfillnan = mdsfillnan.apply(lambda x: pd.to_numeric(x, errors='coerce'))
#a = a.interpolate(method='spline', order=2)
# Interpolate down each column (axis=0) in both directions, filling at most
# ``testNum`` consecutive NaNs.
mdsfillnan = mdsfillnan.interpolate(method='values', axis=0,
                                    limit=testNum, limit_direction='both')
# The string below is just a section label for the scikit-learn part.
'''
sklearn
'''
# MDS with default arguments, fitted on the interpolated columns.
mds = MDS()
mds.fit(mdsfillnan)
mdsResult = mds.embedding_
# Only the first 128 embedded rows are plotted.
plt.scatter(mdsResult[0:128,0],mdsResult[0:128,1],color='turquoise')

# =============================================================================
# Use KMeans clustering to single out high-risk drivers with abnormal driving behaviour
# =============================================================================



Example #45
0
    X3t = emb.transform(X3)

    listDistance = distancias(X2, X3t)
    listClase = clasificador(yy, X3t, listDistance)

    graphPoints(X3t, ntoc(int(listClase)), False, False)
    graphPoints(X, listClasificador)
    graphPoints(X2, yy, False)
    plt.show()


def llamarPronostico():
    """Call ``pronostico`` with the module-level ``X2``, ``yy``, ``X`` and ``listClasificador``."""
    pronostico(X2, yy, X, listClasificador)


# Load the data set; every column of the CSV is used as a feature.
df = pd.read_csv("data.csv")
data = df.values
X = data[:, 0:]
# Placeholder labels: one zero per sample.
y = np.zeros((len(X)))

# Candidate 2-D reducers. Only ``emb`` (Isomap) is fitted in this section;
# NOTE(review): ``pca`` and ``emb1`` are not used in the lines visible here -
# confirm whether ``reduccion`` or other code relies on them.
pca = PCA(n_components=2)
emb1 = MDS(n_components=2)
emb = Isomap(n_components=2)
# Fitted on the raw features, before X is replaced by its reduced version.
emb.fit(X)
X = reduccion(X)

# Choose the number of clusters, then run the 2-D k-means step.
obtenerK(X)
K = silhouette(X)
#print("K = ",K)
# KMeans2D returns four values; note that X is rebound to the third one.
[X2, yy, X, listClasificador] = KMeans2D(K)
Example #46
0
def cluster_us(movie_list=None):
    """Cluster movies by their summary text, report and plot the clusters.

    The summaries are TF-IDF vectorised and grouped into five k-means
    clusters. The per-movie table is written to ``finished_output.csv``, the
    top terms and titles of every cluster are printed and written to the file
    ``nimni``, and a 2-D MDS projection of the cosine distances is sent to
    plotly through ``chart_studio``.

    Args:
        movie_list: non-empty sequence of movie objects exposing ``summary``,
            ``name``, ``metascore``, ``genre``, ``year``,
            ``competition_category``, ``languages`` and ``countries``.

    Raises:
        ValueError: if ``movie_list`` is ``None`` or empty.
    """
    if not movie_list:
        raise ValueError("cluster_us requires a non-empty movie_list")

    summary_list = []
    films = []
    ranks = []
    genres = []
    years = []
    foreigns = []
    languages = []
    countries = []

    for movie in movie_list:
        summary_list.append(movie.summary)
        films.append(movie.name)
        ranks.append(movie.metascore)
        genres.append(str(movie.genre))
        years.append(int(movie.year))
        # 1 for entries in the foreign-language competition category, else 0
        foreigns.append(1 if movie.competition_category == 'FOREIGN LANGUAGE FILM' else 0)
        languages.append(movie.languages)
        countries.append(movie.countries)

    # Every stemmed token of every summary; used below to look terms up.
    totalvocab_stemmed = []
    for summary in summary_list:
        totalvocab_stemmed.extend(tokenize_and_stem(summary))

    # Vectorizer parameters (an earlier, immediately overwritten
    # configuration was dropped).
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, tokenizer=tokenize_and_stem,
                                       ngram_range=(1, 1), max_df=0.8, min_df=0.01)

    tfidf_matrix = tfidf_vectorizer.fit_transform(summary_list)  # fit the vectorizer to the summaries

    print(tfidf_matrix.shape)
    # ``get_feature_names`` was removed in scikit-learn 1.2; use its
    # replacement when available and fall back on older releases.
    if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
        terms = tfidf_vectorizer.get_feature_names_out()
    else:
        terms = tfidf_vectorizer.get_feature_names()

    dist = 1 - cosine_similarity(tfidf_matrix)

    num_clusters = 5

    km = KMeans(n_clusters=num_clusters)
    km.fit(tfidf_matrix)

    # uncomment the below to save / reload the fitted model
    # joblib.dump(km, 'doc_cluster.pkl')
    # km = joblib.load('doc_cluster.pkl')
    clusters = km.labels_.tolist()
    vocab_frame = pd.DataFrame({'words': totalvocab_stemmed}, index=totalvocab_stemmed)

    film_columns = {'title': films, 'rank': ranks, 'synopsis': summary_list, 'cluster': clusters, 'genre': genres,
                    'year': years, 'foreign': foreigns, 'language': languages, 'country': countries}

    # Indexed by cluster label so ``frame.loc[i]`` selects one cluster.
    frame = pd.DataFrame(film_columns, index=clusters,
                         columns=['title', 'rank', 'cluster', 'genre', 'year', 'foreign', 'language', 'country'])
    print(frame['cluster'].value_counts())
    grouped = frame['rank'].groupby(frame['cluster'])  # groupby cluster for aggregation purposes

    print(grouped.mean())  # average rank per cluster

    print("Top terms per cluster:")
    frame.to_csv("finished_output.csv")
    print()
    # feature indices of every centroid, largest weight first
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    with open("nimni", "w", encoding="utf-8") as f:
        for i in range(num_clusters):
            f.write("Cluster %d words:" % i)
            print("Cluster %d words:" % i, end='')

            for ind in order_centroids[i, :5]:  # five top terms per cluster
                print(terms[ind])
                # Write the word itself: formatting encoded bytes with %s
                # would put their b'...' repr into the text file.
                f.write(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0])
            print()  # add whitespace
            print()  # add whitespace

            print("Cluster %d titles:" % i, end='')
            f.write("Cluster %d titles:" % i)
            titles = frame.loc[i]['title']
            f.write(str(titles))

            print(titles)
            print()  # add whitespace
            print()  # add whitespace

        print()
        print()

    # Two components because the points are plotted in a plane;
    # "precomputed" because a distance matrix is supplied; ``random_state``
    # is fixed so the plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

    pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

    xs, ys = pos[:, 0], pos[:, 1]
    frame = pd.DataFrame(dict(x=xs, y=ys, title=frame['title'], foreign=foreigns, cluster=clusters))
    fig = px.scatter(frame, x='x', y='y', color='cluster', symbol='foreign',hover_name='title')
    chart_studio.plotly.plot(fig, filename='interactive_clustering_mds', auto_open=True)
def make_map():
    """Build the SynergyMap for the DREAM drug combination challenge data.

    Declares the representation, reduction, activity and synergy types, reads
    ``compounds.smiles`` and ``combinations.csv`` from ``DIRNAME`` and returns
    the assembled ``SynergyMap``.
    """

    # --- representation types -------------------------------------------
    morgan_info = (
        """Hashed Circular fingerprint generated by the Morgan algorithm, """
        """implemented in <a href="http://www.rdkit.org">RDKit</a>. <br/>"""
        """Parameters used: Radius = 2, Bit length = 2048""")
    morgan_repr = RepresentationType(
        name='morg2',
        representation_func=skchemize(morg, radius=2, nBits=2048),
        metadata=morgan_info)

    target_info = (
        """Bayes affinity fingerprint for 1080 human targets, produced """
        """using the <a href="https://github.com/lhm30/PIDGIN">PIDGIN (Prediction of targets IncluDinG INactives)</a>"""
        """Target Prediction algorithm, implemented in <a href="https://github.com/richlewis42/scikit-chem">scikit-chem</a>."""
    )
    target_repr = RepresentationType(
        name='targets',
        representation_func=PIDGIN(),
        metadata=target_info)

    random_info = (
        """Uniformly distributed random feature vector of length 100"""
        """implemented using <a href="http://www.numpy.org">numpy</a> <a href="http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.random.html#numpy.random.random">random</a> module"""
    )
    random_repr = RepresentationType(
        name='random',
        representation_func=lambda m: pd.Series(np.random.random(100)),
        metadata=random_info)

    # --- reduction types ---------------------------------------------------
    pca_info = (
        """<a href="http://en.wikipedia.org/wiki/Principal_component_analysis">Principal component analysis</a> implemented in <a href="http://scikit-learn.org/stable/" target="_blank">scikit-learn</a>\n"""
        """<br/>Default parameters used.""")
    pca_method = ReductionMethod(
        name='PCA',
        model=PCA(n_components=2),
        metadata=pca_info)

    mds_info = (
        """<a href="http://en.wikipedia.org/wiki/Multidimensional_scaling" target="_blank">Multidimensional Scaling</a> implemented in <a href="http://scikit-learn.org/stable/" target="_blank">scikit-learn</a>"""
        """<br/>Default parameters used.""")
    mds_method = ReductionMethod(
        name='MDS',
        model=MDS(),
        metadata=mds_info)

    tsne_info = (
        """<a href="http://lvdmaaten.github.io/tsne/">Student's t-distributed stochastic neighbour embedding</a>, """
        """implemented according to <a href="http://lvdmaaten.github.io/publications/papers/JMLR_2008.pdf">van der Maartin et al. 2008</a>\n"""
        """<br/>Parameters used: Perplexity = 1, theta=0""")
    tsne_method = ReductionMethod(
        name='t-SNE',
        model=TSNE(perplexity=1),
        metadata=tsne_info)

    # --- activity types ----------------------------------------------------
    pic20_info = (
        """negative based-10 logarithm of the <a href="http://en.wikipedia.org/wiki/IC50">IC20</a>, the concentation of"""
        """compound required for 20% inhibition of growth of Lymphoma cells""")
    pic20_activity = ActivityType(name='pIC20', metadata=pic20_info)

    ic20_info = (
        """<a href="http://en.wikipedia.org/wiki/IC50">IC20</a>, the concentation of"""
        """compound required for 20% inhibition of growth of Lymphoma cells""")
    ic20_activity = ActivityType(name='IC20', metadata=ic20_info)

    # --- synergy types -----------------------------------------------------
    bliss_info = (
        """Difference in observed vs expected activity of the component compounds,"""
        """each at the IC20 concentration (when known) assuming the <a href="http://doi.wiley.com/10.1111/j.1744-7348.1939.tb06990.x">Bliss Independence model</a>"""
    )
    bliss_synergy = SynergyType(name='ExcessOverBliss', metadata=bliss_info)

    # --- data --------------------------------------------------------------
    compounds_path = os.path.join(DIRNAME, 'compounds.smiles')
    compounds = skc.read_smiles(compounds_path,
                                name_column=1,
                                title_line=True)
    # derived activity column: pIC20 = -log10(IC20)
    compounds['pIC20'] = -np.log10(compounds['IC20'])

    combinations_path = os.path.join(DIRNAME, 'combinations.csv')
    combinations = pd.read_csv(combinations_path).set_index('id')

    return SynergyMap(
        compound_df=compounds,
        combination_df=combinations,
        representation_types=[morgan_repr, target_repr, random_repr],
        reduction_types=[pca_method, mds_method, tsne_method],
        activity_types=[pic20_activity, ic20_activity],
        synergy_types=[bliss_synergy],
        metadata='DREAM Drug Combination Challenge Data')
                text += s + ' '
        text = text.translate(table_p)
        text = nltk.word_tokenize(text)
        for word in text:
            if word not in stopwords and len(word) > 1:
                cleandoc += word + ' '
    twtall.append(cleandoc)
    cleandoc = ''
    tweetcount += 1
    twtname.append('{}-{}'.format(tweetcount * 100 - 99, tweetcount * 100))

# TF-IDF matrix with one row per cleaned tweet batch in ``twtall``.
vectorizer = TfidfVectorizer()
twt_matrix = vectorizer.fit_transform(twtall)

# Pairwise cosine distances, embedded in 2-D with MDS. 'precomputed' tells
# MDS the input is already a distance matrix; random_state fixes the layout.
cos_dist = cosine_distances(twt_matrix)
mds = MDS(n_components=2, dissimilarity='precomputed', random_state=1)
pos = mds.fit_transform(cos_dist)
xs, ys = pos[:, 0], pos[:, 1]

# One scatter call per point, each labelled with its tweet range name.
for x, y, name in zip(xs, ys, twtname):
    plt.scatter(x, y)
    plt.text(x, y, name)

plt.title('tweet MDS')
# NOTE(review): the MDS plot is saved under the name "factor_analysis.png" -
# confirm the file name is intended.
plt.savefig(path.join("factor_analysis.png"), dpi=600)
plt.show()

# Ward linkage on the same distances, drawn as a left-oriented dendrogram.
# NOTE(review): ``cos_dist`` is the square distance matrix - confirm this is
# the input form ``ward`` expects.
linkage_matrix = ward(cos_dist)
dendrogram(linkage_matrix, orientation='left', labels=twtname)
# NOTE(review): ``plt1`` holds whatever tight_layout() returns and is not
# used in the lines visible here.
plt1 = plt.tight_layout()
# reference: https://stackoverflow.com/questions/9622163/save-plot-to-image-file-instead-of-displaying-it-using-matplotlib
Example #49
0
        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # Multi-dimensional Scaling
    from sklearn.manifold import MDS
    MDS()

    # two components as we're plotting points in a two-dimensional plane
    # "precomputed" because we provide a distance matrix
    # we will also specify `random_state` so the plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(dist)  # shape (n_components, n_samples)
    xs, ys = pos[:, 0], pos[:, 1]

    # 2nd Plot showing the actual clusters formed
    colors = cm.spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(pos[:, 0],
                pos[:, 1],
                marker='.',
                s=120,
                lw=0,
Example #50
0
def img(start, end, investement_type, sharpe_ratio, std, beta, treynor_ratio, revenue, btest_time, money, buy_ratio, strategy, frequency):
    """Back-test a four-fund selection month by month and build the plots.

    For every month between ``start`` and ``end`` (inclusive) the function
    loads that month's prices, updates the holdings according to
    ``strategy``, credits interest, records the relative portfolio value and
    renders an MDS map of the selected funds among a random sample of other
    funds. It then adds a profit chart, a "distance" chart and the rounded
    summary figures to the returned dictionary.

    Args:
        start, end: first and last month of the back-test; ``year``,
            ``month``, ``strftime`` and ``relativedelta`` arithmetic are used
            on them.
        investement_type, sharpe_ratio, std, beta, treynor_ratio, revenue,
            btest_time: forwarded unchanged to ``selection``.
        money: capital. For ``strategy == 2`` it is split into equal
            instalments over the investment periods.
        buy_ratio: per-fund share of the money used when buying.
        strategy: ``0`` buys once and holds. On every ``frequency``-th month
            ``2`` invests another instalment; any other non-zero value sells
            everything and re-buys by ``buy_ratio``, and ``3`` additionally
            re-runs ``selection`` first.
        frequency: number of months between strategy actions.

    Returns:
        dict: ``response_data`` with one bokeh ``script``/``div`` pair per
        month plus the ``profit_img`` and ``distance`` charts and the summary
        values. Several summary keys (e.g. ``sharpe_ratio``) are not set in
        this function and are presumably filled by ``profit_indicator`` -
        TODO confirm.
    """
    profit = pd.DataFrame()
    # plain ``float``: the ``np.float`` alias was removed in NumPy 1.24
    hold = np.zeros((4), dtype=float)
    response_data = {}
    response_data['start'] = start.strftime('%Y-%m')
    response_data['mean_similarity'] = 0
    distance = []
    # number of months in [start, end], both ends included
    length = 12 * (end.year - start.year) + (end.month - start.month) + 1
    choose = np.asarray([" ", " ", " ", " "], dtype='<U32')
    choose = selection(start, btest_time, investement_type, 0, sharpe_ratio, std,
                       beta, treynor_ratio, revenue, choose)

    if strategy == 2:
        # one instalment per investment period
        money /= length / frequency
        total_money = money

    for i in range(length):
        # first and last day of month i as unix timestamps
        start_unix = time.mktime((start + relativedelta(months=i)).timetuple())
        end_unix = time.mktime(
            (start + relativedelta(months=i+1, days=-1)).timetuple())
        data_df = pd.read_sql(sql='select * from price where date between ? and ? order by date asc',
                              con=engine, params=[start_unix, end_unix])
        data_df = data_df.pivot(index='date', columns='fund_id', values='nav')
        # forward- then backward-fill gaps in the price table
        data_df = data_df.ffill().bfill()

        if i == 0:
            hold = (buy_ratio * money / data_df[choose].iloc[0].T).values
        elif strategy != 0 and i % frequency == frequency - 1:
            if strategy == 2:
                hold += (buy_ratio * money / data_df[choose].iloc[0].T).values
                total_money += money
            else:
                # current portfolio value, redistributed by buy_ratio
                temp = (hold * data_df[choose].iloc[0]).sum()
                if strategy == 3:
                    choose = selection(start, btest_time, investement_type, i, sharpe_ratio,
                                       std, beta, treynor_ratio, revenue, choose)
                hold = (buy_ratio * temp /
                        data_df[choose].iloc[0].T).values

        # daily portfolio value relative to the money invested so far
        if strategy == 2:
            profit = pd.concat(
                [profit, (data_df[choose] * hold).T.sum() / total_money], axis=0)
        else:
            profit = pd.concat(
                [profit, (data_df[choose] * hold).T.sum() / money], axis=0)

        # reinvest this month's interest at the month's last price
        for j, ch in enumerate(choose):
            interest = pd.read_sql(sql='select sum(interest) from interest where date between ? and ? and fund_id == ? order by date asc',
                                   con=engine, params=[start_unix, end_unix, ch])
            hold[j] += (interest * hold[j] /
                        data_df[ch].iloc[-1]).fillna(0).loc[0][0]

        if i == length-1:
            response_data['money'] = (hold * data_df.iloc[-1][choose]).sum()

        # only consumed by the disabled price_simulation call below
        price = data_df[choose].iloc[-1].mean()

        # selected funds first, then a random sample of 296 funds
        data_df = pd.concat([data_df[choose], data_df.T.sample(
            n=296).T], axis=1).T.drop_duplicates().T
        data_df = data_df.pct_change()
        data_df_std = data_df.std()
        # constant series have no defined correlation
        data_df = data_df.drop(
            data_df_std[data_df_std == 0].index.values, axis=1)
        data_df = data_df.corr()
        # map correlation in [-1, 1] to a dissimilarity in [0, 1]
        data_df = 1 - data_df * 0.5 - 0.5

        response_data['mean_similarity'] += np.square(
            data_df[choose].T[choose].sum().sum()/2)
        distance.append(np.square(data_df[choose].T[choose].sum().sum()/2))

        # the first four rows are drawn in purple, the rest in yellow
        color = np.asarray(["yellow"] * len(data_df))
        color[0:4] = "purple"
        mds = MDS(n_components=2, dissimilarity='precomputed').fit(
            data_df).embedding_
        source = ColumnDataSource(data=dict(x=mds[:, 0],
                                            y=mds[:, 1],
                                            name=data_df.index,
                                            color=color,
                                            ))
        TOOLTIPS = [
            ("fund_id", "@name"),
        ]
        p = figure(plot_width=500, plot_height=500, tooltips=TOOLTIPS,
                   title="MDS", toolbar_location=None, tools="")
        p.x_range = Range1d(-0.6, 0.6)
        p.y_range = Range1d(-0.6, 0.6)
        p.circle(x='x', y='y', color='color', size=8, source=source)
        script, div = components(p, CDN)
        response_data[(start + relativedelta(months=i)
                       ).strftime('%Y-%m')] = {'script': script, 'div': div}

    profit = profit.rename(columns={0: "profit"})
    profit["profit"] = (profit["profit"]-1)
    profit_indicator(profit, start, end, response_data)
    profit["profit"] = profit["profit"] * 100
    profit.index.name = "date"

    response_data['profit'] = profit.iloc[-1][0]
    response_data['mean_similarity'] /= length
    # price_simulation(end, price, response_data)

    # 28800 s = 8 h; presumably a UTC+8 offset - TODO confirm
    profit.index = profit.index + 28800
    profit.index = pd.to_datetime(profit.index, unit='s')
    totalStock = pd.read_csv("totalStock.csv")
    # explicit unit: pandas 2 rejects the unit-less 'datetime64' dtype
    totalStock.date = totalStock.date.astype('datetime64[ns]')
    totalStock = totalStock[(
        totalStock.date < end + relativedelta(months=1)) & (totalStock.date >= start)]
    # benchmark return in percent relative to its first row
    totalStock.profit = (
        (totalStock.profit / totalStock.iloc[0].profit) - 1) * 100
    p = figure(x_axis_type="datetime", plot_width=940,
               plot_height=300, title="Profit", toolbar_location=None, tools="")
    p.line(x='date', y='profit', line_width=3,
           source=profit, color='red', legend='Choose')
    p.add_tools(HoverTool(tooltips=[("date", "@date{%F}"), ("profit", "@profit%")],
                          formatters={'date': 'datetime', }, mode='vline'))
    p.line(x='date', y='profit', line_width=3,
           source=totalStock, color='blue', legend='Compare')
    script, div = components(p, CDN)
    response_data['profit_img'] = {'script': script, 'div': div}

    p = figure(plot_width=940, plot_height=300,
               title="Distance", toolbar_location=None, tools="")
    p.line([i+1 for i in range(len(distance))],
           distance, line_width=2)
    script, div = components(p, CDN)
    response_data['distance'] = {'script': script, 'div': div}

    # round every summary figure to three decimals
    for key in ('sharpe_ratio', 'market_sharpe', 'std', 'market_std', 'beta',
                'treynor_ratio', 'money', 'profit', 'market_revenue'):
        response_data[key] = round(response_data[key], 3)
    return response_data
Example #51
0
# --- t-SNE embedding ---
result = tsne.fit_transform(X)

# Store both embedding coordinates on the frame so seaborn can plot them.
df['D1'] = result[:, 0]
df['D2'] = result[:, 1]

plt.figure(figsize=(12, 9), dpi=300)
# One colour per distinct value of the 'Cluster9' column.
sns.scatterplot(x='D1',
                y='D2',
                hue='Cluster9',
                palette=sns.color_palette(n_colors=df['Cluster9'].nunique()),
                data=df)
# Anchor the legend just outside the right edge of the axes.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
plt.savefig('tsne.png')
# --- MDS embedding (the bare string below is only a section label) ---
''' MDS '''
from sklearn.manifold import MDS
mds = MDS(n_components=2)
result = mds.fit_transform(X)

# Embedded coordinates of the fitted model.
print(mds.embedding_)

# Final stress value reported by the fitted model.
print(mds.stress_)

# 'D1'/'D2' are overwritten with the MDS coordinates.
df['D1'] = result[:, 0]
df['D2'] = result[:, 1]

plt.figure(figsize=(12, 9), dpi=300)
sns.scatterplot(x='D1',
                y='D2',
                hue='Cluster9',
                palette=sns.color_palette(n_colors=df['Cluster9'].nunique()),
                data=df)
Example #52
0
def calculate_MDS():
    """Return the result of fitting ``MDS(2)`` on ``df``.

    ``df`` is not a parameter; it is resolved from the enclosing scope at
    call time.  The value returned is whatever ``fit_transform`` produces.
    """
    return MDS(2).fit_transform(df)
Example #53
0
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.manifold import MDS
from sklearn.metrics import silhouette_score

# Load the incident sheet.  `sheet_name` is the supported spelling of the
# keyword; the older `sheetname` is rejected by current pandas releases.
df = pd.read_excel('Cricket_inc_data_orig.xlsx', sheet_name='Test_data')

# Column values in row order: summaries, incident ids and their links.
data = df['Inc Summary'].tolist()
inc_list = df['Inc ID'].tolist()
inc_links = df['Inc Uts Link'].tolist()

print("reading finished")

# English stop words plus both capitalisations of "please".
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords.update(("please", "Please"))
Example #54
0
# Standardization
sc = StandardScaler()
X_std = sc.fit_transform(X)
# The scaler is fitted again on the training split; the test split is then
# transformed with those training statistics.
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# Training and evaluation with Logistic Regression
# One reducer per target dimensionality, 1 .. num_features - 1.
kpca_list = [
    KernelPCA(n_components=dim, kernel="rbf")
    for dim in range(1, num_features)
]
lle_list = [
    LocallyLinearEmbedding(n_components=dim)
    for dim in range(1, num_features)
]
mds_list = [MDS(n_components=dim) for dim in range(1, num_features)]
ism_list = [Isomap(n_components=dim) for dim in range(1, num_features)]
sample_dimensions = list(range(num_features))
random.shuffle(sample_dimensions)

# Store the data reduced to each dimensionality from 1 to (num_features - 1)
X_kpca_train = [kpca.fit_transform(X_train_std) for kpca in kpca_list]
X_kpca_test = [kpca.transform(X_test_std) for kpca in kpca_list]

X_lle_train = [lle.fit_transform(X_train_std) for lle in lle_list]
# NOTE(review): `n` and `t` look like aliases for numpy and time -- confirm
# against the imports, which are outside this fragment.
# Arc-cosine of c (computed earlier in the script), then converted to degrees.
An___ = n.arccos(c)
# An___ = n.arccos((G / (n.array(n.dot(sc, u.T)) * n.array( n.dot(u, sc.T)))) ** .5)
An__ = n.degrees(An___)
min_angle = __mangle
# Add min_angle to every entry except the diagonal: the identity term
# subtracts it again there.  Assumes An__ is N x N -- TODO confirm.
An_ = An__ + min_angle - n.identity(N) * min_angle
# Symmetrise with the element-wise maximum of the matrix and its transpose,
# then keep only the real part.
An = n.real(n.maximum(An_, An_.T))  # communicability angles matrix

# Report elapsed time since the previous checkpoint and restart the timer.
print('communcability calculations', t.time() - tt, 'net size', N)
tt = t.time()

# E_original = n.linalg.eigvals(An)

# Choose the embedding estimator from __dimred: 'MDS', 'PCA', and any other
# value falls through to t-SNE.  MDS and t-SNE are both told that their input
# is precomputed; PCA receives no such flag.
# NOTE(review): in the PCA branch An is passed as an ordinary data matrix --
# confirm that is intended.
if __dimred == 'MDS':
    embedding = MDS(n_components=__dim,
                    n_init=__inits,
                    max_iter=__iters,
                    n_jobs=-1,
                    dissimilarity='precomputed')
elif __dimred == 'PCA':
    embedding = PCA(n_components=__dim)
else:
    embedding = TSNE(n_components=__dim,
                     n_iter=__iters,
                     metric='precomputed',
                     learning_rate=__lrate,
                     perplexity=__perplexity)
# `p` and `positions` are two names for the same embedding result.
p = positions = embedding.fit_transform(An)
# p = positions = embedding.fit_transform(X)
print('embedding', t.time() - tt)
tt = t.time()
def calculate_and_cluster():
    """Project the semantic-similarity matrix to 2-D with MDS and plot it.

    Steps, in order:

    1. Read ``../../data/sets/complete_set.csv``; every row is collapsed
       into a single string.
    2. Take the last space-separated token of each row and look up its tag
       set with ``get_tag_set``; the tokens are then processed in sorted
       order.
    3. For the n-th token (0-based) call ``rename_file`` to turn ``<n>.txt``
       into ``<n>_Tagged.txt`` when its tag set has exactly one element,
       otherwise into ``<n>_Untagged.txt``.
    4. Load ``matrix.csv``, pass it through ``symmetrize``, print whether the
       result equals its transpose, and embed it with a 2-component MDS on
       precomputed dissimilarities (fixed ``random_state``).
    5. Scatter one point per name, coloured through ``get_colour_tag``, with
       one legend entry per distinct tag, then call ``plt.show()``.

    Returns:
        None.  The function works through its side effects (file renames,
        one printed line, a matplotlib figure).
    """
    # Parse the CSV file; each row becomes one concatenated string.
    # NOTE(review): binary mode matches the Python 2 csv module this function
    # was written for; a Python 3 port needs text mode with newline=''.
    with open('../../data/sets/complete_set.csv', 'rb') as csvfile:
        data_list = [''.join(row) for row in csv.reader(csvfile, delimiter=',')]

    # Map the last space-separated token of every row to its tag set.
    # Rows that share a token collapse into a single entry.
    data_tag_map = {}
    for entry in data_list:
        last_token = entry.split(" ")[-1]
        data_tag_map[last_token] = get_tag_set(last_token)
    od = OrderedDict(sorted(data_tag_map.items()))

    names = []
    for counter, value in enumerate(od.values()):
        tag = 'Tagged' if len(value) == 1 else 'Untagged'
        names.append(str(counter) + "_" + tag)
        # Rename the file for later use in color co-ordination
        rename_file(str(counter) + '.txt', str(counter) + '_' + tag + '.txt')

    X = genfromtxt('../semantic_similarity_algorithms/semantic_similarity_matrix/matrix.csv', delimiter=',')
    X = symmetrize(X)
    # Sanity check: the matrix handed to MDS should equal its transpose.
    print((X.transpose() == X).all())

    # N Components: plotting points in a two-dimensional plane
    # Dissimilarity: "precomputed" because X already is a distance matrix
    # Random state is fixed so we can reproduce the plot.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    # A single fit is enough: fit_transform both fits the model and returns
    # the embedding, so a separate fit() beforehand would be thrown away.
    pos = mds.fit_transform(X)  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]

    # Set figure size to have dimensions of at least 15 inches for the width.
    # Height can be scaled accordingly.
    plt.figure(figsize=(15, 8))
    plt.subplot(211)

    # Scatter every point; the label is the part of the name after the first
    # underscore ('Tagged' / 'Untagged').  zip stops at the shortest input.
    for x, y, name in zip(xs, ys, names):
        tag = name.split('_', 1)[1]
        plt.scatter(x, y, s=100, c=get_colour_tag(tag), label=tag)
        #plt.text(x,y,name.split('_',1)[0])

    # Every point registered a legend handle; keep one handle per label.
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = OrderedDict(zip(labels, handles))
    plt.legend(by_label.values(), by_label.keys(), loc='lower center', ncol=4, bbox_to_anchor=(0.5, -0.6))

    plt.show()
Example #57
0
    # ISOMAP
    from sklearn.manifold import Isomap

    iso = Isomap(n_components=3, n_neighbors=15)
    fdata = iso.fit_transform(digits['data'])
    plot_figure(fdata, 'ISOMAP')

    # LLE
    from sklearn.manifold import LocallyLinearEmbedding

    lle = LocallyLinearEmbedding(n_neighbors=15,
                                 n_components=3,
                                 method='modified')
    fdata = lle.fit_transform(digits['data'])
    plot_figure(fdata, 'LLE')

    # MDS
    from sklearn.manifold import MDS

    mds = MDS(n_components=3)
    fdata = mds.fit_transform(digits['data'])
    plot_figure(fdata, 'MDS')

    # TSNE
    from sklearn.manifold import TSNE

    tsne = TSNE(n_components=3, perplexity=25, early_exaggeration=100)
    fdata = tsne.fit_transform(digits['data'])
    plot_figure(fdata, 't-SNE')
    input_csv = "/home/li/torch/data/Data_Input_164_nakamura_20190605.csv"
    output_csv = "/home/li/torch/data/Data_Output_164_nakamura_20190605.csv"

    plot_path = "/home/li/torch/normal_net/figure/output/object_8_nakamura_output_mds_figure.png"
    csv_path = "/home/li/torch/normal_net/figure/output/object_8_nakamura_output_distance.csv"

    item_name_path = "/home/li/torch/normal_net/figure/output/item_name_nakamura.txt"

    model = torch.load(model_path)
    model.eval()

    item_list = model.item_list
    dataset = GlobalModelDataset(input_csv, output_csv)

    embedding = MDS(n_components=2, dissimilarity="precomputed")

    input_sample = random.sample(range(64), OBJECT_NUM)

    #input_sample = [4,14,45,62,35,22,54,23]

    input_name_list = []
    for i in input_sample:
        input_name_list.append(item_list[i])

    with open(item_name_path, 'w') as item_f:
        for item in input_name_list:
            item_f.write(str(item) + "\r\n")

    input_test = []
    for item in item_list:
Example #59
0
    columns, MyList, MyList2, MyList3, MyList4, MyList5, MyList6, MyList7,
    MyList8, MyList9, MyList10, MyList11, MyList12, MyList13
])
print(df2)

# Calculate distances between documents
# Euclidean distance
dist = euclidean_distances(dtm)
print(np.round(dist, 0))
# Cosine distance, i.e. 1 - cosine similarity
cosdist = 1 - cosine_similarity(dtm)
print(np.round(cosdist, 3))

# Visualizations (three methods)
# Visualize in 2D
# "precomputed": cosdist is passed to MDS as a ready-made distance matrix.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(cosdist)  # shape (n_samples, n_components)
xs, ys = pos[:, 0], pos[:, 1]
names = [
    'Austen_Emma', 'Austen_Pride', 'Austen_Sense', 'CBronte_Jane',
    'CBronte_Professor', 'CBronte_Villette', 'Dickens_Bleak', 'Dickens_David',
    'Dickens_Hard', 'EBronte_Wuthering', 'Eliot_Adam', 'Eliot_Middlemarch',
    'Eliot_Mill'
]
# One blue marker plus a text label per document.
for x, y, name in zip(xs, ys, names):
    plt.scatter(x, y, color="blue")
    plt.text(x, y, name, fontsize=10)
# The title does not depend on the loop, so it is set once.
plt.title("Visualization in 2D")
#fig = plt.figure()
#fig.savefig('2D.png')
plt.show()
Example #60
0
def plot_clusters(num_clusters,
                  feature_matrix,
                  cluster_data,
                  movie_data,
                  plot_size=(16, 8)):
    """Plot movies in 2-D, grouped and coloured by cluster.

    The rows of ``feature_matrix`` are turned into a cosine-distance matrix,
    embedded in two dimensions with MDS (precomputed dissimilarities, fixed
    ``random_state``) and drawn one cluster at a time.  Every point is
    annotated with its title.  The figure is written to
    ``clusters_data.png`` (dpi 200) and then shown.

    Args:
        num_clusters: Not used by the function; kept so existing callers
            keep working.
        feature_matrix: Input accepted by ``cosine_similarity``; one row per
            movie, in the same order as ``movie_data``.
        cluster_data: Mapping from cluster number to a dict whose
            ``'key_features'`` entry is a sequence of strings.  The first
            five are joined into the legend label.  It must contain every
            label that occurs in ``movie_data['Cluster']``.
        movie_data: Table with ``'Cluster'`` and ``'Title'`` columns
            (accessed as ``movie_data[col].values.tolist()``).
        plot_size: Figure size passed to ``plt.subplots``.

    Returns:
        None.
    """
    # generate random color for clusters
    def generate_random_color():
        color = '#%06x' % random.randint(0, 0xFFFFFF)
        return color

    # define markers for clusters
    markers = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd']
    # build cosine distance matrix
    cosine_distance = 1 - cosine_similarity(feature_matrix)
    # dimensionality reduction using MDS
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    # get coordinates of clusters in new low-dimensional space
    plot_positions = mds.fit_transform(cosine_distance)
    x_pos, y_pos = plot_positions[:, 0], plot_positions[:, 1]
    # build cluster plotting data
    cluster_color_map = {}
    cluster_name_map = {}
    for cluster_num, cluster_details in cluster_data.items():
        # assign cluster features to unique label
        cluster_color_map[cluster_num] = generate_random_color()
        cluster_name_map[cluster_num] = ', '.join(
            cluster_details['key_features'][:5]).strip()
    # map each unique cluster label with its coordinates and movies
    cluster_plot_frame = pd.DataFrame({
        'x': x_pos,
        'y': y_pos,
        'label': movie_data['Cluster'].values.tolist(),
        'title': movie_data['Title'].values.tolist(),
    })
    grouped_plot_frame = cluster_plot_frame.groupby('label')
    # set plot figure size and axes
    fig, ax = plt.subplots(figsize=plot_size)
    ax.margins(0.05)
    # Axis cosmetics do not depend on the cluster, so they are applied once.
    # tick_params expects booleans; the string 'off' is truthy and would
    # leave the ticks visible on current matplotlib.
    ax.set_aspect('auto')
    ax.tick_params(axis='x',
                   which='both',
                   bottom=False,
                   top=False,
                   labelbottom=False)
    ax.tick_params(axis='y',
                   which='both',
                   left=False,
                   top=False,
                   labelleft=False)
    # plot each cluster using co-ordinates and movie titles
    for cluster_num, cluster_frame in grouped_plot_frame:
        # clusters beyond the marker list get a randomly chosen marker
        marker = markers[cluster_num] if cluster_num < len(markers) \
                 else np.random.choice(markers, size=1)[0]
        ax.plot(cluster_frame['x'],
                cluster_frame['y'],
                marker=marker,
                linestyle='',
                ms=12,
                label=cluster_name_map[cluster_num],
                color=cluster_color_map[cluster_num],
                mec='none')
    fontP = FontProperties()
    fontP.set_size('small')
    ax.legend(loc='upper center',
              bbox_to_anchor=(0.5, -0.01),
              fancybox=True,
              shadow=True,
              ncol=5,
              numpoints=1,
              prop=fontP)
    # add labels as the film titles; iterating the columns directly avoids
    # DataFrame.ix, which no longer exists in current pandas
    for x, y, title in zip(cluster_plot_frame['x'],
                           cluster_plot_frame['y'],
                           cluster_plot_frame['title']):
        ax.text(x, y, title, size=8)
    plt.savefig('clusters_data.png', dpi=200)
    # show the plot
    plt.show()