def main():
    Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()
    dbn = DBN([1000, 750, 500], UnsupervisedModel=AutoEncoder)
    # dbn = DBN([1000, 750, 500, 10])
    output = dbn.fit(Xtrain, pretrain_epochs=2)
    print("output.shape", output.shape)

    # sample before using t-SNE because it requires lots of RAM
    sample_size = 600
    tsne = TSNE()
    reduced = tsne.fit_transform(output[:sample_size])
    plt.scatter(reduced[:,0], reduced[:,1], s=100, c=Ytrain[:sample_size], alpha=0.5)
    plt.title("t-SNE visualization")
    plt.show()

    # t-SNE on raw data
    reduced = tsne.fit_transform(Xtrain[:sample_size])
    plt.scatter(reduced[:,0], reduced[:,1], s=100, c=Ytrain[:sample_size], alpha=0.5)
    plt.title("t-SNE visualization")
    plt.show()

    pca = PCA()
    reduced = pca.fit_transform(output)
    plt.scatter(reduced[:,0], reduced[:,1], s=100, c=Ytrain, alpha=0.5)
    plt.title("PCA visualization")
    plt.show()
Example #2
def vizualize_clusters(X, y, py, hist=None):
    """ Using T-SNE to visualize the site clusters.
        Plot and save the scatter plot (and the histogram).
    """
    model = TSNE(n_components=2, random_state=0)

    fig = model.fit_transform(X, y)
    fig1 = model.fit_transform(X, py)

    pyplot.figure(figsize=(16, 8))
    pyplot.subplot(121)

    classes = list(set(y))
    for c, color in zip(classes, plt.colors.cnames.items()):
        indices = [i for i, p in enumerate(y) if p == c]
        pyplot.scatter(fig[indices, 0], fig[indices, 1], marker="o", c=color[0])

    pyplot.subplot(122)

    clusters = list(set(py))
    for c, color in zip(clusters, plt.colors.cnames.items()):
        indices = [i for i, p in enumerate(py) if p == c]
        pyplot.scatter(fig1[indices, 0], fig1[indices, 1], marker="o", c=color[0])

    # pyplot.show()
    pyplot.savefig("clusters" + "_scatter.png")

    if hist is not None:
        pyplot.figure(figsize=(4, 4))
        pyplot.xticks(clusters)

        pyplot.bar(clusters, hist, align="center")
        # pyplot.show()
        pyplot.savefig("clusters" + "_hist.png")
Example #3
def tsne(similarity, euclid=False, perplexity=30):
    if euclid:
        model = TSNE(learning_rate=100, perplexity=perplexity, n_iter=200000)
        result = model.fit_transform(similarity)
    else:
        model = TSNE(learning_rate=100, n_iter=100000, init='random', metric='precomputed')
        result = model.fit_transform(1 - similarity)

    return result.T
Example #4
def MyTSNE(train,test):
    #MyTSNE(train.iloc[:100,:],test.iloc[:20,:])
    model = TSNE(n_components=2, random_state=0)
    a = np.vstack(
            [train.values,
            test.values]
            )
    return model.fit_transform(a)
def programmer_5(data_zs, r):
    # Reduce the data to two dimensions
    tsne = TSNE()
    tsne.fit_transform(data_zs)
    tsne = pd.DataFrame(tsne.embedding_, index=data_zs.index)

    # Plot each cluster with a different colour and marker style
    d = tsne[r[u'聚类类别'] == 0]
    plt.plot(d[0], d[1], 'r.')
    d = tsne[r[u'聚类类别'] == 1]
    plt.plot(d[0], d[1], 'go')
    d = tsne[r[u'聚类类别'] == 2]
    plt.plot(d[0], d[1], 'b*')
    plt.show()
def learn_embedding(precompute_metric=False, use_saved=False):
    base = 'datasets/dspace_topics'
    new_base = 'datasets/dspace_embeddings'
    # Delete previous saved embedding
    if os.path.exists(new_base):
        shutil.rmtree(new_base)
    os.makedirs(new_base)

    print('Embedding: Extracting topics')
    # choose a random subset of documents
    filename_vec = os.listdir(base)
    subsample = 5000
    filename_vec = np.random.choice(filename_vec, subsample)
    topic_vec = []
    for filename in tqdm(filename_vec):
        path = os.path.join(base, filename)
        with open(path) as f:
            d = json.load(f)
            topics = d['topics']
            topic_vec.append(topics)

    print('Embedding: Computing pairwise distances')
    if precompute_metric:
        if use_saved:
            with open('metric.npy', 'rb') as f:
                metric = np.load(f)
        else:
            metric = pairwise_distances(np.array(topic_vec), metric=KL, n_jobs=-1)
            with open('metric.npy', 'wb') as f:
                np.save(f, metric)

        print('Embedding: Learning embedding')
        tsne = TSNE(n_iter=1000, verbose=10, metric='precomputed')
        y = tsne.fit_transform(metric)
    else:
        print('Embedding: Learning embedding')
        tsne = TSNE(n_iter=1000, verbose=10)
        y = tsne.fit_transform(topic_vec)

    print('Embedding: Saving embedding')
    for (index, filename) in tqdm(enumerate(filename_vec), total=len(filename_vec)):
        path = os.path.join(base, filename)
        with open(path, 'r') as f:
            d = json.load(f)
            d['embedding'] = y[index].tolist()
            new_path = os.path.join(new_base, filename)
            with open(new_path, 'w') as new_f:
                json.dump(d, new_f)
Example #7
def add_tsne_features(x_train, x_test):
    print('add_tsne_features <<')

    x_train_data = x_train.data_
    x_test_data = x_test.data_

    x = np.vstack((x_train_data, x_test_data))

    print('applying pca...')
    pca = PCA(n_components=25)
    x_pca = pca.fit_transform(x)

    print('applying t-SNE...')
    tsne_model = TSNE(n_components=2, random_state=0)
    x_tsne = tsne_model.fit_transform(x_pca)
    x_train_data = np.hstack((x_train_data, x_tsne[:x_train_data.shape[0], :]))
    x_test_data = np.hstack((x_test_data, x_tsne[-x_test_data.shape[0]:, :]))

    assert(x_train.columns_ == x_test.columns_)
    columns = x_train.columns_ + ['tsne_1', 'tsne_2']
    x_train = DataSet(x_train.ids_, columns, x_train_data)
    x_test = DataSet(x_test.ids_, columns, x_test_data)

    print('add_tsne_features >>')
    return x_train, x_test
Example #8
    def sendTSNE(self, people):
        d = self.getData()
        if d is None:
            return
        else:
            (X, y) = d

        X_pca = PCA(n_components=50).fit_transform(X, X)
        tsne = TSNE(n_components=2, init='random', random_state=0)
        X_r = tsne.fit_transform(X_pca)

        yVals = list(np.unique(y))
        colors = cm.rainbow(np.linspace(0, 1, len(yVals)))

        # print(yVals)

        plt.figure()
        for c, i in zip(colors, yVals):
            name = "Unknown" if i == -1 else people[i]
            plt.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=name)
            plt.legend()

        # assumes io, base64 and urllib.parse are imported (Python 3 replacements
        # for the original StringIO / urllib usage)
        imgdata = io.BytesIO()
        plt.savefig(imgdata, format='png')
        imgdata.seek(0)

        content = 'data:image/png;base64,' + \
                  urllib.parse.quote(base64.b64encode(imgdata.getvalue()))
        msg = {
            "type": "TSNE_DATA",
            "content": content
        }
        self.sendMessage(json.dumps(msg))
Example #9
def display_closestwords_tsnescatterplot(model, word):
    arr = np.empty((0,300), dtype='f')
    word_labels = [word]

    # get close words
    close_words = model.similar_by_word(word)
    
    # add the vector for each of the closest words to the array
    arr = np.append(arr, np.array([model[word]]), axis=0)
    for wrd_score in close_words:
        wrd_vector = model[wrd_score[0]]
        word_labels.append(wrd_score[0])
        arr = np.append(arr, np.array([wrd_vector]), axis=0)
        
    # find tsne coords for 2 dimensions
    tsne = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)

    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    # display scatter plot
    plt.scatter(x_coords, y_coords)

    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    plt.xlim(x_coords.min()+0.00005, x_coords.max()+0.00005)
    plt.ylim(y_coords.min()+0.00005, y_coords.max()+0.00005)
    plt.show()
Example #10
def make_tsne_plot(model, rel_wds, plot_lims, title):

    dim = 30
    X, keys = make_data_matrix(model)

    # first we actually do PCA to reduce the
    # dimensionality to make tSNE easier to calculate
    X_std = StandardScaler().fit_transform(X)
    sklearn_pca = PCA(n_components=dim)
    X = sklearn_pca.fit_transform(X_std)[:, :dim]

    # do downsample
    k = 5000
    sample = []
    important_words = []
    r_wds = [word[0] for word in rel_wds]
    for i, key in enumerate(keys):
        if key in r_wds:
            sample.append(i)
    sample = np.concatenate((np.array(sample),
                np.random.choice(len(keys), k-10, replace = False),
             ))
    X = X[sample,:]
    keys = [keys[i] for i in sample]



    # Do tSNE
    tsne = TSNE(n_components=2, random_state=0, metric="cosine")
    X_transf = tsne.fit_transform(X)

    k_means = KMeans(n_clusters=8)
    labels = k_means.fit_predict(X_transf)

    scatter_plot(X_transf[:,0], X_transf[:,1],  rel_wds, labels, title, keys, plot_lims)
def plot_features(subject, data_path, model_path, test_labels, dataset='test'):
    with open(model_path + '/' + subject + '.pickle', 'rb') as f:
        state_dict = cPickle.load(f)
    cnn = ConvNet(state_dict['params'])
    cnn.set_weights(state_dict['weights'])
    scalers = state_dict['scalers']

    if dataset == 'test':
        d = load_test_data(data_path, subject)
        x = d['x']
        y = test_labels['preictal']
    elif dataset == 'train':
        d = load_train_data(data_path, subject)
        x, y = d['x'], d['y']
    else:
        raise ValueError('dataset')

    x, _ = scale_across_time(x, x_test=None, scalers=scalers) if state_dict['params']['scale_time'] \
        else scale_across_features(x, x_test=None, scalers=scalers)

    cnn.batch_size.set_value(x.shape[0])
    get_features = theano.function([cnn.x, Param(cnn.training_mode, default=0)], cnn.feature_extractor.output,
                                 allow_input_downcast=True)

    logits_test = get_features(x)
    model = TSNE(n_components=2, random_state=0)
    z = model.fit_transform(np.float64(logits_test))
    plt.scatter(z[:, 0], z[:, 1], s=60, c=y)
    plt.show()
Example #12
def plot_data(data, has_label=True):
	import numpy as np
	import pandas as pd
	import seaborn as sns
	import matplotlib.pyplot as plt
	from sklearn.manifold import TSNE
	from sklearn.decomposition import PCA

	if not has_label:
		data = data.copy()
		data['label'] = np.zeros([len(data),1])

	LIMIT = 4000
	if data.shape[0] > LIMIT:
		dt = data.sample(n=LIMIT, replace=False)
		X = dt.iloc[:, :-1]
		labels = dt.iloc[:, -1]
	else:
		X = data.iloc[:, :-1]
		labels = data.iloc[:, -1]

	tsne_model = TSNE(n_components=2, random_state=0)
	np.set_printoptions(suppress=True)
	points1 = tsne_model.fit_transform(X)
	df1 = pd.DataFrame(data=np.column_stack([points1,labels]), columns=["x","y","class"])
	sns.lmplot(x="x", y="y", data=df1, hue='class', fit_reg=False, palette=sns.color_palette('colorblind'))
	plt.title('t-SNE')

	pca = PCA(n_components=2)
	pca.fit(X)
	points2 = pca.transform(X)
	df2 = pd.DataFrame(data=np.column_stack([points2,labels]), columns=["x","y","class"])
	sns.lmplot(x="x", y="y", data=df2, hue='class', fit_reg=False, palette=sns.color_palette('colorblind'))
	plt.title('PCA')
Example #13
    def plot_mfi(self, outputfile='embeddings.pdf', nb_clusters=8, weights='NA'):
        # collect embeddings for mfi:
        X = np.asarray([self.w2v_model[w] for w in self.mfi \
                            if w in self.w2v_model], dtype='float32')
        # dimension reduction:
        tsne = TSNE(n_components=2)
        coor = tsne.fit_transform(X) # unsparsify

        plt.clf()
        sns.set_style('dark')
        plt.rcParams['axes.linewidth'] = 0.4
        fig, ax1 = plt.subplots()

        labels = self.mfi
        # first plot slices:
        x1, x2 = coor[:,0], coor[:,1]
        ax1.scatter(x1, x2, 100, edgecolors='none', facecolors='none')
        # clustering on top (add some colouring):
        clustering = AgglomerativeClustering(linkage='ward',
                            affinity='euclidean', n_clusters=nb_clusters)
        clustering.fit(coor)
        # add names:
        for x, y, name, cluster_label in zip(x1, x2, labels, clustering.labels_):
            ax1.text(x, y, name, ha='center', va="center",
                     color=plt.cm.nipy_spectral(cluster_label / 10.),
                     fontdict={'family': 'Arial', 'size': 8})
        # control aesthetics:
        ax1.set_xlabel('')
        ax1.set_ylabel('')
        ax1.set_xticklabels([])
        ax1.set_xticks([])
        ax1.set_yticklabels([])
        ax1.set_yticks([])
        plt.savefig(outputfile, bbox_inches=0)
Example #14
def perform_AE(X, dim=2, tsne=False):
    y = np.zeros(shape=X.shape[0], dtype=int)
    
    if tsne:
        hidden_layers = [X.shape[1], 500, 100, 32]
        encoder_weights, decoder_weights = pretrain(X, hidden_layers)
        X_32d = ae(X, encoder_weights, decoder_weights, hidden_layers)

        ae_tsne = TSNE(n_components=dim, learning_rate=800, verbose=1)
        X_2d = ae_tsne.fit_transform(X_32d)

        method = 'ae_tsne_scaled'
    ### END - if tsne

    else:
        hidden_layers = [X.shape[1], 500, 100, 20, dim]
        encoder_weights, decoder_weights = pretrain(X, hidden_layers)
        X_2d = ae(X, encoder_weights, decoder_weights, hidden_layers)
        
        method = 'ae_scaled'
    ### END - else

    print('***** ' + method + ' *****')
    cluster(X_2d, method)
    np.save("{0}_{1}_X_2d".format(species, method), X_2d)
Example #15
def plot_phonemes(path):
    phoneme_embeddings = dict()
    for line in codecs.open(path,"r"):
        line = line.split(",")
        key= line[0][1:-1]
        emb = line[1:]
        emb[-1] = emb[-1][:-1]
        emb = np.array([float(e) for e in emb])
        phoneme_embeddings[key] = emb
    
    phoneme_embeddings = DataFrame(phoneme_embeddings,columns=phoneme_embeddings.keys())
    print(phoneme_embeddings.columns)
    
    m = TSNE()
    phoneme_embeddings_tsne = m.fit_transform(phoneme_embeddings.transpose())
    print(len(phoneme_embeddings_tsne))
    for p,emb in zip(phoneme_embeddings.columns, phoneme_embeddings_tsne):
        c = "black"
        if regex.search("^[aeiou3E][*]?$", p):
            c = "red"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*w~$", p):
            c = "blue"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*y~$", p):
            c = "yellow"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*h~$", p):
            c = "brown"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*\"$", p):
            c = "green"
            plt.annotate(p,(emb[0],emb[1]),color=c)
Example #16
def t_sne_view(norm_table, subj_cond, cohorts, image_type):

    # t-SNE analysis: Use stochastic neighbor embedding to reduce dimensionality of
    # data set to two dimensions in a non-linear, distance dependent fashion

    # Perform PCA data reduction if dimensionality of feature space is large:
    if len(norm_table.columns) > 12:
        pca = PCA(n_components = 12)
        pca.fit(norm_table.values)

        raw_data = pca.transform(norm_table.values)
    else:
        raw_data = norm_table.values
 
    # Transform data into a two-dimensional embedded space:
    tsne = TSNE(n_components = 2, perplexity = 40.0, early_exaggeration= 2.0, 
        learning_rate = 100.0, init = 'pca')

    tsne_data = tsne.fit_transform(raw_data)

    # Prepare for normalization and view:
    cols = ['t-SNE', 'Cluster Visualization']
    tsne_table = pd.DataFrame(tsne_data, index = norm_table.index, columns = cols)
           
    # The output is no longer centered or normalized, so shift & scale it before display:
    tsne_avg = ppmi.data_stats(tsne_table, subj_cond, cohorts)
    tsne_norm_table = ppmi.normalize_table(tsne_table, tsne_avg)       
    
    # Send out to graphics rendering engine:

    if (image_type == 'Gauss'):
        return scg.scatter_gauss(tsne_norm_table[cols[0]], tsne_norm_table[cols[1]], subj_cond)
    elif (image_type == 'Scatter'):
        return scg.scatter_plain(tsne_norm_table[cols[0]], tsne_norm_table[cols[1]], subj_cond)
def main():
    embedding = WordEmbedding(embeddingpath(default_embeddingconfig))


    for old, new in spelling_changes:
        print(old, '--', new)
        print(embedding.nearest_words([old]))
        print()

    print()
    war, ist = tense_changes[0]
    tensediff = embedding[ist] - embedding[war]
    for past, present in tense_changes[1 : ]:
        print(past, '+ tensediff:', *embedding.nearest_words([embedding[past] + tensediff]))
        print('Should be:', present)
        print()

    # word_diffs = [embedding[new] - embedding[old] for (old, new) in word_changes]

    spelling_diffs = [embedding[new] - embedding[old] for (old, new) in spelling_changes[10 : 20]]
    tense_diffs = [embedding[present] - embedding[past] for (past, present) in tense_changes]

    def metric(u, v):
        return max(distance.cosine(u, v), 0)

    while True:
        try:
            model = TSNE(n_components=2, metric=metric)
            reduced = model.fit_transform(spelling_diffs + tense_diffs)
            print(reduced)
            return
        except Exception:
            pass
Example #18
def apply_tSNE30(proj_data, proj_weights=None):
    model = TSNE(n_components=2, perplexity=30.0, metric="euclidean",
                 learning_rate=100, early_exaggeration=4.0,
                 random_state=RANDOM_SEED)
    norm_data = normalize_columns(proj_data)
    result = model.fit_transform(norm_data.T)
    return result
Example #19
    def plot_mean_activation_and_stuff(some_probs, Y, do_tsne=False):
        pyplot.clf()
        probs = numpy.float32(some_probs)
        xv = numpy.arange(probs.shape[1])#probs.var(axis=0)
        yv = probs.mean(axis=0)
        pyplot.axis([-0.1, probs.shape[1],0,1])
        for k in range(probs.shape[1]):
            pyplot.plot(xv[k]*numpy.ones(probs.shape[0]),probs[:,k],'o',ms=4.,
                        markeredgecolor=(1, 0, 0, 0.01),
                        markerfacecolor=(1, 0, 0, 0.01),)
        pyplot.plot(xv,yv, 'bo')
        pyplot.show(block=False)
        if do_video:
            pyplot.savefig(video.stdin, format='jpeg')
            video.stdin.flush()
        pyplot.savefig('epoch_probs.png')

        if not do_tsne: return
        try:
            from sklearn.manifold import TSNE
            tsne = TSNE(random_state=0)
            ps = tsne.fit_transform(numpy.float64(probs[:400]))
            pyplot.clf()
            Y = numpy.int32(Y)[:400]
            for i,c,s in zip(range(10),list('bgrcmyk')+[(.4,.3,.9),(.9,.4,.3),(.3,.9,.4)],'ov'*5):
                sub = ps[Y == i]
                pyplot.plot(sub[:,0], sub[:,1], s,color=c,ms=3,mec=c)
            pyplot.show(block=False)
            pyplot.savefig('probs_embed.png')
        except ImportError:
            print("cannot do t-SNE")
Example #20
def topic_dimen_reduce(words, word2vec):
    dictionary, matrix = terms_analysis.get_words_matrix(words, word2vec)
    pca = PCA(n_components=50)
    pca_matrix = pca.fit_transform(matrix)
    tsne = TSNE(n_components=2)
    t_matrix = tsne.fit_transform(pca_matrix)
    return dictionary, t_matrix
Example #21
def tsnePlot(plotname, modelName, word, dest):
    
    """Plots a tsne graph of words most similar to the word passed in the argument (as represented in the model previously calculated)"""
    
    model = word2vec.Word2Vec.load(modelName)
    words = [model.most_similar(word)[i][0] for i in range(0, len(model.most_similar(word)))]
    words.append(word)

    # nested list containing the 100-dimensional word vector of each most-similar word
    
    word_vectors = [model[word] for word in words]
    word_vectors = np.array(word_vectors)

    tsne_model = TSNE(n_components=2, random_state=0)
    Y = tsne_model.fit_transform(word_vectors)
    # plotting below assumes matplotlib.pyplot is imported as plt
    plt.plot(Y[:,0], Y[:,1], 'o')

    for word, x, y in zip(words, Y[:,0], Y[:,1]):
        plt.annotate(word, (x, y), size=12)
        # plt.pause(10)

    plotname = plotname + ".png"

    if not os.path.exists(dest):
        os.makedirs(dest)

    path = os.path.join(dest, plotname)

    plt.savefig(path)
    def reduce_dimentionality(self):
        self.vectors = []
        for key in self.selected_words:
            self.vectors.append(self.model[key])
        tsne_model = TSNE(n_components=2, random_state=0)
        np.set_printoptions(suppress=True)
        self.reduced_vectors = tsne_model.fit_transform(self.vectors)
Example #23
def plotTSNEDecisionBoundaries(): 
    
    tsne = TSNE()
    tsne_data = tsne.fit_transform(feature_set)
    x_min,x_max = tsne_data[:,0].min()-1, tsne_data[:,0].max() + 1
    y_min,y_max = tsne_data[:,1].min()-1, tsne_data[:,1].max() + 1
    step_size = 2.0
    
    xx,yy = np.meshgrid(np.arange(x_min,x_max,step_size),np.arange(y_min,y_max,step_size))
    
    for index,classifier in enumerate(classifiers):
        
        plt.subplot(2,3,index+1)
        plt.subplots_adjust(wspace=0.5,hspace=0.5)
        classifier.fit(tsne_data,class_labels)
        
        Z = classifier.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        
        plt.contourf(xx,yy,Z,cmap=plt.cm.Paired,alpha=0.7)
        plt.scatter(tsne_data[:,0],tsne_data[:,1],c=class_labels,cmap=plt.cm.rainbow,alpha=0.6)
        plt.xlabel("Feature 1")
        plt.ylabel("Feature 2")
        plt.xlim(x_min,x_max)
        plt.ylim(y_min,y_max)
        plt.xticks(())
        plt.yticks(())
        plt.title(classifier_names[index])
        
    plt.show()
Example #24
def visualization(result, word_dict):
	tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
	plot_only = 500

	low_dim_embs = tsne.fit_transform(result[0:500])
	labels = [ word_dict[i] for i in range(500) ]
	plot_with_labels(low_dim_embs, labels)
Example #25
def project_in_2D(distance_mat, method='mds'):
  """
  Project SDRs onto a 2D space using manifold learning algorithms
  :param distance_mat: A square matrix with pairwise distances
  :param method: Select method from 'mds' and 'tSNE'
  :return: an array with dimension (numSDRs, 2). It contains the 2D projections
     of each SDR
  """
  seed = np.random.RandomState(seed=3)

  if method == 'mds':
    mds = MDS(n_components=2, max_iter=3000, eps=1e-9,
              random_state=seed,
              dissimilarity="precomputed", n_jobs=1)

    pos = mds.fit(distance_mat).embedding_

    nmds = MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12,
               dissimilarity="precomputed", random_state=seed,
               n_jobs=1, n_init=1)

    pos = nmds.fit_transform(distance_mat, init=pos)
  elif method == 'tSNE':
    tsne = TSNE(n_components=2, init='pca', random_state=0)
    pos = tsne.fit_transform(distance_mat)
  else:
    raise NotImplementedError

  return pos
Example #26
def PlotTSNE(data, labels):  # Takes the data and the labels
	# Visualize the results on TSNE reduced data

	print("BUSY IN TSNE")

	model = TSNE(n_components=2, random_state=0)
	reduced_data = model.fit_transform(data)

	print("DATA REDUCED")

	# Plot the decision boundary. For that, we will assign a color to each
	x_min, x_max = reduced_data[:, 0].min(), reduced_data[:, 0].max()
	y_min, y_max = reduced_data[:, 1].min(), reduced_data[:, 1].max()
	
	plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
	
	#Adds labels to the plot
	for label, x, y in zip(labels, reduced_data[:, 0], reduced_data[:, 1]):
	    plt.annotate(
	        label, 
	        xy = (x, y), xytext = (-20, 20),
	        textcoords = 'offset points', ha = 'right', va = 'bottom',
	        bbox = dict(boxstyle = 'round,pad=0.5', fc = 'green', alpha = 0.5),
	        arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))

	plt.title('TSNE Plot')
	plt.xlim(x_min, x_max)
	plt.ylim(y_min, y_max)
	plt.xticks(())
	plt.yticks(())
	plt.show()
def plotly_js_viz(word_2_vec_model):
    tsne_model=TSNE(n_components=2,random_state=5)
    data=tsne_model.fit_transform(word_2_vec_model.syn0)
    xd=list(data[:,0])
    yd=list(data[:,1])
    names_our=word_2_vec_model.index2word
    plot([Scatter(x=xd,y=yd,mode="markers",text=names_our)])
def visualize_latent_rep(args, model, x_latent):
    print("pca_on=%r pca_comp=%d tsne_comp=%d tsne_perplexity=%f tsne_lr=%f" % (
        args.use_pca,
        args.pca_components,
        args.tsne_components,
        args.tsne_perplexity,
        args.tsne_lr
    ))

    if args.use_pca:
        pca = PCA(n_components = args.pca_components)
        x_latent = pca.fit_transform(x_latent)

    figure(figsize=(6, 6))
    scatter(x_latent[:, 0], x_latent[:, 1], marker='.')
    show()

    tsne = TSNE(n_components = args.tsne_components,
                perplexity = args.tsne_perplexity,
                learning_rate = args.tsne_lr,
                n_iter = args.tsne_iterations,
                verbose = 4)
    x_latent_proj = tsne.fit_transform(x_latent)
    del x_latent

    figure(figsize=(6, 6))
    scatter(x_latent_proj[:, 0], x_latent_proj[:, 1], marker='.')
    show()
Example #29
def tsne_plot(model):
    # Creates a t-SNE model and plots it
    labels = []
    tokens = []

    for word in model.wv.vocab:
        tokens.append(model[word])
        labels.append(word)

    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])

    plt.figure(figsize=(16, 16))
    for i in range(len(x)):
        plt.scatter(x[i], y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()
Example #30
def infer(FLAGS):
    """
    Inference.
    """

    # Retrieve embeddings for docs
    words = ["tennis", "wimbledon", "icecream", "cake", "bear", "pie"]

     # Get index in doc embeddings
    with open(os.path.join(basedir, FLAGS.data_dir, "doc_to_idx.json"), 'r') as f:
        doc_to_idx = json.load(f)

    # Load the trained model
    model = torch.load(os.path.join(basedir, FLAGS.data_dir, "model.pt"))
    doc_embeddings = model.doc_embeddings.weight.data

    my_embeddings = np.array(
        [doc_embeddings[doc_to_idx[word]].numpy() for word in words])

    # Use TSNE model to reduce dimensionality
    model = TSNE(n_components=2, random_state=0)
    points = model.fit_transform(my_embeddings)

    # Visualize
    for i, word in enumerate(words):
        x, y = points[i, 0]*1e4, points[i, 1]*1e4
        plt.scatter(x, y)
        plt.annotate(word, xy=(x, y), xytext=(25, 5),
            textcoords='offset points', ha='right', va='bottom')
    plt.show()
bottleneck_representation = encoder.predict([X_scRNAseq, X_scProteomics])
print(pd.DataFrame(bottleneck_representation).shape)
print(pd.DataFrame(bottleneck_representation).iloc[0:5,0:5])

# Dimensionality reduction plot
#plt.figure(figsize=(20, 15))
plt.scatter(bottleneck_representation[:, 0], bottleneck_representation[:, 1], c = Y_scRNAseq, cmap = 'tab20', s = 10)
plt.title('Autoencoder Data Integration')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
#plt.colorbar()
plt.show()

# tSNE on Autoencoder bottleneck representation
model_tsne_auto = TSNE(learning_rate = 200, n_components = 2, random_state = 123, perplexity = 90, n_iter = 1000, verbose = 1)
tsne_auto = model_tsne_auto.fit_transform(bottleneck_representation)
plt.scatter(tsne_auto[:, 0], tsne_auto[:, 1], c = Y_scRNAseq, cmap = 'tab20', s = 10)
plt.title('tSNE on Autoencoder: Data Integration, CITEseq')
plt.xlabel("tSNE1")
plt.ylabel("tSNE2")
plt.show()

# UNIFORM MANIFOLD APPROXIMATION AND PROJECTION (UMAP)
#model_umap = UMAP(n_neighbors = 20, min_dist = 0.3, n_components = 2)
#umap = model_umap.fit_transform(bottleneck_representation)
#plt.scatter(umap[:, 0], umap[:, 1], c = Y_scRNAseq, cmap = 'tab20', s = 10)
#plt.title('UMAP on Autoencoder')
#plt.xlabel("UMAP1")
#plt.ylabel("UMAP2")
#plt.show()
Example #32
    'spleenV5_rel', 'spleenV10_rel', 'spleenV15_rel', 'spleenV20_rel',
    'spleenV25_rel', 'spleenV30_rel', 'spleenV35_rel', 'spleenV40_rel',
    'spleenV45_rel', 'spleenV50_rel', 'meanspleendose', 'spleenvolume'
]

new_data_dense_features2 = string_float(new_data[dense_features2]).values
N = 20000
n_components = 3
tsne = TSNE(n_components=n_components,
            perplexity=50,
            n_iter=N,
            init='pca',
            random_state=0,
            verbose=1,
            method='exact')
tsne_results = tsne.fit_transform(new_data_dense_features2)

new_data = new_data.drop(dense_features2, axis=1)
new_data["DoseComp1"] = tsne_results[:, 0]
new_data["DoseComp2"] = tsne_results[:, 1]
new_data["DoseComp3"] = tsne_results[:, 2]
dense_features2_new = ["DoseComp1", "DoseComp2", "DoseComp3"]

#new_data=np.concatenate([new_data.values,tsne_results],axis=1)
new_data.to_csv('Data_tsne_20K_3c.csv', index=False)

data_train, data_test = train_test_split(new_data,
                                         test_size=.3,
                                         random_state=8)

y_train_class = data_train.pop('G4RIL').values
def project(cls1_data,
            cls2_data,
            projection='mds',
            setsize=None,
            with_debiasing=None,
            figname=None):
    if type(cls1_data) == list:
        fig, axes = plt.subplots(3, 4, figsize=(9, 12))

        fig.tight_layout(pad=0.5)
        flataxes = [ax for tmp in axes for ax in tmp]

        setsize = len(cls1_data)
        if setsize > 500: setsize = 500

        X = np.r_[np.concatenate(cls1_data, axis=0)[:setsize, :, :],
                  np.concatenate(cls2_data, axis=0)[:setsize, :, :]]

        if with_debiasing:
            X = with_debiasing.clean_data(X)

        for layer in range(0, 12):
            print('plotting layer %d' % (layer + 1))

            if projection == 'mds':
                mds = MDS(n_components=2)
                X_transformed = mds.fit_transform(X[:, layer, :].astype(
                    np.float64))
            if projection == 'pca':
                pca = PCA(n_components=2)
                X_transformed = pca.fit_transform(X[:, layer, :].astype(
                    np.float64))
            if projection == 'tsne':
                tsne = TSNE(n_components=2, verbose=1)
                X_transformed = tsne.fit_transform(X[:, layer, :].astype(
                    np.float64))

            colors = ['red'] * setsize + ['blue'] * setsize

            ax = flataxes[layer]
            ax.set_aspect('equal', adjustable='box')
            ax.scatter(X_transformed[:, 0], X_transformed[:, 1], s=2, c=colors)

            #ax.set_xlim((-20, 20))
            #ax.set_ylim((-20, 20))
            ax.set_title('Layer %d' % (layer + 1), fontsize=12)

            print('plotting done.')

    else:
        if not setsize:
            setsize = cls1_data.shape[0]

        colors = ['red'] * setsize + ['blue'] * setsize

        X = np.r_[cls1_data[:setsize, :], cls2_data[:setsize, :]]

        mds = MDS(n_components=2)
        X_transformed = mds.fit_transform(X)

        fig = plt.plot()
        plt.scatter(X_transformed[:, 0], X_transformed[:, 1], s=2, c=colors)

    if figname:
        plt.savefig(figname)
    else:
        plt.show()
Example #34

#Set df4 equal to a sample of 1000 default and 1000 non-default observations.
df2 = tsne_data[tsne_data.default == 0].sample(n = 1000)
df3 = tsne_data[tsne_data.default == 1].sample(n = 1000)
df4 = pd.concat([df2, df3], axis = 0)

#Scale features to improve the training ability of TSNE.
standard_scaler = StandardScaler()
df4_std = standard_scaler.fit_transform(df4)

#Set y equal to the target values.
y = df4.iloc[:, -1].values

tsne = TSNE(n_components=2, random_state=0)
x_test_2d = tsne.fit_transform(df4_std)

#Build the scatter plot with the two types of transactions.
color_map = {0:'red', 1:'blue'}
plt.figure()
for idx, cl in enumerate(np.unique(y)):
    plt.scatter(x = x_test_2d[y==cl,0], 
                y = x_test_2d[y==cl,1], 
                c = color_map[idx], 
                label = cl)
plt.xlabel('X in t-SNE')
plt.ylabel('Y in t-SNE')
plt.legend(loc='upper left')
plt.title('t-SNE visualization of test data')
plt.show()
        pBarLen = 20
        sys.stdout.write("|%s| - Training(%s/%s)-%s\r"%(progressBar((i//200)%pBarLen, pBarLen),i,steps,
            estimate_time(startTime, steps, i)))

        if i % logStep == 0:
            print("Loss: %s" % lossVal)
            sim = similarity.eval()
            for i in range(len(valid_set)):
                word = WORDS[valid_set[i]]
                top_k = 5
                nearest = (-sim[i,:]).argsort()[1:1+top_k]
                msg = "%s: "% word
                for k in range(top_k):
                    close_word = WORDS[nearest[k]]
                    if k > 0: msg += ", "#"\t"
                    msg += close_word #+ ": %.08f\n"%sim[i,k]
                print(msg)
            print("------------------------------------------")
            saveModel(sess, LOG_DIR+"model.ckpt")
            
            final_embeddings = normalized_embeddings.eval()
            writeToFile(final_embeddings, "savedEmbeddings/embeddings.pkl")
            # plotting using t-SNE
            # two_d_embeddings = tsne.fit_transform(final_embeddings)
            # plot(two_d_embeddings, WORDS)
        
    final_embeddings = normalized_embeddings.eval()
    two_d_embeddings = tsne.fit_transform(final_embeddings)
    plot(two_d_embeddings, WORDS)

    writeToFile(final_embeddings, "savedEmbeddings/embeddings.pkl")
Example #36
print(y.shape)
z_vec.flatten()
z_vec = z_vec[:, 0, :]
type(z_vec)
print(z_vec.shape)
b.flatten()
b = b[:, 0, :]
type(b)
print(b.shape)

from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)

# Convert the image to a numpy array

X_z_2d = tsne.fit_transform(z_vec)
print(X_z_2d.shape)
X_b_2d = tsne.fit_transform(b)
print(X_b_2d.shape)

# Plot z's
from matplotlib import pyplot as plt
target_ids = range(10)
plt.figure(figsize=(6, 5))
target_names = np.array([0, 1, 2,3,4,5,6,7,8,9])
print(target_names)
colors = 'k', 'b', 'y', 'r', 'g', 'm', 'c', 'orange', 'purple', 'brown'
for i, c, label in zip(target_ids, colors, target_names):
    plt.scatter(X_z_2d[y == i, 0], X_z_2d[y == i, 1], s=15, c=c, label=label)
plt.legend()
plt.show()
Example #37
    np.savetxt('{0}_c{1}_labels.tsv'.format(arguments.fout, k),
               (nmf.cluster_labels, nmf.remain_cell_inds),
               fmt='%u',
               delimiter='\t')

    # --------------------------------------------------
    # 3.4. T-SNE PLOT
    # --------------------------------------------------
    if arguments.tsne:
        model = TSNE(n_components=2,
                     random_state=0,
                     init='pca',
                     method='exact',
                     metric='euclidean',
                     perplexity=30)
        ret = model.fit_transform(nmf.pp_data.T)
        plt.title('{0} cluster (Euclidean)'.format(k))
        plt.scatter(ret[:, 0], ret[:, 1], 20, nmf.cluster_labels)
        plt.xticks([])
        plt.yticks([])

        plt.savefig('{0}_c{1}_tsne.png'.format(arguments.fout, k),
                    format='png',
                    bbox_inches=None,
                    pad_inches=0.1)
        # plt.show()

# --------------------------------------------------
# 6. SUMMARIZE RESULTS
# --------------------------------------------------
print('\n------------------------------ Summary:')
Example #38
    def process(self):
        # parse parameters
        input_words = self.parameters.get("words", "")
        if not input_words or not input_words.split(","):
            self.dataset.update_status(
                "No input words provided, cannot look for similar words.",
                is_final=True)
            self.dataset.finish(0)
            return

        input_words = input_words.split(",")

        try:
            threshold = float(self.parameters.get("threshold"))
        except ValueError:
            threshold = float(self.get_options()["threshold"]["default"])

        threshold = max(-1.0, min(1.0, threshold))
        num_words = convert_to_int(self.parameters.get("num-words"))
        overlay = self.parameters.get("overlay")
        reduction_method = self.parameters.get("method")
        all_words = self.parameters.get("all-words")

        # load model files and initialise
        self.dataset.update_status("Unpacking word embedding models")
        staging_area = self.unpack_archive_contents(self.source_file)
        common_vocab = None
        vector_size = None
        models = {}

        # find words that are common to all models
        self.dataset.update_status("Determining cross-model common vocabulary")
        for model_file in staging_area.glob("*.model"):
            if self.interrupted:
                shutil.rmtree(staging_area)
                raise ProcessorInterruptedException(
                    "Interrupted while processing word embedding models")

            model = KeyedVectors.load(str(model_file)).wv
            models[model_file.stem] = model
            if vector_size is None:
                vector_size = model.vector_size  # needed later for dimensionality reduction

            if common_vocab is None:
                common_vocab = set(model.vocab.keys())
            else:
                common_vocab &= set(model.vocab.keys())  # intersect

        # sort common vocabulary by combined frequency across all models
        # this should make filtering for common words a bit faster further down
        self.dataset.update_status("Sorting vocabulary")
        common_vocab = list(common_vocab)
        common_vocab.sort(key=lambda w: sum(
            [model.vocab[w].count for model in models.values()]),
                          reverse=True)

        # initial boundaries of 2D space (to be adjusted later based on t-sne
        # outcome)
        max_x = 0.0 - sys.float_info.max
        max_y = 0.0 - sys.float_info.max
        min_x = sys.float_info.max
        min_y = sys.float_info.max

        # for each model, find the words that we may want to plot - these are
        # the nearest neighbours for the given query words
        relevant_words = {}

        # the vectors need to be reduced all at once - but the vectors are
        # grouped by model. To solve this, keep one numpy array of vectors,
        # but also keep track of which indexes of this array belong to which
        # model, by storing the index of the first vector for a model
        vectors = numpy.empty((0, vector_size))
        vector_offsets = {}

        # now process each model
        for model_name, model in models.items():
            relevant_words[model_name] = set()  # query words and their nearest neighbours for this model
            self.dataset.update_status("Finding similar words in model '%s'" %
                                       model_name)

            for query in input_words:
                if query not in model.vocab:
                    self.dataset.update_status(
                        "Query '%s' was not found in model %s; cannot find nearest neighbours."
                        % (query, model_name),
                        is_final=True)
                    self.dataset.finish(0)
                    return

                if self.interrupted:
                    shutil.rmtree(staging_area)
                    raise ProcessorInterruptedException(
                        "Interrupted while finding similar words")

                # use a larger sample (topn) than required since some of the
                # nearest neighbours may not be in the common vocabulary and
                # will therefore need to be ignored
                context = set([
                    word[0] for word in model.most_similar(query, topn=1000)
                    if word[0] in common_vocab and word[1] >= threshold
                ][:num_words])

                relevant_words[model_name] |= {
                    query
                } | context  # always include query word

        # now do another loop to determine which words to plot for each model
        # this is either the same as relevant_words, or a superset which
        # combines all relevant words for all models
        plottable_words = {}
        last_model = max(relevant_words.keys())
        all_relevant_words = set().union(*relevant_words.values())

        for model_name, words in relevant_words.items():
            plottable_words[model_name] = []
            vector_offsets[model_name] = len(vectors)

            # determine which words to plot for this model. either the nearest
            # neighbours for this model, or all nearest neighbours found across
            # all models
            words_to_include = all_relevant_words if all_words else relevant_words[
                model_name]

            for word in words_to_include:
                if word in plottable_words[model_name] or (
                        not overlay and model_name != last_model
                        and word not in input_words):
                    # only plot each word once per model, or if 'overlay'
                    # is not set, only once overall (for the most recent
                    # model)
                    continue

                vector = models[model_name][word]
                plottable_words[model_name].append(word)
                vectors = numpy.append(vectors, [vector], axis=0)

        del models  # no longer needed

        # reduce the vectors of all words to be plotted for this model to
        # a two-dimensional coordinate with the previously initialised tsne
        # transformer. here the two-dimensional vectors are interpreted as
        # cartesian coordinates
        if reduction_method == "PCA":
            pca = PCA(n_components=2, random_state=0)
            vectors = pca.fit_transform(vectors)
        elif reduction_method == "t-SNE":
            # initialise t-sne transformer
            # parameters taken from Hamilton et al.
            # https://github.com/williamleif/histwords/blob/master/viz/common.py
            tsne = TSNE(n_components=2,
                        random_state=0,
                        learning_rate=150,
                        init="pca")
            vectors = tsne.fit_transform(vectors)
        elif reduction_method == "TruncatedSVD":
            # standard sklearn parameters made explicit
            svd = TruncatedSVD(n_components=2,
                               algorithm="randomized",
                               n_iter=5,
                               random_state=0)
            vectors = svd.fit_transform(vectors)
        else:
            shutil.rmtree(staging_area)
            self.dataset.update_status(
                "Invalid dimensionality reduction technique selected",
                is_final=True)
            self.dataset.finish(0)
            return

        # also keep track of the boundaries of our 2D space, so we can plot
        # them properly later
        for position in vectors:
            max_x = max(max_x, position[0])
            max_y = max(max_y, position[1])
            min_x = min(min_x, position[0])
            min_y = min(min_y, position[1])

        # now we know for each model which words should be plotted and at what
        # position
        # with this knowledge, we can normalize the positions, and start
        # plotting them in a graph

        # a palette generated with https://medialab.github.io/iwanthue/
        colours = [
            "#d58eff", "#cf9000", "#3391ff", "#a15700", "#911ca7", "#00ddcb",
            "#cc25a9", "#d5c776", "#6738a8", "#ff9470", "#47c2ff", "#a4122c",
            "#00b0ca", "#9a0f76", "#ff70c8", "#713c88"
        ]
        colour_index = 0

        # make sure all coordinates are positive
        max_x -= min_x
        max_y -= min_y

        # determine graph dimensions and proportions
        width = 1000  # arbitrary
        height = width * (max_y / max_x)  # retain proportions
        scale = width / max_x

        # margin around the plot to give room for labels and to look better
        margin = width * 0.1
        width += 2 * margin
        height += 2 * margin

        # normalize all known positions to fit within the graph
        vectors = [(margin + ((position[0] - min_x) * scale),
                    margin + ((position[1] - min_y) * scale))
                   for position in vectors]

        # now all positions are finalised, we can determine the "journey" of
        # each query - the sequence of positions in the graph it takes, so we
        # can draw lines from position to position later
        journeys = {}
        for query in input_words:
            journeys[query] = []
            for model_name, words in plottable_words.items():
                index = words.index(query)
                journeys[query].append(vectors[vector_offsets[model_name] +
                                               index])

        # font sizes proportional to width (which is static and thus predictable)
        fontsize_large = width / 50
        fontsize_normal = width / 75
        fontsize_small = width / 100

        # now we have the dimensions, the canvas can be instantiated
        model_type = self.source_dataset.parameters.get(
            "model-type", "word2vec")
        canvas = get_4cat_canvas(
            self.dataset.get_results_path(),
            width,
            height,
            header="%s nearest neighbours (fitting: %s) - '%s'" %
            (model_type, reduction_method, ",".join(input_words)),
            fontsize_normal=fontsize_normal,
            fontsize_large=fontsize_large,
            fontsize_small=fontsize_small)

        # use colour-coded backgrounds to distinguish the query words in the
        # graph, each model (= interval) with a separate colour
        for model_name in plottable_words:
            solid = Filter(id="solid-%s" % model_name)
            solid.feFlood(flood_color=colours[colour_index])
            solid.feComposite(in_="SourceGraphic")
            canvas.defs.add(solid)

            # this can get kind of confusing, but you shouldn't be using this
            # with more than 16 models anyway
            colour_index = 0 if colour_index >= len(
                colours) - 1 else colour_index + 1

        # now plot each word for each model
        self.dataset.update_status("Plotting graph")
        words = SVG(insert=(0, 0), size=(width, height))
        queries = SVG(insert=(0, 0), size=(width, height))
        colour_index = 0

        for model_name, labels in plottable_words.items():
            positions = vectors[
                vector_offsets[model_name]:vector_offsets[model_name] +
                len(labels)]

            label_index = 0
            for position in positions:
                word = labels[label_index]
                is_query = word in input_words
                label_index += 1

                filter = ("url(#solid-%s)" %
                          model_name) if is_query else "none"
                colour = "#FFF" if is_query else colours[colour_index]
                fontsize = fontsize_normal if is_query else fontsize_small

                if word in input_words:
                    word += " (" + model_name + ")"

                label_container = SVG(insert=position,
                                      size=(1, 1),
                                      overflow="visible")
                label_container.add(
                    Text(insert=("50%", "50%"),
                         text=word,
                         dominant_baseline="middle",
                         text_anchor="middle",
                         style="fill:%s;font-size:%ipx" % (colour, fontsize),
                         filter=filter))

                # we make sure the queries are always rendered on top by
                # putting them in a separate SVG container
                if is_query:
                    queries.add(label_container)
                else:
                    words.add(label_container)

            colour_index = 0 if colour_index >= len(
                colours) - 1 else colour_index + 1

        # plot a line between positions for query words
        lines = SVG(insert=(0, 0), size=(width, height))
        for query, journey in journeys.items():
            previous_position = None
            for position in journey:
                if previous_position is None:
                    previous_position = position
                    continue

                lines.add(
                    Line(start=previous_position,
                         end=position,
                         stroke="#CE1B28",
                         stroke_width=2))
                previous_position = position

        canvas.add(lines)
        canvas.add(words)
        canvas.add(queries)

        canvas.save(pretty=True)
        shutil.rmtree(staging_area)
        self.dataset.finish(len(journeys))
Example #39
w2v_model.wv.most_similar('人民', topn=5)
w2v_model.wv.most_similar('台灣', topn=5)

similar_words = {key_word:[similar_word[0] for similar_word in w2v_model.wv.most_similar(key_word, topn=6)]
                          for key_word in ['台灣','人民','國家','民主','中共','大陸','共匪','自由']}
similar_words

## Visualization

from sklearn.manifold import TSNE
all_words = sum([[key_word]+similar_words for key_word, similar_words in similar_words.items()], [])
all_words_vec = w2v_model.wv[all_words]

tsne = TSNE(n_components=2, random_state=0, n_iter=10000, perplexity=2)
np.set_printoptions(suppress=True)
T = tsne.fit_transform(all_words_vec)
labels=all_words

## Chinese Font Issues in Plotting

from matplotlib import rcParams
from matplotlib.font_manager import FontProperties
import matplotlib.pyplot as plt
# Fix the minus sign '-' being rendered as a box
# rcParams['axes.unicode_minus']=False
myfont = FontProperties(fname='/System/Library/Fonts/PingFang.ttc',
 size=12)
# plt.title('乘客等级分布', fontproperties=myfont)
# plt.ylabel('人数', fontproperties=myfont)
# plt.legend(('头等舱', '二等舱', '三等舱'), loc='best', prop=myfont)
###   ~ Data Viz ~  ###
#######################

df_train_both = pd.merge(df_train_genetic_scaled,
                         df_train_non_genetic_pre_processed,
                         left_index=True,
                         right_index=True,
                         how='outer')
df_train_both_with_target = pd.merge(df_train_both,
                                     df_solution,
                                     left_index=True,
                                     right_index=True,
                                     how='outer')

#Applying t-SNE to vizualize data
tsne = TSNE(n_components=3, verbose=1, perplexity=40, n_iter=300)
x = df_train_both.values
x_tsne = tsne.fit_transform(x)

df_tsne = pd.DataFrame()
df_tsne['x-tsne'] = x_tsne[:, 0]
df_tsne['y-tsne'] = x_tsne[:, 1]
df_tsne['label'] = df_solution.values


chart = ggplot( df_tsne, aes(x='x-tsne', y='y-tsne', color='label') ) \
        + geom_point(size=70,alpha=0.1) \
        + ggtitle("tSNE dimensions colored by cancer stage")

print(chart)
                   b_sight=3000,
                   b_maxl=1500,
                   n_jobs=20)

#print('fit gammas')
vlm.fit_gammas()

print('calculate velocity')
vlm.predict_U()
vlm.calculate_velocity()
vlm.calculate_shift(assumption="constant_velocity")
vlm.extrapolate_cell_at_t(delta_t=1.)

print('running tsne')
bh_tsne = TSNE(random_state=1)
vlm.ts = bh_tsne.fit_transform(vlm.pcs[:, :20])

print('projection of velocity onto embeddings')
vlm.estimate_transition_prob(hidim="Sx_sz",
                             embed="ts",
                             transform="sqrt",
                             psc=1,
                             n_neighbors=3500,
                             knn_random=True,
                             sampled_fraction=0.5)

print('calculate embedding shift')
vlm.calculate_embedding_shift(sigma_corr=0.05, expression_scaling=True)

print('calculate grid arrows')
vlm.calculate_grid_arrows(smooth=0.8, steps=(40, 40), n_neighbors=100)
Example #42
                log_str="Nearest to %s:" % valid_word
                for k in range(top_k):
                    close_word=reverse_dictionary[nearest[k]]
                    log_str="%s %s," %(log_str,close_word)
                print(log_str)
    final_embedding=normalized_embeddings.eval()
'''
Visualize the Word2Vec scatter plot and save it
'''
def plot_with_labels(low_dim_embs,labels,filename):
    assert low_dim_embs.shape[0]>=len(labels),"more labels than embedding"
    plt.figure(figsize=(18,18))
    for i,label in enumerate(labels):
        x,y=low_dim_embs[i,:]
        plt.scatter(x, y)
        plt.annotate(label,xy=(x,y),xytext=(5,2),textcoords='offset points',ha='right',va='bottom')
    plt.savefig(filename)

'''
t-SNE dimensionality reduction: reduce the original 128-dimensional embedding vectors to 2 dimensions
'''

tsne=TSNE(perplexity=30,n_components=2,init='pca',n_iter=5000)
plot_number=150
low_dim_embs=tsne.fit_transform(final_embedding[:plot_number,:])
labels=[reverse_dictionary[i] for i in range(plot_number)]
plot_with_labels(low_dim_embs, labels, './plot.png')

       

    
## Remove duplicates of fighters, only keep the most up to date entry
relevant_data.sort_values(by='date', ascending=False, inplace=True)
relevant_data = relevant_data.drop_duplicates(subset='fighter')
relevant_data.drop(columns='date', inplace=True)
relevant_data.drop(
    columns='Stance', inplace=True
)  # Removed Stance for dataset simplicity, consider adding in later
relevant_data.fillna(0, inplace=True)

x = relevant_data.iloc[:, 1:]
y = relevant_data.iloc[:, 0]
#y = y.astype('category').cat.codes

model = TSNE(learning_rate=100)

tsne_transformed = model.fit_transform(x)

xs = tsne_transformed[:, 0]
ys = tsne_transformed[:, 1]

## Plot the clusters
#plt.scatter(xs, ys, c = y)
#plt.ylim(-50, 50)
#plt.xlim(-50,50)
#plt.show()

fig, ax = plt.subplots(figsize=(8, 6))
ax.set_ylim(-50, 50)
ax.set_xlim(-50, 50)
ax.scatter(xs, ys)
Example #44
index = 0
for ine in inertia:
    b = (inertia[0] - ine) * (inertia[0] - ine)
    c = (ine - inertia[-1]) * (ine - inertia[-1])
    a = math.sqrt(b + c)
    if a < mininum:
        get = index
        mininum = a
    index += 1
kmeans = KMeans(n_clusters=get, random_state=0).fit(data_sparse)

model = AgglomerativeClustering(n_clusters=get)
gp = model.fit_predict(data_sparse)

tsne = TSNE()
visualisation = tsne.fit_transform(data_sparse)


def count(data, qtd):
    all_words = ' '.join([text for text in data])
    token_phrase = token_space.tokenize(all_words)
    frequency = nltk.FreqDist(token_phrase)
    df_frequency = pd.DataFrame({
        "Palavra": list(frequency.keys()),
        "Frequencia": list(frequency.values())
    })
    de_frequency = df_frequency.nlargest(columns="Frequencia", n=qtd)
    return (de_frequency)


list_city = []
Example #45
# -*- coding: utf-8 -*-
# Continues from k_means.py
from sklearn.manifold import TSNE

tsne = TSNE()
tsne.fit_transform(data_zs)  # perform dimensionality reduction
tsne = pd.DataFrame(tsne.embedding_, index=data_zs.index)  # convert to a DataFrame

import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # display the minus sign correctly

# Plot each cluster with a different colour and marker style
d = tsne[r[u'聚类类别'] == 0]
plt.plot(d[0], d[1], 'r.')
d = tsne[r[u'聚类类别'] == 1]
plt.plot(d[0], d[1], 'go')
d = tsne[r[u'聚类类别'] == 2]
plt.plot(d[0], d[1], 'b*')
plt.show()
Example #46
 list_avec = all_list_adocs_vec + list_awords_sim_vec
 
 
 #6
 #-------------- PCA ------------------#
 #pca = PCA(n_components=2)
 #dresult_X_all = pca.fit_transform(list_dvec)
 #aresult_X_all = pca.fit_transform(list_avec)
 
 
 #-------------- TSNE -----------------#
 #tsne = TSNE(n_components=2)
 #tsne = TSNE(n_components=2, random_state=None, verbose=1, perplexity=40, n_iter=300)
 #tsne = TSNE(n_components=2, init='pca', random_state=0, perplexity=40)
 tsne = TSNE(n_components=2, perplexity=40, metric='euclidean', init='pca', verbose=0, random_state=0)
 dresult_X_all = tsne.fit_transform(list_dvec)
 aresult_X_all = tsne.fit_transform(list_avec)
 
 
 #------------- Setting plot -----------------#
 word_vec = "Word Vectors"
 docs_vec = "Paragraph Vectors"
 list_genre_plot = ["Economic", "Education", "Entertainment", "Foreign", "IT", "Sports"]
 list_wovec_plot = [list_genre_plot[0]+' '+word_vec,list_genre_plot[1]+' '+word_vec,list_genre_plot[2]+' '+word_vec,list_genre_plot[3]+' '+word_vec,list_genre_plot[4]+' '+word_vec,list_genre_plot[5]+' '+word_vec]
 list_dovec_plot = [list_genre_plot[0]+' '+docs_vec,list_genre_plot[1]+' '+docs_vec,list_genre_plot[2]+' '+docs_vec,list_genre_plot[3]+' '+docs_vec,list_genre_plot[4]+' '+docs_vec,list_genre_plot[5]+' '+docs_vec]
 plt.rcParams['font.family'] = 'TH SarabunPSK'
 plt.rcParams['font.size'] = 14
 #with plt.style.context('dark_background'):
 
 #********---------------------------------------------------------********#
 
Beispiel #47
0
# extract output of the final embedding layer (before the softmax)
# in test mode, the 'learning_phase' flag should be set to 0 (i.e., dropout is disabled)
get_doc_embedding = K.function(
    [model.layers[0].input, K.learning_phase()], [model.layers[9].output])

n_plot = 1000
print('plotting embeddings of first', n_plot, 'documents')

doc_emb = get_doc_embedding([np.array(x_test[:n_plot]), 0])[0]

my_pca = PCA(n_components=10)
my_tsne = TSNE(n_components=2,
               perplexity=10)  #https://lvdmaaten.github.io/tsne/
doc_emb_pca = my_pca.fit_transform(doc_emb)
doc_emb_tsne = my_tsne.fit_transform(doc_emb_pca)

labels_plt = y_test[:n_plot]
my_colors = ['blue', 'red']

fig, ax = plt.subplots()

for label in list(set(labels_plt)):
    idxs = [idx for idx, elt in enumerate(labels_plt) if elt == label]
    ax.scatter(doc_emb_tsne[idxs, 0],
               doc_emb_tsne[idxs, 1],
               c=my_colors[label],
               label=str(label),
               alpha=0.7,
               s=40)
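
# Hedged alternative: with tf.keras the same embedding extraction can be done
# with a sub-model instead of K.function. Assumes `model` is a tf.keras model
# and that layer index 9 is still the final embedding layer, as above.
from tensorflow import keras

embedding_model = keras.Model(inputs=model.input, outputs=model.layers[9].output)
doc_emb_alt = embedding_model(np.array(x_test[:n_plot]), training=False).numpy()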
Beispiel #48
0
    model = word2vec.load(args.model_path)

    vocabs = []
    vecs = []
    for vocab in model.vocab:
        vocabs.append(vocab)
        vecs.append(model[vocab])
    vecs = np.array(vecs)[:args.plot_num]
    vocabs = vocabs[:args.plot_num]
    '''
    Dimensionality Reduction
    '''
    # from sklearn.decomposition import PCA

    tsne = TSNE(n_components=2)
    reduced = tsne.fit_transform(vecs)
    '''
    Plotting
    '''

    # filtering
    use_tags = set(['JJ', 'NNP', 'NN', 'NNS'])
    puncts = ["'", '.', ':', ";", ',', "?", "!", u"’"]

    plt.figure()
    texts = []
    for i, label in enumerate(vocabs):
        pos = nltk.pos_tag([label])
        if (label[0].isupper() and len(label) > 1 and pos[0][1] in use_tags
                and all(c not in label for c in puncts)):
            x, y = reduced[i, :]
Beispiel #49
0
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')

    plt.savefig(filename)


try:
    # pylint: disable=g-import-not-at-top
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt

    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    plot_only = 500
    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
    labels = [reverse_dictionary[i] for i in xrange(plot_only)]
    plot_with_labels(low_dim_embs, labels)

except ImportError:
    print('Please install sklearn, matplotlib, and scipy to show embeddings.')
# Threshold filters out unconfident topics

threshold = 0.5
_idx = np.amax(refac_matrix, axis=1) > threshold  # boolean mask of docs above the threshold
refac_matrix = refac_matrix[_idx]

# Dimensionality reduction gives us a better plot

from bokeh.plotting import figure, show
from bokeh.models import HoverTool, CustomJS, ColumnDataSource, Slider
from bokeh.layouts import column
from bokeh.palettes import all_palettes
from sklearn.manifold import TSNE

tsne = TSNE(random_state=2017, perplexity=30) # 5 30 50
tsne_embedding = tsne.fit_transform(refac_matrix)
tsne_embedding = pd.DataFrame(tsne_embedding, columns =['x','y'])
tsne_embedding['hue'] = refac_matrix.argmax(axis=1)

dataset['Date'] = pd.to_datetime(dataset.Date)


'''t-SNE scatter plot made with bokeh.
   You can move your mouse over a point
   to see specific words clustered in 
   their respective topics'''

source = ColumnDataSource(
        data=dict(
            x = tsne_embedding.x,
            y = tsne_embedding.y,
for k, col in zip(tstInd['clust'].unique(), colors):
    tmp = np.vstack(np.asarray(tstInd[tstInd['clust'] == k]['PCA']))
    if len(tmp) > 0:
        plt.scatter(tmp[:, 0], tmp[:, 1], c=col)

# Draw indications of the same drug in the same color

tstInd = pn.read_csv("/home/galiasn/DATA/MechanismBasedRepurposing/full.csv")
tstInd = tstInd[tstInd['status'] == 'Approved']
tstInd['w2v'] = tstInd['ind_id'].apply(getIndVector)

pca_model = PCA(n_components=30)
tsne_model = TSNE(n_components=2, perplexity=10, n_iter=1000)
XOne = pca_model.fit_transform(list(tstInd['w2v']))
tsne_pca = tsne_model.fit_transform(XOne)
tstInd['PCA'] = list(tsne_pca)
tstInd['sum'] = len(tstInd) * [1]

tstIndgb = tstInd.groupby('drug_id', as_index=False).sum()
tstIndgb = tstIndgb[tstIndgb['sum'] > 1]
tstIndgb = tstIndgb[(tstIndgb['sum'] < 10)]

drugIds = tstIndgb['drug_id'].unique()
drugSampl = random.sample(set(drugIds), 400)
for d in drugSampl:
    tmp = np.vstack(np.asarray(tstInd[tstInd['drug_id'] == d]['PCA']))
    if len(tmp) > 0:
        plt.scatter(tmp[:, 0], tmp[:, 1])
########################
#Mol2Vec
Beispiel #52
0
for i in range(10):
    idx_by_label = np.array(data.index[data['label'] == i].tolist())
    idx_by_label_sample = np.random.choice(idx_by_label,
                                           size=100,
                                           replace=False)
    if i == 0:
        idx_all = idx_by_label_sample
    else:
        idx_all = np.append(idx_all, idx_by_label_sample)

data_sample = data.iloc[np.sort(idx_all)]
""" Divide data to features and labels """
data_features = data_sample.iloc[:, 1:]
data_labels = data_sample.iloc[:, 0]

points = tsne.fit_transform(data_features)
points_3d = tsne_3d.fit_transform(data_features)

"plotting function 2d space"


def plot_embedding(x, y, title):
    plt.figure()
    plt.scatter(x[:, 0], x[:, 1], color=plt.cm.Set1(y / 10))

    plt.xticks([])
    plt.yticks([])
    plt.title(title)
    plt.show()
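
# Hedged usage sketch: call the helper on the 2-D t-SNE points computed above.
plot_embedding(points, data_labels.values, "t-SNE embedding of the sampled digits")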

plt.figure(figsize=(16, 2))
plt.plot(list(clust_dict.keys()), list(clust_dict.values()), '+-')
plt.grid()
plt.title('Silhouette average vs number of clusters (using K-means)')
plt.show()

# In[15]:

from sklearn.manifold import TSNE

NUM_CLUSTERS = 8

# loop for all assets and create plot of the system
df_labels = pd.DataFrame(index=df.index, columns=assets)
for a in assets:
    dfa = df[[c for c in df.columns if a in c]]
    X_clust = dfa.interpolate(axis=0).values
    clf = KMeans(n_clusters=NUM_CLUSTERS)
    df_labels[a] = clf.fit_predict(X_clust)
    model = TSNE(n_components=2, random_state=0)
    Y_clust = model.fit_transform(X_clust)
    plt.scatter(x=Y_clust.transpose()[0],
                y=Y_clust.transpose()[1],
                c=df_labels[a],
                cmap='jet')
    plt.title("T-SNE representation of {} clusters for asset {}".format(
        NUM_CLUSTERS, a))
    plt.show()

# The analysis does not show that the state of each asset can be described in just two dimensions that explain the full system behavior.
Beispiel #54
0
def getTSNE(data):
    ts = TSNE(n_components=2)
    ts_data = ts.fit_transform(data)
    return ts_data
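
# Hedged usage sketch of getTSNE() with placeholder random data; any
# (n_samples, n_features) array would work the same way.
import numpy as np
import matplotlib.pyplot as plt

demo = np.random.rand(200, 50)
demo_2d = getTSNE(demo)
plt.scatter(demo_2d[:, 0], demo_2d[:, 1], s=10)
plt.show()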
Beispiel #55
0
df['pca-one'] = pca_result[:, 0]
df['pca-two'] = pca_result[:, 1]

from plotnine import *

chart = (ggplot(df, aes(x='pca-one', y='pca-two', color='label')) +
         geom_point(size=2, alpha=1) +
         ggtitle("First and Second Principal Components  by Label"))

chart.save('PCA.png')
print("Image saved as PCA.png in current dir")

import time

from sklearn.manifold import TSNE

time_start = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=500)
tsne_results = tsne.fit_transform(df[feat_cols].values)

df_tsne = df.copy()
df_tsne['x-tsne'] = tsne_results[:, 0]
df_tsne['y-tsne'] = tsne_results[:, 1]

chart = ggplot(df_tsne, aes(
    x='x-tsne', y='y-tsne', color='label')) + geom_point(
        size=2, alpha=1) + ggtitle("t-SNE dimensions colored by Label")
chart.save('tsne.png')

print("Plot has been saved in the current dir as tsne.png")
Beispiel #56
0
torch.manual_seed(42)

for epoch in range(num_epochs):
    train_images = overall_train_data
    train_images = train_images.requires_grad_()
    # Clear gradients w.r.t. parameters
    optimizer4_1.zero_grad()
    # Forward pass to get output/logits
    train_outputs = model4(train_images)

    # Get predictions from the maximum value
    _, predicted_labels = torch.max(train_outputs.data, 1)

    f = train_outputs.detach().numpy()
    tsne = TSNE(n_components=2, verbose=1)
    t1 = tsne.fit_transform(f)
    fig, ax = plt.subplots()
    groups = predicted_labels.numpy()
    for g in np.unique(groups):
        i = np.where(groups == g)
        ax.scatter(t1[i, 0], t1[i, 1], label=g)
    ax.legend([
        'airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog',
        'horse', 'ship', 'truck'
    ])
    plt.show()

    train_labels = overall_train_labels

    # Calculate Loss: softmax --> cross entropy loss
    train_loss = criterion(train_outputs, train_labels)
Beispiel #57
0
                             'Disciplinary failure', 'Education',
                             'Social drinker', 'Social smoker'
                         ])

    # Drop the categorical columns before standardization (axis=1 so columns, not rows, are removed)
    cdf = cdf.drop(labels=[
        'Reason for absence', 'Month of absence', 'Day of the week', 'Seasons',
        'Disciplinary failure', 'Education', 'Social drinker', 'Social smoker'
    ], axis=1).astype(np.float64)

    # Standardize the dataset
    ss = StandardScaler(with_std=False)
    sdf = ss.fit_transform(cdf)

    # Perform the TSNE non-linear dimensionality reduction
    tsne = TSNE(n_components=2, perplexity=15, random_state=1000)
    data_tsne = tsne.fit_transform(sdf)

    df_tsne = pd.DataFrame(data_tsne, columns=['x', 'y'], index=cdf.index)
    dff = pd.concat([cdf, df_tsne], axis=1)

    # Show the dataset
    sns.set()

    fig, ax = plt.subplots(figsize=(18, 11))

    with sns.plotting_context("notebook", font_scale=1.5):
        sns.scatterplot(x='x',
                        y='y',
                        size='Age',
                        sizes=(30, 400),
                        palette=sns.color_palette("husl", 2),
Beispiel #58
0
        optimizer.zero_grad()  # clear gradients for this training step
        loss.backward()  # back propagation, compute gradients
        optimizer.step()  # apply gradients

        if step % BATCH_SIZE == 0:
            test_output, last_layer = cnn(test_x)
            pred_y = torch.max(test_output, 1)[1].data.numpy()
            accuracy = float(
                (pred_y == test_y.data.numpy()).astype(int).sum()) / float(
                    test_y.size(0))
            print('Epoch: ', epoch, '| train loss: %.4f' % loss.data.numpy(),
                  '| test accuracy: %.2f' % accuracy)
            if HAS_SK:
                # Visualization of trained flatten layer (T-SNE)
                tsne = TSNE(perplexity=30,
                            n_components=2,
                            init='pca',
                            n_iter=5000)
                plot_only = 500
                low_dim_embs = tsne.fit_transform(
                    last_layer.data.numpy()[:plot_only, :])
                labels = test_y.numpy()[:plot_only]
                plot_with_labels(low_dim_embs, labels)

plt.ioff()

# print 10 predictions from test data
test_output, _ = cnn(test_x[:10])
pred_y = torch.max(test_output, 1)[1].data.numpy()
print(pred_y, 'prediction number')
print(test_y[:10].numpy(), 'real number')
Beispiel #59
0
            classes = classes.numpy()

            #outputs = outputs.cpu()
            #X = X.to(device)

            X = np.vstack((X, outputs))
            Y_class = np.concatenate((Y_class, classes))
            Y_domain = np.concatenate((Y_domain, np.array([1 for _ in range(inputs.size(0))], dtype=np.int16)))
            
            print("Target stpes: [{}/{}]".format(i, steps))
        
        print(X.shape)
        print(Y_class.shape)
        print(Y_domain.shape)

    X_tsne = tsne.fit_transform(X)
    print("Org data dimension is {}. Embedded data dimension is {}".format(X.shape[-1], X_tsne.shape[-1]))

    x_min, x_max = X_tsne.min(0), X_tsne.max(0)
    X_norm = (X_tsne - x_min) / (x_max - x_min)


    color = ['r', 'g', 'b', 'k', 'gold', 'm', 'c', 'orange', 'cyan', 'pink']
    class_color = [color[label] for label in Y_class]
    domain_color = [color[label] for label in Y_domain]


    plt.figure(1, figsize=(8, 8))
    plt.scatter(X_norm[:, 0], X_norm[:, 1], c=class_color, s=1)
    plt.savefig("./dann{}_{}_class.png".format(source, target))
    plt.close("all")
Beispiel #60
0
X = model[model.wv.vocab]

label = list(reversed(sorted(l_k, key=f_key)))

X_sort = X
for j in range(0, len(label)):
    X_sort[j, :] = model[label[j]]

# t-SNE visualization

plot_i = 0
plot_f = 500

tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X_sort[plot_i:plot_f, :])


def plot_with_labels(low_dim_embs, labels):
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')