def main():
    Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()
    dbn = DBN([1000, 750, 500], UnsupervisedModel=AutoEncoder)
    # dbn = DBN([1000, 750, 500, 10])
    output = dbn.fit(Xtrain, pretrain_epochs=2)
    print "output.shape", output.shape

    # sample before using t-SNE because it requires lots of RAM
    sample_size = 600
    tsne = TSNE()
    reduced = tsne.fit_transform(output[:sample_size])
    plt.scatter(reduced[:,0], reduced[:,1], s=100, c=Ytrain[:sample_size], alpha=0.5)
    plt.title("t-SNE visualization")
    plt.show()

    # t-SNE on raw data
    reduced = tsne.fit_transform(Xtrain[:sample_size])
    plt.scatter(reduced[:,0], reduced[:,1], s=100, c=Ytrain[:sample_size], alpha=0.5)
    plt.title("t-SNE visualization")
    plt.show()

    pca = PCA()
    reduced = pca.fit_transform(output)
    plt.scatter(reduced[:,0], reduced[:,1], s=100, c=Ytrain, alpha=0.5)
    plt.title("PCA visualization")
    plt.show()
Example 2
def vizualize_clusters(X, y, py, hist=None):
    """ Using T-SNE to visualize the site clusters.
        Plot and save the scatter (and the histogram).
    """
    model = TSNE(n_components=2, random_state=0)

    fig = model.fit_transform(X, y)
    fig1 = model.fit_transform(X, py)

    pyplot.figure(figsize=(16, 8))
    pyplot.subplot(121)

    classes = list(set(y))
    for c, color in zip(classes, matplotlib.colors.cnames.items()):  # assumes "import matplotlib"; dict.iteritems() is Python 2 only
        indices = [i for i, p in enumerate(y) if p == c]
        pyplot.scatter(fig[indices, 0], fig[indices, 1], marker="o", c=color[0])

    pyplot.subplot(122)

    clusters = list(set(py))
    for c, color in zip(clusters, matplotlib.colors.cnames.items()):
        indices = [i for i, p in enumerate(py) if p == c]
        pyplot.scatter(fig1[indices, 0], fig1[indices, 1], marker="o", c=color[0])

    # pyplot.show()
    pyplot.savefig("clusters" + "_scatter.png")

    if hist is not None:
        pyplot.figure(figsize=(4, 4))
        pyplot.xticks(clusters)

        pyplot.bar(clusters, hist, align="center")
        # pyplot.show()
        pyplot.savefig("clusters" + "_hist.png")
Example 3
def tsne(similarity, euclid=False, perplexity=30):
    if euclid:
        model = TSNE(learning_rate=100, perplexity=perplexity, n_iter=200000)
        result = model.fit_transform(similarity)
    else:
        model = TSNE(learning_rate=100, n_iter=100000, init='random', metric='precomputed')
        result = model.fit_transform(1 - similarity)

    return result.T
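# A minimal usage sketch for the helper above (illustrative data, not from the
# original source): build a cosine-similarity matrix and use the precomputed
# branch, which converts it to distances as 1 - similarity.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

vectors = np.random.RandomState(0).rand(60, 16)
similarity = cosine_similarity(vectors)   # diagonal is 1, so 1 - similarity has a zero diagonal
xs, ys = tsne(similarity)                 # the helper returns the transposed 2-D embedding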
Example 4
def MyTSNE(train,test):
    #MyTSNE(train.iloc[:100,:],test.iloc[:20,:])
    model = TSNE(n_components=2, random_state=0)
    a = np.vstack(
            [train.values,
            test.values]
            )
    return model.fit_transform(a)  # return the embedding instead of discarding it
def programmer_5(data_zs, r):
    # reduce the data to two dimensions with t-SNE
    tsne = TSNE()
    tsne.fit_transform(data_zs)
    tsne = pd.DataFrame(tsne.embedding_, index=data_zs.index)

    # plot each cluster with a different colour and marker style
    d = tsne[r[u'聚类类别'] == 0]
    plt.plot(d[0], d[1], 'r.')
    d = tsne[r[u'聚类类别'] == 1]
    plt.plot(d[0], d[1], 'go')
    d = tsne[r[u'聚类类别'] == 2]
    plt.plot(d[0], d[1], 'b*')
    plt.show()
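# data_zs and r come from the preceding k-means step, which is not shown here.
# A minimal illustrative setup (an assumption, not the original pipeline) that
# exercises programmer_5; it relies on the same module-level imports the
# function uses (TSNE, pandas as pd, matplotlib.pyplot as plt):
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

data = pd.DataFrame(np.random.RandomState(0).rand(200, 4))
data_zs = (data - data.mean()) / data.std()                 # z-score standardisation
km = KMeans(n_clusters=3, random_state=0).fit(data_zs)
r = pd.DataFrame({u'聚类类别': km.labels_}, index=data_zs.index)  # same cluster-label column name as above
programmer_5(data_zs, r)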
Example 6
def learn_embedding(precompute_metric=False, use_saved=False):
    base = 'datasets/dspace_topics'
    new_base = 'datasets/dspace_embeddings'
    # Delete previous saved embedding
    if os.path.exists(new_base):
        shutil.rmtree(new_base)
    os.makedirs(new_base)

    print('Embedding: Extracting topics')
    # choose a random subset of documents
    filename_vec = os.listdir(base)
    subsample = 5000
    filename_vec = np.random.choice(filename_vec, subsample)
    topic_vec = []
    for filename in tqdm(filename_vec):
        path = os.path.join(base, filename)
        with open(path) as f:
            d = json.load(f)
            topics = d['topics']
            topic_vec.append(topics)

    print('Embedding: Computing pairwise distances')
    if precompute_metric:
        if use_saved:
            with open('metric.npy', 'rb') as f:  # .npy files are binary
                metric = np.load(f)
        else:
            metric = pairwise_distances(np.array(topic_vec), metric=KL, n_jobs=-1)
            with open('metric.npy', 'wb') as f:
                np.save(f, metric)

        print('Embedding: Learning embedding')
        tsne = TSNE(n_iter=1000, verbose=10, metric='precomputed')
        y = tsne.fit_transform(metric)
    else:
        print('Embedding: Learning embedding')
        tsne = TSNE(n_iter=1000, verbose=10)
        y = tsne.fit_transform(topic_vec)

    print('Embedding: Saving embedding')
    for (index, filename) in tqdm(enumerate(filename_vec), total=len(filename_vec)):
        path = os.path.join(base, filename)
        with open(path, 'r') as f:
            d = json.load(f)
            d['embedding'] = list(y[index])
            new_path = os.path.join(new_base, filename)
            with open(new_path, 'w') as new_f:
                json.dump(d, new_f)
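# The KL callable passed to pairwise_distances above is not defined in this
# snippet; a plausible stand-in (an assumption, not the original code) is a
# smoothed, symmetrised KL divergence between two topic distributions:
import numpy as np

def KL(p, q, eps=1e-12):
    p = np.asarray(p, dtype=np.float64) + eps
    q = np.asarray(q, dtype=np.float64) + eps
    p /= p.sum()
    q /= q.sum()
    return 0.5 * (np.sum(p * np.log(p / q)) + np.sum(q * np.log(q / p)))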
Example 7
def add_tsne_features(x_train, x_test):
    print('add_tsne_features <<')

    x_train_data = x_train.data_
    x_test_data = x_test.data_

    x = np.vstack((x_train_data, x_test_data))

    print('applying pca...')
    pca = PCA(n_components=25)
    x_pca = pca.fit_transform(x)

    print('applying t-SNE...')
    tsne_model = TSNE(n_components=2, random_state=0)
    x_tsne = tsne_model.fit_transform(x_pca)
    x_train_data = np.hstack((x_train_data, x_tsne[:x_train_data.shape[0], :]))
    x_test_data = np.hstack((x_test_data, x_tsne[-x_test_data.shape[0]:, :]))

    assert(x_train.columns_ == x_test.columns_)
    columns = x_train.columns_ + ['tsne_1', 'tsne_2']
    x_train = DataSet(x_train.ids_, columns, x_train_data)
    x_test = DataSet(x_test.ids_, columns, x_test_data)

    print('add_tsne_features >>')
    return x_train, x_test
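# DataSet is not defined in this snippet; a minimal stand-in (my assumption)
# exposing the attributes used above (ids_, columns_, data_) is enough to run
# the function:
class DataSet:
    def __init__(self, ids_, columns_, data_):
        self.ids_ = ids_
        self.columns_ = columns_
        self.data_ = data_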
Example 8
    def sendTSNE(self, people):
        d = self.getData()
        if d is None:
            return
        else:
            (X, y) = d

        X_pca = PCA(n_components=50).fit_transform(X, X)
        tsne = TSNE(n_components=2, init='random', random_state=0)
        X_r = tsne.fit_transform(X_pca)

        yVals = list(np.unique(y))
        colors = cm.rainbow(np.linspace(0, 1, len(yVals)))

        # print(yVals)

        plt.figure()
        for c, i in zip(colors, yVals):
            name = "Unknown" if i == -1 else people[i]
            plt.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=name)
            plt.legend()

        imgdata = io.BytesIO()  # Python 3: io.BytesIO replaces StringIO.StringIO for binary image data
        plt.savefig(imgdata, format='png')
        imgdata.seek(0)

        content = 'data:image/png;base64,' + \
                  urllib.parse.quote(base64.b64encode(imgdata.getvalue()))
        msg = {
            "type": "TSNE_DATA",
            "content": content
        }
        self.sendMessage(json.dumps(msg))
Example 9
def display_closestwords_tsnescatterplot(model, word):
    arr = np.empty((0,300), dtype='f')
    word_labels = [word]

    # get close words
    close_words = model.similar_by_word(word)
    
    # add the vector for each of the closest words to the array
    arr = np.append(arr, np.array([model[word]]), axis=0)
    for wrd_score in close_words:
        wrd_vector = model[wrd_score[0]]
        word_labels.append(wrd_score[0])
        arr = np.append(arr, np.array([wrd_vector]), axis=0)
        
    # find tsne coords for 2 dimensions
    tsne = TSNE(n_components=2, random_state=0, perplexity=5)  # only ~11 points here; perplexity must stay below n_samples
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)

    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    # display scatter plot
    plt.scatter(x_coords, y_coords)

    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    plt.xlim(x_coords.min()-0.00005, x_coords.max()+0.00005)
    plt.ylim(y_coords.min()-0.00005, y_coords.max()+0.00005)
    plt.show()
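# Hedged usage sketch: the function expects 300-dimensional vectors, e.g. a
# gensim KeyedVectors model such as the GoogleNews word2vec binary (the path
# below is illustrative). It also assumes numpy, matplotlib.pyplot and TSNE
# are imported at module level, as in the snippet's source.
from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
display_closestwords_tsnescatterplot(wv, 'queen')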
Example 10
def make_tsne_plot(model, rel_wds, plot_lims, title):

    dim = 30
    X, keys = make_data_matrix(model)

    # first we actually do PCA to reduce the
    # dimensionality to make tSNE easier to calculate
    X_std = StandardScaler().fit_transform(X)
    sklearn_pca = PCA(n_components=dim)
    X = sklearn_pca.fit_transform(X_std)

    # do downsample
    k = 5000
    sample = []
    important_words = []
    r_wds = [word[0] for word in rel_wds]
    for i, key in enumerate(keys):
        if key in r_wds:
            sample.append(i)
    sample = np.concatenate((np.array(sample),
                np.random.choice(len(keys), k-10, replace = False),
             ))
    X = X[sample,:]
    keys = [keys[i] for i in sample]



    # Do tSNE
    tsne = TSNE(n_components=2, random_state=0, metric="cosine")
    X_transf = tsne.fit_transform(X)

    k_means = KMeans(n_clusters=8)
    labels = k_means.fit_predict(X_transf)

    scatter_plot(X_transf[:,0], X_transf[:,1],  rel_wds, labels, title, keys, plot_lims)
def plot_features(subject, data_path, model_path, test_labels, dataset='test'):
    with open(model_path + '/' + subject + '.pickle', 'rb') as f:
        state_dict = cPickle.load(f)
    cnn = ConvNet(state_dict['params'])
    cnn.set_weights(state_dict['weights'])
    scalers = state_dict['scalers']

    if dataset == 'test':
        d = load_test_data(data_path, subject)
        x = d['x']
        y = test_labels['preictal']
    elif dataset == 'train':
        d = load_train_data(data_path, subject)
        x, y = d['x'], d['y']
    else:
        raise ValueError('dataset')

    x, _ = scale_across_time(x, x_test=None, scalers=scalers) if state_dict['params']['scale_time'] \
        else scale_across_features(x, x_test=None, scalers=scalers)

    cnn.batch_size.set_value(x.shape[0])
    get_features = theano.function([cnn.x, Param(cnn.training_mode, default=0)], cnn.feature_extractor.output,
                                 allow_input_downcast=True)

    logits_test = get_features(x)
    model = TSNE(n_components=2, random_state=0)
    z = model.fit_transform(np.float64(logits_test))
    plt.scatter(z[:, 0], z[:, 1], s=60, c=y)
    plt.show()
Example 12
def plot_data(data, has_label=True):
	import numpy as np
	import pandas as pd
	import seaborn as sns
	import matplotlib.pyplot as plt
	from sklearn.manifold import TSNE
	from sklearn.decomposition import PCA

	if not has_label:
		data = data.copy()
		data['label'] = np.zeros(len(data))

	LIMIT = 4000
	if data.shape[0] > LIMIT:
		dt = data.sample(n=LIMIT, replace=False)
		X = dt.iloc[:,:-1]
		labels = dt.iloc[:,-1]
	else:
		X = data.iloc[:,:-1]
		labels = data.iloc[:,-1]

	tsne_model = TSNE(n_components=2, random_state=0)
	np.set_printoptions(suppress=True)
	points1 = tsne_model.fit_transform(X)
	df1 = pd.DataFrame(data=np.column_stack([points1,labels]), columns=["x","y","class"])
	sns.lmplot(data=df1, x="x", y="y", hue='class', fit_reg=False, palette=sns.color_palette('colorblind'))
	plt.title('t-SNE')

	pca = PCA(n_components=2)
	pca.fit(X)
	points2 = pca.transform(X)
	df2 = pd.DataFrame(data=np.column_stack([points2,labels]), columns=["x","y","class"])
	sns.lmplot(data=df2, x="x", y="y", hue='class', fit_reg=False, palette=sns.color_palette('colorblind'))
	plt.title('PCA')
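# Hedged usage sketch (toy data, not from the source): the last column must be
# the label, as plot_data expects.
import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['label'] = iris.target
plot_data(df, has_label=True)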
Example 13
    def plot_mfi(self, outputfile='embeddings.pdf', nb_clusters=8, weights='NA'):
        # collect embeddings for mfi:
        X = np.asarray([self.w2v_model[w] for w in self.mfi \
                            if w in self.w2v_model], dtype='float32')
        # dimension reduction:
        tsne = TSNE(n_components=2)
        coor = tsne.fit_transform(X) # unsparsify

        plt.clf()
        sns.set_style('dark')
        plt.rcParams['axes.linewidth'] = 0.4
        fig, ax1 = plt.subplots()

        labels = self.mfi
        # first plot slices:
        x1, x2 = coor[:,0], coor[:,1]
        ax1.scatter(x1, x2, 100, edgecolors='none', facecolors='none')
        # clustering on top (add some colouring):
        clustering = AgglomerativeClustering(linkage='ward',
                            affinity='euclidean', n_clusters=nb_clusters)
        clustering.fit(coor)
        # add names:
        for x, y, name, cluster_label in zip(x1, x2, labels, clustering.labels_):
            ax1.text(x, y, name, ha='center', va="center",
                     color=plt.cm.nipy_spectral(cluster_label / 10.),
                     fontdict={'family': 'Arial', 'size': 8})
        # control aesthetics:
        ax1.set_xlabel('')
        ax1.set_ylabel('')
        ax1.set_xticklabels([])
        ax1.set_xticks([])
        ax1.set_yticklabels([])
        ax1.set_yticks([])
        plt.savefig(outputfile, bbox_inches=0)
Example 14
def perform_AE(X, dim=2, tsne=False):
    y = np.zeros(shape=X.shape[0], dtype=int)
    
    if tsne:
        hidden_layers = [X.shape[1], 500, 100, 32]
        encoder_weights, decoder_weights = pretrain(X, hidden_layers)
        X_32d = ae(X, encoder_weights, decoder_weights, hidden_layers)

        ae_tsne = TSNE(n_components=dim, learning_rate=800, verbose=1)
        X_2d = ae_tsne.fit_transform(X_32d)

        method = 'ae_tsne_scaled'
    ### END - if tsne

    else:
        hidden_layers = [X.shape[1], 500, 100, 20, dim]
        encoder_weights, decoder_weights = pretrain(X, hidden_layers)
        X_2d = ae(X, encoder_weights, decoder_weights, hidden_layers)
        
        method = 'ae_scaled'
    ### END - else

    print('***** ' + method + ' *****')
    cluster(X_2d, method)
    np.save("{0}_{1}_X_2d".format(species, method), X_2d)
Example 15
def plot_phonemes(path):
    phoneme_embeddings = dict()
    for line in codecs.open(path,"r"):
        line = line.split(",")
        key= line[0][1:-1]
        emb = line[1:]
        emb[-1] = emb[-1][:-1]
        emb = np.array([float(e) for e in emb])
        phoneme_embeddings[key] = emb
    
    phoneme_embeddings = DataFrame(phoneme_embeddings,columns=phoneme_embeddings.keys())
    print(phoneme_embeddings.columns)
    
    m = TSNE()
    phoneme_embeddings_tsne = m.fit_transform(phoneme_embeddings.transpose())
    print(len(phoneme_embeddings_tsne))
    for p,emb in zip(phoneme_embeddings.columns, phoneme_embeddings_tsne):
        c = "black"
        if regex.search("^[aeiou3E][*]?$", p):
            c = "red"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*w~$", p):
            c = "blue"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*y~$", p):
            c = "yellow"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*h~$", p):
            c = "brown"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*\"$", p):
            c = "green"
            plt.annotate(p,(emb[0],emb[1]),color=c)
Example 16
def t_sne_view(norm_table, subj_cond, cohorts, image_type):

    # t-SNE analysis: Use stochastic neighbor embedding to reduce dimensionality of
    # data set to two dimensions in a non-linear, distance dependent fashion

    # Perform PCA data reduction if dimensionality of feature space is large:
    if len(norm_table.columns) > 12:
        pca = PCA(n_components = 12)
        pca.fit(norm_table.values)

        raw_data = pca.transform(norm_table.values)
    else:
        raw_data = norm_table.values
 
    # Transform data into a two-dimensional embedded space:
    tsne = TSNE(n_components = 2, perplexity = 40.0, early_exaggeration= 2.0, 
        learning_rate = 100.0, init = 'pca')

    tsne_data = tsne.fit_transform(raw_data)

    # Prepare for normalization and view:
    cols = ['t-SNE', 'Cluster Visualization']
    tsne_table = pd.DataFrame(tsne_data, index = norm_table.index, columns = cols)
           
    # The output is no longer centered or normalized, so shift & scale it before display:
    tsne_avg = ppmi.data_stats(tsne_table, subj_cond, cohorts)
    tsne_norm_table = ppmi.normalize_table(tsne_table, tsne_avg)       
    
    # Send out to graphics rendering engine:

    if (image_type == 'Gauss'):
        return scg.scatter_gauss(tsne_norm_table[cols[0]], tsne_norm_table[cols[1]], subj_cond)
    elif (image_type == 'Scatter'):
        return scg.scatter_plain(tsne_norm_table[cols[0]], tsne_norm_table[cols[1]], subj_cond)
def main():
    embedding = WordEmbedding(embeddingpath(default_embeddingconfig))


    for old, new in spelling_changes:
        print(old, '--', new)
        print(embedding.nearest_words([old]))
        print()

    print()
    war, ist = tense_changes[0]
    tensediff = embedding[ist] - embedding[war]
    for past, present in tense_changes[1 : ]:
        print(past, '+ tensediff:', *embedding.nearest_words([embedding[past] + tensediff]))
        print('Should be:', present)
        print()

    # word_diffs = [embedding[new] - embedding[old] for (old, new) in word_changes]

    spelling_diffs = [embedding[new] - embedding[old] for (old, new) in spelling_changes[10 : 20]]
    tense_diffs = [embedding[present] - embedding[past] for (past, present) in tense_changes]

    def metric(u, v):
        return max(distance.cosine(u, v), 0)

    while True:
        try:
            model = TSNE(n_components=2, metric=metric)
            reduced = model.fit_transform(spelling_diffs + tense_diffs)
            print(reduced)
            return
        except Exception:
            pass
Example 18
def apply_tSNE30(proj_data, proj_weights=None):
    model = TSNE(n_components=2, perplexity=30.0, metric="euclidean",
                 learning_rate=100, early_exaggeration=4.0,
                 random_state=RANDOM_SEED)
    norm_data = normalize_columns(proj_data)
    result = model.fit_transform(norm_data.T)
    return result
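# normalize_columns (and the RANDOM_SEED constant) are not shown in this
# snippet; a plausible stand-in (an assumption) scales each column to zero
# mean and unit variance:
import numpy as np

RANDOM_SEED = 0

def normalize_columns(data):
    data = np.asarray(data, dtype=np.float64)
    mu = data.mean(axis=0)
    sigma = data.std(axis=0)
    sigma[sigma == 0] = 1.0     # avoid division by zero for constant columns
    return (data - mu) / sigma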
Example 19
    def plot_mean_activation_and_stuff(some_probs, Y, do_tsne=False):
        pyplot.clf()
        probs = numpy.float32(some_probs)
        xv = numpy.arange(probs.shape[1])#probs.var(axis=0)
        yv = probs.mean(axis=0)
        pyplot.axis([-0.1, probs.shape[1],0,1])
        for k in range(probs.shape[1]):
            pyplot.plot(xv[k]*numpy.ones(probs.shape[0]),probs[:,k],'o',ms=4.,
                        markeredgecolor=(1, 0, 0, 0.01),
                        markerfacecolor=(1, 0, 0, 0.01),)
        pyplot.plot(xv,yv, 'bo')
        pyplot.show(block=False)
        if do_video:
            pyplot.savefig(video.stdin, format='jpeg')
            video.stdin.flush()
        pyplot.savefig('epoch_probs.png')

        if not do_tsne: return
        try:
            from sklearn.manifold import TSNE
            tsne = TSNE(random_state=0)
            ps = tsne.fit_transform(numpy.float64(probs[:400]))
            pyplot.clf()
            Y = numpy.int32(Y)[:400]
            for i,c,s in zip(range(10),list('bgrcmyk')+[(.4,.3,.9),(.9,.4,.3),(.3,.9,.4)],'ov'*5):
                sub = ps[Y == i]
                pyplot.plot(sub[:,0], sub[:,1], s,color=c,ms=3,mec=c)
            pyplot.show(block=False)
            pyplot.savefig('probs_embed.png')
        except ImportError:
            print "cant do tsne"
Example 20
def topic_dimen_reduce(words, word2vec):
    dictionary, matrix = terms_analysis.get_words_matrix(words, word2vec)
    pca = PCA(n_components=50)
    pca_matrix = pca.fit_transform(matrix)
    tsne = TSNE(n_components=2)
    t_matrix = tsne.fit_transform(pca_matrix)
    return dictionary, t_matrix
Example 21
def tsnePlot(plotname, modelName, word, dest):
    
    """Plots a tsne graph of words most similar to the word passed in the argument (as represented in the model previously calculated)"""
    
    model = word2vec.Word2Vec.load(modelName)
    words = [model.most_similar(word)[i][0] for i in range(0, len(model.most_similar(word)))]
    words.append(word)

    #nested list containing the 100-dimensional word vectors of each most-similar word
    
    word_vectors = [model[word] for word in words]
    word_vectors = np.array(word_vectors)

    tsne_model = TSNE(n_components=2, random_state=0)
    Y = tsne_model.fit_transform(word_vectors)
    plt.plot(Y[:,0], Y[:,1], 'o')  # sb.plt was removed from seaborn; assumes "import matplotlib.pyplot as plt"

    for word, x, y in zip(words, Y[:,0], Y[:,1]):
        plt.annotate(word, (x, y), size=12)
        #plt.pause(10)

    plotname = plotname + ".png"

    if not os.path.exists(dest):
        os.makedirs(dest)

    path = os.path.join(dest, plotname)

    plt.savefig(path)
 def reduce_dimentionality(self):
     self.vectors = []
     for key in self.selected_words:
         self.vectors.append(self.model[key])
     tnse_model = TSNE(n_components=2, random_state=0)
     np.set_printoptions(suppress=True)
     self.reduced_vectors = tnse_model.fit_transform(self.vectors)
Example 23
def plotTSNEDecisionBoundaries(): 
    
    tsne = TSNE()
    tsne_data = tsne.fit_transform(feature_set)
    x_min,x_max = tsne_data[:,0].min()-1, tsne_data[:,0].max() + 1
    y_min,y_max = tsne_data[:,1].min()-1, tsne_data[:,1].max() + 1
    step_size = 2.0
    
    xx,yy = np.meshgrid(np.arange(x_min,x_max,step_size),np.arange(y_min,y_max,step_size))
    
    for index,classifier in enumerate(classifiers):
        
        plt.subplot(2,3,index+1)
        plt.subplots_adjust(wspace=0.5,hspace=0.5)
        classifier.fit(tsne_data,class_labels)
        
        Z = classifier.predict(np.c_[xx.ravel(), yy.ravel()])  # zip() is an iterator in Python 3, so build a 2-D array instead
        Z = Z.reshape(xx.shape)
        
        plt.contourf(xx,yy,Z,cmap=plt.cm.Paired,alpha=0.7)
        plt.scatter(tsne_data[:,0],tsne_data[:,1],c=class_labels,cmap=plt.cm.rainbow,alpha=0.6)
        plt.xlabel("Feature 1")
        plt.ylabel("Feature 2")
        plt.xlim(x_min,x_max)
        plt.ylim(y_min,y_max)
        plt.xticks(())
        plt.yticks(())
        plt.title(classifier_names[index])
        
    plt.show()
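# The globals used above (feature_set, class_labels, classifiers,
# classifier_names) are not defined in the snippet; a minimal illustrative
# setup (my assumption) could be:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

feature_set, class_labels = make_classification(n_samples=300, n_features=10,
                                                n_informative=5, random_state=0)
classifiers = [LogisticRegression(max_iter=1000),
               KNeighborsClassifier(),
               DecisionTreeClassifier(random_state=0)]
classifier_names = ['Logistic Regression', 'kNN', 'Decision Tree']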
Example 24
def visualization(result, word_dict):
	tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
	plot_only = 500

	low_dim_embs = tsne.fit_transform(result[:plot_only])
	labels = [word_dict[i] for i in range(plot_only)]
	plot_with_labels(low_dim_embs, labels)
Example 25
def project_in_2D(distance_mat, method='mds'):
  """
  Project SDRs onto a 2D space using manifold learning algorithms
  :param distance_mat: A square matrix with pairwise distances
  :param method: Select method from 'mds' and 'tSNE'
  :return: an array with dimension (numSDRs, 2). It contains the 2D projections
     of each SDR
  """
  seed = np.random.RandomState(seed=3)

  if method == 'mds':
    mds = MDS(n_components=2, max_iter=3000, eps=1e-9,
              random_state=seed,
              dissimilarity="precomputed", n_jobs=1)

    pos = mds.fit(distance_mat).embedding_

    nmds = MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12,
               dissimilarity="precomputed", random_state=seed,
               n_jobs=1, n_init=1)

    pos = nmds.fit_transform(distance_mat, init=pos)
  elif method == 'tSNE':
    tsne = TSNE(n_components=2, init='pca', random_state=0)
    pos = tsne.fit_transform(distance_mat)
  else:
    raise NotImplementedError

  return pos
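# Hedged usage sketch: build a pairwise-distance matrix for random sparse
# binary vectors (illustrative data, not from the source) and project it with
# either method.
import numpy as np
from sklearn.metrics import pairwise_distances

sdrs = (np.random.RandomState(0).rand(100, 256) > 0.9).astype(float)
dist = pairwise_distances(sdrs, metric='euclidean')
pos_mds = project_in_2D(dist, method='mds')
pos_tsne = project_in_2D(dist, method='tSNE')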
Example 26
def PlotTSNE (data, labels):										#Takes the data and the labels
	# Visualize the results on TSNE reduced data

	print "BUSY IN TSNE"

	model = TSNE(n_components=2, random_state=0)
	reduced_data = model.fit_transform(data)

	print "DATA REDUCED"

	# Plot the decision boundary. For that, we will assign a color to each
	x_min, x_max = reduced_data[:, 0].min(), reduced_data[:, 0].max()
	y_min, y_max = reduced_data[:, 1].min(), reduced_data[:, 1].max()
	
	plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
	
	#Adds labels to the plot
	for label, x, y in zip(labels, reduced_data[:, 0], reduced_data[:, 1]):
	    plt.annotate(
	        label, 
	        xy = (x, y), xytext = (-20, 20),
	        textcoords = 'offset points', ha = 'right', va = 'bottom',
	        bbox = dict(boxstyle = 'round,pad=0.5', fc = 'green', alpha = 0.5),
	        arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))

	plt.title('TSNE Plot')
	plt.xlim(x_min, x_max)
	plt.ylim(y_min, y_max)
	plt.xticks(())
	plt.yticks(())
	plt.show()
def plotly_js_viz(word_2_vec_model):
    tsne_model=TSNE(n_components=2,random_state=5)
    data=tsne_model.fit_transform(word_2_vec_model.syn0)
    xd=list(data[:,0])
    yd=list(data[:,1])
    names_our=word_2_vec_model.index2word
    plot([Scatter(x=xd,y=yd,mode="markers",text=names_our)])
Example 28
def visualize_latent_rep(args, model, x_latent):
    print("pca_on=%r pca_comp=%d tsne_comp=%d tsne_perplexity=%f tsne_lr=%f" % (
        args.use_pca,
        args.pca_components,
        args.tsne_components,
        args.tsne_perplexity,
        args.tsne_lr
    ))

    if args.use_pca:
        pca = PCA(n_components = args.pca_components)
        x_latent = pca.fit_transform(x_latent)

    figure(figsize=(6, 6))
    scatter(x_latent[:, 0], x_latent[:, 1], marker='.')
    show()

    tsne = TSNE(n_components = args.tsne_components,
                perplexity = args.tsne_perplexity,
                learning_rate = args.tsne_lr,
                n_iter = args.tsne_iterations,
                verbose = 4)
    x_latent_proj = tsne.fit_transform(x_latent)
    del x_latent

    figure(figsize=(6, 6))
    scatter(x_latent_proj[:, 0], x_latent_proj[:, 1], marker='.')
    show()
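# Hedged usage sketch: args only needs the attributes read above, so an
# argparse.Namespace with illustrative values is enough (the bare figure/
# scatter/show calls assume a pylab-style import in the original module).
from argparse import Namespace
import numpy as np

args = Namespace(use_pca=True, pca_components=20, tsne_components=2,
                 tsne_perplexity=30.0, tsne_lr=200.0, tsne_iterations=1000)
x_latent = np.random.RandomState(0).rand(500, 64)
visualize_latent_rep(args, model=None, x_latent=x_latent)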
Example 29
def tsne_plot(model):
    #"Creates and TSNE model and plots it"
    labels = []
    tokens = []

    for word in model.wv.vocab:
        tokens.append(model[word])
        labels.append(word)

    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])

    plt.figure(figsize=(16, 16))
    for i in range(len(x)):
        plt.scatter(x[i], y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()
Example 30
def infer(FLAGS):
    """
    Inference.
    """

    # Retrieve embeddings for docs
    words = ["tennis", "wimbledon", "icecream", "cake", "bear", "pie"]

    # Get index in doc embeddings
    with open(os.path.join(basedir, FLAGS.data_dir, "doc_to_idx.json"), 'r') as f:
        doc_to_idx = json.load(f)

    # Load the trained model
    model = torch.load(os.path.join(basedir, FLAGS.data_dir, "model.pt"))
    doc_embeddings = model.doc_embeddings.weight.data

    my_embeddings = np.array(
        [doc_embeddings[doc_to_idx[word]].numpy() for word in words])

    # Use TSNE model to reduce dimensionality
    model = TSNE(n_components=2, random_state=0)
    points = model.fit_transform(my_embeddings)

    # Visualize
    for i, word in enumerate(words):
        x, y = points[i, 0]*1e4, points[i, 1]*1e4
        plt.scatter(x, y)
        plt.annotate(word, xy=(x, y), xytext=(25, 5),
            textcoords='offset points', ha='right', va='bottom')
    plt.show()
bottleneck_representation = encoder.predict([X_scRNAseq, X_scProteomics])
print(pd.DataFrame(bottleneck_representation).shape)
print(pd.DataFrame(bottleneck_representation).iloc[0:5,0:5])

# Dimensionality reduction plot
#plt.figure(figsize=(20, 15))
plt.scatter(bottleneck_representation[:, 0], bottleneck_representation[:, 1], c = Y_scRNAseq, cmap = 'tab20', s = 10)
plt.title('Autoencoder Data Integration')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
#plt.colorbar()
plt.show()

# tSNE on Autoencoder bottleneck representation
model_tsne_auto = TSNE(learning_rate = 200, n_components = 2, random_state = 123, perplexity = 90, n_iter = 1000, verbose = 1)
tsne_auto = model_tsne_auto.fit_transform(bottleneck_representation)
plt.scatter(tsne_auto[:, 0], tsne_auto[:, 1], c = Y_scRNAseq, cmap = 'tab20', s = 10)
plt.title('tSNE on Autoencoder: Data Integration, CITEseq')
plt.xlabel("tSNE1")
plt.ylabel("tSNE2")
plt.show()

# UNIFORM MANIFOLD APPROXIMATION AND PROJECTION (UMAP)
#model_umap = UMAP(n_neighbors = 20, min_dist = 0.3, n_components = 2)
#umap = model_umap.fit_transform(bottleneck_representation)
#plt.scatter(umap[:, 0], umap[:, 1], c = Y_scRNAseq, cmap = 'tab20', s = 10)
#plt.title('UMAP on Autoencoder')
#plt.xlabel("UMAP1")
#plt.ylabel("UMAP2")
#plt.show()
Example 32
    'spleenV5_rel', 'spleenV10_rel', 'spleenV15_rel', 'spleenV20_rel',
    'spleenV25_rel', 'spleenV30_rel', 'spleenV35_rel', 'spleenV40_rel',
    'spleenV45_rel', 'spleenV50_rel', 'meanspleendose', 'spleenvolume'
]

new_data_dense_features2 = string_float(new_data[dense_features2]).values
N = 20000
n_components = 3
tsne = TSNE(n_components=n_components,
            perplexity=50,
            n_iter=N,
            init='pca',
            random_state=0,
            verbose=1,
            method='exact')
tsne_results = tsne.fit_transform(new_data_dense_features2)

new_data = new_data.drop(dense_features2, axis=1)
new_data["DoseComp1"] = tsne_results[:, 0]
new_data["DoseComp2"] = tsne_results[:, 1]
new_data["DoseComp3"] = tsne_results[:, 2]
dense_features2_new = ["DoseComp1", "DoseComp2", "DoseComp3"]

#new_data=np.concatenate([new_data.values,tsne_results],axis=1)
new_data.to_csv('Data_tsne_20K_3c.csv', index=False)

data_train, data_test = train_test_split(new_data,
                                         test_size=.3,
                                         random_state=8)

y_train_class = data_train.pop('G4RIL').values
Example 33
def project(cls1_data,
            cls2_data,
            projection='mds',
            setsize=None,
            with_debiasing=None,
            figname=None):
    if type(cls1_data) == list:
        fig, axes = plt.subplots(3, 4, figsize=(9, 12))

        fig.tight_layout(pad=0.5)
        flataxes = [ax for tmp in axes for ax in tmp]

        setsize = len(cls1_data)
        if setsize > 500: setsize = 500

        X = np.r_[np.concatenate(cls1_data, axis=0)[:setsize, :, :],
                  np.concatenate(cls2_data, axis=0)[:setsize, :, :]]

        if with_debiasing:
            X = with_debiasing.clean_data(X)

        for layer in range(0, 12):
            print('plotting layer %d' % (layer + 1))

            if projection == 'mds':
                mds = MDS(n_components=2)
                X_transformed = mds.fit_transform(X[:, layer, :].astype(
                    np.float64))
            if projection == 'pca':
                pca = PCA(n_components=2)
                X_transformed = pca.fit_transform(X[:, layer, :].astype(
                    np.float64))
            if projection == 'tsne':
                tsne = TSNE(n_components=2, verbose=1)
                X_transformed = tsne.fit_transform(X[:, layer, :].astype(
                    np.float64))

            colors = ['red'] * setsize + ['blue'] * setsize

            ax = flataxes[layer]
            ax.set_aspect('equal', adjustable='box')
            ax.scatter(X_transformed[:, 0], X_transformed[:, 1], s=2, c=colors)

            #ax.set_xlim((-20, 20))
            #ax.set_ylim((-20, 20))
            ax.set_title('Layer %d' % (layer + 1), fontsize=12)

            print('plotting done.')

    else:
        if not setsize:
            setsize = cls1_data.shape[0]

        colors = ['red'] * setsize + ['blue'] * setsize

        X = np.r_[cls1_data[:setsize, :], cls2_data[:setsize, :]]

        mds = MDS(n_components=2)
        X_transformed = mds.fit_transform(X)

        fig = plt.plot()
        plt.scatter(X_transformed[:, 0], X_transformed[:, 1], s=2, c=colors)

    if figname:
        plt.savefig(figname)
    else:
        plt.show()
Example 34

#Set df4 equal to a sample of 1,000 default and 1,000 non-default observations.
df2 = tsne_data[tsne_data.default == 0].sample(n = 1000)
df3 = tsne_data[tsne_data.default == 1].sample(n = 1000)
df4 = pd.concat([df2, df3], axis = 0)

#Scale features to improve the training ability of TSNE.
standard_scaler = StandardScaler()
df4_std = standard_scaler.fit_transform(df4)

#Set y equal to the target values.
y = df4.iloc[:, -1].values

tsne = TSNE(n_components=2, random_state=0)
x_test_2d = tsne.fit_transform(df4_std)

#Build the scatter plot with the two types of transactions.
color_map = {0:'red', 1:'blue'}
plt.figure()
for idx, cl in enumerate(np.unique(y)):
    plt.scatter(x = x_test_2d[y==cl,0], 
                y = x_test_2d[y==cl,1], 
                c = color_map[idx], 
                label = cl)
plt.xlabel('X in t-SNE')
plt.ylabel('Y in t-SNE')
plt.legend(loc='upper left')
plt.title('t-SNE visualization of test data')
plt.show()
        pBarLen = 20
        sys.stdout.write("|%s| - Training(%s/%s)-%s\r"%(progressBar((i//200)%pBarLen, pBarLen),i,steps,
            estimate_time(startTime, steps, i)))

        if i % logStep == 0:
            print("Loss: %s" % lossVal)
            sim = similarity.eval()
            for j in range(len(valid_set)):  # separate index so the outer training-step counter i is not overwritten
                word = WORDS[valid_set[j]]
                top_k = 5
                nearest = (-sim[j,:]).argsort()[1:1+top_k]
                msg = "%s: "% word
                for k in range(top_k):
                    close_word = WORDS[nearest[k]]
                    if k > 0: msg += ", "#"\t"
                    msg += close_word #+ ": %.08f\n"%sim[i,k]
                print(msg)
            print("------------------------------------------")
            saveModel(sess, LOG_DIR+"model.ckpt")
            
            final_embeddings = normalized_embeddings.eval()
            writeToFile(final_embeddings, "savedEmbeddings/embeddings.pkl")
            # plotting using t-SNE
            # two_d_embeddings = tsne.fit_transform(final_embeddings)
            # plot(two_d_embeddings, WORDS)
        
    final_embeddings = normalized_embeddings.eval()
    two_d_embeddings = tsne.fit_transform(final_embeddings)
    plot(two_d_embeddings, WORDS)

    writeToFile(final_embeddings, "savedEmbeddings/embeddings.pkl")
Example 36
print(y.shape)
z_vec.flatten()
z_vec = z_vec[:, 0, :]
type(z_vec)
print(z_vec.shape)
b.flatten()
b = b[:, 0, :]
type(b)
print(b.shape)

from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)

# Reduce the latent vectors to two dimensions with t-SNE

X_z_2d = tsne.fit_transform(z_vec)
print(X_z_2d.shape)
X_b_2d = tsne.fit_transform(b)
print(X_b_2d.shape)

# Plot z's
from matplotlib import pyplot as plt
target_ids = range(10)
plt.figure(figsize=(6, 5))
target_names = np.array([0, 1, 2,3,4,5,6,7,8,9])
print(target_names)
colors = 'k', 'b', 'y', 'r', 'g', 'm', 'c', 'orange', 'purple', 'brown'
for i, c, label in zip(target_ids, colors, target_names):
    plt.scatter(X_z_2d[y == i, 0], X_z_2d[y == i, 1], s=15, c=c, label=label)
plt.legend()
plt.show()
Example 37
    np.savetxt('{0}_c{1}_labels.tsv'.format(arguments.fout, k),
               (nmf.cluster_labels, nmf.remain_cell_inds),
               fmt='%u',
               delimiter='\t')

    # --------------------------------------------------
    # 3.4. T-SNE PLOT
    # --------------------------------------------------
    if arguments.tsne:
        model = TSNE(n_components=2,
                     random_state=0,
                     init='pca',
                     method='exact',
                     metric='euclidean',
                     perplexity=30)
        ret = model.fit_transform(nmf.pp_data.T)
        plt.title('{0} cluster (Euclidean)'.format(k))
        plt.scatter(ret[:, 0], ret[:, 1], 20, nmf.cluster_labels)
        plt.xticks([])
        plt.yticks([])

        plt.savefig('{0}_c{1}_tsne.png'.format(arguments.fout, k),
                    format='png',
                    bbox_inches=None,
                    pad_inches=0.1)
        # plt.show()

# --------------------------------------------------
# 6. SUMMARIZE RESULTS
# --------------------------------------------------
print('\n------------------------------ Summary:')
Example 38
    def process(self):
        # parse parameters
        input_words = self.parameters.get("words", "")
        if not input_words or not input_words.split(","):
            self.dataset.update_status(
                "No input words provided, cannot look for similar words.",
                is_final=True)
            self.dataset.finish(0)
            return

        input_words = input_words.split(",")

        try:
            threshold = float(self.parameters.get("threshold"))
        except ValueError:
            threshold = float(self.get_options()["threshold"]["default"])

        threshold = max(-1.0, min(1.0, threshold))
        num_words = convert_to_int(self.parameters.get("num-words"))
        overlay = self.parameters.get("overlay")
        reduction_method = self.parameters.get("method")
        all_words = self.parameters.get("all-words")

        # load model files and initialise
        self.dataset.update_status("Unpacking word embedding models")
        staging_area = self.unpack_archive_contents(self.source_file)
        common_vocab = None
        vector_size = None
        models = {}

        # find words that are common to all models
        self.dataset.update_status("Determining cross-model common vocabulary")
        for model_file in staging_area.glob("*.model"):
            if self.interrupted:
                shutil.rmtree(staging_area)
                raise ProcessorInterruptedException(
                    "Interrupted while processing word embedding models")

            model = KeyedVectors.load(str(model_file)).wv
            models[model_file.stem] = model
            if vector_size is None:
                vector_size = model.vector_size  # needed later for dimensionality reduction

            if common_vocab is None:
                common_vocab = set(model.vocab.keys())
            else:
                common_vocab &= set(model.vocab.keys())  # intersect

        # sort common vocabulary by combined frequency across all models
        # this should make filtering for common words a bit faster further down
        self.dataset.update_status("Sorting vocabulary")
        common_vocab = list(common_vocab)
        common_vocab.sort(key=lambda w: sum(
            [model.vocab[w].count for model in models.values()]),
                          reverse=True)

        # initial boundaries of 2D space (to be adjusted later based on t-sne
        # outcome)
        max_x = 0.0 - sys.float_info.max
        max_y = 0.0 - sys.float_info.max
        min_x = sys.float_info.max
        min_y = sys.float_info.max

        # for each model, find the words that we may want to plot - these are
        # the nearest neighbours for the given query words
        relevant_words = {}

        # the vectors need to be reduced all at once - but the vectors are
        # grouped by model. To solve this, keep one numpy array of vectors,
        # but also keep track of which indexes of this array belong to which
        # model, by storing the index of the first vector for a model
        vectors = numpy.empty((0, vector_size))
        vector_offsets = {}

        # now process each model
        for model_name, model in models.items():
            relevant_words[model_name] = set()  # nearest-neighbour words (plus queries) found for this model
            self.dataset.update_status("Finding similar words in model '%s'" %
                                       model_name)

            for query in input_words:
                if query not in model.vocab:
                    self.dataset.update_status(
                        "Query '%s' was not found in model %s; cannot find nearest neighbours."
                        % (query, model_name),
                        is_final=True)
                    self.dataset.finish(0)
                    return

                if self.interrupted:
                    shutil.rmtree(staging_area)
                    raise ProcessorInterruptedException(
                        "Interrupted while finding similar words")

                # use a larger sample (topn) than required since some of the
                # nearest neighbours may not be in the common vocabulary and
                # will therefore need to be ignored
                context = set([
                    word[0] for word in model.most_similar(query, topn=1000)
                    if word[0] in common_vocab and word[1] >= threshold
                ][:num_words])

                relevant_words[model_name] |= {
                    query
                } | context  # always include query word

        # now do another loop to determine which words to plot for each model
        # this is either the same as relevant_words, or a superset which
        # combines all relevant words for all models
        plottable_words = {}
        last_model = max(relevant_words.keys())
        all_relevant_words = set().union(*relevant_words.values())

        for model_name, words in relevant_words.items():
            plottable_words[model_name] = []
            vector_offsets[model_name] = len(vectors)

            # determine which words to plot for this model. either the nearest
            # neighbours for this model, or all nearest neighbours found across
            # all models
            words_to_include = all_relevant_words if all_words else relevant_words[
                model_name]

            for word in words_to_include:
                if word in plottable_words[model_name] or (
                        not overlay and model_name != last_model
                        and word not in input_words):
                    # only plot each word once per model, or if 'overlay'
                    # is not set, only once overall (for the most recent
                    # model)
                    continue

                vector = models[model_name][word]
                plottable_words[model_name].append(word)
                vectors = numpy.append(vectors, [vector], axis=0)

        del models  # no longer needed

        # reduce the vectors of all words to be plotted for this model to
        # a two-dimensional coordinate with the previously initialised tsne
        # transformer. here the two-dimensional vectors are interpreted as
        # cartesian coordinates
        if reduction_method == "PCA":
            pca = PCA(n_components=2, random_state=0)
            vectors = pca.fit_transform(vectors)
        elif reduction_method == "t-SNE":
            # initialise t-sne transformer
            # parameters taken from Hamilton et al.
            # https://github.com/williamleif/histwords/blob/master/viz/common.py
            tsne = TSNE(n_components=2,
                        random_state=0,
                        learning_rate=150,
                        init="pca")
            vectors = tsne.fit_transform(vectors)
        elif reduction_method == "TruncatedSVD":
            # standard sklearn parameters made explicit
            svd = TruncatedSVD(n_components=2,
                               algorithm="randomized",
                               n_iter=5,
                               random_state=0)
            vectors = svd.fit_transform(vectors)
        else:
            shutil.rmtree(staging_area)
            self.dataset.update_status(
                "Invalid dimensionality reduction technique selected",
                is_final=True)
            self.dataset.finish(0)
            return

        # also keep track of the boundaries of our 2D space, so we can plot
        # them properly later
        for position in vectors:
            max_x = max(max_x, position[0])
            max_y = max(max_y, position[1])
            min_x = min(min_x, position[0])
            min_y = min(min_y, position[1])

        # now we know for each model which words should be plotted and at what
        # position
        # with this knowledge, we can normalize the positions, and start
        # plotting them in a graph

        # a palette generated with https://medialab.github.io/iwanthue/
        colours = [
            "#d58eff", "#cf9000", "#3391ff", "#a15700", "#911ca7", "#00ddcb",
            "#cc25a9", "#d5c776", "#6738a8", "#ff9470", "#47c2ff", "#a4122c",
            "#00b0ca", "#9a0f76", "#ff70c8", "#713c88"
        ]
        colour_index = 0

        # make sure all coordinates are positive
        max_x -= min_x
        max_y -= min_y

        # determine graph dimensions and proportions
        width = 1000  # arbitrary
        height = width * (max_y / max_x)  # retain proportions
        scale = width / max_x

        # margin around the plot to give room for labels and to look better
        margin = width * 0.1
        width += 2 * margin
        height += 2 * margin

        # normalize all known positions to fit within the graph
        vectors = [(margin + ((position[0] - min_x) * scale),
                    margin + ((position[1] - min_y) * scale))
                   for position in vectors]

        # now all positions are finalised, we can determine the "journey" of
        # each query - the sequence of positions in the graph it takes, so we
        # can draw lines from position to position later
        journeys = {}
        for query in input_words:
            journeys[query] = []
            for model_name, words in plottable_words.items():
                index = words.index(query)
                journeys[query].append(vectors[vector_offsets[model_name] +
                                               index])

        # font sizes proportional to width (which is static and thus predictable)
        fontsize_large = width / 50
        fontsize_normal = width / 75
        fontsize_small = width / 100

        # now we have the dimensions, the canvas can be instantiated
        model_type = self.source_dataset.parameters.get(
            "model-type", "word2vec")
        canvas = get_4cat_canvas(
            self.dataset.get_results_path(),
            width,
            height,
            header="%s nearest neighbours (fitting: %s) - '%s'" %
            (model_type, reduction_method, ",".join(input_words)),
            fontsize_normal=fontsize_normal,
            fontsize_large=fontsize_large,
            fontsize_small=fontsize_small)

        # use colour-coded backgrounds to distinguish the query words in the
        # graph, each model (= interval) with a separate colour
        for model_name in plottable_words:
            solid = Filter(id="solid-%s" % model_name)
            solid.feFlood(flood_color=colours[colour_index])
            solid.feComposite(in_="SourceGraphic")
            canvas.defs.add(solid)

            # this can get kind of confusing, but you shouldn't be using this
            # with more than 16 models anyway
            colour_index = 0 if colour_index >= len(
                colours) - 1 else colour_index + 1

        # now plot each word for each model
        self.dataset.update_status("Plotting graph")
        words = SVG(insert=(0, 0), size=(width, height))
        queries = SVG(insert=(0, 0), size=(width, height))
        colour_index = 0

        for model_name, labels in plottable_words.items():
            positions = vectors[
                vector_offsets[model_name]:vector_offsets[model_name] +
                len(labels)]

            label_index = 0
            for position in positions:
                word = labels[label_index]
                is_query = word in input_words
                label_index += 1

                filter = ("url(#solid-%s)" %
                          model_name) if is_query else "none"
                colour = "#FFF" if is_query else colours[colour_index]
                fontsize = fontsize_normal if is_query else fontsize_small

                if word in input_words:
                    word += " (" + model_name + ")"

                label_container = SVG(insert=position,
                                      size=(1, 1),
                                      overflow="visible")
                label_container.add(
                    Text(insert=("50%", "50%"),
                         text=word,
                         dominant_baseline="middle",
                         text_anchor="middle",
                         style="fill:%s;font-size:%ipx" % (colour, fontsize),
                         filter=filter))

                # we make sure the queries are always rendered on top by
                # putting them in a separate SVG container
                if is_query:
                    queries.add(label_container)
                else:
                    words.add(label_container)

            colour_index = 0 if colour_index >= len(
                colours) - 1 else colour_index + 1

        # plot a line between positions for query words
        lines = SVG(insert=(0, 0), size=(width, height))
        for query, journey in journeys.items():
            previous_position = None
            for position in journey:
                if previous_position is None:
                    previous_position = position
                    continue

                lines.add(
                    Line(start=previous_position,
                         end=position,
                         stroke="#CE1B28",
                         stroke_width=2))
                previous_position = position

        canvas.add(lines)
        canvas.add(words)
        canvas.add(queries)

        canvas.save(pretty=True)
        shutil.rmtree(staging_area)
        self.dataset.finish(len(journeys))
Example 39
w2v_model.wv.most_similar('人民', topn=5)
w2v_model.wv.most_similar('台灣', topn=5)

similar_words = {key_word:[similar_word[0] for similar_word in w2v_model.wv.most_similar(key_word, topn=6)]
                          for key_word in ['台灣','人民','國家','民主','中共','大陸','共匪','自由']}
similar_words

## Visualization

from sklearn.manifold import TSNE
all_words = sum([[key_word]+similar_words for key_word, similar_words in similar_words.items()], [])
all_words_vec = w2v_model.wv[all_words]

tsne = TSNE(n_components=2, random_state=0, n_iter=10000, perplexity=2)
np.set_printoptions(suppress=True)
T = tsne.fit_transform(all_words_vec)
labels=all_words

## Chinese Font Issues in Plotting

from matplotlib import rcParams
from matplotlib.font_manager import FontProperties
import matplotlib.pyplot as plt
# fix the minus sign '-' being rendered as an empty box
# rcParams['axes.unicode_minus']=False
myfont = FontProperties(fname='/System/Library/Fonts/PingFang.ttc',
 size=12)
# plt.title('乘客等级分布', fontproperties=myfont)    # "Passenger class distribution"
# plt.ylabel('人数', fontproperties=myfont)           # "Number of passengers"
# plt.legend(('头等舱', '二等舱', '三等舱'), loc='best', prop=myfont)  # first/second/third class
###   ~ Data Viz ~  ###
#######################

df_train_both = pd.merge(df_train_genetic_scaled,
                         df_train_non_genetic_pre_processed,
                         left_index=True,
                         right_index=True,
                         how='outer')
df_train_both_with_target = pd.merge(df_train_both,
                                     df_solution,
                                     left_index=True,
                                     right_index=True,
                                     how='outer')

#Applying t-SNE to visualize data
tsne = TSNE(n_components=3, verbose=1, perplexity=40, n_iter=300)
x = df_train_both.values
x_tsne = tsne.fit_transform(x)

df_tsne = pd.DataFrame()
df_tsne['x-tsne'] = x_tsne[:, 0]
df_tsne['y-tsne'] = x_tsne[:, 1]
df_tsne['label'] = df_solution.values


chart = ggplot( df_tsne, aes(x='x-tsne', y='y-tsne', color='label') ) \
        + geom_point(size=70,alpha=0.1) \
        + ggtitle("tSNE dimensions colored by cancer stage")

print(chart)
                   b_sight=3000,
                   b_maxl=1500,
                   n_jobs=20)

#print('fit gammas')
vlm.fit_gammas()

print('calculate velocity')
vlm.predict_U()
vlm.calculate_velocity()
vlm.calculate_shift(assumption="constant_velocity")
vlm.extrapolate_cell_at_t(delta_t=1.)

print('running tsne')
bh_tsne = TSNE(random_state=1)
vlm.ts = bh_tsne.fit_transform(vlm.pcs[:, :20])

print('projection of velocity onto embeddings')
vlm.estimate_transition_prob(hidim="Sx_sz",
                             embed="ts",
                             transform="sqrt",
                             psc=1,
                             n_neighbors=3500,
                             knn_random=True,
                             sampled_fraction=0.5)

print('calculate embedding shift')
vlm.calculate_embedding_shift(sigma_corr=0.05, expression_scaling=True)

print('calculate grid arrows')
vlm.calculate_grid_arrows(smooth=0.8, steps=(40, 40), n_neighbors=100)
Example 42
                log_str="Nearest to %s:" % valid_word
                for k in range(top_k):
                    close_word=reverse_dictionary[nearest[k]]
                    log_str="%s %s," %(log_str,close_word)
                print(log_str)
    final_embedding=normalized_embeddings.eval()
'''
Visualize the Word2Vec scatter plot and save it
'''
def plot_with_labels(low_dim_embs,labels,filename):
    assert low_dim_embs.shape[0]>=len(labels),"more labels than embedding"
    plt.figure(figsize=(18,18))
    for i,label in enumerate(labels):
        x,y=low_dim_embs[i,:]
        plt.scatter(x, y)
        plt.annotate(label,xy=(x,y),xytext=(5,2),textcoords='offset points',ha='right',va='bottom')
    plt.savefig(filename)

'''
Use t-SNE to reduce the original 128-dimensional embedding vectors to 2 dimensions
'''

tsne=TSNE(perplexity=30,n_components=2,init='pca',n_iter=5000)
plot_number=150
low_dim_embs=tsne.fit_transform(final_embedding[:plot_number,:])
labels=[reverse_dictionary[i] for i in range(plot_number)]
plot_with_labels(low_dim_embs, labels, './plot.png')

       

    
## Remove duplicates of fighters, only keep the most up to date entry
relevant_data.sort_values(by='date', ascending=False, inplace=True)
relevant_data = relevant_data.drop_duplicates(subset='fighter')
relevant_data.drop(columns='date', inplace=True)
relevant_data.drop(
    columns='Stance', inplace=True
)  # Removed Stance for dataset simplicity, consider adding in later
relevant_data.fillna(0, inplace=True)

x = relevant_data.iloc[:, 1:]
y = relevant_data.iloc[:, 0]
#y = y.astype('category').cat.codes

model = TSNE(learning_rate=100)

tsne_transformed = model.fit_transform(x)

xs = tsne_transformed[:, 0]
ys = tsne_transformed[:, 1]

## Plot the clusters
#plt.scatter(xs, ys, c = y)
#plt.ylim(-50, 50)
#plt.xlim(-50,50)
#plt.show()

fig, ax = plt.subplots(figsize=(8, 6))
ax.set_ylim(-50, 50)
ax.set_xlim(-50, 50)
ax.scatter(xs, ys)
Example 44
index = 0
for ine in inertia:
    b = (inertia[0] - ine) * (inertia[0] - ine)
    c = (ine - inertia[-1]) * (ine - inertia[-1])
    a = math.sqrt(b + c)
    if a < mininum:
        get = index
        mininum = a
    index += 1
kmeans = KMeans(n_clusters=get, random_state=0).fit(data_sparse)

model = AgglomerativeClustering(n_clusters=get)
gp = model.fit_predict(data_sparse)

tsne = TSNE()
visualisation = tsne.fit_transform(data_sparse)


def count(data, qtd):
    all_words = ' '.join([text for text in data])
    token_phrase = token_space.tokenize(all_words)
    frequency = nltk.FreqDist(token_phrase)
    df_frequency = pd.DataFrame({
        "Palavra": list(frequency.keys()),
        "Frequencia": list(frequency.values())
    })
    de_frequency = df_frequency.nlargest(columns="Frequencia", n=qtd)
    return (de_frequency)


list_city = []
Example 45
#-*- coding: utf-8 -*-
# continues from k_means.py
from sklearn.manifold import TSNE

tsne = TSNE()
tsne.fit_transform(data_zs)  # reduce the data to two dimensions
tsne = pd.DataFrame(tsne.embedding_, index=data_zs.index)  # convert the embedding to a DataFrame

import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # so Chinese labels display correctly
plt.rcParams['axes.unicode_minus'] = False  # so the minus sign displays correctly

# plot each cluster with a different colour and marker style
d = tsne[r[u'聚类类别'] == 0]
plt.plot(d[0], d[1], 'r.')
d = tsne[r[u'聚类类别'] == 1]
plt.plot(d[0], d[1], 'go')
d = tsne[r[u'聚类类别'] == 2]
plt.plot(d[0], d[1], 'b*')
plt.show()
Example 46
 list_avec = all_list_adocs_vec + list_awords_sim_vec
 
 
 #6
 #-------------- PCA ------------------#
 #pca = PCA(n_components=2)
 #dresult_X_all = pca.fit_transform(list_dvec)
 #aresult_X_all = pca.fit_transform(list_avec)
 
 
 #-------------- TSNE -----------------#
 #tsne = TSNE(n_components=2)
 #tsne = TSNE(n_components=2, random_state=None, verbose=1, perplexity=40, n_iter=300)
 #tsne = TSNE(n_components=2, init='pca', random_state=0, perplexity=40)
 tsne = TSNE(n_components=2, perplexity=40, metric='euclidean', init='pca', verbose=0, random_state=0)
 dresult_X_all = tsne.fit_transform(list_dvec)
 aresult_X_all = tsne.fit_transform(list_avec)
 
 
 #------------- Setting plot -----------------#
 word_vec = "Word Vectors"
 docs_vec = "Paragraph Vectors"
 list_genre_plot = ["Economic", "Education", "Entertainment", "Foreign", "IT", "Sports"]
 list_wovec_plot = [genre + ' ' + word_vec for genre in list_genre_plot]
 list_dovec_plot = [genre + ' ' + docs_vec for genre in list_genre_plot]
 plt.rcParams['font.family'] = 'TH SarabunPSK'
 plt.rcParams['font.size'] = 14
 #with plt.style.context('dark_background'):
 
 #********---------------------------------------------------------********#
 
Esempio n. 47
# extract the output of the final embedding layer (before the softmax)
# in test mode, the 'learning_phase' flag should be set to 0 (i.e., dropout disabled)
get_doc_embedding = K.function(
    [model.layers[0].input, K.learning_phase()], [model.layers[9].output])

n_plot = 1000
print('plotting embeddings of first', n_plot, 'documents')

doc_emb = get_doc_embedding([np.array(x_test[:n_plot]), 0])[0]

my_pca = PCA(n_components=10)
my_tsne = TSNE(n_components=2,
               perplexity=10)  #https://lvdmaaten.github.io/tsne/
doc_emb_pca = my_pca.fit_transform(doc_emb)
doc_emb_tsne = my_tsne.fit_transform(doc_emb_pca)
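# Compressing the document embeddings to 10 principal components before t-SNE is a
# common preprocessing step: it filters out low-variance noise and makes t-SNE's
# pairwise computations considerably cheaper.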

labels_plt = y_test[:n_plot]
my_colors = ['blue', 'red']

fig, ax = plt.subplots()

for label in list(set(labels_plt)):
    idxs = [idx for idx, elt in enumerate(labels_plt) if elt == label]
    ax.scatter(doc_emb_tsne[idxs, 0],
               doc_emb_tsne[idxs, 1],
               c=my_colors[label],
               label=str(label),
               alpha=0.7,
               s=40)
Esempio n. 48
    model = word2vec.load(args.model_path)

    vocabs = []
    vecs = []
    for vocab in model.vocab:
        vocabs.append(vocab)
        vecs.append(model[vocab])
    vecs = np.array(vecs)[:args.plot_num]
    vocabs = vocabs[:args.plot_num]
    '''
    Dimensionality Reduction
    '''
    # from sklearn.decomposition import PCA

    tsne = TSNE(n_components=2)
    reduced = tsne.fit_transform(vecs)
    '''
    Plotting
    '''

    # filtering
    use_tags = set(['JJ', 'NNP', 'NN', 'NNS'])
    puncts = ["'", '.', ':', ";", ',', "?", "!", u"’"]

    plt.figure()
    texts = []
    for i, label in enumerate(vocabs):
        pos = nltk.pos_tag([label])
        if (label[0].isupper() and len(label) > 1 and pos[0][1] in use_tags
                and all(c not in label for c in puncts)):
            x, y = reduced[i, :]
Esempio n. 49
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')

    plt.savefig(filename)


try:
    # pylint: disable=g-import-not-at-top
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt

    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    plot_only = 500
    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
    labels = [reverse_dictionary[i] for i in xrange(plot_only)]
    plot_with_labels(low_dim_embs, labels)

except ImportError:
    print('Please install sklearn, matplotlib, and scipy to show embeddings.')
# Threshold filters out unconfident topics

threshold = 0.5
_idx = np.amax(refac_matrix, axis=1) > threshold  # boolean mask of docs whose top topic weight exceeds the threshold
refac_matrix = refac_matrix[_idx]

# Dimensionality reduction gives us a better plot

from bokeh.plotting import figure, show
from bokeh.models import HoverTool, CustomJS, ColumnDataSource, Slider
from bokeh.layouts import column
from bokeh.palettes import all_palettes
from sklearn.manifold import TSNE

tsne = TSNE(random_state=2017, perplexity=30)  # perplexity candidates: 5, 30, 50
tsne_embedding = tsne.fit_transform(refac_matrix)
tsne_embedding = pd.DataFrame(tsne_embedding, columns =['x','y'])
tsne_embedding['hue'] = refac_matrix.argmax(axis=1)
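# 'hue' holds the index of each document's dominant topic (used to color the scatter by topic)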

dataset['Date'] = pd.to_datetime(dataset.Date)


'''t-SNE scatter plot made with bokeh.
   You can move your mouse over a point
   to see specific words clustered in 
   their respective topics'''

source = ColumnDataSource(
        data=dict(
            x = tsne_embedding.x,
            y = tsne_embedding.y,
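
# A minimal sketch of the hover-enabled scatter described in the docstring above, using
# only the imports already present; the tooltip fields and the palette choice are
# assumptions based on the columns built earlier in this snippet.
palette = all_palettes['Category20'][20]
plot_source = ColumnDataSource(data=dict(
    x=tsne_embedding.x,
    y=tsne_embedding.y,
    hue=tsne_embedding.hue,
    color=[palette[h % 20] for h in tsne_embedding.hue],
))
p = figure(title="t-SNE embedding of topic distributions",
           tools="pan,wheel_zoom,box_zoom,reset")
p.add_tools(HoverTool(tooltips=[("topic", "@hue")]))
p.scatter(x='x', y='y', color='color', source=plot_source)
show(p)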
for k, col in zip(tstInd['clust'].unique(), colors):
    tmp = np.vstack(np.asarray(tstInd[tstInd['clust'] == k]['PCA']))
    if len(tmp) > 0:
        plt.scatter(tmp[:, 0], tmp[:, 1], c=col)

# draw indications of the same drug in the same color

tstInd = pn.read_csv("/home/galiasn/DATA/MechanismBasedRepurposing/full.csv")
tstInd = tstInd[tstInd['status'] == 'Approved']
tstInd['w2v'] = tstInd['ind_id'].apply(getIndVector)

pca_model = PCA(n_components=30)
tsne_model = TSNE(n_components=2, perplexity=10, n_iter=1000)
XOne = pca_model.fit_transform(list(tstInd['w2v']))
tsne_pca = tsne_model.fit_transform(XOne)
tstInd['PCA'] = list(tsne_pca)
tstInd['sum'] = len(tstInd) * [1]

tstIndgb = tstInd.groupby('drug_id', as_index=False).sum()
tstIndgb = tstIndgb[tstIndgb['sum'] > 1]
tstIndgb = tstIndgb[(tstIndgb['sum'] < 10)]

drugIds = tstIndgb['drug_id'].unique()
drugSampl = random.sample(set(drugIds), 400)
for d in drugSampl:
    tmp = np.vstack(np.asarray(tstInd[tstInd['drug_id'] == d]['PCA']))
    if len(tmp) > 0:
        plt.scatter(tmp[:, 0], tmp[:, 1])
########################
#Mol2Vec
Esempio n. 52
for i in range(10):
    idx_by_label = np.array(data.index[data['label'] == i].tolist())
    idx_by_label_sample = np.random.choice(idx_by_label,
                                           size=100,
                                           replace=False)
    if i == 0:
        idx_all = idx_by_label_sample
    else:
        idx_all = np.append(idx_all, idx_by_label_sample)

data_sample = data.iloc[np.sort(idx_all)]
""" Divide data to features and labels """
data_features = data_sample.iloc[:, 1:]
data_labels = data_sample.iloc[:, 0]
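
# `tsne` and `tsne_3d` are assumed to be created elsewhere in the original script;
# a minimal definition consistent with how they are used below would be:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2)
tsne_3d = TSNE(n_components=3)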

points = tsne.fit_transform(data_features)
points_3d = tsne_3d.fit_transform(data_features)

"plotting function 2d space"


def plot_embedding(x, y, title):
    plt.figure()
    plt.scatter(x[:, 0], x[:, 1], color=plt.cm.Set1(y / 10))

    plt.xticks([])
    plt.yticks([])
    plt.title(title)
    plt.show()
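
# Hypothetical usage with the sample prepared above:
# plot_embedding(points, data_labels.values, 't-SNE projection of the sampled data')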

plt.figure(figsize=(16, 2))
plt.plot(list(clust_dict.keys()), list(clust_dict.values()), '+-')
plt.grid()
plt.title('Silhouette average vs number of clusters (using K-means)')
plt.show()

# In[15]:

from sklearn.manifold import TSNE

NUM_CLUSTERS = 8

# loop for all assets and create plot of the system
df_labels = pd.DataFrame(index=df.index, columns=assets)
for a in assets:
    dfa = df[[c for c in df.columns if a in c]]
    X_clust = dfa.interpolate(axis=0).values
    clf = KMeans(n_clusters=NUM_CLUSTERS)
    df_labels[a] = clf.fit_predict(X_clust)
    model = TSNE(n_components=2, random_state=0)
    Y_clust = model.fit_transform(X_clust)
    plt.scatter(x=Y_clust.transpose()[0],
                y=Y_clust.transpose()[1],
                c=df_labels[a],
                cmap='jet')
    plt.title("T-SNE representation of {} clusters for asset {}".format(
        NUM_CLUSTERS, a))
    plt.show()

# The analysis does not support the conclusion that the state of each asset can be described by just two dimensions that explain the full system behavior.
Esempio n. 54
def getTSNE(data):
    ts = TSNE(n_components=2)
    ts_data = ts.fit_transform(data)
    return ts_data
Esempio n. 55
df['pca-one'] = pca_result[:, 0]
df['pca-two'] = pca_result[:, 1]

from plotnine import *

chart = (ggplot(df, aes(x='pca-one', y='pca-two', color='label')) +
         geom_point(size=2, alpha=1) +
         ggtitle("First and Second Principal Components  by Label"))

chart.save('PCA.png')
print("Image saved as PCA.png in current dir")

import time

from sklearn.manifold import TSNE

time_start = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=500)
tsne_results = tsne.fit_transform(df[feat_cols].values)
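# time_start is recorded above; report how long the t-SNE fit took:
print('t-SNE finished in {:.2f} seconds'.format(time.time() - time_start))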

df_tsne = df.copy()
df_tsne['x-tsne'] = tsne_results[:, 0]
df_tsne['y-tsne'] = tsne_results[:, 1]

chart = ggplot(df_tsne, aes(
    x='x-tsne', y='y-tsne', color='label')) + geom_point(
        size=2, alpha=1) + ggtitle("t-SNE dimensions colored by Label")
chart.save('tsne.png')

print("t-SNE plot has been saved in current dir as tsne.png")
Esempio n. 56
torch.manual_seed(42)

for epoch in range(num_epochs):
    train_images = overall_train_data
    train_images = train_images.requires_grad_()
    # Clear gradients w.r.t. parameters
    optimizer4_1.zero_grad()
    # Forward pass to get output/logits
    train_outputs = model4(train_images)

    # Get predictions from the maximum value
    _, predicted_labels = torch.max(train_outputs.data, 1)

    f = train_outputs.detach().numpy()
    tsne = TSNE(n_components=2, verbose=1)
    t1 = tsne.fit_transform(f)
    fig, ax = plt.subplots()
    groups = predicted_labels.numpy()
    for g in np.unique(groups):
        i = np.where(groups == g)
        ax.scatter(t1[i, 0], t1[i, 1], label=g)
    ax.legend([
        'airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog',
        'horse', 'ship', 'truck'
    ])
    plt.show()

    train_labels = overall_train_labels

    # Calculate Loss: softmax --> cross entropy loss
    train_loss = criterion(train_outputs, train_labels)
Esempio n. 57
                             'Disciplinary failure', 'Education',
                             'Social drinker', 'Social smoker'
                         ])

    cdf = cdf.drop(labels=[
        'Reason for absence', 'Month of absence', 'Day of the week', 'Seasons',
        'Disciplinary failure', 'Education', 'Social drinker', 'Social smoker'
    ], axis=1).astype(np.float64)

    # Center the dataset (with_std=False performs mean removal only, without unit-variance scaling)
    ss = StandardScaler(with_std=False)
    sdf = ss.fit_transform(cdf)

    # Perform the TSNE non-linear dimensionality reduction
    tsne = TSNE(n_components=2, perplexity=15, random_state=1000)
    data_tsne = tsne.fit_transform(sdf)

    df_tsne = pd.DataFrame(data_tsne, columns=['x', 'y'], index=cdf.index)
    dff = pd.concat([cdf, df_tsne], axis=1)

    # Show the dataset
    sns.set()

    fig, ax = plt.subplots(figsize=(18, 11))

    with sns.plotting_context("notebook", font_scale=1.5):
        sns.scatterplot(x='x',
                        y='y',
                        size='Age',
                        sizes=(30, 400),
                        palette=sns.color_palette("husl", 2),
Esempio n. 58
        optimizer.zero_grad()  # clear gradients for this training step
        loss.backward()  # back propagation, compute gradients
        optimizer.step()  # apply gradients

        if step % BATCH_SIZE == 0:
            test_output, last_layer = cnn(test_x)
            pred_y = torch.max(test_output, 1)[1].data.numpy()
            accuracy = float(
                (pred_y == test_y.data.numpy()).astype(int).sum()) / float(
                    test_y.size(0))
            print('Epoch: ', epoch, '| train loss: %.4f' % loss.data.numpy(),
                  '| test accuracy: %.2f' % accuracy)
            if HAS_SK:
                # Visualization of trained flatten layer (T-SNE)
                tsne = TSNE(perplexity=30,
                            n_components=2,
                            init='pca',
                            n_iter=5000)
                plot_only = 500
                low_dim_embs = tsne.fit_transform(
                    last_layer.data.numpy()[:plot_only, :])
                labels = test_y.numpy()[:plot_only]
                plot_with_labels(low_dim_embs, labels)

plt.ioff()

# print 10 predictions from test data
test_output, _ = cnn(test_x[:10])
pred_y = torch.max(test_output, 1)[1].data.numpy()
print(pred_y, 'prediction number')
print(test_y[:10].numpy(), 'real number')
Esempio n. 59
            classes = classes.numpy()

            #outputs = outputs.cpu()
            #X = X.to(device)

            X = np.vstack((X, outputs))
            Y_class = np.concatenate((Y_class, classes))
            Y_domain = np.concatenate((Y_domain, np.array([1 for _ in range(inputs.size(0))], dtype=np.int16)))
            
            print("Target stpes: [{}/{}]".format(i, steps))
        
        print(X.shape)
        print(Y_class.shape)
        print(Y_domain.shape)

    X_tsne = tsne.fit_transform(X)
    print("Org data dimension is {}. Embedded data dimension is {}".format(X.shape[-1], X_tsne.shape[-1]))

    x_min, x_max = X_tsne.min(0), X_tsne.max(0)
    X_norm = (X_tsne - x_min) / (x_max - x_min)


    color = ['r', 'g', 'b', 'k', 'gold', 'm', 'c', 'orange', 'cyan', 'pink']
    class_color = [color[label] for label in Y_class]
    domain_color = [color[label] for label in Y_domain]


    plt.figure(1, figsize=(8, 8))
    plt.scatter(X_norm[:, 0], X_norm[:, 1], c=class_color, s=1)
    plt.savefig("./dann{}_{}_class.png".format(source, target))
    plt.close("all")
Esempio n. 60
X = model[model.wv.vocab]

label = list(reversed(sorted(l_k, key=f_key)))

X_sort = X
for j in range(0, len(label)):
    X_sort[j, :] = model[label[j]]

# t-SNE visualization

plot_i = 0
plot_f = 500

tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X_sort[plot_i:plot_f, :])


def plot_with_labels(low_dim_embs, labels):
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')