Example #1
def visualize():
  training_data, validation_data, test_data = mnist_loader.load_data_wrapper()
  # Unzipping gives tuples, but we want arrays of values.
  training_input = [x.transpose()[0] for x in zip(*training_data)[0]]
  test_input = [x.transpose()[0] for x in zip(*test_data)[0]]
  # Get the y values.
  test_target = [y for y in zip(*test_data)[1]]

  # Apply SVD to the training input.
  u, s, v = np.linalg.svd(training_input, full_matrices=False)
  print u.shape
  print s.shape
  print v.shape
  
  print "Generating embeddings..."
  #print v[0]
  print v[0].shape
  embeddings = [np.dot(test_inp, np.transpose(v[:10][:])) for test_inp in test_input]
  print embeddings[0].shape
  
  # Do dimensionality reduction into 2 dimensions.
  print "Performing dimensionality reduction using t-sne..."
  tsne = TSNE()
  reduced_vecs = tsne.fit_transform(embeddings)
  print reduced_vecs[0]

  # Graph all of the points, where points corresponding to the same digit will have the same color.
  colors = ['r', 'b', 'g', 'c', 'm', 'k', 'y', (.2, .2, .2), (.4, 0, .5), (.8, .2, 0)]
  red_patch = mpatches.Patch(color='red', label='1')
  patches = [mpatches.Patch(color=colors[i], label='%i'% i) for i in range(len(colors))]
  plt.legend(handles=patches)
  for i in range(len(reduced_vecs)):
    plt.plot([reduced_vecs[i][0]], [reduced_vecs[i][1]], 'o', color=colors[test_target[i]])
  plt.show()
Example #2
def add_tsne_features(x_train, x_test):
    print('add_tsne_features <<')

    x_train_data = x_train.data_
    x_test_data = x_test.data_

    x = np.vstack((x_train_data, x_test_data))

    print('applying pca...')
    pca = PCA(n_components=25)
    x_pca = pca.fit_transform(x)

    print('applying t-SNE...')
    tsne_model = TSNE(n_components=2, random_state=0)
    x_tsne = tsne_model.fit_transform(x_pca)
    x_train_data = np.hstack((x_train_data, x_tsne[:x_train_data.shape[0], :]))
    x_test_data = np.hstack((x_test_data, x_tsne[-x_test_data.shape[0]:, :]))

    assert(x_train.columns_ == x_test.columns_)
    columns = x_train.columns_ + ['tsne_1', 'tsne_2']
    x_train = DataSet(x_train.ids_, columns, x_train_data)
    x_test = DataSet(x_test.ids_, columns, x_test_data)

    print('add_tsne_features >>')
    return x_train, x_test
Example #3
    def sendTSNE(self, people):
        d = self.getData()
        if d is None:
            return
        else:
            (X, y) = d

        X_pca = PCA(n_components=50).fit_transform(X, X)
        tsne = TSNE(n_components=2, init='random', random_state=0)
        X_r = tsne.fit_transform(X_pca)

        yVals = list(np.unique(y))
        colors = cm.rainbow(np.linspace(0, 1, len(yVals)))

        # print(yVals)

        plt.figure()
        for c, i in zip(colors, yVals):
            name = "Unknown" if i == -1 else people[i]
            plt.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=name)
            plt.legend()

        imgdata = StringIO.StringIO()
        plt.savefig(imgdata, format='png')
        imgdata.seek(0)

        content = 'data:image/png;base64,' + \
                  urllib.quote(base64.b64encode(imgdata.buf))
        msg = {
            "type": "TSNE_DATA",
            "content": content
        }
        self.sendMessage(json.dumps(msg))
Example #4
def display_closestwords_tsnescatterplot(model, word):
    arr = np.empty((0,300), dtype='f')
    word_labels = [word]

    # get close words
    close_words = model.similar_by_word(word)
    
    # add the vector for each of the closest words to the array
    arr = np.append(arr, np.array([model[word]]), axis=0)
    for wrd_score in close_words:
        wrd_vector = model[wrd_score[0]]
        word_labels.append(wrd_score[0])
        arr = np.append(arr, np.array([wrd_vector]), axis=0)
        
    # find tsne coords for 2 dimensions
    tsne = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)

    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    # display scatter plot
    plt.scatter(x_coords, y_coords)

    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    plt.xlim(x_coords.min()+0.00005, x_coords.max()+0.00005)
    plt.ylim(y_coords.min()+0.00005, y_coords.max()+0.00005)
    plt.show()
Example #5
def make_tsne_plot(model, rel_wds, plot_lims, title):

    dim = 30
    X, keys = make_data_matrix(model)

    # first we actually do PCA to reduce the
    # dimensionality to make tSNE easier to calculate
    X_std = StandardScaler().fit_transform(X)
    sklearn_pca = PCA(n_components=dim)
    X = sklearn_pca.fit_transform(X_std)[:,:dim]

    # do downsample
    k = 5000
    sample = []
    important_words = []
    r_wds = [word[0] for word in rel_wds]
    for i, key in enumerate(keys):
        if key in r_wds:
            sample.append(i)
    sample = np.concatenate((np.array(sample),
                np.random.choice(len(keys), k-10, replace = False),
             ))
    X = X[sample,:]
    keys = [keys[i] for i in sample]



    # Do tSNE
    tsne = TSNE(n_components=2, random_state=0, metric="cosine")
    X_transf = tsne.fit_transform(X)

    k_means = KMeans(n_clusters=8)
    labels = k_means.fit_predict(X_transf)

    scatter_plot(X_transf[:,0], X_transf[:,1],  rel_wds, labels, title, keys, plot_lims)
def plot_features(subject, data_path, model_path, test_labels, dataset='test'):
    with open(model_path + '/' + subject + '.pickle', 'rb') as f:
        state_dict = cPickle.load(f)
    cnn = ConvNet(state_dict['params'])
    cnn.set_weights(state_dict['weights'])
    scalers = state_dict['scalers']

    if dataset == 'test':
        d = load_test_data(data_path, subject)
        x = d['x']
        y = test_labels['preictal']
    elif dataset == 'train':
        d = load_train_data(data_path, subject)
        x, y = d['x'], d['y']
    else:
        raise ValueError('dataset')

    x, _ = scale_across_time(x, x_test=None, scalers=scalers) if state_dict['params']['scale_time'] \
        else scale_across_features(x, x_test=None, scalers=scalers)

    cnn.batch_size.set_value(x.shape[0])
    get_features = theano.function([cnn.x, Param(cnn.training_mode, default=0)], cnn.feature_extractor.output,
                                 allow_input_downcast=True)

    logits_test = get_features(x)
    model = TSNE(n_components=2, random_state=0)
    z = model.fit_transform(np.float64(logits_test))
    plt.scatter(z[:, 0], z[:, 1], s=60, c=y)
    plt.show()
Example #7
def plot_data(data, has_label=True):
	import numpy as np
	import seaborn as sns
	from sklearn.manifold import TSNE
	from sklearn.decomposition import PCA

	if not has_label:
		data = data.copy()
		data['label'] = np.zeros([len(data),1])

	LIMIT = 4000
	if data.shape[0] > LIMIT:
		dt = data.sample(n=LIMIT, replace=False)
		X = dt.ix[:,:-1]
		labels = dt.ix[:,-1]
	else:
		X = data.ix[:,:-1]
		labels = data.ix[:,-1]

	tsne_model = TSNE(n_components=2, random_state=0)
	np.set_printoptions(suppress=True)
	points1 = tsne_model.fit_transform(X)
	df1 = pd.DataFrame(data=np.column_stack([points1,labels]), columns=["x","y","class"])
	sns.lmplot("x", "y", data=df1, hue='class', fit_reg=False, palette=sns.color_palette('colorblind'))
	sns.plt.title('TNSE')

	pca = PCA(n_components=2)
	pca.fit(X)
	points2 = pca.transform(X)
	df2 = pd.DataFrame(data=np.column_stack([points2,labels]), columns=["x","y","class"])
	sns.lmplot("x", "y", data=df2, hue='class', fit_reg=False, palette=sns.color_palette('colorblind'))
	sns.plt.title('PCA')
Example #8
    def plot_mfi(self, outputfile='embeddings.pdf', nb_clusters=8, weights='NA'):
        # collect embeddings for mfi:
        X = np.asarray([self.w2v_model[w] for w in self.mfi \
                            if w in self.w2v_model], dtype='float32')
        # dimension reduction:
        tsne = TSNE(n_components=2)
        coor = tsne.fit_transform(X) # unsparsify

        plt.clf()
        sns.set_style('dark')
        sns.plt.rcParams['axes.linewidth'] = 0.4
        fig, ax1 = sns.plt.subplots()  

        labels = self.mfi
        # first plot slices:
        x1, x2 = coor[:,0], coor[:,1]
        ax1.scatter(x1, x2, 100, edgecolors='none', facecolors='none')
        # clustering on top (add some colouring):
        clustering = AgglomerativeClustering(linkage='ward',
                            affinity='euclidean', n_clusters=nb_clusters)
        clustering.fit(coor)
        # add names:
        for x, y, name, cluster_label in zip(x1, x2, labels, clustering.labels_):
            ax1.text(x, y, name, ha='center', va="center",
                     color=plt.cm.spectral(cluster_label / 10.),
                     fontdict={'family': 'Arial', 'size': 8})
        # control aesthetics:
        ax1.set_xlabel('')
        ax1.set_ylabel('')
        ax1.set_xticklabels([])
        ax1.set_xticks([])
        ax1.set_yticklabels([])
        ax1.set_yticks([])
        sns.plt.savefig(outputfile, bbox_inches=0)
Example #9
def perform_AE(X, dim=2, tsne=False):
    y = np.zeros(shape=X.shape[0], dtype=int)
    
    if tsne:
        hidden_layers = [X.shape[1], 500, 100, 32]
        encoder_weights, decoder_weights = pretrain(X, hidden_layers)
        X_32d = ae(X, encoder_weights, decoder_weights, hidden_layers)

        ae_tsne = TSNE(n_components=dim, learning_rate=800, verbose=1)
        X_2d = ae_tsne.fit_transform(X_32d)

        method = 'ae_tsne_scaled'
    ### END - if tsne

    else:
        hidden_layers = [X.shape[1], 500, 100, 20, dim]
        encoder_weights, decoder_weights = pretrain(X, hidden_layers)
        X_2d = ae(X, encoder_weights, decoder_weights, hidden_layers)
        
        method = 'ae_scaled'
    ### END - else

    print('***** ' + method + ' *****')
    cluster(X_2d, method)
    np.save("{0}_{1}_X_2d".format(species, method), X_2d)
Example #10
def plot_phonemes(path):
    phoneme_embeddings = dict()
    for line in codecs.open(path,"r"):
        line = line.split(",")
        key= line[0][1:-1]
        emb = line[1:]
        emb[-1] = emb[-1][:-1]
        emb = np.array([float(e) for e in emb])
        phoneme_embeddings[key] = emb
    
    phoneme_embeddings = DataFrame(phoneme_embeddings,columns=phoneme_embeddings.keys())
    print(phoneme_embeddings.columns)
    
    m = TSNE()
    phoneme_embeddings_tsne = m.fit_transform(phoneme_embeddings.transpose())
    print(len(phoneme_embeddings_tsne))
    for p,emb in zip(phoneme_embeddings.columns, phoneme_embeddings_tsne):
        c = "black"
        if regex.search("^[aeiou3E][*]?$", p):
            c = "red"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*w~$", p):
            c = "blue"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*y~$", p):
            c = "yellow"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*h~$", p):
            c = "brown"
            plt.annotate(p,(emb[0],emb[1]),color=c)
        if regex.search("^.*\"$", p):
            c = "green"
            plt.annotate(p,(emb[0],emb[1]),color=c)
Example #11
def t_sne_view(norm_table, subj_cond, cohorts, image_type):

    # t-SNE analysis: Use stochastic neighbor embedding to reduce dimensionality of
    # data set to two dimensions in a non-linear, distance dependent fashion

    # Perform PCA data reduction if dimensionality of feature space is large:
    if len(norm_table.columns) > 12:
        pca = PCA(n_components = 12)
        pca.fit(norm_table.as_matrix())
        
        raw_data = pca.transform(norm_table.as_matrix())
    else:
        raw_data = norm_table.as_matrix()
 
    # Transform data into a two-dimensional embedded space:
    tsne = TSNE(n_components = 2, perplexity = 40.0, early_exaggeration= 2.0, 
        learning_rate = 100.0, init = 'pca')

    tsne_data = tsne.fit_transform(raw_data)

    # Prepare for normalization and view:
    cols = ['t-SNE', 'Cluster Visualization']
    tsne_table = pd.DataFrame(tsne_data, index = norm_table.index, columns = cols)
           
    # The output is no longer centered or normalized, so shift & scale it before display:
    tsne_avg = ppmi.data_stats(tsne_table, subj_cond, cohorts)
    tsne_norm_table = ppmi.normalize_table(tsne_table, tsne_avg)       
    
    # Send out to graphics rendering engine:

    if (image_type == 'Gauss'):
        return scg.scatter_gauss(tsne_norm_table[cols[0]], tsne_norm_table[cols[1]], subj_cond)
    elif (image_type == 'Scatter'):
        return scg.scatter_plain(tsne_norm_table[cols[0]], tsne_norm_table[cols[1]], subj_cond)
def main():
    embedding = WordEmbedding(embeddingpath(default_embeddingconfig))


    for old, new in spelling_changes:
        print(old, '--', new)
        print(embedding.nearest_words([old]))
        print()

    print()
    war, ist = tense_changes[0]
    tensediff = embedding[ist] - embedding[war]
    for past, present in tense_changes[1 : ]:
        print(past, '+ tensediff:', *embedding.nearest_words([embedding[past] + tensediff]))
        print('Should be:', present)
        print()

    # word_diffs = [embedding[new] - embedding[old] for (old, new) in word_changes]

    spelling_diffs = [embedding[new] - embedding[old] for (old, new) in spelling_changes[10 : 20]]
    tense_diffs = [embedding[present] - embedding[past] for (past, present) in tense_changes]

    def metric(u, v):
        return max(distance.cosine(u, v), 0)

    while True:
        try:
            model = TSNE(n_components=2, metric=metric)
            reduced = model.fit_transform(spelling_diffs + tense_diffs)
            print(reduced)
            return
        except Exception:
            pass
Example #13
def vizualize_clusters(X, y, py, hist=None):
    """ Using T-SNE to visualize the site clusters.
        Plot and save the scatter (and the histogram).
    """
    model = TSNE(n_components=2, random_state=0)

    fig = model.fit_transform(X, y)
    fig1 = model.fit_transform(X, py)

    pyplot.figure(figsize=(16, 8))
    pyplot.subplot(121)

    classes = list(set(y))
    for c, color in zip(classes, plt.colors.cnames.iteritems()):
        indeces = [i for i, p in enumerate(y) if p == c]
        pyplot.scatter(fig[indeces, 0], fig[indeces, 1], marker="o", c=color[0])

    pyplot.subplot(122)

    clusters = list(set(py))
    for c, color in zip(clusters, plt.colors.cnames.iteritems()):
        indeces = [i for i, p in enumerate(py) if p == c]
        pyplot.scatter(fig1[indeces, 0], fig1[indeces, 1], marker="o", c=color[0])

    # pyplot.show()
    pyplot.savefig("clusters" + "_scatter.png")

    if hist is not None:
        pyplot.figure(figsize=(4, 4))
        pyplot.xticks(clusters)

        pyplot.bar(clusters, hist, align="center")
        # pyplot.show()
        pyplot.savefig("clusters" + "_hist.png")
Example #14
    def plot_mean_activation_and_stuff(some_probs, Y, do_tsne=False):
        pyplot.clf()
        probs = numpy.float32(some_probs)
        xv = numpy.arange(probs.shape[1])#probs.var(axis=0)
        yv = probs.mean(axis=0)
        pyplot.axis([-0.1, probs.shape[1],0,1])
        for k in range(probs.shape[1]):
            pyplot.plot(xv[k]*numpy.ones(probs.shape[0]),probs[:,k],'o',ms=4.,
                        markeredgecolor=(1, 0, 0, 0.01),
                        markerfacecolor=(1, 0, 0, 0.01),)
        pyplot.plot(xv,yv, 'bo')
        pyplot.show(block=False)
        if do_video:
            pyplot.savefig(video.stdin, format='jpeg')
            video.stdin.flush()
        pyplot.savefig('epoch_probs.png')

        if not do_tsne: return
        try:
            from sklearn.manifold import TSNE
            tsne = TSNE(random_state=0)
            ps = tsne.fit_transform(numpy.float64(probs[:400]))
            pyplot.clf()
            Y = numpy.int32(Y)[:400]
            for i,c,s in zip(range(10),list('bgrcmyk')+[(.4,.3,.9),(.9,.4,.3),(.3,.9,.4)],'ov'*5):
                sub = ps[Y == i]
                pyplot.plot(sub[:,0], sub[:,1], s,color=c,ms=3,mec=c)
            pyplot.show(block=False)
            pyplot.savefig('probs_embed.png')
        except ImportError:
            print "cant do tsne"
Example #15
def visualize_latent_rep(args, model, x_latent):
    print("pca_on=%r pca_comp=%d tsne_comp=%d tsne_perplexity=%f tsne_lr=%f" % (
        args.use_pca,
        args.pca_components,
        args.tsne_components,
        args.tsne_perplexity,
        args.tsne_lr
    ))

    if args.use_pca:
        pca = PCA(n_components = args.pca_components)
        x_latent = pca.fit_transform(x_latent)

    figure(figsize=(6, 6))
    scatter(x_latent[:, 0], x_latent[:, 1], marker='.')
    show()

    tsne = TSNE(n_components = args.tsne_components,
                perplexity = args.tsne_perplexity,
                learning_rate = args.tsne_lr,
                n_iter = args.tsne_iterations,
                verbose = 4)
    x_latent_proj = tsne.fit_transform(x_latent)
    del x_latent

    figure(figsize=(6, 6))
    scatter(x_latent_proj[:, 0], x_latent_proj[:, 1], marker='.')
    show()
Example #16
def infer(FLAGS):
    """
    Inference.
    """

    # Retrieve embeddings for docs
    words = ["tennis", "wimbledon", "icecream", "cake", "bear", "pie"]

     # Get index in doc embeddings
    with open(os.path.join(basedir, FLAGS.data_dir, "doc_to_idx.json"), 'r') as f:
        doc_to_idx = json.load(f)

    # Load the trained model
    model = torch.load(os.path.join(basedir, FLAGS.data_dir, "model.pt"))
    doc_embeddings = model.doc_embeddings.weight.data

    my_embeddings = np.array(
        [doc_embeddings[doc_to_idx[word]].numpy() for word in words])

    # Use TSNE model to reduce dimensionality
    model = TSNE(n_components=2, random_state=0)
    points = model.fit_transform(my_embeddings)

    # Visualize
    for i, word in enumerate(words):
        x, y = points[i, 0]*1e4, points[i, 1]*1e4
        plt.scatter(x, y)
        plt.annotate(word, xy=(x, y), xytext=(25, 5),
            textcoords='offset points', ha='right', va='bottom')
    plt.show()
Example #17
def labtest_TSNE(PID):
    data = [patients[pid]['tests'] for pid in PID]
    X = pp.scale(data)
    tsne = TSNE(n_components=2, perplexity=30.0, learning_rate=1000.0, n_iter=1000, n_iter_without_progress=30, min_grad_norm=1e-07, angle=0.5)
    pos = tsne.fit(X).embedding_
    
    return pos
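
A minimal usage sketch for the snippet above. It assumes, as the excerpt suggests but does not show, that "patients" is a module-level dict keyed by patient ID whose entries carry a fixed-length 'tests' vector, and that "pp" is sklearn.preprocessing; the toy data below is purely illustrative.

# Hypothetical setup (not part of the original example):
import numpy as np
from sklearn import preprocessing as pp
from sklearn.manifold import TSNE

patients = {'p%03d' % i: {'tests': np.random.rand(20)} for i in range(60)}

pos = labtest_TSNE(list(patients.keys()))
print(pos.shape)  # (60, 2)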
Example #18
def project_in_2D(distance_mat, method='mds'):
  """
  Project SDRs onto a 2D space using manifold learning algorithms
  :param distance_mat: A square matrix with pairwise distances
  :param method: Select method from 'mds' and 'tSNE'
  :return: an array with dimension (numSDRs, 2). It contains the 2D projections
     of each SDR
  """
  seed = np.random.RandomState(seed=3)

  if method == 'mds':
    mds = MDS(n_components=2, max_iter=3000, eps=1e-9,
              random_state=seed,
              dissimilarity="precomputed", n_jobs=1)

    pos = mds.fit(distance_mat).embedding_

    nmds = MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12,
               dissimilarity="precomputed", random_state=seed,
               n_jobs=1, n_init=1)

    pos = nmds.fit_transform(distance_mat, init=pos)
  elif method == 'tSNE':
    tsne = TSNE(n_components=2, init='pca', random_state=0)
    pos = tsne.fit_transform(distance_mat)
  else:
    raise NotImplementedError

  return pos
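
A brief usage sketch under toy data, assuming numpy and sklearn.manifold's MDS/TSNE are imported in the function's own module as it expects; the random SDR-like vectors and the Hamming metric below are illustrative only.

# Hypothetical usage (toy data, not from the original source):
import numpy as np
from scipy.spatial.distance import pdist, squareform

rng = np.random.RandomState(0)
sdrs = rng.randint(0, 2, size=(40, 256))             # 40 sparse binary vectors
distance_mat = squareform(pdist(sdrs, metric='hamming'))

pos_mds = project_in_2D(distance_mat, method='mds')
pos_tsne = project_in_2D(distance_mat, method='tSNE')
print(pos_mds.shape, pos_tsne.shape)                  # both (40, 2)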
Example #19
def dim_survey(X, entry_id):

    # convert to numpy
    X = np.array(X)

    # run the reduction.
    X_pca = PCA(n_components=3).fit_transform(X)
    X_tsne = TSNE(n_components=3).fit_transform(X)
    X_ica = FastICA(n_components=3).fit_transform(X)

    # connect to db.
    with mongoctx() as db:

        # update the stuff.
        db['entry'].update(
            {
                '_id': ObjectId(entry_id)
            },
            {
                '$set': {
                    'pca': X_pca.tolist(),
                    'tsne': X_tsne.tolist(),
                    'ica': X_ica.tolist(),
                }
            }
        )
Example #20
def topic_dimen_reduce(words, word2vec):
    dictionary, matrix = terms_analysis.get_words_matrix(words, word2vec)
    pca = PCA(n_components=50)
    pca_matrix = pca.fit_transform(matrix)
    tsne = TSNE(n_components=2)
    t_matrix = tsne.fit_transform(pca_matrix)
    return dictionary, t_matrix
def main():
    Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()
    dbn = DBN([1000, 750, 500], UnsupervisedModel=AutoEncoder)
    # dbn = DBN([1000, 750, 500, 10])
    output = dbn.fit(Xtrain, pretrain_epochs=2)
    print "output.shape", output.shape

    # sample before using t-SNE because it requires lots of RAM
    sample_size = 600
    tsne = TSNE()
    reduced = tsne.fit_transform(output[:sample_size])
    plt.scatter(reduced[:,0], reduced[:,1], s=100, c=Ytrain[:sample_size], alpha=0.5)
    plt.title("t-SNE visualization")
    plt.show()

    # t-SNE on raw data
    reduced = tsne.fit_transform(Xtrain[:sample_size])
    plt.scatter(reduced[:,0], reduced[:,1], s=100, c=Ytrain[:sample_size], alpha=0.5)
    plt.title("t-SNE visualization")
    plt.show()

    pca = PCA()
    reduced = pca.fit_transform(output)
    plt.scatter(reduced[:,0], reduced[:,1], s=100, c=Ytrain, alpha=0.5)
    plt.title("PCA visualization")
    plt.show()
Example #22
def apply_tSNE30(proj_data, proj_weights=None):
    model = TSNE(n_components=2, perplexity=30.0, metric="euclidean",
                 learning_rate=100, early_exaggeration=4.0,
                 random_state=RANDOM_SEED);
    norm_data = normalize_columns(proj_data);
    result = model.fit_transform(norm_data.T);
    return result;
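
The helper normalize_columns and the constant RANDOM_SEED are referenced above but not shown in this excerpt. A minimal stand-in, assuming column-wise scaling to unit L2 norm (the actual helper in the source project may differ):

# Hypothetical stand-ins so the snippet above can be exercised:
import numpy as np
from sklearn.manifold import TSNE

RANDOM_SEED = 42  # assumed value

def normalize_columns(data):
    # assumed behaviour: scale each column to unit L2 norm
    norms = np.linalg.norm(data, axis=0)
    norms[norms == 0] = 1.0
    return data / norms

proj_data = np.random.rand(50, 200)   # 50 features x 200 samples
coords = apply_tSNE30(proj_data)      # one 2-D point per column of proj_data
print(coords.shape)                    # (200, 2)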
Example #23
def tsnePlot(plotname, modelName, word, dest):
    
    """Plots a tsne graph of words most similar to the word passed in the argument (as represented in the model previously calculated)"""
    
    model = word2vec.Word2Vec.load(modelName)
    words = [model.most_similar(word)[i][0] for i in range(0, len(model.most_similar(word)))]
    words.append(word)

    # nested list containing the 100-dimensional word vector of each most-similar word
    
    word_vectors = [model[word] for word in words]
    word_vectors = np.array(word_vectors)

    tsne_model = TSNE(n_components=2, random_state=0)
    Y = tsne_model.fit_transform(word_vectors)
    sb.plt.plot(Y[:,0], Y[:,1], 'o') 

    for word, x, y in zip(words, Y[:,0], Y[:,1]):  
        sb.plt.annotate(word, (x, y), size=12)
        #sb.plt.pause(10)

    plotname = plotname + ".png"

    if not os.path.exists(dest):
        os.makedirs(dest)

    path = os.path.join(dest, plotname)

    sb.plt.savefig(path)
Example #24
def plotTSNEDecisionBoundaries(): 
    
    tsne = TSNE()
    tsne_data = tsne.fit_transform(feature_set)
    x_min,x_max = tsne_data[:,0].min()-1, tsne_data[:,0].max() + 1
    y_min,y_max = tsne_data[:,1].min()-1, tsne_data[:,1].max() + 1
    step_size = 2.0
    
    xx,yy = np.meshgrid(np.arange(x_min,x_max,step_size),np.arange(y_min,y_max,step_size))
    
    for index,classifier in enumerate(classifiers):
        
        plt.subplot(2,3,index+1)
        plt.subplots_adjust(wspace=0.5,hspace=0.5)
        classifier.fit(tsne_data,class_labels)
        
        Z = classifier.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        
        plt.contourf(xx,yy,Z,cmap=plt.cm.Paired,alpha=0.7)
        plt.scatter(tsne_data[:,0],tsne_data[:,1],c=class_labels,cmap=plt.cm.rainbow,alpha=0.6)
        plt.xlabel("Feature 1")
        plt.ylabel("Feature 2")
        plt.xlim(x_min,x_max)
        plt.ylim(y_min,y_max)
        plt.xticks(())
        plt.yticks(())
        plt.title(classifier_names[index])
        
    plt.show()
Example #25
def tsne_plot(model):
    #"Creates and TSNE model and plots it"
    labels = []
    tokens = []

    for word in model.wv.vocab:
        tokens.append(model[word])
        labels.append(word)

    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])

    plt.figure(figsize=(16, 16))
    for i in range(len(x)):
        plt.scatter(x[i], y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()
def plotly_js_viz(word_2_vec_model):
    tsne_model=TSNE(n_components=2,random_state=5)
    data=tsne_model.fit_transform(word_2_vec_model.syn0)
    xd=list(data[:,0])
    yd=list(data[:,1])
    names_our=word_2_vec_model.index2word
    plot([Scatter(x=xd,y=yd,mode="markers",text=names_our)])
Example #27
def visualization(result, word_dict):
	tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
	plot_only = 500

	low_dim_embs = tsne.fit_transform(result[0:500])
	labels = [ word_dict[i] for i in range(500) ]
	plot_with_labels(low_dim_embs, labels)
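
A short usage sketch for the function above, assuming result is an (N, d) embedding array with N >= 500 and word_dict maps integer indices to words; a compatible plot_with_labels helper appears in Example #36 below. The toy data is illustrative only.

# Hypothetical usage (toy data):
import numpy as np

result = np.random.rand(600, 128)                   # 600 embedding vectors
word_dict = {i: 'word_%d' % i for i in range(600)}  # index -> word
visualization(result, word_dict)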
 def reduce_dimentionality(self):
     self.vectors = []
     for key in self.selected_words:
         self.vectors.append(self.model[key])
     tnse_model = TSNE(n_components=2, random_state=0)
     np.set_printoptions(suppress=True)
     self.reduced_vectors = tnse_model.fit_transform(self.vectors)
Example #29
def PlotTSNE (data, labels):										#Takes the data and the labels
	# Visualize the results on TSNE reduced data

	print "BUSY IN TSNE"

	model = TSNE(n_components=2, random_state=0)
	reduced_data = model.fit_transform(data)

	print "DATA REDUCED"

	# Plot the decision boundary. For that, we will assign a color to each
	x_min, x_max = reduced_data[:, 0].min(), reduced_data[:, 0].max()
	y_min, y_max = reduced_data[:, 1].min(), reduced_data[:, 1].max()
	
	plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
	
	#Adds labels to the plot
	for label, x, y in zip(labels, reduced_data[:, 0], reduced_data[:, 1]):
	    plt.annotate(
	        label, 
	        xy = (x, y), xytext = (-20, 20),
	        textcoords = 'offset points', ha = 'right', va = 'bottom',
	        bbox = dict(boxstyle = 'round,pad=0.5', fc = 'green', alpha = 0.5),
	        arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))

	plt.title('TSNE Plot')
	plt.xlim(x_min, x_max)
	plt.ylim(y_min, y_max)
	plt.xticks(())
	plt.yticks(())
	plt.show()
def performDimensionalityReduction(context_vector, n_component, perplexity):
    '''
        Applies TSNE on the feature vector of each of the word instances and creates
        one model for each word type
    '''
    feature_vector_data = defaultdict(dict)
    word_type_model     = {}
    
    for word_type, word_type_data in context_vector.iteritems():
        feature_vector_word_type = OrderedDict()
        
        #Reading in all the feature vectors for the given word type
        for data_type, instance_details in word_type_data.iteritems():
            for instance, context_details in instance_details.iteritems():
                
                #Training data with have the sense id's while test data will have ['<UNKNOWN>']
                senses = context_details.get('Sense')
                for sense in senses:
                    feature_vector_word_type[(instance, sense, data_type)] = context_details["Feature_Vector"]
        
        #Applying TSNE on all the feature vectors
        feature_vector_array = np.array(feature_vector_word_type.values())
        model = TSNE(n_components=n_component, random_state=0, perplexity=perplexity, metric="cosine")
        model.fit(feature_vector_array)
        
        #Storing the model since it will be needed to fit the test data
        word_type_model[word_type] = model
        
        #Converting to a structure of {WordType: {(instanceID, senseID): FeatureVector ... }}
        for i in range(len(feature_vector_word_type)):
            feature_vector_data[word_type][feature_vector_word_type.keys()[i]] = list(model.embedding_[i])

    return feature_vector_word_type, word_type_model
     'model': SVC(random_state=0, probability=True, kernel='rbf'),
     'methods': ['predict', 'predict_proba'],
     'dataset': 'classifier',
 },
 {
     'model': SVC(random_state=0, probability=True, kernel='linear'),
     'methods': ['predict', 'predict_proba'],
     'dataset': 'sparse',
 },
 {
     'model': SVC(random_state=0, probability=True, kernel='rbf'),
     'methods': ['predict', 'predict_proba'],
     'dataset': 'sparse',
 },
 {
     'model': TSNE(random_state=0),
     'methods': ['fit_transform'],
     'dataset': 'classifier',
 },
 {
     'model': KMeans(random_state=0, init="k-means++"),
     'methods': ['predict'],
     'dataset': 'blobs',
 },
 {
     'model': KMeans(random_state=0, init="random"),
     'methods': ['predict'],
     'dataset': 'blobs',
 },
 {
     'model': KMeans(random_state=0, init="k-means++"),
Example #32
dist_neg = distances[identical == 0]

# plt.figure(figsize=(12,4))
#
# plt.subplot(121)
# plt.hist(dist_pos)
# plt.axvline(x=opt_tau, linestyle='--', lw=1, c='lightgrey', label='Threshold')
# plt.title('Distances (pos. pairs)')
# plt.legend();
#
# plt.subplot(122)
# plt.hist(dist_neg)
# plt.axvline(x=opt_tau, linestyle='--', lw=1, c='lightgrey', label='Threshold')
# plt.title('Distances (neg. pairs)')
# plt.legend();
####****Till Here****####

####****This code is used to plot the learning done by our model. Basically plots the points representing each image on a graph****####
targets = np.array([m.name for m in metadata])
from sklearn.manifold import TSNE
# sklearn is the scikit-learn library for Python, used here for its machine-learning utilities
X_embedded = TSNE(n_components=2).fit_transform(embedded)

for i, t in enumerate(set(targets)):
    idx = targets == t
    plt.scatter(X_embedded[idx, 0], X_embedded[idx, 1], label=t)

plt.legend(bbox_to_anchor=(1, 1))
plt.show()
####****Till Here****####
Example #33
def sortTSNE(dataSpikes):
    (numSpikes, numChans, numSamples) = shape(dataSpikes.samples)
    allWaves = dataSpikes.samples.reshape(numSpikes, numChans*numSamples)

    model = TSNE(n_components=2, method='barnes_hut', verbose=20, n_iter=1000)
    Y = model.fit_transform(allWaves)
Example #34
        # Write logs
        if (iteration < 5) or (iteration % 100 == 99):
            lib.plot.flush(outf, logfile)

        lib.plot.tick()

        # Generation and reconstruction
        if iteration % 5000 == 4999:
            generate_image(iteration, _data)
            reconstruct_image(iteration)

        # Latent space visualization
        if iteration % 50000 == 49999:
            z_dev, z_mean_dev, y_dev = [], [], []
            for xb, yb in dev_gen():
                zb = session.run(q_z, feed_dict={real_x: xb})
                z_dev.append(zb)
                y_dev.append(yb)
            z_dev_2D = TSNE().fit_transform(np.vstack(z_dev))
            lib.visualization.scatter(
                data=z_dev_2D,
                label=np.hstack(y_dev),
                dir=outf,
                file_name='{}_mnist_manifold_{}.png'.format(MODE, iteration))

        # Save model
        if iteration == ITERS - 1:
            save_path = saver.save(
                session,
                os.path.join(outf,
                             '{}_mnist_model_{}.ckpt'.format(MODE, iteration)))
Example #35

	
#load risk factor docs
riskfactors_old = loadFacetDocs('./data/risk_abstracts.csv')
riskfactors = getRisks('./data/risk_sha.csv')


riskvectors = []
for factor in riskfactors:
	riskvectors.append(get_doc_vector(model, factor[1]))


print('start plotting')
#perplexity of 5 and learning rate of 500 give good results
tsne = TSNE(n_components=2, perplexity=5, learning_rate = 500)
num_abstract = np.array(model.docvecs.vectors_docs).shape[0]

#printSNE1(model.docvecs.vectors_docs)
#printSNE2(model.docvecs.vectors_docs, num_abstract)
#printSNE2(np.concatenate((model.docvecs.vectors_docs, riskvectors)), num_abstract)
#printClusterTasks(abstract_vectors, labels, k)
printTasksRisks(abstract_vectors, labels, k)

#print("Check dit")
#print(abstract_vectors[0:2])
#print(list_of_tasks[0:2])
#print(riskfactors[0:2])


Example #36
# Step 7: Visualize the embeddings.
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), \
      "More labels than embeddings"
    fig = plt.figure(figsize=(18, 18))  #in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')

    fig.savefig(filename)


try:
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt

    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    plot_only = 500
    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
    labels = [reverse_dictionary[i] for i in xrange(plot_only)]
    plot_with_labels(low_dim_embs, labels)

except ImportError:
    print("Please install sklearn and matplotlib to visualize embeddings.")
Example #37
# In non-text mode, plot all characters that appear at least once in the corpus
charpoints = [
    i for i in range(256)
    if ((HIDE_OTHER_TYPES or char_type(i) in ALLOWED_TYPES)
        #and (char_type(i) not in ['non-ascii', 'unused']) # hacky
        and (ALLOW_RARE or is_frequent(i))) or (
            not TEXT_MODE and char_counts[i] > 50)
]

if MODE == 'SNE':
    X_sne = TSNE(
        perplexity=4,
        n_iter=2000,
        learning_rate=25,
        n_iter_without_progress=100,  # the goggles do nothing
        #method='exact',
        early_exaggeration=4,
        verbose=2,
        random_state=8,
    ).fit_transform(embedding[charpoints])
elif MODE == 'tSVD':
    X_sne = sklearn.decomposition.TruncatedSVD(n_components=2).fit_transform(
        embedding[charpoints])
elif MODE == 'PCA':
    X_sne = sklearn.decomposition.PCA(n_components=2).fit_transform(
        embedding[charpoints])
else:
    assert False, "unrecognized mode"

plt.figure(figsize=(10, 10))
x_min, x_max = np.min(X_sne, 0), np.max(X_sne, 0)
Example #38
            if not len(vecs):
                break

            m.append(idx)
            vecs = np.array(vecs)
            tweets_w2v_avg.append(np.mean(vecs, axis=0))
            point_labels.append(film_label[k][labels_w2v[idx]])
            break

tweets_w2v = tweets_w2v[m]
tweets_w2v_avg = np.array(tweets_w2v_avg)
print("Selected {} tweets".format(len(tweets_w2v_avg)))

# In[15]:

tsne = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
Y = tsne.fit_transform(tweets_w2v_avg)

# In[16]:

plt.scatter(Y[:, 0], Y[:, 1])

# In[20]:

# plt.scatter(Y[:, 0], Y[:, 1], c=c)
# for label, x, y in zip(tweets_w2v, Y[:, 0], Y[:, 1]):
#     plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')

# In[21]:

def show_images(images, labels):
    _, figs = plt.subplots(1, len(images), figsize=(28, 28))
    for f, img, lbl in zip(figs, images, labels):
        f.imshow(img, cmap='gray')
        f.set_title(lbl)
        f.axes.get_xaxis().set_visible(False)
        f.axes.get_yaxis().set_visible(False)
    plt.show()


tsne = TSNE(n_components=hyper_params['n_components'],
            min_grad_norm=1e-5,
            init='pca',
            method='exact',
            angle=0.45,
            early_exaggeration=5,
            n_iter=1000)
pca = PCA(n_components=hyper_params['n_components'])

reduction_model = pca
all_code = np.concatenate([train_data, test_data], axis=0)
reduction_model.fit(all_code.reshape(all_code.shape[0], -1))
reduct_code = reduction_model.transform(all_code.reshape(
    all_code.shape[0], -1))

Q_code = reduct_code[:train_data.shape[0]]
Q1_code = reduct_code[train_data.shape[0]:]

train_len = Q_code.shape[0]
Example #40
                genotypes[counter].append(2)
            if j=='./.':
                genotypes[counter].append(9)
            counter+=1






            
#TRANSFORM TO tSNE
X = np.asarray(genotypes)  
pca_for_tSNE = PCA(n_components=15).fit_transform(genotypes)  
print(np.sum(PCA(n_components=10).fit(genotypes).explained_variance_ratio_))
X_embedded = TSNE(verbose=0,early_exaggeration=12.0,n_components=2,learning_rate=100.0,n_iter=1000,perplexity=10.0).fit_transform(pca_for_tSNE)    
#print(X_embedded.shape)   



#PLOTING
plt.figure(figsize=(100, 60))

COLORPALLETE=get_colors(len(set(true_labels)))
COLORZ_TO_LABELS={}

uniquelabels=[x for x in set(true_labels)]
for j in range(0,len(uniquelabels)):
    COLORZ_TO_LABELS[uniquelabels[j]]=COLORPALLETE[j]

colors=[ COLORZ_TO_LABELS[x] for x in true_labels]
Example #41
def fit_tsne(x, n_components=2, init='pca', *args, **kwargs):
    return TSNE(n_components=n_components, init=init, *args,
                **kwargs).fit_transform(x)
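
A brief usage sketch for the wrapper above, assuming TSNE is imported from sklearn.manifold in the wrapper's module; extra keyword arguments such as perplexity are simply forwarded to TSNE. Toy data only.

# Hypothetical usage (toy data):
import numpy as np

x = np.random.rand(200, 64)                        # 200 samples, 64 features
emb = fit_tsne(x, perplexity=20, random_state=0)   # kwargs are passed through to TSNE
print(emb.shape)                                    # (200, 2)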
Example #42
'''
INSTRUCTIONS

*   Import TSNE from sklearn.manifold.
*   Create a TSNE instance called model with learning_rate=50.
*   Apply the .fit_transform() method of model to normalized_movements. Assign the result to tsne_features.
*   Select column 0 and column 1 of tsne_features.
*   Make a scatter plot of the t-SNE features xs and ys. Specify the additional keyword argument alpha=0.5.
*   Code to label each point with its company name has been written for you using plt.annotate(), so just hit 'Submit Answer' to see the visualization!
'''

# Import TSNE
from sklearn.manifold import TSNE

# Create a TSNE instance: model
model = TSNE(learning_rate=50)

# Apply fit_transform to normalized_movements: tsne_features
tsne_features = model.fit_transform(normalized_movements)

# Select the 0th feature: xs
xs = tsne_features[:, 0]

# Select the 1th feature: ys
ys = tsne_features[:, 1]

# Scatter plot
plt.scatter(xs, ys, alpha=0.5)

# Annotate the points
for x, y, company in zip(xs, ys, companies):
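    # The loop body is not shown in this excerpt; per the instructions above it
    # labels each point with its company name via plt.annotate(). A minimal
    # completion (offset and font size are assumed) could be:
    plt.annotate(company, (x, y), fontsize=10, alpha=0.75)

# Display the annotated t-SNE map (assumed final step)
plt.show()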
Example #43
minCount = 20
s = 250
w = 4
skip_model = Word2Vec(data, min_count=minCount, iter=5, size=s, window=w, sg=1)

skip_model.save('SkipGramFile')
print("size = %d" % s)
print("window = %d" % w)

store_model = g.Doc2Vec.load('SkipGramFile')
vocab = list(store_model.wv.vocab)

X = store_model[vocab]

# Represent as a two-dimensional plot
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)
#print(len(X_tsne))

# Build the table (DataFrame)
df = pd.DataFrame(X_tsne, index=vocab[:], columns=['x', 'y'])
df.shape
#print(df)

# Draw the plot
fig = plt.figure()
fig.set_size_inches(40, 20)
ax = fig.add_subplot(1, 1, 1)
ax.scatter(df['x'], df['y'])
for word, pos in df.iterrows():
    ax.annotate(word, pos, fontsize=30)
Example #44
data = pickle.load(open(dir + 'sentence_text.p', 'rb'))
document = data[1]

#compiled sentences
compiled_sentences = data[0]
for i in range(len(data)):
    compiled_sentences += data[i]

print(compiled_sentences)
model = gensim.models.Word2Vec(compiled_sentences, min_count=5)

print(model.similarity('Greece', 'January'))
print(
    model.most_similar(positive=['woman', 'bailout'],
                       negative=['finance'],
                       topn=1))
X_tot = list()
for word in model.wv.vocab:
    X_tot.append(model.wv[word])

X_tot = np.array(X_tot)
## visualize embedding using t-sne
X_embedded = TSNE(n_components=2, verbose=True,
                  perplexity=40).fit_transform(X_tot)
X_embedded.shape

## plot the t-sne result
plt.figure()
plt.scatter(X_embedded[:, 0], X_embedded[:, 1])
plt.show()
Example #45
if args.device == 'cuda':
    torch.cuda.set_device(args.gpu)
    torch.cuda.manual_seed(args.seed)
    torch.manual_seed(args.seed)
else:
    torch.manual_seed(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)

x = np.load('/Users/tanyue/Desktop/saved/protos/' + args.alg + '_protos.npy',
            allow_pickle=True)
y = np.load('/Users/tanyue/Desktop/saved/protos/' + args.alg + '_labels.npy',
            allow_pickle=True)
# d = np.load('../protos/' + args.alg + '_idx.npy', allow_pickle=True)

tsne = TSNE()
x = tsne.fit_transform(x)

# x = x[:,0:2]

y = y.reshape((-1, 1))
# d = d.reshape((-1, 1))
# visualize(args, x, y, d)
visualize(args, x, y)

# from mlxtend.plotting import plot_decision_regions
# from sklearn.svm import SVC
# from mlxtend.data import iris_data
# clf = SVC(random_state=0, probability=True)
# # X, y = iris_data()
# # X = X[:,[0, 2]]
Example #46
# for node, _ in model.most_similar('gk'):
#     # Show only players
#     if len(node) > 3:
#         print(node)
#
# for node, _ in model.most_similar('real_madrid'):
#     print(node)
#
# for node, _ in model.most_similar('paulo_dybala'):
#     print(node)

# Visualization
player_nodes = [x for x in model.vocab if len(x) > 3 and x not in clubs]
embeddings = np.array([model[x] for x in player_nodes])

tsne = TSNE(n_components=2, random_state=7, perplexity=15)
embeddings_2d = tsne.fit_transform(embeddings)

# Assign colors to players
team_colors = {
    'real_madrid': 'lightblue',
    'chelsea': 'b',
    'manchester_utd': 'r',
    'manchester_city': 'teal',
    'juventus': 'gainsboro',
    'napoli': 'deepskyblue',
    'fc_bayern': 'tomato'
}

data['color'] = data['club'].apply(lambda x: team_colors[x])
player_colors = dict(zip(data['name'], data['color']))
Example #47
def main():
    csvdata = read_points()
    X = len(csvdata[0])
    Y = len(csvdata)
    New_matrix = np.zeros([Y,X])
    for y in range(Y):
        for x in range(X):
            New_matrix[y, x] = csvdata[y][x]



    # Matrix holding the three derived difference columns
    add_matrix = np.zeros([Y,3])
    for y in range(Y):
        add_matrix[y, 0] = New_matrix[y, 0] - New_matrix[y, 1]
        add_matrix[y, 1] = New_matrix[y, 2] - New_matrix[y, 3]
        add_matrix[y, 2] = New_matrix[y, 4] - New_matrix[y, 5]

    temp_mean = add_matrix[:, 0].mean()
    temp_mean1 = add_matrix[:, 1].mean()
    temp_mean2 = add_matrix[:, 2].mean()
    col_std = np.std(add_matrix, axis=0)
    col_mean = np.mean(add_matrix, axis=0)
    # Center and standardize the csvdata values
    for y in range(Y):
        for x in range(0,3):
            add_matrix[y, x] -= col_mean[x]
            add_matrix[y, x] /= col_std[x]

#    for y in range(Y):
#        temp = str(add_matrix[y,0]) + "," + str(add_matrix[y,1]) + "," + str(add_matrix[y,2]) + "\n"
#        saveresult(temp)



    predict_matrix = np.array([(7898765467.24,3235676823.00,3957177004.54,3444000321.55,5432112345.77,2900000089.12),
                               (133241575988.56, 39872238928.11, 14551119352.78, 3164290276.21, 3444305407.86,1015886389.47),
                               (93805217949.67,34975605193.08,2326015727.05,1922978314.70,2273603448.91,4777927001.24)])
    predict_subtract = np.zeros([3,3])
    for i in range(3):
        predict_subtract[i, 0] = predict_matrix[i, 0] - predict_matrix[i, 1]
        predict_subtract[i, 1] = predict_matrix[i, 2] - predict_matrix[i, 3]
        predict_subtract[i, 2] = predict_matrix[i, 4] - predict_matrix[i, 5]
    for i in range(3):
        for x in range(0,3):
            predict_subtract[i, x] -= col_mean[x]
            predict_subtract[i, x] /= col_std[x]


    matrix2list = add_matrix.tolist()
    print(matrix2list)
    print(add_matrix)
#    Dimensionality-reduction visualization before clustering
#    Dimensionality_reduction(matrix2list)

    X_pca = PCA(n_components=2).fit_transform(add_matrix)
    t1 = 20
    t2 = 15
    gc = ca.Canopy(X_pca)
    gc.setThreshold(t1, t2)
    canopies = gc.clustering()
#    showCanopy(canopies,X_pca,t1,t2)

    all_vrc = []
    all_silh = []
    sub = []
    for k in range(20):
        # k-means clustering
        if k==0 or k==1:
            continue
        clf = KMeans(n_clusters=k,init='k-means++')
        y_pred = clf.fit_predict(matrix2list)
        add_pred = clf.predict(predict_subtract.tolist())
#        print(clf)
#        print(y_pred)

        sub.append(k)
        VRC = metrics.calinski_harabaz_score(add_matrix, y_pred)
        all_vrc.append(VRC)
        silh = metrics.silhouette_score(add_matrix, y_pred, metric='euclidean')
        all_silh.append(silh)
        print("k= ",k)
        print('VRC variance ratio:', VRC)
     #   print('silhouette coefficient: %10.3f' % silh)
        print('silhouette coefficient:', silh)

        lines = len(y_pred)
        static_result = np.zeros([1, k])
        first = 0
        second = 0
        third = 0

        for item in y_pred:
           for i in range(0,k):
               if item == i:
                   static_result[0,i] += 1

        for i , j in enumerate(y_pred):
            if j == 1:
                print(i,j)

        for i in range(0,k):
            print("第"+str(i)+"类数据占比: "+str(static_result[0,i]*100/lines)+"%")

# Reduce dimensionality to visualize the data
        X_tsne = TSNE(learning_rate=100).fit_transform(matrix2list)
        X_pca = PCA().fit_transform(matrix2list)
        # Dimensionality reduction for the single prediction points
        signal_pca = PCA().fit_transform(predict_subtract.tolist())
        plt.close()
        fig = plt.figure()
        plt.ion()  # interactive mode on
        plt.subplot(121)
        plt.title("T-SNE")
        plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_pred)
        plt.subplot(122)
        plt.title("PCA")
        plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_pred)
        plt.scatter(signal_pca[:,0],signal_pca[:,1],c='r')
        plt.pause(1)




    plt.figure(figsize=(10,5))
    plt.subplot(121)
    plt.title("VRC")
#    plt.scatter(sub,all_vrc,marker='o')
    plt.plot(all_vrc)
    plt.subplot(122)
    plt.title("silh")
#    plt.scatter(sub,all_silh,marker='x')
    plt.plot(all_silh)
    plt.show()


    print("第一类数据占比:" + str(((first * 100) / lines)) + "%")
    print("第二类数据占比:" + str(((second * 100) / lines)) + "%")
    print("第三类数据占比:" + str(((third * 100) / lines)) + "%")
    temp = "第一类数据占总数的:" + str(((first * 100) / lines)) + "%\n" + "第二类数据占总数的:" + str(((second * 100) / lines)) + "%\n" + "第三类数据占总数的:" + str(((third * 100) / lines)) + "%\n"
    saveresult(temp)

    X_tsne = TSNE(learning_rate=100).fit_transform(matrix2list)
    X_pca = PCA().fit_transform(matrix2list)
    plt.figure(figsize=(10, 5))
    plt.subplot(121)
    plt.title("T-SNE")
    plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_pred)
    plt.subplot(122)
    plt.title("PCA")
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_pred)
    plt.show()
Example #48
plot_conf_matrix(y_test, y_predSGD, "Stochastic Gradient Descent")

# ### Feature Importance

# ### Clustering using Dimensionality Reduction.

# In[77]:

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches

# In[78]:

# T-SNE with Original Data.
X_reduced_tsne = TSNE(n_components=2, random_state=42).fit_transform(X.values)

# In[79]:

plt.scatter(X_reduced_tsne[:, 0],
            X_reduced_tsne[:, 1],
            c=(y == 0),
            cmap='coolwarm',
            label='No Fraud',
            linewidths=2)
plt.scatter(X_reduced_tsne[:, 0],
            X_reduced_tsne[:, 1],
            c=(y == 1),
            cmap='rainbow',
            label='Fraud',
            linewidths=2)
Example #49
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import pylab
import numpy as np
d = np.load('feature.npy').item()
X = d['feature']
labels = d['label']

data_pca_tsne = TSNE(n_components=2).fit_transform(X)
cls_num = -45
# pylab.figure()
pylab.scatter(data_pca_tsne[cls_num * 5:, 0], data_pca_tsne[cls_num * 5:, 1],
              10, np.zeros_like(labels[cls_num * 5:]))
pylab.scatter(data_pca_tsne[:cls_num * 5, 0], data_pca_tsne[:cls_num * 5, 1],
              10, labels[:cls_num * 5])
pylab.savefig('tsne.pdf')
Example #50
y = dataset.iloc[:, 0]
labels, numbers = pd.factorize(y)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=19)
Xpca = pca.fit_transform(X)

# Applying tSNE
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2)
Xnew = tsne.fit_transform(Xpca)

# Scatter Plot
cdict = {
    0: 'yellow',
    1: 'red',
    2: 'blue',
    3: 'green',
    4: 'black',
    5: 'orange',
    6: 'pink'
}
fig, ax = plt.subplots()
for g in np.unique(labels):
    ix = np.where(labels == g)
Example #51
    def run_pca(self, vals, clrs):

        my_mat = np.matrix(vals)
        #my_pca = PCA(n_components = 20).fit(my_mat.getT())
        my_pca = PCA().fit(my_mat.getT())
        my_pts = my_pca.transform(my_mat.getT())
        coefs = my_pca.components_
        top_coefs = []
        top_dict = []

        for i, sc in enumerate(coefs):
            sranked = sorted([(sj, self.f_names[j])
                              for j, sj in enumerate(sc)],
                             reverse=True)

            sHalf = int(len(sranked) / 2.0)
            sForward = sranked[0:sHalf - 2]
            sBack = sranked[-1::-1][0:sHalf - 2]

            kh, kl, listH, listL = 0, 0, [], []
            dictH, dictL = dd(int), dd(int)

            for z, (scr, ch) in enumerate(sForward):

                #print 'coef',i,'POS','rank stuff',z,scr,ch

                ns = self.summarize(ch)
                dictH[ns] += 1
                if ns not in [x[1] for x in listH]: listH.append((scr, ns))
                if len(listH) > 40: break

                if z > 40: break

            for z, (scr, ch) in enumerate(sBack):
                #print 'coef',i,'NEG','rank stuff',z,scr,ch
                ns = self.summarize(ch)
                dictL[ns] += 1
                if ns not in [x[1] for x in listL]: listL.append((scr, ns))
                if len(listL) > 40: break

                if z > 40: break

            top_coefs.append([listH, listL])
            top_dict.append([dictH, dictL])

            if i > 0: break

        for n in range(len(my_pts)):
            p = my_pts[n]
            c = clrs[n]
            if c == 'magenta':
                #print p[0],p[1],c
                if p[1] < 0.85: my_pts[n][1] += 1
            if c == 'cyan':
                #print p[0],p[1],c
                if p[0] < 1: my_pts[n][0] += 1

        tsne = TSNE(n_components=2, verbose=0, perplexity=100, n_iter=5000)
        ts = tsne.fit_transform(my_pts)

        return my_pts, ts, top_coefs, top_dict
Example #52
def word2vec_basic(log_dir):
    """Example of building, training and visualizing a word2vec model."""
    # Create the directory for TensorBoard variables if there is not.
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    # # Step 1: Download the data.
    # # Note: Source website does not support HTTPS right now.
    # url = 'http://mattmahoney.net/dc/'
    #
    # # pylint: disable=redefined-outer-name
    # def maybe_download(filename, expected_bytes):
    #   """Download a file if not present, and make sure it's the right size."""
    #   if not os.path.exists(filename):
    #     filename, _ = urllib.request.urlretrieve(url + filename,filename)
    #   # Get the file's attributes
    #   statinfo = os.stat(filename)
    #   # Check whether the file size matches the expected size
    #   if statinfo.st_size == expected_bytes:
    #     print('Found and verified', filename)
    #   else:
    #     print(statinfo.st_size)
    #     raise Exception('Failed to verify ' + filename +'. Can you get to it with a browser?')
    #   return filename

    #filename = maybe_download('text8.zip',31344016)
    filename = r'D:\程序\Text-classification-CNN\text8.zip'

    # Read the data into a list of strings.
    def read_data(filename):
        """Extract the first file enclosed in a zip file as a list of words."""
        with zipfile.ZipFile(filename) as f:
            data = tf.compat.as_str(f.read(f.namelist()[0])).split()
        return data

    vocabulary = read_data(filename)  # a list of words
    print('Data size', len(vocabulary))

    # Step 2: Build the dictionary and replace rare words with UNK token.
    vocabulary_size = 50000

    def build_dataset(words, n_words):
        """Process raw inputs into a dataset."""
        count = [['UNK', -1]]  # nested list of [word, count] pairs
        count.extend(collections.Counter(words).most_common(n_words -
                                                            1))  # keep the 49,999 most frequent words
        # dictionary maps each word to its integer index
        dictionary = {word: index
                      for index, (word, _) in enumerate(count)
                      }  # "_" marks a required but otherwise unused variable
        data = []
        unk_count = 0
        for word in words:
            index = dictionary.get(word, 0)
            if index == 0:  # dictionary['UNK']: finally count all the rare words mapped to UNK
                unk_count += 1
            data.append(index)
        count[0][1] = unk_count
        reversed_dictionary = dict(
            zip(dictionary.values(),
                dictionary.keys()))  # inverted dictionary: index -> word
        return data, count, dictionary, reversed_dictionary

    # Filling 4 global variables (information from the training set, stored in the globals below):
    # data - list of codes (integers from 0 to vocabulary_size-1).
    #   This is the original text but words are replaced by their codes
    # count - map of words(strings) to count of occurrences
    # dictionary - map of words(strings) to their codes(integers)
    # reverse_dictionary - map of codes(integers) to words(strings)
    data, count, unused_dictionary, reverse_dictionary = build_dataset(
        vocabulary, vocabulary_size)
    del vocabulary  # Hint to reduce memory.
    print('Most common words (+UNK)', count[:5])
    print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

    # Step 3: Function to generate a training batch for the skip-gram model.
    def generate_batch(batch_size, num_skips, skip_window):
        global data_index  # current start position within the training data
        assert batch_size % num_skips == 0  # abort if batch_size is not a multiple of num_skips
        assert num_skips <= 2 * skip_window
        batch = np.ndarray(shape=(batch_size),
                           dtype=np.int32)  # batch holds the numeric indices of the 8 target words
        labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
        span = 2 * skip_window + 1  # [ skip_window target skip_window ]
        buffer = collections.deque(maxlen=span)  # pylint: disable=redefined-builtin (double-ended queue)
        if data_index + span > len(data):
            data_index = 0
        buffer.extend(data[data_index:data_index +
                           span])  # e.g. data[0:3], the first span word codes from the training data
        data_index += span
        for i in range(batch_size // num_skips):  # integer division: 4 iterations, i = 0, 1, 2, 3
            context_words = [w for w in range(span)
                             if w != skip_window]  # positions of the context words, e.g. [0, 2]
            words_to_use = random.sample(context_words, num_skips)  # e.g. [0, 2]
            for j, context_word in enumerate(
                    words_to_use):  # j: 0, 1  context word positions: 0, 2
                batch[i * num_skips + j] = buffer[
                    skip_window]  # batch[n]: consecutive pairs of entries hold the same target word
                labels[i * num_skips + j, 0] = buffer[
                    context_word]  # labels[n, 0] for n = 0..7; a rather neat construction
                # labels are slightly trickier: labels[0,0]=buffer[0], labels[1,0]=buffer[2], labels[2,0]=buffer[1], labels[3,0]=buffer[3], ...
            if data_index == len(data):
                buffer.extend(data[0:span])
                data_index = span
            else:
                buffer.append(data[data_index])  # this shifts the contents of buffer
                data_index += 1
        # Backtrack a little bit to avoid skipping words in the end of a batch
        data_index = (data_index - span) % len(data)
        return batch, labels

    # batch_size: number of words per training batch; num_skips: reuses of each input word
    batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
    for i in range(8):  # quick check of the generated batch
        print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],
              reverse_dictionary[labels[i, 0]])

    # Step 4: Build and train a skip-gram model.

    batch_size = 128
    embedding_size = 128  # Dimension of the embedding vector.
    skip_window = 1  # How many words to consider left and right.
    num_skips = 2  # How many times to reuse an input to generate a label.
    num_sampled = 64  # Number of negative examples to sample. Negative sampling reduces computation while still giving good results.

    # We pick a random validation set to sample nearest neighbors. Here we limit
    # the validation samples to the words that have a low numeric ID, which by
    # construction are also the most frequent. These 3 variables are used only for
    # displaying model accuracy, they don't affect calculation.
    valid_size = 16  # Random set of words to evaluate similarity on.
    valid_window = 100  # Only pick dev samples in the head of the distribution.
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)

    graph = tf.Graph()
    with graph.as_default():
        # Input data.
        with tf.name_scope('inputs'):
            train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
            train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
            valid_dataset = tf.constant(valid_examples, dtype=tf.int32)  # validation word IDs

        # Ops and variables pinned to a specific device; the reference implementation
        # uses '/cpu:0' because some of these ops lack a GPU kernel.
        with tf.device('/gpu:0'):  # or '/cpu:0'
            # Look up embeddings for inputs.
            with tf.name_scope('embeddings'):
                embeddings = tf.Variable(  # embedding matrix of shape vocabulary_size x embedding_size (e.g. 50000 x 128)
                    tf.random_uniform([vocabulary_size, embedding_size], -1.0,
                                      1.0))
                # Select the rows to train: train_inputs holds this batch's word indices, and
                # embedding_lookup(params, ids) returns the rows of params at those ids.
                embed = tf.nn.embedding_lookup(embeddings, train_inputs)
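                # Hedged illustration (not from the tutorial): if params were
                # [[1, 2], [3, 4], [5, 6]], then
                # tf.nn.embedding_lookup(params, [2, 0]) -> [[5, 6], [1, 2]].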

            # Construct the variables for the NCE loss (noise-contrastive estimation, i.e. negative sampling)
            with tf.name_scope('weights'):
                nce_weights = tf.Variable(
                    tf.truncated_normal([vocabulary_size, embedding_size],
                                        stddev=1.0 /
                                        math.sqrt(embedding_size)))
            with tf.name_scope('biases'):
                nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

        # Compute the average NCE loss for the batch.
        # tf.nce_loss automatically draws a new sample of the negative labels each
        # time we evaluate the loss.
        # Explanation of the meaning of NCE loss and why choosing NCE over tf.nn.sampled_softmax_loss:
        #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
        #   http://papers.nips.cc/paper/5165-learning-word-embeddings-efficiently-with-noise-contrastive-estimation.pdf
        with tf.name_scope('loss'):
            loss = tf.reduce_mean(  # mean NCE loss over the batch
                tf.nn.nce_loss(  # NCE with negative sampling
                    weights=nce_weights,
                    biases=nce_biases,
                    labels=train_labels,
                    inputs=embed,
                    num_sampled=num_sampled,
                    num_classes=vocabulary_size))
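
        # For comparison only (a sketch, not part of the original graph):
        # tf.nn.sampled_softmax_loss accepts the same arguments, so the
        # alternative objective mentioned above would look like this.
        # with tf.name_scope('sampled_softmax_loss'):
        #     alt_loss = tf.reduce_mean(
        #         tf.nn.sampled_softmax_loss(weights=nce_weights,
        #                                    biases=nce_biases,
        #                                    labels=train_labels,
        #                                    inputs=embed,
        #                                    num_sampled=num_sampled,
        #                                    num_classes=vocabulary_size))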

        # Add the loss value as a scalar to summary.
        tf.summary.scalar('loss', loss)

        # Construct the SGD optimizer using a learning rate of 1.0.
        with tf.name_scope('optimizer'):
            optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(
                loss)  # gradient descent

        # Compute the cosine similarity between the validation examples and all
        # embeddings.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                                  valid_dataset)
        similarity = tf.matmul(valid_embeddings,
                               normalized_embeddings,
                               transpose_b=True)
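        # Shapes (added note): valid_embeddings is [valid_size, embedding_size] and
        # normalized_embeddings is [vocabulary_size, embedding_size], so similarity
        # is [valid_size, vocabulary_size]; row i holds the cosine similarity of
        # validation word i to every word in the vocabulary.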

        # Merge all summaries.
        merged = tf.summary.merge_all()

        # Add variable initializer.
        init = tf.global_variables_initializer()

        # Create a saver.
        saver = tf.train.Saver()

    # Step 5: Begin training.
    num_steps = 100001

    with tf.Session(graph=graph) as session:
        # Open a writer to write summaries.
        writer = tf.summary.FileWriter(log_dir, session.graph)

        # We must initialize all variables before we use them.
        init.run()
        print('Initialized')

        average_loss = 0
        for step in xrange(num_steps):
            batch_inputs, batch_labels = generate_batch(
                batch_size, num_skips, skip_window)
            feed_dict = {
                train_inputs: batch_inputs,
                train_labels: batch_labels
            }

            # Define metadata variable.
            run_metadata = tf.RunMetadata()

            # We perform one update step by evaluating the optimizer op (including it
            # in the list of returned values for session.run()
            # Also, evaluate the merged op to get all summaries from the returned
            # "summary" variable. Feed metadata variable to session for visualizing
            # the graph in TensorBoard.
            _, summary, loss_val = session.run([optimizer, merged, loss],
                                               feed_dict=feed_dict,
                                               run_metadata=run_metadata)
            average_loss += loss_val

            # Add returned summaries to writer in each step.
            writer.add_summary(summary, step)
            # Add metadata to visualize the graph for the last run.
            if step == (num_steps - 1):
                writer.add_run_metadata(run_metadata, 'step%d' % step)

            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                # The average loss is an estimate of the loss over the last 2000
                # batches.
                print('Average loss at step ', step, ': ', average_loss)
                average_loss = 0

            # Note that this is expensive (~20% slowdown if computed every 500 steps)
            if step % 10000 == 0:
                sim = similarity.eval()
                for i in xrange(valid_size):
                    valid_word = reverse_dictionary[valid_examples[i]]
                    top_k = 8  # number of nearest neighbors
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    # Print the top_k nearest neighbours of each of the 16 validation words;
                    # in a good embedding, related words lie close in cosine distance.
                    log_str = 'Nearest to %s:' % valid_word

                    print(
                        log_str, ', '.join([
                            reverse_dictionary[nearest[k]]
                            for k in range(top_k)
                        ]))
        final_embeddings = normalized_embeddings.eval()

        # Write corresponding labels for the embeddings.
        with open(log_dir + '/metadata.tsv', 'w') as f:
            for i in xrange(vocabulary_size):
                f.write(reverse_dictionary[i] + '\n')

        # Save the model for checkpoints.
        saver.save(session, os.path.join(log_dir, 'model.ckpt'))

        # Create a configuration for visualizing embeddings with the labels in
        # TensorBoard.
        config = projector.ProjectorConfig()
        embedding_conf = config.embeddings.add()
        embedding_conf.tensor_name = embeddings.name
        embedding_conf.metadata_path = os.path.join(log_dir, 'metadata.tsv')
        projector.visualize_embeddings(writer, config)

    writer.close()

    # Step 6: Visualize the embeddings.

    # pylint: disable=missing-docstring
    # Function to draw visualization of distance between embeddings.
    def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
        assert low_dim_embs.shape[0] >= len(
            labels), 'More labels than embeddings'
        plt.figure(figsize=(18, 18))  # in inches
        for i, label in enumerate(labels):
            x, y = low_dim_embs[i, :]
            plt.scatter(x, y)
            plt.annotate(label,
                         xy=(x, y),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')

        plt.savefig(filename)

    try:
        # pylint: disable=g-import-not-at-top
        from sklearn.manifold import TSNE  # project the word vectors to 2-D with t-SNE for plotting
        import matplotlib
        #matplotlib.use("Agg")
        matplotlib.use("Pdf")
        import matplotlib.pyplot as plt

        tsne = TSNE(perplexity=30,
                    n_components=2,
                    init='pca',
                    n_iter=5000,
                    method='exact')
        plot_only = 500
        low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
        labels = [reverse_dictionary[i] for i in xrange(plot_only)]
        plot_with_labels(low_dim_embs, labels)

    except ImportError as ex:
        print(
            'Please install sklearn, matplotlib, and scipy to show embeddings.'
        )
        print(ex)
Ejemplo n.º 53
0
        # restore the original order (the batch had been sorted by sequence length)
        for i, predict_f in enumerate(predict_fs):
            predict_features[sorted_indexes[i] + batch_val] = predict_f
predict_features = torch.stack(predict_features)

# get test label csv content
dict = getVideoList(os.path.join(test_label_path))
action_labels = (dict['Action_labels'])

# tSNE to visualize
#x_t = (t_features.cpu()).numpy()
#y_t = (t_label.cpu()).numpy()
X = np.array(predict_features.tolist())
Y = np.array(dict['Action_labels']).astype(int)

tsne = TSNE(n_components=2, random_state=0)

#Project the data in 2D
X_2d = tsne.fit_transform(X)

#Visualize the data
target_names = [
    '0others', '1Inspect/Read', '2Open', '3Take', '4Cut', '5Put', '6Close',
    '7Move_around', '8Divide/Pull_apart', '9Pour', '10Transfer'
]
target_ids = range(len(target_names))  # 0~10 digits
fig1 = plt.figure(
    figsize=(12, 10)).suptitle('tSNE plot of cnn_based action recognition')
colors = 'r', 'g', 'b', 'c', 'm', 'y', 'k', 'peru', 'orange', 'purple', 'indigo'
for i, c, label in zip(target_ids, colors, target_names):
    plt.scatter(X_2d[Y == i, 0], X_2d[Y == i, 1], c=c, label=label)
W_outer = tf.Variable(tf.random_normal([emb_dims, vocab_size], mean=0.0, stddev=0.02, dtype=tf.float32))
b_outer = tf.Variable(tf.random_normal([vocab_size], mean=0.0, stddev=0.02, dtype=tf.float32))

hidden = tf.add(tf.matmul(x, W), b)
logits = tf.add(tf.matmul(hidden, W_outer), b_outer)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

epochs, batch_size = 100, 10
batch = len(x_train)//batch_size

# run the training loop for `epochs` epochs
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print('was here')
    for epoch in xrange(epochs):
        batch_index = 0
        for batch_num in xrange(batch):
            x_batch = x_train[batch_index: batch_index + batch_size]
            y_batch = y_train[batch_index: batch_index + batch_size]
            sess.run(optimizer, feed_dict={x: x_batch, y: y_batch})
            print('epoch:', epoch, 'loss :', sess.run(cost, feed_dict={x: x_batch, y: y_batch}))
    W_embed_trained = sess.run(W)

W_embedded = TSNE(n_components=2).fit_transform(W_embed_trained)
plt.figure(figsize=(10, 10))
for i in xrange(len(W_embedded)):
    plt.text(W_embedded[i, 0], W_embedded[i, 1], ind2word[i])

plt.xlim(-150, 150)
plt.ylim(-150, 150)
Ejemplo n.º 55
0
km = KMeans(n_clusters=k_cluster, random_state=0)
km.fit(tweet_vecs)
predictions_km = km.predict(tweet_vecs)

# birch
n_clusters = 7
brc = Birch(branching_factor=500,
            n_clusters=n_clusters,
            threshold=0.5,
            compute_labels=True)
brc.fit(tweet_vecs)
predictions = brc.predict(tweet_vecs)
#pdb.set_trace()

# tsne
model = TSNE(n_components=2, random_state=0)
tsne_vecs = model.fit_transform(tweet_vecs)

# visualize
ALL_COLORS = [
    'red', 'blue', "green", "orange", "yellow", "purple", "black", "brown",
    'cyan',
    "gold", "grey"
]


def get_colors(labels):
    colors = []
    for i in labels:
        if i >= len(ALL_COLORS):
            print("Require more colors")
Ejemplo n.º 56
0
    if subtitle is not None:
        plt.suptitle(subtitle)

    plt.show()


# Getting a batch from training and validation data for visualization
x_train, y_train = get_batch(train_set, 32)
x_val, y_val = get_batch(valid_set, 32)

x_train = x_train.reshape(-1, 784)
x_val = x_val.reshape(-1, 784)

# Generating and visualizing t-SNE embeddings of the raw data
# of the first 512 samples.
tsne = TSNE()
train_tsne_embeds = tsne.fit_transform(x_train)
scatter(train_tsne_embeds, y_train, "Samples from Training Data")

eval_tsne_embeds = tsne.fit_transform(x_val)
scatter(eval_tsne_embeds, y_val, "Samples from Validation Data")

###
#Defining the quadruplet Loss function and Embedding model
###

# import tensorflow as tf


def all_diffs(a, b):
    # Returns a tensor of all combinations of a - b
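    # A plausible completion (assumption; the original snippet is truncated here,
    # and it assumes tensorflow is imported as tf): broadcasting over expanded
    # dims yields every pairwise difference a[i] - b[j].
    return tf.expand_dims(a, axis=1) - tf.expand_dims(b, axis=0)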
Ejemplo n.º 57
0
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from knock67 import country_vector

country, country_name = country_vector()
tsne = TSNE(n_components=2, random_state=2021, perplexity=30, n_iter=1000)
embedded = tsne.fit_transform(country)
kmeans = KMeans(n_clusters=5, random_state=2021).fit_predict(country)
plt.figure(figsize=(10, 10))
colors =  ["r", "g", "b", "c", "m"]
for i in range(embedded.shape[0]):
    plt.scatter(embedded[i][0], embedded[i][1], marker='.', color=colors[kmeans[i]])
    plt.annotate(country_name[i], xy=(embedded[i][0], embedded[i][1]), color=colors[kmeans[i]])
plt.show()
Ejemplo n.º 58
0
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.savefig(filename)


tsne = TSNE(perplexity=30.0, n_components=2, n_iter=5000)
low_dim_embedding = tsne.fit_transform(data1)

plot_with_labels(low_dim_embedding, labels1)

target_y = np_utils.to_categorical(irish_Data["target"])

model = Sequential()
model.add(Dense(units=64, input_shape=(4, ), activation='tanh'))
model.add(Dense(units=3, activation='softmax'))
print(model.summary())

model.compile(optimizer='sgd',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
Ejemplo n.º 59
0
def tsne(
    s: pd.Series,
    n_components=2,
    perplexity=30.0,
    learning_rate=200.0,
    n_iter=1000,
    random_state=None,
    n_jobs=-1,
) -> pd.Series:
    """
    Performs TSNE on the given pandas series.

    t-distributed Stochastic Neighbor Embedding (t-SNE) is
    a machine learning algorithm used to visualize high-dimensional data in fewer
    dimensions. In natural language processing, the high-dimensional
    data is usually a document-term matrix
    (so in texthero usually a Series after applying :meth:`texthero.representation.tfidf`
    or some other first representation function that assigns a scalar (a weight)
    to each word) that is hard to visualize as there
    might be many terms. With t-SNE, every document
    gets a new, low-dimensional (n_components entries)
    vector in such a way that the differences / similarities between
    documents are preserved.


    Parameters
    ----------
    s : Pandas Series

    n_components : int, default is 2.
        Number of components to keep (dimensionality of output vectors).
        If n_components is not set or None, all components are kept.

    perplexity : float, optional (default: 30)
        The perplexity is related to the number of nearest neighbors that
        is used in other manifold learning algorithms. Larger datasets
        usually require a larger perplexity. Consider selecting a value
    between 5 and 50. Different values can result in significantly
        different results.

    learning_rate : float, optional (default: 200.0)
        The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If
        the learning rate is too high, the data may look like a 'ball' with any
        point approximately equidistant from its nearest neighbours. If the
        learning rate is too low, most points may look compressed in a dense
        cloud with few outliers. If the cost function gets stuck in a bad local
        minimum increasing the learning rate may help.

    n_iter : int, optional (default: 1000)
        Maximum number of iterations for the optimization. Should be at
        least 250.

    random_state : int, default=None
        Determines the random number generator. Pass an int for reproducible
        results across multiple function calls.

    n_jobs : int, optional, default=-1
        The number of parallel jobs to run for neighbors search.
        ``-1`` means using all processors.

    Returns
    -------
    Pandas Series with the vector calculated by t-SNE for the document in every cell.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Football, Sports, Soccer", "Music, Violin, Orchestra", "Football, Music"])
    >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency)
    >>> hero.tsne(s, random_state=42) # doctest: +SKIP
    0      [-18.833383560180664, -276.800537109375]
    1     [-210.60179138183594, 143.00535583496094]
    2    [-478.27984619140625, -232.97410583496094]
    dtype: object

    See also
    --------
    `t-SNE on Wikipedia <https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding>`_

    """
    tsne = TSNE(
        n_components=n_components,
        perplexity=perplexity,
        learning_rate=learning_rate,
        n_iter=n_iter,
        random_state=random_state,
        n_jobs=n_jobs,
    )
    return pd.Series(tsne.fit_transform(list(s)).tolist(), index=s.index)
Ejemplo n.º 60
0
    if not os.path.exists(os.path.dirname(save_path)):
        os.makedirs(os.path.dirname(save_path))
    fig = plt.gcf()

    fig.savefig(save_path, dpi=300)
    print('png saved in: ', save_path)

sns.set(rc={'figure.figsize':(11.7,8.27)})
palette = sns.color_palette("bright",2)

modal = 'audio'
work_dir = '/m_fusion_data/'
path =  work_dir + f'representation/{modal}.npz'
print(path)
data = np.load(path)
print(data.files)
class_name=['non-sarcastic','sarcastic']
X = data['repre']
y4 = data['label']
y4 = [class_name[yi] for yi in y4]
tsne = TSNE()
X_embedded = tsne.fit_transform(X)
# print(y4)
print(X_embedded.shape)
sns.scatterplot(X_embedded[:,0], X_embedded[:,1], hue=y4, legend='full', palette=palette)
plt.title(modal,fontsize=20)
plt.legend(fontsize=20)
# plt.show()
path =  work_dir + f'representation/img/{modal}.png'
save_plot(path)