from bson import ObjectId
import numpy as np
from sklearn.decomposition import PCA, FastICA
from sklearn.manifold import TSNE


def dim_survey(X, entry_id):
    # Convert to numpy.
    X = np.array(X)

    # Run the reductions.
    X_pca = PCA(n_components=3).fit_transform(X)
    X_tsne = TSNE(n_components=3).fit_transform(X)
    X_ica = FastICA(n_components=3).fit_transform(X)

    # Connect to the database and store the projections on the entry.
    # (mongoctx is a project-local context manager yielding the database.)
    with mongoctx() as db:
        db['entry'].update(
            {'_id': ObjectId(entry_id)},
            {'$set': {
                'pca': X_pca.tolist(),
                'tsne': X_tsne.tolist(),
                'ica': X_ica.tolist(),
            }}
        )
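# Hedged usage sketch for dim_survey above, not part of the original source:
# it assumes a running MongoDB reachable through the snippet's own mongoctx
# helper and an existing 'entry' document. The data and id are illustrative.
import numpy as np

X = np.random.rand(50, 10)                # 50 samples, 10 features
entry_id = '5f0c0c0c0c0c0c0c0c0c0c0c'     # hypothetical 24-hex ObjectId string
dim_survey(X.tolist(), entry_id)          # stores 'pca', 'tsne' and 'ica' on the entry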
def execute(self):
    X = self.X
    # X = np.array(self.X)
    # print("XXn", len(X))
    X2 = TSNE(n_components=self.p, random_state=7,
              perplexity=40).fit_transform(X)
    return X2.tolist()
def get_2d_projection(articles):
    vectors = get_wordvecs(articles)
    # NOTE: `vectors.vocab` is the pre-gensim-4 API; gensim >= 4 exposes
    # `vectors.index_to_key` instead (see the filtered variant below).
    vocab = [word for word in vectors.vocab]
    wordvecs = [vectors[word] for word in vocab]
    embedded = TSNE(n_components=2).fit_transform(np.array(wordvecs))
    return embedded.tolist(), vocab
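# A minimal sketch of the same projection with gensim >= 4, where
# KeyedVectors.vocab was removed in favor of index_to_key. The words and
# vectors below are synthetic stand-ins for the output of get_wordvecs().
import numpy as np
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE

words = ['w%d' % i for i in range(40)]
kv = KeyedVectors(vector_size=8)
kv.add_vectors(words, np.random.rand(40, 8))

vocab = list(kv.index_to_key)
wordvecs = np.array([kv[w] for w in vocab])
embedded = TSNE(n_components=2, perplexity=5).fit_transform(wordvecs)
print(embedded.tolist()[0], vocab[0])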
def proj_docs(corpus, n_components=2):
    """Project documents into a low-dimensional space."""
    features = matutils.corpus2dense(
        corpus['tfidfcorpus'],
        num_terms=len(corpus['dictionary'].keys()),
        num_docs=len(corpus['doc2bow'])).T
    proj_data = TSNE(n_components=n_components,
                     random_state=0).fit_transform(features)
    proj_data = np.array(proj_data, dtype='float')
    return proj_data.tolist()
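# Hedged sketch of building the corpus dict that proj_docs() above expects,
# using standard gensim primitives; the toy documents are invented here, and
# proj_docs' own imports (matutils, numpy, TSNE) are assumed to be in scope.
# Note the default TSNE perplexity of 30 requires more than 30 documents.
from gensim import corpora, models

docs = [['term%d' % (i % 7), 'term%d' % ((i + 1) % 7)] for i in range(40)]
dictionary = corpora.Dictionary(docs)
doc2bow = [dictionary.doc2bow(d) for d in docs]
tfidf = models.TfidfModel(doc2bow)
corpus = {
    'dictionary': dictionary,
    'doc2bow': doc2bow,
    'tfidfcorpus': tfidf[doc2bow],
}
points = proj_docs(corpus, n_components=2)  # one 2-D point per document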
def proj_docs(self, n_components=2):
    """Project documents into a low-dimensional space."""
    features = matutils.corpus2dense(
        self.__tfidf_corpus,
        num_terms=len(self.__dictionary.keys()),
        num_docs=len(self.corpus)).T
    proj_data = TSNE(n_components=n_components,
                     random_state=0).fit_transform(features)
    proj_data = np.array(proj_data, dtype='float')
    # return np.around(proj_data, 5).tolist()
    return proj_data.tolist()
def evaluate(self, dataset):
    """
    Evaluate the passed network on the given dataset for the context
    restoration task.
    ----------
    INPUT
        |---- dataset (torch.utils.data.Dataset) the dataset to use for evaluation. It should output the original image,
        |           and the sample index.
    OUTPUT
        |---- None
    """
    logger = logging.getLogger()
    # make loader
    loader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size,
                                         shuffle=False, num_workers=self.num_workers,
                                         worker_init_fn=lambda _: np.random.seed())
    # put net on device
    self.net = self.net.to(self.device)
    # Evaluate
    logger.info('Start evaluating the context restoration model.')
    start_time = time.time()
    idx_repr = []  # placeholder for bottleneck representations
    n_batch = len(loader)
    self.net.eval()
    with torch.no_grad():
        for b, data in enumerate(loader):
            # get data: load in the standard way (no patch-swapped image)
            input, idx = data
            input = input.to(self.device).float()
            idx = idx.to(self.device)
            # get representation
            self.net.return_bottleneck = True
            _, repr = self.net(input)
            # downsample representation to reduce memory impact
            repr = nn.AdaptiveAvgPool2d((4, 4))(repr)
            # add ravelled representations to the placeholder
            idx_repr += list(zip(idx.cpu().data.tolist(),
                                 repr.view(repr.shape[0], -1).cpu().data.tolist()))
            # print progress
            if self.print_progress:
                print_progessbar(b, n_batch, Name='\t\tEvaluation Batch', Size=40, erase=True)
    # reset the network attributes
    self.net.return_bottleneck = False
    # compute the t-SNE of the representations
    idx, repr = zip(*idx_repr)
    repr = np.array(repr)
    logger.info('Computing the t-SNE representation.')
    repr_2D = TSNE(n_components=2).fit_transform(repr)
    self.outputs['eval']['repr'] = list(zip(idx, repr_2D.tolist()))
    logger.info('Successfully computed the t-SNE representation.')
    # finish evaluation
    self.outputs['eval']['time'] = time.time() - start_time
    logger.info(f"Finished evaluating on the context restoration task in {timedelta(seconds=int(self.outputs['eval']['time']))}")
def get_2d_projection(articles, filter):
    vectors = get_wordvecs(articles)
    vocab = [word for word in vectors.index_to_key if filter.isValid(word)]
    wordvecs = [vectors[word] for word in vocab]
    embedded = TSNE(n_components=2).fit_transform(np.array(wordvecs))
    return embedded.tolist(), vocab
def evaluate(self, dataset):
    """
    Evaluate the network on the given dataset for the contrastive task
    (get the t-SNE representation of the samples). Only for a global task.
    ----------
    INPUT
        |---- dataset (torch.utils.data.Dataset) the dataset to use for evaluation. It should output the original image,
        |           and the sample index.
    OUTPUT
        |---- None
    """
    if self.is_global:
        logger = logging.getLogger()
        # initialize the DataLoader
        loader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size,
                                             shuffle=False, num_workers=self.num_workers,
                                             worker_init_fn=lambda _: np.random.seed())
        # Evaluate
        logger.info("Start evaluating the network on the global contrastive task.")
        start_time = time.time()
        idx_repr = []  # placeholder for bottleneck representations
        n_batch = len(loader)
        self.net.eval()
        self.net.return_bottleneck = True
        with torch.no_grad():
            for b, data in enumerate(loader):
                im, idx = data
                im = im.to(self.device).float()
                idx = idx.to(self.device)
                # get representations
                _, z = self.net(im)
                # keep the (bottleneck) representations
                idx_repr += list(zip(idx.cpu().data.tolist(),
                                     z.squeeze().cpu().data.tolist()))
                # print progress
                if self.print_progress:
                    print_progessbar(b, n_batch, Name='\t\tEvaluation Batch', Size=40, erase=True)
        # reset the network attributes
        self.net.return_bottleneck = False
        # compute the t-SNE of the representations
        idx, repr = zip(*idx_repr)
        repr = np.array(repr)
        logger.info('Computing the t-SNE representation.')
        repr_2D = TSNE(n_components=2).fit_transform(repr)
        self.outputs['eval']['repr'] = list(zip(idx, repr_2D.tolist()))
        logger.info('Successfully computed the t-SNE representation.')
        # finish evaluation
        self.outputs['eval']['time'] = time.time() - start_time
        logger.info(f"Finished evaluating the encoder on the global contrastive task in {timedelta(seconds=int(self.outputs['eval']['time']))}")
    else:
        warnings.warn("Evaluation is only possible with a global contrastive task.")
def tsne_embed(embedding_container):
    """
    Run t-SNE on the list of embeddings contained in embedding_container.

    Returns:
        points: list of dicts of the form
            [
                {'x': float, 'y': float, 'z': float},
                ...
            ]
    """
    embeddings = np.array(embedding_container)
    tsne_output = TSNE(n_components=3).fit_transform(embeddings)
    tsne_output_list = tsne_output.tolist()
    # Build the list of point dictionaries.
    points = []
    for tsne_point in tsne_output_list:
        points.append({
            'x': tsne_point[0],
            'y': tsne_point[1],
            'z': tsne_point[2],
        })
    return points
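# Hedged usage sketch for tsne_embed above with random embeddings; the shape
# and values are illustrative only.
import numpy as np

embeddings = np.random.rand(100, 128).tolist()  # 100 embeddings of size 128
points = tsne_embed(embeddings)
print(len(points), points[0])  # 100 dicts with 'x', 'y', 'z' floats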
def transGraphToMatrixAndtSNE(path):
    f_r = open(path, 'r')
    f_w = open('trans_query_graph_to_matrix2.txt', 'w+')
    allLine = f_r.readlines()
    seq = 0
    result = []
    while seq < len(allLine):
        line = allLine[seq].strip().split()
        if line[0] == 't':
            print(str(line[2]))
            seq += 1
            # adjacency matrix for a graph with (at most) 836 vertices
            result = [[0 for i in range(836)] for j in range(836)]
            line2 = allLine[seq].strip().split()
            # skip the vertex ('v') lines
            while line2[0] == 'v':
                seq += 1
                line2 = allLine[seq].strip().split()
            # read the edge ('e') lines into a symmetric adjacency matrix
            while line2[0] == 'e':
                result[int(line2[2])][int(line2[3])] = 1
                result[int(line2[3])][int(line2[2])] = 1
                seq += 1
                if seq < len(allLine):
                    line2 = allLine[seq].strip().split()
                else:
                    break
            matrix = np.array(result)
            matrix_embedded = TSNE(n_components=1).fit_transform(matrix)
            # flatten the (836, 1) embedding into a flat list
            matrix_embedded = matrix_embedded.ravel().tolist()
            restr = ''
            for i in range(836):
                restr = restr + '{:.2f}'.format(matrix_embedded[i]) + ' '
            restr += '\n'
            f_w.write(restr)
    f_w.close()
    f_r.close()
    print("end")
def evaluate(self, net, dataset, print_to_logger=True, return_auc=False,
             save_tSNE=True):
    """
    Evaluate the network on the provided dataset.
    ----------
    INPUT
        |---- net (nn.Module) The autoencoder to train. It must return two
        |           embeddings (after the convolution and after the MLP) as
        |           well as the reconstruction.
        |---- dataset (torch.utils.data.Dataset) the dataset on which the
        |           network is validated. It must return an image and a mask
        |           of where the loss is to be computed.
        |---- print_to_logger (bool) whether to print info in the logger.
        |---- return_auc (bool) whether to return the computed AUC.
        |---- save_tSNE (bool) whether to save the intermediate representation
        |           as a 2D vector using t-SNE.
    OUTPUT
        |---- None
    """
    if print_to_logger:
        logger = logging.getLogger()
    # make dataloader
    loader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size,
                                         shuffle=True,
                                         num_workers=self.n_job_dataloader)
    # put net on device
    net = net.to(self.device)
    # define loss function
    loss_fn = MaskedMSELoss(reduction='none')
    if print_to_logger:
        logger.info("Start Evaluating AE.")
    idx_label_scores = []
    n_batch = len(loader)
    net.eval()
    with torch.no_grad():
        for b, data in enumerate(loader):
            input, label, mask, semi_label, idx = data
            # put inputs to device (requires_grad is unnecessary under no_grad)
            input = input.to(self.device).float()
            label = label.to(self.device)
            mask = mask.to(self.device)
            semi_label = semi_label.to(self.device)
            idx = idx.to(self.device)
            h, z, rec = net(input)
            # compute the score as the mean loss per sample
            rec_loss = loss_fn(rec, input, mask)
            score = torch.mean(rec_loss, dim=tuple(range(1, rec.dim())))
            # append (idx, label, score, h, z)
            idx_label_scores += list(zip(idx.cpu().data.numpy().tolist(),
                                         label.cpu().data.numpy().tolist(),
                                         score.cpu().data.numpy().tolist(),
                                         h.cpu().data.numpy().tolist(),
                                         z.cpu().data.numpy().tolist()))
            if self.print_batch_progress:
                print_progessbar(b, n_batch, Name='\t\tEvaluation Batch',
                                 Size=40, erase=True)
    if save_tSNE:
        if print_to_logger:
            logger.info("Computing the t-SNE representation.")
        # apply the t-SNE transform on both embeddings
        index, label, scores, h, z = zip(*idx_label_scores)
        h, z = np.array(h), np.array(z)
        h = TSNE(n_components=2).fit_transform(h)
        z = TSNE(n_components=2).fit_transform(z)
        self.eval_repr = list(zip(index, label, scores, h.tolist(), z.tolist()))
        if print_to_logger:
            logger.info("Successfully computed the t-SNE representation.")
    if return_auc:
        _, label, scores, _, _ = zip(*idx_label_scores)
        auc = roc_auc_score(np.array(label), np.array(scores))
        return auc
def evaluate(self, dataset, net, save_tSNE=False, return_loss=True,
             print_to_logger=True):
    """
    Evaluate the SimCLR network on the provided dataset.
    """
    if print_to_logger:
        logger = logging.getLogger()
    # make dataloader (with drop_last=True so the contrastive loss can be computed)
    loader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size,
                                         shuffle=True,
                                         num_workers=self.n_job_dataloader,
                                         drop_last=True)
    # put net on device
    net = net.to(self.device)
    # define loss function, supervised or self-supervised
    if self.supervised_loss:
        loss_fn = SupervisedContrastiveLoss(self.tau, self.batch_size,
                                            y_list=[1], device=self.device)
    else:
        loss_fn = NT_Xent_loss(self.tau, self.batch_size, device=self.device)
    if print_to_logger:
        logger.info("Start Evaluating SimCLR.")
    net.eval()
    with torch.no_grad():
        sum_loss = 0.0
        idx_h_z = []
        n_batch = len(loader)
        for b, data in enumerate(loader):
            # get input
            input_1, input_2, semi_label, idx = data
            input_1 = input_1.to(self.device).float()
            input_2 = input_2.to(self.device).float()
            semi_label = semi_label.to(self.device)
            idx = idx.to(self.device)
            # forward
            h_1, z_1 = net(input_1)
            h_2, z_2 = net(input_2)
            # normalize
            z_1 = F.normalize(z_1, dim=1)
            z_2 = F.normalize(z_2, dim=1)
            # compute loss
            if self.supervised_loss:
                # generate labels (1 if known abnormal, else considered normal)
                y = torch.where(semi_label == -1,
                                torch.ones_like(semi_label),
                                torch.zeros_like(semi_label))
                loss = loss_fn(z_1, z_2, y)
            else:
                loss = loss_fn(z_1, z_2)
            sum_loss += loss.item()
            # save embeddings
            if save_tSNE:
                idx_h_z += list(zip(idx.cpu().data.numpy().tolist(),
                                    h_1.cpu().data.numpy().tolist(),
                                    z_1.cpu().data.numpy().tolist()))
            if self.print_batch_progress:
                print_progessbar(b, n_batch, Name='\t\tEvaluation Batch',
                                 Size=40, erase=True)
    if save_tSNE:
        if print_to_logger:
            logger.info("Computing the t-SNE representation.")
        # apply the t-SNE transform on both embeddings
        index, h, z = zip(*idx_h_z)
        h, z = np.array(h), np.array(z)
        h = TSNE(n_components=2).fit_transform(h)
        z = TSNE(n_components=2).fit_transform(z)
        self.eval_repr = list(zip(index, h.tolist(), z.tolist()))
        if print_to_logger:
            logger.info("Successfully computed the t-SNE representation.")
    if return_loss:
        # return the mean loss over the batches
        return sum_loss / n_batch
def evaluate(self, dataset, save_tsne=False, return_scores=False):
    """
    Evaluate the passed network on the given dataset for the context
    restoration task.
    ----------
    INPUT
        |---- dataset (torch.utils.data.Dataset) the dataset to use for evaluation. It should output the original image,
        |           and the sample index.
        |---- save_tsne (bool) whether to compute and store in self.outputs the t-SNE representation of the feature map
        |           after the average pooling layer and before the MLP.
        |---- return_scores (bool) whether to return the measured ROC AUC, accuracy, recall, precision and f1-score.
    OUTPUT
        |---- (auc) (float) the ROC AUC on the dataset.
        |---- (acc) (float) the accuracy on the dataset.
        |---- (recall) (float) the recall on the dataset.
        |---- (precision) (float) the precision on the dataset.
        |---- (f1) (float) the f1-score on the dataset.
    """
    logger = logging.getLogger()
    # make loader
    loader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size,
                                         shuffle=False, num_workers=self.num_workers,
                                         worker_init_fn=lambda _: np.random.seed())
    # put net on device
    self.net = self.net.to(self.device)
    # Evaluate
    start_time = time.time()
    idx_repr = []        # placeholder for bottleneck representations
    idx_label_pred = []  # placeholder for labels & predictions
    n_batch = len(loader)
    self.net.eval()
    with torch.no_grad():
        for b, data in enumerate(loader):
            # get data: load in the standard way (no patch-swapped image)
            input, label, idx = data
            input = input.to(self.device).float()
            label = label.to(self.device).float()
            idx = idx.to(self.device)
            if save_tsne:
                # get representation
                self.net.return_bottleneck = True
                pred_score, repr = self.net(input)
                pred_score = torch.sigmoid(pred_score)
                pred = torch.where(pred_score > 0.5,
                                   torch.ones_like(pred_score, device=self.device),
                                   torch.zeros_like(pred_score, device=self.device))
                # downsample representation to reduce memory impact
                # repr = nn.AdaptiveAvgPool2d((4, 4))(repr)
                # add ravelled representations to the placeholder
                idx_repr += list(zip(idx.cpu().data.tolist(),
                                     repr.view(repr.shape[0], -1).cpu().data.tolist()))
                # pred_score is the sigmoid activation per class
                idx_label_pred += list(zip(idx.cpu().data.tolist(),
                                           label.cpu().data.tolist(),
                                           pred.cpu().data.tolist(),
                                           pred_score.cpu().data.tolist()))
            else:
                pred_score = self.net(input)
                pred_score = torch.sigmoid(pred_score)  # B x N_class
                pred = torch.where(pred_score > 0.5,
                                   torch.ones_like(pred_score, device=self.device),
                                   torch.zeros_like(pred_score, device=self.device))
                idx_label_pred += list(zip(idx.cpu().data.tolist(),
                                           label.cpu().data.tolist(),
                                           pred.cpu().data.tolist(),
                                           pred_score.cpu().data.tolist()))
            # print progress
            if self.print_progress:
                print_progessbar(b, n_batch, Name='\t\tEvaluation Batch', Size=50, erase=True)
    if save_tsne:
        # reset the network attributes
        self.net.return_bottleneck = False
        # compute the t-SNE of the representations
        idx, repr = zip(*idx_repr)
        repr = np.array(repr)
        logger.info('Computing the t-SNE representation.')
        repr_2D = TSNE(n_components=2).fit_transform(repr)
        self.outputs['eval']['repr'] = list(zip(idx, repr_2D.tolist()))
        logger.info('Successfully computed the t-SNE representation.')
    # compute the scores
    _, label, pred, pred_score = zip(*idx_label_pred)
    label, pred, pred_score = np.array(label), np.array(pred), np.array(pred_score)
    auc = roc_auc_score(label, pred_score, average=self.score_average)
    acc = accuracy_score(label.ravel(), pred.ravel())
    sub_acc = accuracy_score(label, pred)
    recall = recall_score(label, pred, average=self.score_average)
    precision = precision_score(label, pred, average=self.score_average)
    f1 = f1_score(label, pred, average=self.score_average)
    self.outputs['eval']['auc'] = auc
    self.outputs['eval']['acc'] = acc
    self.outputs['eval']['subset_acc'] = sub_acc
    self.outputs['eval']['recall'] = recall
    self.outputs['eval']['precision'] = precision
    self.outputs['eval']['f1'] = f1
    self.outputs['eval']['pred'] = idx_label_pred
    # finish evaluation
    self.outputs['eval']['time'] = time.time() - start_time
    if return_scores:
        return auc, acc, sub_acc, recall, precision, f1
def tsne(comp, perp, lr, init):
    print("N_components (fed to SVD) :", comp)
    print("Perplexity (fed to TSNE) :", perp)
    print("learning_rate (fed to TSNE) :", lr)
    print("init (fed to TSNE) :", init)
    X_reduced = TruncatedSVD(n_components=50, random_state=0).fit_transform(vectors)
    X_embedded = TSNE(n_components=comp, perplexity=perp, verbose=2,
                      learning_rate=lr, init=init).fit_transform(X_reduced)
    # base width and height of the image; render at 1x, 2x and 3x scale
    iw, ih = 16, 9
    for scale in (1, 2, 3):
        fig = plt.figure(figsize=(iw * scale, ih * scale))
        fig.patch.set_facecolor('white')
        ax = plt.axes(frameon=False)
        plt.setp(ax, xticks=(), yticks=())
        plt.subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=0.9,
                            wspace=0.0, hspace=0.0)
        plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c='black', marker=".")
        fig.savefig("img/" + datetime.datetime.now().strftime('%Y-%m-%d_%Hh%M')
                    + "_TSNE_LYRICS_NO_ANNOTATION" + type_of_run
                    + "_perp" + str(perp) + "_comp" + str(comp)
                    + "_lr" + str(lr) + "_" + str(init)
                    + "_300dpi_" + str(scale) + "wh.png",
                    transparent=False, dpi=300)
    X = X_embedded.tolist()
    for idx, x in enumerate(X):
        x.extend([authors[idx], titles[idx], ids[idx]])
    # save to a txt file as CSV for later import into d3
    txt_fn = (datetime.datetime.now().strftime('%Y-%m-%d_%Hh%M')
              + "_LYRICS_TSNE_" + type_of_run + "_perp" + str(perp)
              + "_comp" + str(comp) + "_lr" + str(lr) + "_" + str(init) + ".txt")
    txt_fn_path = "txt/" + txt_fn
    f_txt = open(txt_fn_path, 'w')
    f_txt.write("x,y,author,title,id")
    for x in X:
        f_txt.write("\n")
        for idx, item in enumerate(x):
            if idx == 4:
                f_txt.write("\"%s\"" % item)
            elif type(item) is str:
                f_txt.write("\"%s\"," % item)
            else:
                f_txt.write("%s," % item)
    f_txt.close()
    print("\nTXT file created at:", txt_fn_path)
class QueryData(dict):
    def __init__(self, query, labels=None, vectors=None):
        # avoid mutable default arguments (shared between instances)
        self.query = query
        self.labels = labels if labels is not None else []
        self.distances = []
        self.vectors = vectors if vectors is not None else []
        self.vocab_size = 0
        self.query_size = 0
        self.parsed_positive = []
        self.parsed_negative = []
        self.dim_embedded = []
        self.embedding = []
        self.cluster_data = []
        self.cluster_centroids = []

    def clear_data(self):
        self.distances.clear()
        self.labels.clear()
        self.vectors.clear()
        self.parsed_positive.clear()
        self.parsed_negative.clear()
        self.dim_embedded.clear()
        self.cluster_data.clear()
        self.cluster_centroids.clear()

    def _closest_node(self, node, nodes):
        nodes = np.asarray(nodes)
        dist_2 = np.sum((nodes - node) ** 2, axis=1)
        return np.argmin(dist_2)

    def dim_reduce(self):
        self.dim_embedded = TSNE(n_components=2).fit_transform(
            np.array(self.vectors, dtype=np.float64))
        self.embedding = self.dim_embedded.tolist()

    def cluster(self, num_clusters):
        clustering = KMeans(n_clusters=num_clusters).fit(self.dim_embedded)
        cluster_labels = clustering.labels_.tolist()
        centroids = clustering.cluster_centers_.tolist()
        for cluster_id in range(num_clusters):
            closest_node = self._closest_node(centroids[cluster_id],
                                              self.dim_embedded)
            closest_node_word = self.labels[closest_node]
            self.cluster_centroids.append({
                'x': centroids[cluster_id][0],
                'y': centroids[cluster_id][1],
                'label': cluster_labels[cluster_id],
                'word': closest_node_word
            })
        embedding = self.dim_embedded.tolist()
        for item in range(len(cluster_labels)):
            self.cluster_data.append({
                'x': embedding[item][0],
                'y': embedding[item][1],
                'label': cluster_labels[item]
            })

    def to_dict(self):
        return {
            'query': self.query,
            'labels': self.labels,
            'vocab_size': self.vocab_size,
            'query_size': self.query_size,
            'centroids': self.cluster_centroids,
            'cluster_data': self.cluster_data
        }
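# Hedged usage sketch for QueryData above; the labels and vectors are random
# stand-ins for real word vectors, and num_clusters is arbitrary. Assumes
# numpy, sklearn's TSNE and KMeans are imported as in the class body.
import numpy as np

qd = QueryData('example query',
               labels=['w%d' % i for i in range(60)],
               vectors=np.random.rand(60, 16).tolist())
qd.dim_reduce()             # t-SNE to 2-D (fills dim_embedded / embedding)
qd.cluster(num_clusters=4)  # k-means on the 2-D embedding
print(qd.to_dict()['centroids'][0])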
class Exploration():
    def __init__(self, query, labels=None, vectors=None):
        # avoid mutable default arguments (shared between instances)
        self.query = query
        self.parsed_query = {}
        self.labels = labels if labels is not None else []
        self.vectors = vectors if vectors is not None else []
        self.reduction = []
        self.clusters = []
        self.distances = []
        self.stats = {}

    def reduce(self):
        print('Performing tSNE reduction on {} vectors'.format(len(self.vectors)))
        # scikit-learn's TSNE replaced the previously used bh_sne, which was slower
        self.reduction = TSNE(n_components=2, verbose=1).fit_transform(
            np.array(self.vectors, dtype=np.float64))

    def cluster(self, num_clusters=30):
        clustering = KMeans(n_clusters=num_clusters)
        clustering.fit(self.reduction)
        self.clusters = clustering.labels_
        clustermatrix = []
        reduction = self.reduction.tolist()
        for cluster_id in range(num_clusters):
            clustermatrix.append([reduction[i] for i in range(len(self.vectors))
                                  if self.clusters[i] == cluster_id])
        self.cluster_centroids = clustering.cluster_centers_.tolist()
        self.cluster_centroids_closest_nodes = []
        for cluster_id in range(num_clusters):
            nodes_for_cluster = clustermatrix[cluster_id]
            centroid = self.cluster_centroids[cluster_id]
            closest_node_to_centroid = self._closest_node(centroid, nodes_for_cluster)
            coords = nodes_for_cluster[closest_node_to_centroid]
            node_id = reduction.index(coords)
            self.cluster_centroids_closest_nodes.append(node_id)

    def serialize(self):
        result = {
            'query': self.query,
            'parsed_query': self.parsed_query,
            'labels': self.labels,
            'stats': self.stats
        }
        if len(self.reduction) > 0:
            result['reduction'] = self.reduction.tolist()
        if len(self.distances) > 0:
            result['distances'] = self.distances
        if len(self.clusters) > 0:
            result['clusters'] = self.clusters.tolist()
            result['cluster_centroids'] = self.cluster_centroids
            closest_nodes = self.cluster_centroids_closest_nodes
            result['cluster_centroids_closest_nodes'] = closest_nodes
        return result

    def _closest_node(self, node, nodes):
        nodes = np.asarray(nodes)
        dist_2 = np.sum((nodes - node) ** 2, axis=1)
        return np.argmin(dist_2)
def tsne(self, X):
    X2 = TSNE(n_components=2, random_state=0, perplexity=40).fit_transform(X)
    return X2.tolist()
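# A minimal sketch exercising a tsne() helper like the one above, on random
# data. Note that scikit-learn requires perplexity < n_samples, so the fixed
# perplexity=40 implies at least 41 input rows.
import numpy as np
from sklearn.manifold import TSNE

X = np.random.rand(100, 64)
coords = TSNE(n_components=2, random_state=0, perplexity=40).fit_transform(X)
print(coords.shape)  # (100, 2)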
plt.scatter(X_embedded[:, 0], X_embedded[:, 1], s=size_array[:], c='black',
            marker=".", alpha=1)
# NOTE: figsize is not a savefig() argument; it must be set on plt.figure().
fig.savefig("img/" + save_PATH + "/TSNE_SONDHEIM_{0:05d}.png".format(iters),
            transparent=True, dpi=320)
print("IMAGE saved at: img/" + save_PATH
      + "/TSNE_SONDHEIM_{0:05d}.png".format(iters))

# for i, a in enumerate(txt_fn):
#     ax.annotate(a, (X_embedded[:, 0][i], X_embedded[:, 1][i]))
#     # ax.annotate(a + "\n'" + titles[i] + "'", (X_embedded[:, 0][i], X_embedded[:, 1][i]))
#     # plt.show()
#     fig.savefig("img/" + save_PATH + "/" + str(iters) + "_TSNE_SONDHEIM_ANNOTATED.png",
#                 transparent=False, dpi=320)

X = X_embedded.tolist()
for idx, x in enumerate(X):
    x.extend([ids[idx]])

# save to a txt file as CSV for later import into d3
txt_fn = "LR{}_EXAG{}_C{}_LAYERS{}_P{}.txt".format(lr, exag, complexity_INIT,
                                                   n_comp, perp)
txt_fn_path = txt_save_PATH + txt_fn
f_txt = open(txt_fn_path, 'w')
f_txt.write("x,y,id,s")
inc = 0
for x in X:
def evaluate(self, dataset, net, save_tSNE=False, return_loss=True,
             print_to_logger=True):
    """
    Evaluate the contrastive network on the provided dataset.
    ----------
    INPUT
        |---- dataset (torch.utils.data.Dataset) the dataset on which the
        |           network is evaluated.
        |---- net (nn.Module) The encoder network to validate.
        |---- save_tSNE (bool) whether to save a 2D t-SNE representation of
        |           the embedded data points.
        |---- return_loss (bool) whether to return the validation loss.
        |---- print_to_logger (bool) whether to print in the logger.
    OUTPUT
        |---- (loss) (float) the validation loss, if required.
    """
    if print_to_logger:
        logger = logging.getLogger()
    # make dataloader (with drop_last=True so the contrastive loss can be computed)
    loader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size,
                                         shuffle=True,
                                         num_workers=self.n_job_dataloader,
                                         drop_last=True)
    # put net on device
    net = net.to(self.device)
    # define loss function
    loss_fn = InfoNCE_loss(self.tau, self.batch_size, device=self.device)
    if print_to_logger:
        logger.info("Start Evaluating Contrastive.")
    net.eval()
    with torch.no_grad():
        sum_loss = 0.0
        idx_h_z = []
        n_batch = len(loader)
        for b, data in enumerate(loader):
            # get input
            input_1, input_2, _, idx = data
            input_1 = input_1.to(self.device).float()
            input_2 = input_2.to(self.device).float()
            idx = idx.to(self.device)
            # forward
            h_1, z_1 = net(input_1)
            h_2, z_2 = net(input_2)
            # normalize
            z_1 = F.normalize(z_1, dim=1)
            z_2 = F.normalize(z_2, dim=1)
            # compute loss
            loss = loss_fn(z_1, z_2)
            sum_loss += loss.item()
            # save embeddings
            if save_tSNE:
                idx_h_z += list(zip(idx.cpu().data.numpy().tolist(),
                                    h_1.cpu().data.numpy().tolist(),
                                    z_1.cpu().data.numpy().tolist()))
            if self.print_batch_progress:
                print_progessbar(b, n_batch, Name='\t\tEvaluation Batch',
                                 Size=40, erase=True)
    if save_tSNE:
        if print_to_logger:
            logger.info("Computing the t-SNE representation.")
        # apply the t-SNE transform on both embeddings
        index, h, z = zip(*idx_h_z)
        h, z = np.array(h), np.array(z)
        h = TSNE(n_components=2).fit_transform(h)
        z = TSNE(n_components=2).fit_transform(z)
        self.eval_repr = list(zip(index, h.tolist(), z.tolist()))
        if print_to_logger:
            logger.info("Successfully computed the t-SNE representation.")
    if return_loss:
        # return the mean loss over the batches
        return sum_loss / n_batch
def evaluate(self, net, dataset, return_auc=False, print_to_logger=True,
             save_tSNE=True):
    """
    Evaluate the DMSAD network on the provided dataset.
    ----------
    INPUT
        |---- net (nn.Module) The DMSAD network to validate.
        |---- dataset (torch.utils.data.Dataset) the dataset on which the
        |           network is evaluated.
        |---- return_auc (bool) whether to return the computed AUC or not.
        |---- print_to_logger (bool) whether to print in the logger.
        |---- save_tSNE (bool) whether to save a 2D t-SNE representation of
        |           the embedded data points.
    OUTPUT
        |---- None
    """
    if print_to_logger:
        logger = logging.getLogger()
    # make dataloader
    loader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size,
                                         shuffle=True,
                                         num_workers=self.n_job_dataloader)
    # put net on device
    net = net.to(self.device)
    # Evaluating
    if print_to_logger:
        logger.info('Start Evaluating the DMSAD.')
    start_time = time.time()
    idx_label_score = []
    net.eval()
    with torch.no_grad():
        for b, data in enumerate(loader):
            # get data on device
            input, label, mask, semi_label, idx = data
            input = input.to(self.device).float()
            label = label.to(self.device)
            mask = mask.to(self.device)
            semi_label = semi_label.to(self.device)
            idx = idx.to(self.device)
            # mask input
            input = input * mask
            # embed input and compute the anomaly score
            _, embed = net(input)
            # find the closest sphere
            dist, sphere_idx = torch.min(torch.norm(self.c.unsqueeze(0) - embed.unsqueeze(1),
                                                    p=2, dim=2), dim=1)
            # if self.R is not None:
            #     # anomaly score positive if dist > R and negative if dist < R
            #     score = dist - torch.stack([self.R[j] for j in sphere_idx], dim=0)
            # else:
            #     # otherwise the score is just the minimal distance to a center
            score = dist
            # append idx, label, score, sphere index and embedding
            idx_label_score += list(zip(idx.cpu().data.numpy().tolist(),
                                        label.cpu().data.numpy().tolist(),
                                        score.cpu().data.numpy().tolist(),
                                        sphere_idx.cpu().data.numpy().tolist(),
                                        embed.cpu().data.numpy().tolist()))
            if self.print_batch_progress:
                print_progessbar(b, len(loader), Name='\t\t Evaluation Batch',
                                 Size=40, erase=True)
    # compute the AUC
    index, label, score, sphere_index, embed = zip(*idx_label_score)
    label, score = np.array(label), np.array(score)
    auc = roc_auc_score(label, score)
    if save_tSNE:
        embed = np.array(embed)
        embed = TSNE(n_components=2).fit_transform(embed)
        idx_label_score = list(zip(index, label.tolist(), score.tolist(),
                                   sphere_index, embed.tolist()))
    self.eval_time = time.time() - start_time
    self.eval_scores = idx_label_score
    self.eval_auc = auc
    if print_to_logger:
        logger.info(f'Evaluation Time : {self.eval_time}')
        logger.info(f'Evaluation AUC : {self.eval_auc:.3%}')
        logger.info('Finished Evaluating the DMSAD.')
    if return_auc:
        return auc
vectors = []
artists = []
with open('song_vectors.json') as jsonfile:
    result = json.load(jsonfile)
for artist, values in result.items():
    if artist in ["Eminem", "Nirvana", "Billy Talent", "Ska-P", "Daft Punk",
                  "Chick Corea", "Kings Of Leon"]:
        for value in values:
            vector = value["latent"]
            artists.append(artist)
            vectors.append(vector)

print(np.array(vectors).shape)
X_embedded = TSNE(n_components=2).fit_transform(np.array(vectors))

for artist, p in zip(artists, X_embedded.tolist()):
    plt.scatter(p[0], p[1], color=artist_map[artist], s=30)

ax = plt.gca()
ax.set_xticks([])
ax.set_yticks([])
ax.set(xlabel="t-SNE axis 1", ylabel="t-SNE axis 2")
ax.grid(True)
# plt.show()
plt.savefig("latent_vectors.pgf")
from sklearn.manifold import TSNE
import numpy as np
import json

with open('mv_data.json') as f:
    j = json.loads(f.read())


def distance(a, b):
    # treat each flattened vector as a 40x40 view matrix
    return np.linalg.norm(a.reshape((40, 40)) - b.reshape((40, 40)))


data = np.array(list(map(lambda x: np.array(x).flatten(),
                         map(lambda x: x['viewMatrix'], j['papers']))))
embed = TSNE(metric=distance).fit_transform(data)
# rescale the embedding into [-1, 1] per axis
embed -= embed.min(axis=0)
embed /= embed.max(axis=0)
embed *= 2
embed -= 1
with open('tsne.json', 'w') as f:
    f.write(json.dumps(embed.tolist()))
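# A callable metric, as used above, forces TSNE onto a slow generic code path.
# Since the Frobenius norm of the 40x40 difference equals the euclidean
# distance between the flattened vectors, a hedged alternative is to
# precompute the distance matrix with scipy and pass metric='precomputed'
# (recent scikit-learn versions then require init='random'). The data below
# is a random stand-in for the flattened view matrices.
import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn.manifold import TSNE

data = np.random.rand(50, 1600)
D = squareform(pdist(data, metric='euclidean'))
embed = TSNE(metric='precomputed', init='random',
             perplexity=10).fit_transform(D)
print(embed.shape)  # (50, 2)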
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np
import sys

relu = lambda x: max(0.0, x)
ans = []
with open('tsne.in') as f:  # the file needs to be reopened for reading
# with open('in.txt') as f:
    for line2 in f:
        tmp = line2.rstrip().split()
        for i in range(len(tmp)):
            tmp[i] = relu(float(tmp[i]))
        # tmp = torch.Tensor(tmp)
        ans.append(tmp)
# ans = ans[:2000]
x = np.array(ans)
print(x.tolist())
y = TSNE(n_components=2, perplexity=10, learning_rate=50,
         n_iter=10000).fit_transform(x)
with open('draw_data.py', 'w') as f:
    f.write('a=')
    f.write('%s' % y.tolist())
class tSNE(Similarity):

    _similarity_type = 'tsne'

    def __init__(self, *args, **kwargs):
        """
        This function might be called locally, in which case we want to
        return the actual similarity calculator instance. Or it might be run
        remotely (via celery), in which case we want to return the serialized
        version of the similarity instance.

        Parameters
        ----------
        display_type : string
            String representation of the display; can be 'plot' or 'hexbin'.

        Returns
        -------
        N/A
        """
        # Pull the display type out of kwargs if it is there. If not, then
        # use 'plot' as the default.
        if 'display_type' in kwargs:
            display_type = kwargs['display_type']
            del kwargs['display_type']
        else:
            display_type = 'plot'

        super().__init__(tSNE._similarity_type, *args, **kwargs)
        log.info('Created {}'.format(self._similarity_type))

        # Each line / element in these should correspond.
        self._Y = None
        self._fingerprints = []
        self._filename_index = []
        self._distance_measure = 'l2'

        # Display types
        self._display_type = display_type
        self._display_types = ['plot', 'hexbin', 'mpl']
        if self._display_type not in self._display_types:
            raise Exception('Display type {} not one of {}'.format(
                self._display_type, self._display_types))

        # Define the distance measures.
        self._distance_measures = {
            'l2': lambda Y, point: np.sqrt(np.sum((Y - np.array(point))**2, axis=1)),
            'l1': lambda Y, point: np.sum(np.abs(Y - np.array(point)), axis=1),
        }

    @property
    def data(self):
        return self._Y

    @property
    def data_filtered(self):
        return self._Y[self._fingerprint_filter_inds]

    #
    # Calculation Methods
    #

    def calculate(self, fingerprints):
        """
        Calculate the tSNE based on the fingerprints.

        Parameters
        ----------
        fingerprints : list of Fingerprint instances
            The fingerprints we want to calculate over.

        Returns
        -------
        N/A
        """
        log.info('Going to calculate tSNE from {} fingerprints'.format(
            len(fingerprints)))

        #
        # Filter the fingerprints, if the filter is set.
        #
        if self._fingerprint_filter is not None:
            fingerprints = self._fingerprint_filter(fingerprints)

        #
        # Calculate the unique labels.
        #
        labels = []
        values = {}
        for ii, fp in enumerate(fingerprints):
            log.debug('    fingerprint is {}'.format(fp))

            # Add to the unique label list.
            labels.extend([pred[1] for pred in fp.predictions
                           if pred[1] not in labels])

            # Store the predictions for the next processing step.
            values[ii] = fp.predictions

            self._fingerprints.append(fp)
        log.info('Unique labels {}'.format(labels))

        #
        # Set up the similarity matrix X based on the predictions.
        #
        X = np.zeros((len(fingerprints), len(labels)))
        for ii, fp in enumerate(fingerprints):
            inds = [labels.index(prediction[1]) for prediction in values[ii]]
            X[ii][inds] = [prediction[2] for prediction in values[ii]]

        log.debug('X is {}'.format(X))
        log.debug('Fingerprint list {}'.format(self._fingerprints))

        #
        # Compute the tSNE of the data.
        #
        log.info('Calculating the tSNE...')
        self._Y = TSNE(n_components=2).fit_transform(X)
        log.debug('self._Y is {}'.format(self._Y))
        log.info('Done calculation')

    #
    # Utility Methods
    #

    def save(self):
        """
        Convert the instance to a dict.

        Parameters
        ----------
        None

        Returns
        -------
        dict
            Dictionary representation of this instance.
        """
        log.info('Returning the dictionary of information')
        return {
            'uuid': self._uuid,
            'similarity_type': self._similarity_type,
            'similarity': self._Y.tolist(),
            'fingerprint': [fp.save() for fp in self._fingerprints],
            'parameters': {
                'distance_measure': self._distance_measure
            }
        }

    def load(self, thedict, db=None):
        """
        Reload the internal variables from the dictionary.
        Parameters
        ----------
        thedict : dict
            The dictionary representation to load.
        db : database object
            The database to load from.

        Returns
        -------
        N/A
        """
        log.info('Loading the dictionary of information with database {}'.format(db))
        self._uuid = thedict['uuid']
        self._similarity_type = thedict['similarity_type']
        self._Y = np.array(thedict['similarity'])
        self._fingerprints = [Fingerprint.factory(x)
                              for x in thedict['fingerprint']]
        self._parameters = thedict['parameters']
        self._distance_measure = self._parameters['distance_measure']
        self._fingerprint_filter_inds = list(range(len(self._fingerprints)))

    #
    # Display methods
    #

    def set_display_type(self, display_type):
        """
        Set the display type. Currently 'plot', 'hexbin', and 'mpl' are defined.

        Parameters
        ----------
        display_type : string
            Display type: 'plot', 'hexbin', and 'mpl' are defined.

        Returns
        -------
        N/A
        """
        if display_type in self._display_types:
            self._display_type = display_type
        else:
            raise ValueError('Display type {} not in {}'.format(
                display_type, self._display_types))

    def select_distance_measure(self, distance_measure=None):
        """
        Select the distance measure.

        Parameters
        ----------
        distance_measure : string
            One of the keys of self._distance_measures ('l2' or 'l1').

        Returns
        -------
        N/A
        """
        if not distance_measure:
            dm_options = list(self._distance_measures.keys())
            selected = False
            while not selected:
                # Show the options so the user can select one.
                print('Select distance measure to use (q to quit):')
                for ii, x in enumerate(dm_options):
                    print('   {}) {}'.format(ii, x))
                s = input('Select Number > ')
                if s == 'q':
                    return
                try:
                    s = int(s)
                    if 0 <= s < len(dm_options):
                        self._distance_measure = dm_options[s]
                        selected = True
                except Exception:
                    pass
        else:
            if distance_measure in self._distance_measures:
                self._distance_measure = distance_measure
            else:
                self._distance_measure = list(self._distance_measures.keys())[0]
                log.error('ERROR: No definition for {} so using {} instead.'.format(
                    distance_measure, self._distance_measure))

    def display(self, tsne_axis):
        """
        Display the plot into the matplotlib axis in the parameter based on
        the plot type. This just determines the plot type and then calls the
        internal plot function.

        Parameters
        ----------
        tsne_axis : Matplotlib.axes.axis
            The matplotlib axis into which we want to display the plot.

        Returns
        -------
        N/A
        """
        if self._display_type == 'plot':
            self._display_plot(tsne_axis)
        elif self._display_type == 'hexbin':
            return self._display_hexbin(tsne_axis)
        # elif self._display_type == 'mpl':
        #     self._display_mpl(tsne_axis)
        else:
            raise ValueError('Plot type {} is not in the valid list {}'.format(
                self._display_type, self._display_types))

    def _display_plot(self, tsne_axis):
        """
        Display the plot into the matplotlib axis as a regular scatter plot.

        Parameters
        ----------
        tsne_axis : Matplotlib.axes.axis
            The matplotlib axis into which we want to display the plot.

        Returns
        -------
        N/A
        """
        tsne_axis.plot(self._Y[:, 0], self._Y[:, 1])  # , '.')
        tsne_axis.grid('on')
        tsne_axis.set_title('tSNE [{}]'.format(self._distance_measure))

    def _display_hexbin(self, tsne_axis):
        """
        Display the plot into the matplotlib axis as a hexbin.

        Parameters
        ----------
        tsne_axis : Matplotlib.axes.axis
            The matplotlib axis into which we want to display the plot.
        Returns
        -------
        N/A
        """
        output = tsne_axis.hexbin(self._Y[:, 0], self._Y[:, 1], cmap='hot')
        tsne_axis.grid('on')
        tsne_axis.set_title('tSNE [{}]'.format(self._distance_measure))

        # Set the color limits so that it is a little brighter.
        limmax = np.percentile(output.get_array(), 99.9)
        output.set_clim((0, limmax))

        return output

    def find_similar(self, point, n=9, allow_overlapping_bounding_boxes=True):
        """
        Find fingerprints that are close to the input point.

        Parameters
        ----------
        point : tuple (int, int)
            A point in the plot.
        n : int
            Number to return.
        allow_overlapping_bounding_boxes : bool
            Whether to allow overlapping bounding boxes or not.

        Returns
        -------
        list
            List of dicts that describe the closest fingerprints.
        """
        log.info('')

        if self._fingerprint_filter_inds is None:
            self._fingerprint_filter_inds = list(range(len(self._fingerprints)))

        distances = self._distance_measures[self._distance_measure](self._Y, point)

        log.debug('Filtering: n distances {} n filter_inds {}'.format(
            len(distances), len(self._fingerprint_filter_inds)))

        inds = []
        for ind in np.argsort(distances):
            # First, make sure this index is one of the filtered ones.
            if ind in self._fingerprint_filter_inds:
                # Next, check whether overlapping bounding boxes are allowed.
                # If not, make sure this one doesn't overlap with any in the
                # list so far.
                if (allow_overlapping_bounding_boxes or
                        not any([self._fingerprints[ind].cutout.bounding_box.overlap(
                            self._fingerprints[ii].cutout.bounding_box)
                            for ii in inds])):
                    inds.append(ind)
                if len(inds) == n:
                    break

        return [{
            'tsne_point': self._Y[ind],
            'distance': distances[ind],
            'fingerprint': self._fingerprints[ind]
        } for ind in inds[:n]]

    def cutout_point(self, cutout):
        """
        Given a cutout (and therefore a fingerprint), find the point in the
        tSNE plot that it corresponds to.

        Parameters
        ----------
        cutout : Cutout
            The cutout we want to find.

        Returns
        -------
        tSNE point : tuple
            Point in the tSNE space the cutout corresponds to.
        """
        log.info('cutout {}'.format(cutout))
        index = [fingerprint.cutout.uuid
                 for fingerprint in self._fingerprints].index(cutout.uuid)
        return self._Y[index]

    def closest_cutout(self, data, point):
        """
        Given a data element and a point in the tSNE plot, find the cutout
        on that data whose bounding box is closest to the point.

        Parameters
        ----------
        data : Data
            The data element whose cutouts are searched.
        point : tuple
            The point in the plot to match against.

        Returns
        -------
        Cutout
            The closest cutout on the data element.
        """
        log.info('data {} point {}'.format(data, point))

        #
        # Get the cutouts associated with the data passed in.
        #
        cutouts = [fingerprint.cutout for fingerprint in self._fingerprints
                   if fingerprint.cutout.data == data]

        #
        # Compute the distance between the cutout bounding box centers and
        # the point.
        #
        distances = [c.bounding_box.distance(point) for c in cutouts]

        #
        # Find the smallest.
        #
        index = np.argsort(distances)[0]
        log.debug('Closest cutout is with bb {} and dist {}'.format(
            cutouts[index].bounding_box, distances[index]))

        return cutouts[index]