def print_feature_eval(self):
    '''
    For each feature type, calculate the means and the std dev for each
    cluster, then take the mean and std dev of those quantities, and print
    the results as a table (six columns per metric; legend printed below).
    '''
    # NOTE: Python 2 code (print statements).
    print "FEATURE EVAL"
    print
    stats = dict()
    # Feature-set names are read from one arbitrary member document;
    # presumably every document exposes the same feature sets -- TODO confirm.
    for name in self.clusters[0].members[0].get_feature_set_names():
        # The lambdas are invoked inside feature_eval_metrics before the loop
        # advances, so capturing the loop variable `name` is safe here.
        stats[name + "_global"] = self.feature_eval_metrics(
            lambda cluster, _doc: cluster.center.global_sim(_doc, name))
        stats[name + "_region_uniform_weights"] = self.feature_eval_metrics(
            lambda cluster, _doc: utils.avg_val_mat(
                cluster.center.region_sim(_doc, name)))
        stats[name + "_region_fixed_weights"] = self.feature_eval_metrics(
            lambda cluster, _doc: utils.avg_val_mat(
                utils.mult_mats(
                    cluster.center.region_sim_weights(_doc, name))))
    # Overall model similarity, not tied to a single feature type.
    stats['confirm'] = self.feature_eval_metrics(
        lambda cluster, _doc: self.confirm.cluster_doc_similarity(
            cluster, _doc))
    # Width that makes every metric name line up in the printed table.
    padding_len = 1 + max(map(len, stats.keys()))
    print "Columns are in order:"
    print "1) Mean Similarity between Document and assigned Clusters"
    print "2) Std. Dev of Document-Cluster similarities"
    print "3) Mean Average Cluster similarity (cluster members to cluster)"
    print "4) Std Dev of Average Cluster similarity"
    print "5) Mean of Std Dev of Cluster similarity (cluster members to cluster)"
    print "6) Std Dev of the Std Dev of Cluster similarity"
    print
    for name in sorted(list(stats.keys())):
        # One row per metric: padded name, then the tab-separated values.
        print utils.pad_to_len(name, padding_len), "\t", "\t".join(
            map(lambda x: "%.4f" % x, stats[name]))
        # Blank separator line between groups of related rows.
        # NOTE(review): no key ever contains "overall"; that clause looks stale.
        if "_uniform" in name or "overall" in name:
            print
    print
    print
def collate_fn(self, datas):
    """Assemble a list of samples into one padded batch.

    'length' and 'label' become LongTensors; every 'context' is padded
    (with self.padding) to the longest length in the batch, capped at
    self.max_seq_len.
    """
    lengths = torch.LongTensor([sample['length'] for sample in datas])
    # Pad target: batch maximum, but never beyond the configured cap.
    target_len = min(self.max_seq_len, max(lengths))
    contexts = [pad_to_len(sample['context'], target_len, self.padding)
                for sample in datas]
    labels = [sample['label'] for sample in datas]
    return {
        'length': lengths,
        'context': torch.tensor(contexts),
        'label': torch.LongTensor(labels),
    }
def collate_fn(self, samples):
    """Collate samples into a batch: scalar fields are copied through as
    lists; sequence fields are padded to the batch maximum and tensorized."""
    batch = {key: [s[key] for s in samples] for key in ('id', 'len_text')}
    for key in ('text', 'attention_mask'):
        longest = max(len(s[key]) for s in samples)
        # pad_to_len pads the whole list of sequences at once here.
        padded = pad_to_len([s[key] for s in samples], longest, self.padding)
        batch[key] = torch.tensor(padded)
    return batch
def collate_fn(self, samples):
    """Collate summarization samples into a padded batch.

    Sorts `samples` in place by descending 'len_text' (the reordering is
    visible to the caller), copies scalar fields through as lists, and pads
    each sequence field to the batch maximum before tensorizing.
    """
    samples.sort(key=lambda s: s['len_text'], reverse=True)  # longest first
    batch = {key: [s[key] for s in samples]
             for key in ('id', 'len_text', 'len_summary')}
    for key in ('text', 'summary', 'attention_mask'):
        longest = max(len(s[key]) for s in samples)
        batch[key] = torch.tensor(
            pad_to_len([s[key] for s in samples], longest, self.padding))
    return batch
def print_feature_eval(self):
    '''
    For each feature type, calculate the means and the std dev for each
    cluster, then take the mean and std dev of those quantities, and print
    the results as a table (six columns per metric; legend printed below).
    '''
    # NOTE: Python 2 code (print statements).
    print "FEATURE EVAL"
    print
    stats = dict()
    # Feature-set names are read from one arbitrary member document;
    # presumably every document exposes the same feature sets -- TODO confirm.
    for name in self.clusters[0].members[0].get_feature_set_names():
        # The lambdas are invoked inside feature_eval_metrics before the loop
        # advances, so capturing the loop variable `name` is safe here.
        stats[name + "_global"] = self.feature_eval_metrics(
            lambda cluster, _doc: cluster.center.global_sim(_doc, name))
        stats[name + "_region_uniform_weights"] = self.feature_eval_metrics(
            lambda cluster, _doc: utils.avg_val_mat(
                cluster.center.region_sim(_doc, name)))
        stats[name + "_region_fixed_weights"] = self.feature_eval_metrics(
            lambda cluster, _doc: utils.avg_val_mat(
                utils.mult_mats(
                    cluster.center.region_sim_weights(_doc, name))))
    # Overall model similarity, not tied to a single feature type.
    stats['confirm'] = self.feature_eval_metrics(
        lambda cluster, _doc: self.confirm.cluster_doc_similarity(
            cluster, _doc))
    # Width that makes every metric name line up in the printed table.
    padding_len = 1 + max(map(len, stats.keys()))
    print "Columns are in order:"
    print "1) Mean Similarity between Document and assigned Clusters"
    print "2) Std. Dev of Document-Cluster similarities"
    print "3) Mean Average Cluster similarity (cluster members to cluster)"
    print "4) Std Dev of Average Cluster similarity"
    print "5) Mean of Std Dev of Cluster similarity (cluster members to cluster)"
    print "6) Std Dev of the Std Dev of Cluster similarity"
    print
    for name in sorted(list(stats.keys())):
        # One row per metric: padded name, then the tab-separated values.
        print utils.pad_to_len(name, padding_len), "\t", "\t".join(
            map(lambda x: "%.4f" % x, stats[name]))
        # Blank separator line between groups of related rows.
        # NOTE(review): no key ever contains "overall"; that clause looks stale.
        if "_uniform" in name or "overall" in name:
            print
    print
    print
def collate_fn(self, samples):
    """Collate sequence-tagging samples into a padded batch.

    Scalar fields are copied through as lists.  'text' and 'label' are
    padded and tensorized only when every sample carries the field; labels
    are padded with SeqTaggingDataset.ignore_idx rather than self.padding
    (presumably so a loss function can mask the padding -- TODO confirm).
    """
    batch = {key: [s[key] for s in samples] for key in ('id', 'sent_range')}
    for key in ('text', 'label'):
        # Skip the field entirely unless every sample carries it.
        if not all(key in s for s in samples):
            continue
        fill = SeqTaggingDataset.ignore_idx if key == 'label' else self.padding
        longest = max(len(s[key]) for s in samples)
        batch[key] = torch.tensor(
            pad_to_len([s[key] for s in samples], longest, fill))
    return batch
def collate_fn(self, samples):
    """Collate summarization samples into a padded batch.

    Sorts `samples` in place by descending raw text length (the reordering
    is visible to the caller).  Each field is included only when every
    sample carries it; sequence fields are padded to the batch maximum.
    """
    samples.sort(key=lambda s: len(s['text']), reverse=True)  # longest first
    batch = {}
    for key in ('id', 'len_text', 'len_summary'):
        if all(key in s for s in samples):
            batch[key] = [s[key] for s in samples]
    for key in ('text', 'summary', 'attention_mask'):
        if not all(key in s for s in samples):
            continue
        longest = max(len(s[key]) for s in samples)
        batch[key] = torch.tensor(
            pad_to_len([s[key] for s in samples], longest, self.padding))
    # NOTE(review): despite the name, this records the UNpadded text length
    # of each sample -- confirm against consumers before renaming.
    batch['padding_len'] = [len(s['text']) for s in samples]
    return batch
def print_doc_cluster_sim_mat(self):
    # Print the document-by-cluster similarity matrix (Python 2 code).
    # Rows are documents, columns are clusters (see the index legend printed
    # first).  Markers:
    #   '*' after a score -> that cluster shares the document's label
    #   '#' after a doc   -> most-similar cluster has a different true label
    #   '^' after a doc   -> assigned cluster is not the most similar one
    print "CLUSTER-DOC SIM MAT"
    print
    # Legend: column index -> cluster label.
    for x, cluster in enumerate(self.clusters):
        print "%d:\t%s" % (x, cluster.label)
    print
    print "documents labeled with # indicate that their most similar cluster has a different true label"
    print "documents labeled with ^ indicate that their assigned cluster is not the most similar cluster"
    print "cluster sim scores labeled with * indicate that the cluster shares the label with the document"
    print
    # Header row: 50 spaces of row-label padding, then the column indices.
    print (" " * 50) + "\t\t".join(map(str, xrange(len(self.clusters))))
    print
    num_closest_to_incorrect_cluster = 0
    # Precomputed matrix; rows follow self.docs, columns follow self.clusters.
    doc_cluster_sim_mat = self.confirm.get_doc_cluster_sim_mat()
    for doc_idx in xrange(len(self.docs)):
        _doc = self.docs[doc_idx]
        to_print = list()
        best_cluster = None
        best_sim_score = -1  # assumes similarities exceed -1 -- TODO confirm
        post = ""  # marker suffix appended after the document's label
        for cluster_idx in xrange(len(self.clusters)):
            cluster = self.clusters[cluster_idx]
            sim_score = doc_cluster_sim_mat[doc_idx][cluster_idx]
            # Track the most similar cluster seen so far.
            if sim_score > best_sim_score:
                best_cluster = cluster
                best_sim_score = sim_score
            to_print.append("%3.2f" % sim_score)
            # '*' marks clusters whose label matches the document's.
            if (cluster.label == _doc.label):
                to_print[-1] += '*'
        if _doc.label != best_cluster.label:
            num_closest_to_incorrect_cluster += 1
            post += "#"
        if _doc not in best_cluster.members:
            post += "^"
        # Row: padded "<id> <label><markers>", then tab-separated scores.
        print "%s%s" % (utils.pad_to_len("%s %s" % (_doc._id, (_doc.label + post)), 50), "\t".join(to_print))
    print
    print "Number of docs most similar to a wrong cluster: %d / %d = %2.1f%%" % (
        num_closest_to_incorrect_cluster, len(self.docs),
        100.0 * num_closest_to_incorrect_cluster / len(self.docs))
    print
    print
def print_doc_cluster_sim_mat(self):
    # Print the document-by-cluster similarity matrix (Python 2 code).
    # Rows are documents, columns are clusters (see the index legend printed
    # first).  Markers:
    #   '*' after a score -> that cluster shares the document's label
    #   '#' after a doc   -> most-similar cluster has a different true label
    #   '^' after a doc   -> assigned cluster is not the most similar one
    print "CLUSTER-DOC SIM MAT"
    print
    # Legend: column index -> cluster label.
    for x, cluster in enumerate(self.clusters):
        print "%d:\t%s" % (x, cluster.label)
    print
    print "documents labeled with # indicate that their most similar cluster has a different true label"
    print "documents labeled with ^ indicate that their assigned cluster is not the most similar cluster"
    print "cluster sim scores labeled with * indicate that the cluster shares the label with the document"
    print
    # Header row: 50 spaces of row-label padding, then the column indices.
    print (" " * 50) + "\t\t".join(map(str, xrange(len(self.clusters))))
    print
    num_closest_to_incorrect_cluster = 0
    # Precomputed matrix; rows follow self.docs, columns follow self.clusters.
    doc_cluster_sim_mat = self.confirm.get_doc_cluster_sim_mat()
    for doc_idx in xrange(len(self.docs)):
        _doc = self.docs[doc_idx]
        to_print = list()
        best_cluster = None
        best_sim_score = -1  # assumes similarities exceed -1 -- TODO confirm
        post = ""  # marker suffix appended after the document's label
        for cluster_idx in xrange(len(self.clusters)):
            cluster = self.clusters[cluster_idx]
            sim_score = doc_cluster_sim_mat[doc_idx][cluster_idx]
            # Track the most similar cluster seen so far.
            if sim_score > best_sim_score:
                best_cluster = cluster
                best_sim_score = sim_score
            to_print.append("%3.2f" % sim_score)
            # '*' marks clusters whose label matches the document's.
            if (cluster.label == _doc.label):
                to_print[-1] += '*'
        if _doc.label != best_cluster.label:
            num_closest_to_incorrect_cluster += 1
            post += "#"
        if _doc not in best_cluster.members:
            post += "^"
        # Row: padded "<id> <label><markers>", then tab-separated scores.
        print "%s%s" % (utils.pad_to_len("%s %s" % (_doc._id, (_doc.label + post)), 50), "\t".join(to_print))
    print
    print "Number of docs most similar to a wrong cluster: %d / %d = %2.1f%%" % (
        num_closest_to_incorrect_cluster, len(self.docs),
        100.0 * num_closest_to_incorrect_cluster / len(self.docs))
    print
    print