def compute(title, cdict, ldict):
    """Compute extended BCubed precision and recall, and print the results."""
    precision = bcubed.precision(cdict, ldict)
    recall = bcubed.recall(cdict, ldict)
    fscore = bcubed.fscore(precision, recall)
    print("{}: precision={:.2f}, recall={:.2f}, fscore={:.2f}".format(
        title, precision, recall, fscore))
def computeBcubed(title, cdict, ldict):
    """Compute extended BCubed precision, recall, and F-score, and return them."""
    precision = bcubed.precision(cdict, ldict)
    recall = bcubed.recall(cdict, ldict)
    fscore = bcubed.fscore(precision, recall)
    return precision, recall, fscore
from typing import Tuple

import bcubed


def bcubed_scores(cdict: dict, gdict: dict) -> Tuple[float, float, float]:
    """Return extended BCubed precision, recall, and F1 for a clustering."""
    precision = bcubed.precision(cdict, gdict)
    recall = bcubed.recall(cdict, gdict)
    f1_score = bcubed.fscore(precision, recall)
    return precision, recall, f1_score
def check_with_bcubed_lib(gold, pred):
    import bcubed
    # Map each item to a singleton set holding its gold label / cluster id.
    ldict = {'item{}'.format(i): {k} for i, k in enumerate(gold)}
    cdict = {'item{}'.format(i): {k} for i, k in enumerate(pred)}
    precision = bcubed.precision(cdict, ldict)
    recall = bcubed.recall(cdict, ldict)
    fscore = bcubed.fscore(precision, recall)
    print('P={} R={} F1={}'.format(precision, recall, fscore))
import bcubed as b3  # assumed alias: the snippet refers to the bcubed package as `b3`


def bcubed(gold_lst, predicted_lst):
    """Take gold and predicted label lists; return recall, precision, f1score."""
    gold = {i: {cluster} for i, cluster in enumerate(gold_lst)}
    pred = {i: {cluster} for i, cluster in enumerate(predicted_lst)}
    precision = b3.precision(pred, gold)
    recall = b3.recall(pred, gold)
    return recall, precision, b3.fscore(precision, recall)
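A possible invocation of the list-based wrapper above, with made-up toy labels (any hashable label type works); note that it returns recall before precision:

gold = ["animal", "animal", "plant", "plant"]
pred = [0, 0, 0, 1]

recall, precision, f1 = bcubed(gold, pred)
print("P={:.2f} R={:.2f} F1={:.2f}".format(precision, recall, f1))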
def __init__(self, truth, coms):
    self.ground_truth = self.process_input(truth)
    self.communities = self.process_input(coms)
    # FIXME: handle KeyError for nodes not in the ground truth
    try:
        self.precision = bcubed.precision(self.communities, self.ground_truth)
        self.recall = bcubed.recall(self.communities, self.ground_truth)
        self.fscore = bcubed.fscore(self.precision, self.recall)
    except KeyError:
        self.precision = 0
        self.recall = 0
        self.fscore = 0
def evaluateBCubed(goldLabels, results):
    res_map = {}
    gold_map = {}
    for i in range(len(results)):
        res_map[i] = {results[i]}
        gold_map[i] = {goldLabels[i]}
    p = bcubed.precision(res_map, gold_map)
    r = bcubed.recall(res_map, gold_map)
    f = bcubed.fscore(p, r)
    return [p, r, f]
def main():
    """Main method."""
    k = 35

    # write ground truth vocabulary to gt_input.txt and get the ground truth
    # dictionary
    ldict = aggregate_input_and_ground_truths()
    logging.info("Done generating ldict and ground truth text file.")

    # if the file containing clusters hasn't already been created, create it
    if not os.path.isfile("./clusters.txt"):
        preprocess()
        # train word2vec and cluster the output from the full vocabulary
        word2vec.word2clusters("./text8-phrases-extra", "./clusters.txt", k,
                               verbose=True, min_count=1)
        logging.info("Done training.")
        logging.info("Done creating clusters.")

    # load clusters
    clusters = word2vec.load_clusters("./clusters.txt")

    # build the cluster dictionary from the full vocabulary
    cdict = {}
    for i in range(k):
        for word in clusters.get_words_on_cluster(i):
            cdict[word] = {i}
    logging.info("Done generating cdict.")

    # trim the cluster dictionary down to only keys included in the ground truths
    trimmed_cdict = {}
    for key in ldict.keys():
        try:
            trimmed_cdict[key] = cdict[key]
        except KeyError:
            pass
    logging.info("Done trimming cdict; beginning scoring.\n")

    # compute the BCubed scores
    precision = bcubed.precision(trimmed_cdict, ldict)
    recall = bcubed.recall(trimmed_cdict, ldict)
    fscore = bcubed.fscore(precision, recall)
    print("precision: {p}, \t recall: {r}, \t fscore: {f}".format(
        p=precision, r=recall, f=fscore))
    logging.info("Done scoring.\n")
def bcubed(self, x_test, y_test):
    ldict = {}
    cdict = {}
    labels_pred = self.predict(x_test)
    labels_pred = (labels_pred > PROB_THRESHOLD)
    for i, label in enumerate(y_test):
        ldict[i] = {int(label)}
        cdict[i] = {int(labels_pred[i])}
    precision = bcubed.precision(cdict, ldict)
    recall = bcubed.recall(cdict, ldict)
    fscore = bcubed.fscore(precision, recall)
    print('B-cubed metric:\nPrecision = {}\nRecall = {}\nF-score = {}'.format(
        precision, recall, fscore))
def external_eval_clusters(y_true, y_pred):
    """
    :param y_true: true cluster ids
    :param y_pred: predicted cluster ids
    :return: external evaluation metrics of clustering quality. The metrics
        are purity, inverse purity, their harmonic mean, and BCubed precision,
        recall, and their harmonic mean.
    """
    purity = purity_score(y_true, y_pred)
    inverse_purity = purity_score(y_true, y_pred, inv=True)
    f_purity = f_purity_score(y_true, y_pred)
    ldict = {i: {cluster_idx} for i, cluster_idx in enumerate(y_true)}
    cdict = {i: {cluster_idx} for i, cluster_idx in enumerate(y_pred)}
    bcubed_precision = bcubed.precision(cdict, ldict)
    bcubed_recall = bcubed.recall(cdict, ldict)
    bcubed_fscore = bcubed.fscore(bcubed_precision, bcubed_recall)
    return (purity, inverse_purity, f_purity,
            bcubed_precision, bcubed_recall, bcubed_fscore)
def calculate_bcubed():
    with open(str(sys.argv[1])) as predictions, \
            open('GroundTruthClusters.csv') as labels:
        predictions.readline()  # skip the header row
        reader = csv.reader(predictions)
        clustering = {rows[0]: {rows[1]} for rows in reader}
        # print(clustering)
        labels.readline()  # skip the header row
        reader = csv.reader(labels)
        truth = {rows[0]: {rows[1]} for rows in reader}
        precision = bcubed.precision(clustering, truth)
        recall = bcubed.recall(clustering, truth)
        fscore = bcubed.fscore(precision, recall)
        print(precision)
        print(recall)
        print(fscore)
def f_score(community_dict, gt_dict):
    # calculate the BCubed F-score from precision and recall
    precision = bcubed.precision(community_dict, gt_dict)
    recall = bcubed.recall(community_dict, gt_dict)
    fscore = bcubed.fscore(precision, recall)
    return fscore
def evaluate_clustering(base_labels, computed_labels, data=None,
                        metric='euclidean', silent=False):
    """
    Print evaluation metrics for the clustering results

    :param base_labels: labels from a reference clustering
    :param computed_labels: labels assigned by the clustering
    :param data: the data matrix or a list of uuids
    :param metric: metric to use for the silhouette method
    :param silent: flag, if true avoid printing
    :return:
    """

    # Convert label lists to dictionaries for the BCubed library
    base_dict = {k: {v} for k, v in enumerate(base_labels)}
    computed_dict = {k: {v} for k, v in enumerate(computed_labels)}

    num_clusters = len(set(computed_labels)) - (1 if -1 in computed_labels else 0)

    ars = metrics.adjusted_rand_score(base_labels, computed_labels)
    ami = metrics.adjusted_mutual_info_score(base_labels, computed_labels)
    fm = metrics.fowlkes_mallows_score(base_labels, computed_labels)
    h = metrics.homogeneity_score(base_labels, computed_labels)
    c = metrics.completeness_score(base_labels, computed_labels)
    # bcubed expects the candidate clustering first and the gold standard second
    p = bcubed.precision(computed_dict, base_dict)
    r = bcubed.recall(computed_dict, base_dict)
    fs = bcubed.fscore(p, r)
    p_p, p_r, p_q = cluster_metrics(base_labels, computed_labels)

    if not silent:
        print('-' * 80)
        print('Clustering evaluation')
        print('Number of clusters', num_clusters)
        print('Number of distinct families', len(set(base_labels)))
        print('Adjusted Rand index:', ars)
        print('Adjusted Mutual Information:', ami)
        print('Fowlkes-Mallows:', fm)
        print('Homogeneity:', h)
        print('Completeness:', c)
        print('BCubed Precision:', p)
        print('BCubed Recall:', r)
        print('BCubed FScore:', fs)
        print('Paper Precision:', p_p)
        print('Paper Recall:', p_r)
        print('Paper Quality:', p_q)

    if data is not None:
        sh = metrics.silhouette_score(data, computed_labels, metric=metric,
                                      random_state=42)
        if not silent:
            print('Silhouette', sh)
        ret = (ars, ami, fm, h, c, p, r, fs, p_p, p_r, p_q, sh)
    else:
        ret = (ars, ami, fm, h, c, p, r, fs, p_p, p_r, p_q)

    return ret
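Argument order matters in these calls: the bcubed package treats the first dictionary as the candidate clustering (cdict) and the second as the gold standard (ldict), so swapping them exchanges precision and recall. A minimal sketch with made-up items, where the candidate splits the single gold category {a, b, c} in two:

import bcubed

ldict = {"a": {1}, "b": {1}, "c": {1}}  # gold: one category
cdict = {"a": {1}, "b": {1}, "c": {2}}  # candidate: the category is split

print(bcubed.precision(cdict, ldict))  # 1.0: every candidate cluster is pure
print(bcubed.recall(cdict, ldict))     # ~0.56: the gold category was split
print(bcubed.precision(ldict, cdict))  # ~0.56: swapped arguments turn precision into recall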
import bcubed
import b3
import numpy as np

num_cases = 10
num_clusters = np.random.randint(1, 10, (num_cases,))
num_labels = np.random.randint(1, 10, (num_cases,))
num_elements = np.random.randint(1000, 2000, (num_cases,))

for i in range(num_cases):
    L = np.random.randint(1, num_clusters[i] + 1, (num_elements[i],))
    K = np.random.randint(1, num_labels[i] + 1, (num_elements[i],))
    my_f, my_p, my_r = b3.calc_b3(L, K)
    # j indexes elements, avoiding a shadow of the case index i
    Ldict = {j: {L[j]} for j in range(num_elements[i])}
    Cdict = {j: {K[j]} for j in range(num_elements[i])}
    p = bcubed.precision(Cdict, Ldict)
    r = bcubed.recall(Cdict, Ldict)
    f = bcubed.fscore(p, r)
    # check that the two implementations agree
    if (abs(p - my_p) > 0.0001 or abs(r - my_r) > 0.001
            or abs(f - my_f) > 0.0001):
        print("ERROR")
    # (tail of an if branch whose condition is outside this excerpt)
    gold_codings = {
        str(form): str(row["cogid"])
        for form, row in gold_forms.items()}
else:
    gold_dataset = get_dataset(args.gold)
    gold_cognatesets = cognate_sets(gold_dataset, code_column="COGID")
    gold_codings = {
        str(form): code
        for code, forms in gold_cognatesets.items()
        for form in forms}

concept_codes = {}
for concept, id in iterate_concept_and_id():
    gold_c, c = concept_codes.setdefault(concept, ([], []))
    gold_c.append(''.join(str(s) for s in gold_codings.get(id, ())))
    c.append(''.join(str(s) for s in codings.get(id, ())))

v = 0
r = 0
a = 0
b = 0
for concept, (gold_c, c) in concept_codes.items():
    v += metrics.v_measure_score(gold_c, c)
    r += metrics.adjusted_rand_score(gold_c, c)
    a += metrics.adjusted_mutual_info_score(gold_c, c)
    b += bcubed.fscore(bcubed.simple_precision(c, gold_c),
                       bcubed.simple_recall(c, gold_c))

norm = len(concept_codes)
print(args.codings, b / norm, v / norm, r / norm, a / norm)
# print(len(allMentions))
# print(cluster_gold)
# wrap every label in a singleton set, the shape bcubed expects
for key1, value1 in cluster_test.items():
    for key2, value2 in cluster_test[key1].items():
        cluster_test[key1][key2] = {cluster_test[key1][key2]}
for key1, value1 in cluster_gold.items():
    for key2, value2 in cluster_gold[key1].items():
        cluster_gold[key1][key2] = {cluster_gold[key1][key2]}

all_precision = []
all_recall = []
all_fscore = []
for key, value in cluster_test.items():
    precision = bcubed.precision(cluster_test[key], cluster_gold[key])
    recall = bcubed.recall(cluster_test[key], cluster_gold[key])
    fscore = bcubed.fscore(precision, recall)
    print('precision: ' + str(precision))
    all_precision.append(precision)
    print('recall: ' + str(recall))
    all_recall.append(recall)
    print('fscore: ' + str(fscore))
    all_fscore.append(fscore)
    print('')

print('avg b-cubed precision: ' + str(sum(all_precision) / len(all_precision)))
print('avg b-cubed recall: ' + str(sum(all_recall) / len(all_recall)))
print('avg b-cubed fscore: ' + str(sum(all_fscore) / len(all_fscore)))
print('number of negative predictions = ' + str(sum(y_predicted == 0)) +
      ' and positive predictions = ' + str(sum(y_predicted == 1)))

# TODOs
# 1. Try other dimensional word embeddings
def _eval_clustering(self, labels_true, labels_predicted):
    # When COP-KMeans fails to satisfy all constraints at a given k,
    # return a dictionary of Nones to expose in the final output.
    if labels_predicted is None:
        return {"nmi": None,
                "ami": None,
                "ari": None,
                "fms": None,
                "v_measure": None,
                "bcubed_precision": None,
                "bcubed_recall": None,
                "bcubed_fscore": None,
                "Silhouette": None,
                "Calinski_harabasz": None,
                "Davies_Bouldin": None}

    nmi = normalized_mutual_info_score(labels_true, labels_predicted,
                                       average_method="max")
    ami = adjusted_mutual_info_score(labels_true, labels_predicted,
                                     average_method="arithmetic")
    ari = adjusted_rand_score(labels_true, labels_predicted)
    v_measure = v_measure_score(labels_true, labels_predicted, beta=1.0)
    fms = fowlkes_mallows_score(labels_true, labels_predicted)

    # Reshape labels into dictionaries for the BCubed measures
    true_dict = self._reshape_labels_as_dicts(labels_true)
    pred_dict = self._reshape_labels_as_dicts(labels_predicted)

    bcubed_precision = bcubed.precision(cdict=pred_dict, ldict=true_dict)
    bcubed_recall = bcubed.recall(cdict=pred_dict, ldict=true_dict)
    bcubed_f1 = bcubed.fscore(bcubed_precision, bcubed_recall)

    # =====================================================================
    # Unsupervised Metrics
    # =====================================================================
    if labels_predicted.nunique() not in (1, len(self.data)):
        sil = silhouette_score(X=self.data, labels=labels_predicted,
                               metric=self.distance_metric,
                               random_state=13712)
        ch = calinski_harabasz_score(X=self.data, labels=labels_predicted)
        dv = davies_bouldin_score(X=self.data, labels=labels_predicted)
    else:
        sil = None
        ch = None
        dv = None

    return {"nmi": round(nmi, 4),
            "ami": round(ami, 4),
            "ari": round(ari, 4),
            "fms": round(fms, 4),
            "v_measure": round(v_measure, 4),
            "bcubed_precision": round(bcubed_precision, 4),
            "bcubed_recall": round(bcubed_recall, 4),
            "bcubed_fscore": round(bcubed_f1, 4),
            "Silhouette": round(sil, 4) if sil is not None else None,
            "Calinski_harabasz": round(ch, 4) if ch is not None else None,
            "Davies_Bouldin": round(dv, 4) if dv is not None else None}
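The method above depends on a _reshape_labels_as_dicts helper that is not part of this excerpt; a minimal sketch of what such a helper could look like, assuming the labels arrive as any iterable (e.g. a pandas Series):

@staticmethod
def _reshape_labels_as_dicts(labels):
    # item index -> singleton set of its label: the mapping bcubed expects
    return {i: {label} for i, label in enumerate(labels)}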
def evaluate_bcubed(judgments, gold):
    precision = bcubed.precision(judgments, gold)
    recall = bcubed.recall(judgments, gold)
    fscore = bcubed.fscore(precision, recall)
    return precision, recall, fscore
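Every snippet above feeds the same data shape into the library: a dict mapping each item to a set of cluster ids (cdict) and a dict mapping each item to a set of gold labels (ldict). A self-contained sketch with made-up item names, showing why the values are sets rather than scalars: extended BCubed lets one item carry several labels at once.

import bcubed

# clustering: item -> set of cluster ids; gold standard: item -> set of labels
cdict = {"item1": {"A"}, "item2": {"A", "B"}, "item3": {"B"}}
ldict = {"item1": {"x"}, "item2": {"x", "y"}, "item3": {"y"}}

precision = bcubed.precision(cdict, ldict)
recall = bcubed.recall(cdict, ldict)
print("P={:.3f} R={:.3f} F1={:.3f}".format(
    precision, recall, bcubed.fscore(precision, recall)))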