def analyze_results(predictions):
    # recalls = {key: value for key, value in recalls.items() if key[2] == 'cosine'}
    # precisions = {key: value for key, value in precisions.items() if key[2] == 'cosine'}
    recalls, precisions, fscores = {}, {}, {}
    for key in predictions:
        # if key[3] != ('lemma',):
        #     continue
        r, p, f1 = bcubed(predictions[GOLD], predictions[key])
        recalls[key] = r
        precisions[key] = p
        fscores[key] = f1
        print key, r, p, f1

    # map each score dict to a human-readable name for the report below
    list_names = {
        hash_d(recalls): 'Best Recall Score',
        hash_d(precisions): 'Best Precision Score',
        hash_d(fscores): 'Best F1 Score'
    }
    for d in (recalls, precisions, fscores):
        di = copy.deepcopy(d)
        print '\nTop 25 param sets for %s:' % list_names[hash_d(d)]
        for i in range(25):
            # repeatedly pop the current best parameter set
            params = key_with_max_value(di)
            print 'Number %d: Params = %s Scores = %0.5f %0.5f %0.5f' \
                % (i + 1, str(params), recalls[params], precisions[params], fscores[params])
            del di[params]
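# analyze_results relies on two helpers, hash_d and key_with_max_value, that are
# defined elsewhere in the repo. The sketch below shows the behavior this usage
# assumes; these implementations are illustrative guesses, not the project's code.
def hash_d(d):
    # Assumed: return a hashable identifier for a dict object so the score dicts
    # can be used as keys of list_names. id() is stable within a single run.
    return id(d)


def key_with_max_value(d):
    # Assumed: return the key whose (float) value is largest.
    return max(d, key=d.get)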
def analyze_singleton_accuracy(gold, predictions):
    assert len(gold) == len(predictions)

    ### comparing singleton detection ###
    gold_sing_idxs = set()
    pred_sing_idxs = set()
    for lst in [gold, predictions]:
        sing_idxs = set()
        nonsing_idxs = set()
        for i in xrange(len(lst)):
            for j in xrange(len(lst)):
                if i != j and lst[i] == lst[j]:  # shares a label, so not a singleton
                    nonsing_idxs.add(i)
                    nonsing_idxs.add(j)
            if i not in nonsing_idxs:
                sing_idxs.add(i)
        # gold is processed first, so the first pass fills gold_sing_idxs
        if len(gold_sing_idxs) == 0:
            gold_sing_idxs = sing_idxs.copy()
        else:
            pred_sing_idxs = sing_idxs.copy()

    print "PERCENT OF SINGLETONS: %0.3f" % (len(gold_sing_idxs) / float(len(gold)))

    gold_sing = [1 if i in gold_sing_idxs else 0 for i in xrange(len(gold))]
    pred_sing = [1 if i in pred_sing_idxs else 0 for i in xrange(len(predictions))]
    print 'Accuracy with respect to the correct identification of mentions as singletons:'
    print 'R: %0.4f, P: %0.4f, F1: %0.4f' % (
        recall_score(gold_sing, pred_sing),
        precision_score(gold_sing, pred_sing),
        f1_score(gold_sing, pred_sing))

    gold_non_sing = [0 if i in gold_sing_idxs else 1 for i in xrange(len(gold))]
    pred_non_sing = [0 if i in pred_sing_idxs else 1 for i in xrange(len(predictions))]
    print 'Accuracy with respect to the correct identification of mentions as NOT being singletons:'
    print 'R: %0.4f, P: %0.4f, F1: %0.4f' % (
        recall_score(gold_non_sing, pred_non_sing),
        precision_score(gold_non_sing, pred_non_sing),
        f1_score(gold_non_sing, pred_non_sing))

    # make coref chains without the gold singletons
    non_sing_gold = []
    non_sing_pred = []
    for i, val in enumerate(gold_non_sing):
        if val == 1:
            non_sing_gold.append(gold[i])
            non_sing_pred.append(predictions[i])
    results = bcubed(non_sing_gold, non_sing_pred)
    print 'B3 results obtained after removing all GOLD singletons:'
    print results
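# bcubed() is used throughout this file and returns a (recall, precision, f1)
# tuple computed over two parallel lists of cluster labels. The project's own
# implementation lives elsewhere; the function below is a reference sketch of the
# standard B-cubed metric, not the repo's actual code.
def bcubed_sketch(gold, pred):
    n = len(gold)
    assert n == len(pred)
    recall = precision = 0.0
    for i in xrange(n):
        # mentions sharing i's gold label and mentions sharing i's predicted label
        gold_cluster = set(j for j in xrange(n) if gold[j] == gold[i])
        pred_cluster = set(j for j in xrange(n) if pred[j] == pred[i])
        overlap = float(len(gold_cluster & pred_cluster))
        recall += overlap / len(gold_cluster)
        precision += overlap / len(pred_cluster)
    recall /= n
    precision /= n
    f1 = 2 * recall * precision / (recall + precision) if (recall + precision) else 0.0
    return recall, precision, f1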
def neural_predict(self, x, y, threshold_range, metric='cosine',
                   link_function='single', rand_score=False, train_data=None,
                   delta_filter=False, lemma_init=False, lemma_predictor=None):
    c = AgglomerativeClusterer(x, distance_metric=metric, train_data=train_data)
    best_score = (0., 0., 0.)
    best_thresh = 0.
    best_delta = 0
    best_clusters = None
    all_scores = {}

    if not isinstance(threshold_range, float):
        # sweep a range of clustering thresholds
        for threshold in np.linspace(0.65, 1.0, threshold_range)[1:-1]:
            clust_idxs = c.cluster(threshold, linktype=link_function)
            clusters = np.zeros(len(x))
            for i, cluster in enumerate(clust_idxs):
                clusters[cluster] = i
            rpf1 = bcubed(y, clusters)  # recall, precision, f1
            all_scores[threshold] = rpf1

            # delta filtering for each threshold, sloooowww
            if delta_filter:
                print "delta filter for threshold", threshold
                print "result with no delta", rpf1
                for delta in np.linspace(0, 1, 101):
                    _, new_clusters = self.delta_filter(clusters, delta)
                    drpf1 = bcubed(y, new_clusters)
                    print delta, drpf1
                    if drpf1[2] > best_score[2]:
                        best_score = drpf1
                        best_thresh = threshold
                        best_delta = delta
                print "best delta & thresh so far", best_delta, best_thresh
                print "best result so far", best_score
    else:
        if not lemma_init:
            # single fixed threshold
            clust_idxs = c.cluster(threshold_range, linktype=link_function)
            clusters = np.zeros(len(x))
            for i, cluster in enumerate(clust_idxs):
                clusters[cluster] = i
            best_score = bcubed(y, clusters)
            best_clusters = clusters

        # use lemma initialization and tune to a certain value for delta!
        if lemma_init:
            print 'Doing lemma initialization tests!'
            best_score = 0
            best_params = None
            for delta in np.linspace(0.5, 1, 11):
                tmp, lemma_preds = lemma_predictor.predict(build_test_comparison=False, delta=delta)
                for thresh in np.linspace(0.6, 1, 21):
                    # np.linspace(0.5, 1, 26) here to also optimize the minimum keeping threshold
                    for mkt in np.linspace(0, 0, 1):
                        clust_idxs = c.cluster(thresh, linktype=link_function,
                                               preset_predictions=lemma_preds,
                                               minimum_keeping_threshold=mkt)
                        clusters = np.zeros(len(x))
                        for i, cluster in enumerate(clust_idxs):
                            clusters[cluster] = i
                        score = bcubed(y, clusters)  # recall, precision, f1
                        print delta, thresh, mkt, score
                        if score[2] > best_score:
                            best_score = score[2]
                            best_params = (thresh, delta, mkt)
            print 'Best score and best params:'
            print best_score, ': with - d=%0.2f, t=%0.2f' % (best_params[1], best_params[0])
            print 'with minimum keeping threshold: %0.2f' % best_params[2]
            best_thresh = best_params  # return best_thresh as a tuple

    # delta filtering on the single best clustering
    # (only valid when best_clusters was set in the fixed-threshold branch above)
    if delta_filter and best_clusters is not None:
        new_best_score = best_score
        best_delta = 0
        for delta in np.linspace(0, 1, 101):
            _, new_clusters = self.delta_filter(best_clusters, delta)
            rpf1 = bcubed(y, new_clusters)
            print delta, rpf1
            if rpf1[2] > new_best_score[2]:
                new_best_score = rpf1
                best_delta = delta
        print "BEST DELTA ", best_delta
        print "NEW BEST SCORE ", new_best_score
        # print "OLD BEST SCORE ", best_score

    return best_score, best_thresh, all_scores
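# neural_predict assumes an AgglomerativeClusterer whose cluster() method returns a
# list of index arrays, one array per predicted cluster. A rough sketch of that
# interface on top of scipy's hierarchical clustering is shown below; the real class
# is defined elsewhere in the repo, and the preset_predictions / minimum_keeping_threshold /
# train_data options are omitted because their semantics are not visible in this file.
import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import pdist


class AgglomerativeClustererSketch(object):
    def __init__(self, x, distance_metric='cosine', train_data=None):
        self.x = np.asarray(x)
        self.metric = distance_metric

    def cluster(self, threshold, linktype='single'):
        # Build the linkage tree, cut it at the given distance threshold,
        # and group the row indices of x by their flat-cluster label.
        tree = linkage(pdist(self.x, metric=self.metric), method=linktype)
        labels = fcluster(tree, t=threshold, criterion='distance')
        return [np.where(labels == lab)[0] for lab in np.unique(labels)]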
# cd = load(data_set='all')
# pred = BaselineLemmaPredictor(cd.gold_mention_clusters,
#                               cd.get_all_tokens(topics=helpers.TEST),
#                               topics=helpers.TEST,
#                               events_only=True,
#                               data_set='test',
#                               with_topics=False)
# predicted = pred.predict()
# pred.save_predictions_mention_based(predicted)

cd = load(events_only=False)
gold = None
for met in ['cosine']:
    for thresh in [0.7]:
        pred = ClusteringPredictor(cd.gold_mention_clusters,
                                   cd.get_all_tokens(topics=helpers.VAL),
                                   events_only=True,
                                   data_set='val',
                                   topics=helpers.VAL)
        print 'Predicting...'
        g, predicted = pred.predict(threshold=thresh,
                                    metric=met,
                                    link_function='single',
                                    build_test_comparison=gold is None,
                                    split_into_topics=False)
        if gold is None:
            gold = g
        print thresh, ':', bcubed(gold, predicted)
        # pred.save_predictions_mention_based(predicted, gold_list=gold)
        print
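# A small variant of the loop above that records B-cubed scores for several
# thresholds instead of the single hard-coded one. This is a sketch that just
# restates the loop as a function; the name and candidate values are mine,
# and ClusteringPredictor is used exactly as it is above.
def sweep_clustering_thresholds(cd, thresholds, metric='cosine'):
    gold, scores = None, {}
    for thresh in thresholds:
        pred = ClusteringPredictor(cd.gold_mention_clusters,
                                   cd.get_all_tokens(topics=helpers.VAL),
                                   events_only=True,
                                   data_set='val',
                                   topics=helpers.VAL)
        g, predicted = pred.predict(threshold=thresh,
                                    metric=metric,
                                    link_function='single',
                                    build_test_comparison=gold is None,
                                    split_into_topics=False)
        gold = g if gold is None else gold
        scores[thresh] = bcubed(gold, predicted)  # (recall, precision, f1)
    return scores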
test_set = helpers.VAL
split_into_topics = False
train_pred, x_train, y_train, train_mentions, val_pred, x_val, y_val, val_mentions = \
    initialize_data_sets(events_only, test_set, split_into_topics,
                         cluster_singletons=PUT_SINGLETONS,
                         remove_train_singletons=REMOVE_TRAIN_SINGLETONS)

if LEMMA_PRED:
    # start with the validation set: sweep delta and keep the best B-cubed F1
    lpred = get_lemma_predictor(helpers.VAL)
    test_comp = None
    best_score = 0
    best_delta = 0
    for delta in np.linspace(0, 1, 101):
        tmp, preds = lpred.predict(build_test_comparison=test_comp is None, delta=delta)
        test_comp = tmp if test_comp is None else test_comp
        score = bcubed(test_comp, preds)[2]  # bcubed returns (r, p, f1)
        if score > best_score:
            best_score = score
            best_delta = delta
            print 'Delta %0.2f gets us %0.5f accuracy!' % (delta, score)
    # exit(0)  # debugging early-exit; disabled so the tuned delta is applied to the test set below

    # probably can be optimized, but we just need the tf-idf model to be built
    initialize_data_sets(events_only, helpers.TEST, split_into_topics,
                         cluster_singletons=PUT_SINGLETONS,
                         remove_train_singletons=REMOVE_TRAIN_SINGLETONS)
    ltestpred = get_lemma_predictor(helpers.TEST)
    gold, preds = ltestpred.predict(delta=best_delta)
    analyze_singleton_accuracy(gold, preds)
    ltestpred.save_predictions_mention_based(preds, 'HEAD_LEMMA_DELTA')
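# The validation delta sweep in the LEMMA_PRED block could be factored into a small
# reusable helper; the sketch below just restates that loop (the function name and
# signature are mine, not part of the repo).
def tune_lemma_delta(lemma_predictor, deltas=np.linspace(0, 1, 101)):
    # Sweep the lemma-merging delta and keep the value with the best B-cubed F1.
    # The gold comparison list only needs to be built on the first call.
    test_comp = None
    best_score, best_delta = 0.0, 0.0
    for delta in deltas:
        tmp, preds = lemma_predictor.predict(build_test_comparison=test_comp is None,
                                             delta=delta)
        test_comp = tmp if test_comp is None else test_comp
        score = bcubed(test_comp, preds)[2]  # (recall, precision, f1)
        if score > best_score:
            best_score, best_delta = score, delta
    return best_delta, best_score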