def evaluate(predicted_boundaries, doc_name):
    """Score predicted topic boundaries for one document with WindowDiff,
    record the score, and append it to a timestamped log file.

    NOTE(review): relies on module-level globals `gold_boundaries_set`,
    `evaluations`, `log_dir` and `timestamp` — confirm they are populated
    before this is called. Python 2 syntax (print statement).
    """
    # k=3 window; boundary positions are marked with the string "BREAK".
    evaluation = windowdiff(predicted_boundaries, gold_boundaries_set[doc_name], 3, boundary="BREAK", weighted=False)
    print "Window Diff Score:\t %f\n" % evaluation
    # Cache the score for later aggregation across documents.
    evaluations[doc_name] = evaluation
    # Append "<doc>\t\t\t<score>" to the run's log file.
    with open(log_dir + timestamp, 'a') as log:
        log.write("\n" + doc_name + "\t"*3 + str(evaluation))
def make_distances(matches):
    """Return a copy of *matches* extended with one column per distance
    metric (geometric, edit, Hamming, WindowDiff, Pk) comparing the manual
    and automatic segment spans row by row."""

    def _spans(row):
        # (manual span, automatic span) as [start, end] pairs.
        return [row[MAN_START], row[MAN_END]], [row[AUTO_START], row[AUTO_END]]

    def _geometric(row):
        man, auto = _spans(row)
        return sd.euclidean(man, auto)

    def _edit(row):
        # Edit distance on the raw segment texts, not the spans.
        return distance.edit_distance(row[MAN_TEXT], row[AUTO_TEXT])

    def _hamming(row):
        man, auto = _spans(row)
        return segmentation.ghd(*to_segmentation_metric_form(man, auto))

    def _window_diff(row):
        man, auto = _spans(row)
        # windowdiff k will be 1/2 average segment length
        return segmentation.windowdiff(
            *to_segmentation_metric_form(man, auto, return_window_size=True))

    def _pk(row):
        man, auto = _spans(row)
        return segmentation.pk(*to_segmentation_metric_form(man, auto))

    df = matches.copy()
    df[DISTANCE_GEOMETRIC] = df.apply(_geometric, axis=1)
    df[DISTANCE_EDIT] = df.apply(_edit, axis=1)
    df[DISTANCE_HAMMING] = df.apply(_hamming, axis=1)
    df[DISTANCE_Windowdiff] = df.apply(_window_diff, axis=1)
    df[DISTANCE_PK] = df.apply(_pk, axis=1)
    return df
def get_standard_metrics(gt, pred, msn):
    """Compute Pk and WindowDiff between a gold and a predicted segmentation.

    gt, pred: collections of boundary indexes (membership-tested per position).
    msn: total number of positions; positions are encoded as a '0'/'1' string.
    Returns (pk, windowdiff).

    Raises ValueError when `gt` marks no boundary at all (the window size
    would be undefined).
    """
    gt_segs = ''.join('1' if i in gt else '0' for i in range(msn))
    pred_segs = ''.join('1' if i in pred else '0' for i in range(msn))
    n_boundaries = gt_segs.count('1')
    # BUG FIX: previously divided by zero when the gold had no boundaries.
    if n_boundaries == 0:
        raise ValueError("gold segmentation has no boundaries; "
                         "cannot derive a window size")
    # Half the average gold segment length, then scaled down by 4.
    k_val = int(round(len(gt_segs) / (n_boundaries * 2.0)))
    # BUG FIX: integer division could produce k=0, which is invalid for
    # pk/windowdiff — clamp to at least 1.
    k_val = max(1, k_val // 4)
    return seg.pk(gt_segs, pred_segs, k=k_val), seg.windowdiff(gt_segs, pred_segs, k=k_val)
def compute_segmentation_scores(reference, results, k):
    """Compute WindowDiff, Beeferman's Pk and Generalized Hamming Distance.

    WindowDiff and GHD are normalised by the reference length; Pk uses the
    metric's own default window ("T" marks a boundary in all three).
    Returns (window_diff, pk, ghd).
    """
    n = len(reference)
    wd_raw = windowdiff(reference, results, k, boundary="T")
    window_diff = float(wd_raw) / n
    beeferman_pk = pk(reference, results, boundary="T")
    hamming = ghd(reference, results, boundary="T") / n
    return window_diff, beeferman_pk, hamming
def get_seg_scores(y_test, pred_class, evalk=3):
    """Compute Pk and WindowDiff (window size `evalk`) between gold labels
    and predictions, after encoding each sequence as a '0'/'1' string.

    Returns (pk, windowdiff). Python 2 syntax (print statements).
    """
    # Encode boundary indicators as one character per position.
    targetstr = "".join([str(int(x)) for x in y_test])
    predstr = "".join([str(int(x)) for x in pred_class])
    # Log a short prefix of each string for visual inspection.
    logger.debug(targetstr[0:50])
    logger.debug(predstr[0:50])
    wd = windowdiff(targetstr, predstr, k=evalk)
    pkval = pk(targetstr, predstr, k=evalk)
    print "PK: %f" % pkval
    print "WD: %f" % wd
    return pkval, wd
def get_pk_wd(x, k=3, conv=None):
    """Compute Pk and WindowDiff for one conversation's rows.

    x: DataFrame with "target", "pred" (0/1 indicators) and "conv" columns.
    k: window size; both sequences are padded with k non-boundary positions
       on each side so windows at the edges are well-defined.
    conv: conversation id; defaults to the id of the first row.

    Returns a one-row DataFrame with columns "conv", "PK", "WD".
    """
    # BUG FIX: was `conv == None`; identity comparison is the correct idiom
    # (and avoids surprises with objects overriding __eq__).
    if conv is None:
        conv = x["conv"].iloc[0]
    # Pad with k zeros on both ends before scoring.
    targets = "0" * k + "".join(x["target"].astype(str)) + "0" * k
    preds = "0" * k + "".join(x["pred"].astype(str)) + "0" * k
    wd = windowdiff(targets, preds, k=k)
    pkval = pk(targets, preds, k=k)
    return pd.DataFrame({"conv": conv, "PK": pkval, "WD": wd}, index=[conv])
def score(predicts, labels, windowsize=2, type=1):
    '''
    sample_num * conversation_length list of numpy arrays
    :param predicts:
    :param masks:
    :param labels:
    :param type: 1 -- origianl dataset, the windowsize is appropriate;
                 0 -- augmented datset, the windowsize may be wrong
    :return: windowdiff pk F1-macro
    '''
    n = len(predicts)
    total_acc = 0
    total_f1_macro = 0
    total_f1_micro = 0  # computed for symmetry but not returned (as before)
    total_wd = 0
    total_pk = 0
    for idx in range(n):
        pred_seq = predicts[idx]
        label_seq = labels[idx]
        pred_str = ''.join(str(v) for v in list(pred_seq))
        label_str = ''.join(str(v) for v in list(label_seq))
        total_acc += np.sum(np.equal(pred_seq, label_seq)) / len(pred_seq)
        total_f1_macro += f1_score(label_seq, pred_seq, average='macro')
        total_f1_micro += f1_score(label_seq, pred_seq, average='micro')
        if type:
            # Segmentation metrics only when the window size is trustworthy.
            total_wd += windowdiff(label_str, pred_str, windowsize)
            total_pk += pk(label_str, pred_str, windowsize)
    total_acc = total_acc / n
    total_f1_macro = total_f1_macro / n
    total_f1_micro = total_f1_micro / n
    if type:
        total_wd = total_wd / n
        total_pk = total_pk / n
    return {
        "windowdiff": total_wd,
        "pk": total_pk,
        "F1-macro": total_f1_macro,
        "acc": total_acc
    }
def evaluate(gold_idx, pred_idx, k):
    """
    gold_idx: golden standard of segmentation: list of lists of indexes
    pred_idx: predicted segmentation of the text: list of lists of indexes
    k: window size (preferrably half of the document length divided by
       the number of gold segments)
    return: pk (Beeferman D., Berger A., Lafferty J. (1999)) and
            windowdiff (Pevzner, L., and Hearst, M (2002)) metrics for the
            prediction (less the better)
    """

    def _to_boundary_string(segments):
        # Each segment contributes "0" per position, with its last position
        # flipped to "1" — i.e. boundaries sit at segment ends.
        marks = []
        for segment in segments:
            chunk = ["0"] * len(segment)
            chunk[-1] = "1"
            marks.extend(chunk)
        return "".join(marks)

    gold = _to_boundary_string(gold_idx)
    pred = _to_boundary_string(pred_idx)
    return {'pk': pk(gold, pred, k), 'windowdiff': windowdiff(gold, pred, k)}
def evaluate_text_tiling(data):
    """Run TextTiling over every lecture in *data* and return the mean
    WindowDiff score against the ground-truth boundaries.

    data: (texts, boundary_label_sequences) pair.
    """
    text_tiler = TextTiling()
    texts = data[0]
    gold_labels = data[1]
    window_diffs = []
    for idx, lecture_text in enumerate(texts):
        predicted = text_tiler.segment_text(lecture_text)
        pred_str = ''.join(str(b) for b in predicted)
        gold_str = ''.join(str(b) for b in gold_labels[idx])
        # k ≈ half the average gold segment length; the +1 guards against
        # division by zero when there are no gold boundaries.
        k = int(len(gold_str) / float(2.0 * gold_str.count('1') + 1.0))
        # NOTE(review): prediction is passed first, reference second —
        # confirm this matches the windowdiff implementation's convention.
        window_diffs.append(windowdiff(pred_str, gold_str, k))
    avg_window_diff_score = np.mean(np.array(window_diffs))
    print("Average window diff score:", avg_window_diff_score)
    return avg_window_diff_score
def evaluate_segmentation(bc3=False, limit=0):
    """Evaluate Wapiti segmentation output against the gold standard.

    Returns (accuracy, precision, recall, F1, WindowDiff, Pk, GHD,
    #gold boundaries, #predicted boundaries); all metrics are percentages.
    """
    g = data_to_string(WAPITI_GOLD_FILE, limit=limit)    # gold string
    r = data_to_string(WAPITI_RESULT_FILE, limit=limit)  # result string
    if bc3:
        # text tiling baseline string (currently unused below)
        t = data_to_string(BC3_TEXT_TILING_FILE, limit=limit, label_position=0)
    else:
        t = data_to_string(WAPITI_GOLD_FILE, limit=limit, label_position=-2)
    avg = float(len(g)) / (g.count("T") + 1)  # average segment size
    k = int(avg / 2)                          # window size for WindowDiff
    # Uniform baseline: a boundary every floor(avg) positions (unused below).
    b = ("T" + (int(math.floor(avg)) - 1) * ".") * int(
        math.ceil(float(len(g)) / int(math.floor(avg))))
    b = b[:len(g)]  # baseline string
    print(g[:150])
    print(r[:150])
    # WindowDiff (normalised by reference length)
    wdi = (float(windowdiff(g, r, k, boundary="T")) / len(g)) * 100
    # Beeferman's Pk
    bpk = (pk(g, r, boundary="T")) * 100
    # Generalized Hamming Distance (normalised by reference length)
    ghd = (GHD(g, r, boundary="T") / len(g)) * 100
    # accuracy
    acc = accuracy(list(g), list(r)) * 100
    # precision, recall, f-measure — labels are "T"/".", so the positive
    # class must be named explicitly (consistent with the sibling variant
    # of this function elsewhere in the file).
    pre = metrics.precision_score(list(g), list(r), pos_label="T") * 100
    rec = metrics.recall_score(list(g), list(r), pos_label="T") * 100
    # BUG FIX: F1 was computed from undefined names rec_rs/pre_rs, which
    # raised NameError; also guard against precision+recall == 0.
    f_1 = (2.0 * (rec * pre)) / (rec + pre) if (rec + pre) else 0.0
    return acc, pre, rec, f_1, wdi, bpk, ghd, g.count("T"), r.count("T")
def evaluate_segmentation(bc3=False, limit=0):
    """Evaluate Wapiti segmentation output against the gold standard.

    Returns (accuracy, precision, recall, F1, WindowDiff, Pk, GHD,
    #gold boundaries, #predicted boundaries); all metrics are percentages.
    """
    g = data_to_string(WAPITI_GOLD_FILE, limit=limit)    # gold string
    r = data_to_string(WAPITI_RESULT_FILE, limit=limit)  # result string
    if bc3:
        # text tiling baseline string (currently unused below)
        t = data_to_string(BC3_TEXT_TILING_FILE, limit=limit, label_position=0)
    else:
        t = data_to_string(WAPITI_GOLD_FILE, limit=limit, label_position=-2)
    avg = float(len(g)) / (g.count("T") + 1)  # average segment size
    k = int(avg / 2)                          # window size for WindowDiff
    # Uniform baseline: a boundary every floor(avg) positions (unused below).
    b = ("T" + (int(math.floor(avg)) - 1) * ".") * int(
        math.ceil(float(len(g)) / int(math.floor(avg))))
    b = b[:len(g)]  # baseline string
    print(g[:150])
    print(r[:150])
    # WindowDiff (normalised by reference length)
    wdi = (float(windowdiff(g, r, k, boundary="T")) / len(g)) * 100
    # Beeferman's Pk
    bpk = (pk(g, r, boundary="T")) * 100
    # Generalized Hamming Distance (normalised by reference length)
    ghd = (GHD(g, r, boundary="T") / len(g)) * 100
    # accuracy
    acc = accuracy(list(g), list(r)) * 100
    # precision, recall, f-measure — labels are "T"/".", so the positive
    # class must be named explicitly (consistent with the sibling variant
    # of this function elsewhere in the file).
    pre = metrics.precision_score(list(g), list(r), pos_label="T") * 100
    rec = metrics.recall_score(list(g), list(r), pos_label="T") * 100
    # BUG FIX: F1 was computed from undefined names rec_rs/pre_rs, which
    # raised NameError; also guard against precision+recall == 0.
    f_1 = (2.0 * (rec * pre)) / (rec + pre) if (rec + pre) else 0.0
    return acc, pre, rec, f_1, wdi, bpk, ghd, g.count("T"), r.count("T")
# NOTE(review): fragment of a larger if/elif over `seg_method`; the matching
# `if` branch and the enclosing loop/scope are outside this view. Indentation
# below is reconstructed from the collapsed source — verify nesting of the
# `if mode == ...` section against the original file. Python 2 syntax.
elif type(seg_method) is int:
    # Boundary threshold: half of (mean - std) of the non-zero depth scores.
    threshold = (np.mean(non_zero_depth_scores) - np.std(non_zero_depth_scores)) / 2
    for depth_score in depth_scores:
        if depth_score > threshold and depth_score != 0.0:
            predicted_boundaries.append("BREAK")
            boundary_count += 1
        else:
            predicted_boundaries.append(None)
    # There is one more boundary than there are depth scores
    predicted_boundaries.append(None)
    reverse_parse(doc_name, predicted_boundaries)
    test_index += 1
    # print "%i boundaries predicted, %i gold boundaries." % (boundary_count, gold_count)
    if mode == 'evaluate':
        # Score against gold boundaries with WindowDiff (k=3, "BREAK" marker)
        # and append the result to the timestamped log.
        evaluation = windowdiff(predicted_boundaries, gold_boundaries_set[doc_name], 3, boundary="BREAK", weighted=False)
        print "Window Diff Score:\t %f\n" % evaluation
        evaluations[doc_name] = evaluation
        with open(log_dir + timestamp, 'a') as log:
            log.write(doc_name + "\t"*3 + str(evaluation))
    elif mode == 'resolve':
        # Map predicted boundaries back to utterance start times and store
        # them under the original audio file name.
        print "Resolving topic timings..."
        utt_start_times = get_utt_timing(doc_name)
        topic_start_times = [utt_start_times[0]]  # Initialise with start time of first utterance
        for index in xrange(len(predicted_boundaries)):
            if predicted_boundaries[index] is not None:
                topic_start_times.append(utt_start_times[index])
        prg_name = doc_name.replace("_parsed","")
        prg_name = prg_name.replace(".txt",".wav")
        topic_timings[prg_name] = topic_start_times
# NOTE(review): script fragment; `d`, `sentences`, `real_segs`, `difs`, `W`
# and `plt` are defined earlier in the file (likely inside a per-document
# loop, since `difs` accumulates scores).
predicted_seg="";
for row in d:
    # NOTE(review): comparing to .1 (i.e. 0.1) looks suspicious — a boundary
    # flag would more plausibly be 1 or a threshold; confirm what column 3
    # of `d` actually holds.
    if (row[3] == .1):
        predicted_seg += "1";
        print( "<---------------------TOPIC CHANGE HERE----------------------->");
    else:
        predicted_seg+="0"
    # Echo the sentence indexed by column 0 of the row.
    print(sentences[int(row[0])])
s1 = real_segs.strip();
s2 = predicted_seg.strip();
print(s1);
print(s2);
# Accumulate this document's WindowDiff (window size W).
difs.append(windowdiff(s1, s2, W))
print(difs)
# Mean over all documents; NaN when no scores were collected.
avg = float(sum(difs))/len(difs) if len(difs) > 0 else float('nan')
print("AVERAGE WINDOW: " + str(avg));
plt.show()
#x(block1_lda[0].__class__.__name__);
#for sentence in sentences:
#    doc_lda = lda[reviewc.dictionary.doc2bow(reviewc.proc(sentence))];
#    print(doc_lda)
#    print (sentence);
#    print "\n--------\n"
# Script section: compare a Bayesian segmenter's boundaries against a
# hand-made "perfect" segmentation of a podcast transcript, then extract
# topic ranges. Relies on project helpers `bound2seg`, `TopicExtractor`,
# `get_topic_ranges`, `get_geek_bounds` defined elsewhere.
bayes_boundaries = [
    1, 72, 102, 103, 104, 105, 130, 131, 144, 158, 234, 235, 248
]
perfect_boundaries = [
    4, 21, 30, 49, 72, 104, 127, 131, 146, 169, 220, 225, 237
]
# bayes_boundaries = [13, 14, 15, 16, 21, 69, 106, 170, 171, 172, 222, 233, 248]
# Highest boundary index across both lists fixes the sequence length.
max_i = max(sum([bayes_boundaries, perfect_boundaries], []))
bayes_seg = bound2seg(bayes_boundaries, max_i)
perfect_seg = bound2seg(perfect_boundaries, max_i)
k = int(max_i / (2 * (len(perfect_boundaries))))  # halved avg segment size
print("wd bayes: ", windowdiff(bayes_seg, perfect_seg, k=k))
# Load the first max_i transcript rows and pull topic ranges from them.
tdf = pd.read_pickle(
    "../processed_transcripts/joe_rogan_elon_musk.pkl")[0:max_i]
te = TopicExtractor()
topic_ranges = get_topic_ranges(tdf)
# Flatten {topic: [ranges]} into (topic, range) tuples, widest range first.
tr_tuples = [(topic, tr) for topic, trs in topic_ranges.items() for tr in trs]
tr_tuples = sorted(tr_tuples, key=lambda x: x[1][1] - x[1][0], reverse=True)
geek_bounds = get_geek_bounds(tr_tuples)
#print i, sline, prevdoc, currdoc if not prevdoc == None: doclens.append(currlen) prevdoc = currdoc currlen = 1 else: currlen += 1 #if i > 400: # break #print targets #print preds #print doclens logger.debug("ndocs: %d" % len(doclens)) evalk = int(round(numpy.average(doclens)/2)) logger.debug("evalk %f: " % evalk) wd = windowdiff(targets, preds, k=evalk) #logger.debug("WD: %f" % wd) pkval = pk(targets, preds, k=evalk) #logger.debug("PK: %f" % pkval) fstem = os.path.basename(options.input) with open(options.outfile, "w") as f: f.write(fstem + "\tPK\t" + str(pkval) + "\n") f.write(fstem + "\tWD\t" + str(wd) + "\n") print "PK: %f" % pkval print "WD: %f" % wd
def evaluate_segmentation(bc3=False, limit=-1):
    """Evaluate Wapiti segmentation output against the gold standard,
    alongside a uniform baseline (`b`) and a TextTiling baseline (`t`).

    Returns a 25-tuple: accuracy, precision, recall and F1 for
    result/baseline/texttiling, then WindowDiff, Pk and GHD for each,
    then boundary counts for gold, baseline, result and texttiling.
    """
    d = "".join(data_to_list(WAPITI_TRAIN_FILE))  # training data
    g = "".join(data_to_list(WAPITI_GOLD_FILE, limit=limit))  # gold string
    temp_r = data_to_list(WAPITI_RESULT_FILE, limit=limit)  # result string
    # n = data_to_list("var/union/ngrams_" + WAPITI_RESULT_FILE[-1], limit=limit)
    # scores = {}
    r = ""
    # Each result item is "<label>/<score>"; keep only the label part.
    for i, col in enumerate(temp_r):
        # score = 0
        # if n[i][:n[i].index("/")] == "T":
        #     score = 1
        # elif col[:col.index("/")] == "T":
        #     score = float(col[col.index("/") + 1:])
        # scores[i] =
        r += col[:col.index("/")]
    # sorted_indexes = sorted(scores, key=scores.get, reverse=True)
    # indexes = [index for index, score in scores.iteritems() if score > 0.99]
    # r = "." * len(g)
    # n_boundaries = int((float(g.count("T")) / len(g)) * len(g))
    # for i, index in enumerate(sorted_indexes):
    #     r = r[:index] + "T" + r[index + 1:]
    #     if i == n_boundaries:
    #         break
    # for index in indexes:
    #     r = r[:index] + "T" + r[index+1:]
    if bc3:
        # text tiling baseline string
        # NOTE(review): data_to_list returns a list, not a joined string —
        # confirm downstream metrics accept a list here.
        t = data_to_list(BC3_TEXT_TILING_FILE, limit=limit, label_position=0)
    else:
        t = data_to_list(WAPITI_GOLD_FILE, limit=limit, label_position=-2)
    avg_g = float(len(g)) / (g.count("T") + 1)  # average segment size (reference)
    avg_d = float(len(d)) / (d.count("T") + 1)  # average segment size (training)
    k = int(avg_g / 2)  # window size for WindowDiff
    # Uniform baseline: a boundary every floor(avg_d) positions, trimmed
    # to the gold string's length.
    b = ("T" + (int(math.floor(avg_d)) - 1) * ".") * int(math.ceil(float(len(d)) / int(math.floor(avg_d))))
    b = b[:len(g)]  # baseline string
    # WindowDiff (normalised by reference length, as percentages)
    wdi_rs = (float(windowdiff(g, r, k, boundary="T")) / len(g)) * 100
    wdi_bl = (float(windowdiff(g, b, k, boundary="T")) / len(g)) * 100
    wdi_tt = (float(windowdiff(g, t, k, boundary="T")) / len(g)) * 100
    # Beeferman's Pk
    bpk_rs = (pk(g, r, boundary="T")) * 100
    bpk_bl = (pk(g, b, boundary="T")) * 100
    bpk_tt = (pk(g, t, boundary="T")) * 100
    # Generalized Hamming Distance
    ghd_rs = (ghd(g, r, boundary="T") / len(g)) * 100
    ghd_bl = (ghd(g, b, boundary="T") / len(g)) * 100
    ghd_tt = (ghd(g, t, boundary="T") / len(g)) * 100
    # accuracy
    acc_rs = accuracy(list(g), list(r)) * 100
    acc_bl = accuracy(list(g), list(b)) * 100
    acc_tt = accuracy(list(g), list(t)) * 100
    # precision, recall, f-measure ("T" is the positive/boundary class)
    pre_rs = metrics.precision_score(list(g), list(r), pos_label="T") * 100
    rec_rs = metrics.recall_score(list(g), list(r), pos_label="T") * 100
    f_1_rs = (2.0 * (rec_rs * pre_rs)) / (rec_rs + pre_rs)
    pre_bl = metrics.precision_score(list(g), list(b), pos_label="T") * 100
    rec_bl = metrics.recall_score(list(g), list(b), pos_label="T") * 100
    f_1_bl = (2.0 * (rec_bl * pre_bl)) / (rec_bl + pre_bl)
    pre_tt = metrics.precision_score(list(g), list(t), pos_label="T") * 100
    rec_tt = metrics.recall_score(list(g), list(t), pos_label="T") * 100
    f_1_tt = (2.0 * (rec_tt * pre_tt)) / (rec_tt + pre_tt)
    return acc_rs, acc_bl, acc_tt, pre_rs, pre_bl, pre_tt, rec_rs, rec_bl, rec_tt, f_1_rs, f_1_bl, f_1_tt, wdi_rs, wdi_bl, wdi_tt, bpk_rs, bpk_bl, bpk_tt, ghd_rs, ghd_bl, ghd_tt, g.count("T"), b.count("T"), r.count("T"), t.count("T")
# Script section: score a TextTiling-style result (`tt`, with segment start
# indexes in `tt.startids` and sentence count `tt.nsents`) against `predstr`,
# accumulating WD/Pk into the `wds`/`pks` lists defined elsewhere.
logger.info(tt.startids)
logger.info(tt.nsents)
# Gold segmentation: 1 at each segment-start sentence, 0 elsewhere;
# position 0 is dropped below when building the string.
goldseg = numpy.zeros(tt.nsents+1)
goldseg[tt.startids] = 1
goldstr = "".join([str(int(x)) for x in goldseg[1:]])
logger.info(predstr)
logger.info(goldstr)
# Only meaningful with at least two segments.
if len(tt.startids) > 1:
    curr_doc_sizes = numpy.array(tt.startids[1:]) - numpy.array(tt.startids[:-1])
    #evalk = int(round(numpy.average(curr_doc_sizes)/2))
    # Fixed window size instead of half the average segment size (above).
    evalk = 3
    logger.debug("eval k: %d" % evalk)
    # Both metrics can raise ValueError (e.g. strings shorter than the
    # window); log and continue rather than abort the run.
    try:
        wd = windowdiff(goldstr, predstr, k=evalk)
        logger.info("WD: %f", wd)
        wds.append(wd)
    except ValueError as e:
        logger.error("windowdiff value error")
        logger.error(e)
    try:
        pkval = pk(goldstr, predstr, k=evalk)
        logger.info("PK: %f", pkval)
        pks.append(pkval)
    except ValueError as e:
        logger.error("pkval value error")
        logger.error(e)