def correctTranslation():
    data = request.get_json()
    translation = data["translation"]
    beam = data["beam"]
    document_unk_map = data["document_unk_map"]
    attention = data["attention"]
    document_id = data["document_id"]
    sentence_id = data["sentence_id"]

    document = get_document(document_id)
    extractor = DomainSpecificExtractor(source_file=document.filepath, src_lang=SRC_LANG, tgt_lang=TGT_LANG,
                                        train_source_file=f".data/wmt14/train.tok.clean.bpe.32000.{SRC_LANG}",
                                        train_vocab_file=f".data/vocab/train_vocab_{SRC_LANG}.pkl")
    keyphrases = extractor.extract_keyphrases()

    for key in document_unk_map:
        if key not in document.unk_map:
            document.unk_map[key] = document_unk_map[key]
        else:
            # Merge list values
            document.unk_map[key] = list(set(document.unk_map[key]) | set(document_unk_map[key]))

    sentence = document.sentences[int(sentence_id)]
    if translation != sentence.translation:
        sentence.diff = html_diff(sentence.translation[:-4].replace("@@ ", ""),
                                  translation[:-4].replace("@@ ", ""))
    sentence.translation = translation
    sentence.corrected = True
    sentence.flagged = False
    sentence.attention = attention
    sentence.beam = beam

    scorer = Scorer()
    score = scorer.compute_scores(sentence.source, sentence.translation, attention, keyphrases, "")
    score["order_id"] = sentence.score["order_id"]
    sentence.score = score

    document.sentences[int(sentence_id)] = sentence
    save_document(document, document_id)
    return jsonify({})
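# A minimal client-side sketch of the JSON payload correctTranslation() reads above.
# The endpoint URL and route are assumptions (the route decorator is not shown here);
# the payload keys mirror the data[...] accesses in the handler.
import requests

payload = {
    "translation": "korrigierte Uebersetzung </s>",   # hypothetical corrected translation string
    "beam": {},                                       # serialized beam search tree
    "document_unk_map": {"unk_token": ["replacement"]},
    "attention": [[0.9, 0.1], [0.2, 0.8]],            # per-token attention weights
    "document_id": "a-document-uuid",
    "sentence_id": "0",
}
requests.post("http://localhost:5000/correctTranslation", json=payload)  # hypothetical route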
def filterForSimilarSentences(document_id, reference_id):
    document = get_document(document_id)
    reference = document.sentences[int(reference_id)].source
    print("Filter for similar sentences: " + str(reference))

    scorer = Scorer()
    keyphrases = []
    for k in document.keyphrases:
        keyphrases.append((k["name"], k["occurrences"]))

    for i, sentence in enumerate(document.sentences):
        score = scorer.compute_scores(sentence.source, " ".join(sentence.translation),
                                      sentence.attention, keyphrases, reference)
        score["order_id"] = i
        sentence.score = score

    save_document(document, document_id)
    return jsonify({})
class CorrelationExperiment:
    def __init__(self, model, source_file, target_file, source_file2, target_file2,
                 num_sentences=1000, beam_size=3):
        self.model = model
        self.source_file = source_file
        self.target_file = target_file
        self.source_file2 = source_file2
        self.target_file2 = target_file2
        self.num_sentences = num_sentences
        self.beam_size = beam_size
        self.translationList = []
        self.pairs = []
        self.scoresList = []
        self.scorer = Scorer()
        self.metric_to_cter = {}
        self.all_cter_scores = []
        self.metric_to_bad = {}

    # metric order -> correlation
    def plot_correlation(self, dir, prefix, filename):
        palette = sns.color_palette()
        for i, metric in enumerate(metrics):
            f, axes = plt.subplots()
            f.set_figheight(6)
            f.set_figwidth(6)
            x, y = [], []
            score_cter_tuples = []
            cters = []
            for score in self.metric_to_cter[metric]:
                for v in self.metric_to_cter[metric][score]:
                    cters.append(v)
                score_cter_tuples += [(score, v) for v in self.metric_to_cter[metric][score]]
                values = self.metric_to_cter[metric][score]
                x += [score] * len(values)
                y += values
            score_cter_tuples = sorted(score_cter_tuples, key=lambda t: t[0],
                                       reverse=reverse_sort_direction[metric])
            self.metric_to_bad[metric] = score_cter_tuples

            axes.set_ylim(-0.1, 1.1)
            plt.xticks(fontsize=15)
            plt.yticks(fontsize=15)
            corr, p_val = pearsonr(x, y)
            axes.text(0.05, 0.95, "r = {0:.2f}".format(corr.item()), transform=axes.transAxes,
                      va="top", fontsize=13, weight="bold")
            sns.regplot(x, y, ax=axes, scatter_kws={'alpha': 0.2}, order=1, color=palette[i])
            plt.ylabel("CharacTER", fontsize=17)
            plt.xlabel("Metric: " + name_map[metric], fontsize=17)
            plt.savefig(os.path.join(dir, prefix + "_" + metric + filename))
            plt.close()

    # metric order -> document quality
    def plot_bad(self, dir, prefix, filename):
        palette = sns.color_palette()
        metric_percentage = {}
        mean = statistics.mean(self.all_cter_scores)
        stdev = statistics.stdev(self.all_cter_scores)
        threshold = mean + stdev

        for metric in metrics:
            bad_percentage = []
            curr_bad_count = 0
            for score, cter in self.metric_to_bad[metric]:
                if cter >= threshold:
                    curr_bad_count += 1
                bad_percentage.append(curr_bad_count)
            metric_percentage[metric] = bad_percentage

        for i, metric in enumerate(metrics):
            f, axes = plt.subplots()
            f.set_figheight(6)
            f.set_figwidth(6)
            bad_percentage = metric_percentage[metric]
            x = [100 * i / len(bad_percentage) for i in range(1, len(bad_percentage) + 1)]
            y = [100 * p / max(bad_percentage) for p in bad_percentage]
            line, = plt.plot(x, y, color=palette[i], linewidth=2, alpha=0.9)
            line.set_label(name_map[metric])
            line, = plt.plot(x, x, marker='', linestyle="--", color='black', linewidth=1, alpha=0.9)
            line.set_label("theoretical baseline")
            for m in metrics:
                if m == metric:
                    continue
                line, = plt.plot([100 * i / len(metric_percentage[m])
                                  for i in range(1, len(metric_percentage[m]) + 1)],
                                 [100 * p / max(metric_percentage[m]) for p in metric_percentage[m]],
                                 marker='', color=palette[metrics.index(m)], linewidth=1, alpha=0.5,
                                 label=name_map[m], linestyle="-")
            plt.legend(loc='upper left', ncol=1, fontsize=12)
            plt.yticks([0, 25, 50, 75, 100], fontsize=15)
            plt.xticks([0, 25, 50, 75, 100], fontsize=15)
            plt.ylabel("% sentences with low quality covered", fontsize=17)
            plt.xlabel("% sentences covered (metric: " + name_map[metric] + ")", fontsize=17)
            plt.savefig(os.path.join(dir, prefix + "_percentages" + "_" + metric + filename))
            print("saved bad")
            plt.close()

    # BLEU values of the remaining text.
    # Sentences are sorted from good to bad according to the metric, and the BLEU score is
    # computed up to the current sentence.
    # The plot shows the BLEU score when removing the worst sentences first, until only one
    # good sentence remains.
    def plot_bleu(self, dir, prefix, filename):
        palette = sns.color_palette()
        metric_values = {}
        for metric in metrics:
            sorted_sentences = [(x, y, z) for _, x, y, z in sorted(
                zip([s[metric] for s in self.scoresList],
                    [p[0] for p in self.pairs],
                    [p[1] for p in self.pairs],
                    [" ".join(translation[:-1]) for translation in self.translationList]),
                reverse=not reverse_sort_direction[metric])]
            sources, targets, translations = zip(*sorted_sentences)
            values = []
            for i in range(len(sources)):
                s = [targets[j] for j in range(0, i + 1)]
                t = [translations[j] for j in range(0, i + 1)]
                bleu = compute_bleu(s, t)
                values.append(bleu)
            values.reverse()
            metric_values[metric] = values

        for i, metric in enumerate(metrics):
            f, axes = plt.subplots()
            f.set_figheight(6)
            f.set_figwidth(6)
            values = metric_values[metric]
            x = [100 * i / len(values[:-25]) for i in range(1, len(values[:-25]) + 1)]
            y = [100 * p for p in values[:-25]]
            line, = plt.plot(x, y, color=palette[i], linewidth=2, alpha=0.9)
            line.set_label(name_map[metric])
            for m in metrics:
                if m == metric:
                    continue
                line, = plt.plot([100 * i / len(metric_values[m][:-25])
                                  for i in range(1, len(metric_values[m][:-25]) + 1)],
                                 [100 * p for p in metric_values[m][:-25]],
                                 marker='', color=palette[metrics.index(m)], linewidth=1, alpha=0.5,
                                 label=name_map[m], linestyle="-")
                # line.set_label("other metrics")
            plt.legend(loc='upper left', ncol=1, fontsize=12)
            plt.yticks(fontsize=15)
            plt.xticks([0, 25, 50, 75, 100], fontsize=15)
            plt.ylabel("BLEU", fontsize=17)
            plt.xlabel("% sentences covered (metric: " + name_map[metric] + ")", fontsize=17)
            plt.savefig(os.path.join(dir, prefix + "_" + metric + filename))
            print("saved bleu")
            plt.close()

    # CharacTER.
    # Sentences are sorted from bad to good according to the metric.
    # The plot shows the CharacTER score of the currently processed sentence.
    def plot_cter2(self, dir, prefix, filename):
        palette = sns.color_palette()
        metric_values = {}
        for metric in metrics:
            sorted_sentences = [(x, y, z) for _, x, y, z in sorted(
                zip([s[metric] for s in self.scoresList],
                    [p[0] for p in self.pairs],
                    [p[1] for p in self.pairs],
                    [" ".join(translation[:-1]) for translation in self.translationList]),
                reverse=reverse_sort_direction[metric])]
            sources, targets, translations = zip(*sorted_sentences)
            values = []
            for i in range(len(sources)):
                s = targets[i]
                t = translations[i]
                cter = compute_cter(s, t)
                values.append(cter)
            metric_values[metric] = values

        for i, metric in enumerate(metrics):
            f, axes = plt.subplots()
            f.set_figheight(6)
            f.set_figwidth(6)
            axes.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
            values = metric_values[metric]
            x = [100 * i / len(values) for i in range(1, len(values) + 1)]
            y = [p for p in values]
            plt.plot(x, y, color=palette[i], linewidth=1, alpha=0.9)
            plt.xlim(0, 100)

            def movingaverage(interval, window_size):
                # Smooth the per-sentence CharacTER curve with a simple moving average
                window = np.ones(int(window_size)) / float(window_size)
                return np.convolve(interval, window, 'valid')

            y_av = movingaverage(y, 100)
            plt.plot(x[50:-49], y_av, color='black', linewidth=3, alpha=0.9)
            plt.yticks(fontsize=15)
            plt.xticks([0, 25, 50, 75, 100], fontsize=15)
            plt.ylabel("CharacTER", fontsize=17)
            plt.xlabel("% sentences covered (metric: " + name_map[metric] + ")", fontsize=17)
            plt.savefig(os.path.join(dir, prefix + "_" + metric + filename))
            print("saved characTER (2)")
            plt.close()

    # Average CharacTER of the remaining text.
    # Sentences are sorted from good to bad according to the metric, and the average CharacTER
    # score is computed up to the current sentence.
    # The plot shows the average CharacTER score when removing the worst sentences first, until
    # only one good sentence remains.
    def plot_cter(self, dir, prefix, filename):
        palette = sns.color_palette()
        metric_values = {}
        for metric in metrics:
            sorted_sentences = [(x, y, z) for _, x, y, z in sorted(
                zip([s[metric] for s in self.scoresList],
                    [p[0] for p in self.pairs],
                    [p[1] for p in self.pairs],
                    [" ".join(translation[:-1]) for translation in self.translationList]),
                reverse=not reverse_sort_direction[metric])]
            sources, targets, translations = zip(*sorted_sentences)
            values = []
            val = 0
            for i in range(len(sources)):
                s = targets[i]
                t = translations[i]
                cter = compute_cter(s, t)
                val += cter
                values.append(val / (i + 1))
            values.reverse()
            metric_values[metric] = values

        for i, metric in enumerate(metrics):
            f, axes = plt.subplots()
            f.set_figheight(6)
            f.set_figwidth(6)
            values = metric_values[metric]
            x = [100 * i / len(values[:-25]) for i in range(1, len(values[:-25]) + 1)]
            y = [p for p in values[:-25]]
            line, = plt.plot(x, y, color=palette[i], linewidth=2, alpha=0.9)
            line.set_label(name_map[metric])
            for m in metrics:
                if m == metric:
                    continue
                line, = plt.plot([100 * i / len(metric_values[m][:-25])
                                  for i in range(1, len(metric_values[m][:-25]) + 1)],
                                 [p for p in metric_values[m][:-25]],
                                 marker='', color=palette[metrics.index(m)], linewidth=1, alpha=0.5,
                                 label=name_map[m], linestyle="-")
                line.set_label("other metrics")
            plt.legend(loc='upper left', ncol=1, fontsize=12)
            plt.yticks(fontsize=15)
            plt.xticks([0, 25, 50, 75, 100], fontsize=15)
            plt.ylabel("CharacTER", fontsize=17)
            plt.xlabel("% sentences covered (metric: " + name_map[metric] + ")", fontsize=17)
            plt.savefig(os.path.join(dir, prefix + "_" + metric + filename))
            print("saved characTER")
            plt.close()

    # distribution plot
    def plot_distr(self, dir, prefix, filename):
        palette = sns.color_palette()
        bins_map = {"length": 60}
        for i, metric in enumerate(metrics):
            f, axes = plt.subplots()
            f.set_figheight(6)
            f.set_figwidth(6)
            metric_scores = []
            for value in self.metric_to_cter[metric]:
                metric_scores += len(self.metric_to_cter[metric][value]) * [value]
            if metric == "length":
                bins_map["length"] = max(metric_scores) - min(metric_scores) + 1
            plt.ylabel("Density", fontsize=17)
            plt.xlabel("Metric: " + name_map[metric], fontsize=17)
            plt.xticks(fontsize=15)
            plt.yticks(fontsize=15)
            bins = bins_map[metric] if metric in bins_map else None
            dist_ax = sns.distplot(metric_scores, ax=axes, color=palette[i], bins=bins,
                                   hist_kws={"alpha": 0.2})
            ax2 = dist_ax.twinx()
            sns.boxplot(x=metric_scores, ax=ax2, color=palette[i])
            ax2.set(ylim=(-5, 5))
            plt.savefig(os.path.join(dir, prefix + "_" + metric + filename))
            plt.close()

        f, axes = plt.subplots()
        f.set_figheight(6)
        f.set_figwidth(6)
        plt.ylabel("Density", fontsize=17)
        plt.xlabel("CharacTER", fontsize=17)
        sns.distplot(self.all_cter_scores)
        plt.savefig(os.path.join(dir, prefix + "_" + "cter_dist.png"))
        plt.close()

    def run(self, src_lang, tgt_lang, dir, translationFile, scoresFile, attFile):
        loader = LanguagePairLoader(src_lang, tgt_lang, self.source_file, self.target_file)
        _, _, pairs = loader.load()
        loader2 = LanguagePairLoader(src_lang, tgt_lang, self.source_file2, self.target_file2)
        _, _, pairs2 = loader2.load()

        # Concatenate both sets => all 1500 sentences
        pairs = pairs + pairs2
        self.pairs = pairs[:self.num_sentences]

        # Translate sources
        sources, targets, translations = [p[0] for p in self.pairs], [p[1] for p in self.pairs], []

        extractor = DomainSpecificExtractor(
            source_file=self.source_file,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            train_source_file=f".data/wmt14/train.tok.clean.bpe.32000.{src_lang}",
            train_vocab_file=f".data/vocab/train_vocab_{src_lang}.pkl")
        keyphrases = extractor.extract_keyphrases(n_results=100)

        self.translationList = []
        attentionList = []
        self.scoresList = []
        prefix = "_experiments/translated_beam3"
        if (os.path.isfile(os.path.join(prefix, translationFile))
                and os.path.isfile(os.path.join(prefix, scoresFile))
                and os.path.isfile(os.path.join(prefix, attFile))):
            print("Translation reloaded")
            with open(os.path.join(prefix, translationFile), 'rb') as f:
                self.translationList = pickle.load(f)
            with open(os.path.join(prefix, attFile), 'rb') as f:
                attentionList = pickle.load(f)
            with open(os.path.join(prefix, scoresFile), 'rb') as f:
                self.scoresList = pickle.load(f)
        else:
            for i, pair in enumerate(self.pairs):
                if i % 10 == 0:
                    print("Translated {} of {}".format(i, len(self.pairs)))
                translation, attn, _ = self.model.translate(pair[0], beam_size=self.beam_size)
                translations.append(" ".join(translation[:-1]))
                scores = self.scorer.compute_scores(pair[0], " ".join(translation), attn,
                                                    keyphrases, "")
                self.translationList.append(translation)
                attentionList.append(attn)
                self.scoresList.append(scores)
            pickle.dump(self.translationList, open(os.path.join(dir, translationFile), "wb"))
            pickle.dump(self.scoresList, open(os.path.join(dir, scoresFile), "wb"))
            pickle.dump(attentionList, open(os.path.join(dir, attFile), "wb"))

        for i, pair in enumerate(self.pairs):
            if i % 10 == 0:
                print("Processing {} of {}".format(i, len(self.pairs)))
            for metric in self.scoresList[i]:
                # if metric == "coverage_penalty" and self.scoresList[i][metric] > 45:  # remove some outliers
                #     continue
                # if metric == "keyphrase_score" and self.scoresList[i][metric] == 0:
                #     continue
                if metric not in self.metric_to_cter:
                    self.metric_to_cter[metric] = {}
                if self.scoresList[i][metric] not in self.metric_to_cter[metric]:
                    self.metric_to_cter[metric][self.scoresList[i][metric]] = []
                # Note: this runs once per metric, so all_cter_scores holds each sentence's
                # CharacTER value len(metrics) times.
                cter = compute_cter(pair[1], " ".join(self.translationList[i][:-1]))
                self.all_cter_scores.append(cter)
                self.metric_to_cter[metric][self.scoresList[i][metric]].append(cter)
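# A minimal usage sketch for CorrelationExperiment under stated assumptions: the data file
# paths are hypothetical, and load_model (used elsewhere in this codebase with the signature
# load_model(src_lang, tgt_lang, model_type, device=...)) plus the module-level globals
# (metrics, name_map, reverse_sort_direction, DEVICE) are assumed to be defined.
if __name__ == "__main__":
    model = load_model("de", "en", "seq2seq", device=DEVICE)  # "seq2seq" model_type is an assumption
    exp = CorrelationExperiment(model,
                                "data/khresmoi.tok.de", "data/khresmoi.tok.en",      # hypothetical paths
                                "data/khresmoi2.tok.de", "data/khresmoi2.tok.en",
                                num_sentences=1000, beam_size=3)
    exp.run("de", "en", "_experiments/out", "translations.pkl", "scores.pkl", "attentions.pkl")
    exp.plot_correlation("_experiments/out", "corr", ".png")  # also fills metric_to_bad
    exp.plot_bad("_experiments/out", "corr", ".png")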
class MetricExperiment:
    def __init__(
        self,
        model,
        src_lang,
        tgt_lang,
        model_type,
        source_file,
        target_file,
        test_source_file,
        test_target_file,
        dir,
        evaluate_every=10,
        num_sentences=400,
        num_sentences_test=500,
        reuseCalculatedTranslations=False,
        reuseInitialTranslations=False,
        initialTranslationFile="",
        initialScoreFile="",
        initialTestTranslationFile="",
        translationFile="",
        batch_translate=True,
    ):
        self.model = model
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.model_type = model_type
        self.source_file = source_file
        self.target_file = target_file
        self.loader = LanguagePairLoader(src_lang, tgt_lang, source_file, target_file)
        self.test_loader = LanguagePairLoader(src_lang, tgt_lang, test_source_file, test_target_file)
        self.extractor = DomainSpecificExtractor(
            source_file=source_file,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            train_source_file=f".data/wmt14/train.tok.clean.bpe.32000.{src_lang}",
            train_vocab_file=f".data/vocab/train_vocab_{src_lang}.pkl")
        self.target_extractor = DomainSpecificExtractor(
            source_file=target_file,
            src_lang=tgt_lang,
            tgt_lang=src_lang,
            train_source_file=f".data/wmt14/train.tok.clean.bpe.32000.{tgt_lang}",
            train_vocab_file=f".data/vocab/train_vocab_{tgt_lang}.pkl")
        self.scorer = Scorer()
        self.scores = {}
        self.num_sentences = num_sentences
        self.num_sentences_test = num_sentences_test
        self.batch_translate = batch_translate
        self.evaluate_every = evaluate_every
        self.reuseCalculatedTranslations = reuseCalculatedTranslations
        self.reuseInitialTranslations = reuseInitialTranslations
        self.initialTranslationFile = initialTranslationFile
        self.initialScoreFile = initialScoreFile
        self.initialTestTranslationFile = initialTestTranslationFile
        self.translationFile = translationFile
        self.metric_bleu_scores = {}
        self.metric_gleu_scores = {}
        self.metric_precisions = {}
        self.metric_recalls = {}
        self.prefix = "_experiments/retrain_beam3"
        self.dir = dir

    def save_data(self):
        prefix = ("batch_" if self.batch_translate else "beam_") + str(self.evaluate_every) + "_"
        prefix = os.path.join(self.dir, prefix)
        pickle.dump(self.metric_bleu_scores, open(prefix + "metric_bleu_scores.pkl", "wb"))
        pickle.dump(self.metric_gleu_scores, open(prefix + "metric_gleu_scores.pkl", "wb"))
        pickle.dump(self.metric_precisions, open(prefix + "metric_precisions.pkl", "wb"))
        pickle.dump(self.metric_recalls, open(prefix + "metric_recalls.pkl", "wb"))
        print("Saved all scores")

    def save_translation(self, translation, metric, step):
        name = os.path.join(self.dir, metric + "_" + str(step) + self.translationFile)
        pickle.dump(translation, open(name, "wb"))
        print("Saved: " + name)

    def restore_translation(self, metric, step):
        name = os.path.join(self.prefix, metric + "_" + str(step) + self.translationFile)
        with open(name, 'rb') as f:
            return pickle.load(f)

    def save_initialTranslation(self, scores, translations):
        name = os.path.join(self.dir, self.initialTranslationFile)
        pickle.dump(translations, open(name, "wb"))
        name = os.path.join(self.dir, self.initialScoreFile)
        pickle.dump(scores, open(name, "wb"))
        print("Saved: " + name)

    def restore_initialTranslation(self):
        name = os.path.join(self.prefix, self.initialTranslationFile)
        with open(name, 'rb') as f:
            translations = pickle.load(f)
        name = os.path.join(self.prefix, self.initialScoreFile)
        with open(name, 'rb') as f:
            scores = pickle.load(f)
        return translations, scores

    def save_initialTestTranslation(self, translations):
        name = os.path.join(self.dir, self.initialTestTranslationFile)
        pickle.dump(translations, open(name, "wb"))
        print("Saved: " + name)

    def restore_initialTestTranslation(self):
        name = os.path.join(self.prefix, self.initialTestTranslationFile)
        with open(name, 'rb') as f:
            return pickle.load(f)

    def run(self):
        _, _, pairs = self.loader.load()
        random.shuffle(pairs)
        pairs = pairs[:self.num_sentences]
        sources, targets, translations = [p[0] for p in pairs], [p[1] for p in pairs], []
        keyphrases = self.extractor.extract_keyphrases(n_results=100)
        target_keyphrases = self.target_extractor.extract_keyphrases(n_results=100)

        # Translation and scores determining the order of retraining
        print('Translating ...')
        if not self.reuseCalculatedTranslations and not self.reuseInitialTranslations:
            for i, pair in enumerate(tqdm(pairs)):
                translation, attn, _ = self.model.translate(pair[0])
                translations.append(" ".join(translation[:-1]))
                metrics_scores = self.scorer.compute_scores(pair[0], " ".join(translation[:-1]),
                                                            attn, keyphrases, "")
                for metric in metrics_scores:
                    if metric not in self.scores:
                        self.scores[metric] = []
                    self.scores[metric].append(metrics_scores[metric])
            self.save_initialTranslation(self.scores, translations)
        else:
            translations, self.scores = self.restore_initialTranslation()

        # Initial test set translation
        _, _, test_pairs = self.test_loader.load()
        test_pairs = test_pairs[:self.num_sentences_test]
        test_sources, test_targets, test_translations = [p[0] for p in test_pairs], \
                                                        [p[1] for p in test_pairs], []
        if not self.reuseCalculatedTranslations and not self.reuseInitialTranslations:
            print('- not reusing translations: Translating...')
            for i, source in enumerate(tqdm(test_sources)):
                translation, attn, _ = self.model.translate(source)
                test_translations.append(" ".join(translation[:-1]))
            if self.batch_translate:
                test_translations = [t[:-6] for t in self.model.batch_translate(test_sources)]
            self.save_initialTestTranslation(test_translations)
        else:
            test_translations = self.restore_initialTestTranslation()

        metrics = ["random", "keyphrase_score", "coverage_penalty", "confidence", "length"]
        print("Evaluating metrics...")
        for i, metric in enumerate(tqdm(metrics)):
            self.metric_bleu_scores[metric] = []
            self.metric_gleu_scores[metric] = []
            self.metric_precisions[metric] = []
            self.metric_recalls[metric] = []
            sourcesCopy = sources[:]
            targetsCopy = targets[:]
            translationsCopy = translations[:]
            self.evaluate_metric(
                self.src_lang,
                self.tgt_lang,
                self.model_type,
                sourcesCopy,
                targetsCopy,
                translationsCopy,
                self.scores[metric] if metric != "random" else [],
                metric,
                target_keyphrases,
                test_sources,
                test_targets,
                test_translations,
                need_sort=(metric != "random"),
                reverse=reverse_sort_direction[metric] if metric != "random" else True)
        print()
        print(self.metric_bleu_scores)
        self.save_data()

    def shuffle_list(self, *ls):
        l = list(zip(*ls))
        random.shuffle(l)
        return zip(*l)

    def evaluate_metric(self, src_lang, tgt_lang, model_type, sources, targets, translations,
                        scores, metric, target_keyphrases, test_sources, test_targets,
                        test_translations, need_sort=True, reverse=False):
        print()
        print("Evaluating {}".format(metric))
        base_bleu = compute_bleu(targets, translations)
        print("Base BLEU (of retraining data): {}".format(base_bleu))

        # Sort by metric
        if need_sort:
            sorted_sentences = [(x, y, z) for _, x, y, z in
                                sorted(zip(scores, sources, targets, translations), reverse=reverse)]
            sources, targets, translations = zip(*sorted_sentences)
        else:
            sources, targets, translations = self.shuffle_list(sources, targets, translations)

        n = len(sources)
        encoder_optimizer_state, decoder_optimizer_state = None, None

        pretraining_bleu = compute_bleu(test_targets, test_translations)
        pretraining_gleu = compute_gleu(test_targets, test_translations)
        print()
        print("pretraining BLEU of test set (before retraining)")
        print(pretraining_bleu)
        prerecall = unigram_recall(target_keyphrases, test_targets, test_translations)
        preprecision = unigram_precision(target_keyphrases, test_targets, test_translations)

        self.metric_bleu_scores[metric].append((pretraining_bleu, pretraining_bleu))
        self.metric_gleu_scores[metric].append((pretraining_gleu, pretraining_gleu))
        self.metric_recalls[metric].append((prerecall, prerecall))
        self.metric_precisions[metric].append((preprecision, preprecision))
        self.save_data()

        if isinstance(self.model, TransformerTranslator):
            # Create a new checkpoint here that gets overwritten with each iteration.
            # Necessary to load the trainer state.
            current_ckpt = f'.data/models/transformer/trafo_{src_lang}_{tgt_lang}_ensemble.pt'

        print('Training...')
        for i in tqdm(range(0, n)):
            # Retranslate only every evaluate_every-th sentence.
            # Evaluates at the 0th, 10th, 20th, ... sentence -> computes for sentence batches
            # (0..9), (10..19), (20..29); first sentence i = 0; evaluate_every = 10.
            if i % self.evaluate_every != 0:
                continue
            if not self.reuseCalculatedTranslations:
                # Now train, and compute BLEU again
                start = i
                end = min(i + self.evaluate_every, n)
                print()
                print("Correcting {} - {} of {} sentences".format(start, end - 1, n))
                if isinstance(self.model, Seq2SeqModel):
                    # Same parameters that are used in the tool
                    encoder_optimizer_state, decoder_optimizer_state = retrain_iters(
                        self.model,
                        [[x, y] for x, y in zip(sources[start:end], targets[start:end])],
                        [],
                        src_lang,
                        tgt_lang,
                        batch_size=1,
                        encoder_optimizer_state=encoder_optimizer_state,
                        decoder_optimizer_state=decoder_optimizer_state,
                        print_every=1,
                        n_epochs=15,
                        learning_rate=0.0001,
                        save_ckpt=i == n - 1)
                else:
                    # Same parameters that are used in the tool
                    current_ckpt = self.model.retrain(
                        src_lang,
                        tgt_lang,
                        [[x, y] for x, y in zip(sources[start:end], targets[start:end])],
                        last_ckpt=current_ckpt,
                        epochs=15,
                        batch_size=1,
                        device=DEVICE,
                        save_ckpt=i == n - 1,
                        print_info=False)

                corrected_translations = []
                print(' - Translate using trained model')
                if not self.batch_translate:
                    # Translate the test set with the retrained model
                    for j in tqdm(range(0, len(test_sources))):
                        translation, _, _ = self.model.translate(test_sources[j])
                        corrected_translations.append(" ".join(translation[:-1]))
                else:
                    batch_translations = self.model.batch_translate(test_sources)
                    corrected_translations = [t[:-6] for t in batch_translations]
                self.save_translation(corrected_translations, metric, i)
            else:
                corrected_translations = self.restore_translation(metric, i)

            # Compute posttraining BLEU
            posttraining_bleu = compute_bleu(test_targets, corrected_translations)
            posttraining_gleu = compute_gleu(test_targets, corrected_translations)
            postrecall = unigram_recall(target_keyphrases, test_targets, corrected_translations)
            postprecision = unigram_precision(target_keyphrases, test_targets,
                                              corrected_translations)
            print("(Base BLEU {})".format(base_bleu))
            print("Delta Recall {} -> {}".format(prerecall, postrecall))
            print("Delta Precision {} -> {}".format(preprecision, postprecision))
            print("Delta GLEU: {} -> {}".format(pretraining_gleu, posttraining_gleu))
            print("Delta BLEU: {} -> {}".format(pretraining_bleu, posttraining_bleu))
            delta_bleu = posttraining_bleu - pretraining_bleu
            print("Delta: {}".format(delta_bleu))

            self.metric_bleu_scores[metric].append((pretraining_bleu, posttraining_bleu))
            self.metric_gleu_scores[metric].append((pretraining_gleu, posttraining_gleu))
            self.metric_recalls[metric].append((prerecall, postrecall))
            self.metric_precisions[metric].append((preprecision, postprecision))
            self.save_data()

        self.model = load_model(src_lang, tgt_lang, model_type, device=DEVICE)  # reload initial model
        return None
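# A minimal usage sketch for MetricExperiment; the data file paths are hypothetical, and
# load_model / DEVICE are the same assumed helpers as above. The keyword arguments mirror
# the constructor signature.
if __name__ == "__main__":
    model = load_model("de", "en", "transformer", device=DEVICE)  # "transformer" model_type is an assumption
    exp = MetricExperiment(model, "de", "en", "transformer",
                           "data/medical.tok.de", "data/medical.tok.en",      # hypothetical paths
                           "data/medical.test.de", "data/medical.test.en",
                           dir="_experiments/retrain_out",
                           evaluate_every=10,
                           initialTranslationFile="initial_translations.pkl",
                           initialScoreFile="initial_scores.pkl",
                           initialTestTranslationFile="initial_test_translations.pkl",
                           translationFile="_translations.pkl")
    exp.run()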
class AveragedMetricExperiment:
    def __init__(self, model, source_file, target_file, raw_source_file, raw_target_file,
                 num_sentences=400):
        self.model = model
        self.source_file = source_file
        self.target_file = target_file
        self.loader = LanguagePairLoader("de", "en", source_file, target_file)
        self.extractor = DomainSpecificExtractor(source_file=raw_source_file,
                                                 train_source_file=hp.source_file,
                                                 train_vocab_file="train_vocab.pkl")
        self.target_extractor = DomainSpecificExtractor(source_file=raw_target_file,
                                                        train_source_file=hp.target_file,
                                                        train_vocab_file="train_vocab_en.pkl")
        self.scorer = Scorer()
        self.scores = {}
        self.num_sentences = num_sentences
        self.metric_bleu_scores = {}
        self.metric_gleu_scores = {}
        self.metric_precisions = {}
        self.metric_recalls = {}
        self.cer = {}

        # Plot each metric
        plt.style.use('seaborn-darkgrid')
        self.palette = sns.color_palette()

    def save_data(self):
        prefix = "averaged_"
        pickle.dump(self.metric_bleu_scores, open(prefix + "metric_bleu_scores.pkl", "wb"))
        pickle.dump(self.metric_gleu_scores, open(prefix + "metric_gleu_scores.pkl", "wb"))
        pickle.dump(self.metric_precisions, open(prefix + "metric_precisions.pkl", "wb"))
        pickle.dump(self.metric_recalls, open(prefix + "metric_recalls.pkl", "wb"))
        pickle.dump(self.cer, open(prefix + "metric_cer.pkl", "wb"))
        print("Saved all scores")

    def run(self):
        _, _, pairs = self.loader.load()
        random.seed(2018)
        random.shuffle(pairs)
        pairs = pairs[:self.num_sentences]
        sources, targets, translations = [p[0] for p in pairs], [p[1] for p in pairs], []
        keyphrases = self.extractor.extract_keyphrases(n_results=100)
        print(keyphrases)
        target_keyphrases = self.target_extractor.extract_keyphrases(n_results=100)
        print(target_keyphrases)

        for i, pair in enumerate(pairs):
            if i % 10 == 0:
                print("Translated {} of {}".format(i, len(pairs)))
            translation, attn, _ = self.model.translate(pair[0])
            translations.append(" ".join(translation[:-1]))
            metrics_scores = self.scorer.compute_scores(pair[0], " ".join(translation[:-1]),
                                                        attn, keyphrases)
            for metric in metrics_scores:
                if metric not in self.scores:
                    self.scores[metric] = []
                self.scores[metric].append(metrics_scores[metric])

        metrics = [
            # "coverage_penalty",
            # "coverage_deviation_penalty",
            # "confidence",
            # "length",
            # "ap_in",
            # "ap_out",
            # "random",
            "keyphrase_score"
        ]
        n_iters = 1
        for i, metric in enumerate(metrics):
            # step_size is assumed to be a module-level constant defined elsewhere in this file
            avg_bleus = [0 for _ in range(1, 100 // (step_size * 2) + 1)]
            self.metric_bleu_scores[metric] = []
            self.metric_gleu_scores[metric] = []
            self.metric_precisions[metric] = []
            self.metric_recalls[metric] = []
            self.cer[metric] = []
            for j in range(n_iters):
                self.evaluate_metric(sources, targets, translations,
                                     self.scores[metric] if metric != "random" else [],
                                     metric,
                                     target_keyphrases,
                                     need_sort=(metric != "random"),
                                     reverse=sort_direction[metric] if metric != "random" else True)
            # plt.plot(x, delta_bleus, marker='', linestyle="--", color=self.palette[i],
            #          linewidth=1, alpha=0.9, label=metric)
        self.save_data()

    def shuffle_list(self, *ls):
        l = list(zip(*ls))
        random.shuffle(l)
        return zip(*l)

    def evaluate_metric(self, sources, targets, translations, scores, metric, target_keyphrases,
                        need_sort=True, reverse=False):
        print("Evaluating {}".format(metric))
        base_bleu = compute_bleu(targets, translations)
        print("Base BLEU: {}".format(base_bleu))

        # Sort by metric
        if need_sort:
            sorted_sentences = [(x, y, z) for _, x, y, z in
                                sorted(zip(scores, sources, targets, translations), reverse=reverse)]
            sources, targets, translations = zip(*sorted_sentences)
        else:
            sources, targets, translations = self.shuffle_list(sources, targets, translations)

        n = len(sources)
        encoder_optimizer_state, decoder_optimizer_state = None, None
        corrected_translations = []
        cer_improvement = []
        curr_cer = 0

        for i in range(1, n + 1):
            print()
            print("{}: Correcting {} of {} sentences".format(metric, i, n))
            curr_end = i

            # Compute BLEU before training for comparison
            pretraining_bleu = compute_bleu(targets[:curr_end], translations[:curr_end])
            pretraining_gleu = compute_gleu(targets[:curr_end], translations[:curr_end])
            prerecall = unigram_recall(target_keyphrases, targets[:curr_end],
                                       translations[:curr_end])
            preprecision = unigram_precision(target_keyphrases, targets[:curr_end],
                                             translations[:curr_end])
            precer = cer(targets[i - 1].replace("@@ ", "").split(),
                         translations[i - 1].replace("@@ ", "").split())

            # Originally this called a stale `seq2seq_model` global; self.model is the
            # model being retrained here.
            translation, _, _ = self.model.translate(sources[i - 1])
            corrected_translations.append(" ".join(translation[:-1]))
            postcer = cer(targets[i - 1].replace("@@ ", "").split(),
                          " ".join(translation[:-1]).replace("@@ ", "").split())
            curr_cer = precer - postcer
            cer_improvement.append(curr_cer)

            # Compute posttraining BLEU
            posttraining_bleu = compute_bleu(targets[:curr_end], corrected_translations)
            posttraining_gleu = compute_gleu(targets[:curr_end], corrected_translations)
            postrecall = unigram_recall(target_keyphrases, targets[:curr_end],
                                        corrected_translations)
            postprecision = unigram_precision(target_keyphrases, targets[:curr_end],
                                              corrected_translations)
            print("Delta Recall {} -> {}".format(prerecall, postrecall))
            print("Delta Precision {} -> {}".format(preprecision, postprecision))
            print("Delta BLEU: {} -> {}".format(pretraining_bleu, posttraining_bleu))
            print("Delta CER: {} -> {}".format(precer, postcer))

            self.metric_bleu_scores[metric].append((pretraining_bleu, posttraining_bleu))
            self.metric_gleu_scores[metric].append((pretraining_gleu, posttraining_gleu))
            self.metric_recalls[metric].append((prerecall, postrecall))
            self.metric_precisions[metric].append((preprecision, postprecision))

            # Now train, and compute BLEU again
            encoder_optimizer_state, decoder_optimizer_state = retrain_iters(
                self.model,
                [[sources[i - 1], targets[i - 1]]],
                [],
                batch_size=1,
                encoder_optimizer_state=encoder_optimizer_state,
                decoder_optimizer_state=decoder_optimizer_state,
                n_epochs=1,
                learning_rate=0.00005,
                weight_decay=1e-3)

        self.cer[metric] = cer_improvement
        reload_model(self.model)
        return None

    def plot(self):
        plt.xlabel('% Corrected Sentences')
        plt.ylabel('Δ BLEU')
        # Add title
        plt.title("BLEU Change for Metrics", loc='center', fontsize=12, fontweight=0)
        # Add legend
        plt.legend(loc='lower right', ncol=1)
        plt.savefig('bleu_deltas.png')
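# A minimal usage sketch for AveragedMetricExperiment; the file paths and the load_model
# helper are hypothetical, and hp.source_file / hp.target_file are assumed hyperparameter
# globals as in the constructor above.
if __name__ == "__main__":
    model = load_model("de", "en", "seq2seq", device=DEVICE)  # hypothetical invocation
    exp = AveragedMetricExperiment(model,
                                   "data/medical.tok.de", "data/medical.tok.en",  # hypothetical paths
                                   "data/medical.raw.de", "data/medical.raw.en",
                                   num_sentences=400)
    exp.run()
    exp.plot()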
def documentUpload():
    if 'file' not in request.files:
        return redirect(request.url)
    file = request.files['file']
    # If the user does not select a file, the browser also
    # submits an empty part without a filename.
    if file.filename == '':
        return redirect(request.url)
    if file and allowed_file(file.filename):
        document_name = request.args.get("document_name")
        id = uuid4()
        filename = secure_filename(file.filename)
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        file.save(filepath)

        user = User.query.filter_by(username=get_jwt_identity()).first()
        dbDocument = DBDocument(id=id, name=document_name, user=user, model=model_name)
        document = Document(str(id), document_name, dict(), filepath)

        sentences = document.load_content(filename)
        sentences = list(filter(None, sentences))  # remove empty lines
        with open(filepath, "w", encoding='utf-8') as f:
            for i, sentence in enumerate(sentences):
                # No trailing newline after the last sentence. (The original conditional
                # expression covered the whole write argument, dropping the last sentence.)
                f.write(sentence.replace("@@ ", "") + ("\n" if i < len(sentences) - 1 else ""))

        extractor = DomainSpecificExtractor(source_file=filepath, src_lang=SRC_LANG, tgt_lang=TGT_LANG,
                                            train_source_file=f".data/wmt14/train.tok.clean.bpe.32000.{SRC_LANG}",
                                            train_vocab_file=f".data/vocab/train_vocab_{SRC_LANG}.pkl")
        keyphrases = extractor.extract_keyphrases(n_results=30)

        scorer = Scorer()
        print("Translating {} sentences".format(len(sentences)))
        beamSize = 3
        attLayer = -2
        for i, source in enumerate(sentences):
            translation, attn, translations = model.translate(source, beam_size=beamSize,
                                                              attLayer=attLayer, beam_length=0.6,
                                                              beam_coverage=0.4)
            print("Translated {} of {}".format(i + 1, len(sentences)))
            beam = translationsToTree(translations[:beamSize])
            # print(" ", translation)
            score = scorer.compute_scores(source, " ".join(translation), attn, keyphrases, "")
            score["order_id"] = i
            sentence = Sentence(i, source, " ".join(translation), attn, beam, score)
            document.sentences.append(sentence)
        print("Finished translation")

        keyphrases = [{"name": k, "occurrences": f, "active": False} for (k, f) in keyphrases]
        document.keyphrases = keyphrases

        db.session.add(dbDocument)
        db.session.commit()
        save_document(document, id)
        return jsonify({})
    return jsonify({})
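# documentUpload() relies on an allowed_file() helper that is not shown in this file.
# A minimal sketch following the standard Flask upload pattern; the extension whitelist
# is an assumption.
ALLOWED_EXTENSIONS = {"txt"}  # hypothetical whitelist

def allowed_file(filename):
    # Accept only filenames whose extension is in the whitelist
    return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS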
class CorrelationExperiment:
    def __init__(self, model, source_file, target_file, num_sentences=1000):
        self.model = model
        self.source_file = source_file
        self.target_file = target_file
        self.scorer = Scorer()
        self.num_sentences = num_sentences
        # Despite the "gleu" naming, these hold CharacTER scores (see compute_cter in run()).
        self.metric_to_gleu = {}
        self.all_gleu_scores = []
        self.metric_to_bad = {}
        self.bad_count = {}
        # self.threshold = 0.2826 - 0.167362
        self.threshold = 0.6

    def plot_correlation(self, filename):
        f, axes = plt.subplots(2, 3, sharey=True)
        f.set_figheight(8)
        f.set_figwidth(12)
        axes = np.reshape(axes, (6,))
        for i, metric in enumerate(metrics):
            x, y = [], []
            x_min = float('inf')
            x_max = float('-inf')
            x_temp = []
            for score in self.metric_to_gleu[metric]:
                values = self.metric_to_gleu[metric][score]
                x_temp += [score] * len(values)

            bad_count = 0
            score_gleu_tuples = []
            gleus = []
            for score in self.metric_to_gleu[metric]:
                for v in self.metric_to_gleu[metric][score]:
                    if v >= self.threshold:
                        bad_count += 1
                    gleus.append(v)
                score_gleu_tuples += [(score, v) for v in self.metric_to_gleu[metric][score]]
                values = self.metric_to_gleu[metric][score]
                x_min = min(x_min, score)
                x_max = max(x_max, score)
                x += [score] * len(values)
                y += values
                # plt.scatter([score] * len(values), values, color=palette(0), alpha=0.5)
            print(bad_count)
            print("Median {}".format(np.median(gleus)))
            print("Std {}".format(np.std(gleus)))
            self.bad_count[metric] = bad_count
            score_gleu_tuples = sorted(score_gleu_tuples, key=lambda t: t[0],
                                       reverse=sort_direction[metric])
            self.metric_to_bad[metric] = score_gleu_tuples

            b, m = P.polyfit(x, y, 1)
            axes[i].set_ylim(-0.1, 1.1)
            if metric == "ap_out":
                axes[i].set_xlim(0, 2.5)
            if metric == "shortness_penalty":
                axes[i].set_xlim(0, 1)
            corr, p_val = pearsonr(x, y)
            axes[i].text(0.05, 0.95, "r = {0:.2f}".format(corr.item()),
                         transform=axes[i].transAxes, va="top", fontsize=13, weight="bold")
            axes[i].set_title(name_map[metric],
                              {'fontsize': 15, 'horizontalalignment': 'left'}, "left")
            sns.regplot(x, y, ax=axes[i], scatter_kws={'alpha': 0.2}, order=1)
            # plt.plot(np.asarray([x_min, x_max]), b + m * np.asarray([x_min, x_max]), '-')
        axes[0].set(ylabel="CharacTER")
        axes[3].set(ylabel="CharacTER")
        plt.tight_layout()
        plt.savefig(filename)
        plt.close()

    def plot_bad(self, filename):
        f, axes = plt.subplots(2, 3, sharey=True)
        f.set_figheight(8)
        f.set_figwidth(12)
        axes = np.reshape(axes, (6,))
        palette = sns.color_palette()
        metric_percentage = {}
        for metric in metrics:
            bad_percentage = []
            curr_bad_count = 0
            for score, gleu in self.metric_to_bad[metric]:
                if gleu >= self.threshold:
                    curr_bad_count += 1
                bad_percentage.append(curr_bad_count / self.bad_count[metric])
            metric_percentage[metric] = bad_percentage
        print(len([metric_percentage[m] for m in metric_percentage]))

        for i, metric in enumerate(metrics):
            plt.subplot(2, 3, i + 1)
            bad_percentage = metric_percentage[metric]
            percentiles = [0.25, 0.5, 0.75]
            indices = []
            for perc in percentiles:
                indices.append(next(x[0] for x in enumerate(bad_percentage) if x[1] >= perc)
                               / len(bad_percentage))
            print(metric)
            print(indices)
            n = len(bad_percentage)
            x = [100 * i / n for i in range(1, n + 1)]
            plt.plot(x, [100 * p for p in bad_percentage], color=palette[i], linewidth=2, alpha=0.9)
            plt.plot(x, x, marker='', linestyle="--", color='black', linewidth=1.5, alpha=0.9)
            for m in metrics:
                plt.plot([100 * i / n for i in range(1, len(metric_percentage[m]) + 1)],
                         [100 * p for p in metric_percentage[m]],
                         marker='', color='grey', linewidth=1, alpha=0.3)
            if i + 1 not in [1, 4]:
                plt.tick_params(labelleft='off')
            plt.yticks([0, 25, 50, 75, 100])
            plt.xticks([0, 25, 50, 75, 100])
            # Add title
            plt.title(name_map[metric], loc='left', fontsize=15, fontweight=0)
            if i + 1 == 5:
                plt.xlabel("Percentile Threshold", fontsize=15)
            if i + 1 == 4 or i + 1 == 1:
                plt.ylabel("% Covered", fontsize=15)
        plt.tight_layout()
        plt.savefig(filename)
        print("saved bad")
        plt.close()

    def plot_distr(self, filename):
        palette = sns.color_palette()
        f, axes = plt.subplots(2, 3)
        f.set_figheight(8)
        f.set_figwidth(12)
        axes = np.reshape(axes, (6,))
        bins_map = {"length": 60}
        metrics = [
            "coverage_penalty", "coverage_deviation_penalty", "confidence",
            "length", "ap_in", "ap_out"
        ]
        for i, metric in enumerate(metrics):
            metric_scores = []
            for value in self.metric_to_gleu[metric]:
                metric_scores += len(self.metric_to_gleu[metric][value]) * [value]
            if metric == "length":
                bins_map["length"] = max(metric_scores) - min(metric_scores) + 1
            # axes[i].set_xlim(0, 61)
            axes[i].set_title(name_map[metric],
                              {'fontsize': 15, 'horizontalalignment': 'left'}, "left")
            bins = bins_map[metric] if metric in bins_map else None
            dist_ax = sns.distplot(metric_scores, ax=axes[i], color=palette[i], bins=bins,
                                   hist_kws={"alpha": 0.2})
            ax2 = dist_ax.twinx()
            sns.boxplot(x=metric_scores, ax=ax2, color=palette[i])
            ax2.set(ylim=(-5, 5))
        plt.tight_layout()
        plt.savefig(filename)
        plt.clf()

        f.set_figheight(4)
        f.set_figwidth(4)
        sns.distplot(self.all_gleu_scores)
        plt.tight_layout()
        plt.savefig("gleu_dist.png")
        plt.clf()

    def run(self):
        loader = LanguagePairLoader("de", "en", self.source_file, self.target_file)
        _, _, pairs = loader.load()
        pairs = pairs[:self.num_sentences]

        # Translate sources
        sources, targets, translations = [p[0] for p in pairs], [p[1] for p in pairs], []
        extractor = DomainSpecificExtractor(source_file="data/khresmoi.tok.de",
                                            train_source_file=hp.source_file,
                                            train_vocab_file="train_vocab.pkl")
        keyphrases = extractor.extract_keyphrases(n_results=100)
        print(keyphrases)

        for i, pair in enumerate(pairs):
            if i % 10 == 0:
                print("Translated {} of {}".format(i, len(pairs)))
            translation, attn, _ = self.model.translate(pair[0], beam_size=1)
            translations.append(" ".join(translation[:-1]))
            scores = self.scorer.compute_scores(pair[0], " ".join(translation), attn, keyphrases)
            for metric in scores:
                # Remove some outliers
                if metric == "coverage_penalty" and scores[metric] > 80:
                    continue
                if metric == "keyphrase_score" and scores[metric] == 0:
                    continue
                if metric not in self.metric_to_gleu:
                    self.metric_to_gleu[metric] = {}
                if scores[metric] not in self.metric_to_gleu[metric]:
                    self.metric_to_gleu[metric][scores[metric]] = []
                # Despite the variable name, this is a CharacTER score
                gleu = compute_cter(pair[1], " ".join(translation[:-1]))
                self.all_gleu_scores.append(gleu)
                self.metric_to_gleu[metric][scores[metric]].append(gleu)
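# A minimal usage sketch for this CorrelationExperiment variant (note it shares its class
# name with the beam-size variant above; they presumably live in separate files). The paths
# and the load_model helper are hypothetical.
if __name__ == "__main__":
    model = load_model("de", "en", "seq2seq", device=DEVICE)  # hypothetical invocation
    exp = CorrelationExperiment(model, "data/khresmoi.tok.de", "data/khresmoi.tok.en",
                                num_sentences=1000)
    exp.run()
    exp.plot_correlation("correlation.png")  # also fills metric_to_bad for plot_bad()
    exp.plot_bad("bad_coverage.png")
    exp.plot_distr("metric_distributions.png")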