    def __init__(self, model, source_file, target_file, test_source_file, test_target_file,
                 raw_source_file, raw_target_file, num_sentences=400, batch_translate=True):
        self.model = model
        self.source_file = source_file
        self.target_file = target_file
        self.loader = LanguagePairLoader("de", "en", source_file, target_file)
        self.test_loader = LanguagePairLoader("de", "en", test_source_file, test_target_file)
        self.extractor = DomainSpecificExtractor(source_file=raw_source_file, train_source_file=hp.source_file,
                                                 train_vocab_file="train_vocab.pkl")
        self.target_extractor = DomainSpecificExtractor(source_file=raw_target_file, train_source_file=hp.source_file,
                                                        train_vocab_file="train_vocab_en.pkl")
        self.scorer = Scorer()
        self.scores = {}
        self.num_sentences = num_sentences
        self.batch_translate = batch_translate
        self.evaluate_every = 10

        self.metric_bleu_scores = {}
        self.metric_gleu_scores = {}
        self.metric_precisions = {}
        self.metric_recalls = {}
        # Accumulators used by run(): metric name -> {metric value: [sentence-level scores]}
        self.metric_to_gleu = {}
        self.all_gleu_scores = []

        # Plot each metric
        plt.style.use('seaborn-darkgrid')
        self.palette = sns.color_palette()

    def run(self):
        loader = LanguagePairLoader("de", "en", self.source_file, self.target_file)
        _, _, pairs = loader.load()

        pairs = pairs[:self.num_sentences]
        # Translate sources
        sources, targets, translations = [p[0] for p in pairs], [p[1] for p in pairs], []

        extractor = DomainSpecificExtractor(source_file="data/khresmoi.tok.de",
                                            train_source_file=hp.source_file,
                                            train_vocab_file="train_vocab.pkl")
        keyphrases = extractor.extract_keyphrases(n_results=100)
        print(keyphrases)

        for i, pair in enumerate(pairs):
            if i % 10 == 0:
                print("Translated {} of {}".format(i, len(pairs)))
            translation, attn, _ = self.model.translate(pair[0], beam_size=1)
            translations.append(" ".join(translation[:-1]))
            scores = self.scorer.compute_scores(pair[0], " ".join(translation), attn, keyphrases)

            for metric in scores:
                if metric == "coverage_penalty" and scores[metric] > 80:
                    continue
                if metric == "keyphrase_score" and scores[metric] == 0:
                    continue

                if metric not in self.metric_to_gleu:
                    self.metric_to_gleu[metric] = {}
                if scores[metric] not in self.metric_to_gleu[metric]:
                    self.metric_to_gleu[metric][scores[metric]] = []
                # Note: compute_cter is used here even though the accumulators are named *gleu*
                gleu = compute_cter(pair[1], " ".join(translation[:-1]))
                self.all_gleu_scores.append(gleu)
                self.metric_to_gleu[metric][scores[metric]].append(gleu)
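
        # Hedged sketch (not part of the original example): once the loops above have filled
        # self.metric_to_gleu, each metric can be summarized by averaging the scores collected
        # per metric value, e.g. as input for the seaborn plots configured in __init__.
        for metric, buckets in self.metric_to_gleu.items():
            averages = {value: sum(vals) / len(vals) for value, vals in buckets.items()}
            print(metric, sorted(averages.items())[:5])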
Example #3
def retranslate(document_id):
    document = get_document(document_id)
    scorer = Scorer()
    extractor = DomainSpecificExtractor(source_file=document.filepath, src_lang=SRC_LANG, tgt_lang=TGT_LANG,
                                        train_source_file=f".data/wmt14/train.tok.clean.bpe.32000.{SRC_LANG}",
                                        train_vocab_file=f".data/vocab/train_vocab_{SRC_LANG}.pkl")
    keyphrases = extractor.extract_keyphrases()
    num_changes = 0

    for i, sentence in enumerate(document.sentences):
        sentence, num_changes = retranslateSentenceWithId(i, sentence, scorer, keyphrases, num_changes)

    save_document(document, document_id)
    return jsonify({"numChanges": num_changes})
Example #4
def retranslateSentence(document_id, sentence_id, beam_size, att_layer):
    document = get_document(document_id)
    scorer = Scorer()
    extractor = DomainSpecificExtractor(source_file=document.filepath, src_lang=SRC_LANG, tgt_lang=TGT_LANG,
                                        train_source_file=f".data/wmt14/train.tok.clean.bpe.32000.{SRC_LANG}",
                                        train_vocab_file=f".data/vocab/train_vocab_{SRC_LANG}.pkl")
    keyphrases = extractor.extract_keyphrases()
    num_changes = 0

    retranslateSentenceWithId(sentence_id, document.sentences[int(sentence_id)], scorer, keyphrases,
                              num_changes, int(beam_size), int(att_layer), force=True)
    save_document(document, document_id)

    return jsonify({})
Example #5
def correctTranslation():
    data = request.get_json()
    translation = data["translation"]
    beam = data["beam"]
    document_unk_map = data["document_unk_map"]
    attention = data["attention"]
    document_id = data["document_id"]
    sentence_id = data["sentence_id"]

    document = get_document(document_id)

    extractor = DomainSpecificExtractor(source_file=document.filepath, src_lang=SRC_LANG, tgt_lang=TGT_LANG,
                                        train_source_file=f".data/wmt14/train.tok.clean.bpe.32000.{SRC_LANG}",
                                        train_vocab_file=f".data/vocab/train_vocab_{SRC_LANG}.pkl")
    keyphrases = extractor.extract_keyphrases()

    for key in document_unk_map:
        if key not in document.unk_map:
            document.unk_map[key] = document_unk_map[key]
        else:
            # Merge list values
            document.unk_map[key] = list(set(document.unk_map[key]) | set(document_unk_map[key]))

    sentence = document.sentences[int(sentence_id)]

    if translation != sentence.translation:
        sentence.diff = html_diff(sentence.translation[:-4].replace("@@ ", ""),
                                  translation[:-4].replace("@@ ", ""))
    sentence.translation = translation
    sentence.corrected = True
    sentence.flagged = False
    sentence.attention = attention
    sentence.beam = beam

    scorer = Scorer()
    score = scorer.compute_scores(sentence.source, sentence.translation, attention, keyphrases, "")
    score["order_id"] = sentence.score["order_id"]
    sentence.score = score

    document.sentences[int(sentence_id)] = sentence

    save_document(document, document_id)

    return jsonify({})
Example #6
    def run(self, src_lang, tgt_lang, dir, translationFile, scoresFile,
            attFile):
        loader = LanguagePairLoader(src_lang, tgt_lang, self.source_file,
                                    self.target_file)
        _, _, pairs = loader.load()

        loader2 = LanguagePairLoader(src_lang, tgt_lang, self.source_file2,
                                     self.target_file2)
        _, _, pairs2 = loader2.load()

        # concatenate both sets => all 1500 sentences
        pairs = pairs + pairs2

        self.pairs = pairs[:self.num_sentences]

        # Translate sources
        sources = [p[0] for p in self.pairs]
        targets = [p[1] for p in self.pairs]
        translations = []

        extractor = DomainSpecificExtractor(
            source_file=self.source_file,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            train_source_file=f".data/wmt14/train.tok.clean.bpe.32000.{src_lang}",
            train_vocab_file=f".data/vocab/train_vocab_{src_lang}.pkl")

        keyphrases = extractor.extract_keyphrases(n_results=100)

        self.translationList = []
        attentionList = []
        self.scoresList = []
        prefix = "_experiments/translated_beam3"

        if os.path.isfile(os.path.join(prefix, translationFile)) \
                and os.path.isfile(os.path.join(prefix, scoresFile)) \
                and os.path.isfile(os.path.join(prefix, attFile)):
            print("Translation reloaded")
            with open(os.path.join(prefix, translationFile), 'rb') as f:
                self.translationList = pickle.load(f)
            with open(os.path.join(prefix, attFile), 'rb') as f:
                attentionList = pickle.load(f)
            with open(os.path.join(prefix, scoresFile), 'rb') as f:
                self.scoresList = pickle.load(f)

        else:
            for i, pair in enumerate(self.pairs):
                if i % 10 == 0:
                    print("Translated {} of {}".format(i, len(self.pairs)))

                translation, attn, _ = self.model.translate(
                    pair[0], beam_size=self.beam_size)
                translations.append(" ".join(translation[:-1]))

                scores = self.scorer.compute_scores(pair[0],
                                                    " ".join(translation),
                                                    attn, keyphrases, "")

                self.translationList.append(translation)
                attentionList.append(attn)
                self.scoresList.append(scores)

            with open(os.path.join(dir, translationFile), "wb") as f:
                pickle.dump(self.translationList, f)
            with open(os.path.join(dir, scoresFile), "wb") as f:
                pickle.dump(self.scoresList, f)
            with open(os.path.join(dir, attFile), "wb") as f:
                pickle.dump(attentionList, f)

        for i, pair in enumerate(self.pairs):
            if i % 10 == 0:
                print("Processing {} of {}".format(i, len(self.pairs)))

            for metric in self.scoresList[i]:
                #if metric == "coverage_penalty" and self.scoresList[i][metric] > 45: # remove some outliers
                #    continue
                #if metric == "keyphrase_score" and self.scoresList[i][metric] == 0:
                #    continue

                if metric not in self.metric_to_cter:
                    self.metric_to_cter[metric] = {}
                if self.scoresList[i][metric] not in self.metric_to_cter[metric]:
                    self.metric_to_cter[metric][self.scoresList[i][metric]] = []

                cter = compute_cter(pair[1], " ".join(self.translationList[i][:-1]))
                self.all_cter_scores.append(cter)
                self.metric_to_cter[metric][self.scoresList[i][metric]].append(cter)
Example #7
    seq2seq_model = Seq2SeqModel(encoder, decoder, input_lang, output_lang)

    return seq2seq_model


def reload_model(seq2seq_model):
    checkpoint = torch.load(hp.checkpoint_name)
    encoder_state = checkpoint["encoder"]
    decoder_state = checkpoint["decoder"]

    seq2seq_model.encoder.load_state_dict(encoder_state)
    seq2seq_model.decoder.load_state_dict(decoder_state)


def keyphrase_score(sentence, keyphrases):
    score = 0

    for word in sentence.split(" "):
        for keyphrase, freq in keyphrases:
            score += word.lower().count(keyphrase.lower()) * freq
    return score
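

# Hypothetical usage sketch (not from the original examples): keyphrases are (phrase, frequency)
# tuples, matching what DomainSpecificExtractor.extract_keyphrases() returns; the sentence and
# counts below are made up for illustration.
sample_keyphrases = [("diagnose", 12), ("therapie", 7)]
print(keyphrase_score("die diagnose wurde nach der therapie bestätigt", sample_keyphrases))  # 19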


extractor = DomainSpecificExtractor(source_file="data/medical.tok.de",
                                    train_source_file=hp.source_file,
                                    train_vocab_file="train_vocab.pkl")

words = extractor.extract_keyphrases()

print(words)
Example #8
    def __init__(
        self,
        model,
        src_lang,
        tgt_lang,
        model_type,
        source_file,
        target_file,
        test_source_file,
        test_target_file,
        dir,
        evaluate_every=10,
        num_sentences=400,
        num_sentences_test=500,
        reuseCalculatedTranslations=False,
        reuseInitialTranslations=False,
        initialTranslationFile="",
        initialScoreFile="",
        initialTestTranslationFile="",
        translationFile="",
        batch_translate=True,
    ):

        self.model = model
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.model_type = model_type
        self.source_file = source_file
        self.target_file = target_file
        self.loader = LanguagePairLoader(src_lang, tgt_lang, source_file,
                                         target_file)
        self.test_loader = LanguagePairLoader(src_lang, tgt_lang,
                                              test_source_file,
                                              test_target_file)

        self.extractor = DomainSpecificExtractor(
            source_file=source_file,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            train_source_file=f".data/wmt14/train.tok.clean.bpe.32000.{src_lang}",
            train_vocab_file=f".data/vocab/train_vocab_{src_lang}.pkl")

        self.target_extractor = DomainSpecificExtractor(
            source_file=target_file,
            src_lang=tgt_lang,
            tgt_lang=src_lang,
            train_source_file=f".data/wmt14/train.tok.clean.bpe.32000.{tgt_lang}",
            train_vocab_file=f".data/vocab/train_vocab_{tgt_lang}.pkl")

        self.scorer = Scorer()
        self.scores = {}
        self.num_sentences = num_sentences
        self.num_sentences_test = num_sentences_test
        self.batch_translate = batch_translate
        self.evaluate_every = evaluate_every
        self.reuseCalculatedTranslations = reuseCalculatedTranslations
        self.reuseInitialTranslations = reuseInitialTranslations

        self.initialTranslationFile = initialTranslationFile
        self.initialScoreFile = initialScoreFile
        self.initialTestTranslationFile = initialTestTranslationFile
        self.translationFile = translationFile

        self.metric_bleu_scores = {}
        self.metric_gleu_scores = {}
        self.metric_precisions = {}
        self.metric_recalls = {}

        self.prefix = "_experiments/retrain_beam3"
        self.dir = dir
Example #9
def documentUpload():
    if 'file' not in request.files:
        return redirect(request.url)
    file = request.files['file']
    # if user does not select file, browser also
    # submit an empty part without filename
    if file.filename == '':
        return redirect(request.url)
    if file and allowed_file(file.filename):
        document_name = request.args.get("document_name")
        id = uuid4()
        filename = secure_filename(file.filename)
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        file.save(filepath)

        user = User.query.filter_by(username=get_jwt_identity()).first()
        dbDocument = DBDocument(id=id, name=document_name, user=user, model=model_name)

        document = Document(str(id), document_name, dict(), filepath)
        sentences = document.load_content(filename)
        sentences = list(filter(None, sentences))  # remove empty lines

        with open(filepath, "w", encoding='utf-8') as f:
            for i, sentence in enumerate(sentences):
                # Parenthesize the conditional so the last sentence is still written (just without a trailing newline)
                f.write(sentence.replace("@@ ", "") + ("\n" if i < len(sentences) - 1 else ""))

        extractor = DomainSpecificExtractor(source_file=filepath, src_lang=SRC_LANG, tgt_lang=TGT_LANG,
                                            train_source_file=f".data/wmt14/train.tok.clean.bpe.32000.{SRC_LANG}",
                                            train_vocab_file=f".data/vocab/train_vocab_{SRC_LANG}.pkl")
        keyphrases = extractor.extract_keyphrases(n_results=30)

        scorer = Scorer()

        print("Translating {} sentences".format(len(sentences)))

        beamSize = 3
        attLayer = -2
        for i, source in enumerate(sentences):
            translation, attn, translations = model.translate(source, beam_size=beamSize, attLayer=attLayer,
                                                               beam_length=0.6, beam_coverage=0.4)
            print("Translated {} of {}".format(i + 1, len(sentences)))

            beam = translationsToTree(translations[:beamSize])

            # print("  ", translation)
            score = scorer.compute_scores(source, " ".join(translation), attn, keyphrases, "")
            score["order_id"] = i
            sentence = Sentence(i, source, " ".join(translation), attn, beam, score)

            document.sentences.append(sentence)

        print("Finished translation")

        keyphrases = [{"name": k, "occurrences": f, "active": False} for (k, f) in keyphrases]
        document.keyphrases = keyphrases
        db.session.add(dbDocument)
        db.session.commit()

        save_document(document, id)

        return jsonify({})
    return jsonify({})