# cli
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "dutch_sentences_pkl_file",
        help="Reduced pkl file of dutch sentences(to train on).")
    parser.add_argument(
        "english_sentences_pkl_file",
        help="Reduced pkl file of english sentences(to train on).")
    parser.add_argument(
        "trans_table",
        help=
        "Translation probabilty table obtained after training IBM Model 1(previously translation_probabilities_table.pkl)"
    )
    args = parser.parse_args()

    dutch_sentences = unpickle(args.dutch_sentences_pkl_file)
    english_sentences = unpickle(args.english_sentences_pkl_file)
    translation_table_prev = unpickle(args.trans_table)
    # dutch_sentences = dutch_sentences[:5]
    # english_sentences = english_sentences[:5]

    # alignment training model:
    final_alignment_prob, final_translation_prob = train2(
        dutch_sentences, english_sentences, translation_table_prev)
    # dumping these results into pkl files
    with open('final_alignment_prob.pkl', 'wb') as f:
        pickle.dump(final_alignment_prob, f)
    with open('final_translation_prob.pkl', 'wb') as f:
        pickle.dump(final_translation_prob, f)
Beispiel #2
0
 raw_sentences = get_sentences_from_document(args.document)
 normalized_sentences = clean_sentences(raw_sentences, keep_numbers=True)
 printv("Done.", verbose)
 if args.matrix:
     printv("Loading translations probability matrix... ", verbose, end="")
     translation_matrix = TranslationMatrix.thaw(
         args.translation_probabilities_table)
     printv("Done.", verbose)
     printv("Translating sentences... ", verbose, end="")
     translated_sentences = translate_from_matrix(normalized_sentences,
                                                  translation_matrix,
                                                  args.augment)
     printv("Done.", verbose)
 else:
     printv("Loading translations probability table... ", verbose, end="")
     unpickled_data = unpickle(args.translation_probabilities_table)
     translation_table = unpickled_data["data"]
     printv("Done.", verbose)
     printv("Translating sentences... ", verbose, end="")
     translated_sentences = [
         translate_from_table(sentence, translation_table, args.augment)
         for sentence in normalized_sentences
     ]
     printv("Done.", verbose)
 printv("Writing results to file... ", verbose, end="")
 if args.output:
     with open(args.output, "w+") as f:
         for sentence in translated_sentences:
             f.write(sentence + ".\n")
 else:
     for sentence in translated_sentences:
Beispiel #3
0
def train_table(dutch_sentences: List[str],
                english_sentences: List[str],
                max_iterations: int,
                convergence_factor: float,
                output_filename: str,
                resume_from_file: str,
                write_back_epoach: bool = False,
                verbose: bool = False) -> None:
    """Train an IBM Model 1 translation probabilities table with EM.

    The table is a dict of dicts indexed as ``table[dutch_word][english_word]``.
    Each iteration collects fractional counts over every sentence pair
    (expectation) and then renormalises each Dutch word's row by the total
    count that word collected (maximisation). Iteration continues until
    ``converged(...)`` returns a truthy value.

    Args:
        dutch_sentences: Dutch sentences; each is tokenised with
            ``str.split()``. Paired positionally with ``english_sentences``
            via ``zip`` (surplus sentences in the longer list are ignored).
        english_sentences: English sentences, tokenised the same way.
        max_iterations: Forwarded to ``converged``; the table is also written
            back on the iteration whose number equals this value.
        convergence_factor: Forwarded to ``converged``.
        output_filename: Forwarded to ``write_back_data`` as the destination.
        resume_from_file: If truthy, path of a pickle holding a mapping with
            the keys ``"iteration"`` and ``"data"`` from which training is
            resumed; otherwise a uniform table is created.
        write_back_epoach: When true, write the table back after every
            iteration instead of only on iteration ``max_iterations``.
        verbose: Forwarded to ``printv`` for the progress messages.
    """
    printv("Determining the vocabularies... ", verbose, end="")
    english_vocab = get_vocab(english_sentences)
    dutch_vocab = get_vocab(dutch_sentences)
    printv("Done.", verbose)

    if resume_from_file:
        printv("Reloading the translation probabilities table... ",
               verbose,
               end="")
        reloaded_data = unpickle(resume_from_file)
        iteration = reloaded_data["iteration"]
        translation_table = reloaded_data["data"]
        printv("Done.", verbose)
    else:
        printv("Intializing the translation probabilities table... ",
               verbose,
               end="")
        iteration = 0
        # Uniform start: every English word is equally likely for every
        # Dutch word.
        initial_probability = 1 / (len(english_vocab))
        translation_table = {
            f: {e: initial_probability
                for e in english_vocab}
            for f in dutch_vocab
        }
        printv("Done.", verbose)

    counts = {}  # type: TranslationTable
    printv("Beginning the Expectation-Maximization algorithm.", verbose)
    while not converged(dutch_vocab, english_vocab, translation_table, counts,
                        convergence_factor, max_iterations, iteration,
                        verbose):
        start_time = time.time()
        iteration += 1

        printv("Intializing the counts and totals... ", verbose, end="")
        counts = {f: {e: 0.0 for e in english_vocab} for f in dutch_vocab}
        totals = {f: 0.0 for f in dutch_vocab}
        printv("Done.", verbose)

        printv("Calculating probabilities and collecting counts... ",
               verbose,
               end="")
        for english_sentence, dutch_sentence in zip(english_sentences,
                                                    dutch_sentences):
            # Tokenise each sentence once per pair instead of re-splitting
            # inside the nested loops.
            english_words = english_sentence.split()
            dutch_words = dutch_sentence.split()

            # Per-sentence normaliser for each English word.
            # NOTE(review): keyed by word, so an English word repeated in
            # one sentence adds to its normaliser once per occurrence;
            # behaviour kept as in the original implementation.
            subtotals = defaultdict(float)  # type: Dict[str, float]
            for english_word in english_words:
                for dutch_word in dutch_words:
                    subtotals[english_word] += translation_table[dutch_word][
                        english_word]
            for english_word in english_words:
                subtotal = subtotals[english_word]
                for dutch_word in dutch_words:
                    amount = translation_table[dutch_word][
                        english_word] / subtotal
                    counts[dutch_word][english_word] += amount
                    totals[dutch_word] += amount
        printv("Done.", verbose)

        printv("Updating translations probabilities table... ",
               verbose,
               end="")
        for dutch_word in dutch_vocab:
            total = totals[dutch_word]
            if not total:
                # This word collected no counts (e.g. its sentence was paired
                # with an empty English sentence or dropped by zip). Keep its
                # previous row rather than dividing by zero.
                continue
            row = translation_table[dutch_word]
            word_counts = counts[dutch_word]
            for english_word in english_vocab:
                row[english_word] = word_counts[english_word] / total
        printv("Done.", verbose)

        # Drop the large per-iteration structures before the next pass; the
        # following convergence check therefore receives an empty ``counts``.
        counts = {}
        totals = {}
        gc.collect()
        end_time = time.time()
        print("Completed iteration {} in {} seconds".format(
            iteration, end_time - start_time))
        if write_back_epoach or (iteration == max_iterations):
            write_back_data({
                "iteration": iteration,
                "data": translation_table
            }, output_filename, verbose)
Beispiel #4
0
 def thaw(cls, filename) -> "TranslationMatrix":
     """Build an instance from the mapping that ``unpickle(filename)`` returns.

     The loaded mapping must provide the keys ``"dutch_vocab"``,
     ``"english_vocab"`` and ``"matrix"``; each is passed to the constructor
     as the keyword argument of the same name.
     """
     payload = unpickle(filename)
     dutch_vocab = payload["dutch_vocab"]
     english_vocab = payload["english_vocab"]
     matrix = payload["matrix"]
     return cls(dutch_vocab=dutch_vocab,
                english_vocab=english_vocab,
                matrix=matrix)
Beispiel #5
0
        "-x",
        "--matrix",
        help=
        "Generate a translation probabilities matrix (list of lists) instead of a table (dict of dicts)",
        action="store_true")
    args = parser.parse_args()

    training_set = int(args.percentage)
    if training_set not in [1, 3, 5, 10]:
        raise ValueError(
            "Invaild percentage value. Valid values: [1, 3, 5, 10].")

    printv("Beginning training with the {}% dataset.".format(training_set),
           args.verbose)
    dutch_sentences = unpickle(
        "datasets/training/dutch/dutch_{}p_5t.reduced.pkl".format(
            training_set))
    english_sentences = unpickle(
        "datasets/training/english/english_{}p_5t.reduced.pkl".format(
            training_set))

    if args.invert:
        english_sentences, dutch_sentences = dutch_sentences, english_sentences

    if args.matrix:
        if args.output == "translation_probabilities_table":
            args.output = "translation_probabilities_matrix"
        train_matrix(dutch_sentences, english_sentences,
                     int(args.max_iterations), args.output,
                     args.resume_from_file, args.write_back_epoach,
                     args.verbose)
Beispiel #6
0
from typing import List, Dict, Set, Any
import collections
from collections import defaultdict
import time
import gc
import pickle
from project.tools.unpickle import unpickle
from project.core.train import get_vocab, converged, train_table, printv, write_back_data
from project.core.translate import translate

# Previously trained probability tables, loaded when the module is imported.
alignment_prob = unpickle("final_alignment_prob.pkl")
translation_prob = unpickle("final_translation_prob.pkl")
# The table pickle is a mapping; the table itself sits under "data".
translation_table_prev = unpickle(
    "translation_probabilities_table.pkl")["data"]

# Only the first five sentences of each reduced training pickle are kept.
dutch_sentences = unpickle(
    "datasets/training/dutch/dutch_1p_5t.reduced.pkl")[:5]
english_sentences = unpickle(
    "datasets/training/english/english_1p_5t.reduced.pkl")[:5]


def handle_alignment(translation_prob, alignment_prob, english_sentence,
                     dutch_sentence):
    translation_ans = defaultdict(float)
    l_e = len(english_sentence)
    l_f = len(dutch_sentence)
    final_english_sentence = dict()
    for (j, e) in enumerate(english_sentence.split(), 1):
        cur_max = (0, -1)
        for (i, f) in enumerate(dutch_sentence.split(), 1):
            print(translation_prob[(e, f)])