Example #1
def main(e_path, f_path):
    # Corpus, get_ibm1, EM, map_decoder and print_map are defined elsewhere
    # in the project; their imports are omitted in this excerpt.
    from functools import partial  # binds the corpora to the print callback

    # wrap the two halves of the parallel corpus; the conditioning side
    # gets an explicit <null> token
    e_corpus = Corpus(open(e_path), null='<null>')
    f_corpus = Corpus(open(f_path))

    # initialise IBM model 1 over the two corpora
    model = get_ibm1(e_corpus, f_corpus)

    # train the model with 10 iterations of EM
    EM(e_corpus, f_corpus, model, iterations=10)

    # decode and print the MAP (most probable) alignments
    map_decoder(e_corpus, f_corpus, model,
                partial(print_map, e_corpus=e_corpus, f_corpus=f_corpus))
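This example runs the whole pipeline: each side of a sentence-aligned corpus pair is wrapped in a Corpus (the conditioning side with a <null> token), IBM model 1 is initialised, trained with 10 iterations of EM, and the MAP alignments are printed. A minimal way to invoke it, with hypothetical file names (one sentence per line, line i of the two files being translations of each other):

if __name__ == '__main__':
    # 'training.e' and 'training.f' are hypothetical paths to the two
    # halves of a sentence-aligned parallel corpus
    main('training.e', 'training.f')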
Example #2
def main(e_path, f_path):
    # Corpus, get_ibm1, EM and map_decoder are defined elsewhere in the
    # project; their imports are omitted in this excerpt.
    import sys
    from functools import partial

    e_corpus = Corpus(open(e_path), null='<null>')
    f_corpus = Corpus(open(f_path))

    model = get_ibm1(e_corpus, f_corpus)

    # train the model with 10 iterations of EM
    EM(e_corpus, f_corpus, model, iterations=10)

    # print the MAP alignments in lola's format to standard output
    from lola.io import print_lola_format
    map_decoder(e_corpus, f_corpus, model,
                partial(print_lola_format,
                        e_corpus=e_corpus,
                        f_corpus=f_corpus,
                        ostream=sys.stdout))
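The only difference from Example #1 is the printing callback: print_lola_format takes an explicit ostream, so the alignments need not go to standard output. As a sketch, the final map_decoder call inside main could write to a file instead (hypothetical path):

    with open('alignments.out', 'w') as ostream:
        map_decoder(e_corpus, f_corpus, model,
                    partial(print_lola_format,
                            e_corpus=e_corpus,
                            f_corpus=f_corpus,
                            ostream=ostream))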
Example #3
def read_corpora(training_path: str, test_path: str, generating: bool,
                 min_count: int, max_count: int) -> (CorpusView, CorpusView):
    """
    Return training and test data.

    :param training_path: path to training corpus
    :param test_path: path to test corpus (or None)
    :param generating: whether this is the side we are generating (French)
    :param min_count: minimum frequency for word to be retained in the vocabulary
    :param max_count: maximum frequency for word to be retained in the vocabulary
    :return: training view and test view (the latter is None when test_path is None)
    """
    if test_path is None:  # no test corpus
        if generating:
            corpus = Corpus(training_path,
                            min_count=min_count,
                            max_count=max_count)
        else:  # we are conditioning on this corpus
            corpus = Corpus(training_path,
                            null='<NULL>',
                            min_count=min_count,
                            max_count=max_count)
        return corpus, None
    else:
        # read training data
        with open(training_path, 'r') as fi:
            lines = fi.readlines()
        n_training = len(lines)
        # read test data
        with open(test_path, 'r') as fi:
            lines.extend(fi.readlines())
        n_test = len(lines) - n_training
        # create a big corpus with everything
        if generating:
            corpus = Corpus(lines, min_count=min_count, max_count=max_count)
        else:  # we are conditioning on this corpus
            corpus = Corpus(lines,
                            null='<NULL>',
                            min_count=min_count,
                            max_count=max_count)
        # return two different views: the training view and the test view
        return (CorpusView(corpus, 0, n_training),
                CorpusView(corpus, n_training, n_test))
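read_corpora builds one Corpus over the concatenated training and test lines so that both splits share a single vocabulary, then returns two non-overlapping CorpusView slices (training first, test after it). A minimal usage sketch, with hypothetical paths and illustrative frequency cut-offs:

# French side: the side we are generating
f_training, f_test = read_corpora('training.f', 'test.f',
                                  generating=True,
                                  min_count=1, max_count=1000)

# English side: the side we condition on (gets the <NULL> token)
e_training, e_test = read_corpora('training.e', 'test.e',
                                  generating=False,
                                  min_count=1, max_count=1000)

# without a test corpus the second element of the pair is None
f_training_only, _ = read_corpora('training.f', None,
                                  generating=True,
                                  min_count=1, max_count=1000)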