def main(e_path, f_path):
    """
    Build both corpora, fit a model with 10 EM iterations and decode.

    :param e_path: path to the file wrapped in the corpus that is given the
        '<null>' token
    :param f_path: path to the file wrapped in the other corpus
    """
    # Keep both files open until decoding has finished: Corpus receives the
    # open stream, and it is not visible from here whether it reads eagerly.
    # The context manager guarantees the handles are released even on error.
    with open(e_path) as e_stream, open(f_path) as f_stream:
        e_corpus = Corpus(e_stream, null='<null>')
        f_corpus = Corpus(f_stream)

        model = get_ibm1(e_corpus, f_corpus)
        EM(e_corpus, f_corpus, model, iterations=10)

        # print_map is pre-bound to both corpora and handed to the decoder as
        # its callback (presumably called once per decoded item -- confirm
        # against map_decoder).
        callback = partial(print_map, e_corpus=e_corpus, f_corpus=f_corpus)
        map_decoder(e_corpus, f_corpus, model, callback)
def main(e_path, f_path):
    """
    Build both corpora, fit a model with 10 EM iterations and decode,
    writing the output through ``print_lola_format`` to standard output.

    :param e_path: path to the file wrapped in the corpus that is given the
        '<null>' token
    :param f_path: path to the file wrapped in the other corpus
    """
    # Imported up front so that a missing module fails before training
    # instead of after the EM iterations have already run.
    from lola.io import print_lola_format

    # Keep both files open until decoding has finished: Corpus receives the
    # open stream, and it is not visible from here whether it reads eagerly.
    # The context manager guarantees the handles are released even on error.
    with open(e_path) as e_stream, open(f_path) as f_stream:
        e_corpus = Corpus(e_stream, null='<null>')
        f_corpus = Corpus(f_stream)

        model = get_ibm1(e_corpus, f_corpus)
        EM(e_corpus, f_corpus, model, iterations=10)

        # The printer is pre-bound to both corpora and to stdout, then handed
        # to the decoder as its callback (presumably called once per decoded
        # item -- confirm against map_decoder).
        callback = partial(print_lola_format,
                           e_corpus=e_corpus,
                           f_corpus=f_corpus,
                           ostream=sys.stdout)
        map_decoder(e_corpus, f_corpus, model, callback)
def read_corpora(training_path: str, test_path: str, generating: bool, min_count: int, max_count: int) -> (CorpusView, CorpusView):
    """
    Load the training data and, when given, the test data.

    Without a test path, a single Corpus is built directly from the training
    path and returned together with None. With a test path, the lines of both
    files are concatenated (training first) into one Corpus, and two views
    over it are returned.

    :param training_path: path to training corpus
    :param test_path: path to test corpus (or None)
    :param generating: whether this is the side we are generating (French);
        when False the corpus is the one we condition on and is built with
        the '<NULL>' token
    :param min_count: minimum frequency for word to be retained in the vocabulary
    :param max_count: maximum frequency for word to be retained in the vocabulary
    :return: (corpus, None) if there is no test corpus, otherwise
        (training view, test view)
    """
    # Only the conditioning side gets the NULL token.
    corpus_options = {} if generating else {'null': '<NULL>'}
    corpus_options.update(min_count=min_count, max_count=max_count)

    # No test corpus: hand the path straight to Corpus.
    if test_path is None:
        return Corpus(training_path, **corpus_options), None

    with open(training_path, 'r') as training_stream:
        training_lines = training_stream.readlines()
    with open(test_path, 'r') as test_stream:
        test_lines = test_stream.readlines()

    n_training = len(training_lines)
    n_test = len(test_lines)

    # One big corpus holding everything, exposed through two separate views.
    corpus = Corpus(training_lines + test_lines, **corpus_options)
    training_view = CorpusView(corpus, 0, n_training)
    test_view = CorpusView(corpus, n_training, n_test)
    return training_view, test_view