def evaluate(parallel_corpus, model, N=100):
    """
    Return statistics for N document-alignment trials.

    The trial documents are generated from the parallel corpus.

    - `parallel_corpus`: A file object
    - `model`: A YalignModel
    - `N`: Number of trials
    """
    trial_scores = []
    # Count trials from 1 so the stop test reads as "done N trials".
    for count, docs in enumerate(generate_documents(parallel_corpus), start=1):
        A, B, gold_alignments = training_scrambling_from_documents(*docs)
        predicted = model.align_indexes(A, B)
        trial_scores.append(F_score(predicted, gold_alignments))
        if count >= N:
            break
    return _stats(trial_scores)
def basic_model(corpus_filepath, word_scores_filepath,
                lang_a=None, lang_b=None):
    """
    Create and train a `YalignModel` with the basic configuration and
    default values.

    `corpus_filepath` is the path to a parallel corpus used for
    training; it can be:
        - a csv file with two sentences and alignment information, or
        - a tmx file with correct alignments (a regular parallel corpus), or
        - a text file with interleaved sentences (one line in language A,
          the next in language B)

    `word_scores_filepath` is the path to a csv file (possibly gzipped)
    with word dictionary data (for ex. "house,casa,0.91").

    `lang_a` and `lang_b` are required by the tokenizer only in the tmx
    case; for the other formats the words are assumed to be tokenized
    already.
    """
    # Word-level scores drive the sentence-pair classifier below.
    word_pair_score = WordPairScore(word_scores_filepath)

    # Load the corpus as two documents, dispatching on file extension.
    if corpus_filepath.endswith(".tmx"):
        A, B = tmx_file_to_documents(corpus_filepath, lang_a, lang_b)
    else:
        A, B = parallel_corpus_to_documents(corpus_filepath)

    # Train the sentence-pair scorer on alignments derived from the corpus.
    sentence_pair_score = SentencePairScore()
    sentence_pair_score.train(training_alignments_from_documents(A, B),
                              word_pair_score)

    # Assemble the model with initial tuning values, then optimize the
    # gap penalty and threshold on a scrambled training sample.
    aligner = SequenceAligner(sentence_pair_score, 0.49)
    model = YalignModel(aligner, 1.0,
                        metadata={"lang_a": lang_a, "lang_b": lang_b})
    sample_a, sample_b, gold = training_scrambling_from_documents(
        A[:OPTIMIZE_SAMPLE_SET_SIZE], B[:OPTIMIZE_SAMPLE_SET_SIZE])
    model.optimize_gap_penalty_and_threshold(sample_a, sample_b, gold)
    return model
def setUp(self):
    """Build a small trained YalignModel fixture from 25 sentence pairs.

    Exposes on `self`: `alignments`, `A`, `B`, `correct_alignments`,
    `min_`, `max_`, and `model`.
    """
    # BUG FIX: the original seeded with hash("Y U NO?"), but str hashes
    # are randomized per interpreter run (PEP 456 / PYTHONHASHSEED), so
    # the scrambling was NOT reproducible across runs.  random.seed()
    # hashes a str argument deterministically on its own, so pass the
    # string directly.
    random.seed("Y U NO?")
    base_path = os.path.dirname(os.path.abspath(__file__))
    word_scores = os.path.join(base_path, "data", "test_word_scores_big.csv")
    parallel_corpus = os.path.join(base_path, "data", "parallel-en-es.txt")
    # Keep the fixture small so training stays fast.
    A, B = parallel_corpus_to_documents(parallel_corpus)
    A = A[:25]
    B = B[:25]
    self.alignments = list(training_alignments_from_documents(A, B))
    self.A, self.B, self.correct_alignments = \
        list(training_scrambling_from_documents(A, B))
    # Word score
    word_pair_score = WordPairScore(word_scores)
    # Sentence Score
    sentence_pair_score = SentencePairScore()
    sentence_pair_score.train(self.alignments, word_pair_score)
    # Yalign model: gap penalty starts at the midpoint of the score bounds.
    self.min_ = sentence_pair_score.min_bound
    self.max_ = sentence_pair_score.max_bound
    gap_penalty = (self.min_ + self.max_) / 2.0
    document_aligner = SequenceAligner(sentence_pair_score, gap_penalty)
    self.model = YalignModel(document_aligner, 1)
def basic_model(corpus_filepath, word_scores_filepath, lang_a=None, lang_b=None): """ Creates and trains a `YalignModel` with the basic configuration and default values. `corpus_filepath` is the path to a parallel corpus used for training, it can be: - a csv file with two sentences and alignement information, or - a tmx file with correct alignments (a regular parallel corpus), or - a text file with interleaved sentences (one line in language A, the next in language B) `word_scores_filepath` is the path to a csv file (possibly gzipped) with word dictionary data. (for ex. "house,casa,0.91"). `lang_a` and `lang_b` are requiered for the tokenizer in the case of a tmx file. In the other cases is not necesary because it's assumed that the words are already tokenized. """ # Word score word_pair_score = WordPairScore(word_scores_filepath) if corpus_filepath.endswith(".tmx"): A, B = tmx_file_to_documents(corpus_filepath, lang_a, lang_b) else: A, B = parallel_corpus_to_documents(corpus_filepath) alignments = training_alignments_from_documents(A, B) sentence_pair_score = SentencePairScore() sentence_pair_score.train(alignments, word_pair_score) # Yalign model metadata = {"lang_a": lang_a, "lang_b": lang_b} gap_penalty = 0.49 threshold = 1.0 document_aligner = SequenceAligner(sentence_pair_score, gap_penalty) model = YalignModel(document_aligner, threshold, metadata=metadata) A, B, correct = training_scrambling_from_documents(A[:OPTIMIZE_SAMPLE_SET_SIZE], B[:OPTIMIZE_SAMPLE_SET_SIZE]) model.optimize_gap_penalty_and_threshold(A, B, correct) return model
def setUp(self):
    """Prepare a compact, trained YalignModel fixture (25 sentence pairs).

    Sets `self.alignments`, `self.A`, `self.B`, `self.correct_alignments`,
    `self.min_`, `self.max_` and `self.model` for the tests.
    """
    # BUG FIX: seeding with hash("Y U NO?") is non-deterministic across
    # interpreter runs because str hashing is randomized (PEP 456) unless
    # PYTHONHASHSEED is pinned.  Passing the string straight to
    # random.seed() yields a stable, reproducible seed.
    random.seed("Y U NO?")
    here = os.path.dirname(os.path.abspath(__file__))
    word_scores = os.path.join(here, "data", "test_word_scores_big.csv")
    corpus_path = os.path.join(here, "data", "parallel-en-es.txt")
    # Truncate to 25 sentences per side to keep the fixture cheap.
    docs_a, docs_b = parallel_corpus_to_documents(corpus_path)
    docs_a, docs_b = docs_a[:25], docs_b[:25]
    self.alignments = list(training_alignments_from_documents(docs_a, docs_b))
    self.A, self.B, self.correct_alignments = \
        list(training_scrambling_from_documents(docs_a, docs_b))
    # Train the sentence scorer from the word-level dictionary scores.
    sentence_pair_score = SentencePairScore()
    sentence_pair_score.train(self.alignments, WordPairScore(word_scores))
    # Gap penalty: midpoint of the trained scorer's score bounds.
    self.min_ = sentence_pair_score.min_bound
    self.max_ = sentence_pair_score.max_bound
    gap_penalty = (self.min_ + self.max_) / 2.0
    aligner = SequenceAligner(sentence_pair_score, gap_penalty)
    self.model = YalignModel(aligner, 1)