def default_sentence_pair_score():
    """Train a default ``SentencePairScore`` on the bundled test data.

    Returns:
        A tuple ``(classifier, classifier_filepath)``; the trained
        classifier is also saved to ``classifier_filepath``, a freshly
        created temporary file.
    """
    base_path = os.path.dirname(os.path.abspath(__file__))
    word_scores = os.path.join(base_path, "data", "test_word_scores.csv")
    # mkstemp returns an OPEN file descriptor as its first element; the
    # original discarded it, leaking one fd per call.  Close it -- only
    # the path is needed here.
    fd, classifier_filepath = tempfile.mkstemp()
    os.close(fd)
    training_file = os.path.join(base_path, "data", "test_training.csv")
    pairs = parse_training_file(training_file)
    classifier = SentencePairScore()
    classifier.train(pairs, WordPairScore(word_scores))
    classifier.save(classifier_filepath)
    return classifier, classifier_filepath
def test_correlation_values(self):
    """Every per-feature correlation value must lie within [-1, 1]."""
    base_path = os.path.dirname(os.path.abspath(__file__))
    parallel_corpus = os.path.join(base_path, "data", "parallel-en-es.txt")
    word_scores = os.path.join(base_path, "data", "test_word_scores_big.csv")
    A, B = parallel_corpus_to_documents(parallel_corpus)
    self.alignments = list(training_alignments_from_documents(A, B))
    # Word score
    word_pair_score = WordPairScore(word_scores)
    # Sentence Score
    sentence_pair_score = SentencePairScore()
    sentence_pair_score.train(self.alignments, word_pair_score)
    cor = correlation(sentence_pair_score.classifier)
    # `iteritems` is Python-2-only; `items` works on both dicts and
    # pandas objects.  `value is not numpy.nan` was an *identity* check,
    # which misses NaNs produced by computation (they are distinct
    # objects from the numpy.nan singleton) -- use numpy.isnan instead.
    for attr, value in cor.items():
        if not numpy.isnan(value):
            self.assertTrue(-1 <= value <= 1)
class TestSentencePairScore(unittest.TestCase):
    """Exercises ``SentencePairScore`` trained on the bundled en-es corpus."""

    def setUp(self):
        base_path = os.path.dirname(os.path.abspath(__file__))
        corpus_path = os.path.join(base_path, "data", "parallel-en-es.txt")
        scores_path = os.path.join(base_path, "data", "test_word_scores_big.csv")
        docs_a, docs_b = parallel_corpus_to_documents(corpus_path)
        self.alignments = list(
            training_alignments_from_documents(docs_a, docs_b))
        self.score = SentencePairScore()
        self.score.train(self.alignments, WordPairScore(scores_path))

    def test_generates_numbers(self):
        # Both a dictionary hit and completely unknown words must still
        # produce a numeric score.
        cases = ((u"house you", u"casa usted"),
                 (u"Valar Morghulis", u"Dracarys"))
        for text_a, text_b in cases:
            result = self.score(Sentence(text_a.split()),
                                Sentence(text_b.split()))
            self.assertIsInstance(result, (int, float))

    def test_score_order(self):
        # A correct translation pair should score lower than a mismatch.
        good_pair = self.score(
            Sentence(u"Call History .".split()),
            Sentence(u"Historial de llamadas .".split()))
        bad_pair = self.score(
            Sentence(u"Replace the cover .".split()),
            Sentence(u"Vuelva a ingresar un nuevo código de bloqueo .".split()))
        self.assertLess(good_pair, bad_pair)

    def test_score_in_bounds(self):
        lower, upper = self.score.min_bound, self.score.max_bound
        for pair in self.alignments:
            value = self.score(*pair)
            self.assertGreaterEqual(value, lower)
            self.assertLessEqual(value, upper)

    def test_number_of_word_pair_scores_better_than_all_mismatchs(self):
        count_scores = self.score.problem.number_of_word_pair_scores
        with_match = count_scores(SentencePair(
            Sentence(u"house µa µb µc µd".split()),
            Sentence(u"casa µ1 µ2 µ3 µ4".split())))
        without_match = count_scores(SentencePair(
            Sentence(u"µx µa µb µc µd".split()),
            Sentence(u"µ5 µ1 µ2 µ3 µ4".split())))
        self.assertGreater(with_match, without_match)
def setUp(self):
    """Build a small ``YalignModel`` trained on the first 30 sentences."""
    scores_csv = os.path.join(data_path, "test_word_scores_big.csv")
    self.parallel_corpus = os.path.join(data_path, "parallel-en-es.txt")
    # Documents, trimmed to keep training fast
    docs = parallel_corpus_to_documents(self.parallel_corpus)
    self.document_a = docs[0][:30]
    self.document_b = docs[1][:30]
    training_pairs = training_alignments_from_documents(self.document_a,
                                                        self.document_b)
    # Word score
    dictionary_score = WordPairScore(scores_csv)
    # Sentence score trained from the word-level dictionary scores
    scorer = SentencePairScore()
    scorer.train(training_pairs, dictionary_score)
    # Yalign model
    self.model = YalignModel(SequenceAligner(scorer, 0.49))
def basic_model(corpus_filepath, word_scores_filepath, lang_a=None, lang_b=None):
    """Create and train a `YalignModel` with default settings.

    `corpus_filepath` is the path to a parallel corpus used for
    training; it may be:
        - a csv file with two sentences and alignment information,
        - a tmx file with correct alignments (a regular parallel corpus),
        - a text file with interleaved sentences (one line in language A,
          the next in language B).

    `word_scores_filepath` is the path to a csv file (possibly gzipped)
    with word dictionary data, e.g. "house,casa,0.91".

    `lang_a` and `lang_b` are only required by the tokenizer when a tmx
    file is given; the other formats are assumed to be pre-tokenized.
    """
    # Word score
    word_pair_score = WordPairScore(word_scores_filepath)

    # Load the training documents according to the corpus format.
    if not corpus_filepath.endswith(".tmx"):
        A, B = parallel_corpus_to_documents(corpus_filepath)
    else:
        A, B = tmx_file_to_documents(corpus_filepath, lang_a, lang_b)

    # Sentence score
    sentence_pair_score = SentencePairScore()
    sentence_pair_score.train(training_alignments_from_documents(A, B),
                              word_pair_score)

    # Assemble the model, then tune gap penalty and threshold on a
    # scrambled sample of the training documents.
    aligner = SequenceAligner(sentence_pair_score, 0.49)
    model = YalignModel(aligner, 1.0,
                        metadata={"lang_a": lang_a, "lang_b": lang_b})
    sample_a, sample_b, correct = training_scrambling_from_documents(
        A[:OPTIMIZE_SAMPLE_SET_SIZE], B[:OPTIMIZE_SAMPLE_SET_SIZE])
    model.optimize_gap_penalty_and_threshold(sample_a, sample_b, correct)
    return model
def setUp(self):
    """Train a small model on 25 sentence pairs for alignment tests."""
    # `hash()` of a str is randomized per process under Python 3
    # (PYTHONHASHSEED), so `random.seed(hash("Y U NO?"))` gave a
    # different seed on every run, defeating the purpose of seeding.
    # `random.seed` accepts the string directly and is reproducible.
    random.seed("Y U NO?")
    base_path = os.path.dirname(os.path.abspath(__file__))
    word_scores = os.path.join(base_path, "data", "test_word_scores_big.csv")
    parallel_corpus = os.path.join(base_path, "data", "parallel-en-es.txt")
    A, B = parallel_corpus_to_documents(parallel_corpus)
    A = A[:25]
    B = B[:25]
    self.alignments = list(training_alignments_from_documents(A, B))
    # training_scrambling_from_documents yields exactly three items
    # (unpacked directly elsewhere in this file); the intermediate
    # list() added nothing.
    self.A, self.B, self.correct_alignments = \
        training_scrambling_from_documents(A, B)
    # Word score
    word_pair_score = WordPairScore(word_scores)
    # Sentence Score
    sentence_pair_score = SentencePairScore()
    sentence_pair_score.train(self.alignments, word_pair_score)
    # Yalign model: gap penalty at the midpoint of the score bounds
    self.min_ = sentence_pair_score.min_bound
    self.max_ = sentence_pair_score.max_bound
    gap_penalty = (self.min_ + self.max_) / 2.0
    document_aligner = SequenceAligner(sentence_pair_score, gap_penalty)
    self.model = YalignModel(document_aligner, 1)
def basic_model(corpus_filepath, word_scores_filepath, lang_a=None, lang_b=None):
    """Build and train a `YalignModel` using the basic configuration.

    `corpus_filepath`: path to the training parallel corpus; one of:
        - a csv file with two sentences and alignment information,
        - a tmx file with correct alignments (a regular parallel corpus),
        - a text file with interleaved sentences (one line in language A,
          the next in language B).

    `word_scores_filepath`: path to a csv file (possibly gzipped) with
    word dictionary data, e.g. "house,casa,0.91".

    `lang_a`/`lang_b`: required by the tokenizer only for tmx input;
    other formats are assumed already tokenized.
    """
    # Word score
    word_pair_score = WordPairScore(word_scores_filepath)

    # Pick the document reader that matches the corpus format.
    if corpus_filepath.endswith(".tmx"):
        A, B = tmx_file_to_documents(corpus_filepath, lang_a, lang_b)
    else:
        A, B = parallel_corpus_to_documents(corpus_filepath)
    alignments = training_alignments_from_documents(A, B)

    # Sentence score
    sentence_pair_score = SentencePairScore()
    sentence_pair_score.train(alignments, word_pair_score)

    # Yalign model with initial gap penalty / threshold, then optimized
    # over a scrambled sample of the training documents.
    metadata = {"lang_a": lang_a, "lang_b": lang_b}
    document_aligner = SequenceAligner(sentence_pair_score, 0.49)
    model = YalignModel(document_aligner, 1.0, metadata=metadata)
    sample = training_scrambling_from_documents(
        A[:OPTIMIZE_SAMPLE_SET_SIZE], B[:OPTIMIZE_SAMPLE_SET_SIZE])
    scrambled_a, scrambled_b, correct = sample
    model.optimize_gap_penalty_and_threshold(scrambled_a, scrambled_b, correct)
    return model
def setUp(self):
    """Prepare a trained ``YalignModel`` over a 25-pair training slice."""
    # NOTE: `hash(str)` is salted per process in Python 3
    # (PYTHONHASHSEED), so seeding with `hash("Y U NO?")` was not
    # reproducible between runs.  Seeding with the string itself is
    # deterministic and preserves the intent.
    random.seed("Y U NO?")
    base_path = os.path.dirname(os.path.abspath(__file__))
    word_scores = os.path.join(base_path, "data", "test_word_scores_big.csv")
    parallel_corpus = os.path.join(base_path, "data", "parallel-en-es.txt")
    A, B = parallel_corpus_to_documents(parallel_corpus)
    A = A[:25]
    B = B[:25]
    self.alignments = list(training_alignments_from_documents(A, B))
    # The scrambler returns three values; unpack them directly (the
    # previous list() wrapper was redundant).
    self.A, self.B, self.correct_alignments = \
        training_scrambling_from_documents(A, B)
    # Word score
    word_pair_score = WordPairScore(word_scores)
    # Sentence Score
    sentence_pair_score = SentencePairScore()
    sentence_pair_score.train(self.alignments, word_pair_score)
    # Yalign model: gap penalty at the midpoint of the score bounds
    self.min_ = sentence_pair_score.min_bound
    self.max_ = sentence_pair_score.max_bound
    gap_penalty = (self.min_ + self.max_) / 2.0
    document_aligner = SequenceAligner(sentence_pair_score, gap_penalty)
    self.model = YalignModel(document_aligner, 1)