def test_swap_languages(self):
    """Swapping the language arguments swaps the returned documents."""
    document_path = os.path.join(data_path, "corpus-en-es.tmx")
    swap_a, swap_b = tmx_file_to_documents(document_path, "es", "en")
    for x, y in zip(swap_a, self.document_b):
        self.assertEqual(x, y)
    for x, y in zip(swap_b, self.document_a):
        self.assertEqual(x, y)
def basic_model(corpus_filepath, word_scores_filepath,
                lang_a=None, lang_b=None):
    """
    Creates and trains a `YalignModel` with the basic configuration and
    default values.

    `corpus_filepath` is the path to a parallel corpus used for training.
    It can be:
        - a csv file with two sentences and alignment information, or
        - a tmx file with correct alignments (a regular parallel corpus), or
        - a text file with interleaved sentences (one line in language A,
          the next in language B)

    `word_scores_filepath` is the path to a csv file (possibly gzipped) with
    word dictionary data (for example "house,casa,0.91").

    `lang_a` and `lang_b` are required by the tokenizer in the case of a tmx
    file. They are not necessary in the other cases because the words are
    assumed to be already tokenized.
    """
    # Word score
    word_pair_score = WordPairScore(word_scores_filepath)
    if corpus_filepath.endswith(".tmx"):
        A, B = tmx_file_to_documents(corpus_filepath, lang_a, lang_b)
    else:
        A, B = parallel_corpus_to_documents(corpus_filepath)
    alignments = training_alignments_from_documents(A, B)
    sentence_pair_score = SentencePairScore()
    sentence_pair_score.train(alignments, word_pair_score)
    # Yalign model
    metadata = {"lang_a": lang_a, "lang_b": lang_b}
    gap_penalty = 0.49
    threshold = 1.0
    document_aligner = SequenceAligner(sentence_pair_score, gap_penalty)
    model = YalignModel(document_aligner, threshold, metadata=metadata)
    A, B, correct = training_scrambling_from_documents(
        A[:OPTIMIZE_SAMPLE_SET_SIZE], B[:OPTIMIZE_SAMPLE_SET_SIZE])
    model.optimize_gap_penalty_and_threshold(A, B, correct)
    return model
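# Usage sketch, not part of the library code: it assumes the import path
# used by the Yalign tutorial (`from yalign.yalignmodel import basic_model`)
# and that `YalignModel.save` persists a trained model to a directory; the
# file and directory names below are illustrative only.
#
#     model = basic_model("corpus-en-es.tmx", "word-scores-en-es.csv.gz",
#                         lang_a="en", lang_b="es")
#     model.save("en-es-model")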
def setUp(self):
    document_path = os.path.join(data_path, "corpus-en-es.tmx")
    self.document_a, self.document_b = tmx_file_to_documents(
        document_path, "en", "es")