def main():
    """Align paired articles and print the aligned sentences to stdout.

    Command-line arguments (read from ``sys.argv``):
        1. language code of the source side
        2. language code of the target side
        3. path to a saved YalignModel (also appended to ``nltk.data.path``)
        4. pairing file, parsed with ``read_pairing``
        5. source articles file, parsed with ``read_articles``
        6. target articles file, parsed with ``read_articles``

    For every (source, target) entry in the pairing, the two articles are
    converted to documents, aligned with the model, and the resulting pairs
    are written to stdout with ``write_plaintext``. A one-line summary per
    pairing goes to stderr.
    """
    lang_a = sys.argv[1]
    lang_b = sys.argv[2]
    model_path = os.path.abspath(sys.argv[3])
    nltk.data.path += [model_path]
    model = YalignModel.load(model_path)

    # Input files are closed as soon as they have been parsed.
    # NOTE(review): assumes read_pairing / read_articles consume the file
    # before returning (the results are re-iterated / indexed below) - confirm.
    with open(sys.argv[4]) as pairing_file:
        pairing = read_pairing(pairing_file, lang_a, lang_b)

    # Only the articles that take part in some pairing need to be loaded.
    src_needed = {a for a, _ in pairing}
    tgt_needed = {b for _, b in pairing}

    with open(sys.argv[5]) as src_file:
        src_articles = read_articles(src_file, src_needed)
    with open(sys.argv[6]) as tgt_file:
        tgt_articles = read_articles(tgt_file, tgt_needed)

    for src, tgt in pairing:
        # The try deliberately spans the whole iteration: a KeyError from a
        # missing article (or from any call below) is reported and the
        # pairing is skipped instead of aborting the run.
        try:
            text_a = "\n".join(src_articles[src])
            text_b = "\n".join(tgt_articles[tgt])
            document_a = text_to_document(text_a, lang_a)
            document_b = text_to_document(text_b, lang_b)
            pairs = model.align(document_a, document_b)
            # NOTE(review): the message is encoded to bytes before being
            # written; on Python 3 sys.stderr.write expects str - confirm the
            # targeted interpreter version.
            sys.stderr.write(u"{0} pairs in {1}-{2}\n".format(len(pairs), src, tgt).encode("utf-8"))
            write_plaintext(sys.stdout, pairs)
        except KeyError:
            sys.stderr.write(u"KeyError with {0}-{1}\n".format(src, tgt).encode("utf-8"))
            continue
def test_save_load_and_align(self):
    """Check that a model aligns identically after a save/load round trip.

    Also verifies that the threshold and the document pair aligner's
    penalty are the same on the reloaded model.
    """
    import shutil

    doc1 = [Sentence([u"House"]), Sentence([u"asoidfhuioasgh"])]
    doc2 = [Sentence([u"Casa"])]
    result_before_save = self.model.align(doc1, doc2)

    tmp_folder = tempfile.mkdtemp()
    try:
        # Save
        self.model.save(tmp_folder)
        # Load
        new_model = YalignModel.load(tmp_folder)
        result_after_load = new_model.align(doc1, doc2)

        self.assertEqual(result_before_save, result_after_load)
        self.assertEqual(self.model.threshold, new_model.threshold)
        self.assertEqual(self.model.document_pair_aligner.penalty,
                         new_model.document_pair_aligner.penalty)
    finally:
        # mkdtemp never deletes the directory itself; remove it so test runs
        # do not accumulate saved models on disk.
        shutil.rmtree(tmp_folder, ignore_errors=True)
def test_command_tool(self):
    """Run the command-line tool and check it reports the expected score.

    The test is a no-op when ``self.cmdline`` is None. Otherwise the model
    is saved to a temporary directory, ``self.cmdline`` is formatted with
    the corpus and model paths and executed through the shell, and its
    stdout must contain ``"<value>%"`` where ``value`` is what
    ``self.alignment_function`` returns for the same corpus and model.
    """
    import os
    import shutil

    if self.cmdline is None:
        return

    tmpdir = tempfile.mkdtemp()
    fd, tmpfile = tempfile.mkstemp()
    # mkstemp returns an already-open descriptor; close it because the file
    # is reopened by name below.
    os.close(fd)
    try:
        self.model.save(tmpdir)
        cmd = self.cmdline.format(corpus=self.parallel_corpus, model=tmpdir)

        # Close the write handle before reading the captured output back.
        with open(tmpfile, "w") as outputfh:
            subprocess.call(cmd, shell=True, stdout=outputfh)
        with open(tmpfile) as outputfh:
            output = outputfh.read()

        A, B = parallel_corpus_to_documents(self.parallel_corpus)
        model = YalignModel.load(tmpdir)
        value = self.alignment_function(A, B, model)
        self.assertIn("{}%".format(value), output)
    finally:
        # Neither mkstemp nor mkdtemp cleans up after itself.
        if os.path.exists(tmpfile):
            os.remove(tmpfile)
        shutil.rmtree(tmpdir, ignore_errors=True)