コード例 #1
0
    def test_split_text_and_recombine_en(self):
        """Check that English texts survive a split/join round trip.

        Each file found under ``get_data_dir() + "/texts/en"`` is read as
        UTF-8, split with ``split_into_sentences(text, lang="en")`` and put
        back together with ``join_sentences``. Two things are asserted per
        file: no returned sentence has leading or trailing whitespace, and
        the re-joined text equals the original text exactly.
        """
        directory = get_data_dir() + "/texts/en"
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # subtest index tied to the same file on every run.
        for i, fn in enumerate(sorted(os.listdir(directory))):
            # The file name is included so a failing subtest identifies its input.
            with self.subTest(i=i, fn=fn):
                # Use a context manager so the handle is closed even when an
                # assertion below fails (the bare .read() call leaked it).
                with codecs.open("%s/%s" % (directory, fn), encoding="utf-8") as handle:
                    text = handle.read()
                # Only the first two of the four returned values are needed here.
                sentences, skeleton, _, _ = split_into_sentences(text, lang="en")
                for sentence in sentences:
                    self.assertEqual(
                        sentence.strip(),
                        sentence,
                        "Error sentences have extra spaces on edges:\n%s\n%s\n" % (sentence, sentence.strip()),
                    )
                new_text = join_sentences(sentences, skeleton)
                self.assertEqual(text, new_text, "\nT: -%s-\nN: -%s-" % (text, new_text))
コード例 #2
0
 def test_joint_puntation_zh(self):
     """Check sentence counts for the ``zh-CN`` joined-punctuation samples.

     Samples and their expected sentence counts are loaded via
     ``self.parse_data_file("sentences_with_join_punct_zh.txt")``; each
     sample is split with ``lang="zh-CN"`` and the number of resulting
     sentences is compared with the expected count.
     """
     samples, expected_counts = self.parse_data_file("sentences_with_join_punct_zh.txt")
     pairs = izip(samples, expected_counts)
     for i, (sample, expected) in enumerate(pairs):
         with self.subTest(i=i):
             # Only the first of the four returned values is inspected.
             pieces, _, _, _ = split_into_sentences(sample, lang="zh-CN")
             found = len(pieces)
             failure_msg = "%s \n %s" % (sample, pieces)
             self.assertEqual(found, expected, failure_msg)
コード例 #3
0
 def test_joint_puntation_en(self):
     """Check sentence counts for the ``en`` joined-punctuation samples.

     Samples and their expected sentence counts are loaded via
     ``self.parse_data_file("sentences_with_join_punct_en.txt")``; each
     sample is split with ``lang="en"`` and the number of resulting
     sentences is compared with the expected count.
     """
     samples, expected_counts = self.parse_data_file("sentences_with_join_punct_en.txt")
     pairs = izip(samples, expected_counts)
     for sample, expected in pairs:
         # The subtest is keyed by the sample text itself.
         with self.subTest(sentence=sample):
             # Only the first of the four returned values is inspected.
             pieces, _, _, _ = split_into_sentences(sample, lang="en")
             found = len(pieces)
             failure_msg = "%s \n %s" % (sample, pieces)
             self.assertEqual(found, expected, failure_msg)