def test_split_text_and_recombine_en(self):
    """Round-trip every English fixture text through split and join.

    For each file under ``<data dir>/texts/en`` the text is split with
    ``split_into_sentences(..., lang="en")``.  Two things are checked:

    * no returned sentence carries leading or trailing whitespace;
    * ``join_sentences(sentences, skeleton)`` gives back the original
      text unchanged.
    """
    directory = get_data_dir() + "/texts/en"
    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # subTest index stable between runs and across machines.
    for i, fn in enumerate(sorted(os.listdir(directory))):
        with self.subTest(i=i, fn=fn):
            # Read inside a context manager so the handle is closed even
            # when a later assertion in this sub-test fails.
            with codecs.open("%s/%s" % (directory, fn), encoding="utf-8") as handle:
                text = handle.read()

            sentences, skeleton, _, _ = split_into_sentences(text, lang="en")
            for sentence in sentences:
                self.assertEqual(
                    sentence.strip(),
                    sentence,
                    "Error sentences have extra spaces on edges:\n%s\n%s\n"
                    % (sentence, sentence.strip()),
                )

            new_text = join_sentences(sentences, skeleton)
            self.assertEqual(
                text,
                new_text,
                "\nT: -%s-\nN: -%s-" % (text, new_text),
            )
def test_joint_puntation_zh(self):
    """Check sentence counts for the Chinese joined-punctuation fixtures.

    ``parse_data_file`` supplies parallel sequences of input strings and
    expected sentence counts.  Each input is split with ``lang="zh-CN"``
    and the number of resulting sentences is compared with the expected
    count.
    """
    samples, expected_counts = self.parse_data_file("sentences_with_join_punct_zh.txt")
    for index, pair in enumerate(izip(samples, expected_counts)):
        sample, expected = pair
        with self.subTest(i=index):
            # Only the first element of the 4-tuple is needed here.
            pieces, _, _, _ = split_into_sentences(sample, lang="zh-CN")
            actual = len(pieces)
            failure_message = "%s \n %s" % (sample, pieces)
            self.assertEqual(actual, expected, failure_message)
def test_joint_puntation_en(self):
    """Check sentence counts for the English joined-punctuation fixtures.

    ``parse_data_file`` supplies parallel sequences of input strings and
    expected sentence counts.  Each input is split with ``lang="en"`` and
    the number of resulting sentences is compared with the expected
    count.  Sub-tests are labelled with the input string itself.
    """
    samples, expected_counts = self.parse_data_file("sentences_with_join_punct_en.txt")
    for pair in izip(samples, expected_counts):
        sample, expected = pair
        with self.subTest(sentence=sample):
            # Only the first element of the 4-tuple is needed here.
            pieces, _, _, _ = split_into_sentences(sample, lang="en")
            actual = len(pieces)
            failure_message = "%s \n %s" % (sample, pieces)
            self.assertEqual(actual, expected, failure_message)