コード例 #1
0
    def test_split_join_tagged_text_en(self):
        """Round-trip every English tagged-text fixture through split and join.

        Each file under ``<data dir>/tagged_texts/en`` is run through
        ``strip_tag_spaces``, ``remove_tags`` and ``split_text``. The pieces are
        then reassembled with ``join_text``, ``insert_tags`` and
        ``unstrip_tag_spaces``, and every reconstructed value is compared with
        the corresponding value produced on the way in.

        Every fixture runs in its own ``subTest`` so that one failing file does
        not hide failures in the others.
        """
        directory = os.path.join(get_data_dir(), "tagged_texts", "en")
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # subTest index stable between runs and machines.
        for i, fn in enumerate(sorted(os.listdir(directory))):

            # Include the file name so a failure points directly at the fixture.
            with self.subTest(i=i, fn=fn):
                # Close the handle deterministically instead of relying on GC.
                with codecs.open(os.path.join(directory, fn), encoding="utf-8") as fixture:
                    sanitized_text = fixture.read()

                # Forward pass: peel off tag spacing, then the tags, then split.
                tagged_text, tags_skeleton = strip_tag_spaces(sanitized_text)
                text, tags = remove_tags(tagged_text)
                sentences, skeleton, list_tags = split_text(text, "en", tags)

                for sentence in sentences:
                    self.assertEqual(sentence.strip(),sentence, "Error sentences have extra spaces on edges:\n%s\n%s\n" % (sentence, sentence.strip()))

                # All sentences have to be able to be added tags; the result is
                # discarded, the call only has to complete without raising.
                for sentence, stags in izip(sentences, list_tags):
                    insert_tags(sentence, stags)

                # Backward pass: rebuild each layer and compare with the original.
                new_text, new_tags = join_text(sentences,skeleton,list_tags)
                msg = "\noriginal:%s\nnew:%s\noriginal_tags:%s\tsentences:%s\nskeleton:-%s-\n%s" % (text,new_text,tags,sentences, skeleton, list_tags)
                self.assertEqual(text,new_text,msg)
                self.assertEqual(tags, new_tags,msg)

                new_tagged_text = insert_tags(new_text,new_tags)
                self.assertEqual(new_tagged_text, tagged_text)

                new_unsanitized_text = unstrip_tag_spaces(new_tagged_text, tags_skeleton)
                self.assertEqual(sanitized_text, new_unsanitized_text, sanitized_text)