def test_split_text_and_recombine_en(self):
        """Splitting every English data text into sentences and re-joining
        them via the returned skeleton must reproduce the original text
        exactly, and no sentence may carry leading/trailing whitespace."""
        directory = get_data_dir()+"/texts/en"
        for i, fn in enumerate(os.listdir(directory)):
            with self.subTest(i=i):
                # Context manager closes the handle instead of leaking one
                # per file in the directory.
                with codecs.open("%s/%s" % (directory, fn), encoding="utf-8") as f:
                    text = f.read()
                sentences, skeleton, _, _ = split_into_sentences(text, lang="en")
                for sentence in sentences:
                    self.assertEqual(
                        sentence.strip(), sentence,
                        "Error sentences have extra spaces on edges:\n%s\n%s\n"
                        % (sentence, sentence.strip()))
                new_text = join_sentences(sentences, skeleton)
                self.assertEqual(text, new_text,
                                 "\nT: -%s-\nN: -%s-" % (text, new_text))
 def parse_data_file(self, file_name):
     """Read a test-data file of alternating sentence / size lines.

     Blank lines are ignored.  Even-indexed non-blank lines are sentences,
     odd-indexed ones their integer sizes.  A sentence starting with '#' is
     a comment and is skipped together with its size.

     Returns:
         (sentences, sizes): two parallel lists of equal length.

     Raises:
         ValueError: if the resulting lists somehow differ in length
             (defensive check kept from the original implementation).
     """
     # Context manager guarantees the file handle is closed (the original
     # leaked it).
     with codecs.open(get_data_dir()+"/%s" % file_name, encoding="utf-8") as f:
         data = [d for d in f.readlines() if d.strip() != ""]
     sentences = [s.strip() for i, s in enumerate(data) if i % 2 == 0]
     sizes = [int(s.strip()) for i, s in enumerate(data) if i % 2 == 1]
     new_sentences = []
     new_sizes = []
     # zip replaces the Python-2-only izip; both stop at the shorter list,
     # so a trailing unpaired sentence is dropped exactly as before.
     for sentence, size in zip(sentences, sizes):
         if sentence.startswith("#"):
             continue
         new_sentences.append(sentence)
         new_sizes.append(size)
     if len(new_sentences) != len(new_sizes):
         raise ValueError("Error reading data from file: %s %s"
                          % (len(new_sentences), len(new_sizes)))
     return new_sentences, new_sizes
    def test_example_file(self):
        """Round-trip every inline example: converting each line to its
        annotation form and back must reproduce the line exactly."""
        from unbabel_text_utils.umtf_utils.wrappers_func import umtf_funcs_dict, \
            umtf_wrapper_funcs

        # Context manager closes the handle instead of leaking it.
        with codecs.open(
                get_data_dir()+"/inline_examples/examples.txt",
                encoding="utf-8") as f:
            raw_file = f.read()

        for i, line in enumerate(raw_file.split("\n")):
            with self.subTest(i=i):
                text, markup, wrappers = \
                    inline_to_annotation(line,
                                         umtf_wrapper_funcs,
                                         final_wrapper=get_wrappers)

                new_inline = annotation_to_inline(
                    text,
                    markup,
                    wrappers,
                    wrappers_dict=umtf_funcs_dict)
                self.assertEqual(line, new_inline, "%i\nOrig: %s\nNext: %s\n" %
                                                   (i, line, new_inline))
    def test_split_join_tagged_text_en(self):
        """Full round-trip over tagged English texts: strip tag spacing,
        remove tags, split into sentences, then re-join and re-insert the
        tags — every intermediate and final artifact must match the
        original exactly."""
        directory = get_data_dir()+"/tagged_texts/en"
        for i, fn in enumerate(os.listdir(directory)):

            with self.subTest(i=i):
                # Context manager closes the handle instead of leaking one
                # per file in the directory.
                with codecs.open("%s/%s" % (directory, fn), encoding="utf-8") as f:
                    sanitized_text = f.read()
                tagged_text, tags_skeleton = strip_tag_spaces(sanitized_text)
                text, tags = remove_tags(tagged_text)
                sentences, skeleton, list_tags = split_text(text, "en", tags)
                for sentence in sentences:
                    self.assertEqual(
                        sentence.strip(), sentence,
                        "Error sentences have extra spaces on edges:\n%s\n%s\n"
                        % (sentence, sentence.strip()))
                # All sentences have to be able to be added tags.
                # zip replaces the Python-2-only izip (same pairing).
                for sentence, stags in zip(sentences, list_tags):
                    insert_tags(sentence, stags)

                new_text, new_tags = join_text(sentences, skeleton, list_tags)
                msg = "\noriginal:%s\nnew:%s\noriginal_tags:%s\tsentences:%s\nskeleton:-%s-\n%s" % (text,new_text,tags,sentences, skeleton, list_tags)
                self.assertEqual(text, new_text, msg)
                self.assertEqual(tags, new_tags, msg)
                new_tagged_text = insert_tags(new_text, new_tags)
                self.assertEqual(new_tagged_text, tagged_text)
                new_unsanitized_text = unstrip_tag_spaces(new_tagged_text, tags_skeleton)
                self.assertEqual(sanitized_text, new_unsanitized_text, sanitized_text)