def test_preserve_annotations_en(self):
     # Round-trip check on a long English support e-mail carrying three
     # "notranslate" span annotations: remove_tags -> split_text -> join_text
     # must hand back the very same text and tags, and the _nr_tags totals of
     # the per-sentence tag entries must add up to _nr_tags of the full set.
     #
     # NOTE(review): the literal is opened with four quotes, so the fixture
     # text itself begins with an apostrophe; the \xe2\x80\x9c / \xe2\x80\x9d
     # escapes live in a unicode literal, so each one is a separate code
     # point rather than a curly quote (presumably what was meant). Both are
     # part of the fixture data and are deliberately left untouched here.
     annotated = u''''Hello Jaime,<br/><br/>You have provide the link to your petition.  <br/><span class="notranslate" origval="0">https://www.change.org/p/firm%C3%A1-para-que-cristina-fern%C3%A1ndez-de-kirchner-revoque-la-designaci%C3%B3n-de-la-hija-de-agust%C3%ADn-rossi-como-directora-en-el-banco-naci%C3%B3n?recruiter=49166865&utm_source=share_petition&utm_medium=email&utm_campaign=share_email_responsive</span><br/><br/>This link however looks very long. I suggest you cuztomize the link so that it will be easier to copy and share with your contacts via e-mail.<br/><br/>You can shorten the URL by customizing the headline. Here\'s how you can do it: <br/> <br/>- Login to your Change.org account<br/>- Select the petition in question, select edit<br/>- Scroll down to the option \xe2\x80\x9cCustomize your headline for sharing\xe2\x80\x9d.<br/> <br/>If want an even shorter link, you can use a third party site, such as:<span class="notranslate" origval="1"> http://tinyurl.com/ </span>or<span class="notranslate" origval="2"> http://goo.gl/.</span><br/> <br/>Please let us know if you need further help.<br/><br/>Best Wishes,<br/><br/>Maria<br/>Change.org Help Center\n'''
     plain, tags = remove_tags(annotated)
     sentences, skeleton, tags_per_sentence = split_text(plain, "en", tags)

     # Splitting must not leave whitespace on either edge of a sentence.
     for sent in sentences:
         trimmed = sent.strip()
         self.assertEqual(trimmed, sent, "Error sentences have extra spaces on edges:\n%s\n%s\n" % (sent, trimmed))

     # Joining the pieces again has to reproduce the split input exactly.
     rebuilt_text, rebuilt_tags = join_text(sentences, skeleton, tags_per_sentence)
     self.assertEqual(plain, rebuilt_text)
     self.assertEqual(tags, rebuilt_tags)

     # Tag count spread over the sentences versus the count of the whole set.
     distributed = 0
     for entry in tags_per_sentence:
         distributed += self._nr_tags(entry)
     self.assertEqual(distributed, self._nr_tags(tags))
 def test_preserve_annotations_small_en(self):
     # Short variant of the annotation round-trip: an English message with
     # only <br/> markup is stripped of tags, split into sentences and joined
     # again; text, tags and the _nr_tags totals must all survive unchanged.
     #
     # NOTE(review): four opening quotes mean the fixture text starts with an
     # apostrophe - possibly unintended, but it is fixture data and is kept.
     source_markup = u''''Hello Jaime,<br/><br/>You have provide the link to your petition. <br/><br/>Best Wishes,<br/><br/>Maria<br/>Change.org Help Center\n'''
     stripped_text, original_tags = remove_tags(source_markup)
     pieces, skeleton, piece_tags = split_text(stripped_text, "en", original_tags)

     # No sentence may come back with leading or trailing whitespace.
     for piece in pieces:
         self.assertEqual(
             piece.strip(),
             piece,
             "Error sentences have extra spaces on edges:\n%s\n%s\n" % (piece, piece.strip()),
         )

     # The join step must be the exact inverse of the split step.
     joined_text, joined_tags = join_text(pieces, skeleton, piece_tags)
     self.assertEqual(stripped_text, joined_text)
     self.assertEqual(original_tags, joined_tags)

     # Per-sentence tag counts have to sum to the count of the full tag set.
     per_piece_counts = [self._nr_tags(item) for item in piece_tags]
     self.assertEqual(sum(per_piece_counts), self._nr_tags(original_tags))
    def test_split_join_tagged_text_en(self):
        # Full round-trip over every fixture file in <data dir>/tagged_texts/en.
        # For each file the pipeline
        #   strip_tag_spaces -> remove_tags -> split_text
        # is run and then undone with
        #   join_text -> insert_tags -> unstrip_tag_spaces
        # and every intermediate result is compared with its counterpart from
        # the forward pass, ending with the original file contents.
        directory = os.path.join(get_data_dir(), "tagged_texts", "en")
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # subtest index stable between runs, and passing the file name to
        # subTest makes a failure report point at the offending fixture.
        for i, fn in enumerate(sorted(os.listdir(directory))):

            with self.subTest(i=i, fn=fn):
                # Read through a context manager so the handle is closed even
                # when one of the assertions below fails.
                with codecs.open(os.path.join(directory, fn), encoding="utf-8") as handle:
                    sanitized_text = handle.read()
                tagged_text, tags_skeleton = strip_tag_spaces(sanitized_text)
                text, tags = remove_tags(tagged_text)
                sentences, skeleton, list_tags = split_text(text, "en", tags)
                for sentence in sentences:
                    self.assertEqual(sentence.strip(),sentence, "Error sentences have extra spaces on edges:\n%s\n%s\n" % (sentence, sentence.strip()))
                # All sentences have to be able to be added tags
                # (the result is discarded; the call only has to succeed).
                for sentence, stags in izip(sentences, list_tags):
                    insert_tags(sentence, stags)

                # Joining must reproduce the tag-free text and its tags.
                new_text, new_tags = join_text(sentences,skeleton,list_tags)
                msg = "\noriginal:%s\nnew:%s\noriginal_tags:%s\tsentences:%s\nskeleton:-%s-\n%s" % (text,new_text,tags,sentences, skeleton, list_tags)
                self.assertEqual(text,new_text,msg)
                self.assertEqual(tags, new_tags,msg)
                # Re-inserting the tags must give back the tagged text ...
                new_tagged_text = insert_tags(new_text,new_tags)
                self.assertEqual(new_tagged_text, tagged_text)
                # ... and restoring the tag spaces must give back the file contents.
                new_unsanitized_text = unstrip_tag_spaces(new_tagged_text, tags_skeleton)
                self.assertEqual(sanitized_text, new_unsanitized_text, sanitized_text)