def test_preserve_annotations_en(self):
    """Round-trip a long tagged English text through split_text/join_text.

    Checks that every sentence comes back with clean edges, that the
    rejoined text and tags equal the originals, and that the per-sentence
    tag lists together account for every original tag.
    """
    tagged_text = u''''Hello Jaime,<br/><br/>You have provide the link to your petition. <br/><span class="notranslate" origval="0">https://www.change.org/p/firm%C3%A1-para-que-cristina-fern%C3%A1ndez-de-kirchner-revoque-la-designaci%C3%B3n-de-la-hija-de-agust%C3%ADn-rossi-como-directora-en-el-banco-naci%C3%B3n?recruiter=49166865&utm_source=share_petition&utm_medium=email&utm_campaign=share_email_responsive</span><br/><br/>This link however looks very long. I suggest you cuztomize the link so that it will be easier to copy and share with your contacts via e-mail.<br/><br/>You can shorten the URL by customizing the headline. Here\'s how you can do it: <br/> <br/>- Login to your Change.org account<br/>- Select the petition in question, select edit<br/>- Scroll down to the option \xe2\x80\x9cCustomize your headline for sharing\xe2\x80\x9d.<br/> <br/>If want an even shorter link, you can use a third party site, such as:<span class="notranslate" origval="1"> http://tinyurl.com/ </span>or<span class="notranslate" origval="2"> http://goo.gl/.</span><br/> <br/>Please let us know if you need further help.<br/><br/>Best Wishes,<br/><br/>Maria<br/>Change.org Help Center\n'''
    text, tags = remove_tags(tagged_text)
    sentences, skeleton, list_tags = split_text(text, "en", tags)
    # No sentence may carry leading or trailing whitespace.
    for sent in sentences:
        self.assertEqual(
            sent.strip(), sent,
            "Error sentences have extra spaces on edges:\n%s\n%s\n" % (sent, sent.strip()))
    # Rejoining must reproduce the original text and tag set exactly.
    new_text, new_tags = join_text(sentences, skeleton, list_tags)
    self.assertEqual(text, new_text)
    self.assertEqual(tags, new_tags)
    # Tags distributed over sentences must sum to the original tag count.
    total_sentence_tags = sum(self._nr_tags(entry) for entry in list_tags)
    self.assertEqual(total_sentence_tags, self._nr_tags(tags))
def test_preserve_annotations_small_en(self):
    """Same round-trip check as the long variant, on a short tagged text.

    Splits, verifies sentence edges are clean, rejoins, and confirms both
    text and tags survive unchanged with no tags lost or invented.
    """
    tagged_text = u''''Hello Jaime,<br/><br/>You have provide the link to your petition. <br/><br/>Best Wishes,<br/><br/>Maria<br/>Change.org Help Center\n'''
    text, tags = remove_tags(tagged_text)
    sentences, skeleton, list_tags = split_text(text, "en", tags)
    # No sentence may carry leading or trailing whitespace.
    for sent in sentences:
        self.assertEqual(
            sent.strip(), sent,
            "Error sentences have extra spaces on edges:\n%s\n%s\n" % (sent, sent.strip()))
    # Rejoining must reproduce the original text and tag set exactly.
    new_text, new_tags = join_text(sentences, skeleton, list_tags)
    self.assertEqual(text, new_text)
    self.assertEqual(tags, new_tags)
    # Tags distributed over sentences must sum to the original tag count.
    total_sentence_tags = sum(self._nr_tags(entry) for entry in list_tags)
    self.assertEqual(total_sentence_tags, self._nr_tags(tags))
def word_count(text, lang=None):
    """Count the words in *text* after stripping tags and breakable elements.

    URLs, emails, dates, phone numbers and money amounts are each collapsed
    to a single placeholder token so they count as one word. For a known
    non-Asian *lang* the plain word count is returned; otherwise (lang is
    None or an Asian language) each Asian character counts as one word and
    is added to the count of the remaining non-Asian words.
    """
    # Strip all markup tags first.
    stripped, _ = remove_tags(text)
    # Collapse each breakable element into a single placeholder token.
    replacements = (
        (replace_url, u'url'),
        (replace_email, u'email'),
        (replace_date, u'date'),
        (replace_phone, u'phone'),
        (replace_money, u'money'),
    )
    for replacer, placeholder in replacements:
        stripped = replacer(placeholder, stripped)
    if lang is not None and lang not in asian_languages:
        return word_count_aux(stripped)
    # Unknown or Asian language: count Asian characters individually and
    # word-count whatever remains after filtering them out.
    asian_chars = sum(is_asian(ch) for ch in stripped)
    non_asian_text = "".join(filter_jchars(ch) for ch in stripped)
    return word_count_aux(non_asian_text) + asian_chars
def inline_to_annotation(inline_text, wrapper_funcs=None, final_wrapper=None):
    """Converts inline text into text and annotations.

    Each wrapper function takes the inline text and returns the text with
    its wrappers removed plus the extracted wrapper annotations; the
    annotations collected from all wrappers are then cleaned, markup tags
    are stripped, and an optional final_wrapper may post-process the
    wrapper list against the clean text.

    Returns a (text, markup, wrappers) tuple.
    """
    # Bug fix: the default was the mutable list [default_wrapper_func],
    # a shared mutable default argument. Use None as sentinel instead;
    # behavior for callers relying on the default is unchanged.
    if wrapper_funcs is None:
        wrapper_funcs = [default_wrapper_func]
    # Remove wrappers
    wrappers = []
    for wrapper_func in wrapper_funcs:
        inline_text, wrappers_aux = wrapper_func(inline_text)
        wrappers.extend(wrappers_aux)
    wrappers = clear_annotations_list(wrappers)
    # Remove markup tags
    text, markup = remove_tags(inline_text)
    if final_wrapper:
        wrappers = final_wrapper(text, wrappers)
        wrappers = clear_annotations_list(wrappers)
    return text, markup, wrappers
def test_split_join_tagged_text_en(self):
    """Round-trip every English fixture file through the full tag pipeline.

    For each file: unstrip/strip tag spaces, remove tags, split into
    sentences, check sentence edges, re-insert tags per sentence, rejoin,
    and verify the original sanitized text is recovered exactly.
    """
    directory = get_data_dir() + "/tagged_texts/en"
    for i, fn in enumerate(os.listdir(directory)):
        with self.subTest(i=i):
            sanitized_text = codecs.open(
                "%s/%s" % (directory, fn), encoding="utf-8").read()
            tagged_text, tags_skeleton = strip_tag_spaces(sanitized_text)
            text, tags = remove_tags(tagged_text)
            sentences, skeleton, list_tags = split_text(text, "en", tags)
            # Sentences must come back without padding on either edge.
            for sentence in sentences:
                self.assertEqual(
                    sentence.strip(), sentence,
                    "Error sentences have extra spaces on edges:\n%s\n%s\n" % (sentence, sentence.strip()))
            # All sentences have to be able to be added tags
            for sentence, stags in izip(sentences, list_tags):
                insert_tags(sentence, stags)
            new_text, new_tags = join_text(sentences, skeleton, list_tags)
            msg = "\noriginal:%s\nnew:%s\noriginal_tags:%s\tsentences:%s\nskeleton:-%s-\n%s" % (text, new_text, tags, sentences, skeleton, list_tags)
            self.assertEqual(text, new_text, msg)
            self.assertEqual(tags, new_tags, msg)
            # Re-tagging the rejoined text must reproduce the tagged input,
            # and undoing the space stripping must reproduce the raw file.
            new_tagged_text = insert_tags(new_text, new_tags)
            self.assertEqual(new_tagged_text, tagged_text)
            new_unsanitized_text = unstrip_tag_spaces(new_tagged_text, tags_skeleton)
            self.assertEqual(sanitized_text, new_unsanitized_text, sanitized_text)
def test_correct_tuples(self):
    """Verify tag projection from source to target via word alignments.

    Each case is (tagged source, word alignment pairs, expected tagged
    target). The pipeline strips tags, tokenizes both sides, builds
    phrase tuples, computes the target words each tag covers, generates
    the target tag dictionary, and re-inserts the tags into the clean
    target text, which must equal the expected result.

    Bug fix: the original computed remove_tags(truth) twice into two
    variables (clean_trg_txt and trg_clean_text); the redundant second
    call was removed and the single variable is reused.
    """
    strs = [
        # No tags
        [u'a b', [[0, 0], [1, 1]], u'1 2'],
        [u'a b', [[0, 1], [1, 0]], u'2 1'],
        # Single Tag
        [u'a<a/> b', [[0, 0], [1, 1]], u'1<a/> 2'],
        [u'a<a/> b', [[0, 1], [1, 0]], u'2 1<a/>'],
        [u'a <a/>b', [[0, 0], [1, 1]], u'1 <a/>2'],
        [u'<a/>a b', [[0, 0], [1, 1]], u'<a/>1 2'],
        [u'<b/><a/>a b', [[0, 0], [1, 1]], u'<b/><a/>1 2'],
        [u'a b<a/>', [[0, 0], [1, 1]], u'1 2<a/>'],
        [u'a b<b/><a/>', [[0, 0], [1, 1]], u'1 2<b/><a/>'],
        [u'<a/><b/>a b, <c/>c <d/>d e <e/>f<f/>',
         [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6]],
         u'<a/><b/>1 2, <c/>3 <d/>4 5 <e/>6<f/>'],
        [u'<a/><b/>aaaaaaaa, <c/><d/>bbbbbbbb <e/>cccccccccc dddd eeeeee<f/><g/>',
         [[0, 0], [1, 1], [2, 2], [3, 3], [4, 7], [5, 5]],
         u'<a/><b/>obrigado, <c/><d/>maricela <e/>cccccccccc del Centro de ayuda<f/><g/>'],
        [u'<a/><b/>Peishan<c/>', [[0, 0]], u'<a/><b/>Peishan<c/>'],
        [u'<a/>3.', [[0, 0], [1, 1]], u'<a/>3.'],
        # Test words more than one char
        [u'<x>ccc dd</x>', [[0, 0], [1, 1]], u'<x>aa ffff</x>'],
        [u'a b <x>c d</x>.', [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]], u'1 2 <x>3 4</x>.'],
        [u'aaa bbbb <x>cccc dddd</x>.', [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]], u'11 222 <x>3333 444444</x>.'],
        # punct
        [u'<x>ccc dd.</x>', [[0, 0], [1, 1], [2, 2]], u'<x>aa ffff.</x>'],
        [u'<x>ccc dd</x>.', [[0, 0], [1, 1], [2, 2]], u'<x>aa ffff</x>.'],
        # Empty Tags
        [u'a<a></a> b', [[0, 0], [1, 1]], u'1<a></a> 2'],
        [u'a<a></a> b', [[0, 1], [1, 0]], u'2 1<a></a>'],
        ['Looking<b></b> at the screenshot that you have attached, can I ask if you are having this problem on the mobile website?',
         [[0, 0], [1, 0], [2, 1], [3, 2], [5, 3], [4, 4], [6, 5], [7, 6], [9, 7], [10, 8], [11, 9], [12, 10], [13, 10], [13, 11], [14, 10], [15, 12], [16, 13], [17, 14], [18, 14], [19, 14], [20, 15], [21, 16]],
         'Olhando<b></b> para o ecrashot que tem em anexo, posso pedir, se estiver a ter este problema no site movel?'],
        ['Looking <b></b>at the screenshot that you have attached, can I ask if you are having this problem on the mobile website?',
         [[0, 0], [1, 0], [2, 1], [3, 2], [5, 3], [4, 4], [6, 5], [7, 6], [9, 7], [10, 8], [11, 9], [12, 10], [13, 10], [13, 11], [14, 10], [15, 12], [16, 13], [17, 14], [18, 14], [19, 14], [20, 15], [21, 16]],
         'Olhando <b></b>para o ecrashot que tem em anexo, posso pedir, se estiver a ter este problema no site movel?'],
        # Two Tags
        [u'<a>a <i>b </i></a>', [[0, 0], [1, 1]], u'<a>1 <i>2 </i></a>'],
        [u'<a>a <i>b </i></a>', [[0, 1], [1, 0]], u'<a><i>2 </i>1 </a>'],
        [u'<a> a<i> b</i></a>', [[0, 1], [1, 0]], u'<a><i> 2</i> 1</a>'],
        [u'<a> a<s/><i> b</i></a>', [[0, 1], [1, 0]], u'<a><i> 2</i> 1<s/></a>'],
        # Nested tags
        ["<a>a<i> b<b> c</b> d</i> e</a>", [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]], "<a>a<i> b<b> c</b> d</i> e</a>"],
        ["<a>a<i> b<b> c</b> d</i> e</a>", [[0, 0], [1, 1], [2, 3], [3, 2], [4, 4]], "<a>a<i> b d<b> c</b></i> e</a>"],
        ["<a>a<i> b<b> c</b> d</i> e</a>", [[0, 0], [1, 1], [2, 2], [3, 4], [4, 3]], "<a>a<i> b<b> c</b> e d</i></a>"],
        ['<a/><b><c/><d><e><f/><g/><h/></e></d></b>Looking at the screenshot that you have attached, can I ask if you are having this problem on the mobile website?',
         [[0, 0], [1, 0], [2, 1], [3, 2], [5, 3], [4, 4], [6, 5], [7, 6], [9, 7], [10, 8], [11, 9], [12, 10], [13, 10], [13, 11], [14, 10], [15, 12], [16, 13], [17, 14], [18, 14], [19, 14], [20, 15], [21, 16]],
         '<a/><b><c/><d><e><f/><g/><h/></e></d></b>Olhando para o ecrashot que tem em anexo, posso pedir, se estiver a ter este problema no site movel?'],
        ['Looking<b><c/><a></a> at</b> the screenshot that you have attached, can I ask if you are having this problem on the mobile website?',
         [[0, 0], [1, 0], [2, 1], [3, 2], [5, 3], [4, 4], [6, 5], [7, 6], [9, 7], [10, 8], [11, 9], [12, 10], [13, 10], [13, 11], [14, 10], [15, 12], [16, 13], [17, 14], [18, 14], [19, 14], [20, 15], [21, 16]],
         '<b>Olhando<c/><a></a></b> para o ecrashot que tem em anexo, posso pedir, se estiver a ter este problema no site movel?'],
        ['Looking <b><c/><a></a></b>at the screenshot that you have attached, can I ask if you are having this problem on the mobile website?',
         [[0, 0], [1, 0], [2, 1], [3, 2], [5, 3], [4, 4], [6, 5], [7, 6], [9, 7], [10, 8], [11, 9], [12, 10], [13, 10], [13, 11], [14, 10], [15, 12], [16, 13], [17, 14], [18, 14], [19, 14], [20, 15], [21, 16]],
         'Olhando <b><c/><a></a></b>para o ecrashot que tem em anexo, posso pedir, se estiver a ter este problema no site movel?'],
        ['<a/><b/>All the best, <c/><d/>Anjali <e/><f>Change.org </f>Centro De Apoio<g/><h/>',
         [[0, 1], [1, 1], [2, 0], [2, 2], [2, 3], [3, 4], [4, 5], [6, 7], [7, 8], [8, 9], [5, 6]],
         '<a/><b/>Com os melhores cumprimentos, <c/><d/>Anjali <e/><f>Change.org </f>Centro De Apoio<g/><h/>'],
        ['<p>You can only link a <strong>personal</strong> Facebook profile to your Pinterest account. We don\'t support Facebook business pages.</p>',
         [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [6, 7], [7, 5], [8, 8], [9, 9], [10, 12], [11, 10], [12, 13]],
         '<p>Usted puede unico enlace un perfil de <strong>Facebook</strong> a su cuenta de Pinterest.</p>'],
        # Overwrite invalid result in html/xml
        ['<a>First link1</a> <b>Second link2</b>',
         [[0, 0], [1, 1], [2, 1], [3, 3]],
         '<a>First <b>link1 Second link2</b></a>'],
        ['<a>Cash out</a><b/><c> I cant receive my cash out via PayPal </c><d><e>alternatives in Turkey</e>',
         [[0, 0], [2, 2], [3, 1], [3, 2], [4, 3], [5, 4], [6, 5], [8, 6], [8, 7], [8, 8], [9, 9], [10, 10], [11, 11], [12, 12]],
         '<a>Cobrar<c> no puedo<b/> recibir mi pago a traves de PayPal </c></a><d><e>alternativas en Turquia</e>'],
    ]
    for i, (tagged_text, wa, truth) in enumerate(strs):
        with self.subTest(i=i):
            clean_text, tags = remove_tags(tagged_text)
            # Single call: the clean target text serves both tokenization
            # and final tag re-insertion.
            clean_trg_txt, _ = remove_tags(truth)
            src_tok = tokenize_string(clean_text)
            trg_tok = tokenize_string(clean_trg_txt)
            pp = phrase_tuples(tagged_text, src_tok, tags, clean_text)
            tp = compute_target_covering_words(pp, wa, trg_tok, clean_trg_txt)
            t_tags = generate_target_dict(tags, tp)
            t_text = insert_tags(clean_trg_txt, t_tags)
            self.assertEqual(truth, t_text, "\nWanted:%s\nGot:%s\n" % (truth, t_text))
def test_neste_examples(self):
    """Tag alignment on nested-tag examples.

    Assumes tokenization is always on spaces and that source and target
    share the same characters, so the target tokenization mirrors the
    source one.
    """
    examples = [
        ["<a><i>a b</i></a>", [[0, 0], [1, 1]], "<a><i>a b</i></a>"],
        ["<a><i>a b</i></a>", [[0, 1], [1, 0]], "<a><i>b a</i></a>"],
        ["<a>c<i>a b</i></a>", [[0, 0], [1, 2], [2, 1]], "<a>c<i>b a</i></a>"],
        ["<a>c<i>a b</i> d</a>", [[0, 0], [1, 2], [2, 1], [3, 3]], "<a>c<i>b a</i> d</a>"],
        ["<a>c<i>a b</i> e d</a>", [[0, 0], [1, 2], [2, 1], [3, 4], [4, 3]], "<a>c<i>b a</i> d e</a>"],
        ["<a>a<i> b<b> c</b> d</i> e</a>", [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]], "<a>a<i> b<b> c</b> d</i> e</a>"],
        ["<a>a<i> b</i></a>", [[0, 0], [1, 1]], "<a>a<i> b</i></a>"],
    ]
    # nested tags
    # "<a><i>a</i></a> --> <a><i>1</i></a>"
    # "<a>a<i>b</i></a> --> <a>a<i>b</i></a>"
    # "<a>a<i>b</i>c</a> --> <a>a<i>b</i>c</a>"
    for i, (tagged_text, wrd_align, truth) in enumerate(examples):
        with self.subTest(i=i):
            text, s_tags = remove_tags(tagged_text)
            s_token = text.split(" ")
            tok_src = split_with_indices(text, " ")
            # Identity assumption: target tokenization equals source.
            tok_trg = tok_src
            t_token = get_target_tokens(s_token, wrd_align)
            t_tags = align_tags(tagged_text, text, s_token, tok_src,
                                tok_trg, s_tags, t_token, wrd_align=wrd_align)
            rebuilt = insert_tags(" ".join(t_token), t_tags)
            self.assertEqual(
                rebuilt, truth,
                "\n%s\nTruth:\n%s\nGot:\n%s" % (tagged_text, truth, rebuilt))