def test_split_text_and_recombine_en(self):
    """Round-trip every English text through sentence splitting.

    For each file under <data>/texts/en: split it into sentences, check
    that no sentence carries leading/trailing whitespace, then re-join
    the sentences with the skeleton and require an exact byte match
    with the original text.
    """
    directory = get_data_dir() + "/texts/en"
    for i, fn in enumerate(os.listdir(directory)):
        with self.subTest(i=i):
            # FIX: close the file deterministically; the original left
            # the codecs.open handle dangling until GC.
            with codecs.open("%s/%s" % (directory, fn),
                             encoding="utf-8") as f:
                text = f.read()
            sentences, skeleton, _, _ = split_into_sentences(text, lang="en")
            for sentence in sentences:
                self.assertEqual(
                    sentence.strip(), sentence,
                    "Error sentences have extra spaces on edges:\n%s\n%s\n"
                    % (sentence, sentence.strip()))
            new_text = join_sentences(sentences, skeleton)
            self.assertEqual(text, new_text,
                             "\nT: -%s-\nN: -%s-" % (text, new_text))
def parse_data_file(self, file_name):
    """Read alternating sentence/size lines from a data file.

    The file format is: non-blank lines alternate between a sentence
    and an integer size. Sentence lines starting with "#" are treated
    as commented-out entries and skipped together with their size.

    Args:
        file_name: path relative to the data directory.

    Returns:
        (sentences, sizes): two parallel lists of equal length.

    Raises:
        ValueError: if the sentence and size lines do not pair up.
    """
    # FIX: close the file deterministically (the original leaked the
    # codecs.open handle).
    with codecs.open(get_data_dir() + "/%s" % file_name,
                     encoding="utf-8") as f:
        data = [d for d in f.readlines() if d.strip() != ""]
    sentences = []
    sizes = []
    for i, s in enumerate(data):
        if i % 2 == 0:
            sentences.append(s.strip())
        else:
            sizes.append(int(s.strip()))
    # FIX: the original paired the lists with izip, which silently drops
    # the trailing sentence when the file has an odd number of non-blank
    # lines, so its later length check (on lists appended in lockstep)
    # could never fire. Validate BEFORE pairing instead.
    if len(sentences) != len(sizes):
        raise ValueError("Error reading data from file: %s %s"
                         % (len(sentences), len(sizes)))
    new_sentences = []
    new_sizes = []
    # zip (builtin) works on both Python 2 and 3, unlike itertools.izip.
    for sentence, size in zip(sentences, sizes):
        if sentence.startswith("#"):
            continue  # commented-out entry
        new_sentences.append(sentence)
        new_sizes.append(size)
    return new_sentences, new_sizes
def test_example_file(self):
    """Round-trip each inline example through annotation and back.

    Every line of the examples file is converted to the (text, markup,
    wrappers) annotation form and back to inline form; the result must
    equal the original line exactly.
    """
    from unbabel_text_utils.umtf_utils.wrappers_func import umtf_funcs_dict, \
        umtf_wrapper_funcs
    # FIX: close the file deterministically; the original left the
    # codecs.open handle dangling until GC.
    with codecs.open(get_data_dir() + "/inline_examples/examples.txt",
                     encoding="utf-8") as f:
        raw_file = f.read()
    for i, line in enumerate(raw_file.split("\n")):
        with self.subTest(i=i):
            text, markup, wrappers = inline_to_annotation(
                line, umtf_wrapper_funcs, final_wrapper=get_wrappers)
            new_inline = annotation_to_inline(
                text, markup, wrappers, wrappers_dict=umtf_funcs_dict)
            self.assertEqual(
                line, new_inline,
                "%i\nOrig: %s\nNext: %s\n" % (i, line, new_inline))
def test_split_join_tagged_text_en(self):
    """Round-trip tagged English texts through the full pipeline.

    For each file under <data>/tagged_texts/en: strip tag spaces,
    remove tags, split into sentences, verify sentences have no edge
    whitespace and accept tag re-insertion, then join everything back
    and require exact equality with every intermediate representation,
    ending with the original sanitized text.
    """
    directory = get_data_dir() + "/tagged_texts/en"
    for i, fn in enumerate(os.listdir(directory)):
        with self.subTest(i=i):
            # FIX: close the file deterministically; the original left
            # the codecs.open handle dangling until GC.
            with codecs.open("%s/%s" % (directory, fn),
                             encoding="utf-8") as f:
                sanitized_text = f.read()
            tagged_text, tags_skeleton = strip_tag_spaces(sanitized_text)
            text, tags = remove_tags(tagged_text)
            sentences, skeleton, list_tags = split_text(text, "en", tags)
            for sentence in sentences:
                self.assertEqual(
                    sentence.strip(), sentence,
                    "Error sentences have extra spaces on edges:\n%s\n%s\n"
                    % (sentence, sentence.strip()))
            # Every sentence must accept its tags without raising.
            # (zip works on both Python 2 and 3, unlike itertools.izip.)
            for sentence, stags in zip(sentences, list_tags):
                insert_tags(sentence, stags)
            new_text, new_tags = join_text(sentences, skeleton, list_tags)
            msg = ("\noriginal:%s\nnew:%s\noriginal_tags:%s\tsentences:%s\n"
                   "skeleton:-%s-\n%s"
                   % (text, new_text, tags, sentences, skeleton, list_tags))
            self.assertEqual(text, new_text, msg)
            self.assertEqual(tags, new_tags, msg)
            new_tagged_text = insert_tags(new_text, new_tags)
            self.assertEqual(new_tagged_text, tagged_text)
            new_unsanitized_text = unstrip_tag_spaces(new_tagged_text,
                                                      tags_skeleton)
            self.assertEqual(sanitized_text, new_unsanitized_text,
                             sanitized_text)