def test_german_tokenization(self):
    expressions = [
        [
            [
                u"Als ", u"Vertreter ", u"des ", u"One", u"-", u"Nation",
                u"-", u"Konservatismus ", u"bekleidete ", u"er ", u"nach ",
                u"dem ", u"Wahlsieg ", u"der ", u"Tories ", u"1951 ",
                u"als ", u"führendes ", u"Kabinettsmitglied ", u"mehrere ",
                u"wichtige ", u"Regierungsämter", u", ", u"unter ",
                u"anderem ", u"das ", u"des ", u"Verteidigungsministers",
                u", ", u"des ", u"Außenministers ", u"und ", u"des ",
                u"Schatzkanzlers", u". "
            ],
            [
                u"Seine ", u"Amtszeit ", u"als ", u"Premierminister ",
                u"war ", u"innenpolitisch ", u"geprägt ", u"von ",
                u"zahlreichen ", u"Reformen ", u"sowie ", u"einer ",
                u"prosperierenden ", u"Wirtschaft ", u"mit ", u"niedriger ",
                u"Arbeitslosigkeit ", u"und ", u"ungleichmäßigem ",
                u"Wirtschaftswachstum", u". "
            ],
            [
                u"Außenpolitisch ", u"behob ", u"er ", u"die ", u"durch ",
                u"die ", u"Sueskrise ", u"entstandene ", u"Entfremdung ",
                u"mit ", u"den ", u"USA", u", ", u"erreichte ", u"die ",
                u"Lieferung ", u"von ", u"amerikanischen ", u"Polaris",
                u"-", u"Mittelstreckenraketen ", u"als ", u"neuen ",
                u"Kern ", u"der ", u"britischen ", u"nuklearen ",
                u"Abschreckung ", u"und ", u"bereitete ", u"den ", u"Weg ",
                u"für ", u"ein ", u"partielles ",
                u"Atomteststoppabkommen", u"."
            ]
        ]
    ]
    for expression in expressions:
        self.assertEqual(
            sent_tokenize(
                "".join(w for sent in expression for w in sent),
                keep_whitespace=True,
                normalize_ascii=False
            ),
            expression
        )
def test_unequal_quote_detection(self):
    expression = [
        [
            u"Beyoncé", u"'s ", u'vocal ', u'range ', u'spans ',
            u'four ', u'octaves', u'. '
        ],
        [
            u'Jody ', u'Rosen ', u'highlights ', u'her ', u'tone ',
            u'and ', u'timbre ', u'as ', u'particularly ',
            u'distinctive', u', ', u'describing ', u'her ', u'voice ',
            u'as ', u'"', u'one ', u'of ', u'the ', u'most ',
            u'compelling ', u'instruments ', u'in ', u'popular ',
            u'music', u'"', u'. '
        ],
        [
            u'While ', u'another ', u'critic ', u'says ', u'she ',
            u'is ', u'a ', u'"', u'Vocal ', u'acrobat', u', ',
            u'being ', u'able ', u'to ', u'sing ', u'long ', u'and ',
            u'complex ', u'melismas ', u'and ', u'vocal ', u'runs ',
            u'effortlessly', u', ', u'and ', u'in ', u'key', u'. '
        ],
        [
            u'Her ', u'vocal ', u'abilities ', u'mean ', u'she ',
            u'is ', u'identified ', u'as ', u'the ', u'centerpiece ',
            u'of ', u'Destiny', u"'s ", u'Child', u'. '
        ],
        [
            u'The ', u'Daily ', u'Mail ', u'calls ', u"Beyoncé",
            u"'s ", u'voice ', u'"', u'versatile', u'"', u', ',
            u'capable ', u'of ', u'exploring ', u'power ', u'ballads',
            u', ', u'soul', u', ', u'rock ', u'belting', u', ',
            u'operatic ', u'flourishes', u', ', u'and ', u'hip ',
            u'hop', u'. '
        ],
        [
            u'Jon ', u'Pareles ', u'of ', u'The ', u'New ', u'York ',
            u'Times ', u'commented ', u'that ', u'her ', u'voice ',
            u'is ', u'"', u'velvety ', u'yet ', u'tart', u', ',
            u'with ', u'an ', u'insistent ', u'flutter ', u'and ',
            u'reserves ', u'of ', u'soul ', u'belting', u'"', u'. '
        ],
        [
            u'Rosen ', u'notes ', u'that ', u'the ', u'hip ', u'hop ',
            u'era ', u'highly ', u'influenced ', u"Beyoncé", u"'s ",
            u'strange ', u'rhythmic ', u'vocal ', u'style', u', ',
            u'but ', u'also ', u'finds ', u'her ', u'quite ',
            u'traditionalist ', u'in ', u'her ', u'use ', u'of ',
            u'balladry', u', ', u'gospel ', u'and ', u'falsetto', u'. '
        ],
        [
            u'Other ', u'critics ', u'praise ', u'her ', u'range ',
            u'and ', u'power', u', ', u'with ', u'Chris ',
            u'Richards ', u'of ', u'The ', u'Washington ', u'Post ',
            u'saying ', u'she ', u'was ', u'"', u'capable ', u'of ',
            u'punctuating ', u'any ', u'beat ', u'with ', u'goose',
            u'-', u'bump', u'-', u'inducing ', u'whispers ', u'or ',
            u'full', u'-', u'bore ', u'diva', u'-', u'roars', u'.',
            u'"'
        ]
    ]
    self.assertEqual(
        sent_tokenize(
            u"".join(w for sent in expression for w in sent),
            keep_whitespace=True
        ),
        expression
    )
def featurize_example(question, context, vocab):
    # Convert to indices
    question_idxs = [
        vocab.word_to_idx(normalize(w))
        for w in ciseau.tokenize(question, normalize_ascii=False)
    ]
    context_sents = ciseau.sent_tokenize(
        context, keep_whitespace=True, normalize_ascii=False)
    # + 1 for end of sentence
    sent_lengths = [len(sent) + 1 for sent in context_sents]

    context_idxs = []
    for sent in context_sents:
        for w in sent:
            context_idxs.append(vocab.word_to_idx(normalize(w)))
        context_idxs.append(vocab.eos)

    same_as_question = same_as_question_feature(
        question_idxs, context_idxs, vocab)
    repeated_words, repeated_intensity = repeated_word_features(
        context_idxs, vocab)

    return (question_idxs, context_idxs, same_as_question,
            repeated_words, repeated_intensity, sent_lengths), context_sents
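# A minimal sketch of what the featurizer above consumes: with
# keep_whitespace=True, ciseau.sent_tokenize returns one token list per
# sentence, tokens keep their trailing whitespace, and joining them
# reconstructs the input exactly (the same invariant the tests here assert).
import ciseau

context = "The sky is blue. The grass is green."
sents = ciseau.sent_tokenize(context, keep_whitespace=True,
                             normalize_ascii=False)
assert "".join(w for sent in sents for w in sent) == context
# One extra slot per sentence accounts for the EOS marker appended above.
sent_lengths = [len(sent) + 1 for sent in sents]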
def test_period_sequences(self):
    expression = [[
        "Mr. ", "Joe ", "was ", "always ", "late ", "to ", "his ",
        "dates", ", ", "appointments", ", ", "etc.", "."
    ]]
    self.assertEqual(
        sent_tokenize(
            "".join(w for sent in expression for w in sent),
            keep_whitespace=True
        ),
        expression
    )
def test_contained_period_in_quotes(self):
    expression = [[
        "the ", "gray ", "bird ", "(", "which ", "was ", "famous ",
        "for ", "its ", "colors", ".", ") ", "was ", "resurrected ",
        "\" ", "she ", "said", ".", "\""
    ]]
    self.assertEqual(
        sent_tokenize(
            "".join(w for sent in expression for w in sent),
            keep_whitespace=True
        ),
        expression
    )
def test_sentence_detection(self):
    expression = [
        [
            u'Maslow', u'’s ', u'‘‘', u'Third ', u'Force ',
            u'Psychology ', u'Theory', u'’’ ', u'even ', u'allows ',
            u'literary ', u'analysts ', u'to ', u'critically ',
            u'understand ', u'how ', u'characters ', u'reflect ',
            u'the ', u'culture ', u'and ', u'the ', u'history ',
            u'in ', u'which ', u'they ', u'are ', u'contextualized',
            u'. '
        ],
        [
            u'It ', u'also ', u'allows ', u'analysts ', u'to ',
            u'understand ', u'the ', u'author', u'’s ', u'intended ',
            u'message ', u'and ', u'to ', u'understand ', u'the ',
            u'author', u'’s ', u'psychology', u'. '
        ],
        [
            u'The ', u'theory ', u'suggests ', u'that ', u'human ',
            u'beings ', u'possess ', u'a ', u'nature ', u'within ',
            u'them ', u'that ', u'demonstrates ', u'their ', u'true ',
            u'“', u'self', u'” ', u'and ', u'it ', u'suggests ',
            u'that ', u'the ', u'fulfillment ', u'of ', u'this ',
            u'nature ', u'is ', u'the ', u'reason ', u'for ',
            u'living', u'. '
        ],
        [
            u'It ', u'also ', u'suggests ', u'that ',
            u'neurological ', u'development ', u'hinders ',
            u'actualizing ', u'the ', u'nature ', u'because ', u'a ',
            u'person ', u'becomes ', u'estranged ', u'from ', u'his ',
            u'or ', u'her ', u'true ', u'self', u'. '
        ],
        [
            u'Therefore', u', ', u'literary ', u'devices ',
            u'reflect ', u'a ', u'characters', u'’s ', u'and ',
            u'an ', u'author', u'’s ', u'natural ', u'self', u'. '
        ],
        [
            u'In ', u'his ', u'‘‘', u'Third ', u'Force ',
            u'Psychology ', u'and ', u'the ', u'Study ', u'of ',
            u'Literature', u'’’', u', ', u'Paris ', u'argues ', u'“',
            u'D.', u'H ', u'Lawrence', u'’s ', u'“', u'pristine ',
            u'unconscious', u'” ', u'is ', u'a ', u'metaphor ',
            u'for ', u'the ', u'real ', u'self', u'”', u'. '
        ],
        [
            u'Thus ', u'Literature ', u'is ', u'a ', u'reputable ',
            u'tool ', u'that ', u'allows ', u'readers ', u'to ',
            u'develop ', u'and ', u'apply ', u'critical ',
            u'reasoning ', u'to ', u'the ', u'nature ', u'of ',
            u'emotions', u'.'
        ]
    ]
    self.assertEqual(
        sent_tokenize(
            u"".join(w for sent in expression for w in sent),
            keep_whitespace=True
        ),
        expression
    )
def tokenize_example(question, context, answers, strip_labels=True):
    # Q: How should we choose the right answer
    answer = answers[0]["text"]
    answer_start = answers[0]["answer_start"]

    if strip_labels:
        answer_tokens = ciseau.tokenize(answer, normalize_ascii=False)
        start_offset, end_offset = normalize_answer_tokens(answer_tokens)
        answer = "".join(answer_tokens[start_offset:end_offset])
        # add back the piece that was stripped off:
        answer_start = answer_start + len(
            "".join(answer_tokens[:start_offset]))

    # replace answer string with placeholder
    placeholder = "XXXX"
    new_context = (context[:answer_start] + placeholder +
                   context[answer_start + len(answer):])

    token_context = ciseau.sent_tokenize(new_context, keep_whitespace=True)
    token_question = ciseau.tokenize(question)

    sentence_label = None
    for sent_idx, sent in enumerate(token_context):
        answer_start = None
        for idx, word in enumerate(sent):
            if placeholder in word:
                answer_start = idx
                break
        if answer_start is None:
            continue
        sentence_label = sent_idx
        # deal with cases where the answer is in the middle
        # of the word
        answer = word.replace(placeholder, answer)
        token_answer = ciseau.tokenize(answer)
        answer_end = answer_start + len(token_answer) - 1
        answer_sent = (sent[:answer_start] + token_answer +
                       sent[answer_start + 1:])
        break

    token_context[sentence_label] = answer_sent
    return (token_question, token_context, sentence_label,
            answer_start, answer_end)
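# A rough illustration of calling tokenize_example on a SQuAD-style record.
# The record below is made up; strip_labels=False keeps the sketch free of
# the external normalize_answer_tokens helper (it still assumes the
# function above and its ciseau import are in scope).
question = "What color is the sky?"
context = "The sky is blue. The grass is green."
answers = [{"text": "blue", "answer_start": 11}]

q_toks, ctx_sents, sent_label, ans_start, ans_end = tokenize_example(
    question, context, answers, strip_labels=False)
# sent_label indexes the sentence containing the answer; ans_start/ans_end
# are token offsets within that sentence.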
def convert(article_name, doc, collection, wiki_trie, anchor_trie,
            trie_index2indices, trie_index2indices_counts,
            trie_index2indices_transitions, redirections, prefix):
    doc = doc.replace("\t", " ")
    # remove ref tags:
    doc = re.sub(ref_pattern, " ", doc)
    doc = re.sub(double_bracket_pattern, " ", doc)
    doc = re.sub(title_pattern, r"\n\n\1\. ", doc)
    doc = re.sub(bullet_point_pattern, r"\1 ", doc)

    article_index = match_wikipedia_to_wikidata(
        article_name, wiki_trie, redirections, prefix
    )

    # find location of tagged items in wikipedia:
    annotated = annotate_document(
        doc, collection, wiki_trie, anchor_trie, trie_index2indices,
        trie_index2indices_counts, trie_index2indices_transitions,
        redirections, prefix
    )

    text_without_brackets = "".join(text for text, _ in annotated)
    sentences = ciseau.sent_tokenize(
        text_without_brackets,
        normalize_ascii=False,
        keep_whitespace=True
    )

    return (
        convert_document_to_labeled_tags(annotated, sentences),
        collection.ids[article_index] if article_index is not None else "other"
    )
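# The regex patterns referenced in convert() are defined elsewhere; a
# hypothetical ref_pattern for stripping <ref>...</ref> markup might look
# like this (purely illustrative, not the actual pattern from the source):
import re

ref_pattern = re.compile(r"<ref[^>]*>.*?</ref>", re.DOTALL)
print(re.sub(ref_pattern, " ", "Paris<ref>cite web</ref> is the capital."))
# -> "Paris  is the capital."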
def test_spanish_tokenization(self):
    expressions = [
        [
            [
                u"Pero ", u"si ", u"no ", u"es ", u"el ", u"caso",
                u", ", u"llega ", u"el ", u"momento ", u"de ",
                u"hacerse ", u"la ", u"pregunta ", u"de ", u"cada ",
                u"año", u". "
            ],
            [
                u"¿", u"Qué ", u"hago ", u"con ", u"estos ",
                u"sobres ", u"de ", u"jamón ", u"o ", u"este ",
                u"lomo ", u"ibérico", u"? "
            ],
            [
                u"¿", u"Los ", u"puedo ", u"congelar ", u"o ",
                u"es ", u"una ", u"aberración", u"? "
            ],
            [
                u"La ", u"respuesta ", u"rápida ", u"sería ", u"un ",
                u"sí", u"."
            ]
        ],
        [
            [
                u"De ", u"hecho", u", ", u"es ", u"algo ", u"que ",
                u"lleva ", u"mucho ", u"tiempo ", u"haciéndose", u". "
            ],
            [
                u"En ", u"las ", u"matanzas ", u"de ", u"los ",
                u"pueblos ", u"muchas ", u"piezas ", u"se ",
                u"congelan ", u"una ", u"vez ", u"curadas ", u"para ",
                u"ir ", u"luego ", u"dándoles ", u"salida ", u"a ",
                u"lo ", u"largo ", u"de ", u"todo ", u"el ", u"año",
                u". "
            ],
            [
                u"Otro ", u"ejemplo ", u"clásico", u": ", u"las ",
                u"embarazas ", u"que ", u"quieren ", u"evitar ",
                u"cualquier ", u"posible ", u"riesgo ", u"de ",
                u"toxoplasmosis ", u"pero ", u"no ", u"quieren ",
                u"renunciar ", u"a ", u"los ", u"embutidos ",
                u"durante ", u"eso ", u"nueve ", u"meses", u". "
            ],
            [u"¿", u"Solución", u"? "],
            [u"Congelarlo", u"."]
        ],
        [
            [
                u"Que ", u"lo ", u"sepas", u", ", u"¡", u"no ",
                u"pienso ", u"hacerlo ", u"todo ", u"yo ", u"sola",
                u"!"
            ]
        ],
        [
            [
                u"¡", u"No ", u"pienso ", u"hacerlo ", u"todo ",
                u"yo ", u"sola", u", ", u"que ", u"lo ", u"sepas",
                u"!"
            ]
        ],
        [
            [u"¡", u"No ", u"me ", u"digas ", u"nada", u"! "],
            [u"¡", u"Te ", u"has ", u"portado ", u"fatal", u"! "],
            [
                u"¡", u"No ", u"quiero ", u"volver ", u"a ",
                u"saber ", u"nada ", u"de ", u"ti", u"!"
            ]
        ],
        [[u"¡¡¡", u"Al ", u"ladrón", u"!!!"]]
    ]
    for expression in expressions:
        self.assertEqual(
            sent_tokenize(
                "".join(w for sent in expression for w in sent),
                keep_whitespace=True
            ),
            expression
        )
import io

from ciseau import sent_tokenize


def condenseSentences(obj):
    if isinstance(obj[0], str):
        return obj
    elif isinstance(obj[0], list):
        # flatten a list of sentences into a single list of words
        return [item for sublist in obj for item in sublist]
    else:
        raise ValueError


# CONVERT_VI / CONVERT_CH are configuration flags defined elsewhere
# in the script.
if CONVERT_VI:
    fileIn = io.open('vi_untokenized.txt', 'r', encoding='utf-8')
    data = fileIn.readlines()
    data = [
        condenseSentences(sent_tokenize(line, keep_whitespace=False))
        for line in data
    ]
    data = [' '.join(words) for words in data]
    fileOut = io.open('vi_tokenized.txt', 'w', encoding='utf-8')
    for line in data:
        fileOut.write(line.strip() + '\n')
    fileIn.close()
    fileOut.close()

if CONVERT_CH:
    fileIn = io.open('ch_untokenized.txt', 'r', encoding='utf-8')
    data = fileIn.readlines()
    # split each line into space-separated characters
    data = [' '.join(chars) for chars in data]
    fileOut = io.open('ch_tokenized.txt', 'w', encoding='utf-8')
    for line in data:
        fileOut.write(line.strip() + '\n')
    fileIn.close()
    fileOut.close()
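# Why the flattening step above is needed: even with keep_whitespace=False,
# sent_tokenize returns one token list per sentence, so a multi-sentence
# line comes back nested (the output in the comment is indicative, not
# verbatim):
from ciseau import sent_tokenize

nested = sent_tokenize("First sentence. Second sentence.",
                       keep_whitespace=False)
# roughly: [["First", "sentence", "."], ["Second", "sentence", "."]]
flat = [item for sublist in nested for item in sublist]
print(' '.join(flat))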
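# All of the whitespace-preserving tests above exercise the same round-trip
# invariant, which can be spot-checked directly on any text (this sketch
# reuses a sentence from the Spanish fixtures):
from ciseau import sent_tokenize

text = u"¡No me digas nada! ¡Te has portado fatal!"
sentences = sent_tokenize(text, keep_whitespace=True)
# Concatenating every token of every sentence reproduces the input exactly.
assert u"".join(w for sent in sentences for w in sent) == text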