def test_german_tokenization(self):
     expressions = [
         [
             [
                 u"Als ", u"Vertreter ", u"des ", u"One", u"-", u"Nation", u"-", u"Konservatismus ",
                 u"bekleidete ", u"er ", u"nach ", u"dem ", u"Wahlsieg ", u"der ", u"Tories ", u"1951 ",
                 u"als ", u"führendes ", u"Kabinettsmitglied ", u"mehrere ", u"wichtige ", u"Regierungsämter",
                 u", ", u"unter ", u"anderem ", u"das ", u"des ", u"Verteidigungsministers", u", ", u"des ",
                 u"Außenministers ", u"und ", u"des ", u"Schatzkanzlers", u". "
             ],
             [
                 u"Seine ", u"Amtszeit ", u"als ",
                 u"Premierminister ", u"war ", u"innenpolitisch ", u"geprägt ", u"von ", u"zahlreichen ", u"Reformen ",
                 u"sowie ", u"einer ", u"prosperierenden ", u"Wirtschaft ", u"mit ", u"niedriger ", u"Arbeitslosigkeit ",
                 u"und ", u"ungleichmäßigem ", u"Wirtschaftswachstum", u". ",
             ],
             [
                 u"Außenpolitisch ", u"behob ", u"er ", u"die ", u"durch ", u"die ", u"Sueskrise ", u"entstandene ",
                 u"Entfremdung ", u"mit ", u"den ", u"USA", u", ", u"erreichte ", u"die ", u"Lieferung ", u"von ",
                 u"amerikanischen ", u"Polaris", u"-", u"Mittelstreckenraketen ", u"als ", u"neuen ", u"Kern ",
                 u"der ", u"britischen ", u"nuklearen ", u"Abschreckung ", u"und ", u"bereitete ", u"den ", u"Weg ",
                 u"für ", u"ein ", u"partielles ", u"Atomteststoppabkommen", u"."
             ]
         ]
     ]
     for expression in expressions:
         self.assertEqual(
             sent_tokenize(
                 "".join(w for sent in expression for w in sent),
                 keep_whitespace=True,
                 normalize_ascii=False
             ),
             expression
         )
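All of the tokenizer tests in these examples assert the same round-trip property: with keep_whitespace=True, sent_tokenize returns a list of sentences, each a list of tokens that keep their trailing whitespace, so joining every token reproduces the input string. A minimal standalone sketch of that property (the token boundaries mentioned in the comment are indicative, not taken from the test suite):

import ciseau

text = "Dr. Smith arrived at 10 a.m. She was early."
sentences = ciseau.sent_tokenize(text, keep_whitespace=True)

# sentences is a list of sentences, each a list of tokens that keep their
# trailing whitespace, e.g. [["Dr. ", "Smith ", ...], ["She ", ...]].
print(sentences)

# Because whitespace is preserved, the tokens concatenate back to the input:
assert "".join(tok for sent in sentences for tok in sent) == text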
Example #2
 def test_unequal_quote_detection(self):
     expression = [
         [
             u"Beyoncé", u"'s ", u'vocal ', u'range ', u'spans ', u'four ',
             u'octaves', u'. '
         ],
         [
             u'Jody ', u'Rosen ', u'highlights ', u'her ', u'tone ',
             u'and ', u'timbre ', u'as ', u'particularly ', u'distinctive',
             u', ', u'describing ', u'her ', u'voice ', u'as ', u'"',
             u'one ', u'of ', u'the ', u'most ', u'compelling ',
             u'instruments ', u'in ', u'popular ', u'music', u'"', u'. '
         ],
         [
             u'While ', u'another ', u'critic ', u'says ', u'she ', u'is ',
             u'a ', u'"', u'Vocal ', u'acrobat', u', ', u'being ', u'able ',
             u'to ', u'sing ', u'long ', u'and ', u'complex ', u'melismas ',
             u'and ', u'vocal ', u'runs ', u'effortlessly', u', ', u'and ',
             u'in ', u'key', u'. '
         ],
         [
             u'Her ', u'vocal ', u'abilities ', u'mean ', u'she ', u'is ',
             u'identified ', u'as ', u'the ', u'centerpiece ', u'of ',
             u'Destiny', u"'s ", u'Child', u'. '
         ],
         [
             u'The ', u'Daily ', u'Mail ', u'calls ', u"Beyoncé", u"'s ",
             u'voice ', u'"', u'versatile', u'"', u', ', u'capable ',
             u'of ', u'exploring ', u'power ', u'ballads', u', ', u'soul',
             u', ', u'rock ', u'belting', u', ', u'operatic ',
             u'flourishes', u', ', u'and ', u'hip ', u'hop', u'. '
         ],
         [
             u'Jon ', u'Pareles ', u'of ', u'The ', u'New ', u'York ',
             u'Times ', u'commented ', u'that ', u'her ', u'voice ', u'is ',
             u'"', u'velvety ', u'yet ', u'tart', u', ', u'with ', u'an ',
             u'insistent ', u'flutter ', u'and ', u'reserves ', u'of ',
             u'soul ', u'belting', u'"', u'. '
         ],
         [
             u'Rosen ', u'notes ', u'that ', u'the ', u'hip ', u'hop ',
             u'era ', u'highly ', u'influenced ', u"Beyoncé", u"'s ",
             u'strange ', u'rhythmic ', u'vocal ', u'style', u', ', u'but ',
             u'also ', u'finds ', u'her ', u'quite ', u'traditionalist ',
             u'in ', u'her ', u'use ', u'of ', u'balladry', u', ',
             u'gospel ', u'and ', u'falsetto', u'. '
         ],
         [
             u'Other ', u'critics ', u'praise ', u'her ', u'range ',
             u'and ', u'power', u', ', u'with ', u'Chris ', u'Richards ',
             u'of ', u'The ', u'Washington ', u'Post ', u'saying ', u'she ',
             u'was ', u'"', u'capable ', u'of ', u'punctuating ', u'any ',
             u'beat ', u'with ', u'goose', u'-', u'bump', u'-',
             u'inducing ', u'whispers ', u'or ', u'full', u'-', u'bore ',
             u'diva', u'-', u'roars', u'.', u'"'
         ]
     ]
     self.assertEqual(
         sent_tokenize(u"".join(w for sent in expression for w in sent),
                       keep_whitespace=True), expression)
Example #3
def featurize_example(question, context, vocab):
    # Convert to indices
    question_idxs = [
        vocab.word_to_idx(normalize(w))
        for w in ciseau.tokenize(question, normalize_ascii=False)
    ]

    context_sents = ciseau.sent_tokenize(context,
                                         keep_whitespace=True,
                                         normalize_ascii=False)
    # + 1 for end of sentence
    sent_lengths = [len(sent) + 1 for sent in context_sents]
    context_idxs = []
    for sent in context_sents:
        for w in sent:
            context_idxs.append(vocab.word_to_idx(normalize(w)))
        context_idxs.append(vocab.eos)

    same_as_question = same_as_question_feature(question_idxs, context_idxs,
                                                vocab)
    repeated_words, repeated_intensity = repeated_word_features(
        context_idxs, vocab)

    return (question_idxs, context_idxs, same_as_question, repeated_words,
            repeated_intensity, sent_lengths), context_sents
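Note that normalize, same_as_question_feature and repeated_word_features are project-specific helpers, not part of ciseau, and are not shown in this excerpt. A rough sketch of what such features typically compute, offered purely as an assumption about this code:

from collections import Counter

# Hypothetical sketches -- the real helpers in this project may differ.

def same_as_question_feature(question_idxs, context_idxs, vocab):
    # 1.0 for every context token whose vocabulary index also appears in the
    # question, 0.0 otherwise (vocab is unused in this simplified sketch).
    question_set = set(question_idxs)
    return [1.0 if idx in question_set else 0.0 for idx in context_idxs]


def repeated_word_features(context_idxs, vocab):
    # For each context token: a flag for "occurs more than once in the
    # context" and its count normalized by the context length.
    counts = Counter(context_idxs)
    repeated = [1.0 if counts[idx] > 1 else 0.0 for idx in context_idxs]
    intensity = [counts[idx] / float(len(context_idxs)) for idx in context_idxs]
    return repeated, intensity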
Example #4
 def test_period_sequences(self):
     expression = [[
         "Mr. ", "Joe ", "was ", "always ", "late ", "to ", "his ", "dates",
         ", ", "appointments", ", ", "etc.", "."
     ]]
     self.assertEqual(
         sent_tokenize("".join(w for sent in expression for w in sent),
                       keep_whitespace=True), expression)
Example #5
 def test_contained_period_in_quotes(self):
     expression = [[
         "the ", "gray ", "bird ", "(", "which ", "was ", "famous ", "for ",
         "its ", "colors", ".", ") ", "was ", "ressurected ", "\" ", "she ",
         "said", ".", "\""
     ]]
     self.assertEqual(
         sent_tokenize("".join(w for sent in expression for w in sent),
                       keep_whitespace=True), expression)
Example #6
 def test_sentence_detection(self):
     expression = [
         [
             u'Maslow', u'’s ', u'‘‘', u'Third ', u'Force ', u'Psychology ',
             u'Theory', u'’’ ', u'even ', u'allows ', u'literary ',
             u'analysts ', u'to ', u'critically ', u'understand ', u'how ',
             u'characters ', u'reflect ', u'the ', u'culture ', u'and ',
             u'the ', u'history ', u'in ', u'which ', u'they ', u'are ',
             u'contextualized', u'. '
         ],
         [
             u'It ', u'also ', u'allows ', u'analysts ', u'to ',
             u'understand ', u'the ', u'author', u'’s ', u'intended ',
             u'message ', u'and ', u'to ', u'understand ', u'the ',
             u'author', u'’s ', u'psychology', u'. '
         ],
         [
             u'The ', u'theory ', u'suggests ', u'that ', u'human ',
             u'beings ', u'possess ', u'a ', u'nature ', u'within ',
             u'them ', u'that ', u'demonstrates ', u'their ', u'true ',
             u'“', u'self', u'” ', u'and ', u'it ', u'suggests ', u'that ',
             u'the ', u'fulfillment ', u'of ', u'this ', u'nature ', u'is ',
             u'the ', u'reason ', u'for ', u'living', u'. '
         ],
         [
             u'It ', u'also ', u'suggests ', u'that ', u'neurological ',
             u'development ', u'hinders ', u'actualizing ', u'the ',
             u'nature ', u'because ', u'a ', u'person ', u'becomes ',
             u'estranged ', u'from ', u'his ', u'or ', u'her ', u'true ',
             u'self', u'. '
         ],
         [
             u'Therefore', u', ', u'literary ', u'devices ', u'reflect ',
             u'a ', u'characters', u'’s ', u'and ', u'an ', u'author',
             u'’s ', u'natural ', u'self', u'. '
         ],
         [
             u'In ', u'his ', u'‘‘', u'Third ', u'Force ', u'Psychology ',
             u'and ', u'the ', u'Study ', u'of ', u'Literature', u'’’',
             u', ', u'Paris ', u'argues ', u'“', u'D.', u'H ', u'Lawrence',
             u'’s ', u'“', u'pristine ', u'unconscious', u'” ', u'is ',
             u'a ', u'metaphor ', u'for ', u'the ', u'real ', u'self', u'”',
             u'. '
         ],
         [
             u'Thus ', u'Literature ', u'is ', u'a ', u'reputable ',
             u'tool ', u'that ', u'allows ', u'readers ', u'to ',
             u'develop ', u'and ', u'apply ', u'critical ', u'reasoning ',
             u'to ', u'the ', u'nature ', u'of ', u'emotions', u'.'
         ]
     ]
     self.assertEqual(
         sent_tokenize(u"".join(w for sent in expression for w in sent),
                       keep_whitespace=True), expression)
 def test_period_sequences(self):
     expression = [[
         "Mr. ", "Joe ", "was ", "always ", "late ", "to ", "his ",
         "dates", ", ", "appointments", ", ", "etc.", "."
     ]]
     self.assertEqual(
         sent_tokenize(
             "".join(w for sent in expression for w in sent),
             keep_whitespace=True
         ),
         expression
     )
 def test_contained_period_in_quotes(self):
     expression = [[
         "the ", "gray ", "bird ", "(", "which ", "was ",
         "famous ", "for ", "its ", "colors", ".", ") ",
         "was ", "ressurected ", "\" ", "she ", "said", ".", "\""
     ]]
     self.assertEqual(
         sent_tokenize(
             "".join(w for sent in expression for w in sent),
             keep_whitespace=True
         ),
         expression
     )
Example #9
 def test_german_tokenization(self):
     expressions = [
         [
             [
                 u"Als ", u"Vertreter ", u"des ", u"One", u"-", u"Nation", u"-", u"Konservatismus ",
                 u"bekleidete ", u"er ", u"nach ", u"dem ", u"Wahlsieg ", u"der ", u"Tories ", u"1951 ",
                 u"als ", u"führendes ", u"Kabinettsmitglied ", u"mehrere ", u"wichtige ", u"Regierungsämter",
                 u", ", u"unter ", u"anderem ", u"das ", u"des ", u"Verteidigungsministers", u", ", u"des ",
                 u"Außenministers ", u"und ", u"des ", u"Schatzkanzlers", u". "
             ],
             [
                 u"Seine ", u"Amtszeit ", u"als ", u"Premierminister ", u"war ", u"innenpolitisch ",
                 u"geprägt ", u"von ", u"zahlreichen ", u"Reformen ", u"sowie ", u"einer ",
                 u"prosperierenden ", u"Wirtschaft ", u"mit ", u"niedriger ", u"Arbeitslosigkeit ",
                 u"und ", u"ungleichmäßigem ", u"Wirtschaftswachstum", u". ",
             ],
             [
                 u"Außenpolitisch ", u"behob ", u"er ", u"die ", u"durch ", u"die ", u"Sueskrise ", u"entstandene ",
                 u"Entfremdung ", u"mit ", u"den ", u"USA", u", ", u"erreichte ", u"die ", u"Lieferung ", u"von ",
                 u"amerikanischen ", u"Polaris", u"-", u"Mittelstreckenraketen ", u"als ", u"neuen ", u"Kern ",
                 u"der ", u"britischen ", u"nuklearen ", u"Abschreckung ", u"und ", u"bereitete ", u"den ", u"Weg ",
                 u"für ", u"ein ", u"partielles ", u"Atomteststoppabkommen", u"."
             ]
         ]
     ]
     for expression in expressions:
         self.assertEqual(
             sent_tokenize("".join(w for sent in expression for w in sent),
                           keep_whitespace=True,
                           normalize_ascii=False), expression)
Example #10
def tokenize_example(question, context, answers, strip_labels=True):
    # Q: How should we choose the right answer
    answer = answers[0]["text"]
    answer_start = answers[0]["answer_start"]

    if strip_labels:
        answer_tokens = ciseau.tokenize(answer, normalize_ascii=False)
        start_offset, end_offset = normalize_answer_tokens(answer_tokens)
        answer = "".join(answer_tokens[start_offset:end_offset])
        # add back the piece that was stripped off:
        answer_start = answer_start + len("".join(
            answer_tokens[:start_offset]))

    # replace answer string with placeholder
    placeholder = "XXXX"
    new_context = context[:answer_start] + placeholder + context[answer_start +
                                                                 len(answer):]

    token_context = ciseau.sent_tokenize(new_context, keep_whitespace=True)
    token_question = ciseau.tokenize(question)

    sentence_label = None
    for sent_idx, sent in enumerate(token_context):
        answer_start = None
        for idx, word in enumerate(sent):
            if placeholder in word:
                answer_start = idx
                break

        if answer_start is None:
            continue

        sentence_label = sent_idx

        # deal with cases where the answer is in the middle
        # of the word
        answer = word.replace(placeholder, answer)
        token_answer = ciseau.tokenize(answer)

        answer_end = answer_start + len(token_answer) - 1
        answer_sent = sent[:answer_start] + token_answer + sent[answer_start +
                                                                1:]
        break

    token_context[sentence_label] = answer_sent

    return token_question, token_context, sentence_label, answer_start, answer_end
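normalize_answer_tokens is likewise project-specific and not shown here; from its use above it returns a (start_offset, end_offset) pair into answer_tokens so that leading and trailing junk can be stripped from the labeled answer span. A plausible sketch, under that assumption only:

import string

# Hypothetical sketch -- the real normalize_answer_tokens may differ.

def normalize_answer_tokens(answer_tokens):
    # Trim tokens that are only whitespace or punctuation from both ends of
    # the tokenized answer, returning slice offsets into the token list.
    def is_junk(token):
        stripped = token.strip()
        return stripped == "" or all(c in string.punctuation for c in stripped)

    start, end = 0, len(answer_tokens)
    while start < end and is_junk(answer_tokens[start]):
        start += 1
    while end > start and is_junk(answer_tokens[end - 1]):
        end -= 1
    return start, end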
 def test_unequal_quote_detection(self):
     expression = [
         [u"Beyoncé", u"'s ", u'vocal ', u'range ', u'spans ', u'four ', u'octaves',
          u'. '],
         [u'Jody ', u'Rosen ', u'highlights ', u'her ', u'tone ', u'and ',
          u'timbre ', u'as ', u'particularly ', u'distinctive', u', ',
          u'describing ', u'her ', u'voice ', u'as ', u'"', u'one ', u'of ',
          u'the ', u'most ', u'compelling ', u'instruments ', u'in ',
          u'popular ', u'music', u'"', u'. '
         ],
         [u'While ', u'another ', u'critic ', u'says ', u'she ', u'is ',
          u'a ', u'"', u'Vocal ', u'acrobat', u', ', u'being ', u'able ',
          u'to ', u'sing ', u'long ', u'and ', u'complex ', u'melismas ',
          u'and ', u'vocal ', u'runs ', u'effortlessly', u', ', u'and ',
          u'in ', u'key', u'. '],
         [u'Her ', u'vocal ', u'abilities ', u'mean ', u'she ', u'is ',
          u'identified ', u'as ', u'the ', u'centerpiece ', u'of ', u'Destiny',
          u"'s ", u'Child', u'. '],
         [u'The ', u'Daily ', u'Mail ', u'calls ', u"Beyoncé", u"'s ", u'voice ',
          u'"', u'versatile', u'"', u', ', u'capable ', u'of ', u'exploring ',
          u'power ', u'ballads', u', ', u'soul', u', ', u'rock ', u'belting',
          u', ', u'operatic ', u'flourishes', u', ', u'and ', u'hip ', u'hop',
          u'. '],
         [u'Jon ', u'Pareles ', u'of ', u'The ', u'New ', u'York ', u'Times ',
          u'commented ', u'that ', u'her ', u'voice ', u'is ', u'"', u'velvety ',
          u'yet ', u'tart', u', ', u'with ', u'an ', u'insistent ', u'flutter ',
          u'and ', u'reserves ', u'of ', u'soul ', u'belting', u'"', u'. '],
         [u'Rosen ', u'notes ', u'that ', u'the ', u'hip ', u'hop ', u'era ',
          u'highly ', u'influenced ', u"Beyoncé", u"'s ", u'strange ', u'rhythmic ',
          u'vocal ', u'style', u', ', u'but ', u'also ', u'finds ', u'her ',
          u'quite ', u'traditionalist ', u'in ', u'her ', u'use ', u'of ',
          u'balladry', u', ', u'gospel ', u'and ', u'falsetto', u'. '],
         [u'Other ', u'critics ', u'praise ', u'her ', u'range ', u'and ',
          u'power', u', ', u'with ', u'Chris ', u'Richards ', u'of ', u'The ',
          u'Washington ', u'Post ', u'saying ', u'she ', u'was ', u'"',
          u'capable ', u'of ', u'punctuating ', u'any ', u'beat ', u'with ',
          u'goose', u'-', u'bump', u'-', u'inducing ', u'whispers ', u'or ',
          u'full', u'-', u'bore ', u'diva', u'-', u'roars', u'.', u'"']
     ]
     self.assertEqual(
         sent_tokenize(
             u"".join(w for sent in expression for w in sent),
             keep_whitespace=True
         ),
         expression
     )
def convert(article_name,
            doc,
            collection,
            wiki_trie,
            anchor_trie,
            trie_index2indices,
            trie_index2indices_counts,
            trie_index2indices_transitions,
            redirections,
            prefix):
    doc = doc.replace("\t", " ")
    # remove ref tags:
    doc = re.sub(ref_pattern, " ", doc)
    doc = re.sub(double_bracket_pattern, " ", doc)
    doc = re.sub(title_pattern, r"\n\n\1\. ", doc)
    doc = re.sub(bullet_point_pattern, r"\1 ", doc)

    article_index = match_wikipedia_to_wikidata(
        article_name, wiki_trie, redirections, prefix
    )
    # find location of tagged items in wikipedia:
    annotated = annotate_document(doc,
                                  collection,
                                  wiki_trie,
                                  anchor_trie,
                                  trie_index2indices,
                                  trie_index2indices_counts,
                                  trie_index2indices_transitions,
                                  redirections,
                                  prefix)
    text_without_brackets = "".join(text for text, _ in annotated)
    sentences = ciseau.sent_tokenize(
        text_without_brackets,
        normalize_ascii=False,
        keep_whitespace=True
    )
    return (
        convert_document_to_labeled_tags(
            annotated, sentences
        ),
        collection.ids[article_index] if article_index is not None else "other"
    )
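convert_document_to_labeled_tags presumably has to align the annotated spans with the sentences returned by ciseau. Since keep_whitespace=True preserves every character, per-token character offsets into text_without_brackets can be recovered simply by accumulating token lengths; a small sketch of that idea (not the project's actual alignment code):

def token_character_offsets(sentences):
    # With keep_whitespace=True the tokens concatenate back to the original
    # text, so running lengths give exact (start, end) character offsets.
    offsets = []
    position = 0
    for sentence in sentences:
        sentence_offsets = []
        for token in sentence:
            sentence_offsets.append((position, position + len(token)))
            position += len(token)
        offsets.append(sentence_offsets)
    return offsets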
 def test_sentence_detection(self):
     expression = [
         [u'Maslow', u'’s ', u'‘‘', u'Third ', u'Force ', u'Psychology ',
          u'Theory', u'’’ ', u'even ', u'allows ', u'literary ', u'analysts ',
          u'to ', u'critically ', u'understand ', u'how ', u'characters ',
          u'reflect ', u'the ', u'culture ', u'and ', u'the ', u'history ',
          u'in ', u'which ', u'they ', u'are ', u'contextualized', u'. '],
         [u'It ', u'also ', u'allows ', u'analysts ', u'to ', u'understand ',
          u'the ', u'author', u'’s ', u'intended ', u'message ', u'and ', u'to ',
          u'understand ', u'the ', u'author', u'’s ', u'psychology', u'. '],
         [u'The ', u'theory ', u'suggests ', u'that ', u'human ', u'beings ',
          u'possess ', u'a ', u'nature ', u'within ', u'them ', u'that ',
          u'demonstrates ', u'their ', u'true ', u'“', u'self', u'” ', u'and ',
          u'it ', u'suggests ', u'that ', u'the ', u'fulfillment ', u'of ',
          u'this ', u'nature ', u'is ', u'the ', u'reason ', u'for ', u'living',
          u'. '],
         [u'It ', u'also ', u'suggests ', u'that ', u'neurological ',
          u'development ', u'hinders ', u'actualizing ', u'the ', u'nature ',
          u'because ', u'a ', u'person ', u'becomes ', u'estranged ', u'from ',
          u'his ', u'or ', u'her ', u'true ', u'self', u'. '],
         [u'Therefore', u', ', u'literary ', u'devices ', u'reflect ', u'a ',
          u'characters', u'’s ', u'and ', u'an ', u'author', u'’s ', u'natural ',
          u'self', u'. '],
         [u'In ', u'his ', u'‘‘', u'Third ', u'Force ', u'Psychology ', u'and ',
          u'the ', u'Study ', u'of ', u'Literature', u'’’', u', ', u'Paris ',
          u'argues ', u'“', u'D.', u'H ', u'Lawrence', u'’s ', u'“', u'pristine ',
          u'unconscious', u'” ', u'is ', u'a ', u'metaphor ', u'for ', u'the ',
          u'real ', u'self', u'”', u'. '],
         [u'Thus ', u'Literature ', u'is ', u'a ', u'reputable ', u'tool ',
          u'that ', u'allows ', u'readers ', u'to ', u'develop ', u'and ',
          u'apply ', u'critical ', u'reasoning ', u'to ', u'the ', u'nature ',
          u'of ', u'emotions', u'.']
     ]
     self.assertEqual(
         sent_tokenize(
             u"".join(w for sent in expression for w in sent),
             keep_whitespace=True
         ),
         expression
     )
 def test_spanish_tokenization(self):
     expressions = [
         [
             [
                 u"Pero ", u"si ", u"no ", u"es ", u"el ", u"caso", u", ", u"llega ",
                 u"el ", u"momento ", u"de ", u"hacerse ", u"la ", u"pregunta ", u"de ",
                 u"cada ", u"año", u". "
             ],
             [
                 u"¿", u"Qué ", u"hago ", u"con ", u"estos ", u"sobres ", u"de ", u"jamón ",
                 u"o ", u"este ", u"lomo ", u"ibérico", u"? "
             ],
             [
                 u"¿", u"Los ", u"puedo ", u"congelar ", u"o ", u"es ", u"una ", u"aberración",
                 u"? ",
             ],
             [
                 u"La ", u"respuesta ", u"rápida ", u"sería ", u"un ", u"sí", u"."
             ]
         ],
         [
             [
                 u"De ", u"hecho", u", ", u"es ", u"algo ", u"que ", u"lleva ", u"mucho ", u"tiempo ",
                 u"haciéndose", u". "
             ],
             [
                 u"En ", u"las ", u"matanzas ", u"de ", u"los ", u"pueblos ", u"muchas ", u"piezas ",
                 u"se ", u"congelan ", u"una ", u"vez ", u"curadas ", u"para ", u"ir ", u"luego ",
                 u"dándoles ", u"salida ", u"a ", u"lo ", u"largo ", u"de ", u"todo ", u"el ", u"año",
                 u". "
             ],
             [
                 u"Otro ", u"ejemplo ", u"clásico", u": ", u"las ", u"embarazas ", u"que ", u"quieren ",
                 u"evitar ", u"cualquier ", u"posible ", u"riesgo ", u"de ", u"toxoplasmosis ", u"pero ",
                 u"no ", u"quieren ", u"renunciar ", u"a ", u"los ", u"embutidos ", u"durante ", u"eso ",
                 u"nueve ", u"meses", u". "
             ],
             [
                 u"¿", u"Solución", u"? "
             ],
             [
                 u"Congelarlo", u"."
             ]
         ],
         [
             [
                 u"Que ", u"lo ", u"sepas", u", ", u"¡", u"no ", u"pienso ", u"hacerlo ", u"todo ", u"yo ",
                 u"sola", u"!"
             ]
         ],
         [
             [
                 u"¡", u"No ", u"pienso ", u"hacerlo ", u"todo ", u"yo ", u"sola", u", ", u"que ", u"lo ",
                 u"sepas", u"!"
             ]
         ],
         [
             [
                 u"¡", u"No ", u"me ", u"digas ", u"nada", u"! "
             ],
             [
                 u"¡", u"Te ", u"has ", u"portado ", u"fatal", u"! "
             ],
             [
                 u"¡", u"No ", u"quiero ", u"volver ", u"a ", u"saber ", u"nada ", u"de ", u"ti", u"!"
             ]
         ],
         [
             [
                 u"¡¡¡", u"Al ", u"ladrón", u"!!!"
             ]
         ]
     ]
     for expression in expressions:
         self.assertEqual(
             sent_tokenize(
                 "".join(w for sent in expression for w in sent),
                 keep_whitespace=True
             ),
             expression
         )

def condenseSentences(obj):
    if (isinstance(obj[0], str)):
        return obj
    elif (isinstance(obj[0], list)):
        return [item for sublist in obj for item in sublist]
    else:
        raise ValueError
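sent_tokenize returns one list per sentence, so condenseSentences flattens that nesting into a single token list, for example:

# One level of nesting is flattened (illustrative input/output):
condenseSentences([["Xin ", "chào ", ". "], ["Tạm ", "biệt ", "."]])
# -> ["Xin ", "chào ", ". ", "Tạm ", "biệt ", "."]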


if (CONVERT_VI):
    fileIn = io.open('vi_untokenized.txt', 'r', encoding='utf-8')
    data = fileIn.readlines()
    data = [
        condenseSentences(sent_tokenize(line, keep_whitespace=False))
        for line in data
    ]
    data = [' '.join(words) for words in data]
    fileOut = io.open('vi_tokenized.txt', 'w', encoding='utf-8')
    for line in data:
        fileOut.write(line.strip() + '\n')
    fileIn.close()
    fileOut.close()

if (CONVERT_CH):
    fileIn = io.open('ch_untokenized.txt', 'r', encoding='utf-8')
    data = fileIn.readlines()
    data = [' '.join(chars) for chars in data]
    fileOut = io.open('ch_tokenized.txt', 'w', encoding='utf-8')
    for line in data:
        fileOut.write(line.strip() + '\n')
    fileIn.close()
    fileOut.close()
Example #16
 def test_spanish_tokenization(self):
     expressions = [
         [
             [
                 u"Pero ", u"si ", u"no ", u"es ", u"el ", u"caso", u", ", u"llega ",
                 u"el ", u"momento ", u"de ", u"hacerse ", u"la ", u"pregunta ", u"de ",
                 u"cada ", u"año", u". "
             ],
             [
                 u"¿", u"Qué ", u"hago ", u"con ", u"estos ", u"sobres ", u"de ", u"jamón ",
                 u"o ", u"este ", u"lomo ", u"ibérico", u"? "
             ],
             [
                 u"¿", u"Los ", u"puedo ", u"congelar ", u"o ", u"es ", u"una ", u"aberración",
                 u"? ",
             ],
             [
                 u"La ", u"respuesta ", u"rápida ", u"sería ", u"un ", u"sí", u"."
             ]
         ],
         [
             [
                 u"De ", u"hecho", u", ", u"es ", u"algo ", u"que ", u"lleva ", u"mucho ", u"tiempo ",
                 u"haciéndose", u". "
             ],
             [
                 u"En ", u"las ", u"matanzas ", u"de ", u"los ", u"pueblos ", u"muchas ", u"piezas ",
                 u"se ", u"congelan ", u"una ", u"vez ", u"curadas ", u"para ", u"ir ", u"luego ",
                 u"dándoles ", u"salida ", u"a ", u"lo ", u"largo ", u"de ", u"todo ", u"el ", u"año",
                 u". "
             ],
             [
                 u"Otro ", u"ejemplo ", u"clásico", u": ", u"las ", u"embarazas ", u"que ", u"quieren ",
                 u"evitar ", u"cualquier ", u"posible ", u"riesgo ", u"de ", u"toxoplasmosis ", u"pero ",
                 u"no ", u"quieren ", u"renunciar ", u"a ", u"los ", u"embutidos ", u"durante ", u"eso ",
                 u"nueve ", u"meses", u". "
             ],
             [
                 u"¿", u"Solución", u"? "
             ],
             [
                 u"Congelarlo", u"."
             ]
         ],
         [
             [
                 u"Que ", u"lo ", u"sepas", u", ", u"¡", u"no ", u"pienso ", u"hacerlo ", u"todo ", u"yo ",
                 u"sola", u"!"
             ]
         ],
         [
             [
                 u"¡", u"No ", u"pienso ", u"hacerlo ", u"todo ", u"yo ", u"sola", u", ", u"que ", u"lo ",
                 u"sepas", u"!"
             ]
         ],
         [
             [
                 u"¡", u"No ", u"me ", u"digas ", u"nada", u"! "
             ],
             [
                 u"¡", u"Te ", u"has ", u"portado ", u"fatal", u"! "
             ],
             [
                 u"¡", u"No ", u"quiero ", u"volver ", u"a ", u"saber ", u"nada ", u"de ", u"ti", u"!"
             ]
         ],
         [
             [
                 u"¡¡¡", u"Al ", u"ladrón", u"!!!"
             ]
         ]
     ]
     for expression in expressions:
         self.assertEqual(
             sent_tokenize("".join(w for sent in expression for w in sent),
                           keep_whitespace=True), expression)