Example #1
def _transform(self, document):
    # word_tokenize/sentence_tokenize and the _join/_clean helpers are
    # defined elsewhere in the source project; only the preprocessing
    # step differs between the two branches.
    text = self._join(document) if self.raw else self._clean(document)
    # Split the text into sentences, then tokenize each sentence into words.
    return [word_tokenize(s) for s in sentence_tokenize(text)]
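
The two-stage pattern here (sentence-split the document, then word-tokenize each sentence) maps directly onto NLTK, which ships sent_tokenize rather than a sentence_tokenize function. A minimal runnable sketch, assuming NLTK with its "punkt" models installed; the Transformer class and its _join/_clean helpers are illustrative stand-ins, not the original class:

from nltk.tokenize import sent_tokenize, word_tokenize  # pip install nltk; nltk.download("punkt")

class Transformer:
    """Illustrative stand-in for the snippet's class, not the original."""

    def __init__(self, raw=False):
        self.raw = raw

    def _join(self, document):
        # Hypothetical helper: documents are assumed to be lists of strings.
        return " ".join(document)

    def _clean(self, document):
        # Hypothetical helper: lowercase each part before joining.
        return " ".join(part.lower() for part in document)

    def _transform(self, document):
        text = self._join(document) if self.raw else self._clean(document)
        return [word_tokenize(s) for s in sent_tokenize(text)]

print(Transformer(raw=True)._transform(["Hello there.", "How are you?"]))
# [['Hello', 'there', '.'], ['How', 'are', 'you', '?']]
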
Example #2
def _transform(self, document):
    # section_extract, clean_str and strip_bullets_from_line are helpers
    # from the source project; documents are dicts with a 'description' key.
    lines_from_section = section_extract(self.section_regex,
                                         document['description'])
    # Strip bullet markers, normalize each line, then tokenize into words.
    return [
        word_tokenize(clean_str(strip_bullets_from_line(line.text)))
        for line in lines_from_section
    ]
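
section_extract, clean_str and strip_bullets_from_line are project-specific helpers, and section_extract evidently yields objects with a .text attribute. A rough sketch of what the same pipeline could look like with plain strings and hypothetical implementations of all three helpers:

import re
from nltk.tokenize import word_tokenize

def strip_bullets(line):
    # Hypothetical: drop a leading "-", "*" or "•" bullet marker.
    return re.sub(r"^\s*[-*\u2022]\s*", "", line)

def clean_str(text):
    # Hypothetical: collapse runs of whitespace and lowercase.
    return re.sub(r"\s+", " ", text).strip().lower()

def section_lines(section_regex, description):
    # Hypothetical: yield the lines after a matching heading, up to a blank line.
    lines = iter(description.splitlines())
    for line in lines:
        if re.match(section_regex, line):
            for body_line in lines:
                if not body_line.strip():
                    return
                yield body_line

description = "Requirements:\n- Python 3\n- Some NLP experience\n\nOther text."
print([
    word_tokenize(clean_str(strip_bullets(line)))
    for line in section_lines(r"Requirements:", description)
])
# [['python', '3'], ['some', 'nlp', 'experience']]
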
Example #3
def word_tokenizer_gen(sent_gent):
    # Lazily tokenize each sentence from an incoming sentence generator.
    for sent in sent_gent:
        yield word_tokenize(sent)
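
Because it yields one token list at a time, this generator can wrap any iterable of sentences without holding the tokenized corpus in memory. A small usage sketch, assuming word_tokenize comes from NLTK (the sentences are made up):

from nltk.tokenize import word_tokenize

sentences = iter(["The cat sat.", "It purred."])  # any sentence generator works
for tokens in word_tokenizer_gen(sentences):
    print(tokens)
# ['The', 'cat', 'sat', '.']
# ['It', 'purred', '.']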