def _transform(self, document):
    """Tokenize every line extracted from the configured section of *document*.

    Each line is pulled from ``document['description']`` via
    ``section_extract`` using ``self.section_regex``, has its leading
    bullet characters stripped, is normalized by ``clean_str``, and is
    finally split into tokens.

    Returns a list of token lists, one entry per extracted line.
    """
    section_lines = section_extract(self.section_regex,
                                    document['description'])
    tokenized = []
    for entry in section_lines:
        normalized = clean_str(strip_bullets_from_line(entry.text))
        tokenized.append(word_tokenize(normalized))
    return tokenized
Example #2
0
 def _clean(self, document):
     """Return the space-joined, cleaned text of the schema fields of *document*.

     Looks up each field named in ``self.document_schema_fields``, runs
     its value through ``clean_str``, and merges the results with
     ``self.join_spaces``.
     """
     cleaned_parts = []
     for field_name in self.document_schema_fields:
         cleaned_parts.append(clean_str(document[field_name]))
     return self.join_spaces(cleaned_parts)
Example #3
0
 def test_deep_wrapper(self):
     """clean_str should recurse into nested lists, cleaning every string leaf."""
     raw = [
         ["macy's engineer / apply now", "engineer/apply now"],
         ["engineer.", "python!"],
     ]
     expected = [
         ["macy s engineer apply now", "engineer apply now"],
         ["engineer ", "python "],
     ]
     assert clean_str(raw) == expected