def set_tokenization_result(self, value):
    """Store a tokenization result in the object's storage fields.

    ``value`` must be a list; it is unzipped as 2-item entries into token
    offsets and token strings.  The token strings are stored on
    ``self.tokens``, the offsets on ``self.offsets_to_text``, and the
    current time on ``self.tokenization_done_at``.

    Returns ``self``.  Raises ``ValueError`` when ``value`` is not a list.
    """
    if isinstance(value, list):
        offsets, words = unzip(value, 2)
        self.tokens = list(words)
        self.offsets_to_text = list(offsets)
        self.tokenization_done_at = datetime.now()
        return self
    raise ValueError(
        "Tokenization expected result should be a list "
        "of tuples (token-offset on text (int), token-string).")
def set_tokenization_result(self, value):
    """Save the tokenizer output using the internal storage layout.

    The list ``value`` is split with ``unzip(value, 2)`` into offsets and
    token strings, which are written to ``self.offsets_to_text`` and
    ``self.tokens`` respectively; ``self.tokenization_done_at`` is set to
    the current time.

    Returns ``self``.  Raises ``ValueError`` for any non-list ``value``.
    """
    # Guard clause: reject anything that is not a list up front.
    if not isinstance(value, list):
        message = (
            "Tokenization expected result should be a list "
            "of tuples (token-offset on text (int), token-string)."
        )
        raise ValueError(message)

    unzipped = unzip(value, 2)
    text_offsets, token_strings = unzipped
    self.tokens = list(token_strings)
    self.offsets_to_text = list(text_offsets)
    self.tokenization_done_at = datetime.now()
    return self
def set_preprocess_result(self, step, result):
    """Set the result of a preprocess step in the internal representation.

    Explicit save must be triggered after this call.

    Validation done before storing:

    * ``PreProcessSteps.sentencer``: ``result`` must be a non-empty,
      strictly increasing list of ints that starts with 0 and ends with
      ``len(self.tokens)``.
    * ``PreProcessSteps.tagging``: ``result`` must have exactly as many
      items as ``self.tokens``.

    The destination is looked up in ``self.preprocess_fields_mapping``.
    When the mapping gives a tuple of field names, ``result`` is unzipped
    into one sequence per name and each is stored on its own field.

    Returns the value of ``self.flag_preprocess_done(step)`` (intended to
    be "self", so the call is easily chainable with a .save() if desired).

    Raises:
        InvalidPreprocessSteps: if ``step`` is not a ``PreProcessSteps``.
        ValueError: if ``result`` fails the validation for ``step``.
    """
    if not isinstance(step, PreProcessSteps):
        raise InvalidPreprocessSteps

    if step == PreProcessSteps.sentencer:
        if not all(isinstance(x, int) for x in result):
            raise ValueError(
                'Sentencer result shall only contain ints: %r' % result)
        # sorted() always returns a list, so this also rejects any
        # non-list sequence (a tuple never compares equal to a list).
        if sorted(result) != result:
            raise ValueError('Sentencer result shall be ordered.')
        if len(set(result)) < len(result):
            raise ValueError(
                'Sentencer result shall not contain duplicates.')
        # An empty list passes every check above; reject it explicitly so
        # the boundary checks below cannot fail with a bare IndexError.
        if not result:
            raise ValueError('Sentencer result shall not be empty.')
        if result[0] != 0:
            raise ValueError(
                'Sentencer result must start with 0. Actual=%r' % result[0])
        if result[-1] != len(self.tokens):
            raise ValueError(
                'Sentencer result must end with token count=%d. Actual=%r' % (
                    len(self.tokens), result[-1]))
    elif step == PreProcessSteps.tagging:
        if len(result) != len(self.tokens):
            raise ValueError(
                'Tagging result must have same cardinality than tokens')

    field_name = self.preprocess_fields_mapping[step]
    if isinstance(field_name, tuple):
        # Some steps are stored on several fields
        names = field_name
        per_field_values = unzip(result, len(names))
        # Distinct loop names: do not rebind the `result` parameter.
        for name, values in zip(names, per_field_values):
            setattr(self, name, values)
    else:
        setattr(self, field_name, result)
    return self.flag_preprocess_done(step)