Example #1
0
 def set_tokenization_result(self, value):
     """Store a tokenization result in the internal storage format.

     ``value`` must be a list of ``(offset, token)`` pairs; the pairs are
     split into ``self.tokens`` and ``self.offsets_to_text``, the completion
     timestamp is recorded, and ``self`` is returned for chaining.
     """
     if not isinstance(value, list):
         msg = ("Tokenization expected result should be a list "
                "of tuples (token-offset on text (int), token-string).")
         raise ValueError(msg)
     offsets, toks = unzip(value, 2)
     self.offsets_to_text = list(offsets)
     self.tokens = list(toks)
     self.tokenization_done_at = datetime.now()
     return self
Example #2
0
 def set_tokenization_result(self, value):
     """Persist a tokenization result into the storage representation.

     Expects a list of ``(token-offset, token-string)`` tuples. Unzips the
     pairs into the token and offset fields, stamps the completion time,
     and returns ``self`` so the call is chainable.
     """
     is_list = isinstance(value, list)
     if not is_list:
         raise ValueError(
             "Tokenization expected result should be a list "
             "of tuples (token-offset on text (int), token-string).")
     pair_columns = unzip(value, 2)
     self.tokens = list(pair_columns[1])
     self.offsets_to_text = list(pair_columns[0])
     self.tokenization_done_at = datetime.now()
     return self
Example #3
0
    def set_preprocess_result(self, step, result):
        """Set the result in the internal representation.

        Explicit save must be triggered after this call.
        Returns "self" so it's easily chainable with a .save() if desired.

        Raises:
            InvalidPreprocessSteps: if ``step`` is not a PreProcessSteps member.
            ValueError: if ``result`` fails the per-step validation
                (sentencer: ordered, duplicate-free, non-empty ints starting
                at 0 and ending at the token count; tagging: one tag per
                token).
        """
        if not isinstance(step, PreProcessSteps):
            raise InvalidPreprocessSteps
        if step == PreProcessSteps.sentencer:
            if not all(isinstance(x, int) for x in result):
                raise ValueError(
                    'Sentencer result shall only contain ints: %r' % result)
            if sorted(result) != result:
                raise ValueError('Sentencer result shall be ordered.')
            if len(set(result)) < len(result):
                raise ValueError(
                    'Sentencer result shall not contain duplicates.')
            # Guard against an empty result before indexing below; otherwise
            # result[0] would raise IndexError instead of a validation error.
            if not result:
                raise ValueError('Sentencer result shall not be empty.')
            if result[0] != 0:
                raise ValueError(
                    'Sentencer result must start with 0. Actual=%r' %
                    result[0])
            if result[-1] != len(self.tokens):
                raise ValueError(
                    'Sentencer result must end with token count=%d. Actual=%r'
                    % (len(self.tokens), result[-1]))
        elif step == PreProcessSteps.tagging:
            if len(result) != len(self.tokens):
                raise ValueError(
                    'Tagging result must have same cardinality than tokens')

        field_name = self.preprocess_fields_mapping[step]
        if isinstance(field_name, tuple):
            # Some steps are stored on several fields; split the result
            # column-wise and assign one column per field.
            names = field_name
            results = unzip(result, len(names))
            for field_name, result in zip(names, results):
                setattr(self, field_name, result)
        else:
            setattr(self, field_name, result)
        return self.flag_preprocess_done(step)