def preprocess(self, x):
    """Load a single example using this field, tokenizing if necessary.

    Processing order:

    1. Under Python 2, a byte `str` is decoded to Unicode as UTF-8.
    2. If `self.sequential` is truthy and the input is Unicode text, its
       trailing newlines are stripped and it is passed to `self.tokenize`.
    3. If `self.lower` is truthy, the input is lowercased. An input that
       is still a text string at this point is lowercased as a whole;
       any other input is lowercased element by element.
    4. If `self.preprocessing` is set, it is applied and its result is
       returned; otherwise the input from step 3 is returned.
    """
    # NOTE(review): this branch and the tokenizing branch below were both
    # tagged "never" in the original -- presumably callers always pass
    # pre-tokenized input here; confirm before removing either one.
    if (six.PY2 and isinstance(x, six.string_types) and
            not isinstance(x, six.text_type)):
        x = Pipeline(lambda s: six.text_type(s, encoding='utf-8'))(x)
    if self.sequential and isinstance(x, six.text_type):
        x = self.tokenize(x.rstrip('\n'))
    if self.lower:
        # Build the lowercasing pipeline once instead of once per element.
        lowercase = Pipeline(six.text_type.lower)
        if isinstance(x, six.text_type):
            # Iterating over an untokenized string would break it into
            # single characters, so lowercase it in one piece.
            x = lowercase(x)
        else:
            x = [lowercase(item) for item in x]
    if self.preprocessing is not None:
        return self.preprocessing(x)
    return x
Esempio n. 2
0
    def preprocess(self, x):
        """Turn one raw example into its preprocessed form for this field.

        Steps, in order:

        1. Under Python 2, a byte `str` is decoded to Unicode as UTF-8.
        2. Unicode text has its trailing newlines stripped and is passed
           to `self.tokenize`; any other input is left untouched.
        3. When `self.lower` is truthy, the value is run through a
           lowercasing Pipeline.
        4. When `self.preprocessing` is set, it is applied and its result
           is returned; otherwise the value from step 3 is returned.
        """
        text_type = six.text_type
        is_py2_bytes = (six.PY2 and isinstance(x, six.string_types)
                        and not isinstance(x, text_type))
        if is_py2_bytes:
            decode = Pipeline(lambda s: text_type(s, encoding='utf-8'))
            x = decode(x)
        if isinstance(x, text_type):
            stripped = x.rstrip('\n')
            x = self.tokenize(stripped)
        if self.lower:
            lowercase = Pipeline(text_type.lower)
            x = lowercase(x)
        # `self.preprocessing` is the optional user-supplied hook; it runs
        # last, after tokenizing and lowercasing. None means no hook.
        user_pipeline = self.preprocessing
        if user_pipeline is None:
            return x
        return user_pipeline(x)