def preprocess(self, x):
    """Load a single example using this field, tokenizing if necessary.

    If the input is a Python 2 `str`, it is converted to Unicode first.
    If the input is a list of sentences (cue knowledge), each sentence is
    tokenized separately; otherwise, if `sequential=True`, the input is
    tokenized as a single string. The result is then optionally lowercased,
    filtered against `stop_words`, and passed to the user-provided
    `preprocessing` Pipeline.
    """
    if (six.PY2 and isinstance(x, six.string_types) and
            not isinstance(x, six.text_type)):
        x = Pipeline(lambda s: six.text_type(s, encoding='utf-8'))(x)
    if isinstance(x, list):
        # Cue knowledge arrives as a list of sentences: tokenize each one.
        x = [self.tokenize(t.rstrip('\n')) for t in x]
    elif self.sequential and isinstance(x, six.text_type):
        x = self.tokenize(x.rstrip('\n'))
    if self.lower:
        x = Pipeline(six.text_type.lower)(x)
    if self.sequential and self.use_vocab and self.stop_words is not None:
        # Stop-word filtering assumes a flat list of tokens.
        x = [w for w in x if w not in self.stop_words]
    if self.preprocessing is not None:
        return self.preprocessing(x)
    else:
        return x
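# Usage sketch (not part of the original source). `_CueField` is a minimal,
# hypothetical stand-in for whatever Field-like class actually hosts the method
# above; it only supplies the attributes that `preprocess` reads and reuses the
# function defined above. The surrounding module's imports (`six`, `Pipeline`)
# are assumed to be in place. Only the tokenization branches are exercised, so
# the example does not depend on how `Pipeline` handles nested lists.
class _CueField:
    preprocess = preprocess              # reuse the method defined above
    sequential = True
    lower = False
    use_vocab = True
    stop_words = None
    preprocessing = None
    tokenize = staticmethod(str.split)   # simple whitespace tokenizer

field = _CueField()
print(field.preprocess("The dog barked\n"))
# -> ['The', 'dog', 'barked']                  (single string: tokenized once)
print(field.preprocess(["Dogs bark\n", "Cats sleep\n"]))
# -> [['Dogs', 'bark'], ['Cats', 'sleep']]     (list of sentences: one token list each)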
def preprocess(self, x):
    """Load a single example using this field, tokenizing if necessary.

    If the input is a Python 2 `str`, it is converted to Unicode first.
    If `sequential=True`, it will be tokenized. Then the input will be
    optionally lowercased and passed to the user-provided `preprocessing`
    Pipeline.
    """
    if (six.PY2 and isinstance(x, six.string_types) and
            not isinstance(x, six.text_type)):
        x = Pipeline(lambda s: six.text_type(s, encoding='utf-8'))(x)
    # Strip the trailing newline, then split (tokenize).
    if self.sequential and isinstance(x, six.text_type):
        x = self.tokenize(x.rstrip('\n'))
    if self.lower:
        x = Pipeline(six.text_type.lower)(x)
    if self.preprocessing is not None:
        return self.preprocessing(x)
    else:
        return x
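# Usage sketch (not part of the original source). `_TextField` is a hypothetical
# stand-in with just the attributes this `preprocess` reads, and it reuses the
# function defined directly above. The example assumes `Pipeline` behaves like
# torchtext's legacy `data.Pipeline`, which applies its callable to each element
# of a flat token list.
strip_punct = Pipeline(lambda tok: tok.strip('.,!?'))   # user-provided preprocessing

class _TextField:
    preprocess = preprocess              # reuse the method defined above
    sequential = True
    lower = True
    preprocessing = strip_punct
    tokenize = staticmethod(str.split)

field = _TextField()
print(field.preprocess("Hello, World!\n"))
# tokenize    -> ['Hello,', 'World!']
# lowercase   -> ['hello,', 'world!']
# strip_punct -> ['hello', 'world']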