def spacy_tokenizer(self, review):
    """Tokenize a review with spaCy; return tokens and (token, POS tag) pairs."""
    tokens = []
    tags = []
    processed = self.nlp(self.data[review]['Content'])
    for word in processed:
        if self.features in [1, 2]:
            # Feature modes 1 and 2 normalize each lowercased token.
            tokens.append(process_word(word.lower_))
            tags.append((process_word(word.lower_), word.tag_))
        else:
            tokens.append(word.lower_)
            tags.append((word.lower_, word.tag_))
    if self.features in [2, 3]:
        # Feature modes 2 and 3 augment the tags with n-grams.
        tags = self.ngrams(tokens, tags)
    return tokens, tags
def whitespace_tokenizer(self, review):
    """Tokenize a review on whitespace with NLTK; return tokens and POS tags."""
    tokenizer = WhitespaceTokenizer()
    if self.features in [1, 2]:
        # Feature modes 1 and 2 normalize each lowercased token.
        tokens = [
            process_word(word.lower())
            for word in tokenizer.tokenize(self.data[review]['Content'])
        ]
    else:
        tokens = [
            word.lower()
            for word in tokenizer.tokenize(self.data[review]['Content'])
        ]
    tags = nltk.pos_tag(tokens)
    return tokens, tags
def word_tokenizer(self, review):
    """Tokenize a review with NLTK's word_tokenize; return tokens and POS tags."""
    if self.features in [1, 2]:
        # Feature modes 1 and 2 normalize each lowercased token.
        tokens = [
            process_word(word.lower())
            for word in word_tokenize(self.data[review]['Content'])
        ]
    else:
        tokens = [
            word.lower()
            for word in word_tokenize(self.data[review]['Content'])
        ]
    tags = nltk.pos_tag(tokens)
    if self.features in [2, 3]:
        # Feature modes 2 and 3 augment the tags with n-grams.
        tags = self.ngrams(tokens, tags)
    return tokens, tags
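
# Usage sketch (imagine this in a separate demo script): a minimal,
# self-contained way to exercise word_tokenizer above. The Tokenizer class,
# the sample data layout, and the process_word / ngrams stubs are assumptions
# for illustration only; the surrounding module supplies its own versions of
# these names.
import nltk
from nltk.tokenize import WhitespaceTokenizer, word_tokenize


def process_word(word):
    # Stub normalizer (assumption): keep only alphanumeric characters.
    return ''.join(ch for ch in word if ch.isalnum())


class Tokenizer:
    def __init__(self, data, features):
        self.data = data          # {review_id: {'Content': raw review text}}
        self.features = features  # 1/2: normalize tokens; 2/3: add n-grams

    def ngrams(self, tokens, tags):
        # Stub (assumption): extend the tags with adjacent-token bigrams.
        return list(tags) + list(zip(tokens, tokens[1:]))


Tokenizer.word_tokenizer = word_tokenizer  # bind the method defined above

if __name__ == '__main__':
    nltk.download('punkt', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)
    data = {0: {'Content': 'Great phone but the battery drains fast'}}
    tokens, tags = Tokenizer(data, features=2).word_tokenizer(0)
    print(tokens)  # normalized, lowercased tokens
    print(tags)    # POS tags followed by bigram pairs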