Code example #1
    def spacy_tokenizer(self, review):
        """Tokenize a review with spaCy and collect (token, POS tag) pairs."""
        tokens = []
        tags = []

        # Run the spaCy pipeline over the raw review text.
        processed = self.nlp(self.data[review]['Content'])

        for word in processed:
            # Feature sets 1 and 2 apply extra per-token preprocessing.
            if self.features in [1, 2]:
                token = process_word(word.lower_)
            else:
                token = word.lower_
            tokens.append(token)
            tags.append((token, word.tag_))

        # Feature sets 2 and 3 extend the tags with n-grams.
        if self.features in [2, 3]:
            tags = self.ngrams(tokens, tags)

        return tokens, tags
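
All three examples call a process_word helper that is not included in the snippets, and self.nlp is presumably a spaCy pipeline loaded in the constructor (e.g. with spacy.load('en_core_web_sm')). Below is a minimal sketch of what such a helper might look like, assuming it only strips surrounding punctuation from an already lowercased token; the behavior is an assumption, not the original implementation:

    import string

    def process_word(word):
        # Hypothetical sketch: the original helper is not shown above.
        # Assumed behavior: strip surrounding punctuation from the token.
        return word.strip(string.punctuation)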
Code example #2
    def whitespace_tokenizer(self, review):
        """Tokenize a review on whitespace (NLTK) and POS-tag the tokens."""
        tokenizer = WhitespaceTokenizer()
        words = tokenizer.tokenize(self.data[review]['Content'])

        # Feature sets 1 and 2 apply extra per-token preprocessing.
        if self.features in [1, 2]:
            tokens = [process_word(word.lower()) for word in words]
        else:
            tokens = [word.lower() for word in words]

        tags = nltk.pos_tag(tokens)

        # Note: unlike the other two tokenizers, this one never applies
        # the n-gram step, even when self.features is 2 or 3.
        return tokens, tags
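
Examples #2 and #3 assume the module-level NLTK imports below, and both nltk.pos_tag and word_tokenize need their model data to be fetched once via NLTK's standard downloader:

    import nltk
    from nltk.tokenize import WhitespaceTokenizer, word_tokenize

    # One-time download of the data these tokenizers rely on.
    nltk.download('punkt')                        # models used by word_tokenize
    nltk.download('averaged_perceptron_tagger')   # default model behind nltk.pos_tag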
Code example #3
    def word_tokenizer(self, review):
        """Tokenize a review with NLTK's word_tokenize and POS-tag the tokens."""
        words = word_tokenize(self.data[review]['Content'])

        # Feature sets 1 and 2 apply extra per-token preprocessing.
        if self.features in [1, 2]:
            tokens = [process_word(word.lower()) for word in words]
        else:
            tokens = [word.lower() for word in words]

        tags = nltk.pos_tag(tokens)

        # Feature sets 2 and 3 extend the tags with n-grams.
        if self.features in [2, 3]:
            tags = self.ngrams(tokens, tags)
        return tokens, tags
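
Examples #1 and #3 also delegate to a self.ngrams method that is not shown. A rough sketch under the assumption that it appends token bigrams, tagged with the concatenated POS tags of their parts, to the existing tag list; the method name comes from the snippets, but the body below is guessed:

    def ngrams(self, tokens, tags, n=2):
        # Hypothetical sketch: the original ngrams method is not shown.
        # Assumed behavior: append each run of n consecutive tokens as one
        # n-gram, tagged with the joined POS tags of its parts.
        for i in range(len(tokens) - n + 1):
            gram = ' '.join(tokens[i:i + n])
            gram_tag = '+'.join(tag for _, tag in tags[i:i + n])
            tags.append((gram, gram_tag))
        return tags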