コード例 #1
0
 def string_to_bag_of_words(self, text):
     """Convert ``text`` into a bag-of-words ``Counter``.

     The text is lower-cased and split into runs of word characters.
     Tokens found in NLTK's English stop-word list are dropped, and each
     remaining token is lemmatized with WordNet before being counted.

     Args:
         text: The string to tokenize.

     Returns:
         A ``Counter`` mapping each lemmatized token to its frequency.
     """
     tokens = RegexpTokenizer(r'\w+').tokenize(text.lower())
     stop_words = set(nltk.corpus.stopwords.words('english'))
     # Build the lemmatizer once per call rather than once per token.
     lemmatize = WordNetLemmatizer().lemmatize
     # The stop-word test is applied to the raw token, before lemmatization.
     return Counter(
         lemmatize(token) for token in tokens
         if token not in stop_words
     )
コード例 #2
0
def getAllReviews(movieList):
    """Tokenize every user review in ``movieList`` and label its sentiment.

    Each movie's ``"reviews"`` entry is iterated; every review's
    ``'review'`` text is lower-cased, split into runs of word characters,
    stemmed with the Porter stemmer and then filtered against NLTK's
    English stop-word list.

    Args:
        movieList: Iterable of mappings, each with a ``"reviews"`` iterable
            whose items provide ``'review'`` (text) and ``"score"``.

    Returns:
        A list of ``(tokens, label)`` tuples in input order, where
        ``tokens`` is a list of stemmed tokens and ``label`` is ``'pos'``
        when the review's score is at least 30, otherwise ``'neg'``.
        An empty ``movieList`` yields an empty list.
    """
    # Hoisted invariants: previously the stop-word corpus was re-read and
    # scanned linearly, and a new stemmer was built, for every token.
    tokenizer = RegexpTokenizer(r'\w+')
    stem = PorterStemmer().stem
    stop_words = set(stopwords.words('english'))

    tokenizeReview = []

    # Iterate the nested reviews directly. Wrapping ``map`` in ``np.array``
    # only flattens correctly when ``map`` returns a list (Python 2); on
    # Python 3 it produces a 0-d object array that cannot be concatenated.
    for movie in movieList:
        for review in movie["reviews"]:
            words = tokenizer.tokenize(review['review'].lower())
            # Stop words are matched against the *stemmed* token, keeping
            # the original stem-then-filter order so output is unchanged.
            # A real list is stored (not a lazy filter object) so the
            # tokens can be iterated more than once.
            tokens = [
                stemmed for stemmed in (stem(word) for word in words)
                if stemmed not in stop_words
            ]
            label = 'pos' if review["score"] >= 30 else 'neg'
            tokenizeReview.append((tokens, label))

    return tokenizeReview
コード例 #3
0
def getAllCritics(movieList):
    """Tokenize every critic review in ``movieList`` and label its sentiment.

    Each movie's ``"critics"`` entry is iterated; every review's
    ``'review'`` text is lower-cased, split into runs of word characters,
    stemmed with the Porter stemmer and then filtered against NLTK's
    English stop-word list.

    Args:
        movieList: Iterable of mappings, each with a ``"critics"`` iterable
            whose items provide ``'review'`` (text) and ``"tomatometer"``.

    Returns:
        A list of ``(tokens, label)`` tuples in input order, where
        ``tokens`` is a list of stemmed tokens and ``label`` is ``'pos'``
        when the review's tomatometer equals ``"fresh"``, otherwise
        ``'neg'``. An empty ``movieList`` yields an empty list.
    """
    # Hoisted invariants: previously the stop-word corpus was re-read and
    # scanned linearly, and a new stemmer was built, for every token.
    tokenizer = RegexpTokenizer(r'\w+')
    stem = PorterStemmer().stem
    stop_words = set(stopwords.words('english'))

    tokenizeReview = []

    # Iterate the nested reviews directly. Wrapping ``map`` in ``np.array``
    # only flattens correctly when ``map`` returns a list (Python 2); on
    # Python 3 it produces a 0-d object array that cannot be concatenated.
    for movie in movieList:
        for review in movie["critics"]:
            words = tokenizer.tokenize(review['review'].lower())
            # Stop words are matched against the *stemmed* token, keeping
            # the original stem-then-filter order so output is unchanged.
            # A real list is stored (not a lazy filter object) so the
            # tokens can be iterated more than once.
            tokens = [
                stemmed for stemmed in (stem(word) for word in words)
                if stemmed not in stop_words
            ]
            label = 'pos' if review["tomatometer"] == "fresh" else 'neg'
            tokenizeReview.append((tokens, label))

    return tokenizeReview
コード例 #4
0
    def string_to_bag_of_words(self, text):
        """Convert ``text`` into a bag-of-words ``Counter``.

        The text is lower-cased and split into runs of word characters.
        Two optional steps, controlled by instance flags, are then applied
        in this order:

        * ``self.filter_stopwords`` -- drop tokens found in NLTK's English
          stop-word list (tested on the raw token).
        * ``self.enable_stemming`` -- replace each token with its WordNet
          lemma.

        Args:
            text: The string to tokenize.

        Returns:
            A ``Counter`` mapping each resulting token to its frequency.
        """
        tokens = RegexpTokenizer(r'\w+').tokenize(text.lower())

        if self.filter_stopwords:
            stop_words = set(nltk.corpus.stopwords.words('english'))
            tokens = [token for token in tokens if token not in stop_words]

        if self.enable_stemming:
            # Despite the flag's name, this step lemmatizes with WordNet.
            # Build the lemmatizer once per call rather than once per token.
            lemmatize = WordNetLemmatizer().lemmatize
            tokens = [lemmatize(token) for token in tokens]

        return Counter(tokens)