def word_counts(html):
    """Return a word -> count mapping for the visible text of *html*.

    Strips markup with BeautifulSoup, tokenizes the remaining text,
    lowercases and lemmatizes each token, and drops bare punctuation
    tokens before counting.
    """
    lem = WordNetLemmatizer()
    tokens = tokenize(BS(html).get_text())
    # Fix: the WordNetLemmatizer method is lemmatize() (not .lemmatizer),
    # and str has .lower() (not .lowercase) — the original raised
    # AttributeError on both.
    tokens = [
        lem.lemmatize(token.lower()) for token in tokens
        if token not in string.punctuation
    ]

    return make_dict(tokens)
# --- Example 2 (scraped snippet separator; original marker: "Ejemplo n.º 2", score 0) ---
import re

import nltk  # Fix: nltk.sent_tokenize is used below but nltk was never imported.
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
wordnet = WordNetLemmatizer()
# NOTE(review): `paragraph` must be defined earlier in the full file — confirm.
sentences = nltk.sent_tokenize(paragraph)
corpus = []

# Hoist the stop-word set out of the loop: the original rebuilt
# set(stopwords.words('english')) for every word of every sentence.
stop_words = set(stopwords.words('english'))

for sentence in sentences:
    # Keep letters only, then normalize case and split into words.
    review = re.sub("[^a-zA-Z]", ' ', sentence)
    review = review.lower().split()
    # Fix: the WordNetLemmatizer method is lemmatize(), not .lemmatizer —
    # the original raised AttributeError on the first word.
    review = [wordnet.lemmatize(word) for word in review if word not in stop_words]
    corpus.append(' '.join(review))

# Bag-of-words matrix over the cleaned corpus (top 1500 terms by frequency).
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
x = cv.fit_transform(corpus).toarray()