from bs4 import BeautifulSoup as BS
from nltk.stem import WordNetLemmatizer
import string

def word_counts(html):
    """Lemmatize the visible text of an HTML document and count word frequencies."""
    lem = WordNetLemmatizer()
    # tokenize() and make_dict() are helper functions defined elsewhere
    tokens = tokenize(BS(html, "html.parser").get_text())
    tokens = [
        lem.lemmatize(token.lower())
        for token in tokens
        if token not in string.punctuation
    ]
    return make_dict(tokens)
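The helpers tokenize() and make_dict() are not defined in this section. A minimal sketch of how they might look, purely as hypothetical stand-ins (here NLTK's word_tokenize and collections.Counter), together with a small call to word_counts:

from collections import Counter
from nltk.tokenize import word_tokenize

# Hypothetical stand-ins for the helpers assumed by word_counts()
def tokenize(text):
    return word_tokenize(text)

def make_dict(tokens):
    return dict(Counter(tokens))

sample_html = "<html><body><p>Cats are chasing cats.</p></body></html>"
print(word_counts(sample_html))
# e.g. {'cat': 2, 'are': 1, 'chasing': 1}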
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

ps = PorterStemmer()
wordnet = WordNetLemmatizer()

# Split the input paragraph into sentences
sentences = nltk.sent_tokenize(paragraph)
corpus = []
stop_words = set(stopwords.words('english'))

for i in range(len(sentences)):
    # Keep only letters, lowercase, and split into words
    review = re.sub('[^a-zA-Z]', ' ', sentences[i])
    review = review.lower()
    review = review.split()
    # Stemming alternative:
    # review = [ps.stem(word) for word in review if word not in stop_words]
    review = [wordnet.lemmatize(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus.append(review)

# Bag-of-words model: keep the 1500 most frequent terms
cv = CountVectorizer(max_features=1500)
x = cv.fit_transform(corpus).toarray()
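To see what the bag-of-words step produces, the corpus and matrix above can be inspected like this. The paragraph variable is not defined in this section, so any block of text will do as an assumed input; the rest continues from the code above:

# Hypothetical input; replace with the paragraph used elsewhere in the document
paragraph = ("Natural language processing turns raw text into numbers. "
             "The bag of words model counts word occurrences per sentence.")

# ... after running the preprocessing and CountVectorizer code above:
print(cv.get_feature_names_out())  # vocabulary learned from the corpus
print(x.shape)                     # (number of sentences, vocabulary size)
print(x)                           # one count vector per preprocessed sentence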