# Bag-of-words example: build a document-term count matrix from single words.
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus of three short documents.
docs = [
    'This is the first document.',
    'This is the second document.',
    'And this is the third one.',
]

# Default settings: text is lowercased and only single-word tokens
# (unigrams) are counted.
cv = CountVectorizer()

# Learn the vocabulary from the corpus and count each term per document.
# The result has one row per document and one column per vocabulary term.
X = cv.fit_transform(docs)

# Convert to a dense array purely so the counts can be printed.
print(X.toarray())
[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]]
# N-gram example: build a document-term count matrix over 1- to 3-word terms.
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus of three short documents.
docs = [
    'This is the first document.',
    'This is the second document.',
    'And this is the third one.',
]

# ngram_range=(1, 3) makes the vocabulary include unigrams, bigrams and
# trigrams, so there are many more columns than with single words alone.
cv = CountVectorizer(ngram_range=(1, 3))

# Learn the n-gram vocabulary and count each n-gram per document.
# The result has one row per document and one column per n-gram.
X = cv.fit_transform(docs)

# Convert to a dense array purely so the counts can be printed.
print(X.toarray())
[[0 0 0 1 1 1 1 1 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 1]
 [0 0 0 1 0 0 1 1 0 1 0 0 1 1 1 0 0 1 1 0 0 0 0 1 1 1]
 [1 1 1 0 0 0 1 1 0 0 1 1 0 0 1 0 0 0 0 1 1 1 1 1 1 1]]

Both of these examples use the CountVectorizer class from Python's scikit-learn (sklearn) package.