# Example: compute TF-IDF scores for a small corpus of documents.
#
# The script first builds a bag-of-words count matrix with CountVectorizer
# and then re-weights those counts with TfidfTransformer.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# list of documents
documents = [
    'This is the first document.',
    'This is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# create CountVectorizer object
vectorizer = CountVectorizer()

# convert documents to bag-of-words format
# (sparse count matrix: one row per document, one column per vocabulary term)
X = vectorizer.fit_transform(documents)

# create TfidfTransformer object
transformer = TfidfTransformer()

# calculate TF-IDF scores from the raw term counts
X_tfidf = transformer.fit_transform(X)

# print TF-IDF matrix (converted from sparse to a dense array for display)
print(X_tfidf.toarray())
# Example: compute the TF-IDF vector of a single text string.
#
# The string is wrapped in a one-element list because the vectorizer expects
# an iterable of documents rather than a bare string.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# text string
text = 'This is my sample text. It contains some words and stuff.'

# create CountVectorizer object
vectorizer = CountVectorizer()

# convert text to bag-of-words format (a one-row sparse count matrix)
X = vectorizer.fit_transform([text])

# create TfidfTransformer object
transformer = TfidfTransformer()

# calculate TF-IDF scores
# NOTE: with a single document every term has the same document frequency,
# so the IDF weight is identical for all terms and the result is effectively
# the normalised term counts. Fit on a larger corpus for informative IDF.
X_tfidf = transformer.fit_transform(X)

# print TF-IDF vector (row 0 is the only document)
print(X_tfidf.toarray()[0])

# Overall, the scikit-learn library provides a robust set of tools for working
# with text data in machine learning, including the TfidfTransformer for
# transforming text (via its term counts) into numerical features.