new_doc_features = bow_vectorizer.transform(new_doc)
new_doc_features = new_doc_features.todense()
print(new_doc_features)

feature_names = bow_vectorizer.get_feature_names()
print(feature_names)

# Let's print both the feature names and counts together
# - first for the training data and then for the test data
display_features(features, feature_names)
display_features(new_doc_features, feature_names)

# Now let's try the same with tf-idf instead of frequency counts
# We use the tfidf_transformer function we defined
import numpy as np
from feature_extractors import tfidf_transformer

feature_names = bow_vectorizer.get_feature_names()

# We again convert to the dense form to print the values out
tfidf_trans, tfidf_features = tfidf_transformer(bow_features)
features = np.round(tfidf_features.todense(), 2)
display_features(features, feature_names)

# We do the same for the test document
nd_tfidf = tfidf_trans.transform(new_doc_features)
nd_features = np.round(nd_tfidf.todense(), 2)
display_features(nd_features, feature_names)
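# For reference, here is a minimal sketch of what the tfidf_transformer helper
# in feature_extractors might look like -- an assumption inferred from how it
# is called above (it returns the fitted transformer plus the tf-idf matrix),
# not the module's actual source. The name tfidf_transformer_sketch is used
# here to avoid shadowing the imported function:
from sklearn.feature_extraction.text import TfidfTransformer

def tfidf_transformer_sketch(bow_matrix):
    # fit an l2-normalized, smoothed-idf transformer on the raw count matrix
    transformer = TfidfTransformer(norm='l2', smooth_idf=True, use_idf=True)
    tfidf_matrix = transformer.fit_transform(bow_matrix)
    return transformer, tfidf_matrix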
# We can also compute tf-idf scores/vectors ourselves from scratch
# - without using sklearn's TfidfTransformer class
import scipy.sparse as sp
from numpy.linalg import norm

feature_names = bow_vectorizer.get_feature_names()

# compute term frequency from the raw bag-of-words counts
tf = bow_features.todense()
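# Below is a minimal sketch of one way the from-scratch computation could
# continue. It follows sklearn's smoothed-idf convention,
# idf = 1 + log((1 + n_docs) / (1 + df)), with l2 normalization, so the
# results line up with TfidfTransformer's defaults; the exact steps and
# variable names here are illustrative, not the original module's code.
df = np.diff(sp.csc_matrix(bow_features, copy=True).indptr)  # document frequency per term
n_docs = bow_features.shape[0]
idf = 1.0 + np.log((1.0 + n_docs) / (1.0 + df))  # smoothed inverse document frequency

tf = np.asarray(tf, dtype='float64')
tfidf = tf * idf                                 # scale each term column by its idf weight
norm_tfidf = tfidf / norm(tfidf, axis=1)[:, np.newaxis]  # l2-normalize each document row
display_features(np.round(norm_tfidf, 2), feature_names)

# the same idf weights can transform the test document's counts
nd_tf = np.asarray(new_doc_features, dtype='float64')
nd_tfidf_scratch = nd_tf * idf
nd_tfidf_scratch = nd_tfidf_scratch / norm(nd_tfidf_scratch, axis=1)[:, np.newaxis]
display_features(np.round(nd_tfidf_scratch, 2), feature_names)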