# Code example #1 (コード例 #1)
# Show the vocabulary terms extracted by the vectorizer
print(feature_names)

# Pretty-print each feature matrix alongside the vocabulary:
# the training-corpus counts first, then the new/test document's counts
for matrix in (features, new_doc_features):
    display_features(matrix, feature_names)


# Now let's try the same with tf-idf weights instead of frequency counts.
# We use the tfidf_transformer helper defined in feature_extractors.
import numpy as np
from feature_extractors import tfidf_transformer

# NOTE(review): get_feature_names() was deprecated and removed in
# scikit-learn 1.2 — newer versions need get_feature_names_out(); confirm
# which sklearn version this project pins before changing.
feature_names = bow_vectorizer.get_feature_names()

# Fit the transformer on the training bag-of-words features.
# (typo fix: the original bound the result to "tdidf_features")
tfidf_trans, tfidf_features = tfidf_transformer(bow_features)
# Convert to dense form and round to 2 decimals so the printout is readable
features = np.round(tfidf_features.todense(), 2)
display_features(features, feature_names)

# Transform the test document with the already-fitted transformer
nd_tfidf = tfidf_trans.transform(new_doc_features)
nd_features = np.round(nd_tfidf.todense(), 2)
display_features(nd_features, feature_names)



# We can also compute tf-idf scores/vectors ourselves from scratch,
# without using sklearn's TfidfTransformer class.
import scipy.sparse as sp
from numpy.linalg import norm

feature_names = bow_vectorizer.get_feature_names()

# Bag-of-words counts for the test document, densified for printing
new_doc_features = bow_vectorizer.transform(new_doc)
new_doc_features = new_doc_features.todense()
# Fix: Python 2 `print x` statements are syntax errors under Python 3;
# the rest of this file already uses print() calls.
print(new_doc_features)

feature_names = bow_vectorizer.get_feature_names()
print(feature_names)

display_features(features, feature_names)
display_features(new_doc_features, feature_names)


# Tf-idf via the helper transformer (repeats the section above).
import numpy as np
from feature_extractors import tfidf_transformer

feature_names = bow_vectorizer.get_feature_names()

# Fit on the training features; transform and display both matrices.
# (typo fix: the original bound the result to "tdidf_features")
tfidf_trans, tfidf_features = tfidf_transformer(bow_features)
features = np.round(tfidf_features.todense(), 2)
display_features(features, feature_names)

# Reuse the fitted transformer on the test document
nd_tfidf = tfidf_trans.transform(new_doc_features)
nd_features = np.round(nd_tfidf.todense(), 2)
display_features(nd_features, feature_names)



# Begin the from-scratch tf-idf computation (the idf and normalization
# steps appear to continue beyond this excerpt).
import scipy.sparse as sp
from numpy.linalg import norm
feature_names = bow_vectorizer.get_feature_names()

# compute term frequency: dense matrix of raw counts, one row per document
tf = bow_features.todense()