Code example #1
    csvfile.close()
# Convert the rows accumulated from the CSV file (read above, outside this
# excerpt) into an integer NumPy array.
# NOTE(review): dtype=int assumes every CSV field parsed so far is numeric —
# confirm against the loading loop.
testdata = np.array(testdata, dtype=int)

# Build the TF-IDF representation of the documents.
# The vectorizer is configured identically in both branches, so construct it
# once; only the text used to learn the vocabulary/IDF weights differs:
# when `doc` is truthy it is learned on corpus + data, otherwise on `data`
# alone.  (`doc`, `corpus`, `data`, and the max_df/min_df/max_features
# settings are defined earlier in the script.)
vectorizer = TfidfVectorizer(max_df=max_df,
                             min_df=min_df,
                             max_features=max_features,
                             stop_words='english')
if doc:
    train = corpus + data
    vectorizer = vectorizer.fit(train)
else:
    # BUG FIX: the original called the non-existent method `.fir(data)`;
    # the intended call is `.fit(data)`.
    vectorizer = vectorizer.fit(data)
X = vectorizer.transform(data)

# Report how long vectorization took and the shape of the resulting matrix.
elapsed = time() - t0
print("done in %fs" % elapsed)
n_samples, n_features = X.shape
print("n_samples: %d, n_features: %d" % (n_samples, n_features))
print()

# Dimensionality reduction via latent semantic analysis (truncated SVD).
# NOTE(review): `if True:` reads like a placeholder for an option flag
# (e.g. `if n_components:`) — as written the branch always executes.
# The body appears to continue past this excerpt (the pipeline is built
# but not yet applied to X here).
if True:
    print("Performing dimensionality reduction using LSA")
    t0 = time()
    # Vectorizer results are normalized, which makes KMeans behave as
    # spherical k-means for better results. Since LSA/SVD results are
    # not normalized, we have to redo the normalization.
    svd = TruncatedSVD(n_components)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
Code example #2
            # Strip every stop word from the message text.
            # NOTE(review): str.replace also removes the keyword when it
            # occurs inside a longer word — confirm that is intended.
            for keyword in stopwords:
                text = text.replace(keyword, "")
            ### append the text to word_data
            word_data.append(text)
            ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
            # BUG FIX: the original read `if name=="sara"":` — the stray
            # trailing quote is a syntax error.
            if name == "sara":
                from_data.append(0)
            else:
                from_data.append(1)
            email.close()

print "emails processed"
from_sara.close()
from_chris.close()

pickle.dump( word_data, open("your_word_data.pkl", "w") )
pickle.dump( from_data, open("your_email_authors.pkl", "w") )





### in Part 4, do TfIdf vectorization here
#the given stopword is "english"
vectorizer=TV(stop_words="english")
vectorizer.fir(word_data)
vectorizer.transform(word_data)
feature_words=vectorizer.get_feature_names()
#print out info
print "total number of words: ", len(feature_words)