# Done reading the CSV; release the file handle.
csvfile.close()
testdata = np.array(testdata, dtype=int)

# Build a TF-IDF representation of the documents.
# When `doc` is truthy, the vectorizer's document-frequency statistics are
# fit on the combined corpus + data so both share one vocabulary; otherwise
# it is fit on `data` alone. In both cases only `data` is transformed.
if doc:
    vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df,
                                 max_features=max_features,
                                 stop_words='english')
    train = corpus + data
    vectorizer = vectorizer.fit(train)
else:
    vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df,
                                 max_features=max_features,
                                 stop_words='english')
    # BUG FIX: was `vectorizer.fir(data)` — AttributeError at runtime;
    # the intended call is fit().
    vectorizer = vectorizer.fit(data)

X = vectorizer.transform(data)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()

if True:
    print("Performing dimensionality reduction using LSA")
    t0 = time()
    # Vectorizer results are normalized, which makes KMeans behave as
    # spherical k-means for better results. Since LSA/SVD results are
    # not normalized, we have to redo the normalization.
    svd = TruncatedSVD(n_components)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
for keyword in stopwords: text = text.replace(keyword, "") ### append the text to word_data word_data.append(text) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris if name=="sara"": from_data.append(0) else: from_data.append(1) email.close() print "emails processed" from_sara.close() from_chris.close() pickle.dump( word_data, open("your_word_data.pkl", "w") ) pickle.dump( from_data, open("your_email_authors.pkl", "w") ) ### in Part 4, do TfIdf vectorization here #the given stopword is "english" vectorizer=TV(stop_words="english") vectorizer.fir(word_data) vectorizer.transform(word_data) feature_words=vectorizer.get_feature_names() #print out info print "total number of words: ", len(feature_words)