def fetch_NMF(train_data, test_data, min_df=2, n_components=50, random_state=None):
    """Project the train and test documents onto NMF components.

    A three-stage pipeline (token counts -> term-frequency transform -> NMF)
    is fitted on the training documents only and then applied, unchanged, to
    the test documents.

    Args:
        train_data: object whose ``data`` attribute holds the training
            documents (e.g. the result of ``fetch_20newsgroups``).
        test_data: object whose ``data`` attribute holds the test documents.
        min_df: minimum document frequency forwarded to ``CountVectorizer``.
            Defaults to 2, the value that used to be hard-coded here.
        n_components: number of NMF components. Defaults to 50, the value
            that used to be hard-coded here.
        random_state: seed forwarded to ``NMF``. The default ``None`` keeps
            the library's own behaviour; pass an int for repeatable runs.

    Returns:
        A tuple ``(nmf_train, nmf_test)`` with the transformed training and
        test documents.
    """
    count_vect = CountVectorizer(
        tokenizer=project1b.tokenizer_class(),
        stop_words=text.ENGLISH_STOP_WORDS,
        lowercase=True,
        min_df=min_df,
        max_df=0.99,
    )

    # use_idf=False switches the IDF reweighting off even though the pipeline
    # step is still called 'tf-idf'.
    # NOTE(review): confirm that plain term frequencies are what NMF should see.
    tf_transformer = TfidfTransformer(use_idf=False)

    nmf_model = NMF(n_components=n_components, random_state=random_state)

    pipeline = Pipeline([
        ('vectorize', count_vect),
        ('tf-idf', tf_transformer),
        ('nmf', nmf_model),
    ])

    # Fit on the training split only; the test split is merely transformed.
    nmf_train = pipeline.fit_transform(train_data.data)
    nmf_test = pipeline.transform(test_data.data)
    return nmf_train, nmf_test
'rec.sport.baseball', 'rec.sport.hockey' ]  # tail of a list opened above this excerpt (presumably `categories`, used below)

# Load the 20 Newsgroups train and test subsets restricted to `categories`,
# shuffled with a fixed seed (42).
train_data = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
test_data = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

# NOTE(review): the return value of project1e.classify is discarded for both
# splits, so it presumably modifies the dataset object in place (e.g. its
# targets) -- confirm against project1e.
project1e.classify(train_data)
project1e.classify(test_data)

# fetch LSI representation
# This vectorizer is built but not used anywhere in the visible part of the
# script; the features below come from project1d instead. `stop_words` must
# already be defined above this excerpt.
count_vect = CountVectorizer(tokenizer=project1b.tokenizer_class(), stop_words=stop_words, lowercase=True, min_df=2, max_df=0.99)
#need to try min_df = 2, and min_df = 5

# Earlier inline variant, left disabled (apparently superseded by the
# project1d.fetch_LSI call below):
#svd_train = pipeline.fit_transform(train_data.data)
#svd_test = pipeline.transform(test_data.data)

classifier = GaussianNB()

# Both reduced representations are computed, but only the LSI (svd_*) features
# are consumed in the visible code; nmf_train / nmf_test are not used within
# this excerpt.
svd_train, svd_test = project1d.fetch_LSI(train_data, test_data)
nmf_train, nmf_test = project1d.fetch_NMF(train_data, test_data)

print("Training Naive Bayes classifier")
# Fit Gaussian Naive Bayes on the LSI features against the training targets.
classifier.fit(svd_train, train_data.target)
import nltk
import math

# Disabled draft of calculate_tcicf. The commented-out expression is
# syntactically incomplete as written (the log10 argument ends in a dangling
# "/"), so it cannot simply be uncommented.
#def calculate_tcicf(freq,categories, categories_per_term):
#    result = freq* math.log10(categories/)
#    (categories/float(1+categories_per_term)))
#    return result

# Module-level transformer with default settings; it is not used within this
# excerpt.
tfidf_transformer = TfidfTransformer()
stop_words = text.ENGLISH_STOP_WORDS

# Word-level unigram counts using the project tokenizer. Terms must occur in
# at least 5 documents (min_df=5) and in at most 99% of them (max_df=0.99).
vectorizer = CountVectorizer(analyzer='word', stop_words=stop_words, ngram_range=(1, 1),
                             tokenizer=project1b.tokenizer_class(), lowercase=True,
                             max_df=0.99, min_df=5)
#min_df = 5

# The 20 newsgroup category names.
all_categories = [
    'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'comp.windows.x', 'rec.autos', 'rec.motorcycles',
    'rec.sport.baseball', 'rec.sport.hockey', 'alt.atheism', 'sci.crypt',
    'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian',
    'misc.forsale', 'talk.politics.guns', 'talk.politics.mideast',
    'talk.politics.misc', 'talk.religion.misc' ]

# Starts empty; presumably filled per category by code below this excerpt.
all_docs_per_category = []