Example #1
import project1b  # project module that supplies the custom tokenizer

from sklearn.decomposition import NMF
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline


def fetch_NMF(train_data, test_data):
    stop_words = text.ENGLISH_STOP_WORDS
    count_vect = CountVectorizer(tokenizer=project1b.tokenizer_class(),
                                 stop_words=stop_words,
                                 lowercase=True,
                                 min_df=2,
                                 max_df=0.99)
    # use_idf=False, so this step applies TF normalization only,
    # despite the 'tf-idf' step name below
    tf_transformer = TfidfTransformer(use_idf=False)
    nmf_model = NMF(n_components=50)
    pipeline = Pipeline([('vectorize', count_vect), ('tf-idf', tf_transformer),
                         ('nmf', nmf_model)])
    # fit vocabulary and NMF factors on the training set, then reuse them
    # to transform the test set
    nmf_train = pipeline.fit_transform(train_data.data)
    nmf_test = pipeline.transform(test_data.data)
    return nmf_train, nmf_test
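
A minimal usage sketch, assuming the 20newsgroups loader seen in Example #2; the two-category list here is illustrative, not the project's actual list:

from sklearn.datasets import fetch_20newsgroups

categories = ['rec.sport.baseball', 'rec.sport.hockey']  # illustrative only
train = fetch_20newsgroups(subset='train', categories=categories,
                           shuffle=True, random_state=42)
test = fetch_20newsgroups(subset='test', categories=categories,
                          shuffle=True, random_state=42)
nmf_train, nmf_test = fetch_NMF(train, test)
print(nmf_train.shape)  # (n_train_docs, 50): one row per document, 50 NMF topics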
Example #2
    categories = [
        # earlier entries truncated in the original excerpt
        'rec.sport.baseball', 'rec.sport.hockey'
    ]
    train_data = fetch_20newsgroups(subset='train',
                                    categories=categories,
                                    shuffle=True,
                                    random_state=42)
    test_data = fetch_20newsgroups(subset='test',
                                   categories=categories,
                                   shuffle=True,
                                   random_state=42)

    project1e.classify(train_data)
    project1e.classify(test_data)
    # fetch LSI representation

    stop_words = text.ENGLISH_STOP_WORDS  # assumed; defined this way in the other examples
    count_vect = CountVectorizer(tokenizer=project1b.tokenizer_class(),
                                 stop_words=stop_words,
                                 lowercase=True,
                                 min_df=2,
                                 max_df=0.99)
    # TODO: try both min_df=2 and min_df=5

    #svd_train = pipeline.fit_transform(train_data.data)
    #svd_test = pipeline.transform(test_data.data)

    classifier = GaussianNB()
    svd_train, svd_test = project1d.fetch_LSI(train_data, test_data)
    nmf_train, nmf_test = project1d.fetch_NMF(train_data, test_data)

    print("Training Naive Bayes classifier")
    classifier.fit(svd_train, train_data.target)
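
The excerpt ends right after fitting. A plausible continuation, assuming sklearn's accuracy_score (not shown in the original), would score both the LSI and NMF representations:

    from sklearn.metrics import accuracy_score

    # evaluate the fitted classifier on the held-out LSI features
    svd_pred = classifier.predict(svd_test)
    print("LSI accuracy:", accuracy_score(test_data.target, svd_pred))

    # refit the same classifier type on the NMF features for comparison
    nmf_classifier = GaussianNB().fit(nmf_train, train_data.target)
    nmf_pred = nmf_classifier.predict(nmf_test)
    print("NMF accuracy:", accuracy_score(test_data.target, nmf_pred))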
Example #3
import math

import nltk
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# def calculate_tcicf(freq, categories, categories_per_term):
#     # TC-ICF: scale term frequency by the log of (number of categories /
#     # number of categories containing the term, smoothed by +1)
#     result = freq * math.log10(categories / float(1 + categories_per_term))
#     return result

tfidf_transformer = TfidfTransformer()

stop_words = text.ENGLISH_STOP_WORDS

vectorizer = CountVectorizer(analyzer='word',
                             stop_words=stop_words,
                             ngram_range=(1, 1),
                             tokenizer=project1b.tokenizer_class(),
                             lowercase=True,
                             max_df=0.99,
                             min_df=5)

all_categories = [
    'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'comp.windows.x', 'rec.autos', 'rec.motorcycles',
    'rec.sport.baseball', 'rec.sport.hockey', 'alt.atheism', 'sci.crypt',
    'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian',
    'misc.forsale', 'talk.politics.guns', 'talk.politics.mideast',
    'talk.politics.misc', 'talk.religion.misc'
]

all_docs_per_category = []
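
The excerpt stops as all_docs_per_category is initialized. A hypothetical continuation, assuming fetch_20newsgroups as in Example #2, could merge each category's training documents into one string so the vectorizer above produces per-category term counts:

from sklearn.datasets import fetch_20newsgroups

# hypothetical: build one concatenated "category document" per category
for category in all_categories:
    bunch = fetch_20newsgroups(subset='train', categories=[category],
                               shuffle=True, random_state=42)
    all_docs_per_category.append(" ".join(bunch.data))

counts = vectorizer.fit_transform(all_docs_per_category)
print(counts.shape)  # (20 categories, vocabulary size)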