# Evaluate a Naive Bayes classifier on `corpus`: cross-validate on the full
# feature set, cross-validate again on the top information-gain features,
# then train a final model and load the sample test file.
#
# Every print below passes one pre-formatted argument, so the emitted text is
# the same under Python 2's print statement and Python 3's print function.
print('')

# Train Naive Bayes on all documents.
# To test the accuracy of the classifier, use 10-fold cross-validation.
# This yields 4 scores: Accuracy, Precision, Recall and F-score.
print('Bayes Classifier')
print('-------------------------')
print('(Accuracy, Precision,REcall,F-Measure)')
print(Bayes.test(corpus, folds=10))

# Cross-validation on the reduced dataset: keep only the `nfeatures` features
# selected with method=IG and rebind `corpus` to the filtered corpus.
nfeatures = 10000
f = corpus.feature_selection(top=nfeatures, method=IG)
corpus = corpus.filter(features=f)
# The two spaces before "features" reproduce the original output exactly.
print('Bayes Classifier on Reduced dataset of %d  features' % nfeatures)
print('-------------------------')
print('(Accuracy, Precision,REcall,F-Measure)')
print(Bayes.test(corpus, folds=10))

# Testing the model on the sample dataset.
# Note that `corpus` is the filtered corpus at this point, so the classifier
# is trained on the reduced feature set.
print('Testing Model on Sample Dataset')
classifier = Bayes()
for document in corpus.documents:
    classifier.train(document, type=document.type)

# In the file the top 10 lines are negative tweets and the rest are positive
# tweets.
# The context manager closes the handle as soon as the lines are read; `ft`
# stays bound (closed), so a later ft.close() is a harmless no-op.
with open('test_20', 'r') as ft:
    test_lines = ft.readlines()
#print xxx
# Time 10-fold KNN cross-validation on `corpus` before and after reducing it
# to the 150 features returned by feature_selection().
#
# Every print below passes one argument, so the emitted text is the same
# under Python 2's print statement and Python 3's print function.
from time import time  # imported once; originally repeated before each timer

# Corpus dimensions before feature selection.
print(len(corpus))
print(len(corpus.features))
print(len(corpus.documents[0].vector))

# Baseline: cross-validation on the unfiltered corpus, then elapsed seconds.
t = time()
print(KNN.test(corpus, folds=10))
print(time() - t)

# Feature selection: print the selected features and how long selection took,
# then rebind `corpus` to the filtered corpus.
print("filter...")
t = time()
f = corpus.feature_selection(150, verbose=False)
print(f)
print(time() - t)
corpus = corpus.filter(f)

# Disabled experiments (LSA reduction and debug output), kept for reference.
#corpus.reduce(300)
#print len(corpus.lsa.vectors[corpus.documents[0].id])
#print corpus.lsa.vectors[corpus.documents[0].id]
#print len(corpus)
#print len(corpus.lsa.terms)
#print corpus.feature_selection(top=100, verbose=True)

# Cross-validation on the filtered corpus.
# NOTE(review): the timer is restarted here, but no matching elapsed-time
# print is visible in this excerpt -- confirm it follows.
t = time()
print(KNN.test(corpus, folds=10))
# Build a TF-IDF corpus from the negative and positive tweet lines, keep the
# top 1000 features selected with method=IG, and report 10-fold KNN
# cross-validation scores.
#
# Every print below passes one pre-formatted argument, so the emitted text is
# the same under Python 2's print statement and Python 3's print function.
print('Number of Negative Tweets: %d' % len(neg_lines))
print('Number of Positive Tweets: %d' % len(pos_lines))

# One Document per tweet line; negative tweets get type '0', positive '1'.
# NOTE(review): Pattern's documented keyword is `stopwords`; confirm that
# `stopword=True` has the intended effect in the installed version.
documents = []
for label, lines in (('0', neg_lines), ('1', pos_lines)):
    for line in lines:
        documents.append(
            Document(line, stopword=True, stemmer=PORTER, type=label))

corpus = Corpus(documents, weight=TFIDF)
print("number of documents: %d" % len(corpus))
print("number of words: %d" % len(corpus.vector))
# %s formats the float with str(), exactly as the print statement did.
print("number of words (average): %s"
      % (sum(len(d.terms) for d in corpus.documents) / float(len(corpus))))
print('')

# Filtering the top 1000 features using the Information Gain criterion.
corpus = corpus.filter(features=corpus.feature_selection(top=1000, method=IG))

# To test the accuracy of the classifier, use 10-fold cross-validation.
# This yields 4 scores: Accuracy, Precision, Recall and F-score.
print('classifying using KNN')
print('-------------------------')
print('(Accuracy, Precision,REcall,F-Measure)')
print(KNN.test(corpus, k=100, folds=10, distance=COSINE))

# The input files were opened before this excerpt; release them here.
f_neg.close()
f_pos.close()