print



# Train Naive Bayes on all documents.
# To test the accuracy of a classifier, Using 10-fold crossvalidation
# This yields 4 scores: Accuracy, Precision, Recall and F-score.
print 'Bayes Classifier'
print  '-------------------------'
print  '(Accuracy, Precision,REcall,F-Measure)'
print Bayes.test(corpus,folds=10)


#Crossavalidation on reduced Dataset
nfeatures=10000
f=corpus.feature_selection(top=nfeatures,method=IG)
corpus=corpus.filter(features=f)
print 'Bayes Classifier on Reduced dataset of', nfeatures,' features'
print  '-------------------------'
print  '(Accuracy, Precision,REcall,F-Measure)'
print Bayes.test(corpus,folds=10)


#Testing Model on sample Dataset
print 'Testing Model on Sample Dataset'
classifier = Bayes()
for document in corpus.documents:
    classifier.train(document,type=document.type)
# In the file top 10 are negative tweets and rest are positive tweets
ft=open('test_20','r')
test_lines=ft.readlines()
# Example #2
# 0
#print xxx


print len(corpus)
print len(corpus.features)
print len(corpus.documents[0].vector)
from time import time
t = time()
print KNN.test(corpus, folds=10)
print time()-t

print "filter..."

from time import time
t = time()
f = corpus.feature_selection(150, verbose=False)
print f
print time()-t
corpus = corpus.filter(f)

#corpus.reduce(300)
#print len(corpus.lsa.vectors[corpus.documents[0].id])
#print corpus.lsa.vectors[corpus.documents[0].id]
#print len(corpus)
#print len(corpus.lsa.terms)

#print corpus.feature_selection(top=100, verbose=True)

# Restart the timer and run 10-fold cross-validation of KNN on the corpus.
# NOTE(review): `t` is started here but no elapsed-time print is visible
# after the test in this excerpt -- the fragment appears truncated; confirm
# against the full script.
from time import time
t = time()
print KNN.test(corpus, folds=10)
print 'Number of Negative Tweets:',len(neg_lines)
print 'Number of Positive Tweets:',len(pos_lines)

documents = []
for line in neg_lines:
    document = Document(line,stopword=True,stemmer=PORTER,type='0')
    documents.append(document)
for line in pos_lines:
    document = Document(line,stopword=True,stemmer=PORTER,type='1')
    documents.append(document)

corpus = Corpus(documents,weight=TFIDF)
print "number of documents:", len(corpus)
print "number of words:", len(corpus.vector)
print "number of words (average):", sum(len(d.terms) for d in corpus.documents) / float(len(corpus))
print

#Filtering top 1000 features using Information Gain Criterion
corpus=corpus.filter(features=(corpus.feature_selection(top=1000,method=IG)))

# To test the accuracy of a classifier, Using 10-fold crossvalidation
# This yields 4 scores: Accuracy, Precision, Recall and F-score.
print 'classifying using KNN'
print  '-------------------------'
print  '(Accuracy, Precision,REcall,F-Measure)'
print KNN.test(corpus,k=100,folds=10,distance=COSINE)

f_neg.close()
f_pos.close()