def test(classif, n=1, train_size=500, mode='k', iterations=1, dataset='', extra_dataset=None, limit=None, binary=False, idf=False, negation=True): (pos_dir, neg_dir) = select_dataset(dataset) if extra_dataset: mode = 'd' iterations = 1 train_size = 1000 test_set = dataset else: test_set = None print "TEST CONFIGURATION" print "dataset: %(dataset)s, stars: %(extra_dataset)s \nn: %(n)s, limit: %(limit)s, binary: %(binary)s, \nmode: %(mode)s, iterations: %(iterations)s, idf: %(idf)s" % { 'n': n, 'train_size': train_size, 'mode': mode, 'iterations': iterations, 'dataset': dataset, 'extra_dataset': extra_dataset, 'limit': limit, 'binary': binary, 'idf': idf } ind = Indexes(mode=mode, iterations=iterations, train_size=train_size) (pos_correct, neg_correct) = (0, 0) for k in range(iterations): ind.next() m = TestConfiguration(classif, n, ind, pos_dir, neg_dir, idf=idf, test_set=test_set, binary=binary, limit=limit, negation=negation) m.train() (pos, neg) = m.test() pos_correct += pos neg_correct += neg print "Results:" print "Positive:", round((pos_correct / iterations) * 100), "%" print "Negative:", round((neg_correct / iterations) * 100), "%" print "Total:", round( (neg_correct + pos_correct) / (2 * iterations) * 100), "%"
def crossValidate(self, iterations, mode='k', train_size=500): ind = Indexes(mode=mode, iterations=iterations, train_size=train_size) pos_correct = 0 neg_correct = 0 for k in range(iterations): ind.next() self.set_index(ind) self.train() (pos, neg) = self.test() pos_correct += pos neg_correct += neg print "Results:" print "Positive:", round((pos_correct/iterations)*100), "%" print "Negative:", round((neg_correct/iterations)*100), "%" print "Total:", round((neg_correct + pos_correct)/(2*iterations)*100), "%"
def crossValidate(self, iterations, mode='k', train_size=500): ind = Indexes(mode=mode, iterations=iterations, train_size=train_size) pos_correct = 0 neg_correct = 0 for k in range(iterations): ind.next() self.set_index(ind) self.train() (pos, neg) = self.test() pos_correct += pos neg_correct += neg print "Results:" print "Positive:", round((pos_correct / iterations) * 100), "%" print "Negative:", round((neg_correct / iterations) * 100), "%" print "Total:", round( (neg_correct + pos_correct) / (2 * iterations) * 100), "%"
def test(classif, n=1, train_size=500, mode='k', iterations=1, dataset='', extra_dataset=None, limit=None, binary=False, idf=False, negation=True): (pos_dir, neg_dir) = select_dataset(dataset) if extra_dataset: mode='d' iterations=1 train_size = 1000 test_set = dataset else: test_set = None print "TEST CONFIGURATION" print "dataset: %(dataset)s, stars: %(extra_dataset)s \nn: %(n)s, limit: %(limit)s, binary: %(binary)s, \nmode: %(mode)s, iterations: %(iterations)s, idf: %(idf)s" % {'n':n, 'train_size':train_size, 'mode':mode, 'iterations':iterations, 'dataset':dataset, 'extra_dataset':extra_dataset, 'limit':limit, 'binary':binary, 'idf':idf} ind = Indexes(mode=mode,iterations=iterations,train_size=train_size) (pos_correct, neg_correct) = (0,0) for k in range(iterations): ind.next() m = TestConfiguration(classif, n, ind, pos_dir, neg_dir, idf=idf, test_set=test_set, binary=binary, limit=limit, negation=negation) m.train() (pos, neg) = m.test() pos_correct += pos neg_correct += neg print "Results:" print "Positive:", round((pos_correct/iterations)*100), "%" print "Negative:", round((neg_correct/iterations)*100), "%" print "Total:", round((neg_correct + pos_correct)/(2*iterations)*100), "%"
# Driver fragment (Python 2): sets up a majority-voting ensemble of classifiers.
# NOTE(review): this chunk is cut off mid-argument-list in the final
# TestConfiguration(...) call; a complete copy of the same script follows later.
train_size = 800
mode = 'k'
iterations = 3
extra_dataset = None
# print "Bayes:"
# test(classifier.BayesClassifier,n=n,train_size=train_size,mode=mode,iterations=iterations,
# dataset=dataset,extra_dataset=extra_dataset,limit=limit,binary=binary, idf=idf, negation = negation)
print "MaxEnt:"
# test(classifier.MaximumEntropyClassifier,n=n,train_size=train_size,mode=mode,iterations=iterations, dataset=dataset,extra_dataset=extra_dataset,limit=limit,binary=binary, idf=idf, negation=negation)
# print "SVM:"
# test(classifier.LinearSVMClassifier,n=n,train_size=train_size,mode=mode,iterations=iterations, dataset=dataset,extra_dataset=extra_dataset,limit=limit,binary=binary, idf=idf, negation=negation)
# Ensemble tester; presumably each added TestConfiguration casts one vote per
# document — confirm against MajorityVotingTester.
mvc = MajorityVotingTester(negation)
# Single k-fold split shared by all ensemble members so they train on the same data.
ind = Indexes(mode='k', iterations=3, train_size=800)
ind.next()
print ind
(pos_dir, neg_dir) = select_dataset(dataset)
m1 = TestConfiguration(classifier.BayesClassifier, n=n, ind=ind,
                       pos_dir=pos_dir, neg_dir=neg_dir, binary=binary,
                       limit=limit, idf=idf)
mvc.addClassifier(m1)
(pos_dir, neg_dir) = select_dataset(dataset)
# Truncated here in the original source: the call below has no closing argument list.
m2 = TestConfiguration(classifier.MaximumEntropyClassifier,
# Driver fragment (Python 2): builds a majority-voting ensemble from three
# TestConfiguration members (Bayes, MaxEnt, linear SVM) over one shared split.
train_size = 800
mode = 'k'
iterations = 3
extra_dataset = None
# print "Bayes:"
# test(classifier.BayesClassifier,n=n,train_size=train_size,mode=mode,iterations=iterations,
# dataset=dataset,extra_dataset=extra_dataset,limit=limit,binary=binary, idf=idf, negation = negation)
print "MaxEnt:"
# test(classifier.MaximumEntropyClassifier,n=n,train_size=train_size,mode=mode,iterations=iterations, dataset=dataset,extra_dataset=extra_dataset,limit=limit,binary=binary, idf=idf, negation=negation)
# print "SVM:"
# test(classifier.LinearSVMClassifier,n=n,train_size=train_size,mode=mode,iterations=iterations, dataset=dataset,extra_dataset=extra_dataset,limit=limit,binary=binary, idf=idf, negation=negation)
# Ensemble tester; presumably each added TestConfiguration casts one vote per
# document — confirm against MajorityVotingTester.
mvc = MajorityVotingTester(negation)
# One k-fold split reused by every member so they see identical train/test data.
ind = Indexes(mode='k', iterations=3, train_size=800)
ind.next()
print ind
(pos_dir, neg_dir) = select_dataset(dataset)
m1 = TestConfiguration(classifier.BayesClassifier, n=n, ind=ind,
                       pos_dir=pos_dir, neg_dir=neg_dir, binary=binary,
                       limit=limit, idf=idf)
mvc.addClassifier(m1)
(pos_dir, neg_dir) = select_dataset(dataset)
m2 = TestConfiguration(classifier.MaximumEntropyClassifier, n=n, ind=ind,
                       pos_dir=pos_dir, neg_dir=neg_dir, binary=binary,
                       limit=limit, idf=idf)
mvc.addClassifier(m2)
(pos_dir, neg_dir) = select_dataset(dataset)
m3 = TestConfiguration(classifier.LinearSVMClassifier, n=n, ind=ind,
                       pos_dir=pos_dir, neg_dir=neg_dir, binary=binary,
                       limit=limit, idf=idf)
mvc.addClassifier(m3)
#!/usr/bin/python import os import ngrams from Indexes import Indexes import matplotlib.pyplot as plt from classifier import MaximumEntropyClassifier TRAIN_SIZE = 800 n = 1 print "Maximum Entropy" pos = os.listdir("pos") neg = os.listdir("neg") ind = Indexes('r',1,TRAIN_SIZE) print "> determined Indices" ind.next() pos_grams = [ngrams.ngrams(n, open("pos/"+pos[i]).read()) for i in ind.get_pos_train_ind()] pos_collapsed_grams = ngrams.top_ngrams(ngrams.collapse_ngrams(pos_grams),16165) neg_grams = [ngrams.ngrams(n, open("neg/"+neg[i]).read()) for i in ind.get_neg_train_ind()] neg_collapsed_grams = ngrams.top_ngrams(ngrams.collapse_ngrams(neg_grams),16165) print "> collapsed grams" trainingset = [([k],'pos',v) for (k,v) in pos_collapsed_grams.iteritems()] trainingset.extend([([k],'neg',v) for (k,v) in neg_collapsed_grams.iteritems()]) m = MaximumEntropyClassifier(trainingset) print "> created model" pos_res = []
#!/usr/bin/python import os import ngrams from Indexes import Indexes import matplotlib.pyplot as plt from classifier import MaximumEntropyClassifier TRAIN_SIZE = 800 n = 1 print "Maximum Entropy" pos = os.listdir("pos") neg = os.listdir("neg") ind = Indexes('r', 1, TRAIN_SIZE) print "> determined Indices" ind.next() pos_grams = [ ngrams.ngrams(n, open("pos/" + pos[i]).read()) for i in ind.get_pos_train_ind() ] pos_collapsed_grams = ngrams.top_ngrams(ngrams.collapse_ngrams(pos_grams), 16165) neg_grams = [ ngrams.ngrams(n, open("neg/" + neg[i]).read()) for i in ind.get_neg_train_ind() ]