def compare_classifiers(test_set, full_training_labeld_features, step, max_size): """ compare Naive Bayes with MaxEnt on different training set sizes """ nb_acc = [] me_acc = [] for size in range(step, max_size+1, step): print "creating trainig set of size", size training_set = create_even_training_set(size, full_training_labeld_features) print "train NBSentimentClassifier" nb_classifier = NBSentimentClassifier().train(training_set) nb_acc.append(nb_classifier.test_accuracy(test_set)) print "train MaxEntSentimentClassifier" me_classifier = MaxEntSentimentClassifier().train(training_set) me_acc.append(me_classifier.test_accuracy(test_set)) return nb_acc, me_acc
""" iterator that splits a list into num_chunks chunks. truncates remainder """ slice_len = len(alist)/num_chunks for x in xrange(0, num_chunks): yield alist[x*slice_len:(x+1)*slice_len] def combine_dicts(a, b, op=None): op = op or (lambda x, y: x + y) return dict(a.items() + b.items() + [(k, op(a[k], b[k])) for k in set(b) & set(a)]) comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() classifier = NBSentimentClassifier().load_model() if(rank == 0): if len(sys.argv) > 1: csvFile = sys.argv[1] keyword = sys.argv[2] else: csvFile = 'trainingandtestdata/testdata.csv' tweetlist = loadTwitterCSV(csvFile) #tweetlist = loadTwitterCSV('trainingandtestdata/training.1600000.processed.noemoticon.csv') tweetlist = chunked(tweetlist, size) else: tweetlist = None # tweetlist must be defined
args = parser.parse_args() print "creating feature sets..." tweetlist = loadTwitterCSV('trainingandtestdata/testdata.csv') labeld_features = label_feats_from_tweets(tweetlist) #training_set, test_set = split_label_feats(labeld_features) tweetlist = loadTwitterCSV('trainingandtestdata/training.1600000.processed.noemoticon.csv') training_set = label_feats_from_tweets(tweetlist) training_set, garbage = split_label_feats(training_set, 1.0) test_set, garbage = split_label_feats(labeld_features, 1.0) print "training set length: %i test set length: %i" % (len(training_set), len(test_set)) #print prettifyFeatureSet(test_set) if args.algo == 'nb': classifier = NBSentimentClassifier().train(training_set) print "training NaiveBayes classifier..." else: classifier = MaxEntSentimentClassifier().train(training_set) print "training MaxEnt classifier..." print "calculating accuracy..." print 'accuracy:', classifier.test_accuracy(test_set) #classifier.show_most_informative_features(30) classifier.save_model() # load a serialized trained classifier #classifier = NBSentimentClassifier().load_model() #classifier = MaxEntSentimentClassifier().load_model()