Example #1
0
def compare_classifiers(test_set, full_training_labeld_features, step, max_size):
	""" compare Naive Bayes with MaxEnt on different training set sizes """
	nb_acc = []
	me_acc = []

	for size in range(step, max_size+1, step):
		print "creating trainig set of size", size
		training_set = create_even_training_set(size, full_training_labeld_features)
		print "train NBSentimentClassifier"
		nb_classifier = NBSentimentClassifier().train(training_set)
		nb_acc.append(nb_classifier.test_accuracy(test_set))
		print "train MaxEntSentimentClassifier"
		me_classifier = MaxEntSentimentClassifier().train(training_set)
		me_acc.append(me_classifier.test_accuracy(test_set))
	return nb_acc, me_acc
Example #2
0
	""" iterator that splits a list into num_chunks chunks. truncates remainder """
	slice_len = len(alist)/num_chunks
	for x in xrange(0, num_chunks):
		yield alist[x*slice_len:(x+1)*slice_len]

def combine_dicts(a, b, op=None):
    """Return a new dict merging a and b without modifying either input.

    Keys present in both dicts get op(a[k], b[k]) (op defaults to addition);
    keys present in only one dict keep their value.
    """
    op = op or (lambda x, y: x + y)
    # The original concatenated a.items() + b.items(), which only works while
    # items() returns lists (Python 2) and allocates throwaway lists anyway.
    # Copy + update is equivalent and works on both Python 2 and 3.
    merged = dict(a)
    merged.update(b)
    # overwrite the shared keys with the combined value
    merged.update((k, op(a[k], b[k])) for k in set(a) & set(b))
    return merged


# MPI bootstrap: every process learns its own rank and the total number of
# processes in the job; `size` is later used to partition the tweet list.
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

# Every rank gets its own classifier instance; load_model() presumably
# deserializes a previously trained model -- confirm against its definition.
classifier = NBSentimentClassifier().load_model()


# Only the root process parses arguments and loads/partitions the tweet data;
# the other ranks just define tweetlist so a later scatter/bcast can run.
if(rank == 0):
	if len(sys.argv) > 1:
		csvFile = sys.argv[1]
		# sys.argv[2] was read unconditionally before, which raised an
		# IndexError when only the CSV path was given -- keyword is optional
		keyword = sys.argv[2] if len(sys.argv) > 2 else None
	else:
		csvFile = 'trainingandtestdata/testdata.csv'
		# keyword was left undefined on this branch, causing a NameError
		# wherever it is read later; None means "no keyword given" and
		# downstream code must handle it -- TODO confirm against callers
		keyword = None

	tweetlist = loadTwitterCSV(csvFile)
	#tweetlist = loadTwitterCSV('trainingandtestdata/training.1600000.processed.noemoticon.csv')
	# split into `size` chunks, one per MPI process
	tweetlist = chunked(tweetlist, size)
else:
	tweetlist = None # tweetlist must be defined
Example #3
0
	args = parser.parse_args()
	print "creating feature sets..."
	tweetlist = loadTwitterCSV('trainingandtestdata/testdata.csv')
	labeld_features = label_feats_from_tweets(tweetlist)
	#training_set, test_set = split_label_feats(labeld_features)

	tweetlist = loadTwitterCSV('trainingandtestdata/training.1600000.processed.noemoticon.csv')
	training_set = label_feats_from_tweets(tweetlist)
	training_set, garbage = split_label_feats(training_set, 1.0)
	test_set, garbage = split_label_feats(labeld_features, 1.0)

	print "training set length: %i  test set length: %i" % (len(training_set), len(test_set))
	#print prettifyFeatureSet(test_set)
	
	if args.algo == 'nb':
		classifier = NBSentimentClassifier().train(training_set)
		print "training NaiveBayes classifier..."
	else:
		classifier = MaxEntSentimentClassifier().train(training_set)
		print "training MaxEnt classifier..."

	print "calculating accuracy..."
	print 'accuracy:', classifier.test_accuracy(test_set)
	#classifier.show_most_informative_features(30)


	classifier.save_model()

	# load a serialized trained classifier
	#classifier = NBSentimentClassifier().load_model()
	#classifier = MaxEntSentimentClassifier().load_model()