Example #1
import math
import sys

import csv_parser
import logisticregression

# runLDA and runBayes are defined elsewhere in control.py.
if sys.argv[1] == "-lda" or sys.argv[1] == "-bayes":
    if len(sys.argv) != 4:
        print "Usage: control.py (-lda|-bayes) trainingfile testfile"
        sys.exit(0)
    (x, y) = csv_parser.parse_data(sys.argv[2])
    (testX, testY) = csv_parser.parse_data(sys.argv[3])
    if sys.argv[1] == "-lda":
        runLDA(x, y, testX, testY, sys.argv[3])
    else:
        runBayes(x, y, testX, testY, sys.argv[3])
    sys.exit(0)

# Default is logistic regression: read parameter sets from the control file.
(controls, training, test) = csv_parser.parse_control(sys.argv[1])

(x, y) = csv_parser.parse_data(training)
(testX, testY) = csv_parser.parse_data(test)

for i in xrange(len(controls)):
    # Each params entry appears to hold (learning rate, tolerance,
    # iteration count, number of repetitions).
    params = controls[i]
    resultfile = open(test + "_result" + str(i), 'w')
    resultfile.write('"WeightVector","ConfusionMatrix"\n')
    for j in xrange(int(math.floor(params[3]))):
        result = logisticregression.trainLogisticReg(params[0], params[1], int(params[2]), x, y)
        confusion = logisticregression.getConfusionMatrix(result, testX, testY)
        resultfile.write(str(result) + "," + str(confusion) + "\n")
    resultfile.close()
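
The snippet assumes a csv_parser module exposing parse_data and parse_control; neither is shown. A minimal sketch of what parse_data might look like, assuming each CSV row holds numeric features followed by a 0/1 label in the last column (the real parser may differ):

def parse_data(filename):
    # Hypothetical reader: float features, integer label in the last column.
    x, y = [], []
    with open(filename) as f:
        for line in f:
            fields = line.strip().split(",")
            if len(fields) < 2:
                continue
            x.append([float(v) for v in fields[:-1]])
            y.append(int(fields[-1]))
    return (x, y)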

Example #2
def kfolds_all_algos(k, x, y, train_subjects, isotest_x, isotest_y, isotest_words):
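	# k-fold cross-validation driver: augments each fold's features with
	# indicator bits for the most frequent subjects, then trains naive Bayes
	# and logistic regression (the LDA branch is commented out) and returns
	# the averaged confusion matrices.
	# Depends on module-level imports: copy, operator, naivebayes,
	# logisticregression, and linmod (presumably sklearn.linear_model), plus
	# the helpers splitdata and averageconfusionmatrix defined elsewhere.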
	def word_count(examples, keep):
		# Count how often each subject occurs across the examples and return
		# the subjects at least as frequent as the keep-th most frequent one.
		words = {}
		for ex in examples:
			for subject in ex:
				if subject in words:
					words[subject] += 1
				else:
					words[subject] = 1
		sorted_x = sorted(words.iteritems(), key=operator.itemgetter(1), reverse=True)
		if keep >= len(sorted_x):
			return words.keys()
		limit = sorted_x[keep][1]
		print limit  # debug: the frequency threshold for this fold
		ret_words = []
		for key in words:
			if words[key] >= limit:
				ret_words.append(key)
		return ret_words
		
	k_groups = splitdata(k, x, y, train_subjects)
	#now we have the k groups, assign each one as test once and run tests!
	print "groups split"
	lda_train_results = []
	lda_test_results = []
	#lda_iso_results = []
	nb_train_results = []
	nb_test_results = []
	#nb_iso_results = []
	lr_train_results = []
	lr_test_results = []
	#lr_iso_results = []
	
	for i in xrange(k):
		print "K Fold number " + str(i)
		test = k_groups[i]
		train = []
		train.append([]) #x
		train.append([]) #y
		train.append([]) #words
		for j in xrange(k):
			if(j != i):
				train[0].extend(k_groups[j][0])
				train[1].extend(k_groups[j][1])
				train[2].extend(k_groups[j][2])
		# Perform a word count of the training data and keep the top 50 subjects.
		top_subjects = word_count(train[2], 50)
		
		# Extend train[0] and test[0] with indicator features for the top subjects.
		for n in xrange(0, len(train[0])):
			# Rows may be numpy arrays; convert so they support extend().
			if type(train[0][n]) != list:
				train[0][n] = train[0][n].tolist()

		for n in xrange(0, len(train[0])):
			subjects = train[2][n]
			bits = [0] * len(top_subjects)
			for s in subjects:
				if s in top_subjects:
					bits[top_subjects.index(s)] = 1
			train[0][n].extend(bits)

		for n in xrange(0, len(test[0])):
			# The held-out rows come from the same source, so convert them too
			# before extending (the original only converted the training rows).
			if type(test[0][n]) != list:
				test[0][n] = test[0][n].tolist()
			subjects = test[2][n]
			bits = [0] * len(top_subjects)
			for s in subjects:
				if s in top_subjects:
					bits[top_subjects.index(s)] = 1
			test[0][n].extend(bits)
		
		#Now we have test and training data... what shall we do?
		#train on LDA
		#print "Training LDA..."
		#(prob, mean, cov) = lda.trainLDA(train[0], train[1])
		#print str(prob) + "\t" + str(mean) + "\t" + str(cov)
		#print "DONE training LDA."
		print "Training NB..."
		(py, theta) = naivebayes.trainNaiveBayesMN(train[0], train[1])
		#print str(py) + "\t" + str(theta)
		print "DONE training NB"
		print "Training Logistic Regression..."
		# Prepend a bias term of 1 to every training row.
		t_x = copy.deepcopy(train[0])
		for n in xrange(len(t_x)):
			t_x[n] = [1] + t_x[n]
		(wvector, scales) = logisticregression.trainLogisticReg(0.01, 0.00001, 100, copy.deepcopy(t_x), copy.deepcopy(train[1]))
		#print str(wvector)
		print "DONE training Logistic Regression.\n"
		
		lr_model = linmod.LogisticRegression()
		lr_model.fit(t_x, train[1])
		for model, name in ((lr_model, "LR"),):
			tp, tn, fp, fn = 0, 0, 0, 0
			for n in xrange(0, len(t_x)):
				val = model.predict(t_x[n])
				if (val == 1 and train[1][n] == 1):
					tp += 1
				elif (val == 1 and train[1][n] == 0):
					fp += 1
				elif (val == 0 and train[1][n] == 0):
					tn += 1
				elif (val == 0 and train[1][n] == 1):
					fn += 1
			print "%s - TP: %d, FP: %d, TN: %d, FN: %d" % (name, tp, fp, tn, fn)
			
		#get Prediction Errors on left out set
		lr_test_error = logisticregression.getConfusionMatrix(wvector,scales, copy.deepcopy(test[0]), copy.deepcopy(test[1]))
		lr_train_error = logisticregression.getConfusionMatrix(wvector,scales, copy.deepcopy(train[0]), copy.deepcopy(train[1]))
		#lr_iso_error = logisticregression.getConfusionMatrix(wvector,scales, isotest_x, isotest_y)
		#lda_test_error = lda.getConfusionMatrix(prob, mean, cov, test[0], test[1])
		#lda_train_error = lda.getConfusionMatrix(prob, mean, cov, train[0], train[1])
		#lda_iso_error = lda.getConfusionMatrix(prob, mean, cov, isotest_x, isotest_y)
		nb_test_error = naivebayes.getConfusionMatrixMN(py, theta, test[0], test[1])
		nb_train_error = naivebayes.getConfusionMatrixMN(py, theta, train[0], train[1])
		#nb_iso_error = naivebayes.getConfusionMatrixMN(py, theta, isotest_x, isotest_y)
		
		# Collect the confusion matrices for this fold.
		lr_train_results.append(lr_train_error)
		lr_test_results.append(lr_test_error)
		#lr_iso_results.append(lr_iso_error)
		#lda_train_results.append(lda_train_error)
		#lda_test_results.append(lda_test_error)
		#lda_iso_results.append(lda_iso_error)
		nb_train_results.append(nb_train_error)
		nb_test_results.append(nb_test_error)
		#nb_iso_results.append(nb_iso_error)
		
	#calc average training and test error for each algorithm
	#avr_lda_train = averageconfusionmatrix(lda_train_results)
	#avr_lda_test = averageconfusionmatrix(lda_test_results)
	#avr_lda_iso = averageconfusionmatrix(lda_iso_results)
	avr_lr_train = averageconfusionmatrix(lr_train_results)
	avr_lr_test = averageconfusionmatrix(lr_test_results)
	#avr_lr_iso = averageconfusionmatrix(lr_iso_results)
	avr_nb_train = averageconfusionmatrix(nb_train_results)
	avr_nb_test = averageconfusionmatrix(nb_test_results)
	#avr_nb_iso = averageconfusionmatrix(nb_iso_results)
	#return [avr_lr_train, avr_lr_test, avr_lr_iso, avr_lda_train, avr_lda_test, avr_lda_iso, avr_nb_train, avr_nb_test, avr_nb_iso]
	#return [avr_lr_train, avr_lr_test, avr_lda_train, avr_lda_test, avr_nb_train, avr_nb_test]
	return [avr_lr_train, avr_lr_test, avr_nb_train, avr_nb_test]
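
All of the k-fold examples call a splitdata helper that is not shown. A plausible sketch, assuming it deals examples round-robin into k groups and returns each group as an [x, y] (or [x, y, subjects]) list; the real implementation may shuffle or stratify first:

def splitdata(k, x, y, subjects=None):
	# Hypothetical round-robin split: group i receives every k-th example.
	groups = []
	for i in xrange(k):
		if subjects is None:
			groups.append([x[i::k], y[i::k]])
		else:
			groups.append([x[i::k], y[i::k], subjects[i::k]])
	return groups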
Example #3
def kfolds_all_algos(k, x, y, isotest_x, isotest_y):
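	# Same k-fold driver without the subject features: trains LDA, naive
	# Bayes, and logistic regression on each fold and returns the averaged
	# train, test, and isolation-set confusion matrices for each algorithm.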
	k_groups = splitdata(k, x, y)
	#now we have the k groups, assign each one as test once and run tests!
	print "groups split"
	lda_train_results = []
	lda_test_results = []
	lda_iso_results = []
	nb_train_results = []
	nb_test_results = []
	nb_iso_results = []
	lr_train_results = []
	lr_test_results = []
	lr_iso_results = []
	
	for i in xrange(k):
		print "K Fold number " + str(i)
		test = k_groups[i]
		train = []
		train.append([]) #x
		train.append([]) #y
		for j in xrange(k):
			if(j != i):
				train[0].extend(k_groups[j][0])
				train[1].extend(k_groups[j][1])
		
		#Now we have test and training data... what shall we do?
		#train on LDA
		print "Training LDA..."
		(prob, mean, cov) = lda.trainLDA(copy.deepcopy(train[0]), copy.deepcopy(train[1]))
		#print str(prob) + "\t" + str(mean) + "\t" + str(cov)
		print "DONE training LDA."
		print "Training NB..."
		(py, theta) = naivebayes.trainNaiveBayesMN(copy.deepcopy(train[0]), copy.deepcopy(train[1]))
		#print str(py) + "\t" + str(theta)
		print "DONE training NB"
		print "Training Logistic Regression..."
		# Prepend a bias term of 1 to every training row.
		t_x = copy.deepcopy(train[0])
		for n in xrange(len(t_x)):
			t_x[n] = [1] + t_x[n]
		(wvector, scales) = logisticregression.trainLogisticReg(0.01, 0.00001, 100, t_x, train[1])
		#print str(wvector)
		print "DONE training Logistic Regression.\n"
		
		#lr_model = linmod.LogisticRegression()
		#lr_model.fit(t_x, train[1])
		#for model, name in ((lr_model, "LR"),):
		#	tp, tn, fp, fn = 0, 0, 0, 0
		#	for i in xrange(0, len(t_x)):
		#		val = model.predict(t_x[i])
		#		if (val == 1 and train[1][i] == 1):
		#			tp += 1
		#		elif (val == 1 and train[1][i] == 0):
		#			fp += 1
		#		elif (val == 0 and train[1][i] == 0):
		#			tn += 1
		#		elif (val == 0 and train[1][i] == 1):
		#			fn += 1
		#	print "%s - TP: %d, FP: %d, TN: %d, FN: %d" % (name, tp, fp, tn, fn)
			
		#get Prediction Errors on left out set
		lr_test_error = logisticregression.getConfusionMatrix(wvector,scales, copy.deepcopy(test[0]), copy.deepcopy(test[1]))
		lr_train_error = logisticregression.getConfusionMatrix(wvector,scales, copy.deepcopy(train[0]), copy.deepcopy(train[1]))
		lr_iso_error = logisticregression.getConfusionMatrix(wvector,scales, copy.deepcopy(isotest_x), copy.deepcopy(isotest_y))
		lda_test_error = lda.getConfusionMatrix(prob, mean, cov, copy.deepcopy(test[0]), copy.deepcopy(test[1]))
		lda_train_error = lda.getConfusionMatrix(prob, mean, cov, copy.deepcopy(train[0]), copy.deepcopy(train[1]))
		lda_iso_error = lda.getConfusionMatrix(prob, mean, cov, copy.deepcopy(isotest_x), copy.deepcopy(isotest_y))
		nb_test_error = naivebayes.getConfusionMatrixMN(py, theta, copy.deepcopy(test[0]), copy.deepcopy(test[1]))
		nb_train_error = naivebayes.getConfusionMatrixMN(py, theta, copy.deepcopy(train[0]), copy.deepcopy(train[1]))
		nb_iso_error = naivebayes.getConfusionMatrixMN(py, theta, copy.deepcopy(isotest_x), copy.deepcopy(isotest_y))
		
		# Collect the confusion matrices for this fold.
		lr_train_results.append(lr_train_error)
		lr_test_results.append(lr_test_error)
		lr_iso_results.append(lr_iso_error)
		lda_train_results.append(lda_train_error)
		lda_test_results.append(lda_test_error)
		lda_iso_results.append(lda_iso_error)
		nb_train_results.append(nb_train_error)
		nb_test_results.append(nb_test_error)
		nb_iso_results.append(nb_iso_error)
		
	#calc average training and test error for each algorithm
	avr_lda_train = averageconfusionmatrix(lda_train_results)
	avr_lda_test = averageconfusionmatrix(lda_test_results)
	avr_lda_iso = averageconfusionmatrix(lda_iso_results)
	avr_lr_train = averageconfusionmatrix(lr_train_results)
	avr_lr_test = averageconfusionmatrix(lr_test_results)
	avr_lr_iso = averageconfusionmatrix(lr_iso_results)
	avr_nb_train = averageconfusionmatrix(nb_train_results)
	avr_nb_test = averageconfusionmatrix(nb_test_results)
	avr_nb_iso = averageconfusionmatrix(nb_iso_results)
	return [avr_lr_train, avr_lr_test, avr_lr_iso, avr_lda_train, avr_lda_test, avr_lda_iso, avr_nb_train, avr_nb_test, avr_nb_iso]
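
averageconfusionmatrix is also defined elsewhere. A minimal sketch, assuming each per-fold result is a flat sequence of counts (e.g. TP, FP, TN, FN) and the average is element-wise:

def averageconfusionmatrix(matrices):
	# Hypothetical element-wise mean over the per-fold confusion matrices.
	n = float(len(matrices))
	totals = [0.0] * len(matrices[0])
	for m in matrices:
		for j in xrange(len(m)):
			totals[j] += m[j]
	return [t / n for t in totals]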
Example #4
def kfolds_all_algos(k, x, y, isotest_x, isotest_y):
    k_groups = splitdata(k, x, y)
    #now we have the k groups, assign each one as test once and run tests!
    print "groups split"
    lda_train_results = []
    lda_test_results = []
    lda_iso_results = []
    nb_train_results = []
    nb_test_results = []
    nb_iso_results = []
    lr_train_results = []
    lr_test_results = []
    lr_iso_results = []

    for i in xrange(k):
        print "K Fold number " + str(i)
        test = k_groups[i]
        train = []
        train.append([])  #x
        train.append([])  #y
        for j in xrange(k):
            if (j != i):
                train[0].extend(k_groups[j][0])
                train[1].extend(k_groups[j][1])

        #Now we have test and training data... what shall we do?
        #train on LDA
        print "Training LDA..."
        (prob, mean, cov) = lda.trainLDA(copy.deepcopy(train[0]),
                                         copy.deepcopy(train[1]))
        #print str(prob) + "\t" + str(mean) + "\t" + str(cov)
        print "DONE training LDA."
        print "Training NB..."
        (py, theta) = naivebayes.trainNaiveBayesMN(copy.deepcopy(train[0]),
                                                   copy.deepcopy(train[1]))
        #print str(py) + "\t" + str(theta)
        print "DONE training NB"
        print "Training Logistic Regression..."
        # Prepend a bias term of 1 to every training row.
        t_x = copy.deepcopy(train[0])
        for n in xrange(len(t_x)):
            t_x[n] = [1] + t_x[n]
        (wvector,
         scales) = logisticregression.trainLogisticReg(0.01, 0.00001, 100, t_x,
                                                       train[1])
        #print str(wvector)
        print "DONE training Logistic Regression.\n"

        #lr_model = linmod.LogisticRegression()
        #lr_model.fit(t_x, train[1])
        #for model, name in ((lr_model, "LR"),):
        #	tp, tn, fp, fn = 0, 0, 0, 0
        #	for i in xrange(0, len(t_x)):
        #		val = model.predict(t_x[i])
        #		if (val == 1 and train[1][i] == 1):
        #			tp += 1
        #		elif (val == 1 and train[1][i] == 0):
        #			fp += 1
        #		elif (val == 0 and train[1][i] == 0):
        #			tn += 1
        #		elif (val == 0 and train[1][i] == 1):
        #			fn += 1
        #	print "%s - TP: %d, FP: %d, TN: %d, FN: %d" % (name, tp, fp, tn, fn)

        #get Prediction Errors on left out set
        lr_test_error = logisticregression.getConfusionMatrix(
            wvector, scales, copy.deepcopy(test[0]), copy.deepcopy(test[1]))
        lr_train_error = logisticregression.getConfusionMatrix(
            wvector, scales, copy.deepcopy(train[0]), copy.deepcopy(train[1]))
        lr_iso_error = logisticregression.getConfusionMatrix(
            wvector, scales, copy.deepcopy(isotest_x),
            copy.deepcopy(isotest_y))
        lda_test_error = lda.getConfusionMatrix(prob, mean, cov,
                                                copy.deepcopy(test[0]),
                                                copy.deepcopy(test[1]))
        lda_train_error = lda.getConfusionMatrix(prob, mean, cov,
                                                 copy.deepcopy(train[0]),
                                                 copy.deepcopy(train[1]))
        lda_iso_error = lda.getConfusionMatrix(prob, mean, cov,
                                               copy.deepcopy(isotest_x),
                                               copy.deepcopy(isotest_y))
        nb_test_error = naivebayes.getConfusionMatrixMN(
            py, theta, copy.deepcopy(test[0]), copy.deepcopy(test[1]))
        nb_train_error = naivebayes.getConfusionMatrixMN(
            py, theta, copy.deepcopy(train[0]), copy.deepcopy(train[1]))
        nb_iso_error = naivebayes.getConfusionMatrixMN(
            py, theta, copy.deepcopy(isotest_x), copy.deepcopy(isotest_y))

        # Collect the confusion matrices for this fold.
        lr_train_results.append(lr_train_error)
        lr_test_results.append(lr_test_error)
        lr_iso_results.append(lr_iso_error)
        lda_train_results.append(lda_train_error)
        lda_test_results.append(lda_test_error)
        lda_iso_results.append(lda_iso_error)
        nb_train_results.append(nb_train_error)
        nb_test_results.append(nb_test_error)
        nb_iso_results.append(nb_iso_error)

    #calc average training and test error for each algorithm
    avr_lda_train = averageconfusionmatrix(lda_train_results)
    avr_lda_test = averageconfusionmatrix(lda_test_results)
    avr_lda_iso = averageconfusionmatrix(lda_iso_results)
    avr_lr_train = averageconfusionmatrix(lr_train_results)
    avr_lr_test = averageconfusionmatrix(lr_test_results)
    avr_lr_iso = averageconfusionmatrix(lr_iso_results)
    avr_nb_train = averageconfusionmatrix(nb_train_results)
    avr_nb_test = averageconfusionmatrix(nb_test_results)
    avr_nb_iso = averageconfusionmatrix(nb_iso_results)
    return [
        avr_lr_train, avr_lr_test, avr_lr_iso, avr_lda_train, avr_lda_test,
        avr_lda_iso, avr_nb_train, avr_nb_test, avr_nb_iso
    ]
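
A hypothetical driver for the nine-entry variant above; the file names and the 10-fold choice are illustrative, not from the source:

(x, y) = csv_parser.parse_data("train.csv")
(iso_x, iso_y) = csv_parser.parse_data("isotest.csv")
results = kfolds_all_algos(10, x, y, iso_x, iso_y)
print "LR  train/test/iso: " + str(results[0:3])
print "LDA train/test/iso: " + str(results[3:6])
print "NB  train/test/iso: " + str(results[6:9])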