import math

import numpy as np

from preprocess import splitColumns
from NBCClassifier import featureVector
from cvFunctions import getTrainData
#from lr import lr
from svm import svm
#from nbc import nbc
#from plot import plot

# Defaults
# Raw string: the backslash is a literal path separator, not an escape
# sequence ('\y' is not a recognised escape, so the value is unchanged).
trainData = r'Data\yelp_data.csv'
# Passed to getLines as its second argument.
percentage = 100

# Get data as array
# NOTE(review): getLines is not imported in the visible part of this file;
# confirm where it is expected to come from.
train = getLines(trainData, percentage)

# Notes on the word-list processing (the matching code lives in changeDepth):
#   words is a list of (word, frequency) tuples, ordered by descending
#   order of frequency; wordList is a list of all unique words in the
#   training data.
#   After the 100 most frequent words have been removed from words,
#   wordList is the list of the remaining words in descending order of
#   frequency.

cv = []
#cv = [[[0 for z in range(4001)] for y in range(200)] for x in range(10)]
#cv = np.array(cv);
def changeDepth(self, depths):
    """Cross-validate the tree-based classifiers for each value in depths.

    The training data is loaded from ``self.trainData``, shuffled and cut
    into 10 folds of 200 rows. For every entry of ``depths`` a 10-fold
    cross validation is run with ``decisionTree``, ``randomForest`` and
    ``bagging``; the per-fold results (reported as zero-one loss) are
    summarised as mean, standard deviation and standard error. The
    summaries are printed and appended to the file named by ``self.file``.

    Args:
        depths: sequence of values; each one is passed as the third
            argument to ``decisionTree``, ``randomForest`` and ``bagging``.

    Returns:
        None. Results are printed and appended to ``self.file``.
    """
    num_folds = 10
    fold_size = 200
    num_frequent_dropped = 100  # most frequent words removed from the vocabulary
    vocab_size = 1000           # number of words used to build feature vectors

    def _summarize(losses):
        # Return (mean, standard deviation, standard error) of the fold losses.
        deviation = np.std(losses)
        return np.mean(losses), deviation, deviation / math.sqrt(num_folds)

    # Get data as array, shuffle it and cut it into equally sized folds.
    data = np.array(getLines(self.trainData, 100))
    np.random.shuffle(data)
    folds = [data[i * fold_size:(i + 1) * fold_size] for i in range(num_folds)]

    # Classifier keys, in the order in which results are reported.
    labels = (("dt", "\n 1. Decision Tree"),
              ("bag", "\n 2. Bagging"),
              ("rf", "\n 3. Random forest"))
    avg = {"dt": [], "bag": [], "rf": []}
    stddev = {"dt": [], "bag": [], "rf": []}
    stderr = {"dt": [], "bag": [], "rf": []}

    # NOTE: SVM evaluation is disabled, so no SVM statistics are collected.
    # To re-enable it, record svm(train_fv, test_fv) per fold and add an
    # "svm" entry to the tables above.
    for depth in depths:
        fold_losses = {"dt": [], "bag": [], "rf": []}
        for i in range(num_folds):
            test_rows = folds[i]
            # Every row of every fold except the held-out one. Iterating the
            # fold itself (instead of a fixed 200 indices) also copes with
            # folds that are shorter than fold_size.
            train_rows = [row
                          for j in range(num_folds) if j != i
                          for row in folds[j]]

            # NOTE(review): 0.25 is presumably a sampling fraction - confirm
            # against getTrainData.
            train_dataset = getTrainData(train_rows, 0.25)
            _, x_train, y_train = splitColumns(train_dataset)
            _, x_test, y_test = splitColumns(test_rows)

            # Pre-processing data
            x_train = preprocess(x_train)
            x_test = preprocess(x_test)

            # Creating dictionary from x_train and removing the most
            # frequent words; a slice avoids repeated O(n) pop(0) calls and
            # does not fail when fewer words are available.
            words, _ = getWordList(x_train)
            vocabulary = [word for word, _ in words[num_frequent_dropped:]]
            vocabulary = vocabulary[:vocab_size]

            # Forming feature vectors for the training and the test rows.
            train_fv, _, _ = featureVector(vocabulary, x_train, y_train)
            test_fv, _, _ = featureVector(vocabulary, x_test, y_test)

            fold_losses["dt"].append(decisionTree(train_fv, test_fv, depth))
            fold_losses["rf"].append(randomForest(train_fv, test_fv, depth))
            fold_losses["bag"].append(bagging(train_fv, test_fv, depth))

        for key, _ in labels:
            mean, deviation, error = _summarize(fold_losses[key])
            avg[key].append(mean)
            stddev[key].append(deviation)
            stderr[key].append(error)

    report = (("\n AVERAGE ZERO ONE LOSS", avg),
              ("\n STANDARD DEVIATION ZERO ONE LOSS", stddev),
              ("\n STANDARD ERROR ZERO ONE LOSS", stderr))

    for _, stats in report:
        for key, _ in labels:
            print(stats[key])

    # The context manager closes the file even if a write fails.
    with open(self.file, "a+") as f:
        for title, stats in report:
            f.write(title)
            for key, label in labels:
                f.write(label)
                f.write(str(stats[key]))