def baseline(rank):
    """Majority-class baseline for per-user star-rating prediction.

    For each user file under ``trainPerRank/``, count the star values seen
    in that user's train + validation sets for the given *rank* partition,
    then predict the single most frequent star for every test instance.

    Parameters
    ----------
    rank : str
        Name of the rank partition sub-directory (e.g. ``'1-2'``, ``'3'``,
        ``'4-5'``) under ``trainPerRank/<rank>/stars/`` and the matching
        validation/test directories.

    Side effects
    ------------
    Prints the mean per-user accuracy, the mean of ``meanError`` over
    users, and the summed ``setError`` (historical runs noted
    ``-0.15346161033753716`` and ``736``).
    """
    __author__ = 'vittorioselo'
    import os
    from os.path import join, isfile
    import pandas
    import numpy
    from collections import defaultdict
    from sklearn import metrics
    from errorAnalysis import meanError, setError

    def _first_column(path):
        # Each stars CSV holds the rating in its first column; flatten it.
        return [row[0] for row in numpy.array(pandas.read_csv(path, header=None))]

    my_path = 'trainPerRank/'
    # Plain files directly under trainPerRank/ are the per-user files.
    list_users = [f for f in os.listdir(str(my_path)) if isfile(join(my_path, f))]
    # list_users.remove('.DS_Store')

    dict_result = defaultdict(float)
    average_error = 0.0
    error_set = 0

    for user in list_users:
        train_rank = _first_column('trainPerRank/' + rank + '/stars/' + user)
        validation_rank = _first_column('validationPerRank/' + rank + '/stars/' + user)
        test_rank = _first_column('testPerRank/' + rank + '/stars/' + user)

        # Frequency of each star over train + validation.
        dict_count = defaultdict(int)
        for star in train_rank:
            dict_count[star] += 1
        for star in validation_rank:
            dict_count[star] += 1

        # Majority class. The original recomputed max(values()) inside a loop
        # over the keys (O(n^2)); max(key=...) keeps the same first-winner
        # tie-breaking in one pass. 0.0 matches the original float() default
        # for the (degenerate) empty-counts case.
        majority = max(dict_count, key=dict_count.get) if dict_count else 0.0

        # Constant prediction: the majority star for every test instance.
        prediction = [majority] * len(test_rank)

        dict_result[user] = metrics.accuracy_score(test_rank, prediction)
        average_error += meanError(prediction, test_rank)
        error_set += setError(prediction, test_rank)

    accuracy = sum(dict_result.values()) / len(list_users)
    print(accuracy)
    print('=============ERROR=========')
    print(average_error / len(list_users))  # => -0.15346161033753716
    print(error_set)  # => 736
def runMethod2():
    """Per-user stacking ensemble over three per-rank base predictions.

    For every user file under ``trainPerRank/``:

    1. Read the base predictions produced for the ``1-2``, ``3`` and ``4-5``
       rank partitions and stack them column-wise into an ``(n, 3)`` feature
       matrix (train and test).
    2. Fit AdaBoost over depth-1 decision stumps on the train matrix against
       the true star ratings (second column of the ``1-2`` stars CSV) and
       predict the test set. AdaBoost was the best of the classifiers tried
       previously (GaussianNB, decision tree, linear SVC, logistic
       regression, random forest).
    3. Accumulate per-user accuracy, ``meanError`` and a 5x5 confusion
       matrix over labels 1..5.

    Afterwards the flattened train/test feature matrices and one-hot-encoded
    targets are written to ``trainForNeural/`` and ``testForNeural/`` as CSV
    input for a neural network.

    Returns
    -------
    list
        One prediction array per user, in directory-listing order.
    """
    __author__ = 'Umberto'
    import os
    from os.path import join, isfile
    from collections import defaultdict

    import numpy
    import pandas as pn
    from sklearn import metrics
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.metrics import confusion_matrix

    from errorAnalysis import meanError

    def _flat(path):
        # One-column prediction CSV -> flat Python list.
        return [v for row in numpy.array(pn.read_csv(path, header=None)) for v in row]

    def _stars(path):
        # Second column of a stars CSV is the target rating.
        return [row[1] for row in numpy.array(pn.read_csv(path, header=None))]

    def _stack(path_12, path_3, path_45):
        # Column-stack the three per-rank prediction vectors into (n, 3).
        frame = pn.DataFrame(pn.Series(_flat(path_12)))
        frame = pn.concat([frame, pn.Series(_flat(path_3))], axis=1)
        frame = pn.concat([frame, pn.Series(_flat(path_45))], axis=1)
        return numpy.array(frame)

    # Star -> reversed one-hot row (1 maps to the LAST position, 5 to the
    # first), exactly as the original if/elif chain encoded it.
    one_hot = {1: [0, 0, 0, 0, 1],
               2: [0, 0, 0, 1, 0],
               3: [0, 0, 1, 0, 0],
               4: [0, 1, 0, 0, 0],
               5: [1, 0, 0, 0, 0]}

    def _one_hot_frame(targets):
        # Values outside 1..5 are silently skipped, matching the original
        # chain which appended no row for them. Built as a list of rows
        # instead of repeated DataFrame.append (removed in pandas 2.0).
        return pn.DataFrame([one_hot[v] for v in targets if v in one_hot])

    my_path = 'trainPerRank/'
    list_users = [f for f in os.listdir(str(my_path)) if isfile(join(my_path, f))]
    # list_users.remove('.DS_Store')

    average_error = 0
    dict_results = defaultdict(float)
    conf_matrix = numpy.zeros((5, 5), dtype=int)
    list_of_predictions = []
    list_of_train_data = []
    list_of_train_target = []
    list_of_test_data = []
    list_of_test_target = []

    for user in list_users:
        # ======== train features / targets ========
        data_train = _stack('trainPerRank/1-2/prediction/' + user,
                            'trainPerRank/3/prediction/' + user,
                            'trainPerRank/4-5/prediction/' + user)
        train_rank = _stars('trainPerRank/1-2/stars/' + user)
        # Saved for the neural-network export below.
        list_of_train_data.append(data_train)
        list_of_train_target.append(train_rank)

        # ======== test features / targets ========
        data_test = _stack('testPerRank/1-2/prediction/' + user,
                           'testPerRank/3/prediction/' + user,
                           'testPerRank/4-5/prediction/' + user)
        test_rank = _stars('testPerRank/1-2/stars/' + user)
        list_of_test_data.append(data_test)
        list_of_test_target.append(test_rank)

        # NOTE(review): `base_estimator` kept for the old scikit-learn this
        # project targets; newer releases renamed it to `estimator`.
        stump = DecisionTreeClassifier(max_depth=1)
        clf = AdaBoostClassifier(base_estimator=stump,
                                 algorithm="SAMME.R",
                                 n_estimators=3)
        clf.fit(data_train, train_rank)
        prediction = clf.predict(data_test)

        dict_results[user] = metrics.accuracy_score(test_rank, prediction)
        average_error += meanError(prediction, test_rank)
        conf_matrix += confusion_matrix(test_rank, prediction, labels=[1, 2, 3, 4, 5])
        list_of_predictions.append(prediction)

    accuracy = sum(dict_results.values()) / len(list_users)
    print('=============Accuracy=========')
    print(accuracy)
    # with balanced train 0.5063370 / with unbalanced train 0.515719385728
    print('=============ERROR ANALYSIS=========')
    print(average_error / len(list_users))  # => 0.00794524594896
    print('confusion matrix:')
    print(conf_matrix)

    # Flatten the per-user matrices/targets into one global dataset.
    flat_train = [row for per_user in list_of_train_data for row in per_user]
    flat_test = [row for per_user in list_of_test_data for row in per_user]
    flat_train_target = [v for per_user in list_of_train_target for v in per_user]
    flat_test_target = [v for per_user in list_of_test_target for v in per_user]

    # ======== export for the neural network ========
    os.makedirs('trainForNeural/', exist_ok=True)
    pn.DataFrame(flat_train).to_csv('trainForNeural/data.csv',
                                    header=False, index_label=False, index=False)
    _one_hot_frame(flat_train_target).to_csv('trainForNeural/target.csv',
                                             header=False, index_label=False, index=False)

    os.makedirs('testForNeural/', exist_ok=True)
    pn.DataFrame(flat_test).to_csv('testForNeural/data.csv',
                                   header=False, index_label=False, index=False)
    _one_hot_frame(flat_test_target).to_csv('testForNeural/target.csv',
                                            header=False, index_label=False, index=False)

    return list_of_predictions