def crossValidate(self, posFile, negFile, numFolds):
    """Run n-fold cross validation with libsvm and report per-fold results.

    The positive ('real') and negative ('pseudo') instances are loaded from
    ``data/<posFile>`` and ``data/<negFile>``, scaled, and split into
    ``numFolds`` subsets. For each fold ``i`` the method:

    * trains on the remaining folds (after ``weka_smote()``) with
      ``svm-train`` and predicts the held-out fold with ``svm-predict``,
    * writes ``data/<i>.sresults`` (true label, predicted label and
      confidence, sorted by descending confidence),
    * writes ``roc_<i>.tsv`` / ``pr_<i>.tsv`` and plots ``roc_<i>.png`` /
      ``pr_<i>.png``.

    Finally the per-fold and average sensitivity/specificity and their
    geometric mean are printed.

    Reads ``self.hpSens``, ``self.hpSpec`` and ``self.ci``.

    Args:
        posFile: File name (inside ``data/``) of the positive instances.
        negFile: File name (inside ``data/``) of the negative instances.
        numFolds: Number of cross-validation folds.

    Raises:
        ZeroDivisionError: If a test fold contains no positive or no
            negative instances, or if ``numFolds`` is 0.
    """
    # Load data from positive and negative input files
    allData = FeatureSet()
    allData.load('data/'+posFile, patternClass='real')
    allData.add_instances('data/'+negFile, patternClass='pseudo')
    allData.libsvm_scale(paramOut = 'data/params')
    subsets = allData.get_cv_subsets(numFolds)
    resultList = []

    # Go through all n folds...
    for i in range(numFolds):
        fold = str(i)
        modelPath = 'models/'+fold+'.model'

        # Build training and test sets
        testSet = subsets[i]
        trainSet = FeatureSet()
        for j in range(numFolds):
            if j != i:
                trainSet.add_instances_from_featureset(subsets[j])

        # Create svm files for train and test fold data. Train and test on these files.
        trainSet.weka_smote()
        trainSet.export_svm('data/trainSet.libsvm')
        testSet.export_svm('data/testSet.libsvm')
        # SVM settings for HMP features
        call('svm-train -c 1 -d 1 -h 1 -e 0.001 -g 0.06 -b 1 data/trainSet.libsvm '+modelPath, shell=True)
        # SVM settings for MicroPred features
        # call('svm-train -c 10000000 -d 1 -h 1 -e 0.001 -g 0.0019531 -b 1 data/trainSet.libsvm models/'+str(i)+'.model', shell=True)
        call('svm-predict -b 1 data/testSet.libsvm '+modelPath+' data/'+fold+'.results', shell=True)

        # The class sizes are loop invariants; fetch them once per fold.
        # NOTE(review): assumes get_numpos()/get_numneg() are side-effect free.
        numPos = testSet.get_numpos()
        numNeg = testSet.get_numneg()

        # Calculate sensitivity and specificity for fold model.
        # The results file is read as: one leading line that is skipped,
        # then numPos lines for positive instances followed by numNeg lines
        # for negative instances (the order the test set was exported in --
        # presumably positives first; verify against export_svm).
        counts = {('1', '1'): 0.0, ('1', '0'): 0.0,
                  ('0', '1'): 0.0, ('0', '0'): 0.0}
        resultSet = []
        with open('data/'+fold+'.results', 'r') as resultFile:
            resultFile.readline()
            for trueLabel, count in (('1', numPos), ('0', numNeg)):
                for _ in range(count):
                    line = resultFile.readline()
                    predicted = '1' if line[0] == '1' else '0'
                    # The second field is kept as the confidence score.
                    resultSet.append(Result(t=trueLabel, p=predicted, conf=line.split()[1]))
                    counts[(trueLabel, predicted)] += 1.0
        truePos = counts[('1', '1')]
        falseNeg = counts[('1', '0')]
        falsePos = counts[('0', '1')]
        trueNeg = counts[('0', '0')]

        # Most confident predictions first; the sort is stable.
        resultSet.sort(key=lambda r: float(r.conf), reverse=True)
        with open("data/"+fold+".sresults", 'w') as resultOut:
            for r in resultSet:
                resultOut.write(r.t + '\t' + r.p + '\t' + r.conf + '\n')
        resultList.append( (truePos/(truePos+falseNeg), trueNeg/(trueNeg+falsePos)) )

        # Sweep the ranked results to trace the ROC and precision-recall
        # curves: each positive raises sensitivity, each negative lowers
        # specificity.
        ssList = []
        prList = []
        sens = 0.0
        spec = 1.0
        with open("roc_"+fold+".tsv", 'w') as rocOut, \
                open("pr_"+fold+".tsv", 'w') as prOut:
            for r in resultSet:
                if r.t == '1':
                    sens += 1.0 / numPos
                if r.t == '0':
                    spec -= 1.0 / numNeg
                fpr = 1 - spec
                ssList.append((sens*self.hpSens, fpr*self.hpSpec))
                scaledDenom = sens*self.hpSens+fpr*self.ci*self.hpSpec
                if scaledDenom != 0:
                    prList.append((sens*self.hpSens/scaledDenom, sens*self.hpSens))
                rocOut.write(str(sens)+'\t'+str(fpr)+'\n')
                # Precision must be built from the false-positive rate
                # (1 - spec), matching prList above; rows with an undefined
                # precision are skipped instead of dividing by zero.
                rawDenom = sens+fpr*self.ci
                if rawDenom != 0:
                    prOut.write(str(sens/rawDenom)+'\t'+str(sens)+'\n')

        p = Plotter()
        p.plot_roc(ssList, "Test", "roc_"+fold+".png")
        p.plot_pr(prList, "Test", self.ci, "pr_"+fold+".png")

    ###################
    # Report Results
    ###################
    # Single-argument print(...) behaves the same as the print statement.
    for i, (foldSens, foldSpec) in enumerate(resultList):
        print("## SVM "+str(i)+" ##")
        print('Sensitivity: '+str(foldSens))
        print('Specificity: '+str(foldSpec))
    sensTotal = sum([result[0] for result in resultList])
    specTotal = sum([result[1] for result in resultList])
    print('average Sensitivity: '+str(sensTotal/numFolds))
    print('average Specificity: '+str(specTotal/numFolds))
    print('Geometric mean: '+str(pow(sensTotal/numFolds*specTotal/numFolds, 0.5)))
sens += result[0] spec += result[1] return (sens/len(resultList), spec/len(resultList), math.sqrt(spec/len(resultList)*sens/len(resultList))) ################################################# # Run cross validation, build result files ################################################# # Load data from positive and negative input files allData = FeatureSet() allData.load('data/'+posFile, patternClass='real') allData.add_instances('data/'+negFile, patternClass='pseudo') allData.libsvm_scale(paramOut = 'data/params') subsets = allData.get_cv_subsets(numFolds) resultList = [] # Go through all n folds... for i in range(numFolds): # Build training and test sets testSet = subsets[i] trainSet = FeatureSet() for j in range(numFolds): if j != i: trainSet.add_instances_from_featureset(subsets[j]) # Create svm files for train and test fold data. Train and test on these files. trainSet.weka_smote() trainSet.export_svm('data/trainSet.libsvm') testSet.export_svm('data/testSet.libsvm') # SVM settings for HMP features call('svm-train -c 1 -d 1 -h 1 -e 0.001 -g 0.06 -b 1 data/trainSet.libsvm models/'+str(i)+'.model', shell=True)