Esempio n. 1
0
	def crossValidate(self, posFile, negFile, numFolds):
		"""Run n-fold cross validation with the libsvm command-line tools and print a report.

		The positive file is loaded with patternClass 'real' and the negative file
		with patternClass 'pseudo' (both paths are relative to 'data/'). For every
		fold i the held-out subset is used for testing and the remaining subsets,
		after weka_smote(), for training.

		Files written per fold i:
		  data/trainSet.libsvm, data/testSet.libsvm  (overwritten each fold)
		  models/<i>.model      target of the svm-train command
		  data/<i>.results      target of the svm-predict command
		  data/<i>.sresults     "true<TAB>predicted<TAB>conf", sorted by conf descending
		  roc_<i>.tsv           "sens<TAB>1-spec" per ranked result
		  pr_<i>.tsv            "precision<TAB>sens" per ranked result
		  roc_<i>.png, pr_<i>.png  file names handed to Plotter

		Reads self.hpSens, self.hpSpec and self.ci when building the curve points.

		Args:
			posFile: file name (under 'data/') of the positive examples.
			negFile: file name (under 'data/') of the negative examples.
			numFolds: number of cross-validation folds.

		Returns:
			None. Per-fold and average sensitivity/specificity and the geometric
			mean of the two averages are printed.

		Raises:
			ZeroDivisionError: if a test fold contains no positives or no
				negatives, or if numFolds is 0.
		"""
		allData = FeatureSet()
		allData.load('data/'+posFile, patternClass='real')
		allData.add_instances('data/'+negFile, patternClass='pseudo')
		allData.libsvm_scale(paramOut = 'data/params')
		subsets = allData.get_cv_subsets(numFolds)
		resultList = []
		# Go through all n folds...
		for i in range(numFolds):
			fold = str(i)
			modelPath = 'models/'+fold+'.model'
			resultPath = 'data/'+fold+'.results'

			# Build training and test sets: subset i is held out, all others are merged.
			testSet = subsets[i]
			trainSet = FeatureSet()
			for j in range(numFolds):
				if j != i:
					trainSet.add_instances_from_featureset(subsets[j])

			# Create svm files for train and test fold data. Train and test on these files.
			trainSet.weka_smote()
			trainSet.export_svm('data/trainSet.libsvm')
			testSet.export_svm('data/testSet.libsvm')
			# NOTE(review): the results of both call() invocations are ignored, so a
			# failed training/prediction run is not detected here - confirm this is intended.
			# SVM settings for HMP features
			call('svm-train -c 1 -d 1 -h 1 -e 0.001 -g 0.06 -b 1 data/trainSet.libsvm '+modelPath, shell=True)
			# SVM settings for MicroPred features
			# call('svm-train -c 10000000 -d 1 -h 1 -e 0.001 -g 0.0019531 -b 1 data/trainSet.libsvm models/'+str(i)+'.model', shell=True)
			call('svm-predict -b 1 data/testSet.libsvm '+modelPath+' '+resultPath, shell=True)

			# Calculate sensitivity and specificity for fold model
			numPos = testSet.get_numpos()
			numNeg = testSet.get_numneg()
			# counts[trueLabel][predictedLabel]
			counts = {'1': {'1': 0.0, '0': 0.0}, '0': {'1': 0.0, '0': 0.0}}
			resultSet = []
			with open(resultPath, 'r') as resultFile:
				# The first line is discarded (presumably the "labels ..." header
				# that svm-predict emits with -b 1 - TODO confirm).
				resultFile.readline()
				# The first numPos prediction lines are treated as true positives,
				# the following numNeg lines as true negatives.
				for trueLabel, count in (('1', numPos), ('0', numNeg)):
					for _ in range(count):
						line = resultFile.readline()
						predicted = '1' if line[0] == '1' else '0'
						resultSet.append(Result(t=trueLabel, p=predicted, conf=line.split()[1]))
						counts[trueLabel][predicted] += 1.0

			# Rank predictions by confidence, highest first.
			resultSet.sort(key=lambda r: float(r.conf), reverse=True)
			with open("data/"+fold+".sresults", 'w') as resultOut:
				for r in resultSet:
					resultOut.write(r.t + '\t' + r.p + '\t' + r.conf + '\n')

			truePos = counts['1']['1']
			falseNeg = counts['1']['0']
			trueNeg = counts['0']['0']
			falsePos = counts['0']['1']
			resultList.append( (truePos/(truePos+falseNeg), trueNeg/(trueNeg+falsePos)) )

			# Walk down the ranked list, accumulating ROC and precision/recall points.
			ssList = []
			prList = []
			sens = 0.0
			spec = 1.0
			with open("roc_"+fold+".tsv", 'w') as rocOut:
				with open("pr_"+fold+".tsv", 'w') as prOut:
					for r in resultSet:
						if r.t == '1':
							sens += 1.0 / numPos
						elif r.t == '0':
							spec -= 1.0 / numNeg
						fpr = 1-spec
						scaledSens = sens*self.hpSens
						ssList.append((scaledSens, fpr*self.hpSpec))
						scaledDenom = scaledSens+fpr*self.ci*self.hpSpec
						if scaledDenom != 0:
							prList.append((scaledSens/scaledDenom, scaledSens))
						rocOut.write(str(sens)+'\t'+str(fpr)+'\n')
						# Precision is built from the false-positive rate (1 - spec),
						# the same quantity used for prList and the ROC file; points
						# with a zero denominator are skipped, as for prList.
						rawDenom = sens+fpr*self.ci
						if rawDenom != 0:
							prOut.write(str(sens/rawDenom)+'\t'+str(sens)+'\n')

			p = Plotter()
			p.plot_roc(ssList, "Test", "roc_"+fold+".png")
			p.plot_pr(prList, "Test", self.ci, "pr_"+fold+".png")

		###################
		# Report Results
		###################
		# Single-argument print(...) behaves the same as the print statement in
		# Python 2 and also parses under Python 3.
		for i, (foldSens, foldSpec) in enumerate(resultList):
			print("## SVM "+str(i)+" ##")
			print('Sensitivity: '+str(foldSens))
			print('Specificity: '+str(foldSpec))
		sensTotal = sum([result[0] for result in resultList])
		specTotal = sum([result[1] for result in resultList])
		print('average Sensitivity: '+str(sensTotal/numFolds))
		print('average Specificity: '+str(specTotal/numFolds))
		print('Geometric mean: '+str(pow(sensTotal/numFolds*specTotal/numFolds, 0.5)))
Esempio n. 2
0
		sens += result[0]
		spec += result[1]

	return (sens/len(resultList), spec/len(resultList), math.sqrt(spec/len(resultList)*sens/len(resultList)))


#################################################
# Run cross validation, build result files
#################################################
# NOTE(review): posFile, negFile and numFolds are not assigned anywhere in the
# lines shown here; presumably they are defined earlier in the script - confirm.

# Load data from positive and negative input files
# (positives get patternClass 'real', negatives get patternClass 'pseudo';
# both file names are resolved relative to the 'data/' directory)
allData = FeatureSet()
allData.load('data/'+posFile, patternClass='real')
allData.add_instances('data/'+negFile, patternClass='pseudo')
# 'data/params' is passed as paramOut (presumably where the scaling
# parameters are stored - TODO confirm against FeatureSet.libsvm_scale)
allData.libsvm_scale(paramOut = 'data/params')
# One subset per fold; subsets[i] is used as the test set of fold i below
subsets = allData.get_cv_subsets(numFolds)
# Not appended to within the lines shown here
resultList = []
# Go through all n folds...
for i in range(numFolds):
	# Build training and test sets:
	# fold i is held out for testing, every other fold is merged into trainSet
	testSet = subsets[i]
	trainSet = FeatureSet()
	for j in range(numFolds):
		if j != i:
			trainSet.add_instances_from_featureset(subsets[j])
	# Create svm files for train and test fold data. Train and test on these files.
	# weka_smote() is applied to the training data only; the same two export
	# paths are reused (overwritten) on every fold.
	trainSet.weka_smote()
	trainSet.export_svm('data/trainSet.libsvm')
	testSet.export_svm('data/testSet.libsvm')
	# SVM settings for HMP features
	# The trained model for fold i is written to models/<i>.model; the result
	# of call() is not checked here.
	call('svm-train -c 1 -d 1 -h 1 -e 0.001 -g 0.06 -b 1 data/trainSet.libsvm models/'+str(i)+'.model', shell=True)