from subprocess import call
from classes.FeatureSet import FeatureSet

# Build a LibSVM model from a positive and a negative feature file.
#
# Parameters:
#
# -p: File name for positive feature set (any file type)
# -n: File name for negative feature set (any file type)
# -o: Name of output LibSVM model
opts, extraparams = getopt.getopt(sys.argv[1:], 'o:p:n:')
posPath = negPath = outPath = None
for o, p in opts:
    if o == '-p':
        posPath = p
    elif o == '-n':
        negPath = p
    elif o == '-o':
        outPath = p

# All three options are required; fail with a usage message instead of a
# NameError further down when one of them was not supplied.
if posPath is None or negPath is None or outPath is None:
    sys.exit('usage: -p <positive features> -n <negative features> -o <model name>')

# Aggregate inputs, export to libsvm file
fs = FeatureSet()
fs.load('data/' + posPath, patternClass='real')
fs.add_instances('data/' + negPath, patternClass='pseudo')
fs.weka_smote()
fs.libsvm_scale(paramOut='models/' + outPath + '.scale')
fs.export('tmp.libsvm')

# Build model
# Micropred: -c 100 -d 1 -h 1 -e 0.001 -g 0.0019531
# HeteroMir: -c 1 -d 1 -h 1 -e 0.001 -g 0.06
#
# The command is passed as an argument list (no shell) so that a model name
# containing spaces or shell metacharacters is handed to svm-train verbatim.
call([
    'progs/libsvm-3.14/svm-train',
    '-c', '1', '-d', '1', '-h', '1', '-e', '0.001', '-g', '0.06', '-b', '1',
    'tmp.libsvm',
    'models/' + outPath + '.model',
])

# Clean up
call(['rm', 'tmp.libsvm'])
import sys
import getopt
from subprocess import call
from classes.FeatureSet import FeatureSet
from classes.ResultSet import ResultSet

# Scale an input feature file with a stored model's scaling parameters and
# run svm-predict on it.
#
# Parameters:
#
# -m: model name (<-m>.scale and <-m>.model should exist in models directory)
# -i: File containing input feature data
opts, extraparams = getopt.getopt(sys.argv[1:], 'm:i:')
modelName = inPath = None
for o, p in opts:
    if o == '-m':
        modelName = p
    elif o == '-i':
        inPath = p

# Both options are required; fail with a usage message instead of a
# NameError further down when one of them was not supplied.
if modelName is None or inPath is None:
    sys.exit('usage: -m <model name> -i <input feature file>')

# The numbered prints are progress markers emitted before each stage.
print('1')
fs = FeatureSet()
print('2')
fs.load('data/' + inPath, patternClass='real')
print('3')
fs.libsvm_scale(params='models/' + modelName + '.scale')
print('4')
fs.export('tmp.libsvm')

# Predictions are written to data/<input>.results. The command is passed as
# an argument list (no shell) so that names containing spaces or shell
# metacharacters reach svm-predict verbatim.
call([
    'progs/libsvm-3.14/svm-predict',
    '-b', '1',
    'tmp.libsvm',
    'models/' + modelName + '.model',
    'data/' + inPath + '.results',
])
# call('rm tmp.libsvm', shell=True)
sens += result[0] spec += result[1] return (sens / len(resultList), spec / len(resultList), math.sqrt(spec / len(resultList) * sens / len(resultList))) ################################################# # Run cross validation, build result files ################################################# # Load data from positive and negative input files allData = FeatureSet() allData.load_micropred('data/' + posFile, patternClass='real') allData.add_instances_from_micropred('data/' + negFile, patternClass='pseudo') allData.libsvm_scale(paramOut='data/params') subsets = allData.get_subsets(numFolds) resultList = [] # Go through all n folds... for i in range(numFolds): # Build training and test sets testSet = subsets[i] trainSet = FeatureSet() for j in range(numFolds): if j != i: trainSet.add_instances_from_featureset(subsets[j]) # Create svm files for train and test fold data. Train and test on these files. trainSet.export_svm('data/trainSet.libsvm') testSet.export_svm('data/testSet.libsvm') call( 'svm-train -c 100 -d 1 -h 1 -e 0.001 -g 0.0019531 -b 1 data/trainSet.libsvm models/'
def crossValidate(self, posFile, negFile, numFolds):
    """Run n-fold cross validation with LibSVM and write per-fold result files.

    The positive file is loaded as class 'real' and the negative file as
    class 'pseudo' (both relative to ``data/``); the combined set is scaled
    and split into ``numFolds`` subsets.  For every fold ``i`` the method
    trains on the other subsets (after ``weka_smote()``), predicts the held
    out subset, and writes:

    * ``data/<i>.results``  - raw svm-predict output
    * ``data/<i>.sresults`` - true label, predicted label, confidence,
      sorted by descending confidence
    * ``roc_<i>.tsv`` / ``pr_<i>.tsv`` and ``roc_<i>.png`` / ``pr_<i>.png``

    Per-fold and average sensitivity/specificity are printed at the end.

    Reads ``self.hpSens``, ``self.hpSpec`` and ``self.ci`` when building the
    ROC / precision-recall points.
    NOTE(review): ``ci`` looks like a negative-to-positive class imbalance
    ratio and ``hpSens``/``hpSpec`` like correction factors - confirm.

    Args:
        posFile: positive feature file name, relative to ``data/``.
        negFile: negative feature file name, relative to ``data/``.
        numFolds: number of cross-validation folds.
    """
    allData = FeatureSet()
    allData.load('data/' + posFile, patternClass='real')
    allData.add_instances('data/' + negFile, patternClass='pseudo')
    allData.libsvm_scale(paramOut='data/params')
    subsets = allData.get_cv_subsets(numFolds)
    resultList = []

    # Go through all n folds...
    for i in range(numFolds):
        fold = str(i)

        # Build training and test sets
        testSet = subsets[i]
        trainSet = FeatureSet()
        for j in range(numFolds):
            if j != i:
                trainSet.add_instances_from_featureset(subsets[j])

        # Create svm files for train and test fold data. Train and test on these files.
        trainSet.weka_smote()
        trainSet.export_svm('data/trainSet.libsvm')
        testSet.export_svm('data/testSet.libsvm')
        # SVM settings for HMP features
        call('svm-train -c 1 -d 1 -h 1 -e 0.001 -g 0.06 -b 1 data/trainSet.libsvm models/' + fold + '.model', shell=True)
        # SVM settings for MicroPred features
        # call('svm-train -c 10000000 -d 1 -h 1 -e 0.001 -g 0.0019531 -b 1 data/trainSet.libsvm models/'+str(i)+'.model', shell=True)
        call('svm-predict -b 1 data/testSet.libsvm models/' + fold + '.model data/' + fold + '.results', shell=True)

        numPos = testSet.get_numpos()
        numNeg = testSet.get_numneg()

        # Calculate sensitivity and specificity for fold model
        with open('data/' + fold + '.results', 'r') as resultFile:
            with open("data/" + fold + ".sresults", 'w') as resultOut:
                trueNeg = 0.0
                truePos = 0.0
                falseNeg = 0.0
                falsePos = 0.0
                resultSet = []

                # The first line of the prediction file is skipped
                # (presumably the label header written by "svm-predict -b 1").
                resultFile.readline()

                # The code assumes the test file lists all positives first,
                # followed by all negatives.
                for j in range(numPos):
                    line = resultFile.readline()
                    if line[0] == '1':
                        resultSet.append(Result(t='1', p='1', conf=line.split()[1]))
                        truePos += 1.0
                    else:
                        resultSet.append(Result(t='1', p='0', conf=line.split()[1]))
                        falseNeg += 1.0
                for j in range(numNeg):
                    line = resultFile.readline()
                    if line[0] == '1':
                        resultSet.append(Result(t='0', p='1', conf=line.split()[1]))
                        falsePos += 1.0
                    else:
                        resultSet.append(Result(t='0', p='0', conf=line.split()[1]))
                        trueNeg += 1.0

                # Most confident predictions first.
                resultSet = sorted(resultSet, key=lambda l: float(l.conf), reverse=True)
                for r in resultSet:
                    resultOut.write(r.t + '\t' + r.p + '\t' + r.conf + '\n')
                resultList.append((truePos / (truePos + falseNeg), trueNeg / (trueNeg + falsePos)))

        with open("roc_" + fold + ".tsv", 'w') as rocOut:
            with open("pr_" + fold + ".tsv", 'w') as prOut:
                ssList = []
                prList = []
                sens = 0.0
                spec = 1.0
                # Walk down the confidence ranking; each positive raises the
                # sensitivity, each negative lowers the specificity.
                for r in resultSet:
                    if r.t == '1':
                        sens += 1.0 / numPos
                    if r.t == '0':
                        spec -= 1.0 / numNeg
                    fpr = 1 - spec

                    ssList.append((sens * self.hpSens, fpr * self.hpSpec))
                    if (sens * self.hpSens + fpr * self.ci * self.hpSpec) != 0:
                        prList.append((sens * self.hpSens / (sens * self.hpSens + fpr * self.ci * self.hpSpec), sens * self.hpSens))
                    rocOut.write(str(sens) + '\t' + str(fpr) + '\n')

                    # Precision is TP / (TP + FP), so the false-positive rate
                    # (1 - spec) belongs in the denominator, matching prList
                    # above; rows with an undefined precision are skipped.
                    denom = sens + fpr * self.ci
                    if denom != 0:
                        prOut.write(str(sens / denom) + '\t' + str(sens) + '\n')

        p = Plotter()
        p.plot_roc(ssList, "Test", "roc_" + fold + ".png")
        p.plot_pr(prList, "Test", self.ci, "pr_" + fold + ".png")

    ###################
    # Report Results
    ###################
    for i in range(len(resultList)):
        print("## SVM " + str(i) + " ##")
        print('Sensitivity: ' + str(resultList[i][0]))
        print('Specificity: ' + str(resultList[i][1]))
    avgSens = sum([result[0] for result in resultList]) / numFolds
    avgSpec = sum([result[1] for result in resultList]) / numFolds
    print('average Sensitivity: ' + str(avgSens))
    print('average Specificity: ' + str(avgSpec))
    print('Geometric mean: ' + str(pow(avgSens * avgSpec, 0.5)))
import sys
import getopt
from subprocess import call
from classes.FeatureSet import FeatureSet
from classes.ResultSet import ResultSet

# Scale an input feature file with a stored model's scaling parameters and
# run svm-predict on it.
#
# Parameters:
#
# -m: model name (<-m>.scale and <-m>.model should exist in models directory)
# -i: File containing input feature data
opts, extraparams = getopt.getopt(sys.argv[1:], 'm:i:')
modelName = inPath = None
for o, p in opts:
    if o == '-m':
        modelName = p
    elif o == '-i':
        inPath = p

# Both options are required; fail with a usage message instead of a
# NameError further down when one of them was not supplied.
if modelName is None or inPath is None:
    sys.exit('usage: -m <model name> -i <input feature file>')

fs = FeatureSet()
fs.load('data/' + inPath, patternClass='real')
fs.libsvm_scale(params='models/' + modelName + '.scale')
fs.export('tmp.libsvm')

# Predictions are written to data/<input>.results. The command is passed as
# an argument list (no shell) so that names containing spaces or shell
# metacharacters reach svm-predict verbatim.
call([
    'progs/libsvm-3.14/svm-predict',
    '-b', '1',
    'tmp.libsvm',
    'models/' + modelName + '.model',
    'data/' + inPath + '.results',
])
# call('rm tmp.libsvm', shell=True)
call('python extract_hairpins.py -i ' + negPath, shell=True)

print("### Extracting micropred features from coding regions")
negHairpins = negPath + '.nr.hairpins'
# The sampled sequences are written back over the file they were read from.
sl = SequenceList()
sl.load_fasta('data/' + negHairpins)
sl.select_random(10000)
sl.export_fasta('data/' + negHairpins)
call('python build_micropred_features.py -i ' + negHairpins + ' -n ' + numThreads, shell=True)
# call('python build_huntmi_features.py -i '+negPath+'.nr.hairpins')

################################################
# Build LibSVM model
################################################
print("### Building LibSVM model")
call('python build_model.py -p ' + speciesFilename + '.features -n ' + negHairpins + '.micropred -o ' + speciesFilename, shell=True)

################################################
# Build feature set from hairpin candidates in genome of interest
################################################
print("### Building hairpins from genome under exploration")
call('python extract_hairpins.py -i ' + inPath, shell=True)

print("### Extracting micropred features from genome under exploration")
candidateHairpins = inPath + '.nr.hairpins'
call('python build_micropred_features.py -i ' + candidateHairpins + ' -n ' + numThreads, shell=True)

################################################
# Run svm-predict on all hairpin candidates in genome of interest
################################################
# Scale the candidates with the parameters stored next to the model, export
# them in libsvm format, then predict into data/<input>.nr.hairpins.results.
modelPrefix = 'models/' + speciesFilename
candidatePrefix = 'data/' + candidateHairpins
fs = FeatureSet()
fs.load(candidatePrefix + '.micropred', patternClass='real')
fs.libsvm_scale(params=modelPrefix + '.scale')
fs.export(candidatePrefix + '.libsvm')
call('progs/libsvm-3.14/svm-predict -b 1 ' + candidatePrefix + '.libsvm ' + modelPrefix + '.model ' + candidatePrefix + '.results', shell=True)
from classes.FeatureSet import FeatureSet

# Build a LibSVM model from a positive and a negative feature file.
#
# Parameters:
#
# -p: File name for positive feature set (any file type)
# -n: File name for negative feature set (any file type)
# -o: Name of output LibSVM model
opts, extraparams = getopt.getopt(sys.argv[1:], 'o:p:n:')
posPath = negPath = outPath = None
for o, p in opts:
    if o == '-p':
        posPath = p
    elif o == '-n':
        negPath = p
    elif o == '-o':
        outPath = p

# All three options are required; fail with a usage message instead of a
# NameError further down when one of them was not supplied.
if posPath is None or negPath is None or outPath is None:
    sys.exit('usage: -p <positive features> -n <negative features> -o <model name>')

# Aggregate inputs, export to libsvm file
fs = FeatureSet()
fs.load('data/' + posPath, patternClass='real')
fs.add_instances('data/' + negPath, patternClass='pseudo')
fs.weka_smote()
fs.libsvm_scale(paramOut='models/' + outPath + '.scale')
fs.export('tmp.libsvm')

# Build model
#
# The command is passed as an argument list (no shell) so that a model name
# containing spaces or shell metacharacters is handed to svm-train verbatim.
call([
    'svm-train',
    '-c', '10000000', '-d', '1', '-h', '1', '-e', '0.001', '-g', '0.0019531', '-b', '1',
    'tmp.libsvm',
    'models/' + outPath + '.model',
])

# Clean up
call(['rm', 'tmp.libsvm'])
for result in resultList: sens += result[0] spec += result[1] return (sens/len(resultList), spec/len(resultList), math.sqrt(spec/len(resultList)*sens/len(resultList))) ################################################# # Run cross validation, build result files ################################################# # Load data from positive and negative input files allData = FeatureSet() allData.load('data/'+posFile, patternClass='real') allData.add_instances('data/'+negFile, patternClass='pseudo') allData.libsvm_scale(paramOut = 'data/params') subsets = allData.get_cv_subsets(numFolds) resultList = [] # Go through all n folds... for i in range(numFolds): # Build training and test sets testSet = subsets[i] trainSet = FeatureSet() for j in range(numFolds): if j != i: trainSet.add_instances_from_featureset(subsets[j]) # Create svm files for train and test fold data. Train and test on these files. trainSet.weka_smote() trainSet.export_svm('data/trainSet.libsvm') testSet.export_svm('data/testSet.libsvm') # SVM settings for HMP features