Python FeatureSet 예제들, classes.FeatureSet.FeatureSet Python 예제들

예제 #1

0

파일 보기

파일: build_model.py 프로젝트: jrgreen7/SMIRP

from subprocess import call
from classes.FeatureSet import FeatureSet

# Build a LibSVM model from one positive and one negative feature file.
#
# Parameters:
#
# -p: File name for positive feature set (any file type)
# -n: File name for negative feature set (any file type)
# -o: Name of output LibSVM model
#
# Input files are read from data/; <-o>.scale and <-o>.model are written
# to models/.
import os

posPath = negPath = outPath = None
opts, extraparams = getopt.getopt(sys.argv[1:], 'o:p:n:')
for o, p in opts:
	if o == '-p':
		posPath = p
	elif o == '-n':
		negPath = p
	elif o == '-o':
		outPath = p
# Fail with a usage message instead of a NameError further down.
if posPath is None or negPath is None or outPath is None:
	sys.exit('usage: build_model.py -p <positive features> -n <negative features> -o <model name>')

# Aggregate inputs, export to libsvm file
fs = FeatureSet()
fs.load('data/'+posPath, patternClass='real')
fs.add_instances('data/'+negPath, patternClass='pseudo')
fs.weka_smote()
fs.libsvm_scale(paramOut='models/'+outPath+'.scale')
fs.export('tmp.libsvm')
# Build model
# Micropred: -c 100 -d 1 -h 1 -e 0.001 -g 0.0019531
# HeteroMir: -c 1 -d 1 -h 1 -e 0.001 -g 0.06
# The command is passed as an argument list (no shell), so a model name
# containing spaces or shell metacharacters reaches svm-train verbatim.
call([
	'progs/libsvm-3.14/svm-train',
	'-c', '1', '-d', '1', '-h', '1', '-e', '0.001', '-g', '0.06', '-b', '1',
	'tmp.libsvm', 'models/'+outPath+'.model',
])
# Clean up
if os.path.exists('tmp.libsvm'):
	os.remove('tmp.libsvm')

예제 #2

0

파일 보기

파일: predict.py 프로젝트: CU-BIC/SMIRP

import sys
import getopt
from subprocess import call
from classes.FeatureSet import FeatureSet
from classes.ResultSet import ResultSet

# Score an input feature file with a previously built LibSVM model.
#
# Parameters:
#
# -m: model name (<-m>.scale and <-m>.model should exist in models directory)
# -i: File containing input feature data
#
# Predictions are written to data/<-i>.results.
modelName = inPath = None
opts, extraparams = getopt.getopt(sys.argv[1:], 'm:i:')
for o, p in opts:
	if o == '-m':
		modelName = p
	elif o == '-i':
		inPath = p
# Fail with a usage message instead of a NameError further down.
if modelName is None or inPath is None:
	sys.exit('usage: predict.py -m <model name> -i <input feature file>')

fs = FeatureSet()
fs.load('data/'+inPath, patternClass='real')
# Re-use the scaling parameters stored next to the model.
fs.libsvm_scale(params='models/'+modelName+'.scale')
fs.export('tmp.libsvm')
# The command is passed as an argument list (no shell), so paths containing
# spaces or shell metacharacters reach svm-predict verbatim.
call([
	'progs/libsvm-3.14/svm-predict', '-b', '1',
	'tmp.libsvm',
	'models/'+modelName+'.model',
	'data/'+inPath+'.results',
])
# NOTE: tmp.libsvm is intentionally left on disk; its removal is disabled.

예제 #3

0

파일 보기

    sens = 0.0
    spec = 0.0
    for result in resultList:
        sens += result[0]
        spec += result[1]

    return (sens / len(resultList), spec / len(resultList),
            math.sqrt(spec / len(resultList) * sens / len(resultList)))


#################################################
# Cross validation driver: produces the per-fold files
#################################################

# Pool the 'real' (positive) and 'pseudo' (negative) instances in one set,
# scale them together and cut the result into numFolds subsets.
allData = FeatureSet()
allData.load_micropred('data/%s' % posFile, patternClass='real')
allData.add_instances_from_micropred('data/%s' % negFile, patternClass='pseudo')
allData.libsvm_scale(paramOut='data/params')
subsets = allData.get_subsets(numFolds)
resultList = []
# One pass per fold
for i in range(numFolds):
    # Fold i is held out for testing; every other fold feeds training.
    testSet = subsets[i]
    trainSet = FeatureSet()
    for j in range(numFolds):
        if j == i:
            continue
        trainSet.add_instances_from_featureset(subsets[j])
    # Write the training data in svm format for the external tools.
    trainSet.export_svm('data/trainSet.libsvm')

예제 #4

0

파일 보기

파일: predict.py 프로젝트: jrgreen7/SMIRP

import sys
import getopt
from subprocess import call
from classes.FeatureSet import FeatureSet
from classes.ResultSet import ResultSet

# Score an input feature file with a previously built LibSVM model.
#
# Parameters:
#
# -m: model name (<-m>.scale and <-m>.model should exist in models directory)
# -i: File containing input feature data
#
# Predictions are written to data/<-i>.results.
modelName = inPath = None
opts, extraparams = getopt.getopt(sys.argv[1:], 'm:i:')
for o, p in opts:
	if o == '-m':
		modelName = p
	elif o == '-i':
		inPath = p
# Fail with a usage message instead of a NameError further down.
if modelName is None or inPath is None:
	sys.exit('usage: predict.py -m <model name> -i <input feature file>')

fs = FeatureSet()
fs.load('data/'+inPath, patternClass='real')
# Re-use the scaling parameters stored next to the model.
fs.libsvm_scale(params='models/'+modelName+'.scale')
fs.export('tmp.libsvm')
# The command is passed as an argument list (no shell), so paths containing
# spaces or shell metacharacters reach svm-predict verbatim.
call([
	'progs/libsvm-3.14/svm-predict', '-b', '1',
	'tmp.libsvm',
	'models/'+modelName+'.model',
	'data/'+inPath+'.results',
])
# NOTE: tmp.libsvm is intentionally left on disk; its removal is disabled.

예제 #5

0

파일 보기

파일: ScanningTestFramework.py 프로젝트: jrgreen7/SMIRP

	def crossValidate(self, posFile, negFile, numFolds):
		"""Run numFolds-fold cross validation with LibSVM and print a summary.

		'data/<posFile>' is loaded as class 'real' and 'data/<negFile>' as
		class 'pseudo'. The pooled set is scaled (parameters saved to
		'data/params') and split into numFolds subsets. For every fold i the
		remaining folds are merged, passed through weka_smote() and used to
		train 'models/<i>.model'; fold i is then scored with svm-predict.

		Files written per fold:
		  data/<i>.results    raw svm-predict output
		  data/<i>.sresults   true label, predicted label and confidence,
		                      sorted by descending confidence
		  roc_<i>.tsv         sensitivity and false positive rate per threshold
		  pr_<i>.tsv          precision and recall per threshold
		  roc_<i>.png, pr_<i>.png   plots produced by Plotter

		Reads self.hpSens and self.hpSpec (scale factors applied to the
		plotted rates) and self.ci (weight of the false positive rate in the
		precision formula; presumably the negative:positive class ratio --
		confirm against the class constructor).

		Per-fold and average sensitivity/specificity plus their geometric
		mean are printed to stdout. Nothing is returned.
		"""
		allData = FeatureSet()
		allData.load('data/'+posFile, patternClass='real')
		allData.add_instances('data/'+negFile, patternClass='pseudo')
		allData.libsvm_scale(paramOut = 'data/params')
		subsets = allData.get_cv_subsets(numFolds)
		resultList = []
		# Go through all n folds...
		for i in range(numFolds):
			# Build training and test sets
			testSet = subsets[i]
			trainSet = FeatureSet()
			for j in range(numFolds):
				if j != i:
					trainSet.add_instances_from_featureset(subsets[j])
			# Create svm files for train and test fold data. Train and test on these files.
			trainSet.weka_smote()
			trainSet.export_svm('data/trainSet.libsvm')
			testSet.export_svm('data/testSet.libsvm')
			# SVM settings for HMP features.
			# (MicroPred features would use: -c 10000000 -d 1 -h 1 -e 0.001 -g 0.0019531 -b 1)
			call('svm-train -c 1 -d 1 -h 1 -e 0.001 -g 0.06 -b 1 data/trainSet.libsvm models/'+str(i)+'.model', shell=True)
			call('svm-predict -b 1 data/testSet.libsvm models/'+str(i)+'.model data/'+str(i)+'.results', shell=True)

			# Class sizes of the held-out fold, looked up once per fold.
			numPos = testSet.get_numpos()
			numNeg = testSet.get_numneg()

			# Calculate sensitivity and specificity for fold model
			trueNeg = 0.0
			truePos = 0.0
			falseNeg = 0.0
			falsePos = 0.0
			resultSet = []
			with open('data/'+str(i)+'.results', 'r') as resultFile:
				# Skip the first line (presumably the svm-predict header row).
				resultFile.readline()
				# NOTE(review): the first numPos rows are treated as positives
				# and the next numNeg rows as negatives, i.e. this relies on
				# export_svm() writing positives first -- confirm.
				for j in range(numPos):
					line = resultFile.readline()
					if line[0] == '1':
						resultSet.append(Result(t='1', p='1', conf=line.split()[1]))
						truePos += 1.0
					else:
						resultSet.append(Result(t='1', p='0', conf=line.split()[1]))
						falseNeg += 1.0
				for j in range(numNeg):
					line = resultFile.readline()
					if line[0] == '1':
						resultSet.append(Result(t='0', p='1', conf=line.split()[1]))
						falsePos += 1.0
					else:
						resultSet.append(Result(t='0', p='0', conf=line.split()[1]))
						trueNeg += 1.0

			# Most confident predictions first, so that walking the list
			# corresponds to lowering the decision threshold.
			resultSet = sorted(resultSet, key=lambda l: float(l.conf), reverse=True)
			with open("data/"+str(i)+".sresults", 'w') as resultOut:
				for r in resultSet:
					resultOut.write(r.t + '\t' + r.p + '\t' + r.conf + '\n')

			resultList.append( (truePos/(truePos+falseNeg),trueNeg/(trueNeg+falsePos)) )

			ssList = []
			prList = []
			sens = 0.0
			spec = 1.0
			with open("roc_"+str(i)+".tsv", 'w') as rocOut, open("pr_"+str(i)+".tsv", 'w') as prOut:
				for r in resultSet:
					if r.t == '1':
						sens += 1.0 / numPos
					if r.t == '0':
						spec -= 1.0 / numNeg
					fpr = 1-spec
					ssList.append((sens*self.hpSens, fpr*self.hpSpec))
					scaledDenom = sens*self.hpSens+fpr*self.ci*self.hpSpec
					if scaledDenom != 0:
						prList.append((sens*self.hpSens/scaledDenom, sens*self.hpSens))
					rocOut.write(str(sens)+'\t'+str(fpr)+'\n')
					# Precision weighs the FALSE POSITIVE rate (1-spec), in
					# line with the prList formula above. The previous code
					# used spec itself, which wrote a wrong curve and could
					# divide by zero.
					rawDenom = sens+fpr*self.ci
					if rawDenom != 0:
						prOut.write(str(sens/rawDenom)+'\t'+str(sens)+'\n')

			# All output files are closed before plotting.
			p = Plotter()
			p.plot_roc(ssList, "Test", "roc_"+str(i)+".png")
			p.plot_pr(prList, "Test", self.ci, "pr_"+str(i)+".png")

		###################
		# Report Results
		###################
		# print(...) with one string argument behaves the same on Python 2 and 3.
		for i, result in enumerate(resultList):
			print("## SVM "+str(i)+" ##")
			print('Sensitivity: '+str(result[0]))
			print('Specificity: '+str(result[1]))
		sensSum = sum([result[0] for result in resultList])
		specSum = sum([result[1] for result in resultList])
		print('average Sensitivity: '+str(sensSum/numFolds))
		print('average Specificity: '+str(specSum/numFolds))
		print('Geometric mean: '+str(pow(sensSum/numFolds*specSum/numFolds, 0.5)))

예제 #6

0

파일 보기

파일: full_pipeline.py 프로젝트: CU-BIC/SMIRP

call('python extract_hairpins.py -i '+negPath, shell=True)
print "### Extracting micropred features from coding regions"
sl = SequenceList()
sl.load_fasta('data/'+negPath+'.nr.hairpins')
sl.select_random(10000)
sl.export_fasta('data/'+negPath+'.nr.hairpins')
call('python build_micropred_features.py -i '+negPath+'.nr.hairpins -n '+numThreads, shell=True)
# call('python build_huntmi_features.py -i '+negPath+'.nr.hairpins')

################################################
# Build LibSVM model
################################################
print "### Building LibSVM model"
call('python build_model.py -p '+speciesFilename+'.features -n '+negPath+'.nr.hairpins.micropred -o '+speciesFilename, shell=True)

################################################
# Build feature set from hairpin candidates in genome of interest
################################################
print "### Building hairpins from genome under exploration"
call('python extract_hairpins.py -i '+inPath, shell=True)
print "### Extracting micropred features from genome under exploration"
call('python build_micropred_features.py -i '+inPath+'.nr.hairpins -n '+numThreads, shell=True)

################################################
# Run svm-predict on all hairpin candidates in genome of interest
################################################
fs = FeatureSet()
fs.load('data/'+inPath+'.nr.hairpins.micropred', patternClass = 'real')
fs.libsvm_scale(params='models/'+speciesFilename+'.scale')
fs.export('data/'+inPath+'.nr.hairpins.libsvm')
call('progs/libsvm-3.14/svm-predict -b 1 data/'+inPath+'.nr.hairpins.libsvm models/'+speciesFilename+'.model data/'+inPath+'.nr.hairpins.results', shell=True)

예제 #7

0

파일 보기

파일: build_model.py 프로젝트: CU-BIC/SMIRP

from classes.FeatureSet import FeatureSet

# Build a LibSVM model from one positive and one negative feature file.
#
# Parameters:
#
# -p: File name for positive feature set (any file type)
# -n: File name for negative feature set (any file type)
# -o: Name of output LibSVM model
#
# Input files are read from data/; <-o>.scale and <-o>.model are written
# to models/.
import os

posPath = negPath = outPath = None
opts, extraparams = getopt.getopt(sys.argv[1:], 'o:p:n:')
for o, p in opts:
    if o == '-p':
        posPath = p
    elif o == '-n':
        negPath = p
    elif o == '-o':
        outPath = p
# Fail with a usage message instead of a NameError further down.
if posPath is None or negPath is None or outPath is None:
    sys.exit('usage: build_model.py -p <positive features> -n <negative features> -o <model name>')

# Aggregate inputs, export to libsvm file
fs = FeatureSet()
fs.load('data/' + posPath, patternClass='real')
fs.add_instances('data/' + negPath, patternClass='pseudo')
fs.weka_smote()
fs.libsvm_scale(paramOut='models/' + outPath + '.scale')
fs.export('tmp.libsvm')
# Build model
# The command is passed as an argument list (no shell), so a model name
# containing spaces or shell metacharacters reaches svm-train verbatim.
call([
    'svm-train',
    '-c', '10000000', '-d', '1', '-h', '1', '-e', '0.001',
    '-g', '0.0019531', '-b', '1',
    'tmp.libsvm', 'models/' + outPath + '.model',
])
# Clean up
if os.path.exists('tmp.libsvm'):
    os.remove('tmp.libsvm')

예제 #8

0

파일 보기

파일: build_huntmi_features.py 프로젝트: jrgreen7/SMIRP

# Pick up the input file name (-i); it is resolved relative to data/.
for o,p in opts:
	if o == '-i':
		inPath = p


# Fold the sequences with RNAfold (shell redirection feeds input and captures output).
call('progs/HuntMi/progs/RNAfold/RNAfold -noPS < data/%s > data/%s.fold' % (inPath, inPath), shell=True)

# Dustmasker feature, followed by removal of the temporary files it leaves behind.
call('python progs/HuntMi/progs/additional_features/dustmasker.py data/%s data/%s.dustmasker' % (inPath, inPath), shell=True)
call('rm tmp', shell=True)
call('rm data/%s.dustmasker.tmp' % inPath, shell=True)

# Loops, translate and triplet feature files: (script, input suffix, output suffix).
for script, srcSuffix, dstSuffix in (
		('loops', '.fold', '.loops'),
		('translate', '', '.translate'),
		('triplet', '.fold', '.triplet')):
	call('python progs/HuntMi/progs/additional_features/'+script+'.py data/'+inPath+srcSuffix+' data/'+inPath+dstSuffix, shell=True)

# Start from the micropred features, append each additional feature file and
# write the combined set as <input>.huntmi.
fs = FeatureSet()
fs.load('data/'+inPath+'.micropred')
for suffix in ('.dustmasker', '.triplet', '.loops', '.translate'):
	fs.add_features_from_micropred('data/'+inPath+suffix)
fs.export_micropred('data/'+inPath+'.huntmi')

# Remove the intermediate feature files.
for suffix in ('.fold', '.translate', '.triplet', '.loops', '.dustmasker'):
	call('rm data/'+inPath+suffix, shell=True)

예제 #9

0

파일 보기

파일: cross_validation.py 프로젝트: jrgreen7/SMIRP

	
	sens = 0.0
	spec = 0.0
	for result in resultList:
		sens += result[0]
		spec += result[1]

	return (sens/len(resultList), spec/len(resultList), math.sqrt(spec/len(resultList)*sens/len(resultList)))


#################################################
# Cross validation driver: produces the per-fold files
#################################################

# Pool the 'real' (positive) and 'pseudo' (negative) instances in one set,
# scale them together and cut the result into numFolds subsets.
allData = FeatureSet()
allData.load('data/%s' % posFile, patternClass='real')
allData.add_instances('data/%s' % negFile, patternClass='pseudo')
allData.libsvm_scale(paramOut='data/params')
subsets = allData.get_cv_subsets(numFolds)
resultList = []
# One pass per fold
for i in range(numFolds):
	# Fold i is held out for testing; every other fold feeds training.
	testSet = subsets[i]
	trainSet = FeatureSet()
	for j in range(numFolds):
		if j == i:
			continue
		trainSet.add_instances_from_featureset(subsets[j])
	# Apply weka_smote() to the training folds only, before they are exported.
	trainSet.weka_smote()

예제 #10

0

파일 보기

파일: convert_file.py 프로젝트: CU-BIC/SMIRP

import getopt, sys
from classes.FeatureSet import FeatureSet

# File extensions FeatureSet is asked to convert from.
SUPPORTED_FORMATS = (
    'micropred', 'features', 'huntmi', 'csv', 'svm', 'libsvm', 'arff'
)


def replace_extension(path, newFormat):
    """Return *path* with its last '.'-separated component replaced by *newFormat*.

    Everything up to and including the final '.' is kept, so
    'a.b.csv' -> 'a.b.<newFormat>'. A path without any '.' yields just
    *newFormat*, matching the behaviour of the original script.
    """
    base, dot, _ = path.rpartition('.')
    return base + dot + newFormat


def main():
    """Convert the feature file given by -i into the format given by -f.

    The input format is taken from the extension of the -i path; the output
    is written next to the input with the extension swapped for -f.
    """
    inPath = outFormat = None
    opts, extraparams = getopt.getopt(sys.argv[1:], 'i:f:')
    for o, p in opts:
        if o == '-i':
            inPath = p
        elif o == '-f':
            outFormat = p
    # Fail with a usage message instead of a NameError further down.
    if inPath is None or outFormat is None:
        sys.exit('usage: convert_file.py -i <input file> -f <output format>')

    inFormat = inPath.rpartition('.')[2]
    if inFormat not in SUPPORTED_FORMATS:
        # Previously a silent no-op; still not an error, but say why nothing happened.
        sys.stderr.write('convert_file.py: unsupported input format: ' + inFormat + '\n')
        return

    fs = FeatureSet()
    fs.load(inPath)
    fs.export(replace_extension(inPath, outFormat))


if __name__ == '__main__':
    main()