def collectMatureFrames(baseName = None, merLength = None):
	
	#Defaults, Definitions, and Castings
	if not baseName:
		print 'base name of file family needed!'
		return 1
	if not merLength:
		merLength = 18
		
	merLength = int(merLength)


	###############collect IDS that have the same kmer
	print 'Collecting kmer IDs'
	
	conf = cgConfig.cgConfig()
	idFileName = conf.conf['resultsRaw']
	idFile = open(idFileName, 'r')
	interFileName = ('./out/%s/' % baseName) + baseName + '.collection.intermediate'
	interFile = open(interFileName, 'w')
	
	doneList = []

	for line in idFile:
		#For each kmer: grab id and Xmer frames --> output
		kmerID = line.strip().split('\t')[0].split('.')[0]
		if kmerID not in doneList:
			doneList.append(kmerID)
			matureSeq = line.strip().split('\t')[3]
			for frame in returnFrames(matureSeq, merLength):
				interFile.write('%s\t%s\n' % (kmerID, frame))
				
	interFile.close()
	idFile.close()
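### collectMatureFrames() relies on a returnFrames() helper that is not shown in
### this listing.  A minimal, hypothetical stand-in (an assumption, not the
### author's code): yield every merLength-long window ("frame") of the mature
### sequence, which is what the intermediate file rows suggest.
def returnFramesSketch(matureSeq, merLength):
    frames = []
    for i in range(0, len(matureSeq) - merLength + 1):
        frames.append(matureSeq[i:i + merLength])
    return frames

#usage sketch: a 21-nt mature sequence gives four 18-mer frames
print returnFramesSketch('ACGUACGUACGUACGUACGUA', 18)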
def probe(tcc, conf = None):
	
	#fall back to the main configuration when none was passed in
	if not conf:
		conf = c.cgConfig('Main.conf')
	smallPath = conf.conf['smallPath']
	
	chrom, strand, start, end = cg.tccSplit(tcc)
	
	total = 0
	for lib in cg.recurseDir(smallPath, end = 'mapped.%s.wig' % strand):
		
		
		try:
			eLevels = stepVectorScan.scanVectorsFile(lib, [tcc])
		except:
			print lib, 'index failed'
			continue
			
		
		#find highest expression level
		highest = 0
		for coord in eLevels:
			if eLevels[coord] > highest:
				highest = eLevels[coord]
				
				
		if highest > 0:
			print lib, highest
			total += highest
			#print eLevels
		
	print total
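### probe() hands the tcc string to cg.tccSplit().  Elsewhere in this listing a
### tcc is split on ':' into chromosome, strand, start, end (strand written as
### '1' or '-1'), so a minimal stand-in parser would look like the sketch below;
### the real cg.tccSplit() may cast or validate differently (an assumption).
def tccSplitSketch(tcc):
    chrom, strand, start, end = tcc.split(':')
    return chrom, strand, int(start), int(end)

#usage sketch
print tccSplitSketch('chr1:1:1000:1200')   # -> ('chr1', '1', 1000, 1200)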
Example #3
def collectMatureFrames(baseName=None, merLength=None):

    #Defaults, Definitions, and Castings
    if not baseName:
        print 'base name of file family needed!'
        return 1
    if not merLength:
        merLength = 18

    merLength = int(merLength)

    ###############collect IDS that have the same kmer
    print 'Collecting kmer IDs'

    conf = cgConfig.cgConfig()
    idFileName = conf.conf['resultsRaw']
    idFile = open(idFileName, 'r')
    interFileName = ('./out/%s/' %
                     baseName) + baseName + '.collection.intermediate'
    interFile = open(interFileName, 'w')

    doneList = []

    for line in idFile:
        #For each kmer: grab id and Xmer frames --> output
        kmerID = line.strip().split('\t')[0].split('.')[0]
        if kmerID not in doneList:
            doneList.append(kmerID)
            matureSeq = line.strip().split('\t')[3]
            for frame in returnFrames(matureSeq, merLength):
                interFile.write('%s\t%s\n' % (kmerID, frame))

    interFile.close()
    idFile.close()
Example #4
def stageTWO(packetNumber=None):

    #Cast and defaults
    if not packetNumber:
        print 'need number of packet to run'
        return 1
    else:
        packetNumber = int(packetNumber)

    #####################################################
    #Load CONFIGURATION FILE:
    #####################################################

    conf = c.cgConfig()

    print '\nConfiguration:'
    for entry in conf.conf:
        print '..%s: %s' % (entry, conf.conf[entry])

    ####################################################
    #Files and File Naming
    ####################################################

    #scratchfile
    scratchFile = open(
        './scratch.txt', 'w'
    )  #This is just the file that the warnings for the perl script are redirected to

    #directories
    outDir = conf.conf['outDirectory']

    #Filenaming
    pipeName = conf.conf['runName'] + '-' + 's' + conf.conf[
        'frameStep'] + 'k' + conf.conf['kmerLength'] + 'b' + conf.conf[
            'mirBasePairs'] + '.' + str(packetNumber)
    foldOut = outDir + '/' + pipeName + '.folded.frames.txt'
    filterOut = outDir + '/' + pipeName + '.filtered.mirs.tsv'
    finalOut = outDir + '/' + pipeName + '.FINAL.mirs.tsv'

    ####################################################
    #Pipeline
    ####################################################

    print "\nSTARTING STAGE THREE\n(packet # %s)" % packetNumber

    print "-Filtering frames and finding prospective mirs"
    subprocess.Popen([
        'python', './filter.frames.py', '-i', foldOut, '-g',
        conf.conf['genomes'], '-b', conf.conf['mirBasePairs'], '-m',
        conf.conf['mirLength'], '-o', filterOut
    ]).wait()
    '''		
	print "-Running Conservation filter"
	subprocess.Popen(['perl', './get_percent_identity_list_fix.pl',
					'-g', conf.conf['genomes'],
					'-l', filterOut,
					'-o', finalOut], stderr = scratchFile).wait()
	'''

    print "DONE"
def probe(tcc, conf=None):

    #fall back to the main configuration when none was passed in
    if not conf:
        conf = c.cgConfig('Main.conf')
    smallPath = conf.conf['smallPath']

    chrom, strand, start, end = cg.tccSplit(tcc)

    total = 0
    for lib in cg.recurseDir(smallPath, end='mapped.%s.wig' % strand):

        try:
            eLevels = stepVectorScan.scanVectorsFile(lib, [tcc])
        except:
            print lib, 'index failed'
            continue

        #find highest expression level
        highest = 0
        for coord in eLevels:
            if eLevels[coord] > highest:
                highest = eLevels[coord]

        if highest > 0:
            print lib, highest
            total += highest
            #print eLevels

    print total
def splitExonsIntrons(cName = None):
	mConf = c.cgConfig('Main.conf')
	conf = c.getConfig(cName)
	
	#init
	organism = conf.conf['organism']
	minOverlap = 50
	cHairs = getHairpins.getHairpins() #CID: HAIRPIN
	exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
	hairpins = []
	for CID in cHairs:
		hairpins.append(cHairs[CID])
	
	print 'checking overlaps'
	#check which hairpins overlap exons and by how much
	exonOverlapped = compare.compareTwoTcc(hairpins, exonList, 1, amount = True)
	print '  ', len(exonOverlapped)
	
	print 'removing partial introns'
	#remove the ones that didn't overlap more than X:
	remList = []
	for tcc, oAmount in exonOverlapped:
		if oAmount < minOverlap:
			remList.append([tcc, oAmount])
	
	for item in remList:
		exonOverlapped.remove(item)
	print '  ', len(exonOverlapped), 'out of', len(cHairs.keys())
		
	#get CIDs of exons
	exonCIDs = []
	for tcc, oAmount in exonOverlapped:
		for CID in cHairs:
			if cHairs[CID] == tcc:
				exonCIDs.append(str(CID))
	
	
	#Open sorted predictions and write lines with CIDs to respective files
	predFile = open(conf.conf['resultsSorted'], 'r')
	exonFile = open(conf.conf['resultsSorted'] + '.exons', 'w')
	intronFile = open(conf.conf['resultsSorted'] + '.introns', 'w')
	for line in predFile:
		if line.split('\t')[7] in exonCIDs:
			exonFile.write(line)
		else:
			intronFile.write(line)
	predFile.close()
	exonFile.close()
	intronFile.close()
def splitExonsIntrons(cName=None):
    mConf = c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    #init
    organism = conf.conf['organism']
    minOverlap = 50
    cHairs = getHairpins.getHairpins()  #CID: HAIRPIN
    exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
    hairpins = []
    for CID in cHairs:
        hairpins.append(cHairs[CID])

    print 'checking overlaps'
    #check which hairpins overlap exons and by how much
    exonOverlapped = compare.compareTwoTcc(hairpins, exonList, 1, amount=True)
    print '  ', len(exonOverlapped)

    print 'removing partial introns'
    #remove the ones that didn't overlap more than X:
    remList = []
    for tcc, oAmount in exonOverlapped:
        if oAmount < minOverlap:
            remList.append([tcc, oAmount])

    for item in remList:
        exonOverlapped.remove(item)
    print '  ', len(exonOverlapped), 'out of', len(cHairs.keys())

    #get CIDs of exons
    exonCIDs = []
    for tcc, oAmount in exonOverlapped:
        for CID in cHairs:
            if cHairs[CID] == tcc:
                exonCIDs.append(str(CID))

    #Open sorted predictions and write lines with CIDs to respective files
    predFile = open(conf.conf['resultsSorted'], 'r')
    exonFile = open(conf.conf['resultsSorted'] + '.exons', 'w')
    intronFile = open(conf.conf['resultsSorted'] + '.introns', 'w')
    for line in predFile:
        if line.split('\t')[7] in exonCIDs:
            exonFile.write(line)
        else:
            intronFile.write(line)
    predFile.close()
    exonFile.close()
    intronFile.close()
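### splitExonsIntrons() keeps only hairpins whose exon overlap reaches
### minOverlap, building remList and calling .remove() per item.  The same
### filter can be written as a single comprehension; the tcc strings below are
### invented for illustration.
minOverlap = 50
exonOverlapped = [['chr1:1:100:200', 80], ['chr1:1:300:400', 10]]
exonOverlapped = [pair for pair in exonOverlapped if pair[1] >= minOverlap]
print exonOverlapped   #only the 80-base overlap survives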
Example #8
def createMultiTrackDir(dirName, organism):
	'''This differs from the one above because it doesn't require meta info;
	it just makes a merged wig for everything in the directory.'''
	mainConf = c.cgConfig('Main.conf')
	
	fileList = []
	for file in cg.recurseDir(dirName, end = '.mapped'):
		fileList.append(file)
				
	#make merged wig
	if organism == 'human':
		chroms = cg.humanChromosomes
		assembly = 'hg19'
	elif organism == 'mouse':
		chroms = cg.mouseChromosomes
		assembly = 'mm9'
	elif organism == 'zebrafish':
		chroms = cg.zebrafishChromosomes
		assembly = 'danRer6'
	
	print 'Making Bed File vectors'
	cvg = HTSeq.GenomicArray(chroms, stranded=True, typecode='i')
	for fName in fileList:
		alignment_file = HTSeq.BowtieReader(fName)
		for alngt in alignment_file:
			if alngt.aligned:
				cvg.add_value( 1, alngt.iv ) #iv is the genomic interval..

	bedNamePos = dirName + '/Merge.' + organism + '.1.wig'
	bedNameNeg = dirName + '/Merge.' + organism + '.-1.wig'
	
	print 'Writing Bed File'
	cvg.write_bedgraph_file(bedNamePos, "+" )
	cvg.write_bedgraph_file(bedNameNeg, "-" )

	#Now extend it
	updateWigLength(bedNamePos, assembly)
	updateWigLength(bedNameNeg, assembly)
	
	#Now Sort it.
	cgSort.wigSort(bedNamePos)
	cgSort.wigSort(bedNameNeg)
def finish(packetNumber):
	#init
	conf = c.cgConfig()

	#directories
	outDir = conf.conf['outDirectory']

	#Filenaming
	pipeName = conf.conf['runName'] + '-' + 's' + conf.conf['frameStep'] + 'k' + conf.conf['kmerLength'] + 'b' + conf.conf['mirBasePairs'] + '.' + str(packetNumber)
	extractOut = outDir + '/' + pipeName + '.folding.frames.tsv'
	splitDirectory = outDir + '/' + pipeName + '/'
	foldOut = outDir + '/' + pipeName + '.folded.frames.txt'


	print '-Stitching %s files back into one (%s)' % (conf.conf['numSplitFiles'], packetNumber)
	subprocess.Popen(['python', './stitchdb.py',
					'-b', pipeName,
					'-n', conf.conf['numSplitFiles'],
					'-o', outDir + '/',
					'-i', splitDirectory]).wait()
def queryNumJobsQ(user):
	
	#init
	mainConf = cgConfig.cgConfig('Main.conf')
	scratchFileName = mainConf.conf['qstatScratch']
	
	#output qstat info to scratch file
	sFile = open(scratchFileName, 'w')
	subprocess.Popen(['qstat', '-u', user], stdout=sFile).wait()
	sFile.close()
	
	#parse scratchfile, check how many times user name appears
	sFile = open(scratchFileName, 'r')
	userCount = 0
	for line in sFile:
		if user in line:
			userCount = userCount + 1
	sFile.close()
	
	return userCount
Example #11
def queryNumJobsQ(user):

    #init
    mainConf = cgConfig.cgConfig('Main.conf')
    scratchFileName = mainConf.conf['qstatScratch']

    #output qstat info to scratch file
    sFile = open(scratchFileName, 'w')
    subprocess.Popen(['qstat', '-u', user], stdout=sFile).wait()
    sFile.close()

    #parse scratchfile, check how many times user name appears
    sFile = open(scratchFileName, 'r')
    userCount = 0
    for line in sFile:
        if user in line:
            userCount = userCount + 1
    sFile.close()

    return userCount
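### queryNumJobsQ() round-trips qstat output through a scratch file.  The same
### count can be taken in memory with subprocess.PIPE; this is an alternative
### sketch, not the author's code, and it assumes qstat is on the PATH.
import subprocess

def queryNumJobsQPiped(user):
    proc = subprocess.Popen(['qstat', '-u', user], stdout=subprocess.PIPE)
    output = proc.communicate()[0]
    return sum(1 for line in output.splitlines() if user in line)

#usage sketch (only meaningful on a cluster submit node):
#print queryNumJobsQPiped('chrisgre')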
def finish(packetNumber):
    #init
    conf = c.cgConfig()

    #directories
    outDir = conf.conf['outDirectory']

    #Filenaming
    pipeName = conf.conf['runName'] + '-' + 's' + conf.conf[
        'frameStep'] + 'k' + conf.conf['kmerLength'] + 'b' + conf.conf[
            'mirBasePairs'] + '.' + str(packetNumber)
    extractOut = outDir + '/' + pipeName + '.folding.frames.tsv'
    splitDirectory = outDir + '/' + pipeName + '/'
    foldOut = outDir + '/' + pipeName + '.folded.frames.txt'

    print '-Stitching %s files back into one (%s)' % (
        conf.conf['numSplitFiles'], packetNumber)
    subprocess.Popen([
        'python', './stitchdb.py', '-b', pipeName, '-n',
        conf.conf['numSplitFiles'], '-o', outDir + '/', '-i', splitDirectory
    ]).wait()
Example #13
def createTrackInDir(dirName):
	'''Every Q function has a corresponding shell script
	Make wig file for all mapped files, for all organisms'''
	
	wrapperShell = '/home/chrisgre/scripts/mapping/createTrack.sh'
	
	mainConf = c.cgConfig('Main.conf')
	metaFileName = mainConf.conf['metaFileName']

	for file in cg.recurseDir(dirName, end = '.mapped'):
						
		#check if mouse or human
		baseFName = cg.getBaseFileName(file)
		baseFName = baseFName.split('.')[0]
		
		metaDict = cg.getMetaFileDict(metaFileName)
		
		org = 'None'
		if baseFName in metaDict:
			if metaDict[baseFName][1] == 'NONE':
				print '  NO ORG KNOWN FOR', file
				continue
			else:
				org = metaDict[baseFName][1]
				print '  USING ORG', org, file
				
		#check if there is an organism, must check due to files not in metaFile
		if org == 'None':
			print '  NO org (not in meta file)', file
			continue
			
		while True:
			#submit the job only if there are fewer than 1000 queued
			if clusterCheck.queryNumJobsQ('chrisgre') < 1000:
				#subprocess.Popen(['qsub', '-q', 'xiao', '-l', 'mem=4G', '-V', '-o', mainConf.conf['outLog'], '-e', mainConf.conf['errorLog'], wrapperShell, file, org ])
				subprocess.Popen(['qsub', '-V', '-o', mainConf.conf['outLog'], '-e', mainConf.conf['errorLog'], wrapperShell, file, org ])
				#time.sleep(.5) #give it time to update qstat
				break
			else:#wait 20 secs before checking the queue again
				time.sleep(20)
Example #14
def stageTWO(packetNumber=None):

    #Cast and defaults
    if not packetNumber:
        print 'need number of packet to run'
        return 1
    else:
        packetNumber = int(packetNumber)

    #####################################################
    #Load CONFIGURATION FILE:
    #####################################################

    conf = c.cgConfig()
    '''
	print '\nConfiguration:'
	for entry in conf.conf:
		print '..%s: %s' % (entry, conf.conf.conf[entry])
		'''

    ####################################################
    #Files and File Naming
    ####################################################

    #scratchfile
    scratchFile = open(
        './scratch.txt', 'w'
    )  #This is just the file that the warnings for the perl script are redirected to

    #directories
    outDir = conf.conf['outDirectory']

    #Filenaming
    #!!!THIS IS DIFFERENT FROM MIRPIPEPARA!!! (pipeName is the name of the packet...)
    pipeName = conf.conf['runName'] + '-' + 's' + conf.conf[
        'frameStep'] + 'k' + conf.conf['kmerLength'] + 'b' + conf.conf[
            'mirBasePairs'] + '.' + str(packetNumber)
    extractOut = outDir + '/' + pipeName + '.folding.frames.tsv'
    splitDirectory = outDir + '/' + pipeName + '/'
    foldOut = outDir + '/' + pipeName + '.folded.frames.txt'

    ####################################################
    #Pipeline
    ####################################################

    print "\nSTARTING STAGE TWO\n(packet # %s)" % packetNumber

    if int(conf.conf['numSplitFiles']) == 1:  #if you want to run on one node
        print '-Folding Frames using RNAfold on SINGLE NODE'
        subprocess.Popen(['./RNAfold.sh', extractOut, foldOut]).wait()
    else:  #Else parallelize
        print '-Splitting folding frames into %s files (%s)' % (
            conf.conf['numSplitFiles'], packetNumber)
        subprocess.Popen([
            'python', './splitdb.py', '-i', extractOut, '-b', pipeName, '-n',
            conf.conf['numSplitFiles'], '-d', splitDirectory
        ]).wait()

        print '-Submitting %s separate jobs to cluster (%s)' % (
            conf.conf['numSplitFiles'], packetNumber)
        subprocess.Popen([
            'python', './parFold.py', '-b', pipeName, '-n',
            conf.conf['numSplitFiles'], '-d', splitDirectory
        ]).wait()

        #Get Job ID's:
        parseFile = open('%s.jobsinfo.txt' % pipeName, 'r')  #!!! edit this?
        jobIDs = []
        for line in parseFile:
            jobIDs.append(line.split(' ')[2])

        #check if right number were submitted
        if len(jobIDs) == int(conf.conf['numSplitFiles']):
            print '....jobs were submitted correctly'

    print 'DONE (%s)' % packetNumber
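### stageTWO() pulls job IDs out of the jobsinfo file with line.split(' ')[2].
### That index matches the usual qsub confirmation line shown below; the exact
### format of the file written by parFold.py is an assumption, it is not part
### of this listing.
sampleLine = 'Your job 1234567 ("fold.sh") has been submitted'
print sampleLine.split(' ')[2]   # -> 1234567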
# get the hairpin values
# can also reuse elements from this script to get CID elements...
import bioLibCG as cg
from bioLibCG import ss
import cgConfig as c

mConf = c.cgConfig("Main.conf")
conf = c.cgConfig()


def getHairpins(fN):
    predFile = open(fN, "r")

    # populate CID:hairpin range
    cHairs = {}
    for line in predFile:
        # get cluster ID
        CID = ss(line)[7]
        hairpin = ss(line)[2]

        if CID in cHairs:
            # check if the starts and ends need to be stretched
            hStart = int(ss(cHairs[CID], ":")[2])
            hEnd = int(ss(cHairs[CID], ":")[3])

            start = int(ss(hairpin, ":")[2])
            end = int(ss(hairpin, ":")[3])

            if start < hStart:
                hStart = start
            if end > hEnd:
                hEnd = end
def updateReadDensity(tType):
    #go through each chromosome's wig and check the mature seqs
    mainConf = cgConfig.cgConfig('Main.conf')
    conf = cgConfig.cgConfig()
    organism = conf.conf['organism']
    wigFolder = mainConf.conf['wig%s' % organism]
    newLines = []

    if tType == 'E':
        pFileName = conf.conf['resultsExons']
    elif tType == 'I':
        pFileName = conf.conf['resultsIntrons']
    else:
        print 'READ UPDATE FAIL'

    print '  Updating Read Density:', tType

    for wigFileN in cg.recurseDir(wigFolder, end='.wig'):

        #init
        chrom = wigFileN.strip().split('.')[-2]
        strand = wigFileN.strip().split('.')[-4]
        wigFile = open(wigFileN, 'r')
        mirFile = open(pFileName, 'r')
        print wigFileN

        #get rid of header
        wigFile.readline()

        print '  populating hitmap'
        #populate hitmap
        wigMap = {}
        for line in wigFile:
            value = int(line.strip().split('\t')[3].split('.')[0])
            if value > 0:
                start = int(line.strip().split('\t')[1])
                end = int(line.strip().split('\t')[2])
                for i in range(start, end):
                    wigMap[i] = value
        wigFile.close()

        print '  calculating hits for mature seqs'
        #calculate total hits per mature
        for line in mirFile:
            mTcc = line.strip().split('\t')[1]
            mirID = line.strip().split('\t')[0]
            if (mTcc.split(':')[0] == chrom) and (mTcc.split(':')[1]
                                                  == strand):
                #if mirID == '26477.30.106643972': print 'Starting Total Count'
                highestHit = 0
                for i in range(int(mTcc.split(':')[2]),
                               int(mTcc.split(':')[3])):
                    #if mirID == '26477.30.106643972': print '  ', i
                    if i in wigMap:
                        if wigMap[i] > highestHit:
                            highestHit = wigMap[i]
                        #if mirID == '26477.30.106643972': print '    ', i, totalHits, wigMap[i]

                newLines.append(cg.appendToLine(line, str(highestHit), 11))

        mirFile.close()

    print 'Writing New File'
    #write new results file
    outFile = open(pFileName, 'w')
    for line in newLines:
        outFile.write(line)
    outFile.close()

    ####NOW UPDATE HIGHEST HIT PER CLUSTER####

    clusterCount = {}

    pFile = open(pFileName, 'r')
    for line in pFile:
        predictionCount = int(line.strip().split('\t')[11])
        CID = line.strip().split('\t')[7]
        if CID in clusterCount:
            if clusterCount[CID] < predictionCount:
                clusterCount[CID] = predictionCount
        else:
            clusterCount[CID] = predictionCount
    pFile.close()

    #update the file --> cluster small count
    newLines = []
    predFile = open(pFileName, 'r')
    for line in predFile:
        CID = line.strip().split('\t')[7]
        numMax = clusterCount[CID]
        newLines.append(cg.appendToLine(line, str(numMax), 12))
    predFile.close()

    #sort newLines by clusterID
    sortDict = {}
    CIDs = []
    for line in newLines:
        CID = int(line.strip().split('\t')[7])
        if CID not in CIDs:
            CIDs.append(CID)
        if CID in sortDict:
            sortDict[CID].append(line)
        else:
            sortDict[CID] = [line]

    CIDs.sort()

    newLines = []
    for CID in CIDs:
        for line in sortDict[CID]:
            newLines.append(line)

    #write new File
    newFile = open(pFileName, 'w')
    for line in newLines:
        newFile.write(line)
    newFile.close()
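### The wigMap above maps every covered position to its wig value, then the
### highest value inside a mature tcc is kept.  A self-contained sketch of that
### hitmap/max lookup on two invented bedGraph-style lines (chrom, start, end,
### value):
wigLines = ['chr1\t100\t105\t7.0', 'chr1\t105\t110\t3.0']
wigMap = {}
for line in wigLines:
    fields = line.strip().split('\t')
    value = int(fields[3].split('.')[0])
    if value > 0:
        for i in range(int(fields[1]), int(fields[2])):
            wigMap[i] = value

highestHit = 0
for i in range(102, 108):   #mature range, end-exclusive as in the original
    if i in wigMap and wigMap[i] > highestHit:
        highestHit = wigMap[i]
print highestHit   # -> 7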
def updateReadDensity(tType):
	#go through each chromosome's wig and check the mature seqs
	mainConf = cgConfig.cgConfig('Main.conf')
	conf = cgConfig.cgConfig()
	organism = conf.conf['organism']
	wigFolder = mainConf.conf['wig%s' % organism]	
	newLines = []
	

	if tType == 'E':
		pFileName = conf.conf['resultsExons']
	elif tType == 'I':
		pFileName = conf.conf['resultsIntrons']
	else:
		print 'READ UPDATE FAIL'

	print '  Updating Read Density:', tType

	for wigFileN in cg.recurseDir(wigFolder, end = '.wig'):
		
		
		#init
		chrom = wigFileN.strip().split('.')[-2]
		strand = wigFileN.strip().split('.')[-4]
		wigFile = open(wigFileN, 'r')
		mirFile = open(pFileName, 'r')
		print wigFileN
		
		#get rid of header
		wigFile.readline()
		
		print '  populating hitmap'
		#populate hitmap
		wigMap = {}
		for line in wigFile:
			value = int(line.strip().split('\t')[3].split('.')[0])
			if value > 0:
				start = int(line.strip().split('\t')[1])
				end = int(line.strip().split('\t')[2])
				for i in range(start, end):
					wigMap[i] = value
		wigFile.close()
		
		print '  calculating hits for mature seqs'
		#calculate total hits per mature
		for line in mirFile:
			mTcc = line.strip().split('\t')[1]
			mirID = line.strip().split('\t')[0]
			if (mTcc.split(':')[0] == chrom) and (mTcc.split(':')[1] == strand):
				#if mirID == '26477.30.106643972': print 'Starting Total Count'
				highestHit = 0
				for i in range(int(mTcc.split(':')[2]), int(mTcc.split(':')[3])):
					#if mirID == '26477.30.106643972': print '  ', i 
					if i in wigMap:
						if wigMap[i] > highestHit:
							highestHit = wigMap[i]
						#if mirID == '26477.30.106643972': print '    ', i, totalHits, wigMap[i]
			
				newLines.append(cg.appendToLine(line, str(highestHit), 11))
		
		mirFile.close()

	print 'Writing New File'
	#write new results file
	outFile = open(pFileName, 'w')
	for line in newLines:
		outFile.write(line)
	outFile.close()

	####NOW UPDATE HIGHEST HIT PER CLUSTER####

	clusterCount = {}

	pFile = open(pFileName, 'r')
	for line in pFile:
		predictionCount = int(line.strip().split('\t')[11])
		CID = line.strip().split('\t')[7]
		if CID in clusterCount:
			if clusterCount[CID] < predictionCount:
				clusterCount[CID] = predictionCount
		else:
			clusterCount[CID] = predictionCount
	pFile.close()

	#update the file --> cluster small count
	newLines = []
	predFile = open(pFileName, 'r')
	for line in predFile:
		CID = line.strip().split('\t')[7]
		numMax = clusterCount[CID]
		newLines.append(cg.appendToLine(line, str(numMax), 12))
	predFile.close()

	#sort newLines by clusterID
	sortDict = {}
	CIDs = []
	for line in newLines:
		CID = int(line.strip().split('\t')[7])
		if CID not in CIDs:
			CIDs.append(CID)
		if CID in sortDict:
			sortDict[CID].append(line)
		else:
			sortDict[CID] = [line]
		
	CIDs.sort()

	newLines = []
	for CID in CIDs:
		for line in sortDict[CID]:
			newLines.append(line)

	#write new File
	newFile = open(pFileName, 'w')
	for line in newLines:
		newFile.write(line)
	newFile.close()
Example #18
def exonNoisy(cName = None):
	#init
	mConf = c.cgConfig('Main.conf')
	conf = c.getConfig(cName)
	cHairs = getHairpins.getHairpins(conf.conf['resultsExons']) #CID: HAIRPIN
	organism = conf.conf['organism']
	geneSetFolder = mConf.conf['geneSets%s' % organism]
	
	#make prediction overlap hitmap
	print 'Making prediction list'
	predList = [] 
	for CID in cHairs:
		hPin = cHairs[CID]
		predList.append(hPin)
	
	if compare.checkIfOverlaps(predList):
		predList = compare.collapseOverlaps(predList)
	
	
	#make genes for Ensemble/make list of tccs for exons.
	print 'Creating gene set'
	
	ensGenes = cgGenes.createGeneSetFromFile(geneSetFolder + '/ensemblAllExons.tsv')
	print '  loaded # genes:', len(ensGenes.set)
	
	
	#collect levels for each hairpin region
	print '[Checking all levels]'
	cidLevels = {}
	for CID in cHairs:
		print CID
		hPin = cHairs[CID]
			
		#for each hairpin, --> find overlapping transcripts in same gene
		overlappingGenes = ensGenes.geneOverlaps([hPin])
		if len(overlappingGenes) > 0:
			gIDs = [gene.id for gene in overlappingGenes]
			allTccs = ensGenes.getTccsFromGIDs(gIDs)
			if compare.checkIfOverlaps(allTccs):
				print '  Overlaps...collapsing'
				allTccs = compare.collapseOverlaps(allTccs)
		else:
			print 'NO GENE OVERLAPS!!!!!', CID, hPin
			allTccs = [] #avoid a NameError downstream when nothing overlaps
		
		
		#filter out my predictions.
		print '  Filtering out predictions'
		checkList = compare.subtractTwoTccLists(allTccs, predList)
			
		
		#Get Expression level for gene.
		print '  Retrieving Expression levels:', cg.getTccListTotalLength(checkList)
		levels = []
		
		
		hPinLevels = stepVectorScan.scanVectorsHist(checkList, cName)
		for hPin in hPinLevels:
			levels.extend(hPinLevels[hPin])
		
			
		cidLevels[CID] = levels
		
	
	
	
	#output levels to file
	print 'Outputting to file'
	#find longest
	longest = 0
	for CID in cidLevels:
		length = len(cidLevels[CID])
		if length > longest:
			longest = length
	
	sortedKeys = cidLevels.keys()
	sortedKeys.sort()
	#print sortedKeys
	
	newLines = []
	for j in range(0, longest): #how many lines are there
		newLine = []
		for CID in sortedKeys:
			if len(cidLevels[CID]) > j:# add it
				newLine.append(str(cidLevels[CID][j]))
			else:
				newLine.append('NA')
	
		newLines.append('\t'.join(newLine) + '\n')
	
	outFileN = conf.conf['exonNoiseData']
	outFile = open(outFileN, 'w')
	outFile.write('\t'.join(sortedKeys) + '\n')
	outFile.writelines(newLines)
	outFile.close()
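### exonNoisy() (and intronNoisy() below) write the per-cluster level lists as
### columns of a tab-delimited table, padding short columns with 'NA'.  A
### self-contained sketch of that padding on toy data; note the keys sort as
### strings, exactly as in the original.
cidLevels = {'10': [1, 2, 3], '2': [5]}
sortedKeys = sorted(cidLevels)
longest = max(len(cidLevels[CID]) for CID in cidLevels)

rows = ['\t'.join(sortedKeys)]
for j in range(longest):
    row = []
    for CID in sortedKeys:
        row.append(str(cidLevels[CID][j]) if j < len(cidLevels[CID]) else 'NA')
    rows.append('\t'.join(row))
print '\n'.join(rows)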
def intronNoisy(cName = None):
	mConf = c.cgConfig('Main.conf')
	conf = c.getConfig(cName)
	
	#init
	cHairs = getHairpins.getHairpins(conf.conf['resultsIntrons']) #CID: HAIRPIN
	organism = conf.conf['organism']
	exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
	slide = 1000
	
	#make prediction overlap hitmap
	predMap = {}
	predList = []
	for CID in cHairs:
		hPin = cHairs[CID]
		predList.append(hPin)
	
	#collapse Overlaps
	print ' collapsing predictions'
	predList = compare.collapseOverlaps(predList)
	print ' collapsing exons'
	exonList = compare.collapseOverlaps(exonList)
	
	
	#collect levels for each hairpin region
	cidLevels = {}
	for CID in cHairs:
		print CID
		hPin = cHairs[CID]
		chrom = ss(hPin, ':')[0]
		strand = ss(hPin, ':')[1]
		start = int(ss(hPin, ':')[2])
		end = int(ss(hPin, ':')[3])
		
		scanStart = start - slide
		scanEnd = end + slide
		
		scanRange = []
		scanRange.append('%s:%s:%s:%s' % (chrom, strand, scanStart, start))
		scanRange.append('%s:%s:%s:%s' % (chrom, strand, end, scanEnd))
		
		print scanRange
		scanRange = compare.subtractTwoTccLists(scanRange, predList)
		scanRange = compare.subtractTwoTccLists(scanRange, exonList)
			
		levels = []
		
		print '  Retrieving Expression levels:', cg.getTccListTotalLength(scanRange)
		levels = []
		
		
		hPinLevels = stepVectorScan.scanVectorsHist(scanRange, cName)
		for hPin in hPinLevels:
			levels.extend(hPinLevels[hPin])
		
			
		cidLevels[CID] = levels
		
	#output levels to file
	
	#find longest
	longest = 0
	for CID in cidLevels:
		length = len(cidLevels[CID])
		if length > longest:
			longest = length
	
	sortedKeys = cidLevels.keys()
	sortedKeys.sort()
	
	newLines = []
	for j in range(0, longest): #how many lines are there
		newLine = []
		for CID in sortedKeys:
			if len(cidLevels[CID]) > j:# add it
				newLine.append(str(cidLevels[CID][j]))
			else:
				newLine.append('NA')
	
		newLines.append('\t'.join(newLine) + '\n')
	
	outFileN = conf.conf['intronNoiseData']
	outFile = open(outFileN, 'w')
	outFile.write('\t'.join(sortedKeys) + '\n')
	outFile.writelines(newLines)
	outFile.close()
def updateReadDensity(tType, cName):
	#go through each chromosome's wig and check the mature seqs
	mainConf = cgConfig.cgConfig('Main.conf')
	conf = cgConfig.getConfig(cName)
	organism = conf.conf['organism']
	wigFolder = mainConf.conf['wig%s' % organism]	
	newLines = []
	
	
	#Differentiate between exon or intron...
	if tType == 'E':
		pFileName = conf.conf['resultsExons']
	elif tType == 'I':
		pFileName = conf.conf['resultsIntrons']
	else:
		print 'READ UPDATE FAIL'

	print '  Updating Read Density:', tType

	
	#get read density for each line...
	print '  calculating hits for mature seqs'
	#calculate total hits per mature
	mirFile = open(pFileName, 'r')
	for line in mirFile:
		mTcc = line.strip().split('\t')[1]
		mirID = line.strip().split('\t')[0]
		
		tccStretch = cgPeaks.stretch(mTcc, cName)
		highestHit = 0
		for i in range(int(mTcc.split(':')[2]), int(mTcc.split(':')[3])):
			if i in tccStretch.profile:
				if tccStretch.profile[i] > highestHit:
					highestHit = tccStretch.profile[i]		
		
		newLines.append(cg.appendToLine(line, str(highestHit), 11))
	
	mirFile.close()

	print 'Writing New File'
	#write new results file
	outFile = open(pFileName, 'w')
	for line in newLines:
		outFile.write(line)
	outFile.close()

	####NOW UPDATE HIGHEST HIT PER CLUSTER####

	clusterCount = {}

	pFile = open(pFileName, 'r')
	for line in pFile:
		predictionCount = int(line.strip().split('\t')[11])
		CID = line.strip().split('\t')[7]
		if CID in clusterCount:
			if clusterCount[CID] < predictionCount:
				clusterCount[CID] = predictionCount
		else:
			clusterCount[CID] = predictionCount
	pFile.close()

	#update the file --> cluster small count
	newLines = []
	predFile = open(pFileName, 'r')
	for line in predFile:
		CID = line.strip().split('\t')[7]
		numMax = clusterCount[CID]
		newLines.append(cg.appendToLine(line, str(numMax), 12))
	predFile.close()

	#sort newLines by clusterID
	sortDict = {}
	CIDs = []
	for line in newLines:
		CID = int(line.strip().split('\t')[7])
		if CID not in CIDs:
			CIDs.append(CID)
		if CID in sortDict:
			sortDict[CID].append(line)
		else:
			sortDict[CID] = [line]
		
	CIDs.sort()

	newLines = []
	for CID in CIDs:
		for line in sortDict[CID]:
			newLines.append(line)

	#write new File
	newFile = open(pFileName, 'w')
	for line in newLines:
		newFile.write(line)
	newFile.close()
Example #21
def intronNoisy(cName=None):
    mConf = c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    #init
    cHairs = getHairpins.getHairpins(
        conf.conf['resultsIntrons'])  #CID: HAIRPIN
    organism = conf.conf['organism']
    exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
    slide = 1000

    #make prediction overlap hitmap
    predMap = {}
    predList = []
    for CID in cHairs:
        hPin = cHairs[CID]
        predList.append(hPin)

    #collapse Overlaps
    print ' collapsing predictions'
    predList = compare.collapseOverlaps(predList)
    print ' collapsing exons'
    exonList = compare.collapseOverlaps(exonList)

    #collect levels for each hairpin region
    cidLevels = {}
    for CID in cHairs:
        print CID
        hPin = cHairs[CID]
        chrom = ss(hPin, ':')[0]
        strand = ss(hPin, ':')[1]
        start = int(ss(hPin, ':')[2])
        end = int(ss(hPin, ':')[3])

        scanStart = start - slide
        scanEnd = end + slide

        scanRange = []
        scanRange.append('%s:%s:%s:%s' % (chrom, strand, scanStart, start))
        scanRange.append('%s:%s:%s:%s' % (chrom, strand, end, scanEnd))

        print scanRange
        scanRange = compare.subtractTwoTccLists(scanRange, predList)
        scanRange = compare.subtractTwoTccLists(scanRange, exonList)

        levels = []

        print '  Retrieving Expression levels:', cg.getTccListTotalLength(
            scanRange)
        levels = []

        hPinLevels = stepVectorScan.scanVectorsHist(scanRange, cName)
        for hPin in hPinLevels:
            levels.extend(hPinLevels[hPin])

        cidLevels[CID] = levels

    #output levels to file

    #find longest
    longest = 0
    for CID in cidLevels:
        length = len(cidLevels[CID])
        if length > longest:
            longest = length

    sortedKeys = cidLevels.keys()
    sortedKeys.sort()

    newLines = []
    for j in range(0, longest):  #how many lines are there
        newLine = []
        for CID in sortedKeys:
            if len(cidLevels[CID]) > j:  # add it
                newLine.append(str(cidLevels[CID][j]))
            else:
                newLine.append('NA')

        newLines.append('\t'.join(newLine) + '\n')

    outFileN = conf.conf['intronNoiseData']
    outFile = open(outFileN, 'w')
    outFile.write('\t'.join(sortedKeys) + '\n')
    outFile.writelines(newLines)
    outFile.close()
Example #22
def findPeaks(pType, cName=None):

    #init
    mConf = c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    if pType == 'E':
        predName = conf.conf['resultsExonsSorted']
    else:
        predName = conf.conf['resultsIntronsSorted']

    print predName
    #make CID:hairpin:peak dictionary
    cHairs = getHairpins.getHairpins(predName)
    peakDict = {}
    for CID in cHairs:
        peakDict[CID] = [cHairs[CID], 'None']

    timer = cg.cgTimer()
    timer.start()

    #put peaks in memory
    print 'Creating peak data'
    peaks = {}  # chr:peak:value
    for CID in cHairs:
        chrom, strand, start, end = cg.tccSplit(cHairs[CID])
        tcc = cHairs[CID]

        #init dictionary
        if chrom not in peaks:
            peaks[chrom] = {}

        if strand not in peaks[chrom]:
            peaks[chrom][strand] = {}

        #create peaks for tcc and add to peak dictionary
        stretch = cgPeaks.stretch(tcc, cName)
        stretch.createPeaks()
        for peakCoord in stretch.peaks:
            peaks[chrom][strand][peakCoord] = 0
    print timer.split()

    print 'finding best combos'
    bestCombos = []
    aPass = 0
    bPass = 0
    cPass = 0
    numT = 0
    for CID in peakDict:
        cgFlag = False
        if CID == '538': cgFlag = True

        tcc = peakDict[CID][0]
        #print tcc
        tccPeaks = []
        chrom = cg.ss(tcc, ':')[0]
        strand = cg.ss(tcc, ':')[1]
        start = int(cg.ss(tcc, ':')[2])
        end = int(cg.ss(tcc, ':')[3])

        #get all peaks
        for i in range(start, end + 1):
            if i in peaks[chrom][strand]:
                #print '  peak added', i
                tccPeaks.append(i)

        #Calculate parameters...
        pairStrings = []  #used to check if pair already added
        peakCombos = []
        for x in tccPeaks:

            #scan a 30 bp range around this point and find the best roof...
            pRange = 30
            rTcc = cg.makeTcc(chrom, strand, x, x + 1)

            #quickly get max value...kinda a long way to do it but whatever
            cProfile = stepVectorScan.profileAroundPoint(rTcc,
                                                         1,
                                                         cName,
                                                         ratio=False)
            xval = cProfile[0]
            max = xval
            highestValueCoord = x

            #now make profile for roof...
            cProfile = stepVectorScan.profileAroundPoint(rTcc,
                                                         pRange,
                                                         cName,
                                                         ratio=True)

            #now get highest stretch length and the rNext coord.
            minVal = .80
            highest = 0
            stretch = 0
            startCurrent = None
            startFinal = None
            endFinal = None
            for i in range(1 - pRange, pRange):
                if cProfile[i] > minVal:
                    stretch += 1
                    if startCurrent == None:
                        startCurrent = i
                else:
                    if stretch > 0:
                        if stretch > highest:  #stretch ended and was higher than previous
                            highest = stretch
                            endFinal = i - 1
                            startFinal = startCurrent
                            startCurrent = None
                        else:
                            startCurrent = None
                    stretch = 0

            #get +/- 4 value...
            val = [1.0, 1.0]
            if (startFinal) and (endFinal):
                low = startFinal - 4
                high = endFinal + 4
                if low > (1 - pRange):
                    if high < pRange:
                        val[0] = float(cProfile[startFinal - 4])
                        val[1] = float(cProfile[endFinal + 4])

            #fill in other details...
            y = 'S'
            dist = 'S'
            ratio = 'S'

            peakCombos.append([tcc, x, y, dist, ratio, max, highest, val])
            #print '  ', peakCombos[-1]

        #find best combo...
        topCombo = None
        for combo in peakCombos:
            roofLength = combo[6]
            dropValue = combo[7][0]
            if combo[7][1] > dropValue:
                dropValue = combo[7][1]

            #print roofLength, dropValue
            if 14 < roofLength < 26:
                if 0.0 < dropValue < 0.2:
                    #pick one with rooflength nearest 20:
                    if topCombo:
                        if (math.fabs(22 - roofLength)) < (
                                math.fabs(22 - topCombo[6])):
                            topCombo = combo
                    else:
                        topCombo = combo

        if topCombo:
            peakDict[CID][1] = topCombo
            bestCombos.append(topCombo)
            print bestCombos[-1]
        else:
            #print 'None'
            pass

    print timer.split()

    #now update predFile (SLOT 13)
    predFile = open(predName, 'r')
    newLines = []
    for line in predFile:
        CID = cg.ss(line)[7]
        if peakDict[CID][1] == 'None':
            peakInfo = 'None'
        else:
            peakInfo = '%s:%s:%s:%s:%s:%s' % (
                str(peakDict[CID][1][1])[-3:], 'S', str(
                    peakDict[CID][1][4]).split('.')[0], peakDict[CID][1][5],
                peakDict[CID][1][6], peakDict[CID][1][7])
        newLines.append(cg.appendToLine(line, peakInfo, 13))
    predFile.close()

    predFile = open(predName, 'w')
    predFile.writelines(newLines)
    predFile.close()
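### The "roof" search in findPeaks() looks for the longest run of profile
### positions whose ratio value stays above minVal.  A self-contained sketch of
### just that run-length scan, on a made-up profile dict keyed by offset:
def longestRoof(profile, pRange, minVal=0.80):
    highest, stretch = 0, 0
    startCurrent, startFinal, endFinal = None, None, None
    for i in range(1 - pRange, pRange):
        if profile.get(i, 0) > minVal:
            stretch += 1
            if startCurrent is None:
                startCurrent = i
        else:
            if stretch > highest:  #stretch ended and was higher than previous
                highest, startFinal, endFinal = stretch, startCurrent, i - 1
            startCurrent = None
            stretch = 0
    return highest, startFinal, endFinal

#usage sketch: offsets -2..3 sit above the 0.8 cutoff
profile = dict((i, 0.9 if -2 <= i <= 3 else 0.1) for i in range(-29, 30))
print longestRoof(profile, 30)   # -> (6, -2, 3)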
def updateReadDensity(tType, cName):
    #go through each chromosome's wig and check the mature seqs
    mainConf = cgConfig.cgConfig('Main.conf')
    conf = cgConfig.getConfig(cName)
    organism = conf.conf['organism']
    wigFolder = mainConf.conf['wig%s' % organism]
    newLines = []

    #Differentiate between exon or intron...
    if tType == 'E':
        pFileName = conf.conf['resultsExons']
    elif tType == 'I':
        pFileName = conf.conf['resultsIntrons']
    else:
        print 'READ UPDATE FAIL'

    print '  Updating Read Density:', tType

    #get read density for each line...
    print '  calculating hits for mature seqs'
    #calculate total hits per mature
    mirFile = open(pFileName, 'r')
    for line in mirFile:
        mTcc = line.strip().split('\t')[1]
        mirID = line.strip().split('\t')[0]

        tccStretch = cgPeaks.stretch(mTcc, cName)
        highestHit = 0
        for i in range(int(mTcc.split(':')[2]), int(mTcc.split(':')[3])):
            if i in tccStretch.profile:
                if tccStretch.profile[i] > highestHit:
                    highestHit = tccStretch.profile[i]

        newLines.append(cg.appendToLine(line, str(highestHit), 11))

    mirFile.close()

    print 'Writing New File'
    #write new results file
    outFile = open(pFileName, 'w')
    for line in newLines:
        outFile.write(line)
    outFile.close()

    ####NOW UPDATE HIGHEST HIT PER CLUSTER####

    clusterCount = {}

    pFile = open(pFileName, 'r')
    for line in pFile:
        predictionCount = int(line.strip().split('\t')[11])
        CID = line.strip().split('\t')[7]
        if CID in clusterCount:
            if clusterCount[CID] < predictionCount:
                clusterCount[CID] = predictionCount
        else:
            clusterCount[CID] = predictionCount
    pFile.close()

    #update the file --> cluster small count
    newLines = []
    predFile = open(pFileName, 'r')
    for line in predFile:
        CID = line.strip().split('\t')[7]
        numMax = clusterCount[CID]
        newLines.append(cg.appendToLine(line, str(numMax), 12))
    predFile.close()

    #sort newLines by clusterID
    sortDict = {}
    CIDs = []
    for line in newLines:
        CID = int(line.strip().split('\t')[7])
        if CID not in CIDs:
            CIDs.append(CID)
        if CID in sortDict:
            sortDict[CID].append(line)
        else:
            sortDict[CID] = [line]

    CIDs.sort()

    newLines = []
    for CID in CIDs:
        for line in sortDict[CID]:
            newLines.append(line)

    #write new File
    newFile = open(pFileName, 'w')
    for line in newLines:
        newFile.write(line)
    newFile.close()
def updateNoise(pType, cName=None):

    #init
    mainConf = c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    if pType == 'E':
        predName = conf.conf['resultsExons']
    else:
        predName = conf.conf['resultsIntrons']

    #populate cid: exon dist
    print 'Populating CID/intron/exon distribution data'
    if pType == 'E':
        noiseFN = conf.conf['exonNoiseData']
        f = open(noiseFN, 'r')
    else:
        noiseFN = conf.conf['intronNoiseData']
        f = open(noiseFN, 'r')

    exonDists = {}  #cid: [exon dist]
    header = f.readline()
    order = {}  # num:CID
    for i, CID in enumerate(header.strip().split('\t')):
        order[i] = CID
        exonDists[CID] = []

    for line in f:
        data = line.strip().split('\t')
        for i, dataPoint in enumerate(data):
            if dataPoint == 'NA' or dataPoint == '':
                continue
            else:
                dataPoint = float(dataPoint)
                CID = order[i]
                exonDists[CID].append(dataPoint)

    #get highest expression level for each cluster
    print 'Populating highest expression levels'
    predExpression = {}  # CID; highest level
    exonFile = open(predName, 'r')
    for line in exonFile:
        CID = line.strip().split('\t')[7]
        hDensity = line.strip().split('\t')[12]

        predExpression[CID] = hDensity

    #get pVals for each CID
    print 'Getting pvals for each cluster'
    pVals = {}  # CID; [lam,pVal]
    for CID in exonDists:
        if not len(exonDists[CID]) > 0:  #no data in 2kb range.
            lam = 'NA'
            pVal = 'NA'
        else:
            lam = cgStats.getLam(exonDists[CID])
            pVal = cgStats.getPValExp(predExpression[CID], lam)

        pVals[CID] = [
            lam, pVal
        ]  #lam gives a good approximation of noise levels in region...

    print 'Updating the file'
    #update file...
    predFile = open(predName, 'r')
    newLines = []
    for line in predFile:
        CID = line.split('\t')[7]
        newLine = cg.appendToLine(line, pVals[CID][0], 14)
        newLine = cg.appendToLine(newLine, pVals[CID][1], 15)
        newLines.append(newLine)
    predFile.close()

    predFile = open(predName, 'w')
    predFile.writelines(newLines)
    predFile.close()
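### updateNoise() leans on cgStats.getLam() and cgStats.getPValExp() to turn
### the background level distribution into a lambda and a p-value.  Those
### helpers are not in this listing; one plausible reading (an assumption, not
### the author's implementation) is a Poisson model: lambda = mean of the
### background levels, p-value = P(X >= observed) under Poisson(lambda).
import math

def getLamSketch(levels):
    return sum(levels) / float(len(levels))

def getPValExpSketch(observed, lam):
    observed = int(float(observed))
    #P(X >= observed) = 1 - sum_{k < observed} e^-lam * lam^k / k!
    cdf = sum(math.exp(-lam) * lam ** k / math.factorial(k)
              for k in range(observed))
    return 1.0 - cdf

print getPValExpSketch('12', getLamSketch([1, 2, 0, 3, 1]))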
import fastQTypes, cgConfig
import bioLibCG as cg
import subprocess
import os, time
import clusterCheck

#init
mainConf = cgConfig.cgConfig('Main.conf')
metaFileNames = [mainConf.conf['metaFileName']]
wrapperShell = '/home/chrisgre/scripts/smallRNAProcessing/clipAdapter.sh'

def clipAdapter(fName, adapter = None, validate = False, oName = None, overwrite = True):
	
	#Check to see if the file exists:
	putativeN = fName.replace('.fastq','.clipped.fastq')
	if os.path.isfile(putativeN):
		if overwrite:
			print '  Overwriting file', putativeN
			os.remove(putativeN)
		else:
			print '  \nNOT OVERWRITING FILE', putativeN
			return 1
			 
	#If the adapter is none, try to find it in the small.meta file
	if adapter is None:
		baseFName = cg.getBaseFileName(fName) + '.counts'
		for metaFileName in metaFileNames:
			mFile = open(metaFileName, 'r')
			for line in mFile:
				fields = line.strip().split('\t')
				if baseFName == fields[0]:
def findPeaks(pType, cName = None):
	
	#init
	mConf = c.cgConfig('Main.conf')
	conf = c.getConfig(cName)

	if pType == 'E':
		predName = conf.conf['resultsExonsSorted']
	else:
		predName = conf.conf['resultsIntronsSorted']
	
	print predName
	#make CID:hairpin:peak dictionary
	cHairs = getHairpins.getHairpins(predName)
	peakDict = {}
	for CID in cHairs:
		peakDict[CID] = [cHairs[CID],'None']
		

	timer = cg.cgTimer()
	timer.start()

	#put peaks in memory
	print 'Creating peak data'
	peaks = {} # chr:peak:value
	for CID in cHairs:
		chrom, strand, start, end = cg.tccSplit(cHairs[CID])
		tcc = cHairs[CID]
		
		#init dictionary
		if chrom not in peaks:
			peaks[chrom] = {}
		
		if strand not in peaks[chrom]:
			peaks[chrom][strand] = {}
		
		#create peaks for tcc and add to peak dictionary
		stretch = cgPeaks.stretch(tcc, cName)
		stretch.createPeaks()
		for peakCoord in stretch.peaks:
			peaks[chrom][strand][peakCoord] = 0
	print timer.split()

	print 'finding best combos'
	bestCombos = []
	aPass = 0
	bPass = 0
	cPass = 0
	numT = 0
	for CID in peakDict:
		cgFlag = False
		if CID == '538':cgFlag = True
		
		tcc = peakDict[CID][0]
		#print tcc
		tccPeaks = []
		chrom = cg.ss(tcc, ':')[0]
		strand = cg.ss(tcc, ':')[1]
		start = int(cg.ss(tcc, ':')[2])
		end = int(cg.ss(tcc, ':')[3])
		
		#get all peaks
		for i in range(start, end + 1):
			if i in peaks[chrom][strand]:
				#print '  peak added', i
				tccPeaks.append(i)
		
		#Calculate parameters...
		pairStrings = [] #used to check if pair already added
		peakCombos = []
		for x in tccPeaks:
				
								
				#scan a 30 bp range around this point and find the best roof...
				pRange = 30
				rTcc = cg.makeTcc(chrom, strand, x, x + 1)
				
				#quickly get max value...kinda a long way to do it but whatever
				cProfile = stepVectorScan.profileAroundPoint(rTcc, 1, cName, ratio = False)
				xval = cProfile[0]
				max = xval
				highestValueCoord = x
				
				#now make profile for roof...
				cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio = True)
				
				
				
				#now get highest stretch length and the rNext coord.
				minVal = .80
				highest = 0
				stretch = 0
				startCurrent = None
				startFinal = None
				endFinal = None
				for i in range(1 - pRange, pRange):
					if cProfile[i] > minVal:
						stretch += 1
						if startCurrent == None:
							startCurrent = i
					else:
						if stretch > 0:
							if stretch > highest: #stretch ended and was higher than previous
								highest = stretch
								endFinal = i - 1
								startFinal = startCurrent
								startCurrent = None
							else:
								startCurrent = None
						stretch = 0
				
				#get +/- 4 value...
				val = [1.0, 1.0]
				if (startFinal) and (endFinal):
					low = startFinal - 4
					high = endFinal + 4
					if low > (1 - pRange):
						if high < pRange:
							val[0] = float(cProfile[startFinal - 4])
							val[1] = float(cProfile[endFinal + 4])
				
				#fill in other details...
				y = 'S'
				dist = 'S'
				ratio = 'S'
				
				peakCombos.append([tcc,x,y,dist,ratio,max,highest,val])
				#print '  ', peakCombos[-1]
		
		#find best combo...
		topCombo = None
		for combo in peakCombos:
			roofLength = combo[6]
			dropValue = combo[7][0]
			if combo[7][1] > dropValue:
				dropValue = combo[7][1]
			
			#print roofLength, dropValue
			if 14 < roofLength < 26:
				if 0.0 < dropValue < 0.2:
					#pick one with rooflength nearest 20:
					if topCombo:
						if (math.fabs(22 - roofLength)) < (math.fabs(22 - topCombo[6])):
							topCombo = combo
					else:
						topCombo = combo
		
		if topCombo:
			peakDict[CID][1] = topCombo
			bestCombos.append(topCombo)
			print bestCombos[-1]
		else:
			#print 'None'
			pass

	print timer.split()


	#now update predFile (SLOT 13)
	predFile = open(predName, 'r')
	newLines = []
	for line in predFile:
		CID = cg.ss(line)[7]
		if peakDict[CID][1] == 'None':
			peakInfo = 'None'
		else:
			peakInfo = '%s:%s:%s:%s:%s:%s' % (str(peakDict[CID][1][1])[-3:], 'S', str(peakDict[CID][1][4]).split('.')[0], peakDict[CID][1][5],peakDict[CID][1][6], peakDict[CID][1][7])
		newLines.append(cg.appendToLine(line, peakInfo, 13))
	predFile.close()

	predFile = open(predName, 'w')
	predFile.writelines(newLines)
	predFile.close()
###This is the MAIN script to compare small RNA libraries to the mature microRNA sequences.  It first calls
##collectMatureFrames and then calls compareFinished to do the actual comparisons.

from bioLibCG import *
import os
from collectMF import *
from compareCounts import *
import cgConfig

#init
conf = cgConfig.cgConfig()
baseName = conf.conf['baseName']
merLength = conf.conf['merLength']
inDirectory = conf.conf['inDirectory']
smallPath = conf.conf['smallPath']

mainConf = cgConfig.cgConfig('Main.conf')
smallPath = mainConf.conf['smallPath']
merLength = int(merLength)

####Make directory for files to go into
filePath = 'out/%s' % baseName
if baseName in os.listdir('out/'):
    #Delete all contents
    for file in os.listdir(filePath):
        os.remove('%s/%s' % (filePath, file))
else:
    os.mkdir(filePath)

###Get ids and Xmer frames
collectMatureFrames(baseName, merLength)
def stageTWO(packetNumber = None):
	
	#Cast and defaults
	if not packetNumber:
		print 'need number of packet to run'
		return 1
	else:
		packetNumber = int(packetNumber)
		
		
		
	#####################################################
	#Load CONFIGURATION FILE:
	#####################################################

	conf = c.cgConfig()

	print '\nConfiguration:'
	for entry in conf.conf:
		print '..%s: %s' % (entry, conf.conf[entry])


	####################################################
	#Files and File Naming
	####################################################

	#scratchfile
	scratchFile = open('./scratch.txt', 'w') #This is just the file that the warnings for the perl script are redirected to
	
	#directories
	outDir = conf.conf['outDirectory']

	#Filenaming
	pipeName = conf.conf['runName'] + '-' + 's' + conf.conf['frameStep'] + 'k' + conf.conf['kmerLength'] + 'b' + conf.conf['mirBasePairs'] + '.' + str(packetNumber)
	foldOut = outDir + '/'  + pipeName + '.folded.frames.txt'
	filterOut = outDir + '/' + pipeName + '.filtered.mirs.tsv'
	finalOut = outDir + '/' + pipeName + '.FINAL.mirs.tsv'


	####################################################
	#Pipeline
	####################################################

	print "\nSTARTING STAGE THREE\n(packet # %s)" % packetNumber

				

	print "-Filtering frames and finding prospective mirs"
	subprocess.Popen(['python', './filter.frames.py',
					'-i', foldOut,
					'-g', conf.conf['genomes'],
					'-b', conf.conf['mirBasePairs'],
					'-m', conf.conf['mirLength'],
					'-o', filterOut]).wait()
	'''		
	print "-Running Conservation filter"
	subprocess.Popen(['perl', './get_percent_identity_list_fix.pl',
					'-g', conf.conf['genomes'],
					'-l', filterOut,
					'-o', finalOut], stderr = scratchFile).wait()
	'''
	
	print "DONE"
Example #29
#generate number of reads per prediction.

import bioLibCG as cg
import cgConfig

conf = cgConfig.cgConfig()
resultsFile = open(conf.conf['resultsSorted'], 'r')
resultsFile = open(conf.conf['onePerLine'], 'r')

#make header for R
print 'density'
for line in resultsFile:	
	print line.strip().split('\t')[12]
	
Example #30
#mirPipe using parallel CPUs
import subprocess
import time, os
import cgConfig as c



startTime = time.time()
#####################################################
#Load CONFIGURATION FILE:
#####################################################

conf = c.cgConfig()

print '\nConfiguration:'
for entry in conf.conf:
	print '..%s: %s' % (entry, conf.conf[entry])

####################################################
#Files and File Naming
####################################################

#scratchfile
scratchFile = open('./scratch.txt', 'w') #This is just the file that the warnings for the perl script are redirected to

#directories
outDir = conf.conf['outDirectory']

#Filenaming
pipeName = conf.conf['runName'] + '-' + 's' + conf.conf['frameStep'] + 'k' + conf.conf['kmerLength'] + 'b' + conf.conf['mirBasePairs']
conservedOut = outDir + '/' + pipeName + '.ALL.conserved.kmers.tsv'
Example #31
import cgConfig as c
import bioLibCG as cg
import cgSort

mConf = c.cgConfig('Main.conf')

smallPath = mConf.conf['smallPath']

smallPath = '/home/chrisgre/smallLibs/WIGS/zebrafish'
#grab everything - NOT WIG MERGES...
smallLibs = cg.recurseDir(smallPath, end = '.wig')

for lib in smallLibs:
	
	print 'sorting', lib
	cgSort.wigSort(lib)
Example #32
#mirPipe using parallel CPUs
import subprocess
import time, os
import cgConfig as c

startTime = time.time()
#####################################################
#Load CONFIGURATION FILE:
#####################################################

conf = c.cgConfig()

print '\nConfiguration:'
for entry in conf.conf:
    print '..%s: %s' % (entry, conf.conf[entry])

####################################################
#Files and File Naming
####################################################

#scratchfile
scratchFile = open(
    './scratch.txt', 'w'
)  #This is just the file that the warnings for the perl script are redirected to

#directories
outDir = conf.conf['outDirectory']

#Filenaming
pipeName = conf.conf['runName'] + '-' + 's' + conf.conf[
    'frameStep'] + 'k' + conf.conf['kmerLength'] + 'b' + conf.conf[
        'mirBasePairs']
conservedOut = outDir + '/' + pipeName + '.ALL.conserved.kmers.tsv'
Example #33
def stageTWO(packetNumber = None):
	
	#Cast and defaults
	if not packetNumber:
		print 'need number of packet to run'
		return 1
	else:
		packetNumber = int(packetNumber)
		
		
		
	#####################################################
	#Load CONFIGURATION FILE:
	#####################################################

	conf = c.cgConfig()
	'''
	print '\nConfiguration:'
	for entry in conf.conf:
		print '..%s: %s' % (entry, conf.conf.conf[entry])
		'''

	
	####################################################
	#Files and File Naming
	####################################################

	#scratchfile
	scratchFile = open('./scratch.txt', 'w') #This is just the file that the warnings for the perl script are redirected to
	
	#directories
	outDir = conf.conf['outDirectory']

	#Filenaming
	#!!!THIS IS DIFFERENT FROM MIRPIPEPARA!!! (pipeName is the name of the packet...)
	pipeName = conf.conf['runName'] + '-' + 's' + conf.conf['frameStep'] + 'k' + conf.conf['kmerLength'] + 'b' + conf.conf['mirBasePairs'] + '.' + str(packetNumber)
	extractOut = outDir + '/' + pipeName + '.folding.frames.tsv'
	splitDirectory = outDir + '/' + pipeName + '/'
	foldOut = outDir + '/' + pipeName + '.folded.frames.txt'

	####################################################
	#Pipeline
	####################################################

	print "\nSTARTING STAGE TWO\n(packet # %s)" % packetNumber

				
	if int(conf.conf['numSplitFiles']) == 1: #if you want to run on one node
		print '-Folding Frames using RNAfold on SINGLE NODE'
		subprocess.Popen(['./RNAfold.sh', extractOut, foldOut]).wait()
	else: #Else parallelize
		print '-Splitting folding frames into %s files (%s)' % (conf.conf['numSplitFiles'], packetNumber)
		subprocess.Popen(['python', './splitdb.py',
						'-i', extractOut,
						'-b', pipeName,
						'-n', conf.conf['numSplitFiles'],
						'-d', splitDirectory]).wait()
						
		print '-Submitting %s separate jobs to cluster (%s)' % (conf.conf['numSplitFiles'], packetNumber)
		subprocess.Popen(['python', './parFold.py',
						'-b', pipeName,
						'-n', conf.conf['numSplitFiles'],
						'-d', splitDirectory]).wait()

		#Get Job ID's:
		parseFile = open('%s.jobsinfo.txt' % pipeName, 'r') #!!! edit this?
		jobIDs = []
		for line in parseFile:
			jobIDs.append(line.split(' ')[2])
			
		#check if right number were submitted
		if len(jobIDs) == int(conf.conf['numSplitFiles']):
			print '....jobs were submitted correctly'

	
	print 'DONE (%s)' % packetNumber
Example #34
def createMultiTrack(dirName, organism):
	'''merge all mapped tracks in directory and create a single wig file'''
	mainConf = c.cgConfig('Main.conf')
	metaFileName = mainConf.conf['metaFileName']
	
	fileList = []
	for file in cg.recurseDir(dirName, end = '.mapped'):
						
		#check if mouse or human (should be put into a standard function for the meta file)
		baseFName = cg.getBaseFileName(file, naked= True)
		
		metaDict = cg.getMetaFileDict(metaFileName)
		
		org = 'None'
		if baseFName in metaDict:
			if metaDict[baseFName][1] == 'NONE':
				print '  NO ORG KNOWN FOR', file
				continue
			elif not metaDict[baseFName][1] == organism:
				print '  NOT ORGANISM RUNNING', file
				continue
			else:
				org = metaDict[baseFName][1]
				print '  USING ORG', org, file
			
		#check if there is an organism, must check due to files not in metaFile
		if org == 'None':
			print '  NO org (not in meta file)', file
			continue
		
		#only make wig file for organism asked for
		if not org == organism:
			continue
		
		#if it is right organism and has mapped file then add
		fileList.append(file)
	
	
	#make merged wig
	if organism == 'human':
		chroms = cg.humanChromosomes
		assembly = 'hg19'
	elif organism == 'mouse':
		chroms = cg.mouseChromosomes
		assembly = 'mm9'
	elif organism == 'zebrafish':
		chroms = cg.zebrafishChromosomes
		assembly = 'danRer6'
	
	print 'Making Bed File vectors'
	cvg = HTSeq.GenomicArray(chroms, stranded=True, typecode='i')
	for fName in fileList:
		alignment_file = HTSeq.BowtieReader(fName)
		for alngt in alignment_file:
			if alngt.aligned:
				cvg.add_value( 1, alngt.iv ) #iv is the genomic interval..

	bedNamePos = dirName + '/Merge.' + organism + '.1.wig'
	bedNameNeg = dirName + '/Merge.' + organism + '.-1.wig'
	
	print 'Writing Bed File'
	cvg.write_bedgraph_file(bedNamePos, "+" )
	cvg.write_bedgraph_file(bedNameNeg, "-" )

	#Now extend it
	updateWigLength(bedNamePos, assembly)
	updateWigLength(bedNameNeg, assembly)
	
	#Now Sort it.
	cgSort.wigSort(bedNamePos)
	cgSort.wigSort(bedNameNeg)