def collectMatureFrames(baseName = None, merLength = None): #Defaults, Definitions, and Castings if not baseName: print 'base name of file family needed!' return 1 if not merLength: merLength = 18 merLength = int(merLength) ###############collect IDS that have the same kmer print 'Collecting kmer IDs' conf = cgConfig.cgConfig() idFileName = conf.conf['resultsRaw'] idFile = open(idFileName, 'r') interFileName = ('./out/%s/' % baseName) + baseName + '.collection.intermediate' interFile = open(interFileName, 'w') doneList = [] for line in idFile: #For each kmer: grab id and Xmer frames --> output kmerID = line.strip().split('\t')[0].split('.')[0] if kmerID not in doneList: doneList.append(kmerID) matureSeq = line.strip().split('\t')[3] for frame in returnFrames(matureSeq, merLength): interFile.write('%s\t%s\n' % (kmerID, frame)) interFile.close() idFile.close()
def probe(tcc, conf = None): if not conf: mConf = c.cgConfig('Main.conf') smallPath = mConf.conf['smallPath'] chrom, strand, start, end = cg.tccSplit(tcc) total = 0 for lib in cg.recurseDir(smallPath, end = 'mapped.%s.wig' % strand): try: eLevels = stepVectorScan.scanVectorsFile(lib, [tcc]) except: print lib, 'index failed' continue #find highest expression level highest = 0 for coord in eLevels: if eLevels[coord] > highest: highest = eLevels[coord] if highest > 0: print lib, highest total += highest #print eLevels print total
def collectMatureFrames(baseName=None, merLength=None): #Defaults, Definitions, and Castings if not baseName: print 'base name of file family needed!' return 1 if not merLength: merLength = 18 merLength = int(merLength) ###############collect IDS that have the same kmer print 'Collecting kmer IDs' conf = cgConfig.cgConfig() idFileName = conf.conf['resultsRaw'] idFile = open(idFileName, 'r') interFileName = ('./out/%s/' % baseName) + baseName + '.collection.intermediate' interFile = open(interFileName, 'w') doneList = [] for line in idFile: #For each kmer: grab id and Xmer frames --> output kmerID = line.strip().split('\t')[0].split('.')[0] if kmerID not in doneList: doneList.append(kmerID) matureSeq = line.strip().split('\t')[3] for frame in returnFrames(matureSeq, merLength): interFile.write('%s\t%s\n' % (kmerID, frame)) interFile.close() idFile.close()
def stageTWO(packetNumber=None):
    # Filtering stage for one packet: runs filter.frames.py over this
    # packet's folded frames to produce the filtered mirs file.
    # NOTE(review): the banner below prints "STAGE THREE" although the
    # function is named stageTWO -- confirm which label is intended.
    #Caste and defaults
    if not packetNumber:
        print 'need number of packet to run'
        return 1
    else:
        packetNumber = int(packetNumber)

    #####################################################
    #Load CONFIGURATION FILE:
    #####################################################
    conf = c.cgConfig()
    print '\nConfiguration:'
    for entry in conf.conf:
        print '..%s: %s' % (entry, conf.conf[entry])

    ####################################################
    #Files and File Naming
    ####################################################
    #scratchfile
    scratchFile = open( './scratch.txt', 'w' )  #This is just the file that the warnings for the perl script are redirected to

    #directories
    outDir = conf.conf['outDirectory']

    #Filenaming: <runName>-s<frameStep>k<kmerLength>b<mirBasePairs>.<packet>
    pipeName = conf.conf['runName'] + '-' + 's' + conf.conf[
        'frameStep'] + 'k' + conf.conf['kmerLength'] + 'b' + conf.conf[
            'mirBasePairs'] + '.' + str(packetNumber)
    foldOut = outDir + '/' + pipeName + '.folded.frames.txt'
    filterOut = outDir + '/' + pipeName + '.filtered.mirs.tsv'
    finalOut = outDir + '/' + pipeName + '.FINAL.mirs.tsv'

    ####################################################
    #Pipeline
    ####################################################
    print "\nSTARTING STAGE THREE\n(packet # %s)" % packetNumber

    print "-Filtering frames and finding prospective mirs"
    subprocess.Popen([
        'python', './filter.frames.py', '-i', foldOut, '-g',
        conf.conf['genomes'], '-b', conf.conf['mirBasePairs'], '-m',
        conf.conf['mirLength'], '-o', filterOut
    ]).wait()
    # The conservation-filter step below is disabled (dead code kept as a
    # no-op string literal); scratchFile and finalOut are only used by it.
    '''
    print "-Running Conservation filter"
    subprocess.Popen(['perl', './get_percent_identity_list_fix.pl', '-g', conf.conf['genomes'], '-l', filterOut, '-o', finalOut], stderr = scratchFile).wait()
    '''
    print "DONE"
def probe(tcc, conf=None): if not conf: mConf = c.cgConfig('Main.conf') smallPath = mConf.conf['smallPath'] chrom, strand, start, end = cg.tccSplit(tcc) total = 0 for lib in cg.recurseDir(smallPath, end='mapped.%s.wig' % strand): try: eLevels = stepVectorScan.scanVectorsFile(lib, [tcc]) except: print lib, 'index failed' continue #find highest expression level highest = 0 for coord in eLevels: if eLevels[coord] > highest: highest = eLevels[coord] if highest > 0: print lib, highest total += highest #print eLevels print total
def splitExonsIntrons(cName = None): mConf = c.cgConfig('Main.conf') conf = c.getConfig(cName) #init organism = conf.conf['organism'] minOverlap = 50 cHairs = getHairpins.getHairpins() #CID: HAIRPIN exonList = compare.tccFileToList('%sExons.tcc' % organism, 0) hairpins = [] for CID in cHairs: hairpins.append(cHairs[CID]) print 'checking overlaps' #check which hairpins overlap exons and by how much exonOverlapped = compare.compareTwoTcc(hairpins, exonList, 1, amount = True) print ' ', len(exonOverlapped) print 'removing partial introns' #remove the ones that didn't overlap more than X: remList = [] for tcc, oAmount in exonOverlapped: if oAmount < minOverlap: remList.append([tcc, oAmount]) for item in remList: exonOverlapped.remove(item) print ' ', len(exonOverlapped), 'out of', len(cHairs.keys()) #get CIDs of exons exonCIDs = [] for tcc, oAmount in exonOverlapped: for CID in cHairs: if cHairs[CID] == tcc: exonCIDs.append(str(CID)) #Open sorted predictions and write lines with CIDs to respective files predFile = open(conf.conf['resultsSorted'], 'r') exonFile = open(conf.conf['resultsSorted'] + '.exons', 'w') intronFile = open(conf.conf['resultsSorted'] + '.introns', 'w') for line in predFile: if line.split('\t')[7] in exonCIDs: exonFile.write(line) else: intronFile.write(line) predFile.close() exonFile.close() intronFile.close()
def splitExonsIntrons(cName=None): mConf = c.cgConfig('Main.conf') conf = c.getConfig(cName) #init organism = conf.conf['organism'] minOverlap = 50 cHairs = getHairpins.getHairpins() #CID: HAIRPIN exonList = compare.tccFileToList('%sExons.tcc' % organism, 0) hairpins = [] for CID in cHairs: hairpins.append(cHairs[CID]) print 'checking overlaps' #check which hairpins overlap exons and by how much exonOverlapped = compare.compareTwoTcc(hairpins, exonList, 1, amount=True) print ' ', len(exonOverlapped) print 'removing partial introns' #remove the ones that didn't overlap more than X: remList = [] for tcc, oAmount in exonOverlapped: if oAmount < minOverlap: remList.append([tcc, oAmount]) for item in remList: exonOverlapped.remove(item) print ' ', len(exonOverlapped), 'out of', len(cHairs.keys()) #get CIDs of exons exonCIDs = [] for tcc, oAmount in exonOverlapped: for CID in cHairs: if cHairs[CID] == tcc: exonCIDs.append(str(CID)) #Open sorted predictions and write lines with CIDs to respective files predFile = open(conf.conf['resultsSorted'], 'r') exonFile = open(conf.conf['resultsSorted'] + '.exons', 'w') intronFile = open(conf.conf['resultsSorted'] + '.introns', 'w') for line in predFile: if line.split('\t')[7] in exonCIDs: exonFile.write(line) else: intronFile.write(line) predFile.close() exonFile.close() intronFile.close()
def createMultiTrackDir(dirName, organism): '''THIS DIFFERS FROM ABOVE BECAUSE IT DOESN't REQUIRE META INFO IT JUST MAKES A MERGED WIG FOR EVERYTHING IN THE DIRECTORY''' mainConf = c.cgConfig('Main.conf') fileList = [] for file in cg.recurseDir(dirName, end = '.mapped'): fileList.append(file) #make merged wig if organism == 'human': chroms = cg.humanChromosomes assembly = 'hg19' elif organism == 'mouse': chroms = cg.mouseChromosomes assembly = 'mm9' elif organism == 'zebrafish': chroms = cg.zebrafishChromosomes assembly = 'danRer6' print 'Making Bed File vectors' cvg = HTSeq.GenomicArray(chroms, stranded=True, typecode='i') for fName in fileList: alignment_file = HTSeq.BowtieReader(fName) for alngt in alignment_file: if alngt.aligned: cvg.add_value( 1, alngt.iv ) #iv is the genomic interval.. bedNamePos = dirName + '/Merge.' + organism + '.1.wig' bedNameNeg = dirName + '/Merge.' + organism + '.-1.wig' print 'Writing Bed File' cvg.write_bedgraph_file(bedNamePos, "+" ) cvg.write_bedgraph_file(bedNameNeg, "-" ) #Now extend it updateWigLength(bedNamePos, assembly) updateWigLength(bedNameNeg, assembly) #Now Sort it. cgSort.wigSort(bedNamePos) cgSort.wigSort(bedNameNeg)
def finish(packetNumber): #init conf = c.cgConfig() #directories outDir = conf.conf['outDirectory'] #Filenaming pipeName = conf.conf['runName'] + '-' + 's' + conf.conf['frameStep'] + 'k' + conf.conf['kmerLength'] + 'b' + conf.conf['mirBasePairs'] + '.' + str(packetNumber) extractOut = outDir + '/' + pipeName + '.folding.frames.tsv' splitDirectory = outDir + '/' + pipeName + '/' foldOut = outDir + '/' + pipeName + '.folded.frames.txt' print '-Stitching %s files back into one (%s)' % (conf.conf['numSplitFiles'], packetNumber) subprocess.Popen(['python', './stitchdb.py', '-b', pipeName, '-n', conf.conf['numSplitFiles'], '-o', outDir + '/', '-i', splitDirectory]).wait()
def queryNumJobsQ(user):
    '''Estimate the user's queued/running job count: dump `qstat -u user`
    into the scratch file and count the lines mentioning the user.'''
    cfg = cgConfig.cgConfig('Main.conf')
    scratchName = cfg.conf['qstatScratch']

    #dump qstat output into the scratch file
    outHandle = open(scratchName, 'w')
    subprocess.Popen(['qstat', '-u', user], stdout=outHandle).wait()
    outHandle.close()

    #count the lines in which the user name appears
    inHandle = open(scratchName, 'r')
    jobCount = sum(1 for qstatLine in inHandle if user in qstatLine)
    inHandle.close()

    return jobCount
def finish(packetNumber): #init conf = c.cgConfig() #directories outDir = conf.conf['outDirectory'] #Filenaming pipeName = conf.conf['runName'] + '-' + 's' + conf.conf[ 'frameStep'] + 'k' + conf.conf['kmerLength'] + 'b' + conf.conf[ 'mirBasePairs'] + '.' + str(packetNumber) extractOut = outDir + '/' + pipeName + '.folding.frames.tsv' splitDirectory = outDir + '/' + pipeName + '/' foldOut = outDir + '/' + pipeName + '.folded.frames.txt' print '-Stitching %s files back into one (%s)' % ( conf.conf['numSplitFiles'], packetNumber) subprocess.Popen([ 'python', './stitchdb.py', '-b', pipeName, '-n', conf.conf['numSplitFiles'], '-o', outDir + '/', '-i', splitDirectory ]).wait()
def createTrackInDir(dirName): '''Every Q function has a corresponding shell script Make wig file for all mapped files, for all organisms''' wrapperShell = '/home/chrisgre/scripts/mapping/createTrack.sh' mainConf = c.cgConfig('Main.conf') metaFileName = mainConf.conf['metaFileName'] for file in cg.recurseDir(dirName, end = '.mapped'): #check if mouse or human baseFName = cg.getBaseFileName(file) baseFName = baseFName.split('.')[0] metaDict = cg.getMetaFileDict(metaFileName) org = 'None' if baseFName in metaDict: if metaDict[baseFName][1] == 'NONE': print ' NO ORG KNOWN FOR', file continue else: org = metaDict[baseFName][1] print ' USING ORG', org, file #check if there is an organism, must check due to files not in metaFile if org == 'None': print ' NO org (not in meta file)', file continue while True: #submit job if there are less than ten if clusterCheck.queryNumJobsQ('chrisgre') < 1000: #subprocess.Popen(['qsub', '-q', 'xiao', '-l', 'mem=4G', '-V', '-o', mainConf.conf['outLog'], '-e', mainConf.conf['errorLog'], wrapperShell, file, org ]) subprocess.Popen(['qsub', '-V', '-o', mainConf.conf['outLog'], '-e', mainConf.conf['errorLog'], wrapperShell, file, org ]) #time.sleep(.5) #give it time to update qstat break else:#wait 10 secs... time.sleep(20)
def stageTWO(packetNumber=None):
    # Folding stage for one packet: folds the extracted frames with RNAfold,
    # either directly on this node (numSplitFiles == 1) or by splitting the
    # input and submitting one cluster job per split file.
    #Caste and defaults
    if not packetNumber:
        print 'need number of packet to run'
        return 1
    else:
        packetNumber = int(packetNumber)

    #####################################################
    #Load CONFIGURATION FILE:
    #####################################################
    conf = c.cgConfig()
    # Disabled config dump (kept as a no-op string literal); note it refers
    # to conf.conf.conf[entry], which would fail if re-enabled as-is.
    '''
    print '\nConfiguration:'
    for entry in conf.conf:
        print '..%s: %s' % (entry, conf.conf.conf[entry])
    '''

    ####################################################
    #Files and File Naming
    ####################################################
    #scratchfile
    scratchFile = open( './scratch.txt', 'w' )  #This is just the file that the warnings for the perl script are redirected to

    #directories
    outDir = conf.conf['outDirectory']

    #Filenaming
    #!!!THIS IS DIFFERENENT THAN MIRPIPEPARA!!! (pipename is the name of packet...)
    pipeName = conf.conf['runName'] + '-' + 's' + conf.conf[
        'frameStep'] + 'k' + conf.conf['kmerLength'] + 'b' + conf.conf[
            'mirBasePairs'] + '.' + str(packetNumber)
    extractOut = outDir + '/' + pipeName + '.folding.frames.tsv'
    splitDirectory = outDir + '/' + pipeName + '/'
    foldOut = outDir + '/' + pipeName + '.folded.frames.txt'

    ####################################################
    #Pipeline
    ####################################################
    print "\nSTARTING STAGE TWO\n(packet # %s)" % packetNumber

    if int(conf.conf['numSplitFiles']) == 1:  #if you want to run on one node
        print '-Folding Frames using RNAfold on SINGLE NODE'
        subprocess.Popen(['./RNAfold.sh', extractOut, foldOut]).wait()
    else:  #Else parallize
        print '-Splitting folding frames into %s files (%s)' % (
            conf.conf['numSplitFiles'], packetNumber)
        subprocess.Popen([
            'python', './splitdb.py', '-i', extractOut, '-b', pipeName, '-n',
            conf.conf['numSplitFiles'], '-d', splitDirectory
        ]).wait()

        print '-Submitting %s seperate jobs to cluster (%s)' % (
            conf.conf['numSplitFiles'], packetNumber)
        subprocess.Popen([
            'python', './parFold.py', '-b', pipeName, '-n',
            conf.conf['numSplitFiles'], '-d', splitDirectory
        ]).wait()

        #Get Job ID's: parFold is expected to have written <pipeName>.jobsinfo.txt
        # NOTE(review): parseFile is never closed -- confirm acceptable here.
        parseFile = open('%s.jobsinfo.txt' % pipeName, 'r')  #!!! edit this?
        jobIDs = []
        for line in parseFile:
            jobIDs.append(line.split(' ')[2])

        #check if right number were submitted
        if len(jobIDs) == int(conf.conf['numSplitFiles']):
            print '....jobs were submitted correctly'

    print 'DONE (%s)' % packetNumber
# get the hairpin values # can also reuse elements from this script to get CID elements... import bioLibCG as cg from bioLibCG import ss import cgConfig as c mConf = c.cgConfig("Main.conf") conf = c.cgConfig() def getHairpins(fN): predFile = open(fN, "r") # populate CID:hairpin range cHairs = {} for line in predFile: # get cluster ID CID = ss(line)[7] hairpin = ss(line)[2] if CID in cHairs: # check if the starts and ends need to be stretched hStart = int(ss(cHairs[CID], ":")[2]) hEnd = int(ss(cHairs[CID], ":")[3]) start = int(ss(hairpin, ":")[2]) end = int(ss(hairpin, ":")[3]) if start < hStart: hStart = start if end > hEnd:
def updateReadDensity(tType): #go through wig each chromosome and check the mature seqs mainConf = cgConfig.cgConfig('Main.conf') conf = cgConfig.cgConfig() organism = conf.conf['organism'] wigFolder = mainConf.conf['wig%s' % organism] newLines = [] if tType == 'E': pFileName = conf.conf['resultsExons'] elif tType == 'I': pFileName = conf.conf['resultsIntrons'] else: print 'READ UPDATE FAIL' print ' Updating Read Density:', tType for wigFileN in cg.recurseDir(wigFolder, end='.wig'): #init chrom = wigFileN.strip().split('.')[-2] strand = wigFileN.strip().split('.')[-4] wigFile = open(wigFileN, 'r') mirFile = open(pFileName, 'r') print wigFileN #get rid of header wigFile.readline() print ' populating hitmap' #populate hitmap wigMap = {} for line in wigFile: value = int(line.strip().split('\t')[3].split('.')[0]) if value > 0: start = int(line.strip().split('\t')[1]) end = int(line.strip().split('\t')[2]) for i in range(start, end): wigMap[i] = value wigFile.close() print ' calculating hits for mature seqs' #calculate total hits per mature for line in mirFile: mTcc = line.strip().split('\t')[1] mirID = line.strip().split('\t')[0] if (mTcc.split(':')[0] == chrom) and (mTcc.split(':')[1] == strand): #if mirID == '26477.30.106643972': print 'Starting Total Count' highestHit = 0 for i in range(int(mTcc.split(':')[2]), int(mTcc.split(':')[3])): #if mirID == '26477.30.106643972': print ' ', i if i in wigMap: if wigMap[i] > highestHit: highestHit = wigMap[i] #if mirID == '26477.30.106643972': print ' ', i, totalHits, wigMap[i] newLines.append(cg.appendToLine(line, str(highestHit), 11)) mirFile.close() print 'Writing New File' #write new results file outFile = open(pFileName, 'w') for line in newLines: outFile.write(line) outFile.close() ####NOW UPDATE HIGHEST HIT PER CLUSTER#### clusterCount = {} pFile = open(pFileName, 'r') for line in pFile: predictionCount = int(line.strip().split('\t')[11]) CID = line.strip().split('\t')[7] if CID in clusterCount: if clusterCount[CID] < 
predictionCount: clusterCount[CID] = predictionCount else: clusterCount[CID] = predictionCount pFile.close() #update the file --> cluster small count newLines = [] predFile = open(pFileName, 'r') for line in predFile: CID = line.strip().split('\t')[7] numMax = clusterCount[CID] newLines.append(cg.appendToLine(line, str(numMax), 12)) predFile.close() #sort newLines by clusterID sortDict = {} CIDs = [] for line in newLines: CID = int(line.strip().split('\t')[7]) if CID not in CIDs: CIDs.append(CID) if CID in sortDict: sortDict[CID].append(line) else: sortDict[CID] = [line] CIDs.sort() newLines = [] for CID in CIDs: for line in sortDict[CID]: newLines.append(line) #write new File newFile = open(pFileName, 'w') for line in newLines: newFile.write(line) newFile.close()
def updateReadDensity(tType): #go through wig each chromosome and check the mature seqs mainConf = cgConfig.cgConfig('Main.conf') conf = cgConfig.cgConfig() organism = conf.conf['organism'] wigFolder = mainConf.conf['wig%s' % organism] newLines = [] if tType == 'E': pFileName = conf.conf['resultsExons'] elif tType == 'I': pFileName = conf.conf['resultsIntrons'] else: print 'READ UPDATE FAIL' print ' Updating Read Density:', tType for wigFileN in cg.recurseDir(wigFolder, end = '.wig'): #init chrom = wigFileN.strip().split('.')[-2] strand = wigFileN.strip().split('.')[-4] wigFile = open(wigFileN, 'r') mirFile = open(pFileName, 'r') print wigFileN #get rid of header wigFile.readline() print ' populating hitmap' #populate hitmap wigMap = {} for line in wigFile: value = int(line.strip().split('\t')[3].split('.')[0]) if value > 0: start = int(line.strip().split('\t')[1]) end = int(line.strip().split('\t')[2]) for i in range(start, end): wigMap[i] = value wigFile.close() print ' calculating hits for mature seqs' #calculate total hits per mature for line in mirFile: mTcc = line.strip().split('\t')[1] mirID = line.strip().split('\t')[0] if (mTcc.split(':')[0] == chrom) and (mTcc.split(':')[1] == strand): #if mirID == '26477.30.106643972': print 'Starting Total Count' highestHit = 0 for i in range(int(mTcc.split(':')[2]), int(mTcc.split(':')[3])): #if mirID == '26477.30.106643972': print ' ', i if i in wigMap: if wigMap[i] > highestHit: highestHit = wigMap[i] #if mirID == '26477.30.106643972': print ' ', i, totalHits, wigMap[i] newLines.append(cg.appendToLine(line, str(highestHit), 11)) mirFile.close() print 'Writing New File' #write new results file outFile = open(pFileName, 'w') for line in newLines: outFile.write(line) outFile.close() ####NOW UPDATE HIGHEST HIT PER CLUSTER#### clusterCount = {} pFile = open(pFileName, 'r') for line in pFile: predictionCount = int(line.strip().split('\t')[11]) CID = line.strip().split('\t')[7] if CID in clusterCount: if clusterCount[CID] < 
predictionCount: clusterCount[CID] = predictionCount else: clusterCount[CID] = predictionCount pFile.close() #update the file --> cluster small count newLines = [] predFile = open(pFileName, 'r') for line in predFile: CID = line.strip().split('\t')[7] numMax = clusterCount[CID] newLines.append(cg.appendToLine(line, str(numMax), 12)) predFile.close() #sort newLines by clusterID sortDict = {} CIDs = [] for line in newLines: CID = int(line.strip().split('\t')[7]) if CID not in CIDs: CIDs.append(CID) if CID in sortDict: sortDict[CID].append(line) else: sortDict[CID] = [line] CIDs.sort() newLines = [] for CID in CIDs: for line in sortDict[CID]: newLines.append(line) #write new File newFile = open(pFileName, 'w') for line in newLines: newFile.write(line) newFile.close()
def exonNoisy(cName = None): #init mConf = c.cgConfig('Main.conf') conf = c.getConfig(cName) cHairs = getHairpins.getHairpins(conf.conf['resultsExons']) #CID: HAIRPIN organism = conf.conf['organism'] geneSetFolder = mConf.conf['geneSets%s' % organism] #make prediction overlap hitmap print 'Making prediction list' predList = [] for CID in cHairs: hPin = cHairs[CID] predList.append(hPin) if compare.checkIfOverlaps(predList): predList = compare.collapseOverlaps(predList) #make genes for Ensemble/make list of tccs for exons. print 'Creating gene set' ensGenes = cgGenes.createGeneSetFromFile(geneSetFolder + '/ensemblAllExons.tsv') print ' loaded # genes:', len(ensGenes.set) #collect levels for each haipin region print '[Checking all levels]' cidLevels = {} for CID in cHairs: print CID hPin = cHairs[CID] #for each hairpin, --> find overlapping transcripts in same gene overlappingGenes = ensGenes.geneOverlaps([hPin]) if len(overlappingGenes) > 0: gIDs = [gene.id for gene in overlappingGenes] allTccs = ensGenes.getTccsFromGIDs(gIDs) if compare.checkIfOverlaps: print ' Overlaps...collapsing' allTccs = compare.collapseOverlaps(allTccs) else: print 'NO GENE OVERLAPS!!!!!', CID, hPin #filter out my predictions. print ' Filtering out predictions' checkList = compare.subtractTwoTccLists(allTccs, predList) #Get Expression level for gene. 
print ' Retrieving Expression levels:', cg.getTccListTotalLength(checkList) levels = [] hPinLevels = stepVectorScan.scanVectorsHist(checkList, cName) for hPin in hPinLevels: levels.extend(hPinLevels[hPin]) cidLevels[CID] = levels #output levels to file print 'Outputting to file' #find longest longest = 0 for CID in cidLevels: length = len(cidLevels[CID]) if length > longest: longest = length sortedKeys = cidLevels.keys() sortedKeys.sort() #print sortedKeys newLines = [] for j in range(0, longest): #how many lines are there newLine = [] for CID in sortedKeys: if len(cidLevels[CID]) > j:# add it newLine.append(str(cidLevels[CID][j])) else: newLine.append('NA') newLines.append('\t'.join(newLine) + '\n') outFileN = conf.conf['exonNoiseData'] outFile = open(outFileN, 'w') outFile.write('\t'.join(sortedKeys) + '\n') outFile.writelines(newLines) outFile.close()
def intronNoisy(cName = None): mConf = c.cgConfig('Main.conf') conf = c.getConfig(cName) #init cHairs = getHairpins.getHairpins(conf.conf['resultsIntrons']) #CID: HAIRPIN organism = conf.conf['organism'] exonList = compare.tccFileToList('%sExons.tcc' % organism, 0) slide = 1000 #make prediction overlap hitmap predMap = {} predList = [] for CID in cHairs: hPin = cHairs[CID] predList.append(hPin) #collapse Overlaps print ' collapsing predictions' predList = compare.collapseOverlaps(predList) print ' collapsing exons' exonList = compare.collapseOverlaps(exonList) #collect levels for each hairpin region cidLevels = {} for CID in cHairs: print CID hPin = cHairs[CID] chrom = ss(hPin, ':')[0] strand = ss(hPin, ':')[1] start = int(ss(hPin, ':')[2]) end = int(ss(hPin, ':')[3]) scanStart = start - slide scanEnd = end + slide scanRange = [] scanRange.append('%s:%s:%s:%s' % (chrom, strand, scanStart, start)) scanRange.append('%s:%s:%s:%s' % (chrom, strand, end, scanEnd)) print scanRange scanRange = compare.subtractTwoTccLists(scanRange, predList) scanRange = compare.subtractTwoTccLists(scanRange, exonList) levels = [] print ' Retrieving Expression levels:', cg.getTccListTotalLength(scanRange) levels = [] hPinLevels = stepVectorScan.scanVectorsHist(scanRange, cName) for hPin in hPinLevels: levels.extend(hPinLevels[hPin]) cidLevels[CID] = levels #output levels to file #find longest longest = 0 for CID in cidLevels: length = len(cidLevels[CID]) if length > longest: longest = length sortedKeys = cidLevels.keys() sortedKeys.sort() newLines = [] for j in range(0, longest): #how many lines are there newLine = [] for CID in sortedKeys: if len(cidLevels[CID]) > j:# add it newLine.append(str(cidLevels[CID][j])) else: newLine.append('NA') newLines.append('\t'.join(newLine) + '\n') outFileN = conf.conf['intronNoiseData'] outFile = open(outFileN, 'w') outFile.write('\t'.join(sortedKeys) + '\n') outFile.writelines(newLines) outFile.close()
def updateReadDensity(tType, cName): #go through wig each chromosome and check the mature seqs mainConf = cgConfig.cgConfig('Main.conf') conf = cgConfig.getConfig(cName) organism = conf.conf['organism'] wigFolder = mainConf.conf['wig%s' % organism] newLines = [] #Differentiate between exon or intron... if tType == 'E': pFileName = conf.conf['resultsExons'] elif tType == 'I': pFileName = conf.conf['resultsIntrons'] else: print 'READ UPDATE FAIL' print ' Updating Read Density:', tType #get read density for each line... print ' calculating hits for mature seqs' #calculate total hits per mature mirFile = open(pFileName, 'r') for line in mirFile: mTcc = line.strip().split('\t')[1] mirID = line.strip().split('\t')[0] tccStretch = cgPeaks.stretch(mTcc, cName) highestHit = 0 for i in range(int(mTcc.split(':')[2]), int(mTcc.split(':')[3])): if i in tccStretch.profile: if tccStretch.profile[i] > highestHit: highestHit = tccStretch.profile[i] newLines.append(cg.appendToLine(line, str(highestHit), 11)) mirFile.close() print 'Writing New File' #write new results file outFile = open(pFileName, 'w') for line in newLines: outFile.write(line) outFile.close() ####NOW UPDATE HIGHEST HIT PER CLUSTER#### clusterCount = {} pFile = open(pFileName, 'r') for line in pFile: predictionCount = int(line.strip().split('\t')[11]) CID = line.strip().split('\t')[7] if CID in clusterCount: if clusterCount[CID] < predictionCount: clusterCount[CID] = predictionCount else: clusterCount[CID] = predictionCount pFile.close() #update the file --> cluster small count newLines = [] predFile = open(pFileName, 'r') for line in predFile: CID = line.strip().split('\t')[7] numMax = clusterCount[CID] newLines.append(cg.appendToLine(line, str(numMax), 12)) predFile.close() #sort newLines by clusterID sortDict = {} CIDs = [] for line in newLines: CID = int(line.strip().split('\t')[7]) if CID not in CIDs: CIDs.append(CID) if CID in sortDict: sortDict[CID].append(line) else: sortDict[CID] = [line] CIDs.sort() 
newLines = [] for CID in CIDs: for line in sortDict[CID]: newLines.append(line) #write new File newFile = open(pFileName, 'w') for line in newLines: newFile.write(line) newFile.close()
def intronNoisy(cName=None): mConf = c.cgConfig('Main.conf') conf = c.getConfig(cName) #init cHairs = getHairpins.getHairpins( conf.conf['resultsIntrons']) #CID: HAIRPIN organism = conf.conf['organism'] exonList = compare.tccFileToList('%sExons.tcc' % organism, 0) slide = 1000 #make prediction overlap hitmap predMap = {} predList = [] for CID in cHairs: hPin = cHairs[CID] predList.append(hPin) #collapse Overlaps print ' collapsing predictions' predList = compare.collapseOverlaps(predList) print ' collapsing exons' exonList = compare.collapseOverlaps(exonList) #collect levels for each hairpin region cidLevels = {} for CID in cHairs: print CID hPin = cHairs[CID] chrom = ss(hPin, ':')[0] strand = ss(hPin, ':')[1] start = int(ss(hPin, ':')[2]) end = int(ss(hPin, ':')[3]) scanStart = start - slide scanEnd = end + slide scanRange = [] scanRange.append('%s:%s:%s:%s' % (chrom, strand, scanStart, start)) scanRange.append('%s:%s:%s:%s' % (chrom, strand, end, scanEnd)) print scanRange scanRange = compare.subtractTwoTccLists(scanRange, predList) scanRange = compare.subtractTwoTccLists(scanRange, exonList) levels = [] print ' Retrieving Expression levels:', cg.getTccListTotalLength( scanRange) levels = [] hPinLevels = stepVectorScan.scanVectorsHist(scanRange, cName) for hPin in hPinLevels: levels.extend(hPinLevels[hPin]) cidLevels[CID] = levels #output levels to file #find longest longest = 0 for CID in cidLevels: length = len(cidLevels[CID]) if length > longest: longest = length sortedKeys = cidLevels.keys() sortedKeys.sort() newLines = [] for j in range(0, longest): #how many lines are there newLine = [] for CID in sortedKeys: if len(cidLevels[CID]) > j: # add it newLine.append(str(cidLevels[CID][j])) else: newLine.append('NA') newLines.append('\t'.join(newLine) + '\n') outFileN = conf.conf['intronNoiseData'] outFile = open(outFileN, 'w') outFile.write('\t'.join(sortedKeys) + '\n') outFile.writelines(newLines) outFile.close()
def findPeaks(pType, cName=None):
    # For every predicted cluster hairpin, search for a "roof"-shaped
    # expression peak (a plateau of near-max values around a peak point)
    # and record its parameters into slot 13 of the sorted prediction file.
    # pType: 'E' uses resultsExonsSorted, anything else resultsIntronsSorted.
    #init
    mConf = c.cgConfig('Main.conf')
    conf = c.getConfig(cName)
    if pType == 'E':
        predName = conf.conf['resultsExonsSorted']
    else:
        predName = conf.conf['resultsIntronsSorted']
    print predName

    #make CID:hairpin:peak dictionary
    cHairs = getHairpins.getHairpins(predName)
    peakDict = {}
    for CID in cHairs:
        peakDict[CID] = [cHairs[CID], 'None']

    timer = cg.cgTimer()
    timer.start()

    #put peaks in memory
    print 'Creating peak data'
    peaks = {}  # chr:peak:value
    for CID in cHairs:
        chrom, strand, start, end = cg.tccSplit(cHairs[CID])
        tcc = cHairs[CID]

        #init dictionary
        if chrom not in peaks:
            peaks[chrom] = {}
        if strand not in peaks[chrom]:
            peaks[chrom][strand] = {}

        #create peaks for tcc and add to peak dictionary
        stretch = cgPeaks.stretch(tcc, cName)
        stretch.createPeaks()
        for peakCoord in stretch.peaks:
            peaks[chrom][strand][peakCoord] = 0
    print timer.split()

    print 'finding best combos'
    bestCombos = []
    # aPass/bPass/cPass/numT appear to be leftover counters -- never updated.
    aPass = 0
    bPass = 0
    cPass = 0
    numT = 0
    for CID in peakDict:
        # cgFlag looks like a debugging switch for cluster '538'; it is set
        # but never read afterwards.
        cgFlag = False
        if CID == '538':
            cgFlag = True
        tcc = peakDict[CID][0]
        #print tcc
        tccPeaks = []
        chrom = cg.ss(tcc, ':')[0]
        strand = cg.ss(tcc, ':')[1]
        start = int(cg.ss(tcc, ':')[2])
        end = int(cg.ss(tcc, ':')[3])

        #get all peaks that fall inside this hairpin's span
        for i in range(start, end + 1):
            if i in peaks[chrom][strand]:
                #print ' peak added', i
                tccPeaks.append(i)

        #Calculate parameters...
        pairStrings = []  #used to check if pair already added
        peakCombos = []
        for x in tccPeaks:
            #scan a 30 bp range around this point and find the best roof...
            pRange = 30
            rTcc = cg.makeTcc(chrom, strand, x, x + 1)

            #quickly get max value...kinda a long way to do it but whatever
            cProfile = stepVectorScan.profileAroundPoint(rTcc, 1, cName,
                                                         ratio=False)
            xval = cProfile[0]
            # NOTE(review): 'max' shadows the builtin within this loop.
            max = xval
            highestValueCoord = x

            #now make profile for roof... (values normalized as ratios)
            cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName,
                                                         ratio=True)

            #now get highest stretch length and the rNext coord.
            #Scan offsets (1-pRange .. pRange-1) for the longest run of
            #profile values above minVal; record that run's start/end.
            minVal = .80
            highest = 0
            stretch = 0
            startCurrent = None
            startFinal = None
            endFinal = None
            for i in range(1 - pRange, pRange):
                if cProfile[i] > minVal:
                    stretch += 1
                    if startCurrent == None:
                        startCurrent = i
                else:
                    if stretch > 0:
                        if stretch > highest:
                            #stretch ended and was higher than previous
                            highest = stretch
                            endFinal = i - 1
                            startFinal = startCurrent
                            startCurrent = None
                        else:
                            startCurrent = None
                        stretch = 0

            #get +/- 4 value... (profile just outside each roof edge)
            # NOTE(review): a roof edge at offset 0 makes startFinal/endFinal
            # falsy and skips this block -- confirm that is intended.
            val = [1.0, 1.0]
            if (startFinal) and (endFinal):
                low = startFinal - 4
                high = endFinal + 4
                if low > (1 - pRange):
                    if high < pRange:
                        val[0] = float(cProfile[startFinal - 4])
                        val[1] = float(cProfile[endFinal + 4])

            #fill in other details... ('S' placeholders for single-peak mode)
            y = 'S'
            dist = 'S'
            ratio = 'S'
            peakCombos.append([tcc, x, y, dist, ratio, max, highest, val])
            #print ' ', peakCombos[-1]

        #find best combo: roof length in (14,26), edge drop-off in (0,0.2),
        #preferring the roof length closest to 22.
        topCombo = None
        for combo in peakCombos:
            roofLength = combo[6]
            dropValue = combo[7][0]
            if combo[7][1] > dropValue:
                dropValue = combo[7][1]
            #print roofLength, dropValue
            if 14 < roofLength < 26:
                if 0.0 < dropValue < 0.2:
                    #pick one with rooflength nearest 20:
                    if topCombo:
                        if (math.fabs(22 - roofLength)) < (math.fabs(22 - topCombo[6])):
                            topCombo = combo
                    else:
                        topCombo = combo

        if topCombo:
            peakDict[CID][1] = topCombo
            bestCombos.append(topCombo)
            print bestCombos[-1]
        else:
            #print 'None'
            pass
    print timer.split()

    #now update predFile (SLOT 13) with the winning combo's summary string
    predFile = open(predName, 'r')
    newLines = []
    for line in predFile:
        CID = cg.ss(line)[7]
        if peakDict[CID][1] == 'None':
            peakInfo = 'None'
        else:
            peakInfo = '%s:%s:%s:%s:%s:%s' % (
                str(peakDict[CID][1][1])[-3:], 'S',
                str(peakDict[CID][1][4]).split('.')[0], peakDict[CID][1][5],
                peakDict[CID][1][6], peakDict[CID][1][7])
        newLines.append(cg.appendToLine(line, peakInfo, 13))
    predFile.close()

    predFile = open(predName, 'w')
    predFile.writelines(newLines)
    predFile.close()
def updateNoise(pType, cName=None): #init mainConf = c.cgConfig('Main.conf') conf = c.getConfig(cName) if pType == 'E': predName = conf.conf['resultsExons'] else: predName = conf.conf['resultsIntrons'] #populate cid: exon dist print 'Populating CID/INtron/exon distribution data' if pType == 'E': noiseFN = conf.conf['exonNoiseData'] f = open(noiseFN, 'r') else: noiseFN = conf.conf['intronNoiseData'] f = open(noiseFN, 'r') exonDists = {} #cid: [exon dist] header = f.readline() order = {} # num:CID for i, CID in enumerate(header.strip().split('\t')): order[i] = CID exonDists[CID] = [] for line in f: data = line.strip().split('\t') for i, dataPoint in enumerate(data): if dataPoint == 'NA' or dataPoint == '': continue else: dataPoint = float(dataPoint) CID = order[i] exonDists[CID].append(dataPoint) #get highest expression level for each cluster print 'Populating highest expression levels' predExpression = {} # CID; highest level exonFile = open(predName, 'r') for line in exonFile: CID = line.strip().split('\t')[7] hDensity = line.strip().split('\t')[12] predExpression[CID] = hDensity #get pVals for each CID print 'Getting pvals for each cluster' pVals = {} # CID; [lam,pVal] for CID in exonDists: if not len(exonDists[CID]) > 0: #no data in 2kb range. lam = 'NA' pVal = 'NA' else: lam = cgStats.getLam(exonDists[CID]) pVal = cgStats.getPValExp(predExpression[CID], lam) pVals[CID] = [ lam, pVal ] #lam gives a good approximation of noise levels in region... print 'Updating the file' #update file... predFile = open(predName, 'r') newLines = [] for line in predFile: CID = line.split('\t')[7] newLine = cg.appendToLine(line, pVals[CID][0], 14) newLine = cg.appendToLine(newLine, pVals[CID][1], 15) newLines.append(newLine) predFile.close() predFile = open(predName, 'w') predFile.writelines(newLines) predFile.close()
import fastQTypes, cgConfig import bioLibCG as cg import subprocess import os, time import clusterCheck #init mainConf = cgConfig.cgConfig('Main.conf') metaFileNames = [mainConf.conf['metaFileName']] wrapperShell = '/home/chrisgre/scripts/smallRNAProcessing/clipAdapter.sh' def clipAdapter(fName, adapter = None, validate = False, oName = None, overwrite = True): #Check to see if the file exists: putativeN = fName.replace('.fastq','.clipped.fastq') if os.path.isfile(putativeN): if overwrite: print ' Overwriting file', putativeN os.remove(putativeN) else: print ' \nNOT OVERWRITING FILE', putativeN return 1 #If the adapter is none, try to find it in the small.meta file if adapter is None: baseFName = cg.getBaseFileName(fName) + '.counts' for metaFileName in metaFileNames: mFile = open(metaFileName, 'r') for line in mFile: fields = line.strip().split('\t') if baseFName == fields[0]:
def findPeaks(pType, cName = None): #init mConf = c.cgConfig('Main.conf') conf = c.getConfig(cName) if pType == 'E': predName = conf.conf['resultsExonsSorted'] else: predName = conf.conf['resultsIntronsSorted'] print predName #make CID:hairpin:peak dictionary cHairs = getHairpins.getHairpins(predName) peakDict = {} for CID in cHairs: peakDict[CID] = [cHairs[CID],'None'] timer = cg.cgTimer() timer.start() #put peaks in memory print 'Creating peak data' peaks = {} # chr:peak:value for CID in cHairs: chrom, strand, start, end = cg.tccSplit(cHairs[CID]) tcc = cHairs[CID] #init dictionary if chrom not in peaks: peaks[chrom] = {} if strand not in peaks[chrom]: peaks[chrom][strand] = {} #create peaks for tcc and add to peak dictionary stretch = cgPeaks.stretch(tcc, cName) stretch.createPeaks() for peakCoord in stretch.peaks: peaks[chrom][strand][peakCoord] = 0 print timer.split() print 'finding best combos' bestCombos = [] aPass = 0 bPass = 0 cPass = 0 numT = 0 for CID in peakDict: cgFlag = False if CID == '538':cgFlag = True tcc = peakDict[CID][0] #print tcc tccPeaks = [] chrom = cg.ss(tcc, ':')[0] strand = cg.ss(tcc, ':')[1] start = int(cg.ss(tcc, ':')[2]) end = int(cg.ss(tcc, ':')[3]) #get all peaks for i in range(start, end + 1): if i in peaks[chrom][strand]: #print ' peak added', i tccPeaks.append(i) #Calculate parameters... pairStrings = [] #used to check if pair already added peakCombos = [] for x in tccPeaks: #scan a 30 bp range around this point and find the best roof... pRange = 30 rTcc = cg.makeTcc(chrom, strand, x, x + 1) #quickly get max value...kinda a long way to do it but whatever cProfile = stepVectorScan.profileAroundPoint(rTcc, 1, cName, ratio = False) xval = cProfile[0] max = xval highestValueCoord = x #now make profile for roof... cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio = True) #now get highest stretch length and the rNext coord. 
minVal = .80 highest = 0 stretch = 0 startCurrent = None startFinal = None endFinal = None for i in range(1 - pRange, pRange): if cProfile[i] > minVal: stretch += 1 if startCurrent == None: startCurrent = i else: if stretch > 0: if stretch > highest: #stretch ended and was higher than previous highest = stretch endFinal = i - 1 startFinal = startCurrent startCurrent = None else: startCurrent = None stretch = 0 #get +/- 4 value... val = [1.0, 1.0] if (startFinal) and (endFinal): low = startFinal - 4 high = endFinal + 4 if low > (1 - pRange): if high < pRange: val[0] = float(cProfile[startFinal - 4]) val[1] = float(cProfile[endFinal + 4]) #fill in other details... y = 'S' dist = 'S' ratio = 'S' peakCombos.append([tcc,x,y,dist,ratio,max,highest,val]) #print ' ', peakCombos[-1] #find best combo... topCombo = None for combo in peakCombos: roofLength = combo[6] dropValue = combo[7][0] if combo[7][1] > dropValue: dropValue = combo[7][1] #print roofLength, dropValue if 14 < roofLength < 26: if 0.0 < dropValue < 0.2: #pick one with rooflength nearest 20: if topCombo: if (math.fabs(22 - roofLength)) < (math.fabs(22 - topCombo[6])): topCombo = combo else: topCombo = combo if topCombo: peakDict[CID][1] = topCombo bestCombos.append(topCombo) print bestCombos[-1] else: #print 'None' pass print timer.split() #now update predFile (SLOT 13) predFile = open(predName, 'r') newLines = [] for line in predFile: CID = cg.ss(line)[7] if peakDict[CID][1] == 'None': peakInfo = 'None' else: peakInfo = '%s:%s:%s:%s:%s:%s' % (str(peakDict[CID][1][1])[-3:], 'S', str(peakDict[CID][1][4]).split('.')[0], peakDict[CID][1][5],peakDict[CID][1][6], peakDict[CID][1][7]) newLines.append(cg.appendToLine(line, peakInfo, 13)) predFile.close() predFile = open(predName, 'w') predFile.writelines(newLines) predFile.close()
###This is the MAIN script to compare small RNA libraries to the mature microRNA sequences. It first calls
##collectMatureFrames and then calls compareFinished to do the actual comparisons.
from bioLibCG import *
import os
from collectMF import *
from compareCounts import *
import cgConfig

#init: pull run parameters from the run config, small-lib path from Main.conf
conf = cgConfig.cgConfig()
baseName = conf.conf['baseName']
merLength = conf.conf['merLength']
inDirectory = conf.conf['inDirectory']
smallPath = conf.conf['smallPath']  # NOTE(review): overwritten below; Main.conf value wins
mainConf = cgConfig.cgConfig('Main.conf')
smallPath = mainConf.conf['smallPath']
merLength = int(merLength)

####Make directory for files to go into (wipe it if it already exists)
filePath = 'out/%s' % baseName
if baseName in os.listdir('out/'):
    #Delete all contents
    for staleName in os.listdir(filePath):  # renamed from 'file' (shadowed builtin)
        os.remove('%s/%s' % (filePath, staleName))
else:
    os.mkdir(filePath)

###Get ids and Xmer frames
collectMatureFrames(baseName, merLength)
def stageTWO(packetNumber = None):
    '''Run the frame-filtering stage of the mir pipeline for one packet.

    packetNumber: required packet index (cast to int); returns 1 if missing.

    NOTE(review): the banner below prints "STARTING STAGE THREE" although the
    function is named stageTWO — one of the two is likely stale; confirm
    against the pipeline driver before renaming either.
    '''
    #Caste and defaults
    if not packetNumber:
        print 'need number of packet to run'
        return 1
    else:
        packetNumber = int(packetNumber)

    #####################################################
    #Load CONFIGURATION FILE:
    #####################################################
    conf = c.cgConfig()
    print '\nConfiguration:'
    for entry in conf.conf:
        print '..%s: %s' % (entry, conf.conf[entry])

    ####################################################
    #Files and File Naming
    ####################################################
    #scratchfile
    scratchFile = open('./scratch.txt', 'w') #This is just the file that the warnings for the perl script are redirected to

    #directories
    outDir = conf.conf['outDirectory']

    #Filenaming: runName-s<step>k<kmer>b<basepairs>.<packet>
    pipeName = conf.conf['runName'] + '-' + 's' + conf.conf['frameStep'] + 'k' + conf.conf['kmerLength'] + 'b' + conf.conf['mirBasePairs'] + '.' + str(packetNumber)
    foldOut = outDir + '/' + pipeName + '.folded.frames.txt'
    filterOut = outDir + '/' + pipeName + '.filtered.mirs.tsv'
    finalOut = outDir + '/' + pipeName + '.FINAL.mirs.tsv'

    ####################################################
    #Pipeline
    ####################################################
    print "\nSTARTING STAGE THREE\n(packet # %s)" % packetNumber

    print "-Filtering frames and finding prospective mirs"
    subprocess.Popen(['python', './filter.frames.py', '-i', foldOut, '-g', conf.conf['genomes'], '-b', conf.conf['mirBasePairs'], '-m', conf.conf['mirLength'], '-o', filterOut]).wait()

    #conservation filter currently disabled (dead block kept verbatim)
    '''
    print "-Running Conservation filter"
    subprocess.Popen(['perl', './get_percent_identity_list_fix.pl', '-g', conf.conf['genomes'], '-l', filterOut, '-o', finalOut], stderr = scratchFile).wait()
    '''
    print "DONE"
#generate number of reads per prediction. import bioLibCG as cg import cgConfig conf = cgConfig.cgConfig() resultsFile = open(conf.conf['resultsSorted'], 'r') resultsFile = open(conf.conf['onePerLine'], 'r') #make header for R print 'density' for line in resultsFile: print line.strip().split('\t')[12]
#mirPipe using parrallel cpu's import subprocess import time, os import cgConfig as c startTime = time.time() ##################################################### #Load CONFIGURATION FILE: ##################################################### conf = c.cgConfig() print '\nConfiguration:' for entry in conf.conf: print '..%s: %s' % (entry, conf.conf[entry]) #################################################### #Files and File Naming #################################################### #scratchfile scratchFile = open('./scratch.txt', 'w') #This is just the file that the warnings for the perl script are redirected to #directories outDir = conf.conf['outDirectory'] #Filenaming pipeName = conf.conf['runName'] + '-' + 's' + conf.conf['frameStep'] + 'k' + conf.conf['kmerLength'] + 'b' + conf.conf['mirBasePairs'] conservedOut = outDir + '/' + pipeName + '.ALL.conserved.kmers.tsv'
import cgConfig as c import bioLibCG as cg import cgSort mConf = c.cgConfig('Main.conf') smallPath = mConf.conf['smallPath'] smallPath = '/home/chrisgre/smallLibs/WIGS/zebrafish' #grab everything - NOT WIG MERGES... smallLibs = cg.recurseDir(smallPath, end = '.wig') smallLibs.extend(cg.recurseDir(smallPath, end = '.wig')) for lib in smallLibs: print 'sorting', lib cgSort.wigSort(lib)
#mirPipe using parrallel cpu's import subprocess import time, os import cgConfig as c startTime = time.time() ##################################################### #Load CONFIGURATION FILE: ##################################################### conf = c.cgConfig() print '\nConfiguration:' for entry in conf.conf: print '..%s: %s' % (entry, conf.conf[entry]) #################################################### #Files and File Naming #################################################### #scratchfile scratchFile = open( './scratch.txt', 'w' ) #This is just the file that the warnings for the perl script are redirected to #directories outDir = conf.conf['outDirectory'] #Filenaming pipeName = conf.conf['runName'] + '-' + 's' + conf.conf[ 'frameStep'] + 'k' + conf.conf['kmerLength'] + 'b' + conf.conf[
def stageTWO(packetNumber = None):
    '''Fold the extracted frames for one packet, either on a single node or
    split across cluster jobs.

    packetNumber: required packet index (cast to int); returns 1 if missing.
    Side effects: opens ./scratch.txt for writing, launches RNAfold or the
    split/parFold helper scripts, and (parallel path) parses the job-info
    file to sanity-check the number of submitted jobs.
    '''
    #Caste and defaults
    if not packetNumber:
        print 'need number of packet to run'
        return 1
    else:
        packetNumber = int(packetNumber)

    #####################################################
    #Load CONFIGURATION FILE:
    #####################################################
    conf = c.cgConfig()
    #config echo currently disabled (dead block kept verbatim)
    '''
    print '\nConfiguration:'
    for entry in conf.conf:
        print '..%s: %s' % (entry, conf.conf.conf[entry])
    '''

    ####################################################
    #Files and File Naming
    ####################################################
    #scratchfile
    scratchFile = open('./scratch.txt', 'w') #This is just the file that the warnings for the perl script are redirected to

    #directories
    outDir = conf.conf['outDirectory']

    #Filenaming
    #!!!THIS IS DIFFERENENT THAN MIRPIPEPARA!!! (pipename is the name of packet...)
    pipeName = conf.conf['runName'] + '-' + 's' + conf.conf['frameStep'] + 'k' + conf.conf['kmerLength'] + 'b' + conf.conf['mirBasePairs'] + '.' + str(packetNumber)
    extractOut = outDir + '/' + pipeName + '.folding.frames.tsv'
    splitDirectory = outDir + '/' + pipeName + '/'
    foldOut = outDir + '/' + pipeName + '.folded.frames.txt'

    ####################################################
    #Pipeline
    ####################################################
    print "\nSTARTING STAGE TWO\n(packet # %s)" % packetNumber

    if int(conf.conf['numSplitFiles']) == 1: #if you want to run on one node
        print '-Folding Frames using RNAfold on SINGLE NODE'
        subprocess.Popen(['./RNAfold.sh', extractOut, foldOut]).wait()
    else: #Else parallize
        print '-Splitting folding frames into %s files (%s)' % (conf.conf['numSplitFiles'], packetNumber)
        subprocess.Popen(['python', './splitdb.py', '-i', extractOut, '-b', pipeName, '-n', conf.conf['numSplitFiles'], '-d', splitDirectory]).wait()

        print '-Submitting %s seperate jobs to cluster (%s)' % (conf.conf['numSplitFiles'], packetNumber)
        subprocess.Popen(['python', './parFold.py', '-b', pipeName, '-n', conf.conf['numSplitFiles'], '-d', splitDirectory]).wait()

        #Get Job ID's: third space-separated token of each jobsinfo line
        # NOTE(review): job-ID check reconstructed as part of the parallel
        # branch (it reads the file parFold just produced) — confirm indent.
        parseFile = open('%s.jobsinfo.txt' % pipeName, 'r') #!!! edit this?
        jobIDs = []
        for line in parseFile:
            jobIDs.append(line.split(' ')[2])

        #check if right number were submitted
        if len(jobIDs) == int(conf.conf['numSplitFiles']):
            print '....jobs were submitted correctly'

    print 'DONE (%s)' % packetNumber
def createMultiTrack(dirName, organism): '''merge all mapped tracks in directory and create a single wig file''' mainConf = c.cgConfig('Main.conf') metaFileName = mainConf.conf['metaFileName'] fileList = [] for file in cg.recurseDir(dirName, end = '.mapped'): #check if mouse or human SHOULD PUT INTO A STD FUNCTION FOR META FILE #check if mouse or human baseFName = cg.getBaseFileName(file, naked= True) metaDict = cg.getMetaFileDict(metaFileName) org = 'None' if baseFName in metaDict: if metaDict[baseFName][1] == 'NONE': print ' NO ORG KNOWN FOR', file continue elif not metaDict[baseFName][1] == organism: print ' NOT ORGANISM RUNNING', file continue else: org = metaDict[baseFName][1] print ' USING ORG', org, file #check if there is an organism, must check due to files not in metaFile if org == 'None': print ' NO org (not in meta file)', file continue #only make wig file for organism asked for if not org == organism: continue #if it is right organism and has mapped file then add fileList.append(file) #make merged wig if organism == 'human': chroms = cg.humanChromosomes assembly = 'hg19' elif organism == 'mouse': chroms = cg.mouseChromosomes assembly = 'mm9' elif organism == 'zebrafish': chroms = cg.zebrafishChromosomes assembly = 'danRer6' print 'Making Bed File vectors' cvg = HTSeq.GenomicArray(chroms, stranded=True, typecode='i') for fName in fileList: alignment_file = HTSeq.BowtieReader(fName) for alngt in alignment_file: if alngt.aligned: cvg.add_value( 1, alngt.iv ) #iv is the genomic interval.. bedNamePos = dirName + '/Merge.' + organism + '.1.wig' bedNameNeg = dirName + '/Merge.' + organism + '.-1.wig' print 'Writing Bed File' cvg.write_bedgraph_file(bedNamePos, "+" ) cvg.write_bedgraph_file(bedNameNeg, "-" ) #Now extend it updateWigLength(bedNamePos, assembly) updateWigLength(bedNameNeg, assembly) #Now Sort it. cgSort.wigSort(bedNamePos) cgSort.wigSort(bedNameNeg)