def mapFastQ(fName, organism):
    '''Map a FASTQ file with bowtie against the index of the given organism.

    The alignments are written to fName + '.mapped'.  Bowtie's stdout and
    stderr are redirected to files whose names are the configured
    outLog / errorLog prefixes followed by the base name of fName.

    fName    -- path of the FASTQ file to map
    organism -- 'human', 'mouse' or 'zebrafish'

    Raises ValueError for any other organism, before any file is created.
    '''
    indexByOrganism = {
        'human': '/home/chrisgre/indexes/bowtie/hg19',
        'mouse': '/home/chrisgre/indexes/bowtie/mm9',
        'zebrafish': '/home/chrisgre/indexes/bowtie/danRer6',
    }
    if organism not in indexByOrganism:
        raise ValueError('no bowtie index known for organism %r' % (organism,))
    indexName = indexByOrganism[organism]

    outName = fName + '.mapped'
    baseName = cg.getBaseFileName(fName)

    logFile = open(mainConf.conf['outLog'] + baseName, 'w')
    try:
        errorFile = open(mainConf.conf['errorLog'] + baseName, 'w')
        try:
            # 'Sa' files use a phred+33 quality offset, everything else is
            # mapped as phred+64.
            if fastQTypes.getFastQType(fName, quick=True) == 'Sa':
                print('Mapping with 33 phred offset')
                qualFlag = '--phred33-quals'
            else:
                print('Mapping with 64 phred offset')
                qualFlag = '--phred64-quals'

            subprocess.Popen(
                ['bowtie', qualFlag, '-k', '20', '-m', '20', '-p', '1',
                 indexName, fName, outName],
                stdout=logFile, stderr=errorFile).wait()
        finally:
            # close the logs even if bowtie could not be started
            errorFile.close()
    finally:
        logFile.close()
def testmerge(masterDir, parDir):
    '''Merge the objects of every run directory into the master directory.

    The master directory will contain the merged objects, the slave
    directories are found below the directory holding all the runs:

        oRNA                (master)
        aDir                (master)
        pRuns
        --run.00
        ----oRNA            (slave: pRuns/run.00/oRNA)
        ----aDir
        --run.01

    masterDir -- directory whose objects receive the merge
    parDir    -- directory that is searched for the per-run copies
    '''
    masterController = cgDB.dataController(masterDir, cgAlignment.cgAlignment)
    merged = masterController.load()

    #recurse through all the runs; slaves are selected by passing the
    #master's base name as the `end` filter of recursePaths
    masterBaseName = bioLibCG.getBaseFileName(masterDir)
    slaveDirs = bioLibCG.recursePaths(parDir, end=masterBaseName)

    for slaveDir in slaveDirs:
        slaveController = cgDB.dataController(slaveDir, cgAlignment.cgAlignment)
        slaveObjs = slaveController.load()
        merged = cgDB.mergeTwoObjects(merged, slaveObjs, cgOriginRNA.OriginRNA)

    masterController.commit(merged)
def makeWig(fN, assembly, format=None, name=None): '''format assumes bowtie suitible for medium mapped files. takes longer.''' #assume bowtie if not format: format = 'Bowtie' parserFunction = returnParserFunction(format) if not name: name = cg.getBaseFileName(fN, naked=True) lDict = cg.returnChromLengthDict(assembly) for chrom in lDict: if not chrom in cg.acceptableChroms: continue for strand in ['1', '-1']: f = open(fN, 'r') #create hitmap of chrom and strand print chrom, strand, 'hitmap' hitDict = {} for line in f: lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line)) lStrand = str(lStrand) start = int(start) end = int(end) if chrom == lChrom and strand == lStrand: for i in range(start, end + 1): try: hitDict[i] += 1 except KeyError: hitDict[i] = 1 #write results to wig file writeWigFromHitDict(hitDict, assembly)
def testmerge(masterDir, parDir):
    '''Fold the objects of all runs into the objects of the master directory.

    Layout the function works on (master directory holds the merged
    objects, slave directories hold the objects of the single runs):

        oRNA                (master)
        aDir                (master)
        pRuns
        --run.00
        ----oRNA            (slave: pRuns/run.00/oRNA)
        ----aDir
        --run.01

    masterDir -- master directory; its objects are loaded, merged, committed
    parDir    -- parent directory of the runs
    '''
    controller = cgDB.dataController(masterDir, cgAlignment.cgAlignment)
    id_merged = controller.load()

    #recurse through all the runs, keeping only the paths that
    #recursePaths returns for end=<base name of the master directory>
    baseName = bioLibCG.getBaseFileName(masterDir)
    for runDir in bioLibCG.recursePaths(parDir, end=baseName):
        id_run = cgDB.dataController(runDir, cgAlignment.cgAlignment).load()
        id_merged = cgDB.mergeTwoObjects(id_merged, id_run, cgOriginRNA.OriginRNA)

    # a single commit once every run has been merged in
    controller.commit(id_merged)
def makeWig(fN, assembly, format = None, name = None): '''format assumes bowtie suitible for medium mapped files. takes longer.''' #assume bowtie if not format: format = 'Bowtie' parserFunction = returnParserFunction(format) if not name: name = cg.getBaseFileName(fN, naked = True) lDict = cg.returnChromLengthDict(assembly) for chrom in lDict: if not chrom in cg.acceptableChroms: continue for strand in ['1', '-1']: f = open(fN, 'r') #create hitmap of chrom and strand print chrom, strand, 'hitmap' hitDict = {} for line in f: lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line)) lStrand = str(lStrand) start = int(start) end = int(end) if chrom == lChrom and strand == lStrand: for i in range(start, end + 1): try: hitDict[i] += 1 except KeyError: hitDict[i] = 1 #write results to wig file writeWigFromHitDict(hitDict, assembly)
def writeWigFromHitDict(hitDict, assembly, name, directory=None):
    '''Write one bedGraph-style wig file per chromosome/strand of hitDict.

    hitDict   -- {chrom: {strand: {coord: value}}}
    assembly  -- assembly name handed to cg.returnChromLengthDict
    name      -- prefix of the file names and track names
    directory -- output directory; the 'wigs' entry of Main.conf if omitted

    Each file is <directory>/<name>.<chrom>.<strand>.wig.  Adjacent
    coordinates with the same value are merged into one block and gaps
    between covered coordinates are written as explicit zero blocks.

    Side effect: a zero entry keyed by the chromosome length is stored in
    every hitDict[chrom][strand], i.e. the caller's dict is modified.
    '''
    mConf = c.getConfig('Main.conf')
    if not directory:
        directory = mConf.conf['wigs']
    if not name:
        # NOTE(review): this derives the name from the (empty) name itself;
        # there is no file name available here -- confirm what was intended.
        name = cg.getBaseFileName(name, naked=True)

    lDict = cg.returnChromLengthDict(assembly)
    cg.clearDirectory(directory, overwrite=False)

    blockLine = '%s\t%s\t%s\t%s\n'

    #write results to wig file
    for chrom in hitDict:
        for strand in hitDict[chrom]:
            coverage = hitDict[chrom][strand]

            # Sentinel at the chromosome end: a block is only written when
            # the *next* key is reached, so without it the final block would
            # never be flushed.  It also extends the track to the chrom end.
            chromEnd = lDict[chrom]
            coverage[chromEnd] = 0

            oF = open(directory + '/%s.%s.%s.wig' % (name, chrom, strand), 'w')
            try:
                oF.write('track type=bedGraph name=%s.%s.%s\n' % (name, chrom, strand))

                prevVal = 0
                prevCoord = 0
                blockStart = 0
                blockEnd = 1
                for key in sorted(coverage):
                    val = coverage[key]
                    adjacent = (prevCoord == key - 1)

                    if adjacent and val == prevVal:
                        #should be combined
                        blockEnd = key + 1
                    else:
                        #write old block
                        oF.write(blockLine % (chrom, blockStart, blockEnd, prevVal)) #!make it a float value?
                        if not adjacent:
                            #write zero block
                            oF.write(blockLine % (chrom, blockEnd, key, 0))
                        #start new block
                        blockStart = key
                        blockEnd = key + 1

                    prevVal = val
                    prevCoord = key
            finally:
                # do not leak the handle if a write fails
                oF.close()
def mixWig(directory, assembly, name=None): '''Does it by chromosome --> faster, less memory''' if not name: name = 'Merge' #gather all chromosomes chromList = [] for fN in cg.recurseDir(directory, end='.wig'): chrom = cg.getBaseFileName(fN).strip().split('.')[-3] if chrom not in chromList: chromList.append(chrom) print chromList for chrom in chromList: print chrom #Gather all the values from all the files hitDict = {} # chrom : { strand : coord for fN in cg.recurseDir(directory, end='.wig'): fChrom = cg.getBaseFileName(fN).strip().split('.')[-3] if fChrom != chrom: continue print ' ', fN, fChrom f = open(fN, 'r') f.readline() #header strand = cg.getBaseFileName(fN).strip().split('.')[-2] for line in f: lChrom, start, end, val = (line.strip().split('\t')) start, end, val = int(start), int(end), int(val) if val < 1: continue #print start, end, val for i in range(start, end): try: hitDict[lChrom][strand][i] += val except (KeyError, TypeError): if not lChrom in hitDict: hitDict[lChrom] = {} if not strand in hitDict[lChrom]: hitDict[lChrom][strand] = {} hitDict[lChrom][strand][i] = val #write results to wig file writeWigFromHitDict(hitDict, assembly, name, directory)
def mixWig(directory, assembly, name = None): '''Does it by chromosome --> faster, less memory''' if not name: name = 'Merge' #gather all chromosomes chromList = [] for fN in cg.recurseDir(directory, end = '.wig'): chrom = cg.getBaseFileName(fN).strip().split('.')[-3] if chrom not in chromList: chromList.append(chrom) print chromList for chrom in chromList: print chrom #Gather all the values from all the files hitDict = {} # chrom : { strand : coord for fN in cg.recurseDir(directory, end = '.wig'): fChrom = cg.getBaseFileName(fN).strip().split('.')[-3] if fChrom != chrom: continue print ' ', fN, fChrom f = open(fN, 'r') f.readline() #header strand = cg.getBaseFileName(fN).strip().split('.')[-2] for line in f: lChrom, start, end, val = (line.strip().split('\t')) start, end, val = int(start), int(end), int(val) if val < 1: continue #print start, end, val for i in range(start, end): try: hitDict[lChrom][strand][i] += val except (KeyError,TypeError): if not lChrom in hitDict: hitDict[lChrom] = {} if not strand in hitDict[lChrom]: hitDict[lChrom][strand] = {} hitDict[lChrom][strand][i] = val #write results to wig file writeWigFromHitDict(hitDict, assembly, name, directory)
def makeWigMem(fN, assembly, format = None, name = None, directory = None):
    '''Build a degradome-style wig (one position per read) fully in memory.

    Suitable for small mapped files: the complete hit map is kept in memory
    and handed to writeWigFromHitDict in one go.

    fN        -- path of the mapped-read file; its first line is always
                 skipped as a header
    assembly  -- assembly name handed on to writeWigFromHitDict
    format    -- parser name for returnParserFunction; 'Bowtie' if omitted
    name      -- prefix of the wig files; base name of fN if omitted
    directory -- output directory, handed on to writeWigFromHitDict

    Each read on an acceptable chromosome counts once: at start + 20 for
    strand '1' and at start for every other strand.
    '''
    if not name:
        name = cg.getBaseFileName(fN, naked = True)
    if not format:
        format = 'Bowtie'
    parserFunction = returnParserFunction(format)

    #create hitmap of chrom and strand
    hitDict = {} #format = chr: { strand : { coord : value

    f = open(fN, 'r')
    try:
        f.readline() #header...file might not have one but its one read...
        for line in f:
            try:
                lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line))
            except AttributeError:
                # lines for which parsing raises AttributeError are skipped
                continue
            lStrand = str(lStrand)
            start = int(start)
            end = int(end)  # unused below, but still rejects malformed coordinates

            if lChrom not in cg.acceptableChroms:
                continue

            #wig for degradome
            if lStrand == '1':
                i = start + 20
            else:
                i = start

            strandHits = hitDict.setdefault(lChrom, {}).setdefault(lStrand, {})
            strandHits[i] = strandHits.get(i, 0) + 1
    finally:
        # close the input even when a line cannot be parsed
        f.close()

    #write results to wig file
    writeWigFromHitDict(hitDict, assembly, name, directory)
def makeWigMem(fN, assembly, format=None, name=None, directory=None):
    '''Build a degradome-style wig (one position per read) fully in memory.

    Suitable for small mapped files: the complete hit map is kept in memory
    and handed to writeWigFromHitDict in one go.

    fN        -- path of the mapped-read file; its first line is always
                 skipped as a header
    assembly  -- assembly name handed on to writeWigFromHitDict
    format    -- parser name for returnParserFunction; 'Bowtie' if omitted
    name      -- prefix of the wig files; base name of fN if omitted
    directory -- output directory, handed on to writeWigFromHitDict

    Each read on an acceptable chromosome counts once: at start + 20 for
    strand '1' and at start for every other strand.
    '''
    if not name:
        name = cg.getBaseFileName(fN, naked=True)
    if not format:
        format = 'Bowtie'
    parserFunction = returnParserFunction(format)

    #create hitmap of chrom and strand
    hitDict = {}  #format = chr: { strand : { coord : value

    f = open(fN, 'r')
    try:
        f.readline()  #header...file might not have one but its one read...
        for line in f:
            try:
                lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line))
            except AttributeError:
                # lines for which parsing raises AttributeError are skipped
                continue
            lStrand = str(lStrand)
            start = int(start)
            end = int(end)  # unused below, but still rejects malformed coordinates

            if lChrom not in cg.acceptableChroms:
                continue

            #wig for degradome
            if lStrand == '1':
                i = start + 20
            else:
                i = start

            strandHits = hitDict.setdefault(lChrom, {}).setdefault(lStrand, {})
            strandHits[i] = strandHits.get(i, 0) + 1
    finally:
        # close the input even when a line cannot be parsed
        f.close()

    #write results to wig file
    writeWigFromHitDict(hitDict, assembly, name, directory)
def writeWigFromHitDict(hitDict, assembly, name, directory = None):
    '''Write one bedGraph-style wig file per chromosome/strand of hitDict.

    hitDict   -- {chrom: {strand: {coord: value}}}
    assembly  -- assembly name handed to cg.returnChromLengthDict
    name      -- prefix of the file names and track names
    directory -- output directory; the 'wigs' entry of Main.conf if omitted

    Each file is <directory>/<name>.<chrom>.<strand>.wig.  Adjacent
    coordinates with the same value are merged into one block and gaps
    between covered coordinates are written as explicit zero blocks.

    Side effect: a zero entry keyed by the chromosome length is stored in
    every hitDict[chrom][strand], i.e. the caller's dict is modified.
    '''
    mConf = c.getConfig('Main.conf')
    if not directory:
        directory = mConf.conf['wigs']
    if not name:
        # NOTE(review): this derives the name from the (empty) name itself;
        # there is no file name available here -- confirm what was intended.
        name = cg.getBaseFileName(name, naked = True)

    lDict = cg.returnChromLengthDict(assembly)
    cg.clearDirectory(directory, overwrite = False)

    blockLine = '%s\t%s\t%s\t%s\n'

    #write results to wig file
    for chrom in hitDict:
        for strand in hitDict[chrom]:
            coverage = hitDict[chrom][strand]

            # Sentinel at the chromosome end: a block is only written when
            # the *next* key is reached, so without it the final block would
            # never be flushed.  It also extends the track to the chrom end.
            chromEnd = lDict[chrom]
            coverage[chromEnd] = 0

            oF = open(directory + '/%s.%s.%s.wig' % (name, chrom, strand), 'w')
            try:
                oF.write('track type=bedGraph name=%s.%s.%s\n' % (name, chrom, strand))

                prevVal = 0
                prevCoord = 0
                blockStart = 0
                blockEnd = 1
                for key in sorted(coverage):
                    val = coverage[key]
                    adjacent = (prevCoord == key - 1)

                    if adjacent and val == prevVal:
                        #should be combined
                        blockEnd = key + 1
                    else:
                        #write old block
                        oF.write(blockLine % (chrom, blockStart, blockEnd, prevVal)) #!make it a float value?
                        if not adjacent:
                            #write zero block
                            oF.write(blockLine % (chrom, blockEnd, key, 0))
                        #start new block
                        blockStart = key
                        blockEnd = key + 1

                    prevVal = val
                    prevCoord = key
            finally:
                # do not leak the handle if a write fails
                oF.close()
def clipAdapter(fName, adapter = None, validate = False, oName = None, overwrite = True): #Check to see if the file exists: putativeN = fName.replace('.fastq','.clipped.fastq') if os.path.isfile(putativeN): if overwrite: print ' Overwriting file', putativeN os.remove(putativeN) else: print ' \nNOT OVERWRITING FILE', putativeN return 1 #If the adapter is none, try to find it in the small.meta file if adapter is None: baseFName = cg.getBaseFileName(fName) + '.counts' for metaFileName in metaFileNames: mFile = open(metaFileName, 'r') for line in mFile: fields = line.strip().split('\t') if baseFName == fields[0]: if fields[3] == 'NONE': print ' NO ADAPTER KNOWN FOR', fName return 1 else: adapter = fields[3] print ' Using adapter', adapter, fName mFile.close() #Is it a valid fastq file? if validate: pass #check the type of fastq file sangerType = False fType = fastQTypes.getFastQType(fName, quick = True) if fType == 'Sa': sangerType = True print ' Detected format:', fType, fName #Run it through clipper print 'Clipping file', fName if oName is None: oName = fName.replace('.fastq','.clipped.fastq') if sangerType: subprocess.Popen(['fastx_clipper', '-n', '-v', '-Q', '33', '-i', str(fName), '-a', str(adapter), '-o', str(oName)]).wait() else: subprocess.Popen(['fastx_clipper', '-n', '-v', '-i', str(fName), '-a', str(adapter), '-o', str(oName)]).wait() print ' DONE', fName
def clipAdapter(fName, adapter=None, validate=False, oName=None): #If the adapter is none, try to find it in the small.meta file if adapter is None: baseFName = cg.getBaseFileName(fName) + '.counts' for metaFileName in metaFileNames: mFile = open(metaFileName, 'r') for line in mFile: fields = line.strip().split('\t') if baseFName == fields[0]: if fields[3] == 'NONE': print 'NO ADAPTER KNOWN FOR', fName return 1 else: adapter = fields[3] print 'Using adapter', adapter mFile.close() #Is it a valid fastq file? if validate: pass #check the type of fastq file sangerType = False fType = fastQTypes.getFastQType(fName, quick=True) if fType == 'Sa': sangerType = True print 'Detected format:', fType #Run it through clipper print 'Clipping file' if oName is None: oName = fName.replace('.fastq', '.clipped.fastq') if sangerType: subprocess.Popen([ 'fastx_clipper', '-v', '-Q', '33', '-i', str(fName), '-a', str(adapter), '-o', str(oName) ]).wait() else: subprocess.Popen([ 'fastx_clipper', '-v', '-i', str(fName), '-a', str(adapter), '-o', str(oName) ]).wait() print 'DONE'
def plotResults(fN, cName=None): cHairs = getHairpins.getHairpins(fN) #CID: HAIRPIN directory = cg.getBaseFileName(fN) cg.clearDirectory(directory) #change the directory before plotting cwd = os.getcwd() os.chdir(directory) for CID in cHairs: print 'plotting:', CID cgPlot.plotASProfile(cHairs[CID], cName) os.chdir(cwd)
def plotResults(fN, cName = None): cHairs = getHairpins.getHairpins(fN) #CID: HAIRPIN directory = cg.getBaseFileName(fN) cg.clearDirectory(directory) #change the directory before plotting cwd = os.getcwd() os.chdir(directory) for CID in cHairs: print 'plotting:', CID cgPlot.plotASProfile(cHairs[CID], cName) os.chdir(cwd)
def mapFastQInDirQ(dirName, overwrite = True): '''Every Q function has a corresponding shell script''' wrapperShell = '/home/chrisgre/scripts/mapping/mapFastQ.sh' for file in cg.recurseDir(dirName, end = 'clipped.fastq'): print file putativeN = file.replace('.clipped.fastq','.clipped.fastq.mapped') if os.path.isfile(putativeN): if overwrite: print ' Overwriting file', putativeN os.remove(putativeN) else: print ' \nNOT OVERWRITING FILE', putativeN continue #check if mouse or human baseFName = cg.getBaseFileName(file, naked = True) org = 'None' for metaFileName in metaFileNames: mFile = open(metaFileName, 'r') for line in mFile: fields = line.strip().split('\t') if baseFName == fields[0]: if fields[2] == 'NONE': print ' NO ORG KNOWN FOR', file continue else: org = fields[2] print ' USING ORG', org, file mFile.close() #check if there is an organism, must check due to files not in metaFile if org == 'None': print ' NO org', file continue while True: #submit job if there are less than ten if clusterCheck.queryNumJobsQ('chrisgre') < 40: #subprocess.Popen(['qsub', '-q', 'xiao', '-l', 'mem=4G', '-V', '-o', mainConf.conf['outLog'], '-e', mainConf.conf['errorLog'], wrapperShell, file, org ]) subprocess.Popen(['qsub', '-l', 'mem=4G', '-V', '-o', mainConf.conf['outLog'], '-e', mainConf.conf['errorLog'], wrapperShell, file, org ]) time.sleep(.2) #give it time to update qstat break else:#wait 10 secs... time.sleep(20)
def createTrackInDir(dirName): '''Every Q function has a corresponding shell script Make wig file for all mapped files, for all organisms''' wrapperShell = '/home/chrisgre/scripts/mapping/createTrack.sh' mainConf = c.cgConfig('Main.conf') metaFileName = mainConf.conf['metaFileName'] for file in cg.recurseDir(dirName, end = '.mapped'): #check if mouse or human baseFName = cg.getBaseFileName(file) baseFName = baseFName.split('.')[0] metaDict = cg.getMetaFileDict(metaFileName) org = 'None' if baseFName in metaDict: if metaDict[baseFName][1] == 'NONE': print ' NO ORG KNOWN FOR', file continue else: org = metaDict[baseFName][1] print ' USING ORG', org, file #check if there is an organism, must check due to files not in metaFile if org == 'None': print ' NO org (not in meta file)', file continue while True: #submit job if there are less than ten if clusterCheck.queryNumJobsQ('chrisgre') < 1000: #subprocess.Popen(['qsub', '-q', 'xiao', '-l', 'mem=4G', '-V', '-o', mainConf.conf['outLog'], '-e', mainConf.conf['errorLog'], wrapperShell, file, org ]) subprocess.Popen(['qsub', '-V', '-o', mainConf.conf['outLog'], '-e', mainConf.conf['errorLog'], wrapperShell, file, org ]) #time.sleep(.5) #give it time to update qstat break else:#wait 10 secs... time.sleep(20)
def makeWigMem(fN, assembly, format = None, name = None, directory = None, degWig = False, switchStrand = True, normalized = False): '''format assumes bowtie suitible for small mapped files. switch strand does not switch the strands, it just makes sure if the data is backwards (HeLa) that it will put the peak in the right spot''' print 'degWig Value', degWig print 'switch strands?', switchStrand if not name: name = cg.getBaseFileName(fN, naked = True) if not format: format = 'Bowtie' parserFunction = returnParserFunction(format) lDict = cg.returnChromLengthDict(assembly) f = open(fN, 'r') f.readline() #header...file might not have one but its one read... #create hitmap of chrom and strand hitDict = {} #format = chr: { strand : { coord : value for line in f: lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line)) lStrand = str(lStrand) start = int(start) end = int(end) numPlacesMapped = int(line.strip().split('\t')[6]) numPlacesMapped += 1 readCount = 1 if normalized: readCount = float(readCount)/numPlacesMapped if lChrom in cg.acceptableChroms: if degWig: #wig for degradome NOTE:!!! change lStrand == '1' to '-1' for Bracken! if switchStrand: if lStrand == '1': i = start + (end - start) else: i = start + 1 else: if lStrand == '-1': i = start + (end - start) else: i = start + 1 hitDict.setdefault(lChrom, {}).setdefault(lStrand, {}) hitDict[lChrom][lStrand][i] = hitDict[lChrom][lStrand].get(i, 0) + readCount else: #wig for regular for i in range(start, end): try: hitDict[lChrom][lStrand][i] += readCount except KeyError: if lChrom not in hitDict: hitDict[lChrom] = {} if lStrand not in hitDict[lChrom]: hitDict[lChrom][lStrand] = {} hitDict[lChrom][lStrand][i] = readCount f.close() #write results to wig file writeWigFromHitDict(hitDict, assembly, name, directory)
def createMultiTrack(dirName, organism): '''merge all mapped tracks in directory and create a single wig file''' mainConf = c.cgConfig('Main.conf') metaFileName = mainConf.conf['metaFileName'] fileList = [] for file in cg.recurseDir(dirName, end = '.mapped'): #check if mouse or human SHOULD PUT INTO A STD FUNCTION FOR META FILE #check if mouse or human baseFName = cg.getBaseFileName(file, naked= True) metaDict = cg.getMetaFileDict(metaFileName) org = 'None' if baseFName in metaDict: if metaDict[baseFName][1] == 'NONE': print ' NO ORG KNOWN FOR', file continue elif not metaDict[baseFName][1] == organism: print ' NOT ORGANISM RUNNING', file continue else: org = metaDict[baseFName][1] print ' USING ORG', org, file #check if there is an organism, must check due to files not in metaFile if org == 'None': print ' NO org (not in meta file)', file continue #only make wig file for organism asked for if not org == organism: continue #if it is right organism and has mapped file then add fileList.append(file) #make merged wig if organism == 'human': chroms = cg.humanChromosomes assembly = 'hg19' elif organism == 'mouse': chroms = cg.mouseChromosomes assembly = 'mm9' elif organism == 'zebrafish': chroms = cg.zebrafishChromosomes assembly = 'danRer6' print 'Making Bed File vectors' cvg = HTSeq.GenomicArray(chroms, stranded=True, typecode='i') for fName in fileList: alignment_file = HTSeq.BowtieReader(fName) for alngt in alignment_file: if alngt.aligned: cvg.add_value( 1, alngt.iv ) #iv is the genomic interval.. bedNamePos = dirName + '/Merge.' + organism + '.1.wig' bedNameNeg = dirName + '/Merge.' + organism + '.-1.wig' print 'Writing Bed File' cvg.write_bedgraph_file(bedNamePos, "+" ) cvg.write_bedgraph_file(bedNameNeg, "-" ) #Now extend it updateWigLength(bedNamePos, assembly) updateWigLength(bedNameNeg, assembly) #Now Sort it. cgSort.wigSort(bedNamePos) cgSort.wigSort(bedNameNeg)
#make lib:hits dict densityFile = open('/home/chrisgre/scripts/readDensity/individual.densities.data', 'r') tissueHits = {} cID = 'NONE' for line in densityFile: if line.startswith('\t'): #lib: hit l = line.strip().split('\t')[0] hits = int(line.strip().split('\t')[1]) tissueHits[cID][l] = hits else: cID = line.strip() tissueHits[cID] = {} tissueHist = {}#tissue: hits for mirID in mirIDs: if mirID in tissueHits: for smallLib in tissueHits[mirID]: smallName = cg.getBaseFileName(smallLib, naked = True) if smallName in metaDict: if metaDict[smallName][1] == 'mouse': if len(metaDict[smallName]) > 3: t = metaDict[smallName][3] if t in tissueHist: tissueHist[t] += tissueHits[mirID][smallLib] else: tissueHist[t] = tissueHits[mirID][smallLib] print tissueHist
# Script fragment: walks the small RNA library directory and derives, for
# every plain '<name>.fastq' file, its naked base name and its organism.
# The rest of the loop body lies outside this fragment.
direc = '/home/chrisgre/apps/projects/small.rna.libs'
metaFileName = direc + '/' + 'small.meta'

##make data of already made file so there aren't any duplicates:
fileDict = {} #filename...
# NOTE(review): metaFile and fileDict are not used within this fragment.
metaFile = open(metaFileName, 'r')

#add new entries
countFiles = cg.recurseDir(direc, end='.fastq')
for file in countFiles:
    fileName = file.strip().split('/')[-1]
    # skip names with more than one '.', e.g. '<name>.clipped.fastq'
    if len(fileName.split('.')) > 2: #has to specifically end in fastq...
        continue
    fileName = cg.getBaseFileName(file, naked=True)

    # The organism is taken from the name of the directory holding the file.
    # These are substring tests and the last match wins.
    dir = file.strip().split('/')[-2]
    org = 'NONE'
    if 'human' in dir:
        org = 'human'
    if 'mouse' in dir:
        org = 'mouse'
    if 'pig' in dir:
        org = 'pig'
    if 'dog' in dir:
        org = 'dog'
    if 'rat' in dir:
        org = 'rat'
    if 'zebrafish' in dir:
        org = 'zebrafish'
if metaDict[baseFName][1] == organism: organismFileList.append(baseFName) #put small hits for each prediction in dictionary pCount = {} smallFile = open(smallFileName, 'r') currID = 'NONE' for line in smallFile: if '\t' not in line: #This is the line with the id in it --> store another ID currID = line.strip() else: #this line contains library and count info --> add lib = line.strip().split('\t')[0] count = int(line.strip().split('\t')[1]) if cg.getBaseFileName(lib, naked=True) in organismFileList: if currID in pCount: pCount[currID] = pCount[currID] + count else: pCount[currID] = count #update the file --> any line with kmer on it give it the count newLines = [] predFile = open(pFileName, 'r') for line in predFile: kmer = line.strip().split('\t')[0].split('.')[0] if kmer in pCount: numSmall = pCount[kmer] else: numSmall = 0 newLine = line.strip().split('\t')
#make lib:hits dict densityFile = open( '/home/chrisgre/scripts/readDensity/individual.densities.data', 'r') tissueHits = {} cID = 'NONE' for line in densityFile: if line.startswith('\t'): #lib: hit l = line.strip().split('\t')[0] hits = int(line.strip().split('\t')[1]) tissueHits[cID][l] = hits else: cID = line.strip() tissueHits[cID] = {} tissueHist = {} #tissue: hits for mirID in mirIDs: if mirID in tissueHits: for smallLib in tissueHits[mirID]: smallName = cg.getBaseFileName(smallLib, naked=True) if smallName in metaDict: if metaDict[smallName][1] == 'mouse': if len(metaDict[smallName]) > 3: t = metaDict[smallName][3] if t in tissueHist: tissueHist[t] += tissueHits[mirID][smallLib] else: tissueHist[t] = tissueHits[mirID][smallLib] print tissueHist
def makeWigMem(fN, assembly, format=None, name=None, directory=None, degWig=False, switchStrand=True, normalized=False): '''format assumes bowtie suitible for small mapped files. switch strand does not switch the strands, it just makes sure if the data is backwards (HeLa) that it will put the peak in the right spot''' print 'degWig Value', degWig print 'switch strands?', switchStrand if not name: name = cg.getBaseFileName(fN, naked=True) if not format: format = 'Bowtie' parserFunction = returnParserFunction(format) lDict = cg.returnChromLengthDict(assembly) f = open(fN, 'r') f.readline() #header...file might not have one but its one read... #create hitmap of chrom and strand hitDict = {} #format = chr: { strand : { coord : value for line in f: lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line)) lStrand = str(lStrand) start = int(start) end = int(end) numPlacesMapped = int(line.strip().split('\t')[6]) numPlacesMapped += 1 readCount = 1 if normalized: readCount = float(readCount) / numPlacesMapped if lChrom in cg.acceptableChroms: if degWig: #wig for degradome NOTE:!!! change lStrand == '1' to '-1' for Bracken! if switchStrand: if lStrand == '1': i = start + (end - start) else: i = start + 1 else: if lStrand == '-1': i = start + (end - start) else: i = start + 1 hitDict.setdefault(lChrom, {}).setdefault(lStrand, {}) hitDict[lChrom][lStrand][i] = hitDict[lChrom][lStrand].get( i, 0) + readCount else: #wig for regular for i in range(start, end): try: hitDict[lChrom][lStrand][i] += readCount except KeyError: if lChrom not in hitDict: hitDict[lChrom] = {} if lStrand not in hitDict[lChrom]: hitDict[lChrom][lStrand] = {} hitDict[lChrom][lStrand][i] = readCount f.close() #write results to wig file writeWigFromHitDict(hitDict, assembly, name, directory)
# Script fragment: walks the small RNA library directory and derives, for
# every plain '<name>.fastq' file, its naked base name and its organism.
# The rest of the loop body lies outside this fragment.
direc = '/home/chrisgre/apps/projects/small.rna.libs'
metaFileName = direc + '/' + 'small.meta'

##make data of already made file so there aren't any duplicates:
fileDict = {} #filename...
# NOTE(review): metaFile and fileDict are not used within this fragment.
metaFile = open(metaFileName, 'r')

#add new entries
countFiles = cg.recurseDir(direc, end = '.fastq')
for file in countFiles:
    fileName = file.strip().split('/')[-1]
    # skip names with more than one '.', e.g. '<name>.clipped.fastq'
    if len(fileName.split('.')) > 2: #has to specifically end in fastq...
        continue
    fileName = cg.getBaseFileName(file, naked = True)

    # The organism is taken from the name of the directory holding the file.
    # These are substring tests and the last match wins.
    dir = file.strip().split('/')[-2]
    org = 'NONE'
    if 'human' in dir:
        org = 'human'
    if 'mouse' in dir:
        org = 'mouse'
    if 'pig' in dir:
        org = 'pig'
    if 'dog' in dir:
        org = 'dog'
    if 'rat' in dir:
        org = 'rat'
    if 'zebrafish' in dir:
        org = 'zebrafish'
if metaDict[baseFName][1] == organism: organismFileList.append(baseFName) #put small hits for each prediction in dictionary pCount = {} smallFile = open(smallFileName, 'r') currID = 'NONE' for line in smallFile: if '\t' not in line: #This is the line with the id in it --> store another ID currID = line.strip() else: #this line contains library and count info --> add lib = line.strip().split('\t')[0] count = int(line.strip().split('\t')[1]) if cg.getBaseFileName(lib, naked = True) in organismFileList: if currID in pCount: pCount[currID] = pCount[currID] + count else: pCount[currID] = count #update the file --> any line with kmer on it give it the count newLines = [] predFile = open(pFileName, 'r') for line in predFile: kmer = line.strip().split('\t')[0].split('.')[0] if kmer in pCount: numSmall = pCount[kmer] else: numSmall = 0 newLine = line.strip().split('\t')