def parClean(fN):

    if '/' in fN:
        dirName = os.path.dirname(fN)
    else:
        dirName = os.environ['PWD']
    basename = os.path.basename(fN)

    #remove the original file
    os.remove(fN)

    #remove the exit signals
    for eFN in bioLibCG.recurseDir(dirName, start=basename, end='exitSignal'):
        os.remove(eFN)

    #cat the SORTED range files
    rangeFiles = bioLibCG.recurseDir(dirName, start=basename, within='range')
    rangeFiles.sort(key=lambda x: int(x.split('.')[-2]))

    f = open(fN, 'w')
    for rFN in rangeFiles:
        subprocess.Popen(['cat', rFN], stdout=f).wait()
    f.close()

    #remove the range files
    for rFN in rangeFiles:
        os.remove(rFN)
def probe(tcc, conf=None):

    if not conf:
        mConf = c.cgConfig('Main.conf')
    smallPath = mConf.conf['smallPath']

    chrom, strand, start, end = cg.tccSplit(tcc)

    total = 0
    for lib in cg.recurseDir(smallPath, end='mapped.%s.wig' % strand):
        try:
            eLevels = stepVectorScan.scanVectorsFile(lib, [tcc])
        except:
            print lib, 'index failed'
            continue

        #find highest expression level
        highest = 0
        for coord in eLevels:
            if eLevels[coord] > highest:
                highest = eLevels[coord]

        if highest > 0:
            print lib, highest
        total += highest
        #print eLevels

    print total
def scanSequence(seqList, dirName):
    '''Given list of sequences --> get all reads that have sequence'''

    fileNames = cg.recurseDir(dirName, end='.sequence')
    if len(fileNames) > 1:
        print fileNames
        print 'there is more than one sequence file in this directory'
        return 1
    else:
        fN = fileNames[0]

    #for seq in seqList:
    seq = seqList

    fIndex = cgIndex.lineIndex(fN, header=False)
    fIndex.passCheckFunction(cgIndex.mapSequenceCheckFunction)
    fIndex.binarySearch(seq) #places file pointer at beginning of sequence line

    #extend and report
    fIndex.extendUp(seq)
    finalReads = []
    for line in fIndex.file:
        if fIndex.checkFunction(seq, line) == 0:
            finalReads.append(line.strip())
        else:
            return finalReads
def createMTrack(dirName):
    '''merge all mapped tracks in directory and create a single wig file'''

    fileList = cg.recurseDir(dirName, end='.out')
    chroms = cg.humanChromosomes

    print 'Making Bed File vectors'
    cvg = HTSeq.GenomicArray(chroms, stranded=True, typecode='i')
    for fName in fileList:
        print fName
        alignment_file = HTSeq.BowtieReader(fName)
        for alngt in alignment_file:
            if alngt.aligned:
                try:
                    cvg.add_value(1, alngt.iv) #iv is the genomic interval
                except KeyError:
                    pass

    bedNamePos = dirName + '/Merge.' + 'hg19' + '.1.wig'
    bedNameNeg = dirName + '/Merge.' + 'hg19' + '.-1.wig'

    print 'Writing Bed File'
    cvg.write_bedgraph_file(bedNamePos, "+")
    cvg.write_bedgraph_file(bedNameNeg, "-")

    #Now extend it
    updateWigLength(bedNamePos, 'hg19')
    updateWigLength(bedNameNeg, 'hg19')

    #Now sort it
    cgSort.wigSort(bedNamePos)
    cgSort.wigSort(bedNameNeg)
def mixWig(directory, assembly, name=None):
    '''Does it by chromosome --> faster, less memory'''

    if not name:
        name = 'Merge'

    #gather all chromosomes
    chromList = []
    for fN in cg.recurseDir(directory, end='.wig'):
        chrom = cg.getBaseFileName(fN).strip().split('.')[-3]
        if chrom not in chromList:
            chromList.append(chrom)
    print chromList

    for chrom in chromList:
        print chrom

        #gather all the values from all the files for this chromosome
        hitDict = {} # chrom : {strand : {coord : value}}
        for fN in cg.recurseDir(directory, end='.wig'):
            fChrom = cg.getBaseFileName(fN).strip().split('.')[-3]
            if fChrom != chrom:
                continue

            print ' ', fN, fChrom
            f = open(fN, 'r')
            f.readline() #header
            strand = cg.getBaseFileName(fN).strip().split('.')[-2]

            for line in f:
                lChrom, start, end, val = line.strip().split('\t')
                start, end, val = int(start), int(end), int(val)
                if val < 1:
                    continue

                for i in range(start, end):
                    try:
                        hitDict[lChrom][strand][i] += val
                    except (KeyError, TypeError):
                        if lChrom not in hitDict:
                            hitDict[lChrom] = {}
                        if strand not in hitDict[lChrom]:
                            hitDict[lChrom][strand] = {}
                        hitDict[lChrom][strand][i] = val

        #write results to wig file
        writeWigFromHitDict(hitDict, assembly, name, directory)
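#A minimal call sketch for mixWig (the directory path and assembly below are
#hypothetical). It scans the directory itself via cg.recurseDir and expects wig
#file names that carry the chromosome and strand as the third- and second-to-last
#dot-separated fields, which is where the [-3]/[-2] lookups above come from.
mixWig('/path/to/wig/dir', 'mm9', name='Merge')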
def parCleanSplit(fN):
    '''Remove the exit signals for a split continuation run'''

    if '/' in fN:
        dirName = os.path.dirname(fN)
    else:
        dirName = os.environ['PWD']
    basename = os.path.basename(fN)

    #remove the exit signals
    for eFN in bioLibCG.recurseDir(dirName, start=basename, end='exitSignal'):
        os.remove(eFN)
def parClean(fN):

    if '/' in fN:
        dirName = os.path.dirname(fN)
    else:
        dirName = os.environ['PWD']
    basename = os.path.basename(fN)

    #remove the original file
    print '..removing original file, if present'
    try:
        os.remove(fN)
    except OSError:
        pass

    #remove the exit signals
    print '..removing exit signals'
    for eFN in bioLibCG.recurseDir(dirName, start=basename, end='exitSignal'):
        os.remove(eFN)

    #cat the SORTED range files
    rangeFiles = bioLibCG.recurseDir(dirName, start=basename, within='range')
    rangeFiles.sort(key=lambda x: int(x.split('.')[-2]))

    print '..catting files together'
    f = open(fN, 'w')
    for rFN in rangeFiles:
        print '....', rFN
        subprocess.Popen(['cat', rFN], stdout=f).wait()
    f.close()

    #remove the range files
    print '..removing packets'
    for rFN in rangeFiles:
        print '....', rFN
        os.remove(rFN)
def getAll(chrom, strand, point):

    fNs = cg.recurseDir(mConf.conf['smallPath'], end='.wig')

    #skip merged tracks and anything under a WIG directory
    mergeWigs = ('/home/chrisgre/smallLibs/WIGS/Merge.mouse.1.wig',
                 '/home/chrisgre/smallLibs/WIGS/Merge.human.1.wig')
    fNs = [fN for fN in fNs if 'WIG' not in fN and fN not in mergeWigs]

    for fN in fNs:
        fStrand = cg.ss(fN, '.')[-2]
        if str(fStrand) == str(strand):
            val = getWigValue(chrom, point, fN)
            if val > 0:
                print fN, val
def load(self):

    #get schema
    attName_field = getClassScheme(self.mappingClass)

    id_obj = {}
    selectedAttNames = []

    #load defined attributes
    for fN in bioLibCG.recurseDir(self.aDir):
        baseName = fN.strip().split('/')[-1]
        if baseName.startswith('a.'):

            #get attribute name/type
            bs = baseName.split('.')
            attName = bs[1]
            attType = attName_field[attName].dataType
            casteFxn = getCasteFunction(attType)
            selectedAttNames.append(attName)

            #now set the attributes
            f = open(fN, 'r')
            for line in f:
                ls = line.strip().split('\t')
                id = int(ls[0])
                att = ls[1]

                if attType in listTypes:
                    att = att.split(',')
                    att = [casteFxn(x) for x in att]
                else:
                    att = casteFxn(att)

                #now set the attribute property for the id
                if id not in id_obj:
                    id_obj[id] = self.mappingClass(id)
                o = id_obj[id]
                setattr(o, attName, att)
            f.close()

    #Now initialize objects---it's important to do this after all the data has been loaded
    for obj in id_obj.values():
        loadedAttNames = obj.__dict__.keys()
        for attName in selectedAttNames:
            if attName not in loadedAttNames:
                setattr(obj, attName, copy.copy(attName_field[attName].dataDefault))

    return id_obj
def checkDups(dir):

    id_seqs = {}
    for i in bioLibCG.recurseDir(dir, end='.simSeqs'):
        f = open(i, 'r')
        for line in f:
            ls = line.strip().split('\t')
            id, seq = ls[0], ls[1]
            id_seqs.setdefault(id, []).append(seq)

    for id, seqs in id_seqs.items():
        if len(seqs) != len(list(set(seqs))):
            print 'fail'
            print id, seqs
def mapFastQInDirQ(dirName, overwrite=True):
    '''Every Q function has a corresponding shell script'''

    wrapperShell = '/home/chrisgre/scripts/mapping/mapFastQ.sh'

    for file in cg.recurseDir(dirName, end='clipped.fastq'):
        print file

        putativeN = file.replace('.clipped.fastq', '.clipped.fastq.mapped')
        if os.path.isfile(putativeN):
            if overwrite:
                print ' Overwriting file', putativeN
                os.remove(putativeN)
            else:
                print ' \nNOT OVERWRITING FILE', putativeN
                continue

        #check if mouse or human
        baseFName = cg.getBaseFileName(file, naked=True)
        org = 'None'
        for metaFileName in metaFileNames:
            mFile = open(metaFileName, 'r')
            for line in mFile:
                fields = line.strip().split('\t')
                if baseFName == fields[0]:
                    if fields[2] == 'NONE':
                        print ' NO ORG KNOWN FOR', file
                        continue
                    else:
                        org = fields[2]
                        print ' USING ORG', org, file
            mFile.close()

        #check if there is an organism; must check due to files not in metaFile
        if org == 'None':
            print ' NO org', file
            continue

        while True:
            #submit job if there are fewer than 40 queued
            if clusterCheck.queryNumJobsQ('chrisgre') < 40:
                #subprocess.Popen(['qsub', '-q', 'xiao', '-l', 'mem=4G', '-V', '-o', mainConf.conf['outLog'], '-e', mainConf.conf['errorLog'], wrapperShell, file, org])
                subprocess.Popen(['qsub', '-l', 'mem=4G', '-V', '-o', mainConf.conf['outLog'], '-e', mainConf.conf['errorLog'], wrapperShell, file, org])
                time.sleep(.2) #give qstat time to update
                break
            else:
                #wait 20 seconds and check again
                time.sleep(20)
def checkExit(fN, numPackets):

    numPackets = int(numPackets)
    dirName = os.path.dirname(fN)
    baseName = os.path.basename(fN)
    if os.environ['PWD'] not in dirName:
        dirName = os.path.dirname(os.environ['PWD'] + '/' + fN)

    sleepTime = 1
    pbar = ProgressBar(widgets=[' ', SimpleProgress(), ' ', Timer(), ' ', Bar()],
                       maxval=numPackets).start()
    while True:
        time.sleep(sleepTime)
        exitSignals = bioLibCG.recurseDir(dirName, start=baseName, end='exitSignal')
        pbar.update(len(exitSignals))
        if len(exitSignals) == numPackets:
            pbar.finish()
            print 'Jobs all finished!'
            break
def checkExit(fN, numPackets):

    numPackets = int(numPackets)
    dirName = os.path.dirname(fN)
    baseName = os.path.basename(fN)
    if os.environ['PWD'] not in dirName:
        dirName = os.path.dirname(os.environ['PWD'] + '/' + fN)

    sleepTime = 10
    iteration = 1
    while True:
        time.sleep(sleepTime)
        exitSignals = bioLibCG.recurseDir(dirName, start=baseName, end='exitSignal')
        print 'waiting...', str(len(exitSignals)), '/', str(numPackets), '%s' % bioLibCG.prettyTime(sleepTime * iteration), fN
        if len(exitSignals) == numPackets:
            print 'Jobs all finished!'
            break
        iteration += 1
def clipAdapterInDirQ(dirName):
    '''The Q is for doing it on a cluster using qsub.
    Every Q function has a corresponding shell script'''

    for file in cg.recurseDir(dirName, end='.fastq'):

        #skip files that are already clipped
        if 'clipped' in file:
            continue

        while True:
            #submit job if there are fewer than 100 queued
            if clusterCheck.queryNumJobsQ('chrisgre') < 100:
                subprocess.Popen(['qsub', '-V', '-cwd', '-o', 'errors', '-e', 'errors', wrapperShell, file])
                time.sleep(.2) #give qstat time to update
                break
            else:
                #wait 10 seconds and check again
                time.sleep(10)
def mergeInputs(cName, eLevel):

    conf = c.getConfig(cName)
    assembly = conf.conf['assembly']

    ending = '%s.%s' % (eLevel, assembly)
    print 'merging all files with ending', ending

    newLines = []
    for fN in cg.recurseDir('out', end=ending):
        print os.getcwd(), fN
        fN = os.getcwd() + '/' + fN
        f = open(fN, 'r')
        newLines.extend(f.readlines())
        f.close()

    f = open('peakData.%s.%s' % (eLevel, assembly), 'w')
    f.writelines(newLines)
    f.close()
def createMultiTrackDir(dirName, organism):
    '''Differs from createMultiTrack in that it does not require meta info;
    it just makes a merged wig for everything in the directory'''

    mainConf = c.cgConfig('Main.conf')

    fileList = []
    for file in cg.recurseDir(dirName, end='.mapped'):
        fileList.append(file)

    #make merged wig
    if organism == 'human':
        chroms = cg.humanChromosomes
        assembly = 'hg19'
    elif organism == 'mouse':
        chroms = cg.mouseChromosomes
        assembly = 'mm9'
    elif organism == 'zebrafish':
        chroms = cg.zebrafishChromosomes
        assembly = 'danRer6'

    print 'Making Bed File vectors'
    cvg = HTSeq.GenomicArray(chroms, stranded=True, typecode='i')
    for fName in fileList:
        alignment_file = HTSeq.BowtieReader(fName)
        for alngt in alignment_file:
            if alngt.aligned:
                cvg.add_value(1, alngt.iv) #iv is the genomic interval

    bedNamePos = dirName + '/Merge.' + organism + '.1.wig'
    bedNameNeg = dirName + '/Merge.' + organism + '.-1.wig'

    print 'Writing Bed File'
    cvg.write_bedgraph_file(bedNamePos, "+")
    cvg.write_bedgraph_file(bedNameNeg, "-")

    #Now extend it
    updateWigLength(bedNamePos, assembly)
    updateWigLength(bedNameNeg, assembly)

    #Now sort it
    cgSort.wigSort(bedNamePos)
    cgSort.wigSort(bedNameNeg)
def createTrackInDir(dirName):
    '''Make a wig file for all mapped files, for all organisms.
    Every Q function has a corresponding shell script'''

    wrapperShell = '/home/chrisgre/scripts/mapping/createTrack.sh'
    mainConf = c.cgConfig('Main.conf')
    metaFileName = mainConf.conf['metaFileName']

    for file in cg.recurseDir(dirName, end='.mapped'):

        #check if mouse or human
        baseFName = cg.getBaseFileName(file)
        baseFName = baseFName.split('.')[0]

        metaDict = cg.getMetaFileDict(metaFileName)
        org = 'None'
        if baseFName in metaDict:
            if metaDict[baseFName][1] == 'NONE':
                print ' NO ORG KNOWN FOR', file
                continue
            else:
                org = metaDict[baseFName][1]
                print ' USING ORG', org, file

        #check if there is an organism; must check due to files not in metaFile
        if org == 'None':
            print ' NO org (not in meta file)', file
            continue

        while True:
            #submit job if there are fewer than 1000 queued
            if clusterCheck.queryNumJobsQ('chrisgre') < 1000:
                #subprocess.Popen(['qsub', '-q', 'xiao', '-l', 'mem=4G', '-V', '-o', mainConf.conf['outLog'], '-e', mainConf.conf['errorLog'], wrapperShell, file, org])
                subprocess.Popen(['qsub', '-V', '-o', mainConf.conf['outLog'], '-e', mainConf.conf['errorLog'], wrapperShell, file, org])
                #time.sleep(.5) #give qstat time to update
                break
            else:
                #wait 20 seconds and check again
                time.sleep(20)
def splitRun(baseFN, memoryAmount, scriptName, *args):

    if '/' in baseFN:
        dirName = os.path.dirname(baseFN)
    else:
        dirName = os.environ['PWD']
    basename = os.path.basename(baseFN)

    #collect the SORTED range files
    rangeFiles = [x for x in bioLibCG.recurseDir(dirName, start=basename, within='range')
                  if 'exitSignal' not in x]
    rangeFiles.sort(key=lambda x: int(x.split('.')[-2]))

    for fN in rangeFiles:

        #specify the correct qJob with correct memory
        qJobX = '%s/exec/qJobX%s.sh' % (os.environ['HOME'], memoryAmount)
        qDo = '%s/exec/qDo.sh' % (os.environ['HOME'])

        #construct command to pass
        if memoryAmount == 'LOCAL':
            com = [qDo, scriptName]
        else:
            com = [qJobX, qDo, scriptName]

        #append script arguments, substituting the split file name for 'splitFN'
        for arg in args:
            if arg == 'splitFN':
                com.append(fN)
            else:
                com.append(arg)

        #now append paraInfo for split run [splitRun, splitRun]
        com.extend(['splitRun', 'splitRun'])

        #run each job
        subprocess.Popen(com).wait()
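#Taken together, splitRun, checkExit, and parClean form a launch/wait/merge cycle
#for packetized runs. A minimal driver sketch, assuming a packetized input file and
#packet count (file name, script name, and count below are hypothetical); 'splitFN'
#is the placeholder argument that splitRun swaps for each range-packet file name.
baseFN = 'reads.txt'          #hypothetical packetized file
numPackets = 10               #hypothetical number of range packets

splitRun(baseFN, 'LOCAL', 'someScript.py', 'splitFN')  #launch one job per range packet
checkExit(baseFN, numPackets)                          #block until every packet writes its exitSignal
parClean(baseFN)                                       #cat the sorted packets back into baseFN and clean up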
def loadWigDictFloat(wigDir):
    '''Wig files in the directory must follow the format NAME.chr.strand.wig'''

    chr_strand_coord_expr = {}
    for fN in bioLibCG.recurseDir(wigDir, end='.wig'):
        chrom, strand = fN.split('/')[-1].split('.')[1], fN.split('/')[-1].split('.')[2]
        chr_strand_coord_expr.setdefault(chrom, {})[strand] = {}

        f = open(fN, 'r')
        f.readline() #header
        for line in f:
            ls = line.strip().split('\t')
            start, end, expr = int(ls[1]) + 1, int(ls[2]), float(ls[3]) #1-based
            if float(expr) == 0.0:
                continue

            for i in range(start, end + 1):
                chr_strand_coord_expr[chrom][strand][i] = expr
        f.close()

    return chr_strand_coord_expr
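#Usage sketch for the nested dictionary loadWigDictFloat returns (the directory,
#chromosome, strand, and coordinate below are hypothetical):
wigDict = loadWigDictFloat('/path/to/wigs')                    #files named NAME.chr.strand.wig
expr = wigDict.get('chr1', {}).get('1', {}).get(1000000, 0.0)  #0.0 where no coverage was stored
print expr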
def scanCoord(tcc, dirName):

    fileNames = cg.recurseDir(dirName, end='.starts')

    #get name of file for index
    chrom, strand, start, end = cg.tccSplit(tcc)
    nameCheck = '%s.%s' % (chrom, strand)
    fN = 'None'
    for fileName in fileNames:
        if nameCheck in fileName:
            fN = fileName
    if fN == 'None':
        print 'No Index file for', nameCheck
        return 0

    fIndex = cgIndex.lineIndex(fN, header=False)
    fIndex.passCheckFunction(cgIndex.mapStartCheckFunction)
    fIndex.binarySearch(tcc, skipEnd=True) #places file pointer at beginning of sequence line

    #Check if you need to move down one line
    checkLine = fIndex.getLineFromByte(fIndex.currentByte)
    fIndex.passCheckFunction(cgIndex.mapStartRangeCheckFunction) #passed here, but also used when extending
    if fIndex.checkFunction(tcc, checkLine) != 0:
        fIndex.file.readline()
        fIndex.currentByte = fIndex.file.tell()

    #Now extend up until in range, down until in range --> return reads
    fIndex.extendUp(tcc)
    finalReads = []
    for line in fIndex.file:
        if fIndex.checkFunction(tcc, line) == 0:
            finalReads.append(line.strip())
        else:
            return finalReads
def commit(self, id_obj):

    #get schema
    attName_field = getClassScheme(self.mappingClass)

    #will update this functionality later
    selectedAttNames = [x for x in attName_field]

    #Get new values...ideally this would only include the ones that have changed, but it doesn't
    id_att_newVals = {}
    for id, obj in id_obj.items():
        id_att_newVals[id] = {}
        for attName, att in obj.__dict__.items():
            id_att_newVals[id][attName] = att

    #get old values
    id_att_oldVals = {}
    for fN in bioLibCG.recurseDir(self.aDir):
        baseName = fN.strip().split('/')[-1]
        if baseName.startswith('a.'):

            #get attribute name/type
            bs = baseName.split('.')
            attName = bs[1]
            attType = attName_field[attName].dataType
            casteFxn = getCasteFunction(attType)

            #now read the stored attributes
            f = open(fN, 'r')
            for line in f:
                ls = line.strip().split('\t')
                id = int(ls[0])
                att = ls[1]

                #caste them correctly
                if attType in listTypes:
                    att = att.split(',')
                    att = [casteFxn(x) for x in att]
                else:
                    att = casteFxn(att)

                if id not in id_att_oldVals:
                    id_att_oldVals[id] = {}
                id_att_oldVals[id][attName] = att
            f.close()

    #consolidate: new values win over old values
    id_att_finalVals = {}
    for id in id_att_oldVals:
        for attName in id_att_oldVals[id]:
            if id in id_att_newVals and attName in id_att_newVals[id]:
                value = id_att_newVals[id][attName]
            else:
                value = id_att_oldVals[id][attName]
            id_att_finalVals.setdefault(id, {})[attName] = value

    for id in id_att_newVals:
        for attName in id_att_newVals[id]:
            if id in id_att_oldVals and attName in id_att_oldVals[id]:
                continue #already handled above
            id_att_finalVals.setdefault(id, {})[attName] = id_att_newVals[id][attName]

    #write to files, skipping values that are still the default
    for attName in selectedAttNames:
        listFlag = attName_field[attName].dataType in listTypes

        f = open(self.aDir + '/a.' + attName, 'w')
        for id in id_att_finalVals:
            if attName in id_att_finalVals[id]:
                finalVal = id_att_finalVals[id][attName]
                if finalVal == attName_field[attName].dataDefault:
                    continue
                if listFlag:
                    finalVal = ','.join([str(x) for x in finalVal])
                f.write('%s\t%s\n' % (id, finalVal))
        f.close()
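#For reference, load() and commit() share a simple on-disk layout: one flat file per
#attribute, named a.<attName> inside self.aDir, each line holding a tab-separated id
#and value, with list-typed values comma-joined. A hypothetical a.counts file:
#
#   101	5
#   102	3,7,2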
#using the continuous blocks from the small RNA lib file, identify all the peaks
import bioLibCG as cg
import cgConfig as c

mConf = c.cgConfig('Main.conf')
fileNames = cg.recurseDir(mConf.conf['wigMouse'], end='.wig')

for fN in fileNames:
    file = open(fN, 'r')
    file.readline() #header

    #get all points in midpoint form
    pointsDict = {}
    for line in file:
        start = int(cg.ss(line)[1])
        end = int(cg.ss(line)[2])
        point = start + (end - start) / 2 #midpoint
        pointsDict[point] = int(cg.ss(line)[3].split('.')[0])
    file.close()

    #determine peaks based off of the neighbors of each point
    lowest = pointsDict.keys()
    lowest.sort()

    peaks = []
    span = 2 #must be > 0
    for i in range(span + 1, len(lowest) - span - 1):
        val = pointsDict[lowest[i]]
        if val < 5: #minimum
def updateReadDensity(tType):
    #go through each chromosome wig and check the mature seqs

    mainConf = cgConfig.cgConfig('Main.conf')
    conf = cgConfig.cgConfig()
    organism = conf.conf['organism']
    wigFolder = mainConf.conf['wig%s' % organism]

    newLines = []
    if tType == 'E':
        pFileName = conf.conf['resultsExons']
    elif tType == 'I':
        pFileName = conf.conf['resultsIntrons']
    else:
        print 'READ UPDATE FAIL'

    print ' Updating Read Density:', tType
    for wigFileN in cg.recurseDir(wigFolder, end='.wig'):

        #init
        chrom = wigFileN.strip().split('.')[-2]
        strand = wigFileN.strip().split('.')[-4]
        wigFile = open(wigFileN, 'r')
        mirFile = open(pFileName, 'r')
        print wigFileN

        #get rid of header
        wigFile.readline()

        print ' populating hitmap'
        #populate hitmap
        wigMap = {}
        for line in wigFile:
            value = int(line.strip().split('\t')[3].split('.')[0])
            if value > 0:
                start = int(line.strip().split('\t')[1])
                end = int(line.strip().split('\t')[2])
                for i in range(start, end):
                    wigMap[i] = value
        wigFile.close()

        print ' calculating hits for mature seqs'
        #calculate highest hit per mature
        for line in mirFile:
            mTcc = line.strip().split('\t')[1]
            mirID = line.strip().split('\t')[0]
            if (mTcc.split(':')[0] == chrom) and (mTcc.split(':')[1] == strand):
                highestHit = 0
                for i in range(int(mTcc.split(':')[2]), int(mTcc.split(':')[3])):
                    if i in wigMap:
                        if wigMap[i] > highestHit:
                            highestHit = wigMap[i]
                newLines.append(cg.appendToLine(line, str(highestHit), 11))
        mirFile.close()

    print 'Writing New File'
    #write new results file
    outFile = open(pFileName, 'w')
    for line in newLines:
        outFile.write(line)
    outFile.close()

    ####NOW UPDATE HIGHEST HIT PER CLUSTER####
    clusterCount = {}
    pFile = open(pFileName, 'r')
    for line in pFile:
        predictionCount = int(line.strip().split('\t')[11])
        CID = line.strip().split('\t')[7]
        if CID in clusterCount:
            if clusterCount[CID] < predictionCount:
                clusterCount[CID] = predictionCount
        else:
            clusterCount[CID] = predictionCount
    pFile.close()

    #update the file --> cluster small count
    newLines = []
    predFile = open(pFileName, 'r')
    for line in predFile:
        CID = line.strip().split('\t')[7]
        numMax = clusterCount[CID]
        newLines.append(cg.appendToLine(line, str(numMax), 12))
    predFile.close()

    #sort newLines by clusterID
    sortDict = {}
    CIDs = []
    for line in newLines:
        CID = int(line.strip().split('\t')[7])
        if CID not in CIDs:
            CIDs.append(CID)
        if CID in sortDict:
            sortDict[CID].append(line)
        else:
            sortDict[CID] = [line]
    CIDs.sort()

    newLines = []
    for CID in CIDs:
        for line in sortDict[CID]:
            newLines.append(line)

    #write new file
    newFile = open(pFileName, 'w')
    for line in newLines:
        newFile.write(line)
    newFile.close()
import bioLibCG as cg
import os

direc = '/home/chrisgre/apps/projects/small.rna.libs'
metaFileName = direc + '/' + 'small.meta'

##record files already in the meta file so there aren't any duplicates
fileDict = {} #filename...
metaFile = open(metaFileName, 'r')

#add new entries
countFiles = cg.recurseDir(direc, end='.fastq')
for file in countFiles:

    fileName = file.strip().split('/')[-1]
    if len(fileName.split('.')) > 2: #has to specifically end in .fastq
        continue
    fileName = cg.getBaseFileName(file, naked=True)

    dir = file.strip().split('/')[-2]
    org = 'NONE'
    if 'human' in dir:
        org = 'human'
    if 'mouse' in dir:
        org = 'mouse'
    if 'pig' in dir:
        org = 'pig'
    if 'dog' in dir:
        org = 'dog'
    if 'rat' in dir:
        org = 'rat'
###This will calculate the total number of hits per microRNA --> update
##the newResults file with them --> put the value of the highest mature --> CID
import cgConfig
import bioLibCG as cg

#go through each chromosome wig and check the mature seqs
mainConf = cgConfig.cgConfig('Main.conf')
conf = cgConfig.cgConfig()
newLines = []
pFileName = 'newResults.results'

for wigFileN in cg.recurseDir(mainConf.conf['wigMouse'], end='.wig'):

    #init
    chrom = wigFileN.strip().split('.')[-2]
    strand = wigFileN.strip().split('.')[-4]
    wigFile = open(wigFileN, 'r')
    mirFile = open(conf.conf['results'], 'r')
    print wigFileN

    #get rid of header
    wigFile.readline()

    print ' populating hitmap'
    #populate hitmap
    wigMap = {}
    for line in wigFile:
        value = int(line.strip().split('\t')[3].split('.')[0])
def clipAdapterInDirQ(dirName):
    '''The Q is for doing it on a cluster using qsub.
    Every Q function has a corresponding shell script'''

    for file in cg.recurseDir(dirName, end='.fastq'):
        subprocess.Popen([wrapperShell, file])
import cgConfig as c
import bioLibCG as cg
import wigValue
import compareData as compare

#init
mConf = c.cgConfig('Main.conf')
conf = c.cgConfig()
tccList = ['chr3:-1:96042576:96042685']
#tccList = compare.tccFileToList('mouseKnownMirs.tcc', 0)

timer = cg.cgTimer()
timer.start()

#put peaks in memory
print 'loading peak data'
peakFilesNames = cg.recurseDir(mConf.conf['wigMouse'], end='.peaks')
peaks = {} # chrom : strand : peak : value
for pN in peakFilesNames:
    chrom = pN.strip().split('.')[4]
    strand = pN.strip().split('.')[2]

    #init dictionary
    if chrom not in peaks:
        peaks[chrom] = {}
    if strand not in peaks[chrom]:
        peaks[chrom][strand] = {}

    #get peaks and values and put in dictionary
    pFile = open(pN, 'r')
    for line in pFile:
#using the continuous blocks from the small RNA lib file, identify all the peaks
import bioLibCG as cg
import cgConfig as c

mConf = c.cgConfig('Main.conf')
fileNames = cg.recurseDir(mConf.conf['wigMouse'], start='cb.', end='.tsv')

for fN in fileNames:
    file = open(fN, 'r')

    #get peak positions and average heights
    lines = [str(int(x.split('\t')[0]) + int(x.split('\t')[2]) / 2) + '\t' + str(x.strip().split('\t')[3])
             for x in file]
    file.close()

    #output new file with new ending
    outFile = open(fN + '.peakdata', 'w')
    for line in lines:
        outFile.write('%s\n' % line)
import cgConfig as c
import bioLibCG as cg
import cgSort

mConf = c.cgConfig('Main.conf')
smallPath = mConf.conf['smallPath']
smallPath = '/home/chrisgre/smallLibs/WIGS/zebrafish'

#grab everything - NOT WIG MERGES...
smallLibs = cg.recurseDir(smallPath, end='.wig')

for lib in smallLibs:
    print 'sorting', lib
    cgSort.wigSort(lib)
import bioLibCG as cg
import cgRnaSeq

dir = '/home/chrisgre/smallLibs/zebrafish.embryo.GSE22068.20620952'

QNames = cg.recurseDir(dir, end='.fastq') #fastq file names
slNames = cg.recurseDir(dir, end='.txt') #single-sequence-per-line file names
fastNames = cg.recurseDir(dir, end='.fa')
fastNames.extend(cg.recurseDir(dir, end='.fna'))

for QFileName in QNames:
    print 'Creating counts for file', QFileName
    cgRnaSeq.createCountFileFastQ(QFileName)

for slName in slNames:
    print 'Creating counts for file', slName
    cgRnaSeq.createCountFileSL(slName)

for fastName in fastNames:
    print 'Creating counts for file', fastName
    cgRnaSeq.createCountFileFasta(fastName)
def createMultiTrack(dirName, organism):
    '''merge all mapped tracks in directory and create a single wig file'''

    mainConf = c.cgConfig('Main.conf')
    metaFileName = mainConf.conf['metaFileName']

    fileList = []
    for file in cg.recurseDir(dirName, end='.mapped'):

        #check organism via the meta file (should go into a std function for the meta file)
        baseFName = cg.getBaseFileName(file, naked=True)
        metaDict = cg.getMetaFileDict(metaFileName)
        org = 'None'
        if baseFName in metaDict:
            if metaDict[baseFName][1] == 'NONE':
                print ' NO ORG KNOWN FOR', file
                continue
            elif not metaDict[baseFName][1] == organism:
                print ' NOT ORGANISM RUNNING', file
                continue
            else:
                org = metaDict[baseFName][1]
                print ' USING ORG', org, file

        #check if there is an organism; must check due to files not in metaFile
        if org == 'None':
            print ' NO org (not in meta file)', file
            continue

        #only make wig file for the organism asked for
        if not org == organism:
            continue

        #if it is the right organism and has a mapped file, add it
        fileList.append(file)

    #make merged wig
    if organism == 'human':
        chroms = cg.humanChromosomes
        assembly = 'hg19'
    elif organism == 'mouse':
        chroms = cg.mouseChromosomes
        assembly = 'mm9'
    elif organism == 'zebrafish':
        chroms = cg.zebrafishChromosomes
        assembly = 'danRer6'

    print 'Making Bed File vectors'
    cvg = HTSeq.GenomicArray(chroms, stranded=True, typecode='i')
    for fName in fileList:
        alignment_file = HTSeq.BowtieReader(fName)
        for alngt in alignment_file:
            if alngt.aligned:
                cvg.add_value(1, alngt.iv) #iv is the genomic interval

    bedNamePos = dirName + '/Merge.' + organism + '.1.wig'
    bedNameNeg = dirName + '/Merge.' + organism + '.-1.wig'

    print 'Writing Bed File'
    cvg.write_bedgraph_file(bedNamePos, "+")
    cvg.write_bedgraph_file(bedNameNeg, "-")

    #Now extend it
    updateWigLength(bedNamePos, assembly)
    updateWigLength(bedNameNeg, assembly)

    #Now sort it
    cgSort.wigSort(bedNamePos)
    cgSort.wigSort(bedNameNeg)