def parClean(fN):
    '''Rebuild fN by concatenating the range packets of a parallel run.

    Steps, in order: delete any existing fN, delete the run's exit-signal
    files, `cat` the range files into a fresh fN (ordered by the integer in
    the second-to-last dot-separated field of their names), then delete the
    range files.

    fN -- path of the combined output file. A name without '/' is looked up
          in the directory given by the PWD environment variable.

    Raises subprocess.CalledProcessError when `cat` fails on a packet; the
    packets are left on disk in that case.
    '''
    if '/' in fN:
        dirName = os.path.dirname(fN)
    else:
        dirName = os.environ['PWD']

    basename = os.path.basename(fN)

    # remove the original file; it may not exist yet, which is fine
    try:
        os.remove(fN)
    except OSError:
        pass

    # remove the exit signals
    for eFN in bioLibCG.recurseDir(dirName, start=basename, end='exitSignal'):
        os.remove(eFN)

    # cat the SORTED range files
    rangeFiles = bioLibCG.recurseDir(dirName, start=basename, within='range')
    rangeFiles.sort(key=lambda x: int(x.split('.')[-2]))

    with open(fN, 'w') as f:
        for rFN in rangeFiles:
            # check_call aborts on a failed cat, so the packets below are
            # only deleted once their content is safely in fN
            subprocess.check_call(['cat', rFN], stdout=f)

    # remove the range files
    for rFN in rangeFiles:
        os.remove(rFN)
def probe(tcc, conf=None):
    '''Print the peak value of a region in every matching wig library.

    tcc  -- region string; cg.tccSplit splits it into chrom, strand, start
            and end (only strand is used here, to pick the wig files).
    conf -- optional configuration object exposing a ``conf`` dict with a
            'smallPath' entry (assumed to be cgConfig-like -- confirm against
            callers). When omitted, 'Main.conf' is loaded.

    For each file under smallPath ending in 'mapped.<strand>.wig', the largest
    value returned by stepVectorScan.scanVectorsFile is printed if positive;
    the sum of those maxima is printed last. Returns None.
    '''
    # Previously mConf was only bound when conf was missing, so supplying a
    # conf raised UnboundLocalError on the next line.
    mConf = conf if conf else c.cgConfig('Main.conf')
    smallPath = mConf.conf['smallPath']

    chrom, strand, start, end = cg.tccSplit(tcc)

    total = 0
    for lib in cg.recurseDir(smallPath, end='mapped.%s.wig' % strand):

        try:
            eLevels = stepVectorScan.scanVectorsFile(lib, [tcc])
        except Exception:
            # best effort: report the library and move on, but let
            # KeyboardInterrupt/SystemExit through
            print('%s index failed' % (lib,))
            continue

        # find highest expression level (never below 0)
        highest = 0
        for coord in eLevels:
            if eLevels[coord] > highest:
                highest = eLevels[coord]

        if highest > 0:
            print('%s %s' % (lib, highest))
            total += highest

    print(total)
Exemple #3
0
def scanSequence(seqList, dirName):
    '''Given a sequence --> get all reads that have that sequence.

    seqList -- the sequence to search for (despite the name it is used as a
               single search key, not iterated).
    dirName -- directory searched with cg.recurseDir for the '.sequence' file.

    Returns the stripped lines, starting where the index is positioned after
    binarySearch/extendUp, for which the check function returns 0.
    Returns 1 (after printing the candidates) when more than one '.sequence'
    file is found. Raises IndexError when none is found.
    '''
    fileNames = cg.recurseDir(dirName, end='.sequence')
    if len(fileNames) > 1:
        print(fileNames)
        print('there is more than one sequence file in this directory')
        return 1
    fN = fileNames[0]

    seq = seqList
    fIndex = cgIndex.lineIndex(fN, header=False)
    fIndex.passCheckFunction(cgIndex.mapSequenceCheckFunction)
    fIndex.binarySearch(seq)  # places file pointer at beginning of sequence line

    # extend and report
    fIndex.extendUp(seq)
    finalReads = []
    for line in fIndex.file:
        if fIndex.checkFunction(seq, line) != 0:
            break
        finalReads.append(line.strip())

    # Returning after the loop covers the case where the file ends while the
    # lines still match; previously that path fell through and returned None.
    return finalReads
Exemple #4
0
def scanSequence(seqList, dirName):
    '''Collect every read line matching a sequence from the '.sequence' index.

    seqList -- search key handed to the index (used as one sequence).
    dirName -- directory expected to contain exactly one '.sequence' file.

    Returns a list of stripped matching lines. Returns 1 when several
    '.sequence' files exist; raises IndexError when there is none.
    '''
    fileNames = cg.recurseDir(dirName, end='.sequence')
    if len(fileNames) > 1:
        print(fileNames)
        print('there is more than one sequence file in this directory')
        return 1
    fN = fileNames[0]

    seq = seqList
    fIndex = cgIndex.lineIndex(fN, header=False)
    fIndex.passCheckFunction(cgIndex.mapSequenceCheckFunction)
    # places file pointer at beginning of sequence line
    fIndex.binarySearch(seq)

    # extend and report
    fIndex.extendUp(seq)
    finalReads = []
    for line in fIndex.file:
        if fIndex.checkFunction(seq, line) == 0:
            finalReads.append(line.strip())
        else:
            break

    # Single exit point: the list is returned even when the matching lines
    # run to the end of the file (that case used to return None).
    return finalReads
Exemple #5
0
def createMTrack(dirName):
    '''Merge all '.out' alignment files in dirName into two hg19 wig files.

    Coverage is accumulated in a stranded HTSeq.GenomicArray over
    cg.humanChromosomes, written to dirName/Merge.hg19.1.wig ("+") and
    dirName/Merge.hg19.-1.wig ("-"), then each file is passed to
    updateWigLength and cgSort.wigSort.
    '''
    alignmentFiles = cg.recurseDir(dirName, end='.out')

    chroms = cg.humanChromosomes
    print('Making Bed File vectors')
    coverage = HTSeq.GenomicArray(chroms, stranded=True, typecode='i')
    for alignmentFN in alignmentFiles:
        print(alignmentFN)
        for read in HTSeq.BowtieReader(alignmentFN):
            if not read.aligned:
                continue
            try:
                coverage.add_value(1, read.iv)  # iv is the genomic interval
            except KeyError:
                # add_value raised KeyError for this interval; skip the read
                pass

    plusTrackFN = dirName + '/Merge.hg19.1.wig'
    minusTrackFN = dirName + '/Merge.hg19.-1.wig'

    print('Writing Bed File')
    coverage.write_bedgraph_file(plusTrackFN, "+")
    coverage.write_bedgraph_file(minusTrackFN, "-")

    # extend both tracks first, then sort both
    for trackFN in (plusTrackFN, minusTrackFN):
        updateWigLength(trackFN, 'hg19')
    for trackFN in (plusTrackFN, minusTrackFN):
        cgSort.wigSort(trackFN)
def probe(tcc, conf=None):
    '''Report, per wig library, the highest value found inside a region.

    tcc  -- region string understood by cg.tccSplit (chrom, strand, start, end).
    conf -- optional config object with a ``conf`` dict holding 'smallPath'
            (presumably the same kind of object c.cgConfig returns -- verify
            against callers). Defaults to loading 'Main.conf'.

    Prints "<lib> <highest>" for each library with a positive maximum and
    finally the sum of those maxima. Returns None.
    '''
    if conf:
        # The original ignored a supplied conf and then failed with
        # UnboundLocalError because mConf was never assigned.
        mConf = conf
    else:
        mConf = c.cgConfig('Main.conf')
    smallPath = mConf.conf['smallPath']

    chrom, strand, start, end = cg.tccSplit(tcc)

    total = 0
    for lib in cg.recurseDir(smallPath, end='mapped.%s.wig' % strand):
        try:
            eLevels = stepVectorScan.scanVectorsFile(lib, [tcc])
        except Exception:
            # skip libraries that cannot be scanned; unlike a bare except this
            # does not swallow KeyboardInterrupt
            print('%s index failed' % (lib,))
            continue

        # find highest expression level
        highest = 0
        for coord in eLevels:
            level = eLevels[coord]
            if level > highest:
                highest = level

        if highest > 0:
            print('%s %s' % (lib, highest))
            total += highest

    print(total)
def mixWig(directory, assembly, name=None):
    '''Sum all '.wig' files under directory, one chromosome at a time.

    Does it by chromosome --> faster, less memory.

    directory -- searched with cg.recurseDir for files ending in '.wig'; the
                 base name is split on '.', and the third-from-last field is
                 taken as the chromosome, the second-from-last as the strand.
    assembly  -- passed through to writeWigFromHitDict.
    name      -- passed through to writeWigFromHitDict; defaults to 'Merge'.

    Each data line must hold four tab-separated fields (chrom, start, end,
    value); values below 1 are ignored. For every chromosome found in the
    file names, writeWigFromHitDict is called once with the summed values.
    '''
    if not name: name = 'Merge'

    # gather all chromosomes, in order of first appearance
    chromList = []
    for fN in cg.recurseDir(directory, end='.wig'):
        chrom = cg.getBaseFileName(fN).strip().split('.')[-3]
        if chrom not in chromList:
            chromList.append(chrom)

    print(chromList)

    for chrom in chromList:

        print(chrom)
        # Gather all the values from all the files
        hitDict = {}  # chrom : { strand : { coord : summed value } }
        for fN in cg.recurseDir(directory, end='.wig'):
            nameFields = cg.getBaseFileName(fN).strip().split('.')
            fChrom = nameFields[-3]
            if fChrom != chrom: continue
            print('   %s %s' % (fN, fChrom))
            strand = nameFields[-2]

            # with-block: the original never closed these handles
            with open(fN, 'r') as f:
                f.readline()  # header
                for line in f:
                    lChrom, start, end, val = line.strip().split('\t')
                    start, end, val = int(start), int(end), int(val)
                    # an empty interval adds nothing, so skip it before
                    # creating any dict entries (same result as before)
                    if val < 1 or start >= end: continue

                    coord_val = hitDict.setdefault(lChrom, {}).setdefault(strand, {})
                    for i in range(start, end):
                        coord_val[i] = coord_val.get(i, 0) + val

        # write results to wig file
        writeWigFromHitDict(hitDict, assembly, name, directory)
Exemple #8
0
def mixWig(directory, assembly, name=None):
    '''Merge the '.wig' files of a directory chromosome by chromosome.

    Does it by chromosome --> faster, less memory.

    directory -- scanned via cg.recurseDir(end='.wig'); chromosome and strand
                 are read from the third-from-last and second-from-last
                 dot-separated fields of each file's base name.
    assembly, name -- forwarded to writeWigFromHitDict (name defaults to
                 'Merge').

    After a header line, each wig line is parsed as chrom<TAB>start<TAB>end
    <TAB>value; values >= 1 are added to every coordinate in
    range(start, end).
    '''
    if not name:
        name = 'Merge'

    # gather all chromosomes
    chromList = []
    for fN in cg.recurseDir(directory, end='.wig'):
        chrom = cg.getBaseFileName(fN).strip().split('.')[-3]
        if chrom not in chromList:
            chromList.append(chrom)

    print(chromList)

    for chrom in chromList:
        print(chrom)

        # chrom : { strand : { coord : summed value } }
        hitDict = {}
        for fN in cg.recurseDir(directory, end='.wig'):
            fChrom = cg.getBaseFileName(fN).strip().split('.')[-3]
            if fChrom != chrom:
                continue
            print('   %s %s' % (fN, fChrom))
            strand = cg.getBaseFileName(fN).strip().split('.')[-2]

            f = open(fN, 'r')
            try:
                f.readline()  # header
                for line in f:
                    lChrom, start, end, val = line.strip().split('\t')
                    start, end, val = int(start), int(end), int(val)
                    if val < 1:
                        continue
                    for i in range(start, end):
                        strandHits = hitDict.setdefault(lChrom, {}).setdefault(strand, {})
                        strandHits[i] = strandHits.get(i, 0) + val
            finally:
                # the handle used to be leaked for every input file
                f.close()

        # write results to wig file
        writeWigFromHitDict(hitDict, assembly, name, directory)
def parCleanSplit(fN):
    '''Delete the exit-signal files belonging to fN (split continuation run).

    A bare file name (no '/') is resolved against the PWD environment
    variable; otherwise the directory part of fN is searched.
    '''
    searchDir = os.path.dirname(fN) if '/' in fN else os.environ['PWD']
    prefix = os.path.basename(fN)

    signalFiles = bioLibCG.recurseDir(searchDir, start=prefix, end='exitSignal')
    for signalFN in signalFiles:
        os.remove(signalFN)
def parCleanSplit(fN):
    '''Remove the exit signals for a split continuation run.

    fN -- output file whose exit-signal files (names starting with fN's base
          name and ending in 'exitSignal') are deleted. Without a '/' in fN
          the directory is taken from the PWD environment variable.
    '''
    hasDirectory = '/' in fN
    if not hasDirectory:
        dirName = os.environ['PWD']
    else:
        dirName = os.path.dirname(fN)

    namePrefix = os.path.basename(fN)

    for exitSignalFN in bioLibCG.recurseDir(dirName, start=namePrefix,
                                            end='exitSignal'):
        os.remove(exitSignalFN)
def parClean(fN):
    '''Rebuild fN from the range packets of a parallel run, with progress output.

    Order of operations: remove fN if present, remove the exit-signal files,
    `cat` the range files into a new fN (sorted by the integer in the
    second-to-last dot-separated field of each name), remove the range files.

    fN -- combined output path; a name without '/' is resolved against the
          PWD environment variable.

    Raises subprocess.CalledProcessError if `cat` fails; the packets are then
    kept on disk.
    '''
    if '/' in fN:
        dirName = os.path.dirname(fN)
    else:
        dirName = os.environ['PWD']

    basename = os.path.basename(fN)

    # remove the original file
    print('..removing original file, if present')
    try:
        os.remove(fN)
    except OSError:
        pass

    # remove the exit signals
    print('..removing exit signals')
    for eFN in bioLibCG.recurseDir(dirName, start=basename, end='exitSignal'):
        os.remove(eFN)

    # cat the SORTED range files
    rangeFiles = bioLibCG.recurseDir(dirName, start=basename, within='range')
    rangeFiles.sort(key=lambda x: int(x.split('.')[-2]))

    print('..catting files together')
    with open(fN, 'w') as f:
        for rFN in rangeFiles:
            print('.... %s' % (rFN,))
            # Popen(...).wait() ignored cat's exit status, so packets were
            # deleted below even when copying them had failed.
            subprocess.check_call(['cat', rFN], stdout=f)

    # remove the range files
    print('..removing packets')
    for rFN in rangeFiles:
        print('.... %s' % (rFN,))
        os.remove(rFN)
def getAll(chrom, strand, point):
    '''Print the wig value at one position for every same-strand wig file.

    chrom, point -- forwarded to getWigValue.
    strand       -- compared (as a string) with the second-to-last
                    '.'-separated field of each file name.

    Wig files under mConf.conf['smallPath'] are scanned, excluding any path
    containing 'WIG' and the two merged tracks listed below. Files whose
    value is > 0 are printed as "<file> <value>". Returns None.
    '''
    mergedTracks = ('/home/chrisgre/smallLibs/WIGS/Merge.mouse.1.wig',
                    '/home/chrisgre/smallLibs/WIGS/Merge.human.1.wig')

    # Build a filtered list instead of removing while iterating: the old loop
    # skipped every other entry, and its `file == a or b` test was always
    # true, so it dropped an arbitrary half of the files.
    fNs = [fN for fN in cg.recurseDir(mConf.conf['smallPath'], end='.wig')
           if 'WIG' not in fN and fN not in mergedTracks]

    for fN in fNs:
        fStrand = cg.ss(fN, '.')[-2]
        if str(fStrand) == str(strand):
            val = getWigValue(chrom, point, fN)
            if val > 0:
                print('%s %s' % (fN, val))
Exemple #13
0
def getAll(chrom, strand, point):
    '''Look up one coordinate in every individual wig library and print hits.

    chrom, point -- position handed to getWigValue.
    strand       -- only files whose second-to-last dot-separated name field
                    equals str(strand) are queried.

    Paths containing 'WIG', and the two merged tracks named below, are
    excluded. Prints "<file> <value>" for each value above 0; returns None.
    '''
    excluded = set(['/home/chrisgre/smallLibs/WIGS/Merge.mouse.1.wig',
                    '/home/chrisgre/smallLibs/WIGS/Merge.human.1.wig'])

    # The previous version removed items from the list it was iterating
    # (skipping neighbours) and used `file == a or b`, which is always true.
    fNs = []
    for candidate in cg.recurseDir(mConf.conf['smallPath'], end='.wig'):
        if 'WIG' in candidate or candidate in excluded:
            continue
        fNs.append(candidate)

    for fN in fNs:
        fStrand = cg.ss(fN, '.')[-2]
        if str(fStrand) != str(strand):
            continue
        val = getWigValue(chrom, point, fN)
        if val > 0:
            print('%s %s' % (fN, val))
def parClean(fN):
    '''Concatenate the range packets of a parallel run into fN and clean up.

    1. remove fN if it exists
    2. remove the exit-signal files for fN
    3. cat the range files into fN, ordered by the integer in the
       second-to-last dot-separated field of their names
    4. remove the range files

    fN -- combined output path; when it contains no '/', the directory comes
          from the PWD environment variable.

    Raises subprocess.CalledProcessError when a `cat` call fails, in which
    case step 4 is not reached.
    '''
    dirName = os.path.dirname(fN) if '/' in fN else os.environ['PWD']
    basename = os.path.basename(fN)

    # remove the original file
    print('..removing original file, if present')
    try:
        os.remove(fN)
    except OSError:
        pass

    # remove the exit signals
    print('..removing exit signals')
    for eFN in bioLibCG.recurseDir(dirName, start=basename, end='exitSignal'):
        os.remove(eFN)

    # cat the SORTED range files.
    rangeFiles = bioLibCG.recurseDir(dirName, start=basename, within='range')
    rangeFiles.sort(key=lambda x: int(x.split('.')[-2]))

    print('..catting files together')
    out = open(fN, 'w')
    try:
        for rFN in rangeFiles:
            print('.... %s' % (rFN,))
            # fail loudly: a non-zero exit from cat must stop us before the
            # packets are deleted
            subprocess.check_call(['cat', rFN], stdout=out)
    finally:
        out.close()

    # remove the range files
    print('..removing packets')
    for rFN in rangeFiles:
        print('.... %s' % (rFN,))
        os.remove(rFN)
Exemple #15
0
        def load(self):
            '''Build and return {id: object} from the attribute files in self.aDir.

            Every file whose base name starts with 'a.' is read; the second
            dot-separated field of the name is the attribute name. Each line
            is "<int id>\\t<value>"; the value is converted with the caste
            function for the attribute's dataType (element-wise after
            splitting on ',' when the type is in listTypes). Objects are
            created on demand with self.mappingClass(id). Attributes that
            were loaded for some ids but not for a given object are filled
            with a copy of the field's dataDefault.
            '''
            # get schema
            attName_field = getClassScheme(self.mappingClass)

            id_obj = {}
            selectedAttNames = []

            # load defined attributes
            for fN in bioLibCG.recurseDir(self.aDir):
                baseName = fN.strip().split('/')[-1]
                if not baseName.startswith('a.'):
                    continue

                # attribute name/type come from the file name and the schema
                attName = baseName.split('.')[1]
                attType = attName_field[attName].dataType
                casteFxn = getCasteFunction(attType)
                selectedAttNames.append(attName)

                # now set the attribute on every id listed in the file
                with open(fN, 'r') as attFile:
                    for line in attFile:
                        fields = line.strip().split('\t')
                        objId = int(fields[0])
                        rawValue = fields[1]
                        if attType in listTypes:
                            value = [casteFxn(x) for x in rawValue.split(',')]
                        else:
                            value = casteFxn(rawValue)

                        if objId not in id_obj:
                            id_obj[objId] = self.mappingClass(objId)
                        setattr(id_obj[objId], attName, value)

            # Fill in defaults -- must happen after all the data has been
            # loaded so that only genuinely missing attributes are touched.
            for obj in id_obj.values():
                missing = [a for a in selectedAttNames if a not in obj.__dict__]
                for attName in missing:
                    setattr(obj, attName,
                            copy.copy(attName_field[attName].dataDefault))

            return id_obj
Exemple #16
0
def checkDups(dir):
    '''Print every id that has the same sequence recorded more than once.

    dir -- directory searched with bioLibCG.recurseDir for '.simSeqs' files.
           Each line is expected to hold at least two tab-separated fields:
           id and sequence.

    For each id whose collected sequences contain a duplicate, prints 'fail'
    followed by "<id> <list of sequences>". Returns None.
    '''
    id_seqs = {}
    for simFN in bioLibCG.recurseDir(dir, end='.simSeqs'):
        # with-block: the original never closed these handles
        with open(simFN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')
                id_seqs.setdefault(ls[0], []).append(ls[1])

    for seqId, seqs in id_seqs.items():
        if len(seqs) != len(set(seqs)):
            print('fail')
            print('%s %s' % (seqId, seqs))
Exemple #17
0
    def load(self):
        '''Read the 'a.<attName>...' files under self.aDir into objects.

        Returns a dict mapping integer id -> self.mappingClass instance.
        File lines are "<id>\\t<value>"; values are cast with the function
        getCasteFunction returns for the attribute's dataType, and list types
        (those in listTypes) are split on ',' and cast per element. After
        loading, each object receives a copy of dataDefault for every loaded
        attribute it is still missing.
        '''
        # get schema
        attName_field = getClassScheme(self.mappingClass)

        id_obj = {}
        selectedAttNames = []

        def castValue(text, attType, casteFxn):
            # list-typed attributes are stored comma separated
            if attType in listTypes:
                return [casteFxn(piece) for piece in text.split(',')]
            return casteFxn(text)

        # load defined attributes
        for fN in bioLibCG.recurseDir(self.aDir):
            baseName = fN.strip().split('/')[-1]
            if baseName.startswith('a.'):
                # get attribute name/type
                nameFields = baseName.split('.')
                attName = nameFields[1]
                attType = attName_field[attName].dataType
                casteFxn = getCasteFunction(attType)
                selectedAttNames.append(attName)

                # now set the attributes
                handle = open(fN, 'r')
                for line in handle:
                    columns = line.strip().split('\t')
                    key = int(columns[0])
                    value = castValue(columns[1], attType, casteFxn)

                    # create the object the first time its id is seen
                    if key not in id_obj:
                        id_obj[key] = self.mappingClass(key)
                    setattr(id_obj[key], attName, value)
                handle.close()

        # Defaults are applied only once everything is loaded.
        for obj in id_obj.values():
            loadedAttNames = obj.__dict__.keys()
            for attName in selectedAttNames:
                if attName in loadedAttNames:
                    continue
                default = attName_field[attName].dataDefault
                setattr(obj, attName, copy.copy(default))

        return id_obj
Exemple #18
0
def checkDups(dir):
    '''Report ids whose sequence list in the '.simSeqs' files has duplicates.

    dir -- directory passed to bioLibCG.recurseDir(end='.simSeqs'). Lines are
           split on tabs; field 0 is the id, field 1 the sequence.

    Prints 'fail' and then "<id> <sequences>" for each offending id.
    Returns None.
    '''
    id_seqs = {}
    for simFN in bioLibCG.recurseDir(dir, end='.simSeqs'):
        f = open(simFN, 'r')
        try:
            for line in f:
                ls = line.strip().split('\t')
                seqId, seq = ls[0], ls[1]
                id_seqs.setdefault(seqId, []).append(seq)
        finally:
            # previously the handle was never closed
            f.close()

    for seqId, seqs in id_seqs.items():
        uniqueCount = len(set(seqs))
        if uniqueCount != len(seqs):
            print('fail')
            print('%s %s' % (seqId, seqs))
Exemple #19
0
def mapFastQInDirQ(dirName, overwrite = True):
    '''Submit a qsub mapping job for every 'clipped.fastq' file in dirName.

    Every Q function has a corresponding shell script (here mapFastQ.sh).

    dirName   -- directory searched with cg.recurseDir.
    overwrite -- when an output '<name>.clipped.fastq.mapped' already exists:
                 True deletes it and continues, False skips the input file.

    The organism is read from column 2 of the tab-separated meta files in
    the module-level metaFileNames, matching column 0 against the naked base
    file name. Files with no organism are skipped.
    '''
    wrapperShell = '/home/chrisgre/scripts/mapping/mapFastQ.sh'

    for fastqFN in cg.recurseDir(dirName, end = 'clipped.fastq'):
        print(fastqFN)

        mappedFN = fastqFN.replace('.clipped.fastq', '.clipped.fastq.mapped')
        if os.path.isfile(mappedFN):
            if not overwrite:
                print('  \nNOT OVERWRITING FILE %s' % (mappedFN,))
                continue
            print('  Overwriting file %s' % (mappedFN,))
            os.remove(mappedFN)

        # check if mouse or human
        libName = cg.getBaseFileName(fastqFN, naked = True)
        org = 'None'
        for metaFileName in metaFileNames:
            mFile = open(metaFileName, 'r')
            for line in mFile:
                fields = line.strip().split('\t')
                if libName != fields[0]:
                    continue
                if fields[2] == 'NONE':
                    print('  NO ORG KNOWN FOR %s' % (fastqFN,))
                else:
                    org = fields[2]
                    print('  USING ORG %s %s' % (org, fastqFN))
            mFile.close()

        # check if there is an organism, must check due to files not in metaFile
        if org == 'None':
            print('  NO org %s' % (fastqFN,))
            continue

        # throttle: poll every 20 s until fewer than 40 jobs are queued
        while not clusterCheck.queryNumJobsQ('chrisgre') < 40:
            time.sleep(20)

        qsubCommand = ['qsub', '-l', 'mem=4G', '-V',
                       '-o', mainConf.conf['outLog'],
                       '-e', mainConf.conf['errorLog'],
                       wrapperShell, fastqFN, org]
        subprocess.Popen(qsubCommand)
        time.sleep(.2)  # give it time to update qstat
Exemple #20
0
def checkExit(fN, numPackets):
    '''Block until exactly numPackets exit-signal files exist for fN.

    fN         -- output file whose base name prefixes the exit-signal files.
                  If the PWD environment variable is not a substring of fN's
                  directory, the directory is resolved relative to PWD.
    numPackets -- expected number of signals (converted with int()).

    Polls once per second and shows the count on a ProgressBar.
    '''
    expected = int(numPackets)
    signalDir = os.path.dirname(fN)
    prefix = os.path.basename(fN)
    if os.environ['PWD'] not in signalDir:
        signalDir = os.path.dirname(os.environ['PWD'] + '/' + fN)

    pollSeconds = 1
    barWidgets = ['  ', SimpleProgress(), ' ', Timer(), ' ', Bar()]
    progress = ProgressBar(widgets=barWidgets, maxval=expected).start()

    found = None  # forces at least one poll
    while found != expected:
        time.sleep(pollSeconds)
        found = len(bioLibCG.recurseDir(signalDir, start = prefix, end = 'exitSignal'))
        progress.update(found)

    progress.finish()
    print('Jobs all finished!')
def checkExit(fN, numPackets):
    '''Wait until exactly numPackets exit-signal files exist for fN.

    fN         -- output file; exit signals are files in its directory whose
                  names start with fN's base name and end in 'exitSignal'.
                  When PWD is not a substring of that directory, the
                  directory is taken relative to PWD.
    numPackets -- expected signal count (converted with int()).

    Polls every 10 seconds and prints a "waiting..." status line each time.
    '''
    expected = int(numPackets)
    signalDir = os.path.dirname(fN)
    prefix = os.path.basename(fN)
    if os.environ['PWD'] not in signalDir:
        signalDir = os.path.dirname(os.environ['PWD'] + '/' + fN)

    sleepTime = 10
    waited = 0  # total seconds slept so far
    while True:
        time.sleep(sleepTime)
        waited += sleepTime

        found = len(bioLibCG.recurseDir(signalDir, start = prefix, end = 'exitSignal'))
        elapsed = '%s' % bioLibCG.prettyTime(waited)
        print('waiting... %s / %s %s %s' % (found, expected, elapsed, fN))

        if found == expected:
            print('Jobs all finished!')
            break
def clipAdapterInDirQ(dirName):
    '''Queue an adapter-clipping qsub job for each unclipped '.fastq' file.

    The Q is for doing it on a cluster using qsub.
    Every Q function has a corresponding shell script (the module-level
    wrapperShell).

    Files whose path contains 'clipped' are skipped. Submission is throttled:
    while 100 or more jobs are reported for user 'chrisgre', poll every 10 s.
    '''
    for fastqFN in cg.recurseDir(dirName, end = '.fastq'):
        # already-clipped outputs are not re-submitted
        if 'clipped' in fastqFN:
            continue

        while not clusterCheck.queryNumJobsQ('chrisgre') < 100:
            time.sleep(10)

        qsubCommand = ['qsub', '-V', '-cwd', '-o', 'errors', '-e', 'errors',
                       wrapperShell, fastqFN]
        subprocess.Popen(qsubCommand)
        time.sleep(.2)  # give it time to update qstat
def mergeInputs(cName, eLevel):
    '''Concatenate all 'out' files ending in '<eLevel>.<assembly>' into one.

    cName  -- configuration name passed to c.getConfig; its 'assembly' entry
              is used in the file ending.
    eLevel -- first part of the file ending.

    All matching files are read completely, then written in order to
    'peakData.<eLevel>.<assembly>' in the current directory.
    '''
    assembly = c.getConfig(cName).conf['assembly']
    ending = '%s.%s' % (eLevel, assembly)

    print('merging all files with ending %s' % (ending,))

    collected = []
    for relFN in cg.recurseDir('out', end = ending):
        print('%s %s' % (os.getcwd(), relFN))
        source = open(os.getcwd() + '/' + relFN, 'r')
        collected += source.readlines()
        source.close()

    merged = open('peakData.%s.%s' % (eLevel, assembly), 'w')
    merged.writelines(collected)
    merged.close()
Exemple #24
0
def mergeInputs(cName, eLevel):
    '''Merge every file under 'out' with the ending '<eLevel>.<assembly>'.

    The assembly comes from the 'assembly' entry of the configuration
    returned by c.getConfig(cName). The lines of all matching files are
    gathered in memory and written to 'peakData.<eLevel>.<assembly>'.
    '''
    conf = c.getConfig(cName)
    assembly = conf.conf['assembly']
    ending = '%s.%s' % (eLevel, assembly)

    print('merging all files with ending %s' % (ending,))

    def readLines(path):
        # read one input completely and release the handle
        handle = open(path, 'r')
        lines = handle.readlines()
        handle.close()
        return lines

    newLines = []
    for fN in cg.recurseDir('out', end=ending):
        print('%s %s' % (os.getcwd(), fN))
        newLines.extend(readLines(os.getcwd() + '/' + fN))

    outFile = open('peakData.%s.%s' % (eLevel, assembly), 'w')
    outFile.writelines(newLines)
    outFile.close()
Exemple #25
0
def createMultiTrackDir(dirName, organism):
    '''Make one merged wig pair from every '.mapped' file in dirName.

    THIS DIFFERS FROM ABOVE BECAUSE IT DOESN'T REQUIRE META INFO:
    IT JUST MAKES A MERGED WIG FOR EVERYTHING IN THE DIRECTORY.

    dirName  -- directory searched with cg.recurseDir(end='.mapped'); also
                receives the output files.
    organism -- 'human', 'mouse' or 'zebrafish'; selects the chromosome list
                on cg and the assembly passed to updateWigLength.

    Writes dirName/Merge.<organism>.1.wig ("+") and
    dirName/Merge.<organism>.-1.wig ("-"), then extends and sorts both.

    Raises ValueError for any other organism.
    '''
    # organism -> (attribute of cg holding the chromosome list, assembly)
    organismInfo = {
        'human': ('humanChromosomes', 'hg19'),
        'mouse': ('mouseChromosomes', 'mm9'),
        'zebrafish': ('zebrafishChromosomes', 'danRer6'),
    }
    # Validate first: an unknown organism used to surface later as an
    # UnboundLocalError on `chroms`.
    if organism not in organismInfo:
        raise ValueError('unknown organism %r (expected one of: %s)'
                         % (organism, ', '.join(sorted(organismInfo))))
    chromAttr, assembly = organismInfo[organism]
    chroms = getattr(cg, chromAttr)

    fileList = list(cg.recurseDir(dirName, end = '.mapped'))

    # make merged wig
    print('Making Bed File vectors')
    cvg = HTSeq.GenomicArray(chroms, stranded=True, typecode='i')
    for fName in fileList:
        alignment_file = HTSeq.BowtieReader(fName)
        for alngt in alignment_file:
            if alngt.aligned:
                cvg.add_value(1, alngt.iv)  # iv is the genomic interval

    bedNamePos = dirName + '/Merge.' + organism + '.1.wig'
    bedNameNeg = dirName + '/Merge.' + organism + '.-1.wig'

    print('Writing Bed File')
    cvg.write_bedgraph_file(bedNamePos, "+")
    cvg.write_bedgraph_file(bedNameNeg, "-")

    # Now extend it
    updateWigLength(bedNamePos, assembly)
    updateWigLength(bedNameNeg, assembly)

    # Now sort it
    cgSort.wigSort(bedNamePos)
    cgSort.wigSort(bedNameNeg)
def checkExit(fN, numPackets):
    '''Poll every 10 s until exactly numPackets exit signals exist for fN.

    Exit signals are the files bioLibCG.recurseDir finds in fN's directory
    whose names start with fN's base name and end in "exitSignal". If PWD is
    not a substring of that directory, the directory is resolved against PWD.
    A "waiting..." line with the count and elapsed time is printed per poll.
    '''
    target = int(numPackets)
    watchDir = os.path.dirname(fN)
    stem = os.path.basename(fN)
    workingDir = os.environ["PWD"]
    if workingDir not in watchDir:
        watchDir = os.path.dirname(workingDir + "/" + fN)

    sleepTime = 10
    iteration = 0
    finished = False
    while not finished:
        time.sleep(sleepTime)
        iteration += 1

        signals = bioLibCG.recurseDir(watchDir, start=stem, end="exitSignal")
        elapsed = "%s" % bioLibCG.prettyTime(sleepTime * iteration)
        print("waiting... %s / %s %s %s" % (len(signals), target, elapsed, fN))

        finished = len(signals) == target
        if finished:
            print("Jobs all finished!")
Exemple #27
0
def clipAdapterInDirQ(dirName):
    '''Submit one clipping job per '.fastq' file that is not yet clipped.

    The Q is for doing it on a cluster using qsub.
    Every Q function has a corresponding shell script.

    Jobs are only submitted while fewer than 100 jobs are reported for user
    'chrisgre'; otherwise the function sleeps 10 s and checks again.
    '''
    def waitForFreeSlot():
        # poll the queue until it is below the 100-job limit
        while True:
            if clusterCheck.queryNumJobsQ('chrisgre') < 100:
                return
            time.sleep(10)

    for fastqPath in cg.recurseDir(dirName, end='.fastq'):
        # skip outputs of a previous clipping run
        if 'clipped' in fastqPath:
            continue

        waitForFreeSlot()
        subprocess.Popen([
            'qsub', '-V', '-cwd', '-o', 'errors', '-e', 'errors',
            wrapperShell, fastqPath
        ])
        time.sleep(.2)  # give it time to update qstat
Exemple #28
0
def createTrackInDir(dirName):
    '''Submit a createTrack.sh qsub job for every '.mapped' file in dirName.

    Every Q function has a corresponding shell script.
    Makes a wig file for all mapped files, for all organisms.

    The organism of each file is looked up in the meta file named by the
    'metaFileName' entry of Main.conf, keyed by the part of the base file
    name before the first '.'. Files that are missing from the meta file, or
    whose organism is 'NONE', are skipped. Submission waits (20 s polls)
    while 1000 or more jobs are reported for user 'chrisgre'.
    '''
    wrapperShell = '/home/chrisgre/scripts/mapping/createTrack.sh'

    mainConf = c.cgConfig('Main.conf')
    metaFileName = mainConf.conf['metaFileName']

    # Parsed once, on the first file, instead of once per file as before.
    metaDict = None

    for mappedFN in cg.recurseDir(dirName, end = '.mapped'):

        # check if mouse or human
        baseFName = cg.getBaseFileName(mappedFN)
        baseFName = baseFName.split('.')[0]

        if metaDict is None:
            metaDict = cg.getMetaFileDict(metaFileName)

        org = 'None'
        if baseFName in metaDict:
            if metaDict[baseFName][1] == 'NONE':
                print('  NO ORG KNOWN FOR %s' % (mappedFN,))
                continue
            else:
                org = metaDict[baseFName][1]
                print('  USING ORG %s %s' % (org, mappedFN))

        # check if there is an organism, must check due to files not in metaFile
        if org == 'None':
            print('  NO org (not in meta file) %s' % (mappedFN,))
            continue

        while True:
            # submit the job once fewer than 1000 are queued
            if clusterCheck.queryNumJobsQ('chrisgre') < 1000:
                subprocess.Popen(['qsub', '-V', '-o', mainConf.conf['outLog'], '-e', mainConf.conf['errorLog'], wrapperShell, mappedFN, org])
                break
            else:  # queue is full: wait 20 secs and check again
                time.sleep(20)
Exemple #29
0
def splitRun(baseFN, memoryAmount, scriptName, *args):
    '''Run scriptName once per range file of baseFN, one job after another.

    baseFN       -- original file; its range files are the files in its
                    directory (PWD when baseFN has no '/') whose names start
                    with its base name, contain 'range' and do not contain
                    'exitSignal'. They are processed in the order of the
                    integer in the second-to-last '.'-separated name field.
    memoryAmount -- 'LOCAL' runs via $HOME/exec/qDo.sh directly; any other
                    value is wrapped in $HOME/exec/qJobX<memoryAmount>.sh.
    scriptName   -- script handed to qDo.sh.
    args         -- script arguments; the literal 'splitFN' is replaced by
                    the current range file.

    Each command gets 'splitRun', 'splitRun' appended and is waited for.
    '''
    dirName = os.path.dirname(baseFN) if '/' in baseFN else os.environ['PWD']
    basename = os.path.basename(baseFN)

    # collect the range files (not their exit signals), sorted by packet number
    rangeFiles = []
    for candidate in bioLibCG.recurseDir(dirName, start=basename, within='range'):
        if 'exitSignal' not in candidate:
            rangeFiles.append(candidate)
    rangeFiles.sort(key=lambda x: int(x.split('.')[-2]))

    for fN in rangeFiles:
        # launcher scripts; qJobX is specific to the requested memory
        qJobX = '%s/exec/qJobX%s.sh' % (os.environ['HOME'], memoryAmount)
        qDo = '%s/exec/qDo.sh' % (os.environ['HOME'])

        com = [qDo, scriptName] if memoryAmount == 'LOCAL' else [qJobX, qDo, scriptName]

        # script arguments, with the placeholder swapped for the split file
        com += [fN if arg == 'splitFN' else arg for arg in args]

        # paraInfo for a split run
        com += ['splitRun', 'splitRun']

        subprocess.Popen(com).wait()
Exemple #30
0
def loadWigDictFloat(wigDir):
    '''Load all '.wig' files in wigDir into {chrom: {strand: {coord: expr}}}.

    Wig files in the directory must be named NAME.chr.strand.wig: chromosome
    and strand are the second and third dot-separated fields of the file
    name. The first line of each file is skipped as a header. Data lines are
    tab separated; fields 1-3 are start, end and value. Start is shifted by
    +1 (1-based) and every coordinate from start through end inclusive gets
    the float value. Lines with value 0.0 are skipped.
    '''
    chr_strand_coord_expr = {}

    for fN in bioLibCG.recurseDir(wigDir, end = '.wig'):
        nameFields = fN.split('/')[-1].split('.')
        chrom, strand = nameFields[1], nameFields[2]

        # a fresh coordinate map per file; replaces any earlier one for the
        # same chrom/strand
        coord_expr = {}
        chr_strand_coord_expr.setdefault(chrom, {})[strand] = coord_expr

        wigFile = open(fN, 'r')
        wigFile.readline()  # header
        for line in wigFile:
            ls = line.strip().split('\t')
            first = int(ls[1]) + 1  # 1BASE
            last = int(ls[2])
            expr = float(ls[3])

            if expr == 0.0:
                continue

            for coord in range(first, last + 1):
                coord_expr[coord] = expr
        wigFile.close()

    return chr_strand_coord_expr
Exemple #31
0
def checkExit(fN, numPackets):
    '''Show a progress bar until exactly numPackets exit signals exist for fN.

    The exit signals are looked up once per second with bioLibCG.recurseDir
    (names starting with fN's base name and ending in 'exitSignal') in fN's
    directory, which is resolved against PWD when PWD is not a substring of
    it.
    '''
    numPackets = int(numPackets)
    dirName = os.path.dirname(fN)
    baseName = os.path.basename(fN)
    pwd = os.environ['PWD']
    if pwd not in dirName:
        dirName = os.path.dirname(pwd + '/' + fN)

    def countSignals():
        # number of exit-signal files currently present
        return len(bioLibCG.recurseDir(dirName, start=baseName,
                                       end='exitSignal'))

    sleepTime = 1
    widgets = ['  ', SimpleProgress(), ' ', Timer(), ' ', Bar()]
    pbar = ProgressBar(widgets=widgets, maxval=numPackets).start()
    while True:
        time.sleep(sleepTime)
        finishedPackets = countSignals()
        pbar.update(finishedPackets)
        if finishedPackets != numPackets:
            continue
        pbar.finish()
        print('Jobs all finished!')
        break
Exemple #32
0
def scanCoord(tcc, dirName):
    '''Return the read lines of the '.starts' index that fall inside tcc.

    tcc     -- region string; cg.tccSplit yields chrom, strand, start, end.
    dirName -- directory searched with cg.recurseDir for '.starts' files; the
               last file whose path contains '<chrom>.<strand>' is used.

    Returns a list of stripped lines for which
    cgIndex.mapStartRangeCheckFunction reports 0, or 0 (after printing a
    message) when no index file matches.
    '''
    fileNames = cg.recurseDir(dirName, end='.starts')

    # get name of file for index
    chrom, strand, start, end = cg.tccSplit(tcc)
    nameCheck = '%s.%s' % (chrom, strand)
    fN = 'None'
    for fileName in fileNames:
        if nameCheck in fileName: fN = fileName
    if fN == 'None':
        print('No Index file for %s' % (nameCheck,))
        return 0

    fIndex = cgIndex.lineIndex(fN, header=False)
    fIndex.passCheckFunction(cgIndex.mapStartCheckFunction)
    # places file pointer at beginning of sequence line
    fIndex.binarySearch(tcc, skipEnd=True)

    # Check if you need to move down one line
    checkLine = fIndex.getLineFromByte(fIndex.currentByte)
    # Note: the range check is installed here but is also used while extending
    fIndex.passCheckFunction(cgIndex.mapStartRangeCheckFunction)
    if fIndex.checkFunction(tcc, checkLine) != 0:
        fIndex.file.readline()
    fIndex.currentByte = fIndex.file.tell()

    # Now extend up until in range, down until in range --> return reads.
    fIndex.extendUp(tcc)

    finalReads = []
    for line in fIndex.file:
        if fIndex.checkFunction(tcc, line) != 0:
            break
        finalReads.append(line.strip())

    # Also reached when the file ends while lines are still in range; the
    # original returned None in that case.
    return finalReads
Exemple #33
0
def splitRun(baseFN, memoryAmount, scriptName, *args):
    """Execute scriptName sequentially on every range file of baseFN.

    Range files are found with bioLibCG.recurseDir in baseFN's directory
    (PWD for a bare name): names starting with baseFN's base name and
    containing "range", excluding anything containing "exitSignal". They are
    ordered by the integer in their second-to-last dot-separated field.

    For memoryAmount "LOCAL" the command is [qDo.sh, scriptName, ...];
    otherwise it is prefixed with qJobX<memoryAmount>.sh. Any argument equal
    to "splitFN" is replaced by the range file, and "splitRun", "splitRun" is
    appended. Each command is waited for before the next one starts.
    """
    if "/" not in baseFN:
        dirName = os.environ["PWD"]
    else:
        dirName = os.path.dirname(baseFN)

    basename = os.path.basename(baseFN)

    def packetNumber(path):
        # integer in the second-to-last dot-separated field
        return int(path.split(".")[-2])

    rangeFiles = [x for x in bioLibCG.recurseDir(dirName, start=basename, within="range") if "exitSignal" not in x]
    rangeFiles.sort(key=packetNumber)

    for fN in rangeFiles:
        # launcher scripts live under $HOME/exec
        qJobX = "%s/exec/qJobX%s.sh" % (os.environ["HOME"], memoryAmount)
        qDo = "%s/exec/qDo.sh" % (os.environ["HOME"])

        # LOCAL runs skip the cluster wrapper
        com = [qDo, scriptName]
        if memoryAmount != "LOCAL":
            com.insert(0, qJobX)

        # script arguments, substituting the split file for its placeholder
        for arg in args:
            com.append(fN if arg == "splitFN" else arg)

        # paraInfo for split run [splitRun, splitRun]
        com.extend(["splitRun", "splitRun"])

        subprocess.Popen(com).wait()
Exemple #34
0
def scanCoord(tcc, dirName):
	
	fileNames = cg.recurseDir(dirName, end = '.starts')
	
	
	#get name of file for index
	chrom, strand, start, end = cg.tccSplit(tcc)
	nameCheck = '%s.%s' % (chrom, strand)
	fN = 'None'
	for fileName in fileNames:
		if nameCheck in fileName: fN = fileName
	if fN == 'None': 
		print 'No Index file for', nameCheck
		return 0
        
	
	fIndex = cgIndex.lineIndex(fN, header = False)
	fIndex.passCheckFunction(cgIndex.mapStartCheckFunction)
	fIndex.binarySearch(tcc, skipEnd = True) #places file pointer at beginning of sequence line
        
        #Check if you need to move down one line
        checkLine = fIndex.getLineFromByte(fIndex.currentByte)
        fIndex.passCheckFunction(cgIndex.mapStartRangeCheckFunction) #Note i'm passing now, but it is also used in extending
        if fIndex.checkFunction(tcc, checkLine) != 0:
                fIndex.file.readline()
        fIndex.currentByte = fIndex.file.tell()

        #Now extend up until in range, down until in range --> return reads.
        fIndex.extendUp(tcc)
        
	finalReads = []
	for line in fIndex.file:
		if fIndex.checkFunction(tcc, line) == 0:
			finalReads.append(line.strip())
                else:
			return finalReads
Exemple #35
0
    def commit(self, id_obj):
        """Merge in-memory attribute values with those on disk and rewrite the attribute files.

        id_obj maps an id to an object; every entry of the object's __dict__
        is taken as the new value of that attribute for that id.

        Existing values are read from every 'a.<attName>' file found under
        self.aDir (tab-separated '<id>\\t<value>' lines, list types
        comma-separated). New values override old ones. One file per
        attribute of the mapping class scheme is then rewritten; values equal
        to the field's dataDefault are not written, and attributes that are
        not part of the scheme are never written.
        """
        #get schema
        attName_field = getClassScheme(self.mappingClass)

        #will update this functionality later
        selectedAttNames = list(attName_field)

        #Get new values...Ideally this would only have the ones that have changed...but it doesn't...
        id_att_newVals = {}
        for objId, obj in id_obj.items():
            id_att_newVals[objId] = dict(obj.__dict__)

        #get old values
        id_att_oldVals = {}
        for fN in bioLibCG.recurseDir(self.aDir):
            baseName = fN.strip().split('/')[-1]
            if not baseName.startswith('a.'):
                continue

            #get attribute name/type
            attName = baseName.split('.')[1]
            attType = attName_field[attName].dataType
            casteFxn = getCasteFunction(attType)
            isList = attType in listTypes

            #'with' closes the file even if a line fails to parse
            with open(fN, 'r') as f:
                for line in f:
                    ls = line.strip().split('\t')
                    objId = int(ls[0])
                    rawVal = ls[1]
                    #caste them correctly
                    if isList:
                        att = [casteFxn(x) for x in rawVal.split(',')]
                    else:
                        att = casteFxn(rawVal)
                    id_att_oldVals.setdefault(objId, {})[attName] = att

        #consolidate: old values overlaid with new ones, ids known on disk first
        id_att_finalVals = {}
        for objId, oldAtts in id_att_oldVals.items():
            merged = dict(oldAtts)
            merged.update(id_att_newVals.get(objId, {}))
            id_att_finalVals[objId] = merged
        for objId, newAtts in id_att_newVals.items():
            if objId not in id_att_oldVals and newAtts:
                id_att_finalVals[objId] = dict(newAtts)

        #write to files
        for attName in selectedAttNames:
            field = attName_field[attName]
            listFlag = field.dataType in listTypes
            with open(self.aDir + '/a.' + attName, 'w') as f:
                for objId, atts in id_att_finalVals.items():
                    if attName not in atts:
                        continue
                    finalVal = atts[attName]
                    #defaults are implied by absence from the file
                    if finalVal == field.dataDefault:
                        continue
                    if listFlag:
                        finalVal = ','.join([str(x) for x in finalVal])
                    f.write('%s\t%s\n' % (objId, finalVal))
#using the continuous blocks in the small RNA lib file, identify all the peaks...
import bioLibCG as cg
import cgConfig as c

# Script: scan every .wig file under the configured 'wigMouse' directory and
# look for peaks among the per-line midpoints.
# NOTE(review): this excerpt is cut off inside the peak loop; only the visible
# part is documented.
mConf = c.cgConfig('Main.conf')

fileNames = cg.recurseDir(mConf.conf['wigMouse'], end='.wig')

for fN in fileNames:
    file = open(fN, 'r')
    file.readline()  #header

    #get all points in midpoint form
    # key: midpoint of fields 1 and 2 of a line; value: integer part of field 3
    # (presumably cg.ss splits a line into fields -- verify in bioLibCG)
    pointsDict = {}

    for line in file:
        start = int(cg.ss(line)[1])
        end = int(cg.ss(line)[2])
        point = start + (end - start) / 2  #midpoint
        pointsDict[point] = int(cg.ss(line)[3].split('.')[0])
    file.close()

    #determine peaks based off of neighbors of each point
    # despite its name, 'lowest' is the sorted list of all midpoints; calling
    # .sort() on keys() relies on Python 2, where keys() returns a list
    lowest = pointsDict.keys()
    lowest.sort()
    peaks = []
    span = 2  #must be > 0
    # skips the first span + 1 and the last span + 1 midpoints
    for i in range(span + 1, len(lowest) - span - 1):

        val = pointsDict[lowest[i]]
        if val < 5:  #minimum
        # NOTE(review): the body of this branch is not visible in this excerpt
def updateReadDensity(tType):
    #go through wig each chromosome and check the mature seqs
    mainConf = cgConfig.cgConfig('Main.conf')
    conf = cgConfig.cgConfig()
    organism = conf.conf['organism']
    wigFolder = mainConf.conf['wig%s' % organism]
    newLines = []

    if tType == 'E':
        pFileName = conf.conf['resultsExons']
    elif tType == 'I':
        pFileName = conf.conf['resultsIntrons']
    else:
        print 'READ UPDATE FAIL'

    print '  Updating Read Density:', tType

    for wigFileN in cg.recurseDir(wigFolder, end='.wig'):

        #init
        chrom = wigFileN.strip().split('.')[-2]
        strand = wigFileN.strip().split('.')[-4]
        wigFile = open(wigFileN, 'r')
        mirFile = open(pFileName, 'r')
        print wigFileN

        #get rid of header
        wigFile.readline()

        print '  populating hitmap'
        #populate hitmap
        wigMap = {}
        for line in wigFile:
            value = int(line.strip().split('\t')[3].split('.')[0])
            if value > 0:
                start = int(line.strip().split('\t')[1])
                end = int(line.strip().split('\t')[2])
                for i in range(start, end):
                    wigMap[i] = value
        wigFile.close()

        print '  calculating hits for mature seqs'
        #calculate total hits per mature
        for line in mirFile:
            mTcc = line.strip().split('\t')[1]
            mirID = line.strip().split('\t')[0]
            if (mTcc.split(':')[0] == chrom) and (mTcc.split(':')[1]
                                                  == strand):
                #if mirID == '26477.30.106643972': print 'Starting Total Count'
                highestHit = 0
                for i in range(int(mTcc.split(':')[2]),
                               int(mTcc.split(':')[3])):
                    #if mirID == '26477.30.106643972': print '  ', i
                    if i in wigMap:
                        if wigMap[i] > highestHit:
                            highestHit = wigMap[i]
                        #if mirID == '26477.30.106643972': print '    ', i, totalHits, wigMap[i]

                newLines.append(cg.appendToLine(line, str(highestHit), 11))

        mirFile.close()

    print 'Writing New File'
    #write new results file
    outFile = open(pFileName, 'w')
    for line in newLines:
        outFile.write(line)
    outFile.close()

    ####NOW UPDATE HIGHEST HIT PER CLUSTER####

    clusterCount = {}

    pFile = open(pFileName, 'r')
    for line in pFile:
        predictionCount = int(line.strip().split('\t')[11])
        CID = line.strip().split('\t')[7]
        if CID in clusterCount:
            if clusterCount[CID] < predictionCount:
                clusterCount[CID] = predictionCount
        else:
            clusterCount[CID] = predictionCount
    pFile.close()

    #update the file --> cluster small count
    newLines = []
    predFile = open(pFileName, 'r')
    for line in predFile:
        CID = line.strip().split('\t')[7]
        numMax = clusterCount[CID]
        newLines.append(cg.appendToLine(line, str(numMax), 12))
    predFile.close()

    #sort newLines by clusterID
    sortDict = {}
    CIDs = []
    for line in newLines:
        CID = int(line.strip().split('\t')[7])
        if CID not in CIDs:
            CIDs.append(CID)
        if CID in sortDict:
            sortDict[CID].append(line)
        else:
            sortDict[CID] = [line]

    CIDs.sort()

    newLines = []
    for CID in CIDs:
        for line in sortDict[CID]:
            newLines.append(line)

    #write new File
    newFile = open(pFileName, 'w')
    for line in newLines:
        newFile.write(line)
    newFile.close()
#using the continuous blocks in the small RNA lib file, identify all the peaks...
import bioLibCG as cg
import cgConfig as c

# Script: scan every .wig file under the configured 'wigMouse' directory and
# look for peaks among the per-line midpoints.
# NOTE(review): this excerpt is cut off inside the peak loop; only the visible
# part is documented.
mConf = c.cgConfig('Main.conf')

fileNames = cg.recurseDir(mConf.conf['wigMouse'],  end = '.wig')

for fN in fileNames:
	file = open(fN, 'r')
	file.readline() #header
	
	#get all points in midpoint form
	# key: midpoint of fields 1 and 2 of a line; value: integer part of field 3
	# (presumably cg.ss splits a line into fields -- verify in bioLibCG)
	pointsDict = {}
	
	for line in file:
		start = int(cg.ss(line)[1])
		end = int(cg.ss(line)[2])
		point = start + (end-start)/2 #midpoint
		pointsDict[point] = int(cg.ss(line)[3].split('.')[0])
	file.close()
	
	
	#determine peaks based off of neighbors of each point
	# despite its name, 'lowest' is the sorted list of all midpoints; calling
	# .sort() on keys() relies on Python 2, where keys() returns a list
	lowest = pointsDict.keys()
	lowest.sort()
	peaks = []
	span = 2 #must be > 0
	# skips the first span + 1 and the last span + 1 midpoints
	for i in range(span + 1,len(lowest) - span - 1):
			
		val = pointsDict[lowest[i]]
		# NOTE(review): the rest of the loop body is not visible in this excerpt
import bioLibCG as cg
import os

# Script: walk the small RNA library directory and work out, for each plain
# '<name>.fastq' file, which organism it belongs to.
# NOTE(review): this excerpt ends after the organism detection; what is done
# with fileDict, metaFile and org afterwards is not visible.
direc = '/home/chrisgre/apps/projects/small.rna.libs'
metaFileName = direc + '/' + 'small.meta'

##make data of already made file so there aren't any duplicates:
fileDict = {}  #filename...
# opened for reading; no close() appears in the visible part
metaFile = open(metaFileName, 'r')

#add new entries
countFiles = cg.recurseDir(direc, end='.fastq')

# 'file' and 'dir' below shadow the builtins of the same name
for file in countFiles:
    fileName = file.strip().split('/')[-1]
    # more than one dot in the file name (e.g. 'x.y.fastq') -> skipped
    if len(fileName.split('.')) > 2:  #has to specifically end in fastq...
        continue
    fileName = cg.getBaseFileName(file, naked=True)
    # name of the directory that directly contains the file
    dir = file.strip().split('/')[-2]

    # organism is guessed from substrings of that directory name; when
    # several match, the last check below wins
    org = 'NONE'
    if 'human' in dir:
        org = 'human'
    if 'mouse' in dir:
        org = 'mouse'
    if 'pig' in dir:
        org = 'pig'
    if 'dog' in dir:
        org = 'dog'
    if 'rat' in dir:
        org = 'rat'
Exemple #40
0
###This will calculate the total number of hits per microRNA --> update 
##the newResults file with them --> put the value of the highest mature --> CID

import cgConfig
import bioLibCG as cg

#go through wig each chromosome and check the mature seqs
# Script: for every .wig file under the configured 'wigMouse' directory, build
# a hit map and compare it with the results file.
# NOTE(review): this excerpt is cut off inside the hit-map loop; only the
# visible part is documented.
mainConf = cgConfig.cgConfig('Main.conf')
conf = cgConfig.cgConfig()
newLines = []
pFileName = 'newResults.results'


for wigFileN in cg.recurseDir(mainConf.conf['wigMouse'], end = '.wig'):
	
	
	#init
	# chrom and strand are taken from the dot-separated wig file name
	chrom = wigFileN.strip().split('.')[-2]
	strand = wigFileN.strip().split('.')[-4]
	wigFile = open(wigFileN, 'r')
	# the results file named in the config is reopened for every wig file
	mirFile = open(conf.conf['results'], 'r')
	print wigFileN
	
	#get rid of header
	wigFile.readline()
	
	print '  populating hitmap'
	#populate hitmap
	wigMap = {}
	for line in wigFile:
		# integer part of the fourth tab-separated field
		value = int(line.strip().split('\t')[3].split('.')[0])
		# NOTE(review): the rest of the loop body is not visible in this excerpt
Exemple #41
0
        def commit(self, id_obj):
            """Merge in-memory attribute values with those on disk and rewrite the attribute files.

            id_obj maps an id to an object; every entry of the object's
            __dict__ is taken as the new value of that attribute for that id.

            Existing values are read from every 'a.<attName>' file found under
            self.aDir (tab-separated '<id>\\t<value>' lines, list types
            comma-separated). New values override old ones. One file per
            attribute of the mapping class scheme is then rewritten; values
            equal to the field's dataDefault are not written, and attributes
            that are not part of the scheme are never written.
            """
            #get schema
            attName_field = getClassScheme(self.mappingClass)

            #will update this functionality later
            selectedAttNames = list(attName_field)

            #Get new values...Ideally this would only have the ones that have changed...but it doesn't...
            id_att_newVals = {}
            for objId, obj in id_obj.items():
                id_att_newVals[objId] = dict(obj.__dict__)

            #get old values
            id_att_oldVals = {}
            for fN in bioLibCG.recurseDir(self.aDir):
                baseName = fN.strip().split('/')[-1]
                if not baseName.startswith('a.'):
                    continue

                #get attribute name/type
                attName = baseName.split('.')[1]
                attType = attName_field[attName].dataType
                casteFxn = getCasteFunction(attType)
                isList = attType in listTypes

                #'with' closes the file even if a line fails to parse
                with open(fN, 'r') as f:
                    for line in f:
                        ls = line.strip().split('\t')
                        objId = int(ls[0])
                        rawVal = ls[1]
                        #caste them correctly
                        if isList:
                            att = [casteFxn(x) for x in rawVal.split(',')]
                        else:
                            att = casteFxn(rawVal)
                        id_att_oldVals.setdefault(objId, {})[attName] = att

            #consolidate: old values overlaid with new ones, ids known on disk first
            id_att_finalVals = {}
            for objId, oldAtts in id_att_oldVals.items():
                merged = dict(oldAtts)
                merged.update(id_att_newVals.get(objId, {}))
                id_att_finalVals[objId] = merged
            for objId, newAtts in id_att_newVals.items():
                if objId not in id_att_oldVals and newAtts:
                    id_att_finalVals[objId] = dict(newAtts)

            #write to files
            for attName in selectedAttNames:
                field = attName_field[attName]
                listFlag = field.dataType in listTypes
                with open(self.aDir + '/a.' + attName, 'w') as f:
                    for objId, atts in id_att_finalVals.items():
                        if attName not in atts:
                            continue
                        finalVal = atts[attName]
                        #defaults are implied by absence from the file
                        if finalVal == field.dataDefault:
                            continue
                        if listFlag:
                            finalVal = ','.join([str(x) for x in finalVal])
                        f.write('%s\t%s\n' % (objId, finalVal))
def clipAdapterInDirQ(dirName):
	'''Start the wrapper shell script once for every .fastq file under dirName.

	The Q stands for the cluster (qsub) variant; every Q function has a
	corresponding shell script. The processes are started without being
	waited on.'''
	
	fastqPaths = cg.recurseDir(dirName, end = '.fastq')
	for fastqPath in fastqPaths:
		command = [wrapperShell, fastqPath]
		subprocess.Popen(command)
import cgConfig as c
import wigValue
import compareData as compare

#init
# Script: load the peak files under the configured 'wigMouse' directory into
# a nested dictionary peaks[chrom][strand].
# NOTE(review): 'cg' is used below but no import binding it is visible
# directly above this script -- confirm it is imported.
# NOTE(review): this excerpt is cut off at the start of the line loop.
mConf = c.cgConfig('Main.conf')
conf = c.cgConfig()

# coordinates to examine; the commented line below loads them from a file instead
tccList = ['chr3:-1:96042576:96042685']
#tccList = compare.tccFileToList('mouseKnownMirs.tcc', 0)
timer = cg.cgTimer()
timer.start()

#put peaks in memory
print 'loading peak data'
peakFilesNames = cg.recurseDir(mConf.conf['wigMouse'], end = '.peaks')
peaks = {} # chr:peak:value
for pN in peakFilesNames:
	# chrom and strand come from fixed positions in the dot-separated path;
	# counted from the front, so a dot in a directory name would shift them
	chrom = pN.strip().split('.')[4]
	strand = pN.strip().split('.')[2]
	
	#init dictionary
	if chrom not in peaks:
		peaks[chrom] = {}
	
	if strand not in peaks[chrom]:
		peaks[chrom][strand] = {}
	
	#get peaks and values and put in dictionary
	pFile = open(pN, 'r')
	for line in pFile:
	# NOTE(review): the loop body is not visible in this excerpt
Exemple #44
0
#using the continuous blocks in the small RNA lib file, identify all the peaks...
import bioLibCG as cg
import cgConfig as c

# Script: for every 'cb.*.tsv' file under the configured 'wigMouse'
# directory, write a '<file>.peakdata' companion holding one
# '<position>\t<height>' line per input line.
mConf = c.cgConfig('Main.conf')

fileNames = cg.recurseDir(mConf.conf['wigMouse'], start = 'cb.', end = '.tsv')

for fN in fileNames:
    #get peak positions and average heights
    lines = []
    with open(fN, 'r') as inFile:
        for row in inFile:
            cols = row.split('\t')
            # NOTE(review): this is col0 + (col2 // 2), not (col0 + col2) // 2
            # -- confirm that is the intended position
            position = int(cols[0]) + int(cols[2]) // 2
            lines.append('%s\t%s' % (position, row.strip().split('\t')[3]))

    #output new file with new ending; 'with' makes sure the data is flushed
    #and the handle closed (the file used to be left open)
    with open(fN + '.peakdata', 'w') as outFile:
        for line in lines:
            outFile.write('%s\n' % line)
def clipAdapterInDirQ(dirName):
    '''Start the wrapper shell script once for every .fastq file under dirName.

    The Q stands for the cluster (qsub) variant; every Q function has a
    corresponding shell script. The processes are started without being
    waited on.'''

    fastqPaths = cg.recurseDir(dirName, end='.fastq')
    for fastqPath in fastqPaths:
        command = [wrapperShell, fastqPath]
        subprocess.Popen(command)
Exemple #46
0
import cgConfig as c
import wigValue
import compareData as compare

#init
# Script: load the peak files under the configured 'wigMouse' directory into
# a nested dictionary peaks[chrom][strand].
# NOTE(review): 'cg' is used below but no import binding it is visible
# directly above this script -- confirm it is imported.
# NOTE(review): this excerpt is cut off at the start of the line loop.
mConf = c.cgConfig('Main.conf')
conf = c.cgConfig()

# coordinates to examine; the commented line below loads them from a file instead
tccList = ['chr3:-1:96042576:96042685']
#tccList = compare.tccFileToList('mouseKnownMirs.tcc', 0)
timer = cg.cgTimer()
timer.start()

#put peaks in memory
print 'loading peak data'
peakFilesNames = cg.recurseDir(mConf.conf['wigMouse'], end='.peaks')
peaks = {}  # chr:peak:value
for pN in peakFilesNames:
    # chrom and strand come from fixed positions in the dot-separated path;
    # counted from the front, so a dot in a directory name would shift them
    chrom = pN.strip().split('.')[4]
    strand = pN.strip().split('.')[2]

    #init dictionary
    if chrom not in peaks:
        peaks[chrom] = {}

    if strand not in peaks[chrom]:
        peaks[chrom][strand] = {}

    #get peaks and values and put in dictionary
    pFile = open(pN, 'r')
    for line in pFile:
    # NOTE(review): the loop body is not visible in this excerpt
import bioLibCG as cg
import os

# Script: walk the small RNA library directory and work out, for each plain
# '<name>.fastq' file, which organism it belongs to.
# NOTE(review): this excerpt ends after the organism detection; what is done
# with fileDict, metaFile and org afterwards is not visible.
direc = '/home/chrisgre/apps/projects/small.rna.libs'
metaFileName = direc + '/' + 'small.meta'

##make data of already made file so there aren't any duplicates:
fileDict = {} #filename...
# opened for reading; no close() appears in the visible part
metaFile = open(metaFileName, 'r')

#add new entries
countFiles = cg.recurseDir(direc, end = '.fastq')

# 'file' and 'dir' below shadow the builtins of the same name
for file in countFiles:
	fileName = file.strip().split('/')[-1]
	# more than one dot in the file name (e.g. 'x.y.fastq') -> skipped
	if len(fileName.split('.')) > 2: #has to specifically end in fastq...
		continue
	fileName = cg.getBaseFileName(file, naked = True)
	# name of the directory that directly contains the file
	dir = file.strip().split('/')[-2]
		
	# organism is guessed from substrings of that directory name; when
	# several match, the last check below wins
	org = 'NONE'
	if 'human' in dir:
		org = 'human'
	if 'mouse' in dir:
		org = 'mouse'
	if 'pig' in dir:
		org = 'pig'
	if 'dog' in dir:
		org = 'dog'
	if 'rat' in dir:
		org = 'rat'
Exemple #48
0
import cgConfig as c
import bioLibCG as cg
import cgSort

mConf = c.cgConfig('Main.conf')

smallPath = mConf.conf['smallPath']

smallPath = '/home/chrisgre/smallLibs/WIGS/zebrafish'
#grab everything - NOT WIG MERGES...
smallLibs = cg.recurseDir(smallPath,  end = '.wig')
smallLibs.extend(cg.recurseDir(smallPath, end = '.wig'))

for lib in smallLibs:
	
	print 'sorting', lib
	cgSort.wigSort(lib)
def updateReadDensity(tType):
	#go through wig each chromosome and check the mature seqs
	mainConf = cgConfig.cgConfig('Main.conf')
	conf = cgConfig.cgConfig()
	organism = conf.conf['organism']
	wigFolder = mainConf.conf['wig%s' % organism]	
	newLines = []
	

	if tType == 'E':
		pFileName = conf.conf['resultsExons']
	elif tType == 'I':
		pFileName = conf.conf['resultsIntrons']
	else:
		print 'READ UPDATE FAIL'

	print '  Updating Read Density:', tType

	for wigFileN in cg.recurseDir(wigFolder, end = '.wig'):
		
		
		#init
		chrom = wigFileN.strip().split('.')[-2]
		strand = wigFileN.strip().split('.')[-4]
		wigFile = open(wigFileN, 'r')
		mirFile = open(pFileName, 'r')
		print wigFileN
		
		#get rid of header
		wigFile.readline()
		
		print '  populating hitmap'
		#populate hitmap
		wigMap = {}
		for line in wigFile:
			value = int(line.strip().split('\t')[3].split('.')[0])
			if value > 0:
				start = int(line.strip().split('\t')[1])
				end = int(line.strip().split('\t')[2])
				for i in range(start, end):
					wigMap[i] = value
		wigFile.close()
		
		print '  calculating hits for mature seqs'
		#calculate total hits per mature
		for line in mirFile:
			mTcc = line.strip().split('\t')[1]
			mirID = line.strip().split('\t')[0]
			if (mTcc.split(':')[0] == chrom) and (mTcc.split(':')[1] == strand):
				#if mirID == '26477.30.106643972': print 'Starting Total Count'
				highestHit = 0
				for i in range(int(mTcc.split(':')[2]), int(mTcc.split(':')[3])):
					#if mirID == '26477.30.106643972': print '  ', i 
					if i in wigMap:
						if wigMap[i] > highestHit:
							highestHit = wigMap[i]
						#if mirID == '26477.30.106643972': print '    ', i, totalHits, wigMap[i]
			
				newLines.append(cg.appendToLine(line, str(highestHit), 11))
		
		mirFile.close()

	print 'Writing New File'
	#write new results file
	outFile = open(pFileName, 'w')
	for line in newLines:
		outFile.write(line)
	outFile.close()

	####NOW UPDATE HIGHEST HIT PER CLUSTER####

	clusterCount = {}

	pFile = open(pFileName, 'r')
	for line in pFile:
		predictionCount = int(line.strip().split('\t')[11])
		CID = line.strip().split('\t')[7]
		if CID in clusterCount:
			if clusterCount[CID] < predictionCount:
				clusterCount[CID] = predictionCount
		else:
			clusterCount[CID] = predictionCount
	pFile.close()

	#update the file --> cluster small count
	newLines = []
	predFile = open(pFileName, 'r')
	for line in predFile:
		CID = line.strip().split('\t')[7]
		numMax = clusterCount[CID]
		newLines.append(cg.appendToLine(line, str(numMax), 12))
	predFile.close()

	#sort newLines by clusterID
	sortDict = {}
	CIDs = []
	for line in newLines:
		CID = int(line.strip().split('\t')[7])
		if CID not in CIDs:
			CIDs.append(CID)
		if CID in sortDict:
			sortDict[CID].append(line)
		else:
			sortDict[CID] = [line]
		
	CIDs.sort()

	newLines = []
	for CID in CIDs:
		for line in sortDict[CID]:
			newLines.append(line)

	#write new File
	newFile = open(pFileName, 'w')
	for line in newLines:
		newFile.write(line)
	newFile.close()
Exemple #50
0
import bioLibCG as cg
import cgRnaSeq

dir = '/home/chrisgre/smallLibs/zebrafish.embryo.GSE22068.20620952'


QNames = cg.recurseDir(dir, end = '.fastq') #fastqfile names
slNames = cg.recurseDir(dir, end = '.txt') #single sequence per line file names
fastNames = cg.recurseDir(dir, end = '.fa')
fastNames.extend(cg.recurseDir(dir, end = '.fna'))

for QFileName in QNames:
	print 'Creating counts for file', QFileName
	cgRnaSeq.createCountFileFastQ(QFileName)

for slName in slNames:
	print 'Creating counts for file', slName
	cgRnaSeq.createCountFileSL(slName)


for fastName in fastNames:
	print 'Creating counts for file', fastName
	cgRnaSeq.createCountFileFasta(fastName)
Exemple #51
0
def createMultiTrack(dirName, organism):
	'''merge all mapped tracks in directory and create a single wig file'''
	mainConf = c.cgConfig('Main.conf')
	metaFileName = mainConf.conf['metaFileName']
	
	fileList = []
	for file in cg.recurseDir(dirName, end = '.mapped'):
						
		#check if mouse or human SHOULD PUT INTO A STD FUNCTION FOR META FILE
		#check if mouse or human
		baseFName = cg.getBaseFileName(file, naked= True)
		
		metaDict = cg.getMetaFileDict(metaFileName)
		
		org = 'None'
		if baseFName in metaDict:
			if metaDict[baseFName][1] == 'NONE':
				print '  NO ORG KNOWN FOR', file
				continue
			elif not metaDict[baseFName][1] == organism:
				print '  NOT ORGANISM RUNNING', file
				continue
			else:
				org = metaDict[baseFName][1]
				print '  USING ORG', org, file
			
		#check if there is an organism, must check due to files not in metaFile
		if org == 'None':
			print '  NO org (not in meta file)', file
			continue
		
		#only make wig file for organism asked for
		if not org == organism:
			continue
		
		#if it is right organism and has mapped file then add
		fileList.append(file)
	
	
	#make merged wig
	if organism == 'human':
		chroms = cg.humanChromosomes
		assembly = 'hg19'
	elif organism == 'mouse':
		chroms = cg.mouseChromosomes
		assembly = 'mm9'
	elif organism == 'zebrafish':
		chroms = cg.zebrafishChromosomes
		assembly = 'danRer6'
	
	print 'Making Bed File vectors'
	cvg = HTSeq.GenomicArray(chroms, stranded=True, typecode='i')
	for fName in fileList:
		alignment_file = HTSeq.BowtieReader(fName)
		for alngt in alignment_file:
			if alngt.aligned:
				cvg.add_value( 1, alngt.iv ) #iv is the genomic interval..

	bedNamePos = dirName + '/Merge.' + organism + '.1.wig'
	bedNameNeg = dirName + '/Merge.' + organism + '.-1.wig'
	
	print 'Writing Bed File'
	cvg.write_bedgraph_file(bedNamePos, "+" )
	cvg.write_bedgraph_file(bedNameNeg, "-" )

	#Now extend it
	updateWigLength(bedNamePos, assembly)
	updateWigLength(bedNameNeg, assembly)
	
	#Now Sort it.
	cgSort.wigSort(bedNamePos)
	cgSort.wigSort(bedNameNeg)