コード例 #1
0
ファイル: mapFastQ.py プロジェクト: JasonAng/ResearchScripts
def mapFastQ(fName, organism):
	
	indexFileHuman = '/home/chrisgre/indexes/bowtie/hg19'
	indexFileMouse = '/home/chrisgre/indexes/bowtie/mm9'
	indexFileZebrafish = '/home/chrisgre/indexes/bowtie/danRer6'
	
	if organism == 'human':
		indexName = indexFileHuman
	elif organism == 'mouse':
		indexName = indexFileMouse
	elif organism == 'zebrafish':
		indexName = indexFileZebrafish
		
	
	outName = fName + '.mapped'
	
		
	logFile = open(mainConf.conf['outLog'] + cg.getBaseFileName(fName), 'w')
	errorFile = open(mainConf.conf['errorLog'] + cg.getBaseFileName(fName), 'w')
	
	if fastQTypes.getFastQType(fName, quick = True) == 'Sa':
		print 'Mapping with 33 phred offset'
		subprocess.Popen(['bowtie', '--phred33-quals', '-k', '20', '-m', '20', '-p', '1', indexName, fName, outName], stdout=logFile, stderr=errorFile).wait()
	else:
		print 'Mapping with 64 phred offset'
		subprocess.Popen(['bowtie', '--phred64-quals', '-k', '20', '-m', '20', '-p', '1', indexName, fName, outName], stdout=logFile, stderr=errorFile).wait()
	
	logFile.close()
	errorFile.close()
コード例 #2
0
ファイル: makeWig.py プロジェクト: cgreer/ResearchScripts
def makeWig(fN, assembly, format = None, name = None):
	
	'''format assumes bowtie
	suitible for medium mapped files.
	takes longer.'''
	#assume bowtie
	if not format: format = 'Bowtie'
	parserFunction = returnParserFunction(format)
	if not name: name = cg.getBaseFileName(fN, naked = True)
	lDict = cg.returnChromLengthDict(assembly)
	
	
	for chrom in lDict:
		if not chrom in cg.acceptableChroms: continue
		for strand in ['1', '-1']:
			f = open(fN, 'r')
			#create hitmap of chrom and strand
			print chrom, strand, 'hitmap'
			hitDict = {}
			for line in f:
				
				lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line))
				lStrand = str(lStrand)
				start = int(start)
				end = int(end)
				if chrom == lChrom and strand == lStrand:
					for i in range(start, end + 1):
						try:
							hitDict[i] += 1
						except KeyError:
							hitDict[i] = 1
			
			#write results to wig file
			writeWigFromHitDict(hitDict, assembly)
コード例 #3
0
def testmerge(masterDir, parDir):
        '''Merge the objects of every run directory into the master directory.

        Layout sketched by the original author:
        oRNA (master)
        aDir (master)
        pRuns
        --run.00
        ----oRNA (slave: pRuns/run.00/oRNA)
        ----aDir
        --run.01

        masterDir -- directory whose objects are loaded, merged into and committed
        parDir    -- directory handed to bioLibCG.recursePaths; every path found
                     there ending in the base name of masterDir is merged in
        '''
        masterController = cgDB.dataController(masterDir, cgAlignment.cgAlignment)
        mergedObjs = masterController.load()

        #run directories are located by the base name of the master directory
        runDirs = bioLibCG.recursePaths(parDir, end = bioLibCG.getBaseFileName(masterDir))
        for runDir in runDirs:
                runObjs = cgDB.dataController(runDir, cgAlignment.cgAlignment).load()
                mergedObjs = cgDB.mergeTwoObjects(mergedObjs, runObjs, cgOriginRNA.OriginRNA)

        #persist the accumulated result through the master controller
        masterController.commit(mergedObjs)
コード例 #4
0
ファイル: makeWig.py プロジェクト: cgreer/ResearchScripts
def mixWig(directory, assembly, name = None):
	'''Does it by chromosome --> faster, less memory'''
	
	if not name: name = 'Merge'
	#gather all chromosomes
	chromList = []
	for fN in cg.recurseDir(directory, end = '.wig'):
		chrom = cg.getBaseFileName(fN).strip().split('.')[-3]
		if chrom not in chromList:
			chromList.append(chrom)
	
	print chromList
	
	for chrom in chromList:
		
		print chrom
		#Gather all the values from all the files
		hitDict = {} # chrom : { strand : coord
		for fN in cg.recurseDir(directory, end = '.wig'):
			fChrom = cg.getBaseFileName(fN).strip().split('.')[-3]
			if fChrom != chrom: continue
			print  '  ', fN, fChrom
			f = open(fN, 'r')
			f.readline() #header
			strand = cg.getBaseFileName(fN).strip().split('.')[-2]
			for line in f:
				
				lChrom, start, end, val = (line.strip().split('\t'))
				start, end, val = int(start), int(end), int(val)
				if val < 1: continue
				#print start, end, val
				for i in range(start, end):
					try:
						hitDict[lChrom][strand][i] += val
					except (KeyError,TypeError):
						if not lChrom in hitDict:
							hitDict[lChrom] = {}
						if not strand in hitDict[lChrom]:
							hitDict[lChrom][strand] = {}
						hitDict[lChrom][strand][i] = val
		
		#write results to wig file
		writeWigFromHitDict(hitDict, assembly, name, directory)
コード例 #5
0
ファイル: makeWig.py プロジェクト: combiochem/ResearchScripts
def makeWigMem(fN, assembly, format = None, name = None, directory = None):
	'''Build a degradome-style wig from a mapped-read file, all in memory.

	format assumes bowtie
	suitable for small mapped files.

	fN        -- path of the mapped-read file; its first line is always skipped
	assembly  -- assembly name handed on to writeWigFromHitDict
	format    -- parser name for returnParserFunction (defaults to 'Bowtie')
	name      -- output name; defaults to the naked base file name of fN
	directory -- output directory handed on to writeWigFromHitDict

	Every read on an acceptable chromosome adds 1 at a single coordinate:
	start + 20 for strand '1', start otherwise.
	'''
	if not name: name = cg.getBaseFileName(fN, naked = True)
	if not format: format = 'Bowtie'
	parserFunction = returnParserFunction(format)

	#result is not used below
	#NOTE(review): kept in case the early lookup is relied on to reject a bad assembly before parsing
	lDict = cg.returnChromLengthDict(assembly)

	#create hitmap of chrom and strand
	hitDict = {} #format = chr: { strand : { coord : value
	with open(fN, 'r') as f:
		f.readline() #header...file might not have one but its one read...
		for line in f:
			try:
				lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line))
			except AttributeError:
				#lines that make the parser/splitter raise AttributeError are skipped
				continue
			lStrand = str(lStrand)
			start = int(start)
			end = int(end)
			if lChrom not in cg.acceptableChroms: continue

			#wig for degradome
			if lStrand == '1':
				i = start + 20
			else:
				i = start

			coordDict = hitDict.setdefault(lChrom, {}).setdefault(lStrand, {})
			coordDict[i] = coordDict.get(i, 0) + 1

	#write results to wig file
	writeWigFromHitDict(hitDict, assembly, name, directory)
コード例 #6
0
ファイル: makeWig.py プロジェクト: cgreer/ResearchScripts
def writeWigFromHitDict(hitDict, assembly, name, directory = None):
	'''Write one bedGraph-style .wig file per chromosome and strand of hitDict.

	hitDict   -- chrom : { strand : { coord : value } }. It is modified in
	             place: a 0 entry is stored at the length of each chromosome.
	assembly  -- assembly name passed to cg.returnChromLengthDict
	name      -- prefix of the output files and of the track names
	directory -- output directory; defaults to conf['wigs'] of Main.conf

	Files are named <directory>/<name>.<chrom>.<strand>.wig. Runs of adjacent
	coordinates with the same value are merged into one line and gaps between
	runs are written as explicit zero lines.
	'''
	mConf = c.getConfig('Main.conf')
	if not directory: directory = mConf.conf['wigs']
	#NOTE(review): this derives a name from an empty name; probably meant another source - confirm
	if not name: name = cg.getBaseFileName(name, naked = True)
	lDict = cg.returnChromLengthDict(assembly)

	cg.clearDirectory(directory, overwrite = False)
	#write results to wig file
	for chrom in hitDict:
		for strand in hitDict[chrom]:
			coordDict = hitDict[chrom][strand]

			#sentinel at the chromosome end: reaching it flushes the last real block.
			#NOTE(review): this overwrites a real value stored at that coordinate, if any
			coordDict[lDict[chrom]] = 0
			keys = sorted(coordDict)

			#with-block closes the output even if a write fails
			with open(directory + '/%s.%s.%s.wig' % (name, chrom, strand), 'w') as oF:
				oF.write('track type=bedGraph name=%s.%s.%s\n' % (name, chrom, strand))

				prevVal = 0
				prevCoord = 0
				blockStart = 0
				blockEnd = 1
				for key in keys:
					val = coordDict[key]

					if prevCoord == key - 1:
						if val == prevVal:#should be combined
							blockEnd = key + 1
						else: #no zero block
							#write old block
							oF.write('%s\t%s\t%s\t%s\n' % (chrom, blockStart, blockEnd, prevVal)) #!make it a float value?
							#start new block
							blockStart = key
							blockEnd = key + 1
					else:
						#write old block
						oF.write('%s\t%s\t%s\t%s\n' % (chrom, blockStart, blockEnd, prevVal))
						#write zero block
						oF.write('%s\t%s\t%s\t%s\n' % (chrom, blockEnd, key, 0))
						#start new block
						blockStart = key
						blockEnd = key + 1

					prevVal = val
					prevCoord = key
				#the block still open after the loop is not written
コード例 #7
0
def clipAdapter(fName, adapter = None, validate = False, oName = None, overwrite = True):
	
	#Check to see if the file exists:
	putativeN = fName.replace('.fastq','.clipped.fastq')
	if os.path.isfile(putativeN):
		if overwrite:
			print '  Overwriting file', putativeN
			os.remove(putativeN)
		else:
			print '  \nNOT OVERWRITING FILE', putativeN
			return 1
			 
	#If the adapter is none, try to find it in the small.meta file
	if adapter is None:
		baseFName = cg.getBaseFileName(fName) + '.counts'
		for metaFileName in metaFileNames:
			mFile = open(metaFileName, 'r')
			for line in mFile:
				fields = line.strip().split('\t')
				if baseFName == fields[0]:
					if fields[3] == 'NONE':
						print '  NO ADAPTER KNOWN FOR', fName
						return 1
					else:
						adapter = fields[3]
						print '  Using adapter', adapter, fName
			mFile.close()
	
	
	
	#Is it a valid fastq file?
	if validate: 
		pass
	
	#check the type of fastq file
	sangerType = False
	fType = fastQTypes.getFastQType(fName, quick = True)
	if fType == 'Sa':
		sangerType = True
	print '  Detected format:', fType, fName
	
	#Run it through clipper
	print 'Clipping file', fName
	if oName is None:
		oName = fName.replace('.fastq','.clipped.fastq')
	
	if sangerType:
		subprocess.Popen(['fastx_clipper', '-n', '-v', '-Q', '33', '-i', str(fName), '-a', str(adapter), '-o', str(oName)]).wait()
	else:
		subprocess.Popen(['fastx_clipper', '-n', '-v', '-i', str(fName), '-a', str(adapter), '-o', str(oName)]).wait()
	print '  DONE', fName
コード例 #8
0
def plotResults(fN, cName = None):
		
	cHairs = getHairpins.getHairpins(fN) #CID: HAIRPIN
	
	directory = cg.getBaseFileName(fN)
	cg.clearDirectory(directory)
	
	#change the directory before plotting
	cwd = os.getcwd()
	os.chdir(directory)
	
	for CID in cHairs:
		print 'plotting:', CID
		cgPlot.plotASProfile(cHairs[CID], cName)
	
	os.chdir(cwd)
コード例 #9
0
ファイル: mapFastQ.py プロジェクト: JasonAng/ResearchScripts
def mapFastQInDirQ(dirName, overwrite = True):
	'''Every Q function has a corresponding shell script'''
	wrapperShell = '/home/chrisgre/scripts/mapping/mapFastQ.sh'
	
	
	for file in cg.recurseDir(dirName, end = 'clipped.fastq'):
		print file
		
		putativeN = file.replace('.clipped.fastq','.clipped.fastq.mapped')
		if os.path.isfile(putativeN):
			if overwrite:
				print '  Overwriting file', putativeN
				os.remove(putativeN)
			else:
				print '  \nNOT OVERWRITING FILE', putativeN
				continue
				
		#check if mouse or human
		baseFName = cg.getBaseFileName(file, naked = True)
		org = 'None'
		for metaFileName in metaFileNames:
			mFile = open(metaFileName, 'r')
			for line in mFile:
				fields = line.strip().split('\t')
				if baseFName == fields[0]:
					if fields[2] == 'NONE':
						print '  NO ORG KNOWN FOR', file
						continue
					else:
						org = fields[2]
						print '  USING ORG', org, file
			mFile.close()
			
		#check if there is an organism, must check due to files not in metaFile
		if org == 'None':
			print '  NO org', file
			continue
			
		while True:
			#submit job if there are less than ten
			if clusterCheck.queryNumJobsQ('chrisgre') < 40:
				#subprocess.Popen(['qsub', '-q', 'xiao', '-l', 'mem=4G', '-V', '-o', mainConf.conf['outLog'], '-e', mainConf.conf['errorLog'], wrapperShell, file, org ])
				subprocess.Popen(['qsub', '-l', 'mem=4G', '-V', '-o', mainConf.conf['outLog'], '-e', mainConf.conf['errorLog'], wrapperShell, file, org ])
				time.sleep(.2) #give it time to update qstat
				break
			else:#wait 10 secs...
				time.sleep(20)
コード例 #10
0
def createTrackInDir(dirName):
	'''Every Q function has a corresponding shell script
	Make wig file for all mapped files, for all organisms'''
	
	wrapperShell = '/home/chrisgre/scripts/mapping/createTrack.sh'
	
	mainConf = c.cgConfig('Main.conf')
	metaFileName = mainConf.conf['metaFileName']

	for file in cg.recurseDir(dirName, end = '.mapped'):
						
		#check if mouse or human
		baseFName = cg.getBaseFileName(file)
		baseFName = baseFName.split('.')[0]
		
		metaDict = cg.getMetaFileDict(metaFileName)
		
		org = 'None'
		if baseFName in metaDict:
			if metaDict[baseFName][1] == 'NONE':
				print '  NO ORG KNOWN FOR', file
				continue
			else:
				org = metaDict[baseFName][1]
				print '  USING ORG', org, file
				
		#check if there is an organism, must check due to files not in metaFile
		if org == 'None':
			print '  NO org (not in meta file)', file
			continue
			
		while True:
			#submit job if there are less than ten
			if clusterCheck.queryNumJobsQ('chrisgre') < 1000:
				#subprocess.Popen(['qsub', '-q', 'xiao', '-l', 'mem=4G', '-V', '-o', mainConf.conf['outLog'], '-e', mainConf.conf['errorLog'], wrapperShell, file, org ])
				subprocess.Popen(['qsub', '-V', '-o', mainConf.conf['outLog'], '-e', mainConf.conf['errorLog'], wrapperShell, file, org ])
				#time.sleep(.5) #give it time to update qstat
				break
			else:#wait 10 secs...
				time.sleep(20)
コード例 #11
0
# Directory that is searched for .fastq files, and the meta file path built from it.
direc = '/home/chrisgre/apps/projects/small.rna.libs'
metaFileName = direc + '/' + 'small.meta'

##make data of already made file so there aren't any duplicates:
fileDict = {} #filename...
# NOTE(review): metaFile is opened here but neither read nor closed in the visible part of the script.
metaFile = open(metaFileName, 'r')

#add new entries
countFiles = cg.recurseDir(direc, end = '.fastq')

for file in countFiles:
	# Last path component; names with more than one dot (e.g. x.clipped.fastq) are skipped.
	fileName = file.strip().split('/')[-1]
	if len(fileName.split('.')) > 2: #has to specifically end in fastq...
		continue
	fileName = cg.getBaseFileName(file, naked = True)
	# Name of the directory that directly contains the file.
	dir = file.strip().split('/')[-2]
		
	# Derive the organism from the directory name. The checks are independent,
	# so when several names occur in dir the last matching check wins.
	org = 'NONE'
	if 'human' in dir:
		org = 'human'
	if 'mouse' in dir:
		org = 'mouse'
	if 'pig' in dir:
		org = 'pig'
	if 'dog' in dir:
		org = 'dog'
	if 'rat' in dir:
		org = 'rat'
	if 'zebrafish' in dir:
		org = 'zebrafish'
コード例 #12
0
ファイル: makeWig.py プロジェクト: cgreer/ResearchScripts
def makeWigMem(fN, assembly, format = None, name = None, directory = None, degWig = False, switchStrand = True, normalized = False):
	'''format assumes bowtie
	suitible for small mapped files.
        switch strand does not switch the strands, it just makes sure if the data is backwards (HeLa) that it will 
        put the peak in the right spot'''
	
        print 'degWig Value', degWig
        print 'switch strands?', switchStrand
	if not name: name = cg.getBaseFileName(fN, naked = True)
	if not format: format = 'Bowtie'
	parserFunction = returnParserFunction(format)
	
	lDict = cg.returnChromLengthDict(assembly)
	f = open(fN, 'r')
	f.readline() #header...file might not have one but its one read...
	
	#create hitmap of chrom and strand
	hitDict = {} #format = chr: { strand : { coord : value 
	for line in f:
                lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line))
		lStrand = str(lStrand)
		start = int(start)
		end = int(end)
                numPlacesMapped = int(line.strip().split('\t')[6])
                numPlacesMapped += 1
                readCount = 1
                if normalized:
                    readCount = float(readCount)/numPlacesMapped

		if lChrom in cg.acceptableChroms:
                        
                        if degWig:
                                #wig for degradome NOTE:!!! change lStrand == '1' to '-1' for Bracken!
                                if switchStrand:
                                    if lStrand == '1':
                                            i = start + (end - start)
                                    else:
                                            i = start + 1
                                else:                                            
                                    if lStrand == '-1':
                                            i = start + (end - start)
                                    else:
                                            i = start + 1


                                hitDict.setdefault(lChrom, {}).setdefault(lStrand, {})
                                hitDict[lChrom][lStrand][i] = hitDict[lChrom][lStrand].get(i, 0) + readCount
                        else:

                                #wig for regular
                                for i in range(start, end):
                                        try:
                                                hitDict[lChrom][lStrand][i] += readCount 
                                        except KeyError:
                                                if lChrom not in hitDict:
                                                        hitDict[lChrom] = {}
                                                if lStrand not in hitDict[lChrom]:
                                                        hitDict[lChrom][lStrand] = {}
                                                hitDict[lChrom][lStrand][i] = readCount

	f.close()
	
	#write results to wig file
	writeWigFromHitDict(hitDict, assembly, name, directory)
コード例 #13
0
#make lib:hits dict
densityFile = open('/home/chrisgre/scripts/readDensity/individual.densities.data', 'r')
tissueHits = {}
cID = 'NONE'
for line in densityFile:
	if line.startswith('\t'): #lib: hit
		l = line.strip().split('\t')[0]
		hits = int(line.strip().split('\t')[1])
		tissueHits[cID][l] = hits 
	else:
		cID = line.strip()
		tissueHits[cID] = {}

tissueHist = {}#tissue: hits
for mirID in mirIDs:
	if mirID in tissueHits:
		for smallLib in tissueHits[mirID]:
			smallName = cg.getBaseFileName(smallLib, naked = True)
			if smallName in metaDict:
				if metaDict[smallName][1] == 'mouse':
					if len(metaDict[smallName]) > 3:
						t = metaDict[smallName][3]
						if t in tissueHist:
							tissueHist[t] += tissueHits[mirID][smallLib]
						else:
							tissueHist[t] = tissueHits[mirID][smallLib]

print tissueHist
コード例 #14
0
	if metaDict[baseFName][1] == organism:
		organismFileList.append(baseFName)

#put small hits for each prediction in dictionary
# pCount maps an ID to the summed count of all its libraries that are in organismFileList.
pCount = {}
# NOTE(review): smallFile is not closed in the visible part of the script.
smallFile = open(smallFileName, 'r')

currID = 'NONE'
for line in smallFile:
	if '\t' not in line: #This is the line with the id in it -->  store another ID
		currID = line.strip()
	else: #this line contains library and count info  --> add 
		lib = line.strip().split('\t')[0]
		count = int(line.strip().split('\t')[1])
		
		# Only libraries whose naked base file name is in organismFileList are counted.
		if cg.getBaseFileName(lib, naked = True) in organismFileList:
			if currID in pCount:
				pCount[currID] = pCount[currID] + count
			else:
				pCount[currID] = count

#update the file --> any line with kmer on it give it the count
newLines = []
# NOTE(review): predFile is not closed in the visible part of the script.
predFile = open(pFileName, 'r')
for line in predFile:
	# The lookup key is the first tab-separated field up to its first '.'.
	kmer = line.strip().split('\t')[0].split('.')[0]
	# IDs without any counted library get 0.
	if kmer in pCount:
		numSmall = pCount[kmer]
	else:
		numSmall = 0
	newLine = line.strip().split('\t')
コード例 #15
0
def createMultiTrack(dirName, organism):
	'''merge all mapped tracks in directory and create a single wig file'''
	mainConf = c.cgConfig('Main.conf')
	metaFileName = mainConf.conf['metaFileName']
	
	fileList = []
	for file in cg.recurseDir(dirName, end = '.mapped'):
						
		#check if mouse or human SHOULD PUT INTO A STD FUNCTION FOR META FILE
		#check if mouse or human
		baseFName = cg.getBaseFileName(file, naked= True)
		
		metaDict = cg.getMetaFileDict(metaFileName)
		
		org = 'None'
		if baseFName in metaDict:
			if metaDict[baseFName][1] == 'NONE':
				print '  NO ORG KNOWN FOR', file
				continue
			elif not metaDict[baseFName][1] == organism:
				print '  NOT ORGANISM RUNNING', file
				continue
			else:
				org = metaDict[baseFName][1]
				print '  USING ORG', org, file
			
		#check if there is an organism, must check due to files not in metaFile
		if org == 'None':
			print '  NO org (not in meta file)', file
			continue
		
		#only make wig file for organism asked for
		if not org == organism:
			continue
		
		#if it is right organism and has mapped file then add
		fileList.append(file)
	
	
	#make merged wig
	if organism == 'human':
		chroms = cg.humanChromosomes
		assembly = 'hg19'
	elif organism == 'mouse':
		chroms = cg.mouseChromosomes
		assembly = 'mm9'
	elif organism == 'zebrafish':
		chroms = cg.zebrafishChromosomes
		assembly = 'danRer6'
	
	print 'Making Bed File vectors'
	cvg = HTSeq.GenomicArray(chroms, stranded=True, typecode='i')
	for fName in fileList:
		alignment_file = HTSeq.BowtieReader(fName)
		for alngt in alignment_file:
			if alngt.aligned:
				cvg.add_value( 1, alngt.iv ) #iv is the genomic interval..

	bedNamePos = dirName + '/Merge.' + organism + '.1.wig'
	bedNameNeg = dirName + '/Merge.' + organism + '.-1.wig'
	
	print 'Writing Bed File'
	cvg.write_bedgraph_file(bedNamePos, "+" )
	cvg.write_bedgraph_file(bedNameNeg, "-" )

	#Now extend it
	updateWigLength(bedNamePos, assembly)
	updateWigLength(bedNameNeg, assembly)
	
	#Now Sort it.
	cgSort.wigSort(bedNamePos)
	cgSort.wigSort(bedNameNeg)