# Example 1
def main():
	# Functions for jtools:
	# Get sequence
	# Detect tandem acceptors NAGNAG
	# Annotate with genes
	# Jiggle
	# Bed to juncid
	# Guess frame
	# Find stops in intron + in frame
	# SVM recomputes
	# Splice site strength? ppt? 

	# Load a fasta file
	global f
	# Opening fasta filehandle
	print >> sys.stderr, "[%s] Opening fasta file" % (spanki_utils.timestamp())
	f = Fasta(fastafile)
	fastachr = set(sorted(f.keys()))
	#print fastachr

	### Parsing a juncbed file
	if (juncbedfile):

		print "juncid\toriginal_id\tdastring"
		print >> sys.stderr, "Loading", juncbedfile
		lines = csv.reader(open(juncbedfile, 'rb'), delimiter='\t')
		z = []
		for line in lines:
			pattern = re.compile('track')
			track = pattern.search(line[0])
			if not track:
				values = line
				blocksizes = values[10].split(",")
				blockstarts = values[11].split(",")
				chr = values[0]
				rangestart = int(values[1]) - 1
				rangeend = int(values[2])
				strand = values[5]
				id = values[3]
				intronstart = rangestart + int(blocksizes[0]) + 2
				intronend = rangeend - int(blocksizes[1]) 
				# Or..
				#intronend = rangestart + int(blocksizes[0]) + int(blockstarts[1])
				#chrXHet	800	1767	JUNC00000001	2	+	800	1767	255,0,0	2	20,63	0,904
				intronsize = intronend - intronstart;
				juncid = chr + ":" + str(intronstart) + "_" + str(intronend) + ":" + strand
				dastring = intron_sequence_single(juncid,f)
				print juncid, values[3], dastring
		print >> sys.stderr, "Distribution of detected motifs:\n",Counter(z)
	### Parsing a intronbed file
	#scaffold_12916	13833982	13834044	10
	#scaffold_12916	13838614	13838676	67
	#scaffold_12916	13839119	13839204	75

	if (intronbedfile):
		print "juncid\tid\tdastring"
		lines = csv.reader(open(intronbedfile, 'rb'), delimiter='\t')
		for line in lines:
			pattern = re.compile('track')
			track = pattern.search(line[0])
			values = line
			if not track:
				chr = values[0]
				intronstart = int(values[1]) + 1
				intronend = int(values[2]) - 1
				strand = "+"
				id = values[0]
				intronsize = intronend - intronstart;
				juncid = chr + ":" + str(intronstart) + "_" + str(intronend) + ":" + strand
				dastring = intron_sequence_single(juncid,f)
				print juncid, values[3], dastring

	### Converting from another format

	if gfffile:
		#reflist = tab_to_dict(gff)
		results = collections.defaultdict(lambda : collections.defaultdict(dict))
		gffdict = gff_to_dict(gfffile)
		for x in gffdict:
			#print x
			#print gffdict[x]
			if (gffdict[x]['feature_type'] == "exon_junction"):
				juncid = gffdict[x]['chr'] + ":" + str(int(gffdict[x]['start']) + 1) + "_" + str(int(gffdict[x]['end']) - 1) + ":" + gffdict[x]['strand']
			elif (gffdict[x]['feature_type'] == "intron"):
				juncid = gffdict[x]['chr'] + ":" + gffdict[x]['start'] + "_" + gffdict[x]['end'] + ":" + gffdict[x]['strand']
			dastring = intron_sequence_single(juncid,f)
			#print dastring
			results[x]['juncid'] = juncid
			results[x]['dastring'] = dastring
		print "ID\tjuncid\tdastring"
		for x in sorted(results.iterkeys()):
			print x, "\t", results[x]['juncid'], "\t", results[x]['dastring']

	### Converting from another format

	if gtffile:
		# Intializing the reference
		# You need the gtf file, and the fasta file
		lookup = spanki_utils.prep_ref(gtffile,fastafile,output_dir)
		## Note that you now have a reference called ref.bam, and a lookup dict
		#tmp_dir = output_dir + "/tmp/"
		#reffile = tmp_dir + "/ref.bam"
		reffile = "tmp/ref.bam"
		# Load an annotation, flattened as bam
		print >> sys.stderr, "[%s] Trying to load annotation as bam" % (spanki_utils.timestamp())
		reffh = pysam.Samfile( reffile, "rb" )
		edgedict, refjuncs = spanki_parse_utils.parseRefAsBam(reffh)
		print >> sys.stderr, "[%s] Done loading annotation as bam" % (spanki_utils.timestamp())
		for junc in refjuncs:
			print junc

	### Below are functions that operate on a junction list

	if jlist:
		# Load reference junction list
		reflist = tab_to_dict(jlist)
		# Find the junctions in jlist that are not in jtab
		myjuncs = reflist.keys()
		print >> sys.stderr, len(myjuncs), "in junction list"
		updonor = 20
		downdonor = 2
		upacceptor = 2
		downacceptor = 20
		for x in myjuncs:
			print x
			j1 = Junctionid(x)
			if j1.strand == "+":
				#print Seq(f[j1.chr][j1.donor-updonor:j1.donor], IUPAC.unambiguous_dna)
				tempseq = Seq(f[j1.chr][j1.donor-updonor:j1.donor], IUPAC.unambiguous_dna)
				#print "***", tempseq.translate()
				#print Seq(f[j1.chr][j1.donor:j1.donor + downdonor], IUPAC.unambiguous_dna)
				#print Seq(f[j1.chr][j1.acceptor-upacceptor:j1.acceptor], IUPAC.unambiguous_dna)
				#print Seq(f[j1.chr][j1.acceptor:j1.acceptor + downacceptor], IUPAC.unambiguous_dna)
				nagstring = find_nag(Seq(f[j1.chr][j1.acceptor:j1.acceptor + downacceptor], IUPAC.unambiguous_dna))
				print nagstring
			elif j1.strand == "-":
				#print Seq(f[j1.chr][j1.donor:j1.donor + updonor], IUPAC.unambiguous_dna).reverse_complement()
				#print Seq(f[j1.chr][j1.donor - downdonor:j1.donor], IUPAC.unambiguous_dna).reverse_complement()
				#print Seq(f[j1.chr][j1.acceptor:j1.acceptor + upacceptor], IUPAC.unambiguous_dna).reverse_complement()
				#print Seq(f[j1.chr][j1.acceptor-downacceptor:j1.acceptor], IUPAC.unambiguous_dna).reverse_complement()
				quit("Don't recognize strand")
				#fiveprimeflank = fiveprimeflank.reverse_complement()
	# Older code that's not used yet
	# IRT
	bamfh = pysam.Samfile( bamfile, "rb" )
	#for alignedread in samfile:
	# Need some kind of iterator to getread length from first alignment in sam
	print >> sys.stderr, "[%s] Getting intron read-though (IRT), may take awhile" % (spanki_utils.timestamp())
	IRT = intron_readthrough(myjuncs,bamfh)
	print >> sys.stderr, "[%s] Done getting IRT" % (spanki_utils.timestamp())

	#for edgeid in covbyedge.keys():
	#	print edgeid, covbyedge[edgeid]

	# These are the fields you end up with after merging:
	#juncid	geneassign	cov	lirt	rirt	irt	dncov	ancov	numsamps
	#chr2L:22427471_22427525:- 	none 	2 	57 	28 	85 	0 	0 	1
	#chr2R:5702257_5702656:+ 	FBgn0040092 	13 	0 	0 	0 	0 	0 	2
	#chr2L:11436293_11436415:- 	FBgn0261648 	23 	0 	0 	0 	0 	0 	2
	#chr2R:9334834_9336812:- 	FBgn0013765 	6 	0 	0 	0 	0 	0 	2

	# Now compile the results
	# First show how you can get in hte myjuncs list
	print >> sys.stderr, "Printing results table"
	print >> juncs_out, "juncid\tgeneassign\tannostatus\tintron_size\tgmcode\tregcode\tcov\tlirt\trirt\tirt\tdncov\tancov"
	for juncid in sorted(keys2):
			results = [juncid, jdict[juncid]['geneassign'], jdict[juncid]['annostatus'], jdict[juncid]['intron_size'], jdict[juncid]['gmcode'], jdict[juncid]['regcode'], jdict[juncid]['cov'], jdict[juncid]['lirt'], jdict[juncid]['rirt'], jdict[juncid]['irt'], jdict[juncid]['dncov'], jdict[juncid]['ancov']]
			print >> juncs_out, ('\t'.join(map(str,results)))
		except KeyError:
			j1 = Junctionid(juncid)
			donid = j1.donid
			accid = j1.accid
			if covbyedge[donid]: dncov = covbyedge[donid]
			else: dncov = 0
			if covbyedge[accid]: ancov = covbyedge[accid]
			else: ancov = 0
			results = [juncid, reflist[juncid]['geneassign'],  reflist[juncid]['annostatus'],  reflist[juncid]['intron_size'],  reflist[juncid]['gmcode'],  reflist[juncid]['regcode'], 0, IRT[juncid]['lirt'], IRT[juncid]['rirt'], IRT[juncid]['irt'], dncov, ancov]
			#print(results, sep='\t')
			print >> juncs_out, ('\t'.join(map(str,results)))


 	# Parse the read alignments
	# Parse the bam file
	## Get a table of junctions, table of donors etc.
	bamfh = pysam.Samfile( bamfile, "rb" )
	#JTAB,UNFILT_JTAB,STAB,NEWDTAB,MMES = parse_aligns_detailed(bamfh)
	JTAB,UNFILT_JTAB = quickcov(bamfh,anchorsize)
	myjuncs = JTAB.keys()
 	# Print junction list to the output directory
 	print "juncid\tunfilt_cov\tcov"
	for juncid in myjuncs:
		print juncid, UNFILT_JTAB[juncid], JTAB[juncid]
