Ejemplo n.º 1
0
def main():
	
	# Functions for jtools:
	# Get sequence
	# Detect tandem acceptors NAGNAG
	# Annotate with genes
	# Jiggle
	# Bed to juncid
	# Guess frame
	# Find stops in intron + in frame
	# SVM recomputes
	# Splice site strength? ppt? 

	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	# Load a fasta file
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	global f
	# Opening fasta filehandle
	print >> sys.stderr, "[%s] Opening fasta file" % (spanki_utils.timestamp())
	f = Fasta(fastafile)
	fastachr = set(sorted(f.keys()))
	#print fastachr

	########################################################
	### Parsing a juncbed file
	########################################################
	if (juncbedfile):

		print "juncid\toriginal_id\tdastring"
		print >> sys.stderr, "Loading", juncbedfile
		lines = csv.reader(open(juncbedfile, 'rb'), delimiter='\t')
		z = []
		for line in lines:
			pattern = re.compile('track')
			track = pattern.search(line[0])
			if not track:
				values = line
				blocksizes = values[10].split(",")
				blockstarts = values[11].split(",")
				chr = values[0]
				rangestart = int(values[1]) - 1
				rangeend = int(values[2])
				strand = values[5]
				id = values[3]
				intronstart = rangestart + int(blocksizes[0]) + 2
				intronend = rangeend - int(blocksizes[1]) 
				# Or..
				#intronend = rangestart + int(blocksizes[0]) + int(blockstarts[1])
			
				#chrXHet	800	1767	JUNC00000001	2	+	800	1767	255,0,0	2	20,63	0,904
			
				intronsize = intronend - intronstart;
			
				juncid = chr + ":" + str(intronstart) + "_" + str(intronend) + ":" + strand
				dastring = intron_sequence_single(juncid,f)
				z.append(str(dastring))
				print juncid, values[3], dastring
		
		print >> sys.stderr, "Distribution of detected motifs:\n",Counter(z)
		quit("Done")
	########################################################
	### Parsing a intronbed file
	########################################################
	#scaffold_12916	13833982	13834044	10
	#scaffold_12916	13838614	13838676	67
	#scaffold_12916	13839119	13839204	75

	if (intronbedfile):
		print "juncid\tid\tdastring"
		lines = csv.reader(open(intronbedfile, 'rb'), delimiter='\t')
		for line in lines:
			pattern = re.compile('track')
			track = pattern.search(line[0])
			values = line
			if not track:
				chr = values[0]
				intronstart = int(values[1]) + 1
				intronend = int(values[2]) - 1
				strand = "+"
				id = values[0]
			
				intronsize = intronend - intronstart;
			
				juncid = chr + ":" + str(intronstart) + "_" + str(intronend) + ":" + strand
				dastring = intron_sequence_single(juncid,f)
				
				print juncid, values[3], dastring
		
		
		quit("Done")
	########################################################

	########################################################
	### Converting from another format
	########################################################

	if gfffile:
	
		#reflist = tab_to_dict(gff)
		results = collections.defaultdict(lambda : collections.defaultdict(dict))
		gffdict = gff_to_dict(gfffile)
		for x in gffdict:
			#print x
			#print gffdict[x]
			if (gffdict[x]['feature_type'] == "exon_junction"):
				juncid = gffdict[x]['chr'] + ":" + str(int(gffdict[x]['start']) + 1) + "_" + str(int(gffdict[x]['end']) - 1) + ":" + gffdict[x]['strand']
			elif (gffdict[x]['feature_type'] == "intron"):
				juncid = gffdict[x]['chr'] + ":" + gffdict[x]['start'] + "_" + gffdict[x]['end'] + ":" + gffdict[x]['strand']
			dastring = intron_sequence_single(juncid,f)
			#print dastring
			results[x]['juncid'] = juncid
			results[x]['dastring'] = dastring
	
		print "ID\tjuncid\tdastring"
		for x in sorted(results.iterkeys()):
			print x, "\t", results[x]['juncid'], "\t", results[x]['dastring']
	
		quit()

	########################################################
	### Converting from another format
	########################################################

	if gtffile:
	
		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# Intializing the reference
		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# You need the gtf file, and the fasta file
		lookup = spanki_utils.prep_ref(gtffile,fastafile,output_dir)
		## Note that you now have a reference called ref.bam, and a lookup dict
		#tmp_dir = output_dir + "/tmp/"
		#reffile = tmp_dir + "/ref.bam"
		reffile = "tmp/ref.bam"
		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		# Load an annotation, flattened as bam
		#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		print >> sys.stderr, "[%s] Trying to load annotation as bam" % (spanki_utils.timestamp())
		reffh = pysam.Samfile( reffile, "rb" )
		edgedict, refjuncs = spanki_parse_utils.parseRefAsBam(reffh)
		reffh.close()
		print >> sys.stderr, "[%s] Done loading annotation as bam" % (spanki_utils.timestamp())
	
		for junc in refjuncs:
			print junc

	
		quit()
	### Below are functions that operate on a junction list
	########################################################


	if jlist:
	
		#~~~~~~~~~~~~~~~~~~~
		# Load reference junction list
		#~~~~~~~~~~~~~~~~~~~
		reflist = tab_to_dict(jlist)
	
		# Find the junctions in jlist that are not in jtab
	
		myjuncs = reflist.keys()
	
	
	
		
		print >> sys.stderr, len(myjuncs), "in junction list"
	
		updonor = 20
		downdonor = 2
		upacceptor = 2
		downacceptor = 20
		
	
		for x in myjuncs:
			print x
			j1 = Junctionid(x)
			j1.display()
			if j1.strand == "+":
				#print Seq(f[j1.chr][j1.donor-updonor:j1.donor], IUPAC.unambiguous_dna)
				tempseq = Seq(f[j1.chr][j1.donor-updonor:j1.donor], IUPAC.unambiguous_dna)
				#print "***", tempseq.translate()
				
				#print Seq(f[j1.chr][j1.donor:j1.donor + downdonor], IUPAC.unambiguous_dna)
				#print Seq(f[j1.chr][j1.acceptor-upacceptor:j1.acceptor], IUPAC.unambiguous_dna)
				#print Seq(f[j1.chr][j1.acceptor:j1.acceptor + downacceptor], IUPAC.unambiguous_dna)
				nagstring = find_nag(Seq(f[j1.chr][j1.acceptor:j1.acceptor + downacceptor], IUPAC.unambiguous_dna))
				print nagstring
			elif j1.strand == "-":
				pass
				#print Seq(f[j1.chr][j1.donor:j1.donor + updonor], IUPAC.unambiguous_dna).reverse_complement()
				#print Seq(f[j1.chr][j1.donor - downdonor:j1.donor], IUPAC.unambiguous_dna).reverse_complement()
				#print Seq(f[j1.chr][j1.acceptor:j1.acceptor + upacceptor], IUPAC.unambiguous_dna).reverse_complement()
				#print Seq(f[j1.chr][j1.acceptor-downacceptor:j1.acceptor], IUPAC.unambiguous_dna).reverse_complement()
			else:
				quit("Don't recognize strand")
	
				#fiveprimeflank = fiveprimeflank.reverse_complement()
		quit("Done")
	
	
	quit()
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	# Older code that's not used yet
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	
	
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	# IRT
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	bamfh = pysam.Samfile( bamfile, "rb" )
	#for alignedread in samfile:
	# Need some kind of iterator to getread length from first alignment in sam
	print >> sys.stderr, "[%s] Getting intron read-though (IRT), may take awhile" % (spanki_utils.timestamp())
	IRT = intron_readthrough(myjuncs,bamfh)
	bamfh.close()
	print >> sys.stderr, "[%s] Done getting IRT" % (spanki_utils.timestamp())



	#for edgeid in covbyedge.keys():
	#	print edgeid, covbyedge[edgeid]
		

	# These are the fields you end up with after merging:
	#juncid	geneassign	cov	lirt	rirt	irt	dncov	ancov	numsamps
	#chr2L:22427471_22427525:- 	none 	2 	57 	28 	85 	0 	0 	1
	#chr2R:5702257_5702656:+ 	FBgn0040092 	13 	0 	0 	0 	0 	0 	2
	#chr2L:11436293_11436415:- 	FBgn0261648 	23 	0 	0 	0 	0 	0 	2
	#chr2R:9334834_9336812:- 	FBgn0013765 	6 	0 	0 	0 	0 	0 	2

	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	# Now compile the results
	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	# First show how you can get in hte myjuncs list
	print >> sys.stderr, "Printing results table"
	print >> juncs_out, "juncid\tgeneassign\tannostatus\tintron_size\tgmcode\tregcode\tcov\tlirt\trirt\tirt\tdncov\tancov"
	
	for juncid in sorted(keys2):
		try:
			results = [juncid, jdict[juncid]['geneassign'], jdict[juncid]['annostatus'], jdict[juncid]['intron_size'], jdict[juncid]['gmcode'], jdict[juncid]['regcode'], jdict[juncid]['cov'], jdict[juncid]['lirt'], jdict[juncid]['rirt'], jdict[juncid]['irt'], jdict[juncid]['dncov'], jdict[juncid]['ancov']]
			print >> juncs_out, ('\t'.join(map(str,results)))
		except KeyError:
			#myjuncs.append(juncid)	
			j1 = Junctionid(juncid)
			donid = j1.donid
			accid = j1.accid
			if covbyedge[donid]: dncov = covbyedge[donid]
			else: dncov = 0
			if covbyedge[accid]: ancov = covbyedge[accid]
			else: ancov = 0
			results = [juncid, reflist[juncid]['geneassign'],  reflist[juncid]['annostatus'],  reflist[juncid]['intron_size'],  reflist[juncid]['gmcode'],  reflist[juncid]['regcode'], 0, IRT[juncid]['lirt'], IRT[juncid]['rirt'], IRT[juncid]['irt'], dncov, ancov]
			#print(results, sep='\t')
			print >> juncs_out, ('\t'.join(map(str,results)))

	quit("done")

 	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 	# Parse the read alignments
 	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	# Parse the bam file
	## Get a table of junctions, table of donors etc.
	bamfh = pysam.Samfile( bamfile, "rb" )
	#JTAB,UNFILT_JTAB,STAB,NEWDTAB,MMES = parse_aligns_detailed(bamfh)
	JTAB,UNFILT_JTAB = quickcov(bamfh,anchorsize)
	bamfh.close()
	myjuncs = JTAB.keys()
	myjuncs.sort()
	
 	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 	# Print junction list to the output directory
 	#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 	print "juncid\tunfilt_cov\tcov"
	for juncid in myjuncs:
		print juncid, UNFILT_JTAB[juncid], JTAB[juncid]
Ejemplo n.º 2
0
def main():

    # Functions for jtools:
    # Get sequence
    # Detect tandem acceptors NAGNAG
    # Annotate with genes
    # Jiggle
    # Bed to juncid
    # Guess frame
    # Find stops in intron + in frame
    # SVM recomputes
    # Splice site strength? ppt?

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Load a fasta file
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    global f
    # Opening fasta filehandle
    print >> sys.stderr, "[%s] Opening fasta file" % (spanki_utils.timestamp())
    f = Fasta(fastafile)
    fastachr = set(sorted(f.keys()))
    #print fastachr

    ########################################################
    ### Parsing a juncbed file
    ########################################################
    if (juncbedfile):

        print "juncid\toriginal_id\tdastring"
        print >> sys.stderr, "Loading", juncbedfile
        lines = csv.reader(open(juncbedfile, 'rb'), delimiter='\t')
        z = []
        for line in lines:
            pattern = re.compile('track')
            track = pattern.search(line[0])
            if not track:
                values = line
                blocksizes = values[10].split(",")
                blockstarts = values[11].split(",")
                chr = values[0]
                rangestart = int(values[1]) - 1
                rangeend = int(values[2])
                strand = values[5]
                id = values[3]
                intronstart = rangestart + int(blocksizes[0]) + 2
                intronend = rangeend - int(blocksizes[1])
                # Or..
                #intronend = rangestart + int(blocksizes[0]) + int(blockstarts[1])

                #chrXHet	800	1767	JUNC00000001	2	+	800	1767	255,0,0	2	20,63	0,904

                intronsize = intronend - intronstart

                juncid = chr + ":" + str(intronstart) + "_" + str(
                    intronend) + ":" + strand
                dastring = intron_sequence_single(juncid, f)
                z.append(str(dastring))
                print juncid, values[3], dastring

        print >> sys.stderr, "Distribution of detected motifs:\n", Counter(z)
        quit("Done")
    ########################################################
    ### Parsing a intronbed file
    ########################################################
    #scaffold_12916	13833982	13834044	10
    #scaffold_12916	13838614	13838676	67
    #scaffold_12916	13839119	13839204	75

    if (intronbedfile):
        print "juncid\tid\tdastring"
        lines = csv.reader(open(intronbedfile, 'rb'), delimiter='\t')
        for line in lines:
            pattern = re.compile('track')
            track = pattern.search(line[0])
            values = line
            if not track:
                chr = values[0]
                intronstart = int(values[1]) + 1
                intronend = int(values[2]) - 1
                strand = "+"
                id = values[0]

                intronsize = intronend - intronstart

                juncid = chr + ":" + str(intronstart) + "_" + str(
                    intronend) + ":" + strand
                dastring = intron_sequence_single(juncid, f)

                print juncid, values[3], dastring

        quit("Done")
    ########################################################

    ########################################################
    ### Converting from another format
    ########################################################

    if gfffile:

        #reflist = tab_to_dict(gff)
        results = collections.defaultdict(
            lambda: collections.defaultdict(dict))
        gffdict = gff_to_dict(gfffile)
        for x in gffdict:
            #print x
            #print gffdict[x]
            if (gffdict[x]['feature_type'] == "exon_junction"):
                juncid = gffdict[x]['chr'] + ":" + str(
                    int(gffdict[x]['start']) +
                    1) + "_" + str(int(gffdict[x]['end']) -
                                   1) + ":" + gffdict[x]['strand']
            elif (gffdict[x]['feature_type'] == "intron"):
                juncid = gffdict[x]['chr'] + ":" + gffdict[x][
                    'start'] + "_" + gffdict[x]['end'] + ":" + gffdict[x][
                        'strand']
            dastring = intron_sequence_single(juncid, f)
            #print dastring
            results[x]['juncid'] = juncid
            results[x]['dastring'] = dastring

        print "ID\tjuncid\tdastring"
        for x in sorted(results.iterkeys()):
            print x, "\t", results[x]['juncid'], "\t", results[x]['dastring']

        quit()

    ########################################################
    ### Converting from another format
    ########################################################

    if gtffile:

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Intializing the reference
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # You need the gtf file, and the fasta file
        lookup = spanki_utils.prep_ref(gtffile, fastafile, output_dir)
        ## Note that you now have a reference called ref.bam, and a lookup dict
        #tmp_dir = output_dir + "/tmp/"
        #reffile = tmp_dir + "/ref.bam"
        reffile = "tmp/ref.bam"
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Load an annotation, flattened as bam
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        print >> sys.stderr, "[%s] Trying to load annotation as bam" % (
            spanki_utils.timestamp())
        reffh = pysam.Samfile(reffile, "rb")
        edgedict, refjuncs = spanki_parse_utils.parseRefAsBam(reffh)
        reffh.close()
        print >> sys.stderr, "[%s] Done loading annotation as bam" % (
            spanki_utils.timestamp())

        for junc in refjuncs:
            print junc

        quit()
    ### Below are functions that operate on a junction list
    ########################################################

    if jlist:

        #~~~~~~~~~~~~~~~~~~~
        # Load reference junction list
        #~~~~~~~~~~~~~~~~~~~
        reflist = tab_to_dict(jlist)

        # Find the junctions in jlist that are not in jtab

        myjuncs = reflist.keys()

        print >> sys.stderr, len(myjuncs), "in junction list"

        updonor = 20
        downdonor = 2
        upacceptor = 2
        downacceptor = 20

        for x in myjuncs:
            print x
            j1 = Junctionid(x)
            j1.display()
            if j1.strand == "+":
                #print Seq(f[j1.chr][j1.donor-updonor:j1.donor], IUPAC.unambiguous_dna)
                tempseq = Seq(f[j1.chr][j1.donor - updonor:j1.donor],
                              IUPAC.unambiguous_dna)
                #print "***", tempseq.translate()

                #print Seq(f[j1.chr][j1.donor:j1.donor + downdonor], IUPAC.unambiguous_dna)
                #print Seq(f[j1.chr][j1.acceptor-upacceptor:j1.acceptor], IUPAC.unambiguous_dna)
                #print Seq(f[j1.chr][j1.acceptor:j1.acceptor + downacceptor], IUPAC.unambiguous_dna)
                nagstring = find_nag(
                    Seq(f[j1.chr][j1.acceptor:j1.acceptor + downacceptor],
                        IUPAC.unambiguous_dna))
                print nagstring
            elif j1.strand == "-":
                pass
                #print Seq(f[j1.chr][j1.donor:j1.donor + updonor], IUPAC.unambiguous_dna).reverse_complement()
                #print Seq(f[j1.chr][j1.donor - downdonor:j1.donor], IUPAC.unambiguous_dna).reverse_complement()
                #print Seq(f[j1.chr][j1.acceptor:j1.acceptor + upacceptor], IUPAC.unambiguous_dna).reverse_complement()
                #print Seq(f[j1.chr][j1.acceptor-downacceptor:j1.acceptor], IUPAC.unambiguous_dna).reverse_complement()
            else:
                quit("Don't recognize strand")

                #fiveprimeflank = fiveprimeflank.reverse_complement()
        quit("Done")

    quit()
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Older code that's not used yet
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # IRT
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    bamfh = pysam.Samfile(bamfile, "rb")
    #for alignedread in samfile:
    # Need some kind of iterator to getread length from first alignment in sam
    print >> sys.stderr, "[%s] Getting intron read-though (IRT), may take awhile" % (
        spanki_utils.timestamp())
    IRT = intron_readthrough(myjuncs, bamfh)
    bamfh.close()
    print >> sys.stderr, "[%s] Done getting IRT" % (spanki_utils.timestamp())

    #for edgeid in covbyedge.keys():
    #	print edgeid, covbyedge[edgeid]

    # These are the fields you end up with after merging:
    #juncid	geneassign	cov	lirt	rirt	irt	dncov	ancov	numsamps
    #chr2L:22427471_22427525:- 	none 	2 	57 	28 	85 	0 	0 	1
    #chr2R:5702257_5702656:+ 	FBgn0040092 	13 	0 	0 	0 	0 	0 	2
    #chr2L:11436293_11436415:- 	FBgn0261648 	23 	0 	0 	0 	0 	0 	2
    #chr2R:9334834_9336812:- 	FBgn0013765 	6 	0 	0 	0 	0 	0 	2

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Now compile the results
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # First show how you can get in hte myjuncs list
    print >> sys.stderr, "Printing results table"
    print >> juncs_out, "juncid\tgeneassign\tannostatus\tintron_size\tgmcode\tregcode\tcov\tlirt\trirt\tirt\tdncov\tancov"

    for juncid in sorted(keys2):
        try:
            results = [
                juncid, jdict[juncid]['geneassign'],
                jdict[juncid]['annostatus'], jdict[juncid]['intron_size'],
                jdict[juncid]['gmcode'], jdict[juncid]['regcode'],
                jdict[juncid]['cov'], jdict[juncid]['lirt'],
                jdict[juncid]['rirt'], jdict[juncid]['irt'],
                jdict[juncid]['dncov'], jdict[juncid]['ancov']
            ]
            print >> juncs_out, ('\t'.join(map(str, results)))
        except KeyError:
            #myjuncs.append(juncid)
            j1 = Junctionid(juncid)
            donid = j1.donid
            accid = j1.accid
            if covbyedge[donid]: dncov = covbyedge[donid]
            else: dncov = 0
            if covbyedge[accid]: ancov = covbyedge[accid]
            else: ancov = 0
            results = [
                juncid, reflist[juncid]['geneassign'],
                reflist[juncid]['annostatus'], reflist[juncid]['intron_size'],
                reflist[juncid]['gmcode'], reflist[juncid]['regcode'], 0,
                IRT[juncid]['lirt'], IRT[juncid]['rirt'], IRT[juncid]['irt'],
                dncov, ancov
            ]
            #print(results, sep='\t')
            print >> juncs_out, ('\t'.join(map(str, results)))

    quit("done")

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Parse the read alignments
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Parse the bam file
    ## Get a table of junctions, table of donors etc.
    bamfh = pysam.Samfile(bamfile, "rb")
    #JTAB,UNFILT_JTAB,STAB,NEWDTAB,MMES = parse_aligns_detailed(bamfh)
    JTAB, UNFILT_JTAB = quickcov(bamfh, anchorsize)
    bamfh.close()
    myjuncs = JTAB.keys()
    myjuncs.sort()

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Print junction list to the output directory
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    print "juncid\tunfilt_cov\tcov"
    for juncid in myjuncs:
        print juncid, UNFILT_JTAB[juncid], JTAB[juncid]