Example #1
0
def make_tRNA_fasta_dict(tRNAdf):
	"""
	Build FASTA records for the tRNA rows of the tRNA annotation table.

	tRNA-database counterpart of make_fasta_dict.  Only rows whose 'feature'
	column equals 'tRNA' are used.  Each sequence is sliced out of the
	module-level `genome` and reverse-complemented unless the strand is "+".

	Args:
		tRNAdf: table with the columns '#chrom', 'feature', 'chromStart',
		        'chromEnd', 'strand' and 'transcript_id'.
	Returns:
		OrderedDict mapping "tRNA_<gene_id>" to its SeqRecord, in row order.
	"""
	records = OrderedDict()

	for row in tRNAdf.index:
		# anything that is not a tRNA feature is ignored
		if tRNAdf.loc[row, 'feature'] != 'tRNA':
			continue

		chrom = tRNAdf.loc[row, '#chrom']
		first = int(tRNAdf.loc[row, 'chromStart'])
		last = int(tRNAdf.loc[row, 'chromEnd'])
		strand = tRNAdf.loc[row, 'strand']

		# gtf coordinates are 1-based; shift the start for 0-based slicing
		first -= 1
		sequence = SeqIO.Seq(genome[chrom][first:last])
		if strand != "+":
			# every strand other than "+" is handled as the negative strand
			sequence = sequence.reverse_complement()
		attrs = parse_entry(tRNAdf.loc[row, 'transcript_id'])

		record_id = "tRNA_" + attrs['gene_id'][0]
		gene_type = attrs['gene_type'][0]
		# the reported start is the 0-based (already shifted) coordinate
		location = " | %s; %s; %s:%s" % (chrom, strand, first, last)
		description = "| tRNA | " + gene_type + location

		records[record_id] = SeqRecord(
			sequence,
			id=record_id,
			name=attrs['gene_name'][0],
			description=description,
		)

	return records
Example #2
0
def make_fasta_dict(ncdf):
	"""
	Build FASTA records for the transcript rows of an annotation table.

	Only rows whose 'feature' column equals 'transcript' are used.  Each
	sequence is sliced out of the module-level `genome` and
	reverse-complemented unless the strand is "+".

	Args:
		ncdf: table with the columns '#chrom', 'feature', 'chromStart',
		      'chromEnd', 'strand' and 'transcript_id'.
	Returns:
		OrderedDict mapping each transcript ID to its SeqRecord, in row order.
	"""
	records = OrderedDict()

	for row in ncdf.index:
		# anything that is not a transcript feature is ignored
		if ncdf.loc[row, 'feature'] != 'transcript':
			continue

		chrom = ncdf.loc[row, '#chrom']
		first = int(ncdf.loc[row, 'chromStart'])
		last = int(ncdf.loc[row, 'chromEnd'])
		strand = ncdf.loc[row, 'strand']

		# gtf coordinates are 1-based; shift the start for 0-based slicing
		first -= 1
		sequence = SeqIO.Seq(genome[chrom][first:last])
		if strand != "+":
			# every strand other than "+" is handled as the negative strand
			sequence = sequence.reverse_complement()
		attrs = parse_mod_entry(ncdf.loc[row, 'transcript_id'])

		# annotation shown on the FASTA header line
		record_id = attrs['ID'][0]
		gene_type = attrs['gene_type'][0]
		gene_name = attrs['gene_name'][0]
		location = " | %s; %s; %s:%s" % (chrom, strand, first, last)
		description = "| " + gene_type + " | " + gene_name + location

		records[record_id] = SeqRecord(
			sequence,
			id=record_id,
			name=attrs['gene_name'][0],
			description=description,
		)

	return records
Example #3
0
def write_consensus_seqs(refseq, contrib_props, contrib_reads, args):
    """
    Call a consensus sequence per contributor and save them all as FASTA.

    One record is produced for every entry of contrib_props, in order,
    followed by a record for the 'unassigned' reads when contrib_reads has
    such an entry.  The output file is "<args.cons_prefix>.fa".

    Args:
        refseq: The reference sequence to which the fragments were aligned.
        contrib_props: A list of lists containing for each contributor
                       - contributor ID (hap#)
                       - haplogroup
                       - proportion in mixture (not used).
        contrib_reads: A table mapping hap# IDs to lists of pysam
                       AlignedSegments + an entry of unassigned.
        args: The argument values from mixemt's argparse results.
    Returns:
        nothing
    """
    def _consensus_record(read_key, description):
        # consensus is called with the same fixed settings for every group
        consensus = call_consensus(refseq,
                                   contrib_reads[read_key],
                                   1,
                                   args,
                                   strict=False)
        return SeqIO.SeqRecord(SeqIO.Seq(consensus),
                               id=read_key,
                               description=description)

    out_path = "%s.fa" % (args.cons_prefix)
    with open(out_path, 'w') as fa_out:
        records = [_consensus_record(con, hap)
                   for con, hap, _ in contrib_props]
        if 'unassigned' in contrib_reads:
            records.append(_consensus_record('unassigned', ''))
        SeqIO.write(records, fa_out, 'fasta')
Example #4
0
 def parse_result(self, genome_path):
     """
     Yield one SeqRecord per gene sequence found in a '.gmhmm' result file.

     The file read is ``genome_path + '.gmhmm'``.  A record starts at a line
     beginning with '>gene' and collects the stripped lines that follow
     until a whitespace-only line or the end of the file is reached.  (The
     end-of-file case used to be dropped silently when the file did not end
     with a blank line.)  A new '>gene' header met before the terminating
     blank line restarts collection and discards what was gathered so far.

     Args:
         genome_path: path of the genome file; '.gmhmm' is appended to it.
     Yields:
         SeqIO.SeqRecord with empty name and description, whose id is '>'
         followed by the header line stripped of all whitespace and '>'
         characters.
     """
     def _build_record(gene_id, pieces):
         # NOTE(review): the id keeps a leading '>' -- presumably what the
         # callers expect; confirm before writing these records as FASTA.
         return SeqIO.SeqRecord(SeqIO.Seq(''.join(pieces)),
                                id='>' + gene_id,
                                description='',
                                name='')

     result_path = genome_path + '.gmhmm'
     reading_gene = False
     seq_id = None
     seq = []
     with open(result_path) as f:
         for line in f:
             if line.startswith('>gene'):
                 # header example:
                 # >gene_2|GeneMark.hmm|57_nt|+|1|57  >NODE_3_length_713_cov_1.25228
                 reading_gene = True
                 seq = []
                 seq_id = re.sub(r'[\s>]', '', line)
             elif reading_gene:
                 if line.isspace():
                     # a blank line closes the current record
                     reading_gene = False
                     yield _build_record(seq_id, seq)
                 else:
                     seq.append(line.strip())
     if reading_gene:
         # the file ended while a record was still open: emit it too
         yield _build_record(seq_id, seq)
def _frame_stop_positions(frame_seq, shift, utr3_start, mRNAseq, frame_label,
                          trsp_id):
    """
    Locate every stop codon of one reading frame downstream of the CDS.

    Args:
        frame_seq: sequence to translate; its first codon starts `shift`
                   nucleotides away from the first 3'UTR nucleotide.
        shift: offset of the frame relative to the 3'UTR start (0, 1 or -1).
        utr3_start: 0-based mRNA index of the first 3'UTR nucleotide.
        mRNAseq: the full spliced mRNA, used to cross-check every hit.
        frame_label: frame name shown in the error message.
        trsp_id: transcript id shown in the error message.
    Returns:
        (utr3_positions, mrna_positions): parallel lists with the first
        nucleotide of each stop codon, counted from the 3'UTR start and from
        the mRNA start respectively.
    Exits the interpreter via sys.exit() when a translated stop does not
    read TAA, TAG or TGA in the mRNA.
    """
    utr3_positions = []
    mrna_positions = []
    for codon_index, residue in enumerate(frame_seq.translate()):
        if residue != '*':
            continue
        utr3_pos = codon_index * 3 + shift
        mrna_pos = utr3_start + utr3_pos
        ### check mRNA position to make sure stop codons are all valid
        sc = str(mRNAseq[mrna_pos:mrna_pos + 3].upper())
        if sc not in ("TAA", "TAG", "TGA"):
            print("stop codon in frame %s for %s is non correct!" %
                  (frame_label, trsp_id))
            print("stopcodon is: %s" % sc)
            sys.exit()
        utr3_positions.append(utr3_pos)
        mrna_positions.append(mrna_pos)
    return utr3_positions, mrna_positions


def build_utr3_stop_positions(GFFlist):
    """
    Find the 3'UTR stop codons of every transcript in all three frames.

    For each transcript on a chromosome listed in the module-level
    `validChrs`, the exons are spliced from the module-level `genome` into
    an mRNA (reverse-complemented for the minus strand), the CDS is
    delimited with `chrpostomrnapos`, and the 3'UTR is scanned for stop
    codons in frame 0, +1 and -1 relative to the CDS.

    Transcripts are skipped when they
        - lack an annotated start or stop codon (a warning is printed;
          previously this raised NameError or silently reused the codon
          positions of the preceding transcript),
        - do not start with ATG, or
        - do not end with TAA, TAG or TGA.

    Args:
        GFFlist: mapping of chromosome name to a record whose `.features`
                 are transcripts carrying exon / start_codon / stop_codon
                 `.sub_features`.
    Returns:
        (ucscIDlist, transcriptdict) where ucscIDlist holds the ids of the
        retained transcripts and transcriptdict maps each id to
        [id, gene name,
         frame 0 stops (3'UTR coords), frame 0 stops (mRNA coords),
         frame +1 stops (3'UTR coords), frame +1 stops (mRNA coords),
         frame -1 stops (3'UTR coords), frame -1 stops (mRNA coords)].
    """
    # NOTE: single-argument print(...) calls give the same output under
    # Python 2 and Python 3.
    transcriptdict = {}
    ucscIDlist = []
    total_transcripts = 0
    nonvalidchorms = 0
    nonATGstart = 0
    wrongstopcodon = 0
    validchroms = 0
    excluded_chroms = []
    included_chroms = []

    for chrom in GFFlist:
        if chrom not in validChrs:
            # check that only valid chromosomes are used
            excluded_chroms.append(chrom)
            nonvalidchorms += 1
            continue
        validchroms += 1
        included_chroms.append(chrom)

        # transcriptnum is the index of the transcript within .features,
        # which is what chrpostomrnapos expects
        for transcriptnum, transcript in enumerate(GFFlist[chrom].features):
            trsp_id = transcript.id
            trsp_strand = transcript.strand
            trsp_genename = transcript.qualifiers['Name'][0]
            trsp_chromstart = int(
                transcript.location.start.position)  # 0-based
            trsp_chromend = int(transcript.location.end.position)

            exonsplicedseq = SeqIO.Seq('')
            transcriptseq = SeqIO.Seq(
                genome[chrom][trsp_chromstart:trsp_chromend])
            if trsp_strand == -1:
                # computed once per transcript rather than once per feature
                transcriptseq_rev = transcriptseq.reverse_complement()

            startCodonMrnaList = []
            stopCodonMrnaList = []

            for item in transcript.sub_features:
                if trsp_strand == 1:
                    if item.type == 'exon':  # For yeast, use 'CDS'
                        exonstart = int(item.location.start.position)
                        exonend = int(item.location.end.position)
                        # half-open slice relative to the transcript start
                        exonsplicedseq += transcriptseq[
                            exonstart - trsp_chromstart:
                            exonend - trsp_chromstart]
                    elif item.type == 'start_codon':
                        # first nucleotide of the codon, 0-based
                        startcodonpos = item.location.start.position
                        startCodonMrnaList.append(
                            chrpostomrnapos(startcodonpos, chrom,
                                            transcriptnum, GFFlist))
                    elif item.type == 'stop_codon':
                        # last nucleotide of the codon, 0-based
                        stopcodonpos = item.location.end.position - 1
                        stopCodonMrnaList.append(
                            chrpostomrnapos(stopcodonpos, chrom,
                                            transcriptnum, GFFlist))

                elif trsp_strand == -1:
                    if item.type == 'exon':  # For yeast, use 'CDS'
                        exonstart = int(item.location.start.position)
                        exonend = int(item.location.end.position)
                        # same exon expressed in reverse-complement coords
                        exonseq = transcriptseq_rev[
                            trsp_chromend - exonend:
                            trsp_chromend - exonstart]
                        # each exon is prepended to the exons already joined
                        exonsplicedseq = exonseq + exonsplicedseq
                    elif item.type == 'start_codon':
                        # on the minus strand the codon starts at its
                        # highest genomic coordinate
                        startcodonpos = item.location.end.position - 1
                        startCodonMrnaList.append(
                            chrpostomrnapos(startcodonpos, chrom,
                                            transcriptnum, GFFlist))
                    elif item.type == 'stop_codon':
                        stopcodonpos = item.location.start.position
                        stopCodonMrnaList.append(
                            chrpostomrnapos(stopcodonpos, chrom,
                                            transcriptnum, GFFlist))

            ### choose start and stop codons
            if not startCodonMrnaList:
                print("!!! no start codon for %s" % (trsp_id))
            if not stopCodonMrnaList:
                print("!!! no stop codon for %s" % (trsp_id))
            if not startCodonMrnaList or not stopCodonMrnaList:
                # without both codons the CDS cannot be delimited
                continue
            startcodonmrnapos = min(startCodonMrnaList)
            stopcodonmrnapos = max(stopCodonMrnaList)

            mRNAseq = exonsplicedseq
            # the stop codon is part of the CDS (its last three nucleotides)
            cdsseq = exonsplicedseq[startcodonmrnapos:stopcodonmrnapos + 1]
            utr5seq = exonsplicedseq[:startcodonmrnapos]
            utr3seq = exonsplicedseq[stopcodonmrnapos + 1:]

            if str(cdsseq[:3].upper()) != "ATG":
                nonATGstart += 1
                continue  # ignore non-AUG start codons

            stopcodon = str(cdsseq[-3:].upper())
            if stopcodon not in ("TGA", "TAG", "TAA"):
                wrongstopcodon += 1
                continue  # ignore weird stop codons

            mRNAlen = len(exonsplicedseq)
            cdslen = len(cdsseq)
            utr5len = len(utr5seq)
            utr3len = len(utr3seq)
            # check that sum of features equals mRNA length
            assert mRNAlen == utr3len + cdslen + utr5len

            ###### Finding inframe stop codons ######
            utr3_start = utr5len + cdslen

            # frame 0: codons counted from the first 3'UTR nucleotide
            frameZeroStopPositions, frameZeroStopPositionsMRNA = \
                _frame_stop_positions(utr3seq, 0, utr3_start, mRNAseq,
                                      "0", trsp_id)
            # frame +1: start one nucleotide into the 3'UTR
            framePlusOneStopPositions, framePlusOneStopPositionsMRNA = \
                _frame_stop_positions(utr3seq[1:], 1, utr3_start, mRNAseq,
                                      "+1", trsp_id)
            # frame -1: include the last nucleotide of the CDS
            frameMinusOneStopPositions, frameMinusOneStopPositionsMRNA = \
                _frame_stop_positions(cdsseq[-1:] + utr3seq, -1, utr3_start,
                                      mRNAseq, "-1", trsp_id)

            ucscIDlist.append(trsp_id)
            transcriptdict[trsp_id] = [
                trsp_id, trsp_genename, frameZeroStopPositions,
                frameZeroStopPositionsMRNA, framePlusOneStopPositions,
                framePlusOneStopPositionsMRNA, frameMinusOneStopPositions,
                frameMinusOneStopPositionsMRNA
            ]
            total_transcripts += 1

    print("total number of transcripts in data table: %s" % total_transcripts)
    print("Number of included chromosomes chr: %s" % validchroms)
    print("Number of excluded chromosomes chr: %s" % nonvalidchorms)
    # the double space matches what the former `print a, b` statement wrote
    print("included chroms:  %s" % (included_chroms,))
    print("excluded chroms:  %s" % (excluded_chroms,))
    print("transcripts discarded due to non-AUG start codon %s" % nonATGstart)
    print("transcripts discarded due to noncanonical stop codon %s" %
          wrongstopcodon)
    return ucscIDlist, transcriptdict
Example #6
0
    def builddense(self):
        """
        Build spliced read-density vectors per transcript and write them out.

        For every transcript on a whitelisted chromosome the unspliced counts
        returned by ``self.getbam_5or3counts`` are spliced exon by exon into
        mRNA orientation.  A transcript is kept when the counts inside its
        CDS sum to at least ``self.threshold``.  If a start codon is missing
        the CDS is taken to begin at mRNA position 0; if a stop codon is
        missing it is taken to end three nucleotides before the mRNA end.

        The kept counts are passed to ``self.norm_m`` together with either
        the read total gathered here (when ``self.totreads == '-1'``) or
        ``self.totreads``; norm_m's return value is ignored, so it is
        presumably expected to normalise the dict in place.

        Outputs:
            '<self.outputdata>.csv.gz'    one 'density' column per transcript
            '<self.outputdata>output.txt' parameters and read statistics
            stdout                        the same statistics

        Reads the module-level ``genome``.
        Returns:
            nothing
        """
        # NOTE: single-argument print(...) calls give the same output under
        # Python 2 and Python 3.
        transcriptdict = {}
        mappedlocalreads = 0
        dumppedreads = 0
        illegalreads = 0
        tooshortlongreads = 0
        wrongstrandreads = 0
        # tallied for debugging only; not part of the written summary
        noStartCodon = 0
        noStopCodon = 0
        totalreads = 0  # not totreads
        GFFlist = self.makeGFFlist(self.GTFgen)

        # validChrs = 'chrLUC' # for building a single chromosome
        validChrs = [
            'chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8',
            'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',
            'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22',
            'chrX', 'chrY', 'chrM', 'chrSinV', 'chrLUC'
        ]

        for chrom in GFFlist:
            if chrom not in validChrs:
                continue
            # transcriptnum is the index of the transcript within .features
            for transcriptnum, transcript in enumerate(
                    GFFlist[chrom].features):
                trsp_id = transcript.id
                trsp_strand = transcript.strand
                trsp_chromstart = int(
                    transcript.location.start.position)  # 0-based
                trsp_chromend = int(transcript.location.end.position)
                # one slot per nucleotide of the transcript (pre-mRNA)
                transcriptlist = [0.0] * abs(trsp_chromend - trsp_chromstart)

                # gb[0] holds the unspliced counts, gb[1:] the read tallies
                gb = self.getbam_5or3counts(
                    self.bamfile, transcriptlist, chrom, transcriptnum,
                    trsp_chromstart, trsp_chromend, trsp_id, trsp_strand,
                    self.riboshiftdict, self.assignment, self.bamfileout)

                mappedlocalreads += gb[1]
                dumppedreads += gb[2]
                illegalreads += gb[3]
                tooshortlongreads += gb[4]
                wrongstrandreads += gb[5]
                totalreads += gb[6]

                exonsplicedseq = SeqIO.Seq('')
                exonsplicedcounts = []
                transcriptseq = SeqIO.Seq(
                    genome[chrom][trsp_chromstart:trsp_chromend])
                if trsp_strand == -1:
                    # computed once per transcript, not once per feature
                    transcriptseq_rev = transcriptseq.reverse_complement()

                startCodonMrnaList = []
                stopCodonMrnaList = []

                for item in transcript.sub_features:
                    if trsp_strand == 1:
                        if item.type == 'exon':  # For yeast, use 'CDS'
                            exonstart = int(item.location.start.position)
                            exonend = int(item.location.end.position)
                            # half-open slice relative to transcript start
                            exonstart_feat = exonstart - trsp_chromstart
                            exonend_feat = exonend - trsp_chromstart
                            exonsplicedcounts += gb[0][
                                exonstart_feat:exonend_feat]
                            exonsplicedseq += transcriptseq[
                                exonstart_feat:exonend_feat]
                        elif item.type == 'start_codon':
                            # first nucleotide of the codon, 0-based
                            startcodonpos = item.location.start.position
                            startCodonMrnaList.append(
                                self.chrpostomrnapos(
                                    startcodonpos, chrom, transcriptnum,
                                    GFFlist))
                        elif item.type == 'stop_codon':
                            # last nucleotide of the codon, 0-based
                            stopcodonpos = item.location.end.position - 1
                            stopCodonMrnaList.append(
                                self.chrpostomrnapos(
                                    stopcodonpos, chrom, transcriptnum,
                                    GFFlist))

                    elif trsp_strand == -1:
                        if item.type == 'exon':  # For yeast, use 'CDS'
                            exonstart = int(item.location.start.position)
                            exonend = int(item.location.end.position)
                            # same exon in reverse-complement coordinates
                            exonstart_feat = trsp_chromend - exonend
                            exonend_feat = trsp_chromend - exonstart
                            # each exon is prepended to what is already
                            # spliced; the counts are not flipped again here
                            exoncounts = gb[0][exonstart_feat:exonend_feat]
                            exonsplicedcounts = exoncounts + exonsplicedcounts
                            exonseq = transcriptseq_rev[
                                exonstart_feat:exonend_feat]
                            exonsplicedseq = exonseq + exonsplicedseq
                        elif item.type == 'start_codon':
                            # on the minus strand the codon starts at its
                            # highest genomic coordinate
                            startcodonpos = item.location.end.position - 1
                            startCodonMrnaList.append(
                                self.chrpostomrnapos(
                                    startcodonpos, chrom, transcriptnum,
                                    GFFlist))
                        elif item.type == 'stop_codon':
                            stopcodonpos = item.location.start.position
                            stopCodonMrnaList.append(
                                self.chrpostomrnapos(
                                    stopcodonpos, chrom, transcriptnum,
                                    GFFlist))

                if len(startCodonMrnaList) > 0:
                    startcodonmrnapos = min(startCodonMrnaList)
                else:
                    noStartCodon += 1
                    startcodonmrnapos = 0  ### transcripts without start codon
                if len(stopCodonMrnaList) > 0:
                    stopcodonmrnapos = max(stopCodonMrnaList)
                else:
                    noStopCodon += 1
                    ### leave 3nt's in "3'UTR"
                    stopcodonmrnapos = len(exonsplicedseq) - 3

                # counts from the start codon through the stop codon
                cdscounts = exonsplicedcounts[
                    startcodonmrnapos:stopcodonmrnapos + 1]

                # thresholding minimal reads per CDS
                if sum(cdscounts) >= float(self.threshold):
                    transcriptdict[trsp_id] = exonsplicedcounts

        if self.totreads == '-1':
            print(str(totalreads) +
                  " total mapped reads used for normalization.")
            # uses the read total gathered in the loop above
            self.norm_m(transcriptdict, totalreads)
        else:
            print(str(self.totreads) +
                  " total mapped reads from STAR alignment used for normalization.")
            self.norm_m(transcriptdict, self.totreads)

        ### assemble dataframe here: one row per transcript, a single
        ### 'density' cell holding the whole count list
        outdict = OrderedDict(
            (key, [val]) for key, val in transcriptdict.items())

        df = pd.DataFrame.from_dict(outdict, orient='index')
        df.columns = ['density']
        print(df.head())
        df.to_csv('%s.csv.gz' % (self.outputdata), compression='gzip')

        # Write output file of comments; the with-block closes the file even
        # if one of the writes fails.
        with open(self.outputdata + "output.txt", "w") as fc:
            fc.write("Density was built with parameters:\n")
            fc.write("riboshiftdict=" + str(self.riboshiftdict) + "\n")
            fc.write("threshold=" + str(self.threshold) + "\n")
            fc.write("assignment=" + str(self.assignment) + "\n")
            fc.write("reads mapped to known canonical coding transcripts: " +
                     str(mappedlocalreads) + "\n")
            fc.write("reads are dumpped, due to weird cigar codes: " +
                     str(dumppedreads) + "\n")
            fc.write(
                "reads are illegal, mapped outside of annotated transcripts: "
                + str(illegalreads) + "\n")
            fc.write("reads are too short/long: " + str(tooshortlongreads) +
                     "\n")
            fc.write("reads are on the wrong strand: " +
                     str(wrongstrandreads) + "\n")
            fc.write("total mapped reads from aligner: " + str(totalreads))

        print(str(mappedlocalreads) +
              " reads within length limitation mapped to known canonical coding transcripts. ")
        print(str(dumppedreads) +
              " reads are dumpped, due to weird cigar codes.")
        print(str(illegalreads) +
              " reads are illegal, mapped outside of annotated transcripts.")
        print(str(tooshortlongreads) + " reads are too short/long.")
        print(str(wrongstrandreads) + " reads are on the wrong strand.")
        print(str(totalreads) + " total mapped reads from aligner. ")
def build_utr_table(GFFlist, inculde_noncanon_start, include_noncanon_stop):
    """
    Build a per-transcript table of mRNA, CDS and UTR lengths from a parsed GFF.

    Uses the same transcript-assembly steps as densebuilder_main but does not
    return read counts.

    Args:
        GFFlist: mapping of chromosome name -> record whose ``.features`` are
            the transcripts; each transcript carries 'exon', 'start_codon'
            and 'stop_codon' entries in ``.sub_features``.
        inculde_noncanon_start: boolean flag; when false, transcripts whose
            CDS does not begin with ATG are reported and skipped. (The
            misspelled name is part of the public signature.)
        include_noncanon_stop: boolean flag; when false, transcripts whose
            CDS does not end in TGA/TAG/TAA are reported and skipped.

    Returns:
        (ucscIDlist, transcriptdict). ``ucscIDlist`` holds the ids of the
        retained transcripts in iteration order; ``transcriptdict`` maps each
        id to
        [transcript, chrom, featnum, strand, mrna_len, cds_len, 5utr_len,
         3utr_len, gene_name, stopcodon, stop4nt].

    Relies on the module-level names ``validChrs``, ``genome``,
    ``chrpostomrnapos`` and ``SeqIO``.
    """
    transcriptdict = {}
    ucscIDlist = []
    total_transcripts = 0
    nonvalidchorms = 0
    nonATGstart = 0
    wrongstopcodon = 0
    validchroms = 0
    excluded_chroms = []
    included_chroms = []

    for chrom in GFFlist:
        if chrom not in validChrs:
            # only chromosomes listed in validChrs are processed
            excluded_chroms.append(chrom)
            nonvalidchorms += 1
            continue
        validchroms += 1
        included_chroms.append(chrom)

        # transcriptnum is the index of the transcript within this chromosome
        for transcriptnum, transcript in enumerate(GFFlist[chrom].features):
            trsp_id = transcript.id
            trsp_strand = transcript.strand
            # qualifiers is a dict of lists parsed from GTF column 8
            trsp_genename = transcript.qualifiers['Name'][0]
            trsp_chromstart = int(transcript.location.start.position)  # 0-based
            trsp_chromend = int(transcript.location.end.position)

            exonsplicedseq = SeqIO.Seq('')
            transcriptseq = SeqIO.Seq(
                genome[chrom][trsp_chromstart:trsp_chromend])
            if trsp_strand == -1:
                # computed once per transcript rather than once per sub-feature
                transcriptseq_rev = transcriptseq.reverse_complement()

            # lists, because a start/stop codon that spans a splice junction
            # is annotated as several entries
            startCodonMrnaList = []
            stopCodonMrnaList = []

            for item in transcript.sub_features:
                if trsp_strand == 1:
                    if item.type == 'exon':  # For yeast, use 'CDS'
                        exonstart = int(item.location.start.position)
                        exonend = int(item.location.end.position)
                        # slice is relative to the transcript start
                        exonsplicedseq += transcriptseq[
                            exonstart - trsp_chromstart:
                            exonend - trsp_chromstart]
                    elif item.type == 'start_codon':
                        startcodonpos = item.location.start.position
                        startCodonMrnaList.append(
                            chrpostomrnapos(startcodonpos, chrom,
                                            transcriptnum, GFFlist))
                    elif item.type == 'stop_codon':
                        # end is exclusive, -1 gives the last stop-codon nt
                        stopcodonpos = item.location.end.position - 1
                        stopCodonMrnaList.append(
                            chrpostomrnapos(stopcodonpos, chrom,
                                            transcriptnum, GFFlist))

                elif trsp_strand == -1:
                    if item.type == 'exon':  # For yeast, use 'CDS'
                        exonstart = int(item.location.start.position)
                        exonend = int(item.location.end.position)
                        # coordinates are mirrored onto the reverse complement
                        exonseq = transcriptseq_rev[
                            trsp_chromend - exonend:
                            trsp_chromend - exonstart]
                        # each exon is prepended to the sequence built so far
                        exonsplicedseq = exonseq + exonsplicedseq
                    elif item.type == 'start_codon':
                        startcodonpos = item.location.end.position - 1
                        startCodonMrnaList.append(
                            chrpostomrnapos(startcodonpos, chrom,
                                            transcriptnum, GFFlist))
                    elif item.type == 'stop_codon':
                        stopcodonpos = item.location.start.position
                        stopCodonMrnaList.append(
                            chrpostomrnapos(stopcodonpos, chrom,
                                            transcriptnum, GFFlist))

            if startCodonMrnaList:
                startcodonmrnapos = min(startCodonMrnaList)
            else:
                print("!!! no start codon for %s" % (trsp_id))
                startcodonmrnapos = 0  # no annotated start: CDS begins at nt 0

            if stopCodonMrnaList:
                stopcodonmrnapos = max(stopCodonMrnaList)
            else:
                print("!!! no stop codon for %s" % (trsp_id))
                # NOTE(review): this leaves the final 2 nt as "3'UTR"; the
                # original comment said 3 nt -- confirm which was intended.
                stopcodonmrnapos = len(exonsplicedseq) - 3

            # stopcodonmrnapos is the last nt of the stop codon (inclusive)
            cdsseq = exonsplicedseq[startcodonmrnapos:stopcodonmrnapos + 1]
            utr5seq = exonsplicedseq[:startcodonmrnapos]
            utr3seq = exonsplicedseq[stopcodonmrnapos + 1:]

            if not inculde_noncanon_start:
                if str(cdsseq[:3].upper()) != "ATG":
                    nonATGstart += 1
                    print("non canon start")
                    print(trsp_id)
                    print(cdsseq)
                    print("")
                    continue  # ignore non-AUG start codons

            stopcodon = str(cdsseq[-3:].upper())
            if len(utr3seq) > 0:
                # stop codon plus the first 3'UTR nucleotide
                stop4nt = stopcodon + str(utr3seq[0].upper())
            else:
                stop4nt = '0'

            if not include_noncanon_stop:
                if stopcodon not in ("TGA", "TAG", "TAA"):
                    wrongstopcodon += 1
                    print("wrong stop!")
                    print(trsp_id)
                    print(cdsseq)
                    print("")
                    continue  # ignore weird stop codons

            mRNAlen = len(exonsplicedseq)
            cdslen = len(cdsseq)
            utr5len = len(utr5seq)
            utr3len = len(utr3seq)
            # the three regions must tile the spliced mRNA exactly
            assert mRNAlen == utr3len + cdslen + utr5len

            trsp_attr_list = [
                trsp_id, chrom, transcriptnum, trsp_strand, mRNAlen, cdslen,
                utr5len, utr3len, trsp_genename, stopcodon, stop4nt
            ]
            ucscIDlist.append(trsp_id)
            transcriptdict[trsp_id] = trsp_attr_list
            total_transcripts += 1

    # single-argument print(...) keeps the output identical under Python 2
    print("total number of transcripts in data table: %s" % total_transcripts)
    print("Number of included chromosomes chr: %s" % validchroms)
    print("Number of excluded chromosomes chr: %s" % nonvalidchorms)
    print("included chroms:  %s" % (included_chroms,))
    print("excluded chroms:  %s" % (excluded_chroms,))
    print("transcripts discarded due to non-AUG start codon %s" % nonATGstart)
    print("transcripts discarded due to noncanonical stop codon %s" %
          wrongstopcodon)
    return ucscIDlist, transcriptdict
Example #8
0
def get_Prot_sequence(GFFlist):
    """
    Translate the annotated CDS of every transcript in a parsed GFF.

    Only transcripts on chromosomes in ``validChrs`` whose CDS starts with
    ATG and ends in TGA/TAG/TAA are kept. Transcripts lacking an annotated
    start or stop codon are reported and skipped; previously they reused the
    codon positions left over from the preceding transcript, or raised
    NameError when no earlier transcript had set them.

    Args:
        GFFlist: mapping of chromosome name -> record whose ``.features`` are
            the transcripts; each transcript carries 'exon', 'start_codon'
            and 'stop_codon' entries in ``.sub_features``.

    Returns:
        (ucscIDlist, transcriptdict). ``ucscIDlist`` holds the ids of the
        retained transcripts in iteration order; ``transcriptdict`` maps each
        id to [transcript_id, gene_name, translated_cds].

    Relies on the module-level names ``validChrs``, ``genome``,
    ``chrpostomrnapos`` and ``SeqIO``.
    """
    transcriptdict = {}
    ucscIDlist = []
    total_transcripts = 0
    nonvalidchorms = 0
    nonATGstart = 0
    wrongstopcodon = 0
    validchroms = 0
    excluded_chroms = []
    included_chroms = []

    for chrom in GFFlist:
        if chrom not in validChrs:
            # only chromosomes listed in validChrs are processed
            excluded_chroms.append(chrom)
            nonvalidchorms += 1
            continue
        validchroms += 1
        included_chroms.append(chrom)

        # transcriptnum is the index of the transcript within this chromosome
        for transcriptnum, transcript in enumerate(GFFlist[chrom].features):
            trsp_id = transcript.id
            trsp_strand = transcript.strand
            trsp_genename = transcript.qualifiers['Name'][0]
            trsp_chromstart = int(transcript.location.start.position)  # 0-based
            trsp_chromend = int(transcript.location.end.position)

            exonsplicedseq = SeqIO.Seq('')
            transcriptseq = SeqIO.Seq(
                genome[chrom][trsp_chromstart:trsp_chromend])
            if trsp_strand == -1:
                # computed once per transcript rather than once per sub-feature
                transcriptseq_rev = transcriptseq.reverse_complement()

            # lists, because a start/stop codon that spans a splice junction
            # is annotated as several entries
            startCodonMrnaList = []
            stopCodonMrnaList = []

            for item in transcript.sub_features:
                if trsp_strand == 1:
                    if item.type == 'exon':  # For yeast, use 'CDS'
                        exonstart = int(item.location.start.position)
                        exonend = int(item.location.end.position)
                        # slice is relative to the transcript start
                        exonsplicedseq += transcriptseq[
                            exonstart - trsp_chromstart:
                            exonend - trsp_chromstart]
                    elif item.type == 'start_codon':
                        startcodonpos = item.location.start.position
                        startCodonMrnaList.append(
                            chrpostomrnapos(startcodonpos, chrom,
                                            transcriptnum, GFFlist))
                    elif item.type == 'stop_codon':
                        # end is exclusive, -1 gives the last stop-codon nt
                        stopcodonpos = item.location.end.position - 1
                        stopCodonMrnaList.append(
                            chrpostomrnapos(stopcodonpos, chrom,
                                            transcriptnum, GFFlist))

                elif trsp_strand == -1:
                    if item.type == 'exon':  # For yeast, use 'CDS'
                        exonstart = int(item.location.start.position)
                        exonend = int(item.location.end.position)
                        # coordinates are mirrored onto the reverse complement
                        exonseq = transcriptseq_rev[
                            trsp_chromend - exonend:
                            trsp_chromend - exonstart]
                        # each exon is prepended to the sequence built so far
                        exonsplicedseq = exonseq + exonsplicedseq
                    elif item.type == 'start_codon':
                        startcodonpos = item.location.end.position - 1
                        startCodonMrnaList.append(
                            chrpostomrnapos(startcodonpos, chrom,
                                            transcriptnum, GFFlist))
                    elif item.type == 'stop_codon':
                        stopcodonpos = item.location.start.position
                        stopCodonMrnaList.append(
                            chrpostomrnapos(stopcodonpos, chrom,
                                            transcriptnum, GFFlist))

            # choose start and stop codons; without both there is no CDS to
            # translate, so report what is missing and move on
            if not startCodonMrnaList:
                print("!!! no start codon for %s" % (trsp_id))
            if not stopCodonMrnaList:
                print("!!! no stop codon for %s" % (trsp_id))
            if not startCodonMrnaList or not stopCodonMrnaList:
                continue
            startcodonmrnapos = min(startCodonMrnaList)
            stopcodonmrnapos = max(stopCodonMrnaList)

            # stopcodonmrnapos is the last nt of the stop codon (inclusive)
            cdsseq = exonsplicedseq[startcodonmrnapos:stopcodonmrnapos + 1]
            utr5seq = exonsplicedseq[:startcodonmrnapos]
            utr3seq = exonsplicedseq[stopcodonmrnapos + 1:]

            if str(cdsseq[:3].upper()) != "ATG":
                nonATGstart += 1
                continue  # ignore non-AUG start codons

            # the stop codon is the last 3 nt of cdsseq
            stopcodon = str(cdsseq[-3:].upper())
            if stopcodon not in ("TGA", "TAG", "TAA"):
                wrongstopcodon += 1
                continue  # ignore weird stop codons

            mRNAlen = len(exonsplicedseq)
            cdslen = len(cdsseq)
            utr5len = len(utr5seq)
            utr3len = len(utr3seq)
            # the three regions must tile the spliced mRNA exactly
            assert mRNAlen == utr3len + cdslen + utr5len

            # translate only transcripts that survived the filters
            cdsProt = cdsseq.translate()

            trsp_attr_list = [trsp_id, trsp_genename, cdsProt]
            ucscIDlist.append(trsp_id)
            transcriptdict[trsp_id] = trsp_attr_list
            total_transcripts += 1

    # single-argument print(...) keeps the output identical under Python 2
    print("total number of transcripts in data table: %s" % total_transcripts)
    print("Number of included chromosomes chr: %s" % validchroms)
    print("Number of excluded chromosomes chr: %s" % nonvalidchorms)
    print("included chroms:  %s" % (included_chroms,))
    print("excluded chroms:  %s" % (excluded_chroms,))
    print("transcripts discarded due to non-AUG start codon %s" % nonATGstart)
    print("transcripts discarded due to noncanonical stop codon %s" %
          wrongstopcodon)
    return ucscIDlist, transcriptdict
def find_uORFs(GFFlist):
    """
    Identify every ATG-initiated uORF in the 5'UTRs and write two csv files.

    Uses the same transcript-assembly steps as densebuilder_main. For each
    transcript on a chromosome in ``validChrs`` whose CDS starts with ATG and
    ends in TGA/TAG/TAA, every ATG lying fully inside the 5'UTR is extended
    codon by codon until a stop codon is translated or the end of the CDS is
    passed.

    Output:
        ``uORFtableOutfile``   - one row per uORF (columns in ``dfCols``).
        ``uORFsummaryOutfile`` - one row per retained transcript that has a
            non-empty 5'UTR (columns in ``summaryCols``).
        The head of the summary table is printed.

    Fixes relative to the previous implementation:
        * transcripts lacking an annotated start or stop codon are skipped;
          previously they reused the positions left over from the preceding
          transcript (or raised NameError);
        * the codon starting 2 nt before the CDS is now read across the
          UTR/CDS junction instead of being dropped;
        * rows are collected in lists and each DataFrame is built once,
          instead of calling pd.concat for every row.

    Args:
        GFFlist: mapping of chromosome name -> record whose ``.features`` are
            the transcripts; each transcript carries 'exon', 'start_codon'
            and 'stop_codon' entries in ``.sub_features``.

    Relies on the module-level names ``validChrs``, ``genome``,
    ``chrpostomrnapos``, ``SeqIO``, ``pd``, ``uORFtableOutfile`` and
    ``uORFsummaryOutfile``.
    """
    # could possibly be extended to non canonical start codons
    start_codon = "ATG"

    dfCols = [
        'trxname', 'symbol', 'strand', 'uORFCounter', 'startPosition',
        'cdsExtension', 'utr5len', 'cdslen', 'utr3len', 'uORFlen', 'uORFseq',
        'uORFaa'
    ]
    summaryCols = [
        'trxname', 'symbol', 'chr', 'tr_number', 'strand', 'uORFCounter',
        'cdsExtension'
    ]
    uorf_rows = []
    summary_rows = []

    for chrom in GFFlist:
        if chrom not in validChrs:
            continue  # only chromosomes listed in validChrs are processed

        # transcriptnum is the index of the transcript within this chromosome
        for transcriptnum, transcript in enumerate(GFFlist[chrom].features):
            trsp_id = transcript.id
            trsp_strand = transcript.strand
            trsp_genename = transcript.qualifiers['Name'][0]
            trsp_chromstart = int(transcript.location.start.position)  # 0-based
            trsp_chromend = int(transcript.location.end.position)

            exonsplicedseq = SeqIO.Seq('')
            transcriptseq = SeqIO.Seq(
                genome[chrom][trsp_chromstart:trsp_chromend])
            if trsp_strand == -1:
                # computed once per transcript rather than once per sub-feature
                transcriptseq_rev = transcriptseq.reverse_complement()

            # lists, because a start/stop codon that spans a splice junction
            # is annotated as several entries
            startCodonMrnaList = []
            stopCodonMrnaList = []

            for item in transcript.sub_features:
                if trsp_strand == 1:
                    if item.type == 'exon':  # For yeast, use 'CDS'
                        exonstart = int(item.location.start.position)
                        exonend = int(item.location.end.position)
                        # slice is relative to the transcript start
                        exonsplicedseq += transcriptseq[
                            exonstart - trsp_chromstart:
                            exonend - trsp_chromstart]
                    elif item.type == 'start_codon':
                        startcodonpos = item.location.start.position
                        startCodonMrnaList.append(
                            chrpostomrnapos(startcodonpos, chrom,
                                            transcriptnum, GFFlist))
                    elif item.type == 'stop_codon':
                        # end is exclusive, -1 gives the last stop-codon nt
                        stopcodonpos = item.location.end.position - 1
                        stopCodonMrnaList.append(
                            chrpostomrnapos(stopcodonpos, chrom,
                                            transcriptnum, GFFlist))

                elif trsp_strand == -1:
                    if item.type == 'exon':  # For yeast, use 'CDS'
                        exonstart = int(item.location.start.position)
                        exonend = int(item.location.end.position)
                        # coordinates are mirrored onto the reverse complement
                        exonseq = transcriptseq_rev[
                            trsp_chromend - exonend:
                            trsp_chromend - exonstart]
                        # each exon is prepended to the sequence built so far
                        exonsplicedseq = exonseq + exonsplicedseq
                    elif item.type == 'start_codon':
                        startcodonpos = item.location.end.position - 1
                        startCodonMrnaList.append(
                            chrpostomrnapos(startcodonpos, chrom,
                                            transcriptnum, GFFlist))
                    elif item.type == 'stop_codon':
                        stopcodonpos = item.location.start.position
                        stopCodonMrnaList.append(
                            chrpostomrnapos(stopcodonpos, chrom,
                                            transcriptnum, GFFlist))

            # without both codons the UTR/CDS boundaries are undefined, so
            # report what is missing and move on
            if not startCodonMrnaList:
                print("!!! no start codon for %s" % (trsp_id))
            if not stopCodonMrnaList:
                print("!!! no stop codon for %s" % (trsp_id))
            if not startCodonMrnaList or not stopCodonMrnaList:
                continue
            startcodonmrnapos = min(startCodonMrnaList)
            stopcodonmrnapos = max(stopCodonMrnaList)

            # stopcodonmrnapos is the last nt of the stop codon (inclusive)
            cdsseq = exonsplicedseq[startcodonmrnapos:stopcodonmrnapos + 1]
            utr5seq = exonsplicedseq[:startcodonmrnapos]
            utr3seq = exonsplicedseq[stopcodonmrnapos + 1:]

            if str(cdsseq[:3].upper()) != "ATG":
                continue  # ignore non-AUG start codons
            stopcodon = str(cdsseq[-3:].upper())
            if stopcodon not in ("TGA", "TAG", "TAA"):
                continue  # ignore weird stop codons

            mRNAlen = len(exonsplicedseq)
            cdslen = len(cdsseq)
            utr5len = len(utr5seq)
            utr3len = len(utr3seq)
            # the three regions must tile the spliced mRNA exactly
            assert mRNAlen == utr3len + cdslen + utr5len

            #### Counting of uORFs ####
            uORFcounter = 0
            # NOTE(review): this flag is per transcript, so once one uORF
            # runs into the CDS every later uORF row of the same transcript
            # is also written with cdsExtension == 1 -- confirm intended.
            cdsExtension = 0
            # built once per transcript; codons that reach past the 5'UTR are
            # read from here
            utrCdsSeq = utr5seq + cdsseq

            for i in range(utr5len):
                codon = utr5seq[i:i + 3]
                if str(codon) != start_codon:
                    continue
                uORFcounter += 1

                startPosition = i
                seqIndex = i
                aminoAcid = codon.translate()
                uORFseq = [str(codon)]
                uORFaa = [str(aminoAcid)]

                # advance one codon at a time until a stop is translated
                while str(aminoAcid) != "*":
                    seqIndex += 3
                    nextCodon = utrCdsSeq[seqIndex:seqIndex + 3]
                    aminoAcid = nextCodon.translate()
                    uORFseq.append(str(nextCodon))
                    uORFaa.append(str(aminoAcid))

                    # a codon starting after utr5len - 3 is not wholly inside
                    # the 5'UTR, i.e. the uORF has reached the CDS
                    if seqIndex > utr5len - 3:
                        cdsExtension = 1
                        if seqIndex > len(utrCdsSeq):
                            # ran off the end of the CDS without a stop;
                            # could eventually extend into the 3'UTR
                            print('end of CDS for trsp %s' % trsp_id)
                            break

                uORFseqCat = "".join(uORFseq)
                uORFaaCat = "".join(uORFaa)

                uorf_rows.append([
                    trsp_id, trsp_genename, trsp_strand, uORFcounter,
                    startPosition, cdsExtension,
                    len(utr5seq),
                    len(cdsseq),
                    len(utr3seq),
                    len(uORFseqCat), uORFseqCat, uORFaaCat
                ])

            # transcripts with an empty 5'UTR get no summary row, as before
            if utr5len > 0:
                summary_rows.append([
                    trsp_id, trsp_genename, chrom, transcriptnum,
                    trsp_strand, uORFcounter, cdsExtension
                ])

    uORFdf = pd.DataFrame(uorf_rows, columns=dfCols)
    summarydf = pd.DataFrame(summary_rows, columns=summaryCols)

    uORFdf.to_csv(uORFtableOutfile)
    summarydf.to_csv(uORFsummaryOutfile)
    print(summarydf.head())
Example #10
0
# Copy the promoter-primer table to a new csv, adding the amplified product
# sequence to every 10-column row.
primerfile = "/home/pzs/primerdesign/primerdesign/tags/parallel/promoterprimers.csv"
outfile = "processedprimers.csv"

# 'with' closes both files when the block exits, so the output is flushed to
# disk even when a row fails
with open(primerfile, "r") as primer_handle, open(outfile, "w") as out_handle:
    reader = csv.reader(primer_handle)
    writer = csv.writer(out_handle)

    for row in reader:
        rowlen = len(row)
        if rowlen == 4:
            # 4-column rows must end with the "site not present!" marker
            assert(row[-1] == "site not present!")
            writer.writerow(row)
        elif rowlen == 6:
            # 6-column rows must end with the "None found!" marker
            assert(row[-1] == "None found!")
            writer.writerow(row)
        elif rowlen == 11:
            # already 11 columns: copied through unchanged
            writer.writerow(row)
        elif rowlen == 10:
            left = row[5]
            # column 6 is reverse-complemented before it is searched for in
            # the full sequence
            right = str(SeqIO.Seq(row[6]).reverse_complement())
            fullseq = row[4]
            leftindex = fullseq.index(left)
            rightindex = fullseq.index(right) + len(right)
            # product runs from the start of the left match to the end of
            # the right match
            product = fullseq[leftindex:rightindex]
            row.insert(7, product)
            writer.writerow(row)
        else:
            # same output as the two-argument Python 2 print statement
            print("unknown row type %s" % (row,))