Python SeqIO.Seq Examples

Programming Language: Python

Namespace/Package Name: Bio

Class/Type: SeqIO

Method/Function: Seq

Examples at hotexamples.com: 10

Python SeqIO.Seq - 10 examples found. These are the top-rated real-world Python examples of Bio.SeqIO.Seq, extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

write(30)

index_db(30)

SeqRecord(30)

to_dict(30)

read(30)

convert(30)

parse(30)

index(30)

Seq(10)

SeqIO(3)

parser(2)

to_alignment(2)

p(1)

covert(1)

parst(1)

append(1)

readd(1)

_force_alphabet(1)

indexed_dict(1)

Example #1

Show file

def make_tRNA_fasta_dict(tRNAdf):
    """
    Build an ordered mapping of tRNA record IDs to SeqRecord objects.

    Counterpart of make_fasta_dict for the tRNA table.  Only rows whose
    'feature' column equals 'tRNA' are used.  For each such row the
    sequence is sliced out of the module-level ``genome`` using the row's
    '#chrom', 'chromStart' and 'chromEnd' values, and reverse-complemented
    unless the strand is "+".

    Args:
        tRNAdf: table indexed via ``.index`` / ``.loc`` with the columns
            'feature', '#chrom', 'chromStart', 'chromEnd', 'strand' and
            'transcript_id'.

    Returns:
        OrderedDict mapping "tRNA_<gene_id>" to the SeqRecord built for
        that row, in row order.
    """
    records = OrderedDict()

    for row in tRNAdf.index:
        # Guard clause: everything that is not a tRNA feature is ignored.
        if tRNAdf.loc[row, 'feature'] != 'tRNA':
            continue

        chrom = tRNAdf.loc[row, '#chrom']
        start = int(tRNAdf.loc[row, 'chromStart'])
        end = int(tRNAdf.loc[row, 'chromEnd'])
        strand = tRNAdf.loc[row, 'strand']

        # gtf files are 1-based, convert the start to 0-based for slicing.
        start -= 1
        seq = SeqIO.Seq(genome[chrom][start:end])
        if strand != "+":
            # Anything other than "+" is treated as the negative strand.
            seq = seq.reverse_complement()
        fields = parse_entry(tRNAdf.loc[row, 'transcript_id'])

        record_id = "tRNA_" + fields['gene_id'][0]
        description = ("| tRNA | " + fields['gene_type'][0]
                       + " | %s; %s; %s:%s" % (chrom, strand, start, end))

        records[record_id] = SeqRecord(seq,
                                       id=record_id,
                                       name=fields['gene_name'][0],
                                       description=description)

    return records

Example #2

Show file

def make_fasta_dict(ncdf):
    """
    Build an ordered mapping of transcript IDs to SeqRecord objects.

    Only rows whose 'feature' column equals 'transcript' are used.  For
    each such row the sequence is sliced out of the module-level
    ``genome`` using the row's '#chrom', 'chromStart' and 'chromEnd'
    values, and reverse-complemented unless the strand is "+".

    Args:
        ncdf: table indexed via ``.index`` / ``.loc`` with the columns
            'feature', '#chrom', 'chromStart', 'chromEnd', 'strand' and
            'transcript_id'.

    Returns:
        OrderedDict mapping the parsed 'ID' value to the SeqRecord built
        for that row, in row order.
    """
    records = OrderedDict()

    for row in ncdf.index:
        # Guard clause: only transcript features produce a record.
        if ncdf.loc[row, 'feature'] != 'transcript':
            continue

        chrom = ncdf.loc[row, '#chrom']
        start = int(ncdf.loc[row, 'chromStart'])
        end = int(ncdf.loc[row, 'chromEnd'])
        strand = ncdf.loc[row, 'strand']

        # gtf files are 1-based, convert the start to 0-based for python.
        start -= 1
        seq = SeqIO.Seq(genome[chrom][start:end])
        if strand != "+":
            # Anything other than "+" is treated as the negative strand.
            seq = seq.reverse_complement()
        fields = parse_mod_entry(ncdf.loc[row, 'transcript_id'])

        # Assemble the annotation shown on the FASTA header line.
        record_id = fields['ID'][0]
        description = ("| " + fields['gene_type'][0]
                       + " | " + fields['gene_name'][0]
                       + " | %s; %s; %s:%s" % (chrom, strand, start, end))

        records[record_id] = SeqRecord(seq,
                                       id=record_id,
                                       name=fields['gene_name'][0],
                                       description=description)

    return records

Example #3

Show file

File: assemble.py Project: jiangchb/mixemt

def write_consensus_seqs(refseq, contrib_props, contrib_reads, args):
    """
    Call a consensus sequence per contributor and write them all as FASTA.

    The output file is "<args.cons_prefix>.fa".  One record is written for
    every entry of ``contrib_props`` (in order), followed by one record
    for the 'unassigned' reads when ``contrib_reads`` has that key.

    Args:
        refseq: The reference sequence to which the fragments were aligned.
        contrib_props: A list of lists containing for each contributor
                       - contributor ID (hap#)
                       - haplogroup
                       - proportion in mixture (not used).
        contrib_reads: A table mapping hap# IDs to lists of pysam
                       AlignedSegments + an entry of unassigned.
        args: The argument values from mixemt's argparse results.
    Returns:
        nothing
    """
    def _consensus_record(read_key, description):
        # The key into contrib_reads doubles as the FASTA record id.
        consensus = call_consensus(refseq,
                                   contrib_reads[read_key],
                                   1,
                                   args,
                                   strict=False)
        return SeqIO.SeqRecord(SeqIO.Seq(consensus),
                               id=read_key,
                               description=description)

    # The file is opened before any consensus is called, as before.
    with open("%s.fa" % (args.cons_prefix), 'w') as fa_out:
        records = []
        for contributor, haplogroup, _ in contrib_props:
            records.append(_consensus_record(contributor, haplogroup))
        if 'unassigned' in contrib_reads:
            records.append(_consensus_record('unassigned', ''))
        SeqIO.write(records, fa_out, 'fasta')

Example #4

Show file

 def parse_result(self, genome_path):
     """
     Yield one SeqRecord per gene block found in '<genome_path>.gmhmm'.

     A gene block starts at a line beginning with '>gene' and collects the
     following non-blank lines (stripped and concatenated) as its
     sequence.  The block ends at the next whitespace-only line, at the
     next '>gene' header, or at end of file, so a final record that is
     not followed by a blank line is still emitted.

     Args:
         genome_path: path prefix; '.gmhmm' is appended to find the file.

     Yields:
         SeqIO.SeqRecord whose id is '>' followed by the header line with
         all whitespace and '>' characters removed, and whose name and
         description are empty.
     """
     def _to_record(gene_id, chunks):
         return SeqIO.SeqRecord(SeqIO.Seq(''.join(chunks)),
                                id='>' + gene_id,
                                description='',
                                name='')

     result_path = genome_path + '.gmhmm'
     seq_id = None  # None means no gene block is currently open
     seq = []
     with open(result_path) as f:
         for line in f:
             if line.startswith('>gene'):
                 # >gene_2|GeneMark.hmm|57_nt|+|1|57	>NODE_3_length_713_cov_1.25228
                 if seq_id is not None:
                     # Previous block had no blank-line terminator.
                     yield _to_record(seq_id, seq)
                 seq_id = re.sub(r'[\s>]', '', line)
                 seq = []
             elif seq_id is not None:
                 if line.isspace():
                     yield _to_record(seq_id, seq)
                     seq_id = None
                 else:
                     seq.append(line.strip())
     if seq_id is not None:
         # File ended while a gene block was still open.
         yield _to_record(seq_id, seq)

Example #5

Show file

File: utr3_stop_positions.py Project: elifesciences-publications/G418_readthrough

def _inframe_stop_positions(mRNAseq, utr3_start, frame_seq, offset,
                            frame_label, trsp_id):
    """
    Locate every stop codon of one reading frame downstream of the CDS.

    ``frame_seq`` is translated and each '*' residue is converted back to
    nucleotide coordinates.  Every hit is re-read from ``mRNAseq`` and must
    be TAA, TAG or TGA; otherwise a message is printed and the program
    exits via sys.exit().

    Args:
        mRNAseq: the full spliced mRNA sequence.
        utr3_start: mRNA index of the first 3'UTR nucleotide
            (5'UTR length + CDS length).
        frame_seq: the sequence to translate for this frame.
        offset: position of ``frame_seq``'s first nucleotide relative to
            the first 3'UTR nucleotide (0, +1 or -1).
        frame_label: text used in the error message ('0', '+1', '-1').
        trsp_id: transcript id used in the error message.

    Returns:
        (utr3_positions, mrna_positions): two parallel lists holding the
        first nucleotide of each stop codon, relative to the 3'UTR start
        and to the mRNA start respectively.
    """
    utr3_positions = []
    mrna_positions = []
    for codon_index, residue in enumerate(frame_seq.translate()):
        if residue != '*':
            continue
        utr3_pos = (codon_index * 3) + offset
        mrna_pos = utr3_start + utr3_pos
        ### check mRNA position to make sure stop codons are all valid
        sc = str(mRNAseq[mrna_pos:mrna_pos + 3].upper())
        if sc not in ("TAA", "TAG", "TGA"):
            print("stop codon in frame %s for %s is non correct!"
                  % (frame_label, trsp_id))
            print("stopcodon is: %s" % sc)
            sys.exit()
        utr3_positions.append(utr3_pos)
        mrna_positions.append(mrna_pos)
    return utr3_positions, mrna_positions


def build_utr3_stop_positions(GFFlist):
    """
    Collect the positions of 3'UTR stop codons in all three reading frames.

    For every transcript on a chromosome listed in the module-level
    ``validChrs`` the exons are spliced together (reverse-complemented for
    strand -1), the CDS is delimited by the annotated start and stop
    codons, and the 3'UTR is scanned for stop codons in frame 0, +1 and -1
    relative to the CDS.

    Transcripts are skipped when they lack a start_codon or stop_codon
    sub-feature, when the CDS does not begin with ATG, or when it does not
    end with TGA, TAG or TAA.  Summary counts are printed at the end.

    Args:
        GFFlist: mapping of chromosome name to a record whose ``features``
            are the transcripts, each carrying ``sub_features``.

    Returns:
        (ucscIDlist, transcriptdict): the list of kept transcript ids, and
        a dict mapping each id to
        [trsp_id, gene_name,
         frame 0 stop positions (3'UTR coords), frame 0 (mRNA coords),
         frame +1 stop positions (3'UTR coords), frame +1 (mRNA coords),
         frame -1 stop positions (3'UTR coords), frame -1 (mRNA coords)].
    """
    transcriptdict = {}
    ucscIDlist = []
    total_transcripts = 0
    nonvalidchorms = 0
    nonATGstart = 0
    wrongstopcodon = 0
    missingcodon = 0
    validchroms = 0
    excluded_chroms = []
    included_chroms = []

    for chrom in GFFlist:
        if chrom not in validChrs:
            excluded_chroms.append(chrom)
            nonvalidchorms += 1
            continue  # check that only valid choromosomes are used
        validchroms += 1
        included_chroms.append(chrom)

        # transcriptnum is the index into .features that chrpostomrnapos expects
        for transcriptnum, transcript in enumerate(GFFlist[chrom].features):
            trsp_id = transcript.id
            trsp_strand = transcript.strand
            trsp_genename = transcript.qualifiers['Name'][0]
            trsp_chromstart = int(
                transcript.location.start.position)  # 0-based
            trsp_chromend = int(transcript.location.end.position)

            exonsplicedseq = SeqIO.Seq('')
            transcriptseq = SeqIO.Seq(
                genome[chrom][trsp_chromstart:trsp_chromend])
            # Computed once per transcript instead of once per sub-feature.
            transcriptseq_rev = None
            if trsp_strand == -1:
                transcriptseq_rev = transcriptseq.reverse_complement()

            startCodonMrnaList = []
            stopCodonMrnaList = []

            for item in transcript.sub_features:
                if trsp_strand == 1:
                    if item.type == 'exon':  # For yeast, use 'CDS'
                        exonstart = int(item.location.start.position)
                        exonend = int(item.location.end.position)
                        # end is exclusive, so the slice covers the exon
                        exonsplicedseq += transcriptseq[
                            exonstart - trsp_chromstart:
                            exonend - trsp_chromstart]
                    elif item.type == 'start_codon':
                        startcodonpos = item.location.start.position
                        startCodonMrnaList.append(
                            chrpostomrnapos(startcodonpos, chrom,
                                            transcriptnum, GFFlist))
                    elif item.type == 'stop_codon':
                        # last nucleotide of the stop codon, 0-based
                        stopcodonpos = item.location.end.position - 1
                        stopCodonMrnaList.append(
                            chrpostomrnapos(stopcodonpos, chrom,
                                            transcriptnum, GFFlist))

                elif trsp_strand == -1:
                    if item.type == 'exon':  # For yeast, use 'CDS'
                        exonstart = int(item.location.start.position)
                        exonend = int(item.location.end.position)
                        # Mirror the exon onto the reverse-complemented
                        # transcript; exons are prepended to keep 5'->3'.
                        exonseq = transcriptseq_rev[
                            trsp_chromend - exonend:
                            trsp_chromend - exonstart]
                        exonsplicedseq = exonseq + exonsplicedseq
                    elif item.type == 'start_codon':
                        # Need to -1 to be 0-based.
                        startcodonpos = item.location.end.position - 1
                        startCodonMrnaList.append(
                            chrpostomrnapos(startcodonpos, chrom,
                                            transcriptnum, GFFlist))
                    elif item.type == 'stop_codon':
                        # start.position is 0-based already.
                        stopcodonpos = item.location.start.position
                        stopCodonMrnaList.append(
                            chrpostomrnapos(stopcodonpos, chrom,
                                            transcriptnum, GFFlist))

            ### choose start and stop codons
            codon_missing = False
            if startCodonMrnaList:
                startcodonmrnapos = min(startCodonMrnaList)
            else:
                print("!!! no start codon for %s" % (trsp_id,))
                codon_missing = True
            if stopCodonMrnaList:
                stopcodonmrnapos = max(stopCodonMrnaList)
            else:
                print("!!! no stop codon for %s" % (trsp_id,))
                codon_missing = True
            if codon_missing:
                # Without both codons the CDS cannot be delimited; going on
                # would reuse the previous transcript's positions (or fail
                # with NameError on the first transcript).
                missingcodon += 1
                continue

            mRNAseq = exonsplicedseq
            # take from startcodonmrnapos to stopcodonmrnapos, inclusive
            cdsseq = exonsplicedseq[startcodonmrnapos:stopcodonmrnapos + 1]
            utr5seq = exonsplicedseq[:startcodonmrnapos]
            utr3seq = exonsplicedseq[stopcodonmrnapos + 1:]

            if str(cdsseq[:3].upper()) != "ATG":
                nonATGstart += 1
                continue  # ignore non-AUG start codons

            ### stopcodon is included in cdsseq, represnted by the last 3nt's
            stopcodon = str(cdsseq[-3:].upper())
            if stopcodon not in ("TGA", "TAG", "TAA"):
                wrongstopcodon += 1
                continue  # ignore weird stop codons

            mRNAlen = len(exonsplicedseq)
            cdslen = len(cdsseq)
            utr5len = len(utr5seq)
            utr3len = len(utr3seq)
            # check that sum of features equals mRNA length
            assert mRNAlen == utr3len + cdslen + utr5len

            ###### Finding inframe stop codons ######
            utr3_start = utr5len + cdslen

            # Frame 0: codons counted from the first 3'UTR nucleotide.
            frameZeroStopPositions, frameZeroStopPositionsMRNA = \
                _inframe_stop_positions(mRNAseq, utr3_start, utr3seq,
                                        0, '0', trsp_id)

            # Frame +1: start one nucleotide into 3'UTR.
            framePlusOneStopPositions, framePlusOneStopPositionsMRNA = \
                _inframe_stop_positions(mRNAseq, utr3_start, utr3seq[1:],
                                        1, '+1', trsp_id)

            # Frame -1: include last nucleotide of cds.
            frameMinusOneStopPositions, frameMinusOneStopPositionsMRNA = \
                _inframe_stop_positions(mRNAseq, utr3_start,
                                        cdsseq[-1] + utr3seq,
                                        -1, '-1', trsp_id)

            trsp_attr_list = [
                trsp_id, trsp_genename, frameZeroStopPositions,
                frameZeroStopPositionsMRNA, framePlusOneStopPositions,
                framePlusOneStopPositionsMRNA, frameMinusOneStopPositions,
                frameMinusOneStopPositionsMRNA
            ]

            ucscIDlist.append(trsp_attr_list[0])
            transcriptdict[trsp_id] = trsp_attr_list
            total_transcripts += 1

    # Single-argument print(...) gives the same output on Python 2 and 3.
    print("total number of transcripts in data table: %s" % total_transcripts)
    print("Number of included chromosomes chr: %s" % validchroms)
    print("Number of excluded chromosomes chr: %s" % nonvalidchorms)
    print("included chroms:  %s" % (included_chroms,))
    print("excluded chroms:  %s" % (excluded_chroms,))
    print("transcripts discarded due to missing start or stop codon %s"
          % missingcodon)
    print("transcripts discarded due to non-AUG start codon %s" % nonATGstart)
    print("transcripts discarded due to noncanonical stop codon %s"
          % wrongstopcodon)
    return ucscIDlist, transcriptdict

Example #6

Show file

    def builddense(self):
        """
        Build per-transcript spliced read-count lists and write them out.

        For every transcript on a chromosome in the hard-coded valid list,
        unspliced counts are obtained from self.getbam_5or3counts, the
        exon portions are spliced together (in 5'->3' order for either
        strand), and the transcript is kept when the counts inside its CDS
        sum to at least self.threshold.  Transcripts without an annotated
        start codon use mRNA position 0; those without a stop codon use
        len(mRNA) - 3.

        The kept counts are passed to self.norm_m together with either the
        locally tallied total (when self.totreads == '-1') or
        self.totreads, then written to '<self.outputdata>.csv.gz' as a
        single 'density' column.  A parameter/read-statistics report is
        written to '<self.outputdata>output.txt' and echoed to stdout.

        Returns:
            None
        """
        transcriptdict = {}
        mappedlocalreads = 0
        dumppedreads = 0
        illegalreads = 0
        tooshortlongreads = 0
        wrongstrandreads = 0
        # Tallied for debugging only; not currently reported anywhere.
        noStartCodon = 0
        noStopCodon = 0
        totalreads = 0  # not totreads
        GFFlist = self.makeGFFlist(self.GTFgen)

        validChrs = [
            'chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8',
            'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',
            'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22',
            'chrX', 'chrY', 'chrM', 'chrSinV', 'chrLUC'
        ]

        for chrom in GFFlist:
            if chrom not in validChrs:
                continue
            # transcriptnum is the index into .features used by the helpers
            for transcriptnum, transcript in enumerate(
                    GFFlist[chrom].features):
                trsp_id = transcript.id
                trsp_strand = transcript.strand
                trsp_chromstart = int(
                    transcript.location.start.position)  # 0-based
                trsp_chromend = int(transcript.location.end.position)
                # a list for transcript (pre-mRNA), not CDS
                transcriptlist = [0.0] * abs(trsp_chromend - trsp_chromstart)

                # gb[0]: riboshifted list (0-based) of unspliced counts;
                # gb[1:7]: read tallies accumulated below.
                gb = self.getbam_5or3counts(
                    self.bamfile, transcriptlist, chrom, transcriptnum,
                    trsp_chromstart, trsp_chromend, trsp_id, trsp_strand,
                    self.riboshiftdict, self.assignment, self.bamfileout)

                mappedlocalreads += gb[1]
                dumppedreads += gb[2]
                illegalreads += gb[3]
                tooshortlongreads += gb[4]
                wrongstrandreads += gb[5]
                totalreads += gb[6]

                exonsplicedseq = SeqIO.Seq('')
                exonsplicedcounts = []
                transcriptseq = SeqIO.Seq(
                    genome[chrom][trsp_chromstart:trsp_chromend])
                # Computed once per transcript instead of once per
                # sub-feature.
                transcriptseq_rev = None
                if trsp_strand == -1:
                    transcriptseq_rev = transcriptseq.reverse_complement()

                startCodonMrnaList = []
                stopCodonMrnaList = []

                for item in transcript.sub_features:
                    if trsp_strand == 1:
                        if item.type == 'exon':  # For yeast, use 'CDS'
                            exonstart = int(item.location.start.position)
                            exonend = int(item.location.end.position)
                            exonstart_feat = exonstart - trsp_chromstart
                            # exclusive end, fine for slicing
                            exonend_feat = exonend - trsp_chromstart
                            exonsplicedcounts += gb[0][
                                exonstart_feat:exonend_feat]
                            exonsplicedseq += transcriptseq[
                                exonstart_feat:exonend_feat]
                        elif item.type == 'start_codon':
                            startcodonpos = item.location.start.position
                            startCodonMrnaList.append(
                                self.chrpostomrnapos(
                                    startcodonpos, chrom, transcriptnum,
                                    GFFlist))  # spliced mRNA position
                        elif item.type == 'stop_codon':
                            # last nucleotide of the stop codon, 0-based
                            stopcodonpos = item.location.end.position - 1
                            stopCodonMrnaList.append(
                                self.chrpostomrnapos(stopcodonpos, chrom,
                                                     transcriptnum, GFFlist))

                    elif trsp_strand == -1:
                        if item.type == 'exon':  # For yeast, use 'CDS'
                            exonstart = int(item.location.start.position)
                            exonend = int(item.location.end.position)
                            exonstart_feat = (trsp_chromend - 1) - (
                                exonend - 1)  # 0-based
                            exonend_feat = (trsp_chromend -
                                            1) - exonstart  # 0-based
                            # both 0-based, need to +1 for length
                            exoncounts = gb[0][
                                exonstart_feat:exonend_feat + 1]
                            # prepended, so the counts are not flipped again
                            exonsplicedcounts = exoncounts + exonsplicedcounts
                            exonseq = transcriptseq_rev[
                                exonstart_feat:exonend_feat + 1]
                            exonsplicedseq = exonseq + exonsplicedseq
                        elif item.type == 'start_codon':
                            # Need to -1 to be 0-based.
                            startcodonpos = item.location.end.position - 1
                            startCodonMrnaList.append(
                                self.chrpostomrnapos(startcodonpos, chrom,
                                                     transcriptnum, GFFlist))
                        elif item.type == 'stop_codon':
                            # start.position is 0-based already.
                            stopcodonpos = item.location.start.position
                            stopCodonMrnaList.append(
                                self.chrpostomrnapos(stopcodonpos, chrom,
                                                     transcriptnum, GFFlist))

                if startCodonMrnaList:
                    startcodonmrnapos = min(startCodonMrnaList)
                else:
                    noStartCodon += 1
                    startcodonmrnapos = 0  # transcripts without start codon
                if stopCodonMrnaList:
                    stopcodonmrnapos = max(stopCodonMrnaList)
                else:
                    noStopCodon += 1
                    # leave 3nt's in "3'UTR"
                    stopcodonmrnapos = len(exonsplicedseq) - 3

                # take from startcodonmrnapos to stopcodonmrnapos, inclusive
                cdscounts = exonsplicedcounts[
                    startcodonmrnapos:stopcodonmrnapos + 1]

                # thresholding minimal reads per CDS.
                if sum(cdscounts) >= float(self.threshold):
                    transcriptdict[trsp_id] = exonsplicedcounts

        # Single-argument print(...) gives the same output on Python 2 and 3.
        if self.totreads == '-1':
            print(str(totalreads) +
                  " total mapped reads used for normalization.")
            # Normalized by the locally tallied total.
            self.norm_m(transcriptdict, totalreads)
        else:
            print(str(self.totreads) +
                  " total mapped reads from STAR alignment used for normalization.")
            self.norm_m(transcriptdict, self.totreads)

        ### assemble dataframe here
        # Each value is wrapped in a list so the whole count list lands in
        # one 'density' cell per transcript.
        outdict = OrderedDict()
        for key, val in transcriptdict.items():
            outdict[key] = [val]

        df = pd.DataFrame.from_dict(outdict, orient='index')
        df.columns = ['density']
        print(df.head())
        df.to_csv('%s.csv.gz' % (self.outputdata), compression='gzip')

        # Write output file of comments; 'with' closes it even on error.
        with open(self.outputdata + "output.txt", "w") as fc:
            fc.write("Density was built with parameters:\n")
            fc.write("riboshiftdict=" + str(self.riboshiftdict) + "\n")
            fc.write("threshold=" + str(self.threshold) + "\n")
            fc.write("assignment=" + str(self.assignment) + "\n")
            fc.write("reads mapped to known canonical coding transcripts: " +
                     str(mappedlocalreads) + "\n")
            fc.write("reads are dumpped, due to weird cigar codes: " +
                     str(dumppedreads) + "\n")
            fc.write(
                "reads are illegal, mapped outside of annotated transcripts: " +
                str(illegalreads) + "\n")
            fc.write("reads are too short/long: " + str(tooshortlongreads) +
                     "\n")
            fc.write("reads are on the wrong strand: " +
                     str(wrongstrandreads) + "\n")
            fc.write("total mapped reads from aligner: " + str(totalreads))

        print(str(mappedlocalreads) +
              " reads within length limitation mapped to known canonical coding transcripts. ")
        print(str(dumppedreads) +
              " reads are dumpped, due to weird cigar codes.")
        print(str(illegalreads) +
              " reads are illegal, mapped outside of annotated transcripts.")
        print(str(tooshortlongreads) + " reads are too short/long.")
        print(str(wrongstrandreads) + " reads are on the wrong strand.")
        print(str(totalreads) + " total mapped reads from aligner. ")

Example #7

Show file

File: makeUTRtable.py Project: elifesciences-publications/G418_readthrough

def build_utr_table(GFFlist, inculde_noncanon_start, include_noncanon_stop):
    """
	This is a function to get the cds and utr sizes for an mRNA from a GFF file
	returns a list with: #transcript,chrom,featnum,strand,mrna_len,cds_len,5utr_len,3utr_len,gene_name
	Includes most of the functions from densebuilder_main but does not return counts
	"""
    # GFFlist = GFFinput

    transcriptdict = {}
    ucscIDlist = []
    total_transcripts = 0
    nonvalidchorms = 0
    nonATGstart = 0
    wrongstopcodon = 0
    validchroms = 0
    excluded_chroms = []
    included_chroms = []
    for chrom in GFFlist:
        if not chrom in validChrs:
            excluded_chroms.append(chrom)
            nonvalidchorms += 1
            # print chrom
            continue  # check that only valid choromosomes are used
        validchroms += 1
        included_chroms.append(chrom)
        transcriptnum = -1  # set to negative one so first transcript is == to 0
        for transcript in GFFlist[
                chrom].features:  # this is where the SeqFeatures are actually stored
            tr_attribute_list = []
            transcriptnum += 1
            trsp_id = transcript.id  # it is a number
            trsp_strand = transcript.strand
            ### changing this to be compatible with new hg38 annotation
            # print transcript.qualifiers ### these are all of the fields parsed by the GTF parser from column 8, output is a dictionary {'key':['item1', 'item2', 'ect']}
            trsp_genename = transcript.qualifiers['Name'][0]
            trsp_chromstart = int(
                transcript.location.start.position)  # 0-based
            trsp_chromend = int(transcript.location.end.position)
            transcriptlist = [
                0.0 for x in range(abs(trsp_chromend - trsp_chromstart))
            ]  # a list for transcript (pre-mRNA), not CDS

            exonsplicedseq = SeqIO.Seq('')
            transcriptseq = SeqIO.Seq(
                genome[chrom][trsp_chromstart:trsp_chromend])

            ### use lists to handle transcripts with multiple start and stop codons
            startCodonMrnaList = []
            stopCodonMrnaList = []

            for item in GFFlist[chrom].features[transcriptnum].sub_features:

                if trsp_strand == 1:

                    ### dealing with transcripts having multiple start or stop codon entries, if spaning splice junctions

                    if item.type == 'exon':  # or item.type== 'CDS':	# For yeast, use 'CDS'
                        exonstart = int(
                            item.location.start.position)  # 0-based position
                        exonend = int(
                            item.location.end.position)  # not 0-based
                        exonstart_feat = exonstart - trsp_chromstart
                        exonend_feat = exonend - trsp_chromstart  # Not 0-based, it is fine for length....next line.
                        exonsplicedseq += transcriptseq[
                            exonstart_feat:
                            exonend_feat]  # takes from exonstart to exonend-1
                    if item.type == 'start_codon':
                        startcodonpos = item.location.start.position  # 0-based position
                        # startcodonmrnapos=  chrpostomrnapos(startcodonpos,chrom,transcriptnum,GFFlist)	# spliced mRNA position
                        startCodonMrnaList.append(
                            chrpostomrnapos(startcodonpos, chrom,
                                            transcriptnum,
                                            GFFlist))  # spliced mRNA position
                        # print startcodonmrnapos
                    if item.type == 'stop_codon':
                        stopcodonpos = item.location.end.position - 1  # 0-based position
                        # stopcodonmrnapos= chrpostomrnapos(stopcodonpos,chrom,transcriptnum,GFFlist)
                        stopCodonMrnaList.append(
                            chrpostomrnapos(stopcodonpos, chrom, transcriptnum,
                                            GFFlist))
                        # print stopcodonmrnapos

                if trsp_strand == -1:
                    # print 'neg_strand'
                    # reverse_complement() # this comes from seqIO
                    transcriptseq_rev = transcriptseq.reverse_complement()

                    if item.type == 'exon':  # or item.type== 'CDS':	# For yeast, use 'CDS'
                        exonstart = int(
                            item.location.start.position)  # 0-based position
                        exonend = int(
                            item.location.end.position)  # not 0-based
                        exonstart_feat = (trsp_chromend - 1) - (exonend - 1
                                                                )  # 0-based
                        exonend_feat = (trsp_chromend -
                                        1) - exonstart  # 0-based
                        exonseq = transcriptseq_rev[
                            exonstart_feat:exonend_feat + 1]
                        exonsplicedseq = exonseq + exonsplicedseq
                    if item.type == 'start_codon':
                        startcodonpos = item.location.end.position - 1  # Need to -1 to be 0-based.
                        # print startcodonpos
                        # startcodonmrnapos= chrpostomrnapos(startcodonpos,chrom,transcriptnum,GFFlist)
                        startCodonMrnaList.append(
                            chrpostomrnapos(startcodonpos, chrom,
                                            transcriptnum, GFFlist))
                        # print "start codon: ", startcodonmrnapos
                    if item.type == 'stop_codon':
                        stopcodonpos = item.location.start.position  # start.position is 0-based already.
                        # print stopcodonpos
                        # stopcodonmrnapos= chrpostomrnapos(stopcodonpos,chrom,transcriptnum,GFFlist)
                        stopCodonMrnaList.append(
                            chrpostomrnapos(stopcodonpos, chrom, transcriptnum,
                                            GFFlist))
                        # print "stop codon: ", stopcodonmrnapos

            if len(startCodonMrnaList) > 0:
                # print "MORE THAN 1 START", startCodonMrnaList
                startcodonmrnapos = min(startCodonMrnaList)
            else:
                print "!!! no start codon for %s" % (trsp_id)
                startcodonmrnapos = 0  ### adding for transcripts without start codon
            # if len(stopCodonMrnaList)
            if len(stopCodonMrnaList) > 0:
                stopcodonmrnapos = max(stopCodonMrnaList)
            else:
                print "!!! no stop codon for %s" % (trsp_id)
                stopcodonmrnapos = len(
                    exonsplicedseq) - 3  ### leave 3nt's in "3'UTR"

            cdsseq = exonsplicedseq[
                startcodonmrnapos:stopcodonmrnapos +
                1]  # take from startcodonmrnapos to stopcodonmrnapos
            utr5seq = exonsplicedseq[:startcodonmrnapos]
            utr3seq = exonsplicedseq[stopcodonmrnapos + 1:]

            # print trsp_id
            # # print transcript.qualifiers['transcript_name']
            # print trsp_strand
            # print utr5seq
            # print " - - - "
            # print cdsseq
            # print " - - - "
            # print utr3seq
            # # print utr5seq+cdsseq+utr3seq
            # print ""
            # # print transcriptseq

            if inculde_noncanon_start == False:
                if str(cdsseq[:3].upper()) != "ATG":
                    nonATGstart += 1
                    print "non canon start"
                    print trsp_id
                    print cdsseq
                    print ""
                    continue  # ignore non-AUG start codons

            stopcodon = str(cdsseq[-3:].upper())
            if len(utr3seq) > 0:
                stop4nt = stopcodon + str(utr3seq[0].upper())
            elif len(utr3seq) == 0:
                stop4nt = '0'
            else:
                print "there is a 3'UTR with negative length..."
                sys.exit()

            if include_noncanon_stop == False:
                if stopcodon != "TGA" and stopcodon != "TAG" and stopcodon != "TAA":
                    wrongstopcodon += 1
                    print "wrong stop!"
                    print trsp_id
                    print cdsseq
                    print ""
                    continue  # ignore weird stop codons

            # build itmes in transcript attribute list
            mRNAlen = len(exonsplicedseq)
            cdslen = len(cdsseq)
            utr5len = len(utr5seq)
            utr3len = len(utr3seq)
            assert mRNAlen == utr3len + cdslen + utr5len  # check that sum of features equals mRNA length

            trsp_attr_list = [
                trsp_id, chrom, transcriptnum, trsp_strand, mRNAlen, cdslen,
                utr5len, utr3len, trsp_genename, stopcodon, stop4nt
            ]
            ucscIDlist.append(trsp_attr_list[0])
            transcriptdict[trsp_id] = trsp_attr_list
            total_transcripts += 1
            #transcript,chrom,featnum,strand,mrna_len,cds_len,5utr_len,3utr_len,gene_name,stopcodon,stop4nt
    print "total number of transcripts in data table: %s" % total_transcripts
    print "Number of included chromosomes chr: %s" % validchroms
    print "Number of excluded chromosomes chr: %s" % nonvalidchorms
    print "included chroms: ", included_chroms
    print "excluded chroms: ", excluded_chroms
    print "transcripts discarded due to non-AUG start codon %s" % nonATGstart
    print "transcripts discarded due to noncanonical stop codon %s" % wrongstopcodon
    return ucscIDlist, transcriptdict

Example #8

Show file

def get_Prot_sequence(GFFlist):

    transcriptdict = {}
    ucscIDlist = []
    total_transcripts = 0
    nonvalidchorms = 0
    nonATGstart = 0
    wrongstopcodon = 0
    validchroms = 0
    excluded_chroms = []
    included_chroms = []
    for chrom in GFFlist:
        if not chrom in validChrs:
            excluded_chroms.append(chrom)
            nonvalidchorms += 1
            # print chrom
            continue  # check that only valid choromosomes are used
        validchroms += 1
        included_chroms.append(chrom)
        transcriptnum = -1  # set to negative one so first transcript is == to 0
        for transcript in GFFlist[
                chrom].features:  # this is where the SeqFeatures are actually stored
            tr_attribute_list = []
            transcriptnum += 1
            trsp_id = transcript.id  # it is a number
            trsp_strand = transcript.strand
            trsp_genename = transcript.qualifiers['Name'][0]
            trsp_chromstart = int(
                transcript.location.start.position)  # 0-based
            trsp_chromend = int(transcript.location.end.position)
            transcriptlist = [
                0.0 for x in range(abs(trsp_chromend - trsp_chromstart))
            ]  # a list for transcript (pre-mRNA), not CDS

            exonsplicedseq = SeqIO.Seq('')
            transcriptseq = SeqIO.Seq(
                genome[chrom][trsp_chromstart:trsp_chromend])

            ### use lists to handle transcripts with multiple start and stop codons
            startCodonMrnaList = []
            stopCodonMrnaList = []

            for item in GFFlist[chrom].features[transcriptnum].sub_features:
                if trsp_strand == 1:
                    if item.type == 'exon':  # or item.type== 'CDS':  # For yeast, use 'CDS'
                        exonstart = int(
                            item.location.start.position)  # 0-based position
                        exonend = int(
                            item.location.end.position)  # not 0-based
                        exonstart_feat = exonstart - trsp_chromstart
                        exonend_feat = exonend - trsp_chromstart  # Not 0-based, it is fine for length....next line.
                        exonsplicedseq += transcriptseq[
                            exonstart_feat:
                            exonend_feat]  # takes from exonstart to exonend-1
                    if item.type == 'start_codon':
                        startcodonpos = item.location.start.position  # 0-based position
                        # startcodonmrnapos=  chrpostomrnapos(startcodonpos,chrom,transcriptnum,GFFlist)  # spliced mRNA position
                        startCodonMrnaList.append(
                            chrpostomrnapos(startcodonpos, chrom,
                                            transcriptnum,
                                            GFFlist))  # spliced mRNA position
                    if item.type == 'stop_codon':
                        stopcodonpos = item.location.end.position - 1  # 0-based position
                        # stopcodonmrnapos= chrpostomrnapos(stopcodonpos,chrom,transcriptnum,GFFlist)
                        stopCodonMrnaList.append(
                            chrpostomrnapos(stopcodonpos, chrom, transcriptnum,
                                            GFFlist))

                if trsp_strand == -1:
                    # reverse_complement() # this comes from seqIO
                    transcriptseq_rev = transcriptseq.reverse_complement()
                    if item.type == 'exon':  # or item.type== 'CDS':  # For yeast, use 'CDS'
                        exonstart = int(
                            item.location.start.position)  # 0-based position
                        exonend = int(
                            item.location.end.position)  # not 0-based
                        exonstart_feat = (trsp_chromend - 1) - (exonend - 1
                                                                )  # 0-based
                        exonend_feat = (trsp_chromend -
                                        1) - exonstart  # 0-based
                        exonseq = transcriptseq_rev[
                            exonstart_feat:exonend_feat + 1]
                        exonsplicedseq = exonseq + exonsplicedseq
                    if item.type == 'start_codon':
                        startcodonpos = item.location.end.position - 1  # Need to -1 to be 0-based.
                        # startcodonmrnapos= chrpostomrnapos(startcodonpos,chrom,transcriptnum,GFFlist)
                        startCodonMrnaList.append(
                            chrpostomrnapos(startcodonpos, chrom,
                                            transcriptnum, GFFlist))
                    if item.type == 'stop_codon':
                        stopcodonpos = item.location.start.position  # start.position is 0-based already.
                        # stopcodonmrnapos= chrpostomrnapos(stopcodonpos,chrom,transcriptnum,GFFlist)
                        stopCodonMrnaList.append(
                            chrpostomrnapos(stopcodonpos, chrom, transcriptnum,
                                            GFFlist))

            ### choose start and stop codons
            if len(startCodonMrnaList) > 0:
                # print "MORE THAN 1 START", startCodonMrnaList
                startcodonmrnapos = min(startCodonMrnaList)
            else:
                print "!!! no start codon for %s" % (trsp_id)
            # if len(stopCodonMrnaList)
            if len(stopCodonMrnaList) > 0:
                stopcodonmrnapos = max(stopCodonMrnaList)
            else:
                print "!!! no stop codon for %s" % (trsp_id)

            mRNAseq = exonsplicedseq
            cdsseq = exonsplicedseq[
                startcodonmrnapos:stopcodonmrnapos +
                1]  # take from startcodonmrnapos to stopcodonmrnapos
            utr5seq = exonsplicedseq[:startcodonmrnapos]
            utr3seq = exonsplicedseq[stopcodonmrnapos + 1:]

            cdsProt = cdsseq.translate()

            # outseq = utr5seq.lower()+cdsseq.upper()+utr3seq.lower()

            if str(cdsseq[:3].upper()) != "ATG":
                nonATGstart += 1
                continue  # ignore non-AUG start codons

            ### stopcodon is included in cdsseq, represnted by the last 3nt's
            stopcodon = str(cdsseq[-3:].upper())

            if stopcodon != "TGA" and stopcodon != "TAG" and stopcodon != "TAA":
                wrongstopcodon += 1
                continue  # ignore weird stop codons

            # build itmes in transcript attribute list
            mRNAlen = len(exonsplicedseq)
            cdslen = len(cdsseq)
            utr5len = len(utr5seq)
            utr3len = len(utr3seq)
            assert mRNAlen == utr3len + cdslen + utr5len  # check that sum of features equals mRNA length

            trsp_attr_list = [trsp_id, trsp_genename, cdsProt]
            ucscIDlist.append(trsp_attr_list[0])
            transcriptdict[trsp_id] = trsp_attr_list
            total_transcripts += 1
    print "total number of transcripts in data table: %s" % total_transcripts
    print "Number of included chromosomes chr: %s" % validchroms
    print "Number of excluded chromosomes chr: %s" % nonvalidchorms
    print "included chroms: ", included_chroms
    print "excluded chroms: ", excluded_chroms
    print "transcripts discarded due to non-AUG start codon %s" % nonATGstart
    print "transcripts discarded due to noncanonical stop codon %s" % wrongstopcodon
    return ucscIDlist, transcriptdict

Example #9

Show file

File: uORF_finder.py Project: elifesciences-publications/G418_readthrough

def find_uORFs(GFFlist):
    """
	using the same basic structure as denesbuilder_main, this function identifies all uORFs and write csv files
	"""
    ### define start codon
    ## could possibly change this to look at non canonical start codons
    startCodon = Seq('ATG')

    ### build empty data frames, rows will be appended as function iterates over transcripts
    dfCols = [
        'trxname', 'symbol', 'strand', 'uORFCounter', 'startPosition',
        'cdsExtension', 'utr5len', 'cdslen', 'utr3len', 'uORFlen', 'uORFseq',
        'uORFaa'
    ]
    uORFdf = pd.DataFrame(columns=dfCols)

    summaryCols = [
        'trxname', 'symbol', 'chr', 'tr_number', 'strand', 'uORFCounter',
        'cdsExtension'
    ]
    summarydf = pd.DataFrame(columns=summaryCols)

    ####

    total_transcripts = 0
    nonvalidchorms = 0
    nonATGstart = 0
    wrongstopcodon = 0
    validchroms = 0
    excluded_chroms = []
    included_chroms = []
    for chrom in GFFlist:
        if not chrom in validChrs:
            excluded_chroms.append(chrom)
            nonvalidchorms += 1
            #         print chrom
            continue  # check that only valid choromosomes are used
        validchroms += 1
        included_chroms.append(chrom)
        transcriptnum = -1  # set to negative one so first transcript is == to 0
        # print chrom
        for transcript in GFFlist[
                chrom].features:  # this is where the SeqFeatures are actually stored
            #         print transcript
            tr_attribute_list = []
            transcriptnum += 1
            trsp_id = transcript.id  # it is a number
            trsp_strand = transcript.strand
            trsp_genename = transcript.qualifiers['Name'][0]
            trsp_chromstart = int(
                transcript.location.start.position)  # 0-based
            trsp_chromend = int(transcript.location.end.position)
            transcriptlist = [
                0.0 for x in range(abs(trsp_chromend - trsp_chromstart))
            ]  # a list for transcript (pre-mRNA), not CDS

            exonsplicedseq = SeqIO.Seq('')
            transcriptseq = SeqIO.Seq(
                genome[chrom][trsp_chromstart:trsp_chromend])

            ### handling transcripts with no start or stop codon:
            # startcodonmrnapos = 'absent'
            # stopcodonmrnapos = 'absent'

            startCodonMrnaList = []
            stopCodonMrnaList = []

            for item in GFFlist[chrom].features[transcriptnum].sub_features:
                if trsp_strand == 1:
                    if item.type == 'exon':  # or item.type== 'CDS':  # For yeast, use 'CDS'
                        exonstart = int(
                            item.location.start.position)  # 0-based position
                        exonend = int(
                            item.location.end.position)  # not 0-based
                        exonstart_feat = exonstart - trsp_chromstart
                        exonend_feat = exonend - trsp_chromstart  # Not 0-based, it is fine for length....next line.
                        exonsplicedseq += transcriptseq[
                            exonstart_feat:
                            exonend_feat]  # takes from exonstart to exonend-1
                    if item.type == 'start_codon':
                        startcodonpos = item.location.start.position  # 0-based position
                        # startcodonmrnapos=  chrpostomrnapos(startcodonpos,chrom,transcriptnum,GFFlist)  # spliced mRNA position
                        startCodonMrnaList.append(
                            chrpostomrnapos(startcodonpos, chrom,
                                            transcriptnum, GFFlist))
                    if item.type == 'stop_codon':
                        stopcodonpos = item.location.end.position - 1  # 0-based position
                        # stopcodonmrnapos= chrpostomrnapos(stopcodonpos,chrom,transcriptnum,GFFlist)
                        stopCodonMrnaList.append(
                            chrpostomrnapos(stopcodonpos, chrom, transcriptnum,
                                            GFFlist))

                if trsp_strand == -1:
                    # reverse_complement() # this comes from seqIO
                    transcriptseq_rev = transcriptseq.reverse_complement()
                    if item.type == 'exon':  # or item.type== 'CDS':  # For yeast, use 'CDS'
                        exonstart = int(
                            item.location.start.position)  # 0-based position
                        exonend = int(
                            item.location.end.position)  # not 0-based
                        exonstart_feat = (trsp_chromend - 1) - (exonend - 1
                                                                )  # 0-based
                        exonend_feat = (trsp_chromend -
                                        1) - exonstart  # 0-based
                        exonseq = transcriptseq_rev[
                            exonstart_feat:exonend_feat + 1]
                        exonsplicedseq = exonseq + exonsplicedseq
                    if item.type == 'start_codon':
                        startcodonpos = item.location.end.position - 1  # Need to -1 to be 0-based.
                        # startcodonmrnapos= chrpostomrnapos(startcodonpos,chrom,transcriptnum,GFFlist)
                        startCodonMrnaList.append(
                            chrpostomrnapos(startcodonpos, chrom,
                                            transcriptnum, GFFlist))
                    if item.type == 'stop_codon':
                        stopcodonpos = item.location.start.position  # start.position is 0-based already.
                        # stopcodonmrnapos= chrpostomrnapos(stopcodonpos,chrom,transcriptnum,GFFlist)
                        stopCodonMrnaList.append(
                            chrpostomrnapos(stopcodonpos, chrom, transcriptnum,
                                            GFFlist))

            if len(startCodonMrnaList) > 0:
                # print "MORE THAN 1 START", startCodonMrnaList
                startcodonmrnapos = min(startCodonMrnaList)
            else:
                print "!!! no start codon for %s" % (trsp_id)
            # if len(stopCodonMrnaList)
            if len(stopCodonMrnaList) > 0:
                stopcodonmrnapos = max(stopCodonMrnaList)
            else:
                print "!!! no stop codon for %s" % (trsp_id)

    # 		if startcodonmrnapos == 'absent' or stopcodonmrnapos == 'absent':
    # #             print "no start of stop for trsp  %s" % trsp_id
    # 			continue

            cdsseq = exonsplicedseq[
                startcodonmrnapos:stopcodonmrnapos +
                1]  # take from startcodonmrnapos to stopcodonmrnapos
            utr5seq = exonsplicedseq[:startcodonmrnapos]
            utr3seq = exonsplicedseq[stopcodonmrnapos + 1:]

            if str(cdsseq[:3].upper()) != "ATG":
                nonATGstart += 1
                continue  # ignore non-AUG start codons
            stopcodon = str(cdsseq[-3:].upper())
            # if len(utr3seq) > 0:
            # 	stop4nt = stopcodon +str(utr3seq[0].upper())
            # elif len(utr3seq) == 0:
            # 	stop4nt = '0'
            # else:
            # 	print "there is a 3'UTR with negative length..."
            # 	sys.exit()
            if stopcodon != "TGA" and stopcodon != "TAG" and stopcodon != "TAA":
                wrongstopcodon += 1
                continue  # ignore weird stop codons

            # build itmes in transcript attribute list
            mRNAlen = len(exonsplicedseq)
            cdslen = len(cdsseq)
            utr5len = len(utr5seq)
            utr3len = len(utr3seq)
            assert mRNAlen == utr3len + cdslen + utr5len  # check that sum of features equals mRNA length

            #### Counting of uORFs ####
            uORFcounter = 0
            cdsExtension = 0

            for i in range(len(utr5seq)):
                ### iterate over every nucleotide in the 5'UTR

                codon = utr5seq[i:i + 3]  # define the codon at each position
                if str(codon) == str(
                        startCodon):  # check if it is a start codon
                    uORFcounter += 1

                    startPosition = i
                    seqIndex = i
                    uORFaa = []
                    uORFseq = []
                    # print "found start codon at pos %s" % startPosition
                    aminoAcid = codon.translate()
                    uORFseq.append(str(codon))
                    uORFaa.append(str(aminoAcid))

                    while str(
                            aminoAcid
                    ) != "*":  # continue this loop until a stop codon is encoutered
                        seqIndex += 3  # advance by 3 nt's each time (1 codon)
                        nextCodon = utr5seq[seqIndex:seqIndex + 3]
                        aminoAcid = nextCodon.translate()
                        if len(
                                nextCodon
                        ) == 3:  # ensure that a full codon is still present, do not want 1 or 2 nts
                            uORFseq.append(str(nextCodon))
                            uORFaa.append(str(aminoAcid))

                        if seqIndex > len(
                                utr5seq
                        ) - 2:  # if uORF continues into cds, retreive sequences from here
                            # -2 is because this will not yeild a full codon (only 2 nt's)
                            # print "end of UTR"
                            cdsExtension = 1

                            utrCdsSeq = utr5seq + cdsseq
                            nextCodon = utrCdsSeq[seqIndex:seqIndex + 3]
                            aminoAcid = nextCodon.translate()
                            uORFseq.append(str(nextCodon))
                            uORFaa.append(str(aminoAcid))
                            # print nextCodon, aminoAcid

                            if seqIndex > len(
                                    utrCdsSeq
                            ):  ## if uORF exceeds coding region, stop counting this,
                                ### could eventually extend to the 3'UTR if any transcript exists here
                                print 'end of CDS for trsp %s' % trsp_id
                                break

                    uORFseqCat = "".join(
                        uORFseq
                    )  # remove seperate list entries and concat to a string
                    uORFaaCat = "".join(uORFaa)

                    ### save all uORF features to a list, and build into a dataframe
                    uORF_features = [
                        trsp_id, trsp_genename, trsp_strand, uORFcounter,
                        startPosition, cdsExtension,
                        len(utr5seq),
                        len(cdsseq),
                        len(utr3seq),
                        len(uORFseqCat), uORFseqCat, uORFaaCat
                    ]

                    dftemp = pd.DataFrame([uORF_features], columns=dfCols)  ##
                    # print dftemp

                    uORFdf = pd.concat([uORFdf, dftemp], ignore_index=True)

                if i == (len(utr5seq) -
                         1):  # at the end of the 5'UTR, do this ...
                    # print i
                    uORFsummary = [
                        trsp_id, trsp_genename, chrom, transcriptnum,
                        trsp_strand, uORFcounter, cdsExtension
                    ]
                    # print uORFsummary
                    dfSummaryTemp = pd.DataFrame([uORFsummary],
                                                 columns=summaryCols)
                    # print dfSummaryTemp
                    summarydf = pd.concat([summarydf, dfSummaryTemp],
                                          ignore_index=True)

    uORFdf.to_csv(uORFtableOutfile)
    summarydf.to_csv(uORFsummaryOutfile)
    print summarydf.head()

Example #10

Show file

File: patchprimerlist.py Project: neilrobertson/BICRCode

primerfile = "/home/pzs/primerdesign/primerdesign/tags/parallel/promoterprimers.csv"
outfile = "processedprimers.csv"

reader = csv.reader(open(primerfile, "r"))
writer = csv.writer(open(outfile, "w"))

for row in reader:
	rowlen = len(row)
	if rowlen == 4:
		assert(row[-1] == "site not present!")
		writer.writerow(row)
	elif rowlen == 6:
		assert(row[-1] == "None found!")
		writer.writerow(row)
	elif rowlen == 11:
		writer.writerow(row)
		continue
	elif rowlen == 10:
		left = row[5]
		right = SeqIO.Seq(row[6])
		right = str(right.reverse_complement())
		fullseq = row[4]
		leftindex = fullseq.index(left)
		rightindex = fullseq.index(right) + len(right)
		product = fullseq[leftindex:rightindex]
		row.insert(7, product)
		writer.writerow(row)
	else:
		print "unknown row type", row