def make_tRNA_fasta_dict(tRNAdf):
    """
    Build FASTA records for every tRNA row of an annotation table.

    Counterpart of make_fasta_dict for the tRNA database.

    Args:
        tRNAdf: table read through ``.index`` / ``.loc`` with the columns
            'feature', '#chrom', 'chromStart', 'chromEnd', 'strand' and
            'transcript_id' (presumably a pandas DataFrame).

    Returns:
        OrderedDict mapping "tRNA_<gene_id>" to the SeqRecord built for that
        row.  Rows whose 'feature' is not 'tRNA' are skipped; a later row
        with the same ID replaces the earlier record.
    """
    records = OrderedDict()
    for row in tRNAdf.index:
        if tRNAdf.loc[row, 'feature'] != 'tRNA':
            continue

        chrom = tRNAdf.loc[row, '#chrom']
        first_base = int(tRNAdf.loc[row, 'chromStart'])
        chrEnd = int(tRNAdf.loc[row, 'chromEnd'])
        strand = tRNAdf.loc[row, 'strand']

        # gtf files are 1-based: shift the start to get a 0-based python slice
        chrStart = first_base - 1
        sequence = SeqIO.Seq(genome[chrom][chrStart:chrEnd])
        if strand != "+":
            # every strand value other than "+" is treated as the negative strand
            sequence = sequence.reverse_complement()

        attributes = parse_entry(tRNAdf.loc[row, 'transcript_id'])
        record_id = "tRNA_" + attributes['gene_id'][0]
        location = " | %s; %s; %s:%s" % (chrom, strand, chrStart, chrEnd)
        description = "| tRNA | " + attributes['gene_type'][0] + location
        records[record_id] = SeqRecord(
            sequence,
            id=record_id,
            name=attributes['gene_name'][0],
            description=description,
        )
    return records
def make_fasta_dict(ncdf):
    """
    Build FASTA records for every 'transcript' row of an annotation table.

    Args:
        ncdf: table read through ``.index`` / ``.loc`` with the columns
            'feature', '#chrom', 'chromStart', 'chromEnd', 'strand' and
            'transcript_id' (presumably a pandas DataFrame).

    Returns:
        OrderedDict mapping the parsed transcript 'ID' to the SeqRecord built
        for that row.  Rows whose 'feature' is not 'transcript' are skipped;
        a later row with the same ID replaces the earlier record.
    """
    records = OrderedDict()
    for row in ncdf.index:
        if ncdf.loc[row, 'feature'] != 'transcript':
            continue

        chrom = ncdf.loc[row, '#chrom']
        first_base = int(ncdf.loc[row, 'chromStart'])
        chrEnd = int(ncdf.loc[row, 'chromEnd'])
        strand = ncdf.loc[row, 'strand']

        # gtf files are 1-based: shift the start to get a 0-based python slice
        chrStart = first_base - 1
        sequence = SeqIO.Seq(genome[chrom][chrStart:chrEnd])
        if strand != "+":
            # every strand value other than "+" is treated as the negative strand
            sequence = sequence.reverse_complement()

        attributes = parse_mod_entry(ncdf.loc[row, 'transcript_id'])

        # assemble the annotation shown on the FASTA header line
        record_id = attributes['ID'][0]
        location = " | %s; %s; %s:%s" % (chrom, strand, chrStart, chrEnd)
        description = ("| " + attributes['gene_type'][0] + " | "
                       + attributes['gene_name'][0] + location)
        records[record_id] = SeqRecord(
            sequence,
            id=record_id,
            name=attributes['gene_name'][0],
            description=description,
        )
    return records
def write_consensus_seqs(refseq, contrib_props, contrib_reads, args):
    """
    Call a consensus sequence per contributor and write them all to FASTA.

    The output file is "<args.cons_prefix>.fa".  One record is written for
    each entry of contrib_props, followed by an 'unassigned' record when
    contrib_reads has such a key.

    Args:
        refseq: The reference sequence to which the fragments were aligned.
        contrib_props: A list of lists containing for each contributor
                       - contributor ID (hap#)
                       - haplogroup
                       - proportion in mixture (not used).
        contrib_reads: A table mapping hap# IDs to lists of pysam
                       AlignedSegments + an entry of unassigned.
        args: The argument values from mixemt's argparse results.
    Returns: nothing
    """
    def consensus_record(label, description):
        # One FASTA record holding the consensus of the reads filed under `label`.
        consensus = call_consensus(refseq, contrib_reads[label], 1, args,
                                   strict=False)
        return SeqIO.SeqRecord(SeqIO.Seq(consensus), id=label,
                               description=description)

    out_path = "%s.fa" % (args.cons_prefix)
    with open(out_path, 'w') as fa_out:
        records = [consensus_record(con, hap) for con, hap, _ in contrib_props]
        if 'unassigned' in contrib_reads:
            records.append(consensus_record('unassigned', ''))
        SeqIO.write(records, fa_out, 'fasta')
def parse_result(self, genome_path):
    """
    Yield one SeqRecord per gene found in "<genome_path>.gmhmm".

    A gene starts at a line beginning with '>gene', e.g.
    ``>gene_2|GeneMark.hmm|57_nt|+|1|57 >NODE_3_length_713_cov_1.25228``,
    and its sequence is the concatenation of the following lines up to the
    next whitespace-only line.  The record id is '>' followed by the header
    line with all whitespace and '>' characters removed.

    A gene is also emitted when it is terminated by the end of the file or
    by the next '>gene' header instead of a blank line, so the last gene of
    a file without a trailing blank line is not lost.

    Args:
        genome_path: path prefix; '.gmhmm' is appended to locate the result.

    Yields:
        SeqIO.SeqRecord with empty name and description.
    """
    def make_record(header_id, chunks):
        # Join the collected sequence lines into a single record.
        return SeqIO.SeqRecord(SeqIO.Seq(''.join(chunks)), id='>' + header_id,
                               description='', name='')

    result_path = genome_path + '.gmhmm'
    reading_gene = False
    seq_id = None
    seq = []
    with open(result_path) as f:
        for line in f:
            if line.startswith('>gene'):
                if reading_gene:
                    # previous gene was not closed by a blank line: flush it
                    yield make_record(seq_id, seq)
                reading_gene = True
                seq = []
                seq_id = re.sub(r'[\s>]', '', line)
            elif reading_gene:
                if line.isspace():
                    reading_gene = False
                    yield make_record(seq_id, seq)
                else:
                    seq.append(line.strip())
    if reading_gene:
        # file ended while a gene was still open
        yield make_record(seq_id, seq)
def _scan_utr3_frame_stops(mRNAseq, frame_seq, cds_end, frame_offset,
                           frame_label, trsp_id):
    """
    Find every stop codon of one reading frame downstream of the CDS.

    Args:
        mRNAseq: the full spliced mRNA sequence.
        frame_seq: the sequence to translate for this frame.
        cds_end: length of 5'UTR + CDS, i.e. mRNA index of the first 3'UTR nt.
        frame_offset: shift of this frame relative to the 3'UTR start
            (0, +1 or -1).
        frame_label: frame name used in the error message ("0", "+1", "-1").
        trsp_id: transcript identifier used in the error message.

    Returns:
        (positions relative to the 3'UTR start, positions in the mRNA),
        both in nucleotides.

    Exits the program via sys.exit() when the three nucleotides found at a
    reported mRNA position are not TAA/TAG/TGA (coordinate sanity check).
    """
    utr3_positions = []
    mrna_positions = []
    for codon_index, residue in enumerate(frame_seq.translate()):
        if residue != '*':
            continue
        utr3_pos = codon_index * 3 + frame_offset
        mrna_pos = cds_end + utr3_pos
        utr3_positions.append(utr3_pos)
        mrna_positions.append(mrna_pos)
        # check mRNA position to make sure the stop codon is really there
        sc = str(mRNAseq[mrna_pos:mrna_pos + 3].upper())
        if sc not in ("TAA", "TAG", "TGA"):
            print("stop codon in frame %s for %s is non correct!"
                  % (frame_label, trsp_id))
            print("stopcodon is: %s" % sc)
            sys.exit()
    return utr3_positions, mrna_positions


def build_utr3_stop_positions(GFFlist):
    """
    Locate the stop codons in all three reading frames of every 3'UTR.

    For each transcript on a chromosome listed in the module-level validChrs,
    the exons are spliced together, the CDS is delimited by the annotated
    start/stop codons, and the 3'UTR is scanned for stop codons in frame 0,
    frame +1 and frame -1 relative to the CDS.  Transcripts whose CDS does
    not start with ATG or does not end in TGA/TAG/TAA are skipped.

    When a transcript has no start_codon (or stop_codon) sub-feature, a
    warning is printed and the same fallback as build_utr_table is used:
    start = 0 and stop = len(mRNA) - 3.

    Args:
        GFFlist: mapping of chromosome name to a parsed record whose
            ``features`` are transcripts carrying exon / start_codon /
            stop_codon ``sub_features``.

    Returns:
        (ucscIDlist, transcriptdict): the list of kept transcript ids and a
        dict mapping each id to
        [trsp_id, gene_name,
         frame0_utr3_positions, frame0_mrna_positions,
         frame+1_utr3_positions, frame+1_mrna_positions,
         frame-1_utr3_positions, frame-1_mrna_positions].
    """
    transcriptdict = {}
    ucscIDlist = []
    total_transcripts = 0
    nonvalidchorms = 0
    nonATGstart = 0
    wrongstopcodon = 0
    validchroms = 0
    excluded_chroms = []
    included_chroms = []

    for chrom in GFFlist:
        # check that only valid chromosomes are used
        if chrom not in validChrs:
            excluded_chroms.append(chrom)
            nonvalidchorms += 1
            continue
        validchroms += 1
        included_chroms.append(chrom)

        # transcriptnum is the index of the transcript within the chromosome
        for transcriptnum, transcript in enumerate(GFFlist[chrom].features):
            trsp_id = transcript.id
            trsp_strand = transcript.strand
            trsp_genename = transcript.qualifiers['Name'][0]
            trsp_chromstart = int(transcript.location.start.position)  # 0-based
            trsp_chromend = int(transcript.location.end.position)
            exonsplicedseq = SeqIO.Seq('')
            transcriptseq = SeqIO.Seq(
                genome[chrom][trsp_chromstart:trsp_chromend])

            # lists, because a codon split by a splice junction has several entries
            startCodonMrnaList = []
            stopCodonMrnaList = []

            if trsp_strand == 1:
                for item in transcript.sub_features:
                    if item.type == 'exon':
                        exonstart = int(item.location.start.position)
                        exonend = int(item.location.end.position)
                        exonsplicedseq += transcriptseq[
                            exonstart - trsp_chromstart:
                            exonend - trsp_chromstart]
                    elif item.type == 'start_codon':
                        startcodonpos = item.location.start.position
                        startCodonMrnaList.append(
                            chrpostomrnapos(startcodonpos, chrom,
                                            transcriptnum, GFFlist))
                    elif item.type == 'stop_codon':
                        # last nucleotide of the stop codon, 0-based
                        stopcodonpos = item.location.end.position - 1
                        stopCodonMrnaList.append(
                            chrpostomrnapos(stopcodonpos, chrom,
                                            transcriptnum, GFFlist))
            elif trsp_strand == -1:
                # computed once per transcript instead of once per sub-feature
                transcriptseq_rev = transcriptseq.reverse_complement()
                for item in transcript.sub_features:
                    if item.type == 'exon':
                        exonstart = int(item.location.start.position)
                        exonend = int(item.location.end.position)
                        # mirror the exon bounds onto the reverse-complemented
                        # transcript; later exons are prepended
                        exonseq = transcriptseq_rev[
                            trsp_chromend - exonend:
                            trsp_chromend - exonstart]
                        exonsplicedseq = exonseq + exonsplicedseq
                    elif item.type == 'start_codon':
                        startcodonpos = item.location.end.position - 1
                        startCodonMrnaList.append(
                            chrpostomrnapos(startcodonpos, chrom,
                                            transcriptnum, GFFlist))
                    elif item.type == 'stop_codon':
                        stopcodonpos = item.location.start.position
                        stopCodonMrnaList.append(
                            chrpostomrnapos(stopcodonpos, chrom,
                                            transcriptnum, GFFlist))

            # choose start and stop codons; never reuse the previous
            # transcript's positions when an annotation is missing
            if startCodonMrnaList:
                startcodonmrnapos = min(startCodonMrnaList)
            else:
                print("!!! no start codon for %s" % (trsp_id))
                startcodonmrnapos = 0
            if stopCodonMrnaList:
                stopcodonmrnapos = max(stopCodonMrnaList)
            else:
                print("!!! no stop codon for %s" % (trsp_id))
                stopcodonmrnapos = len(exonsplicedseq) - 3

            mRNAseq = exonsplicedseq
            cdsseq = exonsplicedseq[startcodonmrnapos:stopcodonmrnapos + 1]
            utr5seq = exonsplicedseq[:startcodonmrnapos]
            utr3seq = exonsplicedseq[stopcodonmrnapos + 1:]

            if str(cdsseq[:3].upper()) != "ATG":
                nonATGstart += 1
                continue  # ignore non-AUG start codons

            # the stop codon is the last 3 nt of cdsseq
            stopcodon = str(cdsseq[-3:].upper())
            if stopcodon not in ("TGA", "TAG", "TAA"):
                wrongstopcodon += 1
                continue  # ignore weird stop codons

            mRNAlen = len(exonsplicedseq)
            cdslen = len(cdsseq)
            utr5len = len(utr5seq)
            utr3len = len(utr3seq)
            # check that sum of features equals mRNA length
            assert mRNAlen == utr3len + cdslen + utr5len

            cds_end = utr5len + cdslen
            frameZeroStopPositions, frameZeroStopPositionsMRNA = \
                _scan_utr3_frame_stops(mRNAseq, utr3seq, cds_end,
                                       0, "0", trsp_id)
            # start one nucleotide into the 3'UTR for the +1 frameshift
            framePlusOneStopPositions, framePlusOneStopPositionsMRNA = \
                _scan_utr3_frame_stops(mRNAseq, utr3seq[1:], cds_end,
                                       1, "+1", trsp_id)
            # include the last nucleotide of the CDS for the -1 frameshift
            frameMinusOneStopPositions, frameMinusOneStopPositionsMRNA = \
                _scan_utr3_frame_stops(mRNAseq, cdsseq[-1] + utr3seq, cds_end,
                                       -1, "-1", trsp_id)

            trsp_attr_list = [
                trsp_id, trsp_genename,
                frameZeroStopPositions, frameZeroStopPositionsMRNA,
                framePlusOneStopPositions, framePlusOneStopPositionsMRNA,
                frameMinusOneStopPositions, frameMinusOneStopPositionsMRNA,
            ]
            ucscIDlist.append(trsp_id)
            transcriptdict[trsp_id] = trsp_attr_list
            total_transcripts += 1

    # single-argument print calls give the same output under Python 2 and 3
    print("total number of transcripts in data table: %s" % total_transcripts)
    print("Number of included chromosomes chr: %s" % validchroms)
    print("Number of excluded chromosomes chr: %s" % nonvalidchorms)
    print("included chroms:  %s" % (included_chroms,))
    print("excluded chroms:  %s" % (excluded_chroms,))
    print("transcripts discarded due to non-AUG start codon %s" % nonATGstart)
    print("transcripts discarded due to noncanonical stop codon %s"
          % wrongstopcodon)
    return ucscIDlist, transcriptdict
def builddense(self):
    """
    Build per-transcript spliced read-density lists and write them out.

    For every transcript on an accepted chromosome the unspliced counts
    returned by self.getbam_5or3counts are spliced exon by exon; transcripts
    whose CDS counts sum to at least self.threshold are kept.  The kept
    counts are normalised through self.norm_m (by the reads counted here when
    self.totreads == '-1', otherwise by self.totreads), written to
    "<self.outputdata>.csv.gz", and a plain-text summary of the run is
    written to "<self.outputdata>output.txt" and echoed to stdout.

    Transcripts without a start_codon sub-feature use mRNA position 0 as the
    start; those without a stop_codon use len(mRNA) - 3 as the stop.

    Returns: nothing
    """
    transcriptdict = {}
    mappedlocalreads = 0
    dumppedreads = 0
    illegalreads = 0
    tooshortlongreads = 0
    wrongstrandreads = 0
    totalreads = 0  # counted here; not the same thing as self.totreads

    GFFlist = self.makeGFFlist(self.GTFgen)
    validChrs = [
        'chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8',
        'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',
        'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22',
        'chrX', 'chrY', 'chrM', 'chrSinV', 'chrLUC'
    ]

    for chrom in GFFlist:
        if chrom not in validChrs:
            continue
        # transcriptnum is the index of the transcript within the chromosome
        for transcriptnum, transcript in enumerate(GFFlist[chrom].features):
            trsp_id = transcript.id
            trsp_strand = transcript.strand
            trsp_chromstart = int(transcript.location.start.position)  # 0-based
            trsp_chromend = int(transcript.location.end.position)
            # one slot per nucleotide of the unspliced transcript (pre-mRNA)
            transcriptlist = [0.0] * abs(trsp_chromend - trsp_chromstart)

            # gb[0] holds the unspliced counts, gb[1:7] the read tallies
            gb = self.getbam_5or3counts(
                self.bamfile, transcriptlist, chrom, transcriptnum,
                trsp_chromstart, trsp_chromend, trsp_id, trsp_strand,
                self.riboshiftdict, self.assignment, self.bamfileout)
            mappedlocalreads += gb[1]
            dumppedreads += gb[2]
            illegalreads += gb[3]
            tooshortlongreads += gb[4]
            wrongstrandreads += gb[5]
            totalreads += gb[6]

            exonsplicedseq = SeqIO.Seq('')
            exonsplicedcounts = []
            transcriptseq = SeqIO.Seq(
                genome[chrom][trsp_chromstart:trsp_chromend])

            # lists, because a codon split by a splice junction has several entries
            startCodonMrnaList = []
            stopCodonMrnaList = []

            if trsp_strand == 1:
                for item in transcript.sub_features:
                    if item.type == 'exon':
                        exonstart = int(item.location.start.position)
                        exonend = int(item.location.end.position)
                        exonstart_feat = exonstart - trsp_chromstart
                        exonend_feat = exonend - trsp_chromstart
                        exonsplicedcounts += gb[0][exonstart_feat:exonend_feat]
                        exonsplicedseq += transcriptseq[
                            exonstart_feat:exonend_feat]
                    elif item.type == 'start_codon':
                        startcodonpos = item.location.start.position
                        startCodonMrnaList.append(
                            self.chrpostomrnapos(startcodonpos, chrom,
                                                 transcriptnum, GFFlist))
                    elif item.type == 'stop_codon':
                        # last nucleotide of the stop codon, 0-based
                        stopcodonpos = item.location.end.position - 1
                        stopCodonMrnaList.append(
                            self.chrpostomrnapos(stopcodonpos, chrom,
                                                 transcriptnum, GFFlist))
            elif trsp_strand == -1:
                # computed once per transcript instead of once per sub-feature
                transcriptseq_rev = transcriptseq.reverse_complement()
                for item in transcript.sub_features:
                    if item.type == 'exon':
                        exonstart = int(item.location.start.position)
                        exonend = int(item.location.end.position)
                        # exon bounds mirrored onto the reversed transcript
                        rev_start = trsp_chromend - exonend
                        rev_stop = trsp_chromend - exonstart
                        # pieces are prepended, so nothing is flipped again
                        exonsplicedcounts = (gb[0][rev_start:rev_stop]
                                             + exonsplicedcounts)
                        exonsplicedseq = (transcriptseq_rev[rev_start:rev_stop]
                                          + exonsplicedseq)
                    elif item.type == 'start_codon':
                        startcodonpos = item.location.end.position - 1
                        startCodonMrnaList.append(
                            self.chrpostomrnapos(startcodonpos, chrom,
                                                 transcriptnum, GFFlist))
                    elif item.type == 'stop_codon':
                        stopcodonpos = item.location.start.position
                        stopCodonMrnaList.append(
                            self.chrpostomrnapos(stopcodonpos, chrom,
                                                 transcriptnum, GFFlist))

            if startCodonMrnaList:
                startcodonmrnapos = min(startCodonMrnaList)
            else:
                startcodonmrnapos = 0  # transcripts without start codon
            if stopCodonMrnaList:
                stopcodonmrnapos = max(stopCodonMrnaList)
            else:
                # leave 3 nt in the "3'UTR"
                stopcodonmrnapos = len(exonsplicedseq) - 3

            cdscounts = exonsplicedcounts[
                startcodonmrnapos:stopcodonmrnapos + 1]
            # thresholding minimal reads per CDS
            if sum(cdscounts) >= float(self.threshold):
                transcriptdict[trsp_id] = exonsplicedcounts

    # single-argument print calls give the same output under Python 2 and 3
    if self.totreads == '-1':
        print(str(totalreads) + " total mapped reads used for normalization.")
        # normalised by the reads counted above, not by all aligned reads
        self.norm_m(transcriptdict, totalreads)
    else:
        print(str(self.totreads)
              + " total mapped reads from STAR alignment used for normalization.")
        self.norm_m(transcriptdict, self.totreads)

    # assemble dataframe: one row per transcript, one 'density' cell each
    outdict = OrderedDict()
    for key, val in transcriptdict.items():
        outdict[key] = [val]
    df = pd.DataFrame.from_dict(outdict, orient='index')
    df.columns = ['density']
    print(df.head())
    df.to_csv('%s.csv.gz' % (self.outputdata), compression='gzip')

    # Write output file of comments; `with` closes it even if a write fails.
    with open(self.outputdata + "output.txt", "w") as fc:
        fc.write("Density was built with parameters:\n")
        fc.write("riboshiftdict=" + str(self.riboshiftdict) + "\n")
        fc.write("threshold=" + str(self.threshold) + "\n")
        fc.write("assignment=" + str(self.assignment) + "\n")
        fc.write("reads mapped to known canonical coding transcripts: "
                 + str(mappedlocalreads) + "\n")
        fc.write("reads are dumpped, due to weird cigar codes: "
                 + str(dumppedreads) + "\n")
        fc.write("reads are illegal, mapped outside of annotated transcripts: "
                 + str(illegalreads) + "\n")
        fc.write("reads are too short/long: " + str(tooshortlongreads) + "\n")
        fc.write("reads are on the wrong strand: "
                 + str(wrongstrandreads) + "\n")
        fc.write("total mapped reads from aligner: " + str(totalreads))

    print(str(mappedlocalreads)
          + " reads within length limitation mapped to known canonical coding transcripts. ")
    print(str(dumppedreads) + " reads are dumpped, due to weird cigar codes.")
    print(str(illegalreads)
          + " reads are illegal, mapped outside of annotated transcripts.")
    print(str(tooshortlongreads) + " reads are too short/long.")
    print(str(wrongstrandreads) + " reads are on the wrong strand.")
    print(str(totalreads) + " total mapped reads from aligner. ")
def build_utr_table(GFFlist, inculde_noncanon_start, include_noncanon_stop):
    """
    Tabulate mRNA, CDS and UTR lengths for every transcript of a GFF/GTF.

    For each transcript on a chromosome listed in the module-level validChrs,
    the exons are spliced together and the CDS is delimited by the annotated
    start/stop codons.  Transcripts without a start_codon sub-feature use
    mRNA position 0 as the start; those without a stop_codon use
    len(mRNA) - 3 as the stop (a warning is printed in both cases).

    Args:
        GFFlist: mapping of chromosome name to a parsed record whose
            ``features`` are transcripts carrying exon / start_codon /
            stop_codon ``sub_features``.
        inculde_noncanon_start: (spelling kept for callers) when this
            compares equal to False, transcripts whose CDS does not begin
            with ATG are reported and skipped.
        include_noncanon_stop: when this compares equal to False, transcripts
            whose CDS does not end in TGA/TAG/TAA are reported and skipped.

    Returns:
        (ucscIDlist, transcriptdict): the list of kept transcript ids and a
        dict mapping each id to
        [transcript, chrom, featnum, strand, mrna_len, cds_len, 5utr_len,
         3utr_len, gene_name, stopcodon, stop4nt]
        where stop4nt is the stop codon plus the first 3'UTR nucleotide, or
        '0' when the 3'UTR is empty.
    """
    transcriptdict = {}
    ucscIDlist = []
    total_transcripts = 0
    nonvalidchorms = 0
    nonATGstart = 0
    wrongstopcodon = 0
    validchroms = 0
    excluded_chroms = []
    included_chroms = []

    for chrom in GFFlist:
        # check that only valid chromosomes are used
        if chrom not in validChrs:
            excluded_chroms.append(chrom)
            nonvalidchorms += 1
            continue
        validchroms += 1
        included_chroms.append(chrom)

        # transcriptnum is the index of the transcript within the chromosome
        for transcriptnum, transcript in enumerate(GFFlist[chrom].features):
            trsp_id = transcript.id
            trsp_strand = transcript.strand
            # qualifiers holds the parsed GTF column-8 fields as
            # {'key': ['item1', 'item2', ...]}
            trsp_genename = transcript.qualifiers['Name'][0]
            trsp_chromstart = int(transcript.location.start.position)  # 0-based
            trsp_chromend = int(transcript.location.end.position)
            exonsplicedseq = SeqIO.Seq('')
            transcriptseq = SeqIO.Seq(
                genome[chrom][trsp_chromstart:trsp_chromend])

            # lists, because a codon split by a splice junction has several entries
            startCodonMrnaList = []
            stopCodonMrnaList = []

            if trsp_strand == 1:
                for item in transcript.sub_features:
                    if item.type == 'exon':
                        exonstart = int(item.location.start.position)
                        exonend = int(item.location.end.position)
                        exonsplicedseq += transcriptseq[
                            exonstart - trsp_chromstart:
                            exonend - trsp_chromstart]
                    elif item.type == 'start_codon':
                        startcodonpos = item.location.start.position
                        startCodonMrnaList.append(
                            chrpostomrnapos(startcodonpos, chrom,
                                            transcriptnum, GFFlist))
                    elif item.type == 'stop_codon':
                        # last nucleotide of the stop codon, 0-based
                        stopcodonpos = item.location.end.position - 1
                        stopCodonMrnaList.append(
                            chrpostomrnapos(stopcodonpos, chrom,
                                            transcriptnum, GFFlist))
            elif trsp_strand == -1:
                # computed once per transcript instead of once per sub-feature
                transcriptseq_rev = transcriptseq.reverse_complement()
                for item in transcript.sub_features:
                    if item.type == 'exon':
                        exonstart = int(item.location.start.position)
                        exonend = int(item.location.end.position)
                        # mirror the exon bounds onto the reverse-complemented
                        # transcript; later exons are prepended
                        exonseq = transcriptseq_rev[
                            trsp_chromend - exonend:
                            trsp_chromend - exonstart]
                        exonsplicedseq = exonseq + exonsplicedseq
                    elif item.type == 'start_codon':
                        startcodonpos = item.location.end.position - 1
                        startCodonMrnaList.append(
                            chrpostomrnapos(startcodonpos, chrom,
                                            transcriptnum, GFFlist))
                    elif item.type == 'stop_codon':
                        stopcodonpos = item.location.start.position
                        stopCodonMrnaList.append(
                            chrpostomrnapos(stopcodonpos, chrom,
                                            transcriptnum, GFFlist))

            if startCodonMrnaList:
                startcodonmrnapos = min(startCodonMrnaList)
            else:
                print("!!! no start codon for %s" % (trsp_id))
                startcodonmrnapos = 0  # transcripts without start codon
            if stopCodonMrnaList:
                stopcodonmrnapos = max(stopCodonMrnaList)
            else:
                print("!!! no stop codon for %s" % (trsp_id))
                # leave 3 nt in the "3'UTR"
                stopcodonmrnapos = len(exonsplicedseq) - 3

            cdsseq = exonsplicedseq[startcodonmrnapos:stopcodonmrnapos + 1]
            utr5seq = exonsplicedseq[:startcodonmrnapos]
            utr3seq = exonsplicedseq[stopcodonmrnapos + 1:]

            # `== False` is deliberate: only an explicit False/0 turns the
            # filter on, exactly as callers have always experienced it
            if inculde_noncanon_start == False:
                if str(cdsseq[:3].upper()) != "ATG":
                    nonATGstart += 1
                    print("non canon start")
                    print(trsp_id)
                    print(cdsseq)
                    print("")
                    continue  # ignore non-AUG start codons

            stopcodon = str(cdsseq[-3:].upper())
            if len(utr3seq) > 0:
                stop4nt = stopcodon + str(utr3seq[0].upper())
            else:
                stop4nt = '0'

            if include_noncanon_stop == False:
                if stopcodon not in ("TGA", "TAG", "TAA"):
                    wrongstopcodon += 1
                    print("wrong stop!")
                    print(trsp_id)
                    print(cdsseq)
                    print("")
                    continue  # ignore weird stop codons

            mRNAlen = len(exonsplicedseq)
            cdslen = len(cdsseq)
            utr5len = len(utr5seq)
            utr3len = len(utr3seq)
            # check that sum of features equals mRNA length
            assert mRNAlen == utr3len + cdslen + utr5len

            trsp_attr_list = [
                trsp_id, chrom, transcriptnum, trsp_strand, mRNAlen, cdslen,
                utr5len, utr3len, trsp_genename, stopcodon, stop4nt
            ]
            ucscIDlist.append(trsp_id)
            transcriptdict[trsp_id] = trsp_attr_list
            total_transcripts += 1

    # single-argument print calls give the same output under Python 2 and 3
    print("total number of transcripts in data table: %s" % total_transcripts)
    print("Number of included chromosomes chr: %s" % validchroms)
    print("Number of excluded chromosomes chr: %s" % nonvalidchorms)
    print("included chroms:  %s" % (included_chroms,))
    print("excluded chroms:  %s" % (excluded_chroms,))
    print("transcripts discarded due to non-AUG start codon %s" % nonATGstart)
    print("transcripts discarded due to noncanonical stop codon %s"
          % wrongstopcodon)
    return ucscIDlist, transcriptdict
def get_Prot_sequence(GFFlist):
    """
    Translate the annotated CDS of every transcript of a GFF/GTF.

    For each transcript on a chromosome listed in the module-level validChrs,
    the exons are spliced together, the CDS is delimited by the annotated
    start/stop codons and translated (the stop codon is part of the CDS).
    Transcripts whose CDS does not start with ATG or does not end in
    TGA/TAG/TAA are skipped.

    When a transcript has no start_codon (or stop_codon) sub-feature, a
    warning is printed and the same fallback as build_utr_table is used:
    start = 0 and stop = len(mRNA) - 3.

    Args:
        GFFlist: mapping of chromosome name to a parsed record whose
            ``features`` are transcripts carrying exon / start_codon /
            stop_codon ``sub_features``.

    Returns:
        (ucscIDlist, transcriptdict): the list of kept transcript ids and a
        dict mapping each id to [trsp_id, gene_name, translated_cds].
    """
    transcriptdict = {}
    ucscIDlist = []
    total_transcripts = 0
    nonvalidchorms = 0
    nonATGstart = 0
    wrongstopcodon = 0
    validchroms = 0
    excluded_chroms = []
    included_chroms = []

    for chrom in GFFlist:
        # check that only valid chromosomes are used
        if chrom not in validChrs:
            excluded_chroms.append(chrom)
            nonvalidchorms += 1
            continue
        validchroms += 1
        included_chroms.append(chrom)

        # transcriptnum is the index of the transcript within the chromosome
        for transcriptnum, transcript in enumerate(GFFlist[chrom].features):
            trsp_id = transcript.id
            trsp_strand = transcript.strand
            trsp_genename = transcript.qualifiers['Name'][0]
            trsp_chromstart = int(transcript.location.start.position)  # 0-based
            trsp_chromend = int(transcript.location.end.position)
            exonsplicedseq = SeqIO.Seq('')
            transcriptseq = SeqIO.Seq(
                genome[chrom][trsp_chromstart:trsp_chromend])

            # lists, because a codon split by a splice junction has several entries
            startCodonMrnaList = []
            stopCodonMrnaList = []

            if trsp_strand == 1:
                for item in transcript.sub_features:
                    if item.type == 'exon':
                        exonstart = int(item.location.start.position)
                        exonend = int(item.location.end.position)
                        exonsplicedseq += transcriptseq[
                            exonstart - trsp_chromstart:
                            exonend - trsp_chromstart]
                    elif item.type == 'start_codon':
                        startcodonpos = item.location.start.position
                        startCodonMrnaList.append(
                            chrpostomrnapos(startcodonpos, chrom,
                                            transcriptnum, GFFlist))
                    elif item.type == 'stop_codon':
                        # last nucleotide of the stop codon, 0-based
                        stopcodonpos = item.location.end.position - 1
                        stopCodonMrnaList.append(
                            chrpostomrnapos(stopcodonpos, chrom,
                                            transcriptnum, GFFlist))
            elif trsp_strand == -1:
                # computed once per transcript instead of once per sub-feature
                transcriptseq_rev = transcriptseq.reverse_complement()
                for item in transcript.sub_features:
                    if item.type == 'exon':
                        exonstart = int(item.location.start.position)
                        exonend = int(item.location.end.position)
                        # mirror the exon bounds onto the reverse-complemented
                        # transcript; later exons are prepended
                        exonseq = transcriptseq_rev[
                            trsp_chromend - exonend:
                            trsp_chromend - exonstart]
                        exonsplicedseq = exonseq + exonsplicedseq
                    elif item.type == 'start_codon':
                        startcodonpos = item.location.end.position - 1
                        startCodonMrnaList.append(
                            chrpostomrnapos(startcodonpos, chrom,
                                            transcriptnum, GFFlist))
                    elif item.type == 'stop_codon':
                        stopcodonpos = item.location.start.position
                        stopCodonMrnaList.append(
                            chrpostomrnapos(stopcodonpos, chrom,
                                            transcriptnum, GFFlist))

            # choose start and stop codons; never reuse the previous
            # transcript's positions when an annotation is missing
            if startCodonMrnaList:
                startcodonmrnapos = min(startCodonMrnaList)
            else:
                print("!!! no start codon for %s" % (trsp_id))
                startcodonmrnapos = 0
            if stopCodonMrnaList:
                stopcodonmrnapos = max(stopCodonMrnaList)
            else:
                print("!!! no stop codon for %s" % (trsp_id))
                stopcodonmrnapos = len(exonsplicedseq) - 3

            cdsseq = exonsplicedseq[startcodonmrnapos:stopcodonmrnapos + 1]
            utr5seq = exonsplicedseq[:startcodonmrnapos]
            utr3seq = exonsplicedseq[stopcodonmrnapos + 1:]

            if str(cdsseq[:3].upper()) != "ATG":
                nonATGstart += 1
                continue  # ignore non-AUG start codons

            # the stop codon is the last 3 nt of cdsseq
            stopcodon = str(cdsseq[-3:].upper())
            if stopcodon not in ("TGA", "TAG", "TAA"):
                wrongstopcodon += 1
                continue  # ignore weird stop codons

            mRNAlen = len(exonsplicedseq)
            cdslen = len(cdsseq)
            utr5len = len(utr5seq)
            utr3len = len(utr3seq)
            # check that sum of features equals mRNA length
            assert mRNAlen == utr3len + cdslen + utr5len

            # translate only transcripts that passed the filters above
            cdsProt = cdsseq.translate()

            trsp_attr_list = [trsp_id, trsp_genename, cdsProt]
            ucscIDlist.append(trsp_id)
            transcriptdict[trsp_id] = trsp_attr_list
            total_transcripts += 1

    # single-argument print calls give the same output under Python 2 and 3
    print("total number of transcripts in data table: %s" % total_transcripts)
    print("Number of included chromosomes chr: %s" % validchroms)
    print("Number of excluded chromosomes chr: %s" % nonvalidchorms)
    print("included chroms:  %s" % (included_chroms,))
    print("excluded chroms:  %s" % (excluded_chroms,))
    print("transcripts discarded due to non-AUG start codon %s" % nonATGstart)
    print("transcripts discarded due to noncanonical stop codon %s"
          % wrongstopcodon)
    return ucscIDlist, transcriptdict
def _splice_transcript(transcript, chrom, transcriptnum, GFFlist):
    """Splice a transcript's exons and collect its start/stop codon positions.

    Args:
        transcript: the transcript feature; its ``sub_features`` are scanned
            for 'exon', 'start_codon' and 'stop_codon' entries.
        chrom: key of the chromosome in ``GFFlist`` and in the module-level
            ``genome``.
        transcriptnum: index of ``transcript`` in ``GFFlist[chrom].features``
            (forwarded to ``chrpostomrnapos``).
        GFFlist: the parsed annotation, forwarded to ``chrpostomrnapos``.

    Returns:
        (spliced, start_positions, stop_positions): the spliced sequence in
        transcript orientation, and the values ``chrpostomrnapos`` returned
        for every start_codon / stop_codon sub-feature.  Both lists are empty
        when the strand is neither 1 nor -1.
    """
    strand = transcript.strand
    chrom_start = int(transcript.location.start.position)  # 0-based
    chrom_end = int(transcript.location.end.position)  # end of the slice
    transcriptseq = SeqIO.Seq(genome[chrom][chrom_start:chrom_end])
    if strand == -1:
        # Loop-invariant: reverse-complement the pre-mRNA once per transcript
        # rather than once per sub-feature.
        transcriptseq_rev = transcriptseq.reverse_complement()

    spliced = SeqIO.Seq('')
    start_positions = []
    stop_positions = []
    for item in transcript.sub_features:
        if strand == 1:
            if item.type == 'exon':  # For yeast, use 'CDS'
                exon_from = int(item.location.start.position) - chrom_start
                exon_to = int(item.location.end.position) - chrom_start
                spliced += transcriptseq[exon_from:exon_to]
            elif item.type == 'start_codon':
                # first base of the codon, 0-based
                codon_pos = item.location.start.position
                start_positions.append(
                    chrpostomrnapos(codon_pos, chrom, transcriptnum, GFFlist))
            elif item.type == 'stop_codon':
                # last base of the codon, 0-based
                codon_pos = item.location.end.position - 1
                stop_positions.append(
                    chrpostomrnapos(codon_pos, chrom, transcriptnum, GFFlist))
        elif strand == -1:
            if item.type == 'exon':  # For yeast, use 'CDS'
                exon_start = int(item.location.start.position)
                exon_end = int(item.location.end.position)
                # Mirror the exon's coordinates onto the reverse-complemented
                # transcript, then prepend the exon to the spliced sequence.
                exonseq = transcriptseq_rev[chrom_end - exon_end:
                                            chrom_end - exon_start]
                spliced = exonseq + spliced
            elif item.type == 'start_codon':
                # on the minus strand the codon's first base is the feature end
                codon_pos = item.location.end.position - 1
                start_positions.append(
                    chrpostomrnapos(codon_pos, chrom, transcriptnum, GFFlist))
            elif item.type == 'stop_codon':
                codon_pos = item.location.start.position
                stop_positions.append(
                    chrpostomrnapos(codon_pos, chrom, transcriptnum, GFFlist))
    return spliced, start_positions, stop_positions


def _trace_uorf(utr_cds_seq, start, utr_len, trsp_id):
    """Read codons from an upstream ATG until a stop codon or the CDS end.

    Args:
        utr_cds_seq: 5'UTR immediately followed by the CDS.
        start: 0-based index of the ATG inside the 5'UTR.
        utr_len: length of the 5'UTR part of ``utr_cds_seq``.
        trsp_id: transcript id, used only in the 'end of CDS' message.

    Returns:
        (nt_string, aa_string, extended): the concatenated codons, the
        concatenated translation, and True when any codon that was read
        reaches past the end of the 5'UTR.
    """
    codon = utr_cds_seq[start:start + 3]
    residue = codon.translate()
    codons = [str(codon)]
    residues = [str(residue)]
    extended = False
    total_len = len(utr_cds_seq)
    pos = start
    while str(residue) != "*":
        pos += 3  # advance one codon
        # Reading from UTR+CDS means a codon straddling the boundary (2 nt of
        # UTR + 1 nt of CDS, or 1 + 2) is read whole.  The previous test
        # ``pos > utr_len - 2`` missed the 2 + 1 case, so that codon was
        # dropped and never checked for a stop.
        codon = utr_cds_seq[pos:pos + 3]
        # NOTE(review): a trailing 1-2 nt fragment at the very end of the CDS
        # is still passed to translate(); presumably Biopython returns an
        # empty translation for it (and may warn) -- confirm for the
        # installed version.
        residue = codon.translate()
        if pos + 3 > utr_len:
            extended = True
        codons.append(str(codon))
        residues.append(str(residue))
        if pos > total_len:
            # Ran past the coding region without meeting a stop codon;
            # scanning does not continue into the 3'UTR.
            print('end of CDS for trsp %s' % trsp_id)
            break
    return "".join(codons), "".join(residues), extended


def find_uORFs(GFFlist):
    """
    Identify every upstream ORF (uORF) in the 5'UTRs of the annotated
    transcripts and write two csv files.

    Follows the same transcript-walking structure as denesbuilder_main: for
    each transcript on a chromosome listed in ``validChrs`` the exons are
    spliced, the CDS is delimited by the annotated start/stop codons, and the
    5'UTR is scanned for ATG codons.  Each ATG is extended codon by codon
    until a stop codon is translated or the end of the CDS is passed.

    Transcripts are skipped when
      * no start_codon or no stop_codon sub-feature was found (a warning is
        printed),
      * the CDS does not begin with ATG,
      * the CDS does not end with TGA, TAG or TAA.

    Args:
        GFFlist: mapping of chromosome name to a record whose ``features``
            are transcripts carrying ``sub_features``.

    Returns:
        None.  One row per uORF is written to ``uORFtableOutfile`` and one
        row per transcript with a non-empty 5'UTR to ``uORFsummaryOutfile``;
        the head of the summary table is printed.
    """
    # could possibly be changed to look at non-canonical start codons
    # NOTE(review): the 5'UTR comparison is case-sensitive, whereas the CDS
    # start/stop checks upper-case first -- confirm the genome is upper-case.
    start_codon = "ATG"
    stop_codons = ("TGA", "TAG", "TAA")

    dfCols = [
        'trxname', 'symbol', 'strand', 'uORFCounter', 'startPosition',
        'cdsExtension', 'utr5len', 'cdslen', 'utr3len', 'uORFlen', 'uORFseq',
        'uORFaa'
    ]
    summaryCols = [
        'trxname', 'symbol', 'chr', 'tr_number', 'strand', 'uORFCounter',
        'cdsExtension'
    ]
    # Rows are collected in plain lists and turned into DataFrames once at
    # the end, instead of re-concatenating a growing DataFrame per row.
    uorf_rows = []
    summary_rows = []

    for chrom in GFFlist:
        if chrom not in validChrs:
            continue  # only valid chromosomes are used

        for transcriptnum, transcript in enumerate(GFFlist[chrom].features):
            trsp_id = transcript.id
            trsp_strand = transcript.strand
            trsp_genename = transcript.qualifiers['Name'][0]

            exonsplicedseq, start_positions, stop_positions = \
                _splice_transcript(transcript, chrom, transcriptnum, GFFlist)

            ### choose start and stop codons
            if not start_positions:
                print("!!! no start codon for %s" % (trsp_id))
            if not stop_positions:
                print("!!! no stop codon for %s" % (trsp_id))
            if not start_positions or not stop_positions:
                # Without both codons the CDS cannot be delimited.  Falling
                # through would raise NameError or silently reuse the
                # previous transcript's positions.
                continue
            startcodonmrnapos = min(start_positions)
            stopcodonmrnapos = max(stop_positions)

            # the stop codon is included in cdsseq (its last 3 nt)
            cdsseq = exonsplicedseq[startcodonmrnapos:stopcodonmrnapos + 1]
            utr5seq = exonsplicedseq[:startcodonmrnapos]
            utr3seq = exonsplicedseq[stopcodonmrnapos + 1:]

            if str(cdsseq[:3].upper()) != "ATG":
                continue  # ignore non-AUG start codons
            if str(cdsseq[-3:].upper()) not in stop_codons:
                continue  # ignore weird stop codons

            utr5len = len(utr5seq)
            cdslen = len(cdsseq)
            utr3len = len(utr3seq)
            # check that sum of features equals mRNA length
            assert len(exonsplicedseq) == utr3len + cdslen + utr5len

            #### Counting of uORFs ####
            utr5str = str(utr5seq)
            utrCdsSeq = utr5seq + cdsseq
            uORFcounter = 0
            # NOTE(review): this flag is per transcript and never reset, so
            # once one uORF reaches the CDS every later uORF row of the same
            # transcript also reports 1 -- confirm that is intended.
            cdsExtension = 0
            for startPosition in range(utr5len):
                if not utr5str.startswith(start_codon, startPosition):
                    continue
                uORFcounter += 1
                uORFseqCat, uORFaaCat, extended = _trace_uorf(
                    utrCdsSeq, startPosition, utr5len, trsp_id)
                if extended:
                    cdsExtension = 1
                uorf_rows.append([
                    trsp_id, trsp_genename, trsp_strand, uORFcounter,
                    startPosition, cdsExtension, utr5len, cdslen, utr3len,
                    len(uORFseqCat), uORFseqCat, uORFaaCat
                ])

            if utr5len > 0:
                # One summary row per transcript, written only when the
                # 5'UTR is non-empty (as before: it was emitted on the last
                # 5'UTR position).
                summary_rows.append([
                    trsp_id, trsp_genename, chrom, transcriptnum,
                    trsp_strand, uORFcounter, cdsExtension
                ])

    uORFdf = pd.DataFrame(uorf_rows, columns=dfCols)
    summarydf = pd.DataFrame(summary_rows, columns=summaryCols)
    uORFdf.to_csv(uORFtableOutfile)
    summarydf.to_csv(uORFsummaryOutfile)
    print(summarydf.head())
# Post-process the primer-design csv: rows are passed through unchanged
# except 10-column rows, which get the amplified product inserted at index 7.
primerfile = "/home/pzs/primerdesign/primerdesign/tags/parallel/promoterprimers.csv"
outfile = "processedprimers.csv"

# Both handles are closed (and the output flushed) when the block exits,
# including when a row raises; previously neither file was ever closed.
with open(primerfile, "r") as primer_in, open(outfile, "w") as primer_out:
    reader = csv.reader(primer_in)
    writer = csv.writer(primer_out)
    for row in reader:
        rowlen = len(row)
        if rowlen == 4:
            # 4-column rows must end with this marker; copied unchanged
            assert row[-1] == "site not present!"
            writer.writerow(row)
        elif rowlen == 6:
            # 6-column rows must end with this marker; copied unchanged
            assert row[-1] == "None found!"
            writer.writerow(row)
        elif rowlen == 11:
            # already 11 columns: copied unchanged
            writer.writerow(row)
        elif rowlen == 10:
            left = row[5]
            # row[6] is reverse-complemented before being searched for in
            # the full sequence
            right = str(SeqIO.Seq(row[6]).reverse_complement())
            fullseq = row[4]
            # str.index raises ValueError when a primer is not found in the
            # full sequence
            leftindex = fullseq.index(left)
            rightindex = fullseq.index(right) + len(right)
            # product spans from the start of `left` to the end of `right`
            product = fullseq[leftindex:rightindex]
            row.insert(7, product)
            writer.writerow(row)
        else:
            print("unknown row type %s" % (row,))