def checkGenes(chromosomes, exon_records, chromosome_load_fxn, cdnas, id_fxn):
    """Rebuild each gene's coding sequence and compare it to a reference cDNA.

    For every chromosome, the coding sequence of each peptide located on it
    is reconstructed from its exon records with ``buildCodingSequence``,
    reverse-complemented when the strand is ``'-1'``, and compared with the
    entry of ``cdnas`` selected by ``id_fxn``.  A reconstructed sequence also
    counts as a match when it equals the reference followed by three extra
    bases (presumably a stop codon -- confirm against the data source).
    Per-chromosome and overall tallies are printed to standard output.

    Args:
        chromosomes: iterable of chromosome identifiers to process.
        exon_records: mapping of peptide ID -> mapping whose values are exon
            records exposing ``chromosome``, ``strand`` and ``exon_start``.
        chromosome_load_fxn: callable taking a chromosome identifier and
            returning that chromosome's sequence data.
        cdnas: mapping of identifier -> reference cDNA sequence.
        id_fxn: callable mapping the sentinel record returned by
            ``buildCodingSequence`` to a key of ``cdnas``.

    Returns:
        None.  Results are only reported on standard output.
    """
    total_genes = 0
    total_errors = 0
    for chrom in chromosomes:
        # Single-argument print(...) behaves the same on Python 2 and 3
        print("# Chromosome %s" % (chrom,))
        # Load this chromosome's data
        chrseq = chromosome_load_fxn(chrom)
        n_genes = 0
        n_errors = 0
        n_wrong_length = 0
        n_no_comparison = 0
        # These two are never incremented by this check; they are reported
        # as zero so the summary keeps its established layout.
        n_bad_translation = 0
        n_different_translation = 0
        for recdict in exon_records.values():
            # list(...) so indexing and in-place sorting also work with
            # Python 3 dictionary views
            recs = list(recdict.values())
            # Skip peptides without records or located on another chromosome
            if not recs or recs[0].chromosome != chrom:
                continue
            recs.sort(key=lambda e: e.exon_start)
            (seq, sentinel_rec) = buildCodingSequence(recs, chrseq, 0)
            # Reverse-complement the sequence if it's on the negative strand
            if recs[0].strand == '-1':
                seq = translate.reverseComplement(seq)
            n_genes += 1
            try:
                cdna = cdnas[id_fxn(sentinel_rec)]
            except KeyError:
                # No reference available: counted separately, not as an error
                n_no_comparison += 1
                continue
            if len(seq) == len(cdna):
                matches = seq == cdna
            elif len(seq) - 3 == len(cdna):
                # Ignore the three extra trailing bases when comparing
                matches = seq[:-3] == cdna
            else:
                matches = False
                n_wrong_length += 1
            if not matches:
                n_errors += 1
        print("# Processed %d genes with %d errors" % (n_genes, n_errors))
        print("# %d length errors" % (n_wrong_length,))
        print("# %d no comparison errors" % (n_no_comparison,))
        print("# %d bad translation errors" % (n_bad_translation,))
        print("# %d different translation errors" % (n_different_translation,))
        total_errors += n_errors
        total_genes += n_genes
    print("# Processed %d genes with %d errors total" % (total_genes, total_errors))
def checkGenes(chromosomes, exon_records, chromosome_load_fxn, cdnas, id_fxn):
    """Rebuild each gene's coding sequence and compare it to a reference cDNA.

    For every chromosome, the coding sequence of each peptide located on it
    is reconstructed from its exon records with ``buildCodingSequence``,
    reverse-complemented when the strand is ``'-1'``, and compared with the
    entry of ``cdnas`` selected by ``id_fxn``.  A reconstructed sequence also
    counts as a match when it equals the reference followed by three extra
    bases (presumably a stop codon -- confirm against the data source).
    Per-chromosome and overall tallies are printed to standard output.

    Args:
        chromosomes: iterable of chromosome identifiers to process.
        exon_records: mapping of peptide ID -> mapping whose values are exon
            records exposing ``chromosome``, ``strand`` and ``exon_start``.
        chromosome_load_fxn: callable taking a chromosome identifier and
            returning that chromosome's sequence data.
        cdnas: mapping of identifier -> reference cDNA sequence.
        id_fxn: callable mapping the sentinel record returned by
            ``buildCodingSequence`` to a key of ``cdnas``.

    Returns:
        None.  Results are only reported on standard output.
    """
    total_genes = 0
    total_errors = 0
    for chrom in chromosomes:
        # Single-argument print(...) behaves the same on Python 2 and 3
        print("# Chromosome %s" % (chrom,))
        # Load this chromosome's data
        chrseq = chromosome_load_fxn(chrom)
        n_genes = 0
        n_errors = 0
        n_wrong_length = 0
        n_no_comparison = 0
        # These two are never incremented by this check; they are reported
        # as zero so the summary keeps its established layout.
        n_bad_translation = 0
        n_different_translation = 0
        for recdict in exon_records.values():
            # list(...) so indexing and in-place sorting also work with
            # Python 3 dictionary views
            recs = list(recdict.values())
            # Skip peptides without records or located on another chromosome
            if not recs or recs[0].chromosome != chrom:
                continue
            recs.sort(key=lambda e: e.exon_start)
            (seq, sentinel_rec) = buildCodingSequence(recs, chrseq, 0)
            # Reverse-complement the sequence if it's on the negative strand
            if recs[0].strand == '-1':
                seq = translate.reverseComplement(seq)
            n_genes += 1
            try:
                cdna = cdnas[id_fxn(sentinel_rec)]
            except KeyError:
                # No reference available: counted separately, not as an error
                n_no_comparison += 1
                continue
            if len(seq) == len(cdna):
                matches = seq == cdna
            elif len(seq) - 3 == len(cdna):
                # Ignore the three extra trailing bases when comparing
                matches = seq[:-3] == cdna
            else:
                matches = False
                n_wrong_length += 1
            if not matches:
                n_errors += 1
        print("# Processed %d genes with %d errors" % (n_genes, n_errors))
        print("# %d length errors" % (n_wrong_length,))
        print("# %d no comparison errors" % (n_no_comparison,))
        print("# %d bad translation errors" % (n_bad_translation,))
        print("# %d different translation errors" % (n_different_translation,))
        total_errors += n_errors
        total_genes += n_genes
    print("# Processed %d genes with %d errors total" % (total_genes, total_errors))
def getSequence(self, raw_seq, n_bases_upstream=0, n_bases_downstream=0):
    """Assemble the coding sequence of this object's exons from ``raw_seq``.

    ``self.exons`` is sorted in place by ``coding_start``.  Each exon then
    contributes ``pullCodingSequence(raw_seq, before, after)``; only the
    first exon receives extra leading bases and only the last exon receives
    extra trailing bases.  When ``self.strand`` is ``'-'`` the upstream and
    downstream amounts are exchanged before extraction and the joined
    result is reverse-complemented.

    Args:
        raw_seq: sequence handed unchanged to each exon's
            ``pullCodingSequence``.
        n_bases_upstream: extra bases requested upstream of the gene.
        n_bases_downstream: extra bases requested downstream of the gene.

    Returns:
        The concatenated sequence, or ``''`` when there are no exons.
    """
    self.exons.sort(key=lambda exon: exon.coding_start)
    on_minus_strand = self.strand == '-'
    if on_minus_strand:
        # Downstream is really upstream on the negative strand
        pad_left, pad_right = n_bases_downstream, n_bases_upstream
    else:
        pad_left, pad_right = n_bases_upstream, n_bases_downstream
    final_index = len(self.exons) - 1
    sequence = ''
    for index, exon in enumerate(self.exons):
        # A lone exon is both first and last, so it receives both paddings
        before = pad_left if index == 0 else 0
        after = pad_right if index == final_index else 0
        sequence += exon.pullCodingSequence(raw_seq, before, after)
    if on_minus_strand:
        sequence = translate.reverseComplement(sequence)
    return sequence
def getSequence(self, raw_seq, n_bases_upstream=0, n_bases_downstream=0):
    """Assemble the coding sequence of this object's exons from ``raw_seq``.

    ``self.exons`` is sorted in place by ``coding_start``.  Each exon then
    contributes ``pullCodingSequence(raw_seq, before, after)``; only the
    first exon receives extra leading bases and only the last exon receives
    extra trailing bases.  When ``self.strand`` is ``'-'`` the upstream and
    downstream amounts are exchanged before extraction and the joined
    result is reverse-complemented.

    Args:
        raw_seq: sequence handed unchanged to each exon's
            ``pullCodingSequence``.
        n_bases_upstream: extra bases requested upstream of the gene.
        n_bases_downstream: extra bases requested downstream of the gene.

    Returns:
        The concatenated sequence, or ``''`` when there are no exons.
    """
    self.exons.sort(key=lambda exon: exon.coding_start)
    on_minus_strand = self.strand == '-'
    if on_minus_strand:
        # Downstream is really upstream on the negative strand
        pad_left, pad_right = n_bases_downstream, n_bases_upstream
    else:
        pad_left, pad_right = n_bases_upstream, n_bases_downstream
    final_index = len(self.exons) - 1
    sequence = ''
    for index, exon in enumerate(self.exons):
        # A lone exon is both first and last, so it receives both paddings
        before = pad_left if index == 0 else 0
        after = pad_right if index == final_index else 0
        sequence += exon.pullCodingSequence(raw_seq, before, after)
    if on_minus_strand:
        sequence = translate.reverseComplement(sequence)
    return sequence
def getStrandAlts(self, strand):
    """Return ``self.alt`` oriented for the requested strand.

    For ``strand == '-'`` a new list holding the reverse complement of every
    entry of ``self.alt`` is returned; for any other value ``self.alt``
    itself is returned.
    """
    if strand != '-':
        return self.alt
    return [translate.reverseComplement(entry) for entry in self.alt]
def getStrandRef(self, strand):
    """Return ``self.ref`` oriented for the requested strand.

    For ``strand == '-'`` the reverse complement of ``self.ref`` is
    returned; for any other value ``self.ref`` is returned unchanged.
    """
    if strand != '-':
        return self.ref
    return translate.reverseComplement(self.ref)
def test002(self):
    """Reverse complement keeps letter case and undoes itself when applied twice."""
    s = "ATGCatgc"
    # assertEqual reports both values on failure, unlike assertTrue(a == b)
    self.assertEqual(translate.reverseComplement(s), "gcatGCAT")
    # Reverse-complementing twice must give back the original sequence
    self.assertEqual(translate.reverseComplement(translate.reverseComplement(s)), s)
def writeGenes(chromosomes, exon_records, chromosome_load_fxn, id_fxn, exclude_nt_from_boundary, outfile):
    """Write the reconstructed coding sequence of every gene as FASTA records.

    For each chromosome, every peptide located on it has its coding sequence
    rebuilt with ``buildCodingSequence``, reverse-complemented when the
    strand is ``'-1'``, and written to ``outfile`` as ``">id\\nsequence\\n"``.
    Sequences whose length is not a multiple of three are skipped and counted
    as length errors.  When ``exclude_nt_from_boundary`` is 0 the sequence
    must also pass ``translate.translate``; a ``translate.BioUtilsError``
    skips the gene and is counted as a bad translation.  Tallies are printed
    to standard output.

    Args:
        chromosomes: iterable of chromosome identifiers to process.
        exon_records: mapping of peptide ID -> mapping whose values are exon
            records exposing ``chromosome``, ``strand`` and ``exon_start``.
        chromosome_load_fxn: callable taking a chromosome identifier and
            returning that chromosome's sequence data.
        id_fxn: callable mapping the sentinel record returned by
            ``buildCodingSequence`` to the identifier used in the header.
        exclude_nt_from_boundary: passed through to ``buildCodingSequence``;
            translation is only verified when it equals 0.
        outfile: object with a ``write`` method receiving the records.

    Returns:
        None.
    """
    total_genes = 0
    total_errors = 0
    for chrom in chromosomes:
        # Single-argument print(...) behaves the same on Python 2 and 3
        print("# Chromosome %s" % (chrom,))
        # Load this chromosome's data
        chrseq = chromosome_load_fxn(chrom)
        n_genes = 0
        n_errors = 0
        n_wrong_length = 0
        n_bad_translation = 0
        for recdict in exon_records.values():
            # list(...) so indexing and in-place sorting also work with
            # Python 3 dictionary views
            recs = list(recdict.values())
            # Skip peptides without records or located on another chromosome
            if not recs or recs[0].chromosome != chrom:
                continue
            recs.sort(key=lambda e: e.exon_start)
            (seq, sentinel_rec) = buildCodingSequence(recs, chrseq, exclude_nt_from_boundary)
            # Reverse-complement the sequence if it's on the negative strand
            if recs[0].strand == '-1':
                seq = translate.reverseComplement(seq)
            n_genes += 1
            if len(seq) % 3 != 0:
                n_errors += 1
                n_wrong_length += 1
                continue
            if exclude_nt_from_boundary == 0:
                try:
                    translate.translate(seq)
                except translate.BioUtilsError:
                    # Previously skipped silently, which left the reported
                    # bad-translation tally permanently at zero.
                    n_errors += 1
                    n_bad_translation += 1
                    continue
            # Translation is good... write it.
            outfile.write(">%s\n%s\n" % (id_fxn(sentinel_rec), seq))
        print("# Processed %d genes with %d errors" % (n_genes, n_errors))
        print("# %d length errors" % (n_wrong_length,))
        print("# %d bad translation errors" % (n_bad_translation,))
        total_errors += n_errors
        total_genes += n_genes
    # NOTE(review): the first figure is the number of genes processed, which
    # includes skipped ones -- confirm whether "Wrote" should exclude them.
    print("# Wrote %d genes with %d errors total" % (total_genes, total_errors))
def writeStats(chromosomes, peptide_records, bounds, chromosome_load_fxn, outfile):
    """Write one tab-separated line of coding/intron statistics per gene.

    A header line is written to ``outfile`` and echoed to standard output.
    For each chromosome, every peptide located on it has its coding and
    intron sequences rebuilt (``buildCodingSequence`` /
    ``buildIntronSequence``), reverse-complemented when the strand is
    ``'-1'``, and summarised in the columns named by the header: gene,
    peptide and transcript IDs, chromosome, intron length, number of coding
    exons, coding fraction, three intron composition values, coding length
    and one ``frac.<bound>`` column per entry of ``bounds``.  Values that
    cannot be computed are written as ``NA``.  ``outfile`` is flushed after
    every gene and closed before returning.

    Args:
        chromosomes: iterable of chromosome identifiers to process.
        peptide_records: mapping of peptide ID -> mapping whose values are
            exon records exposing ``chromosome``, ``strand``, ``exon_start``,
            ``coding_start`` and ``coding_end``.
        bounds: iterable of boundary values passed one at a time to
            ``codingOutsideBoundary``.
        chromosome_load_fxn: callable taking a chromosome identifier and
            returning that chromosome's sequence data.
        outfile: object with ``write``, ``flush`` and ``close`` methods.

    Returns:
        None.
    """
    # Local import keeps this function self-contained; sys.stdout.write
    # echoes the header without adding a second newline on Python 2 and 3.
    import sys

    header = "gene\tpep\ttrans\tchr\tlen.intron\tn.exons\tfrac.coding\tgc.intron\tgcind.intron\tgt.intron\tlen.coding\tfrac.%s\n" % \
        ('\tfrac.'.join([str(b) for b in bounds]),)
    outfile.write(header)
    sys.stdout.write(header)
    total_genes = 0
    total_errors = 0
    for chrom in chromosomes:
        print("# Chromosome %s" % (chrom,))
        # Load this chromosome's data
        chrseq = chromosome_load_fxn(chrom)
        n_genes = 0
        # The error counters are never incremented here; they are reported
        # as zero so the summary keeps its established layout.
        n_errors = 0
        n_wrong_length = 0
        n_bad_translation = 0
        n_different_translation = 0
        for recdict in peptide_records.values():
            # list(...) so indexing and in-place sorting also work with
            # Python 3 dictionary views
            recs = list(recdict.values())
            # Skip peptides without records or located on another chromosome
            if not recs or recs[0].chromosome != chrom:
                continue
            recs.sort(key=lambda e: e.exon_start)
            (seq, sentinel_rec) = buildCodingSequence(recs, chrseq, 0)
            (intron_seq, _) = buildIntronSequence(recs, chrseq)
            # Reverse-complement the sequences if on the negative strand
            if recs[0].strand == '-1':
                seq = translate.reverseComplement(seq)
                intron_seq = translate.reverseComplement(intron_seq)
            n_genes += 1

            intron_length = len(intron_seq)
            if intron_length > 0:
                gc_intron = '%1.4f' % cai.getGC(intron_seq)
                frac_coding = '%1.4f' % (len(seq) / (len(seq) + float(intron_length)),)
                gcind_intron = '%1.4f' % cai.getDinucleotideIndex(intron_seq, 'GC')
                gt_intron = '%1.4f' % cai.getContent(intron_seq, 'GT')
            else:
                gc_intron = 'NA'
                gcind_intron = 'NA'
                # Previously left unset here, which raised NameError for a
                # leading intronless gene or reused the prior gene's value.
                gt_intron = 'NA'
                frac_coding = '1.0' if len(seq) > 0 else 'NA'
            num_coding_exons = len([xr for xr in recs if xr.coding_end > xr.coding_start])

            fracs_inside = []
            for bound in bounds:
                (coding_outside, total_coding) = codingOutsideBoundary(recs, bound)
                if total_coding > 0:
                    fracs_inside.append("%1.4f" % (1 - float(coding_outside) / total_coding,))
                else:
                    fracs_inside.append("NA")

            line = "%s\t%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%s\n" % \
                (sentinel_rec.gene_ID, sentinel_rec.peptide_ID, sentinel_rec.transcript_ID,
                 sentinel_rec.chromosome, intron_length, num_coding_exons, frac_coding,
                 gc_intron, gcind_intron, gt_intron, len(seq), '\t'.join(fracs_inside))
            outfile.write(line)
            outfile.flush()
        print("# Processed %d genes with %d errors" % (n_genes, n_errors))
        print("# %d length errors" % (n_wrong_length,))
        print("# %d bad translation errors" % (n_bad_translation,))
        print("# %d different translation errors" % (n_different_translation,))
        total_errors += n_errors
        total_genes += n_genes
        # Drop the reference before the next chromosome is loaded
        chrseq = None
    print("# Processed %d genes with %d errors total" % (total_genes, total_errors))
    outfile.close()
def test002(self):
    """Reverse complement keeps letter case and undoes itself when applied twice."""
    s = 'ATGCatgc'
    # assertEqual reports both values on failure, unlike assertTrue(a == b)
    self.assertEqual(translate.reverseComplement(s), 'gcatGCAT')
    # Reverse-complementing twice must give back the original sequence
    self.assertEqual(translate.reverseComplement(translate.reverseComplement(s)), s)
def writeGenes(chromosomes, exon_records, chromosome_load_fxn, id_fxn, exclude_nt_from_boundary, outfile):
    """Write the reconstructed coding sequence of every gene as FASTA records.

    For each chromosome, every peptide located on it has its coding sequence
    rebuilt with ``buildCodingSequence``, reverse-complemented when the
    strand is ``'-1'``, and written to ``outfile`` as ``">id\\nsequence\\n"``.
    Sequences whose length is not a multiple of three are skipped and counted
    as length errors.  When ``exclude_nt_from_boundary`` is 0 the sequence
    must also pass ``translate.translate``; a ``translate.BioUtilsError``
    skips the gene and is counted as a bad translation.  Tallies are printed
    to standard output.

    Args:
        chromosomes: iterable of chromosome identifiers to process.
        exon_records: mapping of peptide ID -> mapping whose values are exon
            records exposing ``chromosome``, ``strand`` and ``exon_start``.
        chromosome_load_fxn: callable taking a chromosome identifier and
            returning that chromosome's sequence data.
        id_fxn: callable mapping the sentinel record returned by
            ``buildCodingSequence`` to the identifier used in the header.
        exclude_nt_from_boundary: passed through to ``buildCodingSequence``;
            translation is only verified when it equals 0.
        outfile: object with a ``write`` method receiving the records.

    Returns:
        None.
    """
    total_genes = 0
    total_errors = 0
    for chrom in chromosomes:
        # Single-argument print(...) behaves the same on Python 2 and 3
        print("# Chromosome %s" % (chrom,))
        # Load this chromosome's data
        chrseq = chromosome_load_fxn(chrom)
        n_genes = 0
        n_errors = 0
        n_wrong_length = 0
        n_bad_translation = 0
        for recdict in exon_records.values():
            # list(...) so indexing and in-place sorting also work with
            # Python 3 dictionary views
            recs = list(recdict.values())
            # Skip peptides without records or located on another chromosome
            if not recs or recs[0].chromosome != chrom:
                continue
            recs.sort(key=lambda e: e.exon_start)
            (seq, sentinel_rec) = buildCodingSequence(recs, chrseq, exclude_nt_from_boundary)
            # Reverse-complement the sequence if it's on the negative strand
            if recs[0].strand == '-1':
                seq = translate.reverseComplement(seq)
            n_genes += 1
            if len(seq) % 3 != 0:
                n_errors += 1
                n_wrong_length += 1
                continue
            if exclude_nt_from_boundary == 0:
                try:
                    translate.translate(seq)
                except translate.BioUtilsError:
                    # Previously skipped silently, which left the reported
                    # bad-translation tally permanently at zero.
                    n_errors += 1
                    n_bad_translation += 1
                    continue
            # Translation is good... write it.
            outfile.write(">%s\n%s\n" % (id_fxn(sentinel_rec), seq))
        print("# Processed %d genes with %d errors" % (n_genes, n_errors))
        print("# %d length errors" % (n_wrong_length,))
        print("# %d bad translation errors" % (n_bad_translation,))
        total_errors += n_errors
        total_genes += n_genes
    # NOTE(review): the first figure is the number of genes processed, which
    # includes skipped ones -- confirm whether "Wrote" should exclude them.
    print("# Wrote %d genes with %d errors total" % (total_genes, total_errors))
def writeStats(chromosomes, peptide_records, bounds, chromosome_load_fxn, outfile):
    """Write one tab-separated line of coding/intron statistics per gene.

    A header line is written to ``outfile`` and echoed to standard output.
    For each chromosome, every peptide located on it has its coding and
    intron sequences rebuilt (``buildCodingSequence`` /
    ``buildIntronSequence``), reverse-complemented when the strand is
    ``'-1'``, and summarised in the columns named by the header: gene,
    peptide and transcript IDs, chromosome, intron length, number of coding
    exons, coding fraction, three intron composition values, coding length
    and one ``frac.<bound>`` column per entry of ``bounds``.  Values that
    cannot be computed are written as ``NA``.  ``outfile`` is flushed after
    every gene and closed before returning.

    Args:
        chromosomes: iterable of chromosome identifiers to process.
        peptide_records: mapping of peptide ID -> mapping whose values are
            exon records exposing ``chromosome``, ``strand``, ``exon_start``,
            ``coding_start`` and ``coding_end``.
        bounds: iterable of boundary values passed one at a time to
            ``codingOutsideBoundary``.
        chromosome_load_fxn: callable taking a chromosome identifier and
            returning that chromosome's sequence data.
        outfile: object with ``write``, ``flush`` and ``close`` methods.

    Returns:
        None.
    """
    # Local import keeps this function self-contained; sys.stdout.write
    # echoes the header without adding a second newline on Python 2 and 3.
    import sys

    header = "gene\tpep\ttrans\tchr\tlen.intron\tn.exons\tfrac.coding\tgc.intron\tgcind.intron\tgt.intron\tlen.coding\tfrac.%s\n" % \
        ('\tfrac.'.join([str(b) for b in bounds]),)
    outfile.write(header)
    sys.stdout.write(header)
    total_genes = 0
    total_errors = 0
    for chrom in chromosomes:
        print("# Chromosome %s" % (chrom,))
        # Load this chromosome's data
        chrseq = chromosome_load_fxn(chrom)
        n_genes = 0
        # The error counters are never incremented here; they are reported
        # as zero so the summary keeps its established layout.
        n_errors = 0
        n_wrong_length = 0
        n_bad_translation = 0
        n_different_translation = 0
        for recdict in peptide_records.values():
            # list(...) so indexing and in-place sorting also work with
            # Python 3 dictionary views
            recs = list(recdict.values())
            # Skip peptides without records or located on another chromosome
            if not recs or recs[0].chromosome != chrom:
                continue
            recs.sort(key=lambda e: e.exon_start)
            (seq, sentinel_rec) = buildCodingSequence(recs, chrseq, 0)
            (intron_seq, _) = buildIntronSequence(recs, chrseq)
            # Reverse-complement the sequences if on the negative strand
            if recs[0].strand == '-1':
                seq = translate.reverseComplement(seq)
                intron_seq = translate.reverseComplement(intron_seq)
            n_genes += 1

            intron_length = len(intron_seq)
            if intron_length > 0:
                gc_intron = '%1.4f' % cai.getGC(intron_seq)
                frac_coding = '%1.4f' % (len(seq) / (len(seq) + float(intron_length)),)
                gcind_intron = '%1.4f' % cai.getDinucleotideIndex(intron_seq, 'GC')
                gt_intron = '%1.4f' % cai.getContent(intron_seq, 'GT')
            else:
                gc_intron = 'NA'
                gcind_intron = 'NA'
                # Previously left unset here, which raised NameError for a
                # leading intronless gene or reused the prior gene's value.
                gt_intron = 'NA'
                frac_coding = '1.0' if len(seq) > 0 else 'NA'
            num_coding_exons = len([xr for xr in recs if xr.coding_end > xr.coding_start])

            fracs_inside = []
            for bound in bounds:
                (coding_outside, total_coding) = codingOutsideBoundary(recs, bound)
                if total_coding > 0:
                    fracs_inside.append("%1.4f" % (1 - float(coding_outside) / total_coding,))
                else:
                    fracs_inside.append("NA")

            line = "%s\t%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%s\n" % \
                (sentinel_rec.gene_ID, sentinel_rec.peptide_ID, sentinel_rec.transcript_ID,
                 sentinel_rec.chromosome, intron_length, num_coding_exons, frac_coding,
                 gc_intron, gcind_intron, gt_intron, len(seq), '\t'.join(fracs_inside))
            outfile.write(line)
            outfile.flush()
        print("# Processed %d genes with %d errors" % (n_genes, n_errors))
        print("# %d length errors" % (n_wrong_length,))
        print("# %d bad translation errors" % (n_bad_translation,))
        print("# %d different translation errors" % (n_different_translation,))
        total_errors += n_errors
        total_genes += n_genes
        # Drop the reference before the next chromosome is loaded
        chrseq = None
    print("# Processed %d genes with %d errors total" % (total_genes, total_errors))
    outfile.close()