# NOTE(review): this fragment was recovered from a whitespace-collapsed source;
# the nesting below is reconstructed from syntax and should be confirmed
# against the original layout.  `options`, `seqs` and `info_outs` are bound
# outside the visible part of the file.

# Fetch the species-specific codon-usage data from the cai module.
opt_codons = cai.getOptimalCodons(options.species)
relad_dict = cai.getRelativeAdaptivenessValues(options.species)
cai_fxn = cai.getCAIFunction(options.species)
# DAD: cai.getCAI takes only log-transformed relative adaptiveness values.
# Here, knowing that some values are == 0, replace every non-positive value
# with half of the minimum nonzero value so that math.log() below is defined.
# Should be using some sort of better estimator!
# min() raises ValueError if no value is strictly positive.
min_relad_value = 0.5 * min([v for v in relad_dict.values() if v > 0.0])
for k in relad_dict.keys():
    if relad_dict[k] <= 0.0:
        relad_dict[k] = min_relad_value
# NOTE(review): ln_relad_dict is not referenced in the visible remainder of
# this fragment (the CAI values below come from cai_fxn) -- confirm it is
# used further down before removing it.
ln_relad_dict = dict([(k, math.log(v)) for (k, v) in relad_dict.items()])
# Assay the provided sequences: one report line per (id, seq) pair giving
# Fop, CAI and GC as returned by the cai helpers.
for (id, seq) in seqs:
    line = "{0} Fop = {1:.4f}, CAI = {2:.4f}, GC = {3:.2f}\n".format(
        id, cai.getFop(seq, opt_codons), cai_fxn(seq), cai.getGC(seq))
    info_outs.write(line)
# If optimization is desired, do it.
if options.optimize:
    info_outs.write("# Optimizing sequences...\n")
    gc = translate.geneticCode(rna=False)
    # NOTE(review): codons is not used in the visible part of this fragment.
    codons = {}
    # Invert the genetic code over the optimal codons: gc[c] -> c.
    opt_codon_dict = dict([(gc[c], c) for c in opt_codons])
    # W and M are set explicitly -- presumably because these amino acids
    # have no entry among opt_codons; TODO confirm.
    opt_codon_dict['W'] = 'TGG'
    opt_codon_dict['M'] = 'ATG'
    opt_headers = []
    opt_seqs = []
    # optimize the codon sequences
    # (the body of this loop lies outside the visible part of the file)
    for (id, seq) in seqs:
def writeStats(chromosomes, peptide_records, bounds, chromosome_load_fxn, outfile):
    """Write one tab-separated row of intron/coding statistics per peptide.

    A header line is written to ``outfile`` and echoed to standard output.
    Then, for every chromosome in ``chromosomes``, the chromosome sequence is
    loaded with ``chromosome_load_fxn`` and every entry of ``peptide_records``
    whose first record reports that chromosome is processed: its records are
    sorted by ``exon_start``, the coding and intron sequences are assembled
    with ``buildCodingSequence`` / ``buildIntronSequence`` (reverse-complemented
    when the strand is the string ``'-1'``), and a row is written and flushed.
    Per-chromosome and overall summaries are printed to standard output.

    Args:
        chromosomes: iterable of chromosome identifiers; each one is passed to
            ``chromosome_load_fxn`` and compared against ``record.chromosome``.
        peptide_records: dict mapping a peptide id to a dict of exon records.
            Records are read through the attributes ``chromosome``,
            ``exon_start``, ``strand``, ``coding_start`` and ``coding_end``.
            Entries with no records are skipped.
        bounds: sequence of boundary values; each produces one ``frac.<bound>``
            column computed from ``codingOutsideBoundary(recs, bound)``.
        chromosome_load_fxn: callable returning the sequence data for one
            chromosome identifier.
        outfile: writable file-like object.  It is flushed after every row and
            CLOSED before this function returns.

    Columns whose value cannot be computed hold the string ``NA``: the three
    intron composition columns when there is no intron sequence,
    ``frac.coding`` when both sequences are empty, and a ``frac.<bound>``
    column when the total coding length is zero.
    """
    # Local import keeps the block self-contained; sys is needed only to echo
    # the header verbatim (it already ends in a newline).
    import sys

    header = "gene\tpep\ttrans\tchr\tlen.intron\tn.exons\tfrac.coding\tgc.intron\tgcind.intron\tgt.intron\tlen.coding\tfrac.%s\n" % \
        ('\tfrac.'.join([str(b) for b in bounds]),)
    outfile.write(header)
    sys.stdout.write(header)

    total_genes = 0
    total_errors = 0
    # Reconstruct genes, one chromosome at a time.
    for chrom in chromosomes:
        print("# Chromosome %s" % (chrom,))
        # Load this chromosome's data
        chrseq = chromosome_load_fxn(chrom)
        n_genes = 0
        # The error counters are reported below but nothing in this function
        # increments them; they are kept so the summary output is unchanged.
        n_errors = 0
        n_wrong_length = 0
        n_bad_translation = 0
        n_different_translation = 0

        for recdict in peptide_records.values():
            # list() gives a sortable copy on both Python 2 and Python 3.
            recs = list(recdict.values())
            # Skip peptides with no records (recs[0] would raise) and those
            # located on a different chromosome.
            if not recs or recs[0].chromosome != chrom:
                continue

            recs.sort(key=lambda e: e.exon_start)
            (seq, sentinel_rec) = buildCodingSequence(recs, chrseq, 0)
            (intron_seq, _) = buildIntronSequence(recs, chrseq)
            # Reverse-complement the sequences if the gene is on the negative strand
            if recs[0].strand == '-1':
                seq = translate.reverseComplement(seq)
                intron_seq = translate.reverseComplement(intron_seq)
            n_genes += 1

            intron_length = len(intron_seq)
            if intron_length > 0:
                gc_intron = '%1.4f' % cai.getGC(intron_seq)
                frac_coding = '%1.4f' % (len(seq) / (len(seq) + float(intron_length)),)
                gcind_intron = '%1.4f' % cai.getDinucleotideIndex(intron_seq, 'GC')
                gt_intron = '%1.4f' % cai.getContent(intron_seq, 'GT')
            else:
                # No intron sequence: every intron composition column is NA.
                # gt_intron must be set here too, otherwise the row below
                # either fails or silently reuses the previous gene's value.
                gc_intron = 'NA'
                gcind_intron = 'NA'
                gt_intron = 'NA'
                frac_coding = '1.0' if len(seq) > 0 else 'NA'

            num_coding_exons = len([xr for xr in recs if xr.coding_end > xr.coding_start])

            # Fraction of coding sequence lying inside each boundary.
            fracs_inside = []
            for bound in bounds:
                (coding_outside, total_coding) = codingOutsideBoundary(recs, bound)
                if total_coding > 0:
                    frac_inside_bound = "%1.4f" % (1 - float(coding_outside) / total_coding,)
                else:
                    frac_inside_bound = "NA"
                fracs_inside.append(frac_inside_bound)

            line = "%s\t%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%s\n" % \
                (sentinel_rec.gene_ID, sentinel_rec.peptide_ID, sentinel_rec.transcript_ID,
                 sentinel_rec.chromosome, intron_length, num_coding_exons, frac_coding,
                 gc_intron, gcind_intron, gt_intron, len(seq), '\t'.join(fracs_inside))
            outfile.write(line)
            outfile.flush()

        print("# Processed %d genes with %d errors" % (n_genes, n_errors))
        print("# %d length errors" % (n_wrong_length,))
        print("# %d bad translation errors" % (n_bad_translation,))
        print("# %d different translation errors" % (n_different_translation,))
        total_errors += n_errors
        total_genes += n_genes
        # Drop the reference to this chromosome's data before the next load.
        chrseq = None

    print("# Processed %d genes with %d errors total" % (total_genes, total_errors))
    outfile.close()
def writeStats(chromosomes, peptide_records, bounds, chromosome_load_fxn, outfile):
    """Write one tab-separated row of intron/coding statistics per peptide.

    A header line is written to ``outfile`` and echoed to standard output.
    Then, for every chromosome in ``chromosomes``, the chromosome sequence is
    loaded with ``chromosome_load_fxn`` and every entry of ``peptide_records``
    whose first record reports that chromosome is processed: its records are
    sorted by ``exon_start``, the coding and intron sequences are assembled
    with ``buildCodingSequence`` / ``buildIntronSequence`` (reverse-complemented
    when the strand is the string ``'-1'``), and a row is written and flushed.
    Per-chromosome and overall summaries are printed to standard output.

    Args:
        chromosomes: iterable of chromosome identifiers; each one is passed to
            ``chromosome_load_fxn`` and compared against ``record.chromosome``.
        peptide_records: dict mapping a peptide id to a dict of exon records.
            Records are read through the attributes ``chromosome``,
            ``exon_start``, ``strand``, ``coding_start`` and ``coding_end``.
            Entries with no records are skipped.
        bounds: sequence of boundary values; each produces one ``frac.<bound>``
            column computed from ``codingOutsideBoundary(recs, bound)``.
        chromosome_load_fxn: callable returning the sequence data for one
            chromosome identifier.
        outfile: writable file-like object.  It is flushed after every row and
            CLOSED before this function returns.

    Columns whose value cannot be computed hold the string ``NA``: the three
    intron composition columns when there is no intron sequence,
    ``frac.coding`` when both sequences are empty, and a ``frac.<bound>``
    column when the total coding length is zero.
    """
    # Local import keeps the block self-contained; sys is needed only to echo
    # the header verbatim (it already ends in a newline).
    import sys

    header = "gene\tpep\ttrans\tchr\tlen.intron\tn.exons\tfrac.coding\tgc.intron\tgcind.intron\tgt.intron\tlen.coding\tfrac.%s\n" % \
        ('\tfrac.'.join([str(b) for b in bounds]),)
    outfile.write(header)
    sys.stdout.write(header)

    total_genes = 0
    total_errors = 0
    # Reconstruct genes, one chromosome at a time.
    for chrom in chromosomes:
        print("# Chromosome %s" % (chrom,))
        # Load this chromosome's data
        chrseq = chromosome_load_fxn(chrom)
        n_genes = 0
        # The error counters are reported below but nothing in this function
        # increments them; they are kept so the summary output is unchanged.
        n_errors = 0
        n_wrong_length = 0
        n_bad_translation = 0
        n_different_translation = 0

        for recdict in peptide_records.values():
            # list() gives a sortable copy on both Python 2 and Python 3.
            recs = list(recdict.values())
            # Skip peptides with no records (recs[0] would raise) and those
            # located on a different chromosome.
            if not recs or recs[0].chromosome != chrom:
                continue

            recs.sort(key=lambda e: e.exon_start)
            (seq, sentinel_rec) = buildCodingSequence(recs, chrseq, 0)
            (intron_seq, _) = buildIntronSequence(recs, chrseq)
            # Reverse-complement the sequences if the gene is on the negative strand
            if recs[0].strand == '-1':
                seq = translate.reverseComplement(seq)
                intron_seq = translate.reverseComplement(intron_seq)
            n_genes += 1

            intron_length = len(intron_seq)
            if intron_length > 0:
                gc_intron = '%1.4f' % cai.getGC(intron_seq)
                frac_coding = '%1.4f' % (len(seq) / (len(seq) + float(intron_length)),)
                gcind_intron = '%1.4f' % cai.getDinucleotideIndex(intron_seq, 'GC')
                gt_intron = '%1.4f' % cai.getContent(intron_seq, 'GT')
            else:
                # No intron sequence: every intron composition column is NA.
                # gt_intron must be set here too, otherwise the row below
                # either fails or silently reuses the previous gene's value.
                gc_intron = 'NA'
                gcind_intron = 'NA'
                gt_intron = 'NA'
                frac_coding = '1.0' if len(seq) > 0 else 'NA'

            num_coding_exons = len([xr for xr in recs if xr.coding_end > xr.coding_start])

            # Fraction of coding sequence lying inside each boundary.
            fracs_inside = []
            for bound in bounds:
                (coding_outside, total_coding) = codingOutsideBoundary(recs, bound)
                if total_coding > 0:
                    frac_inside_bound = "%1.4f" % (1 - float(coding_outside) / total_coding,)
                else:
                    frac_inside_bound = "NA"
                fracs_inside.append(frac_inside_bound)

            line = "%s\t%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%s\n" % \
                (sentinel_rec.gene_ID, sentinel_rec.peptide_ID, sentinel_rec.transcript_ID,
                 sentinel_rec.chromosome, intron_length, num_coding_exons, frac_coding,
                 gc_intron, gcind_intron, gt_intron, len(seq), '\t'.join(fracs_inside))
            outfile.write(line)
            outfile.flush()

        print("# Processed %d genes with %d errors" % (n_genes, n_errors))
        print("# %d length errors" % (n_wrong_length,))
        print("# %d bad translation errors" % (n_bad_translation,))
        print("# %d different translation errors" % (n_different_translation,))
        total_errors += n_errors
        total_genes += n_genes
        # Drop the reference to this chromosome's data before the next load.
        chrseq = None

    print("# Processed %d genes with %d errors total" % (total_genes, total_errors))
    outfile.close()
# NOTE(review): this fragment was recovered from a whitespace-collapsed source;
# the nesting below is reconstructed from syntax and should be confirmed
# against the original layout.  `options`, `seqs` and `info_outs` are bound
# outside the visible part of the file.

# Record which species' codon table is in use, then fetch the
# species-specific codon-usage data from the cai module.
info_outs.write("# Using optimal codons for {}\n".format(options.species))
opt_codons = cai.getOptimalCodons(options.species)
relad_dict = cai.getRelativeAdaptivenessValues(options.species)
cai_fxn = cai.getCAIFunction(options.species)
# DAD: cai.getCAI takes only log-transformed relative adaptiveness values.
# Here, knowing that some values are == 0, replace every non-positive value
# with half of the minimum nonzero value so that math.log() below is defined.
# Should be using some sort of better estimator!
# min() raises ValueError if no value is strictly positive.
min_relad_value = 0.5 * min([v for v in relad_dict.values() if v>0.0])
for k in relad_dict.keys():
    if relad_dict[k] <= 0.0:
        relad_dict[k] = min_relad_value
# NOTE(review): ln_relad_dict is not referenced in the visible remainder of
# this fragment (the CAI values below come from cai_fxn) -- confirm it is
# used further down before removing it.
ln_relad_dict = dict([(k,math.log(v)) for (k,v) in relad_dict.items()])
# Assay the provided sequences: one report line per (id, seq) pair giving
# Fop, CAI and GC as returned by the cai helpers.
for (id, seq) in seqs:
    line = "{0} Fop = {1:.4f}, CAI = {2:.4f}, GC = {3:.2f}\n".format(id, cai.getFop(seq, opt_codons), cai_fxn(seq), cai.getGC(seq))
    info_outs.write(line)
# If optimization is desired, do it.
if options.optimize:
    info_outs.write("# Optimizing sequences...\n")
    gc = translate.geneticCode(rna=False)
    # NOTE(review): codons is not used in the visible part of this fragment.
    codons = {}
    # Invert the genetic code over the optimal codons: gc[c] -> c.
    opt_codon_dict = dict([(gc[c],c) for c in opt_codons])
    # W and M are set explicitly -- presumably because these amino acids
    # have no entry among opt_codons; TODO confirm.
    opt_codon_dict['W'] = 'TGG'
    opt_codon_dict['M'] = 'ATG'
    opt_headers = []
    opt_seqs = []
    # optimize the codon sequences
    # (the body of this loop lies outside the visible part of the file)
    for (id, seq) in seqs: