def getKL(self, usage):
    """return Kullback-Leibler Divergence (relative entropy) of sequences
    with respect to reference codon usage.

    Computes ``sum(usage[c] * log(usage[c] / freqs[c]))`` over every codon
    ``c`` in ``self.mCodonCounts``, where ``freqs`` is obtained from
    ``Genomics.CalculateCodonFrequenciesFromCounts(self.mCodonCounts,
    self.mPseudoCounts)``. The natural logarithm is used.

    Codons whose reference usage is zero contribute nothing to the sum
    (the usual ``0 * log(0) = 0`` convention); previously they raised a
    ``ValueError`` from ``math.log``.

    usage
        mapping of codon to reference usage; it is indexed with every
        codon present in ``self.mCodonCounts``.

    Raises ``KeyError`` if a codon is missing from *usage* or from the
    computed frequencies, and ``ZeroDivisionError`` if a codon with
    non-zero usage has a computed frequency of zero.
    """
    freqs = Genomics.CalculateCodonFrequenciesFromCounts(
        self.mCodonCounts, self.mPseudoCounts)

    divergence = 0
    for codon in self.mCodonCounts.keys():
        reference = usage[codon]
        if reference == 0:
            # 0 * log(0 / q) is taken as 0; math.log(0) would raise.
            continue
        divergence += reference * math.log(reference / freqs[codon])
    return divergence
def getEntropy(self, usage=None):
    """return the entropy of the observed codon frequencies, encoded in
    terms of a reference usage (conditional entropy / encoding cost).

    The observed frequencies come from
    ``Genomics.CalculateCodonFrequenciesFromCounts(self.mCodonCounts,
    self.mPseudoCounts)``. The result is
    ``-sum(freqs[c] * log(usage[c]))`` over all codons in
    ``self.mCodonCounts`` (natural logarithm).

    usage
        mapping of codon to reference usage. If it is None, the observed
        frequencies themselves are used, which yields the plain entropy.

    Note from the original author: this is a sum over 20 entropies, one
    for each amino acid (presumably because the frequencies are
    normalised per amino acid -- depends on the Genomics helper).
    """
    observed = Genomics.CalculateCodonFrequenciesFromCounts(
        self.mCodonCounts, self.mPseudoCounts)

    reference = observed if usage is None else usage

    total = 0
    for codon in self.mCodonCounts.keys():
        total = total - observed[codon] * math.log(reference[codon])
    return total
def updateProperties(self):
    """run ``SequencePropertiesCodons.updateProperties`` on this instance
    and then recompute ``self.mCodonFrequencies`` from
    ``self.mCodonCounts``.

    No pseudocounts argument is passed to the frequency calculation here.
    """
    SequencePropertiesCodons.updateProperties(self)

    codon_counts = self.mCodonCounts
    self.mCodonFrequencies = \
        Genomics.CalculateCodonFrequenciesFromCounts(codon_counts)
def _write_codon_rows(outfile, table):
    """write *table* (codon -> number) as tab-separated rows sorted by codon."""
    for codon in sorted(table):
        outfile.write("%s\t%f\n" % (codon, table[codon]))


def _write_codon_matrix(outfile, weights):
    """write *weights* as comma-separated rows in OUTPUT_ORDER_CODON_MATRIX order."""
    for row in OUTPUT_ORDER_CODON_MATRIX:
        outfile.write(",".join(
            "%5.3f" % weights[codon.upper()] for codon in row))
        outfile.write("\n")


def _passes_cai_filter(data, cai_threshold, reverse_set):
    """return True if a gene record is kept by the CAICLASS threshold filter."""
    if reverse_set:
        return not data["CAICLASS"] >= cai_threshold
    return not data["CAICLASS"] < cai_threshold


def _count_codons(gene_info, codons, keep):
    """sum ``int(frequency * GENELENGTH)`` per codon over genes accepted by *keep*."""
    counts = dict((codon, 0) for codon in codons)
    for genename, data in gene_info.items():
        if not keep(genename, data):
            continue
        gene_length = data["GENELENGTH"]
        for codon in codons:
            counts[codon] += int(data[codon] * gene_length)
    return counts


def _absolute_frequencies(counts):
    """normalise *counts* by their grand total (not per amino acid)."""
    total = sum(counts.values())
    return dict((codon, float(count) / total)
                for codon, count in counts.items())


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Depending on the options, the script

    * converts a weights matrix read from stdin into a codon table
      (``--weights-matrix2table``) and exits,
    * dumps reference CAI weights for a species (``--reference``),
    * prints selected columns of the genes table (``--select``),
    * prints a codon weight/count/frequency table (``--weights``),
    * prints codon usage (``--codon-usage``).

    Raises ValueError for an unknown species, an unknown ``--weights`` or
    ``--codon-usage`` value, a sequence whose length is not a multiple of
    three, or ``--weights=subset`` without ``--input-file-subset``.

    NOTE(review): *argv* is defaulted to sys.argv but is not forwarded to
    ``E.Start``; confirm whether ``E.Start`` should receive it.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: codonbias_acai2tsv.py 865 2007-01-15 13:44:43Z andreas $"
    )

    parser.add_option("-o", "--input-file-trace", dest="input_filename_trace",
                      type="string",
                      help="input filename for cai.",
                      metavar="FILE")

    parser.add_option("-e", "--input-file-genes", dest="input_filename_genes",
                      type="string",
                      help="input filename for genes information from cai.",
                      metavar="FILE")

    parser.add_option("-c", "--input-file-codons",
                      dest="input_filename_codons", type="string",
                      help="input filename for codon usage information.",
                      metavar="FILE")

    parser.add_option("--input-file-sequences",
                      dest="input_filename_sequences", type="string",
                      help="input filename with sequences.",
                      metavar="FILE")

    parser.add_option("-t", "--input-file-subset",
                      dest="input_filename_subset", type="string",
                      help="input filename with subset.",
                      metavar="FILE")

    parser.add_option("--codon-table-format", dest="codon_table_format",
                      type="choice", choices=("list", "matrix"),
                      help="output options for output codon tables.")

    parser.add_option("--codon-table-type", dest="codon_table_type",
                      type="choice",
                      choices=("counts", "frequencies", "weights",
                               "absolute-frequencies"),
                      help="type of codon table.")

    parser.add_option("-r", "--reference", dest="reference", type="string",
                      help="dump CAI reference weights for species.")

    parser.add_option("-s", "--select", dest="select", type="string",
                      help="fields to select from genes table.")

    parser.add_option("-m", "--map", dest="input_filename_map", type="string",
                      help="filename with mapping information for gene names.",
                      metavar="FILE")

    parser.add_option("-i", "--invert-map", dest="invert_map",
                      action="store_true",
                      help="invert map.")

    parser.add_option(
        "-d", "--dominant-set", dest="dominant_set", type="float",
        help="only print out dominant set (# fraction of most biased genes).")

    parser.add_option(
        "--reverse-set", dest="reverse_set", action="store_true",
        help="print the reverse set, i.e., then non-dominant set.")

    parser.add_option(
        "-u", "--codon-usage", dest="codon_usage", type="string",
        help="print codon usage for the full/biased set of genes [full|biased]."
    )

    # The help text lists the values the dispatch below actually accepts.
    parser.add_option(
        "-w", "--weights", dest="weights", type="string",
        help="print weights [compute-counts|compute-weights|"
        "compute-frequencies|final-list|final-matrix|random|"
        "uniform-weights|uniform-frequencies|counts|frequencies|"
        "absolute-frequencies|subset]."
    )

    parser.add_option("--weights-matrix2table", dest="weights_matrix2table",
                      action="store_true",
                      help="convert a weights matrix to a weights table.")

    parser.add_option("--get-preferred-codons", dest="get_preferred_codons",
                      type="string",
                      help="compute overview of preferred codons.")

    parser.set_defaults(
        input_filename="-",
        input_filename_trace=None,
        input_filename_genes=None,
        input_filename_codons=None,
        input_filename_map=None,
        input_filename_subset=None,
        input_filename_sequences=None,
        invert_map=False,
        select=None,
        codon_usage=None,
        weights=None,
        # was misspelled "revserse_set", so the default never applied
        reverse_set=False,
        pseudocounts=1,
        codon_table_format="list",
        codon_table_type="weights",
        weights_matrix2table=False,
        random_size=1000,
        get_preferred_codons=None,
        dominant_set=0.0)

    (options, args) = E.Start(parser)

    if options.select:
        options.select = options.select.split(",")

    outfile = options.stdout

    ###################################################################
    # convert weights table to a codon table
    if options.weights_matrix2table:
        values = []
        for line in options.stdin.readlines():
            if line[0] == "#":
                continue
            # rstrip instead of [:-1]: a final line without a newline must
            # not lose its last character.
            values.extend(float(v) for v in line.rstrip("\n").split(","))

        weights = {}
        position = 0
        for row in OUTPUT_ORDER_CODON_MATRIX:
            for codon in row:
                weights[codon] = values[position]
                position += 1

        outfile.write("CODON\tWEIGHT\n")
        _write_codon_rows(outfile, weights)

        E.Stop()
        # NOTE(review): exit status 1 on the success path is kept from the
        # original; confirm whether callers rely on it before changing to 0.
        sys.exit(1)

    ###################################################################
    # optional mapping of gene names
    map_genes = {}
    if options.input_filename_map:
        with open(options.input_filename_map, "r") as infile:
            for line in infile:
                if line[0] == "#":
                    continue
                a, b = line.rstrip("\n").split("\t")[:2]
                if options.invert_map:
                    a, b = b, a
                map_genes[a] = b

    result = WrapperAdaptiveCAI.AdaptiveCAIResult()

    gene_file = codon_file = trace_file = None
    try:
        if options.input_filename_genes:
            gene_file = open(options.input_filename_genes, "r")
        if options.input_filename_codons:
            codon_file = open(options.input_filename_codons, "r")
        if options.input_filename_trace:
            trace_file = open(options.input_filename_trace, "r")

        if options.input_filename_subset:
            with open(options.input_filename_subset, "r") as infile:
                l, e = IOTools.ReadList(infile)
            subset = set(l)
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# read %i entries into subset from %s.\n" %
                    (len(subset), options.input_filename_subset))
        else:
            subset = None

        result.Read(gene_file=gene_file,
                    codon_file=codon_file,
                    trace_file=trace_file)
    finally:
        # close the inputs even if reading fails
        for handle in (gene_file, codon_file, trace_file):
            if handle is not None:
                handle.close()

    ###################################################################
    # dump reference weights for a species
    if options.reference:
        if options.reference not in CODON_PREFERENCES:
            raise ValueError(
                "unknown species %s: possibles species are: %s" %
                (options.reference, str(list(CODON_PREFERENCES.keys()))))

        weights = Genomics.CalculateCAIWeightsFromCounts(
            CODON_PREFERENCES[options.reference], options.pseudocounts)
        _write_codon_matrix(outfile, weights)

    if options.dominant_set and gene_file is not None:
        cai_threshold = result.GetDominantThreshold(options.dominant_set)
    elif options.reverse_set:
        cai_threshold = 1.0
    else:
        cai_threshold = 0.0

    ###################################################################
    # print selected fields of the genes table
    if options.select:
        fields = []
        titles = []
        for x in options.select:
            # "FIELD AS title" renames the output column
            f = re.match(r"(\S+) (AS|as) (\S+)", x)
            if f:
                fields.append(f.groups()[0].upper())
                titles.append(f.groups()[2])
            else:
                fields.append(x.upper())
                titles.append(x)

        outfile.write("GENENAME\t" + "\t".join(titles) + "\n")

        for genename, data in result.mGeneInfo.items():
            if genename in map_genes:
                genename = map_genes[genename]

            if not _passes_cai_filter(data, cai_threshold,
                                      options.reverse_set):
                continue

            outfile.write(genename)
            for c in fields:
                outfile.write("\t%s" % str(data[c]))
            outfile.write("\n")

    ###################################################################
    # print a codon table
    if options.weights:

        table_format = options.codon_table_format

        if options.weights in ("compute-counts", "compute-weights",
                               "compute-frequencies"):
            # compute codon usage weights from a set of sequences
            counts = dict(
                (codon, 0) for codon in CODON_PREFERENCES["dmelanogaster"])

            if options.input_filename_sequences:
                with open(options.input_filename_sequences, "r") as infile:
                    sequences = Genomics.ReadPeptideSequences(
                        infile, filter=subset)

                    for key, sequence in sequences.items():
                        sequence = sequence.replace(" ", "")
                        if len(sequence) % 3 != 0:
                            raise ValueError(
                                "warning: sequence %s is not multiple of 3"
                                % key)
                        for start in range(0, len(sequence), 3):
                            counts[sequence[start:start + 3].upper()] += 1

            if options.weights == "compute-frequencies":
                weights = Genomics.CalculateCodonFrequenciesFromCounts(
                    counts, options.pseudocounts)
            elif options.weights == "compute-weights":
                weights = Genomics.CalculateCAIWeightsFromCounts(
                    counts, options.pseudocounts)
            else:
                weights = counts

        elif options.weights in ("final-list", "final-matrix"):
            weights = result.mFinalWeights
            if options.weights == "final-list":
                table_format = "list"
            else:
                table_format = "matrix"

        elif options.weights == "random":
            # get random weights
            counts = {}
            for codon in CODON_PREFERENCES["dmelanogaster"]:
                counts[codon] = random.randint(1, options.random_size)

            weights = Genomics.CalculateCAIWeightsFromCounts(
                counts, options.pseudocounts)
            table_format = "matrix"

        elif options.weights == "biased":
            # The original branch fetched the uniform codon usage and then
            # used an undefined variable, so it always failed with a
            # NameError. The intended input cannot be recovered from here.
            raise NotImplementedError(
                "--weights=biased is not implemented: "
                "no source of biased counts is defined")

        elif options.weights in ("uniform-weights", "uniform-frequencies"):
            # get uniform weights
            uniform_usage = Genomics.GetUniformCodonUsage()

            if options.weights == "uniform-weights":
                # NOTE(review): the original passed an undefined `counts`
                # here; the uniform usage table is presumably what was
                # intended -- confirm.
                weights = Genomics.CalculateCAIWeightsFromCounts(
                    uniform_usage, options.pseudocounts)
                table_format = "matrix"
            else:
                weights = uniform_usage
                table_format = "list"

        elif options.weights in ("counts", "frequencies",
                                 "absolute-frequencies"):
            # get weights as frequencies
            # compute from scratch. In the caijava file, the absolute
            # frequency f / gene_length is given. Thus the total number of
            # codons is f * gene_length.
            codons = list(CODON_PREFERENCES["dmelanogaster"])
            counts = _count_codons(
                result.mGeneInfo, codons,
                lambda genename, data: _passes_cai_filter(
                    data, cai_threshold, options.reverse_set))

            if options.weights == "frequencies":
                weights = Genomics.CalculateCodonFrequenciesFromCounts(
                    counts, options.pseudocounts)
            elif options.weights == "counts":
                weights = counts
            elif options.weights == "absolute-frequencies":
                # absolute frequencies: normalised over all codons, not
                # per amino acid; no pseudocounts are added here.
                weights = _absolute_frequencies(counts)

            table_format = "list"

        elif options.weights == "subset":
            if subset is None:
                raise ValueError(
                    "--weights=subset requires --input-file-subset")

            codons = list(CODON_PREFERENCES["dmelanogaster"])
            # keep genes in the subset, or those outside it with
            # --reverse-set
            counts = _count_codons(
                result.mGeneInfo, codons,
                lambda genename, data:
                (genename in subset) != bool(options.reverse_set))

            if options.codon_table_type == "frequencies":
                weights = Genomics.CalculateCodonFrequenciesFromCounts(
                    counts, options.pseudocounts)
            elif options.codon_table_type == "weights":
                weights = Genomics.CalculateCAIWeightsFromCounts(
                    counts, options.pseudocounts)
            elif options.codon_table_type == "counts":
                weights = counts
            elif options.codon_table_type == "absolute-frequencies":
                # absolute frequencies: normalised over all codons, not
                # per amino acid; no pseudocounts are added here.
                weights = _absolute_frequencies(counts)

        else:
            raise ValueError("unknown weights %s" % options.weights)

        if table_format == "list":
            outfile.write("CODON\tWEIGHT\n")
            _write_codon_rows(outfile, weights)
        elif table_format == "matrix":
            _write_codon_matrix(outfile, weights)

    ###################################################################
    # print codon usage
    if options.codon_usage:
        outfile.write("CODON\tFREQUENCY\n")

        if options.codon_usage == "biased":
            usages = result.mCodonUsages[-1]
        elif options.codon_usage == "full":
            usages = result.mCodonUsages[0]
        elif options.codon_usage == "weights":
            usages = WrapperAdaptiveCAI.CalculateWeightsFromUsage(
                result.mCodonUsages[0])
        else:
            raise ValueError(
                "unknown option '%s' for codon-usage." % options.codon_usage)

        _write_codon_rows(outfile, usages)

    E.Stop()