def main(argv=None):
    """Write a tab-separated table of sequence properties.

    Records are read with ``FastaIterator.FastaIterator(options.stdin)``.
    The header row is ``id`` followed by the headers of every requested
    section. One row is written per non-empty sequence, followed by a
    ``total`` row holding the accumulated properties of each section.

    Arguments:
        argv: command line arguments; passed through to ``E.Start``.

    Raises:
        ValueError: if a section is not available for the selected
            sequence type, or if the identifier pattern does not match
            the title of a record.
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-w", "--filename-weights", dest="filename_weights", type="string",
        help="filename with codon frequencies. Multiple filenames can be "
        "separated by comma [default=%default].")

    parser.add_option(
        "-s", "--sections", dest="sections", type="choice", action="append",
        choices=("length", "hid", "na", "aa", "degeneracy", "bias",
                 "codons", "codon-usage", "codon-translator"),
        help="which sections to output [default=%default]")

    parser.add_option(
        "-t", "--type", dest="seqtype", type="choice",
        choices=("na", "aa"),
        help="type of sequence: na=nucleotides, aa=amino acids "
        "[default=%default].")

    parser.add_option(
        "-e", "--regex-identifier", dest="regex_identifier", type="string",
        help="regular expression to extract identifier from fasta "
        "description line [default=%default].")

    parser.set_defaults(
        filename_weights="uniform",
        pseudocounts=1,
        sections=[],
        regex_identifier="(.+)",
        seqtype="na",
    )

    (options, args) = E.Start(parser, argv=argv)
    options.filename_weights = options.filename_weights.split(",")

    rx = re.compile(options.regex_identifier)

    reference_codons = []
    if options.filename_weights:
        for filename in options.filename_weights:
            if filename == "uniform":
                reference_codons.append(Genomics.GetUniformCodonUsage())
            else:
                # close the table file as soon as it has been parsed
                with open(filename, "r") as infile:
                    reference_codons.append(
                        IOTools.ReadMap(infile,
                                        has_header=True,
                                        map_functions=(str, float)))

        # print codon table differences
        E.info("difference between supplied codon usage preferences.")
        for x, a in enumerate(reference_codons):
            for y, b in enumerate(reference_codons):
                if x == y:
                    continue
                # calculate KL distance
                d = 0
                for codon, p in a.items():
                    if Genomics.IsStopCodon(codon):
                        continue
                    d += b[codon] * math.log(b[codon] / p)
                E.info("tablediff\t%s\t%s\t%f" %
                       (options.filename_weights[x],
                        options.filename_weights[y],
                        d))

    iterator = FastaIterator.FastaIterator(options.stdin)

    def getCounter(section):
        """Return a new property counter for *section*.

        Which sections are available depends on ``options.seqtype``.
        """
        if options.seqtype == "na":
            if section == "length":
                return SequencePropertiesLength()
            elif section == "hid":
                return SequencePropertiesHid()
            elif section == "na":
                return SequencePropertiesNA()
            elif section == "aa":
                return SequencePropertiesAA()
            elif section == "degeneracy":
                return SequencePropertiesDegeneracy()
            elif section == "bias":
                return SequencePropertiesBias(reference_codons)
            elif section == "codons":
                return SequencePropertiesCodons()
            elif section == "codon-usage":
                return SequencePropertiesCodonUsage()
            elif section == "codon-translator":
                return SequencePropertiesCodonTranslator()
        elif options.seqtype == "aa":
            if section == "length":
                return SequencePropertiesLength()
            elif section == "hid":
                return SequencePropertiesHid()
            elif section == "aa":
                return SequencePropertiesAminoAcids()
        raise ValueError("unknown section %s" % section)

    # setup totals: one accumulating counter per requested section
    totals = {}
    for section in options.sections:
        totals[section] = getCounter(section)

    options.stdout.write("id")
    for section in options.sections:
        options.stdout.write("\t" + "\t".join(totals[section].getHeaders()))
    options.stdout.write("\n")
    options.stdout.flush()

    for cur_record in iterator:
        sequence = re.sub(" ", "", cur_record.sequence).upper()

        if not sequence:
            E.warning("empty sequence %s" % cur_record.title)
            continue

        match = rx.search(cur_record.title)
        if match is None:
            # fail with a readable message instead of an AttributeError
            raise ValueError(
                "identifier pattern '%s' does not match title '%s'" %
                (options.regex_identifier, cur_record.title))
        identifier = match.groups()[0]

        options.stdout.write("%s" % identifier)
        options.stdout.flush()

        for section in options.sections:
            s = getCounter(section)
            s.loadSequence(sequence)
            totals[section].addProperties(s)
            options.stdout.write("\t" + "\t".join(s.getFields()))

        options.stdout.write("\n")

    options.stdout.write("total")
    for section in options.sections:
        options.stdout.write("\t" + "\t".join(totals[section].getFields()))
    options.stdout.write("\n")

    E.Stop()
def main(argv=None):
    """Write a tab-separated table of sequence properties.

    Records are read with ``FastaIterator.FastaIterator(options.stdin)``.
    The header row is ``id`` followed by the headers of every requested
    section and one row is written per sequence. With ``--add-total`` a
    final ``total`` row holds the accumulated properties of each section.

    Arguments:
        argv: command line arguments; passed through to ``E.Start``.

    Raises:
        ValueError: if a section is not available for the selected
            sequence type, if a record has an empty sequence, or if the
            identifier pattern does not match the title of a record.
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-w", "--weights-tsv-file", dest="filename_weights",
        type="string",
        help="filename with codon frequencies. Multiple filenames "
        "can be separated by comma.")

    parser.add_option(
        "-s", "--section", dest="sections", type="choice", action="append",
        choices=("length", "sequence", "hid", "na", "aa", "cpg", "dn",
                 "degeneracy", "gaps", "codons", "codon-usage",
                 "codon-translator", "codon-bias"),
        help="which sections to output [%default]")

    parser.add_option(
        "-t", "--sequence-type", dest="seqtype", type="choice",
        choices=("na", "aa"),
        help="type of sequence: na=nucleotides, aa=amino acids [%default].")

    parser.add_option(
        "-e", "--regex-identifier", dest="regex_identifier", type="string",
        help="regular expression to extract identifier from fasta "
        "description line.")

    parser.add_option(
        "--split-fasta-identifier", dest="split_id", action="store_true",
        help="split fasta description line (starting >) and use "
        "only text before first space")

    # the separating space before "[%default]" was missing in the
    # concatenated help text
    parser.add_option(
        "--add-total", dest="add_total", action="store_true",
        help="add a row with column totals at the end of the table "
        "[%default]")

    parser.set_defaults(
        filename_weights=None,
        pseudocounts=1,
        sections=[],
        regex_identifier="(.+)",
        seqtype="na",
        gap_chars='xXnN',
        split_id=False,
        add_total=False,
    )

    (options, args) = E.Start(parser, argv=argv)

    rx = re.compile(options.regex_identifier)

    reference_codons = []
    if options.filename_weights:
        options.filename_weights = options.filename_weights.split(",")
        for filename in options.filename_weights:
            if filename == "uniform":
                reference_codons.append(Genomics.GetUniformCodonUsage())
            else:
                # close the table file as soon as it has been parsed
                infile = IOTools.openFile(filename, "r")
                try:
                    reference_codons.append(
                        IOTools.ReadMap(infile,
                                        has_header=True,
                                        map_functions=(str, float)))
                finally:
                    infile.close()

        # print codon table differences
        options.stdlog.write(
            "# Difference between supplied codon usage preferences.\n")
        for x, a in enumerate(reference_codons):
            for y, b in enumerate(reference_codons):
                if x == y:
                    continue
                # calculate KL distance
                d = 0
                for codon, p in a.items():
                    if Genomics.IsStopCodon(codon):
                        continue
                    d += b[codon] * math.log(b[codon] / p)
                options.stdlog.write("# tablediff\t%s\t%s\t%f\n" %
                                     (options.filename_weights[x],
                                      options.filename_weights[y],
                                      d))

    iterator = FastaIterator.FastaIterator(options.stdin)

    def getCounter(section):
        """Return a new property counter for *section*.

        Which sections are available depends on ``options.seqtype``.
        """
        if options.seqtype == "na":
            if section == "length":
                return SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                return SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                return SequenceProperties.SequencePropertiesHid()
            elif section == "na":
                return SequenceProperties.SequencePropertiesNA()
            elif section == "gaps":
                return SequenceProperties.SequencePropertiesGaps(
                    options.gap_chars)
            elif section == "cpg":
                return SequenceProperties.SequencePropertiesCpg()
            elif section == "dn":
                return SequenceProperties.SequencePropertiesDN()
            # these sections require the sequence length to be a
            # multiple of 3
            elif section == "aa":
                return SequenceProperties.SequencePropertiesAA()
            elif section == "degeneracy":
                return SequenceProperties.SequencePropertiesDegeneracy()
            elif section == "codon-bias":
                return SequenceProperties.SequencePropertiesBias(
                    reference_codons)
            elif section == "codons":
                return SequenceProperties.SequencePropertiesCodons()
            elif section == "codon-usage":
                return SequenceProperties.SequencePropertiesCodonUsage()
            elif section == "codon-translator":
                return SequenceProperties.SequencePropertiesCodonTranslator()
        elif options.seqtype == "aa":
            if section == "length":
                return SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                return SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                return SequenceProperties.SequencePropertiesHid()
            elif section == "aa":
                return SequenceProperties.SequencePropertiesAminoAcids()
        raise ValueError("unknown section %s" % section)

    # setup totals: one accumulating counter per requested section
    totals = {}
    for section in options.sections:
        totals[section] = getCounter(section)

    options.stdout.write("id")
    for section in options.sections:
        options.stdout.write("\t" + "\t".join(totals[section].getHeaders()))
    options.stdout.write("\n")
    options.stdout.flush()

    # NOTE(review): the result of this call is never used; it looks like
    # leftover debugging code. Kept because side effects of loadSequence
    # cannot be ruled out from here - confirm before removing.
    s = getCounter("hid")
    s.loadSequence("AAAAAAAAA", "na")

    for cur_record in iterator:
        sequence = re.sub(" ", "", cur_record.sequence).upper()

        if not sequence:
            raise ValueError("empty sequence %s" % cur_record.title)

        match = rx.search(cur_record.title)
        if match is None:
            # fail with a readable message instead of an AttributeError
            raise ValueError(
                "identifier pattern '%s' does not match title '%s'" %
                (options.regex_identifier, cur_record.title))
        identifier = match.groups()[0]

        if options.split_id:
            options.stdout.write("%s" % identifier.split()[0])
        else:
            options.stdout.write("%s" % identifier)
        options.stdout.flush()

        for section in options.sections:
            s = getCounter(section)
            s.loadSequence(sequence, options.seqtype)
            totals[section].addProperties(s)
            options.stdout.write("\t" + "\t".join(s.getFields()))

        options.stdout.write("\n")

    if options.add_total:
        options.stdout.write("total")
        for section in options.sections:
            options.stdout.write(
                "\t" + "\t".join(totals[section].getFields()))
        options.stdout.write("\n")

    E.Stop()
def _write_codon_rows(outfile, table):
    """Write one ``codon<TAB>value`` row per entry of *table*, sorted by codon."""
    for codon in sorted(table):
        outfile.write("%s\t%f\n" % (codon, table[codon]))


def _write_codon_matrix(outfile, weights):
    """Write *weights* as comma-separated rows in OUTPUT_ORDER_CODON_MATRIX order."""
    for row in OUTPUT_ORDER_CODON_MATRIX:
        outfile.write(",".join(
            "%5.3f" % weights[codon.upper()] for codon in row))
        outfile.write("\n")


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Depending on the options given, the following is written to
    ``options.stdout``:

    * ``--weights-matrix2table``: a weights matrix read from
      ``options.stdin`` as a CODON/WEIGHT table (the script then exits).
    * ``--reference``: CAI weights computed from
      ``CODON_PREFERENCES[<species>]`` as a matrix.
    * ``--select``: the selected fields of the genes table.
    * ``--weights``: a codon table as a list or as a matrix.
    * ``--codon-usage``: a CODON/FREQUENCY table.

    Raises:
        ValueError: for an unknown reference species, an unknown
            ``--weights`` or ``--codon-usage`` value, or a sequence
            whose length is not a multiple of 3.
        NotImplementedError: for ``--weights=biased``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: codonbias_acai2tsv.py 865 "
        "2007-01-15 13:44:43Z andreas $")

    parser.add_option("-o", "--input-file-trace",
                      dest="input_filename_trace", type="string",
                      help="input filename for cai.",
                      metavar="FILE")

    parser.add_option("-e", "--input-file-genes",
                      dest="input_filename_genes", type="string",
                      help="input filename for genes information from cai.",
                      metavar="FILE")

    parser.add_option("-c", "--input-file-codons",
                      dest="input_filename_codons", type="string",
                      help="input filename for codon usage information.",
                      metavar="FILE")

    parser.add_option("--input-file-sequences",
                      dest="input_filename_sequences", type="string",
                      help="input filename with sequences.",
                      metavar="FILE")

    parser.add_option("-t", "--input-file-subset",
                      dest="input_filename_subset", type="string",
                      help="input filename with subset.",
                      metavar="FILE")

    parser.add_option("--codon-table-format", dest="codon_table_format",
                      type="choice",
                      choices=("list", "matrix"),
                      help="output options for output codon tables.")

    parser.add_option("--codon-table-type", dest="codon_table_type",
                      type="choice",
                      choices=("counts", "frequencies", "weights",
                               "absolute-frequencies"),
                      help="type of codon table.")

    parser.add_option("-r", "--reference", dest="reference", type="string",
                      help="dump CAI reference weights for species.")

    parser.add_option("-s", "--select", dest="select", type="string",
                      help="fields to select from genes table.")

    parser.add_option("-m", "--map", dest="input_filename_map",
                      type="string",
                      help="filename with mapping information for gene names.",
                      metavar="FILE")

    parser.add_option("-i", "--invert-map", dest="invert_map",
                      action="store_true",
                      help="invert map.")

    parser.add_option(
        "-d", "--dominant-set", dest="dominant_set", type="float",
        help="only print out dominant set (# fraction of most biased genes).")

    parser.add_option(
        "--reverse-set", dest="reverse_set", action="store_true",
        help="print the reverse set, i.e., then non-dominant set.")

    parser.add_option(
        "-u", "--codon-usage", dest="codon_usage", type="string",
        help="print codon usage for the full/biased set of genes "
        "[full|biased].")

    parser.add_option(
        "-w", "--weights", dest="weights", type="string",
        help="print weights [final-list|final-matrix|random|compute|weights|"
        "frequencies|absolute-frequencies].")

    parser.add_option("--weights-matrix2table", dest="weights_matrix2table",
                      action="store_true",
                      help="convert a weights matrix to a weights table.")

    parser.add_option("--get-preferred-codons", dest="get_preferred_codons",
                      type="string",
                      help="compute overview of preferred codons.")

    # the default for --reverse-set used to be registered under the
    # misspelled name "revserse_set"
    parser.set_defaults(input_filename="-",
                        input_filename_trace=None,
                        input_filename_genes=None,
                        input_filename_codons=None,
                        input_filename_map=None,
                        input_filename_subset=None,
                        input_filename_sequences=None,
                        invert_map=False,
                        select=None,
                        codon_usage=None,
                        weights=None,
                        reverse_set=False,
                        pseudocounts=1,
                        codon_table_format="list",
                        codon_table_type="weights",
                        weights_matrix2table=False,
                        random_size=1000,
                        get_preferred_codons=None,
                        dominant_set=0.0)

    # hand *argv* on so that an explicitly given argument list is honoured
    (options, args) = E.Start(parser, argv=argv)

    if options.select:
        options.select = options.select.split(",")

    outfile = options.stdout

    ###################################################################
    # convert weights table to a codon table
    if options.weights_matrix2table:
        data = []
        for line in options.stdin.readlines():
            if line.startswith("#"):
                continue
            data.extend(float(v) for v in line.rstrip("\n").split(","))

        # values are assigned in the order codons appear in the matrix
        weights = {}
        x = 0
        for cc in OUTPUT_ORDER_CODON_MATRIX:
            for c in cc:
                weights[c] = data[x]
                x += 1

        outfile.write("CODON\tWEIGHT\n")
        _write_codon_rows(outfile, weights)

        E.Stop()
        # NOTE(review): exit status 1 is returned although the conversion
        # succeeded; kept as callers may rely on it - confirm.
        sys.exit(1)

    ###################################################################
    map_genes = {}

    if options.input_filename_map:
        with open(options.input_filename_map, "r") as infile:
            for line in infile:
                if line.startswith("#"):
                    continue
                a, b = line.rstrip("\n").split("\t")[:2]
                if options.invert_map:
                    a, b = b, a
                map_genes[a] = b

    result = WrapperAdaptiveCAI.AdaptiveCAIResult()

    gene_file = codon_file = trace_file = None
    subset = None
    try:
        if options.input_filename_genes:
            gene_file = open(options.input_filename_genes, "r")
        if options.input_filename_codons:
            codon_file = open(options.input_filename_codons, "r")
        if options.input_filename_trace:
            trace_file = open(options.input_filename_trace, "r")

        if options.input_filename_subset:
            with open(options.input_filename_subset, "r") as infile:
                entries, _errors = IOTools.ReadList(infile)
                subset = set(entries)
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# read %i entries into subset from %s.\n" %
                    (len(subset), options.input_filename_subset))

        result.Read(gene_file=gene_file,
                    codon_file=codon_file,
                    trace_file=trace_file)
    finally:
        # close whatever was opened, also if reading failed
        for handle in (gene_file, codon_file, trace_file):
            if handle:
                handle.close()

    if options.reference:
        if options.reference not in CODON_PREFERENCES:
            raise ValueError(
                "unknown species %s: possibles species are: %s" %
                (options.reference, str(sorted(CODON_PREFERENCES))))

        weights = Genomics.CalculateCAIWeightsFromCounts(
            CODON_PREFERENCES[options.reference], options.pseudocounts)

        _write_codon_matrix(outfile, weights)

    # gene_file is closed by now; it is only tested here to see whether
    # gene information was supplied at all.
    if options.dominant_set and gene_file:
        cai_threshold = result.GetDominantThreshold(options.dominant_set)
    elif options.reverse_set:
        cai_threshold = 1.0
    else:
        cai_threshold = 0.0

    def skip_gene(info):
        """Return True if a gene is filtered out by its CAICLASS value."""
        if options.reverse_set:
            return info["CAICLASS"] >= cai_threshold
        return info["CAICLASS"] < cai_threshold

    if options.select:
        # entries are either "FIELD" or "FIELD AS title"
        fields = []
        titles = []
        for x in options.select:
            f = re.match(r"(\S+) (AS|as) (\S+)", x)
            if f:
                fields.append(f.groups()[0].upper())
                titles.append(f.groups()[2])
            else:
                fields.append(x.upper())
                titles.append(x)

        outfile.write("GENENAME\t" + "\t".join(titles) + "\n")

        for genename, data in result.mGeneInfo.items():
            if genename in map_genes:
                genename = map_genes[genename]

            if skip_gene(data):
                continue

            outfile.write(genename)
            for c in fields:
                outfile.write("\t%s" % str(data[c]))
            outfile.write("\n")

    if options.weights:

        table_format = options.codon_table_format

        if options.weights in ("compute-counts",
                               "compute-weights",
                               "compute-frequencies"):
            # compute codon usage weights from a set of sequences
            counts = dict.fromkeys(CODON_PREFERENCES["dmelanogaster"], 0)

            if options.input_filename_sequences:
                with open(options.input_filename_sequences, "r") as infile:
                    sequences = Genomics.ReadPeptideSequences(
                        infile, filter=subset)

                    for key, sequence in sequences.items():
                        sequence = re.sub(" ", "", sequence)
                        if len(sequence) % 3 != 0:
                            raise ValueError(
                                "warning: sequence %s is not multiple of 3"
                                % key)
                        for start in range(0, len(sequence), 3):
                            counts[sequence[start:start + 3].upper()] += 1

            if options.weights == "compute-frequencies":
                weights = Genomics.CalculateCodonFrequenciesFromCounts(
                    counts, options.pseudocounts)
            elif options.weights == "compute-weights":
                weights = Genomics.CalculateCAIWeightsFromCounts(
                    counts, options.pseudocounts)
            else:
                weights = counts

        elif options.weights in ("final-list", "final-matrix"):

            weights = result.mFinalWeights
            if options.weights == "final-list":
                table_format = "list"
            else:
                table_format = "matrix"

        elif options.weights == "random":
            # get random weights
            counts = {}
            for x in CODON_PREFERENCES["dmelanogaster"]:
                counts[x] = random.randint(1, options.random_size)

            weights = Genomics.CalculateCAIWeightsFromCounts(
                counts, options.pseudocounts)
            table_format = "matrix"

        elif options.weights == "biased":
            # This branch used to fetch the uniform codon usage and then
            # compute weights from a ``counts`` variable that was never
            # assigned, so it always failed with an UnboundLocalError.
            # NOTE(review): which table "biased" should be based on cannot
            # be told from here; fail explicitly rather than guess.
            raise NotImplementedError(
                "weights 'biased' is not implemented: "
                "no biased codon counts are available")

        elif options.weights in ("uniform-weights", "uniform-frequencies"):
            # get uniform weights
            codons = Genomics.GetUniformCodonUsage()
            if options.weights == "uniform-weights":
                # NOTE(review): an unassigned ``counts`` was passed here
                # before; presumably the uniform table fetched above was
                # meant - confirm.
                weights = Genomics.CalculateCAIWeightsFromCounts(
                    codons, options.pseudocounts)
                table_format = "matrix"
            else:
                weights = codons
                table_format = "list"

        elif options.weights in ("counts",
                                 "frequencies",
                                 "absolute-frequencies"):
            # get weights as frequencies
            # compute from scratch. In the caijava file, the absolute
            # frequency f / gene_length is given. Thus the total number
            # of codons is f * gene_length.
            codons = list(CODON_PREFERENCES["dmelanogaster"])
            counts = dict.fromkeys(codons, 0)

            for data in result.mGeneInfo.values():
                if skip_gene(data):
                    continue

                gene_length = data["GENELENGTH"]
                for c in codons:
                    counts[c] += int(data[c] * gene_length)

            if options.weights == "frequencies":
                weights = Genomics.CalculateCodonFrequenciesFromCounts(
                    counts, options.pseudocounts)
            elif options.weights == "counts":
                weights = counts
            elif options.weights == "absolute-frequencies":
                # fraction of all counted codons, not normalized per aa
                total = sum(counts.values())
                weights = {}
                for k, v in counts.items():
                    weights[k] = float(v) / total

            table_format = "list"

        elif options.weights == "subset":

            codons = list(CODON_PREFERENCES["dmelanogaster"])
            counts = dict.fromkeys(codons, 0)

            for genename, data in result.mGeneInfo.items():
                # keep members of the subset, or non-members with
                # --reverse-set
                found = genename in subset
                if (not found and not options.reverse_set) or (
                        found and options.reverse_set):
                    continue

                gene_length = data["GENELENGTH"]
                for c in codons:
                    counts[c] += int(data[c] * gene_length)

            if options.codon_table_type == "frequencies":
                weights = Genomics.CalculateCodonFrequenciesFromCounts(
                    counts, options.pseudocounts)
            elif options.codon_table_type == "weights":
                weights = Genomics.CalculateCAIWeightsFromCounts(
                    counts, options.pseudocounts)
            elif options.codon_table_type == "counts":
                weights = counts
            elif options.codon_table_type == "absolute-frequencies":
                # fraction of all counted codons, not normalized per aa
                total = sum(counts.values())
                weights = {}
                for k, v in counts.items():
                    weights[k] = float(v) / total

        else:
            raise ValueError("unknown weights %s" % options.weights)

        if table_format == "list":
            outfile.write("CODON\tWEIGHT\n")
            _write_codon_rows(outfile, weights)
        elif table_format == "matrix":
            _write_codon_matrix(outfile, weights)

    if options.codon_usage:
        outfile.write("CODON\tFREQUENCY\n")

        if options.codon_usage == "biased":
            usages = result.mCodonUsages[-1]
        elif options.codon_usage == "full":
            usages = result.mCodonUsages[0]
        elif options.codon_usage == "weights":
            usages = WrapperAdaptiveCAI.CalculateWeightsFromUsage(
                result.mCodonUsages[0])
        else:
            raise ValueError(
                "unknown option '%s' for codon-usage." % options.codon_usage)

        _write_codon_rows(outfile, usages)

    E.Stop()