def loadSequence(self, sequence, seqtype="na"):
    """Load per-position nucleotide and degeneracy counts from a sequence.

    Delegates length bookkeeping to SequencePropertiesLength, then counts
    each nucleotide per codon position and, for non-stop codons that
    Genomics.GetDegeneracy understands, per degeneracy class.

    :param sequence: nucleotide sequence; length must be a multiple of 3.
    :param seqtype: sequence type, passed through to the base class.
    :raises ValueError: if the sequence length is not a multiple of 3.
    """
    SequencePropertiesLength.loadSequence(self, sequence, seqtype)

    if len(sequence) % 3:
        raise ValueError(
            '''sequence length is not a multiple of 3 (length=%i)''' %
            (len(sequence)))

    # uppercase all letters
    sequence = sequence.upper()

    self.mNStopCodons = 0

    # setup counting arrays
    # nucleotide counts for each position (is not a sum of the counts
    # per degenerate site, as the codon might be intelligible, e.g. GNN).
    self.mCounts = [{'A': 0, 'C': 0, 'G': 0, 'T': 0, 'X': 0, 'N': 0},
                    {'A': 0, 'C': 0, 'G': 0, 'T': 0, 'X': 0, 'N': 0},
                    {'A': 0, 'C': 0, 'G': 0, 'T': 0, 'X': 0, 'N': 0}]

    # nucleotide counts for each position per degeneracy class (0-4)
    self.mCountsDegeneracy = []

    for x in (0, 1, 2):
        xx = []
        for y in range(5):
            yy = {}
            for z in Bio.Alphabet.IUPAC.extended_dna.letters:
                yy[z] = 0
            xx.append(yy)
        self.mCountsDegeneracy.append(xx)

    # use generator rather than list to save memory
    # FIX: xrange -> range. xrange does not exist on Python 3, and this
    # file already relies on Python 3 constructs (next(), zip_longest).
    for codon in (sequence[x:x + 3] for x in range(0, len(sequence), 3)):

        for x in (0, 1, 2):
            self.mCounts[x][codon[x]] += 1

        if Genomics.IsStopCodon(codon):
            self.mNStopCodons += 1
            continue

        try:
            aa, deg1, deg2, deg3 = Genomics.GetDegeneracy(codon)
            degrees = (deg1, deg2, deg3)
            for x in range(len(degrees)):
                self.mCountsDegeneracy[x][degrees[x]][codon[x]] += 1
        except KeyError:
            # codon with ambiguity characters not covered by the
            # degeneracy table -- skip it for the degeneracy counts
            pass
def main(argv=None):
    """Compute per-sequence property tables from FASTA on stdin.

    For every requested --section a counter object is created per
    sequence; its fields are written as one tab-separated row per
    sequence, with optional column totals at the end (--add-total).
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-w", "--weights-tsv-file", dest="filename_weights",
        type="string",
        help="filename with codon frequencies. Multiple filenames "
        "can be separated by comma.")

    parser.add_option("-s", "--section", dest="sections", type="choice",
                      action="append",
                      choices=("length", "sequence", "hid", "na", "aa",
                               "cpg", "dn", "degeneracy", "gaps",
                               "codons", "codon-usage", "codon-translator",
                               "codon-bias"),
                      help="which sections to output [%default]")

    parser.add_option(
        "-t", "--sequence-type", dest="seqtype", type="choice",
        choices=("na", "aa"),
        help="type of sequence: na=nucleotides, aa=amino acids [%default].")

    parser.add_option(
        "-e", "--regex-identifier", dest="regex_identifier", type="string",
        help="regular expression to extract identifier from fasta "
        "description line.")

    parser.add_option("--split-fasta-identifier", dest="split_id",
                      action="store_true",
                      help="split fasta description line (starting >) and use "
                      "only text before first space")

    parser.add_option(
        "--add-total", dest="add_total", action="store_true",
        help="add a row with column totals at the end of the table"
        "[%default]")

    parser.set_defaults(
        filename_weights=None,
        pseudocounts=1,
        sections=[],
        regex_identifier="(.+)",
        seqtype="na",
        gap_chars='xXnN',
        split_id=False,
        add_total=False,
    )

    (options, args) = E.Start(parser, argv=argv)

    rx = re.compile(options.regex_identifier)

    # load codon-usage reference tables ("uniform" is synthesized,
    # anything else is read as a two-column codon->frequency map)
    reference_codons = []
    if options.filename_weights:
        options.filename_weights = options.filename_weights.split(",")
        for filename in options.filename_weights:
            if filename == "uniform":
                reference_codons.append(Genomics.GetUniformCodonUsage())
            else:
                reference_codons.append(
                    IOTools.ReadMap(IOTools.openFile(filename, "r"),
                                    has_header=True,
                                    map_functions=(str, float)))

        # print codon table differences
        options.stdlog.write(
            "# Difference between supplied codon usage preferences.\n")

        for x in range(0, len(reference_codons)):
            for y in range(0, len(reference_codons)):
                if x == y:
                    continue
                # calculate KL distance
                # NOTE(review): the summand is b*log(b/p) -- looks like
                # KL(b || a) rather than KL(a || b); verify intent.
                a = reference_codons[x]
                b = reference_codons[y]
                d = 0
                for codon, p in a.items():
                    if Genomics.IsStopCodon(codon):
                        continue
                    d += b[codon] * math.log(b[codon] / p)

                options.stdlog.write("# tablediff\t%s\t%s\t%f\n" %
                                     (options.filename_weights[x],
                                      options.filename_weights[y],
                                      d))

    iterator = FastaIterator.FastaIterator(options.stdin)

    def getCounter(section):
        # factory: map a section name to a fresh counter object,
        # dispatching on the configured sequence type
        if options.seqtype == "na":
            if section == "length":
                s = SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                s = SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                s = SequenceProperties.SequencePropertiesHid()
            elif section == "na":
                s = SequenceProperties.SequencePropertiesNA()
            elif section == "gaps":
                s = SequenceProperties.SequencePropertiesGaps(
                    options.gap_chars)
            elif section == "cpg":
                s = SequenceProperties.SequencePropertiesCpg()
            elif section == "dn":
                s = SequenceProperties.SequencePropertiesDN()
            # these sections requires sequence length to be a multiple of 3
            elif section == "aa":
                s = SequenceProperties.SequencePropertiesAA()
            elif section == "degeneracy":
                s = SequenceProperties.SequencePropertiesDegeneracy()
            elif section == "codon-bias":
                s = SequenceProperties.SequencePropertiesBias(
                    reference_codons)
            elif section == "codons":
                s = SequenceProperties.SequencePropertiesCodons()
            elif section == "codon-usage":
                s = SequenceProperties.SequencePropertiesCodonUsage()
            elif section == "codon-translator":
                s = SequenceProperties.SequencePropertiesCodonTranslator()
            else:
                raise ValueError("unknown section %s" % section)
        elif options.seqtype == "aa":
            if section == "length":
                s = SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                s = SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                s = SequenceProperties.SequencePropertiesHid()
            elif section == "aa":
                s = SequenceProperties.SequencePropertiesAminoAcids()
            else:
                raise ValueError("unknown section %s" % section)
        return s

    # setup totals
    totals = {}
    for section in options.sections:
        totals[section] = getCounter(section)

    options.stdout.write("id")
    for section in options.sections:
        options.stdout.write("\t" + "\t".join(totals[section].getHeaders()))

    options.stdout.write("\n")
    options.stdout.flush()

    # NOTE(review): presumably a smoke test that the counter machinery
    # works before streaming real input; the result is discarded.
    # TODO confirm this warm-up is intentional.
    s = getCounter("hid")
    s.loadSequence("AAAAAAAAA", "na")

    for cur_record in iterator:

        sequence = re.sub(" ", "", cur_record.sequence).upper()

        if len(sequence) == 0:
            raise ValueError("empty sequence %s" % cur_record.title)

        id = rx.search(cur_record.title).groups()[0]

        if options.split_id is True:
            options.stdout.write("%s" % id.split()[0])
        else:
            options.stdout.write("%s" % id)
        options.stdout.flush()

        # one counter per section per record; accumulate into totals
        for section in options.sections:
            s = getCounter(section)
            s.loadSequence(sequence, options.seqtype)
            totals[section].addProperties(s)

            options.stdout.write("\t" + "\t".join(s.getFields()))

        options.stdout.write("\n")

    if options.add_total:
        options.stdout.write("total")
        for section in options.sections:
            options.stdout.write("\t" + "\t".join(totals[section].getFields()))
        options.stdout.write("\n")

    E.Stop()
def main(argv=None):
    """Apply a stack of transformations to FASTA records read from stdin.

    Each --method is applied in order to every record; transformed
    records are written to stdout. Some methods consume positional
    --parameters entries (map files, second FASTA streams).

    Fixes relative to the previous revision:
    * translate: re.sub() arguments were swapped (repl and string), so
      the gapless-length %3 check always saw length 0 and never fired.
    * back-translate / build-map: string ``raise`` statements (illegal
      on Python 3) replaced with ValueError.
    * reverse-complement: Python-2-only ``string.translate`` replaced
      with the str method / str.maketrans.
    * min-length filter: missing ``continue`` meant a too-short record
      was counted as skipped but still written to output.

    NOTE: the "filter" method choice has no handler in the dispatch
    below; filtering is driven by --filter-method instead.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m", "--method", dest="methods", type="choice", action="append",
        choices=("translate", "translate-to-stop", "truncate-at-stop",
                 "back-translate", "mark-codons", "apply-map", "build-map",
                 "pseudo-codons", "filter", "interleaved-codons", "map-codons",
                 "remove-gaps", "mask-seg", "mask-bias", "mask-codons",
                 "mask-incomplete-codons", "mask-stops", "mask-soft",
                 "remove-stops", "upper", "lower", "reverse-complement",
                 "sample", "shuffle"),
        help="method to apply to sequences.")

    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="parameter stack for methods that require one "
                      "[default=%default].")

    parser.add_option("-x", "--ignore-errors", dest="ignore_errors",
                      action="store_true",
                      help="ignore errors [default = %default].")

    parser.add_option("--sample-proportion", dest="sample_proportion",
                      type="float",
                      help="sample proportion [default = %default].")

    parser.add_option("--exclude-pattern", dest="exclude_pattern",
                      type="string",
                      help="exclude all sequences with ids matching pattern "
                      "[default = %default].")

    parser.add_option("--include-pattern", dest="include_pattern",
                      type="string",
                      help="include only sequences with ids matching pattern "
                      "[default = %default].")

    parser.add_option("--filter-method", dest="filter_methods",
                      type="string", action="append",
                      help="filtering methods to apply "
                      "[default = %default].")

    parser.add_option(
        "-t", "--sequence-type", dest="type", type="choice",
        choices=("aa", "na"),
        help="sequence type (aa or na) [%default]. This option determines "
        "which characters to use for masking [default = %default].")

    parser.add_option(
        "-l", "--template-identifier", dest="template_identifier",
        type="string",
        help="template for numerical identifier [default = %default] "
        "for the operation --build-map. A %i is replaced by the position "
        "of the sequence in the file.")

    parser.set_defaults(
        methods=[],
        parameters="",
        type="na",
        aa_mask_chars="xX",
        aa_mask_char="x",
        na_mask_chars="nN",
        na_mask_char="n",
        gap_chars="-.",
        gap_char="-",
        template_identifier="ID%06i",
        ignore_errors=False,
        exclude_pattern=None,
        include_pattern=None,
        sample_proportion=None,
        filter_methods=[],
    )

    (options, args) = E.Start(parser)
    options.parameters = options.parameters.split(",")

    rx_include, rx_exclude = None, None
    if options.include_pattern:
        rx_include = re.compile(options.include_pattern)
    if options.exclude_pattern:
        rx_exclude = re.compile(options.exclude_pattern)

    iterator = FastaIterator.FastaIterator(options.stdin)

    nseq = 0
    map_seq2nid = {}

    # methods below pop their arguments off the front of the
    # parameter stack in the order they are checked here
    if "apply-map" in options.methods:
        map_seq2nid = IOTools.ReadMap(open(options.parameters[0], "r"))
        del options.parameters[0]

    if options.type == "na":
        mask_chars = options.na_mask_chars
        mask_char = options.na_mask_char
    else:
        mask_chars = options.aa_mask_chars
        mask_char = options.aa_mask_char

    if "map-codons" in options.methods:
        map_codon2code = IOTools.ReadMap(open(options.parameters[0], "r"))
        del options.parameters[0]

    if "mask-soft" in options.methods:
        f = options.parameters[0]
        del options.parameters[0]
        hard_masked_iterator = FastaIterator.FastaIterator(open(f, "r"))

    if "mask-codons" in options.methods or "back-translate" in options.methods:
        # open a second stream to read sequences from
        f = options.parameters[0]
        del options.parameters[0]
        other_iterator = FastaIterator.FastaIterator(open(f, "r"))

    ninput, noutput, nerrors, nskipped = 0, 0, 0, 0

    if "sample" in options.methods:
        if not options.sample_proportion:
            raise ValueError("specify a sample proportion")
        sample_proportion = options.sample_proportion
    else:
        sample_proportion = None

    filter_min_sequence_length = None
    filter_max_sequence_length = None
    filter_id_list = None
    for f in options.filter_methods:
        if f.startswith("min-length"):
            filter_min_sequence_length = int(f.split("=")[1])
        elif f.startswith("max-length"):
            filter_max_sequence_length = int(f.split("=")[1])
        elif f.startswith("id-file"):
            filter_id_list = [line[:-1] for line in
                              IOTools.openFile(f.split("=")[1])]

    def raiseIfNotCodon(l, title):
        '''raise ValueError if sequence length l is not divisible by 3'''
        if l % 3 != 0:
            raise ValueError("length of sequence %s not divisible by 3" %
                             (title))

    while 1:
        try:
            cur_record = next(iterator)
        except StopIteration:
            break

        if cur_record is None:
            break
        nseq += 1
        ninput += 1

        sequence = re.sub(" ", "", cur_record.sequence)
        l = len(sequence)

        if rx_include and not rx_include.search(cur_record.title):
            nskipped += 1
            continue

        if rx_exclude and rx_exclude.search(cur_record.title):
            nskipped += 1
            continue

        if sample_proportion:
            if random.random() > sample_proportion:
                continue

        if not (filter_id_list is None or cur_record.title in filter_id_list):
            nskipped += 1
            continue

        for method in options.methods:

            if method == "translate":
                # translate such that gaps are preserved
                seq = []

                # FIX: the repl/string arguments to re.sub were swapped,
                # which made ls always 0 and disabled this check.
                ls = len(re.sub('[%s]' % options.gap_chars, "", sequence))
                if ls % 3 != 0:
                    msg = "length of sequence %s (%i) not divisible by 3" % (
                        cur_record.title, ls)
                    nerrors += 1
                    if options.ignore_errors:
                        E.warn(msg)
                        continue
                    else:
                        raise ValueError(msg)

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "back-translate":
                # translate from an amino acid alignment to codon alignment
                seq = []
                try:
                    other_record = next(other_iterator)
                except StopIteration:
                    raise ValueError("run out of sequences")

                if cur_record.title != other_record.title:
                    # FIX: was a string raise (TypeError on Python 3)
                    raise ValueError("sequence titles don't match: %s %s" % (
                        cur_record.title, other_record.title))

                other_sequence = re.sub("[ %s]" % options.gap_chars, "",
                                        other_record.sequence)

                if len(other_sequence) % 3 != 0:
                    raise ValueError(
                        "length of sequence %s not divisible by 3" %
                        (other_record.title))

                r = re.sub("[%s]" % options.gap_chars, "", sequence)
                if len(other_sequence) != len(r) * 3:
                    raise ValueError(
                        "length of sequences do not match: %i vs %i" %
                        (len(other_sequence), len(r)))

                # copy codons for residues, gap-triplets for gaps
                x = 0
                for aa in sequence:
                    if aa in options.gap_chars:
                        c = options.gap_char * 3
                    else:
                        c = other_sequence[x:x + 3]
                        x += 3
                    seq.append(c)

                sequence = "".join(seq)

            elif method == "pseudo-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = " ".join(seq)

            elif method == "reverse-complement":
                # FIX: string.translate(s, table) is Python-2-only;
                # use the str method with str.maketrans instead.
                sequence = sequence.translate(
                    str.maketrans("ACGTacgt", "TGCAtgca"))[::-1]

            elif method in ("mask-stops", "remove-stops"):
                c = []
                codon = []
                new_sequence = []

                if method == "mask-stops":
                    char = options.na_mask_char
                elif method == "remove-stops":
                    char = options.gap_char

                # scan gapped sequence codon-by-codon; c keeps the raw
                # characters (including gaps) belonging to the codon
                for x in sequence:

                    if x not in options.gap_chars:
                        codon.append(x.upper())

                    c.append(x)

                    if len(codon) == 3:
                        codon = "".join(codon).upper()
                        # mask all non-gaps
                        if Genomics.IsStopCodon(codon):
                            for x in c:
                                if x in options.gap_chars:
                                    new_sequence.append(x)
                                else:
                                    new_sequence.append(char)
                        else:
                            new_sequence += c

                        c = []
                        codon = []

                # flush trailing incomplete codon unchanged
                new_sequence += c

                sequence = "".join(new_sequence)

            elif method == "mask-soft":
                # Get next hard masked record and extract sequence and length
                try:
                    cur_hm_record = next(hard_masked_iterator)
                except StopIteration:
                    break

                hm_sequence = re.sub(" ", "", cur_hm_record.sequence)
                lhm = len(hm_sequence)
                new_sequence = []

                # Check lengths of unmasked and soft masked sequences the same
                if l != lhm:
                    raise ValueError(
                        "length of unmasked and hard masked sequences not "
                        "identical for record %s" %
                        (cur_record.title))

                # Check if hard masked seq contains repeat (N), if so replace N
                # with lowercase sequence from unmasked version
                if sequence == hm_sequence:
                    pass
                else:
                    for x, y in zip_longest(sequence, hm_sequence):
                        if y == "N":
                            new_sequence += x.lower()
                        else:
                            new_sequence += x.upper()
                    sequence = "".join(new_sequence)

            elif method == "map-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in (sequence[x:x + 3].upper()
                              for x in range(0, l, 3)):

                    if codon not in map_codon2code:
                        aa = "X"
                    else:
                        aa = map_codon2code[codon]
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "interleaved-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append("%s:%s" % (aa, codon))

                sequence = " ".join(seq)

            elif method == "translate-to-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    if Genomics.IsStopCodon(codon):
                        break
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "truncate-at-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    if Genomics.IsStopCodon(codon):
                        break
                    seq.append(codon)
                sequence = "".join(seq)

            elif method == "remove-gaps":

                seq = []
                for s in sequence:
                    if s in options.gap_chars:
                        continue
                    seq.append(s)

                sequence = "".join(seq)

            elif method == "upper":
                sequence = sequence.upper()

            elif method == "lower":
                sequence = sequence.lower()

            elif method == "mark-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                sequence = " ".join([sequence[x:x + 3]
                                     for x in range(0, l, 3)])

            elif method == "apply-map":
                id = re.match("^(\S+)", cur_record.title).groups()[0]
                if id in map_seq2nid:
                    rest = cur_record.title[len(id):]
                    cur_record.title = map_seq2nid[id] + rest

            elif method == "build-map":
                # build a map of identifiers
                id = re.match("^(\S+)", cur_record.title).groups()[0]
                new_id = options.template_identifier % nseq
                if id in map_seq2nid:
                    # FIX: was a string raise (TypeError on Python 3)
                    raise ValueError(
                        "duplicate fasta entries - can't map those: %s" %
                        id)
                map_seq2nid[id] = new_id
                cur_record.title = new_id

            elif method == "mask-bias":
                masker = Masker.MaskerBias()
                sequence = masker(sequence)

            elif method == "mask-seg":
                masker = Masker.MaskerSeg()
                sequence = masker(sequence)

            elif method == "shuffle":
                s = list(sequence)
                random.shuffle(s)
                sequence = "".join(s)

            elif method == "mask-incomplete-codons":
                seq = list(sequence)
                for x in range(0, l, 3):
                    # count masked positions inside this codon
                    # (renamed inner variable to stop shadowing x)
                    nm = len([cc for cc in seq[x:x + 3] if cc in mask_chars])
                    if 0 < nm < 3:
                        seq[x:x + 3] = [mask_char] * 3
                sequence = "".join(seq)

            elif method == "mask-codons":
                # mask codons based on amino acids given as reference
                # sequences.
                other_record = next(other_iterator)

                if other_record is None:
                    raise ValueError("run out of sequences.")

                if cur_record.title != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" %
                                     (cur_record.title, other_record.title))

                other_sequence = re.sub(" ", "", other_record.sequence)

                if len(other_sequence) * 3 != len(sequence):
                    raise ValueError(
                        "sequences for %s don't have matching lengths %i - %i" %
                        (cur_record.title, len(other_sequence) * 3,
                         len(sequence)))

                seq = list(sequence)
                c = 0
                for x in other_sequence:
                    if x in options.aa_mask_chars:
                        if x.isupper():
                            seq[c:c + 3] = [options.na_mask_char.upper()] * 3
                        else:
                            seq[c:c + 3] = [options.na_mask_char.lower()] * 3
                    c += 3
                sequence = "".join(seq)

        l = len(sequence)
        if filter_min_sequence_length is not None and \
           l < filter_min_sequence_length:
            nskipped += 1
            # FIX: continue was missing -- too-short records were
            # counted as skipped yet still written to output.
            continue

        if filter_max_sequence_length is not None and \
           l > filter_max_sequence_length:
            nskipped += 1
            continue

        options.stdout.write(">%s\n%s\n" % (cur_record.title, sequence))
        noutput += 1

    if "build-map" in options.methods:
        p = options.parameters[0]
        if p:
            outfile = IOTools.openFile(p, "w")
        else:
            outfile = options.stdout

        outfile.write("old\tnew\n")
        for old_id, new_id in list(map_seq2nid.items()):
            outfile.write("%s\t%s\n" % (old_id, new_id))

        if p:
            outfile.close()

    E.info("ninput=%i, noutput=%i, nskipped=%i, nerrors=%i" %
           (ninput, noutput, nskipped, nerrors))

    E.Stop()
def processMali(mali, options):
    """Run a rates analysis (PAML codeml or xrate) on an alignment.

    Builds the list of sequence pairs to compare according to
    options.iteration, optionally masks stop codons with NNN, flags
    pairs with insufficient overlap, and dispatches to the configured
    backend.

    :param mali: multiple alignment object (provides mapIdentifiers,
        getIdentifiers, item access by identifier).
    :param options: parsed command-line options.
    :raises ValueError: for an uneven number of sequences with
        --iteration=pairwise, or an unknown iteration mode.

    Fix: the two string ``raise`` statements were replaced with
    ValueError -- raising a plain string is a TypeError on Python 3.
    """
    map_new2old = mali.mapIdentifiers()
    ids = mali.getIdentifiers()

    invalid_chars = options.gap_chars + options.mask_chars

    has_non_overlaps = False
    pairs = []

    if options.iteration == "all-vs-all":
        for x in range(len(ids)):
            for y in range(0, x):
                pairs.append((x, y))
    elif options.iteration == "first-vs-all":
        for y in range(1, len(ids)):
            pairs.append((0, y))
    elif options.iteration == "pairwise":
        if len(ids) % 2 != 0:
            # FIX: was a string raise (TypeError on Python 3)
            raise ValueError(
                "uneven number of sequences (%i) not compatible with "
                "--iteration=pairwise" % len(ids))
        for x in range(0, len(ids), 2):
            pairs.append((x, x + 1))
    elif options.iteration == "tree":
        pairs = []
    else:
        # FIX: was a string raise (TypeError on Python 3)
        raise ValueError("unknown iteration mode: %s" % (options.iteration))

    if options.remove_stops:
        for id, entry in mali.items():
            # replace in-frame stop codons with NNN
            s = entry.mString.upper()
            fragments = []
            for x in range(0, len(s), 3):
                codon = s[x:x + 3]
                if Genomics.IsStopCodon(codon):
                    codon = "NNN"
                fragments.append(codon)
            entry.mString = "".join(fragments)

    for x, y in pairs:
        # count aligned columns where neither sequence is a gap/mask;
        # the for-else fires when min_overlap was never reached
        noverlap = 0
        for a, b in zip(mali[ids[x]], mali[ids[y]]):
            if a not in invalid_chars and b not in invalid_chars:
                noverlap += 1
                if noverlap >= options.min_overlap:
                    break
        else:
            has_non_overlaps = True
            break

    if options.tree:
        tree = TreeTools.Newick2Nexus(options.tree).trees[0]
        map_old2new = IOTools.getInvertedDictionary(map_new2old,
                                                    make_unique=True)
        tree.relabel(map_old2new)
    else:
        tree = None

    if options.method == "paml":
        runCodeML(mali, tree, has_non_overlaps, pairs, map_new2old, options)
    elif options.method == "xrate":
        runXrate(mali, has_non_overlaps, pairs, map_new2old, options)
def prepareGrammar(xgram, mali, options):
    """prepare grammar for custom grammars.

    Writes the alignment to a temporary stockholm file (with an
    embedded NH tree line), builds or loads the requested xrate
    grammar, optionally inserts/fixes nucleotide frequencies, trains
    the model and returns (result, mali, ids).
    """
    ids = mali.getIdentifiers()

    # write alignment to a temporary stockholm file for xrate
    fh, filename = tempfile.mkstemp()
    os.close(fh)
    outfile = open(filename, "w")
    # NOTE(review): "(%s:1.0)%s;" % tuple(ids) consumes exactly two
    # identifiers -- presumably a pairwise alignment is expected;
    # any other count would raise a formatting error. TODO confirm.
    mali.writeToFile(outfile, format="stockholm",
                     write_ranges=False,
                     options=("#=GF NH (%s:1.0)%s;" % tuple(ids), ))
    outfile.close()

    # select/build the input grammar
    if options.xrate_model == "sn":
        infile = open(XGram.PATH_DATA + "/sn.eg", "r")
        input_model = XGram.Parser.parseGrammar(infile.readlines())
    elif options.xrate_model == "akaksgc":
        infile = open(XGram.PATH_DATA + "/akaksgc.eg", "r")
        input_model = XGram.Parser.parseGrammar(infile.readlines())
    elif options.xrate_model in ("f3x4-two", "f3x4-four",
                                 "f3x4-fourproducts"):
        input_model = Codons.buildCodonML(codon_model=options.xrate_model,
                                          fix_kappa=options.fix_kappa,
                                          fix_omega=options.fix_omega)

    # NOTE(review): "ef3x4-four" is not among the models handled by the
    # chain above, so for that model input_model is built only here;
    # for all other models the else-branch optionally inserts observed
    # frequencies into the already-built grammar. TODO confirm this
    # nesting matches the author's intent.
    if options.xrate_model in ("ef3x4-four", ):

        # derive codon frequencies from per-position nucleotide
        # frequencies of the alignment (product over the 3 positions)
        sequences = getSequencesFromStk(filename)

        frequencies = Codons.getFrequenciesPerCodonPosition(
            sequences.values())

        codon_frequencies = {}

        if options.xrate_insert_frequencies:
            for c1 in ('A', 'C', 'G', 'T'):
                for c2 in ('A', 'C', 'G', 'T'):
                    for c3 in ('A', 'C', 'G', 'T'):
                        codon = "".join((c1, c2, c3))
                        if not Genomics.IsStopCodon(codon):
                            codon_frequencies[codon] = frequencies[0][
                                c1] * frequencies[1][c2] * frequencies[2][c3]

            # renormalize over the 61 sense codons
            total = sum(codon_frequencies.values())
            for k, v in codon_frequencies.items():
                codon_frequencies[k] /= total
        else:
            # uniform frequency over the 61 sense codons
            for c1 in ('A', 'C', 'G', 'T'):
                for c2 in ('A', 'C', 'G', 'T'):
                    for c3 in ('A', 'C', 'G', 'T'):
                        codon = "".join((c1, c2, c3))
                        codon_frequencies[codon] = 1 / 61.0

        input_model = Codons.buildCodonML(codon_model="codons-four",
                                          codon_frequencies=codon_frequencies,
                                          fix_kappa=options.fix_kappa,
                                          fix_omega=options.fix_omega)
    else:
        if options.xrate_insert_frequencies:
            setFrequencies(input_model, filename)

    if options.xrate_fix_frequencies:
        # turn the per-position frequency parameters into constants
        for char in ('a', 'c', 'g', 't'):
            for x in (0, 1, 2):
                param = "p%s%i" % (char, x)
                input_model.mGrammar.moveVariableToConst(param)

    if options.dump:
        options.stdlog.write("## input model:\n%s\n" %
                             input_model.getGrammar())

    writeModel(input_model, "input", options)

    # t1/t2 bracket the training call but are unused in the visible
    # code -- presumably leftover timing instrumentation
    t1 = time.time()

    result = xgram.train(input_model, filename)

    t2 = time.time()

    trained_model = result.getModel()

    if options.dump:
        options.stdlog.write("## trained model:\n%s\n" %
                             trained_model.getGrammar())

    writeModel(trained_model, "trained", options)

    return result, mali, ids
def Load(self, in_sequence):
    """Load sequence properties from a nucleotide sequence.

    Counts amino acids, per-position nucleotides and per-degeneracy
    nucleotide usage, then calls self.Update().

    :param in_sequence: nucleotide sequence; case-insensitive.

    Fix: mNCodons used "/" which yields a float on Python 3; integer
    (floor) division restores the Python 2 semantics.

    NOTE(review): unlike loadSequence, this method does not validate
    that the length is a multiple of 3 -- a trailing partial codon
    raises IndexError when codon[1]/codon[2] is accessed. TODO confirm
    callers guarantee in-frame input.
    """
    # uppercase all letters
    sequence = in_sequence.upper()

    # FIX: "//" -- "/" returns a float on Python 3
    self.mNCodons = len(sequence) // 3

    self.mNStopCodons = 0

    # setup counting arrays
    # counts of amino acids
    self.mCountsAA = {}

    for x in Bio.Alphabet.IUPAC.extended_protein.letters:
        self.mCountsAA[x] = 0

    # nucleotide counts for each position (is not a sum of the counts
    # per degenerate site, as the codon might be intelligible, e.g. GNN).
    self.mCounts = [{'A': 0, 'C': 0, 'G': 0, 'T': 0, 'X': 0, 'N': 0},
                    {'A': 0, 'C': 0, 'G': 0, 'T': 0, 'X': 0, 'N': 0},
                    {'A': 0, 'C': 0, 'G': 0, 'T': 0, 'X': 0, 'N': 0}]

    # nucleotide counts for each position per degeneracy class (0-4)
    self.mCountsDegeneracy = []
    self.mLength = len(sequence)

    for x in (0, 1, 2):
        xx = []
        for y in range(5):
            yy = {}
            for z in Bio.Alphabet.IUPAC.extended_dna.letters:
                yy[z] = 0
            xx.append(yy)
        self.mCountsDegeneracy.append(xx)

    for codon in [sequence[x:x + 3] for x in range(0, len(sequence), 3)]:

        for x in (0, 1, 2):
            self.mCounts[x][codon[x]] += 1

        if Genomics.IsStopCodon(codon):
            self.mNStopCodons += 1
            continue

        try:
            aa, deg1, deg2, deg3 = Genomics.GetDegeneracy(codon)
            degrees = (deg1, deg2, deg3)
            for x in range(len(degrees)):
                self.mCountsDegeneracy[x][degrees[x]][codon[x]] += 1
            self.mCountsAA[aa] += 1
        except KeyError:
            # ambiguous codon not present in the degeneracy table
            pass

    self.Update()
def main(argv=None):
    """Compute per-sequence property tables from FASTA on stdin.

    Legacy variant: counters are referenced as module-level names
    (SequencePropertiesLength etc.) rather than via the
    SequenceProperties module, and the totals row is always written.
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-w", "--filename-weights", dest="filename_weights",
        type="string",
        help=
        "filename with codon frequencies. Multiple filenames can be separated by comma [default=%default]."
    )

    parser.add_option("-s", "--sections", dest="sections", type="choice",
                      action="append",
                      choices=("length", "hid", "na", "aa", "degeneracy",
                               "bias", "codons", "codon-usage",
                               "codon-translator"),
                      help="which sections to output [default=%default]")

    parser.add_option(
        "-t", "--type", dest="seqtype", type="choice",
        choices=("na", "aa"),
        help=
        "type of sequence: na=nucleotides, aa=amino acids [default=%default].")

    parser.add_option(
        "-e", "--regex-identifier", dest="regex_identifier", type="string",
        help=
        "regular expression to extract identifier from fasta description line [default=%default]."
    )

    parser.set_defaults(
        filename_weights="uniform",
        pseudocounts=1,
        sections=[],
        regex_identifier="(.+)",
        seqtype="na",
    )

    (options, args) = E.Start(parser, argv=argv)

    options.filename_weights = options.filename_weights.split(",")

    rx = re.compile(options.regex_identifier)

    # load codon-usage reference tables ("uniform" is synthesized,
    # anything else is read as a two-column codon->frequency map)
    reference_codons = []
    if options.filename_weights:
        for filename in options.filename_weights:
            if filename == "uniform":
                reference_codons.append(Genomics.GetUniformCodonUsage())
            else:
                reference_codons.append(
                    IOTools.ReadMap(open(filename, "r"),
                                    has_header=True,
                                    map_functions=(str, float)))

        # print codon table differences
        E.info("difference between supplied codon usage preferences.")
        for x in range(0, len(reference_codons)):
            for y in range(0, len(reference_codons)):
                if x == y:
                    continue
                # calculate KL distance
                # NOTE(review): the summand is b*log(b/p) -- looks like
                # KL(b || a) rather than KL(a || b); verify intent.
                a = reference_codons[x]
                b = reference_codons[y]
                d = 0
                for codon, p in a.items():
                    if Genomics.IsStopCodon(codon):
                        continue
                    d += b[codon] * math.log(b[codon] / p)

                E.info("tablediff\t%s\t%s\t%f" %
                       (options.filename_weights[x],
                        options.filename_weights[y], d))

    iterator = FastaIterator.FastaIterator(options.stdin)

    def getCounter(section):
        # factory: map a section name to a fresh counter object,
        # dispatching on the configured sequence type
        if options.seqtype == "na":
            if section == "length":
                s = SequencePropertiesLength()
            elif section == "hid":
                s = SequencePropertiesHid()
            elif section == "na":
                s = SequencePropertiesNA()
            elif section == "aa":
                s = SequencePropertiesAA()
            elif section == "degeneracy":
                s = SequencePropertiesDegeneracy()
            elif section == "bias":
                s = SequencePropertiesBias(reference_codons)
            elif section == "codons":
                s = SequencePropertiesCodons()
            elif section == "codon-usage":
                s = SequencePropertiesCodonUsage()
            elif section == "codon-translator":
                s = SequencePropertiesCodonTranslator()
            else:
                raise ValueError("unknown section %s" % section)
        elif options.seqtype == "aa":
            if section == "length":
                s = SequencePropertiesLength()
            elif section == "hid":
                s = SequencePropertiesHid()
            elif section == "aa":
                s = SequencePropertiesAminoAcids()
            else:
                raise ValueError("unknown section %s" % section)
        return s

    # setup totals
    totals = {}
    for section in options.sections:
        totals[section] = getCounter(section)

    options.stdout.write("id")
    for section in options.sections:
        options.stdout.write("\t" + "\t".join(totals[section].getHeaders()))

    options.stdout.write("\n")
    options.stdout.flush()

    for cur_record in iterator:

        sequence = re.sub(" ", "", cur_record.sequence).upper()

        if len(sequence) == 0:
            # NOTE(review): E.warning -- other code in this file calls
            # E.warn; confirm this method exists on the E module.
            E.warning("empty sequence %s" % cur_record.title)
            continue

        id = rx.search(cur_record.title).groups()[0]

        options.stdout.write("%s" % id)
        options.stdout.flush()

        # one counter per section per record; accumulate into totals
        # (loadSequence is called without a seqtype here, relying on
        # the counter's default)
        for section in options.sections:
            s = getCounter(section)
            s.loadSequence(sequence)
            totals[section].addProperties(s)

            options.stdout.write("\t" + "\t".join(s.getFields()))

        options.stdout.write("\n")

    options.stdout.write("total")
    for section in options.sections:
        options.stdout.write("\t" + "\t".join(totals[section].getFields()))
    options.stdout.write("\n")

    E.Stop()