def main(argv=None):
    """Convert a stream of alignment rows on stdin into profiles on stdout.

    Each data line holds four tab-separated fields in the order
    ``start, ali, end, id``. Consecutive lines that share the same id are
    collected into one ``Mali.SequenceCollection``. Whenever the id changes,
    and once more at end of input, the collection is named after the id and
    written to ``sys.stdout`` with ``format="profile"``.

    Lines starting with ``#`` and empty lines are ignored.

    Args:
        argv: command line; defaults to ``sys.argv``. Note that the value is
            not forwarded to ``E.Start`` by this function.

    Raises:
        ValueError: if a data line does not contain exactly four
            tab-separated fields.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: malis2profiles.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.set_defaults()

    (options, args) = E.Start(parser)

    mali = Mali.SequenceCollection()
    last_id = None
    ninput, noutput, nskipped = 0, 0, 0

    for line in sys.stdin:
        if line.startswith("#"):
            continue

        # Strip only the line terminator. Slicing with [:-1] would eat the
        # last character of the id when the final line has no newline.
        record = line.rstrip("\n")
        if not record:
            # a blank line carries no record and cannot be unpacked
            continue

        start, ali, end, seq_id = record.split("\t")
        ninput += 1

        if seq_id != last_id:
            # id changed: flush the finished group and start a new one
            if last_id:
                mali.setName(last_id)
                mali.writeToFile(sys.stdout, format="profile")
                noutput += 1
            mali = Mali.SequenceCollection()
            last_id = seq_id

        mali.addSequence(seq_id, start, end, ali)

    # flush the last group
    if last_id:
        mali.setName(last_id)
        mali.writeToFile(sys.stdout, format="profile")
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i.\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
xrate_min_increment=0.000001, with_rho=True, separator="|", single_omega=False, shared_frequencies=False, shared_rates=False, block_size=None, replicates=None, ) (options, args) = Experiment.Start(parser) if options.replicates != None: # read a sequence collection with possible duplicate names # used for benchmarking mali = Mali.SequenceCollection() else: mali = Mali.Mali() mali.readFromFile(sys.stdin, format=options.input_format) options.stdout.write( "seq1\tseq2\tdN\tdS\tdNdS\tN\tS\tdN_err\tdS_err\tkappa\tlnL\ttau\tlen") if options.with_rho: options.stdout.write("\trN\trS\tt\trN0\trS0\tt0") options.stdout.write("\terror_str\n") if options.replicates != None: ids = mali.getIdentifiers()
def main(argv=None):
    """Script entry point: compute pairwise rates for a multiple alignment.

    Builds the option parser, reads a multiple alignment from ``sys.stdin``
    in ``options.input_format``, writes a tab-separated header line to
    ``options.stdout`` and hands the alignment to ``processMali``.

    With ``--replicates=N`` the input is read as a plain sequence collection
    and split into consecutive chunks of ``len(ids) // N`` entries, each of
    which is processed as its own alignment.

    Args:
        argv: command line; defaults to ``sys.argv``. Note that the value is
            not forwarded to ``E.Start`` by this function.

    Raises:
        ValueError: if the alignment is empty, or if the number of sequences
            is not a multiple of ``--replicates``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: mali2kaks.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("--set-omega", dest="omega", type="float",
                      help="initial omega value.")

    parser.add_option("--set-kappa", dest="kappa", type="float",
                      help="initial kappa value.")

    parser.add_option("--fix-kappa", dest="fix_kappa", action="store_true",
                      help="do not estimate kappa.")

    parser.add_option("--fix-omega", dest="fix_omega", action="store_true",
                      help="do not estimate omega.")

    parser.add_option("--set-codon-frequencies", dest="codon_frequencies",
                      type="choice",
                      choices=("uniform", "fequal", "f3x4", "f1x4", "f61"),
                      help="set codon frequencies.")

    parser.add_option("--set-method", dest="paml_method", type="int",
                      help="set paml optimization method [0|1].")

    parser.add_option("--set-sequence-type", dest="seqtype", type="choice",
                      choices=("codon", "aa", "trans"),
                      help="sequence type.")

    parser.add_option(
        "--set-clean-data", dest="clean_data", type="choice",
        choices=("0", "1"),
        help="PAML should cleanup data: 0=only gaps within pair are removed, "
        "1=columns in the mali with gaps are removed.")

    parser.add_option("--dump", dest="dump", action="store_true",
                      help="dump raw output [%default].")

    parser.add_option("--set-optimization-threshold",
                      dest="optimization_threshold", type="string",
                      help="set paml optimization threshold [%default].")

    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal", "stockholm",
                               "phylip"),
                      help="input format of multiple alignment [%default].")

    parser.add_option("--pairwise", dest="pairwise", action="store_true",
                      help="force pairwise comparison [%default].")

    parser.add_option("--iteration", dest="iteration", type="choice",
                      choices=("all-vs-all", "first-vs-all", "pairwise",
                               "tree"),
                      help="iteration mode [%default].")

    parser.add_option(
        "--no-clean", dest="clean_mali", action="store_false",
        help="do not clean multiple alignment before submitting to codeml. "
        "It might take too long for very large sequences.")

    parser.add_option("--method", dest="method", type="choice",
                      choices=("paml", "xrate"),
                      help="choose method for rate computation [%default]")

    parser.add_option("--xrate-model", dest="xrate_model", type="choice",
                      choices=("f3x4-two", "f3x4-four", "sn", "akaksgc",
                               "ef3x4-four", "f3x4-fourproducts"),
                      help="models to use [%default].")

    parser.add_option("-w", "--write", dest="write", type="choice",
                      action="append",
                      choices=("input_fixed", "trained_fixed",
                               "input_variable", "trained_variable", "all"),
                      help="output sections to write [%default].")

    parser.add_option("-o", "--output-pattern", dest="output_pattern",
                      type="string",
                      help="output pattern for output files [%default].")

    parser.add_option("--xrate-insert-frequencies",
                      dest="xrate_insert_frequencies", action="store_true",
                      help="estimate codon frequencies from input [%default].")

    parser.add_option("--xrate-uniform-frequencies",
                      dest="xrate_insert_frequencies", action="store_false",
                      help="use uniform codon frequencies [%default].")

    parser.add_option("--xrate-fix-frequencies",
                      dest="xrate_fix_frequencies", action="store_true",
                      help="set initial frequencies to const [%default].")

    parser.add_option("--xrate-estimate-frequencies",
                      dest="xrate_fix_frequencies", action="store_false",
                      help="estimate nucleotide frequencies [%default].")

    parser.add_option(
        "--xrate-fix-rates", dest="fix_rates", type="string",
        help="fix rates to specified values. Note that the number of rates "
        "has to match the ones in the model. Provide values in a "
        "comma-separated list [%default].")

    parser.add_option(
        "--xrate-min-increment", dest="xrate_min_increment", type="float",
        help="minimum increment to stop iteration in xrate [%default].")

    parser.add_option(
        "--min-overlap", dest="min_overlap", type="int",
        help="minimum overlap between a sequence pair in residues [%default].")

    parser.add_option(
        "--with-rho", dest="with_rho", action="store_true",
        help="output rho values (substitution rates per codon). This "
        "requires a patched version of PAML [%default].")

    parser.add_option(
        "--with-counts", dest="with_counts", action="store_true",
        help="output counts of aligned positions, transitions and "
        "transversions [%default].")

    parser.add_option("--remove-stops", dest="remove_stops",
                      action="store_true",
                      help="remove stop codons [%default].")

    parser.add_option(
        "--replicates", dest="replicates", type="int",
        help="in benchmarking mode expect ## replicates [%default].")

    parser.add_option("--tree", dest="tree", type="string",
                      help="use tree for estimation [%default].")

    parser.set_defaults(
        input_format="fasta",
        omega=None,
        codon_frequencies=None,
        paml_method=None,
        optimization_threshold=None,
        seqtype="codon",
        dump=False,
        clean_data=False,
        min_overlap=60,
        gap_chars="-.",
        mask_chars="nN",
        pairwise=False,
        kappa=None,
        fix_kappa=False,
        fix_omega=False,
        clean_mali=True,
        method="paml",
        report_step=1000,
        loglevel=1,
        xrate_insert_frequencies=False,
        xrate_fix_frequencies=False,
        write=[],
        output_pattern="%s.eg",
        value_format="%6.4f",
        fix_rates=None,
        xrate_from_parameters=False,
        xrate_model="f3x4-four",
        with_rho=False,
        with_counts=False,
        iteration="all-vs-all",
        remove_stops=False,
        xrate_min_increment=0.000001,
        replicates=None,
        tree=None,
    )

    (options, args) = E.Start(parser)

    if options.method == "xrate":
        # imports for xrate computation; importing here makes a missing
        # XGram/Bio installation fail before any input is read
        from XGram.Generator.Prebuilt import Codons
        from XGram.Model import Annotation
        import XGram.Run
        import Bio.Data.CodonTable

        # paml like estimation using xrate
        if options.codon_frequencies == "uniform":
            options.xrate_fix_frequencies = True
            options.xrate_insert_frequencies = False
        elif options.codon_frequencies == "f3x4":
            options.xrate_fix_frequencies = True
            options.xrate_insert_frequencies = True

    elif options.method == "paml":
        if not options.codon_frequencies:
            options.codon_frequencies = "F3X4"

    if options.fix_rates:
        # materialise as a list: a bare map() is a one-shot iterator on
        # Python 3 and has no length
        options.fix_rates = [float(x) for x in options.fix_rates.split(",")]

    if options.pairwise or options.replicates:
        # read sequences, but not as a multiple alignment. This permits
        # multiple names.
        mali = Mali.SequenceCollection()
    else:
        mali = Mali.Mali()

    mali.readFromFile(sys.stdin, format=options.input_format)

    E.info("read multiple alignment")

    if mali.getLength() == 0:
        # string exceptions are not valid; raise a real exception type
        raise ValueError("refusing to process empty alignment.")

    # write the header of the output table
    options.stdout.write(
        "seq1\tseq2\tdN\tdS\tdNdS\tN\tS\tdN_err\tdS_err\tkappa\tlnL\ttau")

    if options.with_rho:
        options.stdout.write("\trN\trS\tt\trN0\trS0\tt0")

    if options.with_counts:
        options.stdout.write("\t%s" % Genomics.SequencePairInfo().getHeader())

    options.stdout.write("\terror_str\n")

    if options.replicates is not None:
        ids = mali.getIdentifiers()

        if len(ids) % options.replicates != 0:
            raise ValueError(
                "number of sequences (%i) is not a multiple of the number "
                "of replicates (%i)" % (len(ids), options.replicates))

        # integer division: the chunk size is used as a range step and as a
        # slice bound
        chunk_size = len(ids) // options.replicates

        for first in range(0, len(ids), chunk_size):
            replicate = Mali.Mali()
            for identifier in ids[first:first + chunk_size]:
                replicate.addEntry(mali.getEntry(identifier))
            processMali(replicate, options)
    else:
        processMali(mali, options)

    E.Stop()
def main(argv=sys.argv):
    """Script entry point: apply a chain of edits to a multiple alignment.

    Reads a multiple alignment from ``options.stdin``, applies every method
    named in the comma-separated ``--method`` list in order, and writes the
    result to ``options.stdout`` in ``options.output_format``.

    Methods that need arguments take them from the front of the
    comma-separated ``--parameters`` list; consumed entries are removed so
    that the next method sees its own arguments first.

    Method names that match no branch are not rejected; they are only
    reported in the timing log line.

    Args:
        argv: command line. Note that the value is not forwarded to
            ``E.Start`` by this function.

    Raises:
        ValueError: if the alignment read from input is empty.
    """
    parser = E.OptionParser(
        version="%prog version: $Id: mali2mali.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-i", "--input-format", dest="input_format", type="choice",
        choices=("plain", "fasta", "clustal", "stockholm", "phylip"),
        help="input format of multiple alignment [default=%default].")

    parser.add_option(
        "-o", "--output-format", dest="output_format", type="choice",
        choices=("plain", "fasta", "stockholm", "phylip", "nexus",
                 "plain-fasta"),
        help="output format of multiple alignment [default=%default].")

    parser.add_option(
        "--with-ranges", dest="with_ranges", action="store_true",
        help="output alignment ranges (suffix /from-to after identifier) "
        "[default=%default].")

    parser.add_option(
        "--without-ranges", dest="with_ranges", action="store_false",
        help="do not output alignment ranges (suffix /from-to after "
        "identifier) [default=%default].")

    parser.add_option("-u", "--allow-duplicates", dest="allow_duplicates",
                      action="store_true",
                      help="permit duplicate entries [default=%default].")

    parser.add_option(
        "-m", "--method", dest="methods", type="string",
        help="methods to apply. Several methods can be specified in a "
        "','-separated list [default=%default].")

    parser.add_option(
        "-p", "--parameters", dest="parameters", type="string",
        help="parameter stack for methods that require one "
        "[default=%default].")

    parser.add_option(
        "-a", "--mask-char", dest="mask_char", type="string",
        help="character to identify/set masked characters "
        "[default=%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        methods="",
        parameters="",
        mask_char="x",
        gap_chars="-.nN",
        with_ranges=True,
        allow_duplicates=False,
    )

    (options, args) = E.Start(parser)

    options.methods = options.methods.split(",")
    options.parameters = options.parameters.split(",")

    # 1. read multiple alignment in various formats
    if options.allow_duplicates:
        mali = Mali.SequenceCollection()
    else:
        mali = Mali.Mali()

    t1 = time.time()

    mali.readFromFile(options.stdin, format=options.input_format)

    E.info("read mali with %i entries in %i seconds." %
           (len(mali), time.time() - t1))

    if len(mali) == 0:
        raise ValueError("empty multiple alignment")

    # 2. apply the requested methods in the order given
    for method in options.methods:

        t1 = time.time()

        if method == "remove-unaligned-ends":
            mali.removeUnalignedEnds()

        elif method == "remove-end-gaps":
            mali.removeEndGaps()

        elif method == "remove-all-gaps":
            mali.removeGaps(minimum_gaps=len(mali))

        elif method == "remove-any-gaps":
            mali.removeGaps(minimum_gaps=1)

        elif method == "remove-some-gaps":
            minimum_gaps = int(options.parameters[0])
            del options.parameters[0]
            mali.removeGaps(minimum_gaps=minimum_gaps)

        elif method == "remove-empty-sequences":
            mali.removeEmptySequences()

        elif method == "upper":
            mali.upperCase()

        elif method == "lower":
            mali.lowerCase()

        elif method == "mark-codons":
            mali.markCodons()

        elif method == "remove-stops":
            mali.removePattern(
                lambda x: x.upper() in ("TAG", "TAA", "TGA"),
                allowed_matches=0,
                minimum_matches=1,
                delete_frame=3,
                search_frame=3)

        elif method == "shift-alignment":
            with open(options.parameters[0], "r") as infile:
                map_id2offset = IOTools.ReadMap(infile,
                                                map_functions=(str, int))
            del options.parameters[0]
            mali.shiftAlignment(map_id2offset)

        elif method == "propagate-masks":
            mali.propagateMasks(mask_char=options.mask_char)

        elif method == "recount":
            mali.recount()

        elif method in ("mark-transitions",
                        "filter-odd-transitions",
                        "filter-even-transitions",
                        "keep-even-segments",
                        "keep-odd-segments"):

            # The parameter is either a file mapping ids to transitions or
            # an inline ':'-separated list of positions that is stored
            # under the key "mali".
            if os.path.exists(options.parameters[0]):
                with open(options.parameters[0], "r") as infile:
                    map_id2transitions = IOTools.readMultiMap(
                        infile, map_functions=(str, int))
            else:
                # sorted() returns a list on both Python 2 and 3, whereas
                # map(...).sort() fails on Python 3
                map_id2transitions = {
                    "mali": sorted(
                        int(x) for x in options.parameters[0].split(':')),
                }

            del options.parameters[0]

            if method == "mark-transitions":
                mali.markTransitions(map_id2transitions)
            elif method in ("filter-odd-transitions", "keep-even-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-odd")
            elif method in ("filter-even-transitions", "keep-odd-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-even")

        elif method == "propagate-transitions":
            mali.propagateTransitions()

        elif method == "map-annotation":
            # map annotations in one mali (stockholm-format) to the
            # annotations in another.
            # Note: the first two sequence identifiers must be shared and
            # the sequence of the same length
            other_mali = Mali.Mali()
            with open(options.parameters[0], "r") as infile:
                other_mali.readFromFile(infile, format="stockholm")
            del options.parameters[0]
            mali.copyAnnotations(other_mali)

        elif method == "add-annotation":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            AddAnnotation(mali, annotation_type, annotation_file)

        elif method == "mask-columns":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            maskColumns(mali, annotation_type, annotation_file)

        elif method == "remove-unaligned-pairs":
            removeUnalignedPairs(mali, options)

        elif method == "filter-3rd":
            filterMali(mali, "3rd")

        elif method == "filter-4d":
            filterMali(mali, "4d")

        elif method in ("mask-seg", "mask-bias"):
            # the masker name is the part after the dash
            a, b = method.split("-")
            maskMali(mali, b)

        elif method == "exclude-with-stop":
            mali.filter(method="with-stop")

        elif method == "exclude-with-frameshift":
            # This branch used to repeat the "exclude-with-stop" test and
            # could therefore never be reached.
            mali.filter(method="with-frameshift")

        E.info("applied method %s in %i seconds." %
               (method, time.time() - t1))

    mali.writeToFile(options.stdout,
                     format=options.output_format,
                     write_ranges=options.with_ranges)

    E.Stop()
def main(argv=None):
    """Script entry point: compute pairwise distances for a multiple alignment.

    Reads a multiple alignment from ``--file`` (``-`` means ``sys.stdin``),
    optionally auto-detects the alphabet, builds the list of sequence pairs
    according to ``options.iteration`` and computes a distance per pair.

    Nucleotide alignments are handed to ``runBaseML``, ``runDNADIST`` or
    ``runXrate`` depending on ``--method`` and ``--distance``; otherwise the
    T92, JC69, PID and POVL distances are computed in this function. For
    amino acid alignments JTT, PMB, PAM, Kimura and CategoriesModel are run
    through ``WrapperPhylip`` (program ``protdist``), while PID and POVL are
    computed in this function.

    Args:
        argv: command line; defaults to ``sys.argv``. Note that the value is
            not forwarded to ``E.Start`` by this function.

    Raises:
        ValueError: for an odd number of sequences in pairwise iteration, an
            unknown ``--sites`` value, or a distance that the selected code
            path does not implement.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: mali2rates.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal", "stockholm",
                               "phylip"),
                      help="input format of multiple alignment")

    parser.add_option("-s", "--sites", dest="sites", type="string",
                      help="sites to use [default=%default].")

    parser.add_option(
        "-f", "--file", dest="filename", type="string",
        help="filename of multiple alignment (- for stdin) "
        "[default=%default].",
        metavar="FILE")

    parser.add_option("-o", "--format", dest="format", type="string",
                      help="format [default=%default].",
                      metavar="format")

    parser.add_option(
        "-d", "--distance", dest="distance", type="choice",
        choices=("PID", "T92", "JC69", "POVL", "F84", "LogDet", "K80", "F81",
                 "HKY85", "TN93", "REV", "UNREST", "REVU", "UNRESTU", "JTT",
                 "PMB", "PAM", "Kimura", "CategoriesModel"),
        help="method to use for distance calculation [default=%default].")

    parser.add_option("--method", dest="method", type="choice",
                      choices=("phylip", "baseml", "own", "xrate"),
                      help="program to use for rate calculation.")

    parser.add_option("--output-format", dest="output_format", type="choice",
                      choices=("list", "tree"),
                      help="output format.")

    parser.add_option(
        "-m", "--min-sites", dest="min_sites", type="int",
        help="minimum number of sites for output[default=%default].")

    parser.add_option("-a", "--alphabet", dest="alphabet", type="choice",
                      choices=("aa", "na", "auto"),
                      help="alphabet to use.")

    parser.add_option("-t", "--filename-tree", dest="filename_tree",
                      type="string",
                      help="filename with tree information.")

    parser.add_option("--set-alpha", dest="alpha", type="float",
                      help="initial alpha value.")

    parser.add_option("--fix-alpha", dest="fix_alpha", action="store_true",
                      help="do not estimate alpha.")

    parser.add_option("--set-kappa", dest="kappa", type="float",
                      help="initial kappa value.")

    parser.add_option("--fix-kappa", dest="fix_kappa", action="store_true",
                      help="do not estimate kappa.")

    parser.add_option("--dump", dest="dump", action="store_true",
                      help="dump output.")

    parser.add_option("--test", dest="test", action="store_true",
                      help="test run - does not clean up.")

    parser.add_option("--pairwise", dest="pairwise", action="store_true",
                      help="force pairwise comparison.")

    parser.add_option(
        "--set-clean-data", dest="clean_data", type="choice",
        choices=("0", "1"),
        help="PAML should cleanup data: 0=only gaps within pair are removed, "
        "1=columns in the mali with gaps are removed.")

    parser.add_option(
        "--with-counts", dest="with_counts", action="store_true",
        help="output counts of aligned positions, transitions and "
        "transversions.")

    parser.add_option("-w", "--write", dest="write", type="choice",
                      action="append",
                      choices=("input", "trained", "all"),
                      help="output sections to write for xrate.")

    parser.add_option("--output-pattern", dest="output_pattern",
                      type="string",
                      help="output pattern for output files.")

    parser.add_option("--xrate-min-increment", dest="xrate_min_increment",
                      type="float",
                      help="minimum increment to stop iteration in xrate.")

    parser.set_defaults(
        input_format="fasta",
        filename_tree=None,
        with_counts=False,
        sites="d4",
        distance="T92",
        min_sites=1,
        filename="-",
        alphabet="auto",
        format="%6.4f",
        method="phylip",
        kappa=None,
        fix_kappa=False,
        alpha=None,
        fix_alpha=False,
        dump=False,
        clean_data=None,
        output_format="list",
        iteration="all-vs-all",
        pairwise=False,
        report_step=1000,
        output_pattern="%s.eg",
        write=[],
        test_xrate=False,
        xrate_min_increment=None,
        is_codons=False,
    )

    (options, args) = E.Start(parser)

    if options.filename != "-":
        infile = open(options.filename, "r")
    else:
        infile = sys.stdin

    # read multiple alignment
    if options.pairwise:
        # read sequences, but not as a multiple alignment. This permits
        # multiple names.
        mali = Mali.SequenceCollection()
        options.iteration = "pairwise"
    else:
        mali = Mali.Mali()

    mali.readFromFile(infile, format=options.input_format)

    ids = mali.getIdentifiers()

    if options.alphabet == "auto":
        # Treat the alignment as nucleotides if fewer than 10% of its
        # characters remain after removing a, c, g, t, x and n.
        s = "".join(map(lambda x: x.mString, mali.values())).lower()
        ss = re.sub("[acgtxn]", "", s)
        if float(len(ss)) < (len(s) * 0.1):
            options.alphabet = "na"
            if mali.getNumColumns() % 3 == 0:
                options.is_codons = True
        else:
            options.alphabet = "aa"

        if options.loglevel >= 1:
            options.stdlog.write("# autodetected alphabet: %s\n" %
                                 options.alphabet)

    if options.filename != "-":
        infile.close()

    npairs = 0
    nskipped_length = 0
    nskipped_distance = 0

    # build the list of index pairs to compare
    pairs = []
    if options.iteration == "all-vs-all":
        for x in range(len(ids) - 1):
            for y in range(x + 1, len(ids)):
                pairs.append((x, y))
    elif options.iteration == "first-vs-all":
        for y in range(1, len(ids)):
            pairs.append((0, y))
    elif options.iteration == "pairwise":
        if len(ids) % 2 != 0:
            # string exceptions are not valid; raise a real exception type
            raise ValueError(
                "uneven number of sequences (%i) not compatible with --iteration=pairwise"
                % len(ids))
        for x in range(0, len(ids), 2):
            pairs.append((x, x + 1))

    if options.alphabet == "na":

        if options.method == "baseml":
            runBaseML(mali, pairs, options)
        elif options.method == "phylip" and \
                options.distance in ("F84", "K80", "JC69", "LogDet"):
            runDNADIST(mali, pairs, options)
        elif options.method == "xrate":
            runXrate(mali, pairs, options)
        else:
            if options.is_codons:
                h = Genomics.SequencePairInfoCodons().getHeader()
            else:
                h = Genomics.SequencePairInfo().getHeader()

            options.stdout.write("seq1\tseq2\tdist\tvar\t%s\n" % (h))

            for x, y in pairs:
                id_x = ids[x]
                npairs += 1
                id_y = ids[y]

                info = Genomics.CalculatePairIndices(
                    mali[id_x], mali[id_y], with_codons=options.is_codons)

                if options.distance in ("T92", "JC69"):
                    if options.sites == "d4":
                        seq1, seq2 = Genomics.GetDegenerateSites(
                            mali[id_x], mali[id_y],
                            position=3,
                            degeneracy=4)

                        if len(seq1) < options.min_sites:
                            nskipped_length += 1
                            continue
                    else:
                        raise ValueError("unknown sites %s" % options.sites)

                if options.distance == "T92":
                    distance, variance = CalculateDistanceT92(info)
                elif options.distance == "JC69":
                    distance, variance = CalculateDistanceJC69(info)
                elif options.distance == "PID":
                    distance, variance = CalculateDistancePID(
                        mali[id_x], mali[id_y])
                elif options.distance == "POVL":
                    distance, variance = CalculateDistancePOVL(
                        mali[id_x], mali[id_y])
                else:
                    # without this branch the code below fails on an
                    # unbound ``distance`` variable
                    raise ValueError(
                        "distance %s is not supported for nucleotide "
                        "alignments with method %s" %
                        (options.distance, options.method))

                # negative distances mark pairs that cannot be reported
                if distance >= 0:
                    options.stdout.write("\t".join(
                        map(str, (id_x, id_y,
                                  options.format % distance,
                                  options.format % variance,
                                  info))) + "\n")
                else:
                    nskipped_distance += 1

    elif options.alphabet == "aa":

        if options.distance in ("JTT", "PMB", "PAM", "Kimura",
                                "CategoriesModel"):

            # use phylip for these
            phylip = WrapperPhylip.Phylip()
            phylip.setProgram("protdist")
            phylip.setMali(mali)

            # Each "D" sent to the program advances the model selection by
            # one step from its initial setting.
            phylip_options = []
            if options.distance == "PMB":
                # was misspelled "PMG", so PMB never received its "D"
                phylip_options += ["D"] * 1
            elif options.distance == "PAM":
                phylip_options += ["D"] * 2
            elif options.distance == "Kimura":
                phylip_options += ["D"] * 3
            elif options.distance == "CategoriesModel":
                phylip_options += ["D"] * 4

            phylip_options.append("Y")
            phylip.setOptions(phylip_options)
            result = phylip.run()

            writePhylipResult(result, options)

        else:
            options.stdout.write("id1\tid2\tdist\tvar\n")

            # iterate over all pairs of sequences
            for x, y in pairs:
                id_x = ids[x]
                npairs += 1
                id_y = ids[y]

                if options.distance == "PID":
                    distance, variance = CalculateDistancePID(
                        mali[id_x], mali[id_y])
                elif options.distance == "POVL":
                    # percentage overlap
                    distance, variance = CalculateDistancePOVL(
                        mali[id_x], mali[id_y])
                else:
                    # without this branch the code below fails on an
                    # unbound ``distance`` variable
                    raise ValueError(
                        "distance %s is not supported for amino acid "
                        "alignments" % options.distance)

                if distance >= 0:
                    options.stdout.write("\t".join(
                        (id_x, id_y,
                         options.format % distance,
                         options.format % variance)) + "\n")
                else:
                    nskipped_distance += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# nseqs=%i, npairs=%i, nskipped_length=%i, nskipped_distance=%i\n"
            % (len(ids), npairs, nskipped_length, nskipped_distance))

    E.Stop()
def main(argv=sys.argv):
    """Script entry point: print per-column properties of a multiple alignment.

    Reads a multiple alignment from ``options.stdin`` and writes one
    tab-separated row per alignment column to ``options.stdout``. The first
    field is the 0-based column index; the remaining fields come from one
    freshly created property counter per requested ``--sections`` entry.

    Raises:
        ValueError: if no section was requested or a section name is unknown.
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-i", "--input-format",
        dest="input_format",
        type="choice",
        choices=("plain", "fasta", "clustal", "stockholm"),
        help="input format of multiple alignment")

    parser.add_option(
        "-a", "--alphabet",
        dest="alphabet",
        type="choice",
        choices=("aa", "na"),
        help="alphabet to use [default=%default].")

    parser.add_option(
        "-s", "--sections",
        dest="sections",
        type="choice",
        action="append",
        choices=("length", "composition", "entropy", "all"),
        help="which sections to output")

    parser.add_option(
        "-u", "--allow-duplicates",
        dest="allow_duplicates",
        action="store_true",
        help="permit duplicate entries [default=%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        mask_chars="nN",
        gap_chars="-.",
        alphabet="na",
        sections=[],
        allow_duplicates=False,
    )

    (options, args) = E.Start(parser)

    if not options.sections:
        raise ValueError("please supply at least one method.")

    # "all" expands to every individual section
    if "all" in options.sections:
        options.sections = ["length", "composition", "entropy"]

    def getCounter(section):
        """Return a new property counter for *section* and the alphabet."""
        # The factories are lambdas so that a counter class is only looked
        # up and instantiated when its section is actually requested.
        if options.alphabet == "na":
            factories = {
                "length": lambda: SequencePropertiesLength(),
                "composition": lambda: SequencePropertiesNA(),
                "entropy": lambda: SequencePropertiesEntropy("ACGT"),
            }
        elif options.alphabet == "aa":
            factories = {
                "length": lambda: SequencePropertiesLength(),
                "composition": lambda: SequencePropertiesAminoAcids(),
                "entropy": lambda: SequencePropertiesEntropy(
                    "ACDEFGHIKLMNPQRSTVWY"),
            }

        if section not in factories:
            raise ValueError("unknown section %s" % section)
        return factories[section]()

    # read multiple alignment in various formats
    mali = Mali.SequenceCollection() if options.allow_duplicates \
        else Mali.Mali()
    mali.readFromFile(options.stdin, format=options.input_format)

    out = options.stdout

    # header line; do not use column, as it is a reserved word in sql
    out.write("col")
    for name in options.sections:
        headers = getCounter(name).getHeaders()
        out.write("\t" + "\t".join(headers))
    out.write("\n")

    tally = E.Counter()

    for index, residues in enumerate(mali.getColumns()):
        tally.input += 1
        column_string = "".join(residues)

        out.write("%i" % index)
        for name in options.sections:
            # a new counter per column and section, so no state carries over
            properties = getCounter(name)
            properties.loadSequence(column_string)
            out.write("\t" + "\t".join(properties.getFields()))
        out.write("\n")

        tally.output += 1

    E.info("%s" % str(tally))
    E.Stop()