Exemple #1
0
def runDNADIST(mali, pairs, options):
    """run dnadist."""
    # use phylip for these
    phylip = WrapperPhylip.Phylip()
    phylip.setProgram("dnadist")
    phylip.setMali(mali)

    phylip_options = []
    if options.distance == "K80":
        phylip_options += ["D"] * 1
    elif options.distance == "JC69":
        phylip_options += ["D"] * 2
    elif options.distance == "LogDet":
        phylip_options += ["D"] * 3

    phylip_options.append("Y")
    phylip.setOptions(phylip_options)

    if options.dump:
        phylip.setLogLevel(2)

    result = phylip.run()

    writePhylipResult(result, options)
Exemple #2
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: trees2tree.py 2782 2009-09-10 11:40:29Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("counts", "min", "max", "sum", "mean", "median", "stddev", "non-redundant", "consensus",
                               "select-largest"),
                      help="aggregation function.")

    parser.add_option("-r", "--regex-id", dest="regex_id", type="string",
                      help="regex pattern to extract identifier from tree name for the selection functions.")

    parser.add_option("-w", "--write-values", dest="write_values", type="string",
                      help="if processing multiple trees, write values to file.")

    parser.add_option("-e", "--error-branchlength", dest="error_branchlength", type="float",
                      help="set branch length without counts to this value.")

    parser.set_defaults(
        method="mean",
        regex_id=None,
        filtered_branch_lengths=(-999.0, 999.0),
        write_values = None,
        error_branchlength = None,
        separator=":",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.loglevel >= 2:
        options.stdlog.write("# reading trees from stdin.\n")
        options.stdlog.flush()

    nexus = TreeTools.Newick2Nexus(sys.stdin)
    if options.loglevel >= 1:
        options.stdlog.write(
            "# read %i trees from stdin.\n" % len(nexus.trees))

    nskipped = 0
    ninput = len(nexus.trees)
    noutput = 0
    nerrors = 0

    if options.method == "non-redundant":
        # compute non-redudant trees
        template_trees = []
        template_counts = []
        ntree = 0
        for tree in nexus.trees:

            for x in range(0, len(template_trees)):
                is_compatible, reason = TreeTools.IsCompatible(
                    tree, template_trees[x])
                if is_compatible:
                    template_counts[x] += 1
                    break
            else:
                template_counts.append(1)
                template_trees.append(tree)

            if options.loglevel >= 2:
                options.stdlog.write(
                    "# tree=%i, ntemplates=%i\n" % (ntree, len(template_trees)))

            ntree += 1

        for x in range(0, len(template_trees)):
            if options.loglevel >= 1:
                options.stdlog.write("# tree: %i, counts: %i, percent=%5.2f\n" %
                                     (x, template_counts[x], template_counts[x] * 100.0 / ntotal))
            options.stdout.write(
                TreeTools.Tree2Newick(template_trees[x]) + "\n")

    elif options.method in ("select-largest",):
        # select one of the trees with the same name.
        clusters = {}
        for x in range(0, len(nexus.trees)):
            n = nexus.trees[x].name

            if options.regex_id:
                n = re.search(options.regex_id, n).groups()[0]

            if n not in clusters:
                clusters[n] = []
            clusters[n].append(x)

        new_trees = []

        for name, cluster in clusters.items():
            new_trees.append(
                getBestTree([nexus.trees[x] for x in cluster], options.method))

        for x in range(0, len(new_trees)):
            options.stdout.write(">%s\n" % new_trees[x].name)
            options.stdout.write(TreeTools.Tree2Newick(new_trees[x],) + "\n")
            noutput += 1

        nskipped = ntotal - noutput

    elif options.method == "consensus":

        phylip = WrapperPhylip.Phylip()
        phylip.setLogLevel(options.loglevel - 2)
        phylip.setProgram("consense")
        phylip_options = []
        phylip_options.append("Y")

        phylip.setOptions(phylip_options)
        phylip.setTrees(nexus.trees)

        result = phylip.run()

        options.stdout.write(
            "# consensus tree built from %i trees\n" % (phylip.mNInputTrees))
        options.stdout.write(
            TreeTools.Tree2Newick(result.mNexus.trees[0]) + "\n")
        noutput = 1

    else:
        if options.method in ("min", "max", "sum", "mean", "counts"):

            xtree = nexus.trees[0]
            for n in xtree.chain.keys():
                if xtree.node(n).data.branchlength in options.filtered_branch_lengths:
                    xtree.node(n).data.branchlength = 0
                ntotals = [1] * len(xtree.chain.keys())

            if options.method == "min":
                f = min
            elif options.method == "max":
                f = max
            elif options.method == "sum":
                f = lambda x, y: x + y
            elif options.method == "mean":
                f = lambda x, y: x + y
            elif options.method == "counts":
                f = lambda x, y: x + 1
                for n in xtree.chain.keys():
                    if xtree.node(n).data.branchlength not in options.filtered_branch_lengths:
                        xtree.node(n).data.branchlength = 1
                    else:
                        xtree.node(n).data.branchlength = 0
            else:
                raise "unknown option %s" % options.method

            for tree in nexus.trees[1:]:

                for n in tree.chain.keys():
                    if tree.node(n).data.branchlength not in options.filtered_branch_lengths:
                        xtree.node(n).data.branchlength = f(
                            xtree.node(n).data.branchlength, tree.node(n).data.branchlength)
                        ntotals[n] += 1

            if options.method == "mean":
                for n in xtree.chain.keys():
                    if ntotals[n] > 0:
                        xtree.node(n).data.branchlength = float(
                            xtree.node(n).data.branchlength) / ntotals[n]
                    else:
                        if options.error_branchlength is not None:
                            xtree.node(
                                n).data.branchlength = options.error_branchlength
                            if options.loglevel >= 1:
                                options.stdlog.write(
                                    "# no counts for node %i - set to %f\n" % (n, options.error_branchlength))
                                nerrors += 1
                        else:
                            raise "no counts for node %i" % n

        else:
            # collect all values for trees
            values = [[] for x in range(TreeTools.GetSize(nexus.trees[0]))]

            for tree in nexus.trees:
                for n, node in tree.chain.items():
                    if node.data.branchlength not in options.filtered_branch_lengths:
                        values[n].append(node.data.branchlength)

            tree = nexus.trees[0]
            for n, node in tree.chain.items():
                if len(values[n]) > 0:
                    if options.method == "stddev":
                        node.data.branchlength = scipy.std(values[n])
                    elif options.method == "median":
                        node.data.branchlength = scipy.median(values[n])
                else:
                    if options.error_branchlength is not None:
                        node.data.branchlength = options.error_branchlength
                        if options.loglevel >= 1:
                            options.stdlog.write(
                                "# no counts for node %i - set to %f\n" % (n, options.error_branchlength))
                            nerrors += 1
                    else:
                        raise "no counts for node %i" % n

            if options.write_values:
                outfile = open(options.write_values, "w")
                for n, node in tree.chain.items():
                    values[n].sort()
                    id = options.separator.join(
                        sorted(TreeTools.GetLeaves(tree, n)))
                    outfile.write("%s\t%s\n" %
                                  (id, ";".join(map(str, values[n]))))
                outfile.close()

        del nexus.trees[1:]
        options.stdout.write(TreeTools.Nexus2Newick(nexus) + "\n")
        noutput = 1

    if options.loglevel >= 1:
        options.stdlog.write("# ntotal=%i, nskipped=%i, noutput=%i, nerrors=%i\n" % (
            ninput, nskipped, noutput, nerrors))

    E.Stop()
Exemple #3
0
def Process(lines, other_trees, options, map_old2new, ntree):

    nexus = TreeTools.Newick2Nexus(map(lambda x: x[:-1], lines))

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees.\n" % len(nexus.trees))

    nskipped = 0
    ntotal = len(nexus.trees)
    extract_pattern = None
    species2remove = None
    write_map = False

    phylip_executable = None
    phylip_options = None

    index = 0

    # default: do not output internal node names
    write_all_taxa = False

    for tree in nexus.trees:

        if options.outgroup:
            tree.root_with_outgroup(options.outgroup)

        for method in options.methods:

            if options.loglevel >= 3:
                options.stdlog.write("# applying method %s to tree %i.\n" %
                                     (method, index))

            if method == "midpoint-root":
                tree.root_midpoint()

            elif method == "balanced-root":
                tree.root_balanced()

            elif method == "unroot":
                TreeTools.Unroot(tree)

            elif method == "phylip":
                if not phylip_executable:
                    phylip_executable = options.parameters[0]
                    del options.parameters[0]
                    phylip_options = re.split("@", options.parameters[0])
                    del options.parameters[0]

                    phylip = WrapperPhylip.Phylip()
                    phylip.setProgram(phylip_executable)
                    phylip.setOptions(phylip_options)

                phylip.setTree(tree)

                result = phylip.run()

                nexus.trees[index] = result.mNexus.trees[0]

            elif method == "normalize":
                if options.value == 0:
                    v = 0
                    for n in tree.chain.keys():
                        v = max(v, tree.node(n).data.branchlength)
                else:
                    v = options.value

                for n in tree.chain.keys():
                    tree.node(n).data.branchlength /= float(options.value)

            elif method == "divide-by-tree":

                if len(other_trees) > 1:
                    other_tree = other_trees[ntree]
                else:
                    other_tree = other_trees[0]

                # the trees have to be exactly the same!!
                if options.loglevel >= 2:
                    print tree.display()
                    print other_tree.display()

                if not tree.is_identical(other_tree):
                    nskipped += 1
                    continue

                # even if the trees are the same (in topology), the node numbering might not be
                # the same. Thus build a map of node ids.
                map_a2b = TreeTools.GetNodeMap(tree, other_tree)

                for n in tree.chain.keys():
                    try:
                        tree.node(n).data.branchlength /= float(
                            other_tree.node(map_a2b[n]).data.branchlength)
                    except ZeroDivisionError:
                        options.stdlog.write(
                            "# Warning: branch for nodes %i and %i in tree-pair %i: divide by zero\n"
                            % (n, map_a2b[n], ntree))
                        continue

            elif method == "rename":
                if not map_old2new:

                    map_old2new = IOTools.ReadMap(open(options.parameters[0],
                                                       "r"),
                                                  columns=(0, 1))

                    if options.invert_map:
                        map_old2new = IOTools.getInvertedDictionary(
                            map_old2new, make_unique=True)

                    del options.parameters[0]

                unknown = []
                for n, node in tree.chain.items():
                    if node.data.taxon:
                        try:
                            node.data.taxon = map_old2new[node.data.taxon]
                        except KeyError:
                            unknown.append(node.data.taxon)

                for taxon in unknown:
                    tree.prune(taxon)

            # reformat terminals
            elif method == "extract-with-pattern":

                if not extract_pattern:
                    extract_pattern = re.compile(options.parameters[0])
                    del options.parameters[0]

                for n in tree.get_terminals():
                    node = tree.node(n)
                    node.data.taxon = extract_pattern.search(
                        node.data.taxon).groups()[0]

            elif method == "set-uniform-branchlength":
                for n in tree.chain.keys():
                    tree.node(n).data.branchlength = options.value

            elif method == "build-map":
                # build a map of identifiers
                options.write_map = True
                for n in tree.get_terminals():
                    node = tree.node(n)
                    if node.data.taxon not in map_old2new:
                        new = options.template_identifier % (len(map_old2new) +
                                                             1)
                        map_old2new[node.data.taxon] = new
                    node.data.taxon = map_old2new[node.data.taxon]

            elif method == "remove-pattern":
                if species2remove is None:
                    species2remove = re.compile(options.parameters[0])
                    del options.parameters
                taxa = []
                for n in tree.get_terminals():
                    t = tree.node(n).data.taxon
                    skip = False
                    if species2remove.search(t):
                        continue
                    if not skip:
                        taxa.append(t)
                TreeTools.PruneTree(tree, taxa)

            elif method == "add-node-names":

                inode = 0
                write_all_taxa = True
                for n, node in tree.chain.items():
                    if not node.data.taxon:
                        node.data.taxon = "inode%i" % inode
                        inode += 1

            elif method == "newick2nhx":
                # convert names to species names
                for n in tree.get_terminals():
                    t = tree.node(n).data.taxon
                    d = t.split("|")
                    if len(d) >= 2:
                        tree.node(n).data.species = d[0]

        index += 1
        ntree += 1

    if options.output_format == "nh":
        options.stdout.write(
            TreeTools.Nexus2Newick(
                nexus,
                write_all_taxa=True,
                with_branchlengths=options.with_branchlengths) + "\n")
    else:
        for tree in nexus.trees:
            tree.writeToFile(options.stdout, format=options.output_format)

    return ntotal, nskipped, ntree
Exemple #4
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: matrix2tree.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-i",
                      "--invert-map",
                      dest="invert_map",
                      action="store_true",
                      help="""invert map.""")

    parser.add_option("--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("phylip", "full"),
                      help="""input format.""")

    parser.add_option("-t",
                      "--filename-tree",
                      dest="filename_tree",
                      type="string",
                      help="""filename with tree to fit.""")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("nj", "kitsch", "fitch"),
                      help="""algorithm to run.""")

    parser.add_option("-e",
                      "--replicates",
                      dest="replicates",
                      action="store_true",
                      help="replicates.")

    parser.add_option("-r",
                      "--root",
                      dest="root",
                      action="store_true",
                      help="midpoint root (if it is not rooted).")

    parser.add_option("-u",
                      "--unroot",
                      dest="unroot",
                      action="store_true",
                      help="unroot tree (if it is rooted).")

    parser.add_option("--skip-separators",
                      dest="write_separators",
                      action="store_false",
                      help="do not echo separators (starting with >)")

    #    parser.add_option("-i", "--iterations", dest="iterations", type="int",
    #                      help="number of iterations." )

    parser.add_option("-p",
                      "--power",
                      dest="power",
                      type="float",
                      help="power.")

    parser.add_option(
        "--prune-tree",
        dest="prune_tree",
        action="store_true",
        help=
        "prune tree such to include only taxa which are part of the input matrix."
    )

    parser.add_option(
        "--add-random",
        dest="add_random",
        action="store_true",
        help="add small random value to off-diagonal zero elements in matrix.")

    parser.add_option(
        "--pseudo-replicates",
        dest="pseudo_replicates",
        action="store_true",
        help=
        "add small random value to off-diagonal zero elements in matrix, even if they have no replicates."
    )

    parser.add_option("--debug",
                      dest="debug",
                      action="store_true",
                      help="dump debug information.")

    parser.set_defaults(
        value=0,
        method="nj",
        input_format="phylip",
        filename_tree=None,
        outgroup=None,
        replicates=False,
        root=False,
        unroot=False,
        power=0,
        write_separators=True,
        prune_tree=False,
        add_random=False,
        debug=False,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    phylip = WrapperPhylip.Phylip()

    if options.debug:
        phylip.setLogLevel(options.loglevel)

    phylip.setPruneTree(options.prune_tree)

    lines = filter(lambda x: x[0] != "#", sys.stdin.readlines())

    chunks = filter(lambda x: lines[x][0] == ">", range(len(lines)))

    if not chunks:
        options.write_separators = False
        chunks = [-1]

    chunks.append(len(lines))

    for x in range(len(chunks) - 1):

        matrix = lines[chunks[x] + 1:chunks[x + 1]]

        # parse phylip matrix
        if options.add_random:
            mm = []
            ids = []
            for l in range(1, len(matrix)):
                values = re.split("\s+", matrix[l][:-1])
                ids.append(values[0])
                mm.append(map(lambda x: x.strip(), values[1:]))

            d = len(mm)
            if options.replicates:
                for row in range(d - 1):
                    for col in range(row + 1, d):
                        cc = col * 2
                        rr = row * 2
                        if mm[row][cc] == "0" and mm[row][cc + 1] != "0":
                            mm[row][cc + 1] = "1"
                            mm[col][rr + 1] = "1"
                            v = str(random.random() / 10000.0)
                            mm[row][cc] = v
                            mm[col][rr] = v

            else:
                for row in range(d - 1):
                    for col in range(row + 1, d):
                        if mm[row][col] == "0":
                            v = str(random.random() / 10000.0)
                            mm[row][col] = v
                            mm[col][row] = v

            matrix = ["%i\n" % d]
            for row in range(d):
                matrix.append(ids[row] + "    " + "    ".join(mm[row]) + "\n")

        # parse phylip matrix
        if options.pseudo_replicates:
            mm = []
            ids = []
            for l in range(1, len(matrix)):
                values = re.split("\s+", matrix[l][:-1])
                ids.append(values[0])
                mm.append(map(lambda x: x.strip(), values[1:]))

            d = len(mm)
            if options.replicates:
                for row in range(d - 1):
                    for col in range(row + 1, d):
                        cc = col * 2
                        rr = row * 2
                        if mm[row][cc + 1] == "0":
                            mm[row][cc + 1] = "1"
                            mm[col][rr + 1] = "1"
                            v = str(random.random() / 10000.0)
                            mm[row][cc] = v
                            mm[col][rr] = v
                        else:
                            mm[row][cc + 1] = "100"
                            mm[col][rr + 1] = "100"
            else:
                for row in range(d - 1):
                    for col in range(row + 1, d):
                        if mm[row][col] == "0":
                            v = str(random.random() / 10000.0)
                            mm[row][col] = v
                            mm[col][row] = v

            matrix = ["%i\n" % d]
            for row in range(d):
                matrix.append(ids[row] + "    " + "    ".join(mm[row]) + "\n")

        phylip.setMatrix(matrix)

        phylip_options = []

        if options.filename_tree:
            nexus = TreeTools.Newick2Nexus(open(options.filename_tree, "r"))
            ref_tree = nexus.trees[0]
            phylip.setTree(ref_tree)
            phylip_options.append("U")
        else:
            ref_tree = None

        if options.method == "nj":
            phylip.setProgram("neighbor")

        elif options.method == "fitch":
            phylip.setProgram("fitch")

        elif options.method == "kitsch":
            phylip.setProgram("kitsch")

        if options.replicates:
            phylip_options.append("S")

        if options.power > 0:
            phylip_options.append("P")
            phylip_options.append("%f" % options.power)

        phylip_options.append("Y")

        phylip.setOptions(phylip_options)

        result = phylip.run()

        # root with outgroup
        if options.root:
            if options.outgroup:
                pass
            # midpoint root
            else:
                for tree in result.mNexus.trees:
                    tree.root_midpoint()

        # explicitely unroot
        elif options.unroot:
            phylip.setOptions(("Y", "W", "U", "Q"))
            phylip.setProgram("retree")
            for x in range(len(result.mNexus.trees)):
                phylip.setTree(result.mNexus.trees[x])
                xresult = phylip.run()
                result.mNexus.trees[x] = xresult.mNexus.trees[0]

        if options.write_separators:
            options.stdout.write(lines[chunks[x]])

        if result.mNexus:
            options.stdout.write(TreeTools.Nexus2Newick(result.mNexus) + "\n")

        if options.loglevel >= 1:
            if ref_tree:
                nref = len(ref_tree.get_terminals())
            else:
                nref = 0
            for tree in result.mNexus.trees:
                options.stdlog.write(
                    "# ninput=%i, nreference=%i, noutput=%i\n" %
                    (len(matrix) - 1, nref, len(tree.get_terminals())))

    E.Stop()
Exemple #5
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2rates.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal", "stockholm",
                               "phylip"),
                      help="input format of multiple alignment")

    parser.add_option(
        "-s",
        "--sites",
        dest="sites",
        type="string",
        help="sites to use [default=%default].",
    )

    parser.add_option(
        "-f",
        "--file",
        dest="filename",
        type="string",
        help="filename of multiple alignment (- for stdin) [default=%default].",
        metavar="FILE")

    parser.add_option("-o",
                      "--format",
                      dest="format",
                      type="string",
                      help="format [default=%default].",
                      metavar="format")

    parser.add_option(
        "-d",
        "--distance",
        dest="distance",
        type="choice",
        choices=("PID", "T92", "JC69", "POVL", "F84", "LogDet", "K80", "F81",
                 "HKY85", "TN93", "REV", "UNREST", "REVU", "UNRESTU", "JTT",
                 "PMB", "PAM", "Kimura", "CategoriesModel"),
        help="method to use for distance calculation [default=%default].")

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("phylip", "baseml", "own", "xrate"),
                      help="program to use for rate calculation.")

    parser.add_option("--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("list", "tree"),
                      help="output format.")

    parser.add_option(
        "-m",
        "--min-sites",
        dest="min_sites",
        type="int",
        help="minimum number of sites for output[default=%default].",
    )

    parser.add_option(
        "-a",
        "--alphabet",
        dest="alphabet",
        type="choice",
        choices=("aa", "na", "auto"),
        help="alphabet to use.",
    )

    parser.add_option("-t",
                      "--filename-tree",
                      dest="filename_tree",
                      type="string",
                      help="filename with tree information.")

    parser.add_option("--set-alpha",
                      dest="alpha",
                      type="float",
                      help="initial alpha value.")

    parser.add_option("--fix-alpha",
                      dest="fix_alpha",
                      action="store_true",
                      help="do not estimate alpha.")

    parser.add_option("--set-kappa",
                      dest="kappa",
                      type="float",
                      help="initial kappa value.")

    parser.add_option("--fix-kappa",
                      dest="fix_kappa",
                      action="store_true",
                      help="do not estimate kappa.")

    parser.add_option("--dump",
                      dest="dump",
                      action="store_true",
                      help="dump output.")

    parser.add_option("--test",
                      dest="test",
                      action="store_true",
                      help="test run - does not clean up.")

    parser.add_option("--pairwise",
                      dest="pairwise",
                      action="store_true",
                      help="force pairwise comparison.")

    parser.add_option(
        "--set-clean-data",
        dest="clean_data",
        type="choice",
        choices=("0", "1"),
        help=
        "PAML should cleanup data:  0=only gaps within pair are removed, 1=columns in the mali with gaps are removed."
    )

    parser.add_option(
        "--with-counts",
        dest="with_counts",
        action="store_true",
        help=
        "output counts of aligned positions, transitions and transversions.")

    parser.add_option("-w",
                      "--write",
                      dest="write",
                      type="choice",
                      action="append",
                      choices=("input", "trained", "all"),
                      help="output sections to write for xrate.")

    parser.add_option("--output-pattern",
                      dest="output_pattern",
                      type="string",
                      help="output pattern for output files.")

    parser.add_option("--xrate-min-increment",
                      dest="xrate_min_increment",
                      type=float,
                      help="minimum increment to stop iteration in xrate.")

    parser.set_defaults(
        input_format="fasta",
        filename_tree=None,
        with_counts=False,
        sites="d4",
        distance="T92",
        min_sites=1,
        filename="-",
        alphabet="auto",
        format="%6.4f",
        method="phylip",
        kappa=None,
        fix_kappa=False,
        alpha=None,
        fix_alpha=False,
        dump=False,
        clean_data=None,
        output_format="list",
        iteration="all-vs-all",
        pairwise=False,
        report_step=1000,
        output_pattern="%s.eg",
        write=[],
        test_xrate=False,
        xrate_min_increment=None,
        is_codons=False,
    )

    (options, args) = E.Start(parser)

    if options.filename != "-":
        infile = open(options.filename, "r")
    else:
        infile = sys.stdin

    # read multiple alignment
    if options.pairwise:
        # read sequences, but not as a multiple alignment. This permits
        # multiple names.
        mali = Mali.SequenceCollection()
        options.iteration = "pairwise"
    else:
        mali = Mali.Mali()

    mali.readFromFile(infile, format=options.input_format)

    ids = mali.getIdentifiers()

    if options.alphabet == "auto":
        s = "".join(map(lambda x: x.mString, mali.values())).lower()
        ss = re.sub("[acgtxn]", "", s)
        if float(len(ss)) < (len(s) * 0.1):
            options.alphabet = "na"
            if mali.getNumColumns() % 3 == 0:
                options.is_codons = True
        else:
            options.alphabet = "aa"

        if options.loglevel >= 1:
            options.stdlog.write("# autodetected alphabet: %s\n" %
                                 options.alphabet)

    if options.filename != "-":
        infile.close()

    npairs = 0
    nskipped_length = 0
    nskipped_distance = 0

    pairs = []
    if options.iteration == "all-vs-all":
        for x in range(len(ids) - 1):
            for y in range(x + 1, len(ids)):
                pairs.append((x, y))
    elif options.iteration == "first-vs-all":
        for y in range(1, len(ids)):
            pairs.append((0, y))
    elif options.iteration == "pairwise":
        if len(ids) % 2 != 0:
            raise "uneven number of sequences (%i) not compatible with --iteration=pairwise" % len(
                ids)
        for x in range(0, len(ids), 2):
            pairs.append((x, x + 1))

    if options.alphabet == "na":

        if options.method == "baseml":
            runBaseML(mali, pairs, options)
        elif options.method == "phylip" and options.distance in ("F84", "K80",
                                                                 "JC69",
                                                                 "LogDet"):
            runDNADIST(mali, pairs, options)
        elif options.method == "xrate":
            runXrate(mali, pairs, options)
        else:
            if options.is_codons:
                h = Genomics.SequencePairInfoCodons().getHeader()
            else:
                h = Genomics.SequencePairInfo().getHeader()
            options.stdout.write("seq1\tseq2\tdist\tvar\t%s\n" % (h))

            for x, y in pairs:
                id_x = ids[x]
                npairs += 1

                id_y = ids[y]

                info = Genomics.CalculatePairIndices(
                    mali[id_x], mali[id_y], with_codons=options.is_codons)

                if options.distance in ("T92", "JC69"):
                    if options.sites == "d4":
                        seq1, seq2 = Genomics.GetDegenerateSites(mali[id_x],
                                                                 mali[id_y],
                                                                 position=3,
                                                                 degeneracy=4)

                        if len(seq1) < options.min_sites:
                            nskipped_length += 1
                            continue
                    else:
                        raise "unknown sites %s" % options.sites

                if options.distance == "T92":
                    distance, variance = CalculateDistanceT92(info)
                elif options.distance == "JC69":
                    distance, variance = CalculateDistanceJC69(info)
                elif options.distance == "PID":
                    distance, variance = CalculateDistancePID(
                        mali[id_x], mali[id_y])
                elif options.distance == "POVL":
                    distance, variance = CalculateDistancePOVL(
                        mali[id_x], mali[id_y])

                if distance >= 0:
                    options.stdout.write("\t".join(
                        map(str, (id_x, id_y, options.format % distance,
                                  options.format % variance, info))) + "\n")
                else:
                    nskipped_distance += 1

    elif options.alphabet == "aa":

        if options.distance in ("JTT", "PMB", "PAM", "Kimura",
                                "CategoriesModel"):

            # use phylip for these
            phylip = WrapperPhylip.Phylip()
            phylip.setProgram("protdist")
            phylip.setMali(mali)

            phylip_options = []
            if options.distance == "PMG":
                phylip_options += ["D"] * 1
            elif options.distance == "PAM":
                phylip_options += ["D"] * 2
            elif options.distance == "Kimura":
                phylip_options += ["D"] * 3
            elif options.distance == "CategoriesModel":
                phylip_options += ["D"] * 4

            phylip_options.append("Y")
            phylip.setOptions(phylip_options)
            result = phylip.run()

            writePhylipResult(result, options)

        else:
            options.stdout.write("id1\tid2\tdist\tvar\n")

            # iterate over all pairs of sequences
            for x, y in pairs:
                id_x = ids[x]
                npairs += 1

                id_y = ids[y]

                if options.distance == "PID":
                    distance, variance = CalculateDistancePID(
                        mali[id_x], mali[id_y])
                elif options.distance == "POVL":
                    # percentage overlap
                    distance, variance = CalculateDistancePOVL(
                        mali[id_x], mali[id_y])

                if distance >= 0:
                    options.stdout.write("\t".join(
                        (id_x, id_y, options.format % distance,
                         options.format % variance)) + "\n")
                else:
                    nskipped_distance += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# nseqs=%i, npairs=%i, nskipped_length=%i, nskipped_distance=%i\n"
            % (len(ids), npairs, nskipped_length, nskipped_distance))

    E.Stop()
Exemple #6
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: data2phylocontrasts.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-c",
                      "--columns",
                      dest="columns",
                      type="string",
                      help="columns to take for calculating histograms.")
    parser.add_option("-t",
                      "--filename-tree",
                      dest="filename_tree",
                      type="string",
                      help="filename with tree(s).")
    parser.add_option("--skip-header",
                      dest="add_header",
                      action="store_false",
                      help="do not add header to flat format.")
    parser.add_option("--write-header",
                      dest="write_header",
                      action="store_true",
                      help="write header and exit.")
    parser.add_option("--debug",
                      dest="debug",
                      action="store_true",
                      help="debug mode")
    parser.add_option("--display-tree",
                      dest="display_tree",
                      action="store_true",
                      help="display the tree")

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("contrasts", "spearman", "pearson", "compute"),
                      help="methods to perform on contrasts.")

    parser.set_defaults(
        columns="all",
        filename_tree=None,
        add_header=True,
        write_header=False,
        debug=False,
        methods=[],
        value_format="%6.4f",
        pvalue_format="%e",
        display_tree=False,
    )

    (options, args) = E.Start(parser, quiet=True)

    if options.columns not in ("all", "all-but-first"):
        options.columns = map(lambda x: int(x) - 1, options.columns.split(","))

    phylip = WrapperPhylip.Phylip()

    if options.debug:
        phylip.setLogLevel(options.loglevel)

    phylip.setProgram("contrast")

    ##########################################################
    ##########################################################
    ##########################################################
    # retrieve data and give to phylip
    data = []
    headers = []
    first = True
    for line in sys.stdin:
        if line[0] == "#":
            continue
        d = line[:-1].strip().split("\t")
        if first:
            first = False
            headers = d[1:]
            continue
        data.append(d)

    phylip.setData(data)
    ncolumns = len(headers)
    nrows = len(data)

    ##########################################################
    ##########################################################
    ##########################################################
    # read trees
    nexus = None
    if options.filename_tree:
        nexus = TreeTools.Newick2Nexus(open(options.filename_tree, "r"))

    if not nexus:
        raise ValueError("please provide trees with branchlenghts")

    ##########################################################
    ##########################################################
    ##########################################################
    # set up phylip
    phylip_options = []
    # print out contrasts
    phylip_options.append("C")
    phylip_options.append("Y")
    phylip.setOptions(phylip_options)

    ##########################################################
    ##########################################################
    ##########################################################
    # main loop
    ##########################################################
    for tree in nexus.trees:

        if options.display_tree:
            tree.display()

        # compute this before giving the tree to the phylip module,
        # as it remaps taxon names.
        map_node2data = {}
        for x in range(nrows):
            taxon = data[x][0]
            map_node2data[tree.search_taxon(taxon)] = x

        phylip.setTree(tree)

        result = phylip.run()

        for method in options.methods:

            if method in ("pearson", "spearman"):

                options.stdout.write("header1\theader2\tr\tp\tcode\n")

                n = len(result.mContrasts)
                columns = []
                for c in range(ncolumns):
                    columns.append(map(lambda x: x[c], result.mContrasts))

                for x in range(0, ncolumns - 1):
                    for y in range(x + 1, ncolumns):

                        # phylip value
                        phy_r = result.mCorrelations[x][y]

                        import rpy
                        from rpy import r as R

                        # Various ways to calculate r. It is not possible to use
                        # cor.test or lsfit directly, as you have to perform a
                        # regression through the origin.

                        # uncomment to check pearson r against phylip's value
                        ## r = calculateCorrelationCoefficient( columns[x], columns[y] )

                        # for significance, use linear regression models in R
                        rpy.set_default_mode(rpy.NO_CONVERSION)
                        linear_model = R.lm(R("y ~ x - 1"),
                                            data=R.data_frame(x=columns[x],
                                                              y=columns[y]))
                        rpy.set_default_mode(rpy.BASIC_CONVERSION)

                        ss = R.summary(linear_model)

                        # extract the p-value
                        p = ss['coefficients'][-1][-1]

                        if p < 0.001:
                            code = "***"
                        elif p < 0.01:
                            code = "**"
                        elif p < 0.05:
                            code = "*"
                        else:
                            code = ""

                        options.stdout.write("\t".join(
                            (headers[x], headers[y], options.value_format %
                             phy_r, options.pvalue_format % p, code)) + "\n")

            elif method == "contrasts":

                options.stdout.write("\t".join(headers) + "\n")
                for d in result.mContrasts:
                    options.stdout.write(
                        "\t".join(map(lambda x: options.value_format % x, d)) +
                        "\n ")

            elif method == "compute":

                # make room for all internal nodes and one dummy node
                # for unrooted trees.
                max_index = TreeTools.GetMaxIndex(tree) + 2
                variances = [None] * max_index
                values = [[None] * nrows for x in range(max_index)]
                contrasts = []
                for x in range(max_index):
                    contrasts.append([None] * ncolumns)
                branchlengths = [None] * max_index

                def update_data(
                    node_id,
                    bl,
                    c1,
                    c2,
                ):

                    b1, b2 = branchlengths[c1], branchlengths[c2]
                    rb1 = 1.0 / b1
                    rb2 = 1.0 / b2
                    # compute variance
                    variance = math.sqrt(b1 + b2)

                    # extend branch length of this node to create correct
                    # variance for parent
                    branchlengths[node_id] = bl + (b1 * b2) / (b1 + b2)
                    variances[node_id] = variance

                    for c in range(ncolumns):
                        v1, v2 = values[c1][c], values[c2][c]
                        # save ancestral value as weighted mean
                        values[node_id][c] = (
                            (rb1 * v1 + rb2 * v2)) / (rb1 + rb2)
                        # compute normalized contrast
                        contrasts[node_id][c] = (v1 - v2) / variance

                def update_contrasts(node_id):
                    """update contrasts for a node."""
                    node = tree.node(node_id)
                    if node.succ:
                        if len(node.succ) == 2:
                            c1, c2 = node.succ
                            update_data(node_id, node.data.branchlength, c1,
                                        c2)
                        else:
                            assert (node_id == tree.root)
                            assert (len(node.succ) == 3)
                            update_data(node_id, node.data.branchlength,
                                        node.succ[0], node.succ[1])
                            update_data(max_index - 1, node.data.branchlength,
                                        node_id, node.succ[2])
                    else:
                        for c in range(ncolumns):
                            values[node_id][c] = float(
                                data[map_node2data[node_id]][c + 1])

                        branchlengths[node_id] = node.data.branchlength

                tree.dfs(tree.root, post_function=update_contrasts)

                options.stdout.write("node_id\tvariance\t%s\n" %
                                     "\t".join(headers))
                for node_id in range(max_index):
                    if variances[node_id] is None:
                        continue
                    options.stdout.write("%s\t%s\t%s\n" % (
                        node_id,
                        options.value_format % variances[node_id],
                        "\t".join(
                            map(lambda x: options.value_format % x,
                                contrasts[node_id])),
                    ))

    E.Stop()