Example #1
0
def filterTree(tree, options, map_id2location=None):
    """Prune taxa located on "junk" contigs from *tree* (in place).

    Filtering only happens if ``options.remove_unplaced`` is set. A taxon
    is removed when the lower-cased ``mShortName`` of its location is found
    in ``MAP_CONTIG2JUNK``. Taxa without an entry in *map_id2location* are
    kept and reported as a warning (``options.loglevel >= 1``).

    Arguments
    ---------
    tree :
        tree accepted by ``TreeTools.GetTaxa`` and ``TreeTools.PruneTree``;
        its ``name`` attribute is used in log messages.
    options :
        object providing ``remove_unplaced``, ``loglevel`` and ``stdlog``.
    map_id2location : dict, optional
        maps taxon identifiers to location objects exposing ``mShortName``.
        ``None`` is treated like an empty map.

    Nothing is returned; a summary line is written to ``options.stdlog``.

    NOTE(review): this function contains no outgroup handling. If
    outgroups must survive the filter, that has to be ensured elsewhere.
    """

    otus = TreeTools.GetTaxa(tree)

    # The parameter defaults to None, which does not support ``in``.
    # Treat it as "no locations known" instead of failing with TypeError.
    if map_id2location is None:
        map_id2location = {}

    to_remove = set()
    if options.remove_unplaced:
        for otu in otus:
            if otu not in map_id2location:
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# WARNING: unknown location for id %s.\n" % otu)
                continue

            if map_id2location[otu].mShortName.lower() in MAP_CONTIG2JUNK:
                to_remove.add(otu)

        if options.loglevel >= 3:
            # sorted: keep the log output reproducible between runs
            options.stdlog.write("# tree %s: removing %i entries because of location: %s\n" %
                                 (tree.name, len(to_remove), ";".join(sorted(to_remove))))

    new_otus = list(set(otus).difference(to_remove))

    # only touch the tree if the set of taxa actually changed
    if len(new_otus) != len(otus):
        TreeTools.PruneTree(tree, new_otus, keep_distance_to_root=True)

    if options.loglevel >= 1:
        options.stdlog.write("# tree %s: filtering: before=%i, remove=%i, after=%i, final=%i\n" %
                             (tree.name, len(otus), len(to_remove), len(new_otus), len(TreeTools.GetTaxa(tree))))
        options.stdlog.flush()
Example #2
0
def _readClusters(infile, input_format):
    """Read cluster memberships from *infile* and return them as a dict.

    Supported values of *input_format*:

    ``map``
        tab-separated lines ``member<TAB>cluster``; the result maps each
        cluster to the list of its members.
    ``trees``
        newick trees parsed with ``TreeTools.Newick2Nexus``; the result
        maps each tree name to the taxa of the tree.
    ``links``
        a line starting with ``>`` opens a new cluster (``>cluster #N``
        yields the name ``N``, otherwise the rest of the line is the
        name); the first two tab-separated columns of every following
        line are added to the cluster's member set.

    Lines starting with ``#`` are skipped in the ``map`` and ``links``
    formats.
    """
    clusters = {}

    if input_format == "map":
        for line in infile:
            if line[0] == "#":
                continue
            member, cluster = line[:-1].split("\t")
            clusters.setdefault(cluster, []).append(member)

    elif input_format == "trees":
        nexus = TreeTools.Newick2Nexus(infile)
        for tree in nexus.trees:
            clusters[tree.name] = tree.get_taxa()

    elif input_format == "links":
        members = set()
        cluster = None
        for line in infile:
            if line[0] == "#":
                continue

            if line[0] == ">":
                # a new header closes the previous cluster (if it had a name)
                if cluster:
                    clusters[cluster] = members
                match = re.match(r">cluster #(\d+)", line[:-1])
                if match:
                    cluster = match.groups()[0]
                else:
                    cluster = line[1:-1]
                members = set()
                continue

            data = line[:-1].split("\t")[:2]
            members.add(data[0])
            members.add(data[1])

        if cluster:
            clusters[cluster] = members

    return clusters


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Clusters are read from stdin (see ``--format``). For every cluster one
    row with gene and transcript counts per organism is written to
    ``options.stdout``, followed by a summary of presence/absence patterns
    and a per-species summary (written to ``--filename-patterns`` /
    ``--filename-summary`` or to ``sys.stdout``).

    Raises ValueError if the input contains no clusters, if a member
    identifier does not consist of 2 or 4 fields, or if a member belongs
    to a species that has no output column.

    NOTE(review): *argv* is not forwarded to ``E.Start`` in this function.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/count_orgs.py 1706 2007-12-11 16:46:11Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--reference-tree",
                      dest="reference_tree",
                      type="string",
                      help="reference tree to read.")

    parser.add_option("-p",
                      "--filename-patterns",
                      dest="filename_patterns",
                      type="string",
                      help="filename with patterns to output.")

    parser.add_option("-u",
                      "--filename-summary",
                      dest="filename_summary",
                      type="string",
                      help="filename with summary to output.")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="choice",
                      choices=("map", "links", "trees"),
                      help="output format.")

    parser.add_option("-o",
                      "--organisms",
                      dest="column2org",
                      type="string",
                      help="sorted list of organisms.")

    parser.add_option(
        "-s",
        "--species-regex",
        dest="species_regex",
        type="string",
        help="regular expression to extract species from identifier.")

    parser.add_option(
        "-g",
        "--gene-regex",
        dest="gene_regex",
        type="string",
        help="regular expression to extract gene from identifier.")

    parser.set_defaults(
        reference_tree=None,
        format="map",
        filename_patterns=None,
        column2org=None,
        species_regex=r"^([^|]+)\|",
        gene_regex=r"^[^|]+\|[^|]+\|([^|]+)\|",
        separator="|",
        filename_summary=None,
    )

    (options, args) = E.Start(parser)

    ########################################################################
    ## read reference tree: either a literal newick string or a filename
    ########################################################################
    reference_tree = None
    if options.reference_tree:
        if options.reference_tree[0] == "(":
            nexus = TreeTools.Newick2Nexus(options.reference_tree)
        else:
            # the parsed trees are indexed right away, so the file is
            # assumed to be fully consumed by Newick2Nexus
            with open(options.reference_tree, "r") as infile:
                nexus = TreeTools.Newick2Nexus(infile)
        reference_tree = nexus.trees[0]

        if options.loglevel >= 3:
            print("# reference tree:")
            print(reference_tree.display())

    clusters = _readClusters(sys.stdin, options.format)

    if not clusters:
        raise ValueError("empty input.")

    ########################################################################
    ## sort out reference tree
    ########################################################################
    rs = re.compile(options.species_regex)

    def extract_species(identifier):
        return rs.search(identifier).groups()[0]

    ## prune tree to species present
    species_set = set()
    for members in clusters.values():
        species_set.update(extract_species(m) for m in members)

    if reference_tree:

        TreeTools.PruneTree(reference_tree, species_set)

        if options.loglevel >= 1:
            options.stdlog.write("# Tree after pruning: %i taxa.\n" %
                                 len(reference_tree.get_taxa()))

    ## column order: explicit list > order of the reference tree > species seen
    if options.column2org:
        options.column2org = options.column2org.split(",")
    elif reference_tree:
        options.column2org = [
            reference_tree.node(nx).get_data().taxon
            for nx in reference_tree.get_terminals()
        ]
    else:
        options.column2org = list(species_set)

    options.org2column = dict(
        (org, col) for col, org in enumerate(options.column2org))

    if reference_tree:
        reference_patterns = TreeTools.calculatePatternsFromTree(
            reference_tree, options.column2org)

        if options.loglevel >= 3:
            print("# reference patterns:")
            print(reference_patterns)

    ##############################################################################
    notus = len(options.column2org)
    patterns = {}
    species_counts = [SpeciesCounts() for x in options.column2org]

    ## first genes, then transcripts
    options.stdout.write(
        "mali\tpattern\tpresent\tngenes\t%s\tntranscripts\t%s\n" %
        ("\t".join(options.column2org), "\t".join(options.column2org)))

    for cluster in sorted(clusters):
        members = clusters[cluster]

        # one slot per output column, so rows always line up with the header
        count_genes = [{} for x in range(notus)]
        count_transcripts = [0] * notus

        for m in members:
            data = m.split(options.separator)

            if len(data) == 4:
                s, t, g = data[:3]
            elif len(data) == 2:
                s, g = data
                t = g
            else:
                # do not silently reuse the fields of the previous member
                raise ValueError(
                    "can not parse identifier %s: expected 2 or 4 fields separated by '%s'"
                    % (m, options.separator))

            if s not in options.org2column:
                raise ValueError("unknown species %s" % s)

            col = options.org2column[s]

            count_transcripts[col] += 1
            count_genes[col][g] = count_genes[col].get(g, 0) + 1

            species_counts[col].mGenes.add(g)
            species_counts[col].mTranscripts.add(t)
            species_counts[col].mTrees.add(cluster)

        genes_per_species = [len(genes) for genes in count_genes]
        ntotal_transcripts = sum(count_transcripts)
        ntotal_genes = sum(genes_per_species)
        npresent_genes = len([n for n in genes_per_species if n > 0])

        pattern = GetPattern(count_transcripts, notus)
        patterns[pattern] = patterns.get(pattern, 0) + 1

        options.stdout.write("\t".join(
            (cluster, pattern, str(npresent_genes), str(ntotal_genes),
             "\t".join(map(str, genes_per_species)),
             str(ntotal_transcripts),
             "\t".join(map(str, count_transcripts)))) + "\n")

    #######################################################################################
    ## write pattern summary
    #######################################################################################
    if options.filename_patterns:
        outfile = open(options.filename_patterns, "w")
    else:
        outfile = sys.stdout

    try:
        for col, org in enumerate(options.column2org):
            outfile.write("# %i = %s\n" % (col, org))

        if reference_tree:
            outfile.write("pattern\tcounts\tisok\n")
        else:
            outfile.write("pattern\tcounts\n")

        for pattern in sorted(patterns):
            if reference_tree:
                # isok: pattern is compatible with the reference tree
                if pattern in reference_patterns:
                    is_ok = "1"
                else:
                    is_ok = "0"
                outfile.write("%s\t%s\t%s\n" %
                              (pattern, patterns[pattern], is_ok))
            else:
                outfile.write("%s\t%s\n" % (pattern, patterns[pattern]))
    finally:
        if outfile is not sys.stdout:
            outfile.close()

    #######################################################################################
    ## write summary counts per species
    #######################################################################################
    if options.filename_summary:
        outfile = open(options.filename_summary, "w")
    else:
        outfile = sys.stdout

    try:
        outfile.write("species\tntranscripts\tngenes\tntrees\n")

        for species, col in options.org2column.items():
            counts = species_counts[col]
            outfile.write("%s\t%i\t%i\t%i\n" %
                          (species, len(counts.mTranscripts),
                           len(counts.mGenes), len(counts.mTrees)))
    finally:
        if outfile is not sys.stdout:
            outfile.close()

    E.Stop()
Example #3
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads newick trees from stdin, prunes the reference tree to the
    species found in those trees and calls ``writeOrthologSets`` once for
    every requested ``--method`` (default: ``strict``).

    Raises ValueError if method ``outgroup`` is requested without
    ``--outgroups``, if no reference tree is supplied, or if a species can
    not be extracted from the taxa of an input tree.

    NOTE(review): *argv* is not forwarded to ``E.Start`` in this function.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: trees2sets.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--reference-tree",
                      dest="reference_tree",
                      type="string",
                      help="reference tree to read.")

    parser.add_option("-e",
                      "--enumeration",
                      dest="enumeration",
                      type="choice",
                      choices=("monophyletic", "full", "pairwise",
                               "exhaustive", "explicit", "lineage"),
                      help="enumeration of ortholog groups.")

    parser.add_option("-o",
                      "--organisms",
                      dest="column2org",
                      type="string",
                      help="sorted list of organisms.")

    parser.add_option("-p",
                      "--filename-patterns",
                      dest="filename_patterns",
                      type="string",
                      help="filename with patterns to output.")

    parser.add_option("-u",
                      "--filename-summary",
                      dest="filename_summary",
                      type="string",
                      help="filename with summary to output.")

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("strict", "degenerate", "any", "outgroup",
                               "lineage"),
                      help="sets to extract.")

    parser.add_option("-s",
                      "--species-set",
                      dest="species_set",
                      type="string",
                      help="comma separated list of species.")

    parser.add_option("-g",
                      "--outgroups",
                      dest="outgroups",
                      type="string",
                      help="comma separated list of outgroup species.")

    parser.add_option(
        "--species-regex",
        dest="species_regex",
        type="string",
        help="regular expression to extract species from identifier.")

    parser.add_option(
        "--gene-regex",
        dest="gene_regex",
        type="string",
        help="regular expression to extract gene from identifier.")

    parser.add_option("--reroot",
                      dest="reroot",
                      type="choice",
                      choices=("outgroup", "midpoint"),
                      help="reroot trees before computing sets.")

    parser.set_defaults(
        reference_tree=None,
        enumeration="full",
        column2org=None,
        separator="|",
        species_regex=r"^([^|]+)\|",
        gene_regex=r"^[^|]+\|[^|]+\|([^|]+)\|",
        filename_summary=None,
        methods=[],
        species_set=None,
        outgroups=None,
        reroot=None,
    )

    (options, args) = E.Start(parser)

    if not options.methods:
        options.methods.append("strict")

    # an explicit species set implies explicit enumeration
    if options.species_set:
        options.species_set = options.species_set.split(",")
        options.enumeration = "explicit"

    #######################################################################
    # warning: outgroup method is useless, as it requires
    # only a single outgroup per tree and the tree rooted
    # with the outgroup.
    if "outgroup" in options.methods and not options.outgroups:
        raise ValueError(
            "please supply --outgroups if method 'outgroup' is chosen.")

    if options.outgroups:
        options.outgroups = options.outgroups.split(",")

    ########################################################################
    # read reference tree: either a literal newick string or a filename
    ########################################################################
    if not options.reference_tree:
        raise ValueError("please supply a reference tree")

    if options.reference_tree[0] == "(":
        nexus = TreeTools.Newick2Nexus(options.reference_tree)
    else:
        # the parsed trees are indexed right away, so the file is
        # assumed to be fully consumed by Newick2Nexus
        with open(options.reference_tree, "r") as infile:
            nexus = TreeTools.Newick2Nexus(infile)
    reference_tree = nexus.trees[0]

    if options.loglevel >= 3:
        options.stdlog.write("# reference tree:\n%s\n" %
                             reference_tree.display())

    ########################################################################
    # read all trees
    ########################################################################
    nexus = TreeTools.Newick2Nexus(sys.stdin)

    ########################################################################
    # sort out reference tree
    ########################################################################
    def extract_species(identifier):
        return parseIdentifier(identifier, options)[0]

    def extract_gene(identifier):
        return parseIdentifier(identifier, options)[2]

    # prune reference tree to species present
    species_set = set()
    for tree in nexus.trees:
        try:
            species_set.update(extract_species(t) for t in tree.get_taxa())
        except AttributeError:
            raise ValueError(
                "parsing error while extracting species from %s" % str(
                    tree.get_taxa()))

    TreeTools.PruneTree(reference_tree, species_set)

    if options.loglevel >= 1:
        options.stdlog.write("# reference tree after pruning has %i taxa.\n" %
                             len(reference_tree.get_taxa()))

    # column order: explicit list, otherwise order of the reference tree
    if options.column2org:
        options.column2org = options.column2org.split(",")
    else:
        options.column2org = [
            reference_tree.node(nx).get_data().taxon
            for nx in reference_tree.get_terminals()
        ]

    options.org2column = dict(
        (org, col) for col, org in enumerate(options.column2org))

    ###################################################################
    # print out a list of ortholog clusters, once per method
    ###################################################################
    for method in options.methods:
        writeOrthologSets(options.stdout,
                          nexus,
                          extract_species,
                          extract_gene,
                          options=options,
                          reference_tree=reference_tree,
                          method=method,
                          outgroups=options.outgroups)

    E.Stop()
Example #4
0
def Process(lines, other_trees, options, map_old2new, ntree):
    """Apply the methods in ``options.methods`` to a chunk of newick trees.

    Arguments
    ---------
    lines :
        newick formatted lines; the last character of every line (the
        line terminator) is stripped before parsing.
    other_trees :
        trees used by method ``divide-by-tree``. If more than one is given,
        tree number *ntree* is paired with the current tree, otherwise the
        single tree is used for all.
    options :
        option object. Read here: ``loglevel``, ``stdlog``, ``stdout``,
        ``outgroup``, ``methods``, ``parameters``, ``value``,
        ``invert_map``, ``template_identifier``, ``output_format`` and
        ``with_branchlengths``. Methods that need an argument consume it
        from the front of ``options.parameters``; method ``build-map``
        sets ``options.write_map``.
    map_old2new :
        map of old to new taxon names used by ``rename`` and ``build-map``.
        ``rename`` loads it from the file named in ``options.parameters[0]``
        if it is empty.
    ntree :
        running number of the first tree in this chunk.

    Returns
    -------
    tuple ``(ntotal, nskipped, ntree)``: number of trees read, number of
    skipped ``divide-by-tree`` applications and the updated running tree
    number.
    """

    nexus = TreeTools.Newick2Nexus([x[:-1] for x in lines])

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees.\n" % len(nexus.trees))

    nskipped = 0
    ntotal = len(nexus.trees)

    # per-call caches for arguments consumed from options.parameters
    extract_pattern = None
    species2remove = None
    phylip_executable = None
    phylip_options = None

    index = 0

    for tree in nexus.trees:

        if options.outgroup:
            tree.root_with_outgroup(options.outgroup)

        for method in options.methods:

            if options.loglevel >= 3:
                options.stdlog.write("# applying method %s to tree %i.\n" %
                                     (method, index))

            if method == "midpoint-root":
                tree.root_midpoint()

            elif method == "balanced-root":
                tree.root_balanced()

            elif method == "unroot":
                TreeTools.Unroot(tree)

            elif method == "phylip":
                # the wrapper is configured once from the first two parameters
                if not phylip_executable:
                    phylip_executable = options.parameters[0]
                    del options.parameters[0]
                    phylip_options = re.split("@", options.parameters[0])
                    del options.parameters[0]

                    phylip = WrapperPhylip.Phylip()
                    phylip.setProgram(phylip_executable)
                    phylip.setOptions(phylip_options)

                phylip.setTree(tree)

                result = phylip.run()

                # continue with the replacement, so that methods following
                # "phylip" act on the tree that is actually written out
                tree = result.mNexus.trees[0]
                nexus.trees[index] = tree

            elif method == "normalize":
                # a value of 0 means: normalize by the longest branch
                if options.value == 0:
                    v = 0
                    for n in tree.chain.keys():
                        v = max(v, tree.node(n).data.branchlength)
                else:
                    v = options.value

                if v == 0:
                    options.stdlog.write(
                        "# Warning: tree %i: all branch lengths are zero - not normalized\n"
                        % index)
                else:
                    for n in tree.chain.keys():
                        tree.node(n).data.branchlength /= float(v)

            elif method == "divide-by-tree":

                if len(other_trees) > 1:
                    other_tree = other_trees[ntree]
                else:
                    other_tree = other_trees[0]

                # the trees have to be exactly the same!!
                if options.loglevel >= 2:
                    print(tree.display())
                    print(other_tree.display())

                if not tree.is_identical(other_tree):
                    nskipped += 1
                    continue

                # even if the trees are the same (in topology), the node numbering might not be
                # the same. Thus build a map of node ids.
                map_a2b = TreeTools.GetNodeMap(tree, other_tree)

                for n in tree.chain.keys():
                    try:
                        tree.node(n).data.branchlength /= float(
                            other_tree.node(map_a2b[n]).data.branchlength)
                    except ZeroDivisionError:
                        options.stdlog.write(
                            "# Warning: branch for nodes %i and %i in tree-pair %i: divide by zero\n"
                            % (n, map_a2b[n], ntree))
                        continue

            elif method == "rename":
                if not map_old2new:

                    map_old2new = IOTools.ReadMap(open(options.parameters[0],
                                                       "r"),
                                                  columns=(0, 1))

                    if options.invert_map:
                        map_old2new = IOTools.getInvertedDictionary(
                            map_old2new, make_unique=True)

                    del options.parameters[0]

                # taxa without a new name are pruned from the tree
                unknown = []
                for node in tree.chain.values():
                    if node.data.taxon:
                        try:
                            node.data.taxon = map_old2new[node.data.taxon]
                        except KeyError:
                            unknown.append(node.data.taxon)

                for taxon in unknown:
                    tree.prune(taxon)

            # reformat terminals
            elif method == "extract-with-pattern":

                if not extract_pattern:
                    extract_pattern = re.compile(options.parameters[0])
                    del options.parameters[0]

                for n in tree.get_terminals():
                    node = tree.node(n)
                    node.data.taxon = extract_pattern.search(
                        node.data.taxon).groups()[0]

            elif method == "set-uniform-branchlength":
                for n in tree.chain.keys():
                    tree.node(n).data.branchlength = options.value

            elif method == "build-map":
                # build a map of identifiers
                options.write_map = True
                for n in tree.get_terminals():
                    node = tree.node(n)
                    if node.data.taxon not in map_old2new:
                        new = options.template_identifier % (len(map_old2new) +
                                                             1)
                        map_old2new[node.data.taxon] = new
                    node.data.taxon = map_old2new[node.data.taxon]

            elif method == "remove-pattern":
                if species2remove is None:
                    species2remove = re.compile(options.parameters[0])
                    # consume only the pattern, not the whole parameter list
                    del options.parameters[0]

                # keep all terminals that do not match the pattern
                taxa = []
                for n in tree.get_terminals():
                    t = tree.node(n).data.taxon
                    if not species2remove.search(t):
                        taxa.append(t)
                TreeTools.PruneTree(tree, taxa)

            elif method == "add-node-names":

                inode = 0
                for node in tree.chain.values():
                    if not node.data.taxon:
                        node.data.taxon = "inode%i" % inode
                        inode += 1

            elif method == "newick2nhx":
                # convert names to species names
                for n in tree.get_terminals():
                    t = tree.node(n).data.taxon
                    d = t.split("|")
                    if len(d) >= 2:
                        tree.node(n).data.species = d[0]

        index += 1
        ntree += 1

    if options.output_format == "nh":
        # NOTE(review): write_all_taxa is always True here, independent of
        # whether "add-node-names" was applied - confirm this is intended.
        options.stdout.write(
            TreeTools.Nexus2Newick(
                nexus,
                write_all_taxa=True,
                with_branchlengths=options.with_branchlengths) + "\n")
    else:
        for tree in nexus.trees:
            tree.writeToFile(options.stdout, format=options.output_format)

    return ntotal, nskipped, ntree
Example #5
0
    def prepareRun(self):
        """Create a temporary directory and write the program's input files.

        The directory is created with ``tempfile.mkdtemp`` and stored in
        ``self.mTempdir``. Depending on which inputs are set, the following
        files are written into it:

        ``infile``
            from ``self.mInputMatrix`` (first line copied verbatim,
            identifiers of the other lines replaced via
            ``self.mMapInput2Phylip``), from ``self.mInputData`` or from
            ``self.mInputMali`` (written in "phylip" format).
        ``intree``
            from ``self.mInputTree`` or ``self.mInputTrees``. Trees with
            duplicate taxa after mapping are skipped;
            ``self.mNInputTrees`` holds the number of trees written.

        Identifiers are registered with ``self.updateMaps`` before they are
        looked up in ``self.mMapInput2Phylip``. Progress is printed if
        ``self.mLogLevel >= 1``.

        Raises ValueError if both matrix and data, or both mali and matrix
        are supplied, and UsageError if both a single tree and multiple
        trees are supplied.
        """

        self.__reset()

        # mkdtemp creates the directory, no need to check for its existence
        self.mTempdir = tempfile.mkdtemp()
        infile_path = self.mTempdir + "/infile"
        intree_path = self.mTempdir + "/intree"

        if self.mInputMatrix and self.mInputData:
            raise ValueError(
                "please specify either input matrix or input data, but not both."
            )

        # prepare input matrix. Should already be in phylip like
        # format, but long identifiers are shortened and tabs are
        # replaced by spaces.
        if self.mInputMatrix:

            rows = [
                re.split(r"\s+", line[:-1]) for line in self.mInputMatrix[1:]
            ]
            identifiers = [row[0] for row in rows]
            self.updateMaps(identifiers)

            with open(infile_path, "w") as outfile:
                outfile.write(self.mInputMatrix[0])
                for row in rows:
                    outfile.write(self.mMapInput2Phylip[row[0]] + "       " +
                                  "  ".join(row[1:]) + "\n")

            if self.mLogLevel >= 1:
                print("# written input matrix with %i taxa to %s" %
                      (len(identifiers), infile_path))
                os.system("cat %s" % infile_path)

        elif self.mInputData:

            identifiers = [row[0] for row in self.mInputData]
            self.updateMaps(identifiers)

            with open(infile_path, "w") as outfile:
                outfile.write(
                    "%i %i\n" %
                    (len(self.mInputData), len(self.mInputData[0]) - 1))
                for identifier, row in zip(identifiers, self.mInputData):
                    outfile.write("%-10s %s\n" %
                                  (self.mMapInput2Phylip[identifier],
                                   " ".join(row[1:])))

            if self.mLogLevel >= 1:
                print("# written input matrix with %i taxa to %s" %
                      (len(identifiers), infile_path))
                os.system("cat %s" % infile_path)

        # prepare input tree or trees
        self.mNInputTrees = 0
        if self.mInputTree or self.mInputTrees:

            # check before opening the output file, so no handle is leaked
            if self.mInputTree and self.mInputTrees:
                raise UsageError(
                    "please supply either one or mupltiple trees, but not both."
                )

            if self.mInputTree:
                trees = [self.mInputTree]
            else:
                trees = self.mInputTrees

            with open(intree_path, "w") as outfile:
                for tree in trees:
                    if self.mPruneTree:
                        # restrict the tree to taxa registered so far
                        TreeTools.PruneTree(
                            tree, list(self.mMapInput2Phylip.keys()))

                    self.updateMaps(TreeTools.GetTaxa(tree))
                    TreeTools.MapTaxa(tree, self.mMapInput2Phylip)

                    # check if taxa are unique
                    taxa = tree.get_taxa()
                    if len(set(taxa)) != len(taxa):
                        if self.mLogLevel >= 1:
                            print(
                                "# skipping tree %s because of duplicate taxa."
                                % (tree.name))
                        continue

                    newick = TreeTools.Tree2Newick(tree)
                    outfile.write(newick + "\n")
                    self.mNInputTrees += 1

                    if self.mLogLevel >= 1:
                        print("# written input tree with %i taxa to %s" %
                              (len(TreeTools.GetTaxa(tree)), intree_path))
                        print("# %s" % newick)

        # prepare input multiple alignment
        if self.mInputMali:

            # both would be written to the same file
            if self.mInputMatrix:
                raise ValueError(
                    "both mali and matrix supplied - infile conflict.")

            identifiers = self.mInputMali.getIdentifiers()
            self.updateMaps(identifiers)
            self.mInputMali.mapIdentifiers(self.mMapInput2Phylip)

            with open(infile_path, "w") as outfile:
                self.mInputMali.writeToFile(outfile, format="phylip")

            if self.mLogLevel >= 1:
                # the alignment goes to "infile", report that path
                print("# written input multiple alignments with %i taxa and with %i to %s" %
                      (self.mInputMali.getLength(),
                       self.mInputMali.getWidth(), infile_path))