Esempi in Python per TreeTools.GetSubsets

Linguaggio di programmazione: Python

Spazio dei nomi/nome del pacchetto: CGAT

Classe/tipologia: TreeTools

Metodo/funzione: GetSubsets

Esempi su hotexamples.com: 3

TreeTools.GetSubsets in Python: 3 esempi trovati. Questi sono i migliori esempi reali in Python per CGAT.TreeTools.GetSubsets, estratti da progetti open source. Li puoi valutare, per aiutarci a migliorare la qualità dei nostri esempi.

Metodi utilizzati di frequente

Mostra Nascondi

Newick2Nexus(30)

Tree2Newick(9)

GetTaxa(8)

PruneTree(5)

GetSize(5)

Nexus2Newick(5)

MapTaxa(4)

TreeDFS(3)

GetSubsets(3)

Newick2Tree(3)

calculatePatternsFromTree(2)

GetLeaves(2)

IsCompatible(2)

GetDistanceToRoot(1)

IsMonophyleticForTaxa(1)

GetNodeMap(1)

Tree2Graph(1)

GetMaxIndex(1)

Unroot(1)

GetAllNodes(1)

Esempio n. 1

Mostra file

File: plot_duplications.py Progetto: lesheng/cgat

def _iter_records(lines):
    """Yield ``(cluster_id, in_locations, in_tree)`` for every data line.

    Lines starting with ``#`` and empty lines are skipped. Only the first
    three tab-separated columns of a line are used.
    """
    for line in lines:
        # rstrip instead of line[:-1]: a final line without a trailing
        # newline must not lose its last character.
        line = line.rstrip("\n")
        if not line or line[0] == "#":
            continue
        cluster_id, in_locations, in_tree = line.split("\t")[:3]
        yield cluster_id, in_locations, in_tree


def _iter_locations(in_locations, map_contig2size):
    """Yield ``(gene_id, contig, strand, sbjct_from, sbjct_to)`` tuples.

    *in_locations* is a ``;``-separated list of
    ``gene_id:contig:strand:from:to`` entries. Entries whose contig is not
    a key of *map_contig2size* are dropped. The coordinates are yielded as
    the original strings; callers convert them when needed.
    """
    for location in in_locations.split(";"):
        gene_id, contig, strand, sbjct_from, sbjct_to = location.split(":")
        if contig in map_contig2size:
            yield gene_id, contig, strand, sbjct_from, sbjct_to


def main(argv=None):
    """script main.

    Reads duplication records from stdin (tab-separated: cluster id,
    locations, newick tree), lays them out on a ``DuplicationPlot`` and
    writes the plot to stdout.

    Raises ``ValueError`` if no file with contig sizes was supplied, as
    contigs cannot be laid out without their sizes.
    """

    # NOTE(review): argv is defaulted here but never handed to E.Start()
    # below -- confirm whether E.Start should receive it.
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/plot_duplications.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-e",
                      "--headers",
                      dest="headers",
                      action="store_true",
                      help="first row is a header [ignored].")
    parser.add_option("-t",
                      "--title",
                      dest="title",
                      type="string",
                      help="page title.")
    parser.add_option("-f",
                      "--footer",
                      dest="footer",
                      type="string",
                      help="page footer.")
    parser.add_option("-c",
                      "--contig-sizes",
                      dest="filename_contig_sizes",
                      type="string",
                      help="filname with contig sizes.")
    parser.add_option("-r",
                      "--radius",
                      dest="radius",
                      type="int",
                      help="radius.")
    parser.add_option("-i",
                      "--increment",
                      dest="radius_increment",
                      type="int",
                      help="radius increment.")
    parser.add_option("-u",
                      "--url",
                      dest="url",
                      type="string",
                      help="string to build url for annotation.")
    # NOTE(review): declared as "string" while the default below is an int;
    # the value is not used anywhere in this function.
    parser.add_option("--min-contig",
                      dest="min_contig_size",
                      type="string",
                      help="minimum contig size to delineate.")

    parser.add_option("--min-value",
                      dest="min_value",
                      type="float",
                      help="minimum branch length.")

    parser.add_option("--max-value",
                      dest="max_value",
                      type="float",
                      help="maximum branch length.")

    parser.set_defaults(
        filename_contig_sizes=None,
        headers=False,
        titles="",
        pattern_filename=None,
        title="",
        footer="",
        radius=3000,
        min_value=0.0,
        max_value=0.2,
        url=None,
        radius_increment=40,
        min_contig_size=10000,
        remove_empty_contigs=True,
        separator="|",
        quality2symbol={
            'CG': "circle",
            'PG': "circle",
            'SG': "circle"
        },
        quality2mask=("RG", "CP", "PP", "SP", "RP", "CF", "PF", "SF", "UG",
                      "UP", "UF", "BF", "UK"),
        sort_by_size=True,
        input_format="pairwise",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # Everything below needs the contig sizes; fail with a clear message
    # instead of a NameError further down.
    if not options.filename_contig_sizes:
        raise ValueError(
            "please supply a file with contig sizes (--contig-sizes)")

    with open(options.filename_contig_sizes, "r") as infile:
        map_contig2size = IOTools.ReadMap(infile, map_functions=(str, int))

    # read data and get contigs that are used (i.e.: remove empty contigs)
    lines = sys.stdin.readlines()

    if options.remove_empty_contigs:
        used_contigs = set()
        for cluster_id, in_locations, in_tree in _iter_records(lines):
            for location in _iter_locations(in_locations, map_contig2size):
                used_contigs.add(location[1])

        # iterate over a snapshot of the keys, as entries are deleted
        for contig in list(map_contig2size.keys()):
            if contig not in used_contigs:
                del map_contig2size[contig]

    # sorted by name first; the (stable) sort by size keeps the name order
    # among contigs of equal size.
    contigs = sorted(map_contig2size.keys())

    if not contigs:
        E.Stop()
        sys.exit(0)

    if options.sort_by_size:
        contigs.sort(key=lambda contig: map_contig2size[contig])

    plot = DuplicationPlot(contigs, map_contig2size, num_entries=0)

    plot.mRadiusIncrement = options.radius_increment
    plot.mRadius = options.radius
    plot.mMaxValue = options.max_value
    plot.mMinValue = options.min_value

    if options.title:
        plot.setTitle(options.title)
    if options.footer:
        plot.setFooter(options.footer)

    # first initialization: plot.getPosition() is called while collecting
    # the data below; the plot is initialized again once the number of
    # entries is known.
    plot.initializePlot()

    data = []

    if options.input_format == "pairwise":

        # read data from pairwise analysis
        # format is: cluster_id, locations of duplications, tree of
        # duplications

        for cluster_id, in_locations, in_tree in _iter_records(lines):

            # None (not 0) marks "no minimum yet", so that a position of
            # 0 is a valid minimum.
            mi, ma = None, 0
            n = 0
            contigs_seen = set()
            for gene_id, contig, strand, sbjct_from, sbjct_to in \
                    _iter_locations(in_locations, map_contig2size):
                contigs_seen.add(contig)
                sbjct_from, sbjct_to = int(sbjct_from), int(sbjct_to)

                xi = plot.getPosition(contig, strand, sbjct_from)
                xa = plot.getPosition(contig, strand, sbjct_to)

                mi = xi if mi is None else min(mi, xi)
                ma = max(ma, xa)
                n += 1

            # no location on a known contig
            if n == 0:
                continue

            # all duplicates are on the same contig
            cis = len(contigs_seen) == 1
            if options.loglevel >= 2:
                options.stdlog.write(
                    "# adding duplications in cluster %s: %s with tree %s\n" %
                    (cluster_id, in_locations, in_tree))
            data.append((cis, n, mi, ma, cluster_id, in_locations, in_tree))

    data.sort()

    plot.mNumEntries = len(data)
    plot.initializePlot()

    last_ndups = 0

    for cis, ndups, mi, ma, cluster_id, in_locations, in_tree in data:

        # start a new ring whenever the number of duplicates changes
        if ndups != last_ndups:
            plot.pushRadius()
            plot.addSeparator()

        last_ndups = ndups

        map_gene2location = {}
        for gene_id, contig, strand, sbjct_from, sbjct_to in \
                _iter_locations(in_locations, map_contig2size):
            map_gene2location[gene_id] = (contig, strand,
                                          int(sbjct_from), int(sbjct_to))

        if not map_gene2location:
            continue

        tree = TreeTools.Newick2Tree(in_tree)

        # the last subset is all nodes again.
        s = TreeTools.GetSubsets(tree)

        is_first = True
        for children, height, branchlength in s[:-1]:
            if len(children) == 1:
                continue
            # a list (not a lazy map object) so that it can be traversed
            # more than once by the plot.
            c = [child.split(options.separator) for child in children]
            plot.addDuplication(c,
                                map_gene2location,
                                height,
                                url=options.url,
                                with_separator=is_first,
                                link_to_previous=not is_first,
                                quality2symbol=options.quality2symbol,
                                quality2mask=options.quality2mask)
            is_first = False

    plot.writeToFile(sys.stdout)

    E.Stop()

Esempio n. 2

Mostra file

def writeOrthologSets(outfile,
                      nexus,
                      extract_species,
                      extract_gene,
                      options,
                      reference_tree=None,
                      method="strict",
                      outgroups=None):
    """output ortholog sets.

    A "strict" ortholog set contains exactly one gene for each species,
    while a "degenerate" ortholog set contains at least one gene for each
    species.

    The species sets to look for are enumerated according to
    ``options.enumeration`` ("monophyletic", "exhaustive", "pairwise",
    "full", "lineage" or "explicit"). For each tree in ``nexus.trees`` and
    each species set, one line per ortholog node is written to
    ``options.stdout``. A summary table follows; it is written to
    ``options.filename_summary`` if that is set and to ``options.stdout``
    otherwise.

    Arguments
    ---------
    outfile :
        Not read; the name is rebound to the summary destination.
    nexus :
        Object whose ``trees`` attribute is iterated.
    extract_species, extract_gene :
        Callables mapping a taxon name to its species / gene.
    options :
        Option object; reads ``column2org``, ``org2column``,
        ``enumeration``, ``species_set``, ``reroot``, ``loglevel``,
        ``stdout``, ``stdlog`` and ``filename_summary``.
    reference_tree :
        Species tree, required for "monophyletic" enumeration.
    method :
        Passed as ``selector`` to ``getOrthologNodes``. With "lineage",
        nodes sharing genes with a previously seen node are dropped.
    outgroups :
        Optional collection of species used as outgroups.

    Raises
    ------
    ValueError
        If the enumeration is "monophyletic" and no *reference_tree* is
        given.
    """

    ######################################################################
    # build species set to compare
    sets = []
    species = options.column2org
    nspecies = len(species)

    if options.enumeration == "monophyletic":
        if not reference_tree:
            # a real exception: raising a plain string is not valid
            raise ValueError(
                "please specify a species tree for monophyletic enumeration")
        for members, h1, h2 in TreeTools.GetSubsets(reference_tree):
            if len(members) > 1:
                sets.append(members)

    elif options.enumeration == "exhaustive":
        for size in range(2, len(species)):
            sets += list(SetTools.xuniqueCombinations(species, size))
        sets.append(species)

    elif options.enumeration == "pairwise":
        for x in range(len(species) - 1):
            for y in range(x + 1, len(species)):
                sets.append((species[x], species[y]))

    elif options.enumeration == "full":
        sets.append(species)

    elif options.enumeration == "lineage":
        for s in species:
            sets.append((s, ))

    elif options.enumeration == "explicit":
        for size in range(2, len(options.species_set)):
            sets += list(
                SetTools.xuniqueCombinations(options.species_set, size))
        sets.append(options.species_set)

    ######################################################################
    # build sets with positional information
    xsets = [frozenset(options.org2column[org] for org in species_set)
             for species_set in sets]

    ######################################################################
    # collect outgroups
    if outgroups:
        noutgroups = set(options.org2column[org] for org in outgroups)
    else:
        noutgroups = None

    ######################################################################
    # loop over each tree and set
    # I did not see a way to loop a tree once for all sets without doing
    # complicated counting. The problem is that counting has to be stopped
    # at different tree heights for different sets.
    ninput, noutput, nempty, nskipped = 0, 0, 0, 0

    counts = [0] * len(sets)

    options.stdout.write(
        "nspecies\tname\tid\tcluster\tpattern\t%s\tnode_id\tmembers\n" %
        "\t".join(species))

    cluster_id = 0
    nerrors = 0

    for tree in nexus.trees:

        ninput += 1
        ntotal_tree = 0

        if options.loglevel >= 3:
            options.stdlog.write("# processing tree %s\n" % tree.name)

        if options.reroot:
            rerootTree(tree, extract_species, options)

        # sn: numbered species set: 0,1,...
        # sl: literal species set: species1, species2, ...
        for c, (sn, sl) in enumerate(zip(xsets, sets)):

            ortholog_nodes = getOrthologNodes(tree,
                                              sn,
                                              options,
                                              selector=method,
                                              outgroups=noutgroups)
            ntotal_tree += len(ortholog_nodes)

            n = 0

            pattern = buildPattern(nspecies, sn)

            # check for inconsistent partitions (the same gene in different
            # ortholog clusters) within the current tree
            found_genes = set()
            ortho_sets = set()

            # reverse ortholog_node - work in top-down manner.
            ortholog_nodes.reverse()

            for node_id, members in ortholog_nodes:
                n += 1
                cluster_id += 1

                # a list, as otus is traversed twice (genes and output)
                otus = [taxon for taxon in tree.get_taxa(node_id)
                        if extract_species(taxon) in sl]
                genes = set(extract_gene(taxon) for taxon in otus)

                shared_genes = found_genes.intersection(genes)
                if shared_genes:

                    # only take largest cluster for lineage specific
                    # duplications
                    if method == "lineage":
                        continue

                    # NOTE(review): both branches only count and log; the
                    # node is still written below even though the first
                    # message says "skipped" -- confirm this is intended.
                    if frozenset(genes) in ortho_sets:
                        nskipped += 1
                        if options.loglevel >= 1:
                            options.stdlog.write(
                                "# %s: cluster %i: redundant node: %i - skipped because already present: %s\n"
                                % (tree.name, n, node_id, str(shared_genes)))
                    else:
                        nerrors += 1
                        if options.loglevel >= 1:
                            options.stdlog.write(
                                "# %s: cluster %i: inconsistent node: %i - the same gene in different clusters: %s\n"
                                % (tree.name, n, node_id, str(shared_genes)))

                found_genes = found_genes.union(genes)
                ortho_sets.add(frozenset(genes))

                xpattern = buildPattern(nspecies, sn, members)

                options.stdout.write(
                    "%i\t%s\t%i\t%i\t%s\t%s\t%i\t%s\n" %
                    (len(sl), tree.name, n, cluster_id, "".join(pattern),
                     "\t".join(xpattern), node_id, ";".join(otus)))

            counts[c] += n

        if ntotal_tree == 0:
            nempty += 1
        else:
            noutput += 1

    if options.loglevel >= 1:
        options.stdout.write(
            "# ninput=%i, nempty=%i, noutput=%i, nskipped=%i, nerrors=%i\n" %
            (ninput, nempty, noutput, nskipped, nerrors))

    # write summary information

    if options.filename_summary:
        outfile = open(options.filename_summary, "w")
    else:
        outfile = options.stdout
        # separates the summary from the per-node table on stdout
        outfile.write("//\n")

    try:
        outfile.write("cluster\tpattern\tcounts\t%s\n" % ("\t".join(species)))

        for c, sn in enumerate(xsets):
            pattern = buildPattern(nspecies, sn)
            outfile.write("%i\t%s\t%i\t%s\n" %
                          (c, "".join(pattern), counts[c],
                           "\t".join(pattern)))
    finally:
        # only close a file opened here, and do so even if a write failed
        if outfile != options.stdout:
            outfile.close()

Esempio n. 3

Mostra file

File: SVGDuplicationsWheel.py Progetto: lesheng/cgat

        map_gene2location = {}
        for l in in_locations.split(";"):
            gene_id, chr, strand, sbjct_from, sbjct_to = l.split(":")
            if chr not in map_contig2size:
                continue
            sbjct_from, sbjct_to = int(sbjct_from), int(sbjct_to)
            map_gene2location[gene_id] = (chr, strand, sbjct_from, sbjct_to)

        if not map_gene2location:
            continue

        tree = TreeTools.Newick2Tree(in_tree)

        # the last subset is all nodes again.
        s = TreeTools.GetSubsets(tree)
        is_first = True
        for children, height, branchlength in s[:-1]:
            if len(children) == 1:
                continue
            data = []
            for c in map(lambda x: x.split(options.separator), children):
                if len(c) == 2:
                    data.append((c[0], c[1], c[1], "CG"))
                elif len(c) == 1:
                    data.append(("unk", c[0], c[0], "CG"))
                elif len(c) == 3:
                    data.append((c[0], c[2], c[3], "CG"))

            for species, transcript, gene, quality in data:
                if not gene in map_gene2location: