Esempio n. 1
0
def _iterRecords(lines):
    """Yield ``[cluster_id, in_locations, in_tree]`` for each data line.

    Lines starting with ``#`` and whitespace-only lines are skipped. Only
    the trailing newline is removed, so a final line that lacks a newline
    keeps its last character. Lines with fewer than three tab-separated
    columns yield a shorter list, which fails when the caller unpacks it.
    """
    for line in lines:
        if line.startswith("#") or not line.strip():
            continue
        yield line.rstrip("\n").split("\t")[:3]


def _iterLocations(in_locations):
    """Yield the fields of each ``;``-separated location as strings.

    Each location must consist of exactly five ``:``-separated fields
    (``gene_id, contig, strand, sbjct_from, sbjct_to``); anything else
    raises ValueError.
    """
    for location in in_locations.split(";"):
        gene_id, contig, strand, sbjct_from, sbjct_to = location.split(":")
        yield gene_id, contig, strand, sbjct_from, sbjct_to


def main(argv=None):
    """Script main: plot duplications read from stdin.

    Parses command line options in sys.argv, unless *argv* is given.

    Input (stdin) is tab-separated with the columns cluster id, locations
    and tree. Locations are ``;``-separated entries of the form
    ``gene_id:contig:strand:from:to``; the tree is parsed with
    ``TreeTools.Newick2Tree``. Contig sizes are read from the file given
    by ``--contig-sizes``. The plot is written to stdout.

    Raises:
        ValueError: if no contig sizes file was given, or if an input
            line or location does not have the expected number of fields.

    Exits with status 0 without plotting if no contig remains after
    contigs without any location have been removed.
    """

    if argv is None:
        argv = sys.argv
    # NOTE(review): argv is not forwarded to E.Start below, so a caller
    # supplied *argv* has no visible effect here - confirm against E.Start.

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/plot_duplications.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-e",
                      "--headers",
                      dest="headers",
                      action="store_true",
                      help="first row is a header [ignored].")
    parser.add_option("-t",
                      "--title",
                      dest="title",
                      type="string",
                      help="page title.")
    parser.add_option("-f",
                      "--footer",
                      dest="footer",
                      type="string",
                      help="page footer.")
    parser.add_option("-c",
                      "--contig-sizes",
                      dest="filename_contig_sizes",
                      type="string",
                      help="filname with contig sizes.")
    parser.add_option("-r",
                      "--radius",
                      dest="radius",
                      type="int",
                      help="radius.")
    parser.add_option("-i",
                      "--increment",
                      dest="radius_increment",
                      type="int",
                      help="radius increment.")
    parser.add_option("-u",
                      "--url",
                      dest="url",
                      type="string",
                      help="string to build url for annotation.")
    parser.add_option("--min-contig",
                      dest="min_contig_size",
                      type="string",
                      help="minimum contig size to delineate.")

    parser.add_option("--min-value",
                      dest="min_value",
                      type="float",
                      help="minimum branch length.")

    parser.add_option("--max-value",
                      dest="max_value",
                      type="float",
                      help="maximum branch length.")

    parser.set_defaults(
        filename_contig_sizes=None,
        headers=False,
        titles="",
        pattern_filename=None,
        title="",
        footer="",
        radius=3000,
        min_value=0.0,
        max_value=0.2,
        url=None,
        radius_increment=40,
        min_contig_size=10000,
        remove_empty_contigs=True,
        separator="|",
        quality2symbol={
            'CG': "circle",
            'PG': "circle",
            'SG': "circle"
        },
        quality2mask=("RG", "CP", "PP", "SP", "RP", "CF", "PF", "SF", "UG",
                      "UP", "UF", "BF", "UK"),
        sort_by_size=True,
        input_format="pairwise",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # Everything below needs the contig sizes; fail with a clear message
    # instead of a NameError on the first use of the map.
    if not options.filename_contig_sizes:
        raise ValueError(
            "no contig sizes given: please supply --contig-sizes")

    with open(options.filename_contig_sizes, "r") as infile:
        map_contig2size = IOTools.ReadMap(infile, map_functions=(str, int))

    # read data once; it is traversed twice below
    records = list(_iterRecords(sys.stdin.readlines()))

    # get contigs that are used (i.e.: remove empty contigs)
    if options.remove_empty_contigs:
        used_contigs = set()
        for cluster_id, in_locations, in_tree in records:
            for fields in _iterLocations(in_locations):
                contig = fields[1]
                if contig in map_contig2size:
                    used_contigs.add(contig)

        # iterate over a copy of the keys: entries are deleted in the loop
        for contig in list(map_contig2size.keys()):
            if contig not in used_contigs:
                del map_contig2size[contig]

    # contigs in name order ...
    contigs = sorted(map_contig2size.keys())

    if not contigs:
        E.Stop()
        sys.exit(0)

    # ... or by size; the sort is stable, so name order breaks ties
    if options.sort_by_size:
        contigs.sort(key=lambda x: map_contig2size[x])

    plot = DuplicationPlot(contigs, map_contig2size, num_entries=0)

    plot.mRadiusIncrement = options.radius_increment
    plot.mRadius = options.radius
    plot.mMaxValue = options.max_value
    plot.mMinValue = options.min_value

    if options.title:
        plot.setTitle(options.title)
    if options.footer:
        plot.setFooter(options.footer)

    plot.initializePlot()

    data = []

    if options.input_format == "pairwise":

        # read data from pairwise analysis
        # format is: cluster_id, locations of duplications, tree of
        # duplications

        for cluster_id, in_locations, in_tree in records:

            # None marks "no position seen yet", so that a position of 0
            # is kept as a valid minimum.
            mi, ma = None, 0
            n = 0
            contigs_hit = set()
            for gene_id, contig, strand, sbjct_from, sbjct_to in \
                    _iterLocations(in_locations):
                if contig not in map_contig2size:
                    continue
                contigs_hit.add(contig)
                sbjct_from, sbjct_to = int(sbjct_from), int(sbjct_to)

                xi = plot.getPosition(contig, strand, sbjct_from)
                xa = plot.getPosition(contig, strand, sbjct_to)

                if mi is None:
                    mi = xi
                else:
                    mi = min(mi, xi)

                n += 1
                ma = max(ma, xa)

            # no location on a known contig
            if n == 0:
                continue

            # all locations of the cluster lie on a single contig
            cis = len(contigs_hit) == 1
            if options.loglevel >= 2:
                options.stdlog.write(
                    "# adding duplications in cluster %s: %s with tree %s\n" %
                    (cluster_id, in_locations, in_tree))
            data.append((cis, n, mi, ma, cluster_id, in_locations, in_tree))

    data.sort()

    # the number of entries is only known now: initialize again
    plot.mNumEntries = len(data)
    plot.initializePlot()

    last_ndups = 0

    for cis, ndups, mi, ma, cluster_id, in_locations, in_tree in data:

        # start a new ring whenever the number of duplications changes
        if ndups != last_ndups:
            plot.pushRadius()
            plot.addSeparator()

        last_ndups = ndups

        map_gene2location = {}
        for gene_id, contig, strand, sbjct_from, sbjct_to in \
                _iterLocations(in_locations):
            if contig not in map_contig2size:
                continue
            map_gene2location[gene_id] = (contig, strand,
                                          int(sbjct_from), int(sbjct_to))

        if not map_gene2location:
            continue

        tree = TreeTools.Newick2Tree(in_tree)

        # the last subset is all nodes again.
        subsets = TreeTools.GetSubsets(tree)

        is_first = True
        for children, height, branchlength in subsets[:-1]:
            if len(children) == 1:
                continue
            c = [x.split(options.separator) for x in children]
            plot.addDuplication(c,
                                map_gene2location,
                                height,
                                url=options.url,
                                with_separator=is_first,
                                link_to_previous=not is_first,
                                quality2symbol=options.quality2symbol,
                                quality2mask=options.quality2mask)
            is_first = False

    plot.writeToFile(sys.stdout)

    E.Stop()
Esempio n. 2
0
def _enumerateSpeciesSets(species, options, reference_tree=None):
    """Return the species sets selected by ``options.enumeration``.

    An unrecognized enumeration yields an empty list.

    Raises:
        ValueError: for "monophyletic" enumeration without a reference tree.
    """
    enumeration = options.enumeration
    sets = []

    if enumeration == "monophyletic":
        if not reference_tree:
            raise ValueError(
                "please specify a species tree for monophyletic enumeration")
        for members, h1, h2 in TreeTools.GetSubsets(reference_tree):
            if len(members) > 1:
                sets.append(members)

    elif enumeration == "exhaustive":
        for size in range(2, len(species)):
            sets.extend(SetTools.xuniqueCombinations(species, size))
        sets.append(species)

    elif enumeration == "pairwise":
        for x in range(len(species) - 1):
            for y in range(x + 1, len(species)):
                sets.append((species[x], species[y]))

    elif enumeration == "full":
        sets.append(species)

    elif enumeration == "lineage":
        for s in species:
            sets.append((s, ))

    elif enumeration == "explicit":
        for size in range(2, len(options.species_set)):
            sets.extend(
                SetTools.xuniqueCombinations(options.species_set, size))
        sets.append(options.species_set)

    return sets


def writeOrthologSets(outfile,
                      nexus,
                      extract_species,
                      extract_gene,
                      options,
                      reference_tree=None,
                      method="strict",
                      outgroups=None):
    """output ortholog sets.

    A "strict" ortholog set contains exactly one gene for each species,
    while a "degenerate" ortholog set contains at least one gene for each
    species.

    For every tree in ``nexus.trees`` and every species set selected by
    ``options.enumeration``, the nodes returned by ``getOrthologNodes`` are
    written as one row each to ``options.stdout``. A summary with the
    number of clusters per species set is written to
    ``options.filename_summary`` if set, otherwise to ``options.stdout``
    after a ``//`` separator line.

    Arguments:
        outfile: ignored; the summary destination is taken from *options*.
        nexus: object whose ``trees`` are processed.
        extract_species: callable mapping a taxon to its species.
        extract_gene: callable mapping a taxon to its gene.
        options: option object; the attributes read here are
            ``column2org``, ``org2column``, ``enumeration``, ``species_set``
            (for "explicit" enumeration), ``reroot``, ``loglevel``,
            ``stdout``, ``stdlog`` and ``filename_summary``.
        reference_tree: species tree, required for "monophyletic"
            enumeration.
        method: passed to ``getOrthologNodes`` as ``selector``.
        outgroups: optional species names, translated via
            ``options.org2column`` and passed to ``getOrthologNodes``.

    Raises:
        ValueError: for "monophyletic" enumeration without a reference tree.
    """

    ######################################################################
    # build species set to compare
    species = options.column2org
    nspecies = len(species)
    sets = _enumerateSpeciesSets(species, options, reference_tree)

    ######################################################################
    # build sets with positional information
    xsets = [
        frozenset(options.org2column[s] for s in members) for members in sets
    ]

    ######################################################################
    # collect outgroups
    if outgroups:
        noutgroups = set(options.org2column[x] for x in outgroups)
    else:
        noutgroups = None

    ######################################################################
    # loop over each tree and set
    # I did not see a way to loop a tree once for all sets without doing
    # complicated counting. The problem is that counting has to be stopped
    # at different tree heights for different sets.
    ninput, noutput, nempty, nskipped = 0, 0, 0, 0

    counts = [0] * len(sets)

    options.stdout.write(
        "nspecies\tname\tid\tcluster\tpattern\t%s\tnode_id\tmembers\n" %
        "\t".join(species))

    cluster_id = 0
    nerrors = 0

    for tree in nexus.trees:

        ninput += 1
        ntotal_tree = 0

        if options.loglevel >= 3:
            options.stdlog.write("# processing tree %s\n" % tree.name)

        if options.reroot:
            rerootTree(tree, extract_species, options)

        # sn: numbered species set: 0,1,...
        # sl: literal species set: species1, species2, ...
        for c, (sn, sl) in enumerate(zip(xsets, sets)):

            ortholog_nodes = getOrthologNodes(tree,
                                              sn,
                                              options,
                                              selector=method,
                                              outgroups=noutgroups)
            ntotal_tree += len(ortholog_nodes)

            n = 0

            pattern = buildPattern(nspecies, sn)

            # check for inconsistent partitions (the same gene in different
            # ortholog clusters) within the current tree
            found_genes = set()
            ortho_sets = set()

            # reverse ortholog_node - work in top-down manner.
            ortholog_nodes.reverse()

            for node_id, members in ortholog_nodes:
                n += 1
                cluster_id += 1

                # a real list: it is traversed twice (genes and output row)
                otus = [
                    x for x in tree.get_taxa(node_id)
                    if extract_species(x) in sl
                ]
                genes = set(map(extract_gene, otus))

                shared_genes = found_genes.intersection(genes)
                if shared_genes:

                    # only take largest cluster for lineage specific
                    # duplications
                    if method == "lineage":
                        continue

                    # NOTE(review): the redundant case is counted and
                    # logged as skipped, but the row is still written
                    # below - confirm whether a ``continue`` is intended.
                    if frozenset(genes) in ortho_sets:
                        nskipped += 1
                        if options.loglevel >= 1:
                            options.stdlog.write(
                                "# %s: cluster %i: redundant node: %i - skipped because already present: %s\n"
                                % (tree.name, n, node_id, str(shared_genes)))
                    else:
                        nerrors += 1
                        if options.loglevel >= 1:
                            options.stdlog.write(
                                "# %s: cluster %i: inconsistent node: %i - the same gene in different clusters: %s\n"
                                % (tree.name, n, node_id, str(shared_genes)))

                found_genes.update(genes)
                ortho_sets.add(frozenset(genes))

                xpattern = buildPattern(nspecies, sn, members)

                options.stdout.write(
                    "%i\t%s\t%i\t%i\t%s\t%s\t%i\t%s\n" %
                    (len(sl), tree.name, n, cluster_id, "".join(pattern),
                     "\t".join(xpattern), node_id, ";".join(otus)))

            counts[c] += n

        if ntotal_tree == 0:
            nempty += 1
        else:
            noutput += 1

    if options.loglevel >= 1:
        options.stdout.write(
            "# ninput=%i, nempty=%i, noutput=%i, nskipped=%i, nerrors=%i\n" %
            (ninput, nempty, noutput, nskipped, nerrors))

    # write summary information

    if options.filename_summary:
        outfile = open(options.filename_summary, "w")
    else:
        outfile = options.stdout
        outfile.write("//\n")

    try:
        outfile.write("cluster\tpattern\tcounts\t%s\n" %
                      ("\t".join(species)))

        for c, sn in enumerate(xsets):
            pattern = buildPattern(nspecies, sn)
            outfile.write(
                "%i\t%s\t%i\t%s\n" %
                (c, "".join(pattern), counts[c], "\t".join(pattern)))
    finally:
        # close the summary file even if writing fails; never close stdout
        if outfile is not options.stdout:
            outfile.close()
Esempio n. 3
0
        map_gene2location = {}
        for l in in_locations.split(";"):
            gene_id, chr, strand, sbjct_from, sbjct_to = l.split(":")
            if chr not in map_contig2size:
                continue
            sbjct_from, sbjct_to = int(sbjct_from), int(sbjct_to)
            map_gene2location[gene_id] = (chr, strand, sbjct_from, sbjct_to)

        if not map_gene2location:
            continue

        tree = TreeTools.Newick2Tree(in_tree)

        # the last subset is all nodes again.
        s = TreeTools.GetSubsets(tree)
        is_first = True
        for children, height, branchlength in s[:-1]:
            if len(children) == 1:
                continue
            data = []
            for c in map(lambda x: x.split(options.separator), children):
                if len(c) == 2:
                    data.append((c[0], c[1], c[1], "CG"))
                elif len(c) == 1:
                    data.append(("unk", c[0], c[0], "CG"))
                elif len(c) == 3:
                    data.append((c[0], c[2], c[3], "CG"))

            for species, transcript, gene, quality in data:
                if not gene in map_gene2location: