Example #1
0
def _iter_records(lines):
    """Yield ``(cluster_id, in_locations, in_tree)`` for every non-comment line."""
    for line in lines:
        if line.startswith("#"):
            continue
        # strip only the line terminator: a final line without a trailing
        # newline must not lose the last character of its tree column.
        cluster_id, in_locations, in_tree = line.rstrip("\n").split("\t")[:3]
        yield cluster_id, in_locations, in_tree


def _iter_known_locations(in_locations, map_contig2size):
    """Yield ``(gene_id, contig, strand, start, end)`` for locations on known contigs."""
    for location in in_locations.split(";"):
        gene_id, contig, strand, sbjct_from, sbjct_to = location.split(":")
        if contig not in map_contig2size:
            continue
        yield gene_id, contig, strand, int(sbjct_from), int(sbjct_to)


def main(argv=None):
    """Script entry point: draw a duplication plot.

    Duplication clusters are read from stdin, one per line, with the
    tab-separated columns ``cluster_id``, ``locations`` and ``tree`` (lines
    starting with ``#`` are skipped).  ``locations`` is a ``;``-separated
    list of ``gene_id:contig:strand:from:to`` entries.  Only locations on
    contigs listed in the ``--contig-sizes`` file are used.  The finished
    plot is written to stdout.

    Arguments:
        argv: command line; defaults to ``sys.argv``.
            NOTE(review): *argv* is not forwarded to ``E.Start`` here, so
            whether it is honoured depends on ``E.Start`` -- confirm.

    Raises:
        ValueError: if no ``--contig-sizes`` file was supplied.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/plot_duplications.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-e",
                      "--headers",
                      dest="headers",
                      action="store_true",
                      help="first row is a header [ignored].")
    parser.add_option("-t",
                      "--title",
                      dest="title",
                      type="string",
                      help="page title.")
    parser.add_option("-f",
                      "--footer",
                      dest="footer",
                      type="string",
                      help="page footer.")
    parser.add_option("-c",
                      "--contig-sizes",
                      dest="filename_contig_sizes",
                      type="string",
                      help="filname with contig sizes.")
    parser.add_option("-r",
                      "--radius",
                      dest="radius",
                      type="int",
                      help="radius.")
    parser.add_option("-i",
                      "--increment",
                      dest="radius_increment",
                      type="int",
                      help="radius increment.")
    parser.add_option("-u",
                      "--url",
                      dest="url",
                      type="string",
                      help="string to build url for annotation.")
    # declared as int so a value given on the command line has the same
    # type as the integer default below.
    parser.add_option("--min-contig",
                      dest="min_contig_size",
                      type="int",
                      help="minimum contig size to delineate.")

    parser.add_option("--min-value",
                      dest="min_value",
                      type="float",
                      help="minimum branch length.")

    parser.add_option("--max-value",
                      dest="max_value",
                      type="float",
                      help="maximum branch length.")

    parser.set_defaults(
        filename_contig_sizes=None,
        headers=False,
        titles="",
        pattern_filename=None,
        title="",
        footer="",
        radius=3000,
        min_value=0.0,
        max_value=0.2,
        url=None,
        radius_increment=40,
        min_contig_size=10000,
        remove_empty_contigs=True,
        separator="|",
        quality2symbol={
            'CG': "circle",
            'PG': "circle",
            'SG': "circle"
        },
        quality2mask=("RG", "CP", "PP", "SP", "RP", "CF", "PF", "SF", "UG",
                      "UP", "UF", "BF", "UK"),
        sort_by_size=True,
        input_format="pairwise",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # Everything below needs the contig sizes; fail with a clear message
    # instead of a NameError on the first use of the map.
    if not options.filename_contig_sizes:
        raise ValueError(
            "please supply a file with contig sizes (--contig-sizes).")

    # assumes IOTools.ReadMap consumes the file before returning (the result
    # is used as a mapping below) -- TODO confirm
    with open(options.filename_contig_sizes, "r") as infile:
        map_contig2size = IOTools.ReadMap(infile, map_functions=(str, int))

    # read data and get contigs that are used (i.e.: remove empty contigs)
    lines = sys.stdin.readlines()

    if options.remove_empty_contigs:
        used_contigs = set()
        for cluster_id, in_locations, in_tree in _iter_records(lines):
            for location in _iter_known_locations(in_locations,
                                                  map_contig2size):
                used_contigs.add(location[1])

        # iterate over a snapshot of the keys: entries are deleted on the fly
        for contig in list(map_contig2size.keys()):
            if contig not in used_contigs:
                del map_contig2size[contig]

    contigs = sorted(map_contig2size.keys())

    if not contigs:
        E.Stop()
        sys.exit(0)

    if options.sort_by_size:
        # the sort is stable, so contigs of equal size stay in name order
        contigs.sort(key=lambda contig: map_contig2size[contig])

    plot = DuplicationPlot(contigs, map_contig2size, num_entries=0)

    plot.mRadiusIncrement = options.radius_increment
    plot.mRadius = options.radius
    plot.mMaxValue = options.max_value
    plot.mMinValue = options.min_value

    if options.title:
        plot.setTitle(options.title)
    if options.footer:
        plot.setFooter(options.footer)

    # NOTE(review): the plot is initialized here and again once the number
    # of entries is known; presumably getPosition() needs the first call.
    plot.initializePlot()

    data = []

    if options.input_format == "pairwise":

        # read data from pairwise analysis
        # format is: cluster_id, locations of duplications, tree of
        # duplications

        for cluster_id, in_locations, in_tree in _iter_records(lines):

            mi, ma = 0, 0
            n = 0
            cluster_contigs = set()
            for gene_id, contig, strand, sbjct_from, sbjct_to in \
                    _iter_known_locations(in_locations, map_contig2size):
                cluster_contigs.add(contig)

                xi = plot.getPosition(contig, strand, sbjct_from)
                xa = plot.getPosition(contig, strand, sbjct_to)

                if not mi:
                    mi = xi
                else:
                    mi = min(mi, xi)

                n += 1
                ma = max(ma, xa)

            # skip clusters without any location on a known contig
            if n == 0:
                continue

            # cis: all retained locations lie on a single contig
            cis = len(cluster_contigs) == 1
            if options.loglevel >= 2:
                options.stdlog.write(
                    "# adding duplications in cluster %s: %s with tree %s\n" %
                    (cluster_id, in_locations, in_tree))
            data.append((cis, n, mi, ma, cluster_id, in_locations, in_tree))

    data.sort()

    plot.mNumEntries = len(data)
    plot.initializePlot()

    last_ndups = 0

    for cis, ndups, mi, ma, cluster_id, in_locations, in_tree in data:

        # start a new ring whenever the number of duplicates changes
        if ndups != last_ndups:
            plot.pushRadius()
            plot.addSeparator()

        last_ndups = ndups

        map_gene2location = {}
        for gene_id, contig, strand, sbjct_from, sbjct_to in \
                _iter_known_locations(in_locations, map_contig2size):
            map_gene2location[gene_id] = (contig, strand, sbjct_from,
                                          sbjct_to)

        if not map_gene2location:
            continue

        tree = TreeTools.Newick2Tree(in_tree)

        # the last subset is all nodes again.
        subsets = TreeTools.GetSubsets(tree)

        is_first = True
        for children, height, branchlength in subsets[:-1]:
            if len(children) == 1:
                continue
            # a real list (not a lazy map object) so the callee may iterate
            # it more than once
            c = [child.split(options.separator) for child in children]
            plot.addDuplication(c,
                                map_gene2location,
                                height,
                                url=options.url,
                                with_separator=is_first,
                                link_to_previous=not is_first,
                                quality2symbol=options.quality2symbol,
                                quality2mask=options.quality2mask)
            is_first = False

    plot.writeToFile(sys.stdout)

    E.Stop()
Example #2
0
def main(argv=None):
    """Script entry point: summarise duplication data read from stdin.

    Rows are read from stdin as tab-separated columns; lines starting with
    ``#`` are skipped and the first remaining row is dropped as a header.
    For every combination of ``--locations`` and ``--functions`` the rows
    selected by ``GetSubset`` are processed according to ``--method``:

    * ``counts`` / ``medians``: species x species matrix of the number or
      median of heights.
    * ``lists`` / ``lists-union``: taxa of each tree with cluster id and
      number of taxa, each taxon written once per output section.
    * ``hists`` / ``fit-decay``: histograms of heights per species, or the
      parameters returned by ``fit`` for each histogram.
    * ``pairs``: cluster id, height, locations and tree per species pair.
    * ``links``: cluster id, locations and tree, first row per cluster.

    Output goes to stdout, or to files named by expanding the
    ``--filename-output`` pattern with the keys ``f`` (function) and ``l``
    (location) plus, depending on the method, ``s`` and ``o`` (species).

    Arguments:
        argv: command line; defaults to ``sys.argv``.
            NOTE(review): *argv* is not forwarded to ``E.Start`` here, so
            whether it is honoured depends on ``E.Start`` -- confirm.

    Raises:
        ValueError: if the chosen method works per species and no species
            was given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/analyze_duplications.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option("-s",
                      "--species",
                      dest="species",
                      type="string",
                      help="species to use.")

    parser.add_option("-p",
                      "--prefix",
                      dest="prefix",
                      type="string",
                      help="prefix to use for temporary files.")

    # help text lists every method handled below
    parser.add_option(
        "-m",
        "--method",
        dest="method",
        type="string",
        help="method to use "
        "[counts|medians|lists|lists-union|hists|fit-decay|pairs|links].")

    parser.add_option("-o",
                      "--filename-output",
                      dest="filename_output",
                      type="string",
                      help="output filename.")

    parser.add_option("-f",
                      "--functions",
                      dest="functions",
                      type="string",
                      help="functions to grep [functional|pseudo|all].")

    parser.add_option("-l",
                      "--locations",
                      dest="locations",
                      type="string",
                      help="locations to grep [local|nojunk|all|...].")

    # float, matching the default of 1.0: the value is handed on as the
    # histogram increment and must not arrive as a string.
    parser.add_option("-b",
                      "--bin-size",
                      dest="bin_size",
                      type="float",
                      help="bin size.")

    parser.add_option("-i",
                      "--fit",
                      dest="fit",
                      type="string",
                      help="fitting method [decay|power]")

    parser.add_option("--min-value",
                      dest="min_value",
                      type="float",
                      help="minimum value for histogram.")

    parser.add_option("--max-value",
                      dest="max_value",
                      type="float",
                      help="maximum value for histogram.")

    parser.add_option("--use-relative-height",
                      dest="use_relative_height",
                      action="store_true",
                      help="use relative height values.")

    parser.add_option(
        "--reverse",
        dest="reverse",
        action="store_true",
        help="""reverse species. Histograms will show the age of duplications for
                      duplicates in other genomes.""")

    parser.set_defaults(species="",
                        functions="functional,pseudo,all",
                        locations="local,nojunk,all",
                        filename_output=None,
                        bin_size=1.0,
                        min_value=None,
                        max_value=None,
                        nonnull=None,
                        use_relative_height=False,
                        header=True,
                        fit=None,
                        reverse=False,
                        method="counts")

    (options, args) = E.Start(parser, add_psql_options=True)

    # "".split(",") is [""], so empty names are dropped explicitly;
    # otherwise the "no species" check below could never trigger.
    options.species = [s for s in options.species.split(",") if s]
    options.locations = options.locations.split(",")
    options.functions = options.functions.split(",")

    # "lists", "lists-union" and "pairs" take the species from the data
    # rows; all other methods index their results by the species list.
    methods_needing_species = ("counts", "medians", "hists", "fit-decay",
                               "links")
    if not options.species and options.method in methods_needing_species:
        raise ValueError("please supply list of species.")

    # NOTE(review): the connection is neither used nor closed in this
    # function; it is kept so the script still fails early when the
    # database cannot be reached -- confirm whether it can be removed.
    dbhandle = pgdb.connect(options.psql_connection)

    # strip only the line terminator so a final line without a trailing
    # newline keeps the last character of its last column.
    input_data = [line.rstrip("\n").split("\t")
                  for line in sys.stdin.readlines()
                  if not line.startswith("#")]

    ## remove header
    if options.header and input_data:
        del input_data[0]

    ## decide which columns to take
    ## 1st column: species1: this is the species in which duplications have occured.
    ## 2nd column: species2: this is the species with respect to which duplications occured.
    ## 3rd column: clusterid
    ## 4th column: chromosomes
    ## 5th column: function
    ## 6th column: height
    ## 7th column: relative height
    ## 8th column: locations
    ## 9th column: tree
    if options.use_relative_height:
        take = (0, 1, 2, 3, 4, 6, 7, 8)
    else:
        take = (0, 1, 2, 3, 4, 5, 7, 8)

    input_data = [tuple([row[y] for y in take]) for row in input_data]

    nspecies = len(options.species)
    map_pos2species = list(options.species)
    map_species2pos = dict(
        [(species, pos) for pos, species in enumerate(options.species)])

    outfile = None

    if options.method in ("counts", "medians"):

        if options.method == "counts":
            func = len
            number_format = "%i"
        else:
            func = numpy.median
            number_format = "%6.4f"

        for location in options.locations:

            for function in options.functions:
                # builtin float as dtype: the old ``numpy.Float`` alias is
                # not available in current numpy.
                matrix = numpy.zeros((nspecies, nspecies), float)

                data = GetSubset(input_data, location, function)

                ## sort by species1 and species2
                data.sort()

                ## collect heights per (species1, species2) run of the
                ## sorted data and store func(heights) in the matrix
                last_species1, last_species2 = None, None
                values = []
                for species1, species2, cluster_id, l, f, height, locations, tree in data:

                    if last_species1 != species1 or last_species2 != species2:

                        if values:
                            matrix[map_species2pos[last_species1],
                                   map_species2pos[last_species2]] = func(
                                       values)

                        values = []
                        last_species1 = species1
                        last_species2 = species2

                    values.append(float(height))

                if values:
                    matrix[map_species2pos[last_species1],
                           map_species2pos[last_species2]] = func(values)

                if options.filename_output:
                    keys = {"f": function, "l": location}
                    outfile = open(options.filename_output % keys, "w")
                else:
                    outfile = sys.stdout
                    outfile.write(
                        "matrix for method %s: location: %s, function: %s\n" %
                        (options.method, location, function))

                try:
                    MatlabTools.WriteMatrix(matrix,
                                            outfile=outfile,
                                            format=number_format,
                                            row_headers=options.species,
                                            col_headers=options.species)
                finally:
                    if options.filename_output:
                        outfile.close()

    elif options.method in ("lists", "lists-union"):
        ## write lists of duplicated genes in species1 as compared to species2
        ##      according to location/function
        ## First field : gene name
        ## Second field: cluster id
        ## Third field : number of other genes in cluster
        written = set()
        for location in options.locations:

            for function in options.functions:

                data = GetSubset(input_data, location, function)

                ## sort by species1 and species2
                data.sort()

                last_species1, last_species2 = None, None

                for species1, species2, cluster_id, l, f, height, locations, tree in data:

                    if last_species1 != species1 or last_species2 != species2:

                        ## start a new output section; "lists" starts one
                        ## per species pair, "lists-union" per species1
                        if options.filename_output:
                            if options.method == "lists":
                                if outfile:
                                    outfile.close()
                                keys = {
                                    "f": function,
                                    "l": location,
                                    "s": species1,
                                    "o": species2
                                }
                                written = set()
                                outfile = open(options.filename_output % keys,
                                               "w")
                            elif options.method == "lists-union":
                                if last_species1 != species1:
                                    if outfile:
                                        outfile.close()
                                    keys = {
                                        "f": function,
                                        "l": location,
                                        "s": species1
                                    }
                                    written = set()
                                    outfile = open(
                                        options.filename_output % keys, "w")
                        else:
                            outfile = sys.stdout
                            if options.method == "lists":
                                outfile.write(
                                    "location: %s, function: %s, species1: %s, species2: %s\n"
                                    % (location, function, species1, species2))
                                written = set()
                            elif options.method == "lists-union":
                                if last_species1 != species1:
                                    outfile.write(
                                        "location: %s, function: %s, species1: %s\n"
                                        % (location, function, species1))
                                    written = set()

                        last_species1 = species1
                        last_species2 = species2

                    # get tree
                    tt = TreeTools.Newick2Tree(tree)
                    taxa = TreeTools.GetTaxa(tt)
                    for t in taxa:
                        # each taxon is written once per output section
                        if t in written:
                            continue
                        outfile.write("%s\t%s\t%i\n" %
                                      (t, cluster_id, len(taxa)))
                        written.add(t)

        # close the file of the last section, too
        if options.filename_output and outfile:
            outfile.close()

    elif options.method in ("hists", "fit-decay"):

        for location in options.locations:

            for function in options.functions:

                values = [[[] for y in range(nspecies)]
                          for x in range(nspecies)]

                data = GetSubset(input_data, location, function)

                data.sort()

                ################################################################
                ## convert to matrix of list
                ## values[x][y] contains heights of duplications in species x with reference to y

                for species1, species2, cluster_id, l, f, height, locations, tree in data:
                    try:
                        values[map_species2pos[species1]][
                            map_species2pos[species2]].append(float(height))
                    except KeyError:
                        # rows for species that were not requested are ignored
                        continue

                ################################################################
                # calculate histograms per species
                ################################################################
                for s in options.species:
                    histograms = []
                    headers = []

                    if options.filename_output:
                        keys = {"f": function, "l": location, "s": s}
                        outfile = open(options.filename_output % keys, "w")
                    else:
                        outfile = sys.stdout
                        outfile.write("location: %s, function: %s\n" %
                                      (location, function))

                    for x in range(nspecies):

                        if options.reverse:
                            ## duplications in species x
                            vv = values[x][map_species2pos[s]]
                        else:
                            ## duplications in species s
                            vv = values[map_species2pos[s]][x]

                        if not vv:
                            continue

                        headers.append(options.species[x])
                        h = Histogram.Calculate(
                            vv,
                            increment=options.bin_size,
                            min_value=options.min_value,
                            max_value=options.max_value,
                            no_empty_bins=True)

                        if options.method == "fit-decay":
                            result = fit(h, [2.0, -1.0])
                            if result:
                                outfile.write(
                                    "%s\t%s\t%s\t%i\t%f\t%f\ty = %f * exp ( %f * x )\n"
                                    % (
                                        "function",
                                        s,
                                        options.species[x],
                                        h[0][1],
                                        result[0],
                                        result[1],
                                        result[0],
                                        result[1],
                                    ))
                        elif options.method == "hists":
                            histograms.append(h)

                    if options.method == "hists":
                        combined_histogram = Histogram.Combine(
                            histograms, missing_value="-")

                        outfile.write("bin\t" + "\t".join(headers) + "\n")
                        Histogram.Write(outfile, combined_histogram)

                    if options.filename_output:
                        outfile.close()
                    else:
                        outfile.flush()

    elif options.method == "pairs":

        ## write all rows per species pair

        for location in options.locations:

            if options.loglevel >= 2:
                options.stdlog.write("# processing location %s\n" % location)

            for function in options.functions:

                if options.loglevel >= 2:
                    options.stdlog.write("#   processing function %s " %
                                         function)
                    options.stdlog.flush()

                data = GetSubset(input_data, location, function)

                if options.loglevel >= 2:
                    options.stdlog.write("with %i data points\n" % len(data))
                    options.stdlog.flush()

                data.sort()
                last_species1, last_species2 = None, None

                for species1, species2, cluster_id, l, f, height, locations, tree in data:

                    if last_species1 != species1 or last_species2 != species2:

                        ## start a new output section per species pair
                        if options.filename_output:
                            if outfile:
                                outfile.close()
                            keys = {
                                "f": function,
                                "l": location,
                                "s": species1,
                                "o": species2
                            }
                            outfile = open(options.filename_output % keys, "w")
                        else:
                            outfile = sys.stdout
                            outfile.write(
                                "location: %s, function: %s, species1: %s, species2: %s\n"
                                % (location, function, species1, species2))

                        last_species1 = species1
                        last_species2 = species2

                    outfile.write("%s\t%s\t%s\t%s\n" %
                                  (cluster_id, height, locations, tree))

        # close the file of the last section, too
        if options.filename_output and outfile:
            outfile.close()

    elif options.method == "links":

        ## write a tree for each species pair:
        ## each node is a gene+location, the weight of the vertex is the height
        ## further info added: cluster_id for the duplication

        for location in options.locations:

            if options.loglevel >= 2:
                options.stdlog.write("# processing location %s\n" % location)

            for function in options.functions:

                if options.loglevel >= 2:
                    options.stdlog.write("#   processing function %s " %
                                         function)
                    options.stdlog.flush()

                data = GetSubset(input_data, location, function)

                if options.loglevel >= 2:
                    options.stdlog.write("with %i data points\n" % len(data))
                    options.stdlog.flush()

                ## stores duplications within first species as compared to second species
                values = [[[] for y in range(nspecies)]
                          for x in range(nspecies)]

                for species1, species2, cluster_id, l, f, height, locations, tree in data:
                    # the negated length makes the entry with the longest
                    # locations string sort first within a cluster
                    values[map_species2pos[species1]][
                        map_species2pos[species2]].append(
                            (cluster_id, -len(locations), locations, tree))

                # get links per species
                for s in options.species:
                    if options.loglevel >= 2:
                        options.stdlog.write("#     processing species %s\n" %
                                             s)

                    for x in range(nspecies):

                        if map_pos2species[x] == s:
                            continue

                        vv = values[map_species2pos[s]][x]
                        vv.sort()

                        ## write trees per cluster
                        if options.filename_output:
                            keys = {
                                "f": function,
                                "l": location,
                                "s": s,
                                "o": map_pos2species[x]
                            }
                            outfile = open(options.filename_output % keys, "w")
                        else:
                            outfile = sys.stdout
                            outfile.write(
                                "location: %s, function: %s, species1: %s, species2: %s\n"
                                % (location, function, s, map_pos2species[x]))

                        ## only print out largest tree
                        last_cluster_id = None
                        for cluster_id, n, locations, tree in vv:
                            if cluster_id != last_cluster_id:
                                outfile.write("%s\t%s\t%s\n" %
                                              (cluster_id, locations, tree))
                                last_cluster_id = cluster_id

                        if options.filename_output:
                            outfile.close()

    E.Stop()
Example #3
0
            plot.addSeparator()

        last_ndups = ndups

        map_gene2location = {}
        for l in in_locations.split(";"):
            gene_id, chr, strand, sbjct_from, sbjct_to = l.split(":")
            if chr not in map_contig2size:
                continue
            sbjct_from, sbjct_to = int(sbjct_from), int(sbjct_to)
            map_gene2location[gene_id] = (chr, strand, sbjct_from, sbjct_to)

        if not map_gene2location:
            continue

        tree = TreeTools.Newick2Tree(in_tree)

        # the last subset is all nodes again.
        s = TreeTools.GetSubsets(tree)
        is_first = True
        for children, height, branchlength in s[:-1]:
            if len(children) == 1:
                continue
            data = []
            for c in map(lambda x: x.split(options.separator), children):
                if len(c) == 2:
                    data.append((c[0], c[1], c[1], "CG"))
                elif len(c) == 1:
                    data.append(("unk", c[0], c[0], "CG"))
                elif len(c) == 3:
                    data.append((c[0], c[2], c[3], "CG"))