Beispiel #1
0
def printHistogram(values, section, options, min_value=0, increment=1.0):
    """Write a histogram of *values* to the output file for *section*.

    The file name is ``options.output_filename_pattern % section``. The
    file starts with a ``bin<TAB>section`` header line followed by one
    ``bin<TAB>count`` line per histogram entry.

    If *values* is empty no file is written; a note is sent to
    ``options.stdlog`` when ``options.loglevel >= 1``.

    :param values: sequence of values to bin.
    :param section: label used in the file name, header and log message.
    :param options: object providing ``loglevel``, ``stdlog`` and
        ``output_filename_pattern``.
    :param min_value: lower bound passed on to ``Histogram.Calculate``.
    :param increment: bin size passed on to ``Histogram.Calculate``.
    """

    if len(values) == 0:
        if options.loglevel >= 1:
            options.stdlog.write(
                "# no histogram data for section %s\n" % (section))
        return

    outfile = IOTools.openFile(options.output_filename_pattern % section, "w")
    try:
        # Honour the caller's binning parameters; they were previously
        # ignored in favour of hard-coded values equal to the defaults.
        h = Histogram.Calculate(
            values,
            no_empty_bins=True,
            min_value=min_value,
            increment=increment)

        outfile.write("bin\t%s\n" % section)
        for bin_value, count in h:
            outfile.write("%5.2f\t%i\n" % (bin_value, count))
    finally:
        # Close the file even if histogram calculation or writing fails.
        outfile.close()
Beispiel #2
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a tab-separated table of duplications from stdin (see the
    column description below) and, for every combination of the
    requested locations and functions, writes output according to
    ``--method``:

    counts/medians
        species x species matrix of the number/median of heights.
    lists/lists-union
        taxa found in the trees, with cluster id and tree size.
    hists/fit-decay
        histograms of heights per species, or a fit on each histogram.
    pairs
        cluster id, height, locations and tree per species pair.
    links
        one line per cluster for each species pair.

    Output goes to stdout unless ``--filename-output`` supplies a
    pattern, which is filled with the keys ``f`` (function),
    ``l`` (location), ``s`` (species1) and ``o`` (species2).

    Raises ValueError if no species were supplied.
    """

    # NOTE(review): argv is normalised here but never handed to E.Start
    # below -- confirm whether E.Start reads sys.argv itself.
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/analyze_duplications.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option("-s", "--species", dest="species", type="string",
                      help="species to use.")

    parser.add_option("-p", "--prefix", dest="prefix", type="string",
                      help="prefix to use for temporary files.")

    parser.add_option("-m", "--method", dest="method", type="string",
                      help="method to use [counts|lists|hists|links].")

    parser.add_option("-o", "--filename-output", dest="filename_output",
                      type="string",
                      help="output filename.")

    parser.add_option("-f", "--functions", dest="functions", type="string",
                      help="functions to grep [functional|pseudo|all].")

    parser.add_option("-l", "--locations", dest="locations", type="string",
                      help="locations to grep [local|nojunk|all|...].")

    parser.add_option("-b", "--bin-size", dest="bin_size", type="string",
                      help="bin size.")

    parser.add_option("-i", "--fit", dest="fit", type="string",
                      help="fitting method [decay|power]")

    parser.add_option("--min-value", dest="min_value", type="float",
                      help="minimum value for histogram.")

    parser.add_option("--max-value", dest="max_value", type="float",
                      help="maximum value for histogram.")

    parser.add_option("--use-relative-height", dest="use_relative_height",
                      action="store_true",
                      help="use relative height values.")

    parser.add_option(
        "--reverse",
        dest="reverse",
        action="store_true",
        help="""reverse species. Histograms will show the age of duplications for
                      duplicates in other genomes.""")

    parser.set_defaults(species="",
                        functions="functional,pseudo,all",
                        locations="local,nojunk,all",
                        filename_output=None,
                        bin_size=1.0,
                        min_value=None,
                        max_value=None,
                        nonnull=None,
                        use_relative_height=False,
                        header=True,
                        fit=None,
                        reverse=False,
                        method="counts")

    (options, args) = E.Start(parser, add_psql_options=True)

    # "".split(",") yields [""], so drop empty entries; otherwise the
    # check for a missing species list below could never trigger.
    options.species = [s for s in options.species.split(",") if s]
    options.locations = options.locations.split(",")
    options.functions = options.functions.split(",")

    if not options.species:
        # String exceptions are not supported any more; raise a real one.
        raise ValueError("please supply list of species.")

    # The handle is not used further down in this function.
    dbhandle = pgdb.connect(options.psql_connection)

    # Read all non-comment lines; each line is stripped of its final
    # character (the newline) and split into columns.
    input_data = [line[:-1].split("\t")
                  for line in sys.stdin.readlines()
                  if line[0] != "#"]

    ## remove header (guarded so that empty input does not crash)
    if options.header and input_data:
        del input_data[0]

    ## decide which columns to take
    ## 1st column: species1: this is the species in which duplications have occured.
    ## 2nd column: species2: this is the species with respect to which duplications occured.
    ## 3rd column: clusterid
    ## 4th column: chromosomes
    ## 5th column: function
    ## 6th column: height
    ## 7th column: relative height
    ## 8th column: locations
    ## 9th column: tree
    if options.use_relative_height:
        take = (0, 1, 2, 3, 4, 6, 7, 8)
    else:
        take = (0, 1, 2, 3, 4, 5, 7, 8)

    input_data = [tuple([row[y] for y in take]) for row in input_data]

    # Two-way mapping between species names and matrix positions.
    map_pos2species = []
    map_species2pos = {}
    for pos, species in enumerate(options.species):
        map_species2pos[species] = pos
        map_pos2species.append(species)

    nspecies = len(options.species)

    outfile = None

    if options.method in ("counts", "medians"):

        if options.method == "counts":
            func = len
            fmt = "%i"
        elif options.method == "medians":
            func = numpy.median
            fmt = "%6.4f"

        for location in options.locations:

            for function in options.functions:
                # numpy.Float was an alias from the old Numeric package;
                # the builtin float selects the same dtype.
                matrix = numpy.zeros((nspecies, nspecies), float)

                data = GetSubset(input_data, location, function)

                ## sort by species1 and species2
                data.sort()

                # Collect heights per (species1, species2) run and store
                # the aggregate whenever the pair changes.
                last_species1, last_species2 = None, None
                values = []
                for species1, species2, cluster_id, l, f, height, locations, tree in data:

                    if last_species1 != species1 or last_species2 != species2:

                        if len(values) > 0:
                            matrix[map_species2pos[last_species1],
                                   map_species2pos[last_species2]] = func(
                                       values)

                        values = []
                        last_species1 = species1
                        last_species2 = species2

                    values.append(float(height))

                # flush the final run
                if len(values) > 0:
                    matrix[map_species2pos[last_species1],
                           map_species2pos[last_species2]] = func(values)

                if options.filename_output:
                    params = {"f": function, "l": location}
                    outfile = open(options.filename_output % params, "w")
                else:
                    outfile = sys.stdout
                    outfile.write(
                        "matrix for method %s: location: %s, function: %s\n" %
                        (options.method, location, function))

                MatlabTools.WriteMatrix(matrix,
                                        outfile=outfile,
                                        format=fmt,
                                        row_headers=options.species,
                                        col_headers=options.species)

                if options.filename_output:
                    outfile.close()

    elif options.method in ("lists", "lists-union"):
        ## write lists of duplicated genes in species1 as compared to species2
        ##      according to location/function
        ## First field : gene name
        ## Second field: cluster id
        ## Third field : number of other genes in cluster
        ## Fourth field: location of gene

        # taxa already written to the current output section
        written = set()
        for location in options.locations:

            for function in options.functions:

                data = GetSubset(input_data, location, function)

                ## sort by species1 and species2
                data.sort()

                last_species1, last_species2 = None, None

                for species1, species2, cluster_id, l, f, height, locations, tree in data:

                    if last_species1 != species1 or last_species2 != species2:

                        # "lists" starts a new section per species pair,
                        # "lists-union" only when species1 changes.
                        if options.filename_output:
                            if options.method == "lists":
                                if outfile:
                                    outfile.close()
                                params = {
                                    "f": function,
                                    "l": location,
                                    "s": species1,
                                    "o": species2
                                }
                                written = set()
                                outfile = open(
                                    options.filename_output % params, "w")
                            elif options.method == "lists-union":
                                if last_species1 != species1:
                                    if outfile:
                                        outfile.close()
                                    params = {
                                        "f": function,
                                        "l": location,
                                        "s": species1
                                    }
                                    written = set()
                                    outfile = open(
                                        options.filename_output % params, "w")
                        else:
                            outfile = sys.stdout
                            if options.method == "lists":
                                outfile.write(
                                    "location: %s, function: %s, species1: %s, species2: %s\n"
                                    % (location, function, species1, species2))
                                written = set()
                            elif options.method == "lists-union":
                                if last_species1 != species1:
                                    outfile.write(
                                        "location: %s, function: %s, species1: %s\n"
                                        % (location, function, species1))
                                    written = set()

                        last_species1 = species1
                        last_species2 = species2

                    # get tree
                    tt = TreeTools.Newick2Tree(tree)
                    taxa = TreeTools.GetTaxa(tt)
                    for t in taxa:
                        if t in written:
                            continue
                        outfile.write("%s\t%s\t%i\n" %
                                      (t, cluster_id, len(taxa)))
                        written.add(t)

        # The last file opened above is otherwise never closed.
        if outfile is not None and outfile is not sys.stdout:
            outfile.close()

    elif options.method in ("hists", "fit-decay"):

        for location in options.locations:

            for function in options.functions:

                values = [[[] for y in range(nspecies)]
                          for x in range(nspecies)]

                data = GetSubset(input_data, location, function)

                data.sort()

                ################################################################
                ## convert to matrix of list
                ## values[x][y] contains heights of duplications in species x with reference to y

                for species1, species2, cluster_id, l, f, height, locations, tree in data:
                    try:
                        values[map_species2pos[species1]][
                            map_species2pos[species2]].append(float(height))
                    except KeyError:
                        # rows for species that were not requested are skipped
                        continue

                ################################################################
                # calculate histograms per species
                ################################################################
                for s in options.species:
                    histograms = []
                    headers = []

                    if options.filename_output:
                        params = {"f": function, "l": location, "s": s}
                        outfile = open(options.filename_output % params, "w")
                    else:
                        outfile = sys.stdout
                        outfile.write("location: %s, function: %s\n" %
                                      (location, function))

                    for x, other in enumerate(options.species):

                        if options.reverse:
                            ## duplications in species x
                            vv = values[x][map_species2pos[s]]
                        else:
                            ## duplications in species s
                            vv = values[map_species2pos[s]][x]

                        if len(vv) == 0:
                            continue

                        headers.append(other)
                        h = Histogram.Calculate(
                            vv,
                            increment=options.bin_size,
                            min_value=options.min_value,
                            max_value=options.max_value,
                            no_empty_bins=True)

                        if options.method == "fit-decay":
                            result = fit(h, [2.0, -1.0])
                            if result:
                                outfile.write(
                                    "%s\t%s\t%s\t%i\t%f\t%f\ty = %f * exp ( %f * x )\n"
                                    % (
                                        "function",
                                        s,
                                        other,
                                        h[0][1],
                                        result[0],
                                        result[1],
                                        result[0],
                                        result[1],
                                    ))
                        elif options.method == "hists":
                            histograms.append(h)

                    if options.method == "hists":
                        combined_histogram = Histogram.Combine(
                            histograms, missing_value="-")

                        outfile.write("bin\t" + "\t".join(headers) + "\n")
                        Histogram.Write(outfile, combined_histogram)

                    if options.filename_output:
                        outfile.close()
                    else:
                        outfile.flush()

    elif options.method == "pairs":

        ## get branches with 0 branchlength

        for location in options.locations:

            if options.loglevel >= 2:
                options.stdlog.write("# processing location %s\n" % location)

            for function in options.functions:

                if options.loglevel >= 2:
                    options.stdlog.write("#   processing function %s " %
                                         function)
                    options.stdlog.flush()

                data = GetSubset(input_data, location, function)

                if options.loglevel >= 2:
                    options.stdlog.write("with %i data points\n" % len(data))
                    options.stdlog.flush()

                data.sort()
                last_species1, last_species2 = None, None

                for species1, species2, cluster_id, l, f, height, locations, tree in data:

                    if last_species1 != species1 or last_species2 != species2:

                        ## write trees per cluster
                        if options.filename_output:
                            if outfile:
                                outfile.close()
                            params = {
                                "f": function,
                                "l": location,
                                "s": species1,
                                "o": species2
                            }
                            outfile = open(options.filename_output % params,
                                           "w")
                        else:
                            outfile = sys.stdout
                            outfile.write(
                                "location: %s, function: %s, species1: %s, species2: %s\n"
                                % (location, function, species1, species2))

                        last_species1 = species1
                        last_species2 = species2

                    outfile.write("%s\t%s\t%s\t%s\n" %
                                  (cluster_id, height, locations, tree))

        # The last file opened above is otherwise never closed.
        if outfile is not None and outfile is not sys.stdout:
            outfile.close()

    elif options.method == "links":

        ## write a tree for each species pair:
        ## each node is a gene+location, the weight of the vertex is the height
        ## further info added: cluster_id for the duplication

        for location in options.locations:

            if options.loglevel >= 2:
                options.stdlog.write("# processing location %s\n" % location)

            for function in options.functions:

                if options.loglevel >= 2:
                    options.stdlog.write("#   processing function %s " %
                                         function)
                    options.stdlog.flush()

                data = GetSubset(input_data, location, function)

                if options.loglevel >= 2:
                    options.stdlog.write("with %i data points\n" % len(data))
                    options.stdlog.flush()

                ## stores duplications within first species as compared to second species
                values = [[[] for y in range(nspecies)]
                          for x in range(nspecies)]

                # -len(locations) as second sort key puts the entry with
                # the longest locations string first within each cluster.
                for species1, species2, cluster_id, l, f, height, locations, tree in data:
                    values[map_species2pos[species1]][
                        map_species2pos[species2]].append(
                            (cluster_id, -len(locations), locations, tree))

                # get links per species
                for s in options.species:
                    if options.loglevel >= 2:
                        options.stdlog.write("#     processing species %s\n" %
                                             s)

                    for x in range(nspecies):

                        if map_pos2species[x] == s:
                            continue

                        vv = values[map_species2pos[s]][x]
                        vv.sort()

                        ## write trees per cluster
                        if options.filename_output:
                            params = {
                                "f": function,
                                "l": location,
                                "s": s,
                                "o": map_pos2species[x]
                            }
                            outfile = open(options.filename_output % params,
                                           "w")
                        else:
                            outfile = sys.stdout
                            outfile.write(
                                "location: %s, function: %s, species1: %s, species2: %s\n"
                                % (location, function, s, map_pos2species[x]))

                        ## only print out largest tree
                        last_cluster_id = None
                        for cluster_id, n, locations, tree in vv:
                            if cluster_id != last_cluster_id:
                                outfile.write("%s\t%s\t%s\n" %
                                              (cluster_id, locations, tree))
                                last_cluster_id = cluster_id

                        if options.filename_output:
                            outfile.close()

    E.Stop()
Beispiel #3
0
def main(argv=None):
    """Compute histograms for columns of a tab-separated table on stdin.

    Two modes are supported:

    * on-the-fly (``--on-the-fly``): bins are fixed up front from
      ``--min-value``, ``--max-value`` and ``--bin-size`` and the input
      is handed to ``Histogram.fillHistograms``.
    * in-situ (default): all values are read into memory and one
      histogram per selected column is built with
      ``Histogram.Calculate``, optionally normalized and/or cumulated,
      then combined and printed.

    Lines starting with ``#`` are skipped. Columns given with
    ``--columns`` are 1-based on the command line and converted to
    0-based indices here.

    Raises ValueError if on-the-fly mode lacks the required settings and
    IndexError if a requested column is missing from the title row.
    """

    # NOTE(review): argv is normalised here but not passed to E.Start
    # below -- confirm whether E.Start reads sys.argv itself.
    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: data2histogram.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-r", "--range", dest="range", type="string", help="range to calculate histogram for.")
    parser.add_option("-b", "--bin-size", dest="bin_size", type="string", help="bin size.")
    parser.add_option("-i", "--titles", dest="titles", action="store_true", help="use supplied column titles.")
    parser.add_option("--no-null", dest="nonull", action="store_true", help="do not output null values")
    parser.add_option("--no-titles", dest="titles", action="store_false", help="no column titles given.")
    parser.add_option(
        "-c", "--columns", dest="columns", type="string", help="columns to take for calculating histograms."
    )
    parser.add_option(
        "--min-data",
        dest="min_data",
        type="int",
        help="minimum amount of data required, if less data, then the histogram will be empty [default=%default].",
    )
    parser.add_option("--min-value", dest="min_value", type="float", help="minimum value for histogram.")
    parser.add_option("--max-value", dest="max_value", type="float", help="maximum value for histogram.")
    parser.add_option("--no-empty-bins", dest="no_empty_bins", action="store_true", help="do not display empty bins.")
    parser.add_option("--with-empty-bins", dest="no_empty_bins", action="store_false", help="display empty bins.")
    parser.add_option("--normalize", dest="normalize", action="store_true", help="normalize histogram.")
    parser.add_option("--cumulative", dest="cumulative", action="store_true", help="calculate cumulative histogram.")
    parser.add_option(
        "--reverse-cumulative",
        dest="reverse_cumulative",
        action="store_true",
        help="calculate reverse cumulative histogram.",
    )
    parser.add_option("--headers", dest="headers", type="string", help="use the following headers.")
    parser.add_option(
        "--ignore-out-of-range",
        dest="ignore_out_of_range",
        action="store_true",
        help="ignore values that are out of range (as opposed to truncating them to range border.",
    )
    parser.add_option("--missing", dest="missing_value", type="string", help="entry for missing values [%default].")
    parser.add_option(
        "--dynamic-bins", dest="dynamic_bins", action="store_true", help="each value constitutes its own bin."
    )
    parser.add_option(
        "--on-the-fly",
        dest="on_the_fly",
        action="store_true",
        help="on the fly computation of histograms. Requires setting of min-value, max-value and bin_size.",
    )

    parser.set_defaults(
        bin_size=None,
        range=None,
        titles=True,
        columns="all",
        append=(),
        no_empty_bins=True,
        min_value=None,
        max_value=None,
        normalize=False,
        cumulative=False,
        reverse_cumulative=False,
        nonull=None,
        ignore_out_of_range=False,
        min_data=1,
        headers=None,
        missing_value="na",
        dynamic_bins=False,
        on_the_fly=False,
        bin_format="%.2f",
        value_format="%6.4f",
    )

    (options, args) = E.Start(parser)

    # convert 1-based column numbers to 0-based indices
    if options.columns != "all":
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    if options.range:
        options.min_value, options.max_value = map(float, options.range.split(","))

    if options.headers:
        options.headers = options.headers.split(",")

    if options.on_the_fly:
        if options.min_value is None or options.max_value is None or options.bin_size is None:
            # string exceptions are not supported any more
            raise ValueError(
                "please supply columns, min-value, max-value and bin-size for on-the-fly computation."
            )

        # try to glean titles from table:
        if options.titles:
            data = None
            while 1:
                line = sys.stdin.readline()
                if not line:
                    break
                if line[0] == "#":
                    continue
                data = line[:-1].split("\t")
                break

            if data is None:
                # empty input: behave like the in-situ branch below
                if options.loglevel >= 1:
                    options.stdlog.write("# no data\n")
                E.Stop()
                sys.exit(0)

            if options.columns == "all":
                options.titles = data
                options.columns = list(range(len(data)))
            else:
                options.titles = [data[x] for x in options.columns]
        elif options.columns == "all":
            # without a title row the number of columns is unknown
            raise ValueError(
                "please supply columns, min-value, max-value and bin-size for on-the-fly computation."
            )

        bins = numpy.arange(options.min_value, options.max_value, float(options.bin_size))
        hh = Histogram.fillHistograms(sys.stdin, options.columns, [bins for x in range(len(options.columns))])
        n = len(hh)

        # One title per selected column. Previously a stale loop index
        # was used here, so at most a single title was written.
        titles = ["bin"]

        if options.headers:
            titles.extend(options.headers[i] for i in range(len(options.columns)))
        elif options.titles:
            titles.extend(options.titles[i] for i in range(len(options.columns)))
        else:
            titles.extend("col%i" % (x + 1) for x in options.columns)

        if len(titles) > 1:
            options.stdout.write("\t".join(titles) + "\n")

        for x, bin_start in enumerate(bins):
            v = [options.bin_format % bin_start]
            for c in range(n):
                v.append(options.value_format % hh[c][x])

            options.stdout.write("\t".join(v) + "\n")

    else:
        ## in-situ computation of histograms
        # retrieve data
        first = True
        vals = []

        # parse data, convert to floats
        for l in options.stdin:

            if l[0] == "#":
                continue

            data = l[:-1].split("\t")

            if first:
                first = False
                ncols = len(data)
                if options.columns == "all":
                    options.columns = list(range(ncols))

                vals = [[] for x in options.columns]

                # the first data line is the title row when titles are on
                if options.titles:
                    try:
                        options.titles = [data[x] for x in options.columns]
                    except IndexError:
                        raise IndexError(
                            "not all columns %s found in data %s" % (str(options.columns), str(data))
                        )
                    continue

            for x in range(len(options.columns)):

                try:
                    v = float(data[options.columns[x]])
                except IndexError:
                    print("# IndexError in line: %s" % l[:-1])
                    continue
                except ValueError:
                    # non-numeric fields are silently skipped
                    continue

                vals[x].append(v)

        hists = []
        titles = []

        if not vals:
            if options.loglevel >= 1:
                options.stdlog.write("# no data\n")
            E.Stop()
            sys.exit(0)

        for x in range(len(options.columns)):

            if options.loglevel >= 1:
                options.stdlog.write("# column=%i, num_values=%i\n" % (options.columns[x], len(vals[x])))

            # columns with too little data get no histogram and no title
            if len(vals[x]) < options.min_data:
                continue

            h = Histogram.Calculate(
                vals[x],
                no_empty_bins=options.no_empty_bins,
                increment=options.bin_size,
                min_value=options.min_value,
                max_value=options.max_value,
                dynamic_bins=options.dynamic_bins,
                ignore_out_of_range=options.ignore_out_of_range,
            )

            if options.normalize:
                h = Histogram.Normalize(h)
            if options.cumulative:
                h = Histogram.Cumulate(h)
            if options.reverse_cumulative:
                h = Histogram.Cumulate(h, direction=0)

            hists.append(h)

            for m in options.append:
                if m == "normalize":
                    hists.append(Histogram.Normalize(h))

            if options.headers:
                titles.append(options.headers[x])
            elif options.titles:
                titles.append(options.titles[x])
            else:
                titles.append("col%i" % options.columns[x])

        if titles:
            options.stdout.write("bin\t" + "\t".join(titles) + "\n")

        if len(hists) == 1:
            Histogram.Print(hists[0], nonull=options.nonull)
        else:
            combined_histogram = Histogram.Combine(hists, missing_value=options.missing_value)
            Histogram.Print(combined_histogram, nonull=options.nonull)

    E.Stop()
Beispiel #4
0
        order = []
        for x in sort_order:
            if x in map_header2pos:
                order.append(map_header2pos[x])

        new_headers = [headers[0]]
        new_histograms = []

        for x in order:
            new_headers.append(headers[x])
            new_histograms.append(histograms[x - 1])

        histograms = new_histograms
        headers = new_headers

    combined_histogram = Histogram.Combine(histograms, param_missing_value)

    if headers:
        print "\t".join(headers)

    if param_normalize:
        combined_histogram = Histogram.Normalize(combined_histogram)

    Histogram.Print(combined_histogram,
                    format_bin=param_format_bin,
                    format_value=param_format_value,
                    )

    print E.GetFooter()

Beispiel #5
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads one histogram per file named on the command line (tab-separated,
    ``#`` lines skipped, optional title row), optionally reorders them,
    combines them with ``Histogram.Combine`` and prints the result.
    """

    # The param_* settings are assigned below only when the matching
    # option is given. Without this declaration they would be function
    # locals and reading an unset one would raise UnboundLocalError.
    # NOTE(review): assumes module-level defaults for these names exist
    # elsewhere in the file -- confirm.
    global param_headers, param_normalize, param_missing_value
    global param_titles, param_format, param_format_value
    global param_format_bin, param_sort

    if argv is None:
        argv = sys.argv

    try:
        # parse the argument list that was handed in, not always sys.argv
        optlist, args = getopt.getopt(argv[1:], param_short_options,
                                      param_long_options)

    except getopt.error as msg:
        print(globals()["__doc__"], msg)
        sys.exit(1)

    for o, a in optlist:
        if o in ("--help", ):
            print(globals()["__doc__"])
            sys.exit(0)
        elif o in ("--version", ):
            print("version=")
            sys.exit(0)
        elif o in ("-h", "--header-names"):
            # keep "auto" as a string so the file-name based headers
            # further down can be selected
            param_headers = a if a == "auto" else a.split(",")
        elif o in ("-n", "--normalize"):
            param_normalize = 1
        elif o in ("-m", "--missing-value"):
            param_missing_value = a
        elif o == "--no-titles":
            param_titles = False
        elif o in ("-f", "--format"):
            param_format = a
        elif o == "--format-value":
            param_format_value = a
        elif o == "--bin-format":
            param_format_bin = a
        # The previous long-option literal contained a space and "=" and
        # could never be returned by getopt.
        # NOTE(review): confirm the spelling used in param_long_options.
        elif o in ("-s", "--sort", "--sort-order"):
            if a in ("numerical", "alphabetic", "alphabetical"):
                param_sort = a
            else:
                param_sort = a.split(",")

    if len(args) < 1:
        print(globals()["__doc__"], "please specify at one histogram.")
        sys.exit(1)

    param_filenames = args

    print(E.GetHeader())
    print(E.GetParams())

    histograms = []

    # The first header labels the bin column; when explicit headers are
    # given, the first one is used for it and the rest name the files.
    headers = [
        'bin',
    ]
    file_headers = param_headers
    if param_headers and param_headers != "auto":
        headers = [
            param_headers[0],
        ]
        file_headers = param_headers[1:]

    for x, filename in enumerate(param_filenames):

        if not os.path.exists(filename):
            print("# skipped because file not present: %s" % filename)
            continue

        # close the input file as soon as its lines are read
        with IOTools.open_file(filename, "r") as infile:
            lines = [line for line in infile if line[0] != "#"]

        if len(lines) == 0:
            continue

        if param_titles:
            h = lines[0].rstrip("\n").split("\t")[1:]
            del lines[0]

        if file_headers == "auto":
            headers.append(os.path.basename(filename))
        elif file_headers:
            headers.append(file_headers[x])
        elif param_titles:
            headers += h

        # rstrip instead of [:-1] so that a final line without newline
        # does not lose its last character
        data = [
            list(map(float, line.rstrip("\n").split("\t"))) for line in lines
        ]

        # add empty data point for empty histograms
        if len(data) == 0:
            data = [(0, 0)]

        histograms.append(data)

    # sort the whole thing:
    if param_sort:
        sort_order = []

        if param_sort == "numerical":
            t = sorted(zip(map(int, headers[1:]), range(1, len(headers))))
            sort_order = [headers[pos] for _, pos in t]

        elif param_sort in ("alphabetic", "alphabetical"):
            t = sorted(zip(headers[1:], range(1, len(headers))))
            sort_order = [headers[pos] for _, pos in t]

        else:
            # explicit list of header names
            sort_order = param_sort

        # map header to old position
        map_header2pos = {}
        for pos in range(1, len(headers)):
            map_header2pos[headers[pos]] = pos

        # names in the sort order that match no header are ignored
        order = [map_header2pos[name] for name in sort_order
                 if name in map_header2pos]

        new_headers = [headers[0]]
        new_histograms = []

        # header position p belongs to histogram p - 1 (the bin column
        # occupies position 0)
        for pos in order:
            new_headers.append(headers[pos])
            new_histograms.append(histograms[pos - 1])

        histograms = new_histograms
        headers = new_headers

    combined_histogram = Histogram.Combine(histograms, param_missing_value)

    if headers:
        print("\t".join(headers))

    if param_normalize:
        combined_histogram = Histogram.Normalize(combined_histogram)

    Histogram.Print(
        combined_histogram,
        format_bin=param_format_bin,
        format_value=param_format_value,
    )

    print(E.GetFooter())
Beispiel #6
0
def main(argv=None):
    """Compute histograms for columns of a tab-separated table on stdin.

    Two modes are available:

    * ``--on-the-fly``: the bins are fixed up front from ``--min-value``,
      ``--max-value`` and ``--bin-size`` and the counting is delegated to
      ``Histogram.fillHistograms``.
    * default (in-situ): all values are read into memory, one histogram
      per column is computed with ``Histogram.Calculate`` and several
      histograms are merged with ``Histogram.Combine`` before printing.

    Lines starting with ``#`` are skipped. Column numbers given with
    ``--columns`` are 1-based on the command line and converted to
    0-based indices internally.

    Arguments
    ---------
    argv : list, optional
        Command line to parse; ``sys.argv`` is used if empty or omitted.

    Raises
    ------
    ValueError
        In on-the-fly mode, if min-value, max-value or bin-size are
        missing, or if the columns cannot be determined because neither
        column titles nor an explicit column list are available.
    IndexError
        In in-situ mode, if a requested column is missing from the
        title line.
    """

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: data2histogram.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-r",
                      "--range",
                      dest="range",
                      type="string",
                      help="range to calculate histogram for.")
    parser.add_option("-b",
                      "--bin-size",
                      dest="bin_size",
                      type="string",
                      help="bin size.")
    parser.add_option("-i",
                      "--titles",
                      dest="titles",
                      action="store_true",
                      help="use supplied column titles.")
    parser.add_option("--no-null",
                      dest="nonull",
                      action="store_true",
                      help="do not output null values")
    parser.add_option("--no-titles",
                      dest="titles",
                      action="store_false",
                      help="no column titles given.")
    parser.add_option("-c",
                      "--columns",
                      dest="columns",
                      type="string",
                      help="columns to take for calculating histograms.")
    parser.add_option(
        "--min-data",
        dest="min_data",
        type="int",
        help=
        "minimum amount of data required, if less data, then the histogram will be empty [default=%default]."
    )
    parser.add_option("--min-value",
                      dest="min_value",
                      type="float",
                      help="minimum value for histogram.")
    parser.add_option("--max-value",
                      dest="max_value",
                      type="float",
                      help="maximum value for histogram.")
    parser.add_option("--no-empty-bins",
                      dest="no_empty_bins",
                      action="store_true",
                      help="do not display empty bins.")
    parser.add_option("--with-empty-bins",
                      dest="no_empty_bins",
                      action="store_false",
                      help="display empty bins.")
    parser.add_option("--normalize",
                      dest="normalize",
                      action="store_true",
                      help="normalize histogram.")
    parser.add_option("--cumulative",
                      dest="cumulative",
                      action="store_true",
                      help="calculate cumulative histogram.")
    parser.add_option("--reverse-cumulative",
                      dest="reverse_cumulative",
                      action="store_true",
                      help="calculate reverse cumulative histogram.")
    parser.add_option("--header-names",
                      dest="headers",
                      type="string",
                      help="use the following headers.")
    parser.add_option(
        "--ignore-out-of-range",
        dest="ignore_out_of_range",
        action="store_true",
        help=
        "ignore values that are out of range (as opposed to truncating them to range border."
    )
    parser.add_option("--missing-value",
                      dest="missing_value",
                      type="string",
                      help="entry for missing values [%default].")
    parser.add_option("--use-dynamic-bins",
                      dest="dynamic_bins",
                      action="store_true",
                      help="each value constitutes its own bin.")
    parser.add_option(
        "--on-the-fly",
        dest="on_the_fly",
        action="store_true",
        help=
        "on the fly computation of histograms. Requires setting of min-value, max-value and bin_size."
    )

    parser.set_defaults(
        bin_size=None,
        range=None,
        titles=True,
        columns="all",
        append=(),
        no_empty_bins=True,
        min_value=None,
        max_value=None,
        normalize=False,
        cumulative=False,
        reverse_cumulative=False,
        nonull=None,
        ignore_out_of_range=False,
        min_data=1,
        headers=None,
        missing_value="na",
        dynamic_bins=False,
        on_the_fly=False,
        bin_format="%.2f",
        value_format="%6.4f",
    )

    (options, args) = E.Start(parser, argv=argv)

    # command line columns are 1-based, internal indices are 0-based
    if options.columns != "all":
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    if options.range:
        options.min_value, options.max_value = [
            float(x) for x in options.range.split(",")]

    if options.headers:
        options.headers = options.headers.split(",")

    if options.on_the_fly:
        if options.min_value is None or options.max_value is None or \
           options.bin_size is None:
            raise ValueError("please supply columns, min-value, max-value and "
                             "bin-size for on-the-fly computation.")

        # try to glean titles from table:
        if options.titles:
            header = None
            # readline() is used (not iteration) so that the remainder of
            # stdin can be handed over to fillHistograms below.
            while True:
                line = sys.stdin.readline()
                if not line:
                    break
                if line[0] == "#":
                    continue
                header = line[:-1].split("\t")
                break

            if header is None:
                # stdin held no title line: same handling as the
                # in-situ branch for empty input.
                if options.loglevel >= 1:
                    options.stdlog.write("# no data\n")
                E.Stop()
                sys.exit(0)

            if options.columns == "all":
                options.titles = header
                options.columns = list(range(len(header)))
            else:
                options.titles = [header[x] for x in options.columns]
        elif options.columns == "all":
            # without a title line the number of columns is unknown
            raise ValueError("please supply columns for on-the-fly "
                             "computation if there are no column titles.")

        bins = numpy.arange(options.min_value, options.max_value,
                            float(options.bin_size))
        hh = Histogram.fillHistograms(
            sys.stdin, options.columns,
            [bins for x in range(len(options.columns))])
        n = len(hh)

        # one title per requested column, in column order
        titles = ['bin']

        if options.headers:
            for pos in range(len(options.columns)):
                titles.append(options.headers[pos])
        elif options.titles:
            titles.extend(options.titles)
        else:
            for column in options.columns:
                titles.append("col%i" % (column + 1))

        if len(titles) > 1:
            options.stdout.write("\t".join(titles) + "\n")

        for x in range(len(bins)):
            v = [options.bin_format % bins[x]]
            for c in range(n):
                v.append(options.value_format % hh[c][x])

            options.stdout.write("\t".join(v) + "\n")

    else:
        # in-situ computation of histograms
        # retrieve data
        first = True
        vals = []

        # parse data, convert to floats
        for line in options.stdin:

            if line[0] == "#":
                continue

            data = line[:-1].split("\t")

            if first:
                first = False
                if options.columns == "all":
                    options.columns = list(range(len(data)))

                vals = [[] for x in options.columns]

                if options.titles:
                    try:
                        options.titles = [data[x] for x in options.columns]
                    except IndexError:
                        raise IndexError(
                            "not all columns %s found in data %s" %
                            (str(options.columns), str(data)))
                    # the title line carries no values
                    continue

            for pos, column in enumerate(options.columns):

                try:
                    v = float(data[column])
                except IndexError:
                    print("# IndexError in line:", line[:-1])
                    continue
                except ValueError:
                    # non-numeric entries are skipped silently
                    continue

                vals[pos].append(v)

        hists = []
        titles = []

        if not vals:
            if options.loglevel >= 1:
                options.stdlog.write("# no data\n")
            E.Stop()
            sys.exit(0)

        for x in range(len(options.columns)):

            if options.loglevel >= 1:
                options.stdlog.write("# column=%i, num_values=%i\n" %
                                     (options.columns[x], len(vals[x])))

            # columns with too little data are dropped from the output
            if len(vals[x]) < options.min_data:
                continue

            h = Histogram.Calculate(
                vals[x],
                no_empty_bins=options.no_empty_bins,
                increment=options.bin_size,
                min_value=options.min_value,
                max_value=options.max_value,
                dynamic_bins=options.dynamic_bins,
                ignore_out_of_range=options.ignore_out_of_range)

            if options.normalize:
                h = Histogram.Normalize(h)
            if options.cumulative:
                h = Histogram.Cumulate(h)
            if options.reverse_cumulative:
                h = Histogram.Cumulate(h, direction=0)

            hists.append(h)

            for m in options.append:
                if m == "normalize":
                    hists.append(Histogram.Normalize(h))

            if options.headers:
                titles.append(options.headers[x])
            elif options.titles:
                titles.append(options.titles[x])
            else:
                # NOTE: this prints the 0-based index, whereas the
                # on-the-fly branch prints the 1-based column number.
                titles.append("col%i" % options.columns[x])

        if titles:
            options.stdout.write("bin\t" + "\t".join(titles) + "\n")

        if len(hists) == 1:
            Histogram.Print(hists[0],
                            nonull=options.nonull,
                            format_bin=options.bin_format)
        else:
            combined_histogram = Histogram.Combine(
                hists, missing_value=options.missing_value)
            Histogram.Print(combined_histogram,
                            nonull=options.nonull,
                            format_bin=options.bin_format)

    E.Stop()
def main(argv=None):
    """Compute histograms for one or more columns of a table on stdin.

    Reads tab-separated data from stdin, skipping lines that start with
    ``#``. Each value is optionally scaled (``--scale``) and clamped to
    ``--lower``/``--upper`` before one histogram per selected column is
    computed with ``Histogram.Calculate``. Several histograms are merged
    with ``Histogram.Combine`` before printing.

    Columns are given with ``-c/--column`` as a comma-separated list of
    1-based column numbers or as ``all``; the first column is used by
    default.

    Arguments
    ---------
    argv : list, optional
        Command line to parse; ``sys.argv`` is used if omitted.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-n",
                      "--nonull",
                      dest="nonull",
                      action="store_true",
                      help="no null [default=%default]")

    parser.add_option("-e",
                      "--show-empty",
                      dest="empty_bins",
                      action="store_true",
                      help="show empty bins [default=%default]")

    parser.add_option("-o",
                      "--normalize",
                      dest="normalize",
                      action="store_true",
                      help="normalize histogram [default=%default]")

    parser.add_option("-i",
                      "--titles",
                      dest="titles",
                      action="store_true",
                      help="use titles supplied in ... [default=%default]")

    parser.add_option("--cumulative",
                      dest="cumulative",
                      action="store_true",
                      help="compute cumulative histogram [default=%default]")

    parser.add_option(
        "--reverse-cumulative",
        dest="reverse_cumulative",
        action="store_true",
        help="compute reverse cumulative histogram [default=%default]")

    # The option used to store an int into ``options.column``, which was
    # never read; it now feeds ``options.columns`` as a string so that
    # comma-separated lists and "all" can be parsed below.
    parser.add_option("-c",
                      "--column",
                      dest="columns",
                      type="string",
                      help="columns to take [default=%default]")

    parser.add_option("-b",
                      "--bin-size",
                      dest="bin_size",
                      type="float",
                      help="bin size to use [default=%default]")

    parser.add_option("-u",
                      "--upper",
                      dest="upper_limit",
                      type="float",
                      help="upper limit to use [default=%default]")

    parser.add_option("-l",
                      "--lower",
                      dest="lower_limit",
                      type="float",
                      help="lower limit to use [default=%default]")

    parser.add_option("-s",
                      "--scale",
                      dest="scale",
                      type="float",
                      help="scale to use [default=%default]")

    parser.add_option("-a",
                      "--append",
                      dest="append",
                      type="choice",
                      action="append",
                      choices=("normalize", ),
                      help="append columns [default=%default]")

    # empty_bins defaults to False so that --show-empty actually toggles
    # something; the default output (no empty bins) is unchanged.
    parser.set_defaults(nonull=None,
                        columns="1",
                        empty_bins=False,
                        titles=False,
                        lower_limit=None,
                        upper_limit=None,
                        bin_size=None,
                        scale=None,
                        normalize=None,
                        append=[],
                        cumulative=False,
                        reverse_cumulative=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # command line columns are 1-based, internal indices are 0-based
    if not options.columns:
        options.columns = [0]
    elif options.columns != "all":
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    # retrieve data, dropping comment lines
    lines = [x for x in sys.stdin.readlines() if not x.startswith("#")]

    if not lines:
        E.info("no data")
        E.Stop()
        return

    if options.columns == "all":
        ncols = len(lines[0][:-1].split("\t"))
        options.columns = list(range(ncols))

    # one list of values per selected column
    vals = [[] for x in options.columns]

    if options.titles:
        data = lines[0][:-1].split("\t")
        del lines[0]
        options.titles = [data[x] for x in options.columns]

    for l in lines:
        data = l[:-1].split("\t")

        for pos, column in enumerate(options.columns):
            try:
                v = float(data[column])
            except IndexError:
                print("# IndexError in line: %s" % l[:-1])
                continue
            except ValueError:
                # non-numeric entries are skipped silently
                continue

            if options.scale:
                v *= options.scale

            # clamp to the requested limits (applied after scaling)
            if options.upper_limit is not None and v > options.upper_limit:
                v = options.upper_limit

            if options.lower_limit is not None and v < options.lower_limit:
                v = options.lower_limit

            vals[pos].append(v)

    # release the raw input
    lines = None

    hists = []
    titles = []

    for x in range(len(options.columns)):
        E.info("column=%i, num_values=%i" % (options.columns[x], len(vals[x])))

        if len(vals[x]) == 0:
            continue

        h = Histogram.Calculate(vals[x],
                                no_empty_bins=not options.empty_bins,
                                increment=options.bin_size)

        # undo the scaling that was applied to the values
        if options.scale:
            h = Histogram.Scale(h, 1.0 / options.scale)

        if options.normalize:
            h = Histogram.Normalize(h)
        if options.cumulative:
            h = Histogram.Cumulate(h)
        if options.reverse_cumulative:
            h = Histogram.Cumulate(h, direction=0)

        hists.append(h)

        for m in options.append:
            if m == "normalize":
                hists.append(Histogram.Normalize(h))

        if options.titles:
            titles.append(options.titles[x])

    if titles:
        options.stdout.write("bin\t" + "\t".join(titles) + "\n")

    if len(hists) == 1:
        Histogram.Print(hists[0], nonull=options.nonull)
    else:
        combined_histogram = Histogram.Combine(hists)
        Histogram.Print(combined_histogram, nonull=options.nonull)

    E.Stop()
Beispiel #8
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads intervals in gff/gtf or bed format from stdin and collects
    three lists of values: the size of each interval (``end - start``),
    the distance between consecutive non-overlapping intervals on the
    same contig, and the size of the overlap between consecutive
    intervals (0 if they do not overlap). Depending on ``--method``,
    histograms, summary statistics, overlapping pairs and/or the raw
    values are written to separate output files.

    Raises
    ------
    ValueError
        If no ``--method`` was given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-b", "--bin-size", dest="bin_size", type="string",
                      help="bin size.")

    parser.add_option("--min-value", dest="min_value", type="float",
                      help="minimum value for histogram.")

    parser.add_option(
        "--max-value", dest="max_value", type="float",
        help="maximum value for histogram.")

    parser.add_option(
        "--no-empty-bins", dest="no_empty_bins", action="store_true",
        help="do not display empty bins.")

    parser.add_option(
        "--with-empty-bins", dest="no_empty_bins", action="store_false",
        help="display empty bins.")

    parser.add_option(
        "--ignore-out-of-range", dest="ignore_out_of_range",
        action="store_true",
        help="ignore values that are out of range (as opposed to truncating "
        "them to range border.")

    parser.add_option("--missing-value", dest="missing_value", type="string",
                      help="entry for missing values [%default].")

    parser.add_option("--use-dynamic-bins", dest="dynamic_bins",
                      action="store_true",
                      help="each value constitutes its own bin.")

    parser.add_option("--format", dest="format", type="choice",
                      choices=("gff", "gtf", "bed"),
                      help="input file format [%default].")

    parser.add_option("--method", dest="methods", type="choice",
                      action="append",
                      choices=("all", "hist", "stats", "overlaps", "values"),
                      help="methods to apply [%default].")

    parser.add_option("--output-section", dest="output_section", type="choice",
                      choices=("all", "size", "distance"),
                      help="data to compute [%default].")

    parser.set_defaults(
        no_empty_bins=True,
        bin_size=None,
        dynamic_bins=False,
        ignore_out_of_range=False,
        min_value=None,
        max_value=None,
        nonull=None,
        missing_value="na",
        output_filename_pattern="%s",
        methods=[],
        output_section="all",
        format="gff",
    )

    # pass argv on so that an explicitly supplied command line is honoured
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if "all" in options.methods:
        options.methods = ("hist", "stats", "overlaps")
        if not options.output_filename_pattern:
            options.output_filename_pattern = "%s"

    if len(options.methods) == 0:
        raise ValueError(
            "please provide counting method using --method option")

    if options.format in ("gff", "gtf"):
        gffs = GTF.iterator(options.stdin)
    elif options.format == "bed":
        gffs = Bed.iterator(options.stdin)

    values_between = []
    values_within = []
    values_overlaps = []

    if "overlaps" in options.methods:
        if not options.output_filename_pattern:
            options.output_filename_pattern = "%s"
        outfile_overlaps = E.openOutputFile("overlaps")
    else:
        outfile_overlaps = None

    last = None
    ninput, noverlaps = 0, 0
    try:
        for this in gffs:
            ninput += 1
            values_within.append(this.end - this.start)

            if last and last.contig == this.contig:
                if this.start < last.end:
                    noverlaps += 1
                    if outfile_overlaps:
                        outfile_overlaps.write(
                            "%s\t%s\n" % (str(last), str(this)))
                    values_overlaps.append(
                        min(this.end, last.end) - max(last.start, this.start))
                    # keep whichever interval extends furthest as the
                    # reference for the next comparison
                    if this.end > last.end:
                        last = this
                    continue
                else:
                    values_between.append(this.start - last.end)
                    values_overlaps.append(0)

            last = this
    finally:
        # release the handle before any later section reopens "overlaps"
        if outfile_overlaps:
            outfile_overlaps.close()

    if "hist" in options.methods:
        outfile = E.openOutputFile("hist")
        h_within = Histogram.Calculate(
            values_within,
            no_empty_bins=options.no_empty_bins,
            increment=options.bin_size,
            min_value=options.min_value,
            max_value=options.max_value,
            dynamic_bins=options.dynamic_bins,
            ignore_out_of_range=options.ignore_out_of_range)

        h_between = Histogram.Calculate(
            values_between,
            no_empty_bins=options.no_empty_bins,
            increment=options.bin_size,
            min_value=options.min_value,
            max_value=options.max_value,
            dynamic_bins=options.dynamic_bins,
            ignore_out_of_range=options.ignore_out_of_range)

        if "all" == options.output_section:
            outfile.write("residues\tsize\tdistance\n")
            combined_histogram = Histogram.Combine(
                [h_within, h_between], missing_value=options.missing_value)
            Histogram.Write(outfile, combined_histogram, nonull=options.nonull)
        elif options.output_section == "size":
            outfile.write("residues\tsize\n")
            Histogram.Write(outfile, h_within, nonull=options.nonull)
        elif options.output_section == "distance":
            outfile.write("residues\tdistance\n")
            Histogram.Write(outfile, h_between, nonull=options.nonull)

        outfile.close()

    if "stats" in options.methods:
        outfile = E.openOutputFile("stats")
        outfile.write("data\t%s\n" % Stats.Summary().getHeader())
        if options.output_section in ("size", "all"):
            outfile.write("size\t%s\n" % str(Stats.Summary(values_within)))
        if options.output_section in ("distance", "all"):
            outfile.write("distance\t%s\n" %
                          str(Stats.Summary(values_between)))
        outfile.close()

    if "values" in options.methods:
        outfile = E.openOutputFile("distances")
        outfile.write("distance\n%s\n" % "\n".join(map(str, values_between)))
        outfile.close()
        outfile = E.openOutputFile("sizes")
        outfile.write("size\n%s\n" % "\n".join(map(str, values_within)))
        outfile.close()
        outfile = E.openOutputFile("overlaps")
        outfile.write("overlap\n%s\n" % "\n".join(map(str, values_overlaps)))
        outfile.close()

    E.info("ninput=%i, ndistance=%i, nsize=%i, noverlap=%i" %
           (ninput,
            len(values_between),
            len(values_within),
            noverlaps))

    E.Stop()
Beispiel #9
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a graph as tab-separated edges (``node1 node2 [weight]``)
    from stdin, skipping lines that start with ``#``.

    * ``--method=histograms``: the weights of each ``(node1, node2)``
      pair are collected and converted into a histogram; with
      ``--output-format=semi`` one file per ``node1`` is written, named
      according to ``--output-pattern``.
    * ``--method=counts``: the number of edges per ``(node1, node2)``
      pair is written to stdout.

    With ``--make-symmetric`` every edge is also recorded in the
    reverse direction.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: graph2stats.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-r",
                      "--range",
                      dest="range",
                      type="string",
                      help="range to calculate histogram for.")
    parser.add_option("-b",
                      "--bin-size",
                      dest="bin_size",
                      type="string",
                      help="bin size.")
    parser.add_option("-i",
                      "--titles",
                      dest="titles",
                      action="store_true",
                      help="use supplied column titles.")
    parser.add_option("-s",
                      "--make-symmetric",
                      dest="make_symmetric",
                      action="store_true",
                      help="symmetrize graph.")
    parser.add_option("-c",
                      "--columns",
                      dest="columns",
                      type="string",
                      help="columns to take for calculating histograms.")
    parser.add_option("-p",
                      "--output-pattern",
                      dest="output_pattern",
                      type="string",
                      help="pattern for output files.")
    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="string",
                      help="method.")
    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      type="string",
                      help="output format.")

    parser.add_option("--min-value",
                      dest="min_value",
                      type="float",
                      help="minimum value for histogram.")
    parser.add_option("--max-value",
                      dest="max_value",
                      type="float",
                      help="maximum value for histogram.")

    parser.set_defaults(bin_size=None,
                        range=None,
                        titles=False,
                        columns="all",
                        append=(),
                        empty_bins=False,
                        min_value=None,
                        max_value=None,
                        normalize=False,
                        cumulative=False,
                        reverse_cumulative=False,
                        nonull=None,
                        make_symmetric=False,
                        output_pattern="%s.hist",
                        method="histograms",
                        output_format="semi")

    (options, args) = E.Start(parser, argv=argv)

    # command line columns are 1-based, internal indices are 0-based
    if options.columns != "all":
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    if options.range:
        options.min_value, options.max_value = [
            float(x) for x in options.range.split(",")]

    # retrieve data, dropping comment lines
    lines = [x for x in sys.stdin.readlines() if not x.startswith("#")]

    vals = {}

    if options.method == "histograms":

        # read data: collect the weights for each (v1, v2) pair
        nerrors = 0
        for line in lines:

            try:
                # fails with ValueError for lines with fewer than three
                # fields as well as for non-numeric weights
                v1, v2, w = line[:-1].split("\t")[:3]
                w = float(w)
            except ValueError:
                nerrors += 1
                continue

            vals.setdefault(v1, {}).setdefault(v2, []).append(w)
            if options.make_symmetric:
                vals.setdefault(v2, {}).setdefault(v1, []).append(w)

        if nerrors and options.loglevel >= 1:
            options.stdlog.write("# nerrors=%i\n" % nerrors)

        # convert the weight lists to histograms in place
        for k1, vv in vals.items():
            for k2 in list(vv.keys()):
                if len(vv[k2]) == 0:
                    continue

                h = Histogram.Calculate(vv[k2],
                                        no_empty_bins=options.empty_bins,
                                        increment=options.bin_size,
                                        min_value=options.min_value,
                                        max_value=options.max_value)

                if options.normalize:
                    h = Histogram.Normalize(h)
                if options.cumulative:
                    h = Histogram.Cumulate(h)
                if options.reverse_cumulative:
                    h = Histogram.Cumulate(h, direction=0)

                vv[k2] = h

        # write output: one file per first node, columns sorted by
        # second node
        if options.output_format == "semi":
            for k1, vv in vals.items():

                kk2 = sorted(vv.keys())
                hists = [vv[k2] for k2 in kk2]

                with open(options.output_pattern % k1, "w") as outfile:
                    PrintHistograms(outfile, kk2, hists, options)

    elif options.method == "counts":

        # read data: count the edges for each (v1, v2) pair
        for line in lines:

            v1, v2 = line[:-1].split("\t")[:2]

            if v1 not in vals:
                vals[v1] = {}
            if v2 not in vals[v1]:
                vals[v1][v2] = 0
            vals[v1][v2] += 1
            if options.make_symmetric:
                if v2 not in vals:
                    vals[v2] = {}
                if v1 not in vals[v2]:
                    vals[v2][v1] = 0
                vals[v2][v1] += 1

        # write the counts
        for k1, vv in vals.items():
            for k2 in vv.keys():
                options.stdout.write("%s\t%s\t%i\n" % (k1, k2, vv[k2]))

    E.Stop()
Beispiel #10
0
def PrintHistograms(outfile, titles, histograms, options):
    """Write several histograms side by side as one table to *outfile*.

    The histograms are merged with ``Histogram.Combine`` and written
    below a tab-separated header line consisting of ``bin`` followed by
    *titles*.

    Arguments
    ---------
    outfile : file
        Open, writable file handle receiving header and table.
    titles : sequence of str
        One column title per histogram (list or tuple).
    histograms : list
        Histograms to combine, in the same order as *titles*.
    options : object
        Parsed command line options; only ``options.nonull`` is read.
    """

    combined_histogram = Histogram.Combine(histograms)

    # titles may be a list, so convert before concatenating with a tuple
    outfile.write("\t".join(("bin", ) + tuple(titles)) + "\n")
    # write to outfile, so that header and table end up in the same place
    Histogram.Write(outfile, combined_histogram, nonull=options.nonull)