コード例 #1
0
def main(argv=None):
    """Read tab-separated numeric columns from stdin and print histograms.

    Lines starting with ``#`` are ignored. The selected columns are parsed
    as floats (unparsable fields are skipped), optionally scaled and
    clamped to the lower/upper limits, and one histogram per column is
    computed with ``Histogram.Calculate``. A single histogram is printed
    directly; several histograms are combined with ``Histogram.Combine``
    before printing.

    Arguments
    ---------
    argv : list, optional
        Command line arguments; ``sys.argv`` is used when omitted.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-n",
                      "--nonull",
                      dest="nonull",
                      action="store_true",
                      help="no null [default=%default]")

    # NOTE(review): empty_bins defaults to True and this flag also stores
    # True, so --show-empty currently has no effect; the value is passed on
    # as ``no_empty_bins`` below. Confirm the intended semantics before
    # changing it.
    parser.add_option("-e",
                      "--show-empty",
                      dest="empty_bins",
                      action="store_true",
                      help="show empty bins [default=%default]")

    parser.add_option("-o",
                      "--normalize",
                      dest="normalize",
                      action="store_true",
                      help="normalize histogram [default=%default]")

    parser.add_option("-i",
                      "--titles",
                      dest="titles",
                      action="store_true",
                      help="use titles supplied in ... [default=%default]")

    parser.add_option("--cumulative",
                      dest="cumulative",
                      action="store_true",
                      help="compute cumulative histogram [default=%default]")

    parser.add_option(
        "--reverse-cumulative",
        dest="reverse_cumulative",
        action="store_true",
        help="compute reverse cumulative histogram [default=%default]")

    # The option has to be stored in ``columns`` as a string: the code
    # below reads ``options.columns`` and accepts a comma-separated list
    # of 1-based column numbers or the keyword "all".
    parser.add_option("-c",
                      "--column",
                      dest="columns",
                      type="string",
                      help="columns to take (comma-separated, 1-based, "
                      "or 'all') [default=%default]")

    parser.add_option("-b",
                      "--bin-size",
                      dest="bin_size",
                      type="float",
                      help="bin size to use [default=%default]")

    parser.add_option("-u",
                      "--upper",
                      dest="upper_limit",
                      type="float",
                      help="upper limit to use [default=%default]")

    parser.add_option("-l",
                      "--lower",
                      dest="lower_limit",
                      type="float",
                      help="lower limit to use [default=%default]")

    parser.add_option("-s",
                      "--scale",
                      dest="scale",
                      type="float",
                      help="scale to use [default=%default]")

    parser.add_option("-a",
                      "--append",
                      dest="append",
                      type="choice",
                      action="append",
                      choices=("normalize", ),
                      help="append columns [default=%default]")

    parser.set_defaults(nonull=None,
                        columns="1",
                        empty_bins=True,
                        titles=False,
                        lower_limit=None,
                        upper_limit=None,
                        bin_size=None,
                        scale=None,
                        normalize=None,
                        append=[],
                        cumulative=False,
                        reverse_cumulative=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # convert the 1-based column specification into 0-based indices;
    # an empty specification falls back to the first column
    if not options.columns:
        options.columns = [0]
    elif options.columns != "all":
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    # retrieve the input, dropping comment lines
    lines = [line for line in sys.stdin.readlines() if line[0] != "#"]

    if not lines:
        # nothing to compute: finish cleanly instead of failing on lines[0]
        E.info("no data")
        E.Stop()
        return

    if options.columns == "all":
        ncols = len(lines[0].rstrip("\n").split("\t"))
        options.columns = list(range(ncols))

    # one list of values per selected column
    vals = [[] for _ in options.columns]

    if options.titles:
        # the first non-comment line is the header
        data = lines[0].rstrip("\n").split("\t")
        del lines[0]
        options.titles = [data[x] for x in options.columns]

    for line in lines:
        data = line.rstrip("\n").split("\t")

        for x, column in enumerate(options.columns):
            try:
                v = float(data[column])
            except IndexError:
                print("# IndexError in line: %s" % line.rstrip("\n"))
                continue
            except ValueError:
                # non-numeric field: skip silently
                continue

            if options.scale:
                v *= options.scale

            # clamp out-of-range values to the limits
            if options.upper_limit is not None and v > options.upper_limit:
                v = options.upper_limit

            if options.lower_limit is not None and v < options.lower_limit:
                v = options.lower_limit

            vals[x].append(v)

    lines = None  # release the raw input

    hists = []
    titles = []

    for x, column in enumerate(options.columns):
        E.info("column=%i, num_values=%i" % (column, len(vals[x])))

        if not vals[x]:
            continue

        h = Histogram.Calculate(vals[x],
                                no_empty_bins=options.empty_bins,
                                increment=options.bin_size)

        # undo the scaling that was applied to the values
        if options.scale:
            h = Histogram.Scale(h, 1.0 / options.scale)

        if options.normalize:
            h = Histogram.Normalize(h)
        if options.cumulative:
            h = Histogram.Cumulate(h)
        if options.reverse_cumulative:
            h = Histogram.Cumulate(h, direction=0)

        hists.append(h)

        for m in options.append:
            if m == "normalize":
                hists.append(Histogram.Normalize(h))

        if options.titles:
            titles.append(options.titles[x])

    if titles:
        options.stdout.write("bin\t" + "\t".join(titles) + "\n")

    if len(hists) == 1:
        Histogram.Print(hists[0], nonull=options.nonull)
    else:
        combined_histogram = Histogram.Combine(hists)
        Histogram.Print(combined_histogram, nonull=options.nonull)

    E.Stop()
コード例 #2
0
ファイル: data2histogram.py プロジェクト: wangdi2014/cgat
def main(argv=None):
    """Compute histograms for columns of a tab-separated table.

    Two modes are supported:

    * ``--on-the-fly``: bins are fixed up front from min-value, max-value
      and bin-size, and ``Histogram.fillHistograms`` consumes ``sys.stdin``.
    * default (in-situ): all values are read from ``options.stdin`` into
      memory, and one histogram per column is computed with
      ``Histogram.Calculate``.

    Lines starting with ``#`` are ignored. Columns are given 1-based on the
    command line and converted to 0-based indices.

    Arguments
    ---------
    argv : list, optional
        Command line arguments; ``sys.argv`` is used when omitted or empty.

    Raises
    ------
    ValueError
        In on-the-fly mode, if min-value, max-value or bin-size is missing,
        or if the columns can be determined neither from ``--columns`` nor
        from a title line.
    IndexError
        In in-situ mode, if a selected column is missing from the title
        line.
    """

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: data2histogram.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-r",
                      "--range",
                      dest="range",
                      type="string",
                      help="range to calculate histogram for.")
    parser.add_option("-b",
                      "--bin-size",
                      dest="bin_size",
                      type="string",
                      help="bin size.")
    parser.add_option("-i",
                      "--titles",
                      dest="titles",
                      action="store_true",
                      help="use supplied column titles.")
    parser.add_option("--no-null",
                      dest="nonull",
                      action="store_true",
                      help="do not output null values")
    parser.add_option("--no-titles",
                      dest="titles",
                      action="store_false",
                      help="no column titles given.")
    parser.add_option("-c",
                      "--columns",
                      dest="columns",
                      type="string",
                      help="columns to take for calculating histograms.")
    parser.add_option(
        "--min-data",
        dest="min_data",
        type="int",
        help=
        "minimum amount of data required, if less data, then the histogram will be empty [default=%default]."
    )
    parser.add_option("--min-value",
                      dest="min_value",
                      type="float",
                      help="minimum value for histogram.")
    parser.add_option("--max-value",
                      dest="max_value",
                      type="float",
                      help="maximum value for histogram.")
    parser.add_option("--no-empty-bins",
                      dest="no_empty_bins",
                      action="store_true",
                      help="do not display empty bins.")
    parser.add_option("--with-empty-bins",
                      dest="no_empty_bins",
                      action="store_false",
                      help="display empty bins.")
    parser.add_option("--normalize",
                      dest="normalize",
                      action="store_true",
                      help="normalize histogram.")
    parser.add_option("--cumulative",
                      dest="cumulative",
                      action="store_true",
                      help="calculate cumulative histogram.")
    parser.add_option("--reverse-cumulative",
                      dest="reverse_cumulative",
                      action="store_true",
                      help="calculate reverse cumulative histogram.")
    parser.add_option("--header-names",
                      dest="headers",
                      type="string",
                      help="use the following headers.")
    parser.add_option(
        "--ignore-out-of-range",
        dest="ignore_out_of_range",
        action="store_true",
        help=
        "ignore values that are out of range (as opposed to truncating them to range border."
    )
    parser.add_option("--missing-value",
                      dest="missing_value",
                      type="string",
                      help="entry for missing values [%default].")
    parser.add_option("--use-dynamic-bins",
                      dest="dynamic_bins",
                      action="store_true",
                      help="each value constitutes its own bin.")
    parser.add_option(
        "--on-the-fly",
        dest="on_the_fly",
        action="store_true",
        help=
        "on the fly computation of histograms. Requires setting of min-value, max-value and bin_size."
    )

    parser.set_defaults(
        bin_size=None,
        range=None,
        titles=True,
        columns="all",
        append=(),
        no_empty_bins=True,
        min_value=None,
        max_value=None,
        normalize=False,
        cumulative=False,
        reverse_cumulative=False,
        nonull=None,
        ignore_out_of_range=False,
        min_data=1,
        headers=None,
        missing_value="na",
        dynamic_bins=False,
        on_the_fly=False,
        bin_format="%.2f",
        value_format="%6.4f",
    )

    # forward argv so that an explicitly supplied argument list is honoured
    (options, args) = E.Start(parser, argv=argv)

    # 1-based column numbers on the command line -> 0-based indices
    if options.columns != "all":
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    if options.range:
        options.min_value, options.max_value = [
            float(x) for x in options.range.split(",")]

    if options.headers:
        options.headers = options.headers.split(",")

    if options.on_the_fly:
        if options.min_value is None or options.max_value is None or \
           options.bin_size is None:
            raise ValueError("please supply columns, min-value, max-value and "
                             "bin-size for on-the-fly computation.")

        if options.titles:
            # try to glean titles from table: the first non-comment line
            data = None
            while True:
                line = sys.stdin.readline()
                if not line:
                    break
                if line[0] == "#":
                    continue
                data = line.rstrip("\n").split("\t")
                break

            if data is None:
                # empty input: same exit path as the in-situ branch
                if options.loglevel >= 1:
                    options.stdlog.write("# no data\n")
                E.Stop()
                sys.exit(0)

            if options.columns == "all":
                options.titles = data
                options.columns = list(range(len(data)))
            else:
                options.titles = [data[x] for x in options.columns]

        elif options.columns == "all":
            # without a title line the number of columns is unknown
            raise ValueError("please supply columns, min-value, max-value and "
                             "bin-size for on-the-fly computation.")

        bins = numpy.arange(options.min_value, options.max_value,
                            float(options.bin_size))
        hh = Histogram.fillHistograms(
            sys.stdin, options.columns,
            [bins for _ in options.columns])
        n = len(hh)

        # header line: one title per selected column
        titles = ['bin']

        if options.headers:
            titles.extend(options.headers[x]
                          for x in range(len(options.columns)))
        elif options.titles:
            titles.extend(options.titles)
        else:
            titles.extend("col%i" % (x + 1) for x in options.columns)

        if len(titles) > 1:
            options.stdout.write("\t".join(titles) + "\n")

        for x in range(len(bins)):
            v = [options.bin_format % bins[x]]
            v.extend(options.value_format % hh[c][x] for c in range(n))

            options.stdout.write("\t".join(v) + "\n")

    else:
        # in-situ computation of histograms
        # retrieve data
        first = True
        vals = []

        # parse data, convert to floats
        for line in options.stdin:

            if line[0] == "#":
                continue

            data = line.rstrip("\n").split("\t")

            if first:
                # the first data line fixes the columns and, optionally,
                # provides the titles
                first = False
                ncols = len(data)
                if options.columns == "all":
                    options.columns = list(range(ncols))

                vals = [[] for _ in options.columns]

                if options.titles:
                    try:
                        options.titles = [data[x] for x in options.columns]
                    except IndexError:
                        raise IndexError(
                            "not all columns %s found in data %s" %
                            (str(options.columns), str(data)))
                    continue

            for x, column in enumerate(options.columns):

                try:
                    v = float(data[column])
                except IndexError:
                    print("# IndexError in line:", line.rstrip("\n"))
                    continue
                except ValueError:
                    # non-numeric field: skip silently
                    continue

                vals[x].append(v)

        hists = []
        titles = []

        if not vals:
            if options.loglevel >= 1:
                options.stdlog.write("# no data\n")
            E.Stop()
            sys.exit(0)

        for x, column in enumerate(options.columns):

            if options.loglevel >= 1:
                options.stdlog.write("# column=%i, num_values=%i\n" %
                                     (column, len(vals[x])))

            if len(vals[x]) < options.min_data:
                continue

            h = Histogram.Calculate(
                vals[x],
                no_empty_bins=options.no_empty_bins,
                increment=options.bin_size,
                min_value=options.min_value,
                max_value=options.max_value,
                dynamic_bins=options.dynamic_bins,
                ignore_out_of_range=options.ignore_out_of_range)

            if options.normalize:
                h = Histogram.Normalize(h)
            if options.cumulative:
                h = Histogram.Cumulate(h)
            if options.reverse_cumulative:
                h = Histogram.Cumulate(h, direction=0)

            hists.append(h)

            for m in options.append:
                if m == "normalize":
                    hists.append(Histogram.Normalize(h))

            if options.headers:
                titles.append(options.headers[x])
            elif options.titles:
                titles.append(options.titles[x])
            else:
                # NOTE(review): this label is 0-based while the on-the-fly
                # branch writes "col%i" % (x + 1); confirm which is intended.
                titles.append("col%i" % column)

        if titles:
            options.stdout.write("bin\t" + "\t".join(titles) + "\n")

        if len(hists) == 1:
            Histogram.Print(hists[0],
                            nonull=options.nonull,
                            format_bin=options.bin_format)
        else:
            combined_histogram = Histogram.Combine(
                hists, missing_value=options.missing_value)
            Histogram.Print(combined_histogram,
                            nonull=options.nonull,
                            format_bin=options.bin_format)

    E.Stop()
コード例 #3
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads tab-separated edges (``vertex1 <tab> vertex2 [<tab> weight]``)
    from stdin, ignoring lines that start with ``#``.

    * method ``histograms``: collects the weights per vertex pair, turns
      each list into a histogram via ``Histogram.Calculate`` and, for
      output format ``semi``, writes one file per first vertex (file name
      built from the output pattern) using ``PrintHistograms``. Lines that
      have fewer than three fields or a non-numeric weight are counted and
      skipped.
    * method ``counts``: counts how often each vertex pair occurs and
      writes ``vertex1 <tab> vertex2 <tab> count`` to ``options.stdout``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: graph2stats.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-r",
                      "--range",
                      dest="range",
                      type="string",
                      help="range to calculate histogram for.")
    parser.add_option("-b",
                      "--bin-size",
                      dest="bin_size",
                      type="string",
                      help="bin size.")
    parser.add_option("-i",
                      "--titles",
                      dest="titles",
                      action="store_true",
                      help="use supplied column titles.")
    parser.add_option("-s",
                      "--make-symmetric",
                      dest="make_symmetric",
                      action="store_true",
                      help="symmetrize graph.")
    parser.add_option("-c",
                      "--columns",
                      dest="columns",
                      type="string",
                      help="columns to take for calculating histograms.")
    parser.add_option("-p",
                      "--output-pattern",
                      dest="output_pattern",
                      type="string",
                      help="pattern for output files.")
    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="string",
                      help="method.")
    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      type="string",
                      help="output format.")

    parser.add_option("--min-value",
                      dest="min_value",
                      type="float",
                      help="minimum value for histogram.")
    parser.add_option("--max-value",
                      dest="max_value",
                      type="float",
                      help="maximum value for histogram.")

    parser.set_defaults(bin_size=None,
                        range=None,
                        titles=False,
                        columns="all",
                        append=(),
                        empty_bins=False,
                        min_value=None,
                        max_value=None,
                        normalize=False,
                        cumulative=False,
                        reverse_cumulative=False,
                        nonull=None,
                        make_symmetric=False,
                        output_pattern="%s.hist",
                        method="histograms",
                        output_format="semi")

    # forward argv so that an explicitly supplied argument list is honoured
    (options, args) = E.Start(parser, argv=argv)

    # 1-based column numbers on the command line -> 0-based indices
    if options.columns != "all":
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    if options.range:
        options.min_value, options.max_value = [
            float(x) for x in options.range.split(",")]

    # retrieve data, dropping comment lines
    lines = [line for line in sys.stdin.readlines() if line[0] != "#"]

    vals = {}

    if options.method == "histograms":

        # read data
        nerrors = 0
        for line in lines:

            fields = line.rstrip("\n").split("\t")

            try:
                v1, v2, w = fields[:3]
                w = float(w)
            except ValueError:
                # too few fields or a non-numeric weight
                nerrors += 1
                continue

            vals.setdefault(v1, {}).setdefault(v2, []).append(w)
            if options.make_symmetric:
                vals.setdefault(v2, {}).setdefault(v1, []).append(w)

        if nerrors:
            E.info("skipped %i lines that could not be parsed" % nerrors)

        # convert to histograms; only values are replaced, so the set of
        # keys stays stable while iterating
        for k1, vv in vals.items():
            for k2 in list(vv.keys()):
                if len(vv[k2]) == 0:
                    continue

                h = Histogram.Calculate(vv[k2],
                                        no_empty_bins=options.empty_bins,
                                        increment=options.bin_size,
                                        min_value=options.min_value,
                                        max_value=options.max_value)

                if options.normalize:
                    h = Histogram.Normalize(h)
                if options.cumulative:
                    h = Histogram.Cumulate(h)
                if options.reverse_cumulative:
                    h = Histogram.Cumulate(h, direction=0)

                vv[k2] = h

        # write output: one file per first vertex
        if options.output_format == "semi":
            for k1, vv in vals.items():

                kk2 = sorted(vv.keys())
                hists = [vv[k2] for k2 in kk2]

                # the file is written to, so it has to be opened in
                # write mode
                with open(options.output_pattern % k1, "w") as outfile:
                    PrintHistograms(outfile, kk2, hists, options)

    elif options.method == "counts":

        # read data
        for line in lines:

            v1, v2 = line.rstrip("\n").split("\t")[:2]

            if v1 not in vals:
                vals[v1] = {}
            if v2 not in vals[v1]:
                vals[v1][v2] = 0
            vals[v1][v2] += 1
            if options.make_symmetric:
                if v2 not in vals:
                    vals[v2] = {}
                if v1 not in vals[v2]:
                    vals[v2][v1] = 0
                vals[v2][v1] += 1

        # write the counts per vertex pair
        for k1, vv in vals.items():
            for k2 in vv.keys():
                options.stdout.write("%s\t%s\t%i\n" % (k1, k2, vv[k2]))

    E.Stop()