def main(argv=None):
    """script main.

    Reads a table from stdin via ``IOTools.readTable``, adds the
    pseudocounts to every column but the first, optionally rescales
    each of these columns so that it sums to one and writes, for every
    pair of these columns, the Kullback-Leibler divergence in both
    directions to ``options.stdout`` as ``label1<TAB>label2<TAB>value``
    lines.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      help="method to use [kl=kullback-leibler]",
                      choices=("kl", ))
    parser.add_option("-n", "--no-normalize", dest="normalize",
                      action="store_false",
                      help="do not normalize data")
    parser.add_option("-p", "--pseudocounts", dest="pseudocounts",
                      type="int",
                      help="pseudocounts to add.")
    parser.add_option("-f", "--number-format", dest="number_format",
                      type="string",
                      help="number format.")

    parser.set_defaults(method="kl",
                        columns="all",
                        headers=True,
                        xrange=None,
                        pseudocounts=1,
                        normalize=True,
                        number_format="%6.4f")

    # forward argv so that a caller-supplied argument list is honoured
    (options, args) = E.start(parser, argv=argv, add_pipe_options=True)

    if options.xrange:
        options.xrange = list(map(float, options.xrange.split(",")))

    data, legend = IOTools.readTable(sys.stdin,
                                     numeric_type=numpy.float32,
                                     take=options.columns,
                                     headers=options.headers,
                                     truncate=options.xrange)

    nrows, ncols = data.shape

    # first: add pseudocounts to and normalize every column except the
    # first one, which is left untouched.
    pseudocounts = float(options.pseudocounts)
    for y in range(1, ncols):
        data[:, y] += pseudocounts
        if options.normalize:
            data[:, y] /= numpy.sum(data[:, y])

    # second: pairwise divergences between all columns but the first
    for x in range(1, len(legend) - 1):
        for y in range(x + 1, len(legend)):

            if options.method == "kl":
                d1 = 0.0
                d2 = 0.0
                for row in range(nrows):
                    p = data[row, x]
                    q = data[row, y]
                    # a term with a zero weight contributes nothing
                    # (0 * log(0) := 0); evaluating it would make
                    # math.log fail on a zero argument.
                    if p != 0:
                        d1 += p * math.log(p / q)
                    if q != 0:
                        d2 += q * math.log(q / p)

                options.stdout.write(
                    "%s\t%s\t%s\n" %
                    (legend[x], legend[y], options.number_format % d1))
                options.stdout.write(
                    "%s\t%s\t%s\n" %
                    (legend[y], legend[x], options.number_format % d2))

    E.stop()
Esempio n. 2
0
def main(argv=None):
    """script main.

    Reads one table per file name given on the command line and, for
    every pair of files, fits the linear model ``y ~ x`` through R
    (rpy) on the values of the first selected column of both tables,
    using the file of origin as the factor.  One line per pair with
    the fields of the resulting anova table is written to
    ``options.stdout``.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: data2multiple_anova.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to take for calculating histograms.")
    parser.add_option("-t", "--tree-nh-file", dest="filename_tree", type="string",
                      help="filename with tree(s).")
    parser.add_option("--skip-header", dest="add_header", action="store_false",
                      help="do not add header to flat format.")
    parser.add_option("--output-with-header", dest="write_header", action="store_true",
                      help="write header and exit.")
    parser.add_option("--debug", dest="debug", action="store_true",
                      help="debug mode")
    parser.add_option("--display-tree", dest="display_tree", action="store_true",
                      help="display the tree")

    parser.add_option("-m", "--method", dest="methods", type="choice", action="append",
                      choices=("contrasts", "spearman", "pearson", "compute"),
                      help="methods to perform on contrasts.")

    parser.set_defaults(
        columns="all",
        filename_tree=None,
        add_header=True,
        write_header=False,
        debug=False,
        methods=[],
        value_format="%6.4f",
        pvalue_format="%e",
        display_tree=False,
    )

    # forward argv so that a caller-supplied argument list is honoured
    (options, args) = E.start(parser, argv=argv, quiet=True)

    if options.columns not in ("all", "all-but-first"):
        # user supplied columns are 1-based
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    data = []

    options.filenames = args

    for filename in options.filenames:
        # the context manager closes the file even if parsing fails
        with IOTools.open_file(filename, "r") as infile:
            table, _ = IOTools.readTable(
                infile, take=options.columns, headers=False)
        data.append(table)

    fields = ["Df", "Sum Sq", "F value", "Pr(>F)", "Mean Sq"]

    options.stdout.write("set1\tset2")
    for field in fields:
        options.stdout.write("\t%s" % field)
    options.stdout.write("\n")

    # CODE needs to be refactored for rpy2 usage

    for x in range(len(data)):

        for y in range(x + 1, len(data)):

            rpy.set_default_mode(rpy.NO_CONVERSION)

            # label every value with the table it originates from
            first = data[x][:, 0]
            second = data[y][:, 0]
            factors = ["x"] * len(first) + ["y"] * len(second)
            values = list(first) + list(second)

            linear_model = R.lm(
                R("y ~ x"), data=R.data_frame(x=factors, y=values))
            rpy.set_default_mode(rpy.BASIC_CONVERSION)
            result = R.anova(linear_model)

            options.stdout.write(
                "%s\t%s" % (options.filenames[x], options.filenames[y]))
            for field in fields:
                options.stdout.write("\t%s" % str(result[field]))
            options.stdout.write("\n")

    # close the session opened by E.start, as the sibling scripts do
    E.stop()
Esempio n. 3
0
def main(argv=None):
    """script main.

    Transforms a histogram read from stdin and prints the result.

    With ``--method=append`` (the default) the input is expected to
    hold two tab-separated columns (bin, count); for every bin the
    frequency and the increasing and decreasing cumulative counts and
    frequencies are appended.  The other methods (``cumul``,
    ``rcumul``, ``normalize``) read the table with
    ``IOTools.readTable`` and modify every column but the first in
    place; ``normalize`` divides each column by its maximum.

    parses command line options in sys.argv, unless *argv* is given.

    Raises ValueError if there is no data in ``append`` mode or if an
    unknown method is encountered.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: histogram2histogram.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-i", "--is-int", dest="is_ints", action="store_true",
                      help="categories are integers.")
    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("append", "cumul", "rcumul", "normalize"),
                      help="method(s) to apply.")
    parser.add_option("--no-headers", dest="headers", action="store_false",
                      help="histogram has no headers.")
    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to use for plotting.")
    parser.add_option("", "--truncate", dest="truncate", type="string",
                      help="truncate at range.")
    parser.add_option("", "--no-out-of-range", dest="cumulate_out_of_range", action="store_false",
                      help="add up bins out of range.")
    parser.add_option("--bin-format", dest="format_bin", type="string",
                      help="format for bins.")
    parser.add_option("--value-format", dest="format_val", type="string",
                      help="format for vals.")

    parser.set_defaults(
        is_ints=False,
        method="append",
        columns="all",
        headers=True,
        truncate=None,
        cumulate_out_of_range=True,
        format_bin="%6.4f",
        format_val="%6.4f",
    )

    # forward argv so that a caller-supplied argument list is honoured
    (options, args) = E.start(parser, argv=argv)

    # old histogram2histogram.py semantics - need to merged with newer
    # code below.
    if options.method == "append":

        # retrieve histogram, ignoring comments and blank lines. Only
        # the line terminator is stripped, so a final line without a
        # newline keeps its last character.
        lines = [x.rstrip("\n") for x in sys.stdin
                 if x.strip() and not x.startswith("#")]

        if not lines:
            raise ValueError("no data found")

        # check if first line contains a header: it does if its first
        # field can not be parsed as a number.
        d = lines[0].split("\t")[0]
        try:
            if options.is_ints:
                int(d)
            else:
                float(d)
        except ValueError:
            print("\t".join(
                (d, "counts", "frequency",
                 "cumulative counts", "increasing cumulative frequency",
                 "cumulative counts", "decreasing cumulative frequency")))
            del lines[0]

        data = [list(map(float, x.split("\t"))) for x in lines]

        if not data:
            raise ValueError("no data found")

        total = float(sum(x[1] for x in data))

        # start from the exact total: truncating it to an integer
        # would distort the decreasing cumulative columns for
        # non-integer counts.
        cumul_down = total
        cumul_up = 0

        if options.is_ints:
            form = "%i\t%i\t%6.4f\t%i\t%6.4f\t%i\t%6.4f"
        else:
            form = "%6.4f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%6.4f"

        for bin_value, val in data:
            percent = float(val) / total
            cumul_up += val
            percent_cumul_up = float(cumul_up) / total
            percent_cumul_down = float(cumul_down) / total

            print(form %
                  (bin_value, val, percent, cumul_up, percent_cumul_up,
                   cumul_down, percent_cumul_down))

            cumul_down -= val

    else:

        if options.truncate:
            options.truncate = list(map(float, options.truncate.split(",")))

        options.method = options.method.split(",")
        data, legend = IOTools.readTable(sys.stdin,
                                         numeric_type=numpy.float32,
                                         take=options.columns,
                                         headers=options.headers,
                                         truncate=options.truncate,
                                         cumulate_out_of_range=options.cumulate_out_of_range)

        nfields = len(legend)

        # note: because of MA, iteration makes copy of slices
        # Solution: inplace edits.
        nrows, ncols = data.shape

        for method in options.method:
            if method == "cumul":
                # running sum from the first to the last row
                last = [0] * ncols
                for x in range(nrows):
                    for y in range(1, ncols):
                        data[x, y] += last[y]
                        last[y] = data[x, y]

            elif method == "rcumul":
                # running sum from the last row down to and including
                # the first row
                last = [0] * ncols
                for x in range(nrows - 1, -1, -1):
                    for y in range(1, ncols):
                        data[x, y] += last[y]
                        last[y] = data[x, y]

            elif method == "normalize":
                maxima = [0] * ncols
                for x in range(nrows):
                    for y in range(1, ncols):
                        # the conversion to float is necessary
                        maxima[y] = max(maxima[y], float(data[x, y]))

                # avoid a division by zero for columns without a
                # positive value
                for y in range(1, ncols):
                    if maxima[y] == 0:
                        maxima[y] = 1.0

                for x in range(nrows):
                    for y in range(1, ncols):
                        data[x, y] = data[x, y] / maxima[y]
            else:
                raise ValueError("unknown method %s" % method)

        print("\t".join(legend))

        row_format = options.format_bin + "\t" + \
            "\t".join([options.format_val] * (nfields - 1))

        for d in data:
            print(row_format % tuple(d))

    E.stop()
Esempio n. 4
0
def main(argv=None):
    """script main.

    Reads a table of histograms (first column: x values, further
    columns: one series each) from the file given as first argument,
    or from stdin if there is no argument or it is ``-``, optionally
    cumulates or normalizes the series and plots them with
    matplotlib/pylab.  The plot is saved to the ``--hardcopy`` file if
    one is given and shown interactively otherwise.

    parses command line options in sys.argv, unless *argv* is given.

    Raises ValueError if error bars are requested together with
    cumulation or normalization.

    NOTE: the expression given to ``--add-function`` is evaluated as
    python code; only use it with trusted input.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: plot_histogram.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-l",
                      "--plot-legend",
                      dest="legend",
                      type="string",
                      help="legend for plot [default=%default].")
    parser.add_option("-t",
                      "--title",
                      dest="title",
                      type="string",
                      help="title for plot [default=%default].")
    parser.add_option(
        "-p",
        "--hardcopy",
        dest="hardcopy",
        type="string",
        help=
        "filename for hardcopy of plot. The extension defines the format. Known extensions are: 'emf, eps, jpeg, jpg, pdf, png, ps, raw, rgba, svg, svgz' [default=%default].",
        metavar="FILE")
    parser.add_option("",
                      "--xrange",
                      dest="xrange",
                      type="string",
                      help="x viewing range of plot [default=%default].")
    parser.add_option("",
                      "--yrange",
                      dest="yrange",
                      type="string",
                      help="y viewing range of plot[default=%default].")
    parser.add_option("-o",
                      "--logscale",
                      dest="logscale",
                      type="string",
                      help="use logscale on x, y or xy [default=%default]")
    parser.add_option("-x",
                      "--xtitle",
                      dest="xtitle",
                      type="string",
                      help="title for x axis [default=%default]")
    parser.add_option("-y",
                      "--ytitle",
                      dest="ytitle",
                      type="string",
                      help="title for y axis [default=%default]")
    parser.add_option("-d",
                      "--dpi",
                      dest="dpi",
                      type="int",
                      help="dpi of images [default=%default]")
    parser.add_option("-n",
                      "--normalize",
                      dest="normalize",
                      action="store_true",
                      help="normalize histograms [default=%default]")
    parser.add_option(
        "--cumulate",
        dest="cumulate",
        action="store_true",
        help="calculate cumulative histogram [default=%default].")
    parser.add_option(
        "--reverse-cumulate",
        dest="reverse_cumulate",
        action="store_true",
        help=
        "calculate cumulative histogram in reverse order [default=%default].")
    parser.add_option("--legend-location",
                      dest="legend_location",
                      type="choice",
                      choices=("upper left", "upper right", "lower left",
                               "lower right", "center", "center right",
                               "center left", "none"),
                      help="location of legend [default=%default]")
    parser.add_option("--backend",
                      dest="backend",
                      type="string",
                      help="backend to use [Agg|SVG|PS] [default=%default]")
    parser.add_option(
        "--symbols",
        dest="symbols",
        type="string",
        help="symbols to use for each histogram [steps|...] [default=%default]."
    )
    parser.add_option("--dump",
                      dest="dump",
                      action="store_true",
                      help="dump data for debug purposes [default=%default].")
    parser.add_option("-c",
                      "--columns",
                      dest="columns",
                      type="string",
                      help="columns to use for plotting [default=%default].")
    parser.add_option(
        "--truncate",
        dest="truncate",
        action="store_true",
        help=
        "truncate date within x range. If not set, xrange is simply a viewing range [default=%default]."
    )
    parser.add_option("--as-lines",
                      dest="as_lines",
                      action="store_true",
                      help="plot only lines, no symbols [default=%default].")
    parser.add_option(
        "--noheaders",
        dest="headers",
        action="store_false",
        help="do not take first input line as header [default=%default].")
    parser.add_option("--stacked",
                      dest="stacked",
                      action="store_true",
                      help="do a stacked plot [default=%default].")
    parser.add_option("--add-function",
                      dest="function",
                      type="string",
                      help="add a function to the plot [default=%default].")
    parser.add_option(
        "--add-error-bars",
        dest="error_bars",
        type="choice",
        choices=("interleaved", "blocked"),
        help=
        "add error bars. The input format is 'interleaved' or 'blocked'. In the interleaved format the error follows each column. I the blocked format first the data, then the errors in the same order [default=%default]."
    )

    parser.set_defaults(
        legend=None,
        title=None,
        hardcopy=None,
        logscale=None,
        xtitle=None,
        ytitle=None,
        xrange=None,
        yrange=None,
        normalize=None,
        columns="all",
        headers=True,
        legend_location="upper right",
        backend="cairo",
        symbols="g-D,b-h,r-+,c-+,m-+,y-+,k-o,g-^,b-<,r->,c-D,m-h",
        dump=False,
        truncate=False,
        cumulate=False,
        reverse_cumulate=False,
        function=None,
        # the destination of --add-error-bars is "error_bars"
        error_bars=None,
        as_lines=False,
        stacked=False,
        dpi=80,
    )

    # forward argv so that a caller-supplied argument list is honoured
    (options, args) = E.start(parser, argv=argv)

    # import matplotlib/pylab. Has to be done here
    # for batch scripts without GUI.
    import matplotlib
    if options.hardcopy:
        # honour --backend (its default is "cairo")
        matplotlib.use(options.backend)
    import pylab

    # put this method here (because it requires pylab)
    def doStackedPlot(data, legend):
        """plot all series stacked on top of each other.

        Returns the line handles and labels to use for the legend.
        Sets options.xrange and options.yrange if they are unset.
        """

        colors = [
            "red", "blue", "green", "cyan", "magenta", "yellow", "brown",
            "silver", "purple", "lightyellow", "black", "ivory", "pink",
            "orange", "gray", "teal"
        ]

        ax = data[:, 0]
        # polygon outline: along the top from left to right and back
        # along the previous top from right to left
        xvals = numpy.concatenate((ax, ax[::-1]))
        y_top = numpy.zeros(len(ax))

        max_y = min(data[:, 1:].flat)
        new_legend, dummy_lines = [], []

        for i in range(1, len(legend)):
            color = colors[i % len(colors)]
            new_y_top = y_top + data[:, i]
            yvals = numpy.concatenate((new_y_top, y_top[::-1]))
            pylab.fill(xvals, yvals, color)

            y_top = new_y_top
            max_y = max(y_top)

            # plot() returns a list of lines; keep the line itself as
            # the legend handle
            dummy_lines.append(pylab.plot(xvals, yvals, color)[0])

            new_legend.append(legend[i])

        if not options.xrange:
            options.xrange = min(data[:, 0]), max(data[:, 0])

        if not options.yrange:
            options.yrange = 0, max_y

        return dummy_lines, new_legend

    if options.as_lines:
        options.symbols = []
        for y in ("-", ":", "--"):
            for x in "gbrcmyk":
                options.symbols.append(y + x)
    else:
        options.symbols = options.symbols.split(",")

    if options.xrange:
        options.xrange = list(map(float, options.xrange.split(",")))
    if options.yrange:
        options.yrange = list(map(float, options.yrange.split(",")))

    # Added support for (inclusive) range format: "1,3,5,7-100"  (Gerton
    # 13/12/06)
    # Columns are given 1-based and converted to 0-based indices, both
    # for single columns and for ranges.
    if options.columns != "all":
        cols = []
        for d in options.columns.split(','):
            colopts = d.split('-')
            if len(colopts) == 2:
                cols += list(range(int(colopts[0]) - 1, int(colopts[1])))
            else:
                cols += [int(d) - 1]
        options.columns = cols

    if args:
        if args[0] == "-":
            infile = sys.stdin
        else:
            infile = IOTools.open_file(args[0], "r")
    else:
        infile = sys.stdin

    if options.truncate:
        xr = options.xrange
    else:
        xr = None

    # builtin float: the numpy.float alias is not available in current
    # numpy versions
    data, legend = IOTools.readTable(infile,
                                     numeric_type=float,
                                     take=options.columns,
                                     headers=options.headers,
                                     truncate=xr)

    if infile is not sys.stdin:
        infile.close()
    if len(data) == 0:  # or data is None:
        E.info("empty table: no plot")
        E.stop()
        return

    nrows, ncols = data.shape

    # note: because of MA, iteration makes copy of slices
    # Solution: inplace edits.
    if options.cumulate:
        if options.error_bars:
            raise ValueError("can not add error bars to cumulative histogram")
        if data.mask.any():
            # cumsum does not work with masked arrays, so do it manually
            for y in range(1, ncols):
                c = 0
                for x in range(0, nrows):
                    if not data.mask[x, y]:
                        data[x, y] += c
                        c = data[x, y]
        else:
            for x in range(1, ncols):
                data[:, x] = data[:, x].cumsum()

    elif options.reverse_cumulate:
        if options.error_bars:
            raise ValueError("can not add error bars to cumulative histogram")
        if data.mask.any():
            l = [0] * ncols
            for x in range(nrows - 1, -1, -1):
                for y in range(1, ncols):
                    if not data.mask[x, y]:
                        data[x, y] += l[y]
                        l[y] = data[x, y]
        else:
            l = [0] * ncols
            for x in range(nrows - 1, -1, -1):
                for y in range(1, ncols):
                    data[x, y] += l[y]
                    l[y] = data[x, y]

    if options.normalize:
        if options.error_bars:
            raise ValueError("can not add error bars to normalized histogram")
        if data.mask.any():
            m = [0] * ncols
            for x in range(nrows):
                for y in range(1, ncols):
                    if not data.mask[x, y]:
                        m[y] = max(m[y], float(data[x, y]))

            for y in range(1, ncols):
                if m[y] == 0:
                    m[y] = 1.0

            for x in range(nrows):
                for y in range(1, ncols):
                    data[x, y] = data[x, y] / m[y]
        else:
            for x in range(1, ncols):
                m = float(data[:, x].max())
                # same guard as for masked data: do not divide by zero
                if m == 0:
                    m = 1.0
                data[:, x] /= m

    if options.legend:
        legend = options.legend.split(",")

    if options.dump:
        for d in data:
            print(d)

    if options.title:
        pylab.title(options.title)

    if options.xtitle:
        pylab.xlabel(options.xtitle)
    else:
        pylab.xlabel(legend[0])

    if options.ytitle:
        pylab.ylabel(options.ytitle)

    # use dummy_lines to workaround a bug in errorbars that
    # causes the line styles to be set incorrectly.
    dummy_lines = []
    new_legend = []

    # offset from a data column to its error column
    error_offset = None
    if options.error_bars == "interleaved":
        # each data column is followed by its error column
        step_size = 2
        max_size = len(legend)
        error_offset = 1
    elif options.error_bars == "blocked":
        # first all data columns, then all error columns in the same
        # order. Integer division: range() needs integer bounds.
        step_size = 1
        ndata = (len(legend) - 1) // 2
        max_size = ndata + 1
        error_offset = ndata
    else:
        step_size = 1
        max_size = len(legend)

    if options.stacked:
        dummy_lines, new_legend = doStackedPlot(data, legend)
    else:
        nplotted = 0
        nskipped = 0
        for x in range(1, max_size, step_size):

            s = options.symbols[nplotted % len(options.symbols)]

            yvals = data[:, x]

            # drop the x values for which the y value is masked
            xvals = numpy.ma.masked_array(data[:, 0], numpy.ma.getmask(yvals))

            xvals = xvals.compressed()
            yvals = yvals.compressed()

            if len(xvals) == 0:
                E.warn("skipped empty column %i: %s" % (x, legend[x]))
                nskipped += 1
                continue

            if error_offset is None:
                yerr = None
            else:
                yerr = data[:, x + error_offset].compressed()

            pylab.errorbar(xvals, yvals, yerr=yerr, fmt=s)

            # plot() returns a list of lines; keep the line itself as
            # the legend handle
            dummy_lines.append(pylab.plot(xvals, yvals, s)[0])

            new_legend.append(legend[x])

            nplotted += 1

        E.info("nplotted=%i, nskipped=%i" % (nplotted, nskipped))

    # nothing has been drawn, neither as stacked plot nor as lines
    if not dummy_lines:
        E.stop()
        return

    if options.legend_location != "none":
        pylab.figlegend(dummy_lines, new_legend,
                        loc=options.legend_location)

    if options.logscale:
        if "x" in options.logscale:
            pylab.gca().set_xscale('log')
        if "y" in options.logscale:
            pylab.gca().set_yscale('log')

    if options.xrange:
        pylab.xlim(options.xrange)

    if options.yrange:
        pylab.ylim(options.yrange)

    if options.function:
        xstart, xend = pylab.gca().get_xlim()
        increment = (xend - xstart) / 100.0
        # NOTE: this evaluates user-supplied code. The expression may
        # refer to the names visible in this function and module.
        scope = dict(globals())
        scope.update(locals())
        f = eval("lambda x: %s" % options.function, scope)
        # 101 points covering the current x range of the plot
        xvals = [xstart + i * increment for i in range(101)]
        yvals = [f(v) for v in xvals]

        pylab.plot(xvals, yvals)

    if options.hardcopy:
        pylab.savefig(os.path.expanduser(options.hardcopy), dpi=options.dpi)
    else:
        pylab.show()

    E.stop()