Ejemplo n.º 1
0
    def GET(self, tracker):
        """Serve the cached results of *tracker* as a rendered data table."""
        reader = Cache.Cache(tracker, mode="r")
        tree = DataTree.fromCache(reader)
        matrix, rows, columns = DataTree.tree2table(tree)
        return render.data_table(matrix, rows, columns)
Ejemplo n.º 2
0
    def GET(self, tracker):
        """Render the data stored in *tracker*'s cache as a table."""
        # open the persistent cache read-only and flatten it to a table
        tracker_cache = Cache.Cache(tracker, mode="r")
        tree = DataTree.fromCache(tracker_cache)
        body, headers_rows, headers_cols = DataTree.tree2table(tree)

        return render.data_table(body, headers_rows, headers_cols)
Ejemplo n.º 3
0
    def getData(self, path):
        """get data for track and slice. Save data in persistent cache for
        further use.

        For functions, path should be an empty tuple.
        """
        # empty path (function trackers) is stored under the key "all"
        key = DataTree.path2str(path) if path else "all"

        result = None
        fromcache = False
        # trackers with options are not cached
        if not self.nocache and not self.tracker_options:
            try:
                result = self.cache[key]
                fromcache = True
            except KeyError:
                # not cached yet - fall through and compute below
                pass
            except RuntimeError as msg:
                raise RuntimeError(
                    "error when accessing key %s from cache: %s "
                    "- potential problem with unpickable object?" % (key, msg))

        kwargs = {}
        if self.tracker_options:
            kwargs['options'] = self.tracker_options

        if result is None:
            try:
                result = self.tracker(*path, **kwargs)
            except Exception as msg:
                self.warn("exception for tracker '%s', path '%s': msg=%s" %
                          (str(self.tracker),
                           DataTree.path2str(path),
                           msg))
                if VERBOSE:
                    self.warn(traceback.format_exc())
                raise

        # store freshly computed results in the cache
        if not self.nocache and not fromcache:
            # exception - do not store data frames
            # test with None fails for some reason
            self.cache[key] = result

        return result
Ejemplo n.º 4
0
    def render(self, data):
        """Collect user-supplied tracker-keyword entries found one level
        above the leaves of *data* into a ResultBlocks container."""
        blocks = ResultBlocks(title='user')

        labels = DataTree.getPaths(data)
        # walk the nodes one level above the leaves
        for path, branch in DataTree.getNodes(data, len(labels) - 2):
            present = [kw for kw in Utils.TrackerKeywords if kw in branch]
            for kw in present:
                # each matching keyword becomes its own result block
                blocks.append(
                    ResultBlock(branch[kw], title=path2str(path)))

        return blocks
    def transform(self, data):
        """Compute 2- or 3-way set intersection counts from a melted
        single-column dataframe, one set per lowest-level group."""
        # a melted dataframe with exactly one value column is required
        if len(data.columns) != 1:
            raise ValueError(
                'transformer requires dataframe with '
                'a single column, got %s' % data.columns)
        column = data.columns[0]

        # build one set per lowest-level index group
        genesets = {}
        nlevels = Utils.getDataFrameLevels(data)
        for key, group in data.groupby(level=list(range(nlevels))):
            if "background" in key and not self.background:
                # background lists are skipped unless requested
                continue
            genesets[key] = set(group[column])

        keys = list(genesets.keys())
        values = []
        if len(genesets) == 2:
            a = set(genesets[keys[0]])
            b = set(genesets[keys[1]])

            values.append(("10", len(a - b)))
            values.append(("01", len(b - a)))
            values.append(("11", len(a & b)))
            values.append(("labels", list(map(path2str, keys))))
        elif len(genesets) == 3:
            a = set(genesets[keys[0]])
            b = set(genesets[keys[1]])
            c = set(genesets[keys[2]])

            values.append(("100", len(a - b - c)))
            values.append(("010", len(b - a - c)))
            values.append(("001", len(c - a - b)))
            values.append(("110", len((a & b) - c)))
            values.append(("101", len((a & c) - b)))
            values.append(("011", len((b & c) - a)))
            values.append(("111", len((a & b) & c)))
            values.append(("labels", list(map(path2str, keys))))
        else:
            raise ValueError(
                "Can currently only cope with 2 or 3 way intersections")

        return DataTree.listAsDataFrame(values)
Ejemplo n.º 6
0
    def __call__(self, *args, **kwargs):
        """Execute the dispatcher pipeline.

        Parses arguments, collects data from the tracker, applies
        transformations and path filters, groups the data and renders
        it.  Each stage is guarded: on failure a ResultBlocks wrapping
        an exception block is returned instead of raising.
        """

        try:
            self.parseArguments(*args, **kwargs)
        except:
            # error boundary: report the failure as a renderable block
            self.error("%s: exception in parsing" % self)
            return ResultBlocks(ResultBlocks(Utils.buildException("parsing")))

        # collect no data if tracker is the empty tracker
        # and go straight to rendering
        try:
            if self.tracker.getTracks() == ["empty"]:
                # is instance does not work because of module mapping
                # type(Tracker.Empty) == CGATReport.Tracker.Empty
                # type(self.tracker) == Tracker.Empty
                # if isinstance(self.tracker, Tracker.Empty):
                result = self.renderer()
                return ResultBlocks(result)
        except AttributeError:
            # for function trackers
            pass

        self.debug("profile: started: tracker: %s" % (self.tracker))

        # collecting data
        try:
            self.collect()
        except:
            self.error("%s: exception in collection" % self)
            return ResultBlocks(ResultBlocks(
                Utils.buildException("collection")))
        finally:
            # always close the profiling bracket, even on error
            self.debug("profile: finished: tracker: %s" % (self.tracker))

        # nothing collected - stop early
        if self.tree is None or len(self.tree) == 0:
            self.info("%s: no data - processing complete" % self.tracker)
            return None

        data_paths = DataTree.getPaths(self.tree)
        self.debug("%s: after collection: %i data_paths: %s" %
                   (self, len(data_paths), str(data_paths)))

        # special Renderers - do not process data further but render
        # directly. Note that no transformations will be applied.
        if isinstance(self.renderer, Renderer.User):
            results = ResultBlocks(title="main")
            results.append(self.renderer(self.tree))
            return results
        elif isinstance(self.renderer, Renderer.Debug):
            results = ResultBlocks(title="main")
            results.append(self.renderer(self.tree))
            return results

        # merge all data to hierarchical indexed dataframe
        self.data = DataTree.asDataFrame(self.tree)

        self.debug("dataframe memory usage: total=%i,data=%i,index=%i,col=%i" %
                   (self.data.values.nbytes +
                    self.data.index.nbytes +
                    self.data.columns.nbytes,
                    self.data.values.nbytes,
                    self.data.index.nbytes,
                    self.data.columns.nbytes))

        # if tracks are set by tracker, call tracker with dataframe
        if self.indexFromTracker:
            self.tracker.setIndex(self.data)

        # transform data
        try:
            self.transform()
        except:
            self.error("%s: exception in transformation" % self)
            return ResultBlocks(ResultBlocks(
                Utils.buildException("transformation")))

        # data_paths = DataTree.getPaths(self.data)
        # self.debug("%s: after transformation: %i data_paths: %s" %
        #           (self, len(data_paths), str(data_paths)))
        # restrict
        try:
            self.filterPaths(self.restrict_paths, mode="restrict")
        except:
            self.error("%s: exception in restrict" % self)
            return ResultBlocks(ResultBlocks(
                Utils.buildException("restrict")))

        # data_paths = DataTree.getPaths(self.data)
        # self.debug("%s: after restrict: %i data_paths: %s" %
        #          (self, len(data_paths), str(data_paths)))
        # exclude
        try:
            self.filterPaths(self.exclude_paths, mode="exclude")
        except:
            self.error("%s: exception in exclude" % self)
            return ResultBlocks(ResultBlocks(Utils.buildException("exclude")))

        # data_paths = DataTree.getPaths(self.data)
        # self.debug("%s: after exclude: %i data_paths: %s" %
        #          (self, len(data_paths), str(data_paths)))

        # No pruning - maybe enable later as a user option
        self.pruned = []
        # try:
        #     self.prune()
        # except:
        #     self.error("%s: exception in pruning" % self)
        #     return ResultBlocks(ResultBlocks(Utils.buildException("pruning")))

        # data_paths = DataTree.getPaths(self.data)
        # self.debug("%s: after pruning: %i data_paths: %s" %
        #           (self, len(data_paths), str(data_paths)))
        try:
            self.group()
        except:
            self.error("%s: exception in grouping" % self)
            return ResultBlocks(ResultBlocks(Utils.buildException("grouping")))

        # data_paths = DataTree.getPaths(self.data)
        # self.debug("%s: after grouping: %i data_paths: %s" %
        #           (self, len(data_paths), str(data_paths)))
        if self.renderer is not None:
            self.debug("profile: started: renderer: %s" % (self.renderer))

            try:
                result = self.render()
            except:
                self.error("%s: exception in rendering" % self)
                return ResultBlocks(ResultBlocks(
                    Utils.buildException("rendering")))
            finally:
                self.debug("profile: finished: renderer: %s" % (self.renderer))
        else:
            # no renderer configured - return an empty result container
            result = ResultBlocks(title="")

        return result
Ejemplo n.º 7
0
    def collect(self):
        '''collect all data.

        Data is stored in a multi-level dictionary (DataTree) in
        self.tree.  For function trackers a single result is stored
        under the path ("all",).  Returns the tree, or None when the
        tracker is a function or yields no tracks.
        '''

        self.tree = odict()

        self.debug("%s: collecting data paths." % (self.tracker))
        is_function, datapaths = self.getDataPaths(self.tracker)
        self.debug("%s: collected data paths." % (self.tracker))

        # if function, no datapaths
        if is_function:
            # call once with the empty path
            d = self.getData(())

            # save in data tree as leaf
            DataTree.setLeaf(self.tree, ("all",), d)

            self.debug("%s: collecting data finished for function." %
                       (self.tracker))
            return

        # if no tracks, error
        if len(datapaths) == 0 or len(datapaths[0]) == 0:
            self.warn("%s: no tracks found - no output" % self.tracker)
            return

        # filter data paths
        self.debug("%s: filtering data paths: %s" %
                   (self.tracker, datapaths))
        datapaths = self.filterDataPaths(datapaths)
        self.debug("%s: filtered data paths: %s" %
                   (self.tracker, datapaths))

        # if no tracks, error
        if len(datapaths) == 0 or len(datapaths[0]) == 0:
            self.warn(
                "%s: no tracks remain after filtering "
                "- no output" % self.tracker)
            return

        self.debug("%s: building all_paths" % (self.tracker))
        # guard against combinatorial explosion of the cartesian product
        if len(datapaths) > MAX_PATH_NESTING:
            self.warn("%s: number of nesting in data paths too large: %i" % (
                self.tracker, len(datapaths)))
            raise ValueError(
                "%s: number of nesting in data paths too large: %i" % (
                    self.tracker, len(datapaths)))

        # cartesian product of all per-level path components
        all_paths = list(itertools.product(*datapaths))
        self.debug(
            "%s: collecting data started for %i data paths" % (
                self.tracker,
                len(all_paths)))

        self.tree = odict()
        for path in all_paths:

            d = self.getData(path)

            # ignore empty data sets
            if d is None:
                continue

            # save in data tree as leaf
            DataTree.setLeaf(self.tree, path, d)

        self.debug(
            "%s: collecting data finished for %i data paths" % (
                self.tracker,
                len(all_paths)))
        return self.tree
    def transform(self, data):
        """Compute pairwise hypergeometric enrichment between gene lists.

        *data* must be a melted dataframe with a single column; the
        groups at the lowest index levels define the gene lists.  One
        list must be called "background" and must contain all items of
        the other lists.

        Returns a dataframe with enrichment fold changes and P-values
        (one row per pair of lists).

        Raises ValueError on malformed input.
        """
        # check if data is melted:
        if len(data.columns) != 1:
            raise ValueError(
                'transformer requires dataframe with '
                'a single column, got %s' % data.columns)
        column = data.columns[0]

        # iterate over lowest levels to build a dictionary of
        # sets; use an explicit list for the level argument, matching
        # the sibling transformer
        genesets = {}
        nlevels = Utils.getDataFrameLevels(data)
        for key, group in data.groupby(level=list(range(nlevels))):
            genesets[path2str(key)] = set(group[column])

        # materialize keys: dict views are not indexable in python 3
        keys = list(genesets.keys())

        background = None
        foreground = []
        for key in keys:
            if "background" in key:
                background = genesets[key]
            else:
                foreground.append(key)

        if len(keys) < 3 or background is None:
            raise ValueError(
                "Expected at least 3 lists, with one called background, "
                "instead got %i lists called %s" %
                (len(keys), ", ".join(keys)))

        # every foreground item must also be in the background
        missing = {
            y: [str(x) for x in genesets[y]
                if x not in background] for y in foreground}

        if any([len(missing[x]) > 0 for x in missing]):
            missing_items = "\n\t".join(
                ["%s:\t%s" % (x, ",".join(missing[x])) for x in missing])
            raise ValueError(
                "Found items in lists not in background. "
                "Missing items:\n\t %s" % missing_items)

        M = len(set(background))
        if len(keys) == 2:
            # NOTE(review): unreachable given the len(keys) < 3 guard
            # above; kept for backwards compatibility
            n = len(set(genesets[keys[1]]))
            N = len(set(genesets[keys[0]]))
            x = len(set(genesets[keys[0]]) & set(genesets[keys[1]]))

            p = scipy.stats.hypergeom.sf(x, M, n, N)

            fc = ((x + 0.0) / N) / ((n + 0.0) / M)

            values = [("Enrichment", fc),
                      ("P-value", p)]
        else:
            enrichments = []
            pvals = []
            As = []
            Bs = []
            # pairwise comparison of all lists (including background)
            for a, b in itertools.combinations(keys, 2):
                N = len(set(genesets[a]))
                n = len(set(genesets[b]))
                x = len(set(genesets[a]) & set(genesets[b]))

                p = scipy.stats.hypergeom.sf(x, M, n, N)

                fc = ((x + 0.0) / N) / ((n + 0.0) / M)

                As.append(a)
                Bs.append(b)
                pvals.append(p)
                enrichments.append(fc)

            values = [("ListA", As),
                      ("ListB", Bs),
                      ("Enrichment", enrichments),
                      ("P-value", pvals)]

        return DataTree.listAsDataFrame(values, values_are_rows=True)
Ejemplo n.º 9
0
    def asSpreadSheet(self, dataframe, row_headers, col_headers, title):
        '''save the table as an xls file.

        Multiple files of the same Renderer/Tracker combination are
        distinguished by the title.

        Returns a ResultBlock whose ``xls`` attribute holds the
        openpyxl workbook.
        '''

        self.debug("%s: saving %i x %i table as spread-sheet'" %
                   (id(self),
                    len(row_headers),
                    len(col_headers)))

        # hierarchical (multi-level) indices may be split into one
        # worksheet per outer path
        is_hierarchical = isinstance(dataframe.index,
                                     pandas.core.index.MultiIndex)

        split = is_hierarchical and len(dataframe.index.levels) > 1

        # large flat tables use the append-only fast writer
        quick = len(dataframe) > 10000
        if quick and not split:
            # quick writing, only append method works
            # NOTE(review): optimized_write is an old openpyxl API -
            # confirm against the openpyxl version pinned by this project
            wb = openpyxl.Workbook(optimized_write=True)

            def fillWorksheet(ws, dataframe, title):
                # header row, then one appended row per dataframe row
                ws.append([""] + list(col_headers))
                for x, row in enumerate(dataframe.iterrows()):
                    ws.append([path2str(row[0])] + list(row[1]))

                # patch: maximum title length seems to be 31
                ws.title = title[:30]

        else:
            # do it cell-by-cell, this might be slow
            wb = openpyxl.Workbook(optimized_write=False)

            def fillWorksheet(ws, dataframe, title):
                # regex to detect rst hypelinks
                regex_link = re.compile('`(.*) <(.*)>`_')
                # write row names
                for row, row_name in enumerate(dataframe.index):
                    # rows and columns start at 1
                    c = ws.cell(row=row + 2, column=1)
                    c.value = row_name

                # write columns
                for column, column_name in enumerate(dataframe.columns):
                    # set column title
                    # rows and columns start at 1
                    c = ws.cell(row=1, column=column + 2)
                    c.value = column_name

                    # set column values
                    dataseries = dataframe[column_name]

                    if dataseries.dtype == object:
                        # object columns may contain rst links which are
                        # converted to spreadsheet hyperlinks
                        for row, value in enumerate(dataseries):
                            c = ws.cell(row=row + 2,
                                        column=column + 2)
                            value = str(value)
                            if value.startswith('`'):
                                c.value, c.hyperlink =\
                                    regex_link.match(value).groups()
                            else:
                                c.value = value
                    else:
                        for row, value in enumerate(dataseries):
                            c = ws.cell(row=row + 2,
                                        column=column + 2)
                            c.value = value
                # patch: maximum title length seems to be 31
                ws.title = re.sub("/", "_", title)[:30]

        if len(wb.worksheets) == 0:
            wb.create_sheet()

        if split:
            # create separate worksheets for nested indices
            nlevels = len(dataframe.index.levels)
            paths = map(tuple, DataTree.unique(
                [x[:nlevels - 1]
                 for x in dataframe.index.unique()]))

            # first sheet is a table of contents linking to each part
            ws = wb.worksheets[0]
            ws.title = 'Summary'
            ws.append(
                [""] * (nlevels - 1) + ["Worksheet", "Rows"])

            for row, path in enumerate(paths):
                # select data frame as cross-section
                work = dataframe.xs(path, axis=0)
                title = path2str(path)
                if len(title) > 30:
                    # worksheet titles are length-limited; fall back to
                    # a generated name
                    title = "sheet%i" % row

                ws.append(list(path) + [title, len(work)])
                # NOTE(review): with the header appended first, row + 1
                # appears to address the previous row - verify against
                # the indexing convention of the openpyxl version in use
                c = ws.cell(row=row + 1,
                            column=nlevels)
                # this does not work in oocalc
                c.hyperlink = "#%s!A1" % title
                fillWorksheet(wb.create_sheet(),
                              work,
                              title=title)
        else:
            fillWorksheet(wb.worksheets[0], dataframe,
                          title=title)

        # write result block
        lines = []
        lines.append("`%i x %i table <#$xls %s$#>`__" %
                     (len(row_headers), len(col_headers),
                      title))
        lines.append("")

        r = ResultBlock("\n".join(lines), title=title)
        # attach the workbook so the caller can serialize it
        r.xls = wb

        self.debug("%s: saved %i x %i table as spread-sheet'" %
                   (id(self),
                    len(row_headers),
                    len(col_headers)))
        return r
Ejemplo n.º 10
0
def main():
    """Command line entry point: inspect or dump a tracker's cache.

    With --view, list the available tracks and slices; otherwise
    write the cached data as a tsv/csv table to stdout.

    Raises ValueError if no tracker is supplied.
    """

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("-v", "--verbose", dest="loglevel", type="int",
                      help="loglevel. The higher, the more output [default=%default]")

    parser.add_option("-i", "--view", dest="view", action="store_true",
                      help="view keys in cache [default=%default]")

    parser.add_option("-t", "--tracker", dest="tracker", type="string",
                      help="tracker to use [default=%default]")

    parser.add_option("-a", "--tracks", dest="tracks", type="string",
                      help="tracks to include [default=%default]")

    parser.add_option("-s", "--slices", dest="slices", type="string",
                      help="slices to include [default=%default]")

    parser.add_option("-g", "--groupby", dest="groupby", type="choice",
                      choices=("track", "slice", "all"),
                      help="groupby by track or slice [default=%default]")

    parser.add_option("-f", "--format", dest="format", type="choice",
                      choices=("tsv", "csv"),
                      help="output format [default=%default]")

    parser.set_defaults(
        loglevel=2,
        view=False,
        tracker=None,
        tracks=None,
        slices=None,
        groupby="slice",
        format="tsv",
    )

    (options, args) = parser.parse_args()

    # the tracker comes either from --tracker or as the single
    # positional argument
    if len(args) != 1 and options.tracker is None:
        print(USAGE)
        raise ValueError("please supply a tracker.")

    if options.tracker:
        tracker = options.tracker
    else:
        tracker = args[0]

    cache = Cache.Cache(tracker, mode="r")

    if options.view:
        # only list the cache contents, do not dump any data
        keys = [x.split("/") for x in list(cache.keys())]
        sys.stdout.write("# available tracks\n")
        sys.stdout.write("track\n%s" % "\n".join(set([x[0] for x in keys])))
        sys.stdout.write("\n")
        sys.stdout.write("# available slices\n")
        sys.stdout.write("slice\n%s" % "\n".join(set([x[1] for x in keys])))
        sys.stdout.write("\n")
        return

    data = DataTree.fromCache(cache,
                              tracks=options.tracks,
                              slices=options.slices,
                              groupby=options.groupby)

    table, row_headers, col_headers = DataTree.tree2table(data)

    if options.format in ("tsv", "csv"):
        if options.format == "tsv":
            sep = "\t"
        elif options.format == "csv":
            sep = ","
        # leading separator leaves the corner cell above row headers empty
        sys.stdout.write(sep + sep.join(col_headers) + "\n")
        for h, row in zip(row_headers, table):
            sys.stdout.write("%s%s%s\n" % (h, sep, sep.join(row)))