    def transform(self, data):

        # check if data is melted:
        if len(data.columns) != 1:
            raise ValueError(
                'transformer requires dataframe with '
                'a single column, got %s' % data.columns)
        column = data.columns[0]
        # iterate over lowest levels to build a dictionary of
        # sets
        genesets = {}
        nlevels = Utils.getDataFrameLevels(data)
        for key, group in data.groupby(level=list(range(nlevels))):
            if "background" in key and not self.background:
                continue
            genesets[key] = set(group[column])

        values = []
        if len(genesets) == 2:
            keys = list(genesets.keys())
            a, b = genesets[keys[0]], genesets[keys[1]]

            values.append(("10", len(a - b)))
            values.append(("01", len(b - a)))
            values.append(("11", len(a & b)))
            values.append(
                ("labels", list(map(path2str, keys))))
        elif len(genesets) == 3:
            keys = list(genesets.keys())
            a, b, c = (genesets[keys[0]], genesets[keys[1]],
                       genesets[keys[2]])

            values.append(("100", len(a - b - c)))
            values.append(("010", len(b - a - c)))
            values.append(("001", len(c - a - b)))
            values.append(("110", len((a & b) - c)))
            values.append(("101", len((a & c) - b)))
            values.append(("011", len((b & c) - a)))
            values.append(("111", len(a & b & c)))
            values.append(
                ("labels", list(map(path2str, keys))))
        else:
            raise ValueError(
                "Can currently only cope with 2 or 3 way intersections")

        return DataTree.listAsDataFrame(values)
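
A minimal sketch of the set arithmetic this transformer performs, using plain pandas. The index levels, set names and column name below are invented for illustration:

import pandas

data = pandas.DataFrame(
    {"gene_id": ["g1", "g2", "g3", "g2", "g3", "g4"]},
    index=pandas.MultiIndex.from_tuples(
        [("setA", i) for i in range(3)] + [("setB", i) for i in range(3)],
        names=["track", "row"]))

# build one set per value of the grouping level, as the transformer does
genesets = {key: set(group["gene_id"])
            for key, group in data.groupby(level="track")}
a, b = genesets["setA"], genesets["setB"]
print(len(a - b), len(b - a), len(a & b))  # prints: 1 1 2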
Example 2
    def render(self):
        '''supply the :class:`Renderer.Renderer` with the data to render.

        The data supplied will depend on the ``groupby`` option.

        returns a ResultBlocks data structure.
        '''
        self.debug("%s: rendering data started for %i items" %
                   (self,
                    len(self.data)))

        # initiate output structure
        results = ResultBlocks(title="")

        dataframe = self.data

        # dataframe.write_csv("test.csv")

        if dataframe is None:
            self.warn("%s: no data after conversion" % self)
            raise ValueError("no data for renderer")

        # special patch: set column names to pruned levels
        # if there are no column names
        if len(dataframe.columns) == len(self.pruned):
            if list(dataframe.columns) == list(range(len(dataframe.columns))):
                dataframe.columns = [x[1] for x in self.pruned]

        nlevels = Utils.getDataFrameLevels(dataframe)

        self.debug("%s: rendering data started. "
                   "levels=%i, group_level=%s" %
                   (self, nlevels,
                    str(self.group_level)))

        if self.group_level < 0:
            # no grouping: the renderer accepts a dataframe with
            # any number of index levels, and no explicit grouping
            # has been requested.
            results.append(self.renderer(dataframe, path=()))
        else:
            level = Utils.getGroupLevels(
                dataframe,
                max_level=self.group_level+1)

            self.debug("%s: grouping by levels: %s" %
                       (self, str(level)))

            for key, work in dataframe.groupby(level=level):

                try:
                    results.append(self.renderer(work,
                                                 path=key))
                except Exception:
                    self.error("%s: exception in rendering" % self)
                    results.append(
                        ResultBlocks(Utils.buildException("rendering")))

        if len(results) == 0:
            self.warn("renderer returned no data.")
            raise ValueError("renderer returned no data.")

        self.debug("%s: rendering data finished with %i blocks" %
                   (self.tracker, len(results)))

        return results
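
The grouping step above relies on pandas grouping over a subset of hierarchical index levels. A self-contained sketch of that mechanism; the data and index names are hypothetical:

import pandas

df = pandas.DataFrame(
    {"value": [1, 2, 3, 4]},
    index=pandas.MultiIndex.from_tuples(
        [("t1", "s1"), ("t1", "s2"), ("t2", "s1"), ("t2", "s2")],
        names=["track", "slice"]))

# group on the outermost level only: key is the level value,
# work is the sub-dataframe holding all rows under that key
for key, work in df.groupby(level=0):
    print(key, len(work))  # prints: t1 2, then t2 2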
Example 3
    def group(self):
        '''rearrange dataframe for desired grouping.

        Through grouping the dataframe is rearranged such that the
        level at which data will be grouped will be the first level in
        hierarchical index.

        If grouping by "track" is set, additional level will be added
        to ensure that grouping will happen.

        If grouping by "slice" is set, the first two levels will
        be swopped.

        The group level indicates at which level in the nested dictionary
        the data will be grouped with 0 indicating that everything will
        grouped together.
        '''

        nlevels = Utils.getDataFrameLevels(self.data)
        try:
            default_level = self.renderer.group_level
        except AttributeError:
            # a user renderer that is a pure function does not
            # have a group_level attribute
            default_level = 0

        groupby = self.groupby

        if str(default_level).startswith("force"):
            groupby = default_level[len("force-"):]
        elif groupby == "default":
            groupby = default_level

        if groupby == "none":
            self.group_level = nlevels - 1

        elif groupby == "track":
            self.group_level = 0

        elif groupby == "slice":
            # rearrange first two levels in data tree
            if nlevels > 1:
                self.data = self.data.reorder_levels(
                    [1, 0] + list(range(2, nlevels)))
            self.group_level = 0

        elif groupby == "all":
            # group everything together
            self.group_level = -1

        elif isinstance(groupby, int):
            # get group level from Renderer
            if groupby < 0:
                # negative levels - subtract from lowest
                # level
                g = nlevels + groupby - 1
            else:
                g = groupby
            self.group_level = g
        else:
            self.group_level = 0

        self.debug("grouping: nlevel=%i, groupby=%s, default=%s, group=%i" %
                   (nlevels, groupby, str(default_level), self.group_level))

        return self.data
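
The level swap performed for the "slice" case can be reproduced stand-alone with reorder_levels; the dataframe below is hypothetical:

import pandas

df = pandas.DataFrame(
    {"value": [1, 2, 3, 4]},
    index=pandas.MultiIndex.from_tuples(
        [("t1", "s1"), ("t1", "s2"), ("t2", "s1"), ("t2", "s2")],
        names=["track", "slice"]))

# swap the first two index levels, keeping any deeper levels in place
nlevels = df.index.nlevels
swapped = df.reorder_levels([1, 0] + list(range(2, nlevels)))
print(swapped.index.names)  # prints: ['slice', 'track']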
Example 4

    def transform(self, data):

        # check if data is melted:
        if len(data.columns) != 1:
            raise ValueError(
                'transformer requires dataframe with '
                'a single column, got %s' % data.columns)
        column = data.columns[0]

        # iterate over lowest levels to build a dictionary of
        # sets
        genesets = {}
        nlevels = Utils.getDataFrameLevels(data)
        for key, group in data.groupby(level=list(range(nlevels))):
            genesets[path2str(key)] = set(group[column])

        keys = list(genesets.keys())

        background = None
        foreground = []
        for key in keys:
            if "background" in key:
                background = genesets[key]
            else:
                foreground.append(key)

        if len(keys) < 3 or background is None:
            raise ValueError(
                "Expected at least 3 lists, with one called background, "
                "instead got %i lists called %s" %
                (len(keys), ", ".join(keys)))

        missing = {
            y: [str(x) for x in genesets[y]
                if x not in background] for y in foreground}

        if any([len(missing[x]) > 0 for x in missing]):
            missing_items = "\n\t".join(
                ["%s:\t%s" % (x, ",".join(missing[x])) for x in missing])
            raise ValueError(
                "Found items in lists not in background. "
                "Missing items:\n\t %s" % missing_items)

        M = len(background)
        if len(foreground) == 2:

            n = len(genesets[foreground[1]])
            N = len(genesets[foreground[0]])
            x = len(genesets[foreground[0]] & genesets[foreground[1]])

            p = scipy.stats.hypergeom.sf(x, M, n, N)

            fc = (x / N) / (n / M)

            values = [("Enrichment", fc),
                      ("P-value", p)]
        else:
            enrichments = []
            pvals = []
            As = []
            Bs = []
            for a, b in itertools.combinations(foreground, 2):
                N = len(genesets[a])
                n = len(genesets[b])
                x = len(genesets[a] & genesets[b])

                p = scipy.stats.hypergeom.sf(x, M, n, N)

                fc = (x / N) / (n / M)

                As.append(a)
                Bs.append(b)
                pvals.append(p)
                enrichments.append(fc)

            values = [("ListA", As),
                      ("ListB", Bs),
                      ("Enrichment", enrichments),
                      ("P-value", pvals)]

        return DataTree.listAsDataFrame(values, values_are_rows=True)
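
The enrichment computation can be exercised on its own. This sketch mirrors the parametrization used above with invented set sizes; note that scipy's sf(x) is the upper-tail probability P(X > x):

import scipy.stats

M = 10000  # size of the background (population)
N = 300    # size of list A (number of draws)
n = 200    # size of list B (success states in the population)
x = 25     # size of the overlap between A and B

p = scipy.stats.hypergeom.sf(x, M, n, N)
fc = (x / N) / (n / M)  # fold enrichment over random expectation

print("enrichment=%.2f, p-value=%.2e" % (fc, p))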
Example 5
def as_dataframe(data, tracker=None):
    '''convert data tree to pandas DataFrame.

    The data frame is multi-indexed according to the depth within the
    data tree.

    If the data-tree has only one level, the data will be
    single-indexed because pandas will not tolerate a single level
    MultiIndex.

    The code assumes that the data tree has a uniform
    depth and structure.

    The inner-most level in the *data* tree will be columns. However,
    if *data* is only a single-level dictionary, the keys in the
    dictionary will be row labels and the resultant dataframe will
    have only one column.

    Depending on the type of the leaf, the data frame is constructed
    as follows:

    Leaves are multiple arrays of the same size

        The data is assumed to be coordinate type data (x,y,z
        values). Leaves will be added to a dataframe as multiple
        columns.

    Leaves are a single array or arrays with dissimilar size

        A melted data frame will be constructed where
        the hierarchical index contains the path information
        and the data frame has a single column with the value.

    Leaf is a dataframe

        Dataframes will be concatenated. Existing indices
        of the dataframes will be preserved with the exception
        of the trivial index for the row numbers.

        Requires:
            All dataframes need to have the same columns.

    Leaf is a scalar
        Dataframes will be built from a nested dictionary

    Special cases for backwards compatibility:

    1. Lowest level dictionary contains the following arrays: rows,
        columns, matrix - a numpy matrix; converted to a dataframe
        and handled as above

    2. Lowest level dictionary contains the following keys:
        '01', '10', '11' - Venn 2-set data, convert columns
        '001', '010', ... - Venn 3-set data, convert columns

    Pandas attempts to find a column data type that will
    fit all values in a column. Thus, if a column is numeric
    but also contains values such as "inf" or "Inf", the
    column type might be set to object.

    '''
    if data is None or len(data) == 0:
        return None

    logger = Component.get_logger()

    levels = getDepths(data)
    if len(levels) == 0:
        return None

    mi, ma = min(levels), max(levels)
    if mi != ma:
        raise NotImplementedError(
            'data tree not of uniform depth, min=%i, max=%i' %
            (mi, ma))

    labels = getPaths(data)

    ######################################################
    ######################################################
    ######################################################
    # check special cases
    MATRIX = ('rows', 'columns', 'matrix')
    VENN2 = ('10', '01', '11')
    VENN3 = ('010', '001', '011')
    dataframe_prune_index = True
    branches = list(getNodes(data, len(labels) - 2))
    for path, branch in branches:
        # numpy matrix - dictionary with keys matrix, rows, columns
        if len(set(branch.keys()).intersection(MATRIX)) == len(MATRIX):
            df = pandas.DataFrame(branch['matrix'],
                                  columns=branch['columns'],
                                  index=branch['rows'])
            setLeaf(data, path, df)
            dataframe_prune_index = False

        elif len(set(branch.keys()).intersection(VENN2)) == len(VENN2) or \
                len(set(branch.keys()).intersection(VENN3)) == len(VENN3):
            # sort so that 'labels' is not the first item
            # specify data such that 'labels' will be a single tuple entry
            values = sorted(branch.items())
            df = listAsDataFrame(values)
            dataframe_prune_index = False
            setLeaf(data, path, df)

    ######################################################
    ######################################################
    ######################################################
    labels = getPaths(data)
    # build multi-index
    leaves = list(getNodes(data, len(labels) - 1))

    # if set to a number, any superfluous levels in the
    # hierarchical index of the final dataframe will
    # be removed.
    expected_levels = None

    leaf = leaves[0][1]

    if is_array(leaf):

        # build dataframe from arrays
        dataframes = []
        index_tuples = []

        # not a nested dictionary
        if len(labels) == 1:
            branches = [(('all',), data)]
        else:
            branches = list(getNodes(data, max(0, len(labels) - 2)))

        # check if it is coordinate data
        # All arrays need to have the same length
        is_coordinate = True
        for path, subtree in branches:
            lengths = [len(x) for x in list(subtree.values())]
            if len(lengths) == 0:
                continue

            # not coordinate data if there is only a single array
            # or the arrays differ in length
            if len(lengths) == 1 or min(lengths) != max(lengths):
                is_coordinate = False
                break

        if is_coordinate:
            logger.debug('dataframe conversion: from array - coordinates')
            for path, subtree in branches:
                # skip empty leaves
                if len(subtree) == 0:
                    continue
                dataframes.append(pandas.DataFrame(subtree))
                index_tuples.append(path)
        else:
            logger.debug('dataframe conversion: from array - series')
            # arrays of unequal length are measurements
            # build a melted data frame with a single column
            # given by the name of the path.
            for key, leaf_data in leaves:
                # skip empty leaves
                if len(leaf_data) == 0:
                    continue
                index_tuples.append(key)
                dataframes.append(pandas.DataFrame(leaf_data,
                                                   columns=('value',)))

        expected_levels = len(index_tuples[0])
        df = concatDataFrames(dataframes, index_tuples)

    elif is_dataframe(leaf):
        logger.debug('dataframe conversion: from dataframe')

        # build dataframe from list of dataframes
        # by concatenation.
        # Existing indices of the dataframes will
        # be added as columns.
        dataframes = []
        index_tuples = []
        path_lengths = []
        levels = []
        for path, dataframe in leaves:
            if len(dataframe) == 0:
                continue
            path_lengths.append(len(path))
            if len(path) == 1:
                # if only one level, do not use tuple
                index_tuples.append(path[0])
            else:
                index_tuples.append(path)
            dataframes.append(dataframe)

            levels.append(Utils.getDataFrameLevels(
                dataframe,
                test_for_trivial=True))

        if len(path_lengths) == 0:
            return None

        assert min(path_lengths) == max(path_lengths)
        assert min(levels) == max(levels)

        # if only a single dataframe without given
        # tracks, return dataframe
        if index_tuples == ["all"]:
            df = dataframes[0]
            # if index is a simple numeric list, change to "all"
            if isinstance(df.index, pandas.Int64Index) and \
               df.index.name is None:
                df.index = ["all"] * len(df)
            return df

        expected_levels = min(path_lengths) + min(levels)
        df = concatDataFrames(dataframes, index_tuples)

    else:
        logger.debug('dataframe conversion: from values')
        if len(labels) == 1:
            # { 'x': 1, 'y': 2 } -> DF with one row and two columns (x, y)
            df = pandas.DataFrame(list(data.values()), index=list(data.keys()))
        elif len(labels) == 2:
            # { 'a': {'x': 1, 'y': 2}, 'b': {'y': 2} }
            # -> DF with two columns(x,y) and two rows(a,b)
            df = pandas.DataFrame.from_dict(data).transpose()
            # reorder so that order of columns corresponds to data
            df = df[labels[-1]]
        else:
            # We are dealing with a simple nested dictionary
            branches = list(getNodes(data, max(0, len(labels) - 3)))
            dataframes = []
            index_tuples = []
            for path, nested_dict in branches:
                # transpose to invert columns and rows:
                # in cgatreport convention, the deepest level of a
                # dictionary holds the columns, while in pandas it
                # holds the rows.
                df = pandas.DataFrame(nested_dict).transpose()
                dataframes.append(df)
                index_tuples.append(path)
            df = concatDataFrames(dataframes, index_tuples)

    # remove index with row numbers
    if expected_levels is not None and dataframe_prune_index:
        Utils.pruneDataFrameIndex(df, expected_levels)

    # rename levels in hierarchical index
    is_hierarchical = isinstance(df.index, pandas.MultiIndex)

    if is_hierarchical:
        n = list(df.index.names)
        # default level names; a tracker may override them
        # via a "levels" attribute
        level_names = ["track", "slice"] + \
            ["level%i" % x for x in range(len(n))]
        if tracker is not None:
            try:
                level_names = getattr(tracker, "levels")
            except AttributeError:
                pass

        for x, y in enumerate(n):
            if y is None:
                n[x] = level_names[x]
        df.index.names = n
    else:
        df.index.name = 'track'

    return df
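
A rough usage sketch of the scalar-leaf path for a two-level data tree. The nested dictionary is invented, and concatDataFrames is assumed to behave like pandas.concat with a keys argument:

import pandas

data = {"track1": {"x": 1, "y": 2},
        "track2": {"x": 3, "y": 4}}

# the two-level branch above: outer keys become rows,
# inner keys become columns
df = pandas.DataFrame.from_dict(data).transpose()
print(df)
#         x  y
# track1  1  2
# track2  3  4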