# Example 1
    def __call__(self, data):
        """Apply ``self.transform`` to *data*, optionally per index group.

        When ``self.nlevels`` is None the transformation is applied to
        the whole dataframe in one call.  Otherwise *data* is grouped by
        the index levels computed by ``Utils.getGroupLevels`` and each
        group is transformed separately; groups for which the
        transformation returns None are dropped.  The per-group results
        are concatenated with their group keys and, when
        ``self.prune_dataframe`` is set, the index is pruned back to the
        original levels.
        """
        # Ungrouped shortcut: transform everything at once.
        if self.nlevels is None:
            return self.transform(data)

        group_levels = Utils.getGroupLevels(data, modify_levels=self.nlevels)

        transformed, group_keys = [], []
        for group_key, group in data.groupby(level=group_levels):
            self.debug('applying transformation on group %s' % str(group_key))
            result = self.transform(group)
            if result is None:
                continue
            transformed.append(result)
            group_keys.append(group_key)

        combined = pandas.concat(transformed, keys=group_keys)

        if self.prune_dataframe:
            # reset dataframe index - keep the same levels
            Utils.pruneDataFrameIndex(combined, original=data)

        self.debug("transform: finished")

        return combined
# Example 2
def as_dataframe(data, tracker=None):
    '''convert data tree to pandas DataFrame.

    The data frame is multi-indexed according to the depth within the
    data tree.

    If the data-tree has only one level, the data will be
    single-indexed because pandas will not tolerate a single level
    MultiIndex.

    The code assumes that the data tree has a uniform
    depth and structure.

    The inner-most level in the *data* tree will be columns. However,
    if *data* is only a single-level dictionary, the keys in the
    dictionary will be row labels and the resultant dataframe will
    have only one column.

    Depending on the type of the leaf, the data frame is constructed
    as follows:

    Leaves are multiple arrays of the same size

        The data is assumed to be coordinate type data (x,y,z
        values). Leaves will be added to a dataframe as multiple
        columns.

    Leaves are a single array or arrays with dissimilar size

        A melted data frame will be constructed where
        the hierarchical index contains the path information
        and the data frame has a single column with the value.

    Leaf is a dataframe

        Dataframes will be concatenated. Existing indices
        of the dataframes will be preserved with the exception
        of the trivial index for the row numbers.

        Requires:
            All dataframes need to have the same columns.

    Leaf is a scalar
        Dataframes will be built from a nested dictionary

    Special cases for backwards compatibility:

    1. Lowest level dictionary contains the following arrays: rows,
        columns, matrix - numpy matrix, convert to dataframe and apply
        as above

    2. Lowest level dictionary contains the following keys:
        '01', '10', '11' - Venn 2-set data, convert columns
        '001', '010', ... - Venn 3-set data, convert columns

    Pandas attempts to find a column data type that will
    fit all values in a column. Thus, if a column is numeric,
    but contains values such as "inf", "Inf", as well, the
    column type might be set to object or char.

    :param data: nested dictionary (data tree) to convert.
    :param tracker: optional object; if it provides a ``levels``
        attribute, those names are used for unnamed index levels.
    :returns: a :class:`pandas.DataFrame`, or None for empty input.
    :raises NotImplementedError: if the data tree is not of uniform depth.
    '''
    if data is None or len(data) == 0:
        return None

    logger = Component.get_logger()

    levels = getDepths(data)
    if len(levels) == 0:
        return None

    mi, ma = min(levels), max(levels)
    if mi != ma:
        raise NotImplementedError(
            'data tree not of uniform depth, min=%i, max=%i' %
            (mi, ma))

    labels = getPaths(data)

    ######################################################
    # check special cases (matrix and Venn-diagram leaves)
    ######################################################
    MATRIX = ('rows', 'columns', 'matrix')
    VENN2 = ('10', '01', '11')
    VENN3 = ('010', '001', '011')
    dataframe_prune_index = True
    branches = list(getNodes(data, len(labels) - 2))
    for path, branch in branches:
        # numpy matrix - dictionary with keys matrix, rows, columns
        if len(set(branch.keys()).intersection(MATRIX)) == len(MATRIX):
            df = pandas.DataFrame(branch['matrix'],
                                  columns=branch['columns'],
                                  index=branch['rows'])
            setLeaf(data, path, df)
            dataframe_prune_index = False

        elif len(set(branch.keys()).intersection(VENN2)) == len(VENN2) or \
                len(set(branch.keys()).intersection(VENN3)) == len(VENN3):
            # sort so that 'labels' is not the first item
            # specify data such that 'labels' will a single tuple entry
            values = sorted(branch.items())
            df = listAsDataFrame(values)
            dataframe_prune_index = False
            setLeaf(data, path, df)

    ######################################################
    # build multi-index from the (possibly rewritten) tree
    ######################################################
    labels = getPaths(data)
    leaves = list(getNodes(data, len(labels) - 1))

    # if set to a number, any superfluous levels in the
    # hierarchical index of the final dataframe will
    # be removed.
    expected_levels = None

    # inspect the first leaf to decide how to build the dataframe;
    # the tree is assumed to be homogeneous (see docstring)
    leaf = leaves[0][1]

    if is_array(leaf):

        # build dataframe from arrays
        dataframes = []
        index_tuples = []

        # not a nested dictionary
        if len(labels) == 1:
            branches = [(('all',), data)]
        else:
            branches = list(getNodes(data, max(0, len(labels) - 2)))

        # check if it is coordinate data:
        # all arrays within a branch need to have the same length,
        # and there must be more than one array per branch
        is_coordinate = True
        for path, subtree in branches:
            lengths = [len(x) for x in subtree.values()]
            if len(lengths) == 0:
                continue

            # a single array, or arrays of unequal length, is
            # treated as measurement (series) data instead
            if len(lengths) == 1 or min(lengths) != max(lengths):
                is_coordinate = False
                break

        if is_coordinate:
            logger.debug('dataframe conversion: from array - coordinates')
            for path, subtree in branches:
                # skip empty leaves
                if len(subtree) == 0:
                    continue
                dataframes.append(pandas.DataFrame(subtree))
                index_tuples.append(path)
        else:
            logger.debug('dataframe conversion: from array - series')
            # arrays of unequal length are measurements
            # build a melted data frame with a single column
            # given by the name of the path.
            for key, leave in leaves:
                # skip empty leaves
                if len(leave) == 0:
                    continue
                index_tuples.append(key)
                dataframes.append(pandas.DataFrame(leave,
                                                   columns=('value',)))

        expected_levels = len(index_tuples[0])
        df = concatDataFrames(dataframes, index_tuples)

    elif is_dataframe(leaf):
        logger.debug('dataframe conversion: from dataframe')

        # build dataframe from list of dataframes
        # by concatenation.
        # Existing indices of the dataframes will
        # be added as columns.
        dataframes = []
        index_tuples = []
        path_lengths = []
        levels = []
        for path, dataframe in leaves:
            if len(dataframe) == 0:
                continue
            path_lengths.append(len(path))
            if len(path) == 1:
                # if only one level, do not use tuple
                index_tuples.append(path[0])
            else:
                index_tuples.append(path)
            dataframes.append(dataframe)

            levels.append(Utils.getDataFrameLevels(
                dataframe,
                test_for_trivial=True))

        if len(path_lengths) == 0:
            return None

        # all leaves must sit at the same depth and have the
        # same number of index levels for concatenation to work
        assert min(path_lengths) == max(path_lengths)
        assert min(levels) == max(levels)

        # if only a single dataframe without given
        # tracks, return dataframe
        if index_tuples == ["all"]:
            df = dataframes[0]
            # if index is a simple numeric list, change to "all"
            if isinstance(df.index, pandas.Int64Index) and \
               df.index.name is None:
                df.index = ["all"] * len(df)
            return df

        expected_levels = min(path_lengths) + min(levels)
        df = concatDataFrames(dataframes, index_tuples)

    else:
        logger.debug('dataframe conversion: from values')
        if len(labels) == 1:
            # { 'x': 1, 'y': 2 } -> DF with one row and two columns (x, y)
            df = pandas.DataFrame(list(data.values()), index=list(data.keys()))
        elif len(labels) == 2:
            # { 'a': {'x':1, 'y':2}, 'b': {'y',2}
            # -> DF with two columns(x,y) and two rows(a,b)
            df = pandas.DataFrame.from_dict(data).transpose()
            # reorder so that order of columns corresponds to data
            df = df[labels[-1]]
        else:
            # We are dealing with a simple nested dictionary
            branches = list(getNodes(data, max(0, len(labels) - 3)))
            dataframes = []
            index_tuples = []
            for path, nested_dict in branches:
                # transpose to invert columns and rows
                # in cgatreport convention, the deeper
                # level in a dictionary in cgatreport are columns, while
                # in pandas they are rows.
                df = pandas.DataFrame(nested_dict).transpose()
                dataframes.append(df)
                index_tuples.extend([path])
            df = concatDataFrames(dataframes, index_tuples)

    # remove index with row numbers
    if expected_levels is not None and dataframe_prune_index:
        Utils.pruneDataFrameIndex(df, expected_levels)

    # rename levels in hierarchical index
    is_hierarchical = isinstance(df.index,
                                 pandas.core.index.MultiIndex)

    if is_hierarchical:
        n = list(df.index.names)
        # Default names for unnamed index levels.  A tracker may
        # override them by providing a "levels" attribute.
        # NOTE: previously the defaults were only assigned inside an
        # except-clause, so a None tracker left the name list unbound
        # and triggered a NameError when a level name was missing.
        level_names = ["track", "slice"] + \
            ["level%i" % x for x in range(len(n))]
        if tracker is not None:
            try:
                level_names = getattr(tracker, "levels")
            except AttributeError:
                # tracker defines no custom level names - keep defaults
                pass

        for x, y in enumerate(n):
            if y is None:
                n[x] = level_names[x]
        df.index.names = n
    else:
        df.index.name = 'track'

    return df