def asDataFrame(data):
    '''convert data tree to pandas DataFrame.

    The data frame is multi-indexed according to the depth within the
    data tree. If the data-tree has only one level, the data will be
    single-indexed because pandas will not tolerate a single level
    MultiIndex.

    The code assumes that the data tree has a uniform depth and
    structure.

    The inner-most level in the *data* tree will be columns. However,
    if *data* is only a single-level dictionary, the keys in the
    dictionary will be row labels and the resultant dataframe will
    have only one column.

    Depending on the type of the leaf, the data frame is constructed
    as follows:

    Leaves are multiple arrays of the same size
        The data is assumed to be coordinate type data (x,y,z values).
        Leaves will be added to a dataframe as multiple columns.

    Leaves are a single array or arrays with dissimilar size
        A melted data frame will be constructed where the
        hierarchical index contains the path information and the data
        frame has a single column with the value.

    Leaf is a dataframe
        Dataframes will be concatenated. Existing indices of the
        dataframes will be preserved with the exception of the trivial
        index for the row numbers.

        Requires: All dataframes need to have the same columns.

    Leaf is a scalar
        Dataframes will be built from a nested dictionary.

    Special cases for backwards compatibility:

    1. Lowest level dictionary contains the following arrays: rows,
       columns, matrix - numpy matrix, convert to dataframe and apply
       as above

    2. Lowest level dictionary contains the following keys:
       '01', '10', '11' - Venn 2-set data, convert columns
       '001', '010', ... - Venn 3-set data, convert columns

    Pandas attempts to find a column data type that will fit all
    values in a column. Thus, if a column is numeric, but contains
    values such as "inf", "Inf", as well, the column type might be set
    to object or char.

    :param data: nested dictionary (data tree) of uniform depth.
    :return: a :class:`pandas.DataFrame`, or ``None`` if *data* is
        empty or ``None``.
    :raises NotImplementedError: if the data tree is not of uniform
        depth.
    '''
    if data is None or len(data) == 0:
        return None

    levels = getDepths(data)
    mi, ma = min(levels), max(levels)
    if mi != ma:
        raise NotImplementedError(
            'data tree not of uniform depth, min=%i, max=%i' % (mi, ma))

    labels = getPaths(data)

    ######################################################
    ######################################################
    ######################################################
    # check special cases
    MATRIX = ('rows', 'columns', 'matrix')
    VENN2 = ('10', '01', '11')
    VENN3 = ('010', '001', '011')

    dataframe_prune_index = True
    branches = list(getNodes(data, len(labels) - 2))
    for path, branch in branches:
        # numpy matrix - dictionary with keys matrix, rows, columns
        if len(set(branch.keys()).intersection(MATRIX)) == len(MATRIX):
            df = pandas.DataFrame(branch['matrix'],
                                  columns=branch['columns'],
                                  index=branch['rows'])
            setLeaf(data, path, df)
            dataframe_prune_index = False
        elif len(set(branch.keys()).intersection(VENN2)) == len(VENN2) or \
                len(set(branch.keys()).intersection(VENN3)) == len(VENN3):
            # sort so that 'labels' is not the first item
            # specify data such that 'labels' will be a single tuple entry
            values = sorted(branch.items())
            df = listAsDataFrame(values)
            dataframe_prune_index = False
            setLeaf(data, path, df)

    ######################################################
    ######################################################
    ######################################################
    labels = getPaths(data)

    # build multi-index
    leaves = list(getNodes(data, len(labels) - 1))

    # if set to a number, any superfluous levels in the
    # hierarchical index of the final dataframe will
    # be removed.
    expected_levels = None

    leaf = leaves[0][1]

    if Utils.isArray(leaf):
        # build dataframe from arrays
        dataframes = []
        index_tuples = []

        # not a nested dictionary
        if len(labels) == 1:
            branches = [(('all',), data)]
        else:
            branches = list(getNodes(data, max(0, len(labels) - 2)))

        # check if it is coordinate data:
        # all arrays within a branch need to have the same length
        # and there must be more than one array per branch.
        is_coordinate = True
        for path, subtree in branches:
            lengths = [len(x) for x in subtree.values()]
            if len(lengths) == 0:
                continue
            # all arrays have the same length - coordinate data
            if len(lengths) == 1 or min(lengths) != max(lengths):
                is_coordinate = False
                break

        if is_coordinate:
            debug('dataframe conversion: from array - coordinates')
            # NOTE: renamed loop variable (was `leaves`) to avoid
            # shadowing the node list collected above.
            for path, subtree in branches:
                # skip empty leaves
                if len(subtree) == 0:
                    continue
                dataframes.append(pandas.DataFrame(subtree))
                index_tuples.append(path)
        else:
            debug('dataframe conversion: from array - series')
            # arrays of unequal length are measurements -
            # build a melted data frame with a single column
            # given by the name of the path.
            for key, values in leaves:
                # skip empty leaves
                if len(values) == 0:
                    continue
                index_tuples.append(key)
                dataframes.append(
                    pandas.DataFrame(values, columns=('value',)))

        expected_levels = len(index_tuples[0])
        df = concatDataFrames(dataframes, index_tuples)

    elif Utils.isDataFrame(leaf):
        debug('dataframe conversion: from dataframe')

        # build dataframe from list of dataframes
        # by concatenation.
        # Existing indices of the dataframes will
        # be added as columns.
        dataframes = []
        index_tuples = []
        path_lengths = []
        levels = []
        for path, dataframe in leaves:
            path_lengths.append(len(path))
            if len(path) == 1:
                # if only one level, do not use tuple
                index_tuples.append(path[0])
            else:
                index_tuples.append(path)
            dataframes.append(dataframe)
            levels.append(Utils.getDataFrameLevels(
                dataframe, test_for_trivial=True))

        # consistency checks: uniform tree depth and uniform
        # index depth across all leaf dataframes
        assert min(path_lengths) == max(path_lengths)
        assert min(levels) == max(levels)

        # if only a single dataframe without given
        # tracks, return dataframe
        if index_tuples == ["all"]:
            df = dataframes[0]
            # if index is a trivial numeric (row-number) index,
            # relabel every row as "all".
            # pandas.Int64Index was removed in pandas 2.0; an
            # integer-dtype check on a flat index is the
            # version-safe equivalent.
            if not isinstance(df.index, pandas.MultiIndex) and \
                    pandas.api.types.is_integer_dtype(df.index.dtype):
                # BUGFIX: was ["all" * len(df)], which produced a
                # single-element list and failed on len(df) != 1
                df.index = ["all"] * len(df)
            return df

        expected_levels = min(path_lengths) + min(levels)
        df = concatDataFrames(dataframes, index_tuples)

    else:
        debug('dataframe conversion: from values')
        if len(labels) == 1:
            # { 'x': 1, 'y': 2 } -> DF with one row and two columns (x, y)
            # wrap dict views in list() - older pandas rejects
            # Python 3 view objects
            df = pandas.DataFrame(list(data.values()),
                                  index=list(data.keys()))
        elif len(labels) == 2:
            # { 'a': {'x':1, 'y':2}, 'b': {'y',2}
            # -> DF with two columns(x,y) and two rows(a,b)
            df = pandas.DataFrame.from_dict(data).transpose()
            # reorder so that order of columns corresponds to data
            df = df[labels[-1]]
        else:
            # We are dealing with a simple nested dictionary
            branches = list(getNodes(data, max(0, len(labels) - 3)))
            dataframes = []
            index_tuples = []
            for path, nested_dict in branches:
                # transpose to invert columns and rows
                # in cgatreport convention, the deeper
                # level in a dictionary in cgatreport are columns, while
                # in pandas they are rows.
                df = pandas.DataFrame(nested_dict).transpose()
                dataframes.append(df)
                index_tuples.append(path)
            df = concatDataFrames(dataframes, index_tuples)

    # remove index with row numbers
    if expected_levels is not None and dataframe_prune_index:
        Utils.pruneDataFrameIndex(df, expected_levels)

    # rename levels in hierarchical index.
    # use the public pandas.MultiIndex - the private
    # pandas.core.index path was removed in modern pandas.
    is_hierarchical = isinstance(df.index, pandas.MultiIndex)

    if is_hierarchical:
        names = list(df.index.names)
        default_names = ["track", "slice"] + \
            ["level%i" % x for x in range(len(names))]
        # fill in default names for unnamed levels only
        for x, y in enumerate(names):
            if y is None:
                names[x] = default_names[x]
        df.index.names = names
    else:
        df.index.name = 'track'

    return df