Beispiel #1
0
    def transform(self, data, path):
        '''Summarise an array of values.

        Arguments
        ---------
        data :
            Candidate array; tested with ``Utils.isArray``.
        path :
            Not used by this method.

        Returns
        -------
        The ``_data`` attribute of ``Stats.Summary(data)`` if *data* is an
        array, otherwise ``None``.
        '''
        debug("%s: called" % str(self))

        # Guard clause: anything that is not an array yields no summary.
        if not Utils.isArray(data):
            return None

        summary = Stats.Summary(data)
        return summary._data
Beispiel #2
0
    def transform(self, data, path):
        '''Convert an array into a histogram.

        Arguments
        ---------
        data :
            Candidate array; tested with ``Utils.isArray``.
        path :
            Only used for the debug messages.

        Returns
        -------
        ``None`` if *data* is not an array. Otherwise an ``odict`` with
        the two entries ``"bins"`` and ``"frequency"`` holding the result
        of ``self.toHistogram(data)``; the frequencies are passed through
        every callable in ``self.mConverters`` when bins were returned.
        '''
        debug("%s: called for path %s" % (str(self), str(path)))

        if not Utils.isArray(data):
            return None

        bins, values = self.toHistogram(data)
        # Identity test on purpose: ``bins != None`` is an equality test
        # that would be evaluated element-wise if toHistogram returns a
        # numpy array (not visible from here), making the ``if`` ambiguous.
        if bins is not None:
            for converter in self.mConverters:
                values = converter(values)

        debug("%s: completed for path %s" % (str(self), str(path)))
        header = "bins"
        return odict(((header, bins), ("frequency", values)))
Beispiel #3
0
    def transform(self, data, path):
        '''Build an rpy2 data frame from a dictionary of columns.

        Arguments
        ---------
        data :
            Mapping of column name to an array of values.
        path :
            Not used by this method.

        Returns
        -------
        ``rpy2.robjects.DataFrame`` with one R vector per entry in *data*.

        Raises
        ------
        ValueError
            If a column is not an array (per ``Utils.isArray``) or is
            empty.
        '''
        debug("%s: called" % str(self))

        t = odict()
        for minor_key, values in data.items():
            if not Utils.isArray(values):
                # Interpolate the message (it was previously passed as a
                # second exception argument) and report the offending
                # column rather than the enclosing dictionary.
                raise ValueError(
                    "expected a list for data frame creation, got %s"
                    % (type(values),))
            if len(values) == 0:
                # One-element tuple keeps the formatting safe when the
                # key itself is a tuple.
                raise ValueError("empty list for %s" % (minor_key,))

            # The R vector type is chosen from the first element only.
            v = values[0]
            if Utils.isInt(v):
                t[minor_key] = rpy2.robjects.IntVector(values)
            elif Utils.isFloat(v):
                t[minor_key] = rpy2.robjects.FloatVector(values)
            else:
                t[minor_key] = rpy2.robjects.StrVector(values)

        return rpy2.robjects.DataFrame(t)
    def render(self, work, path):
        '''Draw a box plot with one box per non-empty data series.

        Arguments
        ---------
        work :
            Nested mapping ``{line: {label: values}}``. Each inner mapping
            must hold exactly one entry and *values* must be an array
            (per ``Utils.isArray``).
        path :
            Passed through to ``self.endPlot``.

        Returns
        -------
        The result of ``self.endPlot(work, path)``.
        '''
        self.startPlot()

        # NOTE(review): the legend labels are collected but are not handed
        # to R.boxplot below - confirm whether they should label the boxes.
        legend = []
        all_data = []

        # items() rather than iteritems(): works on Python 2 and 3.
        for line, data in work.items():

            assert len(data) == 1, \
                "multicolumn data not supported yet: %s" % str(data)

            for label, values in data.items():
                assert Utils.isArray(values), \
                    "work is of type '%s'" % (type(values),)
                # Drop missing values; identity test so that only None
                # itself is filtered out.
                d = [x for x in values if x is not None]
                if len(d) > 0:
                    all_data.append(ro.FloatVector(d))
                    legend.append("/".join((line, label)))

        R.boxplot(all_data)

        return self.endPlot(work, path)
Beispiel #5
0
def asDataFrame(data):
    '''return data tree as a pandas dataframe.

    The data frame is multi-indexed according to the
    depth within the data tree.

    If the data-tree has only one level, the
    data will be single-indexed because pandas
    will not tolerate a single level MultiIndex.

    The code assumes that the data tree has a uniform
    depth and structure.

    The inner-most level in the *data* tree will be columns. However, if
    *data* is only a single-level dictionary, the keys in the dictionary
    will be row labels and the resultant dataframe will have only one
    column.

    Depending on the type of the leaf, the data frame is
    constructed as follows:

    Leaf is an array
        Leaves on the same branch are added as columns

        Requires:
            All leaves on the same branch need to have the
            same length.

    Leaf is a dataframe
        Dataframes will be concatenated.

        Requires:
            All dataframes need to have the same columns.

    Leaf is a scalar
        Dataframes will be built from a nested dictionary

    Special cases for backwards compatibility:

    1. Lowest level dictionary contains the following arrays:
        rows, columns, matrix - numpy matrix, convert to dataframe and
        apply as above

    2. Lowest level dictionary contains the following keys:
        '01', '10', '11' - Venn 2-set data, convert columns
        '001', '010', ... - Venn 3-set data, convert columns

    Pandas attempts to find a column data type that will
    fit all values in a column. Thus, if a column is numeric,
    but contains values such as "inf", "Inf", as well, the
    column type might be set to object or char.

    The index levels of the result are named ``track``, ``slice``,
    ``level0``, ``level1``, ... for a hierarchical index; a flat index
    is named ``track``.

    Note that the special-case conversion replaces leaves inside
    *data* in place (via ``setLeaf``).

    Returns ``None`` if *data* is ``None`` or empty.

    Raises ``NotImplementedError`` if the data tree is not of uniform
    depth.
    '''
    if data is None or len(data) == 0:
        return None

    levels = getDepths(data)
    mi, ma = min(levels), max(levels)
    if mi != ma:
        raise NotImplementedError(
            'data tree not of uniform depth, min=%i, max=%i' % (mi, ma))

    labels = getPaths(data)

    ######################################################
    # check special cases
    MATRIX = ('rows', 'columns', 'matrix')
    VENN2 = ('10', '01', '11')
    VENN3 = ('010', '001', '011')

    branches = list(getNodes(data, len(labels) - 2))
    for path, branch in branches:
        keys = set(branch.keys())

        if len(keys.intersection(MATRIX)) == len(MATRIX):
            # Numpy matrix - dictionary with keys matrix, rows, columns
            df = pandas.DataFrame(branch['matrix'],
                                  columns=branch['columns'],
                                  index=branch['rows'])
            setLeaf(data, path, df)

        elif len(keys.intersection(VENN2)) == len(VENN2) or \
                len(keys.intersection(VENN3)) == len(VENN3):
            # Venn data: one row per key, rows ordered by key. Sorting
            # presumably keeps a 'labels' entry from becoming the first
            # row (digit keys sort before it) - TODO confirm.
            values = sorted(branch.items())
            df = pandas.DataFrame([x[1] for x in values],
                                  index=[x[0] for x in values])
            setLeaf(data, path, df)

    ######################################################
    # The paths are computed again, presumably because setLeaf above
    # may have changed the shape of the tree.
    labels = getPaths(data)
    # The type of the first leaf decides how the frame is assembled.
    leaves = list(getNodes(data, len(labels) - 1))
    leaf = leaves[0][1]
    if Utils.isArray(leaf):
        # build dataframe from array
        dataframes = []
        index_tuples = []
        # not a nested dictionary
        if len(labels) == 1:
            branches = [('all', data)]
        else:
            branches = list(getNodes(data, max(0, len(labels) - 2)))

        for path, branch_leaves in branches:
            dataframes.append(pandas.DataFrame(branch_leaves))
            # NOTE(review): the original code had an if/else on
            # len(path) == 1 here whose branches were identical (both
            # appended ``path``), unlike the dataframe case below which
            # unpacks ``path[0]``. Behaviour is kept as it was.
            index_tuples.append(path)

        df = pandas.concat(dataframes, keys=index_tuples)

    elif Utils.isDataFrame(leaf):
        # build dataframe from list of dataframes
        # dataframes are not stacked
        # the existing index of dataframes
        # will be overwritten
        dataframes = []
        index_tuples = []
        for path, dataframe in leaves:
            if len(path) == 1:
                # if only one level, do not use tuple
                index_tuples.append(path[0])
            else:
                index_tuples.append(path)
            dataframes.append(dataframe)
        df = pandas.concat(dataframes, keys=index_tuples)
    else:

        if len(labels) == 1:
            # { 'x' : 1, 'y': 2 } -> DF with two rows (x, y) and a
            # single column. list() so that dictionary views are
            # accepted as well.
            df = pandas.DataFrame(list(data.values()),
                                  index=list(data.keys()))
        elif len(labels) == 2:
            # { 'a': {'x': 1, 'y': 2}, 'b': {'y': 2} }
            # -> DF with two columns (x, y) and two rows (a, b)
            df = pandas.DataFrame.from_dict(data).transpose()
            # reorder so that order of columns corresponds to data
            df = df[labels[-1]]
        else:
            # We are dealing with a simple nested dictionary
            branches = list(getNodes(data, max(0, len(labels) - 3)))
            dataframes = []
            index_tuples = []
            for path, nested_dict in branches:
                # transpose to invert columns and rows
                # in sphinxreport convention, the deeper
                # level in a dictionary in sphinxreport are columns, while
                # in pandas they are rows.
                df = pandas.DataFrame(nested_dict).transpose()
                dataframes.append(df)
                index_tuples.extend([path])
            df = pandas.concat(dataframes, keys=index_tuples)

    # rename levels in hierarchical index
    # pandas.MultiIndex is the public name of the class; the private
    # pandas.core.index module is not available in newer pandas.
    is_hierarchical = isinstance(df.index, pandas.MultiIndex)
    if is_hierarchical:
        nlevels = len(df.index.names)
        # 'level%i' has to be formatted with the counter, otherwise every
        # additional level gets the same literal name.
        d = ['track', 'slice'] + ['level%i' % i for i in range(nlevels - 2)]
        df.index.names = d[:nlevels]
    else:
        df.index.name = 'track'

    return df