Example #1
0
def itermergeduplicates(table, key, missing):
    """Merge rows that share the same key value.

    For each group of rows with equal key, every non-key field is reduced
    to: the single value if all rows agree (ignoring `missing`), `missing`
    if no row supplies a value, or a ``Conflict`` of the disagreeing values.
    """
    source = iter(table)
    srcflds, source = iterpeek(source)

    # a bare field name means a simple (scalar) key; otherwise composite
    simplekey = isinstance(key, basestring)

    # output header: key field(s) first, then every non-key field
    header = [key] if simplekey else list(key)
    keyset = set([key]) if simplekey else set(key)
    mergeflds = [f for f in srcflds if f not in keyset]
    mergeidxs = [srcflds.index(f) for f in mergeflds]
    header.extend(mergeflds)
    yield tuple(header)

    # merge each group of rows sharing the same key
    for kval, group in rowgroupby(source, key):
        rows = list(group)
        outrow = [kval] if simplekey else list(kval)
        for idx in mergeidxs:
            # distinct non-missing values observed for this field;
            # short rows simply don't contribute a value
            seen = set(row[idx] for row in rows
                       if len(row) > idx and row[idx] != missing)
            if len(seen) == 1:
                outrow.append(seen.pop())
            elif not seen:
                outrow.append(missing)
            else:
                outrow.append(Conflict(seen))
        yield tuple(outrow)
Example #2
0
def iterrowreduce(source, key, reducer, fields):
    """Group rows of `source` by `key` and yield one row per group,
    produced by calling ``reducer(key, rows)``, preceded by the header."""
    if fields is None:
        # no explicit header supplied - reuse the source table's fields
        fields, source = iterpeek(source)
    yield tuple(fields)
    for groupkey, grouprows in rowgroupby(source, key):
        yield tuple(reducer(groupkey, grouprows))
Example #3
0
def iterhashlookupjoin(left, right, lkey, rkey, missing, lprefix, rprefix):
    """Left join of `left` against `right` using a hash lookup built from
    the right table; unmatched left rows are padded with `missing`."""
    leftit = iter(left)
    leftflds = leftit.next()

    rightflds, rightit = iterpeek(right)  # need the whole lot to pass to lookup
    from petl.util import lookupone
    lookup = lookupone(rightit, rkey, strict=False)

    # indices of the key fields on each side
    leftkind = asindices(leftflds, lkey)
    rightkind = asindices(rightflds, rkey)

    # key extractor for rows of the left table
    getlkey = operator.itemgetter(*leftkind)

    # non-key columns of the right table; key columns come from the left
    # table only, so the output does not duplicate fields
    rightvind = [i for i in range(len(rightflds)) if i not in rightkind]
    getrvals = rowgetter(*rightvind)

    # output header, optionally prefixed per side
    if lprefix is None:
        outflds = list(leftflds)
    else:
        outflds = [str(lprefix) + str(f) for f in leftflds]
    if rprefix is None:
        outflds.extend(getrvals(rightflds))
    else:
        outflds.extend(str(rprefix) + str(f) for f in getrvals(rightflds))
    yield tuple(outflds)

    def joinrows(lrow_, rrow_):
        # left row values followed by the right row's non-key values
        joined = list(lrow_)
        joined.extend(getrvals(rrow_))
        return tuple(joined)

    # padding used for left rows that find no match on the right
    padding = [missing] * len(rightvind)
    for lrow in leftit:
        k = getlkey(lrow)
        if k in lookup:
            yield joinrows(lrow, lookup[k])
        else:
            yield tuple(list(lrow) + padding)
Example #4
0
def guessdtype(table):
    """Build a numpy structured dtype for `table` by letting numpy infer
    a dtype for each field individually."""
    try:
        import numpy as np
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:
        # peek to get the header while leaving the table re-iterable
        fields, table = iterpeek(table, 1)
        cols = columns(table)
        # loading each column into a 1D array makes numpy infer its dtype
        return np.dtype([(f, np.array(cols[f]).dtype) for f in fields])
Example #5
0
def valuestoarray(vals, dtype=None, count=-1, sample=1000):
    """Load a sequence of values into a 1D numpy array.

    If `dtype` is not given, up to `sample` values are examined and the
    dtype is inferred by numpy. `count` is passed through to
    ``numpy.fromiter``.
    """
    try:
        import numpy as np
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:
        stream = iter(vals)

        if dtype is None:
            # peek at a sample of values and let numpy infer the dtype
            head, stream = iterpeek(stream, sample)
            dtype = np.array(head).dtype

        return np.fromiter(stream, dtype=dtype, count=count)
Example #6
0
def tohdf5(table,
           source,
           where=None,
           name=None,
           create=False,
           description=None,
           title='',
           filters=None,
           expectedrows=10000,
           chunkshape=None,
           byteorder=None,
           createparents=False,
           sample=1000):
    """
    Write to an HDF5 table. If `create` is `False`, assumes the table
    already exists, and attempts to truncate it before loading. If `create`
    is `True`, any existing table is dropped, and a new table is created;
    if `description` is None, the datatype will be guessed. E.g.::

        >>> from petl import look
        >>> look(table1)
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 1     | 'asdfgh' |
        +-------+----------+
        | 2     | 'qwerty' |
        +-------+----------+
        | 3     | 'zxcvbn' |
        +-------+----------+

        >>> from petlx.hdf5 import tohdf5, fromhdf5
        >>> tohdf5(table1, 'test1.h5', '/testgroup', 'testtable', create=True, createparents=True)
        >>> look(fromhdf5('test1.h5', '/testgroup', 'testtable'))
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 1     | 'asdfgh' |
        +-------+----------+
        | 2     | 'qwerty' |
        +-------+----------+
        | 3     | 'zxcvbn' |
        +-------+----------+

    `source` may be an HDF5 file name or an open ``tables.File``; `where`
    and `name` locate the table node. When the datatype is guessed, up to
    `sample` rows are examined. The remaining keyword arguments are passed
    through to ``createTable``.

    See also :func:`appendhdf5`.

    .. versionadded:: 0.3

    """

    it = iter(table)

    if create:

        # PyTables is an optional dependency, imported lazily
        try:
            import tables
        except ImportError as e:
            raise UnsatisfiedDependency(e, dep_message)

        # NOTE(review): uses the legacy PyTables <3.0 camelCase API
        # (openFile, getNode, removeNode, createTable) - confirm the
        # supported PyTables version.
        if isinstance(source, basestring):
            # assume it's the name of an HDF5 file
            h5file = tables.openFile(source,
                                     mode='a')  # don't replace the whole file!
        elif isinstance(source, tables.File):
            h5file = source
        else:
            raise Exception(
                'invalid source argument, expected file name or tables.File, found: %r'
                % source)

        # determine datatype
        if description is None:
            # peek at a sample of rows so guessdtype can infer column types,
            # while keeping the full row stream available for loading
            peek, it = iterpeek(it, sample)
            # use a numpy dtype
            description = guessdtype(peek)

        # check if the table node already exists
        try:
            h5table = h5file.getNode(where, name)
        except tables.NoSuchNodeError:
            pass
        else:
            # drop the node
            h5file.removeNode(where, name)

        # create the table
        # NOTE(review): if createTable (or anything above it) raises, a file
        # opened in this branch is not closed - the finally below only covers
        # the truncate/load phase; confirm whether that leak is acceptable.
        h5table = h5file.createTable(where,
                                     name,
                                     description,
                                     title=title,
                                     filters=filters,
                                     expectedrows=expectedrows,
                                     chunkshape=chunkshape,
                                     byteorder=byteorder,
                                     createparents=createparents)

    else:
        # table assumed to exist already; helper opens/locates it
        h5file, h5table = _get_hdf5_table(source, where, name, mode='a')

    try:
        # truncate the existing table
        h5table.truncate(0)

        # load the data
        _insert(it, h5table)

    finally:
        if isinstance(source, basestring):
            # close the file if we opened it here
            h5file.close()
Example #7
0
def toarray(table, dtype=None, count=-1, sample=1000):
    """
    Load data from the given `table` into a numpy structured array.

    If no `dtype` is given, up to `sample` rows are examined to infer an
    appropriate datatype for each field. The `dtype` may also be supplied
    as a comma-separated string of type codes (field names are taken from
    the source table), or as a dict mapping some field names to types, in
    which case the remaining fields' types are inferred from the sample.
    Any other `dtype` value is passed through to numpy unchanged. `count`
    is forwarded to ``numpy.fromiter``.
    """

    try:
        import numpy as np
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:

        rowit = iter(table)
        peek, rowit = iterpeek(rowit, sample)
        fields = rowit.next()

        if dtype is None:
            # infer every field's type from the sampled rows
            dtype = guessdtype(peek)

        elif isinstance(dtype, basestring):
            # comma-separated type codes; pair them with the source fields
            codes = [c.strip() for c in dtype.split(',')]
            dtype = [(f, t) for f, t in zip(fields, codes)]

        elif isinstance(dtype, dict) and ('names' not in dtype
                                          or 'formats' not in dtype):
            # partial specification: fill in unspecified fields by inference
            cols = columns(peek)
            full = {'names': [], 'formats': []}
            for f in fields:
                full['names'].append(f)
                if f in dtype and isinstance(dtype[f], tuple):
                    # fully specified as a tuple - keep the format part
                    full['formats'].append(dtype[f][0])
                elif f not in dtype:
                    # not specified at all - infer from the sampled column
                    full['formats'].append(np.array(cols[f]).dtype)
                else:
                    # format given directly
                    full['formats'].append(dtype[f])
            dtype = full

        # otherwise leave dtype exactly as given

        # numpy is fussy about having tuples, so normalise each row
        records = (tuple(row) for row in rowit)
        return np.fromiter(records, dtype=dtype, count=count)
Example #8
0
def tohdf5(
    table,
    source,
    where=None,
    name=None,
    create=False,
    description=None,
    title="",
    filters=None,
    expectedrows=10000,
    chunkshape=None,
    byteorder=None,
    createparents=False,
    sample=1000,
):
    """
    Write to an HDF5 table. If `create` is `False`, assumes the table
    already exists, and attempts to truncate it before loading. If `create`
    is `True`, any existing table is dropped, and a new table is created;
    if `description` is None, the datatype will be guessed. E.g.::

        >>> from petl import look
        >>> look(table1)
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 1     | 'asdfgh' |
        +-------+----------+
        | 2     | 'qwerty' |
        +-------+----------+
        | 3     | 'zxcvbn' |
        +-------+----------+

        >>> from petlx.hdf5 import tohdf5, fromhdf5
        >>> tohdf5(table1, 'test1.h5', '/testgroup', 'testtable', create=True, createparents=True)
        >>> look(fromhdf5('test1.h5', '/testgroup', 'testtable'))
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 1     | 'asdfgh' |
        +-------+----------+
        | 2     | 'qwerty' |
        +-------+----------+
        | 3     | 'zxcvbn' |
        +-------+----------+

    `source` may be an HDF5 file name or an open ``tables.File``; `where`
    and `name` locate the table node. When the datatype is guessed, up to
    `sample` rows are examined. The remaining keyword arguments are passed
    through to ``createTable``.

    See also :func:`appendhdf5`.

    .. versionadded:: 0.3

    """

    it = iter(table)

    if create:

        # PyTables is an optional dependency, imported lazily
        try:
            import tables
        except ImportError as e:
            raise UnsatisfiedDependency(e, dep_message)

        # NOTE(review): uses the legacy PyTables <3.0 camelCase API
        # (openFile, getNode, removeNode, createTable) - confirm the
        # supported PyTables version.
        if isinstance(source, basestring):
            # assume it's the name of an HDF5 file
            h5file = tables.openFile(source, mode="a")  # don't replace the whole file!
        elif isinstance(source, tables.File):
            h5file = source
        else:
            raise Exception("invalid source argument, expected file name or tables.File, found: %r" % source)

        # determine datatype
        if description is None:
            # peek at a sample of rows so guessdtype can infer column types,
            # while keeping the full row stream available for loading
            peek, it = iterpeek(it, sample)
            # use a numpy dtype
            description = guessdtype(peek)

        # check if the table node already exists
        try:
            h5table = h5file.getNode(where, name)
        except tables.NoSuchNodeError:
            pass
        else:
            # drop the node
            h5file.removeNode(where, name)

        # create the table
        # NOTE(review): if createTable (or anything above it) raises, a file
        # opened in this branch is not closed - the finally below only covers
        # the truncate/load phase; confirm whether that leak is acceptable.
        h5table = h5file.createTable(
            where,
            name,
            description,
            title=title,
            filters=filters,
            expectedrows=expectedrows,
            chunkshape=chunkshape,
            byteorder=byteorder,
            createparents=createparents,
        )

    else:
        # table assumed to exist already; helper opens/locates it
        h5file, h5table = _get_hdf5_table(source, where, name, mode="a")

    try:
        # truncate the existing table
        h5table.truncate(0)

        # load the data
        _insert(it, h5table)

    finally:
        if isinstance(source, basestring):
            # close the file if we opened it here
            h5file.close()
Example #9
0
def toarray(table, dtype=None, count=-1, sample=1000):
    """
    Load data from the given `table` into a numpy structured array.

    With `dtype` unset, up to `sample` rows are examined and a structured
    dtype is inferred. A string `dtype` is split on commas and paired with
    the source field names. A dict `dtype` without explicit 'names' and
    'formats' keys is treated as a partial specification: listed fields use
    the given format, all others are inferred. Anything else is handed to
    numpy as-is. `count` is forwarded to ``numpy.fromiter``.
    """

    try:
        import numpy as np
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:

        source = iter(table)
        peek, source = iterpeek(source, sample)
        fields = source.next()

        if dtype is None:
            # let guessdtype infer everything from the sample
            dtype = guessdtype(peek)

        elif isinstance(dtype, basestring):
            # pair each comma-separated type code with a source field name
            typestrings = [part.strip() for part in dtype.split(',')]
            dtype = [pair for pair in zip(fields, typestrings)]

        elif (isinstance(dtype, dict)
              and ('names' not in dtype or 'formats' not in dtype)):
            # partial specification - build a complete names/formats dict
            cols = columns(peek)
            names = []
            formats = []
            for fld in fields:
                names.append(fld)
                if fld in dtype and isinstance(dtype[fld], tuple):
                    # fully specified as a tuple - take the format part
                    formats.append(dtype[fld][0])
                elif fld not in dtype:
                    # unspecified - infer from the sampled column values
                    formats.append(np.array(cols[fld]).dtype)
                else:
                    # format given directly
                    formats.append(dtype[fld])
            dtype = {'names': names, 'formats': formats}

        else:
            pass  # leave dtype as-is

        # numpy requires each row to be a tuple
        rows = (tuple(row) for row in source)
        return np.fromiter(rows, dtype=dtype, count=count)