def itermergeduplicates(table, key, missing):
    """Merge groups of rows sharing the same key value. For each non-key
    field within a group: a single distinct non-missing value passes
    through unchanged, no non-missing values at all yields `missing`, and
    multiple distinct values yield a Conflict containing them all."""
    it = iter(table)
    hdr, it = iterpeek(it)

    # output header: key field(s) first, then the remaining value fields
    if isinstance(key, basestring):
        outhdr = [key]
        keyset = set([key])
    else:
        outhdr = list(key)
        keyset = set(key)
    valfields = [f for f in hdr if f not in keyset]
    validx = [hdr.index(f) for f in valfields]
    outhdr.extend(valfields)
    yield tuple(outhdr)

    # merge each group of rows sharing the same key value
    for k, group in rowgroupby(it, key):
        rows = list(group)
        if isinstance(key, basestring):
            outrow = [k]
        else:
            outrow = list(k)
        for i in validx:
            # collect the distinct non-missing values for this field
            # across the group; short rows simply contribute nothing
            distinct = set()
            for row in rows:
                if len(row) > i and row[i] != missing:
                    distinct.add(row[i])
            if len(distinct) == 0:
                outrow.append(missing)
            elif len(distinct) == 1:
                outrow.append(distinct.pop())
            else:
                outrow.append(Conflict(distinct))
        yield tuple(outrow)
def iterrowreduce(source, key, reducer, fields):
    """Yield a header row followed by one row per key group, each produced
    by applying `reducer` to the group's key value and its rows."""
    if fields is None:
        # no output fields specified - take them from the source table
        fields, source = iterpeek(source)
    yield tuple(fields)
    for k, grp in rowgroupby(source, key):
        yield tuple(reducer(k, grp))
def iterhashlookupjoin(left, right, lkey, rkey, missing, lprefix, rprefix):
    """Left join `left` against `right` using an in-memory hash lookup
    built from the right table; left rows with no match are padded with
    `missing` in place of the right-hand values."""
    lit = iter(left)
    # use the builtin next() rather than the Python 2-only .next() method;
    # next() is available from Python 2.6 and is forward-compatible
    lflds = next(lit)

    rflds, rit = iterpeek(right)  # need the whole lot to pass to lookup
    from petl.util import lookupone
    rlookup = lookupone(rit, rkey, strict=False)

    # determine indices of the key fields in left and right tables
    lkind = asindices(lflds, lkey)
    rkind = asindices(rflds, rkey)

    # construct a function to extract key values from the left table
    lgetk = operator.itemgetter(*lkind)

    # determine indices of non-key fields in the right table
    # (in the output, we only include key fields from the left table - we
    # don't want to duplicate fields)
    rvind = [i for i in range(len(rflds)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # determine the output fields, applying any requested prefixes
    if lprefix is None:
        outflds = list(lflds)
    else:
        outflds = [(str(lprefix) + str(f)) for f in lflds]
    if rprefix is None:
        outflds.extend(rgetv(rflds))
    else:
        outflds.extend([(str(rprefix) + str(f)) for f in rgetv(rflds)])
    yield tuple(outflds)

    # define a function to join rows
    def joinrows(_lrow, _rrow):
        # start with the left row
        _outrow = list(_lrow)
        # extend with non-key values from the right row
        _outrow.extend(rgetv(_rrow))
        return tuple(_outrow)

    for lrow in lit:
        k = lgetk(lrow)
        if k in rlookup:
            rrow = rlookup[k]
            yield joinrows(lrow, rrow)
        else:
            outrow = list(lrow)  # start with the left row
            # extend with missing values in place of the right row
            outrow.extend([missing] * len(rvind))
            yield tuple(outrow)
def guessdtype(table):
    """Infer a numpy structured-array dtype for `table` by letting numpy
    guess a dtype for each field's column individually."""
    try:
        import numpy as np
    except ImportError as e:
        # numpy is an optional dependency
        raise UnsatisfiedDependency(e, dep_message)
    fields, table = iterpeek(table, 1)
    cols = columns(table)
    # load each column into a 1D array so numpy infers a dtype for it
    pairs = []
    for f in fields:
        pairs.append((f, np.array(cols[f]).dtype))
    return np.dtype(pairs)
def valuestoarray(vals, dtype=None, count=-1, sample=1000):
    """Load values from `vals` into a one-dimensional numpy array. If
    `dtype` is not given, it is inferred from a sample of up to `sample`
    values."""
    try:
        import numpy as np
    except ImportError as e:
        # numpy is an optional dependency
        raise UnsatisfiedDependency(e, dep_message)
    source = iter(vals)
    if dtype is None:
        # no dtype given - let numpy infer one from a sample of the values
        head, source = iterpeek(source, sample)
        dtype = np.array(head).dtype
    return np.fromiter(source, dtype=dtype, count=count)
def tohdf5(table, source, where=None, name=None, create=False,
           description=None, title='', filters=None, expectedrows=10000,
           chunkshape=None, byteorder=None, createparents=False,
           sample=1000):
    """
    Write to an HDF5 table. If `create` is `False`, assumes the table
    already exists, and attempts to truncate it before loading. If `create`
    is `True`, any existing table is dropped, and a new table is created;
    if `description` is None, the datatype will be guessed. E.g.::

        >>> from petl import look
        >>> look(table1)
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 1     | 'asdfgh' |
        +-------+----------+
        | 2     | 'qwerty' |
        +-------+----------+
        | 3     | 'zxcvbn' |
        +-------+----------+

        >>> from petlx.hdf5 import tohdf5, fromhdf5
        >>> tohdf5(table1, 'test1.h5', '/testgroup', 'testtable', create=True, createparents=True)
        >>> look(fromhdf5('test1.h5', '/testgroup', 'testtable'))
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 1     | 'asdfgh' |
        +-------+----------+
        | 2     | 'qwerty' |
        +-------+----------+
        | 3     | 'zxcvbn' |
        +-------+----------+

    See also :func:`appendhdf5`.

    .. versionadded:: 0.3

    """

    it = iter(table)

    if create:
        try:
            # PyTables is an optional dependency, imported lazily
            import tables
        except ImportError as e:
            raise UnsatisfiedDependency(e, dep_message)
        if isinstance(source, basestring):
            # assume it's the name of an HDF5 file
            # NOTE(review): openFile/getNode/removeNode/createTable are the
            # pre-3.0 PyTables camelCase API - confirm the pinned PyTables
            # version still provides them
            h5file = tables.openFile(source, mode='a')  # don't replace the whole file!
        elif isinstance(source, tables.File):
            # an already-open PyTables file was passed in; we will not
            # close it in the finally block below
            h5file = source
        else:
            raise Exception('invalid source argument, expected file name or tables.File, found: %r' % source)
        # determine datatype
        if description is None:
            # peek at a sample of the rows to infer the table description
            peek, it = iterpeek(it, sample)
            # use a numpy dtype
            description = guessdtype(peek)
        # check if the table node already exists
        try:
            h5table = h5file.getNode(where, name)
        except tables.NoSuchNodeError:
            # nothing to drop; the table is created below
            pass
        else:
            # drop the node
            h5file.removeNode(where, name)
        # create the table
        h5table = h5file.createTable(where, name, description,
                                     title=title,
                                     filters=filters,
                                     expectedrows=expectedrows,
                                     chunkshape=chunkshape,
                                     byteorder=byteorder,
                                     createparents=createparents)
    else:
        # table assumed to exist already; open it for appending
        h5file, h5table = _get_hdf5_table(source, where, name, mode='a')

    try:
        # truncate the existing table
        h5table.truncate(0)
        # load the data
        _insert(it, h5table)
    finally:
        if isinstance(source, basestring):
            # close the file if we opened it here
            h5file.close()
def toarray(table, dtype=None, count=-1, sample=1000):
    """
    Convenience function to load data from the given `table` into a numpy
    structured array. E.g.::

        >>> from petl import look
        >>> from petlx.array import toarray
        >>> look(table)
        +-----------+-------+-------+
        | 'foo'     | 'bar' | 'baz' |
        +===========+=======+=======+
        | 'apples'  | 1     | 2.5   |
        +-----------+-------+-------+
        | 'oranges' | 3     | 4.4   |
        +-----------+-------+-------+
        | 'pears'   | 7     | 0.1   |
        +-----------+-------+-------+

        >>> a = toarray(table)
        >>> a
        array([('apples', 1, 2.5), ('oranges', 3, 4.4), ('pears', 7, 0.1)],
              dtype=[('foo', '|S7'), ('bar', '<i8'), ('baz', '<f8')])
        >>> a['foo']
        array(['apples', 'oranges', 'pears'],
              dtype='|S7')
        >>> a['bar']
        array([1, 3, 7])
        >>> a['baz']
        array([ 2.5,  4.4,  0.1])
        >>> a['foo'][0]
        'apples'
        >>> a['bar'][1]
        3
        >>> a['baz'][2]
        0.10000000000000001

    If no datatype is specified, `sample` rows will be examined to infer an
    appropriate datatype for each field.

    The datatype can be specified as a string, e.g.:

        >>> a = toarray(table, dtype='a4, i2, f4')
        >>> a
        array([('appl', 1, 2.5), ('oran', 3, 4.400000095367432),
               ('pear', 7, 0.10000000149011612)],
              dtype=[('foo', '|S4'), ('bar', '<i2'), ('baz', '<f4')])

    The datatype can also be partially specified, in which case datatypes
    will be inferred for other fields, e.g.:

        >>> a = toarray(table, dtype={'foo': 'a4'})
        >>> a
        array([('appl', 1, 2.5), ('oran', 3, 4.4), ('pear', 7, 0.1)],
              dtype=[('foo', '|S4'), ('bar', '<i8'), ('baz', '<f8')])

    """

    try:
        import numpy as np
    except ImportError as e:
        # numpy is an optional dependency
        raise UnsatisfiedDependency(e, dep_message)
    else:
        it = iter(table)
        # peek at a sample of the rows (including the header) for dtype
        # inference, without consuming the iterator
        peek, it = iterpeek(it, sample)
        # use the builtin next() rather than the Python 2-only .next()
        # method; next() works on Python 2.6+ and is forward-compatible
        fields = next(it)
        if dtype is None:
            # infer the whole dtype from the sampled rows
            dtype = guessdtype(peek)
        elif isinstance(dtype, basestring):
            # insert field names from source table
            typestrings = [s.strip() for s in dtype.split(',')]
            dtype = [(f, t) for f, t in zip(fields, typestrings)]
        elif isinstance(dtype, dict) and ('names' not in dtype
                                          or 'formats' not in dtype):
            # allow for partial specification of dtype
            cols = columns(peek)
            newdtype = {'names': [], 'formats': []}
            for f in fields:
                newdtype['names'].append(f)
                if f in dtype and isinstance(dtype[f], tuple):
                    # assume fully specified
                    newdtype['formats'].append(dtype[f][0])
                elif f not in dtype:
                    # not specified at all - infer from the sampled column
                    a = np.array(cols[f])
                    newdtype['formats'].append(a.dtype)
                else:
                    # assume the format was given directly (e.g. 'a4')
                    newdtype['formats'].append(dtype[f])
            dtype = newdtype
        else:
            pass  # leave dtype as-is
        # numpy is fussy about having tuples, need to make sure
        it = (tuple(row) for row in it)
        sa = np.fromiter(it, dtype=dtype, count=count)
        return sa
def tohdf5(
    table,
    source,
    where=None,
    name=None,
    create=False,
    description=None,
    title="",
    filters=None,
    expectedrows=10000,
    chunkshape=None,
    byteorder=None,
    createparents=False,
    sample=1000,
):
    """
    Write to an HDF5 table. If `create` is `False`, assumes the table
    already exists, and attempts to truncate it before loading. If `create`
    is `True`, any existing table is dropped, and a new table is created;
    if `description` is None, the datatype will be guessed. E.g.::

        >>> from petl import look
        >>> look(table1)
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 1     | 'asdfgh' |
        +-------+----------+
        | 2     | 'qwerty' |
        +-------+----------+
        | 3     | 'zxcvbn' |
        +-------+----------+

        >>> from petlx.hdf5 import tohdf5, fromhdf5
        >>> tohdf5(table1, 'test1.h5', '/testgroup', 'testtable', create=True, createparents=True)
        >>> look(fromhdf5('test1.h5', '/testgroup', 'testtable'))
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 1     | 'asdfgh' |
        +-------+----------+
        | 2     | 'qwerty' |
        +-------+----------+
        | 3     | 'zxcvbn' |
        +-------+----------+

    See also :func:`appendhdf5`.

    .. versionadded:: 0.3

    """

    it = iter(table)

    if create:
        try:
            # PyTables is an optional dependency, imported lazily
            import tables
        except ImportError as e:
            raise UnsatisfiedDependency(e, dep_message)
        if isinstance(source, basestring):
            # assume it's the name of an HDF5 file
            # NOTE(review): openFile/getNode/removeNode/createTable are the
            # pre-3.0 PyTables camelCase API - confirm the pinned PyTables
            # version still provides them
            h5file = tables.openFile(source, mode="a")  # don't replace the whole file!
        elif isinstance(source, tables.File):
            # an already-open PyTables file was passed in; we will not
            # close it in the finally block below
            h5file = source
        else:
            raise Exception("invalid source argument, expected file name or tables.File, found: %r" % source)
        # determine datatype
        if description is None:
            # peek at a sample of the rows to infer the table description
            peek, it = iterpeek(it, sample)
            # use a numpy dtype
            description = guessdtype(peek)
        # check if the table node already exists
        try:
            h5table = h5file.getNode(where, name)
        except tables.NoSuchNodeError:
            # nothing to drop; the table is created below
            pass
        else:
            # drop the node
            h5file.removeNode(where, name)
        # create the table
        h5table = h5file.createTable(
            where,
            name,
            description,
            title=title,
            filters=filters,
            expectedrows=expectedrows,
            chunkshape=chunkshape,
            byteorder=byteorder,
            createparents=createparents,
        )
    else:
        # table assumed to exist already; open it for appending
        h5file, h5table = _get_hdf5_table(source, where, name, mode="a")

    try:
        # truncate the existing table
        h5table.truncate(0)
        # load the data
        _insert(it, h5table)
    finally:
        if isinstance(source, basestring):
            # close the file if we opened it here
            h5file.close()
def toarray(table, dtype=None, count=-1, sample=1000):
    """
    Convenience function to load data from the given `table` into a numpy
    structured array. E.g.::

        >>> from petl import look
        >>> from petlx.array import toarray
        >>> look(table)
        +-----------+-------+-------+
        | 'foo'     | 'bar' | 'baz' |
        +===========+=======+=======+
        | 'apples'  | 1     | 2.5   |
        +-----------+-------+-------+
        | 'oranges' | 3     | 4.4   |
        +-----------+-------+-------+
        | 'pears'   | 7     | 0.1   |
        +-----------+-------+-------+

        >>> a = toarray(table)
        >>> a
        array([('apples', 1, 2.5), ('oranges', 3, 4.4), ('pears', 7, 0.1)],
              dtype=[('foo', '|S7'), ('bar', '<i8'), ('baz', '<f8')])
        >>> a['foo']
        array(['apples', 'oranges', 'pears'],
              dtype='|S7')
        >>> a['bar']
        array([1, 3, 7])
        >>> a['baz']
        array([ 2.5,  4.4,  0.1])
        >>> a['foo'][0]
        'apples'
        >>> a['bar'][1]
        3
        >>> a['baz'][2]
        0.10000000000000001

    If no datatype is specified, `sample` rows will be examined to infer an
    appropriate datatype for each field.

    The datatype can be specified as a string, e.g.:

        >>> a = toarray(table, dtype='a4, i2, f4')
        >>> a
        array([('appl', 1, 2.5), ('oran', 3, 4.400000095367432),
               ('pear', 7, 0.10000000149011612)],
              dtype=[('foo', '|S4'), ('bar', '<i2'), ('baz', '<f4')])

    The datatype can also be partially specified, in which case datatypes
    will be inferred for other fields, e.g.:

        >>> a = toarray(table, dtype={'foo': 'a4'})
        >>> a
        array([('appl', 1, 2.5), ('oran', 3, 4.4), ('pear', 7, 0.1)],
              dtype=[('foo', '|S4'), ('bar', '<i8'), ('baz', '<f8')])

    """

    try:
        import numpy as np
    except ImportError as e:
        # numpy is an optional dependency
        raise UnsatisfiedDependency(e, dep_message)
    else:
        it = iter(table)
        # peek at a sample of the rows (including the header) for dtype
        # inference, without consuming the iterator
        peek, it = iterpeek(it, sample)
        # use the builtin next() rather than the Python 2-only .next()
        # method; next() works on Python 2.6+ and is forward-compatible
        fields = next(it)
        if dtype is None:
            # infer the whole dtype from the sampled rows
            dtype = guessdtype(peek)
        elif isinstance(dtype, basestring):
            # insert field names from source table
            typestrings = [s.strip() for s in dtype.split(',')]
            dtype = [(f, t) for f, t in zip(fields, typestrings)]
        elif (isinstance(dtype, dict)
              and ('names' not in dtype or 'formats' not in dtype)):
            # allow for partial specification of dtype
            cols = columns(peek)
            newdtype = {'names': [], 'formats': []}
            for f in fields:
                newdtype['names'].append(f)
                if f in dtype and isinstance(dtype[f], tuple):
                    # assume fully specified
                    newdtype['formats'].append(dtype[f][0])
                elif f not in dtype:
                    # not specified at all - infer from the sampled column
                    a = np.array(cols[f])
                    newdtype['formats'].append(a.dtype)
                else:
                    # assume the format was given directly (e.g. 'a4')
                    newdtype['formats'].append(dtype[f])
            dtype = newdtype
        else:
            pass  # leave dtype as-is
        # numpy is fussy about having tuples, need to make sure
        it = (tuple(row) for row in it)
        sa = np.fromiter(it, dtype=dtype, count=count)
        return sa