def _get_td_css(h, v, td_styles):
    # check for user-provided style
    if td_styles:
        if isinstance(td_styles, string_types):
            return td_styles
        elif callable(td_styles):
            return td_styles(v)
        elif isinstance(td_styles, dict):
            if h in td_styles:
                s = td_styles[h]
                if isinstance(s, string_types):
                    return s
                elif callable(s):
                    return s(v)
                else:
                    raise ArgumentError('expected string or callable, got %r'
                                        % s)
        else:
            raise ArgumentError('expected string, callable or dict, got %r'
                                % td_styles)
    # fall back to default style
    if isinstance(v, numeric_types) and not isinstance(v, bool):
        return 'text-align: right'
    else:
        return ''

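# A hedged sketch of the three accepted td_styles forms (the field name
# 'bar' and the CSS strings below are illustrative only):
#
#     _get_td_css('bar', 42, 'color: red')            # -> 'color: red'
#     _get_td_css('bar', 42, lambda v: 'color: red')  # -> 'color: red'
#     _get_td_css('bar', 42, {'bar': 'color: red'})   # -> 'color: red'
#     _get_td_css('bar', 42, None)                    # -> 'text-align: right'
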
def iterunpack(source, field, newfields, include_original, missing):
    it = iter(source)

    hdr = next(it)
    flds = list(map(text_type, hdr))
    if field in flds:
        field_index = flds.index(field)
    elif isinstance(field, int) and field < len(flds):
        field_index = field
        field = flds[field_index]
    else:
        raise ArgumentError('field invalid: must be either field name or '
                            'index')

    # determine output fields
    outhdr = list(flds)
    if not include_original:
        outhdr.remove(field)
    if isinstance(newfields, (list, tuple)):
        outhdr.extend(newfields)
        nunpack = len(newfields)
    elif isinstance(newfields, int):
        nunpack = newfields
        newfields = [text_type(field) + text_type(i + 1)
                     for i in range(newfields)]
        outhdr.extend(newfields)
    elif newfields is None:
        nunpack = 0
    else:
        raise ArgumentError('newfields argument must be list or tuple of '
                            'field names, or int (number of values to '
                            'unpack)')
    yield tuple(outhdr)

    # construct the output data
    for row in it:
        value = row[field_index]
        if include_original:
            out_row = list(row)
        else:
            out_row = [v for i, v in enumerate(row) if i != field_index]
        nvals = len(value)
        if nunpack > 0:
            if nvals >= nunpack:
                newvals = value[:nunpack]
            else:
                newvals = list(value) + ([missing] * (nunpack - nvals))
            out_row.extend(newvals)
        yield tuple(out_row)

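# Hedged usage sketch for the generator above (illustrative data; in petl
# this is normally driven by the public unpack() wrapper rather than called
# directly):
#
#     table = [('foo', 'bar'), (1, ['a', 'b']), (2, ['c', 'd'])]
#     rows = iterunpack(table, 'bar', ['baz', 'quux'], False, None)
#     next(rows)  # -> ('foo', 'baz', 'quux')
#     next(rows)  # -> (1, 'a', 'b')
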
def open(self, mode='r'):
    if not mode.startswith('r'):
        raise ArgumentError('source is read-only')
    if 'b' in mode:
        yield Uncloseable(stdin_binary)
    else:
        yield Uncloseable(sys.stdin)

def open(self, mode='rb'):
    if 'r' in mode:
        if self.s is not None:
            if 'b' in mode:
                self.buffer = BytesIO(self.s)
            else:
                self.buffer = StringIO(self.s)
        else:
            raise ArgumentError('no string data supplied')
    elif 'w' in mode:
        if self.buffer is not None:
            self.buffer.close()
        if 'b' in mode:
            self.buffer = BytesIO()
        else:
            self.buffer = StringIO()
    elif 'a' in mode:
        if self.buffer is None:
            if 'b' in mode:
                self.buffer = BytesIO()
            else:
                self.buffer = StringIO()
    # deliberately don't close the buffer here, so data can be read back
    # after writing
    yield Uncloseable(self.buffer)

def itersplit(source, field, pattern, newfields, include_original, maxsplit,
              flags):
    it = iter(source)
    prog = re.compile(pattern, flags)

    hdr = next(it)
    flds = list(map(text_type, hdr))
    if isinstance(field, int) and field < len(hdr):
        field_index = field
        field = hdr[field_index]
    elif field in flds:
        field_index = flds.index(field)
    else:
        raise ArgumentError('field invalid: must be either field name or '
                            'index')

    # determine output fields
    outhdr = list(flds)
    if not include_original:
        outhdr.remove(field)
    if newfields:
        outhdr.extend(newfields)
    yield tuple(outhdr)

    # construct the output data
    for row in it:
        value = row[field_index]
        if include_original:
            out_row = list(row)
        else:
            out_row = [v for i, v in enumerate(row) if i != field_index]
        out_row.extend(prog.split(value, maxsplit))
        yield tuple(out_row)

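# Hedged usage sketch (illustrative data; the public split() wrapper that
# drives this generator is not shown here):
#
#     table = [('id', 'variable'), ('1', 'parad1'), ('2', 'parad2')]
#     rows = itersplit(table, 'variable', 'd', ['variable', 'day'],
#                      False, 0, 0)
#     next(rows)  # -> ('id', 'variable', 'day')
#     next(rows)  # -> ('1', 'para', '1')
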
def __init__(self, source, key, aggregation=None, presorted=False,
             buffersize=None, tempdir=None, cache=True):
    if presorted:
        self.source = source
    else:
        self.source = sort(source, key, buffersize=buffersize,
                           tempdir=tempdir, cache=cache)
    self.key = key
    if aggregation is None:
        self.aggregation = OrderedDict()
    elif isinstance(aggregation, (list, tuple)):
        self.aggregation = OrderedDict()
        for t in aggregation:
            self.aggregation[t[0]] = t[1:]
    elif isinstance(aggregation, dict):
        self.aggregation = aggregation
    else:
        raise ArgumentError('expected aggregation is None, list, tuple or '
                            'dict, found %r' % aggregation)

def _get_hdf5_file(source, mode='r'):
    import tables

    needs_closing = False

    # allow for polymorphic args
    if isinstance(source, string_types):
        # assume source is the name of an HDF5 file, try to open it
        h5file = tables.open_file(source, mode=mode)
        needs_closing = True
    elif isinstance(source, tables.File):
        # source is an HDF5 file object
        h5file = source
    else:
        # invalid source
        raise ArgumentError('invalid source argument, expected file name or '
                            'tables.File object, found: %r' % source)

    try:
        yield h5file
    finally:
        if needs_closing:
            h5file.close()

def itermultiaggregate(source, key, aggregation):
    aggregation = OrderedDict(aggregation.items())  # take a copy
    it = iter(source)
    hdr = next(it)
    # push back header to ensure we iterate only once
    it = itertools.chain([hdr], it)

    # normalise aggregators
    for outfld in aggregation:
        agg = aggregation[outfld]
        if callable(agg):
            aggregation[outfld] = None, agg
        elif isinstance(agg, string_types):
            aggregation[outfld] = agg, list  # list is default
        elif len(agg) == 1 and isinstance(agg[0], string_types):
            aggregation[outfld] = agg[0], list  # list is default
        elif len(agg) == 1 and callable(agg[0]):
            aggregation[outfld] = None, agg[0]  # aggregate whole rows
        elif len(agg) == 2:
            pass  # no need to normalise
        else:
            raise ArgumentError('invalid aggregation: %r, %r'
                                % (outfld, agg))

    # determine output header
    if isinstance(key, (list, tuple)):
        outhdr = list(key)
    elif callable(key):
        outhdr = ['key']
    else:
        outhdr = [key]
    for outfld in aggregation:
        outhdr.append(outfld)
    yield tuple(outhdr)

    # generate data
    for k, rows in rowgroupby(it, key):
        rows = list(rows)  # may need to iterate over these more than once
        # handle compound key
        if isinstance(key, (list, tuple)):
            outrow = list(k)
        else:
            outrow = [k]
        for outfld in aggregation:
            srcfld, aggfun = aggregation[outfld]
            if srcfld is None:
                aggval = aggfun(rows)
                outrow.append(aggval)
            elif isinstance(srcfld, (list, tuple)):
                idxs = [hdr.index(f) for f in srcfld]
                valgetter = operator.itemgetter(*idxs)
                vals = (valgetter(row) for row in rows)
                aggval = aggfun(vals)
                outrow.append(aggval)
            else:
                idx = hdr.index(srcfld)
                # use a generator expression to avoid materialising values
                vals = (row[idx] for row in rows)
                aggval = aggfun(vals)
                outrow.append(aggval)
        yield tuple(outrow)

def open(self, mode):
    if mode.startswith('r'):
        raise ArgumentError('source is write-only')
    if 'b' in mode:
        yield Uncloseable(stdout_binary)
    else:
        yield Uncloseable(sys.stdout)

def open(self, mode='r'):
    if not mode.startswith('r'):
        raise ArgumentError('source is read-only')
    f = urlopen(*self.args, **self.kwargs)
    try:
        yield f
    finally:
        f.close()

def iterfieldmap(source, mappings, failonerror, errorvalue):
    it = iter(source)
    hdr = next(it)
    flds = list(map(text_type, hdr))
    outhdr = mappings.keys()
    yield tuple(outhdr)

    mapfuns = dict()
    for outfld, m in mappings.items():
        if m in hdr:
            mapfuns[outfld] = operator.itemgetter(m)
        elif isinstance(m, int) and m < len(hdr):
            mapfuns[outfld] = operator.itemgetter(m)
        elif isinstance(m, string_types):
            mapfuns[outfld] = expr(m)
        elif callable(m):
            mapfuns[outfld] = m
        elif isinstance(m, (tuple, list)) and len(m) == 2:
            srcfld = m[0]
            fm = m[1]
            if callable(fm):
                mapfuns[outfld] = composefun(fm, srcfld)
            elif isinstance(fm, dict):
                mapfuns[outfld] = composedict(fm, srcfld)
            else:
                raise ArgumentError('expected callable or dict')
        else:
            raise ArgumentError('invalid mapping %r: %r' % (outfld, m))

    # wrap rows as records
    it = (Record(row, flds) for row in it)
    for row in it:
        outrow = list()
        for outfld in outhdr:
            try:
                val = mapfuns[outfld](row)
            except Exception as e:
                if failonerror == 'inline':
                    val = e
                elif failonerror:
                    raise e
                else:
                    val = errorvalue
            outrow.append(val)
        yield tuple(outrow)

def __getattr__(self, f):
    if f in self.flds:
        try:
            return super(Record, self).__getitem__(self.flds.index(f))
        except IndexError:  # handle short rows
            return self.missing
    else:
        raise ArgumentError('item ' + repr(f) + ' not in fields ' +
                            repr(self.flds))

def open(self, mode='r'):
    if not mode.startswith('r'):
        raise ArgumentError('source is read-only')
    self.kwargs['stdout'] = subprocess.PIPE
    proc = subprocess.Popen(*self.args, **self.kwargs)
    try:
        yield proc.stdout
    finally:
        pass

def appendtextindex(table, index_or_dirname, indexname=None, merge=True,
                    optimize=False):
    """
    Load all rows from `table` into a Whoosh index, adding them to any
    existing data in the index.

    Keyword arguments:

    table
        A table container with the data to be loaded.
    index_or_dirname
        Either an instance of `whoosh.index.Index` or a string containing
        the directory path where the index is to be stored.
    indexname
        String containing the name of the index, if multiple indexes are
        stored in the same directory.
    merge
        Merge small segments during commit?
    optimize
        Merge all segments together?

    """
    import whoosh.index

    # deal with polymorphic argument
    if isinstance(index_or_dirname, string_types):
        dirname = index_or_dirname
        index = whoosh.index.open_dir(dirname, indexname=indexname,
                                      readonly=False)
        needs_closing = True
    elif isinstance(index_or_dirname, whoosh.index.Index):
        index = index_or_dirname
        needs_closing = False
    else:
        raise ArgumentError('expected string or index, found %r'
                            % index_or_dirname)

    writer = index.writer()
    try:
        for d in dicts(table):
            writer.add_document(**d)
        writer.commit(merge=merge, optimize=optimize)
    except Exception:
        writer.cancel()
        raise
    finally:
        if needs_closing:
            index.close()

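# Hedged usage sketch (assumes an index already exists under the
# illustrative directory name 'example.whoosh', with fields f0 and f1 in
# its schema):
#
#     import petl as etl
#     extra = (('f0', 'f1'),
#              ('DDD', 12),
#              ('EEE', 17))
#     etl.appendtextindex(extra, 'example.whoosh')
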
def open(self, mode='r'):
    if self.remote:
        if not mode.startswith('r'):
            raise ArgumentError('source is read-only')
        filehandle = urlopen(self.filename)
    else:
        filehandle = self.filename
    source = bz2.BZ2File(filehandle, mode, **self.kwargs)
    try:
        yield source
    finally:
        source.close()

def _get_tr_css(row, tr_style):
    # check for user-provided style
    if tr_style:
        if isinstance(tr_style, string_types):
            return tr_style
        elif callable(tr_style):
            return tr_style(row)
        else:
            raise ArgumentError('expected string or callable, got %r'
                                % tr_style)
    # fall back to default style
    return ''

def __getitem__(self, f):
    if isinstance(f, int):
        idx = f
    elif f in self.flds:
        idx = self.flds.index(f)
    else:
        raise ArgumentError('item ' + repr(f) + ' not in fields ' +
                            repr(self.flds))
    try:
        return super(Record, self).__getitem__(idx)
    except IndexError:  # handle short rows
        return self.missing

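# Hedged illustration of the two accessors above (assumes Record wraps a
# row tuple together with its field names):
#
#     rec = Record(('a', 1), ['foo', 'bar'])
#     rec['foo']  # -> 'a'  (lookup by field name via __getitem__)
#     rec[1]      # -> 1    (lookup by position)
#     rec.bar     # -> 1    (attribute access via __getattr__)
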
def search(table, *args, **kwargs):
    """
    Perform a regular expression search, returning rows that match a given
    pattern, either anywhere in the row or within a specific field. E.g.::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar', 'baz'],
        ...           ['orange', 12, 'oranges are nice fruit'],
        ...           ['mango', 42, 'I like them'],
        ...           ['banana', 74, 'lovely too'],
        ...           ['cucumber', 41, 'better than mango']]
        >>> # search any field
        ... table2 = etl.search(table1, '.g.')
        >>> table2
        +------------+-----+--------------------------+
        | foo        | bar | baz                      |
        +============+=====+==========================+
        | 'orange'   |  12 | 'oranges are nice fruit' |
        +------------+-----+--------------------------+
        | 'mango'    |  42 | 'I like them'            |
        +------------+-----+--------------------------+
        | 'cucumber' |  41 | 'better than mango'      |
        +------------+-----+--------------------------+

        >>> # search a specific field
        ... table3 = etl.search(table1, 'foo', '.g.')
        >>> table3
        +----------+-----+--------------------------+
        | foo      | bar | baz                      |
        +==========+=====+==========================+
        | 'orange' |  12 | 'oranges are nice fruit' |
        +----------+-----+--------------------------+
        | 'mango'  |  42 | 'I like them'            |
        +----------+-----+--------------------------+

    The complement can be found via
    :func:`petl.transform.regex.searchcomplement`.

    """
    if len(args) == 1:
        field = None
        pattern = args[0]
    elif len(args) == 2:
        field = args[0]
        pattern = args[1]
    else:
        raise ArgumentError('expected 1 or 2 positional arguments')
    return SearchView(table, pattern, field=field, **kwargs)

def itertextindex(index_or_dirname, indexname, docnum_field):
    import whoosh.index

    if isinstance(index_or_dirname, string_types):
        dirname = index_or_dirname
        index = whoosh.index.open_dir(dirname, indexname=indexname,
                                      readonly=True)
        needs_closing = True
    elif isinstance(index_or_dirname, whoosh.index.Index):
        index = index_or_dirname
        needs_closing = False
    else:
        raise ArgumentError('expected string or index, found %r'
                            % index_or_dirname)

    try:
        if docnum_field is None:
            # figure out the field names
            hdr = tuple(index.schema.stored_names())
            yield hdr
            # yield all documents
            astuple = operator.itemgetter(*index.schema.stored_names())
            for _, stored_fields_dict in index.reader().iter_docs():
                yield astuple(stored_fields_dict)
        else:
            # figure out the field names
            hdr = (docnum_field,) + tuple(index.schema.stored_names())
            yield hdr
            # yield all documents
            astuple = operator.itemgetter(*index.schema.stored_names())
            for docnum, stored_fields_dict in index.reader().iter_docs():
                yield (docnum,) + astuple(stored_fields_dict)
    finally:
        if needs_closing:
            # close the index if we're the ones who opened it
            index.close()

def keys_from_args(left, right, key, lkey, rkey):
    if key is lkey is rkey is None:
        # no keys specified, attempt natural join
        lkey = rkey = natural_key(left, right)
    elif key is not None and lkey is rkey is None:
        # common key specified
        lkey = rkey = key
    elif key is None and lkey is not None and rkey is not None:
        # left and right keys specified
        pass
    else:
        raise ArgumentError(
            'bad key arguments: either specify key, or specify both lkey '
            'and rkey, or provide no key/lkey/rkey arguments at all '
            '(natural join)'
        )
    return lkey, rkey

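# The three valid key configurations accepted above, sketched with
# illustrative field names:
#
#     keys_from_args(left, right, None, None, None)    # natural join
#     keys_from_args(left, right, 'id', None, None)    # common key
#     keys_from_args(left, right, None, 'lid', 'rid')  # distinct keys
#     # any other combination, e.g. key together with lkey, raises
#     # ArgumentError
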
def _execute(sql, dbo, commit):
    debug(sql)

    # need to deal with polymorphic dbo argument
    # what sort of duck is it?

    # does it quack like a standard DB-API 2.0 connection?
    if _is_dbapi_connection(dbo):
        debug('assuming %r is standard DB-API 2.0 connection', dbo)
        _execute_dbapi_connection(sql, dbo, commit)

    # does it quack like a standard DB-API 2.0 cursor?
    elif _is_dbapi_cursor(dbo):
        debug('assuming %r is standard DB-API 2.0 cursor', dbo)
        _execute_dbapi_cursor(sql, dbo, commit)

    # does it quack like an SQLAlchemy engine?
    elif _is_sqlalchemy_engine(dbo):
        debug('assuming %r is an instance of sqlalchemy.engine.base.Engine',
              dbo)
        _execute_sqlalchemy_engine(sql, dbo, commit)

    # does it quack like an SQLAlchemy session?
    elif _is_sqlalchemy_session(dbo):
        debug('assuming %r is an instance of sqlalchemy.orm.session.Session',
              dbo)
        _execute_sqlalchemy_session(sql, dbo, commit)

    # does it quack like an SQLAlchemy connection?
    elif _is_sqlalchemy_connection(dbo):
        debug('assuming %r is an instance of '
              'sqlalchemy.engine.base.Connection', dbo)
        _execute_sqlalchemy_connection(sql, dbo, commit)

    elif callable(dbo):
        debug('assuming %r is a function returning standard DB-API 2.0 '
              'cursor objects', dbo)
        _execute_dbapi_mkcurs(sql, dbo, commit)

    # some other sort of duck...
    else:
        raise ArgumentError('unsupported database object type: %r' % dbo)

def itercapture(source, field, pattern, newfields, include_original, flags,
                fill):
    it = iter(source)
    prog = re.compile(pattern, flags)

    hdr = next(it)
    flds = list(map(text_type, hdr))
    if isinstance(field, int) and field < len(hdr):
        field_index = field
        field = flds[field_index]
    elif field in flds:
        field_index = flds.index(field)
    else:
        raise ArgumentError('field invalid: must be either field name or '
                            'index')

    # determine output fields
    outhdr = list(flds)
    if not include_original:
        outhdr.remove(field)
    if newfields:
        outhdr.extend(newfields)
    yield tuple(outhdr)

    # construct the output data
    for row in it:
        value = row[field_index]
        if include_original:
            out_row = list(row)
        else:
            out_row = [v for i, v in enumerate(row) if i != field_index]
        match = prog.search(value)
        if match is None:
            if fill is not None:
                # callable fill items receive the row as a dict keyed by
                # field name; build a fresh list so `fill` itself is not
                # overwritten between rows
                fill_vals = [item(dict(zip(flds, row))) if callable(item)
                             else item
                             for item in fill]
                out_row.extend(fill_vals)
            else:
                raise TransformError('value %r did not match pattern %r'
                                     % (value, pattern))
        else:
            out_row.extend(match.groups())
        yield tuple(out_row)

def _todb(table, dbo, tablename, schema=None, commit=True, truncate=False):

    # need to deal with polymorphic dbo argument
    # what sort of duck is it?

    # does it quack like a standard DB-API 2.0 connection?
    if _is_dbapi_connection(dbo):
        debug('assuming %r is standard DB-API 2.0 connection', dbo)
        _todb_dbapi_connection(table, dbo, tablename, schema=schema,
                               commit=commit, truncate=truncate)

    # does it quack like a standard DB-API 2.0 cursor?
    elif _is_dbapi_cursor(dbo):
        debug('assuming %r is standard DB-API 2.0 cursor', dbo)
        _todb_dbapi_cursor(table, dbo, tablename, schema=schema,
                           commit=commit, truncate=truncate)

    # does it quack like an SQLAlchemy engine?
    elif _is_sqlalchemy_engine(dbo):
        debug('assuming %r instance of sqlalchemy.engine.base.Engine', dbo)
        _todb_sqlalchemy_engine(table, dbo, tablename, schema=schema,
                                commit=commit, truncate=truncate)

    # does it quack like an SQLAlchemy session?
    elif _is_sqlalchemy_session(dbo):
        debug('assuming %r instance of sqlalchemy.orm.session.Session', dbo)
        _todb_sqlalchemy_session(table, dbo, tablename, schema=schema,
                                 commit=commit, truncate=truncate)

    # does it quack like an SQLAlchemy connection?
    elif _is_sqlalchemy_connection(dbo):
        debug('assuming %r instance of sqlalchemy.engine.base.Connection',
              dbo)
        _todb_sqlalchemy_connection(table, dbo, tablename, schema=schema,
                                    commit=commit, truncate=truncate)

    elif callable(dbo):
        debug('assuming %r is a function returning standard DB-API 2.0 '
              'cursor objects', dbo)
        _todb_dbapi_mkcurs(table, dbo, tablename, schema=schema,
                           commit=commit, truncate=truncate)

    # some other sort of duck...
    else:
        raise ArgumentError('unsupported database object type: %r' % dbo)

def __iter__(self):

    # does it quack like a standard DB-API 2.0 connection?
    if _is_dbapi_connection(self.dbo):
        debug('assuming %r is standard DB-API 2.0 connection', self.dbo)
        _iter = _iter_dbapi_connection

    # does it quack like a standard DB-API 2.0 cursor?
    elif _is_dbapi_cursor(self.dbo):
        debug('assuming %r is standard DB-API 2.0 cursor', self.dbo)
        warning('using a DB-API cursor with fromdb() is not recommended '
                'and may lead to unexpected results, a DB-API connection '
                'is better')
        _iter = _iter_dbapi_cursor

    # does it quack like an SQLAlchemy engine?
    elif _is_sqlalchemy_engine(self.dbo):
        debug('assuming %r instance of sqlalchemy.engine.base.Engine',
              self.dbo)
        _iter = _iter_sqlalchemy_engine

    # does it quack like an SQLAlchemy session?
    elif _is_sqlalchemy_session(self.dbo):
        debug('assuming %r instance of sqlalchemy.orm.session.Session',
              self.dbo)
        _iter = _iter_sqlalchemy_session

    # does it quack like an SQLAlchemy connection?
    elif _is_sqlalchemy_connection(self.dbo):
        debug('assuming %r instance of sqlalchemy.engine.base.Connection',
              self.dbo)
        _iter = _iter_sqlalchemy_connection

    elif callable(self.dbo):
        debug('assuming %r is a function returning a cursor', self.dbo)
        _iter = _iter_dbapi_mkcurs

    # some other sort of duck...
    else:
        raise ArgumentError('unsupported database object type: %r'
                            % self.dbo)

    return _iter(self.dbo, self.query, *self.args, **self.kwargs)

def __init__(self, source, converters=None, failonerror=False,
             errorvalue=None, where=None, pass_row=False):
    self.source = source
    if converters is None:
        self.converters = dict()
    elif isinstance(converters, dict):
        self.converters = converters
    elif isinstance(converters, (tuple, list)):
        self.converters = dict([(i, v) for i, v in enumerate(converters)])
    else:
        raise ArgumentError('unexpected converters: %r' % converters)
    self.failonerror = failonerror
    self.errorvalue = errorvalue
    self.where = where
    self.pass_row = pass_row

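# Hedged sketch of the converter specifications accepted by the constructor
# above (the class name FieldConvertView is assumed here for illustration;
# field names and converters are illustrative too):
#
#     FieldConvertView(table, {'bar': int})       # keyed by field name
#     FieldConvertView(table, {1: float})         # keyed by position
#     FieldConvertView(table, (int, float, str))  # positional sequence
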
def itersplitdown(table, field, pattern, maxsplit, flags):
    prog = re.compile(pattern, flags)
    it = iter(table)
    hdr = next(it)
    flds = list(map(text_type, hdr))

    if isinstance(field, int) and field < len(hdr):
        field_index = field
        field = hdr[field_index]
    elif field in flds:
        field_index = flds.index(field)
    else:
        raise ArgumentError('field invalid: must be either field name or '
                            'index')

    yield tuple(hdr)

    for row in it:
        value = row[field_index]
        for v in prog.split(value, maxsplit):
            yield tuple(v if i == field_index else row[i]
                        for i in range(len(hdr)))

def iterfieldconvert(source, converters, failonerror, errorvalue, where,
                     pass_row):
    # grab the fields in the source table
    it = iter(source)
    hdr = next(it)
    flds = list(map(text_type, hdr))
    yield tuple(hdr)  # these are not modified

    # build converter functions
    converter_functions = dict()
    for k, c in converters.items():

        # turn field names into row indices
        if not isinstance(k, integer_types):
            try:
                k = flds.index(k)
            except ValueError:  # not in list
                raise FieldSelectionError(k)
        assert isinstance(k, int), 'expected integer, found %r' % k

        # is converter a function?
        if callable(c):
            converter_functions[k] = c

        # is converter a method name?
        elif isinstance(c, string_types):
            converter_functions[k] = methodcaller(c)

        # is converter a method name with arguments?
        elif isinstance(c, (tuple, list)) and isinstance(c[0], string_types):
            methnm = c[0]
            methargs = c[1:]
            converter_functions[k] = methodcaller(methnm, *methargs)

        # is converter a dictionary?
        elif isinstance(c, dict):
            converter_functions[k] = dictconverter(c)

        # is it something else?
        elif c is None:
            pass  # ignore
        else:
            raise ArgumentError(
                'unexpected converter specification on field %r: %r' % (k, c)
            )

    # define a function to transform a value
    def transform_value(i, v, *args):
        if i not in converter_functions:
            # no converter defined on this field, return value as-is
            return v
        else:
            try:
                return converter_functions[i](v, *args)
            except Exception as e:
                if failonerror:
                    raise e
                else:
                    return errorvalue

    # define a function to transform a row
    if pass_row:
        def transform_row(_row):
            return tuple(transform_value(i, v, _row)
                         for i, v in enumerate(_row))
    else:
        def transform_row(_row):
            return tuple(transform_value(i, v)
                         for i, v in enumerate(_row))

    # prepare where function
    if isinstance(where, string_types):
        where = expr(where)
    elif where is not None:
        assert callable(where), 'expected callable for "where" argument, ' \
                                'found %r' % where

    # prepare iterator
    if pass_row or where:
        # wrap rows as records
        it = (Record(row, flds) for row in it)

    # construct the data rows
    if where is None:
        # simple case, transform all rows
        for row in it:
            yield transform_row(row)
    else:
        # conditionally transform rows
        for row in it:
            if where(row):
                yield transform_row(row)
            else:
                yield row

def itersearchindex(index_or_dirname, query, limit, pagenum, pagelen,
                    indexname, docnum_field, score_field, fieldboosts,
                    search_kwargs):
    import whoosh.index
    import whoosh.query
    import whoosh.qparser

    if not search_kwargs:
        search_kwargs = dict()

    if isinstance(index_or_dirname, string_types):
        dirname = index_or_dirname
        index = whoosh.index.open_dir(dirname, indexname=indexname,
                                      readonly=True)
        needs_closing = True
    elif isinstance(index_or_dirname, whoosh.index.Index):
        index = index_or_dirname
        needs_closing = False
    else:
        raise ArgumentError('expected string or index, found %r'
                            % index_or_dirname)

    try:
        # figure out header
        hdr = tuple()
        if docnum_field is not None:
            hdr += (docnum_field,)
        if score_field is not None:
            hdr += (score_field,)
        stored_names = tuple(index.schema.stored_names())
        hdr += stored_names
        yield hdr

        # parse the query
        if isinstance(query, string_types):
            # search all fields by default
            parser = whoosh.qparser.MultifieldParser(
                index.schema.names(), index.schema, fieldboosts=fieldboosts
            )
            query = parser.parse(query)
        elif isinstance(query, whoosh.query.Query):
            pass
        else:
            raise ArgumentError('expected string or whoosh.query.Query, '
                                'found %r' % query)

        # make a function to turn docs into tuples
        astuple = operator.itemgetter(*index.schema.stored_names())

        with index.searcher() as searcher:
            if limit is not None:
                results = searcher.search(query, limit=limit,
                                          **search_kwargs)
            else:
                results = searcher.search_page(query, pagenum,
                                               pagelen=pagelen,
                                               **search_kwargs)

            if docnum_field is None and score_field is None:
                for doc in results:
                    yield astuple(doc)
            else:
                for (docnum, score), doc in izip(results.items(), results):
                    row = tuple()
                    if docnum_field is not None:
                        row += (docnum,)
                    if score_field is not None:
                        row += (score,)
                    row += astuple(doc)
                    yield row

    finally:
        if needs_closing:
            # close the index if we're the ones who opened it
            index.close()


# TODO guess schema

def totextindex(table, index_or_dirname, schema=None, indexname=None,
                merge=False, optimize=False):
    """
    Load all rows from `table` into a Whoosh index. N.B., this will clear
    any existing data in the index before loading. E.g.::

        >>> import petl as etl
        >>> import datetime
        >>> import os
        >>> # here is the table we want to load into an index
        ... table = (('f0', 'f1', 'f2', 'f3', 'f4'),
        ...          ('AAA', 12, 4.3, True, datetime.datetime.now()),
        ...          ('BBB', 6, 3.4, False, datetime.datetime(1900, 1, 31)),
        ...          ('CCC', 42, 7.8, True, datetime.datetime(2100, 12, 25)))
        >>> # define a schema for the index
        ... from whoosh.fields import *
        >>> schema = Schema(f0=TEXT(stored=True),
        ...                 f1=NUMERIC(int, stored=True),
        ...                 f2=NUMERIC(float, stored=True),
        ...                 f3=BOOLEAN(stored=True),
        ...                 f4=DATETIME(stored=True))
        >>> # load index
        ... dirname = 'example.whoosh'
        >>> if not os.path.exists(dirname):
        ...     os.mkdir(dirname)
        ...
        >>> etl.totextindex(table, dirname, schema=schema)

    Keyword arguments:

    table
        A table container with the data to be loaded.
    index_or_dirname
        Either an instance of `whoosh.index.Index` or a string containing
        the directory path where the index is to be stored.
    schema
        Index schema to use if creating the index.
    indexname
        String containing the name of the index, if multiple indexes are
        stored in the same directory.
    merge
        Merge small segments during commit?
    optimize
        Merge all segments together?

    """
    import whoosh.index
    import whoosh.writing

    # deal with polymorphic argument
    if isinstance(index_or_dirname, string_types):
        dirname = index_or_dirname
        index = whoosh.index.create_in(dirname, schema, indexname=indexname)
        needs_closing = True
    elif isinstance(index_or_dirname, whoosh.index.Index):
        index = index_or_dirname
        needs_closing = False
    else:
        raise ArgumentError('expected string or index, found %r'
                            % index_or_dirname)

    writer = index.writer()
    try:
        for d in dicts(table):
            writer.add_document(**d)
        writer.commit(merge=merge, optimize=optimize,
                      mergetype=whoosh.writing.CLEAR)
    except:
        writer.cancel()
        raise
    finally:
        if needs_closing:
            index.close()

def aggregate(table, key, aggregation=None, value=None, presorted=False,
              buffersize=None, tempdir=None, cache=True):
    """Group rows under the given key then apply aggregation functions.
    E.g.::

        >>> import petl as etl
        >>>
        >>> table1 = [['foo', 'bar', 'baz'],
        ...           ['a', 3, True],
        ...           ['a', 7, False],
        ...           ['b', 2, True],
        ...           ['b', 2, False],
        ...           ['b', 9, False],
        ...           ['c', 4, True]]
        >>> # aggregate whole rows
        ... table2 = etl.aggregate(table1, 'foo', len)
        >>> table2
        +-----+-------+
        | foo | value |
        +=====+=======+
        | 'a' |     2 |
        +-----+-------+
        | 'b' |     3 |
        +-----+-------+
        | 'c' |     1 |
        +-----+-------+

        >>> # aggregate single field
        ... table3 = etl.aggregate(table1, 'foo', sum, 'bar')
        >>> table3
        +-----+-------+
        | foo | value |
        +=====+=======+
        | 'a' |    10 |
        +-----+-------+
        | 'b' |    13 |
        +-----+-------+
        | 'c' |     4 |
        +-----+-------+

        >>> # alternative signature using keyword args
        ... table4 = etl.aggregate(table1, key=('foo', 'bar'),
        ...                        aggregation=list, value=('bar', 'baz'))
        >>> table4
        +-----+-----+-------------------------+
        | foo | bar | value                   |
        +=====+=====+=========================+
        | 'a' |   3 | [(3, True)]             |
        +-----+-----+-------------------------+
        | 'a' |   7 | [(7, False)]            |
        +-----+-----+-------------------------+
        | 'b' |   2 | [(2, True), (2, False)] |
        +-----+-----+-------------------------+
        | 'b' |   9 | [(9, False)]            |
        +-----+-----+-------------------------+
        | 'c' |   4 | [(4, True)]             |
        +-----+-----+-------------------------+

        >>> # aggregate multiple fields
        ... from collections import OrderedDict
        >>> import petl as etl
        >>>
        >>> aggregation = OrderedDict()
        >>> aggregation['count'] = len
        >>> aggregation['minbar'] = 'bar', min
        >>> aggregation['maxbar'] = 'bar', max
        >>> aggregation['sumbar'] = 'bar', sum
        >>> # default aggregation function is list
        ... aggregation['listbar'] = 'bar'
        >>> aggregation['listbarbaz'] = ('bar', 'baz'), list
        >>> aggregation['bars'] = 'bar', etl.strjoin(', ')
        >>> table5 = etl.aggregate(table1, 'foo', aggregation)
        >>> table5
        +-----+-------+--------+--------+--------+-----------+-------------------------------------+-----------+
        | foo | count | minbar | maxbar | sumbar | listbar   | listbarbaz                          | bars      |
        +=====+=======+========+========+========+===========+=====================================+===========+
        | 'a' |     2 |      3 |      7 |     10 | [3, 7]    | [(3, True), (7, False)]             | '3, 7'    |
        +-----+-------+--------+--------+--------+-----------+-------------------------------------+-----------+
        | 'b' |     3 |      2 |      9 |     13 | [2, 2, 9] | [(2, True), (2, False), (9, False)] | '2, 2, 9' |
        +-----+-------+--------+--------+--------+-----------+-------------------------------------+-----------+
        | 'c' |     1 |      4 |      4 |      4 | [4]       | [(4, True)]                         | '4'       |
        +-----+-------+--------+--------+--------+-----------+-------------------------------------+-----------+

    If `presorted` is True, it is assumed that the data are already sorted
    by the given key, and the `buffersize`, `tempdir` and `cache` arguments
    are ignored. Otherwise, the data are sorted, see also the discussion of
    the `buffersize`, `tempdir` and `cache` arguments under the
    :func:`petl.transform.sorts.sort` function.

    """
    if callable(aggregation):
        return SimpleAggregateView(table, key, aggregation=aggregation,
                                   value=value, presorted=presorted,
                                   buffersize=buffersize, tempdir=tempdir,
                                   cache=cache)
    elif aggregation is None or isinstance(aggregation, (list, tuple, dict)):
        # ignore value arg
        return MultiAggregateView(table, key, aggregation=aggregation,
                                  presorted=presorted, buffersize=buffersize,
                                  tempdir=tempdir, cache=cache)
    else:
        raise ArgumentError('expected aggregation is callable, list, tuple, '
                            'dict or None')