Example No. 1
def _get_td_css(h, v, td_styles):
    # check for user-provided style
    if td_styles:
        if isinstance(td_styles, string_types):
            return td_styles
        elif callable(td_styles):
            return td_styles(v)
        elif isinstance(td_styles, dict):
            if h in td_styles:
                s = td_styles[h]
                if isinstance(s, string_types):
                    return s
                elif callable(s):
                    return s(v)
                else:
                    raise ArgumentError('expected string or callable, got %r'
                                        % s)
        else:
            raise ArgumentError('expected string, callable or dict, got %r'
                                % td_styles)
    # fall back to default style
    if isinstance(v, numeric_types) and not isinstance(v, bool):
        return 'text-align: right'
    else:
        return ''
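
As a quick illustration (hypothetical header/value pairs; assumes `_get_td_css` and petl's `string_types`/`numeric_types` compat aliases are in scope), each style type resolves like this:

print(_get_td_css('bar', 42, None))          # 'text-align: right'
print(_get_td_css('bar', 'x', None))         # ''
print(_get_td_css('bar', 42, 'color: red'))  # 'color: red'
styles = {'bar': lambda v: 'font-weight: bold'}
print(_get_td_css('bar', 42, styles))        # 'font-weight: bold'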
Example No. 2
def iterunpack(source, field, newfields, include_original, missing):
    it = iter(source)

    hdr = next(it)
    flds = list(map(text_type, hdr))
    if field in flds:
        field_index = flds.index(field)
    elif isinstance(field, int) and field < len(flds):
        field_index = field
        field = flds[field_index]
    else:
        raise ArgumentError(
            'field invalid: must be either field name or index')

    # determine output fields
    outhdr = list(flds)
    if not include_original:
        outhdr.remove(field)
    if isinstance(newfields, (list, tuple)):
        outhdr.extend(newfields)
        nunpack = len(newfields)
    elif isinstance(newfields, int):
        nunpack = newfields
        newfields = [
            text_type(field) + text_type(i + 1) for i in range(newfields)
        ]
        outhdr.extend(newfields)
    elif newfields is None:
        nunpack = 0
    else:
        raise ArgumentError(
            'newfields argument must be list or tuple of field '
            'names, or int (number of values to unpack)')
    yield tuple(outhdr)

    # construct the output data
    for row in it:
        value = row[field_index]
        if include_original:
            out_row = list(row)
        else:
            out_row = [v for i, v in enumerate(row) if i != field_index]
        nvals = len(value)
        if nunpack > 0:
            if nvals >= nunpack:
                newvals = value[:nunpack]
            else:
                newvals = list(value) + ([missing] * (nunpack - nvals))
            out_row.extend(newvals)
        yield tuple(out_row)
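
A minimal sketch of driving the generator above directly (assumes `iterunpack` and petl's `text_type` alias are in scope; the table values are hypothetical):

table = [('id', 'pair'),
         (1, ('a', 'b')),
         (2, ('c',))]
for row in iterunpack(table, 'pair', ['left', 'right'],
                      include_original=False, missing=None):
    print(row)
# ('id', 'left', 'right')
# (1, 'a', 'b')
# (2, 'c', None)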
Example No. 3
def open(self, mode='r'):
    if not mode.startswith('r'):
        raise ArgumentError('source is read-only')
    if 'b' in mode:
        yield Uncloseable(stdin_binary)
    else:
        yield Uncloseable(sys.stdin)
Example No. 4
def open(self, mode='rb'):
    if 'r' in mode:
        if self.s is not None:
            if 'b' in mode:
                self.buffer = BytesIO(self.s)
            else:
                self.buffer = StringIO(self.s)
        else:
            raise ArgumentError('no string data supplied')
    elif 'w' in mode:
        if self.buffer is not None:
            self.buffer.close()
        if 'b' in mode:
            self.buffer = BytesIO()
        else:
            self.buffer = StringIO()
    elif 'a' in mode:
        if self.buffer is None:
            if 'b' in mode:
                self.buffer = BytesIO()
            else:
                self.buffer = StringIO()
    # deliberately don't close the buffer
    yield Uncloseable(self.buffer)
Example No. 5
def itersplit(source, field, pattern, newfields, include_original, maxsplit,
              flags):

    it = iter(source)
    prog = re.compile(pattern, flags)

    hdr = next(it)
    flds = list(map(text_type, hdr))
    if isinstance(field, int) and field < len(hdr):
        field_index = field
        field = hdr[field_index]
    elif field in flds:
        field_index = flds.index(field)
    else:
        raise ArgumentError(
            'field invalid: must be either field name or index')

    # determine output fields
    outhdr = list(flds)
    if not include_original:
        outhdr.remove(field)
    if newfields:
        outhdr.extend(newfields)
    yield tuple(outhdr)

    # construct the output data
    for row in it:
        value = row[field_index]
        if include_original:
            out_row = list(row)
        else:
            out_row = [v for i, v in enumerate(row) if i != field_index]
        out_row.extend(prog.split(value, maxsplit))
        yield tuple(out_row)
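
For example (a sketch assuming `itersplit` and its imports are in scope), splitting a 'variable' field on the letter 'd':

table = [('id', 'variable'),
         (1, 'parad1'),
         (2, 'parad2')]
for row in itersplit(table, 'variable', 'd', ['variable', 'day'],
                     include_original=False, maxsplit=0, flags=0):
    print(row)
# ('id', 'variable', 'day')
# (1, 'para', '1')
# (2, 'para', '2')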
Example No. 6
def __init__(self,
             source,
             key,
             aggregation=None,
             presorted=False,
             buffersize=None,
             tempdir=None,
             cache=True):
    if presorted:
        self.source = source
    else:
        self.source = sort(source,
                           key,
                           buffersize=buffersize,
                           tempdir=tempdir,
                           cache=cache)
    self.key = key
    if aggregation is None:
        self.aggregation = OrderedDict()
    elif isinstance(aggregation, (list, tuple)):
        self.aggregation = OrderedDict()
        for t in aggregation:
            self.aggregation[t[0]] = t[1:]
    elif isinstance(aggregation, dict):
        self.aggregation = aggregation
    else:
        raise ArgumentError(
            'expected aggregation to be None, list, tuple or dict, '
            'found %r' % aggregation)
Example No. 7
def _get_hdf5_file(source, mode='r'):
    import tables

    needs_closing = False

    # allow for polymorphic args
    if isinstance(source, string_types):

        # assume source is the name of an HDF5 file, try to open it
        h5file = tables.open_file(source, mode=mode)
        needs_closing = True

    elif isinstance(source, tables.File):

        # source is an HDF5 file object
        h5file = source

    else:

        # invalid source
        raise ArgumentError('invalid source argument, expected file name or '
                            'tables.File object, found: %r' % source)

    try:
        yield h5file
    finally:
        if needs_closing:
            h5file.close()
Example No. 8
def itermultiaggregate(source, key, aggregation):
    aggregation = OrderedDict(aggregation.items())  # take a copy
    it = iter(source)
    hdr = next(it)
    # push back header to ensure we iterate only once
    it = itertools.chain([hdr], it)

    # normalise aggregators
    for outfld in aggregation:
        agg = aggregation[outfld]
        if callable(agg):
            aggregation[outfld] = None, agg
        elif isinstance(agg, string_types):
            aggregation[outfld] = agg, list  # list is default
        elif len(agg) == 1 and isinstance(agg[0], string_types):
            aggregation[outfld] = agg[0], list  # list is default
        elif len(agg) == 1 and callable(agg[0]):
            aggregation[outfld] = None, agg[0]  # aggregate whole rows
        elif len(agg) == 2:
            pass  # no need to normalise
        else:
            raise ArgumentError('invalid aggregation: %r, %r' % (outfld, agg))

    # determine output header
    if isinstance(key, (list, tuple)):
        outhdr = list(key)
    elif callable(key):
        outhdr = ['key']
    else:
        outhdr = [key]
    for outfld in aggregation:
        outhdr.append(outfld)
    yield tuple(outhdr)

    # generate data
    for k, rows in rowgroupby(it, key):
        rows = list(rows)  # may need to iterate over these more than once
        # handle compound key
        if isinstance(key, (list, tuple)):
            outrow = list(k)
        else:
            outrow = [k]
        for outfld in aggregation:
            srcfld, aggfun = aggregation[outfld]
            if srcfld is None:
                aggval = aggfun(rows)
                outrow.append(aggval)
            elif isinstance(srcfld, (list, tuple)):
                idxs = [hdr.index(f) for f in srcfld]
                valgetter = operator.itemgetter(*idxs)
                vals = (valgetter(row) for row in rows)
                aggval = aggfun(vals)
                outrow.append(aggval)
            else:
                idx = hdr.index(srcfld)
                # try using generator comprehension
                vals = (row[idx] for row in rows)
                aggval = aggfun(vals)
                outrow.append(aggval)
        yield tuple(outrow)
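
A minimal sketch of the generator above in action (assumes `itermultiaggregate` and its imports are in scope; `rowgroupby` is petl's grouping helper and the import path below is an assumption):

from collections import OrderedDict
from petl.util.base import rowgroupby  # assumed import path

table = [('foo', 'bar'),
         ('a', 3), ('a', 7), ('b', 2)]  # already sorted by key
agg = OrderedDict([('sumbar', ('bar', sum)), ('count', len)])
for row in itermultiaggregate(table, 'foo', agg):
    print(row)
# ('foo', 'sumbar', 'count')
# ('a', 10, 2)
# ('b', 2, 1)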
Example No. 9
def open(self, mode):
    if mode.startswith('r'):
        raise ArgumentError('source is write-only')
    if 'b' in mode:
        yield Uncloseable(stdout_binary)
    else:
        yield Uncloseable(sys.stdout)
Example No. 10
def open(self, mode='r'):
    if not mode.startswith('r'):
        raise ArgumentError('source is read-only')
    f = urlopen(*self.args, **self.kwargs)
    try:
        yield f
    finally:
        f.close()
Example No. 11
def iterfieldmap(source, mappings, failonerror, errorvalue):
    it = iter(source)
    hdr = next(it)
    flds = list(map(text_type, hdr))
    outhdr = mappings.keys()
    yield tuple(outhdr)

    mapfuns = dict()
    for outfld, m in mappings.items():
        if m in hdr:
            mapfuns[outfld] = operator.itemgetter(m)
        elif isinstance(m, int) and m < len(hdr):
            mapfuns[outfld] = operator.itemgetter(m)
        elif isinstance(m, string_types):
            mapfuns[outfld] = expr(m)
        elif callable(m):
            mapfuns[outfld] = m
        elif isinstance(m, (tuple, list)) and len(m) == 2:
            srcfld = m[0]
            fm = m[1]
            if callable(fm):
                mapfuns[outfld] = composefun(fm, srcfld)
            elif isinstance(fm, dict):
                mapfuns[outfld] = composedict(fm, srcfld)
            else:
                raise ArgumentError('expected callable or dict')
        else:
            raise ArgumentError('invalid mapping %r: %r' % (outfld, m))

    # wrap rows as records
    it = (Record(row, flds) for row in it)
    for row in it:
        outrow = list()
        for outfld in outhdr:
            try:
                val = mapfuns[outfld](row)
            except Exception as e:
                if failonerror == 'inline':
                    val = e
                elif failonerror:
                    raise e
                else:
                    val = errorvalue
            outrow.append(val)
        yield tuple(outrow)
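
For instance (a sketch assuming `iterfieldmap` and its helpers, `Record`, `expr`, `composefun` and `composedict`, are in scope), an OrderedDict keeps the output columns in a predictable order:

from collections import OrderedDict

table = [('id', 'name'), (1, 'ada'), (2, 'bo')]
mappings = OrderedDict()
mappings['id'] = 'id'                                 # copy a field through
mappings['shouty'] = lambda rec: rec['name'].upper()  # computed per record
for row in iterfieldmap(table, mappings, failonerror=True, errorvalue=None):
    print(row)
# ('id', 'shouty')
# (1, 'ADA')
# (2, 'BO')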
Example No. 12
def __getattr__(self, f):
    if f in self.flds:
        try:
            return super(Record, self).__getitem__(self.flds.index(f))
        except IndexError:  # handle short rows
            return self.missing
    else:
        raise ArgumentError('item ' + repr(f) + ' not in fields ' +
                            repr(self.flds))
Example No. 13
def open(self, mode='r'):
    if not mode.startswith('r'):
        raise ArgumentError('source is read-only')
    self.kwargs['stdout'] = subprocess.PIPE
    proc = subprocess.Popen(*self.args, **self.kwargs)
    try:
        yield proc.stdout
    finally:
        pass  # deliberately don't close the pipe
Example No. 14
def appendtextindex(table,
                    index_or_dirname,
                    indexname=None,
                    merge=True,
                    optimize=False):
    """
    Load all rows from `table` into a Whoosh index, adding them to any existing
    data in the index.

    Keyword arguments:

    table
        A table container with the data to be loaded.
    index_or_dirname
        Either an instance of `whoosh.index.Index` or a string containing the
        directory path where the index is to be stored.
    indexname
        String containing the name of the index, if multiple indexes are stored
        in the same directory.
    merge
        Merge small segments during commit?
    optimize
        Merge all segments together?

    """
    import whoosh.index

    # deal with polymorphic argument
    if isinstance(index_or_dirname, string_types):
        dirname = index_or_dirname
        index = whoosh.index.open_dir(dirname,
                                      indexname=indexname,
                                      readonly=False)
        needs_closing = True
    elif isinstance(index_or_dirname, whoosh.index.Index):
        index = index_or_dirname
        needs_closing = False
    else:
        raise ArgumentError('expected string or index, found %r' %
                            index_or_dirname)

    writer = index.writer()
    try:

        for d in dicts(table):
            writer.add_document(**d)
        writer.commit(merge=merge, optimize=optimize)

    except Exception:
        writer.cancel()
        raise

    finally:
        if needs_closing:
            index.close()
Example No. 15
def open(self, mode='r'):
    if self.remote:
        if not mode.startswith('r'):
            raise ArgumentError('source is read-only')
        filehandle = urlopen(self.filename)
    else:
        filehandle = self.filename
    source = bz2.BZ2File(filehandle, mode, **self.kwargs)
    try:
        yield source
    finally:
        source.close()
Example No. 16
def _get_tr_css(row, tr_style):
    # check for user-provided style
    if tr_style:
        if isinstance(tr_style, string_types):
            return tr_style
        elif callable(tr_style):
            return tr_style(row)
        else:
            raise ArgumentError('expected string or callable, got %r'
                                % tr_style)
    # fall back to default style
    return ''
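
For example (hypothetical values; assumes `_get_tr_css` and `string_types` are in scope):

row = ('a', 1)
print(_get_tr_css(row, 'background: #eee'))  # 'background: #eee'
style = lambda r: 'color: red' if r[1] > 0 else ''
print(_get_tr_css(row, style))               # 'color: red'
print(_get_tr_css(row, None))                # ''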
Example No. 17
def __getitem__(self, f):
    if isinstance(f, int):
        idx = f
    elif f in self.flds:
        idx = self.flds.index(f)
    else:
        raise ArgumentError('item ' + repr(f) + ' not in fields ' +
                            repr(self.flds))
    try:
        return super(Record, self).__getitem__(idx)
    except IndexError:  # handle short rows
        return self.missing
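
Together with `__getattr__` above, this makes a `Record` addressable three ways (a sketch; the import path is an assumption):

from petl.util.base import Record  # assumed import path

rec = Record((1, 'x'), ['id', 'name'])
print(rec['name'])   # 'x'   (by field name)
print(rec.id)        # 1     (by attribute)
print(rec[0])        # 1     (by position)
short = Record((1,), ['id', 'name'])
print(short.name)    # None  (short row falls back to `missing`)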
Example No. 18
def search(table, *args, **kwargs):
    """
    Perform a regular expression search, returning rows that match a given
    pattern, either anywhere in the row or within a specific field. E.g.::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar', 'baz'],
        ...           ['orange', 12, 'oranges are nice fruit'],
        ...           ['mango', 42, 'I like them'],
        ...           ['banana', 74, 'lovely too'],
        ...           ['cucumber', 41, 'better than mango']]
        >>> # search any field
        ... table2 = etl.search(table1, '.g.')
        >>> table2
        +------------+-----+--------------------------+
        | foo        | bar | baz                      |
        +============+=====+==========================+
        | 'orange'   |  12 | 'oranges are nice fruit' |
        +------------+-----+--------------------------+
        | 'mango'    |  42 | 'I like them'            |
        +------------+-----+--------------------------+
        | 'cucumber' |  41 | 'better than mango'      |
        +------------+-----+--------------------------+

        >>> # search a specific field
        ... table3 = etl.search(table1, 'foo', '.g.')
        >>> table3
        +----------+-----+--------------------------+
        | foo      | bar | baz                      |
        +==========+=====+==========================+
        | 'orange' |  12 | 'oranges are nice fruit' |
        +----------+-----+--------------------------+
        | 'mango'  |  42 | 'I like them'            |
        +----------+-----+--------------------------+

    The complement can be found via
    :func:`petl.transform.regex.searchcomplement`.

    """

    if len(args) == 1:
        field = None
        pattern = args[0]
    elif len(args) == 2:
        field = args[0]
        pattern = args[1]
    else:
        raise ArgumentError('expected 1 or 2 positional arguments')
    return SearchView(table, pattern, field=field, **kwargs)
Example No. 19
def itertextindex(index_or_dirname, indexname, docnum_field):
    import whoosh.index

    if isinstance(index_or_dirname, string_types):
        dirname = index_or_dirname
        index = whoosh.index.open_dir(dirname,
                                      indexname=indexname,
                                      readonly=True)
        needs_closing = True
    elif isinstance(index_or_dirname, whoosh.index.Index):
        index = index_or_dirname
        needs_closing = False
    else:
        raise ArgumentError('expected string or index, found %r' %
                            index_or_dirname)

    try:

        if docnum_field is None:

            # figure out the field names
            hdr = tuple(index.schema.stored_names())
            yield hdr

            # yield all documents
            astuple = operator.itemgetter(*index.schema.stored_names())
            for _, stored_fields_dict in index.reader().iter_docs():
                yield astuple(stored_fields_dict)

        else:

            # figure out the field names
            hdr = (docnum_field, ) + tuple(index.schema.stored_names())
            yield hdr

            # yield all documents
            astuple = operator.itemgetter(*index.schema.stored_names())
            for docnum, stored_fields_dict in index.reader().iter_docs():
                yield (docnum, ) + astuple(stored_fields_dict)

    finally:
        if needs_closing:
            # close the index if we're the ones who opened it
            index.close()
Example No. 20
def keys_from_args(left, right, key, lkey, rkey):

    if key is lkey is rkey is None:
        # no keys specified, attempt natural join
        lkey = rkey = natural_key(left, right)
    elif key is not None and lkey is rkey is None:
        # common key specified
        lkey = rkey = key
    elif key is None and lkey is not None and rkey is not None:
        # left and right keys specified
        pass
    else:
        raise ArgumentError(
            'bad key arguments: either specify key, or specify both lkey and '
            'rkey, or provide no key/lkey/rkey arguments at all (natural join)'
        )
    return lkey, rkey
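
For example (hypothetical tables; assumes `keys_from_args` and `natural_key` above are in scope):

left = [('id', 'colour'), (1, 'blue')]
right = [('id', 'shape'), (1, 'circle')]
print(keys_from_args(left, right, 'id', None, None))  # ('id', 'id')
print(keys_from_args(left, right, None, 'id', 'id'))  # ('id', 'id')
keys_from_args(left, right, 'id', 'id', None)         # raises ArgumentError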
Example No. 21
def _execute(sql, dbo, commit):

    debug(sql)

    # need to deal with polymorphic dbo argument
    # what sort of duck is it?

    # does it quack like a standard DB-API 2.0 connection?
    if _is_dbapi_connection(dbo):
        debug('assuming %r is standard DB-API 2.0 connection', dbo)
        _execute_dbapi_connection(sql, dbo, commit)

    # does it quack like a standard DB-API 2.0 cursor?
    elif _is_dbapi_cursor(dbo):
        debug('assuming %r is standard DB-API 2.0 cursor', dbo)
        _execute_dbapi_cursor(sql, dbo, commit)

    # does it quack like an SQLAlchemy engine?
    elif _is_sqlalchemy_engine(dbo):
        debug('assuming %r is an instance of sqlalchemy.engine.base.Engine',
              dbo)
        _execute_sqlalchemy_engine(sql, dbo, commit)

    # does it quack like an SQLAlchemy session?
    elif _is_sqlalchemy_session(dbo):
        debug('assuming %r is an instance of sqlalchemy.orm.session.Session',
              dbo)
        _execute_sqlalchemy_session(sql, dbo, commit)

    # does it quack like an SQLAlchemy connection?
    elif _is_sqlalchemy_connection(dbo):
        debug(
            'assuming %r is an instance of '
            'sqlalchemy.engine.base.Connection', dbo)
        _execute_sqlalchemy_connection(sql, dbo, commit)

    elif callable(dbo):
        debug(
            'assuming %r is a function returning standard DB-API 2.0 cursor '
            'objects', dbo)
        _execute_dbapi_mkcurs(sql, dbo, commit)

    # some other sort of duck...
    else:
        raise ArgumentError('unsupported database object type: %r' % dbo)
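
A minimal sketch of the dispatch in action (assumes `_execute` and its private helpers are importable; a `sqlite3` connection quacks like a standard DB-API 2.0 connection):

import sqlite3

conn = sqlite3.connect(':memory:')
_execute('CREATE TABLE t (x INTEGER)', conn, commit=True)
_execute('INSERT INTO t VALUES (1)', conn, commit=True)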
Example No. 22
def itercapture(source, field, pattern, newfields, include_original, flags,
                fill):
    it = iter(source)
    prog = re.compile(pattern, flags)

    hdr = next(it)
    flds = list(map(text_type, hdr))
    if isinstance(field, int) and field < len(hdr):
        field_index = field
    elif field in flds:
        field_index = flds.index(field)
    else:
        raise ArgumentError(
            'field invalid: must be either field name or index')

    # determine output fields
    outhdr = list(flds)
    if not include_original:
        outhdr.remove(field)
    if newfields:
        outhdr.extend(newfields)
    yield tuple(outhdr)

    # construct the output data
    for row in it:
        value = row[field_index]
        if include_original:
            out_row = list(row)
        else:
            out_row = [v for i, v in enumerate(row) if i != field_index]
        match = prog.search(value)
        if match is None:
            if fill is not None:
                # map the row to a dict so callable fill items can use it
                rec = dict(zip(flds, row))
                out_row.extend(item(rec) if callable(item) else item
                               for item in fill)
            else:
                raise TransformError('value %r did not match pattern %r' %
                                     (value, pattern))
        else:
            out_row.extend(match.groups())
        yield tuple(out_row)
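
For example (a sketch assuming `itercapture` and its imports are in scope), capturing letter and digit groups from a 'variable' field:

table = [('id', 'variable'),
         (1, 'A1'),
         (2, 'B2')]
for row in itercapture(table, 'variable', r'(\w)(\d)', ['letter', 'number'],
                       include_original=False, flags=0, fill=None):
    print(row)
# ('id', 'letter', 'number')
# (1, 'A', '1')
# (2, 'B', '2')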
Example No. 23
def _todb(table, dbo, tablename, schema=None, commit=True, truncate=False):

    # need to deal with polymorphic dbo argument
    # what sort of duck is it?

    # does it quack like a standard DB-API 2.0 connection?
    if _is_dbapi_connection(dbo):
        debug('assuming %r is standard DB-API 2.0 connection', dbo)
        _todb_dbapi_connection(table, dbo, tablename, schema=schema,
                               commit=commit, truncate=truncate)

    # does it quack like a standard DB-API 2.0 cursor?
    elif _is_dbapi_cursor(dbo):
        debug('assuming %r is standard DB-API 2.0 cursor', dbo)
        _todb_dbapi_cursor(table, dbo, tablename, schema=schema, commit=commit,
                           truncate=truncate)

    # does it quack like an SQLAlchemy engine?
    elif _is_sqlalchemy_engine(dbo):
        debug('assuming %r instance of sqlalchemy.engine.base.Engine', dbo)
        _todb_sqlalchemy_engine(table, dbo, tablename, schema=schema,
                                commit=commit, truncate=truncate)

    # does it quack like an SQLAlchemy session?
    elif _is_sqlalchemy_session(dbo):
        debug('assuming %r instance of sqlalchemy.orm.session.Session', dbo)
        _todb_sqlalchemy_session(table, dbo, tablename, schema=schema,
                                 commit=commit, truncate=truncate)

    # does it quack like an SQLAlchemy connection?
    elif _is_sqlalchemy_connection(dbo):
        debug('assuming %r instance of sqlalchemy.engine.base.Connection', dbo)
        _todb_sqlalchemy_connection(table, dbo, tablename, schema=schema,
                                    commit=commit, truncate=truncate)

    elif callable(dbo):
        debug('assuming %r is a function returning standard DB-API 2.0 cursor '
              'objects', dbo)
        _todb_dbapi_mkcurs(table, dbo, tablename, schema=schema, commit=commit,
                           truncate=truncate)

    # some other sort of duck...
    else:
        raise ArgumentError('unsupported database object type: %r' % dbo)
Example No. 24
File: db.py Project: larissarmp/TCC
    def __iter__(self):

        # does it quack like a standard DB-API 2.0 connection?
        if _is_dbapi_connection(self.dbo):
            debug('assuming %r is standard DB-API 2.0 connection', self.dbo)
            _iter = _iter_dbapi_connection

        # does it quack like a standard DB-API 2.0 cursor?
        elif _is_dbapi_cursor(self.dbo):
            debug('assuming %r is standard DB-API 2.0 cursor', self.dbo)
            warning('using a DB-API cursor with fromdb() is not recommended '
                    'and may lead to unexpected results, a DB-API connection '
                    'is better')
            _iter = _iter_dbapi_cursor

        # does it quack like an SQLAlchemy engine?
        elif _is_sqlalchemy_engine(self.dbo):
            debug('assuming %r instance of sqlalchemy.engine.base.Engine',
                  self.dbo)
            _iter = _iter_sqlalchemy_engine

        # does it quack like an SQLAlchemy session?
        elif _is_sqlalchemy_session(self.dbo):
            debug('assuming %r instance of sqlalchemy.orm.session.Session',
                  self.dbo)
            _iter = _iter_sqlalchemy_session

        # does it quack like an SQLAlchemy connection?
        elif _is_sqlalchemy_connection(self.dbo):
            debug('assuming %r instance of sqlalchemy.engine.base.Connection',
                  self.dbo)
            _iter = _iter_sqlalchemy_connection

        elif callable(self.dbo):
            debug('assuming %r is a function returning a cursor', self.dbo)
            _iter = _iter_dbapi_mkcurs

        # some other sort of duck...
        else:
            raise ArgumentError('unsupported database object type: %r' %
                                self.dbo)

        return _iter(self.dbo, self.query, *self.args, **self.kwargs)
Example No. 25
def __init__(self,
             source,
             converters=None,
             failonerror=False,
             errorvalue=None,
             where=None,
             pass_row=False):
    self.source = source
    if converters is None:
        self.converters = dict()
    elif isinstance(converters, dict):
        self.converters = converters
    elif isinstance(converters, (tuple, list)):
        self.converters = dict(enumerate(converters))
    else:
        raise ArgumentError('unexpected converters: %r' % converters)
    self.failonerror = failonerror
    self.errorvalue = errorvalue
    self.where = where
    self.pass_row = pass_row
Example No. 26
def itersplitdown(table, field, pattern, maxsplit, flags):

    prog = re.compile(pattern, flags)
    it = iter(table)
    hdr = next(it)
    flds = list(map(text_type, hdr))

    if isinstance(field, int) and field < len(hdr):
        field_index = field
        field = hdr[field_index]
    elif field in flds:
        field_index = flds.index(field)
    else:
        raise ArgumentError(
            'field invalid: must be either field name or index')

    yield tuple(hdr)

    for row in it:
        value = row[field_index]
        for v in prog.split(value, maxsplit):
            yield tuple(v if i == field_index else row[i]
                        for i in range(len(hdr)))
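
For example (a sketch assuming `itersplitdown` and its imports are in scope), a comma-separated field fans out into multiple rows:

table = [('id', 'parts'),
         (1, 'a,b'),
         (2, 'c')]
for row in itersplitdown(table, 'parts', ',', maxsplit=0, flags=0):
    print(row)
# ('id', 'parts')
# (1, 'a')
# (1, 'b')
# (2, 'c')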
Example No. 27
def iterfieldconvert(source, converters, failonerror, errorvalue, where,
                     pass_row):

    # grab the fields in the source table
    it = iter(source)
    hdr = next(it)
    flds = list(map(text_type, hdr))
    yield tuple(hdr)  # these are not modified

    # build converter functions
    converter_functions = dict()
    for k, c in converters.items():

        # turn field names into row indices
        if not isinstance(k, integer_types):
            try:
                k = flds.index(k)
            except ValueError:  # not in list
                raise FieldSelectionError(k)
        assert isinstance(k, int), 'expected integer, found %r' % k

        # is converter a function?
        if callable(c):
            converter_functions[k] = c

        # is converter a method name?
        elif isinstance(c, string_types):
            converter_functions[k] = methodcaller(c)

        # is converter a method name with arguments?
        elif isinstance(c, (tuple, list)) and isinstance(c[0], string_types):
            methnm = c[0]
            methargs = c[1:]
            converter_functions[k] = methodcaller(methnm, *methargs)

        # is converter a dictionary?
        elif isinstance(c, dict):
            converter_functions[k] = dictconverter(c)

        # is it something else?
        elif c is None:
            pass  # ignore
        else:
            raise ArgumentError(
                'unexpected converter specification on field %r: %r' % (k, c))

    # define a function to transform a value
    def transform_value(i, v, *args):
        if i not in converter_functions:
            # no converter defined on this field, return value as-is
            return v
        else:
            try:
                return converter_functions[i](v, *args)
            except Exception as e:
                if failonerror:
                    raise e
                else:
                    return errorvalue

    # define a function to transform a row
    if pass_row:

        def transform_row(_row):
            return tuple(
                transform_value(i, v, _row) for i, v in enumerate(_row))
    else:

        def transform_row(_row):
            return tuple(transform_value(i, v) for i, v in enumerate(_row))

    # prepare where function
    if isinstance(where, string_types):
        where = expr(where)
    elif where is not None:
        assert callable(where), 'expected callable for "where" argument, ' \
                                'found %r' % where

    # prepare iterator
    if pass_row or where:
        # wrap rows as records
        it = (Record(row, flds) for row in it)

    # construct the data rows
    if where is None:
        # simple case, transform all rows
        for row in it:
            yield transform_row(row)
    else:
        # conditionally transform rows
        for row in it:
            if where(row):
                yield transform_row(row)
            else:
                yield row
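
A minimal sketch of the converter machinery above (assumes `iterfieldconvert` and its helpers, including `expr`, `Record`, `methodcaller`, `dictconverter` and petl's compat aliases, are in scope):

table = [('foo', 'bar'), ('a', '1'), ('b', '2')]
for row in iterfieldconvert(table, {'bar': int}, failonerror=True,
                            errorvalue=None, where=None, pass_row=False):
    print(row)
# ('foo', 'bar')
# ('a', 1)
# ('b', 2)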
Example No. 28
def itersearchindex(index_or_dirname, query, limit, pagenum, pagelen,
                    indexname, docnum_field, score_field, fieldboosts,
                    search_kwargs):
    import whoosh.index
    import whoosh.query
    import whoosh.qparser

    if not search_kwargs:
        search_kwargs = dict()

    if isinstance(index_or_dirname, string_types):
        dirname = index_or_dirname
        index = whoosh.index.open_dir(dirname,
                                      indexname=indexname,
                                      readonly=True)
        needs_closing = True
    elif isinstance(index_or_dirname, whoosh.index.Index):
        index = index_or_dirname
        needs_closing = False
    else:
        raise ArgumentError('expected string or index, found %r' %
                            index_or_dirname)

    try:

        # figure out header
        hdr = tuple()
        if docnum_field is not None:
            hdr += (docnum_field, )
        if score_field is not None:
            hdr += (score_field, )
        stored_names = tuple(index.schema.stored_names())
        hdr += stored_names
        yield hdr

        # parse the query
        if isinstance(query, string_types):
            # search all fields by default
            parser = whoosh.qparser.MultifieldParser(index.schema.names(),
                                                     index.schema,
                                                     fieldboosts=fieldboosts)
            query = parser.parse(query)
        elif isinstance(query, whoosh.query.Query):
            pass
        else:
            raise ArgumentError(
                'expected string or whoosh.query.Query, found %r' % query)

        # make a function to turn docs into tuples
        astuple = operator.itemgetter(*index.schema.stored_names())

        with index.searcher() as searcher:
            if limit is not None:
                results = searcher.search(query, limit=limit, **search_kwargs)
            else:
                results = searcher.search_page(query,
                                               pagenum,
                                               pagelen=pagelen,
                                               **search_kwargs)

            if docnum_field is None and score_field is None:

                for doc in results:
                    yield astuple(doc)

            else:

                for (docnum, score), doc in izip(results.items(), results):
                    row = tuple()
                    if docnum_field is not None:
                        row += (docnum, )
                    if score_field is not None:
                        row += (score, )
                    row += astuple(doc)
                    yield row

    finally:
        if needs_closing:
            # close the index if we're the ones who opened it
            index.close()


Example No. 29
def totextindex(table,
                index_or_dirname,
                schema=None,
                indexname=None,
                merge=False,
                optimize=False):
    """
    Load all rows from `table` into a Whoosh index. N.B., this will clear any
    existing data in the index before loading. E.g.::

        >>> import petl as etl
        >>> import datetime
        >>> import os
        >>> # here is the table we want to load into an index
        ... table = (('f0', 'f1', 'f2', 'f3', 'f4'),
        ...          ('AAA', 12, 4.3, True, datetime.datetime.now()),
        ...          ('BBB', 6, 3.4, False, datetime.datetime(1900, 1, 31)),
        ...          ('CCC', 42, 7.8, True, datetime.datetime(2100, 12, 25)))
        >>> # define a schema for the index
        ... from whoosh.fields import *
        >>> schema = Schema(f0=TEXT(stored=True),
        ...                 f1=NUMERIC(int, stored=True),
        ...                 f2=NUMERIC(float, stored=True),
        ...                 f3=BOOLEAN(stored=True),
        ...                 f4=DATETIME(stored=True))
        >>> # load index
        ... dirname = 'example.whoosh'
        >>> if not os.path.exists(dirname):
        ...     os.mkdir(dirname)
        ...
        >>> etl.totextindex(table, dirname, schema=schema)

    Keyword arguments:

    table
        A table container with the data to be loaded.
    index_or_dirname
        Either an instance of `whoosh.index.Index` or a string containing the
        directory path where the index is to be stored.
    schema
        Index schema to use if creating the index.
    indexname
        String containing the name of the index, if multiple indexes are stored
        in the same directory.
    merge
        Merge small segments during commit?
    optimize
        Merge all segments together?

    """
    import whoosh.index
    import whoosh.writing

    # deal with polymorphic argument
    if isinstance(index_or_dirname, string_types):
        dirname = index_or_dirname
        index = whoosh.index.create_in(dirname, schema, indexname=indexname)
        needs_closing = True
    elif isinstance(index_or_dirname, whoosh.index.Index):
        index = index_or_dirname
        needs_closing = False
    else:
        raise ArgumentError('expected string or index, found %r' %
                            index_or_dirname)

    writer = index.writer()
    try:

        for d in dicts(table):
            writer.add_document(**d)
        writer.commit(merge=merge,
                      optimize=optimize,
                      mergetype=whoosh.writing.CLEAR)

    except Exception:
        writer.cancel()
        raise

    finally:
        if needs_closing:
            index.close()
Example No. 30
def aggregate(table,
              key,
              aggregation=None,
              value=None,
              presorted=False,
              buffersize=None,
              tempdir=None,
              cache=True):
    """Group rows under the given key then apply aggregation functions.
    E.g.::

        >>> import petl as etl
        >>>
        >>> table1 = [['foo', 'bar', 'baz'],
        ...           ['a', 3, True],
        ...           ['a', 7, False],
        ...           ['b', 2, True],
        ...           ['b', 2, False],
        ...           ['b', 9, False],
        ...           ['c', 4, True]]
        >>> # aggregate whole rows
        ... table2 = etl.aggregate(table1, 'foo', len)
        >>> table2
        +-----+-------+
        | foo | value |
        +=====+=======+
        | 'a' |     2 |
        +-----+-------+
        | 'b' |     3 |
        +-----+-------+
        | 'c' |     1 |
        +-----+-------+

        >>> # aggregate single field
        ... table3 = etl.aggregate(table1, 'foo', sum, 'bar')
        >>> table3
        +-----+-------+
        | foo | value |
        +=====+=======+
        | 'a' |    10 |
        +-----+-------+
        | 'b' |    13 |
        +-----+-------+
        | 'c' |     4 |
        +-----+-------+

        >>> # alternative signature using keyword args
        ... table4 = etl.aggregate(table1, key=('foo', 'bar'),
        ...                        aggregation=list, value=('bar', 'baz'))
        >>> table4
        +-----+-----+-------------------------+
        | foo | bar | value                   |
        +=====+=====+=========================+
        | 'a' |   3 | [(3, True)]             |
        +-----+-----+-------------------------+
        | 'a' |   7 | [(7, False)]            |
        +-----+-----+-------------------------+
        | 'b' |   2 | [(2, True), (2, False)] |
        +-----+-----+-------------------------+
        | 'b' |   9 | [(9, False)]            |
        +-----+-----+-------------------------+
        | 'c' |   4 | [(4, True)]             |
        +-----+-----+-------------------------+

        >>> # aggregate multiple fields
        ... from collections import OrderedDict
        >>> import petl as etl
        >>>
        >>> aggregation = OrderedDict()
        >>> aggregation['count'] = len
        >>> aggregation['minbar'] = 'bar', min
        >>> aggregation['maxbar'] = 'bar', max
        >>> aggregation['sumbar'] = 'bar', sum
        >>> # default aggregation function is list
        ... aggregation['listbar'] = 'bar'
        >>> aggregation['listbarbaz'] = ('bar', 'baz'), list
        >>> aggregation['bars'] = 'bar', etl.strjoin(', ')
        >>> table5 = etl.aggregate(table1, 'foo', aggregation)
        >>> table5
        +-----+-------+--------+--------+--------+-----------+-------------------------------------+-----------+
        | foo | count | minbar | maxbar | sumbar | listbar   | listbarbaz                          | bars      |
        +=====+=======+========+========+========+===========+=====================================+===========+
        | 'a' |     2 |      3 |      7 |     10 | [3, 7]    | [(3, True), (7, False)]             | '3, 7'    |
        +-----+-------+--------+--------+--------+-----------+-------------------------------------+-----------+
        | 'b' |     3 |      2 |      9 |     13 | [2, 2, 9] | [(2, True), (2, False), (9, False)] | '2, 2, 9' |
        +-----+-------+--------+--------+--------+-----------+-------------------------------------+-----------+
        | 'c' |     1 |      4 |      4 |      4 | [4]       | [(4, True)]                         | '4'       |
        +-----+-------+--------+--------+--------+-----------+-------------------------------------+-----------+

    If `presorted` is True, it is assumed that the data are already sorted by
    the given key, and the `buffersize`, `tempdir` and `cache` arguments are
    ignored. Otherwise, the data are sorted; see also the discussion of the
    `buffersize`, `tempdir` and `cache` arguments under the
    :func:`petl.transform.sorts.sort` function.

    """

    if callable(aggregation):
        return SimpleAggregateView(table,
                                   key,
                                   aggregation=aggregation,
                                   value=value,
                                   presorted=presorted,
                                   buffersize=buffersize,
                                   tempdir=tempdir,
                                   cache=cache)
    elif aggregation is None or isinstance(aggregation, (list, tuple, dict)):
        # ignore value arg
        return MultiAggregateView(table,
                                  key,
                                  aggregation=aggregation,
                                  presorted=presorted,
                                  buffersize=buffersize,
                                  tempdir=tempdir,
                                  cache=cache)
    else:
        raise ArgumentError(
            'expected aggregation to be callable, list, tuple, dict '
            'or None')