def test_chunk(self):
    content = io.StringIO('Iñtërnâtiônàližætiøn')
    nt.assert_equal('Iñtër', next(ft.chunk(content, 5)))
    nt.assert_equal('nâtiônàližætiøn', next(ft.chunk(content)))

    url = 'http://google.com'
    body = '<!doctype html><html itemtype="http://schema.org/page">'
    responses.add(responses.GET, url=url, body=body)
    r = requests.get(url, stream=True)

    # http://docs.python-requests.org/en/latest/api/
    # The chunk size is the number of bytes it should read into
    # memory. This is not necessarily the length of each item returned
    # as decoding can take place.
    nt.assert_equal(20, len(next(ft.chunk(r.iter_content, 20, 29, 200))))
    nt.assert_equal(55, len(next(ft.chunk(r.iter_content))))
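The assertions above imply that `ft.chunk` accepts three input shapes: file-like objects, callables such as `r.iter_content`, and plain iterables, and that it lazily yields pieces until one comes back empty. Below is a minimal sketch consistent with those tests; the branching and defaults are inferred from the assertions, not taken from the library's actual source.

import itertools as it


def chunk(content, chunksize=None, start=0, stop=None):
    """Sketch of a chunker inferred from the test above (not the real ft.chunk)."""
    if hasattr(content, 'read'):
        # file-like object: read fixed-size pieces, or everything at once
        # (`stop` handling for file-likes is omitted in this sketch)
        if start:
            content.seek(start)

        if chunksize:
            generator = (content.read(chunksize) for _ in it.count())
        else:
            generator = iter([content.read()])
    elif callable(content):
        # a callable such as `response.iter_content`: flatten the streamed
        # pieces into single bytes, apply start/stop, then regroup
        stream = it.islice(
            it.chain.from_iterable(content(chunksize)), start, stop)
        generator = (bytes(it.islice(stream, chunksize)) for _ in it.count())
    else:
        # plain iterable: emit `chunksize`-item lists
        sliced = it.islice(iter(content), start, stop)
        generator = (list(it.islice(sliced, chunksize)) for _ in it.count())

    # stop at the first empty chunk
    return it.takewhile(bool, generator)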
def gen_groups(self, records, chunksize=None):
    for chnk in chunk(records, chunksize):
        # next(...) instead of the Python 2-only .next() method
        cleansed = [
            {k: next(xmlize([v])) for k, v in c.items()} for c in chnk]
        keyfunc = self.id if self.is_split else self.account

        for group in utils.group_transactions(cleansed, keyfunc):
            yield group
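`utils.group_transactions` isn't shown here; presumably it yields one group of records per key. A minimal stand-in using `itertools.groupby` follows; the name, signature, and `(key, records)` return shape are assumptions.

import itertools as it


def group_transactions(transactions, keyfunc):
    # hypothetical stand-in: sort first, since groupby only merges
    # consecutive items, then yield (key, records) pairs
    for key, group in it.groupby(sorted(transactions, key=keyfunc), keyfunc):
        yield (key, list(group))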
def populate(): """Populates db with data""" limit = 0 with app.app_context(): table_name = 'Data' table = getattr(models, table_name) row_limit = app.config['ROW_LIMIT'] chunk_size = min(row_limit or 'inf', app.config['CHUNK_SIZE']) debug, test = app.config['DEBUG'], app.config['TESTING'] if test: createdb() data = utils.fetch_data(app.config) del_count = table.query.delete(synchronize_session=False) db.session.commit() if debug: print('Deleted %s records from the %s table...' % (del_count, table_name)) for records in ft.chunk(data, chunk_size): in_count = len(records) pprint(records[0]) limit += in_count if debug: print('Inserting %s records into the %s table...' % (in_count, table_name)) if test: pprint(records) db.engine.execute(table.__table__.insert(), records) if row_limit and limit >= row_limit: break if debug: print('Successfully inserted %s records into the %s table!' % (limit, table_name))
def populate(): """Populates db with data""" limit = 0 with app.app_context(): table_name = "Data" table = getattr(models, table_name) row_limit = app.config["ROW_LIMIT"] chunk_size = min(row_limit or "inf", app.config["CHUNK_SIZE"]) debug, test = app.config["DEBUG"], app.config["TESTING"] if test: createdb() data = utils.fetch_data(app.config) del_count = table.query.delete(synchronize_session=False) db.session.commit() if debug: print("Deleted %s records from the %s table..." % (del_count, table_name)) for records in ft.chunk(data, chunk_size): in_count = len(records) pprint(records[0]) limit += in_count if debug: print("Inserting %s records into the %s table..." % (in_count, table_name)) if test: pprint(records) db.engine.execute(table.__table__.insert(), records) if row_limit and limit >= row_limit: break if debug: print("Successfully inserted %s records into the %s table!" % (limit, table_name))
def populate(gen_data, engine, models=None, get_name=None, **kwargs):
    """Populates a SQLAlchemy db with data. Supports both declarative
    SQLAlchemy and Flask-SQLAlchemy.

    Note: Either `TABLES` or `KEY` must be defined.

    Args:
        gen_data (func): A function used to generate the data to be inserted
            into the db. It will receive keyword arguments created by
            combining `kwargs` with each table defined in `TABLES`.

        engine (obj): A SQLAlchemy engine.

        models (module): A models module of SQLAlchemy table classes
            (default: None).

        get_name (func): A function used to generate the table name if
            `TABLES` is unset. It will receive the name of each group
            obtained by grouping the data generated from `gen_data`
            (default: None).

        kwargs (dict): Keyword arguments passed to `gen_data`.

    Kwargs:
        mixin (class): Base table that dynamically created tables inherit
            from. Required if `TABLES` is unset.

        TABLES (list[dicts]): The table options. Required if `KEY` is unset.

        KEY (str): The field used to group data generated from `gen_data`.
            Required if `TABLES` is unset.

        ROW_LIMIT (int): The max total number of rows to process.

        CHUNK_SIZE (int): The max number of rows to process at one time.

        DEBUG (bool): Run in debug mode.

        TESTING (bool): Run in test mode.

    Examples:
        >>> # Test dynamic tables
        >>> from sqlalchemy import create_engine
        >>> class BaseMixin(object):
        ...     id = Column(Integer, primary_key=True)
        ...     value = Column(Integer)
        ...
        >>> meta = MetaData()
        >>> kwargs = {'KEY': 'kind', 'ROW_LIMIT': 4, 'mixin': BaseMixin}
        >>> f = lambda x: {'kind': 'odd' if x % 2 else 'even', 'value': x}
        >>> gen_data = lambda **x: map(f, range(15))
        >>> engine = create_engine('sqlite:///:memory:')
        >>> populate(gen_data, engine, **kwargs)
        >>> session = sessionmaker(engine)()
        >>> meta.reflect(engine)
        >>> tables = meta.sorted_tables
        >>> dict(session.query(tables[0]).all()) == {1: 0, 2: 2, 3: 4, 4: 6}
        True
        >>> dict(session.query(tables[1]).all()) == {1: 1, 2: 3, 3: 5, 4: 7}
        True
        >>> meta.drop_all(engine)
        >>>
        >>> # Test tables without specifying the `rid`
        >>> Base = declarative_base()
        >>> class Single(Base):
        ...     __tablename__ = 'single'
        ...     id = Column(Integer, primary_key=True)
        ...     rid = Column(Integer)
        ...     value = Column(Integer)
        ...
        >>> class Triple(Base):
        ...     __tablename__ = 'triple'
        ...     id = Column(Integer, primary_key=True)
        ...     rid = Column(Integer)
        ...     value = Column(Integer)
        ...
        >>> options = [
        ...     {'mul': 1, 'name': 'single'}, {'mul': 3, 'name': 'triple'}]
        >>> kwargs = {'TABLES': options, 'ROW_LIMIT': 4}
        >>> def gen_data(**x):
        ...     return ({'value': n * x['mul'], 'rid': n} for n in it.count())
        >>> Base.metadata.create_all(engine)
        >>> populate(gen_data, engine, **kwargs)
        >>> Base.metadata.reflect(engine)
        >>> tables = Base.metadata.sorted_tables
        >>> session.query(tables[0]).all()
        [(1, 0, 0), (2, 1, 1), (3, 2, 2), (4, 3, 3)]
        >>> session.query(tables[1]).all()
        [(1, 0, 0), (2, 1, 3), (3, 2, 6), (4, 3, 9)]
        >>>
        >>> # Test tables with a specified `rid`
        >>> populate(gen_data, engine, rid='rid', **kwargs)
        >>> Base.metadata.reflect(engine)
        >>> tables = Base.metadata.sorted_tables
        >>> session.query(tables[0]).all()
        [(1, 0, 0), (2, 1, 1), (3, 2, 2), (4, 3, 3)]
        >>> session.query(tables[1]).all()
        [(1, 0, 0), (2, 1, 3), (3, 2, 6), (4, 3, 9)]

    Returns:
        str: The message
    """
    log_level = logging.DEBUG if kwargs.get('DEBUG') else logging.INFO
    logger.setLevel(log_level)
    console_handler = logging.StreamHandler()
    logger.addHandler(console_handler)

    test = kwargs.get('TESTING')
    row_limit = kwargs.get('ROW_LIMIT')
    tables = kwargs.get('TABLES')

    # float('inf') rather than the string 'inf', which can't be compared
    # to an int in Python 3
    chunk_size = min(
        row_limit or float('inf'), kwargs.get('CHUNK_SIZE', row_limit))
    engine.session = sessionmaker(engine)()
    dynamic = not tables

    if test:
        meta.create_all(engine)

    if dynamic:
        data = gen_data(**kwargs)
        tables = get_tables(data, kwargs['KEY'])
        result_func = partial(get_dynamic_res, engine, get_name, **kwargs)
    elif models:
        result_func = partial(res_from_models, models, **kwargs)
    else:
        result_func = partial(res_from_meta, engine, **kwargs)

    for t in tables:
        count = 0
        data = data if dynamic else gen_data(**pr.merge([kwargs, t]))
        result = result_func(t, data=data)
        table, rid, data = result['table'], result['rid'], result['data']
        table.name = table.__table__.name
        table.query = engine.session.query(table)
        del_count = delete_records(table, rid, engine)

        if del_count:
            logger.debug(get_message(del_count, table.name))

        for records in ft.chunk(data, chunk_size):
            del_count, in_count = execute(records, engine, table, rid)
            count += in_count

            if del_count:
                logger.debug(get_message(del_count, table.name))

            logger.debug(get_message(in_count, table.name, False))

            if test:
                pprint(records)

            if row_limit and count >= row_limit:
                break

    logger.debug('Success! %s' % get_message(count, table.name, False))
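The loop above delegates per-chunk work to an `execute` helper that isn't shown. A hypothetical reconstruction follows, consistent with how its return value is unpacked; the delete-by-`rid` logic and the use of the pre-2.0 SQLAlchemy `engine.execute` API (which the other snippets also use) are assumptions.

def execute(records, engine, table, rid=None):
    # hypothetical sketch: delete rows about to be replaced (matched on
    # `rid`), then bulk-insert the chunk; returns (deleted, inserted)
    del_count = 0

    if rid:
        rids = {r[rid] for r in records if rid in r}
        column = table.__table__.c[rid]
        result = engine.execute(
            table.__table__.delete().where(column.in_(rids)))
        del_count = result.rowcount

    engine.execute(table.__table__.insert(), records)
    return del_count, len(records)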
def insert_records(self, resource_id, records, **kwargs):
    """Inserts records into a datastore table.

    Args:
        resource_id (str): The datastore resource id.
        records (List[dict]): The records to insert.
        **kwargs: Keyword arguments that are passed to datastore_upsert.

    Kwargs:
        method (str): Insert method. One of ['update', 'insert', 'upsert']
            (default: 'insert').
        force (bool): Create resource even if read-only.
        start (int): Row number to start from (zero indexed).
        stop (int): Row number to stop at (zero indexed).
        chunksize (int): Number of rows to write at a time.

    Returns:
        int: Number of records inserted.

    Raises:
        NotFound: If unable to find the resource.

    Examples:
        >>> CKAN(quiet=True).insert_records('rid', [{'field': 'value'}])
        Traceback (most recent call last):
        NotFound: Resource `rid` was not found in filestore.
    """
    recoded = pr.json_recode(records)
    chunksize = kwargs.pop('chunksize', 0)
    start = kwargs.pop('start', 0)
    stop = kwargs.pop('stop', None)
    kwargs.setdefault('force', self.force)
    kwargs.setdefault('method', 'insert')
    kwargs['resource_id'] = resource_id
    count = 1

    for chunk in ft.chunk(recoded, chunksize, start=start, stop=stop):
        length = len(chunk)

        if self.verbose:
            print(
                'Adding records %i - %i to resource %s...' % (
                    count, count + length - 1, resource_id))

        kwargs['records'] = chunk
        err_msg = 'Resource `%s` was not found in filestore.' % resource_id

        try:
            self.datastore_upsert(**kwargs)
        except requests.exceptions.ConnectionError as err:
            # str(err) instead of the Python 2-only err.message attribute
            if 'Broken pipe' in str(err):
                print('Chunksize too large. Try using a smaller chunksize.')
                return 0
            else:
                raise err
        except NotFound:
            # Keep exception message consistent with the others
            raise NotFound(err_msg)
        except ValidationError as err:
            if err.error_dict.get('resource_id') == ['Not found: Resource']:
                raise NotFound(err_msg)
            else:
                raise err

        count += length

    # count is a 1-indexed row counter, so subtract 1 to return the
    # number of records actually inserted
    return count - 1
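A hypothetical usage sketch; the resource id and record shape are placeholders, and only the `CKAN` constructor call shown in the doctest above is taken from the source.

# stream 10,000 records in 500-row batches, updating existing rows
ckan = CKAN(quiet=False)
records = [{'field': 'value %i' % i} for i in range(10000)]
inserted = ckan.insert_records(
    'a1b2c3-d4e5', records, chunksize=500, method='upsert')
print('%i records inserted' % inserted)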
def gen_groups(self, records, chunksize=None):
    for chnk in chunk(records, chunksize):
        keyfunc = self.id if self.is_split else self.account

        for group in utils.group_transactions(chnk, keyfunc):
            yield group