def test_chunk(self):
    content = io.StringIO('Iñtërnâtiônàližætiøn')
    nt.assert_equal('Iñtër', next(ft.chunk(content, 5)))
    nt.assert_equal('nâtiônàližætiøn', next(ft.chunk(content)))

    url = 'http://google.com'
    body = '<!doctype html><html itemtype="http://schema.org/page">'
    responses.add(responses.GET, url=url, body=body)
    r = requests.get(url, stream=True)

    # http://docs.python-requests.org/en/latest/api/
    # The chunk size is the number of bytes it should read into
    # memory. This is not necessarily the length of each item returned
    # as decoding can take place.
    nt.assert_equal(20, len(next(ft.chunk(r.iter_content, 20, 29, 200))))
    nt.assert_equal(55, len(next(ft.chunk(r.iter_content))))
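The assertions above imply that `ft.chunk` accepts three input shapes: file-like objects, callables such as `r.iter_content`, and plain iterables, and that it lazily yields pieces until one comes back empty. Below is a minimal sketch consistent with those tests; the branching and defaults are inferred from the assertions, not taken from the library's actual source.

import itertools as it


def chunk(content, chunksize=None, start=0, stop=None):
    """Sketch of a chunker inferred from the test above (not the real ft.chunk)."""
    if hasattr(content, 'read'):
        # file-like object: read fixed-size pieces, or everything at once
        # (`stop` handling for file-likes is omitted in this sketch)
        if start:
            content.seek(start)

        if chunksize:
            generator = (content.read(chunksize) for _ in it.count())
        else:
            generator = iter([content.read()])
    elif callable(content):
        # a callable such as `response.iter_content`: flatten the streamed
        # pieces into single bytes, apply start/stop, then regroup
        stream = it.islice(
            it.chain.from_iterable(content(chunksize)), start, stop)
        generator = (bytes(it.islice(stream, chunksize)) for _ in it.count())
    else:
        # plain iterable: emit `chunksize`-item lists
        sliced = it.islice(iter(content), start, stop)
        generator = (list(it.islice(sliced, chunksize)) for _ in it.count())

    # stop at the first empty chunk
    return it.takewhile(bool, generator)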
def gen_groups(self, records, chunksize=None):
    for chnk in chunk(records, chunksize):
        # next(...) instead of the Python 2-only .next() method
        cleansed = [
            {k: next(xmlize([v])) for k, v in c.items()} for c in chnk]
        keyfunc = self.id if self.is_split else self.account

        for group in utils.group_transactions(cleansed, keyfunc):
            yield group
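`utils.group_transactions` isn't shown here; presumably it yields one group of records per key. A minimal stand-in using `itertools.groupby` follows; the name, signature, and `(key, records)` return shape are assumptions.

import itertools as it


def group_transactions(transactions, keyfunc):
    # hypothetical stand-in: sort first, since groupby only merges
    # consecutive items, then yield (key, records) pairs
    for key, group in it.groupby(sorted(transactions, key=keyfunc), keyfunc):
        yield (key, list(group))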
def populate(): """Populates db with data""" limit = 0 with app.app_context(): table_name = 'Data' table = getattr(models, table_name) row_limit = app.config['ROW_LIMIT'] chunk_size = min(row_limit or 'inf', app.config['CHUNK_SIZE']) debug, test = app.config['DEBUG'], app.config['TESTING'] if test: createdb() data = utils.fetch_data(app.config) del_count = table.query.delete(synchronize_session=False) db.session.commit() if debug: print('Deleted %s records from the %s table...' % (del_count, table_name)) for records in ft.chunk(data, chunk_size): in_count = len(records) pprint(records[0]) limit += in_count if debug: print('Inserting %s records into the %s table...' % (in_count, table_name)) if test: pprint(records) db.engine.execute(table.__table__.insert(), records) if row_limit and limit >= row_limit: break if debug: print('Successfully inserted %s records into the %s table!' % (limit, table_name))
def populate(): """Populates db with data""" limit = 0 with app.app_context(): table_name = "Data" table = getattr(models, table_name) row_limit = app.config["ROW_LIMIT"] chunk_size = min(row_limit or "inf", app.config["CHUNK_SIZE"]) debug, test = app.config["DEBUG"], app.config["TESTING"] if test: createdb() data = utils.fetch_data(app.config) del_count = table.query.delete(synchronize_session=False) db.session.commit() if debug: print("Deleted %s records from the %s table..." % (del_count, table_name)) for records in ft.chunk(data, chunk_size): in_count = len(records) pprint(records[0]) limit += in_count if debug: print("Inserting %s records into the %s table..." % (in_count, table_name)) if test: pprint(records) db.engine.execute(table.__table__.insert(), records) if row_limit and limit >= row_limit: break if debug: print("Successfully inserted %s records into the %s table!" % (limit, table_name))
def populate(gen_data, engine, models=None, get_name=None, **kwargs):
    """Populates a SQLAlchemy db with data. Supports both declarative
    SQLAlchemy and Flask-SQLAlchemy.

    Note: Either `TABLES` or `KEY` must be defined.

    Args:
        gen_data (func): A function used to generate the data to be inserted
            into the db. It will receive keyword arguments created by
            combining `kwargs` with each table defined in `TABLES`.

        engine (obj): A SQLAlchemy engine.

        models (module): A models module of SQLAlchemy table classes
            (default: None).

        get_name (func): A function used to generate the table name if
            `TABLES` is unset. It will receive the name of each group
            obtained by grouping the data generated from `gen_data`
            (default: None).

        kwargs (dict): Keyword arguments passed to `gen_data`.

    Kwargs:
        mixin (class): Base table that dynamically created tables inherit
            from. Required if `TABLES` is unset.

        TABLES (list[dicts]): The table options. Required if `KEY` is unset.

        KEY (str): The field used to group data generated from `gen_data`.
            Required if `TABLES` is unset.

        ROW_LIMIT (int): The max total number of rows to process.

        CHUNK_SIZE (int): The max number of rows to process at one time.

        DEBUG (bool): Run in debug mode.

        TESTING (bool): Run in test mode.

    Examples:
        >>> # Test dynamic tables
        >>> from sqlalchemy import create_engine
        >>> class BaseMixin(object):
        ...     id = Column(Integer, primary_key=True)
        ...     value = Column(Integer)
        ...
        >>> meta = MetaData()
        >>> kwargs = {'KEY': 'kind', 'ROW_LIMIT': 4, 'mixin': BaseMixin}
        >>> f = lambda x: {'kind': 'odd' if x % 2 else 'even', 'value': x}
        >>> gen_data = lambda **x: map(f, range(15))
        >>> engine = create_engine('sqlite:///:memory:')
        >>> populate(gen_data, engine, **kwargs)
        >>> session = sessionmaker(engine)()
        >>> meta.reflect(engine)
        >>> tables = meta.sorted_tables
        >>> dict(session.query(tables[0]).all()) == {1: 0, 2: 2, 3: 4, 4: 6}
        True
        >>> dict(session.query(tables[1]).all()) == {1: 1, 2: 3, 3: 5, 4: 7}
        True
        >>> meta.drop_all(engine)
        >>>
        >>> # Test tables without specifying the `rid`
        >>> Base = declarative_base()
        >>> class Single(Base):
        ...     __tablename__ = 'single'
        ...     id = Column(Integer, primary_key=True)
        ...     rid = Column(Integer)
        ...     value = Column(Integer)
        ...
        >>> class Triple(Base):
        ...     __tablename__ = 'triple'
        ...     id = Column(Integer, primary_key=True)
        ...     rid = Column(Integer)
        ...     value = Column(Integer)
        ...
        >>> options = [
        ...     {'mul': 1, 'name': 'single'}, {'mul': 3, 'name': 'triple'}]
        >>> kwargs = {'TABLES': options, 'ROW_LIMIT': 4}
        >>> def gen_data(**x):
        ...     return ({'value': n * x['mul'], 'rid': n} for n in it.count())
        >>> Base.metadata.create_all(engine)
        >>> populate(gen_data, engine, **kwargs)
        >>> Base.metadata.reflect(engine)
        >>> tables = Base.metadata.sorted_tables
        >>> session.query(tables[0]).all()
        [(1, 0, 0), (2, 1, 1), (3, 2, 2), (4, 3, 3)]
        >>> session.query(tables[1]).all()
        [(1, 0, 0), (2, 1, 3), (3, 2, 6), (4, 3, 9)]
        >>>
        >>> # Test tables with a specified `rid`
        >>> populate(gen_data, engine, rid='rid', **kwargs)
        >>> Base.metadata.reflect(engine)
        >>> tables = Base.metadata.sorted_tables
        >>> session.query(tables[0]).all()
        [(1, 0, 0), (2, 1, 1), (3, 2, 2), (4, 3, 3)]
        >>> session.query(tables[1]).all()
        [(1, 0, 0), (2, 1, 3), (3, 2, 6), (4, 3, 9)]

    Returns:
        str: The message
    """
    log_level = logging.DEBUG if kwargs.get('DEBUG') else logging.INFO
    logger.setLevel(log_level)
    console_handler = logging.StreamHandler()
    logger.addHandler(console_handler)

    test = kwargs.get('TESTING')
    row_limit = kwargs.get('ROW_LIMIT')
    tables = kwargs.get('TABLES')

    # float('inf') rather than the string 'inf', which can't be compared
    # to an int in Python 3
    chunk_size = min(
        row_limit or float('inf'), kwargs.get('CHUNK_SIZE', row_limit))
    engine.session = sessionmaker(engine)()
    dynamic = not tables

    if test:
        meta.create_all(engine)

    if dynamic:
        data = gen_data(**kwargs)
        tables = get_tables(data, kwargs['KEY'])
        result_func = partial(get_dynamic_res, engine, get_name, **kwargs)
    elif models:
        result_func = partial(res_from_models, models, **kwargs)
    else:
        result_func = partial(res_from_meta, engine, **kwargs)

    for t in tables:
        count = 0
        data = data if dynamic else gen_data(**pr.merge([kwargs, t]))
        result = result_func(t, data=data)
        table, rid, data = result['table'], result['rid'], result['data']
        table.name = table.__table__.name
        table.query = engine.session.query(table)
        del_count = delete_records(table, rid, engine)

        if del_count:
            logger.debug(get_message(del_count, table.name))

        for records in ft.chunk(data, chunk_size):
            del_count, in_count = execute(records, engine, table, rid)
            count += in_count

            if del_count:
                logger.debug(get_message(del_count, table.name))

            logger.debug(get_message(in_count, table.name, False))

            if test:
                pprint(records)

            if row_limit and count >= row_limit:
                break

    logger.debug('Success! %s' % get_message(count, table.name, False))
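The loop above delegates per-chunk work to an `execute` helper that isn't shown. A hypothetical reconstruction follows, consistent with how its return value is unpacked; the delete-by-`rid` logic and the use of the pre-2.0 SQLAlchemy `engine.execute` API (which the other snippets also use) are assumptions.

def execute(records, engine, table, rid=None):
    # hypothetical sketch: delete rows about to be replaced (matched on
    # `rid`), then bulk-insert the chunk; returns (deleted, inserted)
    del_count = 0

    if rid:
        rids = {r[rid] for r in records if rid in r}
        column = table.__table__.c[rid]
        result = engine.execute(
            table.__table__.delete().where(column.in_(rids)))
        del_count = result.rowcount

    engine.execute(table.__table__.insert(), records)
    return del_count, len(records)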
def insert_records(self, resource_id, records, **kwargs):
    """Inserts records into a datastore table.

    Args:
        resource_id (str): The datastore resource id.
        records (List[dict]): The records to insert.
        **kwargs: Keyword arguments that are passed to datastore_upsert.

    Kwargs:
        method (str): Insert method. One of ['update', 'insert', 'upsert']
            (default: 'insert').
        force (bool): Create resource even if read-only.
        start (int): Row number to start from (zero indexed).
        stop (int): Row number to stop at (zero indexed).
        chunksize (int): Number of rows to write at a time.

    Returns:
        int: Number of records inserted.

    Raises:
        NotFound: If unable to find the resource.

    Examples:
        >>> CKAN(quiet=True).insert_records('rid', [{'field': 'value'}])
        Traceback (most recent call last):
        NotFound: Resource `rid` was not found in filestore.
    """
    recoded = pr.json_recode(records)
    chunksize = kwargs.pop('chunksize', 0)
    start = kwargs.pop('start', 0)
    stop = kwargs.pop('stop', None)
    kwargs.setdefault('force', self.force)
    kwargs.setdefault('method', 'insert')
    kwargs['resource_id'] = resource_id
    count = 1

    for chunk in ft.chunk(recoded, chunksize, start=start, stop=stop):
        length = len(chunk)

        if self.verbose:
            print(
                'Adding records %i - %i to resource %s...' % (
                    count, count + length - 1, resource_id))

        kwargs['records'] = chunk
        err_msg = 'Resource `%s` was not found in filestore.' % resource_id

        try:
            self.datastore_upsert(**kwargs)
        except requests.exceptions.ConnectionError as err:
            # str(err) instead of the Python 2-only err.message attribute
            if 'Broken pipe' in str(err):
                print('Chunksize too large. Try using a smaller chunksize.')
                return 0
            else:
                raise err
        except NotFound:
            # Keep exception message consistent with the others
            raise NotFound(err_msg)
        except ValidationError as err:
            if err.error_dict.get('resource_id') == ['Not found: Resource']:
                raise NotFound(err_msg)
            else:
                raise err

        count += length

    # count is a 1-indexed row counter, so subtract 1 to return the
    # number of records actually inserted
    return count - 1
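A hypothetical usage sketch; the resource id and record shape are placeholders, and only the `CKAN` constructor call shown in the doctest above is taken from the source.

# stream 10,000 records in 500-row batches, updating existing rows
ckan = CKAN(quiet=False)
records = [{'field': 'value %i' % i} for i in range(10000)]
inserted = ckan.insert_records(
    'a1b2c3-d4e5', records, chunksize=500, method='upsert')
print('%i records inserted' % inserted)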
def gen_groups(self, records, chunksize=None):
    for chnk in chunk(records, chunksize):
        keyfunc = self.id if self.is_split else self.account

        for group in utils.group_transactions(chnk, keyfunc):
            yield group