def test_flatten_rows_include(data):
    """With ``include``, only the listed columns appear; missing ones are None."""
    first = Row(key=1, value={'a': 1})
    second = Row(key=2, value={'b': 2})

    result = list(flatten([first, second], include=['a', 'b']))

    expected = [
        ('a', 'b'),
        (1, None),
        (None, 2),
    ]
    assert result == expected
def test_flatten_rows_value(data):
    """Rows with scalar values flatten into plain ``key``/``value`` columns."""
    pairs = [(1, 'a'), (2, 'b')]
    rows = [Row(key=key, value=value) for key, value in pairs]

    result = list(flatten(rows))

    expected = [
        ('key', 'value'),
        (1, 'a'),
        (2, 'b'),
    ]
    assert result == expected
def test_flatten_rows_update_without_include(data):
    """An ``update`` mapping adds its computed column next to the existing ones."""
    rows = [
        Row(key=1, value={'text': text})
        for text in ('abc', 'abcde')
    ]
    size_of_text = databot.this.value.text.apply(len)

    result = list(flatten(rows, update={'size': size_of_text}))

    expected = [
        ('key', 'size', 'text'),
        (1, 3, 'abc'),
        (1, 5, 'abcde'),
    ]
    assert result == expected
def test_flatten_rows_callable_update(data):
    """A callable ``update`` replaces the row, so only its keys are emitted."""
    rows = [
        Row(key=1, value={'text': text})
        for text in ('abc', 'abcde')
    ]

    def text_size(row):
        return {'size': len(row.value['text'])}

    result = list(flatten(rows, update=text_size))

    expected = [
        ('size', ),
        (3, ),
        (5, ),
    ]
    assert result == expected
def test_flatten_int_key(data):
    """Integer dict keys are rendered into dotted column names."""
    first = Row(key=1, value={'year': {2000: 1, 2001: 2}})
    second = Row(key=2, value={'year': {2000: 3, 2001: 4}})

    result = list(flatten([first, second]))

    expected = [
        ('key', 'year.2000', 'year.2001'),
        (1, 1, 2),
        (2, 3, 4),
    ]
    assert result == expected
def select(expr, pos, value, *args, **kwargs):
    """Run a selection against ``value``.

    A ``Task`` value is asked to perform the selection itself. Any other value
    is UTF-8 encoded, wrapped in a key-less ``Row`` under ``value.content``,
    and passed through a ``Select`` built from the remaining arguments; the
    selector's output is returned as a list.

    ``expr`` and ``pos`` are accepted but not used by this function.
    """
    # Tasks know how to select from themselves.
    if isinstance(value, Task):
        return value.select(*args, **kwargs)

    content = value.encode('utf-8')
    row = Row({
        'key': None,
        'value': {'content': content},
    })
    selector = Select(*args, **kwargs)
    return list(selector(row))
def run(self, args):
    """Download ``args.url`` once, print the result and optionally store it.

    ``args.exclude`` is an optional comma-separated string forwarded to the
    output as a list. When ``args.append`` is set, the downloaded key/value
    pair is appended to the pipe of that name.
    """
    from databot import this
    from databot.db.utils import Row
    from databot.handlers import download

    if args.exclude:
        exclude = args.exclude.split(',')
    else:
        exclude = None

    # Build the download handler and take the first pair it produces.
    handler = download.download(self.bot.requests, this.key)
    request_row = Row(key=args.url, value=None)
    key, value = next(handler(request_row))

    self.bot.output.key_value(key, value, exclude=exclude)

    if args.append:
        self.pipe(args.append).append(key, value)
def updated_rows(rows, update=None):
    """Yield ``rows`` with ``update`` applied to each of them.

    ``update`` is either a callable or a mapping of field name to a callable /
    ``Expression``:

    * callable: each row is replaced by a new key-less ``Row`` whose value is
      the (dict-coerced) result of ``update(row)``;
    * mapping: every entry is evaluated against the row and stored in the
      row's dict-coerced value; the row is reassigned that value and yielded.

    A falsy ``update`` is treated as an empty mapping.
    """
    update = update or {}
    for row in rows:
        if callable(update):
            yield Row(key=None, value=_force_dict(update(row)))
            continue

        merged = _force_dict(row.value)
        for name, call in update.items():
            if isinstance(call, Expression):
                computed = call._eval(row)
            else:
                computed = call(row)
            merged[name] = computed
        row.value = merged
        yield row
def test_flatten_list(data):
    """A list of dicts under one key yields one output row per list item."""
    first_events = [
        {'name': 'Event 1', 'date': '2017-01-01'},
        {'name': 'Event 2', 'date': '2017-02-01'},
    ]
    second_events = [
        {'name': 'Event 3', 'date': '2017-03-01'},
        {'name': 'Event 4', 'date': '2017-04-01'},
    ]
    rows = [
        Row(key=1, value={'events': first_events}),
        Row(key=2, value={'events': second_events}),
    ]

    result = list(flatten(rows))

    expected = [
        ('events.date', 'events.name', 'key'),
        ('2017-01-01', 'Event 1', 1),
        ('2017-02-01', 'Event 2', 1),
        ('2017-03-01', 'Event 3', 2),
        ('2017-04-01', 'Event 4', 2),
    ]
    assert result == expected
def test_select_method(bot):
    """``.select()`` on an expression extracts and casts every matching node."""
    xml = (
        '<div>'
        ' <p>1</p>'
        ' <p>2</p>'
        ' <p>3</p>'
        '</div>'
    )
    row = Row({
        'key': 1,
        'value': {'xml': xml},
    })
    paragraphs = [select('div p:text').cast(int)]
    selector = html.Select(this.value.xml.select(paragraphs))

    result = selector(row)

    assert result == [1, 2, 3]
def __call__(self, key=None, reverse=False):
    """Iterate over the errors recorded for the task's current state.

    Parameters
    ----------
    key : optional
        When given, only errors attached to the source row returned by
        ``source.last(key)`` are produced; nothing is produced if that row
        does not exist.
    reverse : bool, optional
        Order by error id descending instead of ascending.

    Yields
    ------
    One item per error, carrying the source row it refers to under ``row``.
    Nothing is yielded when the task has no source.
    """
    if not self.task.source:
        return

    state = self.task.get_state()
    error = self.task.target.models.errors.alias('error')
    table = self.task.source.table.alias('table')

    # Restrict to a single source row when a key was supplied.
    if key is None:
        where = error.c.state_id == state.id
    else:
        source_row = self.task.source.last(key)
        if source_row is None:
            return
        where = sa.and_(
            error.c.state_id == state.id,
            error.c.row_id == source_row.id,
        )

    order_by = error.c.id.desc() if reverse else error.c.id

    if self.task.target.samedb and self.task.source.samedb:
        # Both tables live in one database: fetch error and row with a join.
        joined = error.join(table, error.c.row_id == table.c.id)
        query = (
            sa.select([error, table], use_labels=True)
            .select_from(joined)
            .where(where)
            .order_by(order_by)
        )
        for record in windowed_query(self.task.target.engine, query, table.c.id):
            item = strip_prefix(record, 'error_')
            item['row'] = create_row(strip_prefix(record, 'table_'))
            yield item
    else:
        # Tables are in different databases: look each source row up separately.
        query = error.select(where).order_by(order_by)
        for err in windowed_query(self.task.target.engine, query, error.c.id):
            lookup = table.select(table.c.id == err['row_id'])
            found = self.task.source.engine.execute(lookup).first()
            if found:
                yield Row(err, row=create_row(found))
def scrape():
    """Yield a ``Row`` for every keyed item selected from the pipe's rows.

    Reads ``progressbar``, ``export``, ``source``, ``pipe`` and ``selector``
    from the enclosing scope. With ``progressbar`` enabled the rows are
    wrapped in a tqdm bar labelled ``"<source> -> <export>"`` (or
    ``"-> pandas"`` when ``export`` is the pandas module).
    """
    if not progressbar:
        rows = pipe.rows()
    else:
        exporting_to_pandas = (
            isinstance(export, types.ModuleType) and
            export.__name__ == 'pandas'
        )
        if exporting_to_pandas:
            desc = '%s -> pandas' % source
        else:
            desc = '%s -> %s' % (source, export)
        total = pipe.count()
        rows = tqdm.tqdm(pipe.rows(), desc, total, leave=True)

    try:
        for row in rows:
            for key, value in keyvalueitems(selector(row)):
                if key is None:
                    continue
                yield Row(key=key, value=value)
    except BaseException:
        # Anything that interrupts iteration (including the generator being
        # closed) must close the progress bar first; it is always re-raised.
        if progressbar:
            rows.close()
        raise
def test_value_multiple_items():
    """Chained item lookups on ``this.value`` reach into nested dicts."""
    nested = Row(key=1, value={'x': {'y': {'z': 42}}})
    expr = this.value['x']['y']['z']
    assert expr._eval(nested) == 42
def call(self, source, target=None, query=None, key=None, table=False,
         export=None, errors=False, raw=False, progressbar=True, check=True):
    """Select structured data from an unstructured source.

    Parameters
    ----------
    source : databot.pipes.Pipe
        Source pipe. Should be a pipe with downloaded HTML pages.
    target : databot.pipes.Pipe
        Target pipe.
    query : list | dict | tuple
        Query for selecting data. A 2-tuple is unpacked into the first two
        positional arguments of ``html.Select``.
    key : str, optional
        Use specific key from source pipe.
    table : bool, optional
        Output results as table.
    export : str, optional
        Export all data to specified file.
    errors : bool, optional
        Read data from target's errors.
    raw : bool, optional
        Return raw python objects instead of printing results to stdout.
    progressbar : bool, optional
        Show progress bar if export is given.
    check : bool or string, optional
        See ``databot.handlers.html.Select.__init__``.

    Returns
    -------
    The result of ``export_service`` when ``export`` is given, a list of
    ``Row`` objects when ``raw`` is true, otherwise ``None`` (results are
    written to the bot's output).
    """
    import tqdm

    from databot.pipes import keyvalueitems
    from databot.handlers import html
    from databot.db.utils import Row

    assert query is not None

    if isinstance(query, tuple) and len(query) == 2:
        selector = html.Select(query[0], query[1], check=check)
    else:
        selector = html.Select(query, check=check)

    if errors:
        assert target
        pipe = target(source).errors
    elif target:
        pipe = target(source)
    else:
        pipe = source

    def keyed_rows(pairs):
        # Items the selector produced without a key are dropped.
        return [
            Row(key=k, value=v)
            for k, v in pairs if k is not None
        ]

    if export:
        def scrape():
            if progressbar:
                if isinstance(export, types.ModuleType) and export.__name__ == 'pandas':
                    desc = '%s -> pandas' % source
                else:
                    desc = '%s -> %s' % (source, export)
                total = pipe.count()
                rows = tqdm.tqdm(pipe.rows(), desc, total, leave=True)
            else:
                rows = pipe.rows()

            try:
                for row in rows:
                    for k, v in keyvalueitems(selector(row)):
                        if k is not None:
                            yield Row(key=k, value=v)
            except BaseException:
                # Close the progress bar on any interruption, then re-raise.
                if progressbar:
                    rows.close()
                raise

        return export_service(scrape(), export)

    row = pipe.last(key)
    # Error entries carry the source row under 'row'.
    row = row['row'] if row and errors else row

    if raw:
        if row:
            return keyed_rows(keyvalueitems(selector(row)))
        return []

    if row:
        rows = keyvalueitems(selector(row))
        if table:
            self.bot.output.table(keyed_rows(rows))
        else:
            for k, v in rows:
                if k is not None:
                    self.bot.output.key_value(k, v)
    elif key is not None:
        # ``is not None`` so that falsy keys such as 0 or '' are still
        # reported by name instead of falling through to the generic message.
        self.info('Item with key=%r not found.' % key)
    else:
        self.info('No items found.')
def test_flatten():
    """Nested lists expand to one row per leaf; ``include`` picks the columns."""
    first_events = [
        {'name': 'Event 1', 'date': '2017-01-01', 'people': ['a', 'b']},
        {'name': 'Event 2', 'date': '2017-01-02', 'people': ['a']},
    ]
    second_events = [
        {'name': 'Event 3', 'date': '2017-01-03', 'people': ['x', 'y']},
        {'name': 'Event 4', 'date': '2017-01-04', 'people': ['z']},
    ]
    rows = [
        Row(key=1, value={'foo': 'bar', 'events': first_events}),
        Row(key=2, value={'foo': 'baz', 'events': second_events}),
    ]

    # No include: every column, one row per person.
    everything = list(flatten(rows))
    assert everything == [
        ('events.date', 'events.name', 'events.people', 'foo', 'key'),
        ('2017-01-01', 'Event 1', 'a', 'bar', 1),
        ('2017-01-01', 'Event 1', 'b', 'bar', 1),
        ('2017-01-02', 'Event 2', 'a', 'bar', 1),
        ('2017-01-03', 'Event 3', 'x', 'baz', 2),
        ('2017-01-03', 'Event 3', 'y', 'baz', 2),
        ('2017-01-04', 'Event 4', 'z', 'baz', 2),
    ]

    # Including a nested column keeps the per-person expansion.
    with_people = list(flatten(rows, include=('key', 'foo', 'events.people')))
    assert with_people == [
        ('key', 'foo', 'events.people'),
        (1, 'bar', 'a'),
        (1, 'bar', 'b'),
        (1, 'bar', 'a'),
        (2, 'baz', 'x'),
        (2, 'baz', 'y'),
        (2, 'baz', 'z'),
    ]

    # Only top-level columns: one output row per input row.
    top_level = list(flatten(rows, include=('key', 'foo')))
    assert top_level == [
        ('key', 'foo'),
        (1, 'bar'),
        (2, 'baz'),
    ]
def test_key():
    """``this.key`` evaluates to the row's key."""
    sample = Row(key=1, value=2)
    result = this.key._eval(sample)
    assert result == 1
def test_url():
    """URL helpers expose parsed parts of a URL value and the URL itself."""
    row = Row(key=1, value='http://example.com/path?key=42')

    query_key = this.value.urlparse().query.key.cast(int)
    assert query_key._eval(row) == 42

    path = this.value.urlparse().path
    assert path._eval(row) == '/path'

    hostname = this.value.urlparse().hostname
    assert hostname._eval(row) == 'example.com'

    full_url = this.value.url()
    assert full_url._eval(row) == 'http://example.com/path?key=42'
def test_value_attr():
    """Attribute access on ``this.value`` reads dict keys, including nested ones."""
    flat = Row(key=1, value={'x': 3, 'y': 4})
    flat_result = this.value.x._eval(flat)
    assert flat_result == 3

    nested = Row(key=1, value={'x': {'y': {'z': 42}}})
    nested_result = this.value.x.y.z._eval(nested)
    assert nested_result == 42
def test_row():
    """Evaluating bare ``this`` gives back the whole row."""
    sample = Row(key=1, value=2)
    expected = {'key': 1, 'value': 2}
    assert this._eval(sample) == expected
def test_value_function_arguments():
    """Extra arguments given to ``apply`` are passed on to the function."""
    def get_or_default(value, key, default):
        return value.get(key, default)

    sample = Row(key=1, value={'x': 'abc'})
    expr = this.value.apply(get_or_default, 'y', 'zz')
    assert expr._eval(sample) == 'zz'
def test_nested_dict():
    """Expressions nested inside dicts are evaluated in place."""
    source = {'value': {'number': this.value}}
    expected = {'value': {'number': 42}}
    result = recursive.call(source, Row(value=42))
    assert result == expected
def test_value_item():
    """Item lookup on ``this.value`` reads the matching dict key."""
    sample = Row(key=1, value={'x': 3, 'y': 4})
    result = this.value['x']._eval(sample)
    assert result == 3
def test_value():
    """``this.value`` evaluates to the row's value."""
    sample = Row(key=1, value=2)
    result = this.value._eval(sample)
    assert result == 2
def check_download(url, response, check):
    """Run the ``check`` query against a downloaded ``response``.

    The response is wrapped in a ``Row`` keyed by ``url``, bound to a
    ``Select`` built from ``check``, and rendered through
    ``Select.check_render`` with ``many=True``. Nothing is returned.
    """
    downloaded = Row({'key': url, 'value': response})
    selector = Select(check)
    selector.set_row(downloaded)
    selector.check_render(downloaded, selector.html, check, many=True)
def test_value_function():
    """``apply`` and method calls can be chained on value expressions."""
    sample = Row(key=1, value={'x': 'abc'})

    length = this.value.x.apply(len)
    assert length._eval(sample) == 3

    uppercased = this.value.x.upper()
    assert uppercased._eval(sample) == 'ABC'

    keys = this.value.apply(list)
    assert keys._eval(sample) == ['x']

    first_key_upper = this.value.apply(list)[0].upper()
    assert first_key_upper._eval(sample) == 'X'
def test_list():
    """Expressions inside lists are evaluated element by element."""
    source = {'value': [this.value, this.value]}
    expected = {'value': [42, 42]}
    result = recursive.call(source, Row(value=42))
    assert result == expected