コード例 #1
0
def test_flatten_rows_include(data):
    """``include`` selects the output columns; absent fields come out as None."""
    source = [
        Row(key=1, value={'a': 1}),
        Row(key=2, value={'b': 2}),
    ]
    expected = [
        ('a', 'b'),
        (1, None),
        (None, 2),
    ]
    flattened = list(flatten(source, include=['a', 'b']))
    assert flattened == expected
コード例 #2
0
def test_flatten_rows_value(data):
    """Scalar row values are emitted under a plain ``value`` column."""
    source = [Row(key=k, value=v) for k, v in ((1, 'a'), (2, 'b'))]
    expected = [
        ('key', 'value'),
        (1, 'a'),
        (2, 'b'),
    ]
    flattened = list(flatten(source))
    assert flattened == expected
コード例 #3
0
def test_flatten_rows_update_without_include(data):
    """A dict ``update`` adds computed columns next to the existing ones."""
    source = [
        Row(key=1, value={'text': 'abc'}),
        Row(key=1, value={'text': 'abcde'}),
    ]
    # Expression computing the length of each row's ``text`` field.
    text_length = databot.this.value.text.apply(len)
    flattened = list(flatten(source, update={'size': text_length}))
    expected = [
        ('key', 'size', 'text'),
        (1, 3, 'abc'),
        (1, 5, 'abcde'),
    ]
    assert flattened == expected
コード例 #4
0
def test_flatten_rows_callable_update(data):
    """A callable ``update`` replaces the row value with whatever it returns."""

    def text_size(row):
        return {'size': len(row.value['text'])}

    source = [
        Row(key=1, value={'text': 'abc'}),
        Row(key=1, value={'text': 'abcde'}),
    ]
    expected = [
        ('size', ),
        (3, ),
        (5, ),
    ]
    flattened = list(flatten(source, update=text_size))
    assert flattened == expected
コード例 #5
0
def test_flatten_int_key(data):
    """Integer dict keys are rendered into dotted column names."""
    first = Row(key=1, value={'year': {2000: 1, 2001: 2}})
    second = Row(key=2, value={'year': {2000: 3, 2001: 4}})
    flattened = list(flatten([first, second]))
    expected = [
        ('key', 'year.2000', 'year.2001'),
        (1, 1, 2),
        (2, 3, 4),
    ]
    assert flattened == expected
コード例 #6
0
ファイル: html.py プロジェクト: sirex/databot
def select(expr, pos, value, *args, **kwargs):
    """Apply a selector either to a task or to a raw text value.

    When ``value`` is a ``Task`` the call is delegated to its ``select``
    method.  Otherwise ``value`` is UTF-8 encoded, wrapped into a key-less
    ``Row`` under ``value['content']`` and fed to a ``Select`` built from
    ``args``/``kwargs``; the selector's output is returned as a list.

    ``expr`` and ``pos`` are accepted but not used by this function.
    """
    if isinstance(value, Task):
        return value.select(*args, **kwargs)

    content = value.encode('utf-8')
    row = Row({'key': None, 'value': {'content': content}})
    selector = Select(*args, **kwargs)
    results = selector(row)
    return list(results)
コード例 #7
0
ファイル: commands.py プロジェクト: sirex/databot
    def run(self, args):
        """Run the download handler for ``args.url`` and output the result.

        The first ``(key, value)`` pair produced by the handler is written
        through ``self.bot.output.key_value``; ``args.exclude`` (a
        comma-separated string) is split and forwarded as ``exclude``.
        If ``args.append`` is set, the pair is also appended to that pipe.
        """
        from databot import this
        from databot.db.utils import Row
        from databot.handlers import download

        if args.exclude:
            exclude = args.exclude.split(',')
        else:
            exclude = None

        handler = download.download(self.bot.requests, this.key)
        source_row = Row(key=args.url, value=None)
        key, value = next(handler(source_row))
        self.bot.output.key_value(key, value, exclude=exclude)

        if not args.append:
            return
        self.pipe(args.append).append(key, value)
コード例 #8
0
ファイル: utils.py プロジェクト: sirex/databot
def updated_rows(rows, update=None):
    """Yield rows with ``update`` applied to their values.

    ``update`` may be:

    * a callable - each row is replaced by a new key-less ``Row`` whose value
      is ``_force_dict(update(row))``;
    * a mapping of field name to ``Expression`` or callable - each entry is
      evaluated against the row and stored in the row's value (passed through
      ``_force_dict`` first), then the same row object is yielded back.

    A falsy ``update`` behaves like an empty mapping.
    """
    update = update or {}

    if callable(update):
        for row in rows:
            yield Row(key=None, value=_force_dict(update(row)))
        return

    for row in rows:
        fields = _force_dict(row.value)
        for name, compute in update.items():
            fields[name] = (
                compute._eval(row)
                if isinstance(compute, Expression)
                else compute(row)
            )
        row.value = fields
        yield row
コード例 #9
0
def test_flatten_list(data):
    """A list of dicts inside a value expands into one output line per item."""
    first_events = [
        {'name': 'Event 1', 'date': '2017-01-01'},
        {'name': 'Event 2', 'date': '2017-02-01'},
    ]
    second_events = [
        {'name': 'Event 3', 'date': '2017-03-01'},
        {'name': 'Event 4', 'date': '2017-04-01'},
    ]
    source = [
        Row(key=1, value={'events': first_events}),
        Row(key=2, value={'events': second_events}),
    ]
    flattened = list(flatten(source))
    expected = [
        ('events.date', 'events.name', 'key'),
        ('2017-01-01', 'Event 1', 1),
        ('2017-02-01', 'Event 2', 1),
        ('2017-03-01', 'Event 3', 2),
        ('2017-04-01', 'Event 4', 2),
    ]
    assert flattened == expected
コード例 #10
0
def test_select_method(bot):
    """``.select()`` on an expression runs a nested selector on that value."""
    markup = (
        '<div>'
        '  <p>1</p>'
        '  <p>2</p>'
        '  <p>3</p>'
        '</div>'
    )
    row = Row({'key': 1, 'value': {'xml': markup}})

    paragraphs_as_int = this.value.xml.select([select('div p:text').cast(int)])
    selector = html.Select(paragraphs_as_int)
    assert selector(row) == [1, 2, 3]
コード例 #11
0
    def __call__(self, key=None, reverse=False):
        """Iterate over the errors recorded for this task's current state.

        Parameters
        ----------
        key : optional
            When given, only errors attached to the last source row with this
            key are returned; if no such row exists nothing is yielded.
        reverse : bool, optional
            Order by error id descending instead of ascending.

        Yields
        ------
        Error records, each carrying the related source row under ``row``.
        In the same-database branch this is the object returned by
        ``strip_prefix`` with a ``'row'`` item added; in the cross-database
        branch it is a ``Row`` built from the error record.

        Nothing is yielded when the task has no source.
        """
        if self.task.source:
            state = self.task.get_state()
            error = self.task.target.models.errors.alias('error')
            table = self.task.source.table.alias('table')

            # Filter by key if provided
            if key is not None:
                row = self.task.source.last(key)
                if row is None:
                    # No source row with this key, so there are no errors to report.
                    return
                where = sa.and_(
                    error.c.state_id == state.id,
                    error.c.row_id == row.id,
                )
            else:
                where = error.c.state_id == state.id

            # Ordering
            if reverse:
                order_by = error.c.id.desc()
            else:
                order_by = error.c.id

            # Query if all tables stored in same database
            if self.task.target.samedb and self.task.source.samedb:
                # Single joined query; use_labels prefixes the columns with the
                # alias names ('error_', 'table_') so they can be split below.
                query = (sa.select(
                    [error, table], use_labels=True).select_from(
                        error.join(table, error.c.row_id == table.c.id)).where(
                            where).order_by(order_by))

                for row in windowed_query(self.task.target.engine, query,
                                          table.c.id):
                    item = strip_prefix(row, 'error_')
                    item['row'] = create_row(strip_prefix(row, 'table_'))
                    yield item

            # Query if some tables are stored in external database
            else:
                query = error.select(where).order_by(order_by)
                for err in windowed_query(self.task.target.engine, query,
                                          error.c.id):
                    # Look up the source row separately, on the source engine.
                    # Note: ``query`` is re-bound here; the outer query object
                    # has already been handed to windowed_query.
                    query = table.select(table.c.id == err['row_id'])
                    row = self.task.source.engine.execute(query).first()
                    # Errors whose source row can no longer be found are skipped.
                    if row:
                        yield Row(err, row=create_row(row))
コード例 #12
0
ファイル: commands.py プロジェクト: sirex/databot
            def scrape():
                """Yield a ``Row`` for each keyed item selected from the pipe.

                With ``progressbar`` enabled the rows are wrapped in a tqdm
                progress bar, which is closed before any exception propagates.
                """
                if not progressbar:
                    rows = pipe.rows()
                else:
                    to_pandas = (
                        isinstance(export, types.ModuleType)
                        and export.__name__ == 'pandas'
                    )
                    desc = (
                        '%s -> pandas' % source
                        if to_pandas
                        else '%s -> %s' % (source, export)
                    )
                    total = pipe.count()
                    rows = tqdm.tqdm(pipe.rows(), desc, total, leave=True)

                try:
                    for row in rows:
                        for key, value in keyvalueitems(selector(row)):
                            # Items without a key are dropped.
                            if key is None:
                                continue
                            yield Row(key=key, value=value)
                except BaseException:
                    # Covers generator close as well; always re-raised below.
                    if progressbar:
                        rows.close()
                    raise
コード例 #13
0
ファイル: test_row.py プロジェクト: sirex/databot
def test_value_multiple_items():
    """Chained item lookups on ``this.value`` reach nested dictionary values."""
    nested = {'x': {'y': {'z': 42}}}
    row = Row(key=1, value=nested)
    deep_item = this.value['x']['y']['z']
    assert deep_item._eval(row) == 42
コード例 #14
0
ファイル: commands.py プロジェクト: sirex/databot
    def call(self,
             source,
             target=None,
             query=None,
             key=None,
             table=False,
             export=None,
             errors=False,
             raw=False,
             progressbar=True,
             check=True):
        """Select structured data from an unstructured source.

        Parameters
        ----------
        source : databot.pipes.Pipe
            Source pipe. Should be a pipe with downloaded HTML pages.
        target : databot.pipes.Pipe
            Target pipe.
        query : list | dict | tuple
            Query for selecting data. A 2-tuple is unpacked into the first two
            positional arguments of ``html.Select``.
        key : str, optional
            Use specific key from source pipe.
        table : bool, optional
            Output results as table.
        export : str or module, optional
            Export all data to the given destination via ``export_service``.
            The ``pandas`` module is recognised and gets its own progress bar
            description.
        errors : bool, optional
            Read data from target's errors. Requires ``target``.
        raw : bool, optional
            Return raw python objects instead of printing results to stdout.
        progressbar : bool, optional
            Show progress bar if export is given.
        check : bool or string, optional
            See ``databot.handlers.html.Select.__init__``.

        Returns
        -------
        The result of ``export_service`` when ``export`` is given; a list of
        ``Row`` objects (possibly empty) when ``raw`` is true; otherwise
        ``None``, with the results written through ``self.bot.output`` or
        ``self.info``.

        """
        import tqdm

        from databot.pipes import keyvalueitems
        from databot.handlers import html
        from databot.db.utils import Row

        assert query is not None

        if isinstance(query, tuple) and len(query) == 2:
            selector = html.Select(query[0], query[1], check=check)
        else:
            selector = html.Select(query, check=check)

        # Pick the pipe to read rows from.
        if errors:
            assert target
            pipe = target(source).errors
        elif target:
            pipe = target(source)
        else:
            pipe = source

        if export:

            def scrape():
                # Lazily yields a Row for every keyed item selected from the pipe.
                if progressbar:
                    # NOTE(review): ``types`` is not imported in this method;
                    # presumably it is imported at module level - confirm.
                    if isinstance(
                            export,
                            types.ModuleType) and export.__name__ == 'pandas':
                        desc = '%s -> pandas' % source
                    else:
                        desc = '%s -> %s' % (source, export)
                    total = pipe.count()
                    rows = tqdm.tqdm(pipe.rows(), desc, total, leave=True)
                else:
                    rows = pipe.rows()

                try:
                    for row in rows:
                        for key, value in keyvalueitems(selector(row)):
                            if key is not None:
                                yield Row(key=key, value=value)
                except:
                    # Close the progress bar before propagating; the exception
                    # is always re-raised.
                    if progressbar:
                        rows.close()
                    raise

            return export_service(scrape(), export)

        else:
            # Without export only a single row (the last one for ``key``) is
            # processed.
            row = pipe.last(key)
            # Error records carry the original source row under 'row'.
            row = row['row'] if row and errors else row

            if raw:
                if row:
                    rows = keyvalueitems(selector(row))
                    return [
                        Row(key=key, value=value) for key, value in rows
                        if key is not None
                    ]
                else:
                    return []
            else:
                if row:
                    rows = keyvalueitems(selector(row))
                    if table:
                        self.bot.output.table([
                            Row(key=key, value=value) for key, value in rows
                            if key is not None
                        ])
                    else:
                        for key, value in rows:
                            if key is not None:
                                self.bot.output.key_value(key, value)
                else:
                    if key:
                        self.info('Item with key=%r not found.' % key)
                    else:
                        self.info('No items found.')
コード例 #15
0
def test_flatten():
    """Nested lists expand into multiple lines; ``include`` picks the columns."""
    first_events = [
        {'name': 'Event 1', 'date': '2017-01-01', 'people': ['a', 'b']},
        {'name': 'Event 2', 'date': '2017-01-02', 'people': ['a']},
    ]
    second_events = [
        {'name': 'Event 3', 'date': '2017-01-03', 'people': ['x', 'y']},
        {'name': 'Event 4', 'date': '2017-01-04', 'people': ['z']},
    ]
    rows = [
        Row(key=1, value={'foo': 'bar', 'events': first_events}),
        Row(key=2, value={'foo': 'baz', 'events': second_events}),
    ]

    # No include: every column, one line per innermost list item.
    everything = list(flatten(rows))
    assert everything == [
        ('events.date', 'events.name', 'events.people', 'foo', 'key'),
        ('2017-01-01', 'Event 1', 'a', 'bar', 1),
        ('2017-01-01', 'Event 1', 'b', 'bar', 1),
        ('2017-01-02', 'Event 2', 'a', 'bar', 1),
        ('2017-01-03', 'Event 3', 'x', 'baz', 2),
        ('2017-01-03', 'Event 3', 'y', 'baz', 2),
        ('2017-01-04', 'Event 4', 'z', 'baz', 2),
    ]

    # Including a nested list column keeps the per-item expansion.
    with_people = list(flatten(rows, include=('key', 'foo', 'events.people')))
    assert with_people == [
        ('key', 'foo', 'events.people'),
        (1, 'bar', 'a'),
        (1, 'bar', 'b'),
        (1, 'bar', 'a'),
        (2, 'baz', 'x'),
        (2, 'baz', 'y'),
        (2, 'baz', 'z'),
    ]

    # Only top-level columns: one line per source row.
    top_level = list(flatten(rows, include=('key', 'foo')))
    assert top_level == [
        ('key', 'foo'),
        (1, 'bar'),
        (2, 'baz'),
    ]
コード例 #16
0
ファイル: test_row.py プロジェクト: sirex/databot
def test_key():
    """``this.key`` evaluates to the row's key."""
    row = Row(key=1, value=2)
    key_expr = this.key
    assert key_expr._eval(row) == 1
コード例 #17
0
ファイル: test_row.py プロジェクト: sirex/databot
def test_url():
    """URL helpers expose the parsed parts of a URL held in the row value."""
    row = Row(key=1, value='http://example.com/path?key=42')

    query_key = this.value.urlparse().query.key.cast(int)
    assert query_key._eval(row) == 42

    path = this.value.urlparse().path
    assert path._eval(row) == '/path'

    hostname = this.value.urlparse().hostname
    assert hostname._eval(row) == 'example.com'

    whole_url = this.value.url()
    assert whole_url._eval(row) == 'http://example.com/path?key=42'
コード例 #18
0
ファイル: test_row.py プロジェクト: sirex/databot
def test_value_attr():
    """Attribute access on ``this.value`` looks up dictionary keys."""
    flat_row = Row(key=1, value={'x': 3, 'y': 4})
    shallow = this.value.x
    assert shallow._eval(flat_row) == 3

    nested_row = Row(key=1, value={'x': {'y': {'z': 42}}})
    deep = this.value.x.y.z
    assert deep._eval(nested_row) == 42
コード例 #19
0
ファイル: test_row.py プロジェクト: sirex/databot
def test_row():
    """Evaluating bare ``this`` gives the whole row as key and value."""
    row = Row(key=1, value=2)
    evaluated = this._eval(row)
    assert evaluated == {'key': 1, 'value': 2}
コード例 #20
0
ファイル: test_row.py プロジェクト: sirex/databot
def test_value_function_arguments():
    """Extra arguments given to ``apply`` are forwarded to the function."""

    def get_with_default(value, key, default):
        return value.get(key, default)

    row = Row(key=1, value={'x': 'abc'})
    lookup = this.value.apply(get_with_default, 'y', 'zz')
    assert lookup._eval(row) == 'zz'
コード例 #21
0
ファイル: test_recursive.py プロジェクト: sirex/databot
def test_nested_dict():
    """Expressions nested inside dicts are evaluated in place."""
    template = {'value': {'number': this.value}}
    expected = {'value': {'number': 42}}
    row = Row(value=42)
    assert recursive.call(template, row) == expected
コード例 #22
0
ファイル: test_row.py プロジェクト: sirex/databot
def test_value_item():
    """Item access on ``this.value`` looks up a dictionary key."""
    row = Row(key=1, value={'x': 3, 'y': 4})
    x_item = this.value['x']
    assert x_item._eval(row) == 3
コード例 #23
0
ファイル: test_row.py プロジェクト: sirex/databot
def test_value():
    """``this.value`` evaluates to the row's value."""
    row = Row(key=1, value=2)
    value_expr = this.value
    assert value_expr._eval(row) == 2
コード例 #24
0
def check_download(url, response, check):
    """Run the ``check`` selector against a downloaded response.

    The response is wrapped into a ``Row`` keyed by ``url``, bound to a
    ``Select`` built from ``check``, and checked with ``check_render`` in
    ``many=True`` mode.  Nothing is returned.
    """
    downloaded = Row({'key': url, 'value': response})
    selector = Select(check)
    selector.set_row(downloaded)
    selector.check_render(downloaded, selector.html, check, many=True)
コード例 #25
0
ファイル: test_row.py プロジェクト: sirex/databot
def test_value_function():
    """Functions and methods can be applied to values inside an expression."""
    row = Row(key=1, value={'x': 'abc'})

    length = this.value.x.apply(len)
    assert length._eval(row) == 3

    uppercased = this.value.x.upper()
    assert uppercased._eval(row) == 'ABC'

    keys = this.value.apply(list)
    assert keys._eval(row) == ['x']

    first_key_upper = this.value.apply(list)[0].upper()
    assert first_key_upper._eval(row) == 'X'
コード例 #26
0
ファイル: test_recursive.py プロジェクト: sirex/databot
def test_list():
    """Expressions nested inside lists are evaluated element by element."""
    template = {'value': [this.value, this.value]}
    expected = {'value': [42, 42]}
    row = Row(value=42)
    assert recursive.call(template, row) == expected