# Test: running the ``status`` command on a bot whose pipe 'p1' is defined
# against an external engine (bot1.engine) that has tables but no applied
# migrations.
#
# Steps visible in the code below:
#   * ``sys.exit`` is patched out and ``Migrations.migrations`` is patched to
#     contain only ``migrations.ValueToMsgpack`` (mapped to an empty set).
#   * Tables are created with ``models.metadata.create_all`` on an in-memory
#     SQLite engine, without running any migration.
#   * ``bot1`` defines 'p1' on that engine; ``bot2`` (from the ``db`` fixture)
#     defines 'p1' against ``bot1.engine`` plus a 'p2' pipe of its own.
#   * The captured output of ``bot2.main(argv=['status'])`` is compared with
#     a dedented multi-line message followed by the status table.
#
# NOTE(review): this definition is collapsed onto one line in this view, so
# the line breaks and indentation of the expected ``dedent('''...''')`` text
# are not visible here -- restore them from the original file before relying
# on the literal; do not re-wrap it by hand.
def test_external_db_error_when_migrations_not_applied(mocker, db): mocker.patch('sys.exit') mocker.patch('databot.db.migrations.Migrations.migrations', {migrations.ValueToMsgpack: set()}) # Create tables, but do not apply any migrations engine = sa.create_engine('sqlite:///:memory:') models = Models(sa.MetaData()) models.metadata.create_all(engine, checkfirst=True) bot1 = Bot(engine, output=io.StringIO(), models=models) bot1.define('p1') bot2 = db.Bot() bot2.define('p1', bot1.engine) bot2.define('p2') bot2.main(argv=['status']) assert bot2.output.output.getvalue() == dedent('''\ External database 'sqlite:///:memory:' from 'p1' pipe has unapplied migrations. You need to run database migrations: databot sqlite:///:memory: migrate List of unapplied migrations: - ValueToMsgpack id rows source errors left target ================================= 1 0 p1 --------------------------------- 2 0 p2 --------------------------------- ''')
def html(self, code):
    """Evaluate *code* against a bot preloaded with the sample HTML page.

    A fresh bot gets two pipes, 'html' and 'p1'.  The bytes of
    ``fixtures/sample.html`` are appended to the 'html' pipe as a fake
    response for ``https://example.com/``, then *code* is evaluated with
    ``bot``, ``html``, ``p1`` (bound to the html pipe), ``this``, ``int``
    and ``select`` as its local names.

    Returns the items of 'p1' pretty-printed at width 42.

    NOTE(review): *code* is passed to ``eval``; presumably only trusted
    expressions written in the tests reach this helper -- confirm.
    """
    bot = Bot()
    source = bot.define('html')
    target = bot.define('p1')

    with (here / 'fixtures/sample.html').open('rb') as f:
        content = f.read()

    response = {
        'headers': {},
        'cookies': {},
        'status_code': 200,
        'encoding': 'utf-8',
        'content': content,
    }
    source.append([('https://example.com/', response)])

    namespace = {
        'bot': bot,
        'html': source,
        'p1': target(source),
        'this': this,
        'int': int,
        'select': select,
    }
    eval(code, {}, namespace)

    return pformat(list(target.items()), width=42)
def main():
    """Crawl Baidu search results for the keywords and insert them into a DB.

    Builds the search URLs, wires a pipe that loads each page, collects
    the items from the first page and from every further result page, and
    feeds them to an ``Insert`` node configured from ``dbconf``.  A second
    pipe calls ``show_info`` on a 2-second timer until the first pipe
    reports it has finished.  The flow is rendered to
    ``ex_output/baiduspider`` and then run.
    """
    keywords = ['贸易战']
    search_template = 'https://www.baidu.com/s?wd=%s'
    urls = [search_template % keyword for keyword in keywords]

    # Build the data-flow network.
    insert = Insert(
        "insert into test.baidu (id,name ,url,page_rank,page_no)values('{id}','{name}' ,'{url}',{page_rank},{page_no})",
        **dbconf)

    # Nodes are created in the same order as they appear in the pipe.
    page_loader = HttpLoader()
    first_page_items = Branch(get_all_items, join=True)
    other_pages_items = Branch(
        get_all_page_url,
        HttpLoader(),
        get_all_items,
        share=False,
        join=True,
        route_type=HttpResponse,
    )
    crawl = Pipe(
        urls,
        page_loader,
        first_page_items,
        other_pages_items,
        insert,
    )

    Pipe(Timer(delay=2, until=crawl.finished), show_info)

    Bot.render('ex_output/baiduspider')
    Bot.run()
def test_run_limits_and_fail():
    """``run -l 1,1,0`` raises ExpressionError when the handler fails on 'b'.

    After the failure 'p1' still holds all three appended keys, 'p2' holds
    only 'A', and both tasks have been evaluated twice.
    """

    def handler(row):
        # Fail on key 'b'; every other key is passed on upper-cased.
        if row.key == 'b':
            raise ValueError('b')
        yield row.key.upper()

    fill_task = task('p1').once().append(['a', 'b', 'c'])
    call_task = task('p1', 'p2').call(handler)
    pipeline = {'tasks': [fill_task, call_task]}

    bot = Bot()
    source = bot.define('p1')
    target = bot.define('p2')

    with pytest.raises(ExpressionError):
        bot.main(pipeline, ['run', '-l', '1,1,0'])

    assert list(source.keys()) == ['a', 'b', 'c']
    assert list(target.keys()) == ['A']
    assert pipeline['tasks'][0]._evals == 2
    assert pipeline['tasks'][1]._evals == 2
def test_run_limits_and_fail_smaller():
    """``run -l 1,1,0 -f 2`` completes even though the handler fails on 'b'.

    'p2' ends up with 'A' and 'C', the failing key 'b' is recorded in the
    errors of the p1 -> p2 link, and both tasks have been evaluated three
    times.
    """

    def handler(row):
        # Fail on key 'b'; every other key is passed on upper-cased.
        if row.key == 'b':
            raise ValueError('b')
        yield row.key.upper()

    fill_task = task('p1').once().append(['a', 'b', 'c'])
    call_task = task('p1', 'p2').call(handler)
    pipeline = {'tasks': [fill_task, call_task]}

    bot = Bot()
    source = bot.define('p1')
    target = bot.define('p2')

    bot.main(pipeline, ['run', '-l', '1,1,0', '-f', '2'])

    assert list(source.keys()) == ['a', 'b', 'c']
    assert list(target.keys()) == ['A', 'C']
    assert list(target(source).errors.keys()) == ['b']
    assert pipeline['tasks'][0]._evals == 3
    assert pipeline['tasks'][1]._evals == 3
def append(self, code):
    """Evaluate *code* with a fresh ``bot`` and its empty pipe ``p1`` in scope.

    Returns the ``repr`` of the list of items found in 'p1' afterwards.

    NOTE(review): *code* is passed to ``eval``; presumably only trusted
    expressions written in the tests reach this helper -- confirm.
    """
    bot = Bot()
    pipe = bot.define('p1')
    namespace = {
        'bot': bot,
        'p1': pipe,
    }
    eval(code, {}, namespace)
    items = list(pipe.items())
    return repr(items)
def main():
    """Print the USD bitcoin rate reported by the CoinDesk price endpoint.

    A timer node (``delay=2``, ``max_time=5``) drives the pipe: the price
    URL is loaded, ``bpi.USD.rate_float`` is taken from the JSON response
    and handed to ``print``.  The flow is rendered to
    ``ex_output/simple_bitcoin_price`` and then run.
    """
    # Nodes are created in the same order as they appear in the pipe.
    ticker = Timer(delay=2, max_time=5)
    price_url = "http://api.coindesk.com/v1/bpi/currentprice.json"
    loader = HttpLoader()

    Pipe(
        ticker,
        price_url,
        loader,
        lambda r: r.json['bpi']['USD']['rate_float'],
        print,
    )

    Bot.render('ex_output/simple_bitcoin_price')
    Bot.run()
def test_run_once():
    """With limits ``(1, 1, 0)`` the ``once()`` tasks contribute a single key.

    The two ``once()`` tasks append 1 and 2 one time each, while the plain
    task's key 3 shows up three times in 'p1'.
    """
    append_one = task('p1').once().append(1)
    append_two = task('p1').once().append(2)
    append_three = task('p1').append(3)
    tasks = [append_one, append_two, append_three]

    bot = Bot()
    pipe = bot.define('p1')

    bot.commands.run(tasks, limits=(1, 1, 0))

    expected_keys = [1, 2, 3, 3, 3]
    assert list(pipe.keys()) == expected_keys
def duplicates(self, code):
    """Evaluate *code* against a pipe that holds duplicated keys.

    'p1' is filled with an 'old' and a 'new' value for each of the keys 1
    and 2; *code* is then evaluated with ``bot`` and ``p1`` in scope.
    Returns the ``repr`` of the list of items found in 'p1' afterwards.

    NOTE(review): *code* is passed to ``eval``; presumably only trusted
    expressions written in the tests reach this helper -- confirm.
    """
    bot = Bot()
    rows = [
        (1, 'old'),
        (1, 'new'),
        (2, 'old'),
        (2, 'new'),
    ]
    pipe = bot.define('p1').append(rows)
    namespace = {
        'bot': bot,
        'p1': pipe,
    }
    eval(code, {}, namespace)
    items = list(pipe.items())
    return repr(items)
def main():
    """Crawl Baidu search results for the keywords and write them to a file.

    Builds the search URLs, wires a pipe that loads each page and sends the
    items of the first page and of every further result page to
    ``ex_output/baidu.txt``.  The flow is rendered to
    ``ex_output/baiduspider`` and then run.
    """
    keywords = ['贸易战', '世界杯']
    search_template = 'https://www.baidu.com/s?wd=%s'
    urls = [search_template % keyword for keyword in keywords]

    outputfile = aiofile('ex_output/baidu.txt')

    # Nodes are created in the same order as they appear in the pipe.
    page_loader = HttpLoader()
    first_page_items = Branch(get_all_items, outputfile)
    other_pages_items = Branch(
        get_all_page_url,
        HttpLoader(),
        get_all_items,
        outputfile,
    )
    Pipe(
        urls,
        page_loader,
        first_page_items,
        other_pages_items,
    )

    # Render the flow chart.
    Bot.render('ex_output/baiduspider')
    Bot.run()
def test_run_limits():
    """``run -l 1,1,0`` processes every key of 'p1' into 'p2'.

    All three keys end up upper-cased in 'p2' and both tasks have been
    evaluated three times.
    """
    fill_task = task('p1').once().append(['a', 'b', 'c'])
    upper_task = task('p1', 'p2').select(this.key.upper())
    pipeline = {'tasks': [fill_task, upper_task]}

    bot = Bot()
    source = bot.define('p1')
    target = bot.define('p2')

    bot.main(pipeline, ['run', '-l', '1,1,0'])

    assert list(source.keys()) == ['a', 'b', 'c']
    assert list(target.keys()) == ['A', 'B', 'C']
    assert pipeline['tasks'][0]._evals == 3
    assert pipeline['tasks'][1]._evals == 3
def test_run():
    """``run -f`` executes a pipeline whose pipes are declared in the dict.

    Pipe 'a' keeps the appended keys 'a', 'A', 'b'; pipe 'b' ends up with
    the keys 'A' and 'B' after the select and compact tasks.
    """
    pipes = [
        define('a'),
        define('b'),
    ]
    tasks = [
        task('a').append(['a', 'A', 'b']),
        task('a', 'b').select(this.key.upper()),
        task().compact(),
    ]
    pipeline = {'pipes': pipes, 'tasks': tasks}

    bot = Bot()
    bot.main(pipeline, ['run', '-f'])

    keys_a = list(bot.pipe('a').keys())
    assert keys_a == ['a', 'A', 'b']
    keys_b = list(bot.pipe('b').keys())
    assert keys_b == ['A', 'B']
def test_rename(bot):
    """Rename pipe 'p1' to 'pp' and check the ``status`` table of a new bot.

    NOTE(review): the second bot is created on 'sqlite:///:memory:';
    presumably that is a separate database from the ``bot`` fixture's, in
    which case the status output would not depend on the rename -- confirm.
    """
    for name in ('p1', 'p2'):
        bot.define(name)
    bot.main(argv=['rename', 'p1', 'pp'])

    reopened = Bot('sqlite:///:memory:', output=io.StringIO())
    for name in ('pp', 'p2'):
        reopened.define(name)
    reopened.main(argv=['status'])

    expected = (
        ' id rows source\n'
        ' errors left target\n'
        '=================================\n'
        ' 1 0 pp\n'
        '---------------------------------\n'
        ' 2 0 p2\n'
        '---------------------------------\n'
    )
    printed = reopened.output.output.getvalue()
    assert printed == expected
def test_run_freq():
    """A task limited with ``freq(days=3)`` is skipped until 3 days passed.

    Running on 2017-01-01 and again on 2017-01-02 leaves a single key in
    each pipe; running on 2017-01-04 adds a second one.
    """
    tasks = [
        task('p1').freq(days=3).append(['a']),
        task('p1', 'p2').select(this.key.upper()),
    ]

    bot = Bot()
    source = bot.define('p1')
    target = bot.define('p2')

    # Two runs one day apart: the second falls inside the 3-day window.
    for moment in ('2017-01-01 00:00:00', '2017-01-02 00:00:00'):
        with freezegun.freeze_time(moment):
            bot.commands.run(tasks)

    assert list(source.keys()) == ['a']
    assert list(target.keys()) == ['A']

    with freezegun.freeze_time('2017-01-04 00:00:00'):
        bot.commands.run(tasks)

    assert list(source.keys()) == ['a', 'a']
    assert list(target.keys()) == ['A', 'A']
def test_rename(bot):
    """Rename pipe 'p1' to 'pp' and check the ``status`` table of a new bot.

    NOTE(review): the second bot is created on 'sqlite:///:memory:';
    presumably that is a separate database from the ``bot`` fixture's, in
    which case the status output would not depend on the rename -- confirm.
    """
    for name in ('p1', 'p2'):
        bot.define(name)
    bot.main(argv=['rename', 'p1', 'pp'])

    reopened = Bot('sqlite:///:memory:', output=io.StringIO())
    for name in ('pp', 'p2'):
        reopened.define(name)
    reopened.main(argv=['status'])

    expected = (
        ' id rows source\n'
        ' errors left target\n'
        '=================================\n'
        ' 1 0 pp\n'
        '---------------------------------\n'
        ' 2 0 p2\n'
        '---------------------------------\n')
    printed = reopened.output.output.getvalue()
    assert printed == expected
def test_init_path(tmpdir):
    """A plain file path given to ``Bot`` becomes a ``sqlite:///`` engine URL."""
    db_file = str(tmpdir.join('data.db'))
    engine_url = str(Bot(db_file).engine.url)
    assert engine_url == 'sqlite:///%s' % db_file
def test_run_target():
    """``run`` with pipe names only executes the tasks for those pipes.

    The pipeline chains a -> b (upper-case) -> c (lower-case) and ends with
    a compact task.  Each step below runs a different selection of pipes
    and checks the keys of all three pipes afterwards.
    """
    pipeline = {
        'pipes': [],
        'tasks': [
            task('a').once().append(['a']),
            task('a', 'b').select(this.key.upper()),
            task('b', 'c').select(this.key.lower()),
            task().compact(),
        ],
    }

    bot = Bot()
    for name in ('a', 'b', 'c'):
        bot.define(name)

    def keys(name):
        # Current keys of the named pipe, as a list.
        return list(bot.pipe(name).keys())

    # Only 'a': the initial append happens, nothing flows further.
    bot.main(pipeline, ['run', 'a', '-f'])
    assert keys('a') == ['a']
    assert keys('b') == []
    assert keys('c') == []

    # Only 'b': a -> b is processed.
    bot.main(pipeline, ['run', 'b', '-f'])
    assert keys('a') == ['a']
    assert keys('b') == ['A']
    assert keys('c') == []

    # 'a' and 'b' after a manual append to 'a'.
    bot.pipe('a').append('b')
    bot.main(pipeline, ['run', 'a', 'b', '-f'])
    assert keys('a') == ['a', 'b']
    assert keys('b') == ['A', 'B']
    assert keys('c') == []

    # 'b' and 'c': b -> c is processed.
    bot.main(pipeline, ['run', 'b', 'c', '-f'])
    assert keys('a') == ['a', 'b']
    assert keys('b') == ['A', 'B']
    assert keys('c') == ['a', 'b']

    # Only 'c' after a manual append to 'b'.
    bot.pipe('b').append('C')
    bot.main(pipeline, ['run', 'c', '-f'])
    assert keys('a') == ['a', 'b']
    assert keys('b') == ['A', 'B', 'C']
    assert keys('c') == ['a', 'b', 'c']

    # No pipe names: the whole pipeline runs.
    bot.main(pipeline, ['run', '-f'])
    assert keys('a') == ['b', 'a']
    assert keys('b') == ['B', 'C', 'A']
    assert keys('c') == ['b', 'c', 'a']
def test_init_with_engine_instance():
    """``Bot`` can be constructed from an existing SQLAlchemy engine."""
    engine = sa.create_engine('sqlite:///:memory:')
    Bot(engine)
def test_init():
    """A database URL given to ``Bot`` is used unchanged as the engine URL."""
    bot = Bot('sqlite:///:memory:')
    engine_url = str(bot.engine.url)
    assert engine_url == 'sqlite:///:memory:'
def test_autodefine():
    """``autodefine()`` makes pipes already stored in the database available.

    A pipe defined through one bot is unknown to a second bot created on
    the same engine (``KeyError``), but a bot on which ``autodefine()``
    was called can read its keys.
    """
    engine = sa.create_engine('sqlite:///:memory:')

    writer = Bot(engine)
    writer.define('a').append([1, 2, 3])

    plain = Bot(engine)
    with pytest.raises(KeyError) as e:
        plain.pipe('a')
    assert str(e.value) == "'a'"

    discovered = Bot(engine).autodefine()
    stored_keys = list(discovered.pipe('a').keys())
    assert stored_keys == [1, 2, 3]
#!/usr/bin/env python3
"""Databot pipeline that scrapes link entries from the Reddit front page.

The 'index' pipe downloads https://www.reddit.com/ once; the 'news' pipe
receives one row per ``.thing.link`` element, keyed by the link URL, and is
exported to /tmp/reddit.jsonl.  When run as a script the pipeline is handed
to a bot stored in /tmp/reddit.db.
"""

from databot import Bot, define, task, first

# Key (link href) and value fields extracted for every '.thing.link' element.
_news_row = (
    '.entry .title > a@href',
    {
        'title': '.entry .title > a:text',
        'score': '.midcol .score.likes@title',
        'time': first(['.tagline time@datetime']),
        'comments': '.entry a.comments:text',
    },
)

pipeline = {
    'pipes': [
        define('index'),
        define('news'),
    ],
    'tasks': [
        task('index').once().download('https://www.reddit.com/'),
        task('index', 'news').select(['.thing.link', _news_row]),
        task('news').export('/tmp/reddit.jsonl'),
        task().compact(),
    ],
}

if __name__ == '__main__':
    Bot('/tmp/reddit.db').main(pipeline)
def show_info(i):
    """Call ``Bot.debug()``; the incoming value *i* is not used."""
    Bot.debug()
def test_init_no_args():
    """``Bot()`` without arguments uses an in-memory SQLite engine URL."""
    bot = Bot()
    engine_url = str(bot.engine.url)
    assert engine_url == 'sqlite:///:memory:'