def test_run_error_limit_n(bot, capsys):
    def handler(row):
        if row.key > 1:
            raise ValueError('Error.')
        else:
            yield row.key, row.value.upper()

    pipeline = {
        'pipes': [
            define('p1'),
            define('p2'),
        ],
        'tasks': [
            task('p1').once().append([(1, 'a'), (2, 'b'), (3, 'c'), (4, 'd')]),
            task('p1', 'p2').call(handler),
        ]
    }

    with pytest.raises(ExpressionError):
        bot.main(pipeline, argv=['run', '-f', '2', '-l', '0'])

    assert bot.output.output.getvalue() == textwrap.dedent('''\
        Validating pipeline.
        Run pipeline (limit=0).
        - key: 3
          value: 'c'
    ''')
    assert list(bot.pipe('p2').items()) == [(1, 'A')]
    assert capsys.readouterr()[0] == 'Interrupting bot because error limit of 2 was reached.\n'
    assert task('p1', 'p2').errors.count()._eval(bot) == 2
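# Hedged reading of the command line used above, inferred only from this
# test's own assertions (not from documented behaviour): '-f 2' appears to set
# the error limit reported in the "error limit of 2 was reached" message, and
# '-l 0' the run limit echoed in "Run pipeline (limit=0).".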
def test_main(db):
    pipeline = {
        'pipes': [
            define('p1'),
            define('p2'),
        ],
        'tasks': [
            task('p1').once().append([('1', 'a'), ('2', 'b'), ('3', 'c')]),
            task('p1', 'p2').select(this.key, this.value.upper()),
        ],
    }
    bot = db.Bot().main(pipeline, argv=['-v0', 'run'])
    assert list(bot.pipe('p1').items()) == [('1', 'a'), ('2', 'b'), ('3', 'c')]
    assert list(bot.pipe('p2').items()) == [('1', 'A'), ('2', 'B'), ('3', 'C')]
def main():
    botlib.runbot({
        'pipes': [
            define('places'),
        ],
        'tasks': [
            task('places').daily().clean().append(query_places(), progress='places'),
            task('places').export('data/osm/places.csv', include=[
                'osm_id',
                'type',
                'place',
                'population',
                'wikipedia_title',
                'wikipedia_lang',
                'lon',
                'lat',
                'admin_level_6_osm_id',
                'admin_level_6',
                'admin_level_5_osm_id',
                'admin_level_5',
                'admin_level_4_osm_id',
                'admin_level_4',
            ])
        ],
    })
def test_run():
    pipeline = {
        'pipes': [
            define('a'),
            define('b'),
        ],
        'tasks': [
            task('a').append(['a', 'A', 'b']),
            task('a', 'b').select(this.key.upper()),
            task().compact(),
        ],
    }
    bot = Bot()
    bot.main(pipeline, ['run', '-f'])
    assert list(bot.pipe('a').keys()) == ['a', 'A', 'b']
    assert list(bot.pipe('b').keys()) == ['A', 'B']
def test_run(bot):
    pipeline = {
        'pipes': [
            define('p1'),
            define('p2'),
        ],
        'tasks': [
            task('p1').once().append([(1, 'a'), (2, 'b')]),
            task('p1', 'p2').select(this.key, this.value.upper()),
        ]
    }
    bot.main(pipeline, argv=['run', '-f'])
    assert bot.output.output.getvalue() == textwrap.dedent('''\
        Validating pipeline.
        Run pipeline (limit=1).
        Run pipeline (limit=0).
    ''')
    assert list(bot.pipe('p2').items()) == [(1, 'A'), (2, 'B')]
def main(argv=None, output=sys.stdout):
    argv = argv or sys.argv[1:]

    parser = argparse.ArgumentParser()
    parser.add_argument('db', help='path to sqlite database or database connection string')
    args = parser.parse_args(argv[:1])

    bot = databot.Bot(args.db, output=output)
    pipeline = {
        'pipes': [databot.define(pipe.pipe) for pipe in get_pipe_tables(bot)],
        'tasks': [],
    }
    bot.main(pipeline, argv=argv[1:])
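# A minimal usage sketch (an assumption, not part of the original wrapper):
# the first positional argument names the database and everything after it is
# forwarded to databot's own command handling via bot.main(), e.g. the 'run'
# command seen in the tests above.
if __name__ == '__main__':
    main()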
class key(Call):

    def __init__(self, *queries):
        self.queries = queries

    def __call__(self, select, row, node, many=False, single=True):
        return '/'.join([
            normtime(select.render(row, node, q, many, single))
            for q in self.queries
        ])


pipeline = {
    'pipes': [
        define('pages', compress=True),
        define('data'),
    ],
    'tasks': [
        task('pages').freq(minutes=5).download('http://www.meteo.lt/lt_LT/miestas?placeCode=Vilnius'),
        task('pages', 'data').select([
            '.forecast-hours', (
                key(select(['xpath://body css:.forecast-hours .forecastTime:text']).min(),
                    select('.forecastTime:text')),
                {
                    'base': select(['xpath://body css:.forecast-hours .forecastTime:text']).min().apply(normtime),  # precision=hours base time
                    'time': select('.forecastTime:text').apply(normtime),  # precision=hours prediction time
                    'temperature': select('.temperature:text').cast(int),  # °C
                    'wind_direction': select('.windDirectionGroundDegree:text').cast(int),  # degrees
                    'wind_speed': select('.windSpeedGround:text').cast(int),  # m/s
                    'gust_speed': select('.windGustGround:text').cast(int),  # m/s
#!/usr/bin/env python3

from databot import Bot, define, task, first

pipeline = {
    'pipes': [
        define('index'),
        define('news'),
    ],
    'tasks': [
        task('index').once().download('https://www.reddit.com/'),
        task('index', 'news').select([
            '.thing.link', ('.entry .title > a@href', {
                'title': '.entry .title > a:text',
                'score': '.midcol .score.likes@title',
                'time': first(['.tagline time@datetime']),
                'comments': '.entry a.comments:text',
            })
        ]),
        task('news').export('/tmp/reddit.jsonl'),
        task().compact(),
    ],
}

if __name__ == '__main__':
    Bot('/tmp/reddit.db').main(pipeline)
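# A small sketch (not part of the original script) for inspecting the export;
# it assumes task('news').export('/tmp/reddit.jsonl') above writes one JSON
# object per line.
import json

def preview_export(path='/tmp/reddit.jsonl', limit=5):
    # Print the title and score of the first few exported news records.
    with open(path) as f:
        for i, line in enumerate(f):
            if i >= limit:
                break
            record = json.loads(line)
            print(record.get('title'), record.get('score'))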
        for k, v in json.loads(tops.group(1)).items()
    }
    data = json.loads(data.group(1))
    assert len(tops) == len(data)
    data = dict(zip(sorted(tops.keys()), data))
    return {
        'year': data,
        'mean': sum(data.values()) / len(data),
        'max': max(data.values()),
        'top': min(tops.values()),
    }


pipeline = {
    'pipes': [
        define('raidės-nuorodos'),
        define('raidės-puslapiai', compress=True),
        define('sąrašas-nuorodos'),
        define('sąrašas-puslapiai', compress=True),
        define('vardai-nuorodos'),
        define('vardai-puslapiai', compress=True),
        define('vardai'),
    ],
    'tasks': [
        # List of first letters of names
        task('raidės-nuorodos').monthly().append('https://www.tevu-darzelis.lt/vaiku-vardai/A/'),
        task('raidės-puslapiai', 'raidės-nuorodos', watch=True).select(['#alphabet li a@href']).dedup(),
        task('raidės-nuorodos', 'raidės-puslapiai', watch=True).download(),
                break
            try:
                browser.wait.until(
                    attribute_has_changed(By.CSS_SELECTOR, '.searchResultDescription a', 'href', first_item_link))
            except TimeoutException:
                browser.get_screenshot_as_file('/tmp/epaveldas_attribute_has_changed.png')
    except:
        browser.get_screenshot_as_file('/tmp/epaveldas_error.png')
        raise
    finally:
        browser.quit()


pipeline = {
    'pipes': [
        define('paieškos-nuorodos'),
    ],
    'tasks': [
        task('paieškos-nuorodos').once().clean().append(
            extract_index_urls(), progress='paieškos-nuorodos').dedup(),
    ],
}

if __name__ == '__main__':
    botlib.runbot(pipeline)
#!/usr/bin/env python3

import yaml

import botlib

from databot import define, task, this, select

with open('settings.yml') as f:
    settings = yaml.safe_load(f)

cookies = settings['cookies']['www.lrs.lt']

pipeline = {
    'pipes': [
        define('klausimų-puslapiai', botlib.dburi('lrs/posedziai')),
        define('balsavimų-sąrašas'),
        define('balsavimų-puslapiai', compress=True),
        define('balsavimų-duomenys'),
        define('registracijos-sąrašas'),
        define('registracijos-puslapiai', compress=True),
    ],
    'tasks': [
        # Agenda item (votes)
        task('klausimų-puslapiai', 'balsavimų-sąrašas').select(
            ['.sale_svarst_eiga tr td[2] xpath:a[text()="balsavimas"]', '@href'],
            check='xpath://h1[contains(text(), "Darbotvarkės klausimas")]/text()',
        ).dedup(),
#!/usr/bin/env python3

import botlib

from datetime import timedelta
from databot import define, task, this, select

pipeline = {
    'pipes': [
        define('index urls'),
        define('index pages'),
        define('dataset urls'),
        define('dataset pages'),
        define('dataset data'),
        define('datasets'),
    ],
    'tasks': [
        task('index urls').daily().append('http://opendata.gov.lt/index.php?vars=/public/public/search'),
        task('index urls', 'index pages', watch=True).download(),
        task('index pages', 'index urls', watch=True).select(['td > a.path@href']).dedup(),
        task('index pages', 'dataset urls').select(['form[name=frm] > table > tr > td[3] > a@href']),
        task('dataset urls').clean(timedelta(days=7)).dedup(),
        task('dataset urls', 'dataset pages').download(),
        task('dataset pages', 'dataset data').select(this.key, [
            'table xpath:tr[count(td)=2]', (
                'td[1]:content',
                select('td[2]:content').strip(),
#!/usr/bin/env python3

import yaml

import botlib

from databot import define, task, this, select, call

with open('settings.yml') as f:
    settings = yaml.safe_load(f)

cookies = settings['cookies']['www.lrs.lt']

pipeline = {
    'pipes': [
        define('klausimų-puslapiai', botlib.dburi('lrs/posedziai')),
        define('dokumentų-sąrašas'),
        define('dokumentų-puslapiai', compress=True),
        define('susijusių-dokumentų-sąrašas'),
        define('susijusių-dokumentų-puslapiai', compress=True),
        define('metadata'),
        define('texts'),
    ],
    'tasks': [
        task('klausimų-puslapiai', 'dokumentų-sąrašas').select([
            '#page-content div.default b xpath:a[text()="dokumento tekstas"]/@href',
        ]).dedup(),
        task('dokumentų-sąrašas', 'dokumentų-puslapiai').download(
            cookies=cookies,
            check='#page-content div.default b xpath:a[text()="dokumento tekstas"]'),
        task('dokumentų-puslapiai', 'susijusių-dokumentų-sąrašas').select([
        sep=r';\s*',
        engine='python',
        encoding='iso-8859-4',
        decimal=',',
        skiprows=12,
        comment=';',
        header=None,
        names=names,
    )
    yield from ((str(x[key]), json.loads(x.to_json())) for _, x in data.iterrows())


pipeline = {
    'pipes': [
        define('vidurkiai-zip'),
        define('skaiciai-zip'),
        define('vidurkiai'),
        define('skaiciai'),
        define('imones-puslapis', compress=True),
        define('imones'),
    ],
    'tasks': [
        task('vidurkiai-zip').monthly().download('http://sodra.is.lt/Failai/Vidurkiai.zip'),
        task('vidurkiai-zip', 'vidurkiai').
            call(partial(read_csv, 'VIDURKIAI.CSV', 'kodas', ['regnr', 'kodas', 'alga', 'autorine', 'viso'])).
            dedup(),
        task('skaiciai-zip').monthly().download('http://sodra.is.lt/Failai/Apdraustuju_skaicius.zip'),
        task('skaiciai-zip', 'skaiciai').
            call(partial(read_csv, 'APDRAUSTUJU_SKAICIUS.CSV', 'kodas', ['regnr', 'kodas', 'skaicius'])).
            dedup(),
    for qry in queries:
        for row in conn.execute(qry):
            yield row.osm_id, {
                'osm_id': row.osm_id,
                'name': row.name,
                'lon': row.lon,
                'lat': row.lat,
                'religion': row.religion,
                'denomination': row.denomination,
                'place': find_closes_place(conn, point, row),
            }


pipeline = {
    'pipes': [
        define('baznycios'),
    ],
    'tasks': [
        task('baznycios').once().clean().append(query(), progress='baznycios').compact(),
        task('baznycios').once().export('data/osm/baznycios.csv', include=[
            'osm_id',
            'name',
            'religion',
            'denomination',
            'lon',
            'lat',
            'place.osm_id',
            'place.name',
            'place.distance',
#!/usr/bin/env python3

import botlib

from databot import define, task, this, strformat, select

search_url = 'https://www.limis.lt/greita-paieska/rezultatai/-/exhibitList/form?searchOnlySimpleMetadata=false&searchOnlyWithImages=false&searchInExhibits=true&searchInArchives=true&searchInLibraries=true&searchInAudioVideo=true&searchInPhotos=true&s_tab=&s_id=2duvdg1N5K4dHB0W&backUrl=https%3a%2f%2fwww.limis.lt%2fpradinis%2f-%2fexhibitSearchFast%2fform&listDisplayMode=simple&_exhibitListportlet_WAR_limiskportlet_searchType=&page={page}&rowsOnPage=48'

pipeline = {
    'pipes': [
        define('paieška'),
        define('paieškos-puslapių-numeriai'),
        define('paieškos-puslapiai', compress=True),
        define('eksponatų-nuorodos'),
        define('eksponatų-puslapiai', compress=True),
        define('eksponatų-duomenys'),
    ],
    'tasks': [
        task('paieška').daily().clean().download(search_url.format(page=1), check='select[name=page]'),
        task('paieška', 'paieškos-puslapių-numeriai').daily().
            select(['select[name=page] option @value']).
            dedup(),
        task('paieškos-puslapių-numeriai', 'paieškos-puslapiai').
            download(strformat(search_url, page=this.key), check='#exhibitListBlockId'),
        task('paieškos-puslapiai', 'eksponatų-nuorodos').select([
            '#exhibitListBlockId .thumbs-with-title > li span.vertical-scroller > a',
            ('@href', ':text'),
        ]),
        start = min(start)
        end = max(end)
        total = sum(b - a for a, b in years)
    else:
        start = end = 0
    return {
        'parapija': parapija,
        'pradžia': start,
        'pabaiga': end,
        'trukmė': total,
    }


pipeline = {
    'pipes': [
        define('paieškos-nuorodos'),
        define('paieškos-puslapiai', compress=True),
        define('knygos-duomenys'),
    ],
    'tasks': [
        # task('paieškos-nuorodos').once().append(extract_index_urls(),
        #                                         progress='paieškos-nuorodos').dedup(),
        task('paieškos-nuorodos', 'paieškos-puslapiai').download(),
        task('paieškos-puslapiai', 'knygos-duomenys').select(this.key, {
            'url': this.value.url,
            'antraštė': select('.authorTitle').text(),
            'd1': select([