# Scrape model for the manageengine.com products overview page.
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.sources import ProgramSource, WebSource
from modelscraper.parsers import TextParser, CSVParser
from scrape_models.objects.networking import ip_phase, port_phase

# Entry point: the products listing page.
start = (Source(url='https://www.manageengine.com/products.html?MEtab'),)

# Extracts each product link's hostname (regex capture group 0) and stores it
# in MongoDB; source={'active': False} records the URL without crawling it.
demo_template = Template(
    name='demo',
    selector='.all_prod_over',
    db='test',
    db_type='MongoDB',
    table='test',
    attrs=[
        Attr(
            name='url',
            selector='a',
            func='sel_url',
            kws={'regex': 'http[s]?://([\.a-z]*)[\/\?]', 'index': 0},
            source={'active': False}
        )
    ]
)

# NOTE(review): fragment truncated in source — the phases list is never
# closed in this chunk.
manageengine = ScrapeModel(
    name='manageengine', domain='', num_getters=1, phases=[
# Defcon vulnerability probe: check company websites for exposed .git/config
# and .DS_STORE files.
from pymongo import MongoClient

# Companies with a website but no resolved IP yet.
no_ip = MongoClient().defcon.companies.find({
    'website': {'$ne': None},
    'ip': {'$exists': False}
})
# All companies that have a website.
companies2 = MongoClient().defcon.companies.find({'website': {'$ne': None}})

defcon_base = Template(db='defcon', db_type='MongoDB')

# Single-host test target.
jacco_base = 'jackling.nl'
jacco = (Source(url='jackling.nl'), )
jacco_git = (Source(url='http://{}/.git/config'.format(jacco_base)), )
jacco_ds = (Source(url='http://{}/.DS_STORE'.format(jacco_base)), )

# One probe URL per company; the company's id rides along as a 'kvk'
# attribute (and is copied onto derived sources) so results can be joined
# back to the company record.
git_sources = (Source(url='http://{}/.git/config'.format(c['website']),
                      attrs=[Attr(name='kvk', value=c['id'])],
                      copy_attrs='kvk')
               for c in companies2)
ds_store_sources = (Source(url='http://{}/.DS_STORE'.format(c['website']),
                           attrs=[Attr(name='kvk', value=c['id'])],
                           copy_attrs='kvk')
                    for c in companies2)

# NOTE(review): fragment truncated in source — the attrs tuple is
# unfinished. `Template`/`Attr`/`Source` are not imported in this chunk;
# presumably imported above it — verify.
git_template = Template(name='Git exposed', db_type='MongoDB', db='defcon',
                        table='git', attrs=(Attr(name='vulnerable',
# 'economie', # 'tech', # 'opmerkelijk', # 'cultuur-en-media', # 'koningshuis', ] now = datetime.datetime.now() begin = datetime.datetime.strptime('2010-01-01', '%Y-%m-%d') timezone = datetime.timezone(datetime.timedelta(0, 3600)) dates = [ begin + datetime.timedelta(days=d) for d in range(0, (now - begin).days) ] nos_sources = (Source(url="http://nos.nl/nieuws/{}/archief/{}".format( cat, date.strftime('%Y-%m-%d')), attrs=(Attr(name='category', value=cat), ), copy_attrs=True) for cat in categories for date in dates) title_attr = Attr(name='title', selector='h1', func='sel_text') text_attr = Attr(name='text', selector='.article_body', func='sel_text') date_attr = Attr(name='date', selector='time:nth-of-type(1)', func='sel_attr', kws={'attr': 'datetime'}) author_attr = Attr(name='author', selector='span[itemprop="author"]', func='sel_text') tags_attr = Attr(name='tags', selector='.ib.space-right a.link-grey', func='sel_text')
# Makro product catalogue scraper: collect category URLs, then walk the
# product lists.
from modelscraper.dispatcher import Dispatcher
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source

# Phase 1: category links from the left-hand navigation; URLs are stored
# and queued for the next phase (source={'active': False}).
product_categories = Phase(sources=(
    Source(url="https://www.makro.nl/cat/nl/products"),),
    templates=(
        Template(
            name='product_category',
            selector='#left-navigation-container ul.vertical > li > a',
            db_type='MongoDB',
            db='makro',
            table='product_categories',
            attrs=[
                Attr(name='url', func='sel_url', source={'active': False}),
            ]
        ),
    )
)

# Phase 2: product URLs per category page plus pagination follow-links.
# NOTE(review): fragment truncated in source — this Phase's templates list
# is never closed.
product_lists = Phase(templates=[
    Template(
        name='product_urls',
        selector='.product-list .product-tiles',
        db_type='MongoDB',
        db='makro',
        table='product_urls',
        attrs=[
            Attr(name='url', selector='.productname a', func='sel_url',
                 source={'active': False})
        ]
    ),
    Template(
        name='pagination',
        selector='.paging',
        attrs=[
            Attr(name='url', selector='a', func='sel_url', source=True),
        ]),
# Lucky TV episode scraper: walk the paginated episode archive.
from modelscraper.dispatcher import Dispatcher
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from pymongo import MongoClient
from modelscraper.sources import WebSource
from modelscraper.parsers import HTMLParser

cl = MongoClient()
db = cl.lucky_tv
col = db.episode

# Archive pages 1..49.
sources = (Source(url="http://www.luckytv.nl/afleveringen/page/{}/".format(i))
           for i in range(1, 50))

# NOTE(review): fragment truncated in source — the attrs tuple is
# unfinished.
LuckyTV = ScrapeModel(
    name='Lucky TV',
    domain='http://www.luckytv.nl/',
    num_getters=2,
    phases=[
        Phase(source_worker=WebSource, parser=HTMLParser, sources=sources,
              templates=(Template(
                  name='episode',
                  selector='article.video',
                  db_type='mongo_db',
                  db='lucky_tv',
                  table='episodes',
                  attrs=(
                      Attr(name='url', selector='a:nth-of-type(1)',
                           func='sel_url'),
                      Attr(name='title',
# NOTE(review): fragment truncated in source — this attrs list belongs to a
# template whose opening (and the menu_template/productmenu_template,
# cookie, and the product_name/price/nutrition Attr factories) precedes
# this chunk.
        attrs=[
            Attr(name='submenu_item', selector='.c-product-tile__meta > a',
                 func='sel_url', source={'active': False}),
            # NOTE(review): 'is-nexy' looks like a typo for 'is-next' —
            # confirm against the live markup before changing.
            Attr(name='pagination_item', selector='li.is-nexy > a',
                 source=True)
        ])

# Product detail fields (calls into Attr factories defined above this
# chunk).
product_name = product_name(selector='.c-offer__title')
price = price(selector='div.c-offer__price')
nutrition = nutrition(selector='.c-offer__nutrition table td')

product = Template(name='product', db='foods', table='spar2',
                   db_type='MongoDB',
                   attrs=[product_name, price, nutrition])

# Three-phase crawl: menu -> product menus -> product detail pages.
spar = ScrapeModel(
    name='spar.nl',
    cookie=cookie,
    domain='https://spar.nl',
    phases=[
        Phase(sources=(Source(url='https://spar.nl/boodschappen/'), ),
              templates=[menu_template]),
        Phase(templates=[productmenu_template]),
        Phase(templates=[product])
    ])
# UEFA Euro 2016 squads: team pages first, then player details.
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.workers import WebSource
from modelscraper.parsers import HTMLParser

# NOTE(review): fragment truncated in source — the player attrs list is
# unfinished.
uefa = ScrapeModel(
    name='eufa',  # NOTE(review): likely a typo for 'uefa'
    domain='http://uefa.com',
    num_getters=2,
    phases=[
        # Phase 1: qualified-team links, queued for the squad phase.
        Phase(sources=(Source(
            url="http://www.uefa.com/uefaeuro/season=2016/teams/index.html"),
        ),
            templates=[
                Template(name='team',
                         selector='.teams--qualified',
                         attrs=[
                             Attr(name='url', selector='a', func='sel_url',
                                  source={'active': False}),
                         ])
            ]),
        # Phase 2: per-player records from each squad page.
        Phase(templates=[
            Template(name='player',
                     selector='.squad--team-player',
                     db_type='MongoDB',
                     db='uefa',
                     table='players',
                     attrs=[
                         Attr(name='name',
                              selector='.squad--player-name',
# NOTE(review): fragment truncated in source — the definition this
# `func='sel_text')` closes (and the other *_attr definitions plus the
# calendar/year/article_url/pagination templates) precedes this chunk.
                 func='sel_text')

# Article template combining the field extractors defined above this chunk.
article = Template(
    name='article',
    selector='.col__inner',
    attrs=(
        title_attr,
        text_attr,
        date_attr,
        author_attr,
        tags_attr
    )
)

# NOTE(review): named 'rtl' but crawls parool.nl and stores into the
# 'parool' database — probably copied from another model; verify.
rtl = ScrapeModel(
    name='rtl',
    domain='http://www.rtlnieuws.nl/',
    num_getters=1,
    phases=[
        Phase(sources=[
            Source(url="http://www.parool.nl/archief/2012")],
            templates=(calendar, year)
        ),
        Phase(templates=(
            article_url(db_type='mongo_db', db='parool',
                        table='article_urls'),
            pagination)
        ),
        Phase(templates=(article(db_type='mongo_db', db='parool',
                                 table='articles'), )
        ),
    ])
# MIDI song scraper for freemidi.org (title/artist/midi_url Attr factories
# are defined elsewhere in the surrounding file).

# Base template: one song row in the MongoDB `midi.songs` collection.
song = Template(name='song', db_type='MongoDB', db='midi', table='songs',
                attrs=[title, artist, midi_url])

# Site-specific selectors for the freemidi.org download page.
_title_sel = 'li.active:nth-child(3) > a:nth-child(1) > span:nth-child(1)'
_artist_sel = ('ol.breadcrumb:nth-child(1) > li:nth-child(2) > '
               'a:nth-child(1) > span:nth-child(1)')

# Specialisation of the song template for freemidi: own table and selectors.
freemidi_template = song(
    table='freemidi',
    selector='#mainContent div.col-xs-12:nth-of-type(1)',
    attrs=[
        title(selector=_title_sel),
        artist(selector=_artist_sel),
    ])

# One download page per numeric song id; the matching getter URL is attached
# up front as the midi_url attribute.
freemidi_sources = (
    Source(url='https://freemidi.org/download-{}'.format(song_id),
           attrs=[midi_url(
               value='https://freemidi.org/getter-{}'.format(song_id))])
    for song_id in range(25803))

freemidi = ScrapeModel(
    domain='http://freemidi.org',
    phases=[
        Phase(n_workers=3,
              sources=freemidi_sources,
              templates=[freemidi_template])
    ])
# NOTE(review): fragment truncated in source — the opening of
# autoscout_template (and the city/url Attr factories) precedes this chunk.
        city(selector='.location-name'),
        url(selector='h2 > a'),
    ]
)

autoscout = ScrapeModel(
    name='autoscout24',
    domain='autoscout24.nl',
    phases=[
        # Empty sources: presumably fed from elsewhere — verify.
        Phase(sources=[], templates=[autoscout_template]),
    ])

# NOTE(review): `StoreObject`, `store_mongo` and the bare name `sel_attr`
# are not defined or imported in this chunk; `func=sel_attr` (unquoted,
# unlike the string form used elsewhere) will raise NameError unless they
# are defined above this chunk.
autotrader = ScrapeModel(
    name='autotrader',
    domain='http://autotrader.nl',
    num_sources=1,
    cookies={'CookieOptIn': 'true'},
    phases=[
        Phase(
            sources=[
                Source(url='http://www.autotrader.nl/motor/zoeken/'),
            ],
            templates=[
                Template(name='motorcycle',
                         selector='.result',
                         store=StoreObject(func=store_mongo,
                                           kws={'db': 'moto',
                                                'collection': 'autotrader'}),
                         attrs=[
                         ]),
                Template(name='next_page',
                         selector='#pager',
                         attrs=[
                             Attr(name='url', func=sel_attr,
                                  selector='a.tracker',
                                  kws={'attr': 'href'},
                                  getter=Source()),
                         ]),
            ]
        )
    ])
# RTL Nieuws scraper (reuses the calendar/year/article_url/pagination
# templates defined elsewhere in the surrounding file).
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source

search_url = 'https://www.rtlnieuws.nl/search/nieuws/{}'
search_terms = ['economie', 'nederland']

# Field extractors for a single article page.
title_attr = Attr(name='title', selector='h1', func='sel_text')
text_attr = Attr(name='text', selector='p', func='sel_text')
date_attr = Attr(name='date', selector='time', func='sel_text')
author_attr = Attr(name='author', selector='span[itemprop="author"]',
                   func='sel_text')
tags_attr = Attr(name='tags', selector='.tag-list a.cta', func='sel_text')

article = Template(
    name='article',
    selector='.col__inner',
    attrs=(title_attr, text_attr, date_attr, author_attr, tags_attr))

# NOTE(review): named 'rtl' but the crawl targets parool.nl and stores into
# the 'parool' database.
rtl = ScrapeModel(
    name='rtl',
    domain='http://www.rtlnieuws.nl/',
    num_getters=1,
    phases=[
        Phase(sources=[Source(url="http://www.parool.nl/archief/2012")],
              templates=(calendar, year)),
        Phase(templates=(
            article_url(db_type='MongoDB', db='parool',
                        table='article_urls'),
            pagination)),
        Phase(templates=(
            article(db_type='MongoDB', db='parool', table='articles'), )),
    ])
# Jasmine Directory scraper: collect category and sub-page links into
# MongoDB.
from modelscraper.dispatcher import Dispatcher
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from pymongo import MongoClient
import string

start_url = Source(url='https://www.jasminedirectory.com/')

# Shared attrs: a followable sub-page link (queued for a later phase, with
# its parent attached) and the link's display text.
sub_page_url = Attr(name='sub_page', func='sel_url',
                    source={'active': False, 'parent': True})
sub_page_name = Attr(name='pagename', func='sel_text')

# Top-level categories land in 'maincats'...
category_temp = Template(
    name='sub_category',
    selector='li strong a:nth-of-type(1)',
    db='jasminedirectory',
    table='maincats',
    db_type='mongo_db',
    attrs=(sub_page_url, sub_page_name))

# ...second-level pages share the selector but land in 'subcats'.
category_temp2 = Template(
    name='sub_page',
    selector='li strong a:nth-of-type(1)',
    db='jasminedirectory',
    table='subcats',
    db_type='mongo_db',
    attrs=(sub_page_url, sub_page_name))

# Listed-website attrs (used by templates defined past this chunk).
website_url = Attr(name='url', func='sel_url')
website_name = Attr(name='name', func='sel_text')
# NOTE(review): fragment truncated in source — the openings of the
# category_menu/result_list/company templates precede this chunk.
                          selector='h3 a',
                          func='sel_url',
                          source={'active': False}), ))

# Follow pager links within the same phase (source=True).
pagination = Template(name='pagination', selector='.pagers',
                      attrs=(Attr(name='page', selector='a', func='sel_url',
                                  source=True), ))

# Three-phase crawl: category menu -> result lists (+ pagination) ->
# company detail pages.
bedrijven_pagina = ScrapeModel(
    name='Bedrijven Pagina',
    domain='https://www.bedrijvenpagina.nl/',
    num_getters=2,
    phases=[
        Phase(source_worker=WebSource, parser=HTMLParser,
              sources=[Source(url='https://www.bedrijvenpagina.nl/')],
              templates=(category_menu, )),
        Phase(source_worker=WebSource, parser=HTMLParser,
              templates=(result_list, pagination)),
        Phase(source_worker=WebSource, parser=HTMLParser,
              templates=(company, ))
    ])

disp = Dispatcher()
disp.add_scraper(bedrijven_pagina)
disp.run()
# Dabanga Sudan news scraper: article URLs plus pagination from the
# all-news listing.
from modelscraper.dispatcher import Dispatcher
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source

# NOTE(review): fragment truncated in source — the phases list is never
# closed.
dabanga = ScrapeModel(
    name='dabanga',
    domain='https://www.dabangasudan.org/en',
    num_getters=2,
    phases=[
        Phase(
            sources=(Source(url="https://www.dabangasudan.org/en/all-news"),
                     ),
            templates=(
                # Article links are stored and queued for a later phase.
                Template(name='article_url',
                         selector='.list-item.news-item-small',
                         db_type='mongo_db',
                         db='dabanga',
                         table='article_urls',
                         attrs=[
                             Attr(name='url',
                                  selector='a:nth-of-type(1)',
                                  func='sel_url',
                                  source={'active': False}),
                         ]),
                # Pager links are followed within the same phase.
                Template(name='pagination',
                         selector='.pager',
                         attrs=[
                             Attr(name='url', selector='a', func='sel_url',
                                  source=True),
                         ]),
            )),
# NPO series catalogue scraper: walk 242 paginated catalogue pages and store
# each program's title and detail URL.
from modelscraper.dispatcher import Dispatcher
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from pymongo import MongoClient
from modelscraper.sources import WebSource
from modelscraper.parsers import HTMLParser

series_url = "https://www.npo.nl/media/series?page={}&dateFrom=2014-01-01&tilemapping=normal&tiletype=teaser&pageType=catalogue"

# NOTE(review): fragment truncated in source — the templates/phases
# structures are never closed.
npo_tv_programs = ScrapeModel(
    name='npo_tv_programs',
    domain='http://npo.nl',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[Source(url=series_url.format(i))
                     for i in range(0, 242)],
            templates=(
                Template(
                    name='program',
                    selector='.content-column.quarter',
                    db_type='mongo_db',
                    db='npo_tv_programs',
                    table='programs',
                    attrs=(
                        Attr(name='title', selector='h3', func='sel_text'),
                        Attr(name='url',
                             selector='a.full-link',
                             func='sel_url',
                             source=Source(
                                 active=False)),  # source is for next run
                    )),
# Telegraaf search sources plus shared archive calendar/year templates.
from modelscraper.dispatcher import Dispatcher
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.workers import WebSource
from modelscraper.parsers import HTMLParser
from pymongo import MongoClient

cl = MongoClient()

# Bypass the cookie wall.
cookie = {'nl_cookiewall_version': '1'}

telegraaf_url = 'http://www.telegraaf.nl/jsp/search_result_page.jsp?method=&keyword=de&pagenr={}'
# Search-result pages 1..5000.
telegraaf_search = (Source(url=telegraaf_url.format(page))
                    for page in range(1, 5001))

# Day links inside the archive calendar table; each URL becomes a source
# for the next run (Source(active=False)).
calendar = Template(
    name='archive_url',
    selector='',
    attrs=(
        Attr(name='url',
             selector='td a',
             func='sel_url',
             source=Source(active=False)),
    ))

# Year links in the archive year list; followed within the same phase
# (source=True).
year = Template(
    name='archive_url_year',
    selector='.year-list__item',
    attrs=(
        Attr(name='url',
             selector='a',
             func='sel_url',
             source=True),
    ))
# Metro Nieuws scraper: JSON section lists for binnenland/buitenland.
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from pymongo import MongoClient

cl = MongoClient()

# Section id -> name (ids are used in the getsectionlist URLs below).
categories = {39: 'binnenland',
              2: 'buitenland', }
category_url = "http://www.metronieuws.nl/getsectionlist/{}/{}/0"

# One source per section page; the JSON payload lives under the 'data' key
# and each source carries its category as a preset attribute.
binnenland = (Source(url=category_url.format(39, i), json_key=['data'],
                     attrs=[Attr(name='category', value='binnenland')])
              for i in range(1, 2400))
buitenland = (Source(url=category_url.format(2, i), json_key=['data'],
                     attrs=[Attr(name='category', value='buitenland')])
              for i in range(1, 1900))
sources = [*binnenland, *buitenland]

# Article page field extractors.
title_attr = Attr(name='title', selector='h1', func='sel_text')
text_attr = Attr(name='text', selector='.article_body', func='sel_text')
date_attr = Attr(name='date', selector='time:nth-of-type(1)',
                 func='sel_attr', kws={'attr': 'datetime'})
author_attr = Attr(name='author', selector='span[itemprop="author"]',
                   func='sel_text')
tags_attr = Attr(name='tags', selector='.tag', func='sel_text')

# NOTE(review): fragment truncated in source — the article Template is
# unfinished.
article = Template(
    name='article',
# Volkskrant mobile-backend scraper: one large JSON search response yields
# article ids, which are then fetched via the article_url template
# (article_url, article and JSONParser come from the surrounding file).
search_url = 'http://vkplusmobilebackend.persgroep.net/rest/content/articles?query=&metadataNeeded=true&limit=10000'
next_page_url = 'http://vkplusmobilebackend.persgroep.net/rest/content/articles?query=&metadataNeeded=true&limit=10000&offset={}'

# Every preview's content id is extracted and queued (inactive) as a source
# built from article_url.
search_result = Template(
    name='search_result',
    db='volkskrant',
    table='article_urls',
    func='create',
    db_type='mongo_db',
    selector=('results', 'previews'),
    attrs=(
        Attr(name='id',
             selector=('content_link', 'id'),
             func='sel_text',
             source={'src_template': article_url, 'active': False}),
    ),
)

# Follow-up request for the next result window.
next_search = Template(
    name='next_result',
    attrs=(
        Attr(name='next_limit',
             selector=('results', 'next_offset'),
             func='sel_text',
             source={'src_template': next_page_url}),
    )
)

sources = (Source(url=search_url), )

volkskrant = ScrapeModel(
    name='volkskrant',
    domain='volkskrant',
    phases=[
        Phase(parser=JSONParser, n_workers=5, sources=sources,
              templates=(search_result, next_search)),
        Phase(parser=JSONParser, n_workers=5, sources=sources,
              templates=(article, )),
    ])
# Startpagina.nl scraper: sub-page links, then the websites listed on each
# sub-page.
from modelscraper.dispatcher import Dispatcher
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from pymongo import MongoClient
import string

start_url = Source(url='http://www.startpagina.nl/dochters/')

# Sub-page link (queued with a reference to its parent) plus its name.
sub_page_url = Attr(name='sub_page', func='sel_url',
                    source={'active': False, 'parent': True})
sub_page_name = Attr(name='pagename', func='sel_text')

category_temp = Template(name='sub_page', selector='.sections a',
                         db='startpagina', table='subpages',
                         db_type='mongo_db',
                         attrs=(sub_page_url, sub_page_name))

# Websites listed on a sub-page.
website_url = Attr(name='url', func='sel_url')
website_name = Attr(name='name', func='sel_text')
website_temp = Template(name='website', selector='#columns a',
                        db='startpagina', table='websites',
                        db_type='mongo_db',
                        attrs=(website_url, website_name))

# NOTE(review): fragment truncated in source — the phases list is never
# closed.
model = ScrapeModel(name='startpagina', domain='http://www.startpagina.nl',
                    phases=[
# Landmark Global shipment tracker: POST one tracking request per candidate
# tracking number and scrape the shipment details.
from modelscraper.components import ScrapeModel, Attr, Template, Phase, Source

# Raw form payload kept for reference; the sources below send the same
# fields as a list of tuples.
data = 'sid={}&options%5B%5D=display_full_history&options%5B%5D=use_cached_data_only&action=View+Complete+Tracking+History'
#data =

# NOTE(review): fragment truncated in source — the shipment attrs list is
# unfinished. Also the variable is named `metro` while the model is
# 'landmark' — probably a copy-paste leftover.
metro = ScrapeModel(
    name='landmark',
    domain='https://mercury.landmarkglobal.com/',
    num_get=2,
    phases=[
        Phase(sources=(Source(
            url=
            "https://mercury.landmarkglobal.com/tracking/track.php?trck=LTN{}N1&Submit=Track"
            .format(i),
            method='post',
            data=[('sid', str(i)), ('options[]', 'display_full_history'),
                  ('options[]', 'use_cached_data_only'),
                  ('action', 'View+Complete+Tracking+History')])
            for i in range(5000, 50000000)),
            templates=[
                Template(
                    name='shipment',
                    selector=None,
                    db='shipments',
                    db_type='MongoDB',
                    table='shipment',
                    attrs=[
                        Attr(
                            name='carrier',
                            selector=
                            '#large_shipment_info_box > div:nth-child(2) > div:nth-child(1)',
                            func='sel_text',
# Autoscout24 occasions scraper: a single search-results entry point feeding
# the shared autoscout template.
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from scrape_models.objects import occassions

# Pre-built search query: all states (N/U), NL, 20 results per page.
_SEARCH_URL = ('http://ww4.autoscout24.nl/?atype=B&mmvco=0&cy=NL'
               '&ustate=N%2CU&fromhome=1&intcidm=HP-Searchmask-Button'
               '&dtr=s&results=20')

sources = [Source(url=_SEARCH_URL)]

autoscout = ScrapeModel(
    name='autoscout24',
    domain='autoscout24.nl',
    phases=[
        Phase(sources=sources,
              templates=[occassions.autoscout_template]),
    ])
# AD.nl mobile-API search scraper (`words` is expected to be defined in the
# surrounding file).
from modelscraper.dispatcher import Dispatcher
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from pymongo import MongoClient

search_url = 'https://mobileapi.ad.nl/mobile/lists/search'
# One search request per query word.
sources = (Source(url=search_url, params={'query': word})
           for word in words)

# Article page field extractors.
title_attr = Attr(name='title', selector='h1', func='sel_text')
text_attr = Attr(name='text', selector='p', func='sel_text')
date_attr = Attr(name='date', selector='.published span.small',
                 func='sel_text')
author_attr = Attr(name='author', selector='span.author', func='sel_text')
tags_attr = Attr(name='tags', selector='.article.tags a span',
                 func='sel_text')

# NOTE(review): stores into the 'nu_nl' database — looks copied from the
# nu.nl model; verify the intended target.
article = Template(
    name='article',
    selector='.column-content-background',
    db='nu_nl',
    db_type='mongo_db',
    table='articles',
    attrs=(title_attr, text_attr, date_attr, author_attr, tags_attr))
# 2wheelpros OEM parts scraper: brand -> year -> model -> part category.
from modelscraper.dispatcher import Dispatcher
from modelscraper.sources import WebSource
from modelscraper.parsers import HTMLParser
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source

# NOTE(review): fragment truncated in source — the partCategory template is
# unfinished.
motorparts = ScrapeModel(
    name='motorparts',
    domain='http://www.2wheelpros.com',
    num_sources=2,
    phases=[
        # Phase 1: brand links from the main navigation.
        Phase(source_worker=WebSource, parser=HTMLParser,
              sources=(Source(url='http://www.2wheelpros.com/oem-parts/'), ),
              templates=(
                  Template(name='brand',
                           selector='#nav > ul > li:nth-of-type(1) > a',
                           attrs=(
                               Attr(name='url', func='sel_url',
                                    source={'active': False}),
                           ), ), )
              ),
        # Phase 2: model years per brand.
        Phase(source_worker=WebSource, parser=HTMLParser,
              templates=(
                  Template(name='year', selector='.yearlink',
                           attrs=(
                               Attr(name='url', func='sel_url',
                                    source={'active': False}), )), ),
              ),
        # Phase 3: models per year.
        Phase(source_worker=WebSource, parser=HTMLParser,
              templates=(
                  Template(name='model', selector='.modellink',
                           attrs=(
                               Attr(name='url', func='sel_url',
                                    source={'active': False}),
                           )
                           ),
              ),
              ),
        # Phase 4: part categories per model.
        Phase(source_worker=WebSource, parser=HTMLParser,
              templates=(
                  Template(name='partCategory',
                           db='motorparts',
                           db_type='MongoDB',
                           table='part_categories',
                           source={'active': False,
# NVD CVE feed scraper: per-year JSON feeds into the defcon MongoDB.
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.sources import ProgramSource, WebSource
from modelscraper.parsers import JSONParser, TextParser
import datetime

# Today's date as YYYYMMDD.
now = str(datetime.datetime.now()).replace('-', '')[:8]

JSON_URL = 'https://static.nvd.nist.gov/feeds/json/cve/1.0/nvdcve-1.0-{}.json.zip'
# NOTE(review): the official NVD URL above is immediately overridden with a
# local mirror — drop or swap these lines to hit the real feed.
JSON_URL = 'http://0.0.0.0:8000/nvdcve-1.0-{}.json'

# NOTE(review): range() excludes the stop value, so the current year's feed
# is never fetched — confirm whether that is intended.
years = range(2002, datetime.datetime.now().year)
# years = [2002]

# One source per yearly feed; items live under the 'CVE_Items' key.
cve_source = (Source(url=JSON_URL.format(year), compression='',
                     json_key='CVE_Items')
              for year in years)

# Feed metadata: the lastModifiedDate line.
meta_template = Template(
    name='meta',
    db='defcon',
    table='cve_meta',
    db_type='MongoDB',
    attrs=[
        Attr(name='last_modified', func='sel_text',
             kws={'regex': 'lastModifiedDate:(.*)'},
             source=cve_source)])

# NOTE(review): fragment truncated in source — the attrs list is
# unfinished; this template is also named 'meta', same as meta_template.
cve_template = Template(
    name='meta',
    db='defcon',
    table='cve',
    db_type='MongoDB',
    #func='update',
    #kws={'key': 'id'},
    attrs=[
        Attr(name='id', func='sel_text',
             selector=['cve', 'CVE_data_meta', 'ID']),
        Attr(name='cpes', func='sel_text',
             selector=['configurations', 'nodes', 'cpe', 'cpe23Uri']),
        Attr(name='affects', func='sel_dict',
             selector=['cve', 'affects']),
# DWDD (De Wereld Draait Door) episode scraper helpers.
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.parsers import TextParser
from modelscraper.dispatcher import Dispatcher
from pymongo import MongoClient

cl = MongoClient()
# Episode pages collected by an earlier run into dwdd.episode_urls.
sources = (Source(url=a['url'][0]) for a in cl.dwdd.episode_urls.find())

# A-Z program listing (only page 0 with the current range).
programs_az = Phase(
    sources=[
        Source(url="http://www.npo.nl/programmas/a-z", params={'page': i})
        for i in range(0, 1)
    ],
    templates=(
        Template(
            name='program',
            selector='.content-column.quarter',
            db_type='mongo_db',
            db='npo_tv_programs',
            table='programs',
            attrs=(
                Attr(name='title', selector='h3', func='sel_text'),
                Attr(name='url',
                     selector='a.full-link',
                     func='sel_url',
                     source=Source(active=False)),  # source is for next run
            )),
    ))

nos_search = 'https://www.npo.nl/de-wereld-draait-door/VARA_101377717/search?media_type=broadcast&start_date=&end_date=&start={}&rows=100'

# NOTE(review): fragment truncated in source — this Phase call is
# unfinished (and `start` is not defined in this chunk).
episodes_phase = Phase(n_workers=5,
                       sources=(Source(url=nos_search.format(start))
# nu.nl article-list scraper: pull headline lists per section via the AJAX
# article-list endpoint.
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source

# FIX(review): the query string contained the mojibake '§ion=' — '&sect'
# of '&section=' had been decoded as the HTML entity '&sect;' (§).
# Restored the real parameter name so the endpoint receives the section.
base_url = "http://www.nu.nl/block/html/articlelist?footer=ajax&section={section}&limit=20&offset={offset}&show_tabs=0"

sections = [
    'buitenland', 'binnenland', 'economie', 'algemeen', 'tech', 'sport'
]

# One source per (section, offset) window, 20 items per page; the section
# rides along as a 'category' attribute and is copied onto derived sources.
sources = (Source(url=base_url.format(section=section, offset=offset),
                  copy_attrs=['category'],
                  attrs=[Attr(name='category', value=[section])])
           for section in sections for offset in range(0, 200000, 20))

# A list item: article URL (queued for the article phase, carrying its
# category along), title and excerpt.
headline = Template(name='headline',
                    selector='li',
                    db='nu_nl',
                    db_type='MongoDB',
                    table='article_urls',
                    attrs=[
                        Attr(name='url',
                             selector='a',
                             func='sel_url',
                             source={
                                 'active': False,
                                 'copy_attrs': 'category'
                             }),
                        Attr(name='title', selector='.title',
                             func='sel_text'),
                        Attr(name='excerpt', selector='.excerpt',
                             func='sel_text')
                    ])

# Article page title extractor (remaining *_attr definitions follow past
# this chunk).
title_attr = Attr(name='title', selector='h1', func='sel_text')
# Fetch tt888 subtitle files for every stored NOS Journaal episode and merge
# the subtitle text back into the episode documents (update keyed on 'url').
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.parsers import TextParser
from modelscraper.dispatcher import Dispatcher
from pymongo import MongoClient

cl = MongoClient()
# Build one tt888 URL per episode from the last path segment of the stored
# episode URL; the original URL rides along so the update can match on it.
urls = (
    Source(
        url='https://tt888.omroep.nl/tt888/' +
            episode['url'][0].split('/')[-1],
        attrs=(Attr(name='url', value=episode['url']), ))
    for episode in cl.nos_journaal.episodes.find())

# The whole response body becomes the 'subtitles' field of the matching
# episode document.
subtitle_template = Template(
    name='subtitle',
    db_type='mongo_db',
    db='nos_journaal',
    table='episodes',
    func='update',
    kws={'key': 'url'},
    attrs=(Attr(name='subtitles', func='sel_text'), ))

subtitles = ScrapeModel(
    name='subtitles',
    domain='https://tt888.omroep.nl/',
    phases=[
        Phase(n_workers=5, sources=urls, parser=TextParser,
              templates=(subtitle_template, ))
    ])

del cl

d = Dispatcher()
d.add_scraper(subtitles)
d.run()
# Run nmap (XML output) against a host and store each port's number, state
# and service in MongoDB.
# FIX(review): this chunk used ScrapeModel/Phase/Template/Attr/Source and
# Dispatcher without importing them (only ProgramSource was imported),
# which raises NameError at import time — added the missing imports,
# matching the sibling models in this project.
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.dispatcher import Dispatcher
from modelscraper.sources import ProgramSource

# One record per <port> element of nmap's XML output.
port_template = Template(
    name='ports',
    selector='port',
    db_type='mongo_db',
    db='ports',
    table='ports',
    attrs=(Attr(name='portnumber', func='sel_attr', kws={'attr': 'portid'}),
           Attr(name='state', selector='state', func='sel_attr',
                kws={'attr': 'state'}),
           Attr(name='service', selector='service', func='sel_attr',
                kws={'attr': 'name'})))

# The 'url' here is the shell command ProgramSource executes; '-oX -' makes
# nmap write XML to stdout for the parser.
nmap = ScrapeModel(
    name='nmap_test',
    domain='',
    phases=[
        Phase(sources=(Source(url='nmap -oX - duwo.multiposs.nl'), ),
              templates=[port_template],
              source_worker=ProgramSource)
    ])

disp = Dispatcher()
disp.add_scraper(nmap)
disp.run()
# Makro product scraper (second pass): re-fetch stored product URLs and
# walk the categories with a larger page size.
from modelscraper.dispatcher import Dispatcher
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from pymongo import MongoClient

cl = MongoClient()
# Product URLs collected by an earlier run.
product_sources = cl.makro.product_urls.find()
sources = [Source(url=p['url'][0]) for p in product_sources]

# Category links; the pageSize query parameter is rewritten to 96 so the
# follow-up pages are fetched with the larger page size.
categories = Phase(
    sources=(Source(url="https://www.makro.nl/cat/nl/products"), ),
    templates=(Template(
        name='product_category',
        selector='#left-navigation-container ul.vertical > li > a',
        db_type='mongo_db',
        db='makro',
        table='product_categories',
        attrs=[
            Attr(name='url',
                 func='sel_url',
                 source={'active': False},
                 kws={
                     'replacers': 'pageSize=(\d+)',
                     'substitute': 'pageSize=96'
                 }),
        ]), ))

# NOTE(review): fragment truncated in source — the product_urls template is
# unfinished.
product_lists = Phase(templates=[
    Template(name='product_urls',
             selector='.product-list .product-tiles',
             db_type='mongo_db',
             db='makro',
# Download "The Office" episodes via youtube-dl driven through a
# ShellCommand store.
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source

filepath = '/mnt/Movies/theoffice/'
# Shell command templates; {season}/{episode}/{url} are filled in by the
# ShellCommand store from the scraped attrs.
create_dir = 'sudo mkdir -p ' + filepath + '/{season}/'
youtube_dl = 'sudo youtube-dl -o ' + filepath + '{season}/{episode} {url}'

# NOTE(review): '{02d}' is not a valid format placeholder ('{:02d}' was
# probably intended); as written, .format(season, episode) will raise —
# confirm and fix where this truncated chunk is completed.
extended_url = 'http://watchtheoffice.online/the-office-s{02d}e{02d}-extended/'

# NOTE(review): fragment truncated in source — the attrs tuple is
# unfinished.
theoffice = ScrapeModel(
    name='theoffice',
    domain='http://watchtheofficeonline.com',
    num_getters=2,
    phases=[
        Phase(sources=[
            Source(url=extended_url.format(season, episode))
            for season in range(1, 10) for episode in range(1, 30)
        ],
            templates=(Template(
                name='episode',
                selector='#Rapidvideo',
                db_type='ShellCommand',
                db='theoffice',
                table='season',
                kws={'command': create_dir + ' & ' + youtube_dl},
                attrs=(
                    Attr(name='url',
                         selector='a',
                         func=['sel_url', 'sel_text'],
                         kws=[{}, {
                             'needle': r'.*(s\d+e\d+)'
                         }]),
                    Attr(name='episode',