# --- Fragment: company-directory scraper. Truncated chunk: the trailing
# --- `company = Template(...` call continues beyond this chunk, so the
# --- code below is left byte-identical.
from modelscraper.dispatcher import Dispatcher from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source from pymongo import MongoClient from modelscraper.workers import WebSource from modelscraper.parsers import HTMLParser category_menu = Template(name='category_menu', selector='li.dropdown:nth-child(2)', attrs=[ Attr(name='category', selector='a', source={'active': False}, func='sel_url') ]) name = Attr(name='name', selector='h1.box-title', func='sel_text') street = Attr(name='street', selector='.street-address', func='sel_text') postal = Attr(name='postal', selector='.postal-code', func='sel_text') city = Attr(name='city', selector='.locality', func='sel_text') telephone = Attr(name='telephone', selector='.tel', func='sel_text') website = Attr(name='website', selector='.url a', func='sel_url') mail = Attr(name='email', selector='.mail a', func='sel_url') kvk = Attr(name='kvk', selector='.kvk a', func='sel_text') description = Attr(name='description', selector='div[itemprop="description"] > p', func='sel_text') branches = Attr(name='branches', selector='.omschrijving a', func='sel_text') company = Template(name='company', selector=None, db_type='mongo_db',
# --- Fragment: defcon networking probes (.git/config and .DS_STORE
# --- exposure). Truncated: `git_template = Template(...` continues beyond
# --- this chunk.
# NOTE(review): `companies2` is a single pymongo cursor consumed by BOTH
# `git_sources` and `ds_store_sources`; whichever generator runs second will
# find the cursor exhausted — presumably each needs its own find(). Verify.
from modelscraper.parsers import TextParser, CSVParser from .objects.networking import ip_phase, ip_template, port_phase from pymongo import MongoClient no_ip = MongoClient().defcon.companies.find({ 'website': { '$ne': None }, 'ip': { '$exists': False } }) companies2 = MongoClient().defcon.companies.find({'website': {'$ne': None}}) defcon_base = Template(db='defcon', db_type='MongoDB') jacco_base = 'jackling.nl' jacco = (Source(url='jackling.nl'), ) jacco_git = (Source(url='http://{}/.git/config'.format(jacco_base)), ) jacco_ds = (Source(url='http://{}/.DS_STORE'.format(jacco_base)), ) git_sources = (Source(url='http://{}/.git/config'.format(c['website']), attrs=[Attr(name='kvk', value=c['id'])], copy_attrs='kvk') for c in companies2) ds_store_sources = (Source(url='http://{}/.DS_STORE'.format(c['website']), attrs=[Attr(name='kvk', value=c['id'])], copy_attrs='kvk') for c in companies2) git_template = Template(name='Git exposed', db_type='MongoDB',
# --- Fragment: nos.nl article scraper. Truncated mid-Phase; `nos_sources`
# --- is defined outside this chunk — cannot tell its shape from here.
title_attr = Attr(name='title', selector='h1', func='sel_text') text_attr = Attr(name='text', selector='.article_body', func='sel_text') date_attr = Attr(name='date', selector='time:nth-of-type(1)', func='sel_attr', kws={'attr': 'datetime'}) author_attr = Attr(name='author', selector='span[itemprop="author"]', func='sel_text') tags_attr = Attr(name='tags', selector='.ib.space-right a.link-grey', func='sel_text') article = Template(name='article', attrs=(title_attr, text_attr, date_attr, author_attr, tags_attr)) Phase(source_worker=WebSource, parser=HTMLParser, sources=nos_sources, templates=[ Template(name='article_url', selector='#archief li', db_type='mongo_db', db='nos_nl', table='article_urls', attrs=[ Attr(name='url', selector='a', func='sel_attr', kws={'attr': 'href'},
# --- Fragment: uefa.com EURO 2016 squad scraper. Truncated mid-Attr in the
# --- player template. NOTE(review): model name is spelled 'eufa' — possibly
# --- intentional, but looks like a typo for 'uefa'.
from modelscraper.workers import WebSource from modelscraper.parsers import HTMLParser uefa = ScrapeModel( name='eufa', domain='http://uefa.com', num_getters=2, phases=[ Phase(sources=(Source( url="http://www.uefa.com/uefaeuro/season=2016/teams/index.html"), ), templates=[ Template(name='team', selector='.teams--qualified', attrs=[ Attr(name='url', selector='a', func='sel_url', source={'active': False}), ]) ]), Phase(templates=[ Template(name='player', selector='.squad--team-player', db_type='MongoDB', db='uefa', table='players', attrs=[ Attr(name='name', selector='.squad--player-name', func='sel_text'), Attr(name='player_url',
# Scraper for luckytv.nl episode listings.
# NOTE(review): `db` (and the modelscraper names) come from earlier in the
# original file, outside this chunk.
col = db.episode

# One source per paginated listing page, pages 1..49.
sources = (
    Source(url="http://www.luckytv.nl/afleveringen/page/{}/".format(i))
    for i in range(1, 50)
)

LuckyTV = ScrapeModel(
    name='Lucky TV',
    domain='http://www.luckytv.nl/',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=sources,
            templates=(
                # One record per video card on a listing page.
                Template(
                    name='episode',
                    selector='article.video',
                    db_type='mongo_db',
                    db='lucky_tv',
                    table='episodes',
                    attrs=(
                        Attr(name='url', selector='a:nth-of-type(1)',
                             func='sel_url'),
                        Attr(name='title', selector='.video__title',
                             func='sel_text'),
                        Attr(name='date', selector='.video__date',
                             func='sel_text'),
                    )),
            )),
    ])
# --- Fragment: metronieuws.nl scraper. Truncated: the headline template's
# --- attrs list is unclosed; `binnenland`/`buitenland` are defined outside
# --- this chunk.
sources = [*binnenland, *buitenland] title_attr = Attr(name='title', selector='h1', func='sel_text') text_attr = Attr(name='text', selector='.article_body', func='sel_text') date_attr = Attr(name='date', selector='time:nth-of-type(1)', func='sel_attr', kws={'attr': 'datetime'}) author_attr = Attr(name='author', selector='span[itemprop="author"]', func='sel_text') tags_attr = Attr(name='tags', selector='.tag', func='sel_text') article = Template( name='article', attrs=( title_attr, text_attr, date_attr, author_attr, tags_attr ) ) headline_phase = Phase( sources=sources, n_workers=5, templates=[ Template( name='headline', selector='.row', db='metronieuws', db_type='MongoDB', kws={'key':'url'}, table='article_urls', attrs=[ Attr(name='url', selector='a.shadow-block', func='sel_attr', kws={'attr': 'href'}, source=Source(active=False, copy_attrs='category')), Attr(name='title', selector='h3', func='sel_text'), Attr(name='excerpt', selector='div > p', func='sel_text'),
# --- Fragment: web-shop menu scraper (store-selection cookie/state dict at
# --- the top is itself truncated at the start of this chunk). Truncated at
# --- the end mid-template.
# NOTE(review): selector 'li.is-nexy > a' looks like a typo for
# 'li.is-next > a' — verify against the site markup.
# "SelectedStore":{"StoreId":27,"StoreReferenceKey":384}, "SelectedStore": { "StoreId": 207, "StoreReferenceKey": 493 }, "HasSelectedStore": True, "AcceptedCookies": None, "LastViewedProducts": None } } menu_template = Template(name='menu', attrs=[ Attr(name='menu_item', selector='.c-category-tile__item', func='sel_url', source={ 'active': False, 'src_template': '{}?ppp=72' }) ]) productmenu_template = Template(name='submenu', selector='.c-product-tile', attrs=[ Attr(name='submenu_item', selector='.c-product-tile__meta > a', func='sel_url', source={'active': False}), Attr(name='pagination_item', selector='li.is-nexy > a', source=True)
# --- Fragment: telegraaf.nl archive scraper. Truncated mid-template
# --- (`article_url = Template(` is unclosed).
from modelscraper.parsers import HTMLParser from pymongo import MongoClient cl = MongoClient() cookie = {'nl_cookiewall_version': '1'} telegraaf_url = 'http://www.telegraaf.nl/jsp/search_result_page.jsp?method=&keyword=de&pagenr={}' telegraaf_search = (Source(url=telegraaf_url.format(i)) for i in range(1, 5001)) calendar = Template( name='archive_url', selector='', attrs=( Attr(name='url', selector='td a', func='sel_url', source=Source(active=False)), # source is for next run )) year = Template( name='archive_url_year', selector='.year-list__item', attrs=( Attr(name='url', selector='a', func='sel_url', source=True), # source is for next run )) article_url = Template( name='article_url',
# --- Fragment: volkskrant mobile-backend scraper. Truncated mid-template.
# NOTE(review): `type_attr` reuses name='excerpt' (same as `intro_attr`);
# it presumably should be name='type' — one of the two will clobber the
# other in the stored record. Verify.
author_attr = Attr(name='author', selector='written_by', func='sel_text') tags_attr = Attr(name='tags', selector=('tags', 'name'), func='sel_text') category_attr = Attr(name='category', selector=('section', 'name'), func='sel_text') counters_attr = Attr(name='counters', selector='counters', func='sel_text') intro_attr = Attr(name='excerpt', selector='intro', func='sel_text') type_attr = Attr(name='excerpt', selector='type', func='sel_text') article = Template( name='article', db='volkskrant', table='articles', db_type='mongo_db', attrs=( title_attr, text_attr, date_attr, author_attr, tags_attr, category_attr, counters_attr, intro_attr, type_attr ) ) article_url = 'http://vkplusmobilebackend.persgroep.net/rest/content/articles/{}' search_url = 'http://vkplusmobilebackend.persgroep.net/rest/content/articles?query=&metadataNeeded=true&limit=10000' next_page_url = 'http://vkplusmobilebackend.persgroep.net/rest/content/articles?query=&metadataNeeded=true&limit=10000&offset={}' search_result = Template( name='search_result', db='volkskrant', table='article_urls', func='create',
# --- Fragment: rtlnieuws.nl scraper. Truncated mid-phase.
# NOTE(review): the `rtl` model's phases reference parool.nl sources and
# `calendar`/`year`/`article_url` templates with db='parool' — this looks
# copy-pasted from the parool scraper; verify intent.
search_url = 'https://www.rtlnieuws.nl/search/nieuws/{}' search_terms = ['economie', 'nederland'] title_attr = Attr(name='title', selector='h1', func='sel_text') text_attr = Attr(name='text', selector='p', func='sel_text') date_attr = Attr(name='date', selector='time', func='sel_text') author_attr = Attr(name='author', selector='span[itemprop="author"]', func='sel_text') tags_attr = Attr(name='tags', selector='.tag-list a.cta', func='sel_text') article = Template( name='article', selector='.col__inner', attrs=( title_attr, text_attr, date_attr, author_attr, tags_attr ) ) rtl= ScrapeModel( name='rtl', domain='http://www.rtlnieuws.nl/', num_getters=1, phases=[ Phase(sources=[ Source(url="http://www.parool.nl/archief/2012")], templates=(calendar, year) ), Phase(templates=( article_url(db_type='mongo_db', db='parool', table='article_urls'),
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.workers import WebSource
from modelscraper.parsers import HTMLParser

# Base attribute definitions; the site-specific templates below specialize
# them by calling them with selector/value overrides.
title = Attr(name='title', func='sel_text')
artist = Attr(name='artist', func='sel_text')
midi_url = Attr(name='midi_url')

# Generic song record stored in the 'midi' Mongo database.
song = Template(name='song', db_type='MongoDB', db='midi', table='songs',
                attrs=[title, artist, midi_url])

# freemidi.org specialization: same song template, site-specific selectors.
freemidi_template = song(
    table='freemidi',
    selector='#mainContent div.col-xs-12:nth-of-type(1)',
    attrs=[
        title(selector='li.active:nth-child(3) > a:nth-child(1) > span:nth-child(1)'),
        artist(selector='ol.breadcrumb:nth-child(1) > li:nth-child(2) > a:nth-child(1) > span:nth-child(1)'),
    ])

# One source per download page; the matching "getter" URL rides along as
# the midi_url attribute value.
freemidi_sources = (
    Source(url='https://freemidi.org/download-{}'.format(i),
           attrs=[midi_url(value='https://freemidi.org/getter-{}'.format(i))])
    for i in range(25803))
# --- Fragment: modelscraper self-test definitions. Truncated mid-Attr in
# --- `html_functions`; `json_test`, `html`, `html2` are defined outside
# --- this chunk.
#Databases test_db = Sqlite(db='test') test_mongo = MongoDB(db='test') #Parsers jsonp = JSONParser() textp = TextParser() csvp = CSVParser() htmlp = HTMLParser() json_nested = Template( source=json_test, database=[test_db, test_mongo], table='tst', name='json_nested', selector=[jsonp.select('html'), htmlp.select('.content')], attrs=[ Attr(name='url', func=htmlp.text(selector='h1', template='partialtest {}')) ]) html_functions = Template(source=html, name='html_functions', database=test_mongo, table='html_test', dated=True, emits=html2, selector=htmlp.select('html'), attrs=[ Attr(name='table',
# --- Fragment: shared vehicle attrs + autoscout template. Truncated
# --- mid-attrs; `vehicle_type`, `price`, `brand` are defined outside this
# --- chunk.
# NOTE(review): `mileage` passes `kw={'numbers': True}` while its siblings
# use `kws=` — almost certainly a typo that silently drops the option.
# Also NOTE(review): the regex 'var articlesFromServer = (.+)\|\|' is a
# non-raw string; prefer r'...' for regex literals.
make = Attr(name='make', func='sel_text') year = Attr(name='year', func='sel_text', kws={'numbers': True}, type=int) mileage = Attr(name='mileage', func='sel_text', kw={'numbers': True}, type=int) city = Attr(name='city', func='sel_text') url = Attr(name='url', func='sel_url') zipcode = Attr(name='zip', func='sel_text') power = Attr(name='power', func='sel_text') vehicle = Template( name='vehicle', db_type='MongoDB', db='vehicles', attrs=[ vehicle_type, price, brand, make, year, mileage, city, url, zipcode, power ] ) autoscout_template = vehicle( table='autoscout', regex='var articlesFromServer = (.+)\|\|', attrs=[ vehicle_type, price(selector='price_raw'), brand(selector='mk'), make(selector='md'),
from modelscraper.dispatcher import Dispatcher
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from pymongo import MongoClient
import string

# Entry point of the crawl.
start_url = Source(url='https://www.jasminedirectory.com/')

# Shared attributes: each sub-page link is queued as a new, not-yet-active
# source that remembers its parent.
sub_page_url = Attr(name='sub_page', func='sel_url',
                    source={'active': False, 'parent': True})
sub_page_name = Attr(name='pagename', func='sel_text')

# Top-level category listing, stored in the 'jasminedirectory' Mongo db.
category_temp = Template(name='sub_category',
                         selector='li strong a:nth-of-type(1)',
                         db='jasminedirectory', table='maincats',
                         db_type='mongo_db',
                         attrs=(sub_page_url, sub_page_name))

# Second-level category listing — same selector, different table.
category_temp2 = Template(name='sub_page',
                          selector='li strong a:nth-of-type(1)',
                          db='jasminedirectory', table='subcats',
                          db_type='mongo_db',
                          attrs=(sub_page_url, sub_page_name))

# Attributes for the listed websites themselves (used further down the
# original file, outside this chunk).
website_url = Attr(name='url', func='sel_url')
website_name = Attr(name='name', func='sel_text')
# --- Fragment: npo.nl program scraper — interior of a ScrapeModel(...) call
# --- whose opening starts before this chunk and whose second phase is
# --- truncated at the end. Left byte-identical.
name='npo_tv_programs', domain='http://npo.nl', num_getters=2, phases=[ Phase( source_worker=WebSource, parser=HTMLParser, sources=[Source(url=series_url.format(i)) for i in range(0, 242)], templates=( Template( name='program', selector='.content-column.quarter', db_type='mongo_db', db='npo_tv_programs', table='programs', attrs=( Attr(name='title', selector='h3', func='sel_text'), Attr(name='url', selector='a.full-link', func='sel_url', source=Source( active=False)), # source is for next run )), Template(name='next_url'), )), Phase( source_worker=WebSource, parser=HTMLParser, templates=( Template( name='episodes', selector='.item-list.item-container div.item',
# --- Fragment: shipment-tracking scraper — interior of a sources/templates
# --- call; both ends of this chunk are truncated.
# NOTE(review): 'Carrier:\s(\w+)' is a non-raw string regex; prefer
# r'Carrier:\s(\w+)' to avoid invalid-escape warnings.
('options[]', 'use_cached_data_only'), ('action', 'View+Complete+Tracking+History')]) for i in range(5000, 50000000)), templates=[ Template( name='shipment', selector=None, db='shipments', db_type='MongoDB', table='shipment', attrs=[ Attr( name='carrier', selector= '#large_shipment_info_box > div:nth-child(2) > div:nth-child(1)', func='sel_text', kws={'regex': 'Carrier:\s(\w+)'}), Attr( name='shipped_to', selector= '#large_shipment_info_box > div:nth-child(2) > div:nth-child(2) div:nth-child(1) .align_left', func='sel_text'), Attr( name='shipped_from', selector= '#large_shipment_info_box > div:nth-child(2) > div:nth-child(2) div:nth-child(2) .align_left', func='sel_text'), ]), Template(name='event', selector='table tr:not(:nth-child(1))', db_type='MongoDB', db='shipments',
# --- Fragment: tweakers.net article scraper. Truncated mid-call
# --- (`tweakers = article(...` is unclosed).
# NOTE(review): `tags = Attr(name='author', ...)` duplicates the author
# attribute's name; it almost certainly should be name='tags' — as written
# the article template stores two attrs both named 'author'.
from modelscraper.components import ScrapeModel, Template, Attr from modelscraper.sources import WebSource text = Attr(name='text', func='sel_html') title = Attr(name='title', func='sel_text') pictures = Attr(name='pictures', func='sel_attr', selector='img', kws={'attr': 'src'}) date = Attr(name='date', func='sel_text') related = Attr(name='related', func='sel_url') author = Attr(name='author', func='sel_text') tags = Attr(name='author', func='sel_text') article = Template(name='article', attrs=(text, title, date, author, tags, pictures, related), db='news', db_type='MongoDB') article_url = Attr(name='url', func='sel_url') tweakers_article_source = WebSource() tweakers_list = Template( selector='', attrs=[article_url(selector='', emits=tweakers_article_source)]) tweakers = article(source=tweakers_article_source, table='tweakers.net', selector='#contentArea', attrs=( text(selector='.article p'), title(selector='h1'),
# --- Fragment: autotrader scraper. Truncated mid-attrs.
# NOTE(review): `brand`, `make`, `price`, etc. are used as bare names but
# only `import vehicle` is visible — presumably a `from vehicle import *`
# (or similar) exists outside this chunk. Regex literals here are non-raw
# strings; prefer r'...'.
import re from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source from modelscraper.parsers import JSONParser import vehicle city = Attr(name='city', func='sel_text') zipcode = Attr(name='zipcode', func='sel_text') occasion = Template( name='occasion', db_type='mongo_db', attrs=[*vehicle.attrs, city, zipcode, ] ) autotrader_template = vehicle( table='autotrader', selector='.result', attrs=[ brand(selector='h2', kws={'regex': '(^\w+)'}), make(selector='h2', kws={'regex': '^\w+ (.*)'}), price(selector='.result-price-label'), year(selector='.col-left',kws={'regex': '\w{3} (\d{4})'}), mileage(selector='.col-left', kws={'regex': '(.*) km'}), url(selector='a.tracker'), #Attr(name='dealer_name', selector='.dealer-info div', func=sel_text), city, zipcode, power,
from modelscraper.parsers import TextParser
from modelscraper.dispatcher import Dispatcher
from pymongo import MongoClient

cl = MongoClient()

# One subtitle source per stored dwdd episode: the tt888 endpoint takes the
# last path component of the episode URL; the full URL rides along as an
# attribute so the update below can key on it.
# NOTE(review): Source/ScrapeModel/etc. are imported earlier in the
# original file, outside this chunk.
urls = (
    Source(url='https://tt888.omroep.nl/tt888/' + a['url'][0].split('/')[-1],
           attrs=(Attr(name='url', value=a['url']), ))
    for a in cl.dwdd.episodes.find())

subtitles = ScrapeModel(
    name='subtitles',
    domain='https://tt888.omroep.nl/',
    phases=[
        Phase(n_workers=5,
              sources=urls,
              parser=TextParser,
              templates=(
                  # Update existing episode docs in place, keyed on url.
                  Template(name='subtitle',
                           db_type='MongoDB',
                           db='dwdd',
                           table='episodes',
                           func='update',
                           kws={'key': 'url'},
                           attrs=(Attr(name='subtitles', func='sel_text'), )),
              ))
    ])

del cl

d = Dispatcher()
d.add_scraper(subtitles)
d.run()
# --- Fragment: networking helpers (host/nmap). Truncated mid-Attr in
# --- `port_template`.
from modelscraper.components import Phase, Template, Attr from modelscraper.sources import BaseSourceWorker, ProgramSource from modelscraper.parsers import TextParser import dns.resolver import dns.query import dns.zone ip_template = Template(name='ip', db_type='MongoDB', db='', table='', parser=TextParser, attrs=(Attr( name='ip', func='sel_text', kws={'regex': '(\d+\.\d+\.\d+\.\d+)'}, ), )) ip_phase = Phase(n_workers=10, templates=[ip_template], source_worker=ProgramSource(function='host {}')) port_template = Template(name='ports', selector='port', db_type='MongoDB', db='monog', table='ports', attrs=(Attr(name='portnumber', func='sel_attr', kws={'attr': 'portid'}), Attr(name='state',
# --- Fragment: startpagina.nl scraper. Truncated at `phases=[`.
from modelscraper.dispatcher import Dispatcher from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source from pymongo import MongoClient import string start_url = Source(url='http://www.startpagina.nl/dochters/') sub_page_url = Attr(name='sub_page', func='sel_url', source={'active': False, 'parent': True}) sub_page_name = Attr(name='pagename', func='sel_text') category_temp = Template(name='sub_page', selector='.sections a', db='startpagina', table='subpages', db_type='mongo_db', attrs=(sub_page_url, sub_page_name)) website_url = Attr(name='url', func='sel_url') website_name = Attr(name='name', func='sel_text') website_temp = Template(name='website', selector='#columns a', db='startpagina', table='websites', db_type='mongo_db', attrs=(website_url, website_name)) model = ScrapeModel(name='startpagina', domain='http://www.startpagina.nl', phases=[
# --- Fragment: nu.nl / ad.nl scraper. NOTE(review): this chunk looks
# --- spliced: `next_page`'s attrs tuple is never closed before
# --- `ad = ScrapeModel(` begins — the intervening text is missing.
title_attr = Attr(name='title', selector='h1', func='sel_text') text_attr = Attr(name='text', selector='p', func='sel_text') date_attr = Attr(name='date', selector='.published span.small', func='sel_text') author_attr = Attr(name='author', selector='span.author', func='sel_text') tags_attr = Attr(name='tags', selector='.article.tags a span', func='sel_text') article = Template( name='article', selector='.column-content-background', db='nu_nl', db_type='mongo_db', table='articles', attrs=( title_attr, text_attr, date_attr, author_attr, tags_attr ) ) next_page = Template( name='next_page', selector='paging', attrs=( Attr(name='paging', ad = ScrapeModel( name='ad.nl', domain='mobileapi.ad.nl', phases=[
# --- Fragment: 2wheelpros.com OEM-parts scraper. Truncated mid-template in
# --- the last phase.
from modelscraper.dispatcher import Dispatcher from modelscraper.sources import WebSource from modelscraper.parsers import HTMLParser from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source motorparts = ScrapeModel( name='motorparts', domain='http://www.2wheelpros.com', num_sources=2, phases=[ Phase(source_worker=WebSource, parser=HTMLParser, sources=(Source(url='http://www.2wheelpros.com/oem-parts/'),), templates=( Template(name='brand', selector='#nav > ul > li:nth-of-type(1) > a', attrs=( Attr(name='url', func='sel_url', source={'active': False}), ),),) ), Phase(source_worker=WebSource, parser=HTMLParser, templates=( Template(name='year', selector='.yearlink', attrs=( Attr(name='url', func='sel_url', source={'active': False}),)),), ), Phase(source_worker=WebSource, parser=HTMLParser, templates=( Template(name='model', selector='.modellink', attrs=( Attr(name='url', func='sel_url', source={'active': False}), ) ), ), ), Phase(source_worker=WebSource, parser=HTMLParser, templates=( Template(name='partCategory', db='motorparts', db_type='MongoDB', table='part_categories', source={'active':False,
# --- Fragment: NVD CVE feed scraper. Truncated mid-Attr.
# NOTE(review): JSON_URL is assigned the official nvd.nist.gov feed and
# then immediately overwritten with a localhost:8000 URL — looks like
# leftover local-debug state. Also `cve_template` is name='meta', same as
# `meta_template`; presumably it should be name='cve'. Verify both.
from modelscraper.parsers import JSONParser, TextParser import datetime now = str(datetime.datetime.now()).replace('-', '')[:8] JSON_URL = 'https://static.nvd.nist.gov/feeds/json/cve/1.0/nvdcve-1.0-{}.json.zip' JSON_URL = 'http://0.0.0.0:8000/nvdcve-1.0-{}.json' years = range(2002, datetime.datetime.now().year) # years = [2002] cve_source = (Source(url=JSON_URL.format(year), compression='', json_key='CVE_Items') for year in years) meta_template = Template( name='meta', db='defcon', table='cve_meta', db_type='MongoDB', attrs=[ Attr(name='last_modified', func='sel_text', kws={'regex': 'lastModifiedDate:(.*)'}, source=cve_source)]) cve_template = Template( name='meta', db='defcon', table='cve', db_type='MongoDB', #func='update', #kws={'key': 'id'}, attrs=[ Attr(name='id', func='sel_text', selector=['cve', 'CVE_data_meta', 'ID']), Attr(name='cpes', func='sel_text', selector=['configurations', 'nodes', 'cpe', 'cpe23Uri']), Attr(name='affects', func='sel_dict', selector=['cve', 'affects']), Attr(name='problem_type', func='sel_text', selector=['cve', 'problemtype', 'problemtype_data', 'description',
# --- Fragment: npo.nl / dwdd episode scraper. Truncated mid-template in
# --- `episodes_phase`.
cl = MongoClient() sources = (Source(url=a['url'][0]) for a in cl.dwdd.episode_urls.find()) programs_az = Phase( sources=[ Source(url="http://www.npo.nl/programmas/a-z", params={'page': i}) for i in range(0, 1) ], templates=( Template( name='program', selector='.content-column.quarter', db_type='mongo_db', db='npo_tv_programs', table='programs', attrs=( Attr(name='title', selector='h3', func='sel_text'), Attr(name='url', selector='a.full-link', func='sel_url', source=Source(active=False)), # source is for next run )), )) nos_search = 'https://www.npo.nl/de-wereld-draait-door/VARA_101377717/search?media_type=broadcast&start_date=&end_date=&start={}&rows=100' episodes_phase = Phase(n_workers=5, sources=(Source(url=nos_search.format(start)) for start in range(0, 2194, 100)), templates=(Template( name='episodes', selector='.list-item', db_type='mongo_db',
# --- Fragment: nu.nl section-feed sources. Truncated: the headline/attr
# --- chain continues beyond this chunk; `base_url` is defined before it.
'buitenland', 'binnenland', 'economie', 'algemeen', 'tech', 'sport' ] sources = (Source(url=base_url.format(section=section, offset=offset), copy_attrs=['category'], attrs=[Attr(name='category', value=[section])]) for section in sections for offset in range(0, 200000, 20)) headline = Template(name='headline', selector='li', db='nu_nl', db_type='MongoDB', table='article_urls', attrs=[ Attr(name='url', selector='a', func='sel_url', source={ 'active': False, 'copy_attrs': 'category' }), Attr(name='title', selector='.title', func='sel_text'), Attr(name='excerpt', selector='.excerpt', func='sel_text') ]) title_attr = Attr(name='title', selector='h1', func='sel_text') text_attr = Attr(name='text', selector='p', func='sel_text') date_attr = Attr(name='date', selector='.published span.small', func='sel_text') author_attr = Attr(name='author', selector='span.author', func='sel_text')
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.parsers import TextParser
from modelscraper.dispatcher import Dispatcher
from pymongo import MongoClient

cl = MongoClient()

# One subtitle source per stored nos_journaal episode: the tt888 endpoint
# takes the last path component of the episode URL; the full URL rides
# along as an attribute so the update below can key on it.
urls = (
    Source(url='https://tt888.omroep.nl/tt888/' + a['url'][0].split('/')[-1],
           attrs=(Attr(name='url', value=a['url']), ))
    for a in cl.nos_journaal.episodes.find())

subtitles = ScrapeModel(
    name='subtitles',
    domain='https://tt888.omroep.nl/',
    phases=[
        Phase(n_workers=5,
              sources=urls,
              parser=TextParser,
              templates=(
                  # Update existing episode docs in place, keyed on url.
                  Template(name='subtitle',
                           db_type='mongo_db',
                           db='nos_journaal',
                           table='episodes',
                           func='update',
                           kws={'key': 'url'},
                           attrs=(Attr(name='subtitles', func='sel_text'), )),
              ))
    ])

del cl

d = Dispatcher()
d.add_scraper(subtitles)
d.run()
from modelscraper.dispatcher import Dispatcher
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.sources import ProgramSource

# Picks open-port details out of nmap's XML output (-oX -): one record per
# <port> element.
port_template = Template(
    name='ports',
    selector='port',
    db_type='mongo_db',
    db='ports',
    table='ports',
    attrs=(
        Attr(name='portnumber', func='sel_attr', kws={'attr': 'portid'}),
        Attr(name='state', selector='state', func='sel_attr',
             kws={'attr': 'state'}),
        Attr(name='service', selector='service', func='sel_attr',
             kws={'attr': 'name'}),
    ))

# With ProgramSource the "url" is the command line to execute.
nmap = ScrapeModel(
    name='nmap_test',
    domain='',
    phases=[
        Phase(sources=(Source(url='nmap -oX - duwo.multiposs.nl'), ),
              templates=[port_template],
              source_worker=ProgramSource)
    ])

disp = Dispatcher()
disp.add_scraper(nmap)
# --- Fragment: makro.nl product scraper. Truncated mid-attrs in
# --- `product_lists`.
from pymongo import MongoClient cl = MongoClient() product_sources = cl.makro.product_urls.find() sources = [Source(url=p['url'][0]) for p in product_sources] categories = Phase( sources=(Source(url="https://www.makro.nl/cat/nl/products"), ), templates=(Template( name='product_category', selector='#left-navigation-container ul.vertical > li > a', db_type='mongo_db', db='makro', table='product_categories', attrs=[ Attr(name='url', func='sel_url', source={'active': False}, kws={ 'replacers': 'pageSize=(\d+)', 'substitute': 'pageSize=96' }), ]), )) product_lists = Phase(templates=[ Template(name='product_urls', selector='.product-list .product-tiles', db_type='mongo_db', db='makro', table='product_urls', attrs=[
# --- Fragment: dabangasudan.org news scraper. Truncated at
# --- `Phase(synchronize=True,` — the second phase continues beyond this
# --- chunk.
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source dabanga = ScrapeModel( name='dabanga', domain='https://www.dabangasudan.org/en', num_getters=2, phases=[ Phase( sources=(Source(url="https://www.dabangasudan.org/en/all-news"), ), templates=( Template(name='article_url', selector='.list-item.news-item-small', db_type='mongo_db', db='dabanga', table='article_urls', attrs=[ Attr(name='url', selector='a:nth-of-type(1)', func='sel_url', source={'active': False}), ]), Template(name='pagination', selector='.pager', attrs=[ Attr(name='url', selector='a', func='sel_url', source=True), ]), )), Phase(synchronize=True,