def make_objects(self, template, selected, getter):
    """Build one ``Template`` object for every element in *selected*.

    Each object gets the name of *template* and the url of *getter*,
    duplicates of the attributes already present on *getter*, and one
    ``Attr`` per attribute of *template* holding the parsed value.

    Args:
        template: Supplies ``name``, ``attrs`` and ``getter``. Every entry
            in ``attrs`` must provide ``func``, ``selector``, ``kws``,
            ``name`` and ``getter``.
        selected: Iterable of selections; each one is passed as the first
            argument to the ``func`` of every template attribute.
        getter: Supplies ``url`` and the predefined ``attrs`` to copy.

    Returns:
        list: The created objects, in the order of *selected*.
    """
    objects = []
    for sel in selected:
        objct = Template(name=template.name)
        objct.url = getter.url

        # Copy the predefined attributes from the getter; duplicates are
        # used so the objects do not share the getter's Attr instances.
        for attr in getter.attrs:
            objct.attrs.append(attr.duplicate())

        # Parse and store the value of every template attribute.
        for temp_attr in template.attrs:
            parsed = temp_attr.func(sel, temp_attr.selector, **temp_attr.kws)
            objct.attrs.append(Attr(name=temp_attr.name, value=parsed))

            # Only attributes that carry getter settings AND produced a
            # truthy value lead to follow-up requests.
            if not (temp_attr.getter and parsed):
                continue

            # A single value is treated as a one-element list. isinstance
            # (rather than an exact type comparison) keeps list subclasses
            # from being wrapped a second time.
            values = parsed if isinstance(parsed, list) else [parsed]
            for value in values:
                new_getter = Getter(**temp_attr.getter)
                new_getter.url = value
                self._handle_getter(new_getter)

        if template.getter:
            self._handle_object_getter(objct)
        objects.append(objct)
    return objects
templates=(Template(name='report_url', selector='.exp-list-table tr', source={ 'active': False, 'copy_attrs': True }, attrs=( Attr(name='url', selector='td:nth-of-type(2) a', func='sel_url'), Attr(name='title', selector='td:nth-of-type(2) a', func='sel_text'), Attr(name='rating', selector='td:nth-of-type(1) img', func='sel_attr', kws={'attr': 'alt'}), Attr(name='author', selector='td:nth-of-type(3)', func='sel_text'), Attr(name='substances', selector='td:nth-of-type(4)', func='sel_text', kws={ 'replacers': '&', 'substitute': ',', 'regex': '([A-z0-9\-]+\s*[A-z0-9\-*\s]*)' }), Attr(name='date', selector='td:nth-of-type(5)', func='sel_text'), Attr(name='views', selector='td:nth-of-type(6)', func='sel_text'), )), )),
cl = MongoClient() db = cl.nytimes col = db.menu nytimes = ScrapeModel(name='nytimes', domain='https://www.nytimes.com/', num_getters=2, phases=[ Phase(source_worker=WebSource, parser=HTMLParser, sources=[ Source(url="https://www.nytimes.com/")], templates=( Template( name='menu', selector='#site-index-navigation li', db_type='MongoDB', db='nytimes', table='menu', attrs=( Attr(name='url', selector='a', func='sel_url', source=Source(active=False)), # source is for next run ) ), ) ), Phase(source_worker=WebSource, parser=HTMLParser, sources=[ Source(url="https://www.nytimes.com/")], templates=( Template( name='articlelist', selector='', db_type='MongoDB', db='nytimes', table='articles', attrs=( Attr(name='title', selector='h1', func='sel_text'),
sources=[ Source( url="https://ghostlyhaks.com/forum/rom-eeprom-bios-efi-uefi" ) ], templates=( Template( name='forum_post', selector='.kbody tr', db_type='mongo_db', db='efi_dumps', table='forum_post', attrs=( Attr(name='url', selector='a.ktopic-title', func='sel_url', source=Source( active=False)), # source is for next run Attr(name='user', selector='.kwho-user', func='sel_text'), Attr(name='user_url', selector='.kwho-user', func='sel_url'), )), Template( name='next_page', selector='.kpagination', attrs=[ Attr(name='url', selector='a',
url= 'http://www.kinky.nl/sex-afspraken/gay/default.aspx?pagesize=5000', attrs=[Attr(name='sex', value='gay')]), ], templates=[ Template(name='advert', selector='#advertenties > div', db_type='mongo_db', db='kinky', table='adds', attrs=[ Attr(name='phone', selector='.quickinfo > span', func='sel_text', kws={ 'children': True, 'debug': True, 'regex': 'Mijn telefoonnummer: (.*)' }), Attr(name='city', selector='.quickinfo span.country', func='sel_text'), Attr(name='url', selector='.advertentie_kop a', func='sel_attr', kws={'attr': 'href'}) ]) ]), ]) sexjobs = ScrapeModel( name='sexjobs', domain='http://www.sexjobs.nl/',
phases=[ Phase( source_worker=WebSource, parser=HTMLParser, sources=[ Source( url= "https://fr.wikipedia.org/wiki/Liste_des_gouvernements_de_la_Belgique" ) ], templates=( Template( name='government', selector='.wikitable tr td:nth-of-type(2)', attrs=( Attr(name='url', selector='a', func='sel_url', source=Source( active=False)), # source is for next run )), )), Phase(source_worker=WebSource, parser=HTMLParser, templates=(Template(name='government', selector='table:nth-of-type(1) tr', db_type='mongo_db', db='belgian_politics', table='politicians', attrs=( Attr(name='url', selector='td:nth-of-type(2) a', func='sel_url'),
phases=[ Phase(source_worker=WebSource, parser=HTMLParser, sources=[ Source(url='https://www.youtube.com/user/ozzymanreviews/videos'), Source(url='https://www.youtube.com/user/Draadstal/videos'), Source(url='https://www.youtube.com/channel/UCQMs9pijXYAdqvkEMJyCM4g/videos'), Source(url='https://www.youtube.com/channel/UCi1LpRIlG1tDY5Z54VTel2w/videos'), Source(url='https://www.youtube.com/user/vpro/videos'), Source(url='https://www.youtube.com/user/nprmusic/videos'), ], templates=( Template( name='channel_videos', selector='li.channels-content-item', db_type='mongo_db', db='youtube_channel', table='channel_videos', attrs=[ Attr(name='url', selector='h3.yt-lockup-title a', func='sel_url'), Attr(name='title', selector='h3', func='sel_text'), Attr(name='views', selector='.yt-lockup-meta-info', func='sel_text', kws={'regex': '(.*) weergaven', 'numbers': True}), ] ), Template( name='next_videos', selector='.browse-items-load-more-button', attrs=[ Attr(name='url', func='sel_attr', kws={'attr': 'data-uix-load-more-href'}, source=Source(src_template='http://youtube.com{}', json_key=['content_html', 'load_more_widget_html'])) ]), ) ),
cl = MongoClient() db = cl.headlines col = db.category headlines = ScrapeModel(name='headlines', domain='http://www.headlines24.nl/', num_getters=2, phases=[ Phase(source_worker=WebSource, parser=HTMLParser, sources=[ Source(url="http://www.headlines24.nl/")], templates=( Template( name='category', selector='', db_type='mongo_db', db='headlines', table='category', attrs=( Attr(name='url', selector='a', func='sel_url', source=Source(active=False)), # source is for next run Attr(name='title', selector='h1', func='sel_text'), Attr(name='text', selector='p', func='sel_text'), ) ), ) ), Phase(source_worker=WebSource, parser=HTMLParser, sources=[ Source(url="http://www.headlines24.nl/")], templates=( Template( name='category', selector='', db_type='mongo_db', db='headlines', table='category', attrs=(
from parsers import HTMLParser import string from components import ScrapeModel, Source, Phase, Attr, Template paradiso = ScrapeModel( name='paradiso', domain='https://paradiso.nl', phases=[ Phase(source_worker=WebSource, sources=[Source(url='https://paradiso.nl/web/Agenda.htm')], parser=HTMLParser, templates=[ Template(name='event_link', selector='a.event-link', attrs=[ Attr(name='url', func='sel_attr', kws={'attr': 'href'}, source={'active': False}) ]) ]), Phase(templates=[ Template(name='event', db_type='MongoDB', db='paradiso', table='events', attrs=[ Attr(name='name', selector='meta[name=evenementts]', func='sel_attr', kws={'attr': 'content'}), Attr(name='date',
name='volkskrant', domain='http://www.volkskrant.nl/', num_getters=2, cookies={'nl_cookiewall_version': '1'}, phases=[ Phase( source_worker=WebSource, parser=HTMLParser, sources=[ Source(url="http://www.volkskrant.nl/archief/{}".format(year)) for year in range(1987, today) ], templates=(Template(name='day_url', selector='td', attrs=(Attr( name='url', selector='a', func='sel_url', source=Source(active=False)), )), )), Phase( source_worker=WebSource, parser=HTMLParser, templates=( Template(name='article_url', selector='article', attrs=(Attr(name='url', selector='a', func='sel_url', source=Source(active=False)), )), Template(name='next_page_url', selector='a.pager',
meertens = ScrapeModel( name='namen', domain='http://www.meertens.knaw.nl/', num_getters=2, phases=[ Phase(source_worker=WebSource, parser=HTMLParser, sources=( Source(url="http://www.meertens.knaw.nl/nvb/naam/begintmet/" + l) for l in ['Aad']), # string.ascii_lowercase), templates=[ Template( name='name', selector='tr.data', db_type='mongo_db', db='names', table='name_count_test', attrs=[ Attr(name='name', selector='td:nth-of-type(1)', func='sel_text'), Attr(name='men', selector='td:nth-of-type(2)', func='sel_text', kws={'numbers': True}), Attr(name='women', selector='td:nth-of-type(3)', func='sel_text', kws={'numbers': True}), Attr(name='url', selector='td:nth-of-type(1) a', func='sel_attr', kws={'attr': 'href'}, source={'active': False}, source_condition={'women': '> 50', 'men': '> 50'}), ] ), Template( name='next_url', selector='.right', attrs=[ Attr(name='next', selector='abc', func='sel_attr', kws={'attr': 'href'}, source={'active': True}), ]) ]
from workers import WebSource from parsers import HTMLParser pornstars = ScrapeModel( name='pornhub_pornstars', domain='http://pornhub.com', num_getters=2, phases=[ Phase(source_worker=WebSource, parser=HTMLParser, sources=[Source(url='http://www.pornhub.com/pornstars?o=a')], templates=[ Template(name='alphabet', selector='.alphabetFilter .dropdownWrapper li', attrs=[ Attr(name='url', selector='a', func='sel_url', source=Source(active=False)) ]) ]), Phase(source_worker=WebSource, parser=HTMLParser, templates=[ Template(name='pornstar', selector='.pornstarIndex li', db_type='MongoDB', db='pornstars', collection='ranking', attrs=[ Attr(name='name', selector='.title',
from dispatcher import Dispatcher from components import ScrapeModel, Phase, Template, Attr, Source from workers import WebSource from parsers import HTMLParser import string from pymongo import MongoClient petitions = ScrapeModel( name='petitions', domain='https://petities.nl/', num_getters=2, phases=[ Phase(source_worker=WebSource, parser=HTMLParser, sources=( Source(url="https://petities.nl/petitions/borstkankeronderzoek-vervroegen/signatures?locale=nl"),), templates=[ Template(name='next_page', selector='.navigation-bar .navigation-bar', attrs=[Attr(name='url', selector='a', func='sel_url', kws={'attr': 'href'}, source=True)]), Template(name='signature', selector='.petition-signature-list', db_type='mongo_db', db='petitions', table='borstkanker', attrs=[ Attr(name='name', selector='.petition-signature-name', func='sel_text'), Attr(name='time', selector='.signature-time', func='sel_text'), Attr(name='location', selector='.petition-signature-location', func='sel_text'), Attr(name='occupation', selector='.petition-signature-occupation', func='sel_text') ]) ] ) ]
templates=(Template( name='episode', selector='.so-panel.widget.widget_siteorigin-panels-builder', db_type='shell_command', db='theoffice', table='season', kws={ 'command': 'sudo mkdir -p ' + filepath + '/{season}/ &' + ' sudo youtube-dl -o /mnt/Movies/{season}/{episode} {url}' }, attrs=( Attr(name='url', selector='a', func=['sel_url', 'sel_text'], kws=[{}, { 'needle': r'.*(s\d+e\d+)' }]), Attr(name='episode', selector='.textwidget', func='sel_text', kws={ 'index': 3, 'substitute': '_', 'replacers': ' ' }), Attr(name='season', selector='.textwidget', func='sel_text', kws={ 'index': 1, 'replacers': ' ' }), )), )),
# Handles to the MongoDB database and collection that share their names with
# the `db` / `table` settings of the template below. None of the three names
# is used again in this script.
cl = MongoClient()
db = cl.gsmhelpdesk_nummerreeksen
col = db.number_range

# The building blocks are created in the same order as in a single nested
# expression: source first, then the attributes, the template, the phase and
# finally the model itself.
_start_source = Source(
    url="http://www.gsmhelpdesk.nl/helpdesk/30/nummerreeksen",
)

# First and second cell of a table row, both parsed with `numbers` enabled.
_range_attrs = (
    Attr(
        name='start',
        selector='td:nth-of-type(1)',
        func='sel_text',
        kws={'numbers': True},
    ),
    Attr(
        name='end',
        selector='td:nth-of-type(2)',
        func='sel_text',
        kws={'numbers': True},
    ),
)

# One object per selected `tr` element.
_number_range_template = Template(
    name='number_range',
    selector='tr',
    db_type='mongo_db',
    db='gsmhelpdesk_nummerreeksen',
    table='number_range',
    attrs=_range_attrs,
)

_listing_phase = Phase(
    source_worker=WebSource,
    parser=HTMLParser,
    sources=[_start_source],
    templates=(_number_range_template,),
)

gsmhelpdesk_nummerreeksen = ScrapeModel(
    name='gsmhelpdesk_nummerreeksen',
    domain='http://www.gsmhelpdesk.nl/',
    num_getters=2,
    phases=[_listing_phase],
)

# Register the model and start the dispatcher as soon as the module executes.
disp = Dispatcher()
disp.add_scraper(gsmhelpdesk_nummerreeksen)
disp.run()
# Template for one search result ('.search-result'); configured with
# db 'funda' and table 'for_hire'.
# All regex patterns are raw strings: `\d` and `\w` are not valid escape
# sequences in ordinary string literals and trigger a warning on current
# Python versions. The pattern text itself is unchanged.
Template(
    name='house',
    selector='.search-result',
    db_type='mongo_db',
    db='funda',
    table='for_hire',
    attrs=[
        Attr(name='price', selector='.search-result-price',
             func='sel_text', kws={'numbers': True}),
        Attr(name='street', selector='.search-result-title',
             func='sel_text'),
        Attr(name='realtor', selector='.realtor', func='sel_text'),
        # Captures the digits directly in front of " kamers".
        Attr(name='rooms', selector='.search-result-info',
             func='sel_text',
             kws={
                 'regex': r'(\d+) kamers',
                 'numbers': True
             }),
        # Captures four digits, a space and two word characters.
        Attr(name='zip', selector='.search-result-subtitle',
             func='sel_text', kws={'regex': r'(\d{4} \w{2})'}),
        # Same subtitle element: captures the word that follows the
        # "dddd ww" part matched for `zip`.
        Attr(name='city', func='sel_text',
             selector='.search-result-subtitle',
             kws={'regex': r'\d{4} \w{2} (\w+)'}),
        # First run of digits inside the span titled "Woonoppervlakte".
        Attr(
            name='living_area',
            func='sel_text',
            selector='.search-result-info span[title="Woonoppervlakte"]',
            kws={
                'regex': r'(\d+)',
                'numbers': True
            }),
        # NOTE(review): presumably the href is substituted into
        # `src_template` to build a follow-up source that starts out
        # inactive - confirm against the Source implementation.
        Attr(name='meeting_url', selector='.search-result-header a',
             func='sel_attr', kws={'attr': 'href'},
             source={
                 'src_template': '{}bezichtiging/',
                 'active': False
             }),
    ]),
thuisbezorgd = ScrapeModel( name='thuisbezorgd', domain='http://thuisbezorgd.nl', num_getters=2, phases=[ Phase( source_worker=WebSource, parser=HTMLParser, sources=[Source(url="https://www.thuisbezorgd.nl/")], templates=( Template( name='sections', selector='', attrs=( Attr(name='url', selector='a[href*="eten-bestellen-"]', func='sel_url', source=Source()), # source is for next run )), Template( name='restaurant', selector='.restaurant', db_type='MongoDB', db='thuisbezorgd', table='restaurants', attrs=( Attr(name='url', selector='a.restaurantname', func='sel_url', source=Source(
cl = MongoClient() db = cl.southpark col = db.video southpark = ScrapeModel(name='southpark', domain='http://southpark.cc.com/', num_getters=2, phases=[ Phase(source_worker=WebSource, parser=HTMLParser, sources=[ Source(url="http://southpark.cc.com/")], templates=( Template( name='video', selector='', db_type='MongoDB', db='southpark', table='video', attrs=( Attr(name='url', selector='a', func='sel_url', source=Source(active=False)), # source is for next run Attr(name='title', selector='h1', func='sel_text'), Attr(name='text', selector='p', func='sel_text'), ) ), ) ), Phase(source_worker=WebSource, parser=HTMLParser, sources=[ Source(url="http://southpark.cc.com/")], templates=( Template( name='video', selector='', db_type='MongoDB', db='southpark', table='video', attrs=(