def build_10m():
    """Build and return the 10 m SANS `Instrument` from its beamline parts.

    Components are declared upstream-to-downstream; `loc` appears to be the
    position along the beam axis (units not shown here — TODO confirm).
    """
    neutron_source = Source(loc=-800, flux=100000)
    velocity_selector = VelocitySelector(loc=-600, wavelen=5, spread=0.14)
    attenuator = Attenuator(loc=-510, number=0, factor=0)
    source_aperture = Aperture(loc=-500, shape='circle', dims=[2.54, 2.54])
    sample_aperture = Aperture(loc=0, shape='circle', dims=[0.635, 0.635])
    guide_section = Guide(loc=-500, number=0, dims=[5, 5], parms=[2, 2])
    sample = Sample(loc=5, label='Cylinders in d2O',
                    dims=[2.54, 2.54, 0.254], model='cylinder')
    beam_stop = BeamStop(loc=495, coords=[0.2, 5.3], bsnum=2,
                         bsdims=[5.08, 5.08], A=20.1, B=2.0477)
    detector = Detector(loc=500, dpix=[0.508, 0.508], npix=[128, 128],
                        beam_cntr=[64.1, 65.2])
    # Instrument takes the components in beamline order, as the original did.
    return Instrument(neutron_source, velocity_selector, attenuator,
                      source_aperture, sample_aperture, guide_section,
                      sample, beam_stop, detector)
cl = MongoClient() db = cl.efi_dumps col = db.forum_post efi_dumps = ScrapeModel( name='efi_dumps', domain='https://ghostlyhaks.com/', num_getters=2, phases=[ Phase( source_worker=WebSource, parser=HTMLParser, sources=[ Source( url="https://ghostlyhaks.com/forum/rom-eeprom-bios-efi-uefi" ) ], templates=( Template( name='forum_post', selector='.kbody tr', db_type='mongo_db', db='efi_dumps', table='forum_post', attrs=( Attr(name='url', selector='a.ktopic-title', func='sel_url', source=Source( active=False)), # source is for next run
from parsers import HTMLParser kinkycookies = {'ckieLegalIds': '4e17b168-5eb9-4c72-b7ee-3e8aebfd963e'} sexjobscookies = {'algemeneVoorwaardenVersie': '3'} kinky = ScrapeModel( name='kinky', domain='http://www.kinky.nl/', cookies=kinkycookies, phases=[ Phase( source_worker=WebSource, parser=HTMLParser, sources=[ Source( url= 'http://www.kinky.nl/sex-afspraken/mannen/default.aspx?pagesize=5000', attrs=[Attr(name='sex', value='man')]), Source( url= 'http://www.kinky.nl/sex-afspraken/vrouwen/default.aspx?pagesize=5000', attrs=[Attr(name='sex', value='vrouw')]), Source( url= 'http://www.kinky.nl/sex-afspraken/transsexuelen/default.aspx?pagesize=5000', attrs=[Attr(name='sex', value='trans')]), Source( url= 'http://www.kinky.nl/sex-afspraken/stellen/default.aspx?pagesize=5000', attrs=[Attr(name='sex', value='stellen')]), Source( url=
cl = MongoClient() db = cl.belgian_parlement_roles col = db.government belgian_parlement_roles = ScrapeModel( name='belgian_parlement_roles', domain='https://fr.wikipedia.org/', num_getters=2, phases=[ Phase( source_worker=WebSource, parser=HTMLParser, sources=[ Source( url= "https://fr.wikipedia.org/wiki/Liste_des_gouvernements_de_la_Belgique" ) ], templates=( Template( name='government', selector='.wikitable tr td:nth-of-type(2)', attrs=( Attr(name='url', selector='a', func='sel_url', source=Source( active=False)), # source is for next run )), )), Phase(source_worker=WebSource, parser=HTMLParser,
cl = MongoClient() db = cl.youtube_channel col = db.channel_videos # The base url of the website url = 'https://youtube.com/' # The amount of workers that will get the information youtube_channel = ScrapeModel( name='youtube_channel', domain='https://youtube.com/', num_getters=2, awaiting=True, phases=[ Phase(source_worker=WebSource, parser=HTMLParser, sources=[ Source(url='https://www.youtube.com/user/ozzymanreviews/videos'), Source(url='https://www.youtube.com/user/Draadstal/videos'), Source(url='https://www.youtube.com/channel/UCQMs9pijXYAdqvkEMJyCM4g/videos'), Source(url='https://www.youtube.com/channel/UCi1LpRIlG1tDY5Z54VTel2w/videos'), Source(url='https://www.youtube.com/user/vpro/videos'), Source(url='https://www.youtube.com/user/nprmusic/videos'), ], templates=( Template( name='channel_videos', selector='li.channels-content-item', db_type='mongo_db', db='youtube_channel', table='channel_videos', attrs=[ Attr(name='url', selector='h3.yt-lockup-title a', func='sel_url'), Attr(name='title', selector='h3', func='sel_text'),
from workers import WebSource from parsers import HTMLParser cl = MongoClient() db = cl.southpark col = db.video southpark = ScrapeModel( name='southpark', domain='http://southpark.cc.com/', num_getters=2, phases=[ Phase( source_worker=WebSource, parser=HTMLParser, sources=[Source(url="http://southpark.cc.com/")], templates=( Template( name='video', selector='', db_type='mongo_db', db='southpark', table='video', attrs=( Attr(name='url', selector='a', func='sel_url', source=Source( active=False)), # source is for next run Attr(name='title', selector='h1', func='sel_text'), Attr(name='text', selector='p', func='sel_text'),
from dispatcher import Dispatcher from components import ScrapeModel, Phase, Template, Attr, Source from pymongo import MongoClient from workers import WebSource from parsers import HTMLParser cl = MongoClient() db = cl.headlines col = db.category headlines = ScrapeModel(name='headlines', domain='http://www.headlines24.nl/', num_getters=2, phases=[ Phase(source_worker=WebSource, parser=HTMLParser, sources=[ Source(url="http://www.headlines24.nl/")], templates=( Template( name='category', selector='', db_type='mongo_db', db='headlines', table='category', attrs=( Attr(name='url', selector='a', func='sel_url', source=Source(active=False)), # source is for next run Attr(name='title', selector='h1', func='sel_text'), Attr(name='text', selector='p', func='sel_text'), ) ), ) ),
from dispatcher import Dispatcher from components import ScrapeModel, Phase, Template, Attr, Source from pymongo import MongoClient from workers import WebSource from parsers import HTMLParser cl = MongoClient() db = cl.gsmhelpdesk_nummerreeksen col = db.number_range gsmhelpdesk_nummerreeksen = ScrapeModel(name='gsmhelpdesk_nummerreeksen', domain='http://www.gsmhelpdesk.nl/', num_getters=2, phases=[ Phase(source_worker=WebSource, parser=HTMLParser, sources=[ Source(url="http://www.gsmhelpdesk.nl/helpdesk/30/nummerreeksen"), ], templates=( Template( name='number_range', selector='tr', db_type='mongo_db', db='gsmhelpdesk_nummerreeksen', table='number_range', attrs=( Attr(name='start', selector='td:nth-of-type(1)', func='sel_text', kws={'numbers': True}), Attr(name='end', selector='td:nth-of-type(2)', func='sel_text', kws={'numbers': True}), ) ), ) ), ]
col = db.artikelen today = datetime.datetime.now().year print(today) volkskrant = ScrapeModel( name='volkskrant', domain='http://www.volkskrant.nl/', num_getters=2, cookies={'nl_cookiewall_version': '1'}, phases=[ Phase( source_worker=WebSource, parser=HTMLParser, sources=[ Source(url="http://www.volkskrant.nl/archief/{}".format(year)) for year in range(1987, today) ], templates=(Template(name='day_url', selector='td', attrs=(Attr( name='url', selector='a', func='sel_url', source=Source(active=False)), )), )), Phase( source_worker=WebSource, parser=HTMLParser, templates=( Template(name='article_url', selector='article',
from dispatcher import Dispatcher from components import ScrapeModel, Phase, Template, Attr, Source from workers import WebSource from parsers import HTMLParser import string meertens = ScrapeModel( name='namen', domain='http://www.meertens.knaw.nl/', num_getters=2, phases=[ Phase(source_worker=WebSource, parser=HTMLParser, sources=( Source(url="http://www.meertens.knaw.nl/nvb/naam/begintmet/" + l) for l in ['Aad']), # string.ascii_lowercase), templates=[ Template( name='name', selector='tr.data', db_type='mongo_db', db='names', table='name_count_test', attrs=[ Attr(name='name', selector='td:nth-of-type(1)', func='sel_text'), Attr(name='men', selector='td:nth-of-type(2)', func='sel_text', kws={'numbers': True}), Attr(name='women', selector='td:nth-of-type(3)', func='sel_text', kws={'numbers': True}), Attr(name='url', selector='td:nth-of-type(1) a', func='sel_attr', kws={'attr': 'href'}, source={'active': False}, source_condition={'women': '> 50', 'men': '> 50'}), ] ), Template(
from dispatcher import Dispatcher import re from components import ScrapeModel, Phase, Template, Attr, Source from workers import WebSource from parsers import HTMLParser pornstars = ScrapeModel( name='pornhub_pornstars', domain='http://pornhub.com', num_getters=2, phases=[ Phase(source_worker=WebSource, parser=HTMLParser, sources=[Source(url='http://www.pornhub.com/pornstars?o=a')], templates=[ Template(name='alphabet', selector='.alphabetFilter .dropdownWrapper li', attrs=[ Attr(name='url', selector='a', func='sel_url', source=Source(active=False)) ]) ]), Phase(source_worker=WebSource, parser=HTMLParser, templates=[ Template(name='pornstar', selector='.pornstarIndex li', db_type='MongoDB', db='pornstars',
from dispatcher import Dispatcher from components import ScrapeModel, Phase, Template, Attr, Source from workers import WebSource from parsers import HTMLParser import string from pymongo import MongoClient petitions = ScrapeModel( name='petitions', domain='https://petities.nl/', num_getters=2, phases=[ Phase(source_worker=WebSource, parser=HTMLParser, sources=( Source(url="https://petities.nl/petitions/borstkankeronderzoek-vervroegen/signatures?locale=nl"),), templates=[ Template(name='next_page', selector='.navigation-bar .navigation-bar', attrs=[Attr(name='url', selector='a', func='sel_url', kws={'attr': 'href'}, source=True)]), Template(name='signature', selector='.petition-signature-list', db_type='mongo_db', db='petitions', table='borstkanker', attrs=[ Attr(name='name', selector='.petition-signature-name', func='sel_text'), Attr(name='time', selector='.signature-time', func='sel_text'), Attr(name='location', selector='.petition-signature-location', func='sel_text'), Attr(name='occupation', selector='.petition-signature-occupation', func='sel_text') ]) ] ) ]
cl = MongoClient() db = cl.theoffice col = db.season filepath = '/mnt/Movies/theoffice/' theoffice = ScrapeModel( name='theoffice', domain='http://watchtheofficeonline.com', num_getters=2, phases=[ Phase( source_worker=WebSource, parser=HTMLParser, sources=[ Source(url="http://watchtheofficeonline.com/s{}e{}".format( season, episode)) for season in range(1, 10) for episode in range(1, 30) ], templates=(Template( name='episode', selector='.so-panel.widget.widget_siteorigin-panels-builder', db_type='shell_command', db='theoffice', table='season', kws={ 'command': 'sudo mkdir -p ' + filepath + '/{season}/ &' + ' sudo youtube-dl -o /mnt/Movies/{season}/{episode} {url}' }, attrs=( Attr(name='url',
from dispatcher import Dispatcher from components import ScrapeModel, Phase, Template, Attr, Source from pymongo import MongoClient from workers import WebSource from parsers import HTMLParser cl = MongoClient() db = cl.nytimes col = db.menu nytimes = ScrapeModel(name='nytimes', domain='https://www.nytimes.com/', num_getters=2, phases=[ Phase(source_worker=WebSource, parser=HTMLParser, sources=[ Source(url="https://www.nytimes.com/")], templates=( Template( name='menu', selector='#site-index-navigation li', db_type='MongoDB', db='nytimes', table='menu', attrs=( Attr(name='url', selector='a', func='sel_url', source=Source(active=False)), # source is for next run ) ), ) ), Phase(source_worker=WebSource, parser=HTMLParser, sources=[ Source(url="https://www.nytimes.com/")], templates=(
from dispatcher import Dispatcher from workers import WebSource from parsers import HTMLParser import string from components import ScrapeModel, Source, Phase, Attr, Template paradiso = ScrapeModel( name='paradiso', domain='https://paradiso.nl', phases=[ Phase(source_worker=WebSource, sources=[Source(url='https://paradiso.nl/web/Agenda.htm')], parser=HTMLParser, templates=[ Template(name='event_link', selector='a.event-link', attrs=[ Attr(name='url', func='sel_attr', kws={'attr': 'href'}, source={'active': False}) ]) ]), Phase(templates=[ Template(name='event', db_type='MongoDB', db='paradiso', table='events', attrs=[ Attr(name='name', selector='meta[name=evenementts]',
cl = MongoClient() db = cl.erowid col = db.drug_report erowid = ScrapeModel( name='erowid', domain='https://www.erowid.org/experiences/', num_getters=2, phases=[ Phase( source_worker=WebSource, parser=HTMLParser, sources=[ Source( url= "https://www.erowid.org/experiences/exp.cgi?ShowViews=1&Cellar=0&Start=0&Max=24777" ) ], templates=(Template(name='report_url', selector='.exp-list-table tr', source={ 'active': False, 'copy_attrs': True }, attrs=( Attr(name='url', selector='td:nth-of-type(2) a', func='sel_url'), Attr(name='title', selector='td:nth-of-type(2) a', func='sel_text'),
from components import ScrapeModel, Phase, Source, Template, Attr from dispatcher import Dispatcher from workers import WebSource from parsers import HTMLParser funda = ScrapeModel( name='funda.nl', domain='http://funda.nl', num_sources=1, phases=[ Phase( parser=HTMLParser, source_worker=WebSource, sources=[ Source(url='http://funda.nl/huur/amsterdam/woonhuis/'), Source(url='http://funda.nl/huur/amsterdam/appartement/'), Source(url='http://funda.nl/koop/amsterdam/woonhuis/'), Source(url='http://funda.nl/koop/amsterdam/appartement'), ], templates=[ Template( name='house', selector='.search-result', db_type='mongo_db', db='funda', table='for_hire', attrs=[ Attr(name='price', selector='.search-result-price', func='sel_text', kws={'numbers': True}),
from workers import WebSource from parsers import HTMLParser cl = MongoClient() db = cl.thuisbezorgd col = db.reviews thuisbezorgd = ScrapeModel( name='thuisbezorgd', domain='http://thuisbezorgd.nl', num_getters=2, phases=[ Phase( source_worker=WebSource, parser=HTMLParser, sources=[Source(url="https://www.thuisbezorgd.nl/")], templates=( Template( name='sections', selector='', attrs=( Attr(name='url', selector='a[href*="eten-bestellen-"]', func='sel_url', source=Source()), # source is for next run )), Template( name='restaurant', selector='.restaurant', db_type='MongoDB', db='thuisbezorgd',
# TODO Set the right classes for the websites. from dispatcher import Dispatcher from components import Attr, Source, HTMLObject, Phase, ScrapeModel, Follow kinky =ScrapeModel(name='kinky', domain='kinky.nl', phases=[ Phase(source=[Source(url='http://kinky.nl/sex-afspraken/vrouwen/default.aspx?pagesize=4500')], templates=[Template(name='advert', selector='.advertentie_kop > a', attrs=[Attr(name='url', source={'active': False})]) ]), Phase(templates=[ Template(name='advert', selector='.advertentie_kop > a', db='kinky.nl', table='adverts', Attr(name='add_text', func= 'sel_text', selector='description p'), Attr(name='possibilities', func= 'sel_text', selector= '.possibilities li'), Attr(name='update', func= 'sel_text', selector='update'), Attr(name='town', func= 'sel_text', selector= '.naw div', kws={'regex': '.*Plaats: ([A-z]*);'}), Attr(name='work_area', func= 'sel_text', selector='naw div', kws={'regex': '.*Werkgebied: '}), Attr(name='prices', func= 'sel_text', selector='.prizes td:nth-of-type(2)'), Attr(name='pictures', func= 'sel_attr', selector='.galleryRow img', kws={'attr': 'src'}), Attr(name='phone', selector='.mainprofile .webbutton span', func='sel_text'), Attr(name='name', selector='h1.title', func='sel_text'), '__reg__age': ['.naw div', '.*Leeftijd: (\d\d)'], '__reg__length': ['.naw div', '.*Lengte: (\d\d\d)'], '__reg__hair_color': ['.naw div', '.*Kleur haar: ([a-z]*);'], '__reg__build': ['.naw div', '.*Lichaamsbouw: ([A-z]*);'], '__reg__looks': ['.naw div', '.*Uiterlijk: ([A-z]*);'], , , , 'start': [ 'meta': { 'sex': 'female',