Example #1
0
    def make_objects(self, template, selected, getter):
        """Build one object for every selected element.

        For each element in ``selected`` a new ``Template`` carrying
        ``template.name`` is created. It receives ``getter.url``, a duplicate
        of every attr in ``getter.attrs`` and one ``Attr`` per entry in
        ``template.attrs`` holding the value parsed from the element.

        When a template attribute defines a ``getter`` and its parsed value
        is truthy, a new ``Getter`` is created for each parsed value (a
        single value is treated as a one-item list) and passed to
        ``self._handle_getter``. When ``template.getter`` is truthy the
        finished object is also passed to ``self._handle_object_getter``.

        :param template: template supplying ``name``, ``attrs`` and ``getter``.
        :param selected: iterable of selected elements to parse.
        :param getter: getter supplying the ``url`` and predefined ``attrs``.
        :returns: list with the created objects, in the order of ``selected``.
        """
        objects = []
        for sel in selected:
            objct = Template(name=template.name)
            objct.url = getter.url

            # Duplicate the getter's predefined attributes so every object
            # gets its own copy instead of sharing the getter's instances.
            for attr in getter.attrs:
                objct.attrs.append(attr.duplicate())

            # Parse the value of every template attribute from the element.
            for temp_attr in template.attrs:
                parsed = temp_attr.func(
                    sel, temp_attr.selector, **temp_attr.kws)
                objct.attrs.append(Attr(name=temp_attr.name, value=parsed))

                # Only attributes that define a getter and produced a value
                # result in follow-up requests.
                if not (temp_attr.getter and parsed):
                    continue

                # isinstance (rather than an exact type comparison) keeps
                # list subclasses from being wrapped in yet another list.
                values = parsed if isinstance(parsed, list) else [parsed]
                for value in values:
                    new_getter = Getter(**temp_attr.getter)
                    new_getter.url = value
                    self._handle_getter(new_getter)

            if template.getter:
                self._handle_object_getter(objct)
            objects.append(objct)
        return objects
Example #2
0
 parser=HTMLParser,
 sources=[
     Source(
         url=
         "https://www.erowid.org/experiences/exp.cgi?ShowViews=1&Cellar=0&Start=0&Max=24777"
     )
 ],
 templates=(Template(name='report_url',
                     selector='.exp-list-table tr',
                     source={
                         'active': False,
                         'copy_attrs': True
                     },
                     attrs=(
                         Attr(name='url',
                              selector='td:nth-of-type(2) a',
                              func='sel_url'),
                         Attr(name='title',
                              selector='td:nth-of-type(2) a',
                              func='sel_text'),
                         Attr(name='rating',
                              selector='td:nth-of-type(1) img',
                              func='sel_attr',
                              kws={'attr': 'alt'}),
                         Attr(name='author',
                              selector='td:nth-of-type(3)',
                              func='sel_text'),
                         Attr(name='substances',
                              selector='td:nth-of-type(4)',
                              func='sel_text',
                              kws={
Example #3
0
# Client, database and collection handles for the 'nytimes' database.
# Dictionary-style access is equivalent to attribute access in pymongo.
cl = MongoClient()
db = cl['nytimes']
col = db['menu']

nytimes = ScrapeModel(name='nytimes', domain='https://www.nytimes.com/',
    num_getters=2, phases=[

    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url="https://www.nytimes.com/")],
        templates=(
            Template(
                name='menu', selector='#site-index-navigation li',
                db_type='MongoDB', db='nytimes', table='menu',
                attrs=(
                    Attr(name='url', selector='a', func='sel_url',
                        source=Source(active=False)), # source is for next run
                )
            ),
        )
    ),

    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url="https://www.nytimes.com/")],
        templates=(
            Template(
                name='articlelist', selector='',
                db_type='MongoDB', db='nytimes', table='articles',
                attrs=(
                    Attr(name='title', selector='h1', func='sel_text'),

                    Attr(name='text', selector='.story-body-supplemental p',
Example #4
0
 sources=[
     Source(
         url="https://ghostlyhaks.com/forum/rom-eeprom-bios-efi-uefi"
     )
 ],
 templates=(
     Template(
         name='forum_post',
         selector='.kbody tr',
         db_type='mongo_db',
         db='efi_dumps',
         table='forum_post',
         attrs=(
             Attr(name='url',
                  selector='a.ktopic-title',
                  func='sel_url',
                  source=Source(
                      active=False)),  # source is for next run
             Attr(name='user',
                  selector='.kwho-user',
                  func='sel_text'),
             Attr(name='user_url',
                  selector='.kwho-user',
                  func='sel_url'),
         )),
     Template(
         name='next_page',
         selector='.kpagination',
         attrs=[
             Attr(name='url',
                  selector='a',
Example #5
0
# Cookie name/value pairs. ``kinkycookies`` is handed to the 'kinky' model
# through its ``cookies`` argument; ``sexjobscookies`` is presumably used the
# same way by a model further down -- verify against the rest of the file.
kinkycookies = dict(ckieLegalIds='4e17b168-5eb9-4c72-b7ee-3e8aebfd963e')
sexjobscookies = dict(algemeneVoorwaardenVersie='3')

kinky = ScrapeModel(
    name='kinky',
    domain='http://www.kinky.nl/',
    cookies=kinkycookies,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[
                Source(
                    url=
                    'http://www.kinky.nl/sex-afspraken/mannen/default.aspx?pagesize=5000',
                    attrs=[Attr(name='sex', value='man')]),
                Source(
                    url=
                    'http://www.kinky.nl/sex-afspraken/vrouwen/default.aspx?pagesize=5000',
                    attrs=[Attr(name='sex', value='vrouw')]),
                Source(
                    url=
                    'http://www.kinky.nl/sex-afspraken/transsexuelen/default.aspx?pagesize=5000',
                    attrs=[Attr(name='sex', value='trans')]),
                Source(
                    url=
                    'http://www.kinky.nl/sex-afspraken/stellen/default.aspx?pagesize=5000',
                    attrs=[Attr(name='sex', value='stellen')]),
                Source(
                    url=
                    'http://www.kinky.nl/sex-afspraken/gay/default.aspx?pagesize=5000',
     source_worker=WebSource,
     parser=HTMLParser,
     sources=[
         Source(
             url=
             "https://fr.wikipedia.org/wiki/Liste_des_gouvernements_de_la_Belgique"
         )
     ],
     templates=(
         Template(
             name='government',
             selector='.wikitable tr td:nth-of-type(2)',
             attrs=(
                 Attr(name='url',
                      selector='a',
                      func='sel_url',
                      source=Source(
                          active=False)),  # source is for next run
             )), )),
 Phase(source_worker=WebSource,
       parser=HTMLParser,
       templates=(Template(name='government',
                           selector='table:nth-of-type(1) tr',
                           db_type='mongo_db',
                           db='belgian_politics',
                           table='politicians',
                           attrs=(
                               Attr(name='url',
                                    selector='td:nth-of-type(2) a',
                                    func='sel_url'),
                               Attr(name='title',
Example #7
0
    name='youtube_channel', domain='https://youtube.com/', num_getters=2, awaiting=True,
    phases=[
    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url='https://www.youtube.com/user/ozzymanreviews/videos'),
        Source(url='https://www.youtube.com/user/Draadstal/videos'),
        Source(url='https://www.youtube.com/channel/UCQMs9pijXYAdqvkEMJyCM4g/videos'),
        Source(url='https://www.youtube.com/channel/UCi1LpRIlG1tDY5Z54VTel2w/videos'),
        Source(url='https://www.youtube.com/user/vpro/videos'),
        Source(url='https://www.youtube.com/user/nprmusic/videos'),
    ],
        templates=(
            Template(
                name='channel_videos', selector='li.channels-content-item',
                db_type='mongo_db', db='youtube_channel', table='channel_videos',
                attrs=[
                    Attr(name='url', selector='h3.yt-lockup-title a', func='sel_url'),

                    Attr(name='title', selector='h3', func='sel_text'),

                    Attr(name='views', selector='.yt-lockup-meta-info', func='sel_text',
                         kws={'regex': '(.*) weergaven', 'numbers': True}),
                ]
            ),
            Template(
                name='next_videos', selector='.browse-items-load-more-button',
                attrs=[
                    Attr(name='url', func='sel_attr',
                         kws={'attr': 'data-uix-load-more-href'},
                         source=Source(src_template='http://youtube.com{}',
                                       json_key=['content_html', 'load_more_widget_html']))
                ]),
Example #8
0
# Client, database and collection handles for the 'headlines' database.
# Dictionary-style access is equivalent to attribute access in pymongo.
cl = MongoClient()
db = cl['headlines']
col = db['category']

headlines = ScrapeModel(name='headlines', domain='http://www.headlines24.nl/',
    num_getters=2, phases=[
    
    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url="http://www.headlines24.nl/")],
        templates=(
            Template(
                name='category', selector='',
                db_type='mongo_db', db='headlines', table='category',
                attrs=(
                    Attr(name='url', selector='a', func='sel_url',
                        source=Source(active=False)), # source is for next run

                    Attr(name='title', selector='h1', func='sel_text'),

                    Attr(name='text', selector='p', func='sel_text'),
                )
            ),
        )
    ),
    
    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url="http://www.headlines24.nl/")],
        templates=(
            Template(
                name='category', selector='',
                db_type='mongo_db', db='headlines', table='category',
Example #9
0
import string
from components import ScrapeModel, Source, Phase, Attr, Template

paradiso = ScrapeModel(
    name='paradiso',
    domain='https://paradiso.nl',
    phases=[
        Phase(source_worker=WebSource,
              sources=[Source(url='https://paradiso.nl/web/Agenda.htm')],
              parser=HTMLParser,
              templates=[
                  Template(name='event_link',
                           selector='a.event-link',
                           attrs=[
                               Attr(name='url',
                                    func='sel_attr',
                                    kws={'attr': 'href'},
                                    source={'active': False})
                           ])
              ]),
        Phase(templates=[
            Template(name='event',
                     db_type='MongoDB',
                     db='paradiso',
                     table='events',
                     attrs=[
                         Attr(name='name',
                              selector='meta[name=evenementts]',
                              func='sel_attr',
                              kws={'attr': 'content'}),
                         Attr(name='date',
                              selector='meta[name=evenementts]',
Example #10
0
 domain='http://www.volkskrant.nl/',
 num_getters=2,
 cookies={'nl_cookiewall_version': '1'},
 phases=[
     Phase(
         source_worker=WebSource,
         parser=HTMLParser,
         sources=[
             Source(url="http://www.volkskrant.nl/archief/{}".format(year))
             for year in range(1987, today)
         ],
         templates=(Template(name='day_url',
                             selector='td',
                             attrs=(Attr(
                                 name='url',
                                 selector='a',
                                 func='sel_url',
                                 source=Source(active=False)), )), )),
     Phase(
         source_worker=WebSource,
         parser=HTMLParser,
         templates=(
             Template(name='article_url',
                      selector='article',
                      attrs=(Attr(name='url',
                                  selector='a',
                                  func='sel_url',
                                  source=Source(active=False)), )),
             Template(name='next_page_url',
                      selector='a.pager',
                      attrs=(Attr(name='url',
Example #11
0
from workers import WebSource
from parsers import HTMLParser
import string


meertens = ScrapeModel(
    name='namen', domain='http://www.meertens.knaw.nl/', num_getters=2, phases=[
    Phase(source_worker=WebSource, parser=HTMLParser, sources=(
        Source(url="http://www.meertens.knaw.nl/nvb/naam/begintmet/" + l)
                    for l in ['Aad']), # string.ascii_lowercase),
        templates=[
            Template(
                name='name', selector='tr.data',
                db_type='mongo_db', db='names', table='name_count_test',
                attrs=[
                    Attr(name='name', selector='td:nth-of-type(1)',
                                func='sel_text'),
                    Attr(name='men', selector='td:nth-of-type(2)',
                                func='sel_text', kws={'numbers': True}),
                    Attr(name='women', selector='td:nth-of-type(3)',
                                func='sel_text', kws={'numbers': True}),
                    Attr(name='url', selector='td:nth-of-type(1) a',
                                func='sel_attr', kws={'attr': 'href'},
                                source={'active': False},
                                source_condition={'women': '> 50',
                                                  'men': '> 50'}),
                ]
            ),
            Template(
                name='next_url', selector='.right',
                attrs=[
                    Attr(name='next', selector='abc',  func='sel_attr',
Example #12
0
from parsers import HTMLParser

pornstars = ScrapeModel(
    name='pornhub_pornstars',
    domain='http://pornhub.com',
    num_getters=2,
    phases=[
        Phase(source_worker=WebSource,
              parser=HTMLParser,
              sources=[Source(url='http://www.pornhub.com/pornstars?o=a')],
              templates=[
                  Template(name='alphabet',
                           selector='.alphabetFilter .dropdownWrapper li',
                           attrs=[
                               Attr(name='url',
                                    selector='a',
                                    func='sel_url',
                                    source=Source(active=False))
                           ])
              ]),
        Phase(source_worker=WebSource,
              parser=HTMLParser,
              templates=[
                  Template(name='pornstar',
                           selector='.pornstarIndex li',
                           db_type='MongoDB',
                           db='pornstars',
                           collection='ranking',
                           attrs=[
                               Attr(name='name',
                                    selector='.title',
                                    func='sel_text'),
Example #13
0
from dispatcher import Dispatcher
from components import ScrapeModel, Phase, Template, Attr, Source
from workers import WebSource
from parsers import HTMLParser
import string
from pymongo import MongoClient


# Scrape model for the signature list of one petition on petities.nl.
# It has a single phase with two templates applied to the same page.
petitions = ScrapeModel(
    name='petitions',
    domain='https://petities.nl/',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            # One start url: the signature list of a single petition.
            sources=(
                Source(
                    url="https://petities.nl/petitions/borstkankeronderzoek-vervroegen/signatures?locale=nl",
                ),
            ),
            templates=[
                # Pagination links; the url attribute is flagged with
                # ``source=True`` and this template has no db settings.
                Template(
                    name='next_page',
                    selector='.navigation-bar .navigation-bar',
                    attrs=[
                        Attr(
                            name='url',
                            selector='a',
                            func='sel_url',
                            kws={'attr': 'href'},
                            source=True,
                        ),
                    ],
                ),
                # The signature list itself, configured with db_type
                # 'mongo_db', db 'petitions' and table 'borstkanker'.
                Template(
                    name='signature',
                    selector='.petition-signature-list',
                    db_type='mongo_db',
                    db='petitions',
                    table='borstkanker',
                    attrs=[
                        Attr(
                            name='name',
                            selector='.petition-signature-name',
                            func='sel_text',
                        ),
                        Attr(
                            name='time',
                            selector='.signature-time',
                            func='sel_text',
                        ),
                        Attr(
                            name='location',
                            selector='.petition-signature-location',
                            func='sel_text',
                        ),
                        Attr(
                            name='occupation',
                            selector='.petition-signature-occupation',
                            func='sel_text',
                        ),
                    ],
                ),
            ],
        ),
    ],
)
Example #14
0
 ],
 templates=(Template(
     name='episode',
     selector='.so-panel.widget.widget_siteorigin-panels-builder',
     db_type='shell_command',
     db='theoffice',
     table='season',
     kws={
         'command':
         'sudo mkdir -p ' + filepath + '/{season}/ &' +
         ' sudo youtube-dl -o /mnt/Movies/{season}/{episode} {url}'
     },
     attrs=(
         Attr(name='url',
              selector='a',
              func=['sel_url', 'sel_text'],
              kws=[{}, {
                  'needle': r'.*(s\d+e\d+)'
              }]),
         Attr(name='episode',
              selector='.textwidget',
              func='sel_text',
              kws={
                  'index': 3,
                  'substitute': '_',
                  'replacers': ' '
              }),
         Attr(name='season',
              selector='.textwidget',
              func='sel_text',
              kws={
                  'index': 1,
# Client, database and collection handles for the scraped number ranges.
# Dictionary-style access is equivalent to attribute access in pymongo.
cl = MongoClient()
db = cl['gsmhelpdesk_nummerreeksen']
col = db['number_range']

# Scrape model with a single phase: every ``tr`` on the number-range page
# becomes a 'number_range' object with a numeric 'start' and 'end' attribute.
gsmhelpdesk_nummerreeksen = ScrapeModel(
    name='gsmhelpdesk_nummerreeksen',
    domain='http://www.gsmhelpdesk.nl/',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[
                Source(
                    url="http://www.gsmhelpdesk.nl/helpdesk/30/nummerreeksen",
                ),
            ],
            templates=(
                Template(
                    name='number_range',
                    selector='tr',
                    db_type='mongo_db',
                    db='gsmhelpdesk_nummerreeksen',
                    table='number_range',
                    attrs=(
                        Attr(
                            name='start',
                            selector='td:nth-of-type(1)',
                            func='sel_text',
                            kws={'numbers': True},
                        ),
                        Attr(
                            name='end',
                            selector='td:nth-of-type(2)',
                            func='sel_text',
                            kws={'numbers': True},
                        ),
                    ),
                ),
            ),
        ),
    ],
)

# Register the model with a dispatcher and start it.
disp = Dispatcher()
disp.add_scraper(gsmhelpdesk_nummerreeksen)
disp.run()
Example #16
0
 sources=[
     Source(url='http://funda.nl/huur/amsterdam/woonhuis/'),
     Source(url='http://funda.nl/huur/amsterdam/appartement/'),
     Source(url='http://funda.nl/koop/amsterdam/woonhuis/'),
     Source(url='http://funda.nl/koop/amsterdam/appartement'),
 ],
 templates=[
     Template(
         name='house',
         selector='.search-result',
         db_type='mongo_db',
         db='funda',
         table='for_hire',
         attrs=[
             Attr(name='price',
                  selector='.search-result-price',
                  func='sel_text',
                  kws={'numbers': True}),
             Attr(name='street',
                  selector='.search-result-title',
                  func='sel_text'),
             Attr(name='realtor',
                  selector='.realtor',
                  func='sel_text'),
             Attr(name='rooms',
                  selector='.search-result-info',
                  func='sel_text',
                  kws={
                      'regex': '(\d+) kamers',
                      'numbers': True
                  }),
             Attr(name='zip',
Example #17
0
thuisbezorgd = ScrapeModel(
    name='thuisbezorgd',
    domain='http://thuisbezorgd.nl',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[Source(url="https://www.thuisbezorgd.nl/")],
            templates=(
                Template(
                    name='sections',
                    selector='',
                    attrs=(
                        Attr(name='url',
                             selector='a[href*="eten-bestellen-"]',
                             func='sel_url',
                             source=Source()),  # source is for next run
                    )),
                Template(
                    name='restaurant',
                    selector='.restaurant',
                    db_type='MongoDB',
                    db='thuisbezorgd',
                    table='restaurants',
                    attrs=(
                        Attr(name='url',
                             selector='a.restaurantname',
                             func='sel_url',
                             source=Source(
                                 active=False,
                                 src_template='{}')),  # source is for next run
Example #18
0
# TODO Set the right classes for the websites.
from dispatcher import Dispatcher
from components import Attr, Source, HTMLObject, Phase, ScrapeModel, Follow

kinky =ScrapeModel(name='kinky', domain='kinky.nl', phases=[
    Phase(source=[Source(url='http://kinky.nl/sex-afspraken/vrouwen/default.aspx?pagesize=4500')],
        templates=[Template(name='advert', selector='.advertentie_kop > a',
                 attrs=[Attr(name='url', source={'active': False})])
        ]),
    Phase(templates=[
        Template(name='advert', selector='.advertentie_kop > a', db='kinky.nl', table='adverts',
                Attr(name='add_text', func= 'sel_text', selector='description p'),
                Attr(name='possibilities', func= 'sel_text', selector= '.possibilities li'),
                Attr(name='update', func= 'sel_text', selector='update'),
                Attr(name='town', func= 'sel_text', selector= '.naw div', kws={'regex': '.*Plaats: ([A-z]*);'}),
                Attr(name='work_area', func= 'sel_text', selector='naw div', kws={'regex': '.*Werkgebied: '}),
                Attr(name='prices', func= 'sel_text', selector='.prizes td:nth-of-type(2)'),
                Attr(name='pictures', func= 'sel_attr', selector='.galleryRow img', kws={'attr': 'src'}),
                Attr(name='phone', selector='.mainprofile .webbutton span', func='sel_text'),
                Attr(name='name', selector='h1.title', func='sel_text'),
                    '__reg__age': ['.naw div', '.*Leeftijd: (\d\d)'],
                    '__reg__length': ['.naw div', '.*Lengte: (\d\d\d)'],
                    '__reg__hair_color': ['.naw div', '.*Kleur haar: ([a-z]*);'],
                    '__reg__build': ['.naw div', '.*Lichaamsbouw: ([A-z]*);'],
                    '__reg__looks': ['.naw div', '.*Uiterlijk: ([A-z]*);'],
                ,
            ,
        ,
        'start': [
            'meta': {
                'sex': 'female',