Ejemplo n.º 1
0
# Scraper model for erowid.org experience reports.
# Phase 1 reads the report listing table; phase 2 declares the template used
# for the individual report pages.
erowid = ScrapeModel(
    name='erowid',
    domain='https://www.erowid.org/experiences/',
    num_getters=2,
    phases=[
        # Phase 1: a single listing URL; every row of .exp-list-table is
        # matched by the 'report_url' template.
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[
                Source(
                    url="https://www.erowid.org/experiences/exp.cgi?ShowViews=1&Cellar=0&Start=0&Max=24777"
                )
            ],
            templates=(
                Template(
                    name='report_url',
                    selector='.exp-list-table tr',
                    # NOTE(review): presumably turns each row's url into a
                    # source for the next phase and carries the attrs below
                    # along with it -- confirm against the Template API.
                    source={'active': False, 'copy_attrs': True},
                    attrs=(
                        Attr(name='url',
                             selector='td:nth-of-type(2) a',
                             func='sel_url'),
                        Attr(name='title',
                             selector='td:nth-of-type(2) a',
                             func='sel_text'),
                        Attr(name='rating',
                             selector='td:nth-of-type(1) img',
                             func='sel_attr',
                             kws={'attr': 'alt'}),
                        Attr(name='author',
                             selector='td:nth-of-type(3)',
                             func='sel_text'),
                        Attr(name='substances',
                             selector='td:nth-of-type(4)',
                             func='sel_text',
                             kws={
                                 'replacers': '&',
                                 'substitute': ',',
                                 # Raw string: '\-' and '\s' are not valid
                                 # str escapes and raise a warning in a
                                 # plain literal. The pattern text itself
                                 # is unchanged.
                                 # NOTE(review): the 'A-z' range also spans
                                 # the punctuation between 'Z' and 'a'
                                 # ([ \ ] ^ _ `); confirm that is intended.
                                 'regex': r'([A-z0-9\-]+\s*[A-z0-9\-*\s]*)',
                             }),
                        Attr(name='date',
                             selector='td:nth-of-type(5)',
                             func='sel_text'),
                        Attr(name='views',
                             selector='td:nth-of-type(6)',
                             func='sel_text'),
                    ),
                ),
            ),
        ),
        # Phase 2: lists no sources of its own; the 'drug_report' template
        # is configured with db_type/db/table for storage.
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            templates=(
                Template(
                    name='drug_report',
                    selector='',
                    db_type='mongo_db',
                    db='erowid',
                    table='drug_report',
                    attrs=(
                        Attr(name='text',
                             selector='.report-text-surround',
                             func='sel_text'),
                        Attr(name='weight',
                             selector='td.bodyweight-amount',
                             func='sel_text'),
                    ),
                ),
            ),
        ),
    ],
)
Ejemplo n.º 2
0
# Scraper model for the ghostlyhaks.com ROM/EFI forum section.
# Phase 1 walks the topic listing (including pagination links); phase 2
# declares a template matching .zip/.tar links.
efi_dumps = ScrapeModel(
    name="efi_dumps",
    domain="https://ghostlyhaks.com/",
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[
                Source(url="https://ghostlyhaks.com/forum/rom-eeprom-bios-efi-uefi"),
            ],
            templates=(
                # One item per row of the topic table.
                Template(
                    name="forum_post",
                    selector=".kbody tr",
                    db_type="mongo_db",
                    db="efi_dumps",
                    table="forum_post",
                    attrs=(
                        # The attached Source is meant for the next run.
                        Attr(
                            name="url",
                            selector="a.ktopic-title",
                            func="sel_url",
                            source=Source(active=False),
                        ),
                        Attr(name="user", selector=".kwho-user", func="sel_text"),
                        Attr(name="user_url", selector=".kwho-user", func="sel_url"),
                    ),
                ),
                # Pagination links; the attached Source is meant for the
                # next run.
                Template(
                    name="next_page",
                    selector=".kpagination",
                    attrs=[
                        Attr(
                            name="url",
                            selector="a",
                            func="sel_url",
                            source=Source(),
                        ),
                    ],
                ),
            ),
        ),
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            templates=(
                # Anchors whose href contains ".zip" or ".tar".
                Template(
                    name="forum_post",
                    selector='a[href*=".zip"], a[href*=".tar"]',
                    db_type="mongo_db",
                    db="efi_dumps",
                    table="efi_dumps",
                    attrs=[Attr(name="url", selector="", func="sel_url")],
                ),
            ),
        ),
    ],
)
Ejemplo n.º 3
0
# Scraper model for nytimes.com: a 'menu' phase over the site navigation and
# an 'articlelist' phase with the article field selectors.
nytimes = ScrapeModel(name='nytimes', domain='https://www.nytimes.com/',
    num_getters=2, phases=[

    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url="https://www.nytimes.com/")],
        templates=(
            Template(
                name='menu', selector='#site-index-navigation li',
                db_type='MongoDB', db='nytimes', table='menu',
                attrs=(
                    Attr(name='url', selector='a', func='sel_url',
                        source=Source(active=False)), # source is for next run
                )
            ),
        )
    ),

    Phase(source_worker=WebSource, parser=HTMLParser, sources=[
        Source(url="https://www.nytimes.com/")],
        templates=(
            Template(
                name='articlelist', selector='',
                db_type='MongoDB', db='nytimes', table='articles',
                attrs=(
                    Attr(name='title', selector='h1', func='sel_text'),

                    # NOTE(review): 'text' is declared three times in this
                    # tuple with overlapping selectors; confirm which one is
                    # intended and drop the others.
                    Attr(name='text', selector='.story-body-supplemental p',
                         func='sel_text'),
                    Attr(name='writer', selector='span.byline-author',
                         func='sel_text'),
                    # Fixed: func was misspelled 'sel_tex' and the trailing
                    # comma after this entry was missing (syntax error).
                    Attr(name='text', selector='.story-body-supplemental',
                         func='sel_text'),
                    # NOTE(review): 'time.datine' looks like a typo for
                    # 'time.dateline' -- verify against the page markup.
                    Attr(name='date', selector='time.datine', func='sel_attr',
                         kws={'attr': 'datetime'}),
                    Attr(name='related', func='sel_url',
                         selector='#related-combined-coverage a.story-link'),
                    Attr(name='text',
                         selector='.story-body-supplemental p', func='sel_text'),
                )
            ),
            # NOTE(review): the snippet was cut off at this point; the
            # closing brackets below were added so the definition parses.
            # Any templates or phases that followed are not recoverable here.
        )
    ),
])
Ejemplo n.º 4
0
# Scraper model for the French-Wikipedia list of Belgian governments.
# Phase 1 collects links from the second column of the list table; phase 2
# declares the per-row template stored as 'politicians'.
belgian_parlement_roles = ScrapeModel(
    name="belgian_parlement_roles",
    domain="https://fr.wikipedia.org/",
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[
                Source(
                    url="https://fr.wikipedia.org/wiki/Liste_des_gouvernements_de_la_Belgique"
                ),
            ],
            templates=(
                Template(
                    name="government",
                    selector=".wikitable tr td:nth-of-type(2)",
                    attrs=(
                        # The attached Source is meant for the next run.
                        Attr(
                            name="url",
                            selector="a",
                            func="sel_url",
                            source=Source(active=False),
                        ),
                    ),
                ),
            ),
        ),
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            templates=(
                # Rows of the first table: link from column 2, title text
                # from column 1.
                Template(
                    name="government",
                    selector="table:nth-of-type(1) tr",
                    db_type="mongo_db",
                    db="belgian_politics",
                    table="politicians",
                    attrs=(
                        Attr(
                            name="url",
                            selector="td:nth-of-type(2) a",
                            func="sel_url",
                        ),
                        Attr(
                            name="title",
                            selector="td:nth-of-type(1)",
                            func="sel_text",
                        ),
                    ),
                ),
            ),
        ),
    ],
)
Ejemplo n.º 5
0
# Scraper model for kinky.nl adverts: one listing page per category, each
# tagged with a fixed 'sex' attribute, parsed by a single 'advert' template.
kinky = ScrapeModel(
    name="kinky",
    domain="http://www.kinky.nl/",
    cookies=kinkycookies,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            # Each (listing url, label) pair becomes one Source whose only
            # attr is the constant 'sex' label for that category.
            sources=[
                Source(url=listing_url, attrs=[Attr(name="sex", value=label)])
                for listing_url, label in (
                    (
                        "http://www.kinky.nl/sex-afspraken/mannen/default.aspx?pagesize=5000",
                        "man",
                    ),
                    (
                        "http://www.kinky.nl/sex-afspraken/vrouwen/default.aspx?pagesize=5000",
                        "vrouw",
                    ),
                    (
                        "http://www.kinky.nl/sex-afspraken/transsexuelen/default.aspx?pagesize=5000",
                        "trans",
                    ),
                    (
                        "http://www.kinky.nl/sex-afspraken/stellen/default.aspx?pagesize=5000",
                        "stellen",
                    ),
                    (
                        "http://www.kinky.nl/sex-afspraken/gay/default.aspx?pagesize=5000",
                        "gay",
                    ),
                )
            ],
            templates=[
                Template(
                    name="advert",
                    selector="#advertenties > div",
                    db_type="mongo_db",
                    db="kinky",
                    table="adds",
                    attrs=[
                        # The regex captures whatever follows the
                        # "Mijn telefoonnummer: " label.
                        Attr(
                            name="phone",
                            selector=".quickinfo > span",
                            func="sel_text",
                            kws={
                                "children": True,
                                "debug": True,
                                "regex": "Mijn telefoonnummer: (.*)",
                            },
                        ),
                        Attr(
                            name="city",
                            selector=".quickinfo span.country",
                            func="sel_text",
                        ),
                        Attr(
                            name="url",
                            selector=".advertentie_kop a",
                            func="sel_attr",
                            kws={"attr": "href"},
                        ),
                    ],
                ),
            ],
        ),
    ],
)
Ejemplo n.º 6
0
# Scraper model for southpark.cc.com with two phases that are configured
# identically (same source URL, same 'video' template).
southpark = ScrapeModel(
    name="southpark",
    domain="http://southpark.cc.com/",
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[Source(url="http://southpark.cc.com/")],
            templates=(
                Template(
                    name="video",
                    selector="",
                    db_type="MongoDB",
                    db="southpark",
                    table="video",
                    attrs=(
                        # The attached Source is meant for the next run.
                        Attr(
                            name="url",
                            selector="a",
                            func="sel_url",
                            source=Source(active=False),
                        ),
                        Attr(name="title", selector="h1", func="sel_text"),
                        Attr(name="text", selector="p", func="sel_text"),
                    ),
                ),
            ),
        ),
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[Source(url="http://southpark.cc.com/")],
            templates=(
                Template(
                    name="video",
                    selector="",
                    db_type="MongoDB",
                    db="southpark",
                    table="video",
                    attrs=(
                        # The attached Source is meant for the next run.
                        Attr(
                            name="url",
                            selector="a",
                            func="sel_url",
                            source=Source(active=False),
                        ),
                        Attr(name="title", selector="h1", func="sel_text"),
                        Attr(name="text", selector="p", func="sel_text"),
                    ),
                ),
            ),
        ),
    ],
)
Ejemplo n.º 7
0
# Scraper model for a fixed set of YouTube channel video listings.
# One template extracts the listed videos, a second one reads the
# "load more" button's data attribute to build the follow-up source.
youtube_channel = ScrapeModel(
    name="youtube_channel",
    domain="https://youtube.com/",
    num_getters=2,
    awaiting=True,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[
                Source(url="https://www.youtube.com/user/ozzymanreviews/videos"),
                Source(url="https://www.youtube.com/user/Draadstal/videos"),
                Source(url="https://www.youtube.com/channel/UCQMs9pijXYAdqvkEMJyCM4g/videos"),
                Source(url="https://www.youtube.com/channel/UCi1LpRIlG1tDY5Z54VTel2w/videos"),
                Source(url="https://www.youtube.com/user/vpro/videos"),
                Source(url="https://www.youtube.com/user/nprmusic/videos"),
            ],
            templates=(
                Template(
                    name="channel_videos",
                    selector="li.channels-content-item",
                    db_type="mongo_db",
                    db="youtube_channel",
                    table="channel_videos",
                    attrs=[
                        Attr(
                            name="url",
                            selector="h3.yt-lockup-title a",
                            func="sel_url",
                        ),
                        Attr(name="title", selector="h3", func="sel_text"),
                        # The regex captures the text preceding the Dutch
                        # word "weergaven".
                        Attr(
                            name="views",
                            selector=".yt-lockup-meta-info",
                            func="sel_text",
                            kws={"regex": "(.*) weergaven", "numbers": True},
                        ),
                    ],
                ),
                Template(
                    name="next_videos",
                    selector=".browse-items-load-more-button",
                    attrs=[
                        # NOTE(review): presumably the attribute value is
                        # substituted into src_template and the response is
                        # read through json_key -- confirm against Source.
                        Attr(
                            name="url",
                            func="sel_attr",
                            kws={"attr": "data-uix-load-more-href"},
                            source=Source(
                                src_template="http://youtube.com{}",
                                json_key=["content_html", "load_more_widget_html"],
                            ),
                        ),
                    ],
                ),
            ),
        ),
    ],
)
Ejemplo n.º 8
0
# Scraper model for headlines24.nl with two phases that are configured
# identically (same source URL, same 'category' template).
headlines = ScrapeModel(
    name="headlines",
    domain="http://www.headlines24.nl/",
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[Source(url="http://www.headlines24.nl/")],
            templates=(
                Template(
                    name="category",
                    selector="",
                    db_type="mongo_db",
                    db="headlines",
                    table="category",
                    attrs=(
                        # The attached Source is meant for the next run.
                        Attr(
                            name="url",
                            selector="a",
                            func="sel_url",
                            source=Source(active=False),
                        ),
                        Attr(name="title", selector="h1", func="sel_text"),
                        Attr(name="text", selector="p", func="sel_text"),
                    ),
                ),
            ),
        ),
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[Source(url="http://www.headlines24.nl/")],
            templates=(
                Template(
                    name="category",
                    selector="",
                    db_type="mongo_db",
                    db="headlines",
                    table="category",
                    attrs=(
                        # The attached Source is meant for the next run.
                        Attr(
                            name="url",
                            selector="a",
                            func="sel_url",
                            source=Source(active=False),
                        ),
                        Attr(name="title", selector="h1", func="sel_text"),
                        Attr(name="text", selector="p", func="sel_text"),
                    ),
                ),
            ),
        ),
    ],
)
Ejemplo n.º 9
0
# Scraper model for the paradiso.nl agenda.
# Phase 1 collects event links from the agenda page; phase 2 declares the
# 'event' template with the per-event fields.
paradiso = ScrapeModel(
    name='paradiso',
    domain='https://paradiso.nl',
    phases=[
        Phase(source_worker=WebSource,
              sources=[Source(url='https://paradiso.nl/web/Agenda.htm')],
              parser=HTMLParser,
              templates=[
                  Template(name='event_link',
                           selector='a.event-link',
                           attrs=[
                               Attr(name='url',
                                    func='sel_attr',
                                    kws={'attr': 'href'},
                                    source={'active': False})
                           ])
              ]),
        # This phase passes only templates (no source_worker, parser or
        # sources arguments).
        Phase(templates=[
            Template(name='event',
                     db_type='MongoDB',
                     db='paradiso',
                     table='events',
                     attrs=[
                         # NOTE(review): 'name' and 'date' read the same
                         # meta[name=evenementts] element -- confirm the
                         # selector for 'name' is intended.
                         Attr(name='name',
                              selector='meta[name=evenementts]',
                              func='sel_attr',
                              kws={'attr': 'content'}),
                         Attr(name='date',
                              selector='meta[name=evenementts]',
                              func='parse_attr',
                              kws={'attr': 'content'}),
                         Attr(name='time',
                              selector='meta[name=evenementtijd]',
                              func='parse_attr',
                              kws={'attr': 'content'}),
                         # Raw string: '\d' is not a valid str escape and
                         # raises a warning in a plain literal. The pattern
                         # (digits, a comma, optional digits) is unchanged.
                         Attr(name='price',
                              selector='.info p',
                              func='parse_text',
                              kws={'regex': r'(\d+,\d*)'}),
                     ])
        ])
    ])
Ejemplo n.º 10
0
# Scraper model for the volkskrant.nl archive.
# Phase 1: one archive page per year, collecting day links.
# Phase 2: article links on a page plus the pager link.
# Phase 3: the article fields themselves.
volkskrant = ScrapeModel(
    name='volkskrant',
    domain='http://www.volkskrant.nl/',
    num_getters=2,
    cookies={'nl_cookiewall_version': '1'},
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            # `today` is defined outside this block; it is used as the
            # exclusive upper bound of the year range.
            sources=[
                Source(url="http://www.volkskrant.nl/archief/{}".format(year))
                for year in range(1987, today)
            ],
            templates=(Template(name='day_url',
                                selector='td',
                                attrs=(Attr(
                                    name='url',
                                    selector='a',
                                    func='sel_url',
                                    source=Source(active=False)), )), )),
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            templates=(
                Template(name='article_url',
                         selector='article',
                         attrs=(Attr(name='url',
                                     selector='a',
                                     func='sel_url',
                                     source=Source(active=False)), )),
                Template(name='next_page_url',
                         selector='a.pager',
                         attrs=(Attr(name='url',
                                     selector='',
                                     func='sel_url',
                                     source=True), )),
            ),
        ),
        Phase(source_worker=WebSource,
              parser=HTMLParser,
              templates=(Template(
                  name='article',
                  selector='',
                  db_type='MongoDB',
                  db='volkskrant',
                  table='articles',
                  attrs=(
                      Attr(name='url',
                           selector='a',
                           func='sel_url',
                           source=Source(active=False)),
                      Attr(name='title', selector='h1', func='sel_text'),
                      Attr(name='subtitle', selector='h2', func='sel_text'),
                      Attr(name='author',
                           selector='span[itemprop="author"]',
                           func='sel_text'),
                      # Fixed: this entry was also named 'author' although
                      # it selects the datePublished <time> element, so it
                      # duplicated the real author field's name.
                      Attr(name='date',
                           selector='time[itemprop="datePublished"]',
                           func='sel_text'),
                      Attr(name='category',
                           selector='meta[property="article:section"]',
                           func='sel_attr',
                           kws={'attr': 'content'}),
                      Attr(name='description',
                           selector='p[itemprop="description"]',
                           func='sel_text'),
                      Attr(name='text',
                           selector='.article__body__paragraph',
                           func='sel_text'),
                  )), )),
    ])
Ejemplo n.º 11
0
# Three-phase scraper model for the Meertens first-name database.
# Phase 1: name listing pages; phase 2: per-name detail tables plus links to
# the four history series; phase 3: the history data embedded in scripts.
meertens = ScrapeModel(
    name='namen', domain='http://www.meertens.knaw.nl/', num_getters=2, phases=[
    Phase(source_worker=WebSource, parser=HTMLParser, sources=(
        # Only the 'Aad' prefix is requested here; the original carried a
        # commented-out string.ascii_lowercase alternative for this list.
        Source(url="http://www.meertens.knaw.nl/nvb/naam/begintmet/" + prefix)
                    for prefix in ['Aad']),
        templates=[
            Template(
                name='name', selector='tr.data',
                db_type='mongo_db', db='names', table='name_count_test',
                attrs=[
                    Attr(name='name', selector='td:nth-of-type(1)',
                                func='sel_text'),
                    Attr(name='men', selector='td:nth-of-type(2)',
                                func='sel_text', kws={'numbers': True}),
                    Attr(name='women', selector='td:nth-of-type(3)',
                                func='sel_text', kws={'numbers': True}),
                    # NOTE(review): source_condition presumably restricts
                    # which rows become sources (both counts > 50) --
                    # confirm the exact semantics against the Attr API.
                    Attr(name='url', selector='td:nth-of-type(1) a',
                                func='sel_attr', kws={'attr': 'href'},
                                source={'active': False},
                                source_condition={'women': '> 50',
                                                  'men': '> 50'}),
                ]
            ),
            Template(
                name='next_url', selector='.right',
                attrs=[
                    # NOTE(review): selector 'abc' looks like a placeholder.
                    Attr(name='next', selector='abc',  func='sel_attr',
                                kws={'attr': 'href'}, source={'active': True}),
                ])
            ]
    ),
    Phase(source_worker=WebSource, parser=HTMLParser, templates=[
            Template(
                name='name', selector='table.nameinfo', func='update',
                kws={'key': 'name'}, db_type='mongo_db', db='names',
                table='name_count_test',
                attrs=[
                    Attr(name='name', selector='div.name',
                                func='sel_text'),
                    Attr(name='men', func='sel_text',
                                kws={'numbers': True},
                                selector='tr:nth-of-type(2) td:nth-of-type(3)'),
                    Attr(name='men_second', func='sel_text',
                                kws={'numbers': True},
                                selector='tr:nth-of-type(3) td:nth-of-type(3)'),
                    Attr(name='women', func='sel_text',
                                kws={'numbers': True},
                                selector='tr:nth-of-type(6) td:nth-of-type(3)'),
                    Attr(name='women_second', func='sel_text',
                                kws={'numbers': True},
                                selector='tr:nth-of-type(7) td:nth-of-type(3)'),
                ]
            ),
            # The four 'data_url' templates below differ only in the href
            # fragment they match and in the 'sex_name' value attached to
            # the resulting source.
            Template(
                name='data_url', selector='a[href*="absoluut/man/eerstenaam"]',
                attrs=[
                    Attr(name='next', selector='a',  func='sel_attr',
                                kws={'attr': 'href'},
                                source=Source(active=False, attrs=[
                                    Attr(name='sex_name', value='men')
                                ])
                                ),
                ]
            ),
            Template(
                name='data_url', selector='a[href*="absoluut/man/volgnaam"]',
                attrs=[
                    Attr(name='next', selector='a',  func='sel_attr',
                                kws={'attr': 'href'},
                                source=Source(active=False, attrs=[
                                    Attr(name='sex_name',
                                                value='men_second')
                                ])
                                ),
                ]
            ),
            Template(
                name='data_url', selector='a[href*="absoluut/vrouw/eerstenaam"]',
                attrs=[
                    Attr(name='next', selector='a',  func='sel_attr',
                                kws={'attr': 'href'},
                                source=Source(active=False, attrs=[
                                    Attr(name='sex_name', value='women')
                                ])
                                ),
                ]
            ),
            Template(
                name='data_url', selector='a[href*="absoluut/vrouw/volgnaam"]',
                attrs=[
                    Attr(name='next', selector='a',  func='sel_attr',
                                kws={'attr': 'href'},
                                source=Source(active=False, attrs=[
                                    Attr(name='sex_name',
                                                value='women_second')
                                ])
                                ),
                ]
            ),
        ]
    ),
    Phase(source_worker=WebSource, parser=HTMLParser, templates=[
            Template(
                name='history', selector='#content', db_type='mongo_db', db='names',
                table='history2', kws={'key': 'name'}, attrs=[
                    Attr(name='name', selector='div.name', func='sel_text'),
                    Attr(name='years', selector='script', func='sel_js_array',
                                kws={'var_name': 'year_list', 'var_type': int}),
                    Attr(name='values', selector='script', func='sel_js_array',
                                kws={'var_name': 'value_list', 'var_type': float}),
                    # NOTE(review): the source had a truncated
                    # "Attr(name='step" entry here (unterminated string, so
                    # the module could not be parsed). Its remaining
                    # arguments are unknown; restore the entry from the
                    # upstream definition.
                ]
            )
            ]
        )
    ]
)
Ejemplo n.º 12
0
# Two-phase scraper model: phase 1 collects the alphabet filter links,
# phase 2 declares the per-entry ranking template and a pagination template.
pornstars = ScrapeModel(
    name="pornhub_pornstars",
    domain="http://pornhub.com",
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[Source(url="http://www.pornhub.com/pornstars?o=a")],
            templates=[
                Template(
                    name="alphabet",
                    selector=".alphabetFilter .dropdownWrapper li",
                    attrs=[
                        Attr(
                            name="url",
                            selector="a",
                            func="sel_url",
                            source=Source(active=False),
                        ),
                    ],
                ),
            ],
        ),
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            templates=[
                # NOTE(review): this template passes `collection=` where the
                # other templates in this file pass `table=` -- confirm
                # which keyword Template actually accepts.
                Template(
                    name="pornstar",
                    selector=".pornstarIndex li",
                    db_type="MongoDB",
                    db="pornstars",
                    collection="ranking",
                    attrs=[
                        Attr(name="name", selector=".title", func="sel_text"),
                        Attr(
                            name="rank",
                            selector=".rank_number",
                            func="sel_text",
                            kws={"numbers": True},
                        ),
                        Attr(
                            name="views",
                            selector=".pstarViews",
                            func="sel_text",
                            kws={"numbers": True},
                        ),
                        Attr(
                            name="videos",
                            selector=".videosNumber",
                            func="sel_text",
                            kws={"numbers": True},
                        ),
                        Attr(name="url", selector="a.title", func="sel_url"),
                        Attr(
                            name="image_url",
                            selector="img",
                            func="sel_attr",
                            kws={"attr": "src"},
                        ),
                    ],
                ),
                Template(
                    name="next_urls",
                    selector=".pagination3",
                    attrs=[
                        Attr(
                            name="url",
                            selector="a",
                            func="sel_url",
                            source=Source(),
                        ),
                    ],
                ),
            ],
        ),
    ],
)
Ejemplo n.º 13
0
import string
from pymongo import MongoClient


# Scrape model for the signature list of a single petition on petities.nl.
petitions = ScrapeModel(
    name='petitions',
    domain='https://petities.nl/',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            # Single starting point: the signature page of one petition.
            sources=(
                Source(url="https://petities.nl/petitions/borstkankeronderzoek-vervroegen/signatures?locale=nl"),
            ),
            templates=[
                # Links found inside the (nested) navigation bar.
                # NOTE(review): source=True presumably queues each URL as a
                # new source for paging -- confirm against the framework.
                Template(
                    name='next_page',
                    selector='.navigation-bar .navigation-bar',
                    attrs=[
                        Attr(
                            name='url',
                            selector='a',
                            func='sel_url',
                            kws={'attr': 'href'},
                            source=True,
                        ),
                    ],
                ),
                # One record per signature list element, stored under
                # db 'petitions' / table 'borstkanker'.
                Template(
                    name='signature',
                    selector='.petition-signature-list',
                    db_type='mongo_db',
                    db='petitions',
                    table='borstkanker',
                    attrs=[
                        Attr(
                            name='name',
                            selector='.petition-signature-name',
                            func='sel_text',
                        ),
                        Attr(
                            name='time',
                            selector='.signature-time',
                            func='sel_text',
                        ),
                        Attr(
                            name='location',
                            selector='.petition-signature-location',
                            func='sel_text',
                        ),
                        Attr(
                            name='occupation',
                            selector='.petition-signature-occupation',
                            func='sel_text',
                        ),
                    ],
                ),
            ],
        ),
    ],
)

# Register the model with a dispatcher and start the run.
ds = Dispatcher()
ds.add_scraper(petitions)
ds.run()
Ejemplo n.º 14
0
# Scrape model for watchtheofficeonline.com: one source URL is generated per
# season/episode combination and every matched episode panel is handed to a
# shell command that downloads the video.
theoffice = ScrapeModel(
    name='theoffice',
    domain='http://watchtheofficeonline.com',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            # Seasons 1-9 crossed with episode numbers 1-29.
            sources=[
                Source(url="http://watchtheofficeonline.com/s{}e{}".format(
                    season, episode)) for season in range(1, 10)
                for episode in range(1, 30)
            ],
            templates=(Template(
                name='episode',
                selector='.so-panel.widget.widget_siteorigin-panels-builder',
                db_type='shell_command',
                db='theoffice',
                table='season',
                kws={
                    # '&&' (not '&'): a single '&' would put mkdir in the
                    # background and start the download immediately, racing
                    # against the directory creation. With '&&' the download
                    # only starts once mkdir has finished successfully.
                    # `filepath` is not defined in this snippet; it has to be
                    # provided elsewhere in the module.
                    # NOTE(review): mkdir targets `filepath` while youtube-dl
                    # writes to /mnt/Movies -- confirm both point to the same
                    # directory.
                    # NOTE(review): {season}, {episode} and {url} look like
                    # placeholders filled with scraped values; if so they end
                    # up unquoted in a shell command -- confirm the framework
                    # escapes them.
                    'command':
                    'sudo mkdir -p ' + filepath + '/{season}/ &&' +
                    ' sudo youtube-dl -o /mnt/Movies/{season}/{episode} {url}'
                },
                attrs=(
                    # Two functions applied to the same selector, each with
                    # its own kwargs dict (positional pairing of func/kws).
                    Attr(name='url',
                         selector='a',
                         func=['sel_url', 'sel_text'],
                         kws=[{}, {
                             'needle': r'.*(s\d+e\d+)'
                         }]),
                    Attr(name='episode',
                         selector='.textwidget',
                         func='sel_text',
                         kws={
                             'index': 3,
                             'substitute': '_',
                             'replacers': ' '
                         }),
                    Attr(name='season',
                         selector='.textwidget',
                         func='sel_text',
                         kws={
                             'index': 1,
                             'replacers': ' '
                         }),
                )), )),
    ])
Ejemplo n.º 15
0
# Handles to the MongoDB database and collection named after this scraper.
cl = MongoClient()
db = cl['gsmhelpdesk_nummerreeksen']
col = db['number_range']

# Scrape model for the number-range table on gsmhelpdesk.nl: every table row
# yields a 'start' and an 'end' value taken from its first two cells.
gsmhelpdesk_nummerreeksen = ScrapeModel(
    name='gsmhelpdesk_nummerreeksen',
    domain='http://www.gsmhelpdesk.nl/',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[
                Source(url="http://www.gsmhelpdesk.nl/helpdesk/30/nummerreeksen"),
            ],
            templates=(
                Template(
                    name='number_range',
                    selector='tr',
                    db_type='mongo_db',
                    db='gsmhelpdesk_nummerreeksen',
                    table='number_range',
                    attrs=(
                        # First cell of the row.
                        Attr(
                            name='start',
                            selector='td:nth-of-type(1)',
                            func='sel_text',
                            kws={'numbers': True},
                        ),
                        # Second cell of the row.
                        Attr(
                            name='end',
                            selector='td:nth-of-type(2)',
                            func='sel_text',
                            kws={'numbers': True},
                        ),
                    ),
                ),
            ),
        ),
    ],
)

# Register the model with a dispatcher and start the run.
disp = Dispatcher()
disp.add_scraper(gsmhelpdesk_nummerreeksen)
disp.run()
Ejemplo n.º 16
0
# Scrape model for funda.nl search results in Amsterdam. The first phase reads
# four listing pages (huur/koop x woonhuis/appartement) and extracts one
# 'house' record per search result; the second phase is declared inactive and
# describes the 'bezichtiging' contact form.
funda = ScrapeModel(
    name='funda.nl',
    domain='http://funda.nl',
    num_sources=1,
    phases=[
        Phase(
            parser=HTMLParser,
            source_worker=WebSource,
            sources=[
                Source(url='http://funda.nl/huur/amsterdam/woonhuis/'),
                Source(url='http://funda.nl/huur/amsterdam/appartement/'),
                Source(url='http://funda.nl/koop/amsterdam/woonhuis/'),
                Source(url='http://funda.nl/koop/amsterdam/appartement'),
            ],
            templates=[
                Template(
                    name='house',
                    selector='.search-result',
                    db_type='mongo_db',
                    db='funda',
                    table='for_hire',
                    attrs=[
                        Attr(name='price',
                             selector='.search-result-price',
                             func='sel_text',
                             kws={'numbers': True}),
                        Attr(name='street',
                             selector='.search-result-title',
                             func='sel_text'),
                        Attr(name='realtor',
                             selector='.realtor',
                             func='sel_text'),
                        # All regex patterns below are raw strings so that
                        # '\d' and '\w' are not treated as (invalid) string
                        # escape sequences by the Python compiler.
                        Attr(name='rooms',
                             selector='.search-result-info',
                             func='sel_text',
                             kws={
                                 # Digits directly preceding ' kamers'.
                                 'regex': r'(\d+) kamers',
                                 'numbers': True
                             }),
                        Attr(name='zip',
                             selector='.search-result-subtitle',
                             func='sel_text',
                             # Four digits, a space, two word characters.
                             kws={'regex': r'(\d{4} \w{2})'}),
                        Attr(name='city',
                             func='sel_text',
                             selector='.search-result-subtitle',
                             # The word following the zip-code pattern.
                             kws={'regex': r'\d{4} \w{2} (\w+)'}),
                        Attr(
                            name='living_area',
                            func='sel_text',
                            selector=
                            '.search-result-info span[title="Woonoppervlakte"]',
                            kws={
                                'regex': r'(\d+)',
                                'numbers': True
                            }),
                        # The listing href is wrapped in the
                        # '{}bezichtiging/' template and registered as an
                        # inactive source.
                        Attr(name='meeting_url',
                             selector='.search-result-header a',
                             func='sel_attr',
                             kws={'attr': 'href'},
                             source={
                                 'src_template': '{}bezichtiging/',
                                 'active': False
                             }),
                    ]),
                # Pagination links; this template has no name.
                Template(
                    selector='.pagination',
                    attrs=[
                        Attr(
                            name='url',
                            selector='a',
                            func='sel_attr',
                            kws={'attr': 'href'},
                            # NOTE(review): the source argument is commented
                            # out, so these links are presumably extracted
                            # but not followed -- confirm this is intended.
                            # source=Source()
                        )
                    ])
            ]),
        Phase(
            parser=HTMLParser,
            source_worker=WebSource,
            active=False,
            templates=[
                Template(
                    name='bezichtiging',
                    selector='.makelaars-contact-form',
                    attrs=[
                        # Hidden verification token of the form.
                        Attr(
                            name='__RequestVerificationToken',
                            selector='input[name="__RequestVerificationToken"]',
                            func='sel_attr',
                            kws={'attr': 'value'}),
                        # Target URL taken from the form's action attribute.
                        Attr(name='url',
                             selector='form',
                             func='sel_attr',
                             kws={'attr': 'action'}),
                    ],
                    # Inactive POST source carrying fixed form field values.
                    source=Source(method='post',
                                  active=False,
                                  duplicate=True,
                                  data={
                                      'Day': '',
                                      'DayPart': '',
                                      'Opmerking': '',
                                      'Aanhef': 'Dhr',
                                      'Naam': 'Henk de Vries',
                                      'Email': '*****@*****.**',
                                      'ConfirmEmail': '',
                                      'Telefoon': '0205566206',
                                  }))
            ])
    ])
Ejemplo n.º 17
0
# Scrape model for thuisbezorgd.nl in two phases: the first collects section
# links and restaurant entries from the start page, the second extracts
# review data from the pages reached afterwards.
thuisbezorgd = ScrapeModel(
    name='thuisbezorgd',
    domain='http://thuisbezorgd.nl',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[Source(url="https://www.thuisbezorgd.nl/")],
            templates=(
                # Links whose href contains "eten-bestellen-"; the attached
                # Source is meant for the next run.
                Template(
                    name='sections',
                    selector='',
                    attrs=(
                        Attr(
                            name='url',
                            selector='a[href*="eten-bestellen-"]',
                            func='sel_url',
                            source=Source(),
                        ),
                    ),
                ),
                # One record per '.restaurant' element, stored under
                # db 'thuisbezorgd' / table 'restaurants'.
                Template(
                    name='restaurant',
                    selector='.restaurant',
                    db_type='MongoDB',
                    db='thuisbezorgd',
                    table='restaurants',
                    attrs=(
                        # Inactive source for the next run, URL passed
                        # through the '{}' template.
                        Attr(
                            name='url',
                            selector='a.restaurantname',
                            func='sel_url',
                            source=Source(active=False, src_template='{}'),
                        ),
                        Attr(
                            name='name',
                            selector='a.restaurantname',
                            func='sel_text',
                        ),
                    ),
                ),
            ),
        ),
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            templates=(
                # Review data, stored under db 'thuisbezorgd' /
                # table 'reviews'.
                Template(
                    name='reviews',
                    selector='',
                    db_type='MongoDB',
                    db='thuisbezorgd',
                    table='reviews',
                    attrs=(
                        # Inactive source for the next run.
                        Attr(
                            name='url',
                            selector='a',
                            func='sel_url',
                            source=Source(active=False),
                        ),
                        Attr(name='title', selector='h1', func='sel_text'),
                        Attr(name='text', selector='p', func='sel_text'),
                    ),
                ),
            ),
        ),
    ],
)
Ejemplo n.º 18
0
# TODO Set the right classes for the websites.
from dispatcher import Dispatcher
from components import Attr, Source, HTMLObject, Phase, ScrapeModel, Follow

kinky =ScrapeModel(name='kinky', domain='kinky.nl', phases=[
    Phase(source=[Source(url='http://kinky.nl/sex-afspraken/vrouwen/default.aspx?pagesize=4500')],
        templates=[Template(name='advert', selector='.advertentie_kop > a',
                 attrs=[Attr(name='url', source={'active': False})])
        ]),
    Phase(templates=[
        Template(name='advert', selector='.advertentie_kop > a', db='kinky.nl', table='adverts',
                Attr(name='add_text', func= 'sel_text', selector='description p'),
                Attr(name='possibilities', func= 'sel_text', selector= '.possibilities li'),
                Attr(name='update', func= 'sel_text', selector='update'),
                Attr(name='town', func= 'sel_text', selector= '.naw div', kws={'regex': '.*Plaats: ([A-z]*);'}),
                Attr(name='work_area', func= 'sel_text', selector='naw div', kws={'regex': '.*Werkgebied: '}),
                Attr(name='prices', func= 'sel_text', selector='.prizes td:nth-of-type(2)'),
                Attr(name='pictures', func= 'sel_attr', selector='.galleryRow img', kws={'attr': 'src'}),
                Attr(name='phone', selector='.mainprofile .webbutton span', func='sel_text'),
                Attr(name='name', selector='h1.title', func='sel_text'),
                    '__reg__age': ['.naw div', '.*Leeftijd: (\d\d)'],
                    '__reg__length': ['.naw div', '.*Lengte: (\d\d\d)'],
                    '__reg__hair_color': ['.naw div', '.*Kleur haar: ([a-z]*);'],
                    '__reg__build': ['.naw div', '.*Lichaamsbouw: ([A-z]*);'],
                    '__reg__looks': ['.naw div', '.*Uiterlijk: ([A-z]*);'],
                ,
            ,
        ,
        'start': [
            'meta': {
                'sex': 'female',