Beispiel #1
0
def main(model, dummy):
    """Load every ScrapeModel defined in one scrape module and run them.

    Args:
        model: Name of a module inside the ``scrape_models`` package, or a
            one-element sequence holding that name (it is unwrapped).
        dummy: Forwarded unchanged to ``Dispatcher.add_scraper`` as the
            ``dummy`` keyword argument.

    Returns:
        None. When the requested name is not in ``available_models`` the
        available names are printed and nothing is imported or run.
    """
    # A one-element container is reduced to the element it holds.
    if len(model) == 1:
        model = model[0]
    dispatcher = Dispatcher()
    if model not in available_models:
        print('Model', model, 'is not in the folder "scrape_models".')
        print('These models are available:')
        pprint.pprint(available_models, compact=True)
        return
    module = importlib.import_module(f'scrape_models.{model}')
    # isinstance instead of an exact type comparison, so instances of
    # ScrapeModel subclasses are collected as well. A distinct loop name
    # keeps the `model` parameter from being shadowed.
    scrape_models = [
        candidate for candidate in vars(module).values()
        if isinstance(candidate, ScrapeModel)
    ]
    dispatcher.add_scraper(scrape_models, dummy=dummy)
    dispatcher.run()
Beispiel #2
0
from modelscraper.dispatcher import Dispatcher

import nu_nl
import metronieuws
import parool
import volkskrant

# Only the volkskrant model is registered; the other imported model modules
# are not passed to this dispatcher.
disp = Dispatcher()
disp.add_scraper([volkskrant.volkskrant])
disp.run()
Beispiel #3
0
# Attributes collected for one product, each paired with its CSS selector.
# NOTE: kws={'replacers': '€ '} on the two price attributes is currently
# disabled.
_product_attrs = [
    Attr(name='name', selector='h1', func='sel_text'),
    Attr(name='price_gross', selector='.price-gross', func='sel_text'),
    Attr(name='price_net', selector='.price-net', func='sel_text'),
    Attr(name='sku', selector='.articlenumber', func='sel_text'),
    Attr(name='description', selector='.tab-1', func='sel_text'),
    Attr(name='category', selector='li.normal', func='sel_text'),
]

# Template targeting table 'products' of database 'makro' (db_type 'mongo_db').
_product_template = Template(
    name='product',
    db_type='mongo_db',
    db='makro',
    table='products',
    attrs=_product_attrs,
)

# Phase pairing `sources` (defined elsewhere in the file) with the
# product template.
product = Phase(sources=sources, templates=[_product_template])

# Scrape model for makro.nl consisting of the single `product` phase.
makro = ScrapeModel(name='makro',
                    domain='https://www.makro.nl/',
                    num_getters=1,
                    phases=[product])

# Register the model with a dispatcher and start it.
d = Dispatcher()
d.add_scraper(makro)
d.run()
Beispiel #4
0
                            selector='a',
                            func='sel_attr',
                            kws={'attr': 'href'},
                            source={'active': False}),
                   ])
      ]),

# 'url' values of the documents already present in nos_nl.articles,
# collected category by category.
parsed = [
    article_doc['url']
    for category in categories
    for article_doc in cl.nos_nl.articles.find({'category': category})
]

# One Source per document in nos_nl.article_urls whose URL is not in
# `parsed`; the document's category is attached as a fixed-value attribute.
nos_sources = [
    Source(
        url=entry['url'],
        attrs=[Attr(name='category', value=entry['category'])],
    )
    for entry in cl.nos_nl.article_urls.find()
    if entry['url'] not in parsed
]
# `article` is called with storage settings; presumably this returns the
# article template configured for that store (confirm against Template).
_stored_article = article(db_type='mongo_db', db='nos_nl', table='articles')

# Single phase over the not-yet-parsed sources.
_article_phase = Phase(
    n_workers=5,
    sources=nos_sources,
    templates=(_stored_article, ),
)

nos = ScrapeModel(
    name='nos.nl',
    domain='http://nos.nl',
    num_getters=10,
    phases=[_article_phase],
)

# Register the model with a dispatcher and start it.
disp = Dispatcher()
disp.add_scraper(nos)
disp.run()
Beispiel #5
0
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.parsers import TextParser
from modelscraper.dispatcher import Dispatcher
from pymongo import MongoClient


# Client used to enumerate the stored episodes.
cl = MongoClient()

# Generator yielding one Source per document in nos_journaal.episodes.
# The request URL is the fixed tt888 prefix plus the last path segment of
# the first entry in the document's 'url' value; the complete 'url' value
# is attached to the source as an attribute.
urls = (
    Source(
        url='https://tt888.omroep.nl/tt888/'
            + episode['url'][0].split('/')[-1],
        attrs=(Attr(name='url', value=episode['url']), ),
    )
    for episode in cl.nos_journaal.episodes.find()
)

# Template configured with func='update' and key 'url' against
# nos_journaal.episodes; it carries a single 'subtitles' attribute.
_subtitle_template = Template(
    name='subtitle',
    db_type='mongo_db',
    db='nos_journaal',
    table='episodes',
    func='update',
    kws={'key': 'url'},
    attrs=(Attr(name='subtitles', func='sel_text'), ),
)

# The phase uses TextParser and takes its sources from `urls`.
_subtitle_phase = Phase(
    n_workers=5,
    sources=urls,
    parser=TextParser,
    templates=(_subtitle_template, ),
)

subtitles = ScrapeModel(
    name='subtitles',
    domain='https://tt888.omroep.nl/',
    phases=[_subtitle_phase],
)

# Remove the module-level client name before the dispatcher runs.
del cl
d = Dispatcher()
d.add_scraper(subtitles)
d.run()
Beispiel #6
0
                                       func='sel_url',
                                       source=Source(active=False)), )), )),

# Attributes of one episode, selected inside the template's element.
_episode_attrs = (
    Attr(name='date',
         selector='ul.the-player-meta-block__date-tags',
         func='sel_text'),
    Attr(name='description',
         selector='.overflow-description',
         func='sel_text'),
)

# Template configured with func='update' against table 'episodes' of
# database 'dwdd'.
_episode_template = Template(
    name='episode',
    selector='.column-player-info',
    db='dwdd',
    func='update',
    table='episodes',
    db_type='mongo_db',
    attrs=_episode_attrs,
)

# `sources` is defined elsewhere in the file.
_episode_phase = Phase(
    n_workers=10,
    sources=sources,
    templates=(_episode_template, ),
)

npo_tv_programs = ScrapeModel(
    name='npo_tv_programs',
    domain='http://npo.nl',
    num_getters=2,
    phases=[_episode_phase],
)

# Register the model with a dispatcher and start it.
disp = Dispatcher()
disp.add_scraper(npo_tv_programs)
disp.run()
Beispiel #7
0
                Attr(name='make', func='sel_text',
                            selector='#ctl00_cphMain_hHeadMake'),
                Attr(name='year', func='sel_text',
                            selector='#ctl00_cphMain_hHeadYear'),
                Attr(name='model', func='sel_text',
                    selector='.breadcrumbs a:last-of-type'),
                Attr(name='part_category_urls',
                    selector='.category a:last-of-type',
                    func='sel_url'),
            )),
            )
        ),
        Phase(source_worker=WebSource, parser=HTMLParser, templates=(
            Template(name='part', selector='.scrollable-area-2 .cart-table tr',
                     db='motorparts', table='parts', func='update',
                     db_type='MongoDB',
                     attrs=(
                         Attr(name='part_number', func='sel_text',
                              selector='h4 + span'),
                         Attr(name='amount', func='sel_text',
                              selector='.col-2 span:last-of-type'),
                         Attr(name='drawing_number', func='sel_text',
                              selector='.col-1 span'),
                     )),
        ))
])

# Register the `motorparts` model with a dispatcher and start it.
disp = Dispatcher()
disp.add_scraper(motorparts)
disp.run()
Beispiel #8
0
# Attribute 'pagename', extracted with the 'sel_text' function.
sub_page_name = Attr(name='pagename', func='sel_text')

# Template for elements matching '.sections a', targeting table 'subpages'
# of database 'startpagina'. `sub_page_url` is defined elsewhere in the file.
category_temp = Template(name='sub_page',
                         selector='.sections a',
                         db='startpagina',
                         table='subpages',
                         db_type='mongo_db',
                         attrs=(sub_page_url,
                                sub_page_name))

# URL and name attributes of a website link; neither has its own selector.
website_url = Attr(name='url', func='sel_url')
website_name = Attr(name='name', func='sel_text')

# Template for elements matching '#columns a', targeting table 'websites'
# of database 'startpagina'.
website_temp = Template(name='website',
                        selector='#columns a',
                        db='startpagina',
                        table='websites',
                        db_type='mongo_db',
                        attrs=(website_url, website_name))

# First phase: starts from `start_url` and applies the category template.
_category_phase = Phase(
    n_workers=3,
    sources=[start_url],
    templates=[category_temp],
)

# Second phase: declares no sources of its own and applies the website
# template.
_website_phase = Phase(n_workers=3, templates=[website_temp])

model = ScrapeModel(
    name='startpagina',
    domain='http://www.startpagina.nl',
    phases=[_category_phase, _website_phase],
)

# Register the model with a dispatcher and start it.
d = Dispatcher()
d.add_scraper(model)
d.run()
Beispiel #9
0
                ]
            ),)
    ),
    Phase(synchronize=False,templates=[
            Template(
                name='player', selector='.squad--team-player',
                db_type='mongo_db', db='uefa', table='players',
                attrs=[
                    Attr(name='name', selector='.squad--player-name',
                                func='sel_text'),
                    Attr(name='player_url', selector='.squad--player-name a',
                                func='sel_url'),
                    Attr(name='img', selector='.squad--player-img img',
                                func='sel_attr', kws={'attr': 'src'}),
                ]
            ),
            # Template(
            #     name='team', selector='',
            #     db_type='mongo_db', func='update', db='uefa', table='players',
            #     attrs=[
            #         Attr(name='team', selector='h1.team-name', func='sel_text'),
            #     ]
            # )
        ]
    )]
)

# Register the `uefa` model with a dispatcher and start it.
disp = Dispatcher()
disp.add_scraper(uefa)
disp.run()
Beispiel #10
0
from modelscraper.sources import ProgramSource

# Attributes of one 'port' element. 'portnumber' has no selector of its own;
# 'state' and 'service' read an attribute from the element matched by their
# selector.
_port_attrs = (
    Attr(name='portnumber', func='sel_attr', kws={'attr': 'portid'}),
    Attr(name='state',
         selector='state',
         func='sel_attr',
         kws={'attr': 'state'}),
    Attr(name='service',
         selector='service',
         func='sel_attr',
         kws={'attr': 'name'}),
)

# Template for 'port' elements, targeting table 'ports' of database 'ports'.
port_template = Template(
    name='ports',
    selector='port',
    db_type='mongo_db',
    db='ports',
    table='ports',
    attrs=_port_attrs,
)
# Model with one phase whose only source "url" is an nmap command line
# (`-oX -` makes nmap write XML to stdout). The phase uses ProgramSource as
# its source worker, which presumably runs the command -- confirm.
nmap = ScrapeModel(
    name='nmap_test',
    domain='',
    phases=[
        Phase(sources=(Source(url='nmap -oX - duwo.multiposs.nl'), ),
              templates=[port_template],
              source_worker=ProgramSource)
    ])

# Register the model with a dispatcher and start it.
disp = Dispatcher()
disp.add_scraper(nmap)
disp.run()
Beispiel #11
0
                                   selector='h3 a',
                                   func='sel_url',
                                   source={'active': False}), ))
# Template for the '.pagers' element with one attribute taken from its 'a'
# elements via 'sel_url'. The attribute is created with source=True
# (presumably so the extracted URLs become new sources -- confirm).
pagination = Template(name='pagination',
                      selector='.pagers',
                      attrs=(Attr(name='page',
                                  selector='a',
                                  func='sel_url',
                                  source=True), ))

# Phase 1: starts at the site root and applies the category menu template.
_menu_phase = Phase(
    source_worker=WebSource,
    parser=HTMLParser,
    sources=[Source(url='https://www.bedrijvenpagina.nl/')],
    templates=(category_menu, ),
)

# Phase 2: applies the result list and pagination templates.
_listing_phase = Phase(
    source_worker=WebSource,
    parser=HTMLParser,
    templates=(result_list, pagination),
)

# Phase 3: applies the company template.
_company_phase = Phase(
    source_worker=WebSource,
    parser=HTMLParser,
    templates=(company, ),
)

bedrijven_pagina = ScrapeModel(
    name='Bedrijven Pagina',
    domain='https://www.bedrijvenpagina.nl/',
    num_getters=2,
    phases=[_menu_phase, _listing_phase, _company_phase],
)

# Register the model with a dispatcher and start it.
disp = Dispatcher()
disp.add_scraper(bedrijven_pagina)
disp.run()
Beispiel #12
0
              templates=[
                  Template(name='article',
                           selector='#content',
                           db_type='mongo_db',
                           db='dabanga',
                           table='article',
                           attrs=[
                               Attr(name='title',
                                    selector='h1',
                                    func='sel_text'),
                               Attr(name='text',
                                    selector='.article .body-text',
                                    func='sel_text'),
                               Attr(name='date',
                                    selector='.article .time',
                                    func='sel_text'),
                               Attr(name='place',
                                    selector='.article .place',
                                    func='sel_text'),
                               Attr(name='img',
                                    selector='.article img',
                                    func='sel_attr',
                                    kws={'attr': 'src'}),
                           ]),
              ])
    ])

# Register the `dabanga` model with a dispatcher and start it.
d = Dispatcher()
d.add_scraper(dabanga)
d.run()