Example #1
0
# Parser instances shared by the templates below.
jsonp, textp, csvp, htmlp = (
    JSONParser(),
    TextParser(),
    CSVParser(),
    HTMLParser(),
)

# Template 'json_nested': reads from json_test and targets table 'tst' in
# both test_db and test_mongo.
json_nested = Template(
    source=json_test,
    database=[test_db, test_mongo],
    table='tst',
    name='json_nested',
    # Two selectors, one from the JSON parser and one from the HTML parser
    # (presumably applied in order, JSON first -- TODO confirm).
    selector=[jsonp.select('html'),
              htmlp.select('.content')],
    attrs=[
        # 'url' is produced by the HTML parser's text function on 'h1',
        # using the template string 'partialtest {}'.
        Attr(name='url',
             func=htmlp.text(selector='h1', template='partialtest {}'))
    ])

html_functions = Template(source=html,
                          name='html_functions',
                          database=test_mongo,
                          table='html_test',
                          dated=True,
                          emits=html2,
                          selector=htmlp.select('html'),
                          attrs=[
                              Attr(name='table',
                                   func=htmlp.table(selector='table')),
                              Attr(name='attr',
                                   func=htmlp.attr(selector='p',
                                                   attr='class')),
Example #2
0
    },
    'ip': {
        '$exists': False
    }
})
# Companies that have a website on record.  A pymongo cursor is single-pass,
# so each lazy consumer below iterates its own unevaluated clone(); sharing
# the cursor would make the generators steal documents from each other.
companies2 = MongoClient().defcon.companies.find({'website': {'$ne': None}})

# Base template carrying only the database settings.
defcon_base = Template(db='defcon', db_type='MongoDB')

# One hard-coded host with its bare, .git/config and .DS_STORE sources.
jacco_base = 'jackling.nl'
jacco = (Source(url=jacco_base), )
jacco_git = (Source(url='http://{}/.git/config'.format(jacco_base)), )
jacco_ds = (Source(url='http://{}/.DS_STORE'.format(jacco_base)), )

# One .git/config source per company; the company id travels along as 'kvk'.
git_sources = (Source(url='http://{}/.git/config'.format(c['website']),
                      attrs=[Attr(name='kvk', value=c['id'])],
                      copy_attrs='kvk') for c in companies2.clone())
# One .DS_STORE source per company, built the same way.
ds_store_sources = (Source(url='http://{}/.DS_STORE'.format(c['website']),
                           attrs=[Attr(name='kvk', value=c['id'])],
                           copy_attrs='kvk') for c in companies2.clone())

# Template 'Git exposed': targets table 'git' of the 'defcon' MongoDB
# database.
git_template = Template(name='Git exposed',
                        db_type='MongoDB',
                        db='defcon',
                        table='git',
                        # Single attribute 'vulnerable': 'sel_text' called
                        # with needle '[core]' (presumably a marker of a git
                        # config file -- TODO confirm).
                        attrs=(Attr(name='vulnerable',
                                    func='sel_text',
                                    kws={'needle': '[core]'}), ))

ds_store_template = Template(name='DS_STORE exposed',
                             db_type='MongoDB',
Example #3
0
    #    'tech',
    #    'opmerkelijk',
    #    'cultuur-en-media',
    #    'koningshuis',
]
# Archive window: one entry per day from 2010-01-01 up to, but excluding,
# today.
now = datetime.datetime.now()
begin = datetime.datetime(2010, 1, 1)
timezone = datetime.timezone(datetime.timedelta(hours=1))

dates = [
    begin + datetime.timedelta(days=offset)
    for offset in range((now - begin).days)
]

# Lazily yields one archive source per (category, day) pair; the category is
# attached as an attribute and copy_attrs is enabled.
nos_sources = (
    Source(
        url="http://nos.nl/nieuws/{}/archief/{}".format(
            section, day.strftime('%Y-%m-%d')),
        attrs=(Attr(name='category', value=section), ),
        copy_attrs=True,
    )
    for section in categories
    for day in dates
)

# Article attributes: each pairs a CSS selector with the name of the
# selection function to apply.
title_attr = Attr(name='title', selector='h1', func='sel_text')
text_attr = Attr(name='text', selector='.article_body', func='sel_text')
# The date is read from the 'datetime' attribute of the first <time> element.
date_attr = Attr(name='date',
                 selector='time:nth-of-type(1)',
                 func='sel_attr',
                 kws={'attr': 'datetime'})
author_attr = Attr(name='author',
                   selector='span[itemprop="author"]',
                   func='sel_text')
tags_attr = Attr(name='tags',
                 selector='.ib.space-right a.link-grey',
                 func='sel_text')
Example #4
0
from modelscraper.components import Attr, Model

# Bare product attributes: only the name is set here, no selector or func.
product_name = Attr(name='product_name')
price = Attr(name='price')
nutrition = Attr(name='nutrition')
brand_name = Attr(name='brand_name')
unitsize = Attr(name='unitsize')
availability = Attr(name='availability')
store_id = Attr(name='store_id')
category = Attr(name='category')
ingredients = Attr(name='ingredients')
url = Attr(name='url')

# The 'product' model, flagged as a definition, listing every product
# attribute.
product = Model(
    name='product',
    definition=True,
    attrs=[
        url,
        product_name,
        price,
        nutrition,
        brand_name,
        unitsize,
        availability,
        store_id,
        category,
        ingredients,
    ],
)
Example #5
0
# Handle on the 'episode' collection of db.
col = db.episode

# Lazily yields the listing pages 1 through 49.
sources = (
    Source(url="http://www.luckytv.nl/afleveringen/page/{}/".format(page))
    for page in range(1, 50)
)

# Scrape model for Lucky TV: a single phase that fetches `sources` with
# WebSource, parses them with HTMLParser and fills one 'episode' template.
LuckyTV = ScrapeModel(name='Lucky TV',
                      domain='http://www.luckytv.nl/',
                      num_getters=2,
                      phases=[
                          Phase(source_worker=WebSource,
                                parser=HTMLParser,
                                sources=sources,
                                # One 'episode' per 'article.video' match,
                                # targeting lucky_tv.episodes.
                                templates=(Template(
                                    name='episode',
                                    selector='article.video',
                                    db_type='mongo_db',
                                    db='lucky_tv',
                                    table='episodes',
                                    attrs=(
                                        # URL taken from the first <a>.
                                        Attr(name='url',
                                             selector='a:nth-of-type(1)',
                                             func='sel_url'),
                                        Attr(name='title',
                                             selector='.video__title',
                                             func='sel_text'),
                                        Attr(name='date',
                                             selector='.video__date',
                                             func='sel_text'),
                                    )), )),
                      ])
Example #6
0
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from pymongo import MongoClient


cl = MongoClient()
# Section id -> section name.
categories = {
    39: 'binnenland',
    2: 'buitenland',
}
category_url = "http://www.metronieuws.nl/getsectionlist/{}/{}/0"

# Section 39, pages 1..2399, each tagged with category 'binnenland'.
binnenland = (
    Source(
        url=category_url.format(39, page),
        json_key=['data'],
        attrs=[Attr(name='category', value='binnenland')],
    )
    for page in range(1, 2400)
)

# Section 2, pages 1..1899, each tagged with category 'buitenland'.
buitenland = (
    Source(
        url=category_url.format(2, page),
        json_key=['data'],
        attrs=[Attr(name='category', value='buitenland')],
    )
    for page in range(1, 1900)
)

# Materialise both generators, binnenland first.
sources = list(binnenland) + list(buitenland)

# Article attributes: each pairs a CSS selector with the name of the
# selection function to apply.
title_attr = Attr(name='title', selector='h1', func='sel_text')
text_attr = Attr(name='text', selector='.article_body', func='sel_text')
# The date is read from the 'datetime' attribute of the first <time> element.
date_attr = Attr(name='date', selector='time:nth-of-type(1)', func='sel_attr',
                 kws={'attr': 'datetime'})
author_attr = Attr(name='author', selector='span[itemprop="author"]',
                   func='sel_text')
tags_attr = Attr(name='tags', selector='.tag',
                 func='sel_text')

article = Template(
    name='article',
Example #7
0
from modelscraper.dispatcher import Dispatcher
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from pymongo import MongoClient
import string


# Entry point of the crawl.
start_url = Source(url='http://www.startpagina.nl/dochters/')
# URL of a sub page; the source options mark it inactive with parent=True
# (exact meaning is defined by modelscraper -- TODO confirm).
sub_page_url = Attr(name='sub_page', func='sel_url', source={'active': False,
                                                             'parent': True})
sub_page_name = Attr(name='pagename', func='sel_text')

# One 'sub_page' per '.sections a' match, targeting startpagina.subpages.
category_temp = Template(name='sub_page',
                         selector='.sections a',
                         db='startpagina',
                         table='subpages',
                         db_type='mongo_db',
                         attrs=(sub_page_url,
                                sub_page_name))

# Attributes extracted for every '#columns a' match.
website_url = Attr(name='url', func='sel_url')
website_name = Attr(name='name', func='sel_text')

# One 'website' per match, targeting startpagina.websites.
website_temp = Template(
    name='website',
    selector='#columns a',
    db='startpagina',
    table='websites',
    db_type='mongo_db',
    attrs=(website_url, website_name),
)

model = ScrapeModel(name='startpagina', domain='http://www.startpagina.nl',
                    phases=[
Example #8
0
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.parsers import JSONParser
from pymongo import MongoClient

cl = MongoClient()


# Article attributes; the selectors are keys (or key paths, given as tuples)
# into the parsed JSON document.
title_attr = Attr(name='title', selector='title', func='sel_text')
text_attr = Attr(name='text', selector='body_elements', func='sel_dict')
date_attr = Attr(name='date', selector='publish_date', func='sel_text')
author_attr = Attr(name='author', selector='written_by', func='sel_text')
tags_attr = Attr(name='tags', selector=('tags', 'name'), func='sel_text')
category_attr = Attr(name='category', selector=('section', 'name'),
                     func='sel_text')
counters_attr = Attr(name='counters', selector='counters', func='sel_text')
intro_attr = Attr(name='excerpt', selector='intro', func='sel_text')
# Named 'type' so it no longer collides with intro_attr's 'excerpt'.
type_attr = Attr(name='type', selector='type', func='sel_text')

article = Template(
    name='article',
    db='volkskrant', table='articles',
    db_type='mongo_db',
    attrs=(
        title_attr,
        text_attr,
        date_attr,
        author_attr,
        tags_attr,
        category_attr,
        counters_attr,
        intro_attr,
Example #9
0
 method='post',
 data=[('sid', str(i)), ('options[]', 'display_full_history'),
       ('options[]', 'use_cached_data_only'),
       ('action', 'View+Complete+Tracking+History')])
            for i in range(5000, 50000000)),
   templates=[
       Template(
           name='shipment',
           selector=None,
           db='shipments',
           db_type='MongoDB',
           table='shipment',
           attrs=[
               Attr(
                   name='carrier',
                   selector=
                   '#large_shipment_info_box > div:nth-child(2) > div:nth-child(1)',
                   func='sel_text',
                   kws={'regex': 'Carrier:\s(\w+)'}),
               Attr(
                   name='shipped_to',
                   selector=
                   '#large_shipment_info_box > div:nth-child(2) > div:nth-child(2) div:nth-child(1) .align_left',
                   func='sel_text'),
               Attr(
                   name='shipped_from',
                   selector=
                   '#large_shipment_info_box > div:nth-child(2) > div:nth-child(2) div:nth-child(2) .align_left',
                   func='sel_text'),
           ]),
       Template(name='event',
                selector='table tr:not(:nth-child(1))',
Example #10
0
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.workers import WebSource
from modelscraper.parsers import HTMLParser

# Generic song attributes; selectors are supplied per site further down.
title = Attr(name='title', func='sel_text')
artist = Attr(name='artist', func='sel_text')
midi_url = Attr(name='midi_url')

# Base 'song' template targeting midi.songs in MongoDB.
song = Template(name='song',
                db_type='MongoDB',
                db='midi',
                table='songs',
                attrs=[title, artist, midi_url])

# freemidi.org variant of `song`: calling the base template and attributes
# with keyword overrides (presumably yields adjusted copies -- TODO confirm
# against modelscraper's __call__ semantics).
freemidi_template = song(
    table='freemidi',
    selector='#mainContent div.col-xs-12:nth-of-type(1)',
    attrs=[
        title(selector=
              'li.active:nth-child(3) > a:nth-child(1) > span:nth-child(1)'),
        artist(
            selector=
            'ol.breadcrumb:nth-child(1) > li:nth-child(2) > a:nth-child(1) > span:nth-child(1)'
        ),
    ])

# Lazily yields download pages 0..25802; each source carries the matching
# getter URL as its 'midi_url' attribute.
freemidi_sources = (
    Source(
        url='https://freemidi.org/download-{}'.format(song_id),
        attrs=[
            midi_url(value='https://freemidi.org/getter-{}'.format(song_id)),
        ],
    )
    for song_id in range(25803)
)
Example #11
0
from pymongo import MongoClient

cl = MongoClient()

# Cookie dict carrying the cookie-wall version.
cookie = {'nl_cookiewall_version': '1'}

telegraaf_url = 'http://www.telegraaf.nl/jsp/search_result_page.jsp?method=&keyword=de&pagenr={}'
# Lazily yields search result pages 1 through 5000.
telegraaf_search = (
    Source(url=telegraaf_url.format(page_number))
    for page_number in range(1, 5001)
)

# Template 'archive_url' with an empty selector: collects the URLs of
# 'td a' links and hands them on as inactive sources.
calendar = Template(
    name='archive_url',
    selector='',
    attrs=(
        Attr(name='url',
             selector='td a',
             func='sel_url',
             source=Source(active=False)),  # source is for next run
    ))

# Template 'archive_url_year': one match per '.year-list__item'; the link
# URL is emitted as a source (source=True).
year = Template(
    name='archive_url_year',
    selector='.year-list__item',
    attrs=(
        Attr(name='url', selector='a', func='sel_url',
             source=True),  # source is for next run
    ))

article_url = Template(
    name='article_url',
    selector='ol.listing li',
    attrs=(
Example #12
0
from modelscraper.dispatcher import Dispatcher
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source


# Search endpoint template and the terms to query it with.
search_url = 'https://www.rtlnieuws.nl/search/nieuws/{}'
search_terms = ['economie', 'nederland']
# Article attributes: each pairs a CSS selector with the name of the
# selection function to apply.
title_attr = Attr(name='title', selector='h1', func='sel_text')
text_attr = Attr(name='text', selector='p', func='sel_text')
date_attr = Attr(name='date', selector='time', func='sel_text')
author_attr = Attr(name='author', selector='span[itemprop="author"]',
                   func='sel_text')
tags_attr = Attr(name='tags', selector='.tag-list a.cta',
                 func='sel_text')

# One 'article' per '.col__inner' match, carrying the five article
# attributes.
article = Template(
    name='article',
    selector='.col__inner',
    attrs=(title_attr, text_attr, date_attr, author_attr, tags_attr),
)

rtl= ScrapeModel(
    name='rtl', domain='http://www.rtlnieuws.nl/',
    num_getters=1, phases=[
        Phase(sources=[
            Source(url="http://www.parool.nl/archief/2012")],
            templates=(calendar, year)
Example #13
0
from modelscraper.components import Scraper, Model, Attr
from modelscraper.sources import ProgramSource
from modelscraper.parsers import HTMLParser
from modelscraper.databases import MongoDB

# Program-backed source; each url is substituted into the nmap command line
# 'nmap -oX - {}'.
nmap_source = ProgramSource(
    urls=['localhost'],
    test_urls=['localhost'],
    func='nmap -oX - {}',
)
parser = HTMLParser()

# Model 'ports': one record per 'port' element of nmap_source's output,
# targeting table 'ports' of the 'nmap' MongoDB database.
port_template = Model(source=nmap_source,
                      name='ports',
                      selector=parser.select('port'),
                      database=MongoDB(db='nmap'),
                      table='ports',
                      # portnumber: 'portid' attribute, no sub-selector;
                      # state/service: attributes of the 'state' and
                      # 'service' selections.
                      attrs=(Attr(name='portnumber',
                                  func=parser.attr(attr='portid')),
                             Attr(name='state',
                                  func=parser.attr(selector='state',
                                                   attr='state')),
                             Attr(name='service',
                                  func=parser.attr(selector='service',
                                                   attr='name'))))

# Scraper bundling the single ports model.
nmap = Scraper(name='nmap_test', models=[port_template])
Example #14
0
# Relative search endpoint (the query goes into the {} placeholder) and the
# absolute delegate endpoint.
base_search = 'service/rest/delegate?url=/zoeken?rq={}&searchType=product'
delegate_url = 'https://www.ah.nl/service/rest/delegate?url={}'
# Translation table mapping '[' -> '<' and ']' -> '>'.
table_trans = str.maketrans('[]', '<>')


def translate_table(text):
    """Return *text* with every '[' replaced by '<' and every ']' by '>'."""
    return text.translate(table_trans)

# Search source: one search URL per lowercase ASCII letter, formatted into
# the ah.nl url_template.
search = WebSource(name='search',
                   url_template='https://www.ah.nl/{}',
                   urls=(base_search.format(l) for l in ascii_lowercase))

# Product detail source; it only declares a test URL here.
product_test = 'producten/product/wi238928/ah-biologisch-schouderkarbonade'
product_source = WebSource(name='product_source', url_template=delegate_url,
                           test_urls=[product_test])
db = MongoDB(db='ah_nl')
parser = JSONParser()

# Link attribute read from 'navItem/link/href'.
url = Attr(name='url', func=parser.text(selector='navItem/link/href'))

# Search results: items embedded in the "SearchLane" node; the url attribute
# is passed emits=product_source.
search_template = Model(
    source=search, name='search_result',
    selector=parser.select('//type[text() = "SearchLane"]/../_embedded/items'),
    attrs=[url(emits=product_source)]
)

# "Load more" lane: the url attribute is passed emits=search.
load_more_template = Model(
    source=search,
    name='load_more',
    selector=parser.select('//type[text() = "LoadMoreLane"]/..'),
    attrs=[url(emits=search)]
)

# Selects the parent of every "Product" type node below a
# "ProductDetailLane" node.
product_selector = (
    '//type[text() = "ProductDetailLane"]/..'
    '//type[text() = "Product"]/..'
)
Example #15
0
 domain='http://npo.nl',
 num_getters=2,
 phases=[
     Phase(
         source_worker=WebSource,
         parser=HTMLParser,
         sources=[Source(url=series_url.format(i)) for i in range(0, 242)],
         templates=(
             Template(
                 name='program',
                 selector='.content-column.quarter',
                 db_type='mongo_db',
                 db='npo_tv_programs',
                 table='programs',
                 attrs=(
                     Attr(name='title', selector='h3', func='sel_text'),
                     Attr(name='url',
                          selector='a.full-link',
                          func='sel_url',
                          source=Source(
                              active=False)),  # source is for next run
                 )),
             Template(name='next_url'),
         )),
     Phase(
         source_worker=WebSource,
         parser=HTMLParser,
         templates=(
             Template(
                 name='episodes',
                 selector='.item-list.item-container div.item',
Example #16
0
import re
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.parsers import JSONParser
import vehicle


# Location attributes added on top of the shared vehicle attributes.
city = Attr(name='city', func='sel_text')
zipcode = Attr(name='zipcode', func='sel_text')

# Template 'occasion': every attribute of the imported `vehicle` module plus
# city and zipcode.
occasion = Template(
    name='occasion', db_type='mongo_db',
    attrs=[*vehicle.attrs,
        city,
        zipcode,
    ]
)


autotrader_template = vehicle(
    table='autotrader', selector='.result',
    attrs=[
        brand(selector='h2', kws={'regex': '(^\w+)'}),
        make(selector='h2', kws={'regex': '^\w+ (.*)'}),
        price(selector='.result-price-label'),
        year(selector='.col-left',kws={'regex': '\w{3} (\d{4})'}),
        mileage(selector='.col-left', kws={'regex': '(.*) km'}),
        url(selector='a.tracker'),
        #Attr(name='dealer_name', selector='.dealer-info div', func=sel_text),
        city,
        zipcode,
        power,
Example #17
0
from modelscraper.components import ScrapeModel, Template, Attr
from modelscraper.sources import WebSource

# Generic news-article attributes; selectors are supplied per site.
text = Attr(name='text', func='sel_html')
title = Attr(name='title', func='sel_text')
# Picture URLs come from the 'src' attribute of <img> elements.
pictures = Attr(name='pictures',
                func='sel_attr',
                selector='img',
                kws={'attr': 'src'})
date = Attr(name='date', func='sel_text')
related = Attr(name='related', func='sel_url')
author = Attr(name='author', func='sel_text')
# Named 'tags' so it no longer collides with the 'author' attribute.
tags = Attr(name='tags', func='sel_text')

# Base 'article' template targeting the 'news' MongoDB database.
article = Template(
    name='article',
    attrs=(text, title, date, author, tags, pictures, related),
    db='news',
    db_type='MongoDB',
)

# URL attribute for article links.
article_url = Attr(name='url', func='sel_url')

# Source constructed without arguments; the list template passes it as
# `emits` on the url attribute.
tweakers_article_source = WebSource()
# NOTE(review): both selectors are empty strings here -- looks like a
# placeholder; confirm.
tweakers_list = Template(
    selector='',
    attrs=[article_url(selector='', emits=tweakers_article_source)])

tweakers = article(source=tweakers_article_source,
                   table='tweakers.net',
                   selector='#contentArea',
                   attrs=(
                       text(selector='.article p'),
Example #18
0
from modelscraper.components import Phase, Template, Attr
from modelscraper.sources import BaseSourceWorker, ProgramSource
from modelscraper.parsers import TextParser
import dns.resolver
import dns.query
import dns.zone

# Template 'ip': applies 'sel_text' with a dotted-quad regex, parsed with
# TextParser.  db and table are empty strings here.
ip_template = Template(name='ip',
                       db_type='MongoDB',
                       db='',
                       table='',
                       parser=TextParser,
                       attrs=(Attr(
                           name='ip',
                           func='sel_text',
                           # Raw string: '\d' and '\.' are not valid str
                           # escapes and warn on recent Pythons.
                           kws={'regex': r'(\d+\.\d+\.\d+\.\d+)'},
                       ), ))

# Phase with 10 workers that fills ip_template; its source worker is a
# ProgramSource built around the command 'host {}'.
ip_phase = Phase(n_workers=10,
                 templates=[ip_template],
                 source_worker=ProgramSource(function='host {}'))

port_template = Template(name='ports',
                         selector='port',
                         db_type='MongoDB',
                         db='monog',
                         table='ports',
                         attrs=(Attr(name='portnumber',
                                     func='sel_attr',
                                     kws={'attr': 'portid'}),
                                Attr(name='state',
Example #19
0
        "SelectedStore": {
            "StoreId": 207,
            "StoreReferenceKey": 493
        },
        "HasSelectedStore": True,
        "AcceptedCookies": None,
        "LastViewedProducts": None
    }
}

# Template 'menu': collects category-tile URLs as inactive sources; each URL
# is passed through the source template '{}?ppp=72'.
menu_template = Template(name='menu',
                         attrs=[
                             Attr(name='menu_item',
                                  selector='.c-category-tile__item',
                                  func='sel_url',
                                  source={
                                      'active': False,
                                      'src_template': '{}?ppp=72'
                                  })
                         ])

productmenu_template = Template(name='submenu',
                                selector='.c-product-tile',
                                attrs=[
                                    Attr(name='submenu_item',
                                         selector='.c-product-tile__meta > a',
                                         func='sel_url',
                                         source={'active': False}),
                                    Attr(name='pagination_item',
                                         selector='li.is-nexy > a',
                                         source=True)
Example #20
0
from modelscraper.dispatcher import Dispatcher
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from pymongo import MongoClient


search_url = 'https://mobileapi.ad.nl/mobile/lists/search'
# Lazily yields one search request per word, passed as the 'query' parameter.
sources = (
    Source(url=search_url, params={'query': term})
    for term in words
)

# Article attributes: each pairs a CSS selector with the name of the
# selection function to apply.
title_attr = Attr(name='title', selector='h1', func='sel_text')
text_attr = Attr(name='text', selector='p', func='sel_text')
date_attr = Attr(
    name='date',
    selector='.published span.small',
    func='sel_text',
)
author_attr = Attr(
    name='author',
    selector='span.author',
    func='sel_text',
)
tags_attr = Attr(
    name='tags',
    selector='.article.tags a span',
    func='sel_text',
)

# One 'article' per '.column-content-background' match, targeting table
# 'articles'.
# NOTE(review): db is 'nu_nl' while search_url points at ad.nl -- possibly a
# copy-paste leftover; confirm the intended database.
article = Template(
    name='article', selector='.column-content-background',
    db='nu_nl',
    db_type='mongo_db',
    table='articles',
    attrs=(
        title_attr,
        text_attr,
        date_attr,
        author_attr,
        tags_attr
    )
)
Example #21
0
from modelscraper.parsers import HTMLParser

uefa = ScrapeModel(
    name='eufa',
    domain='http://uefa.com',
    num_getters=2,
    phases=[
        Phase(sources=(Source(
            url="http://www.uefa.com/uefaeuro/season=2016/teams/index.html"),
                       ),
              templates=[
                  Template(name='team',
                           selector='.teams--qualified',
                           attrs=[
                               Attr(name='url',
                                    selector='a',
                                    func='sel_url',
                                    source={'active': False}),
                           ])
              ]),
        Phase(templates=[
            Template(name='player',
                     selector='.squad--team-player',
                     db_type='MongoDB',
                     db='uefa',
                     table='players',
                     attrs=[
                         Attr(name='name',
                              selector='.squad--player-name',
                              func='sel_text'),
                         Attr(name='player_url',
                              selector='.squad--player-name a',
Example #22
0
import datetime


# Today's date as a compact YYYYMMDD string.
now = datetime.datetime.now().strftime('%Y%m%d')
# Upstream NVD feed URL; the second assignment overrides it with a locally
# served copy, which is the value actually used.
JSON_URL = 'https://static.nvd.nist.gov/feeds/json/cve/1.0/nvdcve-1.0-{}.json.zip'
JSON_URL = 'http://0.0.0.0:8000/nvdcve-1.0-{}.json'

# One yearly feed from 2002 up to AND including the current year (range()
# excludes its stop value, hence the + 1).
years = range(2002, datetime.datetime.now().year + 1)
# years = [2002]
# Lazily yields one feed source per year, with an empty compression setting
# and 'CVE_Items' as the JSON key.
cve_source = (
    Source(
        url=JSON_URL.format(feed_year),
        compression='',
        json_key='CVE_Items',
    )
    for feed_year in years
)

# Template 'meta' targeting defcon.cve_meta: captures what follows
# 'lastModifiedDate:' as 'last_modified'.
# NOTE(review): the attribute's `source` is the cve_source generator, which
# can only be consumed once -- confirm this is intended.
meta_template = Template(
    name='meta', db='defcon', table='cve_meta', db_type='MongoDB', attrs=[
    Attr(name='last_modified', func='sel_text',
         kws={'regex': 'lastModifiedDate:(.*)'},
         source=cve_source)])

cve_template = Template(
    name='meta', db='defcon', table='cve', db_type='MongoDB', #func='update',
    #kws={'key': 'id'},
    attrs=[
        Attr(name='id', func='sel_text',
             selector=['cve', 'CVE_data_meta', 'ID']),
        Attr(name='cpes', func='sel_text',
             selector=['configurations', 'nodes', 'cpe', 'cpe23Uri']),
        Attr(name='affects', func='sel_dict',
             selector=['cve', 'affects']),
        Attr(name='problem_type', func='sel_text',
             selector=['cve', 'problemtype', 'problemtype_data', 'description',
                       'value']),
Example #23
0
from modelscraper.dispatcher import Dispatcher
from modelscraper.sources import WebSource
from modelscraper.parsers import HTMLParser
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source

motorparts = ScrapeModel(
    name='motorparts', domain='http://www.2wheelpros.com', num_sources=2,
    phases=[
        Phase(source_worker=WebSource, parser=HTMLParser,
            sources=(Source(url='http://www.2wheelpros.com/oem-parts/'),),
            templates=(
                Template(name='brand',
                         selector='#nav > ul > li:nth-of-type(1) > a', attrs=(
                             Attr(name='url', func='sel_url',
                                  source={'active': False}),
                         ),),)
            ),
        Phase(source_worker=WebSource, parser=HTMLParser, templates=(
            Template(name='year', selector='.yearlink', attrs=(
                Attr(name='url', func='sel_url', source={'active': False}),)),),
          ),
        Phase(source_worker=WebSource, parser=HTMLParser, templates=(
            Template(name='model', selector='.modellink', attrs=(
                Attr(name='url', func='sel_url', source={'active': False}),
            )
            ),
        ),
        ),
        Phase(source_worker=WebSource, parser=HTMLParser, templates=(
            Template(name='partCategory', db='motorparts', db_type='MongoDB',
                     table='part_categories', source={'active':False,
Example #24
0
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source

# Article-list endpoint; {section} and {offset} are filled in per request.
base_url = (
    "http://www.nu.nl/block/html/articlelist"
    "?footer=ajax&section={section}&limit=20&offset={offset}&show_tabs=0"
)
# Sections to walk.
sections = [
    'buitenland',
    'binnenland',
    'economie',
    'algemeen',
    'tech',
    'sport',
]
# Lazily yields, per section, the list pages at offsets 0, 20, ... below
# 200000; the section travels along as a one-element 'category' list.
sources = (
    Source(
        url=base_url.format(section=name, offset=start),
        copy_attrs=['category'],
        attrs=[Attr(name='category', value=[name])],
    )
    for name in sections
    for start in range(0, 200000, 20)
)

# One 'headline' per <li>, targeting nu_nl.article_urls.
headline = Template(name='headline',
                    selector='li',
                    db='nu_nl',
                    db_type='MongoDB',
                    table='article_urls',
                    attrs=[
                        # Article link: emitted as an inactive source with
                        # copy_attrs set to 'category'.
                        Attr(name='url',
                             selector='a',
                             func='sel_url',
                             source={
                                 'active': False,
                                 'copy_attrs': 'category'
                             }),
                        Attr(name='title', selector='.title', func='sel_text'),
                        Attr(name='excerpt',
                             selector='.excerpt',
                             func='sel_text')
                    ])

# Title attribute: 'sel_text' applied to the <h1>.
title_attr = Attr(name='title', selector='h1', func='sel_text')
Example #25
0
# Lazily yields one source per document in dwdd.episode_urls, using the
# first entry of the document's 'url' field.
sources = (Source(url=a['url'][0]) for a in cl.dwdd.episode_urls.find())

# Phase over the A-Z programme listing (only page 0 with the current range):
# stores each programme's title and emits its URL as an inactive source.
programs_az = Phase(
    sources=[
        Source(url="http://www.npo.nl/programmas/a-z",
               params={'page': page})
        for page in range(0, 1)
    ],
    templates=(
        Template(
            name='program',
            selector='.content-column.quarter',
            db_type='mongo_db',
            db='npo_tv_programs',
            table='programs',
            attrs=(
                Attr(name='title', selector='h3', func='sel_text'),
                Attr(
                    name='url',
                    selector='a.full-link',
                    func='sel_url',
                    # source is for next run
                    source=Source(active=False),
                ),
            ),
        ),
    ),
)

nos_search = 'https://www.npo.nl/de-wereld-draait-door/VARA_101377717/search?media_type=broadcast&start_date=&end_date=&start={}&rows=100'
episodes_phase = Phase(n_workers=5,
                       sources=(Source(url=nos_search.format(start))
                                for start in range(0, 2194, 100)),
                       templates=(Template(
                           name='episodes',
                           selector='.list-item',
                           db_type='mongo_db',
                           db='dwdd',
Example #26
0
from modelscraper.dispatcher import Dispatcher
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.sources import ProgramSource

# One 'ports' record per 'port' element, targeting ports.ports.
port_template = Template(
    name='ports',
    selector='port',
    db_type='mongo_db',
    db='ports',
    table='ports',
    attrs=(
        Attr(name='portnumber', func='sel_attr', kws={'attr': 'portid'}),
        Attr(name='state', selector='state', func='sel_attr',
             kws={'attr': 'state'}),
        Attr(name='service', selector='service', func='sel_attr',
             kws={'attr': 'name'}),
    ),
)
# Scrape model with one phase: the source "url" is an nmap command line
# handled by ProgramSource, and the results fill port_template.
nmap = ScrapeModel(
    name='nmap_test',
    domain='',
    phases=[
        Phase(sources=(Source(url='nmap -oX - duwo.multiposs.nl'), ),
              templates=[port_template],
              source_worker=ProgramSource)
    ])

# Register the nmap model with a fresh dispatcher.
disp = Dispatcher()
disp.add_scraper(nmap)
Example #27
0
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.parsers import TextParser
from modelscraper.dispatcher import Dispatcher
from pymongo import MongoClient


cl = MongoClient()
# Lazily yields one subtitle source per stored episode: the last path
# segment of the episode's first URL is appended to the tt888 endpoint, and
# the episode's 'url' field is attached as an attribute.
urls = (
    Source(
        url='https://tt888.omroep.nl/tt888/' + episode['url'][0].split('/')[-1],
        attrs=(Attr(name='url', value=episode['url']),),
    )
    for episode in cl.nos_journaal.episodes.find()
)

# Scrape model 'subtitles': one phase with 5 workers, parsed with TextParser.
subtitles = ScrapeModel(
    name='subtitles', domain='https://tt888.omroep.nl/',
    phases=[
        Phase(n_workers=5, sources=urls, parser=TextParser,
            templates=(
                # func='update' with key 'url' targets
                # nos_journaal.episodes (presumably updating the matching
                # episode document -- TODO confirm).
                Template(
                    name='subtitle', db_type='mongo_db', db='nos_journaal',
                    table='episodes', func='update', kws={'key': 'url'},
                    attrs=(
                        Attr(name='subtitles', func='sel_text'),
                        )
                ),
            )
            )
    ])
# Dropping the name is safe for `urls`: a generator expression evaluates its
# outermost iterable (the find() call) when it is defined, not when iterated.
del cl
# Register the model and start the run.
d = Dispatcher()
d.add_scraper(subtitles)
d.run()
Example #28
0
from modelscraper.sources import WebSource
from modelscraper.databases import MongoDB

list_url = "https://www.erowid.org/experiences/exp.cgi?ShowViews=1&Cellar=0&Start=0&Max=1"
# Listing source seeded with list_url; report_source is constructed without
# arguments.
listing_source = WebSource(
    name='listing',
    urls=[list_url],
    domain='https://www.erowid.org/experiences/',
)
report_source = WebSource()
parser = HTMLParser()

# Model 'report_url': one record per row of the experience list table; it is
# passed emits=report_source.
report_listing = Model(
    source=listing_source,
    name='report_url',
    selector=parser.select('.exp-list-table tr'),
    emits=report_source,
    attrs=(Attr(name='url', func=parser.url(selector='td:nth-of-type(2) a')),
           Attr(name='title',
                func=parser.text(selector='td:nth-of-type(2) a')),
           Attr(name='rating',
                func=parser.attr(selector='td:nth-of-type(1) img',
                                 attr='alt')),
           Attr(name='author', func=parser.text(selector='td:nth-of-type(3)')),
           # Raw string: '\-' and '\s' are not valid str escapes and warn on
           # recent Pythons; the pattern itself is unchanged.
           # NOTE(review): the range 'A-z' also matches '[', '\', ']', '^',
           # '_' and '`' -- confirm whether 'A-Za-z' was intended.
           Attr(name='substances',
                func=parser.text(selector='td:nth-of-type(4)',
                                 replacers='&',
                                 substitute=',',
                                 regex=r'([A-z0-9\-]+\s*[A-z0-9\-*\s]*)')),
           Attr(name='date', func=parser.text(selector='td:nth-of-type(5)')),
           Attr(name='views', func=parser.text(selector='td:nth-of-type(6)'))))

drug_report = Model(
Example #29
0
# Every stored product URL document becomes a source, built from the first
# entry of its 'url' field.
product_sources = cl.makro.product_urls.find()
sources = [
    Source(url=product['url'][0])
    for product in product_sources
]

# Phase over the product overview page: collects the left-navigation category
# links as inactive sources, targeting makro.product_categories.
categories = Phase(
    sources=(Source(url="https://www.makro.nl/cat/nl/products"), ),
    templates=(Template(
        name='product_category',
        selector='#left-navigation-container ul.vertical > li > a',
        db_type='mongo_db',
        db='makro',
        table='product_categories',
        attrs=[
            Attr(name='url',
                 func='sel_url',
                 source={'active': False},
                 # Replaces any 'pageSize=<n>' with 'pageSize=96'.  Raw
                 # string: '\d' is not a valid str escape and warns on
                 # recent Pythons.
                 kws={
                     'replacers': r'pageSize=(\d+)',
                     'substitute': 'pageSize=96'
                 }),
        ]), ))

product_lists = Phase(templates=[
    Template(name='product_urls',
             selector='.product-list .product-tiles',
             db_type='mongo_db',
             db='makro',
             table='product_urls',
             attrs=[
                 Attr(name='url',
                      selector='.productname a',
                      func='sel_url',
Example #30
0
import re
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.parsers import JSONParser


# Shared vehicle attributes.  price, year and mileage pass numbers=True to
# 'sel_text' and declare type=int.
vehicle_type = Attr(name='vehicle_type')
price = Attr(name='price', func='sel_text', kws={'numbers': True}, type=int)
brand = Attr(name='brand', func='sel_text')
make = Attr(name='make', func='sel_text')
year = Attr(name='year', func='sel_text', kws={'numbers': True}, type=int)
# 'kws' (not 'kw') is the keyword used by every other Attr declaration.
mileage = Attr(name='mileage', func='sel_text', kws={'numbers': True}, type=int)
city = Attr(name='city', func='sel_text')
url = Attr(name='url', func='sel_url')
zipcode = Attr(name='zip', func='sel_text')
power = Attr(name='power', func='sel_text')

# Base 'vehicle' template targeting the 'vehicles' MongoDB database and
# listing every shared vehicle attribute.
vehicle = Template(
    name='vehicle',
    db_type='MongoDB',
    db='vehicles',
    attrs=[
        vehicle_type, price, brand, make, year,
        mileage, city, url, zipcode, power,
    ],
)