# Parsers
jsonp = JSONParser()
textp = TextParser()
csvp = CSVParser()
htmlp = HTMLParser()

json_nested = Template(
    source=json_test,
    database=[test_db, test_mongo],
    table='tst',
    name='json_nested',
    selector=[jsonp.select('html'), htmlp.select('.content')],
    attrs=[
        Attr(name='url',
             func=htmlp.text(selector='h1', template='partialtest {}'))
    ])

html_functions = Template(
    source=html,
    name='html_functions',
    database=test_mongo,
    table='html_test',
    dated=True,
    emits=html2,
    selector=htmlp.select('html'),
    attrs=[
        Attr(name='table', func=htmlp.table(selector='table')),
        Attr(name='attr', func=htmlp.attr(selector='p', attr='class')),
    }, 'ip': {
        '$exists': False
    }
})

# materialise the cursor so both source generators below can consume it
companies2 = list(MongoClient().defcon.companies.find({'website': {'$ne': None}}))

defcon_base = Template(db='defcon', db_type='MongoDB')

jacco_base = 'jackling.nl'
jacco = (Source(url='jackling.nl'), )
jacco_git = (Source(url='http://{}/.git/config'.format(jacco_base)), )
jacco_ds = (Source(url='http://{}/.DS_STORE'.format(jacco_base)), )

git_sources = (Source(url='http://{}/.git/config'.format(c['website']),
                      attrs=[Attr(name='kvk', value=c['id'])],
                      copy_attrs='kvk') for c in companies2)
ds_store_sources = (Source(url='http://{}/.DS_STORE'.format(c['website']),
                           attrs=[Attr(name='kvk', value=c['id'])],
                           copy_attrs='kvk') for c in companies2)

git_template = Template(name='Git exposed', db_type='MongoDB', db='defcon',
                        table='git',
                        attrs=(Attr(name='vulnerable', func='sel_text',
                                    kws={'needle': '[core]'}), ))

ds_store_template = Template(name='DS_STORE exposed', db_type='MongoDB',
    # 'tech',
    # 'opmerkelijk',
    # 'cultuur-en-media',
    # 'koningshuis',
]

now = datetime.datetime.now()
begin = datetime.datetime.strptime('2010-01-01', '%Y-%m-%d')
timezone = datetime.timezone(datetime.timedelta(0, 3600))
dates = [
    begin + datetime.timedelta(days=d) for d in range(0, (now - begin).days)
]

nos_sources = (Source(url="http://nos.nl/nieuws/{}/archief/{}".format(
                          cat, date.strftime('%Y-%m-%d')),
                      attrs=(Attr(name='category', value=cat), ),
                      copy_attrs=True)
               for cat in categories for date in dates)

title_attr = Attr(name='title', selector='h1', func='sel_text')
text_attr = Attr(name='text', selector='.article_body', func='sel_text')
date_attr = Attr(name='date', selector='time:nth-of-type(1)',
                 func='sel_attr', kws={'attr': 'datetime'})
author_attr = Attr(name='author', selector='span[itemprop="author"]',
                   func='sel_text')
tags_attr = Attr(name='tags', selector='.ib.space-right a.link-grey',
                 func='sel_text')
from modelscraper.components import Attr, Model

product_name = Attr(name='product_name')
price = Attr(name='price')
nutrition = Attr(name='nutrition')
brand_name = Attr(name='brand_name')
unitsize = Attr(name='unitsize')
availability = Attr(name='availability')
store_id = Attr(name='store_id')
category = Attr(name='category')
ingredients = Attr(name='ingredients')
url = Attr(name='url')

product = Model(name='product', definition=True, attrs=[
    url, product_name, price, nutrition, brand_name, unitsize, availability,
    store_id, category, ingredients
])
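# Elsewhere in this repo definition models like `product` are specialised per
# store by calling them with overrides (see the freemidi `song` and the
# autotrader `vehicle` templates). A minimal sketch of that pattern for a
# hypothetical store; the table name and all selectors below are placeholders,
# not real values.
example_store_product = product(
    table='example_store',
    selector='.product-tile',
    attrs=[
        product_name(selector='.product-title'),
        price(selector='.product-price'),
        url(selector='a'),
    ])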
col = db.episode

sources = (Source(url="http://www.luckytv.nl/afleveringen/page/{}/".format(i))
           for i in range(1, 50))

LuckyTV = ScrapeModel(
    name='Lucky TV',
    domain='http://www.luckytv.nl/',
    num_getters=2,
    phases=[
        Phase(source_worker=WebSource,
              parser=HTMLParser,
              sources=sources,
              templates=(Template(
                  name='episode',
                  selector='article.video',
                  db_type='mongo_db',
                  db='lucky_tv',
                  table='episodes',
                  attrs=(
                      Attr(name='url', selector='a:nth-of-type(1)',
                           func='sel_url'),
                      Attr(name='title', selector='.video__title',
                           func='sel_text'),
                      Attr(name='date', selector='.video__date',
                           func='sel_text'),
                  )), )),
    ])
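# A minimal sketch of running this model, assuming the Dispatcher pattern used
# by the nmap and subtitles scrapers in this repo also applies here.
from modelscraper.dispatcher import Dispatcher

disp = Dispatcher()
disp.add_scraper(LuckyTV)
disp.run()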
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from pymongo import MongoClient

cl = MongoClient()

categories = {39: 'binnenland', 2: 'buitenland'}

category_url = "http://www.metronieuws.nl/getsectionlist/{}/{}/0"
binnenland = (Source(url=category_url.format(39, i), json_key=['data'],
                     attrs=[Attr(name='category', value='binnenland')])
              for i in range(1, 2400))
buitenland = (Source(url=category_url.format(2, i), json_key=['data'],
                     attrs=[Attr(name='category', value='buitenland')])
              for i in range(1, 1900))
sources = [*binnenland, *buitenland]

title_attr = Attr(name='title', selector='h1', func='sel_text')
text_attr = Attr(name='text', selector='.article_body', func='sel_text')
date_attr = Attr(name='date', selector='time:nth-of-type(1)',
                 func='sel_attr', kws={'attr': 'datetime'})
author_attr = Attr(name='author', selector='span[itemprop="author"]',
                   func='sel_text')
tags_attr = Attr(name='tags', selector='.tag', func='sel_text')

article = Template(
    name='article',
from modelscraper.dispatcher import Dispatcher
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from pymongo import MongoClient
import string

start_url = Source(url='http://www.startpagina.nl/dochters/')

sub_page_url = Attr(name='sub_page', func='sel_url',
                    source={'active': False, 'parent': True})
sub_page_name = Attr(name='pagename', func='sel_text')
category_temp = Template(name='sub_page', selector='.sections a',
                         db='startpagina', table='subpages',
                         db_type='mongo_db',
                         attrs=(sub_page_url, sub_page_name))

website_url = Attr(name='url', func='sel_url')
website_name = Attr(name='name', func='sel_text')
website_temp = Template(name='website', selector='#columns a',
                        db='startpagina', table='websites',
                        db_type='mongo_db',
                        attrs=(website_url, website_name))

model = ScrapeModel(
    name='startpagina', domain='http://www.startpagina.nl', phases=[
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.parsers import JSONParser
from pymongo import MongoClient

cl = MongoClient()

title_attr = Attr(name='title', selector='title', func='sel_text')
text_attr = Attr(name='text', selector='body_elements', func='sel_dict')
date_attr = Attr(name='date', selector='publish_date', func='sel_text')
author_attr = Attr(name='author', selector='written_by', func='sel_text')
tags_attr = Attr(name='tags', selector=('tags', 'name'), func='sel_text')
category_attr = Attr(name='category', selector=('section', 'name'),
                     func='sel_text')
counters_attr = Attr(name='counters', selector='counters', func='sel_text')
intro_attr = Attr(name='excerpt', selector='intro', func='sel_text')
type_attr = Attr(name='type', selector='type', func='sel_text')

article = Template(
    name='article',
    db='volkskrant',
    table='articles',
    db_type='mongo_db',
    attrs=(
        title_attr,
        text_attr,
        date_attr,
        author_attr,
        tags_attr,
        category_attr,
        counters_attr,
        intro_attr,
               method='post',
               data=[('sid', str(i)),
                     ('options[]', 'display_full_history'),
                     ('options[]', 'use_cached_data_only'),
                     ('action', 'View+Complete+Tracking+History')])
        for i in range(5000, 50000000)),
templates=[
    Template(
        name='shipment',
        selector=None,
        db='shipments',
        db_type='MongoDB',
        table='shipment',
        attrs=[
            Attr(name='carrier',
                 selector='#large_shipment_info_box > div:nth-child(2) > div:nth-child(1)',
                 func='sel_text',
                 kws={'regex': r'Carrier:\s(\w+)'}),
            Attr(name='shipped_to',
                 selector='#large_shipment_info_box > div:nth-child(2) > div:nth-child(2) div:nth-child(1) .align_left',
                 func='sel_text'),
            Attr(name='shipped_from',
                 selector='#large_shipment_info_box > div:nth-child(2) > div:nth-child(2) div:nth-child(2) .align_left',
                 func='sel_text'),
        ]),
    Template(name='event',
             selector='table tr:not(:nth-child(1))',
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.workers import WebSource
from modelscraper.parsers import HTMLParser

title = Attr(name='title', func='sel_text')
artist = Attr(name='artist', func='sel_text')
midi_url = Attr(name='midi_url')

song = Template(name='song', db_type='MongoDB', db='midi', table='songs',
                attrs=[title, artist, midi_url])

freemidi_template = song(
    table='freemidi',
    selector='#mainContent div.col-xs-12:nth-of-type(1)',
    attrs=[
        title(selector='li.active:nth-child(3) > a:nth-child(1) > span:nth-child(1)'),
        artist(selector='ol.breadcrumb:nth-child(1) > li:nth-child(2) > a:nth-child(1) > span:nth-child(1)'),
    ])

freemidi_sources = (Source(
    url='https://freemidi.org/download-{}'.format(i),
    attrs=[midi_url(value='https://freemidi.org/getter-{}'.format(i))])
    for i in range(25803))
from pymongo import MongoClient

cl = MongoClient()

cookie = {'nl_cookiewall_version': '1'}
telegraaf_url = 'http://www.telegraaf.nl/jsp/search_result_page.jsp?method=&keyword=de&pagenr={}'
telegraaf_search = (Source(url=telegraaf_url.format(i))
                    for i in range(1, 5001))

calendar = Template(
    name='archive_url',
    selector='',
    attrs=(
        Attr(name='url', selector='td a', func='sel_url',
             source=Source(active=False)),  # source is for next run
    ))

year = Template(
    name='archive_url_year',
    selector='.year-list__item',
    attrs=(
        Attr(name='url', selector='a', func='sel_url',
             source=True),  # source is for next run
    ))

article_url = Template(
    name='article_url',
    selector='ol.listing li',
    attrs=(
from modelscraper.dispatcher import Dispatcher
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source

search_url = 'https://www.rtlnieuws.nl/search/nieuws/{}'
search_terms = ['economie', 'nederland']

title_attr = Attr(name='title', selector='h1', func='sel_text')
text_attr = Attr(name='text', selector='p', func='sel_text')
date_attr = Attr(name='date', selector='time', func='sel_text')
author_attr = Attr(name='author', selector='span[itemprop="author"]',
                   func='sel_text')
tags_attr = Attr(name='tags', selector='.tag-list a.cta', func='sel_text')

article = Template(
    name='article',
    selector='.col__inner',
    attrs=(
        title_attr,
        text_attr,
        date_attr,
        author_attr,
        tags_attr
    )
)

rtl = ScrapeModel(
    name='rtl', domain='http://www.rtlnieuws.nl/', num_getters=1, phases=[
        Phase(sources=[
            Source(url="http://www.parool.nl/archief/2012")],
            templates=(calendar, year)
from modelscraper.components import Scraper, Model, Attr
from modelscraper.sources import ProgramSource
from modelscraper.parsers import HTMLParser
from modelscraper.databases import MongoDB

nmap_source = ProgramSource(urls=['localhost'], test_urls=['localhost'],
                            func='nmap -oX - {}')
parser = HTMLParser()

port_template = Model(
    source=nmap_source,
    name='ports',
    selector=parser.select('port'),
    database=MongoDB(db='nmap'),
    table='ports',
    attrs=(Attr(name='portnumber', func=parser.attr(attr='portid')),
           Attr(name='state', func=parser.attr(selector='state',
                                               attr='state')),
           Attr(name='service', func=parser.attr(selector='service',
                                                 attr='name'))))

nmap = Scraper(name='nmap_test', models=[port_template])
base_search = 'service/rest/delegate?url=/zoeken?rq={}&searchType=product'
delegate_url = 'https://www.ah.nl/service/rest/delegate?url={}'

table_trans = str.maketrans('[]', '<>')
translate_table = lambda text: text.translate(table_trans)

search = WebSource(name='search', url_template='https://www.ah.nl/{}',
                   urls=(base_search.format(l) for l in ascii_lowercase))

product_test = 'producten/product/wi238928/ah-biologisch-schouderkarbonade'
product_source = WebSource(name='product_source', url_template=delegate_url,
                           test_urls=[product_test])

db = MongoDB(db='ah_nl')
parser = JSONParser()

url = Attr(name='url', func=parser.text(selector='navItem/link/href'))

search_template = Model(
    source=search,
    name='search_result',
    selector=parser.select('//type[text() = "SearchLane"]/../_embedded/items'),
    attrs=[url(emits=product_source)]
)

load_more_template = Model(
    source=search,
    name='load_more',
    selector=parser.select('//type[text() = "LoadMoreLane"]/..'),
    attrs=[url(emits=search)]
)

product_selector = '//type[text() = "ProductDetailLane"]/..//type[text() = "Product"]/..'
    domain='http://npo.nl',
    num_getters=2,
    phases=[
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            sources=[Source(url=series_url.format(i)) for i in range(0, 242)],
            templates=(
                Template(
                    name='program',
                    selector='.content-column.quarter',
                    db_type='mongo_db',
                    db='npo_tv_programs',
                    table='programs',
                    attrs=(
                        Attr(name='title', selector='h3', func='sel_text'),
                        Attr(name='url',
                             selector='a.full-link',
                             func='sel_url',
                             source=Source(active=False)),  # source is for next run
                    )),
                Template(name='next_url'),
            )),
        Phase(
            source_worker=WebSource,
            parser=HTMLParser,
            templates=(
                Template(
                    name='episodes',
                    selector='.item-list.item-container div.item',
import re

from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.parsers import JSONParser
from vehicle import vehicle, brand, make, price, year, mileage, url, power

city = Attr(name='city', func='sel_text')
zipcode = Attr(name='zipcode', func='sel_text')

occasion = Template(
    name='occasion',
    db_type='mongo_db',
    attrs=[*vehicle.attrs, city, zipcode, ]
)

autotrader_template = vehicle(
    table='autotrader',
    selector='.result',
    attrs=[
        brand(selector='h2', kws={'regex': r'(^\w+)'}),
        make(selector='h2', kws={'regex': r'^\w+ (.*)'}),
        price(selector='.result-price-label'),
        year(selector='.col-left', kws={'regex': r'\w{3} (\d{4})'}),
        mileage(selector='.col-left', kws={'regex': r'(.*) km'}),
        url(selector='a.tracker'),
        # Attr(name='dealer_name', selector='.dealer-info div', func=sel_text),
        city,
        zipcode,
        power,
from modelscraper.components import ScrapeModel, Template, Attr
from modelscraper.sources import WebSource

text = Attr(name='text', func='sel_html')
title = Attr(name='title', func='sel_text')
pictures = Attr(name='pictures', func='sel_attr', selector='img',
                kws={'attr': 'src'})
date = Attr(name='date', func='sel_text')
related = Attr(name='related', func='sel_url')
author = Attr(name='author', func='sel_text')
tags = Attr(name='tags', func='sel_text')

article = Template(name='article',
                   attrs=(text, title, date, author, tags, pictures, related),
                   db='news', db_type='MongoDB')

article_url = Attr(name='url', func='sel_url')

tweakers_article_source = WebSource()
tweakers_list = Template(
    selector='',
    attrs=[article_url(selector='', emits=tweakers_article_source)])

tweakers = article(source=tweakers_article_source,
                   table='tweakers.net',
                   selector='#contentArea',
                   attrs=(
                       text(selector='.article p'),
from modelscraper.components import Phase, Template, Attr
from modelscraper.sources import BaseSourceWorker, ProgramSource
from modelscraper.parsers import TextParser
import dns.resolver
import dns.query
import dns.zone

ip_template = Template(name='ip',
                       db_type='MongoDB',
                       db='',
                       table='',
                       parser=TextParser,
                       attrs=(Attr(
                           name='ip',
                           func='sel_text',
                           kws={'regex': r'(\d+\.\d+\.\d+\.\d+)'},
                       ), ))

ip_phase = Phase(n_workers=10,
                 templates=[ip_template],
                 source_worker=ProgramSource(function='host {}'))

port_template = Template(name='ports',
                         selector='port',
                         db_type='MongoDB',
                         db='monog',
                         table='ports',
                         attrs=(Attr(name='portnumber',
                                     func='sel_attr',
                                     kws={'attr': 'portid'}),
                                Attr(name='state',
"SelectedStore": { "StoreId": 207, "StoreReferenceKey": 493 }, "HasSelectedStore": True, "AcceptedCookies": None, "LastViewedProducts": None } } menu_template = Template(name='menu', attrs=[ Attr(name='menu_item', selector='.c-category-tile__item', func='sel_url', source={ 'active': False, 'src_template': '{}?ppp=72' }) ]) productmenu_template = Template(name='submenu', selector='.c-product-tile', attrs=[ Attr(name='submenu_item', selector='.c-product-tile__meta > a', func='sel_url', source={'active': False}), Attr(name='pagination_item', selector='li.is-nexy > a', source=True)
from modelscraper.dispatcher import Dispatcher
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from pymongo import MongoClient

search_url = 'https://mobileapi.ad.nl/mobile/lists/search'
sources = (Source(url=search_url, params={'query': w}) for w in words)

title_attr = Attr(name='title', selector='h1', func='sel_text')
text_attr = Attr(name='text', selector='p', func='sel_text')
date_attr = Attr(name='date', selector='.published span.small',
                 func='sel_text')
author_attr = Attr(name='author', selector='span.author', func='sel_text')
tags_attr = Attr(name='tags', selector='.article.tags a span',
                 func='sel_text')

article = Template(
    name='article',
    selector='.column-content-background',
    db='nu_nl',
    db_type='mongo_db',
    table='articles',
    attrs=(
        title_attr,
        text_attr,
        date_attr,
        author_attr,
        tags_attr
    )
)
from modelscraper.parsers import HTMLParser

uefa = ScrapeModel(
    name='uefa',
    domain='http://uefa.com',
    num_getters=2,
    phases=[
        Phase(sources=(Source(
            url="http://www.uefa.com/uefaeuro/season=2016/teams/index.html"), ),
            templates=[
                Template(name='team',
                         selector='.teams--qualified',
                         attrs=[
                             Attr(name='url',
                                  selector='a',
                                  func='sel_url',
                                  source={'active': False}),
                         ])
            ]),
        Phase(templates=[
            Template(name='player',
                     selector='.squad--team-player',
                     db_type='MongoDB',
                     db='uefa',
                     table='players',
                     attrs=[
                         Attr(name='name',
                              selector='.squad--player-name',
                              func='sel_text'),
                         Attr(name='player_url',
                              selector='.squad--player-name a',
import datetime

now = str(datetime.datetime.now()).replace('-', '')[:8]

JSON_URL = 'https://static.nvd.nist.gov/feeds/json/cve/1.0/nvdcve-1.0-{}.json.zip'
JSON_URL = 'http://0.0.0.0:8000/nvdcve-1.0-{}.json'
years = range(2002, datetime.datetime.now().year)
# years = [2002]

cve_source = (Source(url=JSON_URL.format(year), compression='',
                     json_key='CVE_Items') for year in years)

meta_template = Template(
    name='meta',
    db='defcon',
    table='cve_meta',
    db_type='MongoDB',
    attrs=[
        Attr(name='last_modified', func='sel_text',
             kws={'regex': 'lastModifiedDate:(.*)'},
             source=cve_source)])

cve_template = Template(
    name='cve',
    db='defcon',
    table='cve',
    db_type='MongoDB',
    # func='update',
    # kws={'key': 'id'},
    attrs=[
        Attr(name='id', func='sel_text',
             selector=['cve', 'CVE_data_meta', 'ID']),
        Attr(name='cpes', func='sel_text',
             selector=['configurations', 'nodes', 'cpe', 'cpe23Uri']),
        Attr(name='affects', func='sel_dict',
             selector=['cve', 'affects']),
        Attr(name='problem_type', func='sel_text',
             selector=['cve', 'problemtype', 'problemtype_data',
                       'description', 'value']),
from modelscraper.dispatcher import Dispatcher
from modelscraper.sources import WebSource
from modelscraper.parsers import HTMLParser
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source

motorparts = ScrapeModel(
    name='motorparts',
    domain='http://www.2wheelpros.com',
    num_sources=2,
    phases=[
        Phase(source_worker=WebSource, parser=HTMLParser,
              sources=(Source(url='http://www.2wheelpros.com/oem-parts/'),),
              templates=(
                  Template(name='brand',
                           selector='#nav > ul > li:nth-of-type(1) > a',
                           attrs=(
                               Attr(name='url', func='sel_url',
                                    source={'active': False}),
                           )),
              )),
        Phase(source_worker=WebSource, parser=HTMLParser,
              templates=(
                  Template(name='year', selector='.yearlink',
                           attrs=(
                               Attr(name='url', func='sel_url',
                                    source={'active': False}),)),
              )),
        Phase(source_worker=WebSource, parser=HTMLParser,
              templates=(
                  Template(name='model', selector='.modellink',
                           attrs=(
                               Attr(name='url', func='sel_url',
                                    source={'active': False}),
                           )),
              )),
        Phase(source_worker=WebSource, parser=HTMLParser,
              templates=(
                  Template(name='partCategory',
                           db='motorparts',
                           db_type='MongoDB',
                           table='part_categories',
                           source={'active': False,
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source

base_url = "http://www.nu.nl/block/html/articlelist?footer=ajax&section={section}&limit=20&offset={offset}&show_tabs=0"
sections = [
    'buitenland', 'binnenland', 'economie', 'algemeen', 'tech', 'sport'
]

sources = (Source(url=base_url.format(section=section, offset=offset),
                  copy_attrs=['category'],
                  attrs=[Attr(name='category', value=[section])])
           for section in sections for offset in range(0, 200000, 20))

headline = Template(name='headline',
                    selector='li',
                    db='nu_nl',
                    db_type='MongoDB',
                    table='article_urls',
                    attrs=[
                        Attr(name='url',
                             selector='a',
                             func='sel_url',
                             source={
                                 'active': False,
                                 'copy_attrs': 'category'
                             }),
                        Attr(name='title', selector='.title',
                             func='sel_text'),
                        Attr(name='excerpt', selector='.excerpt',
                             func='sel_text')
                    ])

title_attr = Attr(name='title', selector='h1', func='sel_text')
sources = (Source(url=a['url'][0]) for a in cl.dwdd.episode_urls.find())

programs_az = Phase(
    sources=[
        Source(url="http://www.npo.nl/programmas/a-z", params={'page': i})
        for i in range(0, 1)
    ],
    templates=(
        Template(
            name='program',
            selector='.content-column.quarter',
            db_type='mongo_db',
            db='npo_tv_programs',
            table='programs',
            attrs=(
                Attr(name='title', selector='h3', func='sel_text'),
                Attr(name='url',
                     selector='a.full-link',
                     func='sel_url',
                     source=Source(active=False)),  # source is for next run
            )),
    ))

nos_search = 'https://www.npo.nl/de-wereld-draait-door/VARA_101377717/search?media_type=broadcast&start_date=&end_date=&start={}&rows=100'

episodes_phase = Phase(
    n_workers=5,
    sources=(Source(url=nos_search.format(start))
             for start in range(0, 2194, 100)),
    templates=(Template(
        name='episodes',
        selector='.list-item',
        db_type='mongo_db',
        db='dwdd',
from modelscraper.dispatcher import Dispatcher
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.sources import ProgramSource

port_template = Template(name='ports',
                         selector='port',
                         db_type='mongo_db',
                         db='ports',
                         table='ports',
                         attrs=(Attr(name='portnumber',
                                     func='sel_attr',
                                     kws={'attr': 'portid'}),
                                Attr(name='state',
                                     selector='state',
                                     func='sel_attr',
                                     kws={'attr': 'state'}),
                                Attr(name='service',
                                     selector='service',
                                     func='sel_attr',
                                     kws={'attr': 'name'})))

nmap = ScrapeModel(
    name='nmap_test',
    domain='',
    phases=[
        Phase(sources=(Source(url='nmap -oX - duwo.multiposs.nl'), ),
              templates=[port_template],
              source_worker=ProgramSource)
    ])

disp = Dispatcher()
disp.add_scraper(nmap)
from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.parsers import TextParser
from modelscraper.dispatcher import Dispatcher
from pymongo import MongoClient

cl = MongoClient()

urls = (Source(url='https://tt888.omroep.nl/tt888/' +
                   a['url'][0].split('/')[-1],
               attrs=(Attr(name='url', value=a['url']),))
        for a in cl.nos_journaal.episodes.find())

subtitles = ScrapeModel(
    name='subtitles',
    domain='https://tt888.omroep.nl/',
    phases=[
        Phase(n_workers=5,
              sources=urls,
              parser=TextParser,
              templates=(
                  Template(
                      name='subtitle',
                      db_type='mongo_db',
                      db='nos_journaal',
                      table='episodes',
                      func='update',
                      kws={'key': 'url'},
                      attrs=(
                          Attr(name='subtitles', func='sel_text'),
                      )),
              ))
    ])

del cl

d = Dispatcher()
d.add_scraper(subtitles)
d.run()
from modelscraper.sources import WebSource
from modelscraper.databases import MongoDB

list_url = "https://www.erowid.org/experiences/exp.cgi?ShowViews=1&Cellar=0&Start=0&Max=1"

listing_source = WebSource(name='listing', urls=[list_url],
                           domain='https://www.erowid.org/experiences/')
report_source = WebSource()
parser = HTMLParser()

report_listing = Model(
    source=listing_source,
    name='report_url',
    selector=parser.select('.exp-list-table tr'),
    emits=report_source,
    attrs=(Attr(name='url',
                func=parser.url(selector='td:nth-of-type(2) a')),
           Attr(name='title',
                func=parser.text(selector='td:nth-of-type(2) a')),
           Attr(name='rating',
                func=parser.attr(selector='td:nth-of-type(1) img',
                                 attr='alt')),
           Attr(name='author',
                func=parser.text(selector='td:nth-of-type(3)')),
           Attr(name='substances',
                func=parser.text(selector='td:nth-of-type(4)',
                                 replacers='&', substitute=',',
                                 regex=r'([A-z0-9\-]+\s*[A-z0-9\-*\s]*)')),
           Attr(name='date',
                func=parser.text(selector='td:nth-of-type(5)')),
           Attr(name='views',
                func=parser.text(selector='td:nth-of-type(6)'))))

drug_report = Model(
product_sources = cl.makro.product_urls.find()
sources = [Source(url=p['url'][0]) for p in product_sources]

categories = Phase(
    sources=(Source(url="https://www.makro.nl/cat/nl/products"), ),
    templates=(Template(
        name='product_category',
        selector='#left-navigation-container ul.vertical > li > a',
        db_type='mongo_db',
        db='makro',
        table='product_categories',
        attrs=[
            Attr(name='url',
                 func='sel_url',
                 source={'active': False},
                 kws={
                     'replacers': r'pageSize=(\d+)',
                     'substitute': 'pageSize=96'
                 }),
        ]), ))

product_lists = Phase(templates=[
    Template(name='product_urls',
             selector='.product-list .product-tiles',
             db_type='mongo_db',
             db='makro',
             table='product_urls',
             attrs=[
                 Attr(name='url',
                      selector='.productname a',
                      func='sel_url',
import re

from modelscraper.components import ScrapeModel, Phase, Template, Attr, Source
from modelscraper.parsers import JSONParser

vehicle_type = Attr(name='vehicle_type')
price = Attr(name='price', func='sel_text', kws={'numbers': True}, type=int)
brand = Attr(name='brand', func='sel_text')
make = Attr(name='make', func='sel_text')
year = Attr(name='year', func='sel_text', kws={'numbers': True}, type=int)
mileage = Attr(name='mileage', func='sel_text', kws={'numbers': True},
               type=int)
city = Attr(name='city', func='sel_text')
url = Attr(name='url', func='sel_url')
zipcode = Attr(name='zip', func='sel_text')
power = Attr(name='power', func='sel_text')

vehicle = Template(
    name='vehicle',
    db_type='MongoDB',
    db='vehicles',
    attrs=[
        vehicle_type, price, brand, make, year, mileage, city, url, zipcode,
        power
    ]
)