def test_add_or_replace_parameter(self):
    url = 'http://domain/test'
    self.assertEqual(add_or_replace_parameter(url, 'arg', 'v'),
                     'http://domain/test?arg=v')
    url = 'http://domain/test?arg1=v1&arg2=v2&arg3=v3'
    self.assertEqual(add_or_replace_parameter(url, 'arg4', 'v4'),
                     'http://domain/test?arg1=v1&arg2=v2&arg3=v3&arg4=v4')
    self.assertEqual(add_or_replace_parameter(url, 'arg3', 'nv3'),
                     'http://domain/test?arg1=v1&arg2=v2&arg3=nv3')
    url = 'http://domain/test?arg1=v1;arg2=v2'
    self.assertEqual(add_or_replace_parameter(url, 'arg1', 'v3'),
                     'http://domain/test?arg1=v3&arg2=v2')
    self.assertEqual(add_or_replace_parameter("http://domain/moreInfo.asp?prodID=", 'prodID', '20'),
                     'http://domain/moreInfo.asp?prodID=20')
    url = 'http://rmc-offers.co.uk/productlist.asp?BCat=2%2C60&CatID=60'
    self.assertEqual(add_or_replace_parameter(url, 'BCat', 'newvalue'),
                     'http://rmc-offers.co.uk/productlist.asp?BCat=newvalue&CatID=60')
    url = 'http://rmc-offers.co.uk/productlist.asp?BCat=2,60&CatID=60'
    self.assertEqual(add_or_replace_parameter(url, 'BCat', 'newvalue'),
                     'http://rmc-offers.co.uk/productlist.asp?BCat=newvalue&CatID=60')
    url = 'http://rmc-offers.co.uk/productlist.asp?'
    self.assertEqual(add_or_replace_parameter(url, 'BCat', 'newvalue'),
                     'http://rmc-offers.co.uk/productlist.asp?BCat=newvalue')
    url = "http://example.com/?version=1&pageurl=http%3A%2F%2Fwww.example.com%2Ftest%2F%23fragment%3Dy&param2=value2"
    self.assertEqual(add_or_replace_parameter(url, 'version', '2'),
                     'http://example.com/?version=2&pageurl=http%3A%2F%2Fwww.example.com%2Ftest%2F%23fragment%3Dy&param2=value2')
    self.assertEqual(add_or_replace_parameter(url, 'pageurl', 'test'),
                     'http://example.com/?version=1&pageurl=test&param2=value2')
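# A standalone sketch of the behaviour the assertions above exercise, runnable
# outside the test class; it assumes the helper under test is the one shipped
# in w3lib (w3lib.url.add_or_replace_parameter):
from w3lib.url import add_or_replace_parameter

print(add_or_replace_parameter('http://domain/test?arg1=v1', 'arg2', 'v2'))
# http://domain/test?arg1=v1&arg2=v2
print(add_or_replace_parameter('http://domain/test?arg1=v1&arg2=v2', 'arg1', 'new'))
# http://domain/test?arg1=new&arg2=v2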
def start_requests(self):
    for start_url in self.start_urls:
        u = url.add_or_replace_parameter(start_url[1], 'p', 1)  # page
        for price in self.prices:
            u = url.add_or_replace_parameter(u, 'pr', ",".join([str(p) for p in price]))
            yield Request(u, callback=self.parse_pages_json)
def start_requests(self):
    today_in_tz = datetime.now(timezone(self.tz))
    # d = timedelta(days=2)
    # today_in_tz += d
    # href = u"https://www.arb.ca.gov/aqmis2/display.php?year={year}&mon={month}&day={day}&param={param}&order=basin,county_name,name&county_name=--COUNTY--&basin=--AIR+BASIN--&latitude=--PART+OF+STATE--&o3switch=new&ptype=aqd&report=HVAL&statistic=HVAL&btnsubmit=Update+Display&units=007&hours=all"
    href = u"https://www.arb.ca.gov/aqmis2/display.php?year=x&mon=x&day=x&param=&order=basin,county_name,name&county_name=--COUNTY--&basin=--AIR+BASIN--&latitude=--PART+OF+STATE--&o3switch=new&ptype=aqd&report=HVAL&statistic=HVAL&btnsubmit=Update+Display&units=007&hours=all"
    params = ["BENZENE", "BC", "CO", "CO2", "COH", "H2S", "LTSC", "CH4",
              "NO2", "NO", "NOX", "NOY", "NMHC", "OZONE_ppm", "SO2", "THC",
              "PMTEOM", "PMBAM", "PM10_LHR", "PM10_SHR", "PM25HR"]
    # params = ["BENZENE", "BC", "CO", "CO2", "COH"]
    # params = ["OZONE", "CO", "SO2"]
    # params = ["COH", "H2S"]
    # params = ["SO2"]
    url = add_or_replace_parameter(href, "year", today_in_tz.year)
    url = add_or_replace_parameter(url, "mon", today_in_tz.month)
    url = add_or_replace_parameter(url, "day", today_in_tz.day)
    param = params.pop(0)
    yield RandomRequest(
        # url=href.format(param=params.pop(0)),
        url=add_or_replace_parameter(url, "param", param),
        # callback=self.collect_station_data,
        callback=self.check_validity,
        meta={
            "params": params,
            "data": list(),
            "changed_today_in_tz": None,
            "href": url,
            "param": param,
        }
    )
def get_api_url(api_url, api_key, target, **filters):
    '''Build the API URL to query a list of proxies'''
    api_url = urljoin(api_url, API_ENDPOINT)
    api_url = urljoin(api_url + '/', target)
    api_url = add_or_replace_parameter(api_url, 'api_key', api_key)
    for f_key, f_val in filters.items():
        api_url = add_or_replace_parameter(api_url, f_key, f_val)
    return api_url
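# Why get_api_url() appends '/' before the second urljoin: without a trailing
# slash, urljoin replaces the last path segment instead of descending into it.
# A quick illustration (the host, endpoint, and target names are made up):
from urllib.parse import urljoin

print(urljoin('http://host/api/proxies', 'us'))   # http://host/api/us
print(urljoin('http://host/api/proxies/', 'us'))  # http://host/api/proxies/us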
def parse_product(self, response):
    base_sku = response.xpath('//@data-ref').extract_first()
    identifier = re.search(r'p(\d+)$', url_query_cleaner(response.url)).group(1)
    url = 'https://www.andrewjamesworldwide.com/ajax/get_product_options/{0}'.format(identifier)
    data = json.load(urlopen(url))
    attributes = [attr['values'] for attr in data['attributes']]
    if [] in attributes:
        url = add_or_replace_parameter(url, 'attributes[1]', attributes[0][0]['value_id'])
        data = json.load(urlopen(url))
        attributes = [attr['values'] for attr in data['attributes']]
    variants = itertools.product(*attributes)
    for variant in variants:
        url = 'https://www.andrewjamesworldwide.com/ajax/get_product_options/{0}'.format(identifier)
        for idx, option in enumerate(variant):
            url = add_or_replace_parameter(url, 'attributes[{0}]'.format(idx + 1),
                                           option['value_id'])
        data = json.load(urlopen(url))
        selection = data['selection'].values()[0]
        sku = selection['reference'].strip()
        if not sku and base_sku not in self.skus_found:
            sku = base_sku
        if sku not in self.skus:
            continue
        if sku in self.skus_found:
            self.logger.info('Duplicated SKU is found: %s' % sku)
        self.skus_found.add(sku)
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('sku', sku)
        loader.add_value('identifier', selection['product_id'])
        loader.add_xpath('name', '//span[@id="js-product-title"]/text()')
        loader.add_value('name', [option['value'] for option in variant])
        loader.replace_value('name', selection['title'])
        loader.add_value('url', response.url)
        loader.add_value('price', selection['price_inc'])
        category = response.css('div.breadcrumb a::attr(title)').extract()
        loader.add_value('category', category[1:])
        try:
            image_url = [attr['images'][0]['image']
                         for attr in data['attributes'][-1]['values']]
        except IndexError:
            image_url = response.xpath('//div[@id="js-product-image"]//@src').extract()
        loader.add_value('image_url', response.urljoin(image_url[0]))
        loader.add_value('brand', "Andrew James")
        item = loader.load_item()
        metadata = AndrewJamesMeta()
        metadata['asin'] = self.skus[sku]['ASIN']
        item['metadata'] = metadata
        yield item
def parse(self, response):
    # Main categories
    for cat_url in response.xpath('//ul[@id="main-nav"]/li/a/@href').extract():
        yield Request(response.urljoin(cat_url))
    sub_categories = response.xpath(
        '//div[contains(@class, "sub-categories")]/div/div//p/a/@href').extract()
    for sub_cat in sub_categories:
        yield Request(add_or_replace_parameter(response.urljoin(sub_cat), 'sort', 'lowest'))
    categories = response.xpath('//ul[@class="category"]/li/a/@href').extract()
    categories += response.xpath('//a[contains(@class, "shop-all-button")]/@href').extract()
    categories += response.css('.subcat-panel ::attr(href)').extract()
    for url in categories:
        yield Request(add_or_replace_parameter(response.urljoin(url), 'sort', 'lowest'))
    next_page = response.xpath('//ul[@class="pagination"]/li/a[@class="next"]/@href').extract()
    if next_page:
        yield Request(url=response.urljoin(next_page[0]))
    products = response.xpath('//div[contains(@class, "product")]')
    for product_xs in products:
        url = product_xs.xpath('a/@href').extract()
        if not url:
            continue
        product_loader = ProductLoader(item=Product(), selector=product_xs)
        product_loader.add_value('url', url)
        try:
            sku = product_xs.xpath('p[@class="product-sku"]/text()').re('KaTom #: (.*)')[0]
        except IndexError:
            sku = None
        product_loader.add_value('sku', sku)
        product_loader.add_value('identifier', sku)
        product_loader.add_xpath('name', 'a/@title')
        product_loader.add_css('image_url', '.img ::attr(src)')
        product_loader.add_xpath('category', '//h1[@class="title"]/text()')
        product = product_loader.load_item()
        if len(product.get('sku', '').split('-')) > 1:
            product['sku'] = '-'.join(product['sku'].split('-')[1:])
        yield Request(url=product_loader.get_output_value('url'),
                      meta={"product": product},
                      callback=self.parse_product)
def parse_category(self, response):
    products = response.css('div.product')
    for product_xs in products:
        try:
            product_name = product_xs.xpath('./a/@title').extract()[0]
        except IndexError:
            continue
        product_url = response.urljoin(product_xs.xpath('./a/@href').extract()[0])
        if product_url not in self.seen:
            yield Request(product_url, self.parse_product, dont_filter=True)
            self.seen.add(product_url)
            continue
        product_identifier = re.findall(r'/p/(.+?)/', product_url)[0]
        product_price = product_xs.xpath('.//span[@itemprop="price"]/text()').re(r'[\d\,.]+')[0]
        product_stock = product_xs.css('div.stockinfo::text').re(r'[\d\,.]+')
        product_image = product_xs.xpath('.//img[@alt]/@src').extract()
        loader = ProductLoader(item=Product(), selector=product_xs)
        loader.add_value('identifier', product_identifier)
        loader.add_value('sku', product_identifier)
        loader.add_value('name', product_name)
        loader.add_value('url', product_url)
        loader.add_value('price', product_price)
        if product_stock:
            loader.add_value('stock', int(product_stock[0]))
        if product_image:
            loader.add_value('image_url', response.urljoin(product_image[0]))
        loader.add_value('category', response.meta['category'])
        yield loader.load_item()
    pages = set(response.xpath('//ul[@id="pagelist1"]/li/a/text()').extract())
    next_page = response.meta['page'] + 1
    if str(next_page) in pages:
        url = add_or_replace_parameter(response.url, 'p', next_page)
        url = add_or_replace_parameter(url, 'q', response.meta['object_id'])
        yield Request(url, callback=self.parse_category,
                      meta={'category': response.meta['category'],
                            'object_id': response.meta['object_id'],
                            'page': next_page})
def parse(self, response):
    data = json.loads(str(response.body, 'utf-8'))
    for item in data["list"]:
        id = item["userUrl"]
        if id not in self.profiles:
            self.profiles.append(id)
            user_url = "https://www.kaggle.com" + id
            yield scrapy.Request(user_url, self.parseLocation)
    self.page += 1
    url = add_or_replace_parameter(response.url, 'page', self.page)
    url = add_or_replace_parameter(url, 'pageSize', self.pageSize)
    yield scrapy.Request(url, self.parse)
def test_add_or_replace_parameter(self):
    url = 'http://domain/test'
    self.assertEqual(add_or_replace_parameter(url, 'arg', 'v'),
                     'http://domain/test?arg=v')
    url = 'http://domain/test?arg1=v1&arg2=v2&arg3=v3'
    self.assertEqual(add_or_replace_parameter(url, 'arg4', 'v4'),
                     'http://domain/test?arg1=v1&arg2=v2&arg3=v3&arg4=v4')
    self.assertEqual(add_or_replace_parameter(url, 'arg3', 'nv3'),
                     'http://domain/test?arg1=v1&arg2=v2&arg3=nv3')
    url = 'http://domain/test?arg1=v1'
    self.assertEqual(add_or_replace_parameter(url, 'arg2', 'v2', sep=';'),
                     'http://domain/test?arg1=v1;arg2=v2')
    self.assertEqual(
        add_or_replace_parameter("http://domain/moreInfo.asp?prodID=", 'prodID', '20'),
        'http://domain/moreInfo.asp?prodID=20')
    url = 'http://rmc-offers.co.uk/productlist.asp?BCat=2%2C60&CatID=60'
    self.assertEqual(
        add_or_replace_parameter(url, 'BCat', 'newvalue', url_is_quoted=True),
        'http://rmc-offers.co.uk/productlist.asp?BCat=newvalue&CatID=60')
    url = 'http://rmc-offers.co.uk/productlist.asp?BCat=2,60&CatID=60'
    self.assertEqual(
        add_or_replace_parameter(url, 'BCat', 'newvalue'),
        'http://rmc-offers.co.uk/productlist.asp?BCat=newvalue&CatID=60')
    url = 'http://rmc-offers.co.uk/productlist.asp?'
    self.assertEqual(
        add_or_replace_parameter(url, 'BCat', 'newvalue'),
        'http://rmc-offers.co.uk/productlist.asp?BCat=newvalue')
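# Note: the `sep` and `url_is_quoted` keyword arguments asserted above belong
# to an older variant of add_or_replace_parameter; the current
# w3lib.url.add_or_replace_parameter(url, name, new_value) accepts neither,
# so this test only passes against that legacy helper.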
def start_requests(self):
    # for id
    # codes = (u"0", u"1", u"2", u"3", u"4", u"5", u"7", u"8", u"9", u"10", u"11")
    codes = (u"56", u"42", u"50", u"60", u"61", u"62", u"63", u"53", u"134",
             u"80", u"24", u"25", u"21", u"49", u"46", u"44", u"45", u"28",
             u"40", u"1", u"3", u"5", u"6", u"8", u"13", u"77", u"75", u"73",
             u"106", u"79", u"10", u"39", u"38", u"15", u"19", u"18", u"57",
             u"30", u"36", u"35", u"34", u"33")
    # codes = (u"40",)
    # for id
    # href = u"http://www.nyaqinow.net/DynamicTable.aspx?"
    # for location
    # href = u"http://www.nyaqinow.net/StationDetails.aspx?"
    # for data
    href = u"http://www.nyaqinow.net/StationInfo.aspx?"
    for code_value in codes:
        # for id
        # url = add_or_replace_parameter(href, u"G_ID", code_value)
        # for location
        url = add_or_replace_parameter(href, u"ST_ID", code_value)
        # for data
        # url = add_or_replace_parameter(href, u"ST_ID", code_value)
        yield Request(url=url, callback=self.parse, meta={u"code": code_value})
def parse_catall(self, response):
    error = False
    try:
        html = response.body.split('@@ebusiness@@')[1]
    except IndexError:
        error = True
    if error:
        req = response.request
        meta = response.meta
        retries = meta.get('retries', 0)
        if retries < 3:
            meta['retries'] = retries + 1
            self.log('Retrying {}, attempt: {}'.format(req.url, retries + 1))
            yield req.replace(dont_filter=True, callback=self.parse_catall, meta=meta)
        return
    hxs = Selector(text=html)
    for prod in hxs.xpath('//td[@valign="middle" or @valign="top"]//a/@href').extract():
        yield Request(response.urljoin(prod), callback=self.parse_product)
    pagination = response.body.split('@@ebusiness@@')[0]
    if not pagination:
        return
    pages = re.findall(r"changePage\('(.+?)',", pagination)
    for page in pages:
        url = add_or_replace_parameter(response.url, 'p', page)
        yield Request(url, self.parse_catall)
def run_crawl_all(self, response):
    print(' --- run_crawl_all --- ')
    t = datetime.datetime.now().strftime("%Y.%m.%d-%H:%M:%S")
    next_offset = int(url_query_parameter(response.url, 'offset')) + 10
    list_parse_res = list_parse(eval(response.body.decode()))
    list_db_data = list_into_dbdata(list_parse_res,
                                    self.task['task_biz_enname'],
                                    self.task['task_biz_chname'],
                                    self.task['_id'])
    # Reached the end, or an error occurred
    if not list_db_data:
        self.task['task_status'] = 'end_success'
        print('about to finish')
    else:
        res = mongo_instance.loads.insert_many(list_db_data)
        if self.crawled_times == 1:
            print(' first inserted id: %s' % res.inserted_ids[0])
            self.task['task_start_loadid'] = res.inserted_ids[0]
        self.crawled_times += 1
        print('more requests pending, not finishing yet')
    self.task['task_updatetime'] = t
    self.task['task_endtime'] = t
    mongo_instance.tasks.find_one_and_update(
        filter={'_id': self.task['_id']},
        update={'$set': self.task})
    if 'running' not in self.task['task_status']:
        return
    else:
        yield scrapy.Request(
            url=add_or_replace_parameter(response.url, 'offset', next_offset),
            headers=FakeLoadParams.headers,
            method='GET')
def start_requests(self):
    brands = {
        'USN': ['http://www.predatornutrition.com/shop-by-brand/usn'],
        'Optimum Nutrition': ['http://www.predatornutrition.com/shop-by-brand/optimum-nutrition'],
        'BSN': ['http://www.predatornutrition.com/shop-by-brand/bsn'],
        'PhD': ['http://www.predatornutrition.com/shop-by-brand/phd-nutrition'],
        'Maxi Nutrition': ['http://www.predatornutrition.com/shop-by-brand/maxinutrition'],
        'Reflex': ['http://www.predatornutrition.com/shop-by-brand/reflex'],
        'Mutant': ['http://www.predatornutrition.com/shop-by-brand/mutant'],
        'Cellucor': ['http://www.predatornutrition.com/shop-by-brand/cellucor'],
        'Sci-MX': ['http://www.predatornutrition.com/shop-by-brand/sci-mx'],
    }
    cookies = {
        'GlobalE_Data': {
            'countryISO': 'GB',
            'cultureCode': 'en',
            'currencyCode': 'GBP',
        }
    }
    for brand_name, urls in brands.iteritems():
        for url in urls:
            link = add_or_replace_parameter(url, 'viewAll', 'true')
            yield Request(link, meta={'brand': brand_name}, cookies=cookies)
def start_requests(self):
    codes = (u"AY1", u"BAR6", u"BAR9", u"HB010", u"HB011", u"BAI2", u"WIL1", u"BUR2",
             u"BUR1", u"WIL8", u"WIL5", u"NEW2", u"CAM3", u"CAM5", u"CAM4", u"CAM1",
             u"CRL2", u"HB013", u"WIL3", u"HB012", u"EWE2", u"FAR2", u"GA1", u"GA2",
             u"GA3", u"GIRT", u"FAR1", u"T55", u"LHR2", u"T54", u"HEN", u"HB008",
             u"HB009", u"HI1", u"SIPS", u"HB002", u"HB003", u"HS5", u"HS4", u"HS2",
             u"HS9", u"HS8", u"HS7", u"HS6", u"BN2", u"HIL1", u"HIL4", u"HIL5",
             u"HI3", u"HB006", u"HB007", u"MAN1", u"MAN7", u"MAHG", u"WIL7", u"NUL1",
             u"OX6", u"OX3", u"REA2", u"REA4", u"RED3", u"IMP", u"ORCH", u"M60",
             u"WIL4", u"CW", u"SLH7", u"SLH3", u"SLH6", u"SLH5", u"SLH8", u"SLH9",
             u"SLH4", u"GX", u"SHOL", u"MONK", u"HB005", u"STK7", u"STK5", u"SUN2",
             u"SUN4", u"BN1", u"TAM1", u"TAME", u"GOS1", u"TRAF", u"TRF2", u"WD1",
             u"WL4", u"WL1", u"WL5", u"HB004", u"WAT", u"HB001", u"WID2", u"WID1",
             u"WIG7", u"NEW3", u"WYA4", u"WSTO", u"YK10", u"YK11", u"YK16", u"YK7",
             u"YK13", u"YK8", u"YK9", u"YK15", u"YK018", u"BAR3", u"BPLE", u"BATH",
             u"BIL", u"BBRD", u"BIRR", u"AGRN", u"BIR1", u"BLAR", u"BLC2", u"BORN",
             u"BDMA", u"BRT3", u"BRS8", u"BURW", u"CAM", u"CANK", u"CANT", u"CARL",
             u"MACK", u"CHAT", u"CHLG", u"CHS7", u"CHBO", u"CHBR", u"COAL", u"DCST",
             u"EB", u"EX", u"GLAZ", u"HM", u"HONI", u"HUL2", u"HULR", u"LB",
             u"LEAM", u"LEAR", u"LEED", u"LED6", u"LEIR", u"LECU", u"LEOM", u"LIN3",
             u"LVP", u"LH", u"LUTR", u"MAN3", u"MKTH", u"MID", u"NEWC", u"NCA3",
             u"NTN3", u"NO12", u"NOTT", u"NWBV", u"BOLD", u"OX", u"OX8", u"PLYM",
             u"PMTH", u"PRES", u"REA5", u"ROCH", u"ECCL", u"SASH", u"SDY", u"SCN2",
             u"SHBR", u"SHDG", u"SHE", u"SIB", u"SA33", u"SOUT", u"SEND", u"SHLW",
             u"OSY", u"SOTR", u"EAGL", u"STKR", u"STOK", u"STOR", u"SUNR", u"WAL4",
             u"WAR", u"WEYB", u"WFEN", u"WSMR", u"WIG5", u"TRAN", u"WTHG", u"YW")
    # codes = (u"LEIR",)
    url = u"http://www.airqualityengland.co.uk/site/latest"
    for code_value in codes:
        url = add_or_replace_parameter(url, u"site_id", code_value)
        yield Request(url=url, callback=self.parse, meta={u"code": code_value})
def parse_listing(self, response):
    """ Extract product list.

    @url https://www.walgreens.com/store/c/eyes/ID=360457-tier3
    @returns requests 1
    """
    blob = response.css('script').re_first(r'__APP_INITIAL_STATE__ = (\{.+\});')
    if not blob:
        return
    data = json.loads(blob)
    if not data['searchResult'].get('productList'):
        return
    for each in data['searchResult']['productList']:
        yield response.follow(each['productInfo']['productURL'],
                              callback=self.parse_product)
    limit = response.meta.get('limit', 24)
    offset = int(url_query_parameter(response.url, 'No', 0)) + limit
    yield response.follow(
        add_or_replace_parameter(response.url, 'No', offset),
        callback=self.parse_listing,
        meta={'offset': offset, 'limit': limit})
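# The offset-pagination idiom from parse_listing() in isolation: read the
# current 'No' value (defaulting to 0), advance it by the page size, and
# rewrite the URL. The URL and page size here are illustrative only.
from w3lib.url import add_or_replace_parameter, url_query_parameter

url = 'https://www.walgreens.com/store/c/eyes/ID=360457-tier3?No=24'
offset = int(url_query_parameter(url, 'No', 0)) + 24
print(add_or_replace_parameter(url, 'No', offset))
# https://www.walgreens.com/store/c/eyes/ID=360457-tier3?No=48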
def parse(self, response):
    data = json.loads(str(response.body, 'utf-8'))
    for item in data:
        finalData = {
            "language": item["languageName"],
            "comments": item["totalComments"],
            "votes": item["totalVotes"],
            "medal": item["medal"],
            "id": item["id"],
            "date": item["scriptVersionDateCreated"],
        }
        id = item["id"]
        yield finalData
        if id not in self.ids:
            self.ids.append(id)
        else:
            logging.info("The id is duplicate, stop here")
            return
    if data[-1]["id"]:
        self.page += 20
        if self.page > 1000:
            self.page = 1000
        url = add_or_replace_parameter(response.url, 'after', data[-1]["id"])
        url = re.sub(r"([0-9]){1,9}(?=\?)", str(self.page), url)
        yield scrapy.Request(url, self.parse)
def parse_category(self, response):
    try:
        data = SpiderSchema(response).get_products()
    except Exception:
        return
    products = False
    for product in data:
        if not product.get('sku'):
            continue
        products = True
        loader = ProductLoader(Product(), response=response)
        loader.add_value('identifier', product['sku'])
        loader.add_value('url', product['url'][0])
        loader.add_value('name', product['name'])
        loader.add_value('sku', product['sku'])
        category = (response.css('a.GTM-breadcumb::text').extract()[1:]
                    or response.meta.get('category'))
        loader.add_value('category', category)
        loader.add_value('image_url', product['image'])
        loader.add_value('brand', product['brand'])
        if product['offers']['properties']['availability'] != 'in stock':
            loader.add_value('stock', 0)
        price = product['offers']['properties']['price']
        yield Request(loader.get_output_value('url'), self.parse_product,
                      meta={'item': Product(loader.load_item())})
    if not products:
        return
    page = url_query_parameter(response.url, 'page')
    if page:
        url = add_or_replace_parameter(response.url, 'page', int(page) + 1)
    else:
        id_families = response.xpath('//input[@data-key="idFamilies"]/@value').extract_first()
        if id_families:
            url = add_or_replace_parameter(
                'https://www.pccomponentes.pt/listado/ajax?page=0&order=price-desc',
                'idFamilies[]', id_families)
        elif response.url.endswith('/novedades/'):
            return
        elif response.url.endswith('/'):
            url = response.url + 'ajax?page=0&order=price-desc'
        else:
            return
    yield Request(url, self.parse_category, meta={'category': category})
def new_request(self, resp):
    date_url = self.url_to_datetime(resp)
    # decrease day
    previous = time_to_dict(date_url, 1)
    new_url = add_or_replace_parameter(resp.url, u"fecha_dia", previous[u"day"])
    new_url = add_or_replace_parameter(new_url, u"fecha_mes", previous[u"month"])
    new_url = add_or_replace_parameter(new_url, u"fecha_anio", previous[u"year"])
    return Request(
        url=new_url,
        callback=self.check_validity,
        # dont_filter=True
    )
def parse_pages_json(self, response):
    # get count of pages
    j_response = json.loads(response.body_as_unicode())
    page_count = int(j_response["list"]["numPages"])
    # open pages
    for page in xrange(page_count - 1):
        yield Request(url.add_or_replace_parameter(response.url, 'p', page),
                      callback=self.parse_page_json)
def parse(self, response):
    links = response.css('[data-hook=product-list-grid-item] a::attr(href)').getall()
    for l in links:
        yield Request(l, callback=self.parse_item)
    if response.css('[data-hook=load-more-button]'):
        page_no = response.meta.get('page', 1) + 1
        next_page_url = add_or_replace_parameter(response.url, 'page', page_no)
        yield Request(next_page_url, callback=self.parse, meta={'page': page_no})
def extract_links(self, response):
    page_no = url_query_parameter(response.url, 'page_no', '1')
    if not response.css('.Result a'):
        return []
    return [
        Link(url=add_or_replace_parameter(response.url, 'page_no', int(page_no) + 1))
    ]
def parse_product_list(self, response):
    hxs = HtmlXPathSelector(response)
    categories = hxs.select('//li[@class="PANEL ALL"]//a/@href').extract()
    categories += hxs.select('//li[@class="PANEL BY-SIZE"]//a/@href').extract()
    categories += hxs.select('//li[@class="PANEL BY-TYPE"]//a/@href').extract()
    for url in categories:
        url = url_query_cleaner(response.urljoin(url))
        yield Request(url, callback=self.parse_product_list)
    products = hxs.select('//div[@id="pdList"]//a/@href').extract()
    products += hxs.select('//div[@class="product-tile"]//a/@href').extract()
    for url in products:
        pid = url.split('_')[-1]
        if pid not in self.parsed_products:
            self.parsed_products.append(pid)
            url = url_query_cleaner(response.urljoin(url))
            yield Request(url, callback=self.parse_product)
    product_variants = hxs.select('//div[@class="productVariantTypeOptions"]/a/@href').extract()
    for url in product_variants:
        self.log('productVariantTypeOptions! {}'.format(url))
        pid = url.split('_')[-1]
        if pid not in self.parsed_products:
            self.parsed_products.append(pid)
            url = url_query_cleaner(response.urljoin(url))
            yield Request(url, callback=self.parse_product)
    next_page = None
    cur_page = url_query_parameter(response.url, 'pi', None)
    if cur_page:
        # The spider is already crawling the pages, we just assign the current
        # url so we can increment the 'pi' argument
        next_page = response.url
    else:
        # First page of the product list, we extract the pagination url with regex
        next_page = re.findall(r'\.get\( "(.*)pi=', response.body)
        if next_page:
            next_page = response.urljoin(next_page[0])
    if (next_page and products != response.meta.get('products', [])) or (
            next_page and product_variants != response.meta.get('product_variants', [])):
        cur_page = url_query_parameter(next_page, 'pi', '1')
        url = add_or_replace_parameter(next_page, 'pi', str(int(cur_page) + 1))
        self.log('Goes to next page: ' + url)
        yield Request(url, callback=self.parse_product_list,
                      meta={'products': products,
                            'product_variants': product_variants})
def start_requests(self):
    codes = (u"17", u"3", u"10", u"2", u"9", u"4", u"5", u"8")
    # codes = (u"17",)
    url = u"https://novascotia.ca/nse/airdata/StationInfo3.aspx?"
    for code_value in codes:
        url = add_or_replace_parameter(url, u"ST_ID", code_value)
        yield Request(url=url, callback=self.parse, meta={u"code": code_value})
def parse_pages(self, response):
    html = json.loads(response.body)
    selector = Selector(text=html['html'])
    for url in selector.xpath('//@href[contains(., "/produkt/")]').extract():
        yield Request(url, self.parse_product)
    total_sets = int(selector.css('.totalSets::text').extract_first())
    for s in xrange(total_sets):
        url = add_or_replace_parameter(response.url, 'set', s + 1)
        yield Request(url, self.parse_pages)
def parse_category(self, response):
    try:
        category_id = response.xpath('//script/text()').re("categoryID: *'(.+)'")[0]
    except IndexError:
        return
    per_page = response.xpath('//script/text()').re("var showInput *= *'(.+)'")[0]
    sort = response.xpath('//script/text()').re("var sortInput *= *'(.+)'")[0]
    url = 'http://www.bmstores.co.uk/hpcProduct/productbyfilter/ajaxmode/1'
    parameters = ('categoryID', 'perPage', 'sort')
    values = (category_id, per_page, sort)
    for parameter, value in zip(parameters, values):
        url = add_or_replace_parameter(url, parameter, value)
    pages = response.xpath('//@data-pageto').extract()
    for page in pages:
        yield Request(add_or_replace_parameter(url, 'pageNum', page),
                      self.parse_json_products)
def start_requests_id(self):
    codes_id = (u"22", u"23", u"64", u"75", u"80", u"81", u"90", u"109",
                u"110", u"118")
    # codes_id = (u"22",)
    href = u"https://fortress.wa.gov/ecy/enviwa/DynamicTable.aspx?"
    for code_id_value in codes_id:
        url = add_or_replace_parameter(href, u"G_ID", code_id_value)
        yield Request(url=url, callback=self.parse)
def get_matched_products(self, website_id):
    api_url = urljoin_rfc(self.host, '/api/get_matched_products_paged.json')
    api_url = add_or_replace_parameter(api_url, 'website_id', str(website_id))
    api_url = add_or_replace_parameter(api_url, 'api_key', self.api_key)
    page = 0
    count = 1000
    continue_next_page = True
    matched_products = []
    while continue_next_page:
        api_url = add_or_replace_parameter(api_url, 'start', str(page * count))
        api_url = add_or_replace_parameter(api_url, 'count', str(count))
        try:
            try_no = 1
            try_query = True
            while try_query:
                try:
                    r = requests.get(api_url)
                    data = r.json()
                    new_matches = data.get('matches', [])
                except Exception, e:
                    if not (try_no <= 10 and self.retry):
                        raise e
                    else:
                        try_no += 1
                        time.sleep(1)
                else:
                    try_query = False
        except Exception:
            continue_next_page = False
        else:
            matched_products.extend(new_matches)
            if len(new_matches) < count:
                continue_next_page = False
            else:
                page += 1
    return matched_products
class RebelSport(CrawlSpider):
    name = 'kitbag_au-rebelsport'
    allowed_domains = ['rebelsport.com.au']
    start_urls = ['http://www.rebelsport.com.au/store/fangear/soccer-football/604']

    categories = LinkExtractor(
        restrict_css='.secondary-menu',
        process_value=lambda url: add_or_replace_parameter(url, 'pageSize', '500'))
    pages = LinkExtractor(restrict_css='.pagination')
    products = LinkExtractor(
        restrict_css='.product',
        process_value=lambda url: make_variant_url(url_query_cleaner(url)))

    rules = (Rule(categories), Rule(products, callback='parse_product'))

    def parse_product(self, response):
        data = response.xpath('//script/text()').re('{\\\\"Variants.+}')[0]
        data = json.loads(data.replace('\\"', '"'))
        variants = data['Variants']
        for variant in variants:
            url = response.urljoin(variant['ProductPLU'])
            yield Request(make_variant_url(url), self.parse_product)
        loader = ProductLoader(item=Product(), response=response)
        identifier = response.xpath('//input[@id="ProductPLU"]/@value').extract_first()
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '(//h1[@itemprop="name"]/text())[1]')
        metadata = {}
        for i in xrange(3):
            variant_name = data['Variant%dSelected' % (i + 1)]
            if variant_name and variant_name != 'N/A':
                loader.add_value('name', variant_name)
                metadata[data['Variant%dHeader' % (i + 1)]] = variant_name
                if 'size' in variant_name.lower():
                    metadata['size'] = variant_name[5:].strip()
        price = response.css('.price-value .currency::text').extract()
        loader.add_value('price', price.pop())
        category = response.css('.breadcrumb a::text').extract()
        loader.add_value('category', category[1:])
        loader.add_css('image_url', '.product-image::attr(src)')
        loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
        loader.add_value('shipping_cost', '7.95')
        stock = response.css('.product-stock-widget::attr(ng-init)').re(r'AvailableOnline: (\w+)')[0]
        if stock != 'true':
            loader.add_value('stock', 0)
        item = loader.load_item()
        item['metadata'] = metadata
        yield item
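# The process_value hook used by the categories extractor above, shown on its
# own: LinkExtractor passes every candidate href through the callable before
# emitting it, so each category link is rewritten to request 500 items per
# page. The URL below is illustrative.
from w3lib.url import add_or_replace_parameter

href = 'http://www.rebelsport.com.au/store/fangear/soccer-football/604?page=2'
print(add_or_replace_parameter(href, 'pageSize', '500'))
# http://www.rebelsport.com.au/store/fangear/soccer-football/604?page=2&pageSize=500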
def parse(self, response):
    # Get help from: http://stackoverflow.com/questions/38574869/how-can-i-jump-to-next-page-in-scrapy
    if response.meta.get('is_json', False):
        page = Selector(text=json.loads(response.body)['table'])
    else:
        page = Selector(response)
    if self.flag:
        self.total_item_num = int(page.xpath('//div[@id="show-more-courses"]/text()')
                                  .re(r'courses of (.*)')[0]) + 50
        print "Total courses: ", self.total_item_num
        self.steps = self.total_item_num / 50 + 1
        self.flag = False
    base_urls = "https://www.class-central.com/courses/past"
    # base_urls = "https://www.class-central.com/courses/recentlyAdded"
    my_header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'}
    divs = page.xpath('//tr[@itemtype="http://schema.org/Event"]')
    # print "print content", len(divs)
    print "Process: ", self.cnt, '/', self.steps
    for div in divs:
        item = MoocCrawlerItem()
        item = {k: "" for k in item.keys()}
        parse_name = div.xpath('./td/a/span[@class="course-name-text"]/text()').extract_first().strip()
        item['name'] = parse_name
        parse_score = div.xpath('./td/div[@class="course-rating-value"]/text()').extract_first().strip()
        if len(parse_score) > 3:
            parse_score = parse_score[:3]
        item['score'] = string.atof(parse_score) * 2
        parse_platform = div.xpath('./td/div[@class="course-provider"]/text()').extract_first().strip()
        item['platform'] = parse_platform
        parse_url = div.xpath('./td/a/@href').extract_first().decode().encode('utf-8').strip()
        item['url'] = "https://www.class-central.com" + parse_url
        parse_cid = re.findall(r'/mooc/(.*)/', parse_url)[0]
        item['cid'] = "cc" + parse_cid
        req = scrapy.Request(item['url'], headers=my_header, callback=self.parse_detail_page)
        req.meta['item'] = item
        yield req
    # next_page_el = response.xpath("//div[@id='show-more-courses']")
    if self.cnt < self.steps:
        # if next_page_el:
        next_page_url = "https://www.class-central.com/maestro/courses/past?page=1&_=1471346096733"
        # next_page_url = "https://www.class-central.com/maestro/courses/recentlyAdded?page=1"
        next_page = response.meta.get('page', 1) + 1
        next_page_url = add_or_replace_parameter(next_page_url, 'page', next_page)
        r = scrapy.Request(next_page_url, headers=my_header, callback=self.parse,
                           meta={'page': next_page, 'is_json': True})
        self.cnt += 1
        yield r
def parse(self, response):
    data = json.loads(response.body)
    total_results = data['totalDatasetListItems']
    page = 1
    # figure out how many pages there are and loop through them,
    # stepping by 20 since we have 20 results per page
    for i in range(20, total_results, 20):
        page += 1
        url = add_or_replace_parameter(response.url, 'page', page)
        yield scrapy.Request(url, self.parse_page)
    # don't forget to parse the first page as well!
    yield from self.parse_page(response)
def parse_page(self, response):
    data = json.loads(response.body)
    if not data['success']:
        self.logger.warning('Failed pagination %s' % response.url)
    selector = Selector(text=data['paginationLink'])
    for page in selector.css('div.pagination ::attr(data-pageto)').extract():
        url = add_or_replace_parameter(response.url, 'pageNum', page)
        yield Request(url, self.parse_page)
    selector = Selector(text=data['pageHTML'])
    for url in selector.css('a.product::attr(href)').extract():
        yield Request(response.urljoin(url), self.parse_product)
def start_requests(self):
    codes = (u"9", u"12", u"2", u"10", u"4", u"11", u"3", u"6", u"13",
             u"7", u"8", u"5", u"15")
    # codes = (u"4",)
    url = u"http://envista.pima.gov/StationInfo1.aspx?"
    for code_value in codes:
        url = add_or_replace_parameter(url, u"ST_ID", code_value)
        yield Request(url=url, callback=self.parse, meta={u"code": code_value})
def parse_hotel(self, response):
    hxs = Selector(response)
    hotel = HtmlParser.extract_hotel(response.url, hxs)
    checkin = url_query_parameter(response.url, "checkin")
    checkout = url_query_parameter(response.url, "checkout")
    checkinDatetime = None
    checkoutDatetime = None
    today = datetime.date.today()
    if checkin is not None:
        checkinDatetime = datetime.datetime.strptime(checkin, "%Y-%m-%d").date()
        checkinDatetime = self.add_months(checkinDatetime, 1)
    else:
        checkinDatetime = datetime.date(today.year, today.month, 15)
    if checkout is not None:
        checkoutDatetime = datetime.datetime.strptime(checkout, "%Y-%m-%d").date()
        checkoutDatetime = self.add_months(checkoutDatetime, 1)
    else:
        checkoutDatetime = datetime.date(today.year, today.month, 16)
    maxDatetime = self.add_months(today, 18)
    if checkinDatetime < maxDatetime:
        url = url_query_cleaner(response.url)
        url = add_or_replace_parameter(url, "checkin", str(checkinDatetime))
        url = add_or_replace_parameter(url, "checkout", str(checkoutDatetime))
        # logging.warning('---------------------------- %s' % url)
        yield Request(url, callback=self.parse_hotel)
    yield hotel["hotel"]
    if len(hotel["rooms"]) > 0:
        for room in hotel["rooms"]:
            yield room
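# parse_hotel() relies on a self.add_months() helper that is not shown in this
# snippet. A plausible sketch of such a helper (an assumption, not the
# spider's actual code), using only the standard library:
import datetime

def add_months(date, months):
    # Move `date` forward by `months`, clamping the day to the target month's length.
    month_index = date.month - 1 + months
    year = date.year + month_index // 12
    month = month_index % 12 + 1
    leap = year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
    days_in_month = [31, 29 if leap else 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    day = min(date.day, days_in_month[month - 1])
    return datetime.date(year, month, day)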
def parse_first_page(self, response):
    total = int(response.xpath('//*[@id="voltron_srp_main-content"]/comment()')
                .re(r'"formattedResultCount":"([\d,]+)"')[0].replace(",", ""))
    page_count = total / self.PAGE_SIZE
    if page_count > self.MAX_PAGE_COUNT:
        page_count = self.MAX_PAGE_COUNT
    # parse first page
    for item in self.parse_page(response):
        yield item
    for i in xrange(page_count - 1):
        u = url.add_or_replace_parameter(self.CONTACTS_URL, 'page_num', i + 2)
        yield Request(u, callback=self.parse_page)
def parse(self, response):
    for restaurant in response.css('.shortSellDetails'):
        il = RestaurantItemLoader(selector=restaurant)
        il.add_css('name', '.property_title::text')
        il.add_css('url', '.property_title::attr(href)')
        il.add_css('cuisines', '.cuisine::text')
        item = il.load_item()
        yield scrapy.Request(item['url'], callback=self.parse_details,
                             meta=dict(item=item))
    pagination_url = (
        'https://www.tripadvisor.com/RestaurantSearch?Action=PAGE'
        '&geo=294079&ajax=1&sortOrder=popularity&o=a0'
        '&availSearchEnabled=false'
    )
    if not response.css('.nav.next.disabled'):
        offset = response.meta.get('offset', 0) + 30
        pagination_url = add_or_replace_parameter(pagination_url, 'o', 'a{}'.format(offset))
        yield scrapy.Request(pagination_url, meta={'offset': offset})
def _process_url(self, url):
    return add_or_replace_parameter('http://localhost:8998/redirect-to', 'goto', url)
def _process_url(self, url):
    return add_or_replace_parameter(self.mockserver.url('/redirect-to'), 'goto', url)
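# What the two _process_url() wrappers above produce: the target URL ends up
# percent-encoded inside the 'goto' query parameter of the local mockserver's
# /redirect-to endpoint. Host and port here mirror the hard-coded variant;
# the target URL is illustrative.
from w3lib.url import add_or_replace_parameter

print(add_or_replace_parameter('http://localhost:8998/redirect-to',
                               'goto', 'http://example.com/?a=1'))
# http://localhost:8998/redirect-to?goto=http%3A%2F%2Fexample.com%2F%3Fa%3D1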