def test_canonicalize_parse_url(self):
    # parse_url() wraps urlparse and is used in link extractors
    self.assertEqual(
        canonicalize_url(parse_url(u"http://www.example.com/résumé?q=résumé")),
        "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
    self.assertEqual(
        canonicalize_url(parse_url('http://www.example.com/caf%e9-con-leche.htm')),
        'http://www.example.com/caf%E9-con-leche.htm')
    self.assertEqual(
        canonicalize_url(parse_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
        "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
def parse_prod_list(self, response):
    print 'parse_prod_list'
    urlstr = "%s://%s%s"
    parsedurl = surl.parse_url(get_base_url(response))
    for grid in response.css('.clearfix .grid_link'):
        item = ArchItem()
        item['path_url'] = []
        item['path_name'] = []
        item['main_bread'] = []
        item['path_url'].extend(response.meta['item']['path_url'])
        item['path_name'].append(response.meta['item']['path_name'])
        item['main_bread'].append(response.meta['item']['main_bread'])
        prdlnk = grid.xpath('a/@href').extract()[0]
        realurl = urlstr % (parsedurl.scheme, parsedurl.netloc, prdlnk)
        item['path_url'].append(realurl)
        item['path_name'].append(
            [grid.css('.name-product').xpath('text()').extract()])
        item['main_bread'].append(
            response.xpath('//*[@itemprop="title"]/text()').extract())
        print "=======================PD:", item['path_url']
        yield scrapy.Request(item['path_url'][-1], meta={'item': item},
                             callback=self.parse_content)
    nxtpg = response.css('.btn-pag-right').xpath('@href').extract()
    if len(nxtpg) > 0:
        nextpage = urlstr % (parsedurl.scheme, parsedurl.netloc, nxtpg[0])
        yield scrapy.Request(nextpage, meta={'item': response.meta['item']},
                             callback=self.parse_prod_list)
def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
                     encoding=None):
    """Canonicalize the given url by applying the following procedures:

    - sort query arguments, first by key, then by value
    - percent-encode paths and query arguments; non-ASCII characters are
      percent-encoded using UTF-8 (RFC 3986)
    - normalize all spaces (in query arguments) to '+' (plus symbol)
    - normalize percent-encoding case (%2f -> %2F)
    - remove query arguments with blank values (unless keep_blank_values
      is True)
    - remove duplicate query arguments
    - remove fragments (unless keep_fragments is True)

    The url passed can be a str or unicode; the url returned is always a str.

    This builds on scrapy.utils.url.canonicalize_url to also remove
    duplicate query arguments.
    """
    scheme, netloc, path, params, query, fragment = parse_url(url)
    keyvals = urlparse.parse_qsl(query, keep_blank_values)
    keyvals = list(set(keyvals))  # drop duplicate query arguments
    keyvals.sort()
    query = urllib.urlencode(keyvals)
    path = safe_url_string(_unquotepath(path)) or '/'
    fragment = '' if not keep_fragments else fragment
    return urlparse.urlunparse((scheme, netloc.lower(), path, params, query,
                                fragment))
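# A hedged usage sketch for the wrapper above (the input URL is made up):
# duplicate query arguments collapse to one, keys are sorted, and the host
# is lowercased, as the docstring describes.
assert canonicalize_url("http://Example.COM/do?b=2&a=1&a=1") == \
    "http://example.com/do?a=1&b=2"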
def parse_selloffer_jsonp(self, response):
    fixedcontent = self._regex.sub(r"\\\\", response.body)
    rep = json.loads(
        fixedcontent[len(self.jsonp_callback) + 1:-1].decode("GBK"))
    if rep["hasError"]:
        self.log("[parse_jsonp] Error:%s %s" % (rep["message"], response.url),
                 level=scrapy.log.ERROR)
        return
    content = rep["content"]["offerResult"]["html"]
    tree = lxml.html.fromstring(content)
    # parse shop
    for href in tree.xpath('//li[@class="sm-offerShopwindow"]//a[@class="sm-previewCompany sw-mod-previewCompanyInfo"]/@href'):
        if not href.startswith("http://"):
            continue
        shop_url = href + "/"
        self.log('[parse_jsonp] found shop %s from %s' % (shop_url, response.url),
                 level=scrapy.log.INFO)
        yield ShopItem(url=shop_url, insert_time=str(datetime.datetime.now()))
    # next page
    scheme, netloc, path, params, query, fragment = parse_url(response.url)
    qs = parse_query(query)
    try:
        totalPage = int(qs.get('totalPage'))
        beginPage = int(qs.get('beginPage'))
    except Exception, e:
        self.log("[parse_jsonp] %s" % e, level=scrapy.log.ERROR)
        return
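# A hedged sketch of the JSONP unwrapping used above, shown standalone
# (the callback name and payload are hypothetical): the slice strips the
# "callback(" prefix and the trailing ")" before json.loads sees the body.
jsonp_callback = "jsonp123"
body = 'jsonp123({"hasError":false})'
inner = body[len(jsonp_callback) + 1:-1]
assert inner == '{"hasError":false}'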
def parse_selloffer(self, response):
    '''parse like this: http://s.1688.com/selloffer/offer_search.htm?.....'''
    # parse category
    for href in response.xpath('//a/@href').extract():
        if not href.startswith("http://s.1688.com/selloffer/offer_search.htm?"):
            continue
        yield scrapy.Request(href)
    # parse shop
    for item in self._get_shop_byxpath(
            response,
            '//li[@class="sm-offerShopwindow"]//a[@class="sm-previewCompany sw-mod-previewCompanyInfo"]/@href'):
        yield item
    # next page
    scheme, netloc, path, params, query, fragment = parse_url(response.url)
    qs = parse_query(query)
    keywords = qs.get('keywords')
    if not keywords:
        return
    totalPage = response.xpath('//li[@id="breadCrumbText"]/em/text()').extract()
    if len(totalPage) == 1:
        totalPage = int(totalPage[0]) / 60 + 1
    else:
        totalPage = 50
    jsonrpc_url = ('http://s.1688.com/selloffer/rpc_offer_search.jsonp?descendOrder=true&onlineStatus=yes&isOnlyAlipay=true&sortType=booked&uniqfield=userid&n=y&filt=y&from=marketSearch&async=true&asyncCount=60&startIndex=0&qrwRedirectEnabled=false&offset=0&isWideScreen=false&controls=_template_%3Aofferresult%2Cshopwindow%2CshopwindowOfferResult.vm%7C_moduleConfig_%3AshopwindowResultConfig%7CpageSize%3A60%7C_name_%3AofferResult%7Coffset%3A0&token=959352873'
                   + '&callback=%(callback)s&beginPage=%(beginPage)d&totalPage=%(totalPage)d&keywords=%(keywords)s'
                   % {"callback": self.jsonp_callback, "beginPage": 1,
                      "keywords": keywords, "totalPage": totalPage})
    yield scrapy.Request(jsonrpc_url)
def parse_caigou(self, response):
    '''parse like this: http://s.1688.com/caigou/offer_search.htm?.....'''
    # parse category
    for href in response.xpath('//a/@href').extract():
        if not href.startswith("http://s.1688.com/caigou/offer_search.htm?"):
            continue
        # yield IndexItem(url=href, insert_time=str(datetime.datetime.now()))
        # yield scrapy.Request(href)
    # parse shop
    for item in self._get_shop_byxpath(
            response,
            '//li[@class="sm-offerItem"]/div[@class="sm-offerItem-alitalk"]//a[2]/@href'):
        yield item
    # next page
    scheme, netloc, path, params, query, fragment = parse_url(response.url)
    qs = parse_query(query)
    keywords = qs.get('keywords')
    if not keywords:
        return
    totalPage = response.xpath('//*[@id="content"]/div[1]/div[1]/div[1]/span/em/text()').extract()
    if len(totalPage) == 1:
        totalPage = int(totalPage[0]) / 60 + 1
    else:
        totalPage = 10
    jsonrpc_url = ('http://s.1688.com/caigou/rpc_offer_search.jsonp?n=y&async=true&asyncCount=60&startIndex=0&qrwRedirectEnabled=false&offset=0&isWideScreen=false&controls=_template_%3Aofferresult%2CjicaiOfferResult.vm%7C_moduleConfig_%3AshopwindowResultConfig%7C_name_%3AofferResult&token=237250634'
                   + '&callback=%(callback)s&beginPage=%(beginPage)d&totalPage=%(totalPage)d&keywords=%(keywords)s'
                   % {"callback": self.jsonp_callback, "beginPage": 1,
                      "keywords": keywords, "totalPage": totalPage})
    yield scrapy.Request(jsonrpc_url)
def parse(self, response):
    self.log("Crawled (%d) <GET %s>" % (response.status, response.url),
             level=scrapy.log.INFO)
    if response.status != 200:
        yield response.request
        return
    if not isinstance(response, scrapy.http.HtmlResponse):
        return
    depth = response.meta.get("depth", 1)
    for href in response.xpath("//a/@href").extract():
        href = href.strip()
        if href.startswith(("javascript:", "rtsp:", "ftp:")):
            continue
        scheme, netloc, path, params, query, fragment = parse_url(href)
        if path:
            # the suffix carries no leading dot, so the extensions are
            # listed without one (".pdf" and ".txt" could never match)
            suffix = path.split('.')[-1]
            if suffix in ["png", "jpg", "gif", "rar", "zip", "mp3", "pdf",
                          "doc", "txt", "docx", "swf", "mp4"]:
                continue
        abs_url = urljoin_rfc(response.url, href)
        yield UrlItem(url=abs_url, fromurl=response.url)
        if depth < 10:
            # pass the incremented depth per request instead of mutating the
            # shared counter, which would cut the crawl short within one page
            yield scrapy.Request(abs_url, meta={"depth": depth + 1})
def parse_index(self, response):
    # parse category
    for href in (response.xpath('//*[@id="hotwordpanel"]//li/@data-url').extract()
                 + response.xpath('//*[@id="hotwordpanel"]//a/@href').extract()):
        if not href.startswith("http://"):
            continue
        scheme, netloc, path, params, query, fragment = parse_url(href)
        if netloc.startswith("shop") or path.endswith("creditdetail.htm"):
            shop_url = "%s://%s/" % (scheme, netloc)
            yield ShopItem(url=shop_url, insert_time=str(datetime.datetime.now()))
            self.log('[parse_index] found shop %s' % (shop_url),
                     level=scrapy.log.INFO)
        elif netloc == "detail.1688.com":
            yield GoodsItem(url=href, insert_time=str(datetime.datetime.now()))
        elif netloc == "go.1688.com" and 'supplier' in path:
            yield IndexItem(url=href, insert_time=str(datetime.datetime.now()))
        yield scrapy.Request(href)
    # parse shop
    for href in response.xpath('//*[@id="listbody"]/div[@class="supplier-list"]/div[@class="supplier"]/div[@class="title p-margin"]//a/@href').extract():
        if not href.startswith("http://"):
            continue
        scheme, netloc, path, params, query, fragment = parse_url(href)
        shop_url = "%s://%s/" % (scheme, netloc)
        self.log('[parse_index] found shop %s from %s' % (shop_url, response.url),
                 level=scrapy.log.INFO)
        yield ShopItem(url=shop_url, insert_time=str(datetime.datetime.now()))
    # next page
    scheme, netloc, path, params, query, fragment = parse_url(response.url)
    qs = parse_query(query)
    pageStart = int(qs.get('pageStart', 1))
    pageCount = int(qs.get('pageCount', 0))
    if not pageCount:
        pageCount = response.xpath('//*[@id="pageCount"]/@value').extract()
        if len(pageCount) > 0:
            try:
                pageCount = int(pageCount[0])
            except Exception, e:
                pageCount = 1000
def start_requests(self):
    with open('urls_to_scrape_full_urls.csv', 'r') as read_file:
        for u in read_file.readlines():
            host = parse_url(u.strip()).netloc.lower()
            if not host:
                continue
            self.allowed_domains.append(host)
            self.extractor.allow_domains.add(host)
            yield scrapy.Request(u.strip(), callback=self.parse,
                                 errback=self.errback_httpbin,
                                 dont_filter=True)
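# A hedged sketch of the host extraction in start_requests above (the URL is
# hypothetical): parse_url exposes the host via .netloc, lowercased here so
# allowed_domains matching stays case-insensitive.
from scrapy.utils.url import parse_url

assert parse_url("http://Shop.Example.org/catalog".strip()).netloc.lower() == \
    "shop.example.org"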
def my_canonicalize_url(url):
    url = canonicalize_url(url)
    scheme, netloc, path, params, query, fragment = parse_url(url)
    # www.hmrc.gov.uk and hmrc.gov.uk are the same
    if netloc == 'hmrc.gov.uk':
        netloc = 'www.hmrc.gov.uk'
    # Fix manuals links with multiple slashes
    path = re.sub(r'^/+', '/', path)
    # Fix customs.hmrc links with session tokens in the path (!?)
    if params and path.endswith('.portal'):
        params = ''
    url = urlparse.urlunparse((scheme, netloc, path, params, query, fragment))
    return url
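# A hedged sketch of my_canonicalize_url above (the path is made up): the
# bare hmrc.gov.uk host gains the www. prefix and the leading duplicate
# slash collapses, assuming canonicalize_url leaves the doubled slash intact.
assert my_canonicalize_url("http://hmrc.gov.uk//manuals/page") == \
    "http://www.hmrc.gov.uk/manuals/page"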
def parse(self, response):
    urlstr = "%s://%s%s"
    print 'start'
    parsedurl = surl.parse_url(get_base_url(response))
    # col-categories-selected
    for catsel in response.css('.col-categories-selected'):
        item = ArchItem()
        pathurl = []
        pathurl.append(urlstr % (parsedurl.scheme, parsedurl.netloc,
                                 catsel.xpath('a/@href').extract()[0]))
        item['path_url'] = pathurl
        item['path_name'] = catsel.xpath('a/span/text()').extract()
        item['main_bread'] = [
            catsel.xpath('//*[@itemprop="title"]/text()').extract()]
        print "!!!!!!!!!!!!!", item['path_url']
        yield scrapy.Request(item['path_url'][-1], meta={'item': item},
                             callback=self.pg2parse)
def handle_list(self, response: Response) -> Request:
    torrents = response.css(
        'td.desc-top a[type="application/x-bittorrent"]::attr(href)'
    ).extract()
    if len(torrents) < 1:
        return
    for torrent in torrents:
        url = parse_url(torrent)
        if url.netloc == 'www.nyaa.se':
            continue
        request = DownloadRequest(url=torrent, callback=self.handle_item,
                                  dont_filter=True)
        request.meta['from_url'] = response.url
        yield request
def init_allowed_domains(self):
    if not getattr(self, 'allowed_domains', None):
        self.allowed_domains = []
        dict_locations = dict(self.available_locations)
        for url in self.start_urls:
            domain = parse_url(
                dict_locations[self.location]).netloc.replace('www.', '')
            if domain not in self.allowed_domains:
                self.allowed_domains.append(domain)
    elif not isinstance(self.allowed_domains, list):
        self.allowed_domains = list(self.allowed_domains)
    if 'bazaarvoice.com' not in self.allowed_domains:
        self.allowed_domains.append('bazaarvoice.com')
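# A hedged sketch of the domain normalization in init_allowed_domains above
# (the location URL is hypothetical): only the host survives, with any
# leading "www." stripped before it joins allowed_domains.
from scrapy.utils.url import parse_url

assert parse_url("http://www.walmart.com/reviews").netloc.replace("www.", "") == \
    "walmart.com"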
def pg2parse(self, response):
    print 'pg2parse'
    parsedurl = surl.parse_url(get_base_url(response))
    urlstr = "%s://%s%s"
    for subcat in response.css('.cont-sub-content').xpath('a'):
        item = ArchItem()
        item['path_url'] = []
        item['path_name'] = []
        item['main_bread'] = []
        item['path_url'].extend(response.meta['item']['path_url'])
        item['path_name'].append(response.meta['item']['path_name'])
        item['main_bread'].append(response.meta['item']['main_bread'])
        realurl = urlstr % (parsedurl.scheme, parsedurl.netloc,
                            subcat.xpath('@href').extract()[0])
        item['path_url'].append(realurl)
        item['path_name'].append(
            subcat.xpath('*/h2[@itemprop="name"]/text()').extract()[0])
        item['main_bread'].append(
            response.xpath('//*[@itemprop="title"]/text()').extract())
        print "!!!!!!!!!!!!!==========", item['path_url']
        yield scrapy.Request(item['path_url'][-1], meta={'item': item},
                             callback=self.parse_prod_list)
def parse_jinpai(self, response):
    div = response.xpath('//*[@id="box_doc"]/div[1]/div/div[1]')
    for href in div.xpath("//a/@href").extract():
        if not href.startswith("http://"):
            continue
        scheme, netloc, path, params, query, fragment = parse_url(href)
        if netloc.startswith("shop") or path.endswith("creditdetail.htm"):
            yield ShopItem(url="%s://%s/" % (scheme, netloc),
                           insert_time=str(datetime.datetime.now()))
        elif netloc == "detail.1688.com":
            yield GoodsItem(url=href, insert_time=str(datetime.datetime.now()))
        elif netloc == "go.1688.com" and 'supplier' in path:
            yield IndexItem(url=href, insert_time=str(datetime.datetime.now()))
        yield scrapy.Request(href)
def parse(self, response):
    hidden = lambda id: response.xpath(
        '/html/body/input[@id="{}"]/@data-value'.format(id)).extract_first()
    total_pages = int(hidden('quantidadeTotalPaginas').replace('.', ''))
    hashfragment = OrderedDict([
        ('pagina', None),
        ('semente', self.seed or hidden('semente')),
    ])
    formdata = OrderedDict([
        ('tipoOferta', '1'),
        ('paginaAtual', None),
        ('pathName', parse_url(response.url).path),
        ('hashFragment', ''),
    ])
    headers = {'X-Requested-With': 'XMLHttpRequest'}
    url = 'https://www.zapimoveis.com.br/Busca/RetornarBuscaAssincrona/'
    from_page = self.start
    if self.count:
        to_page = min(self.start + self.count - 1, total_pages)
    else:
        to_page = total_pages
    self.crawler.stats.set_value('total_pages', total_pages)
    self.crawler.stats.set_value('selected_pages',
                                 max(0, to_page - from_page + 1))
    for page in range(from_page, to_page + 1):
        hashfragment['pagina'] = formdata['paginaAtual'] = str(page)
        formdata['hashFragment'] = json.dumps(hashfragment,
                                              separators=(',', ':'))
        yield FormRequest(url, headers=headers, formdata=formdata,
                          callback=self.parse_busca)
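# A hedged sketch of the hashFragment payload the loop above serializes for
# page 2 (the seed value "1234" is hypothetical): the compact separators
# produce the exact JSON string the endpoint receives.
import json
from collections import OrderedDict

fragment = OrderedDict([("pagina", "2"), ("semente", "1234")])
assert json.dumps(fragment, separators=(",", ":")) == \
    '{"pagina":"2","semente":"1234"}'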
def parse_products_list(self, response):
    hxs = HtmlXPathSelector(response)
    url_base_parts = parse_url(response.url)
    base_url = '://'.join([url_base_parts.scheme, url_base_parts.netloc])
    categories = hxs.select("//div[@class='catNav']//a/@href").extract()
    for category_url in categories:
        url = urljoin(get_base_url(response), category_url)
        yield Request(url, callback=self.parse_products_list)
    search_form = hxs.select("//form[@name='luceneSearchForm'][@method='post']")
    if search_form:
        form_url = search_form.select("@action").extract()[0]
        form_url = urljoin(get_base_url(response), form_url)
        formdata = self._get_formdata(search_form)
        page_numbers = set(hxs.select(
            "//table[@class='searchresultsArticlesNavigation']"
            "//td[contains(., 'Pagina')]//a/@onclick"
        ).re(r'submitPageNumber\((.*)\)'))
        for page_number in page_numbers:
            data = formdata.copy()
            data['pageNumber'] = page_number
            data['urlToCall'] = ''
            for key in ('onlyOnStock', 'onlyPriceReduced', 'pageSize',
                        'trackArtNr', 'trackPosition', 'trackProductTitle'):
                data.pop(key, None)
            yield FormRequest(form_url, formdata=data,
                              callback=self.parse_products_list)
    category = hxs.select("//table[@class='LinksKatalogNav']//td[1]/a/h1/text()").extract()
    products = hxs.select("//td[@id='productListCell']/table/tr[td[contains(@class, 'document')]]")
    for p in products:
        name = p.select(".//input[contains(@name, '.shortDescription')]/@value").extract()[0]
        url = p.select(".//input[contains(@name, '.uriPath')]/@value").extract()[0]
        url = urljoin(base_url, url)
        brand = p.select(".//input[contains(@name, '.vendor')]/@value").extract()[0]
        price = p.select(".//input[contains(@name, '.price')]/@value").extract()[0]
        image_url = urljoin(get_base_url(response),
                            p.select(".//input[contains(@name, '.img')]/@value").extract()[0])
        sku = p.select(".//input[contains(@name, '.type')]/@value").extract()[0]
        identifier = p.select(".//input[contains(@name, '.artNr')]/@value").extract()[0]
        stock = p.select(".//input[contains(@name, '.stockValue')]/@value").extract()[0]
        if not stock:
            stock = 0
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', name)
        loader.add_value('url', url)
        loader.add_value('category', category)
        loader.add_value('brand', brand)
        loader.add_value('image_url', image_url)
        loader.add_value('sku', sku)
        loader.add_value('identifier', identifier)
        loader.add_value('stock', stock)
        loader.add_value('price', price)
        price = extract_price(price)
        if price < Decimal(100.00):
            loader.add_value('shipping_cost', '10.00')
        yield loader.load_item()
def url_has_any_extension(url, extensions):
    return posixpath.splitext(parse_url(url).path)[1].lower()[1:] in extensions
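# A hedged usage sketch for url_has_any_extension above (the URLs are made
# up): extensions must be listed without the leading dot, and matching is
# case-insensitive on the path only, ignoring the query string.
assert url_has_any_extension("http://example.com/report.PDF?dl=1", {"pdf", "doc"})
assert not url_has_any_extension("http://example.com/page.html", {"pdf", "doc"})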