def parse_search_result(self, response):
    """Parse a search-result page and follow its pagination link.

    Used when the response has been detected as a search-result page.

    Args:
        response: the page response; its body is queried with CSS
            selectors and its URL is stored on every item.

    Yields:
        One ElectronicItem per result row, followed by a Request for the
        next page (handled by this same callback) when a usable link is
        present.
    """
    # Parse the body once and reuse the selector for every column.
    page = Selector(text=response.body)

    part_number = cleansplit(page.css("a.symbol.product-symbol::text"))
    manufacturer_part_number = cleansplit(
        page.css(".manufacturer>a:nth-of-type(2)>b::text"))
    manufacturer_name = cleansplit(
        page.css(".manufacturer>a:nth-of-type(1)>b::text"))
    description = cleansplit(page.css(".product>div>span::text"))
    # Stock figures need JavaScript execution, which Scrapy does not
    # perform, so nothing is scraped for them on this page.
    quantity_available = []
    if self.debug:
        print(quantity_available)
    image_url = cleansplit(page.css(".product_image>a>img::attr(src)"))

    # zip() stops at its shortest input, so a column with no matches would
    # silently drop every row. Pad such columns from part_number (see
    # listbalancer in customfunction.py).
    # NOTE(review): assumes listbalancer returns one placeholder per
    # element of its argument -- confirm against customfunction.py.
    if not quantity_available:
        quantity_available = listbalancer(part_number)
    if not image_url:
        image_url = listbalancer(part_number)
    if not description:
        description = listbalancer(part_number)

    rows = zip(part_number, manufacturer_part_number, manufacturer_name,
               description, quantity_available, image_url)
    for sku, mpn, maker, desc, qty, img in rows:
        # A fresh item per row: re-yielding one shared, mutated item makes
        # every emitted row alias the same object.
        item = ElectronicItem()
        item['manufacturer'] = maker
        item['manufacturer_part_number'] = mpn
        item['supplier'] = self.spider_name
        item['supplier_part_number'] = sku
        item['description'] = desc
        item['image_url'] = img
        item['product_url'] = response.url
        # Strip non-breaking spaces before the quantity is normalised.
        item['stock_qty'] = cleanqty(qty.replace(u'\xa0', u''))
        yield item

    next_url = response.css(
        'form>div.nawigator>a:last-of-type::attr(href)').extract_first()
    if self.debug:
        print("Next URL -> %s" % (response.urljoin(next_url)))
    # The last navigator link is a javascript stub when there is no
    # further page to request.
    if next_url and "javascript:void(0);" not in next_url:
        yield Request(response.urljoin(next_url),
                      callback=self.parse_search_result,
                      dont_filter=True)
def single_result(self, response):
    """Parse a single-product page into one item.

    Used when the response has been detected as a single item page.

    Args:
        response: the page response; its body is queried with XPath and
            its URL is stored on the item.

    Yields:
        A single populated ElectronicItem.
    """
    # Parse the body once and reuse the selector for every field.
    page = Selector(text=response.body)

    def first_text(query):
        # First match of the XPath (None when nothing matches), cleaned.
        return cleansplit(page.xpath(query).extract_first())

    part_number = first_text('//td[@id="reportPartNumber"]//text()')
    manufacturer_part_number = first_text('//h1[@itemprop="model"]//text()')
    manufacturer_name = first_text(
        '//h2[@class="lnkMfct"]//span//a//span//text()')
    description = first_text('//td[@itemprop="description"]//text()')
    quantity_available = first_text(
        '//td[@id="quantityAvailable"]//span[@id="dkQty"]//text()')
    image_url = first_text('//a[@class="bota-image-large"]//img//@src')

    item = ElectronicItem()
    item['manufacturer'] = manufacturer_name
    item['manufacturer_part_number'] = manufacturer_part_number
    item['supplier'] = self.spider_name
    item['supplier_part_number'] = part_number
    item['description'] = description
    item['product_url'] = response.url
    # The scraped src gets an explicit "http:" scheme. Only add it when an
    # image was actually found, otherwise the item would carry a bogus URL
    # built from an empty value.
    if image_url:
        item['image_url'] = "{0}{1}".format("http:", image_url)
    else:
        item['image_url'] = image_url
    item['stock_qty'] = cleanqty(quantity_available)
    yield item
def parse_search_result(self, response):
    """Parse a search-result page and follow its pagination link.

    Used when the response has been detected as a search-result page.

    Args:
        response: the page response; its body is queried with XPath and
            its URL is stored on every item.

    Yields:
        One ElectronicItem per result row, followed by a Request for the
        next page (handled by this same callback) when one is linked.
    """
    # Parse the body once and reuse the selector for every column.
    page = Selector(text=response.body)

    part_number = cleansplit(
        page.xpath("//p[@class='mfr-results']//a//text()"))
    # The manufacturer part number is always the same as the part number
    # on futureelectronics.com.
    manufacturer_part_number = part_number
    manufacturer_name = cleansplit(
        page.xpath("//div[@class='desc']//h5//text()"))
    description = cleansplit(
        page.xpath("//p[@class='mfr-results']//a//text()"))
    quantity_available = cleansplit(
        page.xpath("//span[@class='prices-in-stock-value']//text()"))
    image_url = cleansplit(
        page.xpath("//img[@class='productThumbnail']").xpath('@src'))

    # zip() stops at its shortest input, so a column with no matches would
    # silently drop every row. Pad such columns from part_number (see
    # listbalancer in customfunction.py).
    # NOTE(review): assumes listbalancer returns one placeholder per
    # element of its argument -- confirm against customfunction.py.
    if not quantity_available:
        quantity_available = listbalancer(part_number)
    if not image_url:
        image_url = listbalancer(part_number)
    if not description:
        description = listbalancer(part_number)

    rows = zip(part_number, manufacturer_part_number, manufacturer_name,
               description, quantity_available, image_url)
    for sku, mpn, maker, desc, qty, img in rows:
        # A fresh item per row: re-yielding one shared, mutated item makes
        # every emitted row alias the same object.
        item = ElectronicItem()
        item['manufacturer'] = maker
        item['manufacturer_part_number'] = mpn
        item['supplier'] = self.spider_name
        item['supplier_part_number'] = sku
        item['description'] = desc
        item['image_url'] = img
        item['product_url'] = response.url
        # Only non-breaking spaces are stripped here; unlike the other
        # fields nothing else normalises the quantity text.
        item['stock_qty'] = qty.replace(u'\xa0', u'')
        yield item

    next_url = response.xpath(
        '//a[@id="ctl00_PlaceHolderMain_results_pagingFooter_ctl08_HyperLink6"]//@href'
    ).extract_first()
    if self.debug:
        print("Next URL -> %s" % (next_url))
    if next_url:
        yield Request(response.urljoin(next_url),
                      callback=self.parse_search_result,
                      dont_filter=True)
def parse_search_result(self, response):
    '''
    Search Result Page Parser.
    Calls itself back automatically when a following pagination link exists.

    :param response: page response; its body is queried with CSS/XPath and
        its URL is stored on every item.
    :return: generator yielding one ElectronicItem per result row and,
        when a next page is linked, a Request for it.
    '''
    # Parse the body once and reuse the selector for every column.
    page = Selector(text=response.body)

    part_number = cleansplit(page.css("li.ttipartnumber a::text"))
    manufacturer_part_number = cleansplit(
        page.css("li.mfrpartnumber a::text"))
    manufacturer_name = cleansplit(page.css("li.manufacturer::text"))
    description = cleansplit(page.css("td.description::text"))
    quantity_available = cleansplit(page.css("td.availability::text"))
    image_url = cleansplit(
        page.xpath("//img[@class='large-photo']").xpath('@src'))

    # zip() stops at its shortest input, so a column with no matches would
    # silently drop every row. Pad such columns from part_number (see
    # listbalancer in customfunction.py).
    # NOTE(review): assumes listbalancer returns one placeholder per
    # element of its argument -- confirm against customfunction.py.
    if not quantity_available:
        quantity_available = listbalancer(part_number)
    if not image_url:
        image_url = listbalancer(part_number)
    if not description:
        description = listbalancer(part_number)

    rows = zip(part_number, manufacturer_part_number, manufacturer_name,
               description, quantity_available, image_url)
    for sku, mpn, maker, desc, qty, img in rows:
        # A fresh item per row: re-yielding one shared, mutated item makes
        # every emitted row alias the same object.
        item = ElectronicItem()
        item['manufacturer'] = maker
        item['manufacturer_part_number'] = mpn
        item['supplier'] = self.spider_name
        item['supplier_part_number'] = sku
        item['description'] = desc
        item['image_url'] = img
        item['product_url'] = response.url
        # Strip non-breaking spaces before the quantity is normalised.
        item['stock_qty'] = cleanqty(qty.replace(u'\xa0', u''))
        yield item

    # The next page is the link that follows the "current" pagination
    # anchor.
    next_url = response.xpath(
        "/html/body[@id='search_results']"
        "/div[@id='pageContent']/div[@id='content-wrapper']"
        "/div[@id='content-box']/form[@id='SearchAgainForm']"
        "/div[2]/div[@id='search-results']/div[@class='action-row']"
        "/div[@class='pagination']/strong/a[@class='current']"
        "/following-sibling::a/@href").extract_first()
    if self.debug:
        print("Next URL -> %s" % (next_url))
    if next_url:
        yield Request(response.urljoin(next_url),
                      callback=self.parse_search_result,
                      dont_filter=True)
def single_result(self, response):
    """Build one item from a single-product page.

    Used when the response has been detected as a single item page.

    Args:
        response: the page response; its body is queried with XPath and
            its URL is stored on the item.

    Yields:
        A single populated ElectronicItem.
    """
    page = Selector(text=response.body)

    # NOTE(review): these four queries hand the whole selector result to
    # cleansplit, while quantity and image below pass only the first
    # extracted match -- confirm that the difference is intended.
    supplier_part = cleansplit(page.xpath(
        "//ul[@class='keyDetailsLL']/li[1]/span[@class='keyValue bold']/text()"
    ))
    maker_part = cleansplit(page.xpath(
        "//ul[@class='keyDetailsLL']/li[3]/span[@class='keyValue bold']/span/text()"
    ))
    maker_name = cleansplit(page.xpath(
        "//ul[@class='keyDetailsLL']/li[2]/span[@class='keyValue']/a/span/text()"
    ))
    summary = cleansplit(
        page.xpath("//div[@class='rangeOverview'][1]/p[1]/text()"))

    # There is no direct quantity available on rscomponents; the
    # availability text is used instead.
    stock = cleansplit(
        page.xpath("//span[@class='availability']//text()").extract_first())
    picture = cleansplit(
        page.xpath("//img[@id='mainImage']/@src").extract_first())

    item = ElectronicItem()
    item['manufacturer'] = maker_name
    item['manufacturer_part_number'] = maker_part
    item['supplier'] = self.spider_name
    item['supplier_part_number'] = supplier_part
    item['description'] = summary
    item['stock_qty'] = cleanqty(stock)
    item['product_url'] = response.url
    item['image_url'] = picture
    yield item
def single_result(self, response):
    """Parse a single-product page into one item.

    Used when the response has been detected as a single item page.

    Args:
        response: the page response; its body is queried with XPath and
            its URL is stored on the item.

    Yields:
        A single populated ElectronicItem.
    """
    # Parse the body once and reuse the selector for every field.
    page = Selector(text=response.body)

    part_number = cleansplit(
        page.xpath("//div[@id='divMouserPartNum']/text()").extract_first())
    manufacturer_part_number = cleansplit(
        page.xpath("//div[@id='divManufacturerPartNum']/h1/text()")
        .extract_first())
    manufacturer_name = cleansplit(
        page.xpath("//a[@id='ctl00_ContentMain_hlnk10']/text()")
        .extract_first())
    description = cleansplit(
        page.xpath("//div[@id='divDes']/text()").extract_first())

    # extract_first() returns None when nothing matches, so the string
    # post-processing below must be skipped in that case instead of
    # raising AttributeError and losing the whole item.
    stock_text = page.xpath(
        "//div[@class='av-row'][1]/div[@class='av-col2']/text()"
    ).extract_first()
    if stock_text:
        # Keep only the first space-separated token of the stock text.
        stock_text = stock_text.split(" ")[0]
    quantity_available = cleansplit(stock_text)

    image_src = page.xpath(
        "//img[@id='ctl00_ContentMain_img1']/@src").extract_first()
    if image_src:
        # Rewrite the relative "../../../" prefix onto the spider's first
        # start URL.
        image_src = image_src.replace("../../../", self.start_urls[0] + "/")
    image_url = cleansplit(image_src)

    item = ElectronicItem()
    item['manufacturer'] = manufacturer_name
    item['manufacturer_part_number'] = manufacturer_part_number
    item['supplier'] = self.spider_name
    item['supplier_part_number'] = part_number
    item['description'] = description
    item['stock_qty'] = cleanqty(quantity_available)
    item['product_url'] = response.url
    item['image_url'] = image_url
    yield item
def parse_search_result(self, response):
    """Parse a search-result page and follow its pagination link.

    Used when the response has been detected as a search-result page.

    Args:
        response: the page response; its body is queried with XPath and
            its URL is stored on every item.

    Yields:
        One ElectronicItem per result row, followed by a Request for the
        next page (handled by this same callback) when one is linked.
    """
    # Parse the body once and reuse the selector for every column.
    page = Selector(text=response.body)

    part_number = cleansplit(page.xpath(
        "//span[@class='SearchResults-productName']/span//text()"))
    # The manufacturer part number is always the same as the part number
    # on arrow.com.
    manufacturer_part_number = part_number
    manufacturer_name = cleansplit(page.xpath(
        "//a[@class='SearchResults-productManufacturer']//text()"))
    description = cleansplit(page.xpath(
        "//td[@class='SearchResults-column SearchResults-column--description']"
        "//span//text()"))
    quantity_available = cleansplit(page.xpath(
        "//span[@class='SearchResults-stock']//span//following-sibling::text()"))
    image_url = cleansplit(
        page.xpath("//img[contains(@class, 'SearchResults-image')]")
        .xpath('@src'))

    # zip() stops at its shortest input, so a column with no matches would
    # silently drop every row. Pad such columns from part_number (see
    # listbalancer in customfunction.py).
    # NOTE(review): assumes listbalancer returns one placeholder per
    # element of its argument -- confirm against customfunction.py.
    if not quantity_available:
        quantity_available = listbalancer(part_number)
    if not image_url:
        image_url = listbalancer(part_number)
    if not description:
        description = listbalancer(part_number)

    rows = zip(part_number, manufacturer_part_number, manufacturer_name,
               description, quantity_available, image_url)
    for sku, mpn, maker, desc, qty, img in rows:
        # A fresh item per row: re-yielding one shared, mutated item makes
        # every emitted row alias the same object.
        item = ElectronicItem()
        item['manufacturer'] = maker
        item['manufacturer_part_number'] = mpn
        item['supplier'] = self.spider_name
        item['supplier_part_number'] = sku
        item['description'] = desc
        # The scraped src gets an explicit "http:" scheme; a missing or
        # placeholder image is passed through untouched.
        item['image_url'] = "{0}{1}".format("http:", img) if img else img
        item['product_url'] = response.url
        item['stock_qty'] = cleanqty(qty)
        yield item

    next_url = response.xpath('//link[@rel="next"]/@href').extract_first()
    if self.debug:
        print("Next URL -> %s" % (next_url))
    if next_url:
        yield Request(response.urljoin(next_url),
                      callback=self.parse_search_result,
                      dont_filter=True)
def single_result(self, response):
    """Parse a single-product page into one item.

    Used when the response has been detected as a single item page.

    Args:
        response: the page response; its body is queried with CSS/XPath
            and its URL is stored on the item.

    Yields:
        A single populated ElectronicItem.
    """
    # Parse the body once and reuse the selector for every field.
    page = Selector(text=response.body)

    part_number = cleansplit(
        page.css('dd[itemprop="http://schema.org/sku"]::text')
        .extract_first())
    manufacturer_part_number = cleansplit(
        page.css('dd[itemprop="mpn"]::text').extract_first())
    manufacturer_name = cleansplit(
        page.css('span[itemprop="http://schema.org/manufacturer"]::text')
        .extract_first())
    description = cleansplit(
        page.css('div[itemprop="http://schema.org/description"]::text')
        .extract_first())
    # Original author's note: quantity stock available is always
    # 29,050,000 for "yes".
    # NOTE(review): presumably applied inside cleanqty -- confirm.
    quantity_available = cleansplit(
        page.xpath("//span[@class='availability']//text()").extract_first())
    image_url = cleansplit(
        page.css("img[id='productMainImage']::attr(src)").extract_first())

    item = ElectronicItem()
    item['manufacturer'] = manufacturer_name
    item['manufacturer_part_number'] = manufacturer_part_number
    item['supplier'] = self.spider_name
    item['supplier_part_number'] = part_number
    item['description'] = description
    item['stock_qty'] = cleanqty(quantity_available)
    item['product_url'] = response.url
    item['image_url'] = image_url
    yield item
def single_result(self, response):
    """Parse a single-product page into one item.

    Used when the response has been detected as a single item page.

    Args:
        response: the page response; its body is queried with XPath and
            its URL is stored on the item.

    Yields:
        A single populated ElectronicItem.
    """
    # Parse the body once and reuse the selector for every field.
    page = Selector(text=response.body)

    def first_text(query):
        # First match of the XPath (None when nothing matches), cleaned.
        return cleansplit(page.xpath(query).extract_first())

    # Text following the <b> label inside p.ref, up to the first <br>.
    part_number = first_text(
        '//p[@class="ref"]//b//following-sibling::text()[not(preceding-sibling::br) and not(self::br)]'
    )
    # The same value is used for both part-number fields on this page.
    manufacturer_part_number = part_number
    # Was written with a malformed leading "///"; normalised to "//".
    manufacturer_name = first_text('//div[@id="product-desc"]/h2//text()')
    description = first_text(
        '//div[@id="product-desc"]/p[@class="desc"]//text()')
    # Original author's note: quantity stock available is always
    # 29,050,000 for "yes".
    # NOTE(review): presumably applied inside cleanqty -- confirm.
    quantity_available = first_text("//td[@class='qty']//text()")
    # "//img" rather than "/img": an absolute "/img" step only matches an
    # <img> document root, which never exists in an HTML page, so the
    # image was never found.
    image_url = first_text('//img[@id="previewedMEDImage"]/@src')

    item = ElectronicItem()
    item['manufacturer'] = manufacturer_name
    item['manufacturer_part_number'] = manufacturer_part_number
    item['supplier'] = self.spider_name
    item['supplier_part_number'] = part_number
    item['description'] = description
    item['stock_qty'] = cleanqty(quantity_available)
    item['product_url'] = response.url
    item['image_url'] = image_url
    yield item
def parse_search_result(self, response):
    """Parse a search-result table and follow its "Next" link.

    Args:
        response: the page response; its body is queried with XPath and
            its URL is stored on every item.

    Yields:
        One ElectronicItem per result row, followed by a Request for the
        next page (handled by this same callback) when one is linked.
    """
    # Parse the body once and reuse the selector for every column.
    page = Selector(text=response.body)

    key_part_number = cleansplit(
        page.xpath('//td[@id="reportPartNumber"]//text()'))
    manufacturer_part_number = cleansplit(page.xpath(
        '//td[contains(@class, "tr-mfgPartNumber")]//a//span//text()'))
    manufacturer_name = cleansplit(page.xpath(
        '//td[contains(@class, "tr-vendor")]//span//a//span//text()'))
    description = cleansplit(
        page.xpath('//td[contains(@class, "tr-description")]//text()'))
    quantity_available = cleansplit(page.xpath(
        '//td[contains(@class, "tr-qtyAvailable ptable-param")]//span//text()'
    ))
    image_url = cleansplit(
        page.xpath('//img[contains(@class, "pszoomer")]').xpath('@src'))

    # zip() stops at its shortest input, so only rows for which every
    # column produced a value are emitted.
    rows = zip(key_part_number, manufacturer_part_number, manufacturer_name,
               description, quantity_available, image_url)
    for sku, mpn, maker, desc, qty, img in rows:
        # A fresh item per row: re-yielding one shared, mutated item makes
        # every emitted row alias the same object.
        item = ElectronicItem()
        item['manufacturer'] = maker
        item['manufacturer_part_number'] = mpn
        item['supplier'] = self.spider_name
        item['supplier_part_number'] = sku
        item['description'] = desc
        # The scraped src gets an explicit "http:" scheme; an empty value
        # is passed through untouched.
        item['image_url'] = "{0}{1}".format("http:", img) if img else img
        item['product_url'] = response.url
        item['stock_qty'] = cleanqty(qty)
        yield item

    next_url = response.xpath('//a[@class="Next"]/@href').extract_first()
    if self.debug:
        print("Next URL -> %s" % (next_url))
    if next_url:
        yield Request(response.urljoin(next_url),
                      callback=self.parse_search_result,
                      dont_filter=True)
def single_result(self, response):
    """Build one item from a single-product page.

    Used when the response has been detected as a single item page.

    Args:
        response: the page response; its body is queried with XPath and
            its URL is stored on the item.

    Yields:
        A single populated ElectronicItem.
    """
    page = Selector(text=response.body)

    def grab(query):
        # First match of the XPath, passed through cleansplit.
        return cleansplit(page.xpath(query).extract_first())

    supplier_part = grab('//h1[@class="Product-Summary-Name"]//text()')
    maker_part = grab('//meta[@itemprop="mpn"]//@content')
    maker_name = grab('//p[@itemprop="brand"]//a//text()')
    summary = grab('//p[@itemprop="description"]//text()')
    stock = grab(
        "//li[contains(@class,'BuyingOptions-option')]//@data-quantity")
    picture = grab('//img[@class="Product-Summary-Image"]//@src')

    item = ElectronicItem()
    item['manufacturer'] = maker_name
    item['manufacturer_part_number'] = maker_part
    item['supplier'] = self.spider_name
    item['supplier_part_number'] = supplier_part
    item['description'] = summary
    item['stock_qty'] = cleanqty(stock)
    item['product_url'] = response.url
    # Prefix the scheme only when the page actually had an image; a
    # missing image is stored as whatever cleansplit returned.
    item['image_url'] = (
        "{0}{1}".format("http:", picture) if picture else picture)
    yield item
def single_result(self, response):
    '''
    Single result Parser

    :param response: page response; its body is queried with CSS/XPath and
        its URL is stored on the item.
    :return: generator yielding a single populated ElectronicItem.
    '''
    # Parse the body once and reuse the selector for every field.
    page = Selector(text=response.body)

    part_number = cleansplit(
        page.css('td#ttiPartNumber strong::text').extract_first())
    manufacturer_part_number = cleansplit(
        page.css('td#manufacturerPartNumber strong::text').extract_first())
    manufacturer_name = cleansplit(
        page.css('td#manufacturer::text').extract_first())
    description = cleansplit(
        page.css('td#partDescription::text').extract_first())
    # Original author's note: quantity stock available is always
    # 29,050,000 for "yes".
    # NOTE(review): presumably applied inside cleanqty -- confirm.
    quantity_available = cleansplit(
        page.xpath("//td[@class='val']//text()").extract_first())
    # "//span" rather than "/span": an absolute "/span" step only matches
    # a <span> document root, which never exists in an HTML page, so the
    # image was never found.
    image_url = cleansplit(
        page.xpath("//span[@class='photo-holder']/img/@src")
        .extract_first())

    item = ElectronicItem()
    item['manufacturer'] = manufacturer_name
    item['manufacturer_part_number'] = manufacturer_part_number
    item['supplier'] = self.spider_name
    item['supplier_part_number'] = part_number
    item['description'] = description
    item['stock_qty'] = cleanqty(quantity_available)
    item['product_url'] = response.url
    item['image_url'] = image_url
    yield item
def parse_search_result(self, response):
    """Parse a search-result page and follow its pagination link.

    Used when the response has been detected as a search-result page.

    Args:
        response: the page response; its body is queried with XPath and
            its URL is stored on every item.

    Yields:
        One ElectronicItem per result row, followed by a Request for the
        next page (handled by this same callback) when one is linked.
        self.pn is incremented each time a next page is followed.
    """
    # Parse the body once and reuse the selector for every column.
    page = Selector(text=response.body)

    part_number = cleansplit(page.xpath(
        "//a[@title='Click to view additional information on this product.']//text()"
    ))
    manufacturer_part_number = cleansplit(
        page.xpath("//div[@class='mfrDiv']/a/text()"))
    manufacturer_name = cleansplit(
        page.xpath("//a[contains(@id, 'lnkSupplier')]/text()"))
    description = cleansplit(page.xpath(
        "//a[contains(@id, 'lnkSupplier')]/../following-sibling::td/text()"
    ))
    quantity_available = cleansplit(
        page.xpath("//span[contains(@id,'lnkAvailability')]/text()"))
    if self.debug:
        print(quantity_available)
    # NOTE(review): only rows with class SearchResultsRowOdd are matched
    # here, so this column may be shorter than the others -- confirm
    # whether the remaining rows use a different class.
    image_url = cleansplit(
        page.xpath("//tr[@class='SearchResultsRowOdd']/td/a/img/@src"))

    # zip() stops at its shortest input, so a column with no matches would
    # silently drop every row. Pad such columns from part_number (see
    # listbalancer in customfunction.py).
    # NOTE(review): assumes listbalancer returns one placeholder per
    # element of its argument -- confirm against customfunction.py.
    if not quantity_available:
        quantity_available = listbalancer(part_number)
    if not image_url:
        image_url = listbalancer(part_number)
    if not description:
        description = listbalancer(part_number)

    rows = zip(part_number, manufacturer_part_number, manufacturer_name,
               description, quantity_available, image_url)
    for sku, mpn, maker, desc, qty, img in rows:
        # A fresh item per row: re-yielding one shared, mutated item makes
        # every emitted row alias the same object.
        item = ElectronicItem()
        item['manufacturer'] = maker
        item['manufacturer_part_number'] = mpn
        item['supplier'] = self.spider_name
        item['supplier_part_number'] = sku
        item['description'] = desc
        # Image paths are prefixed with the spider's first start URL; a
        # missing or placeholder image is passed through untouched.
        item['image_url'] = (
            "{0}{1}".format(self.start_urls[0], img) if img else img)
        item['product_url'] = response.url
        # Drop the "In Stock" label so only the figure is normalised.
        item['stock_qty'] = cleanqty(qty.replace('In Stock', ''))
        yield item

    next_url = response.xpath(
        "//a[@id='ctl00_ContentMain_PagerTop_lnkNext']/@href"
    ).extract_first()
    if self.debug:
        print("Next URL -> %s" % (next_url))
    if next_url:
        self.pn += 1
        yield Request(response.urljoin(next_url),
                      callback=self.parse_search_result,
                      dont_filter=True)
def parse_search_result(self, response):
    """Parse a search-result page and request the following page.

    Used when the response has been detected as a search-result page.

    Args:
        response: the page response; its body is queried with XPath/CSS
            and its URL is stored on every item.

    Yields:
        One ElectronicItem per result row, followed by a Request for the
        next page (handled by this same callback) when a next link is
        present. self.pn is incremented each time a next page is followed.
    """
    # Parse the body once and reuse the selector for every column.
    page = Selector(text=response.body)

    part_number = cleansplit(page.xpath(
        "//div[@class='partColContent']/ul[@class='viewDescList']/li[1]/a[@class='primarySearchLink']/text()"
    ))
    manufacturer_part_number = cleansplit(page.xpath(
        "//ul[@class='viewDescList']/li[3]/span[@class='defaultSearchText']/text()"
    ))
    manufacturer_name = cleansplit(page.xpath(
        "//ul[@class='viewDescList']/li[2]/a[@class='secondarySearchLink']/text()"
    ))
    description = cleansplit(page.xpath(
        "//div[@class='srDescDiv']/a[@class='primarySearchLink'][1]/text()"
    ))
    # Quantity is not found on rscomponents, so this is normally empty and
    # gets padded below.
    quantity_available = cleansplit(page.css("span.inStockBold::text"))
    image_url = cleansplit(
        page.xpath("//div[@class='viewsImage']/a/img/@src"))

    # zip() stops at its shortest input, so a column with no matches would
    # silently drop every row. Pad such columns from part_number (see
    # listbalancer in customfunction.py).
    # NOTE(review): assumes listbalancer returns one placeholder per
    # element of its argument -- confirm against customfunction.py.
    if not quantity_available:
        quantity_available = listbalancer(part_number)
    if not image_url:
        image_url = listbalancer(part_number)
    if not description:
        description = listbalancer(part_number)

    rows = zip(part_number, manufacturer_part_number, manufacturer_name,
               description, quantity_available, image_url)
    for sku, mpn, maker, desc, qty, img in rows:
        # A fresh item per row: re-yielding one shared, mutated item makes
        # every emitted row alias the same object.
        item = ElectronicItem()
        item['manufacturer'] = maker
        item['manufacturer_part_number'] = mpn
        item['supplier'] = self.spider_name
        item['supplier_part_number'] = sku
        item['description'] = desc
        # The scraped src gets an explicit "http:" scheme; a missing or
        # placeholder image is passed through untouched.
        item['image_url'] = "{0}{1}".format("http:", img) if img else img
        item['product_url'] = response.url
        # Strip non-breaking spaces before the quantity is normalised.
        item['stock_qty'] = cleanqty(qty.replace(u'\xa0', u''))
        yield item

    next_url = response.xpath(
        "//a[@class='rightLink nextLink approverMessageTitle']/@href"
    ).extract_first()
    if self.debug:
        print("Next URL -> %s" % (next_url))
    if next_url:
        # The link itself is not followed: its presence only signals that
        # another page exists. The request URL is rebuilt from the query
        # and the incremented page counter.
        self.pn += 1
        yield Request(
            'http://uk.rs-online.com/web/c/?sra=oss&r=t&searchTerm=%s&pn=%s&rpp=2'
            % (self.query, self.pn),
            callback=self.parse_search_result,
            dont_filter=True)
def parse_search_result(self, response):
    """Parse a search-result page and follow its pagination link.

    Used when the response has been detected as a search-result page.

    Args:
        response: the page response; its body is queried with XPath and
            its URL is stored on every item.

    Yields:
        One ElectronicItem per result row, followed by a Request for the
        next page (handled by this same callback) when one is linked.
    """
    # Parse the body once and reuse the selector for every column.
    page = Selector(text=response.body)

    part_number = cleansplit(page.xpath("//meta[@itemprop='sku']/@content"))
    manufacturer_part_number = cleansplit(
        page.xpath("//meta[@itemprop='mpn']/@content"))
    manufacturer_name = cleansplit(
        page.xpath("//td[@class='oc_row']/div/img/@title"))
    description = cleansplit(
        page.xpath("//span[@itemprop='description']/text()"))
    quantity_available = cleansplit(
        page.xpath("//table[1]/tbody[1]/tr/td[5]//text()"))
    image_url = cleansplit(
        page.xpath("//table[1]/tbody[1]/tr/td[2]/img[1]/@src"))

    # zip() stops at its shortest input, so a column with no matches would
    # silently drop every row. Pad such columns from part_number (see
    # listbalancer in customfunction.py).
    # NOTE(review): assumes listbalancer returns one placeholder per
    # element of its argument -- confirm against customfunction.py.
    if not quantity_available:
        quantity_available = listbalancer(part_number)
    if not image_url:
        image_url = listbalancer(part_number)
    if not description:
        description = listbalancer(part_number)

    rows = zip(part_number, manufacturer_part_number, manufacturer_name,
               description, quantity_available, image_url)
    for sku, mpn, maker, desc, qty, img in rows:
        # A fresh item per row: re-yielding one shared, mutated item makes
        # every emitted row alias the same object.
        item = ElectronicItem()
        item['manufacturer'] = maker
        item['manufacturer_part_number'] = mpn
        item['supplier'] = self.spider_name
        item['supplier_part_number'] = sku
        item['description'] = desc
        item['image_url'] = img
        item['product_url'] = response.url
        # Strip non-breaking spaces before the quantity is normalised.
        item['stock_qty'] = cleanqty(qty.replace(u'\xa0', u''))
        yield item

    # The next-page link is taken from a fixed position in the first
    # pagination <nav>.
    next_url = response.xpath(
        "//nav[1]/ul[1]/li[4]/a[1]/@href").extract_first()
    if self.debug:
        print("Next URL -> %s" % (next_url))
    if next_url:
        yield Request(response.urljoin(next_url),
                      callback=self.parse_search_result,
                      dont_filter=True)
def parse_search_result(self, response):
    """Parse a search-result page and follow its pagination link.

    Used when the response has been detected as a search-result page.

    Args:
        response: the page response; its body is queried with CSS/XPath
            and its URL is stored on every item.

    Yields:
        One ElectronicItem per result row, followed by a Request for the
        next page (handled by this same callback) when one is linked.
    """
    # Parse the body once and reuse the selector for every column.
    page = Selector(text=response.body)

    part_number = cleansplit(page.css("p.sku a::text"))
    manufacturer_part_number = cleansplit(
        page.css("td.productImage.mftrPart a::text"))
    manufacturer_name = cleansplit(
        page.css("td.description a p:first-of-type::text"))
    description = cleansplit(
        page.css("td.description a p:nth-of-type(2)::text"))
    quantity_available = cleansplit(page.css("span.inStockBold::text"))
    image_url = cleansplit(
        page.xpath("//img[@class='productThumbnail']").xpath('@src'))

    # zip() stops at its shortest input, so a column with no matches would
    # silently drop every row. Pad such columns from part_number (see
    # listbalancer in customfunction.py).
    # NOTE(review): assumes listbalancer returns one placeholder per
    # element of its argument -- confirm against customfunction.py.
    if not quantity_available:
        quantity_available = listbalancer(part_number)
    if not image_url:
        image_url = listbalancer(part_number)
    if not description:
        description = listbalancer(part_number)

    rows = zip(part_number, manufacturer_part_number, manufacturer_name,
               description, quantity_available, image_url)
    for sku, mpn, maker, desc, qty, img in rows:
        # A fresh item per row: re-yielding one shared, mutated item makes
        # every emitted row alias the same object.
        item = ElectronicItem()
        item['manufacturer'] = maker
        item['manufacturer_part_number'] = mpn
        item['supplier'] = self.spider_name
        item['supplier_part_number'] = sku
        item['description'] = desc
        item['image_url'] = img
        item['product_url'] = response.url
        # Strip non-breaking spaces before the quantity is normalised.
        item['stock_qty'] = cleanqty(qty.replace(u'\xa0', u''))
        yield item

    # The next page is the link inside the span that follows the
    # "current" pagination span.
    next_url = response.xpath(
        "//span[@class='current']"
        "/following-sibling::span/a/@href").extract_first()
    if self.debug:
        print("Next URL -> %s" % (next_url))
    if next_url:
        yield Request(response.urljoin(next_url),
                      callback=self.parse_search_result,
                      dont_filter=True)