def parse(self, response):
    """Extract the embedded ``mainContestObj`` JSON blob from the page.

    Returns a ProductItem whose ``json`` field holds the raw JSON string,
    or None (implicitly) when the blob is not present in the page.
    """
    self.log('crawl page: {}'.format(response.url), logging.INFO)
    # BUG FIX: response.body is bytes; searching it with a str pattern
    # raises TypeError under Python 3.  response.text is the decoded
    # page body.  Also use a raw string for the regex per convention.
    match = re.search(r"\['mainContestObj'\] = ({.*?});\n", response.text)
    if match:
        item = ProductItem()
        item['json'] = match.group(1)
        return item
def get_item_contents(self, response):
    """Parse a product detail page into a ProductItem.

    Scrapes id/title/category/price/description/addresses/net-contents
    from the HTML, and "usually bought next" recommendations from the
    redux state JSON embedded on the <body> tag.  ``reviews`` is passed
    through from ``response.meta`` when a prior callback collected them.
    """
    product_id = response.url.rsplit('/', 1)[1]
    image_url = response.xpath(
        '//div[@class="product-image__container"]/img/@src').get()
    # BUG FIX: rfind returns -1 when there is no '?', which used to chop
    # the last character off the URL; also guard a missing image (None).
    if image_url:
        query_pos = image_url.rfind('?')
        if query_pos != -1:
            image_url = image_url[:query_pos]
    product_title = response.xpath('//h1/text()').get()
    product_category = response.xpath(
        '(//span[contains(@class, "styled__StandaloneContainer")])[3]/a/span/span/text()'
    ).get()
    product_price = response.xpath(
        '//span[@data-auto="price-value"]/text()').get()
    product_description = '\n'.join(
        response.xpath(
            '//div[@id="product-description"]/*[not(self::h2)]//text()').
        getall())
    name_and_address = '\n'.join(
        response.xpath(
            '//div[@id="manufacturer-address"]/ul/li/text()').getall())
    return_address = '\n'.join(
        response.xpath(
            '//div[@id="return-address"]/ul/li/text()').getall())
    net_contents = response.xpath(
        '//div[@id="net-contents"]/p/text()').get()
    usually_bought_next = list()
    # Recommendations live in the serialized redux state on <body>.
    json = loads(
        response.xpath(
            '//body[@id="data-attributes"]/@data-redux-state').get())
    try:
        for item in json['productDetails']['recommendations']['data']:
            for _item in json['productDetails']['recommendations']['data'][
                    item]['productItems']:
                usually_bought_next.append({
                    'product_url': furl(self.product_template).join(
                        _item['product']['id']).url,
                    'title': _item['product']['title'],
                    'product_image_url': _item['product']['defaultImageUrl'],
                    'price': _item['product']['price']
                })
    except TypeError:
        # Redux state absent or not the expected shape: signal "unknown"
        # rather than an empty list.
        usually_bought_next = None
    product = ProductItem()
    product['product_id'] = product_id
    product['url'] = response.url
    product['image_url'] = image_url
    product['title'] = product_title
    product['category'] = product_category
    product['price'] = product_price
    product['description'] = product_description
    product['name_and_address'] = name_and_address
    product['return_address'] = return_address
    product['net_contents'] = net_contents
    product['reviews'] = response.meta.get('reviews', None)
    product['usually_bought_next'] = usually_bought_next
    yield product
def product_parse(self, response: HtmlResponse):
    """Populate a ProductItem from a product page via ItemLoader."""
    # Field name -> XPath, loaded in declaration order.
    field_xpaths = (
        ('title', '//h1[@slot="title"]/text()'),
        ('price', '//uc-pdp-price-view[@slot="primary-price"]/span/text()'),
        ('description', '//uc-pdp-section-vlimited/div/p/text()'),
        ('features', '//dl[@class="def-list"]/div/*/text()'),
        ('images', '//img[@alt="product image"]/@src'),
    )
    loader = ItemLoader(item=ProductItem(), response=response)
    for field_name, xpath_expr in field_xpaths:
        loader.add_xpath(field_name, xpath_expr)
    loader.add_value('url', response.url)
    yield loader.load_item()
def start():
    """Run ProductItem extraction over every HTML fixture in ../tests.

    Returns a list containing each ProductItem instance's ``__dict__``.
    """
    import glob
    import lxml.etree as etree
    result = []
    for file_name in glob.glob('../tests/*.html'):
        with open(file_name, 'r', encoding='utf-8') as f:
            html = f.read()
        # BUG FIX: build a fresh params dict per file — the original
        # reused one mutable dict across iterations, so every product
        # could end up referencing the last file's data.  Also fixes
        # the 'parmas' typo.
        params = {
            'html_code': html,
            'xpath_obj': etree.HTML(html),
        }
        product = ProductItem()
        product(params)
        result.append(product.__dict__)
    return result
def parse(self, response):
    """Build a ProductItem from a crawled page using PAGE_ITEM extraction.

    On a full parse every field is copied over; on a partial parse the
    item records which fields failed in ``fail_crawls`` and keeps the
    ones that succeeded.
    """
    item = ProductItem()
    page_item = PAGE_ITEM(response.text)
    page_item.get_Infos()
    item['uuid'] = response.meta['uuid']
    if page_item.parsed:
        item['status'] = True
        item['title'] = page_item.title
        item['price'] = page_item.price
        item['stars'] = page_item.stars
        item['best_sell_rank'] = page_item.best_sell_rank
    else:
        item['status'] = False
        item['fail_crawls'] = []
        # Copy each field that parsed; record the names of those that didn't.
        for field_name in ('title', 'price', 'stars', 'best_sell_rank'):
            value = getattr(page_item, field_name)
            if value is None:
                item['fail_crawls'].append(field_name)
            else:
                item[field_name] = value
    item['url'] = response.url
    item['class_name'] = response.meta['class_name']
    # Remove this URL's entry from the pending set for this host.
    l_rem(self.hostname, response.meta['value'])
    yield item
def parse(self, response):
    """Yield one item per review on the page, then follow the next page."""
    print("processing:" + response.url)
    for comment_sel in response.xpath('/html//div[@class="comment"]'):
        item = ProductItem()
        text = ' '.join(
            comment_sel.xpath('.//div[@class="comment_text"]/text()').extract())
        item['otziv'] = text.replace('\r', '')
        # Active star count <= 3 is treated as a negative review.
        stars = comment_sel.xpath(
            './/div[@class="author"]/div[@class="br-theme-css-stars"]'
            '/div[@class="br-widget"]/a[@class="br-active"]')
        item['target'] = 0 if len(stars) <= 3 else 1
        yield item
    next_page = response.css(
        'html body div#wrapper div#content div div#user_reviews '
        'div#more_reviews a::attr(href)').extract_first()
    if next_page:
        yield scrapy.Request(
            response.urljoin(next_page),
            callback=self.parse,
        )
def parse(self, response):
    """Parse a product page and emit store/product/feedback/order items.

    When the expected selectors are missing (anti-spider response), the
    product URL is re-queued as a fresh UrlItem instead.
    """
    self.log('product url: {}'.format(response.url), logging.INFO)
    try:
        store_url = response.css('.shop-name').xpath(
            'a/@href').extract()[0]
        self.log('crawl store url: {}'.format(store_url), logging.INFO)
        store_item = UrlItem()
        store_item['prefix'] = ProductSpider.prefix
        store_item['type'] = 'store'
        store_item['url'] = store_url
        yield store_item

        # The feedback iframe URL carries the product id as a query param.
        feedback_base_url = response.xpath(
            '//div[@id="feedback"]/iframe/@thesrc').extract()[0]
        parsed = urlparse.urlparse(feedback_base_url)
        product_id = urlparse.parse_qs(parsed.query)['productId'][0]

        # Ratings/order counts are best-effort: default to 0 when absent.
        # FIX: narrowed from a bare except to the failures these lines can
        # actually raise — extract()[0] (IndexError) and str.index (ValueError).
        try:
            percent_num = response.css('.percent-num').xpath(
                'text()').extract()[0]
            rantings_text = response.css('.rantings-num').xpath(
                'text()').extract()[0]
            rantings_num = rantings_text[1:rantings_text.index(' ')]
            order_text = response.css('.order-num').xpath(
                'text()').extract()[0]
            order_num = order_text[:order_text.index(' ')]
        except (IndexError, ValueError):
            percent_num = 0
            rantings_num = 0
            order_num = 0

        product_item = ProductItem()
        product_item['prefix'] = ProductSpider.prefix
        product_item['_id'] = product_id
        product_item['store'] = store_url
        product_item['url'] = response.url
        product_item['percent_num'] = percent_num
        product_item['rantings_num'] = rantings_num
        product_item['order_num'] = order_num
        yield product_item

        feedback_item = UrlItem()
        feedback_item['prefix'] = ProductSpider.prefix
        feedback_item['type'] = 'feedback'
        feedback_item['url'] = feedback_base_url
        yield feedback_item

        order_item = UrlItem()
        order_item['prefix'] = ProductSpider.prefix
        order_item['type'] = 'order'
        order_item[
            'url'] = 'http://feedback.aliexpress.com/display/evaluationProductDetailAjaxService.htm?productId={}&type=default'.format(
                product_id)
        yield order_item
    # FIX: was a bare except; Exception keeps the deliberate catch-all
    # anti-spider fallback without swallowing KeyboardInterrupt/SystemExit.
    except Exception:
        try:
            # Prefer the original URL before any redirect chain.
            product_url = response.meta['redirect_urls'][0]
        except (KeyError, IndexError):
            product_url = response.url
            self.log('strange product url: {}'.format(product_url),
                     logging.ERROR)
        finally:
            self.log(
                'meet anti-spider, back product: {}'.format(product_url),
                logging.INFO)
            url_item = UrlItem()
            url_item['prefix'] = ProductSpider.prefix
            url_item['type'] = 'product'
            url_item['url'] = product_url
            yield url_item