def parse_product(self, response):
    """Parse a product page and queue entries onto ``self.collect_products``.

    A product with options queues one entry per option tuple
    (option = (option_ids, name_suffix, price)); a product without
    options queues a single entry.  Each entry pairs the loaded Product
    item with the POST form data needed to re-request it with the
    chosen attributes.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    brand = hxs.select(
        "//*[contains(text(),'Designer:')]/text()").extract()
    # The node text looks like "Designer: <brand>".
    brand = brand[0].split(':')[1].strip() if brand else ''
    name = hxs.select(
        '//td[@class="cont_heading_td"]//h1/text()').extract()[0]
    identifier = hxs.select(
        '//input[@name="products_id"]/@value').extract()
    if identifier:
        identifier = identifier[0]
    else:
        # Fall back to the numeric id embedded in URLs like ".../foo-p-123.html".
        # Raw string + escaped dot: the previous pattern 'p-(\d+).html' let the
        # dot match any character.
        identifier = re.search(r'p-(\d+)\.html', response.url)
        if identifier:
            identifier = identifier.group(1)
        else:
            # Typo fix: "WIHTOUT" -> "WITHOUT".
            log.msg('PRODUCT WITHOUT IDENTIFIER: ' + response.url)
            return
    image_url = hxs.select('//a[@rel="fotografias"]/img/@src').extract()
    image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''
    category = hxs.select(
        '//td[@class="cont_heading_td"]/span/text()').extract()
    sku = hxs.select('//tr/td[contains(text(), "Ref: ")]/text()').re(
        'Ref: (.*)')
    price = hxs.select('//td[@class="preu"]/text()').extract()
    price = price[0] if price else '0'
    price = extract_price(price)
    # Loop-invariant: the out-of-stock marker does not depend on the option,
    # so evaluate the XPath once instead of once per option.
    out_of_stock = hxs.select(
        '//form[contains(@id, "cart_quantity_")]/img[contains(@alt, "OUT_STOCK")]'
    )
    options = self.get_options(response, price)
    if options:
        for option in options:
            loader = ProductLoader(item=Product(), response=response)
            # Strip the osCsid session id so URLs stay stable across sessions.
            loader.add_value('url', response.url.split('?osCsid=')[0])
            loader.add_value('name', name + option[1])
            loader.add_value('image_url', image_url)
            loader.add_value('brand', brand)
            loader.add_value('identifier', identifier + '-'.join(option[0]))
            loader.add_value('category', category)
            loader.add_value('sku', sku)
            loader.add_value('price', option[2])
            if out_of_stock:
                loader.add_value('stock', 0)
            formdata = {'products_id': identifier}
            # Map each selected option id to its attribute_<id> form field.
            for option_id in option[0]:
                attr_id = hxs.select('//select[option[@value="' + option_id
                                     + '"]]/@id').re(r'(\d+)')[0]
                formdata['attribute_' + attr_id] = option_id
            product = {'product': loader.load_item(), 'formdata': formdata}
            self.collect_products.append(product)
    else:
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        loader.add_value('image_url', image_url)
        loader.add_value('brand', brand)
        loader.add_value('identifier', identifier)
        loader.add_value('category', category)
        loader.add_value('sku', sku)
        loader.add_value('price', price)
        if out_of_stock:
            loader.add_value('stock', 0)
        formdata = {'products_id': identifier}
        product = {'product': loader.load_item(), 'formdata': formdata}
        self.collect_products.append(product)
def parse_category_products(self, response):
    """Record the product links found on a category page, keyed by category."""
    selector = HtmlXPathSelector(response)
    links_xpath = ('//div[contains(@class, "list-product")]'
                   '/a[contains(@class, "link")]/@href')
    product_links = selector.select(links_xpath).extract()
    self.category_products[response.meta['category']] = product_links
def parse(self, response):
    """Parse a proxy-list page, de-obfuscating the IP address column.

    The site hides parts of the IP address with CSS: an inline <style>
    block marks some class names display:none and others display:inline,
    and the address is split across <span>/<div>/text fragments.  Only
    the visible fragments are joined into the real IP.
    Yields one item per table row (plus anything the parent parse emits).
    """
    self.log('No item received for %s' % response.url)
    # Delegate to the base spider's parse first and forward its output.
    for elem in super(HideMyAssSpider, self).parse(response):
        yield elem
    hxs = HtmlXPathSelector(response)
    links = hxs.select('//tr[@class="altshade"]')
    for link in links:
        ipaddress_parts = link.select('td[2]/span')
        # The per-row <style> block defines which class names are hidden.
        style_text = ipaddress_parts.select('style/text()').extract()
        style_text = style_text[0].split('\n')
        # Class names between the leading '.' and the '{' of each CSS rule.
        display_none = [style[1:style.index('{')] for style in style_text
                        if 'none' in style]
        display_inline = [style[1:style.index('{')] for style in style_text
                          if 'inline' in style]
        display_none = set(display_none)
        display_inline = set(display_inline)
        ipaddress = []
        # Walk every fragment (element or bare text) of the IP cell.
        for ipaddress_part in ipaddress_parts.select('span|div|text()'):
            tag_class = tag_style = tag_name = None
            try:
                tag_class = ipaddress_part.select('@class').extract()
            except TypeError:
                # Workaround bug in lxml.etree: Argument 'element' has incorrect type (expected lxml.etree._Element, got _ElementStringResult)
                pass
            try:
                tag_style = ipaddress_part.select('@style').extract()
            except TypeError:
                # Workaround bug in lxml.etree (same as above) for text nodes.
                pass
            try:
                tag_name = ipaddress_part.select("name()")
            except TypeError:
                # Workaround bug in lxml.etree (same as above) for text nodes.
                pass
            # Element nodes expose text() children; bare text nodes are
            # extracted directly.
            if tag_name:
                tag_text = ipaddress_part.select('text()').extract()
            else:
                tag_text = ipaddress_part.extract()
            # Skip fragments hidden via inline style or a display:none class.
            if tag_style and 'none' in tag_style[0]:
                continue
            if tag_class and tag_class[0] in display_none:
                continue
            if isinstance(tag_text, list):
                tag_text = ''.join(tag_text)
            # Keep only the numeric octet pieces of the visible text.
            tag_texts = tag_text.split('.')
            for tag_text in tag_texts:
                tag_text = tag_text.strip()
                if not tag_text.isdigit():
                    continue
                ipaddress.append(tag_text)
        ipaddress = '.'.join(ipaddress)
        loader = WebsiteLoader(selector=link)
        loader.add_value('ipaddress', ipaddress)
        loader.add_xpath('port', 'td[3]/text()')
        loader.add_xpath('country', 'td[4]/span/text()')
        loader.add_xpath('_type', 'td[7]/text()')
        loader.add_xpath('anonimity', 'td[8]/text()')
        loader.add_value('url', response.url)
        item = loader.load_item()
        yield item
def parse_product(self, response):
    """Parse an Amazon product page and emit it only if it matches the CSV row.

    The page is accepted when its model number matches the CSV sku (in
    either direction via self.match_skus), at least one CSV name token
    appears in the page title, and the price passes valid_price.
    Otherwise the spider advances through the queues carried in
    response.meta: next search results, then the next results page,
    then the next search URL template.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_xpath('name', u'//span[@id="btAsinTitle"]/text()')
    loader.add_value('url', response.url)
    loader.add_xpath('image_url', u'//tr[@id="prodImageContainer"]//img/@src')
    if not loader.get_output_value(u'image_url'):
        # XPath found no image; retry with BeautifulSoup, which tolerates
        # the malformed markup on some product pages.
        soup = BeautifulSoup(response.body)
        image_url = soup.find(lambda tag: tag.name == u'img' and tag.
                              findParent(u'tr', id=u'prodImageContainer'))
        if image_url:
            loader.add_value('image_url', image_url.get(u'src'))
    loader.add_xpath(
        'brand',
        u'//span[@class="tsLabel" and contains(text(),"Brand")]/following-sibling::span/text()'
    )
    # Price appears under several different layouts; try them in order.
    loader.add_xpath('price', u'//b[@class="priceLarge"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="priceLarge"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="price"]/text()')
    sku = hxs.select(
        u'//li/b[contains(text(),"Item model number")]/../text()').extract(
        )
    if sku:
        sku = sku[0].strip()
    else:
        log.msg('No sku.')
    csv_sku = response.meta['sku'].strip()
    log.msg('SKU: [%s == %s]' % (sku.lower() if sku else u'None', csv_sku))
    csv_name = response.meta['name'].lower().split(u' ')
    site_name = loader.get_output_value('name').lower().split(u' ')
    log.msg(u'NAME: [%s == %s]' % (csv_name, site_name))
    # Loose name match: any CSV name token present in the page title.
    name_match = any(map(lambda elem: elem in site_name, csv_name))
    if sku and (self.match_skus(sku, csv_sku) or self.match_skus(csv_sku,
                                                                 sku)) and name_match:
        if valid_price(response.meta['price'],
                       loader.get_output_value('price')):
            loader.add_value('sku', response.meta['sku'])
            loader.add_value('identifier', response.meta['sku'].lower())
            # if loader.get_output_value('price'):
            yield loader.load_item()
    else:
        # No match: pop the next candidate from the queues in meta.
        meta = response.meta
        next_result = meta['next_results']
        if next_result:
            next_result = next_result[0]
            meta['next_results'] = meta['next_results'][1:]
            yield Request(next_result, callback=self.parse_product,
                          meta=response.meta)
        elif meta.get('next_page'):
            # NOTE(review): no explicit callback here, so this falls back to
            # the spider's default parse — confirm that is intended.
            next_page = meta['next_page']
            yield Request(next_page, meta=response.meta)
        elif meta.get('search_urls'):
            meta = response.meta
            search_url = meta['search_urls'][0]
            meta['search_urls'] = meta['search_urls'][1:]
            yield Request(search_url % {'q': meta['sku']}, meta=meta)
def parse_question_page(self, response):
    """Parse a Yahoo Answers question page.

    Yields one YahooAnswer item for the best answer (when present), one
    per regular answer, and finally the YahooQuestion item itself.
    The XPath roots (self.question_xpath, self.answer_xpath, etc.) are
    spider attributes defined elsewhere.
    """
    hxs = HtmlXPathSelector(response)
    question_loader = XPathItemLoader(item=YahooQuestion(), selector=hxs)
    answers_loader = XPathItemLoader(item=YahooAnswer(), selector=hxs)
    # get question id from the qid query parameter of the request URL
    question_loader.add_value('question_id', ''.join(
        parse_qs(urlparse(response.request.url).query)['qid']))
    # print question_loader.get_output_value('question_id')
    # get question title
    question_loader.add_xpath('question_title', self.question_xpath +
                              '//h1[contains(@class, "subject")]/text()')
    # get question content
    question_loader.add_xpath('question_content', self.question_xpath +
                              '//div[contains(@class, "content")]/text()')
    # get question status
    question_loader.add_xpath('status', self.question_xpath +
                              '//div[@class="hd"]//h2/text()')
    # get question url (rebuilt canonically from the question id)
    question_loader.add_value('question_url', ''.join([
        'http://answers.yahoo.com/question/index?qid=',
        question_loader.get_output_value('question_id')
    ]))
    # get question date
    question_loader.add_xpath('asking_date', ''.join([
        self.question_xpath,
        '//div[@class="qa-container"]//ul[@class="meta"]',
        '/li[1]/abbr/@title'
    ]))
    # import date (timestamp of this crawl, local time)
    question_loader.add_value('import_date',
                              time.strftime("%Y-%m-%d %A %X %Z",
                                            time.localtime()))
    # asking user
    question_loader.add_value('asker', self.get_user(hxs.select(''.join([
        self.question_xpath,
    ]))))
    # interesting ("star") marks
    question_loader.add_xpath('number_of_interesting_marks', ''.join([
        '//ul[@id="yan-question-tools"]',
        '//li[@id="yan-starthis"]',
        '//span[contains(@class,"star-count")]/text()'
    ]))
    # number of answers
    question_loader.add_xpath('number_of_answers', ''.join([
        self.answer_xpath,
        '/div[@class="hd"]',
        '/h3/text()'
    ]))
    # begin to parse answers
    # category of the question item
    question_loader.add_xpath('category', ''.join([self.category_xpath,
                                                   '//li//a//text()']))
    # best answer
    best_answer_selector = hxs.select(self.best_answer_xpath)
    if best_answer_selector:
        yield self.get_answer(best_answer_selector, question_loader)
    # other answers
    for ans_selector in hxs.select(self.answer_xpath).select(
            './/li/div[@class="answer"]'):
        # self.get_answer(ans_selector, question_loader)
        yield self.get_answer(ans_selector, question_loader)
    yield question_loader.load_item()
def parse_detail(self, response):
    """Parse a freight-line detail page into a WuTongCarLineItem.

    Every field falls back to '' when the corresponding table cell is
    missing or unparseable, so a partially filled page never raises.
    Returns a single-element list of items.
    """
    items = []
    item = WuTongCarLineItem()
    hxs = HtmlXPathSelector(response)
    item['url'] = response.url
    #item['ruku_time'] = int(time.time())
    company_name = hxs.select(
        './/*[@id="line_info"]/table/tr[1]/td/text()').extract()
    item['company_name'] = company_name[0].strip() if company_name else ''
    contact_name = hxs.select(
        './/*[@id="line_info"]/table/tr[4]/td/text()').extract()
    item['contact_name'] = contact_name[0].strip() if contact_name else ''
    # Route cell looks like "<origin>---><destination>".
    # Bug fix: previously .extract()[0] was called unguarded (IndexError on
    # a missing cell) and m.group() unguarded (AttributeError on no match).
    from_to = hxs.select(
        './/*[@id="line_info"]/table/tr[2]/td[1]/text()').extract()
    m = re.match(r'(.*)--->(.*)', from_to[0].strip()) if from_to else None
    if m:
        item['start_place'] = m.group(1)
        item['to_place'] = m.group(2)
    else:
        item['start_place'] = ''
        item['to_place'] = ''
    tel = hxs.select(
        './/*[@id="line_info"]/table/tr[5]/td[1]/text()').extract()
    item['tel'] = tel[0].strip() if tel else ''
    phone_contact = hxs.select(
        './/*[@id="line_info"]/table/tr[5]/td[2]/text()').extract()
    if phone_contact:
        # Cell looks like "<Chinese label>:<number>".
        m = re.match(u"([\u4e00-\u9fa5]+):(.*)",
                     phone_contact[0].decode("utf-8"))
        # Guard: a cell that does not match the pattern yields '' instead
        # of raising AttributeError on m.group.
        item['phone_contact'] = m.group(2) if m else ''
    else:
        item['phone_contact'] = ''
    addr = hxs.select(
        './/*[@id="line_info"]/table/tr[6]/td/text()').extract()
    item['addr'] = addr[0].strip() if addr else ''
    trans_type = hxs.select(
        './/*[@id="line_info"]/table/tr[3]/td/text()').extract()
    item['trans_type'] = trans_type[0].strip() if trans_type else ''
    remark = hxs.select(
        './/*[@id="line_info"]/table/tr[7]/td//text()').extract()
    item['remark'] = "".join(remark).strip() if remark else ''
    item['specia_lines'] = 1
    items.append(item)
    return items
def parse(self, response):
    """Yield a page request for every top-level nav category with sub-menus."""
    nav_xpath = '//div[@class="nav"]/ul/li[count(div)>0]/a/@href'
    for category_url in HtmlXPathSelector(response).select(nav_xpath).extract():
        yield Request(category_url, callback=self.parse_page)
def parse_product(self, response):
    """Parse a northseaworkwear.com product page.

    First forces the tax-excluded price view (re-requesting the page if
    the current customerTaxType is not "Excl"), then loads the base
    product and issues one add-to-cart FormRequest per attribute
    combination so parse_stock can determine availability.
    """
    # inspect_response(response, self)
    # return
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    # Ensure we are on the tax-excluded view; otherwise switch and re-parse.
    if not hxs.select('//select[@id="customerTaxType"]/option[@selected="selected"]').re('Excl'):
        url = hxs.select('//select[@id="customerTaxType"]/option[not (@selected)]/@value').extract()
        yield Request(urljoin(base_url, url[0]), callback=self.parse_product,
                      dont_filter=True, meta=response.meta)
        return
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('category', response.meta['category'])
    name = ''
    tmp = hxs.select('//h1[@itemprop="name"]/text()').extract()
    if tmp:
        name = tmp[0].strip()
    loader.add_value('name', name)
    tmp = hxs.select('//div[@class="gallery"]//a[1]/@href').extract()
    if tmp:
        loader.add_value('image_url', tmp[0])
    # Find brand by matching known brand names against the product name.
    for brand in self.brands:
        if brand.lower() in name.lower():
            loader.add_value('brand', brand)
            break
    # p = loader.load_item()
    tmp = hxs.select('//input[contains(@id,"add-to-cart-button-")]/@data-productid').extract()
    if tmp:
        # identifier = product['identifier']
        loader.add_value('identifier', tmp[0])
    tmp = hxs.select('//p/span[strong="Product Code:"]/text()').extract()
    if tmp:
        loader.add_value('sku', tmp[0].strip())
    tmp = hxs.select('//span[@itemprop="price"]/text()').extract()
    if tmp:
        price = extract_price(tmp[0].strip().split()[0])
        loader.add_value('price', price)
    product = loader.load_item()
    # Add-to-cart endpoint used to probe stock in parse_stock.
    url_post = 'http://www.northseaworkwear.com/addproducttocart/details/%s/1' % product['identifier']
    qty = '1'
    tmp = hxs.select('//input[contains(@class,"qty-input")]/@value').extract()
    if tmp:
        qty = tmp[0]
    selections = hxs.select('//div[@class="attributes"]//select')
    if not selections:
        # No attribute dropdowns: probe stock for the base product only.
        # loader.add_value('stock', 0)
        # yield loader.load_item()
        formdata = {'addtocart_%s.EnteredQuantity' % product['identifier']: qty}
        # Fresh cookiejar per request so cart contents don't leak between probes.
        self.cookie_num += 1
        yield FormRequest(url_post, formdata=formdata,
                          meta={'item': product, 'cookiejar': self.cookie_num},
                          dont_filter=True, callback=self.parse_stock)
        return
    # Collect (field_name, value, label) for every non-empty option of
    # every attribute dropdown.
    attrs = []
    for sel in selections:
        attr_name = ''
        tmp = sel.select('@name').extract()
        if tmp:
            attr_name = tmp[0]
        attr_values = []
        for option in sel.select('option'):
            value = ''
            tmp = option.select('@value').extract()
            if tmp:
                value = tmp[0]
            txt = ''
            tmp = option.select('text()').extract()
            if tmp:
                txt = tmp[0].strip()
            if value != '' and value != '0':
                attr_values.append((attr_name, value, txt))
        attrs.append(attr_values)
    # print '### Selections:', attrs
    # One variant (and one stock probe) per cartesian combination of options.
    for option in itertools.product(*attrs):
        # print '### option:', o
        item = copy.deepcopy(product)
        item['name'] += ' - ' + '-'.join([attr[2] for attr in option])
        item['identifier'] += '-' + '-'.join([attr[1] for attr in option])
        # yield item
        formdata = {'addtocart_%s.EnteredQuantity' % product['identifier']: qty}
        for attr in option:
            formdata[attr[0]] = attr[1]
        # print 'formdata:', formdata
        self.cookie_num += 1
        yield FormRequest(url_post, formdata=formdata,
                          meta={'item': item, 'cookiejar': self.cookie_num},
                          dont_filter=True, callback=self.parse_stock)
def parse(self, response):
    """Crawl category links, product listings, and pagination on a listing page.

    Follows "Audio, vision & technology" nav links and several category
    containers, builds one Product per PSPProductList tile (following its
    URL to parse_product with the partially built item in meta), then
    follows pagination links.
    """
    hxs = HtmlXPathSelector(response)
    categories = response.xpath(
        '//li[div[contains(text(), "Audio, vision & technology")]]//a/@href'
    ).extract()
    for category in categories:
        yield Request(response.urljoin(category))
    categories = response.xpath(
        '//div[@id="subCategorycategories"]/ul/li/a/@href').extract()
    categories += response.xpath(
        '//li[@id="categories"]/ul/li/a/@href').extract()
    categories += response.xpath(
        '//div[@class="cat_detail"]/div/a/@href').extract()
    for category in categories:
        url = urljoin_rfc(get_base_url(response), category)
        yield Request(url)
    # products new parse method
    products = response.xpath('//div[contains(@id, "PSPProductList")]')
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        name = "".join(
            product.xpath(
                ".//div[contains(@class, 'product_name')]//text()").
            extract()).strip()
        brand = product.xpath(
            'div/a/div[@class="brand_name"]/text()').extract()[0].strip()
        url = product.xpath(".//a/@href").extract()
        url = urljoin_rfc(get_base_url(response), url[0])
        # Sku embedded in the tile's element id, e.g. "psp_<sku>".
        sku = product.xpath(".//div[contains(@id, 'psp')]/@id").re(
            "psp_(.+)")[0]
        # Prefer the "Now £..." promotional price; fall back to the
        # itemprop price.  No price at all is treated as out of stock.
        price = product.xpath(".//span[@class='price_now']/text()").re(
            u'Now\xa0\xa3(.*)')
        if not price:
            price = product.xpath(
                ".//span[@class='price-actual' and @itemprop='price']/text()"
            ).extract()
            if price:
                price = price[0]
            else:
                price = ''
                loader.add_value('stock', 0)
        category = response.xpath(
            '//div[@id="box_productSelectionPage"]/div/h1/text()').extract(
            )
        category = category[0].strip() if category else ''
        loader.add_value('name', name)
        loader.add_value('brand', brand)
        # loader.add_value('category', category)
        loader.add_value('url', url)
        loader.add_xpath('image_url', 'div//img[@class="proImg"]/@src')
        loader.add_value('sku', sku)
        loader.add_value('identifier', sku)
        loader.add_value('price', price)
        item = loader.load_item()
        metadata = DemoRMeta()
        metadata['reviews'] = []
        metadata['promotion'] = ''.join(
            product.xpath(
                './/span[@class="discount_savings"]/text()').extract())
        item = loader.load_item()
        item['metadata'] = metadata
        # Follow the product page to finish the item in parse_product.
        yield Request(item['url'], meta={'item': item},
                      callback=self.parse_product)
    for page in response.xpath(
            '//div[@id="pagination"]/a/@href').extract():
        url = urljoin_rfc(get_base_url(response), page)
        yield Request(url)
def parse_product(self, response):
    """Parse a bedworld.net product page, expanding size/option variants.

    Pages with a JS ``spConfig`` block yield one item per child product
    (following an AJAX options endpoint when the child has its own
    identifier); plain pages combine any custom-option dropdowns into
    variant items.  Net price (price / 1.2, i.e. ex-VAT at 20%) is
    attached via metadata.
    """
    hxs = HtmlXPathSelector(response)
    # Follow sibling size pages; each parses as its own product.
    for url in hxs.select(
            '//a[contains(@class,"size-boxes")]/@href').extract():
        yield Request(urljoin_rfc(get_base_url(response), url),
                      callback=self.parse_product)
    product_category = hxs.select(
        '//div[contains(@class,"breadcrumbs")]/ul/li/a/text()').extract(
        )[-1].strip()
    product_name = hxs.select('//h1/text()').extract()[0]
    product_image = hxs.select('//a[@id="zoom-btn"]/@href').extract()
    if product_image:
        product_image = urljoin_rfc(get_base_url(response),
                                    product_image[0])
    product_brand = hxs.select('//img[@class="man-logo"]/@alt').extract()
    product_brand = product_brand[0] if product_brand else ''
    product_sku = hxs.select('//tr[th/text()="SKU"]/td/text()').extract()
    product_sku = product_sku[0] if product_sku else ''
    # Magento-style variant config embedded in page JavaScript.
    product_config_reg = re.search(
        'var spConfig = new Product.Config\((\{.*\})\);', response.body)
    product_identifier = hxs.select(
        '//input[@name="product"]/@value').extract()[0]
    if product_config_reg:
        products = json.loads(product_config_reg.group(1))
        for identifier, product in products['childProducts'].items():
            product_loader = ProductLoader(item=Product(),
                                           response=response)
            if identifier:
                product_loader.add_value(
                    'identifier', product_identifier + '-' + identifier)
            else:
                product_loader.add_value('identifier', product_sku)
            product_loader.add_value('price', product[u'finalPrice'])
            # Append each attribute option label that applies to this child.
            option_name = product_name
            for attr_id, attribute in products[u'attributes'].items():
                for option in attribute['options']:
                    if identifier in option['products']:
                        option_name += ' ' + option['label']
            # Strip any parenthesised suffix from the display name.
            product_loader.add_value(
                'name', re.sub(r' \((.+?)\)', r'', option_name))
            product_loader.add_value('sku', product_sku)
            product_loader.add_value('url', response.url)
            product_loader.add_value('brand', product_brand)
            product_loader.add_value('category', product_category)
            product_loader.add_value('image_url', product_image)
            if identifier:
                # Child products need an extra AJAX call for their options.
                yield Request('http://www.bedworld.net/oi/ajax/co/?id=' +
                              identifier + '&pid=' + product_identifier,
                              meta={'item': product_loader.load_item()},
                              callback=self.parse_options)
            else:
                price = product_loader.get_output_value('price')
                # Ex-VAT price at the 20% UK rate.
                net_price = price / Decimal('1.2')
                p = product_loader.load_item()
                meta_ = Meta()
                meta_['net_price'] = str(net_price)
                p['metadata'] = meta_
                yield p
    else:
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('name', re.sub(r' \((.+?)\)', r'',
                                                product_name))
        product_loader.add_value('sku', product_sku)
        product_loader.add_value('brand', product_brand)
        product_loader.add_value('identifier', product_identifier)
        product_loader.add_value('url', response.url)
        product_loader.add_value('category', product_category)
        product_loader.add_value('image_url', product_image)
        price = hxs.select('//span[@id="product-price-' +
                           product_identifier + '"]//text()').re(r'([\d.,]+)')
        price = price[0] if price else 0
        product_loader.add_value('price', price)
        # Collect every custom-option dropdown's non-empty options.
        option_elements = []
        dropdown_elements = hxs.select(
            '//select[contains(@class, "product-custom-options")]')
        for dropdown_options in dropdown_elements:
            options = []
            for dropdown_option in dropdown_options.select(
                    'option[@value!=""]'):
                option = {}
                option['identifier'] = dropdown_option.select(
                    '@value').extract()[0]
                # Drop any "+£x" price suffix from the option label.
                option['desc'] = dropdown_option.select(
                    './/text()').extract()[0].split('+')[0]
                option['price'] = dropdown_option.select(
                    '@price').extract()[0]
                options.append(option)
            option_elements.append(options)
        # Combine one option from each dropdown into a single variant,
        # concatenating descriptions/identifiers and summing prices.
        final_options = []
        if option_elements:
            combined_options = list(itertools.product(*option_elements))
            for combined_option in combined_options:
                final_option = {}
                for option in combined_option:
                    final_option['desc'] = final_option.get(
                        'desc', '') + option['desc']
                    final_option['price'] = final_option.get(
                        'price', Decimal(0)) + extract_price(
                            option['price'])
                    final_option['identifier'] = final_option.get(
                        'identifier', '') + '-' + option['identifier']
                final_options.append(final_option)
        if final_options:
            for opt in final_options:
                opt_product = product_loader.load_item()
                opt_product['name'] += ' ' + normalize_space(opt['desc'])
                opt_product['price'] += opt['price']
                opt_product['identifier'] += opt['identifier']
                price = Decimal(opt_product['price'])
                # Ex-VAT price at the 20% UK rate.
                net_price = price / Decimal('1.2')
                meta_ = Meta()
                meta_['net_price'] = str(net_price)
                opt_product['metadata'] = meta_
                yield opt_product
        else:
            price = product_loader.get_output_value('price')
            # Ex-VAT price at the 20% UK rate.
            net_price = price / Decimal('1.2')
            p = product_loader.load_item()
            meta_ = Meta()
            meta_['net_price'] = str(net_price)
            p['metadata'] = meta_
            yield p
def parse_detail(self, response):
    """Parse a zhaopin.com job-detail page into an Item and follow the company page.

    Fields are extracted via self.detail_xpath_dict (missing nodes become
    ""), relative publication dates are normalised to YYYY-MM-DD, MD5
    digests are attached for both URLs, and a follow-up Request is issued
    for recognised company-page hosts with host-specific headers.

    Changes vs. previous version: removed leftover debug ``print``
    statements, evaluate each field XPath once instead of twice, and
    merged the duplicated company-Request branches via a host->headers map.
    """
    item = Item()
    item["url"] = response.url
    item["url_md5sum"] = hashlib.md5(item["url"]).hexdigest()
    item["source"] = "智联招聘"
    item["sub_url"] = response.meta["sub_url"]
    response_selector = HtmlXPathSelector(response)
    # Assemble the item: fill every configured field, defaulting to "".
    for field_name in self.detail_xpath_dict:
        values = response_selector.select(
            self.detail_xpath_dict[field_name]).extract()
        item[field_name] = values[0] if values else ""
    # MD5 of the company URL.
    item["company_url_md5sum"] = hashlib.md5(item["company_url"]).hexdigest()
    # Normalise relative publication dates to absolute YYYY-MM-DD.
    pub_at = item['time']
    today = datetime.date.today()
    # "15天前" = 15 days ago, "前天" = day before yesterday, "昨天" = yesterday.
    days_ago = {'15天前': 15, '前天': 2, '昨天': 1}
    if pub_at in days_ago:
        item['time'] = (today - datetime.timedelta(
            days=days_ago[pub_at])).strftime('%Y-%m-%d')
    elif pub_at.find('刚') != -1 or pub_at.find("小时") != -1:
        # "just now" / "N hours ago" -> today.
        item['time'] = today.strftime('%Y-%m-%d')
    # Item assembled; hand it to the pipeline.
    yield item
    company_url = item['company_url']
    # Nothing to follow when there is no company link.
    if len(company_url) == 0:
        return
    host = company_url.split("/")[2]
    headers_by_host = {
        "special.zhaopin.com": SPECIAL_REQUEST_HEADERS,
        "company.zhaopin.com": COMPANY_REQUEST_HEADERS,
    }
    if host in headers_by_host:
        yield Request(url=company_url,
                      callback=self.parse_company,
                      meta={
                          "sub_url": response.url,
                          "company_name": item["company_name"]
                      },
                      headers=headers_by_host[host],
                      dont_filter=False)
def parse_anntaylor(self, response):
    """Scrape an Ann Taylor product page into a ProductModel.

    Returns (matched, product): (False, None) for invalid/duplicate
    pages, (True, product) after creating the item — but see the review
    note on the unconditional return below.
    """
    self.check_shelfit_validity(response)
    # NOTE(review): this unconditional return makes everything below
    # unreachable — the method always returns (False, None) after the
    # validity check. Looks like a temporary disable; confirm whether it
    # should be removed or made conditional.
    return (False, None)
    hxs = HtmlXPathSelector(response)
    # find name of item
    item_name_path = hxs.select('//div[@class="hd-info"]//h1/text()')
    if len(item_name_path) == 0:
        self.invalid_links += 1
        print "Invalid link: " + str(response.url)
        return (False, None)
    item_name = item_name_path.extract()[0]
    logging.critical("Name: " + str(item_name))
    self.count_scraped += 1
    ''' PLAYING NICE: sleeping for 1min after crawling every 100 pages '''
    if self.count_scraped % 100 == 0:
        print "Sleeping for 60 secs..."
        sleep(60)  # sleep for 1 mins
    # canonical product URL from the og:url meta tag
    meta_tag_url = hxs.select('//meta[@property="og:url"]/@content')
    prod_url = meta_tag_url.extract()[0]
    logging.critical("PRODUCT URL:" + str(prod_url) + " ITEM_NAME " +
                     str(item_name) + " TOTAL SO FAR " +
                     str(self.count_scraped))
    # Ann Taylor is for women only
    gender = 'F'
    # find price and sale price
    item_id_, price_, sale_price_ = self._find_price(hxs, prod_url)
    if item_id_ in self.items_scraped:
        logging.critical("ITEM ALREADY SCRAPED " + str(item_id_))
        # store the category for this itemid
        print "Appending categories for product " + str(item_id_)
        categories_path = hxs.select(
            '//div[@id="cat-pro-pagnation"]//a/text()').extract()
        num_categories = len(categories_path)
        categories = []
        for i in range(0, num_categories):
            category = str(categories_path[i]).strip('\n').strip()
            categories.append(category)
            logging.critical("Categories: " + category)
        product = ProductModel.objects.filter(idx=item_id_).filter(
            insert_date=insert_date)
        self._create_category(product, categories)
        return (False, None)
    else:
        self.items_scraped.append(item_id_)
    logging.critical("ITEM_ID " + str(item_id_) + " PRICE " + str(price_) +
                     " SALE PRICE " + str(sale_price_))
    if price_ > sale_price_:
        logging.critical("SALE on ITEM_ID " + str(item_id_) + " PRICE " +
                         str(price_) + " SALE PRICE " + str(sale_price_))
    # extract image URL
    prod_img_path = hxs.select('//img[@id="productImage"]/@src')
    prod_img_url = str(prod_img_path.extract()[0])
    logging.critical("Image URL: " + str(prod_img_url))
    # find description and keywords: these will be useful in categorization
    desc = hxs.select(
        '//div[@class="gu gu-first description"]/p/text()').extract()
    prod_desc = ''.join(desc)
    logging.critical("Description: " + prod_desc)
    # promo text
    # DIDN'T FIND ANY
    #promo_path = hxs.select('//span[@class="cat-pro-promo-text"]//font/text()').extract()
    #promo_str = str(promo_path)
    #logging.critical("Promotion: ")
    #logging.critical(promo_str)
    promo_str = ""
    product, created_new = self._create_product_item(item_name, int(item_id_), str(prod_url), price_, \
        sale_price_, gender, str(prod_img_url), promo_str, prod_desc)
    product = None
    #self._store_in_file(response, item_id_)
    #raise CloseSpider('Blah')
    logging.critical("Total unique items: " + str(len(self.all_items_scraped)) + " we have scraped so far: " +\
        str(self.count_scraped) + " Unique URLs scraped: " + str(len(self.urls_scraped)))
    #raise SystemExit
    return (True, product)
def extract_all_page_data(self, response):
    """Delegate full-page extraction to the module-level process helper."""
    return process(HtmlXPathSelector(response), response)
def parseItem(self, spiderName=None, itemCollectionName=None,
              response=None, responseBody='', pageid=''):
    '''
    Parse the page and build the item dict handed to the pipeline
    (the pipeline configured in "settings" stores the data).

    Extraction strategies, tried in this order depending on config:
    main-text extraction (extractMainTxt), a BBS-specific regex path
    for Article pages, plain XPath fields, regex-over-XPath fields,
    and finally fields taken from the response itself.  Returns the
    item dict, or None when a required field is missing.
    '''
    # bbsSpider is handled separately (shared 'BBsSpider' config).
    isbbsSpider = False
    if spiderName in self.bbsSpiderName:
        config = extractorConfig['BBsSpider']
        isbbsSpider = True
    else:
        config = extractorConfig[spiderName]
    if not config:
        raise NotConfigured('解析配置信息没有找到,请检查extracotrConfig是否有爬虫%s的配置! ' % spiderName)
    hxs = HtmlXPathSelector(response)
    if not itemCollectionName or not itemCollectionName in config:
        raise NotConfigured('%s下载网页的类型%s没有找到,请检查解析配置文件' % (spiderName, itemCollectionName))
    item = {}
    item['collectionName'] = itemCollectionName
    if itemCollectionName in self.collectionNameMap:
        item['collectionName'] = self.collectionNameMap[itemCollectionName]
    item['url'] = response.url
    item['status'] = 100
    item['spiderName'] = spiderName
    item['optDateTime'] = datetime.datetime.now()
    xpathItem = config[itemCollectionName]
    # Main-text extraction mode: only title, publishdate, content, imgList.
    if 'mainext' in xpathItem and xpathItem['mainext']:
        return self.extractMainTxt(
            item, responseBody,
            config['threshold'] if 'threshold' in config else None,
            spiderName, pageid)
    # BBS spiders handle their Article pages specially.
    if isbbsSpider and re.match('.*(Article).*',
                                itemCollectionName) is not None:
        # Title, author, publish date etc. are captured via regexes.
        regexs = config['printpage']
        for k, v in regexs.items():
            if k.endswith('Regex'):
                continue
            regex = regexs[k + 'Regex']
            value = hxs.select(v).re(regex)
            if (value is None or len(value) < 1) and k in self.requiredField:
                self.parseLog(
                    'bbs解析发现item缺失属性:%s,类型: %s,spiderName:%s, pageid:%s'
                    % (k, itemCollectionName, spiderName, pageid),
                    level=LogLevel.INFO)
                return None
            if type(value) == list and len(value) > 0:
                item[k] = value[0]
            elif value is not None:
                item[k] = value
        # Content: extract text from the whole body.
        bodys = hxs.select('//body').extract()[0]
        content = self.ext.getText(bodys)
        # Filter out spam replies, titles and author lines.
        contents = content.splitlines()
        newcontent = ''
        block = ''
        isblock = 0
        for p in contents:
            p_strip = p.strip()
            # Detect post blocks: an "author ... time ..." header line opens
            # a block; long non-boilerplate lines accumulate into it.
            if re.match('.*作者.*时间.*\d+.*\d+.*', str(p_strip)) is not None:
                isblock += 1
            elif len(p_strip) >= 20 and re.match('.*\[打印本页\].*', str(p_strip)) is None:
                block += p_strip
                block += '\n'
            if isblock >= 1:
                # Flush a block once it is long enough or contains an image.
                if len(block) > 100 or re.match('.*img.*src.*=.*>.*', block) is not None:
                    newcontent += '---------------------------------------------------------------------------------\n'
                    newcontent += block
                    block = ''
                    isblock = 0
        if newcontent is None or len(newcontent.strip()) < 10:
            self.parseLog(
                'bbs解析发现item缺失属性:%s,类型: %s,spiderName:%s, pageid:%s'
                % ('content', itemCollectionName, spiderName, pageid),
                level=LogLevel.INFO)
            return None
        item['content'] = newcontent
        images = self.ext.getImg(bodys)
        if images is not None:
            item['images'] = images
        return item
    # Plain XPath extraction.
    for k, v in xpathItem.items():
        values = hxs.select(v).extract()
        if (not values or len(values) < 1 or
                (" ".join("%s" % p for p in values)).strip() == "") and k in self.requiredField:
            self.parseLog(
                'xpath解析发现item缺失属性:%s,类型: %s,spiderName:%s,xpath=%s, pageid:%s 。改用正文抽取尝试'
                % (k, itemCollectionName, spiderName, v, pageid),
                level=LogLevel.INFO)
            # For Article pages, fall back to main-text extraction when
            # XPath found nothing.
            if item['collectionName'] == 'Article':
                return self.extractMainTxt(
                    item, responseBody,
                    config['threshold'] if 'threshold' in config else None,
                    spiderName, pageid)
        if k in self.listFields:
            item[k] = values
        else:
            value = self.parseSpecialField(k, values)
            if value is not None:
                item[k] = value
            # Images embedded in the content field.
            if k == 'content':
                imgs = self.ext.getImg(value)
                if imgs is not None:
                    item['images'] = imgs
    # regex + XPath extraction.
    regexItem = {}
    regexName = itemCollectionName + 'Regex'
    if regexName in config:
        regexItem = config[regexName]
    for k, v in regexItem.items():
        if k.endswith('Regex'):
            continue
        regex = k + 'Regex'
        if not regex in regexItem:
            raise NotConfigured('找不到匹配的正则表达式,配置文件的%s配置缺少相应的%s' % (k, regex))
        else:
            regex = regexItem[regex]
        values = hxs.select(v).re(regex)
        if (not values or len(values) < 1 or
                (" ".join("%s" % p for p in values)).strip() == "") and k in self.requiredField:
            self.parseLog(
                'regex+xpath解析item缺失属性:%s,类型: %s,spiderName:%s, pageid:%s 。改用正文抽取尝试'
                % (k, itemCollectionName, spiderName, pageid),
                level=LogLevel.INFO)
            # For Article pages, fall back to main-text extraction when
            # the regex matched nothing.
            if item['collectionName'] == 'Article':
                return self.extractMainTxt(
                    item, responseBody,
                    config['threshold'] if 'threshold' in config else None,
                    spiderName, pageid)
        if k in self.listFields:
            item[k] = values
        else:
            value = self.parseSpecialField(k, values)
            if value is not None:
                item[k] = value
    # Fields taken directly from the response (url / headers / status).
    respItem = {}
    respName = itemCollectionName + 'Resp'
    if respName in config:
        respItem = config[respName]
    for k, v in respItem.items():
        value = None
        if v == 'url':
            value = response.url
        elif v == 'header':
            # NOTE(review): v is the string 'header' here, so v.items()
            # would raise AttributeError — this branch looks broken as
            # written (probably meant a dict mapping); confirm the config
            # shape before relying on it.
            if v.items():
                header = response.headers
                for hk, hv in v.items():
                    value = header[hv]
                    if not value:
                        self.parseLog('response.headers中没有该属性:%s,类型: %s' % (hk, itemCollectionName),
                                      level=LogLevel.WARNING)
                        continue
                    # NOTE(review): unreachable — the `continue` above fires
                    # whenever value is falsy, so this check never triggers.
                    if not value and hk in self.requiredField:
                        self.parseLog(
                            '非item页,因为缺失属性:%s,类型: %s, pageid:%s'
                            % (hk, itemCollectionName, pageid),
                            level=LogLevel.WARNING)
                        # For Article pages, fall back to main-text extraction.
                        if item['collectionName'] == 'Article':
                            return self.extractMainTxt(
                                item, responseBody,
                                config['threshold'] if 'threshold' in config else None,
                                spiderName, pageid)
                    if hk in self.specialField:
                        value = self.parseSpecialField(hk, value)
                    item[hk] = value.strip()
                continue
            value = response.headers
        elif k == 'status':
            value = response.status
        if not value and k in self.requiredField:
            self.parseLog('item缺失属性:%s,类型: %s,spiderName:%s, pageid:%s'
                          % (k, itemCollectionName, spiderName, pageid),
                          level=LogLevel.INFO)
            return None
        elif not value:
            continue
        value = self.parseSpecialField(k, value)
        item[k] = value.strip()
    return item
def parse_product(self, response):
    """Parse an eglobaldigitalstore.co.uk product page.

    Loads the base product fields, then branches on the option widgets:
    * exactly one option  -> POST the options form once so the price can be
      reloaded (callback: reload_price) and stop;
    * otherwise           -> yield the base item (marking stock 0 when an
      out-of-stock badge is present), then yield one FormRequest per option
      value so parse_identifier can finish each variant item.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    name = hxs.select(u'//h1[@class="mainbox-title"]/text()')[0].extract()
    loader.add_value('name', name)
    loader.add_value('url', response.url)
    # Primary price layout; fall back to itemprop="price" microdata markup.
    price = hxs.select(
        u'//div[@id="product_info"]//span[@class="price"]/span[@class="price" and @id]/text()'
    )
    if not price:
        price = hxs.select(
            u'//*[@itemprop="price"]/span[@class="price" and @id]/text()')
    # NOTE(review): raises IndexError when neither xpath matches -- assumes
    # every product page exposes a price; confirm against live pages.
    price = price[0].extract().replace(',', '')
    loader.add_value('price', price)
    image_url = hxs.select(
        u'//a[contains(text(),"View larger image")]/@href')
    if image_url:
        image_url = urljoin_rfc(get_base_url(response),
                                image_url[0].extract())
        loader.add_value('image_url', image_url)
    # Second breadcrumb anchor (first is presumably the site root).
    category = hxs.select(
        u'//div[@class="breadcrumbs"]/a[1]/following-sibling::a[1]/text()'
    ).extract()
    if category:
        loader.add_value('category', category[0])
    sku = hxs.select(
        u'//div[@class="product-main-info" or @id="product_info"]//p[@class="sku"]//span[starts-with(@id,"product_code")]/text()'
    )
    # Skip the placeholder 'n/a' product code.
    if sku and sku[0].extract().lower() != 'n/a':
        sku = sku[0].extract().lower()
        loader.add_value('sku', sku)
    loader.add_xpath('identifier',
                     '//input[contains(@name, "product_id")]/@value')
    options = hxs.select(u'//div[starts-with(@id,"opt_")]//select/option')
    select_name = hxs.select(
        u'//div[starts-with(@id,"opt_")]//select/@name').extract()
    if len(options) == 1:
        # Single option: the displayed price may change once the option is
        # applied, so post the store's AJAX options form and re-read it.
        formdata = {
            'additional_info[get_detailed]': '1',
            'additional_info[get_discounts]': '1',
            'additional_info[get_features]': '',
            'additional_info[get_icon]': '1',
            'additional_info[get_options]': '1',
            'additional_info[info_type]': 'D',
            'appearance[but_role]': 'action',
            'appearance[capture_options_vs_qty]': '',
            'appearance[details_page]': '1',
            'appearance[separate_buttons]': '',
            'appearance[show_add_to_cart]': '1',
            'appearance[show_list_buttons]': '1',
            'appearance[show_price]': '1',
            'appearance[show_price_values]': '1',
            'appearance[show_product_amount]': '1',
            'appearance[show_product_options]': '1',
            'appearance[show_qty]': '1',
            'appearance[show_sku]': '1',
            'dispatch': 'products.options',
            select_name[0]: options[0].select(u'./@value').extract()[0]
        }
        yield FormRequest('http://www.eglobaldigitalstore.co.uk/index.php',
                          formdata=formdata,
                          meta={'loader': loader},
                          callback=self.reload_price,
                          dont_filter=True)
        return
    else:
        # Base (option-less) item is emitted here; per-option variants
        # follow in the loop below.
        out_stock = hxs.select('//span[contains(@class, "out-of-stock")]')
        if out_stock:
            loader.add_value('stock', 0)
        yield loader.load_item()
    for option in options:
        option_text = option.select(u'./text()')[0].extract()
        opt_value = option.select(u'./@value').extract()[0]
        # Empty value is the "please select" placeholder entry.
        if not opt_value:
            continue
        loader = ProductLoader(item=Product(), selector=hxs)
        # Option text shaped like "Name (+<pound>12.34)"; \xa3 is the pound sign.
        res = re.search('(.*?) \(\+\xa3([\d\.,]+)\)', option_text)
        if res:
            option_name, option_price = res.groups()
        else:
            option_name = re.search('(.*)', option_text).groups()[0]
            option_price = u'0.00'
        loader.add_value('name', u'%s %s' % (name, option_name))
        loader.add_value('url', response.url)
        if category:
            loader.add_value('category', category[0])
        # Variant price = base price + option surcharge.
        loader.add_value('price', str(Decimal(price) + Decimal(option_price)))
        if image_url:
            loader.add_value('image_url', image_url)
        # Same AJAX form as above, once per option value; parse_identifier
        # completes the item from the response.
        formdata = {
            'additional_info[get_detailed]': '1',
            'additional_info[get_discounts]': '1',
            'additional_info[get_features]': '',
            'additional_info[get_icon]': '1',
            'additional_info[get_options]': '1',
            'additional_info[info_type]': 'D',
            'appearance[but_role]': 'action',
            'appearance[capture_options_vs_qty]': '',
            'appearance[details_page]': '1',
            'appearance[separate_buttons]': '',
            'appearance[show_add_to_cart]': '1',
            'appearance[show_list_buttons]': '1',
            'appearance[show_price]': '1',
            'appearance[show_price_values]': '1',
            'appearance[show_product_amount]': '1',
            'appearance[show_product_options]': '1',
            'appearance[show_qty]': '1',
            'appearance[show_sku]': '1',
            'dispatch': 'products.options',
            select_name[0]: opt_value
        }
        yield FormRequest('http://www.eglobaldigitalstore.co.uk/index.php',
                          formdata=formdata,
                          meta={
                              'loader': loader,
                              'opt_value': opt_value
                          },
                          callback=self.parse_identifier,
                          dont_filter=True)
def extract_product_data(self, response, item):
    """Populate *item* with product data scraped from a Walmart product page.

    Supports both page designs: the new one ('product-name' heading,
    'specs-table' features table) and the old one ('productTitle',
    'SpecTable'). Fills product_name, product_model, product_upc,
    product_brand, product_target_price, product_category_tree and
    product_keywords where available, and returns the item.
    """
    hxs = HtmlXPathSelector(response)
    # assume new design of walmart product page
    product_name_node = hxs.select(
        "//h1[contains(@class, 'product-name')]//text()").extract()
    if not product_name_node:
        # assume old design
        product_name_node = hxs.select(
            "//h1[contains(@class, 'productTitle')]//text()").extract()
    if product_name_node:
        item['product_name'] = "".join(product_name_node).strip()
    else:
        # BUGFIX: the original referenced an undefined name `origin_url`
        # here, so this branch raised NameError instead of logging.
        self.log("Error: No product name: " + str(response.url) +
                 " for source product " + str(item.get('origin_url', '')),
                 level=log.ERROR)

    # extract product model number from the features/spec table
    # new page version:
    table_node = hxs.select("//div[@class='specs-table']/table")
    if not table_node:
        # old page version:
        table_node = hxs.select("//table[@class='SpecTable']")
    if table_node:
        # BUGFIX: the original called .extract() on the table selector first,
        # leaving a list of strings, so table_node.select(...) always raised
        # AttributeError (silenced by a bare except) and product_model was
        # never set. Keep it a selector and only guard the missing-row case.
        try:
            item['product_model'] = table_node.select(
                ".//td[contains(text(),'Model')]/following-sibling::*/text()"
            ).extract()[0]
        except IndexError:
            # no "Model" row in the spec table -- leave the field unset
            pass

    upc_node = hxs.select("//meta[@itemprop='productID']/@content")
    if upc_node:
        item['product_upc'] = [upc_node.extract()[0]]

    brand_holder = hxs.select("//meta[@itemprop='brand']/@content").extract()
    if brand_holder:
        item['product_brand'] = brand_holder[0]

    # extract price: prefer the meta itemprop, fall back to the visible
    # price div when the meta tag is absent.
    price_holder = hxs.select("//meta[@itemprop='price']/@content").extract()
    if price_holder:
        product_target_price = price_holder[0].strip()
    else:
        product_target_price = "".join(
            hxs.select("//div[@itemprop='price']//text()").extract()).strip()
    if product_target_price:
        # remove commas separating orders of magnitude (ex 2,000)
        product_target_price = re.sub(",", "", product_target_price)
        m = re.match("\$([0-9]+\.?[0-9]*)", product_target_price)
        if m:
            item['product_target_price'] = float(m.group(1))
        else:
            self.log("Didn't match product price: " + product_target_price +
                     " " + response.url + "\n",
                     level=log.WARNING)
    else:
        self.log("Didn't find product price: " + response.url + "\n",
                 level=log.INFO)

    # breadcrumb categories, dropping the leading root entry
    try:
        item['product_category_tree'] = hxs.select(
            "//li[@class='breadcrumb']/a/span[@itemprop='name']/text()"
        ).extract()[1:]
    except Exception:
        pass
    try:
        item['product_keywords'] = hxs.select(
            "//meta[@name='keywords']/@content").extract()[0]
    except IndexError:
        pass
    return item
def parse(self, response):
    """Crawl a category/listing page: discover subcategories, paginate,
    split oversized result sets by product-type filter, and extract the
    products listed on the current page.

    Meta flags used:
    * 'filter_active'   -- set once a product-type filter has been applied;
      pagination then goes through the PageForm POST instead of plain GETs.
    * 'next_page_retry' -- retry counter for the empty-listing workaround.
    """
    if not isinstance(response, HtmlResponse):
        self.log('ERROR: BAD HtmlResponse!!! URL:{}'.format(response.url))
        return
    hxs = HtmlXPathSelector(response)
    # logic to find categories
    # find subcats for Outilage Jardin (garden tools)
    categories = hxs.select(
        '//div[contains(@class,"bg_U15 menugroup") and contains(@alt,"Jardin") and contains(@alt,"Outillage")]//div[@class="jsGroup"]//ul[@class="tree"]//a/@href'
    ).extract()
    # find subcats for Aspirateurs (vacuum cleaners / home care)
    categories += hxs.select(
        '//div[contains(@class,"bg_U4 menugroup") and contains(@alt,"Entretien") and contains(@alt,"maison")]//div[@class="jsGroup"]//ul[@class="tree"]//a/@href'
    ).extract()
    for url in categories:
        url = urljoin_rfc(get_base_url(response), url)
        yield Request(url)
    # Total result count shown on the page; used to decide between plain
    # pagination and filter-based splitting below.
    totalproducts = hxs.select('//span[@class="SearchBig"]/text()').re(
        r'(\d+)')
    # pagination ("Suivant" = next)
    next_page = hxs.select(
        u'//ul[@class="PaginationButtons"]//a[contains(text(),"Suivant")]/@href'
    ).extract()
    if next_page and int(totalproducts[0]) <= 100000:
        if not 'filter_active' in response.meta:
            # Plain GET pagination; dont_redirect guards against the site
            # bouncing us back to page 1.
            next_page = urljoin_rfc(get_base_url(response), next_page[0])
            yield Request(next_page, meta={
                'next_page_retry': 1,
                'dont_redirect': True
            })
        else:
            # Filtered listing: the next-page link is a JS postback, so
            # submit the PageForm with the button's "<id>.OnClick" field set.
            next_page = hxs.select(
                u'//ul[@class="PaginationButtons"]//a[contains(text(),"Suivant")]'
            )
            next_page_onclick_id = next_page.select(
                '@id').extract()[-1] + '.OnClick'
            req = FormRequest.from_response(
                response,
                formname='PageForm',
                formdata={next_page_onclick_id: u'1'},
                meta={'filter_active': True})
            req.dont_filter = True
            yield req
    # Listing too large to paginate through directly: re-request it once
    # per "Type de produit" filter so each slice is manageable.
    if totalproducts and int(
            totalproducts[0]) > 100000 and not response.meta.get(
                'filter_active'):
        filters = hxs.select(
            '//div[@class="blocFilter" and contains(strong/text(), "Type de produit")]//input/@name'
        ).extract()
        req_base = FormRequest.from_response(response,
                                             formname='PageForm',
                                             meta={'filter_active': True},
                                             dont_click=True)
        for filter in filters:
            req = replace_formdata(req_base, formdata={filter: u'1'})
            req.dont_filter = True
            yield req
    products = hxs.select(
        u'//div[@id="productList"]//div[contains(@class,"plProductView")]')
    if products:
        for product in products:
            product_loader = ProductLoader(item=Product(), selector=product)
            product_loader.add_xpath(
                'url', './/a[contains(@class,"plPrName")]/@href')
            product_loader.add_xpath(
                'name', './/a[contains(@class,"plPrName")]/text()')
            product_loader.add_xpath(
                'category', '//div[@class="productListTitle"]/h1/text()')
            product_loader.add_xpath(
                'image_url',
                './/div[contains(@class, "plProductImg")]//img/@data-src')
            product_loader.add_xpath('sku', './@data-sku')
            product_loader.add_xpath(
                'identifier',
                './/input[contains(@name, "ProductPostedForm.ProductId")]/@value'
            )
            # Whole pounds/euros and decimals are rendered in separate
            # nodes; stitch them into "units.decimals".
            price = product.select(
                u'.//div[contains(@class,"priceContainer")]/div[contains(@class,"priceM")]/text()'
            ).extract()
            if price:
                decimals = product.select(
                    u'//div[contains(@class,"priceContainer")]/div[contains(@class,"priceM")]/sup/text()'
                ).re(u'(\d+)')
                if decimals:
                    price = price[0] + '.' + decimals[0]
                product_loader.add_value('price', price)
                product_loader.add_value('stock', 1)
            # Only emit items that have both a name and a price, and a
            # non-blank identifier.
            if product_loader.get_output_value(
                    'name') and product_loader.get_output_value('price'):
                identifier = product_loader.get_output_value('identifier')
                if identifier and identifier.strip():
                    yield product_loader.load_item()
                else:
                    self.log('PRODUCT WITH NO IDENTIFIER => %s' %
                             response.url)
    else:
        # this site is buggy (it returns no products when we traverse thru the pages at random rate)
        # so this is a kind of retry code
        if 'next_page_retry' in response.meta:
            self.log('ERROR - NO PRODUCTS FOUND, retrying...')
            count = response.meta['next_page_retry']
            if count < self.RETRY_TIMES:
                self.log(
                    'ERROR - NO PRODUCTS FOUND, retry #{} url: {}'.format(
                        count, response.url))
                if not 'filter_active' in response.meta:
                    yield Request(response.url,
                                  meta={
                                      'next_page_retry': count + 1,
                                      'dont_redirect': True
                                  },
                                  dont_filter=True)
                else:
                    # TODO: FormRequest? (no retry path exists for the
                    # filtered/POST pagination case yet)
                    pass
            else:
                self.log(
                    'ERROR - NO PRODUCTS FOUND, retry limit reached, giving up, url: {}'
                    .format(response.url))
def parse(self, response):
    """Scrape a USC residential-dining menu page and insert one document
    per meal section into the MongoDB 'dininghalls' collection.

    Each document: {'name': hall, 'date': ..., 'mealtype': ...,
    'stations': [{'name': ..., 'options': [{'name': ..., 'tags': [...]}]}]}.
    Python 2 code (print statements, str.encode on unicode).
    """
    # storing in the mongo database
    client = MongoClient(os.environ['MONGODB_URI'])
    db = client.heroku_5s156rtt
    dininghalls = db.dininghalls
    # The date is whatever remains of the URL after stripping the known
    # venue prefixes (507/514/518) and decoding %2F back to '/'.
    currUrl = response.request.url
    currDate = currUrl.replace(
        "http://hospitality.usc.edu/residential-dining-menus/?menu_venue=venue-507&menu_date=",
        "").replace("%2F", "/")
    currDate = currDate.replace(
        "http://hospitality.usc.edu/residential-dining-menus/?menu_venue=venue-514&menu_date=",
        "").replace("%2F", "/")
    currDate = currDate.replace(
        "http://hospitality.usc.edu/residential-dining-menus/?menu_venue=venue-518&menu_date=",
        "").replace("%2F", "/")
    # Both of these work
    hxs = HtmlXPathSelector(response)
    # Accumulator document for the current meal section.
    dininghall = {'stations': []}
    cafeTitle = hxs.xpath(
        "//h2[contains(@class, 'fw-accordion-title ui-state-active')]/text()"
    ).extract()
    print cafeTitle[0].encode('utf-8')
    differentSections = hxs.xpath(
        "//div[contains(@class, 'col-sm-6 col-md-4')]")
    # NOTE: the loop variable deliberately (if confusingly) shadows the
    # selector list name; each iteration rebinds it to one section node.
    for differentSections in differentSections:
        mealTimes = differentSections.xpath("h3/text()").extract()
        stations = differentSections.xpath("h4/text()").extract()
        print(mealTimes[0].encode('utf-8')
              ).strip("[]").strip('u\'').strip('\'')
        dininghall.update({
            'mealtype': (mealTimes[0].encode('utf-8')
                         ).strip("[]").strip('u\'').strip('\'')
        })
        dininghall.update({'date': currDate})
        # if datetime.datetime.strftime(datetime.date.today(), '%d') in cafeTitle[0]:
        #     dininghall.update({'date': datetime.datetime.strftime(datetime.date.today(), '%x')})
        # if datetime.datetime.strftime(datetime.date.today() + datetime.timedelta(days=1), '%d') in cafeTitle[0]:
        #     dininghall.update({'date': datetime.datetime.strftime(datetime.date.today() + datetime.timedelta(days=1), '%x')})
        # Map the page heading to a short hall name.
        if "Kitchen" in cafeTitle[0]:
            dininghall.update({'name': 'EVK'})
        if "Parkside" in cafeTitle[0]:
            dininghall.update({'name': 'Parkside'})
        if "84" in cafeTitle[0]:
            dininghall.update({'name': 'Cafe 84'})
        foodItemSections = differentSections.xpath(
            "ul[contains(@class, 'menu-item-list')]")
        # i indexes the parallel stations[] headings list.
        i = 0
        # Same shadowing idiom as above: rebinds to one <ul> per iteration.
        for foodItemSections in foodItemSections:
            foodItems = foodItemSections.xpath("li/text()").extract()
            print stations[i].encode('utf-8')
            stationMiniJSON = {
                'name': (stations[i].encode('utf-8')
                         ).strip("[]").strip('u\'').strip('\''),
                'options': []
            }
            for foodItems in foodItems:
                print foodItems.encode('utf-8')
                # Re-locate the <li> for this item so its tag spans can be
                # read; pick the XPath string delimiter that cannot clash
                # with a quote character inside the item text.
                if "\"" in foodItems:
                    individualFoodItemsWrapper = foodItemSections.xpath(
                        "li[contains(., '" + foodItems + "')]")
                else:
                    individualFoodItemsWrapper = foodItemSections.xpath(
                        "li[contains(., \"" + foodItems + "\")]")
                foodItemsTags = individualFoodItemsWrapper.xpath(
                    "span/i/span/text()").extract()
                foodMiniJSON = {
                    'name': foodItems.encode('utf-8'),
                    'tags': []
                }
                for foodItemsTags in foodItemsTags:
                    foodMiniJSON['tags'].append(
                        (foodItemsTags.encode('utf-8')
                         ).strip("[]").strip('u\'').strip('\''))
                stationMiniJSON['options'].append(foodMiniJSON)
            dininghall['stations'].append(stationMiniJSON)
            i += 1
        print dininghall
        # One document per meal section; reset the accumulator after insert.
        dininghalls.insert(dininghall)
        dininghall = {'stations': []}
def load_html(url):
    """Fetch *url*, decode the body as GBK, unescape HTML entities and
    return the page wrapped in an HtmlXPathSelector."""
    response = requests.get(url)
    response.raise_for_status()
    # The target site serves GBK-encoded pages; force that decoding
    # before reading .text.
    response.encoding = "gbk"
    unescaped = HTMLParser.HTMLParser().unescape(response.text)
    return HtmlXPathSelector(text=unescaped)
def parse_product(self, response):
    """Parse a product page whose data lives in inline JS blobs.

    Pulls the base product record out of the dataLayer.push(...) call,
    follows any alternate-colour links once (parse_options guard in meta
    prevents recursion), then yields either one item per colour-swatch
    variant (ColorswatchConfig JSON) or a single base item.
    """
    hxs = HtmlXPathSelector(response)
    if response.meta['parse_options']:
        # Queue each alternate-colour product page; those requests carry
        # parse_options=False so they don't re-queue their own colours.
        color_options = hxs.select(
            "//div[@class='more-colors']//a/@href").extract()
        for color in color_options:
            url = self.base_url + color
            yield Request(url,
                          meta={'parse_options': False},
                          callback=self.parse_product)
    # Base product record from the Google Tag Manager dataLayer blob.
    item_data = re.search('dataLayer.push\((.*)\);', response.body)
    item_data = json.loads(item_data.group(
        1))['ecommerce']['detail']['products'][0] if item_data else None
    # NOTE(review): raises AttributeError if the blob is missing
    # (item_data is None) -- assumes every product page embeds it.
    name = item_data.get('name')
    sku = item_data.get('id')
    price = str(item_data.get('price'))
    price = float(extract_price(price))
    brand = item_data.get('brand')
    stock = 1 if price else 0  # NOTE(review): computed but never used below
    brand = '' if brand == False else brand
    image_url = ''.join(
        hxs.select("//img[@itemprop='image']/@src").extract())
    # Breadcrumbs minus the root entry and German navigation labels.
    categories = [
        category.strip() for category in hxs.select(
            "//div[@class='breadcrumbs']//a/text()").extract()[1:]
    ]
    categories = [
        category for category in categories
        if not category.lower() in ['mehr', 'designer']
    ]
    shipping = 0
    # Colour-swatch variants from the ColorswatchConfig JS blob.
    color_swatches = re.search('ColorswatchConfig\((.*)\) ,', response.body)
    color_swatches = json.loads(
        color_swatches.group(1))['swatch'] if color_swatches else None
    if color_swatches:
        for color_swatch, colors in color_swatches.iteritems():
            for color_id, values in colors.iteritems():
                #== If the next part throws an error, swatch is not available and we should skip it ==#
                try:
                    option_name = name + ' ' + values['option_values'][
                        'store_label']
                except:
                    continue
                option_id = sku + values['option_values']['value_index']
                option_price = values['option_values']['pricing_value']
                # pricing_value is a surcharge on top of the base price;
                # treat null-ish markers as zero.
                if option_price:
                    option_price = 0 if option_price in [
                        'null', 'None', None
                    ] else option_price
                    option_price = price + float(option_price)
                else:
                    option_price = price
                product_loader = ProductLoader(item=Product(), selector=hxs)
                product_loader.add_value('image_url', image_url)
                product_loader.add_value('shipping_cost', shipping)
                product_loader.add_value('sku', option_id)
                product_loader.add_value('url', response.url)
                product_loader.add_value('name', option_name)
                product_loader.add_value('brand', brand)
                product_loader.add_value('identifier', option_id)
                product_loader.add_value('price', option_price)
                for category in categories:
                    if not category.lower() == 'more':
                        product_loader.add_value('category', category.strip())
                yield product_loader.load_item()
    else:
        # No swatches: emit the single base product.
        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('image_url', image_url)
        product_loader.add_value('shipping_cost', shipping)
        product_loader.add_value('sku', sku)
        product_loader.add_value('url', response.url)
        product_loader.add_value('name', name)
        product_loader.add_value('brand', brand)
        product_loader.add_value('identifier', sku)
        product_loader.add_value('price', price)
        for category in categories:
            if not category.lower() == 'more':
                product_loader.add_value('category', category.strip())
        yield product_loader.load_item()
def extract_product_data(self, response, item):
    """Best-effort extraction of product fields into *item*.

    Tries three name selectors in order (new title, pdp_title, any h1);
    if none match, returns None (unless the page is just an out-of-stock
    404, which is silently ignored). All remaining fields -- price, model
    number, brand, UPC -- are optional and failures are swallowed.
    """
    hxs = HtmlXPathSelector(response)
    try:
        item['product_name'] = hxs.xpath(
            "//h1[starts-with(@class,'title')]//text()").extract(
            )[0].strip()
    except:
        try:
            item['product_name'] = hxs.xpath(
                "//div[@class='pdp_title']//text()[normalize-space()!='']"
            ).extract()[0].strip()
        except:
            try:
                item['product_name'] = hxs.xpath(
                    "//h1//text()").extract()[0].strip()
            except:
                # out of stock products return 404s with this text, not the actual product page
                out_of_stock = hxs.xpath(
                    "//strong[contains(text(),'out of stock')]").extract()
                if not out_of_stock:
                    self.log("Error: No product name: " + str(response.url) +
                             " from product: " + item['origin_url'],
                             level=log.ERROR)
                # ignore products with no name
                return None
    # Price meta content is "<currency-symbol><amount>", e.g. "$12.34"
    # or "\xa312.34" (pound sign); split symbol from digits by position.
    price_node = hxs.select("//meta[@itemprop='price']/@content").extract()
    if price_node:
        try:
            price_currency = price_node[0][0]
            price_amount = "".join(price_node[0][1:])
            price_amount = re.sub(",", "", price_amount)
            m1 = re.match("[0-9]+\.?[0-9]*", price_amount)
            m2 = re.match("(\xa3)|(\$)", price_currency)
            if not m1 or not m2:
                self.log("Didn't match product price: " + price_amount +
                         price_currency + " " + response.url + "\n",
                         level=log.WARNING)
            else:
                # Normalise to dollars so prices are comparable.
                price = Utils.convert_to_dollars(float(price_amount),
                                                 price_currency)
                item['product_target_price'] = price
        except Exception:
            self.log("Didn't find product price: " + response.url + "\n",
                     level=log.INFO)
    # Model number from a "Style No. XXXX" line in the description list.
    try:
        product_model_node = hxs.select(
            "//div[@class='prod_description1']//li[contains(text(), 'Style')]/text()"
        ).re("[sS]tyle +[nN]o\.? +[a-zA-Z0-9]+")
        item['product_model'] = re.match(
            "[sS]tyle +[nN]o\.? +([a-zA-Z0-9]+)",
            product_model_node[0]).group(1)
    except Exception:
        pass
    try:
        item['product_brand'] = hxs.select(
            "//meta[@itemprop='brand']/@content").extract()[0]
    except Exception:
        pass
    # UPC is buried in an inline script as "skuUpcCode":"...".
    try:
        js_body = hxs.select(
            "//script[contains(text(),'Upc')]/text()").extract()[0]
        item['product_upc'] = re.match('.*"skuUpcCode":"([0-9a-zA-Z]+)".*',
                                       js_body,
                                       re.DOTALL | re.MULTILINE).group(1)
    except Exception:
        pass
    return item
def parse_product(self, response):
    """Parse a product page (Magento-style), emitting one item per option.

    Three cases:
    * <select> options in the HTML      -> one item per option, surcharge
      taken from the option's @price attribute;
    * a Product.Config(...) JS blob     -> one item per configurable
      option, surcharge from the JSON;
    * neither                            -> a single base item.
    Variant identifiers are '<base-id>.<option-id>'.
    """
    hxs = HtmlXPathSelector(response)
    image_url = hxs.select(
        '//p[contains(@class, "product-image")]/a/@href').extract()
    # Regular price, falling back to the special (sale) price block.
    price = extract_price("".join(
        hxs.select(
            '//div/span/span[@class="price"]/text()').extract()).strip())
    if not price:
        price = extract_price("".join(
            hxs.select(
                '//p[@class="special-price"]//span[@class="price"]/text()'
            ).extract()).strip())
    if image_url:
        image_url = image_url[
            0]  # urljoin_rfc(get_base_url(response), image_url[0])
    category = hxs.select(
        '//div[contains(@class, "breadcrumbs")]/ul/li/a/text()').extract()
    # hxs.select(u'//div[@id="Breadcrumb"]//a/text()').extract()
    # Deepest breadcrumb entry only.
    category = category[-1] if category else ''
    options = hxs.select('//select/option[@value!=""]')
    # NOTE(review): raises IndexError when the hidden "product" input is
    # missing -- assumes it exists on every product page.
    identifier = hxs.select(
        '//input[@name="product" and @value!=""]/@value'
    ).extract(
    )[0]  # re.search(u'poingdestres\.co\.uk/(.*)/', response.url).group(1)
    name = hxs.select(
        '//div[@class="product-name"]/h1/text()').extract()[0]
    brand = ''.join(
        hxs.select('//div[contains(@class, "brand-name")]/text()').extract(
        )).strip()
    if options:
        # options rendered as plain <select> elements
        url = response.url
        for option in options:
            # Option text looks like "Name +<pound>x.xx"; keep the name part.
            try:
                name2 = option.select('text()').extract()[0].split(
                    u' +£')[0]
            except:
                name2 = ''
            option_price = extract_price(
                option.select('@price').extract()[0])
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', url)
            loader.add_value('name', brand + u' ' + name + u' ' + name2)
            loader.add_value('price', price + option_price)
            loader.add_value(
                'identifier',
                identifier + '.%s' % option.select('@value').extract()[0])
            loader.add_value('category', category)
            loader.add_value('brand', brand)
            if image_url:
                loader.add_value('image_url', image_url)
            yield loader.load_item()
    elif re.search('Product.Config\((.*)\);', response.body):
        # Options defined in the Magento Product.Config JS blob instead.
        options = re.search('Product.Config\((.*)\);', response.body)
        options = json.loads(options.group(1))
        url = response.url
        for attribute in options['attributes'].values():
            for i, option in enumerate(attribute['options'], 1):
                name2 = option['label']
                option_price = Decimal(option['price'])
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value('url', url)
                loader.add_value('name',
                                 brand + u' ' + name + u' ' + name2)
                loader.add_value('price', price + option_price)
                loader.add_value(
                    'identifier',
                    identifier + '.%s' % option['products'][0])
                loader.add_value('category', category)
                loader.add_value('brand', brand)
                if image_url:
                    loader.add_value('image_url', image_url)
                yield loader.load_item()
    else:
        # hxs.select("//div[@class='ProductDetails']/h1/text()")[0].extract().strip()
        # No options: single base product.
        url = response.url
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', url)
        loader.add_value('name', brand + ' ' + name)
        loader.add_value('price', price)
        loader.add_value('identifier', identifier)
        loader.add_value('category', category)
        loader.add_value('brand', brand)
        if image_url:
            loader.add_value('image_url', image_url)
        yield loader.load_item()
def parse(self, response):
    """Follow every link in the site's main menu, handing each target
    page to parse_product_list."""
    selector = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    menu_links = selector.select('//ul[@id="MainMenu"]//a/@href').extract()
    for href in menu_links:
        absolute_url = urljoin_rfc(base_url, href)
        yield Request(absolute_url, callback=self.parse_product_list)
def parse(self, response):
    """Match one product option on a multi-option page.

    The request meta carries the target product's sku, name and section
    number ('notes'); each radio-button option row is parsed as
    "name, number[, extra]" and the row whose number equals the target
    is emitted as an item. If no row matches, all candidates are appended
    to a hard-coded debug log file instead.
    """
    hxs = HtmlXPathSelector(response)
    url = response.url
    sku = response.meta['sku']
    name = response.meta['name'].encode('ascii', 'ignore')
    sec_number = response.meta['notes']
    prod_name = hxs.select(
        "//h1[contains(@class, 'categoryname')]/text()").extract()
    if not prod_name:
        logging.error('ERROR!! NO NAME!! %s "%s"' % (sku, url))
        return
    prod_name = prod_name[0].strip()
    # One table row per selectable option (radio button).
    options = hxs.select("//tr[td/input[@type='radio']]")
    found_products = []
    for option in options:
        text = option.select("td[2]/div[1]/span/text()").extract()
        if not text:
            logging.error("OPTIONS TEXT NOT FOUND! '%s'" % url)
            continue
        text = "".join(text)
        # "name, number" with an optional third comma-separated part.
        m = re.search("(.*),([^,]*)(,([^,]*))?", text)
        if not m:
            logging.error("CAN'T PARSE OPTIONS TEXT! '%s', '%s'" %
                          (text, url))
            continue
        add_name = m.group(1).strip()
        add_number = m.group(2).strip()
        # Price lives in one of three places depending on page layout;
        # try them in order.
        price = option.select(
            './/span[@class="productSave"]/text()').extract()
        if not price:
            price = option.select("td[2]/div[2]/span/text()").extract()
        if not price:
            price = option.select("td[2]/div[1]/span[2]/text()").extract()
        if not price:
            logging.error('ERROR!! NO PRICE!! %s "%s" "%s"' %
                          (sku, prod_name, url))
            return
        price = price[0].strip()
        found_products.append(
            ("%s %s" % (prod_name.encode('ascii', 'ignore'), add_name),
             add_number, price))
        if add_number == sec_number:
            # Found the requested option: emit it and stop.
            product = Product()
            loader = ProductLoader(item=product,
                                   response=response,
                                   selector=hxs)
            loader.add_value('url', url)
            loader.add_value('name', name)
            loader.add_value('price', price)
            loader.add_value('sku', sku)
            yield loader.load_item()
            return
    # No option matched: append the candidates to a debug file.
    # NOTE(review): hard-coded developer-machine path.
    with open("/home/juraseg/src/drsfostersmith_products.txt",
              'a+') as handle:
        handle.write("======================================\n")
        handle.write("Product not found\n")
        handle.write("SKU: %s, Name: %s\n" % (sku, name))
        for prod in found_products:
            handle.write("Found: %s, %s, %s\n" % prod)
        handle.write("======================================\n\n")
def parse_product(self, response):
    """Parse a Harveys product page and collect one item per variant.

    Product data is embedded in 'Harveys.DATA.CDP.Products' JS assignments
    (one line per product). For each variant the category is resolved in
    priority order: previously-crawled old_data, the category_products
    url map, then a long URL/name keyword heuristic. Items are appended
    to self.products_collected rather than yielded.
    """
    hxs = HtmlXPathSelector(response)
    product_titles = hxs.select('//div[@class="product-header"]/h2/text()').extract()
    # Per-product URL fragments, parallel to product_titles by index.
    product_urls = hxs.select('//div[@data-product-id]/@class').re(r'js-product-([\w-]+)')
    products = []
    for l in response.body.split('\n'):
        if 'Harveys.DATA.CDP.Products' in l:
            products.append(l.strip())
    for i, product in enumerate(products):
        # Line is "<lhs> = <json>;" -- take the JSON, dropping the ';'.
        data = json.loads(product.split(' = ')[1][:-1])
        product_id = data['product_id']
        product_url = response.url
        for value in data['variants'].values():
            product_name = product_titles[i] + ' - ' + ' - '.join(value['attributes'].values())
            product_price = value['prices']['price']['value']
            variant_id = value[u'variant_id']
            product_identifier = '%s:%s' % (product_id, variant_id)
            product_url = urljoin_rfc(product_url, '#/%s' % product_urls[i])
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', product_url)
            loader.add_value('name', product_name)
            loader.add_value('identifier', product_identifier)
            loader.add_value('price', product_price)
            loader.add_value('shipping_cost', '59')
            # 1st choice: metadata remembered from a previous crawl.
            if product_url in self.old_data:
                loader.add_value('category', self.old_data[product_url]['category'])
                loader.add_value('brand', self.old_data[product_url]['brand'])
                loader.add_value('sku', self.old_data[product_url]['sku'])
            category_found = bool(loader.get_output_value('category'))
            # 2nd choice: category -> url listing map.
            if not category_found:
                for category, urls in self.category_products.items():
                    if product_url in urls or product_url + '/' in urls:
                        loader.add_value('category', category.split(','))
                        category_found = True
                        break
            # 3rd choice: keyword heuristics over the URL / product name.
            # Order matters: more specific patterns must come first.
            if not category_found:
                if 'lily-loveseat' in product_url:
                    loader.add_value('category', ['Sofa', 'Fabric', 'armchair'])
                elif 'lean-to-shelf' in product_url:
                    loader.add_value('category', ['Cabinets', 'Bookcases'])
                elif 'bench' in product_url:
                    loader.add_value('category', ['Dining', 'Dining Tables'])
                elif 'console-table' in product_url:
                    loader.add_value('category', ['Cabinets', 'Console Tables'])
                elif 'coffee-table' in product_url:
                    loader.add_value('category', ['Living', 'Coffee Tables'])
                elif 'nest-of-table' in product_url:
                    loader.add_value('category', ['Living', 'Nest of Tables'])
                elif '-sofa' in product_url or 'sofa' in product_name.lower():
                    if 'leather' in product_url or 'leather' in product_name.lower():
                        category = ['Sofa', 'Leather']
                    else:
                        category = ['Sofa', 'Fabric']
                    if '2-seater' in product_url:
                        category.append('2 seater')
                    elif '2.5 seater' in product_name.lower():
                        category.append('2.5 seater')
                    elif '3-seater' in product_url:
                        category.append('3 seater')
                    elif '4-seater' in product_url:
                        category.append('4 seater')
                    elif 'corner' in product_url:
                        category.append('Corner sofas')
                    elif 'recliner' in product_url:
                        category.append('Recliner sofas')
                    # Only assign when a seat-count/shape level was added.
                    if len(category) == 3:
                        loader.add_value('category', category)
                elif '-corner' in product_url:
                    if 'leather' in product_url or 'leather' in product_name.lower():
                        category = ['Sofa', 'Leather', 'Corner sofas']
                    else:
                        category = ['Sofa', 'Fabric', 'Corner sofas']
                    loader.add_value('category', category)
                elif '-recliner-chair' in product_url or (('chair' in product_name.lower() or 'seat' in product_name.lower()) and ('recliner' in product_name.lower() or ' no recline' in product_name.lower())) or 'relaxer-chair' in product_url or 'hand-facing' in product_url:
                    if 'leather' in product_url or 'leather' in product_name.lower() or 'reid-hedgemoor' in product_url:
                        category = ['Sofa', 'Leather', 'armchair']
                    else:
                        category = ['Sofa', 'Fabric', 'armchair']
                    loader.add_value('category', category)
                elif '-footstool' in product_url and not ('chair' in product_url):
                    if 'millan-' in product_url or 'leather' in product_url or 'leather' in product_name.lower():
                        loader.add_value('category', ['Sofa', 'Leather', 'Footstools'])
                    else:
                        loader.add_value('category', ['Sofa', 'Fabric', 'Footstools'])
                elif '-table' in product_url and '-chairs' in product_url:
                    loader.add_value('category', ['Dining', 'Dining Sets'])
                elif '-dining-table' in product_url:
                    loader.add_value('category', ['Dining', 'Dining Tables'])
                elif '-bookcase' in product_url:
                    loader.add_value('category', ['Cabinets', 'Bookcases'])
                elif '-lamp-table' in product_url:
                    loader.add_value('category', ['Living', 'Lamp Tables'])
                elif '-sideboard' in product_url:
                    loader.add_value('category', ['Cabinets', 'Sideboards'])
                elif '-display-unit' in product_url:
                    loader.add_value('category', ['Cabinets', 'Display Units'])
                elif 'tv unit' in product_name.lower():
                    loader.add_value('category', ['Cabinets', 'Entertainment units'])
                elif '-shelving-unit' in product_url:
                    loader.add_value('category', ['Cabinets', 'Display Units'])
                elif '-wine-storage' in product_url:
                    loader.add_value('category', ['Cabinets', 'Display Units'])
            self.products_collected.append(set_product_type(loader.load_item()))
def _process_product_info_product_details(self, response, product_info):
    """Enrich ``product_info`` from the details page and route it onward.

    This needs to be in separate function because used by two methods:
    parse_product_details and parse_ajax_price.

    Generator that, depending on spider configuration flags
    (``amazon_direct``, ``only_buybox``, ``all_sellers`` /
    ``lowest_product_and_seller``), either collects the product directly or
    yields follow-up ``Request``s for product options, reviews, or the
    marketplace ("MBC") seller lists.

    :param response: product details page response (meta may carry
        ``seller_identifier``, ``check_match``, ``parse_options``,
        ``collect_reviews``, ``collect_mbc``, ``search_string``).
    :param product_info: dict of data already scraped for this product;
        mutated in place (category, sku, seller_identifier, name).
    """
    hxs = HtmlXPathSelector(response)
    # First entry of the left-hand "bucket" breadcrumb list becomes the category.
    categories = hxs.select(
        '//div[@class="bucket"]/div[@class="content"]/ul/li[1]/a/text()'
    ).extract()
    product_info['category'] = categories
    # ISBN-13 doubles as the SKU (book-oriented extraction); empty when absent.
    sku = hxs.select(
        '//li[b[contains(text(), "ISBN-13")]]/text()').extract()
    product_info['sku'] = sku[0].strip() if sku else ''
    # Carry a seller identifier forward from the request meta, but never
    # overwrite one already present in product_info.
    if response.meta.get(
            'seller_identifier',
            None) and not product_info.get('seller_identifier', None):
        product_info['seller_identifier'] = response.meta[
            'seller_identifier']
    check_match = response.meta.get('check_match', True)
    match = self.match(response.meta, self.current_search_item, product_info)
    if check_match and not match:
        # Scraped product does not match the current search item -> drop it.
        self.log("[AMAZON] WARNING: product does not match: %s" %
                 response.url)
        return
    if self.parse_options:
        if product_info['options'] and response.meta.get(
                'parse_options', True):
            # Fan out one request per option; 'parse_options': False in the
            # child meta prevents infinite recursion on option pages.
            self.log('[AMAZON] OPTIONS FOUND => %s' % response.url)
            for option in product_info['options']:
                new_meta = response.meta.copy()
                new_meta.update({
                    'parse_options': False,
                    'search_string': self.current_search,
                    'search_item': self.current_search_item,
                    'check_match': check_match
                })
                yield Request(option['url'],
                              self.parse_product,
                              meta=new_meta,
                              dont_filter=True)
            return
        else:
            # No option fan-out: fold option information into the name.
            if product_info['name_with_options']:
                product_info['name'] = product_info['name_with_options']
            elif product_info['option_texts']:
                product_info['name'] += ' [' + ', '.join(
                    product_info['option_texts']) + ']'
    if self.type == 'asins':
        # Sanity check: ASIN scraped from the page must match the URL's ASIN.
        url_asin = AmazonUrlCreator.get_product_asin_from_url(
            product_info['url'])
        if product_info['asin'].lower() != url_asin.lower():
            self.log(
                "[AMAZON] product ASIN '%s' does not match url ASIN '%s'. Page: %s"
                % (product_info['asin'], url_asin, response.url))
            return
    # Amazon Direct
    if self.amazon_direct:
        if self.collect_reviews and product_info.get(
                'reviews_url') and response.meta.get(
                    'collect_reviews', True):
            # Detour through the reviews page; the product travels in meta.
            new_meta = response.meta.copy()
            new_meta['found_item'] = product_info
            if self.type == 'search':
                new_meta.update({
                    'search_string': response.meta['search_string'],
                    'search_item': self.current_search_item,
                })
            yield Request(product_info['reviews_url'],
                          callback=self.parse_reviews,
                          meta=new_meta)
        else:
            product = self.construct_product(product_info,
                                             meta=response.meta)
            self.log("[AMAZON] collect parse product: %s" %
                     product['identifier'])
            if self.type == 'category':
                yield product
            else:
                self._collect_amazon_direct(product, response.meta)
    # Buy Box
    elif self.only_buybox:
        if (product_info['price'] and product_info['vendor'] and
                self._seller_ok(product_info['vendor'])) or \
                self.collect_products_with_no_dealer:
            if self.collect_reviews and product_info.get(
                    'reviews_url') and response.meta.get(
                        'collect_reviews', True):
                new_meta = response.meta.copy()
                new_meta['found_item'] = product_info
                if self.type == 'search':
                    new_meta.update({
                        'search_string': response.meta['search_string'],
                        'search_item': self.current_search_item,
                    })
                yield Request(product_info['reviews_url'],
                              callback=self.parse_reviews,
                              meta=new_meta)
            else:
                product = self.construct_product(product_info,
                                                 meta=response.meta)
                self.log("[AMAZON] collect parse product: %s" %
                         product['identifier'])
                if self.type == 'category':
                    yield product
                else:
                    self._collect_buybox(product, response.meta)
        elif not product_info['vendor'] or not product_info['price']:
            # Missing buy-box vendor/price: fall back to the new-items MBC list.
            # NOTE(review): unlike the sibling branches this update is not
            # guarded by `self.type == 'search'` — confirm meta always has
            # 'search_string' here.
            new_meta = response.meta.copy()
            new_meta['found_item'] = product_info
            new_meta.update({
                'search_string': response.meta['search_string'],
                'search_item': self.current_search_item,
            })
            yield Request(product_info['mbc_list_url_new'],
                          callback=self.parse_mbc_list,
                          meta=new_meta)
            #self.log("[AMAZON] WARNING: product with no vendor: %s" % response.url)
        else:
            self.log("[AMAZON] WARNING: vendor not allowed: %s" %
                     response.url)
    # all sellers / lowest price
    elif self.all_sellers or self.lowest_product_and_seller:
        # Go to MBC lists to get dealers prices
        collect_mbc = response.meta.get('collect_mbc', True)
        if collect_mbc and product_info.get(
                'mbc_list_url_new') and self.collect_new_products:
            # yield mbc parse
            new_meta = response.meta.copy()
            new_meta['found_item'] = product_info
            if self.type == 'search':
                new_meta.update({
                    'search_string': response.meta['search_string'],
                    'search_item': self.current_search_item,
                })
            yield Request(product_info['mbc_list_url_new'],
                          callback=self.parse_mbc_list,
                          meta=new_meta)
        elif collect_mbc and product_info.get(
                'mbc_list_url_used') and self.collect_used_products:
            # yield mbc parse
            new_meta = response.meta.copy()
            new_meta['found_item'] = product_info
            if self.type == 'search':
                new_meta.update({
                    'search_string': response.meta['search_string'],
                    'search_item': self.current_search_item,
                })
            yield Request(product_info['mbc_list_url_used'],
                          callback=self.parse_mbc_list,
                          meta=new_meta)
        else:
            if (product_info['vendor'] and
                    self._seller_ok(product_info['vendor'])) or \
                    self.collect_products_with_no_dealer:
                if self.collect_reviews and product_info.get(
                        'reviews_url') and response.meta.get(
                            'collect_reviews', True):
                    new_meta = response.meta.copy()
                    new_meta['found_item'] = product_info
                    if self.type == 'search':
                        new_meta.update({
                            'search_string': response.meta['search_string'],
                            'search_item': self.current_search_item,
                        })
                    yield Request(product_info['reviews_url'],
                                  callback=self.parse_reviews,
                                  meta=new_meta)
                else:
                    # In lowest-seller mode the seller id is optionally kept
                    # out of the identifier so prices compare across sellers.
                    use_seller_id_in_identifier = False \
                        if self.lowest_product_and_seller and not self.lowest_seller_collect_dealer_identifier else True
                    product = self.construct_product(
                        product_info,
                        meta=response.meta,
                        use_seller_id_in_identifier=
                        use_seller_id_in_identifier)
                    self.log("[AMAZON] collect parse product: %s" %
                             product['identifier'])
                    if self.type == 'category':
                        yield product
                    else:
                        self._collect(product)
            elif not product_info['vendor']:
                # TODO: collect vendor from vendor details page
                self.log(
                    "[AMAZON] WARNING: Could not scrape vendor from product details: %s"
                    % response.url)
                self.errors.append(
                    "Could not scrape vendor from product details: %s" %
                    response.url)
            else:
                self.log("[AMAZON] WARNING: vendor not allowed: %s" %
                         response.url)
def parse_product(self, response):
    """Parse a Cell Bikes product page into a Product item.

    Extracts identifier/sku, name, price, image, brand (first match from
    ``self.brands`` found in the name), up to three breadcrumb categories
    and stock status.  Products priced at 99 or less get a flat shipping
    cost.  If the page exposes an internal item URL via
    ``<meta itemprop="url">``, a follow-up request to the items API is
    yielded so ``parse_options`` can enumerate the product's options;
    otherwise the product itself is yielded.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)

    sku = hxs.select('//p[@class="sku-number"]/span/text()').extract()
    if sku:
        # SKU number doubles as the unique identifier.
        loader.add_value('identifier', sku[0])
        loader.add_value('sku', sku[0])
    else:
        log.msg('### No product ID at ' + response.url, level=log.INFO)

    name = ''
    tmp = hxs.select('//h1[@itemprop="name"]/text()').extract()
    if tmp:
        name = tmp[0].strip()
        loader.add_value('name', name)
    else:
        log.msg('### No name at ' + response.url, level=log.INFO)

    # price
    price = 0
    tmp = hxs.select('//strong[@itemprop="price"]/text()').extract()
    if tmp:
        price = extract_price(tmp[0].strip())
        loader.add_value('price', price)

    # image_url
    tmp = hxs.select('//img[@itemprop="image"]/@src').extract()
    if tmp:
        loader.add_value('image_url', tmp[0])

    # brand: first known brand mentioned in the product name wins
    for brand in self.brands:
        if brand.lower() in name.lower():
            loader.add_value('brand', brand)
            break

    # category: drop the root breadcrumb, keep at most the last 3 levels
    crumbs = hxs.select('//ul[@itemprop="breadcrumb"]/li/a/text()').extract()
    if crumbs:
        crumbs = crumbs[1:]
        if len(crumbs) > 3:
            crumbs = crumbs[-3:]
        for crumb in crumbs:
            loader.add_value('category', crumb)

    # shipping_cost (flat rate, applied below for cheap items only)
    shipping_cost = '9.90'

    # stock: no price means unavailable; otherwise trust the status label
    if not price:
        loader.add_value('stock', 0)
    else:
        tmp = hxs.select(
            '//span[contains(@class,"stock-status")]/text()').extract()
        if tmp and 'Out' in tmp[0]:
            loader.add_value('stock', 0)
        else:
            loader.add_value('stock', 1)

    product = loader.load_item()

    tmp = hxs.select('//meta[@itemprop="url"]/@content').extract()
    # Use .get(): 'price' is absent from the item when no price node existed.
    if product.get('price', 0) <= 99:
        product['shipping_cost'] = shipping_cost
    if not tmp:
        yield product
        return

    # process options via the site's items API
    path = tmp[0].lstrip('/')
    # Fixed mojibake: the query previously contained '¤cy=AUD' — an
    # HTML-entity corruption of '&currency=AUD' — which broke the API call.
    api = ('http://www.cellbikes.com.au/api/items?include=facets'
           '&fieldset=details&language=en&country=AU&currency=AUD'
           '&pricelevel=5&c=980629&n=3')
    if path.startswith('product/'):
        # 'product/<id>' paths are queried by id, everything else by url.
        url = api + '&id=%s' % path[8:]
    else:
        url = api + '&url=%s' % path
    yield Request(url, meta={'product': product}, callback=self.parse_options)
def parse(self, response):
    """Entry point: harvest category links from the product menu and
    schedule each one for :meth:`parse_products`."""
    selector = HtmlXPathSelector(response)
    base = get_base_url(response)
    menu_links = selector.select(
        '//td[@class="produkt_menu"]/div/table/tr/td/a/@href').extract()
    for href in menu_links:
        yield Request(urljoin_rfc(base, href), callback=self.parse_products)
def parse_product(self, response):
    """Parse a product page, yielding one Product per <select id="variant">
    option, or a single Product when the page has no variant selector.

    Returns early (yields nothing) when no price is present on the page.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    name = hxs.select('//div[@class="catBanner"]/h2/text()').extract()[0]
    price = hxs.select(
        '//span[@id="variant-price-header"]/text()').extract()
    if price:
        price = extract_price(price[0])
    else:
        # No price header at all -> skip this product entirely.
        return
    sku = hxs.select('//div[@class="prod"]/p[@class="code"]').re(
        "Code: ([0-9]+)")[0]
    brand = hxs.select(
        '//td[@class="attrib" and text()="Manufacturer"]/following-sibling::td/text()'
    ).extract()
    product_loader.add_value('sku', sku)
    # Breadcrumb trail minus site root and current page; the final [2:]
    # strips a 2-char prefix from the joined string (separator artefact).
    category = " ".join(
        hxs.select('//div[@id="breadcrumb"]/ul/li/a/text()').extract()
        [2:-1])[2:]
    product_loader.add_value('category', category)
    product_loader.add_value('brand', brand)
    image_url = hxs.select(
        '//div[@id="primary_image"]/a/img/@src').extract()
    if image_url:
        product_loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_url[0]))
    identifier = hxs.select(
        '//input[@name="productCodePost"]/@value').extract()
    # Base item; per-variant copies are derived from it below.
    product = product_loader.load_item()
    variants = hxs.select('//select[@id="variant"]/option')
    if variants:
        for option in variants:
            value = option.select('./@value').extract()
            if value:
                # parse_variant decodes the option value into a dict with
                # 'price' and 'code' keys (helper defined elsewhere).
                variant = parse_variant(value[0])
                title = option.select('./text()').extract()[0]
                price = extract_price(variant.get('price', "0"))
                subid = variant.get('code')
                if subid:
                    prod = Product(product)
                    prod['identifier'] = "%s_%s" % (identifier[0], subid)
                    prod['price'] = price
                    # Option text is "<variant name> £<price>"; keep only
                    # the name part, normalising nbsp and trailing comma.
                    subname = title.split(u"£")
                    if subname:
                        subname = subname[0].strip().replace(u"\xa0", " ")
                        if subname.endswith(","):
                            subname = subname[:-1]
                        prod['name'] = "%s %s" % (name, subname)
                    # NOTE(review): yield placement reconstructed as
                    # per-variant (inside the subid guard) — options
                    # without a code yield nothing; confirm intent.
                    yield prod
    else:
        # one option product
        prod = Product(product)
        prod['name'] = name
        o = hxs.select(
            '//div[@class="options_not_available"]/text()').extract()
        if o:
            # Append availability note (e.g. discontinued option) to name.
            prod['name'] += ' ' + o[0].strip()
        prod['identifier'] = identifier[0]
        prod['price'] = price
        yield prod
def parse_cat(self, response):
    """Crawl a category page: follow server-side search redirects and
    subcategories (plus their AJAX product-list endpoints), paginate via
    link or AJAX POST, and schedule every listed product for
    :meth:`parse_product`.
    """
    base_url = get_base_url(response)
    # NOTE(review): hxs is assigned but never used in this method.
    hxs = HtmlXPathSelector(response)
    # Some categories embed a server-side search URL in an inline script.
    search_url = response.xpath('//script/text()').re_first(
        'var refine_filters_server_search_script.*?"(.+)"')
    if search_url:
        yield Request(response.urljoin(search_url), self.parse_cat,
                      meta=response.meta)
    subcats = response.xpath(
        '//div[@class="no_child_subcats_list"]//a/@href').extract()
    subcats += response.css('div.subcats_list a::attr(href)').extract()
    if subcats:
        for subcat in subcats:
            yield Request(
                url=urljoin_rfc(base_url, subcat),
                meta=response.meta,
                callback=self.parse_cat)
            # AJAX?  Heuristic: relative slug-style links also expose a
            # '<slug>_search.php' endpoint answering get_products POSTs.
            if (
                    '-' in subcat and
                    'http' not in subcat and
                    '?' not in subcat and
                    '=' not in subcat
            ):
                url = urljoin_rfc(base_url, subcat.replace('-', '_')
                                  .replace('/', '') + '_search.php')
                request = FormRequest(url=url,
                                      formdata={u'mode': u'get_products',
                                                u'objects_per_page': u'45',
                                                u'page': u'1'},
                                      meta=response.meta,
                                      callback=self.parse_cat)
                yield request
    next_page = response.css('a.right-arrow::attr(href)').extract()
    if not next_page or (next_page and not next_page[0]):
        # No usable href: the arrow paginates via an onclick AJAX call.
        try:
            next_page = int(response.css(
                'a.right-arrow::attr(onclick)').re(r"\('(\d+)'")[0])
            request = FormRequest(url=response.url,
                                  formdata={u'mode': u'get_products',
                                            u'objects_per_page': u'45',
                                            u'page': unicode(next_page)},
                                  meta=response.meta,
                                  callback=self.parse_cat,
                                  dont_filter=True)
            yield request
        except:
            # Arrow present but no parsable page number -> no pagination.
            pass
        else:
            # NOTE(review): try/else reconstructed from collapsed source —
            # clears next_page (now an int) so the href branch below is
            # skipped after a successful AJAX pagination; confirm.
            next_page = None
    if next_page:
        url1 = urljoin_rfc(base_url, next_page[0])
        yield Request(
            url=url1,
            meta=response.meta,
            callback=self.parse_cat)
    products = response.css('div#pr_list a::attr(href)').extract()
    for product in products:
        # self.cookie_jar += 1
        meta = response.meta.copy()
        # Isolate each product request from the shared cookie session.
        meta['dont_merge_cookies'] = True
        # meta['cookiejar'] = self.cookie_jar
        yield Request(
            url=urljoin_rfc(base_url, product),
            meta=meta,
            callback=self.parse_product)