def parse(self, response):
    """Entry point: scrape the shop front page for the seller id and the
    category links.

    Side effects: rewrites ``self.rateUrl`` (injects the seller id) and
    ``self.categoryUrl`` / ``self.asyncUrl`` (swaps in this shop's own
    domain), then yields one :class:`Request` per discovered category.
    """
    response_sel = Selector(response)
    category = response_sel.xpath(u'//a[contains(@href,"category")]/@href').extract()
    sellerid = response_sel.xpath(u'//meta[contains(@content,"userId")]/@content').extract()
    # Pull the numeric seller id out of the meta tag.  Guard both lookups:
    # the original indexed sellerid[0] / findall()[0] unconditionally and
    # raised IndexError on an unexpected page before its own error branch
    # could run.
    seller_match = re.findall(r'userId=(\d+)', sellerid[0]) if sellerid else []
    if seller_match:
        self.rateUrl = set_query(self.rateUrl, sellerId=seller_match[0])
    else:
        self.logger.error("Get the sellerid error !")
    # Everything after "https:" — i.e. "//host/path..." — of the page we
    # actually landed on; used to retarget the template URLs.
    domain = re.findall(r'https:(.*)', response.url)
    if domain:
        # Replace the "//host/" portion of the template URLs with this
        # shop's own location so later requests hit the right storefront.
        self.categoryUrl = re.sub(r'//(.*?)/', domain[0], self.categoryUrl)
        self.asyncUrl = re.sub(r'//(.*?)/', domain[0], self.asyncUrl)
    else:
        self.logger.error("Get the request domain error!")
    # Collect the distinct category ids linked from this page.
    all_category = set()
    for category_url in category:
        # Dot escaped: the original pattern 'category-(\d+).htm' let '.'
        # match any character, not just the literal dot.
        category_id = re.findall(r'category-(\d+)\.htm', category_url)
        if category_id:
            all_category.add(category_id[0])
    for category_id in all_category:
        # Substitute the category id for the multi-digit placeholder in
        # the category URL template.
        result_url = re.sub(r'(\d+\d+)', category_id, self.categoryUrl)
        self.logger.info("category url : %s", result_url)
        yield Request(url=result_url, callback=self.parse_category)
def parse_category(self, response):
    """Turn a category listing page into its async (AJAX) data request.

    Reads the widget id of the module titled "搜索列表" (search list),
    rebuilds ``self.asyncUrl`` with the widget / category / page query
    parameters, and yields the request handled by ``parse_nextpage``.
    """
    response_sel = Selector(response)
    data_widgetid = response_sel.xpath(
        u'//*[@class="J_TModule" and @data-title="搜索列表"]/@data-widgetid').extract()
    # Guard the lookup: the original indexed data_widgetid[0] blindly and
    # crashed with IndexError whenever the module was missing (e.g. an
    # anti-bot or error page).
    if not data_widgetid:
        self.logger.error("Can not find the data-widgetid on %s", response.url)
        return
    wid = data_widgetid[0]
    mid = 'w-' + wid + '-0'
    catId = get_query(response.url, 'catId')
    path = "/category" + catId + '.htm'
    pageNo = get_query(response.url, 'pageNo')
    page_url = set_query(self.asyncUrl, wid=wid, mid=mid, path=path,
                         catId=catId, scid=catId, pageNo=pageNo)
    yield Request(url=page_url, callback=self.parse_nextpage)
def parse_nextpage(self, response):
    """Parse the async listing response.

    Schedules the next listing page (if a "next" link exists) and yields
    one ``TmallCommentItem`` per product line found on this page.
    """
    response_sel = Selector(response)
    next_pageurl = response_sel.xpath(u'//a[contains(@class,"next")]/@href').extract()
    if next_pageurl:
        page_num = get_query(next_pageurl[0], 'pageNo')
        next_url = set_query(self.categoryUrl, pageNo=page_num)
        yield Request(url=next_url, callback=self.parse_category)
    else:
        self.logger.warning("Can not find the next page url ! ")
    dl_bodys = response_sel.xpath(u'/html/body/div/div[3]')
    for dl_body in dl_bodys:
        item_lines = dl_body.xpath(u'./div/dl')
        for item_line in item_lines:
            comment_item = TmallCommentItem()
            # NOTE(review): the original also extracted ./@data-id and
            # ran re.findall on it, but the result was never stored on
            # the item — removed as dead code.
            item_name = item_line.xpath(u'./dd[contains(@class,"detail")]/a/text()').extract()
            item_type = item_line.xpath(u'./dd[contains(@class,"detail")]/a/span/text()').extract()
            item_price = item_line.xpath(u'./dd[contains(@class,"detail")]/div/div[contains(@class,"cprice-area")]/span/text()').extract()
            item_sales = item_line.xpath(u'./dd[contains(@class,"detail")]/div/div[contains(@class,"sale-area")]/span/text()').extract()
            # Guard the name lookup: the original indexed item_name[0]
            # unconditionally and raised IndexError on a nameless line.
            if len(item_name) > 1:
                comment_item['ItemName'] = item_name[0].strip() + ' ' + item_name[1].strip()
            elif item_name:
                comment_item['ItemName'] = item_name[0].strip()
            if len(item_type) > 0:
                comment_item['ItemType'] = item_type[0].strip()
            # Price: second <span> holds the promo/current price when two
            # are present — presumably; TODO confirm against a live page.
            if len(item_price) > 1:
                comment_item['ItemPrice'] = item_price[1].strip()
            if len(item_sales) > 0:
                comment_item['ItemSales'] = item_sales[0].strip()
            yield comment_item