def parse(self, response):
        """Entry point for the shop front page.

        Extracts the seller id and the category links, patches the
        ``rateUrl`` / ``categoryUrl`` / ``asyncUrl`` templates in place, and
        yields one listing request per discovered category id.
        """
        response_sel = Selector(response)

        category = response_sel.xpath(u'//a[contains(@href,"category")]/@href').extract()
        sellerid = response_sel.xpath(u'//meta[contains(@content,"userId")]/@content').extract()

        # Guard before indexing: the original did `sellerid[0]` and
        # `re.findall(...)[0]` unconditionally, which raised IndexError when
        # the meta tag or the userId parameter was missing — the error-logging
        # else-branch below was unreachable.
        seller_match = re.findall(r'userId=(\d+)', sellerid[0]) if sellerid else []
        if seller_match:
            self.rateUrl = set_query(self.rateUrl, sellerId=seller_match[0])
        else:
            self.logger.error("Get the sellerid error !")

        # Same guard for the domain: indexing an empty findall() result used
        # to crash before the else-branch could log.
        domain_match = re.findall(r'https:(.*)', response.url)
        if domain_match:
            # replace the request page domain in both URL templates
            self.categoryUrl, result_count = re.subn(r'//(.*?)/', domain_match[0], self.categoryUrl)
            self.asyncUrl, result_count = re.subn(r'//(.*?)/', domain_match[0], self.asyncUrl)
        else:
            self.logger.error("Get the request domain error!")

        # Collect distinct category ids from links like "category-12345.htm".
        all_category = set()
        for category_url in category:
            category_id = re.findall(r'category-(\d+).htm', category_url)
            if category_id:
                all_category.add(category_id[0])

        for category_id in all_category:
            # Substitute the category id (first run of 2+ digits) into the
            # category URL template.
            result_url, result_count = re.subn(r'(\d+\d+)', category_id, self.categoryUrl)
            self.logger.info("category url : %s", result_url)
            yield Request(url=result_url, callback=self.parse_category)
    def parse_category(self, response):
        response_sel = Selector(response)
        data_widgetid = response_sel.xpath(u'//*[@class="J_TModule" and @data-title="搜索列表"]/@data-widgetid').extract()
        wid = data_widgetid[0]

        mid = 'w-' + wid + '-0'
        catId = get_query(response.url, 'catId')
        path = "/category"+catId + '.htm'
        pageNo = get_query(response.url, 'pageNo')

        page_url = set_query(self.asyncUrl, wid=wid, mid=mid, path=path, catId=catId, scid=catId,pageNo=pageNo)

        yield Request(url=page_url, callback=self.parse_nextpage)
    def parse_nextpage(self, response):
        response_sel = Selector(response)
        next_pageurl = response_sel.xpath(u'//a[contains(@class,"next")]/@href').extract()

        if len(next_pageurl) > 0:
            page_num = get_query(next_pageurl[0], 'pageNo')
            next_url = set_query(self.categoryUrl, pageNo=page_num)
            yield Request(url=next_url, callback=self.parse_category)
        else:
            self.logger.warning("Can not find the next page url ! ")

        dl_bodys = response_sel.xpath(u'/html/body/div/div[3]')

        for dl_body in dl_bodys:
            item_lines = dl_body.xpath(u'./div/dl')
            for item_line in item_lines:
                comment_item = TmallCommentItem()

                data_id = item_line.xpath(u'./@data-id').extract()

                item_id = re.findall('(\d+)', data_id[0])

                item_name = item_line.xpath(u'./dd[contains(@class,"detail")]/a/text()').extract()
                item_type = item_line.xpath(u'./dd[contains(@class,"detail")]/a/span/text()').extract()
                item_price = item_line.xpath(u'./dd[contains(@class,"detail")]/div/div[contains(@class,"cprice-area")]/span/text()').extract()
                item_sales = item_line.xpath(u'./dd[contains(@class,"detail")]/div/div[contains(@class,"sale-area")]/span/text()').extract()

                if len(item_name) > 1:
                    comment_item['ItemName'] = item_name[0].strip() + ' ' + item_name[1].strip()
                else:
                    comment_item['ItemName'] = item_name[0].strip()

                if len(item_type) > 0:
                    comment_item['ItemType'] = item_type[0].strip()
                if len(item_price) > 1:
                    comment_item['ItemPrice'] = item_price[1].strip()
                if len(item_sales) > 0:
                    comment_item['ItemSales'] = item_sales[0].strip()

                yield comment_item