from urllib.parse import urljoin

from lxml import etree


def generate_category_urls(self):
    """Walk the site's nav menu and yield every paginated category page URL."""
    html = retry_get_html(self.domain)
    parser = etree.HTML(html)
    # Every link under the nav menu points at a category landing page.
    category_xpath_str = '//*[@id="nav"]/div/div/ul//a/@href'
    category_hrefs = parser.xpath(category_xpath_str)
    category_urls = [urljoin(self.domain, href) for href in category_hrefs]
    for url in category_urls:
        parser = etree.HTML(retry_get_html(url))
        try:
            # The pager text has the form "current/total", e.g. "1/12";
            # the part after the slash is the total page count.
            page_str = parser.xpath('//div[@class="cur"]/text()')[0]
            num = int(page_str.split('/')[1])
        except (IndexError, ValueError):
            # No pager (or an unexpected format): treat it as a single page.
            num = 1
        for page_num in range(1, num + 1):
            yield url + '/%d.html' % page_num
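# `retry_get_html` is used throughout this excerpt but never defined in it.
# A minimal sketch of what it plausibly does, assuming a requests-based GET
# with a few retries (the name comes from the source; the body, retry count,
# and timeout below are assumptions for illustration only):
import time

import requests


def retry_get_html(url, retries=3, delay=1):
    """Fetch `url` and return the response body, retrying on transient errors."""
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(delay)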
def generate_item_urls(self):
    """Walk every category page of the site and yield each product URL on it."""
    category_url_list = self.generate_category_urls()
    # Each product on a listing page is linked through its picture box.
    href_xpath = '//div[@id="prod_list"]//a[@class="pic_box"]/@href'
    for category_url in category_url_list:
        html = retry_get_html(category_url)
        parser = etree.HTML(html)
        href_list = parser.xpath(href_xpath)
        for href in href_list:
            yield urljoin(self.domain, href)
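# Hypothetical usage sketch: the class that owns the two generators above is
# not part of this excerpt, so `Spider` is a stand-in that only carries the
# `domain` attribute they rely on and binds the module-level functions as
# methods.
class Spider:
    def __init__(self, domain):
        self.domain = domain

    generate_category_urls = generate_category_urls
    generate_item_urls = generate_item_urls


if __name__ == '__main__':
    for item_url in Spider('http://example.com').generate_item_urls():
        print(item_url)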
def html(self):
    """Fetch and return the raw HTML of this object's page."""
    return retry_get_html(self.url)
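# Downstream sketch: the markup from `html()` goes straight into lxml for
# field extraction. `Page` is an assumed stand-in for the (unshown) class
# that owns `html`, and the XPath below is illustrative only.
class Page:
    def __init__(self, url):
        self.url = url

    html = html  # bind the module-level function above as a method


if __name__ == '__main__':
    tree = etree.HTML(Page('http://example.com/item/1.html').html())
    print(tree.xpath('//title/text()'))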