def go(self, url):
    """Fetch a product page and assemble its product record.

    The page at ``url`` is downloaded with ``get_html`` (using the
    module-level ``headers``) and first passed through ``self.__filter``.
    When the filter result carries a non-zero ``errcode`` that result is
    returned unchanged; otherwise the page is parsed and the product dict
    produced by ``self.__extract_base_and_sku`` is enriched and returned.

    :param url: address of the product page to fetch.
    :return: the filter result (non-zero ``errcode``) or the product dict
        with ``title``, ``images``, ``attributes`` and ``description``
        filled in.
    """
    page_source = get_html(url, headers)

    # Filter out products that do not fit the Yiwugo pricing scheme.
    verdict = self.__filter(page_source)
    if verdict['errcode'] != 0:
        return verdict

    dom = html.fromstring(page_source)
    product = self.__extract_base_and_sku(dom)

    if product['isSKUOffer'] == 'false':
        # Scrape the tiered (price-range) prices for non-SKU offers.
        tiers = self.__extract_price_range(dom)
        if len(tiers) >= 1:
            product['isRangePriceSku'] = 'true'
            bookable = self.__extract_can_book_count_based_on_price_range(dom)
            product['sku'] = {
                "priceRange": tiers,
                "skuProps": [],
                "canBookCount": str(bookable),
            }

    # Remaining fields are filled in this fixed order; the description
    # extractor runs last.
    remaining_fields = (
        ('title', self.__extract_title),
        ('images', self.__extract_images),
        ('attributes', self.__extract_attributes),
        ('description', self.__extract_description),
    )
    for field_name, extract in remaining_fields:
        product[field_name] = extract(dom)

    return product
def go(self, url):
    """Fetch a page and return its categories together with shop info.

    :param url: address of the page to fetch via ``get_html``.
    :return: dict with ``'data'`` (result of ``self.__extract_categories``)
        and ``'shop'`` (result of ``self.__extract_shop_info``).
    """
    dom = html.fromstring(get_html(url))

    # Categories are extracted before the shop info, as in the dict layout.
    categories = self.__extract_categories(dom)
    shop_info = self.__extract_shop_info(dom)

    return {'data': categories, 'shop': shop_info}
def __extract_description(self, tree):
    """Return the product description markup for a parsed product page.

    The description is loaded from the URL stored in the
    ``data-tfs-url`` attribute of the ``desc-lazyload-container`` div.
    If the processed response is empty, the serialized
    ``offerdetail_easyoffer_dsc`` div from the page is used instead.

    :param tree: parsed product page supporting ``xpath``.
    :return: description markup as a string.
    :raises IndexError: if a required div is not present in ``tree``
        (the first XPath match is taken without a guard).
    """
    lazy_container = tree.xpath('//div[@id="desc-lazyload-container"]')[0]
    raw_response = get_html(lazy_container.attrib['data-tfs-url'], headers)

    # Drop the first 30 and last 3 characters, then remove backslashes.
    # NOTE(review): presumably this strips a script wrapper around an
    # escaped HTML payload - confirm against a live response.
    markup = raw_response[30:-3].replace('\\', '')

    # Neutralise every link target in the description.
    markup = re.sub('href[^>]+', 'href="#none"', markup)
    if markup:
        return markup

    # Nothing came back from the lazy-load URL: fall back to the
    # description block embedded in the page itself.
    easy_block = tree.xpath(
        '//div[contains(@class, "offerdetail_easyoffer_dsc")]')[0]
    return markup + str(html.tostring(easy_block), encoding="utf-8")
def go(self, url):
    """Fetch a listing page and return its products, paging and shop info.

    The result starts from default paging values, which are then
    overridden by whatever ``self.__extract_pagination`` returns.

    :param url: address of the listing page to fetch via ``get_html``.
    :return: dict with ``total``, ``current_page``, ``last_page``,
        ``per_page``, ``data`` (product entries) and ``shop``.
    """
    dom = html.fromstring(get_html(url))

    # Defaults used for any paging key the pagination extractor omits.
    listing = dict(
        total=0,
        current_page=1,
        last_page=1,
        per_page=20,
        data=[],
    )
    listing.update(self.__extract_pagination(dom))

    listing['data'] = self.__extract_product_info(dom)
    listing['shop'] = self.__extract_shop_info(dom)
    return listing