Exemples Python de get_html (app.spiders.spider.get_html)

Exemple #1

0

Afficher le fichier

    def go(self, url):
        """Download a product page and return its extracted fields.

        The page is fetched with ``get_html(url, headers)`` and first handed
        to ``self.__filter``. When that result's ``'errcode'`` is not 0 the
        filter result itself is returned and nothing else is extracted.
        Otherwise the markup is parsed and the ``__extract_*`` helpers fill in
        the product mapping, which is then returned.
        """
        page_source = get_html(url, headers)

        # Filter out products that do not fit the Yiwugo pricing scheme.
        verdict = self.__filter(page_source)
        if verdict['errcode'] != 0:
            return verdict

        tree = html.fromstring(page_source)
        product = self.__extract_base_and_sku(tree)

        # Scrape tiered ("ladder") prices; only done for non-SKU offers.
        # Note the flags are the strings 'true' / 'false', not booleans.
        if product['isSKUOffer'] == 'false':
            tiers = self.__extract_price_range(tree)
            if len(tiers) > 0:
                product['isRangePriceSku'] = 'true'
                bookable = self.__extract_can_book_count_based_on_price_range(
                    tree)
                sku = dict(
                    priceRange=tiers,
                    skuProps=[],
                    canBookCount=str(bookable),
                )
                product['sku'] = sku

        # Remaining fields, filled in the same order as they are listed.
        remaining_fields = (
            ('title', self.__extract_title),
            ('images', self.__extract_images),
            ('attributes', self.__extract_attributes),
            ('description', self.__extract_description),
        )
        for field_name, extractor in remaining_fields:
            product[field_name] = extractor(tree)

        return product

Exemple #2

0

Afficher le fichier

    def go(self, url):
        """Fetch ``url`` and return its categories and shop information.

        Returns a dict with two keys: ``'data'`` holds the result of
        ``self.__extract_categories`` and ``'shop'`` the result of
        ``self.__extract_shop_info``, both computed from the parsed page.
        """
        tree = html.fromstring(get_html(url))

        # Categories are extracted before the shop info, as in the dict order.
        categories = self.__extract_categories(tree)
        shop_info = self.__extract_shop_info(tree)

        return {'data': categories, 'shop': shop_info}

Exemple #3

0

Afficher le fichier

 def __extract_description(self, tree):
     """Return the product description markup as a string.

     The description is loaded from the URL stored in the ``data-tfs-url``
     attribute of the ``desc-lazyload-container`` div. If the cleaned-up
     response is empty, the serialized ``offerdetail_easyoffer_dsc`` div
     from ``tree`` is returned instead.

     Raises:
         IndexError: if the div needed by the path taken is not in ``tree``.
     """
     lazy_container = tree.xpath(
         '//div[@id="desc-lazyload-container"]')[0]
     raw_response = get_html(lazy_container.attrib['data-tfs-url'], headers)

     # Drop the first 30 and last 3 characters, then every backslash.
     # NOTE(review): presumably this strips a JavaScript wrapper around the
     # escaped HTML payload -- confirm against a real response.
     unwrapped = raw_response[30:-3].replace('\\', '')

     # Neutralise every link target in the description.
     description = re.sub('href[^>]+', 'href="#none"', unwrapped)
     if description:
         return description

     # Nothing came back from the lazy-load URL: use the inline description.
     inline_nodes = tree.xpath(
         '//div[contains(@class, "offerdetail_easyoffer_dsc")]')
     return str(html.tostring(inline_nodes[0]), encoding="utf-8")

Exemple #4

0

Afficher le fichier

Fichier : products.py Projet : qq809911125/1688-Crawler

    def go(self, url):
        """Fetch a product listing page and return it as a paginated dict.

        The result starts from default pagination values (``total`` 0,
        ``current_page`` 1, ``last_page`` 1, ``per_page`` 20, empty ``data``),
        is updated with whatever ``self.__extract_pagination`` returns, and
        then gets ``'data'`` and ``'shop'`` set from the product and shop
        extractors.
        """
        tree = html.fromstring(get_html(url))

        # Defaults first, so the pagination extractor only has to supply
        # the values it actually found.
        products = dict(
            total=0,
            current_page=1,
            last_page=1,
            per_page=20,
            data=[],
        )
        pagination = self.__extract_pagination(tree)
        products.update(pagination)

        # Assigned after the update, so these always win over pagination keys.
        products['data'] = self.__extract_product_info(tree)
        products['shop'] = self.__extract_shop_info(tree)

        return products