def parse(self, response):
    """Crawl Darty: follow pagination, enqueue product detail pages, and
    build a ``Product`` item from each product page.

    :param response: scrapy Response for a list page or a product page.
    :yields: ``Request`` objects (pagination / products) and ``Product`` items.
    """
    # Yield list pages.
    x_pagination = response.xpath(
        '//body[@id="darty_liste_produit"]//div[@id="main_pagination_top"]/div['
        + u.x_class('darty_product_list_pages_list') + ']')
    if x_pagination:
        url_next_page = x_pagination.xpath(
            './a[text()=" Page suivante"][last()]/@href').extract_first()
        if url_next_page is not None:
            yield Request(self.base_url + url_next_page.strip(),
                          callback=self.parse)

    # Yield product pages.
    x_list = response.xpath(
        '//body[@id="darty_liste_produit"]//div[@id="main_products_list"]')
    if x_list:
        urls = x_list.xpath('.//div[' + u.x_class('infos_container')
                            + ']/h2/a/@href').extract()
        for url in urls:
            url = self.base_url + url.strip()
            open_ssl_hash = u.generate_open_ssl_hash(url)
            # Deduplicate on the URL hash so each product is fetched once.
            if open_ssl_hash not in self.already_crawled:
                self.already_crawled.append(open_ssl_hash)
                yield Request(url, callback=self.parse)

    # Yield product.
    x_product = response.xpath('//body[@id="page_product"]')
    if x_product:
        item = Product()
        # Categories
        x_categories = response.xpath('//ul[@id="dartyCom_fil_ariane"]')
        main_category = x_categories.xpath('./li[2]/a/text()').extract_first()
        if main_category is not None:
            main_category = main_category.strip()
        categories = x_categories.xpath(
            './li[position() >= 3 and position() < last()]/a/text()').extract()
        if categories:
            categories = [category.strip() for category in categories]
        # Brand
        brand = response.xpath(
            '//a[@id="darty_product_brand"]/text()').extract_first()
        if brand is not None:
            brand = brand.strip()
        # Name: join all text nodes, drop newlines and collapse spaces.
        name = re.sub(
            ' +', ' ',
            ''.join(response.xpath(
                '//h1[' + u.x_class('product_head') + ']//div['
                + u.x_class('product_name')
                + ']/span//text()').extract()).replace('\n', '').replace(
                    '\r', '').strip())
        # Price
        price, price_old, currency = p.get_darty_prices(response)
        # Image
        src = response.xpath(
            '//div[' + u.x_class('darty_product_picture_main_pic_container')
            + ']/div[1]//img/@src').extract_first()
        if src is not None:
            src = src.strip()
        # Avis (reviews).
        # NOTE(review): the '//meta...' xpaths below are absolute, so they
        # ignore the x_avis context node — kept as-is to preserve behavior.
        x_avis = response.xpath('//div[' + u.x_class('bloc_reviews_resume')
                                + ']')
        rate = x_avis.xpath(
            '//meta[@itemprop="ratingValue"]/@content').extract_first()
        if rate is not None:
            rate = u.string_to_float(rate.strip())
        max_rate = x_avis.xpath('//div[' + u.x_class('bloc_reviews_note')
                                + ']/sub/text()').extract_first()
        if max_rate is not None:
            # BUGFIX: remove non-digits with '' (was ' '): values such as
            # "1 234" used to become "1 234", making int() raise ValueError.
            max_rate = int(re.sub(r'\D', '', max_rate.strip()))
        nb_avis = x_avis.xpath(
            '//meta[@itemprop="ratingCount"]/@content').extract_first()
        if nb_avis is not None:
            nb_avis = int(re.sub(r'\D', '', nb_avis.strip()))
        item['store'] = self.name
        item['url'] = response.url
        item['main_category'] = main_category
        item['categories'] = categories
        item['brand'] = brand
        item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
        item['name'] = name
        item['price_old'] = price_old
        item['price'] = price
        item['currency'] = currency
        item["image_urls"] = [src]
        item["image_name"] = item['openssl_hash']
        item["rate"] = rate
        item["max_rate"] = max_rate
        item["nb_avis"] = nb_avis
        item["price_history"] = [{
            'date': time.strftime("%Y/%m/%d"),
            'price_old': price_old,
            'price': price,
            'currency': currency
        }]
        yield item
def parse(self, response):
    """Crawl Fnac: follow pagination, enqueue product detail pages, and
    build a ``Product`` item from each product page.

    :param response: scrapy Response for a list page or a product page.
    :yields: ``Request`` objects (pagination / products) and ``Product`` items.
    """
    # Yield list pages.
    x_pagination = response.xpath('//ul[' + u.x_class('Pager bottom-toolbar')
                                  + ']')
    if x_pagination:
        url_next_page = x_pagination.xpath(
            './/a[' + u.x_class('prevnext actionNext')
            + ']/@href').extract_first()
        if url_next_page is not None:
            yield Request(url_next_page, callback=self.parse)

    # Yield product pages.
    x_list = response.xpath('//ul[' + u.x_class('articleList') + ']')
    if x_list:
        urls = x_list.xpath('.//p[' + u.x_class('Article-desc')
                            + ']/a/@href').extract()
        for url in urls:
            open_ssl_hash = u.generate_open_ssl_hash(url)
            # Deduplicate on the URL hash so each product is fetched once.
            if open_ssl_hash not in self.already_crawled:
                self.already_crawled.append(open_ssl_hash)
                yield Request(url, callback=self.parse)

    # Yield product.
    x_product = response.xpath('//div[' + u.x_class('f-productPage') + ']')
    if x_product:
        item = Product()
        # Categories
        x_categories = response.xpath('//ul[' + u.x_class('f-breadcrumb')
                                      + ']')
        main_category = x_categories.xpath('./li[2]/a/text()').extract_first()
        if main_category is not None:
            main_category = main_category.strip()
        categories = x_categories.xpath(
            './li[position() >= 3]/a/text()').extract()
        if categories:
            categories = [category.strip() for category in categories]
        # Name
        # BUGFIX: extract_first() may return None; the original called
        # .strip() on it unconditionally and could raise AttributeError.
        name = response.xpath('//h1[' + u.x_class('f-productHeader-Title')
                              + ']/text()').extract_first()
        if name is not None:
            name = name.strip()
        # Price
        price, price_old, currency = p.get_fnac_prices(response)
        # Image
        src = response.xpath('//img[' + u.x_class('f-productVisuals-mainMedia')
                             + ']/@src').extract_first()
        if src is not None:
            src = src.strip()
        # Avis (reviews).
        x_avis = response.xpath('//div[' + u.x_class('f-review-header') + ']')
        rate = x_avis.xpath('.//div[' + u.x_class('f-review-headerRate')
                            + ']/text()').extract_first()
        if rate is not None:
            rate = u.string_to_float(rate.strip())
        max_rate = x_avis.xpath('.//span['
                                + u.x_class('f-review-headerRateTotal')
                                + ']/text()').extract_first()
        if max_rate is not None:
            max_rate = u.string_to_float(max_rate.strip().replace("/", ""))
        nb_avis = response.xpath(
            '//div[' + u.x_class('f-productHeader-review') + ']//span['
            + u.x_class('f-productHeader-reviewLabel')
            + ']/text()').extract_first()
        if nb_avis is not None:
            # Raw regex string avoids the invalid-escape-sequence warning.
            nb_avis = u.string_to_float(re.sub(r"\D", "", nb_avis.strip()))
        item['store'] = self.name
        item['url'] = response.url
        item['main_category'] = main_category
        item['categories'] = categories
        item['brand'] = None
        item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
        item['name'] = name
        item['price_old'] = price_old
        item['price'] = price
        item['currency'] = currency
        item["image_urls"] = [src]
        item["image_name"] = item['openssl_hash']
        item["rate"] = rate
        item["max_rate"] = max_rate
        item["nb_avis"] = nb_avis
        item["price_history"] = [{
            'date': time.strftime("%Y/%m/%d"),
            'price_old': price_old,
            'price': price,
            'currency': currency
        }]
        # Fall back to the bundled default image when the store serves its
        # generic "no image" placeholder.
        if src == self.src_no_image:
            copyfile(
                "data/default.jpg",
                "data/" + self.name + "/img/" + item["image_name"] + ".jpg")
        yield item
def parse(self, response):
    """Crawl Auchan: follow pagination, enqueue product detail pages, and
    build a ``Product`` item from each product page.

    :param response: scrapy Response for a list page or a product page.
    :yields: ``Request`` objects (pagination / products) and ``Product`` items.
    """
    # Yield list pages.
    x_pagination = response.xpath('//nav[' + u.x_class('ui-pagination') + ']')
    if x_pagination:
        url_next_page = x_pagination.xpath(
            './/a[' + u.x_class('ui-pagination--next')
            + ']/@href').extract_first()
        if url_next_page is not None:
            yield Request(self.base_url + url_next_page.strip(),
                          callback=self.parse)

    # Yield product pages.
    x_list = response.xpath('//div[' + u.x_class('product-list--container')
                            + ']')
    if x_list:
        urls = x_list.xpath('.//div[' + u.x_class('product-item--wrapper')
                            + ']/a/@href').extract()
        for url in urls:
            url = self.base_url + url.strip()
            open_ssl_hash = u.generate_open_ssl_hash(url)
            # Deduplicate on the URL hash so each product is fetched once.
            if open_ssl_hash not in self.already_crawled:
                self.already_crawled.append(open_ssl_hash)
                yield Request(url, callback=self.parse)

    # Yield product.
    x_product = response.xpath('//div[' + u.x_class('product-detail') + ']')
    if x_product:
        item = Product()
        # Categories
        x_categories = response.xpath(
            '//div[' + u.x_class('ui-breadcrumb--scroller') + ']/nav')
        main_category = x_categories.xpath(
            './span[2]/meta[@itemprop="name"]/@content').extract_first()
        if main_category is not None:
            main_category = main_category.strip()
        categories = x_categories.xpath(
            './span[position() >= 3 and position() < last()]'
            '/meta[@itemprop="name"]/@content').extract()
        if categories:
            categories = [category.strip() for category in categories]
        # Brand
        x_brand_name = response.xpath(
            '//div[' + u.x_class('product-detail--wrapper') + ']')
        brand = x_brand_name.xpath(
            './meta[@itemprop="brand"]/@content').extract_first()
        if brand is not None:
            brand = brand.strip()
        # Name
        # BUGFIX: extract_first() may return None; the original chained
        # .replace()/.strip() on it unconditionally and could crash.
        name = x_brand_name.xpath('./h1[' + u.x_class('product-detail--title')
                                  + ']/text()').extract_first()
        if name is not None:
            name = name.replace('\n', '').replace('\r', '').strip()
        # Price
        price, price_old, currency = p.get_auchan_prices(response)
        # Image
        src = response.xpath('//div[' + u.x_class('x-scroller')
                             + ']/label[1]//img/@src').extract_first()
        if src is not None:
            src = src.strip()
        # Avis (reviews).
        x_avis = response.xpath('//div[' + u.x_class('product-detail--rating')
                                + ']')
        rate = x_avis.xpath(
            './/meta[@itemprop="ratingValue"]/@content').extract_first()
        if rate is not None:
            rate = u.string_to_float(rate.strip())
        nb_avis = x_avis.xpath(
            './/meta[@itemprop="reviewCount"]/@content').extract_first()
        if nb_avis is not None:
            nb_avis = int(nb_avis.strip())
        # The maximum rate is the number of star icons rendered in the
        # rating background.
        max_rate = x_avis.xpath(
            './/span[' + u.x_class('ui-rating--background') + ']/i['
            + u.x_class('icon-auchan-82') + ']').extract()
        max_rate = len(max_rate) if max_rate else None
        item['store'] = self.name
        item['url'] = response.url
        item['main_category'] = main_category
        item['categories'] = categories
        item['brand'] = brand
        item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
        item['name'] = name
        item['price_old'] = price_old
        item['price'] = price
        item['currency'] = currency
        item["image_urls"] = [src]
        item["image_name"] = item['openssl_hash']
        item["rate"] = rate
        item["max_rate"] = max_rate
        item["nb_avis"] = nb_avis
        item["price_history"] = [{
            'date': time.strftime("%Y/%m/%d"),
            'price_old': price_old,
            'price': price,
            'currency': currency
        }]
        yield item
def parse(self, response):
    """Crawl Cdiscount: enqueue every numbered list page once, enqueue
    product detail pages (capped at 300), and build a ``Product`` item from
    each product page.

    :param response: scrapy Response for a list page or a product page.
    :yields: ``Request`` objects (pagination / products) and ``Product`` items.
    """
    # Yield list pages: only on the first response, enqueue every numbered
    # page by rewriting the URL ("...X.html" -> "...X-<n>.html").
    x_pagination = response.xpath('//ul[@id="PaginationForm_ul"]')
    if x_pagination and self.first:
        self.first = False
        nb_page = x_pagination.xpath('./li[last()]/a/text()').extract_first()
        if nb_page is not None:
            for x in range(1, int(nb_page.strip())):
                yield Request(response.url[:-5] + "-" + str(x)
                              + response.url[-5:], callback=self.parse)

    # Yield product pages (hard cap of 300 products per crawl).
    x_list = response.xpath('//ul[@id="lpBloc"]')
    if x_list:
        urls = x_list.xpath('.//div[' + u.x_class('prdtBILDetails')
                            + ']/a/@href').extract()
        for url in urls:
            url = url.strip()
            open_ssl_hash = u.generate_open_ssl_hash(url)
            if (open_ssl_hash not in self.already_crawled
                    and self.nb_crawled < 300):
                self.nb_crawled += 1
                self.already_crawled.append(open_ssl_hash)
                yield Request(url, callback=self.parse)

    # Yield product.
    x_product = response.xpath('//h1[@itemprop="name"]')
    if x_product:
        item = Product()
        # Categories
        x_categories = response.xpath('//div[@id="bc"]')
        main_category = x_categories.xpath(
            './/li[3]//span/text()').extract_first()
        if main_category is not None:
            main_category = main_category.strip()
        categories = x_categories.xpath(
            './/li[position() >= 4 and position() < last()]'
            '//span/text()').extract()
        if categories:
            categories = [category.strip() for category in categories]
        # Brand
        brand = response.xpath(
            '//table[' + u.x_class('fpDescTb fpDescTbPub')
            + ']//span[@itemprop="brand"]//span[@itemprop="name"]'
            '/text()').extract_first()
        if brand is not None:
            brand = brand.strip()
        # Name
        # BUGFIX: the h1 may have no direct text node; extract_first() then
        # returns None and the original .strip() raised AttributeError.
        name = x_product.xpath('./text()').extract_first()
        if name is not None:
            name = re.sub(' +', ' ', name.strip())
        # Price
        price, price_old, currency = p.get_cdiscount_prices(response)
        # Image
        src = response.xpath('//div[' + u.x_class('fpMainImg')
                             + ']/a[@itemprop="image"]/@href').extract_first()
        if src is not None:
            src = src.strip()
        # Avis (reviews).
        # NOTE(review): the '//span...' xpaths below are absolute, so they
        # ignore the x_avis context node — kept as-is to preserve behavior.
        x_avis = response.xpath('//div[' + u.x_class('topMainRating') + ']')
        rate = x_avis.xpath(
            '//span[@itemprop="ratingValue"]/text()').extract_first()
        if rate is not None:
            rate = u.string_to_float(rate.strip())
        nb_avis = x_avis.xpath(
            '//span[@itemprop="ratingCount"]/text()').extract_first()
        if nb_avis is not None:
            nb_avis = int(nb_avis.strip())
        item['store'] = self.name
        item['url'] = response.url
        item['main_category'] = main_category
        item['categories'] = categories
        item['brand'] = brand
        item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
        item['name'] = name
        item['price_old'] = price_old
        item['price'] = price
        item['currency'] = currency
        item["image_urls"] = [src]
        item["image_name"] = item['openssl_hash']
        item["rate"] = rate
        # Cdiscount rates are always out of 5.
        item["max_rate"] = 5
        item["nb_avis"] = nb_avis
        item["price_history"] = [{
            'date': time.strftime("%Y/%m/%d"),
            'price_old': price_old,
            'price': price,
            'currency': currency
        }]
        yield item
def parse(self, response):
    """Crawl Boulanger: follow pagination, enqueue product detail pages,
    and build a ``Product`` item from each product page.

    :param response: scrapy Response for a list page or a product page.
    :yields: ``Request`` objects (pagination / products) and ``Product`` items.
    """
    # Yield list pages.
    x_pagination = response.xpath('//div[' + u.x_class('navigationListe')
                                  + ']')
    if x_pagination:
        url_next_page = x_pagination.xpath(
            './/span[' + u.x_class('navPage navPage-right')
            + ']/a/@href').extract_first()
        if url_next_page is not None:
            yield Request(self.base_url + url_next_page.strip(),
                          callback=self.parse)

    # Yield product pages.
    x_list = response.xpath('//div[' + u.x_class('productListe') + ']')
    if x_list:
        urls = x_list.xpath('.//div[' + u.x_class('designations')
                            + ']/h2/a/@href').extract()
        for url in urls:
            url = self.base_url + url.strip()
            open_ssl_hash = u.generate_open_ssl_hash(url)
            # Deduplicate on the URL hash so each product is fetched once.
            if open_ssl_hash not in self.already_crawled:
                self.already_crawled.append(open_ssl_hash)
                yield Request(url, callback=self.parse)

    # Yield product.
    x_product = response.xpath('//h1[@itemprop="name"]')
    if x_product:
        item = Product()
        # Categories
        x_categories = response.xpath('//div[@id="filAriane"]')
        main_category = x_categories.xpath(
            './/li[2]//a/text()').extract_first()
        if main_category is not None:
            main_category = main_category.strip()
        categories = x_categories.xpath(
            './/li[position() >= 3 and position() <= last()]//a/text()'
        ).extract()
        if categories:
            categories = [category.strip() for category in categories]
        # Name: join all text nodes, drop newlines and collapse spaces.
        name = re.sub(
            ' +', ' ',
            ''.join(x_product.xpath('./text()').extract()).replace(
                '\n', '').replace('\r', '').strip())
        # Price
        price, price_old, currency = p.get_boulanger_prices(response)
        # Image: built from the EAN (gtin13) via the scene7 CDN.
        src = response.xpath(
            '//span[@itemprop="gtin13"]/text()').extract_first()
        if src is not None:
            src = ("https://boulanger.scene7.com/is/image/Boulanger/"
                   + src.strip() + "_h_f_l_0")
        # Avis (reviews): the rate is encoded in the element's class
        # attribute digits (e.g. "...45" means 4,5).
        x_avis = response.xpath('//div[' + u.x_class('top') + ']/div['
                                + u.x_class('right') + ']//span['
                                + u.x_class('rating') + ']')
        rate = x_avis.xpath('./@class').extract_first()
        if rate is not None:
            rate = re.sub(r'\D', '', rate.strip())
            # BUGFIX: also treat an empty digit string as "no rating" — the
            # original passed '' straight to u.string_to_float().
            if rate and rate != "0":
                if len(rate) > 1:
                    # Re-insert the decimal separator: "45" -> "4,5".
                    rate = rate[:1] + "," + rate[1:]
                rate = u.string_to_float(rate)
            else:
                rate = None
        nb_avis = x_avis.xpath('./span[' + u.x_class('link')
                               + ']/text()').extract_first()
        if nb_avis is not None:
            # BUGFIX: guard against a text node containing no digits, which
            # would make int('') raise ValueError.
            digits = re.sub(r'\D', '', nb_avis.strip())
            nb_avis = int(digits) if digits else None
        item['store'] = self.name
        item['url'] = response.url
        item['main_category'] = main_category
        item['categories'] = categories
        item['brand'] = None
        item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
        item['name'] = name
        item['price_old'] = price_old
        item['price'] = price
        item['currency'] = currency
        item["image_urls"] = [src]
        item["image_name"] = item['openssl_hash']
        item["rate"] = rate
        # Boulanger rates are always out of 5.
        item["max_rate"] = 5
        item["nb_avis"] = nb_avis
        item["price_history"] = [{
            'date': time.strftime("%Y/%m/%d"),
            'price_old': price_old,
            'price': price,
            'currency': currency
        }]
        yield item
def parse(self, response):
    """Crawl Materiel.net: follow pagination, enqueue product detail pages,
    and build a ``Product`` item from each product page.

    :param response: scrapy Response for a list page or a product page.
    :yields: ``Request`` objects (pagination / products) and ``Product`` items.
    """
    # Yield list pages.
    x_pagination = response.xpath('//ul['
                                  + u.x_class('pagination pagination-sm')
                                  + ']')
    if x_pagination:
        # The "next" link is the last <li>; on some pages it is the one
        # before last instead.
        url_next_page = x_pagination.xpath(
            './li[position() = last()]/a/@href').extract_first()
        if url_next_page is None:
            url_next_page = x_pagination.xpath(
                './li[position() = (last() - 1)]/a/@href').extract_first()
        if url_next_page is not None:
            # Consistency fix: strip the href like the sibling spiders do.
            yield Request(self.base_url + url_next_page.strip(),
                          callback=self.parse)

    # Yield product pages.
    x_list = response.xpath('//table[' + u.x_class('ProdList') + ']')
    if x_list:
        urls = x_list.xpath('.//td[' + u.x_class('Photo')
                            + ']/span/@data-href').extract()
        for url in urls:
            url = self.base_url + url.strip()
            open_ssl_hash = u.generate_open_ssl_hash(url)
            # Deduplicate on the URL hash so each product is fetched once.
            if open_ssl_hash not in self.already_crawled:
                self.already_crawled.append(open_ssl_hash)
                yield Request(url, callback=self.parse)

    # Yield product.
    x_product = response.xpath('//div[@id="prod"]')
    if x_product:
        item = Product()
        # Categories (breadcrumb entry 2 is the brand, handled below; the
        # main category is hard-coded for this store).
        x_categories = response.xpath('//nav[@id="breadcrumb"]')
        categories = x_categories.xpath(
            './/li[position() >= 3 and position() < last()]/a/text()'
        ).extract()
        if categories:
            categories = [category.strip() for category in categories]
        # Brand
        brand = x_categories.xpath('.//li[2]/a/text()').extract_first()
        if brand is not None:
            brand = brand.strip()
        # Name: join all text nodes, drop newlines and collapse spaces.
        name = re.sub(
            ' +', ' ',
            ''.join(response.xpath(
                '//h1[@id="ProdTitle"]//text()').extract()).replace(
                    '\n', '').replace('\r', '').strip())
        # Price
        price, price_old, currency = p.get_materiel_net_prices(response)
        # Image: prefer the slider zoom image, fall back to the main
        # container's zoom image.
        src = response.xpath('//div[' + u.x_class('swiper-wrapper')
                             + ']//a/@data-zoom-image').extract_first()
        if src is None:
            src = response.xpath(
                '//div[@id="container-image"]/@data-zoom-image'
            ).extract_first()
        if src is not None:
            src = src.strip()
        # Avis (reviews).
        x_avis = response.xpath('//div[' + u.x_class('headerAvisClients')
                                + ']')
        rate = x_avis.xpath('.//span[' + u.x_class('noteUser')
                            + ']/text()').extract_first()
        if rate is not None:
            rate = u.string_to_float(rate.strip())
        max_rate = x_avis.xpath(
            './/span[' + u.x_class('noteUser')
            + ']/following-sibling::span[1]/text()').extract_first()
        if max_rate is not None:
            max_rate = u.string_to_float(max_rate.strip())
        nb_avis = x_avis.xpath(
            './/span[@id="avisCount"]/span/text()').extract_first()
        if nb_avis is not None:
            nb_avis = int(nb_avis.strip())
        item['store'] = self.name
        item['url'] = response.url
        item['main_category'] = "Informatique"
        item['categories'] = categories
        item['brand'] = brand
        item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
        item['name'] = name
        item['price_old'] = price_old
        item['price'] = price
        item['currency'] = currency
        item["image_urls"] = [src]
        item["image_name"] = item['openssl_hash']
        item["rate"] = rate
        item["max_rate"] = max_rate
        item["nb_avis"] = nb_avis
        item["price_history"] = [{
            'date': time.strftime("%Y/%m/%d"),
            'price_old': price_old,
            'price': price,
            'currency': currency
        }]
        yield item