Example #1
0
    def parse_item(self, response):
        """Build a CrawlbotItem carrying the page URL, header title and
        the concatenated body text of the article."""
        item = CrawlbotItem()
        item['url'] = response.url

        # The first H1 inside the content header is the page title.
        title_texts = response.xpath(
            '//header[contains(@class, "content-header")]/h1/text()').extract()
        item['title'] = title_texts[0].strip()

        # Join every paragraph of the content body into one description.
        paragraphs = response.xpath(
            '//div[@class="content-text"]//p/text()').extract()
        item['description'] = " ".join(paragraphs)

        return item
Example #2
0
    def parse_detail(self, response):
        """Parse a product detail page: collect images, name, price and
        sizes, write product/combination rows to the site's CSV file and
        yield an item for the image pipeline.
        """
        # Prefer the Amasty gallery links; fall back to the zoom image.
        # BUG FIX: extract() always returns a list, never None, so the old
        # `images is None` test was dead; truthiness covers the empty case.
        images = response.css('#amasty_gallery a::attr(href)').extract()
        if not images:
            images = response.css(
                'div.product-image img::attr(data-zoom-image)').extract()

        name = response.css('div.product-name h1::text').extract()[0]

        loader = ItemLoader(item=CrawlbotItem(), selector=images)
        loader.add_value('image_urls', images)

        # Price text ends with a 2-char currency marker; strip it and the
        # thousands separators ("123.456 x" -> "123456").
        price_tex = response.css('div.price-box span span::text').extract_first()
        if price_tex is not None:
            price_tex = price_tex[:-2].replace('.', '')

        # One <li> per available size swatch.
        sizes = [li_size.css('a::attr(title)').extract()[0]
                 for li_size in response.css('#configurable_swatch_size li')]

        product_item = ProductItem()
        product_item.product['name'] = name
        product_item.product['price_tex'] = price_tex
        product_item.product['manufacturer'] = CRAWLING_SITES[
            self.start_urls[0]]['brand']

        # Mirror the path the image pipeline will use:
        # <image_dir>full/<sha1 of source url>.jpg
        ima_url = loader._values['image_urls']
        ima_url = [
            CRAWLING_SITES[self.start_urls[0]]['image_dir'] + 'full/' +
            hashlib.sha1(i.encode('utf-8')).hexdigest() + '.jpg'
            for i in ima_url
        ]
        product_item.product['image'] = ','.join(ima_url)
        product_item.set_alt_image()
        product_item.product['product_url'] = response.request.url
        product_item.product['categories_url'] = response.meta['root_url']
        product_item.product['category'] = response.meta['root_name']

        # Write product to csv file only when the image path is local
        # (starts with 'images').
        if product_item.product['image'].find('images') == 0:
            product_item.write_to_csv(
                CRAWLING_SITES[self.start_urls[0]]['data_file'])

        # Set a combination: one CSV row per available size.
        if sizes:
            combination_item = CombinationItem()

            for element in sizes:
                size = element.split('/')[0].strip()
                combination_item.set_attribute(size)
                # BUG FIX: set_wholesale_price(price_tex) was called twice in
                # a row with the same value (copy-paste); once is enough.
                combination_item.set_wholesale_price(price_tex)
                combination_item.set_product_reference()
                # NOTE(review): 'Size:12' looks like a hard-coded attribute
                # group id shared by every spider in this file — confirm.
                combination_item.combination['group'] = 'Size:12'

                # write combination to csv file
                if size is not None:
                    combination_item.write_to_csv(
                        CRAWLING_SITES[self.start_urls[0]]['data_file'])

        # Yield the loaded item so the images pipeline saves the files.
        yield loader.load_item()
    def parse_detail(self, response):
        """Parse a product detail page: collect gallery images, name, price
        and sizes, write product/combination CSV rows, and yield an item
        for the image pipeline."""
        # set image_url and call pipeline; second swiper-wrapper holds the
        # gallery links, resolved to absolute URLs before loading.
        image = response.css('div.swiper-wrapper')[1].css(
            'a::attr(href)').extract()
        image = [response.urljoin(url) for url in image]
        loader = ItemLoader(item=CrawlbotItem(), selector=image)
        loader.add_value('image_urls', image)

        # get value for fields from web; the product name is split across
        # an <h3> and a sibling text node on this site.
        name_p1 = response.css('div.col-sm-12 h3::text').extract_first()
        name_p2 = response.css(
            'div.row div.col-sm-12::text').extract()[2].strip()
        name = name_p1 + name_p2

        price_tex = response.css('div.col-sm-12.price::text').extract_first()
        reduction_from = response.css(
            'div.col-sm-12.price-line-through::text').extract_first()
        description = response.css(
            '#content-wrapper > section > div:nth-child(3) > div > div.col-sm-7 > div:nth-child(4) > div p::text'
        ).extract_first()

        # set product
        product_item = ProductItem()
        product_item.product["name"] = name
        if description is not None:
            product_item.product["description"] = description

        # Strip the trailing currency character and the thousands
        # separators, e.g. "1.234.000x" -> "1234000".
        if price_tex is not None and len(price_tex) > 3:
            price_tex = price_tex.strip()[0:-1].replace('.', '')
            product_item.product["price_tex"] = price_tex

        # The struck-through price is the pre-discount price.
        if reduction_from is not None:
            product_item.product['reduction_from'] = reduction_from.strip(
            )[0:-1].replace('.', '')
            product_item.set_reduction_price()

        product_item.product['manufacturer'] = CRAWLING_SITES[
            self.start_urls[0]]['brand']
        # Mirror the image pipeline's storage path:
        # <image_dir>full<sha1 of source url>.jpg
        # NOTE(review): sibling parsers use 'full/' with a slash here —
        # confirm which form matches the pipeline's actual output.
        ima_url = loader._values['image_urls']
        ima_url = [
            CRAWLING_SITES[self.start_urls[0]]['image_dir'] + 'full' +
            hashlib.sha1(i.encode('utf-8')).hexdigest() + '.jpg'
            for i in ima_url
        ]
        product_item.product['image'] = ','.join(i for i in ima_url)
        product_item.set_alt_image()
        product_item.product['product_url'] = response.request.url
        product_item.product['categories_url'] = response.meta['root_url']
        product_item.product['category'] = response.meta['root_name']

        # One size label per attribute swatch in the first attribute block.
        sizes = response.css('div.col-xs-10.attribut-wrapper')[0].css(
            'label a::text').extract()

        # # write product to csv file (only when the image path is local,
        # i.e. starts with 'images')
        if product_item.product['image'].find('images') == 0:
            product_item.write_to_csv(
                CRAWLING_SITES[self.start_urls[0]]['data_file'])

        # set a combination: one CSV row per available size.
        if sizes is not None and len(sizes) > 0:
            combination_item = CombinationItem()

            for size in sizes:
                combination_item.set_attribute(size)
                # NOTE(review): price_tex is still None here when the price
                # node was missing — confirm set_wholesale_price(None) is OK.
                combination_item.set_wholesale_price(price_tex)
                combination_item.set_product_reference()
                # NOTE(review): 'Size:12' looks like a hard-coded attribute
                # group id shared by every spider in this file — confirm.
                combination_item.combination['group'] = 'Size:12'

                #write combination to csv file
                if size is not None:
                    combination_item.write_to_csv(
                        CRAWLING_SITES[self.start_urls[0]]['data_file'])

        # # save images: yielding the loaded item triggers the pipeline.
        yield loader.load_item()
Example #4
0
    def parse_detail(self, response):
        """Parse a product detail page: collect images, name, description,
        prices and size/cost options, write product/combination CSV rows
        and yield an item for the image pipeline.
        """
        # set image_url and call pipeline; data-image holds protocol-less
        # URLs ("//cdn/...") so prefix the scheme.
        image = response.css('#surround > div a::attr(data-image)').extract()
        image = ['https:' + i for i in image]
        loader = ItemLoader(item=CrawlbotItem(), selector=image)
        loader.add_value('image_urls', image)

        # get value for fields from web
        name = response.css('div.product-title h1::text').extract_first()
        description_p1 = response.css('#tab_one > div > p::text').extract()
        description_p2 = response.css('#tab_one > div::text').extract()
        description = description_p1 + description_p2
        price_tex = response.css(
            'div.product-price span::text').extract_first()
        reduction_from = response.css(
            'div.product-price del::text').extract_first()
        # Each <option> reads like "SIZE / COLOR - 1,234x".
        sizes_colors_cost = response.css(
            '#product-select > option::text').extract()

        # set product
        product_item = ProductItem()
        product_item.product["name"] = name
        product_item.product["description"] = '\n'.join(description).strip()

        # BUG FIX: extract_first() returns None when the node is missing,
        # so slicing unconditionally crashed. Normalize once here; the
        # combination loop below reuses the cleaned value.
        if price_tex is not None:
            price_tex = price_tex[0:-1].replace(',', '')
            product_item.product["price_tex"] = price_tex

        if reduction_from is not None:
            product_item.product['reduction_from'] = reduction_from[
                0:-1].replace(',', '')
            product_item.set_reduction_price()

        product_item.product['manufacturer'] = CRAWLING_SITES[
            self.start_urls[0]]['brand']
        # Mirror the image pipeline's storage path:
        # <image_dir>full/<sha1 of source url>.jpg
        ima_url = loader._values['image_urls']
        ima_url = [
            CRAWLING_SITES[self.start_urls[0]]['image_dir'] + 'full/' +
            hashlib.sha1(i.encode('utf-8')).hexdigest() + '.jpg'
            for i in ima_url
        ]
        product_item.product['image'] = ','.join(ima_url)
        product_item.set_alt_image()
        product_item.product['product_url'] = response.request.url
        product_item.product['categories_url'] = response.meta['root_url']
        product_item.product['category'] = response.meta['root_name']

        # write product to csv file only when the image path is local
        # (starts with 'images').
        if product_item.product['image'].find('images') == 0:
            product_item.write_to_csv(
                CRAWLING_SITES[self.start_urls[0]]['data_file'])

        # set a combination: one CSV row per size option.
        if sizes_colors_cost:
            combination_item = CombinationItem()

            for element in sizes_colors_cost:
                size = element.split('/')[0].strip()
                combination_item.set_attribute(size)
                # BUG FIX: the original set the wholesale price to the parsed
                # per-option cost and then immediately overwrote it with the
                # product price (and the cost parse raised IndexError on
                # options without a '-'). Keep only the surviving value,
                # matching the sibling parsers.
                combination_item.set_wholesale_price(price_tex)
                combination_item.set_product_reference()
                # NOTE(review): 'Size:12' looks like a hard-coded attribute
                # group id shared by every spider in this file — confirm.
                combination_item.combination['group'] = 'Size:12'

                #write combination to csv file
                if size is not None:
                    combination_item.write_to_csv(
                        CRAWLING_SITES[self.start_urls[0]]['data_file'])

        # save images: yielding the loaded item triggers the pipeline.
        yield loader.load_item()
Example #5
0
    def parse_detail2(self, response):
        """Parse a product detail page: collect gallery thumbnails, name,
        prices and sizes, write product/combination CSV rows and yield an
        item for the image pipeline.
        """
        # get set of image urls (more-views gallery thumbnails)
        image_urls = response.css('#product_addtocart_form > div.product-img-box > div.more-views > ul li a img::attr(src)').extract()
        name = response.css('#product_addtocart_form > div.product-info-right > div.product-shop > div.product-name > span::text').extract_first()

        loader = ItemLoader(CrawlbotItem(), image_urls)
        # src values are protocol-less ("//cdn/..."); prefix the scheme.
        image_urls = ['https:' + i for i in image_urls]

        loader.add_value('image_urls', image_urls)

        # get detail
        sizes = response.css('div.input-box ul li a::attr(title)').extract()
        yield {
            'size': sizes,
        }

        # BUG FIX: the original did css(...)[0].extract(), which raises
        # IndexError when the regular-price node is missing and never yields
        # None — so the discounted-product fallback below was dead code, and
        # the fallback itself called extract_first() on a bare Selector
        # (AttributeError). extract_first() on the SelectorList returns None
        # safely and lets the fallback run.
        oldPrice = response.css('div.price-info div.price-box span.regular-price span.price::text').extract_first()
        newPrice = []
        if oldPrice is None:
            # Discounted products expose old-price / special-price instead.
            oldPrice = response.css('#product_addtocart_form > div.product-info-right > div.product-shop > div.price-info > div > p.old-price span.price::text').extract_first()
            newPrice = response.css('#product_addtocart_form > div.product-info-right > div.product-shop > div.price-info > div > p.special-price span::text').extract_first()
        yield {
            'old': oldPrice,
            'new': newPrice
        }

        productItem = ProductItem()
        productItem.product["name"] = name.replace(',', '')
        # Strip the currency sign and thousands separators.
        productItem.product["price_tex"] = oldPrice.replace('đ', '').replace('.', '')
        productItem.product['manufacturer'] = CRAWLING_SITES[self.start_urls[0]]['brand']
        # Mirror the image pipeline's storage path:
        # <image_dir>full/<sha1 of source url>.jpg
        ima_url = loader._values['image_urls']
        ima_url = [CRAWLING_SITES[self.start_urls[0]]['image_dir'] + 'full/' + hashlib.sha1(i.encode('utf-8')).hexdigest() + '.jpg' for i in ima_url]
        productItem.product['image'] = ','.join(ima_url)
        productItem.set_alt_image()
        # NOTE(review): product_url is filled from meta['root_url'] and
        # categories_url from the request URL — the reverse of every sibling
        # parser in this file. Confirm this swap is intentional.
        productItem.product['product_url'] = ''.join(response.meta['root_url'])
        productItem.product['categories_url'] = response.request.url
        productItem.product['category'] = ''.join(response.meta['root_name'])

        # write product to csv file only when the image path is local
        # (starts with 'images').
        if productItem.product['image'].find('images') == 0:
            productItem.write_to_csv(CRAWLING_SITES[self.start_urls[0]]['data_file'])

        # set a combination: one CSV row per available size.
        if sizes:
            combination_item = CombinationItem()

            for size in sizes:
                combination_item.set_attribute(size)
                combination_item.set_wholesale_price(oldPrice.replace(',', ''))
                combination_item.set_product_reference()
                # NOTE(review): 'Size:12' looks like a hard-coded attribute
                # group id shared by every spider in this file — confirm.
                combination_item.combination['group'] = 'Size:12'

                #write combination to csv file
                if size is not None:
                    combination_item.write_to_csv(CRAWLING_SITES[self.start_urls[0]]['data_file'])

        # save images: yielding the loaded item triggers the pipeline.
        yield loader.load_item()
Example #6
0
    def parse_detail(self, response):
        """Parse a product detail page: collect thumbnail images, name,
        description, prices and sizes, write product/combination CSV rows
        and yield an item for the image pipeline.
        """
        # set image_url and call pipeline
        image = response.css(
            'div.more-views div.product-image-thumbs div a img::attr(src)'
        ).extract()
        loader = ItemLoader(item=CrawlbotItem(), selector=image)
        loader.add_value('image_urls', image)

        # get value for fields from web
        name = response.css(
            'div.product-shop div.product-name span::text').extract_first()

        description_p1 = response.css(
            'div.tab-content div.std p::text').extract()
        description_p2 = response.css('div.tab-content div::text').extract()
        descriptions = description_p2[0:2] + description_p1
        description = ','.join(i.strip() for i in descriptions)

        # All listed prices; for a discounted product the last entry is the
        # current price and the first the pre-discount price.
        price = response.css('div.price-info div span.price::text').extract()

        # set product
        product_item = ProductItem()
        product_item.product["name"] = name
        product_item.product["description"] = description.replace(
            ',', '').replace('\t', '').replace('\r', '')

        # BUG FIX: the original tested price_tex before it was ever assigned
        # (NameError on every call) and then stored the raw `price` LIST in
        # the product. Derive the cleaned string first, then guard on it.
        price_tex = None
        if price:
            # Strip the 2-char currency suffix and thousands separators.
            price_tex = price[-1].strip()[0:-2].replace('.', '')
        if price_tex is not None and len(price_tex) > 3:
            product_item.product["price_tex"] = price_tex
            product_item.product['reduction_from'] = price[0].strip(
            )[0:-2].replace('.', '')
            product_item.set_reduction_price()

        product_item.product['manufacturer'] = CRAWLING_SITES[
            self.start_urls[0]]['brand']
        # Mirror the image pipeline's storage path:
        # <image_dir>full/<sha1 of source url>.jpg
        ima_url = loader._values['image_urls']
        ima_url = [
            CRAWLING_SITES[self.start_urls[0]]['image_dir'] + 'full/' +
            hashlib.sha1(i.encode('utf-8')).hexdigest() + '.jpg'
            for i in ima_url
        ]
        product_item.product['image'] = ','.join(ima_url)
        product_item.set_alt_image()
        product_item.product['product_url'] = response.request.url
        product_item.product['categories_url'] = response.meta['root_url']
        product_item.product['category'] = response.meta['root_name']
        sizes = response.css(
            'dd.clearfix.swatch-attr.last ul.configurable-swatch-list.clearfix li a::attr(title)'
        ).extract()

        # write product to csv file only when the image path is local
        # (starts with 'images').
        if product_item.product['image'].find('images') == 0:
            product_item.write_to_csv(
                CRAWLING_SITES[self.start_urls[0]]['data_file'])

        # set a combination: one CSV row per available size.
        if sizes:
            combination_item = CombinationItem()

            for size in sizes:
                combination_item.set_attribute(size)
                combination_item.set_wholesale_price(price_tex)
                combination_item.set_product_reference()
                # NOTE(review): 'Size:12' looks like a hard-coded attribute
                # group id shared by every spider in this file — confirm.
                combination_item.combination['group'] = 'Size:12'

                #write combination to csv file
                if size is not None:
                    combination_item.write_to_csv(
                        CRAWLING_SITES[self.start_urls[0]]['data_file'])

        # save images: yielding the loaded item triggers the pipeline.
        yield loader.load_item()