Example #1
0
    def parse(self, response):
        """Parse a product-listing page.

        For each product tile, fill a ``Product`` item with name, absolute
        URL and regular/sale prices, then yield a Request for the product
        detail page with the partially-filled item carried in ``meta``.
        """
        products = response.xpath(
            "//div[@class='product-list']/div/div[@class='product-details-container']"
        )
        for prod in products:
            item = Product()

            item['Name'] = prod.xpath(
                ".//h4/span[@itemprop='name']/text()").extract_first().strip()
            item['original_url'] = response.urljoin(
                prod.xpath("a/@href").extract_first())
            # Keep only digits and the decimal point from the displayed
            # price. Raw strings: '\d' in a plain literal is an invalid
            # escape sequence (deprecated in Python 3).
            item['reg_price'] = re.sub(
                r'[^\d\.]', '',
                prod.xpath(
                    ".//span[@itemprop='offers']/s/span[@itemprop='price']/text()"
                ).extract_first().strip())
            item['sale_price'] = re.sub(
                r'[^\d\.]', '',
                prod.xpath(
                    ".//span[@itemprop='offers']/span[@itemprop='price']/text()"
                ).extract_first().strip())
            # NOTE(review): hard-coded ids — presumably keys into an external
            # website/category table; confirm before reuse.
            item['website_id'] = 23
            item['category_id'] = 2

            yield Request(item['original_url'],
                          meta={'item': item},
                          callback=self.parse_detail)
Example #2
0
    def parse(self, response):
        """Scrape product anchors from a listing page and paginate.

        Yields a detail-page Request per product (item in ``meta``) and,
        after the loop, a Request for the next page built from the running
        ``self.baseindex`` offset.
        """
        # Each product is the anchor with class "url" inside a tile's info block.
        products = response.xpath(
            '//div[@class="product-list"]/ul/li//div[@class="info clearfix"]/a[contains(@class, "url")]'
        )
        if len(products) == 0:
            # Empty page: stop paginating.
            return
        # Accumulate the offset used to interpolate the next-page URL below.
        self.baseindex = self.baseindex + len(products)
        for prod in products:
            item = Product()

            item['Name'] = prod.xpath(
                'div[@class="title"]/text()').extract_first().strip()
            item['brand'] = prod.xpath(
                'div[@class="brand"]/text()').extract_first().strip()
            item['original_url'] = response.urljoin(
                prod.xpath('@href').extract_first())
            # Keep digits, dot and comma from the displayed prices.
            item['reg_price'] = re.sub(
                '[^\d\.\,]', '',
                prod.xpath('.//span[@class="retail-price"]/text()').
                extract_first()).strip()
            item['sale_price'] = re.sub(
                '[^\d\.\,]', '',
                prod.xpath(
                    './/span[@class="sale-price"]/span[@class="sale-price-low"]/text()'
                ).extract_first()).strip()
            item['website_id'] = 17
            item['category_id'] = 2

            yield Request(item['original_url'],
                          meta={'item': item},
                          callback=self.parse_detail)
            # break

    # return
        yield Request(self.nextpage_url % self.baseindex, callback=self.parse)
Example #3
0
    def parse(self, response):
        """Three-phase parse: set currency, then request page 1, then scrape.

        Driven by two counters on ``self``: ``usd_set`` (0 until the
        set-currency request has been issued) and ``page_num`` (0 until the
        first listing page has been requested).
        """
        if self.usd_set == 0:
            # First visit: switch the site to USD before reading prices.
            self.usd_set = 1
            yield Request(self.setcurrency_url, callback=self.parse)
            return
        elif self.page_num == 0:
            # Currency set; now request the first listing page.
            self.page_num = 1
            yield Request(self.nextpage_url % self.page_num, callback=self.parse)
            return

        # Product cards on the current listing page.
        products = response.xpath('//div[@id="products"]//div[@class="item first"]/div[@class="details"]/div[@class="details-content"]')
        # print len(products)
        for prod in products:
            item = Product()

            item['Name'] = prod.xpath('h4/a/text()').extract_first().strip()
            item['original_url'] = response.urljoin(prod.xpath('h4/a/@href').extract_first())
            # Keep only digits and the decimal point from the price text.
            item['reg_price'] = re.sub('[^\d\.]', '', prod.xpath('ul[@class="price"]/li[not(@class)]/text()').extract_first()).strip()
            item['sale_price'] = re.sub('[^\d\.]', '', prod.xpath('ul[@class="price"]/li[@class="now"]/text()').extract_first()).strip()
            item['website_id'] = 14
            item['category_id'] = 2

            yield Request(item['original_url'], meta={'item': item}, callback=self.parse_detail)
            # break
        # return

        # Keep paginating as long as the page produced any products.
        if len(products) > 0:
            self.page_num = self.page_num + 1
            yield Request(self.nextpage_url % self.page_num, callback=self.parse)
Example #4
0
    def parse(self, response):
        """Walk the JSON grid endpoint page by page and yield detail requests.

        NOTE(review): this loops over a *blocking* ``requests.get`` inside a
        Scrapy callback (``response`` itself is unused) — it works but stalls
        the reactor; consider yielding Scrapy Requests per page instead.
        """
        # e.g. http://www.swarovski.com/Web_US/en/json/json-result?SearchParameter=%26%40QueryTerm%3D*%26CategoryUUIDLevelX%3DkTUKaSUCyn4AAAEnV9lToUKM%26CategoryUUIDLevelX%252FkTUKaSUCyn4AAAEnV9lToUKM%3DInYKaVgfvWsAAAFaO6s2M2Wp%26CategoryUUIDLevelX%252FkTUKaSUCyn4AAAEnV9lToUKM%252FInYKaVgfvWsAAAFaO6s2M2Wp%3DTxcKaVgfw6MAAAFaOqs2M2Wp%26%40Sort.FFSort%3D0%26%40Page%3D2&PageSize=36&View=M
        page_num = 1
        while 1:
            ajax_url = 'http://www.swarovski.com/Web_US/en/json/json-result?SearchParameter=%26%40QueryTerm%3D*%26CategoryUUIDLevelX%3DkTUKaSUCyn4AAAEnV9lToUKM%26CategoryUUIDLevelX%252FkTUKaSUCyn4AAAEnV9lToUKM%3DInYKaVgfvWsAAAFaO6s2M2Wp%26CategoryUUIDLevelX%252FkTUKaSUCyn4AAAEnV9lToUKM%252FInYKaVgfvWsAAAFaO6s2M2Wp%3DTxcKaVgfw6MAAAFaOqs2M2Wp%26%40Sort.FFSort%3D0%26%40Page%3D' + str(
                page_num) + '&PageSize=36&View=M'
            products = requests.get(ajax_url,
                                    headers={
                                        'X-Requested-With': 'XMLHttpRequest'
                                    }).json()['SearchResult']['Products']
            for prod in products:
                item = Product()

                item['Name'] = prod['Name']
                item['original_url'] = prod['DetailPage']
                # Raw strings for the patterns: '\d' in a plain literal is
                # an invalid escape sequence (deprecated in Python 3).
                item['reg_price'] = re.sub(r'[^\d\.]', '', prod['OldPrice'])
                item['sale_price'] = re.sub(r'[^\d\.]', '', prod['Price'])
                item['website_id'] = 18
                item['category_id'] = 4
                item['discount'] = re.sub(r'[^\d]', '', prod['PricePercent'])

                yield Request(item['original_url'],
                              meta={'item': item},
                              callback=self.parse_detail)
                # break
            # A short page (fewer than the 36-per-page size) means last page.
            if len(products) < 36:
                break
            page_num = page_num + 1
Example #5
0
    def parse(self, response):
        """Scrape Under Armour grid data embedded as JSON in a <script> tag.

        The first page comes from the page's GRID_DATA blob; further pages
        are fetched from the json-grid API with a growing ``offset`` until a
        short (<60) or empty page is returned.
        """
        # Slice the JSON object out of the inline script with a regex, close
        # it with '}' and drill down to the products list. Fragile by design:
        # depends on the exact '"GRID_DATA":...,"navigation' layout.
        products = json.loads(re.search('\"GRID_DATA\"\:(.*)\,[\s]*\"navigation', response.xpath("//script[contains(text(), 'GRID_DATA')]/text()").extract_first().encode("utf-8"), re.M|re.S|re.I).group(1) + '}')["_embedded"]["results"][0]["products"]
        # print len(products)
        offset = 0
        while 1:
            for prod in products:
                item = Product()

                item['Name'] = prod["content"]['shortName']
                item['original_url'] = prod['materials'][0]["_links"]["web:locale"]["href"]
                # Prices arrive as numbers; stringify then strip non-numerals.
                item['reg_price'] = re.sub('[^\d\.]', '', str(prod['priceRange']["msrp"]["min"]))
                item['sale_price'] = re.sub('[^\d\.]', '', str(prod['priceRange']["base"]["min"]))
                item['website_id'] = 22
                item['category_id'] = 2
                item['description'] = prod["content"]['categoryName']
                # Build the scene7 image URL from the first material's asset.
                item['original_image_url'] = ["http://underarmour.scene7.com/is/image/Underarmour/" + prod['materials'][0]["assets"][0]["image"] + "?template=v65GridLarge&$size=599,735&$wid=281&$hei=345&$extend=0,220,0,0"]
                item['image_urls'] = item['original_image_url']

                yield item

            # Page size is 60; a short page means we just saw the last one.
            if len(products) < 60:
                break
            offset = offset + len(products)
            # Blocking fetch of the next page via the json-grid API; the
            # response is prefixed with ")]}'," which the regex skips.
            products = json.loads(re.search('\)\]\}\'\,(.*)\,[\s]*\"navigation', requests.get("https://www.underarmour.com/en-us/api/json-grid/outlet/womens/tops/g/6cl?s=&q=&p=&offset=%d&limit=60&stackId=other_grid_header&stackIdx=0&t[IsNewLoadMoreGrid]=0" % offset, headers={'X-Requested-With': "XMLHttpRequest"}).text.encode("utf-8"), re.M|re.S|re.I).group(1) + '}')["_embedded"]["results"][0]["products"]
            if len(products) == 0:
                break
    def parse(self, response):
        """Scrape Anthropologie product tiles and follow 'next page' links.

        Builds one ``Product`` per tile and yields a Request to the product
        page (``each_detail`` callback), then a Request for the next page if
        a pagination link exists.
        """
        sel = Selector(response)

        # f=open("page_source.html",'w+b')
        # f.write(response.body)

        all_divs = sel.xpath('//div[@class="dom-category-browse"]/div[2]/div')
        print len(all_divs)

        for a in all_divs:
            Name = a.xpath(
                'span[@itemprop="product"]/div[@class="c-product-tile-details c-product-tile-details--regular"]/a/h3/span/text()'
            ).extract()
            description = ""
            reg_price = a.xpath(
                'span[@itemprop="product"]/div[@class="c-product-tile-details c-product-tile-details--regular"]/p[@class="c-product-tile__price c-product-tile__price--regular"]/span/span[@class="c-product-meta__original-price"]/text()'
            ).extract()
            sale_price = a.xpath(
                'span[@itemprop="product"]/div[@class="c-product-tile-details c-product-tile-details--regular"]/p[@class="c-product-tile__price c-product-tile__price--regular"]/span/span[@class="c-product-meta__current-price c-product-meta__current-price--sale"]/text()'
            ).extract()
            website_id = 5
            brand = "Anthropologie"
            original_url = a.xpath(
                'span[@itemprop="product"]/div[@class="c-product-tile-details c-product-tile-details--regular"]/a/@href'
            ).extract()
            # Tile hrefs are relative; prepend the site origin.
            original_url = "https://www.anthropologie.com" + "".join(
                original_url).strip()
            original_image_url = ""
            category_id = 2
            original_image_url = "".join(
                a.xpath(
                    'span[@itemprop="product"]/div[@class="c-product-tile-controls__link-wrap js-product-tile-controls__link-wrap"]/a/img/@src'
                ).extract()).strip()
            # Image src may be protocol-relative (//...); force https.
            if 'https:' not in original_image_url:
                original_image_url = 'https:' + original_image_url

            item = Product()
            item['Name'] = "".join(Name).strip()
            # Prices are displayed with a '$' prefix; strip it.
            item['reg_price'] = "".join(reg_price).strip().replace('$',
                                                                   '').strip()
            item['sale_price'] = "".join(sale_price).strip().replace(
                '$', '').strip()
            item['brand'] = "".join(brand).strip()
            item['original_url'] = original_url
            item['website_id'] = website_id
            item['category_id'] = category_id
            item['original_image_url'] = [original_image_url]

            yield Request(original_url,
                          meta={'item': item},
                          callback=self.each_detail)

            # break

        # Follow the accessible 'next page' link when present.
        next_page = sel.xpath('//a[@aria-label="next page"]/@href').extract()
        if len(next_page) > 0:
            yield Request("https://www.anthropologie.com" + next_page[0],
                          callback=self.parse)
    def parse(self, response):
        """Scrape product tiles, yield detail requests, then paginate.

        ``self.start_num`` accumulates how many products have been seen and
        is interpolated into ``self.nextpage_url`` for the next slice.
        """
        products = response.xpath('//div[@class="product-tile"]')
        for prod in products:
            item = Product()

            item['Name'] = prod.xpath('div[@class="product-name"]/h2/a[@class="name-link"]/text()').extract_first().strip()
            item['original_url'] = response.urljoin(prod.xpath('div[@class="product-name"]/h2/a[@class="name-link"]/@href').extract_first())
            # Raw strings for the patterns: '\d' in a plain literal is an
            # invalid escape sequence (deprecated in Python 3).
            item['reg_price'] = re.sub(r'[^\d\.]', '', prod.xpath('div[@class="product-pricing"]//span[@class="price-standard"]/text()').extract_first().strip())
            item['sale_price'] = re.sub(r'[^\d\.]', '', prod.xpath('div[@class="product-pricing"]//span[@class="price-sales"]/text()').extract_first().strip())
            item['website_id'] = 15
            item['category_id'] = 2

            yield Request(item['original_url'], meta={'item': item}, callback=self.parse_detail)
            # break
        # Keep paginating while the current page produced any products.
        if len(products) > 0:
            self.start_num = self.start_num + len(products)
            yield Request(self.nextpage_url % self.start_num, callback=self.parse)
Example #8
0
    def parse(self, response):
        """Fetch the product feed from the AJAX endpoint and yield one
        fully-populated ``Product`` item per feed entry (no detail crawl)."""
        ajax_headers = {'X-Requested-With': 'XMLHttpRequest'}
        feed = requests.get(self.ajax_url, headers=ajax_headers).json()

        for entry in feed['Products']:
            item = Product()

            item['Name'] = entry['ModelName']
            item['original_url'] = entry['ProductUrl']
            item['reg_price'] = entry['MaxRegularPrice']
            item['sale_price'] = entry['MinSalePrice']
            item['website_id'] = 12
            item['category_id'] = 2
            item['original_image_url'] = [entry['ProductImageUrl']]
            item['image_urls'] = item['original_image_url']

            yield item
Example #9
0
    def parse(self, response):
        """Scrape thumbnail items from the main results list and paginate.

        Yields a detail Request per product, then follows the 'next' paging
        link when present; the bare except makes pagination best-effort.
        """
        sel = Selector(response)

        for prod in sel.xpath(
                '//div[@id="mainResults"]/ul/li[@class="item thumbnail-item"]'
        ):
            item = Product()

            item['Name'] = prod.xpath(
                'ul[@class="feature-list"]/li[@class="fl-item title"]/a/text()'
            ).extract_first().strip()
            item['original_url'] = prod.xpath(
                'ul[@class="feature-list"]/li[@class="fl-item title"]/a/@href'
            ).extract_first().strip()
            # Keep only digits and the decimal point from the price text.
            item['reg_price'] = re.sub(
                '[^\d\.]', '',
                prod.xpath(
                    'ul[@class="feature-list"]/li[@class="fl-item price"]/a/span[@class="plp_product__strikeoutprice"]/b/text()'
                ).extract_first()).strip()
            item['sale_price'] = re.sub(
                '[^\d\.]', '',
                prod.xpath(
                    'ul[@class="feature-list"]/li[@class="fl-item price"]/a/span[@class="fontSale plp_product_price"]/b/text()'
                ).extract_first()).strip()
            item['website_id'] = 9
            item['category_id'] = 2

            yield Request(item['original_url'],
                          meta={'item': item},
                          callback=self.parse_detail)
        #     break
        # return

        try:
            nextpage_url = response.urljoin(
                sel.xpath(
                    '//ul[@class="no-bullet paging"]/li[@class="next"]/a/@href'
                ).extract_first()).strip()
            # urljoin never returns None here; the guard is defensive only.
            if (nextpage_url is None) or (nextpage_url == ''):
                return
            yield Request(nextpage_url, callback=self.parse)
        except:
            # No next link (extract_first returned None): stop paginating.
            pass
Example #10
0
    def get_products(self, response):
        """Emit one ``Product`` per entry of the JSON search results.

        Also records the result count on ``self.products_num`` (presumably
        read elsewhere for pagination — confirm with the caller).
        """
        results = json.loads(response.text)['response']['searchResults']
        self.products_num = len(results)

        for entry in results:
            item = Product()

            item['Name'] = entry['productName']
            item['original_url'] = entry['url']
            item['reg_price'] = re.sub('[^\d\.]', '', entry['listPrice'])
            item['sale_price'] = re.sub('[^\d\.]', '', entry['minPrice'])
            item['website_id'] = 19
            item['category_id'] = 3
            item['description'] = entry['productDescription']
            main_image = entry['defaultImage']['productDetailMain']
            item['original_image_url'] = [main_image]
            item['image_urls'] = item['original_image_url']

            yield item
Example #11
0
    def parse(self, response):
        """Parse product tiles from the search-result grid.

        Regular/sale price spans are optional in the markup: a missing
        regular price falls back to '0.0'; a missing sale-price span falls
        back to the plain sales-price text (first value of a range).
        """
        products = response.xpath(
            '//ul[@id="search-result-items"]/li/div[@class="product-tile"]')
        for prod in products:
            item = Product()
            item['Name'] = prod.xpath(
                'div[@class="product-caption"]/div[@class="product-name"]/h2/a/text()'
            ).extract_first().strip()
            item['original_url'] = prod.xpath(
                'div[@class="product-caption"]/div[@class="product-name"]/h2/a/@href'
            ).extract_first().strip()
            # except Exception (not bare except): don't swallow
            # KeyboardInterrupt/SystemExit. The expected failure is
            # AttributeError when extract_first() returns None.
            try:
                item['reg_price'] = re.sub(
                    r'[^\d\.]', '',
                    prod.xpath(
                        'div[@class="product-caption"]/div[@class="product-pricing"]//span[@title="Regular Price"]/text()'
                    ).extract_first().strip())
            except Exception:
                item['reg_price'] = '0.0'
            try:
                item['sale_price'] = re.sub(
                    r'[^\d\.]', '',
                    prod.xpath(
                        'div[@class="product-caption"]/div[@class="product-pricing"]//span[@title="Sale Price"]/text()'
                    ).extract_first().strip())
            except Exception:
                item['sale_price'] = re.sub(
                    r'[^\d\.]', '',
                    prod.xpath(
                        'div[@class="product-caption"]/div[@class="product-pricing"]/span[@class="product-sales-price"]/text()'
                    ).extract_first().strip().split('-')[0])
            item['website_id'] = 13
            item['category_id'] = 2
            item['original_image_url'] = [
                prod.xpath(
                    'div[@class="product-image"]/a/img[@class="product-image"]/@src'
                ).extract_first()
            ]
            item['image_urls'] = item['original_image_url']

            yield Request(item['original_url'],
                          meta={'item': item},
                          callback=self.parse_detail)
Example #12
0
    def parse(self, response):
        """Parse product tiles; missing prices default to '0.0'.

        Yields a detail-page Request per tile with the item in ``meta``.
        """
        products = response.xpath('//ul/li/div[@class="product-tile"]')
        # print len(products)
        for prod in products:
            item = Product()
            item['Name'] = prod.xpath('h6[@class="product-name"]/a[@class="name-link"]/text()').extract_first().strip()
            item['original_url'] = prod.xpath('h6[@class="product-name"]/a[@class="name-link"]/@href').extract_first().strip()
            # except Exception (not bare except): don't swallow
            # KeyboardInterrupt/SystemExit; the expected failure is
            # AttributeError when extract_first() returns None.
            try:
                item['reg_price'] = re.sub(r'[^\d\.]', '', prod.xpath('div[@class="product-pricing"]/span[@title="Regular Price"]/text()').extract_first().strip())
            except Exception:
                item['reg_price'] = '0.0'
            try:
                item['sale_price'] = re.sub(r'[^\d\.]', '', prod.xpath('div[@class="product-pricing"]/span[@title="Sale Price"]/text()').extract_first().strip())
            except Exception:
                item['sale_price'] = '0.0'
            item['website_id'] = 16
            item['category_id'] = 2
            # item['discount'] = re.sub('[^\d\.]', '', prod.xpath('div[@class="product-promo promotion"]/span[@class="promotional-message PRODUCT"]/text()').extract_first().strip())

            yield Request(item['original_url'], meta={'item': item}, callback=self.parse_detail)
Example #13
0
    def parse(self, response):
        """Scrape product cards (incl. discount badge) and paginate.

        ``self.page_num`` accumulates the number of products seen and is
        interpolated into ``self.nextpage_url`` after the loop.
        """
        products = response.xpath(
            '//div[@id="product-grid"]//div[@class="product-tile"]//div[contains(@class, "innercard  col")]'
        )
        if len(products) == 0:
            # Empty page: stop paginating.
            return
        self.page_num = self.page_num + len(products)
        for prod in products:
            item = Product()

            item['Name'] = prod.xpath(
                'div/div[@class="product-info-inner-content clearfix with-badges"]/a/@data-productname'
            ).extract_first().strip()
            item['original_url'] = prod.xpath(
                'div/div[@class="product-info-inner-content clearfix with-badges"]/a/@href'
            ).extract_first().strip()
            # Keep only digits and the decimal point from the price text.
            item['reg_price'] = re.sub(
                '[^\d\.]', '',
                prod.xpath(
                    './/div[@class="price"]/span[@class="strike"]/span[@class="baseprice"]/text()'
                ).extract_first()).strip()
            item['sale_price'] = re.sub(
                '[^\d\.]', '',
                prod.xpath(
                    './/div[@class="price"]/span[@class="salesprice discount-price"]/text()'
                ).extract_first()).strip()
            item['website_id'] = 11
            item['category_id'] = 2
            # Discount percentage taken from the sale badge text.
            item['discount'] = re.sub(
                '[^\d\.]', '',
                prod.xpath(
                    'div[@class="badge sale"]/span[@class="badge-text"]/text()'
                ).extract_first()).strip()

            yield Request(item['original_url'],
                          meta={'item': item},
                          callback=self.parse_detail)
            # break

    # return
        yield Request(self.nextpage_url % self.page_num, callback=self.parse)
Example #14
0
    def parse(self, response):
        """Scrape product grid cells and follow the pagination 'next' link.

        The sale price is assembled from the loose text nodes of the cart
        span (the strike-through child holds the regular price).
        """
        sel = Selector(response)

        for prod in sel.xpath(
                '//div[@class="productlist"]/div[@class="productrows"]//div[contains(@class, "prodgrid")]'
        ):
            item = Product()

            item['Name'] = prod.xpath(
                'span[@class="details"]/a/text()').extract_first().strip()
            item['original_url'] = response.urljoin(
                prod.xpath(
                    'span[@class="details"]/a/@href').extract_first()).strip()
            # Regular price sits inside the <strike> element.
            item['reg_price'] = re.sub(
                '[^\d\.]', '',
                prod.xpath(
                    'span[@class="cart"]/span[@class="wasPrice price"]/strike/text()'
                ).extract_first()).strip()
            # Sale price: concatenate the cart span's direct text nodes.
            item['sale_price'] = re.sub(
                '[^\d\.]', '',
                ''.join(prod.xpath(
                    'span[@class="cart"]/text()').extract())).strip()
            item['website_id'] = 8
            item['category_id'] = 2

            yield Request(item['original_url'],
                          meta={'item': item},
                          callback=self.parse_detail)
        #     break
        # return

        try:
            nextpage_url = response.urljoin(
                sel.xpath(
                    '//div[@class="pagination"]/div[@class="paginationlinks"]/ul/li[@class="next"]/a/@href'
                ).extract_first()).strip()
            # urljoin never returns None here; the guard is defensive only.
            if (nextpage_url is None) or (nextpage_url == ''):
                return
            yield Request(nextpage_url, callback=self.parse)
        except:
            # No next link (extract_first returned None): stop paginating.
            pass
Example #15
0
    def get_products(self, response):
        """Build a ``Product`` per JSON list entry and request its
        description page.

        Records the entry count on ``self.products_num`` (presumably used
        for pagination elsewhere — confirm with the caller).
        """
        entries = json.loads(response.text)['Products']['List']
        self.products_num = len(entries)

        for entry in entries:
            item = Product()

            item['Name'] = entry['Description']
            item['original_url'] = response.urljoin(entry['ProductUrl'])
            # Strip currency symbols etc. — keep digits and the decimal point.
            item['reg_price'] = re.sub('[^\d\.]', '', entry['PriceDisplay'])
            item['sale_price'] = re.sub('[^\d\.]', '', entry['PriceSaleDisplay'])
            item['website_id'] = 20
            item['category_id'] = 2
            item['original_image_url'] = [entry['ImageMain']]
            item['image_urls'] = item['original_image_url']
            item['discount'] = re.sub('[^\d\.]', '', entry['PercentageOff'])
            item['brand'] = entry['DesignerName']

            yield Request(url=item['original_url'],
                          callback=self.get_description,
                          meta={'item': item})
Example #16
0
    def parse(self, response):
        """Scrape endeca search-result rows and follow the 'next' page link.

        Prices come from the <strike> (regular) and <em> (sale) children of
        the product_price paragraph.
        """
        products = response.xpath(
            "//div[@class='mainsite_record_listing']/div[@id='endeca_search_results']/ul/li[not(@class)]"
        )
        # print len(products)
        if len(products) == 0:
            return
        for prod in products:
            item = Product()

            # Product link is the anchor without an onmousedown handler.
            item['Name'] = ''.join(
                prod.xpath('a[not(@onmousedown)]/text()').extract()).strip()
            item['original_url'] = prod.xpath(
                'a[not(@onmousedown)]/@href').extract_first()
            # Keep digits, dot and comma from the displayed prices.
            item['reg_price'] = re.sub(
                '[^\d\.\,]', '',
                prod.xpath("p[@class='product_price']/strike/b/text()").
                extract_first()).strip()
            item['sale_price'] = re.sub(
                '[^\d\.\,]', '',
                prod.xpath("p[@class='product_price']/em/b/text()").
                extract_first()).strip()
            item['website_id'] = 21
            item['category_id'] = 2

            yield Request(item['original_url'],
                          meta={'item': item},
                          callback=self.parse_detail)
            # break

        next_page = response.xpath(
            "//div[@class='endeca_pagination']/a[@class='next']/@href"
        ).extract_first()
        if next_page:
            # dont_filter: pagination URLs may repeat and must not be deduped.
            yield Request(response.urljoin(next_page),
                          callback=self.parse,
                          dont_filter=True)
Example #17
0
    def parse(self, response):
        """Page through the Dillard's AJAX product feed until it is empty.

        NOTE(review): uses blocking ``requests.get`` inside a Scrapy
        callback; ``self.page_num`` acts as the beginIndex offset.
        """
        while 1:
            products = requests.get(self.ajax_url % self.page_num,
                                    headers={
                                        'X-Requested-With': 'XMLHttpRequest'
                                    }).json()['products']
            if len(products) == 0:
                break
            for prod in products:
                item = Product()

                item['Name'] = prod['name']
                # Reconstruct the detail-page URL from feed fields; the
                # trailing query keeps the page-size/offset context.
                item['original_url'] = 'http://www.dillards.com/p/' + prod[
                    'nameForURL'] + '/' + prod['catentryId'] + '?di=' + prod[
                        'fullImage'] + '&categoryId=410&facetCache=pageSize=96&beginIndex=%d&orderBy=1' % self.page_num
                item['reg_price'] = re.sub('[^\d\.]', '', prod['listMax'])
                item['sale_price'] = re.sub('[^\d\.]', '', prod['offerMin'])
                item['website_id'] = 10
                item['category_id'] = 2

                yield Request(item['original_url'],
                              meta={'item': item},
                              callback=self.parse_detail)
            # Advance the offset by the number of products just consumed.
            self.page_num = self.page_num + len(products)
Example #18
0
    def parse(self, response):
        """Scrape Macy's thumbnails and advance the 'Productsperpage' URL.

        Pagination rewrites the current URL's 'Productsperpage/<page>,<n>'
        segment using totalPageCount scraped out of the raw page body.
        """
        sel = Selector(response)

        # f=open("page_source.html",'w+b')
        # f.write(response.body)

        all_divs = sel.xpath('//ul[@id="thumbnails"]/li')

        for a in all_divs:
            Name = a.xpath(
                'div[@class="innerWrapper"]/div[@class="textWrapper"]/div[@class="shortDescription"]/a/text()'
            ).extract()
            description = ""
            reg_price = a.xpath(
                'div[@class="innerWrapper"]/div[@class="textWrapper"]/div[@class="prices"]/span[@class="colorway-price"][1]/span[@class="first-range "]/text()'
            ).extract()
            sale_price = a.xpath(
                'div[@class="innerWrapper"]/div[@class="textWrapper"]/div[@class="prices"]/span[@class="colorway-price"]/span[@class="first-range priceSale"]/text()'
            ).extract()
            website_id = 4
            brand = ""
            original_url = a.xpath(
                'div[@class="innerWrapper"]/div[@class="textWrapper"]/div[@class="shortDescription"]/a/@href'
            ).extract()
            original_url = "https://www.macys.com" + "".join(
                original_url).strip()
            original_image_url = ""
            category_id = 2

            item = Product()
            item['Name'] = "".join(Name).strip()

            # Prices look like '$X.XX Y.YY'; take the second token when the
            # split succeeds, otherwise keep the raw extracted value.
            try:
                reg_price = "".join(reg_price).strip().replace(
                    '$', '').strip().split(' ')[1].strip()
            except:
                pass

            try:
                sale_price = "".join(sale_price).strip().replace(
                    '$', '').strip().split(' ')[1].strip()
            except:
                pass

            item['reg_price'] = reg_price
            item['sale_price'] = sale_price
            item['original_url'] = original_url
            item['website_id'] = website_id
            item['category_id'] = category_id

            yield Request(original_url,
                          meta={'item': item},
                          callback=self.each_detail)

            #break

        try:
            # Current page number comes from the URL; the max page count is
            # scraped from a 'totalPageCount:' token in the raw body.
            current_page_no = response.url.split(
                'Productsperpage/')[1].strip().split(',')[0].strip()
            max_page_no = response.body.split(
                'totalPageCount:')[1].strip().split(',')[0].strip()

            if int("".join(current_page_no).strip()) < int(
                    "".join(max_page_no).strip()):
                current_page_no = int("".join(current_page_no).strip()) + 1
                # Rebuild the URL with the incremented page number while
                # keeping the per-page count after the comma.
                temp_link = response.url.split('Productsperpage/')[0].strip(
                ) + "Productsperpage/" + str(
                    current_page_no) + "," + response.url.split(
                        'Productsperpage/')[1].strip().split(',')[1].strip()
                yield Request(temp_link, callback=self.parse)
        except:
            #raise
            pass
Example #19
0
    def parse(self, response):
        """Scrape thumbnails; paginate via canonical link + page dropdown.

        The next-page URL is rebuilt from the canonical link by inserting a
        '/Pageindex/<n>' segment; the max page comes from the last option of
        the pagination dropdown.
        """
        sel = Selector(response)

        # f=open("page_source.html",'w+b')
        # f.write(response.body)

        all_divs = sel.xpath('//ul[@id="thumbnails"]/li')

        for a in all_divs:
            Name = a.xpath(
                'div/div[@class="shortDescription newProdDesc"]/div[@id="prodName"]/a/text()'
            ).extract()
            description = ""
            reg_price = a.xpath(
                'div/div[@class="prices"]/div[@class="priceSale colorwayBrowse"]/div/text()'
            ).extract()
            sale_price = a.xpath(
                'div/div[@class="prices"]/div[@class="priceSale colorwayBrowse"]/div/span[@class="priceSale"]/text()'
            ).extract()
            website_id = 6
            brand = a.xpath(
                'div/div[@class="shortDescription newProdDesc"]/div[@id="brandName"]/a/text()'
            ).extract()
            original_url = a.xpath(
                'div/div[@class="shortDescription newProdDesc"]/div[@id="prodName"]/a/@href'
            ).extract()
            original_url = "".join(original_url).strip()
            original_image_url = ""
            category_id = 2

            # Prices look like '$X.XX Y.YY'; take the second token when the
            # split succeeds, otherwise keep the raw extracted value.
            try:
                reg_price = "".join(reg_price).strip().replace(
                    '$', '').strip().split(' ')[1].strip()
            except:
                pass

            try:
                sale_price = "".join(sale_price).strip().replace(
                    '$', '').strip().split(' ')[1].strip()
            except:
                pass

            item = Product()
            item['Name'] = "".join(Name).strip()
            item['reg_price'] = reg_price
            item['sale_price'] = sale_price
            item['brand'] = "".join(brand).strip()
            item['original_url'] = original_url
            item['website_id'] = website_id
            item['category_id'] = category_id

            yield Request(original_url,
                          meta={'item': item},
                          callback=self.each_detail)

            # break

        try:
            next_page = sel.xpath('//link[@rel="canonical"]/@href').extract()
            current_page_no = sel.xpath(
                '//li[@class="currentPage displayNone"][1]/text()').extract()

            # Last option of the pagination dropdown holds the max page.
            temp_page = sel.xpath(
                '//select[@id="paginationDdl"]/option/@value').extract()
            max_page_no = temp_page[len(temp_page) - 1]

            if int("".join(current_page_no).strip()) < int(
                    "".join(max_page_no).strip()):
                current_page_no = int("".join(current_page_no).strip()) + 1
                # Splice '/Pageindex/<n>' into the canonical URL before its
                # '?id=' query part.
                temp_link = "".join(next_page).strip().split('?id')[0].strip(
                ) + "/Pageindex/" + str(current_page_no) + "?id=" + "".join(
                    next_page).strip().split('?id=')[1].strip()
                yield Request(temp_link, callback=self.parse)
        except:
            pass
    def parse(self, response):
        """Scrape Barneys Warehouse product tiles and paginate via '&page='.

        Page bounds are read from the hidden currentPageNumber input's value
        and max attributes.
        """
        print response.url
        sel = Selector(response)

        # f=open("page_source.html",'w+b')
        # f.write(response.body)

        all_divs = sel.xpath('//div[@id="atg_store_prodList"]/ul/li')

        for a in all_divs:
            Name = a.xpath(
                'div[@class="product-tile "]/div[@class="wrap-desc"]/div[@class="product-name"]/a/text()'
            ).extract()
            description = ""
            reg_price = a.xpath(
                'div[@class="product-tile "]/div[@class="wrap-desc"]/div[@class="product-pricing"]/div[@class="product-standard-price"]/span[@class="product-discounted-price"]/text()'
            ).extract()
            sale_price = a.xpath(
                'div[@class="product-tile "]/div[@class="wrap-desc"]/div[@class="product-pricing"]/div[@class="product-standard-price"]/span[@class="product-sales-price"]/text()'
            ).extract()
            website_id = 3
            brand = a.xpath(
                'div[@class="product-tile "]/div[@class="wrap-desc"]/div[@class="brand"]/a/text()'
            ).extract()
            original_url = a.xpath(
                'div[@class="product-tile "]/div[@class="wrap-desc"]/div[@class="product-name"]/a/@href'
            ).extract()
            original_url = "http://www.barneyswarehouse.com" + "".join(
                original_url).strip()
            original_image_url = ""
            category_id = 2
            # Discount text like 'NN% Off' → keep just the number.
            discount = a.xpath(
                'div[@class="product-tile "]/div[@class="wrap-desc"]/div[@class="product-pricing"]/div[@class="product-standard-price"]/text()'
            ).extract()
            discount = "".join(discount).strip().replace('Off', '').replace(
                '%', '').strip()

            item = Product()
            item['Name'] = "".join(Name).strip()
            item['reg_price'] = "".join(reg_price).strip().replace('$',
                                                                   '').strip()
            item['sale_price'] = "".join(sale_price).strip().replace(
                '$', '').strip()
            item['brand'] = "".join(brand).strip()
            item['original_url'] = original_url
            item['discount'] = discount
            item['website_id'] = website_id
            item['category_id'] = category_id

            print 'yield Request(original_url, meta={\'item\': item}, callback=self.each_detail)'
            yield Request(original_url,
                          meta={'item': item},
                          callback=self.each_detail)

            # break

        # Page bounds from the hidden input's value/max attributes.
        current_page_no = sel.xpath(
            '//input[@id="currentPageNumber"][1]/@value').extract()
        max_page_no = sel.xpath(
            '//input[@id="currentPageNumber"][1]/@max').extract()
        try:
            if int("".join(current_page_no).strip()) < int(
                    "".join(max_page_no).strip()):
                current_page_no = int("".join(current_page_no).strip()) + 1
                # Swap the '&page=' query value for the incremented page.
                temp_link = response.url.split(
                    '&page=')[0].strip() + "&page=" + str(current_page_no)
                yield Request(temp_link, callback=self.parse)
        except:
            pass