Ejemplo n.º 1
0
    def parse_items(self, response):
        """Yield one ``EcommerceItem`` per product in the second best-list block.

        The main/sub category names are read from ``response.meta``. Prices
        have their thousands separators and the '원' suffix removed and are
        stored as ints, as is the discount percentage. Each yielded item
        receives the current value of the module-level ``primary_key``
        counter, which is then incremented.

        Entries without a sale price are skipped: ``.get()`` returns ``None``
        when a selector matches nothing, and calling ``.replace`` on that
        would raise and abort the generator for every remaining product.
        """
        global primary_key
        print("parse_items", response.meta['main_category_name'],
              response.meta['sub_category_name'])

        best_items = response.css('div.best-list')
        for index, item in enumerate(best_items[1].css('li')):
            title = item.css('a.itemname::text').get()
            ori_price = item.css('div.o-price::text').get()
            dis_price = item.css('div.s-price strong span span::text').get()
            discount_percent = item.css('div.s-price em::text').get()

            if dis_price is None:
                # No sale price means nothing usable can be stored for this
                # entry; skip it instead of crashing on None.replace(...).
                continue
            if ori_price is None:
                # No separate original price: the product is not discounted.
                ori_price = dis_price
            ori_price = ori_price.replace(',', '').replace('원', '')
            dis_price = dis_price.replace(',', '').replace('원', '')
            if discount_percent is None:
                discount_percent = 0
            else:
                discount_percent = discount_percent.replace('%', '')

            doc = EcommerceItem()
            doc['main_category_name'] = response.meta['main_category_name']
            doc['sub_category_name'] = response.meta['sub_category_name']
            # Ranking is the 1-based position of the <li> in the list.
            doc['ranking'] = index + 1
            doc['title'] = title
            doc['ori_price'] = int(ori_price)
            doc['dis_price'] = int(dis_price)
            doc['discount_percent'] = int(discount_percent)
            doc['primary_key'] = primary_key
            yield doc
            primary_key += 1
Ejemplo n.º 2
0
    def parse_items(self, response):
        """Yield one ``EcommerceItem`` per product in the second best-list block.

        The main/sub category names are read from ``response.meta``. Prices
        are stored as strings with thousands separators and the '원' suffix
        removed; the discount percentage is stored as a string without the
        '%' sign ('0' when the page shows no discount).

        Entries without a sale price are skipped: ``.get()`` returns ``None``
        when a selector matches nothing, and calling ``.replace`` on that
        would raise and abort the generator for every remaining product.
        """
        print("parse_maincategory", response.meta['main_category_name'],
              response.meta['sub_category_name'])

        best_items = response.css('div.best-list')
        for index, item in enumerate(best_items[1].css('li')):
            title = item.css('a.itemname::text').get()
            ori_price = item.css('div.o-price span span::text').get()
            print(title, ori_price)
            dis_price = item.css('div.s-price strong span span::text').get()
            discount_percent = item.css('div.s-price em::text').get()

            if dis_price is None:
                # No sale price means nothing usable can be stored for this
                # entry; skip it instead of crashing on None.replace(...).
                continue
            if ori_price is None:
                # No separate original price: the product is not discounted.
                ori_price = dis_price
            ori_price = ori_price.replace(",", "").replace("원", "")
            dis_price = dis_price.replace(",", "").replace("원", "")
            if discount_percent is None:
                discount_percent = '0'
            else:
                discount_percent = discount_percent.replace("%", "")

            doc = EcommerceItem()
            doc['main_category_name'] = response.meta['main_category_name']
            doc['sub_category_name'] = response.meta['sub_category_name']
            # Ranking is the 1-based position of the <li> in the list.
            doc['ranking'] = index + 1
            doc['title'] = title
            doc['ori_price'] = ori_price
            doc['dis_price'] = dis_price
            doc['discount_percent'] = discount_percent
            yield doc
Ejemplo n.º 3
0
    def parse_maincategory(self, response):
        """Yield one ``EcommerceItem`` per product in the second best-list block.

        Category names come from ``response.meta``. Prices are stored exactly
        as extracted (no separator or currency stripping). The discount
        percentage is always stored as a string without the '%' sign; when
        the page shows no discount it is ``'0'``, so the field has a single
        type instead of mixing int and str.
        """
        print('parse_maincategory', response.meta['maincategory_name'],
              response.meta['subcategory_name'])

        best_items = response.css('div.best-list')[1]
        for index, best_item in enumerate(best_items.css('li')):
            doc = EcommerceItem()
            title = best_item.css('a.itemname::text').get()
            ori_price = best_item.css('div.o-price::text').get()
            dis_price = best_item.css(
                'div.s-price strong span span::text').get()
            discount_percent = best_item.css('div.s-price em::text').get()

            if ori_price is None:
                # No separate original price: fall back to the sale price.
                ori_price = dis_price
            if discount_percent is None:
                discount_percent = '0'
            else:
                discount_percent = discount_percent.replace('%', '')

            doc['maincategory_name'] = response.meta['maincategory_name']
            doc['subcategory_name'] = response.meta['subcategory_name']
            # Ranking is the 1-based position of the <li> in the list.
            doc['ranking'] = index + 1
            doc['title'] = title
            doc['ori_price'] = ori_price
            doc['dis_price'] = dis_price
            doc['discount_percent'] = discount_percent

            yield doc
Ejemplo n.º 4
0
    def parse(self, response):
        """Collect an ``EcommerceItem`` for every ``Products-item`` node.

        Each field holds the full list returned by ``.extract()`` for its
        XPath (not a single value). The collected items are logged and then
        returned as a list.
        """
        # (item field, XPath relative to one product node)
        field_queries = (
            ('title', './/*[@class="Product-list-right"]//@title'),
            ('url',
             './/*[@class="Product-list-right"]'
             '/*[@class="Product-nameHeading"]/a/@href'),
            ('reduction', './/*[@class="Badge-reducere"]/text()'),
            ('new_price', './/*[@itemprop="price"]/@content'),
        )

        collected = []
        for product_node in response.xpath('//*[@class="Products-item"]'):
            record = EcommerceItem()
            for field_name, query in field_queries:
                record[field_name] = product_node.xpath(query).extract()
            collected.append(record)

        logging.info("parsing result: {item}".format(item=collected))
        return collected
Ejemplo n.º 5
0
    def parse(self, response):
        """Yield an ``EcommerceItem`` (title and cleaned price) per best-list title.

        Titles and prices come from two independent selectors and are paired
        by position, so a price list shorter than the title list raises
        ``IndexError``.
        """
        product_names = response.css('div.best-list li > a::text').getall()
        price_texts = response.css(
            'div.best-list ul li div.item_price div.s-price strong span::text'
        ).getall()

        for position, product_name in enumerate(product_names):
            entry = EcommerceItem()
            entry['title'] = product_name
            # Trim whitespace, then drop the '원' suffix and thousands commas.
            trimmed = price_texts[position].strip()
            without_suffix = trimmed.replace("원", "")
            entry['price'] = without_suffix.replace(",", "")
            yield entry
Ejemplo n.º 6
0
 def parse(self, response):
     """Yield an ``EcommerceItem`` with title and raw price per best-list title.

     Titles and prices come from two independent selectors and are paired by
     position; the price text is stored exactly as extracted.
     """
     name_texts = response.css('div.best-list li > a::text').getall()
     price_texts = response.css(
         'div.best-list ul li div.item_price div.s-price strong span::text'
     ).getall()

     for position, name_text in enumerate(name_texts):
         record = EcommerceItem()
         # Per the original author's note, 'title' is a field declared in
         # items.py.
         record['title'] = name_text
         record['price'] = price_texts[position]
         # Hand the populated item back to the caller, one per title.
         yield record
Ejemplo n.º 7
0
 def parse(self, response):
     """Yield an ``EcommerceItem`` (title and cleaned price) per best-list title.

     Only ``<li>`` elements that carry an ``id`` attribute are selected. The
     ``>`` child combinator restricts each step to direct children, which
     helps when the page contains many similar tags.
     """
     name_texts = response.css(
         'div.best-list > ul > li[id] > a::text'
     ).getall()
     price_texts = response.css(
         'div.best-list > ul > li[id] > div.item_price > div.s-price > strong > span::text'
     ).getall()

     for position, name_text in enumerate(name_texts):
         record = EcommerceItem()  # instantiate the item class
         record['title'] = name_text
         # Trim whitespace, then drop the '원' suffix and thousands commas.
         trimmed = price_texts[position].strip()
         record['price'] = trimmed.replace("원", "").replace(",", "")
         # Each yield hands one populated item back to the caller.
         yield record
Ejemplo n.º 8
0
    def parse(self, response):
        """Yield an ``EcommerceItem`` (title and cleaned price) per best-list title.

        Titles and prices are selected separately using direct-child
        selectors and matched up by index; a missing price for a given title
        raises ``IndexError``.
        """
        title_selector = 'div.best-list > ul > li > a::text'
        price_selector = (
            'div.best-list > ul > li > div.item_price > div.s-price > strong > span > span::text'
        )
        headings = response.css(title_selector).getall()
        amounts = response.css(price_selector).getall()

        for idx, heading in enumerate(headings):
            product = EcommerceItem()
            product['title'] = heading
            # Trim whitespace, then drop the '원' suffix and thousands commas.
            amount = amounts[idx].strip()
            amount = amount.replace('원', '')
            product['price'] = amount.replace(',', '')
            yield product
    def parse(self, response):
        """Handle both listing pages and product pages.

        A response whose request carried a truthy ``meta['isProduct']`` is
        treated as a product page: one ``EcommerceItem`` is built from the
        JSON embedded in the ``data-mz-preload-product`` script tag, plus a
        review roll-up fetched from PowerReviews. Any other response is
        treated as a listing: every product link is followed, and the
        next-page link is queued if present.
        """
        if response.meta.get('isProduct'):
            soup = BeautifulSoup(response.body, 'lxml')
            product_json = json.loads(soup.find('script', attrs={'id':'data-mz-preload-product'}).encode_contents())

            headers = {
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
                'Authorization': '516a63a6-9f21-4dbe-b9f4-edc4b7387ea7'
            }
            # NOTE(review): this is a blocking HTTP call with no timeout,
            # made inside the callback; consider a timeout or a chained
            # scrapy.Request.
            res = requests.get('https://readservices-b2c.powerreviews.com/m/4403/l/en_US/product/%s/reviews?' % str(product_json["productCode"]), headers=headers)
            parsed = json.loads(res.text)

            item = EcommerceItem()

            item['id'] = product_json["productCode"]
            item['name'] = product_json["content"]["productName"]
            item['model'] = product_json["mfgPartNumber"].strip()
            item['url'] = response.url
            # Brand defaults to an empty string; the last 'Brand Name'
            # property wins if several are present.
            item['brand'] = ""
            for prop in product_json["properties"]:
                if prop["attributeDetail"]["name"] == 'Brand Name':
                    item['brand'] = prop["values"][0]["stringValue"]

            # The review roll-up is optional. Only lookup failures fall back
            # to 0, so unrelated errors are no longer silently swallowed.
            # NOTE(review): 'reviews' receives average_rating and 'rating'
            # receives review_count, which looks swapped. Confirm against
            # the item schema before changing.
            try:
                item['reviews'] = parsed["results"][0]["rollup"]["average_rating"]
            except (KeyError, IndexError, TypeError):
                item['reviews'] = 0
            try:
                item['rating'] = parsed["results"][0]["rollup"]["review_count"]
            except (KeyError, IndexError, TypeError):
                item['rating'] = 0

            if product_json["price"]["onSale"]:
                item['price'] = product_json["price"]["salePrice"]
            else:
                item['price'] = product_json["price"]["price"]

            yield item

        else:
            for u in response.xpath('//li[contains(@class, "mz-productlist-item")]//a[@class="mz-productlisting-title"]/@href').extract():
                yield scrapy.Request(u, meta={'isProduct': True})

            next_page = response.xpath('//a[@class="mz-pagenumbers-next"]/@href').extract_first()
            if next_page:
                yield scrapy.Request(response.urljoin(next_page), meta={'isProduct': False})
Ejemplo n.º 10
0
    def parse(self, response):
        """Yield an ``EcommerceItem`` (title and cleaned price) per best-list title.

        Titles and prices are selected separately and matched up by index; a
        missing price for a given title raises ``IndexError``.
        """
        labels = response.css('div.best-list li > a::text').getall()
        price_selector = (
            'div.best-list li > div.item_price > div.s-price > strong > span > span::text'
        )
        costs = response.css(price_selector).getall()

        for slot, label in enumerate(labels):
            product = EcommerceItem()
            product['title'] = label
            # Trim whitespace, then drop the '원' suffix and thousands commas.
            cost = costs[slot].strip()
            cost = cost.replace("원", "")
            cost = cost.replace(",", "")
            product['price'] = cost
            yield product


# In contrast to "return",
# "yield" doesn't exit the function; execution continues with your for-loop.
# If you use "return", your for-loop will finish after the first iteration.
Ejemplo n.º 11
0
    def _scrape_product_links(self, response):
        """Yield ``(redirect, EcommerceItem)`` pairs for each product result.

        Product nodes are taken from ``ol.product-results`` (``li.psli``,
        falling back to ``li.psgi``). Title, URL, review link and source site
        come from the HTML. Missing title/URL, plus description and image
        URL, are filled from the ``google.pmc`` JSON embedded in the
        ``xjsi`` script, keyed by the node's ``data-docid``.

        ``redirect`` is the URL as found on the page; the item's ``url`` is
        that value joined against ``response.url``. Nodes that raise
        ``IndexError`` during HTML extraction are logged and skipped.
        """
        items = response.css('ol.product-results li.psli')
        if not items:
            items = response.css('ol.product-results li.psgi')

        if not items:
            self.log("Found no product links.", WARNING)
        # try to get data from json
        script = response.xpath(
            '//div[@id="xjsi"]/script/text()').extract()
        script = script[0] if script else ''

        # The JSON payload sits between 'google.pmc=' and
        # ';google.y.first.push'. When a marker is missing, fall back to the
        # corresponding end of the script text.
        json_data = {}
        start = script.find(u'google.pmc=')
        if start < 0:
            start = 0
        else:
            start = start + len(u'google.pmc=')

        end = script.find(u';google.y.first.push')
        if end < 0:
            end = None

        cleansed = script[start:end]

        if cleansed:
            try:
                json_data = json.loads(cleansed)
            except ValueError:
                # json raises ValueError (or a subclass) on malformed input.
                self.log('Failed to process json data', WARNING)

            try:
                json_data = json_data['spop']['r']
            except (KeyError, TypeError):
                self.log('Failed to find ["spop"]["r"] at json data', WARNING)

        if not isinstance(json_data, dict):
            # A non-dict payload cannot be looked up by doc id; treat it as
            # "no JSON data" rather than crashing on .get() below.
            json_data = {}

        for item in items:
            url = title = description = price = image_url = None
            try:
                doc_id = item.xpath('@data-docid').extract()[0]
                link = item.xpath('.//div[@class="pslmain"]/h3[@class="r"]/a')
                if not link:
                    link = item.xpath('.//a[@class="psgiimg"]')
                title = link.xpath('string(.)').extract()[0]
                url = link.xpath('@href').extract()[0]
                rewiew_link = item.xpath(
                    './/a[@class="shop__secondary"]/@href'
                ).extract()
                if rewiew_link:
                    rewiew_link = rewiew_link[0]
                source_site = item.xpath(
                    './/div[@class="_tyb"]'
                )
                if source_site:
                    # Keep the selector for the later price lookup and
                    # reduce source_site to the seller name text.
                    source_site_path = source_site
                    source_site_text = source_site_path.xpath(
                        './text()').extract()
                    source_site = source_site_text[0].replace(
                        'from ', '').strip()
            except IndexError:
                self.log('Index error at {url}'.format(url=response.url),
                         WARNING)
                continue

            _prices = item.xpath('.//*[contains(@class, "price")]')
            # NOTE(review): get_price is defined elsewhere; the membership
            # test below assumes it returns a string. Confirm.
            price = get_price(_prices)

            # TODO: support more currencies? we have to detect the website
            #  (google.au, google.br etc.) and use the appropriate currency
            # See https://support.google.com/merchants/answer/160637?hl=en
            if '$' not in price:  # TODO: only USD is supported now
                self.log('Unrecognized currency sign at %s' % response.url,
                         level=ERROR)
            else:
                price = Price(
                    price=price.replace('$', '').replace(',', '').strip(),
                    priceCurrency='USD'
                )

            # fill from json
            entry = json_data.get(doc_id)
            if entry:
                try:
                    if not title:
                        title = Selector(
                            text=entry[1]).xpath('string(.)').extract()[0]
                    if not url:
                        url = entry[2]
                    description = entry[3]
                    image_url = entry[8][0][0]
                except IndexError:
                    self.log('Invalid JSON on {url}'.format(url=response.url),
                             WARNING)

            redirect = url
            url = urlparse.urljoin(response.url, url)

            # When a seller name was found, encode it as a JSON object
            # {name: {price, currency}} (or {name: {}} when the selector did
            # not match exactly one node).
            if len(source_site) > 0:
                if len(source_site_path) == 1:
                    source_price = source_site_path.xpath(
                        './span[@class="price"]/b/text()'
                    ).extract()
                    if source_price:
                        source_price = source_price[0].replace('$', '')\
                            .replace(',', '').strip()
                        priceCurrency = 'USD'
                        data = {
                            'price': source_price,
                            'currency': priceCurrency
                        }
                        source_site = {source_site:data}
                else:
                    source_site = {source_site:{}}
                source_site = json.dumps(source_site)

            yield redirect, EcommerceItem(
                url=url,
                title=title,
                price=price,
                image_url=image_url,
                description=description,
                google_source_site=source_site,
                buyer_reviews=rewiew_link,
                locale='en-US')