Ejemplo n.º 1
0
    def parse(self, response):
        """Extract the embedded ``mainContestObj`` JSON blob from a crawled page.

        Returns a ProductItem whose 'json' field holds the raw JSON text,
        or None (implicitly) when the payload is not present.
        """
        self.log('crawl page: {}'.format(response.url), logging.INFO)

        # Raw string: "\[" in a plain literal is an invalid escape sequence
        # (SyntaxWarning on modern CPython); the pattern itself is unchanged.
        # NOTE(review): the pattern is str but Scrapy's response.body is bytes
        # on Python 3 — confirm this spider targets Python 2, or switch to
        # response.text.
        match = re.search(r"\['mainContestObj'\] = ({.*?});\n", response.body)
        if match:
            item = ProductItem()
            item['json'] = match.group(1)
            return item
Ejemplo n.º 2
0
 def get_item_contents(self, response):
     """Scrape a product detail page into a ProductItem.

     Pulls id, image, title, category, price, description, addresses,
     net contents and the "usually bought next" recommendations (from the
     page's embedded redux state), then yields the populated item.
     """
     product_id = response.url.rsplit('/', 1)[1]
     image_url = response.xpath(
         '//div[@class="product-image__container"]/img/@src').get()
     # Strip the query string after the last '?'. The original slice with
     # rfind() chopped the final character when no '?' was present
     # (rfind -> -1) and raised AttributeError when the image was missing.
     if image_url and '?' in image_url:
         image_url = image_url.rpartition('?')[0]
     product_title = response.xpath('//h1/text()').get()
     product_category = response.xpath(
         '(//span[contains(@class, "styled__StandaloneContainer")])[3]/a/span/span/text()'
     ).get()
     product_price = response.xpath(
         '//span[@data-auto="price-value"]/text()').get()
     product_description = '\n'.join(
         response.xpath(
             '//div[@id="product-description"]/*[not(self::h2)]//text()').
         getall())
     name_and_address = '\n'.join(
         response.xpath(
             '//div[@id="manufacturer-address"]/ul/li/text()').getall())
     return_address = '\n'.join(
         response.xpath(
             '//div[@id="return-address"]/ul/li/text()').getall())
     net_contents = response.xpath(
         '//div[@id="net-contents"]/p/text()').get()
     usually_bought_next = []
     # Renamed from `json` to avoid shadowing the stdlib module name.
     state = loads(
         response.xpath(
             '//body[@id="data-attributes"]/@data-redux-state').get())
     try:
         recommendations = state['productDetails']['recommendations']['data']
         for key in recommendations:
             for entry in recommendations[key]['productItems']:
                 usually_bought_next.append({
                     'product_url':
                     furl(self.product_template).join(
                         entry['product']['id']).url,
                     'title':
                     entry['product']['title'],
                     'product_image_url':
                     entry['product']['defaultImageUrl'],
                     'price':
                     entry['product']['price']
                 })
     except TypeError:
         # Redux state absent or not shaped as expected (e.g. loads(None)).
         usually_bought_next = None
     product = ProductItem()
     product['product_id'] = product_id
     product['url'] = response.url
     product['image_url'] = image_url
     product['title'] = product_title
     product['category'] = product_category
     product['price'] = product_price
     product['description'] = product_description
     product['name_and_address'] = name_and_address
     product['return_address'] = return_address
     product['net_contents'] = net_contents
     # Reviews are attached upstream via request meta, if at all.
     product['reviews'] = response.meta.get('reviews', None)
     product['usually_bought_next'] = usually_bought_next
     yield product
Ejemplo n.º 3
0
 def product_parse(self, response: HtmlResponse):
     """Yield a ProductItem populated from a product detail page."""
     xpath_fields = (
         ('title', '//h1[@slot="title"]/text()'),
         ('price', '//uc-pdp-price-view[@slot="primary-price"]/span/text()'),
         ('description', '//uc-pdp-section-vlimited/div/p/text()'),
         ('features', '//dl[@class="def-list"]/div/*/text()'),
         ('images', '//img[@alt="product image"]/@src'),
     )
     loader = ItemLoader(item=ProductItem(), response=response)
     for field, xpath in xpath_fields:
         loader.add_xpath(field, xpath)
     loader.add_value('url', response.url)
     yield loader.load_item()
Ejemplo n.º 4
0
def start():
    """Run every HTML file under ../tests through ProductItem extraction.

    Returns a list with each product's ``__dict__`` snapshot.
    """
    import glob
    import lxml.etree as etree
    result = []
    for file_name in glob.glob('../tests/*.html'):
        with open(file_name, 'r', encoding='utf-8') as f:
            html = f.read()
        # Build a fresh dict per file: the original reused one mutable dict
        # across iterations, so any product that kept a reference to it saw
        # the last file's data.
        params = {
            'html_code': html,
            'xpath_obj': etree.HTML(html),
        }
        product = ProductItem()
        product(params)
        result.append(product.__dict__)
    return result
Ejemplo n.º 5
0
    def parse(self, response):
        """Build a ProductItem from a product page and yield it.

        On a full parse, copies all scraped fields onto the item. On a
        partial parse, records the missing field names in 'fail_crawls'
        and attaches the url/class_name for diagnosis. In both cases the
        queued value is removed via l_rem before yielding.
        """
        item = ProductItem()
        page_item = PAGE_ITEM(response.text)
        page_item.get_Infos()

        # uuid is recorded whether or not the parse succeeded.
        item['uuid'] = response.meta['uuid']
        fields = ('title', 'price', 'stars', 'best_sell_rank')
        if page_item.parsed:
            item['status'] = True
            for name in fields:
                item[name] = getattr(page_item, name)
        else:
            item['status'] = False
            item['fail_crawls'] = []
            # One loop replaces four copy-pasted None-check stanzas;
            # append order (title, price, stars, best_sell_rank) preserved.
            for name in fields:
                value = getattr(page_item, name)
                if value is None:
                    item['fail_crawls'].append(name)
                else:
                    item[name] = value
            item['url'] = response.url
            item['class_name'] = response.meta['class_name']

        # Dequeue the processed value regardless of parse outcome.
        # NOTE(review): l_rem looks like a redis list-remove helper — confirm.
        l_rem(self.hostname, response.meta['value'])

        yield item
Ejemplo n.º 6
0
 def parse(self, response):
     """Yield a sentiment item per review comment, then follow pagination."""
     print("processing:" + response.url)
     for comm in response.xpath('/html//div[@class="comment"]'):
         item = ProductItem()
         text = ' '.join(comm.xpath('.//div[@class="comment_text"]/text()').extract())
         item['otziv'] = text.replace('\r', '')
         stars = comm.xpath('.//div[@class="author"]/div[@class="br-theme-css-stars"]/div[@class="br-widget"]/a[@class="br-active"]')
         # More than three active stars counts as a positive review.
         item['target'] = 1 if len(stars) > 3 else 0
         yield item
     next_page = response.css('html body div#wrapper div#content div div#user_reviews div#more_reviews a::attr(href)').extract_first()
     if next_page:
         yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
Ejemplo n.º 7
0
    def parse(self, response):
        """Parse an AliExpress product page into store/product/feedback/order items.

        On success, yields a store UrlItem, the ProductItem, a feedback
        UrlItem and an order UrlItem. When the expected elements are absent
        (anti-spider page), re-queues the product URL as a UrlItem instead.
        """
        self.log('product url: {}'.format(response.url), logging.INFO)

        try:
            store_url = response.css('.shop-name').xpath(
                'a/@href').extract()[0]
            self.log('crawl store url: {}'.format(store_url), logging.INFO)

            store_item = UrlItem()
            store_item['prefix'] = ProductSpider.prefix
            store_item['type'] = 'store'
            store_item['url'] = store_url
            yield store_item

            feedback_base_url = response.xpath(
                '//div[@id="feedback"]/iframe/@thesrc').extract()[0]
            parsed = urlparse.urlparse(feedback_base_url)
            product_id = urlparse.parse_qs(parsed.query)['productId'][0]

            try:
                percent_num = response.css('.percent-num').xpath(
                    'text()').extract()[0]
                rantings_text = response.css('.rantings-num').xpath(
                    'text()').extract()[0]
                rantings_num = rantings_text[1:rantings_text.index(' ')]
                order_text = response.css('.order-num').xpath(
                    'text()').extract()[0]
                order_num = order_text[:order_text.index(' ')]
            except (IndexError, ValueError):
                # Rating/order widgets missing or malformed (extract()[0]
                # raises IndexError, str.index raises ValueError): default
                # to zeros. Was a bare except, which also hid real bugs.
                percent_num = 0
                rantings_num = 0
                order_num = 0

            product_item = ProductItem()
            product_item['prefix'] = ProductSpider.prefix
            product_item['_id'] = product_id
            product_item['store'] = store_url
            product_item['url'] = response.url
            product_item['percent_num'] = percent_num
            product_item['rantings_num'] = rantings_num
            product_item['order_num'] = order_num
            yield product_item

            feedback_item = UrlItem()
            feedback_item['prefix'] = ProductSpider.prefix
            feedback_item['type'] = 'feedback'
            feedback_item['url'] = feedback_base_url
            yield feedback_item

            order_item = UrlItem()
            order_item['prefix'] = ProductSpider.prefix
            order_item['type'] = 'order'
            order_item[
                'url'] = 'http://feedback.aliexpress.com/display/evaluationProductDetailAjaxService.htm?productId={}&type=default'.format(
                    product_id)
            yield order_item
        except Exception:
            # Page lacks the expected structure — assume anti-spider page
            # and re-queue. Was a bare except, which would also swallow
            # KeyboardInterrupt/SystemExit.
            try:
                product_url = response.meta['redirect_urls'][0]
            except (KeyError, IndexError):
                # No redirect chain recorded: fall back to the response URL.
                product_url = response.url
                self.log('strange product url: {}'.format(product_url),
                         logging.ERROR)
            finally:
                self.log(
                    'meet anti-spider, back product: {}'.format(product_url),
                    logging.INFO)

                url_item = UrlItem()
                url_item['prefix'] = ProductSpider.prefix
                url_item['type'] = 'product'
                url_item['url'] = product_url
                yield url_item