import json

from .models import ScrapyItem  # Django model storing crawled data; import path assumed


class ScrapyAppPipeline(object):
    def __init__(self, unique_id, stats, publisher, *args, **kwargs):
        self.unique_id = unique_id
        self.stats = stats
        self.publisher = publisher
        self.items = []
        self.item_all = ScrapyItem()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            # unique_id and publisher are injected by the Django view
            # through custom crawler settings
            unique_id=crawler.settings.get('unique_id'),
            publisher=crawler.settings.get('publisher'),
            stats=crawler.stats)

    def close_spider(self, spider):
        # Nothing to do here: every call to process_item already persists the data.
        pass

    def process_item(self, item, spider):
        # Accumulate every scraped item and re-save the whole list as JSON.
        self.items.append({
            'url': item['item']['url'],
            'title': item['item']['title']
        })
        self.item_all.unique_id = self.unique_id
        self.item_all.data = json.dumps(self.items)
        scraped_count = self.stats.get_value('item_scraped_count')
        if scraped_count:
            self.item_all.item_scraped = scraped_count
        self.item_all.save()

        return item
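The ScrapyItem Django model this pipeline writes to is not shown; a minimal sketch that would satisfy the attribute accesses above (field names taken from the pipeline, field types assumed):

# models.py -- hypothetical definition matching the attributes used above
from django.db import models


class ScrapyItem(models.Model):
    unique_id = models.CharField(max_length=100, null=True)  # id handed in by the Django view
    data = models.TextField()                                # JSON dump of every scraped item
    item_scraped = models.IntegerField(default=0)            # copied from Scrapy's stats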
Example #2
    def parse(self, response):
        for deal in response.css('div.sku'):
            item = ScrapyAppItem()
            url = deal.css('a.link::attr(href)').extract_first()

            title = deal.css('a.link h2.title span.name::text').extract_first()
            image_url = deal.css(
                'a.link div.image-wrapper img::attr(data-src)').extract_first()
            percentage = deal.css(
                'a.link div.price-container span.sale-flag-percent::text'
            ).extract_first()
            item['web_source'] = 'jumia'
            item['deal_title'] = title
            item['deal_image_url'] = image_url
            if percentage:
                # e.g. "-40%" -> "40": keep the number, drop the sign and "%"
                item['deal_percentage'] = percentage.split("%")[0].strip().lstrip('-')
            else:
                item['deal_percentage'] = '0'
            # Store the deal once: skip titles that already exist in the database.
            obj = ScrapyItem.objects.filter(
                deal_title=item['deal_title']).first()
            if not obj:
                scrapy_item = ScrapyItem()
                scrapy_item.web_source = item['web_source']
                scrapy_item.deal_title = item['deal_title']
                scrapy_item.deal_image_url = item['deal_image_url']
                scrapy_item.deal_percentage = item['deal_percentage']
                scrapy_item.save()

            url = response.urljoin(url)
            # Wrap the item in a dict: Scrapy merges its own keys (depth,
            # download_timeout, ...) into request meta, so passing the bare
            # item would mix them with the scraped fields.
            yield scrapy.Request(url=url,
                                 callback=self.parse_details,
                                 meta={'item': item})
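Neither spider listing includes parse_details; a minimal sketch of what it could look like, assuming the detail page only contributes its own URL (the deal_url field name is an assumption, not shown in the source):

    def parse_details(self, response):
        # Recover the partially filled item handed over through request meta.
        item = response.meta['item']
        item['deal_url'] = response.url  # assumed field; must be declared on the Item
        yield item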
Example #3
    def parse(self, response):
        for deal in response.css('div.block-grid-large'):
            item = ScrapyAppItem()
            url = deal.css(
                'div.img-bucket a.img-link::attr(href)').extract_first()
            title = deal.css(
                'div ul li.title-row h6 span a::attr(title)').extract_first()
            image_url = deal.css(
                'div.img-bucket a.img-link img::attr(data-src)').extract_first()
            percentage = deal.css(
                'div.img-bucket a.img-link div.discounts-box span.discount::text'
            ).extract_first()

            item['web_source'] = 'souq'
            item['deal_title'] = title
            item['deal_image_url'] = image_url
            if percentage:
                # e.g. "40%" -> "40": keep only the number before the percent sign
                item['deal_percentage'] = percentage.split("%")[0].strip()
            else:
                item['deal_percentage'] = '0'
            obj = ScrapyItem.objects.filter(
                deal_title=item['deal_title']).first()
            if not obj:
                scrapy_item = ScrapyItem()
                scrapy_item.web_source = item['web_source']
                scrapy_item.deal_title = item['deal_title']
                scrapy_item.deal_image_url = item['deal_image_url']
                scrapy_item.deal_percentage = item['deal_percentage']
                scrapy_item.save()

            url = response.urljoin(url)
            # Same hand-off as in the jumia spider: wrap the item in a dict.
            yield scrapy.Request(url=url,
                                 callback=self.parse_details,
                                 meta={'item': item})
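Both spiders fill a ScrapyAppItem whose declaration is not shown; inferred from the keys they assign, it would look roughly like this (deal_url is only needed for the hypothetical parse_details sketched above):

# items.py -- sketch inferred from the fields the spiders assign
import scrapy


class ScrapyAppItem(scrapy.Item):
    web_source = scrapy.Field()
    deal_title = scrapy.Field()
    deal_image_url = scrapy.Field()
    deal_percentage = scrapy.Field()
    deal_url = scrapy.Field()  # assumed; see the parse_details sketch above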
Example #4
    def process_item(self, item, spider):
        # Insert the deal only if no row with this title exists yet.
        obj = ScrapyItem.objects.filter(deal_title=item['deal_title']).first()
        if not obj:
            scrapy_item = ScrapyItem()
            scrapy_item.unique_id = self.unique_id
            scrapy_item.web_source = item['web_source']
            scrapy_item.deal_title = item['deal_title']
            scrapy_item.deal_image_url = item['deal_image_url']
            scrapy_item.deal_percentage = item['deal_percentage']
            scrapy_item.save()
        return item
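The filter-then-save pattern above issues two queries and leaves a race window between them; Django's get_or_create expresses the same deduplication in one call (still racy without a unique constraint on deal_title, but shorter):

    def process_item(self, item, spider):
        # Look the row up by deal_title and insert it only if it is missing.
        ScrapyItem.objects.get_or_create(
            deal_title=item['deal_title'],
            defaults={
                'unique_id': self.unique_id,
                'web_source': item['web_source'],
                'deal_image_url': item['deal_image_url'],
                'deal_percentage': item['deal_percentage'],
            })
        return item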
Example #5
    def close_spider(self, spider):
        # And here we are saving our crawled data with Django models.
        item = ScrapyItem()
        item.unique_id = self.unique_id
        item.data = json.dumps(self.items)
        item.save()
Example #6
    def __init__(self, unique_id, stats, publisher, *args, **kwargs):
        self.unique_id = unique_id
        self.stats = stats
        self.publisher = publisher
        self.items = []
        self.item_all = ScrapyItem()
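from_crawler in Examples #1 and #6 reads unique_id and publisher back out of the crawler settings, so whatever launches the crawl has to put them there first. A minimal sketch with plain Scrapy APIs; the spider class and import path are placeholders, and real deployments usually schedule through scrapyd instead, since CrawlerProcess blocks and can only run once per process:

import uuid

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from scrapy_app.spiders import DealSpider  # hypothetical spider and module path


def run_crawl():
    settings = get_project_settings()
    # Inject the values that ScrapyAppPipeline.from_crawler reads back out.
    settings.set('unique_id', str(uuid.uuid4()))
    settings.set('publisher', None)  # placeholder; the snippets never show its type
    process = CrawlerProcess(settings)
    process.crawl(DealSpider)
    process.start()  # blocks until the crawl finishes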
Example #7
    def process_item(self, item, spider):
        # Copy every scraped field onto a fresh Django model row and persist it.
        scrapy_item = ScrapyItem()
        scrapy_item.unique_id = self.unique_id
        scrapy_item.title = item['title']
        scrapy_item.contents = item['contents']
        scrapy_item.published_date = item['published_date']
        scrapy_item.views = item['views']
        scrapy_item.recommends = item['recommends']
        scrapy_item.url = item['url']
        scrapy_item.category = item['category']

        scrapy_item.save()
        return item
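None of these pipelines runs until it is registered in the Scrapy project's settings; that takes a single ITEM_PIPELINES entry (the module path here is assumed):

# settings.py -- the number is the pipeline's order; lower values run first
ITEM_PIPELINES = {
    'scrapy_app.pipelines.ScrapyAppPipeline': 300,  # path assumed
}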