class ScrapyAppPipeline(object):
    """Item pipeline that accumulates every scraped {url, title} pair and
    persists the running snapshot to a single ScrapyItem model row after
    each item is processed.
    """

    def __init__(self, unique_id, stats, publisher, *args, **kwargs):
        self.unique_id = unique_id
        self.stats = stats            # crawler stats collector
        self.publisher = publisher
        self.items = []               # accumulated {url, title} dicts
        # One model row, updated in place as items arrive.
        self.item_all = ScrapyItem()

    @classmethod
    def from_crawler(cls, crawler):
        """Alternate constructor used by Scrapy.

        unique_id / publisher are injected into the crawler settings by
        the Django view that launches the crawl.
        """
        return cls(
            unique_id=crawler.settings.get('unique_id'),
            publisher=crawler.settings.get('publisher'),
            stats=crawler.stats)

    def close_spider(self, spider):
        # Nothing to flush: process_item already saves after every item.
        pass

    def process_item(self, item, spider):
        """Append the item to the running list and re-save the snapshot row.

        Returns the item unchanged so downstream pipelines still run.
        """
        self.items.append({
            'url': item['item']['url'],
            'title': item['item']['title'],
        })
        self.item_all.unique_id = self.unique_id
        self.item_all.data = json.dumps(self.items)
        # The counter may be absent/zero early in the crawl; look it up
        # once instead of twice (the original also had a dead `else: pass`).
        scraped = self.stats.get_value('item_scraped_count')
        if scraped:
            self.item_all.item_scraped = scraped
        self.item_all.save()
        return item
def parse(self, response):
    """Parse a Jumia listing page.

    For every deal card: extract url/title/image/discount, store a
    ScrapyItem row the first time a title is seen, then follow the
    deal's detail page with the item attached as request meta.
    """
    for deal in response.css('div.sku'):
        item = ScrapyAppItem()
        url = deal.css('a.link::attr(href)').extract_first()
        title = deal.css('a.link h2.title span.name::text').extract_first()
        image_url = deal.css(
            'a.link div.image-wrapper img::attr(data-src)').extract_first()
        percentage = deal.css(
            'a.link div.price-container span.sale-flag-percent::text'
        ).extract_first()

        item['web_source'] = 'jumia'
        item['deal_title'] = title
        item['deal_image_url'] = image_url
        if percentage:
            # The sale flag is rendered like "-40%"; keep only the digits.
            # lstrip('-') instead of split('-', 1)[1] so a flag without a
            # leading "-" (e.g. "40%") no longer raises IndexError.
            item['deal_percentage'] = percentage.lstrip('-').split('%', 1)[0]
        else:
            item['deal_percentage'] = '0'

        # Only create a row the first time this deal title is seen.
        if not ScrapyItem.objects.filter(
                deal_title=item['deal_title']).first():
            scrapy_item = ScrapyItem()
            scrapy_item.web_source = item['web_source']
            scrapy_item.deal_title = item['deal_title']
            scrapy_item.deal_image_url = item['deal_image_url']
            scrapy_item.deal_percentage = item['deal_percentage']
            scrapy_item.save()

        yield scrapy.Request(url=response.urljoin(url),
                             callback=self.parse_details, meta=item)
def parse(self, response):
    """Walk the Souq deal grid: record any deal not yet stored and
    schedule a request to its detail page (item carried in meta).
    """
    card_link = 'div.img-bucket a.img-link'
    for deal in response.css('div.block-grid-large'):
        href = deal.css(card_link + '::attr(href)').extract_first()
        name = deal.css(
            'div ul li.title-row h6 span a::attr(title)').extract_first()
        img = deal.css(card_link + ' img::attr(data-src)').extract_first()
        discount = deal.css(
            card_link + ' div.discounts-box span.discount::text'
        ).extract_first()

        item = ScrapyAppItem()
        item['web_source'] = 'souq'
        item['deal_title'] = name
        item['deal_image_url'] = img
        # Discount flag looks like "40%"; keep everything before the "%".
        item['deal_percentage'] = (
            discount.split('%', -1)[0] if discount else '0')

        existing = ScrapyItem.objects.filter(
            deal_title=item['deal_title']).first()
        if existing is None:
            row = ScrapyItem()
            row.web_source = item['web_source']
            row.deal_title = item['deal_title']
            row.deal_image_url = item['deal_image_url']
            row.deal_percentage = item['deal_percentage']
            row.save()

        yield scrapy.Request(url=response.urljoin(href),
                             callback=self.parse_details, meta=item)
def process_item(self, item, spider):
    """Persist the deal once: titles already present in the table are
    skipped. Returns the item unchanged for downstream pipelines.
    """
    already_stored = ScrapyItem.objects.filter(
        deal_title=item['deal_title']).first()
    if already_stored is None:
        record = ScrapyItem()
        record.unique_id = self.unique_id
        record.web_source = item['web_source']
        record.deal_title = item['deal_title']
        record.deal_image_url = item['deal_image_url']
        record.deal_percentage = item['deal_percentage']
        record.save()
    return item
def close_spider(self, spider):
    """Flush the accumulated crawl results into a single ScrapyItem row
    (Django model) when the spider finishes.
    """
    record = ScrapyItem()
    record.unique_id = self.unique_id
    record.data = json.dumps(self.items)
    record.save()
def __init__(self, unique_id, stats, publisher, *args, **kwargs):
    """Store the crawl identity, stats collector and publisher handle,
    and prepare the accumulator plus its backing model row.
    """
    self.unique_id, self.stats, self.publisher = unique_id, stats, publisher
    # Running list of scraped entries and the single row they are saved to.
    self.items = []
    self.item_all = ScrapyItem()
def process_item(self, item, spider):
    """Copy every scraped article field onto a fresh ScrapyItem row,
    tag it with this crawl's unique_id, and save it.
    """
    record = ScrapyItem()
    record.unique_id = self.unique_id
    # Same fields, same order as before — just assigned in a loop.
    for field in ('title', 'contents', 'published_date', 'views',
                  'recommends', 'url', 'category'):
        setattr(record, field, item[field])
    record.save()
    return item