Beispiel #1
0
class SavePipeline(object):
    """Item pipeline that gathers JobItem data per spider into one Data record.

    While a spider runs, each JobItem is converted to a plain dict and
    appended to ``self.spider_data.source``.  The record is saved when the
    spider closes.
    """

    def __init__(self):
        """Initialise state and subscribe to the spider opened/closed signals."""
        self.duplicates = {}
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_opened(self, spider):
        """Delete existing Data rows named after *spider* and start a new record.

        The new record is only held in memory here; it is saved in
        ``spider_closed``.
        """
        Data.objects.filter(name=spider.name).delete()
        self.spider_data = Data(name=spider.name)
        self.spider_data.source = []

    def spider_closed(self, spider):
        """Save the collected record, trigger post-processing, fix image perms."""
        self.spider_data.save()

        # Commit before queueing so the follow-up job can see the saved row.
        # NOTE(review): ProcessSpiderData.delay looks like an asynchronous
        # task dispatch -- confirm against its definition.
        transaction.commit()
        ProcessSpiderData.delay(spider.name)

        for subdir in ('full', 'thumbs'):
            os.system('chmod -R 755 %s/%s' % (settings['IMAGES_STORE'], subdir))

    def process_item(self, item, spider):
        """Record a JobItem in the current spider's data and pass the item on.

        Items that are not JobItem instances are returned untouched so that
        later pipeline stages still receive them.

        :param item: the scraped item.
        :param spider: the spider that produced the item.
        :returns: the unmodified ``item``.
        """
        if isinstance(item, JobItem):
            data = dict(item)

            images = data.get('images')
            if images:
                url = images[0]['url']
                # sha1 needs bytes; encode text URLs explicitly so non-ASCII
                # URLs do not fail (ASCII URLs hash exactly as before).
                if not isinstance(url, bytes):
                    url = url.encode('utf-8')
                data['image'] = "%s.jpg" % hashlib.sha1(url).hexdigest()
                # 'image_urls' may be absent; do not fail on a missing key.
                data.pop('image_urls', None)
                del data['images']

            self.spider_data.source.append(data)

            log.msg('Saved source "%s", document id: "%s"' %
                (spider.name, data.get('id')), level=log.INFO
            )

        # Always hand the item back; returning None would break later stages.
        return item
Beispiel #2
0
 def spider_opened(self, spider):
     """Delete existing Data rows named after *spider* and start a new record.

     The new record is stored on ``self.spider_data`` with an empty
     ``source`` list; nothing is saved here.
     """
     spider_name = spider.name
     Data.objects.filter(name=spider_name).delete()

     record = Data(name=spider_name)
     self.spider_data = record
     record.source = []