Example #1
0
class Scheduler(object):

    def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None, logunser=False, stats=None):
        """Store the scheduler's collaborators.

        ``dupefilter`` is consulted before queuing requests, ``jobdir`` is
        turned into the disk-queue directory by ``_dqdir``, and ``dqclass`` /
        ``mqclass`` are the disk and memory queue classes. ``logunser``
        enables logging of requests that cannot be serialized.
        """
        self.df = dupefilter
        self.dqdir = self._dqdir(jobdir)
        # The remaining attributes are plain pass-through values.
        self.dqclass, self.mqclass = dqclass, mqclass
        self.logunser, self.stats = logunser, stats

    @classmethod
    def from_crawler(cls, crawler):
        """Alternate constructor: build a scheduler from ``crawler.settings``.

        The dupefilter and both queue classes are resolved with
        ``load_object`` from their setting values; the crawler's stats object
        is passed through unchanged.
        """
        cfg = crawler.settings
        filter_factory = load_object(cfg['DUPEFILTER_CLASS'])
        request_filter = filter_factory.from_settings(cfg)
        disk_queue_cls = load_object(cfg['SCHEDULER_DISK_QUEUE'])
        memory_queue_cls = load_object(cfg['SCHEDULER_MEMORY_QUEUE'])
        log_unserializable = cfg.getbool('LOG_UNSERIALIZABLE_REQUESTS')
        return cls(
            request_filter,
            job_dir(cfg),
            disk_queue_cls,
            memory_queue_cls,
            log_unserializable,
            crawler.stats,
        )

    def has_pending_requests(self):
        """Return True when at least one request is still queued."""
        pending = len(self)
        return pending > 0

    def open(self, spider):
        """Bind the scheduler to ``spider`` and create its queues.

        A memory priority queue is always created; the disk queue is only
        built when a disk-queue directory is configured. Returns whatever
        the dupefilter's ``open()`` returns.
        """
        self.spider = spider
        self.mqs = PriorityQueue(self._newmq)
        if self.dqdir:
            self.dqs = self._dq()
        else:
            self.dqs = None
        return self.df.open()

    def close(self, reason):
        """Close the disk queues, persist their active priorities and close
        the dupefilter.

        ``reason`` is forwarded to the dupefilter's ``close``; its return
        value is returned to the caller.
        """
        # Test against None instead of truthiness: an *empty* disk queue is
        # falsy, and skipping it would leave its queues unclosed and a stale
        # ``active.json`` on disk for the next run to pick up.
        if self.dqs is not None:
            prios = self.dqs.close()
            with open(join(self.dqdir, 'active.json'), 'w') as f:
                json.dump(prios, f)
        return self.df.close(reason)

    def enqueue_request(self, request):
        """Queue ``request`` unless the dupefilter has already seen it.

        Requests flagged ``dont_filter`` bypass the dupefilter. The disk
        queue is tried first; the memory queue is the fallback.
        """
        accepted = request.dont_filter or not self.df.request_seen(request)
        if accepted:
            stored_on_disk = self._dqpush(request)
            if not stored_on_disk:
                self._mqpush(request)

    def next_request(self):
        """Pop the next request, preferring the memory queue over disk."""
        candidate = self.mqs.pop()
        if candidate:
            return candidate
        return self._dqpop()

    def __len__(self):
        """Return the number of queued requests across disk and memory."""
        if self.dqs:
            return len(self.dqs) + len(self.mqs)
        return len(self.mqs)

    def _dqpush(self, request):
        if self.dqs is None:
            return
        try:
            reqd = request_to_dict(request, self.spider)
            self.dqs.push(reqd, -request.priority)
        except ValueError, e: # non serializable request
            if self.logunser:
                log.msg("Unable to serialize request: %s - reason: %s" % \
                    (request, str(e)), level=log.ERROR, spider=self.spider)
            return
        else:
Example #2
0
class Scheduler(object):

    def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None, logunser=False):
        """Store the scheduler's collaborators.

        ``jobdir`` is converted to the disk-queue directory by ``_dqdir``;
        ``dqclass`` / ``mqclass`` are the disk and memory queue classes and
        ``logunser`` enables logging of requests that cannot be serialized.
        """
        self.df = dupefilter
        self.dqdir = self._dqdir(jobdir)
        # Plain pass-through configuration values.
        self.dqclass, self.mqclass = dqclass, mqclass
        self.logunser = logunser

    @classmethod
    def from_settings(cls, settings):
        """Alternate constructor: build a scheduler from a settings object.

        The dupefilter and both queue classes are resolved with
        ``load_object`` from their setting values.
        """
        filter_factory = load_object(settings['DUPEFILTER_CLASS'])
        request_filter = filter_factory.from_settings(settings)
        disk_queue_cls = load_object(settings['SCHEDULER_DISK_QUEUE'])
        memory_queue_cls = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        log_unserializable = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
        return cls(
            request_filter,
            job_dir(settings),
            disk_queue_cls,
            memory_queue_cls,
            log_unserializable,
        )

    def has_pending_requests(self):
        """Return True when at least one request is still queued."""
        pending = len(self)
        return pending > 0

    def open(self, spider):
        """Bind the scheduler to ``spider`` and create its queues.

        The memory priority queue is always created; the disk queue only
        when a disk-queue directory is configured. Returns the result of the
        dupefilter's ``open()``.
        """
        self.spider = spider
        self.mqs = PriorityQueue(self._newmq)
        if self.dqdir:
            self.dqs = self._dq()
        else:
            self.dqs = None
        return self.df.open()

    def close(self, reason):
        """Close the disk queues, persist their active priorities and close
        the dupefilter.

        ``reason`` is forwarded to the dupefilter's ``close``; its return
        value is returned to the caller.
        """
        # Test against None instead of truthiness: an *empty* disk queue is
        # falsy, and skipping it would leave its queues unclosed and a stale
        # ``active.json`` on disk for the next run to pick up.
        if self.dqs is not None:
            prios = self.dqs.close()
            with open(join(self.dqdir, 'active.json'), 'w') as f:
                json.dump(prios, f)
        return self.df.close(reason)

    def enqueue_request(self, request):
        """Queue ``request`` unless the dupefilter has already seen it.

        Requests flagged ``dont_filter`` bypass the dupefilter. The disk
        queue is tried first; the memory queue is the fallback.
        """
        accepted = request.dont_filter or not self.df.request_seen(request)
        if accepted:
            stored_on_disk = self._dqpush(request)
            if not stored_on_disk:
                self._mqpush(request)

    def next_request(self):
        """Pop the next request, preferring the memory queue over disk."""
        candidate = self.mqs.pop()
        if candidate:
            return candidate
        return self._dqpop()

    def __len__(self):
        """Return the number of queued requests across disk and memory."""
        if self.dqs:
            return len(self.dqs) + len(self.mqs)
        return len(self.mqs)

    def _dqpush(self, request):
        if self.dqs is None:
            return
        try:
            reqd = request_to_dict(request, self.spider)
            self.dqs.push(reqd, -request.priority)
        except ValueError, e: # non serializable request
            if self.logunser:
                log.msg("Unable to serialize request: %s - reason: %s" % \
                    (request, str(e)), level=log.ERROR, spider=self.spider)
            return
        else:
Example #3
0
 def _dq(self):
     """Build the disk-backed priority queue, restoring saved priorities.

     When ``active.json`` exists in ``self.dqdir`` its contents are passed
     to the queue as ``startprios``; otherwise an empty tuple is used. A
     message is logged when the resulting queue is non-empty.
     """
     state_path = join(self.dqdir, 'active.json')
     saved_prios = ()
     if exists(state_path):
         with open(state_path) as state_file:
             saved_prios = json.load(state_file)
     disk_queue = PriorityQueue(self._newdq, startprios=saved_prios)
     if disk_queue:
         pending = len(disk_queue)
         log.msg("Resuming crawl (%d requests scheduled)" % pending,
                 spider=self.spider)
     return disk_queue
Example #4
0
 def open(self, spider):
     """Bind the scheduler to ``spider`` and create its queues.

     The memory priority queue is always created; the disk queue only when
     ``self.dqdir`` is set. Returns the result of the dupefilter's
     ``open()``.
     """
     self.spider = spider
     self.mqs = PriorityQueue(self._newmq)
     if self.dqdir:
         self.dqs = self._dq()
     else:
         self.dqs = None
     return self.df.open()
Example #5
0
 def setUp(self):
     """Create the priority queue under test from ``self.qfactory``."""
     factory = self.qfactory
     self.q = PriorityQueue(factory)
Example #6
0
class FifoMemoryPriorityQueueTest(unittest.TestCase):
    """Exercise PriorityQueue on top of in-memory FIFO queues.

    Lower priority numbers are popped first; items sharing a priority come
    out in insertion order.
    """

    def setUp(self):
        self.q = PriorityQueue(self.qfactory)

    def qfactory(self, prio):
        # presumably track_closed adds the ``closed`` flag checked by the
        # close tests below -- confirm against its definition
        return track_closed(FifoMemoryQueue)()

    def test_push_pop_noprio(self):
        self.q.push('a')
        self.q.push('b')
        self.q.push('c')
        self.assertEqual(self.q.pop(), 'a')
        self.assertEqual(self.q.pop(), 'b')
        self.assertEqual(self.q.pop(), 'c')
        self.assertEqual(self.q.pop(), None)

    def test_push_pop_prio(self):
        self.q.push('a', 3)
        self.q.push('b', 1)
        self.q.push('c', 2)
        self.q.push('d', 1)
        self.assertEqual(self.q.pop(), 'b')
        self.assertEqual(self.q.pop(), 'd')
        self.assertEqual(self.q.pop(), 'c')
        self.assertEqual(self.q.pop(), 'a')
        self.assertEqual(self.q.pop(), None)

    def test_len_nonzero(self):
        # unittest assertions are used instead of bare ``assert`` so the
        # checks survive ``python -O`` and report useful failure messages.
        self.assertFalse(self.q)
        self.assertEqual(len(self.q), 0)
        self.q.push('a', 3)
        self.assertTrue(self.q)
        self.q.push('b', 1)
        self.q.push('c', 2)
        self.q.push('d', 1)
        self.assertEqual(len(self.q), 4)
        self.q.pop()
        self.q.pop()
        self.q.pop()
        self.q.pop()
        self.assertFalse(self.q)
        self.assertEqual(len(self.q), 0)

    def test_close(self):
        self.q.push('a', 3)
        self.q.push('b', 1)
        self.q.push('c', 2)
        self.q.push('d', 1)
        # Snapshot the internal queues *before* closing: a live dict view
        # could be emptied by close(), making the check below vacuous.
        iqueues = list(self.q.queues.values())
        self.assertEqual(sorted(self.q.close()), [1, 2, 3])
        self.assertTrue(all(q.closed for q in iqueues))

    def test_close_return_active(self):
        self.q.push('b', 1)
        self.q.push('c', 2)
        self.q.push('a', 3)
        self.q.pop()
        self.assertEqual(sorted(self.q.close()), [2, 3])

    def test_popped_internal_queues_closed(self):
        self.q.push('a', 3)
        self.q.push('b', 1)
        self.q.push('c', 2)
        p1queue = self.q.queues[1]
        self.assertEqual(self.q.pop(), 'b')
        self.q.close()
        self.assertTrue(p1queue.closed)
Example #7
0
class FifoDiskPriorityQueueTest(FifoMemoryPriorityQueueTest):
    """Run the inherited priority-queue tests against disk-backed FIFO
    queues and add checks for objects that cannot be serialized."""

    def setUp(self):
        self.q = PriorityQueue(self.qfactory)

    def qfactory(self, prio):
        disk_queue_cls = track_closed(FifoDiskQueue)
        return disk_queue_cls(self.mktemp())

    def _push_all(self, *pairs):
        # Push each (item, priority) pair in the given order.
        for item, prio in pairs:
            self.q.push(item, prio)

    def _assert_rejects_unserializable(self):
        # A lambda stands in for an object the disk queue cannot serialize.
        unserializable = lambda x: x
        self.assertRaises(TypeError, self.q.push, unserializable, 0)

    def test_nonserializable_object_one(self):
        self._assert_rejects_unserializable()
        self.assertEqual(self.q.close(), [])

    def test_nonserializable_object_many_close(self):
        self._push_all(('a', 3), ('b', 1))
        self._assert_rejects_unserializable()
        self._push_all(('c', 2))
        self.assertEqual(self.q.pop(), 'b')
        self.assertEqual(sorted(self.q.close()), [2, 3])

    def test_nonserializable_object_many_pop(self):
        self._push_all(('a', 3), ('b', 1))
        self._assert_rejects_unserializable()
        self._push_all(('c', 2))
        for expected in ('b', 'c', 'a', None):
            self.assertEqual(self.q.pop(), expected)
        self.assertEqual(self.q.close(), [])
Example #8
0
class Scheduler(object):
    """Request scheduler backed by a memory queue and an optional disk queue.

    Requests are filtered through ``dupefilter`` and then pushed to the disk
    queue when one is configured and the request can be serialized;
    otherwise they go to the memory queue. The memory queue is drained
    before the disk queue.
    """

    def __init__(self, dupefilter, jobdir=None, dqclass=None):
        """Store collaborators; disk queues live in ``<jobdir>/requests.queue``
        when ``jobdir`` is given, otherwise the disk queue is disabled."""
        self.df = dupefilter
        self.dqdir = join(jobdir, 'requests.queue') if jobdir else None
        self.dqclass = dqclass

    @classmethod
    def from_settings(cls, settings):
        """Alternate constructor: resolve the dupefilter and disk queue class
        from ``settings`` via ``load_object``."""
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = dupefilter_cls.from_settings(settings)
        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        return cls(dupefilter, job_dir(settings), dqclass)

    def has_pending_requests(self):
        """Return True when at least one request is still queued."""
        return len(self) > 0

    def open(self, spider):
        """Bind to ``spider``, create the queues and open the dupefilter."""
        self.spider = spider
        self.mqs = PriorityQueue(self._newmq)
        self.dqs = self._dq() if self.dqdir else None
        return self.df.open()

    def close(self, reason):
        """Close the disk queues, persist their active priorities to
        ``active.json`` and close the dupefilter.

        Returns whatever the dupefilter's ``close()`` returns. ``reason`` is
        accepted for interface compatibility and is not used here.
        """
        # Test against None instead of truthiness: an *empty* disk queue is
        # falsy, and skipping it would leave its queues unclosed and a stale
        # ``active.json`` on disk for the next run to pick up.
        if self.dqs is not None:
            prios = self.dqs.close()
            with open(join(self.dqdir, 'active.json'), 'w') as f:
                json.dump(prios, f)
        return self.df.close()

    def enqueue_request(self, request):
        """Queue ``request`` unless the dupefilter has already seen it.

        Requests flagged ``dont_filter`` bypass the dupefilter. The disk
        queue is tried first; the memory queue is the fallback.
        """
        if not request.dont_filter and self.df.request_seen(request):
            return
        if not self._dqpush(request):
            self._mqpush(request)

    def next_request(self):
        """Pop the next request, preferring the memory queue over disk."""
        return self.mqs.pop() or self._dqpop()

    def __len__(self):
        """Return the number of queued requests across disk and memory."""
        # An empty or missing disk queue contributes nothing either way.
        return len(self.dqs) + len(self.mqs) if self.dqs else len(self.mqs)

    def _dqpush(self, request):
        """Try to store ``request`` on disk; return True on success.

        Returns None when there is no disk queue or the request cannot be
        serialized, which tells the caller to fall back to memory.
        """
        if self.dqs is None:
            return
        try:
            reqd = request_to_dict(request, self.spider)
            # NOTE(review): priority is passed through unchanged; confirm
            # the pop order this gives matches the intended semantics.
            self.dqs.push(reqd, request.priority)
        except ValueError: # non serializable request
            return
        else:
            stats.inc_value('scheduler/disk_enqueued', spider=self.spider)
            return True

    def _mqpush(self, request):
        """Store ``request`` in the memory queue and count it in stats."""
        stats.inc_value('scheduler/memory_enqueued', spider=self.spider)
        self.mqs.push(request, request.priority)

    def _dqpop(self):
        """Pop a request from the disk queue, rebuilding it from its dict
        form; returns None when nothing is available."""
        if self.dqs:
            d = self.dqs.pop()
            if d:
                return request_from_dict(d, self.spider)

    def _newmq(self, priority):
        """Queue factory for the memory priority queue."""
        return MemoryQueue()

    def _newdq(self, priority):
        """Queue factory for the disk priority queue: one ``p<priority>``
        path per priority inside ``self.dqdir``."""
        return self.dqclass(join(self.dqdir, 'p%s' % priority))

    def _dq(self):
        """Build the disk priority queue, restoring the priorities saved in
        ``active.json`` when that file exists."""
        activef = join(self.dqdir, 'active.json')
        if exists(activef):
            with open(activef) as f:
                prios = json.load(f)
        else:
            prios = ()
        q = PriorityQueue(self._newdq, startprios=prios)
        if q:
            log.msg("Resuming crawl (%d requests scheduled)" % len(q), \
                spider=self.spider)
        return q
Example #9
0
 def setUp(self):
     """Create the queue under test; every priority gets a TestMemoryQueue."""
     def make_queue(priority):
         return TestMemoryQueue()
     self.q = PriorityQueue(make_queue)
Example #10
0
class PriorityQueueTest(unittest.TestCase):
    """Exercise PriorityQueue on top of TestMemoryQueue instances.

    Lower priority numbers are popped first; items sharing a priority come
    out in insertion order.
    """

    def setUp(self):
        qfactory = lambda x: TestMemoryQueue()
        self.q = PriorityQueue(qfactory)

    def test_push_pop_noprio(self):
        self.q.push('a')
        self.q.push('b')
        self.q.push('c')
        self.assertEqual(self.q.pop(), 'a')
        self.assertEqual(self.q.pop(), 'b')
        self.assertEqual(self.q.pop(), 'c')
        self.assertEqual(self.q.pop(), None)

    def test_push_pop_prio(self):
        self.q.push('a', 3)
        self.q.push('b', 1)
        self.q.push('c', 2)
        self.q.push('d', 1)
        self.assertEqual(self.q.pop(), 'b')
        self.assertEqual(self.q.pop(), 'd')
        self.assertEqual(self.q.pop(), 'c')
        self.assertEqual(self.q.pop(), 'a')
        self.assertEqual(self.q.pop(), None)

    def test_len_nonzero(self):
        # unittest assertions are used instead of bare ``assert`` so the
        # checks survive ``python -O`` and report useful failure messages.
        self.assertFalse(self.q)
        self.assertEqual(len(self.q), 0)
        self.q.push('a', 3)
        self.assertTrue(self.q)
        self.q.push('b', 1)
        self.q.push('c', 2)
        self.q.push('d', 1)
        self.assertEqual(len(self.q), 4)
        self.q.pop()
        self.q.pop()
        self.q.pop()
        self.q.pop()
        self.assertFalse(self.q)
        self.assertEqual(len(self.q), 0)

    def test_close(self):
        self.q.push('a', 3)
        self.q.push('b', 1)
        self.q.push('c', 2)
        self.q.push('d', 1)
        # Snapshot the internal queues *before* closing: a live dict view
        # could be emptied by close(), making the check below vacuous.
        iqueues = list(self.q.queues.values())
        self.assertEqual(sorted(self.q.close()), [1, 2, 3])
        self.assertTrue(all(q.closed for q in iqueues))

    def test_popped_internal_queues_closed(self):
        self.q.push('a', 3)
        self.q.push('b', 1)
        self.q.push('c', 2)
        p1queue = self.q.queues[1]
        self.assertEqual(self.q.pop(), 'b')
        self.q.close()
        self.assertTrue(p1queue.closed)
Example #11
0
class Scheduler(SettingObject):
    
    # Dotted path of the dupefilter class; resolved with load_object in
    # __init__ and instantiated with ``self.metas``.
    dupfilter_class = StringField(default="scrapy.dupefilter.RFPDupeFilter")
    # Dotted path of the disk queue class (becomes ``self.dqclass``).
    schedule_disk_queue = StringField(default="scrapy.squeue.PickleLifoDiskQueue")
    # Dotted path of the memory queue class (becomes ``self.mqclass``).
    schedule_memory_queue = StringField(default="scrapy.squeue.LifoMemoryQueue")
    # Becomes ``self.logunser``, which gates the "Unable to serialize
    # request" log message. NOTE(review): "unserailizable" is misspelled,
    # but the name is read by __init__, so renaming it is an interface change.
    log_unserailizable_requests = BooleanField(default=False)
    # Job directory; created by __init__ if missing, then handed to _dqdir.
    jobdir = StringField(default="")
    
    def __init__(self, settings):
        """Initialise the setting fields, then resolve the dupefilter, the
        queue classes and the job/disk-queue directories from them."""
        super(Scheduler, self).__init__(settings)
        # Resolve every configurable collaborator before touching state.
        filter_factory = load_object(self.dupfilter_class.to_value())
        request_filter = filter_factory(self.metas)
        disk_queue_cls = load_object(self.schedule_disk_queue.to_value())
        memory_queue_cls = load_object(self.schedule_memory_queue.to_value())
        log_unserializable = self.log_unserailizable_requests.to_value()

        self.df = request_filter
        # The job directory is created on demand before deriving dqdir.
        self.jobpath = self.__job_dir(self.jobdir.to_value())
        self.dqdir = self._dqdir(self.jobpath)
        self.dqclass = disk_queue_cls
        self.mqclass = memory_queue_cls
        self.logunser = log_unserializable
    
    def __job_dir(self, path):
        """Create ``path`` (including parents) when it is non-empty and does
        not exist yet; return ``path`` unchanged."""
        needs_creation = bool(path) and not os.path.exists(path)
        if needs_creation:
            os.makedirs(path)
        return path

    def has_pending_requests(self):
        """Return True when at least one request is still queued."""
        pending = len(self)
        return pending > 0

    def open(self, spider):
        """Bind the scheduler to ``spider`` and create its queues.

        The memory priority queue is always created; the disk queue only
        when a disk-queue directory is configured. Returns the result of the
        dupefilter's ``open()``.
        """
        self.spider = spider
        self.mqs = PriorityQueue(self._newmq)
        if self.dqdir:
            self.dqs = self._dq()
        else:
            self.dqs = None
        return self.df.open()

    def close(self, reason):
        """Close the disk queues, persist their active priorities and close
        the dupefilter.

        ``reason`` is forwarded to the dupefilter's ``close``; its return
        value is returned to the caller.
        """
        # Test against None instead of truthiness: an *empty* disk queue is
        # falsy, and skipping it would leave its queues unclosed and a stale
        # ``active.json`` on disk for the next run to pick up.
        if self.dqs is not None:
            prios = self.dqs.close()
            with open(join(self.dqdir, 'active.json'), 'w') as f:
                json.dump(prios, f)
        return self.df.close(reason)

    def enqueue_request(self, request):
        """Queue ``request`` unless the dupefilter has already seen it.

        Requests flagged ``dont_filter`` bypass the dupefilter. The disk
        queue is tried first; the memory queue is the fallback.
        """
        accepted = request.dont_filter or not self.df.request_seen(request)
        if accepted:
            stored_on_disk = self._dqpush(request)
            if not stored_on_disk:
                self._mqpush(request)

    def next_request(self):
        """Pop the next request, preferring the memory queue over disk."""
        candidate = self.mqs.pop()
        if candidate:
            return candidate
        return self._dqpop()

    def __len__(self):
        """Return the number of queued requests across disk and memory."""
        if self.dqs:
            return len(self.dqs) + len(self.mqs)
        return len(self.mqs)

    def _dqpush(self, request):
        if self.dqs is None:
            return
        try:
            reqd = request_to_dict(request, self.spider)
            self.dqs.push(reqd, -request.priority)
        except ValueError, e: # non serializable request
            if self.logunser:
                log.msg("Unable to serialize request: %s - reason: %s" % \
                    (request, str(e)), level=log.ERROR, spider=self.spider)
            return
        else: