Ejemplo n.º 1
0
class MemoryBaseBackend(Backend):
    """
    Shared scaffolding for backends that queue requests in an in-memory Heap.

    Every request seen so far is indexed by ``meta['fingerprint']`` in
    ``self.requests``; requests waiting to be handed out sit in ``self.heap``,
    which is ordered by ``_compare_pages`` (to be supplied by subclasses).
    """
    component_name = 'Memory Base Backend'

    def __init__(self, manager):
        """Keep a reference to ``manager`` and start with empty storage."""
        self.requests = {}
        self.manager = manager
        self.heap = Heap(self._compare_pages)

    @classmethod
    def from_manager(cls, manager):
        """Alternate constructor: build the backend from ``manager``."""
        return cls(manager)

    def frontier_start(self):
        """Nothing to do when the frontier starts."""

    def frontier_stop(self):
        """Nothing to do when the frontier stops."""

    def add_seeds(self, seeds):
        """
        Push every seed onto the heap.

        Seeds whose fingerprint is already known are not re-created, but the
        stored request is pushed again all the same.
        """
        for seed in seeds:
            stored = self._get_or_create_request(seed)[0]
            self.heap.push(stored)

    def get_next_requests(self, max_next_requests, **kwargs):
        """Return ``self.heap.pop(max_next_requests)``; ``kwargs`` are unused."""
        batch = self.heap.pop(max_next_requests)
        return batch

    def page_crawled(self, response, links):
        """
        Queue each previously unseen link, one level deeper than ``response``.

        Links whose fingerprint is already stored are left untouched.
        """
        for link in links:
            request, is_new = self._get_or_create_request(link)
            if not is_new:
                continue
            # A parent without a recorded depth counts as depth 0.
            parent_depth = response.request.meta.get('depth', 0)
            request.meta['depth'] = parent_depth + 1
            self.heap.push(request)

    def request_error(self, request, error):
        """Errors are ignored by this base implementation."""

    def _get_or_create_request(self, request):
        """
        Look ``request`` up by fingerprint, storing a fresh copy if unknown.

        Returns a ``(stored_request, created)`` tuple where ``created`` tells
        whether a new entry had to be made.
        """
        fingerprint = request.meta['fingerprint']
        if fingerprint in self.requests:
            known = self.requests[fingerprint]
            self.manager.logger.backend.debug('Request exists %s' % request)
            return known, False

        fresh = self._create_request(request)
        self.requests[fingerprint] = fresh
        self.manager.logger.backend.debug('Creating request %s' % fresh)
        return fresh, True

    def _create_request(self, request):
        """Copy ``request`` and stamp the copy with a UTC creation time and depth 0."""
        clone = request.copy()
        clone.meta['created_at'] = datetime.datetime.utcnow()
        clone.meta['depth'] = 0
        return clone

    def _compare_pages(self, first, second):
        """Ordering function handed to the Heap; subclasses must override it."""
        raise NotImplementedError
Ejemplo n.º 2
0
 def test_heap_order(self):
     """Integers pushed out of order must be popped in ascending order."""
     heap = Heap(cmp)
     for value in (5, 2, 3, 4, 1):
         heap.push(value)
     assert heap.pop(1) == [1]
     assert heap.pop(3) == [2, 3, 4]
     # Asking for more than is left yields only the remaining item.
     assert heap.pop(10) == [5]
     # An exhausted heap pops an empty list.
     assert heap.pop(1) == []
Ejemplo n.º 3
0
 def __init__(self, partitions):
     """
     Create one Heap per partition id in ``0 .. partitions - 1``.

     :param partitions: number of partitions to create.
     """
     self.partitions = list(range(partitions))
     self.partitioner = Crc32NamePartitioner(self.partitions)
     self.logger = logging.getLogger("memory.queue")
     # Each partition gets its own heap, all ordered by _compare_pages.
     self.heap = {
         partition_id: Heap(self._compare_pages)
         for partition_id in self.partitions
     }
Ejemplo n.º 4
0
 def __init__(self, partitions):
     """
     Create one Heap per partition id in ``0 .. partitions - 1``.

     :param partitions: number of partitions to create.
     """
     self.partitions = list(range(partitions))
     self.partitioner = Crc32NamePartitioner(self.partitions)
     self.logger = logging.getLogger("frontera.contrib.backends.memory.MemoryQueue")
     # Each partition gets its own heap, all ordered by _compare_pages.
     self.heap = {
         partition_id: Heap(self._compare_pages)
         for partition_id in self.partitions
     }
Ejemplo n.º 5
0
 def test_push_limit(self):
     """With ``limit=2`` a third push hands an item back and the size stays 2."""
     limited = Heap(compare_request, limit=2)
     self.heap = limited
     for req in (req1, req2):
         limited.push(req)
     returned = limited.push(req3)
     assert returned == req2
     assert len(limited) == 2
     assert limited.pop(2) == [req3, req1]
 def test_heap_obj(self):
     """Objects compared through their ``score`` pop lowest score first."""
     obj = type('obj', (object, ), {})
     a, b, c = obj(), obj(), obj()
     a.score, b.score, c.score = 3, 1, 2
     heap = Heap(lambda left, right: cmp(left.score, right.score))
     for item in (a, b, c):
         heap.push(item)
     assert heap.pop(3) == [b, c, a]
     # Nothing is left after everything has been popped.
     assert heap.pop(1) == []
Ejemplo n.º 7
0
 def test_heap_obj(self):
     """Objects compared through their ``score`` pop lowest score first."""
     obj = type("obj", (object,), {})
     a, b, c = obj(), obj(), obj()
     a.score, b.score, c.score = 3, 1, 2
     heap = Heap(lambda left, right: cmp(left.score, right.score))
     for item in (a, b, c):
         heap.push(item)
     assert heap.pop(3) == [b, c, a]
     # Nothing is left after everything has been popped.
     assert heap.pop(1) == []
 def test_heap_order(self):
     """Integers pushed out of order must be popped in ascending order."""
     heap = Heap(cmp)
     for value in (5, 2, 3, 4, 1):
         heap.push(value)
     assert heap.pop(1) == [1]
     assert heap.pop(3) == [2, 3, 4]
     # Asking for more than is left yields only the remaining item.
     assert heap.pop(10) == [5]
     # An exhausted heap pops an empty list.
     assert heap.pop(1) == []
Ejemplo n.º 9
0
 def __init__(self, manager):
     """Keep a reference to ``manager`` and start with empty storage."""
     # Lookup table of stored requests; starts empty.
     self.requests = {}
     self.manager = manager
     # Pending queue, ordered by this object's _compare_pages.
     self.heap = Heap(self._compare_pages)
Ejemplo n.º 10
0
 def setUp(self):
     """Bind a new Heap ordered by ``compare_request`` to ``self.heap``."""
     fresh_heap = Heap(compare_request)
     self.heap = fresh_heap
Ejemplo n.º 11
0
class HeapTest(unittest.TestCase):
    """Checks push, pop and the size limit of ``Heap`` ordered by ``compare_request``."""

    def setUp(self):
        """Give every test its own unlimited heap."""
        self.heap = Heap(compare_request)

    def test_push(self):
        """Two pushes leave two items in the heap."""
        self.heap.push(req1)
        self.heap.push(req2)
        assert len(self.heap) == 2

    def test_pop0_as_pop0(self):
        """``pop(0)`` hands back a single item, exactly like ``pop(1)``."""
        # Parenthesised single-argument form: prints the same on Python 2
        # and, unlike the bare print statement, also parses on Python 3.
        print(len(self.heap))
        self.heap.push(req1)
        req = self.heap.pop(0)
        assert len(req) == 1
        assert req[0] == req1
        self.heap.push(req1)
        req = self.heap.pop(1)
        assert len(req) == 1
        assert req[0] == req1

    def test_pop_smallest_in_heap(self):
        """After pushing req1 then req2, ``pop(0)`` yields req2 first."""
        self.heap.push(req1)
        self.heap.push(req2)
        req = self.heap.pop(0)
        assert req[0] == req2

    def test_push_limit(self):
        """With ``limit=2`` a third push hands an item back and the size stays 2."""
        self.heap = Heap(compare_request, limit=2)
        self.heap.push(req1)
        self.heap.push(req2)
        p = self.heap.push(req3)
        # The over-limit push returns req2; req3 and req1 remain queued.
        assert p == req2
        assert len(self.heap) == 2
        assert self.heap.pop(2) == [req3, req1]
Ejemplo n.º 12
0
class MemoryBaseBackend(Backend):
    """
    Shared scaffolding for backends that queue requests in an in-memory Heap.

    Every request seen so far is indexed by ``meta['fingerprint']`` in
    ``self.requests``; requests waiting to be handed out sit in ``self.heap``,
    which is ordered by ``_compare_pages`` (to be supplied by subclasses).
    """
    component_name = 'Memory Base Backend'

    def __init__(self, manager):
        """Keep a reference to ``manager`` and start with empty storage."""
        self.requests = {}
        self.manager = manager
        self.heap = Heap(self._compare_pages)

    @classmethod
    def from_manager(cls, manager):
        """Alternate constructor: build the backend from ``manager``."""
        return cls(manager)

    def frontier_start(self):
        """Nothing to do when the frontier starts."""

    def frontier_stop(self):
        """Nothing to do when the frontier stops."""

    def add_seeds(self, seeds):
        """
        Push every seed onto the heap.

        Seeds whose fingerprint is already known are not re-created, but the
        stored request is pushed again all the same.
        """
        for seed in seeds:
            stored = self._get_or_create_request(seed)[0]
            self.heap.push(stored)

    def get_next_requests(self, max_next_requests, **kwargs):
        """Return ``self.heap.pop(max_next_requests)``; ``kwargs`` are unused."""
        batch = self.heap.pop(max_next_requests)
        return batch

    def page_crawled(self, response, links):
        """
        Queue each previously unseen link, one level deeper than ``response``.

        Links whose fingerprint is already stored are left untouched.
        """
        for link in links:
            request, is_new = self._get_or_create_request(link)
            if not is_new:
                continue
            # A parent without a recorded depth counts as depth 0.
            parent_depth = response.request.meta.get('depth', 0)
            request.meta['depth'] = parent_depth + 1
            self.heap.push(request)

    def request_error(self, request, error):
        """Errors are ignored by this base implementation."""

    def _get_or_create_request(self, request):
        """
        Look ``request`` up by fingerprint, storing a fresh copy if unknown.

        Returns a ``(stored_request, created)`` tuple where ``created`` tells
        whether a new entry had to be made.
        """
        fingerprint = request.meta['fingerprint']
        if fingerprint in self.requests:
            known = self.requests[fingerprint]
            self.manager.logger.backend.debug('Request exists %s' % request)
            return known, False

        fresh = self._create_request(request)
        self.requests[fingerprint] = fresh
        self.manager.logger.backend.debug('Creating request %s' % fresh)
        return fresh, True

    def _create_request(self, request):
        """Copy ``request`` and stamp the copy with a UTC creation time and depth 0."""
        clone = request.copy()
        clone.meta['created_at'] = datetime.datetime.utcnow()
        clone.meta['depth'] = 0
        return clone

    def _compare_pages(self, first, second):
        """Ordering function handed to the Heap; subclasses must override it."""
        raise NotImplementedError
Ejemplo n.º 13
0
 def __init__(self, manager):
     """Keep a reference to ``manager`` and start with empty storage."""
     # Lookup table of stored requests; starts empty.
     self.requests = {}
     self.manager = manager
     # Pending queue, ordered by this object's _compare_pages.
     self.heap = Heap(self._compare_pages)
Ejemplo n.º 14
0
 def __init__(self, partitioner):
     """
     Create one Heap for every entry of ``partitioner.partitions``.

     :param partitioner: object exposing a ``partitions`` iterable.
     """
     self.partitioner = partitioner
     self.logger = logging.getLogger("memory.queue")
     # Each partition gets its own heap, all ordered by _compare_pages.
     self.heap = {
         partition_id: Heap(self._compare_pages)
         for partition_id in self.partitioner.partitions
     }