Beispiel #1
0
class ThrottleMixin(object):
    """Test mixin that builds a scheduler fixture and a sample request."""

    @mock.patch('crawling.distributed_scheduler.DistributedScheduler'
                '.setup')
    def setUp(self, a):
        """Create ``self.scheduler`` and reset ``self.req``.

        ``DistributedScheduler.setup`` is patched only while this method
        runs; the resulting mock arrives as ``a`` and is not used.
        """
        scheduler = DistributedScheduler(MagicMock(), False, 10, 3)
        # Publish the fixture before touching it, so it is set on the
        # instance even if a later step raises.
        self.scheduler = scheduler
        scheduler.open(MagicMock())
        scheduler.spider.name = 'link'
        self.req = None

    def get_request(self):
        """Return a new ``Request`` for ``http://ex.com`` with meta filled in."""
        request = Request('http://ex.com')

        meta_fields = (
            # required
            ('crawlid', "abc123"),
            ('appid', "myapp"),
            # optional defaults
            ('spiderid', "link"),
            ("attrs", {}),
            ("allowed_domains", ()),
            ("allow_regex", ()),
            ("deny_regex", ()),
            ("deny_extensions", None),
            ('curdepth', 0),
            ("maxdepth", 0),
            ('priority', 0),
            ('retry_times', 0),
            ('expires', 0),
            ('useragent', None),
        )
        # Assign one key at a time, in the listed order.
        for key, value in meta_fields:
            request.meta[key] = value

        return request
Beispiel #2
0
 def setUp(self, u, z):
     """Create ``self.scheduler`` and reset ``self.req``.

     ``u`` and ``z`` are not used here; presumably they are mocks injected
     by patch decorators outside this view -- TODO confirm.
     """
     scheduler = DistributedScheduler(MagicMock(), False, 60, 10, 3,
                                      MagicMock(), 10, 60, False, 60,
                                      False, False, '.*', True, 3600)
     # Publish the fixture before touching it, so it is set on the
     # instance even if a later step raises.
     self.scheduler = scheduler
     scheduler.open(MagicMock())
     scheduler.my_ip = 'ip'
     scheduler.spider.name = 'link'
     self.req = None
 def setUp(self, u, z):
     """Create ``self.scheduler`` and reset ``self.req``.

     ``u`` and ``z`` are not used here; presumably they are mocks injected
     by patch decorators outside this view -- TODO confirm.
     """
     scheduler = DistributedScheduler(MagicMock(), False, 60, 10, 3,
                                      MagicMock(), 10, 60, False, 60,
                                      False, False, '.*', True, 3600)
     # Publish the fixture before touching it, so it is set on the
     # instance even if a later step raises.
     self.scheduler = scheduler
     scheduler.open(MagicMock())
     scheduler.my_ip = 'ip'
     scheduler.spider.name = 'link'
     self.req = None
class ThrottleMixin(object):
    """Test mixin that builds a scheduler fixture and a sample request."""

    @mock.patch('crawling.distributed_scheduler.DistributedScheduler'
                '.update_ipaddress')
    @mock.patch('crawling.distributed_scheduler.DistributedScheduler'
                '.setup_zookeeper')
    def setUp(self, u, z):
        """Create ``self.scheduler`` and reset ``self.req``.

        Both patches are active only while this method runs.  Stacked
        ``mock.patch`` decorators inject their mocks bottom-up, so ``u`` is
        the ``setup_zookeeper`` mock and ``z`` is the ``update_ipaddress``
        mock; neither is used in the body.
        """
        scheduler = DistributedScheduler(MagicMock(), False, 60, 10, 3,
                                         MagicMock(), 10, 60, False, 60,
                                         False, False, '.*', True, 3600,
                                         None, 600, 600)
        # Publish the fixture before touching it, so it is set on the
        # instance even if a later step raises.
        self.scheduler = scheduler
        scheduler.open(MagicMock())
        scheduler.my_ip = 'ip'
        scheduler.spider.name = 'link'
        self.req = None

    def get_request(self):
        """Return a new ``Request`` for ``http://ex.com`` with meta filled in."""
        request = Request('http://ex.com')

        meta_fields = (
            # required
            ('crawlid', "abc123"),
            ('appid', "myapp"),
            # remaining keys
            ('url', "http://ex.com"),
            ('spiderid', "link"),
            ("attrs", None),
            ("allowed_domains", None),
            ("allow_regex", None),
            ("deny_regex", None),
            ("deny_extensions", None),
            ('curdepth', 0),
            ("maxdepth", 0),
            ("domain_max_pages", None),
            ('priority', 0),
            ('retry_times', 0),
            ('expires', 0),
            ('useragent', None),
            ('cookie', None),
        )
        # Assign one key at a time, in the listed order.
        for key, value in meta_fields:
            request.meta[key] = value

        return request
class ThrottleMixin(object):
    """Test mixin that builds a scheduler fixture and a sample request."""

    @mock.patch('crawling.distributed_scheduler.DistributedScheduler'
                '.update_ipaddress')
    @mock.patch('crawling.distributed_scheduler.DistributedScheduler'
                '.setup_zookeeper')
    def setUp(self, u, z):
        """Create ``self.scheduler`` and reset ``self.req``.

        Both patches are active only while this method runs.  Stacked
        ``mock.patch`` decorators inject their mocks bottom-up, so ``u`` is
        the ``setup_zookeeper`` mock and ``z`` is the ``update_ipaddress``
        mock; neither is used in the body.
        """
        scheduler = DistributedScheduler(MagicMock(), False, 60, 10, 3,
                                         MagicMock(), 10, 60, False, 60,
                                         False, False, '.*', True, 3600)
        # Publish the fixture before touching it, so it is set on the
        # instance even if a later step raises.
        self.scheduler = scheduler
        scheduler.open(MagicMock())
        scheduler.my_ip = 'ip'
        scheduler.spider.name = 'link'
        self.req = None

    def get_request(self):
        """Return a new ``Request`` for ``http://ex.com`` with meta filled in."""
        request = Request('http://ex.com')

        meta_fields = (
            # required
            ('crawlid', "abc123"),
            ('appid', "myapp"),
            # remaining keys
            ('url', "http://ex.com"),
            ('spiderid', "link"),
            ("attrs", None),
            ("allowed_domains", None),
            ("allow_regex", None),
            ("deny_regex", None),
            ("deny_extensions", None),
            ('curdepth', 0),
            ("maxdepth", 0),
            ('priority', 0),
            ('retry_times', 0),
            ('expires', 0),
            ('useragent', None),
            ('cookie', None),
        )
        # Assign one key at a time, in the listed order.
        for key, value in meta_fields:
            request.meta[key] = value

        return request
Beispiel #6
0
 def setUp(self, a):
     """Create ``self.scheduler`` and reset ``self.req``.

     ``a`` is not used here; presumably it is a mock injected by a patch
     decorator outside this view -- TODO confirm.
     """
     scheduler = DistributedScheduler(MagicMock(), False, 10, 3)
     # Publish the fixture before touching it, so it is set on the
     # instance even if a later step raises.
     self.scheduler = scheduler
     scheduler.open(MagicMock())
     scheduler.spider.name = 'link'
     self.req = None