Example #1
0
 def __init__(self, config):
     """Run the base scheduler initialiser and create a Redis client.

     The client is built from the ``REDIS_HOST``, ``REDIS_PORT`` and
     ``REDIS_PASSWORD`` entries of ``config`` (``''`` is passed as the
     second argument when looking up the password) and always targets
     database 0.
     """
     SpiderScheduler.__init__(self, config)
     redis_host = config.get('REDIS_HOST')
     redis_port = config.get('REDIS_PORT')
     redis_password = config.get('REDIS_PASSWORD', '')
     # NOTE: the attribute spelling is kept as-is because code outside
     # this method may read it under this exact name.
     self.redis_connecton = redis.StrictRedis(
         host=redis_host, port=redis_port, db=0, password=redis_password)
Example #2
0
 def setUp(self):
     """Create a scratch eggs/dbs directory tree and the objects under test.

     Two project directories (``mybot1`` and ``mybot2``) are created under
     the eggs directory before the queues and the scheduler are built from
     a ``Config`` pointing at that tree.
     """
     base = self.mktemp()
     self.eggs_dir = os.path.join(base, 'eggs')
     dbs_dir = os.path.join(base, 'dbs')
     os.mkdir(base)
     for path in (self.eggs_dir, dbs_dir):
         os.makedirs(path)
     for project in ('mybot1', 'mybot2'):
         os.makedirs(os.path.join(self.eggs_dir, project))
     config = Config(
         values={'eggs_dir': self.eggs_dir, 'dbs_dir': dbs_dir})
     self.queues = get_spider_queues(config)
     self.sched = SpiderScheduler(config)
Example #3
0
class SpiderSchedulerTest(unittest.TestCase):
    """Tests for ``SpiderScheduler`` run against a temporary eggs/dbs tree."""

    def setUp(self):
        """Create the directory layout, the queues and the scheduler.

        Two project directories (``mybot1`` and ``mybot2``) exist under the
        eggs directory when each test starts.
        """
        d = self.mktemp()
        eggs_dir = self.eggs_dir = os.path.join(d, 'eggs')
        dbs_dir = os.path.join(d, 'dbs')
        os.mkdir(d)
        os.makedirs(eggs_dir)
        os.makedirs(dbs_dir)
        os.makedirs(os.path.join(eggs_dir, 'mybot1'))
        os.makedirs(os.path.join(eggs_dir, 'mybot2'))
        config = Config(values={'eggs_dir': eggs_dir, 'dbs_dir': dbs_dir})
        self.queues = get_spider_queues(config)
        self.sched = SpiderScheduler(config)

    def test_interface(self):
        # The scheduler must provide the ISpiderScheduler interface.
        verifyObject(ISpiderScheduler, self.sched)

    def test_list_update_projects(self):
        # Initially only the two projects created in setUp are listed.
        self.assertEqual(sorted(self.sched.list_projects()),
                         sorted(['mybot1', 'mybot2']))
        # A project directory added afterwards shows up once
        # update_projects() has been called.
        os.makedirs(os.path.join(self.eggs_dir, 'mybot3'))
        self.sched.update_projects()
        self.assertEqual(sorted(self.sched.list_projects()),
                         sorted(['mybot1', 'mybot2', 'mybot3']))

    def test_schedule(self):
        q1, q2 = self.queues['mybot1'], self.queues['mybot2']
        # assertFalse replaces the deprecated ``failIf`` alias, which the
        # standard library's unittest no longer provides since Python 3.12.
        self.assertFalse(q1.count())
        # The third positional argument is expected to drive pop order:
        # the value-10 job must come out of q2 before the value-1 job.
        self.sched.schedule('mybot1', 'myspider1', 2, a='b')
        self.sched.schedule('mybot2', 'myspider2', 1, c='d')
        self.sched.schedule('mybot2', 'myspider3', 10, e='f')
        self.assertEqual(q1.pop(), {'name': 'myspider1', 'a': 'b'})
        self.assertEqual(q2.pop(), {'name': 'myspider3', 'e': 'f'})
        self.assertEqual(q2.pop(), {'name': 'myspider2', 'c': 'd'})
Example #4
0
    def startService(self):
        """Restore stored job state, then start waiting on every slot.

        For each project key returned by ``get_spider_running``:

        * every stored entry is handed back to a ``SpiderScheduler`` via
          ``schedule`` (spider, job id, domain and settings are taken from
          the entry);
        * every entry stored for that project by ``get_spider_finished`` is
          decoded from JSON, rebuilt as a ``ScrapyProcessProtocol`` with its
          start/end times parsed, and appended to ``self.finished``.

        Finally ``_wait_for_project`` is called once per slot and a startup
        message is logged.
        """
        scheduler = SpiderScheduler(self.config)
        running_by_project = get_spider_running(self.config)
        for project in running_by_project.keys():
            for entry in running_by_project[project].iteritems():
                job_info = entry[1]
                spider_name = str(job_info['_spider'])
                job_id = str(entry[0])
                scheduler.schedule(
                    project,
                    spider_name,
                    _job=job_id,
                    domain=str(job_info['domain']),
                    settings=job_info['settings'])

            # Looked up again for each project, as in the stored layout
            # keyed by project name.
            finished_db = get_spider_finished(self.config)[project]
            for entry in finished_db.iteritems():
                record = json.loads(entry[1])
                protocol = ScrapyProcessProtocol(
                    record['slot'],
                    record['project'],
                    record['spider'],
                    record['job'],
                    record['env'],
                    domain=record['domain'])
                protocol.end_time = parser.parse(record['end_time'])
                protocol.start_time = parser.parse(record['start_time'])
                self.finished.append(protocol)

        for slot in range(self.max_proc):
            self._wait_for_project(slot)
        log.msg(
            format='Scrapyd %(version)s started: max_proc=%(max_proc)r, runner=%(runner)r',
            version=__version__,
            max_proc=self.max_proc,
            runner=self.runner,
            system='Launcher')
0
 def setUp(self):
     """Prepare a temporary eggs/dbs layout plus the queues and scheduler.

     The eggs directory is populated with the ``mybot1`` and ``mybot2``
     project directories; both directories are then passed to ``Config``.
     """
     root = self.mktemp()
     dbs_dir = os.path.join(root, 'dbs')
     self.eggs_dir = os.path.join(root, 'eggs')
     os.mkdir(root)
     os.makedirs(self.eggs_dir)
     os.makedirs(dbs_dir)
     for name in ('mybot1', 'mybot2'):
         os.makedirs(os.path.join(self.eggs_dir, name))
     settings = {'eggs_dir': self.eggs_dir, 'dbs_dir': dbs_dir}
     config = Config(values=settings)
     self.queues = get_spider_queues(config)
     self.sched = SpiderScheduler(config)
Example #6
0
class SpiderSchedulerTest(unittest.TestCase):
    """Tests for ``SpiderScheduler`` run against a temporary eggs/dbs tree."""

    def setUp(self):
        """Create the directory layout, the queues and the scheduler.

        Two project directories (``mybot1`` and ``mybot2``) exist under the
        eggs directory when each test starts.
        """
        d = self.mktemp()
        eggs_dir = self.eggs_dir = os.path.join(d, 'eggs')
        dbs_dir = os.path.join(d, 'dbs')
        os.mkdir(d)
        os.makedirs(eggs_dir)
        os.makedirs(dbs_dir)
        os.makedirs(os.path.join(eggs_dir, 'mybot1'))
        os.makedirs(os.path.join(eggs_dir, 'mybot2'))
        config = Config(values={'eggs_dir': eggs_dir, 'dbs_dir': dbs_dir})
        self.queues = get_spider_queues(config)
        self.sched = SpiderScheduler(config)

    def test_interface(self):
        # The scheduler must provide the ISpiderScheduler interface.
        verifyObject(ISpiderScheduler, self.sched)

    def test_list_update_projects(self):
        # Initially only the two projects created in setUp are listed.
        self.assertEqual(sorted(self.sched.list_projects()), sorted(['mybot1', 'mybot2']))
        # A project directory added afterwards shows up once
        # update_projects() has been called.
        os.makedirs(os.path.join(self.eggs_dir, 'mybot3'))
        self.sched.update_projects()
        self.assertEqual(sorted(self.sched.list_projects()), sorted(['mybot1', 'mybot2', 'mybot3']))

    def test_schedule(self):
        q = self.queues['mybot1']
        # assertFalse replaces the deprecated ``failIf`` alias, which the
        # standard library's unittest no longer provides since Python 3.12.
        self.assertFalse(q.count())
        self.sched.schedule('mybot1', 'myspider1', a='b')
        self.sched.schedule('mybot2', 'myspider2', c='d')
        # Each job must land in the queue of the project it was scheduled for.
        self.assertEqual(q.pop(), {'name': 'myspider1', 'a': 'b'})
        q = self.queues['mybot2']
        self.assertEqual(q.pop(), {'name': 'myspider2', 'c': 'd'})
Example #7
0
def application(config):
    """Assemble and return the "Scrapyd" application object.

    Builds the poller, scheduler and environment components, loads the job
    storage, egg storage, launcher and web root classes from the dotted
    paths found in ``config`` (each with a built-in default), and attaches
    the launcher, the polling timer and the web server to the application.

    :param config: configuration object providing ``get``, ``getint`` and
        ``getfloat``.
    :return: the configured application.
    """
    app = Application("Scrapyd")

    # Network and polling settings with their defaults.
    http_port = config.getint('http_port', 6800)
    bind_address = config.get('bind_address', '127.0.0.1')
    poll_interval = config.getfloat('poll_interval', 5)

    poller = QueuePoller(config)
    scheduler = SpiderScheduler(config)
    environment = Environment(config)
    core_components = (
        (IPoller, poller),
        (ISpiderScheduler, scheduler),
        (IEnvironment, environment),
    )
    for interface, component in core_components:
        app.setComponent(interface, component)

    # Pluggable storages: the class is resolved from a dotted path.
    jobstorage_path = config.get(
        'jobstorage', 'scrapyd.jobstorage.MemoryJobStorage')
    jobstorage = load_object(jobstorage_path)(config)
    app.setComponent(IJobStorage, jobstorage)

    eggstorage_path = config.get(
        'eggstorage', 'scrapyd.eggstorage.FilesystemEggStorage')
    eggstorage_cls = load_object(eggstorage_path)
    app.setComponent(IEggStorage, eggstorage_cls(config))

    launcher_cls = load_object(
        config.get('launcher', 'scrapyd.launcher.Launcher'))
    launcher = launcher_cls(config, app)

    timer = TimerService(poll_interval, poller.poll)

    webroot_cls = load_object(config.get('webroot', 'scrapyd.website.Root'))
    resource = create_wrapped_resource(webroot_cls, config, app)
    site = server.Site(resource)
    webservice = TCPServer(http_port, site, interface=bind_address)
    log.msg(
        format="Scrapyd web console available at http://%(bind_address)s:%(http_port)s/",
        bind_address=bind_address,
        http_port=http_port)

    # Attach the services in the same order: launcher, timer, web server.
    for service in (launcher, timer, webservice):
        service.setServiceParent(app)

    return app
Example #8
0
def application(config):
    """Assemble and return the "Scrapyd" application object.

    When the ``PORT`` environment variable is set it overrides the
    configured ``http_port`` and the server binds to ``0.0.0.0``; otherwise
    the ``bind_address`` setting (default ``127.0.0.1``) is used.

    The poller, egg storage, scheduler and environment components are
    registered on the application, and the launcher, the polling timer and
    the web server are attached to it as services.

    :param config: configuration object providing ``get``, ``getint`` and
        ``getfloat``.
    :return: the configured application.
    """
    app = Application("Scrapyd")

    # The configured port is always read first; PORT (if present) wins.
    http_port = config.getint('http_port', 6800)
    env_port = os.environ.get('PORT')
    if env_port is not None:
        http_port = int(env_port)
        bind_address = '0.0.0.0'
    else:
        bind_address = config.get('bind_address', '127.0.0.1')

    poll_interval = config.getfloat('poll_interval', 5)

    poller = QueuePoller(config)
    eggstorage = FilesystemEggStorage(config)
    scheduler = SpiderScheduler(config)
    environment = Environment(config)
    registrations = (
        (IPoller, poller),
        (IEggStorage, eggstorage),
        (ISpiderScheduler, scheduler),
        (IEnvironment, environment),
    )
    for interface, component in registrations:
        app.setComponent(interface, component)

    # Launcher and web root classes are resolved from dotted paths.
    launcher_cls = load_object(
        config.get('launcher', 'scrapyd.launcher.Launcher'))
    launcher = launcher_cls(config, app)

    webroot_cls = load_object(config.get('webroot', 'scrapyd.website.Root'))

    timer = TimerService(poll_interval, poller.poll)
    site = server.Site(webroot_cls(config, app))
    webservice = TCPServer(http_port, site, interface=bind_address)
    log.msg(
        format="Scrapyd web console available at http://%(bind_address)s:%(http_port)s/",
        bind_address=bind_address,
        http_port=http_port)

    # Attach the services in the same order: launcher, timer, web server.
    for service in (launcher, timer, webservice):
        service.setServiceParent(app)

    return app