Ejemplo n.º 1
0
    def test_setup(self):
        # Checks that a newly constructed controller rebuilds its state from
        # what an earlier controller left behind.
        # NOTE(review): this is a generator-style test; whatever drives it
        # (presumably a Twisted coroutine decorator) is not visible here.
        #-----------------------------------------------------------------------
        # Prepare the first controller with a project and two recurring jobs.
        # Then inject two RUNNING jobs straight into the schedule to imitate
        # the daemon being killed unexpectedly; such jobs are expected to be
        # turned into PENDING ones so that they get restarted ASAP.
        #-----------------------------------------------------------------------
        first = self.controller
        yield first.push_project(self.project_archive_data)
        first.schedule_job('quotesbot', 'toscrape-css', 'every second')
        first.schedule_job('quotesbot', 'toscrape-xpath', 'every 2 seconds')

        for target in (('foo1', 'bar1'), ('foo2', 'bar2')):
            orphan = Job(Status.RUNNING, Actor.SCHEDULER, 'now', *target)
            first.schedule.add_job(orphan)
        running = first.get_jobs(Status.RUNNING)
        self.assertEqual(len(running), 2)

        #-----------------------------------------------------------------------
        # Build a second controller from the same config and verify that it
        # sees the two recurring jobs and reports the injected jobs as PENDING
        #-----------------------------------------------------------------------
        second = Controller(self.config)
        self.assertEqual(len(second.scheduler.jobs), 2)
        self.assertEqual(len(second.get_jobs(Status.PENDING)), 2)

        # Wait long enough for both recurring jobs to become due, then let
        # the scheduler create their pending counterparts.
        yield twisted_sleep(3)
        second.run_scheduler()
        pending = second.get_jobs(Status.PENDING)
        spiders_pending = [entry.spider for entry in pending]
        self.assertEqual(len(pending), 4)

        expected_spiders = ('toscrape-css', 'toscrape-xpath', 'bar1', 'bar2')
        for name in expected_spiders:
            self.assertIn(name, spiders_pending)
Ejemplo n.º 2
0
    def test_run_scheduler(self):
        # Checks that running the scheduler produces a pending job out of a
        # recurring scheduled job once the latter is due.
        # NOTE(review): this is a generator-style test; whatever drives it
        # (presumably a Twisted coroutine decorator) is not visible here.
        #-----------------------------------------------------------------------
        # Set up: push the project and make sure both spiders are reported
        #-----------------------------------------------------------------------
        controller = self.controller
        project = yield controller.push_project(self.project_archive_data)
        self.assertIn('toscrape-css', project.spiders)
        self.assertIn('toscrape-xpath', project.spiders)

        #-----------------------------------------------------------------------
        # Check if a scheduled job created a new pending job after two
        # seconds
        #-----------------------------------------------------------------------
        job_id = controller.schedule_job('quotesbot', 'toscrape-css',
                                         'every second')
        yield twisted_sleep(2)
        controller.run_scheduler()

        pending_jobs = controller.get_jobs(Status.PENDING)
        scheduled_job = controller.get_job(job_id)

        # Verify the count before indexing: if the scheduler produced no
        # pending job, the test must fail with a readable assertion message
        # rather than an IndexError.
        self.assertEqual(len(pending_jobs), 1)
        pending_job = pending_jobs[0]

        # The pending job must mirror the scheduled job it originates from.
        self.assertEqual(pending_job.project, scheduled_job.project)
        self.assertEqual(pending_job.spider, scheduled_job.spider)
        self.assertEqual(pending_job.actor, Actor.SCHEDULER)
        self.assertEqual(pending_job.status, Status.PENDING)
Ejemplo n.º 3
0
    def test_purge_completed(self):
        # Checks that purging completed jobs trims the completed list and
        # removes the error logs of the jobs that were dropped.
        # NOTE(review): this is a generator-style test; whatever drives it
        # (presumably a Twisted coroutine decorator) is not visible here.
        #-----------------------------------------------------------------------
        # Push the project and register four recurring jobs, alternating
        # between the two spiders
        #-----------------------------------------------------------------------
        controller = self.controller
        yield controller.push_project(self.project_archive_data)
        for spider in ('toscrape-css', 'toscrape-xpath') * 2:
            controller.schedule_job('quotesbot', spider, 'every second')

        #-----------------------------------------------------------------------
        # Let the jobs become due, convert them to pending ones, and run the
        # crawlers twice, waiting for the running jobs after each round
        #-----------------------------------------------------------------------
        yield twisted_sleep(2)
        controller.run_scheduler()

        rounds_left = 2
        while rounds_left:
            controller.run_crawlers()
            yield controller.wait_for_running_jobs()
            rounds_left -= 1

        #-----------------------------------------------------------------------
        # Four jobs are completed before the purge and two after it. The
        # '.err' logs of the entries past the second one in the pre-purge
        # list must be gone (presumably these are the purged jobs -- the
        # ordering of the list is not established here).
        #-----------------------------------------------------------------------
        before_purge = controller.get_completed_jobs()
        self.assertEqual(len(before_purge), 4)
        controller.purge_completed_jobs()
        after_purge = controller.get_completed_jobs()
        self.assertEqual(len(after_purge), 2)
        for dropped in before_purge[2:]:
            err_name = dropped.identifier + '.err'
            err_path = os.path.join(controller.log_dir, err_name)
            self.assertFalse(os.path.exists(err_path))
Ejemplo n.º 4
0
    def test_cancel(self):
        # Exercises job cancellation for a scheduled job, a pending job, a job
        # that is just starting, a job missing from the running dictionary,
        # and a job that is no longer active.
        # NOTE(review): this is a generator-style test; whatever drives it
        # (presumably a Twisted coroutine decorator) is not visible here.
        #-----------------------------------------------------------------------
        # Push the project and register four recurring jobs, keeping the
        # identifier of the first one
        #-----------------------------------------------------------------------
        controller = self.controller
        yield controller.push_project(self.project_archive_data)
        first_id = controller.schedule_job('quotesbot', 'toscrape-css',
                                           'every second')
        for spider in ('toscrape-xpath', 'toscrape-css', 'toscrape-xpath'):
            controller.schedule_job('quotesbot', spider, 'every second')

        #-----------------------------------------------------------------------
        # Cancelling a job that is still only scheduled
        #-----------------------------------------------------------------------
        yield controller.cancel_job(first_id)
        canceled = controller.get_job(first_id)
        self.assertEqual(canceled.status, Status.CANCELED)

        #-----------------------------------------------------------------------
        # Let the three remaining scheduled jobs spawn pending jobs
        #-----------------------------------------------------------------------
        yield twisted_sleep(2)
        controller.run_scheduler()
        pending = controller.get_jobs(Status.PENDING)
        self.assertEqual(len(pending), 3)
        second_id, third_id, fourth_id = [entry.identifier
                                          for entry in pending]

        #-----------------------------------------------------------------------
        # Cancelling a job that is pending
        #-----------------------------------------------------------------------
        yield controller.cancel_job(second_id)
        canceled = controller.get_job(second_id)
        self.assertEqual(canceled.status, Status.CANCELED)

        #-----------------------------------------------------------------------
        # Launch the remaining pending jobs and cancel one of them right
        # away, i.e. before waiting for the jobs to start, to cover the case
        # in which the job process has not had enough time to start.
        #-----------------------------------------------------------------------
        controller.run_crawlers()
        cancellation = controller.cancel_job(third_id)
        yield controller.wait_for_starting_jobs()
        yield cancellation

        #-----------------------------------------------------------------------
        # Simulate a race in which the job finishes while the cancelling
        # procedure sleeps waiting for the process to start: take the job's
        # entry out of the running dictionary, attempt the cancellation, and
        # put the entry back afterwards.
        #-----------------------------------------------------------------------
        stashed_entry = controller.running_jobs[fourth_id]
        del controller.running_jobs[fourth_id]  # this simulates the race

        try:
            yield controller.cancel_job(fourth_id)
        except ValueError:
            pass
        else:
            self.fail('Cancelling a job missing from the dictionary should '
                      'have risen a ValueError exception but did not')

        controller.running_jobs[fourth_id] = stashed_entry
        yield controller.wait_for_running_jobs()

        #-----------------------------------------------------------------------
        # Cancelling a job that is no longer active must be rejected as well
        #-----------------------------------------------------------------------
        try:
            yield controller.cancel_job(fourth_id)
        except ValueError:
            pass
        else:
            self.fail('Cancelling an incactive job should have risen '
                      'a ValueError exception but did not')

        #-----------------------------------------------------------------------
        # Final statuses: the first three jobs are canceled, the last one ran
        # to completion
        #-----------------------------------------------------------------------
        expected_statuses = (
            (first_id, Status.CANCELED),
            (second_id, Status.CANCELED),
            (third_id, Status.CANCELED),
            (fourth_id, Status.SUCCESSFUL),
        )
        for identifier, status in expected_statuses:
            self.assertEqual(controller.get_job(identifier).status, status)
Ejemplo n.º 5
0
    def test_run_crawlers(self):
        # Exercises the crawler runner: running pending jobs in batches, log
        # retrieval, a job that fails to spawn, and a job that gets killed.
        # NOTE(review): this is a generator-style test; whatever drives it
        # (presumably a Twisted coroutine decorator) is not visible here.
        #-----------------------------------------------------------------------
        # Set the projects up, schedule some jobs, and run the scheduler
        #-----------------------------------------------------------------------
        controller = self.controller
        yield controller.push_project(self.project_archive_data)
        controller.schedule_job('quotesbot', 'toscrape-css', 'every second')
        controller.schedule_job('quotesbot', 'toscrape-xpath', 'every second')
        controller.schedule_job('quotesbot', 'toscrape-css', 'every second')
        controller.schedule_job('quotesbot', 'toscrape-xpath', 'every second')

        yield twisted_sleep(2)
        controller.run_scheduler()
        pending_jobs = controller.get_jobs(Status.PENDING)
        self.assertEqual(len(pending_jobs), 4)

        #-----------------------------------------------------------------------
        # Run the crawlers. The first call is expected to start only two of
        # the four pending jobs; the second call picks up the rest.
        #-----------------------------------------------------------------------
        controller.run_crawlers()

        running_jobs = controller.get_jobs(Status.RUNNING)
        pending_jobs = controller.get_jobs(Status.PENDING)
        self.assertEqual(len(running_jobs), 2)
        self.assertEqual(len(pending_jobs), 2)

        yield controller.wait_for_running_jobs()
        controller.run_crawlers()
        yield controller.wait_for_running_jobs()

        pending_jobs = controller.get_jobs(Status.PENDING)
        successful_jobs = controller.get_jobs(Status.SUCCESSFUL)
        self.assertEqual(len(successful_jobs), 4)
        self.assertEqual(len(pending_jobs), 0)

        # Every successful job must have its own '.err' log file. The path
        # is built from the job being iterated over so that each job is
        # actually checked, not just the first one in the list.
        for job in successful_jobs:
            log_file = os.path.join(self.temp_dir, 'log-dir',
                                    job.identifier + '.err')
            self.assertTrue(os.path.exists(log_file))

        #-----------------------------------------------------------------------
        # Test the log getter. The first element of the result is expected
        # to be set only for 'toscrape-css' jobs, the second one for all jobs
        # (presumably these are the standard output and error logs -- TODO
        # confirm against get_job_logs).
        #-----------------------------------------------------------------------
        for job in successful_jobs:
            log = controller.get_job_logs(job.identifier)
            if job.spider == 'toscrape-css':
                self.assertNotEqual(log[0], None)
            else:
                self.assertEqual(log[0], None)
            self.assertNotEqual(log[1], None)

        #-----------------------------------------------------------------------
        # Test failure to spawn a job: a pending job built from the 'foo' and
        # 'bar' placeholders is expected to end up FAILED
        #-----------------------------------------------------------------------
        job = Job(Status.PENDING, Actor.SCHEDULER, 'now', 'foo', 'bar')
        controller.schedule.add_job(job)
        controller.run_crawlers()
        yield controller.wait_for_starting_jobs()
        job = controller.get_job(job.identifier)
        self.assertEqual(job.status, Status.FAILED)

        #-----------------------------------------------------------------------
        # Spawn a job but then kill it
        #-----------------------------------------------------------------------
        controller.schedule_job('quotesbot', 'toscrape-css', 'now')
        controller.run_crawlers()
        yield controller.wait_for_running_jobs(cancel=True)

        #-----------------------------------------------------------------------
        # Check the overall number of completed and active jobs
        #-----------------------------------------------------------------------
        self.assertEqual(len(controller.get_active_jobs()), 4)
        self.assertEqual(len(controller.get_completed_jobs()), 6)