def test_setup(self):
    """
    Check that a new Controller rebuilds its state from the shared config.

    Generator-style test: it yields on ``push_project`` and
    ``twisted_sleep`` (presumably driven by Twisted's ``inlineCallbacks``;
    the decorator is not visible in this part of the file).
    """
    #-----------------------------------------------------------------------
    # Populate the first controller: push the project, register two
    # periodic jobs, and inject two RUNNING jobs by hand. The RUNNING
    # entries mimic a daemon that was killed unexpectedly; they are
    # expected to reappear as PENDING so that they get restarted ASAP.
    #-----------------------------------------------------------------------
    first = self.controller
    yield first.push_project(self.project_archive_data)
    first.schedule_job('quotesbot', 'toscrape-css', 'every second')
    first.schedule_job('quotesbot', 'toscrape-xpath', 'every 2 seconds')
    for extra_args in (('foo1', 'bar1'), ('foo2', 'bar2')):
        injected = Job(Status.RUNNING, Actor.SCHEDULER, 'now', *extra_args)
        first.schedule.add_job(injected)
    self.assertEqual(len(first.get_jobs(Status.RUNNING)), 2)

    #-----------------------------------------------------------------------
    # Build a second controller from the same config and verify that it
    # picks up the two periodic jobs and sees the injected jobs as PENDING
    #-----------------------------------------------------------------------
    restored = Controller(self.config)
    self.assertEqual(len(restored.scheduler.jobs), 2)
    self.assertEqual(len(restored.get_jobs(Status.PENDING)), 2)

    #-----------------------------------------------------------------------
    # After three seconds both periodic jobs are due, so running the
    # scheduler should bring the number of pending jobs to four
    #-----------------------------------------------------------------------
    yield twisted_sleep(3)
    restored.run_scheduler()
    pending = restored.get_jobs(Status.PENDING)
    spiders_seen = [entry.spider for entry in pending]
    self.assertEqual(len(pending), 4)
    for expected in ('toscrape-css', 'toscrape-xpath', 'bar1', 'bar2'):
        self.assertIn(expected, spiders_seen)
def test_run_scheduler(self):
    """
    Check that running the scheduler turns a due scheduled job into a
    pending one.

    Generator-style test: it yields on ``push_project`` and
    ``twisted_sleep`` (presumably driven by Twisted's ``inlineCallbacks``;
    the decorator is not visible in this part of the file).
    """
    #-----------------------------------------------------------------------
    # Set up
    #-----------------------------------------------------------------------
    controller = self.controller
    project = yield controller.push_project(self.project_archive_data)
    self.assertIn('toscrape-css', project.spiders)
    self.assertIn('toscrape-xpath', project.spiders)

    #-----------------------------------------------------------------------
    # Check if a scheduled job created a new pending job after two
    # seconds
    #-----------------------------------------------------------------------
    job_id = controller.schedule_job('quotesbot', 'toscrape-css',
                                     'every second')
    yield twisted_sleep(2)
    controller.run_scheduler()
    pending_jobs = controller.get_jobs(Status.PENDING)

    # Verify the count before indexing: with an empty list the test then
    # fails with a readable assertion instead of an IndexError.
    self.assertEqual(len(pending_jobs), 1)
    job_s = controller.get_job(job_id)
    job_p = pending_jobs[0]

    # The pending job must mirror the scheduled one and be attributed to
    # the scheduler.
    self.assertEqual(job_p.project, job_s.project)
    self.assertEqual(job_p.spider, job_s.spider)
    self.assertEqual(job_p.actor, Actor.SCHEDULER)
    self.assertEqual(job_p.status, Status.PENDING)
def test_purge_completed(self):
    """
    Check that purging completed jobs trims the list and removes logs.

    Four jobs are run to completion, then ``purge_completed_jobs`` is
    expected to leave two completed jobs behind. The ``.err`` log files of
    the jobs at index 2 and beyond in the pre-purge list must be gone.
    """
    #-----------------------------------------------------------------------
    # Push the project and schedule four jobs, alternating the spiders
    #-----------------------------------------------------------------------
    ctl = self.controller
    yield ctl.push_project(self.project_archive_data)
    for spider in ('toscrape-css', 'toscrape-xpath') * 2:
        ctl.schedule_job('quotesbot', spider, 'every second')

    #-----------------------------------------------------------------------
    # Let the jobs become due, then run the crawlers in two rounds,
    # waiting for each round to finish
    #-----------------------------------------------------------------------
    yield twisted_sleep(2)
    ctl.run_scheduler()
    rounds_left = 2
    while rounds_left:
        ctl.run_crawlers()
        yield ctl.wait_for_running_jobs()
        rounds_left -= 1

    #-----------------------------------------------------------------------
    # Purge and verify both the remaining job count and the log clean-up
    #-----------------------------------------------------------------------
    finished = ctl.get_completed_jobs()
    self.assertEqual(len(finished), 4)
    ctl.purge_completed_jobs()
    self.assertEqual(len(ctl.get_completed_jobs()), 2)
    for purged in finished[2:]:
        err_log = os.path.join(ctl.log_dir, purged.identifier + '.err')
        self.assertFalse(os.path.exists(err_log))
def test_cancel(self):
    """
    Exercise ``cancel_job`` against jobs in different states.

    Covered in order: a scheduled job, a pending job, a job whose
    cancellation starts right after the crawlers are launched, a job that
    has been removed from ``running_jobs``, and a job that is no longer
    active. The last two cancellations must raise ``ValueError``.
    """
    #-----------------------------------------------------------------------
    # Push the project and schedule four jobs, remembering the first id
    #-----------------------------------------------------------------------
    ctl = self.controller
    yield ctl.push_project(self.project_archive_data)
    scheduled_id = ctl.schedule_job('quotesbot', 'toscrape-css',
                                    'every second')
    ctl.schedule_job('quotesbot', 'toscrape-xpath', 'every second')
    ctl.schedule_job('quotesbot', 'toscrape-css', 'every second')
    ctl.schedule_job('quotesbot', 'toscrape-xpath', 'every second')

    #-----------------------------------------------------------------------
    # Cancelling a scheduled job marks it as CANCELED
    #-----------------------------------------------------------------------
    yield ctl.cancel_job(scheduled_id)
    self.assertEqual(ctl.get_job(scheduled_id).status, Status.CANCELED)

    #-----------------------------------------------------------------------
    # Turn the three remaining scheduled jobs into pending jobs
    #-----------------------------------------------------------------------
    yield twisted_sleep(2)
    ctl.run_scheduler()
    pending = ctl.get_jobs(Status.PENDING)
    self.assertEqual(len(pending), 3)
    pending_id, starting_id, racing_id = [p.identifier for p in pending]

    #-----------------------------------------------------------------------
    # Cancelling a pending job marks it as CANCELED
    #-----------------------------------------------------------------------
    yield ctl.cancel_job(pending_id)
    self.assertEqual(ctl.get_job(pending_id).status, Status.CANCELED)

    #-----------------------------------------------------------------------
    # Launch the remaining pending jobs and cancel one of them. The
    # cancellation is requested before waiting for the jobs to start in
    # order to cover the case where the job process has not had enough
    # time to start yet.
    #-----------------------------------------------------------------------
    ctl.run_crawlers()
    early_cancel = ctl.cancel_job(starting_id)
    yield ctl.wait_for_starting_jobs()
    yield early_cancel

    #-----------------------------------------------------------------------
    # Take a job out of the running_jobs dictionary for a moment. This
    # simulates a race in which the job finishes while the cancelling
    # procedure sleeps waiting for the process to start.
    #-----------------------------------------------------------------------
    stashed = ctl.running_jobs[racing_id]
    del ctl.running_jobs[racing_id]  # this simulates the race
    try:
        yield ctl.cancel_job(racing_id)
    except ValueError:
        pass
    else:
        self.fail('Cancelling a job missing from the dictionary should '
                  'have risen a ValueError exception but did not')
    ctl.running_jobs[racing_id] = stashed
    yield ctl.wait_for_running_jobs()

    #-----------------------------------------------------------------------
    # Cancelling a job that is no longer active must be rejected
    #-----------------------------------------------------------------------
    try:
        yield ctl.cancel_job(racing_id)
    except ValueError:
        pass
    else:
        self.fail('Cancelling an incactive job should have risen '
                  'a ValueError exception but did not')

    #-----------------------------------------------------------------------
    # Verify where every job ended up
    #-----------------------------------------------------------------------
    for canceled_id in (scheduled_id, pending_id, starting_id):
        self.assertEqual(ctl.get_job(canceled_id).status, Status.CANCELED)
    self.assertEqual(ctl.get_job(racing_id).status, Status.SUCCESSFUL)
def test_run_crawlers(self):
    """
    Check that ``run_crawlers`` spawns pending jobs and tracks their fate.

    Covers successful runs (two at a time), the per-job log files and the
    log getter, a job that fails to spawn, and a job that is cancelled
    while running. Generator-style test (presumably driven by Twisted's
    ``inlineCallbacks``; the decorator is not visible in this part of the
    file).
    """
    #-----------------------------------------------------------------------
    # Set the projects up, schedule some jobs, and run the scheduler
    #-----------------------------------------------------------------------
    controller = self.controller
    yield controller.push_project(self.project_archive_data)
    controller.schedule_job('quotesbot', 'toscrape-css', 'every second')
    controller.schedule_job('quotesbot', 'toscrape-xpath', 'every second')
    controller.schedule_job('quotesbot', 'toscrape-css', 'every second')
    controller.schedule_job('quotesbot', 'toscrape-xpath', 'every second')
    yield twisted_sleep(2)
    controller.run_scheduler()
    pending_jobs = controller.get_jobs(Status.PENDING)
    self.assertEqual(len(pending_jobs), 4)

    #-----------------------------------------------------------------------
    # Run the crawlers. The first call is expected to start only two of
    # the four pending jobs; a second call picks up the rest.
    #-----------------------------------------------------------------------
    controller.run_crawlers()
    running_jobs = controller.get_jobs(Status.RUNNING)
    pending_jobs = controller.get_jobs(Status.PENDING)
    self.assertEqual(len(running_jobs), 2)
    self.assertEqual(len(pending_jobs), 2)
    yield controller.wait_for_running_jobs()
    controller.run_crawlers()
    yield controller.wait_for_running_jobs()
    pending_jobs = controller.get_jobs(Status.PENDING)
    successful_jobs = controller.get_jobs(Status.SUCCESSFUL)
    self.assertEqual(len(successful_jobs), 4)
    self.assertEqual(len(pending_jobs), 0)

    # Every successful job must have its own '.err' file. The path is
    # built from the job being iterated, not from the first job in the
    # list, so that each job's file is actually checked.
    for job in successful_jobs:
        log_file = os.path.join(self.temp_dir, 'log-dir',
                                job.identifier + '.err')
        self.assertTrue(os.path.exists(log_file))

    #-----------------------------------------------------------------------
    # Test the log getter. Only 'toscrape-css' jobs are expected to have
    # a first log entry; the second entry must exist for every job.
    # NOTE(review): entries look like (stdout, stderr) logs - confirm
    # against get_job_logs.
    #-----------------------------------------------------------------------
    for job in successful_jobs:
        log = controller.get_job_logs(job.identifier)
        if job.spider == 'toscrape-css':
            self.assertNotEqual(log[0], None)
        else:
            self.assertEqual(log[0], None)
        self.assertNotEqual(log[1], None)

    #-----------------------------------------------------------------------
    # Test failure to spawn a job: 'foo'/'bar' were never pushed, so the
    # job is expected to end up FAILED
    #-----------------------------------------------------------------------
    job = Job(Status.PENDING, Actor.SCHEDULER, 'now', 'foo', 'bar')
    self.controller.schedule.add_job(job)
    controller.run_crawlers()
    yield controller.wait_for_starting_jobs()
    job = controller.get_job(job.identifier)
    self.assertEqual(job.status, Status.FAILED)

    #-----------------------------------------------------------------------
    # Spawn a job but then kill it
    #-----------------------------------------------------------------------
    controller.schedule_job('quotesbot', 'toscrape-css', 'now')
    controller.run_crawlers()
    yield controller.wait_for_running_jobs(cancel=True)

    #-----------------------------------------------------------------------
    # Check the overall number of completed and active jobs
    #-----------------------------------------------------------------------
    self.assertEqual(len(controller.get_active_jobs()), 4)
    self.assertEqual(len(controller.get_completed_jobs()), 6)