def getJobList_impl(self):
    """
    This method is called in a loop by the scheduler daemon service.
    Its goal is to return a list of jobs that are ready to be started.

    Note: handles both old and pipeline jobs but only so far as putting
    devices into a Reserved state. Running pipeline jobs from Reserved
    is the sole concern of the dispatcher-master.

    :return: list of submitted, non-pipeline TestJob objects assigned to
        one of this worker's devices whose ``is_ready_to_start`` is true.
    """
    self._handle_cancelling_jobs()

    # FIXME: to move into the dispatcher-master
    if utils.is_master():
        submit_health_check_jobs()
        assign_jobs()

    # from here on, ignore pipeline jobs.
    my_devices = get_temporary_devices(self.my_devices())
    my_submitted_jobs = TestJob.objects.filter(
        status=TestJob.SUBMITTED,
        actual_device_id__in=my_devices,
        is_pipeline=False,
    )

    # Build the list eagerly: filter() is lazy under Python 3, which would
    # defer evaluating the queryset until after the commit below and hand
    # the caller an iterator instead of the promised list.
    my_ready_jobs = [job for job in my_submitted_jobs
                     if job.is_ready_to_start]

    # Only commit when no atomic block is active on the connection.
    if not connection.in_atomic_block:
        self._commit_transaction(src='getJobList_impl')
    return my_ready_jobs
def getJobList_impl(self):
    """
    This method is called in a loop by the scheduler daemon service.
    Its goal is to return a list of jobs that are ready to be started.

    Note: handles both old and pipeline jobs but only so far as putting
    devices into a Reserved state. Running pipeline jobs from Reserved
    is the sole concern of the dispatcher-master.

    :return: list of submitted, non-pipeline TestJob objects assigned to
        one of this worker's devices whose ``is_ready_to_start`` is true.
    """
    self._handle_cancelling_jobs()

    if utils.is_master():
        # FIXME: move into dispatcher-master
        self._submit_health_check_jobs()
        self._assign_jobs()

    # from here on, ignore pipeline jobs.
    my_devices = get_temporary_devices(self.my_devices())
    my_submitted_jobs = TestJob.objects.filter(
        status=TestJob.SUBMITTED,
        actual_device_id__in=my_devices,
        is_pipeline=False,
    )

    # Build the list eagerly: filter() is lazy under Python 3, which would
    # defer evaluating the queryset until after the commit below and hand
    # the caller an iterator instead of the promised list.
    my_ready_jobs = [job for job in my_submitted_jobs
                     if job.is_ready_to_start]

    self._commit_transaction(src='getJobList_impl')
    return my_ready_jobs
def getJobList_impl(self):
    """
    This method is called in a loop by the scheduler daemon service.
    Its goal is to return a list of jobs that are ready to be started.

    :return: list of submitted TestJob objects assigned to one of this
        worker's devices whose ``is_ready_to_start`` is true.
    """
    self._handle_cancelling_jobs()

    # Health checks and job assignment are only driven from the master.
    if utils.is_master():
        self._submit_health_check_jobs()
        self._assign_jobs()

    my_devices = get_temporary_devices(self.my_devices())
    my_submitted_jobs = TestJob.objects.filter(
        status=TestJob.SUBMITTED,
        actual_device_id__in=my_devices,
    )

    # Build the list eagerly: filter() is lazy under Python 3, which would
    # defer evaluating the queryset until after the commit below and hand
    # the caller an iterator instead of the promised list.
    my_ready_jobs = [job for job in my_submitted_jobs
                     if job.is_ready_to_start]

    transaction.commit()
    return my_ready_jobs
def getJobList_impl(self):
    """
    This method is called in a loop by the scheduler daemon service.
    Its goal is to return a list of jobs that are ready to be started.

    :return: list of submitted TestJob objects assigned to one of this
        worker's devices whose ``is_ready_to_start`` is true.
    """
    self._handle_cancelling_jobs()

    # Health checks and job assignment are only driven from the master.
    if utils.is_master():
        self._submit_health_check_jobs()
        self._assign_jobs()

    my_devices = get_temporary_devices(self.my_devices())
    my_submitted_jobs = TestJob.objects.filter(
        status=TestJob.SUBMITTED,
        actual_device_id__in=my_devices,
    )

    # Build the list eagerly: filter() is lazy under Python 3, which would
    # defer evaluating the queryset until after the commit below and hand
    # the caller an iterator instead of the promised list.
    my_ready_jobs = [job for job in my_submitted_jobs
                     if job.is_ready_to_start]

    transaction.commit()
    return my_ready_jobs
def jobCompleted_impl(self, job_id, board_name, exit_code, kill_reason):
    """
    Record the completion of a job and release the device it ran on.

    Derives the final job status from ``exit_code`` (or from a pending
    cancellation), works out the device's next status, updates the device
    health when the job was a health check, links the result bundle named
    in the job's output directory, then saves and commits both records
    before sending notifications.

    :param job_id: id of the TestJob that finished; when falsy the call
        is logged and ignored.
    :param board_name: hostname of the Device the job ran on.
    :param exit_code: exit code of the job; 0 turns a RUNNING job into
        COMPLETE, anything else into INCOMPLETE.
    :param kill_reason: not used by this method.
    :return: None
    """
    if not job_id:
        self.logger.debug('job completion called without a job id on %s',
                          board_name)
        return
    job = TestJob.objects.get(id=job_id)

    self.logger.debug('marking job as complete on %s', board_name)
    device = Device.objects.get(hostname=board_name)
    old_device_status = device.status
    self.logger.debug('old device status %s, job state %s',
                      Device.STATUS_CHOICES[old_device_status][1],
                      TestJob.STATUS_CHOICES[job.status][1])

    # Every branch assigns a status, so no None fallback is needed below.
    if old_device_status == Device.RUNNING:
        new_device_status = Device.IDLE
    elif old_device_status == Device.OFFLINING:
        new_device_status = Device.OFFLINE
    elif old_device_status == Device.RESERVED:
        new_device_status = Device.IDLE
    else:
        self.logger.error(
            "Unexpected device state in jobCompleted: %s", device.status)
        new_device_status = Device.IDLE

    self.logger.debug('new device status %s, job state %s',
                      Device.STATUS_CHOICES[new_device_status][1],
                      TestJob.STATUS_CHOICES[job.status][1])

    # Temporary devices should be marked as RETIRED once the job is
    # complete or canceled.
    if job.is_vmgroup:
        try:
            if device.temporarydevice:
                new_device_status = Device.RETIRED
                device.current_job = None
        except TemporaryDevice.DoesNotExist:
            self.logger.debug("%s is not a tmp device", device.hostname)

    if job.status == TestJob.RUNNING:
        if exit_code == 0:
            job.status = TestJob.COMPLETE
        else:
            job.status = TestJob.INCOMPLETE
    elif job.status == TestJob.CANCELING:
        job.status = TestJob.CANCELED
    else:
        # Bail out without saving anything: the job is not in a state
        # that this completion can apply to.
        self.logger.error(
            "Unexpected job state in jobCompleted: %s, probably we are trying job completion for a different job",
            job.status)
        return

    self.logger.debug('changed job status to %s',
                      TestJob.STATUS_CHOICES[job.status][1])

    if job.health_check:
        device.last_health_report_job = job
        self.logger.debug("old device health status %s",
                          Device.HEALTH_CHOICES[device.health_status][1])
        if device.health_status != Device.HEALTH_LOOPING:
            if job.status == TestJob.INCOMPLETE:
                device.health_status = Device.HEALTH_FAIL
                self.logger.debug(
                    "taking %s offline, failed health check job %s",
                    device.hostname, job_id)
                device.put_into_maintenance_mode(
                    None, "Health Check Job Failed")
                # update the local variable to track the effect of the
                # external function call
                new_device_status = device.status
                if new_device_status == Device.OFFLINING:
                    # offlining job is complete.
                    new_device_status = Device.OFFLINE
            elif job.status == TestJob.COMPLETE:
                device.health_status = Device.HEALTH_PASS
                if old_device_status == Device.RUNNING:
                    new_device_status = Device.IDLE
            device.save()
        self.logger.debug("new device health status %s",
                          Device.HEALTH_CHOICES[device.health_status][1])

    if job.output_dir:
        bundle_file = os.path.join(job.output_dir, 'result-bundle')
        if os.path.exists(bundle_file):
            with open(bundle_file) as f:
                results_link = f.read().strip()
            job._results_link = results_link
            # The last path component of the link is used as the
            # bundle's content sha1.
            sha1 = results_link.strip('/').split('/')[-1]
            try:
                bundle = Bundle.objects.get(content_sha1=sha1)
            except Bundle.DoesNotExist:
                pass
            else:
                job._results_bundle = bundle
                device.device_version = _get_device_version(
                    job.results_bundle)
    else:
        self.logger.warning("[%d] lacked a usable output_dir", job.id)

    self.logger.debug('new device status %s, job state %s',
                      Device.STATUS_CHOICES[new_device_status][1],
                      TestJob.STATUS_CHOICES[job.status][1])

    job.end_time = timezone.now()
    job.submit_token = None
    device.current_job = None

    msg = "Job %s completed" % job.display_id
    device.state_transition_to(new_device_status, message=msg, job=job)
    self._commit_transaction(src='%s state' % device.hostname)

    device.save()
    job.save()
    self._commit_transaction(src='jobCompleted_impl')
    self.logger.info('job %s completed on %s', job.id, device.hostname)

    if utils.is_master():
        try:
            job.send_summary_mails()
        except Exception:
            # Better to catch all exceptions here and log it than have this
            # method fail. SystemExit and KeyboardInterrupt still propagate.
            self.logger.exception(
                'sending job summary mails for job %r failed', job.pk)
    else:
        worker = WorkerData()
        worker.notify_on_incomplete(job.id)
def jobCompleted_impl(self, board_name, exit_code, kill_reason):
    """
    Record the completion of the job currently running on a device.

    Looks the job up through ``device.current_job``, derives the final
    job status from ``exit_code`` (or from a pending cancellation), moves
    the device to its next status, updates the device health when the job
    was a health check, links the result bundle named in the job's output
    directory, then saves and commits both records before sending
    notifications and deleting the job's submit token.

    :param board_name: hostname of the Device the job ran on.
    :param exit_code: exit code of the job; 0 turns a RUNNING job into
        COMPLETE, anything else into INCOMPLETE.
    :param kill_reason: not used by this method.
    :return: None
    """
    self.logger.debug('marking job as complete on %s', board_name)
    device = Device.objects.get(hostname=board_name)
    old_device_status = device.status
    new_device_status = None
    previous_state = device.previous_state()
    MAX_RETRIES = 3

    if old_device_status == Device.RUNNING:
        new_device_status = previous_state
    elif old_device_status == Device.OFFLINING:
        new_device_status = Device.OFFLINE
    elif old_device_status == Device.RESERVED:
        new_device_status = previous_state
    else:
        self.logger.error(
            "Unexpected device state in jobCompleted: %s", device.status)
        new_device_status = Device.IDLE
    # Fall back to IDLE if the branches above left the status as None
    # (only possible when previous_state is None).
    if new_device_status is None:
        new_device_status = Device.IDLE
    job = device.current_job

    # Temporary devices should be marked as RETIRED once the job is
    # complete or canceled.
    if job.is_vmgroup:
        try:
            if device.temporarydevice:
                new_device_status = Device.RETIRED
        except TemporaryDevice.DoesNotExist:
            self.logger.debug("%s is not a tmp device", device.hostname)

    device.current_job = None
    if job.status == TestJob.RUNNING:
        if exit_code == 0:
            job.status = TestJob.COMPLETE
        else:
            job.status = TestJob.INCOMPLETE
    elif job.status == TestJob.CANCELING:
        job.status = TestJob.CANCELED
    else:
        self.logger.error(
            "Unexpected job state in jobCompleted: %s", job.status)
        job.status = TestJob.COMPLETE

    msg = "Job %s completed" % job.display_id
    device.state_transition_to(new_device_status, message=msg, job=job)

    if job.health_check:
        device.last_health_report_job = job
        if device.health_status != Device.HEALTH_LOOPING:
            if job.status == TestJob.INCOMPLETE:
                device.health_status = Device.HEALTH_FAIL
                device.put_into_maintenance_mode(
                    None, "Health Check Job Failed")
            elif job.status == TestJob.COMPLETE:
                device.health_status = Device.HEALTH_PASS

    bundle_file = os.path.join(job.output_dir, 'result-bundle')
    if os.path.exists(bundle_file):
        with open(bundle_file) as f:
            results_link = f.read().strip()
        job._results_link = results_link
        # The last path component of the link is used as the bundle's
        # content sha1.
        sha1 = results_link.strip('/').split('/')[-1]
        try:
            bundle = Bundle.objects.get(content_sha1=sha1)
        except Bundle.DoesNotExist:
            pass
        else:
            job._results_bundle = bundle

    # Read the device version only after the bundle lookup above, so a
    # bundle linked by this call is taken into account.
    # NOTE(review): assumes job.results_bundle reflects the
    # _results_link/_results_bundle attributes set above - confirm.
    device.device_version = _get_device_version(job.results_bundle)

    job.end_time = datetime.datetime.utcnow()
    token = job.submit_token
    job.submit_token = None
    device.save()
    job.save()

    # notification needs to have the correct status in the database
    for _attempt in range(MAX_RETRIES):
        try:
            transaction.commit()
        except TransactionRollbackError as err:
            self.logger.warning(
                'Retrying %s job completion ... %s', job.id, err)
        else:
            self.logger.debug('%s job completed and status saved', job.id)
            break
    else:
        # Every attempt was rolled back; make the failure visible instead
        # of silently carrying on.
        self.logger.error(
            '%s job completion could not be committed after %d attempts',
            job.id, MAX_RETRIES)

    if utils.is_master():
        try:
            job.send_summary_mails()
        except Exception:
            # Better to catch all exceptions here and log it than have this
            # method fail. SystemExit and KeyboardInterrupt still propagate.
            self.logger.exception(
                'sending job summary mails for job %r failed', job.pk)
    else:
        worker = WorkerData()
        worker.notify_on_incomplete(job.id)

    # need the token for the XMLRPC, so it is only deleted now; a job
    # without a submit token has nothing to delete.
    if token is not None:
        token.delete()
def jobCompleted_impl(self, job_id, board_name, exit_code, kill_reason):
    """
    Record the completion of a job and release the device it ran on.

    Derives the final job status from ``exit_code`` (or from a pending
    cancellation), works out the device's next status, updates the device
    health when the job was a health check, links the result bundle named
    in the job's output directory, then saves and commits both records
    before sending notifications.

    :param job_id: id of the TestJob that finished; when falsy the call
        is logged and ignored.
    :param board_name: hostname of the Device the job ran on.
    :param exit_code: exit code of the job; 0 turns a RUNNING job into
        COMPLETE, anything else into INCOMPLETE.
    :param kill_reason: not used by this method.
    :return: None
    """
    if not job_id:
        self.logger.debug('job completion called without a job id on %s',
                          board_name)
        return
    job = TestJob.objects.get(id=job_id)

    self.logger.debug('marking job as complete on %s', board_name)
    device = Device.objects.get(hostname=board_name)
    old_device_status = device.status
    self.logger.debug('old device status %s, job state %s',
                      Device.STATUS_CHOICES[old_device_status][1],
                      TestJob.STATUS_CHOICES[job.status][1])

    # Every branch assigns a status, so no None fallback is needed below.
    if old_device_status == Device.RUNNING:
        new_device_status = Device.IDLE
    elif old_device_status == Device.OFFLINING:
        new_device_status = Device.OFFLINE
    elif old_device_status == Device.RESERVED:
        new_device_status = Device.IDLE
    else:
        self.logger.error(
            "Unexpected device state in jobCompleted: %s", device.status)
        new_device_status = Device.IDLE

    self.logger.debug('new device status %s, job state %s',
                      Device.STATUS_CHOICES[new_device_status][1],
                      TestJob.STATUS_CHOICES[job.status][1])

    # Temporary devices should be marked as RETIRED once the job is
    # complete or canceled.
    if job.is_vmgroup:
        try:
            if device.temporarydevice:
                new_device_status = Device.RETIRED
                device.current_job = None
        except TemporaryDevice.DoesNotExist:
            self.logger.debug("%s is not a tmp device", device.hostname)

    if job.status == TestJob.RUNNING:
        if exit_code == 0:
            job.status = TestJob.COMPLETE
        else:
            job.status = TestJob.INCOMPLETE
    elif job.status == TestJob.CANCELING:
        job.status = TestJob.CANCELED
    else:
        # Bail out without saving anything: the job is not in a state
        # that this completion can apply to.
        self.logger.error(
            "Unexpected job state in jobCompleted: %s, probably we are trying job completion for a different job",
            job.status)
        return

    self.logger.debug('changed job status to %s',
                      TestJob.STATUS_CHOICES[job.status][1])

    if job.health_check:
        device.last_health_report_job = job
        self.logger.debug("old device health status %s",
                          Device.HEALTH_CHOICES[device.health_status][1])
        if device.health_status != Device.HEALTH_LOOPING:
            if job.status == TestJob.INCOMPLETE:
                device.health_status = Device.HEALTH_FAIL
                self.logger.debug(
                    "taking %s offline, failed health check job %s",
                    device.hostname, job_id)
                device.put_into_maintenance_mode(
                    None, "Health Check Job Failed")
                # update the local variable to track the effect of the
                # external function call
                new_device_status = device.status
                if new_device_status == Device.OFFLINING:
                    # offlining job is complete.
                    new_device_status = Device.OFFLINE
            elif job.status == TestJob.COMPLETE:
                device.health_status = Device.HEALTH_PASS
                if old_device_status == Device.RUNNING:
                    new_device_status = Device.IDLE
            device.save()
        self.logger.debug("new device health status %s",
                          Device.HEALTH_CHOICES[device.health_status][1])

    if job.output_dir:
        bundle_file = os.path.join(job.output_dir, 'result-bundle')
        if os.path.exists(bundle_file):
            with open(bundle_file) as f:
                results_link = f.read().strip()
            job._results_link = results_link
            # The last path component of the link is used as the
            # bundle's content sha1.
            sha1 = results_link.strip('/').split('/')[-1]
            try:
                bundle = Bundle.objects.get(content_sha1=sha1)
            except Bundle.DoesNotExist:
                pass
            else:
                job._results_bundle = bundle
                device.device_version = _get_device_version(
                    job.results_bundle)
    else:
        self.logger.warning("[%d] lacked a usable output_dir", job.id)

    self.logger.debug('new device status %s, job state %s',
                      Device.STATUS_CHOICES[new_device_status][1],
                      TestJob.STATUS_CHOICES[job.status][1])

    job.end_time = timezone.now()
    job.submit_token = None
    device.current_job = None

    msg = "Job %s completed" % job.display_id
    device.state_transition_to(new_device_status, message=msg, job=job)
    self._commit_transaction(src='%s state' % device.hostname)

    device.save()
    job.save()
    self._commit_transaction(src='jobCompleted_impl')
    self.logger.info('job %s completed on %s', job.id, device.hostname)

    if utils.is_master():
        try:
            job.send_summary_mails()
        except Exception:
            # Better to catch all exceptions here and log it than have this
            # method fail. SystemExit and KeyboardInterrupt still propagate.
            self.logger.exception(
                'sending job summary mails for job %r failed', job.pk)
    else:
        worker = WorkerData()
        worker.notify_on_incomplete(job.id)
def jobCompleted_impl(self, board_name, exit_code, kill_reason):
    """
    Record the completion of the job currently running on a device.

    Looks the job up through ``device.current_job``, derives the final
    job status from ``exit_code`` (or from a pending cancellation), moves
    the device to its next status, updates the device health when the job
    was a health check, links the result bundle named in the job's output
    directory, then saves and commits both records before sending
    notifications and deleting the job's submit token.

    :param board_name: hostname of the Device the job ran on.
    :param exit_code: exit code of the job; 0 turns a RUNNING job into
        COMPLETE, anything else into INCOMPLETE.
    :param kill_reason: not used by this method.
    :return: None
    """
    self.logger.debug("marking job as complete on %s", board_name)
    device = Device.objects.get(hostname=board_name)
    old_device_status = device.status
    new_device_status = None
    previous_state = device.previous_state()
    MAX_RETRIES = 3

    if old_device_status == Device.RUNNING:
        new_device_status = previous_state
    elif old_device_status == Device.OFFLINING:
        new_device_status = Device.OFFLINE
    elif old_device_status == Device.RESERVED:
        new_device_status = previous_state
    else:
        self.logger.error(
            "Unexpected device state in jobCompleted: %s", device.status)
        new_device_status = Device.IDLE
    # Fall back to IDLE if the branches above left the status as None
    # (only possible when previous_state is None).
    if new_device_status is None:
        new_device_status = Device.IDLE
    job = device.current_job

    # Temporary devices should be marked as RETIRED once the job is
    # complete or canceled.
    if job.is_vmgroup:
        try:
            if device.temporarydevice:
                new_device_status = Device.RETIRED
        except TemporaryDevice.DoesNotExist:
            self.logger.debug("%s is not a tmp device", device.hostname)

    device.current_job = None
    if job.status == TestJob.RUNNING:
        if exit_code == 0:
            job.status = TestJob.COMPLETE
        else:
            job.status = TestJob.INCOMPLETE
    elif job.status == TestJob.CANCELING:
        job.status = TestJob.CANCELED
    else:
        self.logger.error(
            "Unexpected job state in jobCompleted: %s", job.status)
        job.status = TestJob.COMPLETE

    msg = "Job %s completed" % job.display_id
    device.state_transition_to(new_device_status, message=msg, job=job)

    if job.health_check:
        device.last_health_report_job = job
        if device.health_status != Device.HEALTH_LOOPING:
            if job.status == TestJob.INCOMPLETE:
                device.health_status = Device.HEALTH_FAIL
                device.put_into_maintenance_mode(
                    None, "Health Check Job Failed")
            elif job.status == TestJob.COMPLETE:
                device.health_status = Device.HEALTH_PASS

    bundle_file = os.path.join(job.output_dir, "result-bundle")
    if os.path.exists(bundle_file):
        with open(bundle_file) as f:
            results_link = f.read().strip()
        job._results_link = results_link
        # The last path component of the link is used as the bundle's
        # content sha1.
        sha1 = results_link.strip("/").split("/")[-1]
        try:
            bundle = Bundle.objects.get(content_sha1=sha1)
        except Bundle.DoesNotExist:
            pass
        else:
            job._results_bundle = bundle

    # Read the device version only after the bundle lookup above, so a
    # bundle linked by this call is taken into account.
    # NOTE(review): assumes job.results_bundle reflects the
    # _results_link/_results_bundle attributes set above - confirm.
    device.device_version = _get_device_version(job.results_bundle)

    job.end_time = datetime.datetime.utcnow()
    token = job.submit_token
    job.submit_token = None
    device.save()
    job.save()

    # notification needs to have the correct status in the database
    for _attempt in range(MAX_RETRIES):
        try:
            transaction.commit()
        except TransactionRollbackError as err:
            self.logger.warning(
                "Retrying %s job completion ... %s", job.id, err)
        else:
            self.logger.debug("%s job completed and status saved", job.id)
            break
    else:
        # Every attempt was rolled back; make the failure visible instead
        # of silently carrying on.
        self.logger.error(
            "%s job completion could not be committed after %d attempts",
            job.id, MAX_RETRIES)

    if utils.is_master():
        try:
            job.send_summary_mails()
        except Exception:
            # Better to catch all exceptions here and log it than have this
            # method fail. SystemExit and KeyboardInterrupt still propagate.
            self.logger.exception(
                "sending job summary mails for job %r failed", job.pk)
    else:
        worker = WorkerData()
        worker.notify_on_incomplete(job.id)

    # need the token for the XMLRPC, so it is only deleted now; a job
    # without a submit token has nothing to delete.
    if token is not None:
        token.delete()