def __init__(self, db_obj, sqs_scanner_queue, sqs_worker_queue, emailer):
    """
    :param db_obj: dynamodb table of scheduled jobs
    :type db_obj: boto.dynamodb2.table.Table
    :param sqs_scanner_queue: scanner queue on which feedback messages are sent
    :type sqs_scanner_queue: boto.sqs.queue.Queue
    :param sqs_worker_queue: worker queue from which jobs are received
    :type sqs_worker_queue: boto.sqs.queue.Queue
    :param emailer: emailer used to notify job owners of status changes
    """
    self.db = db_obj
    self.scanner_queue = sqs_scanner_queue
    self.worker_queue = sqs_worker_queue
    self._should_run = True
    self._run_once = False
    self.worker_keepalive_sec = staticconf.read_int('scanner.worker_keepalive_sec')
    self.max_error_retries = staticconf.read_int('max_error_retries')
    self.msg_max_retention_sec = int(
        self.scanner_queue.get_queue_attributes()['MessageRetentionPeriod']
    )
    self.msg_max_retention_sec += 3600  # give SQS enough time to delete the message
    self.emailer = emailer
    log("scanner initialization")
    log(dict((k, str(v)) for k, v in vars(self).iteritems()))

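# The two staticconf reads above assume configuration has been loaded before
# the scanner is constructed. A minimal sketch of that precondition (the
# values below are examples only, not the project's real settings):
#
#     import staticconf
#
#     staticconf.DictConfiguration({
#         'scanner.worker_keepalive_sec': 600,  # example value
#         'max_error_retries': 3,               # example value
#     })
#     assert staticconf.read_int('scanner.worker_keepalive_sec') == 600
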
def _maint_error_jobs(self, now):
    retry_ts_field = 'et_next_error_retry_attempt'
    num_retries_field = 'et_num_error_retries'
    jobs = self.db.get_jobs_with_et_status(JOBS_ETL_STATUS_ERROR)
    for job in jobs:
        kwargs = {retry_ts_field: None, num_retries_field: None}
        fields = job.get(**kwargs)
        retry_ts_value = fields.get(retry_ts_field, False) or str(now)
        num_retries = fields.get(num_retries_field, None)
        # for legacy error jobs without a retry count, assume
        # max_error_retries to avoid auto-restarting them
        num_retries = int(num_retries) if num_retries is not None else self.max_error_retries
        next_retry = datetime.strptime(retry_ts_value, '%Y-%m-%d %H:%M:%S.%f')
        if next_retry <= now and num_retries < self.max_error_retries:
            log('attempt {0}/{1} to re-run job {2}'.format(
                num_retries + 1, self.max_error_retries, job2log(job)
            ))
            self._update_job_status(job, JOBS_ETL_STATUS_EMPTY)
        if self._action_pending(job):
            # push the job back to the worker so pending actions like
            # cancel/pause/delete can be handled
            self._update_job_status(job, JOBS_ETL_STATUS_EMPTY)

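# _maint_error_jobs (and the other _maint_* methods below) parse timestamps
# that were stored via str(datetime.utcnow()). A quick illustration of the
# round-trip. One caveat: str() drops the '.%f' part when microsecond == 0,
# in which case strptime with this format raises ValueError, so stored
# timestamps are assumed to carry microseconds.
#
#     from datetime import datetime
#
#     ts = datetime(2015, 6, 1, 12, 0, 0, 1)
#     assert str(ts) == '2015-06-01 12:00:00.000001'
#     assert datetime.strptime(str(ts), '%Y-%m-%d %H:%M:%S.%f') == ts
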
def _schedule_runs_lk(self, et_pool, job):
    """
    Schedule runs to execute up to the maximum possible parallelism.
    The suffix '_lk' means the caller must already hold the lock.

    :param et_pool: a multiprocessing pool handle
    :type et_pool: Pool
    :param job: current job
    :type job: WorkerJob
    """
    while (self._has_more_runs_to_schedule(job) and
           job.runs_in_flight < job.max_runs_in_flight):
        run = job.schedule_next_run()
        if run.id is None:
            raise ValueError("Unexpected end of runs")
        self.etl_helper.etl_step_started(job.msg_dict, run.id, run.step)
        log('scheduled: {0}'.format(run.id))
        et_pool.apply_async(
            run.func,
            args=run.args,
            callback=self._create_run_complete_callback(job, run.id, run.step),
        )
        job.runs_in_flight += 1

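# The callback factory used above is not shown in this section; this is a
# sketch of its likely shape (an assumption, not the confirmed
# implementation). It closes over the job/run metadata so that
# Pool.apply_async can invoke a single-argument callback.
def _create_run_complete_callback(self, job, run_id, step):
    def callback(results):
        # delegates to _run_complete_callback, defined later in this section
        self._run_complete_callback(job, run_id, step, results)
    return callback
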
def _fetch_jobs_for_work(self):
    # query self.db and return jobs in any schedulable status
    statuses = JOBS_ETL_STATUSES_SCHEDULABLE
    result = [
        job
        for s in statuses
        for job in self.db.get_jobs_with_et_status(s)
    ]
    log("ET scanner: num jobs to process from DB: " + str(len(result)))
    return result

def _handle_cancel_request_lk(self, job, et_pool):
    """
    Cancel the currently executing job: send SIGINT to all children and
    ignore SIGINT in the parent to avoid killing it.
    Note: the suffix '_lk' means the caller must already hold the lock.

    :param job: current job
    :param et_pool: pool executing the job's runs
    """
    log("cancel detected, terminating")
    if job.runs_in_flight != 0:
        log("sending cancel request")
        et_pool.cancel()

def _maint_running_jobs(self, now):
    status_ts_field = 'et_status_last_updated_at'
    jobs = self.db.get_jobs_with_et_status(JOBS_ETL_STATUS_RUNNING)
    for job in jobs:
        kwargs = {status_ts_field: None}
        status_ts_value = job.get(**kwargs).get(status_ts_field, False) or str(now)
        last_update = datetime.strptime(status_ts_value, '%Y-%m-%d %H:%M:%S.%f')
        if now - last_update > timedelta(seconds=self.worker_keepalive_sec):
            log('found stuck running job {0}, resetting...'.format(job2log(job)))
            self._update_job_status(job, JOBS_ETL_STATUS_EMPTY)

def _maint_scheduled_jobs(self, now):
    status_ts_field = 'et_status_last_updated_at'
    jobs = self.db.get_jobs_with_et_status(JOBS_ETL_STATUS_SCHEDULED)
    for job in jobs:
        kwargs = {status_ts_field: None}
        status_ts_value = job.get(**kwargs).get(status_ts_field, False) or str(now)
        last_update = datetime.strptime(status_ts_value, '%Y-%m-%d %H:%M:%S.%f')
        if now - last_update > timedelta(seconds=self.msg_max_retention_sec):
            log('found stuck scheduled job {0}, resetting...'.format(job2log(job)))
            self._update_job_status(job, JOBS_ETL_STATUS_EMPTY)

def _create_work_for_job(self, job):
    # update status to SCHEDULED iff the current status is not
    # SCHEDULED/COMPLETE/ERROR
    job_dict = job.get(**self.DEFAULT_KEYS_TO_FETCH_FROM_JOB)
    if not self._update_job_status_conditionally(job):
        log("ET: could not conditionally update job to status 'scheduled'."
            " Skipping this job this time: " + str(job))
        return
    start_date = self._later_date(
        job_dict['start_date'],
        next_date_for_string(job_dict['et_last_successful_date']))
    end_date = self._earlier_date(job_dict['end_date'],
                                  self._get_max_available_date(job))
    self._enqueue_work_in_sqs(start_date, end_date, "ET", job_dict)
    log("Enqueued ET work item in SQS, for job: " + str(job_dict))

def run_scanner(self):
    """
    Fetch relevant jobs from the DB table and create ET or L work.
    Does not let any exceptions propagate.
    """
    jobs = self._fetch_jobs_for_work()
    for job in jobs:
        try:
            if self._action_pending(job) or self._should_process_job(job):
                self._create_work_for_job(job)
            else:
                log("Skipping job since there was no processing needed: {0}".format(
                    job2log(job)
                ))
        except Exception:
            log_exception(
                "Caught an exception in processing a job. Ignoring"
                " entry: {0}".format(job2log(job)))

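# How run_scanner is driven is not shown in this section; the following is a
# plausible main loop, as a sketch only. The _maintenance and
# _wait_for_feedback helpers are hypothetical names; _should_run and
# _run_once are set in __init__ above.
def run(self):
    while self._should_run:
        self._maintenance(datetime.utcnow())  # hypothetical: reset stuck/error jobs
        self.run_scanner()
        self._wait_for_feedback()             # hypothetical: block on scanner_queue
        if self._run_once:
            break
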
def __init__(self, config_loc, config_override_loc, run_local, emailer,
             dummy_run=False):
    super(ImdWorker, self).__init__(
        config_loc,
        config_override_loc,
        emailer,
        num_processes=3,
    )
    for key in self.KEYS_TO_LOAD:
        setattr(self, key, staticconf.read_string(key))
    if dummy_run:
        log("Dummy worker! Skip the real etl process. Just for test.")
        import mycroft.backend.worker.fake_ingest_multiple_dates as ingest_multiple_dates
    else:
        import sherlock.batch.ingest_multiple_dates as ingest_multiple_dates
    self._should_run_local = run_local
    self.dummy_run = dummy_run
    self.ingest_multiple_dates = ingest_multiple_dates.ingest_multiple_dates_main
    self.queue_name = staticconf.get_string("sqs.et_queue_name")
    self.scanner_queue_name = staticconf.get_string("sqs.et_scanner_queue_name")
    log("ImdWorker initialization")
    log(dict((k, str(v)) for k, v in vars(self).iteritems()))

def _run_complete_callback(self, job, run_id, step, results):
    """
    Callback invoked when a single run is done.

    :param job: current job
    :type job: WorkerJob
    :param run_id: a unique run identifier
    :type run_id: string
    :param step: step associated with the run execution
    :type step: string
    :param results: run results
    :type results: list
    """
    self._cond.acquire()
    try:
        if len(results) != 1:
            raise ValueError("len(results) != 1, {0}".format(results))
        log("done: {0}, {1}, {2}".format(run_id, step, results[0]['status']))
        job.all_results[run_id].extend(results)
        job.runs_done += 1
        job.runs_in_flight -= 1
        if job.runs_in_flight < 0:
            raise ValueError(
                "runs_in_flight < 0 ({0} < 0)".format(job.runs_in_flight))
        if job.is_waiting is True:
            self._cond.notify(n=1)
        self.etl_helper.etl_step_complete(job.msg_dict, run_id, step, results[0])
        job.run_complete(run_id, step, results)
    except:  # intentionally broad: if the callback dies, et_pool stops working
        log_exception('_run_complete_callback')
    finally:
        self._cond.release()

def _maint_paused_jobs(self, now):
    lsd_field = 'et_last_successful_date'
    kwargs = dict((action, None) for action in JOBS_ETL_ACTIONS)
    kwargs[lsd_field] = None
    jobs = self.db.get_jobs_with_et_status(JOBS_ETL_STATUS_PAUSED)
    for job in jobs:
        # if a key is set to None or does not exist, assume it is set to 1
        # to avoid resuming non-conforming jobs
        result_dict = job.get(**kwargs)
        pause_requested = result_dict.get('pause_requested', 1)
        cancel_requested = result_dict.get('cancel_requested', 1)
        lsd = result_dict.get(lsd_field, None)
        if cancel_requested != 0:
            log('cancel requested for paused job {0}, resetting...'.format(job2log(job)))
            self._update_job_status(job, JOBS_ETL_STATUS_EMPTY)
        elif pause_requested == 0:
            # the job is paused but pause_requested is 0 => resume to IDLE,
            # or to SUCCESS if it cannot resume right away
            ready_to_process = self._should_process_job(job)
            new_job_status = JOBS_ETL_STATUS_EMPTY \
                if ready_to_process or lsd is None else JOBS_ETL_STATUS_SUCCESS
            log('resetting paused job {0} to {1}'.format(job2log(job), new_job_status))
            self._update_job_status(job, new_job_status)
            if ready_to_process:
                additional_info = "Job will start immediately."
            else:
                additional_info = "Job will start when new data is available."
            job_dict = job.get(**self.DEFAULT_KEYS_TO_FETCH_FROM_JOB)
            if job_dict['contact_emails'] is not None:
                job_dict['contact_emails'] = list(job_dict['contact_emails'])
            try:
                self.emailer.mail_result("resumed", job_dict, additional_info)
                log("Sent emails to: {0}".format(job_dict['contact_emails']))
            except Exception:
                log_exception("Exception in sending emails of job: " + str(job_dict))

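# The emailer dependency's interface, inferred from its two call sites in
# this section (a sketch; the real class is not shown here):
class Emailer(object):
    def mail_result(self, result, job_dict, additional_info=None):
        """Notify job_dict['contact_emails'] that the job reached `result`,
        e.g. 'resumed' or a JOBS_ETL_STATUS_* value."""
        raise NotImplementedError
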
def run(self):
    """Main entry point for the worker. Queries an SQS queue for messages
    and performs the appropriate action on each message received.
    Swallows all exceptions and logs them.
    """
    queue_name = str(self._get_queue_name())
    sqs = self._get_sqs_wrapper(queue_name, JSONMessage)
    scanner_queue_name = str(self._get_scanner_queue_name())
    scanner_sqs = self._get_sqs_wrapper(scanner_queue_name, JSONMessage)
    dummy_message = {"message": "dummy"}  # TODO: make this message meaningful

    while not self._stop_requested:  # loop until a stop is requested
        try:  # main try-except
            for msg in sqs.get_messages_from_queue():
                msg_body = msg.get_body()
                log({
                    "status": "new message",
                    "queue": queue_name,
                    "msg": msg_body,
                })
                results = None
                final_status = JOBS_ETL_STATUS_ERROR
                lsd = None
                extra_info = None  # stays None if _process_msg raises
                try:
                    self._update_scheduled_jobs_on_etl_start(msg_body)
                    # safe to delete the message: if the worker dies, the
                    # scanner will resubmit it
                    sqs.delete_message_from_queue(msg)

                    try:
                        # execute etl
                        results, action_dict = self._process_msg(msg)
                        # parse results
                        final_status, lsd, extra_info = \
                            parse_results(results, msg_body['end_date'])
                        if final_status != JOBS_ETL_STATUS_COMPLETE:
                            if action_dict['delete_requested']:
                                final_status = JOBS_ETL_STATUS_DELETED
                            elif action_dict['cancel_requested']:
                                final_status = JOBS_ETL_STATUS_CANCELLED
                            elif action_dict['pause_requested']:
                                final_status = JOBS_ETL_STATUS_PAUSED
                        log({
                            "status": "processed message OK",
                            "queue": queue_name,
                            "msg": msg_body,
                            "results": results,
                            "job status": final_status,
                            "last OK date": lsd,
                        })
                    except Exception:
                        final_status = JOBS_ETL_STATUS_ERROR
                        log_exception(
                            "Exception in processing msg from queue: " +
                            queue_name + " msg body: " + str(msg_body))

                    if final_status != JOBS_ETL_STATUS_DELETED:
                        self._update_scheduled_jobs_on_etl_complete(
                            msg_body, final_status, lsd)
                    scanner_sqs.write_message_to_queue(dummy_message)

                    try:
                        self.emailer.mail_result(
                            final_status, msg_body, additional_info=extra_info)
                        log("Sent emails to: " + str(msg_body['contact_emails']))
                    except Exception:
                        log_exception(
                            "Exception in sending emails of job: " + str(msg_body))
                except Exception:
                    log_exception(
                        "Failed to update scheduled jobs on etl"
                        " start/complete, msg body: " + str(msg_body))
        except Exception:  # end of main try-except
            log_exception(
                "Exception in fetching messages from queue: " + queue_name)
            # if the sqs queue fails, throttle the retry
            time.sleep(sqs.get_wait_time())
        if self._run_once:
            break

    self._stop_requested = False

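# The run loop above exits when _stop_requested is set; one plausible way to
# set it is from a signal handler (a sketch; the worker's real shutdown
# wiring is not shown in this section):
#
#     import signal
#
#     def install_shutdown_handler(worker):
#         def handler(signum, frame):
#             worker._stop_requested = True  # run() exits after the current batch
#         signal.signal(signal.SIGTERM, handler)
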
def _update_job_status(self, job, status_value):
    kwargs = {"et_status": status_value}
    if job.update(**kwargs) is not True:
        log("Could not update {0} for job {1}".format(kwargs, job2log(job)))