Example #1
    def __init__(self, db_obj, sqs_scanner_queue, sqs_worker_queue, emailer):
        """
        :param db_obj: dynamodb table of scheduled jobs
        :type db_obj: boto.dynamodb2.table.Table

        :param sqs_scanner_queue: scanner queue to which feedback messages are sent
        :type sqs_scanner_queue: boto.sqs.queue.Queue

        :param sqs_worker_queue: worker queue from which jobs are received
        :type sqs_worker_queue: boto.sqs.queue.Queue

        :param emailer: emailer used to send job status notifications
        """
        self.db = db_obj
        self.scanner_queue = sqs_scanner_queue
        self.worker_queue = sqs_worker_queue
        self._should_run = True
        self._run_once = False
        self.worker_keepalive_sec = staticconf.read_int('scanner.worker_keepalive_sec')
        self.max_error_retries = staticconf.read_int('max_error_retries')
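        # msg_max_retention_sec bounds how long a SCHEDULED job may wait on
        # its SQS message; _maint_scheduled_jobs resets jobs stuck longer.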
        self.msg_max_retention_sec = int(
            self.scanner_queue.get_queue_attributes()['MessageRetentionPeriod']
        )
        self.msg_max_retention_sec += 3600  # give SQS enough time to delete the message
        self.emailer = emailer

        log("scanner initialization")
        log(dict((k, str(v)) for k, v in vars(self).iteritems()))
Example #2
    def _maint_error_jobs(self, now):
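        # Requeue ERROR jobs whose next retry time has passed, up to
        # max_error_retries attempts.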
        retry_ts_field = 'et_next_error_retry_attempt'
        num_retries_field = 'et_num_error_retries'
        jobs = self.db.get_jobs_with_et_status(JOBS_ETL_STATUS_ERROR)
        for job in jobs:
            kwargs = {retry_ts_field: None, num_retries_field: None}
            fields = job.get(**kwargs)

            retry_ts_value = fields.get(retry_ts_field, False) or str(now)
            num_retries = fields.get(num_retries_field, None)

            # for legacy error jobs, set to max_error_retries to avoid auto-start
            num_retries = int(num_retries) if num_retries is not None else self.max_error_retries

            next_retry = datetime.strptime(retry_ts_value, '%Y-%m-%d %H:%M:%S.%f')
            if next_retry <= now and num_retries < self.max_error_retries:
                log('attempt {0}/{1} to re-run job {2}'.format(
                    num_retries + 1, self.max_error_retries, job2log(job)
                ))
                self._update_job_status(job, JOBS_ETL_STATUS_EMPTY)

            if self._action_pending(job):
                # push job to worker for actions like cancel/pause/delete
                self._update_job_status(job, JOBS_ETL_STATUS_EMPTY)
Example #3
    def _schedule_runs_lk(self, et_pool, job):
        """ Schedule runs to execute up to max possible parallelism
        suffix '_lk' means caller must already hold lock.

        :param et_pool: A multiprocessor pool handle
        :type: Pool
        :param job: current job
        :type: WorkerJob
        """
        while (self._has_more_runs_to_schedule(job)
               and job.runs_in_flight < job.max_runs_in_flight):
            run = job.schedule_next_run()
            if run.id is None:
                raise ValueError("Unexpected end of runs")

            self.etl_helper.etl_step_started(job.msg_dict, run.id, run.step)

            log('scheduled: {0}'.format(run.id))
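            # Hand the run to the pool; the completion callback decrements
            # runs_in_flight and wakes any scheduler waiting on the lock.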
            et_pool.apply_async(
                run.func,
                args=run.args,
                callback=self._create_run_complete_callback(
                    job, run.id, run.step),
            )
            job.runs_in_flight += 1
Example #4
 def _fetch_jobs_for_work(self):
     # query self.db and return jobs
     statuses = JOBS_ETL_STATUSES_SCHEDULABLE
     result = [
         job for s in statuses for job in self.db.get_jobs_with_et_status(s)
     ]
     log("ET scanner: num jobs to process from DB:" + str(len(result)))
     return result
Example #5
    def _handle_cancel_request_lk(self, job, et_pool):
        """ Cancel currently executing job.  Send SIGINT to all children,
        ignore SIGINT in parent to prevent killing it.

        Note: suffix '_lk' means caller must already hold lock.

        :param et_pool: pool executing the job's runs
        :param job: current job
        """
        log("cancel detected, terminating")
        if job.runs_in_flight != 0:
            log("sending cancel request")
            et_pool.cancel()
Example #6
    def _maint_running_jobs(self, now):
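        # Reset RUNNING jobs whose status has not been refreshed within the
        # worker keepalive window; the owning worker has most likely died.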
        status_ts_field = 'et_status_last_updated_at'
        jobs = self.db.get_jobs_with_et_status(JOBS_ETL_STATUS_RUNNING)
        for job in jobs:
            kwargs = {status_ts_field: None}
            status_ts_value = job.get(**kwargs).get(status_ts_field, False) or str(now)

            last_update = datetime.strptime(status_ts_value, '%Y-%m-%d %H:%M:%S.%f')
            if now - last_update > timedelta(seconds=self.worker_keepalive_sec):
                log('found stuck running job {0}, resetting...'.format(job2log(job)))
                self._update_job_status(job, JOBS_ETL_STATUS_EMPTY)
Example #7
    def _maint_scheduled_jobs(self, now):
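        # Reset SCHEDULED jobs that outlived the padded SQS message
        # retention period; their work message can no longer be delivered.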
        status_ts_field = 'et_status_last_updated_at'
        jobs = self.db.get_jobs_with_et_status(JOBS_ETL_STATUS_SCHEDULED)
        for job in jobs:
            kwargs = {status_ts_field: None}
            status_ts_value = job.get(**kwargs).get(status_ts_field, False) or str(now)

            last_update = datetime.strptime(status_ts_value, '%Y-%m-%d %H:%M:%S.%f')
            if now - last_update > timedelta(seconds=self.msg_max_retention_sec):
                log('found stuck scheduled job {0}, resetting...'.format(job2log(job)))
                self._update_job_status(job, JOBS_ETL_STATUS_EMPTY)
Example #8
    def _create_work_for_job(self, job):
        # Update status = SCHEDULED
        # iff the status is not SCHEDULED/COMPLETE/ERROR
        job_dict = job.get(**self.DEFAULT_KEYS_TO_FETCH_FROM_JOB)
        if not self._update_job_status_conditionally(job):
            log("ET: Could not conditionally update job to status 'scheduled'"
                ".Skipping this job this time : " + str(job))
            return

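        # The work item spans from the day after the last successful ET date
        # (bounded by start_date) to the earlier of end_date and the newest
        # available data.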
        start_date = self._later_date(
            job_dict['start_date'],
            next_date_for_string(job_dict['et_last_successful_date']))
        end_date = self._earlier_date(job_dict['end_date'],
                                      self._get_max_available_date(job))
        self._enqueue_work_in_sqs(start_date, end_date, "ET", job_dict)
        log("Enqueued ET work item in SQS, for job: " + str(job_dict))
Example #9
 def run_scanner(self):
     """ Fetch relevant jobs from DB table and create ET or L work.
     Does not propagate exceptions.
     """
     jobs = self._fetch_jobs_for_work()
     for job in jobs:
         try:
             if self._action_pending(job) or self._should_process_job(job):
                 self._create_work_for_job(job)
             else:
                 log("Skipping job since there was no processing needed: {0}".format(
                     job2log(job)
                 ))
         except Exception:
             log_exception(
                 "Caught an exception in processing a job. Ignoring"
                 " entry: {0}".format(job2log(job)))
Example #10
    def __init__(self,
                 config_loc,
                 config_override_loc,
                 run_local,
                 emailer,
                 dummy_run=False):
        super(ImdWorker, self).__init__(
            config_loc,
            config_override_loc,
            emailer,
            num_processes=3,
        )
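        # Promote each configured key to an instance attribute.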
        for key in self.KEYS_TO_LOAD:
            setattr(self, key, staticconf.read_string(key))
        if dummy_run:
            log("Dummy worker! Skip the real etl process. Just for test.")
            import mycroft.backend.worker.fake_ingest_multiple_dates as ingest_multiple_dates
        else:
            import sherlock.batch.ingest_multiple_dates as ingest_multiple_dates
        self._should_run_local = run_local
        self.dummy_run = dummy_run
        self.ingest_multiple_dates = ingest_multiple_dates.ingest_multiple_dates_main
        self.queue_name = staticconf.get_string("sqs.et_queue_name")
        self.scanner_queue_name = staticconf.get_string(
            "sqs.et_scanner_queue_name")

        log("ImdWorker initialization")
        log(dict((k, str(v)) for k, v in vars(self).iteritems()))
Example #11
    def _run_complete_callback(self, job, run_id, step, results):
        """ Callback invoked when a single run is done

        :param job: current job
        :type job: WorkerJob
        :param run_id: a unique run identifier
        :type run_id: string
        :param step: step associated with the run execution
        :type step: string
        :param results: run results
        :type results: list
        """
        self._cond.acquire()
        try:
            if len(results) != 1:
                raise ValueError("len(results) != 1, {0}".format(results))
            log("done: {0}, {1}, {2}".format(run_id, step,
                                             results[0]['status']))

            job.all_results[run_id].extend(results)
            job.runs_done += 1
            job.runs_in_flight -= 1

            if job.runs_in_flight < 0:
                raise ValueError(
                    "runs_in_flight < 0 ({0} < 0)".format(job.runs_in_flight))

            if job.is_waiting is True:
                self._cond.notify(n=1)

            self.etl_helper.etl_step_complete(job.msg_dict, run_id, step,
                                              results[0])
            job.run_complete(run_id, step, results)
        except:
            # if callback dies, et_pool stops working
            log_exception('_run_complete_callback')
        finally:
            self._cond.release()
Example #12
    def _maint_paused_jobs(self, now):
        lsd_field = 'et_last_successful_date'
        kwargs = dict((action, None) for action in JOBS_ETL_ACTIONS)
        kwargs[lsd_field] = None
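        # Resume paused jobs once pause_requested is cleared; a pending
        # cancel resets the job instead.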

        jobs = self.db.get_jobs_with_et_status(JOBS_ETL_STATUS_PAUSED)
        for job in jobs:
            # if key is set to None or does not exist, assume it's set to 1
            # to avoid resuming non-conforming jobs.
            result_dict = job.get(**kwargs)
            pause_requested = result_dict.get('pause_requested', 1)
            cancel_requested = result_dict.get('cancel_requested', 1)
            lsd = result_dict.get(lsd_field, None)

            if cancel_requested != 0:
                log('cancel requested for paused job {0}, resetting...'.format(job2log(job)))
                self._update_job_status(job, JOBS_ETL_STATUS_EMPTY)
            elif pause_requested == 0:
                # job is paused but pause_requested is 0 => resume to IDLE,
                # or SUCCESS if can't resume right away
                ready_to_process = self._should_process_job(job)
                new_job_status = (JOBS_ETL_STATUS_EMPTY
                                  if ready_to_process or lsd is None
                                  else JOBS_ETL_STATUS_SUCCESS)
                log('resetting paused job {0} to {1}'.format(job2log(job), new_job_status))
                self._update_job_status(job, new_job_status)
                if ready_to_process:
                    additional_info = "Job will start immediately."
                else:
                    additional_info = "Job will start when new data is available."
                job_dict = job.get(**self.DEFAULT_KEYS_TO_FETCH_FROM_JOB)
                if job_dict['contact_emails'] is not None:
                    job_dict['contact_emails'] = list(job_dict['contact_emails'])
                try:
                    self.emailer.mail_result(
                        "resumed", job_dict, additional_info
                    )
                    log("Sent emails to: {0}".format(job_dict['contact_emails']))
                except Exception:
                    log_exception("Exception in sending emails of job:" +
                                  str(job_dict))
Example #13
    def run(self):
        """Main entry point for the worker. Queries an SQS queue for messages
        and performs the appropriate action on each message received.
        Swallows all exceptions and logs them.
        """
        queue_name = str(self._get_queue_name())
        sqs = self._get_sqs_wrapper(queue_name, JSONMessage)

        scanner_queue_name = str(self._get_scanner_queue_name())
        scanner_sqs = self._get_sqs_wrapper(scanner_queue_name, JSONMessage)
        dummy_message = {"message": "dummy"}  # TODO: make this message meaningful

        while not self._stop_requested:  # loop until a stop is requested
            try:  # Main try-except
                for msg in sqs.get_messages_from_queue():
                    msg_body = msg.get_body()
                    log({
                        "status": "new message",
                        "queue": queue_name,
                        "msg": msg_body,
                    })

                    results = None
                    final_status = JOBS_ETL_STATUS_ERROR
                    lsd = None
                    extra_info = None
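                    # Defaults assume failure: any exception below leaves the
                    # job in ERROR unless processing completes.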
                    try:
                        self._update_scheduled_jobs_on_etl_start(msg_body)
                        # safe to delete message. if worker dies, scanner will resubmit
                        sqs.delete_message_from_queue(msg)

                        try:
                            # Execute etl
                            results, action_dict = self._process_msg(msg)

                            # Parse results
                            final_status, lsd, extra_info = parse_results(
                                results, msg_body['end_date'])
                            if final_status != JOBS_ETL_STATUS_COMPLETE:
                                if action_dict['delete_requested']:
                                    final_status = JOBS_ETL_STATUS_DELETED
                                elif action_dict['cancel_requested']:
                                    final_status = JOBS_ETL_STATUS_CANCELLED
                                elif action_dict['pause_requested']:
                                    final_status = JOBS_ETL_STATUS_PAUSED

                            log({
                                "status": "processed message OK",
                                "queue": queue_name,
                                "msg": msg_body,
                                "results": results,
                                "job status": final_status,
                                "last OK date": lsd,
                            })
                        except Exception:
                            final_status = JOBS_ETL_STATUS_ERROR
                            log_exception(
                                "Exception in processing msg from queue: " +
                                queue_name + " msg body:" + str(msg_body)
                            )
                        if final_status != JOBS_ETL_STATUS_DELETED:
                            self._update_scheduled_jobs_on_etl_complete(
                                msg_body, final_status, lsd
                            )
                        scanner_sqs.write_message_to_queue(dummy_message)
                        try:
                            self.emailer.mail_result(
                                final_status, msg_body, additional_info=extra_info
                            )
                            log("Sent emails to: " +
                                str(msg_body['contact_emails']))
                        except Exception:
                            log_exception(
                                "Exception in sending emails of job:" +
                                str(msg_body)
                            )
                    except Exception:
                        log_exception(
                            "Failed to update scheduled jobs on etl"
                            " start/complete, msg body: " + str(msg_body)
                        )
            except Exception:  # end of main try-except
                log_exception(
                    "Exception in fetching messages from queue:"
                    + queue_name
                )
                # if sqs queue fails, throttle retry
                time.sleep(sqs.get_wait_time())
            if self._run_once:
                break

        self._stop_requested = False
Example #14
 def _update_job_status(self, job, status_value):
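     # Best-effort write: a failed update is logged rather than raised.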
     kwargs = {"et_status": status_value}
     if job.update(**kwargs) is not True:
         log("Could not update {0} for job {1}".format(kwargs, job2log(job)))