Example #1
    def __init__(self, config_loc, config_override_loc, emailer,
                 num_processes=1, wait_timeout_sec=60):
        """
        :param config_loc: path of config.yaml
        :type config_loc: string

        :param config_override_loc: path of config-env-dev.yaml
        :type config_override_loc: string

        :param emailer: emailer used to send job result notifications
        :type emailer: object exposing a ``mail_result`` method

        :param num_processes: number of worker processes to use for SQS requests
        :type num_processes: int

        :param wait_timeout_sec: timeout passed to the condition variable's wait
            function.  If the thread wakes up on timeout, it performs some
            maintenance work.
        :type wait_timeout_sec: int

        """
        self._config_loc = config_loc
        self._config_override_loc = config_override_loc
        self._stop_requested = False
        self._run_once = False
        self.max_error_retries = staticconf.read_int('max_error_retries')
        self.etl_helper = ETLStatusHelper()
        self.jobs_db = TableConnection.get_connection('ScheduledJobs')
        self.runs_db = TableConnection.get_connection('ETLRecords')
        self._num_processes = num_processes
        self._cond = threading.Condition(threading.Lock())
        self._wait_timeout_sec = max(wait_timeout_sec, 60)
        self.emailer = emailer
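
The constructor clamps the wait timeout to a minimum of 60 seconds via max(wait_timeout_sec, 60). A minimal stand-alone illustration of that clamp (not from the source):

# Illustration only: the same clamp the constructor applies to wait_timeout_sec.
for requested_sec in (10, 60, 300):
    effective_sec = max(requested_sec, 60)
    print("requested={0}s -> effective={1}s".format(requested_sec, effective_sec))
# requested=10s -> effective=60s
# requested=60s -> effective=60s
# requested=300s -> effective=300s
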
Example #2
    def __init__(self,
                 config_loc,
                 config_override_loc,
                 emailer,
                 num_processes=1,
                 wait_timeout_sec=60):
        """
        :param config_loc: path of config.yaml
        :type config_loc: string

        :param config_override_loc: path of config-env-dev.yaml
        :type config_override_loc: string

        :param emailer: emailer used to send job result notifications
        :type emailer: object exposing a ``mail_result`` method

        :param num_processes: number of worker processes to use for SQS requests
        :type num_processes: int

        :param wait_timeout_sec: timeout passed to the condition variable's wait
            function.  If the thread wakes up on timeout, it performs some
            maintenance work.
        :type wait_timeout_sec: int

        """
        self._config_loc = config_loc
        self._config_override_loc = config_override_loc
        self._stop_requested = False
        self._run_once = False
        self.max_error_retries = staticconf.read_int('max_error_retries')
        self.etl_helper = ETLStatusHelper()
        self.jobs_db = TableConnection.get_connection('ScheduledJobs')
        self.runs_db = TableConnection.get_connection('ETLRecords')
        self._num_processes = num_processes
        self._cond = threading.Condition(threading.Lock())
        self._wait_timeout_sec = max(wait_timeout_sec, 60)
        self.emailer = emailer
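
Because __init__ calls staticconf.read_int('max_error_retries'), the YAML configuration must already be loaded into staticconf before the worker is constructed. A hedged sketch of that precondition, assuming staticconf's YamlConfiguration loader and the config paths named in the docstring:

import staticconf

# Assumption: config.yaml defines max_error_retries; the env override file is optional.
staticconf.YamlConfiguration('config.yaml')
staticconf.YamlConfiguration('config-env-dev.yaml', optional=True)
print(staticconf.read_int('max_error_retries'))
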
Example #3
class BaseMycroftWorker(object):

    """ Worker hosted on mycroft AWS instances. Picks up work messages from
    SQS and runs the appropriate ET or Load scripts for the parameters
    (log name, schema, date, etc.) mentioned in these messages.
    Subclasses must implement the
    :func:`mycroft.backend.worker.base_worker.BaseMycroftWorker.create_worker_job`
    method.
    """

    def __init__(self, config_loc, config_override_loc, emailer,
                 num_processes=1, wait_timeout_sec=60):
        """
        :param config_loc: path of config.yaml
        :type config_loc: string

        :param config_override_loc: path of config-env-dev.yaml
        :type config_override_loc: string

        :param emailer: emailer used to send job result notifications
        :type emailer: object exposing a ``mail_result`` method

        :param num_processes: number of worker processes to use for SQS requests
        :type num_processes: int

        :param wait_timeout_sec: timeout passed to the condition variable's wait
            function.  If the thread wakes up on timeout, it performs some
            maintenance work.
        :type wait_timeout_sec: int

        """
        self._config_loc = config_loc
        self._config_override_loc = config_override_loc
        self._stop_requested = False
        self._run_once = False
        self.max_error_retries = staticconf.read_int('max_error_retries')
        self.etl_helper = ETLStatusHelper()
        self.jobs_db = TableConnection.get_connection('ScheduledJobs')
        self.runs_db = TableConnection.get_connection('ETLRecords')
        self._num_processes = num_processes
        self._cond = threading.Condition(threading.Lock())
        self._wait_timeout_sec = max(wait_timeout_sec, 60)
        self.emailer = emailer

    def stop(self):
        """ Stop a running worker.
        """
        self._stop_requested = True

    def _get_sqs_wrapper(self, queue_name, class_type):
        return SQSWrapper(queue_name, class_type)

    def run(self):
        """Main entry point for the worker. Queries an SQS queue for messages
        and performs the appropriate action on each message received.
        Swallows all exceptions and logs them.
        """
        queue_name = str(self._get_queue_name())
        sqs = self._get_sqs_wrapper(queue_name, JSONMessage)

        scanner_queue_name = str(self._get_scanner_queue_name())
        scanner_sqs = self._get_sqs_wrapper(scanner_queue_name, JSONMessage)
        dummy_message = {"message": "dummy"}  # TODO: make this message meaningful

        while not self._stop_requested:  # Loop until a stop is requested.
            try:  # Main try-except
                for msg in sqs.get_messages_from_queue():
                    msg_body = msg.get_body()
                    log({
                        "status": "new message",
                        "queue": queue_name,
                        "msg": msg_body,
                    })

                    results = None
                    final_status = JOBS_ETL_STATUS_ERROR
                    lsd = None
                    try:
                        self._update_scheduled_jobs_on_etl_start(msg_body)
                        # safe to delete message. if worker dies, scanner will resubmit
                        sqs.delete_message_from_queue(msg)

                        try:
                            # Execute etl
                            results, action_dict = self._process_msg(msg)

                            # Parse results
                            final_status, lsd, extra_info = \
                                parse_results(results, msg_body['end_date'])
                            if final_status != JOBS_ETL_STATUS_COMPLETE:
                                if action_dict['delete_requested']:
                                    final_status = JOBS_ETL_STATUS_DELETED
                                elif action_dict['cancel_requested']:
                                    final_status = JOBS_ETL_STATUS_CANCELLED
                                elif action_dict['pause_requested']:
                                    final_status = JOBS_ETL_STATUS_PAUSED

                            log({
                                "status": "processed message OK",
                                "queue": queue_name,
                                "msg": msg_body,
                                "results": results,
                                "job status": final_status,
                                "last OK date": lsd,
                            })
                        except Exception:
                            final_status = JOBS_ETL_STATUS_ERROR
                            log_exception(
                                "Exception in processing msg from queue: " +
                                queue_name + " msg body:" + str(msg_body)
                            )
                        if final_status != JOBS_ETL_STATUS_DELETED:
                            self._update_scheduled_jobs_on_etl_complete(
                                msg_body, final_status, lsd
                            )
                        scanner_sqs.write_message_to_queue(dummy_message)
                        try:
                            self.emailer.mail_result(
                                final_status, msg_body, additional_info=extra_info
                            )
                            log(
                                "Sent emails to:" + str(msg_body['contact_emails'])
                            )
                        except Exception:
                            log_exception(
                                "Exception in sending emails of job:" +
                                str(msg_body)
                            )
                    except Exception:
                        log_exception(
                            "Failed to update scheduled jobs on etl"
                            " start/complete, msg body: " + str(msg_body)
                        )
            except Exception:  # end of main try-except
                log_exception(
                    "Exception in fetching messages from queue:"
                    + queue_name
                )
                # if sqs queue fails, throttle retry
                time.sleep(sqs.get_wait_time())
            if self._run_once:
                break

        self._stop_requested = False

    def _update_scheduled_jobs_on_etl_start(self, msg_dict):
        """ Update scheduled jobs status from scheduled to running
        """
        new_kwargs = {"et_status": JOBS_ETL_STATUS_RUNNING}

        job = self._get_scheduled_job(msg_dict['hash_key'])
        job_status = job.get(**new_kwargs)['et_status']
        if job_status == JOBS_ETL_STATUS_SCHEDULED:
            self._update_scheduled_jobs(msg_dict['hash_key'], new_kwargs)
            job_status = new_kwargs['et_status']
        else:
            raise ValueError(
                "Unexpected job status {0} for job {1}".format(job_status, job.__dict__)
            )
        return job_status

    def _update_scheduled_jobs_on_etl_complete(
            self, msg_dict, final_status, lsd):
        """ Updates the scheduled jobs table with the result of this etl -
        both *_status and *_last_successful_date are updated - these are in turn
        used by the scanner to schedule the next run.
        """
        now = datetime.utcnow()

        new_kwargs = {
            'et_status': final_status,
            'et_status_last_updated_at': str(now),
        }
        if lsd is not None:
            new_kwargs['et_last_successful_date'] = lsd

        num_retries_field = 'et_num_error_retries'
        next_retry_field = 'et_next_error_retry_attempt'

        if final_status == JOBS_ETL_STATUS_ERROR:
            job = self.jobs_db.get(hash_key=msg_dict['hash_key'])
            args = {num_retries_field: -1}
            fields = job.get(**args)
            num_retries = int(fields[num_retries_field]) + 1

            if num_retries < self.max_error_retries:
                # exponential backoff
                next_retry = timedelta(seconds=(300 * (2 ** num_retries))) + now
                new_kwargs[next_retry_field] = str(next_retry)
            new_kwargs[num_retries_field] = num_retries
        elif final_status == JOBS_ETL_STATUS_SUCCESS:
            # If not reset, perpetual jobs will eventually reach max retries
            # and auto-retry will stop working.  Delete the fields by setting them to None.
            new_kwargs[num_retries_field] = None
            new_kwargs[next_retry_field] = None

        self._update_scheduled_jobs(msg_dict['hash_key'], new_kwargs)

    def _get_scheduled_job(self, hash_key):
        job = self.jobs_db.get(hash_key=hash_key)
        if job is None:
            raise ValueError(
                "Could not find job entry for hash key {0}".format(hash_key)
            )
        return job

    def _update_scheduled_jobs(self, hash_key, new_kwargs):
        job = self._get_scheduled_job(hash_key)
        ret = job.update(**new_kwargs)
        if not ret:
            raise ValueError(
                "Could not update scheduled jobs entry for etl start/finish,"
                " hash key:" + hash_key + " return value: " + str(ret)
            )

    def _run_complete_callback(self, job, run_id, step, results):
        """ Callback invoked when a single run is done

        :param job: current job
        :type: WorkerJob
        :param run_id: a unique run identifier
        :type: string
        :param step: step associated with the run execution
        :type: string
        :param results: run results
        :type: list
        """
        self._cond.acquire()
        try:
            if len(results) != 1:
                raise ValueError("len(results) != 1, {0}".format(results))
            log("done: {0}, {1}, {2}".format(run_id, step, results[0]['status']))

            job.all_results[run_id].extend(results)
            job.runs_done += 1
            job.runs_in_flight -= 1

            if job.runs_in_flight < 0:
                raise ValueError("runs_in_flight < 0 \
    ({0} < 0)".format(job.runs_in_flight))

            if job.is_waiting is True:
                self._cond.notify(n=1)

            self.etl_helper.etl_step_complete(job.msg_dict, run_id, step, results[0])
            job.run_complete(run_id, step, results)
        except:
            # if callback dies, et_pool stops working
            log_exception('_run_complete_callback')
        finally:
            self._cond.release()

    def _create_run_complete_callback(self, job, run_id, step):
        return lambda results: self._run_complete_callback(
            job, run_id, step, results
        )

    def _schedule_runs_lk(self, et_pool, job):
        """ Schedule runs to execute up to max possible parallelism
        suffix '_lk' means caller must already hold lock.

        :param et_pool: A multiprocessor pool handle
        :type: Pool
        :param job: current job
        :type: WorkerJob
        """
        while (self._has_more_runs_to_schedule(job) and
               job.runs_in_flight < job.max_runs_in_flight):
            run = job.schedule_next_run()
            if run.id is None:
                raise ValueError("Unexpected end of runs")

            self.etl_helper.etl_step_started(job.msg_dict, run.id, run.step)

            log('scheduled: {0}'.format(run.id))
            et_pool.apply_async(
                run.func,
                args=run.args,
                callback=self._create_run_complete_callback(job, run.id, run.step),
            )
            job.runs_in_flight += 1

    def _handle_job_delete(self, job):
        """ Delete job. Start by deleting runs first.  If we die during deletion
        we can resume deletion until entry is deleted from jobs table
        """
        jobid = job.msg_dict['uuid']

        result = self.runs_db.delete_job_runs(jobid)
        if not result:
            raise Exception("failed to delete all runs for job {0}".format(jobid))

        result = self.jobs_db.delete(hash_key=job.msg_dict['hash_key'])
        if not result:
            raise Exception("failed to delete job {0}".format(jobid))

    def _handle_cancel_request_lk(self, job, et_pool):
        """ Cancel currently executing job.  Send SIGINT to all children,
        ignore SIGINT in parent to prevent killing it.

        Note: suffix '_lk' means caller must already hold lock.

        :param job: current job
        """
        log("cancel detected, terminating")
        if job.runs_in_flight != 0:
            log("sending cancel request")
            et_pool.cancel()

    def _has_more_runs_to_schedule(self, job):
        if job.has_incomplete_runs():
            return True

        return not job.pause_in_progress and job.has_more_runs_to_schedule()

    def create_worker_job(self, job_request):
        """ A factory method to create custom jobs
        :param job_request: job request message
        :type: json dict
        :returns: a custom job object
        :rtype:  WorkerJob
        """
        raise NotImplementedError

    def _process_msg(self, etl_msg):
        """ Process the given SQS message and run an appropriate step (ET/L).

        :param etl_msg: instance of :class:`boto.sqs.jsonmessage.JSONMessage`
        """
        msg_dict = etl_msg.get_body()

        job = self.create_worker_job(msg_dict)
        et_pool = PoolExtended(processes=self._num_processes)
        self._cond.acquire()
        try:
            while True:
                job.update_action_requests()

                if job.actions['delete_requested'] is True:
                    self._handle_job_delete(job)
                    break

                if not job.cancel_in_progress and job.actions['cancel_requested'] is True:
                    self._handle_cancel_request_lk(job, et_pool)
                    job.cancel_in_progress = True

                if not job.pause_in_progress and job.actions['pause_requested'] is True:
                    # pause the job by waiting for the completion of ongoing run(s) and refusing
                    # any new run.
                    job.pause_in_progress = True

                if job.is_done():
                    break

                if not job.cancel_in_progress:
                    self._schedule_runs_lk(et_pool, job)
                job.is_waiting = True
                self._cond.wait(self._wait_timeout_sec)
                job.update_keepalive()
                job.is_waiting = False
        finally:
            self._cond.release()
            et_pool.close()
            et_pool.join()
        return job.all_results, job.actions

    def _get_queue_name(self):
        """ Get the name of SQS queue to use for this worker.
        """
        raise NotImplementedError

    def _get_scanner_queue_name(self):
        """ Get the name of feedback scanner SQS queue to use for this worker.
        """
        raise NotImplementedError
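
The abstract hooks in this class are create_worker_job, _get_queue_name, and _get_scanner_queue_name (each raises NotImplementedError). A minimal subclass sketch, not taken from the source; ETWorkerJob and the config keys below are hypothetical placeholders:

class ETWorker(BaseMycroftWorker):

    def create_worker_job(self, job_request):
        # Hypothetical WorkerJob subclass built from the SQS job request.
        return ETWorkerJob(job_request)

    def _get_queue_name(self):
        # Hypothetical staticconf key holding this worker's SQS queue name.
        return staticconf.read_string('sqs.et_queue_name')

    def _get_scanner_queue_name(self):
        # Hypothetical staticconf key for the feedback scanner queue.
        return staticconf.read_string('sqs.et_scanner_queue_name')
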
Example #4
class BaseMycroftWorker(object):
    """ Worker hosted on mycroft AWS instances. Picks up work messages from
    SQS and runs the appropriate ET or Load scripts for the parameters
    (log name, schema, date, etc.) mentioned in these messages.
    Subclasses must implement the
    :func:`mycroft.backend.worker.base_worker.BaseMycroftWorker.create_worker_job`
    method.
    """
    def __init__(self,
                 config_loc,
                 config_override_loc,
                 emailer,
                 num_processes=1,
                 wait_timeout_sec=60):
        """
        :param config_loc: path of config.yaml
        :type config_loc: string

        :param config_override_loc: path of config-env-dev.yaml
        :type config_override_loc: string

        :param emailer: emailer used to send job result notifications
        :type emailer: object exposing a ``mail_result`` method

        :param num_processes: number of worker processes to use for SQS requests
        :type num_processes: int

        :param wait_timeout_sec: timeout passed to the condition variable's wait
            function.  If the thread wakes up on timeout, it performs some
            maintenance work.
        :type wait_timeout_sec: int

        """
        self._config_loc = config_loc
        self._config_override_loc = config_override_loc
        self._stop_requested = False
        self._run_once = False
        self.max_error_retries = staticconf.read_int('max_error_retries')
        self.etl_helper = ETLStatusHelper()
        self.jobs_db = TableConnection.get_connection('ScheduledJobs')
        self.runs_db = TableConnection.get_connection('ETLRecords')
        self._num_processes = num_processes
        self._cond = threading.Condition(threading.Lock())
        self._wait_timeout_sec = max(wait_timeout_sec, 60)
        self.emailer = emailer

    def stop(self):
        """ Stop a running worker.
        """
        self._stop_requested = True

    def _get_sqs_wrapper(self, queue_name, class_type):
        return SQSWrapper(queue_name, class_type)

    def run(self):
        """Main entry point for the worker. Queries an SQS queue for messages
        and performs the appropriate action on each message received.
        Swallows all exceptions and logs them.
        """
        queue_name = str(self._get_queue_name())
        sqs = self._get_sqs_wrapper(queue_name, JSONMessage)

        scanner_queue_name = str(self._get_scanner_queue_name())
        scanner_sqs = self._get_sqs_wrapper(scanner_queue_name, JSONMessage)
        dummy_message = {
            "message": "dummy"
        }  # TODO: make this message meaningful

        while not self._stop_requested:  # Loop until a stop is requested.
            try:  # Main try-except
                for msg in sqs.get_messages_from_queue():
                    msg_body = msg.get_body()
                    log({
                        "status": "new message",
                        "queue": queue_name,
                        "msg": msg_body,
                    })

                    results = None
                    final_status = JOBS_ETL_STATUS_ERROR
                    lsd = None
                    try:
                        self._update_scheduled_jobs_on_etl_start(msg_body)
                        # safe to delete message. if worker dies, scanner will resubmit
                        sqs.delete_message_from_queue(msg)

                        try:
                            # Execute etl
                            results, action_dict = self._process_msg(msg)

                            # Parse results
                            final_status, lsd, extra_info = \
                                parse_results(results, msg_body['end_date'])
                            if final_status != JOBS_ETL_STATUS_COMPLETE:
                                if action_dict['delete_requested']:
                                    final_status = JOBS_ETL_STATUS_DELETED
                                elif action_dict['cancel_requested']:
                                    final_status = JOBS_ETL_STATUS_CANCELLED
                                elif action_dict['pause_requested']:
                                    final_status = JOBS_ETL_STATUS_PAUSED

                            log({
                                "status": "processed message OK",
                                "queue": queue_name,
                                "msg": msg_body,
                                "results": results,
                                "job status": final_status,
                                "last OK date": lsd,
                            })
                        except Exception:
                            final_status = JOBS_ETL_STATUS_ERROR
                            log_exception(
                                "Exception in processing msg from queue: " +
                                queue_name + " msg body:" + str(msg_body))
                        if final_status != JOBS_ETL_STATUS_DELETED:
                            self._update_scheduled_jobs_on_etl_complete(
                                msg_body, final_status, lsd)
                        scanner_sqs.write_message_to_queue(dummy_message)
                        try:
                            self.emailer.mail_result(
                                final_status,
                                msg_body,
                                additional_info=extra_info)
                            log("Sent emails to:" +
                                str(msg_body['contact_emails']))
                        except Exception:
                            log_exception(
                                "Exception in sending emails of job:" +
                                str(msg_body))
                    except Exception:
                        log_exception("Failed to update scheduled jobs on etl"
                                      " start/complete, msg body: " +
                                      str(msg_body))
            except Exception:  # end of main try-except
                log_exception("Exception in fetching messages from queue:" +
                              queue_name)
                # if sqs queue fails, throttle retry
                time.sleep(sqs.get_wait_time())
            if self._run_once:
                break

        self._stop_requested = False

    def _update_scheduled_jobs_on_etl_start(self, msg_dict):
        """ Update scheduled jobs status from scheduled to running
        """
        new_kwargs = {"et_status": JOBS_ETL_STATUS_RUNNING}

        job = self._get_scheduled_job(msg_dict['hash_key'])
        job_status = job.get(**new_kwargs)['et_status']
        if job_status == JOBS_ETL_STATUS_SCHEDULED:
            self._update_scheduled_jobs(msg_dict['hash_key'], new_kwargs)
            job_status = new_kwargs['et_status']
        else:
            raise ValueError("Unexpected job status {0} for job {1}".format(
                job_status, job.__dict__))
        return job_status

    def _update_scheduled_jobs_on_etl_complete(self, msg_dict, final_status,
                                               lsd):
        """ Updates the scheduled jobs table with the result of this etl -
        both *_status and *_last_successful_date are updated - these are in turn
        used by the scanner to schedule the next run.
        """
        now = datetime.utcnow()

        new_kwargs = {
            'et_status': final_status,
            'et_status_last_updated_at': str(now),
        }
        if lsd is not None:
            new_kwargs['et_last_successful_date'] = lsd

        num_retries_field = 'et_num_error_retries'
        next_retry_field = 'et_next_error_retry_attempt'

        if final_status == JOBS_ETL_STATUS_ERROR:
            job = self.jobs_db.get(hash_key=msg_dict['hash_key'])
            args = {num_retries_field: -1}
            fields = job.get(**args)
            num_retries = int(fields[num_retries_field]) + 1

            if num_retries < self.max_error_retries:
                # exponential backoff
                next_retry = timedelta(seconds=(300 * (2**num_retries))) + now
                new_kwargs[next_retry_field] = str(next_retry)
            new_kwargs[num_retries_field] = num_retries
        elif final_status == JOBS_ETL_STATUS_SUCCESS:
            # If not reset, perpetual jobs will eventually reach max retries
            # and auto-retry will stop working.  Delete the fields by setting them to None.
            new_kwargs[num_retries_field] = None
            new_kwargs[next_retry_field] = None

        self._update_scheduled_jobs(msg_dict['hash_key'], new_kwargs)

    def _get_scheduled_job(self, hash_key):
        job = self.jobs_db.get(hash_key=hash_key)
        if job is None:
            raise ValueError(
                "Could not find job entry for hash key {0}".format(hash_key))
        return job

    def _update_scheduled_jobs(self, hash_key, new_kwargs):
        job = self._get_scheduled_job(hash_key)
        ret = job.update(**new_kwargs)
        if not ret:
            raise ValueError(
                "Could not update scheduled jobs entry for etl start/finish,"
                " hash key:" + hash_key + " return value: " + str(ret))

    def _run_complete_callback(self, job, run_id, step, results):
        """ Callback invoked when a single run is done

        :param job: current job
        :type: WorkerJob
        :param run_id: a unique run identifier
        :type: string
        :param step: step associated with the run execution
        :type: string
        :param results: run results
        :type: list
        """
        self._cond.acquire()
        try:
            if len(results) != 1:
                raise ValueError("len(results) != 1, {0}".format(results))
            log("done: {0}, {1}, {2}".format(run_id, step,
                                             results[0]['status']))

            job.all_results[run_id].extend(results)
            job.runs_done += 1
            job.runs_in_flight -= 1

            if job.runs_in_flight < 0:
                raise ValueError("runs_in_flight < 0 \
    ({0} < 0)".format(job.runs_in_flight))

            if job.is_waiting is True:
                self._cond.notify(n=1)

            self.etl_helper.etl_step_complete(job.msg_dict, run_id, step,
                                              results[0])
            job.run_complete(run_id, step, results)
        except:
            # if callback dies, et_pool stops working
            log_exception('_run_complete_callback')
        finally:
            self._cond.release()

    def _create_run_complete_callback(self, job, run_id, step):
        return lambda results: self._run_complete_callback(
            job, run_id, step, results)

    def _schedule_runs_lk(self, et_pool, job):
        """ Schedule runs to execute up to max possible parallelism
        suffix '_lk' means caller must already hold lock.

        :param et_pool: A multiprocessor pool handle
        :type: Pool
        :param job: current job
        :type: WorkerJob
        """
        while (self._has_more_runs_to_schedule(job)
               and job.runs_in_flight < job.max_runs_in_flight):
            run = job.schedule_next_run()
            if run.id is None:
                raise ValueError("Unexpected end of runs")

            self.etl_helper.etl_step_started(job.msg_dict, run.id, run.step)

            log('scheduled: {0}'.format(run.id))
            et_pool.apply_async(
                run.func,
                args=run.args,
                callback=self._create_run_complete_callback(
                    job, run.id, run.step),
            )
            job.runs_in_flight += 1

    def _handle_job_delete(self, job):
        """ Delete job. Start by deleting runs first.  If we die during deletion
        we can resume deletion until entry is deleted from jobs table
        """
        jobid = job.msg_dict['uuid']

        result = self.runs_db.delete_job_runs(jobid)
        if not result:
            raise Exception(
                "failed to delete all runs for job {0}".format(jobid))

        result = self.jobs_db.delete(hash_key=job.msg_dict['hash_key'])
        if not result:
            raise Exception("failed to delete job {0}".format(jobid))

    def _handle_cancel_request_lk(self, job, et_pool):
        """ Cancel currently executing job.  Send SIGINT to all children,
        ignore SIGINT in parent to prevent killing it.

        Note: suffix '_lk' means caller must already hold lock.

        :param job: current job
        """
        log("cancel detected, terminating")
        if job.runs_in_flight != 0:
            log("sending cancel request")
            et_pool.cancel()

    def _has_more_runs_to_schedule(self, job):
        if job.has_incomplete_runs():
            return True

        return not job.pause_in_progress and job.has_more_runs_to_schedule()

    def create_worker_job(self, job_request):
        """ A factory method to create custom jobs
        :param job_request: job request message
        :type: json dict
        :returns: a custom job object
        :rtype:  WorkerJob
        """
        raise NotImplementedError

    def _process_msg(self, etl_msg):
        """ Process the given SQS message and run an appropriate step (ET/L).

        :param etl_msg: instance of :class:`boto.sqs.jsonmessage.JSONMessage`
        """
        msg_dict = etl_msg.get_body()

        job = self.create_worker_job(msg_dict)
        et_pool = PoolExtended(processes=self._num_processes)
        self._cond.acquire()
        try:
            while True:
                job.update_action_requests()

                if job.actions['delete_requested'] is True:
                    self._handle_job_delete(job)
                    break

                if not job.cancel_in_progress and job.actions[
                        'cancel_requested'] is True:
                    self._handle_cancel_request_lk(job, et_pool)
                    job.cancel_in_progress = True

                if not job.pause_in_progress and job.actions[
                        'pause_requested'] is True:
                    # pause the job by waiting for the completion of ongoing run(s) and refusing
                    # any new run.
                    job.pause_in_progress = True

                if job.is_done():
                    break

                if not job.cancel_in_progress:
                    self._schedule_runs_lk(et_pool, job)
                job.is_waiting = True
                self._cond.wait(self._wait_timeout_sec)
                job.update_keepalive()
                job.is_waiting = False
        finally:
            self._cond.release()
            et_pool.close()
            et_pool.join()
        return job.all_results, job.actions

    def _get_queue_name(self):
        """ Get the name of SQS queue to use for this worker.
        """
        raise NotImplementedError

    def _get_scanner_queue_name(self):
        """ Get the name of feedback scanner SQS queue to use for this worker.
        """
        raise NotImplementedError
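
Because run() blocks in its polling loop until stop() flips _stop_requested, a typical arrangement is to run the worker on a dedicated thread. A sketch under that assumption, where worker is an instance of a concrete subclass:

import threading

worker_thread = threading.Thread(target=worker.run)
worker_thread.start()
# ... later, e.g. on shutdown:
worker.stop()          # run() re-checks _stop_requested between queue polls
worker_thread.join()
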
Example #5
    def get_etl_helper(self, etl_records):
        # Patch TableConnection.get_connection so that ETLStatusHelper writes
        # to the fake etl_records table instead of a real DynamoDB table.
        with mock.patch(
            'mycroft.models.aws_connections.TableConnection.get_connection'
        ) as mocked_etl:
            mocked_etl.return_value = etl_records
            yield ETLStatusHelper()
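
A hypothetical usage of this helper, assuming it is wrapped as a context manager (e.g. with contextlib.contextmanager or a pytest yield fixture) and that etl_records, msg_dict, run_id, and step are test placeholders:

with self.get_etl_helper(etl_records) as etl_helper:
    # Inside the block, every TableConnection.get_connection(...) call returns
    # the fake etl_records table, so the helper records status there instead
    # of hitting DynamoDB.
    etl_helper.etl_step_started(msg_dict, run_id, step)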