def get_expired_tasks_for_labourer(self, labourer: Labourer) -> List[Dict]:
    """
    Fetch tasks of the Labourer that were previously invoked and have already expired
    without being closed (no `completed_at` attribute).
    """

    field = self.get_db_field_name

    # Tasks whose greenfield lies between `start` and `expired` of the Labourer.
    query_keys = {
        field('labourer_id'):                labourer.id,
        f"st_between_{field('greenfield')}": labourer.get_attr('start'),
        f"en_between_{field('greenfield')}": labourer.get_attr('expired'),
    }

    return self.dynamo_db_client.get_by_query(
            keys=query_keys,
            index_name=self.config['dynamo_db_config']['index_greenfield'],
            filter_expression=f"attribute_not_exists {field('completed_at')}")
def get_running_tasks_for_labourer(self, labourer: Labourer, count: bool = False) -> Union[List[Dict], int]:
    """
    Fetch tasks of the Labourer that were invoked but are neither closed nor expired yet.
    We assume they are still running.

    If `count` is specified as True will return just the number of tasks, not the items themselves.
    Much cheaper.
    """

    field = self.get_db_field_name

    query_args = {
        'keys':              {
            field('labourer_id'):                labourer.id,
            f"st_between_{field('greenfield')}": labourer.get_attr('expired'),
            f"en_between_{field('greenfield')}": labourer.get_attr('invoked'),
        },
        'index_name':        self.config['dynamo_db_config']['index_greenfield'],
        'filter_expression': f'attribute_not_exists {field("completed_at")}',
    }

    if count:
        query_args['return_count'] = True

    return self.dynamo_db_client.get_by_query(**query_args)
def get_desired_invocation_number_for_labourer(self, labourer: Labourer) -> int:
    """
    Decides the desired maximum number of simultaneous invocations for a specific Labourer.
    The decision is based on the ecology status of the Labourer and the configs.

    :return: Number of invocations
    """

    status = self.task_client.ecology_client.get_labourer_status(labourer=labourer)

    # Pick the coefficient configured for the current ecology status.
    # NOTE(review): raises StopIteration if the status is absent from config — assumed always present.
    coefficient = next(v for k, v in self.config['invocation_number_coefficient'].items() if k == status)

    # Labourer-specific cap takes precedence over the global config one.
    labourer_max = labourer.get_attr('max_simultaneous_invocations')
    if labourer_max is None:
        labourer_max = self.config['max_simultaneous_invocations']

    desired = int(math.floor(labourer_max * coefficient))
    running = self.task_client.ecology_client.count_running_tasks_for_labourer(labourer)

    logger.info(f"Labourer: {labourer.id} has currently running {running} tasks and desired {desired} "
                f"with respect to status {status}.")

    # Never return a negative number of invocations.
    return max(desired - running, 0)
def get_invoked_tasks_for_labourer(self, labourer: Labourer, completed: Optional[bool] = None) -> List[Dict]:
    """
    Return a list of tasks of current Labourer invoked during the current run of the Orchestrator.

    If completed is provided:

    * True - filter completed ones
    * False - filter NOT completed ones
    * None (default) - do not care about `completed` status.
    """

    field = self.get_db_field_name

    q = dict(
            keys={
                field('labourer_id'): labourer.id,
                field('greenfield'):  labourer.get_attr('invoked')
            },
            comparisons={field('greenfield'): '>='},
            index_name=self.config['dynamo_db_config']['index_greenfield'],
    )

    if completed is True:
        q['filter_expression'] = f"attribute_exists {field('completed_at')}"
    elif completed is False:
        q['filter_expression'] = f"attribute_not_exists {field('completed_at')}"
    else:
        logger.debug(f"No filtering by completed status for {q}")

    return self.dynamo_db_client.get_by_query(**q)
def get_next_for_labourer(self, labourer: Labourer, cnt: int = 1, only_ids: bool = False) -> List[Union[str, Dict]]:
    """
    Fetch the next task(s) from the queue for the Labourer.

    :param labourer:    Labourer to get next tasks for.
    :param cnt:         Optional number of Tasks to fetch.
    :param only_ids:    If explicitly set True, then returns only the IDs of tasks.
                        This could save some transport if you are sending big batches of tasks between Lambdas.
    """

    field = self.get_db_field_name

    # Maximum value to identify the task as available for invocation (either new, or ready for retry).
    max_greenfield = labourer.get_attr('start')

    tasks = self.dynamo_db_client.get_by_query(
            {field('labourer_id'): labourer.id, field('greenfield'): max_greenfield},
            table_name=self.config['dynamo_db_config']['table_name'],
            index_name=self.config['dynamo_db_config']['index_greenfield'],
            strict=True,
            max_items=cnt,
            comparisons={field('greenfield'): '<'})

    logger.debug(f"get_next_for_labourer() received: {tasks} from {self.config['dynamo_db_config']['table_name']} "
                 f"for labourer: {labourer.id} max greenfield: {max_greenfield}")

    if only_ids:
        return [task[field('task_id')] for task in tasks]
    return tasks
def calculate_delay_for_task_retry(self, labourer: Labourer, task: Dict) -> int:
    """
    Calculate the delay before the given task of the Labourer should be retried.
    Grows linearly with the number of attempts already made: `max_duration * attempts`.
    """

    logger.debug(f"Called Scavenger.calculate_delay_for_task_retry with labourer={labourer}, task={task}")

    attempts_made = task[self.get_db_field_name('attempts')]
    return labourer.get_attr('max_duration') * attempts_made
def get_average_labourer_duration(self, labourer: Labourer) -> int:
    """
    Analyse latest tasks of Labourer and calculate average runtime duration.

    .. warning:: This method doesn't know the exact duration of failed attempts.
                 Thus if the task is completely failed, we assume that all attempts failed at maximum duration.

    :return: Average duration in seconds.
    """

    _ = self.get_db_field_name
    _cfg = self.config.get

    durations = []

    q = dict(keys={
        _('labourer_id_task_status'): f"{labourer.id}_1",
    },
            table_name=_cfg('sosw_closed_tasks_table'),
            index_name=_cfg('sosw_closed_tasks_labourer_status_index'),
            max_items=_cfg('max_closed_to_analyse_for_duration'),
            desc=True)

    # Fetch last X closed tasks (status suffix `_1`).
    tasks = self.dynamo_db_client.get_by_query(**q)

    # Fetch failed tasks as well (status suffix `_0`).
    q['keys'][_('labourer_id_task_status')] = f"{labourer.id}_0"
    tasks.extend(self.dynamo_db_client.get_by_query(**q))

    # Now take the really last X ordered by greenfield (last invocation).
    # NOTE(review): a task missing `greenfield` would make the sort key None and raise
    # TypeError in Python 3 — assumed `greenfield` is always present. TODO confirm.
    tasks = sorted(tasks, key=lambda task: task.get(_('greenfield')))[:_cfg('max_closed_to_analyse_for_duration')]

    # Get their duration.
    for task in tasks:
        if not task.get(_('completed_at')):
            # Completely failed task: we assume every attempt ran at maximum duration.
            # FIX: the original comprehension's loop variable shadowed the `_` field-name alias;
            # list multiplication is both clearer and avoids the shadowing.
            durations.extend([labourer.get_attr('max_duration')] * int(task[_('attempts')]))
        else:
            # Duration of completed tasks we calculate based on the value of last `greenfield` and `completed_at`
            durations.append(task[_('completed_at')] - task[_('greenfield')] + _cfg('greenfield_invocation_delta'))

    # Return the average; zero if there was nothing to analyse.
    try:
        return round(sum(durations) / len(durations))
    except ZeroDivisionError:
        return 0
def get_tasks_to_retry_for_labourer(self, labourer: Labourer, limit: Optional[int] = None) -> List[Dict]:
    """
    Fetch tasks of the Labourer from the retry table that are already due:
    their `desired_launch_time` is not greater than the Labourer `start` attribute.

    :param labourer:    Labourer to fetch postponed tasks for.
    :param limit:       Optional maximum number of tasks to fetch.
                        NOTE: any falsy value (None or 0) means "no limit" — preserved from original behavior.
    :return:            List of task Dicts from the retry table.
    """

    _ = self.get_db_field_name

    attrs = {
        'keys':        {
            _('labourer_id'):         labourer.id,
            # `desired_launch_time` is stored/compared as a string in the retry table query.
            _('desired_launch_time'): str(labourer.get_attr('start'))
        },
        'comparisons': {_('desired_launch_time'): "<="},
        'table_name':  self.config['sosw_retry_tasks_table'],
        'index_name':  self.config['sosw_retry_tasks_greenfield_index'],
    }

    if limit:
        attrs['max_items'] = limit

    return self.dynamo_db_client.get_by_query(**attrs)
def should_retry_task(self, labourer: Labourer, task: Dict) -> bool:
    """
    Decide whether the task of the Labourer should be retried:
    True while the number of attempts already made is below the Labourer's `max_attempts`.

    NOTE(review): if the task lacks the `attempts` attribute, `task.get` yields None and the
    comparison raises TypeError in Python 3 — assumed `attempts` is always present. TODO confirm.
    """

    logger.debug(f"Called Scavenger.should_retry_task with labourer={labourer}, task={task}")

    attempts = task.get(self.get_db_field_name('attempts'))

    # FIX: `True if x else False` was redundant — the comparison already evaluates to a bool.
    return attempts < labourer.get_attr('max_attempts')