def put_task(self, task: Task):
    """Put one task on the queue."""
    self.producer.produce(self.topic, task.to_json(), on_delivery=self._delivery_callback)
    # Ensure the producer's local queue is not overloaded, see this issue for details:
    # https://github.com/confluentinc/confluent-kafka-python/issues/16
    self.producer.poll(0)
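
# --- Example (hedged): a minimal sketch of how the producer side above might be
# wired up with confluent_kafka. The class name "TaskQueueProducer", the broker
# config, and the topic default are illustrative assumptions, not the actual setup.
import logging

from confluent_kafka import Producer

logger = logging.getLogger(__name__)


class TaskQueueProducer:
    def __init__(self, brokers: str = "localhost:9092", topic: str = "tasks"):
        self.topic = topic
        self.producer = Producer({"bootstrap.servers": brokers})

    def _delivery_callback(self, err, msg):
        # Invoked from poll()/flush() once per produced message; err is None on success
        if err is not None:
            logger.error(f"Delivery failed for task: {err}")

    # put_task() as defined above completes this class.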
def perform_prediction_step(self):
    stats_projects_ok = {}
    stats_models_ok = {}
    stats_projects_bad = {}
    stats_models_bad = {}
    # TODO: Use clock once it is finished
    stats_start_time = datetime.datetime.now()
    prediction_start_time = datetime.datetime.now() - self.continuous_prediction_delay
    prediction_end_time = prediction_start_time + self.continuous_prediction_interval
    for model in self.models:
        project_name = model.project_name
        if not project_name:
            logger.warning("No project name found for model, skipping model")
            continue
        model_name = model.model_name
        if not model_name:
            logger.warning(f"No model name found for model in project {project_name}, skipping model")
            continue
        task = Task(project_name=project_name, model_name=model_name, from_time=prediction_start_time, to_time=prediction_end_time)
        try:
            self.task_queue.put_task(task)
            self.task_serial += 1
            # logger.info(f"Enqueued '{model_name}' in '{project_name}'")
            stats_projects_ok[project_name] = stats_projects_ok.get(project_name, 0) + 1
            stats_models_ok[model_name] = stats_models_ok.get(model_name, 0) + 1
        except Exception as e:
            # Collect failures instead of re-raising, so the summary below is reached
            logger.error(f"Could not send task for '{model_name}' in '{project_name}': {e}")
            traceback.print_exc()
            stats_projects_bad.setdefault(project_name, []).append(str(e))
            stats_models_bad.setdefault(model_name, []).append(str(e))
    stats_interval = datetime.datetime.now() - stats_start_time
    logger.info(f"Scheduled {len(stats_models_ok)} models in {len(stats_projects_ok)} projects in {human_delta(stats_interval)}")
    if stats_models_bad or stats_projects_bad:
        logger.error(f"  {len(stats_models_bad)} models in {len(stats_projects_bad)} projects failed")
        for name, errors in stats_models_bad.items():
            logger.error(f"  + {name} ({', '.join(errors)})")
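
# --- Example (hedged): human_delta() is called above but not shown here. A
# minimal sketch of what such a helper might look like, assuming it renders a
# datetime.timedelta as a short human-readable string:
import datetime


def human_delta(delta: datetime.timedelta) -> str:
    seconds = int(delta.total_seconds())
    hours, rem = divmod(seconds, 3600)
    minutes, secs = divmod(rem, 60)
    if hours:
        return f"{hours}h{minutes}m{secs}s"
    if minutes:
        return f"{minutes}m{secs}s"
    return f"{secs}s"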
def deserialize_task(task_bytes, mode="json") -> typing.Optional[Task]: """ Deserialize a task from bytes """ task = None if mode == "pickle": try: task = pickle.loads(task_bytes) except pickle.UnpicklingError as e: logger.error(f"Could not deserialize task from pickle of size {len(task_bytes)}bytes: {e}") traceback.print_exc() else: try: # Rely on dataclass_json task = Task.from_json(task_bytes) except Exception as e: logger.error(f"Could not deserialize task from json of size {len(task_bytes)}bytes: '{task_bytes}', error:'{e}'") traceback.print_exc() return task
def perform_prediction_step(self):
    stats_projects_ok = set()
    stats_models_ok = set()
    # Use UTC because the queue ignores time zone information.
    # The Gordo client also ignores microseconds, so round them to 0.
    stats_start_time_utc = datetime.datetime.now(utc)
    prediction_start_time_utc = stats_start_time_utc.replace(microsecond=0) - self.continuous_prediction_delay
    prediction_end_time_utc = prediction_start_time_utc + self.continuous_prediction_interval
    # Fetch projects from the API
    projects = self.models_metadata_info_provider.get_projects()
    if not projects:
        raise ValueError("No projects were fetched for scheduling predictions.")
    model_names_by_project = self.model_info_provider.get_all_model_names_by_project(projects=projects)
    for project_name, models in model_names_by_project.items():
        for model_name in models:
            task = Task(
                project_name=project_name,
                model_name=model_name,
                from_time=prediction_start_time_utc,
                to_time=prediction_end_time_utc,
            )
            with pylogctx.context(task=task):
                self.task_queue.put_task(task)
                stats_projects_ok.add(project_name)
                stats_models_ok.add(model_name)
    logger.info(f"Scheduled {len(stats_models_ok)} models over {len(stats_projects_ok)} projects.")
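
# --- Example (hedged): a sketch of the provider interfaces the step above
# depends on, inferred purely from the call sites; the actual classes and
# return types in the real codebase may differ.
import typing


class ModelsMetadataInfoProvider(typing.Protocol):
    def get_projects(self) -> typing.List[str]:
        """Return the list of project names to schedule predictions for."""
        ...


class ModelInfoProvider(typing.Protocol):
    def get_all_model_names_by_project(
        self, projects: typing.List[str]
    ) -> typing.Dict[str, typing.List[str]]:
        """Map each project name to the model names it contains."""
        ...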
def get_task(self) -> Task:
    return Task("null")
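
# --- Example (hedged): get_task() above is a placeholder. A minimal sketch of
# a consumer-side counterpart, assuming confluent_kafka and the
# deserialize_task() helper shown earlier; the group id, offset policy, and
# poll timeout are illustrative assumptions.
import typing

from confluent_kafka import Consumer


def make_get_task(brokers: str, topic: str):
    consumer = Consumer({
        "bootstrap.servers": brokers,
        "group.id": "task-workers",
        "auto.offset.reset": "earliest",
    })
    consumer.subscribe([topic])

    def get_task() -> typing.Optional[Task]:
        # Returns None on timeout or transport error; callers should retry
        msg = consumer.poll(timeout=1.0)
        if msg is None or msg.error():
            return None
        return deserialize_task(msg.value())

    return get_task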