Ejemplo n.º 1
0
    def put_task(self, task: Task):
        """Serialize *task* to JSON and enqueue it on the Kafka topic."""
        payload = task.to_json()
        self.producer.produce(self.topic,
                              payload,
                              on_delivery=self._delivery_callback)
        # Serve delivery callbacks so the client's internal buffer drains
        # and does not overflow under load; see:
        # https://github.com/confluentinc/confluent-kafka-python/issues/16
        self.producer.poll(0)
Ejemplo n.º 2
0
 def perform_prediction_step(self):
     """Enqueue one prediction task per configured model.

     Builds a Task per model covering the configured prediction window,
     puts it on the task queue and collects per-project / per-model
     success and failure statistics, which are logged as a summary at
     the end.

     A failure to enqueue one model is recorded and reported instead of
     aborting the whole step, so the remaining models still get
     scheduled. (Previously the exception was re-raised, which made the
     failure report below unreachable.)
     """
     stats_projects_ok = {}
     stats_models_ok = {}
     stats_projects_bad = {}
     stats_models_bad = {}
     # TODO: Use clock once it is finished
     stats_start_time = datetime.datetime.now()
     prediction_start_time = datetime.datetime.now(
     ) - self.continuous_prediction_delay
     prediction_end_time = prediction_start_time + self.continuous_prediction_interval
     for model in self.models:
         project_name = model.project_name
         if not project_name:
             logger.warning(
                 "No project name found for model, skipping model")
             continue
         model_name = model.model_name
         if not model_name:
             logger.warning(
                 f"No model name found for model in project {project_name}, skipping model"
             )
             continue
         task = Task(project_name=project_name,
                     model_name=model_name,
                     from_time=prediction_start_time,
                     to_time=prediction_end_time)
         try:
             self.task_queue.put_task(task)
             self.task_serial += 1
             stats_projects_ok[project_name] = stats_projects_ok.get(
                 project_name, 0) + 1
             stats_models_ok[model_name] = stats_models_ok.get(
                 model_name, 0) + 1
         except Exception as e:
             # Record the failure and keep going so the remaining models
             # are still scheduled and the summary below is reached.
             logger.error(
                 f"Could not enqueue '{model_name}' in '{project_name}': {e}")
             stats_projects_bad[project_name] = stats_projects_bad.get(
                 project_name, "") + f", {e}"
             stats_models_bad[model_name] = stats_models_bad.get(
                 model_name, "") + f", {e}"
     stats_interval = datetime.datetime.now() - stats_start_time
     logger.info(
         f"Scheduled {len(stats_models_ok)} models in {len(stats_projects_ok)} projects in {human_delta(stats_interval)}"
     )
     if len(stats_models_bad) > 0 or len(stats_projects_bad) > 0:
         logger.error(
             f"          {len(stats_models_bad)} models in {len(stats_projects_bad)} projects failed"
         )
         for name in stats_models_bad:
             logger.error(f"          + {name}({stats_models_bad[name]})")
Ejemplo n.º 3
0
def deserialize_task(task_bytes, mode="json") -> typing.Optional["Task"]:
    """
    Deserialize a task from bytes.

    Args:
        task_bytes: Raw payload; a pickle blob when mode is "pickle",
            otherwise JSON handled by dataclass_json.
        mode: "pickle" to unpickle, anything else deserializes as JSON.

    Returns:
        The deserialized task, or None if deserialization failed
        (the error is logged).
    """
    task = None
    if mode == "pickle":
        # SECURITY: pickle.loads executes arbitrary code from the payload;
        # this mode must only be used on trusted input.
        try:
            task = pickle.loads(task_bytes)
        except Exception as e:
            # Broad catch for consistency with the JSON branch below:
            # pickle.loads can raise more than UnpicklingError
            # (e.g. EOFError, AttributeError on missing classes).
            logger.error(f"Could not deserialize task from pickle of size {len(task_bytes)}bytes: {e}")
            traceback.print_exc()
    else:
        try:
            # Rely on dataclass_json
            task = Task.from_json(task_bytes)
        except Exception as e:
            logger.error(f"Could not deserialize task from json of size {len(task_bytes)}bytes: '{task_bytes}', error:'{e}'")
            traceback.print_exc()
    return task
Ejemplo n.º 4
0
    def perform_prediction_step(self):
        """Schedule one prediction task per model across all projects.

        Fetches the project list and the model names per project from the
        metadata providers, enqueues a Task for every (project, model)
        pair covering the configured prediction window, and logs a
        summary of what was scheduled.

        Raises:
            ValueError: If no projects could be fetched.
        """
        stats_projects_ok = set()
        stats_models_ok = set()

        stats_start_time_utc = datetime.datetime.now(utc)

        # Use UTC time because the queue will ignore the timezone.
        # The Gordo client also ignores microseconds, so round them to 0.
        prediction_start_time_utc = stats_start_time_utc.replace(
            microsecond=0) - self.continuous_prediction_delay
        # Start time already has microsecond == 0, so plain timedelta
        # addition keeps the end time rounded as well.
        prediction_end_time_utc = prediction_start_time_utc + self.continuous_prediction_interval

        # fetch projects from the API
        projects = self.models_metadata_info_provider.get_projects()
        if not projects:
            raise ValueError(
                "No projects were fetched for scheduling the predictions.")

        model_names_by_project = self.model_info_provider.get_all_model_names_by_project(
            projects=projects)

        for project_name, models in model_names_by_project.items():
            for model_name in models:
                task = Task(
                    project_name=project_name,
                    model_name=model_name,
                    from_time=prediction_start_time_utc,
                    to_time=prediction_end_time_utc,
                )
                with pylogctx.context(task=task):
                    self.task_queue.put_task(task)

                    stats_projects_ok.add(project_name)
                    stats_models_ok.add(model_name)

        logger.info(
            f"Scheduled {len(stats_models_ok)} models over {len(stats_projects_ok)} projects."
        )
Ejemplo n.º 5
0
 def get_task(self) -> Task:
     """Return a placeholder ("null") task."""
     placeholder = Task("null")
     return placeholder