Example #1
    def _resolve_job_function(
        self,
        scheduled_kind: schemas.ScheduleKinds,
        scheduled_object: Any,
        project_name: str,
        schedule_name: str,
        schedule_concurrency_limit: int,
    ) -> Tuple[Callable, Optional[Union[List, Tuple]], Optional[Dict]]:
        """
        :return: a tuple (function, args, kwargs) to be used with the APScheduler.add_job
        """

        if scheduled_kind == schemas.ScheduleKinds.job:
            scheduled_object_copy = copy.deepcopy(scheduled_object)
            return (
                Scheduler.submit_run_wrapper,
                [
                    scheduled_object_copy,
                    project_name,
                    schedule_name,
                    schedule_concurrency_limit,
                ],
                {},
            )
        if scheduled_kind == schemas.ScheduleKinds.local_function:
            return scheduled_object, [], {}

        # sanity
        message = "Scheduled object kind missing implementation"
        logger.warn(message, scheduled_object_kind=scheduled_kind)
        raise NotImplementedError(message)
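The (function, args, kwargs) tuple above maps directly onto APScheduler's add_job parameters. A minimal usage sketch, assuming a hypothetical self._scheduler (an APScheduler instance) plus schedule_kind, trigger and job_id placeholders that are not part of the original example:

    # Hypothetical usage sketch, not the original caller: hand the resolved
    # (function, args, kwargs) tuple to APScheduler's add_job.
    function, args, kwargs = self._resolve_job_function(
        schedule_kind,
        scheduled_object,
        project_name,
        schedule_name,
        schedule_concurrency_limit,
    )
    self._scheduler.add_job(function, trigger, args, kwargs, id=job_id)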
Example #2
    def _reload_schedules(
        self,
        db_session: Session,
        auth_info: mlrun.api.schemas.AuthInfo,
    ):
        logger.info("Reloading schedules")
        db_schedules = get_db().list_schedules(db_session)
        for db_schedule in db_schedules:
            # don't let one failure fail the rest
            try:
                self._create_schedule_in_scheduler(
                    db_schedule.project,
                    db_schedule.name,
                    db_schedule.kind,
                    db_schedule.scheduled_object,
                    db_schedule.cron_trigger,
                    db_schedule.concurrency_limit,
                    auth_info,
                )
            except Exception as exc:
                logger.warn(
                    "Failed rescheduling job. Continuing",
                    exc=str(exc),
                    db_schedule=db_schedule,
                )
Example #3
    async def submit_run_wrapper(
        scheduled_object,
        project_name,
        schedule_name,
        schedule_concurrency_limit,
        auth_info: mlrun.api.schemas.AuthInfo,
    ):
        # import here to avoid circular imports
        from mlrun.api.api.utils import submit_run

        # remove the schedule from the body, otherwise when the scheduler submits this task
        # it will enter an endless scheduling loop
        scheduled_object.pop("schedule", None)

        # removing the uid from the task metadata so that a new uid will be generated for every run
        # otherwise all runs will have the same uid
        scheduled_object.get("task", {}).get("metadata", {}).pop("uid", None)

        if "task" in scheduled_object and "metadata" in scheduled_object[
                "task"]:
            scheduled_object["task"]["metadata"].setdefault("labels", {})
            scheduled_object["task"]["metadata"]["labels"][
                schemas.constants.LabelNames.schedule_name] = schedule_name

        db_session = create_session()

        active_runs = get_db().list_runs(
            db_session,
            state=RunStates.non_terminal_states(),
            project=project_name,
            labels=f"{schemas.constants.LabelNames.schedule_name}={schedule_name}",
        )
        if len(active_runs) >= schedule_concurrency_limit:
            logger.warn(
                "Schedule exceeded concurrency limit, skipping this run",
                project=project_name,
                schedule_name=schedule_name,
                schedule_concurrency_limit=schedule_concurrency_limit,
                active_runs=len(active_runs),
            )
            return

        response = await submit_run(db_session, auth_info, scheduled_object)

        run_metadata = response["data"]["metadata"]
        run_uri = RunObject.create_uri(run_metadata["project"],
                                       run_metadata["uid"],
                                       run_metadata["iteration"])
        get_db().update_schedule(
            db_session,
            run_metadata["project"],
            schedule_name,
            last_run_uri=run_uri,
            leader_session=auth_info.session,
        )

        close_session(db_session)

        return response
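The two pop() calls near the top of submit_run_wrapper are easiest to see on a toy dict (illustrative only, not mlrun's real task schema):

    # Illustrative only: the effect of the two pop() calls in submit_run_wrapper.
    scheduled_object = {
        "schedule": "*/10 * * * *",
        "task": {"metadata": {"uid": "abc123", "name": "my-task"}},
    }
    scheduled_object.pop("schedule", None)
    scheduled_object.get("task", {}).get("metadata", {}).pop("uid", None)
    # scheduled_object == {"task": {"metadata": {"name": "my-task"}}}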
Example #4
    def _reload_schedules(self, db_session: Session):
        logger.info("Reloading schedules")
        db_schedules = get_db().list_schedules(db_session)
        for db_schedule in db_schedules:
            # don't let one failure fail the rest
            try:
                # import here to avoid circular imports
                import mlrun.api.crud

                access_key = None
                username = None
                if mlrun.api.utils.auth.verifier.AuthVerifier().is_jobs_auth_required():
                    username, access_key = self._get_schedule_secrets(
                        db_schedule.project, db_schedule.name
                    )
                self._create_schedule_in_scheduler(
                    db_schedule.project,
                    db_schedule.name,
                    db_schedule.kind,
                    db_schedule.scheduled_object,
                    db_schedule.cron_trigger,
                    db_schedule.concurrency_limit,
                    mlrun.api.schemas.AuthInfo(username=username,
                                               access_key=access_key),
                )
            except Exception as exc:
                logger.warn(
                    "Failed rescheduling job. Continuing",
                    exc=str(exc),
                    traceback=traceback.format_exc(),
                    db_schedule=db_schedule,
                )
Example #5
def _parse_query_parameters(request_body: Dict[str, Any]) -> Dict[str, str]:
    """
    This function searches for the target field in Grafana's SimpleJson request body. Once located, the target
    string is parsed by splitting on semicolons (;). Each part of the resulting list is then split on an equals
    sign (=) and read as key-value pairs.
    """

    # Try to get the target
    targets = request_body.get("targets", [])

    if len(targets) > 1:
        logger.warn(
            f"The 'targets' list contains more than one element ({len(targets)}), "
            "all targets except the first one are ignored."
        )

    target_obj = targets[0] if targets else {}
    target_query = target_obj.get("target") if target_obj else ""

    if not target_query:
        raise MLRunBadRequestError(
            f"Target missing in request body:\n {request_body}")

    parameters = _parse_parameters(target_query)

    return parameters
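_parse_parameters itself is not shown in this snippet. Based on the docstring above (split the target string on semicolons, then split each part on an equals sign), a plausible sketch of it could look like this:

    def _parse_parameters(target_query: str) -> Dict[str, str]:
        # Sketch inferred from the docstring above, not the original implementation:
        # turn "key1=value1;key2=value2" into {"key1": "value1", "key2": "value2"}.
        parameters = {}
        for part in target_query.split(";"):
            if not part:
                continue
            key, _, value = part.partition("=")
            parameters[key.strip()] = value.strip()
        return parameters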
Example #6
    def post_init(self, mode="sync"):
        server = getattr(self.context, "_server", None) or getattr(
            self.context, "server", None)
        if not server:
            logger.warn(
                "GraphServer not initialized for VotingEnsemble instance")
            return

        _init_endpoint_record(server, self)
Example #7
def grafana_incoming_features(
    body: Dict[str, Any], query_parameters: Dict[str, str], access_key: str
):
    endpoint_id = query_parameters.get("endpoint_id")
    project = query_parameters.get("project")
    start = body.get("rangeRaw", {}).get("from", "now-1h")
    end = body.get("rangeRaw", {}).get("to", "now")

    endpoint = ModelEndpoints.get_endpoint(access_key=access_key,
                                           project=project,
                                           endpoint_id=endpoint_id)

    time_series = []

    feature_names = endpoint.spec.feature_names

    if not feature_names:
        logger.warn(
            "'feature_names' is either missing or not initialized in endpoint record",
            endpoint_id=endpoint.metadata.uid,
        )
        return time_series

    path = config.model_endpoint_monitoring.store_prefixes.default.format(
        project=project, kind=EVENTS)
    _, container, path = parse_model_endpoint_store_prefix(path)

    client = get_frames_client(
        token=access_key,
        address=config.v3io_framesd,
        container=container,
    )

    data: pd.DataFrame = client.read(
        backend="tsdb",
        table=path,
        columns=feature_names,
        filter=f"endpoint_id=='{endpoint_id}'",
        start=start,
        end=end,
    )

    data.drop(["endpoint_id"], axis=1, inplace=True, errors="ignore")
    data.index = data.index.astype(np.int64) // 10**6

    for feature, indexed_values in data.to_dict().items():
        target = GrafanaTimeSeriesTarget(target=feature)
        for index, value in indexed_values.items():
            data_point = GrafanaDataPoint(value=float(value), timestamp=index)
            target.add_data_point(data_point)
        time_series.append(target)

    return time_series
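The index conversion (// 10**6) turns pandas' nanosecond timestamps into the millisecond epochs Grafana expects, and each GrafanaTimeSeriesTarget presumably serializes to the standard SimpleJson time-series shape. For orientation, that response format looks roughly like this (illustrative values):

    # Illustrative only: the SimpleJson time-series response shape -
    # one entry per target, each datapoint a [value, epoch_milliseconds] pair.
    example_response = [
        {
            "target": "feature_0",
            "datapoints": [[0.42, 1624024800000], [0.47, 1624028400000]],
        }
    ]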
Example #8
    def _validate_cron_trigger(
        self,
        cron_trigger: schemas.ScheduleCronTrigger,
        # accepting now from outside for testing purposes
        now: datetime = None,
    ):
        """
        Enforce no more than one job per min_allowed_interval
        """
        logger.debug("Validating cron trigger")
        apscheduler_cron_trigger = self.transform_schemas_cron_trigger_to_apscheduler_cron_trigger(
            cron_trigger
        )
        now = now or datetime.now(apscheduler_cron_trigger.timezone)
        next_run_time = None
        second_next_run_time = now

        # doing 60 checks to allow one-minute precision; if the _min_allowed_interval is less than one minute,
        # validation won't fail in certain scenarios where it should. See test_validate_cron_trigger_multi_checks
        # for a detailed explanation
        for index in range(60):
            next_run_time = apscheduler_cron_trigger.get_next_fire_time(
                None, second_next_run_time
            )
            # will be None if we got a schedule that has no next fire time - for example, a schedule with year=1999
            if next_run_time is None:
                return
            second_next_run_time = apscheduler_cron_trigger.get_next_fire_time(
                next_run_time, next_run_time
            )
            # will be None if we got a schedule that has no next fire time - for example, a schedule with year=2050
            if second_next_run_time is None:
                return
            min_allowed_interval_seconds = humanfriendly.parse_timespan(
                self._min_allowed_interval
            )
            if second_next_run_time < next_run_time + timedelta(
                seconds=min_allowed_interval_seconds
            ):
                logger.warn(
                    "Cron trigger too frequent. Rejecting",
                    cron_trigger=cron_trigger,
                    next_run_time=next_run_time,
                    second_next_run_time=second_next_run_time,
                    delta=second_next_run_time - next_run_time,
                )
                raise ValueError(
                    f"Cron trigger too frequent. no more then one job "
                    f"per {self._min_allowed_interval} is allowed"
                )
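humanfriendly.parse_timespan is what lets _min_allowed_interval be a human-readable string while the comparison above works in seconds. A standalone illustration (the "10 minutes" value is just an example, not necessarily the configured default):

    import humanfriendly
    from datetime import timedelta

    # parse_timespan converts human-readable durations into seconds,
    # e.g. "10 minutes" -> 600.0, suitable for timedelta(seconds=...).
    seconds = humanfriendly.parse_timespan("10 minutes")
    print(timedelta(seconds=seconds))  # 0:10:00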
Example #9
    def post_init(self, mode="sync"):
        """sync/async model loading, for internal use"""
        if not self.ready:
            if mode == "async":
                t = threading.Thread(target=self._load_and_update_state)
                t.start()
                self.context.logger.info(
                    f"started async model loading for {self.name}")
            else:
                self._load_and_update_state()

        server = getattr(self.context, "_server", None) or getattr(
            self.context, "server", None)
        if not server:
            logger.warn(
                "GraphServer not initialized for VotingEnsemble instance")
            return

        _init_endpoint_record(server, self)
Example #10
    def _reload_schedules(self, db_session: Session):
        logger.info("Reloading schedules")
        db_schedules = get_db().list_schedules(db_session)
        for db_schedule in db_schedules:
            # don't let one failure fail the rest
            try:
                # import here to avoid circular imports
                import mlrun.api.crud

                session = None
                if self._store_schedule_credentials_in_secrets:
                    schedule_secret_key = mlrun.api.crud.Secrets().generate_schedule_secret_key(
                        db_schedule.name
                    )
                    secret_key_map = (
                        mlrun.api.crud.Secrets().generate_schedule_key_map_secret_key()
                    )
                    session = mlrun.api.crud.Secrets().get_secret(
                        db_schedule.project,
                        self._secrets_provider,
                        schedule_secret_key,
                        allow_secrets_from_k8s=True,
                        allow_internal_secrets=True,
                        key_map_secret_key=secret_key_map,
                    )
                self._create_schedule_in_scheduler(
                    db_schedule.project,
                    db_schedule.name,
                    db_schedule.kind,
                    db_schedule.scheduled_object,
                    db_schedule.cron_trigger,
                    db_schedule.concurrency_limit,
                    mlrun.api.schemas.AuthInfo(session=session),
                )
            except Exception as exc:
                logger.warn(
                    "Failed rescheduling job. Continuing",
                    exc=str(exc),
                    traceback=traceback.format_exc(),
                    db_schedule=db_schedule,
                )
Example #11
    def do(self, event: Dict):
        endpoint_id = event[ENDPOINT_ID]

        if endpoint_id not in self.feature_names:
            endpoint_record = get_endpoint_record(
                kv_container=self.kv_container,
                kv_path=self.kv_path,
                endpoint_id=endpoint_id,
            )
            feature_names = endpoint_record.get(FEATURE_NAMES)
            feature_names = json.loads(feature_names) if feature_names else None

            label_columns = endpoint_record.get(LABEL_COLUMNS)
            label_columns = json.loads(label_columns) if label_columns else None

            if not feature_names:
                logger.warn(
                    f"Feature names are not initialized, they will be automatically generated",
                    endpoint_id=endpoint_id,
                )
                feature_names = [
                    f"f{i}" for i, _ in enumerate(event[FEATURES])
                ]
                get_v3io_client().kv.update(
                    container=self.kv_container,
                    table_path=self.kv_path,
                    key=event[ENDPOINT_ID],
                    attributes={FEATURE_NAMES: json.dumps(feature_names)},
                )

            if not label_columns:
                logger.warn(
                    f"label column names are not initialized, they will be automatically generated",
                    endpoint_id=endpoint_id,
                )
                label_columns = [
                    f"p{i}" for i, _ in enumerate(event[PREDICTION])
                ]
                get_v3io_client().kv.update(
                    container=self.kv_container,
                    table_path=self.kv_path,
                    key=event[ENDPOINT_ID],
                    attributes={LABEL_COLUMNS: json.dumps(label_columns)},
                )

            self.label_columns[endpoint_id] = label_columns
            self.feature_names[endpoint_id] = feature_names

        feature_names = self.feature_names[endpoint_id]
        features = event[FEATURES]
        event[NAMED_FEATURES] = {
            name: feature
            for name, feature in zip(feature_names, features)
        }

        label_columns = self.label_columns[endpoint_id]
        prediction = event[PREDICTION]
        event[NAMED_PREDICTIONS] = {
            name: prediction
            for name, prediction in zip(label_columns, prediction)
        }
        return event
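The final step just zips the resolved names with the raw event values; a standalone illustration of what the NAMED_FEATURES mapping ends up holding:

    # Illustrative only: how the zip-based comprehension pairs names with values.
    feature_names = ["f0", "f1", "f2"]
    features = [0.1, 0.2, 0.3]
    named_features = {name: value for name, value in zip(feature_names, features)}
    # named_features == {"f0": 0.1, "f1": 0.2, "f2": 0.3}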
Example #12
    def do(self, event: Dict):
        endpoint_id = event[ENDPOINT_ID]

        if endpoint_id not in self.feature_names:
            endpoint_record = get_endpoint_record(
                kv_container=self.kv_container,
                kv_path=self.kv_path,
                endpoint_id=endpoint_id,
                access_key=self.access_key,
            )
            feature_names = endpoint_record.get(FEATURE_NAMES)
            feature_names = json.loads(feature_names) if feature_names else None

            label_columns = endpoint_record.get(LABEL_COLUMNS)
            label_columns = json.loads(label_columns) if label_columns else None

            if not feature_names and self._infer_columns_from_data:
                feature_names = self._infer_feature_names_from_data(event)

            if not feature_names:
                logger.warn(
                    "Feature names are not initialized, they will be automatically generated",
                    endpoint_id=endpoint_id,
                )
                feature_names = [
                    f"f{i}" for i, _ in enumerate(event[FEATURES])
                ]
                get_v3io_client().kv.update(
                    container=self.kv_container,
                    table_path=self.kv_path,
                    access_key=self.access_key,
                    key=event[ENDPOINT_ID],
                    attributes={FEATURE_NAMES: json.dumps(feature_names)},
                    raise_for_status=RaiseForStatus.always,
                )

            if not label_columns and self._infer_columns_from_data:
                label_columns = self._infer_label_columns_from_data(event)

            if not label_columns:
                logger.warn(
                    "label column names are not initialized, they will be automatically generated",
                    endpoint_id=endpoint_id,
                )
                label_columns = [
                    f"p{i}" for i, _ in enumerate(event[PREDICTION])
                ]
                get_v3io_client().kv.update(
                    container=self.kv_container,
                    table_path=self.kv_path,
                    access_key=self.access_key,
                    key=event[ENDPOINT_ID],
                    attributes={LABEL_COLUMNS: json.dumps(label_columns)},
                    raise_for_status=RaiseForStatus.always,
                )

            self.label_columns[endpoint_id] = label_columns
            self.feature_names[endpoint_id] = feature_names

            logger.info("Label columns",
                        endpoint_id=endpoint_id,
                        label_columns=label_columns)
            logger.info("Feature names",
                        endpoint_id=endpoint_id,
                        feature_names=feature_names)

        feature_names = self.feature_names[endpoint_id]
        features = event[FEATURES]
        event[NAMED_FEATURES] = {
            name: feature
            for name, feature in zip(feature_names, features)
        }

        label_columns = self.label_columns[endpoint_id]
        prediction = event[PREDICTION]
        event[NAMED_PREDICTIONS] = {
            name: prediction
            for name, prediction in zip(label_columns, prediction)
        }
        logger.info("Mapped event", event=event)
        return event
Example #13
    async def submit_run_wrapper(
        scheduler,
        scheduled_object,
        project_name,
        schedule_name,
        schedule_concurrency_limit,
        auth_info: mlrun.api.schemas.AuthInfo,
    ):
        # import here to avoid circular imports
        import mlrun.api.crud
        from mlrun.api.api.utils import submit_run

        # remove the schedule from the body, otherwise when the scheduler submits this task
        # it will enter an endless scheduling loop
        scheduled_object.pop("schedule", None)

        # removing the uid from the task metadata so that a new uid will be generated for every run
        # otherwise all runs will have the same uid
        scheduled_object.get("task", {}).get("metadata", {}).pop("uid", None)

        if "task" in scheduled_object and "metadata" in scheduled_object[
                "task"]:
            scheduled_object["task"]["metadata"].setdefault("labels", {})
            scheduled_object["task"]["metadata"]["labels"][
                schemas.constants.LabelNames.schedule_name] = schedule_name

        db_session = create_session()

        active_runs = mlrun.api.crud.Runs().list_runs(
            db_session,
            state=RunStates.non_terminal_states(),
            project=project_name,
            labels=f"{schemas.constants.LabelNames.schedule_name}={schedule_name}",
        )
        if len(active_runs) >= schedule_concurrency_limit:
            logger.warn(
                "Schedule exceeded concurrency limit, skipping this run",
                project=project_name,
                schedule_name=schedule_name,
                schedule_concurrency_limit=schedule_concurrency_limit,
                active_runs=len(active_runs),
            )
            return

        # if credentials are needed but missing (this happens for schedules created before the scheduler stored
        # credentials, i.e. on upgrade from a version that didn't store them to one that does), enrich them
        # Note that here we're using the "knowledge" that submit_run only requires the session of the auth info
        if not auth_info.session and scheduler._store_schedule_credentials_in_secrets:
            # import here to avoid circular imports
            import mlrun.api.utils.auth
            import mlrun.api.utils.singletons.project_member

            logger.info(
                "Schedule missing auth info which is required. Trying to fill from project owner",
                project_name=project_name,
                schedule_name=schedule_name,
            )

            project_owner = mlrun.api.utils.singletons.project_member.get_project_member().get_project_owner(
                db_session, project_name
            )
            # Update the schedule with the new auth info so we won't need to do the above again in the next run
            scheduler.update_schedule(
                db_session,
                mlrun.api.schemas.AuthInfo(session=project_owner.session),
                project_name,
                schedule_name,
            )

        response = await submit_run(db_session, auth_info, scheduled_object)

        run_metadata = response["data"]["metadata"]
        run_uri = RunObject.create_uri(run_metadata["project"],
                                       run_metadata["uid"],
                                       run_metadata["iteration"])
        get_db().update_schedule(
            db_session,
            run_metadata["project"],
            schedule_name,
            last_run_uri=run_uri,
        )

        close_session(db_session)

        return response
Example #14
    def run(self):

        try:
            endpoints = self.db.list_model_endpoints(self.project)
        except Exception as e:
            logger.error("Failed to list endpoints", exc=e)
            return

        active_endpoints = set()
        for endpoint in endpoints.endpoints:
            if endpoint.spec.active:
                active_endpoints.add(endpoint.metadata.uid)

        store, sub = store_manager.get_or_create_store(self.parquet_path)
        prefix = self.parquet_path.replace(sub, "")
        fs = store.get_filesystem(silent=False)

        if not fs.exists(sub):
            logger.warn(f"{sub} does not exist")
            return

        for endpoint_dir in fs.ls(sub):
            endpoint_id = endpoint_dir["name"].split("=")[-1]
            if endpoint_id not in active_endpoints:
                continue

            try:
                last_year = self.get_last_created_dir(fs, endpoint_dir)
                last_month = self.get_last_created_dir(fs, last_year)
                last_day = self.get_last_created_dir(fs, last_month)
                last_hour = self.get_last_created_dir(fs, last_day)

                parquet_files = fs.ls(last_hour["name"])
                last_parquet = sorted(parquet_files, key=lambda k: k["mtime"])[-1]
                parquet_name = last_parquet["name"]
                full_path = f"{prefix}{parquet_name}"

                logger.info(f"Now processing {full_path}")

                endpoint = self.db.get_model_endpoint(
                    project=self.project, endpoint_id=endpoint_id
                )

                df = pd.read_parquet(full_path)
                timestamp = df["timestamp"].iloc[-1]

                named_features_df = list(df["named_features"])
                named_features_df = pd.DataFrame(named_features_df)

                current_stats = DFDataInfer.get_stats(
                    df=named_features_df, options=InferOptions.Histogram
                )

                drift_result = self.virtual_drift.compute_drift_from_histograms(
                    feature_stats=endpoint.status.feature_stats,
                    current_stats=current_stats,
                )

                logger.info("Drift result", drift_result=drift_result)

                drift_status, drift_measure = self.check_for_drift(
                    drift_result=drift_result, endpoint=endpoint
                )

                logger.info(
                    "Drift status",
                    endpoint_id=endpoint_id,
                    drift_status=drift_status,
                    drift_measure=drift_measure,
                )

                if drift_status == "POSSIBLE_DRIFT" or drift_status == "DRIFT_DETECTED":
                    self.v3io.stream.put_records(
                        container=self.stream_container,
                        stream_path=self.stream_path,
                        records=[
                            {
                                "data": json.dumps(
                                    {
                                        "endpoint_id": endpoint_id,
                                        "drift_status": drift_status,
                                        "drift_measure": drift_measure,
                                        "drift_per_feature": {**drift_result},
                                    }
                                )
                            }
                        ],
                    )

                self.v3io.kv.update(
                    container=self.kv_container,
                    table_path=self.kv_path,
                    key=endpoint_id,
                    attributes={
                        "current_stats": json.dumps(current_stats),
                        "drift_measures": json.dumps(drift_result),
                        "drift_status": drift_status,
                    },
                )

                tsdb_drift_measures = {
                    "endpoint_id": endpoint_id,
                    "timestamp": pd.to_datetime(timestamp, format=TIME_FORMAT),
                    "record_type": "drift_measures",
                    "tvd_mean": drift_result["tvd_mean"],
                    "kld_mean": drift_result["kld_mean"],
                    "hellinger_mean": drift_result["hellinger_mean"],
                }

                self.frames.write(
                    backend="tsdb",
                    table=self.tsdb_path,
                    dfs=pd.DataFrame.from_dict([tsdb_drift_measures]),
                    index_cols=["timestamp", "endpoint_id", "record_type"],
                )

                logger.info(f"Done updating drift measures {full_path}")

            except Exception as e:
                logger.error(e)