Example 1
    def _prepare_docker_command(self, options, run_mode, raw_arguments):
        """
        Building a docker command line for running the model inside the docker - this command line can
        be used by the user independently of drum.
        Parameters
        Returns: docker command line to run as a string
        """
        options.docker = self._maybe_build_image(options.docker)
        in_docker_model = "/opt/model"
        in_docker_input_file = "/opt/input.csv"
        in_docker_output_file = "/opt/output.csv"
        in_docker_fit_output_dir = "/opt/fit_output_dir"
        in_docker_fit_target_filename = "/opt/fit_target.csv"
        in_docker_fit_row_weights_filename = "/opt/fit_row_weights.csv"

        docker_cmd = "docker run --rm --interactive --user $(id -u):$(id -g) "
        docker_cmd_args = " -v {}:{}".format(options.code_dir, in_docker_model)

        in_docker_cmd_list = raw_arguments
        in_docker_cmd_list[0] = ArgumentsOptions.MAIN_COMMAND
        in_docker_cmd_list[1] = run_mode.value

        CMRunnerUtils.delete_cmd_argument(in_docker_cmd_list,
                                          ArgumentsOptions.DOCKER)
        CMRunnerUtils.replace_cmd_argument_value(in_docker_cmd_list,
                                                 ArgumentsOptions.CODE_DIR,
                                                 in_docker_model)
        CMRunnerUtils.replace_cmd_argument_value(in_docker_cmd_list, "-cd",
                                                 in_docker_model)
        CMRunnerUtils.replace_cmd_argument_value(in_docker_cmd_list,
                                                 ArgumentsOptions.INPUT,
                                                 in_docker_input_file)
        CMRunnerUtils.replace_cmd_argument_value(in_docker_cmd_list,
                                                 ArgumentsOptions.OUTPUT,
                                                 in_docker_output_file)

        if run_mode == RunMode.SERVER:
            host_port_list = options.address.split(":", 1)
            if len(host_port_list) == 1:
                raise DrumCommonException(
                    "Error: when using the docker option provide argument --server host:port"
                )
            port = int(host_port_list[1])
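            # inside the container the server must bind 0.0.0.0 so the
            # published port is reachable from the host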
            host_port_inside_docker = "{}:{}".format("0.0.0.0", port)
            CMRunnerUtils.replace_cmd_argument_value(in_docker_cmd_list,
                                                     ArgumentsOptions.ADDRESS,
                                                     host_port_inside_docker)
            docker_cmd_args += " -p {port}:{port}".format(port=port)

        if run_mode in [
                RunMode.SCORE, RunMode.PERF_TEST, RunMode.VALIDATION,
                RunMode.FIT
        ]:
            docker_cmd_args += " -v {}:{}".format(options.input,
                                                  in_docker_input_file)

            if run_mode == RunMode.SCORE and options.output:
                output_file = os.path.realpath(options.output)
                if not os.path.exists(output_file):
                    # Create an empty file so the mount command mounts the file correctly -
                    # otherwise docker creates an empty directory
                    open(output_file, "a").close()
                docker_cmd_args += " -v {}:{}".format(output_file,
                                                      in_docker_output_file)
                CMRunnerUtils.replace_cmd_argument_value(
                    in_docker_cmd_list, ArgumentsOptions.OUTPUT,
                    in_docker_output_file)
            elif run_mode == RunMode.FIT:
                if options.output:
                    fit_output_dir = os.path.realpath(options.output)
                    docker_cmd_args += " -v {}:{}".format(
                        fit_output_dir, in_docker_fit_output_dir)
                CMRunnerUtils.replace_cmd_argument_value(
                    in_docker_cmd_list, ArgumentsOptions.OUTPUT,
                    in_docker_fit_output_dir)
                if options.target_csv:
                    fit_target_filename = os.path.realpath(options.target_csv)
                    docker_cmd_args += " -v {}:{}".format(
                        fit_target_filename, in_docker_fit_target_filename)
                    CMRunnerUtils.replace_cmd_argument_value(
                        in_docker_cmd_list,
                        ArgumentsOptions.TARGET_FILENAME,
                        in_docker_fit_target_filename,
                    )
                if options.row_weights_csv:
                    fit_row_weights_filename = os.path.realpath(
                        options.row_weights_csv)
                    docker_cmd_args += " -v {}:{}".format(
                        fit_row_weights_filename,
                        in_docker_fit_row_weights_filename)
                    CMRunnerUtils.replace_cmd_argument_value(
                        in_docker_cmd_list,
                        ArgumentsOptions.WEIGHTS_CSV,
                        in_docker_fit_row_weights_filename,
                    )

        docker_cmd += " {} {} {}".format(docker_cmd_args, options.docker,
                                         " ".join(in_docker_cmd_list))

        self._print_verbose("docker command: [{}]".format(docker_cmd))
        return docker_cmd
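A minimal sketch of the command string this method assembles for a plain "score" run, with hypothetical host paths and image tag (the flag names follow the substitutions above):

# Hypothetical host paths and image tag, for illustration only.
code_dir = "/home/user/model"
input_csv = "/home/user/data.csv"
image = "my_env"

docker_cmd = (
    "docker run --rm --interactive --user $(id -u):$(id -g)"
    " -v {}:/opt/model -v {}:/opt/input.csv"
    " {} drum score --code-dir /opt/model --input /opt/input.csv".format(
        code_dir, input_csv, image))
print(docker_cmd)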
Example 2
def shared_fit_preprocessing(fit_class):
    """
    Shared preprocessing to get X, y, class_order, and row_weights.
    Used by _materialize method for both python and R fitting.

    :param fit_class: PythonFit or RFit class
    :return:
        X: pd.DataFrame of features to use in fit
        y: pd.Series of target to use in fit
        class_order: array specifying class order, or None
        row_weights: pd.Series of row weights, or None
    """
    # read in data
    if fit_class.input_filename.endswith(".mtx"):
        colnames = None
        if fit_class.sparse_column_file:
            colnames = [
                column.strip()
                for column in open(fit_class.sparse_column_file).readlines()
            ]
        df = pd.DataFrame.sparse.from_spmatrix(mmread(
            fit_class.input_filename),
                                               columns=colnames)
    else:
        df = pd.read_csv(fit_class.input_filename)

    # get num rows to use
    if fit_class.num_rows == "ALL":
        fit_class.num_rows = len(df)
    else:
        # convert first: num_rows arrives from the command line and may be a string
        fit_class.num_rows = int(fit_class.num_rows)
        if fit_class.num_rows > len(df):
            raise DrumCommonException(
                "Requested number of rows greater than data length {} > {}".format(
                    fit_class.num_rows, len(df)))

    # get target and features, resample and modify nrows if needed
    if fit_class.target_filename or fit_class.target_name:
        if fit_class.target_filename:
            y_unsampled = pd.read_csv(fit_class.target_filename,
                                      index_col=False)
            assert (
                len(y_unsampled.columns) == 1
            ), "Your target dataset at path {} has {} columns named {}".format(
                fit_class.target_filename, len(y_unsampled.columns),
                y_unsampled.columns)
            assert len(df) == len(
                y_unsampled
            ), "Your input data has {} entries, but your target data has {} entries".format(
                len(df), len(y_unsampled))
            if y_unsampled.columns[0] in df.columns:
                y_unsampled.columns = ["__target__"]
            df = df.merge(y_unsampled, left_index=True, right_index=True)
            assert len(y_unsampled.columns.values) == 1
            fit_class.target_name = y_unsampled.columns.values[0]
        df = df.dropna(subset=[fit_class.target_name])
        X = df.drop(fit_class.target_name, axis=1).sample(fit_class.num_rows,
                                                          random_state=1)
        y = df[fit_class.target_name].sample(fit_class.num_rows,
                                             random_state=1)

    else:
        X = df.sample(fit_class.num_rows, random_state=1)
        y = None

    row_weights = extract_weights(X, fit_class)
    class_order = extract_class_order(fit_class)
    return X, y, class_order, row_weights
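A minimal sketch of the fit_class contract assumed above. The attribute names mirror what shared_fit_preprocessing() reads; the class itself is a hypothetical stand-in for a PythonFit/RFit instance, and extract_weights()/extract_class_order() may read further attributes not shown here:

class FakeFitClass:
    # hypothetical stand-in; a dense CSV input skips the .mtx branch
    input_filename = "train.csv"
    sparse_column_file = None
    num_rows = "ALL"          # use every row
    target_filename = None    # the target lives in train.csv itself...
    target_name = "y"         # ...under this column name

# X, y, class_order, row_weights = shared_fit_preprocessing(FakeFitClass())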
Example 3
    def _run_fit_and_predictions_pipelines_in_mlpiper(self):
        if self.run_mode == RunMode.SERVER:
            run_language = self._check_artifacts_and_get_run_language()
            # in prediction server mode infra pipeline == prediction server runner pipeline
            infra_pipeline_str = self._prepare_prediction_server_or_batch_pipeline(
                run_language)
        elif self.run_mode == RunMode.SCORE:
            run_language = self._check_artifacts_and_get_run_language()
            tmp_output_filename = None
            # if output is not provided, output into tmp file and print
            if not self.options.output:
                # keep the object reference so the temp file is deleted only at process exit
                __tmp_output_file = tempfile.NamedTemporaryFile(mode="w")
                self.options.output = tmp_output_filename = __tmp_output_file.name
            # in batch prediction mode infra pipeline == predictor pipeline
            infra_pipeline_str = self._prepare_prediction_server_or_batch_pipeline(
                run_language)
        elif self.run_mode == RunMode.FIT:
            run_language = self._get_fit_run_language()
            infra_pipeline_str = self._prepare_fit_pipeline(run_language)
        else:
            error_message = "{} mode is not supported here".format(
                self.run_mode)
            print(error_message)
            raise DrumCommonException(error_message)

        config = ExecutorConfig(
            pipeline=infra_pipeline_str,
            pipeline_file=None,
            run_locally=True,
            comp_root_path=CMRunnerUtils.get_components_repo(),
            mlpiper_jar=None,
            spark_jars=None,
        )

        _pipeline_executor = Executor(config).standalone(True).set_verbose(
            self.options.verbose)
        # assign logger with the name drum.mlpiper.Executor to mlpiper Executor
        _pipeline_executor.set_logger(
            logging.getLogger(LOGGER_NAME_PREFIX + "." +
                              _pipeline_executor.logger_name()))

        self.logger.info(">>> Start {} in the {} mode".format(
            ArgumentsOptions.MAIN_COMMAND, self.run_mode.value))
        sc = StatsCollector(disable_instance=(
            not hasattr(self.options, "show_perf")
            or not self.options.show_perf or self.run_mode == RunMode.SERVER))
        sc.register_report("Full time", "end", StatsOperation.SUB, "start")
        sc.register_report("Init time (incl model loading)", "init",
                           StatsOperation.SUB, "start")
        sc.register_report("Run time (incl reading CSV)", "run",
                           StatsOperation.SUB, "init")
        with verbose_stdout(self.options.verbose):
            sc.enable()
            try:
                sc.mark("start")

                _pipeline_executor.init_pipeline()
                self.runtime.initialization_succeeded = True
                sc.mark("init")

                _pipeline_executor.run_pipeline(cleanup=False)
                sc.mark("run")
            finally:
                _pipeline_executor.cleanup_pipeline()
                sc.mark("end")
                sc.disable()
        self.logger.info("<<< Finish {} in the {} mode".format(
            ArgumentsOptions.MAIN_COMMAND, self.run_mode.value))
        sc.print_reports()
        if self.run_mode == RunMode.SCORE:
            # print result if output is not provided
            if tmp_output_filename:
                print(pd.read_csv(tmp_output_filename))
Example 4
    def _maybe_build_image(self, docker_image_or_directory):
        def _get_requirements_lines(reqs_file_path):
            if not os.path.exists(reqs_file_path):
                return None

            with open(reqs_file_path) as f:
                lines = f.readlines()
                lines = [l.strip() for l in lines]
            return lines

        ret_docker_image = None
        if os.path.isdir(docker_image_or_directory):
            docker_image_or_directory = os.path.abspath(
                docker_image_or_directory)
            # Set image tag to the dirname/dirname of the docker context.
            # E.g. for two folders:
            # /home/path1/my_env
            # /home/path2/my_env
            # tags will be 'path1/my_env', 'path2/my_env'
            #
            # If tag already exists, older image will be untagged.
            context_path = os.path.abspath(docker_image_or_directory)
            tag = "{}/{}".format(
                os.path.basename(os.path.dirname(context_path)),
                os.path.basename(context_path)).lower()

            lines = _get_requirements_lines(
                os.path.join(self.options.code_dir, "requirements.txt"))
            temp_context_dir = None
            if lines is not None and not self.options.skip_deps_install:
                temp_context_dir = tempfile.mkdtemp()
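                # copytree() below requires that the destination not exist, so
                # drop the directory mkdtemp() just created and let copytree()
                # recreate it with the context's contents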
                shutil.rmtree(temp_context_dir)
                shutil.copytree(docker_image_or_directory, temp_context_dir)
                msg = (
                    "Requirements file has been found in the code dir. DRUM will try to install dependencies into a docker image.\n"
                    "Docker context has been copied from: {} to: {}".format(
                        docker_image_or_directory, temp_context_dir))

                print(msg)
                self.logger.debug(msg)
                docker_image_or_directory = temp_context_dir

                with open(os.path.join(temp_context_dir, "Dockerfile"),
                          mode="a") as f:
                    if self.options.language == RunLanguage.PYTHON.value:
                        f.write("\nRUN pip3 install {}".format(
                            " ".join(lines)))
                    elif self.options.language == RunLanguage.R.value:
                        quoted_lines = ["'{}'".format(ll) for ll in lines]
                        deps_str = ", ".join(quoted_lines)
                        l1 = "\nRUN echo \"r <- getOption('repos'); r['CRAN'] <- 'http://cran.rstudio.com/'; options(repos = r);\" > ~/.Rprofile"
                        l2 = '\nRUN Rscript -e "withCallingHandlers(install.packages(c({}), Ncpus=4), warning = function(w) stop(w))"'.format(
                            deps_str)
                        f.write(l1)
                        f.write(l2)
                    else:
                        msg = "Dependencies management is not supported for the '{}' language and will not be installed into an image".format(
                            self.options.language)
                        self.logger.warning(msg)
                        print(msg)

            docker_build_msg = "Building a docker image from directory: {}...".format(
                docker_image_or_directory)
            self.logger.info(docker_build_msg)
            self.logger.info("This may take some time")

            try:
                client_docker_low_level = docker.APIClient()
                spinner = Spinner(docker_build_msg + "  ")
                json_lines = []
                # Build docker, rotate spinner according to build progress
                # and save status messages from docker build.
                for line in client_docker_low_level.build(
                        path=docker_image_or_directory, rm=True, tag=tag):
                    line = line.decode("utf-8").strip()
                    json_lines.extend(
                        [json.loads(ll) for ll in line.split("\n")])
                    spinner.next()
                spinner.finish()
                # skip a line after spinner
                print()

                image_id = None
                build_error = False
                for line in json_lines:
                    if "error" in line:
                        build_error = True
                        break
                    if "stream" in line:
                        match = re.search(
                            r"(^Successfully built |sha256:)([0-9a-f]+)$",
                            line["stream"])
                        if match:
                            image_id = match.group(2)
                if image_id is None or build_error:
                    all_lines = "   \n".join(
                        [json.dumps(l) for l in json_lines])
                    raise DrumCommonException(
                        "Failed to build a docker image:\n{}".format(
                            all_lines))

                print(
                    "\nImage successfully built; tag: {}; image id: {}".format(
                        tag, image_id))
                print(
                    "If you don't need to rebuild the image, it is recommended to run with --docker {}.\n"
                    .format(tag))

                ret_docker_image = image_id
            except docker.errors.APIError:
                self.logger.exception(
                    "Image build failed for a reason unknown to DRUM!")
                raise
            finally:
                if temp_context_dir is not None:
                    shutil.rmtree(temp_context_dir)
            self.logger.info("Done building image!")
        else:
            try:
                client = docker.client.from_env()
                client.images.get(docker_image_or_directory)
                ret_docker_image = docker_image_or_directory
            except docker.errors.ImageNotFound:
                pass

        if not ret_docker_image:
            raise DrumCommonException(
                "The string '{}' does not represent a docker image "
                "in your registry or a directory".format(
                    docker_image_or_directory))

        return ret_docker_image
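The tag rule from the comments above, shown in isolation with a hypothetical context path:

import os

context_path = "/home/path1/My_Env"  # hypothetical docker context directory
tag = "{}/{}".format(
    os.path.basename(os.path.dirname(context_path)),
    os.path.basename(context_path)).lower()
assert tag == "path1/my_env"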
Example 5
def _push_training(model_config, code_dir, endpoint=None, token=None):
    try:
        from datarobot._experimental import CustomTrainingBlueprint, CustomTrainingModel
    except ImportError:
        raise DrumCommonException(
            "You tried to run custom training models using a version of the \n"
            "datarobot client which doesn't have this beta functionality yet. \n"
            "Please pip install datarobot>=2.22.0b0 to access this functionality. \n"
            "This requires adding the internal datarobot artifactory index \n"
            "as your pip index. ")
    dr_client.Client(token=token, endpoint=endpoint)
    if "modelID" in model_config:
        model_id = model_config["modelID"]
    else:
        model_id = CustomTrainingModel.create(
            name=model_config["name"],
            target_type=_convert_target_type(model_config["targetType"]),
            description=model_config.get("description", "Pushed from DRUM"),
        ).id
        print(
            "You just created a new custom model. Please add this model ID to your metadata file "
            "by adding the line 'modelID:{}'".format(model_id))

    try:
        dr_client.CustomModelVersion.create_clean(
            model_id,
            base_environment_id=model_config["environmentID"],
            folder_path=code_dir,
            is_major_update=model_config.get("majorVersion", True),
        )
    except dr_client.errors.ClientError as e:
        print("Error adding model with ID {} and dir {}: {}".format(
            model_id, code_dir, str(e)))
        raise SystemExit(1)

    blueprint = CustomTrainingBlueprint.create(
        environment_id=model_config["environmentID"],
        custom_model_id=model_id,
    )

    print("A blueprint was created with the ID {}".format(blueprint.id))

    _print_model_started_dialogue(model_id)

    if "trainOnProject" in model_config.get("trainingModel", ""):
        try:
            project = dr_client.Project(
                model_config["trainingModel"]["trainOnProject"])
            model_job_id = project.train(blueprint)
            lid = dr_client.ModelJob.get(project_id=project.id,
                                         model_job_id=model_job_id).model_id
        except dr_client.errors.ClientError as e:
            print("There was an error training your model: {}".format(e))
            raise SystemExit(1)
        print("\nIn addition...")
        print("Model training has started! Follow along at this link: ")
        print(
            MODEL_LOGS_LINK_FORMAT.format(
                url=re.sub(r"/api/v2/?", "",
                           dr_client.client._global_client.endpoint),
                model_id=lid,
                project_id=model_config["trainingModel"]["trainOnProject"],
            ))
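A hypothetical model_config for _push_training(), limited to the keys the function reads above; the IDs are placeholders, not real DataRobot entities:

model_config = {
    "name": "my training model",
    "targetType": "regression",
    "environmentID": "<base environment id>",
    "majorVersion": True,
    "trainingModel": {"trainOnProject": "<project id>"},
    # optional: "modelID": "<existing custom model id>" to skip model creation
}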
Example 6
    def predict_unstructured(self, data, **kwargs):
        def _r_is_character(r_val):
            _is_character = ro.r("is.character")
            return bool(_is_character(r_val))

        def _r_is_raw(r_val):
            _is_raw = ro.r("is.raw")
            return bool(_is_raw(r_val))

        def _r_is_null(r_val):
            return r_val == ro.rinterface.NULL

        def _cast_r_to_py(r_val):
            # TODO: consider checking type against rpy2 proxy object like: isinstance(list_data_kwargs, ro.vectors.ListVector)
            # instead of calling R interpreter
            if _r_is_null(r_val):
                return None
            elif _r_is_raw(r_val):
                return bytes(r_val)
            elif _r_is_character(r_val):
                # Any scalar value is returned from R as one element vector,
                # so get this value.
                return str(r_val[0])
            else:
                raise DrumCommonException(
                    "Cannot convert R value {} of type {}".format(
                        r_val, type(r_val)))

        def _rlist_to_dict(rlist):
            if _r_is_null(rlist):
                return None
            return {str(k): _cast_r_to_py(v) for k, v in rlist.items()}

        data_binary_or_text = data

        if UnstructuredDtoKeys.QUERY in kwargs:
            kwargs[UnstructuredDtoKeys.QUERY] = ro.ListVector(
                kwargs[UnstructuredDtoKeys.QUERY])

        # if data_binary_or_text is str it will be auto converted into R character type;
        # otherwise if it is bytes, manually convert it into byte vector (raw)
        r_data_binary_or_text = data_binary_or_text
        if isinstance(data_binary_or_text, bytes):
            r_data_binary_or_text = ro.vectors.ByteVector(data_binary_or_text)

        kwargs_filtered = {k: v for k, v in kwargs.items() if v is not None}
        with capture_R_traceback_if_errors(r_handler, logger):
            list_data_kwargs = r_handler.predict_unstructured(
                model=self._model,
                data=r_data_binary_or_text,
                **kwargs_filtered)

        if isinstance(list_data_kwargs, ro.vectors.ListVector):
            ret = _cast_r_to_py(list_data_kwargs[0]), _rlist_to_dict(
                list_data_kwargs[1])
        else:
            raise DrumCommonException(
                "Wrong type returned in unstructured mode: {}".format(
                    type(list_data_kwargs)))

        return ret
Example 7
    def _materialize(self, parent_data_objs, user_data):
        model_api = base_api_blueprint()

        @model_api.route("/predict/", methods=["POST"])
        def predict():
            response_status = HTTP_200_OK
            file_key = "X"
            logger.debug("Entering predict() endpoint")
            REGRESSION_PRED_COLUMN = "Predictions"
            filename = request.files[
                file_key] if file_key in request.files else None
            logger.debug("Filename provided under X key: {}".format(filename))

            if not filename:
                wrong_key_error_message = "Samples should be provided as a csv file under the `{}` key.".format(
                    file_key)
                logger.error(wrong_key_error_message)
                response_status = HTTP_422_UNPROCESSABLE_ENTITY
                return {
                    "message": "ERROR: " + wrong_key_error_message
                }, response_status

            in_df = pd.read_csv(filename)

            # TODO labels have to be provided as command line arguments or within configure endpoint
            self._stats_collector.enable()
            self._stats_collector.mark("start")
            self._set_in_df(in_df)
            self._stats_collector.mark("set_in_df")
            self._run_pipeline()
            self._stats_collector.mark("run_pipeline")
            out_df = self._get_out_df()
            self._clean_out_mem()
            self._stats_collector.mark("get_out_df")
            self._stats_collector.disable()

            num_columns = len(out_df.columns)
            # float32 is not JSON serializable, so cast to float, which is float64
            out_df = out_df.astype("float")
            if num_columns == 1:
                # df.to_json() is much faster.
                # But as it returns string, we have to assemble final json using strings.
                df_json = out_df[REGRESSION_PRED_COLUMN].to_json(
                    orient="records")
                response_json = '{{"predictions":{df_json}}}'.format(
                    df_json=df_json)
            elif num_columns == 2:
                # df.to_json() is much faster.
                # But as it returns string, we have to assemble final json using strings.
                df_json_str = out_df.to_json(orient="records")
                response_json = '{{"predictions":{df_json}}}'.format(
                    df_json=df_json_str)
            else:
                ret_str = (
                    "Predictions dataframe has {} columns; "
                    "Expected: 1 - for regression, 2 - for binary classification."
                    .format(num_columns))
                response_json = {"message": "ERROR: " + ret_str}
                response_status = HTTP_422_UNPROCESSABLE_ENTITY

            return response_json, response_status

        @model_api.route("/stats/", methods=["GET"])
        def stats():
            mem_info = self._memory_monitor.collect_memory_info()
            ret_dict = {"mem_info": mem_info._asdict()}
            self._stats_collector.round()

            ret_dict["time_info"] = {}
            for name in self._stats_collector.get_report_names():
                d = self._stats_collector.dict_report(name)
                ret_dict["time_info"][name] = d
            self._stats_collector.stats_reset()
            return ret_dict, HTTP_200_OK

        app = get_flask_app(model_api)
        logging.getLogger("werkzeug").setLevel(logger.getEffectiveLevel())

        host = self._params.get("host", None)
        port = self._params.get("port", None)
        try:
            app.run(host, port, threaded=self._threaded)
        except OSError as e:
            raise DrumCommonException("{}: host: {}; port: {}".format(
                e, host, port))

        self._cleanup_pipeline()
        if self._stats_collector:
            self._stats_collector.print_reports()

        return []
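A minimal client-side sketch for the /predict/ endpoint above: samples must arrive as a multipart file upload under the "X" key. The host and port are hypothetical:

import requests

with open("data.csv", "rb") as f:
    response = requests.post("http://localhost:6789/predict/", files={"X": f})
print(response.status_code, response.json())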
Example 8
def _push_training(model_config, code_dir, endpoint=None, token=None):
    try:
        from datarobot._experimental import CustomTrainingBlueprint, CustomTrainingModel
    except ImportError:
        raise DrumCommonException(
            "You tried to run custom training models using a version of the \n"
            "datarobot client which doesn't have this beta functionality yet. \n"
            "Please pip install datarobot>=2.22.0b0 to access this functionality. \n"
            "This requires adding the internal datarobot artifactory index \n"
            "as your pip index. ")
    dr_client.Client(token=token, endpoint=endpoint)
    if ModelMetadataKeys.MODEL_ID in model_config:
        model_id = model_config[ModelMetadataKeys.MODEL_ID]
    else:
        model_id = CustomTrainingModel.create(
            name=model_config[ModelMetadataKeys.NAME],
            target_type=_convert_target_type(
                model_config[ModelMetadataKeys.TARGET_TYPE]),
            description=model_config.get("description", "Pushed from DRUM"),
        ).id
        print(
            "You just created a new custom model. Please add this model ID to your metadata file "
            "by adding the line 'modelID:{}'".format(model_id))

    try:
        model_version = dr_client.CustomModelVersion.create_clean(
            model_id,
            base_environment_id=model_config[ModelMetadataKeys.ENVIRONMENT_ID],
            folder_path=code_dir,
            is_major_update=model_config.get(ModelMetadataKeys.MAJOR_VERSION,
                                             True),
        )
    except dr_client.errors.ClientError as e:
        print("Error adding model with ID {} and dir {}: {}".format(
            model_id, code_dir, str(e)))
        raise SystemExit(1)

    # TODO: Update this once the datarobot client is updated
    payload = dict(custom_model_version_id=model_version.id)
    response = dr_client.client.get_client().post("customTrainingBlueprints/",
                                                  data=payload)
    user_blueprint_id = response.json()["userBlueprintId"]

    print("A user blueprint was created with the ID {}".format(
        user_blueprint_id))

    _print_model_started_dialogue(model_id)

    if "trainOnProject" in model_config.get("trainingModel", ""):
        try:
            pid = model_config["trainingModel"]["trainOnProject"]
            current_task = "fetching the specified project {}".format(pid)
            project = dr_client.Project(pid)

            # TODO: Update this once the datarobot client is updated
            payload = dict(user_blueprint_id=user_blueprint_id)
            current_task = "adding your model to the menu"
            response = dr_client.client.get_client().post(
                "projects/{}/blueprints/fromUserBlueprint/".format(pid),
                data=payload)
            blueprint_id = response.json()["id"]

            current_task = "actually training of blueprint {}".format(
                blueprint_id)
            model_job_id = project.train(blueprint_id)
            lid = dr_client.ModelJob.get(project_id=pid,
                                         model_job_id=model_job_id).model_id
        except dr_client.errors.ClientError as e:
            print("There was an error training your model while {}: {}".format(
                current_task, e))
            raise SystemExit(1)
        print("\nIn addition...")
        print("Model training has started! Follow along at this link: ")
        print(
            MODEL_LOGS_LINK_FORMAT.format(
                url=re.sub(r"/api/v2/?", "",
                           dr_client.client._global_client.endpoint),
                model_id=lid,
                project_id=model_config["trainingModel"]["trainOnProject"],
            ))
Example 9
def read_model_metadata_yaml(code_dir):
    code_dir = Path(code_dir)
    config_path = code_dir.joinpath(MODEL_CONFIG_FILENAME)
    if config_path.exists():
        with open(config_path) as f:
            try:
                model_config = load(f.read(), MODEL_CONFIG_SCHEMA).data
            except YAMLError as e:
                print(e)
                raise SystemExit(1)

        if model_config[
                ModelMetadataKeys.TARGET_TYPE] == TargetType.BINARY.value:
            if model_config[ModelMetadataKeys.TYPE] == "inference":
                validate_config_fields(model_config,
                                       ModelMetadataKeys.INFERENCE_MODEL)
                validate_config_fields(
                    model_config[ModelMetadataKeys.INFERENCE_MODEL],
                    *["positiveClassLabel", "negativeClassLabel"])

        if model_config[
                ModelMetadataKeys.TARGET_TYPE] == TargetType.MULTICLASS.value:
            if model_config[ModelMetadataKeys.TYPE] == "inference":
                validate_config_fields(model_config,
                                       ModelMetadataKeys.INFERENCE_MODEL)
                classLabelsKeyIn = "classLabels" in model_config[
                    ModelMetadataKeys.INFERENCE_MODEL]
                classLabelFileKeyIn = (
                    "classLabelsFile"
                    in model_config[ModelMetadataKeys.INFERENCE_MODEL])
                if all([classLabelsKeyIn, classLabelFileKeyIn]):
                    raise DrumCommonException(
                        "\nError - for multiclass classification, either the class labels or "
                        "a class labels file should be provided in {} file, but not both."
                        .format(MODEL_CONFIG_FILENAME))
                elif not any([classLabelsKeyIn, classLabelFileKeyIn]):
                    raise DrumCommonException(
                        "\nError - for multiclass classification, either the class labels or "
                        "a class labels file must be provided in {} file.".
                        format(MODEL_CONFIG_FILENAME))

                if classLabelFileKeyIn:
                    classLabelsFile = model_config[
                        ModelMetadataKeys.INFERENCE_MODEL]["classLabelsFile"]

                    with open(classLabelsFile) as f:
                        labels = [
                            label for label in f.read().split(os.linesep)
                            if label
                        ]
                        if len(labels) < 2:
                            raise DrumCommonException(
                                "Multiclass classification requires at least 2 labels."
                            )
                        model_config[ModelMetadataKeys.
                                     INFERENCE_MODEL]["classLabels"] = labels
                        model_config[ModelMetadataKeys.
                                     INFERENCE_MODEL]["classLabelsFile"] = None

        return model_config
    return None
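A hypothetical parsed model-config dict that passes the multiclass checks above: exactly one of "classLabels"/"classLabelsFile" is present under the inference-model section (key spellings here are assumptions based on the code):

model_config = {
    "targetType": "multiclass",
    "type": "inference",
    "inferenceModel": {
        "classLabels": ["setosa", "versicolor", "virginica"],
    },
}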
Example 10
    def _materialize(self, parent_data_objs, user_data):
        model_api = base_api_blueprint()

        @model_api.route("/capabilities/", methods=["GET"])
        def capabilities():
            return make_predictor_capabilities(self._predictor.supported_payload_formats)

        @model_api.route("/info/", methods=["GET"])
        def info():
            model_info = self._predictor.model_info()
            model_info.update({ModelInfoKeys.LANGUAGE: self._run_language.value})
            model_info.update({ModelInfoKeys.DRUM_VERSION: drum_version})
            model_info.update({ModelInfoKeys.DRUM_SERVER: "flask"})
            model_info.update(
                {ModelInfoKeys.MODEL_METADATA: read_model_metadata_yaml(self._code_dir)}
            )

            return model_info, HTTP_200_OK

        @model_api.route("/health/", methods=["GET"])
        def health():
            return {"message": "OK"}, HTTP_200_OK

        @model_api.route("/predictions/", methods=["POST"])
        @model_api.route("/predict/", methods=["POST"])
        def predict():
            logger.debug("Entering predict() endpoint")

            self._stats_collector.enable()
            self._stats_collector.mark("start")

            try:
                response, response_status = self.do_predict_structured(logger=logger)
            finally:
                self._stats_collector.mark("finish")
                self._stats_collector.disable()
            return response, response_status

        @model_api.route("/transform/", methods=["POST"])
        def transform():

            logger.debug("Entering transform() endpoint")

            self._stats_collector.enable()
            self._stats_collector.mark("start")

            try:
                response, response_status = self.do_transform(logger=logger)
            finally:
                self._stats_collector.mark("finish")
                self._stats_collector.disable()
            return response, response_status

        @model_api.route("/predictionsUnstructured/", methods=["POST"])
        @model_api.route("/predictUnstructured/", methods=["POST"])
        def predict_unstructured():
            logger.debug("Entering predict() endpoint")

            self._stats_collector.enable()
            self._stats_collector.mark("start")

            try:
                response, response_status = self.do_predict_unstructured(logger=logger)
            finally:
                self._stats_collector.mark("finish")
                self._stats_collector.disable()
            return response, response_status

        @model_api.route("/stats/", methods=["GET"])
        def stats():
            mem_info = self._memory_monitor.collect_memory_info()
            ret_dict = {"mem_info": mem_info._asdict()}

            self._stats_collector.round()
            ret_dict["time_info"] = {}
            for name in self._stats_collector.get_report_names():
                d = self._stats_collector.dict_report(name)
                ret_dict["time_info"][name] = d
            self._stats_collector.stats_reset()
            return ret_dict, HTTP_200_OK

        @model_api.errorhandler(Exception)
        def handle_exception(e):
            logger.exception(e)
            return {"message": "ERROR: {}".format(e)}, HTTP_500_INTERNAL_SERVER_ERROR

        # Disables warning for development server
        cli = sys.modules["flask.cli"]
        cli.show_server_banner = lambda *x: None

        app = get_flask_app(model_api)

        host = self._params.get("host", None)
        port = self._params.get("port", None)
        try:
            app.run(host, port, threaded=False)
        except OSError as e:
            raise DrumCommonException("{}: host: {}; port: {}".format(e, host, port))

        if self._stats_collector:
            self._stats_collector.print_reports()

        return []
Example 11
    def _prepare_docker_command(self, options, run_mode, raw_arguments):
        """
        Building a docker command line for running the model inside the docker - this command line
        can be used by the user independently of drum.
        Parameters
        Returns: docker command line to run as a string
        """
        options.docker = self._maybe_build_image(options.docker)
        in_docker_model = "/opt/model"
        in_docker_input_file = "/opt/input.csv"
        in_docker_output_file = "/opt/output.csv"
        in_docker_fit_output_dir = "/opt/fit_output_dir"
        in_docker_fit_target_filename = "/opt/fit_target.csv"
        in_docker_fit_row_weights_filename = "/opt/fit_row_weights.csv"

        docker_cmd = "docker run --rm --entrypoint '' --interactive --user $(id -u):$(id -g)"
        docker_cmd_args = ' -v "{}":{}'.format(options.code_dir,
                                               in_docker_model)

        in_docker_cmd_list = raw_arguments
        in_docker_cmd_list[0] = ArgumentsOptions.MAIN_COMMAND
        in_docker_cmd_list[1] = run_mode.value

        # [RAPTOR-5607] Using -cd makes fit fail within docker, but not --code-dir.
        # Hotfix it by replacing -cd with --code-dir
        in_docker_cmd_list = [
            ArgumentsOptions.CODE_DIR if arg == "-cd" else arg
            for arg in in_docker_cmd_list
        ]

        CMRunnerUtils.delete_cmd_argument(in_docker_cmd_list,
                                          ArgumentsOptions.DOCKER)
        CMRunnerUtils.delete_cmd_argument(in_docker_cmd_list,
                                          ArgumentsOptions.SKIP_DEPS_INSTALL)
        if options.memory:
            docker_cmd_args += " --memory {mem_size} --memory-swap {mem_size} ".format(
                mem_size=options.memory)
            CMRunnerUtils.delete_cmd_argument(in_docker_cmd_list,
                                              ArgumentsOptions.MEMORY)

        if options.class_labels and ArgumentsOptions.CLASS_LABELS not in in_docker_cmd_list:
            CMRunnerUtils.delete_cmd_argument(
                in_docker_cmd_list, ArgumentsOptions.CLASS_LABELS_FILE)
            in_docker_cmd_list.append(ArgumentsOptions.CLASS_LABELS)
            for label in options.class_labels:
                in_docker_cmd_list.append(label)

        CMRunnerUtils.replace_cmd_argument_value(in_docker_cmd_list,
                                                 ArgumentsOptions.CODE_DIR,
                                                 in_docker_model)
        CMRunnerUtils.replace_cmd_argument_value(in_docker_cmd_list, "-cd",
                                                 in_docker_model)
        CMRunnerUtils.replace_cmd_argument_value(in_docker_cmd_list,
                                                 ArgumentsOptions.INPUT,
                                                 in_docker_input_file)
        CMRunnerUtils.replace_cmd_argument_value(in_docker_cmd_list,
                                                 ArgumentsOptions.OUTPUT,
                                                 in_docker_output_file)

        if run_mode == RunMode.SERVER:
            host_port_list = options.address.split(":", 1)
            if len(host_port_list) == 1:
                raise DrumCommonException(
                    "Error: when using the docker option provide argument --server host:port"
                )
            port = int(host_port_list[1])
            host_port_inside_docker = "{}:{}".format("0.0.0.0", port)
            CMRunnerUtils.replace_cmd_argument_value(in_docker_cmd_list,
                                                     ArgumentsOptions.ADDRESS,
                                                     host_port_inside_docker)
            docker_cmd_args += " -p {port}:{port}".format(port=port)

        if run_mode in [
                RunMode.SCORE, RunMode.PERF_TEST, RunMode.VALIDATION,
                RunMode.FIT
        ]:
            docker_cmd_args += ' -v "{}":{}'.format(options.input,
                                                    in_docker_input_file)

            if run_mode == RunMode.SCORE and options.output:
                output_file = os.path.realpath(options.output)
                if not os.path.exists(output_file):
                    # Create an empty file so the mount command mounts the file correctly -
                    # otherwise docker creates an empty directory
                    open(output_file, "a").close()
                docker_cmd_args += ' -v "{}":{}'.format(
                    output_file, in_docker_output_file)
                CMRunnerUtils.replace_cmd_argument_value(
                    in_docker_cmd_list, ArgumentsOptions.OUTPUT,
                    in_docker_output_file)
            elif run_mode == RunMode.FIT:
                if options.output:
                    fit_output_dir = os.path.realpath(options.output)
                    docker_cmd_args += ' -v "{}":{}'.format(
                        fit_output_dir, in_docker_fit_output_dir)
                CMRunnerUtils.replace_cmd_argument_value(
                    in_docker_cmd_list, ArgumentsOptions.OUTPUT,
                    in_docker_fit_output_dir)
                if options.target_csv:
                    fit_target_filename = os.path.realpath(options.target_csv)
                    docker_cmd_args += ' -v "{}":{}'.format(
                        fit_target_filename, in_docker_fit_target_filename)
                    CMRunnerUtils.replace_cmd_argument_value(
                        in_docker_cmd_list,
                        ArgumentsOptions.TARGET_CSV,
                        in_docker_fit_target_filename,
                    )
                if options.row_weights_csv:
                    fit_row_weights_filename = os.path.realpath(
                        options.row_weights_csv)
                    docker_cmd_args += ' -v "{}":{}'.format(
                        fit_row_weights_filename,
                        in_docker_fit_row_weights_filename)
                    CMRunnerUtils.replace_cmd_argument_value(
                        in_docker_cmd_list,
                        ArgumentsOptions.WEIGHTS_CSV,
                        in_docker_fit_row_weights_filename,
                    )

        docker_cmd += " {} {} {}".format(docker_cmd_args, options.docker,
                                         " ".join(in_docker_cmd_list))

        self._print_verbose("docker command: [{}]".format(docker_cmd))
        return docker_cmd
Example 12
    def configure(self, params):
        super(JavaPredictor, self).configure(params)

        ## retrieve the relevant extensions of the java predictor;
        ## changed significantly from the last version due to associating
        ## jars with dr codegen AND h2o dai mojo pipeline
        self.custom_model_path = params["__custom_model_path__"]
        files_list = sorted(os.listdir(self.custom_model_path))
        files_list_str = " | ".join(files_list)
        self.logger.debug(
            "files in custom model path: {}".format(files_list_str))
        reg_exp = r"|".join(r"(\{})".format(ext) for ext in JavaArtifacts.ALL)
        ext_re = re.findall(reg_exp, files_list_str)
        ext_re = [[match for match in matches if match != ""]
                  for matches in ext_re]
        ext_re = list(chain.from_iterable(ext_re))

        if len(ext_re) == 0:
            raise DrumCommonException(
                "\n\n{}\n"
                "Could not find model artifact file in: {} supported by default predictors.\n"
                "They support filenames with the following extensions {}.\n"
                "List of retrieved files are: {}".format(
                    RUNNING_LANG_MSG, self.custom_model_path,
                    JavaArtifacts.ALL, files_list_str))
        self.logger.debug("relevant artifact extensions {}".format(
            ", ".join(ext_re)))

        if ".mojo" in ext_re:
            ## check for license
            license_location = os.path.join(params["__custom_model_path__"],
                                            "license.sig")
            self.logger.debug("license location: {}".format(license_location))
            try:
                os.environ["DRIVERLESS_AI_LICENSE_FILE"]
            except KeyError:
                try:
                    os.environ["DRIVERLESS_AI_LICENSE_KEY"]
                except KeyError:
                    if not os.path.exists(license_location):
                        raise DrumCommonException(
                            "Cannot find license file for DAI Mojo Pipeline.\n"
                            "Make sure you have done one of the following:\n"
                            "\t* provided license.sig file in the artifacts\n"
                            "\t* set the environment variable DRIVERLESS_AI_LICENSE_FILE : A location of file with a license\n"
                            "\t* set the environment variable DRIVERLESS_AI_LICENSE_KEY : A license key"
                        )
                    else:
                        os.environ[
                            "DRIVERLESS_AI_LICENSE_FILE"] = license_location
            self.model_artifact_extension = ".mojo"
        else:
            self.model_artifact_extension = ext_re[0]

        self.logger.debug("model artifact extension: {}".format(
            self.model_artifact_extension))

        ## only needed to add mojo runtime jars
        additional_jars = (None if self.model_artifact_extension != ".mojo"
                           else glob.glob(
                               os.path.join(self.custom_model_path, "*.jar")))

        ## the mojo runtime jars must be added to self._jar_files to be passed to the gateway
        if additional_jars:
            self._jar_files.extend(additional_jars)

        self._init_py4j_and_load_predictor()

        m = self._gateway.jvm.java.util.HashMap()
        for key in params.keys():
            if isinstance(params[key], dict):
                continue
            elif isinstance(params[key], list):
                pylist = params[key]
                jarray = self._gateway.new_array(
                    self._gateway.jvm.java.lang.String, len(pylist))
                for i, val in enumerate(pylist):
                    jarray[i] = str(val)
                m[key] = jarray
            else:
                m[key] = params[key]
        self._predictor_via_py4j.configure(m)
Example 13
    def check_prediction_side_effects(self):
        rtol = 2e-02
        atol = 1e-06
        input_extension = os.path.splitext(self.options.input)
        is_sparse = input_extension[1] == ".mtx"

        if is_sparse:
            columns = [
                column.strip() for column in open(
                    self.options.sparse_column_file).readlines()
            ]
            df = pd.DataFrame.sparse.from_spmatrix(mmread(self.options.input),
                                                   columns=columns)
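            # sample 10% of the rows, clamped to the range [10, 1000]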
            samplesize = min(1000, max(int(len(df) * 0.1), 10))
            data_subset = df.sample(n=samplesize, random_state=42)
            subset_payload, colnames = make_mtx_payload(data_subset)
            subset_payload = ("X.mtx", subset_payload)
            files = {
                "X":
                subset_payload,
                SPARSE_COLNAMES: (
                    SPARSE_COLNAMES,
                    colnames,
                    PredictionServerMimetypes.APPLICATION_OCTET_STREAM,
                ),
            }
        else:
            df = pd.read_csv(self.options.input)
            samplesize = min(1000, max(int(len(df) * 0.1), 10))
            data_subset = df.sample(n=samplesize, random_state=42)
            subset_payload = make_csv_payload(data_subset)
            files = {"X": subset_payload}

        labels = self.resolve_labels(self.target_type, self.options)

        with DrumServerRun(self.target_type.value,
                           labels,
                           self.options.code_dir,
                           verbose=self._verbose) as run:
            endpoint = "/predict/"
            payload = {"X": open(self.options.input)}
            if is_sparse:
                payload.update({
                    SPARSE_COLNAMES: (
                        SPARSE_COLNAMES,
                        open(self.options.sparse_column_file),
                        PredictionServerMimetypes.APPLICATION_OCTET_STREAM,
                    )
                })

            response_full = requests.post(run.url_server_address + endpoint,
                                          files=payload)
            if not response_full.ok:
                raise DrumCommonException("Failure in {} server: {}".format(
                    endpoint[1:-1], response_full.text))

            response_sample = requests.post(run.url_server_address + endpoint,
                                            files=files)
            if not response_sample.ok:
                raise DrumCommonException("Failure in {} server: {}".format(
                    endpoint[1:-1], response_sample.text))

            preds_full = pd.DataFrame(
                json.loads(response_full.text)[RESPONSE_PREDICTIONS_KEY])
            preds_sample = pd.DataFrame(
                json.loads(response_sample.text)[RESPONSE_PREDICTIONS_KEY])

            preds_full_subset = preds_full.iloc[data_subset.index]

            if self._schema_validator:
                # Validate that the predictions are of the type and shape the user specified in the schema
                self._schema_validator.validate_outputs(preds_sample)

            matches = np.isclose(preds_full_subset,
                                 preds_sample,
                                 rtol=rtol,
                                 atol=atol)
            if not np.all(matches):
                if is_sparse:
                    _, __tempfile_sample = mkstemp(suffix=".mtx")
                    sparse_mat = vstack(x[0] for x in data_subset.values)
                    mmwrite(__tempfile_sample, sparse_mat.sparse.to_coo())
                else:
                    _, __tempfile_sample = mkstemp(suffix=".csv")
                    data_subset.to_csv(__tempfile_sample, index=False)

                message = """
                            Warning: Your predictions were different when we tried to predict twice.
                            The last 10 predictions from the main predict run were: {}
                            However when we reran predictions on the same data, we got: {}.
                            The sample used to calculate prediction reruns can be found in this file: {}""".format(
                    preds_full_subset[~matches][:10].to_string(index=False),
                    preds_sample[~matches][:10].to_string(index=False),
                    __tempfile_sample,
                )
                raise DrumPredException(message)
Example 14
def _read_structured_input(filename):
    try:
        df = pd.read_csv(filename)
    except pd.errors.ParserError as e:
        raise DrumCommonException(
            "Pandas failed to read input csv file: {}; error: {}".format(filename, e))
    return df
Example 15
    def _check_artifacts_and_get_run_language(self):
        lang = getattr(self.options, "language", None)
        if lang:
            return RunLanguage(self.options.language)

        code_dir_abspath = os.path.abspath(self.options.code_dir)

        artifact_language = None
        custom_language = None
        # check which artifacts present in the code dir
        python_artifacts = CMRunnerUtils.find_files_by_extensions(
            code_dir_abspath, PythonArtifacts.ALL)
        r_artifacts = CMRunnerUtils.find_files_by_extensions(
            code_dir_abspath, RArtifacts.ALL)

        java_artifacts = CMRunnerUtils.find_files_by_extensions(
            code_dir_abspath, JavaArtifacts.ALL)

        # check which custom code files present in the code dir
        is_custom_py = CMRunnerUtils.filename_exists_and_is_file(
            code_dir_abspath, "custom.py")
        is_custom_r = CMRunnerUtils.filename_exists_and_is_file(
            code_dir_abspath,
            "custom.R") or CMRunnerUtils.filename_exists_and_is_file(
                code_dir_abspath, "custom.r")

        # if all the artifacts belong to the same language, set it
        if bool(len(python_artifacts)) + bool(len(r_artifacts)) + bool(
                len(java_artifacts)) == 1:
            if len(python_artifacts):
                artifact_language = RunLanguage.PYTHON
            elif len(r_artifacts):
                artifact_language = RunLanguage.R
            elif len(java_artifacts):
                artifact_language = RunLanguage.JAVA

        # if only one custom file found, set it:
        if is_custom_py + is_custom_r == 1:
            custom_language = RunLanguage.PYTHON if is_custom_py else RunLanguage.R

        # if both language values are None, or both are not None and not equal
        if (bool(custom_language) + bool(artifact_language) == 0
                or bool(custom_language) + bool(artifact_language) == 2
                and custom_language != artifact_language):
            artifact_language = "None" if artifact_language is None else artifact_language.value
            custom_language = "None" if custom_language is None else custom_language.value
            error_mes = (
                "Cannot detect language from artifacts and/or custom.py/R files.\n"
                "Detected: language by artifacts - {}; language by custom files - {}.\n"
                "Code directory must have one or more model artifacts belonging to the same language:\n"
                "Python/R/Java, with an extension:\n"
                "Python models: {}\n"
                "R models: {}\n"
                "Java models: {}.\n"
                "Or one of custom.py/R files.".format(
                    artifact_language,
                    custom_language,
                    PythonArtifacts.ALL,
                    RArtifacts.ALL,
                    JavaArtifacts.ALL,
                ))
            all_files_message = "\n\nFiles(100 first) found in {}:\n{}\n".format(
                code_dir_abspath,
                "\n".join(sorted(os.listdir(code_dir_abspath))[0:100]))

            error_mes += all_files_message
            self.logger.error(error_mes)
            raise DrumCommonException(error_mes)

        run_language = custom_language if custom_language is not None else artifact_language
        return run_language
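The ambiguity test above leans on operator precedence ('and' binds tighter than 'or'), so it reads as: (both signals missing) or (both present and disagreeing). A small check of the first error path, with both signals absent:

custom_language = artifact_language = None

ambiguous = (bool(custom_language) + bool(artifact_language) == 0
             or bool(custom_language) + bool(artifact_language) == 2
             and custom_language != artifact_language)
assert ambiguous  # no artifacts and no custom.py/R found -> error is raised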
Example 16
    def _resolve_class_labels(self):
        if self.run_mode == RunMode.NEW or (
                self.run_mode == RunMode.PUSH and
                self.options.model_config[ModelMetadataKeys.TYPE] == "training"):
            self.options.positive_class_label = None
            self.options.negative_class_label = None
            self.options.class_labels = None
            self.options.class_labels_file = None
            return

        if self.target_type == TargetType.BINARY:
            pos_options = getattr(self.options, "positive_class_label", None)
            neg_options = getattr(self.options, "negative_class_label", None)

            try:
                pos_model_config = self.options.model_config.get(
                    ModelMetadataKeys.INFERENCE_MODEL).get(
                        "positiveClassLabel")
                neg_model_config = self.options.model_config.get(
                    ModelMetadataKeys.INFERENCE_MODEL).get(
                        "negativeClassLabel")
            except AttributeError:
                pos_model_config = neg_model_config = None

            if (not all([pos_options, neg_options])
                    and not all([pos_model_config, neg_model_config])
                    and self.run_mode != RunMode.FIT):
                raise DrumCommonException(
                    "Positive/negative class labels are missing. They must be provided with either one: {}/{} arguments, environment variables, model config file."
                    .format(ArgumentsOptions.POSITIVE_CLASS_LABEL,
                            ArgumentsOptions.NEGATIVE_CLASS_LABEL))
            elif all(
                [pos_options, neg_options, pos_model_config, neg_model_config
                 ]) and (pos_options != pos_model_config
                         or neg_options != neg_model_config):
                raise DrumCommonException(
                    "Positive/negative class labels provided with command arguments or environment variable don't match values from model config file. "
                    "Use either one of them or make them match.")
            else:
                self.options.positive_class_label = (pos_options
                                                     if pos_options is not None
                                                     else pos_model_config)

                self.options.negative_class_label = (neg_options
                                                     if neg_options is not None
                                                     else neg_model_config)

        elif self.target_type == TargetType.MULTICLASS:
            labels_options = getattr(self.options, "class_labels", None)
            try:
                labels_model_config = self.options.model_config.get(
                    ModelMetadataKeys.INFERENCE_MODEL).get("classLabels")
            except AttributeError:
                labels_model_config = None

            if (labels_options is None and labels_model_config is None
                    and self.run_mode != RunMode.FIT):
                raise DrumCommonException(
                    "Class labels are missing. They must be provided with either one: {}/{} arguments, environment variables, model config file."
                    .format(ArgumentsOptions.CLASS_LABELS,
                            ArgumentsOptions.CLASS_LABELS_FILE))
            # both provided, but not equal as sets
            elif all([labels_options, labels_model_config
                      ]) and set(labels_options) != set(labels_model_config):
                raise DrumCommonException(
                    "Class labels provided with command arguments or environment variable don't match values from model config file. "
                    "Use either one of them or make them match.")
            else:
                self.options.class_labels = (labels_options
                                             if labels_options is not None else
                                             labels_model_config)
        else:
            self.options.positive_class_label = None
            self.options.negative_class_label = None
            self.options.class_labels = None
            self.options.class_labels_file = None
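
The precedence rule applied to each label source above reduces to a few lines. A minimal sketch, assuming just the two sources seen in the method (the explicit CLI/environment value and the model config value); resolve_label is a hypothetical helper, not part of drum:

def resolve_label(option_value, config_value):
    # If both sources are set, they must agree; otherwise the one that is set wins.
    if option_value is not None and config_value is not None and option_value != config_value:
        raise ValueError("label from arguments/env doesn't match model config")
    return option_value if option_value is not None else config_value

# resolve_label("yes", None)   -> "yes"
# resolve_label(None, "no")    -> "no"
# resolve_label("yes", "no")   -> ValueError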
Example n. 17
0
import logging
import os

from datarobot_drum.drum.common import LOGGER_NAME_PREFIX
from datarobot_drum.drum.exceptions import DrumCommonException
from datarobot_drum.drum.language_predictors.base_language_predictor import BaseLanguagePredictor

logger = logging.getLogger(LOGGER_NAME_PREFIX + "." + __name__)

try:
    import rpy2.robjects as ro
    from rpy2.robjects import pandas2ri, StrVector
    from rpy2.robjects.conversion import localconverter

except ImportError:
    error_message = (
        "rpy2 package is not installed. "
        "Install datarobot-drum using 'pip install datarobot-drum[R]'. "
        "Available for Python >= 3.6.")
    logger.error(error_message)
    raise DrumCommonException(error_message)

pandas2ri.activate()
CUR_DIR = os.path.dirname(os.path.abspath(__file__))
R_SCORE_PATH = os.path.join(CUR_DIR, "score.R")
R_COMMON_PATH = os.path.abspath(
    os.path.join(
        CUR_DIR,
        "..",
        "r_common_code",
        "common.R",
    ))

r_handler = ro.r
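
With the handles above in place, scoring typically means sourcing the R files and calling into them through rpy2. A minimal sketch, assuming a hypothetical model_predict function defined in score.R (drum's real entry-point name may differ):

r_handler.source(R_COMMON_PATH)  # load shared R helpers
r_handler.source(R_SCORE_PATH)   # load the scoring entry points

def score_with_r(df):
    # df is a pandas.DataFrame; convert it to an R data.frame,
    # call the R function, and convert the result back.
    with localconverter(ro.default_converter + pandas2ri.converter):
        r_df = ro.conversion.py2rpy(df)
        predictions = r_handler.model_predict(r_df)  # hypothetical R function
        return ro.conversion.rpy2py(predictions)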

Example n. 18
0
    def _prepare_fit_pipeline(self, run_language):

        if self.target_type.value in TargetType.CLASSIFICATION.value and (
                self.options.negative_class_label is None
                or self.options.class_labels is None):
            # No class label information was supplied, but we may be able to infer the labels
            possible_class_labels = possibly_intuit_order(
                self.options.input,
                self.options.target_csv,
                self.options.target,
                self.target_type == TargetType.ANOMALY,
            )
            if possible_class_labels is not None:
                if self.target_type == TargetType.BINARY:
                    if len(possible_class_labels) != 2:
                        raise DrumCommonException(
                            "Target type {} requires exactly 2 class labels. Detected {}: {}"
                            .format(TargetType.BINARY,
                                    len(possible_class_labels),
                                    possible_class_labels))
                    (
                        self.options.positive_class_label,
                        self.options.negative_class_label,
                    ) = possible_class_labels
                elif self.target_type == TargetType.MULTICLASS:
                    if len(possible_class_labels) < 2:
                        raise DrumCommonException(
                            "Target type {} requires more than 2 class labels. Detected {}: {}"
                            .format(
                                TargetType.MULTICLASS,
                                len(possible_class_labels),
                                possible_class_labels,
                            ))
                    self.options.class_labels = list(possible_class_labels)
            else:
                raise DrumCommonException(
                    "Target type {} requires class label information. No labels were supplied and "
                    "labels could not be inferred from the target.".format(
                        self.target_type.value))

        options = self.options
        # the functional pipeline is the predictor pipeline;
        # it differs slightly between batch and server predictions.
        functional_pipeline_name = self._functional_pipelines[(self.run_mode,
                                                               run_language)]
        functional_pipeline_filepath = CMRunnerUtils.get_pipeline_filepath(
            functional_pipeline_name)
        # fields to replace in the functional pipeline (predictor)
        replace_data = {
            "customModelPath": os.path.abspath(options.code_dir),
            "input_filename": options.input,
            "weights": options.row_weights,
            "weights_filename": options.row_weights_csv,
            "target_column": options.target,
            "target_filename": options.target_csv,
            "positiveClassLabel": options.positive_class_label,
            "negativeClassLabel": options.negative_class_label,
            "classLabels": options.class_labels,
            "output_dir": options.output,
            "num_rows": options.num_rows,
            "sparse_column_file": options.sparse_column_file,
        }

        functional_pipeline_str = CMRunnerUtils.render_file(
            functional_pipeline_filepath, replace_data)
        return functional_pipeline_str
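
The rendering step itself is just placeholder substitution into the pipeline template. A minimal stand-in for illustration (the real CMRunnerUtils.render_file and drum's template syntax may differ; the {{name}} placeholder style below is an assumption):

def render_template(template_text, replace_data):
    # Substitute each field into the template; None values render as "None".
    for key, value in replace_data.items():
        template_text = template_text.replace("{{%s}}" % key, str(value))
    return template_text

# render_template('{"path": "{{customModelPath}}"}', {"customModelPath": "/opt/model"})
# -> '{"path": "/opt/model"}'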

    def transform(self, **kwargs):
        raise DrumCommonException("Transform feature is not supported for R")

    def _materialize(self, parent_data_objs, user_data):
        model_api = base_api_blueprint()

        @model_api.route("/health/", methods=["GET"])
        def health():
            return {"message": "OK"}, HTTP_200_OK

        @model_api.route("/predict/", methods=["POST"])
        def predict():
            logger.debug("Entering predict() endpoint")

            self._stats_collector.enable()
            self._stats_collector.mark("start")

            try:
                response, response_status = self.do_predict(logger=logger)
            finally:
                self._stats_collector.mark("finish")
                self._stats_collector.disable()
            return response, response_status

        @model_api.route("/predictUnstructured/", methods=["POST"])
        def predict_unstructured():
            logger.debug("Entering predict() endpoint")

            self._stats_collector.enable()
            self._stats_collector.mark("start")

            try:
                response, response_status = self.do_predict_unstructured(
                    logger=logger)
            finally:
                self._stats_collector.mark("finish")
                self._stats_collector.disable()
            return response, response_status

        @model_api.route("/stats/", methods=["GET"])
        def stats():
            mem_info = self._memory_monitor.collect_memory_info()
            ret_dict = {"mem_info": mem_info._asdict()}

            self._stats_collector.round()
            ret_dict["time_info"] = {}
            for name in self._stats_collector.get_report_names():
                d = self._stats_collector.dict_report(name)
                ret_dict["time_info"][name] = d
            self._stats_collector.stats_reset()
            return ret_dict, HTTP_200_OK

        @model_api.errorhandler(Exception)
        def handle_exception(e):
            logger.exception(e)
            return {
                "message": "ERROR: {}".format(e)
            }, HTTP_500_INTERNAL_SERVER_ERROR

        app = get_flask_app(model_api)
        logging.getLogger("werkzeug").setLevel(logger.getEffectiveLevel())

        host = self._params.get("host", None)
        port = self._params.get("port", None)
        try:
            app.run(host, port, threaded=False)
        except OSError as e:
            raise DrumCommonException("{}: host: {}; port: {}".format(
                e, host, port))

        if self._stats_collector:
            self._stats_collector.print_reports()

        return []
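
Stripped of the stats and memory plumbing, the server wiring above reduces to a blueprint registered on a Flask app. A minimal sketch using plain Flask (base_api_blueprint and get_flask_app are drum helpers; the bare Blueprint/Flask below merely stand in for them):

from flask import Blueprint, Flask

def make_app(predict_fn):
    api = Blueprint("model_api", __name__)

    @api.route("/health/", methods=["GET"])
    def health():
        return {"message": "OK"}, 200

    @api.route("/predict/", methods=["POST"])
    def predict():
        # predict_fn returns (response, status), like do_predict above
        response, status = predict_fn()
        return response, status

    app = Flask(__name__)
    app.register_blueprint(api)
    return app

# make_app(lambda: ({"predictions": []}, 200)).run("0.0.0.0", 8080, threaded=False)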