Example #1
0
    def _prepare_fit_pipeline(self, run_language):

        if self.options.negative_class_label is None:
            (
                self.options.positive_class_label,
                self.options.negative_class_label,
            ) = possibly_intuit_order(
                self.options.input,
                self.options.target_csv,
                self.options.target,
                self.options.unsupervised,
            )
        if self.options.unsupervised:
            self._set_target_type(TargetType.ANOMALY)
        elif self.options.negative_class_label is not None:
            self._set_target_type(TargetType.BINARY)
        else:
            self._set_target_type(TargetType.REGRESSION)

        options = self.options
        # the functional pipeline is the predictor pipeline;
        # it differs slightly between batch and server predictions.
        functional_pipeline_name = self._functional_pipelines[(self.run_mode,
                                                               run_language)]
        functional_pipeline_filepath = CMRunnerUtils.get_pipeline_filepath(
            functional_pipeline_name)
        # fields to replace in the functional pipeline (predictor)

        replace_data = {
            "customModelPath": os.path.abspath(options.code_dir),
            "input_filename": options.input,
            "weights": '"{}"'.format(options.row_weights) if options.row_weights else "null",
            "weights_filename": '"{}"'.format(options.row_weights_csv)
            if options.row_weights_csv
            else "null",
            "target_column": '"{}"'.format(options.target) if options.target else "null",
            "target_filename": '"{}"'.format(options.target_csv) if options.target_csv else "null",
            "positiveClassLabel": '"{}"'.format(options.positive_class_label)
            if options.positive_class_label is not None
            else "null",
            "negativeClassLabel": '"{}"'.format(options.negative_class_label)
            if options.negative_class_label is not None
            else "null",
            "output_dir": options.output,
            "num_rows": options.num_rows,
        }

        functional_pipeline_str = CMRunnerUtils.render_file(
            functional_pipeline_filepath, replace_data)
        return functional_pipeline_str
    def _prepare_prediction_server_or_batch_pipeline(self, run_language):
        options = self.options
        # the functional pipeline is the predictor pipeline;
        # it differs slightly between batch and server predictions.
        functional_pipeline_name = self._functional_pipelines[(self.run_mode,
                                                               run_language)]
        functional_pipeline_filepath = CMRunnerUtils.get_pipeline_filepath(
            functional_pipeline_name)
        # fields to replace in the functional pipeline (predictor)
        replace_data = {
            "positiveClassLabel": '"{}"'.format(options.positive_class_label)
            if options.positive_class_label
            else "null",
            "negativeClassLabel": '"{}"'.format(options.negative_class_label)
            if options.negative_class_label
            else "null",
            "customModelPath": os.path.abspath(options.code_dir),
        }

        if self.run_mode == RunMode.SCORE:
            replace_data.update(
                {
                    "input_filename": options.input,
                    "output_filename": '"{}"'.format(options.output) if options.output else "null",
                }
            )

        functional_pipeline_str = CMRunnerUtils.render_file(
            functional_pipeline_filepath, replace_data)
        ret_pipeline = functional_pipeline_str

        if self.run_mode == RunMode.SERVER:
            with open(
                    CMRunnerUtils.get_pipeline_filepath(
                        EXTERNAL_SERVER_RUNNER), "r") as f:
                runner_pipeline_json = json.load(f)
                # can not use template for pipeline as quotes won't be escaped
                args = runner_pipeline_json["pipe"][0]["arguments"]
                # in server mode, predictor pipeline is passed to server as param
                args["pipeline"] = functional_pipeline_str
                args["repo"] = CMRunnerUtils.get_components_repo()
                host_port_list = options.address.split(":", 1)
                args["host"] = host_port_list[0]
                args["port"] = int(
                    host_port_list[1]) if len(host_port_list) == 2 else None
                args["threaded"] = options.threaded
                args["show_perf"] = options.show_perf
                ret_pipeline = json.dumps(runner_pipeline_json)
        return ret_pipeline
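
A note on the quoting convention in the replace_data dictionaries above: every substituted value has to land inside a JSON pipeline document, so string values are wrapped in explicit double quotes and missing values become the literal "null". A minimal sketch of just that convention, using a made-up one-field template (the real placeholder syntax used by CMRunnerUtils.render_file is not shown here):

    import json

    # hypothetical one-field template; only the quote-or-null convention is illustrated
    template = '{"arguments": {"output_filename": __OUTPUT__}}'

    output = None  # e.g. options.output when no output file was requested
    value = '"{}"'.format(output) if output else "null"

    rendered = template.replace("__OUTPUT__", value)
    print(json.loads(rendered))  # {'arguments': {'output_filename': None}}
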
Example #3
0
    def __init__(
        self,
        labels,
        custom_model_dir,
        docker=None,
        with_error_server=False,
        show_stacktrace=True,
        nginx=False,
    ):
        port = CMRunnerUtils.find_free_port()
        server_address = "localhost:{}".format(port)
        url_host = os.environ.get("TEST_URL_HOST", "localhost")
        if docker:
            self.url_server_address = "http://{}:{}".format(url_host, port)
        else:
            self.url_server_address = "http://localhost:{}".format(port)

        cmd = "{} server --code-dir {} --address {}".format(
            ArgumentsOptions.MAIN_COMMAND, custom_model_dir, server_address
        )
        if labels:
            cmd = _cmd_add_class_labels(cmd, labels)
        if docker:
            cmd += " --docker {}".format(docker)
        if with_error_server:
            cmd += " --with-error-server"
        if show_stacktrace:
            cmd += " --show-stacktrace"
        if nginx:
            cmd += " --production"
        self._cmd = cmd

        self._process_object_holder = DrumServerProcess()
        self._server_thread = None
        self._with_nginx = nginx
    def performance_test(self):
        self._print_perf_test_params()
        self._prepare_test_cases()

        _kill_drum_perf_test_server_process(
            _find_drum_perf_test_server_process(), self.options.verbose
        )
        if CMRunnerUtils.is_port_in_use(self._server_addr, self._server_port):
            error_message = "\nError: address: {} is in use".format(self._url_server_address)
            print(error_message)
            raise DrumCommonException(error_message)

        self._start_drum_server()
        self._init_signals()

        print("\n\n")
        results = self._run_all_test_cases()
        self._reset_signals()
        self._stop_drum_server()
        in_docker = self.options.docker is not None
        str_report = PerfTestResultsFormatter(
            results, in_docker=in_docker, show_inside_server=self.options.in_server
        ).get_tbl_str()

        print("\n" + str_report)
        return
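
For orientation, the command assembled by the server fixture constructor at the top of this example ends up looking roughly like this (a hypothetical rendering: it assumes ArgumentsOptions.MAIN_COMMAND resolves to "drum", that _cmd_add_class_labels emits --positive-class-label / --negative-class-label flags, and a made-up free port):

    drum server --code-dir /tmp/custom_model --address localhost:6789 \
        --positive-class-label yes --negative-class-label no \
        --with-error-server --show-stacktrace
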
Example #5
0
    def _prepare_prediction_server_or_batch_pipeline(self, run_language):
        options = self.options
        functional_pipeline_name = (
            PREDICTION_SERVER_PIPELINE if self.run_mode == RunMode.SERVER else PREDICTOR_PIPELINE
        )
        functional_pipeline_filepath = CMRunnerUtils.get_pipeline_filepath(functional_pipeline_name)

        # fields to replace in the pipeline
        replace_data = {
            "positiveClassLabel": '"{}"'.format(options.positive_class_label)
            if options.positive_class_label
            else "null",
            "negativeClassLabel": '"{}"'.format(options.negative_class_label)
            if options.negative_class_label
            else "null",
            "customModelPath": os.path.abspath(options.code_dir),
            "run_language": run_language.value,
        }
        if self.run_mode == RunMode.SCORE:
            replace_data.update(
                {
                    "input_filename": options.input,
                    "output_filename": '"{}"'.format(options.output) if options.output else "null",
                }
            )
        else:
            host_port_list = options.address.split(":", 1)
            host = host_port_list[0]
            port = int(host_port_list[1]) if len(host_port_list) == 2 else None
            replace_data.update(
                {
                    "host": host,
                    "port": port,
                    "threaded": str(options.threaded).lower(),
                    "show_perf": str(options.show_perf).lower(),
                }
            )

        functional_pipeline_str = CMRunnerUtils.render_file(
            functional_pipeline_filepath, replace_data
        )

        return functional_pipeline_str
Example #6
0
    def _basic_batch_prediction_check(self):
        test_name = "Basic batch prediction"
        test_passed = True
        failure_message = ""
        cmd_list = sys.argv

        TMP_DIR = "/tmp"
        DIR_PREFIX = "drum_validation_check_"

        output_dir = mkdtemp(prefix=DIR_PREFIX, dir=TMP_DIR)
        output_filename = os.path.join(output_dir, "output")

        CMRunnerUtils.replace_cmd_argument_value(cmd_list,
                                                 ArgumentsOptions.OUTPUT,
                                                 output_filename)

        p = subprocess.Popen(cmd_list, env=os.environ)
        retcode = p.wait()
        if retcode != 0:
            test_passed = False
            failure_message = "Test failed on provided dataset: {}".format(
                self._input_csv)

        return test_name, test_passed, failure_message
    def __init__(self, options, run_mode, target_type=None):
        self.options = options
        self.target_type = target_type
        self._verbose = self.options.verbose
        self._input_csv = self.options.input
        self._input_df = pd.read_csv(self._input_csv)

        self._server_addr = "localhost"
        self._server_port = CMRunnerUtils.find_free_port()
        self._url_server_address = "http://{}:{}".format(self._server_addr, self._server_port)
        self._shutdown_endpoint = "/shutdown/"
        self._predict_endpoint = "/predict/"
        self._stats_endpoint = "/stats/"
        self._timeout = 20
        self._server_process = None

        self._df_for_test = None
        self._test_cases_to_run = None
Example #8
0
    def __init__(self, options, run_mode):
        self.options = options
        self._input_csv = self.options.input

        self._server_addr = "localhost"
        self._server_port = CMRunnerUtils.find_free_port()
        self._url_server_address = "http://{}:{}".format(
            self._server_addr, self._server_port)
        self._shutdown_endpoint = "/shutdown/"
        self._predict_endpoint = "/predict/"
        self._stats_endpoint = "/stats/"
        self._timeout = 20
        self._server_process = None

        self._df_for_test = None
        self._test_cases_to_run = None
        if run_mode == RunMode.PERF_TEST:
            self._prepare_test_cases()
Example #9
0
    def performance_test(self):
        _find_and_kill_cmrun_server_process(self.options.verbose)
        if CMRunnerUtils.is_port_in_use(self._server_addr, self._server_port):
            error_message = "\nError: address: {} is in use".format(
                self._url_server_address)
            print(error_message)
            raise DrumCommonException(error_message)

        cmd_list = self._build_cmrun_cmd()
        self._server_process = subprocess.Popen(cmd_list, env=os.environ)
        self._wait_for_server_to_start()

        def signal_handler(sig, frame):
            print("\nCtrl+C pressed, aborting test")
            print("Sending shutdown to server")
            self._stop_server()
            os.system("tput init")
            sys.exit(0)

        def testcase_timeout(signum, frame):
            raise DrumPerfTestTimeout()

        signal.signal(signal.SIGINT, signal_handler)
        signal.signal(signal.SIGALRM, testcase_timeout)

        results = []
        print("\n\n")
        for tc in self._test_cases_to_run:
            signal.alarm(self.options.timeout)
            try:
                self._run_test_case(tc, results)
            except DrumPerfTestTimeout:
                print("... timed out ({}s)".format(self.options.timeout))
            except Exception as e:
                print("\n...test case failed with a message: {}".format(e))

        self._stop_server()
        str_report = self._generate_table_report_adv(
            results, show_inside_server=self.options.in_server)
        print("\n" + str_report)
        return
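
The per-test-case timeout in performance_test above relies on SIGALRM. A minimal, self-contained sketch of the same pattern (POSIX-only; the names here are illustrative, not the project's API):

    import signal

    class TestCaseTimeout(Exception):
        pass

    def _on_alarm(signum, frame):
        raise TestCaseTimeout()

    signal.signal(signal.SIGALRM, _on_alarm)

    def run_with_timeout(func, timeout_sec):
        signal.alarm(timeout_sec)  # arm: SIGALRM fires after timeout_sec seconds
        try:
            return func()
        except TestCaseTimeout:
            print("... timed out ({}s)".format(timeout_sec))
        finally:
            signal.alarm(0)  # disarm so a pending alarm cannot fire later
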
Example #10
0
    def _prepare_prediction_server_or_batch_pipeline(self, run_language):
        options = self.options
        functional_pipeline_name = (SERVER_PIPELINE if self.run_mode
                                    == RunMode.SERVER else PREDICTOR_PIPELINE)
        functional_pipeline_filepath = CMRunnerUtils.get_pipeline_filepath(
            functional_pipeline_name)

        # fields to replace in the pipeline
        replace_data = {
            "positiveClassLabel": '"{}"'.format(options.positive_class_label)
            if options.positive_class_label
            else "null",
            "negativeClassLabel": '"{}"'.format(options.negative_class_label)
            if options.negative_class_label
            else "null",
            "customModelPath": os.path.abspath(options.code_dir),
            "run_language": run_language.value,
            "monitor": options.monitor,
            "model_id": options.model_id,
            "deployment_id": options.deployment_id,
            "monitor_settings": options.monitor_settings,
        }

        if self.run_mode == RunMode.SCORE:
            replace_data.update(
                {
                    "input_filename": options.input,
                    "output_filename": '"{}"'.format(options.output) if options.output else "null",
                }
            )
        else:
            host_port_list = options.address.split(":", 1)
            host = host_port_list[0]
            port = int(host_port_list[1]) if len(host_port_list) == 2 else None
            replace_data.update(
                {
                    "host": host,
                    "port": port,
                    "show_perf": str(options.show_perf).lower(),
                    "engine_type": "RestModelServing" if options.production else "Generic",
                    "component_type": "uwsgi_serving" if options.production else "prediction_server",
                    "uwsgi_max_workers": options.max_workers
                    if getattr(options, "max_workers")
                    else "null",
                }
            )

        functional_pipeline_str = CMRunnerUtils.render_file(
            functional_pipeline_filepath, replace_data)

        if self.run_mode == RunMode.SERVER:
            if options.production:
                pipeline_json = json.loads(functional_pipeline_str)
                # Because of tech debt in MLPiper which requires that the modelFileSourcePath key
                # be filled with something, we're putting in a dummy file path here
                if json_fields.PIPELINE_SYSTEM_CONFIG_FIELD not in pipeline_json:
                    system_config = {"modelFileSourcePath": os.path.abspath(__file__)}
                    pipeline_json[json_fields.PIPELINE_SYSTEM_CONFIG_FIELD] = system_config
                functional_pipeline_str = json.dumps(pipeline_json)
        return functional_pipeline_str
    def validation_test(self):

        # TODO: create infrastructure to easily add more checks
        # NullValueImputationCheck
        test_name = "Null value imputation"
        ValidationTestResult = collections.namedtuple("ValidationTestResult", "filename retcode")

        cmd_list = sys.argv
        cmd_list[0] = ArgumentsOptions.MAIN_COMMAND
        cmd_list[1] = ArgumentsOptions.SCORE

        TMP_DIR = "/tmp"
        DIR_PREFIX = "drum_validation_checks_"

        null_datasets_dir = mkdtemp(prefix=DIR_PREFIX, dir=TMP_DIR)

        df = pd.read_csv(self._input_csv)
        # iterating over a DataFrame yields its column labels, so this collects the column names
        column_names = list(df.iloc[[0]])

        results = {}

        for column_name in column_names:
            with NamedTemporaryFile(
                mode="w",
                dir=null_datasets_dir,
                prefix="null_value_imputation_{}_".format(column_name),
                delete=False,
            ) as temp_f:
                temp_data_name = temp_f.name
                df_tmp = df.copy()
                df_tmp[column_name] = None
                df_tmp.to_csv(temp_data_name, index=False)
                CMRunnerUtils.replace_cmd_argument_value(
                    cmd_list, ArgumentsOptions.INPUT, temp_data_name
                )

                p = subprocess.Popen(cmd_list, env=os.environ)
                retcode = p.wait()
                if retcode != 0:
                    results[column_name] = ValidationTestResult(temp_data_name, retcode)

        table = Texttable()
        table.set_deco(Texttable.HEADER)

        try:
            terminal_size = shutil.get_terminal_size()
            table.set_max_width(terminal_size.columns)
        except Exception:
            pass

        header_names = ["Test case", "Status"]
        col_types = ["t", "t"]
        col_align = ["l", "l"]

        rows = []

        if len(results) == 0:
            rows.append(header_names)
            rows.append([test_name, "PASSED"])
            shutil.rmtree(null_datasets_dir)
        else:
            col_types.append("t")
            col_align.append("l")
            header_names.append("Details")
            rows.append(header_names)
            for test_result in results.values():
                if not test_result.retcode:
                    os.remove(test_result.filename)

            table2 = Texttable()
            table2.set_deco(Texttable.HEADER)

            message = (
                "The null value imputation check works by imputing each feature with NaN values. "
                "If the check fails for a feature, the test dataset is saved in {}/{}. "
                "Make sure to delete those folders if they take up too much space.".format(
                    TMP_DIR, DIR_PREFIX
                )
            )
            rows.append([test_name, "FAILED", message])

            header_names2 = ["Failed feature", "Dataset filename"]

            table2.set_cols_dtype(["t", "t"])
            table2.set_cols_align(["l", "l"])

            rows2 = [header_names2]

            for key, test_result in results.items():
                if test_result.retcode:
                    rows2.append([key, test_result.filename])

            table2.add_rows(rows2)
            table_res = table2.draw()
            rows.append(["", "", "\n{}".format(table_res)])

        table.set_cols_dtype(col_types)
        table.set_cols_align(col_align)
        table.add_rows(rows)
        tbl_report = table.draw()
        print("\n\nValidation checks results")
        print(tbl_report)
    def __init__(
        self,
        target_type,
        labels,
        custom_model_dir,
        docker=None,
        with_error_server=False,
        show_stacktrace=True,
        nginx=False,
        memory=None,
        fail_on_shutdown_error=True,
        pass_args_as_env_vars=False,
        verbose=True,
        append_cmd=None,
    ):
        port = CMRunnerUtils.find_free_port()
        self.server_address = "localhost:{}".format(port)
        url_host = os.environ.get("TEST_URL_HOST", "localhost")

        if docker:
            self.url_server_address = "http://{}:{}".format(url_host, port)
        else:
            self.url_server_address = "http://localhost:{}".format(port)

        cmd = "{} server".format(ArgumentsOptions.MAIN_COMMAND)

        if pass_args_as_env_vars:
            os.environ[ArgumentOptionsEnvVars.CODE_DIR] = str(custom_model_dir)
            os.environ[ArgumentOptionsEnvVars.TARGET_TYPE] = target_type
            os.environ[ArgumentOptionsEnvVars.ADDRESS] = self.server_address
        else:
            cmd += " --code-dir {} --target-type {} --address {}".format(
                custom_model_dir, target_type, self.server_address
            )

        if labels:
            cmd = _cmd_add_class_labels(
                cmd, labels, target_type=target_type, pass_args_as_env_vars=pass_args_as_env_vars
            )
        if docker:
            cmd += " --docker {}".format(docker)
            if memory:
                cmd += " --memory {}".format(memory)
        if with_error_server:
            if pass_args_as_env_vars:
                os.environ[ArgumentOptionsEnvVars.WITH_ERROR_SERVER] = "1"
            else:
                cmd += " --with-error-server"
        if show_stacktrace:
            if pass_args_as_env_vars:
                os.environ[ArgumentOptionsEnvVars.SHOW_STACKTRACE] = "1"
            else:
                cmd += " --show-stacktrace"
        if nginx:
            if pass_args_as_env_vars:
                os.environ[ArgumentOptionsEnvVars.PRODUCTION] = "1"
            else:
                cmd += " --production"

        if append_cmd is not None:
            cmd += " " + append_cmd

        self._cmd = cmd

        self._process_object_holder = DrumServerProcess()
        self._server_thread = None
        self._with_nginx = nginx
        self._fail_on_shutdown_error = fail_on_shutdown_error
        self._verbose = verbose
Example #13
0
    def _check_artifacts_and_get_run_language(self):
        lang = getattr(self.options, "language", None)
        if lang:
            return RunLanguage(self.options.language)

        code_dir_abspath = os.path.abspath(self.options.code_dir)

        artifact_language = None
        custom_language = None
        # check which artifacts present in the code dir
        python_artifacts = CMRunnerUtils.find_files_by_extensions(
            code_dir_abspath, PythonArtifacts.ALL)
        r_artifacts = CMRunnerUtils.find_files_by_extensions(
            code_dir_abspath, RArtifacts.ALL)

        java_artifacts = CMRunnerUtils.find_files_by_extensions(
            code_dir_abspath, JavaArtifacts.ALL)

        # check which custom code files present in the code dir
        is_custom_py = CMRunnerUtils.filename_exists_and_is_file(
            code_dir_abspath, "custom.py")
        is_custom_r = CMRunnerUtils.filename_exists_and_is_file(
            code_dir_abspath,
            "custom.R") or CMRunnerUtils.filename_exists_and_is_file(
                code_dir_abspath, "custom.r")

        # if all the artifacts belong to the same language, set it
        if bool(len(python_artifacts)) + bool(len(r_artifacts)) + bool(
                len(java_artifacts)) == 1:
            if len(python_artifacts):
                artifact_language = RunLanguage.PYTHON
            elif len(r_artifacts):
                artifact_language = RunLanguage.R
            elif len(java_artifacts):
                artifact_language = RunLanguage.JAVA

        # if only one custom file found, set it:
        if is_custom_py + is_custom_r == 1:
            custom_language = RunLanguage.PYTHON if is_custom_py else RunLanguage.R

        # if both language values are None, or both are not None and not equal
        if (bool(custom_language) + bool(artifact_language) == 0
                or (bool(custom_language) + bool(artifact_language) == 2
                    and custom_language != artifact_language)):
            artifact_language = "None" if artifact_language is None else artifact_language.value
            custom_language = "None" if custom_language is None else custom_language.value
            error_mes = (
                "Can not detect language by artifacts and/or custom.py/R files.\n"
                "Detected: language by artifacts - {}; language by custom - {}.\n"
                "Code directory must have one or more model artifacts belonging to the same language:\n"
                "Python/R/Java, with an extension:\n"
                "Python models: {}\n"
                "R models: {}\n"
                "Java models: {}.\n"
                "Or one of custom.py/R files.".format(
                    artifact_language,
                    custom_language,
                    PythonArtifacts.ALL,
                    RArtifacts.ALL,
                    JavaArtifacts.ALL,
                ))
            all_files_message = "\n\nFiles (first 100) found in {}:\n{}\n".format(
                code_dir_abspath,
                "\n".join(sorted(os.listdir(code_dir_abspath))[0:100]))

            error_mes += all_files_message
            self.logger.error(error_mes)
            raise DrumCommonException(error_mes)

        run_language = custom_language if custom_language is not None else artifact_language
        return run_language
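
The detection rule above reduces to: the artifacts vote for one language, custom.py / custom.R vote for one language, and the two votes must not contradict each other. A condensed sketch of just that rule (not the project's API):

    def detect_run_language(artifact_language, custom_language):
        # error when both votes are missing, or both are present but disagree
        both_missing = artifact_language is None and custom_language is None
        disagree = (
            artifact_language is not None
            and custom_language is not None
            and artifact_language != custom_language
        )
        if both_missing or disagree:
            raise ValueError("Can not detect language by artifacts and/or custom.py/R files.")
        return custom_language if custom_language is not None else artifact_language

    assert detect_run_language("python", None) == "python"
    assert detect_run_language(None, "r") == "r"
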
Example #14
0
    def _prepare_docker_command(self, options, run_mode, raw_arguments):
        """
        Building a docker command line for running the model inside the docker - this command line can
        be used by the user independently of drum.
        Parameters
        Returns: docker command line to run as a string
        """
        options.docker = self._maybe_build_image(options.docker)
        in_docker_model = "/opt/model"
        in_docker_input_file = "/opt/input.csv"
        in_docker_output_file = "/opt/output.csv"
        in_docker_fit_output_dir = "/opt/fit_output_dir"
        in_docker_fit_target_filename = "/opt/fit_target.csv"
        in_docker_fit_row_weights_filename = "/opt/fit_row_weights.csv"

        docker_cmd = "docker run --rm --interactive  --user $(id -u):$(id -g) "
        docker_cmd_args = " -v {}:{}".format(options.code_dir, in_docker_model)

        in_docker_cmd_list = raw_arguments
        in_docker_cmd_list[0] = ArgumentsOptions.MAIN_COMMAND
        in_docker_cmd_list[1] = run_mode.value

        CMRunnerUtils.delete_cmd_argument(in_docker_cmd_list,
                                          ArgumentsOptions.DOCKER)
        CMRunnerUtils.replace_cmd_argument_value(in_docker_cmd_list,
                                                 ArgumentsOptions.CODE_DIR,
                                                 in_docker_model)
        CMRunnerUtils.replace_cmd_argument_value(in_docker_cmd_list, "-cd",
                                                 in_docker_model)
        CMRunnerUtils.replace_cmd_argument_value(in_docker_cmd_list,
                                                 ArgumentsOptions.INPUT,
                                                 in_docker_input_file)
        CMRunnerUtils.replace_cmd_argument_value(in_docker_cmd_list,
                                                 ArgumentsOptions.OUTPUT,
                                                 in_docker_output_file)

        if run_mode == RunMode.SERVER:
            host_port_list = options.address.split(":", 1)
            if len(host_port_list) == 1:
                raise DrumCommonException(
                    "Error: when using the docker option provide argument --server host:port"
                )
            port = int(host_port_list[1])
            host_port_inside_docker = "{}:{}".format("0.0.0.0", port)
            CMRunnerUtils.replace_cmd_argument_value(in_docker_cmd_list,
                                                     ArgumentsOptions.ADDRESS,
                                                     host_port_inside_docker)
            docker_cmd_args += " -p {port}:{port}".format(port=port)

        if run_mode in [
                RunMode.SCORE, RunMode.PERF_TEST, RunMode.VALIDATION,
                RunMode.FIT
        ]:
            docker_cmd_args += " -v {}:{}".format(options.input,
                                                  in_docker_input_file)

            if run_mode == RunMode.SCORE and options.output:
                output_file = os.path.realpath(options.output)
                if not os.path.exists(output_file):
                    # Create an empty file so the mount command mounts the file correctly -
                    # otherwise docker creates an empty directory
                    open(output_file, "a").close()
                docker_cmd_args += " -v {}:{}".format(output_file,
                                                      in_docker_output_file)
                CMRunnerUtils.replace_cmd_argument_value(
                    in_docker_cmd_list, ArgumentsOptions.OUTPUT,
                    in_docker_output_file)
            elif run_mode == RunMode.FIT:
                if options.output:
                    fit_output_dir = os.path.realpath(options.output)
                    docker_cmd_args += " -v {}:{}".format(
                        fit_output_dir, in_docker_fit_output_dir)
                CMRunnerUtils.replace_cmd_argument_value(
                    in_docker_cmd_list, ArgumentsOptions.OUTPUT,
                    in_docker_fit_output_dir)
                if options.target_csv:
                    fit_target_filename = os.path.realpath(options.target_csv)
                    docker_cmd_args += " -v {}:{}".format(
                        fit_target_filename, in_docker_fit_target_filename)
                    CMRunnerUtils.replace_cmd_argument_value(
                        in_docker_cmd_list,
                        ArgumentsOptions.TARGET_FILENAME,
                        in_docker_fit_target_filename,
                    )
                if options.row_weights_csv:
                    fit_row_weights_filename = os.path.realpath(
                        options.row_weights_csv)
                    docker_cmd_args += " -v {}:{}".format(
                        fit_row_weights_filename,
                        in_docker_fit_row_weights_filename)
                    CMRunnerUtils.replace_cmd_argument_value(
                        in_docker_cmd_list,
                        ArgumentsOptions.WEIGHTS_CSV,
                        in_docker_fit_row_weights_filename,
                    )

        docker_cmd += " {} {} {}".format(docker_cmd_args, options.docker,
                                         " ".join(in_docker_cmd_list))

        self._print_verbose("docker command: [{}]".format(docker_cmd))
        return docker_cmd
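
To make the structure of the generated command concrete, here is a hypothetical rendering for a batch scoring run (the paths and image name are made up, and it assumes ArgumentsOptions.MAIN_COMMAND resolves to "drum"; spacing tidied):

    docker run --rm --interactive --user $(id -u):$(id -g) \
        -v /home/user/model:/opt/model -v /home/user/data.csv:/opt/input.csv \
        my-drum-image \
        drum score --code-dir /opt/model --input /opt/input.csv
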
Example #15
0
    def _run_fit_and_predictions_pipelines_in_mlpiper(self):
        if self.run_mode == RunMode.SERVER:
            run_language = self._check_artifacts_and_get_run_language()
            # in prediction server mode infra pipeline == prediction server runner pipeline
            infra_pipeline_str = self._prepare_prediction_server_or_batch_pipeline(
                run_language)
        elif self.run_mode == RunMode.SCORE:
            run_language = self._check_artifacts_and_get_run_language()
            tmp_output_filename = None
            # if output is not provided, output into tmp file and print
            if not self.options.output:
                # keep the object reference so the temp file is destroyed only at the end of the process
                __tmp_output_file = tempfile.NamedTemporaryFile(mode="w")
                self.options.output = tmp_output_filename = __tmp_output_file.name
            # in batch prediction mode infra pipeline == predictor pipeline
            infra_pipeline_str = self._prepare_prediction_server_or_batch_pipeline(
                run_language)
        elif self.run_mode == RunMode.FIT:
            run_language = self._get_fit_run_language()
            infra_pipeline_str = self._prepare_fit_pipeline(run_language)
        else:
            error_message = "{} mode is not supported here".format(
                self.run_mode)
            print(error_message)
            raise DrumCommonException(error_message)

        config = ExecutorConfig(
            pipeline=infra_pipeline_str,
            pipeline_file=None,
            run_locally=True,
            comp_root_path=CMRunnerUtils.get_components_repo(),
            mlpiper_jar=None,
            spark_jars=None,
        )

        _pipeline_executor = Executor(config).standalone(True).set_verbose(
            self.options.verbose)
        # assign logger with the name drum.mlpiper.Executor to mlpiper Executor
        _pipeline_executor.set_logger(
            logging.getLogger(LOGGER_NAME_PREFIX + "." +
                              _pipeline_executor.logger_name()))

        self.logger.info(">>> Start {} in the {} mode".format(
            ArgumentsOptions.MAIN_COMMAND, self.run_mode.value))
        sc = StatsCollector(disable_instance=(
            not hasattr(self.options, "show_perf")
            or not self.options.show_perf or self.run_mode == RunMode.SERVER))
        sc.register_report("Full time", "end", StatsOperation.SUB, "start")
        sc.register_report("Init time (incl model loading)", "init",
                           StatsOperation.SUB, "start")
        sc.register_report("Run time (incl reading CSV)", "run",
                           StatsOperation.SUB, "init")
        with verbose_stdout(self.options.verbose):
            sc.enable()
            try:
                sc.mark("start")

                _pipeline_executor.init_pipeline()
                self.runtime.initialization_succeeded = True
                sc.mark("init")

                _pipeline_executor.run_pipeline(cleanup=False)
                sc.mark("run")
            finally:
                _pipeline_executor.cleanup_pipeline()
                sc.mark("end")
                sc.disable()
        self.logger.info("<<< Finish {} in the {} mode".format(
            ArgumentsOptions.MAIN_COMMAND, self.run_mode.value))
        sc.print_reports()
        if self.run_mode == RunMode.SCORE:
            # print result if output is not provided
            if tmp_output_filename:
                print(pd.read_csv(tmp_output_filename))
Example #16
0
    def _prepare_fit_pipeline(self, run_language):

        if self.target_type.value in TargetType.CLASSIFICATION.value and (
                self.options.negative_class_label is None
                or self.options.class_labels is None):
            # No class label information was supplied, but we may be able to infer the labels
            possible_class_labels = possibly_intuit_order(
                self.options.input,
                self.options.target_csv,
                self.options.target,
                self.target_type == TargetType.ANOMALY,
            )
            if possible_class_labels is not None:
                if self.target_type == TargetType.BINARY:
                    if len(possible_class_labels) != 2:
                        raise DrumCommonException(
                            "Target type {} requires exactly 2 class labels. Detected {}: {}"
                            .format(TargetType.BINARY,
                                    len(possible_class_labels),
                                    possible_class_labels))
                    (
                        self.options.positive_class_label,
                        self.options.negative_class_label,
                    ) = possible_class_labels
                elif self.target_type == TargetType.MULTICLASS:
                    if len(possible_class_labels) < 2:
                        raise DrumCommonException(
                            "Target type {} requires more than 2 class labels. Detected {}: {}"
                            .format(
                                TargetType.MULTICLASS,
                                len(possible_class_labels),
                                possible_class_labels,
                            ))
                    self.options.class_labels = list(possible_class_labels)
            else:
                raise DrumCommonException(
                    "Target type {} requires class label information. No labels were supplied and "
                    "labels could not be inferred from the target.".format(
                        self.target_type.value))

        options = self.options
        # the functional pipeline is the predictor pipeline;
        # it differs slightly between batch and server predictions.
        functional_pipeline_name = self._functional_pipelines[(self.run_mode,
                                                               run_language)]
        functional_pipeline_filepath = CMRunnerUtils.get_pipeline_filepath(
            functional_pipeline_name)
        # fields to replace in the functional pipeline (predictor)
        replace_data = {
            "customModelPath": os.path.abspath(options.code_dir),
            "input_filename": options.input,
            "weights": options.row_weights,
            "weights_filename": options.row_weights_csv,
            "target_column": options.target,
            "target_filename": options.target_csv,
            "positiveClassLabel": options.positive_class_label,
            "negativeClassLabel": options.negative_class_label,
            "classLabels": options.class_labels,
            "output_dir": options.output,
            "num_rows": options.num_rows,
            "sparse_column_file": options.sparse_column_file,
        }

        functional_pipeline_str = CMRunnerUtils.render_file(
            functional_pipeline_filepath, replace_data)
        return functional_pipeline_str
Example #17
0
    def _prepare_docker_command(self, options, run_mode, raw_arguments):
        """
        Building a docker command line for running the model inside the docker - this command line
        can be used by the user independently of drum.
        Parameters
        Returns: docker command line to run as a string
        """
        options.docker = self._maybe_build_image(options.docker)
        in_docker_model = "/opt/model"
        in_docker_input_file = "/opt/input.csv"
        in_docker_output_file = "/opt/output.csv"
        in_docker_fit_output_dir = "/opt/fit_output_dir"
        in_docker_fit_target_filename = "/opt/fit_target.csv"
        in_docker_fit_row_weights_filename = "/opt/fit_row_weights.csv"

        docker_cmd = "docker run --rm --entrypoint '' --interactive --user $(id -u):$(id -g)"
        docker_cmd_args = ' -v "{}":{}'.format(options.code_dir,
                                               in_docker_model)

        in_docker_cmd_list = raw_arguments
        in_docker_cmd_list[0] = ArgumentsOptions.MAIN_COMMAND
        in_docker_cmd_list[1] = run_mode.value

        # [RAPTOR-5607] Using -cd makes fit fail within docker, but not --code-dir.
        # Hotfix it by replacing -cd with --code-dir
        in_docker_cmd_list = [
            ArgumentsOptions.CODE_DIR if arg == "-cd" else arg
            for arg in in_docker_cmd_list
        ]

        CMRunnerUtils.delete_cmd_argument(in_docker_cmd_list,
                                          ArgumentsOptions.DOCKER)
        CMRunnerUtils.delete_cmd_argument(in_docker_cmd_list,
                                          ArgumentsOptions.SKIP_DEPS_INSTALL)
        if options.memory:
            docker_cmd_args += " --memory {mem_size} --memory-swap {mem_size} ".format(
                mem_size=options.memory)
            CMRunnerUtils.delete_cmd_argument(in_docker_cmd_list,
                                              ArgumentsOptions.MEMORY)

        if options.class_labels and ArgumentsOptions.CLASS_LABELS not in in_docker_cmd_list:
            CMRunnerUtils.delete_cmd_argument(
                in_docker_cmd_list, ArgumentsOptions.CLASS_LABELS_FILE)
            in_docker_cmd_list.append(ArgumentsOptions.CLASS_LABELS)
            for label in options.class_labels:
                in_docker_cmd_list.append(label)

        CMRunnerUtils.replace_cmd_argument_value(in_docker_cmd_list,
                                                 ArgumentsOptions.CODE_DIR,
                                                 in_docker_model)
        CMRunnerUtils.replace_cmd_argument_value(in_docker_cmd_list, "-cd",
                                                 in_docker_model)
        CMRunnerUtils.replace_cmd_argument_value(in_docker_cmd_list,
                                                 ArgumentsOptions.INPUT,
                                                 in_docker_input_file)
        CMRunnerUtils.replace_cmd_argument_value(in_docker_cmd_list,
                                                 ArgumentsOptions.OUTPUT,
                                                 in_docker_output_file)

        if run_mode == RunMode.SERVER:
            host_port_list = options.address.split(":", 1)
            if len(host_port_list) == 1:
                raise DrumCommonException(
                    "Error: when using the docker option provide argument --server host:port"
                )
            port = int(host_port_list[1])
            host_port_inside_docker = "{}:{}".format("0.0.0.0", port)
            CMRunnerUtils.replace_cmd_argument_value(in_docker_cmd_list,
                                                     ArgumentsOptions.ADDRESS,
                                                     host_port_inside_docker)
            docker_cmd_args += " -p {port}:{port}".format(port=port)

        if run_mode in [
                RunMode.SCORE, RunMode.PERF_TEST, RunMode.VALIDATION,
                RunMode.FIT
        ]:
            docker_cmd_args += ' -v "{}":{}'.format(options.input,
                                                    in_docker_input_file)

            if run_mode == RunMode.SCORE and options.output:
                output_file = os.path.realpath(options.output)
                if not os.path.exists(output_file):
                    # Create an empty file so the mount command mounts the file correctly -
                    # otherwise docker creates an empty directory
                    open(output_file, "a").close()
                docker_cmd_args += ' -v "{}":{}'.format(
                    output_file, in_docker_output_file)
                CMRunnerUtils.replace_cmd_argument_value(
                    in_docker_cmd_list, ArgumentsOptions.OUTPUT,
                    in_docker_output_file)
            elif run_mode == RunMode.FIT:
                if options.output:
                    fit_output_dir = os.path.realpath(options.output)
                    docker_cmd_args += ' -v "{}":{}'.format(
                        fit_output_dir, in_docker_fit_output_dir)
                CMRunnerUtils.replace_cmd_argument_value(
                    in_docker_cmd_list, ArgumentsOptions.OUTPUT,
                    in_docker_fit_output_dir)
                if options.target_csv:
                    fit_target_filename = os.path.realpath(options.target_csv)
                    docker_cmd_args += ' -v "{}":{}'.format(
                        fit_target_filename, in_docker_fit_target_filename)
                    CMRunnerUtils.replace_cmd_argument_value(
                        in_docker_cmd_list,
                        ArgumentsOptions.TARGET_CSV,
                        in_docker_fit_target_filename,
                    )
                if options.row_weights_csv:
                    fit_row_weights_filename = os.path.realpath(
                        options.row_weights_csv)
                    docker_cmd_args += ' -v "{}":{}'.format(
                        fit_row_weights_filename,
                        in_docker_fit_row_weights_filename)
                    CMRunnerUtils.replace_cmd_argument_value(
                        in_docker_cmd_list,
                        ArgumentsOptions.WEIGHTS_CSV,
                        in_docker_fit_row_weights_filename,
                    )

        docker_cmd += " {} {} {}".format(docker_cmd_args, options.docker,
                                         " ".join(in_docker_cmd_list))

        self._print_verbose("docker command: [{}]".format(docker_cmd))
        return docker_cmd
Example #18
0
    def _null_value_imputation_check(self):
        test_name = "Null value imputation"
        test_passed = True
        failure_message = ""
        cmd_list = sys.argv

        TMP_DIR = "/tmp"
        DIR_PREFIX = "drum_validation_checks_"

        ValidationTestResult = collections.namedtuple(
            "ValidationTestResult", "filename retcode message")

        null_datasets_dir = mkdtemp(prefix=DIR_PREFIX, dir=TMP_DIR)

        df = pd.read_csv(self._input_csv)
        # iterating over a DataFrame yields its column labels, so this collects the column names
        column_names = list(df.iloc[[0]])

        results = {}
        for i, column_name in enumerate(column_names):
            output_filename = os.path.join(null_datasets_dir,
                                           "output{}".format(i))
            tmp_dataset_file_path = os.path.join(
                null_datasets_dir, "null_value_imputation_column{}".format(i))
            df_tmp = df.copy()
            df_tmp[column_name] = None
            df_tmp.to_csv(tmp_dataset_file_path, index=False)
            CMRunnerUtils.replace_cmd_argument_value(cmd_list,
                                                     ArgumentsOptions.INPUT,
                                                     tmp_dataset_file_path)
            CMRunnerUtils.replace_cmd_argument_value(cmd_list,
                                                     ArgumentsOptions.OUTPUT,
                                                     output_filename)

            p = subprocess.Popen(cmd_list, env=os.environ)
            retcode = p.wait()
            if retcode != 0:
                test_passed = False
                results[column_name] = ValidationTestResult(
                    tmp_dataset_file_path, retcode, "")

        # process results
        if test_passed:
            shutil.rmtree(null_datasets_dir)
        else:
            for test_result in results.values():
                if not test_result.retcode:
                    os.remove(test_result.filename)

            table = Texttable()
            table.set_deco(Texttable.HEADER)

            headers = ["Failed feature", "Message", "Dataset filename"]

            table.set_cols_dtype(["t", "t", "t"])
            table.set_cols_align(["l", "l", "l"])

            rows = [headers]

            for key, test_result in results.items():
                if test_result.retcode:
                    rows.append(
                        [key, test_result.message, test_result.filename])

            table.add_rows(rows)
            table_res = table.draw()

            message = (
                "The null value imputation check works by imputing each feature with NaN values. "
                "If the check fails for a feature, the test dataset is saved in {}/{}* "
                "Make sure to delete those folders if they take up too much space."
                .format(TMP_DIR, DIR_PREFIX))
            failure_message = "{}\n\n{}".format(message, table_res)

        return test_name, test_passed, failure_message
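
The check above re-invokes the same drum command once per column, swapping only the --input and --output values; a non-zero return code marks that column as failed. A stripped-down sketch of the re-invocation pattern (the command, flags, and replace helper are illustrative stand-ins, not the project's API):

    import subprocess

    def replace_cmd_value(cmd, flag, new_value):
        # assumes flag is present and immediately followed by its value
        idx = cmd.index(flag)
        cmd[idx + 1] = new_value

    cmd_list = ["drum", "score", "--code-dir", "/tmp/model",
                "--input", "orig.csv", "--output", "out.csv"]

    replace_cmd_value(cmd_list, "--input", "/tmp/null_value_imputation_column0")
    replace_cmd_value(cmd_list, "--output", "/tmp/output0")

    retcode = subprocess.call(cmd_list)  # non-zero retcode => the column fails the check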