def _prepare_fit_pipeline(self, run_language):
    if self.options.negative_class_label is None:
        (
            self.options.positive_class_label,
            self.options.negative_class_label,
        ) = possibly_intuit_order(
            self.options.input,
            self.options.target_csv,
            self.options.target,
            self.options.unsupervised,
        )
    if self.options.unsupervised:
        self._set_target_type(TargetType.ANOMALY)
    elif self.options.negative_class_label is not None:
        self._set_target_type(TargetType.BINARY)
    else:
        self._set_target_type(TargetType.REGRESSION)

    options = self.options
    # functional pipeline is predictor pipeline
    # they are a little different for batch and server predictions.
    functional_pipeline_name = self._functional_pipelines[(self.run_mode, run_language)]
    functional_pipeline_filepath = CMRunnerUtils.get_pipeline_filepath(functional_pipeline_name)
    # fields to replace in the functional pipeline (predictor)
    replace_data = {
        "customModelPath": os.path.abspath(options.code_dir),
        "input_filename": options.input,
        "weights": '"{}"'.format(options.row_weights) if options.row_weights else "null",
        "weights_filename": '"{}"'.format(options.row_weights_csv)
        if options.row_weights_csv
        else "null",
        "target_column": '"{}"'.format(options.target) if options.target else "null",
        "target_filename": '"{}"'.format(options.target_csv) if options.target_csv else "null",
        "positiveClassLabel": '"{}"'.format(options.positive_class_label)
        if options.positive_class_label is not None
        else "null",
        "negativeClassLabel": '"{}"'.format(options.negative_class_label)
        if options.negative_class_label is not None
        else "null",
        "output_dir": options.output,
        "num_rows": options.num_rows,
    }

    functional_pipeline_str = CMRunnerUtils.render_file(functional_pipeline_filepath, replace_data)
    return functional_pipeline_str

def _prepare_prediction_server_or_batch_pipeline(self, run_language):
    options = self.options
    # functional pipeline is predictor pipeline
    # they are a little different for batch and server predictions.
    functional_pipeline_name = self._functional_pipelines[(self.run_mode, run_language)]
    functional_pipeline_filepath = CMRunnerUtils.get_pipeline_filepath(functional_pipeline_name)
    # fields to replace in the functional pipeline (predictor)
    replace_data = {
        "positiveClassLabel": '"{}"'.format(options.positive_class_label)
        if options.positive_class_label
        else "null",
        "negativeClassLabel": '"{}"'.format(options.negative_class_label)
        if options.negative_class_label
        else "null",
        "customModelPath": os.path.abspath(options.code_dir),
    }
    if self.run_mode == RunMode.SCORE:
        replace_data.update(
            {
                "input_filename": options.input,
                "output_filename": '"{}"'.format(options.output) if options.output else "null",
            }
        )

    functional_pipeline_str = CMRunnerUtils.render_file(functional_pipeline_filepath, replace_data)
    ret_pipeline = functional_pipeline_str

    if self.run_mode == RunMode.SERVER:
        with open(CMRunnerUtils.get_pipeline_filepath(EXTERNAL_SERVER_RUNNER), "r") as f:
            runner_pipeline_json = json.load(f)
            # can not use template for pipeline as quotes won't be escaped
            args = runner_pipeline_json["pipe"][0]["arguments"]
            # in server mode, predictor pipeline is passed to server as param
            args["pipeline"] = functional_pipeline_str
            args["repo"] = CMRunnerUtils.get_components_repo()
            host_port_list = options.address.split(":", 1)
            args["host"] = host_port_list[0]
            args["port"] = int(host_port_list[1]) if len(host_port_list) == 2 else None
            args["threaded"] = options.threaded
            args["show_perf"] = options.show_perf
            ret_pipeline = json.dumps(runner_pipeline_json)

    return ret_pipeline

def __init__(
    self,
    labels,
    custom_model_dir,
    docker=None,
    with_error_server=False,
    show_stacktrace=True,
    nginx=False,
):
    port = CMRunnerUtils.find_free_port()
    server_address = "localhost:{}".format(port)
    url_host = os.environ.get("TEST_URL_HOST", "localhost")
    if docker:
        self.url_server_address = "http://{}:{}".format(url_host, port)
    else:
        self.url_server_address = "http://localhost:{}".format(port)

    cmd = "{} server --code-dir {} --address {}".format(
        ArgumentsOptions.MAIN_COMMAND, custom_model_dir, server_address
    )
    if labels:
        cmd = _cmd_add_class_labels(cmd, labels)
    if docker:
        cmd += " --docker {}".format(docker)
    if with_error_server:
        cmd += " --with-error-server"
    if show_stacktrace:
        cmd += " --show-stacktrace"
    if nginx:
        cmd += " --production"

    self._cmd = cmd

    self._process_object_holder = DrumServerProcess()
    self._server_thread = None
    self._with_nginx = nginx

def performance_test(self):
    self._print_perf_test_params()
    self._prepare_test_cases()

    _kill_drum_perf_test_server_process(
        _find_drum_perf_test_server_process(), self.options.verbose
    )
    if CMRunnerUtils.is_port_in_use(self._server_addr, self._server_port):
        error_message = "\nError: address: {} is in use".format(self._url_server_address)
        print(error_message)
        raise DrumCommonException(error_message)

    self._start_drum_server()
    self._init_signals()

    print("\n\n")
    results = self._run_all_test_cases()
    self._reset_signals()
    self._stop_drum_server()

    in_docker = self.options.docker is not None
    str_report = PerfTestResultsFormatter(
        results, in_docker=in_docker, show_inside_server=self.options.in_server
    ).get_tbl_str()

    print("\n" + str_report)
    return

def _prepare_prediction_server_or_batch_pipeline(self, run_language):
    options = self.options
    functional_pipeline_name = (
        PREDICTION_SERVER_PIPELINE if self.run_mode == RunMode.SERVER else PREDICTOR_PIPELINE
    )
    functional_pipeline_filepath = CMRunnerUtils.get_pipeline_filepath(functional_pipeline_name)

    # fields to replace in the pipeline
    replace_data = {
        "positiveClassLabel": '"{}"'.format(options.positive_class_label)
        if options.positive_class_label
        else "null",
        "negativeClassLabel": '"{}"'.format(options.negative_class_label)
        if options.negative_class_label
        else "null",
        "customModelPath": os.path.abspath(options.code_dir),
        "run_language": run_language.value,
    }

    if self.run_mode == RunMode.SCORE:
        replace_data.update(
            {
                "input_filename": options.input,
                "output_filename": '"{}"'.format(options.output) if options.output else "null",
            }
        )
    else:
        host_port_list = options.address.split(":", 1)
        host = host_port_list[0]
        port = int(host_port_list[1]) if len(host_port_list) == 2 else None
        replace_data.update(
            {
                "host": host,
                "port": port,
                "threaded": str(options.threaded).lower(),
                "show_perf": str(options.show_perf).lower(),
            }
        )

    functional_pipeline_str = CMRunnerUtils.render_file(functional_pipeline_filepath, replace_data)
    return functional_pipeline_str

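# Hedged illustration (not from the original source): render_file is assumed to be a
# text-templating step that substitutes the keys of replace_data into placeholders in
# the pipeline definition file. With made-up placeholder syntax it would behave like:
#
#   template:     {"host": "{{ host }}", "port": {{ port }}, "threaded": {{ threaded }}}
#   replace_data: {"host": "localhost", "port": 6789, "threaded": "false"}
#   rendered:     {"host": "localhost", "port": 6789, "threaded": false}
#
# This is why string values such as output_filename are pre-wrapped in '"{}"' or set to
# the literal "null" above - the rendered result has to stay valid JSON.
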
def _basic_batch_prediction_check(self):
    test_name = "Basic batch prediction"
    test_passed = True
    failure_message = ""
    cmd_list = sys.argv

    TMP_DIR = "/tmp"
    DIR_PREFIX = "drum_validation_check_"

    output_dir = mkdtemp(prefix=DIR_PREFIX, dir=TMP_DIR)
    output_filename = os.path.join(output_dir, "output")

    CMRunnerUtils.replace_cmd_argument_value(cmd_list, ArgumentsOptions.OUTPUT, output_filename)

    p = subprocess.Popen(cmd_list, env=os.environ)
    retcode = p.wait()
    if retcode != 0:
        test_passed = False
        failure_message = "Test failed on provided dataset: {}".format(self._input_csv)

    return test_name, test_passed, failure_message

def __init__(self, options, run_mode, target_type=None):
    self.options = options
    self.target_type = target_type
    self._verbose = self.options.verbose
    self._input_csv = self.options.input
    self._input_df = pd.read_csv(self._input_csv)

    self._server_addr = "localhost"
    self._server_port = CMRunnerUtils.find_free_port()
    self._url_server_address = "http://{}:{}".format(self._server_addr, self._server_port)
    self._shutdown_endpoint = "/shutdown/"
    self._predict_endpoint = "/predict/"
    self._stats_endpoint = "/stats/"
    self._timeout = 20
    self._server_process = None

    self._df_for_test = None
    self._test_cases_to_run = None

def __init__(self, options, run_mode):
    self.options = options
    self._input_csv = self.options.input

    self._server_addr = "localhost"
    self._server_port = CMRunnerUtils.find_free_port()
    self._url_server_address = "http://{}:{}".format(self._server_addr, self._server_port)
    self._shutdown_endpoint = "/shutdown/"
    self._predict_endpoint = "/predict/"
    self._stats_endpoint = "/stats/"
    self._timeout = 20
    self._server_process = None

    self._df_for_test = None
    self._test_cases_to_run = None

    if run_mode == RunMode.PERF_TEST:
        self._prepare_test_cases()

def performance_test(self):
    _find_and_kill_cmrun_server_process(self.options.verbose)
    if CMRunnerUtils.is_port_in_use(self._server_addr, self._server_port):
        error_message = "\nError: address: {} is in use".format(self._url_server_address)
        print(error_message)
        raise DrumCommonException(error_message)

    cmd_list = self._build_cmrun_cmd()
    self._server_process = subprocess.Popen(cmd_list, env=os.environ)
    self._wait_for_server_to_start()

    def signal_handler(sig, frame):
        print("\nCtrl+C pressed, aborting test")
        print("Sending shutdown to server")
        self._stop_server()
        os.system("tput init")
        sys.exit(0)

    def testcase_timeout(signum, frame):
        raise DrumPerfTestTimeout()

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGALRM, testcase_timeout)

    results = []
    print("\n\n")
    for tc in self._test_cases_to_run:
        signal.alarm(self.options.timeout)
        try:
            self._run_test_case(tc, results)
        except DrumPerfTestTimeout:
            print("... timed out ({}s)".format(self.options.timeout))
        except Exception as e:
            print("\n...test case failed with a message: {}".format(e))

    self._stop_server()
    str_report = self._generate_table_report_adv(
        results, show_inside_server=self.options.in_server
    )
    print("\n" + str_report)
    return

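# A minimal, self-contained sketch of the per-test-case timeout pattern used above
# (hypothetical helper names; the real code raises DrumPerfTestTimeout from its SIGALRM
# handler and catches it around each test case):
import signal


class CaseTimeout(Exception):
    pass


def _alarm_handler(signum, frame):
    raise CaseTimeout()


def run_with_timeout(func, timeout_sec):
    # Arm an alarm before the call; if func() runs longer than timeout_sec,
    # SIGALRM fires and the handler raises CaseTimeout in the main thread.
    signal.signal(signal.SIGALRM, _alarm_handler)
    signal.alarm(timeout_sec)
    try:
        return func()
    except CaseTimeout:
        return None
    finally:
        signal.alarm(0)  # always disarm the pending alarm
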
def _prepare_prediction_server_or_batch_pipeline(self, run_language):
    options = self.options
    functional_pipeline_name = (
        SERVER_PIPELINE if self.run_mode == RunMode.SERVER else PREDICTOR_PIPELINE
    )
    functional_pipeline_filepath = CMRunnerUtils.get_pipeline_filepath(functional_pipeline_name)

    # fields to replace in the pipeline
    replace_data = {
        "positiveClassLabel": '"{}"'.format(options.positive_class_label)
        if options.positive_class_label
        else "null",
        "negativeClassLabel": '"{}"'.format(options.negative_class_label)
        if options.negative_class_label
        else "null",
        "customModelPath": os.path.abspath(options.code_dir),
        "run_language": run_language.value,
        "monitor": options.monitor,
        "model_id": options.model_id,
        "deployment_id": options.deployment_id,
        "monitor_settings": options.monitor_settings,
    }

    if self.run_mode == RunMode.SCORE:
        replace_data.update(
            {
                "input_filename": options.input,
                "output_filename": '"{}"'.format(options.output) if options.output else "null",
            }
        )
    else:
        host_port_list = options.address.split(":", 1)
        host = host_port_list[0]
        port = int(host_port_list[1]) if len(host_port_list) == 2 else None
        replace_data.update(
            {
                "host": host,
                "port": port,
                "show_perf": str(options.show_perf).lower(),
                "engine_type": "RestModelServing" if options.production else "Generic",
                "component_type": "uwsgi_serving" if options.production else "prediction_server",
                "uwsgi_max_workers": options.max_workers
                if getattr(options, "max_workers")
                else "null",
            }
        )

    functional_pipeline_str = CMRunnerUtils.render_file(functional_pipeline_filepath, replace_data)

    if self.run_mode == RunMode.SERVER:
        if options.production:
            pipeline_json = json.loads(functional_pipeline_str)
            # Because of tech debt in MLPiper which requires that the modelFileSourcePath key
            # be filled with something, we're putting in a dummy file path here
            if json_fields.PIPELINE_SYSTEM_CONFIG_FIELD not in pipeline_json:
                system_config = {"modelFileSourcePath": os.path.abspath(__file__)}
                pipeline_json[json_fields.PIPELINE_SYSTEM_CONFIG_FIELD] = system_config
            functional_pipeline_str = json.dumps(pipeline_json)

    return functional_pipeline_str

def validation_test(self):
    # TODO: create infrastructure to easily add more checks
    # NullValueImputationCheck
    test_name = "Null value imputation"
    ValidationTestResult = collections.namedtuple("ValidationTestResult", "filename retcode")

    cmd_list = sys.argv
    cmd_list[0] = ArgumentsOptions.MAIN_COMMAND
    cmd_list[1] = ArgumentsOptions.SCORE

    TMP_DIR = "/tmp"
    DIR_PREFIX = "drum_validation_checks_"

    null_datasets_dir = mkdtemp(prefix=DIR_PREFIX, dir=TMP_DIR)

    df = pd.read_csv(self._input_csv)
    column_names = list(df.iloc[[0]])

    results = {}

    for column_name in column_names:
        with NamedTemporaryFile(
            mode="w",
            dir=null_datasets_dir,
            prefix="null_value_imputation_{}_".format(column_name),
            delete=False,
        ) as temp_f:
            temp_data_name = temp_f.name
            df_tmp = df.copy()
            df_tmp[column_name] = None
            df_tmp.to_csv(temp_data_name, index=False)
            CMRunnerUtils.replace_cmd_argument_value(
                cmd_list, ArgumentsOptions.INPUT, temp_data_name
            )
            p = subprocess.Popen(cmd_list, env=os.environ)
            retcode = p.wait()
            if retcode != 0:
                results[column_name] = ValidationTestResult(temp_data_name, retcode)

    table = Texttable()
    table.set_deco(Texttable.HEADER)

    try:
        terminal_size = shutil.get_terminal_size()
        table.set_max_width(terminal_size.columns)
    except Exception:
        pass

    header_names = ["Test case", "Status"]
    col_types = ["t", "t"]
    col_align = ["l", "l"]

    rows = []
    if len(results) == 0:
        rows.append(header_names)
        rows.append([test_name, "PASSED"])
        shutil.rmtree(null_datasets_dir)
    else:
        col_types.append("t")
        col_align.append("l")
        header_names.append("Details")
        rows.append(header_names)

        for test_result in results.values():
            if not test_result.retcode:
                os.remove(test_result.filename)

        table2 = Texttable()
        table2.set_deco(Texttable.HEADER)

        message = (
            "Null value imputation check performs check by imputing each feature with NaN value. "
            "If check fails for a feature, test dataset is saved in {}/{}. "
            "Make sure to delete those folders if it takes too much space.".format(
                TMP_DIR, DIR_PREFIX
            )
        )
        rows.append([test_name, "FAILED", message])

        header_names2 = ["Failed feature", "Dataset filename"]
        table2.set_cols_dtype(["t", "t"])
        table2.set_cols_align(["l", "l"])

        rows2 = [header_names2]
        for key, test_result in results.items():
            if test_result.retcode:
                rows2.append([key, test_result.filename])

        table2.add_rows(rows2)
        table_res = table2.draw()
        rows.append(["", "", "\n{}".format(table_res)])

    table.set_cols_dtype(col_types)
    table.set_cols_align(col_align)
    table.add_rows(rows)
    tbl_report = table.draw()
    print("\n\nValidation checks results")
    print(tbl_report)

def __init__(
    self,
    target_type,
    labels,
    custom_model_dir,
    docker=None,
    with_error_server=False,
    show_stacktrace=True,
    nginx=False,
    memory=None,
    fail_on_shutdown_error=True,
    pass_args_as_env_vars=False,
    verbose=True,
    append_cmd=None,
):
    port = CMRunnerUtils.find_free_port()
    self.server_address = "localhost:{}".format(port)
    url_host = os.environ.get("TEST_URL_HOST", "localhost")

    if docker:
        self.url_server_address = "http://{}:{}".format(url_host, port)
    else:
        self.url_server_address = "http://localhost:{}".format(port)

    cmd = "{} server".format(ArgumentsOptions.MAIN_COMMAND)

    if pass_args_as_env_vars:
        os.environ[ArgumentOptionsEnvVars.CODE_DIR] = str(custom_model_dir)
        os.environ[ArgumentOptionsEnvVars.TARGET_TYPE] = target_type
        os.environ[ArgumentOptionsEnvVars.ADDRESS] = self.server_address
    else:
        cmd += " --code-dir {} --target-type {} --address {}".format(
            custom_model_dir, target_type, self.server_address
        )

    if labels:
        cmd = _cmd_add_class_labels(
            cmd, labels, target_type=target_type, pass_args_as_env_vars=pass_args_as_env_vars
        )
    if docker:
        cmd += " --docker {}".format(docker)
        if memory:
            cmd += " --memory {}".format(memory)
    if with_error_server:
        if pass_args_as_env_vars:
            os.environ[ArgumentOptionsEnvVars.WITH_ERROR_SERVER] = "1"
        else:
            cmd += " --with-error-server"
    if show_stacktrace:
        if pass_args_as_env_vars:
            os.environ[ArgumentOptionsEnvVars.SHOW_STACKTRACE] = "1"
        else:
            cmd += " --show-stacktrace"
    if nginx:
        if pass_args_as_env_vars:
            os.environ[ArgumentOptionsEnvVars.PRODUCTION] = "1"
        else:
            cmd += " --production"

    if append_cmd is not None:
        cmd += " " + append_cmd

    self._cmd = cmd

    self._process_object_holder = DrumServerProcess()
    self._server_thread = None
    self._with_nginx = nginx
    self._fail_on_shutdown_error = fail_on_shutdown_error
    self._verbose = verbose

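# Hedged usage sketch (assumes this class also implements __enter__/__exit__ that start
# the server thread and shut it down; only __init__ is shown here, and the model
# directory and request payload are made up for illustration):
#
#   with DrumServerRun(
#       target_type="binary",
#       labels=["yes", "no"],
#       custom_model_dir="/tmp/my_model",
#       with_error_server=True,
#   ) as run:
#       response = requests.post(
#           run.url_server_address + "/predict/", files={"X": open("/tmp/data.csv")}
#       )
#       assert response.ok
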
def _check_artifacts_and_get_run_language(self):
    lang = getattr(self.options, "language", None)
    if lang:
        return RunLanguage(self.options.language)

    code_dir_abspath = os.path.abspath(self.options.code_dir)

    artifact_language = None
    custom_language = None
    # check which artifacts present in the code dir
    python_artifacts = CMRunnerUtils.find_files_by_extensions(code_dir_abspath, PythonArtifacts.ALL)
    r_artifacts = CMRunnerUtils.find_files_by_extensions(code_dir_abspath, RArtifacts.ALL)
    java_artifacts = CMRunnerUtils.find_files_by_extensions(code_dir_abspath, JavaArtifacts.ALL)

    # check which custom code files present in the code dir
    is_custom_py = CMRunnerUtils.filename_exists_and_is_file(code_dir_abspath, "custom.py")
    is_custom_r = CMRunnerUtils.filename_exists_and_is_file(
        code_dir_abspath, "custom.R"
    ) or CMRunnerUtils.filename_exists_and_is_file(code_dir_abspath, "custom.r")

    # if all the artifacts belong to the same language, set it
    if bool(len(python_artifacts)) + bool(len(r_artifacts)) + bool(len(java_artifacts)) == 1:
        if len(python_artifacts):
            artifact_language = RunLanguage.PYTHON
        elif len(r_artifacts):
            artifact_language = RunLanguage.R
        elif len(java_artifacts):
            artifact_language = RunLanguage.JAVA

    # if only one custom file found, set it:
    if is_custom_py + is_custom_r == 1:
        custom_language = RunLanguage.PYTHON if is_custom_py else RunLanguage.R

    # if both language values are None, or both are not None and not equal
    if (
        bool(custom_language) + bool(artifact_language) == 0
        or bool(custom_language) + bool(artifact_language) == 2
        and custom_language != artifact_language
    ):
        artifact_language = "None" if artifact_language is None else artifact_language.value
        custom_language = "None" if custom_language is None else custom_language.value
        error_mes = (
            "Can not detect language by artifacts and/or custom.py/R files.\n"
            "Detected: language by artifacts - {}; language by custom - {}.\n"
            "Code directory must have one or more model artifacts belonging to the same language:\n"
            "Python/R/Java, with an extension:\n"
            "Python models: {}\n"
            "R models: {}\n"
            "Java models: {}.\n"
            "Or one of custom.py/R files.".format(
                artifact_language,
                custom_language,
                PythonArtifacts.ALL,
                RArtifacts.ALL,
                JavaArtifacts.ALL,
            )
        )
        all_files_message = "\n\nFiles(100 first) found in {}:\n{}\n".format(
            code_dir_abspath, "\n".join(sorted(os.listdir(code_dir_abspath))[0:100])
        )

        error_mes += all_files_message
        self.logger.error(error_mes)
        raise DrumCommonException(error_mes)

    run_language = custom_language if custom_language is not None else artifact_language
    return run_language

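# Hedged illustration of the detection rules above (file extensions are examples only,
# assuming they appear in PythonArtifacts.ALL / RArtifacts.ALL / JavaArtifacts.ALL):
#
#   custom.py + model.pkl      -> RunLanguage.PYTHON  (custom file and artifact agree)
#   custom.R  (no artifacts)   -> RunLanguage.R       (custom file alone is enough)
#   model.jar (no custom.*)    -> RunLanguage.JAVA    (artifact alone is enough)
#   custom.py + model.rds      -> DrumCommonException (languages conflict)
#   empty code dir             -> DrumCommonException (nothing to detect)
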
def _prepare_docker_command(self, options, run_mode, raw_arguments):
    """
    Building a docker command line for running the model inside the docker - this command line
    can be used by the user independently of drum.
    Parameters
    Returns: docker command line to run as a string
    """
    options.docker = self._maybe_build_image(options.docker)
    in_docker_model = "/opt/model"
    in_docker_input_file = "/opt/input.csv"
    in_docker_output_file = "/opt/output.csv"
    in_docker_fit_output_dir = "/opt/fit_output_dir"
    in_docker_fit_target_filename = "/opt/fit_target.csv"
    in_docker_fit_row_weights_filename = "/opt/fit_row_weights.csv"

    docker_cmd = "docker run --rm --interactive --user $(id -u):$(id -g) "
    docker_cmd_args = " -v {}:{}".format(options.code_dir, in_docker_model)

    in_docker_cmd_list = raw_arguments
    in_docker_cmd_list[0] = ArgumentsOptions.MAIN_COMMAND
    in_docker_cmd_list[1] = run_mode.value

    CMRunnerUtils.delete_cmd_argument(in_docker_cmd_list, ArgumentsOptions.DOCKER)
    CMRunnerUtils.replace_cmd_argument_value(
        in_docker_cmd_list, ArgumentsOptions.CODE_DIR, in_docker_model
    )
    CMRunnerUtils.replace_cmd_argument_value(in_docker_cmd_list, "-cd", in_docker_model)
    CMRunnerUtils.replace_cmd_argument_value(
        in_docker_cmd_list, ArgumentsOptions.INPUT, in_docker_input_file
    )
    CMRunnerUtils.replace_cmd_argument_value(
        in_docker_cmd_list, ArgumentsOptions.OUTPUT, in_docker_output_file
    )

    if run_mode == RunMode.SERVER:
        host_port_list = options.address.split(":", 1)
        if len(host_port_list) == 1:
            raise DrumCommonException(
                "Error: when using the docker option provide argument --server host:port"
            )
        port = int(host_port_list[1])
        host_port_inside_docker = "{}:{}".format("0.0.0.0", port)
        CMRunnerUtils.replace_cmd_argument_value(
            in_docker_cmd_list, ArgumentsOptions.ADDRESS, host_port_inside_docker
        )
        docker_cmd_args += " -p {port}:{port}".format(port=port)

    if run_mode in [RunMode.SCORE, RunMode.PERF_TEST, RunMode.VALIDATION, RunMode.FIT]:
        docker_cmd_args += " -v {}:{}".format(options.input, in_docker_input_file)

        if run_mode == RunMode.SCORE and options.output:
            output_file = os.path.realpath(options.output)
            if not os.path.exists(output_file):
                # Creating an empty file so the mount command will mount the file correctly -
                # otherwise docker create an empty directory
                open(output_file, "a").close()
            docker_cmd_args += " -v {}:{}".format(output_file, in_docker_output_file)
            CMRunnerUtils.replace_cmd_argument_value(
                in_docker_cmd_list, ArgumentsOptions.OUTPUT, in_docker_output_file
            )
        elif run_mode == RunMode.FIT:
            if options.output:
                fit_output_dir = os.path.realpath(options.output)
                docker_cmd_args += " -v {}:{}".format(fit_output_dir, in_docker_fit_output_dir)
                CMRunnerUtils.replace_cmd_argument_value(
                    in_docker_cmd_list, ArgumentsOptions.OUTPUT, in_docker_fit_output_dir
                )
            if options.target_csv:
                fit_target_filename = os.path.realpath(options.target_csv)
                docker_cmd_args += " -v {}:{}".format(
                    fit_target_filename, in_docker_fit_target_filename
                )
                CMRunnerUtils.replace_cmd_argument_value(
                    in_docker_cmd_list,
                    ArgumentsOptions.TARGET_FILENAME,
                    in_docker_fit_target_filename,
                )
            if options.row_weights_csv:
                fit_row_weights_filename = os.path.realpath(options.row_weights_csv)
                docker_cmd_args += " -v {}:{}".format(
                    fit_row_weights_filename, in_docker_fit_row_weights_filename
                )
                CMRunnerUtils.replace_cmd_argument_value(
                    in_docker_cmd_list,
                    ArgumentsOptions.WEIGHTS_CSV,
                    in_docker_fit_row_weights_filename,
                )

    docker_cmd += " {} {} {}".format(docker_cmd_args, options.docker, " ".join(in_docker_cmd_list))

    self._print_verbose("docker command: [{}]".format(docker_cmd))
    return docker_cmd

def _run_fit_and_predictions_pipelines_in_mlpiper(self):
    if self.run_mode == RunMode.SERVER:
        run_language = self._check_artifacts_and_get_run_language()
        # in prediction server mode infra pipeline == prediction server runner pipeline
        infra_pipeline_str = self._prepare_prediction_server_or_batch_pipeline(run_language)
    elif self.run_mode == RunMode.SCORE:
        run_language = self._check_artifacts_and_get_run_language()
        tmp_output_filename = None
        # if output is not provided, output into tmp file and print
        if not self.options.output:
            # keep object reference so it will be destroyed only in the end of the process
            __tmp_output_file = tempfile.NamedTemporaryFile(mode="w")
            self.options.output = tmp_output_filename = __tmp_output_file.name
        # in batch prediction mode infra pipeline == predictor pipeline
        infra_pipeline_str = self._prepare_prediction_server_or_batch_pipeline(run_language)
    elif self.run_mode == RunMode.FIT:
        run_language = self._get_fit_run_language()
        infra_pipeline_str = self._prepare_fit_pipeline(run_language)
    else:
        error_message = "{} mode is not supported here".format(self.run_mode)
        print(error_message)
        raise DrumCommonException(error_message)

    config = ExecutorConfig(
        pipeline=infra_pipeline_str,
        pipeline_file=None,
        run_locally=True,
        comp_root_path=CMRunnerUtils.get_components_repo(),
        mlpiper_jar=None,
        spark_jars=None,
    )

    _pipeline_executor = Executor(config).standalone(True).set_verbose(self.options.verbose)
    # assign logger with the name drum.mlpiper.Executor to mlpiper Executor
    _pipeline_executor.set_logger(
        logging.getLogger(LOGGER_NAME_PREFIX + "." + _pipeline_executor.logger_name())
    )

    self.logger.info(
        ">>> Start {} in the {} mode".format(ArgumentsOptions.MAIN_COMMAND, self.run_mode.value)
    )
    sc = StatsCollector(
        disable_instance=(
            not hasattr(self.options, "show_perf")
            or not self.options.show_perf
            or self.run_mode == RunMode.SERVER
        )
    )
    sc.register_report("Full time", "end", StatsOperation.SUB, "start")
    sc.register_report("Init time (incl model loading)", "init", StatsOperation.SUB, "start")
    sc.register_report("Run time (incl reading CSV)", "run", StatsOperation.SUB, "init")
    with verbose_stdout(self.options.verbose):
        sc.enable()
        try:
            sc.mark("start")

            _pipeline_executor.init_pipeline()
            self.runtime.initialization_succeeded = True
            sc.mark("init")

            _pipeline_executor.run_pipeline(cleanup=False)
            sc.mark("run")
        finally:
            _pipeline_executor.cleanup_pipeline()
            sc.mark("end")
            sc.disable()
    self.logger.info(
        "<<< Finish {} in the {} mode".format(ArgumentsOptions.MAIN_COMMAND, self.run_mode.value)
    )
    sc.print_reports()
    if self.run_mode == RunMode.SCORE:
        # print result if output is not provided
        if tmp_output_filename:
            print(pd.read_csv(tmp_output_filename))

def _prepare_fit_pipeline(self, run_language):
    if self.target_type.value in TargetType.CLASSIFICATION.value and (
        self.options.negative_class_label is None or self.options.class_labels is None
    ):
        # No class label information was supplied, but we may be able to infer the labels
        possible_class_labels = possibly_intuit_order(
            self.options.input,
            self.options.target_csv,
            self.options.target,
            self.target_type == TargetType.ANOMALY,
        )
        if possible_class_labels is not None:
            if self.target_type == TargetType.BINARY:
                if len(possible_class_labels) != 2:
                    raise DrumCommonException(
                        "Target type {} requires exactly 2 class labels. Detected {}: {}".format(
                            TargetType.BINARY, len(possible_class_labels), possible_class_labels
                        )
                    )
                (
                    self.options.positive_class_label,
                    self.options.negative_class_label,
                ) = possible_class_labels
            elif self.target_type == TargetType.MULTICLASS:
                if len(possible_class_labels) < 2:
                    raise DrumCommonException(
                        "Target type {} requires more than 2 class labels. Detected {}: {}".format(
                            TargetType.MULTICLASS,
                            len(possible_class_labels),
                            possible_class_labels,
                        )
                    )
                self.options.class_labels = list(possible_class_labels)
        else:
            raise DrumCommonException(
                "Target type {} requires class label information. No labels were supplied and "
                "labels could not be inferred from the target.".format(self.target_type.value)
            )

    options = self.options
    # functional pipeline is predictor pipeline
    # they are a little different for batch and server predictions.
    functional_pipeline_name = self._functional_pipelines[(self.run_mode, run_language)]
    functional_pipeline_filepath = CMRunnerUtils.get_pipeline_filepath(functional_pipeline_name)
    # fields to replace in the functional pipeline (predictor)
    replace_data = {
        "customModelPath": os.path.abspath(options.code_dir),
        "input_filename": options.input,
        "weights": options.row_weights,
        "weights_filename": options.row_weights_csv,
        "target_column": options.target,
        "target_filename": options.target_csv,
        "positiveClassLabel": options.positive_class_label,
        "negativeClassLabel": options.negative_class_label,
        "classLabels": options.class_labels,
        "output_dir": options.output,
        "num_rows": options.num_rows,
        "sparse_column_file": options.sparse_column_file,
    }

    functional_pipeline_str = CMRunnerUtils.render_file(functional_pipeline_filepath, replace_data)
    return functional_pipeline_str

def _prepare_docker_command(self, options, run_mode, raw_arguments):
    """
    Building a docker command line for running the model inside the docker - this command line
    can be used by the user independently of drum.
    Parameters
    Returns: docker command line to run as a string
    """
    options.docker = self._maybe_build_image(options.docker)
    in_docker_model = "/opt/model"
    in_docker_input_file = "/opt/input.csv"
    in_docker_output_file = "/opt/output.csv"
    in_docker_fit_output_dir = "/opt/fit_output_dir"
    in_docker_fit_target_filename = "/opt/fit_target.csv"
    in_docker_fit_row_weights_filename = "/opt/fit_row_weights.csv"

    docker_cmd = "docker run --rm --entrypoint '' --interactive --user $(id -u):$(id -g)"
    docker_cmd_args = ' -v "{}":{}'.format(options.code_dir, in_docker_model)

    in_docker_cmd_list = raw_arguments
    in_docker_cmd_list[0] = ArgumentsOptions.MAIN_COMMAND
    in_docker_cmd_list[1] = run_mode.value

    # [RAPTOR-5607] Using -cd makes fit fail within docker, but not --code-dir.
    # Hotfix it by replacing -cd with --code-dir
    in_docker_cmd_list = [
        ArgumentsOptions.CODE_DIR if arg == "-cd" else arg for arg in in_docker_cmd_list
    ]

    CMRunnerUtils.delete_cmd_argument(in_docker_cmd_list, ArgumentsOptions.DOCKER)
    CMRunnerUtils.delete_cmd_argument(in_docker_cmd_list, ArgumentsOptions.SKIP_DEPS_INSTALL)

    if options.memory:
        docker_cmd_args += " --memory {mem_size} --memory-swap {mem_size} ".format(
            mem_size=options.memory
        )
        CMRunnerUtils.delete_cmd_argument(in_docker_cmd_list, ArgumentsOptions.MEMORY)

    if options.class_labels and ArgumentsOptions.CLASS_LABELS not in in_docker_cmd_list:
        CMRunnerUtils.delete_cmd_argument(in_docker_cmd_list, ArgumentsOptions.CLASS_LABELS_FILE)
        in_docker_cmd_list.append(ArgumentsOptions.CLASS_LABELS)
        for label in options.class_labels:
            in_docker_cmd_list.append(label)

    CMRunnerUtils.replace_cmd_argument_value(
        in_docker_cmd_list, ArgumentsOptions.CODE_DIR, in_docker_model
    )
    CMRunnerUtils.replace_cmd_argument_value(in_docker_cmd_list, "-cd", in_docker_model)
    CMRunnerUtils.replace_cmd_argument_value(
        in_docker_cmd_list, ArgumentsOptions.INPUT, in_docker_input_file
    )
    CMRunnerUtils.replace_cmd_argument_value(
        in_docker_cmd_list, ArgumentsOptions.OUTPUT, in_docker_output_file
    )

    if run_mode == RunMode.SERVER:
        host_port_list = options.address.split(":", 1)
        if len(host_port_list) == 1:
            raise DrumCommonException(
                "Error: when using the docker option provide argument --server host:port"
            )
        port = int(host_port_list[1])
        host_port_inside_docker = "{}:{}".format("0.0.0.0", port)
        CMRunnerUtils.replace_cmd_argument_value(
            in_docker_cmd_list, ArgumentsOptions.ADDRESS, host_port_inside_docker
        )
        docker_cmd_args += " -p {port}:{port}".format(port=port)

    if run_mode in [RunMode.SCORE, RunMode.PERF_TEST, RunMode.VALIDATION, RunMode.FIT]:
        docker_cmd_args += ' -v "{}":{}'.format(options.input, in_docker_input_file)

        if run_mode == RunMode.SCORE and options.output:
            output_file = os.path.realpath(options.output)
            if not os.path.exists(output_file):
                # Creating an empty file so the mount command will mount the file correctly -
                # otherwise docker create an empty directory
                open(output_file, "a").close()
            docker_cmd_args += ' -v "{}":{}'.format(output_file, in_docker_output_file)
            CMRunnerUtils.replace_cmd_argument_value(
                in_docker_cmd_list, ArgumentsOptions.OUTPUT, in_docker_output_file
            )
        elif run_mode == RunMode.FIT:
            if options.output:
                fit_output_dir = os.path.realpath(options.output)
                docker_cmd_args += ' -v "{}":{}'.format(fit_output_dir, in_docker_fit_output_dir)
                CMRunnerUtils.replace_cmd_argument_value(
                    in_docker_cmd_list, ArgumentsOptions.OUTPUT, in_docker_fit_output_dir
                )
            if options.target_csv:
                fit_target_filename = os.path.realpath(options.target_csv)
                docker_cmd_args += ' -v "{}":{}'.format(
                    fit_target_filename, in_docker_fit_target_filename
                )
                CMRunnerUtils.replace_cmd_argument_value(
                    in_docker_cmd_list,
                    ArgumentsOptions.TARGET_CSV,
                    in_docker_fit_target_filename,
                )
            if options.row_weights_csv:
                fit_row_weights_filename = os.path.realpath(options.row_weights_csv)
                docker_cmd_args += ' -v "{}":{}'.format(
                    fit_row_weights_filename, in_docker_fit_row_weights_filename
                )
                CMRunnerUtils.replace_cmd_argument_value(
                    in_docker_cmd_list,
                    ArgumentsOptions.WEIGHTS_CSV,
                    in_docker_fit_row_weights_filename,
                )

    docker_cmd += " {} {} {}".format(docker_cmd_args, options.docker, " ".join(in_docker_cmd_list))

    self._print_verbose("docker command: [{}]".format(docker_cmd))
    return docker_cmd

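# Hedged example of the kind of command line the method above can produce for score
# mode (image name, paths, and the main command name are made up for illustration):
#
#   docker run --rm --entrypoint '' --interactive --user $(id -u):$(id -g) \
#       -v "/home/user/model":/opt/model -v "/home/user/data.csv":/opt/input.csv \
#       my-drum-image \
#       drum score --code-dir /opt/model --input /opt/input.csv
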
def _null_value_imputation_check(self):
    test_name = "Null value imputation"
    test_passed = True
    failure_message = ""
    cmd_list = sys.argv

    TMP_DIR = "/tmp"
    DIR_PREFIX = "drum_validation_checks_"

    ValidationTestResult = collections.namedtuple(
        "ValidationTestResult", "filename retcode message"
    )

    null_datasets_dir = mkdtemp(prefix=DIR_PREFIX, dir=TMP_DIR)

    df = pd.read_csv(self._input_csv)
    column_names = list(df.iloc[[0]])

    results = {}

    for i, column_name in enumerate(column_names):
        output_filename = os.path.join(null_datasets_dir, "output{}".format(i))
        tmp_dataset_file_path = os.path.join(
            null_datasets_dir, "null_value_imputation_column{}".format(i)
        )
        df_tmp = df.copy()
        df_tmp[column_name] = None
        df_tmp.to_csv(tmp_dataset_file_path, index=False)
        CMRunnerUtils.replace_cmd_argument_value(
            cmd_list, ArgumentsOptions.INPUT, tmp_dataset_file_path
        )
        CMRunnerUtils.replace_cmd_argument_value(cmd_list, ArgumentsOptions.OUTPUT, output_filename)
        p = subprocess.Popen(cmd_list, env=os.environ)
        retcode = p.wait()
        if retcode != 0:
            test_passed = False
            results[column_name] = ValidationTestResult(tmp_dataset_file_path, retcode, "")

    # process results
    if test_passed:
        shutil.rmtree(null_datasets_dir)
    else:
        for test_result in results.values():
            if not test_result.retcode:
                os.remove(test_result.filename)

        table = Texttable()
        table.set_deco(Texttable.HEADER)

        headers = ["Failed feature", "Message", "Dataset filename"]
        table.set_cols_dtype(["t", "t", "t"])
        table.set_cols_align(["l", "l", "l"])

        rows = [headers]
        for key, test_result in results.items():
            if test_result.retcode:
                rows.append([key, test_result.message, test_result.filename])

        table.add_rows(rows)
        table_res = table.draw()

        message = (
            "Null value imputation check performs check by imputing each feature with NaN value. "
            "If check fails for a feature, test dataset is saved in {}/{}* "
            "Make sure to delete those folders if it takes too much space.".format(
                TMP_DIR, DIR_PREFIX
            )
        )
        failure_message = "{}\n\n{}".format(message, table_res)

    return test_name, test_passed, failure_message

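# Hedged, minimal restatement of the idea behind the check above (score_one_dataset is
# a hypothetical callable standing in for the `drum score` subprocess invocation; it is
# assumed to return the process exit code):
import pandas as pd


def columns_failing_null_imputation(input_csv, score_one_dataset):
    """Return the columns whose all-null variant makes scoring fail."""
    df = pd.read_csv(input_csv)
    failed = []
    for column_name in df.columns:
        df_tmp = df.copy()
        df_tmp[column_name] = None  # blank out one feature at a time
        if score_one_dataset(df_tmp) != 0:  # non-zero exit code means the model choked
            failed.append(column_name)
    return failed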