def _basic_batch_prediction_check(self):
    test_name = "Basic batch prediction"
    test_passed = True
    failure_message = ""

    cmd_list = sys.argv
    TMP_DIR = "/tmp"
    DIR_PREFIX = "drum_validation_check_"

    # Re-run the current drum invocation as a subprocess, redirecting its
    # output into a scratch directory; a non-zero exit code fails the check.
    output_dir = mkdtemp(prefix=DIR_PREFIX, dir=TMP_DIR)
    output_filename = os.path.join(output_dir, "output")
    CMRunnerUtils.replace_cmd_argument_value(cmd_list, ArgumentsOptions.OUTPUT, output_filename)

    p = subprocess.Popen(cmd_list, env=os.environ)
    retcode = p.wait()
    if retcode != 0:
        test_passed = False
        failure_message = "Test failed on provided dataset: {}".format(self._input_csv)

    return test_name, test_passed, failure_message
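# Illustration only: each per-check method returns a uniform
# (test_name, test_passed, failure_message) tuple. A hypothetical reporting
# helper (not part of drum) could render a list of such tuples with Texttable,
# mirroring the report drawn in validation_test() below.
def _print_check_results_sketch(check_results):
    table = Texttable()
    table.set_deco(Texttable.HEADER)
    table.set_cols_dtype(["t", "t", "t"])
    table.set_cols_align(["l", "l", "l"])
    rows = [["Test case", "Status", "Details"]]
    for test_name, test_passed, failure_message in check_results:
        rows.append([test_name, "PASSED" if test_passed else "FAILED", failure_message])
    table.add_rows(rows)
    print(table.draw())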
def validation_test(self):
    # TODO: create infrastructure to easily add more checks
    # NullValueImputationCheck
    test_name = "Null value imputation"
    ValidationTestResult = collections.namedtuple("ValidationTestResult", "filename retcode")

    # Rewrite the current argv so the subprocess runs drum in score mode.
    cmd_list = sys.argv
    cmd_list[0] = ArgumentsOptions.MAIN_COMMAND
    cmd_list[1] = ArgumentsOptions.SCORE

    TMP_DIR = "/tmp"
    DIR_PREFIX = "drum_validation_checks_"
    null_datasets_dir = mkdtemp(prefix=DIR_PREFIX, dir=TMP_DIR)

    df = pd.read_csv(self._input_csv)
    column_names = list(df.columns)

    # Null out one column at a time and score the modified dataset; any
    # non-zero exit code records a failure for that column.
    results = {}
    for column_name in column_names:
        with NamedTemporaryFile(
            mode="w",
            dir=null_datasets_dir,
            prefix="null_value_imputation_{}_".format(column_name),
            delete=False,
        ) as temp_f:
            temp_data_name = temp_f.name
            df_tmp = df.copy()
            df_tmp[column_name] = None
            df_tmp.to_csv(temp_data_name, index=False)
            CMRunnerUtils.replace_cmd_argument_value(
                cmd_list, ArgumentsOptions.INPUT, temp_data_name
            )
            p = subprocess.Popen(cmd_list, env=os.environ)
            retcode = p.wait()
            if retcode != 0:
                results[column_name] = ValidationTestResult(temp_data_name, retcode)

    table = Texttable()
    table.set_deco(Texttable.HEADER)
    try:
        terminal_size = shutil.get_terminal_size()
        table.set_max_width(terminal_size.columns)
    except Exception:
        # Terminal size may be unavailable, e.g. when not attached to a TTY.
        pass

    header_names = ["Test case", "Status"]
    col_types = ["t", "t"]
    col_align = ["l", "l"]
    rows = []
    if len(results) == 0:
        rows.append(header_names)
        rows.append([test_name, "PASSED"])
        shutil.rmtree(null_datasets_dir)
    else:
        col_types.append("t")
        col_align.append("l")
        header_names.append("Details")
        rows.append(header_names)
        for test_result in results.values():
            if not test_result.retcode:
                os.remove(test_result.filename)

        table2 = Texttable()
        table2.set_deco(Texttable.HEADER)

        message = (
            "The null value imputation check imputes each feature with null values, "
            "one feature at a time. If the check fails for a feature, the test dataset "
            "is saved in {}/{}. Make sure to delete those folders if they take up too "
            "much space.".format(TMP_DIR, DIR_PREFIX)
        )
        rows.append([test_name, "FAILED", message])

        header_names2 = ["Failed feature", "Dataset filename"]
        table2.set_cols_dtype(["t", "t"])
        table2.set_cols_align(["l", "l"])
        rows2 = [header_names2]
        for key, test_result in results.items():
            if test_result.retcode:
                rows2.append([key, test_result.filename])
        table2.add_rows(rows2)
        table_res = table2.draw()
        rows.append(["", "", "\n{}".format(table_res)])

    table.set_cols_dtype(col_types)
    table.set_cols_align(col_align)
    table.add_rows(rows)
    tbl_report = table.draw()
    print("\n\nValidation checks results")
    print(tbl_report)
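# The argv rewriting above relies on CMRunnerUtils helpers. Below is a minimal
# sketch of what replace_cmd_argument_value plausibly does, judging only from
# its call sites (the real implementation in drum may differ): replace the
# token that follows a flag in an argv-style list, doing nothing if the flag
# is absent.
def _replace_cmd_argument_value_sketch(cmd_list, arg_name, new_value):
    try:
        ind = cmd_list.index(arg_name)
    except ValueError:
        return  # flag not present; assumed to be a no-op
    cmd_list[ind + 1] = str(new_value)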
def _prepare_docker_command(self, options, run_mode, raw_arguments):
    """
    Build a docker command line for running the model inside docker.
    The resulting command line can be used by the user independently of drum.

    Returns
    -------
    str
        docker command line to run
    """
    options.docker = self._maybe_build_image(options.docker)
    in_docker_model = "/opt/model"
    in_docker_input_file = "/opt/input.csv"
    in_docker_output_file = "/opt/output.csv"
    in_docker_fit_output_dir = "/opt/fit_output_dir"
    in_docker_fit_target_filename = "/opt/fit_target.csv"
    in_docker_fit_row_weights_filename = "/opt/fit_row_weights.csv"

    docker_cmd = "docker run --rm --entrypoint '' --interactive --user $(id -u):$(id -g)"
    docker_cmd_args = ' -v "{}":{}'.format(options.code_dir, in_docker_model)

    in_docker_cmd_list = raw_arguments
    in_docker_cmd_list[0] = ArgumentsOptions.MAIN_COMMAND
    in_docker_cmd_list[1] = run_mode.value

    # [RAPTOR-5607] Using -cd makes fit fail within docker, but not --code-dir.
    # Hotfix it by replacing -cd with --code-dir
    in_docker_cmd_list = [
        ArgumentsOptions.CODE_DIR if arg == "-cd" else arg for arg in in_docker_cmd_list
    ]

    CMRunnerUtils.delete_cmd_argument(in_docker_cmd_list, ArgumentsOptions.DOCKER)
    CMRunnerUtils.delete_cmd_argument(in_docker_cmd_list, ArgumentsOptions.SKIP_DEPS_INSTALL)

    if options.memory:
        docker_cmd_args += " --memory {mem_size} --memory-swap {mem_size} ".format(
            mem_size=options.memory
        )
        CMRunnerUtils.delete_cmd_argument(in_docker_cmd_list, ArgumentsOptions.MEMORY)

    if options.class_labels and ArgumentsOptions.CLASS_LABELS not in in_docker_cmd_list:
        CMRunnerUtils.delete_cmd_argument(in_docker_cmd_list, ArgumentsOptions.CLASS_LABELS_FILE)
        in_docker_cmd_list.append(ArgumentsOptions.CLASS_LABELS)
        for label in options.class_labels:
            in_docker_cmd_list.append(label)

    # Point the in-docker command at the mounted paths instead of host paths.
    CMRunnerUtils.replace_cmd_argument_value(
        in_docker_cmd_list, ArgumentsOptions.CODE_DIR, in_docker_model
    )
    CMRunnerUtils.replace_cmd_argument_value(in_docker_cmd_list, "-cd", in_docker_model)
    CMRunnerUtils.replace_cmd_argument_value(
        in_docker_cmd_list, ArgumentsOptions.INPUT, in_docker_input_file
    )
    CMRunnerUtils.replace_cmd_argument_value(
        in_docker_cmd_list, ArgumentsOptions.OUTPUT, in_docker_output_file
    )

    if run_mode == RunMode.SERVER:
        host_port_list = options.address.split(":", 1)
        if len(host_port_list) == 1:
            raise DrumCommonException(
                "Error: when using the docker option provide argument --server host:port"
            )
        port = int(host_port_list[1])
        host_port_inside_docker = "{}:{}".format("0.0.0.0", port)
        CMRunnerUtils.replace_cmd_argument_value(
            in_docker_cmd_list, ArgumentsOptions.ADDRESS, host_port_inside_docker
        )
        docker_cmd_args += " -p {port}:{port}".format(port=port)

    if run_mode in [RunMode.SCORE, RunMode.PERF_TEST, RunMode.VALIDATION, RunMode.FIT]:
        docker_cmd_args += ' -v "{}":{}'.format(options.input, in_docker_input_file)

        if run_mode == RunMode.SCORE and options.output:
            output_file = os.path.realpath(options.output)
            if not os.path.exists(output_file):
                # Create an empty file so the mount command will mount the file
                # correctly - otherwise docker creates an empty directory
                open(output_file, "a").close()
            docker_cmd_args += ' -v "{}":{}'.format(output_file, in_docker_output_file)
            CMRunnerUtils.replace_cmd_argument_value(
                in_docker_cmd_list, ArgumentsOptions.OUTPUT, in_docker_output_file
            )
        elif run_mode == RunMode.FIT:
            if options.output:
                fit_output_dir = os.path.realpath(options.output)
                docker_cmd_args += ' -v "{}":{}'.format(fit_output_dir, in_docker_fit_output_dir)
                CMRunnerUtils.replace_cmd_argument_value(
                    in_docker_cmd_list, ArgumentsOptions.OUTPUT, in_docker_fit_output_dir
                )
            if options.target_csv:
                fit_target_filename = os.path.realpath(options.target_csv)
                docker_cmd_args += ' -v "{}":{}'.format(
                    fit_target_filename, in_docker_fit_target_filename
                )
                CMRunnerUtils.replace_cmd_argument_value(
                    in_docker_cmd_list,
                    ArgumentsOptions.TARGET_CSV,
                    in_docker_fit_target_filename,
                )
            if options.row_weights_csv:
                fit_row_weights_filename = os.path.realpath(options.row_weights_csv)
                docker_cmd_args += ' -v "{}":{}'.format(
                    fit_row_weights_filename, in_docker_fit_row_weights_filename
                )
                CMRunnerUtils.replace_cmd_argument_value(
                    in_docker_cmd_list,
                    ArgumentsOptions.WEIGHTS_CSV,
                    in_docker_fit_row_weights_filename,
                )

    docker_cmd += " {} {} {}".format(
        docker_cmd_args, options.docker, " ".join(in_docker_cmd_list)
    )

    self._print_verbose("docker command: [{}]".format(docker_cmd))
    return docker_cmd
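# For orientation, a hypothetical score-mode command produced by
# _prepare_docker_command looks roughly like this (paths and image name are
# invented for the example):
#
#   docker run --rm --entrypoint '' --interactive --user $(id -u):$(id -g) \
#       -v "/home/user/model":/opt/model \
#       -v "/home/user/data.csv":/opt/input.csv \
#       my-drum-image drum score --code-dir /opt/model --input /opt/input.csv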
def _null_value_imputation_check(self):
    test_name = "Null value imputation"
    test_passed = True
    failure_message = ""
    cmd_list = sys.argv

    TMP_DIR = "/tmp"
    DIR_PREFIX = "drum_validation_checks_"
    ValidationTestResult = collections.namedtuple(
        "ValidationTestResult", "filename retcode message"
    )

    null_datasets_dir = mkdtemp(prefix=DIR_PREFIX, dir=TMP_DIR)

    df = pd.read_csv(self._input_csv)
    column_names = list(df.columns)

    # Null out one column at a time, score the modified dataset in a
    # subprocess, and record any column whose run exits with a non-zero code.
    results = {}
    for i, column_name in enumerate(column_names):
        output_filename = os.path.join(null_datasets_dir, "output{}".format(i))
        tmp_dataset_file_path = os.path.join(
            null_datasets_dir, "null_value_imputation_column{}".format(i)
        )
        df_tmp = df.copy()
        df_tmp[column_name] = None
        df_tmp.to_csv(tmp_dataset_file_path, index=False)
        CMRunnerUtils.replace_cmd_argument_value(
            cmd_list, ArgumentsOptions.INPUT, tmp_dataset_file_path
        )
        CMRunnerUtils.replace_cmd_argument_value(
            cmd_list, ArgumentsOptions.OUTPUT, output_filename
        )
        p = subprocess.Popen(cmd_list, env=os.environ)
        retcode = p.wait()
        if retcode != 0:
            test_passed = False
            results[column_name] = ValidationTestResult(tmp_dataset_file_path, retcode, "")

    # process results
    if test_passed:
        shutil.rmtree(null_datasets_dir)
    else:
        for test_result in results.values():
            if not test_result.retcode:
                os.remove(test_result.filename)

        table = Texttable()
        table.set_deco(Texttable.HEADER)

        headers = ["Failed feature", "Message", "Dataset filename"]
        table.set_cols_dtype(["t", "t", "t"])
        table.set_cols_align(["l", "l", "l"])
        rows = [headers]
        for key, test_result in results.items():
            if test_result.retcode:
                rows.append([key, test_result.message, test_result.filename])
        table.add_rows(rows)
        table_res = table.draw()

        message = (
            "The null value imputation check imputes each feature with null values, "
            "one feature at a time. If the check fails for a feature, the test dataset "
            "is saved in {}/{}*. Make sure to delete those folders if they take up too "
            "much space.".format(TMP_DIR, DIR_PREFIX)
        )
        failure_message = "{}\n\n{}".format(message, table_res)

    return test_name, test_passed, failure_message
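# Hypothetical glue (not part of drum) showing how the refactored per-check
# methods compose: because every check returns the same tuple shape, adding a
# new check only means appending another bound method to the list. The results
# could then be rendered with _print_check_results_sketch() above.
def _run_validation_checks_sketch(runner):
    checks = [runner._basic_batch_prediction_check, runner._null_value_imputation_check]
    return [check() for check in checks]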