def kfold(fold_num, out_dir, intent_train_file, workspace_base_file,
          figure_path, keep_workspace, username, password, iam_apikey,
          url, version, weight_mode, conf_thres, partial_credit_table):
    """Run a k-fold train/test cycle by driving the helper scripts.

    The steps, in order, are: create the folds, train one workspace per
    fold in parallel, test every fold in parallel, tag each test output
    with its fold index, union the outputs into one CSV, then invoke the
    precision-curve, intent-metrics and confusion-matrix scripts.

    Args:
        fold_num: Number of folds; passed to the fold-creation script
            via ``-k`` and used to split ``MAX_TEST_RATE`` between folds.
        out_dir: Directory that receives the union result file; a
            ``KFOLD`` sub-directory of it is used as the working
            directory for the per-fold files.
        intent_train_file: Input handed to the fold-creation script.
        workspace_base_file: Forwarded to the training script via ``-w``.
        figure_path: Output path handed to the precision-curve script.
        keep_workspace: When falsy, the workspaces recorded in the
            per-fold spec files are deleted before returning, whether
            or not the run succeeded.
        username: Forwarded to the train/test scripts (``-u``) and to
            ``delete_workspaces``.
        password: Forwarded to the train/test scripts (``-p``) and to
            ``delete_workspaces``.
        iam_apikey: Forwarded to the train/test scripts (``-a``) and to
            ``delete_workspaces``.
        url: Forwarded to the train/test scripts (``-l``) and to
            ``delete_workspaces``.
        version: Forwarded to the train/test scripts (``-v``) and to
            ``delete_workspaces``.
        weight_mode: Forwarded to the precision-curve script via ``-w``.
        conf_thres: Forwarded to the precision-curve script via
            ``--tau``; converted with ``str`` so numeric values work.
        partial_credit_table: Optional; when not ``None`` it is passed
            to the test script and partial credit is switched on for
            the metrics script.

    Raises:
        RuntimeError: If fold creation, any training or testing process,
            plotting, metrics generation or confusion-matrix generation
            exits with a non-zero status.
    """
    FOLD_TRAIN = 'fold_train'
    FOLD_TEST = 'fold_test'
    WORKSPACE_SPEC = 'fold_workspace'
    WORKSPACE_NAME = 'workspace_name'
    TEST_OUT = 'test_out'

    print('Begin {} with following details:'.format(KFOLD.upper()))
    # The password and API key are deliberately left out of this summary.
    for item, value in (
            (INTENT_FILE_ITEM, intent_train_file),
            (WORKSPACE_BASE_ITEM, workspace_base_file),
            (FIGURE_PATH_ITEM, figure_path),
            (OUT_DIR_ITEM, out_dir),
            (FOLD_NUM_ITEM, fold_num),
            (DO_KEEP_WORKSPACE_ITEM, BOOL_MAP[keep_workspace]),
            (WEIGHT_MODE_ITEM, weight_mode),
            (CONF_THRES_ITEM, conf_thres),
            (WCS_USERNAME_ITEM, username),
            (WCS_BASEURL_ITEM, url),
            (WA_API_VERSION_ITEM, version),
            (PARTIAL_CREDIT_TABLE_ITEM, partial_credit_table),
    ):
        print('{}={}'.format(item, value))

    working_dir = os.path.join(out_dir, KFOLD)
    # exist_ok avoids the check-then-create race of a separate exists().
    os.makedirs(working_dir, exist_ok=True)

    # Prepare folds
    fold_creation = subprocess.run(
        [sys.executable, CREATE_TEST_TRAIN_FOLDS_PATH,
         '-i', intent_train_file,
         '-o', working_dir,
         '-k', str(fold_num)],
        stdout=subprocess.PIPE)
    if fold_creation.returncode != 0:
        raise RuntimeError('Failure in folds creation')
    print('Created {} folds'.format(str(fold_num)))

    # Construct fold params
    fold_params = []
    for idx in range(fold_num):
        fold_dir = os.path.join(working_dir, str(idx))
        fold_params.append({
            FOLD_TRAIN: os.path.join(fold_dir, TRAIN_FILENAME),
            FOLD_TEST: os.path.join(fold_dir, TEST_FILENAME),
            TEST_OUT: os.path.join(fold_dir, TEST_OUT_FILENAME),
            WORKSPACE_SPEC: os.path.join(fold_dir, SPEC_FILENAME),
            WORKSPACE_NAME: '{}_{}'.format(KFOLD, str(idx)),
        })

    # Begin training: one process per fold, all running concurrently.
    # NOTE(review): credentials travel on the child command line, where
    # other local users may be able to see them in a process listing.
    train_processes = []
    for fold_param in fold_params:
        train_args = [sys.executable, TRAIN_CONVERSATION_PATH,
                      '-i', fold_param[FOLD_TRAIN],
                      '-n', fold_param[WORKSPACE_NAME],
                      '-u', username, '-p', password,
                      '-a', iam_apikey, '-l', url, '-v', version,
                      '-w', workspace_base_file]
        # The child receives its own copy of the descriptor, so the
        # parent's handle can be closed as soon as the process has been
        # started. This closes the file for failed folds too.
        with open(fold_param[WORKSPACE_SPEC], 'w') as spec_file:
            train_processes.append(
                subprocess.Popen(train_args, stdout=spec_file))

    train_failure_idx = [idx
                         for idx, process in enumerate(train_processes)
                         if process.wait() != 0]

    try:
        if train_failure_idx:
            # Join the individual indices; joining str(list) would
            # interleave commas between the characters of its repr.
            raise RuntimeError(
                'Fail to train {} fold workspace'.format(
                    ','.join(str(idx) for idx in train_failure_idx)))

        print('Trained {} workspaces'.format(str(fold_num)))

        # Begin testing: the overall rate budget is split between folds.
        fold_test_rate = int(MAX_TEST_RATE / fold_num)
        test_processes = []
        for fold_param in fold_params:
            with open(fold_param[WORKSPACE_SPEC]) as f:
                workspace_id = json.load(f)[WORKSPACE_ID_TAG]
            test_args = [sys.executable, TEST_CONVERSATION_PATH,
                         '-i', fold_param[FOLD_TEST],
                         '-o', fold_param[TEST_OUT],
                         '-u', username, '-p', password,
                         '-a', iam_apikey, '-l', url, '-v', version,
                         '-t', UTTERANCE_COLUMN,
                         '-g', GOLDEN_INTENT_COLUMN,
                         '-w', workspace_id,
                         '-r', str(fold_test_rate),
                         '-m']
            if partial_credit_table is not None:
                test_args += ['--partial_credit_table',
                              partial_credit_table]
            test_processes.append(subprocess.Popen(test_args))

        test_failure_idx_str = [str(idx)
                                for idx, process in enumerate(test_processes)
                                if process.wait() != 0]
        if test_failure_idx_str:
            raise RuntimeError('Fail to test {} fold workspace'.format(
                ','.join(test_failure_idx_str)))

        print('Tested {} workspaces'.format(str(fold_num)))

        test_out_files = [fold_param[TEST_OUT]
                          for fold_param in fold_params]

        # Add a column for the fold number
        for idx, this_file in enumerate(test_out_files):
            this_df = pd.read_csv(this_file, quoting=csv.QUOTE_ALL,
                                  encoding='utf-8', keep_default_na=False)
            this_df['Fold Index'] = idx
            this_df.to_csv(this_file, encoding='utf-8',
                           quoting=csv.QUOTE_ALL, index=False)

        # Union test out
        kfold_result_file = os.path.join(out_dir, KFOLD_UNION_FILE)
        union_df = pd.concat(
            [pd.read_csv(file, quoting=csv.QUOTE_ALL, encoding=UTF_8,
                         keep_default_na=False)
             for file in test_out_files])
        union_df.to_csv(kfold_result_file, encoding='utf-8',
                        quoting=csv.QUOTE_ALL, index=False)
        print("Wrote k-fold result file to {}".format(kfold_result_file))

        classfier_names = ['Fold {}'.format(idx)
                           for idx in range(fold_num)]

        # Every argv entry must be a string, hence str(conf_thres).
        plot_args = [sys.executable, CREATE_PRECISION_CURVE_PATH,
                     '-t', '{} Fold Test'.format(str(fold_num)),
                     '-o', figure_path,
                     '-w', weight_mode,
                     '--tau', str(conf_thres),
                     '-n'] + classfier_names + ['-i'] + test_out_files
        if subprocess.run(plot_args).returncode != 0:
            raise RuntimeError('Failure in plotting curves')

        # splitext drops the extension whatever its length.
        kfold_result_file_base = os.path.splitext(kfold_result_file)[0]

        metrics_args = [sys.executable, INTENT_METRICS_PATH,
                        '-i', kfold_result_file,
                        '-o', kfold_result_file_base + ".metrics.csv",
                        '--partial_credit_on',
                        str(partial_credit_table is not None)]
        if subprocess.run(metrics_args).returncode != 0:
            raise RuntimeError('Failure in generating intent metrics')

        # NOTE(review): the ".confusion_args.csv" suffix looks like a
        # naming slip but is kept because consumers may rely on it.
        confusion_args = [sys.executable, CONFUSION_MATRIX_PATH,
                          '-i', kfold_result_file,
                          '-o',
                          kfold_result_file_base + ".confusion_args.csv"]
        if subprocess.run(confusion_args).returncode != 0:
            raise RuntimeError('Failure in generating confusion matrix')
    finally:
        if not keep_workspace:
            workspace_ids = []
            for idx in range(fold_num):
                if idx in train_failure_idx:
                    continue
                # Best effort: an unreadable spec must neither hide the
                # exception being propagated nor stop the remaining
                # workspaces from being deleted.
                try:
                    with open(fold_params[idx][WORKSPACE_SPEC]) as f:
                        workspace_ids.append(
                            json.load(f)[WORKSPACE_ID_TAG])
                except (OSError, ValueError, KeyError) as err:
                    print('Skipping cleanup of fold {} workspace: {}'
                          .format(idx, err))
            delete_workspaces(username, password, iam_apikey, url,
                              version, workspace_ids)
def kfold(fold_num, temp_dir, intent_train_file, workspace_base_file,
          figure_path, keep_workspace, username, password, weight_mode,
          conf_thres):
    """Run a k-fold train/test cycle by driving the helper scripts.

    The steps, in order, are: create the folds, train one workspace per
    fold in parallel, test every fold in parallel, union the test
    outputs into one CSV inside the working directory, then invoke the
    precision-curve script.

    Args:
        fold_num: Number of folds; passed to the fold-creation script
            via ``-k`` and used to split ``MAX_TEST_RATE`` between folds.
        temp_dir: Directory under which a ``KFOLD`` sub-directory is
            used as the working directory for all generated files.
        intent_train_file: Input handed to the fold-creation script.
        workspace_base_file: Forwarded to the training script via ``-w``.
        figure_path: Output path handed to the precision-curve script.
        keep_workspace: When falsy, the workspaces recorded in the
            per-fold spec files are deleted before returning, whether
            or not the run succeeded.
        username: Forwarded to the train/test scripts (``-u``) and to
            ``delete_workspaces``.
        password: Forwarded to the train/test scripts (``-p``) and to
            ``delete_workspaces``.
        weight_mode: Forwarded to the precision-curve script via ``-w``.
        conf_thres: Forwarded to the precision-curve script via
            ``--tau``; converted with ``str`` so numeric values work.

    Raises:
        RuntimeError: If fold creation, any training or testing process,
            or plotting exits with a non-zero status.
    """
    FOLD_TRAIN = 'fold_train'
    FOLD_TEST = 'fold_test'
    WORKSPACE_SPEC = 'fold_workspace'
    WORKSPACE_NAME = 'workspace_name'
    TEST_OUT = 'test_out'

    print('Begin {} with following details:'.format(KFOLD.upper()))
    # The password is deliberately left out of this summary.
    for item, value in (
            (INTENT_FILE_ITEM, intent_train_file),
            (WORKSPACE_BASE_ITEM, workspace_base_file),
            (FIGURE_PATH_ITEM, figure_path),
            (TEMP_DIR_ITEM, temp_dir),
            (FOLD_NUM_ITEM, fold_num),
            (DO_KEEP_WORKSPACE_ITEM, BOOL_MAP[keep_workspace]),
            (WEIGHT_MODE_ITEM, weight_mode),
            (CONF_THRES_ITEM, conf_thres),
            (WCS_USERNAME_ITEM, username),
    ):
        print('{}={}'.format(item, value))

    working_dir = os.path.join(temp_dir, KFOLD)
    # exist_ok avoids the check-then-create race of a separate exists().
    os.makedirs(working_dir, exist_ok=True)

    # Prepare folds
    fold_creation = subprocess.run(
        [sys.executable, CREATE_TEST_TRAIN_FOLDS_PATH,
         '-i', intent_train_file,
         '-o', working_dir,
         '-k', str(fold_num)],
        stdout=subprocess.PIPE)
    if fold_creation.returncode != 0:
        raise RuntimeError('Failure in folds creation')
    print('Created {} folds'.format(str(fold_num)))

    # Construct fold params
    fold_params = []
    for idx in range(fold_num):
        fold_dir = os.path.join(working_dir, str(idx))
        fold_params.append({
            FOLD_TRAIN: os.path.join(fold_dir, TRAIN_FILENAME),
            FOLD_TEST: os.path.join(fold_dir, TEST_FILENAME),
            TEST_OUT: os.path.join(fold_dir, TEST_OUT_FILENAME),
            WORKSPACE_SPEC: os.path.join(fold_dir, SPEC_FILENAME),
            WORKSPACE_NAME: '{}_{}'.format(KFOLD, str(idx)),
        })

    # Begin training: one process per fold, all running concurrently.
    # NOTE(review): credentials travel on the child command line, where
    # other local users may be able to see them in a process listing.
    train_processes = []
    for fold_param in fold_params:
        train_args = [sys.executable, TRAIN_CONVERSATION_PATH,
                      '-i', fold_param[FOLD_TRAIN],
                      '-n', fold_param[WORKSPACE_NAME],
                      '-u', username, '-p', password,
                      '-w', workspace_base_file]
        # The child receives its own copy of the descriptor, so the
        # parent's handle can be closed as soon as the process has been
        # started. This closes the file for failed folds too.
        with open(fold_param[WORKSPACE_SPEC], 'w') as spec_file:
            train_processes.append(
                subprocess.Popen(train_args, stdout=spec_file))

    train_failure_idx = [idx
                         for idx, process in enumerate(train_processes)
                         if process.wait() != 0]

    try:
        if train_failure_idx:
            # Join the individual indices; joining str(list) would
            # interleave commas between the characters of its repr.
            raise RuntimeError('Fail to train {} fold workspace'.format(
                ','.join(str(idx) for idx in train_failure_idx)))

        print('Trained {} workspaces'.format(str(fold_num)))

        # Begin testing: the overall rate budget is split between folds.
        fold_test_rate = int(MAX_TEST_RATE / fold_num)
        test_processes = []
        for fold_param in fold_params:
            with open(fold_param[WORKSPACE_SPEC]) as f:
                workspace_id = json.load(f)[WORKSPACE_ID_TAG]
            test_args = [sys.executable, TEST_CONVERSATION_PATH,
                         '-i', fold_param[FOLD_TEST],
                         '-o', fold_param[TEST_OUT],
                         '-u', username, '-p', password,
                         '-t', UTTERANCE_COLUMN,
                         '-g', GOLDEN_INTENT_COLUMN,
                         '-w', workspace_id,
                         '-r', str(fold_test_rate),
                         '-m']
            test_processes.append(subprocess.Popen(test_args))

        test_failure_idx_str = [str(idx)
                                for idx, process in enumerate(test_processes)
                                if process.wait() != 0]
        if test_failure_idx_str:
            raise RuntimeError('Fail to test {} fold workspace'.format(
                ','.join(test_failure_idx_str)))

        print('Tested {} workspaces'.format(str(fold_num)))

        test_out_files = [fold_param[TEST_OUT]
                          for fold_param in fold_params]

        # Union test out
        union_df = pd.concat(
            [pd.read_csv(file, quoting=csv.QUOTE_ALL, encoding=UTF_8,
                         keep_default_na=False)
             for file in test_out_files])
        union_df.to_csv(os.path.join(working_dir, KFOLD_UNION_FILE),
                        encoding='utf-8', quoting=csv.QUOTE_ALL,
                        index=False)

        classfier_names = ['Fold {}'.format(idx)
                           for idx in range(fold_num)]

        # Every argv entry must be a string, hence str(conf_thres).
        plot_args = [sys.executable, CREATE_PRECISION_CURVE_PATH,
                     '-t', '{} Fold Test'.format(str(fold_num)),
                     '-o', figure_path,
                     '-w', weight_mode,
                     '--tau', str(conf_thres),
                     '-n'] + classfier_names + ['-i'] + test_out_files
        if subprocess.run(plot_args).returncode != 0:
            raise RuntimeError('Failure in plotting curves')
        print('Generated precision curves for {} folds'.format(
            str(fold_num)))
    finally:
        if not keep_workspace:
            workspace_ids = []
            for idx in range(fold_num):
                if idx in train_failure_idx:
                    continue
                # Best effort: an unreadable spec must neither hide the
                # exception being propagated nor stop the remaining
                # workspaces from being deleted.
                try:
                    with open(fold_params[idx][WORKSPACE_SPEC]) as f:
                        workspace_ids.append(
                            json.load(f)[WORKSPACE_ID_TAG])
                except (OSError, ValueError, KeyError) as err:
                    print('Skipping cleanup of fold {} workspace: {}'
                          .format(idx, err))
            delete_workspaces(username, password, workspace_ids)