def test_v_svr(prompt_idx, gamma=None):
    """Fit a nu-SVR on the scaled libsvm features of one prompt.

    Trains on train+dev, reports the quadratic-weighted kappa on the dev
    split, and returns (dev_kappa, rounded_test_predictions).
    """
    args = _parse_args()
    dataset = AESDataset(args.dataset_dir, prompt_num=PROMPT_NUM,
                         is_cross_dataset=False)
    x_train_list, y_train_list = dataset.get_train()
    x_dev_list, y_dev_list = dataset.get_dev()
    x_test_list, _, _ = dataset.get_test()

    prompt_pos = prompt_idx - 1
    n_train = len(x_train_list[prompt_pos])
    n_dev = len(x_dev_list[prompt_pos])
    n_test = len(x_test_list[prompt_pos])  # kept for split bookkeeping parity

    # The scale file stores train, dev and test rows back to back, so the
    # split lengths above are used to carve it up again.
    scale_path = SVM_SCALE_DIR + '/prompt@' + str(prompt_idx) + '-scale.txt'
    y, x = svm_read_problem(scale_path)
    x_train, y_train = x[:n_train], y[:n_train]
    x_dev, y_dev = x[n_train:n_train + n_dev], y[n_train:n_train + n_dev]
    x_test = x[n_train + n_dev:]

    # -s 4: nu-SVR, -t 2: RBF kernel; gamma only appended when truthy,
    # matching the original truthiness check.
    param = '-s 4 -t 2 -c 1000 -n 0.1'
    if gamma:
        param += f' -g {gamma}'
    svm_model = svm_train(y_train + y_dev, x_train + x_dev, param)

    # Test labels are unknown; zeros are dummies for svm_predict's API.
    p_label, p_acc, p_val = svm_predict(np.zeros(shape=len(x_test)), x_test,
                                        svm_model)
    p_label = np.round(p_label)
    dev_label, dev_acc, dev_val = svm_predict(y_dev, x_dev, svm_model)
    dev_kappa = kappa(y_true=y_dev, y_pred=dev_label, weights='quadratic')
    print(f'Dev kappa: {dev_kappa}')
    return dev_kappa, p_label
def predict(self, x_test, is_dev):
    """Predict on x_test while maintaining the k-best snapshot pool.

    When is_dev is True the raw model predictions are returned directly.
    Otherwise the dev-set kappa of the current model decides whether this
    snapshot replaces the weakest entry in the k-best pool, and the blended
    ensemble over the pool is returned.
    """
    # x_test = self._model.preprocess_data(x_test)
    # x_test = self.preprocess_data(x_test, is_test=True)
    y_pred = self._model.predict(x_test)
    if is_dev:
        return y_pred

    x_dev, y_dev = self._dev_set
    # x_dev = self.preprocess_data(x_dev, is_test=True)
    y_pred_dev = self._model.predict(x_dev)
    current_kappa = kappa(y_true=y_dev, y_pred=y_pred_dev)

    # The pool is kept sorted by kappa, descending, so index -1 is the
    # weakest snapshot. Replace it when beaten, then restore the order.
    if self._k_best_kappa[-1] < current_kappa:
        self._k_best_kappa[-1] = current_kappa
        self._k_best_predicts[-1] = y_pred
        self._k_best_predicts_dev[-1] = y_pred_dev
        ranked = sorted(
            zip(self._k_best_kappa, self._k_best_predicts,
                self._k_best_predicts_dev),
            key=lambda triple: triple[0],
            reverse=True)
        for pos, (kp, pred, pred_dev) in enumerate(ranked):
            self._k_best_kappa[pos] = kp
            self._k_best_predicts[pos] = pred
            self._k_best_predicts_dev[pos] = pred_dev
    return self._blending_ensemble()
def objective(hyperparams):
    """Hyperopt objective: fit a GBR with the candidate hyperparameters
    and score it on the dev set.

    Returns a hyperopt result dict; the loss is the negated kappa because
    hyperopt minimizes.
    """
    model = gbr(**params, **hyperparams)
    model.fit(x_train, y_train)
    y_pred_dev = model.predict(x_dev)
    # Fix: arguments were swapped (y_true=y_pred_dev, y_pred=y_dev).
    # Kappa is symmetric so the value is unchanged, but this now matches
    # the y_true=<labels> convention used elsewhere in the file (see _dev).
    score = kappa(y_true=y_dev, y_pred=y_pred_dev)
    return {'loss': -score, 'status': STATUS_OK}
def objective(hyperparams):
    """Hyperopt objective: fit an XGBClassifier with the candidate
    hyperparameters and score it on the dev set.

    Returns a hyperopt result dict; the loss is the negated kappa because
    hyperopt minimizes.
    """
    model = XGBClassifier(**params, **hyperparams)
    # model.fit(x_train, y_train, eval_set=[(x_dev, y_dev)], early_stopping_rounds=50, eval_metric='merror', verbose=True)
    model.fit(X=x_train, y=y_train, eval_metric=self._metrics)
    y_pred_dev = model.predict(x_dev)
    # Fix: arguments were swapped (y_true=y_pred_dev, y_pred=y_dev).
    # Kappa is symmetric so the value is unchanged, but this now matches
    # the y_true=<labels> convention used elsewhere in the file (see _dev).
    score = kappa(y_true=y_dev, y_pred=y_pred_dev)
    return {'loss': -score, 'status': STATUS_OK}
def _dev(umodel, dataset):
    """Validate umodel on the dev split and return its kappa score.

    Logs the elapsed time together with the score.
    """
    timer = Timer()
    dev_x, dev_y = dataset
    # is_dev=True asks the model for raw predictions on this split.
    predictions = umodel.predict(dev_x, is_dev=True)
    score = kappa(y_true=dev_y, y_pred=predictions)
    duration = timer.get_duration()
    LOGGER.info(
        f"Finished validating the model. time spent {duration} sec. Validate score: {score}"
    )
    return score
def do_ingestion():
    """Main ingestion entry point.

    Trains one Model per prompt until it reports done_training, predicts on
    the test split, and writes per-prompt predictions and dev-kappa scores
    (plus their average) to timestamped files under args.output_dir.
    """
    LOGGER.info('===== Start integration program.')
    # Parse directories from input arguments
    LOGGER.info('===== Initialize args.')
    args = _parse_args()
    _init_python_path(args)

    dataset = AESDataset(args.dataset_dir,
                         prompt_num=PROMPT_NUM,
                         is_cross_dataset=IS_CROSS_DATASET)
    x_train_list, y_train_list = dataset.get_train()
    x_dev_list, y_dev_list = dataset.get_dev()
    essay_list, essay_id_list, essay_set_list = dataset.get_test()

    score_list = []
    prediction_list = []
    for i in range(PROMPT_NUM):
        log_prompt(entry="Begin handling ", prompt=i + 1)
        x_train, y_train = x_train_list[i], y_train_list[i]
        x_dev, y_dev = x_dev_list[i], y_dev_list[i]
        essay, essay_id, essay_set = essay_list[i], essay_id_list[
            i], essay_set_list[i]

        umodel = Model(prompt=i + 1, max_iter=1)
        dev_score, pred_result = None, None
        while not umodel.done_training:
            # Fix: dropped pointless f-prefix on a placeholder-free string.
            LOGGER.info("===== Begin training model =====")
            _train(umodel, (x_train, y_train), (x_dev, y_dev))
            LOGGER.info("===== Begin predicting on test set =====")
            pred_result, pred_result_dev = _predict(
                umodel, (essay, essay_id, essay_set))
            # Predictions are continuous; round before scoring with kappa.
            pred_result_dev = np.round(pred_result_dev)
            dev_score = kappa(y_true=y_dev, y_pred=pred_result_dev)
        log(f"--------------Prompt{i+1} is done, and the dev_score is {dev_score}-------------"
            )
        score_list.append(dev_score)
        prediction_list.append(pred_result)

    # Save results. Fix: take the timestamp once so the score and the
    # prediction file names can never disagree across a second boundary.
    timestamp = time.strftime("%Y-%m-%d@%H-%M-%S")
    score_file = os.path.join(args.output_dir, "score-" + timestamp + '.txt')
    prediction_file = os.path.join(args.output_dir,
                                   "prediction-" + timestamp + '.txt')
    LOGGER.info("===== Begin Saving prediction =====")
    # Fix: 'utf' is an obscure codec alias; use the file's 'utf8' convention.
    with open(prediction_file, 'w', encoding='utf8') as fout:
        # prediction rows are (id, set, score) column triples.
        for prediction in prediction_list:
            for idx in range(len(prediction[0])):
                fout.write(
                    str(prediction[0][idx]) + '\t' + str(prediction[1][idx]) +
                    '\t' + str(prediction[2][idx]) + '\n')
    with open(score_file, 'w', encoding='utf8') as fout1:
        tot = 0.0
        for idx, score in enumerate(score_list):
            tot += score
            fout1.write(str(idx + 1) + '\t' + str(score) + '\n')
        avg = tot * 1.0 / PROMPT_NUM
        fout1.write("avg_score: " + str(avg) + '\n')
    LOGGER.info("[Ingestion terminated]")
def embedding_predicts(wordvec_dict):
    """Train the embedding-based prompt models and save ensemble predictions.

    Pipeline per prompt: clean essays (cached on disk for the single-dataset
    case), build averaged / sequence word vectors, load HISK feature files,
    train each model in model_lib repeat_num times, keep the above-average
    repeats by dev kappa, and average them into one prediction vector.
    Results are written via save_predicts.

    NOTE(review): this function contains several experiment leftovers that
    are preserved verbatim below and flagged inline — confirm intent before
    cleaning them up.
    """
    args = _parse_args()
    dataset = AESDataset(args.dataset_dir, prompt_num=PROMPT_NUM,
                         is_cross_dataset=IS_CROSS_DATASET, use_correct=True)
    x_train_list, y_train_list = dataset.get_train()
    x_dev_list, y_dev_list = dataset.get_dev()
    essay_list, essay_id_list, essay_set_list = dataset.get_test()

    # Cleaned-essay cache location; created up front so the write path works.
    cleaned_dir = ROOT_DIR + '/essay_data/cleaned'
    cleaned_path = os.path.join(cleaned_dir, 'cleaned.txt')
    os.makedirs(cleaned_dir, exist_ok=True)
    if IS_CROSS_DATASET:
        # Cross-dataset mode: splits are flat lists, no per-prompt nesting.
        x_train_cleaned = cleanup_essays(x_train_list, logging=True)
        x_dev_cleaned = cleanup_essays(x_dev_list, logging=True)
        x_test_cleaned = cleanup_essays(essay_list, logging=True)
    else:
        if not os.path.exists(cleaned_path):
            # First run: clean per prompt and persist train/dev/test blocks
            # back to back (the read path below relies on this exact order).
            x_train_cleaned = [cleanup_essays(x_train_list[i], logging=True)
                               for i in range(PROMPT_NUM)]
            x_dev_cleaned = [cleanup_essays(x_dev_list[i], logging=True)
                             for i in range(PROMPT_NUM)]
            x_test_cleaned = [cleanup_essays(essay_list[i], logging=True)
                              for i in range(PROMPT_NUM)]
            fout = open(cleaned_path, 'w', encoding='utf8')
            for i in range(PROMPT_NUM):
                fout.write('\n'.join(x_train_cleaned[i]) + '\n')
                fout.write('\n'.join(x_dev_cleaned[i]) + '\n')
                fout.write('\n'.join(x_test_cleaned[i]) + '\n')
            fout.close()
        else:
            # Cache hit: slice the flat file back into per-prompt splits
            # using the current split lengths as offsets.
            x_train_cleaned, x_dev_cleaned, x_test_cleaned = [], [], []
            begin_idx = 0
            with open(cleaned_path, 'r', encoding='utf8') as fin:
                cleaned_essays = [line.strip() for line in fin]
            for prompt_i in range(PROMPT_NUM):
                x_train_cleaned.append(
                    cleaned_essays[begin_idx:begin_idx + len(x_train_list[prompt_i])])
                begin_idx += len(x_train_list[prompt_i])
                x_dev_cleaned.append(
                    cleaned_essays[begin_idx:begin_idx + len(x_dev_list[prompt_i])])
                begin_idx += len(x_dev_list[prompt_i])
                x_test_cleaned.append(
                    cleaned_essays[begin_idx:begin_idx + len(essay_list[prompt_i])])
                begin_idx += len(essay_list[prompt_i])

    prompt_cnt = 0
    k_list = []  # NOTE(review): never appended to below — apparent leftover.
    use_regression = True  # True -> single regression output per model.
    model_lib = {
        # LSTM_MODEL: Lstm,
        # CNN_MODEL: Cnn,
        CNN_MULTIPLE: CnnMulInputs,
        LSTM_MULTIPLE: LstmMulInputs,
        # CRNN_MODEL: crnn
    }
    repeat_num = 6  # training repeats per model, averaged below
    prompt_predicts = []
    for i in range(0, PROMPT_NUM):
        prompt_cnt += 1
        # Averaged word vectors (one vector per essay) ...
        x_train_vec = np.array([create_average_vec(essay, text_dim=TEXT_DIM,
                                                   wordvec_dict=wordvec_dict)
                                for essay in x_train_cleaned[i]])
        x_dev_vec = np.array([create_average_vec(essay, text_dim=TEXT_DIM,
                                                 wordvec_dict=wordvec_dict)
                              for essay in x_dev_cleaned[i]])
        x_test_vec = np.array([create_average_vec(essay, text_dim=TEXT_DIM,
                                                  wordvec_dict=wordvec_dict)
                               for essay in x_test_cleaned[i]])
        # ... and per-token sequence vectors for the sequence models.
        x_train_seq_vec = np.array([create_sequence_vec(essay, text_dim=TEXT_DIM,
                                                        wordvec_dict=wordvec_dict)
                                    for essay in x_train_cleaned[i]])
        x_dev_seq_vec = np.array([create_sequence_vec(essay, text_dim=TEXT_DIM,
                                                      wordvec_dict=wordvec_dict)
                                  for essay in x_dev_cleaned[i]])
        x_test_seq_vec = np.array([create_sequence_vec(essay, text_dim=TEXT_DIM,
                                                       wordvec_dict=wordvec_dict)
                                   for essay in x_test_cleaned[i]])
        y_train = y_train_list[i]
        y_dev = y_dev_list[i]
        max_class, min_class = max(y_train), min(y_train)
        if use_regression:
            output_dim = 1
        else:
            output_dim = max_class + 1

        # HISK (histogram intersection string kernel) feature files.
        # NOTE(review): the open() calls in these comprehensions are never
        # closed — file handles leak until GC.
        hisk_dir = ROOT_DIR + '/essay_data/HISK/output'
        hisk_all_dir = ROOT_DIR + '/essay_data/HISK/output-all'
        hisk_all = [np.array(line.strip().split()).astype(int)
                    for line in open(hisk_all_dir + '/prompt@' + str(i+1) + '.txt',
                                     'r', encoding='utf8')]
        hisk_train = [np.array(line.strip().split()).astype(int)
                      for line in open(hisk_dir + '/prompt@' + str(i+1) + '-train.txt',
                                       'r', encoding='utf8')]
        hisk_dev = [np.array(line.strip().split()).astype(int)
                    for line in open(hisk_dir + '/prompt@' + str(i+1) + '-dev.txt',
                                     'r', encoding='utf8')]
        hisk_test = [np.array(line.strip().split()).astype(int)
                     for line in open(hisk_dir + '/prompt@' + str(i+1) + '-test.txt',
                                      'r', encoding='utf8')]
        hisk_train, hisk_dev, hisk_test = np.array(hisk_train), np.array(hisk_dev),\
            np.array(hisk_test)
        # Standardize on the combined file, then re-split by split lengths —
        # this overwrites the per-split arrays loaded just above.
        sscalar = StandardScaler()
        hisk_all = sscalar.fit_transform(hisk_all)
        hisk_train, hisk_dev, hisk_test = np.array(hisk_all[:len(y_train)]),\
            np.array(hisk_all[len(y_train):len(y_train)+len(y_dev)]),\
            np.array(hisk_all[-len(essay_list[i]):])

        # NOTE(review): the next three assignment groups successively
        # overwrite x_*_vec (concat -> hisk-only -> sequence vectors); only
        # the final sequence-vector assignment takes effect. The earlier
        # ones look like switchable experiment variants — confirm before
        # removing.
        x_train_vec = np.concatenate([x_train_vec, hisk_train], axis=-1)
        x_dev_vec = np.concatenate([x_dev_vec, hisk_dev], axis=-1)
        x_test_vec = np.concatenate([x_test_vec, hisk_test], axis=-1)
        x_train_vec = hisk_train
        x_dev_vec = hisk_dev
        x_test_vec = hisk_test
        x_train_vec = x_train_seq_vec
        x_dev_vec = x_dev_seq_vec
        x_test_vec = x_test_seq_vec
        print(f'Prompt@{i+1}, num_classes: {max_class-min_class+1}; '
              f'x_train shape: {np.array(x_train_vec).shape}, y_train shape: {np.array(y_train).shape}; '
              f'x_dev shape: {np.array(x_dev_vec).shape}, y_dev shape: {np.array(y_dev).shape}; '
              f'x_test shape: {np.array(x_test_vec).shape}, y_test shape: {np.array(essay_list[i]).shape}')

        total_predicts = []
        for model_name in model_lib.keys():
            predicts_list = []
            dev_predicts_list = []
            for idx in range(repeat_num):
                x_train_input = x_train_vec
                x_dev_input = x_dev_vec
                x_test_input = x_test_vec
                my_model = model_lib[model_name]()
                if 'mul' in model_name:
                    # Multi-input models take (sequence vectors, HISK features).
                    my_model.init_model(prompt=i+1,
                                        input_shape1=x_train_vec.shape[1:],
                                        input_shape2=np.array(hisk_train).shape[-1],
                                        output_dim=output_dim)
                    x_train_input = [x_train_vec, hisk_train]
                    x_dev_input = [x_dev_vec, hisk_dev]
                    x_test_input = [x_test_vec, hisk_test]
                else:
                    my_model.init_model(input_shape=x_train_vec.shape[1:],
                                        output_dim=output_dim)
                my_model.fit(x_train_input, y_train, x_dev_input, y_dev,
                             train_loop_num=1)
                # Round continuous outputs to score values; column vectors so
                # repeats can be concatenated along axis -1 below.
                predicts = np.round(my_model.predict(x_test_input)).reshape(-1, 1)
                dev_predicts = np.round(my_model.predict(x_dev_input)).reshape(-1, 1)
                # predicts = mmscaler.inverse_transform(predicts)
                predicts_list.append(predicts)
                dev_predicts_list.append(dev_predicts)
            # Quadratic-weighted kappa of each repeat on the dev split.
            dev_kappa_list = []
            for dev_predict in dev_predicts_list:
                dev_kappa = kappa(y_true=y_dev, y_pred=dev_predict,
                                  weights="quadratic")
                dev_kappa_list.append(dev_kappa)
            aver_dev_kappa = np.mean(dev_kappa_list)
            # NOTE(review): 'cmp_kapaa' is a typo for 'cmp_kappa' — kept as-is.
            cmp_kapaa, cmp_kappa_list = aver_dev_kappa, dev_kappa_list
            # Keep only the repeats scoring at or above the mean dev kappa,
            # then average them into this model's prediction.
            selected_list = [predict for predict, kp in
                             zip(predicts_list, cmp_kappa_list) if kp >= cmp_kapaa]
            aver_predicts = np.mean(np.concatenate(selected_list, axis=-1), axis=-1)
            total_predicts.append(aver_predicts.reshape(-1, 1))
        # Ensemble across the different model architectures by averaging.
        ensemble_predicts = np.mean(np.concatenate(total_predicts, axis=-1), axis=-1)
        prompt_predicts.append(ensemble_predicts)
    os.makedirs(ROOT_DIR + '/result_output', exist_ok=True)
    save_predicts(prompt_predicts)