def compute_string_kernel():
    args = _parse_args()
    dataset = AESDataset(args.dataset_dir, prompt_num=PROMPT_NUM,
                         is_cross_dataset=IS_CROSS_DATASET)
    x_train_list, y_train_list = dataset.get_train()
    x_dev_list, y_dev_list = dataset.get_dev()
    essay_list, essay_id_list, essay_set_list = dataset.get_test()
    # train_len = sum(list(map(len, x_train_list)))
    # dev_len = sum(list(map(len, x_dev_list)))
    # test_len = sum(list(map(len, essay_list)))
    # print(train_len, dev_len, test_len)

    # Dump each prompt's train/dev/test essays into one text file per prompt,
    # which is the input format expected by the HISK string-kernel tool.
    for idx in range(PROMPT_NUM):
        x_train, y_train = x_train_list[idx], y_train_list[idx]
        x_dev, y_dev = x_dev_list[idx], y_dev_list[idx]
        essay_test, essay_id_test, essay_set_test = \
            essay_list[idx], essay_id_list[idx], essay_set_list[idx]
        all_prompt_essays = np.concatenate([x_train, x_dev, essay_test], axis=0)
        prompt_write_into_txt(essays_of_prompt=all_prompt_essays, prompt=idx + 1)

    hisk_input_dir = ROOT_DIR + '/essay_data/HISK/input'
    hisk_output_dir = ROOT_DIR + '/essay_data/HISK/output1'
    os.makedirs(hisk_output_dir, exist_ok=True)

    # Compute the string-kernel matrix for each prompt.
    for idx in range(PROMPT_NUM):
        # kernel_type can be "intersection", "presence" or "spectrum"
        essays_input = os.path.join(hisk_input_dir, get_prompt_str(prompt=idx + 1))
        essays_output = os.path.join(hisk_output_dir, get_prompt_str(prompt=idx + 1))
        execute_compute_string_kernel_command(kernel_type='intersection',
                                              input_file_1=essays_input,
                                              input_file_2=None,
                                              output_file=essays_output)
def test_v_svr(prompt_idx, gamma=None):
    args = _parse_args()
    dataset = AESDataset(args.dataset_dir, prompt_num=PROMPT_NUM,
                         is_cross_dataset=False)
    x_train_list, y_train_list = dataset.get_train()
    x_dev_list, y_dev_list = dataset.get_dev()
    x_test_list, _, _ = dataset.get_test()
    train_len, dev_len, test_len = len(x_train_list[prompt_idx - 1]), \
        len(x_dev_list[prompt_idx - 1]), len(x_test_list[prompt_idx - 1])

    # Load the pre-scaled libsvm-format features for this prompt and split
    # them back into train/dev/test by their known lengths.
    y, x = svm_read_problem(SVM_SCALE_DIR + '/prompt@' + str(prompt_idx) + '-scale.txt')
    x_train, y_train = x[:train_len], y[:train_len]
    x_dev, y_dev = x[train_len:train_len + dev_len], y[train_len:train_len + dev_len]
    x_test = x[train_len + dev_len:]

    # nu-SVR (-s 4) with an RBF kernel (-t 2); gamma is optional.
    if gamma:
        param = f'-s 4 -t 2 -c 1000 -n 0.1 -g {gamma}'
    else:
        param = '-s 4 -t 2 -c 1000 -n 0.1'
    svm_model = svm_train(y_train + y_dev, x_train + x_dev, param)

    # The real test labels are unknown, so dummy zeros are passed to svm_predict.
    p_label, p_acc, p_val = svm_predict(np.zeros(shape=len(x_test)), x_test, svm_model)
    p_label = np.round(p_label)

    dev_label, dev_acc, dev_val = svm_predict(y_dev, x_dev, svm_model)
    dev_kappa = kappa(y_true=y_dev, y_pred=dev_label, weights='quadratic')
    print(f'Dev kappa: {dev_kappa}')
    return dev_kappa, p_label
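# A minimal usage sketch (not part of the original pipeline): sweep a few
# gamma values for one prompt with test_v_svr and keep the predictions from
# the run with the best dev kappa. The gamma grid below is an assumption,
# not a tuned setting from this repository.
def sweep_v_svr_gamma(prompt_idx, gammas=(0.001, 0.01, 0.1, 1.0)):
    best_kappa, best_preds, best_gamma = -1.0, None, None
    for g in gammas:
        dev_kappa, preds = test_v_svr(prompt_idx, gamma=g)
        if dev_kappa > best_kappa:
            best_kappa, best_preds, best_gamma = dev_kappa, preds, g
    print(f'Best gamma for prompt {prompt_idx}: {best_gamma} (dev kappa {best_kappa})')
    return best_gamma, best_preds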
def correct_essays(save_path=None):
    args = _parse_args()
    dataset = AESDataset(args.dataset_dir, prompt_num=PROMPT_NUM,
                         is_cross_dataset=True)
    x_train_list, y_train_list = dataset.get_train()
    x_dev_list, y_dev_list = dataset.get_dev()
    essay_list, essay_id_list, essay_set_list = dataset.get_test()
    all_essays = np.concatenate((x_train_list, x_dev_list, essay_list), axis=0)
    corrects = correct_language(essay_list=all_essays, save_path=save_path)
    return corrects
def save_predicts(all_predicts):
    fout = open(ROOT_DIR + '/result_output/predicts-'
                + time.strftime("%Y-%m-%d@%H-%M-%S") + '.tsv',
                'w', encoding='utf8')
    args = _parse_args()
    dataset = AESDataset(args.dataset_dir, prompt_num=PROMPT_NUM,
                         is_cross_dataset=IS_CROSS_DATASET)
    essay_list, essay_id_list, essay_set_list = dataset.get_test()
    for i in range(PROMPT_NUM):
        essay_ids = essay_id_list[i]
        essay_sets = essay_set_list[i]
        for idx, prediction in enumerate(all_predicts[i]):
            fout.write(f'{essay_ids[idx]}\t{essay_sets[idx]}\t{prediction}\n')
    fout.close()
def do_ingestion():
    """main entry"""
    LOGGER.info('===== Start integration program.')
    # Parse directories from input arguments
    LOGGER.info('===== Initialize args.')
    args = _parse_args()
    _init_python_path(args)

    dataset = AESDataset(args.dataset_dir, prompt_num=PROMPT_NUM,
                         is_cross_dataset=IS_CROSS_DATASET)
    x_train_list, y_train_list = dataset.get_train()
    x_dev_list, y_dev_list = dataset.get_dev()
    essay_list, essay_id_list, essay_set_list = dataset.get_test()

    score_list = []
    prediction_list = []
    for i in range(PROMPT_NUM):
        log_prompt(entry="Begin handling ", prompt=i + 1)
        x_train, y_train = x_train_list[i], y_train_list[i]
        x_dev, y_dev = x_dev_list[i], y_dev_list[i]
        essay, essay_id, essay_set = essay_list[i], essay_id_list[i], essay_set_list[i]

        umodel = Model(prompt=i + 1, max_iter=1)
        # LOGGER.info("===== Check model methods =====")
        # _check_umodel_methed(umodel)

        dev_score, pred_result = None, None
        while not umodel.done_training:
            LOGGER.info("===== Begin training model =====")
            _train(umodel, (x_train, y_train), (x_dev, y_dev))

            LOGGER.info("===== Begin predicting on test set =====")
            pred_result, pred_result_dev = _predict(umodel, (essay, essay_id, essay_set))
            pred_result_dev = np.round(pred_result_dev)
            dev_score = kappa(y_true=y_dev, y_pred=pred_result_dev)

        log(f"--------------Prompt{i+1} is done, and the dev_score is {dev_score}-------------")
        score_list.append(dev_score)
        prediction_list.append(pred_result)

    # Save results: one time-stamped prediction file and one per-prompt score file.
    score_file = os.path.join(
        args.output_dir, "score-" + time.strftime("%Y-%m-%d@%H-%M-%S") + '.txt')
    prediction_file = os.path.join(
        args.output_dir, "prediction-" + time.strftime("%Y-%m-%d@%H-%M-%S") + '.txt')
    LOGGER.info("===== Begin Saving prediction =====")
    # with open(score_file, 'w', encoding='utf8') as fout:
    #     score_list = [str(score) for score in score_list]
    #     fout.write('\n'.join(score_list) + '\n')
    with open(prediction_file, 'w', encoding='utf8') as fout:
        for prediction in prediction_list:
            for idx in range(len(prediction[0])):
                fout.write(str(prediction[0][idx]) + '\t'
                           + str(prediction[1][idx]) + '\t'
                           + str(prediction[2][idx]) + '\n')
    with open(score_file, 'w', encoding='utf8') as fout1:
        tot = 0.0
        for idx in range(len(score_list)):
            tot += score_list[idx]
            fout1.write(str(idx + 1) + '\t' + str(score_list[idx]) + '\n')
        avg = tot * 1.0 / PROMPT_NUM
        fout1.write("avg_score: " + str(avg) + '\n')
    LOGGER.info("[Ingestion terminated]")
    # (Tail of the feature-extraction routine: append the TF-IDF columns to the
    # handcrafted features and rescale before returning.)
    X_tfidf = vectorizer.transform(X)
    X_tfidf = X_tfidf.toarray()
    X_features = np.c_[X_features, X_tfidf]
    scalar, X_features = features_scalar(X_features)
    return X_features


if __name__ == '__main__':
    # nltk.download('punkt')
    # nltk.download('stopwords')
    # nltk.download('averaged_perceptron_tagger')
    from integration.dataset import AESDataset

    # D = AESDataset(r'../essay_data/DEMO')
    D = AESDataset(ROOT_DIR + '/essay_data/DEMO')
    x_train, y_train = D.get_train()
    X, y = x_train[0][:100], y_train[0][:100]
    # features_dict = get_all_features(X)
    # X_features = concact_features(features_dict)
    # _, X_features = features_scalar(X_features)
    # _, selected_features = features_selector(X_features, y)
    # get_tfidf_features(X)
    X_features, scalar_pure, scalar_tfidf, selector, vectorizer = \
        get_features(X, y, None, 15, True)
    print(X_features.shape)
    print(X_features[:10])
def embedding_predicts(wordvec_dict):
    args = _parse_args()
    dataset = AESDataset(args.dataset_dir, prompt_num=PROMPT_NUM,
                         is_cross_dataset=IS_CROSS_DATASET, use_correct=True)
    x_train_list, y_train_list = dataset.get_train()
    x_dev_list, y_dev_list = dataset.get_dev()
    essay_list, essay_id_list, essay_set_list = dataset.get_test()

    cleaned_dir = ROOT_DIR + '/essay_data/cleaned'
    cleaned_path = os.path.join(cleaned_dir, 'cleaned.txt')
    os.makedirs(cleaned_dir, exist_ok=True)

    # Clean the essays once and cache them, so repeated runs can reload the
    # cleaned text instead of recomputing it.
    if IS_CROSS_DATASET:
        x_train_cleaned = cleanup_essays(x_train_list, logging=True)
        x_dev_cleaned = cleanup_essays(x_dev_list, logging=True)
        x_test_cleaned = cleanup_essays(essay_list, logging=True)
    else:
        if not os.path.exists(cleaned_path):
            x_train_cleaned = [cleanup_essays(x_train_list[i], logging=True)
                               for i in range(PROMPT_NUM)]
            x_dev_cleaned = [cleanup_essays(x_dev_list[i], logging=True)
                             for i in range(PROMPT_NUM)]
            x_test_cleaned = [cleanup_essays(essay_list[i], logging=True)
                              for i in range(PROMPT_NUM)]
            fout = open(cleaned_path, 'w', encoding='utf8')
            for i in range(PROMPT_NUM):
                fout.write('\n'.join(x_train_cleaned[i]) + '\n')
                fout.write('\n'.join(x_dev_cleaned[i]) + '\n')
                fout.write('\n'.join(x_test_cleaned[i]) + '\n')
            fout.close()
        else:
            # Reload the cached cleaned essays and split them back into
            # per-prompt train/dev/test slices by their known lengths.
            x_train_cleaned, x_dev_cleaned, x_test_cleaned = [], [], []
            begin_idx = 0
            with open(cleaned_path, 'r', encoding='utf8') as fin:
                cleaned_essays = [line.strip() for line in fin]
            for prompt_i in range(PROMPT_NUM):
                x_train_cleaned.append(
                    cleaned_essays[begin_idx:begin_idx + len(x_train_list[prompt_i])])
                begin_idx += len(x_train_list[prompt_i])
                x_dev_cleaned.append(
                    cleaned_essays[begin_idx:begin_idx + len(x_dev_list[prompt_i])])
                begin_idx += len(x_dev_list[prompt_i])
                x_test_cleaned.append(
                    cleaned_essays[begin_idx:begin_idx + len(essay_list[prompt_i])])
                begin_idx += len(essay_list[prompt_i])

    prompt_cnt = 0
    k_list = []
    use_regression = True
    model_lib = {
        # LSTM_MODEL: Lstm,
        # CNN_MODEL: Cnn,
        CNN_MULTIPLE: CnnMulInputs,
        LSTM_MULTIPLE: LstmMulInputs,
        # CRNN_MODEL: crnn
    }
    repeat_num = 6
    prompt_predicts = []
    for i in range(0, PROMPT_NUM):
        prompt_cnt += 1
        # Averaged word vectors for dense inputs, per-word sequence vectors
        # for the recurrent/convolutional inputs.
        x_train_vec = np.array([create_average_vec(essay, text_dim=TEXT_DIM, wordvec_dict=wordvec_dict)
                                for essay in x_train_cleaned[i]])
        x_dev_vec = np.array([create_average_vec(essay, text_dim=TEXT_DIM, wordvec_dict=wordvec_dict)
                              for essay in x_dev_cleaned[i]])
        x_test_vec = np.array([create_average_vec(essay, text_dim=TEXT_DIM, wordvec_dict=wordvec_dict)
                               for essay in x_test_cleaned[i]])
        x_train_seq_vec = np.array([create_sequence_vec(essay, text_dim=TEXT_DIM, wordvec_dict=wordvec_dict)
                                    for essay in x_train_cleaned[i]])
        x_dev_seq_vec = np.array([create_sequence_vec(essay, text_dim=TEXT_DIM, wordvec_dict=wordvec_dict)
                                  for essay in x_dev_cleaned[i]])
        x_test_seq_vec = np.array([create_sequence_vec(essay, text_dim=TEXT_DIM, wordvec_dict=wordvec_dict)
                                   for essay in x_test_cleaned[i]])
        y_train = y_train_list[i]
        y_dev = y_dev_list[i]
        max_class, min_class = max(y_train), min(y_train)
        if use_regression:
            output_dim = 1
        else:
            output_dim = max_class + 1

        # Load the precomputed HISK string-kernel rows for this prompt.
        hisk_dir = ROOT_DIR + '/essay_data/HISK/output'
        hisk_all_dir = ROOT_DIR + '/essay_data/HISK/output-all'
        hisk_all = [np.array(line.strip().split()).astype(int)
                    for line in open(hisk_all_dir + '/prompt@' + str(i + 1) + '.txt', 'r', encoding='utf8')]
        hisk_train = [np.array(line.strip().split()).astype(int)
                      for line in open(hisk_dir + '/prompt@' + str(i + 1) + '-train.txt', 'r', encoding='utf8')]
        hisk_dev = [np.array(line.strip().split()).astype(int)
                    for line in open(hisk_dir + '/prompt@' + str(i + 1) + '-dev.txt', 'r', encoding='utf8')]
        hisk_test = [np.array(line.strip().split()).astype(int)
                     for line in open(hisk_dir + '/prompt@' + str(i + 1) + '-test.txt', 'r', encoding='utf8')]
        hisk_train, hisk_dev, hisk_test = np.array(hisk_train), np.array(hisk_dev), np.array(hisk_test)

        # Standardise the full kernel matrix, then re-split it into train/dev/test
        # blocks (this overrides the per-split files read above).
        sscalar = StandardScaler()
        hisk_all = sscalar.fit_transform(hisk_all)
        hisk_train, hisk_dev, hisk_test = \
            np.array(hisk_all[:len(y_train)]), \
            np.array(hisk_all[len(y_train):len(y_train) + len(y_dev)]), \
            np.array(hisk_all[-len(essay_list[i]):])

        x_train_vec = np.concatenate([x_train_vec, hisk_train], axis=-1)
        x_dev_vec = np.concatenate([x_dev_vec, hisk_dev], axis=-1)
        x_test_vec = np.concatenate([x_test_vec, hisk_test], axis=-1)
        # Experimental overrides kept from the original code: the last assignment
        # wins, so the sequence vectors are what actually feed the models below.
        x_train_vec = hisk_train
        x_dev_vec = hisk_dev
        x_test_vec = hisk_test
        x_train_vec = x_train_seq_vec
        x_dev_vec = x_dev_seq_vec
        x_test_vec = x_test_seq_vec

        print(f'Prompt@{i+1}, num_classes: {max_class-min_class+1}; '
              f'x_train shape: {np.array(x_train_vec).shape}, y_train shape: {np.array(y_train).shape}; '
              f'x_dev shape: {np.array(x_dev_vec).shape}, y_dev shape: {np.array(y_dev).shape}; '
              f'x_test shape: {np.array(x_test_vec).shape}, y_test shape: {np.array(essay_list[i]).shape}')

        total_predicts = []
        for model_name in model_lib.keys():
            predicts_list = []
            dev_predicts_list = []
            # Train each model repeat_num times to average out initialisation noise.
            for idx in range(repeat_num):
                x_train_input = x_train_vec
                x_dev_input = x_dev_vec
                x_test_input = x_test_vec
                my_model = model_lib[model_name]()
                if 'mul' in model_name:
                    # Multi-input models take the sequence vectors plus the HISK features.
                    my_model.init_model(prompt=i + 1,
                                        input_shape1=x_train_vec.shape[1:],
                                        input_shape2=np.array(hisk_train).shape[-1],
                                        output_dim=output_dim)
                    x_train_input = [x_train_vec, hisk_train]
                    x_dev_input = [x_dev_vec, hisk_dev]
                    x_test_input = [x_test_vec, hisk_test]
                else:
                    my_model.init_model(input_shape=x_train_vec.shape[1:], output_dim=output_dim)
                my_model.fit(x_train_input, y_train, x_dev_input, y_dev, train_loop_num=1)
                predicts = np.round(my_model.predict(x_test_input)).reshape(-1, 1)
                dev_predicts = np.round(my_model.predict(x_dev_input)).reshape(-1, 1)
                # predicts = mmscaler.inverse_transform(predicts)
                predicts_list.append(predicts)
                dev_predicts_list.append(dev_predicts)

            # Keep only the repeats whose dev kappa is at least the average,
            # and average their test predictions.
            dev_kappa_list = []
            for dev_predict in dev_predicts_list:
                dev_kappa = kappa(y_true=y_dev, y_pred=dev_predict, weights="quadratic")
                dev_kappa_list.append(dev_kappa)
            aver_dev_kappa = np.mean(dev_kappa_list)
            cmp_kappa, cmp_kappa_list = aver_dev_kappa, dev_kappa_list
            selected_list = [predict for predict, kp in zip(predicts_list, cmp_kappa_list)
                             if kp >= cmp_kappa]
            aver_predicts = np.mean(np.concatenate(selected_list, axis=-1), axis=-1)
            total_predicts.append(aver_predicts.reshape(-1, 1))

        # Ensemble across model types for this prompt.
        ensemble_predicts = np.mean(np.concatenate(total_predicts, axis=-1), axis=-1)
        prompt_predicts.append(ensemble_predicts)

    os.makedirs(ROOT_DIR + '/result_output', exist_ok=True)
    save_predicts(prompt_predicts)
def convert_to_libsvm_input_format(is_contain_test=True, is_scale_y=False, split_train_dev=False):
    hisk_output_dir = os.path.join(ROOT_DIR, 'essay_data/HISK/output-all')
    libsvm_input_dir = LIBSVM_DIR
    os.makedirs(libsvm_input_dir, exist_ok=True)

    args = _parse_args()
    dataset = AESDataset(args.dataset_dir, prompt_num=PROMPT_NUM,
                         is_cross_dataset=IS_CROSS_DATASET)
    x_train_list, y_train_list = dataset.get_train()
    x_dev_list, y_dev_list = dataset.get_dev()
    x_test_list, x_test_ids, x_test_sets = dataset.get_test()

    for file in os.listdir(hisk_output_dir):
        hisk_file = os.path.join(hisk_output_dir, file)
        if is_contain_test:
            libsvm_file = os.path.join(
                libsvm_input_dir, file.split('.')[0] + '-libsvm.txt')
        else:
            libsvm_file = os.path.join(
                libsvm_input_dir, file.split('.')[0] + '-libsvm-notest.txt')
        if os.path.exists(libsvm_file) and not split_train_dev:
            continue

        # Labels: real scores for train/dev, dummy zeros for the unlabeled test set.
        prompt_idx = int(re.findall(r'\d+', file)[0])
        if is_contain_test:
            y_test = np.zeros(shape=len(x_test_list[prompt_idx - 1]))
            y_labels = np.concatenate([
                y_train_list[prompt_idx - 1], y_dev_list[prompt_idx - 1], y_test
            ], axis=0)
        else:
            y_labels = np.concatenate(
                [y_train_list[prompt_idx - 1], y_dev_list[prompt_idx - 1]], axis=0)
        if is_scale_y:
            mm_scaler = MinMaxScaler(feature_range=(0, 1))
            y_labels = mm_scaler.fit_transform(
                np.array(y_labels).reshape(-1, 1)).reshape(-1)
        if DEBUG:
            print(f'prompt@{prompt_idx} shape: {y_labels.shape}')

        # Write "<label> 1:<f1> 2:<f2> ..." lines in libsvm format.
        with open(hisk_file, 'r', encoding='utf8') as fin, \
                open(libsvm_file, 'w', encoding='utf8') as fout:
            x_feas = [line.strip().split() for line in fin][:len(y_labels)]
            for idx, x_fea in enumerate(x_feas):
                fea_idx = 1
                fout.write(str(y_labels[idx]))
                for each_fea in x_fea:
                    fout.write(' ' + str(fea_idx) + ':' + each_fea)
                    fea_idx += 1
                fout.write('\n')

        if split_train_dev:
            libsvm_train_file = os.path.join(
                libsvm_input_dir, file.split('.')[0] + '-libsvm-train.txt')
            libsvm_dev_file = os.path.join(
                libsvm_input_dir, file.split('.')[0] + '-libsvm-dev.txt')
            with open(libsvm_file, 'r', encoding='utf8') as fin:
                fout1 = open(libsvm_train_file, 'w', encoding='utf8')
                fout2 = open(libsvm_dev_file, 'w', encoding='utf8')
                train_len = len(x_train_list[prompt_idx - 1])
                for idx, line in enumerate(fin):
                    if idx < train_len:
                        fout1.write(line.strip() + '\n')
                    else:
                        fout2.write(line.strip() + '\n')
                fout1.close()
                fout2.close()

    execute_scale_command()
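# Hedged sketch of how these utilities appear to fit together; the actual
# driver script is not shown in this file, so treat the ordering (and the
# helper name run_hisk_svr_pipeline) as an assumption:
# 1) compute_string_kernel() writes the HISK kernel matrices,
# 2) convert_to_libsvm_input_format() converts and scales them into libsvm files,
# 3) test_v_svr() trains/evaluates a nu-SVR per prompt,
# 4) save_predicts() writes the test predictions to a TSV.
def run_hisk_svr_pipeline():
    compute_string_kernel()
    convert_to_libsvm_input_format(is_contain_test=True)
    all_predicts = []
    for prompt in range(1, PROMPT_NUM + 1):
        dev_kappa, p_label = test_v_svr(prompt)
        all_predicts.append(p_label)
    save_predicts(all_predicts)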