Example #1
def compute_string_kernel():
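    """Write each prompt's essays (train + dev + test) into HISK input files,
    then run the external string-kernel tool (intersection kernel) per prompt."""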
    args = _parse_args()
    dataset = AESDataset(args.dataset_dir, prompt_num=PROMPT_NUM, is_cross_dataset=IS_CROSS_DATASET)
    x_train_list, y_train_list = dataset.get_train()
    x_dev_list, y_dev_list = dataset.get_dev()
    essay_list, essay_id_list, essay_set_list = dataset.get_test()

    # train_len = sum(list(map(len, x_train_list)))
    # dev_len = sum(list(map(len, x_dev_list)))
    # test_len = sum(list(map(len, essay_list)))
    # print(train_len, dev_len, test_len)

    for idx in range(PROMPT_NUM):
        x_train, y_train = x_train_list[idx], y_train_list[idx]
        x_dev, y_dev = x_dev_list[idx], y_dev_list[idx]
        essay_test, essay_id_test, essay_set_test = essay_list[idx], essay_id_list[idx], essay_set_list[idx]
        all_prompt_essays = np.concatenate([x_train, x_dev, essay_test], axis=0)
        prompt_write_into_txt(essays_of_prompt=all_prompt_essays, prompt=idx+1)

    hisk_input_dir = ROOT_DIR + '/essay_data/HISK/input'
    hisk_output_dir = ROOT_DIR + '/essay_data/HISK/output1'
    os.makedirs(hisk_output_dir, exist_ok=True)
    # Compute kernel string matrix
    for idx in range(PROMPT_NUM):
        # kernel_type can be "intersection", "presence" or "spectrum"
        essays_input = os.path.join(hisk_input_dir, get_prompt_str(prompt=idx+1))
        essays_output = os.path.join(hisk_output_dir, get_prompt_str(prompt=idx+1))
        execute_compute_string_kernel_command(kernel_type='intersection',
                                              input_file_1=essays_input,
                                              input_file_2=None,
                                              output_file=essays_output)

Example #2
def test_v_svr(prompt_idx, gamma=None):
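    """Train a nu-SVR on the scaled libsvm features of one prompt (train + dev),
    predict the test rows, and return the dev-set quadratic kappa plus the
    rounded test predictions."""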
    args = _parse_args()
    dataset = AESDataset(args.dataset_dir,
                         prompt_num=PROMPT_NUM,
                         is_cross_dataset=False)
    x_train_list, y_train_list = dataset.get_train()
    x_dev_list, y_dev_list = dataset.get_dev()
    x_test_list, _, _ = dataset.get_test()
    train_len = len(x_train_list[prompt_idx - 1])
    dev_len = len(x_dev_list[prompt_idx - 1])
    test_len = len(x_test_list[prompt_idx - 1])

    y, x = svm_read_problem(SVM_SCALE_DIR + '/prompt@' + str(prompt_idx) + '-scale.txt')
    x_train, y_train = x[:train_len], y[:train_len]
    x_dev, y_dev = x[train_len:train_len + dev_len], y[train_len:train_len + dev_len]
    x_test = x[train_len + dev_len:]

    # libsvm flags: -s 4 = nu-SVR, -t 2 = RBF kernel, -c = cost, -n = nu
    if gamma:
        param = f'-s 4 -t 2 -c 1000 -n 0.1 -g {gamma}'
    else:
        param = '-s 4 -t 2 -c 1000 -n 0.1'
    svm_model = svm_train(y_train + y_dev, x_train + x_dev, param)
    p_label, p_acc, p_val = svm_predict(np.zeros(shape=len(x_test)), x_test,
                                        svm_model)
    p_label = np.round(p_label)

    dev_label, dev_acc, dev_val = svm_predict(y_dev, x_dev, svm_model)
    dev_kappa = kappa(y_true=y_dev, y_pred=dev_label, weights='quadratic')
    print(f'Dev kappa: {dev_kappa}')
    return dev_kappa, p_label
Example #3
def correct_essays(save_path=None):
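    """Concatenate the train, dev and test essays of the cross-dataset split and
    run them through correct_language() (save_path is forwarded unchanged)."""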
    args = _parse_args()
    dataset = AESDataset(args.dataset_dir, prompt_num=PROMPT_NUM, is_cross_dataset=True)
    x_train_list, y_train_list = dataset.get_train()
    x_dev_list, y_dev_list = dataset.get_dev()
    essay_list, essay_id_list, essay_set_list = dataset.get_test()
    all_essays = np.concatenate((x_train_list, x_dev_list, essay_list), axis=0)
    corrects = correct_language(essay_list=all_essays, save_path=save_path)
    return corrects
Example #4
def save_predicts(all_predicts):
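    """Write per-prompt predictions to a timestamped TSV file with one
    essay_id / essay_set / prediction row per essay."""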
    fout = open(ROOT_DIR + '/result_output/predicts-' + time.strftime("%Y-%m-%d@%H-%M-%S") + '.tsv',
                'w', encoding='utf8')
    args = _parse_args()
    dataset = AESDataset(args.dataset_dir, prompt_num=PROMPT_NUM, is_cross_dataset=IS_CROSS_DATASET)
    essay_list, essay_id_list, essay_set_list = dataset.get_test()
    for i in range(PROMPT_NUM):
        essay_ids = essay_id_list[i]
        essay_sets = essay_set_list[i]
        for idx, prediction in enumerate(all_predicts[i]):
            fout.write(f'{essay_ids[idx]}\t{essay_sets[idx]}\t{prediction}\n')
    fout.close()
Example #5
def do_ingestion():
    """main entry"""
    LOGGER.info('===== Start integration program.')
    # Parse directories from input arguments
    LOGGER.info('===== Initialize args.')
    args = _parse_args()
    _init_python_path(args)

    dataset = AESDataset(args.dataset_dir,
                         prompt_num=PROMPT_NUM,
                         is_cross_dataset=IS_CROSS_DATASET)
    x_train_list, y_train_list = dataset.get_train()
    x_dev_list, y_dev_list = dataset.get_dev()
    essay_list, essay_id_list, essay_set_list = dataset.get_test()

    score_list = []
    prediction_list = []
    for i in range(PROMPT_NUM):
        log_prompt(entry="Begin handling ", prompt=i + 1)
        x_train, y_train = x_train_list[i], y_train_list[i]
        x_dev, y_dev = x_dev_list[i], y_dev_list[i]
        essay, essay_id, essay_set = essay_list[i], essay_id_list[i], essay_set_list[i]
        umodel = Model(prompt=i + 1, max_iter=1)
        # LOGGER.info("===== Check model methods =====")
        # _check_umodel_methed(umodel)

        dev_score, pred_result = None, None
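        # Train and predict repeatedly until the model reports done_training;
        # pred_result is written out later as three tab-separated columns
        # (presumably essay_id, essay_set and the predicted score).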
        while not umodel.done_training:
            LOGGER.info(f"===== Begin training model =====")
            _train(umodel, (x_train, y_train), (x_dev, y_dev))

            LOGGER.info("===== Begin predicting on test set =====")
            pred_result, pred_result_dev = _predict(
                umodel, (essay, essay_id, essay_set))

        pred_result_dev = np.round(pred_result_dev)
        dev_score = kappa(y_true=y_dev, y_pred=pred_result_dev)

        log(f"--------------Prompt{i+1} is done, and the dev_score is {dev_score}-------------"
            )

        score_list.append(dev_score)
        prediction_list.append(pred_result)

    # save result
    score_file = os.path.join(
        args.output_dir,
        "score-" + time.strftime("%Y-%m-%d@%H-%M-%S") + '.txt')
    prediction_file = os.path.join(
        args.output_dir,
        "prediction-" + time.strftime("%Y-%m-%d@%H-%M-%S") + '.txt')
    LOGGER.info("===== Begin Saving prediction =====")
    # with open(score_file, 'w', encoding='utf8') as fout:
    #     score_list = [str(score) for score in score_list]
    #     fout.write('\n'.join(score_list) + '\n')
    with open(prediction_file, 'w', encoding='utf8') as fout:
        for prediction in prediction_list:
            for idx in range(len(prediction[0])):
                fout.write(
                    str(prediction[0][idx]) + '\t' + str(prediction[1][idx]) +
                    '\t' + str(prediction[2][idx]) + '\n')
    with open(score_file, 'w', encoding='utf8') as fout1:
        tot = 0.0
        for idx in range(len(score_list)):
            tot += score_list[idx]
            fout1.write(str(idx + 1) + '\t' + str(score_list[idx]) + '\n')
        avg = tot * 1.0 / PROMPT_NUM
        fout1.write("avg_score: " + str(avg) + '\n')

    LOGGER.info("[Ingestion terminated]")
Example #6
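        # Append TF-IDF features to the existing feature matrix and rescale the result.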
        X_tfidf = vectorizer.transform(X)
        X_tfidf = X_tfidf.toarray()

        X_features = np.c_[X_features, X_tfidf]
        scalar, X_features = features_scalar(X_features)

    return X_features


if __name__ == '__main__':
    # nltk.download('punkt')
    # nltk.download('stopwords')
    # nltk.download('averaged_perceptron_tagger')
    from integration.dataset import AESDataset
    # D = AESDataset(r'../essay_data/DEMO')
    D = AESDataset(ROOT_DIR + '/essay_data/DEMO')

    x_train, y_train = D.get_train()

    X, y = x_train[0][:100], y_train[0][:100]

    # features_dict = get_all_features(X)
    # X_features = concact_features(features_dict)
    # _, X_features = features_scalar(X_features)
    # _, selected_features = features_selector(X_features, y)
    # get_tfidf_features(X)
    X_features, scalar_pure, scalar_tfidf, selector, vectorizer = get_features(X, y, None, 15, True)

    print(X_features.shape)

    print(X_features[:10])
Example #7
def embedding_predicts(wordvec_dict):
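    """Build word-vector and string-kernel (HISK) features for every prompt, train
    an ensemble of multi-input CNN/LSTM models, and save the averaged predictions."""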
    args = _parse_args()
    dataset = AESDataset(args.dataset_dir, prompt_num=PROMPT_NUM, is_cross_dataset=IS_CROSS_DATASET, use_correct=True)
    x_train_list, y_train_list = dataset.get_train()
    x_dev_list, y_dev_list = dataset.get_dev()
    essay_list, essay_id_list, essay_set_list = dataset.get_test()

    cleaned_dir = ROOT_DIR + '/essay_data/cleaned'
    cleaned_path = os.path.join(cleaned_dir, 'cleaned.txt')
    os.makedirs(cleaned_dir, exist_ok=True)

    if IS_CROSS_DATASET:
        x_train_cleaned = cleanup_essays(x_train_list, logging=True)
        x_dev_cleaned = cleanup_essays(x_dev_list, logging=True)
        x_test_cleaned = cleanup_essays(essay_list, logging=True)
    else:
        if not os.path.exists(cleaned_path):
            x_train_cleaned = [cleanup_essays(x_train_list[i], logging=True) for i in range(PROMPT_NUM)]
            x_dev_cleaned = [cleanup_essays(x_dev_list[i], logging=True) for i in range(PROMPT_NUM)]
            x_test_cleaned = [cleanup_essays(essay_list[i], logging=True) for i in range(PROMPT_NUM)]
            fout = open(cleaned_path, 'w', encoding='utf8')
            for i in range(PROMPT_NUM):
                fout.write('\n'.join(x_train_cleaned[i]) + '\n')
                fout.write('\n'.join(x_dev_cleaned[i]) + '\n')
                fout.write('\n'.join(x_test_cleaned[i]) + '\n')
            fout.close()
        else:
            x_train_cleaned, x_dev_cleaned, x_test_cleaned = [], [], []
            begin_idx = 0
            with open(cleaned_path, 'r', encoding='utf8') as fin:
                cleaned_essays = [line.strip() for line in fin]
            for prompt_i in range(PROMPT_NUM):
                x_train_cleaned.append(cleaned_essays[begin_idx:begin_idx+len(x_train_list[prompt_i])])
                begin_idx += len(x_train_list[prompt_i])
                x_dev_cleaned.append(cleaned_essays[begin_idx:begin_idx+len(x_dev_list[prompt_i])])
                begin_idx += len(x_dev_list[prompt_i])
                x_test_cleaned.append(cleaned_essays[begin_idx:begin_idx+len(essay_list[prompt_i])])
                begin_idx += len(essay_list[prompt_i])

        prompt_cnt = 0
        k_list = []
        use_regression = True
        model_lib = {
            # LSTM_MODEL: Lstm,
            # CNN_MODEL: Cnn,
            CNN_MULTIPLE: CnnMulInputs,
            LSTM_MULTIPLE: LstmMulInputs,
            # CRNN_MODEL: crnn
        }
        repeat_num = 6
        prompt_predicts = []
        for i in range(0, PROMPT_NUM):
            prompt_cnt += 1
            x_train_vec = np.array([create_average_vec(essay, text_dim=TEXT_DIM, wordvec_dict=wordvec_dict)
                           for essay in x_train_cleaned[i]])
            x_dev_vec = np.array([create_average_vec(essay, text_dim=TEXT_DIM, wordvec_dict=wordvec_dict)
                           for essay in x_dev_cleaned[i]])
            x_test_vec = np.array([create_average_vec(essay, text_dim=TEXT_DIM, wordvec_dict=wordvec_dict)
                           for essay in x_test_cleaned[i]])

            x_train_seq_vec = np.array([create_sequence_vec(essay, text_dim=TEXT_DIM, wordvec_dict=wordvec_dict)
                                        for essay in x_train_cleaned[i]])
            x_dev_seq_vec = np.array([create_sequence_vec(essay, text_dim=TEXT_DIM, wordvec_dict=wordvec_dict)
                                      for essay in x_dev_cleaned[i]])
            x_test_seq_vec = np.array([create_sequence_vec(essay, text_dim=TEXT_DIM, wordvec_dict=wordvec_dict)
                                       for essay in x_test_cleaned[i]])

            y_train = y_train_list[i]
            y_dev = y_dev_list[i]
            max_class, min_class = max(y_train), min(y_train)
            if use_regression:
                output_dim = 1
            else:
                output_dim = max_class + 1
            hisk_dir = ROOT_DIR + '/essay_data/HISK/output'
            hisk_all_dir = ROOT_DIR + '/essay_data/HISK/output-all'
            hisk_all = [np.array(line.strip().split()).astype(int) for line
                        in open(hisk_all_dir + '/prompt@' + str(i+1) + '.txt', 'r', encoding='utf8')]
            hisk_train = [np.array(line.strip().split()).astype(int) for line
                          in open(hisk_dir + '/prompt@' + str(i+1) + '-train.txt', 'r', encoding='utf8')]
            hisk_dev = [np.array(line.strip().split()).astype(int) for line
                        in open(hisk_dir + '/prompt@' + str(i+1) + '-dev.txt', 'r', encoding='utf8')]
            hisk_test = [np.array(line.strip().split()).astype(int) for line
                         in open(hisk_dir + '/prompt@' + str(i+1) + '-test.txt', 'r', encoding='utf8')]
            hisk_train, hisk_dev, hisk_test = np.array(hisk_train), np.array(hisk_dev), np.array(hisk_test)

            # Standardise the full kernel matrix, then re-split it into train/dev/test rows.
            sscalar = StandardScaler()
            hisk_all = sscalar.fit_transform(hisk_all)
            hisk_train = np.array(hisk_all[:len(y_train)])
            hisk_dev = np.array(hisk_all[len(y_train):len(y_train) + len(y_dev)])
            hisk_test = np.array(hisk_all[-len(essay_list[i]):])

            x_train_vec = np.concatenate([x_train_vec, hisk_train], axis=-1)
            x_dev_vec = np.concatenate([x_dev_vec, hisk_dev], axis=-1)
            x_test_vec = np.concatenate([x_test_vec, hisk_test], axis=-1)
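            # NOTE: the reassignments below discard the concatenated features;
            # the sequence word vectors end up as the models' primary input.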
            x_train_vec = hisk_train
            x_dev_vec = hisk_dev
            x_test_vec = hisk_test

            x_train_vec = x_train_seq_vec
            x_dev_vec = x_dev_seq_vec
            x_test_vec = x_test_seq_vec

            print(f'Prompt@{i+1}, num_classes: {max_class-min_class+1}; '
                  f'x_train shape: {np.array(x_train_vec).shape}, y_train shape: {np.array(y_train).shape}; '
                  f'x_dev shape: {np.array(x_dev_vec).shape}, y_dev shape: {np.array(y_dev).shape}; '
                  f'x_test shape: {np.array(x_test_vec).shape}, y_test shape: {np.array(essay_list[i]).shape}')

            total_predicts = []

            for model_name in model_lib.keys():
                predicts_list = []
                dev_predicts_list = []
                for idx in range(repeat_num):
                    x_train_input = x_train_vec
                    x_dev_input = x_dev_vec
                    x_test_input = x_test_vec
                    my_model = model_lib[model_name]()
                    if 'mul' in model_name:
                        my_model.init_model(prompt=i+1,
                                            input_shape1=x_train_vec.shape[1:], input_shape2=np.array(hisk_train).shape[-1],
                                            output_dim=output_dim)
                        x_train_input = [x_train_vec, hisk_train]
                        x_dev_input = [x_dev_vec, hisk_dev]
                        x_test_input = [x_test_vec, hisk_test]
                    else:
                        my_model.init_model(input_shape=x_train_vec.shape[1:], output_dim=output_dim)
                    my_model.fit(x_train_input, y_train, x_dev_input, y_dev, train_loop_num=1)
                    predicts = np.round(my_model.predict(x_test_input)).reshape(-1, 1)
                    dev_predicts = np.round(my_model.predict(x_dev_input)).reshape(-1, 1)
                    # predicts = mmscaler.inverse_transform(predicts)
                    predicts_list.append(predicts)
                    dev_predicts_list.append(dev_predicts)

                dev_kappa_list = []
                for dev_predict in dev_predicts_list:
                    dev_kappa = kappa(y_true=y_dev, y_pred=dev_predict, weights="quadratic")
                    dev_kappa_list.append(dev_kappa)
                aver_dev_kappa = np.mean(dev_kappa_list)

                # Keep only the repeated runs whose dev kappa is at least the average.
                cmp_kappa, cmp_kappa_list = aver_dev_kappa, dev_kappa_list
                selected_list = [predict for predict, kp in zip(predicts_list, cmp_kappa_list) if kp >= cmp_kappa]

                aver_predicts = np.mean(np.concatenate(selected_list, axis=-1), axis=-1)
                total_predicts.append(aver_predicts.reshape(-1, 1))

            ensemble_predicts = np.mean(np.concatenate(total_predicts, axis=-1), axis=-1)
            prompt_predicts.append(ensemble_predicts)

        os.makedirs(ROOT_DIR + '/result_output', exist_ok=True)
        save_predicts(prompt_predicts)
Example #8
def convert_to_libsvm_input_format(is_contain_test=True,
                                   is_scale_y=False,
                                   split_train_dev=False):
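    """Convert HISK feature matrices into libsvm input files (label followed by
    index:value pairs); optionally append zero-labelled test rows, scale labels
    to [0, 1] and split each file into train/dev parts, then run the external
    scale command."""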
    hisk_output_dir = os.path.join(ROOT_DIR, 'essay_data/HISK/output-all')
    libsvm_input_dir = LIBSVM_DIR
    os.makedirs(libsvm_input_dir, exist_ok=True)
    args = _parse_args()
    dataset = AESDataset(args.dataset_dir,
                         prompt_num=PROMPT_NUM,
                         is_cross_dataset=IS_CROSS_DATASET)
    x_train_list, y_train_list = dataset.get_train()
    x_dev_list, y_dev_list = dataset.get_dev()
    x_test_list, x_test_ids, x_test_sets = dataset.get_test()

    for file in os.listdir(hisk_output_dir):
        hisk_file = os.path.join(hisk_output_dir, file)
        if is_contain_test:
            libsvm_file = os.path.join(
                libsvm_input_dir, file.split('.')[0] + '-libsvm.txt')
        else:
            libsvm_file = os.path.join(
                libsvm_input_dir, file.split('.')[0] + '-libsvm-notest.txt')
        if os.path.exists(libsvm_file) and not split_train_dev:
            continue

        prompt_idx = int(re.findall(r'\d+', file)[0])
        if is_contain_test:
            y_test = np.zeros(shape=len(x_test_list[prompt_idx - 1]))
            y_labels = np.concatenate(
                [y_train_list[prompt_idx - 1], y_dev_list[prompt_idx - 1], y_test], axis=0)
        else:
            y_labels = np.concatenate(
                [y_train_list[prompt_idx - 1], y_dev_list[prompt_idx - 1]], axis=0)
        if is_scale_y:
            mm_scaler = MinMaxScaler(feature_range=(0, 1))
            y_labels = mm_scaler.fit_transform(
                np.array(y_labels).reshape(-1, 1)).reshape(-1)
        if DEBUG:
            print(f'prompt@{prompt_idx} shape: {y_labels.shape}')
        with open(hisk_file, 'r', encoding='utf8') as fin, \
                open(libsvm_file, 'w', encoding='utf8') as fout:
            x_feas = [line.strip().split() for line in fin][:len(y_labels)]
            for idx, x_fea in enumerate(x_feas):
                fout.write(str(y_labels[idx]))
                for fea_idx, each_fea in enumerate(x_fea, start=1):
                    fout.write(' ' + str(fea_idx) + ':' + each_fea)
                fout.write('\n')

        if split_train_dev:
            libsvm_train_file = os.path.join(
                libsvm_input_dir, file.split('.')[0] + '-libsvm-train.txt')
            libsvm_dev_file = os.path.join(
                libsvm_input_dir, file.split('.')[0] + '-libsvm-dev.txt')
            train_len = len(x_train_list[prompt_idx - 1])
            with open(libsvm_file, 'r', encoding='utf8') as fin, \
                    open(libsvm_train_file, 'w', encoding='utf8') as fout1, \
                    open(libsvm_dev_file, 'w', encoding='utf8') as fout2:
                for idx, line in enumerate(fin):
                    if idx < train_len:
                        fout1.write(line.strip() + '\n')
                    else:
                        fout2.write(line.strip() + '\n')

    execute_scale_command()