Example #1
def test_v_svr(prompt_idx, gamma=None):
    args = _parse_args()
    dataset = AESDataset(args.dataset_dir,
                         prompt_num=PROMPT_NUM,
                         is_cross_dataset=False)
    x_train_list, y_train_list = dataset.get_train()
    x_dev_list, y_dev_list = dataset.get_dev()
    x_test_list, _, _ = dataset.get_test()
    train_len, dev_len, test_len = len(x_train_list[prompt_idx-1]), len(x_dev_list[prompt_idx-1]),\
                                   len(x_test_list[prompt_idx-1])

    y, x = svm_read_problem(SVM_SCALE_DIR + '/prompt@' + str(prompt_idx) +
                            '-scale.txt')
    x_train, y_train = x[:train_len], y[:train_len]
    x_dev, y_dev = x[train_len:train_len + dev_len], y[train_len:train_len +
                                                       dev_len]
    x_test = x[train_len + dev_len:]

    # libsvm parameters: -s 4 selects nu-SVR, -t 2 the RBF kernel.
    param = '-s 4 -t 2 -c 1000 -n 0.1'
    if gamma is not None:
        param += f' -g {gamma}'
    # Train on the combined train + dev split.
    svm_model = svm_train(y_train + y_dev, x_train + x_dev, param)
    # The test labels are unknown here, so pass dummy zeros to svm_predict.
    p_label, p_acc, p_val = svm_predict(np.zeros(shape=len(x_test)), x_test,
                                        svm_model)
    p_label = np.round(p_label)

    dev_label, dev_acc, dev_val = svm_predict(y_dev, x_dev, svm_model)
    dev_kappa = kappa(y_true=y_dev, y_pred=dev_label, weights='quadratic')
    print(f'Dev kappa: {dev_kappa}')
    return dev_kappa, p_label
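
Note: the quadratic weighted kappa helper called throughout these examples is not shown in the excerpts. A minimal sketch, assuming it wraps scikit-learn's cohen_kappa_score (the project may ship its own implementation):

# Minimal sketch of the kappa helper (assumption: a thin wrapper around
# scikit-learn; the project's actual implementation is not shown here).
from sklearn.metrics import cohen_kappa_score

def kappa(y_true, y_pred, weights=None):
    # weights='quadratic' yields quadratic weighted kappa, the usual AES metric.
    return cohen_kappa_score(y_true, y_pred, weights=weights)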
Example #2
    def predict(self, x_test, is_dev):
        # x_test = self._model.preprocess_data(x_test)
        # x_test = self.preprocess_data(x_test, is_test=True)
        y_pred = self._model.predict(x_test)
        if is_dev:
            return y_pred

        x_dev, y_dev = self._dev_set
        # x_dev = self.preprocess_data(x_dev, is_test=True)
        y_pred_dev = self._model.predict(x_dev)
        kap = kappa(y_true=y_dev, y_pred=y_pred_dev)

        # Replace the current worst entry if this round's dev kappa beats it.
        if self._k_best_kappa[-1] < kap:
            self._k_best_kappa[-1] = kap
            self._k_best_predicts[-1] = y_pred
            self._k_best_predicts_dev[-1] = y_pred_dev

        # Keep the k-best buffers sorted by dev kappa, descending.
        for idx, (kap_i, pred, pred_dev) in enumerate(
                sorted(zip(self._k_best_kappa, self._k_best_predicts,
                           self._k_best_predicts_dev),
                       key=lambda x: x[0],
                       reverse=True)):
            self._k_best_kappa[idx] = kap_i
            self._k_best_predicts[idx] = pred
            self._k_best_predicts_dev[idx] = pred_dev

        return self._blending_ensemble()
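
The method above ends by delegating to _blending_ensemble, which the excerpt does not show. A hypothetical sketch, assuming a plain average of the k best test-set predictions (the real implementation may instead weight by dev kappa):

import numpy as np

def _blending_ensemble(self):
    # Average the k best test-set predictions collected so far, skipping
    # slots that have not been filled yet, and round to integer scores.
    valid = [p for p in self._k_best_predicts if p is not None]
    return np.round(np.mean(valid, axis=0))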
Example #3
        def objective(hyperparams):
            model = gbr(**params, **hyperparams)

            model.fit(x_train, y_train)

            y_pred_dev = model.predict(x_dev)
            score = kappa(y_true=y_dev, y_pred=y_pred_dev)

            # hyperopt minimizes the loss, so negate the kappa score.
            return {'loss': -score, 'status': STATUS_OK}
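
gbr is not defined in this excerpt. A plausible binding, assuming it aliases scikit-learn's gradient boosting regressor (the project may bind it differently):

# Assumed import for the gbr factory used above (hypothetical).
from sklearn.ensemble import GradientBoostingRegressor as gbr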
Example #4
        def objective(hyperparams):
            model = XGBClassifier(**params, **hyperparams)

            # model.fit(x_train, y_train, eval_set=[(x_dev, y_dev)], early_stopping_rounds=50, eval_metric='merror', verbose=True)

            model.fit(X=x_train, y=y_train, eval_metric=self._metrics)

            y_pred_dev = model.predict(x_dev)
            score = kappa(y_true=y_dev, y_pred=y_pred_dev)
            return {'loss': -score, 'status': STATUS_OK}
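
The STATUS_OK return value marks these as hyperopt objectives. A minimal driver showing how such an objective is typically consumed; the search space below is illustrative, not the project's actual space:

import numpy as np
from hyperopt import fmin, tpe, hp, Trials

# Illustrative search space (hypothetical parameter ranges).
space = {
    'max_depth': hp.choice('max_depth', [3, 5, 7]),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.3)),
}
trials = Trials()
# fmin minimizes 'loss', i.e. maximizes the dev kappa returned by objective.
best = fmin(fn=objective, space=space, algo=tpe.suggest,
            max_evals=50, trials=trials)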
Example #5
def _dev(umodel, dataset):
    # Validate the model on the dev set.
    timer = Timer()
    x_dev, y_dev = dataset
    y_pred = umodel.predict(x_dev, is_dev=True)
    score = kappa(y_true=y_dev, y_pred=y_pred)

    duration = timer.get_duration()
    LOGGER.info(
        f"Finished validating the model. Time spent: {duration} sec. "
        f"Validation score: {score}")
    return score
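
Timer is a small utility assumed to measure wall-clock time since construction; a hypothetical sketch consistent with its use above:

import time

class Timer:
    # Hypothetical helper: records its creation time and reports elapsed seconds.
    def __init__(self):
        self._start = time.time()

    def get_duration(self):
        return round(time.time() - self._start, 2)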
Example #6
def do_ingestion():
    """main entry"""
    LOGGER.info('===== Start integration program.')
    # Parse directories from input arguments
    LOGGER.info('===== Initialize args.')
    args = _parse_args()
    _init_python_path(args)

    dataset = AESDataset(args.dataset_dir,
                         prompt_num=PROMPT_NUM,
                         is_cross_dataset=IS_CROSS_DATASET)
    x_train_list, y_train_list = dataset.get_train()
    x_dev_list, y_dev_list = dataset.get_dev()
    essay_list, essay_id_list, essay_set_list = dataset.get_test()

    score_list = []
    prediction_list = []
    for i in range(PROMPT_NUM):
        log_prompt(entry="Begin handling ", prompt=i + 1)
        x_train, y_train = x_train_list[i], y_train_list[i]
        x_dev, y_dev = x_dev_list[i], y_dev_list[i]
        essay, essay_id, essay_set = (essay_list[i], essay_id_list[i],
                                      essay_set_list[i])
        umodel = Model(prompt=i + 1, max_iter=1)
        # LOGGER.info("===== Check model methods =====")
        # _check_umodel_methed(umodel)

        dev_score, pred_result, pred_result_dev = None, None, None
        while not umodel.done_training:
            LOGGER.info("===== Begin training model =====")
            _train(umodel, (x_train, y_train), (x_dev, y_dev))

            LOGGER.info("===== Begin predicting on test set =====")
            pred_result, pred_result_dev = _predict(
                umodel, (essay, essay_id, essay_set))

        pred_result_dev = np.round(pred_result_dev)
        dev_score = kappa(y_true=y_dev, y_pred=pred_result_dev)

        log(f"--------------Prompt{i+1} is done, and the dev_score is {dev_score}-------------"
            )

        score_list.append(dev_score)
        prediction_list.append(pred_result)

    # save result
    score_file = os.path.join(
        args.output_dir,
        "score-" + time.strftime("%Y-%m-%d@%H-%M-%S") + '.txt')
    prediction_file = os.path.join(
        args.output_dir,
        "prediction-" + time.strftime("%Y-%m-%d@%H-%M-%S") + '.txt')
    LOGGER.info("===== Begin Saving prediction =====")
    # with open(score_file, 'w', encoding='utf8') as fout:
    #     score_list = [str(score) for score in score_list]
    #     fout.write('\n'.join(score_list) + '\n')
    with open(prediction_file, 'w', encoding='utf8') as fout:
        for prediction in prediction_list:
            for idx in range(len(prediction[0])):
                fout.write(
                    str(prediction[0][idx]) + '\t' + str(prediction[1][idx]) +
                    '\t' + str(prediction[2][idx]) + '\n')
    with open(score_file, 'w', encoding='utf8') as fout1:
        tot = 0.0
        for idx in range(len(score_list)):
            tot += score_list[idx]
            fout1.write(str(idx + 1) + '\t' + str(score_list[idx]) + '\n')
        avg = tot / PROMPT_NUM
        fout1.write("avg_score: " + str(avg) + '\n')

    LOGGER.info("[Ingestion terminated]")
Example #7
def embedding_predicts(wordvec_dict):
    args = _parse_args()
    dataset = AESDataset(args.dataset_dir, prompt_num=PROMPT_NUM, is_cross_dataset=IS_CROSS_DATASET, use_correct=True)
    x_train_list, y_train_list = dataset.get_train()
    x_dev_list, y_dev_list = dataset.get_dev()
    essay_list, essay_id_list, essay_set_list = dataset.get_test()

    cleaned_dir = ROOT_DIR + '/essay_data/cleaned'
    cleaned_path = os.path.join(cleaned_dir, 'cleaned.txt')
    os.makedirs(cleaned_dir, exist_ok=True)

    if IS_CROSS_DATASET:
        x_train_cleaned = cleanup_essays(x_train_list, logging=True)
        x_dev_cleaned = cleanup_essays(x_dev_list, logging=True)
        x_test_cleaned = cleanup_essays(essay_list, logging=True)
    else:
        if not os.path.exists(cleaned_path):
            x_train_cleaned = [cleanup_essays(x_train_list[i], logging=True) for i in range(PROMPT_NUM)]
            x_dev_cleaned = [cleanup_essays(x_dev_list[i], logging=True) for i in range(PROMPT_NUM)]
            x_test_cleaned = [cleanup_essays(essay_list[i], logging=True) for i in range(PROMPT_NUM)]
            with open(cleaned_path, 'w', encoding='utf8') as fout:
                for i in range(PROMPT_NUM):
                    fout.write('\n'.join(x_train_cleaned[i]) + '\n')
                    fout.write('\n'.join(x_dev_cleaned[i]) + '\n')
                    fout.write('\n'.join(x_test_cleaned[i]) + '\n')
        else:
            x_train_cleaned, x_dev_cleaned, x_test_cleaned = [], [], []
            begin_idx = 0
            with open(cleaned_path, 'r', encoding='utf8') as fin:
                cleaned_essays = [line.strip() for line in fin]
            for prompt_i in range(PROMPT_NUM):
                x_train_cleaned.append(cleaned_essays[begin_idx:begin_idx+len(x_train_list[prompt_i])])
                begin_idx += len(x_train_list[prompt_i])
                x_dev_cleaned.append(cleaned_essays[begin_idx:begin_idx+len(x_dev_list[prompt_i])])
                begin_idx += len(x_dev_list[prompt_i])
                x_test_cleaned.append(cleaned_essays[begin_idx:begin_idx+len(essay_list[prompt_i])])
                begin_idx += len(essay_list[prompt_i])

        prompt_cnt = 0
        k_list = []
        use_regression = True
        model_lib = {
            # LSTM_MODEL: Lstm,
            # CNN_MODEL: Cnn,
            CNN_MULTIPLE: CnnMulInputs,
            LSTM_MULTIPLE: LstmMulInputs,
            # CRNN_MODEL: crnn
        }
        repeat_num = 6
        prompt_predicts = []
        for i in range(0, PROMPT_NUM):
            prompt_cnt += 1
            x_train_vec = np.array([create_average_vec(essay, text_dim=TEXT_DIM, wordvec_dict=wordvec_dict)
                                    for essay in x_train_cleaned[i]])
            x_dev_vec = np.array([create_average_vec(essay, text_dim=TEXT_DIM, wordvec_dict=wordvec_dict)
                                  for essay in x_dev_cleaned[i]])
            x_test_vec = np.array([create_average_vec(essay, text_dim=TEXT_DIM, wordvec_dict=wordvec_dict)
                                   for essay in x_test_cleaned[i]])

            x_train_seq_vec = np.array([create_sequence_vec(essay, text_dim=TEXT_DIM, wordvec_dict=wordvec_dict)
                                        for essay in x_train_cleaned[i]])
            x_dev_seq_vec = np.array([create_sequence_vec(essay, text_dim=TEXT_DIM, wordvec_dict=wordvec_dict)
                                      for essay in x_dev_cleaned[i]])
            x_test_seq_vec = np.array([create_sequence_vec(essay, text_dim=TEXT_DIM, wordvec_dict=wordvec_dict)
                                       for essay in x_test_cleaned[i]])

            y_train = y_train_list[i]
            y_dev = y_dev_list[i]
            max_class, min_class = max(y_train), min(y_train)
            if use_regression:
                output_dim = 1
            else:
                output_dim = max_class + 1
            # Load the precomputed HISK feature files for this prompt.
            hisk_dir = ROOT_DIR + '/essay_data/HISK/output'
            hisk_all_dir = ROOT_DIR + '/essay_data/HISK/output-all'
            with open(hisk_all_dir + '/prompt@' + str(i+1) + '.txt', 'r', encoding='utf8') as fin:
                hisk_all = [np.array(line.strip().split()).astype(int) for line in fin]
            with open(hisk_dir + '/prompt@' + str(i+1) + '-train.txt', 'r', encoding='utf8') as fin:
                hisk_train = [np.array(line.strip().split()).astype(int) for line in fin]
            with open(hisk_dir + '/prompt@' + str(i+1) + '-dev.txt', 'r', encoding='utf8') as fin:
                hisk_dev = [np.array(line.strip().split()).astype(int) for line in fin]
            with open(hisk_dir + '/prompt@' + str(i+1) + '-test.txt', 'r', encoding='utf8') as fin:
                hisk_test = [np.array(line.strip().split()).astype(int) for line in fin]
            hisk_train, hisk_dev, hisk_test = np.array(hisk_train), np.array(hisk_dev), np.array(hisk_test)

            # Standardize on the full matrix, then re-split it; this supersedes
            # the per-split arrays loaded above.
            scaler = StandardScaler()
            hisk_all = scaler.fit_transform(hisk_all)
            hisk_train = np.array(hisk_all[:len(y_train)])
            hisk_dev = np.array(hisk_all[len(y_train):len(y_train) + len(y_dev)])
            hisk_test = np.array(hisk_all[-len(essay_list[i]):])

            # Feature-set selection: the last active assignment wins; earlier
            # variants are kept commented for reference.
            # x_train_vec = np.concatenate([x_train_vec, hisk_train], axis=-1)
            # x_dev_vec = np.concatenate([x_dev_vec, hisk_dev], axis=-1)
            # x_test_vec = np.concatenate([x_test_vec, hisk_test], axis=-1)
            # x_train_vec, x_dev_vec, x_test_vec = hisk_train, hisk_dev, hisk_test
            x_train_vec = x_train_seq_vec
            x_dev_vec = x_dev_seq_vec
            x_test_vec = x_test_seq_vec

            print(f'Prompt@{i+1}, num_classes: {max_class-min_class+1}; '
                  f'x_train shape: {np.array(x_train_vec).shape}, y_train shape: {np.array(y_train).shape}; '
                  f'x_dev shape: {np.array(x_dev_vec).shape}, y_dev shape: {np.array(y_dev).shape}; '
                  f'x_test shape: {np.array(x_test_vec).shape}, test set size: {len(essay_list[i])}')

            total_predicts = []

            for model_name in model_lib.keys():
                predicts_list = []
                dev_predicts_list = []
                for idx in range(repeat_num):
                    x_train_input = x_train_vec
                    x_dev_input = x_dev_vec
                    x_test_input = x_test_vec
                    my_model = model_lib[model_name]()
                    if 'mul' in model_name:
                        my_model.init_model(prompt=i+1,
                                            input_shape1=x_train_vec.shape[1:], input_shape2=np.array(hisk_train).shape[-1],
                                            output_dim=output_dim)
                        x_train_input = [x_train_vec, hisk_train]
                        x_dev_input = [x_dev_vec, hisk_dev]
                        x_test_input = [x_test_vec, hisk_test]
                    else:
                        my_model.init_model(input_shape=x_train_vec.shape[1:], output_dim=output_dim)
                    my_model.fit(x_train_input, y_train, x_dev_input, y_dev, train_loop_num=1)
                    predicts = np.round(my_model.predict(x_test_input)).reshape(-1, 1)
                    dev_predicts = np.round(my_model.predict(x_dev_input)).reshape(-1, 1)
                    # predicts = mmscaler.inverse_transform(predicts)
                    predicts_list.append(predicts)
                    dev_predicts_list.append(dev_predicts)

                dev_kappa_list = []
                for dev_predict in dev_predicts_list:
                    dev_kappa = kappa(y_true=y_dev, y_pred=dev_predict, weights="quadratic")
                    dev_kappa_list.append(dev_kappa)
                aver_dev_kappa = np.mean(dev_kappa_list)

                # Keep only the runs whose dev kappa meets the average.
                cmp_kappa, cmp_kappa_list = aver_dev_kappa, dev_kappa_list
                selected_list = [predict for predict, kp in zip(predicts_list, cmp_kappa_list) if kp >= cmp_kappa]

                aver_predicts = np.mean(np.concatenate(selected_list, axis=-1), axis=-1)
                total_predicts.append(aver_predicts.reshape(-1, 1))

            ensemble_predicts = np.mean(np.concatenate(total_predicts, axis=-1), axis=-1)
            prompt_predicts.append(ensemble_predicts)

        os.makedirs(ROOT_DIR + '/result_output', exist_ok=True)
        save_predicts(prompt_predicts)
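
create_average_vec and create_sequence_vec are not shown in this excerpt. A hypothetical sketch of the first, assuming it averages the word vectors of all in-vocabulary tokens in a cleaned essay:

import numpy as np

def create_average_vec(essay, text_dim, wordvec_dict):
    # Hypothetical: mean of the word vectors for tokens found in the
    # embedding dictionary; zeros if no token is in vocabulary.
    vec = np.zeros(text_dim, dtype=np.float32)
    count = 0
    for word in essay.split():
        if word in wordvec_dict:
            vec += wordvec_dict[word]
            count += 1
    return vec / count if count else vec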