Exemple #1
0
def predict_final(output_key, output_labels):
    """
    Predict labels for the FINAL dataset with a trained model and write them
    to a text file.

    The saved config and vocabulary mapping of ``output_key`` are loaded, the
    latest checkpoint is restored, and the FINAL dataset is run through the
    network batch by batch.  One ``index,label_id,label_name`` line is written
    per sample.

    :param output_key: string, key of the trained model (output/model folders)
    :param output_labels: string, path of the file the predictions are written to
    :return:
    """
    config_path = data_config.output_path(output_key, ALL, CONFIG)
    # NOTE(review): yaml.load without an explicit Loader is unsafe on
    # untrusted input; the config is assumed to be a locally produced file.
    with open(config_path) as file_obj:
        config_data = yaml.load(file_obj)
    nn_config = NNConfig(config_data)

    vocab_path = data_config.output_path(output_key, ALL, VOCAB_ID_MAPPING)
    with open(vocab_path, 'r') as file_obj:
        vocab_id_mapping = json.load(file_obj)

    dataset = load_dataset(
        mode=FINAL, vocab_id_mapping=vocab_id_mapping,
        max_seq_len=nn_config.seq_len, sampling=False, with_label=False
    )
    index_iterator = SimpleIndexIterator.from_dataset(dataset)
    n_sample = index_iterator.n_sample()

    with tf.Session() as sess:
        prefix_checkpoint = tf.train.latest_checkpoint(data_config.model_path(key=output_key))
        saver = tf.train.import_meta_graph('{}.meta'.format(prefix_checkpoint))
        saver.restore(sess, prefix_checkpoint)

        nn = BaseNNModel(config=None)
        nn.set_graph(tf.get_default_graph())

        # Only the predicted label is needed here.
        fetches = {_key: nn.var(_key) for _key in [LABEL_PREDICT]}
        labels_predict = list()

        for batch_index in index_iterator.iterate(nn_config.batch_size, shuffle=False):
            feed_dict = {nn.var(_key): dataset[_key][batch_index] for _key in feed_key[TEST]}
            feed_dict[nn.var(TEST_MODE)] = 1
            # `fetches` is keyed by variable name, not by mode: pass the whole
            # dict so that the result can be indexed by LABEL_PREDICT below.
            res = sess.run(fetches=fetches, feed_dict=feed_dict)
            labels_predict += res[LABEL_PREDICT].tolist()

        # Drop any predictions beyond the real sample count (the slice exists
        # in the original code; presumably the last batch is padded).
        labels_predict = labels_predict[:n_sample]

    with open(output_labels, 'w') as file_obj:
        for i, label in enumerate(labels_predict):
            # One record per line.
            file_obj.write('{},{},{}\n'.format(i, label, label_str[label]))
Exemple #2
0
def live_test(output_key):
    """
    Interactive prediction loop for a trained model.

    Reads lines from stdin until ``quit`` is entered.  Each line must contain
    exactly three turns separated by ``|``; the turns are whitespace-tokenized,
    mapped to token ids, zero-padded to ``nn_config.seq_len`` and fed to the
    restored network.  The predicted label and probabilities of the first row
    of the batch are printed.

    :param output_key: string, key of the trained model (output/model folders)
    :return:
    """
    config_path = data_config.output_path(output_key, ALL, CONFIG)
    # NOTE(review): yaml.load without an explicit Loader is unsafe on
    # untrusted input; the config is assumed to be a locally produced file.
    with open(config_path) as file_obj:
        config_data = yaml.load(file_obj)
    nn_config = NNConfig(config_data)

    vocab_path = data_config.output_path(output_key, ALL, VOCAB_ID_MAPPING)
    with open(vocab_path, 'r') as file_obj:
        vocab_id_mapping = json.load(file_obj)

    with tf.Session() as sess:
        prefix_checkpoint = tf.train.latest_checkpoint(data_config.model_path(key=output_key))
        saver = tf.train.import_meta_graph('{}.meta'.format(prefix_checkpoint))
        saver.restore(sess, prefix_checkpoint)

        nn = BaseNNModel(config=None)
        nn.set_graph(tf.get_default_graph())

        fetches = {_key: nn.var(_key) for _key in [LABEL_PREDICT, PROB_PREDICT]}
        while True:
            line = input('input: ')
            if line == 'quit':
                break

            turns = line.strip().split('|')
            if len(turns) != 3:
                print('invalid turns')
                continue

            # Collapse runs of whitespace, then split each turn into tokens.
            tokens_list = [
                re.sub(r'\s+', ' ', turn.strip()).split(' ') for turn in turns
            ]

            # The real sample goes in the first row; the remaining
            # batch_size - 1 rows are empty sequences.  Independent lists are
            # used so that no row aliases another.
            placeholder = [[] for _ in range(nn_config.batch_size - 1)]
            tid_list_0 = tokenized_to_tid_list([tokens_list[0], ] + placeholder, vocab_id_mapping)
            tid_list_1 = tokenized_to_tid_list([tokens_list[1], ] + placeholder, vocab_id_mapping)
            tid_list_2 = tokenized_to_tid_list([tokens_list[2], ] + placeholder, vocab_id_mapping)

            tid_0 = np.asarray(zero_pad_seq_list(tid_list_0, nn_config.seq_len))
            tid_1 = np.asarray(zero_pad_seq_list(tid_list_1, nn_config.seq_len))
            tid_2 = np.asarray(zero_pad_seq_list(tid_list_2, nn_config.seq_len))

            feed_dict = {
                nn.var(TID_0): tid_0,
                nn.var(TID_1): tid_1,
                nn.var(TID_2): tid_2,
                nn.var(TEST_MODE): 1
            }
            res = sess.run(fetches=fetches, feed_dict=feed_dict)
            # Only row 0 carries the user's input.
            label = res[LABEL_PREDICT][0]
            prob = res[PROB_PREDICT][0]
            print('label: {}'.format(label))
            print('prob: {}'.format(prob))
Exemple #3
0
def export_wrong(output_key):
    """
    Print every misclassified TEST sample, grouped by (gold, predicted) label.

    For each of the 4x4 label pairs a ``gold->pred`` header is printed,
    followed by the three turns of every sample with that error, joined with
    `` | ``.  Finally the longest turn length (in tokens) among the
    misclassified samples is printed.

    :param output_key: string, key of the trained model whose predictions are inspected
    :return:
    """
    mode = TEST
    pred = load_label_list(data_config.output_path(output_key, mode, LABEL_PREDICT))
    gold = load_label_list(data_config.path(mode, LABEL))

    # Tokenized text of the three turns, in turn order.
    turns = [
        load_tokenized_list(data_config.path(mode, TURN, suffix))
        for suffix in ('0.ek', '1.ek', '2.ek')
    ]

    # wrong[gold][pred] -> list of rendered samples
    wrong = defaultdict(lambda: defaultdict(list))

    longest = 0
    for predicted, expected, *turn_tokens in zip(pred, gold, *turns):
        if predicted == expected:
            continue
        rendered = ' | '.join(' '.join(tokens) for tokens in turn_tokens)
        wrong[expected][predicted].append(rendered)
        longest = max(longest, *(len(tokens) for tokens in turn_tokens))

    for expected in range(4):
        for predicted in range(4):
            print('{}->{}'.format(expected, predicted))
            for sample in wrong[expected][predicted]:
                print('\t{}'.format(sample))

    print(longest)
def build_final_submit(output_key):
    """
    Build the submission file ``test.txt`` from the FINAL predictions of a model.

    python -m scripts.semeval2019_task3_dev final -o gru_ek_1543492018
    python3 -m scripts.semeval2019_task3_dev build_dev_submit -o

    The header line is copied from the training file; every following line is
    a line of the unlabeled test file with the predicted label name appended
    after a tab.

    :param output_key: string
    :return:
    :raises AssertionError: if the number of predicted labels differs from the
        number of test lines
    """
    output_path = 'test.txt'

    with open(config.path_train, 'r') as file_obj:
        first_line = file_obj.readline()

    with open(config.path_test_no_labels) as file_obj:
        lines = file_obj.read().strip().split('\n')
    # Skip the header line of the test file.
    lines = [line.strip() for line in lines[1:]]

    path_labels = config.output_path(output_key, FINAL, LABEL_PREDICT)
    with open(path_labels, 'r') as file_obj:
        label_ids = file_obj.read().strip().split('\n')
    labels = [label_str[int(label_id)] for label_id in label_ids]
    assert len(labels) == len(lines)

    # All inputs are read and validated before the output file is opened, so a
    # mismatch does not leave a truncated submission file behind.
    with open(output_path, 'w') as o_obj:
        o_obj.write(first_line)
        for line, label in zip(lines, labels):
            o_obj.write('{}\t{}\n'.format(line, label))
Exemple #5
0
def show_config(output_key):
    """
    Print the saved config file of a model, followed by its path.

    [Usage]
    python3 -m algo.main93 config xxxxxx

    :param output_key: string, key of the trained model
    :return:
    """
    path = data_config.output_path(output_key, ALL, CONFIG)
    # Close the file deterministically instead of relying on garbage collection.
    with open(path) as file_obj:
        print(file_obj.read())
    print(path)
    print(path)
Exemple #6
0
def load_tri_votes(config, modes):
    """
    Count, per sample, how many models in ``config.tri`` predicted each label.

    :param config: object with a ``tri`` attribute (iterable of output keys)
    :param modes: iterable of dataset modes whose predictions are concatenated
    :return: list of 5509 rows, each a list of 4 vote counters indexed by label
    """
    n_sample = 5509
    votes = []
    for _ in range(n_sample):
        votes.append([0] * 4)

    for output_key in config.tri:
        # Predictions of all requested modes, in order.
        labels = []
        for _mode in modes:
            labels.extend(
                load_label_list(
                    data_config.output_path(output_key, _mode, LABEL_PREDICT)))

        for index, label in enumerate(labels):
            votes[index][label] += 1
    return votes
Exemple #7
0
def show_eval(output_key):
    """
    Evaluate a model on the concatenation of TRAIN and TEST and print the
    evaluation followed by the confusion matrix (one comma-separated row per line).

    :param output_key: string, key of the trained model
    :return:
    """
    predicted, expected = [], []
    for mode in (TRAIN, TEST):
        predicted.extend(
            load_label_list(
                data_config.output_path(output_key, mode, LABEL_PREDICT)))
        expected.extend(load_label_list(data_config.path(mode, LABEL)))

    evaluation = basic_evaluate(gold=expected, pred=predicted)
    print_evaluation(evaluation)
    for col in evaluation[CONFUSION_MATRIX]:
        print(','.join(str(cell) for cell in col))
Exemple #8
0
def show_eval(output_key):
    """
    Print the stored evaluation of a model for TRAIN, VALID and TEST.

    [Usage]
    python algo/main.py eval A_ntua_ek_1542454066

    For each mode the saved evaluation JSON is loaded and printed, followed by
    its confusion matrix (one comma-separated row per line).

    :param output_key: string
    :return:
    """
    for mode in [TRAIN, VALID, TEST]:
        # Close each evaluation file as soon as it has been parsed.
        with open(data_config.output_path(output_key, mode,
                                          EVALUATION)) as file_obj:
            res = json.load(file_obj)
        print(mode)
        print_evaluation(res)
        for col in res[CONFUSION_MATRIX]:
            print(','.join(map(str, col)))
        print()
        print()
Exemple #9
0
def load_others_votes(config, modes):
    """
    Count, per sample, how many models in ``config.others`` predicted label 0.

    :param config: object with an ``others`` attribute (iterable of output keys)
    :param modes: iterable of dataset modes whose predictions are concatenated
    :return: list of 5509 vote counters
    :raises Exception: if a model does not provide exactly 5509 predictions
    """
    n_sample = 5509
    votes = [0] * n_sample

    for output_key in config.others:
        labels = []
        for _mode in modes:
            labels.extend(
                load_label_list(
                    data_config.output_path(output_key, _mode, LABEL_PREDICT)))

        n_loaded = len(labels)
        if n_loaded != n_sample:
            raise Exception('mismatch {}({}) != {}'.format(
                output_key, n_loaded, n_sample))

        for index, label in enumerate(labels):
            if label != 0:
                continue
            votes[index] += 1
    return votes
Exemple #10
0
def check_wrong(output_key, w2v_key='ntua_ek'):
    """
    Print the misclassified TEST samples that contain a turn longer than 30
    token ids, then print the longest turn length (in token ids) among all
    misclassified samples.

    :param output_key: string, key of the trained model whose predictions are inspected
    :param w2v_key: string, key of the word2vec model used to build the vocabulary mapping
    :return:
    """
    mode = TEST
    path = data_config.output_path(output_key, mode, LABEL_PREDICT)
    pred = load_label_list(path)

    path = data_config.path(mode, LABEL)
    gold = load_label_list(path)

    w2v_model_path = data_config.path(ALL, WORD2VEC, w2v_key)
    vocab_train_path = data_config.path(TRAIN, VOCAB, 'ek')

    # Load the vocabulary.
    # The model uses every word vector supported by the embedding model and
    # randomly initializes vectors for words that occur often enough.
    vocab_meta_list = load_vocab_list(vocab_train_path)
    vocabs = [_meta['t'] for _meta in vocab_meta_list if _meta['tf'] >= 2]

    # Load the word vectors; only the vocabulary-to-id mapping is needed here.
    _, vocab_id_mapping, _ = load_lookup_table(
        w2v_model_path=w2v_model_path, vocabs=vocabs)

    tokens_0 = load_tokenized_list(data_config.path(mode, TURN, '0.ek'))
    tokens_1 = load_tokenized_list(data_config.path(mode, TURN, '1.ek'))
    tokens_2 = load_tokenized_list(data_config.path(mode, TURN, '2.ek'))
    tid_list_0 = tokenized_to_tid_list(tokens_0, vocab_id_mapping)
    tid_list_1 = tokenized_to_tid_list(tokens_1, vocab_id_mapping)
    # Reuse the already loaded third turn instead of reading the file again.
    tid_list_2 = tokenized_to_tid_list(tokens_2, vocab_id_mapping)

    max_seq_len = 0
    for p, g, tid_0, tid_1, tid_2, tk_0, tk_1, tk_2 in zip(
            pred, gold, tid_list_0, tid_list_1, tid_list_2, tokens_0, tokens_1,
            tokens_2):
        if p == g:
            continue

        if len(tid_0) > 30 or len(tid_1) > 30 or len(tid_2) > 30:
            print('pred: {}, gold: {}'.format(p, g))
            print('turn0: {}'.format(' '.join(tk_0)))
            print('turn1: {}'.format(' '.join(tk_1)))
            print('turn2: {}'.format(' '.join(tk_2)))

        max_seq_len = max(max_seq_len, len(tid_0), len(tid_1), len(tid_2))
    print(max_seq_len)
Exemple #11
0
def diff(a_filename, b_filename, output_filename, config_path='e93.yaml'):
    """
    Compare the labels of two result files and report samples labelled 3 in
    ``a_filename`` and 0 in ``b_filename``.

    For every such sample a tab-separated line with the sample index, the first
    three fields of the sample (taken from ``b_filename``), the label change
    and the number of ``config.others`` models that predicted label 0 is
    written to ``output_filename``.  The number of samples labelled 3 in ``a``
    and not 0 in ``b`` is printed at the end.

    :param a_filename: string, first result file (loaded with Processor.load_origin)
    :param b_filename: string, second result file
    :param output_filename: string, path of the report to write
    :param config_path: string, path of the ensemble YAML config
    :return:
    :raises AssertionError: if the vote list and both label lists differ in length
    """
    # NOTE(review): yaml.load without an explicit Loader is unsafe on
    # untrusted input; the config is assumed to be a locally produced file.
    with open(config_path) as file_obj:
        config_data = yaml.load(file_obj)
    config = Config(data=config_data)

    votes = None

    # NOTE(review): `modes` is not defined in this function; presumably it is
    # a module-level mapping from mode to a list of sub-modes - confirm.
    for output_key in config.others:
        labels = list()
        for _mode in modes[FINAL]:
            path = data_config.output_path(output_key, _mode, LABEL_PREDICT)
            labels += load_label_list(path)

        # Size the vote list from the first model's predictions.
        if votes is None:
            n_sample = len(labels)
            votes = [0 for _ in range(n_sample)]

        for i, label in enumerate(labels):
            if label == 0:
                votes[i] += 1

    dataset = Processor.load_origin(a_filename)
    labels_a = list(map(lambda _item: _item[-1], dataset))

    # From here on `dataset` refers to the samples of b_filename.
    dataset = Processor.load_origin(b_filename)
    labels_b = list(map(lambda _item: _item[-1], dataset))

    assert len(votes) == len(labels_a) == len(labels_b)

    n_match = 0
    with open(output_filename, 'w') as file_obj:
        for i, (a, b, d) in enumerate(zip(labels_a, labels_b, dataset)):
            if a != 3:
                continue
            if b == 0:
                file_obj.write('{}\t{}\t{}\t{}\t{}->{} ({})\n'.format(
                    i, d[0], d[1], d[2], label_str[a], label_str[b],
                    votes[i]))
            else:
                n_match += 1
    print(n_match)
Exemple #12
0
def analyse_others(config_path='e93.yaml'):
    """
    For TRAIN (TRAIN + TEST predictions) and FINAL, print how many samples
    receive at least ``config.others_min_vote`` label-0 votes from the models
    listed in ``config.others``.

    :param config_path: string, path of the ensemble YAML config
    :return:
    """
    # NOTE(review): yaml.load without an explicit Loader is unsafe on
    # untrusted input; the config is assumed to be a locally produced file.
    with open(config_path) as file_obj:
        config_data = yaml.load(file_obj)
    config = Config(data=config_data)

    modes = {
        TRAIN: [TRAIN, TEST],
        FINAL: [
            FINAL,
        ]
    }

    # The threshold does not depend on the mode: 'all' means every model in
    # config.others must have voted for label 0.
    if config.others_min_vote == 'all':
        min_vote = len(config.others)
    else:
        min_vote = int(config.others_min_vote)

    for mode in [TRAIN, FINAL]:
        # Start from an empty vote list so that an empty config.others yields
        # a count of 0 instead of a NameError on an unbound `votes`.
        votes = list()
        n_sample = None
        for output_key in config.others:
            labels = list()
            for _mode in modes[mode]:
                path = data_config.output_path(output_key, _mode,
                                               LABEL_PREDICT)
                labels += load_label_list(path)

            # Size the vote list from the first model's predictions.
            if n_sample is None:
                n_sample = len(labels)
                votes = [0 for _ in range(n_sample)]

            for i, label in enumerate(labels):
                if label == 0:
                    votes[i] += 1

        n_effective = sum(1 for vote in votes if vote >= min_vote)
        print('{}: {}'.format(mode, n_effective))
Exemple #13
0
def filter_by_others(input_filename,
                     output_filename,
                     thr,
                     config_path='e93.yaml'):
    """
    Relabel samples as 0 when at least ``thr`` models of ``config.others``
    predicted label 0 for them.

    Every relabelled sample (its index, first three fields, previous label and
    vote count) is written to ``output_filename``; the resulting label list is
    then passed to ``export_final('test.txt', labels)``.

    :param input_filename: string, result file loaded with Processor.load_origin
    :param output_filename: string, path of the change report to write
    :param thr: minimal number of label-0 votes (converted with int())
    :param config_path: string, path of the ensemble YAML config
    :return:
    :raises AssertionError: if the vote list and the label list differ in length
    """
    thr = int(thr)
    # NOTE(review): yaml.load without an explicit Loader is unsafe on
    # untrusted input; the config is assumed to be a locally produced file.
    with open(config_path) as file_obj:
        config_data = yaml.load(file_obj)
    config = Config(data=config_data)

    votes = None

    # NOTE(review): `modes` is not defined in this function; presumably it is
    # a module-level mapping from mode to a list of sub-modes - confirm.
    for output_key in config.others:
        labels = list()
        for _mode in modes[FINAL]:
            path = data_config.output_path(output_key, _mode, LABEL_PREDICT)
            labels += load_label_list(path)

        # Size the vote list from the first model's predictions.
        if votes is None:
            n_sample = len(labels)
            votes = [0 for _ in range(n_sample)]

        for i, label in enumerate(labels):
            if label == 0:
                votes[i] += 1

    dataset = Processor.load_origin(input_filename)
    labels = list(map(lambda _item: _item[-1], dataset))

    assert len(votes) == len(labels)

    with open(output_filename, 'w') as file_obj:
        for i, (p, d) in enumerate(zip(labels, dataset)):
            if p != 0 and votes[i] >= thr:
                file_obj.write('{}\t{}\t{}\t{}\t{} ({})\n'.format(
                    i, d[0], d[1], d[2], p, votes[i]))
                labels[i] = 0
    export_final('test.txt', labels)
Exemple #14
0
def show_eval(output_key):
    """
    Print the evaluation of a model on TRAIN + TEST (concatenated) and on
    FINAL, each followed by its confusion matrix and a blank line.

    :param output_key: string, key of the trained model
    :return:
    """

    def _report(gold, pred):
        # Evaluate one label set and print the result with its confusion matrix.
        evaluation = basic_evaluate(gold=gold, pred=pred)
        print_evaluation(evaluation)
        for col in evaluation[CONFUSION_MATRIX]:
            print(','.join(str(cell) for cell in col))
        print()

    predicted = {}
    expected = {}
    for mode in (TRAIN, TEST, FINAL):
        predicted[mode] = load_label_list(
            data_config.output_path(output_key, mode, LABEL_PREDICT))
        expected[mode] = load_label_list(data_config.path(mode, LABEL))

    print('TRAIN + TEST')
    _report(gold=expected[TRAIN] + expected[TEST],
            pred=predicted[TRAIN] + predicted[TEST])

    print('FINAL')
    _report(gold=expected[FINAL], pred=predicted[FINAL])
Exemple #15
0
def train(text_version='ek',
          label_version=None,
          config_path='config93_naive.yaml'):
    """
    Train ``NNModel`` with early stopping, then reload the last saved
    checkpoint and export its predictions and evaluations.

    python -m algo.main93_v2 train
    python3 -m algo.main93_v2 train -c config_ntua93.yaml

    Steps performed by this function:
    1. build an output key from the model name, text version and current time
       and copy the config file into the output folder;
    2. load the vocabulary, the lookup table and the datasets;
    3. train for at most ``train_config.epoch`` epochs, saving a checkpoint
       whenever the early-stop metric improves (on TRAIN when
       ``valid_rate == 0.``, otherwise on VALID);
    4. restore the latest checkpoint and write hidden features, predicted
       labels and probabilities for TRAIN and TEST, plus the evaluation JSONs.

    :param text_version: string, used for the w2v key, the vocab path and the output key
    :param label_version: string, prefixed to the output key and passed to load_dataset when not None
    :param config_path: string, path of the YAML config file
    :return:
    """
    # NOTE(review): yaml.load without an explicit Loader is unsafe on
    # untrusted input, and the file handle is never closed explicitly.
    config_data = yaml.load(open(config_path))

    output_key = '{}_{}_{}'.format(NNModel.name, text_version,
                                   int(time.time()))
    if label_version is not None:
        output_key = '{}_{}'.format(label_version, output_key)
    print('OUTPUT_KEY: {}'.format(output_key))

    # Prepare the folders of the output paths
    data_config.prepare_output_folder(output_key=output_key)
    data_config.prepare_model_folder(output_key=output_key)

    # Keep a copy of the config next to the results
    shutil.copy(config_path, data_config.output_path(output_key, ALL, CONFIG))

    w2v_key = '{}_{}'.format(config_data['word']['w2v_version'], text_version)
    w2v_model_path = data_config.path(ALL, WORD2VEC, w2v_key)
    vocab_train_path = data_config.path(TRAIN, VOCAB, text_version)

    # Load the vocabulary.
    # The model uses every word vector supported by the embedding model and
    # randomly initializes vectors for words that occur often enough.
    vocab_meta_list = load_vocab_list(vocab_train_path)
    vocabs = [
        _meta['t'] for _meta in vocab_meta_list
        if _meta['tf'] >= config_data['word']['min_tf']
    ]

    # Load the word vectors and related data
    lookup_table, vocab_id_mapping, embedding_dim = load_lookup_table2(
        w2v_model_path=w2v_model_path, vocabs=vocabs)
    # Persist the vocabulary mapping in the output folder
    json.dump(
        vocab_id_mapping,
        open(data_config.output_path(output_key, ALL, VOCAB_ID_MAPPING), 'w'))

    # Load the configuration
    nn_config = NNConfig(config_data)
    train_config = TrainConfig(config_data['train'])
    early_stop_metric = train_config.early_stop_metric

    # Load the training data
    datasets, output_dim = load_dataset(vocab_id_mapping=vocab_id_mapping,
                                        max_seq_len=nn_config.seq_len,
                                        with_label=True,
                                        label_version=label_version)

    # Initialize the index iterators of the datasets
    index_iterators = {
        mode: IndexIterator(datasets[mode][LABEL_GOLD])
        for mode in [TRAIN, TEST]
    }
    # Split the training data into a training and a validation set as configured
    index_iterators[TRAIN].split_train_valid(train_config.valid_rate)

    # Compute the weight of each class
    if train_config.use_class_weights:
        label_weight = {
            # Follows the formula of class_weight='balanced' in sklearn;
            # experiments showed a significant effect
            _label: float(index_iterators[TRAIN].n_sample()) /
            (index_iterators[TRAIN].dim * len(_index))
            for _label, _index in index_iterators[TRAIN].label_index.items()
        }
    else:
        # Uniform weights
        label_weight = {
            _label: 1.
            for _label in range(index_iterators[TRAIN].dim)
        }

    # Update the configuration based on the loaded data
    nn_config.set_embedding_dim(embedding_dim)
    nn_config.set_output_dim(output_dim)
    # Build the neural network
    nn = NNModel(config=nn_config)
    nn.build_neural_network(lookup_table=lookup_table)

    batch_size = train_config.batch_size
    # Tensors to fetch per mode, keyed by variable name
    fetches = {
        mode: {_key: nn.var(_key)
               for _key in fetch_key[mode]}
        for mode in [TRAIN, TEST]
    }
    last_eval = {TRAIN: None, VALID: None, TEST: None}

    model_output_prefix = data_config.model_path(key=output_key) + '/model'

    # Best evaluation seen so far and number of epochs without improvement
    best_res = {mode: None for mode in [TRAIN, VALID]}
    no_update_count = {mode: 0 for mode in [TRAIN, VALID]}
    max_no_update_count = 10

    # NOTE(review): the validation history is stored under DEV while the rest
    # of the function uses VALID - confirm both constants are intended.
    eval_history = {TRAIN: list(), DEV: list(), TEST: list()}

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())

        dataset = datasets[TRAIN]
        index_iterator = index_iterators[TRAIN]

        # Training starts ####################################################################
        for epoch in range(train_config.epoch):
            print('== epoch {} = {} ='.format(epoch, output_key))

            # Train on the training set
            print('TRAIN')
            n_sample = index_iterator.n_sample(TRAIN)
            labels_predict = list()
            labels_gold = list()

            for batch_index in index_iterator.iterate(batch_size,
                                                      mode=TRAIN,
                                                      shuffle=True):
                feed_dict = {
                    nn.var(_key): dataset[_key][batch_index]
                    for _key in feed_key[TRAIN]
                }
                # Per-sample weight looked up from the gold label of each sample
                feed_dict[nn.var(SAMPLE_WEIGHTS)] = list(
                    map(label_weight.get, feed_dict[nn.var(LABEL_GOLD)]))
                feed_dict[nn.var(TEST_MODE)] = 0

                # Apply tid_dropout to the three turn inputs when configured
                if train_config.input_dropout_keep_prob < 1.:
                    for _key in [TID_0, TID_1, TID_2]:
                        var = nn.var(_key)
                        _tids = feed_dict[var]
                        feed_dict[var] = tid_dropout(
                            _tids, train_config.input_dropout_keep_prob)

                res = sess.run(fetches=fetches[TRAIN], feed_dict=feed_dict)

                labels_predict += res[LABEL_PREDICT].tolist()
                labels_gold += dataset[LABEL_GOLD][batch_index].tolist()

            # Keep only the first n_sample entries
            labels_predict, labels_gold = labels_predict[:
                                                         n_sample], labels_gold[:
                                                                                n_sample]
            res = basic_evaluate(gold=labels_gold, pred=labels_predict)
            print_evaluation(res)
            eval_history[TRAIN].append(res)

            global_step = tf.train.global_step(sess, nn.var(GLOBAL_STEP))

            if train_config.valid_rate == 0.:
                # No validation set: checkpoint on improvement of the TRAIN metric
                if best_res[TRAIN] is None or res[
                        early_stop_metric] > best_res[TRAIN][early_stop_metric]:
                    best_res[TRAIN] = res
                    no_update_count[TRAIN] = 0
                    saver.save(sess,
                               save_path=model_output_prefix,
                               global_step=global_step)
                else:
                    no_update_count[TRAIN] += 1
            else:
                # Track the best TRAIN result; checkpoints are driven by VALID below
                if best_res[TRAIN] is None or res[
                        early_stop_metric] > best_res[TRAIN][early_stop_metric]:
                    best_res[TRAIN] = res
                    no_update_count[TRAIN] = 0
                else:
                    no_update_count[TRAIN] += 1

                # Evaluate on the validation set without updating the model parameters
                print('VALID')
                n_sample = index_iterator.n_sample(VALID)
                labels_predict = list()
                labels_gold = list()

                for batch_index in index_iterator.iterate(batch_size,
                                                          mode=VALID,
                                                          shuffle=False):
                    feed_dict = {
                        nn.var(_key): dataset[_key][batch_index]
                        for _key in feed_key[TEST]
                    }
                    feed_dict[nn.var(TEST_MODE)] = 1
                    res = sess.run(fetches=fetches[TEST], feed_dict=feed_dict)
                    labels_predict += res[LABEL_PREDICT].tolist()
                    labels_gold += dataset[LABEL_GOLD][batch_index].tolist()

                labels_predict, labels_gold = labels_predict[:
                                                             n_sample], labels_gold[:
                                                                                    n_sample]
                res = basic_evaluate(gold=labels_gold, pred=labels_predict)
                eval_history[DEV].append(res)
                print_evaluation(res)

                # Early Stop
                if best_res[VALID] is None or res[
                        early_stop_metric] > best_res[VALID][early_stop_metric]:
                    saver.save(sess,
                               save_path=model_output_prefix,
                               global_step=global_step)
                    best_res[VALID] = res
                    no_update_count[VALID] = 0
                else:
                    no_update_count[VALID] += 1

            # eval test
            _mode = TEST
            _dataset = datasets[_mode]
            _index_iterator = index_iterators[_mode]
            _n_sample = _index_iterator.n_sample()

            labels_predict = list()
            labels_gold = list()
            for batch_index in _index_iterator.iterate(batch_size,
                                                       shuffle=False):
                feed_dict = {
                    nn.var(_key): _dataset[_key][batch_index]
                    for _key in feed_key[TEST]
                }
                feed_dict[nn.var(TEST_MODE)] = 1
                res = sess.run(fetches=fetches[TEST], feed_dict=feed_dict)
                labels_predict += res[LABEL_PREDICT].tolist()
                labels_gold += _dataset[LABEL_GOLD][batch_index].tolist()
            labels_predict, labels_gold = labels_predict[:
                                                         _n_sample], labels_gold[:
                                                                                 _n_sample]
            res = basic_evaluate(gold=labels_gold, pred=labels_predict)
            eval_history[TEST].append(res)

            # NOTE(review): only the TRAIN counter stops training, even when a
            # validation set is used - confirm this is intended.
            if no_update_count[TRAIN] >= max_no_update_count:
                break

        # Training ends ######################################################################
        # Make sure the output folder exists

    print(
        '========================= BEST ROUND EVALUATION ========================='
    )

    # Per-epoch evaluation history of every mode
    json.dump(eval_history,
              open(data_config.output_path(output_key, 'eval', 'json'), 'w'))

    with tf.Session() as sess:
        # Restore the most recently saved checkpoint
        prefix_checkpoint = tf.train.latest_checkpoint(
            data_config.model_path(key=output_key))
        saver = tf.train.import_meta_graph('{}.meta'.format(prefix_checkpoint))
        saver.restore(sess, prefix_checkpoint)

        # NOTE(review): `nn` is rebound here, but `fetches` was built from the
        # previous `nn` - confirm those tensors are valid in the restored graph.
        nn = BaseNNModel(config=None)
        nn.set_graph(tf.get_default_graph())

        for mode in [TRAIN, TEST]:
            dataset = datasets[mode]
            index_iterator = index_iterators[mode]
            n_sample = index_iterator.n_sample()

            prob_predict = list()
            labels_predict = list()
            labels_gold = list()
            hidden_feats = list()

            for batch_index in index_iterator.iterate(batch_size,
                                                      shuffle=False):
                feed_dict = {
                    nn.var(_key): dataset[_key][batch_index]
                    for _key in feed_key[TEST]
                }
                feed_dict[nn.var(TEST_MODE)] = 1
                res = sess.run(fetches=fetches[TEST], feed_dict=feed_dict)
                prob_predict += res[PROB_PREDICT].tolist()
                labels_predict += res[LABEL_PREDICT].tolist()
                hidden_feats += res[HIDDEN_FEAT].tolist()
                labels_gold += dataset[LABEL_GOLD][batch_index].tolist()

            # Keep only the first n_sample entries
            prob_predict = prob_predict[:n_sample]
            labels_predict = labels_predict[:n_sample]
            labels_gold = labels_gold[:n_sample]
            hidden_feats = hidden_feats[:n_sample]

            if mode == TEST:
                res = basic_evaluate(gold=labels_gold, pred=labels_predict)
                best_res[TEST] = res

            # Export the hidden layer features
            with open(data_config.output_path(output_key, mode, HIDDEN_FEAT),
                      'w') as file_obj:
                for _feat in hidden_feats:
                    file_obj.write('\t'.join(map(str, _feat)) + '\n')
            # Export the predicted labels
            with open(data_config.output_path(output_key, mode, LABEL_PREDICT),
                      'w') as file_obj:
                for _label in labels_predict:
                    file_obj.write('{}\n'.format(_label))
            # Export the predicted probabilities
            with open(data_config.output_path(output_key, mode, PROB_PREDICT),
                      'w') as file_obj:
                for _prob in prob_predict:
                    file_obj.write('\t'.join(map(str, _prob)) + '\n')

        for mode in [TRAIN, VALID, TEST]:
            # There is no VALID result when no validation set was split off
            if mode == VALID and train_config.valid_rate == 0.:
                continue
            res = best_res[mode]
            print(mode)
            print_evaluation(res)

            json.dump(
                res,
                open(data_config.output_path(output_key, mode, EVALUATION),
                     'w'))
            print()

    # Best 'f1' reached on TEST over all epochs
    test_score_list = map(lambda _item: _item['f1'], eval_history[TEST])
    print('best test f1 reached: {}'.format(max(test_score_list)))

    print('OUTPUT_KEY: {}'.format(output_key))
Exemple #16
0
def main(ensemble_mode, config_path='e93.yaml', final_output=None):
    """
    Ensemble the predictions of several models and evaluate the result.

    [Usage]
    python3 -m algo.ensemble93 main -e mv --build-analysis

    The predictions of ``config.components`` are combined by soft voting
    (summed probabilities) or majority voting (label counts) for TRAIN
    (TRAIN + TEST predictions concatenated) and FINAL.  For FINAL the result is
    evaluated and optionally corrected in two steps:

    * ``config.tri_enabled``: a non-zero prediction is replaced by the non-zero
      label that received the most votes from ``config.tri`` when it has at
      least ``config.tri_min_vote`` votes;
    * ``config.others_enabled``: a prediction is set to 0 when at least
      ``config.others_min_vote`` models of ``config.others`` predicted 0.

    :param ensemble_mode: one of SOFT_VOTING, MAJORITY_VOTING, WEIGHTED_MAJORITY_VOTE
    :param config_path: string, path of the ensemble YAML config
    :param final_output: string, when given the FINAL predictions are written
        to this path as a submission file
    :return:
    :raises NotImplementedError: for WEIGHTED_MAJORITY_VOTE
    :raises ValueError: for an unknown ensemble mode
    :raises Exception: if a tri/others model provides a wrong number of predictions
    """
    # NOTE(review): yaml.load without an explicit Loader is unsafe on
    # untrusted input; the config is assumed to be a locally produced file.
    with open(config_path) as file_obj:
        config_data = yaml.load(file_obj)
    config = Config(data=config_data)

    labels_predict = dict()
    labels_predict_last = dict()
    labels_gold = dict()

    n_sample = dict()

    # Each evaluated mode is the concatenation of these stored sub-modes.
    modes = {
        TRAIN: [TRAIN, TEST],
        FINAL: [
            FINAL,
        ]
    }

    for mode in [TRAIN, FINAL]:
        labels = list()
        for _mode in modes[mode]:
            label_path = data_config.path(_mode, LABEL, None)
            labels += load_label_list(label_path)
        labels_gold[mode] = labels
        n_sample[mode] = len(labels)
    output_dim = max(labels_gold[TRAIN]) + 1

    if ensemble_mode == SOFT_VOTING:
        # Iterate over the same modes as majority voting.  The previous
        # [TRAIN, TEST] loop looked up n_sample[TEST], which does not exist,
        # and never produced labels_predict[FINAL], which is needed below.
        for mode in [TRAIN, FINAL]:
            components = dict()
            for output_key in config.components:
                prob_list = list()
                for _mode in modes[mode]:
                    path = data_config.output_path(output_key, _mode,
                                                   PROB_PREDICT)
                    with open(path) as file_obj:
                        for line in file_obj:
                            line = line.strip()
                            if line == '':
                                continue
                            prob = list(map(float, line.split('\t')))
                            prob_list.append(prob)
                components[output_key] = prob_list

            labels = list()
            for i in range(n_sample[mode]):
                # Sum the probabilities of all components for sample i.
                prob = np.zeros((output_dim, ))
                for prob_list in components.values():
                    prob += np.asarray(prob_list[i])
                labels.append(np.argmax(prob))
            labels_predict[mode] = labels

    elif ensemble_mode == MAJORITY_VOTING:
        for mode in [TRAIN, FINAL]:
            components = list()

            for output_key in config.components:
                label_list = list()
                for _mode in modes[mode]:
                    path = data_config.output_path(output_key, _mode,
                                                   LABEL_PREDICT)
                    label_list += load_label_list(path)
                components.append(label_list)

            labels = list()
            for i in range(n_sample[mode]):
                # Count the votes of all components for sample i.
                prob = np.zeros((output_dim, ))
                for label_list in components:
                    label = label_list[i]
                    prob[label] += 1
                labels.append(np.argmax(prob))
            labels_predict[mode] = labels

    elif ensemble_mode == WEIGHTED_MAJORITY_VOTE:
        raise NotImplementedError
    else:
        raise ValueError('unknown mode: {}'.format(ensemble_mode))

    for mode in [TRAIN, FINAL]:
        # Only FINAL is evaluated and post-processed.
        if mode == TRAIN:
            continue

        print('=== {} ==='.format(mode))
        res = basic_evaluate(gold=labels_gold[mode], pred=labels_predict[mode])
        print(mode)
        print_evaluation(res)
        for col in res[CONFUSION_MATRIX]:
            print(','.join(map(str, col)))
        print()

        # Separate name so that the per-mode `n_sample` dict is not clobbered.
        n_mode_sample = len(labels_predict[mode])
        labels_predict_last[mode] = labels_predict[mode]

        # Correct the labels within "HAS" (the non-zero labels)
        if config.tri_enabled:
            n_changed = 0

            votes = [[0 for _ in range(4)] for _ in range(n_mode_sample)]
            for output_key in config.tri:
                labels = list()
                for _mode in modes[mode]:
                    path = data_config.output_path(output_key, _mode,
                                                   LABEL_PREDICT)
                    labels += load_label_list(path)
                if len(labels) != n_mode_sample:
                    raise Exception('mismatch {}({}) != {}'.format(
                        output_key, len(labels), n_mode_sample))

                for i, label in enumerate(labels):
                    votes[i][label] += 1

            # Work on a copy so the previous stage's labels stay untouched.
            base = list(labels_predict_last[mode])
            for i, vote in enumerate(votes):
                if base[i] != 0:
                    arg_max = int(np.argmax(vote))
                    if arg_max != 0 and vote[arg_max] >= config.tri_min_vote:
                        if base[i] != arg_max:
                            n_changed += 1
                        base[i] = arg_max

            print('n_exchanged within "HAS": {}'.format(n_changed))

            labels_predict_last[mode] = base
            res = basic_evaluate(gold=labels_gold[mode], pred=base)
            print(mode, '(after TRI)')
            print_evaluation(res)
            for col in res[CONFUSION_MATRIX]:
                print(','.join(map(str, col)))
            print()

        # Relabel samples predicted as "HAS" to Others (label 0)
        if config.others_enabled:
            votes = [0 for _ in range(n_mode_sample)]
            n_changed = 0

            for output_key in config.others:
                labels = list()
                for _mode in modes[mode]:
                    path = data_config.output_path(output_key, _mode,
                                                   LABEL_PREDICT)
                    labels += load_label_list(path)
                if len(labels) != n_mode_sample:
                    raise Exception('mismatch {}({}) != {}'.format(
                        output_key, len(labels), n_mode_sample))

                for i, label in enumerate(labels):
                    if label == 0:
                        votes[i] += 1
            # 'all' means every model of config.others must have voted 0.
            if config.others_min_vote == 'all':
                min_vote = len(config.others)
            else:
                min_vote = int(config.others_min_vote)
            base = list(labels_predict_last[mode])
            for i, vote in enumerate(votes):
                if vote >= min_vote:
                    if base[i] != 0:
                        n_changed += 1
                    base[i] = 0
            print('n_changed to "OTHERS": {}'.format(n_changed))

            labels_predict_last[mode] = base

            res = basic_evaluate(gold=labels_gold[mode], pred=base)
            print(mode, '(after OTHERS)')
            print_evaluation(res)
            for col in res[CONFUSION_MATRIX]:
                print(','.join(map(str, col)))
            print()

        if mode == FINAL and final_output is not None:
            with open(data_config.path_train, 'r') as file_obj:
                first_line = file_obj.readline()
            with open(final_output, 'w') as o_obj:
                o_obj.write(first_line)

                with open(data_config.path_test_no_labels) as file_obj:
                    lines = file_obj.read().strip().split('\n')
                # Skip the header line of the test file.
                lines = lines[1:]
                lines = list(map(lambda l: l.strip(), lines))

                labels = labels_predict_last[FINAL]
                labels = list(map(lambda l: label_str[l], labels))
                assert len(labels) == len(lines)

                for line, label in zip(lines, labels):
                    o_obj.write('{}\t{}\n'.format(line, label))
Exemple #17
0
def train(model_name,
          label_version=None,
          label_key=None,
          config_path='c93f.yaml',
          check=False):
    """
    Train ``algo.m93.<model_name>.NNModel`` and export its predictions.

    Usage:
        python -m algo.main93_v2 train
        python3 -m algo.main93_v2 train -c config_ntua93.yaml

    The model is trained on the dataset loaded with mode [TRAIN, TEST] and is
    evaluated after every epoch on the dataset loaded with mode FINAL. After
    training, the latest saved checkpoint is restored to write label and
    probability predictions for TRAIN, TEST and FINAL.

    :param model_name: string, name of the module under ``algo.m93`` that
        defines ``NNModel``
    :param label_version: string, forwarded to ``load_dataset``
    :param label_key: string, selects the label map through
        ``train_config.label_map``; prepended to the output key when given
    :param config_path: string, path of the YAML configuration file
    :param check: bool, when true, evaluate the FINAL predictions against the
        gold FINAL labels (gold labels are filtered/remapped through the label
        map when one exists)
    :return:
    """
    text_version = 'ek'
    # NOTE(review): yaml.load is called without an explicit Loader; consider
    # yaml.safe_load if the config can come from an untrusted source.
    with open(config_path) as config_file:
        config_data = yaml.load(config_file)
    NNModel = getattr(
        importlib.import_module('algo.m93.{}'.format(model_name)), 'NNModel')

    output_key = '{}_{}_{}'.format(model_name, text_version, int(time.time()))
    if label_key is not None:
        output_key = '{}_{}'.format(label_key, output_key)
    output_key = 'f_{}'.format(output_key)
    print('OUTPUT_KEY: {}'.format(output_key))

    # Prepare the output and model folders
    data_config.prepare_output_folder(output_key=output_key)
    data_config.prepare_model_folder(output_key=output_key)

    # Keep a copy of the config next to the outputs
    shutil.copy(config_path, data_config.output_path(output_key, ALL, CONFIG))

    w2v_key = '{}_{}'.format(config_data['word']['w2v_version'], text_version)
    w2v_model_path = data_config.path(ALL, WORD2VEC, w2v_key)
    vocab_train_path = data_config.path(TRAIN, VOCAB, text_version)

    # Load the vocabulary.
    # (Original note) the model uses every word vector supported by the w2v
    # model and randomly generates vectors for words that occur often enough.
    vocab_meta_list = load_vocab_list(vocab_train_path)
    vocabs = [
        _meta['t'] for _meta in vocab_meta_list
        if _meta['tf'] >= config_data['word']['min_tf']
    ]

    # Load the word vectors and related data
    lookup_table, vocab_id_mapping, embedding_dim = load_lookup_table2(
        w2v_model_path=w2v_model_path, vocabs=vocabs)
    with open(data_config.output_path(output_key, ALL, VOCAB_ID_MAPPING),
              'w') as file_obj:
        json.dump(vocab_id_mapping, file_obj)

    # Load configuration
    nn_config = NNConfig(config_data)
    train_config = TrainConfig(config_data['train'])
    early_stop_metric = train_config.early_stop_metric

    # Load the data: training uses modes [TRAIN, TEST]; the per-epoch "TEST"
    # evaluation uses mode FINAL.
    datasets = dict()
    datasets[TRAIN], output_dim = load_dataset(
        mode=[TRAIN, TEST],
        vocab_id_mapping=vocab_id_mapping,
        max_seq_len=nn_config.seq_len,
        label_version=label_version,
        sampling=train_config.train_sampling,
        label_map=train_config.label_map(label_key))
    datasets[TEST], _ = load_dataset(
        mode=FINAL,
        vocab_id_mapping=vocab_id_mapping,
        max_seq_len=nn_config.seq_len,
        label_version=label_version,
        sampling=False,
        label_map=train_config.label_map(label_key))

    # Initialise the index iterators over the datasets
    index_iterators = {
        TRAIN: IndexIterator.from_dataset(datasets[TRAIN]),
        TEST: IndexIterator.from_dataset(datasets[TEST])
    }
    # Split the training data into train/validation parts as configured
    index_iterators[TRAIN].split_train_valid(train_config.valid_rate)

    # Compute the weight of each class
    if train_config.use_class_weights:
        label_weight = {
            # Follows the formula behind sklearn's class_weight='balanced';
            # (original note) experiments showed a clear improvement.
            _label: float(index_iterators[TRAIN].n_sample()) /
            (index_iterators[TRAIN].dim * len(_index))
            for _label, _index in index_iterators[TRAIN].label_index.items()
        }
    else:
        label_weight = {
            _label: 1.
            for _label in range(index_iterators[TRAIN].dim)
        }

    # Update the config with values derived from the loaded data
    nn_config.set_embedding_dim(embedding_dim)
    nn_config.set_output_dim(output_dim)
    # Build the neural network
    nn = NNModel(config=nn_config)
    nn.build_neural_network(lookup_table=lookup_table)

    batch_size = train_config.batch_size
    fetches = {
        mode: {_key: nn.var(_key)
               for _key in fetch_key[mode]}
        for mode in [TRAIN, TEST]
    }

    model_output_prefix = data_config.model_path(key=output_key) + '/model'

    best_res = {mode: None for mode in [TRAIN, VALID, TEST]}
    no_update_count = {mode: 0 for mode in [TRAIN, VALID]}
    max_no_update_count = 10

    eval_history = {TRAIN: list(), VALID: list(), TEST: list()}
    best_epoch = -1
    best_epoch_test = -1

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())

        dataset = datasets[TRAIN]
        index_iterator = index_iterators[TRAIN]

        # Training starts #####################################################
        for epoch in range(train_config.epoch):
            print('== epoch {} = {} ='.format(epoch, output_key))

            # Train on the training split
            print('TRAIN')
            n_sample = index_iterator.n_sample(TRAIN)
            labels_predict = list()
            labels_gold = list()

            for batch_index in index_iterator.iterate(batch_size,
                                                      mode=TRAIN,
                                                      shuffle=True):
                feed_dict = {
                    nn.var(_key): dataset[_key][batch_index]
                    for _key in feed_key[TRAIN]
                }
                # Per-sample weight looked up from the gold label of each row
                feed_dict[nn.var(SAMPLE_WEIGHTS)] = list(
                    map(label_weight.get, feed_dict[nn.var(LABEL_GOLD)]))
                feed_dict[nn.var(TEST_MODE)] = 0
                res = sess.run(fetches=fetches[TRAIN], feed_dict=feed_dict)

                labels_predict += res[LABEL_PREDICT].tolist()
                labels_gold += dataset[LABEL_GOLD][batch_index].tolist()

            # Keep only the first n_sample collected entries
            labels_predict = labels_predict[:n_sample]
            labels_gold = labels_gold[:n_sample]
            res = basic_evaluate(gold=labels_gold, pred=labels_predict)
            print_evaluation(res)
            eval_history[TRAIN].append(res)

            global_step = tf.train.global_step(sess, nn.var(GLOBAL_STEP))

            if train_config.valid_rate == 0.:
                # No validation split: checkpoint on training-metric progress.
                # While the last saved epoch is <= 10 the checkpoint is saved
                # unconditionally.
                if best_epoch <= 10 or (best_res[TRAIN] is None
                                        or res[early_stop_metric] >
                                        best_res[TRAIN][early_stop_metric]):
                    best_epoch = epoch
                    best_res[TRAIN] = res
                    no_update_count[TRAIN] = 0
                    saver.save(sess,
                               save_path=model_output_prefix,
                               global_step=global_step)
                else:
                    no_update_count[TRAIN] += 1
            else:
                if best_res[TRAIN] is None or res[
                        early_stop_metric] > best_res[TRAIN][early_stop_metric]:
                    best_res[TRAIN] = res
                    no_update_count[TRAIN] = 0
                else:
                    no_update_count[TRAIN] += 1

                # Evaluate on the validation split; TEST-mode fetches are used
                # (original note: model parameters are not updated here)
                print('VALID')
                n_sample = index_iterator.n_sample(VALID)
                labels_predict = list()
                labels_gold = list()

                for batch_index in index_iterator.iterate(batch_size,
                                                          mode=VALID,
                                                          shuffle=False):
                    feed_dict = {
                        nn.var(_key): dataset[_key][batch_index]
                        for _key in feed_key[TEST]
                    }
                    feed_dict[nn.var(TEST_MODE)] = 1
                    res = sess.run(fetches=fetches[TEST], feed_dict=feed_dict)

                    labels_predict += res[LABEL_PREDICT].tolist()
                    labels_gold += dataset[LABEL_GOLD][batch_index].tolist()

                labels_predict = labels_predict[:n_sample]
                labels_gold = labels_gold[:n_sample]
                res = basic_evaluate(gold=labels_gold, pred=labels_predict)
                eval_history[VALID].append(res)
                print_evaluation(res)

                # Checkpoint on validation-metric progress (unconditionally
                # while the last saved epoch is <= 10)
                if best_epoch <= 10 or (best_res[VALID] is None
                                        or res[early_stop_metric] >
                                        best_res[VALID][early_stop_metric]):
                    best_epoch = epoch
                    saver.save(sess,
                               save_path=model_output_prefix,
                               global_step=global_step)
                    best_res[VALID] = res
                    no_update_count[VALID] = 0
                else:
                    no_update_count[VALID] += 1

            # Evaluate on datasets[TEST] (loaded with mode FINAL)
            _mode = TEST
            _dataset = datasets[_mode]
            _index_iterator = SimpleIndexIterator.from_dataset(_dataset)
            _n_sample = _index_iterator.n_sample()

            labels_predict = list()
            labels_gold = list()
            for batch_index in _index_iterator.iterate(batch_size,
                                                       shuffle=False):
                feed_dict = {
                    nn.var(_key): _dataset[_key][batch_index]
                    for _key in feed_key[TEST]
                }
                feed_dict[nn.var(TEST_MODE)] = 1
                res = sess.run(fetches=fetches[TEST], feed_dict=feed_dict)

                labels_predict += res[LABEL_PREDICT].tolist()
                labels_gold += _dataset[LABEL_GOLD][batch_index].tolist()
            labels_predict = labels_predict[:_n_sample]
            labels_gold = labels_gold[:_n_sample]
            res = basic_evaluate(gold=labels_gold, pred=labels_predict)
            eval_history[TEST].append(res)
            print('TEST')
            print_evaluation(res)

            if best_res[TEST] is None or res[F1_SCORE] > best_res[TEST][
                    F1_SCORE]:
                best_res[TEST] = res
                best_epoch_test = epoch

            # NOTE(review): the stop condition only looks at the TRAIN
            # counter; no_update_count[VALID] is tracked but never used to
            # stop. Confirm this is intended.
            if no_update_count[TRAIN] >= max_no_update_count:
                break

        # Training ends #######################################################

    print(
        '========================= BEST ROUND EVALUATION ========================='
    )

    with open(data_config.output_path(output_key, 'eval', 'json'),
              'w') as file_obj:
        json.dump(eval_history, file_obj)

    labels_predict_final = None
    labels_gold_final = load_label_list(data_config.path(FINAL, LABEL))

    with tf.Session() as sess:
        # Restore the most recently saved checkpoint
        prefix_checkpoint = tf.train.latest_checkpoint(
            data_config.model_path(key=output_key))
        saver = tf.train.import_meta_graph('{}.meta'.format(prefix_checkpoint))
        saver.restore(sess, prefix_checkpoint)

        nn = BaseNNModel(config=None)
        nn.set_graph(tf.get_default_graph())
        for mode in [TRAIN, TEST, FINAL]:
            dataset = load_dataset(mode=mode,
                                   vocab_id_mapping=vocab_id_mapping,
                                   max_seq_len=nn_config.seq_len,
                                   with_label=False)
            index_iterator = SimpleIndexIterator.from_dataset(dataset)
            n_sample = index_iterator.n_sample()

            prob_predict = list()
            labels_predict = list()

            for batch_index in index_iterator.iterate(batch_size,
                                                      shuffle=False):
                feed_dict = {
                    nn.var(_key): dataset[_key][batch_index]
                    for _key in feed_key[TEST]
                }
                feed_dict[nn.var(TEST_MODE)] = 1
                res = sess.run(fetches=fetches[TEST], feed_dict=feed_dict)
                prob_predict += res[PROB_PREDICT].tolist()
                labels_predict += res[LABEL_PREDICT].tolist()

            prob_predict = prob_predict[:n_sample]
            labels_predict = labels_predict[:n_sample]

            if mode == FINAL:
                labels_predict_final = labels_predict

            # Export the predicted labels, one per line
            with open(data_config.output_path(output_key, mode, LABEL_PREDICT),
                      'w') as file_obj:
                for _label in labels_predict:
                    file_obj.write('{}\n'.format(_label))
            # Export the predicted probabilities, tab-separated per sample
            with open(data_config.output_path(output_key, mode, PROB_PREDICT),
                      'w') as file_obj:
                for _prob in prob_predict:
                    file_obj.write('\t'.join(map(str, _prob)) + '\n')

    print('====== best epoch test: {} ======'.format(best_epoch_test))

    for mode in [TRAIN, VALID, TEST]:
        if mode == VALID and train_config.valid_rate == 0.:
            continue

        print(mode)
        res = eval_history[mode][best_epoch_test]
        print_evaluation(res)

        if mode == TEST:
            for col in res[CONFUSION_MATRIX]:
                print(','.join(map(str, col)))

    print(eval_history[TEST][best_epoch_test])
    print()

    print('====== best epoch valid: {} ======'.format(best_epoch))
    for mode in [TRAIN, VALID, TEST]:
        if mode == VALID and train_config.valid_rate == 0.:
            continue

        print(mode)
        res = eval_history[mode][best_epoch]
        print_evaluation(res)

        if mode == TEST:
            for col in res[CONFUSION_MATRIX]:
                print(','.join(map(str, col)))

        with open(data_config.output_path(output_key, mode, EVALUATION),
                  'w') as file_obj:
            json.dump(res, file_obj)
        print()

    print(eval_history[TEST][best_epoch])
    print()

    if check:
        print('====== label_map check ======')

        label_map = train_config.label_map(label_key)
        if label_map is not None:
            # Keep only samples whose gold label is in the map, and remap the
            # gold label; predictions are kept as they are.
            new_gold = list()
            new_pred = list()
            for g, p in zip(labels_gold_final, labels_predict_final):
                if g in label_map:
                    new_gold.append(label_map[g])
                    new_pred.append(p)
            labels_gold_final = new_gold
            labels_predict_final = new_pred

        res = basic_evaluate(gold=labels_gold_final, pred=labels_predict_final)
        print_evaluation(res)
        for col in res[CONFUSION_MATRIX]:
            print(','.join(map(str, col)))

    print('OUTPUT_KEY: {}'.format(output_key))
Exemple #18
0
def train(dataset_key,
          text_version,
          label_version=None,
          config_path='config.yaml'):
    """
    Train the ``NNModel`` of the module named in the config and export results.

    Usage:
        python -m algo.main93 train semeval2019_task3_dev -t ek
        python3 -m algo.main93 train semeval2019_task3_dev -t ek -c config_ntua93.yaml

    After training, the latest saved checkpoint is restored to write hidden
    features, predicted labels and predicted probabilities for TRAIN and TEST,
    and the evaluation results are dumped as JSON.

    :param dataset_key: string (not read inside this function)
    :param text_version: string, text version used for the vocabulary, the
        w2v key and the output key
    :param label_version: string, forwarded to ``load_dataset``; prepended to
        the output key when given
    :param config_path: string, path of the YAML configuration file
    :return:
    :raises ValueError: if ``config_data['analyzer']`` is neither WORD nor CHAR
    """
    # NOTE(review): yaml.load is called without an explicit Loader; consider
    # yaml.safe_load if the config can come from an untrusted source.
    with open(config_path) as config_file:
        config_data = yaml.load(config_file)

    output_key = '{}_{}_{}'.format(config_data['module'].rsplit('.', 1)[1],
                                   text_version, int(time.time()))
    if label_version is not None:
        output_key = '{}_{}'.format(label_version, output_key)
    print('OUTPUT_KEY: {}'.format(output_key))

    # Prepare the output and model folders
    data_config.prepare_output_folder(output_key=output_key)
    data_config.prepare_model_folder(output_key=output_key)

    # Keep a copy of the config next to the outputs
    shutil.copy(config_path, data_config.output_path(output_key, ALL, CONFIG))

    # Load the model module named in the config
    module_relative_path = config_data['module']
    nn_module = importlib.import_module(module_relative_path)
    NNModel = getattr(nn_module, 'NNModel')
    NNConfig = getattr(nn_module, 'NNConfig')

    if config_data['analyzer'] == WORD:
        w2v_key = '{}_{}'.format(config_data['word']['w2v_version'],
                                 text_version)
        w2v_model_path = data_config.path(ALL, WORD2VEC, w2v_key)
        vocab_train_path = data_config.path(TRAIN, VOCAB, text_version)

        # Load the vocabulary.
        # (Original note) the model uses every word vector supported by the
        # w2v model and randomly generates vectors for words that occur often
        # enough.
        vocab_meta_list = load_vocab_list(vocab_train_path)
        vocabs = [
            _meta['t'] for _meta in vocab_meta_list
            if _meta['tf'] >= config_data[WORD]['min_tf']
        ]

        # Load the word vectors and related data
        lookup_table, vocab_id_mapping, embedding_dim = load_lookup_table(
            w2v_model_path=w2v_model_path, vocabs=vocabs)
        with open(data_config.output_path(output_key, ALL, VOCAB_ID_MAPPING),
                  'w') as file_obj:
            json.dump(vocab_id_mapping, file_obj)
        max_seq_len = MAX_WORD_SEQ_LEN
    elif config_data['analyzer'] == CHAR:
        # Character vocabulary = every character seen in the training texts
        texts = load_text_list(data_config.path(TRAIN, TEXT))
        char_set = set()
        for text in texts:
            char_set |= set(text)
        # NOTE(review): unlike the WORD branch, vocab_id_mapping is not
        # written to disk here. Confirm this is intended.
        lookup_table, vocab_id_mapping, embedding_dim = build_random_lookup_table(
            vocabs=char_set, dim=config_data['char']['embedding_dim'])
        max_seq_len = MAX_CHAR_SEQ_LEN
    else:
        raise ValueError('invalid analyzer: {}'.format(
            config_data['analyzer']))

    # Load the training data
    datasets, output_dim = load_dataset(data_config=data_config,
                                        analyzer=config_data['analyzer'],
                                        vocab_id_mapping=vocab_id_mapping,
                                        seq_len=max_seq_len,
                                        with_label=True,
                                        label_version=label_version,
                                        text_version=text_version)

    # Load configuration
    nn_config = NNConfig(config_data)
    train_config = TrainConfig(config_data['train'])

    # Initialise the index iterators over the datasets
    index_iterators = {
        mode: IndexIterator(datasets[mode][LABEL_GOLD])
        for mode in [TRAIN, TEST]
    }
    # Split the training data into train/validation parts as configured
    index_iterators[TRAIN].split_train_valid(train_config.valid_rate)

    # Compute the weight of each class
    if train_config.use_class_weights:
        label_weight = {
            # Follows the formula behind sklearn's class_weight='balanced';
            # (original note) experiments showed a clear improvement.
            _label: float(index_iterators[TRAIN].n_sample()) /
            (index_iterators[TRAIN].dim * len(_index))
            for _label, _index in index_iterators[TRAIN].label_index.items()
        }
    else:
        label_weight = {
            _label: 1.
            for _label in range(index_iterators[TRAIN].dim)
        }

    # Update the config with values derived from the loaded data
    nn_config.set_embedding_dim(embedding_dim)
    nn_config.set_output_dim(output_dim)
    nn_config.set_seq_len(max_seq_len)
    # Build the neural network
    nn = NNModel(config=nn_config)
    nn.build_neural_network(lookup_table=lookup_table)

    batch_size = train_config.batch_size
    fetches = {
        mode: {_key: nn.var(_key)
               for _key in fetch_key[mode]}
        for mode in [TRAIN, TEST]
    }

    model_output_prefix = data_config.model_path(key=output_key) + '/model'

    best_res = {mode: None for mode in [TRAIN, VALID]}
    no_update_count = {mode: 0 for mode in [TRAIN, VALID]}
    max_no_update_count = 10

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())

        dataset = datasets[TRAIN]
        index_iterator = index_iterators[TRAIN]

        # Training starts #####################################################
        for epoch in range(train_config.epoch):
            print('== epoch {} =='.format(epoch))

            # Train on the training split
            print('TRAIN')
            n_sample = index_iterator.n_sample(TRAIN)
            labels_predict = list()
            labels_gold = list()

            for batch_index in index_iterator.iterate(batch_size,
                                                      mode=TRAIN,
                                                      shuffle=True):
                feed_dict = {
                    nn.var(_key): dataset[_key][batch_index]
                    for _key in feed_key[TRAIN]
                }
                # Per-sample weight looked up from the gold label of each row
                feed_dict[nn.var(SAMPLE_WEIGHTS)] = list(
                    map(label_weight.get, feed_dict[nn.var(LABEL_GOLD)]))
                feed_dict[nn.var(TEST_MODE)] = 0
                res = sess.run(fetches=fetches[TRAIN], feed_dict=feed_dict)

                labels_predict += res[LABEL_PREDICT].tolist()
                labels_gold += dataset[LABEL_GOLD][batch_index].tolist()

            # Keep only the first n_sample collected entries
            labels_predict = labels_predict[:n_sample]
            labels_gold = labels_gold[:n_sample]
            res = basic_evaluate(gold=labels_gold, pred=labels_predict)
            print_evaluation(res)

            global_step = tf.train.global_step(sess, nn.var(GLOBAL_STEP))

            if train_config.valid_rate == 0.:
                # No validation split: checkpoint when training F1 improves
                if best_res[TRAIN] is None or res[F1_SCORE] > best_res[TRAIN][
                        F1_SCORE]:
                    best_res[TRAIN] = res
                    no_update_count[TRAIN] = 0
                    saver.save(sess,
                               save_path=model_output_prefix,
                               global_step=global_step)
                else:
                    no_update_count[TRAIN] += 1
            else:
                if best_res[TRAIN] is None or res[F1_SCORE] > best_res[TRAIN][
                        F1_SCORE]:
                    best_res[TRAIN] = res
                    no_update_count[TRAIN] = 0
                else:
                    no_update_count[TRAIN] += 1

                # Evaluate on the validation split; TEST-mode fetches are used
                # (original note: model parameters are not updated here)
                print('VALID')
                n_sample = index_iterator.n_sample(VALID)
                labels_predict = list()
                labels_gold = list()

                for batch_index in index_iterator.iterate(batch_size,
                                                          mode=VALID,
                                                          shuffle=False):
                    feed_dict = {
                        nn.var(_key): dataset[_key][batch_index]
                        for _key in feed_key[TEST]
                    }
                    feed_dict[nn.var(TEST_MODE)] = 1
                    res = sess.run(fetches=fetches[TEST], feed_dict=feed_dict)
                    labels_predict += res[LABEL_PREDICT].tolist()
                    labels_gold += dataset[LABEL_GOLD][batch_index].tolist()

                labels_predict = labels_predict[:n_sample]
                labels_gold = labels_gold[:n_sample]
                res = basic_evaluate(gold=labels_gold, pred=labels_predict)
                print_evaluation(res)

                # Checkpoint when validation F1 improves
                if best_res[VALID] is None or res[F1_SCORE] > best_res[VALID][
                        F1_SCORE]:
                    saver.save(sess,
                               save_path=model_output_prefix,
                               global_step=global_step)
                    best_res[VALID] = res
                    no_update_count[VALID] = 0
                else:
                    no_update_count[VALID] += 1

            # NOTE(review): the stop condition only looks at the TRAIN
            # counter; no_update_count[VALID] is tracked but never used to
            # stop. Confirm this is intended.
            if no_update_count[TRAIN] >= max_no_update_count:
                break

        # Training ends #######################################################

    print(
        '========================= BEST ROUND EVALUATION ========================='
    )

    with tf.Session() as sess:
        # Restore the most recently saved checkpoint
        prefix_checkpoint = tf.train.latest_checkpoint(
            data_config.model_path(key=output_key))
        saver = tf.train.import_meta_graph('{}.meta'.format(prefix_checkpoint))
        saver.restore(sess, prefix_checkpoint)

        nn = BaseNNModel(config=None)
        nn.set_graph(tf.get_default_graph())

        for mode in [TRAIN, TEST]:
            dataset = datasets[mode]
            index_iterator = index_iterators[mode]
            n_sample = index_iterator.n_sample()

            prob_predict = list()
            labels_predict = list()
            labels_gold = list()
            hidden_feats = list()

            for batch_index in index_iterator.iterate(batch_size,
                                                      shuffle=False):
                feed_dict = {
                    nn.var(_key): dataset[_key][batch_index]
                    for _key in feed_key[TEST]
                }
                feed_dict[nn.var(TEST_MODE)] = 1
                res = sess.run(fetches=fetches[TEST], feed_dict=feed_dict)
                prob_predict += res[PROB_PREDICT].tolist()
                labels_predict += res[LABEL_PREDICT].tolist()
                hidden_feats += res[HIDDEN_FEAT].tolist()
                labels_gold += dataset[LABEL_GOLD][batch_index].tolist()

            prob_predict = prob_predict[:n_sample]
            labels_predict = labels_predict[:n_sample]
            labels_gold = labels_gold[:n_sample]
            hidden_feats = hidden_feats[:n_sample]

            if mode == TEST:
                res = basic_evaluate(gold=labels_gold, pred=labels_predict)
                best_res[TEST] = res

            # Export the hidden-layer features, tab-separated per sample
            with open(data_config.output_path(output_key, mode, HIDDEN_FEAT),
                      'w') as file_obj:
                for _feat in hidden_feats:
                    file_obj.write('\t'.join(map(str, _feat)) + '\n')
            # Export the predicted labels, one per line
            with open(data_config.output_path(output_key, mode, LABEL_PREDICT),
                      'w') as file_obj:
                for _label in labels_predict:
                    file_obj.write('{}\n'.format(_label))
            # Export the predicted probabilities, tab-separated per sample
            with open(data_config.output_path(output_key, mode, PROB_PREDICT),
                      'w') as file_obj:
                for _prob in prob_predict:
                    file_obj.write('\t'.join(map(str, _prob)) + '\n')

        for mode in [TRAIN, VALID, TEST]:
            if mode == VALID and train_config.valid_rate == 0.:
                continue
            res = best_res[mode]
            print(mode)
            print_evaluation(res)

            with open(data_config.output_path(output_key, mode, EVALUATION),
                      'w') as file_obj:
                json.dump(res, file_obj)
            print()

    print('OUTPUT_KEY: {}'.format(output_key))
Exemple #19
0
def show_config(output_key):
    """
    Print the config file stored for ``output_key``, followed by its path.

    :param output_key: string, key passed to ``data_config.output_path``
    :return:
    """
    path = data_config.output_path(output_key, ALL, CONFIG)
    # Close the file deterministically instead of leaking the handle
    with open(path) as config_file:
        print(config_file.read())
    print(path)
Exemple #20
0
def main(input_filename, config_path='e93.yaml', final_output=None):
    """
    Revise FINAL predictions by voting with the label files of other runs.

    [Usage]
    python3 -m algo.ensemble93 main -e mv --build-analysis

    The base predictions are the last field of every item returned by
    ``Processor.load_origin(input_filename)``. Two optional correction passes
    follow, each driven by the ``LABEL_PREDICT`` files of the output keys
    listed in the config:

    * ``config.tri_enabled``: per-sample majority label over ``config.tri``
      may replace the base label when it is non-zero and has enough votes.
    * ``config.others_enabled``: the base label is set to 0 when at least
      ``min_vote`` runs in ``config.others`` predicted 0.

    :param input_filename: string, file loaded through ``Processor.load_origin``
    :param config_path: string, path of the YAML configuration file
    :param final_output: string or None, when given the revised labels are
        passed to ``export_final``
    :return:
    :raises Exception: if a run's label count differs from the number of
        base predictions
    """
    # NOTE(review): yaml.load is called without an explicit Loader; consider
    # yaml.safe_load if the config can come from an untrusted source.
    with open(config_path) as config_file:
        config_data = yaml.load(config_file)
    config = Config(data=config_data)

    # NOTE(review): labels_gold is never populated here, so the non-FINAL
    # evaluation branches below would raise KeyError if another mode were
    # added to the loop.
    labels_gold = dict()
    labels_predict = dict()
    labels_predict_last = dict()

    dataset = Processor.load_origin(input_filename)
    labels_predict[FINAL] = [_item[-1] for _item in dataset]

    for mode in [
            FINAL,
    ]:
        if mode != FINAL:
            res = basic_evaluate(gold=labels_gold[mode],
                                 pred=labels_predict[mode])
            print(mode)
            print_evaluation(res)
            for col in res[CONFUSION_MATRIX]:
                print(','.join(map(str, col)))
            print()

        n_sample = len(labels_predict[mode])
        labels_predict_last[mode] = labels_predict[mode]

        # Correct the "HAS" labels (presumably the non-zero, non-OTHERS
        # classes; verify against the label definition)
        if config.tri_enabled:
            n_changed = 0

            # One 4-slot vote counter per sample, indexed by predicted label
            votes = [[0 for _ in range(4)] for _ in range(n_sample)]
            for output_key in config.tri:
                labels = list()
                for _mode in modes[mode]:
                    path = data_config.output_path(output_key, _mode,
                                                   LABEL_PREDICT)
                    labels += load_label_list(path)
                if len(labels) != n_sample:
                    raise Exception('mismatch {}({}) != {}'.format(
                        output_key, len(labels), n_sample))

                for i, label in enumerate(labels):
                    votes[i][label] += 1

            # Work on a copy so the previous stage's list is not mutated
            base = list(labels_predict_last[mode])
            for i, vote in enumerate(votes):
                arg_max = int(np.argmax(vote))
                # A majority for label 0 never changes the base label here
                if arg_max == 0:
                    continue
                if base[i] != 0:
                    # Base is already non-zero: switch when enough runs agree
                    if vote[arg_max] >= config.tri_min_vote:
                        if base[i] != arg_max:
                            n_changed += 1
                        base[i] = arg_max
                elif vote[arg_max] >= config.tri_out_vote:
                    # Base is 0: move it to the majority label only when the
                    # separate tri_out_vote threshold is reached
                    base[i] = arg_max
                    n_changed += 1

            print('n_exchanged within "HAS": {}'.format(n_changed))

            labels_predict_last[mode] = base
            if mode != FINAL:
                res = basic_evaluate(gold=labels_gold[mode], pred=base)
                print(mode, '(after TRI)')
                print_evaluation(res)
                for col in res[CONFUSION_MATRIX]:
                    print(','.join(map(str, col)))
                print()

        # Revise samples to label 0 ("OTHERS") when enough runs predict 0
        if config.others_enabled:
            votes = [0 for _ in range(n_sample)]
            n_changed = 0

            for output_key in config.others:
                labels = list()
                for _mode in modes[mode]:
                    path = data_config.output_path(output_key, _mode,
                                                   LABEL_PREDICT)
                    labels += load_label_list(path)
                if len(labels) != n_sample:
                    raise Exception('mismatch {}({}) != {}'.format(
                        output_key, len(labels), n_sample))

                for i, label in enumerate(labels):
                    if label == 0:
                        votes[i] += 1
            # 'all' means every listed run has to vote for label 0
            if config.others_min_vote == 'all':
                min_vote = len(config.others)
            else:
                min_vote = int(config.others_min_vote)
            # Work on a copy so the previous stage's list is not mutated
            base = list(labels_predict_last[mode])
            for i, vote in enumerate(votes):
                if vote >= min_vote:
                    if base[i] != 0:
                        n_changed += 1
                    base[i] = 0
            print('n_changed to "OTHERS": {}'.format(n_changed))

            labels_predict_last[mode] = base
            if mode != FINAL:
                res = basic_evaluate(gold=labels_gold[mode], pred=base)
                print(mode, '(after OTHERS)')
                print_evaluation(res)
                for col in res[CONFUSION_MATRIX]:
                    print(','.join(map(str, col)))
                print()

        if mode == FINAL and final_output is not None:
            labels = labels_predict_last[FINAL]
            export_final(final_output, labels)