Example #1
def load_dataset(vocab_id_mapping,
                 max_seq_len,
                 with_label=True,
                 label_version=None,
                 text_version=None):
    def seq_to_len_list(seq_list):
        return list(map(len, seq_list))

    def zero_pad_seq_list(seq_list, seq_len):
        return list(
            map(lambda _seq: _seq + [0] * (seq_len - len(_seq)), seq_list))

    def trim_tid_list(tid_list, max_len):
        return list(map(lambda _seq: _seq[:max_len], tid_list))

    datasets = dict()
    for mode in [TRAIN, TEST]:
        tid_list_0 = tokenized_to_tid_list(
            load_tokenized_list(data_config.path(mode, TURN, '0.ek')),
            vocab_id_mapping)
        tid_list_0 = trim_tid_list(tid_list_0, max_seq_len)
        seq_len_0 = seq_to_len_list(tid_list_0)

        tid_list_1 = tokenized_to_tid_list(
            load_tokenized_list(data_config.path(mode, TURN, '1.ek')),
            vocab_id_mapping)
        tid_list_1 = trim_tid_list(tid_list_1, max_seq_len)
        seq_len_1 = seq_to_len_list(tid_list_1)

        tid_list_2 = tokenized_to_tid_list(
            load_tokenized_list(data_config.path(mode, TURN, '2.ek')),
            vocab_id_mapping)
        tid_list_2 = trim_tid_list(tid_list_2, max_seq_len)
        seq_len_2 = seq_to_len_list(tid_list_2)

        datasets[mode] = {
            TID_0: tid_list_0,
            TID_1: tid_list_1,
            TID_2: tid_list_2,
            SEQ_LEN_0: np.asarray(seq_len_0),
            SEQ_LEN_1: np.asarray(seq_len_1),
            SEQ_LEN_2: np.asarray(seq_len_2),
        }
        if with_label:
            label_path = data_config.path(mode, LABEL, label_version)
            label_list = load_label_list(label_path)
            datasets[mode][LABEL_GOLD] = np.asarray(label_list)

    for mode in [TRAIN, TEST]:
        for key in [TID_0, TID_1, TID_2]:
            datasets[mode][key] = np.asarray(
                zero_pad_seq_list(datasets[mode][key], max_seq_len))

    if with_label:
        output_dim = max(datasets[TRAIN][LABEL_GOLD]) + 1
        return datasets, output_dim
    else:
        return datasets
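The three inner helpers implement a trim-then-measure-then-pad pipeline: sequences are cut to max_seq_len, lengths are recorded, and only then are the sequences zero-padded. A minimal standalone sketch of that behavior (the token ids are hypothetical; 0 is assumed to be the padding id):

_seqs = [[5, 3, 9, 2, 7], [4, 1]]
_max_len = 4
_trimmed = [s[:_max_len] for s in _seqs]                     # [[5, 3, 9, 2], [4, 1]]
_lengths = [len(s) for s in _trimmed]                        # [4, 2]
_padded = [s + [0] * (_max_len - len(s)) for s in _trimmed]  # [[5, 3, 9, 2], [4, 1, 0, 0]]
# Lengths are taken after trimming but before padding, so a downstream
# mask or dynamic RNN can ignore the zero-padded positions.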
Example #2
def live_test(output_key):
    config_path = data_config.output_path(output_key, ALL, CONFIG)
    config_data = yaml.safe_load(open(config_path))
    nn_config = NNConfig(config_data)
    vocab_id_mapping = json.load(open(data_config.output_path(output_key, ALL, VOCAB_ID_MAPPING), 'r'))

    with tf.Session() as sess:
        prefix_checkpoint = tf.train.latest_checkpoint(data_config.model_path(key=output_key))
        saver = tf.train.import_meta_graph('{}.meta'.format(prefix_checkpoint))
        saver.restore(sess, prefix_checkpoint)

        nn = BaseNNModel(config=None)
        nn.set_graph(tf.get_default_graph())

        fetches = {_key: nn.var(_key) for _key in [LABEL_PREDICT, PROB_PREDICT]}
        while True:
            res = input('input: ')
            if res == 'quit':
                break

            turns = res.strip().split('|')
            if len(turns) != 3:
                print('invalid turns')
                continue

            tokens_list = list()
            for turn in turns:
                tokens = re.sub(r'\s+', ' ', turn.strip()).split(' ')
                tokens_list.append(tokens)

            placeholder = [[]] * (nn_config.batch_size - 1)
            tid_list_0 = tokenized_to_tid_list([tokens_list[0], ] + placeholder, vocab_id_mapping)
            tid_list_1 = tokenized_to_tid_list([tokens_list[1], ] + placeholder, vocab_id_mapping)
            tid_list_2 = tokenized_to_tid_list([tokens_list[2], ] + placeholder, vocab_id_mapping)

            tid_0 = np.asarray(zero_pad_seq_list(tid_list_0, nn_config.seq_len))
            tid_1 = np.asarray(zero_pad_seq_list(tid_list_1, nn_config.seq_len))
            tid_2 = np.asarray(zero_pad_seq_list(tid_list_2, nn_config.seq_len))

            feed_dict = {
                nn.var(TID_0): tid_0,
                nn.var(TID_1): tid_1,
                nn.var(TID_2): tid_2,
                nn.var(TEST_MODE): 1
            }
            res = sess.run(fetches=fetches, feed_dict=feed_dict)
            label = res[LABEL_PREDICT][0]
            prob = res[PROB_PREDICT][0]
            print('label: {}'.format(label))
            print('prob: {}'.format(prob))
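The restored graph expects a fixed batch size, so the single live input is padded out with nn_config.batch_size - 1 empty token lists, and only row 0 of the returned predictions is read back. A minimal sketch of that padding step (the batch size and tokens are hypothetical):

batch_size = 4                          # hypothetical; the example reads it from nn_config
tokens_list = [['i', 'am', 'fine']]
placeholder = [[]] * (batch_size - 1)
batch = [tokens_list[0]] + placeholder  # [['i', 'am', 'fine'], [], [], []]
# After id-mapping and zero-padding, every row has length seq_len,
# but only batch[0] carries real input; the other rows are discarded.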
Example #3
def load_dataset(mode,
                 vocab_id_mapping,
                 max_seq_len,
                 sampling=False,
                 with_label=True,
                 label_version=None):
    dataset = dict()
    tid_list = tokenized_to_tid_list(
        load_tokenized_list(data_config.path(mode, TEXT, EK)),
        vocab_id_mapping)
    dataset[TID] = tid_list
    print('{}: {}'.format(mode, max(map(len, tid_list))))

    if with_label:
        label_path = data_config.path(mode, LABEL, label_version)
        label_list = load_label_list(label_path)
        dataset[LABEL_GOLD] = np.asarray(label_list)

    if sampling:
        dataset = custom_sampling(dataset)

    dataset[TID], dataset[SEQ_LEN] = to_nn_input(dataset[TID],
                                                 max_seq_len=max_seq_len)

    if with_label:
        output_dim = max(dataset[LABEL_GOLD]) + 1
        return dataset, output_dim
    else:
        return dataset
Example #4
def load_dataset(mode, vocab_id_mapping, max_seq_len, sampling=False, with_label=True, label_version=None):
    dataset = dict()
    for i in [0, 1, 2]:
        tid_list = tokenized_to_tid_list(
            load_tokenized_list(data_config.path(mode, TURN, '{}.ek'.format(i))),
            vocab_id_mapping
        )
        dataset[TID_[i]] = tid_list

    if with_label:
        label_path = data_config.path(mode, LABEL, label_version)
        label_list = load_label_list(label_path)
        dataset[LABEL_GOLD] = np.asarray(label_list)

    if sampling:
        dataset = custom_sampling(dataset)

    for i in [0, 1, 2]:
        dataset[TID_[i]], dataset[SEQ_LEN_[i]] = to_nn_input(dataset[TID_[i]], max_seq_len=max_seq_len)

    if with_label:
        output_dim = max(dataset[LABEL_GOLD]) + 1
        return dataset, output_dim
    else:
        return dataset
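to_nn_input itself is not shown in these snippets, but judging from Examples #1 and #10, which inline the same logic, a plausible equivalent is the following (the name appears in the source; this body is an assumption):

import numpy as np

def to_nn_input(tid_list, max_seq_len):
    # Assumed behavior, reconstructed from Examples #1 and #10:
    # trim each sequence, record its length, then zero-pad to max_seq_len.
    trimmed = [seq[:max_seq_len] for seq in tid_list]
    seq_len = np.asarray([len(seq) for seq in trimmed])
    padded = np.asarray(
        [seq + [0] * (max_seq_len - len(seq)) for seq in trimmed])
    return padded, seq_len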
Example #5
def load_dataset(vocab_id_mapping,
                 max_seq_len,
                 with_label=True,
                 label_version=None):
    datasets = dict()
    for mode in [TRAIN, TEST]:
        datasets[mode] = dict()
        for i in [0, 1, 2]:
            tid_list = tokenized_to_tid_list(
                load_tokenized_list(
                    data_config.path(mode, TURN, '{}.ek'.format(i))),
                vocab_id_mapping)
            datasets[mode][TID_[i]] = tid_list
        if with_label:
            label_path = data_config.path(mode, LABEL, label_version)
            label_list = load_label_list(label_path)
            datasets[mode][LABEL_GOLD] = np.asarray(label_list)

    datasets[TRAIN] = custom_sampling(datasets[TRAIN])

    for mode in [TRAIN, TEST]:
        for i in [0, 1, 2]:
            datasets[mode][TID_[i]], datasets[mode][SEQ_LEN_[i]] = to_nn_input(
                datasets[mode][TID_[i]], max_seq_len=max_seq_len)

    if with_label:
        output_dim = max(datasets[TRAIN][LABEL_GOLD]) + 1
        return datasets, output_dim
    else:
        return datasets
Example #6
def load_dataset(data_config,
                 analyzer,
                 vocab_id_mapping,
                 seq_len,
                 with_label=True,
                 label_version=None,
                 text_version=None):
    def seq_to_len_list(seq_list):
        return list(map(len, seq_list))

    def zero_pad_seq_list(seq_list, seq_len):
        return list(
            map(lambda _seq: _seq + [0] * (seq_len - len(_seq)), seq_list))

    datasets = dict()
    for mode in [TRAIN, TEST]:
        if analyzer == WORD:
            text_path = data_config.path(mode, TEXT, text_version)
            tokenized_list = load_tokenized_list(text_path)
        elif analyzer == CHAR:
            text_path = data_config.path(mode, TEXT)
            text_list = load_text_list(text_path)
            tokenized_list = list(map(list, text_list))
        else:
            raise ValueError('invalid analyzer, got {}'.format(analyzer))

        tid_list = tokenized_to_tid_list(tokenized_list, vocab_id_mapping)
        seq_len_list = seq_to_len_list(tid_list)

        datasets[mode] = {
            TOKEN_ID_SEQ: tid_list,
            SEQ_LEN: np.asarray(seq_len_list),
        }
        if with_label:
            label_path = data_config.path(mode, LABEL, label_version)
            label_list = load_label_list(label_path)
            datasets[mode][LABEL_GOLD] = np.asarray(label_list)

    max_seq_len = -1
    for _dataset in datasets.values():
        max_seq_len = max(max_seq_len, _dataset[SEQ_LEN].max() + 1)

    if seq_len < max_seq_len:
        raise ValueError('seq_len set as {}, got max seq_len = {}'.format(
            seq_len, max_seq_len))

    for mode in [TRAIN, TEST]:
        datasets[mode][TOKEN_ID_SEQ] = np.asarray(
            zero_pad_seq_list(datasets[mode][TOKEN_ID_SEQ], seq_len))

    if with_label:
        output_dim = max(datasets[TRAIN][LABEL_GOLD]) + 1
        return datasets, output_dim
    else:
        return datasets
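For the CHAR analyzer, list(map(list, text_list)) tokenizes each text into its individual characters, for example:

text_list = ['hi there']                # hypothetical raw text
tokenized = list(map(list, text_list))  # [['h', 'i', ' ', 't', 'h', 'e', 'r', 'e']]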
Example #7
def check_wrong(output_key, w2v_key='ntua_ek'):
    mode = TEST
    path = data_config.output_path(output_key, mode, LABEL_PREDICT)
    pred = load_label_list(path)

    path = data_config.path(mode, LABEL)
    gold = load_label_list(path)

    w2v_model_path = data_config.path(ALL, WORD2VEC, w2v_key)
    vocab_train_path = data_config.path(TRAIN, VOCAB, 'ek')

    # Load the vocabulary
    # The model uses every word vector available in the pretrained model, and
    # randomly initializes vectors for words that occur frequently enough
    vocab_meta_list = load_vocab_list(vocab_train_path)
    vocabs = [_meta['t'] for _meta in vocab_meta_list if _meta['tf'] >= 2]

    # Load the word vectors and related data
    lookup_table, vocab_id_mapping, embedding_dim = load_lookup_table(
        w2v_model_path=w2v_model_path, vocabs=vocabs)

    tokens_0 = load_tokenized_list(data_config.path(mode, TURN, '0.ek'))
    tokens_1 = load_tokenized_list(data_config.path(mode, TURN, '1.ek'))
    tokens_2 = load_tokenized_list(data_config.path(mode, TURN, '2.ek'))
    tid_list_0 = tokenized_to_tid_list(tokens_0, vocab_id_mapping)
    tid_list_1 = tokenized_to_tid_list(tokens_1, vocab_id_mapping)
    tid_list_2 = tokenized_to_tid_list(tokens_2, vocab_id_mapping)

    max_seq_len = 0
    for p, g, tid_0, tid_1, tid_2, tk_0, tk_1, tk_2 in zip(
            pred, gold, tid_list_0, tid_list_1, tid_list_2, tokens_0, tokens_1,
            tokens_2):
        if p != g and (len(tid_0) > 30 or len(tid_1) > 30 or len(tid_2) > 30):
            print('pred: {}, gold: {}'.format(p, g))
            print('turn0: {}'.format(' '.join(tk_0)))
            print('turn1: {}'.format(' '.join(tk_1)))
            print('turn2: {}'.format(' '.join(tk_2)))

        if p != g:
            max_seq_len = max(max_seq_len, len(tid_0), len(tid_1), len(tid_2))
    print(max_seq_len)
Example #8
def load_dataset(mode,
                 vocab_id_mapping,
                 max_seq_len,
                 sampling=False,
                 label_map=None,
                 with_label=True,
                 label_version=None):
    modes = mode if isinstance(mode, list) else [mode]

    dataset = dict()
    for i in [0, 1, 2]:
        tid_list = list()
        for mode in modes:
            tid_list += tokenized_to_tid_list(
                load_tokenized_list(
                    data_config.path(mode, TURN, '{}.ek'.format(i))),
                vocab_id_mapping)
        dataset[TID_[i]] = tid_list

    if with_label:
        label_list = list()
        for mode in modes:
            label_path = data_config.path(mode, LABEL, label_version)
            label_list += load_label_list(label_path)

        if label_map is not None:
            new_tid_list_ = [list() for _ in range(3)]
            new_label_list = list()
            for idx, label in enumerate(label_list):
                if label in label_map:
                    for i in range(3):
                        new_tid_list_[i].append(dataset[TID_[i]][idx])
                    new_label_list.append(label_map[label])
            for i in range(3):
                dataset[TID_[i]] = new_tid_list_[i]
            label_list = new_label_list

        dataset[LABEL_GOLD] = label_list

    if sampling:
        dataset = custom_sampling(dataset)

    for i in [0, 1, 2]:
        dataset[TID_[i]], dataset[SEQ_LEN_[i]] = to_nn_input(
            dataset[TID_[i]], max_seq_len=max_seq_len)

    if with_label:
        dataset[LABEL_GOLD] = np.asarray(dataset[LABEL_GOLD])
        output_dim = max(dataset[LABEL_GOLD]) + 1
        return dataset, output_dim
    else:
        return dataset
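The label_map branch both filters and renames: examples whose gold label is absent from the map are dropped from all three turn lists, and the kept labels are replaced by their mapped values. A small standalone illustration of that logic (labels and mapping are hypothetical):

label_map = {'happy': 0, 'sad': 1}      # 'others' intentionally absent
labels = ['happy', 'others', 'sad']
turn = ['t0a', 't0b', 't0c']            # one turn list, aligned with labels
kept = [i for i, lab in enumerate(labels) if lab in label_map]
new_labels = [label_map[labels[i]] for i in kept]  # [0, 1]
new_turn = [turn[i] for i in kept]                 # ['t0a', 't0c']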
Example #9
def load_dataset(mode,
                 vocab_id_mapping,
                 max_seq_len,
                 sampling=False,
                 with_label=True,
                 label_version=None,
                 filter_others=False):
    modes = mode if isinstance(mode, list) else [mode]

    dataset = dict()
    for i in [0, 1, 2]:
        tid_list = list()
        for mode in modes:
            tid_list += tokenized_to_tid_list(
                load_tokenized_list(
                    data_config.path(mode, TURN, '{}.ek'.format(i))),
                vocab_id_mapping)
        dataset[TID_[i]] = tid_list

    if with_label:
        label_list = list()
        for mode in modes:
            label_path = data_config.path(mode, LABEL, label_version)
            label_list += load_label_list(label_path)
        dataset[LABEL_GOLD] = label_list

    if filter_others:
        select_index = build_select_index(dataset[LABEL_GOLD])
        for k, v in dataset.items():
            dataset[k] = filter_by_index(v, select_index)

    if sampling:
        dataset = custom_sampling(dataset)

    for i in [0, 1, 2]:
        dataset[TID_[i]], dataset[SEQ_LEN_[i]] = to_nn_input(
            dataset[TID_[i]], max_seq_len=max_seq_len)

    if with_label:
        dataset[LABEL_GOLD] = np.asarray(dataset[LABEL_GOLD])
        output_dim = max(dataset[LABEL_GOLD]) + 1
        return dataset, output_dim
    else:
        return dataset
Example #10
def load_dataset(data_config,
                 vocab_id_mapping,
                 seq_len,
                 with_label=True,
                 label_version=None,
                 text_version=None):
    def seq_to_len_list(seq_list):
        return list(map(len, seq_list))

    def zero_pad_seq_list(seq_list, seq_len):
        return list(
            map(lambda _seq: _seq + [0] * (seq_len - len(_seq)), seq_list))

    def trim_tid_list(tid_list, max_len):
        return list(map(lambda _seq: _seq[:max_len], tid_list))

    datasets = dict()
    for mode in [TRAIN, TEST]:
        tid_list_0 = tokenized_to_tid_list(
            load_tokenized_list(data_config.path(mode, TURN, '0.ek')),
            vocab_id_mapping)
        tid_list_0 = trim_tid_list(tid_list_0, MAX_SEQ_LEN)
        seq_len_0 = seq_to_len_list(tid_list_0)

        tid_list_1 = tokenized_to_tid_list(
            load_tokenized_list(data_config.path(mode, TURN, '1.ek')),
            vocab_id_mapping)
        tid_list_1 = trim_tid_list(tid_list_1, MAX_SEQ_LEN)
        seq_len_1 = seq_to_len_list(tid_list_1)

        tid_list_2 = tokenized_to_tid_list(
            load_tokenized_list(data_config.path(mode, TURN, '2.ek')),
            vocab_id_mapping)
        tid_list_2 = trim_tid_list(tid_list_2, MAX_SEQ_LEN)
        seq_len_2 = seq_to_len_list(tid_list_2)

        datasets[mode] = {
            TID_0: tid_list_0,
            TID_1: tid_list_1,
            TID_2: tid_list_2,
            SEQ_LEN_0: np.asarray(seq_len_0),
            SEQ_LEN_1: np.asarray(seq_len_1),
            SEQ_LEN_2: np.asarray(seq_len_2),
        }
        if with_label:
            label_path = data_config.path(mode, LABEL, label_version)
            label_list = load_label_list(label_path)
            datasets[mode][LABEL_GOLD] = np.asarray(label_list)

    max_seq_len = -1
    for mode, _dataset in datasets.items():
        print(mode, _dataset[SEQ_LEN_0].max(), _dataset[SEQ_LEN_1].max(),
              _dataset[SEQ_LEN_2].max())
        max_seq_len = max(max_seq_len, _dataset[SEQ_LEN_0].max(),
                          _dataset[SEQ_LEN_1].max(), _dataset[SEQ_LEN_2].max())

    if seq_len < max_seq_len:
        raise ValueError('seq_len set as {}, got max seq_len = {}'.format(
            seq_len, max_seq_len))

    for mode in [TRAIN, TEST]:
        for key in [TID_0, TID_1, TID_2]:
            datasets[mode][key] = np.asarray(
                zero_pad_seq_list(datasets[mode][key], seq_len))

    if with_label:
        output_dim = max(datasets[TRAIN][LABEL_GOLD]) + 1
        return datasets, output_dim
    else:
        return datasets