def load_dataset(vocab_id_mapping, max_seq_len, with_label=True, label_version=None, text_version=None):
    def seq_to_len_list(seq_list):
        return list(map(len, seq_list))

    def zero_pad_seq_list(seq_list, seq_len):
        return list(map(lambda _seq: _seq + [0] * (seq_len - len(_seq)), seq_list))

    def trim_tid_list(tid_list, max_len):
        return list(map(lambda _seq: _seq[:max_len], tid_list))

    datasets = dict()
    for mode in [TRAIN, TEST]:
        tid_list_0 = tokenized_to_tid_list(
            load_tokenized_list(data_config.path(mode, TURN, '0.ek')), vocab_id_mapping)
        tid_list_0 = trim_tid_list(tid_list_0, max_seq_len)
        seq_len_0 = seq_to_len_list(tid_list_0)

        tid_list_1 = tokenized_to_tid_list(
            load_tokenized_list(data_config.path(mode, TURN, '1.ek')), vocab_id_mapping)
        tid_list_1 = trim_tid_list(tid_list_1, max_seq_len)
        seq_len_1 = seq_to_len_list(tid_list_1)

        tid_list_2 = tokenized_to_tid_list(
            load_tokenized_list(data_config.path(mode, TURN, '2.ek')), vocab_id_mapping)
        tid_list_2 = trim_tid_list(tid_list_2, max_seq_len)
        seq_len_2 = seq_to_len_list(tid_list_2)

        datasets[mode] = {
            TID_0: tid_list_0,
            TID_1: tid_list_1,
            TID_2: tid_list_2,
            SEQ_LEN_0: np.asarray(seq_len_0),
            SEQ_LEN_1: np.asarray(seq_len_1),
            SEQ_LEN_2: np.asarray(seq_len_2),
        }
        if with_label:
            label_path = data_config.path(mode, LABEL, label_version)
            label_list = load_label_list(label_path)
            datasets[mode][LABEL_GOLD] = np.asarray(label_list)

    for mode in [TRAIN, TEST]:
        for key in [TID_0, TID_1, TID_2]:
            datasets[mode][key] = np.asarray(
                zero_pad_seq_list(datasets[mode][key], max_seq_len))

    if with_label:
        output_dim = max(datasets[TRAIN][LABEL_GOLD]) + 1
        return datasets, output_dim
    else:
        return datasets
def live_test(output_key):
    config_path = data_config.output_path(output_key, ALL, CONFIG)
    config_data = yaml.safe_load(open(config_path))
    nn_config = NNConfig(config_data)
    vocab_id_mapping = json.load(open(data_config.output_path(output_key, ALL, VOCAB_ID_MAPPING), 'r'))

    with tf.Session() as sess:
        # Restore the latest checkpoint of the trained model.
        prefix_checkpoint = tf.train.latest_checkpoint(data_config.model_path(key=output_key))
        saver = tf.train.import_meta_graph('{}.meta'.format(prefix_checkpoint))
        saver.restore(sess, prefix_checkpoint)

        nn = BaseNNModel(config=None)
        nn.set_graph(tf.get_default_graph())
        fetches = {_key: nn.var(_key) for _key in [LABEL_PREDICT, PROB_PREDICT]}

        while True:
            res = input('input: ')
            if res == 'quit':
                break
            turns = res.strip().split('|')
            if len(turns) != 3:
                print('invalid turns')
                continue

            tokens_list = list()
            for turn in turns:
                tokens = re.sub(r'\s+', ' ', turn.strip()).split(' ')
                tokens_list.append(tokens)

            # The graph expects a full batch; pad it with empty sequences so
            # only the first row carries the user input.
            placeholder = [[]] * (nn_config.batch_size - 1)
            tid_list_0 = tokenized_to_tid_list([tokens_list[0], ] + placeholder, vocab_id_mapping)
            tid_list_1 = tokenized_to_tid_list([tokens_list[1], ] + placeholder, vocab_id_mapping)
            tid_list_2 = tokenized_to_tid_list([tokens_list[2], ] + placeholder, vocab_id_mapping)
            tid_0 = np.asarray(zero_pad_seq_list(tid_list_0, nn_config.seq_len))
            tid_1 = np.asarray(zero_pad_seq_list(tid_list_1, nn_config.seq_len))
            tid_2 = np.asarray(zero_pad_seq_list(tid_list_2, nn_config.seq_len))

            feed_dict = {
                nn.var(TID_0): tid_0,
                nn.var(TID_1): tid_1,
                nn.var(TID_2): tid_2,
                nn.var(TEST_MODE): 1
            }
            res = sess.run(fetches=fetches, feed_dict=feed_dict)
            label = res[LABEL_PREDICT][0]
            prob = res[PROB_PREDICT][0]
            print('label: {}'.format(label))
            print('prob: {}'.format(prob))
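# The interactive prompt above expects the three dialogue turns separated by
# "|" (the text of the input line is illustrative):
#
#   input: hey how are you|i am doing great|glad to hear that
#
# It then prints the predicted label id and the class probability vector for
# the first (real) row of the batch.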
def load_dataset(mode, vocab_id_mapping, max_seq_len, sampling=False, with_label=True, label_version=None):
    dataset = dict()
    tid_list = tokenized_to_tid_list(
        load_tokenized_list(data_config.path(mode, TEXT, EK)), vocab_id_mapping)
    dataset[TID] = tid_list
    # Report the longest sequence in this split.
    print('{}: {}'.format(mode, max(map(len, tid_list))))

    if with_label:
        label_path = data_config.path(mode, LABEL, label_version)
        label_list = load_label_list(label_path)
        dataset[LABEL_GOLD] = np.asarray(label_list)

    if sampling:
        dataset = custom_sampling(dataset)

    dataset[TID], dataset[SEQ_LEN] = to_nn_input(dataset[TID], max_seq_len=max_seq_len)
    if with_label:
        output_dim = max(dataset[LABEL_GOLD]) + 1
        return dataset, output_dim
    else:
        return dataset
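# `to_nn_input` is defined elsewhere in this codebase. From its call sites it
# is assumed to trim each token-id sequence to `max_seq_len`, record the
# trimmed lengths, and zero-pad, mirroring the inline helpers in the other
# loader variants. A minimal sketch under that assumption:
def to_nn_input(tid_list, max_seq_len):
    # Trim overlong sequences, then record lengths before padding so the
    # lengths match the padded arrays.
    trimmed = [_seq[:max_seq_len] for _seq in tid_list]
    seq_len = np.asarray([len(_seq) for _seq in trimmed])
    padded = np.asarray([_seq + [0] * (max_seq_len - len(_seq)) for _seq in trimmed])
    return padded, seq_len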
def load_dataset(mode, vocab_id_mapping, max_seq_len, sampling=False, with_label=True, label_version=None):
    dataset = dict()
    for i in [0, 1, 2]:
        tid_list = tokenized_to_tid_list(
            load_tokenized_list(data_config.path(mode, TURN, '{}.ek'.format(i))),
            vocab_id_mapping
        )
        dataset[TID_[i]] = tid_list

    if with_label:
        label_path = data_config.path(mode, LABEL, label_version)
        label_list = load_label_list(label_path)
        dataset[LABEL_GOLD] = np.asarray(label_list)

    if sampling:
        dataset = custom_sampling(dataset)

    for i in [0, 1, 2]:
        dataset[TID_[i]], dataset[SEQ_LEN_[i]] = to_nn_input(dataset[TID_[i]], max_seq_len=max_seq_len)

    if with_label:
        output_dim = max(dataset[LABEL_GOLD]) + 1
        return dataset, output_dim
    else:
        return dataset
def load_dataset(vocab_id_mapping, max_seq_len, with_label=True, label_version=None):
    datasets = dict()
    for mode in [TRAIN, TEST]:
        datasets[mode] = dict()
        for i in [0, 1, 2]:
            tid_list = tokenized_to_tid_list(
                load_tokenized_list(data_config.path(mode, TURN, '{}.ek'.format(i))),
                vocab_id_mapping)
            datasets[mode][TID_[i]] = tid_list
        if with_label:
            label_path = data_config.path(mode, LABEL, label_version)
            label_list = load_label_list(label_path)
            datasets[mode][LABEL_GOLD] = np.asarray(label_list)

    # Only the training split is re-sampled.
    datasets[TRAIN] = custom_sampling(datasets[TRAIN])

    for mode in [TRAIN, TEST]:
        for i in [0, 1, 2]:
            datasets[mode][TID_[i]], datasets[mode][SEQ_LEN_[i]] = to_nn_input(
                datasets[mode][TID_[i]], max_seq_len=max_seq_len)

    if with_label:
        output_dim = max(datasets[TRAIN][LABEL_GOLD]) + 1
        return datasets, output_dim
    else:
        return datasets
def load_dataset(data_config, analyzer, vocab_id_mapping, seq_len, with_label=True, label_version=None, text_version=None):
    def seq_to_len_list(seq_list):
        return list(map(len, seq_list))

    def zero_pad_seq_list(seq_list, seq_len):
        return list(map(lambda _seq: _seq + [0] * (seq_len - len(_seq)), seq_list))

    datasets = dict()
    for mode in [TRAIN, TEST]:
        if analyzer == WORD:
            # Word level: the text file is already tokenized.
            text_path = data_config.path(mode, TEXT, text_version)
            tokenized_list = load_tokenized_list(text_path)
        elif analyzer == CHAR:
            # Char level: split each text into a list of characters.
            text_path = data_config.path(mode, TEXT)
            text_list = load_text_list(text_path)
            tokenized_list = list(map(list, text_list))
        else:
            raise ValueError('invalid analyzer, got {}'.format(analyzer))

        tid_list = tokenized_to_tid_list(tokenized_list, vocab_id_mapping)
        seq_len_list = seq_to_len_list(tid_list)
        datasets[mode] = {
            TOKEN_ID_SEQ: tid_list,
            SEQ_LEN: np.asarray(seq_len_list),
        }
        if with_label:
            label_path = data_config.path(mode, LABEL, label_version)
            label_list = load_label_list(label_path)
            datasets[mode][LABEL_GOLD] = np.asarray(label_list)

    # Sanity-check that the configured seq_len covers the longest sequence.
    max_seq_len = -1
    for _dataset in datasets.values():
        max_seq_len = max(max_seq_len, _dataset[SEQ_LEN].max() + 1)
    if seq_len < max_seq_len:
        raise ValueError('seq_len set as {}, got max seq_len = {}'.format(seq_len, max_seq_len))

    for mode in [TRAIN, TEST]:
        datasets[mode][TOKEN_ID_SEQ] = np.asarray(
            zero_pad_seq_list(datasets[mode][TOKEN_ID_SEQ], seq_len))

    if with_label:
        output_dim = max(datasets[TRAIN][LABEL_GOLD]) + 1
        return datasets, output_dim
    else:
        return datasets
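# Illustrative usage of the analyzer-aware loader (the value seq_len=128 is
# hypothetical; it only needs to be at least the longest sequence):
#
#   datasets, output_dim = load_dataset(
#       data_config, WORD, vocab_id_mapping, seq_len=128)
#   x_train = datasets[TRAIN][TOKEN_ID_SEQ]
#   y_train = datasets[TRAIN][LABEL_GOLD]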
def check_wrong(output_key, w2v_key='ntua_ek'):
    mode = TEST
    path = data_config.output_path(output_key, mode, LABEL_PREDICT)
    pred = load_label_list(path)

    path = data_config.path(mode, LABEL)
    gold = load_label_list(path)

    w2v_model_path = data_config.path(ALL, WORD2VEC, w2v_key)
    vocab_train_path = data_config.path(TRAIN, VOCAB, 'ek')

    # Load the vocabulary; the model uses every word covered by the
    # pretrained embeddings and randomly initialises vectors for words
    # with a sufficient occurrence count (tf >= 2).
    vocab_meta_list = load_vocab_list(vocab_train_path)
    vocabs = [_meta['t'] for _meta in vocab_meta_list if _meta['tf'] >= 2]

    # Load the word vectors and the token-id mapping.
    lookup_table, vocab_id_mapping, embedding_dim = load_lookup_table(
        w2v_model_path=w2v_model_path, vocabs=vocabs)

    tokens_0 = load_tokenized_list(data_config.path(mode, TURN, '0.ek'))
    tokens_1 = load_tokenized_list(data_config.path(mode, TURN, '1.ek'))
    tokens_2 = load_tokenized_list(data_config.path(mode, TURN, '2.ek'))
    tid_list_0 = tokenized_to_tid_list(tokens_0, vocab_id_mapping)
    tid_list_1 = tokenized_to_tid_list(tokens_1, vocab_id_mapping)
    tid_list_2 = tokenized_to_tid_list(tokens_2, vocab_id_mapping)

    max_seq_len = 0
    for p, g, tid_0, tid_1, tid_2, tk_0, tk_1, tk_2 in zip(
            pred, gold, tid_list_0, tid_list_1, tid_list_2, tokens_0, tokens_1, tokens_2):
        # Print misclassified samples that contain a long turn.
        if p != g and (len(tid_0) > 30 or len(tid_1) > 30 or len(tid_2) > 30):
            print('pred: {}, gold: {}'.format(p, g))
            print('turn0: {}'.format(' '.join(tk_0)))
            print('turn1: {}'.format(' '.join(tk_1)))
            print('turn2: {}'.format(' '.join(tk_2)))
        if p != g:
            max_seq_len = max(max_seq_len, len(tid_0), len(tid_1), len(tid_2))
    print(max_seq_len)
def load_dataset(mode, vocab_id_mapping, max_seq_len, sampling=False, label_map=None, with_label=True, label_version=None):
    modes = mode if isinstance(mode, list) else [mode, ]

    dataset = dict()
    for i in [0, 1, 2]:
        tid_list = list()
        for mode in modes:
            tid_list += tokenized_to_tid_list(
                load_tokenized_list(data_config.path(mode, TURN, '{}.ek'.format(i))),
                vocab_id_mapping)
        dataset[TID_[i]] = tid_list

    if with_label:
        label_list = list()
        for mode in modes:
            label_path = data_config.path(mode, LABEL, label_version)
            label_list += load_label_list(label_path)

        if label_map is not None:
            # Keep only the samples whose gold label appears in label_map,
            # and remap their labels.
            new_tid_list_ = [list() for _ in range(3)]
            new_label_list = list()
            for idx, label in enumerate(label_list):
                if label in label_map:
                    for i in range(3):
                        new_tid_list_[i].append(dataset[TID_[i]][idx])
                    new_label_list.append(label_map[label])
            for i in range(3):
                dataset[TID_[i]] = new_tid_list_[i]
            label_list = new_label_list

        dataset[LABEL_GOLD] = label_list

    if sampling:
        dataset = custom_sampling(dataset)

    for i in [0, 1, 2]:
        dataset[TID_[i]], dataset[SEQ_LEN_[i]] = to_nn_input(
            dataset[TID_[i]], max_seq_len=max_seq_len)

    if with_label:
        dataset[LABEL_GOLD] = np.asarray(dataset[LABEL_GOLD])
        output_dim = max(dataset[LABEL_GOLD]) + 1
        return dataset, output_dim
    else:
        return dataset
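# Illustrative `label_map` usage (the label ids and max_seq_len here are
# hypothetical; the actual ids depend on `label_version`): keep only samples
# labeled 1, 2 or 3 and remap them to a contiguous 0..2 range, e.g. to train
# a sub-classifier that excludes the majority class.
#
#   dataset, output_dim = load_dataset(
#       TRAIN, vocab_id_mapping, max_seq_len=30,
#       label_map={1: 0, 2: 1, 3: 2})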
def load_dataset(mode, vocab_id_mapping, max_seq_len, sampling=False, with_label=True, label_version=None, filter_others=False):
    modes = mode if isinstance(mode, list) else [mode, ]

    dataset = dict()
    for i in [0, 1, 2]:
        tid_list = list()
        for mode in modes:
            tid_list += tokenized_to_tid_list(
                load_tokenized_list(data_config.path(mode, TURN, '{}.ek'.format(i))),
                vocab_id_mapping)
        dataset[TID_[i]] = tid_list

    if with_label:
        label_list = list()
        for mode in modes:
            label_path = data_config.path(mode, LABEL, label_version)
            label_list += load_label_list(label_path)
        dataset[LABEL_GOLD] = label_list

        if filter_others:
            select_index = build_select_index(dataset[LABEL_GOLD])
            for k, v in dataset.items():
                dataset[k] = filter_by_index(v, select_index)

    if sampling:
        dataset = custom_sampling(dataset)

    for i in [0, 1, 2]:
        dataset[TID_[i]], dataset[SEQ_LEN_[i]] = to_nn_input(
            dataset[TID_[i]], max_seq_len=max_seq_len)

    if with_label:
        dataset[LABEL_GOLD] = np.asarray(dataset[LABEL_GOLD])
        output_dim = max(dataset[LABEL_GOLD]) + 1
        return dataset, output_dim
    else:
        return dataset
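# `build_select_index` and `filter_by_index` are assumed helpers for the
# `filter_others` branch: the first collects the indices of samples whose
# gold label is not the "others" class, the second keeps only those rows.
# A minimal sketch, assuming "others" is label id 0:
def build_select_index(label_list, others_label=0):
    # Indices of the samples to keep.
    return [_i for _i, _label in enumerate(label_list) if _label != others_label]


def filter_by_index(values, select_index):
    # Keep only the entries at the selected indices.
    return [values[_i] for _i in select_index]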
def load_dataset(data_config, vocab_id_mapping, seq_len, with_label=True, label_version=None, text_version=None):
    def seq_to_len_list(seq_list):
        return list(map(len, seq_list))

    def zero_pad_seq_list(seq_list, seq_len):
        return list(map(lambda _seq: _seq + [0] * (seq_len - len(_seq)), seq_list))

    def trim_tid_list(tid_list, max_len):
        return list(map(lambda _seq: _seq[:max_len], tid_list))

    datasets = dict()
    for mode in [TRAIN, TEST]:
        tid_list_0 = tokenized_to_tid_list(
            load_tokenized_list(data_config.path(mode, TURN, '0.ek')), vocab_id_mapping)
        tid_list_0 = trim_tid_list(tid_list_0, MAX_SEQ_LEN)
        seq_len_0 = seq_to_len_list(tid_list_0)

        tid_list_1 = tokenized_to_tid_list(
            load_tokenized_list(data_config.path(mode, TURN, '1.ek')), vocab_id_mapping)
        tid_list_1 = trim_tid_list(tid_list_1, MAX_SEQ_LEN)
        seq_len_1 = seq_to_len_list(tid_list_1)

        tid_list_2 = tokenized_to_tid_list(
            load_tokenized_list(data_config.path(mode, TURN, '2.ek')), vocab_id_mapping)
        tid_list_2 = trim_tid_list(tid_list_2, MAX_SEQ_LEN)
        seq_len_2 = seq_to_len_list(tid_list_2)

        datasets[mode] = {
            TID_0: tid_list_0,
            TID_1: tid_list_1,
            TID_2: tid_list_2,
            SEQ_LEN_0: np.asarray(seq_len_0),
            SEQ_LEN_1: np.asarray(seq_len_1),
            SEQ_LEN_2: np.asarray(seq_len_2),
        }
        if with_label:
            label_path = data_config.path(mode, LABEL, label_version)
            label_list = load_label_list(label_path)
            datasets[mode][LABEL_GOLD] = np.asarray(label_list)

    max_seq_len = -1
    for mode, _dataset in datasets.items():
        print(mode, _dataset[SEQ_LEN_0].max(), _dataset[SEQ_LEN_1].max(), _dataset[SEQ_LEN_2].max())
        max_seq_len = max(
            max_seq_len,
            _dataset[SEQ_LEN_0].max(), _dataset[SEQ_LEN_1].max(), _dataset[SEQ_LEN_2].max())
    if seq_len < max_seq_len:
        raise ValueError('seq_len set as {}, got max seq_len = {}'.format(seq_len, max_seq_len))

    for mode in [TRAIN, TEST]:
        for key in [TID_0, TID_1, TID_2]:
            datasets[mode][key] = np.asarray(
                zero_pad_seq_list(datasets[mode][key], seq_len))

    if with_label:
        output_dim = max(datasets[TRAIN][LABEL_GOLD]) + 1
        return datasets, output_dim
    else:
        return datasets