def read_datasets(): print("[Start Dataset Reading.]") data = {} # load word2vec model fasttext = load_w2v(word2vec_path) #fasttext = load_fasttext(fasttext_path) print("[Load Word2Vec model.]") # load normalized word embeddings norm_embedding = tool.norm_matrix(fasttext.wv.syn0) data['embedding'] = norm_embedding print("[Load normalized word embedding.]") # preprocess seen and unseen labels ex_dict, ex_vec = process_label(ex_intent, fasttext) em_dict, em_vec = process_label(em_intent, fasttext) print("[Preprocess labels.]") # trans data into embedding vectors max_len = 0 x_ex, y_ex, ex_len, max_len = load_vec(training_data_path, fasttext, ex_dict, max_len) x_em, y_em, em_len, max_len = load_vec(test_data_path, fasttext, em_dict, max_len) label_ex, label_ex_len = load_vec_label(ex_intent, fasttext, max_len) label_em, label_em_len = load_vec_label(em_intent, fasttext, max_len) # existing intent #data['ex_intent']=ex_intent #data['em_intent']=em_intent data['label_ex'] = label_ex data['label_ex_len'] = label_ex_len data['label_em'] = label_em data['label_em_len'] = label_em_len data['x_ex'] = x_ex data['y_ex'] = y_ex data['ex_len'] = ex_len data['ex_vec'] = ex_vec data['ex_dict'] = ex_dict # emerging intent data['x_em'] = x_em data['y_em'] = y_em data['em_len'] = em_len data['em_vec'] = em_vec data['em_dict'] = em_dict data['max_len'] = max_len data['ex_label'] = get_label(data) # [0.0, 0.0, ..., 1.0, ..., 0.0] print("[Complete Dataset Reading.]") return data
def read_datasets(from_data_dir, to_data_dir, args):
    # Read the datasets and build the train/valid sets plus the updated args for training.
    print("Splitting raw data and saving into txt files...")
    if args.mode == 'seen_class':
        train_data_path, valid_data_path = split_seen_class_data(from_data_dir, to_data_dir, args)
    else:
        train_data_path, valid_data_path = split_zero_shot_data(from_data_dir, to_data_dir, args)

    # Setting configurations.
    tokenizer = None
    w2v = None
    bert_config = None
    if args.model_type == 'bert_capsnet' or args.model_type == 'basic_capsnet':
        print("Loading BertTokenizer...")
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        if args.model_type == 'bert_capsnet':
            bert_config = BertConfig.from_pretrained("bert-base-uncased")
            args.max_len = min(args.max_len, bert_config.max_position_embeddings)
    else:
        w2v_path = f'{args.data_dir}/GoogleNews-vectors-negative300.bin'
        assert os.path.isfile(w2v_path), (
            f"There is no w2v file. Please download the w2v file from "
            f"https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit, "
            f"extract it and put it in {args.data_dir}."
        )
        w2v = load_w2v(w2v_path)

    # Keep the index of padding.
    if tokenizer is None:
        args.pad_id = 0
    else:
        args.pad_id = tokenizer.get_vocab()['[PAD]']

    # Preprocess train/test data.
    print("Preprocessing train/test data...")
    train_set = CustomDataset(train_data_path, tokenizer, w2v, args.max_len, args.pad_id)
    valid_set = CustomDataset(valid_data_path, tokenizer, w2v, args.max_len, args.pad_id)

    # Depending on the model type, vocab_size, word_emb_size, and the embedding matrix differ.
    if args.model_type == 'bert_capsnet':
        args.vocab_size = len(tokenizer.get_vocab())
        args.word_emb_size = bert_config.hidden_size
        args.embedding = None
    elif args.model_type == 'basic_capsnet':
        args.vocab_size = len(tokenizer.get_vocab())
        args.word_emb_size = 300
        args.embedding = None
    elif args.model_type == 'w2v_capsnet':
        w2v_shape = w2v.wv.vectors.shape
        args.vocab_size = w2v_shape[0]
        args.word_emb_size = w2v_shape[1]
        # Use the same accessor as above; `.syn0` is the deprecated gensim alias for `.wv.vectors`.
        args.embedding = tool.norm_matrix(w2v.wv.vectors)

    return train_set, valid_set, args
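# Hypothetical call sketch (illustration only, not from the original repo): the
# Namespace fields below are the ones this function reads directly; the values
# and directory names are placeholder assumptions, and the split_* helpers may
# expect additional fields of their own.
def _example_read_datasets_call():
    from argparse import Namespace
    args = Namespace(
        mode='zero_shot',           # anything other than 'seen_class' takes the zero-shot split
        model_type='bert_capsnet',  # 'bert_capsnet' | 'basic_capsnet' | 'w2v_capsnet'
        data_dir='data',            # only used to locate the w2v file for 'w2v_capsnet'
        max_len=128,                # capped at BERT's max_position_embeddings for 'bert_capsnet'
    )
    train_set, valid_set, args = read_datasets('data/raw', 'data/processed', args)
    return train_set, valid_set, args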
def read_datasets(): print("------------------read datasets begin-------------------") data = {} # load word2vec model print("------------------load word2vec begin-------------------") w2v = load_w2v(word2vec_path) print("------------------load word2vec end---------------------") # load normalized word embeddings embedding = w2v.syn0 data['embedding'] = embedding norm_embedding = tool.norm_matrix(embedding) data['embedding'] = norm_embedding # pre process seen and unseen labels sc_dict, sc_vec = process_label(seen_intent, w2v) uc_dict, uc_vec = process_label(unseen_intent, w2v) # trans data into embedding vectors max_len = 0 # jiayi # x_tr, y_tr, s_len, max_len = load_vec( # training_data_path, w2v, sc_dict, max_len) # x_te, y_te, u_len, max_len = load_vec( # test_data_path, w2v, uc_dict, max_len) x_tr, y_tr, s_len, max_len = load_vec(training_data_path, w2v, sc_dict, max_len) x_te, y_te, u_len, max_len = load_vec(test_data_path, w2v, uc_dict, max_len) data['x_tr'] = x_tr data['y_tr'] = y_tr data['s_len'] = s_len data['sc_vec'] = sc_vec data['sc_dict'] = sc_dict data['x_te'] = x_te data['y_te'] = y_te data['u_len'] = u_len data['uc_vec'] = uc_vec data['uc_dict'] = uc_dict data['max_len'] = max_len ind = get_label(data) data['s_label'] = ind # [0.0, 0.0, ..., 1.0, ..., 0.0] print("------------------read datasets end---------------------") return data
def read_datasets(data_setting):
    print("------------------read datasets begin-------------------")
    data = dict()
    data['dataset'] = data_setting['dataset']

    # load data
    data_path = data_setting['data_prefix'] + data_setting['dataset_name']
    x_text, y_text, y, class_dict = load_data(data_path)

    # tokenize
    data['text_represent'] = data_setting['text_represent']
    data['key_pretrained'] = data_setting['key_pretrained']
    word2vec_path = data_setting['data_prefix'] + data_setting['wordvec_name']
    w2v = load_w2v(word2vec_path)
    if data['text_represent'] == 'w2v':
        x_pad, _, _ = tokenize_w2v(x_text, w2v)
        y_pad, data['n_vocab'], data['d_emb'] = tokenize_w2v(y_text, w2v)
        data['embedding'] = torch.from_numpy(tool.norm_matrix(w2v.syn0))
    elif data['text_represent'] == 'transformers':
        x_pad, data['n_vocab'], data['d_emb'] = tokenize_transformers(
            x_text, data['key_pretrained'])  # TODO: n_vocab not updated by y
        y_pad, _, _ = tokenize_transformers(y_text, data['key_pretrained'])

    # split dataset
    print('split dataset')
    if data_setting['freeze_class']:
        label_order = data_setting['label_order']
        unseen_class = data_setting['unseen_class']
        seen_class = [x for x in label_order if x not in unseen_class]
    else:
        if data_setting['dataset'] == 'SMP18':
            del class_dict['聊天']  # drop the '聊天' (chit-chat) class
        class_freq_dict = {
            k: (y == class_dict[k]).nonzero().shape[0]
            for k in class_dict.keys()
        }
        class_freq_dict = {k: v for k, v in class_freq_dict.items() if v > 2}
        label_order = list(class_freq_dict.keys())
        label_freq_np = np.array([class_freq_dict[l] for l in label_order])
        label_freq_np = label_freq_np / sum(label_freq_np)
        n_c_tr = math.ceil(len(label_order) * data_setting['seen_class_prob'])
        seen_class = list(
            np.random.choice(label_order, size=n_c_tr, replace=False, p=label_freq_np))
        unseen_class = [x for x in label_order if x not in seen_class]
    print("unseen_class:\n", unseen_class)

    # update class_dict and y (first seen and then unseen classes in class_dict)
    class_dict = dict()
    for c in seen_class:
        class_dict[c] = len(class_dict)
    for c in unseen_class:
        class_dict[c] = len(class_dict)
    y = [
        class_dict[label] if label in class_dict.keys() else -1
        for label in y_text
    ]
    y = torch.tensor(y)

    # get split index
    if data_setting['freeze_class']:
        data['id_split'] = data_setting['id_split']
        matlab_data = sio.loadmat(data_setting['data_prefix'] + data_setting['sim_name_withS'])
        idx_tr = (matlab_data['train_ind'][0, data_setting['id_split']] - 1).tolist()[0]
        idx_te = (matlab_data['test_ind'][0, data_setting['id_split']] - 1).tolist()[0]
        idx_tr = torch.LongTensor(idx_tr)
        idx_te = torch.LongTensor(idx_te)
        data['sim'] = matlab_data['similarity'][0, data_setting['id_split']]
        data['sim'] = torch.from_numpy(data['sim'])
    else:
        idx_tr = torch.tensor([]).long()
        idx_te = torch.tensor([]).long()
        for c in unseen_class:
            idx_c = (y == class_dict[c]).nonzero().squeeze(-1)
            if data_setting['test_mode'] == 'standard':
                idx_te = torch.cat([idx_te, idx_c], dim=0)
            else:
                # idx_te = torch.cat([idx_te, idx_c], dim=0)
                n_unseen_in_test = int(idx_c.shape[0] * data_setting['sample_in_test_prob'])
                idx_te = torch.cat([idx_te, idx_c[:n_unseen_in_test]], dim=0)
        for c in seen_class:
            idx_c = (y == class_dict[c]).nonzero().squeeze(-1)
            if data_setting['test_mode'] == 'standard':
                idx_tr = torch.cat([idx_tr, idx_c], dim=0)
            elif data_setting['test_mode'] == 'general':
                idx_perm = torch.randperm(idx_c.shape[0])
                idx_c = idx_c[idx_perm]
                n_seen_in_test = int(idx_c.shape[0] * data_setting['sample_in_test_prob'])
                idx_tr = torch.cat([idx_tr, idx_c[n_seen_in_test:]])
                idx_te = torch.cat([idx_te, idx_c[:n_seen_in_test]])

    # shuffle data
    idx_tr = idx_tr[torch.randperm(idx_tr.shape[0])]
    idx_te = idx_te[torch.randperm(idx_te.shape[0])]

    # get padded class representations
    if data['text_represent'] == 'w2v':
        # class_pad, data['n_vocab'], data['d_emb'] = tokenize_w2v(seen_class + unseen_class, w2v)
        class_pad, data['n_vocab'], data['d_emb'] = tokenize_w2v(seen_class, w2v)
    else:
        class_pad, _, _ = tokenize_transformers(seen_class + unseen_class,
                                                data['key_pretrained'])

    # preprocess seen and unseen labels
    # if data_setting['text_represent'] == 'w2v':
    class_id_startpoint = 0
    sc_dict, sc_vec = process_label(seen_class, w2v, class_id_startpoint)
    if data_setting['test_mode'] == 'general':
        uc_dict, uc_vec = process_label(unseen_class, w2v,
                                        class_id_startpoint + len(sc_dict))
        uc_dict = dict(sc_dict, **uc_dict)
        uc_vec = np.concatenate([sc_vec, uc_vec], axis=0)
    else:
        uc_dict, uc_vec = process_label(unseen_class, w2v, class_id_startpoint)
    data['sc_dict'] = sc_dict
    data['sc_vec'] = sc_vec
    data['uc_dict'] = uc_dict
    data['uc_vec'] = uc_vec

    # finalize data package
    data['n_tr'] = idx_tr.shape[0]
    data['x_tr'] = x_pad[idx_tr]
    data['y_tr'] = y[idx_tr]
    data['y_ind'] = torch.zeros(data['n_tr'], len(seen_class)).scatter_(
        1, data['y_tr'].unsqueeze(1), 1)
    data['n_te'] = idx_te.shape[0]
    data['x_te'] = x_pad[idx_te]
    data['y_te'] = y[idx_te]
    data['seen_class'] = seen_class
    data['unseen_class'] = unseen_class
    data['class_padded'] = class_pad

    print("------------------read datasets end---------------------")
    return data
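# Hypothetical configuration sketch (illustration only, not from the original
# repo): the keys below are the ones this function reads from data_setting when
# freeze_class is False; file names, the pretrained key, and the probabilities
# are placeholder assumptions. With freeze_class=True, label_order, unseen_class,
# id_split and sim_name_withS are also required.
def _example_data_setting():
    data_setting = {
        'dataset': 'SMP18',
        'data_prefix': 'data/',
        'dataset_name': 'smp18_data',           # placeholder file name
        'wordvec_name': 'wordvec.bin',          # placeholder file name
        'text_represent': 'w2v',                # or 'transformers'
        'key_pretrained': 'bert-base-chinese',  # used by tokenize_transformers when text_represent == 'transformers'
        'freeze_class': False,
        'seen_class_prob': 0.7,                 # fraction of classes sampled as seen classes
        'test_mode': 'standard',                # or 'general' (moves part of each seen class into the test set)
        'sample_in_test_prob': 0.3,             # only used when test_mode != 'standard'
    }
    return read_datasets(data_setting)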