# Build and cache the front/back/middle tag-value datasets for every split.
ver = "sep_three_three_sib_little_batch_size_middle"
path = f"processed_data_{ver}"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Make sure the dataset cache directory and the checkpoint directory exist.
for required_dir in (path, f'ckpt_{ver}'):
    if not os.path.isdir(required_dir):
        os.mkdir(required_dir)

tokenizer = None
for mode in ['dev', 'train', 'test']:
    # The middle-dataset pickle serves as the "already processed" marker for
    # the whole split; when it exists nothing is rebuilt.
    # NOTE(review): no branch that loads the cached pickles is visible in
    # this fragment -- confirm it exists elsewhere.
    if os.path.isfile(f'{path}/m_{mode}.pkl'):
        continue

    # Every split except 'test' is preprocessed in training mode.
    is_train = mode != 'test'
    dir_path = f'release/{mode}/ca_data'
    pre = Preprocess(dir_path, max_length=180, train=is_train)
    (front_dataset, back_dataset,
     dataset_not_for_train, middle_dataset) = pre.process()

    # Wrap the three trainable parts, keyed by their one-letter prefix.
    for part_key, part_dataset in (('f', front_dataset),
                                   ('b', back_dataset),
                                   ('m', middle_dataset)):
        data[part_key, mode] = TagValueDataset(part_dataset,
                                               tokenizer=pre.tokenizer,
                                               tags_num=pre.tags_num,
                                               train=is_train)
    # The remaining samples are stored as returned by the preprocessor,
    # without a TagValueDataset wrapper.
    data_not_train[mode] = dataset_not_for_train

    # Persist everything so later runs can skip preprocessing.
    cache_targets = (
        (data['f', mode], f'{path}/f_{mode}.pkl'),
        (data['b', mode], f'{path}/b_{mode}.pkl'),
        (data['m', mode], f'{path}/m_{mode}.pkl'),
        (data_not_train[mode], f'{path}/{mode}_not_train.pkl'),
    )
    for cached_obj, cache_file in cache_targets:
        torch.save(cached_obj, cache_file)
# Build (or load from cache) the tag-value datasets for two data versions.
#
# For each cache directory and each split, the split is preprocessed and
# pickled when `<dir>/<mode>.pkl` is missing, otherwise the cached pickles are
# loaded. Results are stored in `data`, `data_not_train` and `tags_num` under
# the key `(directory, mode)`.
path = f"processed_data_{ver}"
path2 = f"processed_data_{ver2}"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Both cache directories are written to below, so both must exist before
# torch.save is called. The checkpoint directory is only created for `ver`.
for required_dir in (path, path2, f'ckpt_{ver}'):
    os.makedirs(required_dir, exist_ok=True)

tokenizer = None
for p in [path, path2]:
    for mode in ['dev', 'train', 'test']:
        if not os.path.isfile(f'{p}/{mode}.pkl'):
            # NOTE(review): both directories are built with the same
            # preprocessor and settings -- confirm that is intended for
            # `path2`.
            dir_path = f'release/{mode}/ca_data'
            pre3 = p3(dir_path, max_length=150, train=not (mode == 'test'))
            # Use the preprocessor that was just constructed; the original
            # called `pre.process()`, which is a different (or undefined)
            # object.
            dataset, dataset_not_for_train = pre3.process()
            data[p, mode] = TagValueDataset(dataset,
                                            tokenizer=pre3.tokenizer,
                                            tags_num=pre3.tags_num,
                                            train=not (mode == 'test'))
            # Samples excluded from training are kept as returned by the
            # preprocessor, without a TagValueDataset wrapper.
            data_not_train[p, mode] = dataset_not_for_train
            torch.save(data[p, mode], f'{p}/{mode}.pkl')
            # Save under the same (p, mode) key the entry was stored with.
            torch.save(data_not_train[p, mode], f'{p}/{mode}_not_train.pkl')
        else:
            print(f"Load {mode}......")
            # NOTE: torch.load unpickles arbitrary objects; only point this
            # at cache files produced by this script.
            data[p, mode] = torch.load(f'{p}/{mode}.pkl')
            data_not_train[p, mode] = torch.load(f'{p}/{mode}_not_train.pkl')

        # Both branches populate data[p, mode]; the tokenizer of the most
        # recently handled dataset is kept at module level.
        tokenizer = data[p, mode].tokenizer
        if mode != 'test':
            tags_num[p, mode] = data[p, mode].tags_num