Esempio n. 1
0
# Build and cache the per-mode datasets for this experiment version.
# Relies on names defined outside this snippet: torch, os, Preprocess,
# TagValueDataset, and the dicts `data` / `data_not_train`.
ver = "sep_three_three_sib_little_batch_size_middle"
path = f"processed_data_{ver}"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# exist_ok=True replaces the isdir() + mkdir() pair, which could fail if the
# directory appeared between the check and the creation.
os.makedirs(path, exist_ok=True)
os.makedirs(f'ckpt_{ver}', exist_ok=True)
tokenizer = None
for mode in ['dev', 'train', 'test']:
    # The 'm' file alone acts as the "already processed" marker for a mode.
    if not os.path.isfile(f'{path}/m_{mode}.pkl'):
        dir_path = f'release/{mode}/ca_data'
        # Only the 'test' mode is processed with train=False.
        is_train = mode != 'test'
        pre = Preprocess(dir_path, max_length=180, train=is_train)
        (front_dataset, back_dataset, dataset_not_for_train,
         middle_dataset) = pre.process()
        # Key prefix -> raw split; the same prefix names the cache file.
        splits = (
            ('f', front_dataset),
            ('b', back_dataset),
            ('m', middle_dataset),
        )
        for part, split in splits:
            data[part, mode] = TagValueDataset(split,
                                               tokenizer=pre.tokenizer,
                                               tags_num=pre.tags_num,
                                               train=is_train)
        # Stored as returned by pre.process(), not wrapped in TagValueDataset.
        data_not_train[mode] = dataset_not_for_train
        # Save order is f, b, m, so the 'm' marker file is written after the
        # other two dataset files.
        for part, _ in splits:
            torch.save(data[part, mode], f'{path}/{part}_{mode}.pkl')
        torch.save(data_not_train[mode], f'{path}/{mode}_not_train.pkl')
Esempio n. 2
0
# Build (or load from cache) the datasets for two processed-data directories.
# Relies on names defined outside this snippet: torch, os, ver, ver2, p3,
# TagValueDataset, and the dicts `data`, `data_not_train`, `tags_num`.
path = f"processed_data_{ver}"
path2 = f"processed_data_{ver2}"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# exist_ok=True replaces the isdir() + mkdir() check-then-create pair.
os.makedirs(f'ckpt_{ver}', exist_ok=True)
tokenizer = None
for p in [path, path2]:
    # Create every cache directory that is written to below; previously only
    # `path` was created, so saving into a missing `path2` would fail.
    os.makedirs(p, exist_ok=True)
    for mode in ['dev', 'train', 'test']:
        if not os.path.isfile(f'{p}/{mode}.pkl'):
            dir_path = f'release/{mode}/ca_data'
            # Only the 'test' mode is processed with train=False.
            is_train = mode != 'test'
            pre3 = p3(dir_path, max_length=150, train=is_train)
            # Use the preprocessor built just above (was `pre.process()`,
            # which referenced a different / undefined object).
            dataset, dataset_not_for_train = pre3.process()
            data[p, mode] = TagValueDataset(dataset,
                                            tokenizer=pre3.tokenizer,
                                            tags_num=pre3.tags_num,
                                            train=is_train)
            data_not_train[p, mode] = dataset_not_for_train
            torch.save(data[p, mode], f'{p}/{mode}.pkl')
            # Keyed by (p, mode) to match the assignment above and the load
            # branch below (was `data_not_train[mode]`, a missing key).
            torch.save(data_not_train[p, mode], f'{p}/{mode}_not_train.pkl')
        else:
            print(f"Load {mode}......")
            # NOTE(review): torch.load unpickles whole Python objects here, so
            # only load cache files produced by this script. Recent PyTorch
            # releases default to weights_only=True, which may reject these
            # pickled dataset objects -- confirm against the torch version
            # in use.
            data[p, mode] = torch.load(f'{p}/{mode}.pkl')
            data_not_train[p, mode] = torch.load(f'{p}/{mode}_not_train.pkl')

        # Overwritten on every iteration; ends up as the tokenizer of the
        # last dataset processed.
        tokenizer = data[p, mode].tokenizer
        if mode != 'test':
            tags_num[p, mode] = data[p, mode].tags_num