def build_train_dataLoader(split, last_iters):
    """Construct a shuffled DataLoader over one training split.

    Args:
        split: index of the training shard to load (forwarded to
            ``build_openkp_dataset``).
        last_iters: iteration counter forwarded to dataset construction;
            the dataset may advance it, so the updated value is returned.

    Returns:
        Tuple ``(last_iters, loader)`` — the iteration counter read back
        from the freshly built dataset, and a ``DataLoader`` yielding
        randomly sampled, pinned-memory batches.
    """
    print(
        " ==================== start loading datasets for training ===================="
    )
    # NOTE(review): `args`, `tokenizer`, `tag_to_ix`, `batch_size` and
    # `collate_wrapper` come from the enclosing scope of this file.
    dataset = build_openkp_dataset(args, 'train', tokenizer, tag_to_ix,
                                   split, last_iters)
    print("********************** Train set Len *******************{}".format(
        len(dataset)))
    sampler = torch.utils.data.sampler.RandomSampler(dataset)
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        num_workers=0,
        collate_fn=collate_wrapper,
        pin_memory=True,
    )
    return dataset.last_iters, loader
def build_test_dataLoader(last_iters):
    """Construct a DataLoader over the validation split for evaluation.

    Fix: the original used ``RandomSampler`` here, shuffling evaluation
    data. Evaluation should iterate in a stable, sequential order (the dev
    loader elsewhere in this file uses ``SequentialSampler`` with
    ``shuffle=False``), so this now uses ``SequentialSampler``.

    Args:
        last_iters: iteration counter forwarded to dataset construction.

    Returns:
        A ``DataLoader`` yielding validation batches in dataset order with
        pinned memory.
    """
    print(
        "==================== start loading datasets for testing ===================="
    )
    # NOTE(review): `args`, `tokenizer`, `tag_to_ix`, `batch_size` and
    # `collate_wrapper` come from the enclosing scope of this file.
    test_dataset = build_openkp_dataset(args, 'valid', tokenizer, tag_to_ix,
                                        0, last_iters)
    print("********************** Test set Len *******************{}".format(
        len(test_dataset)))
    # Deterministic order for evaluation, consistent with the dev loader.
    test_sampler = torch.utils.data.sampler.SequentialSampler(test_dataset)
    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=batch_size,
        sampler=test_sampler,
        num_workers=0,
        collate_fn=collate_wrapper,
        pin_memory=True,
    )
    return test_data_loader
] """ dataset_dict = utils.read_openkp_examples(args, tokenizer) # 5.1构建训练数据 # train dataloader """ `args.per_gpu_train_batch_size`: Batch size per GPU/CPU for training. `train_dataset` : index, src_tensor, valid_mask, label_tensor (for `train` or `dev`) index, src_tensor, valid_mask, valid_orig_doc_len (for `test`) """ args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) # 转为id,转为tensor train_dataset = utils.build_openkp_dataset(args, dataset_dict['train'], tokenizer, converter) """ set local_rank=0 for distributed training on multiple gpus. """ train_sampler = torch.utils.data.sampler.RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) """ `torch.utils.data.DataLoader`: Data loader. Combines a dataset and a sampler, and provides single- or multi-process iterators over the dataset. `args.data_workers`, Number of subprocesses for data loading, default=2 """ train_data_loader = torch.utils.data.DataLoader(
# ------------------------------------------------------------------------------------------- # init tokenizer & Converter tokenizer = BertTokenizer.from_pretrained(args.cache_dir) converter = IdxTag_Converter(Idx2Tag) # ------------------------------------------------------------------------------------------- # build dataloaders logger.info("start loading openkp datasets ...") dataset_dict = utils.read_openkp_examples(args, tokenizer) # ----------------------------------------------------------------------------------------------------------- # Dev dataloader args.eval_batch_size = args.per_gpu_test_batch_size * max(1, args.n_gpu) dev_dataset = utils.build_openkp_dataset(args, dataset_dict['valid'], tokenizer, converter) dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset) dev_data_loader = torch.utils.data.DataLoader( dev_dataset, batch_size=args.eval_batch_size, sampler=dev_sampler, num_workers=args.data_workers, collate_fn=utils.batchify_features_for_test, shuffle=False, pin_memory=args.cuda, ) # ----------------------------------------------------------------------------------------------------------- # Eval dataloader eval_dataset = utils.build_openkp_dataset(args, dataset_dict['eval_public'],