Code example #1
0
def build_train_dataLoader(split, last_iters):
    """Build a shuffled DataLoader over the OpenKP training split.

    Relies on module-level globals: ``args``, ``tokenizer``, ``tag_to_ix``,
    ``batch_size`` and ``collate_wrapper``.

    Args:
        split: which shard/split of the training data to load.
        last_iters: iteration counter forwarded to the dataset builder.

    Returns:
        A ``(last_iters, DataLoader)`` pair — the dataset's updated
        iteration counter and the randomized training loader.
    """
    print(
        " ==================== start loading datasets for training ===================="
    )
    dataset = build_openkp_dataset(args, 'train', tokenizer, tag_to_ix,
                                   split, last_iters)
    print("********************** Train set Len *******************{}".format(
        len(dataset)))
    # Shuffle every epoch via a random sampler; num_workers=0 keeps
    # loading in the main process.
    sampler = torch.utils.data.sampler.RandomSampler(dataset)
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        num_workers=0,
        collate_fn=collate_wrapper,
        pin_memory=True,
    )
    return dataset.last_iters, loader
Code example #2
0
def build_test_dataLoader(last_iters):
    """Build a DataLoader over the OpenKP validation split for evaluation.

    Relies on module-level globals: ``args``, ``tokenizer``, ``tag_to_ix``,
    ``batch_size`` and ``collate_wrapper``.

    Args:
        last_iters: iteration counter forwarded to the dataset builder
            (the split index is fixed to 0 for validation).

    Returns:
        A DataLoader yielding evaluation batches in a fixed order.
    """
    print(
        "==================== start loading datasets for testing ===================="
    )
    test_dataset = build_openkp_dataset(args, 'valid', tokenizer, tag_to_ix, 0,
                                        last_iters)
    print("********************** Test set Len *******************{}".format(
        len(test_dataset)))

    # Fix: use SequentialSampler instead of RandomSampler — shuffling an
    # evaluation set only makes runs non-reproducible, and the dev loader
    # elsewhere in this project already iterates sequentially.
    test_sampler = torch.utils.data.sampler.SequentialSampler(test_dataset)

    test_data_loader = torch.utils.data.DataLoader(test_dataset,
                                                   batch_size=batch_size,
                                                   sampler=test_sampler,
                                                   num_workers=0,
                                                   collate_fn=collate_wrapper,
                                                   pin_memory=True)

    return test_data_loader
Code example #3
0
            ]
    """
    dataset_dict = utils.read_openkp_examples(args, tokenizer)

    # 5.1构建训练数据
    # train dataloader
    """
    `args.per_gpu_train_batch_size`: Batch size per GPU/CPU for training.
    
    `train_dataset` : 
        index, src_tensor, valid_mask, label_tensor        (for `train` or `dev`)
        index, src_tensor, valid_mask, valid_orig_doc_len  (for `test`)
    """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    # 转为id,转为tensor
    train_dataset = utils.build_openkp_dataset(args, dataset_dict['train'],
                                               tokenizer, converter)
    """
    set local_rank=0 for distributed training on multiple gpus.
    """
    train_sampler = torch.utils.data.sampler.RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    """
    `torch.utils.data.DataLoader`:
        Data loader. Combines a dataset and a sampler, and provides
        single- or multi-process iterators over the dataset.
    
    `args.data_workers`, 
        Number of subprocesses for data loading, default=2
    """
    train_data_loader = torch.utils.data.DataLoader(
Code example #4
0
File: test.py  Project: yongbowin/Bert2Tag_annotation
    # -------------------------------------------------------------------------------------------
    # init tokenizer & Converter
    tokenizer = BertTokenizer.from_pretrained(args.cache_dir)
    converter = IdxTag_Converter(Idx2Tag)

    # -------------------------------------------------------------------------------------------
    # build dataloaders
    logger.info("start loading openkp datasets ...")
    dataset_dict = utils.read_openkp_examples(args, tokenizer)

    # -----------------------------------------------------------------------------------------------------------
    # Dev dataloader
    args.eval_batch_size = args.per_gpu_test_batch_size * max(1, args.n_gpu)

    dev_dataset = utils.build_openkp_dataset(args, dataset_dict['valid'],
                                             tokenizer, converter)
    dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)
    dev_data_loader = torch.utils.data.DataLoader(
        dev_dataset,
        batch_size=args.eval_batch_size,
        sampler=dev_sampler,
        num_workers=args.data_workers,
        collate_fn=utils.batchify_features_for_test,
        shuffle=False,
        pin_memory=args.cuda,
    )

    # -----------------------------------------------------------------------------------------------------------
    # Eval dataloader
    eval_dataset = utils.build_openkp_dataset(args,
                                              dataset_dict['eval_public'],