Example #1
    args.device = device
    # logger.info("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    #             args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # -------------------------------------------------------------------------------------------
    # init tokenizer & converter
    # logger.info("start setting tokenizer, dataset and dataloader (local_rank = {})... ".format(args.local_rank))
    tokenizer = tokenizer_class[args.pretrain_model_type].from_pretrained(args.cache_dir)
    
    # -------------------------------------------------------------------------------------------
    # Select dataloader
    batchify_features_for_train, batchify_features_for_test = dataloader.get_class(args.model_class)

    # -------------------------------------------------------------------------------------------
    # build dev dataloader
    dev_dataset = dataloader.build_dataset(args=args, tokenizer=tokenizer, mode='dev')
    args.test_batch_size = args.per_gpu_test_batch_size * max(1, args.n_gpu)
    dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)
    dev_data_loader = torch.utils.data.DataLoader(
        dev_dataset,
        batch_size=args.test_batch_size,
        sampler=dev_sampler,
        num_workers=args.data_workers,
        collate_fn=batchify_features_for_test,
        pin_memory=args.cuda,
    )

    # -------------------------------------------------------------------------------------------
    # build eval dataloader 
    if args.dataset_class == 'kp20k':
        eval_dataset = dataloader.build_dataset(**{'args':args, 'tokenizer':tokenizer, 'mode':'eval'})
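
A minimal usage sketch for the dev loader above (hypothetical: `model` is assumed to already exist and to have been moved to args.device, and batches produced by batchify_features_for_test are assumed to be dicts of tensors; neither is guaranteed by the snippet):

import torch

model.eval()
with torch.no_grad():
    for batch in dev_data_loader:
        # move each tensor in the batch onto the target device
        inputs = {k: v.to(args.device) for k, v in batch.items()
                  if isinstance(v, torch.Tensor)}
        outputs = model(**inputs)
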
Example #2
    logger.info(
        "start setting tokenizer, dataset and dataloader (local_rank = {})... "
        .format(args.local_rank))
    tokenizer = tokenizer_class[args.pretrain_model_type].from_pretrained(
        args.cache_dir)

    # -------------------------------------------------------------------------------------------
    # Select dataloader
    batchify_features_for_train, batchify_features_for_test = dataloader.get_class(
        args.model_class)

    # -------------------------------------------------------------------------------------------
    # build train dataloader
    train_dataset = dataloader.build_dataset(
        args=args, tokenizer=tokenizer, mode="train")
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
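    # NOTE: `DistributedSampler` below requires
    # `from torch.utils.data.distributed import DistributedSampler`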
    train_sampler = (torch.utils.data.sampler.RandomSampler(train_dataset)
                     if args.local_rank == -1 else
                     DistributedSampler(train_dataset))
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.train_batch_size,
        sampler=train_sampler,
        num_workers=args.data_workers,
        collate_fn=batchify_features_for_train,
        pin_memory=args.cuda,
    )
    logger.info("Successfully Preprocess Training Features !")