Example #1
def main(args):
    print(args)

    # Load tokenizer
    if args.tokenizer == 'sentencepiece':
        tokenizer = PretrainedTokenizer(pretrained_model=args.pretrained_model,
                                        vocab_file=args.vocab_file)
    else:
        # Wrap a preset tokenizer class with the project's vocab-aware wrapper
        tokenizer = TOKENIZER_CLASSES[args.tokenizer]()
        tokenizer = Tokenizer(tokenizer=tokenizer, vocab_file=args.vocab_file)

    # Build DataLoader
    train_dataset = create_examples(args, tokenizer, mode='train')
    test_dataset = create_examples(args, tokenizer, mode='test')
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True)
    test_loader = DataLoader(test_dataset,
                             batch_size=args.batch_size,
                             shuffle=True)

    # Build Trainer
    trainer = Trainer(args, train_loader, test_loader, tokenizer)

    # Train & Validate
    for epoch in range(1, args.epochs + 1):
        trainer.train(epoch)
        trainer.validate(epoch)
        trainer.save(epoch, args.output_model_prefix)
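
None of these snippets show how args is constructed. A minimal argparse entry point consistent with the attributes Example #1 reads would look like the sketch below; every flag name and default is inferred from the snippet, not taken from the project's actual CLI.

# Hypothetical entry point; flag names inferred from the attributes
# the snippet above reads (args.tokenizer, args.pretrained_model, ...).
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--tokenizer', default='sentencepiece')
    parser.add_argument('--pretrained_model', default=None)
    parser.add_argument('--vocab_file', required=True)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--output_model_prefix', default='model')
    main(parser.parse_args())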
Example #2
def main(args):
    print(args)

    # Setup CUDA, GPU & distributed training
    if args.distributed:
        # Bind this process to its GPU and join the NCCL process group
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group(backend='nccl')
    # Load pretrained tokenizer
    tokenizer = PretrainedTokenizer(pretrained_model=args.pretrained_sp_model,
                                    vocab_file=args.vocab_file)

    # Build DataLoader
    train_dataset = create_examples(args, tokenizer, mode='train')
    train_sampler = (RandomSampler(train_dataset)
                     if args.local_rank in (-1, 0, 1)
                     else DistributedSampler(train_dataset))
    train_loader = DataLoader(train_dataset,
                              sampler=train_sampler,
                              batch_size=args.batch_size,
                              num_workers=args.n_workers)
    if args.do_eval:
        test_dataset = create_examples(args, tokenizer, mode='test')
        test_sampler = (RandomSampler(test_dataset)
                        if args.local_rank in (-1, 0, 1)
                        else DistributedSampler(test_dataset))
        test_loader = DataLoader(test_dataset,
                                 sampler=test_sampler,
                                 batch_size=args.batch_size,
                                 num_workers=args.n_workers)

    # Build Trainer
    trainer = Trainer(args=args,
                      train_loader=train_loader,
                      test_loader=test_loader if args.do_eval else None,
                      tokenizer=tokenizer)

    # Train
    for epoch in range(1, args.epochs + 1):
        trainer.train(epoch)
        trainer.save(epoch, args.output_model_prefix)
        if args.do_eval:
            trainer.evaluate(epoch)
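
A note on the distributed path above: when DistributedSampler is in use, it must be re-seeded at every epoch, or each process replays the same shuffle order. The sketch below shows the standard PyTorch idiom; the torchrun command in the comment is illustrative, not the project's documented launch line.

# Illustrative launch: torchrun --nproc_per_node=4 train.py --distributed ...
# (script name and flags are assumptions). Inside the epoch loop,
# DistributedSampler must be told the current epoch, otherwise every
# epoch reuses the same shuffled order across processes:
for epoch in range(1, args.epochs + 1):
    if isinstance(train_sampler, DistributedSampler):
        train_sampler.set_epoch(epoch)  # re-seed this process's shard shuffle
    trainer.train(epoch)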
Example #3
def main(args):
    print(args)

    # Load tokenizer
    tokenizer_src = PretrainedTokenizer(
        pretrained_model=args.pretrained_model_src,
        vocab_file=args.vocab_file_src)
    tokenizer_tgt = PretrainedTokenizer(
        pretrained_model=args.pretrained_model_tgt,
        vocab_file=args.vocab_file_tgt)

    # Build DataLoader
    train_dataset = create_examples(args,
                                    tokenizer_src,
                                    tokenizer_tgt,
                                    mode='train')
    test_dataset = create_examples(args,
                                   tokenizer_src,
                                   tokenizer_tgt,
                                   mode='test')
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True)
    test_loader = DataLoader(test_dataset,
                             batch_size=args.batch_size,
                             shuffle=True)

    # Build Trainer
    trainer = Trainer(args, train_loader, test_loader, tokenizer_src,
                      tokenizer_tgt)

    # Train & Validate
    for epoch in range(1, args.epochs + 1):
        trainer.train(epoch)
        trainer.validate(epoch)
        trainer.save(epoch, args.output_model_prefix)
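
Every example checkpoints through trainer.save(epoch, args.output_model_prefix). A minimal sketch of what such a method might do, assuming the trainer holds a torch.nn.Module as self.model; the filename pattern and weights-only policy are assumptions, not the project's actual behavior.

# Hypothetical Trainer.save, assuming self.model is a torch.nn.Module;
# the '{prefix}.ep{epoch}' filename pattern is an illustration only.
import torch

class Trainer:
    def save(self, epoch, output_model_prefix):
        # Write weights under a per-epoch name, e.g. 'model.ep3'
        path = f'{output_model_prefix}.ep{epoch}'
        torch.save(self.model.state_dict(), path)  # weights only, no optimizer state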
Example #4
def main(args):
    print(args)
    set_seeds()

    # Build DataLoader
    train_dataset, test_dataset = create_examples(args)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True)
    test_loader = DataLoader(test_dataset,
                             batch_size=args.batch_size,
                             shuffle=True)

    # Build Trainer
    trainer = Trainer(args, train_loader, test_loader)
    # Warm up
    for epoch in range(1, args.pretrain + 1):
        trainer.pretrain(epoch)

    # Train & Validate
    for epoch in range(1, args.epochs + 1):
        trainer.train(epoch)
        trainer.validate(epoch)
        trainer.save(epoch, args.output_model_prefix)
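
Example #4 is the only one that calls set_seeds() before building anything. A typical implementation pins every RNG in play, as sketched below; this is an assumption about the helper, not its actual body.

# Hypothetical set_seeds helper; the project's version may differ.
import random
import numpy as np
import torch

def set_seeds(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # no-op when CUDA is absent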