Example 1
import argparse
import datetime
import logging
import os
from os.path import join

import numpy as np
import torch

# GPT2Config, GPT2Tokenizer and GPT2LMHeadModel come from the HuggingFace
# transformers package; boolean_string, BucketingDataLoader,
# DistributedBucketingDataLoader and train() are project-local helpers whose
# module paths are not shown in this snippet.
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer

logger = logging.getLogger(__name__)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model_name_or_path',
        type=str,
        default='gpt2',
        help='pretrained model name or path to local checkpoint')
    parser.add_argument("--train_input_file",
                        type=str,
                        default='data/train.128len.db')
    parser.add_argument("--eval_input_file",
                        type=str,
                        default='./data/dummy_data.tsv')
    parser.add_argument("--output_dir", type=str, default='output')
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--max_seq_length", type=int, default=128)

    parser.add_argument("--skip_eval",
                        action='store_true',
                        help='If true, skip evaluation.')

    parser.add_argument("--continue_from", type=int, default=0)

    parser.add_argument("--train_batch_size",
                        type=int,
                        default=4,
                        help="batch size now means per GPU per step")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=2,
        help="to increase effective batch size and reduce synchronization")
    parser.add_argument("--eval_batch_size", type=int, default=4)
    parser.add_argument("--learning_rate", type=float, default=1e-5)
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_optim_steps",
                        type=int,
                        default=1000000,
                        help="new API specifies num update steps")
    parser.add_argument("--valid_step",
                        type=int,
                        default=10000,
                        help="how many optim steps between validations")
    parser.add_argument("--warmup_proportion", type=float, default=0.1)
    parser.add_argument("--warmup_steps", type=int, default=16000)

    parser.add_argument("--normalize_data", type=boolean_string, default=True)
    parser.add_argument("--fp16", type=boolean_string, default=True)
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help="For fp16: Apex AMP optimization level selected in "
             "['O0', 'O1', 'O2', 'O3']. "
             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--lr_schedule",
                        type=str,
                        choices=['noam', 'noamwd', 'BERT', 'None'],
                        default='noam')
    parser.add_argument("--loss_scale", type=float, default=0)
    parser.add_argument("--no_token_id", type=boolean_string, default=True)

    parser.add_argument("--log_dir", type=str)
    parser.add_argument('--pbar',
                        type=boolean_string,
                        default=True,
                        help='turn on progress bar')

    # distributed
    parser.add_argument('--local_rank',
                        type=int,
                        default=-1,
                        help='for torch.distributed')

    args = parser.parse_args()

    assert args.train_batch_size % args.gradient_accumulation_steps == 0, \
        'train_batch_size must be divisible by gradient_accumulation_steps!'
    args.train_batch_size = (args.train_batch_size //
                             args.gradient_accumulation_steps)
    logger.info(
        f'train batch size = '
        f'{args.train_batch_size * args.gradient_accumulation_steps}, '
        f'new train batch size (after gradient accumulation) = '
        f'{args.train_batch_size}')
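    # Worked example with the defaults: --train_batch_size 4 and
    # --gradient_accumulation_steps 2 give a per-GPU micro-batch of
    # 4 // 2 = 2, while each optimizer step still accumulates
    # 2 * 2 = 4 examples per GPU (times the world size when distributed).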

    if args.local_rank == -1:
        logger.info(f'CUDA available? {str(torch.cuda.is_available())}')
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count()
        args.device, args.n_gpu = device, n_gpu
    else:
        # distributed training
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        n_gpu = torch.distributed.get_world_size()
        # each distributed process drives exactly one GPU
        args.device, args.n_gpu = device, 1
        logger.info(
            f"device: {device} n_gpu: {n_gpu}, "
            f"distributed training: {bool(args.local_rank != -1)}, "
            f"16-bit training: {args.fp16}")

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    timestamp = datetime.datetime.now().strftime('%Y-%m-%d%H%M%S')
    output_dir = join(
        args.output_dir,
        'GPT2.{}.{}.{}gpu.{}'.format(args.learning_rate, args.train_batch_size,
                                     n_gpu, timestamp))
    log_dir = (args.log_dir
               if args.log_dir is not None and len(args.log_dir) > 0
               else output_dir)
    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        os.makedirs(output_dir, exist_ok=True)
        train_logger = open(join(log_dir, 'train_log.txt'), 'a+', buffering=1)
        eval_logger = open(join(log_dir, 'eval_log.txt'), 'a+', buffering=1)
        print(
            'epoch,global_step,step,mean_loss,n_token_real,n_token_total,epoch_time',
            file=train_logger)
        print('epoch,global_step,step,eval_loss', file=eval_logger)

    tokenizer = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    config = GPT2Config.from_pretrained(args.model_name_or_path)

    if args.local_rank == -1:
        train_dataloader = BucketingDataLoader(args.train_input_file,
                                               args.train_batch_size,
                                               args.max_seq_length)
    else:
        train_dataloader = DistributedBucketingDataLoader(
            torch.distributed.get_rank(), torch.distributed.get_world_size(),
            args.train_input_file, args.train_batch_size, args.max_seq_length)

    model = GPT2LMHeadModel.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config)
    model = model.to(args.device)

    global_step, tr_loss = train(args, train_dataloader, model, tokenizer,
                                 train_logger, eval_logger)
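
The snippet above passes type=boolean_string to argparse because argparse's built-in bool() converts any non-empty string, including "False", to True. The helper itself is not shown anywhere in these examples; a minimal sketch of what it is assumed to look like:

def boolean_string(s):
    # hypothetical reconstruction of the project-local helper: accept only
    # the literal strings 'True' / 'False' and map them to a real bool
    if s not in ('True', 'False'):
        raise ValueError('Not a valid boolean string: %r' % s)
    return s == 'True'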
Example 2
from os.path import join

from transformers import GPT2Config, GPT2Tokenizer

# args, logger and the project-local data loaders (BucketingDataLoader,
# DynamicBatchingLoader, get_eval_list_same_length) are assumed to be
# defined earlier in the original script.

logger.info('Input Argument Information')
for name, value in vars(args).items():
    logger.info('%-28s  %s' % (name, value))

##########################################################################
# Prepare Dataset
##########################################################################
enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)

config = GPT2Config.from_json_file(join(args.model_name_or_path,
                                        'config.json'))

if args.local_rank == -1:
    train_dataloader = BucketingDataLoader(args.train_input_file,
                                           args.train_batch_size,
                                           args.max_seq_length)
else:
    pass
    # train_dataloader = DistributedBucketingDataLoader(
    #     get_rank(), get_world_size(),
    #     args.train_input_file, args.train_batch_size,
    #     args.max_seq_length)

eval_dataloader_loss = DynamicBatchingLoader(args.eval_input_file, enc,
                                             args.normalize_data,
                                             args.eval_batch_size,
                                             args.max_seq_length)

eval_dataloader_gen = get_eval_list_same_length(args.eval_input_file, enc,
                                                args.eval_batch_size, True)
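
DynamicBatchingLoader and get_eval_list_same_length are project-local helpers whose internals are not shown. Assuming the loss loader yields (input_ids, lm_labels) batches and a recent HuggingFace transformers model, a minimal sketch of an evaluation pass (all names here are illustrative, not the project's actual API):

import math

import torch

def sketch_eval_loss(model, dataloader, device):
    # average the LM loss over the eval set; perplexity = exp(mean loss)
    model.eval()
    total_loss, n_batches = 0.0, 0
    with torch.no_grad():
        for input_ids, lm_labels in dataloader:
            out = model(input_ids.to(device), labels=lm_labels.to(device))
            total_loss += out.loss.item()
            n_batches += 1
    mean_loss = total_loss / max(n_batches, 1)
    return mean_loss, math.exp(mean_loss)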
Example 3
import os

import torch
from tqdm import tqdm

# get_kogpt2_model, eval_model_loss, BucketingDataLoader, get_rank, enc,
# VOCAB_PATH, args, logger, train_logger and eval_logger are assumed to be
# defined earlier in the original script; this snippet starts part-way in.

# header matches the three fields actually written per checkpoint below
print('global_step,eval_loss,eval_ppl', file=eval_logger)

filenames = os.listdir(args.init_checkpoint)
filenames = [f for f in filenames if f.endswith('.pkl')]
# checkpoint names are assumed to carry the global step after a fixed
# 18-character prefix, e.g. '<prefix>-step-12345.pkl'; the slice extracts it
filenames = sorted(filenames, key=lambda x: int(x[18:x.index('.')]))
for filename in tqdm(filenames):
    global_step = int(filename[18:filename.index('.')])
    model_path = os.path.join(args.init_checkpoint, filename)
    model, vocab = get_kogpt2_model(model_path, VOCAB_PATH, 0)

    if args.fp16:
        logger.info('fp16 enabled: calling model.half()')
        model.half()

    if args.n_gpu > 1:
        logger.info('using DataParallel because more than one GPU is available')
        model = torch.nn.DataParallel(model)

    eval_dataloader_loss = BucketingDataLoader(args.eval_input_file,
                                               args.eval_batch_size,
                                               args.max_seq_length)

    if args.local_rank == -1 or get_rank() == 0:
        eval_loss, eval_ppl = eval_model_loss(model, enc, eval_dataloader_loss,
                                              args)
        print(f'{global_step + 1},{eval_loss},{eval_ppl}', file=eval_logger)

if args.local_rank == -1 or get_rank() == 0:
    train_logger.close()
    eval_logger.close()
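
Because eval_log.txt is written as a plain CSV, the best checkpoint can be recovered after the sweep. A small sketch, assuming the three-column rows written above (global_step, eval_loss, eval_ppl); the helper name is hypothetical:

import csv

def best_step(eval_log_path):
    # return the (global_step, eval_ppl) pair with the lowest perplexity
    best = None
    with open(eval_log_path) as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        for step, _loss, ppl in reader:
            if best is None or float(ppl) < best[1]:
                best = (int(step), float(ppl))
    return best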