Example no. 1
 def _validate():
     nonlocal best_kappa
     if df_valid is not None:
         valid_metrics, bins, _ = validate()
         if is_main:
             epoch_pbar.set_postfix(
                 {k: f'{v:.4f}'
                  for k, v in valid_metrics.items()})
             json_log_plots.write_event(run_root, step, **valid_metrics)
             if valid_metrics['kappa'] > best_kappa:
                 best_kappa = valid_metrics['kappa']
                 state = {
                     'weights': model.state_dict(),
                     'bins': bins,
                     'metrics': valid_metrics,
                     'params': params,
                 }
                 torch.save(state, model_path)
     elif is_main:
         state = {
             'weights': model.state_dict(),
             'bins': default_bins(N_CLASSES),
             'params': params,
         }
         torch.save(state, model_path)
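
Example no. 1 assumes a default_bins helper that supplies the initial thresholds used to turn continuous predictions into N_CLASSES ordinal labels. A minimal sketch of such a helper (an assumption; the project's actual implementation may differ):

    import numpy as np

    N_CLASSES = 5  # hypothetical value; use the real number of classes

    def default_bins(n_classes):
        # thresholds halfway between consecutive integer labels,
        # e.g. [0.5, 1.5, 2.5, 3.5] for 5 classes
        return np.arange(n_classes - 1) + 0.5
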
Example no. 2
 def validate(epoch: int):
     if not is_main or world_size != 1:
         return
     valid_loss = get_valid_loss()
     json_log_plots.write_event(run_path, step=seen_tokens, valid_loss=valid_loss)
     log_writer_valid.add_scalar("loss_epoch", valid_loss, epoch)
     log_writer_valid.add_scalar("perplexity_epoch", np.exp(valid_loss), epoch)
    def train():
        nonlocal seen_tokens
        epoch_size = len(train_dataset) // step_tokens * step_tokens
        pbar = tqdm.trange(epochs, desc='epochs', dynamic_ncols=True, disable=not is_main)

        step = 1
        loss_per_epoch = []
        j = 0
        start_time = time.time()

        while seen_tokens < epochs * epoch_size:
            if max_tokens and seen_tokens >= max_tokens:
                print(f'max_tokens {max_tokens} reached, '
                      f'saving and exiting')
                save()
                validate(j)
                return

            train_step()
            seen_tokens += step_tokens
            step += 1
            loss_per_epoch.append(loss_meter.mean())
            if step % save_every == 0:
                save()
            if is_main and step % log_every == 0:
                json_log_plots.write_event(dist_run_path, step=seen_tokens,
                                           loss=loss_meter.mean())
                loss_meter.reset()
                
            if step % validate_every == 0:
                validate(j)
            
            # end of epoch: report mean train loss, validation loss and elapsed time
            if seen_tokens % epoch_size == 0:
                valid_loss = get_valid_loss()
                print(f'epoch: {j} \t train_loss: {np.mean(loss_per_epoch):.3f} \t valid_loss = {valid_loss:.3f} \t time: {(time.time()-start_time):.2f}')
                j += 1
                loss_per_epoch = []
                start_time = time.time()

        # end of training
        save()
        validate(j)
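
Several of these snippets (Examples no. 2, 6 and 15) call loss_meter.mean() and loss_meter.reset() without showing the meter itself. A minimal running-average meter that would satisfy that interface (a sketch, not the original implementation):

    class AverageMeter:
        """Accumulates scalar values between log events."""

        def __init__(self):
            self.values = []

        def update(self, value):
            self.values.append(float(value))

        def mean(self):
            return sum(self.values) / len(self.values) if self.values else 0.0

        def reset(self):
            self.values.clear()

    loss_meter = AverageMeter()  # name matches the snippets above
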
Example no. 4
 def log_training_loss(_):
     nonlocal step
     train_losses.append(trainer.state.output)
     smoothed_loss = np.mean(train_losses)
     epoch_pbar.set_postfix(loss=f'{smoothed_loss:.4f}')
     epoch_pbar.update(1)
     step += 1
     if step % 20 == 0 and output_dir:
         json_log_plots.write_event(output_dir,
                                    step=step * args.batch_size,
                                    loss=smoothed_loss)
Example no. 5
 def log_validation_results(_):
     nonlocal best_f1
     metrics = evaluate()
     if output_dir:
         json_log_plots.write_event(
             output_dir, step=step * args.batch_size, **metrics)
     if metrics['f1'] > best_f1:
         best_f1 = metrics['f1']
         if output_dir:
             torch.save(model.state_dict(), output_dir / 'model_best.pth')
     epochs_pbar.set_postfix({
         k: format_value(v) for k, v in metrics.items()})
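
Example no. 5 formats the metrics through a format_value helper that is not shown; a plausible minimal version (an assumption):

    def format_value(v):
        # compact postfix display: four decimals for floats, plain str otherwise
        return f'{v:.4f}' if isinstance(v, float) else str(v)
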
Example no. 6
 def train():
     nonlocal seen_tokens
     epoch_size = len(train_dataset) // step_tokens * step_tokens
     pbar = tqdm.trange(epochs,
                        desc='epochs',
                        dynamic_ncols=True,
                        disable=not is_main)
     init_epoch_pbar = lambda: tqdm.trange(
         epoch_size, dynamic_ncols=True, disable=not is_main)
     epoch_pbar = init_epoch_pbar()
     pbar.update(seen_tokens // epoch_size)
     pbar.refresh()
     epoch_pbar.update(seen_tokens % epoch_size)
     step = 1
     while seen_tokens < epochs * epoch_size:
         if max_tokens and seen_tokens >= max_tokens:
             print(f'max_tokens {max_tokens} reached, '
                   f'saving and exiting')
             save()
             validate()
             return
         train_step()
         seen_tokens += step_tokens
         step += 1
         epoch_pbar.update(step_tokens)
         epoch_pbar.set_description(
             f'epoch {1 + seen_tokens // epoch_size}')
         epoch_pbar.set_postfix(loss=f'{loss_meters["loss"].mean():.2f}')
         epoch_pbar.refresh()
         if step % save_every == 0:
             save()
         if is_main and step % log_every == 0:
             json_log_plots.write_event(
                 run_path,
                 step=seen_tokens,
                 **{
                     name: meter.mean()
                     for name, meter in loss_meters.items()
                 })
             for meter in loss_meters.values():
                 meter.reset()
         if step % validate_every == 0:
             validate()
         if seen_tokens % epoch_size == 0:
             pbar.update()
             epoch_pbar.close()
             epoch_pbar = init_epoch_pbar()
     # end of training
     save()
     validate()
Example no. 7
 def train():
     nonlocal step
     for _ in tqdm.trange(params['epochs'],
                          desc='epoch',
                          dynamic_ncols=True):
         lr_scheduler.step()
         pbar = tqdm.tqdm(train_loader, desc='train', dynamic_ncols=True)
         for batch in pbar:
             loss_value = train_step(*batch)
             step += 1
             pbar.set_postfix(loss=f'{loss_value:.2f}')
             json_log_plots.write_event(run_path,
                                        step * params['batch_size'],
                                        loss=loss_value)
             if (params['validate_every']
                     and step % params['validate_every'] == 0):
                 validate()
         save()
         validate()
Example no. 8
 def train_epoch(epoch):
     nonlocal step
     model.train()
     report_freq = 5
     running_losses = []
     train_loader = make_loader(df_train, args.batch_size, training=True)
     if args.ddp:
         train_loader.sampler.set_epoch(epoch)
     pbar = tqdm.tqdm(train_loader,
                      dynamic_ncols=True,
                      desc='train',
                      disable=not is_main)
     optimizer.zero_grad()
     for i, (ids, xs, ys) in enumerate(pbar):
         step += len(ids) * n_devices
         with amp.autocast(enabled=amp_enabled):
             _, loss = forward(xs, ys)
         scaler.scale(loss).backward()
         if (i + 1) % args.grad_acc == 0:
             scaler.step(optimizer)
             scaler.update()
             optimizer.zero_grad()
         running_losses.append(float(loss))
         if lr_scheduler_per_step:
             try:
                 lr_scheduler.step()
             except ValueError as e:
                 print(e)
         if i and i % report_freq == 0:
             mean_loss = np.mean(running_losses)
             running_losses.clear()
             pbar.set_postfix({'loss': f'{mean_loss:.4f}'})
             json_log_plots.write_event(run_root, step, loss=mean_loss)
     pbar.close()
     if lr_scheduler is not None and not lr_scheduler_per_step:
         lr_scheduler.step()
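
One detail worth noting in Example no. 8: the optimizer only steps when (i + 1) % args.grad_acc == 0, so if the number of batches in an epoch is not a multiple of args.grad_acc, the last partial accumulation is never applied. If that matters, a flush after the batch loop (reusing the snippet's own names) could look like this:

    # after the batch loop: apply any gradients that were accumulated
    # but never stepped because the epoch ended mid-accumulation
    if (i + 1) % args.grad_acc != 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
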
Example no. 9
def train():
    # Turn on training mode which enables dropout.
    global train_loss, best_val_loss, eval_start_time, log_start_time, corpus, n_batches_per_epoch, max_step
    model.train()
    mems = tuple()

    for train_step in range(n_restart_step, max_step):

        data, target = corpus.get_batch('train', train_step)

        model.zero_grad()

        ret = para_model(data, target, *mems)
        loss, mems = ret[0], ret[1:]
        loss = loss.float().mean().type_as(loss)
        loss.backward()
        train_loss += loss.float().item()

        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)

        optimizer.step()
        if args.sample_softmax > 0:
            optimizer_sparse.step()

        # step-wise learning rate annealing
        if args.scheduler in ['cosine', 'constant', 'dev_perf']:
            # linear warmup stage
            if train_step < args.warmup_step:
                curr_lr = args.lr * train_step / args.warmup_step
                optimizer.param_groups[0]['lr'] = curr_lr
                if args.sample_softmax > 0:
                    optimizer_sparse.param_groups[0]['lr'] = curr_lr * 2
            else:
                if args.scheduler == 'cosine':
                    scheduler.step(train_step)
                    if args.sample_softmax > 0:
                        scheduler_sparse.step(train_step)
        elif args.scheduler == 'inv_sqrt':
            scheduler.step(train_step)

        if (train_step > n_restart_step) and (train_step % args.log_interval
                                              == 0):
            cur_loss = train_loss / args.log_interval
            elapsed = time.time() - log_start_time

            epoch = train_step / n_batches_per_epoch + 1

            log_str = '| epoch %2d/%2d batch %6d/%6d [%7.2f%%] | lr %.3g | ms/batch %5.2f | loss %7.3f' % (
                epoch, args.num_epochs, train_step % n_batches_per_epoch,
                n_batches_per_epoch, train_step * 100.0 / max_step,
                optimizer.param_groups[0]['lr'],
                elapsed * 1000 / args.log_interval, cur_loss)

            log_str += ' | ppl {:9.3f}'.format(math.exp(cur_loss))
            logging(log_str)

            json_log_plots.write_event(Path(args.work_dir),
                                       step=train_step,
                                       loss=cur_loss,
                                       lr=optimizer.param_groups[0]['lr'] *
                                       100000.0)

            train_loss = 0
            log_start_time = time.time()

        if (train_step > n_restart_step) and (train_step % args.eval_interval
                                              == 0):
            save_model('cur')
            with open(n_steps_txt_path, 'w') as f:
                f.write("%s\n" % train_step)

            logging("evaluating model...")
            val_loss = evaluate('valid')

            json_log_plots.write_event(Path(args.work_dir),
                                       step=train_step,
                                       val_loss=val_loss)

            # Save the model if the validation loss is the best we've seen so far.
            if not best_val_loss or val_loss < best_val_loss:
                logging("best valid loss so far.")
                save_model('valid')
                best_val_loss = val_loss
Example no. 10
 def validate():
     if not is_main or world_size != 1:
         return
     json_log_plots.write_event(run_path,
                                step=seen_tokens,
                                valid_loss=get_valid_loss())
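
All of these examples report scalars through json_log_plots.write_event(run_dir, step, **metrics). If that package is not available, a rough stand-in that appends one JSON object per event to a file in the run directory would serve the same call sites (the file name and on-disk format here are assumptions, not the library's actual format):

    import json
    from pathlib import Path

    def write_event(run_dir, step, **metrics):
        # append one JSON line per event: the step first, then any scalar metrics
        run_dir = Path(run_dir)
        run_dir.mkdir(parents=True, exist_ok=True)
        event = {'step': step}
        event.update({k: float(v) for k, v in metrics.items()})
        with (run_dir / 'events.jsonl').open('a') as f:
            f.write(json.dumps(event) + '\n')
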
Example no. 11
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    arg = parser.add_argument

    arg('--model', default='fasterrcnn_resnet50_fpn', help='model')
    arg('--device', default='cuda', help='device')
    arg('--batch-size', default=16, type=int)
    arg('--workers',
        default=4,
        type=int,
        help='number of data loading workers')
    arg('--lr', default=0.01, type=float, help='initial learning rate')
    arg('--momentum', default=0.9, type=float, help='momentum')
    arg('--wd',
        '--weight-decay',
        default=1e-4,
        type=float,
        help='weight decay (default: 1e-4)',
        dest='weight_decay')
    arg('--epochs', default=45, type=int, help='number of total epochs to run')
    arg('--lr-steps',
        default=[35],
        nargs='+',
        type=int,
        help='decrease lr every step-size epochs')
    arg('--lr-gamma',
        default=0.1,
        type=float,
        help='decrease lr by a factor of lr-gamma')
    arg('--cosine',
        type=int,
        default=0,
        help='cosine lr schedule (disables the step lr schedule)')
    arg('--print-freq', default=100, type=int, help='print frequency')
    arg('--output-dir', help='path where to save')
    arg('--resume', help='resume from checkpoint')
    arg('--test-only', help='Only test the model', action='store_true')
    arg('--submission', help='Create test predictions', action='store_true')
    arg('--pretrained',
        type=int,
        default=0,
        help='Use pre-trained models from the modelzoo')
    arg('--score-threshold', type=float, default=0.5)
    arg('--nms-threshold', type=float, default=0.25)
    arg('--repeat-train-step', type=int, default=2)

    # fold parameters
    arg('--fold', type=int, default=0)
    arg('--n-folds', type=int, default=5)

    # distributed training parameters
    arg('--world-size',
        default=1,
        type=int,
        help='number of distributed processes')
    arg('--dist-url',
        default='env://',
        help='url used to set up distributed training')

    args = parser.parse_args()
    if args.test_only and args.submission:
        parser.error('pass only one of --test-only and --submission')

    output_dir = Path(args.output_dir) if args.output_dir else None
    if output_dir:
        output_dir.mkdir(parents=True, exist_ok=True)

    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # Data loading code
    print('Loading data')

    df_train, df_valid = load_train_valid_df(args.fold, args.n_folds)
    root = TRAIN_ROOT
    if args.submission:
        df_valid = pd.read_csv(DATA_ROOT / 'sample_submission.csv')
        df_valid['labels'] = ''
        root = TEST_ROOT
    dataset = Dataset(df_train,
                      get_transform(train=True),
                      root,
                      skip_empty=False)
    dataset_test = Dataset(df_valid,
                           get_transform(train=False),
                           root,
                           skip_empty=False)

    print('Creating data loaders')
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            dataset)
        test_sampler = \
            torch.utils.data.distributed.DistributedSampler(dataset_test)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)

    train_batch_sampler = torch.utils.data.BatchSampler(train_sampler,
                                                        args.batch_size,
                                                        drop_last=True)

    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_sampler=train_batch_sampler,
        num_workers=args.workers,
        collate_fn=utils.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(dataset_test,
                                                   batch_size=1,
                                                   sampler=test_sampler,
                                                   num_workers=args.workers,
                                                   collate_fn=utils.collate_fn)

    print('Creating model')
    model = build_model(args.model, args.pretrained, args.nms_threshold)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    lr_scheduler = None
    if args.cosine:
        lr_scheduler = CosineAnnealingLR(optimizer, args.epochs)
    elif args.lr_steps:
        lr_scheduler = MultiStepLR(optimizer,
                                   milestones=args.lr_steps,
                                   gamma=args.lr_gamma)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        if 'model' in checkpoint:
            model_without_ddp.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            if lr_scheduler and 'lr_scheduler' in checkpoint:
                lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        else:
            model_without_ddp.load_state_dict(checkpoint)
        print(f'Loaded from checkpoint {args.resume}')

    def save_eval_results(er):
        scores, clf_gt = er
        if output_dir:
            pd.DataFrame(scores).to_csv(output_dir / 'eval.csv', index=None)
            pd.DataFrame(clf_gt).to_csv(output_dir / 'clf_gt.csv', index=None)

    if args.test_only or args.submission:
        _, eval_results = evaluate(model,
                                   data_loader_test,
                                   device=device,
                                   output_dir=output_dir,
                                   threshold=args.score_threshold)
        if args.test_only:
            save_eval_results(eval_results)
        elif output_dir:
            pd.DataFrame(eval_results[1]).to_csv(output_dir /
                                                 'test_predictions.csv',
                                                 index=None)
        return

    print('Start training')
    best_f1 = 0
    start_time = time.time()
    for epoch in range(args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        for _ in range(args.repeat_train_step):
            train_metrics = train_one_epoch(model, optimizer, data_loader,
                                            device, epoch, args.print_freq)
        if lr_scheduler:
            lr_scheduler.step()
        if output_dir:
            json_log_plots.write_event(output_dir, step=epoch, **train_metrics)
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': (lr_scheduler.state_dict()
                                     if lr_scheduler else None),
                    'args': args,
                }, output_dir / 'checkpoint.pth')

        # evaluate after every epoch
        eval_metrics, eval_results = evaluate(model,
                                              data_loader_test,
                                              device=device,
                                              output_dir=None,
                                              threshold=args.score_threshold)
        save_eval_results(eval_results)
        if output_dir:
            json_log_plots.write_event(output_dir, step=epoch, **eval_metrics)
            if eval_metrics['f1'] > best_f1:
                best_f1 = eval_metrics['f1']
                print(f'Updated best model with f1 of {best_f1}')
                utils.save_on_master(model_without_ddp.state_dict(),
                                     output_dir / 'model_best.pth')

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
Example no. 12
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_corpus",
                        default=None,
                        type=str,
                        required=True,
                        help="The input train corpus.")
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--on_memory",
        action='store_true',
        help="Whether to load train samples into memory or use disk")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help="Number of update steps to accumulate before performing a "
             "backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError(
            "Training is currently the only implemented execution option. Please set `do_train`."
        )

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    num_train_optimization_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_corpus)
        train_dataset = BERTDataset(args.train_corpus,
                                    tokenizer,
                                    seq_len=args.max_seq_length,
                                    corpus_lines=None,
                                    on_memory=args.on_memory)
        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model
    model = BertForPreTraining.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]

        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)

        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

    if not args.do_train:
        return

    def save():
        # Save a trained model
        logger.info("***** Saving fine-tuned model *****")
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        torch.save(model_to_save.state_dict(), output_model_file)

    global_step = 0
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)

    if args.local_rank == -1:
        train_sampler = RandomSampler(train_dataset)
    else:
        #TODO: check if this works with current data generator from disk that relies on next(file)
        # (it doesn't return item back by index)
        train_sampler = DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  num_workers=2)

    model.train()
    nb_tr_examples, nb_tr_steps = 0, 0
    try:
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_losses = deque(maxlen=20)
            pbar = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(pbar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                loss = model(input_ids, segment_ids, input_mask, lm_label_ids,
                             is_next)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                tr_losses.append(loss.item())
                pbar.set_postfix(loss=f'{np.mean(tr_losses):.4f}')
                if (step + 1) % 20 == 0:
                    json_log_plots.write_event(Path(args.output_dir),
                                               nb_tr_examples,
                                               loss=np.mean(tr_losses))
                if (step + 1) % 10000 == 0:
                    save()

    except KeyboardInterrupt:
        print('Ctrl+C pressed, saving checkpoint')
        save()
        raise
    save()
Example no. 13
def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('run_root')
    arg('--train-size', type=int)
    arg('--valid-size', type=int)
    arg('--test-size', type=int)
    arg('--model', default='bert-base-uncased')
    arg('--train-seq-length', type=int, default=224)
    arg('--test-seq-length', type=int, default=296)
    arg('--epochs', type=int, default=2)
    arg('--validation', action='store_true')
    arg('--submission', action='store_true')
    arg('--lr', type=float, default=2e-5)
    arg('--batch-size', type=int, default=32)
    arg('--accumulation-steps', type=int, default=2)
    arg('--checkpoint-interval', type=int)
    arg('--clean', action='store_true')
    arg('--fold', type=int, default=0)
    arg('--bucket', type=int, default=1)
    arg('--load-weights', help='load weights for training')
    arg('--export', help='export everything for inference')
    args = parser.parse_args()

    run_root = Path(args.run_root)
    do_train = not (args.submission or args.validation or args.export)
    if do_train:
        if args.clean and run_root.exists():
            if input(f'Clean "{run_root.absolute()}"? ') == 'y':
                shutil.rmtree(run_root)
        if run_root.exists():
            parser.error(f'{run_root} exists')
        run_root.mkdir(exist_ok=True, parents=True)
        params_str = json.dumps(vars(args), indent=4)
        print(params_str)
        (run_root / 'params.json').write_text(params_str)
        shutil.copy(__file__, run_root)
    else:
        run_root.mkdir(exist_ok=True, parents=True)

    use_bert = 'bert' in args.model
    use_gpt2 = 'gpt2' in args.model
    if args.export:
        if ((use_bert and 'bert' not in args.export)
                or (use_gpt2 and 'gpt2' not in args.export)):
            parser.error("Can't determine model kind from the --export option")

    print('Loading tokenizer...')
    if use_bert:
        tokenizer = BertTokenizer.from_pretrained(args.model,
                                                  do_lower_case='uncased'
                                                  in args.model)
        pad_idx = 0
    elif use_gpt2:
        tokenizer = GPT2Tokenizer.from_pretrained(args.model)
        tokenizer.set_special_tokens([GPT2_PAD])
        pad_idx, = tokenizer.convert_tokens_to_ids([GPT2_PAD])
    else:
        raise ValueError(f'Unexpected model {args.model}')

    print('Loading model...')
    model_is_path = Path(args.model).exists()
    num_labels = 7
    if use_bert:
        model = BertForSequenceClassification.from_pretrained(
            args.model, num_labels=num_labels)
    else:
        model = GPT2ClassificationHeadModel(args.model, num_labels=num_labels)
        model.transformer.set_num_special_tokens(1)
        if model_is_path:
            # to also load linear layer weights
            model.load_state_dict(
                torch.load(Path(args.model) / 'pytorch_model.bin'))

    model_path = run_root / 'model.pt'
    optimizer_path = run_root / 'optimizer.pt'
    best_model_path = run_root / 'model-best.pt'
    valid_predictions_path = run_root / 'valid-predictions.csv'

    if args.export:
        model.load_state_dict(torch.load(best_model_path))
        export_path = Path(args.export)
        export_path.mkdir(exist_ok=True, parents=True)
        torch.save(model.state_dict(), export_path / WEIGHTS_NAME)
        model.config.to_json_file(export_path / CONFIG_NAME)
        tokenizer.save_vocabulary(export_path)
        return

    model = model.to(device)

    if args.submission:
        if not model_is_path:
            model.load_state_dict(torch.load(best_model_path))
        if amp is not None:
            model = amp.initialize(model, opt_level='O1', verbosity=0)
        make_submission(model=model,
                        tokenizer=tokenizer,
                        run_root=run_root,
                        max_seq_length=args.test_seq_length,
                        batch_size=args.batch_size,
                        pad_idx=pad_idx,
                        use_bert=use_bert,
                        bucket=args.bucket,
                        test_size=args.test_size)
        return

    train_pkl_path = DATA_ROOT / 'train.pkl'
    if not train_pkl_path.exists():
        pd.read_csv(DATA_ROOT / 'train.csv').to_pickle(train_pkl_path)
    df = pd.read_pickle(train_pkl_path)
    df = preprocess_df(df)

    folds = json.loads((DATA_ROOT / 'folds.json').read_text())
    valid_index = df['id'].isin(folds[args.fold])
    df_train, df_valid = df[~valid_index], df[valid_index]
    if args.train_size and len(df_train) > args.train_size:
        df_train = df_train.sample(n=args.train_size, random_state=42)
    if args.valid_size and len(df_valid) > args.valid_size:
        df_valid = df_valid.sample(n=args.valid_size, random_state=42)

    x_valid = tokenize_lines(df_valid.pop('comment_text'),
                             args.test_seq_length,
                             tokenizer,
                             use_bert=use_bert,
                             pad_idx=pad_idx)
    if args.bucket:
        indices, x_valid = sorted_by_length(x_valid, pad_idx)
        # TODO recover original order before saving
        df_valid = df_valid.iloc[indices]
    y_valid, _ = get_target(df_valid)
    y_train, loss_weight = get_target(df_train)
    print(f'X_valid.shape={x_valid.shape} y_valid.shape={y_valid.shape}')

    criterion = partial(get_loss, loss_weight=loss_weight)

    def _run_validation():
        return validation(model=model,
                          criterion=criterion,
                          x_valid=x_valid,
                          y_valid=y_valid,
                          df_valid=df_valid,
                          batch_size=args.batch_size,
                          pad_idx=pad_idx,
                          bucket=args.bucket)

    if args.validation:
        if not model_is_path:
            model.load_state_dict(torch.load(best_model_path))
        if amp is not None:
            model = amp.initialize(model, opt_level='O1', verbosity=0)
        metrics, valid_predictions = _run_validation()
        for k, v in metrics.items():
            if isinstance(v, float):
                print(f'{v:.4f}  {k}')
        valid_predictions.to_csv(valid_predictions_path, index=None)
        print(f'Saved validation predictions to {valid_predictions_path}')
        return

    def _save(step, model, optimizer):
        torch.save(model.state_dict(), model_path)
        torch.save({
            'optimizer': optimizer.state_dict(),
            'step': step
        }, optimizer_path)

    if args.load_weights:
        print(f'Loading weights from {args.load_weights}')
        load_info = model.load_state_dict(torch.load(args.load_weights),
                                          strict=False)
        if load_info:
            print(load_info)

    x_train = tokenize_lines(df_train.pop('comment_text'),
                             args.train_seq_length,
                             tokenizer,
                             use_bert=use_bert,
                             pad_idx=pad_idx)
    print(f'X_train.shape={x_train.shape} y_train.shape={y_train.shape}')

    best_auc = 0
    step = optimizer = None
    try:
        for model, optimizer, epoch_pbar, loss, step in train(
                model=model,
                criterion=criterion,
                x_train=x_train,
                y_train=y_train,
                epochs=args.epochs,
                yield_steps=args.checkpoint_interval or len(y_valid) // 8,
                bucket=args.bucket,
                lr=args.lr,
                batch_size=args.batch_size,
                accumulation_steps=args.accumulation_steps,
                pad_idx=pad_idx,
        ):
            if step == 0:
                continue  # step 0 allows saving on Ctrl+C from the start
            _save(step, model, optimizer)
            metrics, valid_predictions = _run_validation()
            metrics['loss'] = loss
            if metrics['auc'] > best_auc:
                best_auc = metrics['auc']
                shutil.copy(model_path, best_model_path)
                valid_predictions.to_csv(valid_predictions_path, index=None)
            epoch_pbar.set_postfix(valid_loss=f'{metrics["valid_loss"]:.4f}',
                                   auc=f'{metrics["auc"]:.4f}')
            json_log_plots.write_event(run_root, step=step, **metrics)
    except KeyboardInterrupt:
        if step is not None and optimizer is not None:
            print('Ctrl+C pressed, saving checkpoint')
            _save(step, model, optimizer)
        raise
Example no. 14
 def validate():
     json_log_plots.write_event(run_path, step * params['batch_size'],
                                **get_validation_metrics())
Example no. 15
    def train():
        nonlocal seen_tokens
        epoch_size = len(train_dataset) // step_tokens * step_tokens
        pbar = tqdm.trange(epochs, desc="epochs", dynamic_ncols=True, disable=not is_main)
        init_epoch_pbar = lambda: tqdm.trange(epoch_size, dynamic_ncols=True, disable=not is_main)
        epoch_pbar = init_epoch_pbar()
        pbar.update(seen_tokens // epoch_size)
        pbar.refresh()
        epoch_pbar.update(seen_tokens % epoch_size)
        step = 0
        epoch, train_loss = 0, 0.0
        # context_gen = _gen_training_batch(train_dataset, n_ctx=n_ctx, batch_size=batch_size * accum_gradients)
        context = None

        avg_epoch_loss, avg_epoch_perplexity = [], []
        while seen_tokens < epochs * epoch_size:
            if max_tokens and seen_tokens >= max_tokens:
                print(f"max_tokens {max_tokens} reached, saving and exiting")
                save()
                validate(epoch)
                return

            # context = torch.LongTensor(next(context_gen)) # TODO GSBATCH
            train_step(context)

            seen_tokens += step_tokens
            step += 1

            epoch_pbar.update(step_tokens)
            epoch_pbar.set_description(f"epoch {1 + epoch}")
            epoch_pbar.set_postfix(loss=f"{loss_meter.mean():.2f}")
            epoch_pbar.refresh()

            if step % save_every == 0:
                save()

            # save a named checkpoint once per `checkpoint_every` epochs, at the epoch boundary
            if seen_tokens % epoch_size == 0 and (epoch + 1) % checkpoint_every == 0:
                save(f"model-{epoch + 1}epochs.pt")

            if is_main and step % log_every == 0:
                train_loss = loss_meter.mean()
                json_log_plots.write_event(run_path, step=seen_tokens, loss=train_loss)
                loss_meter.reset()

                avg_epoch_loss.append(train_loss)
                avg_epoch_perplexity.append(np.exp(train_loss))

                log_writer_train.add_scalar("loss_iter", float(train_loss), seen_tokens)
                log_writer_train.add_scalar("perplexity_iter", float(np.exp(train_loss)), seen_tokens)

            if step % validate_every == 0:
                validate(epoch)

            if seen_tokens % epoch_size == 0:
                pbar.update()
                epoch_pbar.close()
                epoch_pbar = init_epoch_pbar()

                if is_main and avg_epoch_loss:
                    log_writer_train.add_scalar("loss_epoch", sum(avg_epoch_loss) / len(avg_epoch_loss), epoch)
                    log_writer_train.add_scalar(
                        "perplexity_epoch", sum(avg_epoch_perplexity) / len(avg_epoch_perplexity), epoch
                    )
                    avg_epoch_loss.clear()
                    avg_epoch_perplexity.clear()

                epoch += 1

        # end of training
        save()
        validate(epoch)
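
Examples no. 2 and 15 additionally assume TensorBoard writers named log_writer_train and log_writer_valid; a minimal way to create them (the log-directory layout is an assumption):

    from pathlib import Path
    from torch.utils.tensorboard import SummaryWriter

    run_path = Path('runs/example')  # hypothetical run directory
    log_writer_train = SummaryWriter(str(run_path / 'tb' / 'train'))
    log_writer_valid = SummaryWriter(str(run_path / 'tb' / 'valid'))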