Example #1
def train_epoch(loader, model, optimizer, device, tag=''):
    """Trainning model ..."""

    total_loss = Counter()

    model.train()

    with tqdm(total=len(loader.dataset)) as t:
        t.set_description(tag)

        for data in loader:
            images, targets = data
            count = len(images)

            # Transform data to device
            images = images.to(device)
            targets = targets.to(device)

            predicts = model(images)

            # nn.L1Loss is a module; instantiate it before applying it to tensors
            loss = nn.L1Loss()(predicts, targets)

            loss_value = loss.item()
            if not math.isfinite(loss_value):
                print("Loss is {}, stopping training".format(loss_value))
                sys.exit(1)

            # Update loss
            total_loss.update(loss_value, count)

            t.set_postfix(loss='{:.6f}'.format(total_loss.avg))
            t.update(count)

            # Optimizer
            optimizer.zero_grad()
            if os.environ.get("ENABLE_APEX") == "YES":  # .get() avoids a KeyError when unset
                from apex import amp
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optimizer.step()

        return total_loss.avg
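These snippets rely on a few helpers besides the standard imports (torch, torch.nn as nn, tqdm, math, os, sys). In particular, Counter here is a running-average accumulator with an update(value, n) method and an avg attribute, not collections.Counter. A minimal sketch consistent with that usage:

class Counter(object):
    """Running average of a scalar value (e.g. the per-batch loss)."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, value, n=1):
        # value is an average over n samples, so weight it by n
        self.sum += value * n
        self.count += n
        self.avg = self.sum / max(self.count, 1)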
Example #2
def train_epoch(loader, model, optimizer, device, tag=''):
    """Trainning model ..."""

    total_loss = Counter()
    model.train()
    criterion = nn.L1Loss()

    with tqdm(total=len(loader.dataset)) as t:
        t.set_description(tag)

        for data in loader:
            images, targets = data
            count = len(images)

            # Transform data to device
            images = images.to(device)
            targets = targets.to(device)

            predicts = model(images)

            loss = criterion(predicts, targets)
            loss_value = loss.item()

            # Drop Python references early; the autograd graph still holds what backward() needs
            del images, targets, predicts
            torch.cuda.empty_cache()

            if not math.isfinite(loss_value):
                print("Loss is {}, stopping training".format(loss_value))
                sys.exit(1)

            # Update loss
            total_loss.update(loss_value, count)

            t.set_postfix(loss='L1Loss: {:.6f}'.format(total_loss.avg))
            t.update(count)

            # Optimizer
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            del loss
            torch.cuda.empty_cache()

        return total_loss.avg
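For context, a hedged sketch of how such a train_epoch function might be driven; the dummy data, stand-in model and hyperparameters below are illustrative assumptions, not part of the original code:

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dummy data: 64 RGB images of size 32x32, with the images themselves as targets
images = torch.randn(64, 3, 32, 32)
loader = DataLoader(TensorDataset(images, images.clone()), batch_size=16, shuffle=True)

model = nn.Conv2d(3, 3, kernel_size=3, padding=1).to(device)   # stand-in model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for epoch in range(3):
    avg_loss = train_epoch(loader, model, optimizer, device, tag="Epoch {}".format(epoch))
    print("epoch {}: average L1 loss {:.6f}".format(epoch, avg_loss))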
Example #3
def train_epoch(loader, model, optimizer, model_d, device, tag=''):
    """Trainning model ..."""

    total_loss = Counter()

    model.train()

    with tqdm(total=len(loader.dataset)) as t:
        t.set_description(tag)

        for data in loader:
            images, masks = data
            count = len(images)

            # Transform data to device
            images = images.to(device)
            masks = masks.to(device)

            # Keep the original images as ground truth for the generator loss
            GT = images
            new_images, new_masks = image_with_mask(images, masks)
            fake_images = model(new_images, new_masks)

            # model_d computes the generator loss from the masked input (RGB channels),
            # the mask, the generated images and the ground truth
            G_loss = model_d(new_images[:, 0:3, :, :],
                             new_masks, fake_images, GT)
            loss = G_loss.sum()

            loss_value = loss.item()
            if not math.isfinite(loss_value):
                print("Loss is {}, stopping training".format(loss_value))
                sys.exit(1)

            # Update loss
            total_loss.update(loss_value, count)

            t.set_postfix(loss='{:.6f}'.format(total_loss.avg))
            t.update(count)

            # Optimizer
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        return total_loss.avg
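The image_with_mask helper is not shown in the snippet. Given the four-channel slicing new_images[:, 0:3, :, :] above, a common inpainting-style implementation zeroes out the hole region and concatenates the mask as an extra input channel; the following is a hypothetical sketch, not the original helper:

import torch

def image_with_mask(images, masks):
    # Hypothetical: images is (N, 3, H, W), masks is (N, 1, H, W) with 1 inside the hole
    holed = images * (1.0 - masks)              # hide the region to be inpainted
    new_images = torch.cat([holed, masks], 1)   # append the mask as a 4th channel
    return new_images, masks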
Example #4
def distill(args,
            output_model_file,
            processor,
            label_list,
            tokenizer,
            device,
            n_gpu,
            tensorboard_logger,
            eval_data=None):
    assert args.kd_policy is not None
    model = args.kd_policy.student
    args.kd_policy.teacher.eval()
    num_labels = len(args.labels)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    save_best_model = eval_data is not None and args.eval_interval > 0

    train_examples = processor.get_train_examples(args.data_dir)
    num_train_steps = int(
        len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
    optimizer, t_total = get_optimizer(args, model, num_train_steps)

    train_data = prepare(args, processor, label_list, tokenizer, 'train')
    logger.info("***** Running distillation *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    train_steps = 0
    best_eval_accuracy = 0
    for epoch in trange(int(args.num_train_epochs), desc="Epoch", dynamic_ncols=True):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        args.kd_policy.on_epoch_begin(model, None, None)

        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", dynamic_ncols=True)):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            model.train()
            logits = args.kd_policy.forward(input_ids, segment_ids, input_mask)
            loss = CrossEntropyLoss()(logits.view(-1, num_labels), label_ids.view(-1))
            loss = args.kd_policy.before_backward_pass(model, epoch, None, None, loss, None).overall_loss
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()

            train_steps += 1
            tensorboard_logger.add_scalar('distillation_train_loss', loss.item(), train_steps)

            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                # modify the learning rate with the linear warmup schedule BERT uses
                lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            if save_best_model and train_steps % args.eval_interval == 0:
                eval_loss, eval_accuracy, _ = eval(args, model, eval_data, device, verbose=False)
                tensorboard_logger.add_scalar('distillation_dev_loss', eval_loss, train_steps)
                tensorboard_logger.add_scalar('distillation_dev_accuracy', eval_accuracy, train_steps)
                if eval_accuracy > best_eval_accuracy:
                    save_model(model, output_model_file)
                    best_eval_accuracy = eval_accuracy

        args.kd_policy.on_epoch_end(model, None, None)

    if save_best_model:
        eval_loss, eval_accuracy, _ = eval(args, model, eval_data, device, verbose=False)
        if eval_accuracy > best_eval_accuracy:
            save_model(model, output_model_file)
    else:
        save_model(model, output_model_file)

    return global_step, tr_loss / nb_tr_steps
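warmup_linear is provided by the optimization utilities of the older pytorch-pretrained-bert package. If it is not importable, a drop-in definition consistent with the call warmup_linear(global_step / t_total, args.warmup_proportion) is roughly:

def warmup_linear(x, warmup=0.002):
    """Linear warmup followed by linear decay; x is the training progress in [0, 1]."""
    if x < warmup:
        return x / warmup
    return 1.0 - x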