Example 1
def _setup_common_training_handlers(
    trainer: Engine,
    to_save: Optional[Mapping] = None,
    save_every_iters: int = 1000,
    output_path: Optional[str] = None,
    lr_scheduler: Optional[Union[ParamScheduler, _LRScheduler]] = None,
    with_gpu_stats: bool = False,
    output_names: Optional[Iterable[str]] = None,
    with_pbars: bool = True,
    with_pbar_on_iters: bool = True,
    log_every_iters: int = 100,
    stop_on_nan: bool = True,
    clear_cuda_cache: bool = True,
    save_handler: Optional[Union[Callable, BaseSaveHandler]] = None,
    **kwargs: Any,
) -> None:
    if output_path is not None and save_handler is not None:
        raise ValueError(
            "Arguments output_path and save_handler are mutually exclusive. Please, define only one of them"
        )

    if stop_on_nan:
        trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan())

    if lr_scheduler is not None:
        if isinstance(lr_scheduler, torch.optim.lr_scheduler._LRScheduler):
            trainer.add_event_handler(
                Events.ITERATION_COMPLETED,
                lambda engine: cast(_LRScheduler, lr_scheduler).step())
        elif isinstance(lr_scheduler, LRScheduler):
            trainer.add_event_handler(Events.ITERATION_COMPLETED, lr_scheduler)
        else:
            trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler)

    if torch.cuda.is_available() and clear_cuda_cache:
        trainer.add_event_handler(Events.EPOCH_COMPLETED, empty_cuda_cache)

    if to_save is not None:

        if output_path is None and save_handler is None:
            raise ValueError(
                "If to_save argument is provided then output_path or save_handler arguments should be also defined"
            )
        if output_path is not None:
            save_handler = DiskSaver(dirname=output_path, require_empty=False)

        checkpoint_handler = Checkpoint(to_save,
                                        cast(Union[Callable, BaseSaveHandler],
                                             save_handler),
                                        filename_prefix="training",
                                        **kwargs)
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED(every=save_every_iters),
            checkpoint_handler)

    if with_gpu_stats:
        GpuInfo().attach(
            trainer,
            name="gpu",
            event_name=Events.ITERATION_COMPLETED(
                every=log_every_iters)  # type: ignore[arg-type]
        )

    if output_names is not None:

        def output_transform(x: Any, index: int, name: str) -> Any:
            if isinstance(x, Mapping):
                return x[name]
            elif isinstance(x, Sequence):
                return x[index]
            elif isinstance(x, (torch.Tensor, numbers.Number)):
                return x
            else:
                raise TypeError(
                    "Unhandled type of update_function's output. "
                    f"It should be either a mapping or a sequence, but {type(x)} was given"
                )

        for i, n in enumerate(output_names):
            RunningAverage(output_transform=partial(output_transform,
                                                    index=i,
                                                    name=n),
                           epoch_bound=False).attach(trainer, n)

    if with_pbars:
        if with_pbar_on_iters:
            ProgressBar(persist=False).attach(
                trainer,
                metric_names="all",
                event_name=Events.ITERATION_COMPLETED(every=log_every_iters))

        ProgressBar(persist=True,
                    bar_format="").attach(trainer,
                                          event_name=Events.EPOCH_STARTED,
                                          closing_event_name=Events.COMPLETED)
Example 2
    def _test(save_history):
        tensor = torch.ones([1], requires_grad=True)
        optimizer = torch.optim.SGD([tensor], lr=0.001)

        max_epochs = 25
        lr_max_value = 0.4
        num_iterations_per_epoch = 128
        num_iterations = max_epochs * num_iterations_per_epoch
        warmup_duration = 5 * num_iterations_per_epoch
        cooldown_duration = 5 * num_iterations_per_epoch

        scheduler_1 = LinearCyclicalScheduler(
            optimizer,
            "lr",
            start_value=lr_max_value,
            end_value=lr_max_value * 0.9,
            cycle_size=(num_iterations - warmup_duration - cooldown_duration) * 2,
        )

        scheduler_2 = LinearCyclicalScheduler(
            optimizer, "lr", start_value=lr_max_value, end_value=0.0, cycle_size=cooldown_duration * 2
        )

        lr_scheduler = ConcatScheduler(
            schedulers=[scheduler_1, scheduler_2],
            durations=[num_iterations - warmup_duration - cooldown_duration],
            save_history=False,
        )
        lr_values = [None] * num_iterations
        scheduler = create_lr_scheduler_with_warmup(
            lr_scheduler,
            warmup_start_value=0.0,
            warmup_end_value=lr_max_value,
            warmup_duration=warmup_duration,
            save_history=save_history,
            output_simulated_values=lr_values,
        )
        state_dict = scheduler.state_dict()

        trainer = Engine(lambda engine, batch: None)

        @trainer.on(Events.ITERATION_COMPLETED)
        def save_lr(engine):
            lrs.append(optimizer.param_groups[0]["lr"])

        trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

        data = [0] * num_iterations_per_epoch

        for _ in range(2):
            lrs = []
            trainer.run(data, max_epochs=max_epochs)

            assert lrs == pytest.approx([v for i, v in lr_values])

            if save_history:
                param_history = trainer.state.param_history["lr"]
                assert lrs == pytest.approx([v[0] for v in param_history])

                trainer.state.param_history = None

            scheduler.load_state_dict(state_dict)
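
A hedged follow-up sketch: the output_simulated_values list filled above holds (iteration index, lr) pairs, which makes it easy to visualize the whole warmup plus cyclical schedule before training. matplotlib is an assumption of this sketch and is not used by the test.

import matplotlib.pyplot as plt

def plot_simulated_lr(lr_values, filename="lr_schedule.png"):
    # lr_values: list of (event_index, lr) pairs, e.g. the list passed to
    # create_lr_scheduler_with_warmup(..., output_simulated_values=lr_values)
    steps, values = zip(*lr_values)
    plt.plot(steps, values)
    plt.xlabel("iteration")
    plt.ylabel("learning rate")
    plt.savefig(filename)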
Example 3
def test_concat_scheduler_3_schedulers():
    tensor = torch.zeros([1], requires_grad=True)
    optimizer = torch.optim.SGD([tensor], lr=0)

    scheduler_1 = LinearCyclicalScheduler(optimizer, "lr", start_value=1.0, end_value=0.5, cycle_size=20)
    scheduler_2 = LinearCyclicalScheduler(optimizer, "lr", start_value=0.5, end_value=0.45, cycle_size=10)
    scheduler_3 = LinearCyclicalScheduler(optimizer, "lr", start_value=0.5, end_value=0.0, cycle_size=20)
    durations = [10, 5]

    concat_scheduler = ConcatScheduler(
        schedulers=[scheduler_1, scheduler_2, scheduler_3], durations=durations, save_history=True
    )
    state_dict = concat_scheduler.state_dict()

    data = [0] * 10
    max_epochs = 2
    simulated_values = ConcatScheduler.simulate_values(
        num_events=len(data) * max_epochs, schedulers=[scheduler_1, scheduler_2, scheduler_3], durations=durations
    )

    def save_lr(engine):
        lrs.append(optimizer.param_groups[0]["lr"])

    trainer = Engine(lambda engine, batch: None)
    trainer.add_event_handler(Events.ITERATION_STARTED, concat_scheduler)
    trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr)

    for _ in range(2):
        lrs = []
        trainer.run(data, max_epochs=max_epochs)

        assert lrs == list(
            map(
                pytest.approx,
                [
                    # Cycle 1 of the first LinearCyclicalScheduler
                    1.0,
                    0.95,
                    0.9,
                    0.85,
                    0.8,
                    0.75,
                    0.7,
                    0.65,
                    0.6,
                    0.55,
                    # Cycle 1 of the second LinearCyclicalScheduler
                    0.5,
                    0.49,
                    0.48,
                    0.47,
                    0.46,
                    # Cycle 1 of the third LinearCyclicalScheduler
                    0.5,
                    0.45,
                    0.4,
                    0.35,
                    0.3,
                ],
            )
        )

        state_lrs = trainer.state.param_history["lr"]
        assert len(state_lrs) == len(lrs)
        # Unpack singleton lists
        assert [group[0] for group in state_lrs] == lrs

        assert lrs == pytest.approx([v for i, v in simulated_values])
        concat_scheduler.load_state_dict(state_dict)

        trainer.state.param_history = None
Example 4
def train(args):
    device = torch.device("cuda" if args.cuda else "cpu")

    train_loader = check_dataset(args)
    transformer = TransformerNet().to(device)
    optimizer = Adam(transformer.parameters(), args.lr)
    mse_loss = torch.nn.MSELoss()

    vgg = Vgg16(requires_grad=False).to(device)
    style_transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Lambda(lambda x: x.mul(255))])

    style = utils.load_image(args.style_image, size=args.style_size)
    style = style_transform(style)
    style = style.repeat(args.batch_size, 1, 1, 1).to(device)

    features_style = vgg(utils.normalize_batch(style))
    gram_style = [utils.gram_matrix(y) for y in features_style]

    running_avgs = OrderedDict()

    def step(engine, batch):

        x, _ = batch
        x = x.to(device)

        n_batch = len(x)

        optimizer.zero_grad()

        y = transformer(x)

        x = utils.normalize_batch(x)
        y = utils.normalize_batch(y)

        features_x = vgg(x)
        features_y = vgg(y)

        content_loss = args.content_weight * mse_loss(features_y.relu2_2,
                                                      features_x.relu2_2)

        style_loss = 0.0
        for ft_y, gm_s in zip(features_y, gram_style):
            gm_y = utils.gram_matrix(ft_y)
            style_loss += mse_loss(gm_y, gm_s[:n_batch, :, :])
        style_loss *= args.style_weight

        total_loss = content_loss + style_loss
        total_loss.backward()
        optimizer.step()

        return {
            "content_loss": content_loss.item(),
            "style_loss": style_loss.item(),
            "total_loss": total_loss.item(),
        }

    trainer = Engine(step)
    checkpoint_handler = ModelCheckpoint(
        args.checkpoint_model_dir,
        "checkpoint",
        n_saved=10,
        require_empty=False,
        create_dir=True,
    )
    progress_bar = Progbar(loader=train_loader, metrics=running_avgs)

    trainer.add_event_handler(
        event_name=Events.EPOCH_COMPLETED(every=args.checkpoint_interval),
        handler=checkpoint_handler,
        to_save={"net": transformer},
    )
    trainer.add_event_handler(event_name=Events.ITERATION_COMPLETED,
                              handler=progress_bar)
    trainer.run(train_loader, max_epochs=args.epochs)
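
For context, a hedged sketch of what a gram_matrix helper like utils.gram_matrix typically computes for the style loss above: the batched Gram matrix of VGG feature maps, normalized by their size. The real utils module used by this example may differ in details.

import torch

def gram_matrix(y: torch.Tensor) -> torch.Tensor:
    # y: feature maps of shape (batch, channels, height, width)
    b, ch, h, w = y.size()
    features = y.view(b, ch, h * w)
    gram = features.bmm(features.transpose(1, 2)) / (ch * h * w)
    return gram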
Example 5
class Señalizador(nn.Module):
    def __init__(self, train_data, validation_data, test_data):
        super().__init__()
        self.conv1 = torch.nn.Conv2d(kernel_size=7,
                                     in_channels=3,
                                     out_channels=18,
                                     bias=True)
        self.conv2 = torch.nn.Conv2d(kernel_size=5,
                                     in_channels=18,
                                     out_channels=18,
                                     bias=True)
        self.conv3 = torch.nn.Conv2d(kernel_size=3,
                                     in_channels=18,
                                     out_channels=36)
        self.conv4 = torch.nn.Conv2d(kernel_size=3,
                                     in_channels=36,
                                     out_channels=64)
        self.conv5 = torch.nn.Conv2d(kernel_size=3,
                                     in_channels=64,
                                     out_channels=64,
                                     bias=True)
        self.conv6 = torch.nn.Conv2d(kernel_size=3,
                                     in_channels=64,
                                     out_channels=128,
                                     bias=True)
        self.conv7 = torch.nn.Conv2d(kernel_size=3,
                                     in_channels=128,
                                     out_channels=254,
                                     padding=1,
                                     bias=True)
        self.mpool = torch.nn.MaxPool2d(kernel_size=2)
        self.activation = torch.nn.ReLU()
        self.linear1 = torch.nn.Linear(in_features=254, out_features=128)
        self.linear2 = torch.nn.Linear(in_features=128, out_features=16)
        self.linear3 = torch.nn.Linear(in_features=16, out_features=4)
        self.dropout = torch.nn.Dropout(p=0.2)

        self.train_data = train_data
        self.validation_data = validation_data
        self.test_data = test_data

        self.device = torch.device('cuda:0')

        self.optimizer = torch.optim.Adam(self.parameters(), lr=5e-3)
        self.criterion = torch.nn.CrossEntropyLoss(reduction='sum')
        self = self.to(self.device)

        self.loaders()

    def forward(self, x):
        x = self.mpool(self.activation(self.conv1(x)))
        x = self.mpool(self.activation(self.conv2(x)))
        x = self.activation(self.conv3(x))
        x = self.mpool(self.activation(self.conv4(x)))
        x = self.mpool(self.activation(self.conv5(x)))
        x = self.mpool(self.activation(self.conv6(x)))
        x = self.mpool(self.activation(self.conv7(x)))
        x = x.view(-1, self.linear1.in_features)
        x = self.activation(self.linear1(x))
        x = self.activation(self.linear2(x))
        x = self.linear3(x)
        return x

    def loaders(self):
        self.train_loader = DataLoader(self.train_data,
                                       shuffle=True,
                                       batch_size=32)
        self.valid_loader = DataLoader(self.validation_data,
                                       shuffle=False,
                                       batch_size=256)
        self.test_loader = DataLoader(self.test_data,
                                      shuffle=False,
                                      batch_size=512)
        print("Loaders initialized")

    def load_checkpoint(self, dir):
        self.load_state_dict(torch.load(dir))

    def train_one_step(self, engine, batch):
        self.optimizer.zero_grad()
        x, y = batch
        x, y = x.to(self.device), y.to(self.device)
        yhat = self.forward(x)
        loss = self.criterion(yhat, y)
        loss.backward()
        self.optimizer.step()
        del x
        del y
        torch.cuda.empty_cache()
        return loss.item()  # This output can later be accessed as trainer.state.output

    def evaluate_one_step(self, engine, batch):
        with torch.no_grad():
            x, y = batch
            x, y = x.to(self.device), y.to(self.device)
            yhat = self.forward(x)
            del x
            loss = self.criterion(yhat, y)
            torch.cuda.empty_cache()
            return yhat, y

    def train_epochs(self, max_epochs):
        self.trainer = Engine(self.train_one_step)
        self.evaluator = Engine(self.evaluate_one_step)
        self.metrics = {'Loss': Loss(self.criterion), 'Acc': Accuracy()}
        for name, metric in self.metrics.items():
            metric.attach(self.evaluator, name)

        with SummaryWriter(
                log_dir="/tmp/tensorboard/Transform" +
                str(type(self))[17:len(str(type(self))) - 2]) as writer:

            @self.trainer.on(Events.EPOCH_COMPLETED(every=1))  # Every epoch
            def log_results(engine):
                # Evaluate on the training set
                self.eval()
                self.evaluator.run(self.train_loader)
                writer.add_scalar("train/loss",
                                  self.evaluator.state.metrics['Loss'],
                                  engine.state.epoch)
                writer.add_scalar("train/accy",
                                  self.evaluator.state.metrics['Acc'],
                                  engine.state.epoch)

                # Evaluate on the validation set
                self.evaluator.run(self.valid_loader)
                writer.add_scalar("valid/loss",
                                  self.evaluator.state.metrics['Loss'],
                                  engine.state.epoch)
                writer.add_scalar("valid/accy",
                                  self.evaluator.state.metrics['Acc'],
                                  engine.state.epoch)
                self.train()

            # Save the best model found on validation
            best_model_handler = ModelCheckpoint(
                dirname='.',
                require_empty=False,
                filename_prefix="best",
                n_saved=1,
                score_function=lambda engine: -engine.state.metrics['Loss'],
                score_name="val_loss")
            # The following runs every time the validation loop finishes
            self.evaluator.add_event_handler(
                Events.COMPLETED, best_model_handler, {
                    f'Transform{str(type(self))[17:len(str(type(self)))-2]}':
                    self
                })

            self.trainer.run(self.train_loader, max_epochs=max_epochs)

    def test(self, confussion, report):
        self.eval()
        test_targets = np.array(self.test_data.targets)
        prediction_test = []
        for mbdata, label in self.test_loader:
            mbdata = mbdata.to(self.device)
            logits = self.forward(mbdata).to("cpu")
            prediction_test.append(logits.argmax(dim=1).detach().numpy())
            del mbdata
            del logits
            torch.cuda.empty_cache()
        prediction_test = np.concatenate(prediction_test)
        cm = confusion_matrix(test_targets, prediction_test)
        if confussion:
            display(cm)
        if report:
            print(classification_report(test_targets, prediction_test))
        self.train()
        return cm
Example 6
def test_piecewiselinear(milestones_as_np_int):

    tensor = torch.zeros([1], requires_grad=True)
    optimizer = torch.optim.SGD([tensor], lr=0)

    milestones_values = [(5, 0.5), (15, 1.0), (25, 0.0), (35, 1.0), (40, 0.5)]
    if milestones_as_np_int:
        milestones_values = [(np.int64(t), v) for t, v in milestones_values]

    scheduler = PiecewiseLinear(optimizer, "lr", milestones_values=milestones_values)
    state_dict = scheduler.state_dict()

    def save_lr(engine):
        lrs.append(optimizer.param_groups[0]["lr"])

    trainer = Engine(lambda engine, batch: None)
    trainer.add_event_handler(Events.ITERATION_COMPLETED, scheduler)
    trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr)

    for _ in range(2):
        lrs = []
        trainer.run([0] * 25, max_epochs=2)

        assert lrs == list(
            map(
                pytest.approx,
                [
                    0.5,
                    0.5,
                    0.5,
                    0.5,
                    0.5,
                    0.5,
                    0.55,
                    0.6,
                    0.65,
                    0.7,
                    0.75,
                    0.8,
                    0.85,
                    0.9,
                    0.95,
                    1.0,
                    0.9,
                    0.8,
                    0.7,
                    0.6,
                    0.5,
                    0.4,
                    0.3,
                    0.2,
                    0.1,
                    0.0,
                    0.1,
                    0.2,
                    0.3,
                    0.4,
                    0.5,
                    0.6,
                    0.7,
                    0.8,
                    0.9,
                    1.0,
                    0.9,
                    0.8,
                    0.7,
                    0.6,
                    0.5,
                    0.5,
                    0.5,
                    0.5,
                    0.5,
                    0.5,
                    0.5,
                    0.5,
                    0.5,
                    0.5,
                ],
            )
        )
        scheduler.load_state_dict(state_dict)
Example 7
    def attach(self, engine: Engine) -> None:
        """
        Args:
            engine: Ignite Engine, it can be a trainer, validator or evaluator.
        """
        engine.add_event_handler(IterationEvents.MODEL_COMPLETED, self)
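
A hedged sketch of how custom iteration events such as IterationEvents.MODEL_COMPLETED are typically declared and fired with ignite (ignite >= 0.4 assumed); the real IterationEvents enum this handler relies on may define more members.

from ignite.engine import Engine, EventEnum

class IterationEvents(EventEnum):
    MODEL_COMPLETED = "model_completed"

def step(engine, batch):
    # ... forward pass would go here ...
    engine.fire_event(IterationEvents.MODEL_COMPLETED)
    return batch

engine = Engine(step)
engine.register_events(*IterationEvents)
engine.add_event_handler(IterationEvents.MODEL_COMPLETED,
                         lambda e: print("model step done"))
engine.run([0, 1], max_epochs=1)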
Example 8
    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
    if engine.state.iteration % args.gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
    return loss.item()


trainer = Engine(update)

# Add progressbar with loss
RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
ProgressBar(persist=True).attach(trainer, metric_names=['loss'])

# Learning rate schedule: linearly warm-up to lr and then decrease the learning rate to zero with cosine
cos_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0,
                                         len(dataloader) * args.n_epochs)
scheduler = create_lr_scheduler_with_warmup(cos_scheduler, 0.0, args.lr,
                                            args.n_warmup)
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

# Save checkpoints and training config
checkpoint_handler = ModelCheckpoint(args.log_dir,
                                     'checkpoint',
                                     save_interval=1,
                                     n_saved=5)
trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                          {'mymodel': model})
torch.save(args, os.path.join(args.log_dir, 'training_args.bin'))

trainer.run(dataloader, max_epochs=args.n_epochs)
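
A hedged note on the checkpoint setup above: save_interval was deprecated and later removed from ignite's ModelCheckpoint; on recent releases an equivalent wiring filters the event instead (reusing args, trainer and model from the snippet above).

checkpoint_handler = ModelCheckpoint(args.log_dir, 'checkpoint',
                                     n_saved=5, require_empty=False)
trainer.add_event_handler(Events.EPOCH_COMPLETED(every=1),
                          checkpoint_handler, {'mymodel': model})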
Example 9
    def train(self):
        train_input_ids, train_token_type_ids, \
        train_attention_mask, train_label_ids = self.get_X_y_ids(self.train_path)

        dev_input_ids, dev_token_type_ids, \
        dev_attention_mask, dev_label_ids = self.get_X_y_ids(self.dev_path)

        train_ds = TensorDataset(train_input_ids, train_token_type_ids,
                                 train_attention_mask, train_label_ids)
        dev_ds = TensorDataset(dev_input_ids, dev_token_type_ids,
                               dev_attention_mask, dev_label_ids)

        batch_size = self.n_gpu * self.per_gpu_batch_size
        train_iter = DataLoader(train_ds,
                                batch_size=batch_size,
                                shuffle=True,
                                drop_last=True)
        dev_iter = DataLoader(dev_ds,
                              batch_size=batch_size,
                              shuffle=True,
                              drop_last=True)

        model = CLS_Model(vocab_size=self.bert_tokenizer.vocab_size,
                          embed_size=self.embed_size,
                          num_labels=len(self.label_list),
                          dense_layer_type=self.dense_layer_type,
                          dropout=self.dropout,
                          embed_type=self.embed_type,
                          max_len=self.max_seq_len,
                          model_name_or_path=self.model_name_or_path,
                          vector_file=self.vector_file)

        model.to(self.device)

        if self.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        logger.info("model.named_parameters()")
        for n, p in model.named_parameters():
            logger.info(n)

        parameters = [{
            "params": [p for n, p in model.named_parameters() if "bert" in n],
            "lr":
            self.bert_lr
        }, {
            "params":
            [p for n, p in model.named_parameters() if "bert" not in n],
            "lr":
            self.normal_lr
        }]

        optimizer = torch.optim.AdamW(parameters, lr=self.normal_lr)

        tb_writer = SummaryWriter()

        def train_fn(engine, batch):
            model.train()
            optimizer.zero_grad()
            batch = tuple(t.to(self.device) for t in batch)
            labels = batch[3]

            inputs = {
                "input_ids": batch[0],
                "token_type_ids": batch[1],
                "attention_mask": batch[2],
                "label_ids": labels
            }

            loss, sequence_tags = model(**inputs)

            score = f1_score(labels.detach().cpu().numpy(),
                             y_pred=sequence_tags.detach().cpu().numpy(),
                             average="macro")

            if self.n_gpu > 1:
                loss = loss.mean()

            ## tensorboard
            global_step = global_step_from_engine(engine)(
                engine, engine.last_event_name)
            # tb_writer.add_scalar('learning_rate', scheduler.get_lr()[0], global_step)
            tb_writer.add_scalar('train_loss', loss.item(), global_step)
            tb_writer.add_scalar('train_score', score, global_step)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 3.0)
            optimizer.step()
            return loss.item(), score

        trainer = Engine(train_fn)
        RunningAverage(output_transform=lambda x: x[0]).attach(trainer, 'loss')
        RunningAverage(output_transform=lambda x: x[1]).attach(
            trainer, 'score')

        def dev_fn(engine, batch):
            model.eval()
            optimizer.zero_grad()
            with torch.no_grad():
                batch = tuple(t.to(self.device) for t in batch)
                labels = batch[3]

                inputs = {
                    "input_ids": batch[0],
                    "token_type_ids": batch[1],
                    "attention_mask": batch[2],
                    "label_ids": labels
                }

                loss, sequence_tags = model(**inputs)

            score = f1_score(labels.detach().cpu().numpy(),
                             y_pred=sequence_tags.detach().cpu().numpy(),
                             average="macro")

            if self.n_gpu > 1:
                loss = loss.mean()

            ## tensorboard
            global_step = global_step_from_engine(engine)(
                engine, engine.last_event_name)
            # tb_writer.add_scalar('learning_rate', scheduler.get_lr()[0], global_step)
            tb_writer.add_scalar('dev_loss', loss.item(), global_step)
            tb_writer.add_scalar('dev_score', score, global_step)

            return loss.item(), score

        dev_evaluator = Engine(dev_fn)
        RunningAverage(output_transform=lambda x: x[0]).attach(
            dev_evaluator, 'loss')
        RunningAverage(output_transform=lambda x: x[1]).attach(
            dev_evaluator, 'score')

        pbar = ProgressBar(persist=True, bar_format="")
        pbar.attach(trainer, ['loss', 'score'])
        pbar.attach(dev_evaluator, ['loss', 'score'])

        def score_fn(engine):
            loss = engine.state.metrics['loss']
            score = engine.state.metrics['score']
            '''
            if score < 0.5:
                logger.info("Too low to learn!")
                trainer.terminate()
            '''

            return score / (loss + 1e-12)

        handler = EarlyStopping(patience=self.patience,
                                score_function=score_fn,
                                trainer=trainer)
        dev_evaluator.add_event_handler(Events.COMPLETED, handler)

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_dev_results(engine):
            dev_evaluator.run(dev_iter)
            dev_metrics = dev_evaluator.state.metrics
            avg_score = dev_metrics['score']
            avg_loss = dev_metrics['loss']
            logger.info(
                "Validation Results - Epoch: {}  Avg score: {:.2f} Avg loss: {:.2f}"
                .format(engine.state.epoch, avg_score, avg_loss))

        def model_score(engine):
            score = engine.state.metrics['score']
            return score

        checkpointer = ModelCheckpoint(
            self.output_dir,
            "cmed_qq",
            n_saved=self.n_saved,
            create_dir=True,
            score_name="model_score",
            score_function=model_score,
            global_step_transform=global_step_from_engine(trainer),
            require_empty=False)

        dev_evaluator.add_event_handler(Events.COMPLETED, checkpointer, {
            self.model_name:
            model.module if hasattr(model, 'module') else model
        })

        # Clear cuda cache between training/testing
        def empty_cuda_cache(engine):
            torch.cuda.empty_cache()
            import gc
            gc.collect()

        trainer.add_event_handler(Events.EPOCH_COMPLETED, empty_cuda_cache)
        dev_evaluator.add_event_handler(Events.COMPLETED, empty_cuda_cache)

        trainer.run(train_iter, max_epochs=self.max_epochs)
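
A hedged alternative to the hand-rolled tb_writer.add_scalar calls above: ignite's TensorboardLogger can log the running-average metrics already attached to trainer and dev_evaluator. Import paths assume ignite 0.4.x.

from ignite.contrib.handlers.tensorboard_logger import OutputHandler, TensorboardLogger
from ignite.handlers import global_step_from_engine

tb_logger = TensorboardLogger(log_dir="runs/cls")
tb_logger.attach(
    trainer,
    log_handler=OutputHandler(tag="train", metric_names=["loss", "score"]),
    event_name=Events.ITERATION_COMPLETED,
)
tb_logger.attach(
    dev_evaluator,
    log_handler=OutputHandler(
        tag="dev",
        metric_names=["loss", "score"],
        global_step_transform=global_step_from_engine(trainer),
    ),
    event_name=Events.COMPLETED,
)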
Example 10
def test_linear_scheduler():

    with pytest.raises(ValueError):
        LinearCyclicalScheduler({}, 'lr', 1, 0, cycle_size=0)

    with pytest.raises(ValueError):
        LinearCyclicalScheduler({}, 'lr', 1, 0, cycle_size=1)

    tensor = torch.zeros([1], requires_grad=True)
    optimizer = torch.optim.SGD([tensor], lr=0)

    scheduler = LinearCyclicalScheduler(optimizer, 'lr', 1, 0, 10)
    lrs = []

    def save_lr(engine):
        lrs.append(optimizer.param_groups[0]['lr'])

    trainer = Engine(lambda engine, batch: None)
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
    trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr)
    trainer.run([0] * 10, max_epochs=2)

    assert lrs == list(
        map(
            pytest.approx,
            [
                # Cycle 1
                1.0,
                0.8,
                0.6,
                0.4,
                0.2,
                0.0,
                0.2,
                0.4,
                0.6,
                0.8,
                # Cycle 2
                1.0,
                0.8,
                0.6,
                0.4,
                0.2,
                0.0,
                0.2,
                0.4,
                0.6,
                0.8,
            ]))

    optimizer = torch.optim.SGD([tensor], lr=0)
    scheduler = LinearCyclicalScheduler(optimizer,
                                        'lr',
                                        1,
                                        0,
                                        10,
                                        cycle_mult=2)

    trainer = Engine(lambda engine, batch: None)
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
    trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr)

    lrs = []
    trainer.run([0] * 10, max_epochs=3)

    assert lrs == list(
        map(
            pytest.approx,
            [
                # Cycle 1
                1.0,
                0.8,
                0.6,
                0.4,
                0.2,
                0.0,
                0.2,
                0.4,
                0.6,
                0.8,
                # Cycle 2
                1.0,
                0.9,
                0.8,
                0.7,
                0.6,
                0.5,
                0.4,
                0.3,
                0.2,
                0.1,
                0.0,
                0.1,
                0.2,
                0.3,
                0.4,
                0.5,
                0.6,
                0.7,
                0.8,
                0.9,
            ]))

    # With float cycle_size
    optimizer = torch.optim.SGD([tensor], lr=0)
    scheduler = LinearCyclicalScheduler(optimizer,
                                        'lr',
                                        start_value=1.2,
                                        end_value=0.2,
                                        cycle_size=10.00000012,
                                        cycle_mult=1.0)

    trainer = Engine(lambda engine, batch: None)
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
    trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr)

    lrs = []
    trainer.run([0] * 10, max_epochs=2)
    assert lrs == list(
        map(
            pytest.approx,
            [
                # Cycle 1
                1.2,
                1.0,
                0.8,
                0.6,
                0.4,
                0.2,
                0.4,
                0.6,
                0.8,
                1.0,
                # Cycle 2
                1.2,
                1.0,
                0.8,
                0.6,
                0.4,
                0.2,
                0.4,
                0.6,
                0.8,
                1.0,
            ]))
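
A hedged sketch: the triangular profile asserted above can also be previewed without an Engine via the simulate_values classmethod, which returns [event_index, value] pairs (signature as documented for ignite's ParamScheduler).

values = LinearCyclicalScheduler.simulate_values(
    num_events=20, param_name="lr", start_value=1.0, end_value=0.0, cycle_size=10
)
print([v for _, v in values])  # 1.0, 0.8, ..., matching the lrs asserted above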
Example 11
def test_concat_scheduler_two_linear():
    tensor = torch.zeros([1], requires_grad=True)
    optimizer = torch.optim.SGD([tensor], lr=0)

    scheduler_1 = LinearCyclicalScheduler(optimizer,
                                          "lr",
                                          start_value=0.0,
                                          end_value=0.1,
                                          cycle_size=2)
    scheduler_2 = LinearCyclicalScheduler(optimizer,
                                          "lr",
                                          start_value=0.2,
                                          end_value=1.0,
                                          cycle_size=2)

    durations = [
        5,
    ]
    concat_scheduler = ConcatScheduler(schedulers=[scheduler_1, scheduler_2],
                                       durations=durations,
                                       save_history=True)

    assert concat_scheduler.get_param() == 0.0

    data = [0] * 10
    max_epochs = 2
    simulated_values = ConcatScheduler.simulate_values(
        num_events=len(data) * max_epochs,
        schedulers=[scheduler_1, scheduler_2],
        durations=durations)

    lrs = []

    def save_lr(engine):
        lrs.append(optimizer.param_groups[0]['lr'])

    trainer = Engine(lambda engine, batch: None)
    trainer.add_event_handler(Events.ITERATION_STARTED, concat_scheduler)
    trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr)
    trainer.run(data, max_epochs=max_epochs)

    assert lrs == list(
        map(
            pytest.approx,
            [
                # first LinearCyclicalScheduler
                0.0,
                0.1,
                0.0,
                0.1,
                0.0,
                # second LinearCyclicalScheduler
                0.2,
                1.0,
                0.2,
                1.0,
                0.2,
                1.0,
                0.2,
                1.0,
                0.2,
                1.0,
                0.2,
                1.0,
                0.2,
                1.0,
                0.2,
            ]))

    state_lrs = trainer.state.param_history['lr']
    assert len(state_lrs) == len(lrs)
    # Unpack singleton lists
    assert [group[0] for group in state_lrs] == lrs

    assert lrs == pytest.approx([v for i, v in simulated_values])
Example 12
class TrajPredEngine:
    def __init__(self, net, optim, train_loader, val_loader, args):
        self.net = net
        self.args = args
        self.pretrainEpochs = args["pretrainEpochs"]
        self.trainEpochs = args["trainEpochs"]
        self.optim = optim
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.cuda = args['cuda']
        self.device = args['device']
        self.dsId = self.args['dsId']
        self.n_iterations = max(len(train_loader),
                                len(train_loader) / args["batch_size"])

        ## training metrics to keep track of, consider making a metrics class
        # remember to 0 these out
        self.avg_trn_loss = 0

        self.metrics = {"Avg train loss": 0, "Avg val loss": 0}
        ## validation metrics
        self.avg_val_loss = 0
        self.val_batch_count = 1

        # only if using maneuvers
        self.avg_lat_acc = 0
        self.avg_lon_acc = 0

        self.trainer = None
        self.evaluator = None

        self.makeTrainer()

        self.save_name = args['name']

        # testing stuff wow need 2 clean this so bad

        self.lossVals = torch.zeros(self.args['out_length']).cuda(
            self.device) if self.cuda else torch.zeros(self.args['out_length'])
        self.counts = torch.zeros(self.args['out_length']).cuda(
            self.device) if self.cuda else torch.zeros(self.args['out_length'])
        self.lastTestLoss = 0

        self.writer = None
        self.log_dir = args['log_dir']
        self.tensorboard = args['tensorboard']

    def netPred(self, batch):
        raise NotImplementedError

    def saveModel(self, engine):

        os.makedirs(self.args['modelLoc'], exist_ok=True)
        name = os.path.join(self.args['modelLoc'], self.args['name'])
        torch.save(self.net.state_dict(), name)
        print("Model saved {}.".format(name))

    def train_a_batch(self, engine, batch):

        self.net.train_flag = True
        epoch = engine.state.epoch

        _, _, _, _, _, _, _, fut, op_mask = batch

        fut_pred = self.netPred(batch)

        if self.cuda:
            fut = fut.cuda(self.device)
            op_mask = op_mask.cuda(self.device)

        if epoch < self.pretrainEpochs:
            if self.args["pretrain_loss"] == 'MSE':
                l = maskedMSE(fut_pred, fut, op_mask, device=self.device)
            elif self.args['pretrain_loss'] == 'NLL':
                l = maskedNLL(fut_pred, fut, op_mask, device=self.device)
            else:
                l = maskedMSE(fut_pred, fut, op_mask, device=self.device)
        else:
            if self.args["train_loss"] == 'MSE':
                l = maskedMSE(fut_pred, fut, op_mask, device=self.device)
            elif self.args['train_loss'] == 'NLL':
                l = maskedNLL(fut_pred, fut, op_mask, device=self.device)
            else:
                l = maskedNLL(fut_pred, fut, op_mask, device=self.device)

        # if self.args['nll_only']:
        #     l = maskedNLL(fut_pred, fut, op_mask)
        # else:
        #     if epoch < self.pretrainEpochs:
        #         l = maskedMSE(fut_pred, fut, op_mask)
        #     else:
        #         l = maskedNLL(fut_pred, fut, op_mask)

        # Backprop and update weights
#        if l.item() != l.item():
#            print(l.item())
#            exit(1)
#            return 1
        self.optim.zero_grad()
        l.backward()
        self.optim.step()

        # Track average train loss:
        self.avg_trn_loss += l.item()
        self.metrics["Avg train loss"] += l.item() / 100.0

        if self.writer:
            self.writer.add_scalar(
                "{}epoch/trainingloss".format(engine.state.epoch), l.item(),
                engine.state.iteration)

        return l.item()

    def eval_a_batch(self, engine, batch):
        self.net.train_flag = False

        epoch = engine.state.epoch

        _, _, _, _, _, _, _, fut, op_mask = batch
        fut_pred = self.netPred(batch)
        if self.cuda:
            fut = fut.cuda(self.device)
            op_mask = op_mask.cuda(self.device)

        # Forward pass

        if epoch < self.pretrainEpochs:
            if self.args["pretrain_loss"] == 'MSE':
                l = maskedMSE(fut_pred, fut, op_mask, device=self.device)
            elif self.args['pretrain_loss'] == 'NLL':
                l = maskedNLL(fut_pred, fut, op_mask, device=self.device)
            else:
                l = maskedMSE(fut_pred, fut, op_mask, device=self.device)
        else:
            if self.args["train_loss"] == 'MSE':
                l = maskedMSE(fut_pred, fut, op_mask, device=self.device)
            elif self.args['train_loss'] == 'NLL':
                l = maskedNLL(fut_pred, fut, op_mask, device=self.device)
            else:
                l = maskedNLL(fut_pred, fut, op_mask, device=self.device)

        # if self.args['nll_only']:
        #     l = maskedNLL(fut_pred, fut, op_mask)
        # else:
        #     if epoch_num < pretrainEpochs:
        #         l = maskedMSE(fut_pred, fut, op_mask)
        #     else:
        #         l = maskedNLL(fut_pred, fut, op_mask)

        self.avg_val_loss += l.item()
        self.metrics["Avg val loss"] += l.item() / (self.val_batch_count *
                                                    100.0)
        self.val_batch_count += 1

        return fut_pred, fut

    def validate(self, engine):
        self.evaluator.run(self.val_loader)
        max_epochs = self.args["pretrainEpochs"] + self.args["trainEpochs"]

        # if not self.eval_only:
        print("{}/{} Epochs in dataset{}".format(engine.state.epoch,
                                                 max_epochs, self.dsId))
        # print(max((engine.state.epoch / max_epochs) * 100,1))
        print("EPOCH {}: Train loss: {}  Val loss: {}\n".format(
            engine.state.epoch, self.metrics["Avg train loss"],
            self.metrics["Avg val loss"]))
        # else:
        #     print("EPOCH {}: Test loss: {}\n".format(engine.state.epoch, self.metrics["Avg val loss"]))

        if self.writer:
            self.writer.add_scalar("training_avg_loss",
                                   self.metrics['Avg train loss'],
                                   engine.state.epoch)
            self.writer.add_scalar("validating_avg_loss",
                                   self.metrics['Avg val loss'],
                                   engine.state.epoch)

        self.metrics["Avg train loss"] = 0
        self.metrics["Avg val loss"] = 0

    def zeroMetrics(self, engine):
        self.val_batch_count = 1
        self.metrics["Avg val loss"] = 0

    def zeroTrainLoss(self, engine):
        self.metrics["Avg train loss"] = 0

    def zeroValLoss(self, engine):
        self.metrics["Avg val loss"] = 0

    def makeTrainer(self):
        self.trainer = Engine(self.train_a_batch)
        self.evaluator = Engine(self.eval_a_batch)

        pbar = ProgressBar(persist=True, postfix=self.metrics)
        pbar.attach(self.trainer)
        pbar.attach(self.evaluator)

        ## attach hooks
        self.trainer.add_event_handler(Events.EPOCH_COMPLETED, self.validate)
        self.trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                       self.zeroMetrics)
        self.trainer.add_event_handler(Events.COMPLETED, self.saveModel)
        # zero out metrics for next epoch

    def create_summary_writer(self, model, data_loader, log_dir):
        writer = SummaryWriter(logdir=log_dir)
        data_loader_iter = iter(data_loader)
        b = next(data_loader_iter)
        b = tuple(x.cuda(self.device) for x in b)
        try:
            writer.add_graph(model, b[:7])
        except Exception as e:
            print("Failed to save model graph: {}".format(e))
        return writer

    def start(self):
        max_epochs = self.args["pretrainEpochs"] + self.args["trainEpochs"]

        if self.tensorboard:
            self.writer = self.create_summary_writer(self.net,
                                                     self.train_loader,
                                                     self.log_dir)

#        @self.trainer.on(Events.ITERATION_COMPLETED)
#        def log_training_loss(engine):
#            iter = (engine.state.iteration - 1) % len(self.train_loader) + 1
#            if iter % 10 == 0:
#                self.writer.add_scalar("training/loss", engine.state.output, engine.state.iteration)

# if not self.eval_only:
        self.trainer.run(self.train_loader, max_epochs=max_epochs)
        # else:
        # self.trainer.run(self.train_loader, max_epochs=1)

        if self.tensorboard:
            self.writer.close()

    def test_a_batch(self, engine, batch):
        _, _, _, _, _, _, _, fut, op_mask, _, _, _, _ = batch

        # Initialize Variables
        if self.cuda:
            fut = fut.cuda(self.device)
            op_mask = op_mask.cuda(self.device)

        if self.args["train_loss"] == 'NLL':
            # Forward pass
            if self.args['use_maneuvers']:
                fut_pred, lat_pred, lon_pred = self.netPred(batch)
                l, c = maskedNLLTest(fut_pred,
                                     lat_pred,
                                     lon_pred,
                                     fut,
                                     op_mask,
                                     device=self.device,
                                     cuda=self.cuda)
            else:
                fut_pred = self.netPred(batch)
                l, c = maskedNLLTest(fut_pred,
                                     0,
                                     0,
                                     fut,
                                     op_mask,
                                     device=self.device,
                                     use_maneuvers=False,
                                     cuda=self.cuda)
        else:
            # Forward pass
            if self.args['use_maneuvers']:
                fut_pred, lat_pred, lon_pred = self.netPred(batch)
                fut_pred_max = torch.zeros_like(fut_pred[0])
                for k in range(lat_pred.shape[0]):
                    lat_man = torch.argmax(lat_pred[k, :]).detach()
                    lon_man = torch.argmax(lon_pred[k, :]).detach()
                    indx = lon_man * 3 + lat_man
                    fut_pred_max[:, k, :] = fut_pred[indx][:, k, :]
                l, c = maskedMSETest(fut_pred_max,
                                     fut,
                                     op_mask,
                                     device=self.device)
            else:
                fut_pred = self.netPred(batch)
                l, c = maskedMSETest(fut_pred,
                                     fut,
                                     op_mask,
                                     device=self.device)

        self.lossVals += l.detach()
        self.lastTestLoss = l.detach()
        self.counts += c.detach()

    def eval(self, test_loader):

        self.test_batch_size = len(test_loader)
        tester = Engine(self.test_a_batch)

        pbar = ProgressBar(persist=True, postfix=self.metrics)
        pbar.attach(tester)
        print('evaluating on dataset{}...'.format(self.dsId))
        tester.run(test_loader)

        if (self.args["train_loss"]) == "NLL":
            nll_loss = self.lossVals / self.counts
            nll_loss[nll_loss != nll_loss] = 0
            print("NLL:")
            print("Last Test loss: " + str(self.lastTestLoss.mean().item()))
            print("Avg Test loss: " + str(nll_loss.mean().item()))
        else:
            rmse = torch.pow(self.lossVals / self.counts,
                             0.5) * .3048  # converting from feet to meters
            rmse[torch.isnan(rmse)] = 0
            # self.lastTestLoss = torch.pow(self.lastTestLoss, 0.5) * .3048
            # print(self.lastTestLoss)
            seq_loss = rmse.tolist()
            seq_loss = [x for x in seq_loss if x != 0]
            print("RMSE:")
            print(rmse)
            print("Last Test loss: " + str(seq_loss[-1]))
            print("Avg Test loss: " + str(rmse.mean().item()))
Example 13
def train(args):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",
                                              do_lower_case=False)
    args.num_embeddings = len(
        tokenizer.vocab
    )  # We need this to create the model at next line (number of embeddings to use)
    model = TransformerWithLMHead(args)
    model.to(args.device)
    optimizer = Adam(model.parameters(),
                     lr=args.lr,
                     weight_decay=args.weight_decay)

    logger.info("Model has %s parameters",
                sum(p.numel() for p in model.parameters() if p.requires_grad))

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler, train_num_words, valid_num_words = get_data_loaders(
        args, tokenizer)

    # Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original
    def mask_tokens(inputs):
        labels = inputs.clone()
        masked_indices = torch.bernoulli(
            torch.full(labels.shape, args.mlm_probability)).byte()
        labels[~masked_indices] = -1  # We only compute loss on masked tokens
        indices_replaced = torch.bernoulli(torch.full(
            labels.shape, 0.8)).byte() & masked_indices
        inputs[indices_replaced] = tokenizer.vocab[
            "[MASK]"]  # 80% of the time, replace masked input tokens with [MASK]
        indices_random = torch.bernoulli(torch.full(
            labels.shape, 0.5)).byte() & masked_indices & ~indices_replaced
        random_words = torch.randint(args.num_embeddings,
                                     labels.shape,
                                     dtype=torch.long,
                                     device=args.device)
        inputs[indices_random] = random_words[
            indices_random]  # 10% of the time, replace masked input tokens with random word
        return inputs, labels

    def update(engine, batch):
        model.train()
        inputs = batch.transpose(0, 1).contiguous().to(args.device)
        inputs, labels = mask_tokens(inputs) if args.mlm else (inputs, inputs)
        logits, loss = model(inputs, labels=labels)
        loss = loss / args.gradient_accumulation_steps
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            inputs = batch.transpose(0, 1).contiguous().to(args.device)
            inputs, labels = mask_tokens(inputs) if args.mlm else (
                inputs,
                inputs)  # Prepare masked input/labels if we use masked LM
            logits = model(inputs)
            shift_logits = logits[:-1] if not args.mlm else logits
            shift_labels = labels[1:] if not args.mlm else labels
            return shift_logits.view(-1,
                                     logits.size(-1)), shift_labels.view(-1)

    evaluator = Engine(inference)

    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.eval_every > 0:
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED,
            lambda engine: evaluator.run(val_loader)
            if engine.state.iteration % args.eval_every == 0 else None)
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))

    # Learning rate schedule: linearly warm-up to lr and then decrease the learning rate to zero with cosine schedule
    cos_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0,
                                             len(train_loader) * args.n_epochs)
    scheduler = create_lr_scheduler_with_warmup(cos_scheduler, 0.0, args.lr,
                                                args.n_warmup)
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we average distributed metrics using average_distributed_scalar
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1))}
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    # Let's convert sub-word perplexities in word perplexities. If you need details: http://sjmielke.com/comparing-perplexities.htm
    metrics["average_word_ppl"] = MetricsLambda(
        lambda x: math.exp(x * val_loader.dataset.numel() / valid_num_words),
        metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model and configuration before we start to train
    if args.local_rank in [-1, 0]:
        checkpoint_handler, tb_logger = add_logging_and_checkpoint_saving(
            trainer, evaluator, metrics, model, optimizer, args)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)
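
A hedged toy illustration of the 80/10/10 split implemented by mask_tokens above, using a fake vocabulary and .bool() masks (recent PyTorch) so it runs without a tokenizer; mask_id and vocab_size are assumptions.

import torch

def toy_mask_tokens(inputs, mlm_probability=0.15, mask_id=103, vocab_size=30522):
    labels = inputs.clone()
    masked = torch.bernoulli(torch.full(labels.shape, mlm_probability)).bool()
    labels[~masked] = -1  # loss is only computed on masked positions
    replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked
    inputs[replaced] = mask_id  # 80% of masked positions become [MASK]
    randomized = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked & ~replaced
    random_words = torch.randint(vocab_size, labels.shape, dtype=torch.long)
    inputs[randomized] = random_words[randomized]  # 10% become a random token
    return inputs, labels  # remaining 10% keep the original token

x = torch.randint(0, 30522, (2, 16))
masked_x, lm_labels = toy_mask_tokens(x.clone())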
Example 14
    def attach(
        self,
        trainer: Engine,
        to_save: Mapping,
        output_transform: Callable = lambda output: output,
        num_iter: Optional[int] = None,
        end_lr: float = 10.0,
        step_mode: str = "exp",
        smooth_f: float = 0.05,
        diverge_th: float = 5.0,
    ) -> Any:
        """Attaches lr_finder to a given trainer. It also resets model and optimizer at the end of the run.

        Usage:

        .. code-block:: python

            to_save = {"model": model, "optimizer": optimizer}
            with lr_finder.attach(trainer, to_save=to_save) as trainer_with_lr_finder:
                trainer_with_lr_finder.run(dataloader)

        Args:
            trainer: lr_finder is attached to this trainer. Please, keep in mind that all attached handlers
                will be executed.
            to_save: dictionary with optimizer and other objects that needs to be restored after running
                the LR finder. For example, `to_save={'optimizer': optimizer, 'model': model}`. All objects should
                implement `state_dict` and `load_state_dict` methods.
            output_transform: function that transforms the trainer's `state.output` after each
                iteration. It must return the loss of that iteration.
            num_iter: number of iterations for lr schedule between base lr and end_lr. Default, it will
                run for `trainer.state.epoch_length * trainer.state.max_epochs`.
            end_lr: upper bound for lr search. Default, 10.0.
            step_mode: "exp" or "linear", which way should the lr be increased from optimizer's initial
                lr to `end_lr`. Default, "exp".
            smooth_f: loss smoothing factor in range `[0, 1)`. Default, 0.05
            diverge_th: Used for stopping the search when `current loss > diverge_th * best_loss`.
                Default, 5.0.

        Returns:
            trainer_with_lr_finder (trainer used for finding the lr)

        Note:
            lr_finder cannot be attached to more than one trainer at a time.
        """
        if not isinstance(to_save, Mapping):
            raise TypeError(
                f"Argument to_save should be a mapping, but given {type(to_save)}"
            )

        Checkpoint._check_objects(to_save, "state_dict")
        Checkpoint._check_objects(to_save, "load_state_dict")

        if "optimizer" not in to_save:
            raise ValueError("Mapping to_save should contain 'optimizer' key")

        if not isinstance(to_save["optimizer"], torch.optim.Optimizer):
            raise TypeError(
                f"Object to_save['optimizer'] should be torch optimizer, but given {type(to_save['optimizer'])}"
            )

        if smooth_f < 0 or smooth_f >= 1:
            raise ValueError("smooth_f is outside the range [0, 1)")
        if diverge_th < 1:
            raise ValueError("diverge_th should be larger than 1")
        if step_mode not in ["exp", "linear"]:
            raise ValueError(
                f"step_mode should be 'exp' or 'linear', but given {step_mode}"
            )
        if num_iter is not None:
            if not isinstance(num_iter, int):
                raise TypeError(
                    f"if provided, num_iter should be an integer, but given {num_iter}"
                )
            if num_iter <= 0:
                raise ValueError(
                    f"if provided, num_iter should be positive, but given {num_iter}"
                )

        # store to_save
        with tempfile.TemporaryDirectory() as tmpdirname:
            obj = {k: o.state_dict() for k, o in to_save.items()}
            # add trainer
            obj["trainer"] = trainer.state_dict()
            cache_filepath = Path(tmpdirname) / "ignite_lr_finder_cache.pt"
            torch.save(obj, cache_filepath.as_posix())

            optimizer = to_save["optimizer"]
            # Attach handlers
            if not trainer.has_event_handler(self._run):
                trainer.add_event_handler(
                    Events.STARTED,
                    self._run,
                    optimizer,
                    output_transform,
                    num_iter,
                    end_lr,
                    step_mode,
                    smooth_f,
                    diverge_th,
                )
            if not trainer.has_event_handler(self._warning):
                trainer.add_event_handler(Events.COMPLETED, self._warning)
            if not trainer.has_event_handler(self._reset):
                trainer.add_event_handler(Events.COMPLETED, self._reset)

            yield trainer
            self._detach(trainer)
            # restore to_save and reset trainer's state
            obj = torch.load(cache_filepath.as_posix())
            trainer.load_state_dict(obj["trainer"])
            for k, o in obj.items():
                if k in to_save:
                    to_save[k].load_state_dict(o)
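
A minimal usage sketch of the `attach` API documented above (assuming the enclosing class is ignite's FastaiLRFinder; `model`, `optimizer`, `trainer` and `dataloader` are placeholders, and the keyword values simply mirror the documented defaults):

# Hedged sketch: the import path may vary across ignite versions
# (ignite.handlers vs. ignite.contrib.handlers); everything else follows the docstring above.
from ignite.handlers import FastaiLRFinder

lr_finder = FastaiLRFinder()
to_save = {"model": model, "optimizer": optimizer}

with lr_finder.attach(
    trainer,
    to_save=to_save,
    end_lr=10.0,       # upper bound for the lr search
    step_mode="exp",   # increase lr exponentially from the optimizer's initial lr
    smooth_f=0.05,     # loss smoothing factor in [0, 1)
    diverge_th=5.0,    # stop once current loss > diverge_th * best_loss
) as trainer_with_lr_finder:
    trainer_with_lr_finder.run(dataloader)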
Esempio n. 15
0
    def train(self):
        # Get self.configs
        bn = self.config.bn
        name = self.config.name
        load_from_ckpt = self.config.load_from_ckpt
        lr = self.config.lr
        epochs = self.config.epochs
        wd = self.config.weight_decay

        if self.config.restart and not self.is_debug:
            mod_ckpt, op_ckpt = self._load_ckpt("reg_ckpt")
        else:
            mod_ckpt = op_ckpt = None

        # seed each dataloader worker so shuffling stays reproducible
        def w_init_fn(worker_id):
            return np.random.seed(np.random.get_state()[1][0] + worker_id)

        # Load Datasets and DataLoader
        dset = get_dataset(self.config.dataset)
        transforms = T.ToTensor()
        train_dataset = dset(self.config, transforms, train=True)
        test_dataset = dset(self.config, transforms, train=False)

        train_loader = DataLoader(
            train_dataset,
            batch_size=bn,
            shuffle=True,
            num_workers=0 if self.is_debug else self.config.n_workers,
            worker_init_fn=w_init_fn)
        test_loader = DataLoader(
            test_dataset,
            batch_size=bn,
            shuffle=True,
            num_workers=0 if self.is_debug else self.config.n_workers,
            worker_init_fn=w_init_fn)
        eval_loader = DataLoader(
            test_dataset,
            batch_size=bn,
            num_workers=0 if self.is_debug else self.config.n_workers,
            worker_init_fn=w_init_fn)

        # model
        model = Model(self.config)
        self.logger.info(
            f"Number of trainable parameters in model is {sum(p.numel() for p in model.parameters() if p.requires_grad)}"
        )
        if self.config.restart and mod_ckpt is not None:
            self.logger.info("Load pretrained parameters and resume training.")
            model.load_state_dict(mod_ckpt)

        model.cuda(self.device)

        wandb.watch(model, log="all")

        # optimizer
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=lr,
                                     weight_decay=wd)
        if self.config.restart and op_ckpt is not None:
            self.logger.info("Load state_dict of optimizer.")
            optimizer.load_state_dict(op_ckpt)

        if self.perc_loss:
            self.vgg = PerceptualVGG()
            self.vgg.cuda(self.device)
        else:
            self.vgg = None

        n_epoch_train = self.config.epochs
        start_it = 0
        start_epoch = 0
        if self.config.restart and op_ckpt is not None:
            start_it = list(
                optimizer.state_dict()["state"].values())[-1]["step"]
            start_epoch = int(np.floor(start_it / len(train_loader)))
            assert self.config.epochs > start_epoch
            n_epoch_train = self.config.epochs - start_epoch

        def train_step(engine, batch):
            model.train()
            original = batch["images"].cuda(self.device)

            tps_param_dic = tps_parameters(original.shape[0], self.config.scal,
                                           self.config.tps_scal,
                                           self.config.rot_scal,
                                           self.config.off_scal,
                                           self.config.scal_var,
                                           self.config.augm_scal)
            coord, vector = make_input_tps_param(tps_param_dic)
            coord, vector = coord.cuda(self.device), vector.cuda(self.device)
            image_spatial_t, _ = ThinPlateSpline(original, coord, vector,
                                                 original.shape[3],
                                                 self.device)
            image_appearance_t = K.ColorJitter(self.config.brightness,
                                               self.config.contrast,
                                               self.config.saturation,
                                               self.config.hue)(original)
            # Forward pass

            rec, ssp, asp, mu, heat_map = model(original, image_spatial_t,
                                                image_appearance_t, coord,
                                                vector)

            loss, rec_loss, equiv_loss = total_loss(
                original, rec, ssp, asp, mu, coord, vector, self.device,
                self.config.L_mu, self.config.L_cov, self.config.scal,
                self.config.l_2_scal, self.config.l_2_threshold, self.vgg)

            # fixme compute keypoint metrics if available

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            out_dict = {
                "loss": loss.item(),
                "rec_loss": rec_loss.item(),
                "equiv_loss": equiv_loss.item()
            }

            return out_dict

        def eval_step(engine, batch):
            model.eval()
            with torch.no_grad():
                original = batch["images"].cuda(self.device)

                tps_param_dic = tps_parameters(
                    original.shape[0], self.config.scal, self.config.tps_scal,
                    self.config.rot_scal, self.config.off_scal,
                    self.config.scal_var, self.config.augm_scal)
                coord, vector = make_input_tps_param(tps_param_dic)
                coord, vector = coord.cuda(self.device), vector.cuda(
                    self.device)
                image_spatial_t, _ = ThinPlateSpline(original, coord, vector,
                                                     original.shape[3],
                                                     self.device)
                image_appearance_t = K.ColorJitter(self.config.brightness,
                                                   self.config.contrast,
                                                   self.config.saturation,
                                                   self.config.hue)(original)
                # Forward pass (no gradient tracking needed during evaluation)

                rec, ssp, asp, mu, heat_map = model(original, image_spatial_t,
                                                    image_appearance_t, coord,
                                                    vector)

                loss, rec_loss, equiv_loss = total_loss(
                    original, rec, ssp, asp, mu, coord, vector, self.device,
                    self.config.L_mu, self.config.L_cov, self.config.scal,
                    self.config.l_2_scal, self.config.l_2_threshold)

            metric_ssim = ssim(original, rec)
            metric_psnr = psnr(original, rec)
            # fixme keypoint metrics

            return {
                "loss": loss.item(),
                "rec_loss": rec_loss.item(),
                "equiv_loss": equiv_loss.item(),
                "ssim": float(metric_ssim),
                "psnr": float(metric_psnr)
            }

        def eval_visual(engine, eval_batch):
            model.eval()
            with torch.no_grad():
                original = eval_batch["images"].cuda(self.device)

                tps_param_dic = tps_parameters(
                    original.shape[0], self.config.scal, self.config.tps_scal,
                    self.config.rot_scal, self.config.off_scal,
                    self.config.scal_var, self.config.augm_scal)
                coord, vector = make_input_tps_param(tps_param_dic)
                coord, vector = coord.cuda(self.device), vector.cuda(
                    self.device)
                image_spatial_t, _ = ThinPlateSpline(original, coord, vector,
                                                     original.shape[3],
                                                     self.device)
                image_appearance_t = K.ColorJitter(self.config.brightness,
                                                   self.config.contrast,
                                                   self.config.saturation,
                                                   self.config.hue)(original)
                # Forward pass for visualization only

                rec, ssp, asp, mu, heat_map = model(original, image_spatial_t,
                                                    image_appearance_t, coord,
                                                    vector)

            img_grid = make_img_grid(image_appearance_t,
                                     image_spatial_t,
                                     rec,
                                     original,
                                     mus=mu,
                                     n_logged=6)

            wandb.log({
                "Evaluation image logs":
                wandb.Image(img_grid, caption=f"Image logs on test set.")
            })

        self.logger.info("Initialize engines...")
        trainer = Engine(train_step)
        evaluator = Engine(eval_step)
        test_img_generator = Engine(eval_visual)
        self.logger.info("Finish engine initialization...")

        # checkpointing
        n_saved = 10
        ckpt_handler = ModelCheckpoint(self.dirs["ckpt"],
                                       "reg_ckpt",
                                       n_saved=n_saved,
                                       require_empty=False)
        save_dict = {"model": model, "optimizer": optimizer}
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED(every=self.config.ckpt_intervall),
            ckpt_handler, save_dict)

        pbar = ProgressBar(ascii=True)
        pbar.attach(trainer, output_transform=lambda x: x)
        pbar.attach(evaluator, output_transform=lambda x: x)

        @trainer.on(Events.ITERATION_COMPLETED(every=self.config.log_intervall))
        def log(engine):
            it = engine.state.iteration
            wandb.log({"iteration": it})

            # log losses
            for key in engine.state.output:
                wandb.log({key: engine.state.output[key]})

            batch = engine.state.batch
            model.eval()

            original = batch["images"].cuda(self.device)

            with torch.no_grad():

                tps_param_dic = tps_parameters(
                    original.shape[0], self.config.scal, self.config.tps_scal,
                    self.config.rot_scal, self.config.off_scal,
                    self.config.scal_var, self.config.augm_scal)
                coord, vector = make_input_tps_param(tps_param_dic)
                coord, vector = coord.cuda(self.device), vector.cuda(
                    self.device)
                image_spatial_t, _ = ThinPlateSpline(original, coord, vector,
                                                     original.shape[3],
                                                     self.device)
                image_appearance_t = K.ColorJitter(self.config.brightness,
                                                   self.config.contrast,
                                                   self.config.saturation,
                                                   self.config.hue)(original)

                rec, ssp, asp, mu, heat_map = model(original, image_spatial_t,
                                                    image_appearance_t, coord,
                                                    vector)

            img_grid = make_img_grid(image_appearance_t,
                                     image_spatial_t,
                                     rec,
                                     original,
                                     mus=mu,
                                     n_logged=6)

            wandb.log({
                "Training image logs":
                wandb.Image(img_grid,
                            caption=f"Image logs after {it} train steps.")
            })

        # metrics for training
        Average(output_transform=lambda x: x["loss"]).attach(
            trainer, "loss-epoch_avg")
        Average(output_transform=lambda x: x["rec_loss"]).attach(
            trainer, "rec_loss-epoch_avg")
        Average(output_transform=lambda x: x["equiv_loss"]).attach(
            trainer, "equiv_loss-epoch_avg")

        # metrics during evaluation
        Average(output_transform=lambda x: x["loss"]).attach(
            evaluator, "loss-eval")
        Average(output_transform=lambda x: x["rec_loss"]).attach(
            evaluator, "rec_loss-eval")
        Average(output_transform=lambda x: x["equiv_loss"]).attach(
            evaluator, "equiv_loss-eval")
        Average(output_transform=lambda x: x["psnr"]).attach(
            evaluator, "psnr-eval")
        Average(output_transform=lambda x: x["ssim"]).attach(
            evaluator, "ssim-eval")

        @trainer.on(Events.EPOCH_COMPLETED(every=self.config.metric_at_epochs))
        def metrics(engine):
            self.logger.info(
                f"Computing metrics after epoch #{engine.state.epoch}")
            batch_size = eval_loader.batch_size
            bs = (20 if self.is_debug
                  else int(8000 / batch_size) if len(test_dataset) > 8000
                  else len(eval_loader))
            evaluator.run(eval_loader, max_epochs=1, epoch_length=bs)
            for key, value in evaluator.state.metrics.items():
                wandb.log({key: value})

        @trainer.on(
            Events.ITERATION_COMPLETED(every=self.config.test_img_intervall))
        def make_test_grid(engine):
            test_img_generator.run(test_loader, max_epochs=1, epoch_length=1)

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_train_avg(engine):
            wandb.log({"epoch": engine.state.epoch})
            for key, value in engine.state.metrics.items():
                wandb.log({key: value})

        @trainer.on(Events.STARTED)
        def set_start_it(engine):
            self.logger.info(
                f'Engine starting from iteration {start_it}, epoch {start_epoch}'
            )
            engine.state.iteration = start_it
            engine.state.epoch = start_epoch

        # run everything
        n_step_per_epoch = 10 if self.is_debug else len(train_loader)
        self.logger.info("Start training...")
        trainer.run(train_loader,
                    max_epochs=n_epoch_train,
                    epoch_length=n_step_per_epoch)
        self.logger.info("End training.")
Esempio n. 16
0
    def attach(self, engine: Engine):
        if self._name is None:
            self.logger = engine.logger
        return engine.add_event_handler(Events.STARTED, self)
Esempio n. 17
0
def test_lr_scheduler(torch_lr_scheduler_cls, kwargs):

    if torch_lr_scheduler_cls is None:
        return

    tensor = torch.zeros([1], requires_grad=True)
    optimizer1 = torch.optim.SGD([tensor], lr=0.01)
    optimizer2 = torch.optim.SGD([tensor], lr=0.01)
    optimizer3 = torch.optim.SGD([tensor], lr=0.01)
    opt_state_dict1 = optimizer1.state_dict()
    opt_state_dict2 = optimizer2.state_dict()
    opt_state_dict3 = optimizer3.state_dict()

    torch_lr_scheduler1 = torch_lr_scheduler_cls(optimizer=optimizer1, **kwargs)
    scheduler1 = LRScheduler(torch_lr_scheduler1)
    state_dict1 = scheduler1.state_dict()

    torch_lr_scheduler2 = torch_lr_scheduler_cls(optimizer=optimizer2, **kwargs)
    with pytest.warns(UserWarning, match=r"the first lr value from the optimizer, otherwise it is will be skipped"):
        scheduler2 = LRScheduler(torch_lr_scheduler2, use_legacy=True)
    state_dict2 = scheduler2.state_dict()

    torch_lr_scheduler3 = torch_lr_scheduler_cls(optimizer=optimizer3, **kwargs)
    state_dict3 = torch_lr_scheduler3.state_dict()

    def dummy_update(engine, batch):
        optimizer1.step()
        optimizer2.step()
        optimizer3.step()

    trainer = Engine(dummy_update)
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler1)

    @trainer.on(Events.ITERATION_STARTED)
    def save_lr1(engine):
        lrs1.append(optimizer1.param_groups[0]["lr"])

    @trainer.on(Events.ITERATION_STARTED)
    def save_lr2(engine):
        lrs2.append(optimizer2.param_groups[0]["lr"])

    @trainer.on(Events.ITERATION_STARTED)
    def save_true_lr(engine):
        lrs_true.append(optimizer3.param_groups[0]["lr"])

    @trainer.on(Events.ITERATION_COMPLETED)
    def torch_lr_scheduler_step(engine):
        torch_lr_scheduler3.step()

    trainer.add_event_handler(Events.ITERATION_COMPLETED, scheduler2)

    for _ in range(2):
        lrs1 = []
        lrs2 = []
        lrs_true = []
        data = [0] * 10
        max_epochs = 2
        trainer.run(data, max_epochs=max_epochs)
        assert lrs_true == pytest.approx(lrs1), f"{_}: {lrs_true} ({len(lrs_true)}) vs {lrs1} ({len(lrs1)})"
        assert lrs_true == pytest.approx(lrs2), f"{_}: {lrs_true} ({len(lrs_true)}) vs {lrs2} ({len(lrs2)})"
        optimizer1.load_state_dict(opt_state_dict1)
        scheduler1.load_state_dict(state_dict1)
        optimizer2.load_state_dict(opt_state_dict2)
        scheduler2.load_state_dict(state_dict2)
        optimizer3.load_state_dict(opt_state_dict3)
        torch_lr_scheduler3.load_state_dict(state_dict3)

    optimizer4 = torch.optim.SGD([tensor], lr=0.01)
    torch_lr_scheduler4 = torch_lr_scheduler_cls(optimizer=optimizer4, **kwargs)

    simulated_values = LRScheduler.simulate_values(num_events=len(data) * max_epochs, lr_scheduler=torch_lr_scheduler4)
    assert lrs1 == pytest.approx([v for i, v in simulated_values])
    assert lrs2 == pytest.approx([v for i, v in simulated_values])
Esempio n. 18
0
def train():
    config_file = "configs/train_multihead_config.json"
    config = Config.from_json_file(config_file)

    ec_coef = 1
    sc_coef = 1

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d",
                   config.local_rank)  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(config))

    # Initialize distributed training if needed
    config.distributed = (config.local_rank != -1)
    if config.distributed:
        torch.cuda.set_device(config.local_rank)
        config.device = torch.device("cuda", config.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning")
    tokenizer_class = OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint)
    model_class = OpenAIGPTMultiHeadModel
    model = model_class.from_pretrained(config.model_checkpoint)
    tokenizer.set_special_tokens(SPECIAL_TOKENS)
    model.set_num_special_tokens(len(SPECIAL_TOKENS))
    model.to(config.device)
    optimizer = OpenAIAdam(model.parameters(), lr=config.lr)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if config.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=config.fp16)
    if config.distributed:
        model = DistributedDataParallel(model, device_ids=[config.local_rank], output_device=config.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(config, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        # input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids, token_action_ids = tuple(input_tensor.to(config.device) for input_tensor in batch)
        input_ids, ec_token_ids, sc_token_ids, lm_labels, ec_labels, sc_labels, token_type_ids, token_emotion_ids, token_action_ids = tuple(
            input_tensor.to(config.device) for input_tensor in batch)

        lm_loss, emotion_loss, sentence_loss = model(input_ids, ec_token_ids, sc_token_ids,
                                                     lm_labels, ec_labels, sc_labels, token_type_ids,
                                                     token_emotion_ids, token_action_ids)
        loss = (lm_loss * config.lm_coef + emotion_loss * ec_coef + sentence_loss * sc_coef) / config.gradient_accumulation_steps
        if config.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_norm)
        if engine.state.iteration % config.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(config.device) for input_tensor in batch)
            input_ids, ec_token_ids, sc_token_ids, lm_labels, ec_labels, \
            sc_labels, token_type_ids, token_emotion_ids, token_action_ids = batch
            # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            model_outputs = model(input_ids, ec_token_ids, sc_token_ids, token_type_ids=token_type_ids,
                                  token_emotion_ids=token_emotion_ids,
                                  token_action_ids=token_action_ids)
            lm_logits, mc_logits = model_outputs[0], model_outputs[2]  # So we can also use GPT2 outputs
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, sc_labels)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if config.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if config.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if config.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr", [(0, config.lr), (config.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])),
               "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))}
    metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], config),
                    "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], config)})
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if config.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(Events.COMPLETED,
                                    lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir=config.log_dir)
        tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()),
                                                              another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {
            'mymodel': getattr(model, 'module', model)})  # "getattr" take care of distributed encapsulation

        torch.save(config, tb_logger.writer.log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.log_dir, CONFIG_NAME))
        tokenizer.save_vocabulary(tb_logger.writer.log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=config.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if config.local_rank in [-1, 0] and config.n_epochs > 0:
        os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.log_dir,
                                                                     WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
Esempio n. 19
0
    def attach(self, engine: Engine, name: str, usage: Union[str, MetricUsage] = EpochWise()) -> None:
        usage = self._check_usage(usage)
        # recursively attach all its dependencies (partially)
        self._internal_attach(engine, usage)
        # attach only the completion handler for the given usage (EPOCH_COMPLETED for EpochWise)
        engine.add_event_handler(usage.COMPLETED, self.completed, name)
Esempio n. 20
0
        loss.backward()
        model_opt.step()
        model_opt.zero_grad()

        return loss.item() / targ.shape[0]


    def inference(engine, batch):
        model_par.eval()
        inp, targ = batch
        out = model_par.forward(inp)
        return out, targ


    trainer = Engine(training_update_function)
    trainer.add_event_handler(Events.ITERATION_COMPLETED, best_checkpointer, {'best_model': model})

    evaluator = Engine(inference)

    @trainer.on(Events.ITERATION_COMPLETED)
    def track_results(trainer):
        results['best_loss'] = min(results['best_loss'], trainer.state.output)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(trainer):
        if (trainer.state.iteration + 1) % 1 == 0:
            print("Epoch[{}] Iteration[{}] Loss: {:.8f}".format(trainer.state.epoch,
                                                                trainer.state.iteration + 1,
                                                                trainer.state.output))
    trainer.run(train_loader, max_epochs=5)
Esempio n. 21
0
def train():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='persona_comet_weak_label_preprocessed',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="openai-gpt",
                        help="Path, url or short name of the model")
    parser.add_argument("--num_candidates",
                        type=int,
                        default=2,
                        help="Number of candidates for training")
    parser.add_argument("--max_history",
                        type=int,
                        default=2,
                        help="Number of previous exchanges to keep in history")
    parser.add_argument("--train_batch_size",
                        type=int,
                        default=4,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size",
                        type=int,
                        default=4,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr",
                        type=float,
                        default=6.25e-5,
                        help="Learning rate")
    parser.add_argument("--lm_coef",
                        type=float,
                        default=1.0,
                        help="LM loss coefficient")
    parser.add_argument("--mc_coef",
                        type=float,
                        default=1.0,
                        help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm",
                        type=float,
                        default=1.0,
                        help="Clipping gradient norm")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=3,
                        help="Number of training epochs")
    parser.add_argument("--personality_permutations",
                        type=int,
                        default=1,
                        help="Number of permutations of personality sentences")
    parser.add_argument(
        "--eval_before_start",
        action='store_true',
        help="If true start with a first evaluation before training")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument(
        "--fp16",
        type=str,
        default="",
        help=
        "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="Local rank for distributed training (-1: not distributed)")
    parser.add_argument("--num_beams",
                        type=int,
                        default=5,
                        help="Number of beams for comet expansion")
    parser.add_argument("--test_run_num",
                        type=int,
                        default=-1,
                        help="Datapoints to run with in a test run")
    parser.add_argument("--exp_name",
                        type=str,
                        default="",
                        required=True,
                        help="Provide an experiment name")
    parser.add_argument("--do_train", action='store_true', help="Do training")
    parser.add_argument("--do_eval", action='store_true', help="Do Evaluation")
    parser.add_argument("--no_persona",
                        action='store_true',
                        help="No Persona Evaluation")
    parser.add_argument("--no_comet_persona",
                        action='store_true',
                        help="No Persona Evaluation")
    parser.add_argument("--uniform_prior",
                        action='store_true',
                        help="Uniform prior")
    parser.add_argument("--log_dir",
                        type=str,
                        default="",
                        required=True,
                        help="Provide a log dir")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    print(
        "Running process {}".format(args.local_rank)
    )  # printed by all distributed processes (unlike logger.info, which logs only from the main process)
    print("Arguments: {}".format(pformat(args)))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    print("Prepare tokenizer, pretrained model and optimizer.")
    tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer  # can't use AutoTokenizer because the checkpoint could be a Path
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)

    model_class = GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel
    if args.do_eval and not args.do_train:
        print('Loading model from checkpoint {}'.format(args.model_checkpoint))
    # model = model_class.from_pretrained(args.model_checkpoint)
    # model.to(args.device)

    model = LatentMarginalizedModel(args, generator_class=model_class)
    model.to(args.device)

    # Add special tokens if they are not already added
    add_special_tokens_(model, tokenizer)
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    print("Prepare datasets")

    train_dataset = PersonaChatDataset(args, tokenizer, split='train')
    if args.do_eval:
        val_dataset = PersonaChatDataset(args, tokenizer, split='valid')

    train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)

    train_loader = DataLoader(train_dataset,
                              sampler=train_sampler,
                              batch_size=args.train_batch_size,
                              collate_fn=collate_dialog,
                              pin_memory=True)

    if args.do_eval:
        val_loader = DataLoader(val_dataset,
                                shuffle=False,
                                batch_size=args.valid_batch_size,
                                collate_fn=collate_dialog,
                                pin_memory=True)

    # train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(args, tokenizer)

    # Training function and trainer
    def update(engine, batch):

        model.train()

        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        input_ids, token_type_ids, lm_labels, mc_token_ids, mc_labels, persona, history = batch

        (lm_loss), (mc_loss) = model(input_ids=input_ids,
                                     token_type_ids=token_type_ids,
                                     mc_token_ids=mc_token_ids,
                                     lm_labels=lm_labels,
                                     mc_labels=mc_labels,
                                     persona=persona,
                                     history=history)

        loss = (lm_loss * args.lm_coef +
                mc_loss * args.mc_coef) / args.gradient_accumulation_steps

        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)

        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        return loss.item(), lm_loss.item(), mc_loss.item(), math.exp(
            lm_loss.item())

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):

        model.eval()

        with torch.no_grad():

            batch = tuple(
                input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch

            # print(tokenizer.decode(input_ids[0, -1, :].tolist()))
            # if we don't send labels to the model, it doesn't return losses

            lm_logits, mc_logits, *_ = model(
                input_ids,
                token_type_ids=token_type_ids,
                mc_token_ids=mc_token_ids,
            )

            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)

            return (lm_logits_flat_shifted,
                    mc_logits), (lm_labels_flat_shifted, mc_labels)

    evaluator = Engine(inference)

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    # if args.distributed:
    #     trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
    #     evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr),
                                 (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x[0]).attach(trainer, "loss")
    RunningAverage(output_transform=lambda x: x[1]).attach(trainer, "lm_loss")
    RunningAverage(output_transform=lambda x: x[2]).attach(trainer, "mc_loss")
    RunningAverage(output_transform=lambda x: x[3],
                   alpha=0.01).attach(trainer, "perplexity")

    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-100),
             output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy":
        Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))
    }

    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args),
        "average_accuracy":
        MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)
    })

    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])

    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    def print_model_save(engine):
        print("Training complete. Saving Model.")

    def print_validation(engine):
        print("Model saved. Starting validation.")

    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer,
                    metric_names=["loss", "lm_loss", "mc_loss", "perplexity"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.model_checkpoint, args.exp_name)
        log_dir = os.path.join(args.log_dir, log_dir)

        print("Logging at log dir: {}".format(log_dir))

        # tb stuff
        # tb_logger = TensorboardLogger(log_dir)
        # tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED)
        # tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
        # tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED)

        # save model checkpoints
        checkpoint_handler = ModelCheckpoint(log_dir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=3)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, print_model_save)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)
             })  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        # getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
        tokenizer.save_pretrained(log_dir)

        # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
        trainer.add_event_handler(Events.EPOCH_COMPLETED, print_validation)
        if args.do_eval:
            trainer.add_event_handler(Events.EPOCH_COMPLETED,
                                      lambda _: evaluator.run(val_loader))
            if args.n_epochs < 1:
                trainer.add_event_handler(Events.COMPLETED,
                                          lambda _: evaluator.run(val_loader))
            if args.eval_before_start:
                trainer.add_event_handler(Events.STARTED,
                                          lambda _: evaluator.run(val_loader))

    # Run the training
    if args.do_train:
        trainer.run(train_loader, max_epochs=args.n_epochs)
    if args.do_eval and not args.do_train:
        print('Running only Evaluation. No Training.')
        evaluator.run(val_loader)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0 and args.do_train:
        os.rename(
            os.path.join(log_dir, checkpoint_handler._saved[-1][1]),
            os.path.join(log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
Esempio n. 22
0
def create_train_and_validation_engines(train_func, val_func=None, device='cpu'):
    """
    Helper function for creating an ignite Engine object with helpful defaults.
    This sets up an Engine that has four handlers attached to it:

    - prepare_batch: before a batch is passed to train_func or val_func, this
      function runs, moving every item in the batch (which is a dictionary) to
      the appropriate device ('cpu' or 'cuda').

    - book_keeping: sets up some dictionaries that are used for bookkeeping so one
      can easily track the epoch and iteration losses for both training and
      validation.

    - add_to_iter_history: records the iteration, epoch, and past iteration losses
      into the dictionaries set up by book_keeping.

    - clear_iter_history: resets the current iteration history of losses after moving
      the current iteration history into past iteration history.
    
    Args:
        train_func (func): Function that provides the closure for training for
          a single batch.
        val_func (func, optional): Function that provides the closure for
          validating a single batch. Defaults to None.
        device (str, optional): Device to move tensors to. Defaults to 'cpu'.
    """
    # Set up engines for training and validation
    trainer = Engine(train_func)
    trainer.register_events(*ValidationEvents)
    trainer.register_events(*BackwardsEvents)

    validator = None if val_func is None else Engine(val_func)

    # Before a batch starts, the items should be float and moved to the 
    # correct device, for both training and validation. Checks to make
    # sure "cuda" is available if user requested cuda.
    device = device if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    def prepare_batch(engine):
        batch = engine.state.batch
        for key in batch:
            if torch.is_tensor(batch[key]):
                batch[key] = batch[key].float().to(device)
        engine.state.batch = batch

    # Set up stuff for bookkeeping as training progresses.
    def book_keeping(engine):
        engine.state.epoch_history = {}
        engine.state.iter_history = {}
        engine.state.past_iter_history = {}

    def add_to_iter_history(engine):
        for key in engine.state.output:
            if key not in engine.state.iter_history:
                engine.state.iter_history[key] = []
            if key not in engine.state.past_iter_history:
                engine.state.past_iter_history[key] = []
            engine.state.iter_history[key].append(
                engine.state.output[key]
            )
            engine.state.past_iter_history[key].append(
                engine.state.iter_history[key]
            )

    def clear_iter_history(engine):
        engine.state.iter_history = {}

    trainer.add_event_handler(
        Events.ITERATION_STARTED, prepare_batch)
    trainer.add_event_handler(
        Events.STARTED, book_keeping)
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED, add_to_iter_history)
    trainer.add_event_handler(
        Events.EPOCH_STARTED, clear_iter_history)

    if validator is not None:
        validator.add_event_handler(
            Events.ITERATION_STARTED, prepare_batch)
        validator.add_event_handler(
            Events.STARTED, book_keeping)
        validator.add_event_handler(
            Events.ITERATION_COMPLETED, add_to_iter_history)
        validator.add_event_handler(
            Events.EPOCH_STARTED, clear_iter_history)

    return trainer, validator
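
A hedged usage sketch for the helper above (the toy model, data and closures are illustrative stand-ins; it assumes `create_train_and_validation_engines` and its `ValidationEvents`/`BackwardsEvents` dependencies are importable from the surrounding project, and only the `iter_history` bookkeeping comes from the code above):

import torch
from torch import nn, optim
from torch.utils.data import DataLoader

# Illustrative setup: batches are dicts of tensors, as prepare_batch above expects.
model = nn.Linear(4, 1)
opt = optim.SGD(model.parameters(), lr=0.1)
data = [{"x": torch.randn(4), "y": torch.randn(1)} for _ in range(32)]
loader = DataLoader(data, batch_size=8)  # default collate keeps the dict structure

def train_step(engine, batch):
    # batch has already been cast to float and moved to the device by prepare_batch
    loss = ((model(batch["x"]) - batch["y"]) ** 2).mean()
    loss.backward()
    opt.step()
    opt.zero_grad()
    return {"loss": loss.item()}  # keys end up in engine.state.iter_history

def val_step(engine, batch):
    with torch.no_grad():
        loss = ((model(batch["x"]) - batch["y"]) ** 2).mean()
    return {"loss": loss.item()}

trainer, validator = create_train_and_validation_engines(train_step, val_step, device="cpu")
trainer.run(loader, max_epochs=2)
print(trainer.state.iter_history["loss"][-3:])  # last few recorded training losses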
Esempio n. 23
0
    def attach(self, engine: Engine):
        engine.add_event_handler(
            Events.ITERATION_COMPLETED(every=self.interval), self)
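
For context, a hedged sketch of how an interval-based handler like the one above is typically wired into an engine (the `PrintEveryN` class, the dummy update function and the data are illustrative, not from the snippet):

from ignite.engine import Engine, Events

class PrintEveryN:
    """Illustrative handler: fires every `interval` iterations once attached."""

    def __init__(self, interval: int):
        self.interval = interval

    def __call__(self, engine: Engine):
        print(f"iter {engine.state.iteration}: output={engine.state.output}")

    def attach(self, engine: Engine):
        # Same pattern as above: the event filter limits calls to every `interval` iterations.
        engine.add_event_handler(
            Events.ITERATION_COMPLETED(every=self.interval), self)

trainer = Engine(lambda engine, batch: batch * 2)  # dummy update function
PrintEveryN(interval=5).attach(trainer)
trainer.run(list(range(20)), max_epochs=1)  # prints at iterations 5, 10, 15, 20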
Esempio n. 24
0
def train():
    parser = ArgumentParser()
    parser.add_argument('--gpt2', action='store_true', help="use gpt2")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="config/cgpt/",
                        help="Path or URL of the model")
    parser.add_argument("--from_step",
                        type=int,
                        default=-1,
                        help="Init learning rate from this step")
    parser.add_argument('--pretrained',
                        action='store_true',
                        help="If False train from scratch")
    parser.add_argument("--data_path",
                        type=str,
                        default="",
                        help="Path or url of the dataset. ")
    parser.add_argument(
        "--train_path",
        type=str,
        default=
        "/Users/sunhongchao/Documents/craft/09_Dialogue/corpus/chitchat/gpt-chinese/toy_train.txt",
        help="Path of the train dataset for dist dataset. ")
    parser.add_argument(
        "--valid_path",
        type=str,
        default=
        "/Users/sunhongchao/Documents/craft/09_Dialogue/corpus/chitchat/gpt-chinese/toy_valid.txt",
        help="Path of the valid dataset for dist dataset. ")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default="dataset_cache",
                        help="Path or url of the dataset cache")
    parser.add_argument('--log_file',
                        '-log_file',
                        type=str,
                        default="",
                        help="Output logs to a file under this path")
    parser.add_argument("--num_workers",
                        type=int,
                        default=8,
                        help="Number of subprocesses for data loading")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=70,
                        help="Number of training epochs")
    parser.add_argument("--train_batch_size",
                        type=int,
                        default=2,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size",
                        type=int,
                        default=2,
                        help="Batch size for validation")
    parser.add_argument("--max_history",
                        type=int,
                        default=15,
                        help="Number of previous exchanges to keep in history")
    parser.add_argument("--scheduler",
                        type=str,
                        default="noam",
                        choices=['noam', 'linear'],
                        help="method of optim")
    parser.add_argument("--n_emd",
                        type=int,
                        default=768,
                        help="Number of n_emd in config file (for noam)")
    parser.add_argument("--lr", type=float, default=5e-5, help="Learning rate")
    parser.add_argument(
        "--eval_before_start",
        action='store_true',
        help="If true start with a first evaluation before training")
    parser.add_argument("--warmup_steps",
                        type=int,
                        default=5000,
                        help="Warm up steps")
    parser.add_argument("--valid_steps",
                        type=int,
                        default=5000,
                        help="Perfom validation every X steps")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=64,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--max_norm",
                        type=float,
                        default=1.0,
                        help="Clipping gradient norm")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument(
        "--fp16",
        type=str,
        default="",
        help=
        "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", args.local_rank)
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info(
        "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning"
    )
    model_class = OpenAIGPTLMHeadModel if not args.gpt2 else GPT2LMHeadModel
    config_class = OpenAIGPTConfig if not args.gpt2 else GPT2Config
    tokenizer_class = BertTokenizer
    if args.pretrained:
        tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint,
                                                    do_lower_case=True)
        model = model_class.from_pretrained(args.model_checkpoint)
    else:
        tokenizer = tokenizer_class(os.path.join(args.model_checkpoint,
                                                 "vocab.txt"),
                                    do_lower_case=True)
        config = config_class.from_json_file(
            os.path.join(args.model_checkpoint, CONFIG_NAME))
        model = model_class(config)
    model.to(args.device)

    optimizer = AdamW([{
        'params': model.parameters(),
        'initial_lr': args.lr
    }],
                      lr=args.lr,
                      correct_bias=True)

    logger.info("Prepare datasets")
    loader_class = build_dist_loaders if not args.data_path else build_dataloaders
    train_loader, val_loader, train_sampler, valid_sampler = loader_class(
        args, tokenizer, logger)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16)
    if args.distributed:
        model = DataParallel(model,
                             device_ids=[args.local_rank],
                             output_device=args.local_rank)

    # Training function and trainer
    def update(engine, batch):
        input_ids, token_type_ids, lm_labels = tuple(
            input_tensor.to(args.device) for input_tensor in batch)
        model.train()
        (lm_loss), *_ = model(input_ids,
                              labels=lm_labels,
                              token_type_ids=token_type_ids)
        loss = lm_loss / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item(), optimizer.param_groups[0]['lr']

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            input_ids, token_type_ids, lm_labels = tuple(
                input_tensor.to(args.device) for input_tensor in batch)
            # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            lm_logits, *_ = model(input_ids, token_type_ids=token_type_ids)
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return lm_logits_flat_shifted, lm_labels_flat_shifted

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Evaluation during training
    @trainer.on(Events.ITERATION_STARTED)
    def log_iterations(engine):
        # if engine.state.iteration % max(int(0.1 * len(train_loader)), 1) == 0:
        if engine.state.iteration % args.valid_steps == 0:
            evaluator.run(val_loader)

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Noam schedule: linear warmup followed by inverse-square-root decay of the learning rate
    # model_size = model.config.n_embd
    model_size = args.n_emd
    noam_lambda = lambda step: (model_size**(-0.5) * min(
        (step + 1)**(-0.5), (step + 1) * args.warmup_steps**(-1.5)))
    noam_scheduler = LambdaLR(optimizer,
                              lr_lambda=noam_lambda,
                              last_epoch=args.from_step)
    scheduler = LRScheduler(noam_scheduler)
    if args.scheduler == "linear":
        scheduler = PiecewiseLinear(optimizer, "lr",
                                    [(0, args.lr),
                                     (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x[0]).attach(trainer, "loss")
    RunningAverage(output_transform=lambda x: x[1]).attach(trainer, "lr")
    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-1),
             output_transform=lambda x: (x[0], x[1]))
    }
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints
    # And save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True, mininterval=2)
        pbar.attach(trainer, metric_names=["loss", "lr"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir=None)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(
                                                       metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(tb_logger.writer.logdir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=3)
        # save model after evaluation
        evaluator.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)})
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)
             })  # "getattr" take care of distributed encapsulation

        torch.save(args, tb_logger.writer.logdir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(
            os.path.join(tb_logger.writer.logdir, CONFIG_NAME))
        tokenizer.save_vocabulary(tb_logger.writer.logdir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint
    # (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(
            checkpoint_handler._saved[-1][1][-1],
            os.path.join(tb_logger.writer.logdir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
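The noam_lambda above is the Transformer "Noam" factor: it grows linearly for warmup_steps iterations and then decays as the inverse square root of the step, scaled by model_size ** -0.5. A minimal sketch of just that factor, with placeholder values for model_size and warmup_steps (the script takes them from args and feeds the factor to LambdaLR, wrapped in ignite's LRScheduler):

# Hypothetical values, not the original args.n_emd / args.warmup_steps
model_size, warmup_steps = 512, 4000

def noam_factor(step):
    # multiplies the optimizer's base lr: linear ramp up to warmup_steps, then ~1/sqrt(step)
    return model_size ** (-0.5) * min((step + 1) ** (-0.5),
                                      (step + 1) * warmup_steps ** (-1.5))

for step in (0, warmup_steps // 2, warmup_steps - 1, 10 * warmup_steps):
    print(step, noam_factor(step))
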
Example n. 25
0
def training(config,
             local_rank=None,
             with_mlflow_logging=False,
             with_plx_logging=False):

    if not getattr(config, "use_fp16", True):
        raise RuntimeError("This training script uses fp16 AMP by default")

    set_seed(config.seed + local_rank)
    torch.cuda.set_device(local_rank)
    device = "cuda"

    torch.backends.cudnn.benchmark = True

    train_loader = config.train_loader
    train_sampler = getattr(train_loader, "sampler", None)
    assert (
        train_sampler is not None
    ), "Train loader of type '{}' " "should have attribute 'sampler'".format(
        type(train_loader))
    assert hasattr(train_sampler, "set_epoch") and callable(
        train_sampler.set_epoch
    ), "Train sampler should have a callable method `set_epoch`"

    train_eval_loader = config.train_eval_loader
    val_loader = config.val_loader

    model = config.model.to(device)
    optimizer = config.optimizer
    model, optimizer = amp.initialize(
        model,
        optimizer,
        opt_level=getattr(config, "fp16_opt_level", "O2"),
        num_losses=1,
    )
    model = DDP(model, delay_allreduce=True)
    criterion = config.criterion.to(device)

    prepare_batch = getattr(config, "prepare_batch", _prepare_batch)
    non_blocking = getattr(config, "non_blocking", True)

    # Setup trainer
    accumulation_steps = getattr(config, "accumulation_steps", 1)
    model_output_transform = getattr(config, "model_output_transform",
                                     lambda x: x)

    def train_update_function(engine, batch):

        model.train()

        x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
        y_pred = model(x)
        y_pred = model_output_transform(y_pred)
        loss = criterion(y_pred, y)

        if isinstance(loss, Mapping):
            assert "supervised batch loss" in loss
            loss_dict = loss
            output = {k: v.item() for k, v in loss_dict.items()}
            loss = loss_dict["supervised batch loss"] / accumulation_steps
        else:
            output = {"supervised batch loss": loss.item()}

        with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss:
            scaled_loss.backward()

        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        return output

    output_names = getattr(
        config,
        "output_names",
        [
            "supervised batch loss",
        ],
    )

    trainer = Engine(train_update_function)
    common.setup_common_distrib_training_handlers(
        trainer,
        train_sampler,
        to_save={
            "model": model,
            "optimizer": optimizer
        },
        save_every_iters=1000,
        output_path=config.output_path.as_posix(),
        lr_scheduler=config.lr_scheduler,
        with_gpu_stats=True,
        output_names=output_names,
        with_pbars=True,
        with_pbar_on_iters=with_mlflow_logging,
        log_every_iters=1,
    )

    # Setup evaluators
    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
    }

    if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    model_output_transform = getattr(config, "model_output_transform",
                                     lambda x: x)

    evaluator_args = dict(
        model=model,
        metrics=val_metrics,
        device=device,
        non_blocking=non_blocking,
        prepare_batch=prepare_batch,
        output_transform=lambda x, y, y_pred: (
            model_output_transform(y_pred),
            y,
        ),
    )
    train_evaluator = create_supervised_evaluator(**evaluator_args)
    evaluator = create_supervised_evaluator(**evaluator_args)

    if dist.get_rank() == 0 and with_mlflow_logging:
        ProgressBar(persist=False,
                    desc="Train Evaluation").attach(train_evaluator)
        ProgressBar(persist=False, desc="Val Evaluation").attach(evaluator)

    def run_validation(_):
        train_evaluator.run(train_eval_loader)
        evaluator.run(val_loader)

    if getattr(config, "start_by_validation", False):
        trainer.add_event_handler(Events.STARTED, run_validation)
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1)),
        run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    score_metric_name = "mIoU_bg"

    if hasattr(config, "es_patience"):
        common.add_early_stopping_by_val_score(config.es_patience,
                                               evaluator,
                                               trainer,
                                               metric_name=score_metric_name)

    if dist.get_rank() == 0:

        tb_logger = common.setup_tb_logging(
            config.output_path.as_posix(),
            trainer,
            optimizer,
            evaluators={
                "training": train_evaluator,
                "validation": evaluator
            },
        )
        if with_mlflow_logging:
            common.setup_mlflow_logging(
                trainer,
                optimizer,
                evaluators={
                    "training": train_evaluator,
                    "validation": evaluator
                },
            )

        if with_plx_logging:
            common.setup_plx_logging(
                trainer,
                optimizer,
                evaluators={
                    "training": train_evaluator,
                    "validation": evaluator
                },
            )

        common.save_best_model_by_val_score(
            config.output_path.as_posix(),
            evaluator,
            model,
            metric_name=score_metric_name,
            trainer=trainer,
        )

        # Log train/val predictions:
        tb_logger.attach(
            evaluator,
            log_handler=predictions_gt_images_handler(
                img_denormalize_fn=config.img_denormalize,
                n_images=15,
                another_engine=trainer,
                prefix_tag="validation",
            ),
            event_name=Events.EPOCH_COMPLETED,
        )

        log_train_predictions = getattr(config, "log_train_predictions", False)
        if log_train_predictions:
            tb_logger.attach(
                train_evaluator,
                log_handler=predictions_gt_images_handler(
                    img_denormalize_fn=config.img_denormalize,
                    n_images=15,
                    another_engine=trainer,
                    prefix_tag="validation",
                ),
                event_name=Events.EPOCH_COMPLETED,
            )

    trainer.run(train_loader, max_epochs=config.num_epochs)
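For reference, a sketch of the attributes this training() function reads from config, collected from the attribute and getattr accesses above. The values below are placeholders only; a real config must provide working loaders, model, optimizer, criterion and scheduler.

from pathlib import Path
from types import SimpleNamespace

config = SimpleNamespace(
    seed=0,
    use_fp16=True,                    # required: the script refuses to run without AMP
    fp16_opt_level="O2",              # optional, defaults to "O2"
    train_loader=None,                # DataLoader whose sampler exposes set_epoch()
    train_eval_loader=None,           # loader used to evaluate on training data
    val_loader=None,
    model=None,                       # torch.nn.Module
    optimizer=None,                   # torch.optim.Optimizer
    criterion=None,                   # loss module, moved to the device
    lr_scheduler=None,                # passed to the common training handlers
    output_path=Path("/tmp/output"),  # pathlib.Path; .as_posix() is called on it
    num_classes=21,                   # sizes the ConfusionMatrix
    num_epochs=100,
    img_denormalize=None,             # callable used when logging prediction images
    accumulation_steps=1,             # optional
    val_interval=1,                   # optional
)
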
Example n. 26
0
def main(cfg):
    """
    Performs training, validation and testing.
    """
    assert isdir(cfg.data_dir), \
        '`data_dir` must be a valid path.'

    cfg.cuda = torch.cuda.is_available() \
        and not cfg.no_cuda

    cfg.model_dir = os.getcwd()

    # setting random seed for reproducibility
    if cfg.seed: set_random_seed(cfg)

    device = torch.device('cuda' if cfg.cuda else 'cpu')

    os.makedirs(cfg.model_dir, exist_ok=True)

    label2id = create_label2id(cfg)
    cfg.num_labels = len(label2id)

    xlmr = create_pretrained(cfg.model_type, cfg.force_download)

    # creating dataset split loaders
    datasets = create_dataset(cfg, xlmr, label2id)

    train_dataset, valid_dataset = datasets

    def compute_loss(batch):
        """
        Computes the forward pass and returns the
        cross entropy loss.
        """
        inputs, labels = [
            torch.from_numpy(tensor).to(device).long() for tensor in batch
        ]

        logits = model(inputs)

        logits = logits.view(-1, logits.size(-1))
        labels = labels.view(-1)

        loss = torch.nn.functional.cross_entropy(logits,
                                                 labels,
                                                 ignore_index=-1)

        return loss

    def train_step(engine, batch):
        """
        Propagates the inputs forward and updates
        the parameters.
        """
        step = engine.state.iteration

        model.train()

        loss = compute_loss(batch)

        backward(loss)

        if cfg.clip_grad_norm is not None:
            clip_grad_norm(cfg.clip_grad_norm)

        if step % cfg.grad_accum_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

        # restoring the averaged loss across steps
        loss *= cfg.grad_accum_steps

        return loss.item()

    def eval_step(engine, batch):
        """
        Propagates the inputs forward without
        storing any gradients.
        """
        model.eval()

        with torch.no_grad():
            loss = compute_loss(batch)

        return loss.item()

    def backward(loss):
        """
        Backpropagates the loss in either mixed or
        normal precision mode.
        """
        if cfg.fp16:
            with amp.scale_loss(loss, optimizer) as sc:
                sc.backward()

        else:
            loss.backward()

    def clip_grad_norm(max_norm):
        """
        Applies gradient clipping.
        """
        if cfg.fp16:
            params = amp.master_params(optimizer)
        else:
            params = model.parameters()

        torch.nn.utils.clip_grad_norm_(params, max_norm)

    trainer = Engine(train_step)
    validator = Engine(eval_step)

    checkpoint = ModelCheckpoint(
        cfg.model_dir,
        cfg.model_type,
        n_saved=5,
        save_as_state_dict=True,
        score_function=lambda e: -e.state.metrics['loss'])

    last_ckpt_path = cfg.ckpt_path

    if last_ckpt_path is not None:
        msg = 'Loading state from {}'
        print(msg.format(basename(last_ckpt_path)))

        last_state = torch.load(last_ckpt_path, map_location=device)

    model = create_model(xlmr, len(label2id), cfg)
    model = model.to(device)

    del xlmr.model

    optimizer = create_optimizer(cfg, model)

    scheduler = create_scheduler(cfg, optimizer, len(train_dataset))

    # using apex if required and loading its state
    if cfg.fp16:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

        if last_ckpt_path is not None and \
                'amp' in last_state:
            amp.load_state_dict(last_state['amp'])

    if last_ckpt_path is not None:
        model.load_state_dict(last_state['model'])
        optimizer.load_state_dict(last_state['optimizer'])
        scheduler.load_state_dict(last_state['scheduler'])

    checkpoint_dict = {
        'model': model,
        'optimizer': optimizer,
        'scheduler': scheduler
    }

    if cfg.fp16: checkpoint_dict['amp'] = amp

    validator.add_event_handler(Events.COMPLETED, checkpoint, checkpoint_dict)

    metric = RunningAverage(output_transform=lambda x: x)
    metric.attach(trainer, 'loss')
    metric.attach(validator, 'loss')

    pbar = ProgressBar()
    pbar.attach(trainer, metric_names=['loss'])

    history_path = join(cfg.model_dir, 'history.json')
    history = collections.defaultdict(list)
    headers = ['epoch', 'train_loss', 'valid_loss']

    if exists(history_path):
        with open(history_path, 'r') as fh:
            history = json.load(fh)

    def record_history(results):
        """
        Records the results to the history.
        """
        for header in headers:
            history[header].append(results[header])

        with open(history_path, 'w') as fh:
            json.dump(history, fh)

    @trainer.on(Events.EPOCH_COMPLETED)
    def print_results(engine):
        """
        Logs the training results.
        """
        validator.run(valid_dataset)

        record_history({
            'epoch': engine.state.epoch,
            'train_loss': engine.state.metrics['loss'],
            'valid_loss': validator.state.metrics['loss']
        })

        data = list(zip(*[history[h] for h in headers]))
        table = tabulate(data, headers, floatfmt='.3f')

        print(table.split('\n')[-1])

    data = list(zip(*[history[h] for h in headers]))

    print()
    print(cfg.pretty())

    print()
    print('***** Running training *****')

    print()
    print(tabulate(data, headers, floatfmt='.3f'))

    trainer.run(train_dataset, cfg.max_epochs)
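compute_loss above flattens the token logits and labels and relies on ignore_index=-1, so padded positions contribute nothing to the cross entropy. A small self-contained check of that behaviour (the shapes and labels below are made up):

import torch
import torch.nn.functional as F

logits = torch.randn(2, 4, 5)                 # (batch, sequence, num_labels)
labels = torch.tensor([[1, 2, 0, -1],
                       [3, -1, -1, -1]])      # -1 marks positions excluded from the loss

loss = F.cross_entropy(logits.view(-1, logits.size(-1)),
                       labels.view(-1),
                       ignore_index=-1)

# Same result as averaging per-token losses over the non-ignored positions only
mask = labels.view(-1) != -1
per_token = F.cross_entropy(logits.view(-1, logits.size(-1)),
                            labels.view(-1).clamp(min=0),
                            reduction="none")
print(torch.allclose(loss, per_token[mask].mean()))   # True
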
Example n. 27
0
    def _test(duration_vals_as_np_int):
        scheduler_1 = LinearCyclicalScheduler(optimizer, "lr", start_value=1.0, end_value=0.0, cycle_size=10)
        scheduler_2 = CosineAnnealingScheduler(optimizer, "lr", start_value=0.0, end_value=1.0, cycle_size=10)

        durations = [10]
        if duration_vals_as_np_int:
            durations = [np.int64(t) for t in durations]

        concat_scheduler = ConcatScheduler(
            schedulers=[scheduler_1, scheduler_2], durations=durations, save_history=True
        )
        state_dict = concat_scheduler.state_dict()

        data = [0] * 10
        max_epochs = 2
        simulated_values = ConcatScheduler.simulate_values(
            num_events=len(data) * max_epochs, schedulers=[scheduler_1, scheduler_2], durations=durations
        )

        def save_lr(engine):
            lrs.append(optimizer.param_groups[0]["lr"])

        trainer = Engine(lambda engine, batch: None)
        trainer.add_event_handler(Events.ITERATION_STARTED, concat_scheduler)
        trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr)

        for _ in range(2):
            lrs = []
            trainer.run(data, max_epochs=max_epochs)

            assert lrs == list(
                map(
                    pytest.approx,
                    [
                        # Cycle 1 of the LinearCyclicalScheduler
                        1.0,
                        0.8,
                        0.6,
                        0.4,
                        0.2,
                        0.0,
                        0.2,
                        0.4,
                        0.6,
                        0.8,
                        # Cycle 1 of the CosineAnnealingScheduler
                        0.0,
                        0.02447174185242318,
                        0.09549150281252627,
                        0.20610737385376332,
                        0.3454915028125263,
                        0.5,
                        0.6545084971874737,
                        0.7938926261462365,
                        0.9045084971874737,
                        0.9755282581475768,
                    ],
                )
            )

            state_lrs = trainer.state.param_history["lr"]
            assert len(state_lrs) == len(lrs)
            # Unpack singleton lists
            assert [group[0] for group in state_lrs] == lrs
            assert lrs == pytest.approx([v for i, v in simulated_values])
            concat_scheduler.load_state_dict(state_dict)

            trainer.state.param_history = None
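Outside of tests, the same simulate_values call is a convenient way to preview a warmup-plus-annealing schedule before attaching it to a trainer. A sketch using the scheduler classes from the test above (the optimizer and numbers are placeholders; the import path assumes the ignite.contrib.handlers location):

import torch
from ignite.contrib.handlers import (ConcatScheduler, CosineAnnealingScheduler,
                                     LinearCyclicalScheduler)

tensor = torch.zeros([1], requires_grad=True)
optimizer = torch.optim.SGD([tensor], lr=0.0)

warmup = LinearCyclicalScheduler(optimizer, "lr", start_value=1.0, end_value=0.0, cycle_size=10)
anneal = CosineAnnealingScheduler(optimizer, "lr", start_value=0.0, end_value=1.0, cycle_size=10)

# First 10 events follow the warmup scheduler, the remaining ones follow the cosine annealing
values = ConcatScheduler.simulate_values(num_events=20,
                                         schedulers=[warmup, anneal],
                                         durations=[10])
for event_index, lr in values[:5]:
    print(event_index, lr)
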
Example n. 28
0
def run(output_path, config):
    device = "cuda"

    local_rank = config['local_rank']
    distributed = backend is not None
    if distributed:
        torch.cuda.set_device(local_rank)
        device = "cuda"
    rank = dist.get_rank() if distributed else 0

    # Rescale batch_size and num_workers
    ngpus_per_node = torch.cuda.device_count()
    ngpus = dist.get_world_size() if distributed else 1
    batch_size = config['batch_size'] // ngpus
    num_workers = int((config['num_workers'] + ngpus_per_node - 1) / ngpus_per_node)

    train_labelled_loader, test_loader = \
        get_train_test_loaders(path=config['data_path'],
                               batch_size=batch_size,
                               distributed=distributed,
                               num_workers=num_workers)

    model = get_model(config['model'])
    model = model.to(device)

    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[local_rank, ],
                                                          output_device=local_rank)

    optimizer = optim.SGD(model.parameters(), lr=config['learning_rate'],
                          momentum=config['momentum'],
                          weight_decay=config['weight_decay'],
                          nesterov=True)

    criterion = nn.CrossEntropyLoss().to(device)

    le = len(train_labelled_loader)
    milestones_values = [
        (0, 0.0),
        (le * config['num_warmup_epochs'], config['learning_rate']),
        (le * config['num_epochs'], 0.0)
    ]
    lr_scheduler = PiecewiseLinear(optimizer, param_name="lr",
                                   milestones_values=milestones_values)

    def _prepare_batch(batch, device, non_blocking):
        x, y = batch
        return (convert_tensor(x, device=device, non_blocking=non_blocking),
                convert_tensor(y, device=device, non_blocking=non_blocking))

    def process_function(engine, labelled_batch):

        x, y = _prepare_batch(labelled_batch, device=device, non_blocking=True)

        model.train()
        # Supervised part
        y_pred = model(x)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return {
            'batch loss': loss.item(),
        }

    trainer = Engine(process_function)

    if not hasattr(lr_scheduler, "step"):
        trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler)
    else:
        trainer.add_event_handler(Events.ITERATION_STARTED, lambda engine: lr_scheduler.step())

    metric_names = [
        'batch loss',
    ]

    def output_transform(x, name):
        return x[name]

    for n in metric_names:
        # We compute running average values on the output (batch loss) across all devices
        RunningAverage(output_transform=partial(output_transform, name=n),
                       epoch_bound=False, device=device).attach(trainer, n)

    if rank == 0:
        checkpoint_handler = ModelCheckpoint(dirname=output_path,
                                             filename_prefix="checkpoint",
                                             save_interval=1000)
        trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                  checkpoint_handler,
                                  {'model': model, 'optimizer': optimizer})

        ProgressBar(persist=True, bar_format="").attach(trainer,
                                                        event_name=Events.EPOCH_STARTED,
                                                        closing_event_name=Events.COMPLETED)
        if config['display_iters']:
            ProgressBar(persist=False, bar_format="").attach(trainer, metric_names=metric_names)

        tb_logger = TensorboardLogger(log_dir=output_path)
        tb_logger.attach(trainer,
                         log_handler=tbOutputHandler(tag="train",
                                                     metric_names=metric_names),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=tbOptimizerParamsHandler(optimizer, param_name="lr"),
                         event_name=Events.ITERATION_STARTED)

    metrics = {
        "accuracy": Accuracy(device=device if distributed else None),
        "loss": Loss(criterion, device=device if distributed else None)
    }

    evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)

    def run_validation(engine, val_interval):
        if engine.state.epoch % val_interval == 0:
            torch.cuda.synchronize()
            train_evaluator.run(train_labelled_loader)
            evaluator.run(test_loader)

    trainer.add_event_handler(Events.EPOCH_STARTED, run_validation, val_interval=3)
    trainer.add_event_handler(Events.COMPLETED, run_validation, val_interval=1)

    if rank == 0:
        if config['display_iters']:
            ProgressBar(persist=False, desc="Train evaluation").attach(train_evaluator)
            ProgressBar(persist=False, desc="Test evaluation").attach(evaluator)

        tb_logger.attach(train_evaluator,
                         log_handler=tbOutputHandler(tag="train",
                                                     metric_names=list(metrics.keys()),
                                                     another_engine=trainer),
                         event_name=Events.COMPLETED)

        tb_logger.attach(evaluator,
                         log_handler=tbOutputHandler(tag="test",
                                                     metric_names=list(metrics.keys()),
                                                     another_engine=trainer),
                         event_name=Events.COMPLETED)

        # Store the best model
        def default_score_fn(engine):
            score = engine.state.metrics['accuracy']
            return score

        score_function = default_score_fn if not hasattr(config, "score_function") else config.score_function

        best_model_handler = ModelCheckpoint(dirname=output_path,
                                             filename_prefix="best",
                                             n_saved=3,
                                             score_name="val_accuracy",
                                             score_function=score_function)
        evaluator.add_event_handler(Events.COMPLETED, best_model_handler, {'model': model, })

    trainer.run(train_labelled_loader, max_epochs=config['num_epochs'])

    if rank == 0:
        tb_logger.close()
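The milestones_values above describe a linear warmup from 0 to the base learning rate over the warmup epochs, followed by a linear decay back to 0 at the last epoch. Assuming simulate_values is available for PiecewiseLinear in this ignite version, the shape can be previewed without running training (the numbers below are placeholders, not the original config):

from ignite.contrib.handlers import PiecewiseLinear

steps_per_epoch = 100            # hypothetical len(train_labelled_loader)
learning_rate = 0.1
num_warmup_epochs, num_epochs = 2, 10

milestones_values = [
    (0, 0.0),
    (steps_per_epoch * num_warmup_epochs, learning_rate),
    (steps_per_epoch * num_epochs, 0.0),
]

values = PiecewiseLinear.simulate_values(num_events=steps_per_epoch * num_epochs,
                                         param_name="lr",
                                         milestones_values=milestones_values)
print(values[0], values[steps_per_epoch * num_warmup_epochs], values[-1])
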
Example n. 29
0
def test_linear_scheduler():

    with pytest.raises(TypeError, match=r"Argument optimizer should be torch.optim.Optimizer"):
        LinearCyclicalScheduler({}, "lr", 1, 0, cycle_size=0)

    tensor = torch.zeros([1], requires_grad=True)
    optimizer = torch.optim.SGD([tensor], lr=0.0)

    with pytest.raises(ValueError, match=r"Argument cycle_size should be positive and larger than 1"):
        LinearCyclicalScheduler(optimizer, "lr", 1, 0, cycle_size=0)

    with pytest.raises(ValueError, match=r"Argument cycle_size should be positive and larger than 1"):
        LinearCyclicalScheduler(optimizer, "lr", 1, 0, cycle_size=1)

    scheduler = LinearCyclicalScheduler(optimizer, "lr", 1, 0, 10)
    state_dict = scheduler.state_dict()

    def save_lr(engine):
        lrs.append(optimizer.param_groups[0]["lr"])

    trainer = Engine(lambda engine, batch: None)
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
    trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr)

    for _ in range(2):
        lrs = []
        trainer.run([0] * 9, max_epochs=2)

        assert lrs == list(
            map(
                pytest.approx,
                [
                    # Cycle 1
                    1.0,
                    0.8,
                    0.6,
                    0.4,
                    0.2,
                    0.0,
                    0.2,
                    0.4,
                    0.6,
                    0.8,
                    # Cycle 2
                    1.0,
                    0.8,
                    0.6,
                    0.4,
                    0.2,
                    0.0,
                    0.2,
                    0.4,  # 0.6, 0.8,
                ],
            )
        )
        scheduler.load_state_dict(state_dict)

    optimizer = torch.optim.SGD([tensor], lr=0)
    scheduler = LinearCyclicalScheduler(optimizer, "lr", 1, 0, 10, cycle_mult=2)
    state_dict = scheduler.state_dict()

    trainer = Engine(lambda engine, batch: None)
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
    trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr)

    for _ in range(2):
        lrs = []
        trainer.run([0] * 10, max_epochs=3)

        assert lrs == list(
            map(
                pytest.approx,
                [
                    # Cycle 1
                    1.0,
                    0.8,
                    0.6,
                    0.4,
                    0.2,
                    0.0,
                    0.2,
                    0.4,
                    0.6,
                    0.8,
                    # Cycle 2
                    1.0,
                    0.9,
                    0.8,
                    0.7,
                    0.6,
                    0.5,
                    0.4,
                    0.3,
                    0.2,
                    0.1,
                    0.0,
                    0.1,
                    0.2,
                    0.3,
                    0.4,
                    0.5,
                    0.6,
                    0.7,
                    0.8,
                    0.9,
                ],
            )
        )
        scheduler.load_state_dict(state_dict)

    # With float cycle_size
    optimizer = torch.optim.SGD([tensor], lr=0)
    scheduler = LinearCyclicalScheduler(
        optimizer, "lr", start_value=1.2, end_value=0.2, cycle_size=10.00000012, cycle_mult=1.0
    )
    state_dict = scheduler.state_dict()

    trainer = Engine(lambda engine, batch: None)
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
    trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr)

    for _ in range(2):
        lrs = []
        trainer.run([0] * 9, max_epochs=2)
        assert lrs == list(
            map(
                pytest.approx,
                [
                    # Cycle 1
                    1.2,
                    1.0,
                    0.8,
                    0.6,
                    0.4,
                    0.2,
                    0.4,
                    0.6,
                    0.8,
                    1.0,
                    # Cycle 2
                    1.2,
                    1.0,
                    0.8,
                    0.6,
                    0.4,
                    0.2,
                    0.4,
                    0.6,  # 0.8, 1.0,
                ],
            )
        )
        scheduler.load_state_dict(state_dict)
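The values asserted above follow a triangular wave between start_value and end_value with period cycle_size (cycle_mult stretches each successive cycle, which this sketch ignores). A plain-Python reading of that wave, checked against the first cycle asserted in the test:

def linear_cyclical(event_index, start_value, end_value, cycle_size):
    # fraction of the current cycle that has elapsed
    cycle_progress = (event_index % cycle_size) / cycle_size
    return end_value + (start_value - end_value) * abs(2 * cycle_progress - 1)

first_cycle = [round(linear_cyclical(i, 1.0, 0.0, 10), 1) for i in range(10)]
print(first_cycle)   # [1.0, 0.8, 0.6, 0.4, 0.2, 0.0, 0.2, 0.4, 0.6, 0.8]
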
Example n. 30
0
    def attach(self, engine: Engine) -> None:
        if self._name is None:
            self.logger = engine.logger
        if not engine.has_event_handler(self, Events.ITERATION_COMPLETED):
            engine.add_event_handler(Events.ITERATION_COMPLETED, self)
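The has_event_handler guard above makes attach() idempotent: calling it twice registers the handler only once. A minimal sketch of the same guard with a plain function, assuming ignite's Engine.has_event_handler API:

from ignite.engine import Engine, Events

def handler(engine):
    pass

engine = Engine(lambda e, b: None)
for _ in range(2):
    # the second pass is a no-op because the handler is already registered
    if not engine.has_event_handler(handler, Events.ITERATION_COMPLETED):
        engine.add_event_handler(Events.ITERATION_COMPLETED, handler)

print(engine.has_event_handler(handler, Events.ITERATION_COMPLETED))   # True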