def run(train_batch_size, val_batch_size,
        epochs, lr, momentum,
        log_interval, restore_from, crash_iteration=1000):

    train_loader, val_loader = get_data_loaders(
        train_batch_size, val_batch_size)
    model = Net()
    device = 'cpu'
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    trainer = create_supervised_trainer(
        model, optimizer, F.nll_loss, device=device)
    evaluator = create_supervised_evaluator(model,
                                            metrics={'accuracy': Accuracy(),
                                                     'nll': Loss(F.nll_loss)},
                                            device=device)
    # Setup logging level of engine logger:
    trainer._logger.setLevel(logging.INFO)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        "%(asctime)s|%(name)s|%(levelname)s| %(message)s")
    ch.setFormatter(formatter)
    trainer._logger.addHandler(ch)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            print("Epoch[{}] Iteration[{}/{}] Loss: {:.2f}"
                  "".format(
                      engine.state.epoch, iter,
                      len(train_loader), engine.state.output))

        if engine.state.iteration == crash_iteration:
            raise Exception("STOP at {}".format(engine.state.iteration))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        print(
            "Training Results - Epoch: {}  "
            "Avg accuracy: {:.2f} Avg loss: {:.2f}".format(
                engine.state.epoch, avg_accuracy, avg_nll))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        print(
            "Validation Results - Epoch: {}  "
            "Avg accuracy: {:.2f} Avg loss: {:.2f}".format(
                engine.state.epoch, avg_accuracy, avg_nll))

    objects_to_checkpoint = {"model": model, "optimizer": optimizer}
    engine_checkpoint = ModelCheckpoint(
        dirname="engine_checkpoint",
        filename_prefix='ignite_checking',
        require_empty=False,
        save_interval=100)
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED, engine_checkpoint, objects_to_checkpoint)

    if restore_from == "":
        trainer.run(train_loader, max_epochs=epochs)
    else:
        raise NotImplementedError('Not implemented yet')
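    # A possible sketch for the restore branch above (an assumption, not part of the original
    # example): reload whatever the ModelCheckpoint handler wrote for "model" and "optimizer"
    # and continue training, e.g.
    #
    #     model.load_state_dict(torch.load(model_ckpt_path, map_location=device))
    #     optimizer.load_state_dict(torch.load(optimizer_ckpt_path, map_location=device))
    #     trainer.run(train_loader, max_epochs=epochs)
    #
    # where model_ckpt_path / optimizer_ckpt_path are hypothetical paths derived from restore_from.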
Example #2
    def fit(self, model, optimizer, train_loader, val_loader, epochs,
            batch_size):
        print('starting fit()')
        # self.criterion = nn.CrossEntropyLoss()
        if torch.cuda.is_available():
            # print('using model.cuda()')
            torch.cuda.set_device(self.device)  # select the GPU before moving the model
            model.cuda()

        ### print('$%$%$%$ in fit(), calling summary() with model.input_shape =', model.input_shape)
        ### summary(model, model.input_shape, device=self.device_str)
        trainer = create_supervised_trainer(model,
                                            optimizer,
                                            self.criterion,
                                            device=self.device_str)
        metrics = {
            'accuracy': Accuracy(),
            'nll': Loss(self.criterion),
            'cm': ConfusionMatrix(num_classes=len(self.classLabels))
        }
        training_history = {'accuracy': [], 'loss': []}
        validation_history = {'accuracy': [], 'loss': []}
        # last_epoch = []
        evaluator = create_supervised_evaluator(model,
                                                metrics=metrics,
                                                device=self.device_str)

        def score_function(engine):
            # EarlyStopping treats higher scores as better, so return the negated loss
            val_loss = engine.state.metrics['nll']
            return -val_loss

        if self.torch_patience > 0:
            early_stopping = EarlyStopping(patience=self.torch_patience,
                                           score_function=score_function,
                                           trainer=trainer)
            evaluator.add_event_handler(Events.COMPLETED, early_stopping)

        # self.writer = SummaryWriter(log_dir=self.weight_dir)

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_training_results(trainer):
            model.eval()
            # print('# running log_training_results(trainer):')
            evaluator.run(train_loader)
            metrics = evaluator.state.metrics
            loss = metrics['nll']
            accuracy = metrics['accuracy']
            # self.writer.add_scalar('Loss/train', loss, self.n_iter)
            # self.writer.add_scalar('Accuracy/train', accuracy, self.n_iter)
            # last_epoch.append(0)
            training_history['accuracy'].append(accuracy)
            training_history['loss'].append(loss)
            print(
                "Training - Epoch: {}  Avg accuracy: {:.4f} Avg loss: {:.4f}".
                format(trainer.state.epoch, accuracy, loss))
            model.train()

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_validation_results(trainer):
            model.eval()
            # print('# running log_validation_results(trainer):')
            evaluator.run(val_loader)
            metrics = evaluator.state.metrics
            loss = metrics['nll']
            accuracy = metrics['accuracy']
            # self.writer.add_scalar('Loss/test', loss, self.n_iter)
            # self.writer.add_scalar('Accuracy/test', accuracy, self.n_iter)
            validation_history['accuracy'].append(accuracy)
            validation_history['loss'].append(loss)
            print(
                "Validation Results - Epoch: {}  Avg val accuracy: {:.4f} Avg val loss: {:.4f}"
                .format(trainer.state.epoch, accuracy, loss))
            # save the model with the best accuracy
            if self.best_accuracy < accuracy:
                if not isdir(self.weight_dir):
                    mkdir(self.weight_dir)
                torch.save(model.state_dict(), self.weight_path_best)
                print('--> At Epoch: ',
                      trainer.state.epoch,
                      ', saved to ',
                      self.weight_path_best,
                      sep='')
                self.model = model
                self.best_accuracy = accuracy
            model.train()

        checkpointer = ModelCheckpoint(self.weight_dir,
                                       'modelCheckpoint',
                                       save_interval=1,
                                       n_saved=2,
                                       create_dir=True,
                                       save_as_state_dict=True,
                                       require_empty=False)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpointer,
                                  {'epoch': model})

        # self.n_iter = 0
        print('before trainer.run()')
        trainer.run(train_loader, max_epochs=self.deep_epochs)
def train():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="openai-gpt",
                        help="Path, url or short name of the model")
    parser.add_argument("--num_candidates",
                        type=int,
                        default=2,
                        help="Number of candidates for training")
    parser.add_argument("--max_history",
                        type=int,
                        default=2,
                        help="Number of previous exchanges to keep in history")
    parser.add_argument("--train_batch_size",
                        type=int,
                        default=4,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size",
                        type=int,
                        default=4,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr",
                        type=float,
                        default=6.25e-5,
                        help="Learning rate")
    parser.add_argument("--lm_coef",
                        type=float,
                        default=1.0,
                        help="LM loss coefficient")
    parser.add_argument("--mc_coef",
                        type=float,
                        default=1.0,
                        help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm",
                        type=float,
                        default=1.0,
                        help="Clipping gradient norm")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=3,
                        help="Number of training epochs")
    parser.add_argument("--personality_permutations",
                        type=int,
                        default=1,
                        help="Number of permutations of personality sentences")
    parser.add_argument(
        "--eval_before_start",
        action='store_true',
        help="If true start with a first evaluation before training")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument(
        "--fp16",
        type=str,
        default="",
        help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="Local rank for distributed training (-1: not distributed)")
    parser.add_argument(
        "--data_faiss",
        type=str,
        default="data_persona_faiss_fase1_opcion4",
        help="List of the personalities selected with faiss according to the selected strategy")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Running process %d", args.local_rank
    )  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer  # can't use AutoTokenizer because the checkpoint could be a Path
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)

    model_class = GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    # Add special tokens if they are not already added
    add_special_tokens_(model, tokenizer)
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        args, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
        output_loss = model(input_ids,
                            token_type_ids=token_type_ids,
                            mc_token_ids=mc_token_ids,
                            mc_labels=mc_labels,
                            labels=lm_labels)
        loss = (output_loss.loss * args.lm_coef + output_loss.mc_loss *
                args.mc_coef) / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(
                input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            #logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            # if we don't send labels to the model, it doesn't return losses
            output_gpt = model(
                input_ids,
                token_type_ids=token_type_ids,
                mc_token_ids=mc_token_ids,
            )
            lm_logits_flat_shifted = output_gpt.logits[
                ..., :-1, :].contiguous().view(-1, output_gpt.logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted,
                    output_gpt.mc_logits), (lm_labels_flat_shifted, mc_labels)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr),
                                 (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-100),
             output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy":
        Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))
    }
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args),
        "average_accuracy":
        MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)
    })
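    # average_ppl below: perplexity is the exponential of the average negative log-likelihood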
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.model_checkpoint)
        tb_logger = TensorboardLogger(log_dir)

        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(
            evaluator,
            log_handler=OutputHandler(
                tag="validation",
                metric_names=list(metrics.keys()),
                global_step_transform=global_step_from_engine(trainer)),
            event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(log_dir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=3)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)
             })  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        getattr(model, 'module',
                model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
        tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(
            os.path.join(log_dir, checkpoint_handler._saved[-1][1]),
            os.path.join(log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
Example #4
    def _start(self):
        """Method to run the task
        """
        if 'cuda' in self.device:
            self.model = self.model.to(self.device)

        mlflow.log_param("model", get_object_name(self.model))

        self.logger.debug("Setup criterion")
        if "cuda" in self.device:
            self.criterion = self.criterion.to(self.device)

        mlflow.log_param("criterion", get_object_name(self.criterion))
        mlflow.log_param("optimizer", get_object_name(self.optimizer))

        self.logger.debug("Setup ignite trainer")
        trainer = self._setup_trainer()
        self._setup_trainer_handlers(trainer)

        metrics = {'loss': Loss(self.criterion)}
        metrics.update(self.metrics)

        self.logger.debug("Input data info: ")
        msg = "- train data loader: {} number of batches".format(
            len(self.train_dataloader))
        if isinstance(self.train_dataloader, DataLoader):
            msg += " | {} number of samples".format(
                len(self.train_dataloader.sampler))
        self.logger.debug(msg)

        if isinstance(self.train_dataloader, DataLoader):
            write_model_graph(self.writer,
                              model=self.model,
                              data_loader=self.train_dataloader,
                              device=self.device)

        self.pbar_eval = None
        if self.train_eval_dataloader is not None:
            self.pbar_eval = ProgressBar()
            self._setup_offline_train_metrics_computation(trainer, metrics)

        if self.val_dataloader is not None:
            if self.val_metrics is None:
                self.val_metrics = metrics

            if self.pbar_eval is None:
                self.pbar_eval = ProgressBar()

            val_evaluator = self._setup_val_metrics_computation(trainer)

            if self.reduce_lr_on_plateau is not None:
                assert self.reduce_lr_on_plateau_var in self.val_metrics, \
                    "Monitor variable {} is not found in metrics {}" \
                    .format(self.reduce_lr_on_plateau_var, metrics)

                @val_evaluator.on(Events.COMPLETED)
                def update_reduce_on_plateau(engine):
                    val_var = engine.state.metrics[
                        self.reduce_lr_on_plateau_var]
                    self.reduce_lr_on_plateau.step(val_var)

            def default_score_function(engine):
                val_loss = engine.state.metrics['loss']
                # Objects with highest scores will be retained.
                return -val_loss

            # Setup early stopping:
            if self.early_stopping_kwargs is not None:
                if 'score_function' in self.early_stopping_kwargs:
                    es_score_function = self.early_stopping_kwargs[
                        'score_function']
                else:
                    es_score_function = default_score_function
                self._setup_early_stopping(trainer, val_evaluator,
                                           es_score_function)

            # Setup model checkpoint:
            if self.model_checkpoint_kwargs is None:
                self.model_checkpoint_kwargs = {
                    "filename_prefix": "model",
                    "score_name": "val_loss",
                    "score_function": default_score_function,
                    "n_saved": 3,
                    "atomic": True,
                    "create_dir": True,
                    "save_as_state_dict": True
                }
            self._setup_best_model_checkpointing(val_evaluator)

        self.logger.debug("Setup other handlers")

        if self.lr_scheduler is not None:

            @trainer.on(Events.ITERATION_STARTED)
            def update_lr_scheduler(engine):
                self.lr_scheduler.step()

        self._setup_log_learning_rate(trainer)

        self.logger.info("Start training: {} epochs".format(self.num_epochs))
        mlflow.log_param("num_epochs", self.num_epochs)
        trainer.run(self.train_dataloader, max_epochs=self.num_epochs)
        self.logger.debug("Training is ended")
Example #5
def training(local_rank, config):

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = f"stop-on-{config['stop_iteration']}"

        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_path']}")

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

        if config["with_clearml"]:
            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            task = Task.init("CIFAR10-Training", task_name=output_path.stem)
            task.connect_configuration(config)
            # Log hyper parameters
            hyper_params = [
                "model",
                "batch_size",
                "momentum",
                "weight_decay",
                "num_epochs",
                "learning_rate",
                "num_warmup_epochs",
            ]
            task.connect({k: config[k] for k in hyper_params})

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler, train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "Accuracy": Accuracy(),
        "Loss": Loss(criterion),
    }

    # We define two evaluators as they won't have exactly the same role:
    # - `evaluator` will save the best model based on validation score
    # - `train_evaluator` only computes metrics on the training set
    evaluator = create_evaluator(model, metrics=metrics, config=config)
    train_evaluator = create_evaluator(model, metrics=metrics, config=config)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators)

    # Store 2 best models by validation accuracy starting from num_epochs / 2:
    best_model_handler = Checkpoint(
        {"model": model},
        get_save_handler(config),
        filename_prefix="best",
        n_saved=2,
        global_step_transform=global_step_from_engine(trainer),
        score_name="test_accuracy",
        score_function=Checkpoint.get_default_score_fn("Accuracy"),
    )
    evaluator.add_event_handler(
        Events.COMPLETED(lambda *_: trainer.state.epoch > config["num_epochs"] // 2), best_model_handler
    )

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info(f"Stop training on {trainer.state.iteration} iteration")
            trainer.terminate()

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        logger.exception("")
        raise e

    if rank == 0:
        tb_logger.close()
Example #6
def run_classification(model,
                       train_loader,
                       val_loader,
                       epochs,
                       early_stopping,
                       lr,
                       momentum,
                       log_interval,
                       experiment_name,
                       continueing=False):
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda'

    writer = SummaryWriter(here / f'tensorboard/runs_{experiment_name}')
    data_loader_iter = iter(train_loader)
    x, y = next(data_loader_iter)
    writer.add_graph(model, x)

    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    #optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    start_epoch = 0
    start_best_accuracy = 0.0
    if continueing:
        model, optimizer, start_epoch, start_best_accuracy = _load_checkpoint(
            model, optimizer, experiment_name)
        model.train()  # In case the model was saved after a test loop where model.eval() was called

    evaluator = create_supervised_evaluator(model,
                                            device=device,
                                            metrics={
                                                'accuracy': Accuracy(),
                                                'nll': Loss(F.nll_loss)
                                            })
    evaluator_val = create_supervised_evaluator(model,
                                                device=device,
                                                metrics={
                                                    'accuracy': Accuracy(),
                                                    'nll': Loss(F.nll_loss)
                                                })
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        nn.NLLLoss(),
                                        device=device)

    desc = 'ITERATION - loss: {:.4f}'
    progress_bar = tqdm(initial=0,
                        leave=False,
                        total=len(train_loader),
                        desc=desc.format(0))

    @trainer.on(Events.STARTED)
    def init(engine):
        engine.state.epoch = start_epoch
        engine.state.best_accuracy = start_best_accuracy

    # One iteration = one batch
    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            progress_bar.desc = desc.format(engine.state.output)
            progress_bar.update(log_interval)
            writer.add_scalar('training/loss', engine.state.output,
                              engine.state.iteration)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_gradients(engine):
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            for n, p in model.named_parameters():
                if p.requires_grad:
                    writer.add_scalar(f'{n}/gradient',
                                      p.grad.abs().mean(),
                                      engine.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        progress_bar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        logger.info(
            'Training Results - Epoch: {}  Avg accuracy: {:.4f} Avg loss: {:.4f}'
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        writer.add_scalar('training/avg_loss', avg_nll, engine.state.epoch)
        writer.add_scalar('training/avg_accuracy', avg_accuracy,
                          engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results_and_save(engine):
        evaluator_val.run(val_loader)
        metrics = evaluator_val.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        logger.info(
            'Validation Results - Epoch: {}  Avg accuracy: {:.4f} Avg loss: {:.4f}'
            .format(engine.state.epoch, avg_accuracy, avg_nll))

        progress_bar.n = progress_bar.last_print_n = 0
        writer.add_scalar('validation/avg_loss', avg_nll, engine.state.epoch)
        writer.add_scalar('validation/avg_accuracy', avg_accuracy,
                          engine.state.epoch)

        # Save the model every epoch. If it's the best seen so far, save it separately
        torch.save(
            {
                'epoch': engine.state.epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'accuracy': avg_accuracy,
                'best_accuracy': engine.state.best_accuracy,
                'loss': avg_nll
            }, f'model_latest_{experiment_name}.pt')
        if avg_accuracy > engine.state.best_accuracy:
            engine.state.best_accuracy = avg_accuracy
            shutil.copyfile(f'model_latest_{experiment_name}.pt',
                            f'model_best_{experiment_name}.pt')

    # Early stopping
    handler = EarlyStopping(
        patience=early_stopping,
        score_function=(lambda engine: -evaluator_val.state.metrics['nll']),
        trainer=trainer)
    evaluator_val.add_event_handler(Events.COMPLETED, handler)

    trainer.run(train_loader, max_epochs=epochs)
    progress_bar.close()
    writer.close()
Example #7
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(), output_transform=lambda x: (x[0][0], x[1][0]))}
    metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args)})
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args, args.model_checkpoint)
        tb_logger = TensorboardLogger(log_dir)

        tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED)
Example #8
    def __init__(self,
                 optimizer: OptimizerType,
                 train_loader: DataLoaderType,
                 model: torch.nn.Module,
                 train_engine: Optional[ignite.engine.Engine] = None,
                 test_engine: Optional[ignite.engine.Engine] = None,
                 test_loader: Optional[DataLoaderType] = None,
                 loss_fn: Optional[LossFnType] = None,
                 eval_metric: Optional[ignite.metrics.Metric] = None,
                 descending: bool = True,
                 device: str = 'cuda') -> None:

        super().__init__()
        self.descending = descending
        self.optimizer: OptimizerType = optimizer
        self.model: Optional[torch.nn.Module] = model
        self.train_engine: ignite.engine.Engine
        self.train_loader: DataLoaderType = train_loader
        self.test_loader: Optional[DataLoaderType] = test_loader
        self.test_engine: Optional[ignite.engine.Engine]

        # create the train engine if necessary
        # if so, build it from  the model and loss_fn
        if train_engine is None and model is None:
            raise TypeError('either train_engine or model has to be provided')
        if train_engine is not None:
            self.train_engine = train_engine  # directly use it
        elif model is not None:
            if loss_fn is None:
                raise TypeError(
                    'loss_fn has to be provided if passing a plain pytorch model'
                )
            self.train_engine = ignite.engine.create_supervised_trainer(
                model,
                optimizer,
                loss_fn=loss_fn,
                device=device,
                non_blocking=True)

        # get the metric to use
        new_metric = None
        if eval_metric is not None:
            new_metric = eval_metric
        elif loss_fn is not None:
            # use the given eval_metric if provided, but fallback
            # to using the loss averaged over the entire epoch
            new_metric = Loss(loss_fn)

        # if the test loader is present, then we need an engine for evaluation
        if test_loader is not None:
            # test engine is needed only if we have a test loader
            if test_engine is None:
                if eval_metric is None:
                    if loss_fn is None:
                        # error if no metric or loss_fn
                        raise TypeError(
                            'loss_fn has to be provided if using the default evaluator and not '
                            'providing a metric')
                if model is None:
                    raise TypeError(
                        'model must be provided if using the default evaluator'
                    )

                # create a default test engine
                self.test_engine = ignite.engine.create_supervised_evaluator(
                    model,
                    metrics={'loss': new_metric},
                    device=device,
                    non_blocking=True)
            else:
                self.test_engine = test_engine  # use the specified engine
                # attach a new metric if present
                if new_metric is not None:
                    new_metric.attach(self.test_engine, 'loss')
        else:
            self.test_engine = None  # no need for a test engine if no test loader specified
Example #9
def run(
    train_batch_size,
    val_batch_size,
    epochs,
    lr,
    momentum,
    log_interval,
    log_dir,
    checkpoint_every,
    resume_from,
    crash_iteration=-1,
    deterministic=False,
):
    # Setup seed to have same model's initialization:
    manual_seed(75)

    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)
    model = Net()
    writer = SummaryWriter(log_dir=log_dir)
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    criterion = nn.NLLLoss()
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    lr_scheduler = StepLR(optimizer, step_size=1, gamma=0.5)

    # Setup trainer and evaluator
    if deterministic:
        tqdm.write("Setup deterministic trainer")
    trainer = create_supervised_trainer(
        model, optimizer, criterion, device=device, deterministic=deterministic
    )

    evaluator = create_supervised_evaluator(
        model, metrics={"accuracy": Accuracy(), "nll": Loss(criterion)}, device=device
    )

    # Apply learning rate scheduling
    @trainer.on(Events.EPOCH_COMPLETED)
    def lr_step(engine):
        lr_scheduler.step()

    pbar = tqdm(
        initial=0,
        leave=False,
        total=len(train_loader),
        desc=f"Epoch {0} - loss: {0:.4f} - lr: {lr:.4f}",
    )

    @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
    def log_training_loss(engine):
        lr_ = optimizer.param_groups[0]["lr"]
        pbar.desc = f"Epoch {engine.state.epoch} - loss: {engine.state.output:.4f} - lr: {lr_:.4f}"
        pbar.update(log_interval)
        writer.add_scalar("training/loss", engine.state.output, engine.state.iteration)
        writer.add_scalar("lr", lr_, engine.state.iteration)

    if crash_iteration > 0:

        @trainer.on(Events.ITERATION_COMPLETED(once=crash_iteration))
        def _(engine):
            raise Exception(f"STOP at {engine.state.iteration}")

    if resume_from is not None:

        @trainer.on(Events.STARTED)
        def _(engine):
            pbar.n = engine.state.iteration % engine.state.epoch_length

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            f"Training Results - Epoch: {engine.state.epoch} Avg accuracy: {avg_accuracy:.2f} Avg loss: {avg_nll:.2f}"
        )
        writer.add_scalar("training/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("training/avg_accuracy", avg_accuracy, engine.state.epoch)

    # Compute and log validation metrics
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            f"Validation Results - Epoch: {engine.state.epoch} Avg accuracy: {avg_accuracy:.2f} Avg loss: {avg_nll:.2f}"
        )
        pbar.n = pbar.last_print_n = 0
        writer.add_scalar("valdation/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("valdation/avg_accuracy", avg_accuracy, engine.state.epoch)

    # Setup object to checkpoint
    objects_to_checkpoint = {
        "trainer": trainer,
        "model": model,
        "optimizer": optimizer,
        "lr_scheduler": lr_scheduler,
    }
    training_checkpoint = Checkpoint(
        to_save=objects_to_checkpoint,
        save_handler=DiskSaver(log_dir, require_empty=False),
        n_saved=None,
        global_step_transform=lambda *_: trainer.state.epoch,
    )
    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=checkpoint_every), training_checkpoint)

    # Setup logger to print and dump into file: model weights, model grads and data stats
    # - first 3 iterations
    # - 4 iterations after checkpointing
    # This helps to compare resumed training with checkpointed training
    def log_event_filter(e_, event):
        if event in [1, 2, 3]:
            return True
        elif 0 <= (event % (checkpoint_every * e_.state.epoch_length)) < 5:
            return True
        return False

    fp = Path(log_dir) / ("run.log" if resume_from is None else "resume_run.log")
    fp = fp.as_posix()
    for h in [log_data_stats, log_model_weights, log_model_grads]:
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED(event_filter=log_event_filter), h, model=model, fp=fp
        )

    if resume_from is not None:
        tqdm.write(f"Resume from the checkpoint: {resume_from}")
        checkpoint = torch.load(resume_from)
        Checkpoint.load_objects(to_load=objects_to_checkpoint, checkpoint=checkpoint)
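        # restores trainer state (epoch/iteration), model and optimizer weights, and the lr_scheduler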

    try:
        # Synchronize random states
        manual_seed(15)
        trainer.run(train_loader, max_epochs=epochs)
    except Exception as e:
        import traceback

        print(traceback.format_exc())

    pbar.close()
    writer.close()
def run(*options, cfg=None, debug=False):
    """Run training and validation of model

    Notes:
        Options can be passed in via the options argument and loaded from the cfg file
        Options from default.py will be overridden by options loaded from the cfg file
        Options passed in via the options argument will override options loaded from the cfg file

    Args:
        *options (str, int, optional): Options used to override what is loaded from the
                                       config. To see what options are available consult
                                       default.py
        cfg (str, optional): Location of config file to load. Defaults to None.
        debug (bool): Places scripts in debug/test mode and only executes a few iterations
    """
    # Configuration:
    update_config(config, options=options, config_file=cfg)

    # The model will be saved under: outputs/<config_file_name>/<model_dir>
    config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0]
    try:
        output_dir = generate_path(
            config.OUTPUT_DIR, git_branch(), git_hash(), config_file_name, config.TRAIN.MODEL_DIR, current_datetime(),
        )
    except:
        output_dir = generate_path(config.OUTPUT_DIR, config_file_name, config.TRAIN.MODEL_DIR, current_datetime(),)

    # Logging:
    load_log_configuration(config.LOG_CONFIG)
    logger = logging.getLogger(__name__)
    logger.debug(config.WORKERS)

    # Set CUDNN benchmark mode:
    torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK


    # Fix random seeds:
    torch.manual_seed(config.SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.SEED)
    np.random.seed(seed=config.SEED)

    # Augmentation:
    basic_aug = Compose(
        [
            Normalize(mean=(config.TRAIN.MEAN,), std=(config.TRAIN.STD,), max_pixel_value=1),
            PadIfNeeded(
                min_height=config.TRAIN.PATCH_SIZE,
                min_width=config.TRAIN.PATCH_SIZE,
                border_mode=config.OPENCV_BORDER_CONSTANT,
                always_apply=True,
                mask_value=255,
                value=0,
            ),
            Resize(
                config.TRAIN.AUGMENTATIONS.RESIZE.HEIGHT, config.TRAIN.AUGMENTATIONS.RESIZE.WIDTH, always_apply=True,
            ),
            PadIfNeeded(
                min_height=config.TRAIN.AUGMENTATIONS.PAD.HEIGHT,
                min_width=config.TRAIN.AUGMENTATIONS.PAD.WIDTH,
                border_mode=config.OPENCV_BORDER_CONSTANT,
                always_apply=True,
                mask_value=255,
            ),
        ]
    )
    if config.TRAIN.AUGMENTATION:
        train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)])
        val_aug = basic_aug
    else:
        train_aug = val_aug = basic_aug

    # Training and Validation Loaders:
    TrainPatchLoader = get_patch_loader(config)
    logging.info(f"Using {TrainPatchLoader}")
    train_set = TrainPatchLoader(
        config.DATASET.ROOT,
        config.DATASET.NUM_CLASSES,
        split="train",
        is_transform=True,
        stride=config.TRAIN.STRIDE,
        patch_size=config.TRAIN.PATCH_SIZE,
        augmentations=train_aug,
        debug=debug,
    )
    logger.info(train_set)
    n_classes = train_set.n_classes
    val_set = TrainPatchLoader(
        config.DATASET.ROOT,
        config.DATASET.NUM_CLASSES,
        split="val",
        is_transform=True,
        stride=config.TRAIN.STRIDE,
        patch_size=config.TRAIN.PATCH_SIZE,
        augmentations=val_aug,
        debug=debug,
    )
    logger.info(val_set)

    if debug:
        logger.info("Running in debug mode..")
        train_set = data.Subset(train_set, range(config.TRAIN.BATCH_SIZE_PER_GPU * config.NUM_DEBUG_BATCHES))
        val_set = data.Subset(val_set, range(config.VALIDATION.BATCH_SIZE_PER_GPU))

    train_loader = data.DataLoader(
        train_set, batch_size=config.TRAIN.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, shuffle=True
    )
    val_loader = data.DataLoader(
        val_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, num_workers=1
    )  # config.WORKERS)

    # Model:
    model = getattr(models, config.MODEL.NAME).get_seg_model(config)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # Optimizer and LR Scheduler:
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=config.TRAIN.MAX_LR,
        momentum=config.TRAIN.MOMENTUM,
        weight_decay=config.TRAIN.WEIGHT_DECAY,
    )

    epochs_per_cycle = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS
    snapshot_duration = epochs_per_cycle * len(train_loader) if not debug else 2 * len(train_loader)
    scheduler = CosineAnnealingScheduler(
        optimizer, "lr", config.TRAIN.MAX_LR, config.TRAIN.MIN_LR, cycle_size=snapshot_duration
    )

    # Tensorboard writer:
    summary_writer = create_summary_writer(log_dir=path.join(output_dir, "logs"))

    # class weights are inversely proportional to the frequency of the classes in the training set
    class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False)
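    # (Illustrative sketch, not part of the original config): weights like these could be
    # derived from per-class pixel counts, e.g. with a hypothetical config.DATASET.CLASS_COUNTS:
    #   counts = torch.tensor(config.DATASET.CLASS_COUNTS, dtype=torch.float, device=device)
    #   class_weights = (1.0 / counts) / (1.0 / counts).sum()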

    # Loss:
    criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=255, reduction="mean")
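    # Note: ignore_index=255 matches the mask_value=255 used by PadIfNeeded above, so padded
    # pixels are excluded from the loss.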

    # Ignite trainer and evaluator:
    trainer = create_supervised_trainer(model, optimizer, criterion, prepare_batch, device=device)
    transform_fn = lambda output_dict: (output_dict["y_pred"].squeeze(), output_dict["mask"].squeeze())
    evaluator = create_supervised_evaluator(
        model,
        prepare_batch,
        metrics={
            "nll": Loss(criterion, output_transform=transform_fn),
            "pixacc": pixelwise_accuracy(n_classes, output_transform=transform_fn, device=device),
            "cacc": class_accuracy(n_classes, output_transform=transform_fn),
            "mca": mean_class_accuracy(n_classes, output_transform=transform_fn),
            "ciou": class_iou(n_classes, output_transform=transform_fn),
            "mIoU": mean_iou(n_classes, output_transform=transform_fn),
        },
        device=device,
    )
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Logging:
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED, logging_handlers.log_training_output(log_interval=config.PRINT_FREQ),
    )
    trainer.add_event_handler(Events.EPOCH_COMPLETED, logging_handlers.log_lr(optimizer))

    # Tensorboard and Logging:
    trainer.add_event_handler(Events.ITERATION_COMPLETED, tensorboard_handlers.log_training_output(summary_writer))
    trainer.add_event_handler(Events.ITERATION_COMPLETED, tensorboard_handlers.log_validation_output(summary_writer))

    # add specific logger which also triggers printed metrics on training set
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        evaluator.run(train_loader)
        tensorboard_handlers.log_results(engine, evaluator, summary_writer, n_classes, stage="Training")
        logging_handlers.log_metrics(engine, evaluator, stage="Training")

    # add specific logger which also triggers printed metrics on validation set
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        tensorboard_handlers.log_results(engine, evaluator, summary_writer, n_classes, stage="Validation")
        logging_handlers.log_metrics(engine, evaluator, stage="Validation")
        # dump validation set metrics at the very end for debugging purposes
        if engine.state.epoch == config.TRAIN.END_EPOCH and debug:
            fname = f"metrics_{config_file_name}_{config.TRAIN.MODEL_DIR}.json"
            metrics = evaluator.state.metrics
            out_dict = {x: metrics[x] for x in ["nll", "pixacc", "mca", "mIoU"]}
            with open(fname, "w") as fid:
                json.dump(out_dict, fid)
            log_msg = " ".join(f"{k}: {out_dict[k]}" for k in out_dict.keys())
            logging.info(log_msg)

    # Checkpointing: snapshotting trained models to disk
    checkpoint_handler = SnapshotHandler(
        output_dir,
        config.MODEL.NAME,
        extract_metric_from("mIoU"),
        lambda: (trainer.state.iteration % snapshot_duration) == 0,
    )
    evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {"model": model})

    logger.info("Starting training")
    trainer.run(train_loader, max_epochs=config.TRAIN.END_EPOCH, epoch_length=len(train_loader), seed=config.SEED)

    summary_writer.close()
Example #11
model = models.alexnet(pretrained=True)
model = model.to(device)
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# --------------------------------- Training ----------------------------------
# Set up pytorch-ignite trainer and evaluator.
trainer = create_supervised_trainer(
    model,
    optimizer,
    loss_func,
    device=device,
)
metrics = {
    "accuracy": Accuracy(),
    "loss": Loss(loss_func),
}
evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)


@trainer.on(Events.ITERATION_COMPLETED(every=print_every))
def log_batch(trainer):
    batch = (trainer.state.iteration - 1) % trainer.state.epoch_length + 1
    print(f"Epoch {trainer.state.epoch} / {num_epochs}, "
          f"batch {batch} / {trainer.state.epoch_length}: "
          f"loss: {trainer.state.output:.3f}")


@trainer.on(Events.EPOCH_COMPLETED)
def log_epoch(trainer):
    print(f"Epoch {trainer.state.epoch} / {num_epochs} average results: ")
Example #12
    if args.loss_fn == "MSE":
        loss_fn = nn.MSELoss(reduction='sum').to(device)
        print("use MSELoss")
    elif args.loss_fn == "L1":
        loss_fn = nn.L1Loss(reduction='sum').to(device)
        print("use L1Loss")

    optimizer = torch.optim.Adam(model.parameters(), args.lr,
                                weight_decay=args.decay)

    trainer = create_supervised_trainer(model, optimizer, loss_fn, device=device)
    evaluator_train = create_supervised_evaluator(model,
                                            metrics={
                                                'mae': CrowdCountingMeanAbsoluteError(),
                                                'mse': CrowdCountingMeanSquaredError(),
                                                'loss': Loss(loss_fn)
                                            }, device=device)

    evaluator_validate = create_supervised_evaluator(model,
                                            metrics={
                                                'mae': CrowdCountingMeanAbsoluteError(),
                                                'mse': CrowdCountingMeanSquaredError(),
                                                'loss': Loss(loss_fn)
                                            }, device=device)
    print(model)

    print(args)


    # timer
    train_timer = Timer(average=True)  # time to train whole epoch
Example #13

def test_step(engine, batch):
    global model, g
    with th.no_grad():
        model.eval()
        model = model.to(gpu)
        g = g.to(gpu)
        (idx, ) = [x.to(gpu) for x in batch]
        y_pred = model(g, idx)
        y_true = g.ndata['label'][idx]
        return y_pred, y_true


evaluator = Engine(test_step)
metrics = {'acc': Accuracy(), 'nll': Loss(th.nn.NLLLoss())}
for n, f in metrics.items():
    f.attach(evaluator, n)


@trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(trainer):
    evaluator.run(idx_loader_train)
    metrics = evaluator.state.metrics
    train_acc, train_nll = metrics["acc"], metrics["nll"]
    evaluator.run(idx_loader_val)
    metrics = evaluator.state.metrics
    val_acc, val_nll = metrics["acc"], metrics["nll"]
    evaluator.run(idx_loader_test)
    metrics = evaluator.state.metrics
    test_acc, test_nll = metrics["acc"], metrics["nll"]
Example #14
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval):
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        F.nll_loss,
                                        device=device)
    trainer.logger = setup_logger("trainer")
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                "accuracy": Accuracy(),
                                                "nll": Loss(F.nll_loss)
                                            },
                                            device=device)
    evaluator.logger = setup_logger("evaluator")

    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0,
                leave=False,
                total=len(train_loader),
                desc=desc.format(0))

    @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
    def log_training_loss(engine):
        pbar.desc = desc.format(engine.state.output)
        pbar.update(log_interval)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            "Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            "Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))

        pbar.n = pbar.last_print_n = 0

    @trainer.on(Events.EPOCH_COMPLETED | Events.COMPLETED)
    def log_time(engine):
        tqdm.write("{} took {} seconds".format(
            trainer.last_event_name.name,
            trainer.state.times[trainer.last_event_name.name]))

    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
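

# Hedged usage sketch (not part of the original): how the `run` function above might be
# invoked; the hyper-parameter values below are illustrative only.
if __name__ == "__main__":
    run(train_batch_size=64, val_batch_size=1000, epochs=10,
        lr=0.01, momentum=0.5, log_interval=10)
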
def run(tb, vb, lr, epochs, writer):
    device = os.environ['main-device']
    logging.info('Training program start!')
    logging.info('Configuration:')
    logging.info('\n' + json.dumps(INFO, indent=2))

    # ------------------------------------
    # 1. Define dataloader
    train_loader, train4val_loader, val_loader, num_of_images, mapping = get_dataloaders(
        tb, vb)
    # train_loader, train4val_loader, val_loader, num_of_images = get_dataloaders(tb, vb)
    weights = (1 / num_of_images) / ((1 / num_of_images).sum().item())
    # weights = (1/num_of_images)/(1/num_of_images + 1/(num_of_images.sum().item()-num_of_images))
    weights = weights.to(device=device)

    # ------------------------------------
    # 2. Define model
    model = EfficientNet.from_pretrained(
        'efficientnet-b3', num_classes=INFO['dataset-info']['num-of-classes'])
    model = carrier(model)

    # ------------------------------------
    # 3. Define optimizer
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)
    ignite_scheduler = LRScheduler(scheduler)

    # ------------------------------------
    # 4. Define metrics

    class DOCLoss(nn.Module):
        def __init__(self, weight):
            super(DOCLoss, self).__init__()
            self.class_weights = weight

        def forward(self, input, target):
            sigmoid = 1 - 1 / (1 + torch.exp(-input))
            sigmoid[range(0, sigmoid.shape[0]),
                    target] = 1 - sigmoid[range(0, sigmoid.shape[0]), target]
            sigmoid = torch.log(sigmoid)
            if self.class_weights is not None:
                loss = -torch.sum(sigmoid * self.class_weights)
            else:
                loss = -torch.sum(sigmoid)
            return loss
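
    # Note (not in the original): with class weights w and integer targets t, the forward
    # above computes the one-vs-rest "Deep Open Classification" sigmoid loss
    #     L = -sum_i [ w_{t_i} * log(sigmoid(x_{i, t_i})) + sum_{j != t_i} w_j * log(1 - sigmoid(x_{i, j})) ]
    # summed (not averaged) over the batch.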

    train_metrics = {
        'accuracy':
        Accuracy(),
        'loss':
        Loss(DOCLoss(weight=weights)),
        'precision_recall':
        MetricsLambda(PrecisionRecallTable, Precision(), Recall(),
                      train_loader.dataset.classes),
        'cmatrix':
        MetricsLambda(CMatrixTable,
                      ConfusionMatrix(INFO['dataset-info']['num-of-classes']),
                      train_loader.dataset.classes)
    }

    def val_pred_transform(output):
        y_pred, y = output
        new_y_pred = torch.zeros(
            (y_pred.shape[0],
             INFO['dataset-info']['num-of-classes'] + 1)).to(device=device)
        for ind, c in enumerate(train_loader.dataset.classes):
            new_col = val_loader.dataset.class_to_idx[c]
            new_y_pred[:, new_col] += y_pred[:, ind]
        ukn_ind = val_loader.dataset.class_to_idx['UNKNOWN']
        import math
        new_y_pred[:, ukn_ind] = -math.inf
        return new_y_pred, y

    val_metrics = {
        'accuracy':
        Accuracy(),
        'precision_recall':
        MetricsLambda(PrecisionRecallTable, Precision(val_pred_transform),
                      Recall(val_pred_transform), val_loader.dataset.classes),
        'cmatrix':
        MetricsLambda(
            CMatrixTable,
            ConfusionMatrix(INFO['dataset-info']['num-of-classes'] + 1,
                            output_transform=val_pred_transform),
            val_loader.dataset.classes)
    }

    # ------------------------------------
    # 5. Create trainer
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        DOCLoss(weight=weights),
                                        device=device)

    # ------------------------------------
    # 6. Create evaluator
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=train_metrics,
                                                  device=device)
    val_evaluator = create_supervised_evaluator(model,
                                                metrics=val_metrics,
                                                device=device)

    desc = 'ITERATION - loss: {:.4f}'
    pbar = tqdm(initial=0,
                leave=False,
                total=len(train_loader),
                desc=desc.format(0))

    # ------------------------------------
    # 7. Create event hooks

    # Update process bar on each iteration completed.
    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        log_interval = 1
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(log_interval)

    # Compute metrics on train data on each epoch completed.
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        print('Checking on training set.')
        train_evaluator.run(train4val_loader)
        metrics = train_evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_loss = metrics['loss']
        precision_recall = metrics['precision_recall']
        cmatrix = metrics['cmatrix']
        prompt = """
      Training Results - Epoch: {}
      Avg accuracy: {:.4f}
      Avg loss: {:.4f}
      precision_recall: \n{}
      confusion matrix: \n{}
      """.format(engine.state.epoch, avg_accuracy, avg_loss,
                 precision_recall['pretty'], cmatrix['pretty'])
        tqdm.write(prompt)
        logging.info('\n' + prompt)
        writer.add_text(os.environ['run-id'], prompt, engine.state.epoch)
        writer.add_scalars('Aggregate/Acc', {'Train Acc': avg_accuracy},
                           engine.state.epoch)
        writer.add_scalars('Aggregate/Loss', {'Train Loss': avg_loss},
                           engine.state.epoch)

    # Compute metrics on val data on each epoch completed.
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        print('Checking on validation set.')
        val_evaluator.run(val_loader)
        metrics = val_evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        precision_recall = metrics['precision_recall']
        cmatrix = metrics['cmatrix']
        prompt = """
      Validating Results - Epoch: {}
      Avg accuracy: {:.4f}
      precision_recall: \n{}
      confusion matrix: \n{}
      """.format(engine.state.epoch, avg_accuracy, precision_recall['pretty'],
                 cmatrix['pretty'])
        tqdm.write(prompt)
        logging.info('\n' + prompt)
        writer.add_text(os.environ['run-id'], prompt, engine.state.epoch)
        writer.add_scalars('Aggregate/Acc', {'Val Acc': avg_accuracy},
                           engine.state.epoch)
        writer.add_scalars(
            'Aggregate/Score', {
                'Val avg precision': precision_recall['data'][0, -1],
                'Val avg recall': precision_recall['data'][1, -1]
            }, engine.state.epoch)
        pbar.n = pbar.last_print_n = 0

    # Save model every N epochs.
    save_model_handler = ModelCheckpoint(os.environ['savedir'],
                                         '',
                                         save_interval=50,
                                         n_saved=2)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, save_model_handler,
                              {'model': model})

    # Update the learning rate according to the scheduler.
    trainer.add_event_handler(Events.EPOCH_STARTED, ignite_scheduler)

    # ------------------------------------
    # Run
    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
Beispiel #16
0
def test_zero_div():
    loss = Loss(nll_loss)
    with pytest.raises(NotComputableError):
        loss.compute()
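

# A hedged companion test (not in the original), assuming the same `Loss`/`nll_loss`
# imports as `test_zero_div` above: after one update, the metric returns the mean
# negative log-likelihood of the batch it has seen.
def test_compute_after_update():
    import torch

    loss = Loss(nll_loss)
    y_pred = torch.log(torch.tensor([[0.9, 0.1], [0.2, 0.8]]))
    y = torch.tensor([0, 1])
    loss.update((y_pred, y))
    assert abs(loss.compute() - 0.1643) < 1e-3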
Beispiel #17
0
                               batch_size=25,
                               shuffle=True)
val_loader = data.DataLoader(TGSSaltDataset(validate_images, validate_masks),
                             batch_size=50,
                             shuffle=False)

learning_rate = 1e-4
loss_fn = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

trainer = create_supervised_trainer(model, optimizer, loss_fn, device="cuda")
evaluator = create_supervised_evaluator(model,
                                        device="cuda",
                                        metrics={
                                            'accuracy': BinaryAccuracy(),
                                            'my_loss': Loss(loss_fn)
                                        })


@trainer.on(Events.ITERATION_COMPLETED)
def log_training_loss(trainer):
    #print("Epoch[{}] Loss: {:.2f}".format(trainer.state.epoch, trainer.state.output))
    pass


@trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(trainer):
    evaluator.run(train_loader)
    metrics = evaluator.state.metrics
    print(
        "Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}".
Beispiel #18
0
def train(prefix, epochs, batch_size, num_workers, embedding_size, num_layers,
          learning_rate, weight_decay, model_dir, run_dir):
    ts = datetime.now().strftime("%m_%d_%Y__%H_%M")
    run_name = '{}_{}'.format(prefix, ts)

    model_dir = os.path.join(model_dir, run_name)

    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    writer = SummaryWriter(os.path.join(run_dir, run_name))

    ds_train = TriggeredEarthquake(
        mode=DatasetMode.TRAIN,
        downloadable_data=DownloadableData.TRIGGERED_EARTHQUAKE)

    ds_test = TriggeredEarthquake(
        mode=DatasetMode.TEST,
        downloadable_data=DownloadableData.TRIGGERED_EARTHQUAKE,
        transform=triggered_earthquake_transform(random_trim_offset=False))
    # ds_train = SiameseDataset(ds_train)
    train_loader = DataLoader(ds_train,
                              batch_size=batch_size,
                              num_workers=num_workers,
                              shuffle=True)
    test_loader = DataLoader(ds_test,
                             batch_size=batch_size,
                             num_workers=num_workers,
                             shuffle=True)

    model = DilatedConvolutional(embedding_size=embedding_size,
                                 num_layers=num_layers)
    params = filter(lambda p: p.requires_grad, model.parameters())

    optimizer = torch.optim.Adam(params,
                                 lr=learning_rate,
                                 weight_decay=weight_decay)
    loss_fn = DeepClusteringLoss()

    trainer = create_engine(model, optimizer, loss_fn, device)
    evaluator = create_eval(model, {'dcl': Loss(loss_fn)}, device)
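
    # `create_engine` and `create_eval` are project-local factories that are not shown in
    # this snippet; judging from their use they wrap ignite Engines around a standard
    # training step (forward, loss, backward, optimizer step) and an evaluation step
    # (model.eval() + torch.no_grad() forward), respectively.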

    summary(
        model,
        (1,
         gin.query_parameter('triggered_earthquake_transform.target_length')))
    writer.add_graph(model,
                     next(iter(train_loader))[0].unsqueeze(1).to(device))

    save_handler = ModelCheckpoint(model_dir,
                                   prefix,
                                   n_saved=1,
                                   create_dir=True,
                                   require_empty=False)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, save_handler,
                              {'model': model})

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(_):
        """
        report training loss
        :param _:
        :return:
        """
        writer.add_scalar('Iter/train_loss', trainer.state.output,
                          trainer.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(_):
        """
        report training loss
        :param _:
        :return:
        """
        evaluator.run(train_loader)
        loss = trainer.state.output
        writer.add_scalar('Loss/train', loss, trainer.state.epoch)
        print("Training Results - Epoch: {} Avg loss: {:.2f}".format(
            trainer.state.epoch, trainer.state.output))

    @trainer.on(Events.EPOCH_COMPLETED)
    def test_acc(_):
        """
        report testing accuracy
        :param _:
        :return:
        """
        acc, cm, _, = test_classification(
            model,
            gin.query_parameter('triggered_earthquake_dataset.testing_quakes'),
            device,
            gin.query_parameter('triggered_earthquake_dataset.data_dir'))
        writer.add_scalar('Accuracy/test', acc, trainer.state.epoch)
        print('Testing Accuracy: {:.2f}'.format(acc))
        print(cm)

    def report_embeddings(_):
        """
        write embeddings to tensorboard
        :param _:
        :return:
        """
        train_loader = DataLoader(ds_train, batch_size=1)
        test_loader = DataLoader(ds_test, batch_size=1)

        text_labels = gin.query_parameter(
            'triggered_earthquake_dataset.labels')
        train_embeddings, train_labels = get_embeddings(model,
                                                        train_loader,
                                                        device=device)
        train_labels = [
            text_labels[np.argmax(l)] for l in train_labels.squeeze(1)
        ]
        writer.add_embedding(train_embeddings.squeeze(1),
                             metadata=train_labels,
                             global_step=trainer.state.epoch,
                             tag='train_embeddings')

        test_embeddings, test_labels = get_embeddings(model,
                                                      test_loader,
                                                      device=device)
        test_labels = [
            text_labels[np.argmax(l)] for l in test_labels.squeeze(1)
        ]
        writer.add_embedding(test_embeddings.squeeze(1),
                             metadata=test_labels,
                             global_step=trainer.state.epoch,
                             tag='test_embeddings')

    trainer.add_event_handler(Events.EPOCH_COMPLETED(once=1),
                              report_embeddings)
    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=5),
                              report_embeddings)

    @trainer.on(Events.COMPLETED)
    def save_classifier(_):
        '''
        create and save two svc classifiers in the model_dir
            - one with only training data
            - one with all data
        :param _:
        :return:
        '''
        # save classifier only trained on training data
        _, _, classifier = test_classification(
            model,
            gin.query_parameter('triggered_earthquake_dataset.testing_quakes'),
            device,
            gin.query_parameter('triggered_earthquake_dataset.data_dir'))
        with open(os.path.join(model_dir, '{}_classifier.p'.format(prefix)),
                  'wb') as f:
            pickle.dump(classifier, f)

        # save classifier trained on all data (for running inference)
        ds = TriggeredEarthquake(
            data_dir=gin.query_parameter(
                'triggered_earthquake_dataset.data_dir'),
            testing_quakes=[],
            downloadable_data=DownloadableData.TRIGGERED_EARTHQUAKE,
            mode=DatasetMode.INFERENCE,
            transform=triggered_earthquake_transform(random_trim_offset=False),
        )
        loader = DataLoader(ds, batch_size=1, num_workers=10, shuffle=True)
        classifier_alldata = create_classifier(model,
                                               loader,
                                               type='svc',
                                               device=device)
        with open(
                os.path.join(model_dir, '{}_svc_classifier.p'.format(prefix)),
                'wb') as f:
            pickle.dump(classifier_alldata, f)

    @trainer.on(Events.COMPLETED)
    def save_metadata(_):
        '''
        save a metadata file, used for inference
        :param _:
        :return:
        '''
        transformer = triggered_earthquake_transform(random_trim_offset=False)
        transformer_path = os.path.join(model_dir, 'transformer.p')
        pickle.dump(transformer, open(transformer_path, 'wb'))

        metadata = {
            'name':
            run_name,
            'classes':
            gin.query_parameter('triggered_earthquake_dataset.labels'),
            'model_state_path':
            save_handler.last_checkpoint,
            'classifier_path':
            os.path.join(model_dir, '{}_classifier.p'.format(prefix)),
            'embedding_size':
            embedding_size,
            'num_layers':
            num_layers,
            'transformer':
            transformer_path
        }

        with open(os.path.join(model_dir, 'metadata.json'), 'w') as f:
            json.dump(metadata, f)

    trainer.run(train_loader, max_epochs=epochs)
    writer.close()
Beispiel #19
0
def train_with_ignite(networks, dataset, data_dir, batch_size, img_size,
                      epochs, lr, momentum, num_workers, optimizer, logger):

    from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
    from ignite.metrics import Loss
    from utils.metrics import MultiThresholdMeasures, Accuracy, IoU, F1score

    # device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # build model
    model = get_network(networks)

    # log model summary
    input_size = (3, img_size, img_size)
    summarize_model(model.to(device), input_size, logger, batch_size, device)

    # build loss
    loss = torch.nn.BCEWithLogitsLoss()

    # build optimizer and scheduler
    model_optimizer = get_optimizer(optimizer, model, lr, momentum)
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(model_optimizer)

    # transforms on both image and mask
    train_joint_transforms = jnt_trnsf.Compose([
        jnt_trnsf.RandomCrop(img_size),
        jnt_trnsf.RandomRotate(5),
        jnt_trnsf.RandomHorizontallyFlip()
    ])

    # transforms only on images
    train_image_transforms = std_trnsf.Compose([
        std_trnsf.ColorJitter(0.05, 0.05, 0.05, 0.05),
        std_trnsf.ToTensor(),
        std_trnsf.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    test_joint_transforms = jnt_trnsf.Compose([jnt_trnsf.Safe32Padding()])

    test_image_transforms = std_trnsf.Compose([
        std_trnsf.ToTensor(),
        std_trnsf.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    # transforms only on mask
    mask_transforms = std_trnsf.Compose([std_trnsf.ToTensor()])

    # build train / test loader
    train_loader = get_loader(dataset=dataset,
                              data_dir=data_dir,
                              train=True,
                              joint_transforms=train_joint_transforms,
                              image_transforms=train_image_transforms,
                              mask_transforms=mask_transforms,
                              batch_size=batch_size,
                              shuffle=False,
                              num_workers=num_workers)

    test_loader = get_loader(dataset=dataset,
                             data_dir=data_dir,
                             train=False,
                             joint_transforms=test_joint_transforms,
                             image_transforms=test_image_transforms,
                             mask_transforms=mask_transforms,
                             batch_size=1,
                             shuffle=False,
                             num_workers=num_workers)

    # build trainer / evaluator with ignite
    trainer = create_supervised_trainer(model,
                                        model_optimizer,
                                        loss,
                                        device=device)
    measure = MultiThresholdMeasures()
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                '': measure,
                                                'pix-acc': Accuracy(measure),
                                                'iou': IoU(measure),
                                                'loss': Loss(loss),
                                                'f1': F1score(measure),
                                            },
                                            device=device)

    # initialize state variable for checkpoint
    state = update_state(model.state_dict(), 0, 0, 0, 0, 0)
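    # `update_state` is a helper from the surrounding module (not shown here); from its
    # call sites it appears to bundle the latest model weights together with the current
    # train/val loss, pixel accuracy, IoU and F1 into the `state` mapping that is written
    # to the checkpoint file below.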

    # make ckpt path
    ckpt_root = './ckpt/'
    filename = '{network}_{optimizer}_lr_{lr}_epoch_{epoch}.pth'
    ckpt_path = os.path.join(ckpt_root, filename)

    # execution after every training iteration
    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(trainer):
        num_iter = (trainer.state.iteration - 1) % len(train_loader) + 1
        if num_iter % 20 == 0:
            logger.info("Epoch[{}] Iter[{:03d}] Loss: {:.2f}".format(
                trainer.state.epoch, num_iter, trainer.state.output))

    # execution after every training epoch
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(trainer):
        # evaluate on training set
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        logger.info(
            "Training Results - Epoch: {} Avg-loss: {:.3f}\n Pix-acc: {}\n IoU: {}\n F1: {}\n"
            .format(trainer.state.epoch, metrics['loss'],
                    str(metrics['pix-acc']), str(metrics['iou']),
                    str(metrics['f1'])))

        # update state
        update_state(weight=model.state_dict(),
                     train_loss=metrics['loss'],
                     val_loss=state['val_loss'],
                     val_pix_acc=state['val_pix_acc'],
                     val_iou=state['val_iou'],
                     val_f1=state['val_f1'])

    # execution after every epoch
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(trainer):
        # evaluate test(validation) set
        evaluator.run(test_loader)
        metrics = evaluator.state.metrics
        logger.info(
            "Validation Results - Epoch: {} Avg-loss: {:.3f}\n Pix-acc: {}\n IoU: {}\n F1: {}\n"
            .format(trainer.state.epoch, metrics['loss'],
                    str(metrics['pix-acc']), str(metrics['iou']),
                    str(metrics['f1'])))

        # update scheduler
        lr_scheduler.step(metrics['loss'])

        # update and save state
        update_state(weight=model.state_dict(),
                     train_loss=state['train_loss'],
                     val_loss=metrics['loss'],
                     val_pix_acc=metrics['pix-acc'],
                     val_iou=metrics['iou'],
                     val_f1=metrics['f1'])

        path = ckpt_path.format(network=networks,
                                optimizer=optimizer,
                                lr=lr,
                                epoch=trainer.state.epoch)
        save_ckpt_file(path, state)

    trainer.run(train_loader, max_epochs=epochs)
Beispiel #20
0
                              collate_fn=collate_fn)
    val_loader = DataLoader(dataset,
                            batch_size=4,
                            sampler=val_sampler,
                            drop_last=False,
                            collate_fn=collate_fn)

    #bertmodel = BertModel.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
    # Freeze the BERT encoder so that only the classification head is trained.
    for names, parameters in model.bert.named_parameters():
        parameters.requires_grad = False
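    # Hedged sanity check (not in the original): confirm the encoder really is frozen.
    assert not any(p.requires_grad for p in model.bert.parameters())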

    #optimizer = Adam([p for p in model.parameters() if p.requires_grad], lr=1e-3)
    optimizer = AdamW(model.parameters(), lr=1e-5, correct_bias=False)
    criterion = nn.CrossEntropyLoss()
    metrics = {'loss': Loss(criterion), 'accuracy': Accuracy()}
    trainer = BertTrainer(model,
                          optimizer,
                          newbob_period=3,
                          checkpoint_dir='./checkpoints/bert',
                          metrics=metrics,
                          non_blocking=True,
                          retain_graph=True,
                          patience=3,
                          loss_fn=criterion,
                          device=DEVICE,
                          parallel=False)
    trainer.fit(train_loader, val_loader, epochs=10)
    trainer = BertTrainer(model,
                          optimizer=None,
                          checkpoint_dir='./checkpoints/bert',
def run(tb, vb, lr, epochs, writer):
    device = os.environ['main-device']
    logging.info('Training program start!')
    logging.info('Configuration:')
    logging.info('\n' + json.dumps(INFO, indent=2))

    # ------------------------------------
    # 1. Define dataloader
    train_loader, train4val_loader, val_loader, num_of_images, mapping = get_dataloaders(
        tb, vb)
    weights = (1 / num_of_images) / ((1 / num_of_images).sum().item())
    weights = weights.to(device=device)

    # ------------------------------------
    # 2. Define model
    model = EfficientNet.from_pretrained(
        'efficientnet-b3', num_classes=INFO['dataset-info']['num-of-classes'])
    model = carrier(model)

    # ------------------------------------
    # 3. Define optimizer
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)
    ignite_scheduler = LRScheduler(scheduler)

    # ------------------------------------
    # 4. Define metrics
    train_metrics = {
        'accuracy':
        Accuracy(),
        'loss':
        Loss(nn.CrossEntropyLoss(weight=weights)),
        'precision_recall':
        MetricsLambda(PrecisionRecallTable, Precision(), Recall(),
                      train_loader.dataset.classes),
        'cmatrix':
        MetricsLambda(CMatrixTable,
                      ConfusionMatrix(INFO['dataset-info']['num-of-classes']),
                      train_loader.dataset.classes)
    }

    def val_pred_transform(output):
        y_pred, y = output
        new_y_pred = torch.zeros(
            (y_pred.shape[0],
             len(INFO['dataset-info']['known-classes']) + 1)).to(device=device)
        for c in range(y_pred.shape[1]):
            if c == 0:
                new_y_pred[:, mapping[c]] += y_pred[:, c]
            elif mapping[c] == val_loader.dataset.class_to_idx['UNKNOWN']:
                new_y_pred[:, mapping[c]] = torch.where(
                    new_y_pred[:, mapping[c]] > y_pred[:, c],
                    new_y_pred[:, mapping[c]], y_pred[:, c])
            else:
                new_y_pred[:, mapping[c]] += y_pred[:, c]
        return new_y_pred, y

    val_metrics = {
        'accuracy':
        Accuracy(val_pred_transform),
        'precision_recall':
        MetricsLambda(PrecisionRecallTable, Precision(val_pred_transform),
                      Recall(val_pred_transform), val_loader.dataset.classes),
        'cmatrix':
        MetricsLambda(
            CMatrixTable,
            ConfusionMatrix(len(INFO['dataset-info']['known-classes']) + 1,
                            output_transform=val_pred_transform),
            val_loader.dataset.classes)
    }

    # ------------------------------------
    # 5. Create trainer
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        nn.CrossEntropyLoss(weight=weights),
                                        device=device)

    # ------------------------------------
    # 6. Create evaluator
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=train_metrics,
                                                  device=device)
    val_evaluator = create_supervised_evaluator(model,
                                                metrics=val_metrics,
                                                device=device)

    desc = 'ITERATION - loss: {:.4f}'
    pbar = tqdm(initial=0,
                leave=False,
                total=len(train_loader),
                desc=desc.format(0))

    # ------------------------------------
    # 7. Create event hooks
    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        log_interval = 1
        iter = (engine.state.iteration - 1) % len(train_loader) + 1
        if iter % log_interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(log_interval)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        print('Checking on training set.')
        train_evaluator.run(train4val_loader)
        metrics = train_evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_loss = metrics['loss']
        precision_recall = metrics['precision_recall']
        cmatrix = metrics['cmatrix']
        prompt = """
      <Training> Results - Epoch: {}
      Avg accuracy: {:.4f}
      Avg loss: {:.4f}
      precision_recall: \n{}
      confusion matrix: \n{}
      """.format(engine.state.epoch, avg_accuracy, avg_loss,
                 precision_recall['pretty'], cmatrix['pretty'])
        tqdm.write(prompt)
        logging.info('\n' + prompt)
        writer.add_text(os.environ['run-id'], prompt, engine.state.epoch)
        writer.add_scalars('Aggregate/Acc', {'Train Acc': avg_accuracy},
                           engine.state.epoch)
        writer.add_scalars('Aggregate/Loss', {'Train Loss': avg_loss},
                           engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        print('Checking on validation set.')
        val_evaluator.run(val_loader)
        metrics = val_evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        precision_recall = metrics['precision_recall']
        cmatrix = metrics['cmatrix']
        prompt = """
      <Validating> Results - Epoch: {}
      Avg accuracy: {:.4f}
      precision_recall: \n{}
      confusion matrix: \n{}
      """.format(engine.state.epoch, avg_accuracy, precision_recall['pretty'],
                 cmatrix['pretty'])
        tqdm.write(prompt)
        logging.info('\n' + prompt)
        writer.add_text(os.environ['run-id'], prompt, engine.state.epoch)
        writer.add_scalars('Aggregate/Acc', {'Val Acc': avg_accuracy},
                           engine.state.epoch)
        writer.add_scalars(
            'Aggregate/Score', {
                'Val avg precision': precision_recall['data'][0, -1],
                'Val avg recall': precision_recall['data'][1, -1]
            }, engine.state.epoch)
        pbar.n = pbar.last_print_n = 0

    trainer.add_event_handler(Events.EPOCH_STARTED, ignite_scheduler)

    # ------------------------------------
    # Run
    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
Beispiel #22
0
def main():
    set_seed(13)

    batch_size = 64
    nb_epochs = 200
    val_set_size = 0.2
    print_report = True

    data_dir = Path('../data/head_classification_data/normed/')
    train_dir = data_dir.joinpath('data/')
    labels_filename = data_dir.joinpath('attention_norm_annotated.tsv')
    model_filename = '../models/head_classifier/classify_normed_patterns.tar'

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # load data
    images, labels, label2id, min_max_size = load_data(train_dir,
                                                       labels_filename)

    if val_set_size > 0:
        images_train, images_val, labels_train, labels_val = train_test_split(
            images, labels, test_size=val_set_size, stratify=labels)
    else:
        images_train, labels_train = images, labels
        images_val, labels_val = None, None

    print(f'Train: {images_train.shape} {labels_train.shape}')
    if labels_val is not None:
        print(f'Val: {images_val.shape}, {labels_val.shape}')

    dataset_train = torch.utils.data.TensorDataset(
        torch.from_numpy(images_train), torch.from_numpy(labels_train))
    data_loader_train = torch.utils.data.DataLoader(dataset_train,
                                                    batch_size=batch_size,
                                                    shuffle=True)

    if labels_val is not None:
        dataset_val = torch.utils.data.TensorDataset(
            torch.from_numpy(images_val), torch.from_numpy(labels_val))
        data_loader_val = torch.utils.data.DataLoader(dataset_val,
                                                      batch_size=batch_size,
                                                      shuffle=False)

    model = Net(len(label2id))
    model = model.to(device)
    init_weights(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = torch.nn.CrossEntropyLoss()

    def update_function(engine, batch):
        model.train()
        optimizer.zero_grad()

        inputs, targets = [x.to(device) for x in batch]

        outputs = model(inputs)
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

        return loss

    def inference_function(engine, batch):
        model.eval()
        with torch.no_grad():
            inputs, targets = [x.to(device) for x in batch]

            outputs = model(inputs)

            return outputs, targets

    trainer = Engine(update_function)
    evaluator = Engine(inference_function)

    metrics = [
        ('loss', Loss(torch.nn.CrossEntropyLoss())),
        ('accuracy', Accuracy()),
    ]
    for name, metric in metrics:
        metric.attach(evaluator, name)

    best_val_acc = 0

    @trainer.on(Events.EPOCH_COMPLETED)
    def on_epoch_completed(engine):
        nonlocal best_val_acc

        evaluator.run(data_loader_train)
        metrics_train = format_metrics_str(evaluator.state.metrics)

        if labels_val is not None:
            evaluator.run(data_loader_val)
            metrics_val = format_metrics_str(evaluator.state.metrics)

            acc_val = evaluator.state.metrics['accuracy']
            if acc_val >= best_val_acc:
                save_model(model_filename, model, label2id, min_max_size)
                best_val_acc = acc_val
        else:
            metrics_val = {}

        print(
            f'Epoch: {engine.state.epoch} | Train: {metrics_train} | Val: {metrics_val}'
        )

    trainer.run(data_loader_train, max_epochs=nb_epochs)

    if labels_val is None:
        save_model(model_filename, model, label2id, min_max_size)
    else:
        print(f'Best val accuracy: {best_val_acc}')

    if print_report:
        print(f'Train classification report')
        plot_confusion_matrix(model, data_loader_train, device, label2id)

        print(f'Val classification report')
        if labels_val is not None:
            plot_confusion_matrix(model, data_loader_val, device, label2id)
Beispiel #23
0
    train_set = NonterminalFeaturesDataset(os.path.join(
        args.data_dir, 'train'))
    train_loader = DataLoader(train_set, batch_size=None)
    val_set = NonterminalFeaturesDataset(os.path.join(args.data_dir, 'val'))
    val_loader = DataLoader(val_set, batch_size=None)
    gen_set = NonterminalFeaturesDataset(os.path.join(args.data_dir, 'gen'))
    gen_loader = DataLoader(gen_set, batch_size=None)

    classifier = Classifier(args.features, args.nonterminals)
    classifier = classifier.to(device)

    optimizer = torch.optim.SGD(classifier.parameters(), lr=args.lr)

    # Trainer and metrics
    save_dict = {'classifier': classifier}
    trainer = Engine(step_train(classifier, optimizer))
    metric_names = ['loss', 'accuracy']
    RunningAverage(Loss(F.cross_entropy, lambda x:
                        (x['y_pred'], x['y_true']))).attach(trainer, 'loss')
    RunningAverage(Accuracy(lambda x: (x['y_pred'], x['y_true']))).attach(
        trainer, 'accuracy')

    # Evaluator and metrics
    evaluator = Engine(step_train(classifier, None, train=False))
    Accuracy(lambda x: (x['y_pred'], x['y_true'])).attach(
        evaluator, 'accuracy')

    # Begin training
    run(args.run_name, save_dict, metric_names, trainer, evaluator,
        train_loader, val_loader, gen_loader, args.epochs, 'accuracy')
def train(NN_index, trainsetsize, log, max_epoch):
    if_gpu = True
    if if_gpu:
        torch.set_default_tensor_type(torch.cuda.DoubleTensor if torch.cuda.is_available()
                                      else torch.DoubleTensor)
        device = "cuda:0"
        # print("Graphics Power!")
    else:
        torch.set_default_tensor_type(torch.DoubleTensor)
        device = None

    # file = 'grouprow.npy'
    # file = 'full_len.npy'
    file = 'data11.npy'  # feat1A, feat1B, feat1C, feat2A, feat2B
    if os.path.isfile('traindata.npy'):
        newdata, elementdict, featperelem, datavariables, feattotal = generateData(file)
        print("loaded given datasets")
        train_data = np.load(open('traindata.npy', 'rb'))
        val_data = np.load(open('valdata.npy', 'rb'))
    else:
        # disable features in classes to gen new data
        newdata, elementdict, featperelem, datavariables, feattotal = generateData(file)  # insert filename
        print("Shape of read data: ", newdata.shape)
        print("generating random files")
        create_dataset(newdata, trainsetsize)
        train_data = np.load(open('traindata.npy', 'rb'))
        val_data = np.load(open('valdata.npy', 'rb'))


    # newdata = znormalize(newdata)

    # train_data, val_data = getRandomSets(newdata)  # now in create_dataset class
    mean, stnddev = get_mean_stndev(train_data)

    # normalization
    train_data = (train_data - mean) / stnddev
    val_data = (val_data - mean) / stnddev  # which normalization variant to use?
    # val_data = (val_data[:, 1::] - mean[1::]) / stnddev[1::]
    # print("val data shape and ex:", val_data.shape, val_data[0])

    train_set, val_set = PerovskiteDataset(train_data), PerovskiteDataset(val_data)

    # Variable batch and set loader
    train_batchsize = 1000
    val_batchsize = 10000  # len(val_data)  # 231472  # all or small like 2000 ?
    train_loader = DataLoader(train_set, batch_size=train_batchsize, shuffle=True, drop_last=False)
    val_loader = DataLoader(val_set, batch_size=val_batchsize, drop_last=True)  # shuffle=True

    # model = get_NN(feattotal)
    model = get_CNN(feattotal)

    # Shape for saving netstucture
    modelform = str(model)
    # print("Type:", type(modelform))
    # summary(netz, (1, train_batchsize, int(feattotal)))  # channel, H ,W


    lossMAE = nn.L1Loss()  # MAE  # to ignite
    lossMSE = nn.MSELoss()
    # torch.optim.SGD(params, lr=0.01)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    # list of trainers?
    trainer = create_supervised_trainer(model, optimizer, lossMAE, std=stnddev[0], prepare_batch=prepare_batch)  # model[:]

    evaluator = create_supervised_evaluator(model, std=stnddev[0], prepare_batch=prepare_batch,
                                            metrics={'MAE': Loss(lossMAE),
                                                     'MSE': Loss(lossMSE),
                                                     # 'accuracy': Accuracy(),  ???
                                                     # 'NLL': Loss(lossNLL)
                                                     })  # output_transform=output_retransform_znormalize) expects (x, pred, y)
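    # `prepare_batch` is a project-local helper that is not shown in this snippet; in
    # ignite's convention such a function takes (batch, device, non_blocking) and returns
    # the (x, y) pair that is fed to the model.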

    # Progressbar
    pbar = ignite.contrib.handlers.ProgressBar(persist=False)
    pbar.attach(trainer, output_transform=lambda x: {'MAE': x})

    # Save n Load
    model_checkpoint = 'NN_'  # NN_index = sys.argv[1]
    # log = 'active'
    logcount = 0
    al_level = 0
    while os.access(log + "/run_" + str(logcount), os.F_OK):  # +str(NN_index)
        logcount += 1
    nn_root = log + "/run_" + str(logcount - 1) + "/" + model_checkpoint + str(NN_index)
    while os.access(nn_root + "/al_" + str(al_level), os.F_OK):
        al_level += 1
    os.mkdir(nn_root + "/al_" + str(al_level))
    writer = SummaryWriter(log_dir=nn_root + "/al_" + str(al_level))  # +"NN_1" ? declaration for multiple NN
    print("Run: ", (logcount - 1), "NN: ", NN_index, "AL: ", al_level, "len of trainset: ", len(train_data))  # , comment=modelform)
    # print("Modelform:", modelform)

    if (os.path.isfile(model_checkpoint + str(NN_index) + '.pt')):
        print("NN: ", NN_index, "loaded")
        checkpoint = torch.load(model_checkpoint + str(NN_index) + '.pt')
        # try to load only optimizer
        # model.load_state_dict(checkpoint['model_state_dict'])
        # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        print("model not loaded!")


    start = timeit.default_timer()
    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(trainer):
        iteration = trainer.state.iteration
        writer.add_scalar('loss_vs_iteration', trainer.state.output, iteration)
        # writer.close()  # generating mass of files


    @trainer.on(ignite.engine.Events.EPOCH_STARTED)
    def log_time(trainer):
        elapsed = round(timeit.default_timer() - start, 2)
        writer.add_scalar('time_vs_epoch', elapsed, trainer.state.epoch)
        epoch = trainer.state.epoch
        if trainer.state.epoch == 100:
            writer.add_text(str(logcount), "Network structure: " + modelform)
            # writer.close()    # generating mass of files


    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(trainer):
        if (trainer.state.epoch % evaluate_every == 0):
            evaluator.run(train_loader)
            metrics = evaluator.state.metrics
            print(trainer.state.epoch)
            print("\nTraining:", metrics)
            writer.add_scalar('MAEvsEpoch_training', metrics["MAE"], trainer.state.epoch)
            evaluator.run(val_loader)
            metrics = evaluator.state.metrics
            print("Validation: ", metrics)
            writer.add_scalar('MAEvsEpoch_validation', metrics["MAE"], trainer.state.epoch)
        if trainer.state.epoch == max_epoch:  # close the writer after the final epoch
            writer.close()

    evaluate_every = 100

    trainer.run(train_loader, max_epochs=max_epoch)

    torch.save({'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()},
               model_checkpoint + str(NN_index) + '.pt')
    torch.save({'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()},
               log + "/run_" + str(logcount-1) + "/" + model_checkpoint + str(NN_index) + "/al_" + str(al_level) + "/" + model_checkpoint + str(NN_index) + '.pt')
Beispiel #25
0
    elif TARGET == "dvd":
        pre = './sdvd'
    elif TARGET == "electronics":
        pre = './sele'
    else:
        pre = './skit'

    model = DoubleHeadBert.from_pretrained(pre)
    #for names, parameters in model.bert.named_parameters():
    #    parameters.requiers_grad=False

    optimizer = AdamW(model.parameters(), lr=1e-5, correct_bias=False)
    in_fn = nn.CrossEntropyLoss()
    criterion = DoubleLoss(in_fn)
    metrics = {
        'loss': Loss(criterion)
        #'accuracy': Accuracy(transform_pred_tar)
    }
    path = SOURCE + TARGET
    trainer = DoubleBertTrainer(model,
                                optimizer,
                                newbob_period=3,
                                checkpoint_dir=os.path.join(
                                    './checkpoints/double', path),
                                metrics=metrics,
                                non_blocking=True,
                                retain_graph=True,
                                patience=3,
                                accumulation_steps=5,
                                loss_fn=criterion,
                                device=DEVICE,
    """
    y_pred, y = output
    return torch.max(y_pred, dim=1)[1], y


# attach running loss (will be displayed in progess bar)
RunningAverage(output_transform=lambda x: x[0]).attach(trainer, 'loss')

# attach running accuracy (will be displayed in progess bar)
RunningAverage(Accuracy(output_transform=lambda x: [x[1], x[2]])).attach(
    trainer, 'acc')

# attach accuracy and loss to train_evaluator
Accuracy(output_transform=max_output_transform).attach(train_evaluator,
                                                       'accuracy')
Loss(loss_fn).attach(train_evaluator, 'bce')

# attach accuracy and loss to validation_evaluator
Accuracy(output_transform=max_output_transform).attach(validation_evaluator,
                                                       'accuracy')
Loss(loss_fn).attach(validation_evaluator, 'bce')
#############################################
# Report progress through tqdm progress bar #
#############################################
pbar = ProgressBar(persist=True, bar_format="")
pbar.attach(trainer, ['loss', 'acc'])


# Log after each EPOCH
@trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(engine):
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.decay)

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        loss_fn,
                                        device=device)
    evaluator = create_supervised_evaluator(
        model,
        metrics={
            'mae': CrowdCountingMeanAbsoluteError(),
            'mse': CrowdCountingMeanSquaredError(),
            'nll': Loss(loss_fn)
        },
        device=device)
    print(model)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(trainer):
        print("Epoch[{}] Iteration [{}] Loss: {:.2f}".format(
            trainer.state.epoch, trainer.state.iteration,
            trainer.state.output))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(trainer):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        print(
Beispiel #28
0
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval):
    vis = visdom.Visdom()

    # if not vis.check_connection():
    #     raise RuntimeError("Visdom server not running. Please run python -m visdom.server")

    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = 'cpu'

    if torch.cuda.is_available():
        device = 'cuda'

    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        F.nll_loss,
                                        device=device)
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                'accuracy': Accuracy(),
                                                'nll': Loss(F.nll_loss)
                                            },
                                            device=device)

    train_loss_window = create_plot_window(vis, '#Iterations', 'Loss',
                                           'Training Loss')
    train_avg_loss_window = create_plot_window(vis, '#Iterations', 'Loss',
                                               'Training Average Loss')
    train_avg_accuracy_window = create_plot_window(
        vis, '#Iterations', 'Accuracy', 'Training Average Accuracy')
    val_avg_loss_window = create_plot_window(vis, '#Epochs', 'Loss',
                                             'Validation Average Loss')
    val_avg_accuracy_window = create_plot_window(
        vis, '#Epochs', 'Accuracy', 'Validation Average Accuracy')

    @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
    def log_training_loss(engine):
        iter_in_epoch = (engine.state.iteration - 1) % len(train_loader) + 1
        print("Epoch[{}] Iteration[{}/{}] Loss: {:.2f}"
              "".format(engine.state.epoch, iter_in_epoch, len(train_loader),
                        engine.state.output))
        vis.line(X=np.array([engine.state.iteration]),
                 Y=np.array([engine.state.output]),
                 update='append',
                 win=train_loss_window)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        print(
            "Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        vis.line(X=np.array([engine.state.epoch]),
                 Y=np.array([avg_accuracy]),
                 win=train_avg_accuracy_window,
                 update='append')
        vis.line(X=np.array([engine.state.epoch]),
                 Y=np.array([avg_nll]),
                 win=train_avg_loss_window,
                 update='append')

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics['accuracy']
        avg_nll = metrics['nll']
        print(
            "Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        vis.line(X=np.array([engine.state.epoch]),
                 Y=np.array([avg_accuracy]),
                 win=val_avg_accuracy_window,
                 update='append')
        vis.line(X=np.array([engine.state.epoch]),
                 Y=np.array([avg_nll]),
                 win=val_avg_loss_window,
                 update='append')

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)
Beispiel #29
0
batch_size = 64
lr = 1e-3
train_loader, val_loader = get_data_loaders(batch_size, batch_size)

model = ConvNet()
device = 'cuda'
optimizer = optim.Adam(model.parameters(), lr=lr)
trainer = create_supervised_trainer(model,
                                    optimizer,
                                    F.nll_loss,
                                    device=device)
evaluator = create_supervised_evaluator(model=model,
                                        metrics={
                                            'accuracy': Accuracy(),
                                            'nll': Loss(F.nll_loss)
                                        },
                                        device=device)

desc = "ITERATION - loss: {:.2f}"
pbar = tqdm(initial=0,
            leave=False,
            total=len(train_loader),
            desc=desc.format(0))


@trainer.on(Events.ITERATION_COMPLETED)
def log_training_loss(engine):
    iter = (engine.state.iteration - 1) % len(train_loader) + 1
    pbar.desc = desc.format(engine.state.output)
    pbar.update(1)
def run(
    train_batch_size,
    val_batch_size,
    epochs,
    lr,
    momentum,
    log_interval,
    log_dir,
    checkpoint_every,
    resume_from,
    crash_iteration=1000,
):

    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    writer = SummaryWriter(log_dir=log_dir)
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    criterion = nn.NLLLoss()
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    lr_scheduler = StepLR(optimizer, step_size=1, gamma=0.5)
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                "accuracy": Accuracy(),
                                                "nll": Loss(criterion)
                                            },
                                            device=device)

    @trainer.on(Events.EPOCH_COMPLETED)
    def lr_step(engine):
        lr_scheduler.step()
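        # With step_size=1 and gamma=0.5, StepLR halves the learning rate after every epoch.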

    desc = "ITERATION - loss: {:.4f} - lr: {:.4f}"
    pbar = tqdm(initial=0,
                leave=False,
                total=len(train_loader),
                desc=desc.format(0, lr))

    if log_interval is None:
        e = Events.ITERATION_COMPLETED
        log_interval = 1
    else:
        e = Events.ITERATION_COMPLETED(every=log_interval)

    @trainer.on(e)
    def log_training_loss(engine):
        lr = optimizer.param_groups[0]["lr"]
        pbar.desc = desc.format(engine.state.output, lr)
        pbar.update(log_interval)
        writer.add_scalar("training/loss", engine.state.output,
                          engine.state.iteration)
        writer.add_scalar("lr", lr, engine.state.iteration)

    if resume_from is None:

        @trainer.on(Events.ITERATION_COMPLETED(once=crash_iteration))
        def _(engine):
            raise Exception("STOP at {}".format(engine.state.iteration))

    else:

        @trainer.on(Events.STARTED)
        def _(engine):
            pbar.n = engine.state.iteration

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            "Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        writer.add_scalar("training/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("training/avg_accuracy", avg_accuracy,
                          engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            "Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))
        pbar.n = pbar.last_print_n = 0
        writer.add_scalar("validation/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("validation/avg_accuracy", avg_accuracy,
                          engine.state.epoch)

    objects_to_checkpoint = {
        "trainer": trainer,
        "model": model,
        "optimizer": optimizer,
        "lr_scheduler": lr_scheduler
    }
    training_checkpoint = Checkpoint(to_save=objects_to_checkpoint,
                                     save_handler=DiskSaver(
                                         log_dir, require_empty=False))

    trainer.add_event_handler(
        Events.ITERATION_COMPLETED(every=checkpoint_every),
        training_checkpoint)

    if resume_from is not None:
        tqdm.write("Resume from a checkpoint: {}".format(resume_from))
        checkpoint = torch.load(resume_from)
        Checkpoint.load_objects(to_load=objects_to_checkpoint,
                                checkpoint=checkpoint)

    try:
        trainer.run(train_loader, max_epochs=epochs)
    except Exception as e:
        import traceback

        print(traceback.format_exc())

    pbar.close()
    writer.close()