Example 1
def training(local_rank, config):

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = f"stop-on-{config['stop_iteration']}"

        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_path']}")

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

        if config["with_trains"]:
            from trains import Task

            task = Task.init("CIFAR10-Training", task_name=output_path.stem)
            task.connect_configuration(config)
            # Log hyper parameters
            hyper_params = [
                "model",
                "batch_size",
                "momentum",
                "weight_decay",
                "num_epochs",
                "learning_rate",
                "num_warmup_epochs",
            ]
            task.connect({k: config[k] for k in hyper_params})

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler,
                             train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "accuracy": Accuracy(),
        "loss": Loss(criterion),
    }

    # We define two evaluators as they won't have exactly the same role:
    # - `evaluator` will save the best model based on the validation score
    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path,
                                            trainer,
                                            optimizer,
                                            evaluators=evaluators)

    # Store 3 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluator,
        models={"model": model},
        metric_name="accuracy",
        n_saved=3,
        trainer=trainer,
        tag="test",
    )

    # To check training resuming, we can stop training at a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info(
                f"Stop training at iteration {trainer.state.iteration}")
            trainer.terminate()

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()
Example 2
def create_trainer(loader, model, opt, loss_fn, device, args):

    def _update(engine, batch):
        model.train()

        x = batch['x'].to(engine.state.device, non_blocking=True)
        y = batch['y'].to(engine.state.device, non_blocking=True)
        m = batch['m'].to(engine.state.device, non_blocking=True)
        opt.zero_grad()
        y_pred = model(x)

        softmax = nn.Softmax(dim=1)  # specify dim explicitly, matching the eval path below
        masked_loss = softmax(y_pred)
        #masked_loss = y_pred*m
        loss = loss_fn(masked_loss, y)
        if m.sum().item() / m.numel() > 0.7:
            loss.backward()
            opt.step()
        masked_loss = (masked_loss>0.5).float()
        acc = accuracy_segmentation(masked_loss[:,1,:,:,:],y[:,1,:,:,:])

        return {
            'x': x.detach(),
            'y': y.detach(),
            'm': m.detach(),
            'y_pred': y_pred.detach(),
            'loss': loss.item(),
            'acc' : acc
        }

    def _inference(engine, batch):
        model.eval()

        with th.no_grad():
            x = batch['x'].to(engine.state.device, non_blocking=True)
            y = batch['y'].to(engine.state.device, non_blocking=True)
            m = batch['m'].to(engine.state.device, non_blocking=True)

            y_pred = model(x)
            
            softmax = nn.Softmax(dim=1)
            masked_loss = softmax(y_pred)
            #masked_loss = y_pred*m
            loss = loss_fn(masked_loss, y)
            masked_loss = (masked_loss[-3:]>0.5).float()
            acc = accuracy_segmentation(masked_loss[:,1,:,:,:],y[:,1,:,:,:])

        return {
            'x': x.detach(),
            'y': y.detach(),
            'm': m.detach(),
            'y_pred': y_pred.detach(),
            'loss': loss.item(),
            'acc' : acc
        }


    #wandb.watch(model, log ='all')

    trainer = Engine(_update)
    evaluator = Engine(_inference)

    profiler = BasicTimeProfiler()
    profiler.attach(trainer)
    logdir = args.logdir
    save_ = (not args.devrun) and (not args.nosave)

    # initialize trainer state
    trainer.state.device = device
    trainer.state.hparams = args
    trainer.state.save = save_
    trainer.state.logdir = logdir

    trainer.state.df = defaultdict(dict)
    trainer.state.metrics = dict()
    trainer.state.val_metrics = dict()
    trainer.state.best_metrics = defaultdict(list)
    trainer.state.gradnorm = defaultdict(dict)

    # initialize evaluator state
    evaluator.logger = setup_logger('evaluator')
    evaluator.state.device = device
    evaluator.state.df = defaultdict(dict)
    evaluator.state.metrics = dict()

    pbar = ProgressBar(persist=True)
    ebar = ProgressBar(persist=False)

    pbar.attach(trainer, ['loss'])
    ebar.attach(evaluator, ['loss'])

    pbar.attach(trainer,['acc'])
    ebar.attach(evaluator,['acc'])

    # model summary
    if args.model_summary:
        trainer.add_event_handler(
            Events.STARTED,
            print_model_summary, model
        )

    # terminate on nan
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED,
        TerminateOnNan(lambda x: x['loss'])
    )

    # metrics
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED,
        _metrics
    )

    evaluator.add_event_handler(
        Events.ITERATION_COMPLETED,
        _metrics
    )

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED,
        _metrics_mean
    )

    evaluator.add_event_handler(
        Events.COMPLETED,
        _metrics_mean
    )

    trainer.add_event_handler(
        #Events.STARTED | Events.EPOCH_COMPLETED,
        Events.EPOCH_COMPLETED,
        _evaluate, evaluator, loader
    )

    # logging
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED,
        _log_metrics
    )

    # early stopping
    if args.early_stopping > 0:
        es_p = args.early_stopping
        es_s = lambda engine: -engine.state.metrics['loss']
        evaluator.add_event_handler(
            Events.COMPLETED,
            EarlyStopping(patience=es_p, score_function=es_s, trainer=trainer)
        )

    # lr schedulers
    if args.epoch_length is None:
        el = len(loader['train'])
    else:
        el = args.epoch_length

    if args.lr_scheduler is not None:
        lr_sched = create_lr_scheduler(opt, args, num_steps=el)

        if args.lr_scheduler != 'plateau':
            def _sched_fun(engine):
                lr_sched.step()
        else:
            def _sched_fun(engine):
                e = engine.state.epoch
                v = engine.state.val_metrics[e]['nmse']
                lr_sched.step(v)

        if args.lr_scheduler == 'linearcycle':
            trainer.add_event_handler(Events.ITERATION_STARTED, lr_sched)
        else:
            trainer.add_event_handler(Events.EPOCH_COMPLETED, _sched_fun)

    # FIXME: warmup is modifying opt base_lr -> must create last
    if args.lr_warmup > 0:
        wsched = create_lr_scheduler(opt, args, 'warmup', num_steps=el)
        wsts = wsched.total_steps
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED(event_filter=lambda _, i: i <= wsts),
            lambda _: wsched.step()
        )

    # saving
    if save_:
        to_save = {
            'model': model,
            'optimizer': opt,
            'trainer': trainer,
            'evaluator': evaluator
        }

        trainer.add_event_handler(
            Events.EPOCH_COMPLETED,
            Checkpoint(to_save, DiskSaver(logdir), n_saved=3)
        )

        # handler = Checkpoint(
        #     {'model': model},
        #     DiskSaver(logdir),
        #     n_saved = 3,
        #     filename_prefix = 'best',
        #     score_function = lambda engine: -engine.state.metrics['nmae'],
        #     score_name = 'val_nmae',
        # )

        # evaluator.add_event_handler(
        #     Events.COMPLETED,
        #     handler
        # )

        # handler = Checkpoint(
        #     {'model': model},
        #     DiskSaver(logdir),
        #     n_saved = 3,
        #     filename_prefix = 'best',
        #     score_function = lambda engine: -engine.state.metrics['nmse'],
        #     score_name = 'val_nmse',
        # )

        # evaluator.add_event_handler(
        #     Events.COMPLETED,
        #     handler
        # )

        # handler = Checkpoint(
        #     {'model': model},
        #     DiskSaver(logdir),
        #     n_saved = 3,
        #     filename_prefix = 'best',
        #     score_function = lambda engine: engine.state.metrics['R2'],
        #     score_name = 'val_R2',
        # )

        # evaluator.add_event_handler(
        #     Events.COMPLETED,
        #     handler
        # )

        trainer.add_event_handler(
            Events.EPOCH_COMPLETED,
            _save_metrics
        )

        # timer
        trainer.add_event_handler(
            Events.COMPLETED | Events.TERMINATE,
            lambda _: profiler.write_results(logdir + '/time.csv')
        )

    return trainer
Example 3
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_interval):
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.NLLLoss()
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("trainer")

    val_metrics = {"accuracy": Accuracy(), "nll": Loss(criterion)}
    evaluator = create_supervised_evaluator(model,
                                            metrics=val_metrics,
                                            device=device)
    evaluator.logger = setup_logger("evaluator")

    pbar = tqdm(
        initial=0,
        leave=False,
        total=len(train_loader),
        desc=f"ITERATION - loss: {0:.2f}",
    )

    @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
    def log_training_loss(engine):
        pbar.desc = f"ITERATION - loss: {engine.state.output:.2f}"
        pbar.update(log_interval)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            f"Training Results - Epoch: {engine.state.epoch} Avg accuracy: {avg_accuracy:.2f} Avg loss: {avg_nll:.2f}"
        )

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            f"Validation Results - Epoch: {engine.state.epoch} Avg accuracy: {avg_accuracy:.2f} Avg loss: {avg_nll:.2f}"
        )

        pbar.n = pbar.last_print_n = 0

    @trainer.on(Events.EPOCH_COMPLETED | Events.COMPLETED)
    def log_time(engine):
        tqdm.write(
            f"{trainer.last_event_name.name} took {trainer.state.times[trainer.last_event_name.name]} seconds"
        )

    trainer.run(train_loader, max_epochs=epochs)
    pbar.close()
Example 4
def run(train_batch_size, val_batch_size, epochs, lr, momentum):
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("Trainer")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model,
                                                       metrics=metrics,
                                                       device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    clearml_logger = ClearMLLogger(project_name="examples", task_name="ignite")

    clearml_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        tag="training",
        output_transform=lambda loss: {"batchloss": loss},
    )

    for tag, evaluator in [("training metrics", train_evaluator),
                           ("validation metrics", validation_evaluator)]:
        clearml_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names=["loss", "accuracy"],
            global_step_transform=global_step_from_engine(trainer),
        )

    clearml_logger.attach_opt_params_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        optimizer=optimizer)

    clearml_logger.attach(
        trainer,
        log_handler=WeightsScalarHandler(model, whitelist=["fc1"]),
        event_name=Events.ITERATION_COMPLETED(every=100),
    )

    def is_conv(n, _):
        return "conv" in n

    clearml_logger.attach(
        trainer,
        log_handler=WeightsHistHandler(model, whitelist=is_conv),
        event_name=Events.ITERATION_COMPLETED(every=100),
    )

    clearml_logger.attach(trainer,
                          log_handler=GradsScalarHandler(model),
                          event_name=Events.ITERATION_COMPLETED(every=100))

    clearml_logger.attach(
        trainer,
        log_handler=GradsHistHandler(model, whitelist=["fc2.weight"]),
        event_name=Events.ITERATION_COMPLETED(every=100),
    )

    handler = Checkpoint(
        {"model": model},
        ClearMLSaver(),
        n_saved=1,
        score_function=lambda e: e.state.metrics["accuracy"],
        score_name="val_acc",
        filename_prefix="best",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.EPOCH_COMPLETED, handler)

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    clearml_logger.close()
Example 5
def run():
    writer = SummaryWriter()

    CUDA = Config.device
    model = Retriever()
    print(f'Initializing model on {CUDA}')
    model.to(CUDA)
    optimizer = torch.optim.Adam(model.parameters(), lr=Config.LR)
    loss_fn = torch.nn.L1Loss().to(CUDA)
    print(f'Creating sentence transformer')
    encoder = SentenceTransformer(Config.sentence_transformer).to(CUDA)
    for parameter in encoder.parameters():
        parameter.requires_grad = False
    print(f'Loading data')
    if os.path.exists('_full_dump'):
        with open('_full_dump', 'rb') as pin:
            train_loader, train_utts, val_loader, val_utts = pickle.load(pin)
    else:
        data = load_data(Config.data_source)
        train_loader, train_utts, val_loader, val_utts = get_loaders(data, encoder, Config.batch_size)
    
        with open('_full_dump', 'wb') as pout:
            pickle.dump((train_loader, train_utts, val_loader, val_utts), pout, protocol=-1)


    def train_step(engine, batch):
        model.train()
        optimizer.zero_grad()
        x, not_ys, y = batch
        yhat = model(x[0])
        loss = loss_fn(yhat, y)
        gains = loss_fn(not_ys[0], yhat) * Config.negative_weight
        loss -= gains

        loss.backward()
        optimizer.step()
        return loss.item()
    
    def eval_step(engine, batch):
        model.eval()
        with torch.no_grad():
            x, _, y = batch
            yhat = model(x[0])
            return yhat, y
    
    trainer = Engine(train_step)
    trainer.logger = setup_logger('trainer')

    evaluator = Engine(eval_step)
    evaluator.logger = setup_logger('evaluator')
    
    latent_space = BallTree(numpy.array(list(train_utts.keys())))

    l1 = Loss(loss_fn)

    recall = RecallAt(latent_space)

    recall.attach(evaluator, 'recall')
    l1.attach(evaluator, 'l1')
    
    @trainer.on(Events.ITERATION_COMPLETED(every=1000))
    def log_training(engine):
        batch_loss = engine.state.output
        lr = optimizer.param_groups[0]['lr']
        e = engine.state.epoch
        n = engine.state.max_epochs
        i = engine.state.iteration
        print("Epoch {}/{} : {} - batch loss: {}, lr: {}".format(e, n, i, batch_loss, lr))
        writer.add_scalar('Training/loss', batch_loss, i)
    
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        print(f"Training Results - Epoch: {engine.state.epoch} " 
              f" L1: {metrics['l1']:.2f} "
              f" R@1: {metrics['r1']:.2f} "
              f" R@3: {metrics['r3']:.2f} "
              f" R@10: {metrics['r10']:.2f} ")

        for metric, value in metrics.items():
            writer.add_scalar(f'Training/{metric}', value, engine.state.epoch)
        
    #@trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        print(f"Validation Results - Epoch: {engine.state.epoch} "
              f"L1: {metrics['l1']:.2f} " 
              f" R@10: {metrics['r10']:.2f} ")
        for metric, value in metrics.items():
            writer.add_scalar(f'Validation/{metric}', value, engine.state.epoch)
 
    trainer.run(train_loader, max_epochs=Config.max_epochs)

    torch.save(model.state_dict(), Config.checkpoint)
    print(f'Saved checkpoint at {Config.checkpoint}')
    interact(model, encoder, latent_space, train_utts)
Example 6
def training(local_rank, config):

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="IMDB-Training", distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_dir"]
    if rank == 0:

        now = datetime.now().strftime("%Y%m%d-%H%M%S")
        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_dir"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_dir']}")

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

        if config["with_clearml"]:
            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            task = Task.init("IMDB-Training", task_name=output_path.stem)
            task.connect_configuration(config)
            # Log hyper parameters
            hyper_params = [
                "model",
                "dropout",
                "n_fc",
                "batch_size",
                "max_length",
                "weight_decay",
                "num_epochs",
                "learning_rate",
                "num_warmup_epochs",
            ]
            task.connect({k: config[k] for k in hyper_params})

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler,
                             train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "Accuracy":
        Accuracy(output_transform=utils.thresholded_output_transform),
        "Loss": Loss(criterion),
    }

    # We define two evaluators as they won't have exactly the same role:
    # - `evaluator` will save the best model based on the validation score
    evaluator = create_evaluator(model, metrics, config, tag="val")
    train_evaluator = create_evaluator(model, metrics, config, tag="train")

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED | Events.STARTED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(
            output_path,
            trainer,
            optimizer,
            evaluators=evaluators,
            log_every_iters=config["log_every_iters"])

    # Store 2 best models by validation accuracy starting from num_epochs / 2:
    best_model_handler = Checkpoint(
        {"model": model},
        utils.get_save_handler(config),
        filename_prefix="best",
        n_saved=2,
        global_step_transform=global_step_from_engine(trainer),
        score_name="test_accuracy",
        score_function=Checkpoint.get_default_score_fn("Accuracy"),
    )
    evaluator.add_event_handler(
        Events.COMPLETED(
            lambda *_: trainer.state.epoch > config["num_epochs"] // 2),
        best_model_handler)

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        logger.exception("")
        raise e

    if rank == 0:
        tb_logger.close()
Example 7
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_dir):
    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)
    model = Net()
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
    trainer.logger = setup_logger("Trainer")

    if sys.version_info > (3,):
        from ignite.contrib.metrics.gpu_info import GpuInfo

        try:
            GpuInfo().attach(trainer)
        except RuntimeError:
            print(
                "INFO: By default, in this example it is possible to log GPU information (used memory, utilization). "
                "As there is no pynvml python package installed, GPU information won't be logged. Otherwise, please "
                "install it : `pip install pynvml`"
            )

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    tb_logger = TensorboardLogger(log_dir=log_dir)

    tb_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        tag="training",
        output_transform=lambda loss: {"batchloss": loss},
        metric_names="all",
    )

    for tag, evaluator in [("training", train_evaluator), ("validation", validation_evaluator)]:
        tb_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names=["loss", "accuracy"],
            global_step_transform=global_step_from_engine(trainer),
        )

    tb_logger.attach_opt_params_handler(trainer, event_name=Events.ITERATION_COMPLETED(every=100), optimizer=optimizer)

    tb_logger.attach(
        trainer,
        log_handler=WeightsScalarHandler(model, whitelist=["fc1"]),
        event_name=Events.ITERATION_COMPLETED(every=100),
    )

    def is_conv(n, _):
        return "conv" in n

    tb_logger.attach(
        trainer,
        log_handler=WeightsHistHandler(model, whitelist=is_conv),
        event_name=Events.ITERATION_COMPLETED(every=100),
    )

    tb_logger.attach(trainer, log_handler=GradsScalarHandler(model), event_name=Events.ITERATION_COMPLETED(every=100))

    tb_logger.attach(
        trainer,
        log_handler=GradsHistHandler(model, whitelist=["fc2.weight"]),
        event_name=Events.ITERATION_COMPLETED(every=100),
    )

    def score_function(engine):
        return engine.state.metrics["accuracy"]

    model_checkpoint = ModelCheckpoint(
        log_dir,
        n_saved=2,
        filename_prefix="best",
        score_function=score_function,
        score_name="validation_accuracy",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint, {"model": model})

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    tb_logger.close()
Example 8
def auto_dataloader(dataset, **kwargs):
    """Helper method to create a dataloader adapted for non-distributed and distributed configurations (supporting
    all available backends from :meth:`~ignite.distributed.utils.available_backends()`).

    Internally, we create a dataloader with provided kwargs while applying the following updates:

    - batch size is scaled by the world size: ``batch_size / world_size``.
    - number of workers is scaled by the number of local processes: ``num_workers / nprocs``.
    - if no sampler is provided by the user, a `torch DistributedSampler`_ is set up.
    - if a sampler is provided by the user, it is wrapped by :class:`~ignite.distributed.auto.DistributedProxySampler`.

    .. warning::

        Custom batch sampler is not adapted for distributed configuration. Please, make sure that provided batch
        sampler is compatible with distributed configuration.

    Examples:

    .. code-block:: python

        import ignite.distributed as idist

        train_loader = idist.auto_dataloader(
            train_dataset,
            batch_size=32,
            num_workers=4,
            shuffle=True,
            pin_memory="cuda" in idist.device().type,
            drop_last=True,
        )

    Args:
        dataset (Dataset): input torch dataset
        **kwargs: keyword arguments for `torch DataLoader`_.

    Returns:
        `torch DataLoader`_ or `XLA MpDeviceLoader`_ for XLA devices

    .. _torch DataLoader: https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader
    .. _XLA MpDeviceLoader: https://github.com/pytorch/xla/blob/master/torch_xla/distributed/parallel_loader.py#L178
    .. _torch DistributedSampler:
        https://pytorch.org/docs/stable/data.html#torch.utils.data.distributed.DistributedSampler
    """
    rank = idist.get_rank()
    world_size = idist.get_world_size()

    logger = setup_logger(__name__ + ".auto_dataloader")

    if world_size > 1:
        if "batch_size" in kwargs:
            kwargs["batch_size"] //= world_size

        if "num_workers" in kwargs:
            nproc = idist.get_nproc_per_node()
            kwargs["num_workers"] = (kwargs["num_workers"] + nproc -
                                     1) // nproc

        if "batch_sampler" not in kwargs:
            if kwargs.get("sampler", None) is not None:
                sampler = DistributedProxySampler(kwargs["sampler"],
                                                  num_replicas=world_size,
                                                  rank=rank)
            else:
                sampler = DistributedSampler(dataset,
                                             num_replicas=world_size,
                                             rank=rank,
                                             shuffle=kwargs.get(
                                                 "shuffle", True))
                # we need to remove "shuffle" from kwargs if sampler is used
                if "shuffle" in kwargs:
                    del kwargs["shuffle"]

            kwargs["sampler"] = sampler
        else:
            warnings.warn(
                "Found batch_sampler in provided kwargs. Please, make sure that it is compatible "
                "with distributed configuration")

    if idist.has_xla_support and idist.backend() == idist_xla.XLA_TPU and kwargs.get("pin_memory", False):
        # TODO: How about XLA GPU ?
        warnings.warn(
            "Found incompatible options: xla support and pin_memory args equal True. "
            "Argument `pin_memory=False` will be used to construct data loader."
        )
        kwargs["pin_memory"] = False

    logger.info("Use data loader kwargs for dataset '{}': \n\t{}".format(
        repr(dataset)[:20].strip(), kwargs))
    dataloader = DataLoader(dataset, **kwargs)

    if idist.has_xla_support and idist.backend() == idist_xla.XLA_TPU and world_size > 1:

        logger.info("DataLoader is wrapped by `MpDeviceLoader` on XLA")

        mp_device_loader_cls = _MpDeviceLoader
        try:
            from torch_xla.distributed.parallel_loader import MpDeviceLoader

            mp_device_loader_cls = MpDeviceLoader
        except ImportError:
            pass

        sampler = dataloader.sampler
        dataloader = mp_device_loader_cls(dataloader, idist.device())
        dataloader.sampler = sampler

    return dataloader
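
For illustration, here is a minimal sketch of the per-process rescaling that the function above applies when world_size > 1; the setup values below are assumptions, while the arithmetic mirrors the code:

# Assumed setup: 2 nodes x 4 GPUs -> world_size=8, nproc_per_node=4
world_size, nproc_per_node = 8, 4
kwargs = {"batch_size": 256, "num_workers": 16}

# the global batch size is split across all processes
kwargs["batch_size"] //= world_size  # 256 // 8 -> 32 samples per process

# workers are split across the processes of one node (rounded up)
kwargs["num_workers"] = (kwargs["num_workers"] + nproc_per_node - 1) // nproc_per_node  # -> 4

print(kwargs)  # {'batch_size': 32, 'num_workers': 4}
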
Example 9
def run_training(local_rank: int, config: ConfigSchema) -> Dict[str, float]:
    rank = idist.get_rank()
    if config.seed is not None:
        manual_seed(config.seed + rank)

    logger = setup_logger(name=config.experiment_name, distributed_rank=local_rank)

    log_basic_info(logger, config)

    if rank == 0:
        prepare_output_directory(config)
        logger.info("Output path: {}".format(config.output_path))

    weak_label_mgr = get_weak_label_manager(config)

    # Setup dataflow, model, optimizer, criterion
    data_loaders = get_dataflow(config, weak_label_mgr)
    train_loader = data_loaders["train"]
    config.num_iters_per_epoch = len(train_loader)

    model, optimizer, criterion = initialize(config, weak_label_mgr)

    metrics = get_metrics(criterion)
    trainer, evaluators = create_trainer_and_evaluators(
        model, optimizer, criterion, data_loaders, metrics, config, logger
    )

    if rank == 0:
        tb_logger = common.setup_tb_logging(
            config.output_path, trainer, optimizer, evaluators=evaluators
        )

    # Store 3 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluators["val"],
        models={"model": model},
        metric_name="accuracy",
        n_saved=3,
        trainer=trainer,
        tag="test",
    )
    state_at_best_val = StateAtBestVal(
        score_function=lambda: evaluators["val"].state.metrics["accuracy"],
        state_function=lambda: dict(
            {"val_" + key: val for key, val in evaluators["val"].state.metrics.items()},
            **{
                "test_" + key: val
                for key, val in evaluators["test"].state.metrics.items()
            },
            epoch=trainer.state.epoch
        ),
    )
    trainer.add_event_handler(Events.EPOCH_COMPLETED, state_at_best_val)

    try:
        trainer.run(train_loader, max_epochs=config.num_epochs)
    except Exception:
        import traceback

        print(traceback.format_exc())
    else:
        assert state_at_best_val.best_state is not None
        tb_logger.writer.add_hparams(  # type: ignore
            get_hparams(config),
            {"hparam/" + key: val for key, val in state_at_best_val.best_state.items()},
        )
    finally:
        if rank == 0:
            tb_logger.close()  # type: ignore

    return evaluators["val"].state.metrics
Example 10
def run(mode, cfg):
    device = 'cuda' if cfg.SYSTEM.USE_CUDA else 'cpu'
    print(cfg.MODEL.NAME)
    model = ResNext()
    train_loader, val_loader, test_loader = get_data_loaders(
        cfg.TRAIN, cfg.EVALUATE, cfg.TEST, cfg.AUGMENT)
    if cfg.MODEL.CHECKPOINT:
        model.load_state_dict(torch.load(cfg.MODEL.CHECKPOINT))
        print(
            f"Loaded {cfg.MODEL.NAME} weights ({cfg.MODEL.CHECKPOINT}) successfully!"
        )
    loss = torch.nn.CrossEntropyLoss()
    pbar = ProgressBar()

    if mode == 'train':
        timestamp = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
        dir_name = f'{timestamp}_{cfg.MODEL.NAME}{cfg.TAG}'
        writer = create_summary_writer(model, train_loader, f"runs/{dir_name}")

        optimizer = RAdam(model.parameters(), lr=cfg.OPTIM.INIT_LR)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    cfg.LR_SCHEDULER.STEP_SIZE)
        trainer = create_supervised_trainer(model, optimizer, loss, device)
        trainer.logger = setup_logger("trainer")
        pbar.attach(trainer)

        evaluator = create_supervised_evaluator(model, {"TOP_1": TOP_1()},
                                                device)
        evaluator.logger = setup_logger("evaluator")
        pbar.attach(evaluator)

        model_saver = ModelCheckpoint(f"checkpoints/{dir_name}",
                                      f"{cfg.MODEL.NAME}{cfg.TAG}",
                                      n_saved=10,
                                      create_dir=True)

        @trainer.on(Events.ITERATION_COMPLETED)
        def log_training_loss(engine):
            trainer.logger.info(trainer.state)
            trainer.logger.info("Epoch[{}_{}] Loss: {:.2f}".format(
                trainer.state.epoch, trainer.state.iteration,
                trainer.state.output))
            writer.add_scalar("training/loss", trainer.state.output,
                              trainer.state.iteration)
            writer.add_scalar("training/lr", optimizer.param_groups[0]['lr'],
                              trainer.state.iteration)

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_training_results(engine):
            model_saver(engine, {"model": model})
            trainer.logger.info("Model saved!")
            scheduler.step()

        @trainer.on(Events.EPOCH_COMPLETED(every=cfg.EVAL_MODEL_EVERY_EPOCH))
        def log_validation_results(engine):
            evaluator.run(train_loader)
            metrics = evaluator.state.metrics
            evaluator.logger.info(
                "Training Results - Epoch: {} TOP_1: {:.2f}".format(
                    trainer.state.epoch, metrics['TOP_1']))
            writer.add_scalar("training/TOP_1", metrics['TOP_1'],
                              trainer.state.iteration)

            evaluator.run(val_loader)
            metrics = evaluator.state.metrics
            evaluator.logger.info(
                "Validation Results - Epoch: {} TOP_1: {:.2f}".format(
                    trainer.state.epoch, metrics['TOP_1']))
            writer.add_scalar("validation/TOP_1", metrics['TOP_1'],
                              trainer.state.iteration)

        trainer.run(train_loader, max_epochs=cfg.EPOCH)

    elif mode == 'infer':
        predictor = create_supervised_evaluator(model,
                                                {"Predict": Predict(cfg.TEST)},
                                                device)
        pbar.attach(predictor)
        predictor.logger = setup_logger("predictor")
        predictor.run(test_loader)
        predictor.logger.info("Inference Done.")

    elif mode == 'eval':
        evaluator = create_supervised_evaluator(model, {"TOP_1": TOP_1()},
                                                device)
        pbar.attach(evaluator)
        evaluator.logger = setup_logger('evaluator')
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        evaluator.logger.info("Validation Results - TOP_1: {:.2f}".format(
            metrics['TOP_1']))
Example 11
def run(
    data_path="/tmp/MNIST",
    seed=3321,
    mode="xentropy",
    noise_fraction=0.35,
    batch_size=64,
    val_batch_size=1000,
    num_epochs=50,
    lr=0.01,
    momentum=0.5,
    as_pseudo_label=None,
    log_dir="/tmp/output-bootstraping-loss/mnist/",
    with_trains=False,
):
    """Training on noisy labels with bootstrapping

    Args:
        data_path (str): Path to MNIST dataset. Default, "/tmp/MNIST"
        seed (int): Random seed to setup. Default, 3321
        mode (str): Loss function mode: cross-entropy or bootstrapping (soft, hard). 
            Choices 'xentropy', 'soft_bootstrap', 'hard_bootstrap'.
        noise_fraction (float): Label noise fraction. Default, 0.35.
        batch_size (int): Input batch size for training. Default, 64.
        val_batch_size (int): input batch size for validation. Default, 1000.
        num_epochs (int): Number of epochs to train. Default, 50.
        lr (float): Learning rate. Default, 0.01.
        momentum (float): SGD momentum. Default, 0.5.
        log_dir (str): Log directory for Tensorboard log output. Default="/tmp/output-bootstraping-loss/mnist/".
        with_trains (bool): if True, experiment Trains logger is setup. Default, False.

    """
    assert torch.cuda.is_available(), "Training should run on GPU"
    device = "cuda"

    manual_seed(seed)
    logger = setup_logger(name="MNIST-Training")

    now = datetime.now().strftime("%Y%m%d-%H%M%S")

    # Setup output path
    suffix = ""
    if mode == "soft_bootstrap" and (as_pseudo_label is not None
                                     and not as_pseudo_label):
        suffix = "as_xreg"
    output_path = Path(log_dir) / "train_{}_{}_{}_{}__{}".format(
        mode, noise_fraction, suffix, now, num_epochs)

    if not output_path.exists():
        output_path.mkdir(parents=True)

    parameters = {
        "seed": seed,
        "mode": mode,
        "noise_fraction": noise_fraction,
        "batch_size": batch_size,
        "num_epochs": num_epochs,
        "lr": lr,
        "momentum": momentum,
        "as_pseudo_label": as_pseudo_label,
    }
    log_basic_info(logger, parameters)

    if with_trains:
        from trains import Task

        task = Task.init("BootstrappingLoss - Experiments on MNIST",
                         task_name=output_path.name)
        # Log hyper parameters
        task.connect(parameters)

    train_loader, test_loader = get_data_loaders(data_path, noise_fraction,
                                                 batch_size, val_batch_size)
    model = Net().to(device)
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)

    if mode == 'xentropy':
        criterion = nn.CrossEntropyLoss()
    elif mode == 'soft_bootstrap':
        if as_pseudo_label is None:
            as_pseudo_label = True
        criterion = SoftBootstrappingLoss(beta=0.95,
                                          as_pseudo_label=as_pseudo_label)
    elif mode == 'hard_bootstrap':
        criterion = HardBootstrappingLoss(beta=0.8)
    else:
        raise ValueError(
            "Wrong mode {}, expected: xentropy, soft_bootstrap or hard_bootstrap"
            .format(mode))

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device,
                                        non_blocking=True)

    metrics = {
        "Accuracy": Accuracy(),
        "{} loss".format(mode): Loss(criterion),
    }
    if mode != "xentropy":
        metrics["xentropy loss"] = Loss(nn.CrossEntropyLoss())

    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, "Train", state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, "Test", state.metrics)

    trainer.add_event_handler(Events.EPOCH_COMPLETED | Events.COMPLETED,
                              run_validation)

    evaluators = {"training": train_evaluator, "test": evaluator}
    tb_logger = common.setup_tb_logging(output_path,
                                        trainer,
                                        optimizer,
                                        evaluators=evaluators)

    trainer.run(train_loader, max_epochs=num_epochs)

    test_acc = evaluator.state.metrics["Accuracy"]
    tb_logger.writer.add_hparams(parameters,
                                 {"hparam/test_accuracy": test_acc})

    tb_logger.close()

    return (mode, noise_fraction, as_pseudo_label, test_acc)
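
The example above depends on SoftBootstrappingLoss and HardBootstrappingLoss, whose implementations are not shown here. As a hedged sketch only (assuming the formulation of Reed et al., "Training Deep Neural Networks on Noisy Labels with Bootstrapping"), the soft variant blends the one-hot target with the model's own prediction inside the cross-entropy term:

import torch.nn.functional as F

def soft_bootstrapping_loss_sketch(logits, target, beta=0.95, as_pseudo_label=True):
    # Sketch under stated assumptions, not the library implementation.
    # logits: (N, C) raw scores; target: (N,) integer class labels.
    # L = -mean_n sum_k (beta * y_nk + (1 - beta) * p_nk) * log p_nk
    log_p = F.log_softmax(logits, dim=1)
    p = log_p.exp()
    if as_pseudo_label:
        # Assumption: as_pseudo_label treats the prediction as a fixed pseudo-label,
        # i.e. no gradient flows through the bootstrap term.
        p = p.detach()
    y = F.one_hot(target, num_classes=logits.size(1)).float()
    return -((beta * y + (1.0 - beta) * p) * log_p).sum(dim=1).mean()
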
Example 12
    def prepare_batch(batch, device=None, non_blocking: bool = False):
        # The original snippet is truncated here; this is a plausible reconstruction
        # that unpacks the nested batch structure ((tokens, lengths), pos, chunk), targets
        # and returns it in the shape expected by the model and the criterion.
        inputs, target_sequence = batch
        input_seq, input_pos, input_chunk = inputs
        input_sequence, input_length = input_seq

        x_inputs = ((input_sequence, input_length), input_pos, input_chunk)

        return x_inputs, target_sequence

    model = BiLSTMCRF(config).to(config.device)
    optimizer = torch.optim.Adam(params=model.parameters(),
                                 lr=config.learn.learning_rate,
                                 weight_decay=config.learn.weight_decay)
    criterion = SequenceCRFLoss(config).to(config.device)

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        prepare_batch=prepare_batch,
                                        device=config.device)

    # Create an object of the profiler and attach an engine to it
    profiler = BasicTimeProfiler()
    profiler.attach(trainer)

    trainer.logger = setup_logger("trainer")

    trainer.run(train_loader, max_epochs=100)
    pass
Example 13
def training(local_rank, config):
    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    # Define output folder:
    config.output = "/tmp/output"

    model = idist.auto_model(config.model)
    optimizer = idist.auto_optim(config.optimizer)
    criterion = config.criterion

    train_set, val_set = config.train_set, config.val_set
    train_loader = idist.auto_dataloader(train_set,
                                         batch_size=config.train_batch_size)
    val_loader = idist.auto_dataloader(val_set,
                                       batch_size=config.val_batch_size)

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("Trainer")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model,
                                                       metrics=metrics,
                                                       device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED(every=config.val_interval))
    def compute_metrics(engine):
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    if rank == 0:
        tb_logger = TensorboardLogger(log_dir=config.output)

        tb_logger.attach_output_handler(
            trainer,
            event_name=Events.ITERATION_COMPLETED(every=100),
            tag="training",
            output_transform=lambda loss: {"batchloss": loss},
            metric_names="all",
        )

        for tag, evaluator in [("training", train_evaluator),
                               ("validation", validation_evaluator)]:
            tb_logger.attach_output_handler(
                evaluator,
                event_name=Events.EPOCH_COMPLETED,
                tag=tag,
                metric_names=["loss", "accuracy"],
                global_step_transform=global_step_from_engine(trainer),
            )

        tb_logger.attach_opt_params_handler(
            trainer,
            event_name=Events.ITERATION_COMPLETED(every=100),
            optimizer=optimizer)

    model_checkpoint = ModelCheckpoint(
        config.output,
        n_saved=2,
        filename_prefix="best",
        score_name="accuracy",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint,
                                           {"model": model})

    trainer.run(train_loader, max_epochs=config.num_epochs)

    if rank == 0:
        tb_logger.close()
Example 14
def run(config):
    train_loader = get_instance(utils, 'dataloader', config, 'train')
    val_loader = get_instance(utils, 'dataloader', config, 'val')

    model = get_instance(models, 'arch', config)

    model = init_model(model, train_loader)
    model, device = ModelPrepper(model, config).out

    loss_fn = get_instance(nn, 'loss_fn', config)

    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = get_instance(torch.optim, 'optimizer', config,
                             trainable_params)

    writer = create_summary_writer(config, model, train_loader)
    batch_size = config['dataloader']['args']['batch_size']

    if config['mode'] == 'eval' or config['resume']:
        model.load_state_dict(torch.load(config['ckpt_path']))

    epoch_length = int(ceil(len(train_loader) / batch_size))
    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0,
                leave=False,
                total=epoch_length,
                desc=desc.format(0))

    def process_batch(engine, batch):
        inputs, outputs = func(batch)
        model.train()
        model.zero_grad()
        optimizer.zero_grad()
        preds = model(inputs)
        loss = loss_fn(preds, outputs.to(device))

        a = list(model.parameters())[0].clone()

        loss.backward()
        optimizer.step()

        # check if training is happening
        b = list(model.parameters())[0].clone()
        try:
            assert not torch.allclose(a.data,
                                      b.data), 'Model not updating anymore'
        except AssertionError:
            plot_grad_flow(model.named_parameters())

        return loss.item()

    def predict_on_batch(engine, batch):
        inputs, outputs = func(batch)
        model.eval()
        with torch.no_grad():
            y_pred = model(inputs)

        return inputs, y_pred, outputs.to(device)

    trainer = Engine(process_batch)
    trainer.logger = setup_logger("trainer")
    evaluator = Engine(predict_on_batch)
    evaluator.logger = setup_logger("evaluator")

    if config['task'] == 'actionpred':
        Accuracy(output_transform=lambda x: (x[1], x[2])).attach(
            evaluator, 'val_acc')

    if config['task'] == 'gazepred':
        MeanSquaredError(output_transform=lambda x: (x[1], x[2])).attach(
            evaluator, 'val_MSE')

    RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')

    training_saver = ModelCheckpoint(config['checkpoint_dir'],
                                     filename_prefix='checkpoint_' +
                                     config['task'],
                                     n_saved=1,
                                     atomic=True,
                                     save_as_state_dict=True,
                                     create_dir=True,
                                     require_empty=False)

    trainer.add_event_handler(Events.EPOCH_COMPLETED, training_saver,
                              {'model': model})

    @trainer.on(Events.ITERATION_COMPLETED)
    def tb_log(engine):
        pbar.desc = desc.format(engine.state.output)
        pbar.update(1)
        writer.add_scalar('training/avg_loss', engine.state.metrics['loss'],
                          engine.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED)
    def print_trainer_logs(engine):
        pbar.refresh()

        avg_loss = engine.state.metrics['loss']
        tqdm.write('Trainer Results - Epoch {} - Avg loss: {:.2f} \n'.format(
            engine.state.epoch, avg_loss))
        viz_param(writer=writer, model=model, global_step=engine.state.epoch)

        pbar.n = pbar.last_print_n = 0

    @evaluator.on(Events.EPOCH_COMPLETED)
    def print_result(engine):
        try:
            print('Evaluator Results - Accuracy {} \n'.format(
                engine.state.metrics['val_acc']))
        except KeyError:
            print('Evaluator Results - MSE {} \n'.format(
                engine.state.metrics['val_MSE']))

    @evaluator.on(Events.ITERATION_COMPLETED)
    def viz_outputs(engine):
        visualize_outputs(writer=writer,
                          state=engine.state,
                          task=config['task'])

    if config['mode'] == 'train':
        trainer.run(train_loader,
                    max_epochs=config['epochs'],
                    epoch_length=epoch_length)

    pbar.close()

    evaluator.run(val_loader,
                  max_epochs=1,
                  epoch_length=int(ceil(len(val_loader) / batch_size)))

    writer.flush()
    writer.close()
Example 15
def training(local_rank, config):

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = "stop-on-{}".format(config["stop_iteration"])

        folder_name = "{}_backend-{}-{}_{}".format(config["model"], idist.backend(), idist.get_world_size(), now)
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info("Output path: {}".format(config["output_path"]))

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler, train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "accuracy": Accuracy(),
        "loss": Loss(criterion),
    }

    # We define two evaluators as they won't have exactly the same role:
    # - `evaluator` will save the best model based on the validation score
    evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators)

        if config["with_trains"]:
            trains_logger = common.setup_trains_logging(
                trainer,
                optimizer,
                evaluators=evaluators,
                project_name="cifar10-ignite",
                task_name=Path(output_path).stem,
            )

    # Store 3 best models by validation accuracy:
    common.save_best_model_by_val_score(
        output_path, evaluator, model=model, metric_name="accuracy", n_saved=3, trainer=trainer, tag="test"
    )

    # To be able to check training resuming, we can stop training at a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info("Stop training on {} iteration".format(trainer.state.iteration))
            trainer.terminate()

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()
        if config["with_trains"]:
            trains_logger.close()
Example n. 16
def _setup_a_logger_and_dump(name, message):
    logger = setup_logger(name)
    logger.info(message)
Example n. 17
def run(args, seed):
    config.make_paths()

    torch.random.manual_seed(seed)
    train_loader, val_loader, shape = get_data_loaders(
        config.Training.batch_size,
        proportion=config.Training.proportion,
        test_batch_size=config.Training.batch_size * 2,
    )
    n, d, t = shape
    model = models.ConvNet(d, seq_len=t)

    writer = tb.SummaryWriter(log_dir=config.TENSORBOARD)

    model.to(config.device)  # Move model before creating optimizer
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.MSELoss()

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=config.device)
    trainer.logger = setup_logger("trainer")

    checkpointer = ModelCheckpoint(
        config.MODEL,
        model.__class__.__name__,
        n_saved=2,
        create_dir=True,
        save_as_state_dict=True,
    )
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config.Training.save_every),
        checkpointer,
        {"model": model},
    )

    val_metrics = {
        "mse": Loss(criterion),
        "mae": MeanAbsoluteError(),
        "rmse": RootMeanSquaredError(),
    }

    evaluator = create_supervised_evaluator(model,
                                            metrics=val_metrics,
                                            device=config.device)
    evaluator.logger = setup_logger("evaluator")

    ar_evaluator = create_ar_evaluator(model,
                                       metrics=val_metrics,
                                       device=config.device)
    ar_evaluator.logger = setup_logger("ar")

    @trainer.on(Events.EPOCH_COMPLETED(every=config.Training.save_every))
    def log_ar(engine):
        ar_evaluator.run(val_loader)
        y_pred, y = ar_evaluator.state.output
        fig = plot_output(y, y_pred)
        writer.add_figure("eval/ar", fig, engine.state.epoch)
        plt.close()

    # desc = "ITERATION - loss: {:.2f}"
    # pbar = tqdm(initial=0, leave=False, total=len(train_loader), desc=desc.format(0))

    @trainer.on(Events.ITERATION_COMPLETED(every=config.Training.log_every))
    def log_training_loss(engine):
        # pbar.desc = desc.format(engine.state.output)
        # pbar.update(log_interval)
        if args.verbose:
            grad_norm = torch.stack(
                [p.grad.norm() for p in model.parameters()]).sum()
            writer.add_scalar("train/grad_norm", grad_norm,
                              engine.state.iteration)
        writer.add_scalar("train/loss", engine.state.output,
                          engine.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED(every=config.Training.eval_every))
    def log_training_results(engine):
        # pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        for k, v in metrics.items():
            writer.add_scalar(f"train/{k}", v, engine.state.epoch)
        # tqdm.write(
        #    f"Training Results - Epoch: {engine.state.epoch}  Avg mse: {evaluator.state.metrics['mse']:.2f}"
        # )

    @trainer.on(Events.EPOCH_COMPLETED(every=config.Training.eval_every))
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics

        for k, v in metrics.items():
            writer.add_scalar(f"eval/{k}", v, engine.state.epoch)
        # tqdm.write(
        #    f"Validation Results - Epoch: {engine.state.epoch}  Avg mse: {evaluator.state.metrics['mse']:.2f}"
        # )

        # pbar.n = pbar.last_print_n = 0

        y_pred, y = evaluator.state.output

        fig = plot_output(y, y_pred)
        writer.add_figure("eval/preds", fig, engine.state.epoch)
        plt.close()

    # @trainer.on(Events.EPOCH_COMPLETED | Events.COMPLETED)
    # def log_time(engine):
    #    #tqdm.write(
    #    #    f"{trainer.last_event_name.name} took {trainer.state.times[trainer.last_event_name.name]} seconds"
    #    #)
    if args.ckpt is not None:
        ckpt = torch.load(args.ckpt)
        ModelCheckpoint.load_objects({"model": model}, ckpt)

    try:
        trainer.run(train_loader, max_epochs=config.Training.max_epochs)
    except Exception as e:
        import traceback

        print(traceback.format_exc())

    # pbar.close()
    writer.close()
Example n. 18
def training(local_rank, config):

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-QAT-Training",
                          distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        now = datetime.now().strftime("%Y%m%d-%H%M%S")

        folder_name = "{}_backend-{}-{}_{}".format(config["model"],
                                                   idist.backend(),
                                                   idist.get_world_size(), now)
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info("Output path: {}".format(config["output_path"]))

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler,
                             train_loader.sampler, config, logger)

    # Let's now set up an evaluator engine to perform the model's validation and compute metrics
    metrics = {
        "Accuracy": Accuracy(),
        "Loss": Loss(criterion),
    }

    # We define two evaluators, as they won't have exactly the same role:
    # - `evaluator` is used to save the best model based on the validation score
    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path,
                                            trainer,
                                            optimizer,
                                            evaluators=evaluators)

    # Store the best model by validation accuracy:
    common.save_best_model_by_val_score(
        output_path=config["output_path"],
        evaluator=evaluator,
        model=model,
        metric_name="Accuracy",
        n_saved=1,
        trainer=trainer,
        tag="test",
    )

    trainer.run(train_loader, max_epochs=config["num_epochs"])

    if rank == 0:
        tb_logger.close()
Example n. 19
def training(local_rank, cfg):

    logger = setup_logger("FixMatch Training", distributed_rank=idist.get_rank())

    if local_rank == 0:
        logger.info(cfg.pretty())

    rank = idist.get_rank()
    manual_seed(cfg.seed + rank)
    device = idist.device()

    model, ema_model, optimizer, sup_criterion, lr_scheduler = utils.initialize(cfg)

    unsup_criterion = instantiate(cfg.solver.unsupervised_criterion)

    cta = get_default_cta()

    (
        supervised_train_loader,
        test_loader,
        unsup_train_loader,
        cta_probe_loader,
    ) = utils.get_dataflow(cfg, cta=cta, with_unsup=True)

    def train_step(engine, batch):
        model.train()
        optimizer.zero_grad()

        x, y = batch["sup_batch"]["image"], batch["sup_batch"]["target"]
        if x.device != device:
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)

        weak_x, strong_x = (
            batch["unsup_batch"]["image"],
            batch["unsup_batch"]["strong_aug"],
        )
        if weak_x.device != device:
            weak_x = weak_x.to(device, non_blocking=True)
            strong_x = strong_x.to(device, non_blocking=True)

        # according to TF code: single forward pass on concat data: [x, weak_x, strong_x]
        le = 2 * engine.state.mu_ratio + 1
        # Why interleave: https://github.com/google-research/fixmatch/issues/20#issuecomment-613010277
        # We need to interleave due to multiple-GPU batch norm issues. Let's say we have two GPUs, and our batch is
        # comprised of labeled (L) and unlabeled (U) images. Let's use a labeled batch size of 2 to make the
        # following example easier to visualize.
        #
        # - Without interleaving, we have a batch LLUUUUUU...U (there are 14 U). When the batch is split to be passed
        # to both GPUs, we'll have two batches LLUUUUUU and UUUUUUUU. Note that all labeled examples ended up in batch1
        # sent to GPU1. The problem here is that batch norm will be computed per batch and the moments will lack
        # consistency between batches.
        #
        # - With interleaving, by contrast, the two batches will be LUUUUUUU and LUUUUUUU. As you can notice the
        # batches have the same distribution of labeled and unlabeled samples and will therefore have more consistent
        # moments.
        #
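        # Illustrative arithmetic (hypothetical numbers, not taken from this config): with a labeled
        # batch size B = 2 and mu_ratio = 7, weak_x and strong_x each hold 14 samples, so x_cat below
        # has B * le = 2 * 15 = 30 samples. interleave() is assumed to reorder them so that any
        # contiguous split of the batch keeps the same labeled/unlabeled mix across devices, and
        # deinterleave() restores the original [x, weak_x, strong_x] ordering afterwards.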
        x_cat = interleave(torch.cat([x, weak_x, strong_x], dim=0), le)
        y_pred_cat = model(x_cat)
        y_pred_cat = deinterleave(y_pred_cat, le)

        idx1 = len(x)
        idx2 = idx1 + len(weak_x)
        y_pred = y_pred_cat[:idx1, ...]
        y_weak_preds = y_pred_cat[idx1:idx2, ...]  # logits_weak
        y_strong_preds = y_pred_cat[idx2:, ...]  # logits_strong

        # supervised learning:
        sup_loss = sup_criterion(y_pred, y)

        # unsupervised learning:
        y_weak_probas = torch.softmax(y_weak_preds, dim=1).detach()
        y_pseudo = y_weak_probas.argmax(dim=1)
        max_y_weak_probas, _ = y_weak_probas.max(dim=1)
        unsup_loss_mask = (
            max_y_weak_probas >= engine.state.confidence_threshold
        ).float()
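        # FixMatch-style masking: only unlabeled samples whose weakly-augmented prediction exceeds
        # the confidence threshold contribute to the unsupervised loss below.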
        unsup_loss = (
            unsup_criterion(y_strong_preds, y_pseudo) * unsup_loss_mask
        ).mean()

        total_loss = sup_loss + engine.state.lambda_u * unsup_loss

        total_loss.backward()

        optimizer.step()

        return {
            "total_loss": total_loss.item(),
            "sup_loss": sup_loss.item(),
            "unsup_loss": unsup_loss.item(),
            "mask": unsup_loss_mask.mean().item(),  # this should not be averaged for DDP
        }

    output_names = ["total_loss", "sup_loss", "unsup_loss", "mask"]

    trainer = trainers.create_trainer(
        train_step,
        output_names=output_names,
        model=model,
        ema_model=ema_model,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        supervised_train_loader=supervised_train_loader,
        test_loader=test_loader,
        cfg=cfg,
        logger=logger,
        cta=cta,
        unsup_train_loader=unsup_train_loader,
        cta_probe_loader=cta_probe_loader,
    )

    trainer.state.confidence_threshold = cfg.ssl.confidence_threshold
    trainer.state.lambda_u = cfg.ssl.lambda_u
    trainer.state.mu_ratio = cfg.ssl.mu_ratio

    distributed = idist.get_world_size() > 1

    @trainer.on(Events.ITERATION_COMPLETED(every=cfg.ssl.cta_update_every))
    def update_cta_rates():
        batch = trainer.state.batch
        x, y = batch["cta_probe_batch"]["image"], batch["cta_probe_batch"]["target"]
        if x.device != device:
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)

        policies = batch["cta_probe_batch"]["policy"]

        ema_model.eval()
        with torch.no_grad():
            y_pred = ema_model(x)
            y_probas = torch.softmax(y_pred, dim=1)  # (N, C)

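            # Update the CTA rates from the probe predictions: directly in the single-process case,
            # or by packing per-op errors into tensors and all-gathering them across ranks otherwise.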
            if not distributed:
                for y_proba, t, policy in zip(y_probas, y, policies):
                    error = y_proba
                    error[t] -= 1
                    error = torch.abs(error).sum()
                    cta.update_rates(policy, 1.0 - 0.5 * error.item())
            else:
                error_per_op = []
                for y_proba, t, policy in zip(y_probas, y, policies):
                    error = y_proba
                    error[t] -= 1
                    error = torch.abs(error).sum()
                    for k, bins in policy:
                        error_per_op.append(pack_as_tensor(k, bins, error))
                error_per_op = torch.stack(error_per_op)
                # all gather
                tensor_list = idist.all_gather(error_per_op)
                # update cta rates
                for t in tensor_list:
                    k, bins, error = unpack_from_tensor(t)
                    cta.update_rates([(k, bins),], 1.0 - 0.5 * error)

    epoch_length = cfg.solver.epoch_length
    num_epochs = cfg.solver.num_epochs if not cfg.debug else 2
    try:
        trainer.run(
            supervised_train_loader, epoch_length=epoch_length, max_epochs=num_epochs
        )
    except Exception as e:
        import traceback

        print(traceback.format_exc())
Example n. 20
def __init__(self, output_transform: Callable = lambda x: x):
    self.logger = setup_logger(__name__ + "." + self.__class__.__name__)
    self.logger.addHandler(logging.StreamHandler())
    self._output_transform = output_transform
Example n. 21
def auto_model(model: nn.Module) -> nn.Module:
    """Helper method to adapt provided model for non-distributed and distributed configurations (supporting
    all available backends from :meth:`~ignite.distributed.utils.available_backends()`).

    Internally, we perform the following:

    - send the model to the current :meth:`~ignite.distributed.utils.device()` if the model's parameters are not on the device.
    - wrap the model in `torch DistributedDataParallel`_ for native torch distributed if the world size is larger than 1.
    - wrap the model in `torch DataParallel`_ if no distributed context is found and more than one CUDA device is available.
    - broadcast the initial variable states from rank 0 to all other processes if the Horovod distributed framework is used.

    Examples:

    .. code-block:: python

        import ignite.distributed as idist

        model = idist.auto_model(model)

    In addition, with NVidia/Apex it can be used in the following way:

    .. code-block:: python

        import ignite.distributed as idist

        model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
        model = idist.auto_model(model)

    Args:
        model (torch.nn.Module): model to adapt.

    Returns:
        torch.nn.Module

    .. _torch DistributedDataParallel: https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel
    .. _torch DataParallel: https://pytorch.org/docs/stable/nn.html#torch.nn.DataParallel
    """
    logger = setup_logger(__name__ + ".auto_model")

    # Move the model's parameters to the device if they are not already there
    device = idist.device()
    if not all([p.device == device for p in model.parameters()]):
        model.to(device)

    # distributed data parallel model
    if idist.get_world_size() > 1:
        bnd = idist.backend()
        if idist.has_native_dist_support and bnd == idist_native.NCCL:
            lrank = idist.get_local_rank()
            logger.info("Apply torch DistributedDataParallel on model, device id: {}".format(lrank))
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[lrank])
        elif idist.has_native_dist_support and bnd == idist_native.GLOO:
            logger.info("Apply torch DistributedDataParallel on model")
            model = torch.nn.parallel.DistributedDataParallel(model)
        elif idist.has_hvd_support and bnd == idist_hvd.HOROVOD:
            import horovod.torch as hvd

            logger.info(
                "Broadcast the initial variable states from rank 0 to all other processes"
            )
            hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    # not distributed but multiple GPUs reachable so data parallel model
    elif torch.cuda.device_count() > 1 and "cuda" in idist.device().type:
        logger.info("Apply torch DataParallel on model")
        model = torch.nn.parallel.DataParallel(model)

    return model
Example n. 22
                                                              127:128])
    loss.backward()
    optimizer.step()
    return loss.item()


# Create Trainer or Evaluators
trainer = Engine(backprop_step)
train_evaluator = create_supervised_evaluator(model,
                                              metrics=metrics,
                                              device=device)
validation_evaluator = create_supervised_evaluator(model,
                                                   metrics=metrics,
                                                   device=device)

trainer.logger = setup_logger("Trainer")
train_evaluator.logger = setup_logger("Train Evaluator")
validation_evaluator.logger = setup_logger("Validation Evaluator")


# Tensorboard Logger setup below based on pytorch ignite example
# https://github.com/pytorch/ignite/blob/master/examples/contrib/mnist/mnist_with_tensorboard_logger.py
@trainer.on(Events.EPOCH_COMPLETED)
def compute_metrics(engine):
    """Callback to compute metrics on the train and validation data."""
    train_evaluator.run(finetuning_loader)
    validation_evaluator.run(test_loader)
    scheduler.step(validation_evaluator.state.metrics['loss'])
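    # `scheduler` is assumed to be a ReduceLROnPlateau-style scheduler defined elsewhere in the
    # script, stepped here with the validation loss.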


def score_function(engine):
Example n. 23
def auto_dataloader(dataset: Dataset, **kwargs: Any) -> Union[DataLoader, "_MpDeviceLoader"]:
    """Helper method to create a dataloader adapted for non-distributed and distributed configurations (supporting
    all available backends from :meth:`~ignite.distributed.utils.available_backends()`).

    Internally, we create a dataloader with provided kwargs while applying the following updates:

    - batch size is scaled by world size: ``batch_size / world_size`` if larger than or equal to the world size.
    - number of workers is scaled by the number of local processes: ``num_workers / nprocs`` if larger than or equal to ``nprocs``.
    - if no sampler provided by user, a `torch DistributedSampler`_ is setup.
    - if a `torch DistributedSampler`_ is provided by user, it is used without wrapping it.
    - if another sampler is provided, it is wrapped by :class:`~ignite.distributed.auto.DistributedProxySampler`.
    - if the default device is 'cuda', `pin_memory` is automatically set to `True`.

    .. warning::

        Custom batch sampler is not adapted for distributed configuration. Please, make sure that provided batch
        sampler is compatible with distributed configuration.

    Args:
        dataset: input torch dataset. If input dataset is `torch IterableDataset`_ then dataloader will be
            created without any distributed sampling. Please, make sure that the dataset itself produces
            different data on different ranks.
        kwargs: keyword arguments for `torch DataLoader`_.

    Returns:
        `torch DataLoader`_ or `XLA MpDeviceLoader`_ for XLA devices

    Examples:
        .. code-block:: python

            import ignite.distributed as idist

            train_loader = idist.auto_dataloader(
                train_dataset,
                batch_size=32,
                num_workers=4,
                shuffle=True,
                pin_memory="cuda" in idist.device().type,
                drop_last=True,
            )

    .. _torch DataLoader: https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader
    .. _XLA MpDeviceLoader: https://github.com/pytorch/xla/blob/master/torch_xla/distributed/parallel_loader.py#L178
    .. _torch DistributedSampler:
        https://pytorch.org/docs/stable/data.html#torch.utils.data.distributed.DistributedSampler
    .. _torch IterableDataset: https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset
    """
    rank = idist.get_rank()
    world_size = idist.get_world_size()

    logger = setup_logger(__name__ + ".auto_dataloader")
    if world_size > 1:
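        # Illustrative values (not from the original code): with batch_size=256, num_workers=16,
        # world_size=4 and 4 processes per node, each process ends up with batch_size=64 and
        # num_workers=4 after the scaling below.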
        if "batch_size" in kwargs and kwargs["batch_size"] >= world_size:
            kwargs["batch_size"] //= world_size

        nproc = idist.get_nproc_per_node()
        if "num_workers" in kwargs and kwargs["num_workers"] >= nproc:
            kwargs["num_workers"] = (kwargs["num_workers"] + nproc - 1) // nproc

        if "batch_sampler" not in kwargs:
            if isinstance(dataset, IterableDataset):
                logger.info(
                    "Found iterable dataset, dataloader will be created without any distributed sampling. "
                    "Please, make sure that the dataset itself produces different data on different ranks."
                )
            else:
                sampler: Optional[Union[DistributedProxySampler, DistributedSampler, Sampler]]
                sampler = kwargs.get("sampler", None)
                if isinstance(sampler, DistributedSampler):
                    if sampler.rank != rank:
                        warnings.warn(f"Found distributed sampler with rank={sampler.rank}, but process rank is {rank}")
                    if sampler.num_replicas != world_size:
                        warnings.warn(
                            f"Found distributed sampler with num_replicas={sampler.num_replicas}, "
                            f"but world size is {world_size}"
                        )
                elif sampler is None:
                    # removes "shuffle" from kwargs if sampler is used
                    shuffle = kwargs.pop("shuffle", True)
                    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank, shuffle=shuffle)
                else:
                    sampler = DistributedProxySampler(sampler, num_replicas=world_size, rank=rank)
                kwargs["sampler"] = sampler
        else:
            warnings.warn(
                "Found batch_sampler in provided kwargs. Please, make sure that it is compatible "
                "with distributed configuration"
            )

    if idist.has_xla_support and idist.backend() == idist_xla.XLA_TPU and kwargs.get("pin_memory", False):
        # TODO: How about XLA GPU ?
        warnings.warn(
            "Found incompatible options: xla support and pin_memory args equal True. "
            "Argument `pin_memory=False` will be used to construct data loader."
        )
        kwargs["pin_memory"] = False
    else:
        kwargs["pin_memory"] = kwargs.get("pin_memory", "cuda" in idist.device().type)

    logger.info(f"Use data loader kwargs for dataset '{repr(dataset)[:20].strip()}': \n\t{kwargs}")
    dataloader = DataLoader(dataset, **kwargs)

    if idist.has_xla_support and idist.backend() == idist_xla.XLA_TPU and world_size > 1:

        logger.info("DataLoader is wrapped by `MpDeviceLoader` on XLA")

        mp_device_loader_cls = _MpDeviceLoader
        try:
            from torch_xla.distributed.parallel_loader import MpDeviceLoader

            mp_device_loader_cls = MpDeviceLoader
        except ImportError:
            pass

        mp_dataloader = mp_device_loader_cls(dataloader, idist.device())
        mp_dataloader.sampler = dataloader.sampler  # type: ignore[attr-defined]
        return mp_dataloader

    return dataloader
Example n. 24
def test_dist_setup_logger():

    logger = setup_logger("trainer", level=logging.CRITICAL, distributed_rank=1)
    assert logger.level != logging.CRITICAL
Example n. 25
def run(train_batch_size, val_batch_size, epochs, lr, momentum):
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("Trainer")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model,
                                                       metrics=metrics,
                                                       device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    npt_logger = NeptuneLogger(
        api_token="ANONYMOUS",
        project_name="shared/pytorch-ignite-integration",
        name="ignite-mnist-example",
        params={
            "train_batch_size": train_batch_size,
            "val_batch_size": val_batch_size,
            "epochs": epochs,
            "lr": lr,
            "momentum": momentum,
        },
    )

    npt_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        tag="training",
        output_transform=lambda loss: {"batchloss": loss},
    )

    for tag, evaluator in [("training", train_evaluator),
                           ("validation", validation_evaluator)]:
        npt_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names=["loss", "accuracy"],
            global_step_transform=global_step_from_engine(trainer),
        )

    npt_logger.attach_opt_params_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        optimizer=optimizer)

    npt_logger.attach(trainer,
                      log_handler=WeightsScalarHandler(model),
                      event_name=Events.ITERATION_COMPLETED(every=100))

    npt_logger.attach(trainer,
                      log_handler=GradsScalarHandler(model),
                      event_name=Events.ITERATION_COMPLETED(every=100))

    def score_function(engine):
        return engine.state.metrics["accuracy"]

    handler = Checkpoint(
        {"model": model},
        NeptuneSaver(npt_logger),
        n_saved=2,
        filename_prefix="best",
        score_function=score_function,
        score_name="validation_accuracy",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.COMPLETED, handler)

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    npt_logger.close()
Example n. 26
def run(train_batch_size, val_batch_size, epochs, lr, momentum, log_dir):
    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("Trainer")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model,
                                                       metrics=metrics,
                                                       device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED)
    def compute_metrics(engine):
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    vd_logger = VisdomLogger(env="mnist_training")

    vd_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        tag="training",
        output_transform=lambda loss: {"batchloss": loss},
    )

    for tag, evaluator in [("training", train_evaluator),
                           ("validation", validation_evaluator)]:
        vd_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names=["loss", "accuracy"],
            global_step_transform=global_step_from_engine(trainer),
        )

    vd_logger.attach_opt_params_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=100),
        optimizer=optimizer)

    vd_logger.attach(trainer,
                     log_handler=WeightsScalarHandler(model),
                     event_name=Events.ITERATION_COMPLETED(every=100))

    vd_logger.attach(trainer,
                     log_handler=GradsScalarHandler(model),
                     event_name=Events.ITERATION_COMPLETED(every=100))

    def score_function(engine):
        return engine.state.metrics["accuracy"]

    model_checkpoint = ModelCheckpoint(
        log_dir,
        n_saved=2,
        filename_prefix="best",
        score_function=score_function,
        score_name="validation_accuracy",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint,
                                           {"model": model})

    # kick everything off
    trainer.run(train_loader, max_epochs=epochs)

    vd_logger.close()
Example n. 27
def run(epochs, lr, momentum, log_interval):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    net = Net().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=lr, momentum=momentum)

    trainer = create_supervised_trainer(net,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("trainer")

    val_metrics = {
        "accuracy": Accuracy(),
        "loss": Loss(criterion),
        "recall": Recall()
    }
    evaluator = create_supervised_evaluator(net,
                                            metrics=val_metrics,
                                            device=device)
    evaluator.logger = setup_logger("evaluator")

    # Attach handler to plot trainer's loss every 100 iterations
    tb_logger.attach_output_handler(
        trainer,
        event_name=Events.ITERATION_COMPLETED(every=params.get('loss_report')),
        tag="training",
        output_transform=lambda loss: {"loss": loss},
    )

    # Attach handler to dump evaluator's metrics every epoch completed
    for tag, evaluator in [("training", trainer), ("validation", evaluator)]:
        tb_logger.attach_output_handler(
            evaluator,
            event_name=Events.EPOCH_COMPLETED,
            tag=tag,
            metric_names="all",
            global_step_transform=global_step_from_engine(trainer),
        )

    # Attach function to build debug images and report every epoch end
    tb_logger.attach(
        evaluator,
        log_handler=predictions_gt_images_handler,
        event_name=Events.EPOCH_COMPLETED(once=1),
    )

    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0,
                leave=False,
                total=len(trainloader),
                desc=desc.format(0))

    @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
    def log_training_loss(engine):
        pbar.desc = desc.format(engine.state.output)
        pbar.update(log_interval)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(trainloader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["loss"]
        tqdm.write(
            "Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(testloader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["loss"]
        tqdm.write(
            "Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))

        pbar.n = pbar.last_print_n = 0

    @trainer.on(Events.EPOCH_COMPLETED | Events.COMPLETED)
    def log_time():
        tqdm.write("{} took {} seconds".format(
            trainer.last_event_name.name,
            trainer.state.times[trainer.last_event_name.name]))

    trainer.run(trainloader, max_epochs=epochs)
    pbar.close()

    PATH = './cifar_net.pth'
    torch.save(net.state_dict(), PATH)

    print('Finished Training')
    print('Task ID number is: {}'.format(task.id))
Example n. 28
def auto_model(model: nn.Module,
               sync_bn: bool = False,
               **kwargs: Any) -> nn.Module:
    """Helper method to adapt provided model for non-distributed and distributed configurations (supporting
    all available backends from :meth:`~ignite.distributed.utils.available_backends()`).

    Internally, we perform the following:

    - send the model to the current :meth:`~ignite.distributed.utils.device()` if the model's parameters are not on the device.
    - wrap the model in `torch DistributedDataParallel`_ for native torch distributed if the world size is larger than 1.
    - wrap the model in `torch DataParallel`_ if no distributed context is found and more than one CUDA device is available.
    - broadcast the initial variable states from rank 0 to all other processes if the Horovod distributed framework is used.

    Examples:

    .. code-block:: python

        import ignite.distributed as idist

        model = idist.auto_model(model)

    In addition, with NVidia/Apex it can be used in the following way:

    .. code-block:: python

        import ignite.distributed as idist

        model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
        model = idist.auto_model(model)

    Args:
        model: model to adapt.
        sync_bn: if True, applies `torch convert_sync_batchnorm`_ to the model for native torch
            distributed only. Default, False. Note, if using Nvidia/Apex, batchnorm conversion should be
            applied before calling ``amp.initialize``.
        kwargs: kwargs to model's wrapping class: `torch DistributedDataParallel`_ or `torch DataParallel`_
            if applicable. Please, make sure to use acceptable kwargs for given backend.

    Returns:
        torch.nn.Module

    .. _torch DistributedDataParallel: https://pytorch.org/docs/stable/generated/torch.nn.parallel.
        DistributedDataParallel.html
    .. _torch DataParallel: https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html
    .. _torch convert_sync_batchnorm: https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html#
        torch.nn.SyncBatchNorm.convert_sync_batchnorm

    .. versionchanged:: 0.4.2

        - Added Horovod distributed framework.
        - Added ``sync_bn`` argument.

    .. versionchanged:: 0.4.3
        Added kwargs to ``idist.auto_model``.
    """
    logger = setup_logger(__name__ + ".auto_model")

    # Move the model's parameters to the device if they are not already there
    device = idist.device()
    if not all([p.device == device for p in model.parameters()]):
        model.to(device)

    # distributed data parallel model
    if idist.get_world_size() > 1:
        bnd = idist.backend()
        if idist.has_native_dist_support and bnd in (idist_native.NCCL,
                                                     idist_native.GLOO,
                                                     idist_native.MPI):
            if sync_bn:
                logger.info("Convert batch norm to sync batch norm")
                model = nn.SyncBatchNorm.convert_sync_batchnorm(model)

            if torch.cuda.is_available():
                if "device_ids" in kwargs:
                    raise ValueError(
                        f"Argument kwargs should not contain 'device_ids', but got {kwargs}"
                    )

                lrank = idist.get_local_rank()
                logger.info(
                    f"Apply torch DistributedDataParallel on model, device id: {lrank}"
                )
                kwargs["device_ids"] = [
                    lrank,
                ]
            else:
                logger.info("Apply torch DistributedDataParallel on model")

            model = torch.nn.parallel.DistributedDataParallel(model, **kwargs)
        elif idist.has_hvd_support and bnd == idist_hvd.HOROVOD:
            import horovod.torch as hvd

            logger.info(
                "Broadcast the initial variable states from rank 0 to all other processes"
            )
            hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    # not distributed but multiple GPUs reachable so data parallel model
    elif torch.cuda.device_count() > 1 and "cuda" in idist.device().type:
        logger.info("Apply torch DataParallel on model")
        model = torch.nn.parallel.DataParallel(model, **kwargs)

    return model
Example n. 29
    def setup(self):
        self._init_distribution()

        self.trainer = Engine(self.train_step)
        self.trainer.logger = setup_logger(name="trainer", distributed_rank=self.local_rank)
        self.log_basic_info(self.trainer.logger)

        self.load_trainer_from_checkpoint()

        if self.scheduler:
            self.scheduler_event = self.trainer.add_event_handler(Events.ITERATION_STARTED, self.scheduler)
        else:
            self.scheduler_event = None
        self.attach_metrics(self.trainer, self.train_metrics)
        

        if idist.get_world_size() > 1:
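            # DistributedSampler reshuffles differently each epoch only if its epoch is updated.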
            def set_epoch(engine):
                self.train_loader.sampler.set_epoch(engine.state.epoch)

            self.trainer.add_event_handler(Events.EPOCH_STARTED, set_epoch)


        common.setup_common_training_handlers(
            self.trainer,
            train_sampler=self.train_loader.sampler,
            to_save=None,
            save_every_iters=0,
            output_path=None,
            lr_scheduler=None,
            output_names=None,
            with_pbars=self.hparams.add_pbar,
            clear_cuda_cache=True,
            stop_on_nan=False,
        )
        
        self.evaluator = Engine(self.eval_step)
        self.evaluator.logger = setup_logger("evaluator", distributed_rank=self.local_rank)
        if self.hparams.add_pbar:
            ProgressBar(persist=False).attach(self.evaluator)

        def complete_clear(engine):
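            # Drop references to the last batch/output so their (GPU) memory can be reclaimed
            # by the garbage collector between epochs.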
            engine.state.batch = None
            engine.state.output = None
            import gc
            gc.collect()
        self.trainer.add_event_handler(Events.EPOCH_COMPLETED, complete_clear)

        self.validation_handler_event = self.trainer.add_event_handler(
            Events.EPOCH_COMPLETED(every=self.hparams.eval_every),
            self.validate(self.valid_loader),
        )
        self.evaluator.add_event_handler(Events.EPOCH_COMPLETED, complete_clear)

        train_handler_params = {
            "model": self.model,
            "optimizer": self.optimizer,
            "scheduler": self.scheduler
        }

        eval_handler_params = {
            "model": self.model,
            "optimizer": self.optimizer,
            "scheduler": self.scheduler
        }

        to_save = {
                "model": self.model,
                "trainer": self.trainer,
                "optimizer": self.optimizer
            }
        if self.scheduler is not None:
            to_save["scheduler"] = self.scheduler
        if USE_AMP:
            to_save["amp"] = amp
        self.attach_metrics(self.evaluator, self.validation_metrics)
        self.setup_checkpoint_saver(to_save)
        
        if self.rank == 0:
            self._init_logger()
            if self.logger:
                self.logger._init_logger(self.trainer, self.evaluator)
                self.logger._add_train_events(**train_handler_params)
                self.logger._add_eval_events(**eval_handler_params)