Code example #1
File: train.py  Project: SyomaKiss/S20_project
def run_training(config):
    tb_writer = CustomWriter(config)
    logger = saver.get_logger(config)

    num_epochs = config.training.num_epochs
    model = {'G': models.get_model(config, tag='G'), 'D': models.get_model(config, tag='D')}
    model = {key: torch.nn.DataParallel(value) for key, value in model.items()}
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_loader = get_DataLoader(config, phase="train")
    val_loader = get_DataLoader(config, phase="val")

    optimizer = {'G': optimizers.get_optimizer(model['G'].parameters(), config.model.G.optimizer),
                 'D': optimizers.get_optimizer(model['D'].parameters(), config.model.D.optimizer)}

    scheduler = {'G': schedulers.get_scheduler(optimizer['G'], config.model.G.scheduler),
                 'D': schedulers.get_scheduler(optimizer['D'], config.model.D.scheduler)}

    criterion = {'G': losses.get_loss(config.model.G.criterion),
                 'D': losses.get_loss(config.model.D.criterion)}

    start_epoch_num = (
        saver.get_latest_epoch_num(config)
        if config.model.G.load_state == -1 or config.model.G.load_state == "latest"
        else config.model.G.load_state
    )

    # Dynamic imports according to protocols
    epoch_module = importlib.import_module('protocols.{}.epoch'.format(config.protocol))
    train_one_epoch, test = getattr(epoch_module, 'train_one_epoch'), getattr(epoch_module, 'test')

    for epoch in range(start_epoch_num + 1, start_epoch_num + num_epochs + 1):

        train_buffer = train_one_epoch(
            config=config,
            model=model,
            device=device,
            train_loader=train_loader,
            optimizer=optimizer,
            scheduler=scheduler,
            criterion=criterion,
            epoch=epoch,
            logger=logger,
            log_interval=config.training.log_interval,
        )
        tb_writer.write_result(train_buffer, epoch, phase="train")

        if epoch % config.training.validation_period == 0:
            val_buffer = test(
                config=config,
                model=model,
                device=device,
                test_loader=val_loader,
                criterion=criterion,
                logger=logger,
                phase="val",
                tag=epoch,
                log_interval=8,
            )
            tb_writer.write_result(val_buffer, epoch, phase="val")
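The dynamic import above implies that each protocol package ships a protocols/<protocol>/epoch.py exposing train_one_epoch and test with exactly the keyword arguments used here. A stub of that assumed layout, inferred from the calls and not taken from the project's actual code:

# protocols/<protocol>/epoch.py -- assumed layout inferred from the calls above
def train_one_epoch(config, model, device, train_loader, optimizer,
                    scheduler, criterion, epoch, logger, log_interval):
    """Run one epoch over the G/D model dict and return a result buffer."""
    raise NotImplementedError


def test(config, model, device, test_loader, criterion, logger,
         phase, tag, log_interval):
    """Evaluate the G/D model dict and return a result buffer."""
    raise NotImplementedError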
Code example #2
File: etm.py  Project: psanch21/ETM-Lightning
    def configure_optimizers(self):
        optim = get_optimizer(self.optim_params['name'])(
            self.parameters(), **self.optim_params['params'])
        if isinstance(self.sched_params, dict):
            sched = get_scheduler(self.sched_params['name'])(
                optim, **self.sched_params['params'])
        else:
            sched = []
        return [optim], sched
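In this Lightning module, get_optimizer and get_scheduler return classes that are then instantiated with the parameter dicts. A minimal sketch of such name-to-class lookups, assuming they resolve against torch.optim (the project's actual helpers may differ):

import torch

def get_optimizer(name):
    # e.g. "Adam" -> torch.optim.Adam; the caller instantiates the class
    return getattr(torch.optim, name)

def get_scheduler(name):
    # e.g. "StepLR" -> torch.optim.lr_scheduler.StepLR
    return getattr(torch.optim.lr_scheduler, name)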
Code example #3
File: scaffold.py  Project: zhouhuaman/plato
    def get_optimizer(self, model):
        """Initialize the SCAFFOLD optimizer."""
        optimizer = optimizers.get_optimizer(model)

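        # These attributes are presumably read by a custom SCAFFOLD optimizer's
        # step(), which uses the server/client update directions (control
        # variates) to correct client drift.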
        optimizer.server_update_direction = self.server_update_direction
        optimizer.client_update_direction = self.client_update_direction
        optimizer.client_id = self.client_id
        optimizer.update_flag = True

        return optimizer
Code example #4
File: lr_schedule_tests.py  Project: zhouhuaman/plato
    def setUp(self):
        super().setUp()
        __ = Config()

        fields = [
            'optimizer', 'lr_schedule', 'learning_rate', 'momentum',
            'weight_decay', 'lr_gamma', 'lr_milestone_steps', 'lr_warmup_steps'
        ]
        params = ['SGD', '', 0.1, 0.5, 0.0, 0.0, '', '']
        Config().trainer = namedtuple('trainer', fields)(*params)

        self.model = models_registry.get('resnet_18')
        self.optimizer = optimizers.get_optimizer(self.model)
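A follow-up test building on this setUp would typically just assert on the hyper-parameters the registry applied. A hypothetical example (the method name and a module-level torch import are assumptions, not part of the original test file):

    def test_optimizer_hyperparameters(self):
        # Hypothetical check that the registry honored Config().trainer
        self.assertIsInstance(self.optimizer, torch.optim.SGD)
        self.assertEqual(self.optimizer.defaults['lr'], 0.1)
        self.assertEqual(self.optimizer.defaults['momentum'], 0.5)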
Code example #5
    def init_optimizer(self):
        """Initialize the optimizer."""
        optimizer_cls = get_optimizer(self.params[TRAIN])
        optimizer_params = {k: v for k, v in
                            self.params[TRAIN][OPTIMIZER].items()
                            if k != "name"}
        criterion_params = list(self.criterion.parameters())
        if criterion_params:  # a list is never None; check whether the loss has learnable parameters
            model_params = list(self.model.parameters()) + criterion_params
        else:
            model_params = self.model.parameters()
        optimizer = optimizer_cls(model_params, **optimizer_params)

        return optimizer
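Criterion parameters are folded into the optimizer because some losses carry their own learnable tensors (margin-based or uncertainty-weighted losses, for instance). A self-contained sketch of that situation, with hypothetical names:

import torch
import torch.nn as nn

class UncertaintyWeightedLoss(nn.Module):
    """Toy loss whose task weights are themselves trainable parameters."""
    def __init__(self):
        super().__init__()
        self.log_vars = nn.Parameter(torch.zeros(2))  # learnable per-task log-variances

    def forward(self, loss_a, loss_b):
        precision = torch.exp(-self.log_vars)
        return precision[0] * loss_a + precision[1] * loss_b + self.log_vars.sum()

model = nn.Linear(10, 2)
criterion = UncertaintyWeightedLoss()
# Both the model and the loss parameters go to the same optimizer,
# which is exactly what init_optimizer above arranges.
params = list(model.parameters()) + list(criterion.parameters())
optimizer = torch.optim.Adam(params, lr=1e-3)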
Code example #6
File: fedsarah.py  Project: zhouhuaman/plato
    def get_optimizer(self, model):
        """Initialize the FedSarah optimizer."""
        optimizer = optimizers.get_optimizer(model)
        optimizer.server_control_variates = self.server_control_variates
        optimizer.client_control_variates = self.client_control_variates
        optimizer.client_id = self.client_id
        optimizer.max_counter = Config().trainer.epochs

        if self.adjustment:
            optimizer.epsilon = optimizer.max_epsilon - (
                optimizer.max_epsilon - optimizer.min_epsilon) * np.exp(
                    -1 * optimizer.epsilon_decay * self.fl_round_counter)
            #optimizer.epsilon = optimizer.min_epsilon + (
            #optimizer.max_epsilon - optimizer.min_epsilon) * np.exp(
            #   -1 * optimizer.epsilon_decay * self.fl_round_counter)
        else:
            optimizer.epsilon = optimizer.min_epsilon

        return optimizer
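With the adjustment enabled, epsilon starts at min_epsilon on the first round (fl_round_counter = 0) and rises toward max_epsilon as rounds accumulate. A quick numeric illustration with hypothetical constants:

import numpy as np

# Hypothetical values, only to show the shape of the schedule used above.
max_epsilon, min_epsilon, epsilon_decay = 1.0, 0.1, 0.5
for fl_round_counter in range(4):
    epsilon = max_epsilon - (max_epsilon - min_epsilon) * np.exp(
        -1 * epsilon_decay * fl_round_counter)
    print(fl_round_counter, round(float(epsilon), 3))  # 0.1, 0.454, 0.669, 0.799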
Code example #7
def main():
    """Main function"""

    # Initialization
    args = parse_args()
    dist = init_workers(args.distributed)
    config = load_config(args)
    os.makedirs(config['output_dir'], exist_ok=True)
    config_logging(verbose=args.verbose)
    logging.info('Initialized rank %i size %i local_rank %i local_size %i',
                 dist.rank, dist.size, dist.local_rank, dist.local_size)
    if dist.rank == 0:
        logging.info('Configuration: %s', config)

    # Setup MLPerf logging
    if args.mlperf:
        mllogger = configure_mllogger(config['output_dir'])
    if dist.rank == 0 and args.mlperf:
        mllogger.event(key=mllog.constants.CACHE_CLEAR)
        mllogger.start(key=mllog.constants.INIT_START)

    # Initialize Weights & Biases logging
    if args.wandb and dist.rank == 0:
        import wandb
        wandb.init(project='cosmoflow',
                   name=args.run_tag,
                   id=args.run_tag,
                   config=config,
                   resume=args.run_tag)

    # Device and session configuration
    gpu = dist.local_rank if args.rank_gpu else None
    if gpu is not None:
        logging.info('Taking gpu %i', gpu)
    configure_session(gpu=gpu,
                      intra_threads=args.intra_threads,
                      inter_threads=args.inter_threads,
                      kmp_blocktime=args.kmp_blocktime,
                      kmp_affinity=args.kmp_affinity,
                      omp_num_threads=args.omp_num_threads)

    # Mixed precision
    if args.amp:
        logging.info('Enabling mixed float16 precision')

        # Suggested bug workaround from https://github.com/tensorflow/tensorflow/issues/38516
        if tf.__version__.startswith('2.2.'):
            from tensorflow.python.keras.mixed_precision.experimental import device_compatibility_check
            device_compatibility_check.log_device_compatibility_check = lambda policy_name, skip_local: None
        tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
        # TF 2.3
        #tf.keras.mixed_precision.set_global_policy('mixed_float16')

    # Start MLPerf logging
    if dist.rank == 0 and args.mlperf:
        log_submission_info(**config.get('mlperf', {}))
        mllogger.end(key=mllog.constants.INIT_STOP)
        mllogger.start(key=mllog.constants.RUN_START)

    # Load the data
    data_config = config['data']
    if dist.rank == 0:
        logging.info('Loading data')
    datasets = get_datasets(dist=dist, **data_config)
    logging.debug('Datasets: %s', datasets)

    # Construct or reload the model
    if dist.rank == 0:
        logging.info('Building the model')
    train_config = config['train']
    initial_epoch = 0
    checkpoint_format = os.path.join(config['output_dir'],
                                     'checkpoint-{epoch:03d}.h5')
    if args.resume and os.path.exists(checkpoint_format.format(epoch=1)):
        # Reload model from last checkpoint
        initial_epoch, model = reload_last_checkpoint(
            checkpoint_format,
            data_config['n_epochs'],
            distributed=args.distributed)
    else:
        # Build a new model
        model = get_model(**config['model'])
        # Configure the optimizer
        opt = get_optimizer(distributed=args.distributed,
                            **config['optimizer'])
        # Compile the model
        model.compile(optimizer=opt,
                      loss=train_config['loss'],
                      metrics=train_config['metrics'])

    if dist.rank == 0:
        model.summary()

    # Save configuration to output directory
    if dist.rank == 0:
        config['n_ranks'] = dist.size
        save_config(config)

    # Prepare the callbacks
    if dist.rank == 0:
        logging.info('Preparing callbacks')
    callbacks = []
    if args.distributed:

        # Broadcast initial variable states from rank 0 to all processes.
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))

        # Average metrics across workers
        callbacks.append(hvd.callbacks.MetricAverageCallback())

    # Learning rate decay schedule
    if 'lr_schedule' in config:
        global_batch_size = data_config['batch_size'] * dist.size
        callbacks.append(
            tf.keras.callbacks.LearningRateScheduler(
                get_lr_schedule(global_batch_size=global_batch_size,
                                **config['lr_schedule'])))

    # Timing
    timing_callback = TimingCallback()
    callbacks.append(timing_callback)

    # Checkpointing and logging from rank 0 only
    if dist.rank == 0:
        callbacks.append(tf.keras.callbacks.ModelCheckpoint(checkpoint_format))
        callbacks.append(
            tf.keras.callbacks.CSVLogger(os.path.join(config['output_dir'],
                                                      'history.csv'),
                                         append=args.resume))
        if args.tensorboard:
            callbacks.append(
                tf.keras.callbacks.TensorBoard(
                    os.path.join(config['output_dir'], 'tensorboard')))
        if args.mlperf:
            callbacks.append(MLPerfLoggingCallback())
        if args.wandb:
            callbacks.append(wandb.keras.WandbCallback())

    # Early stopping
    patience = train_config.get('early_stopping_patience', None)
    if patience is not None:
        callbacks.append(
            tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                             min_delta=1e-5,
                                             patience=patience,
                                             verbose=1))

    # Stopping at specified target
    target_mae = train_config.get('target_mae', None)
    callbacks.append(StopAtTargetCallback(target_max=target_mae))

    if dist.rank == 0:
        logging.debug('Callbacks: %s', callbacks)

    # Train the model
    if dist.rank == 0:
        logging.info('Beginning training')
    fit_verbose = 1 if (args.verbose and dist.rank == 0) else 2
    model.fit(datasets['train_dataset'],
              steps_per_epoch=datasets['n_train_steps'],
              epochs=data_config['n_epochs'],
              validation_data=datasets['valid_dataset'],
              validation_steps=datasets['n_valid_steps'],
              callbacks=callbacks,
              initial_epoch=initial_epoch,
              verbose=fit_verbose)

    # Stop MLPerf timer
    if dist.rank == 0 and args.mlperf:
        mllogger.end(key=mllog.constants.RUN_STOP,
                     metadata={'status': 'success'})

    # Print training summary
    if dist.rank == 0:
        print_training_summary(config['output_dir'], args.print_fom)

    # Print GPU memory - not supported in TF 2.2?
    #if gpu is not None:
    #    device = tf.config.list_physical_devices('GPU')[gpu]
    #    #print(tf.config.experimental.get_memory_usage(device))
    #    #print(tf.config.experimental.get_memory_info(device))

    # Finalize
    if dist.rank == 0:
        logging.info('All done!')
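The distributed=args.distributed flag passed to get_optimizer suggests the factory wraps the Keras optimizer with Horovod for multi-worker runs. A minimal sketch of such a factory (an assumption, not the benchmark's exact implementation):

import tensorflow as tf
import horovod.tensorflow.keras as hvd

def get_optimizer(name='SGD', lr=0.001, distributed=False, **opt_args):
    """Build a Keras optimizer by name and optionally wrap it for Horovod."""
    opt = getattr(tf.keras.optimizers, name)(learning_rate=lr, **opt_args)
    if distributed:
        opt = hvd.DistributedOptimizer(opt)
    return opt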
Code example #8
def get_params(args):
    np.set_printoptions(precision=3, suppress=True)
    global model_name, data_name, model_type, norm_type, acti_type
    global pretrain_augment, train_augment
    global num_cluster, embeded_dim, silent, slevel, to_disk, flevel, time_stamp, log_file
    global save_model_dir, print_every, buffer_size, batch_size, alpha
    global pretrain_lr, pretrain_optimizer_type
    global pretrain_epoch, train_lr, train_optimizer_type, maxiter, interval
    global cluster_loss_type, reconstruct_loss_type
    global num_repeat_kmeans, use_pretrain_model, save_pretrain_model, delta
    global re, cre, cl, dccbc
    # ==================== some other hyper-parameters ===================
    model_name = args.model_name  # "dec"
    data_name = args.data_name  # "mnist"
    model_type = args.model_type  # "conv" or "all_conv" or "mlp"
    norm_type = args.norm_type if args.norm_type != "None" else None  # None or "bn" or "normal"
    acti_type = args.acti_type  # "relu"
    pretrain_augment = args.pretrain_augment  # True
    train_augment = args.train_augment # True
    model_name = "_".join([model_name, model_type])  # e.g. "dec_conv"
    num_cluster = dataset_clusters[data_name] if args.num_cluster < 0 else args.num_cluster  # 10
    embeded_dim = num_cluster if args.embeded_dim < 0 else args.embeded_dim  # 10
    silent = args.silent  # False
    slevel = args.slevel  # "debug" or "info"
    to_disk = args.to_disk  # True
    flevel = args.flevel  # "debug" or "info"
    time_stamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    log_file = f"results/log/{model_name}_{data_name}_{time_stamp}.txt"
    save_model_dir = "results/model/"
    print_every = args.print_every  # 10
    buffer_size = args.buffer_size  # 70000
    batch_size = args.batch_size  # 256

    alpha = args.alpha  # 1
    pretrain_lr = args.pretrain_lr  # 0.001
    pretrain_optimizer_type = args.pretrain_optimizer_type  # "rmsprop"
    pretrain_epoch = args.pretrain_epoch  # 100
    train_lr = args.train_lr  # 0.001
    train_optimizer_type = args.train_optimizer_type  # "rmsprop"
    maxiter = args.maxiter
    interval = args.interval
    cluster_loss_type = args.cluster_loss_type  # "kl"
    reconstruct_loss_type = args.reconstruct_loss_type  # "l2"
    num_repeat_kmeans = args.num_repeat_kmeans  # 20
    use_pretrain_model = args.use_pretrain_model  # False
    save_pretrain_model = args.save_pretrain_model  # True

    delta = args.delta  # 0.001

    re, cre, cl, dccbc = args.re, args.cre, args.cl, args.dccbc

    # ==================== some settings ====================
    global logger, feature, label, train_dataset, test_dataset, transformer
    global sample_x, sample_y, pretrain_optimizer, train_optimizer
    global encoder, decoder, cluster_layer, model, cluster_loss_fn, reconstruct_loss_fn
    global encoder_path, decoder_path

    logger = Logger(silent=silent, slevel=slevel, to_disk=to_disk,
                    log_file=log_file, flevel=flevel)
    feature, label = load_merged_data(data_name)
    feature = feature / 255.0


    dataset = tf.data.Dataset.from_tensor_slices({"feature": feature,
                                                  "label": label,
                                                  "idx": np.arange(label.shape[0])})
    train_dataset = dataset.shuffle(buffer_size).batch(batch_size)
    test_dataset = dataset.batch(batch_size)
    transformer = Transformer(data_name, batch_size)

    sample_x, sample_y = load_sample_data(data_name, 100)
    sample_x = sample_x / 255.

    pretrain_optimizer = get_optimizer(type=pretrain_optimizer_type,
                                       learning_rate=pretrain_lr)
    train_optimizer = get_optimizer(type=train_optimizer_type,
                                    learning_rate=train_lr)
    pretrain_augment_string = "augment" if pretrain_augment else "no_augment"
    encoder_path = "_".join([data_name, model_type, pretrain_augment_string, "encoder.h5"])
    encoder_path = os.path.join(save_model_dir, encoder_path)
    decoder_path = "_".join([data_name, model_type, pretrain_augment_string, "decoder.h5"])
    decoder_path = os.path.join(save_model_dir, decoder_path)
    if use_pretrain_model:
        logger.info(f"Use pretrained model: {encoder_path}, {decoder_path}")
        encoder = tf.keras.models.load_model(encoder_path)
        decoder = tf.keras.models.load_model(decoder_path)
        pretrain_epoch = 0
    else:
        logger.info("Warning: train Autoencoder from scratch")
        encoder, decoder = get_backbone(data_name, embeded_dim=embeded_dim,
                                        model_type=model_type, norm_type=norm_type,
                                        acti_type=acti_type)

    cluster_layer = ClusterLayer(num_cluster, alpha)
    model = {"encoder": encoder, "decoder": decoder, "cluster_layer": cluster_layer}

    cluster_loss_fn = losses[cluster_loss_type]
    reconstruct_loss_fn = losses[reconstruct_loss_type]
Code example #9
File: execute.py  Project: drewlinsley/spine
def run_training(
        experiment_name,
        debug=False,
        only_ees=False,
        only_kinematics=False,
        use_neptune=False,
        epochs=2000,  # 20000
        train_batch=21330,  # 54726 // 2,
        val_batch=1123,  # 2000
        dtype=np.float32,
        val_split=0.05,
        shuffle_data=True,
        model_type="linear",  # "GRU",  # 'transformer',
        model_cfg=None,
        bptt=350,
        hidden_size=6,
        lr=1e-2,
        start_trim=700,
        log_interval=5,
        val_interval=20,
        clip_grad_norm=False,
        output_dir="results",
        normalize_input=True,
        optimizer="Adam",  # "AdamW",
        scheduler=None,  # "StepLR",
        train_weight=100.,
        batch_first=True,
        toss_allzero_mn=True,
        dumb_augment=False,
        score="pearson",
        metric="l2"):  # pearson
    """Run training and validation."""
    if use_neptune and NEPTUNE_IMPORTED:
        neptune.init("Serre-Lab/deepspine")
        if experiment_name is None:
            experiment_name = "synthetic_data"
        neptune.create_experiment(experiment_name)
    assert model_type is not None, "You must select a model."
    default_model_params = tools.get_model_defaults()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    timestamp = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H_%M_%S')
    if model_cfg is None:
        print("Using default model cfg file.")
        model_cfg = model_type
    data = np.load(DATA_FILE)
    mn = data["mn"]
    ees = data["ees"]
    kinematics = data["kinematics"]
    X = torch.from_numpy(np.concatenate((ees, kinematics), 1).astype(dtype))
    Y = torch.from_numpy(mn.astype(dtype))
    X = X.permute(0, 2, 1)
    Y = Y.permute(0, 2, 1)
    if only_ees:
        X = X[..., 0][..., None]  # Only ees -- 0.73
    if only_kinematics:
        X = X[..., 1:]  # Only kinematics -- 0.89
    input_size = X.size(-1)
    output_size = Y.size(-1)
    meta = Meta(
        batch_first=batch_first,
        data_size=X.shape,
        train_batch=train_batch,
        val_batch=val_batch,
        val_split=val_split,
        model_type=model_type,
        model_cfg=model_cfg,
        input_size=input_size,
        hidden_size=hidden_size,
        output_size=output_size,
        metric=metric,
        score=score,
        normalize_input=normalize_input,
        lr=lr,
        bptt=bptt,
        epochs=epochs,
        optimizer=optimizer,
        scheduler=scheduler,
        clip_grad_norm=clip_grad_norm,
        log_interval=log_interval,
        val_interval=val_interval,
        start_trim=start_trim,
        train_weight=train_weight,
        device=device)

    # Prepare data
    if toss_allzero_mn:
        # Restrict to nonzero mn fibers
        # mask = (Y.sum(1) > 127.5).sum(-1) == 2  # Ys where both are nonzero at some point
        mask = ((Y > 200).sum(1) > 0).sum(-1) == 2  # Ys where both are > 200 at some point
        # mask = ((Y > 127.5).sum(1) > 0).sum(-1) >= 1  # Ys where either is > 127.5 at some point
        print("Throwing out {} examples.".format((mask == False).sum()))
        X = X[mask]
        Y = Y[mask]
    if meta.start_trim:
        X = X.narrow(1, meta.start_trim, X.size(1) - meta.start_trim)
        Y = Y.narrow(1, meta.start_trim, Y.size(1) - meta.start_trim)

    if shuffle_data:
        idx = np.random.permutation(len(X))
        X = X[idx]
        Y = Y[idx]

    if meta.normalize_input:
        # X = (X - 127.5) / 127.5
        # Y = (Y - 127.5) / 127.5
        k_X = X[..., 1:]
        k_X = (k_X - k_X.mean(1, keepdim=True)) / (k_X.std(1, keepdim=True) + 1e-8)  # This is peeking (normalizes before the train/val split) but whatever...
        e_X = X[..., 0][..., None]
        e_X = e_X / 255.
        X = torch.cat((k_X, e_X), -1)
        if meta.metric != "bce":
            Y = (Y - Y.mean(1, keepdim=True)) / (Y.std(1, keepdim=True) + 1e-8)
            # Y = Y / 255.
        else:
            # Quantize Y
            Y = (Y > 127.5).float()
    X = X.to(meta.device)
    Y = Y.to(meta.device)
    cv_idx = np.arange(len(X))
    cv_idx = cv_idx > np.round(float(len(X)) * val_split).astype(int)
    X_train = X[cv_idx]
    Y_train = Y[cv_idx]
    X_val = X[~cv_idx]
    Y_val = Y[~cv_idx]
    assert meta.train_batch < len(X_train), "Train batch size > dataset size {}.".format(len(X_train) - 1)
    assert meta.val_batch < len(X_val), "Val batch size > dataset size {}.".format(len(X_val) - 1)

    if dumb_augment:
        X_train = torch.cat((X_train, X_train[:, torch.arange(X_train.size(1) - 1, -1, -1).long()]))
        Y_train = torch.cat((Y_train, Y_train[:, torch.arange(Y_train.size(1) - 1, -1, -1).long()]))

    if not meta.batch_first:
        X_train = X_train.permute(1, 0, 2)
        Y_train = Y_train.permute(1, 0, 2)
        X_val = X_val.permute(1, 0, 2)
        Y_val = Y_val.permute(1, 0, 2)

    # Create model
    model = modeling.create_model(
        batch_first=meta.batch_first,
        bptt=meta.bptt,
        model_type=meta.model_type,
        model_cfg=meta.model_cfg,
        input_size=meta.input_size,
        hidden_size=meta.hidden_size,
        output_size=meta.output_size,
        default_model_params=default_model_params,
        device=meta.device)
    num_params = sum([p.numel() for p in model.parameters() if p.requires_grad])
    print('Total number of parameters: {}'.format(num_params))
    score, criterion = metrics.get_metric(metric, meta.batch_first)
    optimizer_fun = optimizers.get_optimizer(optimizer)
    assert lr < 1, "LR must be less than 1."
    if "adam" in optimizer.lower():
        optimizer = optimizer_fun(model.parameters(), lr=lr, amsgrad=True)
    else:
        optimizer = optimizer_fun(model.parameters(), lr=lr)
    if scheduler is not None:
        scheduler = optimizers.get_scheduler(scheduler) 
        scheduler = scheduler(optimizer)

    # Start training
    best_val_loss = float("inf")
    best_model = None
    X_val, _ = batchify(X_val, bsz=meta.val_batch, random=False, batch_first=meta.batch_first)
    Y_val, _ = batchify(Y_val, bsz=meta.val_batch, random=False, batch_first=meta.batch_first)
    for epoch in range(1, meta.epochs + 1):
        epoch_start_time = time.time()
        meta.epoch = epoch
        X_train_i, random_idx = batchify(
            X_train,
            bsz=meta.train_batch,
            random=True,
            batch_first=meta.batch_first)
        Y_train_i, _ = batchify(
            Y_train,
            bsz=meta.train_batch,
            random=random_idx,
            batch_first=meta.batch_first)
        min_train_loss, max_train_loss, train_output, train_gt = train(
            model=model,
            X=X_train_i,
            Y=Y_train_i,
            optimizer=optimizer,
            criterion=criterion,
            score=score,
            scheduler=scheduler,
            meta=meta)
        if epoch % meta.val_interval == 0:
            val_loss, val_score, val_output, val_gt = evaluate(
                model=model,
                X=X_val,
                Y=Y_val,
                criterion=criterion,
                score=score,
                meta=meta)
            meta.min_train_loss.append(min_train_loss)
            meta.max_train_loss.append(max_train_loss)
            meta.val_loss.append(val_loss)
            meta.val_score.append(val_score)
            print('-' * 89)
            print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | valid score {:5.2f}'.format(
                  epoch,
                  (time.time() - epoch_start_time),
                  meta.val_loss[-1],
                  meta.val_score[-1]))
            print('-' * 89)
            if use_neptune and NEPTUNE_IMPORTED:
                neptune.log_metric('min_train_loss', min_train_loss)
                neptune.log_metric('max_train_loss', max_train_loss)
                neptune.log_metric('val_{}'.format(meta.metric), val_loss)
                neptune.log_metric('val_pearson', val_score)
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model = model
            if val_loss < 0.65 and debug:
                from matplotlib import pyplot as plt
                fig = plt.figure()
                plt.title('val')
                plt.subplot(211)
                plt.plot(val_output[50].cpu())
                plt.subplot(212)
                plt.plot(val_gt[50].cpu())
                plt.show()
                plt.close(fig)
                fig = plt.figure()
                plt.title('train')
                plt.subplot(211)
                plt.plot(train_output[50].cpu().detach())
                plt.subplot(212)
                plt.plot(train_gt[50].cpu())
                plt.show()
                plt.close(fig)
            if scheduler is not None:
                scheduler.step()

    # Fix some type issues
    meta.val_loss = [x.cpu() for x in meta.val_loss]
    meta.val_score = [x.cpu() for x in meta.val_score]
    np.savez(os.path.join(output_dir, '{}results_{}'.format(experiment_name, timestamp)), **meta.__dict__)  # noqa
    np.savez(os.path.join(output_dir, '{}example_{}'.format(experiment_name, timestamp)), train_output=train_output.cpu().detach(), train_gt=train_gt.cpu(), val_output=val_output.cpu(), val_gt=val_gt.cpu())
    torch.save(best_model.state_dict(), os.path.join(output_dir, '{}model_{}.pth'.format(experiment_name, timestamp)))
Code example #10
def main():
    """Main function"""

    # Initialization
    args = parse_args()
    rank, local_rank, n_ranks = init_workers(args.distributed)
    config = load_config(args.config,
                         output_dir=args.output_dir,
                         data_config=args.data_config)

    os.makedirs(config['output_dir'], exist_ok=True)
    config_logging(verbose=args.verbose)
    logging.info('Initialized rank %i local_rank %i size %i', rank, local_rank,
                 n_ranks)
    if rank == 0:
        logging.info('Configuration: %s', config)

    # Device and session configuration
    gpu = local_rank if args.rank_gpu else None
    configure_session(gpu=gpu, **config.get('device', {}))

    # Load the data
    data_config = config['data']
    if rank == 0:
        logging.info('Loading data')
    datasets = get_datasets(rank=rank, n_ranks=n_ranks, **data_config)
    logging.debug('Datasets: %s', datasets)

    # Construct or reload the model
    if rank == 0:
        logging.info('Building the model')
    initial_epoch = 0
    checkpoint_format = os.path.join(config['output_dir'],
                                     'checkpoint-{epoch:03d}.h5')
    if args.resume:
        # Reload model from last checkpoint
        initial_epoch, model = reload_last_checkpoint(
            checkpoint_format,
            data_config['n_epochs'],
            distributed=args.distributed)
    else:
        # Build a new model
        model = get_model(**config['model'])
        # Configure the optimizer
        opt = get_optimizer(n_ranks=n_ranks,
                            distributed=args.distributed,
                            **config['optimizer'])
        # Compile the model
        train_config = config['train']
        model.compile(optimizer=opt,
                      loss=train_config['loss'],
                      metrics=train_config['metrics'])

    if rank == 0:
        model.summary()

    # Save configuration to output directory
    if rank == 0:
        data_config['n_train'] = datasets['n_train']
        data_config['n_valid'] = datasets['n_valid']
        save_config(config)

    # Prepare the callbacks
    if rank == 0:
        logging.info('Preparing callbacks')
    callbacks = []
    if args.distributed:

        # Broadcast initial variable states from rank 0 to all processes.
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))

        # Average metrics across workers
        callbacks.append(hvd.callbacks.MetricAverageCallback())

        # Learning rate warmup
        train_config = config['train']
        warmup_epochs = train_config.get('lr_warmup_epochs', 0)
        callbacks.append(
            hvd.callbacks.LearningRateWarmupCallback(
                warmup_epochs=warmup_epochs, verbose=1))

    # Learning rate decay schedule
    lr_schedule = train_config.get('lr_schedule', {})
    if rank == 0:
        logging.info('Adding LR decay schedule: %s', lr_schedule)
    callbacks.append(
        tf.keras.callbacks.LearningRateScheduler(
            schedule=lambda epoch, lr: lr * lr_schedule.get(epoch, 1)))

    # Timing
    timing_callback = TimingCallback()
    callbacks.append(timing_callback)

    # Checkpointing and CSV logging from rank 0 only
    if rank == 0:
        callbacks.append(tf.keras.callbacks.ModelCheckpoint(checkpoint_format))
        callbacks.append(
            tf.keras.callbacks.CSVLogger(os.path.join(config['output_dir'],
                                                      'history.csv'),
                                         append=args.resume))

    if rank == 0:
        logging.debug('Callbacks: %s', callbacks)

    # Train the model
    if rank == 0:
        logging.info('Beginning training')
    fit_verbose = 1 if (args.verbose and rank == 0) else 2
    model.fit(datasets['train_dataset'],
              steps_per_epoch=datasets['n_train_steps'],
              epochs=data_config['n_epochs'],
              validation_data=datasets['valid_dataset'],
              validation_steps=datasets['n_valid_steps'],
              callbacks=callbacks,
              initial_epoch=initial_epoch,
              verbose=fit_verbose)

    # Print training summary
    if rank == 0:
        print_training_summary(config['output_dir'])

    # Finalize
    if rank == 0:
        logging.info('All done!')
Code example #11
File: train.py  Project: lzhimin/cosmoflow-benchmark
def main():
    """Main function"""

    # Initialization
    args = parse_args()
    dist = init_workers(args.distributed)
    config = load_config(args)
    os.makedirs(config['output_dir'], exist_ok=True)
    config_logging(verbose=args.verbose)
    logging.info('Initialized rank %i size %i local_rank %i local_size %i',
                 dist.rank, dist.size, dist.local_rank, dist.local_size)
    if dist.rank == 0:
        logging.info('Configuration: %s', config)

    # Device and session configuration
    gpu = dist.local_rank if args.rank_gpu else None
    if gpu is not None:
        logging.info('Taking gpu %i', gpu)
    configure_session(gpu=gpu,
                      intra_threads=args.intra_threads,
                      inter_threads=args.inter_threads,
                      kmp_blocktime=args.kmp_blocktime,
                      kmp_affinity=args.kmp_affinity,
                      omp_num_threads=args.omp_num_threads)

    # Load the data
    data_config = config['data']
    if dist.rank == 0:
        logging.info('Loading data')
    datasets = get_datasets(dist=dist, **data_config)
    logging.debug('Datasets: %s', datasets)

    # Construct or reload the model
    if dist.rank == 0:
        logging.info('Building the model')
    train_config = config['train']
    initial_epoch = 0
    checkpoint_format = os.path.join(config['output_dir'],
                                     'checkpoint-{epoch:03d}.h5')
    if args.resume and os.path.exists(checkpoint_format.format(epoch=1)):
        # Reload model from last checkpoint
        initial_epoch, model = reload_last_checkpoint(
            checkpoint_format,
            data_config['n_epochs'],
            distributed=args.distributed)
    else:
        # Build a new model
        model = get_model(**config['model'])
        # Configure the optimizer
        opt = get_optimizer(distributed=args.distributed,
                            **config['optimizer'])
        # Compile the model
        model.compile(optimizer=opt,
                      loss=train_config['loss'],
                      metrics=train_config['metrics'])

    if dist.rank == 0:
        model.summary()

    # Save configuration to output directory
    if dist.rank == 0:
        config['n_ranks'] = dist.size
        save_config(config)

    # Prepare the callbacks
    if dist.rank == 0:
        logging.info('Preparing callbacks')
    callbacks = []
    if args.distributed:

        # Broadcast initial variable states from rank 0 to all processes.
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))

        # Average metrics across workers
        callbacks.append(hvd.callbacks.MetricAverageCallback())

    # Learning rate decay schedule
    if 'lr_schedule' in config:
        global_batch_size = data_config['batch_size'] * dist.size
        callbacks.append(
            tf.keras.callbacks.LearningRateScheduler(
                get_lr_schedule(global_batch_size=global_batch_size,
                                **config['lr_schedule'])))

    # Timing
    timing_callback = TimingCallback()
    callbacks.append(timing_callback)

    # Checkpointing and logging from rank 0 only
    if dist.rank == 0:
        callbacks.append(tf.keras.callbacks.ModelCheckpoint(checkpoint_format))
        callbacks.append(
            tf.keras.callbacks.CSVLogger(os.path.join(config['output_dir'],
                                                      'history.csv'),
                                         append=args.resume))
        if args.tensorboard:
            callbacks.append(
                tf.keras.callbacks.TensorBoard(
                    os.path.join(config['output_dir'], 'tensorboard')))

    # Early stopping
    patience = config.get('early_stopping_patience', None)
    if patience is not None:
        callbacks.append(
            tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                             min_delta=1e-5,
                                             patience=patience,
                                             verbose=1))

    if dist.rank == 0:
        logging.debug('Callbacks: %s', callbacks)

    # Train the model
    if dist.rank == 0:
        logging.info('Beginning training')
    fit_verbose = 1 if (args.verbose and dist.rank == 0) else 2
    model.fit(datasets['train_dataset'],
              steps_per_epoch=datasets['n_train_steps'],
              epochs=data_config['n_epochs'],
              validation_data=datasets['valid_dataset'],
              validation_steps=datasets['n_valid_steps'],
              callbacks=callbacks,
              initial_epoch=initial_epoch,
              verbose=fit_verbose)

    # Print training summary
    if dist.rank == 0:
        print_training_summary(config['output_dir'], args.print_fom)

    # Finalize
    if dist.rank == 0:
        logging.info('All done!')
Code example #12
    def __init__(self, cfg, writer, img_writer, logger, run_id):
        # Copy shared config fields
        if "monodepth_options" in cfg:
            cfg["data"].update(cfg["monodepth_options"])
            cfg["model"].update(cfg["monodepth_options"])
            cfg["training"]["monodepth_loss"].update(cfg["monodepth_options"])
        if "generated_depth_dir" in cfg["data"]:
            dataset_name = f"{cfg['data']['dataset']}_" \
                           f"{cfg['data']['width']}x{cfg['data']['height']}"
            depth_teacher = cfg["data"].get("depth_teacher", None)
            assert not (depth_teacher and cfg['model'].get('depth_estimator_weights') is not None)
            if depth_teacher is not None:
                cfg["data"]["generated_depth_dir"] += dataset_name + "/" + depth_teacher + "/"
            else:
                cfg["data"]["generated_depth_dir"] += dataset_name + "/" + cfg['model']['depth_estimator_weights'] + "/"

        # Setup seeds
        setup_seeds(cfg.get("seed", 1337))
        if cfg["data"]["dataset_seed"] == "same":
            cfg["data"]["dataset_seed"] = cfg["seed"]

        # Setup device
        torch.backends.cudnn.benchmark = cfg["training"].get("benchmark", True)
        self.cfg = cfg
        self.writer = writer
        self.img_writer = img_writer
        self.logger = logger
        self.run_id = run_id
        self.mIoU = 0
        self.fwAcc = 0
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.setup_segmentation_unlabeled()

        self.unlabeled_require_depth = (self.cfg["training"]["unlabeled_segmentation"] is not None and
                                        (self.cfg["training"]["unlabeled_segmentation"]["mix_mask"] == "depth" or
                                         self.cfg["training"]["unlabeled_segmentation"]["mix_mask"] == "depthcomp" or
                                         self.cfg["training"]["unlabeled_segmentation"]["mix_mask"] == "depthhist"))

        # Prepare depth estimates
        do_precalculate_depth = self.cfg["training"]["segmentation_lambda"] != 0 and self.unlabeled_require_depth and \
                                self.cfg['model']['segmentation_name'] != 'mtl_pad'
        use_depth_teacher = cfg["data"].get("depth_teacher", None) is not None
        if do_precalculate_depth or use_depth_teacher:
            assert not (do_precalculate_depth and use_depth_teacher)
            if not self.cfg["training"].get("disable_depth_estimator", False):
                print("Prepare depth estimates")
                depth_estimator = DepthEstimator(cfg)
                depth_estimator.prepare_depth_estimates()
                del depth_estimator
                torch.cuda.empty_cache()
        else:
            self.cfg["data"]["generated_depth_dir"] = None

        # Setup Dataloader
        load_labels, load_sequence = True, True
        if self.cfg["training"]["monodepth_lambda"] == 0:
            load_sequence = False
        if self.cfg["training"]["segmentation_lambda"] == 0:
            load_labels = False
        train_data_cfg = deepcopy(self.cfg["data"])
        if not do_precalculate_depth and not use_depth_teacher:
            train_data_cfg["generated_depth_dir"] = None
        self.train_loader = build_loader(train_data_cfg, "train", load_labels=load_labels, load_sequence=load_sequence)
        if self.cfg["training"].get("minimize_entropy_unlabeled", False) or self.enable_unlabled_segmentation:
            unlabeled_segmentation_cfg = deepcopy(self.cfg["data"])
            if not self.only_unlabeled and self.mix_use_gt:
                unlabeled_segmentation_cfg["load_onehot"] = True
            if self.only_unlabeled:
                unlabeled_segmentation_cfg.update({"load_unlabeled": True, "load_labeled": False})
            elif self.only_labeled:
                unlabeled_segmentation_cfg.update({"load_unlabeled": False, "load_labeled": True})
            else:
                unlabeled_segmentation_cfg.update({"load_unlabeled": True, "load_labeled": True})
            if self.mix_video:
                assert not self.mix_use_gt and not self.only_labeled and not self.only_unlabeled, \
                    "Video sample indices are not compatible with non-video indices."
                unlabeled_segmentation_cfg.update({"only_sequences_with_segmentation": not self.mix_video,
                                                   "restrict_to_subset": None})
            self.unlabeled_loader = build_loader(unlabeled_segmentation_cfg, "train",
                                                 load_labels=load_labels if not self.mix_video else False,
                                                 load_sequence=load_sequence)
        else:
            self.unlabeled_loader = None
        self.val_loader = build_loader(self.cfg["data"], "val", load_labels=load_labels,
                                       load_sequence=load_sequence)
        self.n_classes = self.train_loader.n_classes

        # The monodepth dataloader settings use drop_last=True and shuffle=True even for val
        self.train_data_loader = data.DataLoader(
            self.train_loader,
            batch_size=self.cfg["training"]["batch_size"],
            num_workers=self.cfg["training"]["n_workers"],
            shuffle=self.cfg["data"]["shuffle_trainset"],
            pin_memory=True,
            # Setting to false will cause crash at the end of epoch
            drop_last=True,
        )
        if self.unlabeled_loader is not None:
            self.unlabeled_data_loader = infinite_iterator(data.DataLoader(
                self.unlabeled_loader,
                batch_size=self.cfg["training"]["batch_size"],
                num_workers=self.cfg["training"]["n_workers"],
                shuffle=self.cfg["data"]["shuffle_trainset"],
                pin_memory=True,
                # Setting to false will cause crash at the end of epoch
                drop_last=True,
            ))

        self.val_batch_size = self.cfg["training"]["val_batch_size"]
        self.val_data_loader = data.DataLoader(
            self.val_loader,
            batch_size=self.val_batch_size,
            num_workers=self.cfg["training"]["n_workers"],
            pin_memory=True,
            # If using a dataset with odd number of samples (CamVid), the memory consumption suddenly increases for the
            # last batch. This can be circumvented by dropping the last batch. Only do that if it is necessary for your
            # system as it will result in an incomplete validation set.
            # drop_last=True,
        )

        # Setup Model
        self.model = get_model(cfg["model"], self.n_classes).to(self.device)
        # print(self.model)
        assert not (self.enable_unlabled_segmentation and self.cfg["training"]["save_monodepth_ema"])
        if self.enable_unlabled_segmentation and not self.only_labeled:
            print("Create segmentation ema model.")
            self.ema_model = self.create_ema_model(self.model).to(self.device)
        elif self.cfg["training"]["save_monodepth_ema"]:
            print("Create depth ema model.")
            # TODO: Try to remove unnecessary components and fit into gpu for better performance
            self.ema_model = self.create_ema_model(self.model)  # .to(self.device)
        else:
            self.ema_model = None

        # Setup optimizer, lr_scheduler and loss function
        optimizer_cls = get_optimizer(cfg)
        optimizer_params = {k: v for k, v in cfg["training"]["optimizer"].items() if
                            k not in ["name", "backbone_lr", "pose_lr", "depth_lr", "segmentation_lr"]}
        train_params = get_train_params(self.model, self.cfg)
        self.optimizer = optimizer_cls(train_params, **optimizer_params)

        self.scheduler = get_scheduler(self.optimizer, self.cfg["training"]["lr_schedule"])

        # Creates a GradScaler once at the beginning of training.
        self.scaler = GradScaler(enabled=self.cfg["training"]["amp"])

        self.loss_fn = get_segmentation_loss_function(self.cfg)
        self.monodepth_loss_calculator_train = get_monodepth_loss(self.cfg, is_train=True)
        self.monodepth_loss_calculator_val = get_monodepth_loss(self.cfg, is_train=False, batch_size=self.val_batch_size)

        if cfg["training"]["early_stopping"] is None:
            logger.info("Using No Early Stopping")
            self.earlyStopping = None
        else:
            self.earlyStopping = EarlyStopping(
                patience=round(cfg["training"]["early_stopping"]["patience"] / cfg["training"]["val_interval"]),
                min_delta=cfg["training"]["early_stopping"]["min_delta"],
                cumulative_delta=cfg["training"]["early_stopping"]["cum_delta"],
                logger=logger
            )
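For context, the cfg["training"]["optimizer"] block consumed above would carry the optimizer class name plus constructor keyword arguments; the per-module learning rates are filtered out because get_train_params builds the parameter groups. A hypothetical entry:

# Hypothetical config fragment; only keys outside the filtered list reach the constructor.
cfg["training"]["optimizer"] = {
    "name": "Adam",            # resolved by get_optimizer(cfg)
    "lr": 1e-4,
    "weight_decay": 1e-5,
    "backbone_lr": 1e-5,       # consumed by get_train_params, filtered out above
    "segmentation_lr": 1e-3,   # consumed by get_train_params, filtered out above
}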
Code example #13
File: main.py  Project: YueWangpl/STRAVE
def train():
    model = STARVE()

    # get style target
    style_img_path = DatasetParam.style_img_path
    style_target = model(tf.constant(load_img(style_img_path)))['style']

    # get content image path list
    if DatasetParam.use_video:
        content_img_list = glob.glob(
            join(TrainParam.video_frames_dir,
                 '*.{}'.format(DatasetParam.img_fmt)))
        content_img_list.sort(key=lambda x: int(splitext(basename(x))[0]))
    else:
        content_img_list = [DatasetParam.content_img_path]

    # record all frames from last iteration
    img_sqe = []

    for n_img, content_img_path in enumerate(content_img_list):
        # Call tf.function each time, or there will be
        # ValueError: tf.function-decorated function tried to create variables on non-first call
        # because of issues with lazy execution.
        # https://www.machinelearningplus.com/deep-learning/how-use-tf-function-to-speed-up-python-code-tensorflow/
        tf_train_step = tf.function(train_step)

        optimizer = get_optimizer()
        content_target = model(tf.constant(
            load_img(content_img_path)))['content']
        generated_image = tf.Variable(
            load_img(content_img_path, do_preprocess=False))

        pbar = tqdm(range(TrainParam.n_step))
        pbar.set_description_str('[{}/{} {}]'.format(
            n_img + 1, len(content_img_list), basename(content_img_path)))
        for step in pbar:
            tf_train_step(model, generated_image, optimizer, content_target,
                          style_target)

            if (step + 1) % TrainParam.draw_step == 0:
                plt.imsave(
                    join(TrainParam.iter_img_dir,
                         "{}.{}".format(step + 1, DatasetParam.img_fmt)),
                    tensor_to_image(generated_image))
        else:  # for-else: runs once after the step loop finishes, saving the final stylized frame
            plt.imsave(
                join(TrainParam.stylized_img_dir, basename(content_img_path)),
                tensor_to_image(generated_image))

        img_sqe.append(generated_image)

    # long term consistency
    if DatasetParam.use_video:
        direction = -1
        step_bar = tqdm(range(TrainParam.consistency_step))
        step_bar.set_description_str('[consistency step]')
        # new image sequence built during this consistency pass
        new_img_sqe = []
        for step in step_bar:
            tf_train_step = tf.function(consistent_train_step)

            optimizer = get_optimizer()

            pbar = tqdm(range(len(content_img_list)))
            pbar.set_description_str('[{}/{}]'.format(len(content_img_list),
                                                      step + 1))

            for frame_idx in pbar:
                # can try to optimize them by putting them outside of the loop
                # or init at the beginning of this function
                content_img_path = content_img_list[frame_idx]
                content_target = model(tf.constant(
                    load_img(content_img_path)))['content']
                generated_image = tf.Variable(
                    init_img(img_sqe, frame_idx, direction))

                tf_train_step(model, optimizer, content_target, style_target,
                              frame_idx, img_sqe, direction, generated_image)
                new_img_sqe.append(generated_image)
                if frame_idx % TrainParam.check_frame_step == 0:
                    plt.imsave(
                        join(TrainParam.iter_consistent_frames_dir,
                             "{}.{}".format(step + 1, DatasetParam.img_fmt)),
                        tensor_to_image(generated_image))

            if step % TrainParam.change_passdir_step == 0:
                direction = -direction

            img_sqe = new_img_sqe

    for frame_idx, generated_image in enumerate(img_sqe):
        content_img_path = content_img_list[frame_idx]
        plt.imsave(
            join(TrainParam.consistent_frames_dir, basename(content_img_path)),
            tensor_to_image(generated_image))

    return
Code example #14
def main():
    global best_iou
    global best_dice
    # model
    model = smp.Unet(encoder_name=configs.encoder,
                     encoder_weights=configs.encoder_weights,
                     classes=configs.num_classes,
                     activation=configs.activation)
    if len(configs.gpu_id) > 1:
        model = nn.DataParallel(model)
    model.cuda()
    # get files
    filenames = glob(configs.dataset + "masks/*")
    filenames = [os.path.basename(i) for i in filenames]
    # random split dataset into train and val
    train_files, val_files = train_test_split(filenames, test_size=0.2)
    # define different aug
    if configs.use_strong_aug:
        transform_train = stong_aug()
    else:
        transform_train = get_training_augmentation()
    transform_valid = get_valid_augmentation()
    # make data loader for train and val
    train_dataset = SegDataset(train_files,
                               phase="train",
                               transforms=transform_train)
    valid_dataset = SegDataset(val_files,
                               phase="valid",
                               transforms=transform_valid)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=configs.bs,
                                               shuffle=True,
                                               num_workers=configs.workers)
    valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                               batch_size=configs.bs,
                                               shuffle=False,
                                               num_workers=configs.workers)
    optimizer = get_optimizer(model)
    loss_func = get_loss_func(configs.loss_func)
    criterion = loss_func().cuda()
    # tensorboardX writer
    writer = SummaryWriter(configs.log_dir)
    # set lr scheduler method
    if configs.lr_scheduler == "step":
        scheduler_default = torch.optim.lr_scheduler.StepLR(optimizer,
                                                            step_size=10,
                                                            gamma=0.1)
    elif configs.lr_scheduler == "on_loss":
        scheduler_default = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.2, patience=5, verbose=False)
    elif configs.lr_scheduler == "on_iou":
        scheduler_default = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='max', factor=0.2, patience=5, verbose=False)
    elif configs.lr_scheduler == "on_dice":
        scheduler_default = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='max', factor=0.2, patience=5, verbose=False)
    elif configs.lr_scheduler == "cosine":
        scheduler_default = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, configs.epochs - configs.warmup_epo)
    else:
        scheduler_default = torch.optim.lr_scheduler.StepLR(optimizer,
                                                            step_size=6,
                                                            gamma=0.1)
    # scheduler with warmup
    if configs.warmup:
        scheduler = GradualWarmupScheduler(optimizer,
                                           multiplier=configs.warmup_factor,
                                           total_epoch=configs.warmup_epo,
                                           after_scheduler=scheduler_default)
    else:
        scheduler = scheduler_default
    for epoch in range(configs.epochs):
        print('\nEpoch: [%d | %d] LR: %.8f' %
              (epoch + 1, configs.epochs, optimizer.param_groups[0]['lr']))
        train_loss, train_dice, train_iou = train(train_loader, model,
                                                  criterion, optimizer, epoch,
                                                  writer)
        valid_loss, valid_dice, valid_iou = eval(valid_loader, model,
                                                 criterion, epoch, writer)
        if configs.lr_scheduler == "step" or configs.lr_scheduler == "cosine" or configs.warmup:
            scheduler.step(epoch)
        elif configs.lr_scheduler == "on_iou":
            scheduler.step(valid_iou)
        elif configs.lr_scheduler == "on_dice":
            scheduler.step(valid_dice)
        elif configs.lr_scheduler == "on_loss":
            scheduler.step(valid_loss)
        # save model
        is_best_iou = valid_iou > best_iou
        is_best_dice = valid_dice > best_dice
        best_iou = max(valid_iou, best_iou)
        best_dice = max(valid_dice, best_dice)
        print("Best {}: {}, Best Dice: {}".format(configs.metric, best_iou, best_dice))
        save_checkpoint({
            'state_dict': model.state_dict(),
        }, is_best_iou, is_best_dice)
Code example #15
def main():
    """Main function"""

    # Initialization
    args = parse_args()
    rank, n_ranks = init_workers(args.distributed)

    # Load configuration
    config = load_config(args.config)
    train_config = config['training']
    output_dir = os.path.expandvars(config['output_dir'])
    checkpoint_format = os.path.join(output_dir, 'checkpoints',
                                     'checkpoint-{epoch}.h5')
    if rank==0:
        os.makedirs(output_dir, exist_ok=True)

    # Logging
    config_logging(verbose=args.verbose)
    logging.info('Initialized rank %i out of %i', rank, n_ranks)
    if args.show_config:
        logging.info('Command line config: %s', args)
    if rank == 0:
        logging.info('Job configuration: %s', config)
        logging.info('Saving job outputs to %s', output_dir)

    # Configure session
    device_config = config.get('device', {})
    configure_session(**device_config)

    # Load the data
    train_gen, valid_gen = get_datasets(batch_size=train_config['batch_size'],
                                        **config['data'])

    # Build the model
    model = get_model(**config['model'])
    # Configure optimizer
    opt = get_optimizer(n_ranks=n_ranks, dist_wrapper=hvd.DistributedOptimizer, **config['optimizer'])
    # Compile the model
    model.compile(loss=train_config['loss'], optimizer=opt,
                  metrics=train_config['metrics'])
    if rank == 0:
        model.summary()

    # Prepare the training callbacks
    callbacks = get_basic_callbacks(args.distributed)

    # Learning rate warmup
    warmup_epochs = train_config.get('lr_warmup_epochs', 0)
    callbacks.append(hvd.callbacks.LearningRateWarmupCallback(
                     warmup_epochs=warmup_epochs, verbose=1))

    # Learning rate decay schedule
    for lr_schedule in train_config.get('lr_schedule', []):
        if rank == 0:
            logging.info('Adding LR schedule: %s', lr_schedule)
        callbacks.append(hvd.callbacks.LearningRateScheduleCallback(**lr_schedule))

    # Checkpoint only from rank 0
    if rank == 0:
        os.makedirs(os.path.dirname(checkpoint_format), exist_ok=True)
        callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_format))
        
    # Timing callback
    timing_callback = TimingCallback()
    callbacks.append(timing_callback)

    # Train the model
    train_steps_per_epoch = max([len(train_gen) // n_ranks, 1])
    valid_steps_per_epoch = max([len(valid_gen) // n_ranks, 1])
    history = model.fit_generator(train_gen,
                                  epochs=train_config['n_epochs'],
                                  steps_per_epoch=train_steps_per_epoch,
                                  validation_data=valid_gen,
                                  validation_steps=valid_steps_per_epoch,
                                  callbacks=callbacks,
                                  workers=4, verbose=2 if rank == 0 else 0)

    # Save training history
    if rank == 0:
        # Print some best-found metrics
        if 'val_acc' in history.history.keys():
            logging.info('Best validation accuracy: %.3f',
                         max(history.history['val_acc']))
        if 'val_top_k_categorical_accuracy' in history.history.keys():
            logging.info('Best top-5 validation accuracy: %.3f',
                         max(history.history['val_top_k_categorical_accuracy']))
        logging.info('Average time per epoch: %.3f s',
                     np.mean(timing_callback.times))
        np.savez(os.path.join(output_dir, 'history'),
                 n_ranks=n_ranks, **history.history)

    # Drop to IPython interactive shell
    if args.interactive and (rank == 0):
        logging.info('Starting IPython interactive session')
        import IPython
        IPython.embed()

    if rank == 0:
        logging.info('All done!')
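
The get_optimizer call above (with n_ranks and dist_wrapper=hvd.DistributedOptimizer) is defined elsewhere in the project. A minimal sketch of how such a helper might look, assuming a tf.keras optimizer selected by name from config['optimizer'] and linear learning-rate scaling with the number of ranks, is shown below; the argument names and the scaling rule are assumptions.

import tensorflow as tf


def get_optimizer(name="Adam", lr=1e-3, n_ranks=1, dist_wrapper=None, **opt_args):
    """Build a Keras optimizer, scale the lr for data parallelism, and optionally wrap it."""
    # name, lr and the linear lr scaling are assumptions for illustration
    opt_class = getattr(tf.keras.optimizers, name)
    opt = opt_class(learning_rate=lr * n_ranks, **opt_args)
    if dist_wrapper is not None and n_ranks > 1:
        # e.g. hvd.DistributedOptimizer, which averages gradients across ranks
        opt = dist_wrapper(opt)
    return opt

Linear lr scaling with the worker count is a common convention for synchronous data-parallel training, but the project may use a different rule.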
Code example #16
0
def main():
    """Main function"""

    # Initialization
    args = parse_args()
    rank, local_rank, n_ranks = init_workers(args.distributed)

    # Load configuration
    config = load_config(args.config)

    # Configure logging
    config_logging(verbose=args.verbose)
    logging.info('Initialized rank %i local_rank %i size %i',
                 rank, local_rank, n_ranks)

    # Device configuration
    configure_session(gpu=local_rank, **config.get('device', {}))

    # Load the data
    train_data, valid_data = get_datasets(rank=rank, n_ranks=n_ranks,
                                          **config['data'])
    if rank == 0:
        logging.info(train_data)
        logging.info(valid_data)

    # Construct the model and optimizer
    model = get_model(**config['model'])
    optimizer = get_optimizer(n_ranks=n_ranks, **config['optimizer'])
    train_config = config['train']

    # Custom metrics for pixel accuracy and IoU
    metrics = [PixelAccuracy(), PixelIoU(name='iou', num_classes=3)]

    # Compile the model
    model.compile(loss=train_config['loss'], optimizer=optimizer,
                  metrics=metrics)

    # Print a model summary
    if rank == 0:
        model.summary()

    # Prepare the callbacks
    callbacks = []

    if args.distributed:

        # Broadcast initial variable states from rank 0 to all processes.
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))

        # Average metrics across workers
        callbacks.append(hvd.callbacks.MetricAverageCallback())

        # Learning rate warmup
        warmup_epochs = train_config.get('lr_warmup_epochs', 0)
        callbacks.append(hvd.callbacks.LearningRateWarmupCallback(
            warmup_epochs=warmup_epochs, verbose=1))

    # Timing
    timing_callback = TimingCallback()
    callbacks.append(timing_callback)

    # Checkpointing and CSV logging from rank 0 only
    #if rank == 0:
    #    callbacks.append(tf.keras.callbacks.ModelCheckpoint(checkpoint_format))
    #    callbacks.append(tf.keras.callbacks.CSVLogger(
    #        os.path.join(config['output_dir'], 'history.csv'), append=args.resume))

    if rank == 0:
        logging.debug('Callbacks: %s', callbacks)

    # Train the model
    verbosity = 2 if rank == 0 or args.verbose else 0
    history = model.fit(train_data,
                        validation_data=valid_data,
                        epochs=train_config['n_epochs'],
                        callbacks=callbacks,
                        verbose=verbosity)

    # All done
    if rank == 0:
        logging.info('All done!')
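
The scripts above append a TimingCallback and later report np.mean(timing_callback.times). A minimal sketch of such a callback, assuming tf.keras and per-epoch wall-clock timing, could look like this:

import time

import tensorflow as tf


class TimingCallback(tf.keras.callbacks.Callback):
    """Collect per-epoch wall-clock times in self.times."""

    def __init__(self):
        super().__init__()
        self.times = []

    def on_epoch_begin(self, epoch, logs=None):
        self._epoch_start = time.time()

    def on_epoch_end(self, epoch, logs=None):
        self.times.append(time.time() - self._epoch_start)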
Code example #17
0
def main():
    """Main function"""
    # Initialization
    args = parse_args()
    rank, n_ranks = init_workers(args.distributed)

    # Load configuration
    config = load_config(args.config)
    train_config = config['training']
    output_dir = os.path.expandvars(config['output_dir'])
    checkpoint_format = os.path.join(output_dir, 'checkpoints',
                                     'checkpoint-{epoch}.h5')
    os.makedirs(output_dir, exist_ok=True)

    # Logging
    config_logging(verbose=args.verbose, output_dir=output_dir)
    logging.info('Initialized rank %i out of %i', rank, n_ranks)
    if args.show_config:
        logging.info('Command line config: %s', args)
    if rank == 0:
        logging.info('Job configuration: %s', config)
        logging.info('Saving job outputs to %s', output_dir)

    # Configure session
    if args.distributed:
        gpu = hvd.local_rank()
    else:
        gpu = args.gpu
    device_config = config.get('device', {})
    configure_session(gpu=gpu, **device_config)

    # Load the data
    train_gen, valid_gen = get_datasets(batch_size=train_config['batch_size'],
                                        **config['data_and_model'],
                                        **config['data'])

    # Build the model
    # if (type(config['data']['n_components']) is int):
    #     rho_length_in = config['data']['n_components']
    # else:
    rho_length_in = config['model']['rho_length_out']

    model = get_model(rho_length_in=rho_length_in, 
                      **config['data_and_model'],
                      **config['model'])
    # Configure optimizer
    opt = get_optimizer(n_ranks=n_ranks, distributed=args.distributed,
                        **config['optimizer'])
    # Compile the model
    model.compile(loss=train_config['loss'], optimizer=opt,
                  metrics=train_config['metrics'])
    if rank == 0:
        model.summary()

    # Prepare the training callbacks
    callbacks = []
    if args.distributed:

        # Broadcast initial variable states from rank 0 to all processes.
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))

        # # Learning rate warmup
        # warmup_epochs = train_config.get('lr_warmup_epochs', 0)
        # callbacks.append(hvd.callbacks.LearningRateWarmupCallback(
        #     warmup_epochs=warmup_epochs, verbose=1))

        # # Learning rate decay schedule
        # for lr_schedule in train_config.get('lr_schedule', []):
        #     if rank == 0:
        #         logging.info('Adding LR schedule: %s', lr_schedule)
        #     callbacks.append(hvd.callbacks.LearningRateScheduleCallback(**lr_schedule))

    # Checkpoint only from rank 0
    if rank == 0:
        #os.makedirs(os.path.dirname(checkpoint_format), exist_ok=True)
        #callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_format))
        #callbacks.append(keras.callbacks.EarlyStopping(monitor='val_loss',
        #                                           patience=5))
        callbacks.append(keras.callbacks.ModelCheckpoint(filepath=os.path.join(output_dir, 'model.h5'),
                                                         monitor='val_mean_absolute_error',
                                                         save_best_only=False,
                                                         verbose=2))


    # Timing
    timing_callback = TimingCallback()
    callbacks.append(timing_callback)

    # Train the model
    # At least one step per epoch, even if n_ranks exceeds the number of batches
    steps_per_epoch = max(len(train_gen) // n_ranks, 1)

    history = model.fit_generator(train_gen,
                                  epochs=train_config['n_epochs'],
                                  steps_per_epoch=steps_per_epoch,
                                  validation_data=valid_gen,
                                  validation_steps=len(valid_gen),
                                  callbacks=callbacks,
                                  workers=4, verbose=1)

    # Save training history
    if rank == 0:
        # Print some best-found metrics
        if 'val_acc' in history.history.keys():
            logging.info('Best validation accuracy: %.3f',
                         max(history.history['val_acc']))
        if 'val_top_k_categorical_accuracy' in history.history.keys():
            logging.info('Best top-5 validation accuracy: %.3f',
                         max(history.history['val_top_k_categorical_accuracy']))
        if 'val_mean_absolute_error' in history.history.keys():
            logging.info('Best validation mae: %.3f',
                         min(history.history['val_mean_absolute_error']))

        logging.info('Average time per epoch: %.3f s',
                     np.mean(timing_callback.times))
        np.savez(os.path.join(output_dir, 'history'),
                 n_ranks=n_ranks, **history.history)

    # Drop to IPython interactive shell
    if args.interactive and (rank == 0):
        logging.info('Starting IPython interactive session')
        import IPython
        IPython.embed()

    if rank == 0:
        logging.info('All done!')
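
The scripts that save training history write it on rank 0 with np.savez to history.npz in the configured output directory. A short usage sketch (not part of the scripts above) for loading that file back afterwards; the output_dir path is a placeholder for whatever the job config specifies:

import os

import numpy as np

output_dir = "/path/to/output_dir"  # placeholder; use the output_dir from the job config
history = np.load(os.path.join(output_dir, "history.npz"))
print("Recorded metrics:", list(history.keys()))
if "val_mean_absolute_error" in history:
    print("Best val MAE: %.3f" % history["val_mean_absolute_error"].min())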