def test_top1_accuracy():
    """Check top-1 accuracy for two-column and single-column model outputs.

    Both output layouts are scored against the same targets and are
    expected to yield 2 correct out of 5, i.e. 40%.
    """
    labels = torch.tensor([0, 1, 0, 0, 1]).reshape(5, 1)
    two_column_output = torch.tensor(
        [[0, 1], [0, 1], [1, 0], [0, 1], [1, 0]]
    ).reshape(5, 2)
    single_column_output = torch.tensor([1, 1, 0, 1, 0]).reshape(5, 1)

    expected_score = (2 / 5) * 100
    top1 = TopKAccuracy()

    # The metric is called with a leading ``None`` placeholder argument.
    score_two_column = top1(None, two_column_output, labels)
    score_single_column = top1(None, single_column_output, labels)

    assert score_two_column == expected_score
    assert score_single_column == expected_score
def test_top3_accuracy():
    """Check that top-3 accuracy over five 4-class score rows is 60%."""
    class_scores = torch.tensor([
        [0.2, 0.2, 0.3, 0.1],
        [0.15, 0.2, 0.05, 0.6],
        [0.25, 0.3, 0.15, 0.3],
        [0.3, 0.1, 0.2, 0.2],
        [0.15, 0.15, 0.2, 0.5],
    ]).reshape(5, 4)
    labels = torch.tensor([3, 1, 0, 2, 1]).reshape(5, 1)

    top3 = TopKAccuracy(topk=3)
    measured_score = top3(class_scores, labels)

    # The test expects 3 of the 5 samples to count as correct.
    assert measured_score == (3 / 5) * 100
def test_update_best_runtime_metric(mocker):
    """Report a rising then falling "prec" value and check best-value tracking.

    The first two values (10.0, 11.0) must each be flagged as a new best,
    the third (9.0) must not. The returned name is always "best_prec".
    """
    tracker = Tracker([TopKAccuracy(5)], 1, 0)

    # (reported value, whether it is expected to be the new best)
    scenarios = ((10.0, True), (11.0, True), (9.0, False))

    for value, expect_best in scenarios:
        is_best, best_metric_name = update_best_runtime_metric(
            tracker, value, "prec")

        if expect_best:
            assert is_best
        else:
            assert not is_best
        assert best_metric_name == "best_prec"
def metrics():
    """Return a one-element list holding a top-1 accuracy metric."""
    top1_accuracy = TopKAccuracy(topk=1)
    return [top1_accuracy]
Beispiel #5
0
def main(run_id, dataset_dir, ckpt_run_dir, output_dir, validation_only=False):
    r"""Train ResNet-20 on CIFAR-10 with SSGDWM, or evaluate saved checkpoints.

    Args:
        run_id: Run identifier forwarded to the training control flow.
        dataset_dir: Directory handed to ``CIFAR10V1`` for both splits.
        ckpt_run_dir: Checkpoint directory used by the backend
            initialisation, the checkpointer and the evaluation flow.
        output_dir: Directory receiving ``mlbench.log`` and, in
            validation-only mode, ``train_stats.json`` / ``val_stats.json``.
        validation_only: When true, skip training and evaluate checkpoints
            epoch by epoch; existing checkpoints are then not deleted.
    """
    use_cuda = True
    train_epochs = 164
    batch_size = 128
    num_parallel_workers = 2
    max_batch_per_epoch = None

    log_path = os.path.join(output_dir, 'mlbench.log')
    initialize_backends(
        comm_backend='mpi',
        logging_level='INFO',
        logging_file=log_path,
        use_cuda=use_cuda,
        seed=42,
        cudnn_deterministic=False,
        ckpt_run_dir=ckpt_run_dir,
        delete_existing_ckpts=not validation_only,
    )

    rank, world_size = dist.get_rank(), dist.get_world_size()

    model = ResNetCIFAR(
        resnet_size=20, bottleneck=False, num_classes=10, version=1)

    optimizer = SSGDWM(
        model,
        world_size=world_size,
        num_coordinates=1,
        lr=0.1,
        weight_decay=0,
    )

    # Learning rate drops by 10x at epochs 82 and 109.
    scheduler = MultiStepLR(optimizer, milestones=[82, 109], gamma=0.1)
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        optimizer = optimizer.cuda()
        loss_function = loss_function.cuda()

    # Top-1 and top-5 accuracy.
    metrics = [TopKAccuracy(topk=k) for k in (1, 5)]

    train_set = CIFAR10V1(dataset_dir, train=True, download=True)
    val_set = CIFAR10V1(dataset_dir, train=False, download=True)

    # Each worker only keeps its own shard of both splits.
    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    # Options shared by both loaders; only ``shuffle`` differs.
    loader_options = dict(
        batch_size=batch_size,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )
    train_loader = DataLoader(train_set, shuffle=True, **loader_options)
    val_loader = DataLoader(val_set, shuffle=False, **loader_options)

    checkpointer = Checkpointer(
        ckpt_run_dir=ckpt_run_dir, rank=rank, freq=CheckpointFreq.NONE)

    if validation_only:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=loss_function,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype='fp32',
            max_batch_per_epoch=None,
        )

        # Evaluate and dump the training split first, then validation.
        for stats_name, loader in (("train_stats.json", train_loader),
                                   ("val_stats.json", val_loader)):
            stats = cecf.evaluate_by_epochs(loader)
            with open(os.path.join(output_dir, stats_name), 'w') as f:
                json.dump(stats, f)
        return

    controlflow = TrainValidation(
        model=model,
        optimizer=optimizer,
        loss_function=loss_function,
        metrics=metrics,
        scheduler=scheduler,
        batch_size=batch_size,
        train_epochs=train_epochs,
        rank=rank,
        world_size=world_size,
        run_id=run_id,
        dtype='fp32',
        validate=True,
        schedule_per='epoch',
        checkpoint=checkpointer,
        transform_target_type=None,
        average_models=True,
        use_cuda=use_cuda,
        max_batch_per_epoch=max_batch_per_epoch,
    )

    controlflow.run(
        dataloader_train=train_loader,
        dataloader_val=val_loader,
        dataloader_train_fn=None,
        dataloader_val_fn=None,
        resume=False,
        repartition_per_epoch=False,
    )
Beispiel #6
0
def test_tracker():
    """Smoke test: a Tracker can be built from a single metric."""
    metric_list = [TopKAccuracy(5)]
    created = Tracker(metric_list, 1, 0)

    assert created is not None
def run(rank, size, run_id):
    """Distributed Synchronous SGD Example.

    Trains ``Net`` for up to 10 epochs with plain SGD, averaging gradients
    across workers after every backward pass, and runs a validation round
    at the end of each epoch. Returns early once the tracker reports that
    the goal has been reached.

    Args:
        rank: Rank of this worker; forwarded to the tracker and to the
            validation-stats recorder.
        size: Not used inside this function.
        run_id: Run identifier forwarded to the tracker.
    """
    torch.manual_seed(1234)
    logging.info("Loading Dataset")
    train_set, bsz = partition_dataset_train()
    val_set, bsz_val = partition_dataset_val()
    logging.info("Setting up models and training")
    model = Net()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    metrics = [
        TopKAccuracy(topk=1),
        TopKAccuracy(topk=5)
    ]
    loss_func = nn.NLLLoss()

    goal = task1_time_to_accuracy_goal()

    tracker = Tracker(metrics, run_id, rank, goal=goal)

    num_batches = ceil(len(train_set.dataset) / float(bsz))
    num_batches_val = ceil(len(val_set.dataset) / float(bsz_val))

    tracker.start()

    logging.info("Starting train loop")

    for epoch in range(10):
        tracker.train()

        epoch_loss = 0.0
        for i, (data, target) in enumerate(train_set):
            tracker.batch_start()

            optimizer.zero_grad()
            output = model(data)

            tracker.record_batch_step('forward')

            loss = loss_func(output, target)
            # Read the scalar once; it is reused for the epoch sum and the log.
            batch_loss = loss.item()
            epoch_loss += batch_loss

            tracker.record_batch_step('loss')

            loss.backward()

            tracker.record_batch_step('backward')

            average_gradients(model)
            optimizer.step()

            tracker.batch_end()

            logging.info("Batch: {}, Loss: {}".format(i, batch_loss))

        tracker.record_loss(epoch_loss, num_batches, log_to_api=True)

        logging.debug('Rank %s, epoch %s: %s',
                      dist.get_rank(), epoch,
                      epoch_loss / num_batches)

        # Bind the validation results to their own names: rebinding
        # ``metrics`` here would replace the list of metric objects that the
        # next epoch's validation round needs as input.
        metrics_values, val_loss = validation_round(
            val_set, model, loss_func, metrics, "fp32",
            tracker=tracker, transform_target_type=False,
            use_cuda=False, max_batches=num_batches_val)
        record_validation_stats(metrics_values, val_loss,
                                tracker=tracker, rank=rank)

        tracker.epoch_end()

        if tracker.goal_reached:
            logging.debug("Goal Reached!")
            return
Beispiel #8
0
def train_loop(
    run_id,
    dataset_dir,
    ckpt_run_dir,
    output_dir,
    validation_only=False,
    use_cuda=False,
    light_target=False,
):
    """Train a logistic-regression model on the "epsilon" dataset, or evaluate checkpoints.

    In training mode, each epoch runs the timed batch loop, a validation
    round, a scheduler step on the validation loss, stats recording and a
    checkpointer save. The function returns early once the tracker reports
    that the goal is reached. In validation-only mode, checkpoints are
    evaluated epoch by epoch on both loaders and the results are written
    as JSON.

    Args:
        run_id: Run identifier forwarded to the tracker.
        dataset_dir: Root directory passed to ``LMDBDataset``.
        ckpt_run_dir: Checkpoint directory used by the checkpointer and the
            checkpoint-evaluation control flow.
        output_dir: Directory receiving ``train_stats.json`` and
            ``val_stats.json`` in validation-only mode.
        validation_only: When true, skip training and only evaluate
            checkpoints.
        use_cuda: Move model and loss to the GPU, and forward the flag to
            the optimizer, loaders and helper calls.
        light_target: Select ``task2_time_to_accuracy_light_goal`` instead
            of ``task2_time_to_accuracy_goal`` as the tracker goal.
    """
    num_parallel_workers = 2
    max_batch_per_epoch = None
    train_epochs = 20
    batch_size = 100

    n_features = 2000

    l1_coef = 0.0
    l2_coef = 0.0000025  # Regularization 1 / train_size ( 1 / 400,000)
    dtype = "fp32"

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # Learning rate grows linearly with the worker count, capped at 16x.
    lr = 4
    scaled_lr = lr * min(16, world_size)

    by_layer = False
    agg_grad = False  # According to paper, we aggregate weights after update

    model = LogisticRegression(n_features)

    # A loss_function for computing the loss
    loss_function = BCELossRegularized(l1=l1_coef, l2=l2_coef, model=model)

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    optimizer = CentralizedSGD(
        world_size=world_size,
        model=model,
        lr=scaled_lr,
        use_cuda=use_cuda,
        by_layer=by_layer,
        agg_grad=agg_grad,
    )

    metrics = [
        TopKAccuracy(),  # Binary accuracy with threshold 0.5
        F1Score(),
        DiceCoefficient(),
    ]

    train_set = LMDBDataset(name="epsilon",
                            data_type="train",
                            root=dataset_dir)
    val_set = LMDBDataset(name="epsilon", data_type="test", root=dataset_dir)

    # Each worker keeps only its own partition of both splits.
    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )

    val_loader = DataLoader(
        val_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )

    num_batches_per_device_train = len(train_loader)

    # The scheduler wraps the inner ``optimizer.optimizer`` and never lowers
    # the rate below the unscaled ``lr``.
    scheduler = ReduceLROnPlateau(
        optimizer.optimizer,
        factor=0.75,
        patience=0,
        verbose=True,
        threshold_mode="abs",
        threshold=0.01,
        min_lr=lr,
    )
    checkpointer = Checkpointer(ckpt_run_dir=ckpt_run_dir,
                                rank=rank,
                                freq=CheckpointFreq.NONE)

    if not validation_only:
        if light_target:
            goal = task2_time_to_accuracy_light_goal()
        else:
            goal = task2_time_to_accuracy_goal()

        tracker = Tracker(metrics, run_id, rank, goal=goal)

        # Synchronise all workers before the tracker is started.
        dist.barrier()
        tracker.start()

        for epoch in range(0, train_epochs):
            # Set tracker and model in training mode
            model.train()
            tracker.train()

            # The tracker.record_* calls mark the end of each phase of a
            # batch, so their position relative to the compute steps matters.
            for batch_idx, (data, target) in enumerate(train_loader):
                tracker.batch_start()
                data, target = prepare_batch(
                    data,
                    target,
                    dtype=dtype,
                    transform_target_dtype=False,
                    use_cuda=use_cuda,
                )
                tracker.record_batch_load()

                # Clear gradients in the optimizer.
                optimizer.zero_grad()
                tracker.record_batch_init()

                # Compute the output
                output = model(data)
                tracker.record_batch_fwd_pass()

                # Compute the loss
                loss = loss_function(output, target)
                tracker.record_batch_comp_loss()

                # Backprop
                loss.backward()
                tracker.record_batch_backprop()

                # Aggregate gradients/parameters from all workers and apply updates to model
                optimizer.step(tracker=tracker)

                metrics_results = compute_train_batch_metrics(
                    output,
                    target,
                    metrics,
                )

                tracker.record_batch_comp_metrics()

                # scheduler.batch_step()
                tracker.batch_end()

                record_train_batch_stats(
                    batch_idx,
                    loss.item(),
                    output,
                    metrics_results,
                    tracker,
                    num_batches_per_device_train,
                )

            tracker.epoch_end()

            # Perform validation and gather results
            metrics_values, loss = validation_round(
                val_loader,
                model=model,
                loss_function=loss_function,
                metrics=metrics,
                dtype=dtype,
                tracker=tracker,
                transform_target_type=False,
                use_cuda=use_cuda,
                max_batches=max_batch_per_epoch,
            )
            # Scheduler per epoch, driven by the validation loss
            scheduler.step(loss)
            # Record validation stats
            is_best = record_validation_stats(metrics_values=metrics_values,
                                              loss=loss,
                                              tracker=tracker,
                                              rank=rank)
            checkpointer.save(tracker, model, optimizer, scheduler,
                              tracker.current_epoch, is_best)
            if tracker.goal_reached:
                print("Goal Reached!")
                # Wait for the other workers, then pause 10 seconds before
                # returning.
                dist.barrier()
                time.sleep(10)
                return
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=loss_function,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype="fp32",
            max_batch_per_epoch=None,
        )

        # Evaluate the checkpoints on each split and dump the stats as JSON.
        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), "w") as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), "w") as f:
            json.dump(val_stats, f)
Beispiel #9
0
def train_loop(run_id,
               dataset_dir,
               ckpt_run_dir,
               output_dir,
               validation_only=False,
               use_cuda=False,
               light_target=False):
    r"""Train ResNet-20 on CIFAR-10 with CentralizedSGD, or evaluate checkpoints.

    Args:
        run_id: Run identifier forwarded to the tracker and the validation
            round.
        dataset_dir: Directory handed to ``CIFAR10V1`` for both splits.
        ckpt_run_dir: Checkpoint directory for the checkpointer and the
            checkpoint-evaluation control flow.
        output_dir: Directory receiving ``train_stats.json`` and
            ``val_stats.json`` in validation-only mode.
        validation_only: When true, skip training and only evaluate
            checkpoints.
        use_cuda: Move model and loss to the GPU and forward the flag to
            loaders and helper calls.
        light_target: Select the light time-to-accuracy goal instead of the
            regular one.
    """
    train_epochs = 164
    batch_size = 128
    num_parallel_workers = 2
    max_batch_per_epoch = None

    rank, world_size = dist.get_rank(), dist.get_world_size()

    model = ResNetCIFAR(
        resnet_size=20, bottleneck=False, num_classes=10, version=1)

    optimizer = CentralizedSGD(
        world_size=world_size,
        model=model,
        lr=0.1,
        momentum=0.9,
        weight_decay=1e-4,
        nesterov=False,
    )

    # Learning rate drops by 10x at epochs 82 and 109.
    scheduler = MultiStepLR(optimizer, milestones=[82, 109], gamma=0.1)
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    # Top-1 and top-5 accuracy.
    metrics = [TopKAccuracy(topk=k) for k in (1, 5)]

    train_set = CIFAR10V1(dataset_dir, train=True, download=True)
    val_set = CIFAR10V1(dataset_dir, train=False, download=True)

    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    # Options shared by both loaders; only ``shuffle`` differs.
    loader_options = dict(
        batch_size=batch_size,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )
    train_loader = DataLoader(train_set, shuffle=True, **loader_options)
    val_loader = DataLoader(val_set, shuffle=False, **loader_options)

    checkpointer = Checkpointer(
        ckpt_run_dir=ckpt_run_dir, rank=rank, freq=CheckpointFreq.NONE)

    if validation_only:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=loss_function,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype='fp32',
            max_batch_per_epoch=None,
        )

        # Evaluate and dump the training split first, then validation.
        for stats_name, loader in (("train_stats.json", train_loader),
                                   ("val_stats.json", val_loader)):
            stats = cecf.evaluate_by_epochs(loader)
            with open(os.path.join(output_dir, stats_name), 'w') as f:
                json.dump(stats, f)
        return

    # The goal object is passed through as-is (it is not called here).
    goal = (task1_time_to_accuracy_light_goal
            if light_target else task1_time_to_accuracy_goal)

    tracker = Tracker(metrics, run_id, rank, goal=goal)

    dist.barrier()

    tracker.start()

    for _ in range(train_epochs):
        train_round(
            train_loader,
            model,
            optimizer,
            loss_function,
            metrics,
            scheduler,
            'fp32',
            schedule_per='epoch',
            transform_target_type=None,
            use_cuda=use_cuda,
            max_batch_per_epoch=max_batch_per_epoch,
            tracker=tracker,
        )

        is_best = validation_round(
            val_loader,
            model,
            loss_function,
            metrics,
            run_id,
            rank,
            'fp32',
            transform_target_type=None,
            use_cuda=use_cuda,
            max_batch_per_epoch=max_batch_per_epoch,
            tracker=tracker,
        )

        checkpointer.save(tracker, model, optimizer, scheduler,
                          tracker.current_epoch, is_best)

        tracker.epoch_end()

        if tracker.goal_reached:
            print("Goal Reached!")
            return
Beispiel #10
0
def main(run_id):
    """Train a ResNet-20 on CIFAR-10 using settings from the module-level ``config``.

    Initialises the communication backend, builds the data loaders, model,
    optimizer, warmup/multistep scheduler, loss, metrics and checkpointer,
    then hands everything to ``TrainValidation`` and runs it.

    Args:
        run_id: Run identifier. It names the checkpoint sub-directory under
            ``config['checkpoint_root']`` and is forwarded to the control
            flow.
    """
    checkpoint_dir = os.path.join(config['checkpoint_root'], run_id)
    rank, world_size, _ = initialize_backends(
        comm_backend=config['comm_backend'],
        logging_level=config['logging_level'],
        logging_file=config['logging_file'],
        use_cuda=config['use_cuda'],
        seed=config['seed'],
        ckpt_run_dir=checkpoint_dir)

    os.makedirs(config['dataset_root'], exist_ok=True)

    train_set = CIFAR10V1(config['dataset_root'], train=True, download=True)
    val_set = CIFAR10V1(config['dataset_root'], train=False, download=True)

    # Only the training split is partitioned; each worker iterates the
    # whole validation split.
    train_set = partition_dataset_by_rank(train_set, rank, world_size)

    train_loader = DataLoader(train_set,
                              batch_size=config['batch_size'],
                              shuffle=True,
                              num_workers=config['num_parallel_workers'],
                              pin_memory=config['use_cuda'],
                              drop_last=False)
    val_loader = DataLoader(val_set,
                            batch_size=config['batch_size'],
                            shuffle=False,
                            num_workers=config['num_parallel_workers'],
                            pin_memory=config['use_cuda'],
                            drop_last=False)

    # Honour the configured CUDA setting instead of a hard-coded ``True`` so
    # model creation agrees with every other CUDA decision in this function.
    # NOTE(review): the model dtype is fixed to 'fp32' here while the control
    # flow below receives config['dtype'] — confirm these are meant to differ.
    model = get_resnet_model('resnet20',
                             2,
                             'fp32',
                             num_classes=config['num_classes'],
                             use_cuda=config['use_cuda'])

    if config['use_cuda']:
        model.cuda()

    # The learning rate is specified per sample and scaled by the batch size.
    lr = config['lr_per_sample'] * config['batch_size']

    optimizer = optim.SGD(model.parameters(),
                          lr=lr,
                          momentum=config['momentum'],
                          weight_decay=config['weight_decay'],
                          nesterov=config['nesterov'])

    scheduler = multistep_learning_rates_with_warmup(
        optimizer,
        world_size,
        lr,
        config['multisteplr_gamma'],
        config['multisteplr_milestones'],
        warmup_duration=config['warmup_duration'],
        warmup_linear_scaling=config['warmup_linear_scaling'],
        warmup_lr=lr)

    loss_function = CrossEntropyLoss()

    if config['use_cuda']:
        loss_function.cuda()

    # Top-1 and top-5 accuracy.
    metrics = [TopKAccuracy(topk=1), TopKAccuracy(topk=5)]

    checkpointer = Checkpointer(checkpoint_dir, rank)

    controlflow = TrainValidation(model,
                                  optimizer,
                                  loss_function,
                                  metrics,
                                  scheduler,
                                  config['batch_size'],
                                  config['train_epochs'],
                                  rank,
                                  world_size,
                                  run_id,
                                  dtype=config['dtype'],
                                  checkpoint=checkpointer,
                                  use_cuda=config['use_cuda'])

    controlflow.run(dataloader_train=train_loader, dataloader_val=val_loader)
def main(run_id, validation_only=False):
    r"""Train ResNet-20 on CIFAR-10 with ring-neighbour aggregation, or evaluate checkpoints.

    Dataset, checkpoint and log locations are fixed paths inside the
    function.

    Args:
        run_id: Run identifier forwarded to the training control flow.
        validation_only: When true, skip training, evaluate the stored
            checkpoints epoch by epoch and write the stats as JSON into the
            checkpoint directory; existing checkpoints are then not deleted.
    """
    dataset_root = '/datasets/torch/cifar10'
    ckpt_run_dir = '/checkpoints/decentralized/cifar_resnet20'
    num_parallel_workers = 2
    train_epochs = 164
    use_cuda = True

    initialize_backends(comm_backend='mpi',
                        logging_level='INFO',
                        logging_file='/mlbench.log',
                        use_cuda=use_cuda,
                        seed=42,
                        cudnn_deterministic=False,
                        ckpt_run_dir=ckpt_run_dir,
                        delete_existing_ckpts=not validation_only)

    rank, world_size = dist.get_rank(), dist.get_world_size()

    # A total batch of 256 is split evenly across the workers.
    batch_size = 256 // world_size

    model = ResNetCIFAR(resnet_size=20,
                        bottleneck=False,
                        num_classes=10,
                        version=1)

    optimizer = optim.SGD(model.parameters(),
                          lr=0.1,
                          momentum=0.9,
                          weight_decay=1e-4,
                          nesterov=True)

    # Learning rate drops by 10x at epochs 82 and 109.
    scheduler = MultiStepLR(optimizer, milestones=[82, 109], gamma=0.1)
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    # Top-1 and top-5 accuracy.
    metrics = [TopKAccuracy(topk=k) for k in (1, 5)]

    train_set = CIFAR10V1(dataset_root, train=True, download=True)
    val_set = CIFAR10V1(dataset_root, train=False, download=True)

    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    # Options shared by both loaders; only ``shuffle`` differs.
    loader_options = dict(batch_size=batch_size,
                          num_workers=num_parallel_workers,
                          pin_memory=use_cuda,
                          drop_last=False)
    train_loader = DataLoader(train_set, shuffle=True, **loader_options)
    val_loader = DataLoader(val_set, shuffle=False, **loader_options)

    checkpointer = Checkpointer(ckpt_run_dir=ckpt_run_dir,
                                rank=rank,
                                checkpoint_all=True)

    if validation_only:
        cecf = CheckpointsEvaluationControlFlow(ckpt_dir=ckpt_run_dir,
                                                rank=rank,
                                                world_size=world_size,
                                                checkpointer=checkpointer,
                                                model=model,
                                                epochs=train_epochs,
                                                loss_function=loss_function,
                                                metrics=metrics,
                                                use_cuda=use_cuda,
                                                dtype='fp32',
                                                max_batch_per_epoch=None)

        # Evaluate and dump the training split first, then validation.
        for stats_name, loader in (("train_stats.json", train_loader),
                                   ("val_stats.json", val_loader)):
            stats = cecf.evaluate_by_epochs(loader)
            with open(os.path.join(ckpt_run_dir, stats_name), 'w') as f:
                json.dump(stats, f)
        return

    # Each worker aggregates with the next and previous rank on a ring.
    next_rank = (rank + 1) % world_size
    previous_rank = (rank - 1) % world_size
    ring_neighbors = [next_rank, previous_rank]

    aggregation = DecentralizedAggregation(rank=rank,
                                           neighbors=ring_neighbors)
    agg_fn = aggregation.agg_model

    controlflow = TrainValidation(model=model,
                                  optimizer=optimizer,
                                  loss_function=loss_function,
                                  metrics=metrics,
                                  scheduler=scheduler,
                                  batch_size=batch_size,
                                  train_epochs=train_epochs,
                                  rank=rank,
                                  world_size=world_size,
                                  run_id=run_id,
                                  dtype='fp32',
                                  validate=True,
                                  schedule_per='epoch',
                                  checkpoint=checkpointer,
                                  transform_target_type=None,
                                  average_models=True,
                                  use_cuda=use_cuda,
                                  max_batch_per_epoch=None,
                                  agg_fn=agg_fn)

    controlflow.run(dataloader_train=train_loader,
                    dataloader_val=val_loader,
                    dataloader_train_fn=None,
                    dataloader_val_fn=None,
                    resume=False,
                    repartition_per_epoch=False)
Beispiel #12
0
def train_loop(
    run_id,
    dataset_dir,
    ckpt_run_dir,
    output_dir,
    validation_only=False,
    use_cuda=False,
    light_target=False,
):
    """Train ResNet-20 on CIFAR-10 with CentralizedSGD and LR warmup, or evaluate checkpoints.

    In training mode, each epoch runs the timed batch loop, a validation
    round, a scheduler step on the validation loss, stats recording and a
    checkpointer save. The function returns early once the tracker reports
    that the goal is reached. In validation-only mode, checkpoints are
    evaluated epoch by epoch on both loaders and the results are written
    as JSON.

    Args:
        run_id: Run identifier forwarded to the tracker.
        dataset_dir: Directory handed to ``CIFAR10V1`` for both splits.
        ckpt_run_dir: Checkpoint directory used by the checkpointer and the
            checkpoint-evaluation control flow.
        output_dir: Directory receiving ``train_stats.json`` and
            ``val_stats.json`` in validation-only mode.
        validation_only: When true, skip training and only evaluate
            checkpoints.
        use_cuda: Move model and loss to the GPU, and forward the flag to
            the optimizer, loaders and helper calls.
        light_target: Select ``task1_time_to_accuracy_light_goal`` instead
            of ``task1_time_to_accuracy_goal`` as the tracker goal.
    """
    num_parallel_workers = 2
    max_batch_per_epoch = None
    train_epochs = 164
    batch_size = 128
    dtype = "fp32"

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # LR = 0.1 / 256 / sample
    lr = 0.02
    # Target rate after warmup grows linearly with the number of workers.
    scaled_lr = lr * world_size
    by_layer = False

    # Create Model
    model = ResNetCIFAR(resnet_size=20,
                        bottleneck=False,
                        num_classes=10,
                        version=1)

    # Create optimizer (starts at the unscaled ``lr``; the scheduler below
    # is given ``scaled_lr`` as its warmup target)
    optimizer = CentralizedSGD(
        world_size=world_size,
        model=model,
        lr=lr,
        momentum=0.9,
        weight_decay=1e-4,
        nesterov=False,
        use_cuda=use_cuda,
        by_layer=by_layer,
    )

    # A loss_function for computing the loss
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    # Metrics like Top 1/5 Accuracy
    metrics = [TopKAccuracy(topk=1), TopKAccuracy(topk=5)]

    train_set = CIFAR10V1(dataset_dir, train=True, download=True)
    val_set = CIFAR10V1(dataset_dir, train=False, download=True)

    # Create train/validation sets and loaders
    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )

    val_loader = DataLoader(
        val_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )

    # Create a learning rate scheduler for an optimizer
    scheduler = ReduceLROnPlateauWithWarmup(
        optimizer.optimizer,
        warmup_init_lr=lr,
        scaled_lr=scaled_lr,
        warmup_epochs=int(math.log(world_size, 2)),  # Adaptive warmup period
        factor=0.5,
        threshold_mode="abs",
        threshold=0.01,
        patience=1,
        verbose=True,
        min_lr=lr,
    )

    checkpointer = Checkpointer(ckpt_run_dir=ckpt_run_dir,
                                rank=rank,
                                freq=CheckpointFreq.NONE)

    if not validation_only:
        if light_target:
            goal = task1_time_to_accuracy_light_goal()
        else:
            goal = task1_time_to_accuracy_goal()

        num_batches_per_device_train = len(train_loader)

        tracker = Tracker(metrics, run_id, rank, goal=goal)

        # Synchronise all workers before the tracker is started.
        dist.barrier()

        tracker.start()

        for epoch in range(0, train_epochs):
            # Set tracker and model in training mode
            model.train()
            tracker.train()

            # The tracker.record_* calls mark the end of each phase of a
            # batch, so their position relative to the compute steps matters.
            for batch_idx, (data, target) in enumerate(train_loader):
                tracker.batch_start()
                data, target = prepare_batch(
                    data,
                    target,
                    dtype=dtype,
                    transform_target_dtype=False,
                    use_cuda=use_cuda,
                )
                tracker.record_batch_load()

                # Clear gradients in the optimizer.
                optimizer.zero_grad()
                tracker.record_batch_init()

                # Compute the output
                output = model(data)
                tracker.record_batch_fwd_pass()

                # Compute the loss
                loss = loss_function(output, target)
                tracker.record_batch_comp_loss()

                # Backprop
                loss.backward()
                tracker.record_batch_backprop()

                # Aggregate gradients/parameters from all workers and apply updates to model
                optimizer.step(tracker=tracker)

                metrics_results = compute_train_batch_metrics(
                    output,
                    target,
                    metrics,
                )
                tracker.record_batch_comp_metrics()
                tracker.batch_end()

                record_train_batch_stats(
                    batch_idx,
                    loss.item(),
                    output,
                    metrics_results,
                    tracker,
                    num_batches_per_device_train,
                )

            # Close out the epoch in the tracker
            tracker.epoch_end()

            # Perform validation and gather results
            metrics_values, loss = validation_round(
                val_loader,
                model=model,
                loss_function=loss_function,
                metrics=metrics,
                dtype=dtype,
                tracker=tracker,
                transform_target_type=False,
                use_cuda=use_cuda,
                max_batches=max_batch_per_epoch,
            )
            # Scheduler per epoch, driven by the validation loss
            scheduler.step(loss)

            # Record validation stats
            is_best = record_validation_stats(metrics_values=metrics_values,
                                              loss=loss,
                                              tracker=tracker,
                                              rank=rank)

            checkpointer.save(tracker, model, optimizer, scheduler,
                              tracker.current_epoch, is_best)

            if tracker.goal_reached:
                print("Goal Reached!")
                # Wait for the other workers, then pause 10 seconds before
                # returning.
                dist.barrier()
                time.sleep(10)
                return
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=loss_function,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype="fp32",
            max_batch_per_epoch=None,
        )

        # Evaluate the checkpoints on each split and dump the stats as JSON.
        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), "w") as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), "w") as f:
            json.dump(val_stats, f)
Beispiel #13
0
def train_loop(
    run_id,
    dataset_dir,
    ckpt_run_dir,
    output_dir,
    validation_only=False,
    use_cuda=False,
    light_target=False,
    by_layer=False,
):
    r"""Train ``ResNetCIFAR`` (size 20) on ``CIFAR10V1`` or evaluate checkpoints.

    The model is wrapped in ``DDP`` and optimized with SGD and a
    ``MultiStepLR`` schedule. Each rank works on its own partition of the
    training and validation sets.

    Args:
        run_id: Identifier forwarded to the ``Tracker``.
        dataset_dir: Directory handed to ``CIFAR10V1`` (created with
            ``download=True``).
        ckpt_run_dir: Directory handed to the ``Checkpointer`` and to the
            checkpoint evaluation control flow.
        output_dir: Directory receiving ``train_stats.json`` and
            ``val_stats.json`` when ``validation_only`` is set.
        validation_only: If true, skip training and only evaluate
            checkpoints on the train and validation loaders.
        use_cuda: If true, move model and loss to CUDA, pin loader memory,
            and forward the flag to the data iteration/validation helpers.
        light_target: If true, use ``task1_time_to_accuracy_light_goal()``
            as the tracker goal instead of ``task1_time_to_accuracy_goal()``.
        by_layer: Not read anywhere in this function.

    Returns:
        None. In training mode the function returns early once
        ``tracker.goal_reached`` is set.
    """
    num_parallel_workers = 2
    train_epochs = 164
    batch_size = 128

    rank = dist.get_rank()
    world_size = dist.get_world_size()
    # NOTE: the CUDA device is queried regardless of ``use_cuda``.
    device = cuda.current_device()

    base_net = ResNetCIFAR(
        resnet_size=20, bottleneck=False, num_classes=10, version=1
    )
    model = DDP(base_net.to(device), device_ids=[device])

    optimizer = SGD(
        model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4
    )

    # Learning rate is multiplied by 0.1 at the listed milestones.
    scheduler = MultiStepLR(optimizer, milestones=[82, 109], gamma=0.1)

    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    # Top-1 and Top-5 accuracy.
    metrics = [TopKAccuracy(topk=k) for k in (1, 5)]

    # Build both datasets first, then restrict each to this rank's partition.
    full_train = CIFAR10V1(dataset_dir, train=True, download=True)
    full_val = CIFAR10V1(dataset_dir, train=False, download=True)
    train_shard = partition_dataset_by_rank(full_train, rank, world_size)
    val_shard = partition_dataset_by_rank(full_val, rank, world_size)

    # Options common to both loaders; only ``shuffle`` differs.
    loader_options = dict(
        batch_size=batch_size,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )
    train_loader = DataLoader(train_shard, shuffle=True, **loader_options)
    val_loader = DataLoader(val_shard, shuffle=False, **loader_options)

    checkpointer = Checkpointer(
        ckpt_run_dir=ckpt_run_dir, rank=rank, freq=CheckpointFreq.NONE
    )

    if validation_only:
        # Evaluation-only path: score checkpoints on both loaders and dump
        # the resulting statistics as JSON into ``output_dir``.
        evaluator = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=loss_function,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype="fp32",
            max_batch_per_epoch=None,
        )
        targets = (
            (train_loader, "train_stats.json"),
            (val_loader, "val_stats.json"),
        )
        for loader, file_name in targets:
            stats = evaluator.evaluate_by_epochs(loader)
            with open(os.path.join(output_dir, file_name), "w") as f:
                json.dump(stats, f)
        return

    # Training path.
    goal = (
        task1_time_to_accuracy_light_goal()
        if light_target
        else task1_time_to_accuracy_goal()
    )
    tracker = Tracker(metrics, run_id, rank, goal=goal)

    # Synchronize all ranks before the tracker starts.
    dist.barrier()
    tracker.start()

    for _ in range(train_epochs):
        model.train()
        tracker.train()

        batches = iterate_dataloader(
            train_loader, dtype="fp32", use_cuda=use_cuda
        )
        num_batches_per_device_train = len(train_loader)

        for step, (inputs, labels) in enumerate(batches):
            tracker.batch_start()

            # Reset accumulated gradients.
            optimizer.zero_grad()
            tracker.record_batch_init()

            # Forward pass.
            logits = model(inputs)
            tracker.record_batch_fwd_pass()

            # Loss computation.
            batch_loss = loss_function(logits, labels)
            tracker.record_batch_comp_loss()

            # Backward pass.
            batch_loss.backward()
            tracker.record_batch_backprop()

            # Apply the parameter update.
            optimizer.step()
            tracker.record_batch_opt_step()

            batch_metrics = compute_train_batch_metrics(
                logits, labels, metrics
            )

            tracker.record_batch_comp_metrics()
            tracker.batch_end()

            record_train_batch_stats(
                step,
                batch_loss.item(),
                logits,
                batch_metrics,
                tracker,
                num_batches_per_device_train,
            )

        tracker.epoch_end()
        metrics_values, val_loss = validation_round(
            val_loader,
            model=model,
            loss_function=loss_function,
            metrics=metrics,
            dtype="fp32",
            tracker=tracker,
            use_cuda=use_cuda,
        )

        # The schedule advances once per epoch, after validation.
        scheduler.step()

        is_best = record_validation_stats(
            metrics_values=metrics_values,
            loss=val_loss,
            tracker=tracker,
            rank=rank,
        )

        checkpointer.save(
            tracker,
            model,
            optimizer,
            scheduler,
            tracker.current_epoch,
            is_best,
        )

        if tracker.goal_reached:
            print("Goal Reached!")
            time.sleep(10)
            return
def main(run_id):
    r"""Set up and run distributed CIFAR-10 training with ``TrainValidation``.

    Initializes the backends with the ``'mpi'`` communication backend,
    builds a ``ResNetCIFAR`` model (size 20) with a Nesterov SGD optimizer
    and a ``MultiStepLR`` schedule, then hands everything to a
    ``TrainValidation`` control flow configured for 164 epochs.

    Only the training set is partitioned by rank; the validation loader is
    built from the unpartitioned validation set.

    Args:
        run_id: Identifier forwarded to ``TrainValidation``.
    """
    num_parallel_workers = 2
    dataset_root = '/datasets/torch/cifar10'
    checkpoint_dir = '/checkpoints'
    use_cuda = True
    batch_size = 128

    initialize_backends(
        comm_backend='mpi',
        logging_level='INFO',
        logging_file='/mlbench.log',
        use_cuda=use_cuda,
        seed=42,
        cudnn_deterministic=False,
        ckpt_run_dir=checkpoint_dir,
        delete_existing_ckpts=False,
    )

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    model = ResNetCIFAR(
        resnet_size=20, bottleneck=False, num_classes=10, version=1
    )

    optimizer = optim.SGD(
        model.parameters(),
        lr=0.1,
        momentum=0.9,
        weight_decay=1e-4,
        nesterov=True,
    )

    # Learning rate is multiplied by 0.1 at the listed milestones.
    scheduler = MultiStepLR(optimizer, milestones=[82, 109], gamma=0.1)

    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    # Top-1 and Top-5 accuracy.
    metrics = [TopKAccuracy(topk=k) for k in (1, 5)]

    full_train = CIFAR10V1(dataset_root, train=True, download=True)
    val_set = CIFAR10V1(dataset_root, train=False, download=True)

    # Only the training data is split across ranks.
    train_shard = partition_dataset_by_rank(full_train, rank, world_size)

    # Options common to both loaders; only ``shuffle`` differs.
    loader_options = dict(
        batch_size=batch_size,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )
    train_loader = DataLoader(train_shard, shuffle=True, **loader_options)
    val_loader = DataLoader(val_set, shuffle=False, **loader_options)

    checkpointer = Checkpointer(
        ckpt_run_dir=checkpoint_dir, rank=rank, checkpoint_all=True
    )

    controlflow = TrainValidation(
        model=model,
        optimizer=optimizer,
        loss_function=loss_function,
        metrics=metrics,
        scheduler=scheduler,
        batch_size=batch_size,
        train_epochs=164,
        rank=rank,
        world_size=world_size,
        run_id=run_id,
        dtype='fp32',
        validate=True,
        schedule_per='epoch',
        checkpoint=checkpointer,
        transform_target_type=None,
        average_models=True,
        use_cuda=use_cuda,
        max_batch_per_epoch=None,
    )

    controlflow.run(
        dataloader_train=train_loader,
        dataloader_val=val_loader,
        dataloader_train_fn=None,
        dataloader_val_fn=None,
        resume=False,
        repartition_per_epoch=False,
    )