Example 1
def fake_data(size=100, image_size=(1, 4, 4), train=False):
    return FakeData(size=size, image_size=image_size, transform=ToTensor())
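
A brief usage sketch (not part of the excerpt): the helper returns a torchvision FakeData dataset, so it can be passed straight to a DataLoader. The batch size and the print below are illustrative assumptions.

from torch.utils.data import DataLoader

loader = DataLoader(fake_data(size=32), batch_size=8)
images, labels = next(iter(loader))
print(images.shape)  # torch.Size([8, 1, 4, 4]) with the default image_size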
Example 2
def train(
    rank: int,
    world_size: int,
    num_epochs: int = 10,
    batch_size: int = 32,
    data_size: int = 200,
    use_oss: bool = True,
    check_regression: bool = True,
    reference_speed: float = -1.0,
    reference_memory: float = -1.0,
):

    # DDP
    dist_init(rank, world_size)

    # Standard RN101
    model = resnet101(pretrained=False, progress=True).to(rank)

    # Data setup, dummy data
    def collate(inputs: List[Any]):
        return {
            "inputs": torch.stack([i[0] for i in inputs]).to(rank),
            "label": torch.stack([i[1] for i in inputs]).to(rank),
        }

    dataloader = DataLoader(dataset=FakeData(transform=ToTensor(),
                                             size=data_size),
                            batch_size=batch_size,
                            collate_fn=collate)
    loss_fn = nn.CrossEntropyLoss()

    # Reset the memory use counter
    torch.cuda.reset_peak_memory_stats(rank)

    # Shard the optimizer
    optimizer: Union[OSS, OPTIM] = (
        OSS(params=model.parameters(), optim=OPTIM, lr=1e-4, momentum=0.9)
        if use_oss
        else OPTIM(model.parameters(), lr=1e-4, momentum=0.9)
    )

    # Dummy training loop
    torch.cuda.synchronize(rank)
    training_start = time.monotonic()
    model.train()

    measurements = []

    for epoch in range(num_epochs):
        epoch_start = time.monotonic()

        for batch in dataloader:

            def closure():
                model.zero_grad()
                outputs = model(batch["inputs"])
                loss = loss_fn(outputs, batch["label"])
                dist.all_reduce(loss, op=dist.ReduceOp.SUM)
                loss /= world_size
                loss.backward()
                return loss

            optimizer.step(closure)

        epoch_end = time.monotonic()

        if use_oss:
            # Check the checkpointing in the case of the OSS optimizer
            # Memory usage could spill over from there
            optimizer = cast(OSS, optimizer)
            optimizer.consolidate_state_dict()
            if dist.get_rank() == 0:
                _ = optimizer.state_dict()
                print("... State dict collected")

        measurements.append(data_size / (epoch_end - epoch_start))
        if dist.get_rank() == 0:
            print(
                f"Epoch {epoch} - processed {measurements[-1]:.2f} img per sec"
            )

    torch.cuda.synchronize(rank)
    training_stop = time.monotonic()
    img_per_sec = data_size / (training_stop - training_start) * num_epochs
    max_memory = torch.cuda.max_memory_allocated(rank) / 2**20

    print(
        f"[{dist.get_rank()}] : Training done. {img_per_sec:.2f} img per sec overall"
    )
    print(f"[{dist.get_rank()}] : Peak memory {max_memory:.1f}MiB")

    # Compute the mean and standard deviation of img per second
    mean = sum(measurements) / len(measurements)
    diff = map(lambda x: pow(x - mean, 2.0), measurements)
    std = math.sqrt(sum(diff) / (len(measurements) - 1))
    print(f"[{dist.get_rank()}] : Mean speed: {mean:.2f} +/- {std:.2f}")

    if use_oss and check_regression and dist.get_rank() == 0:
        assert (mean + 3.0 * std) > reference_speed, "Speed regression detected"
        assert max_memory < 1.05 * reference_memory, "Memory use regression detected"
        print("[Regression Test] VALID")
Example 3
    args.repeat = 10
    args.number = 10

    model = resnet18()
    criterion = nn.CrossEntropyLoss()

    optimizer = torch.optim.SGD(
        model.parameters(),
        0.001
    )

    dataset = FakeData(
        size=args.batch_size * args.number,
        image_size=(3, 224, 224),
        num_classes=1000,
        transform=transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor()
        ])
    )

    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        pin_memory=True,
        num_workers=args.workers,
        shuffle=False,
    )

    bench = BenchTrainer(
        __file__,
Example 4
    def __init__(self,
                 dataset_name=None,
                 batch_size=2,
                 train_size=10,
                 val_size=10,
                 image_size=(3, 224, 224),
                 num_classes=10,
                 transform=None,
                 target_transform=None,
                 random_offset=0,
                 num_workers=0):

        self.dataset_name = dataset_name or "temp_data.hdf5"
        if ".hdf5" not in self.dataset_name[-5:]:
            self.dataset_name = self.dataset_name + ".hdf5"

        # Create Fake directory of data.
        self.temp_dir = tempfile.TemporaryDirectory()

        # Create saver to help store new data.
        self.dataset_path = os.path.join(self.temp_dir.name, self.dataset_name)
        self.data_saver = HDF5DataSaver(data_path=self.dataset_path)

        # Create fake datasets that help generate data.
        self.fake_train_dataset = FakeData(size=train_size,
                                           image_size=image_size,
                                           num_classes=num_classes,
                                           transform=transform,
                                           target_transform=target_transform,
                                           random_offset=random_offset)
        val_random_offset = random_offset + max(int(num_classes / 2), 1)
        self.fake_val_dataset = FakeData(size=val_size,
                                         image_size=image_size,
                                         num_classes=num_classes,
                                         transform=transform,
                                         target_transform=target_transform,
                                         random_offset=val_random_offset)

        # Keep track of the classes actually sampled; small sample sizes may give
        # incomplete coverage, e.g. `train_size=10` may only cover classes 1-6 rather than all 10.
        self.classes = {"train": set(), "val": set()}

        # Append one pass of each dataset to the "train" and "val" hdf5 groupings.
        self._append_dataset(dataset=self.fake_train_dataset,
                             group_name="train")
        self._append_dataset(dataset=self.fake_val_dataset, group_name="val")

        # Create dataloaders of newly saved temp data.
        self.batch_size = batch_size
        self.train_num_classes = len(self.classes["train"])
        self.val_num_classes = len(self.classes["val"])
        self.train_dataset = create_train_dataset(
            self.dataset_path,
            "train",
            num_classes=self.train_num_classes,
        )
        self.train_dataloader = DataLoader(dataset=self.train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           num_workers=num_workers)

        self.val_dataset = create_validation_dataset(
            self.dataset_path, "val", num_classes=self.val_num_classes)
        self.val_dataloader = DataLoader(dataset=self.val_dataset,
                                         batch_size=batch_size,
                                         shuffle=False,
                                         num_workers=num_workers)
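
A hypothetical usage sketch of the class above; the class name FakeHDF5TaskData is an assumption (only __init__ appears in this excerpt), and only attributes created in __init__ are referenced.

# Hypothetical class name -- the excerpt above only shows __init__.
data = FakeHDF5TaskData(batch_size=4, train_size=20, val_size=10)
print(data.dataset_path, data.train_num_classes, data.val_num_classes)

for batch in data.train_dataloader:  # batches read back from the temporary HDF5 file
    pass

data.temp_dir.cleanup()  # remove the temporary directory when finished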