def fake_data(size=100, image_size=(1, 4, 4), train=False):
    # `train` is accepted for signature parity with other dataset factories but is unused here.
    return FakeData(size=size, image_size=image_size, transform=ToTensor())
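A minimal usage sketch (not part of the original snippet): the helper drops straight into a standard DataLoader, and the shapes below assume torchvision's FakeData defaults.

from torch.utils.data import DataLoader

# Illustrative only: wrap the fake dataset in a DataLoader and pull one batch.
loader = DataLoader(fake_data(size=16), batch_size=4)
images, labels = next(iter(loader))
print(images.shape, labels.shape)  # typically torch.Size([4, 1, 4, 4]) and torch.Size([4])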
# Note: OPTIM (e.g. torch.optim.SGD) and dist_init() are expected to be defined
# elsewhere in the benchmark script; the imports below cover the rest.
import math
import time
from typing import Any, List, Union, cast

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision.datasets import FakeData
from torchvision.models import resnet101
from torchvision.transforms import ToTensor

from fairscale.optim.oss import OSS


def train(
    rank: int,
    world_size: int,
    num_epochs: int = 10,
    batch_size: int = 32,
    data_size: int = 200,
    use_oss: bool = True,
    check_regression: bool = True,
    reference_speed: float = -1.0,
    reference_memory: float = -1.0,
):
    # DDP
    dist_init(rank, world_size)

    # Standard RN101
    model = resnet101(pretrained=False, progress=True).to(rank)

    # Data setup, dummy data
    def collate(inputs: List[Any]):
        return {
            "inputs": torch.stack([i[0] for i in inputs]).to(rank),
            "label": torch.stack([i[1] for i in inputs]).to(rank),
        }

    dataloader = DataLoader(
        dataset=FakeData(transform=ToTensor(), size=data_size),
        batch_size=batch_size,
        collate_fn=collate,
    )
    loss_fn = nn.CrossEntropyLoss()

    # Reset the memory use counter
    torch.cuda.reset_peak_memory_stats(rank)

    # Shard the optimizer
    optimizer: Union[OSS, OPTIM] = (
        OSS(params=model.parameters(), optim=OPTIM, lr=1e-4, momentum=0.9)
        if use_oss
        else OPTIM(model.parameters(), lr=1e-4, momentum=0.9)
    )

    # Dummy training loop
    torch.cuda.synchronize(rank)
    training_start = time.monotonic()
    model.train()

    measurements = []
    for epoch in range(num_epochs):
        epoch_start = time.monotonic()

        for batch in dataloader:

            def closure():
                model.zero_grad()
                outputs = model(batch["inputs"])
                loss = loss_fn(outputs, batch["label"])
                dist.all_reduce(loss, op=dist.ReduceOp.SUM)
                loss /= world_size
                loss.backward()
                return loss

            optimizer.step(closure)

        epoch_end = time.monotonic()

        if use_oss:
            # Check the checkpointing in the case of the OSS optimizer.
            # Memory usage could spill over from there.
            optimizer = cast(OSS, optimizer)
            optimizer.consolidate_state_dict()
            if dist.get_rank() == 0:
                _ = optimizer.state_dict()
                print("... State dict collected")

        measurements.append(data_size / (epoch_end - epoch_start))
        if dist.get_rank() == 0:
            print(f"Epoch {epoch} - processed {measurements[-1]:.2f} img per sec")

    torch.cuda.synchronize(rank)
    training_stop = time.monotonic()
    img_per_sec = data_size / (training_stop - training_start) * num_epochs
    max_memory = torch.cuda.max_memory_allocated(rank) / 2**20

    print(f"[{dist.get_rank()}] : Training done. {img_per_sec:.2f} img per sec overall")
    print(f"[{dist.get_rank()}] : Peak memory {max_memory:.1f}MiB")

    # Compute the mean and standard deviation of the per-epoch throughput
    mean = sum(measurements) / len(measurements)
    diff = map(lambda x: pow(x - mean, 2.0), measurements)
    std = math.sqrt(sum(diff) / (len(measurements) - 1))
    print(f"[{dist.get_rank()}] : Mean speed: {mean:.2f} +/- {std:.2f}")

    if use_oss and check_regression and dist.get_rank() == 0:
        assert (mean + 3.0 * std) > reference_speed, "Speed regression detected"
        assert max_memory < 1.05 * reference_memory, "Memory use regression detected"
        print("[Regression Test] VALID")
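Since train() takes the process rank as its first argument, it can be driven by torch.multiprocessing.spawn. A hedged launcher sketch follows: the dist_init() helper below is a plausible stand-in for the one assumed above (the backend, address, and port are illustrative), and the regression check is disabled because no reference numbers are passed.

import os

import torch.distributed as dist
import torch.multiprocessing as mp


def dist_init(rank: int, world_size: int) -> None:
    # Illustrative process-group setup; backend, address, and port are assumptions.
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29501"
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)


if __name__ == "__main__":
    world_size = 2  # illustrative
    mp.spawn(
        train,
        # world_size, num_epochs, batch_size, data_size, use_oss, check_regression
        args=(world_size, 2, 32, 200, True, False),
        nprocs=world_size,
        join=True,
    )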
args.repeat = 10
args.number = 10

model = resnet18()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), 0.001)

dataset = FakeData(
    size=args.batch_size * args.number,
    image_size=(3, 224, 224),
    num_classes=1000,
    transform=transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
    ]),
)
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=args.batch_size,
    pin_memory=True,
    num_workers=args.workers,
    shuffle=False,
)

bench = BenchTrainer(
    __file__,
def __init__(
    self,
    dataset_name=None,
    batch_size=2,
    train_size=10,
    val_size=10,
    image_size=(3, 224, 224),
    num_classes=10,
    transform=None,
    target_transform=None,
    random_offset=0,
    num_workers=0,
):
    self.dataset_name = dataset_name or "temp_data.hdf5"
    if not self.dataset_name.endswith(".hdf5"):
        self.dataset_name = self.dataset_name + ".hdf5"

    # Create a temporary directory to hold the fake data.
    self.temp_dir = tempfile.TemporaryDirectory()

    # Create a saver to help store the new data.
    self.dataset_path = os.path.join(self.temp_dir.name, self.dataset_name)
    self.data_saver = HDF5DataSaver(data_path=self.dataset_path)

    # Create fake datasets that generate the data.
    self.fake_train_dataset = FakeData(
        size=train_size,
        image_size=image_size,
        num_classes=num_classes,
        transform=transform,
        target_transform=target_transform,
        random_offset=random_offset,
    )
    val_random_offset = random_offset + max(int(num_classes / 2), 1)
    self.fake_val_dataset = FakeData(
        size=val_size,
        image_size=image_size,
        num_classes=num_classes,
        transform=transform,
        target_transform=target_transform,
        random_offset=val_random_offset,
    )

    # Keep track of the classes actually seen; small sampling sizes may give incomplete
    # coverage, e.g. `train_size=10` may yield only classes 1-6 rather than all 10.
    self.classes = {"train": set(), "val": set()}

    # Append one pass of each dataset to the "train" and "val" hdf5 groupings.
    self._append_dataset(dataset=self.fake_train_dataset, group_name="train")
    self._append_dataset(dataset=self.fake_val_dataset, group_name="val")

    # Create dataloaders over the newly saved temp data.
    self.batch_size = batch_size
    self.train_num_classes = len(self.classes["train"])
    self.val_num_classes = len(self.classes["val"])

    self.train_dataset = create_train_dataset(
        self.dataset_path,
        "train",
        num_classes=self.train_num_classes,
    )
    self.train_dataloader = DataLoader(
        dataset=self.train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
    )
    self.val_dataset = create_validation_dataset(
        self.dataset_path,
        "val",
        num_classes=self.val_num_classes,
    )
    self.val_dataloader = DataLoader(
        dataset=self.val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
    )
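A hypothetical usage sketch for the helper above. FakeHDF5DataModule is a stand-in name for the enclosing class, which is not shown in this excerpt, and the sketch assumes the HDF5-backed datasets yield (image, label) pairs like the FakeData they were generated from.

from torchvision.transforms import ToTensor

# Hypothetical: build the temp HDF5 datasets and pull one training batch.
data = FakeHDF5DataModule(train_size=20, val_size=8, batch_size=4, transform=ToTensor())
images, labels = next(iter(data.train_dataloader))
print(images.shape, labels.shape, data.train_num_classes)

data.temp_dir.cleanup()  # the backing HDF5 file lives in a TemporaryDirectory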