def repair(in_path):
    with open(in_path, "rb") as f:
        checkpoint = pickle.load(f)

    fix_needed = False
    if "lr_scheduler" in checkpoint:
        print("Loading LR scheduler state dict (this might take a few minutes)")
        with io.BytesIO(checkpoint["lr_scheduler"]) as buf:
            lr_sched_state_dict = deserialize_state_dict(buf)
        if "anneal_func" in lr_sched_state_dict:
            # Drop the unserializable "anneal_func" entry and re-serialize
            # the scheduler state.
            fix_needed = True
            del lr_sched_state_dict["anneal_func"]
            with io.BytesIO() as buf:
                serialize_state_dict(buf, lr_sched_state_dict)
                checkpoint["lr_scheduler"] = buf.getvalue()

            out_path = f"{in_path}.repaired"
            print(f"Saving {out_path}")
            with open(out_path, "wb") as f:
                pickle.dump(checkpoint, f)

    if not fix_needed:
        print("This checkpoint does not need repair")
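# Usage sketch (hypothetical; no entry point is shown in the original code).
# Running repair on a checkpoint writes "<in_path>.repaired" alongside the
# input when the broken "anneal_func" entry is found, and never modifies the
# input file itself.
if __name__ == "__main__":
    import sys
    repair(sys.argv[1])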
def test_identical(self):
    model_args = dict(config=dict(
        num_classes=3,
        defaults_sparse=True,
    ))
    model_class = nupic.research.frameworks.pytorch.models.resnets.resnet50
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = create_model(
        model_class=model_class,
        model_args=model_args,
        init_batch_norm=False,
        device=device,
    )

    state = {}
    with io.BytesIO() as buffer:
        serialize_state_dict(buffer, model.state_dict())
        state["model"] = buffer.getvalue()

    with tempfile.NamedTemporaryFile(delete=True) as checkpoint_file:
        pickle.dump(state, checkpoint_file)
        checkpoint_file.flush()

        model2 = create_model(model_class=model_class,
                              model_args=model_args,
                              init_batch_norm=False,
                              device=device,
                              checkpoint_file=checkpoint_file.name)

    self.assertTrue(compare_models(model, model2, (3, 224, 224)))
def setUp(self):
    set_random_seed(20)
    self.model = MNISTSparseCNN()
    self.model.eval()

    # Make all params twice as large to differentiate them from an
    # initialized model. Wrap in no_grad so the in-place update of leaf
    # parameters doesn't raise.
    with torch.no_grad():
        for name, param in self.model.named_parameters():
            if ("cnn" in name or "linear" in name) and ("weight" in name):
                param[:] = param.data * 2

    self.in_1 = torch.rand(2, 1, 28, 28)
    self.in_2 = torch.rand(2, 1024)

    self.out_full = full_forward(self.model, self.in_1)
    self.out_lower = lower_forward(self.model, self.in_1)
    self.out_upper = upper_forward(self.model, self.in_2)

    # Create temporary results directory.
    self.tempdir = tempfile.TemporaryDirectory()
    self.results_dir = Path(self.tempdir.name) / Path("results")
    self.results_dir.mkdir()

    # Save model state.
    state = {}
    with io.BytesIO() as buffer:
        serialize_state_dict(buffer, self.model.state_dict(), compresslevel=-1)
        state["model"] = buffer.getvalue()

    self.checkpoint_path = self.results_dir / Path("mymodel")
    with open(self.checkpoint_path, "wb") as f:
        pickle.dump(state, f)
def test_create_model_from_checkpoint(self):
    model1 = create_model(model_class=resnet50, model_args={},
                          init_batch_norm=False, device="cpu")

    # Simulate an imagenet experiment by changing the weights.
    def init(m):
        if hasattr(m, "weight") and m.weight is not None:
            m.weight.data.fill_(0.042)

    model1.apply(init)

    # Save the model checkpoint only, ignoring the optimizer and other
    # imagenet experiment objects' state. See ImagenetExperiment.get_state.
    state = {}
    with io.BytesIO() as buffer:
        serialize_state_dict(buffer, model1.state_dict())
        state["model"] = buffer.getvalue()

    with tempfile.NamedTemporaryFile() as checkpoint_file:
        # Ray saves checkpoints as pickled dicts.
        pickle.dump(state, checkpoint_file)
        checkpoint_file.file.flush()

        # Load model from checkpoint.
        model2 = create_model(model_class=resnet50, model_args={},
                              init_batch_norm=False, device="cpu",
                              checkpoint_file=checkpoint_file.name)

    self.assertTrue(compare_models(model1, model2, (3, 32, 32)))
def get_state(self):
    """
    Get experiment serialized state as a dictionary of byte arrays.

    :return: dictionary with "model", "optimizer" and "lr_scheduler" states
    """
    state = {"current_epoch": self.current_epoch}

    # Save state into a byte array to avoid ray's GPU serialization issues.
    # See https://github.com/ray-project/ray/issues/5519
    with io.BytesIO() as buffer:
        serialize_state_dict(buffer, self.model.module.state_dict())
        state["model"] = buffer.getvalue()

    with io.BytesIO() as buffer:
        serialize_state_dict(buffer, self.optimizer.state_dict())
        state["optimizer"] = buffer.getvalue()

    if self.lr_scheduler is not None:
        with io.BytesIO() as buffer:
            serialize_state_dict(buffer, self.lr_scheduler.state_dict())
            state["lr_scheduler"] = buffer.getvalue()

    if self.mixed_precision:
        with io.BytesIO() as buffer:
            serialize_state_dict(buffer, amp.state_dict())
            state["amp"] = buffer.getvalue()

    return state
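# For context, a minimal sketch of the restore side of get_state above.
# The real set_state is not shown in this section, so everything here is an
# assumption except deserialize_state_dict, which appears elsewhere in this
# code: each byte array is wrapped back in a BytesIO, deserialized, and
# loaded into the corresponding object.
def set_state(self, state):
    if "current_epoch" in state:
        self.current_epoch = state["current_epoch"]

    if "model" in state:
        with io.BytesIO(state["model"]) as buffer:
            self.model.module.load_state_dict(deserialize_state_dict(buffer))

    if "optimizer" in state:
        with io.BytesIO(state["optimizer"]) as buffer:
            self.optimizer.load_state_dict(deserialize_state_dict(buffer))

    if "lr_scheduler" in state and self.lr_scheduler is not None:
        with io.BytesIO(state["lr_scheduler"]) as buffer:
            self.lr_scheduler.load_state_dict(deserialize_state_dict(buffer))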
def get_state(self):
    """
    Get experiment serialized state as a dictionary of byte arrays.

    :return: dictionary with the serialized "algorithm" state plus the
             "current_epoch" and "total_steps" bookkeeping fields
    """
    state = {
        "current_epoch": self.current_epoch,
        "total_steps": self.total_steps,
    }

    # Save state into a byte array to avoid ray's GPU serialization issues.
    # See https://github.com/ray-project/ray/issues/5519
    with io.BytesIO() as buffer:
        serialize_state_dict(buffer, self.algorithm.state_dict())
        state["algorithm"] = buffer.getvalue()

    return state
def test_serialization(self):
    model1 = simple_linear_net()
    model2 = simple_linear_net()

    def init(m):
        if hasattr(m, "weight") and m.weight is not None:
            m.weight.data.fill_(42.0)

    model2.apply(init)

    with io.BytesIO() as buffer:
        serialize_state_dict(buffer, model1.state_dict())
        buffer.seek(0)
        state_dict = deserialize_state_dict(buffer)

    model2.load_state_dict(state_dict)
    self.assertTrue(compare_models(model1, model2, (32, )))
def _create_test_checkpoint(file_name):
    """
    Creates a checkpoint file to be used with
    `test_checkpoint_backward_compatibility`.

    Whenever the `test_checkpoint_backward_compatibility` test fails, you
    need to create a new checkpoint file from the previous version (commit)
    using this function and update the test to include the new file.
    """
    model = _create_test_model()

    # Save the model checkpoint only, ignoring the optimizer and other
    # imagenet experiment objects' state. See ImagenetExperiment.get_state.
    state = {}
    with io.BytesIO() as buffer:
        serialize_state_dict(buffer, model.state_dict(), compresslevel=9)
        state["model"] = buffer.getvalue()

    with open(file_name, "wb") as checkpoint_file:
        pickle.dump(state, checkpoint_file)
        checkpoint_file.flush()
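# A sketch of how the backward-compatibility test might consume such a file
# (hypothetical helper; the real test_checkpoint_backward_compatibility is
# not shown here): unpickle the dict, deserialize the "model" bytes, and
# load them into a freshly created model for comparison.
def _load_test_checkpoint(file_name):
    with open(file_name, "rb") as f:
        state = pickle.load(f)
    with io.BytesIO(state["model"]) as buffer:
        state_dict = deserialize_state_dict(buffer)
    model = _create_test_model()
    model.load_state_dict(state_dict)
    return model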
def test_create_model_from_checkpoint(self):
    model1 = _create_test_model()

    # Save the model checkpoint only, ignoring the optimizer and other
    # imagenet experiment objects' state. See ImagenetExperiment.get_state.
    state = {}
    with io.BytesIO() as buffer:
        serialize_state_dict(buffer, model1.state_dict())
        state["model"] = buffer.getvalue()

    with tempfile.NamedTemporaryFile() as checkpoint_file:
        # Ray saves checkpoints as pickled dicts.
        pickle.dump(state, checkpoint_file)
        checkpoint_file.file.flush()

        # Load model from checkpoint.
        model2 = create_model(
            model_class=resnet50,
            model_args=TEST_MODEL_ARGS,
            init_batch_norm=False,
            device="cpu",
            checkpoint_file=checkpoint_file.name)

    self.assertTrue(compare_models(model1, model2, (3, 32, 32)))
def test_identical(self):
    model_args = dict(num_classes=3)
    model_class = nupic.research.frameworks.pytorch.models.resnets.resnet50
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model_class(**model_args)
    model.to(device)

    state = {}
    with io.BytesIO() as buffer:
        serialize_state_dict(buffer, model.state_dict())
        state["model"] = buffer.getvalue()

    with tempfile.NamedTemporaryFile(delete=True) as checkpoint_file:
        pickle.dump(state, checkpoint_file)
        checkpoint_file.flush()

        model2 = model_class(**model_args)
        model2.to(device)
        load_state_from_checkpoint(model2, checkpoint_file.name, device)

    self.assertTrue(compare_models(model, model2, (3, 224, 224)))
def setUp(self):
    set_random_seed(20)
    self.model = torch.nn.Sequential(
        torch.nn.Linear(8, 8),
        KWinners(8, percent_on=0.1),
    )

    # Create temporary results directory.
    self.tempdir = tempfile.TemporaryDirectory()
    self.results_dir = Path(self.tempdir.name) / Path("results")
    self.results_dir.mkdir()

    # Save model state.
    state = {}
    with io.BytesIO() as buffer:
        serialize_state_dict(buffer, self.model.state_dict(), compresslevel=-1)
        state["model"] = buffer.getvalue()

    self.checkpoint_path = self.results_dir / Path("mymodel")
    with open(self.checkpoint_path, "wb") as f:
        pickle.dump(state, f)
def get_state(self):
    """
    Get experiment serialized state as a dictionary of byte arrays.

    :return: dictionary with "model", "optimizer" and "lr_scheduler" states
    """
    state = {
        "current_epoch": self.current_epoch,
    }

    # Save state into a byte array to avoid ray's GPU serialization issues.
    # See https://github.com/ray-project/ray/issues/5519
    with io.BytesIO() as buffer:
        model = self.model
        if hasattr(model, "module"):
            # DistributedDataParallel wraps the real model in `module`.
            model = model.module
        serialize_state_dict(buffer, model.state_dict())
        state["model"] = buffer.getvalue()

    with io.BytesIO() as buffer:
        serialize_state_dict(buffer, self.optimizer.state_dict())
        state["optimizer"] = buffer.getvalue()

    if self.lr_scheduler is not None:
        with io.BytesIO() as buffer:
            state_dict = self.lr_scheduler.state_dict()
            if "anneal_func" in state_dict:
                # FIXME: This is a workaround for a PyTorch bug.
                # https://github.com/pytorch/pytorch/issues/42376
                del state_dict["anneal_func"]
            serialize_state_dict(buffer, state_dict)
            state["lr_scheduler"] = buffer.getvalue()

    if self.mixed_precision:
        with io.BytesIO() as buffer:
            serialize_state_dict(buffer, amp.state_dict())
            state["amp"] = buffer.getvalue()

    return state
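# Restore-side note on the anneal_func workaround above (a sketch, assuming
# PyTorch's default _LRScheduler.load_state_dict, which simply does
# self.__dict__.update(state_dict)): because "anneal_func" was dropped
# before serializing, loading the saved dict leaves the freshly constructed
# scheduler's own anneal_func bound method in place. The helper name below
# is hypothetical.
def _load_lr_scheduler_state(lr_scheduler, serialized_bytes):
    with io.BytesIO(serialized_bytes) as buffer:
        state_dict = deserialize_state_dict(buffer)
    lr_scheduler.load_state_dict(state_dict)  # anneal_func stays intact
    return lr_scheduler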