Example #1
    def save_checkpoint(self):
        """Saves a checkpoint of the network and other variables."""

        net = self.actor.net.module if multigpu.is_multi_gpu(
            self.actor.net) else self.actor.net

        actor_type = type(self.actor).__name__
        net_type = type(net).__name__
        state = {
            'epoch': self.epoch,
            'actor_type': actor_type,
            'net_type': net_type,
            'net': net.state_dict(),
            'net_info': getattr(net, 'info', None),
            'constructor': getattr(net, 'constructor', None),
            'optimizer': self.optimizer.state_dict(),
            'stats': self.stats,
            'settings': self.settings
        }

        directory = '{}/{}'.format(self._checkpoint_dir,
                                   self.settings.project_path)
        os.makedirs(directory, exist_ok=True)

        # First save as a tmp file
        tmp_file_path = '{}/{}_ep{:04d}.tmp'.format(directory, net_type,
                                                    self.epoch)
        torch.save(state, tmp_file_path)

        file_path = '{}/{}_ep{:04d}.pth.tar'.format(directory, net_type,
                                                    self.epoch)

        # Now rename to the actual checkpoint name. os.rename is atomic when the
        # source and destination are on the same filesystem (POSIX), so readers
        # never see a partially written checkpoint.
        os.rename(tmp_file_path, file_path)
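The tmp-file-then-rename pattern above is a general recipe for crash-safe checkpointing. A minimal standalone sketch of the same idea, assuming only torch and the standard library (save_atomically is an illustrative name, and it uses os.replace instead of os.rename for portable overwrite semantics):

    import os
    import torch

    def save_atomically(state, file_path):
        """Write state to file_path without ever exposing a half-written file."""
        tmp_path = file_path + '.tmp'
        torch.save(state, tmp_path)  # an interrupted save only affects the .tmp file
        # Atomic when both paths are on the same filesystem; unlike os.rename,
        # os.replace also overwrites an existing destination on Windows.
        os.replace(tmp_path, file_path)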
Example #2
    def load_pretrained(self,
                        checkpoint=None,
                        fields=None,
                        ignore_fields=None,
                        load_constructor=False):
        """ Loads a pre-trained network parameter from a checkpoint file. """
        from ltr.admin import loading, multigpu

        net = self.actor.net.module if multigpu.is_multi_gpu(
            self.actor.net) else self.actor.net

        net_type = type(net).__name__

        # Load network
        print("load from: {}".format(checkpoint))
        checkpoint_dict = loading.torch_load_legacy(checkpoint)

        assert net_type == checkpoint_dict[
            'net_type'], 'Network is not of correct type.'

        if fields is None:
            fields = checkpoint_dict.keys()

        # Only the network weights are loaded; the other checkpoint fields and
        # the ignore_fields/load_constructor arguments are unused here.
        for key in fields:
            if key == 'net':
                net.load_state_dict(checkpoint_dict[key])

        return True
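Hypothetical usage, assuming a constructed trainer instance (the variable name and path are illustrative):

    trainer.load_pretrained(checkpoint='/path/to/DiMPnet_ep0040.pth.tar')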
Example #3
    def save_checkpoint(self):
        """Saves a checkpoint of the network and other variables."""

        net = self.actor.net.module if multigpu.is_multi_gpu(
            self.actor.net) else self.actor.net

        actor_type = type(self.actor).__name__
        net_type = type(net).__name__
        state = {
            'epoch': self.epoch,
            'actor_type': actor_type,
            'net_type': net_type,
            'net': net.state_dict(),
            'net_info': getattr(net, 'info', None),
            'constructor': getattr(net, 'constructor', None),
            'optimizer': self.optimizer.state_dict(),
            'stats': self.stats,
            'settings': self.settings
        }

        directory = '{}/{}'.format(self._checkpoint_dir,
                                   self.settings.project_path)
        os.makedirs(directory, exist_ok=True)

        file_path = '{}/{}_ep{:04d}.pth.tar'.format(directory, net_type,
                                                    self.epoch)
        torch.save(state, file_path)
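Note that, unlike Example #1, this variant writes the checkpoint directly: if the process is killed mid-torch.save, a truncated .pth.tar is left at the final path. The tmp-file-then-rename pattern of Example #1 avoids this at the cost of a single extra rename.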
Example #4
    def load_weights(self, checkpoint=None):
        """Loads network weights from a checkpoint file, ignoring all other fields."""
        net = self.actor.net.module if multigpu.is_multi_gpu(self.actor.net) else self.actor.net
        # Load only the network weights; strict=False tolerates missing/unexpected keys.
        checkpoint_dict = loading.torch_load_legacy(checkpoint)
        net.load_state_dict(checkpoint_dict['net'], strict=False)
        return True
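With strict=False, load_state_dict skips parameters that are missing from either side instead of raising, and returns the mismatches. A small self-contained sketch with toy modules (the layer shapes are illustrative):

    import torch.nn as nn

    src = nn.Sequential(nn.Linear(4, 4))                    # checkpoint-side model
    dst = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 2))   # current model with an extra layer

    # Loads the shared '0.*' weights and reports the rest instead of raising.
    result = dst.load_state_dict(src.state_dict(), strict=False)
    print(result.missing_keys)     # ['1.weight', '1.bias']
    print(result.unexpected_keys)  # []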
Example #5
    def load_checkpoint(self,
                        checkpoint=None,
                        fields=None,
                        ignore_fields=None,
                        load_constructor=False):
        """Loads a network checkpoint file.

        Can be called in three different ways:
            load_checkpoint():
                Loads the latest epoch from the workspace. Use this to continue training.
            load_checkpoint(epoch_num):
                Loads the network at the given epoch number (int).
            load_checkpoint(path_to_checkpoint):
                Loads the file from the given absolute path (str).
        """

        net = self.actor.net.module if multigpu.is_multi_gpu(
            self.actor.net) else self.actor.net

        actor_type = type(self.actor).__name__
        net_type = type(net).__name__

        if checkpoint is None:
            # Load most recent checkpoint
            checkpoint_list = sorted(
                glob.glob('{}/{}/{}_ep*.pth.tar'.format(
                    self._checkpoint_dir, self.settings.project_path,
                    net_type)))
            print('Searching for checkpoints: {}/{}/{}_ep*.pth.tar'.format(
                self._checkpoint_dir, self.settings.project_path, net_type))
            if checkpoint_list:
                checkpoint_path = checkpoint_list[-1]
            else:
                print('No matching checkpoint file found')
                return
        elif isinstance(checkpoint, int):
            # Checkpoint is the epoch number
            checkpoint_path = '{}/{}/{}_ep{:04d}.pth.tar'.format(
                self._checkpoint_dir, self.settings.project_path, net_type,
                checkpoint)
        elif isinstance(checkpoint, str):
            # checkpoint is the path
            if os.path.isdir(checkpoint):
                checkpoint_list = sorted(
                    glob.glob('{}/*_ep*.pth.tar'.format(checkpoint)))
                if checkpoint_list:
                    checkpoint_path = checkpoint_list[-1]
                else:
                    raise FileNotFoundError('No checkpoint found in directory: {}'.format(checkpoint))
            else:
                checkpoint_path = os.path.expanduser(checkpoint)
        else:
            raise TypeError('checkpoint must be None, an epoch number (int) or a checkpoint path (str)')

        # Load network
        checkpoint_dict = loading.torch_load_legacy(checkpoint_path)

        # The strict net_type check is disabled here so that checkpoints from related network types can be loaded.

        if fields is None:
            fields = checkpoint_dict.keys()
        if ignore_fields is None:
            ignore_fields = ['settings']

        # Never load the scheduler. It exists in older checkpoints.
        ignore_fields.extend([
            'lr_scheduler', 'constructor', 'net_type', 'actor_type', 'net_info'
        ])

        # Load all fields
        for key in fields:
            if key in ignore_fields:
                continue
            if key == 'net':
                # Partial load (added for the merge layer): keep only checkpoint
                # entries whose keys exist in the current model, so any extra
                # layers retain their fresh initialization.
                model_dict = net.state_dict()
                pretrained_dict = {
                    k: v
                    for k, v in checkpoint_dict[key].items() if k in model_dict
                }
                model_dict.update(pretrained_dict)
                net.load_state_dict(model_dict)

            elif key == 'optimizer':
                try:
                    self.optimizer.load_state_dict(checkpoint_dict[key])
                except Exception:
                    # Optimizer state may not match the current parameter groups; skip it.
                    continue
            else:
                setattr(self, key, checkpoint_dict[key])

        # Set the net info
        if load_constructor and 'constructor' in checkpoint_dict and checkpoint_dict[
                'constructor'] is not None:
            net.constructor = checkpoint_dict['constructor']
        if 'net_info' in checkpoint_dict and checkpoint_dict[
                'net_info'] is not None:
            net.info = checkpoint_dict['net_info']

        # Update the epoch in lr scheduler
        if 'epoch' in fields:
            self.lr_scheduler.last_epoch = self.epoch

        return True
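The filtered merge in the 'net' branch is a general way to initialize a modified architecture from an older checkpoint: parameters with matching names are copied, while new layers keep their fresh initialization. A standalone sketch of the same idea (load_matching_weights is an illustrative name; the shape check is an addition the original loop omits):

    def load_matching_weights(net, checkpoint_state):
        """Copy only checkpoint parameters whose name and shape exist in net."""
        model_dict = net.state_dict()
        pretrained = {k: v for k, v in checkpoint_state.items()
                      if k in model_dict and v.shape == model_dict[k].shape}
        model_dict.update(pretrained)  # entries not in the checkpoint keep their current values
        net.load_state_dict(model_dict)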
Example #6
    def load_checkpoint(self, checkpoint=None, fields=None, ignore_fields=None, load_constructor=False):
        """Loads a network checkpoint file.

        Can be called in three different ways:
            load_checkpoint():
                Loads the latest epoch from the workspace. Use this to continue training.
            load_checkpoint(epoch_num):
                Loads the network at the given epoch number (int).
            load_checkpoint(path_to_checkpoint):
                Loads the file from the given absolute path (str).
        """

        net = self.actor.net.module if multigpu.is_multi_gpu(self.actor.net) else self.actor.net

        actor_type = type(self.actor).__name__
        net_type = type(net).__name__

        if checkpoint is None:
            # Load most recent checkpoint
            checkpoint_list = sorted(glob.glob('{}/{}/{}_ep*.pth.tar'.format(self._checkpoint_dir,
                                                                             self.settings.project_path, net_type)))
            print('Found checkpoints: {}'.format(checkpoint_list))

            if checkpoint_list:
                checkpoint_path = checkpoint_list[-1]
            else:
                print('No matching checkpoint file found')
                return
        elif isinstance(checkpoint, int):
            # Checkpoint is the epoch number
            checkpoint_path = '{}/{}/{}_ep{:04d}.pth.tar'.format(self._checkpoint_dir, self.settings.project_path,
                                                                 net_type, checkpoint)
        elif isinstance(checkpoint, str):
            # checkpoint is the path
            if os.path.isdir(checkpoint):
                checkpoint_list = sorted(glob.glob('{}/*_ep*.pth.tar'.format(checkpoint)))
                if checkpoint_list:
                    checkpoint_path = checkpoint_list[-1]
                else:
                    raise Exception('No checkpoint found')
            else:
                checkpoint_path = os.path.expanduser(checkpoint)
        else:
            raise TypeError

        # Load network
        checkpoint_dict = loading.torch_load_legacy(checkpoint_path)

        # The current network may extend the checkpointed one (e.g. net_type
        # 'DiMPnet_rgbd' loading a 'DiMPnet' checkpoint), so only a substring
        # match is required rather than exact equality.
        assert checkpoint_dict['net_type'] in net_type, 'Network is not of correct type.'

        if fields is None:
            fields = checkpoint_dict.keys()
        if ignore_fields is None:
            ignore_fields = ['settings']

        # Never load the scheduler. It exists in older checkpoints.
        ignore_fields.extend(['lr_scheduler', 'constructor', 'net_type', 'actor_type', 'net_info'])

        # Load all fields
        for key in fields:
            if key in ignore_fields:
                continue
            if key == 'net':
                # strict=False tolerates missing/unexpected keys (e.g. parameters
                # added in DiMPnet_rgbd that a base DiMPnet checkpoint lacks).
                net.load_state_dict(checkpoint_dict[key], strict=False)
            elif key == 'optimizer':
                self.optimizer.load_state_dict(checkpoint_dict[key])
            else:
                setattr(self, key, checkpoint_dict[key])

        # Set the net info
        if load_constructor and 'constructor' in checkpoint_dict and checkpoint_dict['constructor'] is not None:
            net.constructor = checkpoint_dict['constructor']
        if 'net_info' in checkpoint_dict and checkpoint_dict['net_info'] is not None:
            net.info = checkpoint_dict['net_info']

        # Update the epoch in lr scheduler
        if 'epoch' in fields:
            self.lr_scheduler.last_epoch = self.epoch

        return True
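Hypothetical calls covering the modes handled above (the trainer instance and paths are illustrative):

    trainer.load_checkpoint()                                    # resume from the latest epoch in the workspace
    trainer.load_checkpoint(40)                                  # load epoch 40 by number
    trainer.load_checkpoint('/path/to/DiMPnet_ep0040.pth.tar')   # load an explicit file
    trainer.load_checkpoint('/path/to/checkpoints/')             # a directory works too: newest matching file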