Example #1
 def __load(self):
     # support
     self.support_dataset = Cifar100(config=self.config,
                                     file='base',
                                     mode='support',
                                     transform=transforms.Compose(
                                         [transforms.ToTensor()]))
     self.support_sampler = Sampler(
         labels=self.support_dataset.label,
         n_way=self.config['sampler']['train']['n_way'],
         k_samples=self.config['sampler']['train']['k_shot'],
         n_episodes=self.config['sampler']['train']['episodes'])
     self.support_dataloader = DataLoader(
         dataset=self.support_dataset, batch_sampler=self.support_sampler)
     # query
     self.query_dataset = Cifar100(config=self.config,
                                   file='base',
                                   mode='query',
                                   transform=transforms.Compose(
                                       [transforms.ToTensor()]))
     self.query_sampler = Sampler(
         labels=self.query_dataset.label,
         n_way=self.config['sampler']['train']['n_way'],
         k_samples=self.config['sampler']['train']['k_query'],
         n_episodes=self.config['sampler']['train']['episodes'])
     self.query_dataloader = DataLoader(dataset=self.query_dataset,
                                        batch_sampler=self.query_sampler)
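
The Sampler here is passed to DataLoader as a batch_sampler, so each iteration has to yield one episode's worth of dataset indices (n_way classes with k_samples picks each, for n_episodes episodes). A minimal sketch of that contract, as an assumption about the interface rather than the project's actual Sampler, could look like:

import numpy as np

class EpisodicBatchSampler:
    """Hypothetical n-way / k-shot batch sampler: yields one list of indices per episode."""

    def __init__(self, labels, n_way, k_samples, n_episodes):
        self.labels = np.asarray(labels)
        self.classes = np.unique(self.labels)
        self.n_way, self.k_samples, self.n_episodes = n_way, k_samples, n_episodes

    def __len__(self):
        return self.n_episodes

    def __iter__(self):
        for _ in range(self.n_episodes):
            episode_classes = np.random.choice(self.classes, self.n_way, replace=False)
            batch = []
            for c in episode_classes:
                class_idx = np.where(self.labels == c)[0]
                batch.extend(np.random.choice(class_idx, self.k_samples, replace=False))
            yield batch
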
Example #5
class SAC(pl.LightningModule):
    def __init__(self, config: Config) -> None:
        super().__init__()
        self.hparams = config

        self.env = env_selector(
            self.hparams
        )  # TODO: normalization is not required but will it be needed?
        self.eval_env = env_selector(self.hparams, config.seed + 1)
        self.Da = self.env.action_space.flat_dim
        self.Do = self.env.observation_space.flat_dim  # includes skill in case env is option wrapped
        self.qf = ValueFunction(self.Do + self.Da,
                                [config.layer_size, config.layer_size])
        # Constructs a value-function MLP with ReLU hidden non-linearities, no output non-linearity, Xavier
        # init for weights and zero init for biases.
        self.vf = ValueFunction(self.Do,
                                [config.layer_size, config.layer_size])
        self.vf_target = ValueFunction(self.Do,
                                       [config.layer_size, config.layer_size])
        self.vf_target.load_state_dict(self.vf.state_dict())

        self.pool = SimpleReplayBuffer(
            env_spec=self.env.spec,
            max_replay_buffer_size=config.max_pool_size,
        )  # create a replay buffer for state+skill and action.

        self.policy = GMMPolicy(
            env_spec=self.env.spec,
            K=config.K,
            hidden_layer_sizes=[config.layer_size, config.layer_size],
            qf=self.qf,
            reg=config.reg,
            device=self.hparams.device
        )  # GMM policy with K mixtures, no reparametrization trick, regularization
        self.modules = [
            "Policy", self.policy, "QF", self.qf, "VF", self.vf, "VF_Target",
            self.vf_target
        ]

        # TODO: add assertion to test qf of policy and qf of model.

        self.sampler = Sampler(self.env, config.max_path_length)

        self._policy_lr = config.lr
        self._qf_lr = config.lr
        self._vf_lr = config.lr
        # TODO: fix variable naming with _
        self._scale_reward = config.scale_reward
        self._discount = config.discount
        self._tau = config.tau
        self.max_path_return = -np.inf
        self.last_path_return = 0
        self.val_path_return = 0
        self._scale_entropy = config.scale_entropy

        self._save_full_state = config.save_full_state
        # Sampling runs on the CPU (moved to on_train_start to avoid a bug in DIAYN). Models are moved to the
        # GPU only by the trainer, which happens after the LightningModule __init__, so no device logic is needed here.
        # TODO: remove device logic in Policy
        # This is also the reason the wandb logger is not available yet.
        self.batch_idx = None
        # torch.autograd.set_detect_anomaly(True) #TODO: disable if compute overhead

    def get_best_skill(self,
                       policy,
                       env,
                       num_skills,
                       max_path_length,
                       n_paths=1):
        print('Finding best skill...')
        reward_list = []
        with policy.deterministic(self.hparams.deterministic_eval):
            for z in range(num_skills):
                env.reset(state=None, skill=z)
                total_returns = 0
                sampler = Sampler(env, max_path_length)
                for p in range(n_paths):
                    new_paths = sampler.sample(max_path_length, policy)
                    total_returns += new_paths[-1]['path_return']
                print('Reward for skill %d = %.3f' % (z, total_returns))
                reward_list.append(total_returns)

        best_z = np.argmax(reward_list)
        print('Best skill found: z = %d, reward = %d, seed = %d' %
              (best_z, reward_list[best_z], self.hparams.seed))
        return best_z

    def on_sanity_check_start(self) -> None:
        self.pool.add_samples(
            self.sampler.sample(self.hparams.min_pool_size, self.policy))
        print("Initialized Replay Buffer with %d samples" % self.pool.size)

    def __dataloader(self) -> DataLoader:
        """Initialize the Replay Buffer dataset used for retrieving experiences"""
        dataset = RLDataset(self.pool, self.hparams.epoch_length,
                            self.hparams.batch_size)

        # TODO: figure out why the reference code uses episode length above instead of batch size

        def _init_fn(worker_id):
            np.random.seed(self.hparams.seed + worker_id)

        dataloader = DataLoader(dataset=dataset,
                                batch_size=self.hparams.batch_size,
                                num_workers=self.hparams.num_workers,
                                worker_init_fn=_init_fn)
        return dataloader
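
        # A minimal sketch of what RLDataset is assumed to do here (hypothetical, not the repository's class):
        # an IterableDataset that draws random transitions from the replay buffer each step, roughly
        #
        #   class RLDataset(torch.utils.data.IterableDataset):
        #       def __init__(self, pool, epoch_length, batch_size):
        #           self.pool, self.epoch_length, self.batch_size = pool, epoch_length, batch_size
        #       def __iter__(self):
        #           # one epoch = epoch_length random draws from the replay buffer
        #           for _ in range(self.epoch_length):
        #               sample = self.pool.random_batch(1)
        #               yield (sample['observations'][0], sample['actions'][0], sample['rewards'][0],
        #                      sample['dones'][0], sample['next_observations'][0])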

    def train_dataloader(self) -> DataLoader:
        """Get train loader"""
        return self.__dataloader()

    def val_dataloader(self) -> DataLoader:
        """Initialize the Replay Buffer dataset used for retrieving experiences"""
        dataset = RLDataset(self.pool, 1, 1)
        dataloader = DataLoader(
            dataset=dataset,
            batch_size=1,
            # num_workers=5
        )
        return dataloader

    # def _split_obs(self,t):
    # TODO remove from DIAYN, herf, and v2?
    #     # TODO: verify that dim is 1, assert shape
    #     return torch.split(t, [self._Do, self._num_skills], 1)

    def training_step(self, batch, batch_idx, optimizer_idx) -> OrderedDict:

        states, actions, rewards, dones, next_states = batch
        self.batch_idx = batch_idx
        # print(states[0], batch_idx)

        # print(self.pool.size,optimizer_idx,batch_idx,states[0])
        # print("Running train",states.shape,batch_idx,optimizer_idx)

        # TODO: vars are already floatTensors.
        # Train Policy
        if optimizer_idx == 0:
            # for param in self.policy.parameters():
            #     print(param.names, param.size(), param.requires_grad)
            # print("Done")
            # for param in self.vf.parameters():
            #     print(param.names, param.size(), param.requires_grad)
            # print("Donevf")
            # print(torch.max(rewards),torch.min(rewards),torch.mean(rewards))
            samples = self.sampler.sample(
                1, self.policy)  # TODO remove magic numbers
            self.pool.add_samples(samples)

            if samples[0]['done'] or samples[0][
                    'path_length'] == self.hparams.max_path_length:
                self.max_path_return = max(self.max_path_return,
                                           samples[0]['path_return'])
                self.last_path_return = samples[0]['path_return']

            distributions, action_samples, log_probs, corr, reg_loss = self.policy(
                states)
            assert log_probs.shape == torch.Size([action_samples.shape[0]])
            # TODO: figure out why squash correction is not done in policy as kl_surrogate seems
            # to need uncorrected log probs?
            self.values = self.vf(states)
            # print(action_samples.shape,log_probs.shape,reg_loss.shape,states.shape) #TODO assert shapes

            with torch.no_grad():
                self.log_targets = self.qf(states, action_samples)
                self.scaled_log_pi = self._scale_entropy * (log_probs - corr)

            # How is this kl surrogate loss derived?
            self._kl_surrogate_loss = torch.mean(
                log_probs *
                (self.scaled_log_pi - self.log_targets + self.values.detach()))
            self._policy_loss = reg_loss + self._kl_surrogate_loss
            self._vf_loss = 0.5 * torch.mean(
                (self.values - self.log_targets + self.scaled_log_pi)**2)

            log = {
                'max_path_return':
                self.max_path_return,
                'train_loss':
                self._policy_loss.detach().cpu().numpy(),
                'kl_loss':
                self._kl_surrogate_loss.detach().cpu().numpy(),
                'reg_loss':
                reg_loss.detach().cpu().numpy(),
                'gmm_means':
                torch.mean(distributions.component_distribution.mean).detach().
                cpu().numpy(),
                'gmm_sigmas':
                torch.mean(distributions.component_distribution.stddev).detach(
                ).cpu().numpy(),
                'vf_loss':
                self._vf_loss.detach().cpu().numpy(),
                'vf_value':
                torch.mean(self.values).detach().cpu().numpy(),
                'scaled_log_pi':
                torch.mean(self.scaled_log_pi).detach().cpu().numpy()
            }
            status = {
                'train_loss':
                self._policy_loss.detach().cpu().numpy(),
                # 'vf_loss': self._vf_loss,
                # 'steps': torch.tensor(self.global_step),  # .to(device)  # Where does this global_step come from? Is it a PL built-in?
                'max_ret':
                self.max_path_return,
                'last_ret':
                self.last_path_return,
                'gmm_mu':
                torch.mean(distributions.component_distribution.mean).detach().
                cpu().numpy(),
                'gmm_sig':
                torch.mean(distributions.component_distribution.stddev).detach(
                ).cpu().numpy(),
                'vf_loss':
                self._vf_loss.detach().cpu().numpy(),
                'vf_mu':
                torch.mean(self.values).detach().cpu().numpy()
            }

            return OrderedDict({
                'loss': self._policy_loss + self._vf_loss,
                'log': log,
                'progress_bar': status
            })

        # TODO is it faster if qf is also optimized simultaneously along with vf and policy?

        # Train QF
        if optimizer_idx == 1:
            # for param in self.qf.parameters():
            #     print(param.names, param.size(), param.requires_grad)
            # print("Doneqf")
            self.q_values = self.qf(states, actions)
            # assert (self.policy._qf(states,actions)==self.q_values).all()
            with torch.no_grad():
                vf_next_target = self.vf_target(next_states)  # N
                ys = self._scale_reward * rewards + (
                    1 - dones) * self._discount * vf_next_target  # N

            self._td_loss = 0.5 * torch.mean((ys - self.q_values)**2)

            return OrderedDict({
                'loss': self._td_loss,
                'log': {
                    'qf_loss': self._td_loss.detach().cpu().numpy(),
                    'qf_value':
                    torch.mean(self.q_values).detach().cpu().numpy(),
                    'rewards': torch.mean(rewards).detach().cpu().numpy()
                },
                'progress_bar': {
                    'qf_loss': self._td_loss,
                    'rewards': torch.mean(rewards).detach().cpu().numpy(),
                    'qf_mu': torch.mean(self.q_values).detach().cpu().numpy()
                }
            })

        # if self.trainer.use_dp or self.trainer.use_ddp2:
        #     loss = loss.unsqueeze(0)

    def on_batch_end(self) -> None:
        """Soft (Polyak) update of the target value network: vf_target <- (1 - tau) * vf_target + tau * vf."""
        with torch.no_grad():
            for vf, vf_targ in zip(self.vf.parameters(),
                                   self.vf_target.parameters()):
                vf_targ.data.mul_(1 - self.hparams.tau)
                vf_targ.data.add_(self.hparams.tau * vf.data)

    def validation_step(self, batch, batch_idx) -> OrderedDict:
        # state = self.eval_env.reset()
        # print("Running Validation step")
        # path_return = 0
        # path_length = 0
        # for i in range(self.config.max_path_length):
        #     action = self.policy.get_actions(state.reshape((1, -1)))
        #     next_ob, reward, terminal, info = self.env.step(action)
        #     state = next_ob
        #     path_return += reward
        #     path_length += 1
        #     if(terminal):
        #         break

        return OrderedDict({'val_ret': 0, 'path_len': 0})

    def validation_epoch_end(self, outputs) -> OrderedDict:
        gc.collect()
        state = self.eval_env.reset()
        print(
            datetime.datetime.now(
                dateutil.tz.tzlocal()).strftime('%Y-%m-%d-%H-%M-%S-%f-%Z'))
        # print("Running Validation")
        path_return = 0
        path_length = 0
        self.ims = []
        with self.policy.deterministic(self.hparams.deterministic_eval):
            # TODO add support for n_eval_iters
            for i in range(self.hparams.max_path_length):
                action = self.policy.get_actions(state.reshape((1, -1)))
                next_ob, reward, done, info = self.eval_env.step(action)
                if self.hparams.render_validation:
                    # TODO use common resizing everywhere
                    self.ims.append(
                        cv2.resize(self.eval_env.render(mode='rgb_array'),
                                   (500, 500)))
                    # print(self.ims[0].shape)#config={'height':500,'width':500,'xpos':0,'ypos':0,'title':'validation'}
                state = next_ob
                path_return += reward
                path_length += 1
                if done:
                    break

        self.val_path_return = path_return  # TODO: remove print callback for this, already printed in progress bar
        return OrderedDict({
            'log': {
                'path_return': path_return,
                'path_length': path_length
            },
            'progress_bar': {
                'val_ret': path_return,
                'path_len': path_length
            }
        })

    def configure_optimizers(self) -> List[Optimizer]:
        """ Initialize Adam optimizer"""
        optimizers = []
        # TODO: combining vf and policy; figure out a more elegant way to have unlinked learning rates than a
        # multiplication factor in the loss sum. Also figure out why keeping them separate doesn't increase
        # compute time by the expected amount.
        optimizers.append(
            optim.Adam(list(self.policy.parameters()) +
                       list(self.vf.parameters()),
                       lr=self._policy_lr))
        # optimizers.append(optim.Adam(self.vf.parameters(), lr=self._vf_lr))
        optimizers.append(optim.Adam(self.qf.parameters(), lr=self._qf_lr))
        return optimizers

    def forward(self, *args, **kwargs):
        return None

    def check_modules(self):
        self.policy.cuda(self.hparams.device)
        self.vf.cuda(self.hparams.device)
        self.qf.cuda(self.hparams.device)
        self.vf_target.cuda(self.hparams.device)
        for param in self.policy.parameters():
            print(param.data.shape, param.data.mean(), param.data.max(),
                  param.data.min(), param.data.std())
        for param in self.vf.parameters():
            print(param.data.shape, param.data.mean(), param.data.max(),
                  param.data.min(), param.data.std())
        for param in self.qf.parameters():
            print(param.data.shape, param.data.mean(), param.data.max(),
                  param.data.min(), param.data.std())
        for param in self.vf_target.parameters():
            print(param.data.shape, param.data.mean(), param.data.max(),
                  param.data.min(), param.data.std())
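
The ValueFunction used throughout this example is described only by the comment in __init__ (ReLU hidden layers, no output non-linearity, Xavier-initialized weights, zero-initialized biases). A minimal sketch consistent with that description, offered as an assumption rather than the repository's actual class, could look like:

import torch
import torch.nn as nn

class MLPValueFunction(nn.Module):
    """Hypothetical stand-in for ValueFunction: ReLU hidden layers, linear scalar output."""

    def __init__(self, input_dim, hidden_sizes):
        super().__init__()
        layers, last = [], input_dim
        for size in hidden_sizes:
            layers += [nn.Linear(last, size), nn.ReLU()]
            last = size
        layers.append(nn.Linear(last, 1))  # scalar value, no output non-linearity
        self.net = nn.Sequential(*layers)
        for m in self.net:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)  # Xavier init for weights
                nn.init.zeros_(m.bias)             # zero init for biases

    def forward(self, *inputs):
        # qf is called as qf(states, actions); concatenate all inputs along the feature dimension.
        return self.net(torch.cat(inputs, dim=-1)).squeeze(-1)
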
Example #7
class SAC():

    def __init__(self, config: Config) -> None:
        self.hparams = config
        self.env = env_selector(self.hparams)  # TODO: ensure normalization is not required
        self.eval_env = env_selector(self.hparams, config.seed + 1)  # TODO: add functionality to optionwrap for DIAYN
        # TODO: check all config.names to ensure they are in dict
        self.Da = self.env.action_space.flat_dim
        self.Do = self.env.observation_space.flat_dim
        self.qf = ValueFunction(self.Do + self.Da, [config.layer_size, config.layer_size])
        # Constructs a value function mlp with Relu hidden non-linearities, no output non-linearity and with xavier
        # init for weights and zero init for biases.
        self.vf = ValueFunction(self.Do, [config.layer_size, config.layer_size])
        self.vf_target = ValueFunction(self.Do, [config.layer_size, config.layer_size])
        self.vf_target.load_state_dict(self.vf.state_dict())

        self.pool = SimpleReplayBuffer(
            env_spec=self.env.spec,
            max_replay_buffer_size=config.max_pool_size,
        )  # create a replay buffer for state+skill and action.

        self.policy = GMMPolicy(
            env_spec=self.env.spec,
            K=config.K,
            hidden_layer_sizes=[config.layer_size, config.layer_size],
            qf=self.qf,
            reg=config.reg,
            device="cpu"
        )  # GMM policy with K mixtures, no reparametrization trick, regularization

        # self.policy.cuda(config.device)
        # self.vf.cuda(config.device)
        # self.qf.cuda(config.device)
        # self.vf_target.cuda(config.device)

        # TODO: add assertion to test qf of policy and qf of model.

        self.sampler = Sampler(self.env, config.max_path_length)

        self._policy_lr = config.lr
        self._qf_lr = config.lr
        self._vf_lr = config.lr
        # TODO: fix variable naming with _
        self._scale_reward = config.scale_reward
        self._discount = config.discount
        self._tau = config.tau
        self.max_path_return = -np.inf
        self.last_path_return = 0
        self.val_path_return = 0
        self._scale_entropy = config.scale_entropy

        self._save_full_state = config.save_full_state
        # self.z = self.get_best_skill(self.policy, self.env, self.config.num_skills, self.config.max_path_length)
        # self.env.reset(None,self.z)

        # Runs on CPU as Models are transferred to GPU only by trainer which happens after the lightning model init.
        # Also the reason why wandb logger is not available
        self.pool.add_samples(self.sampler.sample(config.min_pool_size, self.policy))
        # self.optimizers = []
        # TODO: combining vf and policy, figure out more elegant way to have unlinked learning rates than as
        # a multiplication factor in the loss sum. Also figure out why having them separate doesn't increase
        # compute time by the expected
        self.optimizer_policy = optim.Adam(list(self.policy.parameters())  # +list(self.vf.parameters())
                                           , lr=self._policy_lr)
        self.optimizer_vf = optim.Adam(self.vf.parameters(), lr=self._vf_lr)
        self.optimizer_qf = optim.Adam(self.qf.parameters(), lr=self._qf_lr)
        self.optimizer = optim.Adam(list(self.policy.parameters())+
                                    list(self.vf.parameters())+
                                    list(self.qf.parameters()), lr=self._policy_lr)
        # torch.autograd.set_detect_anomaly(True)

    @staticmethod
    def _squash_correction(t):
        """receives action samples from gmm of shape batchsize x dim_action. For each action, the log probability
         correction requires a product by the inverse of the jacobian determinant. In log, it reduces to a sum, including
         the determinant of the diagonal jacobian. Adding epsilon to avoid overflow due to log
         Should return a tensor of batchsize x 1"""
        # TODO: Refer to OpenAI implementation for more numerically stable correction
        # https://github.com/openai/spinningup/blob/master/spinup/algos/pytorch/sac/core.py
        return torch.sum(torch.log(1 - (t ** 2) + EPS), dim=1)

    def train(self):
        for epoch in range(self.hparams.max_epochs):
            for step in range(self.hparams.epoch_length):

                samples = self.sampler.sample(1, self.policy)  # TODO remove magic numbers
                self.pool.add_samples(samples)
                # print(samples[0]['done'])
                if samples[0]['done'] or samples[0]['path_length'] == self.hparams.max_path_length:
                    self.max_path_return = max(self.max_path_return, samples[0]['path_return'])
                    self.last_path_return = samples[0]['path_return']

                batch = self.pool.random_batch(self.hparams.batch_size)
                states, rewards, actions, dones, next_states = torch.FloatTensor(
                    batch['observations']), torch.FloatTensor(batch['rewards']), torch.FloatTensor(
                    batch['actions']), torch.FloatTensor(batch['dones']), torch.FloatTensor(batch['next_observations'])
                # self.optimizer_policy.zero_grad()
                self.optimizer.zero_grad()
                distributions, action_samples, log_probs, reg_loss = self.policy(states)
                # print(log_probs.shape)
                # assert log_probs.shape == torch.Size([action_samples.shape[0]])
                # TODO: figure out why squash correction is not done in policy as kl_surrogate seems
                # to need uncorrected log probs?
                self.values = self.vf(states)
                # print(action_samples.shape,log_probs.shape,reg_loss.shape,states.shape) #TODO assert shapes

                with torch.no_grad():

                    self.log_targets = self.qf(states, action_samples)
                    # Probability of squashed action is not same as probability of unsquashed action.
                    corr = self._squash_correction(action_samples)
                    # print(log_probs.shape,corr.shape)
                    # assert not torch.isnan(corr).any() and not torch.isinf(corr).any()
                    # correction must be subtracted from log_probs as we need inverse of jacobian determinant.
                    self.scaled_log_pi = self._scale_entropy * (log_probs - corr)


                # self._vf_loss = 0.5 * torch.mean(
                #             (self.values - self.log_targets - self.scaled_log_pi) ** 2)
                ## How is this kl surrogate loss derived?
                self._kl_surrogate_loss = torch.mean(log_probs * (
                        self.scaled_log_pi - self.log_targets + self.values.detach()))
                self._policy_loss = reg_loss + self._kl_surrogate_loss

                # self._policy_loss.backward()
                # self.optimizer_policy.step()
                #
                # self.optimizer_vf.zero_grad()
                # self.values = self.vf(states)
                self._vf_loss = 0.5 * torch.mean(
                    (self.values - self.log_targets + self.scaled_log_pi) ** 2)



                # self._vf_loss.backward()
                # self.optimizer_vf.step()
                #
                # self.optimizer_qf.zero_grad()
                self.q_values = self.qf(states, actions)
                # assert (self.policy._qf(states,actions)==self.q_values).all()
                with torch.no_grad():
                    vf_next_target = self.vf_target(next_states)  # N
                    # self._vf_target_params = self._vf.get_params_internal()

                    ys = self._scale_reward * rewards + (1 - dones) * self._discount * vf_next_target  # N


                self._td_loss = 0.5 * torch.mean((ys - self.q_values) ** 2)

                # TODO: code not working, need to fix bug
                self.loss = self._policy_loss + self._vf_loss + self._td_loss
                self.loss.backward()
                self.optimizer.step()

                with torch.no_grad():
                    for vf, vf_targ in zip(self.vf.parameters(), self.vf_target.parameters()):
                        vf_targ.data.mul_(1 - self.hparams.tau)
                        vf_targ.data.add_((self.hparams.tau) * vf.data)


            print('train_loss: ', self._policy_loss.detach().numpy(),
                  'epoch: ', epoch,
                  # 'vf_loss': self._vf_loss,
                  # 'steps': torch.tensor(self.global_step),  # .to(device)  # Where does this global_step come from? Is it a PL built-in?
                  'max_return: ', (self.max_path_return),
                  'last_return: ', (self.last_path_return),
                  # 'gmm_means: ', torch.mean(distributions.component_distribution.mean).detach().numpy(),
                  # 'gmm_sigmas: ', torch.mean(distributions.component_distribution.stddev).detach().numpy(),
                  'vf_loss: ', self._vf_loss.detach().numpy(),
                  'vf_value: ', torch.mean(self.values).detach().numpy(),
                  'qf_loss: ', self._td_loss.detach().numpy(),
                  'rewards: ', torch.mean(rewards).detach().numpy(),
                  'actions: ', torch.mean(actions).detach().numpy(),
                  'qf_value: ', torch.mean(self.q_values).detach().numpy()
                  )

            state = self.eval_env.reset()
            # print("Running Validation")
            path_return = 0
            path_length = 0
            self.ims = []
            print(datetime.datetime.now(dateutil.tz.tzlocal()).strftime('%Y-%m-%d-%H-%M-%S-%f-%Z'))
            # with self.policy.deterministic(True):
            #     for i in range(self.hparams.max_path_length):
            #         action = self.policy.get_actions(state.reshape((1, -1)))
            #         next_ob, reward, done, info = self.eval_env.step(action)
            #         if self.hparams.render_validation:
            #             self.ims.append(self.eval_env.render(mode='rgb_array'))
            #             # print(self.ims[0].shape)#config={'height':500,'width':500,'xpos':0,'ypos':0,'title':'validation'}
            #         # print(reward)
            #         state = next_ob
            #         path_return += reward
            #         path_length += 1
            #         if (done):
            #             break

            self.val_path_return = path_return
            print('path_return: ', path_return,
                  'path_length: ', path_length)
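
The TODO in _squash_correction points at the more numerically stable form used in the OpenAI Spinning Up SAC implementation. A sketch of that alternative is given below as an assumption about what the TODO intends, not as this repository's code; note that it works on the unsquashed (pre-tanh) actions and needs no epsilon:

import numpy as np
import torch
import torch.nn.functional as F

def squash_correction_stable(pre_tanh_actions):
    # log(1 - tanh(u)^2) == 2 * (log(2) - u - softplus(-2u)), summed over action dimensions.
    # Unlike the method above, this takes the pre-squash actions u rather than tanh(u).
    return torch.sum(2.0 * (np.log(2.0) - pre_tanh_actions - F.softplus(-2.0 * pre_tanh_actions)), dim=1)
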
Example #8
def train(algo, opt, model_type, batch_size, learning_rate, num_epochs, stop_at_done, gamma, tau, num_workers, task_name, file_index, num_actions, max_requests, starting_request, random_start, critic_coef, actor_coef, entropy_coef, output_dir, output_prefix, save_interval):
  assert model_type in MODEL_TYPES, "Invalid model type. Choices: {}".format(MODEL_TYPES)
  assert opt in OPT_TYPES, "Invalid optimizer type. Choices: {}".format(OPT_TYPES)
  assert algo in ALGOS, "Invalid algorithm. Choices: {}".format(ALGOS)
  assert task_name in TASKS, "Invalid task. Choices: {}".format(TASKS)
  assert file_index in FILE_INDEX, "Invalid file index. Choices: {}".format(FILE_INDEX)
  assert num_actions in CACHE_SIZE, "Invalid number of actions. Choices: {}".format(CACHE_SIZE)
  assert max_requests in MAX_REQUESTS, "Invalid maximum requests allowed. Choices: {}".format(MAX_REQUESTS)
  assert num_workers >= 0, "Invalid number of workers ({}). Must be at least 0.".format(num_workers)
  assert num_epochs >= 1, "Invalid number of epochs ({}). Must be at least 1.".format(num_epochs)
  assert 1 <= save_interval <= num_epochs, "Invalid save interval ({}). Must be between 1 and {}".format(save_interval, num_epochs)

  num_feature = num_actions * 3

  # Setup environment
  task_name = "Cache-Bandit-C{}-Max{}-{}-{}-v0".format(num_actions, max_requests, task_name, file_index)

  opt_construct = optim.Adam if opt == OPT_ADAM else optim.SGD

  # Create the model
  if (model_type == GRU and algo == REINFORCE):
    model = GRUPolicy(num_actions, num_feature)
    # Set the optimizer
    optimizer = opt_construct(model.parameters(), lr=learning_rate)
    agent = Reinforce(model, optimizer, entropy_coef)
  elif model_type == GRU and algo == A2C:
    model = GRUActorCritic(num_actions, num_feature)
    # Set the optimizer
    optimizer = opt_construct(model.parameters(), lr=learning_rate)
    agent = AdvantageActorCritic(model, optimizer, critic_coef, actor_coef, entropy_coef)
  else:
    raise ValueError("Unsupported model/algorithm combination: {} + {}".format(model_type, algo))

  model = model.to(DEVICE)
  model.train()

  # Setup sampler
  sampler = Sampler(model, task_name, num_actions, deterministic=False, gamma=gamma, tau=tau, num_workers=num_workers)

  def _random_start(max_request):
    return random.randint(0, max(0, max_request - 1 - num_actions))

  get_starting_point = _random_start if random_start else lambda x: starting_request

  print(optimizer)
  print(model)
  print("Stop after singlefull trajectory is completed for each epoch: {}".format(stop_at_done))

  if not os.path.isdir(output_dir):
    print("Constructing directories {}".format(output_dir))
    os.makedirs(output_dir, exist_ok=True)

  print("Output Directory: {}".format(output_dir))

  for epoch in range(num_epochs):
    print("EPOCH {} ==========================================".format(epoch))
    sampler.reset_storage()
    sampler.last_hidden_state = None

    if (MAP_LOCATION == CUDA):
      torch.cuda.empty_cache()

    sampler.sample(batch_size, stop_at_done=stop_at_done, starting_point=get_starting_point(sampler.max_length))
    sampler.concat_storage()
    agent.update(sampler)

    if ((epoch + 1) % save_interval == 0):
      out_file = '{}/{}_{}.pkl'.format(output_dir.rstrip("/"), output_prefix, epoch)
      print("Saving model as {}".format(out_file))
      torch.save(model, out_file)
  
  print("DONE")

  sampler.envs.close()
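
Since the loop saves the entire model object with torch.save every save_interval epochs, reloading a checkpoint for evaluation only needs torch.load. A minimal sketch, with a hypothetical path following the '{output_dir}/{output_prefix}_{epoch}.pkl' pattern above:

import torch

checkpoint_path = "out/cache_agent_99.pkl"  # hypothetical output_dir / output_prefix / epoch values

# torch.save(model, ...) pickles the whole module, so torch.load returns the model object itself
# (the class definitions it references must be importable).
model = torch.load(checkpoint_path, map_location="cpu")
model.eval()
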
Example #9
    dims = np.max(df.to_numpy().astype(int), axis=0) + 1
    ''' GET GROUND-TRUTH AND CANDIDATES '''
    # get ground truth
    test_ur = get_ur(test_set, context=args.context, eval=False)
    val_ur = get_ur(val_set, context=args.context, eval=False)

    total_train_ur = get_ur(train_set, context=args.context, eval=True)
    # initial candidate item pool
    item_pool = set(range(dims[0], dims[1]))
    candidates_num = args.cand_num

    print('=' * 50, '\n')
    ''' FORMAT DATA '''
    sampler = Sampler(
        dims,
        num_ng=args.num_ng,
        sample_method=args.sample_method,
        sample_ratio=args.sample_ratio,
    )

    # negative sampling and adjacency matrix construction
    neg_set, adj_mx = sampler.transform(train_set,
                                        is_training=True,
                                        context=args.context,
                                        pair_pos=None)

    # create graph needed structure if it is activated
    if args.gce:
        # embed()
        if args.mh > 1:
            print(f'[ MULTI HOP {args.mh} ACTIVATED ]')
            adj_mx = adj_mx ** int(args.mh)
            smote = BorderlineSMOTE()
            train_x, train_y = smote.fit_resample(tr_x, tr_y)
            train_aux, _ = smote.fit_resample(tr_aux, tr_y)
        else:
            train_x = tr_x
            train_y = tr_y
            train_aux = tr_aux

        train_set = MeatData(train_x, train_y, train_aux, transform_x,
                             transform_aux)
        test_set = MeatData(te_x, te_y, te_aux, transform_x, transform_aux)

        if args.sampler_type.lower() == 'binomial':
            sampler = ImbalancedDatasetSampler(train_set)
        elif args.sampler_type.lower() == 'down':
            sampler = Sampler(train_set, type='under')
        elif args.sampler_type.lower() == 'up':
            sampler = Sampler(train_set, type='over')
        else:
            sampler = None

    else:
        train_set = MeatData(tr_x, tr_y, tr_aux, transform_x, transform_aux)
        test_set = MeatData(te_x, te_y, te_aux, transform_x, transform_aux)
        sampler = None

        # re-weighting
        if args.train_rule.lower() == 'reweight':
            beta = 0.9999
            effective_num = 1.0 - np.power(beta, cls_num_list)
            per_cls_weights = (1.0 - beta) / np.array(effective_num)
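
The re-weighting branch computes per-class weights from the "effective number of samples" (weight proportional to (1 - beta) / (1 - beta^n_c)). The snippet ends here, so the normalization and weighted loss below are a sketch of the usual continuation, with cls_num_list assumed to hold the per-class sample counts:

import numpy as np
import torch
import torch.nn as nn

beta = 0.9999
cls_num_list = [500, 50, 5]  # assumed per-class sample counts
effective_num = 1.0 - np.power(beta, cls_num_list)
per_cls_weights = (1.0 - beta) / np.array(effective_num)
# Normalize so the weights sum to the number of classes (a common convention),
# then hand them to a weighted cross-entropy loss.
per_cls_weights = per_cls_weights / np.sum(per_cls_weights) * len(cls_num_list)
criterion = nn.CrossEntropyLoss(weight=torch.FloatTensor(per_cls_weights))
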
Example #11
class DeepLearn:
    def __init__(self):
        self.config = ConfigParser()
        self.config.read(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
            os.sep + 'config' + os.sep + 'appConfig.ini')
        self.score = 0
        self.model = None
        self.data = None
        self.labels = None
        self.sampler = Sampler()

    def init_deep_learning(self):
        self.process_data()
        self.create_model()
        self.train_model()

    def process_data(self):
        location = self.config['img']['train_data_set_location']
        self.sampler.read_and_process_images(location)
        self.data, self.labels = self.sampler.get_images_and_labels()
        #TODO Add Data Split for Training and Validation Set

    def create_model(self):
        """Creates a Deep Learning Convolutional Neural Net Model"""
        # Layer 1: Conv
        self.model = Sequential()
        self.model.add(
            Conv2D(32, (5, 5),
                   strides=(1, 1),
                   padding='same',
                   input_shape=self.data.shape[1:]))
        self.model.add(Activation('relu'))
        # Layer 2: Conv
        self.model.add(Conv2D(32, (5, 5), strides=(1, 1)))
        self.model.add(Activation('relu'))
        # Layer 3: MaxPool
        self.model.add(MaxPooling2D(pool_size=(2, 2)))
        self.model.add(Dropout(0.25))
        # Layer 4: Conv
        self.model.add(Conv2D(32, (5, 5), strides=(1, 1)))
        self.model.add(Activation('relu'))
        # Layer 5: Conv
        self.model.add(Conv2D(32, (5, 5), strides=(1, 1)))
        self.model.add(Activation('relu'))
        # Layer 6: MaxPool
        self.model.add(MaxPooling2D(pool_size=(2, 2)))
        self.model.add(Dropout(0.25))
        # Layer 7: Flatten
        self.model.add(Flatten())
        # Layer 8: Dense
        self.model.add(Dense(512))
        self.model.add(Activation('relu'))
        self.model.add(Dropout(0.5))
        # Layer 9: Dense Final Classification
        self.model.add(Dense(len(label_dict.keys())))
        self.model.add(Activation('softmax'))
        # TODO Promote to Logging
        print(self.model.summary())

    def train_model(self):
        """Trains Model """
        self.model.compile(
            loss='categorical_crossentropy',
            optimizer=SGD(
                lr=float(self.config['hyperparameters']['learning_rate']),
                momentum=float(self.config['hyperparameters']['momentum']),
                decay=float(self.config['hyperparameters']['decay']),
                nesterov=False),
            metrics=['accuracy'])
        # Split data and labels into training, validation and test sets
        x_train, x_test, y_train, y_test = train_test_split(
            self.data,
            self.labels,
            test_size=float(self.config['hyperparameters']['split']),
            random_state=42)
        x_train, x_val, y_train, y_val = train_test_split(
            x_train,
            y_train,
            test_size=float(self.config['hyperparameters']['split']),
            random_state=42)
        # One Hot Encoding for Output Labels
        y_train = to_categorical(y_train, len(label_dict.keys()))
        y_val = to_categorical(y_val, len(label_dict.keys()))
        y_test = to_categorical(y_test, len(label_dict.keys()))

        # Train
        history = self.model.fit(
            x_train,
            y_train,
            batch_size=int(self.config['hyperparameters']['batch_size']),
            epochs=int(self.config['hyperparameters']['epochs']),
            verbose=1,
            validation_data=(x_val, y_val))

        self.plot_loss_accuracy(history)
        self.score = self.model.evaluate(x_test, y_test)
        # TODO Promote to Logging
        print("Accuracy %.6f" % self.score[1])
        self.model.save('kiera_trained.h5')
        with open('accuracy.txt', mode='w') as f:
            f.write(str(self.score[1]))

    def plot_loss_accuracy(self, history):
        fig, ax = plt.subplots(1, 2, figsize=(12, 6))
        ax[0].plot(history.history["loss"], 'r-x', label="Train Loss")
        ax[0].plot(history.history["val_loss"], 'b-x', label="Validation Loss")
        ax[0].legend()
        ax[0].set_title('cross_entropy loss')
        ax[0].grid(True)

        ax[1].plot(history.history["acc"], 'r-x', label="Train Accuracy")
        ax[1].plot(history.history["val_acc"],
                   'b-x',
                   label="Validation Accuracy")
        ax[1].legend()
        ax[1].set_title('accuracy')
        ax[1].grid(True)
        plt.savefig('LossAndAccuracy.png')

    def get_accuracy(self):
        if not self.score:
            return "Training Not Initiated"
        return self.score[1]

    def predict(self, img):
        try:
            if not self.model:
                self.model = load_model('kiera_trained.h5')
            return label_dict[
                self.model.predict(self.sampler.process_image(img)).argmax() +
                1]
        except Exception as e:
            raise Exception('Model Not Saved') from e
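
A minimal usage sketch of the class above, assuming the appConfig.ini file, the training images referenced by img.train_data_set_location, and label_dict are in place:

learner = DeepLearn()
learner.init_deep_learning()                  # read images, build the CNN, train, plot and save it
print("Test accuracy:", learner.get_accuracy())
# label = learner.predict(some_image)         # some_image: whatever Sampler.process_image accepts (hypothetical)
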
Example #13
class SAC(pl.LightningModule):
    def __init__(self, config: Config) -> None:
        super().__init__()
        self.hparams = config

        self.env = env_selector(
            self.hparams
        )  # TODO: normalization is not required but will it be needed?
        self.eval_env = env_selector(self.hparams, config.seed + 1)
        # TODO: check all config.names to ensure they are in dict
        self.Da = self.env.action_space.flat_dim
        self.Do = self.env.observation_space.flat_dim
        self.q1 = ValueFunction(self.Do + self.Da,
                                [config.layer_size, config.layer_size])
        self.q2 = ValueFunction(self.Do + self.Da,
                                [config.layer_size, config.layer_size])
        # Constructs a value function mlp with Relu hidden non-linearities, no output non-linearity and with xavier
        # init for weights and zero init for biases.
        self.q1_target = ValueFunction(self.Do + self.Da,
                                       [config.layer_size, config.layer_size])
        self.q2_target = ValueFunction(self.Do + self.Da,
                                       [config.layer_size, config.layer_size])

        self.q1_target.load_state_dict(self.q1.state_dict())
        self.q2_target.load_state_dict(self.q2.state_dict())

        self.pool = SimpleReplayBuffer(
            env_spec=self.env.spec,
            max_replay_buffer_size=config.max_pool_size,
        )  # create a replay buffer for state+skill and action.

        self.policy = GMMPolicy(
            env_spec=self.env.spec,
            K=config.K,
            hidden_layer_sizes=[config.layer_size, config.layer_size],
            #TODO: pass both q functions to use policy in deterministic mode
            qf=self.q1_target,
            reg=config.reg,
            device=self.hparams.device,
            reparametrization=True
        )  # GMM policy with K mixtures, reparametrization trick enabled, regularization

        # TODO: add assertion to test qf of policy and qf of model.

        self.sampler = Sampler(self.env, config.max_path_length)

        self._policy_lr = config.lr
        self._qf_lr = config.lr
        self._vf_lr = config.lr
        # TODO: fix variable naming with _
        self._scale_reward = config.scale_reward
        self._discount = config.discount
        self._tau = config.tau
        self.max_path_return = -np.inf
        self.last_path_return = 0
        self.val_path_return = 0
        self._scale_entropy = config.scale_entropy

        self._save_full_state = config.save_full_state
        self.modules = [
            "Policy", self.policy, "Q1", self.q1, "Q2", self.q2, "Q1_target",
            self.q1_target, "Q2_target", self.q2_target
        ]
        # self.z = self.get_best_skill(self.policy, self.env, self.config.num_skills, self.config.max_path_length)
        # self.env.reset(None,self.z)

        # Runs on CPU as Models are transferred to GPU only by trainer which happens after the lightning model init.
        # Also the reason why wandb logger is not available
        self.batch_idx = None
        # torch.autograd.set_detect_anomaly(True) #TODO: disable if compute overhead

    def get_best_skill(self,
                       policy,
                       env,
                       num_skills,
                       max_path_length,
                       n_paths=1):
        print('Finding best skill...')
        reward_list = []
        with policy.deterministic(self.hparams.deterministic_eval):
            for z in range(num_skills):
                env.reset(state=None, skill=z)
                total_returns = 0
                sampler = Sampler(env, max_path_length)
                for p in range(n_paths):
                    new_paths = sampler.sample(max_path_length, policy)
                    total_returns += new_paths[-1]['path_return']
                print('Reward for skill %d = %.3f' % (z, total_returns))
                reward_list.append(total_returns)

        best_z = np.argmax(reward_list)
        print('Best skill found: z = %d, reward = %d, seed = %d' %
              (best_z, reward_list[best_z], self.hparams.seed))
        return best_z

    def on_sanity_check_start(self) -> None:
        # self.z = self.get_best_skill(self.policy, self.env, self.hparams.num_skills, self.hparams.max_path_length,
        #                              self.hparams.num_runs)
        # self._num_skills = self.hparams.num_skills
        # self.env.reset(state=None, skill=self.z)
        # self.eval_env.reset(state=None, skill=self.z)
        # # TODO sampler reset logic and epoch length interaction seems adhoc
        # self.sampler.reset()
        if self.pool.size < self.hparams.min_pool_size:
            self.pool.add_samples(
                self.sampler.sample(self.hparams.min_pool_size, None))
            print("Initialized Replay Buffer with %d samples" % self.pool.size)

    def __dataloader(self) -> DataLoader:
        """Initialize the Replay Buffer dataset used for retrieving experiences"""
        dataset = RLDataset(self.pool, self.hparams.epoch_length,
                            self.hparams.batch_size)

        # TODO: figure out why the reference code uses episode length above instead of batch size

        def _init_fn(worker_id):
            np.random.seed(self.hparams.seed + worker_id)

        dataloader = DataLoader(dataset=dataset,
                                batch_size=self.hparams.batch_size,
                                num_workers=self.hparams.num_workers,
                                worker_init_fn=_init_fn)
        return dataloader

    def train_dataloader(self) -> DataLoader:
        """Get train loader"""
        return self.__dataloader()

    def val_dataloader(self) -> DataLoader:
        """Initialize the Replay Buffer dataset used for retrieving experiences"""
        dataset = RLDataset(self.pool, 1, 1)
        # TODO: figure out why the reference code uses episode length above instead of batch size
        dataloader = DataLoader(
            dataset=dataset,
            batch_size=1,
            # num_workers=5
        )
        return dataloader

    # def _split_obs(self,t):
    #     # TODO: verify that dim is 1, assert shape
    #     return torch.split(t, [self._Do, self._num_skills], 1)

    def training_step(self, batch, batch_idx, optimizer_idx) -> OrderedDict:

        states, actions, rewards, dones, next_states = batch
        self.batch_idx = batch_idx
        # print(states[0], batch_idx)

        # print(self.pool.size,optimizer_idx,batch_idx,states[0])
        # print("Running train",states.shape,batch_idx,optimizer_idx)

        # TODO: vars are already floatTensors.
        # Train Policy
        if optimizer_idx == 1:
            # for param in self.policy.parameters():
            #     print(param.names, param.size(), param.requires_grad)
            # print("Done")
            # for param in self.vf.parameters():
            #     print(param.names, param.size(), param.requires_grad)
            # print("Donevf")
            # print(torch.max(rewards),torch.min(rewards),torch.mean(rewards))
            samples = self.sampler.sample(
                1, self.policy)  # TODO remove magic numbers
            self.pool.add_samples(samples)

            if samples[0]['done'] or samples[0][
                    'path_length'] == self.hparams.max_path_length:
                self.max_path_return = max(self.max_path_return,
                                           samples[0]['path_return'])
                self.last_path_return = samples[0]['path_return']

            distributions, action_samples, log_probs, corr, reg_loss = self.policy(
                states)
            # print(log_probs.shape)
            assert log_probs.shape == torch.Size([action_samples.shape[0]])
            values1 = self.q1(states, action_samples)
            values2 = self.q2(states, action_samples)
            self.value = torch.min(values1, values2)  # N
            # print(action_samples.shape,log_probs.shape,reg_loss.shape,states.shape) #TODO assert shapes

            # with torch.no_grad():
            # TODO : check grad
            self.scaled_log_pi = self._scale_entropy * (log_probs - corr)
            self._policy_loss = torch.mean(self.scaled_log_pi - self.value)

            log = {
                'max_path_return': torch.tensor(self.max_path_return),
                'train_loss': self._policy_loss,
                'reg_loss': reg_loss,
                'vf_value': torch.mean(self.value)
            }
            status = {
                'train_loss': self._policy_loss,
                'max_ret': torch.tensor(self.max_path_return),
                'last_ret': torch.tensor(self.last_path_return),
                'vf_mu': torch.mean(self.value)
            }

            return OrderedDict({
                'loss': self._policy_loss,
                'log': log,
                'progress_bar': status
            })

        # Train QF
        if optimizer_idx == 0:
            # for param in self.qf.parameters():
            #     print(param.names, param.size(), param.requires_grad)
            # print("Doneqf")
            self.q1_values = self.q1(states, actions)
            self.q2_values = self.q2(states, actions)
            # assert (self.policy._qf(states,actions)==self.q_values).all()
            with torch.no_grad():
                distributions, action_samples, log_probs, corr, reg_loss = self.policy(
                    next_states)
                q1_next_target = self.q1_target(next_states,
                                                action_samples)  # N
                q2_next_target = self.q2_target(next_states, action_samples)
                q_next_target = torch.min(q1_next_target, q2_next_target)  # N

                ys = self._scale_reward * rewards + (1 - dones) * self._discount * \
                     (q_next_target-self._scale_entropy*(log_probs - corr))  # N

            self._td1_loss = torch.mean((ys - self.q1_values)**2)
            self._td2_loss = torch.mean((ys - self.q2_values)**2)

            return OrderedDict({
                'loss': self._td1_loss + self._td2_loss,
                'log': {
                    'qf_loss': self._td1_loss + self._td2_loss,
                    'qf_value': torch.mean(self.q1_values),
                    'rewards': torch.mean(rewards)
                },
                'progress_bar': {
                    'qf_loss': self._td1_loss + self._td2_loss,
                    'rewards': torch.mean(rewards),
                    'qf_mu': torch.mean(self.q1_values),
                    'log_probs': torch.mean(log_probs - corr)
                }
            })

        # if self.trainer.use_dp or self.trainer.use_ddp2:
        #     loss = loss.unsqueeze(0)

    def on_batch_end(self) -> None:
        """Soft (Polyak) update of both target Q networks: target <- (1 - tau) * target + tau * online."""
        with torch.no_grad():
            for q1, q1_targ in zip(self.q1.parameters(),
                                   self.q1_target.parameters()):
                q1_targ.data.mul_(1 - self.hparams.tau)
                q1_targ.data.add_((self.hparams.tau) * q1.data)
            for q2, q2_targ in zip(self.q2.parameters(),
                                   self.q2_target.parameters()):
                q2_targ.data.mul_(1 - self.hparams.tau)
                q2_targ.data.add_((self.hparams.tau) * q2.data)

    def validation_step(self, batch, batch_idx) -> OrderedDict:
        # state = self.eval_env.reset()
        # print("Running Validation step")
        # path_return = 0
        # path_length = 0
        # for i in range(self.config.max_path_length):
        #     action = self.policy.get_actions(state.reshape((1, -1)))
        #     next_ob, reward, terminal, info = self.env.step(action)
        #     state = next_ob
        #     path_return += reward
        #     path_length += 1
        #     if(terminal):
        #         break

        return OrderedDict({'val_ret': 0, 'path_len': 0})

    def validation_epoch_end(self, outputs) -> OrderedDict:
        state = self.eval_env.reset()
        print(
            datetime.datetime.now(
                dateutil.tz.tzlocal()).strftime('%Y-%m-%d-%H-%M-%S-%f-%Z'))
        # print("Running Validation")
        path_return = 0
        path_length = 0
        self.ims = []
        with self.policy.deterministic(self.hparams.deterministic_eval):
            for i in range(self.hparams.max_path_length):
                action = self.policy.get_actions(state.reshape((1, -1)))
                next_ob, reward, done, info = self.eval_env.step(action)
                # self.eval_env.render(mode='human')
                if self.hparams.render_validation:
                    self.ims.append(self.eval_env.render(mode='rgb_array'))
                    # print(self.ims[0].shape)#config={'height':500,'width':500,'xpos':0,'ypos':0,'title':'validation'}
                # print(reward)
                state = next_ob
                path_return += reward
                path_length += 1
                if (done):
                    break

        self.val_path_return = path_return  # TODO: remove print callback for this, already printed in progress bar
        return OrderedDict({
            'log': {
                'path_return': path_return,
                'path_length': path_length
            },
            'progress_bar': {
                'val_ret': path_return,
                'path_len': path_length
            }
        })

    def configure_optimizers(self) -> List[Optimizer]:
        """ Initialize Adam optimizer"""
        optimizers = []
        # TODO: combining vf and policy; figure out a more elegant way to have unlinked learning rates than a
        # multiplication factor in the loss sum. Also figure out why keeping them separate doesn't increase
        # compute time by the expected amount.
        optimizers.append(
            optim.Adam(list(self.q1.parameters()) + list(self.q2.parameters()),
                       lr=self._qf_lr))
        # optimizers.append(optim.Adam(self.vf.parameters(), lr=self._vf_lr))
        optimizers.append(
            optim.Adam(self.policy.parameters(), lr=self._policy_lr))
        return optimizers

    def forward(self, *args, **kwargs):
        return None
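
For reference, the target used in the optimizer_idx == 0 branch above is the clipped double-Q soft Bellman backup. A standalone sketch of that computation, with names mirroring the snippet (scale_reward, discount and scale_entropy come from the config) and under the assumption that log_probs and corr refer to the freshly sampled next action:

import torch

def soft_q_target(rewards, dones, q1_next, q2_next, log_probs, corr,
                  scale_reward, discount, scale_entropy):
    # Clipped double-Q: take the minimum of the two target networks.
    q_next = torch.min(q1_next, q2_next)
    # Entropy bonus uses the tanh-corrected log-probability of the next action.
    entropy_term = scale_entropy * (log_probs - corr)
    # y = scale_reward * r + (1 - done) * gamma * (min(Q1', Q2') - alpha * log pi(a'|s'))
    return scale_reward * rewards + (1 - dones) * discount * (q_next - entropy_term)
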