def start_or_resume_from_checkpoint():
    """
    Create the actor, critic, variance critic and their optimizers from scratch,
    or load them from the latest checkpoint if one exists.
    """
    max_checkpoint_iteration = get_last_checkpoint_iteration()

    obsv_dim, action_dim, continuous_action_space = get_env_space()
    actor = Actor(obsv_dim,
                  action_dim,
                  continuous_action_space=continuous_action_space,
                  trainable_std_dev=hp.trainable_std_dev,
                  init_log_std_dev=hp.init_log_std_dev)
    critic = Critic(obsv_dim)
    var_critic = Var_Critic(obsv_dim)

    actor_optimizer = optim.AdamW(actor.parameters(), lr=hp.actor_learning_rate)
    critic_optimizer = optim.AdamW(critic.parameters(), lr=hp.critic_learning_rate)

    var_critic_optimizer = optim.AdamW(var_critic.parameters(), lr=hp.var_critic_learning_rate)

    stop_conditions = StopConditions()

    # If a checkpoint exists (iteration greater than zero), initialise training from it.
    if max_checkpoint_iteration > 0:
        actor_state_dict, critic_state_dict, var_critic_state_dict, \
        actor_optimizer_state_dict, critic_optimizer_state_dict, \
        var_critic_optimizer_state_dict, stop_conditions = load_checkpoint(max_checkpoint_iteration)

        actor.load_state_dict(actor_state_dict, strict=True)
        critic.load_state_dict(critic_state_dict, strict=True)
        var_critic.load_state_dict(var_critic_state_dict, strict=True)

        actor_optimizer.load_state_dict(actor_optimizer_state_dict)
        critic_optimizer.load_state_dict(critic_optimizer_state_dict)
        var_critic_optimizer.load_state_dict(var_critic_optimizer_state_dict)

        # The optimizer state tensors have to be moved to TRAIN_DEVICE manually,
        # since torch optimizers do not yet provide a "to" method.
        for optimizer in (actor_optimizer, critic_optimizer, var_critic_optimizer):
            for state in optimizer.state.values():
                for k, v in state.items():
                    if isinstance(v, torch.Tensor):
                        state[k] = v.to(TRAIN_DEVICE)

    return actor, critic, var_critic, actor_optimizer, critic_optimizer, var_critic_optimizer, \
           max_checkpoint_iteration, stop_conditions
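
A minimal sketch of how the returned objects might be driven from a training loop; train_iteration, save_checkpoint, stop_conditions.should_stop and hp.checkpoint_frequency are assumed placeholders, not names taken from the snippet above.

# Hypothetical driver loop; every name not defined above is a placeholder.
actor, critic, var_critic, actor_optimizer, critic_optimizer, var_critic_optimizer, \
    iteration, stop_conditions = start_or_resume_from_checkpoint()

while not stop_conditions.should_stop:            # assumed StopConditions attribute
    train_iteration(actor, critic, var_critic,    # placeholder for one update step
                    actor_optimizer, critic_optimizer, var_critic_optimizer)
    iteration += 1
    if iteration % hp.checkpoint_frequency == 0:  # assumed hyperparameter
        save_checkpoint(iteration)                # placeholder checkpoint writer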
Example #2
class Trainer:
    def __init__(self, env, seed):
        self.seed = seed
        self.successes = []
        self.epochs = []
        self.env = env
        self.device = torch.device(p.device)
        # create the network
        self.actor = Actor(self.env.ob_shape, self.env.goal_shape,
                           self.env.action_shape).to(self.device)
        self.critic = Critic(self.env.ob_shape, self.env.goal_shape,
                             self.env.action_shape).to(self.device)
        # build up the target network
        self.actor_target = Actor(self.env.ob_shape, self.env.goal_shape,
                                  self.env.action_shape).to(self.device)
        self.critic_target = Critic(self.env.ob_shape, self.env.goal_shape,
                                    self.env.action_shape).to(self.device)
        # load the weights into the target networks
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
        # create the optimizers
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=p.lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=p.lr)
        # replay buffer with HER sampling
        self.buffer = replay_buffer(self.env.ob_shape, self.env.action_shape)

    def start(self):
        for self.epoch in range(p.n_epochs):
            for _ in range(p.n_cycles):
                mb_obs, mb_ag, mb_g, mb_obs_next, mb_ag_next, mb_actions = [], [], [], [], [], []
                for _ in range(1):  # a single rollout per cycle
                    # reset the rollout buffers
                    ep_obs, ep_ag, ep_g, ep_obs_next, ep_ag_next, ep_actions = [], [], [], [], [], []
                    # reset the environment
                    observation = self.env.reset()
                    obs = observation['observation']
                    ag = observation['achieved_goal']
                    g = observation['desired_goal']
                    # start to collect samples
                    for t in range(p.max_episode_steps):
                        with torch.no_grad():
                            obs_norm, g_norm = self.normalize(obs, g)
                            pi = self.actor(obs_norm, g_norm)
                            action = self.add_noise(pi)
                        # feed the actions into the environment
                        observation_new, _, _, info = self.env.step(action)
                        obs_new = observation_new['observation']
                        ag_new = observation_new['achieved_goal']
                        # append rollouts
                        ep_obs.append(obs.copy())
                        ep_ag.append(ag.copy())
                        ep_g.append(g.copy())
                        ep_obs_next.append(obs_new.copy())
                        ep_ag_next.append(ag_new.copy())
                        ep_actions.append(action.copy())
                        # re-assign the observation
                        obs = obs_new
                        ag = ag_new

                    mb_obs.append(ep_obs)
                    mb_ag.append(ep_ag)
                    mb_g.append(ep_g)
                    mb_obs_next.append(ep_obs_next)
                    mb_ag_next.append(ep_ag_next)
                    mb_actions.append(ep_actions)

                # convert them into arrays
                mb_obs = np.array(mb_obs)
                mb_ag = np.array(mb_ag)
                mb_g = np.array(mb_g)
                mb_obs_next = np.array(mb_obs_next)
                mb_ag_next = np.array(mb_ag_next)
                mb_actions = np.array(mb_actions)
                # store the episodes
                self.buffer.store_episode(
                    [mb_obs, mb_ag, mb_g, mb_obs_next, mb_ag_next, mb_actions])
                self.buffer.update_normalizer(
                    [mb_obs, mb_ag, mb_g, mb_obs_next, mb_ag_next, mb_actions])
                for _ in range(p.update_per_episode):
                    # train the network
                    c_loss, a_loss = self.update_network()
                # soft update
                self.soft_update_target_network()
            # start to do the evaluation
            success_rate = self.eval_agent()
            print('[{}] epoch: {}, seed: {}, eval success rate is: {}'.format(
                self.env.name, self.epoch, self.seed, success_rate))
            self.save_csv(self.epoch, success_rate)
            # stop early once the last 10 evaluations were all perfect
            if len(self.successes) >= 10:
                if sum(self.successes[-10:]) == 10.0:
                    break

    def save_csv(self, epoch, success_rate):
        try:
            os.mkdir("Generated_data")
        except FileExistsError:
            pass

        self.epochs.append(epoch + 1)
        self.successes.append(success_rate)

        di = {}
        di['epochs'] = self.epochs
        di["success_rate"] = self.successes

        frame = pd.DataFrame(di)
        frame.to_csv("Generated_data/{}_{}.csv".format(self.env.name,
                                                       self.seed))

    def normalize(self, obs, g):
        # normalize the observation and goal, then move them to the training device
        obs_norm = self.buffer.o_norm.normalize(obs)
        g_norm = self.buffer.g_norm.normalize(g)
        obs_norm = torch.FloatTensor(obs_norm).to(self.device)
        g_norm = torch.FloatTensor(g_norm).to(self.device)
        return obs_norm, g_norm

    # choose an action for the agent and add exploration noise
    def add_noise(self, pi):
        action = pi.cpu().numpy().squeeze()
        # add gaussian noise
        action += p.noise_eps * np.random.randn(*action.shape)
        action = np.clip(action, -1.0, 1.0)
        # sample a uniformly random action
        random_actions = np.random.uniform(low=-1.0,
                                           high=1.0,
                                           size=self.env.action_shape)
        # with probability random_eps, replace the action with the random one
        action += np.random.binomial(1, p.random_eps,
                                     1)[0] * (random_actions - action)
        return action

    # soft update
    def soft_update_target_network(self):
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_((1 - p.polyak) * param.data +
                                    p.polyak * target_param.data)

        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_((1 - p.polyak) * param.data +
                                    p.polyak * target_param.data)

    # update the network
    def update_network(self):
        # sample the episodes
        transitions = self.buffer.sample()
        # pre-process the observation and goal
        o, o_next, g = transitions['obs'], transitions['obs_next'], transitions['g']
        transitions['obs'], transitions['g'] = self.buffer.preproc_og(o, g)
        transitions['obs_next'], transitions['g_next'] = self.buffer.preproc_og(o_next, g)

        # start to do the update
        obs_norm, g_norm = self.normalize(transitions['obs'], transitions['g'])
        obs_next_norm, g_next_norm = self.normalize(transitions['obs_next'],
                                                    transitions['g_next'])

        actions_tensor = torch.FloatTensor(transitions['actions']).to(
            self.device)
        r_tensor = torch.FloatTensor(transitions['r']).to(self.device)

        with torch.no_grad():
            # compute the target q value from the target networks
            actions_next = self.actor_target(obs_next_norm, g_next_norm)
            q_next_value = self.critic_target(obs_next_norm, g_next_norm,
                                              actions_next)
            q_next_value = q_next_value.detach()
            target_q_value = r_tensor + p.gamma * q_next_value
            target_q_value = target_q_value.detach()
            # clip the q value
            clip_return = 1 / (1 - p.gamma)
            target_q_value = torch.clamp(target_q_value, -clip_return, 0)

        # the q loss
        real_q_value = self.critic(obs_norm, g_norm, actions_tensor)
        critic_loss = (target_q_value - real_q_value).pow(2).mean()
        # the actor loss
        actions_real = self.actor(obs_norm, g_norm)
        actor_loss = -self.critic(obs_norm, g_norm, actions_real).mean()
        # stash intermediate values as attributes (not used elsewhere in this snippet)
        self.a1 = actor_loss
        self.a2 = (actions_real).pow(2).mean()
        self.actions_real = actions_real
        # penalize large actions (L2 action regularization)
        actor_loss += (actions_real).pow(2).mean()

        # start to update the network
        self.actor_optim.zero_grad()
        actor_loss.backward()

        # update the critic_network
        self.critic_optim.zero_grad()
        critic_loss.backward()

        self.actor_optim.step()
        self.critic_optim.step()

        return critic_loss.item(), actor_loss.item()

    # do the evaluation
    def eval_agent(self):
        total_success_rate = []
        for _ in range(p.testing_eps):
            total_success_rate.append(0.0)
            observation = self.env.reset()
            obs = observation['observation']
            g = observation['desired_goal']
            for _ in range(p.max_episode_steps):
                with torch.no_grad():
                    obs_norm, g_norm = self.normalize(obs, g)
                    pi = self.actor(obs_norm, g_norm)
                    # convert the actions
                    actions = pi.detach().cpu().numpy().squeeze()
                observation_new, _, _, info = self.env.step(actions)
                obs = observation_new['observation']
                g = observation_new['desired_goal']
                if info["is_success"]:
                    break
            total_success_rate[-1] = info['is_success']
        total_success_rate = round(np.array(total_success_rate).mean(), 2)
        return total_success_rate
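
The clamp on target_q_value in update_network assumes the rewards are sparse and lie in {-1, 0}, so the discounted return can never fall below -1/(1 - gamma) or rise above 0. A small self-contained check of that bound (gamma here is an illustrative stand-in for p.gamma):

# Sanity check of the return-clipping bound, assuming rewards in {-1, 0}.
gamma = 0.98                                        # illustrative stand-in for p.gamma
clip_return = 1 / (1 - gamma)                       # 50.0
worst_case_return = sum(-1.0 * gamma ** t for t in range(100000))
print(clip_return, worst_case_return)               # 50.0 and approx. -50.0, so clamping the
                                                    # target to [-clip_return, 0] is loss-free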
Example #3
class Trainer:
    def __init__(self, envs, testing_envs, seed, variance_limit = 0.25):
        self.seed = seed
        self.successes = []
        self.testing_envs = testing_envs
        self.envs = envs
        self.variance_limit = variance_limit
        
        training_envs_per_dof = int(len(self.envs.envs)/3)
        
        self.training_env_seq = [4]*training_envs_per_dof + [5]*training_envs_per_dof + [6]*training_envs_per_dof
        self.testing_env_seq = [4]*10 + [5]*10 + [6]*10

        if p.mode == "retrain":
            self.training_env_seq = self.testing_env_seq

        self.device = torch.device(p.device)
        # create the network
        self.actor = Actor().to(self.device)
        self.critic = Critic().to(self.device)

        if p.mode == 'retrain':
            self.actor.load_state_dict(torch.load("actor_seed_{}".format(seed)))
            self.critic.load_state_dict(torch.load("critic_seed_{}".format(seed)))

        # build up the target network
        self.actor_target = Actor().to(self.device)
        self.critic_target = Critic().to(self.device)
        # load the weights into the target networks
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
        # create the optimizers
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=p.lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=p.lr)
        # replay buffer with HER sampling
        self.buffer = replay_buffer(seed)

        if p.mode == 'retrain':
            self.buffer.load_normalizers()
            print("loading done")

        self.training_data, self.testing_data = {}, {}
        for env in self.envs.envs:
            self.training_data[env.name] = []
        for env in self.testing_envs.envs:
            self.testing_data[env.name] = []

        try:
            os.mkdir("Generated_data")
        except FileExistsError:
            pass
        
    def start(self):
        if p.mode == "retrain":
            for self.epoch in range(-10, 0):
                training_success_rate, testing_success_rate = self.eval_agent()     
                self.log_data(training_success_rate, testing_success_rate)

        else:
            for self.epoch in range(p.n_epochs):
                for _ in range(p.n_cycles):
                    # reset the rollouts
                    ep_obs, ep_ag, ep_g, ep_obs_next, ep_ag_next, ep_actions, ep_seq = [], [], [], [], [], [], []
                    # reset the environment
                    observation = self.envs.reset()
                    obs = observation['observation']
                    ag = observation['achieved_goal']
                    g = observation['desired_goal']

                    # start to collect samples
                    for t in range(p.max_episode_steps):
                        with torch.no_grad():
                            obs_norm, g_norm = self.normalize(obs, g)
                            pi = self.actor(obs_norm, g_norm, self.training_env_seq)
                            action = self.add_noise(pi)
                          
                        # feed the actions into the environment
                        observation_new, info = self.envs.step(action)
                        obs_new = observation_new['observation']
                        ag_new = observation_new['achieved_goal']
                        # append rollouts
                        ep_obs.append(obs.copy())
                        ep_ag.append(ag.copy())
                        ep_g.append(g.copy())
                        ep_obs_next.append(obs_new.copy())
                        ep_ag_next.append(ag_new.copy())
                        ep_actions.append(action.copy())
                        ep_seq.append(self.training_env_seq)
                        # re-assign the observation
                        obs = obs_new
                        ag = ag_new

                    # convert them into arrays
                    ep_obs = np.array(ep_obs).swapaxes(0,1)
                    ep_ag = np.array(ep_ag).swapaxes(0,1)
                    ep_g = np.array(ep_g).swapaxes(0,1)
                    ep_obs_next = np.array(ep_obs_next).swapaxes(0,1)
                    ep_ag_next = np.array(ep_ag_next).swapaxes(0,1)
                    ep_actions = np.array(ep_actions).swapaxes(0,1)
                    ep_seq = np.array(ep_seq).swapaxes(0,1)

                    for i in range(ep_obs.shape[0]):
                        # store each environment's episode and update the normalizers
                        episode = [np.expand_dims(ep_obs[i], 0),
                                   np.expand_dims(ep_ag[i], 0),
                                   np.expand_dims(ep_g[i], 0),
                                   np.expand_dims(ep_obs_next[i], 0),
                                   np.expand_dims(ep_ag_next[i], 0),
                                   np.expand_dims(ep_actions[i], 0),
                                   np.expand_dims(ep_seq[i], 0)]
                        self.buffer.store_episode(episode)
                        self.buffer.update_normalizer(episode)
                   
                    for _ in range(p.update_per_episode):
                        # train the network
                        c_loss, a_loss = self.update_network()
                        
                    # soft update
                    self.soft_update_target_network()

                training_success_rate, testing_success_rate = self.eval_agent()
                self.log_data(training_success_rate, testing_success_rate)
            
                torch.save(self.actor.state_dict(), "actor_seed_{}".format(self.seed))
                torch.save(self.critic.state_dict(), "critic_seed_{}".format(self.seed))
                self.buffer.save_normalizers()

    def log_data(self, training_data, testing_data):
        os.system("clear")
        print("Epoch: {}".format(self.epoch))
        print("Training_data: ")
        end = "\t"

        for i, env in enumerate(self.envs.envs):
            print(env.name, training_data[i], end=end)
            self.training_data[env.name].append(training_data[i])
            end = "\t" if end=="\n" else "\n"
        print(end="\n\n")
        
        frame = pd.DataFrame(self.training_data)
        if self.variance_limit == 0.25:
            frame.to_csv("Generated_data/" + p.mode + "ing_data_{}.csv".format(self.seed))
        else:
            frame.to_csv("Generated_data/" + p.mode + "ing_data_{}_{}.csv".format(self.variance_limit, self.seed))

        print("Testing_data: ")
        end = "\t"
        for i, env in enumerate(self.testing_envs.envs):
            print(env.name, testing_data[i], end=end)
            self.testing_data[env.name].append(testing_data[i])
            end = "\t" if end=="\n" else "\n"
        print(end="\n\n")

        frame = pd.DataFrame(self.testing_data)
        if self.variance_limit == 0.25:
            frame.to_csv("Generated_data/" + p.mode + "ing_test_data_{}.csv".format(self.seed))
        else:
            frame.to_csv("Generated_data/" + p.mode + "ing_test_data_{}_{}.csv".format(self.variance_limit, self.seed))

    def normalize(self, obs, g):
        # normalize the observation and goal, then move them to the training device
        obs_norm = self.buffer.o_norm.normalize(obs)
        g_norm = self.buffer.g_norm.normalize(g)
        obs_norm = torch.FloatTensor(obs_norm).to(self.device)
        g_norm = torch.FloatTensor(g_norm).to(self.device)
        return obs_norm, g_norm
    
    # choose an action for the agent and add exploration noise
    def add_noise(self, pi):
        action = pi.cpu().numpy().squeeze()
        # add gaussian noise
        action += p.noise_eps * np.random.randn(*action.shape)
        action = np.clip(action, -1.0, 1.0)
        # sample a uniformly random action
        random_actions = np.random.uniform(low=-1.0, high=1.0, size=p.max_dof)
        # with probability random_eps, replace the action with the random one
        action += np.random.binomial(1, p.random_eps, 1)[0] * (random_actions - action)
        return action

    # soft update
    def soft_update_target_network(self):
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_((1 - p.polyak) * param.data + p.polyak * target_param.data)

        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_((1 - p.polyak) * param.data + p.polyak * target_param.data)

    # update the network
    def update_network(self):
        # sample the episodes
        transitions = self.buffer.sample()
        # pre-process the observation and goal
        o, o_next, g = transitions['obs'], transitions['obs_next'], transitions['g']
        transitions['obs'], transitions['g'] = self.buffer.preproc_og(o, g)
        transitions['obs_next'], transitions['g_next'] = self.buffer.preproc_og(o_next, g)
        seq = transitions['seq']

        # start to do the update
        obs_norm, g_norm = self.normalize(transitions['obs'], transitions['g'])
        obs_next_norm, g_next_norm = self.normalize(transitions['obs_next'], transitions['g_next'])
        
        actions_tensor = torch.FloatTensor(transitions['actions']).to(self.device)
        r_tensor = torch.FloatTensor(transitions['r']).to(self.device)
           
        with torch.no_grad():
            # compute the target q value from the target networks
            r_tensor = r_tensor.view(p.batch_size)
            actions_next = self.actor_target(obs_next_norm, g_next_norm, seq)
            q_next_value = self.critic_target(obs_next_norm, g_next_norm, actions_next, seq)
            q_next_value = q_next_value.detach()
            target_q_value = r_tensor + p.gamma * q_next_value
            target_q_value = target_q_value.detach()
            # clip the q value
            clip_return = 1 / (1 - p.gamma)
            target_q_value = torch.clamp(target_q_value, -clip_return, 0)
        # the q loss
        real_q_value = self.critic(obs_norm, g_norm, actions_tensor, seq)
        critic_loss = (target_q_value - real_q_value).pow(2).mean()
        # the actor loss
        actions_real = self.actor(obs_norm, g_norm, seq)
        actor_loss = -self.critic(obs_norm, g_norm, actions_real, seq).mean()
        # stash intermediate values as attributes (not used elsewhere in this snippet)
        self.a1 = actor_loss
        self.a2 = (actions_real).pow(2).mean()
        self.actions_real = actions_real
        # penalize large actions (L2 action regularization)
        actor_loss += (actions_real).pow(2).mean()
        
        # start to update the network
        self.actor_optim.zero_grad()
        actor_loss.backward()

        # update the critic_network
        self.critic_optim.zero_grad()
        critic_loss.backward()
        
        self.actor_optim.step()
        self.critic_optim.step()

        return critic_loss.item(), actor_loss.item()

    # do the evaluation
    def eval_agent(self):
        training_success_rate = np.array([0.0] * len(self.envs.envs))
        
        for _ in range(p.testing_eps):
            successes = np.array([0.0]*len(self.envs.envs))
            observation = self.envs.reset()
            obs = observation['observation']
            g = observation['desired_goal']
            
            for _ in range(p.max_episode_steps):
                with torch.no_grad():
                    obs_norm, g_norm = self.normalize(obs, g)
                    pi = self.actor(obs_norm, g_norm, self.training_env_seq)
                    actions = pi.detach().cpu().numpy().squeeze()
                observation_new, info = self.envs.step(actions)
                obs = observation_new['observation']
                g = observation_new['desired_goal']
                successes = successes + info['is_success']

            successes = np.array([1.0 if i else 0.0 for i in successes])
            training_success_rate = training_success_rate + successes
        training_success_rate = training_success_rate/p.testing_eps
        
        testing_success_rate = np.array([0.0] * len(self.testing_envs.envs))    
        for _ in range(p.testing_eps):
            successes = np.array([0.0]*len(self.testing_envs.envs))
            observation = self.testing_envs.reset()
            obs = observation['observation']
            g = observation['desired_goal']
            
            for _ in range(p.max_episode_steps):
                with torch.no_grad():
                    obs_norm, g_norm = self.normalize(obs, g)
                    pi = self.actor(obs_norm, g_norm, self.testing_env_seq)
                    actions = pi.detach().cpu().numpy().squeeze()
                observation_new, info = self.testing_envs.step(actions)
                obs = observation_new['observation']
                g = observation_new['desired_goal']
                successes = successes + info['is_success']

            successes = np.array([1.0 if i else 0.0 for i in successes])
            testing_success_rate = testing_success_rate + successes
        testing_success_rate = testing_success_rate/p.testing_eps

        return training_success_rate, testing_success_rate
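
Both trainers share the same polyak-averaged target update, target <- polyak * target + (1 - polyak) * online. A standalone sketch of that step follows; the layer sizes and the polyak value are illustrative, not taken from p.

import torch
import torch.nn as nn

# Standalone illustration of the soft_update_target_network step.
polyak = 0.95                                    # illustrative value for p.polyak
online = nn.Linear(4, 2)
target = nn.Linear(4, 2)
target.load_state_dict(online.state_dict())      # target starts as an exact copy

with torch.no_grad():
    for t_param, param in zip(target.parameters(), online.parameters()):
        # move the target weights a small step toward the online weights
        t_param.data.copy_((1 - polyak) * param.data + polyak * t_param.data)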