Example #1
    def step(self, states, actions, rewards, next_states, dones, running_timestep):
    
        # Store experience to the replay buffer
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            # print('adding: ', state.shape, action.shape, reward, next_state.shape, done)
            self.memory.add(state, action, reward, next_state, done)
    
        # Once the buffer holds at least BATCH_SIZE experiences and the current timestep
        # is a multiple of LEARNING_FREQUENCY, sample a batch and update the local
        # networks; the target networks are then hard- or soft-updated below.
        if (running_timestep % self.LEARNING_FREQUENCY) == 0:
            if len(self.memory) > self.BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, self.GAMMA, running_timestep)

        if running_timestep > self.DATA_TO_BUFFER_BEFORE_LEARNING:
            if self.IS_HARD_UPDATE:
                if (running_timestep % self.HARD_UPDATE_FREQUENCY) == 0:
                    utils.hard_update(self.actor_local, self.actor_target)
    
            elif self.IS_SOFT_UPDATE:
                if (running_timestep % self.SOFT_UPDATE_FREQUENCY) == 0:
                    utils.soft_update(self.critic_local, self.critic_target, self.TAU)
                    utils.soft_update(self.actor_local, self.actor_target, self.TAU)
            else:
                raise ValueError('Exactly one of IS_HARD_UPDATE and IS_SOFT_UPDATE must be enabled')
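
This example delegates target-network synchronisation to utils.hard_update and utils.soft_update, which are not shown. Below is a minimal sketch of what such helpers typically look like in PyTorch DDPG code, assuming the argument order used here (local network first, target network second); some of the later examples pass the arguments the other way round, which simply reverses the copy direction.

def hard_update(local_model, target_model):
    # Copy every parameter of the local network into the target network.
    target_model.load_state_dict(local_model.state_dict())


def soft_update(local_model, target_model, tau):
    # Polyak averaging: target <- tau * local + (1 - tau) * target.
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)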
Example #2
 def update(self, running_time_step):
     
     if self.IS_HARD_UPDATE:
         if (running_time_step % self.HARD_UPDATE_FREQUENCY) == 0:
             utils.hard_update(self.actor_local, self.actor_target)
     
     elif self.IS_SOFT_UPDATE:
         if (running_time_step % self.SOFT_UPDATE_FREQUENCY) == 0:
             utils.soft_update(self.critic_local, self.critic_target, self.TAU)
             utils.soft_update(self.actor_local, self.actor_target, self.TAU)
     else:
         raise ValueError('Exactly one of IS_HARD_UPDATE and IS_SOFT_UPDATE must be enabled')
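
Both update paths are driven by the same group of configuration flags. A hypothetical settings block showing how those attributes relate to one another (the attribute names come from the examples; the concrete values are placeholder assumptions):

class UpdateConfig:
    # Exactly one of the two modes should be enabled at a time.
    IS_HARD_UPDATE = False          # periodically copy the local weights into the target wholesale
    IS_SOFT_UPDATE = True           # blend the target towards the local network
    HARD_UPDATE_FREQUENCY = 1000    # timesteps between hard copies
    SOFT_UPDATE_FREQUENCY = 1       # timesteps between soft (Polyak) updates
    TAU = 1e-3                      # Polyak averaging coefficient passed to soft_update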
Example #3
 def __init__(self, args, agent_id, mode):
     """
     :param args:            Config parameters
     :param agent_id:        The agent id to run
     :param mode:            train or test
     """
     self.agent_id = agent_id
     self.mode = mode
     self.SEED = args.SEED
     random.seed(self.SEED)
     
     self.NOISE = args.NOISE_FN()
     self.STATE_SIZE = args.STATE_SIZE
     self.ACTION_SIZE = args.ACTION_SIZE
     self.TAU = args.TAU
     self.ACTOR_LEARNING_RATE = args.ACTOR_LEARNING_RATE
     self.CRITIC_LEARNING_RATE = args.CRITIC_LEARNING_RATE
     self.WEIGHT_DECAY = args.WEIGHT_DECAY
     
     self.IS_HARD_UPDATE = args.IS_HARD_UPDATE
     self.IS_SOFT_UPDATE = args.IS_SOFT_UPDATE
     self.SOFT_UPDATE_FREQUENCY = args.SOFT_UPDATE_FREQUENCY
     self.HARD_UPDATE_FREQUENCY = args.HARD_UPDATE_FREQUENCY
     self.NOISE_AMPLITUDE_DECAY = args.NOISE_AMPLITUDE_DECAY_FN()
     
     self.CHECKPOINT_DIR = args.CHECKPOINT_DIR
     self.SUMMARY_LOGGER = args.SUMMARY_LOGGER
     
     # Actor Network
     self.actor_local = model.Actor(self.STATE_SIZE, self.ACTION_SIZE, [256, 256, 2], self.SEED).to(device)
     self.actor_target = model.Actor(self.STATE_SIZE, self.ACTION_SIZE, [256, 256, 2], self.SEED).to(device)
     self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.ACTOR_LEARNING_RATE)
     
     # Critic Network
     self.critic_local = model.Critic(self.STATE_SIZE, self.ACTION_SIZE, [256, 256, 1], self.SEED).to(device)
     self.critic_target = model.Critic(self.STATE_SIZE, self.ACTION_SIZE, [256, 256, 1], self.SEED).to(device)
     self.critic_optimizer = optim.Adam(
             self.critic_local.parameters(), lr=self.CRITIC_LEARNING_RATE, weight_decay=self.WEIGHT_DECAY
     )
     
     # Initialise the target networks with the same weights as the local networks
     utils.hard_update(self.actor_local, self.actor_target)
     utils.hard_update(self.critic_local, self.critic_target)
     
     # Noise process
     self.noise = args.NOISE_FN()
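
A constructor like this is normally paired with an act() method that evaluates actor_local and perturbs the result with the noise process during training. No such method appears in the excerpt; the sketch below shows the usual DDPG pattern, assuming the noise object exposes sample(), actions are bounded to [-1, 1], and numpy, torch and device are available as in __init__.

 def act(self, state):
     # Sketch only (not part of the quoted code): greedy action from the
     # local actor, with exploration noise added while training.
     state = torch.from_numpy(state).float().to(device)
     self.actor_local.eval()
     with torch.no_grad():
         action = self.actor_local(state).cpu().numpy()
     self.actor_local.train()

     if self.mode == 'train':
         # Assumes the noise process provides a sample() method.
         action += self.noise.sample()
     return np.clip(action, -1.0, 1.0)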
Example #4
 def load_models(self, episode):
     """
     Loads the saved target-actor and actor weights for every agent and synchronises
     the two sets of networks; critic loading is currently commented out.
     :param episode: the episode count used to build the checkpoint file names
     :return:
     """
     for i in range(self.num_agents):
         target_actor_path = f'./Models/{episode}_target_actor{i}.pt'
         actor_path = f'./Models/{episode}_actor{i}.pt'
         self.target_actors[i].load_state_dict(
             torch.load(target_actor_path, map_location=self.device))
         self.actors[i].load_state_dict(
             torch.load(actor_path, map_location=self.device))
         # Critic loading is intentionally skipped:
         # self.target_critics[i].load_state_dict(
         #     torch.load(f'./Models/{episode}_target_critic{i}.pt', map_location=self.device))
         # self.critics[i].load_state_dict(
         #     torch.load(f'./Models/{episode}_critic{i}.pt', map_location=self.device))
         utils.hard_update(self.target_actors[i], self.actors[i])
         # utils.hard_update(self.target_critics[i], self.critics[i])
     print('Models loaded successfully')
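
load_models implies a matching save routine that wrote those checkpoint files in the first place. It is not part of the excerpt; under the same './Models/<episode>_*.pt' naming scheme, a plausible counterpart would be:

 def save_models(self, episode):
     """
     Hypothetical counterpart to load_models: saves the actor and target-actor
     weights for every agent using the same file-name scheme.
     """
     for i in range(self.num_agents):
         torch.save(self.actors[i].state_dict(),
                    f'./Models/{episode}_actor{i}.pt')
         torch.save(self.target_actors[i].state_dict(),
                    f'./Models/{episode}_target_actor{i}.pt')
     print('Models saved successfully')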
Example #5
    def step(self, states, actions, rewards, next_states, dones,
             running_timestep):
        # Insert the experience tuple into the replay buffer
        self.memory.add(states, actions, rewards, next_states, dones)

        if (running_timestep % self.LEARNING_FREQUENCY) == 0:
            if len(self.memory) > self.BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, running_timestep)

        if running_timestep > self.DATA_TO_BUFFER_BEFORE_LEARNING:
            if self.IS_HARD_UPDATE:
                if (running_timestep % self.HARD_UPDATE_FREQUENCY) == 0:
                    utils.hard_update(self.local_network, self.target_network)

            elif self.IS_SOFT_UPDATE:
                if (running_timestep % self.SOFT_UPDATE_FREQUENCY) == 0:
                    utils.soft_update(self.local_network, self.target_network,
                                      self.TAU)
            else:
                raise ValueError(
                    'Exactly one of IS_HARD_UPDATE and IS_SOFT_UPDATE must be enabled'
                )
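
The step() methods in Examples #1 and #5 rely on a replay buffer object that exposes add(), sample() and __len__(). That class is not reproduced here; a minimal uniform-sampling buffer with the same interface (the Experience field names and constructor arguments are assumptions) might look like this:

import random
from collections import deque, namedtuple

Experience = namedtuple('Experience', ['state', 'action', 'reward', 'next_state', 'done'])


class ReplayBuffer:
    def __init__(self, buffer_size, batch_size, seed=0):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        # Append one transition, evicting the oldest once the buffer is full.
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        # Uniformly sample a batch of stored transitions.
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)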
Example #6
    def __init__(self,
                 gamma,
                 lr_a,
                 lr_c,
                 state_dim_actor,
                 state_dim_critic,
                 num_agents,
                 num_agent_lim,
                 action_dim,
                 mem_size,
                 batch_size,
                 agent_name,
                 chkpoint,
                 chkpt_dir,
                 env=None):

        self.state_dim_actor = state_dim_actor
        self.state_dim_critic = state_dim_critic
        self.action_dim = action_dim
        self.action_lim = action_dim
        self.iter = 0
        self.lr_a = lr_a
        self.lr_c = lr_c
        self.tau = 0.05
        self.steps_done = 0
        self.nrand_action = 0
        self.gamma = gamma
        self.num_agent_lim = num_agent_lim
        self.max_n_agents = self.num_agent_lim
        self.learn_step_counter = 0
        self.batch_size = batch_size
        self.chkpt_dir = chkpt_dir
        self.env = env
        self.critic_loss_value = 0
        self.actor_loss_value = 0
        self.chkpoint = chkpoint
        self.num_agents = num_agents
        self.agent_name = agent_name
        self.use_cuda = False
        self.noise = utils.OrnsteinUhlenbeckActionNoise(self.action_dim)

        self.actors = [
            Actor(self.state_dim_actor, self.action_dim)
            for i in range(num_agent_lim)
        ]
        self.critics = [
            Critic(self.state_dim_critic, self.action_dim, num_agent_lim)
            for i in range(num_agent_lim)
        ]

        self.target_actors = deepcopy(self.actors)
        self.target_critics = deepcopy(self.critics)
        self.actor_optimizers = [
            torch.optim.Adam(self.actors[i].parameters(), self.lr_a)
            for i in range(num_agent_lim)
        ]
        self.critic_optimizers = [
            torch.optim.Adam(self.critics[i].parameters(), self.lr_c)
            for i in range(num_agent_lim)
        ]

        # Set up the CUDA environment
        self.device = 'cuda' if self.use_cuda else 'cpu'
        if self.use_cuda:
            for i in range(num_agent_lim):
                self.actors[i].to(self.device)
                self.target_actors[i].to(self.device)
                self.critics[i].to(self.device)
                self.target_critics[i].to(self.device)

        for i in range(num_agent_lim):
            utils.hard_update(self.target_actors[i], self.actors[i])
            utils.hard_update(self.target_critics[i], self.critics[i])
        self.memories = [ReplayBuffer(mem_size) for i in range(num_agent_lim)]
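
The exploration noise in Example #6 comes from utils.OrnsteinUhlenbeckActionNoise(self.action_dim), which is not included above. A standard Ornstein-Uhlenbeck process exposing a sample() method looks roughly like this (the default mu, theta and sigma values are assumptions):

import numpy as np


class OrnsteinUhlenbeckActionNoise:
    """Temporally correlated noise commonly used for DDPG-style exploration."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Restart the process at its mean value.
        self.state = np.ones(self.action_dim) * self.mu

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): the state drifts back towards mu
        # while being perturbed by Gaussian noise, giving temporally correlated samples.
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state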