class TestMemory(unittest.TestCase):
    def setUp(self):
        self.cls = ReplayBuffer(buffer_size=1000, batch_size=4, past_size=2)

        states = np.arange(50).reshape(-1, 2)
        next_states = np.arange(2, 52).reshape(-1, 2)
        actions = np.arange(0, 25, 1)**2
        rewards = np.linspace(-1, 2, 25)
        terms = np.zeros(25)

        for s, a, r, s2, d in zip(states, actions, rewards, next_states,
                                  terms):
            self.cls.add(s, a, r, s2, d)

    def test_length_memory(self):
        self.assertEqual(25, len(self.cls))

    def test_memory(self):
        self.assertTrue(
            np.all(np.array([6, 7]) == self.cls.memory[3]['state']))

    def test_get_historic_state(self):
        self.assertTrue(
            np.all(
                np.array([[0, 1], [2, 3], [4, 5]]) == self.cls.get(2)['state']))

    def test_sample_size(self):
        self.assertEqual(4, len(self.cls.sample()))
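
Both test classes above and below assume that unittest, numpy and the project's ReplayBuffer are importable; a minimal sketch of the missing boilerplate follows (the ReplayBuffer import path is a placeholder, not taken from the original examples):

import unittest

import numpy as np

# Placeholder import path; point this at wherever ReplayBuffer lives in the project.
from replay_buffer import ReplayBuffer

if __name__ == '__main__':
    unittest.main()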
class TestMemoryOverwrite(unittest.TestCase):
    def setUp(self):
        self.cls = ReplayBuffer(24, 4, 2)

        states = np.arange(50).reshape(-1, 2)
        next_states = np.arange(2, 52).reshape(-1, 2)
        actions = np.arange(0, 25, 1)**2
        rewards = np.linspace(-1, 2, 25)
        terms = np.zeros(25)

        for s, a, r, s2, d in zip(states, actions, rewards, next_states,
                                  terms):
            self.cls.add(s, a, r, s2, d)

    def test_rolling(self):
        self.assertTrue(
            np.all(np.array([2, 3]) == self.cls.memory[0]['state']))

    def test_overwriting(self):
        self.assertTrue(
            np.all(np.array([48, 49]) == self.cls.memory[-1]['state']))
Example #8
def main(args):
    CUDA = torch.cuda.is_available()
    OUTPUT_RESULTS_DIR = './saver'
    init_set = np.array([0, 0, 1, 1, 0, 1])
    state_dim = 5  # 3#10#3
    action_dim = 1
    env = environment(env_new_model,
                      init_set,
                      state_dim=state_dim,
                      action_dim=action_dim)
    action_bound = np.array([1, -1])

    actor = ActorNetwork(state_dim, action_dim, action_bound, args.actor_lr,
                         args.tau, args.seed)
    target_actor = ActorNetwork(state_dim, action_dim, action_bound,
                                args.actor_lr, args.tau, args.seed)
    critic = CriticNetwork(state_dim, action_dim, action_bound, args.critic_lr,
                           args.tau, args.l2_decay, args.seed)
    target_critic = CriticNetwork(state_dim, action_dim, action_bound,
                                  args.critic_lr, args.tau, args.l2_decay,
                                  args.seed)

    if CUDA:
        actor = actor.cuda()
        target_actor = target_actor.cuda()
        critic = critic.cuda()
        target_critic = target_critic.cuda()

    replay_buffer = ReplayBuffer(args.bufferlength, args.seed)

    agent = DDPGAgent(actor,
                      target_actor,
                      critic,
                      target_critic,
                      replay_buffer,
                      batch_size=args.batch_size,
                      gamma=args.gamma,
                      seed=args.seed,
                      episode_len=args.episode_len,
                      episode_steps=args.episode_steps,
                      noise_mean=args.noise_mean,
                      noise_th=args.noise_th,
                      noise_std=args.noise_std,
                      noise_decay=args.noise_decay)

    agent.train(env)
class Agent():
    def __init__(self, cfg):
        # Replay memory
        self.memory = ReplayBuffer(**cfg['agent']['memory'])

        # Environment configuration

        self.action_shape = cfg['env']['action_shape']

        # Algorithm parameters
        self.exploration_mu, self.exploration_theta, self.exploration_sigma = cfg['agent']['noise']
        self.noise = OUNoise(self.action_shape, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        self.gamma = cfg['agent']['gamma']
        self.tau = cfg['agent']['tau']

        state_flatten_shape = [np.prod(self.memory.flatten_state_shape)]
        # Actor Model
        self.actor = Actor(state_flatten_shape, self.action_shape, cfg['env']['action_range'],
                           self.tau, self.memory.batch_size, cfg['actor'])

        # Critic Model
        self.critic = Critic(state_flatten_shape, self.action_shape, self.tau, cfg['critic'])

        # Flag & Counter
        self.add_noise = True
        self.episode = 0
        self.max_episode_explore = 100

    def init_actor_critic(self):
        # Initialize target model
        self.critic.copy_local_in_target()
        self.actor.copy_local_in_target()

    def reset(self):
        self.memory.reset_past()
        self.noise = OUNoise(self.action_shape, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward,
                        next_state, done)
        if done:
            self.reset()


    def act(self, state):
        self.last_state = state

        window_states = self.memory.get_state_vector(state).reshape(1, -1)
        action = self.actor.predict(window_states)

        if self.add_noise and self.episode < self.max_episode_explore:
            p = self.episode / self.max_episode_explore
            action = np.clip(action*p + (1-p)*self.noise.sample(), a_max=1, a_min=-1)

        return action

    def learn(self):
        if self.memory.is_sufficient():
            experiences = self.memory.sample()

            states = experiences['state'][:, 0].reshape(self.memory.batch_size, -1)
            actions = experiences['action']
            rewards = experiences['reward']
            dones = experiences['done']
            next_states = experiences['next_state'][:, 0].reshape(self.memory.batch_size, -1)

            # get predicted next state action and Q values from target models
            actions_next = self.actor.get_targets(next_states)
            Q_targets_next = self.critic.get_targets(next_states, actions_next)

            # Compute Q targets for current states and train critic model
            Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
            critic_summaries = self.critic.fit(states, actions, Q_targets)

            # Train actor model
            action_gradients = self.critic.get_actions_grad(states, actions)[0]
            actor_summaries = self.actor.fit(states, action_gradients)

            # Soft-update target models
            self.critic.soft_update()
            self.actor.soft_update()

            summary_reward = summary('sample_rewards', rewards)

            return critic_summaries, actor_summaries, summary_reward
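
A minimal driver sketch showing how the Agent class above could be exercised; `env` (a Gym-style environment), `cfg` and `max_steps` are placeholders, not part of the original example:

# Hypothetical driver loop; `env`, `cfg` and `max_steps` are placeholders.
agent = Agent(cfg)
agent.init_actor_critic()
state = env.reset()
for _ in range(max_steps):
    action = agent.act(state)
    next_state, reward, done, _ = env.step(action)
    agent.step(action, reward, next_state, done)
    agent.learn()
    state = next_state
    if done:
        break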
Example #10
class BaseAgent(object):
    def __init__(self, config, network_manager):

        self.norm_type = config.norm_type

        # Env config
        self.state_dim = config.state_dim
        self.state_min = config.state_min
        self.state_max = config.state_max

        self.action_dim = config.action_dim
        self.action_min = config.action_min
        self.action_max = config.action_max

        self.replay_buffer = ReplayBuffer(config.buffer_size,
                                          config.random_seed)
        self.batch_size = config.batch_size
        self.warmup_steps = config.warmup_steps
        self.gamma = config.gamma

        # to log useful stuff within agent
        self.write_log = config.write_log
        self.write_plot = config.write_plot

        self.network_manager = network_manager
        self.writer = config.writer
        self.config = config

    def start(self, state, is_train):
        return self.take_action(state, is_train, is_start=True)

    def step(self, state, is_train):
        return self.take_action(state, is_train, is_start=False)

    def take_action(self, state, is_train, is_start):
        # Warmup steps are currently not used
        if self.replay_buffer.get_size() < self.warmup_steps:
            # a random warmup action would go here, e.g.:
            # action = (np.random.random_sample(size=self.action_dim) - 0.5) * 2 * self.action_max[0]
            raise NotImplementedError
        else:
            action = self.network_manager.take_action(state, is_train,
                                                      is_start)
        return action

    def get_value(self, s, a):
        raise NotImplementedError

    def update(self, state, next_state, reward, action, is_terminal,
               is_truncated):
        if not is_truncated:
            if not is_terminal:
                self.replay_buffer.add(state, action, reward, next_state,
                                       self.gamma)
            else:
                self.replay_buffer.add(state, action, reward, next_state, 0.0)

        if self.norm_type != 'none':
            self.network_manager.input_norm.update(np.array([state]))
        self.learn()

    def learn(self):
        if self.replay_buffer.get_size() > max(self.warmup_steps,
                                               self.batch_size):
            state, action, reward, next_state, gamma = self.replay_buffer.sample_batch(
                self.batch_size)
            self.network_manager.update_network(state, action, next_state,
                                                reward, gamma)
        else:
            return

    # Resets the agent between episodes. Should primarily be used to clear traces or other temporally linked parameters
    def reset(self):
        self.network_manager.reset()
Example #11
class NAF(Agent):
    def __init__(self, env, params, random_seed):
        super(NAF, self).__init__(env)

        np.random.seed(random_seed) # Random action selection
        random.seed(random_seed) # Experience Replay Buffer

        self.noise_scale = params['noise_scale']
        self.epsilon = params['epsilon'] # 0.3
        self.epsilon_decay = params['epsilon_decay'] # 0.9
        self.epsilon_decay_step = params['epsilon_decay_step'] # 100
        self.policyfunc = nafnn(env, params, random_seed)

        self.replay_buffer = ReplayBuffer(params['buffer_size'])
        self.batch_size = params['batch_size']

        self.gamma = params['gamma'] # 0.99

        self.warmup_steps = params['warmup_steps'] 

        # self.noise_t = np.zeros(self.actionDim)
        self.action_is_greedy = None
        self.eps_decay = True
        
        self.cum_steps = 0 # cumulative steps across episodes
    
        #print('agent params gamma, epsilon', self.gamma, self.epsilon)


    def update(self, S, Sp, r, a, episodeEnd):
        if not episodeEnd:
            self.replay_buffer.add(S, a, r, Sp, self.gamma)
            self.learn()
        else:
            self.replay_buffer.add(S, a, r, Sp, 0.0)
            self.learn()
    
    def learn(self):
        if self.replay_buffer.getSize() > max(self.warmup_steps, self.batch_size):
            s, a, r, sp, gamma = self.replay_buffer.sample_batch(self.batch_size)
            self.policyfunc.update_vars(s, a, sp, r, gamma)
        else:
            return
        #print r
        #self.policyfunc.performtest(s, a, sp, r, gamma)

    def takeAction(self, state, isTrain):
        # epsilon greedy
        meanact, covmat = self.policyfunc.takeAction(state)
        if self.cum_steps < self.warmup_steps:
            # random exploration until the warmup period is over
            # action = self.env.instance.action_space.sample()
            action = np.random.uniform(self.actionMin, self.actionMax, self.actionDim)
        else:
            if isTrain:
                action = np.random.multivariate_normal(meanact, self.noise_scale * covmat)
            else:
                action = meanact
        self.cum_steps += 1
        return np.clip(action, self.actionMin[0], self.actionMax[0])

    def getAction(self, state, isTrain):
        self.next_action = self.takeAction(state, isTrain)
        return self.next_action, self.action_is_greedy
    
    def start(self, state, isTrain):
        self.next_action = self.takeAction(state, isTrain)
        return self.next_action

    def reset(self):
        # self.erbuffer = [] # maybe do not reset erbuffer
        self.noise_t = np.zeros(self.actionDim)
        self.action_is_greedy = None
Example #12
class BaseAgent(object):
    def __init__(self, config, network_manager):

        # Env config
        self.state_dim = config.state_dim
        self.state_min = config.state_min
        self.state_max = config.state_max

        self.action_dim = config.action_dim
        self.action_min = config.action_min
        self.action_max = config.action_max

        self.use_replay = config.use_replay
        if self.use_replay:
            self.replay_buffer = ReplayBuffer(config.buffer_size,
                                              config.random_seed)
        else:
            self.replay_buffer = None
        self.batch_size = config.batch_size
        self.warmup_steps = config.warmup_steps
        self.gamma = config.gamma

        # to log useful stuff within agent
        self.write_log = config.write_log
        self.write_plot = config.write_plot

        self.network_manager = network_manager
        self.writer = config.writer
        self.config = config

    def start(self, state, is_train):
        return self.take_action(state, is_train, is_start=True)

    def step(self, state, is_train):
        return self.take_action(state, is_train, is_start=False)

    def take_action(self, state, is_train, is_start):

        if (self.use_replay
                and self.replay_buffer.get_size() < self.warmup_steps):
            # Warmup steps are currently not used
            raise NotImplementedError
        else:
            action = self.network_manager.take_action(state, is_train,
                                                      is_start)
        return action

    def get_value(self, s, a):
        raise NotImplementedError

    def update(self, state, next_state, reward, action, is_terminal,
               is_truncated):

        if not is_truncated:

            # if using replay buffer
            if self.use_replay:
                if not is_terminal:
                    self.replay_buffer.add(state, action, reward, next_state,
                                           self.gamma)
                else:
                    self.replay_buffer.add(state, action, reward, next_state,
                                           0.0)
                self.learn()

            # if not using replay buffer
            else:
                if not is_terminal:
                    self.learn([state], [action], [reward], [next_state],
                               [self.gamma])
                else:
                    self.learn([state], [action], [reward], [next_state],
                               [0.0])

    def learn(self,
              state=None,
              action=None,
              reward=None,
              next_state=None,
              gamma=None):

        # if using replay, overwrite with batches
        if self.use_replay:
            if self.replay_buffer.get_size() > max(self.warmup_steps,
                                                   self.batch_size):
                state, action, reward, next_state, gamma = self.replay_buffer.sample_batch(
                    self.batch_size)

                self.network_manager.update_network(state, action, next_state,
                                                    reward, gamma)
        else:
            assert state is not None
            self.network_manager.update_network(state, action, next_state,
                                                reward, gamma)

    # Resets the agent between episodes. Should primarily be used to clear traces or other temporally linked parameters
    def reset(self):
        self.network_manager.reset()
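
A minimal episode-loop sketch for the start/step/update/reset interface shared by the BaseAgent variants above; `env`, `config` and `network_manager` are placeholders:

# Hypothetical episode loop; `env`, `config` and `network_manager` are placeholders.
agent = BaseAgent(config, network_manager)
state = env.reset()
action = agent.start(state, is_train=True)
done = False
while not done:
    next_state, reward, done, _ = env.step(action)
    is_truncated = False  # set to True if the episode ended on a time limit
    agent.update(state, next_state, reward, action, done, is_truncated)
    if not done:
        state = next_state
        action = agent.step(state, is_train=True)
agent.reset()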
class MultiAgent:
    def __init__(
        self,
        agent_count,
        observation_size,
        action_size,
        train_config,
        agent_config,
        seed=None,
        actor_model_states=None,
        critic_model_states=None,
        device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')):
        def create_brain(idx):
            return Brain(
                agent_count=agent_count,
                observation_size=observation_size,
                action_size=action_size,
                actor_optim_params=train_config['actor_optim_params'],
                critic_optim_params=train_config['critic_optim_params'],
                soft_update_tau=train_config['soft_update_tau'],
                discount_gamma=train_config['discount_gamma'],
                use_batch_norm=False,
                seed=seed,
                actor_network_states=actor_model_states[idx]
                if actor_model_states else None,
                critic_network_states=critic_model_states[idx]
                if critic_model_states else None,
                device=device)

        self.brains = [create_brain(i) for i in range(agent_count)]
        self.agent_count = agent_count
        self.observation_size = observation_size
        self.action_size = action_size
        self.train_config = train_config
        self.agent_config = agent_config
        self.device = device

        self._batch_size = train_config['mini_batch_size']
        self._update_every = train_config['update_every']

        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, train_config['buffer_size'],
                                   self._batch_size, device)

        self.t_step = 0

    def step(self, obs, actions, rewards, next_obs, dones):
        """observation and learning by replay
        :param obs: array of shape == (agent_count, observation_size)
        :param actions: array of shape == (agent_count, action_size)
        :param rewards: array of shape == (agent_count,)
        :param next_obs: array of shape == (agent_count, observation_size)
        :param dones: array of shape == (agent_count,)
        """
        self.memory.add(obs, actions, rewards, next_obs,
                        dones.astype(np.uint8))

        self.t_step = (self.t_step + 1) % self._update_every

        if self.t_step == 0:
            if len(self.memory) > self._batch_size:
                self._learn()

    def act_torch(self, obs, target, noise=0.0, train=False):
        """Act based on the given batch of observations.
        :param obs: current observations, array of shape == (b, agent_count, observation_size)
        :param target: True to act with the target networks instead of the local ones
        :param noise: noise factor
        :param train: True for training mode else eval mode
        :return: actions for given state as per current policy.
        """
        actions = [
            brain.act(obs[:, i], target, noise, train)
            for i, brain in enumerate(self.brains)
        ]

        actions = torch.stack(actions).transpose(1, 0)

        return actions

    def act(self, obs, target=False, noise=0.0):
        obs = torch.from_numpy(obs).float().to(self.device).unsqueeze(0)

        with torch.no_grad():
            actions = np.vstack(
                [a.cpu().numpy() for a in self.act_torch(obs, target, noise)])

        return actions

    def reset(self):
        for brain in self.brains:
            brain.reset()

    def _learn(self):

        experiences = self.memory.sample()
        experiences = self._tensor_experiences(experiences)

        observations, actions, rewards, next_observations, dones = experiences

        all_obs = self._flatten(observations)
        all_actions = self._flatten(actions)
        all_next_obs = self._flatten(next_observations)

        all_target_next_actions = self._flatten(
            self.act_torch(next_observations, target=True,
                           train=False).contiguous())

        all_local_actions = self.act_torch(observations,
                                           target=False,
                                           train=True).contiguous()

        for i, brain in enumerate(self.brains):
            # update critics
            brain.update_critic(rewards[:, i].unsqueeze(-1),
                                dones[:, i].unsqueeze(-1),
                                all_obs, all_actions,
                                all_next_obs, all_target_next_actions)

            # update actors
            all_local_actions_agent = all_local_actions.detach()
            all_local_actions_agent[:, i] = all_local_actions[:, i]
            all_local_actions_agent = self._flatten(all_local_actions_agent)
            brain.update_actor(all_obs, all_local_actions_agent)

            # update targets
            brain.update_targets()

    def _tensor_experiences(self, experiences):
        ob, actions, rewards, next_ob, dones = \
            [torch.from_numpy(e).float().to(self.device) for e in experiences]
        return ob, actions, rewards, next_ob, dones

    @staticmethod
    def _flatten(tensor):
        b, n_agents, d = tensor.shape
        return tensor.view(b, n_agents * d)
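
A minimal rollout sketch for the MultiAgent class above; `env`, `train_config` and `agent_config` are placeholders, and the sizes are illustrative only:

import numpy as np

# Hypothetical rollout; `env`, `train_config` and `agent_config` are placeholders.
multi_agent = MultiAgent(agent_count=2, observation_size=24, action_size=2,
                         train_config=train_config, agent_config=agent_config,
                         seed=0)
obs = env.reset()                                # (agent_count, observation_size)
for _ in range(10000):
    actions = multi_agent.act(obs, noise=0.1)    # (agent_count, action_size)
    next_obs, rewards, dones = env.step(actions)
    multi_agent.step(obs, actions, rewards, next_obs, np.asarray(dones))
    obs = next_obs
    if np.any(dones):
        obs = env.reset()
        multi_agent.reset()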