class TestMemory(unittest.TestCase):
    def setUp(self):
        self.cls = ReplayBuffer(buffer_size=1000, batch_size=4, past_size=2)
        states = np.arange(50).reshape(-1, 2)
        next_states = np.arange(2, 52).reshape(-1, 2)
        actions = np.arange(0, 25, 1)**2
        rewards = np.linspace(-1, 2, 25)
        terms = np.zeros(25)
        # Fill the buffer with 25 transitions
        for s, a, r, s2, d in zip(states, actions, rewards, next_states, terms):
            self.cls.add(s, a, r, s2, d)

    def test_length_memory(self):
        self.assertEqual(25, len(self.cls))

    def test_memory(self):
        self.assertTrue(np.all(np.array([6, 7]) == self.cls.memory[3]['state']))

    def test_get_historic_state(self):
        self.assertTrue(
            np.all(np.array([[0, 1], [2, 3], [4, 5]]) == self.cls.get(2)['state']))

    def test_sample_size(self):
        self.assertEqual(4, len(self.cls.sample()))
class TestMemoryOverwrite(unittest.TestCase):
    def setUp(self):
        self.cls = ReplayBuffer(24, 4, 2)
        states = np.arange(50).reshape(-1, 2)
        next_states = np.arange(2, 52).reshape(-1, 2)
        actions = np.arange(0, 25, 1)**2
        rewards = np.linspace(-1, 2, 25)
        terms = np.zeros(25)
        # 25 transitions into a buffer of size 24: the oldest entry is overwritten
        for s, a, r, s2, d in zip(states, actions, rewards, next_states, terms):
            self.cls.add(s, a, r, s2, d)

    def test_rolling(self):
        self.assertTrue(np.all(np.array([2, 3]) == self.cls.memory[0]['state']))

    def test_overwriting(self):
        self.assertTrue(np.all(np.array([48, 49]) == self.cls.memory[-1]['state']))
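# Reference only: the ReplayBuffer implementation exercised by the two test
# classes above is not included in this listing. The sketch below is a minimal
# stand-in that satisfies exactly those tests (dict-keyed entries, deque-style
# overwriting once buffer_size is exceeded, a past_size history window in
# get(), and batch_size-sized sampling). Its internals, including the deque,
# are assumptions; the project's actual buffer also exposes get_state_vector,
# is_sufficient, flatten_state_shape and array-valued sampling used by the
# Agent further below.
import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """Fixed-size experience buffer with a short state-history window (sketch)."""

    def __init__(self, buffer_size, batch_size, past_size):
        self.memory = deque(maxlen=buffer_size)  # oldest entries are overwritten
        self.batch_size = batch_size
        self.past_size = past_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append({'state': np.asarray(state), 'action': action,
                            'reward': reward, 'next_state': np.asarray(next_state),
                            'done': done})

    def __len__(self):
        return len(self.memory)

    def get(self, index):
        # Experience at `index` with its `past_size` predecessors stacked into 'state'.
        window = [self.memory[i]['state']
                  for i in range(index - self.past_size, index + 1)]
        item = dict(self.memory[index])
        item['state'] = np.stack(window)
        return item

    def sample(self):
        # Uniformly sample `batch_size` experiences that have a full history window.
        indices = random.sample(range(self.past_size, len(self.memory)), self.batch_size)
        return [self.get(i) for i in indices]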
def main(args):
    CUDA = torch.cuda.is_available()
    OUTPUT_RESULTS_DIR = './saver'

    # Environment setup
    init_set = np.array([0, 0, 1, 1, 0, 1])
    state_dim = 5
    action_dim = 1
    env = environment(env_new_model, init_set, state_dim=state_dim, action_dim=action_dim)
    action_bound = np.array([1, -1])

    # Local and target actor/critic networks
    actor = ActorNetwork(state_dim, action_dim, action_bound, args.actor_lr,
                         args.tau, args.seed)
    target_actor = ActorNetwork(state_dim, action_dim, action_bound, args.actor_lr,
                                args.tau, args.seed)
    critic = CriticNetwork(state_dim, action_dim, action_bound, args.critic_lr,
                           args.tau, args.l2_decay, args.seed)
    target_critic = CriticNetwork(state_dim, action_dim, action_bound, args.critic_lr,
                                  args.tau, args.l2_decay, args.seed)

    if CUDA:
        actor = actor.cuda()
        target_actor = target_actor.cuda()
        critic = critic.cuda()
        target_critic = target_critic.cuda()

    replay_buffer = ReplayBuffer(args.bufferlength, args.seed)

    agent = DDPGAgent(actor, target_actor, critic, target_critic, replay_buffer,
                      batch_size=args.batch_size, gamma=args.gamma, seed=args.seed,
                      episode_len=args.episode_len, episode_steps=args.episode_steps,
                      noise_mean=args.noise_mean, noise_th=args.noise_th,
                      noise_std=args.noise_std, noise_decay=args.noise_decay)
    agent.train(env)
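# Illustrative only: main(args) above reads a number of attributes from `args`.
# A minimal argparse front end consistent with those attribute names is sketched
# below; the flag names mirror the attributes read by main, but every default
# value here is an assumption, not taken from the original project.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description='DDPG training (sketch)')
    parser.add_argument('--actor_lr', type=float, default=1e-4)
    parser.add_argument('--critic_lr', type=float, default=1e-3)
    parser.add_argument('--tau', type=float, default=1e-3)
    parser.add_argument('--l2_decay', type=float, default=1e-2)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--bufferlength', type=int, default=100000)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--episode_len', type=int, default=1000)
    parser.add_argument('--episode_steps', type=int, default=200)
    parser.add_argument('--noise_mean', type=float, default=0.0)
    parser.add_argument('--noise_th', type=float, default=0.15)
    parser.add_argument('--noise_std', type=float, default=0.2)
    parser.add_argument('--noise_decay', type=float, default=0.99)
    return parser.parse_args()


if __name__ == '__main__':
    main(parse_args())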
class Agent:
    def __init__(self, cfg):
        # Replay memory
        self.memory = ReplayBuffer(**cfg['agent']['memory'])

        # Environment configuration
        self.action_shape = cfg['env']['action_shape']

        # Algorithm parameters
        self.exploration_mu, self.exploration_theta, self.exploration_sigma = cfg['agent']['noise']
        self.noise = OUNoise(self.action_shape, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)
        self.gamma = cfg['agent']['gamma']
        self.tau = cfg['agent']['tau']

        state_flatten_shape = [np.prod(self.memory.flatten_state_shape)]

        # Actor Model
        self.actor = Actor(state_flatten_shape, self.action_shape,
                           cfg['env']['action_range'], self.tau,
                           self.memory.batch_size, cfg['actor'])

        # Critic Model
        self.critic = Critic(state_flatten_shape, self.action_shape, self.tau,
                             cfg['critic'])

        # Flag & Counter
        self.add_noise = True
        self.episode = 0
        self.max_episode_explore = 100

    def init_actor_critic(self):
        # Initialize target models
        self.critic.copy_local_in_target()
        self.actor.copy_local_in_target()

    def reset(self):
        self.memory.reset_past()
        self.noise = OUNoise(self.action_shape, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        if done:
            self.reset()

    def act(self, state):
        self.last_state = state
        window_states = self.memory.get_state_vector(state).reshape(1, -1)
        action = self.actor.predict(window_states)

        if self.add_noise and self.episode < self.max_episode_explore:
            p = self.episode / self.max_episode_explore
            action = np.clip(action * p + (1 - p) * self.noise.sample(),
                             a_max=1, a_min=-1)
        return action

    def learn(self):
        if self.memory.is_sufficient():
            experiences = self.memory.sample()

            states = experiences['state'][:, 0].reshape(self.memory.batch_size, -1)
            actions = experiences['action']
            rewards = experiences['reward']
            dones = experiences['done']
            next_states = experiences['next_state'][:, 0].reshape(self.memory.batch_size, -1)

            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor.get_targets(next_states)
            Q_targets_next = self.critic.get_targets(next_states, actions_next)

            # Compute Q targets for current states and train the critic model
            Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
            critic_summaries = self.critic.fit(states, actions, Q_targets)

            # Train the actor model
            action_gradients = self.critic.get_actions_grad(states, actions)[0]
            actor_summaries = self.actor.fit(states, action_gradients)

            # Soft-update target models
            self.critic.soft_update()
            self.actor.soft_update()

            summary_reward = summary('sample_rewards', rewards)
            return critic_summaries, actor_summaries, summary_reward
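# Illustrative only: a sketch of how the Agent above is typically driven, based
# solely on its act/step/learn signatures. `env` is a hypothetical Gym-style
# environment, and incrementing agent.episode externally is an assumption
# (the class reads the counter but never advances it itself).
def run_episode(env, agent):
    state = env.reset()
    agent.episode += 1
    total_reward = 0.0
    done = False
    while not done:
        action = agent.act(state)                     # noisy during early episodes
        next_state, reward, done, _ = env.step(action)
        agent.step(action, reward, next_state, done)  # stores (last_state, action, ...)
        agent.learn()                                 # no-op until memory.is_sufficient()
        total_reward += reward
        state = next_state
    return total_reward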
class BaseAgent(object):
    def __init__(self, config, network_manager):
        self.norm_type = config.norm_type

        # Env config
        self.state_dim = config.state_dim
        self.state_min = config.state_min
        self.state_max = config.state_max

        self.action_dim = config.action_dim
        self.action_min = config.action_min
        self.action_max = config.action_max

        self.replay_buffer = ReplayBuffer(config.buffer_size, config.random_seed)
        self.batch_size = config.batch_size
        self.warmup_steps = config.warmup_steps
        self.gamma = config.gamma

        # to log useful stuff within agent
        self.write_log = config.write_log
        self.write_plot = config.write_plot

        self.network_manager = network_manager
        self.writer = config.writer
        self.config = config

    def start(self, state, is_train):
        return self.take_action(state, is_train, is_start=True)

    def step(self, state, is_train):
        return self.take_action(state, is_train, is_start=False)

    def take_action(self, state, is_train, is_start):
        # Warmup step not really used
        if self.replay_buffer.get_size() < self.warmup_steps:
            # use random seed
            # action = (np.random.random_sample(size=self.action_dim) - 0.5) * 2 * self.action_max[0]
            raise NotImplementedError
        else:
            action = self.network_manager.take_action(state, is_train, is_start)
        return action

    def get_value(self, s, a):
        raise NotImplementedError

    def update(self, state, next_state, reward, action, is_terminal, is_truncated):
        if not is_truncated:
            if not is_terminal:
                self.replay_buffer.add(state, action, reward, next_state, self.gamma)
            else:
                self.replay_buffer.add(state, action, reward, next_state, 0.0)

            if self.norm_type != 'none':
                self.network_manager.input_norm.update(np.array([state]))
            self.learn()

    def learn(self):
        if self.replay_buffer.get_size() > max(self.warmup_steps, self.batch_size):
            state, action, reward, next_state, gamma = self.replay_buffer.sample_batch(self.batch_size)
            self.network_manager.update_network(state, action, next_state, reward, gamma)
        else:
            return

    # Resets the agent between episodes. Should primarily be used to clear
    # traces or other temporally linked parameters.
    def reset(self):
        self.network_manager.reset()
class NAF(Agent):
    def __init__(self, env, params, random_seed):
        super(NAF, self).__init__(env)
        np.random.seed(random_seed)  # Random action selection
        random.seed(random_seed)

        # Experience Replay Buffer
        self.noise_scale = params['noise_scale']
        self.epsilon = params['epsilon']  # 0.3
        self.epsilon_decay = params['epsilon_decay']  # 0.9
        self.epsilon_decay_step = params['epsilon_decay_step']  # 100

        self.policyfunc = nafnn(env, params, random_seed)

        self.replay_buffer = ReplayBuffer(params['buffer_size'])
        self.batch_size = params['batch_size']
        self.gamma = params['gamma']  # 0.99
        self.warmup_steps = params['warmup_steps']

        # self.noise_t = np.zeros(self.actionDim)
        self.action_is_greedy = None
        self.eps_decay = True
        self.cum_steps = 0  # cumulative steps across episodes
        # print('agent params gamma, epsilon', self.gamma, self.epsilon)

    def update(self, S, Sp, r, a, episodeEnd):
        if not episodeEnd:
            self.replay_buffer.add(S, a, r, Sp, self.gamma)
            self.learn()
        else:
            self.replay_buffer.add(S, a, r, Sp, 0.0)
            self.learn()

    def learn(self):
        if self.replay_buffer.getSize() > max(self.warmup_steps, self.batch_size):
            s, a, r, sp, gamma = self.replay_buffer.sample_batch(self.batch_size)
            self.policyfunc.update_vars(s, a, sp, r, gamma)
        else:
            return
        # print r
        # self.policyfunc.performtest(s, a, sp, r, gamma)

    def takeAction(self, state, isTrain):
        # epsilon greedy
        meanact, covmat = self.policyfunc.takeAction(state)
        # print bestact
        if self.cum_steps < self.warmup_steps:
            action = np.random.uniform(self.actionMin, self.actionMax, self.actionDim)
            # action = self.env.instance.action_space.sample()
        else:
            if isTrain:
                action = np.random.multivariate_normal(meanact, self.noise_scale * covmat)
                # print self.noise_scale*covmat
            else:
                action = meanact
        self.cum_steps += 1
        # print self.actionMin, self.actionMax
        return np.clip(action, self.actionMin[0], self.actionMax[0])

    def getAction(self, state, isTrain):
        self.next_action = self.takeAction(state, isTrain)
        return self.next_action, self.action_is_greedy

    def start(self, state, isTrain):
        self.next_action = self.takeAction(state, isTrain)
        return self.next_action

    def reset(self):
        # self.erbuffer = []  # maybe do not reset erbuffer
        self.noise_t = np.zeros(self.actionDim)
        self.action_is_greedy = None
class BaseAgent(object):
    def __init__(self, config, network_manager):
        # Env config
        self.state_dim = config.state_dim
        self.state_min = config.state_min
        self.state_max = config.state_max

        self.action_dim = config.action_dim
        self.action_min = config.action_min
        self.action_max = config.action_max

        self.use_replay = config.use_replay
        if self.use_replay:
            self.replay_buffer = ReplayBuffer(config.buffer_size, config.random_seed)
        else:
            self.replay_buffer = None
        self.batch_size = config.batch_size
        self.warmup_steps = config.warmup_steps
        self.gamma = config.gamma

        # to log useful stuff within agent
        self.write_log = config.write_log
        self.write_plot = config.write_plot

        self.network_manager = network_manager
        self.writer = config.writer
        self.config = config

    def start(self, state, is_train):
        return self.take_action(state, is_train, is_start=True)

    def step(self, state, is_train):
        return self.take_action(state, is_train, is_start=False)

    def take_action(self, state, is_train, is_start):
        if self.use_replay and self.replay_buffer.get_size() < self.warmup_steps:
            # Currently not using warmup steps
            raise NotImplementedError
        else:
            action = self.network_manager.take_action(state, is_train, is_start)
        return action

    def get_value(self, s, a):
        raise NotImplementedError

    def update(self, state, next_state, reward, action, is_terminal, is_truncated):
        if not is_truncated:
            # if using replay buffer
            if self.use_replay:
                if not is_terminal:
                    self.replay_buffer.add(state, action, reward, next_state, self.gamma)
                else:
                    self.replay_buffer.add(state, action, reward, next_state, 0.0)
                self.learn()
            # if not using replay buffer
            else:
                if not is_terminal:
                    self.learn([state], [action], [reward], [next_state], [self.gamma])
                else:
                    self.learn([state], [action], [reward], [next_state], [0.0])

    def learn(self, state=None, action=None, reward=None, next_state=None, gamma=None):
        # if using replay, overwrite with batches
        if self.use_replay:
            if self.replay_buffer.get_size() > max(self.warmup_steps, self.batch_size):
                state, action, reward, next_state, gamma = self.replay_buffer.sample_batch(self.batch_size)
                self.network_manager.update_network(state, action, next_state, reward, gamma)
        else:
            assert state is not None
            self.network_manager.update_network(state, action, next_state, reward, gamma)

    # Resets the agent between episodes. Should primarily be used to clear
    # traces or other temporally linked parameters.
    def reset(self):
        self.network_manager.reset()
class MultiAgent:
    def __init__(self,
                 agent_count,
                 observation_size,
                 action_size,
                 train_config,
                 agent_config,
                 seed=None,
                 actor_model_states=None,
                 critic_model_states=None,
                 device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')):

        def create_brain(idx):
            return Brain(
                agent_count=agent_count,
                observation_size=observation_size,
                action_size=action_size,
                actor_optim_params=train_config['actor_optim_params'],
                critic_optim_params=train_config['critic_optim_params'],
                soft_update_tau=train_config['soft_update_tau'],
                discount_gamma=train_config['discount_gamma'],
                use_batch_norm=False,
                seed=seed,
                actor_network_states=actor_model_states[idx] if actor_model_states else None,
                critic_network_states=critic_model_states[idx] if critic_model_states else None,
                device=device)

        self.brains = [create_brain(i) for i in range(agent_count)]

        self.agent_count = agent_count
        self.observation_size = observation_size
        self.action_size = action_size
        self.train_config = train_config
        self.agent_config = agent_config
        self.device = device
        self._batch_size = train_config['mini_batch_size']
        self._update_every = train_config['update_every']

        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, train_config['buffer_size'],
                                   self._batch_size, device)
        self.t_step = 0

    def step(self, obs, actions, rewards, next_obs, dones):
        """Observe a transition for all agents and learn periodically from replay.

        :param obs: array of shape == (agent_count, observation_size)
        :param actions: array of shape == (agent_count, action_size)
        :param rewards: array of shape == (agent_count,)
        :param next_obs: array of shape == (agent_count, observation_size)
        :param dones: array of shape == (agent_count,)
        """
        self.memory.add(obs, actions, rewards, next_obs, dones.astype(np.uint8))

        self.t_step = (self.t_step + 1) % self._update_every
        if self.t_step == 0:
            if len(self.memory) > self._batch_size:
                self._learn()

    def act_torch(self, obs, target, noise=0.0, train=False):
        """Act based on the given batch of observations.

        :param obs: current observation, array of shape == (b, observation_size)
        :param target: True to act with the target networks
        :param noise: noise factor
        :param train: True for training mode else eval mode
        :return: actions for the given observations as per current policy
        """
        actions = [
            brain.act(obs[:, i], target, noise, train)
            for i, brain in enumerate(self.brains)
        ]
        actions = torch.stack(actions).transpose(1, 0)
        return actions

    def act(self, obs, target=False, noise=0.0):
        obs = torch.from_numpy(obs).float().to(self.device).unsqueeze(0)
        with torch.no_grad():
            actions = np.vstack(
                [a.cpu().numpy() for a in self.act_torch(obs, target, noise)])
        return actions

    def reset(self):
        for brain in self.brains:
            brain.reset()

    def _learn(self):
        experiences = self.memory.sample()
        experiences = self._tensor_experiences(experiences)
        observations, actions, rewards, next_observations, dones = experiences

        all_obs = self._flatten(observations)
        all_actions = self._flatten(actions)
        all_next_obs = self._flatten(next_observations)
        all_target_next_actions = self._flatten(
            self.act_torch(next_observations, target=True, train=False).contiguous())
        all_local_actions = self.act_torch(observations, target=False, train=True).contiguous()

        for i, brain in enumerate(self.brains):
            # update critics
            brain.update_critic(rewards[:, i].unsqueeze(-1), dones[:, i].unsqueeze(-1),
                                all_obs, all_actions, all_next_obs,
                                all_target_next_actions)

            # update actors
            all_local_actions_agent = all_local_actions.detach()
            all_local_actions_agent[:, i] = all_local_actions[:, i]
            all_local_actions_agent = self._flatten(all_local_actions_agent)
            brain.update_actor(all_obs, all_local_actions_agent)

            # update targets
            brain.update_targets()

    def _tensor_experiences(self, experiences):
        ob, actions, rewards, next_ob, dones = \
            [torch.from_numpy(e).float().to(self.device) for e in experiences]
        return ob, actions, rewards, next_ob, dones

    @staticmethod
    def _flatten(tensor):
        b, n_agents, d = tensor.shape
        return tensor.view(b, n_agents * d)