def train_step(self, rb: ReplayBuffer, sample_size=300):
    # Loss calculation over a sampled batch of transitions
    trans_sts = rb.sample(sample_size)
    states = torch.stack([trans.state_tensor for trans in trans_sts]).to(self.device)
    next_states = torch.stack(
        [trans.next_state_tensor for trans in trans_sts]).to(self.device)
    not_done = torch.Tensor([trans.not_done_tensor for trans in trans_sts]).to(self.device)
    actions = [trans.action for trans in trans_sts]
    rewards = torch.stack([trans.reward_tensor for trans in trans_sts]).to(self.device)

    with torch.no_grad():
        qvals_predicted = self.tgt_model(next_states).max(-1)

    self.model.optimizer.zero_grad()
    qvals_current = self.model(states)
    one_hot_actions = torch.nn.functional.one_hot(
        torch.LongTensor(actions), self.num_actions).to(self.device)
    loss = ((rewards + (not_done * qvals_predicted.values) -
             torch.sum(qvals_current * one_hot_actions, -1))**2).mean()
    loss.backward()
    self.model.optimizer.step()
    return loss.detach().item()
def train(self, replay_buffer: ReplayBuffer):
    """Train the agent (TD3-style update)."""
    self.total_it += 1

    # Sample a batch of experiences from the replay buffer
    state, action, reward, next_state, done = replay_buffer.sample()

    with torch.no_grad():
        # Select action according to the target policy and add clipped noise;
        # noise is only used in training to reduce overestimation
        noise = (torch.randn_like(action) * self.policy_noise).clamp(
            -self.noise_clip, self.noise_clip)
        next_action = (self.actor_target(next_state) + noise).clamp(
            -self.max_action, self.max_action)

        # Compute the target Q value
        target_Q1, target_Q2 = self.critic_target(next_state, next_action)  # Q1, Q2
        target_Q = torch.min(target_Q1, target_Q2)
        target_Q = reward + (1 - done) * self.discount * target_Q  # TD target

    # Get current Q estimates
    current_Q1, current_Q2 = self.critic(state, action)  # Q1, Q2

    # Compute critic loss using MSE
    critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

    # Optimize the critic
    self.critic_optimizer.zero_grad()
    critic_loss.backward()
    self.critic_optimizer.step()

    # Delayed policy updates (DDPG baseline = 1)
    if self.total_it % self.policy_freq == 0:
        # Compute actor loss
        actor_loss = -self.critic(state, self.actor(state))[0].mean()

        # Optimize the actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Soft update of the frozen target networks
        for param, target_param in zip(self.critic.parameters(),
                                       self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)
        for param, target_param in zip(self.actor.parameters(),
                                       self.actor_target.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)
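# Hedged usage sketch (not part of the original source): one possible outer loop
# that fills the replay buffer and calls train() once per environment step. The
# names env, agent, replay_buffer, select_action, and add() are assumptions about
# the surrounding code, not confirmed APIs.
#
# state, done = env.reset(), False
# for t in range(int(1e6)):
#     action = agent.select_action(state)          # assumed: actor forward + exploration noise
#     next_state, reward, done, _ = env.step(action)
#     replay_buffer.add(state, action, reward, next_state, float(done))
#     state = env.reset() if done else next_state
#     if t >= 25_000:                              # warm-up before learning (a common TD3 default)
#         agent.train(replay_buffer)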
class DDPG:
    def __init__(self, action_dim, action_bound, tau, lr_a, lr_c, state_dim,
                 gamma, batch_size):
        self.target = tf.placeholder(tf.float32, [None, 1], 'critic_target')
        self.s = tf.placeholder(tf.float32, [None, state_dim], 'state')
        self.s_ = tf.placeholder(tf.float32, [None, state_dim], 'next_state')
        self.memory = ReplayBuffer(max_size=10000)
        self.noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))
        self.batch_size = batch_size
        self.gamma = gamma
        self.sess = tf.Session()
        self.actor = Actor(self.sess, self.s, self.s_, action_dim,
                           action_bound, tau, lr_a, f1_units=300)
        self.critic = Critic(self.sess, lr_c, self.s, self.s_, self.actor.a,
                             self.actor.a_, self.target, tau, gamma,
                             state_dim, action_dim, f1_units=300)
        self.actor.add_grad_to_graph(self.critic.a_g)
        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, s):
        a = self.actor.choose_action(s)
        var = self.noise()
        a = a + var
        return a[0]

    def update_target_networks(self):
        self.sess.run([self.actor.replace, self.critic.replace])

    def store(self, s, a, r, s_, done):
        self.memory.store(s, a, r, s_, done)

    def learn(self):
        bs, ba, br, bs_, _ = self.memory.sample(self.batch_size)
        q_ = self.sess.run(self.critic.q_, {self.s_: bs_})
        br = br[:, np.newaxis]
        target_critic = br + self.gamma * q_
        self.critic.learn(bs, ba, target_critic)
        self.actor.learn(bs)
        self.update_target_networks()
class Simple_DQNAgent(Agent):
    """
    This agent can handle the networks ConvDQN and LinearDQN.
    It uses a single DQN and a replay buffer for learning.
    """

    def __init__(self, env, network, learning_rate, gamma, eps_max, eps_min,
                 eps_dec, buffer_size):
        super().__init__(env, network, learning_rate, gamma, eps_max, eps_min,
                         eps_dec)
        if self.network == "SimpleConvDQN":
            self.model = ConvDQN(env.env_shape, env.no_of_actions)
        elif self.network == "LinearDQN":
            self.model = LinearDQN(env.env_shape, env.no_of_actions)
        self.replay_buffer = ReplayBuffer(max_size=buffer_size,
                                          input_shape=env.env_shape)

    def get_action(self, state):
        # Epsilon-greedy action selection (uniform draw in [0, 1))
        if np.random.random() <= self.eps:
            return self.env.sample_action()
        else:
            state = T.tensor(state, dtype=T.float).unsqueeze(0).to(self.model.device)
            actions = self.model.forward(state)
            return T.argmax(actions).item()

    def update(self, batch_size):
        self.model.optimizer.zero_grad()
        batch = self.replay_buffer.sample(batch_size)
        states, actions, rewards, next_states, dones = batch
        states_t = T.tensor(states, dtype=T.float).to(self.model.device)
        actions_t = T.tensor(actions).to(self.model.device)
        rewards_t = T.tensor(rewards, dtype=T.float).to(self.model.device)
        next_states_t = T.tensor(next_states, dtype=T.float).to(self.model.device)
        dones_t = T.tensor(dones, dtype=T.float).to(self.model.device)

        curr_Q = self.model.forward(states_t).gather(1, actions_t.unsqueeze(1))
        curr_Q = curr_Q.squeeze(1)
        # The bootstrap target is detached and masked on terminal transitions
        next_Q = self.model.forward(next_states_t).detach()
        max_next_Q = T.max(next_Q, 1)[0]
        expected_Q = rewards_t + self.gamma * max_next_Q * (1 - dones_t)

        loss = self.model.MSE_loss(curr_Q, expected_Q).to(self.model.device)
        loss.backward()
        self.model.optimizer.step()
        self.dec_eps()

    def learn(self, state, action, reward, next_state, done, batch_size):
        self.replay_buffer.store_transition(state, action, reward, next_state, done)
        if len(self.replay_buffer) > batch_size:
            self.update(batch_size)
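# Hedged usage sketch (not part of the original source): a minimal episode loop
# for the agent above, assuming the custom env wrapper used in this repo also
# exposes Gym-style reset()/step(); those calls are assumptions.
#
# agent = Simple_DQNAgent(env, network="LinearDQN", learning_rate=1e-3, gamma=0.99,
#                         eps_max=1.0, eps_min=0.05, eps_dec=1e-4, buffer_size=50_000)
# for episode in range(500):
#     state, done = env.reset(), False
#     while not done:
#         action = agent.get_action(state)
#         next_state, reward, done, _ = env.step(action)
#         agent.learn(state, action, reward, next_state, done, batch_size=64)
#         state = next_state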
def generator(self):
    self.policy.init()
    time_step = self.env.reset()
    episode_reward = 0
    episode_time_steps = 0
    episode_num = 0

    state_shape = self.env.observation_spec().shape
    action_dim = self.env.action_spec().shape[0]
    replay_buffer = ReplayBuffer(state_shape, action_dim, max_size=self.buffer_size)
    next(self.rng)

    for t in (range(int(self.max_time_steps))
              if self.max_time_steps else itertools.count()):
        episode_time_steps += 1
        state = time_step.observation

        # Select action randomly or according to policy
        action = yield state

        # Perform action
        time_step = self.env.step(action)
        done_bool = float(time_step.last())

        # Store data in replay buffer
        replay_buffer.add(state, action, time_step.observation,
                          time_step.reward, done_bool)
        episode_reward += time_step.reward

        # Train agent after collecting sufficient data
        if t >= self.start_time_steps:
            for _ in range(self.train_steps):
                data = replay_buffer.sample(next(self.rng), self.batch_size)
                self.policy.update(**vars(data))

        if time_step.last():
            # +1 to account for 0 indexing; episode_time_steps needs no +1 since
            # it is incremented even on the final (done) step
            self.report(
                time_steps=t + 1,
                episode=episode_num + 1,
                episode_time_steps=episode_time_steps,
                reward=episode_reward,
            )
            # Reset environment
            time_step = self.env.reset()
            episode_reward = 0
            episode_time_steps = 0
            episode_num += 1
class MADDPG():
    def __init__(self, num_agents=2, state_size=24, action_size=2, random_seed=2):
        self.num_agents = num_agents
        self.agents = [
            Agent(state_size, action_size, random_seed)
            for i in range(self.num_agents)
        ]
        self.memory = ReplayBuffer(action_size,
                                   buffer_size=BUFFER_SIZE,
                                   batch_size=MINI_BATCH,
                                   seed=random_seed)

    def act(self, states, add_noise=True):
        actions = []
        for state, agent in zip(states, self.agents):
            action = agent.act(state, add_noise)
            actions.append(action)
        return actions

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def step(self, states, actions, rewards, next_states, dones):
        for i in range(self.num_agents):
            self.memory.add(states[i], actions[i], rewards[i],
                            next_states[i], dones[i])
        if len(self.memory) > MINI_BATCH:
            for _ in range(self.num_agents):
                experience = self.memory.sample()
                self.learn(experience)

    def learn(self, experiences, gamma=GAMMA):
        for agent in self.agents:
            agent.learn(experiences, gamma)
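# Hedged usage sketch (not part of the original source): assumes a two-agent
# environment wrapper (e.g. a Unity Tennis-style task) whose reset()/step()
# return per-agent arrays; that wrapper API is an assumption.
#
# maddpg = MADDPG(num_agents=2, state_size=24, action_size=2, random_seed=2)
# for episode in range(2000):
#     states = env.reset()
#     maddpg.reset()
#     while True:
#         actions = maddpg.act(states)
#         next_states, rewards, dones = env.step(actions)   # assumed wrapper API
#         maddpg.step(states, actions, rewards, next_states, dones)
#         states = next_states
#         if any(dones):
#             break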
def play_and_train(env, agent, t_max=10**4,
                   replay_buffer: ReplayBuffer = None,
                   replay_batch_size: int = None):
    """
    This function should
    - run a full game, with actions given by the agent's e-greedy policy
    - train the agent using agent.update(...) whenever possible
    - return the total reward
    """
    total_reward = 0.0
    s = env.reset()

    for t in range(t_max):
        # Get the agent to pick an action given state s.
        a = agent.get_action(s)
        next_s, r, done, _ = env.step(a)

        # Train (update) the agent for state s
        agent.update(s, a, r, next_s)

        if replay_buffer is not None:
            # Store the current <s, a, r, s'> transition in the buffer
            replay_buffer.add(s, a, r, next_s, done)

            # Sample replay_batch_size random transitions from the replay buffer,
            # then update the agent on each of them in a loop
            s_, a_, r_, next_s_, done_ = replay_buffer.sample(replay_batch_size)
            for i in range(replay_batch_size):
                agent.update(s_[i], a_[i], r_[i], next_s_[i])

        s = next_s
        total_reward += r
        if done:
            break

    return total_reward
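# Hedged usage sketch (not part of the original source): assumes a Gym-style
# tabular environment and a hypothetical QLearningAgent with these constructor
# arguments; both names are assumptions for illustration only.
#
# import gym
#
# env = gym.make("Taxi-v3")
# agent = QLearningAgent(alpha=0.5, epsilon=0.25, discount=0.99,
#                        get_legal_actions=lambda s: range(env.action_space.n))
# buffer = ReplayBuffer(2000)
# rewards = [play_and_train(env, agent, replay_buffer=buffer, replay_batch_size=32)
#            for _ in range(1000)]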
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.name = "DDPG"
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high, 'actor_local')
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high, 'actor_target')

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, 'critic_local')
        self.critic_target = Critic(self.state_size, self.action_size, 'critic_target')

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001   # for soft update of target parameters

        # Reward counters
        self.total_reward = 0
        self.n_steps = 0

    def load(self):
        self.actor_local.load()
        self.actor_target.load()
        self.critic_local.load()
        self.critic_target.load()
        print("Agent's weights loaded from disk.")

    def save(self):
        self.actor_local.save()
        self.actor_target.save()
        self.critic_local.save()
        self.critic_target.save()
        print("Agent's weights saved to disk.")

    def reset_episode(self):
        self.total_reward = 0
        self.n_steps = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Add reward to total
        self.total_reward += reward
        self.n_steps += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state, add_noise=True):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        # Hack, rescale rotor revs to +-5 range from average
        # rev_mean = np.mean(action)
        # action = (action-450)/450
        # action *= 50
        # action += rev_mean
        if add_noise:
            action += self.noise.sample()  # additive noise for exploration
        return list(action)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
def q_learning_fun(env, q_func, optimizer_spec, exploration,
                   replay_buffer_size=1000000, batch_size=32, gamma=0.99,
                   learning_starts=50000, learning_freq=4,
                   max_learning_steps=1000000, frame_history_len=4,
                   target_update_freq=10000):
    if not os.path.isdir("./dqn"):
        os.mkdir("./dqn")

    img_h, img_w, img_c = env.observation_space.shape
    input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epsilon_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            with torch.no_grad():
                ret = model(obs).data.max(1)[1].cpu()
            return ret
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target Q function and Q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct the Q network optimizer
    optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    save_best_mean_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000
    SAVE_EVERY_N_STEPS = 2000000

    for t in count():
        ### Check stopping criterion
        if env.get_total_steps() >= max_learning_steps:
            break

        last_idx = replay_buffer.store_frame(last_obs)
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose a random action while not yet learning
        if t > learning_starts:
            action = select_epsilon_greedy_action(Q, recent_observations, t)[0]
        else:
            action = random.randrange(num_actions)

        obs, reward, done, _ = env.step(action)
        # Clip rewards between -1 and 1
        reward = max(-1.0, min(reward, 1.0))
        # Store other info in replay memory
        replay_buffer.store_effect(last_idx, action, reward, done)

        # Reset the environment when reaching an episode boundary.
        if done:
            obs = env.reset()
        last_obs = obs

        if (t > learning_starts and t % learning_freq == 0):
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = \
                replay_buffer.sample(batch_size)

            # Convert numpy ndarrays to torch tensors for the update
            obs_batch = torch.from_numpy(obs_batch).type(dtype) / 255.0
            act_batch = torch.from_numpy(act_batch).long()
            rew_batch = torch.from_numpy(rew_batch)
            next_obs_batch = torch.from_numpy(next_obs_batch).type(dtype) / 255.0
            not_done_mask = torch.from_numpy(1 - done_mask).type(dtype)
            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Update rule
            cur_all_Q_values = Q(obs_batch)
            cur_act_Q_values = cur_all_Q_values.gather(
                1, act_batch.unsqueeze(1)).squeeze()
            next_all_target_Q_values = target_Q(next_obs_batch).detach()
            next_max_target_Q_values = next_all_target_Q_values.max(1)[0]
            next_max_target_Q_values = not_done_mask * next_max_target_Q_values
            target = rew_batch + (gamma * next_max_target_Q_values)

            error = target - cur_act_Q_values
            clipped_error = error.clamp(-1, 1)
            d_error = clipped_error * -1.0

            optimizer.zero_grad()
            cur_act_Q_values.backward(d_error.data)
            optimizer.step()
            num_param_updates += 1

            # Periodically update the target network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        # Log progress and save statistics
        episode_rewards = env.get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('./dqn/statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % './dqn/statistics.pkl')

            if save_best_mean_reward < best_mean_episode_reward:
                save_best_mean_reward = best_mean_episode_reward
                torch.save(Q.state_dict(), './dqn/best_model.pth')

        if t % SAVE_EVERY_N_STEPS == 0:
            torch.save(Q.state_dict(), './dqn/n_steps_%d.pth' % t)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, lr_actor=LR_ACTOR,
                 lr_critic=LR_CRITIC, random_seed=42, num_agents=1):
        """Initialize Agent object.

        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            lr_actor (float): Learning rate for actor model
            lr_critic (float): Learning rate for critic model
            random_seed (int): Random seed
            num_agents (int): Number of agents

        Returns
        ======
            None
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.num_agents = num_agents
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # Actor network
        self.actor = ActorNetwork(lr_actor, state_size, action_size,
                                  random_seed, name="actor")
        self.actor_target = ActorNetwork(lr_actor, state_size, action_size,
                                         random_seed, name="actor_target")
        self.soft_update(self.actor, self.actor_target, tau=1)

        # Critic network
        self.critic = CriticNetwork(lr_critic, state_size, action_size,
                                    random_seed, name="critic")
        self.critic_target = CriticNetwork(lr_critic, state_size, action_size,
                                           random_seed, name="critic_target")
        self.soft_update(self.critic, self.critic_target, tau=1)

        # Noise process
        self.noise = OUActionNoise(mu=np.zeros(action_size))

        # Replay buffer memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        # Support for multi-agent learners
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Update timestep to learn
        self.t_step = (self.t_step + 1) % UPDATE_EVERY

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and self.t_step == 0:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = T.from_numpy(state).float().to(device)
        self.actor.eval()
        with T.no_grad():
            actions = self.actor(states).cpu().data.numpy()
        self.actor.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        T.nn.utils.clip_grad_norm_(self.critic.parameters(), 1.0)
        self.critic.optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor(states)
        actor_loss = -self.critic(states, actions_pred).mean()
        # Minimize the loss
        self.actor.optimizer.zero_grad()
        actor_loss.backward()
        self.actor.optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic, self.critic_target, TAU)
        self.soft_update(self.actor, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_models(self):
        """Save model weights."""
        self.actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.actor_target.save_checkpoint()
        self.critic_target.save_checkpoint()

    def load_models(self):
        """Load model weights."""
        self.actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.actor_target.load_checkpoint()
        self.critic_target.load_checkpoint()
class AgentDDPG():
    def __init__(self, env):
        """
        :param env: (class instance) Environment providing the goal and reward
        """
        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        self.action_low = env.action_space.low
        self.action_high = env.action_space.high
        self.score = 0.0
        self.best = 0.0

        # Instances of the policy function (actor) and the value function (critic)
        # Actor local and target
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        # Save the actor model for future use
        actor_local_model_yaml = self.actor_local.model.to_yaml()
        with open("actor_local_model.yaml", "w") as yaml_file:
            yaml_file.write(actor_local_model_yaml)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic local and target
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize the target models with the local models
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Initialize the Ornstein-Uhlenbeck noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Initialize the replay memory
        self.buffer_size = 100000
        self.batch_size = 64  # original 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Parameters for the algorithm
        self.gamma = 0.99  # discount factor
        self.tau = 0.01    # soft update of target parameters

    # The actor can reset the episode
    def reset_episode(self):
        # Total reward and step count go back to 0
        self.total_reward = 0.0
        self.count = 0
        # Reset the noise process
        self.noise.reset()
        # Get a new state from the environment
        state = self.env.reset()
        # Keep the state obtained from the environment as the last state
        self.last_state = state
        # Return the state obtained from the environment
        return state

    # The actor interacts with the environment
    def step(self, action, reward, next_state, done):
        # Add the reward of this time step to the total reward
        self.total_reward += reward
        # Increase the count of rewards received in the episode
        self.count += 1
        # Store the previous state in the replay buffer
        self.memory.add(self.last_state, action, reward, next_state, done)
        # Check whether there are enough samples to produce a batch and learn from it
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            # Train the networks using the experiences
            self.learn(experiences)
        # Roll over the last state
        self.last_state = next_state

    # The actor decides what to do based on the policy
    def act(self, state):
        # Given a state, return the action recommended by the policy.
        # Reshape the state to fit the Keras model input.
        state = np.reshape(state, newshape=[-1, self.state_size])
        # Pass the state to the actor local model to get the action
        # recommended by the policy for this state
        action = self.actor_local.model.predict(state)[0]
        # Because we are exploring, add some noise to the action vector
        return list(action + self.noise.sample())

    # The actor's learning logic, called when the agent takes a step to learn
    def learn(self, experiences):
        """
        Learning means that the network parameters need to be updated
        using the batch of experiences.
        The networks learn from stored experiences, not directly from
        interaction with the environment.
        """
        # Reshape the experience tuples into separate arrays of states, actions,
        # rewards, next_states and dones; every member of the tuple is converted
        # into a column vector.
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # First we pass a batch of next states to the actor so it tells us which
        # actions to execute; we use the actor target network instead of the
        # actor local network to keep the learning targets stable.
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        # The critic evaluates the actions taken by the actor and generates the
        # Q(s, a) value of those actions. These (state, action) pairs come from
        # the replay buffer, not from interacting with the environment.
        # Remember that the critic's inputs are states and actions.
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # With Q_targets_next (a vector of action values Q(s', a') for randomly
        # sampled next states from the replay buffer) we calculate the target
        # Q(s, a) using the one-step TD equations. Terminal states get only the
        # reward; non-terminal states also include the bootstrapped value.
        # This trains the critic in a supervised-learning fashion.
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train the actor
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update the target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights)
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    def get_episode_score(self):
        """
        Calculate the episode score.
        :return: None
        """
        # Update the score and the best score
        self.score = self.total_reward / float(self.count) if self.count else 0.0
        if self.score > self.best:
            self.best = self.score

    def save_model_weights(self, actor_model):
        actor_model.model.save_weights('weights.h5')
class DuelingDQNAgent(Agent):
    def __init__(self, env, network, learning_rate, gamma, eps_max, eps_min,
                 eps_dec, buffer_size, replace_cnt):
        super().__init__(env, network, learning_rate, gamma, eps_max, eps_min,
                         eps_dec)
        self.replay_buffer = ReplayBuffer(max_size=buffer_size,
                                          input_shape=env.env_shape)
        self.learn_step_counter = 0
        self.replace_cnt = replace_cnt
        self.q_eval = DuelingDQN(env.env_shape, env.no_of_actions)
        self.q_target = DuelingDQN(env.env_shape, env.no_of_actions)

    def get_action(self, state):
        # Epsilon-greedy action selection (uniform draw in [0, 1))
        if np.random.random() <= self.eps:
            return self.env.sample_action()
        else:
            state = T.tensor(state, dtype=T.float).unsqueeze(0).to(self.q_eval.device)
            _, advantage = self.q_eval.forward(state)
            return T.argmax(advantage).item()

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_cnt == 0:
            self.q_target.load_state_dict(self.q_eval.state_dict())

    def get_batch_tensors(self, batch_size):
        batch = self.replay_buffer.sample(batch_size)
        states, actions, rewards, next_states, dones = batch
        states_t = T.tensor(states, dtype=T.float).to(self.q_eval.device)
        actions_t = T.tensor(actions).to(self.q_eval.device)
        rewards_t = T.tensor(rewards, dtype=T.float).to(self.q_eval.device)
        next_states_t = T.tensor(next_states, dtype=T.float).to(self.q_eval.device)
        return states_t, actions_t, rewards_t, next_states_t

    def update(self, batch_size):
        states_t, actions_t, rewards_t, next_states_t = self.get_batch_tensors(batch_size)
        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()

        indices = np.arange(batch_size)
        # Combine state value and mean-centered advantages for the current states
        Vs, As = self.q_eval.forward(states_t)
        curr_Q = T.add(Vs, (As - As.mean(dim=1, keepdim=True)))[indices, actions_t]
        # ... and for the next states using the target network
        Vns, Ans = self.q_target.forward(next_states_t)
        max_next_Q = T.add(Vns, (Ans - Ans.mean(dim=1, keepdim=True))).max(1)[0]
        expected_Q = rewards_t + self.gamma * max_next_Q

        loss = self.q_eval.MSE_loss(curr_Q, expected_Q).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1
        self.dec_eps()

    def learn(self, state, action, reward, next_state, done, batch_size):
        self.replay_buffer.store_transition(state, action, reward, next_state, done)
        if len(self.replay_buffer) > batch_size:
            self.update(batch_size)
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001   # for soft update of target parameters

        # Variables to store best score and scores
        self.best_score = -np.inf
        self.score_list = []

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state
        self.last_state = next_state

        # Track rewards
        self.total_reward += reward
        self.count += 1
        if done:
            # Average total reward by step count
            self.score = self.total_reward / float(self.count) if self.count else 0.0
            # Store scores and update the best score
            self.score_list.append(self.score)
            if self.score > self.best_score:
                self.best_score = self.score

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
def train(target_vars, saver, sess, logger, resume_iter, env):
    tot_iter = int(FLAGS.nsteps // FLAGS.num_env)

    X = target_vars['X']
    X_NOISE = target_vars['X_NOISE']
    train_op = target_vars['train_op']
    loss_ml = target_vars['loss_ml']
    x_grad = target_vars['x_grad']
    x_mod = target_vars['x_mod']
    action_grad = target_vars['action_grad']
    X_START = target_vars['X_START']
    X_END = target_vars['X_END']
    X_PLAN = target_vars['X_PLAN']
    ACTION_PLAN = target_vars['ACTION_PLAN']
    ACTION_LABEL = target_vars['ACTION_LABEL']
    ACTION_NOISE = target_vars['ACTION_NOISE_LABEL']
    x_joint = target_vars['x_joint']
    actions = target_vars['actions']
    energy_pos = target_vars['energy_pos']
    energy_neg = target_vars['energy_neg']
    loss_total = target_vars['loss_total']
    dyn_loss = target_vars['dyn_loss']
    dyn_dist = target_vars['dyn_dist']

    ob = env.reset()[:, None, None, :]

    output = [train_op, x_mod]
    log_output = [
        train_op, dyn_loss, dyn_dist, energy_pos, energy_neg, loss_ml,
        loss_total, x_grad, action_grad, x_mod
    ]
    print(log_output)

    replay_buffer = ReplayBuffer(1000000)
    pos_replay_buffer = ReplayBuffer(1000000)

    epinfos = []
    points = []
    total_obs = []

    for itr in range(resume_iter, tot_iter):
        x_plan = np.random.uniform(
            -1.0, 1.0, (FLAGS.num_env, FLAGS.plan_steps, 1, FLAGS.latent_dim))
        action_plan = np.random.uniform(-1, 1, (FLAGS.num_env, FLAGS.plan_steps, 2))

        if FLAGS.datasource == "maze":
            x_end = np.tile(np.array([[0.7, -0.8]]), (FLAGS.num_env, 1))[:, None, None, :]
        elif FLAGS.datasource == "reacher":
            x_end = np.tile(np.array([[0.7, 0.5]]), (FLAGS.num_env, 1))[:, None, None, :]
        else:
            x_end = np.tile(np.array([[0.5, 0.5]]), (FLAGS.num_env, 1))[:, None, None, :]

        x_traj, traj_actions = sess.run([x_joint, actions], {
            X_START: ob,
            X_PLAN: x_plan,
            X_END: x_end,
            ACTION_PLAN: action_plan
        })

        # Add some amount of exploration into predicted actions
        # traj_actions = traj_actions + np.random.uniform(-0.1, 0.1, traj_actions.shape)
        # traj_actions = np.clip(traj_actions, -1, 1)

        if FLAGS.debug:
            print(x_traj[0])

        obs = [ob[:, 0, 0, :]]
        dones = []
        diffs = []

        for i in range(traj_actions.shape[1] - 1):
            if FLAGS.random_action:
                action = np.random.uniform(-1, 1, traj_actions[:, i].shape)
            else:
                action = traj_actions[:, i]

            ob, _, done, infos = env.step(action)

            if i == 0:
                print(x_traj[0, 0], x_traj[0, 1], ob[0])

            target_ob = x_traj[:, i + 1]
            print("Abs dist: ", np.mean(np.abs(ob - target_ob)))

            dones.append(done)
            obs.append(ob)

            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo:
                    epinfos.append(maybeepinfo)

            diffs.append(np.abs(x_traj[:, i + 1] - ob).mean())
            ob = ob[:, None, None, :]

        dones = np.array(dones).transpose()
        obs = np.stack(obs, axis=1)[:, :, None, :]

        if FLAGS.heatmap:
            total_obs.append(obs.reshape((-1, FLAGS.latent_dim)))

        action, ob_pair = parse_valid_obs(obs, traj_actions, dones)

        # x_noise = np.stack([x_traj[:, :-1], x_traj[:, 1:]], axis=2)
        x_noise = np.stack([x_traj[:, :10], x_traj[:, 1:11]], axis=2)
        s = x_noise.shape
        x_noise_neg = x_noise.reshape((s[0] * s[1], s[2], s[3], s[4]))

        action_noise_neg = traj_actions[:, :-1]
        s = action_noise_neg.shape
        action_noise_neg = action_noise_neg.reshape((s[0] * s[1], s[2]))

        traj_action_encode = action.reshape((-1, 1, 1, FLAGS.action_dim))
        encode_data = np.concatenate([
            ob_pair,
            np.tile(traj_action_encode, (1, FLAGS.total_frame, 1, 1))
        ], axis=3)
        pos_replay_buffer.add(encode_data)

        if len(pos_replay_buffer) > FLAGS.num_env * FLAGS.plan_steps and FLAGS.replay_batch:
            sample_data = pos_replay_buffer.sample(FLAGS.num_env * FLAGS.plan_steps)
            sample_ob = sample_data[:, :, :, :-FLAGS.action_dim]
            sample_actions = sample_data[:, 0, 0, -FLAGS.action_dim:]
            ob_pair = np.concatenate([ob_pair, sample_ob], axis=0)
            action = np.concatenate([action, sample_actions], axis=0)

        feed_dict = {
            X: ob_pair,
            X_NOISE: x_noise_neg,
            ACTION_NOISE: action_noise_neg,
            ACTION_LABEL: action
        }

        batch_size = x_noise_neg.shape[0]
        if FLAGS.replay_batch and len(replay_buffer) > batch_size and not FLAGS.ff_model:
            replay_batch = replay_buffer.sample(int(batch_size / 2.))
            # replay_mask = (np.random.uniform(0, 1, (batch_size)) > 0.95)
            # feed_dict[X_NOISE][replay_mask] = replay_batch[replay_mask]
            feed_dict[X_NOISE] = np.concatenate(
                [feed_dict[X_NOISE], replay_batch], axis=0)

        if itr % FLAGS.log_interval == 0:
            _, dyn_loss, dyn_dist, e_pos, e_neg, loss_ml, loss_total, x_grad, \
                action_grad, x_mod = sess.run(log_output, feed_dict=feed_dict)

            kvs = {}
            kvs['e_pos'] = e_pos.mean()
            kvs['e_neg'] = e_neg.mean()
            kvs['loss_ml'] = loss_ml.mean()
            kvs['loss_total'] = loss_total.mean()
            kvs['x_grad'] = np.abs(x_grad).mean()
            kvs['action_grad'] = np.abs(action_grad).mean()
            kvs['dyn_loss'] = dyn_loss.mean()
            kvs['dyn_dist'] = np.abs(dyn_dist).mean()
            kvs['iter'] = itr
            kvs["train_episode_length_mean"] = safemean(
                [epinfo['l'] for epinfo in epinfos])
            kvs["diffs"] = diffs[-1]
            epinfos = []

            string = "Obtained a total of "
            for key, value in kvs.items():
                string += "{}: {}, ".format(key, value)
            print(string)
            logger.writekvs(kvs)
        else:
            _, x_mod = sess.run(output, feed_dict=feed_dict)

        if FLAGS.replay_batch and (x_mod is not None):
            replay_buffer.add(x_mod)
            replay_buffer.add(ob_pair)

        if itr % FLAGS.save_interval == 0:
            saver.save(sess, osp.join(FLAGS.logdir, FLAGS.exp, 'model_{}'.format(itr)))

        if FLAGS.heatmap and itr == 100:
            total_obs = np.concatenate(total_obs, axis=0)
            # total_obs = total_obs[np.random.permutation(total_obs.shape[0])[:1000000]]
            sns.kdeplot(data=total_obs[:, 0], data2=total_obs[:, 1], shade=True)
            plt.savefig("kde.png")
            assert False
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed=0, double_dqn=False,
                 dueling=False, per=False, per_args=(0.2, 0.01, 2e-5)):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            double_dqn (bool): whether to implement Double DQN (default=False)
            dueling (bool): whether to implement Dueling DQN
            per (bool): whether to implement Prioritized Experience Replay
            per_args (tuple): a, beta, beta_increment for PER
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.double_dqn = double_dqn
        self.per = per
        self.gamma = GAMMA

        # Output name for checkpoints
        self.output_name = ''
        self.output_name += '_double' if double_dqn else ''
        self.output_name += '_dueling' if dueling else ''
        self.output_name += '_per' if per else ''

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed,
                                       dueling=dueling).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed,
                                        dueling=dueling).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        if self.per:
            self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE,
                                                  BATCH_SIZE, seed, *per_args)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def train(self, env, n_episodes=1000, max_t=1000, eps_start=1.0,
              eps_end=0.01, eps_decay=0.995):
        """Deep Q-Learning.

        Params
        ======
            env (UnityEnvironment): Bananas environment
            n_episodes (int): maximum number of training episodes
            max_t (int): maximum number of timesteps per episode
            eps_start (float): starting value of epsilon, for epsilon-greedy action selection
            eps_end (float): minimum value of epsilon
            eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        """
        # Get the default brain
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]

        # List containing scores from each episode
        scores = []
        # List containing window-averaged scores
        avg_scores = []
        # Last 100 scores
        scores_window = deque(maxlen=100)
        # Initialize epsilon
        eps = eps_start

        for i_episode in range(1, n_episodes + 1):
            env_info = env.reset(train_mode=True)[brain_name]
            state = env_info.vector_observations[0]
            score = 0
            for t in range(max_t):
                action = self.act(state, eps)
                env_info = env.step(action)[brain_name]
                # Get the next state
                next_state = env_info.vector_observations[0]
                # Get the reward
                reward = env_info.rewards[0]
                # See if the episode has finished
                done = env_info.local_done[0]
                self.step((state, action, reward, next_state, done))
                state = next_state
                score += reward
                if done:
                    break

            # Save most recent score
            scores_window.append(score)
            scores.append(score)
            avg_scores.append(np.mean(scores_window))
            # Decrease epsilon
            eps = max(eps_end, eps_decay * eps)

            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)), end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_window)))
            if np.mean(scores_window) >= 13.0:
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                      .format(i_episode - 100, np.mean(scores_window)))
                torch.save(self.qnetwork_local.state_dict(),
                           f'./checkpoints/checkpoint{self.output_name}.pth')
                break

        return scores, avg_scores

    def step(self, experience):
        """Save experience in replay memory and learn.

        Params
        ======
            experience (tuple): (state, action, reward, next_state, done)
        """
        # Save experience
        self.memory.add(experience)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > BATCH_SIZE:
                self.learn()

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self):
        """Update value parameters using a sampled batch of experience tuples."""
        # If using PER
        if self.per:
            states, actions, rewards, next_states, dones, idxs, is_weights = \
                self.memory.sample()
        # else normal replay buffer
        else:
            states, actions, rewards, next_states, dones = self.memory.sample()

        # If Double DQN
        if self.double_dqn:
            # Get predicted Q values (for next actions chosen by the local model) from the target model
            self.qnetwork_local.eval()
            with torch.no_grad():
                next_actions = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
            self.qnetwork_local.train()
            Q_targets_next = self.qnetwork_target(next_states).gather(1, next_actions)
        else:
            # Get max predicted Q values (for next states) from the target model
            Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from the local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        if self.per:
            loss = (torch.FloatTensor(is_weights) *
                    F.mse_loss(Q_expected, Q_targets)).mean()
        else:
            loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # If PER, update priorities
        if self.per:
            errors = torch.abs(Q_expected - Q_targets).data.numpy()
            self.memory.update(idxs, errors)

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
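# Hedged usage sketch (not part of the original source): assumes the Udacity
# Banana UnityEnvironment (37-dimensional states, 4 actions) and the local
# "Banana.app" path; both the path and the environment details are assumptions.
#
# from unityagents import UnityEnvironment
#
# env = UnityEnvironment(file_name="Banana.app")
# agent = Agent(state_size=37, action_size=4, seed=0, double_dqn=True, dueling=True)
# scores, avg_scores = agent.train(env, n_episodes=1000)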
class Trainer():
    def __init__(self, params: Parameters):
        self.parms = params
        self.env = Env(params.game, params.gamma, norm_rewards=None, norm_states=False)
        self.buffer = ReplayBuffer(params.replay_size)

        # Seed
        self.env.seed(params.seed)
        np.random.seed(params.seed)
        tf.random.set_seed(params.seed)

        # Four critic nets
        critic_nets = [
            DDPGValueNet(feature_shape=self.env.features_shape,
                         a_num=self.env.num_actions,
                         lr=params.lr_c) for _ in range(4)
        ]
        self.critic1, self.critic2, self.target_critic1, self.target_critic2 = critic_nets

        # Two actor nets
        self.actor = CtsPolicy(action_bound=self.env.action_bound,
                               action_dim=self.env.num_actions,
                               lr=params.lr_a)
        self.target_actor = CtsPolicy(action_bound=self.env.action_bound,
                                      action_dim=self.env.num_actions,
                                      lr=params.lr_a)

        # Copy params into the target networks
        self._copy_para(self.critic1, self.target_critic1)
        self._copy_para(self.critic2, self.target_critic2)
        self._copy_para(self.actor, self.target_actor)

        self.train_step_cnt = 0

    def _copy_para(self, from_model, to_model):
        """
        Copy parameters for soft updating.
        :param from_model: latest model
        :param to_model: target model
        :return: None
        """
        for i, j in zip(from_model.trainable_weights, to_model.trainable_weights):
            j.assign(i)

    def _target_soft_update(self, net, target_net):
        """Soft-update the target net with Polyak averaging."""
        for target_param, param in zip(target_net.trainable_weights,
                                       net.trainable_weights):
            # Copy weight value into the target parameters
            target_param.assign(target_param * (1.0 - self.parms.tau) +
                                param * self.parms.tau)

    def _train(self):
        # Sample
        batch = self.buffer.sample(self.parms.batch_size)
        s = np.array([batch_[0] for batch_ in batch])
        a = np.array([batch_[1] for batch_ in batch])
        r = np.array([batch_[2] for batch_ in batch])
        s_next = np.array([batch_[3] for batch_ in batch])
        not_done = np.array([not batch_[4] for batch_ in batch])

        # Reshape
        r = r[:, np.newaxis]
        not_done = not_done[:, np.newaxis]

        # Set the target y
        pi_next = self.target_actor(s_next)
        a_next = pi_next.sample()
        q_next = tf.minimum(self.target_critic1([s_next, a_next]),
                            self.target_critic2([s_next, a_next]))
        y = r + self.parms.gamma * q_next * not_done

        # Train critic1
        with tf.GradientTape() as c1_tape:
            q1 = self.critic1([s, a])
            c1_loss = tf.losses.mean_squared_error(y, q1)
        c1_grads = c1_tape.gradient(c1_loss, self.critic1.trainable_weights)
        self.critic1.optimizer.apply_gradients(
            zip(c1_grads, self.critic1.trainable_weights))

        # Train critic2
        with tf.GradientTape() as c2_tape:
            q2 = self.critic2([s, a])
            c2_loss = tf.losses.mean_squared_error(y, q2)
        c2_grads = c2_tape.gradient(c2_loss, self.critic2.trainable_weights)
        self.critic2.optimizer.apply_gradients(
            zip(c2_grads, self.critic2.trainable_weights))

        # Train the actor (delayed updates)
        if self.train_step_cnt % self.parms.actor_interval == 0:
            with tf.GradientTape() as a_tape:
                pi = self.actor(s)
                a = pi.sample()
                q = self.critic1([s, a])
                a_loss = -tf.reduce_mean(q)
            a_grads = a_tape.gradient(a_loss, self.actor.trainable_weights)
            self.actor.optimizer.apply_gradients(
                zip(a_grads, self.actor.trainable_weights))

            # Update target params
            self._target_soft_update(self.actor, self.target_actor)
            self._target_soft_update(self.critic1, self.target_critic1)
            self._target_soft_update(self.critic2, self.target_critic2)

    def train_step(self):
        # Episode information
        episode_ret = []

        # Initialize s
        s = self.env.reset()
        for _ in range(self.parms.train_step_len):
            # Interact
            pi = self.actor(s[np.newaxis, :])  # batch_size=1
            a = pi.sample()[0]
            s_next, r, done, info = self.env.step(a)

            # Store
            self.buffer.store((s, a, r, s_next, done))

            # Train
            if self.buffer.size() > self.parms.start_size:
                self._train()
                self.train_step_cnt += 1

            if done:
                _, ret = info['done']
                episode_ret.append(ret)
                s_next = self.env.reset()

            s = s_next

        return np.mean(episode_ret)
def train(target_vars, saver, sess, logger, dataloaders, test_dataloaders, resume_iter, logdir): X = target_vars['X'] Y = target_vars['Y'] X_NOISE = target_vars['X_NOISE'] train_op = target_vars['train_op'] energy_pos = target_vars['energy_pos'] energy_neg = target_vars['energy_neg'] loss_energy = target_vars['loss_energy'] loss_ml = target_vars['loss_ml'] loss_total = target_vars['total_loss'] gvs = target_vars['gvs'] x_grad = target_vars['x_grad'] x_grad_first = target_vars['x_grad_first'] x_off = target_vars['x_off'] temp = target_vars['temp'] x_mod = target_vars['x_mod'] LABEL = target_vars['LABEL'] LABEL_POS = target_vars['LABEL_POS'] weights = target_vars['weights'] test_x_mod = target_vars['test_x_mod'] eps = target_vars['eps_begin'] label_ent = target_vars['label_ent'] set_seed(0) np.random.seed(0) random.seed(0) if FLAGS.use_attention: gamma = weights[0]['atten']['gamma'] else: gamma = tf.zeros(1) val_output = [test_x_mod] gvs_dict = dict(gvs) log_output = [ train_op, energy_pos, energy_neg, eps, loss_energy, loss_ml, loss_total, x_grad, x_off, x_mod, gamma, x_grad_first, label_ent, *gvs_dict.keys() ] output = [train_op, x_mod] replay_buffer = ReplayBuffer(10000) itr = resume_iter x_mod = None gd_steps = 1 err_message = 'Total number of epochs should be divisible by the number of CL tasks.' assert FLAGS.epoch_num % FLAGS.num_tasks == 0, err_message epochs_per_task = FLAGS.epoch_num // FLAGS.num_tasks // FLAGS.num_cycles for task_index, dataloader in enumerate(dataloaders): dataloader_iterator = iter(dataloader) best_inception = 0.0 for epoch in range(1, epochs_per_task + 1): for data_corrupt, data, label in dataloader: print('Iter: {}; Epoch: {}/{}; Task: {}/{}'.format( itr, epoch + (task_index * epochs_per_task), FLAGS.epoch_num, task_index + 1, FLAGS.num_tasks)) data_corrupt = data_corrupt_init = data_corrupt.numpy() data_corrupt_init = data_corrupt.copy() data = data.numpy() label = label.numpy() label_init = label.copy() if FLAGS.mixup: idx = np.random.permutation(data.shape[0]) lam = np.random.beta(1, 1, size=(data.shape[0], 1, 1, 1)) data = data * lam + data[idx] * (1 - lam) if FLAGS.replay_batch and (x_mod is not None): replay_buffer.add(compress_x_mod(x_mod)) if len(replay_buffer) > FLAGS.batch_size: replay_batch = replay_buffer.sample(FLAGS.batch_size) replay_batch = decompress_x_mod(replay_batch) replay_mask = (np.random.uniform( 0, FLAGS.rescale, FLAGS.batch_size) > 0.05) data_corrupt[replay_mask] = replay_batch[replay_mask] if FLAGS.pcd: if x_mod is not None: data_corrupt = x_mod feed_dict = {X_NOISE: data_corrupt, X: data, Y: label} if FLAGS.cclass: feed_dict[LABEL] = label feed_dict[LABEL_POS] = label_init if itr % FLAGS.log_interval == 0: _, e_pos, e_neg, eps, loss_e, loss_ml, loss_total, x_grad, x_off, x_mod, gamma, x_grad_first, label_ent, * \ grads = sess.run(log_output, feed_dict) kvs = {} kvs['e_pos'] = e_pos.mean() kvs['e_pos_std'] = e_pos.std() kvs['e_neg'] = e_neg.mean() kvs['e_diff'] = kvs['e_pos'] - kvs['e_neg'] kvs['e_neg_std'] = e_neg.std() kvs['temp'] = temp kvs['loss_e'] = loss_e.mean() kvs['eps'] = eps.mean() kvs['label_ent'] = label_ent kvs['loss_ml'] = loss_ml.mean() kvs['loss_total'] = loss_total.mean() kvs['x_grad'] = np.abs(x_grad).mean() kvs['x_grad_first'] = np.abs(x_grad_first).mean() kvs['x_off'] = x_off.mean() kvs['iter'] = itr kvs['gamma'] = gamma for v, k in zip(grads, [v.name for v in gvs_dict.values()]): kvs[k] = np.abs(v).max() string = "Obtained a total of " for key, value in kvs.items(): string += "{}: {}, ".format(key, value) if 
hvd.rank() == 0: print(string) logger.writekvs(kvs) for key, value in kvs.items(): neptune.log_metric(key, x=itr, y=value) else: _, x_mod = sess.run(output, feed_dict) if itr % FLAGS.save_interval == 0 and hvd.rank() == 0: saver.save( sess, osp.join(FLAGS.logdir, FLAGS.exp, 'model_{}'.format(itr))) if itr % FLAGS.test_interval == 0 and hvd.rank( ) == 0 and FLAGS.dataset != '2d': if FLAGS.dataset == 'cifar10': cifar10_map = { 0: 'airplane', 1: 'automobile', 2: 'bird', 3: 'cat', 4: 'deer', 5: 'dog', 6: 'frog', 7: 'horse', 8: 'ship', 9: 'truck' } imgs = data labels = np.argmax(label, axis=1) for idx, img in enumerate(imgs[:20, :, :, :]): neptune.log_image( 'input_images', rescale_im(imgs[idx]), description=str(int(labels[idx])) + ': ' + cifar10_map[int(labels[idx])]) if FLAGS.evaluate: print('Test.') train_acc = test_accuracy(target_vars, saver, sess, logger, test_dataloaders[0]) test_acc = test_accuracy(target_vars, saver, sess, logger, test_dataloaders[1]) neptune.log_metric('train_accuracy', x=itr, y=train_acc) neptune.log_metric('test_accuracy', x=itr, y=test_acc) try_im = x_mod orig_im = data_corrupt.squeeze() actual_im = rescale_im(data) orig_im = rescale_im(orig_im) try_im = rescale_im(try_im).squeeze() for i, (im, t_im, actual_im_i) in enumerate( zip(orig_im[:20], try_im[:20], actual_im)): shape = orig_im.shape[1:] new_im = np.zeros((shape[0], shape[1] * 3, *shape[2:])) size = shape[1] new_im[:, :size] = im new_im[:, size:2 * size] = t_im new_im[:, 2 * size:] = actual_im_i log_image(new_im, logger, 'train_gen_{}'.format(itr), step=i) neptune.log_image( 'train_gen', x=new_im, description='train_gen_iter:{}_idx:{}'.format( itr, i)) test_im = x_mod try: data_corrupt, data, label = next(dataloader_iterator) except BaseException: dataloader_iterator = iter(dataloader) data_corrupt, data, label = next(dataloader_iterator) data_corrupt = data_corrupt.numpy() if FLAGS.replay_batch and ( x_mod is not None) and len(replay_buffer) > 0: replay_batch = replay_buffer.sample(FLAGS.batch_size) replay_batch = decompress_x_mod(replay_batch) replay_mask = (np.random.uniform( 0, 1, (FLAGS.batch_size)) > 0.05) data_corrupt[replay_mask] = replay_batch[replay_mask] if FLAGS.dataset == 'cifar10' or FLAGS.dataset == 'imagenet' or FLAGS.dataset == 'imagenetfull': n = 128 if FLAGS.dataset == "imagenetfull": n = 32 if len(replay_buffer) > n: data_corrupt = decompress_x_mod( replay_buffer.sample(n)) elif FLAGS.dataset == 'imagenetfull': data_corrupt = np.random.uniform( 0, FLAGS.rescale, (n, 128, 128, 3)) else: data_corrupt = np.random.uniform( 0, FLAGS.rescale, (n, 32, 32, 3)) if FLAGS.dataset == 'cifar10': label = np.eye(10)[np.random.randint(0, 10, (n))] else: label = np.eye(1000)[np.random.randint( 0, 1000, (n))] feed_dict[X_NOISE] = data_corrupt feed_dict[X] = data if FLAGS.cclass: feed_dict[LABEL] = label test_x_mod = sess.run(val_output, feed_dict) try_im = test_x_mod orig_im = data_corrupt.squeeze() actual_im = rescale_im(data.numpy()) orig_im = rescale_im(orig_im) try_im = rescale_im(try_im).squeeze() for i, (im, t_im, actual_im_i) in enumerate( zip(orig_im[:20], try_im[:20], actual_im)): shape = orig_im.shape[1:] new_im = np.zeros((shape[0], shape[1] * 3, *shape[2:])) size = shape[1] new_im[:, :size] = im new_im[:, size:2 * size] = t_im new_im[:, 2 * size:] = actual_im_i log_image(new_im, logger, 'val_gen_{}'.format(itr), step=i) neptune.log_image( 'val_gen', new_im, description='val_gen_iter:{}_idx:{}'.format( itr, i)) score, std = get_inception_score(list(try_im), splits=1) print("Inception score of 
{} with std of {}".format( score, std)) kvs = {} kvs['inception_score'] = score kvs['inception_score_std'] = std logger.writekvs(kvs) for key, value in kvs.items(): neptune.log_metric(key, x=itr, y=value) if score > best_inception: best_inception = score saver.save( sess, osp.join(FLAGS.logdir, FLAGS.exp, 'model_best')) if itr > 600000 and FLAGS.dataset == "mnist": assert False itr += 1 saver.save(sess, osp.join(FLAGS.logdir, FLAGS.exp, 'model_{}'.format(itr)))
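The continual-learning EBM trainer above round-trips negative samples through the replay buffer with compress_x_mod / decompress_x_mod, which are not defined in this snippet. A minimal sketch of what such helpers could look like, assuming samples live in [0, FLAGS.rescale] and are stored as uint8 to keep the buffer small (the exact quantization used by the original code may differ):

import numpy as np

def compress_x_mod(x_mod, rescale=1.0):
    # quantize Langevin samples to uint8 so the replay buffer stays compact
    return (255.0 * np.clip(x_mod, 0.0, rescale) / rescale).astype(np.uint8)

def decompress_x_mod(x_mod_compressed, rescale=1.0):
    # dequantize and add a little uniform noise to smooth out the rounding
    x = x_mod_compressed.astype(np.float32) * rescale / 255.0
    return x + np.random.uniform(0.0, rescale / 256.0, x_mod_compressed.shape)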
class MADDPG(): def __init__(self, state_size, action_size, num_agents, random_seed=0): in_critic = num_agents * state_size self.agents = [ DDPG_agent(state_size, in_critic, action_size, num_agents, random_seed) for i in range(num_agents) ] self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed) self.num_agents = num_agents def act(self, states, add_noise=True): """Returns actions for given state as per current policy.""" actions = [ agent.act(state, add_noise) for agent, state in zip(self.agents, states) ] return actions def target_act(self, states): """Returns actions for given state as per current policy.""" actions = [ agent.target_act(state) for agent, state in zip(self.agents, states) ] return actions def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward #for i in range(state.shape[0]): state = np.asanyarray(state) action = np.asanyarray(action) reward = np.asanyarray(reward) next_state = np.asanyarray(next_state) done = np.asanyarray(done) self.memory.add(state.reshape((1, self.num_agents, -1)), action.reshape((1, self.num_agents, -1)), \ reward.reshape((1, self.num_agents, -1)), next_state.reshape((1,self.num_agents, -1)), \ done.reshape((1, self.num_agents, -1))) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: for i_agent in range(self.num_agents): experiences = self.memory.sample() self.learn(experiences, i_agent, GAMMA) def reset(self): [agent.reset() for agent in self.agents] def learn(self, experiences, i_agent, gamma): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences agent = self.agents[i_agent] # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models next_states = next_states.view(1, BATCH_SIZE, self.num_agents, -1) actions_next = torch.cat(self.target_act(next_states), dim=1) next_states = next_states.view(BATCH_SIZE, -1) actions_next = actions_next.view(BATCH_SIZE, -1) Q_targets_next = agent.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards[:, i_agent] + (gamma * Q_targets_next * (1 - dones[:, i_agent])) # Compute critic loss Q_expected = agent.critic_local(states.view(BATCH_SIZE, -1), actions.view(BATCH_SIZE, -1)) # mean squared error loss critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss # zero_grad because we do not want to accumulate # gradients from other batches, so needs to be cleared agent.critic_optimizer.zero_grad() # compute derivatives for all variables that # requires_grad-True critic_loss.backward() # update those variables that requires_grad-True agent.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss # take the current states and predict actions #states = states.view(1, BATCH_SIZE, self.num_agents, -1) actions_pred = agent.actor_local(states) #print (actions_pred.shape) #actions_pred = torch.cat(actions_pred, dim=1) # -1 * (maximize) Q value for the current prediction actor_loss = -agent.critic_local(states.view( BATCH_SIZE, 
-1), actions_pred.view(BATCH_SIZE, -1)).mean() # Minimize the loss # zero_grad because we do not want to accumulate # gradients from other batches, so needs to be cleared agent.actor_optimizer.zero_grad() # compute derivatives for all variables that # requires_grad-True actor_loss.backward() # update those variables that requires_grad-True agent.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(agent.critic_local, agent.critic_target, TAU) self.soft_update(agent.actor_local, agent.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
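The MADDPG wrapper above assumes a ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed) exposing add(...), sample() and __len__, but the buffer itself is not shown. A minimal uniform-sampling sketch under those assumed semantics (stacking the stored numpy arrays into float tensors) could look like:

import random
from collections import deque, namedtuple
import numpy as np
import torch

Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    def __init__(self, buffer_size, batch_size, seed=0):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, k=self.batch_size)
        to_tensor = lambda xs: torch.from_numpy(np.vstack(xs).astype(np.float32))
        states = to_tensor([e.state for e in batch])
        actions = to_tensor([e.action for e in batch])
        rewards = to_tensor([e.reward for e in batch])
        next_states = to_tensor([e.next_state for e in batch])
        dones = to_tensor([e.done for e in batch])
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)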
def gentest(sess, kvs, data, latents, save_exp_dir): X_NOISE = kvs['X_NOISE'] LABEL_SIZE = kvs['LABEL_SIZE'] LABEL_SHAPE = kvs['LABEL_SHAPE'] LABEL_POS = kvs['LABEL_POS'] LABEL_ROT = kvs['LABEL_ROT'] model_size = kvs['model_size'] model_shape = kvs['model_shape'] model_pos = kvs['model_pos'] model_rot = kvs['model_rot'] weight_size = kvs['weight_size'] weight_shape = kvs['weight_shape'] weight_pos = kvs['weight_pos'] weight_rot = kvs['weight_rot'] X = tf.placeholder(shape=(None, 64, 64), dtype=tf.float32) datafull = data # Test combination of generalization where we use slices of both training x_final = X_NOISE x_mod_size = X_NOISE x_mod_pos = X_NOISE for i in range(FLAGS.num_steps): # use cond_pos energies = [] x_mod_pos = x_mod_pos + tf.random_normal(tf.shape(x_mod_pos), mean=0.0, stddev=0.005) e_noise = model_pos.forward(x_final, weight_pos, label=LABEL_POS) # energies.append(e_noise) x_grad = tf.gradients(e_noise, [x_final])[0] x_mod_pos = x_mod_pos + tf.random_normal(tf.shape(x_mod_pos), mean=0.0, stddev=0.005) x_mod_pos = x_mod_pos - FLAGS.step_lr * x_grad x_mod_pos = tf.clip_by_value(x_mod_pos, 0, 1) if FLAGS.joint_shape: # use cond_shape e_noise = model_shape.forward(x_mod_pos, weight_shape, label=LABEL_SHAPE) elif FLAGS.joint_rot: e_noise = model_rot.forward(x_mod_pos, weight_rot, label=LABEL_ROT) else: # use cond_size e_noise = model_size.forward(x_mod_pos, weight_size, label=LABEL_SIZE) # energies.append(e_noise) # energy_stack = tf.concat(energies, axis=1) # energy_stack = tf.reduce_logsumexp(-1*energy_stack, axis=1) # energy_stack = tf.reduce_sum(energy_stack, axis=1) x_grad = tf.gradients(e_noise, [x_mod_pos])[0] x_mod_pos = x_mod_pos - FLAGS.step_lr * x_grad x_mod_pos = tf.clip_by_value(x_mod_pos, 0, 1) # for x_mod_size # use cond_size # e_noise = model_size.forward(x_mod_size, weight_size, label=LABEL_SIZE) # x_grad = tf.gradients(e_noise, [x_mod_size])[0] # x_mod_size = x_mod_size + tf.random_normal(tf.shape(x_mod_size), mean=0.0, stddev=0.005) # x_mod_size = x_mod_size - FLAGS.step_lr * x_grad # x_mod_size = tf.clip_by_value(x_mod_size, 0, 1) # # use cond_pos # e_noise = model_pos.forward(x_mod_size, weight_pos, label=LABEL_POS) # x_grad = tf.gradients(e_noise, [x_mod_size])[0] # x_mod_size = x_mod_size + tf.random_normal(tf.shape(x_mod_size), mean=0.0, stddev=0.005) # x_mod_size = x_mod_size - FLAGS.step_lr * tf.stop_gradient(x_grad) # x_mod_size = tf.clip_by_value(x_mod_size, 0, 1) x_mod = x_mod_pos x_final = x_mod if FLAGS.joint_shape: loss_kl = model_shape.forward(x_final, weight_shape, reuse=True, label=LABEL_SHAPE, stop_grad=True) + \ model_pos.forward(x_final, weight_pos, reuse=True, label=LABEL_POS, stop_grad=True) energy_pos = model_shape.forward(X, weight_shape, reuse=True, label=LABEL_SHAPE) + \ model_pos.forward(X, weight_pos, reuse=True, label=LABEL_POS) energy_neg = model_shape.forward(tf.stop_gradient(x_mod), weight_shape, reuse=True, label=LABEL_SHAPE) + \ model_pos.forward(tf.stop_gradient(x_mod), weight_pos, reuse=True, label=LABEL_POS) elif FLAGS.joint_rot: loss_kl = model_rot.forward(x_final, weight_rot, reuse=True, label=LABEL_ROT, stop_grad=True) + \ model_pos.forward(x_final, weight_pos, reuse=True, label=LABEL_POS, stop_grad=True) energy_pos = model_rot.forward(X, weight_rot, reuse=True, label=LABEL_ROT) + \ model_pos.forward(X, weight_pos, reuse=True, label=LABEL_POS) energy_neg = model_rot.forward(tf.stop_gradient(x_mod), weight_rot, reuse=True, label=LABEL_ROT) + \ model_pos.forward(tf.stop_gradient(x_mod), weight_pos, reuse=True, 
label=LABEL_POS) else: loss_kl = model_size.forward(x_final, weight_size, reuse=True, label=LABEL_SIZE, stop_grad=True) + \ model_pos.forward(x_final, weight_pos, reuse=True, label=LABEL_POS, stop_grad=True) energy_pos = model_size.forward(X, weight_size, reuse=True, label=LABEL_SIZE) + \ model_pos.forward(X, weight_pos, reuse=True, label=LABEL_POS) energy_neg = model_size.forward(tf.stop_gradient(x_mod), weight_size, reuse=True, label=LABEL_SIZE) + \ model_pos.forward(tf.stop_gradient(x_mod), weight_pos, reuse=True, label=LABEL_POS) energy_neg_reduced = (energy_neg - tf.reduce_min(energy_neg)) coeff = tf.stop_gradient(tf.exp(-energy_neg_reduced)) norm_constant = tf.stop_gradient(tf.reduce_sum(coeff)) + 1e-4 neg_loss = coeff * (-1*energy_neg) / norm_constant loss_ml = tf.reduce_mean(energy_pos) - tf.reduce_mean(energy_neg) loss_total = loss_ml + tf.reduce_mean(loss_kl) + 1 * (tf.reduce_mean(tf.square(energy_pos)) + tf.reduce_mean(tf.square(energy_neg))) optimizer = AdamOptimizer(1e-3, beta1=0.0, beta2=0.999) gvs = optimizer.compute_gradients(loss_total) gvs = [(k, v) for (k, v) in gvs if k is not None] train_op = optimizer.apply_gradients(gvs) vs = optimizer.variables() sess.run(tf.variables_initializer(vs)) dataloader = DataLoader(DSpritesGen(data, latents), batch_size=FLAGS.batch_size, num_workers=6, drop_last=True, shuffle=True) x_off = tf.reduce_mean(tf.square(x_mod - X)) itr = 0 saver = tf.train.Saver() x_mod = None if FLAGS.train: replay_buffer = ReplayBuffer(10000) for _ in range(1): for data_corrupt, data, label_size, label_pos in tqdm(dataloader): data_corrupt = data_corrupt.numpy()[:, :, :] data = data.numpy()[:, :, :] if x_mod is not None: replay_buffer.add(x_mod) replay_batch = replay_buffer.sample(FLAGS.batch_size) replay_mask = (np.random.uniform(0, 1, (FLAGS.batch_size)) > 0.95) data_corrupt[replay_mask] = replay_batch[replay_mask] if FLAGS.joint_shape: feed_dict = {X_NOISE: data_corrupt, X: data, LABEL_SHAPE: label_size, LABEL_POS: label_pos} elif FLAGS.joint_rot: feed_dict = {X_NOISE: data_corrupt, X: data, LABEL_ROT: label_size, LABEL_POS: label_pos} else: feed_dict = {X_NOISE: data_corrupt, X: data, LABEL_SIZE: label_size, LABEL_POS: label_pos} _, off_value, e_pos, e_neg, x_mod = sess.run([train_op, x_off, energy_pos, energy_neg, x_final], feed_dict=feed_dict) itr += 1 if itr % 10 == 0: print("x_off of {}, e_pos of {}, e_neg of {} itr of {}".format(off_value, e_pos.mean(), e_neg.mean(), itr)) if itr == FLAGS.break_steps: break saver.save(sess, osp.join(save_exp_dir, 'model_gentest')) saver.restore(sess, osp.join(save_exp_dir, 'model_gentest')) l = latents if FLAGS.joint_shape: mask_gen = (l[:, 3] == 30 * np.pi / 39) * (l[:, 2] == 0.5) elif FLAGS.joint_rot: mask_gen = (l[:, 1] == 1) * (l[:, 2] == 0.5) else: mask_gen = (l[:, 3] == 30 * np.pi / 39) * (l[:, 1] == 1) & (~((l[:, 2] == 0.5) | ((l[:, 4] == 16/31) & (l[:, 5] == 16/31)))) data_gen = datafull[mask_gen] latents_gen = latents[mask_gen] losses = [] for dat, latent in zip(np.array_split(data_gen, 120), np.array_split(latents_gen, 120)): x = 0.5 + np.random.randn(*dat.shape) if FLAGS.joint_shape: feed_dict = {LABEL_SHAPE: np.eye(3)[latent[:, 1].astype(np.int32) - 1], LABEL_POS: latent[:, 4:], X_NOISE: x, X: dat} elif FLAGS.joint_rot: feed_dict = {LABEL_ROT: np.concatenate([np.cos(latent[:, 3:4]), np.sin(latent[:, 3:4])], axis=1), LABEL_POS: latent[:, 4:], X_NOISE: x, X: dat} else: feed_dict = {LABEL_SIZE: latent[:, 2:3], LABEL_POS: latent[:, 4:], X_NOISE: x, X: dat} for i in range(2): x = sess.run([x_final], 
feed_dict=feed_dict)[0] feed_dict[X_NOISE] = x loss = sess.run([x_off], feed_dict=feed_dict)[0] losses.append(loss) print("Mean MSE loss of {} ".format(np.mean(losses))) data_try = data_gen[:10] data_init = 0.5 + 0.5 * np.random.randn(10, 64, 64) latent_scale = latents_gen[:10, 2:3] latent_pos = latents_gen[:10, 4:] if FLAGS.joint_shape: feed_dict = {X_NOISE: data_init, LABEL_SHAPE: np.eye(3)[latent[:10, 1].astype(np.int32)-1], LABEL_POS: latent_pos} elif FLAGS.joint_rot: feed_dict = {LABEL_ROT: np.concatenate([np.cos(latent[:10, 3:4]), np.sin(latent[:10, 3:4])], axis=1), LABEL_POS: latent[:10, 4:], X_NOISE: data_init} else: feed_dict = {X_NOISE: data_init, LABEL_SIZE: latent_scale, LABEL_POS: latent_pos} x_output = sess.run([x_final], feed_dict=feed_dict)[0] if FLAGS.joint_shape: im_name = "size_shape_combine_gentest.png" else: im_name = "size_scale_combine_gentest.png" x_output_wrap = np.ones((10, 66, 66)) data_try_wrap = np.ones((10, 66, 66)) x_output_wrap[:, 1:-1, 1:-1] = x_output data_try_wrap[:, 1:-1, 1:-1] = data_try im_output = np.concatenate([x_output_wrap, data_try_wrap], axis=2).reshape(-1, 66*2) impath = osp.join(save_exp_dir, im_name) imsave(impath, im_output) print("Successfully saved images at {}".format(impath))
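gentest runs Langevin-style sampling: perturb the input with Gaussian noise, step down the energy gradient, and clip back to the data range. Stripped of the conditional-model bookkeeping, one such update looks roughly like the sketch below, where energy_fn stands in for any of the model.forward calls above:

import tensorflow as tf

def langevin_step(x, energy_fn, step_lr, noise_std=0.005):
    # one sampling step: perturb, descend the energy gradient, stay in [0, 1]
    x = x + tf.random_normal(tf.shape(x), mean=0.0, stddev=noise_std)
    grad = tf.gradients(energy_fn(x), [x])[0]
    x = x - step_lr * grad
    return tf.clip_by_value(x, 0, 1)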
class Trainer(): def __init__(self, params: Parameters): self.parms = params self.env = Env(params.game, params.gamma, norm_rewards=None, norm_states=False) self.buffer = ReplayBuffer(params.replay_size) # Seed self.env.seed(params.seed) np.random.seed(params.seed) tf.random.set_seed(params.seed) self.critic = DDPGValueNet(feature_shape=self.env.features_shape, a_num=self.env.num_actions, lr=params.lr_c) self.target_critic = DDPGValueNet( feature_shape=self.env.features_shape, a_num=self.env.num_actions, lr=params.lr_c) self._copy_para(self.critic.model, self.target_critic.model) self.actor = CtsPolicy(action_bound=self.env.action_bound, action_dim=self.env.num_actions, lr=params.lr_a) self.target_actor = CtsPolicy(action_bound=self.env.action_bound, action_dim=self.env.num_actions, lr=params.lr_a) self._copy_para(self.actor, self.target_actor) self.ema = tf.train.ExponentialMovingAverage(decay=1.0 - self.parms.tau) def _copy_para(self, from_model, to_model): """ Copy parameters for soft updating :param from_model: latest model :param to_model: target model :return: None """ for i, j in zip(from_model.trainable_weights, to_model.trainable_weights): j.assign(i) def _ema_update(self): paras = self.actor.trainable_weights + \ self.critic.model.trainable_weights self.ema.apply(paras) for i, j in zip(self.target_actor.trainable_weights + \ self.target_critic.model.trainable_weights, paras): i.assign(self.ema.average(j)) def _train(self): # Sample batch = self.buffer.sample(self.parms.batch_size) s = np.array([batch_[0] for batch_ in batch]) a = np.array([batch_[1] for batch_ in batch]) r = np.array([batch_[2] for batch_ in batch]) s_next = np.array([batch_[3] for batch_ in batch]) not_done = np.array([not batch_[4] for batch_ in batch]) # Reshpe r = r[:, np.newaxis] not_done = not_done[:, np.newaxis] # Train critic with tf.GradientTape() as tape: pi_next = self.target_actor(s_next) a_next = pi_next.sample() q_next = self.target_critic([s_next, a_next]) y = r + self.parms.gamma * q_next * not_done q = self.critic([s, a]) c_loss = tf.losses.mean_squared_error(y, q) c_grads = tape.gradient(c_loss, self.critic.model.trainable_weights) self.critic.model.optimizer.apply_gradients( zip(c_grads, self.critic.model.trainable_weights)) # Train actor with tf.GradientTape() as tape: pi = self.actor(s) a = pi.sample() q = self.critic([s, a]) a_loss = -tf.reduce_mean(q) a_grads = tape.gradient(a_loss, self.actor.trainable_weights) self.actor.optimizer.apply_gradients( zip(a_grads, self.actor.trainable_weights)) self._ema_update() def train_step(self): # Episode infomation episode_ret = [] # Initialize s s = self.env.reset() for _ in range(self.parms.train_step_len): # Interact pi = self.actor(s[np.newaxis, :]) # batch_size=1 a = pi.sample()[0] s_next, r, done, info = self.env.step(a) # Store self.buffer.store((s, a, r, s_next, done)) # Train if self.buffer.size() > self.parms.start_size: self._train() if done: _, ret = info['done'] episode_ret.append(ret) s_next = self.env.reset() s = s_next return np.mean(episode_ret)
class Agent: def __init__(self, env, use_cnn=False, learning_rate=3e-4, gamma=0.99, buffer_size=10000): self.env = env self.learning_rate = learning_rate self.gamma = gamma self.replay_buffer = ReplayBuffer(buffer_size) self.dqn = CnnDQN(env.observation_space.shape, env.action_space.n) if use_cnn else DQN(env.observation_space.shape[0], env.action_space.n) self.dqn_optimizer = torch.optim.Adam(self.dqn.parameters(), lr=self.learning_rate) self.dqn_loss = torch.nn.MSELoss() def update_model(self, batch_size): states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size) states = torch.FloatTensor(states) actions = torch.LongTensor(actions) rewards = torch.FloatTensor(rewards) next_states = torch.FloatTensor(next_states) dones = torch.FloatTensor(dones) curr_Q = self.dqn.forward(states) curr_Q = curr_Q.gather(1, actions.unsqueeze(1)).squeeze(1) next_Q = self.dqn.forward(next_states) max_next_Q = torch.max(next_Q, 1)[0].detach() # do not bootstrap past terminal transitions expected_Q = rewards + self.gamma * max_next_Q * (1 - dones) self.dqn_optimizer.zero_grad() loss = self.dqn_loss(curr_Q, expected_Q) loss.backward() self.dqn_optimizer.step() return loss.item() def max_action(self, state): state = torch.from_numpy(state).float().unsqueeze(0) qvals = self.dqn.forward(state) action = np.argmax(qvals.detach().numpy()) return action def train(self, max_episodes, max_steps, batch_size): episode_rewards = [] loss = [] for episodes in range(max_episodes): state = self.env.reset() episode_reward = 0 for steps in range(max_steps): action = self.max_action(state) next_state, reward, done, _ = self.env.step(action) self.replay_buffer.push(state, action, reward, next_state, done) state = next_state episode_reward += reward if done: episode_rewards.append(episode_reward) print(episode_reward) break if(len(self.replay_buffer) > batch_size): step_loss = self.update_model(batch_size) loss.append(step_loss) #self.adjust_temperature(loss) # return episode_rewards, loss def run(self, max_episodes, max_steps): episode_rewards = [] for episodes in range(max_episodes): state = self.env.reset() episode_reward = 0 for steps in range(max_steps): action = self.max_action(state) next_state, reward, done, _ = self.env.step(action) state = next_state episode_reward += reward if done: episode_rewards.append(episode_reward) break return episode_rewards def save_model(self, PATH): torch.save(self.dqn.state_dict(), PATH)
class DDPGAgent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, num_agents, random_seed): """ Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # for MADDPG self.num_agents = num_agents # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise((num_agents, action_size), random_seed) self.eps = EPS_START self.eps_decay = 1 / (EPS_EP_END * LEARN_NUM) self.timestep = 0 # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done, agent_number): """Save experience in replay memory, and use random sample from buffer to learn.""" self.timestep += 1 # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory and at learning interval settings if len(self.memory) > BATCH_SIZE and self.timestep % LEARN_EVERY == 0: for _ in range(LEARN_NUM): experiences = self.memory.sample() self.learn(experiences, GAMMA, agent_number) def act(self, states, add_noise): """Returns actions for both agents as per current policy, given their respective states.""" states = torch.from_numpy(states).float().to(device) actions = np.zeros((self.num_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): # For MADDPG: get action for each agent and concatenate them for agent_num, state in enumerate(states): action = self.actor_local(state).cpu().data.numpy() actions[agent_num, :] = action self.actor_local.train() if add_noise: actions += self.noise.sample() actions = np.clip(actions, -1, 1) return actions def reset(self): self.noise.reset() def learn(self, experiences, gamma, agent_number): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) # Construct next actions vector relative to the agent if agent_number == 0: actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1) else: actions_next = torch.cat((actions[:, :2], actions_next), dim=1) # Compute Q targets for current states (y_i) Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # For MADDPG: Construct action vector for each agent actions_pred = self.actor_local(states) if agent_number == 0: actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1) else: actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1) # Compute actor loss actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) # update noise decay parameter #self.eps -= self.eps_decay #self.eps = max(self.eps, EPS_FINAL) #self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters.""" for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def save_checkpoint(self, agent_number, filename='checkpoint'): checkpoint = { 'action_size': self.action_size, 'state_size': self.state_size, 'actor_state_dict': self.actor_local.state_dict(), 'critic_state_dict': self.critic_local.state_dict() } filepath = filename + '_' + str(agent_number) + '.pth' torch.save(checkpoint, filepath) print(filepath + ' succesfully saved.') def load_checkpoint(self, agent_number, filename='checkpoint'): filepath = filename + '_' + str(agent_number) + '.pth' checkpoint = torch.load(filepath) state_size = checkpoint['state_size'] action_size = checkpoint['action_size'] self.actor_local = Actor(state_size, action_size, seed=42).to(device) self.critic_local = Critic(state_size, action_size, seed=42).to(device) self.actor_local.load_state_dict(checkpoint['actor_state_dict']) self.critic_local.load_state_dict(checkpoint['critic_state_dict']) print(filepath + ' successfully loaded.')
retrace = truncated_rho * (retrace - q_value.detach()) + values[step].detach() loss += actor_loss + critic_loss - entropy if args.type == 'trpo': loss = TRPO(model, policies, average_policies, 1, loss, policies[step] / average_policies[step]) optimizer.zero_grad() loss.backward() optimizer.step() if args.batch_size < len(replay_buffer) + 1: for _ in range(np.random.poisson(args.replay_ratio)): trajecs = replay_buffer.sample(args.batch_size) s_x, a_x, r_x, old_pol, m_x = map( torch.stack, zip(*(map(torch.cat, zip(*trajec)) for trajec in trajecs))) q_vals = [] vals = [] pols = [] avg_pols = [] for step in range(s_x.size(0)): pol, q_val, val = model(s_x[step]) q_vals.append(q_val) pols.append(pol) vals.append(val)
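The fragment above consumes a running retrace target and truncated importance weights (truncated_rho) computed elsewhere in the loop. For reference, a hedged sketch of the backward Retrace recursion that produces those quantities, assuming per-step tensors of rewards, state values V(s_t), taken-action values Q(s_t, a_t), and ratios rho_t = pi/mu:

import torch

def retrace_targets(rewards, values, q_taken, rhos, gamma, bootstrap_value):
    # backward recursion: Q_ret <- r_t + gamma * Q_ret, then shrink towards V via truncated rho
    q_ret = bootstrap_value
    targets = []
    for t in reversed(range(len(rewards))):
        q_ret = rewards[t] + gamma * q_ret
        targets.append(q_ret)
        truncated_rho = rhos[t].clamp(max=1.0)
        q_ret = truncated_rho * (q_ret - q_taken[t].detach()) + values[t].detach()
    return list(reversed(targets))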
class DQNAgent(Agent): """ Uses a replay buffer and has two DQNs: one that is used to pick the best actions and is updated every step, and a target network that is used to compute the target Q value. The target network is synchronized with the first DQN only after a fixed number of steps. """ def __init__(self, env, network, learning_rate, gamma, eps_max, eps_min, eps_dec, buffer_size, replace_cnt): super().__init__(env, network, learning_rate, gamma, eps_max, eps_min, eps_dec) self.replay_buffer = ReplayBuffer(max_size=buffer_size, input_shape = env.env_shape) self.learn_step_counter = 0 self.replace_cnt = replace_cnt self.q_eval = ConvDQN(env.env_shape, env.no_of_actions) self.q_target = ConvDQN(env.env_shape, env.no_of_actions) def get_action(self, state): if(np.random.random() <= self.eps): return self.env.sample_action() else: state = T.tensor(state, dtype=T.float).unsqueeze(0).to(self.q_eval.device) actions = self.q_eval.forward(state) return T.argmax(actions).item() def replace_target_network(self): if self.learn_step_counter % self.replace_cnt == 0: self.q_target.load_state_dict(self.q_eval.state_dict()) def get_batch_tensors(self, batch_size): batch = self.replay_buffer.sample(batch_size) states, actions, rewards, next_states, dones = batch states_t = T.tensor(states, dtype=T.float).to(self.q_eval.device) actions_t = T.tensor(actions).to(self.q_eval.device) rewards_t = T.tensor(rewards, dtype=T.float).to(self.q_eval.device) next_states_t = T.tensor(next_states, dtype=T.float).to(self.q_eval.device) dones_t = T.tensor(dones, dtype=T.float).to(self.q_eval.device) return states_t, actions_t, rewards_t, next_states_t, dones_t def update(self, batch_size): states_t, actions_t, rewards_t, next_states_t, dones_t = self.get_batch_tensors(batch_size) self.q_eval.optimizer.zero_grad() self.replace_target_network() indices = np.arange(batch_size) curr_Q = self.q_eval.forward(states_t)[indices, actions_t] max_next_Q = self.q_target.forward(next_states_t).max(1)[0].detach() expected_Q = rewards_t + self.gamma * max_next_Q * (1 - dones_t) loss = self.q_eval.MSE_loss(curr_Q, expected_Q).to(self.q_eval.device) loss.backward() self.q_eval.optimizer.step() self.learn_step_counter += 1 self.dec_eps() def learn(self,state, action, reward, next_state, done, batch_size): self.replay_buffer.store_transition(state, action, reward, next_state, done) if len(self.replay_buffer) > batch_size: self.update(batch_size)
def train(target_vars, saver, sess, logger, dataloader, resume_iter, logdir): X = target_vars['X'] Y = target_vars['Y'] X_NOISE = target_vars['X_NOISE'] train_op = target_vars['train_op'] energy_pos = target_vars['energy_pos'] energy_neg = target_vars['energy_neg'] loss_energy = target_vars['loss_energy'] loss_ml = target_vars['loss_ml'] loss_total = target_vars['total_loss'] gvs = target_vars['gvs'] x_grad = target_vars['x_grad'] x_grad_first = target_vars['x_grad_first'] x_off = target_vars['x_off'] temp = target_vars['temp'] x_mod = target_vars['x_mod'] LABEL = target_vars['LABEL'] LABEL_POS = target_vars['LABEL_POS'] weights = target_vars['weights'] test_x_mod = target_vars['test_x_mod'] eps = target_vars['eps_begin'] label_ent = target_vars['label_ent'] if FLAGS.use_attention: gamma = weights[0]['atten']['gamma'] else: gamma = tf.zeros(1) val_output = [test_x_mod] gvs_dict = dict(gvs) log_output = [ train_op, energy_pos, energy_neg, eps, loss_energy, loss_ml, loss_total, x_grad, x_off, x_mod, gamma, x_grad_first, label_ent, *gvs_dict.keys() ] output = [train_op, x_mod] replay_buffer = ReplayBuffer(10000) itr = resume_iter x_mod = None gd_steps = 1 dataloader_iterator = iter(dataloader) best_inception = 0.0 for epoch in range(FLAGS.epoch_num): print("Training epoch:%d" % epoch) for data_corrupt, data, label in dataloader: data_corrupt = data_corrupt_init = data_corrupt.numpy() data_corrupt_init = data_corrupt.copy() data = data.numpy() label = label.numpy() label_init = label.copy() if FLAGS.mixup: idx = np.random.permutation(data.shape[0]) lam = np.random.beta(1, 1, size=(data.shape[0], 1, 1, 1)) data = data * lam + data[idx] * (1 - lam) if FLAGS.replay_batch and (x_mod is not None): replay_buffer.add(compress_x_mod(x_mod)) if len(replay_buffer) > FLAGS.batch_size: replay_batch = replay_buffer.sample(FLAGS.batch_size) replay_batch = decompress_x_mod(replay_batch) replay_mask = (np.random.uniform(0, FLAGS.rescale, FLAGS.batch_size) > 0.05) data_corrupt[replay_mask] = replay_batch[replay_mask] if FLAGS.pcd: if x_mod is not None: data_corrupt = x_mod feed_dict = {X_NOISE: data_corrupt, X: data, Y: label} if FLAGS.cclass: feed_dict[LABEL] = label feed_dict[LABEL_POS] = label_init if itr % FLAGS.log_interval == 0: _, e_pos, e_neg, eps, loss_e, loss_ml, loss_total, x_grad, x_off, x_mod, gamma, x_grad_first, label_ent, * \ grads = sess.run(log_output, feed_dict) kvs = {} kvs['e_pos'] = e_pos.mean() kvs['e_pos_std'] = e_pos.std() kvs['e_neg'] = e_neg.mean() kvs['e_diff'] = kvs['e_pos'] - kvs['e_neg'] kvs['e_neg_std'] = e_neg.std() kvs['temp'] = temp kvs['loss_e'] = loss_e.mean() kvs['eps'] = eps.mean() kvs['label_ent'] = label_ent kvs['loss_ml'] = loss_ml.mean() kvs['loss_total'] = loss_total.mean() kvs['x_grad'] = np.abs(x_grad).mean() kvs['x_grad_first'] = np.abs(x_grad_first).mean() kvs['x_off'] = x_off.mean() kvs['iter'] = itr kvs['gamma'] = gamma for v, k in zip(grads, [v.name for v in gvs_dict.values()]): kvs[k] = np.abs(v).max() string = "Obtained a total of " for key, value in kvs.items(): string += "{}: {}, ".format(key, value) if hvd.rank() == 0: print(string) logger.writekvs(kvs) else: _, x_mod = sess.run(output, feed_dict) if itr % FLAGS.save_interval == 0 and hvd.rank() == 0: saver.save( sess, osp.join(FLAGS.logdir, FLAGS.exp, 'model_{}'.format(itr))) if itr % FLAGS.test_interval == 0 and hvd.rank( ) == 0 and FLAGS.dataset != '2d': try_im = x_mod orig_im = data_corrupt.squeeze() actual_im = rescale_im(data) orig_im = rescale_im(orig_im) try_im = rescale_im(try_im).squeeze() for 
i, (im, t_im, actual_im_i) in enumerate( zip(orig_im[:20], try_im[:20], actual_im)): shape = orig_im.shape[1:] new_im = np.zeros((shape[0], shape[1] * 3, *shape[2:])) size = shape[1] new_im[:, :size] = im new_im[:, size:2 * size] = t_im new_im[:, 2 * size:] = actual_im_i log_image(new_im, logger, 'train_gen_{}'.format(itr), step=i) test_im = x_mod try: data_corrupt, data, label = next(dataloader_iterator) except BaseException: dataloader_iterator = iter(dataloader) data_corrupt, data, label = next(dataloader_iterator) data_corrupt = data_corrupt.numpy() if FLAGS.replay_batch and ( x_mod is not None) and len(replay_buffer) > 0: replay_batch = replay_buffer.sample(FLAGS.batch_size) replay_batch = decompress_x_mod(replay_batch) replay_mask = (np.random.uniform(0, 1, (FLAGS.batch_size)) > 0.05) data_corrupt[replay_mask] = replay_batch[replay_mask] if FLAGS.dataset == 'cifar10' or FLAGS.dataset == 'imagenet' or FLAGS.dataset == 'imagenetfull': n = 128 if FLAGS.dataset == "imagenetfull": n = 32 if len(replay_buffer) > n: data_corrupt = decompress_x_mod( replay_buffer.sample(n)) elif FLAGS.dataset == 'imagenetfull': data_corrupt = np.random.uniform( 0, FLAGS.rescale, (n, 128, 128, 3)) else: data_corrupt = np.random.uniform( 0, FLAGS.rescale, (n, 32, 32, 3)) if FLAGS.dataset == 'cifar10': label = np.eye(10)[np.random.randint(0, 10, (n))] else: label = np.eye(1000)[np.random.randint(0, 1000, (n))] feed_dict[X_NOISE] = data_corrupt feed_dict[X] = data if FLAGS.cclass: feed_dict[LABEL] = label test_x_mod = sess.run(val_output, feed_dict) try_im = test_x_mod orig_im = data_corrupt.squeeze() actual_im = rescale_im(data.numpy()) orig_im = rescale_im(orig_im) try_im = rescale_im(try_im).squeeze() for i, (im, t_im, actual_im_i) in enumerate( zip(orig_im[:20], try_im[:20], actual_im)): shape = orig_im.shape[1:] new_im = np.zeros((shape[0], shape[1] * 3, *shape[2:])) size = shape[1] new_im[:, :size] = im new_im[:, size:2 * size] = t_im new_im[:, 2 * size:] = actual_im_i log_image(new_im, logger, 'val_gen_{}'.format(itr), step=i) score, std = get_inception_score(list(try_im), splits=1) print("///Inception score of {} with std of {}".format( score, std)) kvs = {} kvs['inception_score'] = score kvs['inception_score_std'] = std logger.writekvs(kvs) if score > best_inception: best_inception = score saver.save(sess, osp.join(FLAGS.logdir, FLAGS.exp, 'model_best')) if itr > 60000 and FLAGS.dataset == "mnist": assert False itr += 1 print("Training iteration:%d" % itr) saver.save(sess, osp.join(FLAGS.logdir, FLAGS.exp, 'model_{}'.format(itr)))
class Agent(): def __init__(self, state_size, action_size, policy_network, value_network, n_agents, device, use_gae=True): self.state_size = state_size self.action_size = action_size self.n_agents = n_agents self.device = device self.policy_network = policy_network( state_size=state_size, action_size=action_size).to(device) self.policy_optimizer = optim.Adam(self.policy_network.parameters(), lr=LR) self.value_network = value_network(state_size=state_size, action_size=1).to(device) self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=LR) self.epsilon = EPSILON self.beta = BETA self.reset_memory() self.buffer = ReplayBuffer(int(128), 64) self.use_gae = use_gae def reset_memory(self): self.rnn_memory = None def policy_loss(self, old_log_probs, states, actions, rewards, epsilon=EPSILON, beta=BETA): distribution, _ = self.policy_network(states, None) new_log_prob = distribution.log_prob(actions) new_probs = torch.exp(new_log_prob) ratio = torch.exp(new_log_prob - old_log_probs) # clipped function clip = torch.clamp(ratio, 1 - epsilon, 1 + epsilon) rewards = rewards.reshape(self.n_agents, clip.shape[1], -1) clipped_surrogate = torch.min(ratio * rewards, clip * rewards) entropy = -(new_probs * old_log_probs + (1.0 - new_probs) * old_log_probs) loss = (clipped_surrogate + beta * entropy).mean() return loss def value_loss(self, states, rewards): estimated_value = self.value_network(states).reshape(self.n_agents, -1) return (estimated_value - rewards).pow(2).mean(1).mean() def act(self, state): state = torch.from_numpy(state).float().to(self.device).unsqueeze(1) self.policy_network.eval() with torch.no_grad(): action_distribution, self.rnn_memory = self.policy_network( state, self.rnn_memory) self.policy_network.train() action = action_distribution.sample() return action.detach().cpu().numpy() def action_probs(self, states, actions): self.policy_network.eval() log_probs = None with torch.no_grad(): distribution, _ = self.policy_network(states, None) log_probs = distribution.log_prob(actions).detach() self.policy_network.train() return log_probs def learn(self, trajectory): states = torch.from_numpy(trajectory['states']).float().to(self.device) actions = torch.from_numpy(trajectory['actions']).float().to( self.device) rewards = rewards_to_go(trajectory['rewards'], self.n_agents, self.device) next_states = torch.from_numpy(trajectory['next_states']).float().to( self.device) dones = torch.from_numpy(trajectory['dones']).float().to(self.device) log_probs = self.action_probs(states, actions) policy_signal = None if self.use_gae: self.buffer.add(states, rewards) policy_signal = generalized_advantage_estimate( states, rewards, next_states, dones, self.value_network).detach() else: policy_signal = rewards # print(policy_signal.shape) # policy_signal = (policy_signal - policy_signal.mean()) / (policy_signal.std() + 1e-10) # Optimize Policy for _ in range(TRAIN_P_ITERS): self.policy_optimizer.zero_grad() pl = self.policy_loss(log_probs, states, actions, policy_signal, self.epsilon, self.beta) writer.add_scalar('loss/policy', pl.cpu().detach().numpy()) pl.backward() torch.nn.utils.clip_grad_norm_(self.policy_network.parameters(), 1) self.policy_optimizer.step() del pl if self.use_gae: # Optimize Value Function for _ in range(TRAIN_V_ITERS): self.value_optimizer.zero_grad() s_, r_ = self.buffer.sample() all_rewards = torch.stack(r_) r_mean = all_rewards.mean() r_std = all_rewards.std() + 1e-10 losses = [] for s, r in zip(s_, r_): losses.append(self.value_loss(s, r).mean()) loss = 
torch.stack(losses).mean() writer.add_scalar('loss/value', loss.cpu().detach().numpy()) loss.backward() self.value_optimizer.step() del loss self.epsilon *= .999 self.beta *= .995 self.reset_memory()
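learn() calls generalized_advantage_estimate, which is not included in this snippet. A minimal GAE(λ) sketch under common assumptions (time along the last dimension, value_network returning V(s)); the actual helper used by this agent may differ:

import torch

def generalized_advantage_estimate(states, rewards, next_states, dones,
                                    value_network, gamma=0.99, lam=0.95):
    # delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
    # A_t     = delta_t + gamma * lam * (1 - done_t) * A_{t+1}
    with torch.no_grad():
        values = value_network(states).squeeze(-1)
        next_values = value_network(next_states).squeeze(-1)
    deltas = rewards + gamma * next_values * (1.0 - dones) - values
    advantages = torch.zeros_like(deltas)
    gae = torch.zeros_like(deltas[..., 0])
    for t in reversed(range(deltas.shape[-1])):
        gae = deltas[..., t] + gamma * lam * (1.0 - dones[..., t]) * gae
        advantages[..., t] = gae
    return advantages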
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, num_agents, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(random_seed) self.timestep = 0 # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done, agent_num): """Save experience in replay memory, and use random sample from buffer to learn.""" self.timestep += 1 # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA, agent_num) def act(self, states, eps, add_noise=True): """Returns actions for given state as per current policy.""" states = torch.from_numpy(states).float().to(device) actions = np.zeros((self.num_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): for agent_num, state in enumerate(states): action = self.actor_local(state).cpu().data.numpy() actions[agent_num, :] = action self.actor_local.train() if add_noise: actions += eps * self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma, agent_num): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # -------------------------- update critic -------------------------- # # Get predicted next-state actions and Q values from target models # Construct the joint next-action vector relative to the agent actions_next = self.actor_target(next_states) if agent_num == 0: actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1) else: actions_next = torch.cat((actions[:, :2], actions_next), dim=1) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # -------------------------- update actor -------------------------- # # Compute actor loss actions_pred = self.actor_local(states) if agent_num == 0: actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1) else: actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # --------------------- update target networks --------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed=0, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC, gamma=GAMMA, checkpoint_path='./checkpoints/', pretrained=False): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.gamma = gamma self.checkpoint_path = checkpoint_path # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, seed).to(device) self.actor_target = Actor(state_size, action_size, seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, seed).to(device) self.critic_target = Critic(state_size, action_size, seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic) # If pretrained, load weights if pretrained: actor_dict = torch.load(os.path.join(self.checkpoint_path,'checkpoint_actor.pth')) critic_dict = torch.load(os.path.join(self.checkpoint_path,'checkpoint_critic.pth')) self.actor_local.load_state_dict(actor_dict) self.actor_target.load_state_dict(actor_dict) self.critic_local.load_state_dict(critic_dict) self.critic_target.load_state_dict(critic_dict) # Noise process self.noise = OUNoise(action_size, seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device) def step(self, state, action, reward, next_state, done, tstep=LEARN_EVERY+1): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE and tstep % LEARN_EVERY == 0: for _ in range(LEARN_NUM): experiences = self.memory.sample() self.learn(experiences) def train(self, env, n_episodes=1000): """Deep Deterministic Policy Gradient (DDPG) Learning. 
Params ====== env (UnityEnvironment): Unity environment n_episodes (int): maximum number of training episodes """ # create checkpoints folder if necessary if not os.path.exists(self.checkpoint_path): os.makedirs(self.checkpoint_path) # get the default brain brain_name = env.brain_names[0] env_info = env.reset(train_mode=True)[brain_name] num_agents = len(env_info.agents) # last 100 scores scores_deque = deque(maxlen=100) # list containing scores from each episode all_scores = [] # list containing window averaged scores avg_scores = [] # for each episode for i_episode in range(1, n_episodes+1): # reset environment env_info = env.reset(train_mode=True)[brain_name] states = env_info.vector_observations # reset noise self.reset() scores = np.zeros(num_agents) # for each timepoint t=0 while True: # agent action actions = self.act(states) # get the next state env_info = env.step(actions)[brain_name] next_states = env_info.vector_observations # get the reward rewards = env_info.rewards # see if episode has ended dones = env_info.local_done # step for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones): self.step(state, action, reward, next_state, done, t) states = next_states scores += rewards t+=1 if np.any(dones): break # save most recent score max_score = np.max(scores) scores_deque.append(max_score) all_scores.append(max_score) avg_scores.append(np.mean(scores_deque)) print('\rEpisode {}\tScore: {:.2f}\tMax Score: {:.2f}'.format(i_episode, max_score, np.mean(scores_deque)), end="") if i_episode % 50 == 0: print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque))) if np.mean(scores_deque)>=0.5: print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_deque))) torch.save(self.actor_local.state_dict(), self.checkpoint_path+'checkpoint_actor.pth') torch.save(self.critic_local.state_dict(), self.checkpoint_path+'checkpoint_critic.pth') break return all_scores, avg_scores def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) self.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) def play(self, env, n_episodes=5): """Play a few episodes with trained agents. Params ====== env (UnityEnvironment): Unity environment n_episodes (int): maximum number of training episodes """ # get the default brain brain_name = env.brain_names[0] brain = env.brains[brain_name] # reset the environment env_info = env.reset(train_mode=False)[brain_name] num_agents = len(env_info.agents) action_size = brain.vector_action_space_size state_size = env_info.vector_observations.shape[1] # for each episode for i_episode in range(1, n_episodes+1): env_info = env.reset(train_mode=False)[brain_name] states = env_info.vector_observations self.reset() # set the noise to zero score = np.zeros(num_agents) while(True): actions = self.act(states, add_noise=False) env_info = env.step(actions)[brain_name] # get the next states next_states = env_info.vector_observations # get the rewards rewards = env_info.rewards # see if the episode has finished for any agent dones = env_info.local_done self.step(states, actions, rewards, next_states, dones) states = next_states score += rewards if np.any(dones): break print('Best Score:', np.max(score)) env.close()
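All of the DDPG variants above draw exploration noise from an OUNoise process with reset() and sample() that is not defined here. A minimal Ornstein-Uhlenbeck sketch under typical hyperparameters (theta=0.15, sigma=0.2), where size may be an int or a (num_agents, action_size) tuple:

import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, seed=0, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        np.random.seed(seed)
        self.reset()

    def reset(self):
        # start each episode from the long-run mean
        self.state = copy.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1)
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state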
def train(target_vars, saver, sess, logger, dataloader, resume_iter, logdir): X = target_vars['X'] X_NOISE = target_vars['X_NOISE'] train_op = target_vars['train_op'] energy_pos = target_vars['energy_pos'] energy_neg = target_vars['energy_neg'] loss_energy = target_vars['loss_energy'] loss_ml = target_vars['loss_ml'] loss_total = target_vars['total_loss'] gvs = target_vars['gvs'] x_off = target_vars['x_off'] x_grad = target_vars['x_grad'] x_mod = target_vars['x_mod'] LABEL = target_vars['LABEL'] HIER_LABEL = target_vars['HIER_LABEL'] LABEL_POS = target_vars['LABEL_POS'] eps = target_vars['eps_begin'] ATTENTION_MASK = target_vars['ATTENTION_MASK'] attention_mask = target_vars['attention_mask'] attention_grad = target_vars['attention_grad'] if FLAGS.prelearn_model or FLAGS.prelearn_model_shape: models_pretrain = target_vars['models_pretrain'] if not FLAGS.comb_mask: attention_mask = tf.zeros(1) attention_grad = tf.zeros(1) if FLAGS.use_attention: gamma = weights['atten']['gamma'] else: gamma = tf.zeros(1) gvs_dict = dict(gvs) log_output = [ train_op, energy_pos, energy_neg, eps, loss_energy, loss_ml, loss_total, x_grad, x_off, x_mod, attention_mask, attention_grad, *gvs_dict.keys()] output = [train_op, x_mod] print("log_output ", log_output) replay_buffer = ReplayBuffer(10000) itr = resume_iter x_mod = None gd_steps = 1 dataloader_iterator = iter(dataloader) best_inception = 0.0 for epoch in range(FLAGS.epoch_num): for data_corrupt, data, label in dataloader: data_corrupt = data_corrupt_init = data_corrupt.numpy() data_corrupt_init = data_corrupt.copy() data = data.numpy() if FLAGS.mixup: idx = np.random.permutation(data.shape[0]) lam = np.random.beta(1, 1, size=(data.shape[0], 1, 1, 1)) data = data * lam + data[idx] * (1 - lam) if FLAGS.replay_batch and (x_mod is not None) and not FLAGS.joint_baseline: replay_buffer.add(compress_x_mod(x_mod)) if len(replay_buffer) > FLAGS.batch_size: replay_batch = replay_buffer.sample(FLAGS.batch_size) replay_batch = decompress_x_mod(replay_batch) replay_mask = ( np.random.uniform( 0, FLAGS.rescale, FLAGS.batch_size) > FLAGS.keep_ratio) data_corrupt[replay_mask] = replay_batch[replay_mask] if FLAGS.pcd: if x_mod is not None: data_corrupt = x_mod attention_mask = np.random.uniform(-1., 1., (data.shape[0], 64, 64, int(FLAGS.cond_func))) feed_dict = {X_NOISE: data_corrupt, X: data, ATTENTION_MASK: attention_mask} if FLAGS.joint_baseline: feed_dict[target_vars['NOISE']] = np.random.uniform(-1., 1., (data.shape[0], 128)) if FLAGS.prelearn_model or FLAGS.prelearn_model_shape: _, _, labels = zip(*models_pretrain) labels = [LABEL, LABEL_POS] + list(labels) for lp, l in zip(labels, label): # print("lp, l ", lp, l) # print("l shape ", l.shape) feed_dict[lp] = l else: label = label.numpy() label_init = label.copy() if FLAGS.cclass: feed_dict[LABEL] = label feed_dict[LABEL_POS] = label_init if FLAGS.heir_mask: feed_dict[HIER_LABEL] = label if itr % FLAGS.log_interval == 0: # print(feed_dict.keys()) # print(feed_dict) _, e_pos, e_neg, eps, loss_e, loss_ml, loss_total, x_grad, x_off, x_mod, attention_mask, attention_grad, * \ grads = sess.run(log_output, feed_dict) kvs = {} kvs['e_pos'] = e_pos.mean() kvs['e_pos_std'] = e_pos.std() kvs['e_neg'] = e_neg.mean() kvs['e_diff'] = kvs['e_pos'] - kvs['e_neg'] kvs['e_neg_std'] = e_neg.std() kvs['loss_e'] = loss_e.mean() kvs['loss_ml'] = loss_ml.mean() kvs['loss_total'] = loss_total.mean() kvs['x_grad'] = np.abs(x_grad).mean() kvs['attention_grad'] = np.abs(attention_grad).mean() kvs['x_off'] = x_off.mean() kvs['iter'] = itr for 
v, k in zip(grads, [v.name for v in gvs_dict.values()]): kvs[k] = np.abs(v).max() string = "Obtained a total of " for key, value in kvs.items(): string += "{}: {}, ".format(key, value) if kvs['e_diff'] < -0.5: print("Training is unstable") assert False print(string) logger.writekvs(kvs) else: _, x_mod = sess.run(output, feed_dict) if itr % FLAGS.save_interval == 0: saver.save( sess, osp.join( FLAGS.logdir, FLAGS.exp, 'model_{}'.format(itr))) if itr > 30000: assert False # For some reason conditioning on position fails earlier # if FLAGS.cond_pos and itr > 30000: # assert False if itr % FLAGS.test_interval == 0 and not FLAGS.joint_baseline and FLAGS.dataset != 'celeba': try_im = x_mod orig_im = data_corrupt.squeeze() actual_im = rescale_im(data) if not FLAGS.comb_mask: attention_mask = np.random.uniform(-1., 1., (data.shape[0], 64, 64, int(FLAGS.cond_func))) orig_im = rescale_im(orig_im) try_im = rescale_im(try_im).squeeze() attention_mask = rescale_im(attention_mask) for i, (im, t_im, actual_im_i, attention_im) in enumerate( zip(orig_im[:20], try_im[:20], actual_im, attention_mask)): im, t_im, actual_im_i, attention_im = im[::-1], t_im[::-1], actual_im_i[::-1], attention_im[::-1] shape = orig_im.shape[1:] new_im = np.zeros((shape[0], shape[1] * (3 + FLAGS.cond_func), *shape[2:])) size = shape[1] new_im[:, :size] = im new_im[:, size:2 * size] = t_im new_im[:, 2 * size: 3 * size] = actual_im_i for i in range(FLAGS.cond_func): new_im[:, (3+i) * size: (4+i) * size] = np.tile(attention_im[:, :, i:i+1], (1, 1, 3)) log_image( new_im, logger, 'train_gen_{}'.format(itr), step=i) test_im = x_mod try: data_corrupt, data, label = next(dataloader_iterator) except BaseException: dataloader_iterator = iter(dataloader) data_corrupt, data, label = next(dataloader_iterator) data_corrupt = data_corrupt.numpy() itr += 1 saver.save(sess, osp.join(FLAGS.logdir, FLAGS.exp, 'model_{}'.format(itr)))
def train(): """ init dir and log config """ init_cluster_ray() base_dir, ckpt_dir, summary_dir = init_dir_and_log() kwargs = FLAGS.flag_values_dict() kwargs["BASE_DIR"] = base_dir kwargs["ckpt_dir"] = ckpt_dir act_space = int(FLAGS.act_space) kwargs["act_space"] = act_space """ get one seg from rollout worker for dtype and shapes :param kwargs rollout worker config """ logging.info('get one seg from Evaluator for dtype and shapes') ps = AsyncPS.remote() small_data_collector = RolloutCollector( server_nums=1, ps=ps, policy_evaluator_build_func=build_policy_evaluator, **kwargs) cache_struct_path = '/tmp/%s.pkl' % FLAGS.dir structure = fetch_one_structure(small_data_collector, cache_struct_path=cache_struct_path, is_head=True) del small_data_collector """ init data prefetch thread, prepare_input_pipe """ keys = list(structure.keys()) dtypes = [structure[k].dtype for k in keys] shapes = [structure[k].shape for k in keys] segBuffer = tf.queue.RandomShuffleQueue( capacity=FLAGS.qsize * FLAGS.batch_size, min_after_dequeue=FLAGS.qsize * FLAGS.batch_size // 2, dtypes=dtypes, shapes=shapes, names=keys, shared_name="buffer") server_nums = FLAGS.nof_evaluator nof_server_gpus = FLAGS.nof_server_gpus server_nums_refine = server_nums // nof_server_gpus data_collector = RolloutCollector( server_nums=server_nums_refine, ps=ps, policy_evaluator_build_func=build_policy_evaluator, **kwargs) config = tf.ConfigProto( allow_soft_placement=True, gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=1)) config.gpu_options.allow_growth = True sess = tf.Session(config=config) reader = QueueReader(sess=sess, global_queue=segBuffer, data_collector=data_collector, keys=keys, dtypes=dtypes, shapes=shapes) reader.daemon = True reader.start() dequeued = segBuffer.dequeue_many(FLAGS.batch_size) # ////////////////////// if FLAGS.use_demo: demo_buffer = build_demo_buffer(keys, 0.9) replay_buffer = ReplayBuffer(10000, keys) batch_weights = tf.placeholder(tf.float32, shape=[None]) phs = { key: tf.placeholder(dtype=dtype, shape=[None] + list(shape)) for key, dtype, shape in zip(keys, dtypes, shapes) } from_where = phs else: from_where = dequeued batch_weights = tf.ones(FLAGS.batch_size) # ////////////////////// prephs, postphs = dict(), dict() for k, v in from_where.items(): if k == "state_in": prephs[k] = v else: prephs[k], postphs[k] = tf.split( v, [FLAGS.burn_in, FLAGS.seqlen + FLAGS.n_step], axis=1) prekeys = list(prephs.keys()) postkeys = list(postphs.keys()) """ count frame and total steps """ num_frames = tf.get_variable('num_environment_frames', initializer=tf.zeros_initializer(), shape=[], dtype=tf.int32, trainable=False) tf.summary.scalar("frames", num_frames) global_step = tf.train.get_or_create_global_step() dur_time_tensor = tf.placeholder(dtype=tf.float32) tf.summary.scalar('time_per_step', dur_time_tensor) """ set stage_op and build learner """ with tf.device("/gpu"): if FLAGS.use_stage: area = tf.contrib.staging.StagingArea( [prephs[key].dtype for key in prekeys] + [postphs[key].dtype for key in postkeys], [prephs[key].shape for key in prekeys] + [postphs[key].shape for key in postkeys]) stage_op = area.put([prephs[key] for key in prekeys] + [postphs[key] for key in postkeys]) from_stage = area.get() predatas = {key: from_stage[i] for i, key in enumerate(prekeys)} postdatas = { key: from_stage[i + len(prekeys)] for i, key in enumerate(postkeys) } else: stage_op = [] predatas, postdatas = prephs, postphs num_frames_and_train, global_step_and_train, init_target_op, priority, beta = build_learner( 
pre=predatas, post=postdatas, act_space=act_space, num_frames=num_frames, batch_weights=batch_weights) """ add summary """ summary_ops = tf.summary.merge_all() summary_writer = tf.summary.FileWriter(summary_dir, sess.graph) """ initialize and save ckpt """ saver = tf.train.Saver(max_to_keep=100, keep_checkpoint_every_n_hours=6) ckpt = tf.train.get_checkpoint_state(ckpt_dir) if ckpt and ckpt.model_checkpoint_path: saver.restore(sess, ckpt.model_checkpoint_path) else: sess.run(tf.global_variables_initializer()) ws = Model.get_ws(sess) logging.info('pushing weight to ps') ray.get(ps.push.remote(ws)) saver.save(sess, os.path.join(ckpt_dir, "CKPT"), global_step=global_step) """ step """ total_frames = 0 sess.run(stage_op) sess.run(init_target_op) if FLAGS.use_demo: dequeued_datas, sample_beta = sess.run([dequeued, beta]) replay_buffer.add_batch(dequeued_datas, FLAGS.batch_size) dur_time = 0 while total_frames < FLAGS.total_environment_frames: start = time.time() if FLAGS.use_demo: batch_size = np.random.binomial(FLAGS.batch_size - 2, 0.99) + 1 demo_batch_size = FLAGS.batch_size - batch_size datas = replay_buffer.sample(batch_size) demo_datas, demo_is_weights, demo_idxes = demo_buffer.sample( demo_batch_size, sample_beta) fd = { phs[k]: np.concatenate([datas[k], demo_datas[k]], axis=0) for k in keys } fd[batch_weights] = np.concatenate( [np.ones(batch_size), np.zeros(demo_batch_size)], axis=0) fd[dur_time_tensor] = dur_time total_frames, gs, summary, _, p, sample_beta, dequeued_datas = sess.run( [ num_frames_and_train, global_step_and_train, summary_ops, stage_op, priority, beta, dequeued ], feed_dict=fd) demo_buffer.update_priorities(demo_idxes, p[batch_size:]) replay_buffer.add_batch(dequeued_datas, FLAGS.batch_size) else: fd = {dur_time_tensor: dur_time} total_frames, gs, summary, _ = sess.run([ num_frames_and_train, global_step_and_train, summary_ops, stage_op ], feed_dict=fd) if gs % FLAGS.target_update == 0: sess.run(init_target_op) if gs % 25 == 0: ws = Model.get_ws(sess) logging.info('pushing weight to ps') try: ray.get(ps.push.remote(ws)) except ray.exceptions.UnreconstructableError as e: logging.info(str(e)) except ray.exceptions.RayError as e: logging.info(str(e)) if gs % 1000 == 0: saver.save(sess, os.path.join(ckpt_dir, "CKPT"), global_step=global_step) if gs % 1 == 0: summary_writer.add_summary(summary, global_step=gs) dur_time = time.time() - start msg = "Global Step %d, Total Frames %d, Time Consume %.2f" % ( gs, total_frames, dur_time) logging.info(msg) saver.save(sess, os.path.join(ckpt_dir, "CKPT"), global_step=global_step)
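The training loop above mixes freshly dequeued segments with a demo_buffer that is sampled with an importance exponent beta and later refreshed via update_priorities; that buffer is not shown. A small proportional prioritized-replay sketch matching the assumed interface (sample(batch_size, beta) -> (items, is_weights, idxes)); the real buffer stores dict-structured segments, so this is only illustrative:

import numpy as np

class PrioritizedDemoBuffer:
    # hypothetical stand-in for the demo_buffer used above
    def __init__(self, capacity, alpha=0.6):
        self.alpha = alpha
        self.capacity = capacity
        self.data = []
        self.priorities = np.zeros(capacity, dtype=np.float32)
        self.pos = 0

    def add(self, item, priority=1.0):
        if len(self.data) < self.capacity:
            self.data.append(item)
        else:
            self.data[self.pos] = item
        self.priorities[self.pos] = priority ** self.alpha
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, beta):
        p = self.priorities[:len(self.data)]
        probs = p / p.sum()
        idxes = np.random.choice(len(self.data), batch_size, p=probs)
        # importance-sampling weights correct for the non-uniform sampling
        weights = (len(self.data) * probs[idxes]) ** (-beta)
        weights /= weights.max()
        batch = [self.data[i] for i in idxes]
        return batch, weights, idxes

    def update_priorities(self, idxes, priorities):
        self.priorities[idxes] = np.asarray(priorities) ** self.alpha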