class Trainer(object): def __init__(self, agent, env): self.agent = agent self.env = env self.seed = random.randint(0, 20180818) self.optimizer = optim.Adam(agent.parameters, lr=LEARNING_RATE) self.buffer = ReplayBuffer(capacity=CAPACITY) self.total_step = 0 def run(self, device='cpu', buffer=False, explore=False): """Run an episode and buffer""" self.env.reset() self.env.env.seed(self.seed) state = self.env.get_screen() states = np.asarray([state for _ in range(4)]) # shape (4, 84, 84) step = 0 accumulated_reward = 0 while True: action = self.agent.make_action(torch.Tensor([states]).to(device), explore=explore) state_next, reward, done = self.env.step(action) states_next = np.concatenate([states[1:, :, :], [state_next]], axis=0) step += 1 accumulated_reward += reward if buffer: self.buffer.append(states, action, reward, states_next, done) states = states_next if done: break return accumulated_reward, step def _fill_buffer(self, num, device='cpu'): start = time.time() while self.buffer.size < num: self.run(device, buffer=True, explore=True) print('Fill buffer: {}/{}'.format(self.buffer.size, self.buffer.capacity)) print('Filling buffer takes {:.3f} seconds'.format(time.time() - start)) def train(self, device='cpu'): self.env.change_record_every_episode(100000000) self._fill_buffer(OBSERV, device) if self.env.record_every_episode: self.env.change_record_every_episode(self.env.record_every_episode) episode = 0 while 'training' != 'converge': self.env.reset() state = self.env.get_screen() states = np.asarray([state for _ in range(4)]) # shape (4, 84, 84) step_prev = self.total_step accumulated_reward = 0 done = False n_flap = 0 n_none = 0 while not done: #### -------------------- #### Add a new transition action = self.agent.make_action(torch.Tensor([states ]).to(device), explore=True) state_next, reward, done = self.env.step(action) states_next = np.concatenate([states[1:, :, :], [state_next]], axis=0) self.total_step += 1 accumulated_reward += reward self.buffer.append(states, action, reward, states_next, done) states = states_next #### -------------------- #### -------------------- #### Training step start = time.time() # prepare training data minibatch = self.buffer.sample(n_sample=BATCH) _states = [b[0] for b in minibatch] _actions = [b[1] for b in minibatch] _rewards = [b[2] for b in minibatch] _states_next = [b[3] for b in minibatch] _dones = [b[4] for b in minibatch] ys = [] for i in range(len(minibatch)): terminal = _dones[i] r = _rewards[i] if terminal: y = r else: # Double DQN s_t_next = torch.Tensor([_states_next[i]]).to(device) online_act = self.agent.make_action(s_t_next) y = r + DISCOUNT * self.agent.Q( s_t_next, online_act, target=True) ys.append(y) ys = torch.Tensor(ys).to(device) # Apply gradient self.optimizer.zero_grad() input = torch.Tensor(_states).to(device) output = self.agent.net(input) # shape (BATCH, 2) actions_one_hot = np.zeros([BATCH, 2]) actions_one_hot[np.arange(BATCH), _actions] = 1.0 actions_one_hot = torch.Tensor(actions_one_hot).to(device) ys_hat = (output * actions_one_hot).sum(dim=1) loss = F.smooth_l1_loss(ys_hat, ys) loss.backward() self.optimizer.step() #### -------------------- # logging if action == 0: n_flap += 1 else: n_none += 1 if done and self.total_step % LOGGING_CYCLE == 0: log = '[{}, {}] alive: {}, reward: {}, F/N: {}/{}, loss: {:.4f}, epsilon: {:.4f}, time: {:.3f}'.format( episode, self.total_step, self.total_step - step_prev, accumulated_reward, n_flap, n_none, loss.item(), self.agent.epsilon, time.time() - start) print(log) 
self.agent.update_epsilon() if self.total_step % TARGET_UPDATE_CYCLE == 0: #print('[Update target network]') self.agent.update_target() if self.total_step % SAVE_MODEL_CYCLE == 0: print('[Save model]') self.save(id=self.total_step) episode += 1 def save(self, id): filename = 'tmp/models/model_{}.pth.tar'.format(id) dirpath = os.path.dirname(filename) if not os.path.exists(dirpath): os.mkdir(dirpath) checkpoint = { 'net': self.agent.net.state_dict(), 'target': self.agent.target.state_dict(), 'optimizer': self.optimizer.state_dict(), 'total_step': self.total_step } torch.save(checkpoint, filename) def load(self, filename, device='cpu'): ckpt = torch.load(filename, map_location=lambda storage, loc: storage) ## Deal with the missing of bn.num_batches_tracked net_new = OrderedDict() tar_new = OrderedDict() for k, v in ckpt['net'].items(): for _k, _v in self.agent.net.state_dict().items(): if k == _k: net_new[k] = v for k, v in ckpt['target'].items(): for _k, _v in self.agent.target.state_dict().items(): if k == _k: tar_new[k] = v self.agent.net.load_state_dict(net_new) self.agent.target.load_state_dict(tar_new) ## ----------------------------------------------- self.optimizer.load_state_dict(ckpt['optimizer']) self.total_step = ckpt['total_step']
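# A minimal replay-buffer sketch matching the interface the Trainer above relies on
# (ReplayBuffer(capacity=...), .append(states, action, reward, states_next, done),
# .sample(n_sample=...), .size, .capacity). The real implementation is not shown in this
# snippet, so this deque-based version is only an illustrative assumption.
import random
from collections import deque


class ReplayBuffer(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self._data = deque(maxlen=capacity)

    @property
    def size(self):
        return len(self._data)

    def append(self, states, action, reward, states_next, done):
        # Store one transition as a tuple; old transitions are evicted FIFO.
        self._data.append((states, action, reward, states_next, done))

    def sample(self, n_sample):
        # Uniformly sample a minibatch of transitions without replacement.
        return random.sample(list(self._data), n_sample)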
class DQNAgent:
    def __init__(
            self,
            n_actions,
            learning_rate=0.001,
            gamma=0.9,
            #gamma=0.95,
            batch_size=64,
            replay_buffer_size=200000,
            replay_start_size=1000):
        """
        :param n_actions: the number of possible actions
        :param learning_rate: the learning rate for the optimizer
        :param gamma: discount factor
        :param batch_size: size of a minibatch
        :param replay_buffer_size: the size of the replay memory
        :param replay_start_size: the initial size of the replay memory before learning starts
        :param target_update_interval: number of steps between consecutive updates of the target network
        """
        self.n_actions = n_actions
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.batch_size = batch_size

        # Create the replay buffer
        #self.replay_buffer = deque(maxlen=replay_buffer_size)
        self.replay_buffer = ReplayBuffer(max_size=replay_buffer_size)
        self.replay_start_size = replay_start_size

        # Build the Q-network
        self.q_network = self.build_model()
        # Create the target network as a copy of the Q-network
        self.target_network = keras.models.clone_model(self.q_network)

        # Create the optimizer
        self.optimizer = keras.optimizers.Adam(self.learning_rate)

        self.training_step = 0

    def build_model(self):
        model = keras.models.Sequential([
            layers.Dense(256, activation='relu'),
            layers.Dense(256, activation='relu'),
            layers.Dense(256, activation='relu'),
            layers.Dense(self.n_actions)
        ])
        return model

    def select_action(self, state, epsilon):
        """
        An epsilon-greedy action selection
        :param state: the current state of the environment
        :param epsilon: the exploration rate
        :return: an action
        """
        if np.random.rand() < epsilon:
            return np.random.choice(self.n_actions)
        else:
            q_values = self.q_network.predict(np.expand_dims(state, axis=0))[0]
            return np.argmax(q_values)

    def remember(self, state, action, reward, next_state, done, info):
        """Store a new transition in the replay buffer"""
        self.replay_buffer.append((state, action, reward, next_state, done))

    def sample_transitions(self):
        #indices = np.random.randint(len(self.replay_buffer), size=self.batch_size)
        #mini_batch = [self.replay_buffer[index] for index in indices]
        mini_batch = self.replay_buffer.sample(self.batch_size)
        states, actions, rewards, next_states, dones = [
            np.array([transition[field_index] for transition in mini_batch])
            for field_index in range(5)
        ]
        return states, actions, rewards, next_states, dones

    def train(self):
        """Perform a single training step on the network"""
        # Check that we have enough transitions in the replay buffer
        if len(self.replay_buffer) < max(self.batch_size, self.replay_start_size):
            return

        # Sample transitions from the replay buffer
        states, actions, rewards, next_states, dones = self.sample_transitions()

        # Compute the target Q values for the sampled transitions
        next_q_values = self.target_network.predict(next_states)
        max_next_q_values = np.max(next_q_values, axis=1)
        target_q_values = rewards + (1 - dones) * self.gamma * max_next_q_values

        with tf.GradientTape() as tape:
            # Forward pass: compute the Q-values for the states in the batch
            all_q_values = self.q_network(states)
            # Mask out the Q-values for the non-chosen actions
            mask = tf.one_hot(actions, self.n_actions)
            q_values = tf.reduce_sum(all_q_values * mask, axis=1)
            # Compute the loss between the targets and the Q-values
            loss_fn = keras.losses.Huber()
            loss = loss_fn(target_q_values, q_values)

        # Perform a gradient descent step to minimize the loss with respect
        # to the model's trainable variables
        gradients = tape.gradient(loss, self.q_network.trainable_variables)
        self.optimizer.apply_gradients(
            zip(gradients, self.q_network.trainable_variables))

    def update_target_network(self):
        self.target_network.set_weights(self.q_network.get_weights())

    def save_model(self, folder, env_id):
        """Save the network params to a file"""
        agent_file = os.path.join(folder, f'{env_id}.h5')
        keras.models.save_model(self.q_network, agent_file)

    def load_model(self, folder, env_id):
        """Load the network params from a file"""
        agent_file = os.path.join(folder, f'{env_id}.h5')
        self.q_network = keras.models.load_model(agent_file)
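# A hedged usage sketch for the Keras DQNAgent above: a CartPole training loop with a
# simple linear epsilon schedule. The environment name, step counts, epsilon values and
# target-update interval are illustrative choices, the classic Gym API (reset() returning
# only the observation) is assumed, and the ReplayBuffer class the agent constructs is
# assumed to be importable from the surrounding project. Because keras.models.clone_model
# copies the architecture but not the weights, update_target_network() is called
# periodically once training has started to keep the target in sync.
import gym

env = gym.make('CartPole-v1')
agent = DQNAgent(n_actions=env.action_space.n)

target_update_interval = 500   # illustrative value
total_steps = 0

for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        epsilon = max(0.05, 1.0 - total_steps / 10000)   # illustrative schedule
        action = agent.select_action(state, epsilon)
        next_state, reward, done, info = env.step(action)
        agent.remember(state, action, reward, next_state, done, info)
        agent.train()                                    # no-op until the buffer is warm
        total_steps += 1
        # 1000 is the agent's default replay_start_size, so both networks exist by now.
        if total_steps > 1000 and total_steps % target_update_interval == 0:
            agent.update_target_network()
        state = next_state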
class Agent: def __init__(self): self.name = "expected_sarsa_agent" def agent_init(self, agent_config): """Setup for the agent called when the experiment first starts. Set parameters needed to setup the agent. Assume agent_config dict contains: { network_pickle: string (optional), network_config: dictionary, optimizer_config: dictionary, replay_buffer_size: integer, minibatch_sz: integer, num_replay_updates_per_step: float discount_factor: float, } """ self.replay_buffer = ReplayBuffer(agent_config['replay_buffer_size'], agent_config['minibatch_sz'], agent_config.get("seed")) if "network_pickle" in agent_config: self.network = pickle.load( open(agent_config["network_pickle"], 'rb')) else: self.network = ActionValueNetwork(agent_config['network_config']) self.optimizer = Adam(self.network.layer_sizes, agent_config["optimizer_config"]) self.num_actions = agent_config['network_config']['num_actions'] self.num_replay = agent_config['num_replay_updates_per_step'] self.discount = agent_config['gamma'] self.tau = agent_config['tau'] self.rand_generator = np.random.RandomState(agent_config.get("seed")) self.last_state = None self.last_action = None self.sum_rewards = 0 self.episode_steps = 0 def policy(self, state): """ Args: state (Numpy array): the state. Returns: the action. """ action_values = self.network.get_action_values(state) probs_batch = self.softmax(action_values, self.tau) action = self.rand_generator.choice(self.num_actions, p=probs_batch.squeeze()) return action def agent_start(self, state): """The first method called when the experiment starts, called after the environment starts. Args: state (Numpy array): the state from the environment's evn_start function. Returns: The first action the agent takes. """ self.sum_rewards = 0 self.episode_steps = 0 self.last_state = np.array([state]) self.last_action = self.policy(self.last_state) return self.last_action def agent_step(self, reward, state): """A step taken by the agent. Args: reward (float): the reward received for taking the last action taken state (Numpy array): the state from the environment's step based, where the agent ended up after the last step Returns: The action the agent is taking. """ self.sum_rewards += reward self.episode_steps += 1 state = np.array([state]) action = self.policy(state) self.replay_buffer.append(self.last_state, self.last_action, reward, 0, state) if self.replay_buffer.size() > self.replay_buffer.minibatch_size: current_q = deepcopy(self.network) for _ in range(self.num_replay): experiences = self.replay_buffer.sample() self.optimize_network(experiences, current_q) self.last_state = state self.last_action = action return action def agent_end(self, reward): """Run when the agent terminates. Args: reward (float): the reward the agent received for entering the terminal state. """ self.sum_rewards += reward self.episode_steps += 1 state = np.zeros_like(self.last_state) self.replay_buffer.append(self.last_state, self.last_action, reward, 1, state) if self.replay_buffer.size() > self.replay_buffer.minibatch_size: current_q = deepcopy(self.network) for _ in range(self.num_replay): experiences = self.replay_buffer.sample() self.optimize_network(experiences, current_q) def agent_message(self, message): if message == "get_sum_reward": return self.sum_rewards else: raise Exception("Unrecognized Message!") def softmax(self, action_values, tau=1.0): """ Args: action_values (Numpy array): A 2D array of shape (batch_size, num_actions). The action-values computed by an action-value network. 
tau (float): The temperature parameter scalar. Returns: A 2D array of shape (batch_size, num_actions). Where each column is a probability distribution over the actions representing the policy. """ preferences = action_values / tau max_preference = np.amax(preferences, 1) reshaped_max_preference = max_preference.reshape((-1, 1)) exp_preferences = np.exp(preferences - reshaped_max_preference) sum_of_exp_preferences = np.sum(exp_preferences, 1) reshaped_sum_of_exp_preferences = sum_of_exp_preferences.reshape( (-1, 1)) action_probs = exp_preferences / reshaped_sum_of_exp_preferences action_probs = action_probs.squeeze() return action_probs def get_td_error(self, states, next_states, actions, rewards, terminals, current_q): """ Args: states (Numpy array): The batch of states with the shape (batch_size, state_dim). next_states (Numpy array): The batch of next states with the shape (batch_size, state_dim). actions (Numpy array): The batch of actions with the shape (batch_size,). rewards (Numpy array): The batch of rewards with the shape (batch_size,). discount (float): The discount factor. terminals (Numpy array): The batch of terminals with the shape (batch_size,). network (ActionValueNetwork): The latest state of the network that is getting replay updates. current_q (ActionValueNetwork): The fixed network used for computing the targets, and particularly, the action-values at the next-states. Returns: The TD errors (Numpy array) for actions taken, of shape (batch_size,) """ q_next_mat = np.apply_along_axis(current_q.get_action_values, 1, next_states).squeeze() probs_mat = self.softmax(q_next_mat, self.tau) v_next_vec = np.einsum("ij,ij->i", probs_mat, q_next_mat) v_next_vec *= (1 - terminals) target_vec = rewards + self.discount * v_next_vec q_mat = np.apply_along_axis(self.network.get_action_values, 1, states).squeeze() batch_indices = np.arange(q_mat.shape[0]) q_vec = np.array([q_mat[i][actions[i]] for i in batch_indices]) delta_vec = target_vec - q_vec return delta_vec def optimize_network(self, experiences, current_q): """ Args: experiences (Numpy array): The batch of experiences including the states, actions, rewards, terminals, and next_states. discount (float): The discount factor. network (ActionValueNetwork): The latest state of the network that is getting replay updates. current_q (ActionValueNetwork): The fixed network used for computing the targets, and particularly, the action-values at the next-states. """ states, actions, rewards, terminals, next_states = map( list, zip(*experiences)) states = np.concatenate(states) next_states = np.concatenate(next_states) rewards = np.array(rewards) terminals = np.array(terminals) batch_size = states.shape[0] delta_vec = self.get_td_error(states, next_states, actions, rewards, terminals, current_q) batch_indices = np.arange(batch_size) delta_mat = np.zeros((batch_size, self.network.num_actions)) delta_mat[batch_indices, actions] = delta_vec td_update = self.network.get_TD_update(states, delta_mat) weights = self.optimizer.update_weights(self.network.get_weights(), td_update) self.network.set_weights(weights)
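# A small standalone check mirroring the temperature softmax used by the agent above:
# the row-wise max is subtracted before exponentiating for numerical stability, and a
# lower temperature tau concentrates the policy on the greedy action. The action values
# below are made-up numbers used only to illustrate the behaviour.
import numpy as np


def softmax(action_values, tau=1.0):
    preferences = action_values / tau
    max_preference = np.amax(preferences, axis=1).reshape((-1, 1))
    exp_preferences = np.exp(preferences - max_preference)
    return exp_preferences / np.sum(exp_preferences, axis=1).reshape((-1, 1))


action_values = np.array([[1.0, 2.0, 0.5],
                          [0.0, 0.0, 0.0]])
for tau in (10.0, 1.0, 0.1):
    probs = softmax(action_values, tau)
    assert np.allclose(probs.sum(axis=1), 1.0)
    print('tau =', tau, '->', np.round(probs[0], 3))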
def worker_process(job_name, task_index, cluster_dict, file_name): import tensorflow as tf # GPU training. if USE_GPU: os.environ["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE_DEVICES gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=PER_PROCESS_GPU_MEMORY_FRACTION) config = tf.ConfigProto(gpu_options=gpu_options) else: config = None # Create and start a server for the local task. cluster = tf.train.ClusterSpec(cluster_dict) server = tf.train.Server(cluster, job_name=job_name, task_index=task_index, config=config) if job_name == "ps": # Parameter server. with tf.device("/job:" + job_name + "/task:" + str(task_index)): queue = tf.FIFOQueue(cluster.num_tasks("worker"), tf.int32, shared_name="done_queue" + str(task_index)) # Close the parameter server when all queues from workers have been filled. with tf.Session(server.target) as sess: for i in range(cluster.num_tasks("worker")): sess.run(queue.dequeue()) return [] elif job_name == "worker": # Obtain environment parameters. env = make_atari(ENV_NAME) obs_space = env.observation_space action_space = env.action_space # Worker. with tf.device( tf.train.replica_device_setter(worker_device="/job:" + job_name + "/task:" + str(task_index), cluster=cluster)): # Build networks. main_network = QValueNetwork(obs_space, action_space, name="main_network") target_network = QValueNetwork(obs_space, action_space, name="target_network", auxiliary_network=main_network) replay_buffer = ReplayBuffer(buffer_size=BUFFER_SIZE) list_episodic_reward = [] episodic_reward = 0 obs = env.reset() # Additional settings for the first worker (task_index = 0). if task_index == 0: saver = tf.train.Saver(var_list=main_network.variables, max_to_keep=1) next_target_network_update_step = 0 next_autosave_step = 0 with tf.train.MonitoredTrainingSession( master=server.target, is_chief=(task_index == 0), config=config, save_summaries_steps=None, save_summaries_secs=None, save_checkpoint_steps=None, save_checkpoint_secs=None) as sess: # Initialize buffers. for _ in range(INITIAL_BUFFER_SIZE): # Sample random action. action = np.random.randint(action_space.n) # Interact with the environment. obs_next, reward, done, _ = env.step(action) episodic_reward += reward if done: obs_next = env.reset() episodic_reward = 0 # Store data. data = [obs, action, reward, done, obs_next] replay_buffer.append(data) # Update observation. obs = obs_next # Run until reaching maximum training steps. while sess.run(main_network.global_step) < TOTAL_STEP: global_step = sess.run(main_network.global_step) if task_index == 0: # Synchronize the target network periodically (target network <- main network). if global_step >= next_target_network_update_step: sess.run(target_network.sync_op) next_target_network_update_step += TARGET_NETWORK_UPDATE_STEP # Sample action with epsilon-greedy policy. epsilon = EPSILON_MAX - ( EPSILON_MAX - EPSILON_MIN) * np.minimum( global_step / EPSILON_DECAY_STEP, 1) if np.random.uniform() < epsilon: action = np.random.randint(action_space.n) else: q = sess.run(target_network.q, feed_dict={ target_network.Obs: np.expand_dims(np.array(obs) / 255.0, 0) }) action = np.argmax(q[0]) # Interact with the environment. obs_next, reward, done, _ = env.step(action) episodic_reward += reward if done: obs_next = env.reset() list_episodic_reward.append((global_step, episodic_reward)) delta_time = int(time.time() - start_time) print("Step ", global_step, "/", TOTAL_STEP, ": Time spent = ", delta_time, " s , Episodic reward = ", episodic_reward, sep="") episodic_reward = 0 # Store data. 
data = [obs, action, reward, done, obs_next] replay_buffer.append(data) # Update observation. obs = obs_next # Learning rate. lr = LEARNING_RATE[-1] for i in range(len(LR_ANNEAL_STEP)): if global_step < LR_ANNEAL_STEP[i]: lr = LEARNING_RATE[i] break # Sample training data from the replay buffer. batch_data = replay_buffer.sample(BATCH_SIZE) batch_obs, batch_action, batch_reward, batch_done, batch_obs_next = \ [np.array([batch_data[j][i] for j in range(BATCH_SIZE)]) for i in range(len(batch_data[0]))] # Compute the target Q value: # target_q = r + (1 - done) * REWARD_DISCOUNT * max[q(s', a)] q_next = sess.run( target_network.q, feed_dict={target_network.Obs: batch_obs_next / 255.0}) max_qnext = np.amax(q_next, axis=1) target_q = batch_reward + ( 1 - batch_done) * REWARD_DISCOUNT * max_qnext # Update the main network (main network <- local network gradients). sess.run(main_network.train_op, feed_dict={ main_network.Obs: batch_obs / 255.0, main_network.Action: batch_action, main_network.TargetQ: target_q, main_network.LR: lr }) if task_index == 0: # Save the main network periodically. if global_step >= next_autosave_step: saver.save(sess._sess._sess._sess._sess, SAVE_DIR + file_name) next_autosave_step += AUTOSAVE_STEP if task_index == 0: # Save the main network. saver.save(sess._sess._sess._sess._sess, SAVE_DIR + file_name) tf.contrib.keras.backend.clear_session() # Close the environment. env.close() queues = [] # Create a shared queue on the worker which is visible on the parameter server. for i in range(cluster.num_tasks("ps")): with tf.device("/job:ps/task:" + str(i)): queue = tf.FIFOQueue(cluster.num_tasks("worker"), tf.int32, shared_name="done_queue" + str(i)) queues.append(queue) # Notify all parameter servers that the current worker has finished the task. with tf.Session(server.target) as sess: for i in range(cluster.num_tasks("ps")): sess.run(queues[i].enqueue(task_index)) # Release memory when a worker is finished. tf.contrib.keras.backend.clear_session() return list_episodic_reward
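# A sketch of how worker_process above could be launched locally with multiprocessing.
# The cluster_dict layout is the standard tf.train.ClusterSpec format (job name mapped to
# a list of "host:port" addresses); the port numbers, process counts and the file_name
# value here are illustrative assumptions, and the workers' return values are not
# collected in this simple sketch (a multiprocessing.Pool could gather them instead).
import multiprocessing as mp

if __name__ == '__main__':
    cluster_dict = {
        'ps': ['localhost:2220'],
        'worker': ['localhost:2221', 'localhost:2222'],
    }

    processes = []
    for task_index, _ in enumerate(cluster_dict['ps']):
        processes.append(mp.Process(target=worker_process,
                                    args=('ps', task_index, cluster_dict, 'dqn_model')))
    for task_index, _ in enumerate(cluster_dict['worker']):
        processes.append(mp.Process(target=worker_process,
                                    args=('worker', task_index, cluster_dict, 'dqn_model')))

    for p in processes:
        p.start()
    for p in processes:
        p.join()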
class RL_Agent: """ Main activities: - Making actual moves in a game. - Making search moves during MCTS - Updating the target policy via supervised learning Must contain the ANET(the actor network) Should save net params to file """ def __init__(self, input_shape, output_size, anet_layer_sizes, anet_layer_activations, gm: GameManager, model_load_path, e_greedy, lr=0.01, optimizer="adam"): self.e_greedy = e_greedy self.anet = ActorNetwork(hidden_layer_sizes=anet_layer_sizes, hidden_activations=anet_layer_activations, optimizer=optimizer, input_shape=input_shape, output_size=output_size, model_load_path=model_load_path, gm=gm, lr=lr) self.rbuf = ReplayBuffer(size_x=len(gm.get_start_state()), size_y=output_size, max_size=5000, default_batch_size=64) self.env = gm def normalize_action_values(self, action_values, actions_available): d = defaultdict() all_act = self.env.get_all_actions() for action in actions_available: idx = all_act.index(action) d[action] = action_values[-1][idx] return {k: v / total for total in (sum(d.values()),) for k, v in d.items()} def default_policy(self, state): # make general for game. moves = self.env.legal_actions(state) if random() < self.e_greedy: return choice(moves) prediction = self.anet.predict(state) normalized_prediction = self.normalize_action_values(prediction, moves) # too slow! return max(normalized_prediction, key=lambda action: normalized_prediction[action]) def retain(self, state, edge_visits): self.rbuf.append(x=state, y=self.get_distribution(edge_visits)) def get_distribution(self, edge_visits: dict): """ dict should be on form {action:visits} :param Edge_visits: :return: """ all_acts = self.env.get_all_actions() distribution = np.zeros(len(all_acts)) s = sum(edge_visits.values()) for action, visits in edge_visits.items(): idx = all_acts.index(action) distribution[idx] = visits / s return distribution def train_rbuf(self, verbose): history = self.anet.train(self.rbuf.minibatch()) if verbose: # do something about loss print(history.history["loss"]) return history def extend_saved_rbuf(self): self.rbuf.save()
# Gridworld, GoalQWrapper, ReplayBuffer and visualize_all_values come from the
# project's own modules; only the standard-library/numpy imports are added here.
import itertools

import numpy as np

env = Gridworld(10)
gpu_num = 1
dqn = GoalQWrapper(env, 'dqn', 0)
buffer = ReplayBuffer(100000)
steps_before_train = 1000
viz_freq = 1000
batch_size = 32

s = env.reset()
for time in itertools.count():
    # Behave randomly; this script only learns goal-conditioned values.
    a = np.random.randint(0, 4)
    sp, r, t, info = env.step(a)
    buffer.append(s, a, r, sp, t)
    s = sp
    if time < steps_before_train:
        continue
    # Sample one batch of transitions and a second batch whose states serve as goals.
    s_batch, a_batch, r_batch, sp_batch, t_batch = buffer.sample(batch_size)
    g_batch, _, _, _, _ = buffer.sample(batch_size)
    loss = dqn.train_batch_goals(time, s_batch, a_batch, sp_batch, g_batch)
    print(time, loss)
    if time % viz_freq == 0:
        visualize_all_values(dqn, env.get_all_states())
def training(file_name): # Create folders. if not os.path.isdir(SAVE_DIR): os.makedirs(SAVE_DIR) if not os.path.isdir(CSV_DIR): os.makedirs(CSV_DIR) if not os.path.isdir(FIGURE_TRAINING_DIR): os.makedirs(FIGURE_TRAINING_DIR) # Load models. actor = Actor(name="actor") actor_target = Actor(name="actor_target") actor_initial_update_op = target_update_op( actor.trainable_variables, actor_target.trainable_variables, 1.0) actor_target_update_op = target_update_op(actor.trainable_variables, actor_target.trainable_variables, TARGET_UPDATE_RATE) critic = Critic(name="critic") critic.build_training() critic_target = Critic(name="critic_target") critic_initial_update_op = target_update_op( critic.trainable_variables, critic_target.trainable_variables, 1.0) critic_target_update_op = target_update_op( critic.trainable_variables, critic_target.trainable_variables, TARGET_UPDATE_RATE) critic_with_actor = Critic(name="critic", A=actor.pi) actor.build_training(critic_with_actor.actor_loss) env = PendulumEnv() replay_buffer = ReplayBuffer(BUFFER_SIZE) action_noise = OUActionNoise(np.zeros(A_LENGTH)) with tf.Session() as sess: # Initialize actor and critic networks. sess.run(tf.global_variables_initializer()) sess.run([actor_initial_update_op, critic_initial_update_op]) list_final_reward = [] additional_episode = int(np.ceil(MIN_BUFFER_SIZE / MAX_FRAME)) for episode in range(-additional_episode, MAX_EPISODE): list_actor_loss = [] list_critic_loss = [] # Reset the environment and noise. s = env.reset() action_noise.reset() for step in range(MAX_FRAME): env.render() # Get action. a = sess.run(actor.pi, feed_dict={actor.S: np.reshape(s, (1, -1))}) noise = action_noise.get_noise() a = a[0] + ACTION_SCALING * noise a = np.clip(a, -ACTION_SCALING, ACTION_SCALING) # Interact with the game engine. s1, r, _, _ = env.step(a) # Add data to the replay buffer. data = [s, a, [r], s1] replay_buffer.append(data) if episode >= 0: for _ in range(BATCHES_PER_STEP): # Sample data from the replay buffer. batch_data = replay_buffer.sample(BATCH_SIZE) batch_s, batch_a, batch_r, batch_s1 = [ np.array( [batch_data[j][i] for j in range(BATCH_SIZE)]) for i in range(len(batch_data[0])) ] # Compute the next action. a1 = sess.run(actor_target.pi, feed_dict={actor_target.S: batch_s1}) # Compute the target Q. q1 = sess.run(critic_target.q, feed_dict={ critic_target.S: batch_s1, critic_target.A: a1 }) q_target = batch_r + DISCOUNT * q1 # Update actor and critic. _, _, actor_loss, critic_loss = sess.run( [ actor.train_op, critic.train_op, actor.actor_loss, critic.critic_loss ], feed_dict={ actor.S: batch_s, critic_with_actor.S: batch_s, actor.LR: LR_ACTOR, critic.S: batch_s, critic.A: batch_a, critic.QTarget: q_target, critic.LR: LR_CRITIC }) list_actor_loss.append(actor_loss) list_critic_loss.append(critic_loss) # Update target networks. sess.run( [actor_target_update_op, critic_target_update_op]) s = s1 # Postprocessing after each episode. if episode >= 0: list_final_reward.append(r) avg_actor_loss = np.mean(list_actor_loss) avg_critic_loss = np.mean(list_critic_loss) print("Episode ", format(episode, "03d"), ":", sep="") print(" Final Reward = ", format(r, ".6f"), ", Actor Loss = ", format(avg_actor_loss, ".6f"), ", Critic Loss = ", format(avg_critic_loss, ".6f"), sep="") # Testing. avg_reward = 0 for i in range(TEST_EPISODE): # Reset the environment and noise. s = env.reset() action_noise.reset() for step in range(MAX_FRAME): env.render() # Get action. 
a = sess.run(actor.pi, feed_dict={actor.S: np.reshape(s, (1, -1))}) a = a[0] # Interact with the game engine. s, r, _, _ = env.step(a) # Postprocessing after each episode. avg_reward += r avg_reward /= TEST_EPISODE # Save the parameters. saver = tf.train.Saver( [*actor.trainable_variables, *critic.trainable_variables]) saver.save(sess, SAVE_DIR + file_name) tf.contrib.keras.backend.clear_session() env.close() # Store data in the csv file. with open(CSV_DIR + file_name + ".csv", "w") as f: fieldnames = ["Episode", "Final Reward", "Average Reward"] writer = csv.DictWriter(f, fieldnames=fieldnames, lineterminator="\n") writer.writeheader() for episode in range(MAX_EPISODE): content = { "Episode": episode, "Final Reward": list_final_reward[episode] } if episode == MAX_EPISODE - 1: content.update({"Average Reward": avg_reward}) writer.writerow(content) # Plot the training process. list_episode = list(range(MAX_EPISODE)) f, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5)) ax.plot(list_episode, list_final_reward, "r-", label="Final Reward") ax.plot([MAX_EPISODE - 1], [avg_reward], "b.", label="Average Reward") ax.set_title("Final Reward") ax.set_xlabel("Episode") ax.set_ylabel("Reward") ax.legend(loc="lower right") ax.grid() f.savefig(FIGURE_TRAINING_DIR + file_name + ".png") plt.close(f)
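# The helper target_update_op used above is not shown in this snippet. A plausible
# TF1-style sketch is the standard Polyak (soft) update, where rate=1.0 reproduces the
# hard copy used for initialization and rate=TARGET_UPDATE_RATE gives the slow tracking
# update theta_target <- rate * theta + (1 - rate) * theta_target.
import tensorflow as tf


def target_update_op(variables, target_variables, rate):
    # Pair source and target variables one-to-one and return a single grouped assign op.
    ops = [
        tf.assign(target_var, rate * var + (1.0 - rate) * target_var)
        for var, target_var in zip(variables, target_variables)
    ]
    return tf.group(*ops)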
def train(env, agent, args): """ Trains the given agent in the given environment, following the specification in the arguments passed via command-line :param env: environment :type env: OpenAI gym environment :param agent: agent to be trained :type agent: SAC :param args: the arguments parsed from command-line :type args: object returned by argparse library :return: array with the returns per episode cumulated by the agent during training :rtype: numpy array of dtype float32 """ if args.max_episode_steps is not None: # if user has specified a maximum number of steps per episode, set it env.set_max_episode_steps(args.max_episode_steps) # build replay buffer replay_buffer = ReplayBuffer(args.replay_size) total_steps = 0 updates = 0 returns = [] epsilon = args.initial_epsilon # for each episode counting from 1 for i_episode in itertools.count(1): # reset the environment and the episode counters, and get the initial state state = env.reset() episode_return = 0 i_step = 0 # for each step in the episode for i_step in itertools.count(0): if args.render: env.render() # if user has specified a number of initial exploratory steps, # then just sample a random action from the environment action space # if user has specified an epsilon randomness different from zero (and the exploratory steps are over) # then just sample a random action from the environment action space # otherwise let the agent choose an appropriate action if total_steps <= args.exploratory_steps: action = env.action_space.sample() elif epsilon > 0 and np.random.uniform(0, 1) <= epsilon: action = env.action_space.sample() else: action = agent.choose_action(state) # perform the action and observe the resulting next state, reward and done signal next_state, reward, done, _ = env.step(action) # if very verbose print per step log if args.verbose >= 2: print("Step: {}".format(i_step)) print("(s,a,r,s',d): ({}, {}, {}, {}, {})".format( state, action, reward, next_state, done)) # append observed transition to replay buffer replay_buffer.append(state, action, reward, next_state, done) # if user has specified a number of steps without having the agent update its networks (and learn), # then skip the update # if that phase is over, then proceed to update agent's networks if total_steps > args.learning_starts and len( replay_buffer) > args.batch_size: for _ in range(args.gradient_steps): q1l, q2l, pl, al = agent.update(replay_buffer, args.batch_size, updates) if args.verbose >= 2: print("Losses: ({}, {}, {}, {})".format( q1l, q2l, pl, al)) updates += 1 # update per step variables and cumulate episode return state = next_state episode_return += reward i_step += 1 total_steps += 1 # if received done signal from the environment, then terminate the episode if done: break # append the cumulated episode return to the array returns.append(episode_return) # if verbose print a summary of the training occurred in the last episode if args.verbose >= 1: summary = "Episode: {}. Steps: {}. Episode steps: {}. Episode return: {:.3f}.\n".format( i_episode, total_steps, i_step, episode_return) if args.learning_starts > total_steps: summary += "Learning starts in: {} steps. ".format( args.learning_starts - total_steps) if args.exploratory_steps > total_steps: summary += "Exploratory steps left: {}. 
".format( args.exploratory_steps - total_steps) elif epsilon > 0: summary += "Epsilon: {:.3f}.".format(epsilon) print(summary) # if user has specified plotting, then plot the returns cumulated so far if args.plot and i_episode % args.plot_interval == 0: plot_mean_k_episodes_return(returns) # if user has specified a fixed number of training episodes, check if time is up if args.train_episodes is not None and i_episode >= args.train_episodes: break # update epsilon randomness coefficient, # if still positive and if exploratory phase is over and learning has started # linear decrease update wins over exponential decay update, in case user specified both if epsilon > 0 and \ total_steps > args.learning_starts and \ total_steps > args.exploratory_steps: if args.epsilon_decrease > 0 and epsilon > args.final_epsilon: epsilon = max(args.final_epsilon, epsilon - args.epsilon_decrease) elif args.epsilon_decay > 0: epsilon *= args.epsilon_decay return np.array(returns)
def train(file_name): # Create folders. if not os.path.isdir(SAVE_DIR): os.makedirs(SAVE_DIR) if not os.path.isdir(FIGURE_TRAINING_DIR): os.makedirs(FIGURE_TRAINING_DIR) # Obtain environment parameters. env = make_atari(ENV_NAME) obs_space = env.observation_space action_space = env.action_space # Build networks. main_network = QValueNetwork(obs_space, action_space, name = "main_network") target_network = QValueNetwork(obs_space, action_space, name = "target_network", auxiliary_network = main_network) variables_initializer = tf.global_variables_initializer() replay_buffer = ReplayBuffer(buffer_size = BUFFER_SIZE) start_time = time.time() list_episodic_reward = [] episodic_reward = 0 obs = env.reset() with tf.Session() as sess: # Initialize all variables. sess.run(variables_initializer) # Only save the main network. saver = tf.train.Saver(var_list = main_network.variables) # Initialize buffers. for _ in range(INITIAL_BUFFER_SIZE): # Sample random action. action = np.random.randint(action_space.n) # Interact with the environment. obs_next, reward, done, _ = env.step(action) episodic_reward += reward if done: obs_next = env.reset() episodic_reward = 0 # Store data. data = [obs, action, reward, done, obs_next] replay_buffer.append(data) # Update observation. obs = obs_next for step in range(TOTAL_STEP): # Synchronize the target network periodically (target network <- main network). if step % TARGET_NETWORK_UPDATE_STEP == 0: sess.run(target_network.sync_op) # Sample action with epsilon-greedy policy. epsilon = EPSILON_MAX - (EPSILON_MAX - EPSILON_MIN) * np.minimum(step / EPSILON_DECAY_STEP, 1) if np.random.uniform() < epsilon: action = np.random.randint(action_space.n) else: q = sess.run(target_network.q, feed_dict = {target_network.Obs: np.expand_dims(np.array(obs) / 255.0, 0)}) action = np.argmax(q[0]) # Interact with the environment. obs_next, reward, done, _ = env.step(action) episodic_reward += reward if done: obs_next = env.reset() list_episodic_reward.append((step, episodic_reward)) delta_time = int(time.time() - start_time) print("Step ", step, "/", TOTAL_STEP, ": Time spent = ", delta_time, " s , Episodic reward = ", episodic_reward, sep = "") episodic_reward = 0 # Store data. data = [obs, action, reward, done, obs_next] replay_buffer.append(data) # Update observation. obs = obs_next # Learning rate. lr = LEARNING_RATE[-1] for i in range(len(LR_ANNEAL_STEP)): if step < LR_ANNEAL_STEP[i]: lr = LEARNING_RATE[i] break # Sample training data from the replay buffer. batch_data = replay_buffer.sample(BATCH_SIZE) batch_obs, batch_action, batch_reward, batch_done, batch_obs_next = \ [np.array([batch_data[j][i] for j in range(BATCH_SIZE)]) for i in range(len(batch_data[0]))] # Compute the target Q value: # target_q = r + (1 - done) * REWARD_DISCOUNT * max[q(s', a)] q_next = sess.run(target_network.q, feed_dict = {target_network.Obs: batch_obs_next / 255.0}) max_qnext = np.amax(q_next, axis = 1) target_q = batch_reward + (1 - batch_done) * REWARD_DISCOUNT * max_qnext # Update the main network. sess.run(main_network.train_op, feed_dict = { main_network.Obs: batch_obs / 255.0, main_network.Action: batch_action, main_network.TargetQ: target_q, main_network.LR: lr }) # Save the main network periodically. if step % AUTOSAVE_STEP == 0: saver.save(sess, SAVE_DIR + file_name) # Save the main network. 
        saver = tf.train.Saver(var_list = main_network.variables)
        saver.save(sess, SAVE_DIR + file_name)

    total_time = int(time.time() - start_time)
    print("Training finished in ", total_time, " s.", sep = "")

    # Close the environment.
    env.close()

    # Plot the episodic reward against training step curve.
    plot_episodic_reward(list_episodic_reward, file_name)
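# plot_episodic_reward() is called above but not defined in this snippet; a minimal
# matplotlib sketch compatible with the data it receives (a list of (step, reward)
# tuples) and with the FIGURE_TRAINING_DIR constant is given here as an assumption
# about how the real helper might look.
import matplotlib.pyplot as plt


def plot_episodic_reward(list_episodic_reward, file_name):
    steps, rewards = zip(*list_episodic_reward)
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(steps, rewards)
    ax.set_xlabel("Training step")
    ax.set_ylabel("Episodic reward")
    ax.set_title(file_name)
    ax.grid()
    fig.savefig(FIGURE_TRAINING_DIR + file_name + ".png")
    plt.close(fig)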
class Agent:
    Transition = namedtuple(
        'Transition', ('state', 'action', 'next_state', 'reward', 'done'),
        rename=False)  # rename=False: invalid field names raise instead of being auto-renamed

    def __init__(self, env, hyperparameters, device, writer, max_games, tg_bot):
        self.eps_start = hyperparameters['eps_start']
        self.eps_end = hyperparameters['eps_end']
        self.eps_decay = hyperparameters['eps_decay']
        self.epsilon = hyperparameters['eps_start']
        self.n_iter_update_nn = hyperparameters['n_iter_update_nn']
        self.max_games = max_games
        self.tg_bot = tg_bot
        self.env = env
        self.agent_control = AgentControl(env, device,
                                          hyperparameters['learning_rate'],
                                          hyperparameters['gamma'],
                                          hyperparameters['multi_step'],
                                          hyperparameters['double_dqn'],
                                          hyperparameters['dueling'])
        self.replay_buffer = ReplayBuffer(hyperparameters['buffer_size'],
                                          hyperparameters['buffer_minimum'],
                                          hyperparameters['multi_step'],
                                          hyperparameters['gamma'])
        self.summary_writer = writer

        self.num_iterations = 0
        self.total_reward = 0
        self.num_games = 0
        self.total_loss = []
        self.ts_frame = 0
        self.ts = time.time()
        self.birth_time = time.time()
        self.rewards = []

        if self.tg_bot:
            tg.welcome_msg(hyperparameters['multi_step'],
                           hyperparameters['double_dqn'],
                           hyperparameters['dueling'])

    def select_greedy_action(self, obs):
        # Pass the current state to the control, which feeds it to the NN;
        # the NN returns all action values and the control returns the argmax.
        return self.agent_control.select_greedy_action(obs)

    def select_eps_greedy_action(self, obs):
        rand_num = np.random.rand()
        if self.epsilon > rand_num:
            # Select random action - explore
            return self.env.action_space.sample()
        else:
            # Select best action
            return self.select_greedy_action(obs)

    def add_to_buffer(self, obs, action, new_obs, reward, done):
        transition = self.Transition(state=obs,
                                     action=action,
                                     next_state=new_obs,
                                     reward=reward,
                                     done=done)
        self.replay_buffer.append(transition)
        self.num_iterations = self.num_iterations + 1
        if self.epsilon > self.eps_end:
            self.epsilon = self.eps_start - self.num_iterations / self.eps_decay
        self.total_reward = self.total_reward + reward

    def sample_and_improve(self, batch_size):
        # If buffer is big enough
        if len(self.replay_buffer.buffer) > self.replay_buffer.minimum:
            # Sample batch_size number of transitions from buffer B
            mini_batch = self.replay_buffer.sample(batch_size)
            # Calculate loss and improve NN
            loss = self.agent_control.improve(mini_batch)
            # So we can calculate mean of all loss during one game
            self.total_loss.append(loss)

        if (self.num_iterations % self.n_iter_update_nn) == 0:
            self.agent_control.update_target_nn()

    def reset_parameters(self):
        self.rewards.append(self.total_reward)
        self.total_reward = 0
        self.num_games = self.num_games + 1
        self.total_loss = []

    def print_info(self):
        # print(self.num_iterations, self.ts_frame, time.time(), self.ts)
        fps = (self.num_iterations - self.ts_frame) / (time.time() - self.ts)
        print('%d %d rew:%d mean_rew:%.2f fps:%d, eps:%.2f, loss:%.4f' %
              (self.num_iterations, self.num_games, self.total_reward,
               np.mean(self.rewards[-40:]), fps, self.epsilon,
               np.mean(self.total_loss)))
        self.ts_frame = self.num_iterations
        self.ts = time.time()

        if self.summary_writer is not None:
            self.summary_writer.add_scalar('reward', self.total_reward,
                                           self.num_games)
            self.summary_writer.add_scalar('mean_reward',
                                           np.mean(self.rewards[-40:]),
                                           self.num_games)
            self.summary_writer.add_scalar('10_mean_reward',
                                           np.mean(self.rewards[-10:]),
                                           self.num_games)
            self.summary_writer.add_scalar('epsilon', self.epsilon,
                                           self.num_games)
            self.summary_writer.add_scalar('loss', np.mean(self.total_loss),
                                           self.num_games)

        if self.tg_bot:
            if (self.num_games % 10) == 0:
                tg.info_msg(self.num_games + 1, self.max_games,
                            np.mean(self.rewards[-40:]),
                            np.mean(self.total_loss))
            if self.num_games == (self.max_games - 1):
                tg.end_msg(time.time() - self.birth_time)
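# The ReplayBuffer used by the Agent above (constructed with buffer_size, buffer_minimum,
# multi_step and gamma, exposing .buffer, .minimum, .append() and .sample()) is not shown.
# The sketch below is one plausible n-step implementation: the last `multi_step`
# transitions are folded into a single transition whose reward is the discounted n-step
# return and whose next_state/done come from the end of the window. AgentControl is then
# expected to bootstrap these folded transitions with gamma ** multi_step.
import random
from collections import deque


class ReplayBuffer:
    def __init__(self, buffer_size, buffer_minimum, multi_step, gamma):
        self.buffer = deque(maxlen=buffer_size)
        self.minimum = buffer_minimum
        self.multi_step = multi_step
        self.gamma = gamma
        self._window = deque(maxlen=multi_step)

    def append(self, transition):
        self._window.append(transition)
        if len(self._window) == self.multi_step or transition.done:
            self.buffer.append(self._fold_window())
        if transition.done:
            # The short tail at episode end is dropped for simplicity.
            self._window.clear()

    def _fold_window(self):
        first, last = self._window[0], self._window[-1]
        n_step_reward = sum(t.reward * self.gamma ** i
                            for i, t in enumerate(self._window))
        # Reuse the namedtuple type of the stored transitions.
        return type(first)(state=first.state, action=first.action,
                           next_state=last.next_state,
                           reward=n_step_reward, done=last.done)

    def sample(self, batch_size):
        return random.sample(list(self.buffer), batch_size)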
def train(file_name): # Create folders. if not os.path.isdir(SAVE_DIR): os.makedirs(SAVE_DIR) if not os.path.isdir(FIGURE_TRAINING_DIR): os.makedirs(FIGURE_TRAINING_DIR) # Obtain environment parameters. env = make_atari(ENV_NAME) obs_space = env.observation_space action_space = env.action_space env.close() # Build networks. main_network = QValueNetwork(obs_space, action_space, name="main_network") target_network = QValueNetwork(obs_space, action_space, name="target_network", auxiliary_network=main_network) variables_initializer = tf.global_variables_initializer() # Create parallel environments. par_env = ParallelEnvironment( [make_atari(ENV_NAME) for _ in range(NUM_ENV)]) replay_buffer = ReplayBuffer(buffer_size=BUFFER_SIZE) start_time = time.time() list_episodic_reward = [] episodic_reward = np.zeros(NUM_ENV) obs = par_env.reset() with tf.Session() as sess: # Initialize all variables. sess.run(variables_initializer) # Only save the main network. saver = tf.train.Saver(var_list=main_network.variables) # Initialize buffers. while replay_buffer.get_size() < INITIAL_BUFFER_SIZE: # Sample random action. action = np.random.randint(action_space.n, size=NUM_ENV) # Interact with the environment. obs_next, reward, done, _ = par_env.step(action) episodic_reward += reward for i in range(NUM_ENV): if done[i]: episodic_reward[i] = 0 # Store data. for i in range(NUM_ENV): data = [obs[i], action[i], reward[i], done[i], obs_next[i]] replay_buffer.append(data) # Update observation. obs = obs_next step = 0 next_target_network_update_step = 0 next_autosave_step = 0 while step < TOTAL_STEP: # Synchronize the target network periodically (target network <- main network). if step >= next_target_network_update_step: sess.run(target_network.sync_op) next_target_network_update_step += TARGET_NETWORK_UPDATE_STEP # Sample action with epsilon-greedy policy. epsilon = EPSILON_MAX - (EPSILON_MAX - EPSILON_MIN) * np.minimum( step / EPSILON_DECAY_STEP, 1) random_uniform = np.random.uniform(size=NUM_ENV) action = np.zeros(NUM_ENV, dtype=np.int32) random_action_index = np.argwhere(random_uniform < epsilon) if np.shape(random_action_index)[0] > 0: action[tuple( np.transpose(random_action_index))] = np.random.randint( action_space.n, size=np.shape(random_action_index)[0]) greedy_action_index = np.argwhere(random_uniform >= epsilon) if np.shape(greedy_action_index)[0] > 0: q = sess.run(target_network.q, feed_dict={ target_network.Obs: np.array(obs)[tuple( np.transpose(greedy_action_index))] / 255.0 }) action[tuple(np.transpose(greedy_action_index))] = np.argmax( q, axis=1) # Interact with the environment. obs_next, reward, done, _ = par_env.step(action) episodic_reward += reward for i in range(NUM_ENV): if done[i]: list_episodic_reward.append((step, episodic_reward[i])) delta_time = int(time.time() - start_time) print("Step ", step, "/", TOTAL_STEP, ": Time spent = ", delta_time, " s , Episodic reward = ", episodic_reward[i], sep="") episodic_reward[i] = 0 # Store data. for i in range(NUM_ENV): data = [obs[i], action[i], reward[i], done[i], obs_next[i]] replay_buffer.append(data) # Update observation. obs = obs_next # Learning rate. lr = LEARNING_RATE[-1] for i in range(len(LR_ANNEAL_STEP)): if step < LR_ANNEAL_STEP[i]: lr = LEARNING_RATE[i] break for _ in range(NUM_ENV): # Sample training data from the replay buffer. 
batch_data = replay_buffer.sample(BATCH_SIZE) batch_obs, batch_action, batch_reward, batch_done, batch_obs_next = \ [np.array([batch_data[j][i] for j in range(BATCH_SIZE)]) for i in range(len(batch_data[0]))] # Compute the target Q value: # target_q = r + (1 - done) * REWARD_DISCOUNT * max[q(s', a)] q_next = sess.run( target_network.q, feed_dict={target_network.Obs: batch_obs_next / 255.0}) max_qnext = np.amax(q_next, axis=1) target_q = batch_reward + ( 1 - batch_done) * REWARD_DISCOUNT * max_qnext # Update the main network. sess.run(main_network.train_op, feed_dict={ main_network.Obs: batch_obs / 255.0, main_network.Action: batch_action, main_network.TargetQ: target_q, main_network.LR: lr }) # Save the main network periodically. if step >= next_autosave_step: saver.save(sess, SAVE_DIR + file_name) next_autosave_step += AUTOSAVE_STEP # Update step. step += NUM_ENV # Save the main network. saver = tf.train.Saver(var_list=main_network.variables) saver.save(sess, SAVE_DIR + file_name) total_time = int(time.time() - start_time) print("Training finished in ", total_time, " s.", sep="") # Close the environment. par_env.close() # Plot the episodic reward against training step curve. plot_episodic_reward(list_episodic_reward, file_name)
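# ParallelEnvironment is used above but not defined in this snippet. A simple serial
# sketch that matches the way it is called (reset() returning one observation per
# environment, step() taking a vector of actions and auto-resetting finished
# environments, close()) is given below; the real class may well run the environments
# in separate processes instead.
import numpy as np


class ParallelEnvironment:
    def __init__(self, envs):
        self.envs = envs

    def reset(self):
        return [env.reset() for env in self.envs]

    def step(self, actions):
        obs, rewards, dones, infos = [], [], [], []
        for env, action in zip(self.envs, actions):
            ob, reward, done, info = env.step(action)
            if done:
                # Auto-reset so the caller always receives a valid next observation.
                ob = env.reset()
            obs.append(ob)
            rewards.append(reward)
            dones.append(done)
            infos.append(info)
        return obs, np.array(rewards), np.array(dones), infos

    def close(self):
        for env in self.envs:
            env.close()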
class DQNAgent: def __init__(self, env, QNetworkClass, minibatch_size_limit=32, replay_memory_size=1000000, history_length=4, target_update_step=10000, discount_factor=0.99, learning_rate=0.00025, initial_exploration=1.0, final_exploration=0.1, final_exploration_frame=1000000, replay_start_size=50000, log_dir=None): self.env = env self.new_episode() self.n_actions = env.action_space.n self.n_states = env.observation_space.shape[0] self.n_input = self.n_states * history_length #setup tensorflow self.sess = tf.Session() self.q = QNetworkClass("q_orig", self.n_input, self.n_actions, learning_rate) self.q_hat = QNetworkClass("q_hat", self.n_input, self.n_actions) self.total_reward = tf.placeholder(tf.float32) tf.summary.scalar("TotalReward", self.total_reward) self.sess.run(tf.global_variables_initializer()) self.saver = tf.train.Saver(max_to_keep=0) self.summary = tf.summary.merge_all() if tf.gfile.Exists(log_dir): tf.gfile.DeleteRecursively(log_dir) tf.gfile.MakeDirs(log_dir) if log_dir: self.log_writer = tf.summary.FileWriter(log_dir, self.sess.graph, flush_secs=20) else: self.log_writer = None #store parameter self.minibatch_size_limit = minibatch_size_limit self.gamma = discount_factor self.replay_buffer = ReplayBuffer(replay_memory_size) self.target_update_step = target_update_step self.step = 0 self.phi_t = np.zeros((1, self.n_input)). \ astype(np.float32) self.epsilon = initial_exploration self.replay_start_size = replay_start_size self.final_exploration = final_exploration self.epsilon_step = (initial_exploration - final_exploration) \ / final_exploration_frame def act(self): a_t = np.argmax(self._perform_q(self.phi_t)) s_t_1, r_t, terminal, _ = self.env.step(a_t) phi_t_1 = np.hstack( (self.phi_t[:, self.n_states:], s_t_1.astype(np.float32).reshape( (1, -1)))) self.phi_t = phi_t_1 return a_t, s_t_1, r_t, terminal, {'epsilon': self.epsilon} def act_and_train(self): # With probability epsilon select a random action # Otherwise select acionn from Q network if random.random() <= self.epsilon: a_t = random.randint(0, self.n_actions - 1) else: a_t = np.argmax(self._perform_q(self.phi_t)) # Execute action in emulator and observe reward and state s_t_1, r_t, terminal, _ = self.env.step(a_t) phi_t_1 = np.hstack( (self.phi_t[:, self.n_states:], s_t_1.astype(np.float32).reshape( (1, -1)))) # Store transition self.replay_buffer.append([self.phi_t, a_t, r_t, phi_t_1, terminal]) self.phi_t = phi_t_1 # After specified steps start experienced replay to update Q network if self.step >= self.replay_start_size: # sample minibatch y = np.zeros((0, self.n_actions)) phi = np.zeros((0, self.n_input)) minibatch = self.replay_buffer.sample(self.minibatch_size_limit) for phi_j, a_j, r_j, phi_j_1, terminal_j in minibatch: y_j = self._perform_q(phi_j)[0] if terminal_j: y_j[a_j] = r_j else: # DDQN a = np.argmax(self._perform_q(phi_j_1)) y_j[a_j] = r_j + self.gamma * self._perform_q_hat(phi_j_1)[ 0, a] # DQN #y_j[a_j] = r_j + self.gamma * np.max(self._perform_q_hat(phi_j_1)) y = np.vstack((y, y_j)) phi = np.vstack((phi, phi_j)) # Update Q network #TODO comversion to numpy array should be done in q network class self._train_q(np.array(phi, dtype=np.float32), np.array(y, dtype=np.float32)) # Update target Q network every specific steps if self.step % self.target_update_step == 0: self._update_q_hat() # Update Exploration ratio if self.epsilon > self.final_exploration: self.epsilon -= self.epsilon_step self.step += 1 return a_t, s_t_1, r_t, terminal, {'epsilon': self.epsilon} def new_episode(self): self.env.reset() 
def write_summary(self, episode, total_reward): summary = self.sess.run( self.summary, feed_dict={self.total_reward: np.array(total_reward)}) self.log_writer.add_summary(summary, episode) def save_variables(self, step, model_dir=None): if model_dir: if not tf.gfile.Exists(model_dir): tf.gfile.MakeDirs(model_dir) full_path = os.path.join(model_dir, 'model') self.saver.save(self.sess, full_path, global_step=step) print('save model to ' + full_path) def restore_variables(self, model_path=None): if model_path: self.saver.restore(self.sess, model_path) print('Restore model from ' + model_path) def _perform_q(self, x): return self.q(self.sess, x) def _perform_q_hat(self, x): return self.q_hat(self.sess, x) def _train_q(self, x, t): self.q.train(self.sess, x, t) def _update_q_hat(self): self.q_hat.set_variables(self.sess, self.q.read_variables(self.sess)) def __del__(self): if self.log_writer: self.log_writer.close()
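# A short, assumed usage sketch for the TF1 DQNAgent above: the environment name,
# episode count and directories are illustrative, and QNetwork stands for whatever
# QNetworkClass the project provides. Each episode runs act_and_train() until the
# terminal flag is set, then the accumulated reward is logged via write_summary().
import gym

env = gym.make('CartPole-v1')
agent = DQNAgent(env, QNetwork, log_dir='logs/dqn')

for episode in range(1000):
    agent.new_episode()
    total_reward = 0.0
    terminal = False
    while not terminal:
        action, state, reward, terminal, info = agent.act_and_train()
        total_reward += reward
    agent.write_summary(episode, total_reward)
    if episode % 100 == 0:
        agent.save_variables(episode, model_dir='models/dqn')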
def ddpg_training(plt, args=None): print("Using {} environment.".format(env.spec.id)) print('observation space {} '.format(env.observation_space)) print('action space {} high {} low {}'.format(env.action_space, env.action_space.high, env.action_space.low)) critic.summary() actor.summary() # create target networks criticp = keras.models.clone_model(critic) criticp.compile(optimizer='adam', loss='mse') criticp.set_weights(critic.get_weights()) actorp = keras.models.clone_model(actor) actorp.compile(optimizer='adam', loss='mse') actorp.set_weights(actor.get_weights()) #allocate replay buffers replay_buffer = ReplayBuffer(Config.buffer_length, env.observation_space.shape, env.action_space.shape) #set up the plotting - imports must be here to enable matplotlib.use() plt.ion() from util import ToggleFlags from display import display_progress flags = ToggleFlags(args) flags.add('noise', True) flags.add('render', False) flags.add('clear') flags.add('viz', True) flags.add('movie', True) flags.add('trails', False) RewardsHistory = [] Rdfr = np.zeros((Config.buffer_length, )) episodes = [] epoches = int(Config.buffer_length / Config.batch_size) for i_episode in range(Config.max_episodes): observation1 = env.reset() episode = [] RewardsHistory.append(0) for t in range(Config.max_steps): episode.append(replay_buffer.index) #take step using the action based on actor observation = observation1 action = actor.predict(np.expand_dims(observation, axis=0))[0] if flags.noise: action += exploration.sample() observation1, reward, done, _ = env.step(action) if len(observation1.shape) > 1 and observation1.shape[-1] == 1: observation1 = np.squeeze(observation1, axis=-1) # insert into replay buffer replay_buffer.append(observation, action, reward, observation1, done) #book keeping RewardsHistory[-1] += reward if flags.render: env.render() if done: break if replay_buffer.index == 0: episodes = [] #forget old episodes to avoid wraparound if replay_buffer.ready: for epoch in range(epoches): sample = replay_buffer.sample(Config.batch_size) # train critic on discounted future rewards yq = (replay_buffer.reward[sample] + Config.gamma * (criticp.predict([ replay_buffer.obs1[sample], actorp.predict(replay_buffer.obs1[sample]) ])[:, 0])) critic.train_on_batch( [replay_buffer.obs[sample], replay_buffer.action[sample]], yq) # train the actor to maximize Q if i_episode > Config.warmup: actor.train_on_batch( replay_buffer.obs[sample], np.zeros((Config.batch_size, *actor.output_shape[1:]))) # update target networks criticp.set_weights([ Config.tau * w + (1 - Config.tau) * wp for wp, w in zip( criticp.get_weights(), critic.get_weights()) ]) actorp.set_weights([ Config.tau * w + (1 - Config.tau) * wp for wp, w in zip(actorp.get_weights(), actor.get_weights()) ]) if flags.clear: episodes = [] episodes.append(episode) if len(episode) > 2 and Config.show_progress: display_progress(replay_buffer, flags, plt, RewardsHistory, Rdfr, env, episode, episodes, i_episode, actor, actorp, critic, criticp) if Config.save_model and i_episode % 100 == 0: print("Save models") actor.save('actor.h5') critic.save('critic.h5') print("Episode {} finished after {} timesteps total reward={}".format( i_episode, t + 1, RewardsHistory[-1]))
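# The `exploration` object above (used as exploration.sample()) is not defined in this
# snippet; DDPG conventionally adds temporally correlated Ornstein-Uhlenbeck noise to the
# actor's actions, so a standard implementation is sketched here under that assumption.
# The theta, sigma and dt values are common defaults, not the project's own settings.
import numpy as np


class OUNoise:
    def __init__(self, mu, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu = np.asarray(mu, dtype=np.float64)
        self.theta = theta
        self.sigma = sigma
        self.dt = dt
        self.reset()

    def reset(self):
        self.x = np.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        dx = (self.theta * (self.mu - self.x) * self.dt +
              self.sigma * np.sqrt(self.dt) * np.random.randn(*self.mu.shape))
        self.x = self.x + dx
        return self.x


# Example: exploration = OUNoise(mu=np.zeros(env.action_space.shape[0]))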
class Trainer_Clipped(object):
    def __init__(self, agent, env):
        self.agent = agent
        self.env = env
        self.seed = random.randint(0, 20180818)
        self.optimizer1 = optim.Adam(agent.parameters1, lr=LEARNING_RATE)
        self.optimizer2 = optim.Adam(agent.parameters2, lr=LEARNING_RATE)
        self.buffer = ReplayBuffer(capacity=CAPACITY)
        self.total_step = 0

    def run(self, device='cpu', buffer=False, explore=False):
        """Run an episode and buffer"""
        self.env.reset()
        self.env.env.seed(self.seed)
        state = self.env.get_screen()
        states = np.asarray([state for _ in range(4)])  # shape (4, 84, 84)
        step = 0
        accumulated_reward = 0
        while True:
            action = self.agent.make_action1(torch.Tensor([states]).to(device), explore=explore)
            state_next, reward, done = self.env.step(action)
            states_next = np.concatenate([states[1:, :, :], [state_next]], axis=0)
            step += 1
            accumulated_reward += reward
            if buffer:
                self.buffer.append(states, action, reward, states_next, done)
            states = states_next
            if not explore:
                # Render the screen to watch the (non-exploring) run
                self.env.env.render()
            if done:
                break
        return accumulated_reward, step

    def _fill_buffer(self, num, device='cpu'):
        start = time.time()
        while self.buffer.size < num:
            self.run(device, buffer=True, explore=True)
            print('Fill buffer: {}/{}'.format(self.buffer.size, self.buffer.capacity))
        print('Filling buffer takes {:.3f} seconds'.format(time.time() - start))

    def train(self, device='cpu'):
        # Effectively disable recording while filling the buffer, then restore it
        self.env.change_record_every_episode(100000000)
        self._fill_buffer(OBSERV, device)
        if self.env.record_every_episode:
            self.env.change_record_every_episode(self.env.record_every_episode)

        episode = 0
        total_accumulated_rewards = []
        while 'training' != 'converge':  # while episode <= 500:
            self.env.reset()
            state = self.env.get_screen()
            states = np.asarray([state for _ in range(4)])  # shape (4, 84, 84)
            step_prev = self.total_step
            accumulated_reward = 0
            done = False
            n_flap = 0
            n_none = 0
            while not done:
                #### --------------------
                #### Add a new transition
                # Choose the action epsilon-greedily with network 1
                action = self.agent.make_action1(torch.Tensor([states]).to(device), explore=True)
                state_next, reward, done = self.env.step(action)
                states_next = np.concatenate([states[1:, :, :], [state_next]], axis=0)
                self.total_step += 1
                accumulated_reward += reward
                self.buffer.append(states, action, reward, states_next, done)
                states = states_next
                #### --------------------

                #### --------------------
                #### Training step
                start = time.time()

                # prepare training data
                minibatch = self.buffer.sample(n_sample=BATCH)
                _states = [b[0] for b in minibatch]
                _actions = [b[1] for b in minibatch]
                _rewards = [b[2] for b in minibatch]
                _states_next = [b[3] for b in minibatch]
                _dones = [b[4] for b in minibatch]

                ys = []
                for i in range(len(minibatch)):
                    terminal = _dones[i]
                    r = _rewards[i]
                    if terminal:
                        y = r
                    else:
                        # Double DQN target, clipped over the two target networks.
                        # Open question: should the greedy action here follow the epsilon-greedy
                        # policy, or always the argmax of the online network itself?
                        s_t_next = torch.Tensor([_states_next[i]]).to(device)
                        # Greedy action according to self.net1
                        online_act1 = self.agent.make_action1(s_t_next)
                        # Value of that action under target network 1
                        max_value1 = self.agent.Q1(s_t_next, online_act1, target=True)
                        # (The action could instead come from self.net2)
                        # online_act2 = self.agent.make_action2(s_t_next)
                        # Value of the same action under target network 2
                        max_value2 = self.agent.Q2(s_t_next, online_act1, target=True)
                        # Index 0: network 1, index 1: network 2
                        max_values = [max_value1, max_value2]
                        index = np.argmin(np.asarray(max_values))
                        # Bootstrapped target uses the smaller of the two target estimates
                        # for the action chosen by network 1; both networks share the same target y
                        y = r + DISCOUNT * max_values[index]
                    ys.append(y)
                ys = torch.Tensor(ys).to(device)

                # Render the screen to see training
                # self.env.env.render()

                # Apply gradient on network 1
                # print('Training network 1...')
                self.optimizer1.zero_grad()
                input = torch.Tensor(_states).to(device)
                output1 = self.agent.net1(input)  # shape (BATCH, 2)
                actions_one_hot = np.zeros([BATCH, 2])
                actions_one_hot[np.arange(BATCH), _actions] = 1.0
                actions_one_hot = torch.Tensor(actions_one_hot).to(device)
                ys_hat = (output1 * actions_one_hot).sum(dim=1)
                loss1 = F.smooth_l1_loss(ys_hat, ys)
                loss1.backward()
                self.optimizer1.step()

                # Apply gradient on network 2
                # print('Training network 2...')
                self.optimizer2.zero_grad()
                input = torch.Tensor(_states).to(device)
                output2 = self.agent.net2(input)  # shape (BATCH, 2)
                actions_one_hot = np.zeros([BATCH, 2])
                actions_one_hot[np.arange(BATCH), _actions] = 1.0
                actions_one_hot = torch.Tensor(actions_one_hot).to(device)
                ys_hat = (output2 * actions_one_hot).sum(dim=1)
                loss2 = F.smooth_l1_loss(ys_hat, ys)
                loss2.backward()
                self.optimizer2.step()
                #### --------------------

                # logging
                if action == 0:
                    n_flap += 1
                else:
                    n_none += 1
                if done and self.total_step % LOGGING_CYCLE == 0:
                    # `index` refers to the last non-terminal sample of the minibatch
                    log = '[{}, {}] alive: {}, reward: {}, F/N: {}/{}, loss1: {:.4f}, loss2: {:.4f}, epsilon: {:.4f}, time: {:.3f}, network: Q{}'.format(
                        episode, self.total_step, self.total_step - step_prev,
                        accumulated_reward, n_flap, n_none, loss1.item(), loss2.item(),
                        self.agent.epsilon, time.time() - start, index + 1)
                    print(log)

                self.agent.update_epsilon()

                if self.total_step % TARGET_UPDATE_CYCLE == 0:
                    # print('[Update target network]')
                    self.agent.update_targets()

                if self.total_step % SAVE_MODEL_CYCLE == 0:
                    print('[Save model]')
                    self.save(id=self.total_step)
                    if len(total_accumulated_rewards) > 0:
                        self.save_graph_rewards(episode, total_accumulated_rewards)

            # Keep the accumulated_reward for all the episodes
            total_accumulated_rewards.append(accumulated_reward)
            episode += 1

    def save_graph_rewards(self, episodes, total_accumulated_rewards):
        # fig = plt.figure()
        fig, ax = plt.subplots(figsize=(5, 5))
        plt.xlabel('Episodes')
        plt.ylabel('Total reward')
        episodes_x = np.linspace(0, episodes, episodes)
        ax.plot(episodes_x, np.ones(episodes) * 0, color='red', label='ref')
        ax.plot(episodes_x, total_accumulated_rewards, color='turquoise', label='real')
        ax.legend(loc='lower left')
        if not os.path.exists('tmp/graphs'):
            os.makedirs('tmp/graphs')
        plt.savefig(f'tmp/graphs/Total_rewards_ep={episodes}.png')
        plt.close()

    def save(self, id):
        # Save network 1 (Q1)
        filename = 'tmp/models1/model_{}.pth.tar'.format(id)
        dirpath = os.path.dirname(filename)
        if not os.path.exists(dirpath):
            os.mkdir(dirpath)
        checkpoint = {
            'net': self.agent.net1.state_dict(),
            'target': self.agent.target1.state_dict(),
            'optimizer': self.optimizer1.state_dict(),
            'total_step': self.total_step
        }
        torch.save(checkpoint, filename)

        # Save network 2 (Q2)
        filename = 'tmp/models2/model_{}.pth.tar'.format(id)
        dirpath = os.path.dirname(filename)
        if not os.path.exists(dirpath):
            os.mkdir(dirpath)
        checkpoint = {
            'net': self.agent.net2.state_dict(),
            'target': self.agent.target2.state_dict(),
            'optimizer': self.optimizer2.state_dict(),
            'total_step': self.total_step
        }
        torch.save(checkpoint, filename)

    def load(self, filename, device='cpu'):
        # TODO: adjust later for evaluation.
        # Remember that two networks are used here, hence two optimizers.
        ckpt = torch.load(filename, map_location=lambda storage, loc: storage)
        ## Deal with the missing of bn.num_batches_tracked
        net_new = OrderedDict()
        tar_new = OrderedDict()
        for k, v in ckpt['net'].items():
            for _k, _v in self.agent.net1.state_dict().items():
                if k == _k:
                    net_new[k] = v
        for k, v in ckpt['target'].items():
            for _k, _v in self.agent.target1.state_dict().items():
                if k == _k:
                    tar_new[k] = v
        self.agent.net1.load_state_dict(net_new)
        self.agent.target1.load_state_dict(tar_new)
        ## -----------------------------------------------
        self.optimizer1.load_state_dict(ckpt['optimizer'])
        self.total_step = ckpt['total_step']
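# The per-sample Python loop in Trainer_Clipped.train rebuilds the clipped target one
# transition at a time. Below is a minimal batched sketch of the same target rule,
# y = r + gamma * min(Q1', Q2')(s', a*), with a* greedy under online network 1.
# It assumes hypothetical `online_net1` / `target_net1` / `target_net2` modules that map a
# batch of stacked frames to per-action Q-values; this is NOT the exact interface of
# self.agent.Q1 / Q2 above, just an illustration of the computation.
import torch


@torch.no_grad()
def clipped_double_q_targets(rewards, dones, next_states,
                             online_net1, target_net1, target_net2, discount):
    """rewards, dones: float tensors of shape (B,); next_states: (B, 4, 84, 84)."""
    # Greedy next action under the online network 1, shape (B,)
    next_actions = online_net1(next_states).argmax(dim=1)
    # Evaluate that same action under both target networks, shape (B,)
    q1 = target_net1(next_states).gather(1, next_actions.unsqueeze(1)).squeeze(1)
    q2 = target_net2(next_states).gather(1, next_actions.unsqueeze(1)).squeeze(1)
    # Clipped estimate: element-wise minimum of the two target values
    q_min = torch.min(q1, q2)
    # Terminal transitions do not bootstrap
    return rewards + discount * q_min * (1.0 - dones)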
def train(
        args,
        log_dir,
        seed,
        env_id,
        replay_buffer_len,
        memory_len,
        cores,
        trees,
        p,              # number of nearest neighbours in the DND lookup; reported number is 50
        embed_size,     # embedding vector length; reported number is ?
        gamma,          # discount value; reported number is 0.99
        N,              # N-step bootstrapping; reported number is 100
        update_period,  # the reported number is 16//4 = 4
        batch_size,     # the reported number is 32
        init_eps,
        delta,
        lr,
        q_lr,
        epsilon,
        min_epsilon,
        epsilon_decay,  # exponential decay factor
        eval_period,
        save_period,
        **kwargs):
    # other hyper params: per-step discount weights for the N-step return
    _gw = np.array([gamma**i for i in range(N)])

    # expr setting
    Path(log_dir).mkdir(parents=True, exist_ok='temp' in log_dir)
    with open(os.path.join(log_dir, 'args.txt'), 'w') as f:
        f.write(str(args))

    np.random.seed(seed)
    tf.random.set_random_seed(seed)

    # Env
    env = wrap_deepmind(make_atari(env_id),
                        episode_life=False,
                        clip_rewards=False,
                        frame_stack=True,
                        scale=False)
    num_ac = env.action_space.n

    # ReplayBuffer
    replay_buffer = ReplayBuffer(replay_buffer_len)

    # Neural Episodic Controller
    nec = NEC(
        num_ac,
        p,
        embed_size,
        delta,
        lr,
        q_lr,
        dnd_params={
            'maxlen': memory_len,
            'seed': seed,
            'cores': cores,  # number of cores for the KD-Tree
            'trees': trees,  # number of trees for the KD-Tree
        })

    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())

    summary_writer = tf.summary.FileWriter(os.path.join(log_dir, 'tensorboard'))

    def _write_scalar(it, it_type, tag, value):
        summary = tf.Summary(value=[
            tf.Summary.Value(tag=f"{tag}/{it_type}", simple_value=value)
        ])
        summary_writer.add_summary(summary, global_step=it)

    ####### Setup Done

    num_steps = 0
    num_updates = 0

    # Fill up the memory and replay buffer with a random policy
    for ep in range(init_eps):
        ob = env.reset()
        obs, acs, rewards = [ob], [], []
        for _ in itertools.count():
            ac = np.random.randint(num_ac)
            ob, r, done, _ = env.step(ac)

            obs.append(ob)
            acs.append(ac)
            rewards.append(r)

            num_steps += 1
            if done:
                break

        # Discounted returns over up to N steps (no bootstrap during the random fill)
        Rs = [
            np.sum(_gw[:len(rewards[i:i + N])] * rewards[i:i + N])
            for i in range(len(rewards))
        ]

        obs = np.array(obs)
        es = nec._embed(obs)

        for ob, e, a, R in zip(obs, es, acs, Rs):
            nec.append(e, a, R)
            replay_buffer.append(ob, a, R)

    # Training!
    next_save_steps = save_period

    try:
        for ep in itertools.count(start=init_eps):
            ob = env.reset()
            obs, acs, rewards, es, Vs = [ob], [], [], [], []

            for t in itertools.count():
                # Epsilon Greedy Policy
                ac, (e, V) = nec.policy(ob)
                if np.random.random() < epsilon:
                    ac = np.random.randint(num_ac)
                ob, r, done, _ = env.step(ac)

                obs.append(ob)
                acs.append(ac)
                rewards.append(r)
                es.append(e)
                Vs.append(V)

                num_steps += 1

                # Train on a random minibatch from the replay buffer
                if num_steps % update_period == 0:
                    b_s, b_a, b_R = replay_buffer.sample(batch_size)
                    loss = nec.update(b_s, b_a, b_R)
                    num_updates += 1

                    if num_updates % 100 == 0:
                        print(f'[{num_steps*4}/{num_updates}] loss: {loss}')
                        _write_scalar(it=num_steps * 4, it_type='per_frames', tag='loss', value=loss)
                        _write_scalar(it=num_updates, it_type='per_updates', tag='loss', value=loss)
                        _write_scalar(it=num_steps * 4, it_type='per_frames', tag='num_updates', value=num_updates)

                if t >= N:
                    # N-Step Bootstrapping
                    # TODO: implement the efficient version
                    R = np.sum(_gw * rewards[t - N:t]) + (gamma**N) * Vs[t]  # R_{t-N}

                    # append to memory
                    nec.append(es[t - N], acs[t - N], R)
                    # append to replay buffer
                    replay_buffer.append(obs[t - N], acs[t - N], R)

                if done:
                    break

            print(f'Episode {ep} -- Ep Len: {len(obs)} Acc Reward: {np.sum(rewards)} current epsilon: {epsilon}')
            _write_scalar(tag='ep', value=ep, it=num_steps * 4, it_type='per_frames')
            _write_scalar(tag='ep_len', value=len(obs), it=num_steps * 4, it_type='per_frames')
            _write_scalar(tag='ep_len', value=len(obs), it=ep, it_type='per_episode')
            _write_scalar(tag='eps_reward', value=np.sum(rewards), it=num_steps * 4, it_type='per_frames')
            _write_scalar(tag='eps_reward', value=np.sum(rewards), it=ep, it_type='per_episode')
            _write_scalar(tag='epsilon', value=epsilon, it=ep, it_type='per_episode')

            # Remaining items that are not bootstrappable: the partial trajectory close to
            # the end of the episode. Append them to the memory & replay buffer.
            for t in range(len(rewards) - N, len(rewards)):
                R = np.sum([gamma**(i - t) * rewards[i] for i in range(t, len(rewards))])

                nec.append(es[t], acs[t], R)
                replay_buffer.append(obs[t], acs[t], R)

            # epsilon decay
            epsilon = max(min_epsilon, epsilon * epsilon_decay)

            # Save Model & Evaluate
            if ep % eval_period == 0:
                try:
                    ep_len, eps_reward = _run(env, nec,
                                              os.path.join(log_dir, f'test-{ep}.mp4'),
                                              maxlen=len(obs) * 3)
                    print(f'Evaluation -- Episode {ep} -- Ep Len: {ep_len} Acc Reward: {eps_reward}')
                    _write_scalar(tag='ep_len', value=ep_len, it=ep, it_type='per_episode_eval')
                    _write_scalar(tag='eps_reward', value=eps_reward, it=ep, it_type='per_episode_eval')
                except RuntimeError as e:
                    print(e)
                    print('Evaluation -- Skipped')

            if num_steps >= next_save_steps:
                nec.save(log_dir, it=next_save_steps * 4)  # iteration number -- num frames
                next_save_steps += save_period

    except KeyboardInterrupt:
        print('saving... please wait...')
        nec.save(log_dir)
        print('done!')
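# The NEC loop above mixes two return types: an N-step bootstrapped return for timesteps
# that have N future rewards, and a plain discounted tail return for the last N steps of
# the episode. The helper below is a minimal sketch of both cases on plain Python lists;
# `n_step_returns` is a hypothetical name, not part of the code above, and it assumes
# values[t] = V(s_t) is available for t in [0, T].
import numpy as np


def n_step_returns(rewards, values, gamma, N):
    """rewards[t] = r_t for t in [0, T); values[t] = V(s_t) for t in [0, T]."""
    T = len(rewards)
    disc = np.array([gamma**i for i in range(N)])
    R = []
    for t in range(T):
        if t + N <= T:
            # Bootstrapped: N discounted rewards plus gamma^N * V(s_{t+N})
            R.append(float(disc @ np.asarray(rewards[t:t + N]) + (gamma**N) * values[t + N]))
        else:
            # Tail of the episode: no bootstrap, only the remaining discounted rewards
            R.append(float(sum(gamma**(i - t) * rewards[i] for i in range(t, T))))
    return R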
max_steps = 500
batch_size = 64
frame_idx = 0
latest_10_returns = deque(maxlen=10)

while True:
    state = env.reset()
    ou_noise.reset()
    episode_reward = 0
    loss_act, loss_cri = 0, 0

    for t in range(max_steps):
        action = act_net.get_action([state])
        action = ou_noise.get_action(action, t)
        next_state, reward, done, _ = env.step(action)

        replay_buffer.append(state, action, reward, next_state, done)
        state = next_state
        frame_idx += 1

        if len(replay_buffer) > batch_size:
            loss_act, loss_cri = ddpg_update(batch_size)

        episode_reward += reward
        if done:
            break

    latest_10_returns.append(episode_reward)
    mean_return = np.mean(latest_10_returns)

    if frame_idx % 500 == 0:
        print('Frame_idx: %d, loss_act: %.3f, loss_cri: %.3f, mean_return: %.3f' %
              (frame_idx, loss_act, loss_cri, float(mean_return)))

    if mean_return > -300:
        torch.save(act_net.state_dict(), identity + '_act.pth')
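# `ou_noise` used above is not defined in this snippet. Below is a minimal sketch of an
# Ornstein-Uhlenbeck exploration-noise helper with the same reset() / get_action(action, t)
# interface; the class name and the theta/sigma/decay values are illustrative assumptions,
# not taken from the code above.
import numpy as np


class OUNoise:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.3,
                 min_sigma=0.05, decay_period=10000):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.max_sigma = sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.reset()

    def reset(self):
        # Restart the process at its mean
        self.state = np.ones(self.action_dim) * self.mu

    def get_action(self, action, t=0):
        # One Euler step of dx = theta * (mu - x) dt + sigma dW, then add the noise to the action
        sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
        dx = self.theta * (self.mu - self.state) + sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return np.asarray(action) + self.state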
class Brain:
    """ The Brain that contains all the models """

    def __init__(self, num_states, num_actions, action_high, action_low,
                 gamma=GAMMA, rho=RHO, std_dev=STD_DEV):
        # initialize everything
        self.actor_network = ActorNetwork(num_states, num_actions, action_high)
        self.critic_network = CriticNetwork(num_states, num_actions, action_high)
        self.actor_target = ActorNetwork(num_states, num_actions, action_high)
        self.critic_target = CriticNetwork(num_states, num_actions, action_high)

        # Making the weights equal initially
        self.actor_target.set_weights(self.actor_network.get_weights())
        self.critic_target.set_weights(self.critic_network.get_weights())

        self.buffer = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE)

        self.gamma = tf.constant(gamma)
        self.rho = rho
        self.action_high = action_high
        self.action_low = action_low
        self.num_states = num_states
        self.num_actions = num_actions
        self.noise = OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1))

        # optimizers
        self.critic_optimizer = tf.keras.optimizers.Adam(CRITIC_LR, amsgrad=True)
        self.actor_optimizer = tf.keras.optimizers.Adam(ACTOR_LR, amsgrad=True)

        # temporary variable for side effects
        self.cur_action = None

        # define update weights with tf.function for improved performance
        @tf.function(input_signature=[
            tf.TensorSpec(shape=(None, num_states), dtype=tf.float32),
            tf.TensorSpec(shape=(None, num_actions), dtype=tf.float32),
            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
            tf.TensorSpec(shape=(None, num_states), dtype=tf.float32),
            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
        ])
        def update_weights(s, a, r, sn, d):
            """ Function to update weights with optimizer """
            with tf.GradientTape() as tape:
                # define target
                y = r + self.gamma * (1 - d) * self.critic_target([sn, self.actor_target(sn)])
                # define the delta Q (critic loss)
                critic_loss = tf.math.reduce_mean(tf.math.abs(y - self.critic_network([s, a])))
            critic_grad = tape.gradient(critic_loss, self.critic_network.trainable_variables)
            self.critic_optimizer.apply_gradients(
                zip(critic_grad, self.critic_network.trainable_variables))

            with tf.GradientTape() as tape:
                # define the delta mu (actor loss)
                actor_loss = -tf.math.reduce_mean(self.critic_network([s, self.actor_network(s)]))
            actor_grad = tape.gradient(actor_loss, self.actor_network.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor_network.trainable_variables))
            return critic_loss, actor_loss

        self.update_weights = update_weights

    def act(self, state, _notrandom=True, noise=True):
        """ Run action by the actor network

        Args:
            state: the current state
            _notrandom: whether greedy is used
            noise: whether noise is to be added to the result action (this improves exploration)

        Returns:
            the resulting action
        """
        self.cur_action = (self.actor_network(state)[0].numpy()
                           if _notrandom
                           else (np.random.uniform(self.action_low, self.action_high,
                                                   self.num_actions))
                           + (self.noise() if noise else 0))
        self.cur_action = np.clip(self.cur_action, self.action_low, self.action_high)
        maxQ = max(self.critic_network([state, self.actor_network(state)])).numpy()[0]
        return self.cur_action, maxQ

    def remember(self, prev_state, reward, state, done):
        """ Store states, reward, done value to the buffer """
        # record it in the buffer based on its reward
        self.buffer.append(prev_state, self.cur_action, reward, state, done)

    def learn(self, entry):
        """ Run update for all networks (for training) """
        s, a, r, sn, d = zip(*entry)

        c_l, a_l = self.update_weights(tf.convert_to_tensor(s, dtype=tf.float32),
                                       tf.convert_to_tensor(a, dtype=tf.float32),
                                       tf.convert_to_tensor(r, dtype=tf.float32),
                                       tf.convert_to_tensor(sn, dtype=tf.float32),
                                       tf.convert_to_tensor(d, dtype=tf.float32))

        update_target(self.actor_target, self.actor_network, self.rho)
        update_target(self.critic_target, self.critic_network, self.rho)

        return c_l, a_l

    def save_weights(self, path):
        """ Save weights to `path` """
        parent_dir = os.path.dirname(path)
        if not os.path.exists(parent_dir):
            os.makedirs(parent_dir)
        # Save the weights
        self.actor_network.save_weights(path + "an.h5")
        self.critic_network.save_weights(path + "cn.h5")
        self.critic_target.save_weights(path + "ct.h5")
        self.actor_target.save_weights(path + "at.h5")

    def load_weights(self, path):
        """ Load weights from path """
        try:
            self.actor_network.load_weights(path + "an.h5")
            self.critic_network.load_weights(path + "cn.h5")
            self.critic_target.load_weights(path + "ct.h5")
            self.actor_target.load_weights(path + "at.h5")
        except OSError as err:
            logging.warning("Weights files cannot be found, %s", err)
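# `update_target(target, source, rho)` used in Brain.learn is not shown in this snippet.
# Below is a minimal sketch of the usual Polyak (soft) target update for Keras models.
# It assumes the convention target <- rho * target + (1 - rho) * source; the original
# helper's exact convention for rho is not visible here and may differ.
def update_target(target_model, source_model, rho):
    # Blend each weight tensor of the source network into the target network
    new_weights = [rho * tw + (1.0 - rho) * sw
                   for tw, sw in zip(target_model.get_weights(), source_model.get_weights())]
    target_model.set_weights(new_weights)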