class Agent(BaseAgent): '''Deep Trading Agent based on Deep Q Learning''' '''TODO: 1. add `play` function to run tests in the simulated environment ''' def __init__(self, sess, logger, config, env): super(Agent, self).__init__(config, logger) self.sess = sess self.logger = logger self.config = config params = DeepSenseParams(config) self.env = env self.history = History(logger, config) self.replay_memory = ReplayMemory(logger, config) with tf.variable_scope(STEPS): self.step_op = tf.Variable(0, trainable=False, name=STEP) self.step_input = tf.placeholder('int32', None, name=STEP_INPUT) self.step_assign_op = self.step_op.assign(self.step_input) self.build_dqn(params) @property def summary_writer(self): return self._summary_writer def train(self): start_step = self.sess.run(self.step_op) num_episodes, self.update_count, ep_reward = 0, 0, 0. total_reward, self.total_loss, self.total_q = 0., 0., 0. max_avg_ep_reward = 0 ep_rewards, actions = [], [] trade_rem = self.env.new_random_episode(self.history, self.replay_memory) for self.step in tqdm(range(start_step, self.max_step), ncols=70, initial=start_step): if self.step == self.learn_start: num_episodes, self.update_count, ep_reward = 0, 0, 0. total_reward, self.total_loss, self.total_q = 0., 0., 0. ep_rewards, actions = [], [] # 1. predict action = self.predict((self.history.history, trade_rem)) # 2. act screen, reward, terminal, trade_rem = self.env.act(action) # 3. observe self.observe(screen, reward, action, terminal, trade_rem) if terminal: self.env.new_random_episode(self.history, self.replay_memory) num_episodes += 1 ep_rewards.append(ep_reward) ep_reward = 0. else: ep_reward += reward actions.append(action) total_reward += reward if self.step >= self.learn_start: if self.step % self.test_step == self.test_step - 1: avg_reward = total_reward / self.test_step avg_loss = self.total_loss / self.update_count avg_q = self.total_q / self.update_count try: max_ep_reward = np.max(ep_rewards) min_ep_reward = np.min(ep_rewards) avg_ep_reward = np.mean(ep_rewards) except: max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0 message = 'avg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d' \ % (avg_reward, avg_loss, avg_q, avg_ep_reward, max_ep_reward, min_ep_reward, num_episodes) self.logger.info(message) if max_avg_ep_reward * 0.9 <= avg_ep_reward: self.sess.run( fetches=self.step_assign_op, feed_dict={self.step_input: self.step + 1} ) self.save_model(self.step + 1) max_avg_ep_reward = max(max_avg_ep_reward, avg_ep_reward) if self.step > 180: self.inject_summary({ 'average.reward': avg_reward, 'average.loss': avg_loss, 'average.q': avg_q, 'episode.max reward': max_ep_reward, 'episode.min reward': min_ep_reward, 'episode.avg reward': avg_ep_reward, 'episode.num of episodes': num_episodes, 'episode.rewards': ep_rewards, 'episode.actions': actions, 'training.learning_rate': self.sess.run( fetches=self.learning_rate_op, feed_dict={self.learning_rate_step: self.step} ) }, self.step) num_episodes = 0 total_reward = 0. self.total_loss = 0. self.total_q = 0. self.update_count = 0 ep_reward = 0. 
ep_rewards = [] actions = [] def predict(self, state, test_ep=None): s_t = state[0] trade_rem_t = state[1] ep = test_ep or (self.ep_end + max(0., (self.ep_start - self.ep_end) \ * (self.ep_end_t - max(0., self.step - self.learn_start)) / self.ep_end_t)) if random.random() < ep: action = random.randrange(self.config[NUM_ACTIONS]) else: action = self.sess.run( fetches=self.q.action, feed_dict={ self.q.phase: 0, self.s_t: [s_t], self.trade_rem_t: [trade_rem_t], self.q_conv_keep_prob: 1.0, self.q_dense_keep_prob: 1.0, self.q_gru_keep_prob: 1.0 } )[0] return action def observe(self, screen, reward, action, terminal, trade_rem): #clip reward in the range min to max reward = max(self.min_reward, min(self.max_reward, reward)) self.history.add(screen) self.replay_memory.add(screen, reward, action, terminal, trade_rem) if self.step > self.learn_start: if self.step % self.train_frequency == 0: self.q_learning_mini_batch() if self.step % self.target_q_update_step == self.target_q_update_step - 1: self.update_target_network() def q_learning_mini_batch(self): if self.replay_memory.count >= self.replay_memory.history_length: state_t, action, reward, state_t_plus_1, terminal = self.replay_memory.sample s_t, trade_rem_t = state_t[0], state_t[1] s_t_plus_1, trade_rem_t_plus_1 = state_t_plus_1[0], state_t_plus_1[1] q_t_plus_1 = self.sess.run( fetches=self.t_q.values, feed_dict={ self.t_q.phase: 0, self.t_s_t: s_t_plus_1, self.t_trade_rem_t: trade_rem_t_plus_1 } ) max_q_t_plus_1 = np.max(q_t_plus_1, axis=1) terminal = np.array(terminal) + 0. target_q = reward + (1 - terminal) * max_q_t_plus_1 _, q_t, loss, avg_q_summary = self.sess.run([self.optimizer, self.q.values, self.loss, self.q.avg_q_summary], { self.q.phase: 1, self.target_q: target_q, self.action: action, self.s_t: s_t, self.trade_rem_t: trade_rem_t, self.q_conv_keep_prob: self.config[CONV_KEEP_PROB], self.q_dense_keep_prob: self.config[DENSE_KEEP_PROB], self.q_gru_keep_prob: self.config[GRU_KEEP_PROB], self.learning_rate_step: self.step }) self.summary_writer.add_summary(avg_q_summary, self.step) self.total_loss += loss self.total_q += q_t.mean() self.update_count += 1 def build_dqn(self, params): with tf.variable_scope(PREDICTION): self.s_t = tf.placeholder( dtype=tf.float32, shape=[None, self.replay_memory.history_length, self.replay_memory.num_channels], name=HISTORICAL_PRICES ) self.trade_rem_t = tf.placeholder( dtype=tf.float32, shape=[None,], name=TRADE_REM ) with tf.variable_scope(DROPOUT_KEEP_PROBS): self.q_conv_keep_prob = tf.placeholder(tf.float32) self.q_dense_keep_prob = tf.placeholder(tf.float32) self.q_gru_keep_prob = tf.placeholder(tf.float32) params.dropoutkeepprobs = DropoutKeepProbs( self.q_conv_keep_prob, self.q_dense_keep_prob, self.q_gru_keep_prob ) self.q = DeepSense(params, self.logger, self.sess, self.config, name=Q_NETWORK) self.q.build_model((self.s_t, self.trade_rem_t)) with tf.variable_scope(TARGET): self.t_s_t = tf.placeholder( dtype=tf.float32, shape=[None, self.replay_memory.history_length, self.replay_memory.num_channels], name=HISTORICAL_PRICES ) self.t_trade_rem_t = tf.placeholder( dtype=tf.float32, shape=[None,], name=TRADE_REM ) params.dropoutkeepprobs = DropoutKeepProbs() self.t_q = DeepSense(params, self.logger, self.sess, self.config, name=T_Q_NETWORK) self.t_q.build_model((self.t_s_t, self.t_trade_rem_t)) with tf.variable_scope(UPDATE_TARGET_NETWORK): self.q_weights_placeholders = {} self.t_weights_assign_ops = {} for name in self.q.weights.keys(): self.q_weights_placeholders[name] = tf.placeholder( 
tf.float32, self.q.weights[name].get_shape().as_list() ) for name in self.q.weights.keys(): self.t_weights_assign_ops[name] = self.t_q.weights[name].assign( self.q_weights_placeholders[name] ) with tf.variable_scope(TRAINING): self.target_q = tf.placeholder(tf.float32, [None], name=TARGET_Q) self.action = tf.placeholder(tf.int64, [None], name=ACTION) action_one_hot = tf.one_hot(self.action, self.config[NUM_ACTIONS], 1.0, 0.0, name=ACTION_ONE_HOT) q_acted = tf.reduce_sum(self.q.values * action_one_hot, reduction_indices=1, name=Q_ACTED) with tf.variable_scope(LOSS): self.delta = self.target_q - q_acted self.global_step = tf.Variable(0, trainable=False) self.loss = tf.reduce_mean(clipped_error(self.delta), name=LOSS) with tf.variable_scope(OPTIMIZER): self.learning_rate_step = tf.placeholder(tf.int64, None, name=LEARNING_RATE_STEP) self.learning_rate_op = tf.maximum(self.learning_rate_minimum, tf.train.exponential_decay( self.learning_rate, self.learning_rate_step, self.learning_rate_decay_step, self.learning_rate_decay, staircase=True)) self.optimizer = tf.train.RMSPropOptimizer( self.learning_rate_op, momentum=0.95, epsilon=0.01).minimize(self.loss) with tf.variable_scope(SUMMARY): scalar_summary_tags = ['average.reward', 'average.loss', 'average.q', \ 'episode.max reward', 'episode.min reward', 'episode.avg reward', \ 'episode.num of episodes', 'training.learning_rate'] self.summary_placeholders = {} self.summary_ops = {} for tag in scalar_summary_tags: self.summary_placeholders[tag] = \ tf.placeholder('float32', None, name=tag.replace(' ', '_')) self.summary_ops[tag] = \ tf.summary.scalar( name="{}-{}".format(self.env_name, tag.replace(' ', '_')), tensor=self.summary_placeholders[tag] ) histogram_summary_tags = ['episode.rewards', 'episode.actions'] for tag in histogram_summary_tags: self.summary_placeholders[tag] = \ tf.placeholder('float32', None, name=tag) self.summary_ops[tag] = \ tf.summary.histogram( tag, self.summary_placeholders[tag] ) self.sess.run(tf.local_variables_initializer()) self.sess.run(tf.global_variables_initializer()) self._saver = tf.train.Saver(self.q.weights.values() + [self.step_op], max_to_keep=30) self.load_model() self.update_target_network() self._summary_writer = tf.summary.FileWriter(self.config[TENSORBOARD_LOG_DIR]) self._summary_writer.add_graph(self.sess.graph) def update_target_network(self): for name in self.q.weights.keys(): self.sess.run( fetches=self.t_weights_assign_ops[name], feed_dict= {self.q_weights_placeholders[name]: self.sess.run( fetches=self.q.weights[name] )} ) def inject_summary(self, tag_dict, step): summary_str_lists = self.sess.run([self.summary_ops[tag] for tag in tag_dict.keys()], { self.summary_placeholders[tag]: value for tag, value in tag_dict.items() }) for summary_str in summary_str_lists: self.summary_writer.add_summary(summary_str, self.step)
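# The loss graphs above call a `clipped_error` helper that is not shown in this
# section. A common Huber-style definition is sketched below as an assumption
# about what it computes, not as the project's exact code.
import tensorflow as tf

def clipped_error(x):
    # Huber-style error clipping: quadratic near zero, linear in the tails,
    # which keeps the DQN gradient bounded for large TD errors.
    return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)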
# `copy` is needed for the target-network deep copy below; torch, math, random,
# ReplayMemory and Transition are assumed to be imported at module level.
import copy


class DQN:
    """Deep Q-Network agent (PyTorch): a policy network trained against a
    periodically synchronised target network."""

    def __init__(self, config, network, loss, optimizer):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.memory = ReplayMemory(config['REPLAY'])
        self.policy_net = network.to(self.device)
        # Deep-copy the network so the target has its own parameters; assigning
        # the same module twice would make the target update below a no-op.
        self.target_net = copy.deepcopy(network).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.loss = loss
        self.optimizer = optimizer(self.policy_net.parameters(), config['lr'])
        self.steps_done = 0
        self.config = config

    def update(self):
        """Hard update: copy the policy weights into the target network."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def select_action(self, state):
        EPS_START, EPS_END, EPS_DECAY, n_actions = (
            self.config['EPS_START'], self.config['EPS_END'],
            self.config['EPS_DECAY'], self.config['ACTION_SPACE'])
        sample = random.random()
        eps_threshold = EPS_END + (EPS_START - EPS_END) * \
            math.exp(-1. * self.steps_done / EPS_DECAY)
        self.steps_done += 1
        if sample > eps_threshold:
            with torch.no_grad():
                # max(1) returns the largest column value of each row; its second
                # element is the index of that maximum, i.e. the greedy action.
                return self.policy_net(state).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(n_actions)]],
                                device=self.device, dtype=torch.long)

    def optimize_model(self):
        BATCH_SIZE = self.config['BATCH_SIZE']
        if len(self.memory) < BATCH_SIZE:
            return
        transitions = self.memory.sample(BATCH_SIZE)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043):
        # this converts a batch-array of Transitions to a Transition of
        # batch-arrays.
        batch = Transition(*zip(*transitions))

        # Mask of non-final states and the concatenated batch elements
        # (a final state is the one after which the simulation ended).
        non_final_mask = torch.tensor(
            tuple(map(lambda s: s is not None, batch.next_state)),
            device=self.device, dtype=torch.bool)
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None])
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Q(s_t, a): the model computes Q(s_t), then we select the columns of
        # the actions that were actually taken.
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        # V(s_{t+1}) for all next states, computed with the "older" target_net
        # and merged with the mask so that final states contribute 0.
        next_state_values = torch.zeros(BATCH_SIZE, device=self.device)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()

        # Expected Q values.
        GAMMA = self.config['GAMMA']
        expected_state_action_values = (next_state_values * GAMMA) + reward_batch

        # Huber loss.
        loss = self.loss(state_action_values,
                         expected_state_action_values.unsqueeze(1))

        # Optimize the model, clamping gradients to stabilise training.
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
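# A minimal sketch of how this DQN class might be driven, assuming a Gym
# CartPole environment with the pre-0.26 reset/step API and a small
# feed-forward Q-network. The 'TARGET_UPDATE' key and the
# memory.push(state, action, next_state, reward) signature are assumptions
# inferred from the Transition fields used in optimize_model(), not part of
# the class above.
import gym
import torch
import torch.nn as nn

env = gym.make('CartPole-v0')

# Tiny Q-network: observation (4 values) -> one Q-value per discrete action.
q_net = nn.Sequential(nn.Linear(4, 64), nn.ReLU(),
                      nn.Linear(64, env.action_space.n))

config = {'REPLAY': 10000, 'lr': 1e-3, 'EPS_START': 0.9, 'EPS_END': 0.05,
          'EPS_DECAY': 200, 'ACTION_SPACE': env.action_space.n,
          'BATCH_SIZE': 128, 'GAMMA': 0.999, 'TARGET_UPDATE': 10}
agent = DQN(config, q_net, nn.SmoothL1Loss(), torch.optim.RMSprop)

for episode in range(50):
    obs = env.reset()                                   # pre-0.26 gym API assumed
    state = torch.tensor([obs], dtype=torch.float32, device=agent.device)
    done = False
    while not done:
        action = agent.select_action(state)             # epsilon-greedy, shape (1, 1)
        obs, reward, done, _ = env.step(action.item())
        reward_t = torch.tensor([reward], dtype=torch.float32, device=agent.device)
        next_state = None if done else torch.tensor(
            [obs], dtype=torch.float32, device=agent.device)
        # Assumed push signature, matching Transition(state, action, next_state, reward).
        agent.memory.push(state, action, next_state, reward_t)
        state = next_state
        agent.optimize_model()
    if episode % config['TARGET_UPDATE'] == 0:
        agent.update()                                   # hard-sync the target network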
class Agent(BaseAgent): '''Deep Trading Agent based on Deep Q Learning''' '''TODO: 1. play ''' def __init__(self, sess, logger, config, env): super(Agent, self).__init__(config, logger) self.sess = sess self.logger = logger self.config = config params = DeepSenseParams(config) self.env = env self.history = History(logger, config) self.replay_memory = ReplayMemory(logger, config) with tf.variable_scope(STEPS): self.step_op = tf.Variable(0, trainable=False, name=STEP) self.step_input = tf.placeholder('int32', None, name=STEP_INPUT) self.step_assign_op = self.step_op.assign(self.step_input) self.build_dqn(params) @property def summary_writer(self): return self._summary_writer def train(self): start_step = self.step_op.eval() num_episodes, self.update_count, ep_reward = 0, 0, 0. total_reward, self.total_loss, self.total_q = 0., 0., 0. max_avg_ep_reward = 0 ep_rewards, actions = [], [] self.env.new_random_episode(self.history) for self.step in tqdm(range(start_step, self.max_step), ncols=70, initial=start_step): if self.step == self.learn_start: num_episodes, self.update_count, ep_reward = 0, 0, 0. total_reward, self.total_loss, self.total_q = 0., 0., 0. ep_rewards, actions = [], [] # 1. predict action = self.predict(self.history.get()) # 2. act screen, reward, terminal = self.env.act(action) # 3. observe self.observe(screen, reward, action, terminal) if terminal: self.env.new_random_episode(self.history) num_episodes += 1 ep_rewards.append(ep_reward) ep_reward = 0. else: ep_reward += reward actions.append(action) total_reward += reward if self.step >= self.learn_start: if self.step % self.test_step == self.test_step - 1: avg_reward = total_reward / self.test_step avg_loss = self.total_loss / self.update_count avg_q = self.total_q / self.update_count try: max_ep_reward = np.max(ep_rewards) min_ep_reward = np.min(ep_rewards) avg_ep_reward = np.mean(ep_rewards) except: max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0 message = 'avg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # game: %d' \ % (avg_reward, avg_loss, avg_q, avg_ep_reward, max_ep_reward, min_ep_reward, num_game) print_and_log_message(message, self.logger) if max_avg_ep_reward * 0.9 <= avg_ep_reward: self.step_assign_op.eval( {self.step_input: self.step + 1}) self.save_model(self.step + 1) max_avg_ep_reward = max(max_avg_ep_reward, avg_ep_reward) if self.step > 180: self.inject_summary( { 'average.reward': avg_reward, 'average.loss': avg_loss, 'average.q': avg_q, 'episode.max reward': max_ep_reward, 'episode.min reward': min_ep_reward, 'episode.avg reward': avg_ep_reward, 'episode.num of game': num_game, 'episode.rewards': ep_rewards, 'episode.actions': actions, 'training.learning_rate': self.learning_rate_op.eval( {self.learning_rate_step: self.step}), }, self.step) num_game = 0 total_reward = 0. self.total_loss = 0. self.total_q = 0. self.update_count = 0 ep_reward = 0. 
ep_rewards = [] actions = [] def predict(self, s_t, test_ep=None): ep = test_ep or (self.ep_end + max(0., (self.ep_start - self.ep_end) \ * (self.ep_end_t - max(0., self.step - self.learn_start)) / self.ep_end_t)) if random.random() < ep: action = random.randrange(self.env.action_size) else: action = self.q.action.eval({self.s_t: [s_t]})[0] return action def observe(self, screen, reward, action, terminal): #clip reward in the range min to max reward = max(self.min_reward, min(self.max_reward, reward)) self.history.add(screen) self.replay_memory.add(screen, reward, action, terminal) if self.step > self.learn_start: if self.step % self.train_frequency == 0: self.q_learning_mini_batch() if self.step % self.target_q_update_step == self.target_q_update_step - 1: self.update_target_network() def q_learning_mini_batch(self): if self.replay_memory.count >= self.replay_memory.history_length: s_t, action, reward, s_t_plus_1, terminal = self.replay_memory.sample( ) max_q_t_plus_1 = self.t_q.action.eval({self.t_s_t: s_t_plus_1}) terminal = np.array(terminal) + 0. target_q = reward + (1 - terminal) * max_q_t_plus_1 _, q_t, loss, avg_q_summary = self.sess.run( [ self.optimizer, self.q.values, self.loss, self.q.avg_q_summary ], { self.target_q: target_q, self.action: action, self.s_t: s_t, self.learning_rate_step: self.step, }) self.summary_writer.add_summary(avg_q_summary, self.step) self.total_loss += loss self.total_q += q_t.mean() self.update_count += 1 def build_dqn(self, params): with tf.variable_scope(PREDICTION): self.s_t = tf.placeholder(dtype=tf.float32, shape=[ None, self.replay_memory.history_length, self.replay_memory.num_channels ]) self.q = DeepSense(params, self.logger, self.sess, self.config, name=Q_NETWORK) self.q.build_model(self.s_t) with tf.variable_scope(TARGET): self.t_s_t = tf.placeholder(dtype=tf.float32, shape=[ None, self.replay_memory.history_length, self.replay_memory.num_channels ]) self.t_q = DeepSense(params, self.logger, self.sess, self.config, name=T_Q_NETWORK) self.t_q.build_model(self.t_s_t, train=False) with tf.variable_scope(UPDATE_TARGET_NETWORK): self.q_weights_placeholders = {} self.t_weights_assign_ops = {} for name in self.q.weights.keys(): self.q_weights_placeholders[name] = tf.placeholder( tf.float32, self.q.weights[name].get_shape().as_list()) for name in self.q.weights.keys(): self.t_weights_assign_ops[name] = self.t_q.weights[ name].assign(self.q_weights_placeholders[name]) with tf.variable_scope(TRAINING): self.target_q = tf.placeholder(tf.float32, [None], name=TARGET_Q) self.action = tf.placeholder(tf.int64, [None], name=ACTION) action_one_hot = tf.one_hot(self.action, self.env.action_size, 1.0, 0.0, name=ACTION_ONE_HOT) q_acted = tf.reduce_sum(self.q.values * action_one_hot, reduction_indices=1, name=Q_ACTED) with tf.variable_scope(LOSS): self.delta = self.target_q - q_acted self.global_step = tf.Variable(0, trainable=False) self.loss = tf.reduce_mean(clipped_error(self.delta), name=LOSS) with tf.variable_scope(OPTIMIZER): self.learning_rate_step = tf.placeholder( tf.int64, None, name=LEARNING_RATE_STEP) self.learning_rate_op = tf.maximum( self.learning_rate_minimum, tf.train.exponential_decay(self.learning_rate, self.learning_rate_step, self.learning_rate_decay_step, self.learning_rate_decay, staircase=True)) self.optimizer = tf.train.RMSPropOptimizer( self.learning_rate_op, momentum=0.95, epsilon=0.01).minimize(self.loss) with tf.variable_scope(SUMMARY): scalar_summary_tags = ['average.reward', 'average.loss', 'average.q', \ 'episode.max reward', 
                                   'episode.min reward', 'episode.avg reward', \
                                   'episode.num of game', 'training.learning_rate']

            self.summary_placeholders = {}
            self.summary_ops = {}

            for tag in scalar_summary_tags:
                self.summary_placeholders[tag] = \
                    tf.placeholder('float32', None, name=tag.replace(' ', '_'))
                # Summary names may not contain spaces, so replace them here as well.
                self.summary_ops[tag] = \
                    tf.summary.scalar(
                        name="{}-{}".format(self.env_name, tag.replace(' ', '_')),
                        tensor=self.summary_placeholders[tag]
                    )

            histogram_summary_tags = ['episode.rewards', 'episode.actions']

            for tag in histogram_summary_tags:
                self.summary_placeholders[tag] = \
                    tf.placeholder('float32', None, name=tag.replace(' ', '_'))
                self.summary_ops[tag] = \
                    tf.summary.histogram(
                        name=tag,
                        values=self.summary_placeholders[tag]
                    )

        self._summary_writer = tf.summary.FileWriter(
            self.config[TENSORBOARD_LOG_DIR])
        self._summary_writer.add_graph(self.sess.graph)

        self.sess.run(tf.global_variables_initializer())
        self._saver = tf.train.Saver(list(self.q.weights.values()) + [self.step_op],
                                     max_to_keep=30)

        self.load_model()
        # Initialize the target network with the same weights as the Q network.
        self.update_target_network()

    def update_target_network(self):
        for name in self.q.weights.keys():
            self.t_weights_assign_ops[name].eval({
                self.q_weights_placeholders[name]: self.q.weights[name].eval()
            })

    def inject_summary(self, tag_dict, step):
        summary_str_lists = self.sess.run(
            [self.summary_ops[tag] for tag in tag_dict.keys()], {
                self.summary_placeholders[tag]: value
                for tag, value in tag_dict.items()
            })
        for summary_str in summary_str_lists:
            self.summary_writer.add_summary(summary_str, self.step)
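# The predict() methods above anneal epsilon linearly from ep_start to ep_end
# over ep_end_t steps once learning starts. The same expression, pulled out as
# a standalone function; the default values are illustrative assumptions, not
# the project's configuration.
def linear_epsilon(step, ep_start=1.0, ep_end=0.1, ep_end_t=500000,
                   learn_start=1000):
    # Epsilon stays at ep_start until learn_start, decays linearly for
    # ep_end_t steps, then stays at ep_end.
    return ep_end + max(0.0, (ep_start - ep_end) *
                        (ep_end_t - max(0.0, step - learn_start)) / ep_end_t)

# e.g. linear_epsilon(0) == 1.0, linear_epsilon(1000 + 500000) == 0.1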
def run(comm, env, policy, policy_path, action_bound, optimizer): actor, actor_target, critic, critic_target = policy actor_opt, critic_opt = optimizer # set OU noise noise_c = OUNoise(action_dim=ACT_SIZE, action_bound=action_bound) noise_c.reset() buff = [] global_update = 0 global_step = 0 replay_memory = ReplayMemory(REPLAY_SIZE, SEED) #world reset if env.index == 0: env.reset_world() for id in range(MAX_EPISODES): #reset env.reset_pose() terminal = False ep_reward = 0 step = 1 # generate goal env.generate_goal_point() # get_state obs = env.get_laser_observation() obs_stack = deque([obs, obs, obs]) goal = np.asarray(env.get_local_goal()) speed = np.asarray(env.get_self_speed()) state = [obs_stack, goal, speed] # episode 1 while not terminal and not rospy.is_shutdown(): state_list = comm.gather(state, root=0) ## get_action #------------------------------------------------------------------------- # generate actions at rank==0 mean, action = generate_action(env=env, state_list=state_list, actor=actor, action_bound=action_bound) ## exploration #-------------------------------------------------------------------------- a = noise_c.get_action(mean, step) #noise = np.random.normal(0, exploration_noise, size=2) #action size check #a = mean + noise #a = a.clip(action_bound[0], action_bound[1]) # execute actions #------------------------------------------------------------------------- real_action = comm.scatter(a, root=0) ## run action #------------------------------------------------------------------------- env.control_vel(real_action) # rate.sleep() rospy.sleep(0.001) ## get reward #------------------------------------------------------------------------- # get informtion r, terminal, result = env.get_reward_and_terminate(step) ep_reward += r global_step += 1 #------------------------------------------------------------------------- # get next state #------------------------------------------------------------------------- s_next = env.get_laser_observation() left = obs_stack.popleft() obs_stack.append(s_next) goal_next = np.asarray(env.get_local_goal()) speed_next = np.asarray(env.get_self_speed()) state_next = [obs_stack, goal_next, speed_next] r_list = comm.gather(r, root=0) terminal_list = comm.gather(terminal, root=0) state_next_list = comm.gather(state_next, root=0) #------------------------------------------------------------------------- ## training #------------------------------------------------------------------------- if env.index == 0: ## save data in memory #------------------------------------------------------------------------- replay_memory.push(state[0], state[1], state[2], a, r_list, state_next[0], state_next[1], state_next[2], terminal_list) policy_list = [actor, actor_target, critic, critic_target] optimizer_list = [actor_opt, critic_opt] if len(replay_memory) > BATCH_SIZE: ## Update step by step #------------------------------------------------------------------------- ddpg_update_stage(policy=policy_list, optimizer=optimizer_list, batch_size=BATCH_SIZE, memory=replay_memory, epoch=EPOCH, replay_size=REPLAY_SIZE, gamma=GAMMA, num_step=BATCH_SIZE, num_env=NUM_ENV, frames=LASER_HIST, obs_size=OBS_SIZE, act_size=ACT_SIZE, tau=TAU) global_update += 1 step += 1 state = state_next ## save policy and log #------------------------------------------------------------------------- if env.index == 0: if global_update != 0 and global_update % 3000 == 0: torch.save(actor.state_dict(), policy_path + '/actor_{}'.format(global_update)) torch.save(critic.state_dict(), policy_path + 
                           '/critic_{}'.format(global_update))
                logger.info(
                    '########################## model saved when update {} times#########'
                    '################'.format(global_update))

        distance = np.sqrt((env.goal_point[0] - env.init_pose[0])**2 +
                           (env.goal_point[1] - env.init_pose[1])**2)

        logger.info('Env %02d, Goal (%05.1f, %05.1f), Episode %05d, step %03d, Reward %-5.1f, Distance %05.1f, %s' % \
                    (env.index, env.goal_point[0], env.goal_point[1], id + 1, step, ep_reward, distance, result))
        logger_cal.info(ep_reward)
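# The DDPG loop above relies on an OUNoise helper (reset / get_action) for
# exploration. A minimal Ornstein-Uhlenbeck implementation with that interface
# is sketched below; parameter names and defaults are assumptions, not the
# project's exact class.
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck exploration noise added to the deterministic action."""

    def __init__(self, action_dim, action_bound, mu=0.0, theta=0.15,
                 max_sigma=0.3, min_sigma=0.05, decay_period=100000):
        self.action_dim = action_dim
        self.low, self.high = action_bound[0], action_bound[1]
        self.mu, self.theta = mu, theta
        self.max_sigma, self.min_sigma = max_sigma, min_sigma
        self.sigma = max_sigma
        self.decay_period = decay_period
        self.reset()

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def evolve_state(self):
        # dx = theta * (mu - x) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) + \
            self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state

    def get_action(self, action, t=0):
        # Anneal sigma over time and clip the noisy action to the action bounds.
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * \
            min(1.0, t / self.decay_period)
        return np.clip(action + self.evolve_state(), self.low, self.high)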
def run(comm, env, policy, critic, critic_opt, critic_target, policy_path, action_bound, optimizer): buff = [] global_update = 0 global_step = 0 # world reset if env.index == 0: env.reset_world() memory_position = 0 update = 0 # replay_memory replay_memory = ReplayMemory(REPLAY_SIZE, SEED) for id in range(MAX_EPISODES): # reset env.reset_pose() terminal = False ep_reward = 0 step = 1 # generate goal env.generate_goal_point() # get_state obs = env.get_laser_observation() obs_stack = deque([obs, obs, obs]) goal = np.asarray(env.get_local_goal()) speed = np.asarray(env.get_self_speed()) state = [obs_stack, goal, speed] # episode 1 while not terminal and not rospy.is_shutdown(): state_list = comm.gather(state, root=0) ## get_action #------------------------------------------------------------------------- # generate actions at rank==0 a, logprob, scaled_action = generate_action( env=env, state_list=state_list, policy=policy, action_bound=action_bound) # execute actions #------------------------------------------------------------------------- real_action = comm.scatter(scaled_action, root=0) ## run action #------------------------------------------------------------------------- env.control_vel(real_action) rospy.sleep(0.001) ## get reward #------------------------------------------------------------------------- # get informtion r, terminal, result = env.get_reward_and_terminate(step) ep_reward += r global_step += 1 #------------------------------------------------------------------------- # get next state #------------------------------------------------------------------------- s_next = env.get_laser_observation() left = obs_stack.popleft() obs_stack.append(s_next) goal_next = np.asarray(env.get_local_goal()) speed_next = np.asarray(env.get_self_speed()) state_next = [obs_stack, goal_next, speed_next] #------------------------------------------------------------------------- r_list = comm.gather(r, root=0) terminal_list = comm.gather(terminal, root=0) state_next_list = comm.gather(state_next, root=0) ## training #------------------------------------------------------------------------- if env.index == 0: # add data in replay_memory #------------------------------------------------------------------------- replay_memory.push(state[0], state[1], state[2], a, logprob, r_list, state_next[0], state_next[1], state_next[2], terminal_list) if len(replay_memory) > BATCH_SIZE: ## update #------------------------------------------------------------------------- update = sac_update_stage(policy=policy, optimizer=optimizer, critic=critic, critic_opt=critic_opt, critic_target=critic_target, batch_size=BATCH_SIZE, memory=replay_memory, epoch=EPOCH, replay_size=REPLAY_SIZE, tau=TAU, alpha=ALPHA, gamma=GAMMA, updates=update, update_interval=UPDATE_INTERVAL, num_step=BATCH_SIZE, num_env=NUM_ENV, frames=LASER_HIST, obs_size=OBS_SIZE, act_size=ACT_SIZE) buff = [] global_update += 1 update = update step += 1 state = state_next ## save policy #-------------------------------------------------------------------------------------------------------------- if env.index == 0: if global_update != 0 and global_update % 1000 == 0: torch.save(policy.state_dict(), policy_path + '/policy_{}'.format(global_update)) torch.save(critic.state_dict(), policy_path + '/critic_{}'.format(global_update)) logger.info( '########################## model saved when update {} times#########' '################'.format(global_update)) distance = np.sqrt((env.goal_point[0] - env.init_pose[0])**2 + (env.goal_point[1] - env.init_pose[1])**2) 
        logger.info('Env %02d, Goal (%05.1f, %05.1f), Episode %05d, step %03d, Reward %-5.1f, Distance %05.1f, %s' % \
                    (env.index, env.goal_point[0], env.goal_point[1], id + 1, step, ep_reward, distance, result))
        logger_cal.info(ep_reward)
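# sac_update_stage receives tau and a separate critic_target, which points to
# Polyak averaging of the critic into its target. A standard soft-update helper
# is sketched below as an assumption about what happens inside that update.
import torch

def soft_update(target, source, tau):
    # target <- tau * source + (1 - tau) * target, applied parameter-wise.
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.mul_(1.0 - tau).add_(tau * s_param.data)

# e.g. soft_update(critic_target, critic, TAU) after each gradient step.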
def run(comm, env, agent, policy_path, args): # Training Loop test_interval = 10 save_interval = 100 total_numsteps = 0 updates = 0 # world reset if env.index == 0: # step env.reset_gazebo_simulation() #Tesnorboard writer = SummaryWriter('test_runs/' + policy_path) # replay_memory memory = ReplayMemory(args.replay_size, args.seed) for i_episode in range(args.num_steps): episode_reward = 0 episode_steps = 0 done = False # Env reset env.set_gazebo_pose() # generate goal env.generate_goal_point_gazebo() # Get initial state frame = env.get_laser_observation() frame_stack = deque([frame, frame, frame]) goal = np.asarray(env.get_local_goal()) speed = np.asarray(env.get_self_speed()) state = [frame_stack, goal, speed] # Episode start while not done and not rospy.is_shutdown(): state_list = comm.gather(state, root=0) if env.index == 0: action = agent.select_action(state_list) else: action = None if env.index == 0: if len(memory) > args.batch_size: # Number of updates per step in environment for i in range(args.updates_per_step): # Update parameters of all the networks critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(memory, args.batch_size, updates) writer.add_scalar('loss/critic_1', critic_1_loss, updates) writer.add_scalar('loss/critic_2', critic_2_loss, updates) writer.add_scalar('loss/policy', policy_loss, updates) writer.add_scalar('loss/entropy_loss', ent_loss, updates) writer.add_scalar('entropy_temprature/alpha', alpha, updates) updates += 1 # Execute actions #------------------------------------------------------------------------- action_clip_bound = [[0, -1], [1, 1]] #### Action maximum, minimum values cliped_action = np.clip(action, a_min=action_clip_bound[0], a_max=action_clip_bound[1]) real_action = comm.scatter(cliped_action, root=0) #if real_action[0] >= 0.02 and real_action[0] <= 0.2: # real_action[0] = real_action[0] / 0.6 if real_action[0] < 0.10 and real_action[0] > 0: real_action[0] = 0.1 if real_action[1] > 0 and real_action[1] < 0.15: real_action[1] = 0.0 elif real_action[1] < 0 and real_action[1] > -0.15: real_action[1] = 0.0 env.control_vel(real_action) #rospy.sleep(0.001) ## Get reward and terminal state reward, done, result = env.get_reward_and_terminate(episode_steps) #print("Action : [{}, {}], Distance : {}, Reward : {}".format(real_action[0], real_action[1], env.distance, reward)) episode_steps += 1 total_numsteps += 1 episode_reward += reward # Get next state next_frame = env.get_laser_observation() left = frame_stack.popleft() frame_stack.append(next_frame) next_goal = np.asarray(env.get_local_goal()) next_speed = np.asarray(env.get_self_speed()) next_state = [frame_stack, next_goal, next_speed] r_list = comm.gather(reward, root=0) done_list = comm.gather(done, root=0) next_state_list = comm.gather(next_state, root=0) if env.index == 0: #meomry.list_push(state_list, action, r_list, next_state_list, done_list) for i in range(np.asarray(state_list).shape[0]): memory.push(state_list[i][0], state_list[i][1], state_list[i][2], action[i], r_list[i], next_state_list[i][0], next_state_list[i][1], next_state_list[i][2], done_list[i]) # Append transition to memory state = next_state #if total_numsteps > args.num_steps: # break if env.index == 0: writer.add_scalar('reward/train', episode_reward, i_episode) if env.index == 0: #if global_update != 0 and global_update % 20 == 0: if i_episode != 0 and i_episode % save_interval == 0: torch.save(agent.policy.state_dict(), policy_path + '/policy_epi_{}'.format(i_episode)) 
                print('########################## policy model saved at episode {} #########'
                      '################'.format(i_episode))
                torch.save(agent.critic_1.state_dict(),
                           policy_path + '/critic_1_epi_{}'.format(i_episode))
                print('########################## critic_1 model saved at episode {} #########'
                      '################'.format(i_episode))
                torch.save(agent.critic_2.state_dict(),
                           policy_path + '/critic_2_epi_{}'.format(i_episode))
                print('########################## critic_2 model saved at episode {} #########'
                      '################'.format(i_episode))

        distance = np.sqrt((env.goal_point[0] - env.init_pose[0])**2 +
                           (env.goal_point[1] - env.init_pose[1])**2)

        print("Env: {}, memory_size: {}, Goal: ({} , {}), Episode: {}, steps: {}, Reward: {}, Distance: {}, {}".format(
            env.index, len(memory), round(env.goal_point[0], 2), round(env.goal_point[1], 2),
            i_episode + 1, episode_steps, round(episode_reward, 2), round(distance, 2), result))
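# The clipping and minimum-speed/deadband logic applied to real_action in the
# loop above can be read as a single post-processing step. A refactoring sketch
# with the same thresholds; the function name is hypothetical.
import numpy as np

def postprocess_action(raw_action,
                       low=(0.0, -1.0), high=(1.0, 1.0),
                       min_linear=0.10, angular_deadband=0.15):
    """Clip the raw policy output, then floor small forward speeds and zero out
    near-zero angular velocities, mirroring the in-loop adjustments above."""
    v, w = np.clip(raw_action, low, high)
    if 0.0 < v < min_linear:
        v = min_linear
    if abs(w) < angular_deadband:
        w = 0.0
    return np.array([v, w])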
class Agent(BaseAgent): '''Deep Trading Agent based on Deep Q Learning''' '''TODO: 1. add summary ops 2. timing and logging 3. model saving 4. increment self.step ''' def __init__(self, sess, logger, config, env): super(Agent, self).__init__(config) self.sess = sess self.logger = logger self.config = config params = DeepSenseParams(config) self.env = env self.history = History(logger, config) self.replay_memory = ReplayMemory(logger, config) with tf.variable_scope(STEPS): self.step_op = tf.Variable(0, trainable=False, name=STEP) self.step_input = tf.placeholder('int32', None, name=STEP_INPUT) self.step_assign_op = self.step_op.assign(self.step_input) self.build_dqn(params) def train(self): start_step = self.step_op.eval() num_episodes, self.update_count, ep_reward = 0, 0, 0. total_reward, self.total_loss, self.total_q = 0., 0., 0. max_avg_ep_reward = 0 ep_rewards, actions = [], [] self.env.new_random_episode(self.history) for self.step in range(start_step, self.max_step): if self.step == self.learn_start: num_episodes, self.update_count, ep_reward = 0, 0, 0. total_reward, self.total_loss, self.total_q = 0., 0., 0. ep_rewards, actions = [], [] # 1. predict action = self.predict(self.history.get()) # 2. act screen, reward, terminal = self.env.act(action) # 3. observe self.observe(screen, reward, action, terminal) if terminal: self.env.new_random_episode(self.history) num_episodes += 1 ep_rewards.append(ep_reward) ep_reward = 0. else: ep_reward += reward actions.append(action) total_reward += reward def predict(self, s_t, test_ep=None): ep = test_ep or (self.ep_end + max(0., (self.ep_start - self.ep_end) \ * (self.ep_end_t - max(0., self.step - self.learn_start)) / self.ep_end_t)) if random.random() < ep: action = random.randrange(self.env.action_size) else: action = self.q.action.eval({self.s_t: [s_t]})[0] return action def observe(self, screen, reward, action, terminal): #clip reward in the range min to max reward = max(self.min_reward, min(self.max_reward, reward)) self.history.add(screen) self.replay_memory.add(screen, reward, action, terminal) if self.step > self.learn_start: if self.step % self.train_frequency == 0: self.q_learning_mini_batch() if self.step % self.target_q_update_step == self.target_q_update_step - 1: self.update_target_network() def q_learning_mini_batch(self): if self.replay_memory.count >= self.replay_memory.history_length: s_t, action, reward, s_t_plus_1, terminal = self.replay_memory.sample( ) max_q_t_plus_1 = self.t_q.action.eval({self.t_s_t: s_t_plus_1}) terminal = np.array(terminal) + 0. 
target_q = reward + (1 - terminal) * max_q_t_plus_1 _, q_t, loss = self.sess.run( [self.optimizer, self.q.values, self.loss], { self.target_q: target_q, self.action: action, self.s_t: s_t, self.learning_rate_step: self.step, }) self.total_loss += loss self.total_q += q_t.mean() self.update_count += 1 def build_dqn(self, params): with tf.variable_scope(PREDICTION): self.s_t = tf.placeholder(dtype=tf.float32, shape=[ None, self.replay_memory.history_length, self.replay_memory.num_channels ]) self.q = DeepSense(params, self.logger, self.sess, self.config, name=Q_NETWORK) self.q.build_model(self.s_t) with tf.variable_scope(TARGET): self.t_s_t = tf.placeholder(dtype=tf.float32, shape=[ None, self.replay_memory.history_length, self.replay_memory.num_channels ]) self.t_q = DeepSense(params, self.logger, self.sess, self.config, name=T_Q_NETWORK) self.t_q.build_model(self.t_s_t, train=False) with tf.variable_scope(UPDATE_TARGET_NETWORK): self.q_weights_placeholders = {} self.t_weights_assign_ops = {} for name in self.q.weights.keys(): self.q_weights_placeholders[name] = tf.placeholder( tf.float32, self.q.weights[name].get_shape().as_list()) for name in self.q.weights.keys(): self.t_weights_assign_ops[name] = self.t_q.weights[ name].assign(self.q_weights_placeholders[name]) with tf.variable_scope(TRAINING): self.target_q = tf.placeholder(tf.float32, [None], name=TARGET_Q) self.action = tf.placeholder(tf.int64, [None], name=ACTION) action_one_hot = tf.one_hot(self.action, self.env.action_size, 1.0, 0.0, name=ACTION_ONE_HOT) q_acted = tf.reduce_sum(self.q.values * action_one_hot, reduction_indices=1, name=Q_ACTED) with tf.variable_scope(LOSS): self.delta = self.target_q - q_acted self.global_step = tf.Variable(0, trainable=False) self.loss = tf.reduce_mean(clipped_error(self.delta), name=LOSS) with tf.variable_scope(OPTIMIZER): self.learning_rate_step = tf.placeholder( tf.int64, None, name=LEARNING_RATE_STEP) self.learning_rate_op = tf.maximum( self.learning_rate_minimum, tf.train.exponential_decay(self.learning_rate, self.learning_rate_step, self.learning_rate_decay_step, self.learning_rate_decay, staircase=True)) self.optimizer = tf.train.RMSPropOptimizer( self.learning_rate_op, momentum=0.95, epsilon=0.01).minimize(self.loss) # tf.initialize_all_variables().run() #initialize the q network and the target network with the same weights # self.update_target_network() def update_target_network(self): for name in self.q.weights.keys(): self.t_weights_assign_ops[name].eval({ self.q_weights_placeholders[name]: self.q.weights[name].eval() })
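# The TD target in q_learning_mini_batch masks out the bootstrap term for
# terminal transitions via (1 - terminal). A small NumPy illustration of that
# line; the values are made up, and note that, as written, this version applies
# no explicit discount factor.
import numpy as np

reward = np.array([1.0, -0.5])
terminal = np.array([False, True]) + 0.          # -> [0., 1.], as in the code above
max_q_t_plus_1 = np.array([2.0, 3.0])

target_q = reward + (1 - terminal) * max_q_t_plus_1
print(target_q)  # [ 3.  -0.5]: terminal transitions keep only the immediate reward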