def run():
    policy_net = DQN(num_channels, 19).cuda()
    target_net = DQN(num_channels, 19).cuda()
    optimizer = optim.Adam(policy_net.parameters(), LR)
    memory = Memory(50000)
    env = gym.make(ENV_NAME)
    env.make_interactive(port=6666, realtime=False)
    max_epi = 100
    n_step = 2
    update_period = 10
    gamma = 0.99
    total_steps = 0
    epsilon = 0.95
    endEpsilon = 0.01
    stepDrop = (epsilon - endEpsilon) / max_epi

    for num_epi in range(max_epi):
        obs = env.reset()
        state = converter(ENV_NAME, obs).cuda()
        state = state.float()
        done = False
        total_reward = 0
        steps = 0
        if epsilon > endEpsilon:
            epsilon -= stepDrop
        while not done:
            steps += 1
            total_steps += 1
            a_out = policy_net.sample_action(state, epsilon)
            action_index = a_out
            action = make_19action(env, action_index)
            obs_prime, reward, done, info = env.step(action)
            total_reward += reward
            if done:
                print("%d episode is done" % num_epi)
                print("total rewards : %d " % total_reward)
                writer.add_scalar('Rewards/train', total_reward, num_epi)
                break
            state_prime = converter(ENV_NAME, obs_prime).cuda()
            append_sample(memory, policy_net, target_net, state, action_index, reward, state_prime, done)
            state = state_prime
            if memory.size() > 1000:
                update_network(policy_net, target_net, memory, 2, optimizer, total_steps)
                if total_steps % 2000 == 0:
                    update_target(policy_net, target_net)
def main():
    policy_net = DQN(num_channels=num_channels, num_actions=19).to(device=device)
    target_net = DQN(num_channels=num_channels, num_actions=19).to(device=device)
    target_net.load_state_dict(policy_net.state_dict())
    memory = Memory(50000)
    optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate, weight_decay=1e-5)
    print("pre_train start")
    model_name = 'pre_trained_dqn'
    pre_train(env_name, memory, policy_net, target_net, optimizer)
    print("pre_train finished")
def train_cnn(env, policy, train_policy, args):
    """Collect episodes with the given policy and train the CNN encoder.

    Args:
        env: gym environment to interact with
        policy: policy used to select actions
        train_policy: trainer whose train_cnn method updates the encoder
        args: parsed command-line arguments (sizes, seeds, paths, ...)
    """
    size = args.size
    obs_shape = (args.history_length, size, size)
    action_shape = (args.action_dim, )
    memory = Memory((84, 84, 3), int(args.buffer_size), args.device)
    replay_buffer = ReplayBuffer(obs_shape, action_shape, int(args.buffer_size), args.image_pad, args.device)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)
    total_timesteps = 0
    done_counter = deque(maxlen=100)
    scores_window = deque(maxlen=100)
    t0 = time.time()
    pathname = str(args.locexp) + "/" + str(args.env_name)
    pathname += "_batch_size_" + str(args.batch_size) + "_lr_encoder_" + str(args.lr_encoder)
    tensorboard_name = str(args.locexp) + '/runs/' + pathname
    writer = SummaryWriter(tensorboard_name)
    for i_episode in range(int(args.episodes)):
        obs = env.reset()
        done = False
        episode_reward = 0
        for step in range(args.max_episode_steps):
            action = policy.select_action(np.array(obs))
            new_obs, reward, done, image = env.step(action)
            memory.add(image)
            episode_reward += reward
            # frame = cv2.imwrite("im{}.png".format(step), np.array(image))
            done_bool = 0 if step + 1 == args.max_episode_steps else float(done)
            total_timesteps += 1
            obs = new_obs
            if step == 49:
                done = True
            if done:
                memory.create_states(replay_buffer)
                if total_timesteps != 0:
                    if step < 50:
                        done_counter.append(1)
                    else:
                        done_counter.append(0)
                    goals = sum(done_counter)
                    scores_window.append(episode_reward)
                    text = "Total Timesteps: {} Episode Num: {} ".format(total_timesteps, i_episode)
                    text += "Episode steps {} ".format(step)
                    text += "Goal last 100 ep : {} ".format(goals)
                    text += "Reward: {:.2f} Average Re: {:.2f} Time: {}".format(
                        episode_reward, np.mean(scores_window), time_format(time.time() - t0))
                    print(text)
                break
            if total_timesteps > args.start_opt:
                train_policy.train_cnn(replay_buffer, policy, writer)
def main():
    ''' Create the environment '''
    env = gym.make(ENV_NAME)

    # For tensorboard
    writer = tf.summary.FileWriter("./tensorboard")

    assert STATE_DIM == np.prod(np.array(env.observation_space.shape))
    assert ACTION_DIM == np.prod(np.array(env.action_space.shape))

    env.seed(0)
    np.random.seed(0)

    ''' Create the replay memory '''
    replay_memory = Memory(REPLAY_MEM_CAPACITY)

    # Tensorflow part starts here!
    tf.reset_default_graph()

    ''' Create placeholders '''
    state_placeholder = tf.placeholder(dtype=tf.float32, shape=[None, STATE_DIM],
                                       name='state_placeholder')
    action_placeholder = tf.placeholder(dtype=tf.float32, shape=[None, ACTION_DIM],
                                        name='action_placeholder')
    reward_placeholder = tf.placeholder(dtype=tf.float32, shape=[None],
                                        name='reward_placeholder')
    next_state_placeholder = tf.placeholder(dtype=tf.float32, shape=[None, STATE_DIM],
                                            name='next_state_placeholder')
    is_not_terminal_placeholder = tf.placeholder(dtype=tf.float32, shape=[None],
                                                 name='is_not_terminal_placeholder')
    is_training_placeholder = tf.placeholder(dtype=tf.float32, shape=(),
                                             name='is_training_placeholder')

    ''' A counter to count the number of episodes '''
    episodes = tf.Variable(0.0, trainable=False, name='episodes')
    episode_incr_op = episodes.assign_add(1)

    '''
    Create the actor network inside the actor scope and calculate actions
    '''
    with tf.variable_scope('actor'):
        actor = ActorNetwork(STATE_DIM, ACTION_DIM,
                             HIDDEN_1_ACTOR, HIDDEN_2_ACTOR, HIDDEN_3_ACTOR,
                             trainable=True)
        unscaled_actions = actor.call(state_placeholder)
        ''' Scale the actions to fit within the bounds provided by the environment '''
        actions = scale_actions(unscaled_actions, env.action_space.low, env.action_space.high)

    '''
    Create the target actor network inside the target_actor scope and calculate
    the target actions. Apply stop_gradient to the target actions so that their
    gradient is not computed at any point of time.
    '''
    with tf.variable_scope('target_actor', reuse=False):
        target_actor = ActorNetwork(STATE_DIM, ACTION_DIM,
                                    HIDDEN_1_ACTOR, HIDDEN_2_ACTOR, HIDDEN_3_ACTOR,
                                    trainable=True)
        unscaled_target_actions = target_actor.call(next_state_placeholder)
        ''' Scale the actions to fit within the bounds provided by the environment '''
        target_actions_temp = scale_actions(unscaled_target_actions,
                                            env.action_space.low,
                                            env.action_space.high)
        target_actions = tf.stop_gradient(target_actions_temp)

    '''
    Create the critic network inside the critic variable scope. Get the
    Q-values of the given actions and the Q-values of the actions suggested
    by the actor network.
    '''
    with tf.variable_scope('critic'):
        critic = CriticNetwork(STATE_DIM, ACTION_DIM,
                               HIDDEN_1_CRITIC, HIDDEN_2_CRITIC, HIDDEN_3_CRITIC,
                               trainable=True)
        q_values_of_given_actions = critic.call(state_placeholder, action_placeholder)
        q_values_of_suggested_actions = critic.call(state_placeholder, actions)

    '''
    Create the target critic network inside the target_critic variable scope.
    Calculate the target Q-values and apply stop_gradient to them.
    '''
    with tf.variable_scope('target_critic', reuse=False):
        target_critic = CriticNetwork(STATE_DIM, ACTION_DIM,
                                      HIDDEN_1_CRITIC, HIDDEN_2_CRITIC, HIDDEN_3_CRITIC,
                                      trainable=True)
        target_q_values_temp = target_critic.call(next_state_placeholder, target_actions)
        target_q_values = tf.stop_gradient(target_q_values_temp)

    '''
    Collect
    - the trainable variables of the actor (weights of the actor network),
    - the weights of the target actor network,
    - the trainable variables of the critic (weights of the critic network),
    - the weights of the target critic network.
    '''
    actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor')
    target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_actor')
    critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='critic')
    target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_critic')

    '''
    Get the operators for updating the target networks. The
    update_target_networks function defined in utils returns a list of
    operators to be run from the tf session in order to update the target
    networks using a soft update.
    '''
    update_targets_op = update_target_networks(TAU,
                                               target_actor_vars, actor_vars,
                                               target_critic_vars, critic_vars)

    '''
    Create the tf operation to train the critic network:
    - calculate the TD-target
    - calculate the TD-error = TD-target - q_values_of_given_actions
    - calculate the critic network's loss (mean squared error of the TD-errors
      plus L2 regularization of the non-bias weights)
    - create a tf operation to train the critic network
    '''
    targets = tf.expand_dims(reward_placeholder, 1) + \
        tf.expand_dims(is_not_terminal_placeholder, 1) * GAMMA * target_q_values
    td_errors = targets - q_values_of_given_actions
    critic_loss = tf.reduce_mean(tf.square(td_errors))

    # Add L2 regularization on the critic weights before minimizing the loss
    for var in critic_vars:
        if 'bias' not in var.name:
            critic_loss += L2_REG_CRITIC * 0.5 * tf.nn.l2_loss(var)

    # Optimize critic
    critic_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_CRITIC * LR_DECAY ** episodes).minimize(critic_loss)

    '''
    Create a tf operation to train the actor network:
    - calculate the actor network's loss
    - create the tf operation to train the actor network
    '''
    # Actor's loss
    actor_loss = -1 * tf.reduce_mean(q_values_of_suggested_actions)
    for var in actor_vars:
        if 'bias' not in var.name:
            actor_loss += L2_REG_ACTOR * 0.5 * tf.nn.l2_loss(var)

    # Optimize actor
    actor_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_ACTOR * LR_DECAY ** episodes).minimize(actor_loss, var_list=actor_vars)

    # Init session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    writer.add_graph(sess.graph)

    # Training
    num_steps = 0
    for episode in range(NUM_EPISODES):
        total_reward = 0
        num_steps_in_episode = 0

        # Create noise
        noise = np.zeros(ACTION_DIM)
        noise_scale = (INITIAL_NOISE_SCALE * NOISE_DECAY ** episode) * \
            (env.action_space.high - env.action_space.low)

        # Initial state
        state = env.reset()

        for _ in range(MAX_STEPS_PER_EPISODE):
            action = sess.run(actions, feed_dict={
                state_placeholder: state[None],
                is_training_placeholder: False})

            # Add noise to the actions
            noise = EXPLORATION_THETA * (EXPLORATION_MU - noise) + \
                EXPLORATION_SIGMA * np.random.randn(ACTION_DIM)
            action += noise_scale * noise

            # Take action on env
            next_state, reward, done, _info = env.step(action)
            next_state = np.squeeze(next_state)
            reward = np.squeeze(reward)
            action = action[0]
            total_reward += reward

            replay_memory.add_to_memory(
                (state, action, reward, next_state, 0.0 if done else 1.0))

            if num_steps % TRAIN_EVERY == 0 and replay_memory.size() >= MINI_BATCH_SIZE:
                batch = replay_memory.sample_from_memory(MINI_BATCH_SIZE)
                _, _ = sess.run([critic_train_op, actor_train_op], feed_dict={
                    state_placeholder: np.asarray([elem[0] for elem in batch]),
                    action_placeholder: np.asarray([elem[1] for elem in batch]),
                    reward_placeholder: np.asarray([elem[2] for elem in batch]),
                    next_state_placeholder: np.asarray([elem[3] for elem in batch]),
                    is_not_terminal_placeholder: np.asarray([elem[4] for elem in batch]),
                    is_training_placeholder: True})
                _ = sess.run(update_targets_op)

            state = next_state
            num_steps += 1
            num_steps_in_episode += 1

            if done:
                _ = sess.run(episode_incr_op)
                break

        print(str((episode, total_reward, num_steps_in_episode, noise_scale)))

    env.close()
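# The update_target_networks helper used above comes from the project's utils module,
# which is not shown here. As the comment says, it returns a list of ops implementing
# a soft (Polyak) update, target <- tau * online + (1 - tau) * target. The sketch
# below is one plausible implementation under that assumption, not the original code.
def update_target_networks(tau, target_actor_vars, actor_vars,
                           target_critic_vars, critic_vars):
    ops = []
    # pair each target variable with its online counterpart and blend them
    for t_var, var in zip(target_actor_vars + target_critic_vars,
                          actor_vars + critic_vars):
        ops.append(tf.assign(t_var, tau * var + (1.0 - tau) * t_var))
    return ops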
env = gym.make(ENV_NAME)
env.seed(1)
env = env.unwrapped

# Get state and action dimension
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

# Initialize actor, critic and target networks
actor = ActorNetwork(action_dim=action_dim)
critic = CriticNetwork()
target_mu = TargetNetMu(actor)
target_q = TargetNetQ(critic)

# Initialize buffer
memory = Memory(capacity=buffer_size, dims=2 * state_dim + action_dim + 1)

# Total loss for critic
total_critic_loss = 0
total_transition_trained_on = 0

# Outer iteration
for m in range(M):
    # Receive initial observation
    s = env.reset()
    explore_variance = 2  # initial exploration variance
    s = nd.array(s).reshape((1, -1))
    # print(s)
class DDPG:
    def __init__(self, sess, env, FLAGS, rl_mode):
        self.FLAGS = FLAGS
        self.rl_mode = rl_mode
        self.p_dic = getattr(conf.dic.path_dic, self.FLAGS.env_name)
        self.s_dim = env.observation_space.shape[0]
        self.a_dim = env.action_space.shape[0]
        self.sess = sess
        self._build_graph()
        self.state_translate = env.observation_space.low
        self.state_scale = env.observation_space.high - env.observation_space.low + 1e-5
        self.action_translate = env.action_space.low
        self.action_scale = env.action_space.high - env.action_space.low + 1e-5
        if self.rl_mode:
            self.memory = Memory(self.FLAGS.replayBuffer_size,
                                 dims=2 * self.s_dim + self.a_dim + 1)

    def _build_graph(self):
        self._placeholders()
        self._actor_critic()
        self._loss_train_op()
        self.score = tf.Variable(0., trainable=False, dtype=tf.float32, name='score')
        self.score_summary = tf.summary.scalar('score', self.score)
        self.sess.run(tf.global_variables_initializer())
        self.writer = tf.summary.FileWriter(self.p_dic.get('agent_log_dir'))
        self.writer.add_graph(self.sess.graph)
        self.saver = tf.train.Saver(max_to_keep=50, keep_checkpoint_every_n_hours=1)

    def _placeholders(self):
        with tf.name_scope('inputs'):
            self.current_state = tf.placeholder(tf.float32, shape=[None, self.s_dim], name='s')
            self.reward = tf.placeholder(tf.float32, [None, 1], name='r')
            self.next_state = tf.placeholder(tf.float32, shape=[None, self.s_dim], name='s_')
            self.is_training = tf.placeholder(tf.bool, name='is_training')

    def _actor_critic(self):
        self.actor = build_actor(self.current_state, self.a_dim, self.is_training)
        self.actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'Actor')
        actor_ema = tf.train.ExponentialMovingAverage(decay=1 - self.FLAGS.tau)
        self.update_targetActor = actor_ema.apply(self.actor_vars)
        self.targetActor = build_actor(self.next_state, self.a_dim, False,
                                       reuse=True, getter=get_getter(actor_ema))
        self.critic = build_critic(self.current_state, self.actor, self.is_training)
        self.critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'Critic')
        critic_ema = tf.train.ExponentialMovingAverage(decay=1 - self.FLAGS.tau)
        self.update_targetCritic = critic_ema.apply(self.critic_vars)
        self.targetCritic = build_critic(self.next_state, self.targetActor, False,
                                         reuse=True, getter=get_getter(critic_ema))

    def _loss_train_op(self):
        max_grad = 2
        with tf.variable_scope('target_q'):
            self.target_q = self.reward + self.FLAGS.gamma * self.targetCritic
        with tf.variable_scope('TD_error'):
            self.critic_loss = tf.squared_difference(self.target_q, self.critic)
        with tf.variable_scope('critic_grads'):
            self.critic_grads = tf.gradients(ys=self.critic_loss, xs=self.critic_vars)
            for ix, grad in enumerate(self.critic_grads):
                self.critic_grads[ix] = grad / self.FLAGS.batch_size
        with tf.variable_scope('C_train'):
            critic_optimizer = tf.train.AdamOptimizer(self.FLAGS.critic_lr, epsilon=1e-5)
            self.train_critic = critic_optimizer.apply_gradients(
                zip(self.critic_grads, self.critic_vars))
        with tf.variable_scope('a_grad'):
            self.a_grads = tf.gradients(self.critic, self.actor)[0]
        with tf.variable_scope('actor_grads'):
            self.actor_grads = tf.gradients(ys=self.actor, xs=self.actor_vars,
                                            grad_ys=self.a_grads)
            for ix, grad in enumerate(self.actor_grads):
                self.actor_grads[ix] = tf.clip_by_norm(grad / self.FLAGS.batch_size, max_grad)
        with tf.variable_scope('A_train'):
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                # negative learning rate: apply_gradients descends, so this ascends
                # the critic's Q-value w.r.t. the actor parameters
                actor_optimizer = tf.train.AdamOptimizer(-self.FLAGS.actor_lr, epsilon=1e-5)
                self.train_actor = actor_optimizer.apply_gradients(
                    zip(self.actor_grads, self.actor_vars))

    def choose_action(self, state):
        return self.sess.run(self.actor, feed_dict={
            self.current_state: state,
            self.is_training: False
        })

    def train(self, episode=None, ep_reward=None):
        batch_memory = self.memory.sample(self.FLAGS.batch_size)
        batch_s = batch_memory[:, :self.s_dim]
        batch_a = batch_memory[:, self.s_dim:self.s_dim + self.a_dim]
        batch_r = batch_memory[:, -self.s_dim - 1:-self.s_dim]
        batch_s_ = batch_memory[:, -self.s_dim:]
        if episode is None:
            critic_feed_dict = {
                self.current_state: batch_s,
                self.actor: batch_a,
                self.reward: batch_r,
                self.next_state: batch_s_,
                self.is_training: True
            }
            self.sess.run([self.train_critic, self.update_targetCritic],
                          feed_dict=critic_feed_dict)
            actor_feed_dict = {
                self.current_state: batch_s,
                self.next_state: batch_s_,
                self.is_training: True
            }
            self.sess.run([self.train_actor, self.update_targetActor],
                          feed_dict=actor_feed_dict)
        else:
            update_score = self.score.assign(
                tf.convert_to_tensor(ep_reward, dtype=tf.float32))
            with tf.control_dependencies([update_score]):
                merged_score = tf.summary.merge([self.score_summary])
            self.critic_summary = tf.summary.merge_all(scope='Critic')
            self.actor_summary = tf.summary.merge_all(scope='Actor')
            critic_feed_dict = {
                self.current_state: batch_s,
                self.actor: batch_a,
                self.reward: batch_r,
                self.next_state: batch_s_,
                self.is_training: True
            }
            _, _, critic = self.sess.run(
                [self.train_critic, self.update_targetCritic, self.critic_summary],
                feed_dict=critic_feed_dict)
            self.writer.add_summary(critic, episode)
            actor_feed_dict = {
                self.current_state: batch_s,
                self.next_state: batch_s_,
                self.is_training: True
            }
            merged = tf.summary.merge([merged_score, self.actor_summary])
            _, _, actor = self.sess.run(
                [self.train_actor, self.update_targetActor, merged],
                feed_dict=actor_feed_dict)
            self.writer.add_summary(actor, episode)
            self.saver.save(
                self.sess,
                self.p_dic.get('agent_log_dir') + '/' +
                datetime.datetime.now().strftime('%y%m%d-%H:%M:%S') +
                '_EP' + str(episode) + '.ckpt')

    def perceive(self, state, action, reward, next_state, episode=None, ep_reward=None):
        self.memory.store_transition(state, action, reward, next_state)
        if self.memory.pointer > self.FLAGS.replayBuffer_size:
            self.train(episode, ep_reward)

    def load(self):
        self.saver.restore(
            self.sess,
            tf.train.latest_checkpoint(self.p_dic.get('agent_log_dir')))

    def act(self, obs):
        actor_feed_dict = {self.current_state: obs, self.is_training: False}
        action = self.sess.run(self.actor, feed_dict=actor_feed_dict)
        act_low = np.array([16., 16., 16., 16., 7.36, 16., 16., 16., 16., 6.57],
                           dtype=np.float32)
        act_high = np.array([30., 30., 30., 30., 7.36, 30., 30., 30., 30., 6.57],
                            dtype=np.float32)
        return action
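# A minimal, hypothetical driver loop for the DDPG class above, showing how its
# public methods (choose_action, perceive) fit together. The environment name is
# taken from FLAGS as in the class itself; the episode/step counts, the Gaussian
# exploration noise, and run_ddpg itself are illustrative assumptions, not part
# of the original code.
import gym
import numpy as np
import tensorflow as tf

def run_ddpg(FLAGS, episodes=200, max_steps=200, noise_std=0.1):
    env = gym.make(FLAGS.env_name)
    agent = DDPG(tf.Session(), env, FLAGS, rl_mode=True)
    for ep in range(episodes):
        state = env.reset()
        ep_reward = 0.0
        for _ in range(max_steps):
            # choose_action expects a batch dimension
            action = agent.choose_action(state[np.newaxis, :])[0]
            # simple Gaussian exploration noise, clipped to the env's bounds
            action = np.clip(action + np.random.normal(0.0, noise_std, agent.a_dim),
                             env.action_space.low, env.action_space.high)
            next_state, reward, done, _ = env.step(action)
            # perceive() stores the transition and starts training once the buffer is full
            agent.perceive(state, action, reward, next_state)
            state = next_state
            ep_reward += reward
            if done:
                break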
class DeepQNetwork:
    def __init__(
            self,
            n_actions,
            n_features,
            sess,
            agent_id,
            num_training,
            learning_rate=0.01,
            reward_decay=0.9,
            replace_target_iter=300,
            memory_size=500,
            batch_size=32,
            save_model_freq=100,
            max_epsilon=1,
            min_epsilon=0,
            load_model=False,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.sess = sess
        self.agent_id = agent_id
        self.num_training = num_training
        self.lr = learning_rate
        self.gamma = reward_decay
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.save_model_freq = save_model_freq
        self.max_epsilon = max_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon = self.max_epsilon
        self.load_model = load_model

        # total learning step
        self.learn_step_counter = 0
        self.episode_rew_agent = 0
        self.episode_rew_all = 0
        self.episode = 0

        # initialize prioritized replay memory holding [s, a, r, s_]
        self.memory = Memory(capacity=memory_size)

        # consists of [target_net, evaluate_net]
        self._build_net()
        t_params = tf.get_collection('target_net_params')
        e_params = tf.get_collection('eval_net_params')
        self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

        self.cost_his = []

        if self.load_model:
            saver = tf.train.Saver(max_to_keep=100000000)
            model_load_steps = 420000
            model_file_load = os.path.join(
                "models/", "agent_No_" + str(self.agent_id) + "/",
                str(model_load_steps) + "_" + "model_segment_training/", "8m")
            saver.restore(self.sess, model_file_load)
            print("Model trained for %s steps for agent %s has been loaded"
                  % (model_load_steps, self.agent_id))
        else:
            self.sess, self.saver, self.summary_placeholders, self.update_ops, \
                self.summary_op, self.summary_writer, self.summary_vars = self.init_sess()

    # Complete the initialization of the session, summaries and saver
    def init_sess(self):
        # Summary for tensorboard
        summary_placeholders, update_ops, summary_op, summary_vars = self.setup_summary()
        fileWritePath = os.path.join("logs/", "agent_No_" + str(self.agent_id) + "/")
        summary_writer = tf.summary.FileWriter(fileWritePath, self.sess.graph)
        self.sess.run(tf.global_variables_initializer())
        # Saver used to store checkpoints during training
        saver = tf.train.Saver(max_to_keep=100000000)
        return self.sess, saver, summary_placeholders, update_ops, summary_op, summary_writer, summary_vars

    def _build_net(self):
        # ------------------ build evaluate_net ------------------
        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input
        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')  # for calculating loss
        self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights')
        with tf.variable_scope('eval_net'):
            # c_names (collections_names) are the collections used to store variables;
            # two fully connected hidden layers of n_l1 and n_l2 units
            c_names, n_l1, n_l2, w_initializer, b_initializer = \
                ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 256, 256, \
                tf.contrib.layers.xavier_initializer(), tf.contrib.layers.xavier_initializer()
            # tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)

            # first layer. collections is used later when assigning to target net
            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(self.s, w1) + b1)

            # second layer. collections is used later when assigning to target net
            with tf.variable_scope('l2'):
                w2 = tf.get_variable('w2', [n_l1, n_l2], initializer=w_initializer, collections=c_names)
                b2 = tf.get_variable('b2', [1, n_l2], initializer=b_initializer, collections=c_names)
                l2 = tf.nn.relu(tf.matmul(l1, w2) + b2)

            # third layer. collections is used later when assigning to target net
            with tf.variable_scope('l3'):
                w3 = tf.get_variable('w3', [n_l2, self.n_actions], initializer=w_initializer, collections=c_names)
                b3 = tf.get_variable('b3', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                self.q_eval = tf.matmul(l2, w3) + b3

        with tf.variable_scope('loss'):
            self.abs_errors = tf.reduce_sum(tf.abs(self.q_target - self.q_eval), axis=1)
            self.loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(self.q_target, self.q_eval))
            # tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
        with tf.variable_scope('train'):
            # self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
            self._train_op = tf.train.AdamOptimizer(self.lr, epsilon=1e-02).minimize(self.loss)

        # ------------------ build target_net ------------------
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')  # input
        with tf.variable_scope('target_net'):
            # c_names (collections_names) are the collections used to store variables
            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]

            # first layer. collections is used later when assigning to target net
            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(self.s_, w1) + b1)

            # second layer. collections is used later when assigning to target net
            with tf.variable_scope('l2'):
                w2 = tf.get_variable('w2', [n_l1, n_l2], initializer=w_initializer, collections=c_names)
                b2 = tf.get_variable('b2', [1, n_l2], initializer=b_initializer, collections=c_names)
                l2 = tf.nn.relu(tf.matmul(l1, w2) + b2)

            # third layer. collections is used later when assigning to target net
            with tf.variable_scope('l3'):
                w3 = tf.get_variable('w3', [n_l2, self.n_actions], initializer=w_initializer, collections=c_names)
                b3 = tf.get_variable('b3', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                self.q_next = tf.matmul(l2, w3) + b3

    def store_transition(self, s, a, r, s_):
        # stacked horizontally, so one transition is a single row
        transition = np.hstack((s, [a, r], s_))
        self.memory.store(transition)

    def choose_action(self, observation):
        # add the batch dimension before feeding into the tf placeholder
        observation = observation[np.newaxis, :]
        if not self.load_model:
            if np.random.uniform() < self.epsilon:
                # explore: pick a random action
                action = np.random.randint(0, self.n_actions)
            else:
                # exploit: forward the observation and get a q value for every action
                actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
                action = np.argmax(actions_value)
        else:
            actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
            action = np.argmax(actions_value)
        return action

    def learn(self):
        # check to replace target parameters
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.sess.run(self.replace_target_op)
            # print('\ntarget_params_replaced\n')

        # sample a batch from the prioritized replay memory
        tree_idx, batch_memory, ISWeights = self.memory.sample(self.batch_size)

        q_next, q_eval = self.sess.run(
            [self.q_next, self.q_eval],
            feed_dict={
                self.s_: batch_memory[:, -self.n_features:],  # fixed params
                self.s: batch_memory[:, :self.n_features],    # newest params
            })

        # change q_target w.r.t q_eval's action
        q_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, self.n_features].astype(int)
        # for each row of the batch, column n_features + 1 holds the reward
        reward = batch_memory[:, self.n_features + 1]
        q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)

        """
        For example, in this batch I have 2 samples and 3 actions:
        q_eval =
        [[1, 2, 3],
         [4, 5, 6]]
        q_target = q_eval =
        [[1, 2, 3],
         [4, 5, 6]]
        Then change q_target with the real q_target value w.r.t the q_eval's action.
        For example:
        sample 0, I took action 0, and the target q value is -1;
        sample 1, I took action 2, and the target q value is -2:
        q_target =
        [[-1, 2, 3],
         [4, 5, -2]]
        So (q_target - q_eval) becomes:
        [[(-1)-(1), 0, 0],
         [0, 0, (-2)-(6)]]
        We then backpropagate this error w.r.t the corresponding action to the network,
        leaving the error of the other actions at 0 because we didn't choose them.
        """
        _, abs_errors, self.cost = self.sess.run(
            [self._train_op, self.abs_errors, self.loss],
            feed_dict={self.s: batch_memory[:, :self.n_features],
                       self.q_target: q_target,
                       self.ISWeights: ISWeights})
        self.memory.batch_update(tree_idx, abs_errors)

        self.cost_his.append(self.cost)
        self.learn_step_counter += 1
        self.plotting()

        # decrease epsilon linearly towards min_epsilon
        if self.epsilon > self.min_epsilon:
            self.epsilon -= self.max_epsilon / self.num_training
        else:
            self.epsilon = self.min_epsilon

        if self.learn_step_counter % self.save_model_freq == 0:
            model_file_save = os.path.join(
                "models/", "agent_No_" + str(self.agent_id) + "/",
                str(self.learn_step_counter) + "_" + "model_segment_training/", "8m")
            dirname = os.path.dirname(model_file_save)
            if dirname:
                os.makedirs(dirname, exist_ok=True)
            self.saver.save(self.sess, model_file_save)
            print("Model trained for %s steps has been saved" % self.learn_step_counter)

            # save the replay buffer as well
            filename = 'buffer_agent' + str(self.agent_id) + '.txt'
            with open(filename, 'wb') as file:
                pickle.dump(self.memory, file)

    def setup_summary(self):
        cost = tf.Variable(0.)
        eps_rew_agent = tf.Variable(0.)
        eps_rew_all = tf.Variable(0.)
        tf.summary.scalar("cost", cost)
        tf.summary.scalar("eps_rew_agent", eps_rew_agent)
        tf.summary.scalar("eps_rew_all", eps_rew_all)
        summary_vars = [cost, eps_rew_agent, eps_rew_all]
        summary_placeholders = [tf.placeholder(tf.float32) for _ in range(len(summary_vars))]
        update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in range(len(summary_vars))]
        summary_op = tf.summary.merge_all()
        return summary_placeholders, update_ops, summary_op, summary_vars

    def plotting(self):
        tensorboard_info = [self.cost, self.episode_rew_agent, self.episode_rew_all]
        vars_plot = []
        for i in range(len(tensorboard_info)):
            vars_plot.append(self.sess.run(
                self.update_ops[i],
                feed_dict={self.summary_placeholders[i]: float(tensorboard_info[i])}))
        summary_1 = tf.Summary(value=[tf.Summary.Value(tag="cost", simple_value=vars_plot[0])])
        summary_2 = tf.Summary(value=[tf.Summary.Value(tag="eps_rew_agent", simple_value=vars_plot[1])])
        summary_3 = tf.Summary(value=[tf.Summary.Value(tag="eps_rew_all", simple_value=vars_plot[2])])
        self.summary_writer.add_summary(summary_1, self.learn_step_counter)
        self.summary_writer.add_summary(summary_2, self.episode)
        self.summary_writer.add_summary(summary_3, self.episode)

    def plot_cost(self):
        import matplotlib.pyplot as plt
        plt.plot(np.arange(len(self.cost_his)), self.cost_his)
        plt.ylabel('Cost')
        plt.xlabel('training steps')
        plt.show()

    def get_episode_reward(self, eps_r_agent, eps_r_all, episode):
        self.episode_rew_agent = eps_r_agent
        self.episode_rew_all = eps_r_all
        self.episode = episode
def train():
    gamma = 0.99
    episodes = 100
    batch_size = 128
    max_time_steps = 200
    episode_reward = 0
    reward_history = []
    env = gym.make('Pendulum-v0')
    obs_old = env.reset()
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    agent_policy = Policy(env, "policy")
    agent_critic = Value(env, "value")
    # agent_policy_t = Policy(env, "policy_t")
    # agent_critic_t = Value(env, "value_t")

    # initial rollouts to gather data
    for i in range(10000):
        action = env.action_space.sample()
        obs, rew, done, _ = env.step(action)
        episode_reward += rew
        memory.append(obs_old, action, rew, obs, done)
        obs_old = obs
        if done:
            # reward_history.append(episode_reward)
            episode_reward = 0
            env.reset()

    episode_reward = 0
    time_t = 200
    tf.summary.scalar("episode_time_steps", time_t)
    tf.summary.scalar("episode_reward", episode_reward)
    merged = tf.summary.merge_all()
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    mark = np.zeros([8, 1])
    time_split = np.zeros_like(mark)

    with tf.Session() as sess:
        sess1 = tf_debug.TensorBoardDebugWrapperSession(sess, "Vader:6007")
        # tf_debug.LocalCLIDebugWrapperSession(sess)
        from datetime import datetime
        now = datetime.now()
        train_writer = tf.summary.FileWriter(
            './train/' + now.strftime("%Y%m%d-%H%M%S") + '/', sess.graph)
        # train_writer = tf.summary.FileWriter('.' + '/train', sess.graph)
        agent_critic.create_target(0.1)
        agent_policy.create_target(0.1)
        # agent_policy_t.create_target_capacity(agent_policy.get_trainable_parameters(), 0.6)
        sess.run(tf.global_variables_initializer())
        sess.run(tf.initialize_all_variables())
        sess.run(tf.initialize_local_variables())
        sess1.run(agent_policy.get_trainable_parameters())
        # agent_critic.set_trainable_parameters(agent_critic.get_trainable_parameters(), 0)
        # agent_policy.set_trainable_parameters(agent_policy.get_trainable_parameters(), 0)

        for i in range(episodes):
            print('running episode:', i)
            t = 0
            done = 0
            while t < max_time_steps:
                # print(t)
                start = Time.time()
                action = agent_policy.predict(obs_old)  # + agent_policy.noise(0, 1 / episode_reward)
                mark[0] = Time.time() - start
                # print('mark1:' + str(mark1))
                action = action.reshape(-1)
                # if action > 0.5:
                #     action = 1
                # else:
                #     action = -0
                # action = env.action_space.sample()
                obs, rew, done, info = env.step(action)
                # env.render()
                episode_reward += rew
                memory.append(obs_old, action, rew, obs, done)
                # print('mark2:' + str(mark2))
                if done or t == max_time_steps - 1:
                    time_t = t
                    reward_history.append(episode_reward)
                    episode_reward = 0
                    env.reset()
                obs_old = obs
                t += 1

            for steps in range(50):
                batch = memory.sample(batch_size)
                obs_batch = batch['obs0']
                obs_batch -= np.mean(obs_batch, 0)
                obs_batch = obs_batch / np.var(obs_batch, 0)
                agent_policy.act_as_target = True
                action_batch_predict = agent_policy.predict(obs_batch)[0]
                agent_policy.act_as_target = False
                agent_critic.act_as_target = True
                value_batch = agent_critic.predict(obs_batch, action_batch_predict)[0]
                # print(value_batch[0])
                agent_critic.act_as_target = False
                y = np.array(batch['rewards']) + gamma * np.array(value_batch)  # .reshape(-1, batch_size)
                agent_critic.update_value(obs=obs_batch,
                                          action=action_batch_predict,
                                          target=y)
                q_grad = np.array(
                    agent_critic.get_q_gradient(action_batch_predict, obs_batch)
                ).reshape(-1, env.action_space.shape[0])
                agent_policy.optimize_policy(q_grad, obs_batch)
                parm = agent_critic.get_all_parameters()
                value = np.array(sess.run(agent_critic.get_all_parameters()))
                agent_critic.update_target()
                agent_policy.update_target()
                value = np.array(sess.run(agent_critic.get_all_parameters()))

            # print(time_split)
            print(reward_history[-1])
            summary = sess.run(merged)
            train_writer.add_summary(summary, i)
            time = 200
            episode_reward = 0