def test_random_sampling(self):
    rb = ReplayBuffer(3)
    rb.add(Transitions[0]).add(Transitions[1]).add(Transitions[1]).add(Transitions[2])
    samples = rb.sample(100)
    n_1, n_2 = 0, 0
    for sample in samples:
        if sample == Transitions[1]:
            n_1 += 1
        elif sample == Transitions[2]:
            n_2 += 1
        else:
            pytest.fail()
    assert n_1 > n_2
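# A minimal sketch (not the project's actual implementation) of the fixed-capacity,
# uniformly sampling ReplayBuffer interface that the test above assumes: add() evicts
# the oldest transition once capacity is reached and returns self so calls can be
# chained, and sample(n) draws n transitions uniformly with replacement.
import random
from collections import deque


class SimpleReplayBuffer:
    def __init__(self, capacity):
        self._storage = deque(maxlen=capacity)  # oldest entries are dropped automatically

    def add(self, transition):
        self._storage.append(transition)
        return self  # allow chained rb.add(...).add(...)

    def sample(self, n):
        return [random.choice(self._storage) for _ in range(n)]

    def __len__(self):
        return len(self._storage)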
class DQN(Trainer):
    def __init__(self, parameters):
        super(DQN, self).__init__(parameters)
        self.replay_buffer = ReplayBuffer(self.buffersize)

    def push_to_buffer(self, state, action, reward, next_state, done):
        self.replay_buffer.push(state, action, reward, next_state, done)

    def compute_td_loss(self, batch_size, *args):
        state, action, reward, next_state, done = self.replay_buffer.sample(batch_size)

        state = Variable(torch.FloatTensor(np.float32(state)))
        next_state = Variable(torch.FloatTensor(np.float32(next_state)))
        action = Variable(torch.LongTensor(action))
        reward = Variable(torch.FloatTensor(reward))
        done = Variable(torch.FloatTensor(done))

        q_values = self.current_model(state)
        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

        # Double DQN target: actions are selected by the current network and
        # evaluated by the target network.
        next_q_values = self.current_model(next_state)
        next_q_state_values = self.target_model(next_state)
        next_q_value = next_q_state_values.gather(
            1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
        expected_q_value = reward + self.gamma * next_q_value * (1 - done)

        # Clipped loss: quadratic for errors <= 1, capped at 1 beyond that.
        loss = (q_value - Variable(expected_q_value.data)).abs()
        loss[loss.le(1)] = loss[loss.le(1)].pow(2)
        loss[loss.gt(1)] = 1  # (loss[loss.gt(1)] + 1) / 2
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss
class Runner:
    def __init__(self, env, args):
        self.env = env
        # Used to evaluate the algorithm on a sparse-reward environment:
        # +1 for a win, -1 for a loss, 0 for any other ordinary step.
        '''
        self.env_evaluate = StarCraft2Env(map_name=args.map,
                                          step_mul=args.step_mul,
                                          difficulty=args.difficulty,
                                          game_version=args.game_version,
                                          seed=args.seed,
                                          replay_dir=args.replay_dir,
                                          reward_sparse=True,
                                          reward_scale=False)
        '''
        self.env_evaluate = MeetEnv()
        if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
            self.evaluateWorker = CommRolloutWorker(self.env_evaluate, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
            self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)
        if args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 and args.alg.find('reinforce') == -1:
            # these three algorithms are on-policy, so they do not need a replay buffer
            self.buffer = ReplayBuffer(args)
        self.args = args

        # directory for saving the plots and pkl files
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

    def run(self, num):
        plt.figure()
        plt.axis([0, self.args.n_epoch, 0, 100])
        win_rates = []
        episode_rewards = []
        train_steps = 0
        # print('Run {} start'.format(num))
        for epoch in range(self.args.n_epoch):
            print('Run {}, train epoch {}'.format(num, epoch))
            if epoch % self.args.evaluate_cycle == 0:
                win_rate, episode_reward = self.evaluate()
                # print('win_rate is ', win_rate)
                win_rates.append(win_rate)
                episode_rewards.append(episode_reward)
                plt.cla()
                plt.subplot(2, 1, 1)
                plt.plot(range(len(win_rates)), win_rates)
                plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
                plt.ylabel('win_rate')
                plt.subplot(2, 1, 2)
                plt.plot(range(len(episode_rewards)), episode_rewards)
                plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
                plt.ylabel('episode_rewards')
                plt.savefig(self.save_path + '/plt_{}.png'.format(num), format='png')
                np.save(self.save_path + '/win_rates_{}'.format(num), win_rates)
                np.save(self.save_path + '/episode_rewards_{}'.format(num), episode_rewards)

            episodes = []
            # collect self.args.n_episodes episodes
            for episode_idx in range(self.args.n_episodes):
                episode, _ = self.rolloutWorker.generate_episode(episode_idx)
                episodes.append(episode)
                # print(_)
            # Each entry of an episode is a 4-dimensional array of shape
            # (1, episode_len, n_agents, feature_dim); concatenate all episodes along axis 0.
            episode_batch = episodes[0]
            episodes.pop(0)
            for episode in episodes:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate((episode_batch[key], episode[key]), axis=0)
            if self.args.alg.find('coma') > -1 or self.args.alg.find('central_v') > -1 or self.args.alg.find('reinforce') > -1:
                self.agents.train(episode_batch, train_steps, self.rolloutWorker.epsilon)
                train_steps += 1
            else:
                self.buffer.store_episode(episode_batch)
                for train_step in range(self.args.train_steps):
                    mini_batch = self.buffer.sample(min(self.buffer.current_size, self.args.batch_size))
                    self.agents.train(mini_batch, train_steps)
                    train_steps += 1

        plt.cla()
        plt.subplot(2, 1, 1)
        plt.plot(range(len(win_rates)), win_rates)
        plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('win_rate')
        plt.subplot(2, 1, 2)
        plt.plot(range(len(episode_rewards)), episode_rewards)
        plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('episode_rewards')
        plt.savefig(self.save_path + '/plt_{}.png'.format(num), format='png')
        np.save(self.save_path + '/win_rates_{}'.format(num), win_rates)
        np.save(self.save_path + '/episode_rewards_{}'.format(num), episode_rewards)

    def evaluate(self):
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward = self.rolloutWorker.generate_episode(evaluate=True)
            episode_rewards += episode_reward
            if episode_reward > self.args.threshold:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch

    def evaluate_sparse(self):
        win_number = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward = self.evaluateWorker.generate_episode(evaluate=True)
            result = 'win' if episode_reward > 0 else 'defeat'
            print('Epoch {}: {}'.format(epoch, result))
            if episode_reward > 0:
                win_number += 1
        self.env_evaluate.close()
        return win_number / self.args.evaluate_epoch
class Runner:
    def __init__(self, env, args):
        self.env = env
        if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if args.learn and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
                and args.alg.find('reinforce') == -1:
            # these three algorithms are on-policy, so they do not need a replay buffer
            if args.use_per:
                self.buffer = PrioritizedReplayBuffer(args)
            else:
                self.buffer = ReplayBuffer(args)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []

        # directory for saving the plots and pkl files
        self.save_path = self.args.result_dir + '/' + args.map + '/'
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        self.file_name = self.save_path + str(args.env_name) + '_' + str(
            args.n_agents) + '_' + str(args.map_size) + '_' + args.name_time

    def run(self, num):
        train_steps = 0
        episode_rewards = 0
        fixed_rewards = 0
        st = time.time()
        plot_rewards = []
        # print('Run {} start'.format(num))
        for epoch in range(self.args.n_epoch):
            # print('Run {}, train epoch {}'.format(num, epoch))
            # if epoch % self.args.evaluate_cycle == 0:
            #     win_rate, episode_reward = self.evaluate()
            #     # print('win_rate is ', win_rate)
            #     self.win_rates.append(win_rate)
            #     self.episode_rewards.append(episode_reward)
            #     print(episode_reward)
            #     # self.plt(num)
            episodes = []
            # collect self.args.n_episodes episodes
            for episode_idx in range(self.args.n_episodes):
                if self.args.use_ja:
                    if self.args.use_v1:
                        episode, episode_reward, rate, fixed_reward = \
                            self.rolloutWorker.generate_episode_ja_v2(episode_idx)
                    else:
                        episode, episode_reward, rate, fixed_reward = \
                            self.rolloutWorker.generate_episode_ja_v3(episode_idx)
                else:
                    episode, episode_reward, rate, fixed_reward = \
                        self.rolloutWorker.generate_episode(episode_idx)
                episodes.append(episode)
                episode_rewards += episode_reward
                fixed_rewards += fixed_reward
                plot_rewards.append(episode_reward)
            if epoch % self.args.evaluate_cycle == 0:
                t = time.time() - st
                st = time.time()
                epr = round(episode_rewards / self.args.evaluate_cycle, 2)
                fr = round(fixed_rewards / self.args.evaluate_cycle, 2)
                print('train epoch {}, reward {}, time {}, rate {}'.format(epoch, [epr, fr], t, rate))
                # wandb.log({"reward": epr, "test_reward": epr})
                episode_rewards = 0
                fixed_rewards = 0
                with open(self.file_name, 'wb') as fp:
                    pickle.dump(plot_rewards, fp)
            # Each entry of an episode is a 4-dimensional array of shape
            # (1, episode_len, n_agents, feature_dim); concatenate all episodes along axis 0.
            episode_batch = episodes[0]
            episodes.pop(0)
            for episode in episodes:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate((episode_batch[key], episode[key]), axis=0)
            if self.args.alg.find('coma') > -1 or self.args.alg.find('central_v') > -1 \
                    or self.args.alg.find('reinforce') > -1:
                self.agents.train(episode_batch, train_steps, self.rolloutWorker.epsilon)
                train_steps += 1
            elif not self.args.load_model:
                self.buffer.store_episode(episode_batch)
                for train_step in range(self.args.train_steps):
                    # mini_batch = self.buffer.sample(min(self.buffer.current_size, self.args.batch_size))
                    # # print(mini_batch['terminated'])
                    # # print(train_steps)
                    # dq = self.agents.train(mini_batch, train_steps)
                    if self.args.use_per:
                        mini_batch, idxs = self.buffer.sample(
                            min(self.buffer.current_size, self.args.batch_size))
                        dq = self.agents.train(mini_batch, train_steps)
                        self.buffer.update_priorities(idxs, dq)
                    else:
                        mini_batch = self.buffer.sample(
                            min(self.buffer.current_size, self.args.batch_size))
                        dq = self.agents.train(mini_batch, train_steps)
                    train_steps += 1
        # self.plt(num)

    def evaluate(self):
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward, win_tag = self.rolloutWorker.generate_episode(epoch, evaluate=True)
            episode_rewards += episode_reward
            if win_tag:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch
class Runner:
    def __init__(self, env, args):
        self.env = env
        if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if args.learn and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
                and args.alg.find('reinforce') == -1:
            # these three algorithms are on-policy, so they do not need a replay buffer
            self.buffer = ReplayBuffer(args)
        self.args = args
        self.plt_success = []
        self.episode_rewards = []

        # directory for saving the plots and pkl files
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.env_name
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

    def run(self, num):
        train_steps = 0
        for epoch in range(self.args.n_epoch):
            epoch_success = 0
            add_rate = 0
            epoch_begin_time = time.time()
            for n in range(self.args.epoch_size):
                episodes = []
                batch_success = 0
                # one batch: collect self.args.n_episodes episodes
                for episode_idx in range(self.args.n_episodes):
                    episode, success, add_rate = self.rolloutWorker.generate_episode(epoch, episode_idx)
                    episodes.append(episode)
                    batch_success += success
                # Each entry of an episode is a 4-dimensional array of shape
                # (1, episode_len, n_agents, feature_dim); concatenate all episodes along axis 0.
                episode_batch = episodes[0]
                episodes.pop(0)
                for episode in episodes:
                    for key in episode_batch.keys():
                        episode_batch[key] = np.concatenate((episode_batch[key], episode[key]), axis=0)
                if self.args.alg.find('coma') > -1 or self.args.alg.find('central_v') > -1 \
                        or self.args.alg.find('reinforce') > -1:
                    self.agents.train(episode_batch, train_steps, self.rolloutWorker.epsilon)
                    train_steps += 1
                else:
                    self.buffer.store_episode(episode_batch)
                    for train_step in range(self.args.train_steps):
                        mini_batch = self.buffer.sample(min(self.buffer.current_size, self.args.batch_size))
                        self.agents.train(mini_batch, train_steps)
                        train_steps += 1
                print('\t\t batch success: {:.3f}'.format(batch_success / self.args.n_episodes))
                epoch_success += batch_success
            print('Run {}, train epoch {}'.format(num, epoch))
            epoch_time = time.time() - epoch_begin_time
            print('Time {:.2f}s'.format(epoch_time))
            print('Add_rate: {:.2f}\t Success: {:.2f}'.format(
                add_rate, epoch_success / self.args.epoch_size / self.args.n_episodes))
            self.plt_success.append(epoch_success / self.args.epoch_size / self.args.n_episodes)
        print('random seed', self.args.seed)
        self.plt(num)

    def evaluate(self):
        print('yes')
        epoch_success = 0
        for epoch in range(self.args.evaluate_epoch):
            _, success, _ = self.rolloutWorker.generate_episode(epoch, evaluate=True)
            epoch_success += success
        return epoch_success / self.args.evaluate_epoch

    def plt(self, num):
        plt.figure()
        plt.axis([0, self.args.n_epoch, 0, 1])
        plt.plot(range(len(self.plt_success)), self.plt_success)
        plt.xlabel('epoch*{}'.format(self.args.n_epoch))
        plt.ylabel('success rate')
        plt.savefig(self.save_path + '/plt_{}.png'.format(self.args.seed), format='png')
        plt.show()
        np.save(self.save_path + '/success_rate_{}'.format(self.args.seed), self.plt_success)
class DQNAgent(object):
    """
    refs: https://github.com/skumar9876/Hierarchical-DQN/blob/master/dqn.py
    """
    def __init__(self, states_n: tuple, actions_n: int, hidden_layers: list,
                 scope_name: str, sess=None, learning_rate=1e-4, discount=0.98,
                 replay_memory_size=100000, batch_size=32, begin_train=1000,
                 targetnet_update_freq=1000, epsilon_start=1.0, epsilon_end=0.1,
                 epsilon_decay_step=50000, seed=1, logdir='logs', savedir='save',
                 save_freq=10000, use_tau=False, tau=0.001):
        """
        :param states_n: tuple
        :param actions_n: int
        :param hidden_layers: list
        :param scope_name: str
        :param sess: tf.Session
        :param learning_rate: float
        :param discount: float
        :param replay_memory_size: int
        :param batch_size: int
        :param begin_train: int
        :param targetnet_update_freq: int
        :param epsilon_start: float
        :param epsilon_end: float
        :param epsilon_decay_step: int
        :param seed: int
        :param logdir: str
        """
        self.states_n = states_n
        self.actions_n = actions_n
        self._hidden_layers = hidden_layers
        self._scope_name = scope_name
        self.lr = learning_rate
        self._target_net_update_freq = targetnet_update_freq
        self._current_time_step = 0
        self._epsilon_schedule = LinearSchedule(epsilon_decay_step, epsilon_end, epsilon_start)
        self._train_batch_size = batch_size
        self._begin_train = begin_train
        self._gamma = discount
        self._use_tau = use_tau
        self._tau = tau
        self.savedir = savedir
        self.save_freq = save_freq

        self.qnet_optimizer = tf.train.AdamOptimizer(self.lr)
        self._replay_buffer = ReplayBuffer(replay_memory_size)
        self._seed(seed)

        with tf.Graph().as_default():
            self._build_graph()
            self._merged_summary = tf.summary.merge_all()
            if sess is None:
                self.sess = tf.Session()
            else:
                self.sess = sess
            self.sess.run(tf.global_variables_initializer())
            self._saver = tf.train.Saver()
            self._summary_writer = tf.summary.FileWriter(logdir=logdir)
            self._summary_writer.add_graph(tf.get_default_graph())

    def show_memory(self):
        print(self._replay_buffer.show())

    def _q_network(self, state, hidden_layers, outputs, scope_name, trainable):
        with tf.variable_scope(scope_name):
            out = state
            for ly in hidden_layers:
                out = layers.fully_connected(out, ly, activation_fn=tf.nn.relu, trainable=trainable)
            out = layers.fully_connected(out, outputs, activation_fn=None, trainable=trainable)
        return out

    def _build_graph(self):
        self._state = tf.placeholder(dtype=tf.float32, shape=(None, ) + self.states_n,
                                     name='state_input')
        with tf.variable_scope(self._scope_name):
            self._q_values = self._q_network(self._state, self._hidden_layers,
                                             self.actions_n, 'q_network', True)
            self._target_q_values = self._q_network(self._state, self._hidden_layers,
                                                    self.actions_n, 'target_q_network', False)

        with tf.variable_scope('q_network_update'):
            self._actions_onehot = tf.placeholder(dtype=tf.float32, shape=(None, self.actions_n),
                                                  name='actions_onehot_input')
            self._td_targets = tf.placeholder(dtype=tf.float32, shape=(None, ), name='td_targets')
            self._q_values_pred = tf.reduce_sum(self._q_values * self._actions_onehot, axis=1)
            # Huber-style loss: quadratic inside [0, 1], linear beyond.
            self._error = tf.abs(self._q_values_pred - self._td_targets)
            quadratic_part = tf.clip_by_value(self._error, 0.0, 1.0)
            linear_part = self._error - quadratic_part
            self._loss = tf.reduce_mean(0.5 * tf.square(quadratic_part) + linear_part)

            qnet_gradients = self.qnet_optimizer.compute_gradients(self._loss, tf.trainable_variables())
            for i, (grad, var) in enumerate(qnet_gradients):
                if grad is not None:
                    qnet_gradients[i] = (tf.clip_by_norm(grad, 10), var)
            self.train_op = self.qnet_optimizer.apply_gradients(qnet_gradients)

            tf.summary.scalar('loss', self._loss)

        with tf.name_scope('target_network_update'):
            q_network_params = [
                t for t in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope=self._scope_name + '/q_network')
                if t.name.startswith(self._scope_name + '/q_network/')
            ]
            target_q_network_params = tf.get_collection(
                tf.GraphKeys.GLOBAL_VARIABLES, scope=self._scope_name + '/target_q_network')

            self.target_update_ops = []
            for var, var_target in zip(sorted(q_network_params, key=lambda v: v.name),
                                       sorted(target_q_network_params, key=lambda v: v.name)):
                # self.target_update_ops.append(var_target.assign(var))
                # soft target update
                self.target_update_ops.append(
                    var_target.assign(tf.multiply(var_target, 1 - self._tau) +
                                      tf.multiply(var, self._tau)))
            self.target_update_ops = tf.group(*self.target_update_ops)

    def choose_action(self, state, epsilon=None):
        """
        for one agent
        :param state:
        :param epsilon:
        :return:
        """
        if epsilon is not None:
            epsilon_used = epsilon
        else:
            epsilon_used = self._epsilon_schedule.value(self._current_time_step)
        if np.random.random() < epsilon_used:
            return np.random.randint(0, self.actions_n)
        else:
            q_values = self.sess.run(self._q_values, feed_dict={self._state: state[None]})
            return np.argmax(q_values[0])

    def choose_actions(self, states, epsilons=None):
        """
        for multi-agent
        :param states:
        :param epsilons:
        :return:
        """
        if epsilons is not None:
            epsilons_used = epsilons
        else:
            epsilons_used = self._epsilon_schedule.value(self._current_time_step)
        actions = []
        for i, state in enumerate(states):
            if np.random.random() < epsilons_used[i]:
                actions.append(np.random.randint(0, self.actions_n))
            else:
                q_values = self.sess.run(self._q_values, feed_dict={self._state: state[None]})
                actions.append(np.argmax(q_values[0]))
        return actions

    def check_network_output(self, state):
        q_values = self.sess.run(self._q_values, feed_dict={self._state: state[None]})
        print(q_values[0])

    def store(self, state, action, reward, next_state, terminate):
        self._replay_buffer.add(state, action, reward, next_state, terminate)

    def get_max_target_Q_s_a(self, next_states):
        next_state_q_values = self.sess.run(self._q_values, feed_dict={self._state: next_states})
        next_state_target_q_values = self.sess.run(self._target_q_values,
                                                   feed_dict={self._state: next_states})
        next_select_actions = np.argmax(next_state_q_values, axis=1)
        bt_sz = len(next_states)
        next_select_actions_onehot = np.zeros((bt_sz, self.actions_n))
        for i in range(bt_sz):
            next_select_actions_onehot[i, next_select_actions[i]] = 1.
        next_state_max_q_values = np.sum(next_state_target_q_values * next_select_actions_onehot, axis=1)
        return next_state_max_q_values

    def train(self):
        self._current_time_step += 1

        if self._current_time_step == 1:
            print('Training starts.')
            self.sess.run(self.target_update_ops)

        if self._current_time_step > self._begin_train:
            states, actions, rewards, next_states, terminates = self._replay_buffer.sample(
                batch_size=self._train_batch_size)
            actions_onehot = np.zeros((self._train_batch_size, self.actions_n))
            for i in range(self._train_batch_size):
                actions_onehot[i, actions[i]] = 1.

            # Double DQN: select the greedy actions with the online network,
            # evaluate them with the target network.
            next_state_q_values = self.sess.run(self._q_values, feed_dict={self._state: next_states})
            next_state_target_q_values = self.sess.run(self._target_q_values,
                                                       feed_dict={self._state: next_states})
            next_select_actions = np.argmax(next_state_q_values, axis=1)
            next_select_actions_onehot = np.zeros((self._train_batch_size, self.actions_n))
            for i in range(self._train_batch_size):
                next_select_actions_onehot[i, next_select_actions[i]] = 1.
            next_state_max_q_values = np.sum(next_state_target_q_values * next_select_actions_onehot,
                                             axis=1)

            td_targets = rewards + self._gamma * next_state_max_q_values * (1 - terminates)

            _, str_ = self.sess.run([self.train_op, self._merged_summary],
                                    feed_dict={
                                        self._state: states,
                                        self._actions_onehot: actions_onehot,
                                        self._td_targets: td_targets
                                    })
            self._summary_writer.add_summary(str_, self._current_time_step)

            # update target_net
            if self._use_tau:
                self.sess.run(self.target_update_ops)
            else:
                if self._current_time_step % self._target_net_update_freq == 0:
                    self.sess.run(self.target_update_ops)

            # save model
            if self._current_time_step % self.save_freq == 0:
                # TODO save the model with highest performance
                self._saver.save(sess=self.sess, save_path=self.savedir + '/my-model',
                                 global_step=self._current_time_step)

    def train_without_replaybuffer(self, states, actions, target_values):
        self._current_time_step += 1

        if self._current_time_step == 1:
            print('Training starts.')
            self.sess.run(self.target_update_ops)

        bt_sz = len(states)
        actions_onehot = np.zeros((bt_sz, self.actions_n))
        for i in range(bt_sz):
            actions_onehot[i, actions[i]] = 1.

        _, str_ = self.sess.run([self.train_op, self._merged_summary],
                                feed_dict={
                                    self._state: states,
                                    self._actions_onehot: actions_onehot,
                                    self._td_targets: target_values
                                })
        self._summary_writer.add_summary(str_, self._current_time_step)

        # update target_net
        if self._use_tau:
            self.sess.run(self.target_update_ops)
        else:
            if self._current_time_step % self._target_net_update_freq == 0:
                self.sess.run(self.target_update_ops)

        # save model
        if self._current_time_step % self.save_freq == 0:
            # TODO save the model with highest performance
            self._saver.save(sess=self.sess, save_path=self.savedir + '/my-model',
                             global_step=self._current_time_step)

    def load_model(self):
        self._saver.restore(self.sess, tf.train.latest_checkpoint(self.savedir))

    def _seed(self, lucky_number):
        tf.set_random_seed(lucky_number)
        np.random.seed(lucky_number)
        random.seed(lucky_number)
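# A hedged usage sketch for the DQNAgent above. The 'CartPole-v0' setup, the hidden layer
# sizes and the step budget are illustrative assumptions, not part of the original code:
# transitions are stored step by step and train() is called once per step, with the agent
# handling epsilon annealing, target updates and checkpointing internally.
import gym

if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    agent = DQNAgent(states_n=env.observation_space.shape,
                     actions_n=env.action_space.n,
                     hidden_layers=[64, 64],
                     scope_name='cartpole_dqn')
    state = env.reset()
    for step in range(10000):
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.store(state, action, reward, next_state, float(done))
        agent.train()  # optimization only starts after begin_train stored steps
        state = env.reset() if done else next_state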
class Runner:
    def __init__(self, env, args):
        self.env = env
        if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if not args.evaluate and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
                and args.alg.find('reinforce') == -1:
            # these three algorithms are on-policy, so they do not need a replay buffer
            self.buffer = ReplayBuffer(args)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []

        # directory for saving the plots and pkl files
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

    def run(self, num):
        time_steps, train_steps, evaluate_steps = 0, 0, -1
        while time_steps < self.args.n_steps:
            print('Run {}, time_steps {}'.format(num, time_steps))
            if time_steps // self.args.evaluate_cycle > evaluate_steps:
                win_rate, episode_reward = self.evaluate()
                # print('win_rate is ', win_rate)
                self.win_rates.append(win_rate)
                self.episode_rewards.append(episode_reward)
                self.plt(num)
                evaluate_steps += 1
            episodes = []
            # collect self.args.n_episodes episodes
            for episode_idx in range(self.args.n_episodes):
                episode, _, _, steps = self.rolloutWorker.generate_episode(episode_idx)
                episodes.append(episode)
                time_steps += steps
                # print(_)
            # Each entry of an episode is a 4-dimensional array of shape
            # (1, episode_len, n_agents, feature_dim); concatenate all episodes along axis 0.
            episode_batch = episodes[0]
            episodes.pop(0)
            for episode in episodes:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate((episode_batch[key], episode[key]), axis=0)
            if self.args.alg.find('coma') > -1 or self.args.alg.find('central_v') > -1 \
                    or self.args.alg.find('reinforce') > -1:
                self.agents.train(episode_batch, train_steps, self.rolloutWorker.epsilon)
                train_steps += 1
            else:
                self.buffer.store_episode(episode_batch)
                for train_step in range(self.args.train_steps):
                    mini_batch = self.buffer.sample(min(self.buffer.current_size, self.args.batch_size))
                    self.agents.train(mini_batch, train_steps)
                    train_steps += 1
        win_rate, episode_reward = self.evaluate()
        print('win_rate is ', win_rate)
        self.win_rates.append(win_rate)
        self.episode_rewards.append(episode_reward)
        self.plt(num)

    def evaluate(self):
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward, win_tag, _ = self.rolloutWorker.generate_episode(epoch, evaluate=True)
            episode_rewards += episode_reward
            if win_tag:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch

    def plt(self, num):
        plt.figure()
        plt.ylim([0, 105])
        plt.cla()
        plt.subplot(2, 1, 1)
        plt.plot(range(len(self.win_rates)), self.win_rates)
        plt.xlabel('step*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('win_rates')
        plt.subplot(2, 1, 2)
        plt.plot(range(len(self.episode_rewards)), self.episode_rewards)
        plt.xlabel('step*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('episode_rewards')
        plt.savefig(self.save_path + '/plt_{}.png'.format(num), format='png')
        np.save(self.save_path + '/win_rates_{}'.format(num), self.win_rates)
        np.save(self.save_path + '/episode_rewards_{}'.format(num), self.episode_rewards)
        plt.close()
class Runner:
    def __init__(self, env, args):
        self.env = env
        self.args = args
        self.agents = Agents(args)
        self.qmix_pg_learner = QMIX_PG(self.agents, args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if args.learn and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
                and args.alg.find('reinforce') == -1:
            # these three algorithms are on-policy, so they do not need replay buffers
            self.critic_buffer = ReplayBuffer(args, args.critic_buffer_size)
            self.actor_buffer = ReplayBuffer(args, args.actor_buffer_size)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []
        tmp = f'clamp2-5_' + f'{args.loss_coeff_entropy}_' + \
            f'{args.buffer_size}_{args.actor_buffer_size}_{args.critic_buffer_size}_' \
            f'{args.actor_train_steps}_{args.critic_train_steps}_' \
            f'{args.actor_update_delay}_{args.critic_lr}'
        # f'clamp2-5_' + anneal_epsilon
        self.save_path = self.args.result_dir + '/linear_mix/' + 'qmix_ac_total_cf' + '/' + tmp + '/' + args.map
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

    def run(self, num):
        train_steps = 0
        epsilon = self.args.epsilon  # initial epsilon
        # print('Run {} start'.format(num))
        for epoch in range(self.args.n_epoch):
            print('Run {}, train epoch {}'.format(num, epoch))
            if epoch % self.args.evaluate_cycle == 0:  # 100
                win_rate, episode_reward = self.evaluate()
                # print('win_rate is ', win_rate)
                self.win_rates.append(win_rate)
                self.episode_rewards.append(episode_reward)
                self.plt(num)

            episodes = []
            if self.args.epsilon_anneal_scale == 'epoch':
                epsilon = epsilon - self.args.anneal_epsilon if epsilon > self.args.min_epsilon else epsilon
            # collect self.args.n_episodes episodes
            for episode_idx in range(self.args.n_episodes):  # 1
                episode, _, _ = self.rolloutWorker.generate_episode(episode_idx, evaluate=False, epsilon=epsilon)
                episodes.append(episode)
                # print(_)
            # Each entry of an episode is a 4-dimensional array of shape
            # (1, episode_len, n_agents, feature_dim); concatenate all episodes along axis 0.
            episode_batch = episodes[0]
            episodes.pop(0)
            for episode in episodes:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate((episode_batch[key], episode[key]), axis=0)
            if self.args.alg.find('coma') > -1 or self.args.alg.find('central_v') > -1 \
                    or self.args.alg.find('reinforce') > -1:
                self.agents.train(episode_batch, train_steps, self.rolloutWorker.epsilon)
                train_steps += 1
            else:
                self.critic_buffer.store_episode(episode_batch)
                self.actor_buffer.store_episode(episode_batch)
                # if epoch % 16 == 0:  # 2
                for train_step in range(self.args.critic_train_steps):  # 1  # 16
                    mini_batch = self.critic_buffer.sample(
                        min(self.critic_buffer.current_size, self.args.critic_batch_size))  # 32 episodes  # 16
                    self.qmix_pg_learner.train_critic(mini_batch, self.args.episode_limit, train_steps)
                    train_steps += 1
                if epoch % self.args.actor_update_delay == 0:  # 2
                    for train_step in range(self.args.actor_train_steps):  # 1  # 16
                        mini_batch = self.actor_buffer.sample(
                            min(self.actor_buffer.current_size, self.args.actor_batch_size))  # 16 episodes  # 16
                        self.qmix_pg_learner.train_actor(mini_batch, self.args.episode_limit)
        self.plt(num)

    def evaluate(self):
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward, win_tag = self.rolloutWorker.generate_episode(epoch, evaluate=True, epsilon=0)
            episode_rewards += episode_reward
            if win_tag:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch

    def plt(self, num):
        plt.figure()
        plt.axis([0, self.args.n_epoch, 0, 100])
        plt.cla()
        plt.subplot(2, 1, 1)
        plt.plot(range(len(self.win_rates)), self.win_rates)
        plt.ylabel('win_rate')
        plt.subplot(2, 1, 2)
        plt.plot(range(len(self.episode_rewards)), self.episode_rewards)
        plt.xlabel('episodes*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('episode_rewards')
        plt.savefig(self.save_path + '/plt_{}.png'.format(num), format='png')
        np.save(self.save_path + '/win_rates_{}'.format(num), self.win_rates)
        np.save(self.save_path + '/episode_rewards_{}'.format(num), self.episode_rewards)
def main():
    with open('cartpole.json', encoding='utf-8') as config_file:
        config = json.load(config_file)

    env = gym.make('CartPole-v0')
    state_shape = env.observation_space.shape
    action_count = env.action_space.n

    layers = []
    for layer in config['layers']:
        layers.append(Dense(layer, activation=C.relu))
    layers.append(Dense((action_count, config['n']), activation=None))
    model_func = Sequential(layers)

    replay_buffer = ReplayBuffer(config['buffer_capacity'])

    # Fill the buffer with randomly generated samples
    state = env.reset()
    for i in range(config['buffer_capacity']):
        action = env.action_space.sample()
        post_state, reward, done, _ = env.step(action)
        replay_buffer.add(state.astype(np.float32), action, reward,
                          post_state.astype(np.float32), float(done))
        if done:
            state = env.reset()

    reward_buffer = np.zeros(config['max_episodes'], dtype=np.float32)
    losses = []
    epsilon_schedule = LinearSchedule(1, 0.01, config['max_episodes'])
    agent = CategoricalAgent(state_shape, action_count, model_func,
                             config['vmin'], config['vmax'], config['n'],
                             lr=config['lr'], gamma=config['gamma'])
    log_freq = config['log_freq']

    for episode in range(1, config['max_episodes'] + 1):
        state = env.reset().astype(np.float32)
        done = False
        while not done:
            action = agent.act(state, epsilon_schedule.value(episode))
            post_state, reward, done, _ = env.step(action)
            post_state = post_state.astype(np.float32)
            replay_buffer.add(state, action, reward, post_state, float(done))
            reward_buffer[episode - 1] += reward
            state = post_state

            minibatch = replay_buffer.sample(config['minibatch_size'])
            agent.train(*minibatch)
            loss = agent.trainer.previous_minibatch_loss_average
            losses.append(loss)

        if episode % config['target_update_freq'] == 0:
            agent.update_target()

        if episode % log_freq == 0:
            average = np.sum(reward_buffer[episode - log_freq: episode]) / log_freq
            print('Episode {:4d} | Loss: {:6.4f} | Reward: {}'.format(episode, loss, average))

    agent.model.save('cartpole.cdqn')

    sns.set_style('dark')
    pd.Series(reward_buffer).rolling(window=log_freq).mean().plot()
    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.title('CartPole - Reward with Time')
    plt.show()

    plt.plot(np.arange(len(losses)), losses)
    plt.xlabel('Episode')
    plt.ylabel('Loss')
    plt.title('CartPole - Loss with Time')
    plt.show()
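# A minimal sketch of the LinearSchedule helper assumed by main() above. This is not
# necessarily the project's exact implementation, and other snippets in this collection
# pass the constructor arguments in a different order; here it linearly anneals epsilon
# from `start` to `end` over `decay_steps` calls to value(t), then stays at `end`.
class LinearScheduleSketch:
    def __init__(self, start, end, decay_steps):
        self.start = start
        self.end = end
        self.decay_steps = decay_steps

    def value(self, t):
        # fraction of the decay that has elapsed, clamped to [0, 1]
        fraction = min(float(t) / self.decay_steps, 1.0)
        return self.start + fraction * (self.end - self.start)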
losses, episode_rewards = [], []
episode_reward, episode_iters, episodes = 0, 0, 0
state = env.reset()

# training loop
for i in range(1, num_frames + 1):
    action = current_model.act(state)
    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)

    state = next_state
    episode_reward += reward
    episode_iters += 1

    if len(replay_buffer) >= args.batch_size and current_model.training:
        states, actions, rewards, next_states, dones = replay_buffer.sample(
            args.batch_size, cuda=USE_CUDA, to_pytorch=True)

        # calculate source distribution
        source_dist = current_model(states)
        actions = actions.unsqueeze(1).unsqueeze(1).expand(args.batch_size, 1, args.num_atoms)
        source_dist = source_dist.gather(1, actions).squeeze(1)

        # calculate target distribution
        target_dist, bins = target_distribution(next_states, rewards, dones, target_model, args)

        loss = loss_fn(source_dist, bins if args.no_projection else target_dist)
        losses += [loss.data[0]]

        optimizer.zero_grad()
        loss.backward()
class Runner:
    def __init__(self, env, args, itr, seed):
        # set the random seed
        if seed is not None:
            self.setup_seed(seed)
        self.args = args
        # the environment
        self.env = env
        # process index
        self.pid = itr
        self.replay_buffer = ReplayBuffer(self.args)
        self.win_rates = []
        '''
        Here, episode_reward is the cumulative reward of a single episode,
        episodes_reward is the cumulative rewards of several episodes, and
        episodes_rewards is the cumulative rewards of several episodes over
        several evaluations.
        '''
        self.episodes_rewards = []
        self.evaluate_itr = []
        self.max_win_rate = 0
        self.time_steps = 0

        # Location for saving results and models; the counter helps when running
        # several instances at once.
        alg_dir = self.args.alg + '_' + str(self.args.epsilon_anneal_steps // 10000) + 'w' + '_' + \
            str(self.args.target_update_period)
        self.alg_tag = '_' + self.args.optim
        if self.args.her:
            self.alg_tag += str(self.args.her)
            alg_dir += '_her=' + str(self.args.her)
        # self.save_path = self.args.result_dir + '/' + alg_dir + '/' + self.args.map + '/' + itr
        self.save_path = self.args.result_dir + '/' + self.args.map + '/' + alg_dir + '/' + itr
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        self.args.model_dir = args.model_dir + '/' + args.map + '/' + alg_dir + '/' + itr
        self.agents = Agents(args, itr=itr)
        print('step runner initialized')
        if self.args.her:
            print('using HER')

    @staticmethod
    def setup_seed(seed):
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.backends.cudnn.deterministic = True

    def generate_episode(self, episode_num, evaluate=False):
        # prepare to save the evaluation replay
        if self.args.replay_dir != '' and evaluate and episode_num == 0:
            self.env.close()
        # initialize variables; HER additionally needs the goal to be recorded
        self.env.reset()
        done = False
        info = None
        win = False

        last_action = np.zeros((self.args.n_agents, self.args.n_actions))
        # epsilon annealing
        epsilon = 0 if evaluate else self.args.epsilon
        # how epsilon is annealed
        if self.args.epsilon_anneal_scale == 'episode' or \
                (self.args.epsilon_anneal_scale == 'itr' and episode_num == 0):
            epsilon = epsilon - self.args.epsilon_decay if epsilon > self.args.min_epsilon else epsilon

        # buffers that record the information of one episode
        episode_buffer = None
        if not evaluate:
            episode_buffer = {
                'o': np.zeros([self.args.episode_limit, self.args.n_agents, self.args.obs_shape]),
                's': np.zeros([self.args.episode_limit, self.args.state_shape]),
                'a': np.zeros([self.args.episode_limit, self.args.n_agents, 1]),
                'onehot_a': np.zeros([self.args.episode_limit, self.args.n_agents, self.args.n_actions]),
                'avail_a': np.zeros([self.args.episode_limit, self.args.n_agents, self.args.n_actions]),
                'r': np.zeros([self.args.episode_limit, 1]),
                'next_o': np.zeros([self.args.episode_limit, self.args.n_agents, self.args.obs_shape]),
                'next_s': np.zeros([self.args.episode_limit, self.args.state_shape]),
                'next_avail_a': np.zeros([self.args.episode_limit, self.args.n_agents, self.args.n_actions]),
                'done': np.ones([self.args.episode_limit, 1]),
                'padded': np.ones([self.args.episode_limit, 1])
            }

        # start running an episode
        states, former_states = [], []
        obs = self.env.get_obs()
        if self.args.her:
            obs = np.concatenate((obs, self.env.goal), axis=1)
        state = self.env.get_state()
        if self.args.her:
            states.append(self.env.state)
            former_states.append(self.env.former_states)
        avail_actions = []
        self.agents.policy.init_hidden(1)
        for agent_id in range(self.args.n_agents):
            avail_action = self.env.get_avail_agent_actions(agent_id)
            avail_actions.append(avail_action)

        episode_reward = 0
        for step in range(self.args.episode_limit):
            if done:
                break
            else:
                actions, onehot_actions = [], []
                for agent_id in range(self.args.n_agents):
                    # avail_action = self.env.get_avail_agent_actions(agent_id)
                    action, _ = self.agents.choose_action(obs[agent_id], last_action[agent_id], agent_id,
                                                          avail_actions[agent_id], epsilon, evaluate)
                    # one-hot encoding of this action
                    onehot_action = np.zeros(self.args.n_actions)
                    onehot_action[action] = 1
                    onehot_actions.append(onehot_action)
                    # add to the joint action
                    actions.append(action)
                    # avail_actions.append(avail_action)
                    # record this action
                    last_action[agent_id] = onehot_action
                # execute the joint action in the environment
                reward, done, info = self.env.step(actions)
                # count the time step
                if not evaluate:
                    self.time_steps += 1
                # get the information after the transition
                if not done:
                    next_obs = self.env.get_obs()
                    if self.args.her:
                        next_obs = np.concatenate((next_obs, self.env.goal), axis=1)
                    next_state = self.env.get_state()
                    if self.args.her:
                        states.append(self.env.state)
                        former_states.append(self.env.former_states)
                else:
                    next_obs = obs
                    next_state = state
                # collect the available actions of the next step
                next_avail_actions = []
                for agent_id in range(self.args.n_agents):
                    avail_action = self.env.get_avail_agent_actions(agent_id)
                    next_avail_actions.append(avail_action)
                # store the experience
                if not evaluate:
                    episode_buffer['o'][step] = obs
                    episode_buffer['s'][step] = state
                    episode_buffer['a'][step] = np.reshape(actions, [self.args.n_agents, 1])
                    episode_buffer['onehot_a'][step] = onehot_actions
                    episode_buffer['avail_a'][step] = avail_actions
                    episode_buffer['r'][step] = [reward]
                    episode_buffer['next_o'][step] = next_obs
                    episode_buffer['next_s'][step] = next_state
                    episode_buffer['next_avail_a'][step] = next_avail_actions
                    episode_buffer['done'][step] = [done]
                    episode_buffer['padded'][step] = [0.]
                # update variables
                episode_reward += reward
                obs = next_obs
                state = next_state
                avail_actions = next_avail_actions
                if self.args.epsilon_anneal_scale == 'step':
                    epsilon = epsilon - self.args.epsilon_decay if epsilon > self.args.min_epsilon else epsilon
        # during training, keep the annealed epsilon
        if not evaluate:
            self.args.epsilon = epsilon
        # read the battle result
        if info.__contains__('battle_won'):
            win = True if done and info['battle_won'] else False
        if evaluate and episode_num == self.args.evaluate_num - 1 and self.args.replay_dir != '':
            self.env.save_replay()
            self.env.close()
        if not evaluate and self.args.her:
            return episode_buffer, states, former_states
        return episode_buffer, episode_reward, win

    def run(self):
        train_steps = 0
        early_stop = 10
        num_eval = 0
        self.max_win_rate = 0
        self.time_steps = 0
        last_test_step = 0
        begin_time = None
        begin_step = None
        # for itr in range(self.args.n_itr):
        while self.time_steps < self.args.max_steps:
            if begin_step is None:
                begin_time = datetime.utcnow().astimezone(timezone(timedelta(hours=8)))
                begin_step = self.time_steps
            # collect n_episodes episodes of data
            if self.args.her:
                episode_batch, states, former_states = self.generate_episode(0)
                self.her_k(episode_batch, states, former_states)
            else:
                episode_batch, _, _ = self.generate_episode(0)
            for key in episode_batch.keys():
                episode_batch[key] = np.array([episode_batch[key]])
            for e in range(1, self.args.n_episodes):
                if self.args.her:
                    episode_batch, states, former_states = self.generate_episode(e)
                    self.her_k(episode_batch, states, former_states)
                else:
                    episode, _, _ = self.generate_episode(e)
                    for key in episode_batch.keys():
                        episode[key] = np.array([episode[key]])
                        episode_batch[key] = np.concatenate((episode_batch[key], episode[key]), axis=0)
            # add to the replay buffer
            self.replay_buffer.store(episode_batch)
            # training  TODO 12.5
            if self.replay_buffer.size < self.args.batch_size * self.args.bs_rate:
                print('replay buffer is not yet batch size * {} large!'.format(self.args.bs_rate))
                begin_time = None
                begin_step = None
                continue
            for _ in range(self.args.train_steps):
                batch = self.replay_buffer.sample(self.args.batch_size)
                self.agents.train(batch, train_steps)
                train_steps += 1
            # periodic evaluation
            # if itr % self.args.evaluation_period == 0:
            if (self.time_steps - last_test_step) / self.args.evaluation_steps_period >= 1.0:
                num_eval += 1
                last_test_step = self.time_steps
                print(f'process {self.pid}: {self.time_steps} step / {self.args.max_steps} steps')
                # print('power: {}'.format(self.agents.policy.power))
                win_rate, episodes_reward = self.evaluate()
                # save the test results
                self.evaluate_itr.append(self.time_steps)
                self.win_rates.append(win_rate)
                self.episodes_rewards.append(episodes_reward)
                # models that perform well are saved separately
                if win_rate > self.max_win_rate:
                    self.max_win_rate = win_rate
                    self.agents.policy.save_model(str(win_rate))
                # do not save every evaluation, to reduce the time cost
                if num_eval % 50 == 0:
                    self.save_results()
                    self.plot()
                    # log how long the last 50 evaluations took and estimate the remaining time
                    now = datetime.utcnow().astimezone(timezone(timedelta(hours=8)))
                    elapsed_time = now - begin_time
                    expected_remain_time = (elapsed_time / (self.time_steps - begin_step)) * \
                        (self.args.max_steps - self.time_steps)
                    expected_end_time = now + expected_remain_time
                    print("estimated remaining time: {}".format(str(expected_remain_time)))
                    print("estimated finish time: {}".format(
                        expected_end_time.strftime("%Y-%m-%d_%H-%M-%S")))
        # finally, save everything
        self.save_results()
        self.plot()
        self.env.close()

    def evaluate(self):
        """
        Return the average win rate and the cumulative reward of every test episode,
        which makes it easy to plot error-shaded curves.
        :return:
        """
        win_number = 0
        episodes_reward = []
        for itr in range(self.args.evaluate_num):
            if self.args.didactic:
                episode_reward, win = self.get_eval_qtot()
            else:
                _, episode_reward, win = self.generate_episode(itr, evaluate=True)
            episodes_reward.append(episode_reward)
            if win:
                win_number += 1
        return win_number / self.args.evaluate_num, episodes_reward

    def save_results(self):
        """
        Save the data so that the results of several algorithms can later be
        compared in a single figure.
        :return:
        """
        # delete existing files first
        for filename in os.listdir(self.save_path):
            if filename.endswith('.npy'):
                os.remove(self.save_path + '/' + filename)
        np.save(self.save_path + '/evaluate_itr.npy', self.evaluate_itr)
        if self.args.didactic and self.args.power is None and 'strapped' in self.args.alg:
            np.save(self.save_path + '/train_steps.npy', self.agents.policy.train_steps)
            np.save(self.save_path + '/differences.npy', self.agents.policy.differences)
        else:
            np.save(self.save_path + '/win_rates.npy', self.win_rates)
        np.save(self.save_path + '/episodes_rewards.npy', self.episodes_rewards)

    def plot(self):
        """
        Plot periodically.
        :return:
        """
        fig = plt.figure()
        ax1 = fig.add_subplot(211)
        if self.args.didactic and self.args.power is None and 'strapped' in self.args.alg:
            win_x = np.array(self.agents.policy.train_steps)[:, None] / 1000000.
            win_y = np.array(self.agents.policy.differences)[:, None]
            plot_win = pd.DataFrame(np.concatenate((win_x, win_y), axis=1),
                                    columns=['T (mil)', self.args.which_diff])
            sns.lineplot(x="T (mil)", y=self.args.which_diff, data=plot_win, ax=ax1)
        else:
            win_x = np.array(self.evaluate_itr)[:, None] / 1000000.
            win_y = np.array(self.win_rates)[:, None]
            plot_win = pd.DataFrame(np.concatenate((win_x, win_y), axis=1),
                                    columns=['T (mil)', 'Test Win'])
            sns.lineplot(x="T (mil)", y="Test Win", data=plot_win, ax=ax1)

        ax2 = fig.add_subplot(212)
        reward_x = np.repeat(self.evaluate_itr, self.args.evaluate_num)[:, None] / 1000000.
        reward_y = np.array(self.episodes_rewards).flatten()[:, None]
        plot_reward = pd.DataFrame(np.concatenate((reward_x, reward_y), axis=1),
                                   columns=['T (mil)', 'Median Test Returns'])
        sns.lineplot(x="T (mil)", y="Median Test Returns", data=plot_reward, ax=ax2,
                     ci='sd', estimator=np.median)

        plt.tight_layout()
        # format the timestamp as 2016-03-20-11_45_39
        # tag = self.args.alg + '-' + time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
        tag = self.args.alg + '_' + str(self.args.target_update_period)
        # if 'averaged' in self.args.alg:
        tag += (self.alg_tag + '_' + datetime.utcnow().astimezone(
            timezone(timedelta(hours=8))).strftime("%Y-%m-%d_%H-%M-%S"))
        # delete existing images first
        for filename in os.listdir(self.save_path):
            if filename.endswith('.png'):
                os.remove(self.save_path + '/' + filename)
        fig.savefig(self.save_path + "/%s.png" % tag)
        plt.close()

    def get_eval_qtot(self):
        """
        Compute the evaluation Q_tot.
        """
        self.env.reset()
        all_last_action = np.zeros((self.args.n_agents, self.args.n_actions))
        # run one greedy evaluation step
        all_obs = self.env.get_obs()
        state = self.env.get_state()
        avail_actions = []
        self.agents.policy.init_hidden(1)

        eval_qs = []
        actions = []
        one_hot_actions = []
        hidden_evals = None
        for agent_idx in range(self.args.n_agents):
            obs = all_obs[agent_idx]
            last_action = all_last_action[agent_idx]
            avail_action = self.env.get_avail_agent_actions(agent_idx)
            avail_actions.append(avail_action)
            onehot_agent_idx = np.zeros(self.args.n_agents)
            onehot_agent_idx[agent_idx] = 1.
            if self.args.last_action:
                # stack horizontally
                obs = np.hstack((obs, last_action))
            if self.args.reuse_network:
                obs = np.hstack((obs, onehot_agent_idx))
            hidden_state = self.agents.policy.eval_hidden[:, agent_idx, :]
            # add a batch dimension
            obs = torch.Tensor(obs).unsqueeze(0)
            # whether to use the GPU
            if self.args.cuda:
                obs = obs.cuda()
                hidden_state = hidden_state.cuda()
            # get Q(s, a)
            qsa, hidden_eval = self.agents.policy.eval_rnn(obs, hidden_state)
            qsa[avail_action == 0.0] = -float("inf")
            eval_qs.append(torch.max(qsa))
            action = torch.argmax(qsa)
            actions.append(action)
            onehot_action = np.zeros(self.args.n_actions)
            onehot_action[action] = 1
            one_hot_actions.append(onehot_action)
            if hidden_evals is None:
                hidden_evals = hidden_eval
            else:
                hidden_evals = torch.cat([hidden_evals, hidden_eval], dim=0)

        s = torch.Tensor(state)
        eval_qs = torch.Tensor(eval_qs).unsqueeze(0)
        actions = torch.Tensor(actions).unsqueeze(0)
        one_hot_actions = torch.Tensor(one_hot_actions).unsqueeze(0)
        hidden_evals = hidden_evals.unsqueeze(0)
        # whether to use the GPU
        if self.args.cuda:
            s = s.cuda()
            eval_qs = eval_qs.cuda()
            actions = actions.cuda()
            one_hot_actions = one_hot_actions.cuda()
            hidden_evals = hidden_evals.cuda()
        # compute Q_tot
        eval_q_total = None
        if self.args.alg == 'qatten':
            eval_q_total, _, _ = self.agents.policy.eval_mix_net(eval_qs, s, actions)
        elif self.args.alg == 'qmix' \
                or 'wqmix' in self.args.alg \
                or 'strapped' in self.args.alg:
            eval_q_total = self.agents.policy.eval_mix_net(eval_qs, s)
        elif 'dmaq' in self.args.alg:
            if self.args.alg == "dmaq_qatten":
                ans_chosen, _, _ = self.agents.policy.mixer(eval_qs, s, is_v=True)
                ans_adv, _, _ = self.agents.policy.mixer(eval_qs, s, actions=one_hot_actions,
                                                         max_q_i=eval_qs, is_v=False)
                eval_q_total = ans_chosen + ans_adv
            else:
                ans_chosen = self.agents.policy.mixer(eval_qs, s, is_v=True)
                ans_adv = self.agents.policy.mixer(eval_qs, s, actions=one_hot_actions,
                                                   max_q_i=eval_qs, is_v=False)
                eval_q_total = ans_chosen + ans_adv
        elif self.args.alg == 'qtran_base':
            one_hot_actions = one_hot_actions.unsqueeze(0)
            hidden_evals = hidden_evals.unsqueeze(0)
            eval_q_total = self.agents.policy.eval_joint_q(s, hidden_evals, one_hot_actions)
        eval_q_total = eval_q_total.squeeze().item()
        return eval_q_total, 0

    def her_k(self, episode, states, former_states):
        import copy
        for _ in range(self.args.her):
            episode_buffer = {
                'o': np.zeros([self.args.episode_limit, self.args.n_agents, self.args.obs_shape]),
                's': np.zeros([self.args.episode_limit, self.args.state_shape]),
                'a': np.zeros([self.args.episode_limit, self.args.n_agents, 1]),
                'onehot_a': np.zeros([self.args.episode_limit, self.args.n_agents, self.args.n_actions]),
                'avail_a': np.zeros([self.args.episode_limit, self.args.n_agents, self.args.n_actions]),
                'r': np.zeros([self.args.episode_limit, 1]),
                'next_o': np.zeros([self.args.episode_limit, self.args.n_agents, self.args.obs_shape]),
                'next_s': np.zeros([self.args.episode_limit, self.args.state_shape]),
                'next_avail_a': np.zeros([self.args.episode_limit, self.args.n_agents, self.args.n_actions]),
                'done': np.ones([self.args.episode_limit, 1]),
                'padded': np.ones([self.args.episode_limit, 1])
            }
            # regenerate the goal, order and related information
            self.env.reset()
            # rebuild the whole episode with the newly generated goals
            for i in range(len(episode)):
                reward = self.env.get_reward(states[i], former_states[i])
                done = episode['done'][i]
                if reward >= 0:
                    reward = 0
                    done = True
                episode_buffer['o'][i] = episode['o'][i]
                episode_buffer['o'][i, :, -2:] = np.array(self.env.goal)[:]
                episode_buffer['s'][i] = episode['s'][i]
                episode_buffer['a'][i] = episode['a'][i]
                episode_buffer['onehot_a'][i] = episode['onehot_a'][i]
                episode_buffer['avail_a'][i] = episode['avail_a'][i]
                episode_buffer['r'][i] = [reward]
                episode_buffer['next_o'][i] = episode['next_o'][i]
                episode_buffer['next_o'][i, :, -2:] = np.array(self.env.goal)[:]
                episode_buffer['next_s'][i] = episode['next_s'][i]
                episode_buffer['next_avail_a'][i] = episode['next_avail_a'][i]
                episode_buffer['done'][i] = [done]
                episode_buffer['padded'][i] = [0.]
                if done:
                    break
            for key in episode_buffer.keys():
                episode_buffer[key] = np.array([episode_buffer[key]])
            self.replay_buffer.store(episode_buffer)
def main(env_name='KungFuMasterNoFrameskip-v0', train_freq=4, target_update_freq=10000,
         checkpoint_freq=100000, log_freq=1, batch_size=32, train_after=200000,
         max_timesteps=5000000, buffer_size=50000, vmin=-10, vmax=10, n=51, gamma=0.99,
         final_eps=0.1, final_eps_update=1000000, learning_rate=0.00025, momentum=0.95):
    env = gym.make(env_name)
    env = wrap_env(env)

    state_dim = (4, 84, 84)
    action_count = env.action_space.n

    with C.default_options(activation=C.relu, init=C.he_uniform()):
        model_func = Sequential([
            Convolution2D((8, 8), 32, strides=4, name='conv1'),
            Convolution2D((4, 4), 64, strides=2, name='conv2'),
            Convolution2D((3, 3), 64, strides=1, name='conv3'),
            Dense(512, name='dense1'),
            Dense((action_count, n), activation=None, name='out')
        ])

    agent = CategoricalAgent(state_dim, action_count, model_func, vmin, vmax, n, gamma,
                             lr=learning_rate, mm=momentum, use_tensorboard=True)
    logger = agent.writer

    epsilon_schedule = LinearSchedule(1.0, final_eps, final_eps_update)
    replay_buffer = ReplayBuffer(buffer_size)

    try:
        obs = env.reset()
        episode = 0
        rewards = 0
        steps = 0

        for t in range(max_timesteps):
            # Take action
            if t > train_after:
                action = agent.act(obs, epsilon=epsilon_schedule.value(t))
            else:
                action = np.random.choice(action_count)
            obs_, reward, done, _ = env.step(action)

            # Store transition in replay buffer
            replay_buffer.add(obs, action, reward, obs_, float(done))

            obs = obs_
            rewards += reward

            if t > train_after and (t % train_freq) == 0:
                # Minimize error in projected Bellman update on a batch sampled from replay buffer
                experience = replay_buffer.sample(batch_size)
                agent.train(*experience)  # experience is (s, a, r, s_, t) tuple
                logger.write_value('loss', agent.trainer.previous_minibatch_loss_average, t)

            if t > train_after and (t % target_update_freq) == 0:
                agent.update_target()

            if t > train_after and (t % checkpoint_freq) == 0:
                agent.checkpoint('checkpoints/model_{}.chkpt'.format(t))

            if done:
                episode += 1
                obs = env.reset()

                if episode % log_freq == 0:
                    steps = t - steps + 1
                    logger.write_value('rewards', rewards, episode)
                    logger.write_value('steps', steps, episode)
                    logger.write_value('epsilon', epsilon_schedule.value(t), episode)
                    logger.flush()

                    rewards = 0
                    steps = t
    finally:
        agent.save_model('checkpoints/{}.cdqn'.format(env_name))
def main():
    env = MultiEnvRunnerWrapper(ENV_NUM, CMOTP)
    lucky_no = RANDOM_SEED
    set_seed(lucky_no)

    agent1 = LenientDQNAgent(env.envs[0], ENV_NUM, [256, 256], 'LenientAgent1',
                             learning_rate=1e-4, use_tau=True, tau=1e-3, mu=MAX_U,
                             logdir='logs/logs1_ERM_{}_{}_{}_{}'.format(ENV_NUM, STEP_N, RANDOM_SEED, TRAIN_TIMES),
                             savedir='save/save1_ERM_{}_{}_{}_{}'.format(ENV_NUM, STEP_N, RANDOM_SEED, TRAIN_TIMES),
                             auto_save=False, discount=GAMMA)
    agent2 = LenientDQNAgent(env.envs[0], ENV_NUM, [256, 256], 'LenientAgent2',
                             learning_rate=1e-4, use_tau=True, tau=1e-3, mu=MAX_U,
                             logdir='logs/logs2_ERM_{}_{}_{}_{}'.format(ENV_NUM, STEP_N, RANDOM_SEED, TRAIN_TIMES),
                             savedir='save/save2_ERM_{}_{}_{}_{}'.format(ENV_NUM, STEP_N, RANDOM_SEED, TRAIN_TIMES),
                             auto_save=False, discount=GAMMA)
    erm1 = ReplayBuffer(ERM_FACTOR * ENV_NUM * STEP_N)
    erm2 = ReplayBuffer(ERM_FACTOR * ENV_NUM * STEP_N)
    print('after init')
    begintime = time.time()

    if TRAIN:
        train_input_shape = (ENV_NUM * STEP_N,) + env.envs[0].observation_space.shape
        episodes_1 = [[] for _ in range(ENV_NUM)]
        episodes_2 = [[] for _ in range(ENV_NUM)]
        states_1, states_2 = env.reset()
        ep_cnt = 0
        ep_len_log = []
        min_len = 10000.
        train_num = 0
        train_log = []
        # for a given state, record its temperature value in every environment
        temp_log = [[] for _ in range(ENV_NUM)]

        while len(ep_len_log) < TRAIN_EPISODES:
            sts_1 = [[] for _ in range(ENV_NUM)]
            acts_1 = [[] for _ in range(ENV_NUM)]
            rwds_1 = [[] for _ in range(ENV_NUM)]
            n_sts_1 = [[] for _ in range(ENV_NUM)]
            dns_1 = [[] for _ in range(ENV_NUM)]
            ln_1 = [[] for _ in range(ENV_NUM)]
            sts_2 = [[] for _ in range(ENV_NUM)]
            acts_2 = [[] for _ in range(ENV_NUM)]
            rwds_2 = [[] for _ in range(ENV_NUM)]
            n_sts_2 = [[] for _ in range(ENV_NUM)]
            dns_2 = [[] for _ in range(ENV_NUM)]
            ln_2 = [[] for _ in range(ENV_NUM)]

            # get a batch of train data
            for j in range(ENV_NUM):
                for k in range(STEP_N):
                    action_1 = agent1.choose_action(states_1[j], j)
                    action_2 = agent2.choose_action(states_2[j], j)
                    action_n = [action_1, action_2]
                    next_state, reward, done, _ = env.envs[j].step([action_1, action_2])
                    next_state_1, next_state_2 = next_state
                    reward_1, reward_2 = reward
                    done_1, done_2 = done
                    episodes_1[j].append((states_1[j], action_1))
                    episodes_2[j].append((states_2[j], action_2))

                    sts_1[j].append(states_1[j])
                    acts_1[j].append(action_1)
                    rwds_1[j].append(reward_1)
                    n_sts_1[j].append(next_state_1)
                    dns_1[j].append(done_1)
                    ln_1[j].append(agent1.leniency_calculator.calc_leniency(
                        agent1.temp_recorders[j].get_state_temp(states_1[j])))

                    sts_2[j].append(states_2[j])
                    acts_2[j].append(action_2)
                    rwds_2[j].append(reward_2)
                    n_sts_2[j].append(next_state_2)
                    dns_2[j].append(done_2)
                    ln_2[j].append(agent2.leniency_calculator.calc_leniency(
                        agent2.temp_recorders[j].get_state_temp(states_2[j])))

                    states_1[j] = next_state_1
                    states_2[j] = next_state_2

                    if done_1:
                        states_1[j], states_2[j] = env.envs[j].reset()
                        agent1.temp_recorders[j].decay_temp(episodes_1[j])
                        agent2.temp_recorders[j].decay_temp(episodes_2[j])
                        ep_cnt += 1
                        this_train_log = (train_num, ep_cnt, j,
                                          agent1.temp_recorders[j].get_ave_temp(),
                                          agent1.temp_recorders[j].get_temp_len(),
                                          len(episodes_1[j]))
                        train_log.append(this_train_log)
                        print('train_num: {}, episode_cnt: {}, env: {} , mean_temp: {}, temp_len: {}, len: {} '.format(
                            *this_train_log))
                        checked_temp = agent1.temp_recorders[j].show_temp(big=True, narrow=False)
                        temp_log[j].append(checked_temp)
                        if ep_cnt % 100 == 0:
                            print('testing...')
                            print('average episode length: ',
                                  test(agent1, agent2, render=False, load_model=False))
                        ep_len_log.append(len(episodes_1[j]))
                        tmp = np.mean(ep_len_log[-10:])
                        if tmp < min_len:
                            print('update min_len with ', tmp)
                            min_len = tmp
                            agent1.save_model()
                            agent2.save_model()
                        episodes_1[j] = []
                        episodes_2[j] = []

            # discount reward
            last_values_1 = agent1.get_max_target_Q_s_a(states_1)
            for j, (rwd_j, dn_j, l_v_j) in enumerate(zip(rwds_1, dns_1, last_values_1)):
                if type(rwd_j) is np.ndarray:
                    rwd_j = rwd_j.tolist()
                if type(dn_j) is np.ndarray:
                    dn_j = dn_j.tolist()
                if dn_j[-1] == 0:
                    rwd_j = discount_with_dones(rwd_j + [l_v_j], dn_j + [0], GAMMA)[:-1]
                else:
                    rwd_j = discount_with_dones(rwd_j, dn_j, GAMMA)
                rwds_1[j] = rwd_j
            last_values_2 = agent2.get_max_target_Q_s_a(states_2)
            for j, (rwd_j, dn_j, l_v_j) in enumerate(zip(rwds_2, dns_2, last_values_2)):
                if type(rwd_j) is np.ndarray:
                    rwd_j = rwd_j.tolist()
                if type(dn_j) is np.ndarray:
                    dn_j = dn_j.tolist()
                if dn_j[-1] == 0:
                    rwd_j = discount_with_dones(rwd_j + [l_v_j], dn_j + [0], GAMMA)[:-1]
                else:
                    rwd_j = discount_with_dones(rwd_j, dn_j, GAMMA)
                rwds_2[j] = rwd_j

            # flatten
            sts_1 = np.asarray(sts_1, dtype=np.float32).reshape(train_input_shape)
            acts_1 = np.asarray(acts_1, dtype=np.int32).flatten()
            rwds_1 = np.asarray(rwds_1, dtype=np.float32).flatten()
            n_sts_1 = np.asarray(n_sts_1, dtype=np.float32).reshape(train_input_shape)
            dns_1 = np.asarray(dns_1, dtype=np.bool).flatten()
            ln_1 = np.asarray(ln_1, dtype=np.float32).flatten()
            sts_2 = np.asarray(sts_2, dtype=np.float32).reshape(train_input_shape)
            acts_2 = np.asarray(acts_2, dtype=np.int32).flatten()
            rwds_2 = np.asarray(rwds_2, dtype=np.float32).flatten()
            n_sts_2 = np.asarray(n_sts_2, dtype=np.float32).reshape(train_input_shape)
            dns_2 = np.asarray(dns_2, dtype=np.bool).flatten()
            ln_2 = np.asarray(ln_2, dtype=np.float32).flatten()

            # train
            agent1.train_without_replaybuffer(sts_1, acts_1, rwds_1, ln_1)
            agent2.train_without_replaybuffer(sts_2, acts_2, rwds_2, ln_2)
            train_num += 1

            # store these transitions to ERM
            for ii, (s1, a1, td1, l1) in enumerate(zip(sts_1, acts_1, rwds_1, ln_1)):
                erm1.add(s1, a1, td1, [], l1)
            for ii, (s2, a2, td2, l2) in enumerate(zip(sts_2, acts_2, rwds_2, ln_2)):
                erm2.add(s2, a2, td2, [], l2)
            # print(sts_1)
            # print(acts_1)
            # print(rwds_1)
            # print(ln_1)
            # print('----------------------')
            # erm1.show()
            # exit()

            # train with transitions from ERM
            for ii in range(ERM_TRAIN_NUM):
                erm_s1, erm_a1, erm_td1, _, erm_l1 = erm1.sample(ENV_NUM * STEP_N)
                erm_s2, erm_a2, erm_td2, _, erm_l2 = erm2.sample(ENV_NUM * STEP_N)
                # print('*************************')
                # print(erm_s1)
                # print(erm_a1)
                # print(erm_td1)
                # print(erm_l1)
                # exit()
                agent1.train_without_replaybuffer(erm_s1, erm_a1, erm_td1, erm_l1)
                agent2.train_without_replaybuffer(erm_s2, erm_a2, erm_td2, erm_l2)
                train_num += 1

        endtime = time.time()
        print('training time: {}'.format(endtime - begintime))
        with open('./train_log.txt', 'a') as f:
            f.write('ERM num_env: {}, n_step: {}, rand_seed: {}, episodes: {}, training time: {}'.format(
                ENV_NUM, STEP_N, RANDOM_SEED, TRAIN_TIMES, endtime - begintime) + '\n')
        # np.save('ep_len_{}_{}_{}.npy'.format(ENV_NUM, STEP_N, lucky_no), ep_len_log)
        train_log = np.array(train_log)
        np.save('train_log_ERM_{}_{}_{}_{}.npy'.format(ENV_NUM, STEP_N, lucky_no, TRAIN_TIMES), train_log)
        temp_log = np.array(temp_log)
        np.save('temp_log_ERM_{}_{}_{}_{}.npy'.format(ENV_NUM, STEP_N, lucky_no, TRAIN_TIMES), temp_log)
    else:
        test(agent1, agent2, render=True, load_model=True)

    env.close()
class Runner:
    def __init__(self, env, args, itr):
        # hyper-parameters
        self.args = args
        # environment
        self.env = env
        # process index (several instances can run in parallel)
        self.pid = itr
        self.agents = Agents(args, itr=itr)
        # if networks were not shared, there would be one Agents instance per agent:
        # if not self.args.reuse_network:
        #     self.agents = [Agents(self.args, i) for i in range(self.args.n_agents)]
        self.replay_buffer = ReplayBuffer(self.args)
        self.win_rates = []
        '''
        Naming convention:
        episode_reward   - cumulative reward of a single episode
        episodes_reward  - cumulative rewards of several episodes (one evaluation)
        episodes_rewards - cumulative rewards of several episodes over several evaluations
        '''
        self.episodes_rewards = []
        self.evaluate_itr = []
        self.max_win_rate = 0
        # where results and models are saved; the index allows several runs at once
        self.save_path = self.args.result_dir + '/' + self.args.alg + '/' + self.args.map + '/' + str(itr)
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        print('runner initialised')

    def generate_episode(self, episode_num, evaluate=False):
        # prepare to save a replay of the evaluation episodes
        if self.args.replay_dir != '' and evaluate and episode_num == 0:
            self.env.close()
        # reset episode variables
        self.env.reset()
        done = False
        info = None
        win = False
        last_action = np.zeros((self.args.n_agents, self.args.n_actions))
        # epsilon annealing (no exploration during evaluation)
        epsilon = 0 if evaluate else self.args.epsilon
        if self.args.epsilon_anneal_scale == 'episode' or \
                (self.args.epsilon_anneal_scale == 'itr' and episode_num == 0):
            epsilon = epsilon - self.args.epsilon_decay if epsilon > self.args.min_epsilon else epsilon
        # buffer holding one episode's transitions
        episode_buffer = None
        if not evaluate:
            episode_buffer = {
                'o': np.zeros([self.args.episode_limit, self.args.n_agents, self.args.obs_shape]),
                's': np.zeros([self.args.episode_limit, self.args.state_shape]),
                'a': np.zeros([self.args.episode_limit, self.args.n_agents, 1]),
                'onehot_a': np.zeros([self.args.episode_limit, self.args.n_agents, self.args.n_actions]),
                'avail_a': np.zeros([self.args.episode_limit, self.args.n_agents, self.args.n_actions]),
                'r': np.zeros([self.args.episode_limit, 1]),
                'next_o': np.zeros([self.args.episode_limit, self.args.n_agents, self.args.obs_shape]),
                'next_s': np.zeros([self.args.episode_limit, self.args.state_shape]),
                'next_avail_a': np.zeros([self.args.episode_limit, self.args.n_agents, self.args.n_actions]),
                'done': np.ones([self.args.episode_limit, 1]),
                'padded': np.ones([self.args.episode_limit, 1])
            }
        # roll out one episode
        obs = self.env.get_obs()
        state = self.env.get_state()
        avail_actions = []
        self.agents.policy.init_hidden(1)
        for agent_id in range(self.args.n_agents):
            avail_action = self.env.get_avail_agent_actions(agent_id)
            avail_actions.append(avail_action)
        episode_reward = 0
        for step in range(self.args.episode_limit):
            if done:
                break
            actions, onehot_actions = [], []
            for agent_id in range(self.args.n_agents):
                action = self.agents.choose_action(obs[agent_id], last_action[agent_id],
                                                   agent_id, avail_actions[agent_id],
                                                   epsilon, evaluate)
                # one-hot encoding of the chosen action
                onehot_action = np.zeros(self.args.n_actions)
                onehot_action[action] = 1
                onehot_actions.append(onehot_action)
                # build the joint action and remember this agent's last action
                actions.append(action)
                last_action[agent_id] = onehot_action
            # apply the joint action
            reward, done, info = self.env.step(actions)
            # fetch the next observation and state
            if not done:
                next_obs = self.env.get_obs()
                next_state = self.env.get_state()
            else:
                next_obs = obs
                next_state = state
            # available actions in the next step
            next_avail_actions = []
            for agent_id in range(self.args.n_agents):
                avail_action = self.env.get_avail_agent_actions(agent_id)
                next_avail_actions.append(avail_action)
            # store the transition
            if not evaluate:
                episode_buffer['o'][step] = obs
                episode_buffer['s'][step] = state
                episode_buffer['a'][step] = np.reshape(actions, [self.args.n_agents, 1])
                episode_buffer['onehot_a'][step] = onehot_actions
                episode_buffer['avail_a'][step] = avail_actions
                episode_buffer['r'][step] = [reward]
                episode_buffer['next_o'][step] = next_obs
                episode_buffer['next_s'][step] = next_state
                episode_buffer['next_avail_a'][step] = next_avail_actions
                episode_buffer['done'][step] = [done]
                episode_buffer['padded'][step] = [0.]
            # update loop variables
            episode_reward += reward
            obs = next_obs
            state = next_state
            avail_actions = next_avail_actions
            if self.args.epsilon_anneal_scale == 'step':
                epsilon = epsilon - self.args.epsilon_decay if epsilon > self.args.min_epsilon else epsilon
        # during training, keep the annealed epsilon for the next episode
        if not evaluate:
            self.args.epsilon = epsilon
        # outcome of the episode
        if info.__contains__('battle_won'):
            win = True if done and info['battle_won'] else False
        if evaluate and episode_num == self.args.evaluate_num - 1 and self.args.replay_dir != '':
            self.env.save_replay()
            self.env.close()
        return episode_buffer, episode_reward, win

    def run(self):
        train_steps = 0
        early_stop = 10  # currently unused
        num_eval = 0
        self.max_win_rate = 0
        for itr in range(self.args.n_itr):
            # collect n_episodes episodes of data
            episode_batch, _, _ = self.generate_episode(0)
            for key in episode_batch.keys():
                episode_batch[key] = np.array([episode_batch[key]])
            for e in range(1, self.args.n_episodes):
                episode, _, _ = self.generate_episode(e)
                for key in episode_batch.keys():
                    episode[key] = np.array([episode[key]])
                    episode_batch[key] = np.concatenate((episode_batch[key], episode[key]), axis=0)
            # add the batch to the replay buffer
            self.replay_buffer.store(episode_batch)
            # train only once the replay buffer is large enough
            if self.replay_buffer.size < self.args.batch_size * 12.5:
                continue
            for _ in range(self.args.train_steps):
                batch = self.replay_buffer.sample(self.args.batch_size)
                self.agents.train(batch, train_steps)
                train_steps += 1
            # periodic evaluation
            if itr % self.args.evaluation_period == 0:
                num_eval += 1
                print(f'process {self.pid}: {itr} / {self.args.n_itr}')
                win_rate, episodes_reward = self.evaluate()
                self.evaluate_itr.append(itr)
                self.win_rates.append(win_rate)
                self.episodes_rewards.append(episodes_reward)
                # save models that improve on the best win rate so far
                if win_rate > self.max_win_rate:
                    self.max_win_rate = win_rate
                    self.agents.policy.save_model(str(win_rate))
                # save results only occasionally to limit I/O time
                if num_eval % 50 == 0:
                    self.save_results()
                    self.plot()
        # save everything at the end
        self.save_results()
        self.plot()
        self.env.close()

    def evaluate(self):
        """
        Return the average win rate and the cumulative reward of every test episode,
        which makes it easy to plot shaded error bands later.
        """
        win_number = 0
        episodes_reward = []
        for itr in range(self.args.evaluate_num):
            _, episode_reward, win = self.generate_episode(itr, evaluate=True)
            episodes_reward.append(episode_reward)
            if win:
                win_number += 1
        return win_number / self.args.evaluate_num, episodes_reward

    def save_results(self):
        """
        Save the raw results so several algorithms can later be compared in one figure.
        """
        # remove stale result files first
        for filename in os.listdir(self.save_path):
            if filename.endswith('.npy'):
                os.remove(self.save_path + '/' + filename)
        np.save(self.save_path + '/evaluate_itr.npy', self.evaluate_itr)
        np.save(self.save_path + '/win_rates.npy', self.win_rates)
        np.save(self.save_path + '/episodes_rewards.npy', self.episodes_rewards)

    def plot(self):
        """
        Plot the evaluation curves periodically.
        """
        fig = plt.figure()
        ax1 = fig.add_subplot(211)
        win_x = np.array(self.evaluate_itr)[:, None]
        win_y = np.array(self.win_rates)[:, None]
        plot_win = pd.DataFrame(np.concatenate((win_x, win_y), axis=1),
                                columns=['evaluate_itr', 'win_rates'])
        sns.lineplot(x="evaluate_itr", y="win_rates", data=plot_win, ax=ax1)
        ax2 = fig.add_subplot(212)
        reward_x = np.repeat(self.evaluate_itr, self.args.evaluate_num)[:, None]
        reward_y = np.array(self.episodes_rewards).flatten()[:, None]
        plot_reward = pd.DataFrame(np.concatenate((reward_x, reward_y), axis=1),
                                   columns=['evaluate_itr', 'episodes_rewards'])
        sns.lineplot(x="evaluate_itr", y="episodes_rewards", data=plot_reward,
                     ax=ax2, ci=68, estimator=np.median)
        # timestamp tag, e.g. 2016-03-20_11-45-39
        tag = self.args.alg + '-' + time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
        # remove stale figures first
        for filename in os.listdir(self.save_path):
            if filename.endswith('.png'):
                os.remove(self.save_path + '/' + filename)
        fig.savefig(self.save_path + "/%s.png" % tag)
        plt.close()
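# The Runner above assumes an episode-level replay buffer with `store`, `sample`
# and a `size` attribute. A minimal sketch of that interface; the class name,
# the `scheme` argument and the ring-buffer layout are assumptions based on how
# the buffer is used here, not the project's actual implementation.
import numpy as np

class EpisodeReplayBuffer:
    def __init__(self, capacity, episode_limit, scheme):
        # scheme maps field name -> per-step shape, e.g. {'o': (n_agents, obs_shape), 'r': (1,)}
        self.capacity = capacity
        self.buffers = {k: np.zeros((capacity, episode_limit) + shape) for k, shape in scheme.items()}
        self.size = 0      # number of stored episodes
        self._next = 0     # ring-buffer write pointer

    def store(self, episode_batch):
        n = episode_batch['r'].shape[0]                     # episodes in this batch
        idx = np.arange(self._next, self._next + n) % self.capacity
        for k in self.buffers:
            self.buffers[k][idx] = episode_batch[k]
        self._next = (self._next + n) % self.capacity
        self.size = min(self.size + n, self.capacity)

    def sample(self, batch_size):
        idx = np.random.randint(0, self.size, batch_size)   # sample whole episodes uniformly
        return {k: v[idx] for k, v in self.buffers.items()}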
class Rainbow(Trainer):
    def __init__(self, parameters):
        super(Rainbow, self).__init__(parameters)
        self.replay_buffer = ReplayBuffer(self.buffersize)

    def push_to_buffer(self, state, action, reward, next_state, done):
        self.replay_buffer.push(state, action, reward, next_state, done)

    def load_model(self):
        # input: (1, 84, 84) observation, output: one distribution per action (e.g. 6 actions)
        self.current_model = RainbowDQN(self.env.observation_space.shape[0],
                                        self.env.action_space.n, num_atoms, Vmin, Vmax)
        self.target_model = RainbowDQN(self.env.observation_space.shape[0],
                                       self.env.action_space.n, num_atoms, Vmin, Vmax)
        if USE_CUDA:
            self.current_model = self.current_model.cuda()
            self.target_model = self.target_model.cuda()
        self.update_target(self.current_model, self.target_model)  # sync the two networks

    def projection_distribution(self, next_state, rewards, dones):
        batch_size = next_state.size(0)
        delta_z = float(Vmax - Vmin) / (num_atoms - 1)
        support = torch.linspace(Vmin, Vmax, num_atoms)

        # greedy next action under the target network's expected value
        next_dist = self.target_model(next_state).data.cpu() * support
        next_action = next_dist.sum(2).max(1)[1]
        next_action = next_action.unsqueeze(1).unsqueeze(1).expand(next_dist.size(0), 1, next_dist.size(2))
        next_dist = next_dist.gather(1, next_action).squeeze(1)

        rewards = rewards.unsqueeze(1).expand_as(next_dist)
        dones = dones.unsqueeze(1).expand_as(next_dist)
        support = support.unsqueeze(0).expand_as(next_dist)

        # shift the support by Tz = r + gamma * (1 - done) * z and clamp to [Vmin, Vmax]
        # (0.99 is the hard-coded discount factor here)
        Tz = rewards + (1 - dones) * 0.99 * support
        Tz = Tz.clamp(min=Vmin, max=Vmax)
        b = (Tz - Vmin) / delta_z
        l = b.floor().long()
        u = b.ceil().long()

        # distribute each atom's probability mass onto its two neighbouring bins
        offset = torch.linspace(0, (batch_size - 1) * num_atoms, batch_size).long()\
            .unsqueeze(1).expand(batch_size, num_atoms)
        proj_dist = torch.zeros(next_dist.size())
        proj_dist.view(-1).index_add_(0, (l + offset).view(-1), (next_dist * (u.float() - b)).view(-1))
        proj_dist.view(-1).index_add_(0, (u + offset).view(-1), (next_dist * (b - l.float())).view(-1))
        return proj_dist

    def compute_td_loss(self, batch_size, *args):
        state, action, reward, next_state, done = self.replay_buffer.sample(batch_size)

        state = Variable(torch.FloatTensor(np.float32(state)))
        next_state = Variable(torch.FloatTensor(np.float32(next_state)))
        action = Variable(torch.LongTensor(action))
        reward = torch.FloatTensor(reward)
        done = torch.FloatTensor(np.float32(done))

        proj_dist = self.projection_distribution(next_state, reward, done)

        dist = self.current_model(state)
        action = action.unsqueeze(1).unsqueeze(1).expand(batch_size, 1, num_atoms)
        dist = dist.gather(1, action).squeeze(1)
        dist.data.clamp_(0.01, 0.99)
        # cross-entropy between the projected target distribution and the predicted one
        loss = -(Variable(proj_dist) * dist.log()).sum(1)
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.current_model.reset_noise()
        self.target_model.reset_noise()
        return loss
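# A small, self-contained NumPy sketch of the categorical (C51) projection that
# projection_distribution performs, written for a single transition. Vmin=-10,
# Vmax=10 and 51 atoms are the usual C51 defaults and are only illustrative.
import numpy as np

def project_single(next_dist, reward, done, gamma=0.99, v_min=-10.0, v_max=10.0, num_atoms=51):
    support = np.linspace(v_min, v_max, num_atoms)
    delta_z = (v_max - v_min) / (num_atoms - 1)
    # shift the support and clamp it back into [v_min, v_max]
    tz = np.clip(reward + gamma * (1.0 - done) * support, v_min, v_max)
    b = (tz - v_min) / delta_z
    l, u = np.floor(b).astype(int), np.ceil(b).astype(int)
    proj = np.zeros(num_atoms)
    # spread each atom's probability over the two neighbouring bins
    np.add.at(proj, l, next_dist * (u - b))
    np.add.at(proj, u, next_dist * (b - l))
    # note: when b lands exactly on an atom (l == u) both weights are zero;
    # the torch code above shares this well-known edge case
    return proj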
class Runner:
    def __init__(self, env, args):
        self.env = env
        if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if args.learn and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
                and args.alg.find('reinforce') == -1:
            # these three algorithms are on-policy, so only the others get a replay buffer
            self.buffer = ReplayBuffer(args)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []
        # where the plots and saved arrays go
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        self.fig = None

    def run(self, num):
        global EPOCH
        train_steps = 0
        self.env.reset_callback = reset_callback  # TODO
        for epoch in range(self.args.n_epoch):
            EPOCH = epoch
            if epoch % self.args.evaluate_cycle == 0:
                win_rate, episode_reward = self.evaluate()
                # note: the first curve tracks the exploration epsilon rather than the win rate
                self.win_rates.append(self.rolloutWorker.epsilon)
                self.episode_rewards.append(episode_reward)
                self.plt(num)
            episodes = []
            # collect self.args.n_episodes episodes
            for episode_idx in range(self.args.n_episodes):
                episode, _, _ = self.rolloutWorker.generate_episode(episode_idx)
                episodes.append(episode)
            # every entry of an episode is a (1, episode_len, n_agents, feature_dim) array;
            # concatenate the episodes along the first axis
            episode_batch = episodes[0]
            episodes.pop(0)
            for episode in episodes:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate((episode_batch[key], episode[key]), axis=0)
            if self.args.alg.find('coma') > -1 or self.args.alg.find('central_v') > -1 \
                    or self.args.alg.find('reinforce') > -1:
                self.agents.train(episode_batch, train_steps, self.rolloutWorker.epsilon)
                train_steps += 1
            else:
                self.buffer.store_episode(episode_batch)
                for train_step in range(self.args.train_steps):
                    mini_batch = self.buffer.sample(min(self.buffer.current_size, self.args.batch_size))
                    self.agents.train(mini_batch, train_steps)
                    train_steps += 1
        self.plt(num)

    def evaluate(self):
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward, win_tag = self.rolloutWorker.generate_episode(epoch, evaluate=True)
            episode_rewards += episode_reward
            if win_tag:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch

    def plt(self, num):
        if self.fig is None:
            self.fig = plt.figure()
        plt.axis([0, self.args.n_epoch, 0, 100])
        plt.cla()
        plt.subplot(2, 1, 1)
        plt.plot(range(len(self.win_rates)), self.win_rates)
        plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('epsilon')
        plt.subplot(2, 1, 2)
        plt.plot(range(len(self.episode_rewards)), self.episode_rewards)
        plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('episode_rewards')
        plt.tight_layout()
        plt.savefig(self.save_path + '/plt_{}.png'.format(num), format='png')
        np.save(self.save_path + '/win_rates_{}'.format(num), self.win_rates)
        np.save(self.save_path + '/episode_rewards_{}'.format(num), self.episode_rewards)
        plt.clf()
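# The Runner above repeats the same substring test twice to decide whether the
# selected algorithm is on-policy (trained directly on the freshly collected
# batch) or off-policy (trained from the replay buffer). A tiny helper that
# captures that check; the algorithm names are exactly the ones the code tests for.
def is_on_policy(alg):
    # COMA, central-V and REINFORCE are trained on-policy in this code base
    return any(name in alg for name in ('coma', 'central_v', 'reinforce'))

# example: is_on_policy('qmix') -> False, is_on_policy('coma+commnet') -> True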
class NAF(BaseAgent):
    def __init__(self, env, gamma=0.99, tau=0.005, hidden_size=256, device=None):
        # forward the requested device instead of discarding it
        super(NAF, self).__init__(env, device=device)
        self.action_space = self.act_dim
        self.num_inputs = self.obs_dim
        num_inputs = self.obs_dim
        action_space = self.act_dim

        self.model = Policy(hidden_size, num_inputs, action_space).to(self.device)
        self.target_model = Policy(hidden_size, num_inputs, action_space).to(self.device)
        self.optimizer = Adam(self.model.parameters(), lr=1e-3)
        self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim)
        self.c_loss, self.a_loss = [], []
        self.gamma = gamma
        self.tau = tau
        hard_update(self.target_model, self.model)

    def act(self, state, action_noise=None, param_noise=None):
        state = torch.tensor(state, dtype=torch.float32, device=self.device)
        state = state.reshape(1, -1)
        self.model.eval()
        mu, _, _ = self.model((Variable(state), None))
        self.model.train()
        mu = mu.data
        if action_noise is not None:
            mu += torch.Tensor(action_noise.noise())
        return mu.clamp(-1, 1).cpu().data.numpy().flatten()

    def train(self):
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = self.replay_buffer.sample(128)

        # one-step TD target from the target network's state value
        _, _, next_state_values = self.target_model((next_state_batch, None))
        reward_batch = reward_batch.unsqueeze(1)
        mask_batch = mask_batch.unsqueeze(1)
        expected_state_action_values = reward_batch + (self.gamma * (1 - mask_batch) * next_state_values)

        _, state_action_values, _ = self.model((state_batch, action_batch))
        loss = MSELoss(state_action_values, expected_state_action_values)

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
        self.optimizer.step()

        soft_update(self.target_model, self.model, self.tau)
        return loss.item(), 0

    def step(self, t):
        c, a = self.train()
        self.c_loss.append(c)
        self.a_loss.append(a)
        if t % 5000 == 0:
            print(f'Iteration {t}: Critic Loss: {np.mean(self.c_loss)}, Actor Loss: {np.mean(self.a_loss) * 2}')
            self.c_loss, self.a_loss = [], []
        self.episode_timesteps += 1

    def save_model(self, env_name, suffix="", model_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')
        if model_path is None:
            model_path = "models/naf_{}_{}".format(env_name, suffix)
        print('Saving model to {}'.format(model_path))
        torch.save(self.model.state_dict(), model_path)

    def load_model(self, model_path):
        print('Loading model from {}'.format(model_path))
        self.model.load_state_dict(torch.load(model_path))
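# NAF above relies on hard_update / soft_update helpers for the target network.
# A minimal sketch of the usual Polyak-averaging implementations these names
# conventionally refer to (assumed, not taken from the original module):
def hard_update(target, source):
    # copy source parameters into the target network
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)

def soft_update(target, source, tau):
    # target <- (1 - tau) * target + tau * source
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(t_param.data * (1.0 - tau) + s_param.data * tau)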
class Runner:
    def __init__(self, env, args):
        self.env = env
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        self.buffer = ReplayBuffer(args)
        self.args = args
        self.epsilon = args.epsilon
        # evaluate on a sparse-reward version of the environment:
        # +1 for a win, -1 for a defeat, 0 for every other step
        self.env_evaluate = StarCraft2Env(map_name=args.map,
                                          step_mul=args.step_mul,
                                          difficulty=args.difficulty,
                                          game_version=args.game_version,
                                          seed=args.seed,
                                          replay_dir=args.replay_dir,
                                          reward_sparse=True,
                                          reward_scale=False)
        self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)

    def run(self):
        plt.figure()
        plt.axis([0, self.args.n_epoch, 0, 100])
        win_rates = []
        episode_rewards = []
        train_steps = 0
        for epoch in tqdm(range(self.args.n_epoch)):
            # linear epsilon decay with a floor of 0.05
            self.epsilon = self.epsilon - 0.0001125 if self.epsilon > 0.05 else self.epsilon
            episodes = []
            # collect self.args.n_episodes episodes
            for episode_idx in range(self.args.n_episodes):
                episode, _ = self.rolloutWorker.generate_episode(self.epsilon)
                episodes.append(episode)
            # every entry of an episode is a (1, episode_len, n_agents, feature_dim) array;
            # concatenate the episodes along the first axis
            episode_batch = episodes[0]
            episodes.pop(0)
            for episode in episodes:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate((episode_batch[key], episode[key]), axis=0)
            self.buffer.store_episode(episode_batch)
            if self.buffer.current_size > 100:
                for train_step in range(self.args.train_steps):
                    mini_batch = self.buffer.sample(self.args.batch_size)
                    self.agents.train(mini_batch, train_steps)
                    train_steps += 1
            win_rate, episode_reward = self.evaluate()
            win_rates.append(win_rate)
            episode_rewards.append(episode_reward)
            # plot and save periodically
            if epoch % 100 == 0:
                plt.cla()
                plt.subplot(2, 1, 1)
                plt.plot(range(len(win_rates)), win_rates)
                plt.xlabel('epoch')
                plt.ylabel('win_rate')
                plt.subplot(2, 1, 2)
                plt.plot(range(len(episode_rewards)), episode_rewards)
                plt.xlabel('epoch')
                plt.ylabel('episode_rewards')
                plt.savefig(self.args.result_dir + '/plt.png', format='png')
                np.save(self.args.result_dir + '/win_rates', win_rates)
                np.save(self.args.result_dir + '/episode_rewards', episode_rewards)
        # final plot and save after training
        plt.cla()
        plt.subplot(2, 1, 1)
        plt.plot(range(len(win_rates)), win_rates)
        plt.xlabel('epoch')
        plt.ylabel('win_rate')
        plt.subplot(2, 1, 2)
        plt.plot(range(len(episode_rewards)), episode_rewards)
        plt.xlabel('epoch')
        plt.ylabel('episode_rewards')
        plt.savefig(self.args.result_dir + '/plt.png', format='png')
        np.save(self.args.result_dir + '/win_rates', win_rates)
        np.save(self.args.result_dir + '/episode_rewards', episode_rewards)

    def evaluate(self):
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward = self.rolloutWorker.generate_episode(0)
            episode_rewards += episode_reward
            if episode_reward > self.args.threshold:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch

    def evaluate_sparse(self):
        win_number = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward = self.evaluateWorker.generate_episode(0)
            result = 'win' if episode_reward > 0 else 'defeat'
            print('Epoch {}: {}'.format(epoch, result))
            if episode_reward > 0:
                win_number += 1
        return win_number / self.args.evaluate_epoch
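# The run loop above anneals epsilon in place by a fixed 0.0001125 per epoch and
# stops decreasing once it reaches roughly 0.05. A clamped sketch of the same
# idea as a reusable schedule; the start value of 1.0 is an assumption, only the
# decrement and floor match the constants used above.
def linear_epsilon(epoch, start=1.0, floor=0.05, step=0.0001125):
    # decrease linearly by `step` per epoch, never going below `floor`
    return max(floor, start - step * epoch)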