def __init__(self, env, args):
    self.env = env
    self.args = args
    if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
        self.agents = CommAgents(args)
        self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
    else:  # no communication agent
        self.agents = Agents(args)
        self.qmix_pg_learner = QMIX_PG(self.agents, args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
    if args.learn and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
            and args.alg.find('reinforce') == -1:  # these 3 algorithms are on-policy
        self.actor_critic_buffer = ReplayBuffer(args, args.buffer_size)
        # self.actor_buffer = ReplayBuffer(args, args.actor_buffer_size)
    self.args = args
    self.win_rates = []
    self.episode_rewards = []
    # used to save the plt figures and pkl files
    tmp = f'clamp2-5_rewardscale10_' + \
          f'{args.buffer_size}_{args.actor_buffer_size}_{args.critic_buffer_size}_{args.actor_train_steps}_{args.critic_train_steps}_' \
          f'{args.actor_update_delay}_{args.critic_lr}_{args.n_epoch}_{args.temp}'  # f'clamp2-5_' + rewardscale10_
    self.save_path = self.args.result_dir + '/linear_mix/' + 'mcsac' + '/' + tmp + '/' + args.map  # _gradclip0.5
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
def __init__(self, env, args):
    self.env = env
    # evaluate the algorithm on a sparse-reward environment: a win gives 1, a defeat gives -1, every other step gives 0
    '''
    self.env_evaluate = StarCraft2Env(map_name=args.map,
                                      step_mul=args.step_mul,
                                      difficulty=args.difficulty,
                                      game_version=args.game_version,
                                      seed=args.seed,
                                      replay_dir=args.replay_dir,
                                      reward_sparse=True,
                                      reward_scale=False)
    '''
    self.env_evaluate = MeetEnv()
    if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
        self.agents = CommAgents(args)
        self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        self.evaluateWorker = CommRolloutWorker(self.env_evaluate, self.agents, args)
    else:  # no communication agent
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)
    if args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 and args.alg.find('reinforce') == -1:
        # these 3 algorithms are on-policy
        self.buffer = ReplayBuffer(args)
    self.args = args

    # used to save the plt figures and pkl files
    self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
def __init__(self, env, args):
    self.env = env
    # evaluate the algorithm on a sparse-reward environment: a win gives 1, a defeat gives -1, every other step gives 0
    self.env_evaluate = StarCraft2Env(map_name=args.map,
                                      step_mul=args.step_mul,
                                      difficulty=args.difficulty,
                                      game_version=args.game_version,
                                      seed=args.seed,
                                      replay_dir=args.replay_dir,
                                      reward_sparse=True,
                                      reward_scale=False)
    if args.alg == 'commnet_coma':
        self.agents = CommNetAgents(args)
        self.rolloutWorker = CommNetRolloutWorker(env, self.agents, args)
        self.evaluateWorker = CommNetRolloutWorker(self.env_evaluate, self.agents, args)
    else:
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)
    if args.alg != 'coma' and args.alg != 'commnet_coma':
        self.buffer = ReplayBuffer(args)
    self.args = args

    # used to save the plt figures and pkl files
    self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
def __init__(self, env, args):
    self.env = env
    if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
        self.agents = CommAgents(args)
        self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
    else:  # no communication agent
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
    if args.learn and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
            and args.alg.find('reinforce') == -1:  # these 3 algorithms are on-policy
        if args.use_per:
            self.buffer = PrioritizedReplayBuffer(args)
        else:
            self.buffer = ReplayBuffer(args)
    self.args = args
    self.win_rates = []
    self.episode_rewards = []
    # used to save the plt figures and pkl files
    self.save_path = self.args.result_dir + '/' + args.map + '/'
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
    self.file_name = self.save_path + str(args.env_name) + '_' + str(args.n_agents) + '_' + \
        str(args.map_size) + '_' + args.name_time
class PlayerTrainer(object):
    def __init__(self, actor, critic, buffersize, game, player, batch_size, gamma):
        self.actor = actor
        self.critic = critic
        self.replay = ReplayBuffer(buffersize)
        self.game = game
        self.player = player
        self.batch_size = batch_size
        self.gamma = gamma

    def noisyMaxQMove(self):
        state = self.game.space
        As = self.actor.predict(np.reshape(state, (1, *state.shape)))
        avail = self.game.avail()
        availQ = {}
        availP = []
        for k in avail:
            availQ[k] = As[0][k]
            availP.append(As[0][k])
        # if sum(availP) > 0:
        availP = np.array(availP)
        availP = [round(i, 5) if i >= 0 else (-.001 * round(i, 5)) for i in availP]
        availNorm = [i / sum(availP) for i in availP]
        a = np.random.choice(avail, p=availNorm)
        self.game.move(a, self.player)
        next_state, reward = self.game.step(self.player)
        self.bufferAdd(state, As, reward, self.game.game_over, next_state)

        if self.replay.size() > self.batch_size:
            s_batch, a_batch, r_batch, t_batch, s2_batch = self.replay.sample_batch(self.batch_size)
            target_q = self.critic.predict_target(s2_batch, self.actor.predict_target(s2_batch))
            y_i = []
            for k in range(self.batch_size):
                if t_batch[k]:
                    y_i.append(r_batch[k])
                else:
                    y_i.append(r_batch[k] + self.gamma * target_q[k])
            predicted_q_value, _ = self.critic.train(
                s_batch, a_batch, np.reshape(y_i, (self.batch_size, 1)))
            # ep_ave_max_q += np.amax(predicted_q_value)

            # Update the actor policy using the sampled gradient
            a_outs = self.actor.predict(s_batch)
            grads = self.critic.action_gradients(s_batch, a_outs)
            self.actor.train(s_batch, grads[0])

            # Update target networks
            self.actor.update_target_network()
            self.critic.update_target_network()
        return self.game.space, reward

    def bufferAdd(self, state, Qs, reward, terminal, next_state):
        self.replay.add(np.reshape(state, (self.actor.s_dim,)),
                        np.reshape(Qs, (self.actor.a_dim,)),
                        reward, terminal,
                        np.reshape(next_state, (self.actor.s_dim,)))
def __init__(self, actor, critic, buffersize, game, player, batch_size, gamma):
    self.actor = actor
    self.critic = critic
    self.replay = ReplayBuffer(buffersize)
    self.game = game
    self.player = player
    self.batch_size = batch_size
    self.gamma = gamma
def test_random_sampling(self):
    rb = ReplayBuffer(3)
    rb.add(Transitions[0]).add(Transitions[1]).add(Transitions[1]).add(Transitions[2])
    samples = rb.sample(100)
    n_1, n_2 = 0, 0
    for sample in samples:
        if sample == Transitions[1]:
            n_1 += 1
        elif sample == Transitions[2]:
            n_2 += 1
        else:
            pytest.fail()
    assert n_1 > n_2
def __init__(self, env, gamma=0.99, tau=0.005, hidden_size=256, device=None):
    super(NAF, self).__init__(env, device=device)
    self.action_space = self.act_dim
    self.num_inputs = self.obs_dim
    num_inputs = self.obs_dim
    action_space = self.act_dim

    self.model = Policy(hidden_size, num_inputs, action_space).to(self.device)
    self.target_model = Policy(hidden_size, num_inputs, action_space).to(self.device)
    self.optimizer = Adam(self.model.parameters(), lr=1e-3)
    self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim)
    self.c_loss, self.a_loss = [], []

    self.gamma = gamma
    self.tau = tau

    hard_update(self.target_model, self.model)
def __init__(self, env, args):
    self.env = env
    self.agents = Agents(args)
    self.rolloutWorker = RolloutWorker(env, self.agents, args)
    if args.learn and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
            and args.alg.find('reinforce') == -1:  # these 3 algorithms are on-policy
        self.buffer = ReplayBuffer(args)
    self.args = args
    self.win_rates = []
    self.episode_rewards = []
    # used to save the plt figures and pkl files
    self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
def __init__(self, env, args, itr, seed):
    # set the random seed
    if seed is not None:
        self.setup_seed(seed)
    self.args = args
    # the environment
    self.env = env
    # process index
    self.pid = itr
    self.replay_buffer = ReplayBuffer(self.args)
    self.win_rates = []
    '''
    Here, episode_reward is the cumulative reward of a single episode,
    episodes_reward is the cumulative reward over several episodes, and
    episodes_rewards is the cumulative reward over several episodes for several evaluations.
    '''
    self.episodes_rewards = []
    self.evaluate_itr = []
    self.max_win_rate = 0
    self.time_steps = 0
    # location for saving results and models; the counter makes it possible to run several instances at once
    alg_dir = self.args.alg + '_' + str(self.args.epsilon_anneal_steps // 10000) + 'w' + '_' + \
        str(self.args.target_update_period)
    self.alg_tag = '_' + self.args.optim
    if self.args.her:
        self.alg_tag += str(self.args.her)
        alg_dir += '_her=' + str(self.args.her)
    # self.save_path = self.args.result_dir + '/' + alg_dir + '/' + self.args.map + '/' + itr
    self.save_path = self.args.result_dir + '/' + self.args.map + '/' + alg_dir + '/' + itr
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
    self.args.model_dir = args.model_dir + '/' + args.map + '/' + alg_dir + '/' + itr
    self.agents = Agents(args, itr=itr)
    print('step runner initialized')
    if self.args.her:
        print('using HER')
def test_circular_buffer(self):
    rb = ReplayBuffer(4)
    rb.add(Transitions[0])
    rb.add(Transitions[1])
    rb.add(Transitions[2])
    rb.add(Transitions[3])
    rb.add(Transitions[4])
    rb.add(Transitions[5])
    assert (rb._storage == [
        Transitions[4], Transitions[5], Transitions[2], Transitions[3]
    ]).all()
def __init__(self, env, args):
    self.env = env
    self.agents = Agents(args)
    self.rolloutWorker = RolloutWorker(env, self.agents, args)
    self.buffer = ReplayBuffer(args)
    self.args = args
    self.epsilon = args.epsilon

    # evaluate the algorithm on a sparse-reward environment: a win gives 1, a defeat gives -1, every other step gives 0
    self.env_evaluate = StarCraft2Env(map_name=args.map,
                                      step_mul=args.step_mul,
                                      difficulty=args.difficulty,
                                      game_version=args.game_version,
                                      seed=args.seed,
                                      replay_dir=args.replay_dir,
                                      reward_sparse=True,
                                      reward_scale=False)
    self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)
def __init__(self, env, args):
    self.env = env
    self.args = args
    self.agents = Agents(args)
    self.qmix_pg_learner = QMIX_PG(self.agents, args)
    self.rolloutWorker = RolloutWorker(env, self.agents, args)
    if args.learn and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
            and args.alg.find('reinforce') == -1:  # these 3 algorithms are on-policy
        self.critic_buffer = ReplayBuffer(args, args.critic_buffer_size)
        self.actor_buffer = ReplayBuffer(args, args.actor_buffer_size)
    self.args = args
    self.win_rates = []
    self.episode_rewards = []
    tmp = f'clamp2-5_' + f'{args.loss_coeff_entropy}_' + \
          f'{args.buffer_size}_{args.actor_buffer_size}_{args.critic_buffer_size}_{args.actor_train_steps}_{args.critic_train_steps}_' \
          f'{args.actor_update_delay}_{args.critic_lr}'  # f'clamp2-5_' + anneal_epsilon
    self.save_path = self.args.result_dir + '/linear_mix/' + 'qmix_ac_total_cf' + '/' + tmp + '/' + args.map
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
def test_len(self):
    rb = ReplayBuffer(5)
    rb.add(Transitions[0]).add(Transitions[1]).add(Transitions[2])
    assert len(rb) == 3
    for i in range(8):
        rb.add(Transitions[i])
    assert len(rb) == 5
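# A minimal sketch (an assumption for illustration, not the project's actual implementation)
# of the ReplayBuffer interface exercised by the three tests above: fixed-capacity circular
# storage in `_storage`, a chainable add(), uniform sample(), and len().
import numpy as np

class ReplayBuffer:
    def __init__(self, capacity):
        self._capacity = capacity
        self._storage = np.empty(capacity, dtype=object)  # fixed-size ring buffer
        self._size = 0       # number of filled slots
        self._next_idx = 0   # slot the next add() will overwrite

    def add(self, transition):
        self._storage[self._next_idx] = transition
        self._next_idx = (self._next_idx + 1) % self._capacity
        self._size = min(self._size + 1, self._capacity)
        return self  # enables rb.add(a).add(b) chaining, as in the tests

    def sample(self, n):
        # uniform sampling with replacement over the filled slots
        idx = np.random.randint(0, self._size, size=n)
        return [self._storage[i] for i in idx]

    def __len__(self):
        return self._size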
def __init__(self, env, args):
    self.env = env
    if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
        self.agents = CommAgents(args)
        self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
    else:  # no communication agent
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
    if args.learn and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
            and args.alg.find('reinforce') == -1:  # these 3 algorithms are on-policy
        self.buffer = ReplayBuffer(args)
    self.args = args
    self.plt_success = []
    self.episode_rewards = []
    # used to save the plt figures and pkl files
    self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.env_name
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
class DQN(Trainer):
    def __init__(self, parameters):
        super(DQN, self).__init__(parameters)
        self.replay_buffer = ReplayBuffer(self.buffersize)

    def push_to_buffer(self, state, action, reward, next_state, done):
        self.replay_buffer.push(state, action, reward, next_state, done)

    def compute_td_loss(self, batch_size, *args):
        state, action, reward, next_state, done = self.replay_buffer.sample(batch_size)

        state = Variable(torch.FloatTensor(np.float32(state)))
        next_state = Variable(torch.FloatTensor(np.float32(next_state)))
        action = Variable(torch.LongTensor(action))
        reward = Variable(torch.FloatTensor(reward))
        done = Variable(torch.FloatTensor(done))

        q_values = self.current_model(state)
        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

        # Double-DQN-style target: the online network picks the greedy action,
        # the target network evaluates it
        next_q_values = self.current_model(next_state)
        next_q_state_values = self.target_model(next_state)
        next_q_value = next_q_state_values.gather(
            1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
        expected_q_value = reward + self.gamma * next_q_value * (1 - done)

        # Huber-style loss: quadratic for errors below 1, clipped to 1 above it
        loss = (q_value - Variable(expected_q_value.data)).abs()
        loss[loss.le(1)] = loss[loss.le(1)].pow(2)
        loss[loss.gt(1)] = 1  # (loss[loss.gt(1)] + 1) / 2
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss
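# A hedged usage sketch for the trainer above: each environment step feeds the replay
# buffer through push_to_buffer, and once enough transitions exist compute_td_loss
# draws a batch and takes one optimizer step. `trainer`, `env`, `select_action`,
# `num_steps`, `batch_size`, and `epsilon` are illustrative assumptions, not part of the snippet.
state = env.reset()
for step in range(num_steps):
    action = select_action(state, epsilon)              # assumed epsilon-greedy helper
    next_state, reward, done, _ = env.step(action)
    trainer.push_to_buffer(state, action, reward, next_state, done)
    state = env.reset() if done else next_state
    if step >= batch_size:
        loss = trainer.compute_td_loss(batch_size)      # one TD update on a sampled mini-batch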
def __init__(self, env, args, itr):
    # get the arguments
    # self.args = get_common_args()
    self.args = args
    # the environment
    self.env = env
    # process index
    self.pid = itr
    self.agents = Agents(args, itr=itr)
    # without network reuse there is one agent (and one network) per agent id;
    # sharing parameters during training means a single network
    # if not self.args.reuse_network:
    #     self.agents = []
    #     for i in range(self.args.n_agents):
    #         self.agents.append(Agents(self.args, i))
    # self.rollout = RollOut(self.agents, self.args)
    self.replay_buffer = ReplayBuffer(self.args)
    self.win_rates = []
    '''
    Here, episode_reward is the cumulative reward of a single episode,
    episodes_reward is the cumulative reward over several episodes, and
    episodes_rewards is the cumulative reward over several episodes for several evaluations.
    '''
    self.episodes_rewards = []
    self.evaluate_itr = []
    self.max_win_rate = 0
    # location for saving results and models; the counter makes it possible to run several instances at once
    self.save_path = self.args.result_dir + '/' + self.args.alg + '/' + self.args.map + '/' + str(itr)
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
    print('runner initialized')
def __init__(self, config):
    self.writer = SummaryWriter()
    self.device = 'cuda' if T.cuda.is_available() else 'cpu'

    self.dqn_type = config["dqn-type"]
    self.run_title = config["run-title"]
    self.env = gym.make(config["environment"])

    self.num_states = np.prod(self.env.observation_space.shape)
    self.num_actions = self.env.action_space.n

    layers = [
        self.num_states,
        *config["architecture"],
        self.num_actions
    ]

    self.policy_net = Q_Network(self.dqn_type, layers).to(self.device)
    self.target_net = Q_Network(self.dqn_type, layers).to(self.device)
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.target_net.eval()

    capacity = config["max-experiences"]
    self.p_replay_eps = config["p-eps"]
    self.prioritized_replay = config["prioritized-replay"]
    self.replay_buffer = PrioritizedReplayBuffer(capacity, config["p-alpha"]) if self.prioritized_replay \
        else ReplayBuffer(capacity)
    self.beta_scheduler = LinearSchedule(config["episodes"], initial_p=config["p-beta-init"], final_p=1.0)
    self.epsilon_decay = lambda e: max(config["epsilon-min"], e * config["epsilon-decay"])

    self.train_freq = config["train-freq"]
    self.use_soft_update = config["use-soft-update"]
    self.target_update = config["target-update"]
    self.tau = config["tau"]
    self.gamma = config["gamma"]
    self.batch_size = config["batch-size"]
    self.time_step = 0

    self.optim = T.optim.AdamW(self.policy_net.parameters(), lr=config["lr-init"], weight_decay=config["weight-decay"])
    self.lr_scheduler = T.optim.lr_scheduler.StepLR(self.optim, step_size=config["lr-step"], gamma=config["lr-gamma"])
    self.criterion = nn.SmoothL1Loss(reduction="none")  # Huber Loss
    self.min_experiences = max(config["min-experiences"], config["batch-size"])

    self.save_path = config["save-path"]
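# Not shown in the constructor above: when `use-soft-update` is enabled, the target
# network is typically blended toward the policy network with Polyak averaging,
# theta_target <- tau * theta_policy + (1 - tau) * theta_target. A minimal sketch of
# such a helper (an assumption, not the project's actual code):
def soft_update(target_net, policy_net, tau):
    for t_param, p_param in zip(target_net.parameters(), policy_net.parameters()):
        t_param.data.copy_(tau * p_param.data + (1.0 - tau) * t_param.data)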
class NAF(BaseAgent):
    def __init__(self, env, gamma=0.99, tau=0.005, hidden_size=256, device=None):
        super(NAF, self).__init__(env, device=device)
        self.action_space = self.act_dim
        self.num_inputs = self.obs_dim
        num_inputs = self.obs_dim
        action_space = self.act_dim

        self.model = Policy(hidden_size, num_inputs, action_space).to(self.device)
        self.target_model = Policy(hidden_size, num_inputs, action_space).to(self.device)
        self.optimizer = Adam(self.model.parameters(), lr=1e-3)
        self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim)
        self.c_loss, self.a_loss = [], []

        self.gamma = gamma
        self.tau = tau

        hard_update(self.target_model, self.model)

    def act(self, state, action_noise=None, param_noise=None):
        state = torch.tensor(state, dtype=torch.float32, device=self.device)
        state = state.reshape(1, -1)
        self.model.eval()
        mu, _, _ = self.model((Variable(state), None))
        self.model.train()
        mu = mu.data
        if action_noise is not None:
            mu += torch.Tensor(action_noise.noise())
        return mu.clamp(-1, 1).cpu().data.numpy().flatten()

    def train(self):
        # state_batch = Variable(torch.cat(batch.state))
        # action_batch = Variable(torch.cat(batch.action))
        # reward_batch = Variable(torch.cat(batch.reward))
        # mask_batch = Variable(torch.cat(batch.mask))
        # next_state_batch = Variable(torch.cat(batch.next_state))
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = self.replay_buffer.sample(128)

        _, _, next_state_values = self.target_model((next_state_batch, None))

        reward_batch = reward_batch.unsqueeze(1)
        mask_batch = mask_batch.unsqueeze(1)
        expected_state_action_values = reward_batch + (self.gamma * (1 - mask_batch) * next_state_values)

        _, state_action_values, _ = self.model((state_batch, action_batch))

        loss = MSELoss(state_action_values, expected_state_action_values)

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
        self.optimizer.step()

        soft_update(self.target_model, self.model, self.tau)

        return loss.item(), 0

    def step(self, t):
        c, a = self.train()
        self.c_loss.append(c)
        self.a_loss.append(a)
        if t % 5000 == 0:
            # self.evaluate(self.env)
            print(f'Iteration {t}: Critic Loss: {np.mean(self.c_loss)}, Actor Loss: {np.mean(self.a_loss) * 2}')
            self.c_loss, self.a_loss = [], []
        self.episode_timesteps += 1

    def save_model(self, env_name, suffix="", model_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')
        if model_path is None:
            model_path = "models/naf_{}_{}".format(env_name, suffix)
        print('Saving model to {}'.format(model_path))
        torch.save(self.model.state_dict(), model_path)

    def load_model(self, model_path):
        print('Loading model from {}'.format(model_path))
        self.model.load_state_dict(torch.load(model_path))
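# For reference: the Policy network above returns a (mu, Q, V) tuple, which corresponds
# to the standard NAF factorization of the state-action value (Gu et al., 2016):
#   Q(s, a) = V(s) + A(s, a)
#   A(s, a) = -1/2 * (a - mu(s))^T P(s) (a - mu(s)),   with P(s) = L(s) L(s)^T
# where L(s) is a state-dependent lower-triangular matrix, so argmax_a Q(s, a) = mu(s).
# This is why act() can return mu directly and train() can read V(s') from the target
# network without a separate target policy.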
    quant_idx = quant_idx.cpu().data
    batch_idx = np.arange(batch_size)
    tau = tau_hat[:, quant_idx][batch_idx, batch_idx]
    return tau, expected_quant


num_quant = 51
Vmin = -10
Vmax = 10

current_model = QRDQN(env.observation_space.shape[0], env.action_space.n, num_quant)
target_model = QRDQN(env.observation_space.shape[0], env.action_space.n, num_quant)

optimizer = optim.Adam(current_model.parameters())
replay_buffer = ReplayBuffer(10000)


def update_target(current_model, target_model):
    target_model.load_state_dict(current_model.state_dict())


update_target(current_model, target_model)


def compute_td_loss(batch_size):
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    state = autograd.Variable(torch.FloatTensor(np.float32(state)))
    next_state = autograd.Variable(torch.FloatTensor(np.float32(next_state)), volatile=True)
    action = autograd.Variable(torch.LongTensor(action))
    reward = torch.FloatTensor(reward)
    done = torch.FloatTensor(np.float32(done))
])

# Use soft updates to update the target networks
target_update = tf.group([
    tf.assign(v_targ, DECAY * v_targ + (1 - DECAY) * v_main)
    for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
])

init = tf.global_variables_initializer()
session = tf.Session()
session.run(init)
session.run(target_init)

# %% Replay Buffer
replay_buffer = ReplayBuffer(observation_shape=env.observation_space.shape,
                             action_shape=(1, ))


# %% Play
def sample_action(env, observation, epsilon):
    if np.random.random() < epsilon:
        return env.action_space.sample()
    else:
        q_s_a = session.run(q, feed_dict={x: np.atleast_2d(observation)})[0]
        return np.argmax(q_s_a)


def play_once(env, epsilon, render=False):
    observation = env.reset()
    done = False
    steps = 0
def learn(env,
          num_actions=3,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16):
    torch.set_num_threads(num_cpu)
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    episode_rewards = [0.0]
    saved_mean_reward = None

    obs = env.reset()
    player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
    screen = player_relative

    obs, xy_per_marine = common.init(env, obs)

    group_id = 0
    reset = True
    dqn = DQN(num_actions, lr, cuda)

    print('\nCollecting experience...')
    checkpoint_path = 'models/deepq/checkpoint.pth.tar'
    if os.path.exists(checkpoint_path):
        dqn, saved_mean_reward = load_checkpoint(dqn, cuda, filename=checkpoint_path)

    for t in range(max_timesteps):
        # Take action and update exploration to the newest value
        # custom process for DefeatZerglingsAndBanelings
        obs, screen, player = common.select_marine(env, obs)
        # action = act(
        #     np.array(screen)[None], update_eps=update_eps, **kwargs)[0]
        action = dqn.choose_action(np.array(screen)[None])
        reset = False
        rew = 0
        new_action = None

        obs, new_action = common.marine_action(env, obs, player, action)
        army_count = env._obs[0].observation.player_common.army_count

        try:
            if army_count > 0 and _ATTACK_SCREEN in obs[0].observation["available_actions"]:
                obs = env.step(actions=new_action)
            else:
                new_action = [sc2_actions.FunctionCall(_NO_OP, [])]
                obs = env.step(actions=new_action)
        except Exception as e:
            # print(e)
            pass  # Do nothing

        player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
        new_screen = player_relative

        rew += obs[0].reward
        done = obs[0].step_type == environment.StepType.LAST

        selected = obs[0].observation["screen"][_SELECTED]
        player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero()

        if len(player_y) > 0:
            player = [int(player_x.mean()), int(player_y.mean())]

        if len(player) == 2:
            if player[0] > 32:
                new_screen = common.shift(LEFT, player[0] - 32, new_screen)
            elif player[0] < 32:
                new_screen = common.shift(RIGHT, 32 - player[0], new_screen)
            if player[1] > 32:
                new_screen = common.shift(UP, player[1] - 32, new_screen)
            elif player[1] < 32:
                new_screen = common.shift(DOWN, 32 - player[1], new_screen)

        # Store transition in the replay buffer.
        replay_buffer.add(screen, action, rew, new_screen, float(done))
        screen = new_screen

        episode_rewards[-1] += rew
        reward = episode_rewards[-1]

        if done:
            print("Episode Reward : %s" % episode_rewards[-1])
            obs = env.reset()
            player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
            screen = player_relative
            group_list = common.init(env, obs)
            # Select all marines first
            # env.step(actions=[sc2_actions.FunctionCall(_SELECT_UNIT, [_SELECT_ALL])])
            episode_rewards.append(0.0)
            reset = True

        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            td_errors = dqn.learn(obses_t, actions, rewards, obses_tp1, gamma, batch_size)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network periodically.
            dqn.update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("reward", reward)
            logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
            logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
            logger.dump_tabular()

        if (checkpoint_freq is not None and t > learning_starts
                and num_episodes > 100 and t % checkpoint_freq == 0):
            if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                if print_freq is not None:
                    logger.log("Saving model due to mean reward increase: {} -> {}".format(
                        saved_mean_reward, mean_100ep_reward))
                save_checkpoint({
                    'epoch': t + 1,
                    'state_dict': dqn.save_state_dict(),
                    'best_accuracy': mean_100ep_reward
                }, checkpoint_path)
                saved_mean_reward = mean_100ep_reward
def __init__(self, parameters):
    super(Rainbow, self).__init__(parameters)
    self.replay_buffer = ReplayBuffer(self.buffersize)
class Rainbow(Trainer):
    def __init__(self, parameters):
        super(Rainbow, self).__init__(parameters)
        self.replay_buffer = ReplayBuffer(self.buffersize)

    def push_to_buffer(self, state, action, reward, next_state, done):
        self.replay_buffer.push(state, action, reward, next_state, done)

    def load_model(self):
        self.current_model = RainbowDQN(self.env.observation_space.shape[0],
                                        self.env.action_space.n,
                                        num_atoms, Vmin, Vmax)  # input: (1, 84, 84), output: 6
        self.target_model = RainbowDQN(self.env.observation_space.shape[0],
                                       self.env.action_space.n,
                                       num_atoms, Vmin, Vmax)
        if USE_CUDA:
            self.current_model = self.current_model.cuda()
            self.target_model = self.target_model.cuda()
        self.update_target(self.current_model, self.target_model)  # sync nets

    def projection_distribution(self, next_state, rewards, dones):
        batch_size = next_state.size(0)

        delta_z = float(Vmax - Vmin) / (num_atoms - 1)
        support = torch.linspace(Vmin, Vmax, num_atoms)

        next_dist = self.target_model(next_state).data.cpu() * support
        next_action = next_dist.sum(2).max(1)[1]
        next_action = next_action.unsqueeze(1).unsqueeze(1).expand(next_dist.size(0), 1, next_dist.size(2))
        next_dist = next_dist.gather(1, next_action).squeeze(1)

        rewards = rewards.unsqueeze(1).expand_as(next_dist)
        dones = dones.unsqueeze(1).expand_as(next_dist)
        support = support.unsqueeze(0).expand_as(next_dist)

        Tz = rewards + (1 - dones) * 0.99 * support
        Tz = Tz.clamp(min=Vmin, max=Vmax)
        b = (Tz - Vmin) / delta_z
        l = b.floor().long()
        u = b.ceil().long()

        offset = torch.linspace(0, (batch_size - 1) * num_atoms, batch_size).long() \
            .unsqueeze(1).expand(batch_size, num_atoms)

        proj_dist = torch.zeros(next_dist.size())
        proj_dist.view(-1).index_add_(0, (l + offset).view(-1), (next_dist * (u.float() - b)).view(-1))
        proj_dist.view(-1).index_add_(0, (u + offset).view(-1), (next_dist * (b - l.float())).view(-1))

        return proj_dist

    def compute_td_loss(self, batch_size, *args):
        state, action, reward, next_state, done = self.replay_buffer.sample(batch_size)

        state = Variable(torch.FloatTensor(np.float32(state)))
        next_state = Variable(torch.FloatTensor(np.float32(next_state)))
        action = Variable(torch.LongTensor(action))
        reward = torch.FloatTensor(reward)
        done = torch.FloatTensor(np.float32(done))

        proj_dist = self.projection_distribution(next_state, reward, done)

        dist = self.current_model(state)
        action = action.unsqueeze(1).unsqueeze(1).expand(batch_size, 1, num_atoms)
        dist = dist.gather(1, action).squeeze(1)
        dist.data.clamp_(0.01, 0.99)
        loss = -(Variable(proj_dist) * dist.log()).sum(1)
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.current_model.reset_noise()
        self.target_model.reset_noise()

        return loss
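# For reference: projection_distribution above is the categorical (C51) projection of the
# distributional Bellman update onto the fixed support {z_i = Vmin + i * delta_z}. Each
# target atom Tz_j = r + gamma * (1 - done) * z_j is clamped to [Vmin, Vmax], mapped to a
# fractional index b = (Tz_j - Vmin) / delta_z, and its probability mass is split between
# the neighbouring atoms l = floor(b) and u = ceil(b) with weights (u - b) and (b - l).
# Note that the discount is hard-coded to 0.99 in Tz rather than read from self.gamma.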
        noise=config.get("noise", None) if m != config["target_map"] else None,
        vsn=config.get("vsn", None) if m != config["target_map"] else None,
        ally_indices=ally_indices,
        enemy_indices=enemy_indices,
    )
    for m, d in zip(config["map_names"], difficulties)
]

for env in train_envs:
    env_info = env.get_env_info()
    target_info = target_env.get_env_info()
    env.buffer = ReplayBuffer(
        n_actions=target_info['n_actions'],
        n_agents=env_info['n_agents'],
        obs_shape=target_info['obs_shape'],
        state_shape=target_info['state_shape'],
        episode_limit=env_info['episode_limit'],
        size=args.buffer_size,
        alg=args.alg,
        dtype=np.float16,
    )
    logging.info(env_info)

# change args to accommodate the largest possible env;
# this ensures the widths of the created neural networks are sufficient
env_info = target_env.get_env_info()
args.n_actions = env_info["n_actions"]
args.n_agents = env_info["n_agents"]
args.state_shape = env_info["state_shape"]
args.obs_shape = env_info["obs_shape"]
args.episode_limit = env_info["episode_limit"]

runner = Runner(None, args, target_env)
class Runner:
    def __init__(self, env, args):
        self.env = env
        # evaluate the algorithm on a sparse-reward environment: a win gives 1, a defeat gives -1, every other step gives 0
        '''
        self.env_evaluate = StarCraft2Env(map_name=args.map,
                                          step_mul=args.step_mul,
                                          difficulty=args.difficulty,
                                          game_version=args.game_version,
                                          seed=args.seed,
                                          replay_dir=args.replay_dir,
                                          reward_sparse=True,
                                          reward_scale=False)
        '''
        self.env_evaluate = MeetEnv()
        if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
            self.evaluateWorker = CommRolloutWorker(self.env_evaluate, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
            self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)
        if args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 and args.alg.find('reinforce') == -1:
            # these 3 algorithms are on-policy
            self.buffer = ReplayBuffer(args)
        self.args = args

        # used to save the plt figures and pkl files
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

    def run(self, num):
        plt.figure()
        plt.axis([0, self.args.n_epoch, 0, 100])
        win_rates = []
        episode_rewards = []
        train_steps = 0
        # print('Run {} start'.format(num))
        for epoch in range(self.args.n_epoch):
            print('Run {}, train epoch {}'.format(num, epoch))
            if epoch % self.args.evaluate_cycle == 0:
                win_rate, episode_reward = self.evaluate()
                # print('win_rate is ', win_rate)
                win_rates.append(win_rate)
                episode_rewards.append(episode_reward)
                plt.cla()
                plt.subplot(2, 1, 1)
                plt.plot(range(len(win_rates)), win_rates)
                plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
                plt.ylabel('win_rate')
                plt.subplot(2, 1, 2)
                plt.plot(range(len(episode_rewards)), episode_rewards)
                plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
                plt.ylabel('episode_rewards')
                plt.savefig(self.save_path + '/plt_{}.png'.format(num), format='png')
                np.save(self.save_path + '/win_rates_{}'.format(num), win_rates)
                np.save(self.save_path + '/episode_rewards_{}'.format(num), episode_rewards)

            episodes = []
            # collect self.args.n_episodes episodes
            for episode_idx in range(self.args.n_episodes):
                episode, _ = self.rolloutWorker.generate_episode(episode_idx)
                episodes.append(episode)
                # print(_)
            # every entry of an episode is a four-dimensional array of shape
            # (1, episode_len, n_agents, feature_dim); concatenate the entries of all episodes below
            episode_batch = episodes[0]
            episodes.pop(0)
            for episode in episodes:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate((episode_batch[key], episode[key]), axis=0)
            if self.args.alg.find('coma') > -1 or self.args.alg.find('central_v') > -1 \
                    or self.args.alg.find('reinforce') > -1:
                self.agents.train(episode_batch, train_steps, self.rolloutWorker.epsilon)
                train_steps += 1
            else:
                self.buffer.store_episode(episode_batch)
                for train_step in range(self.args.train_steps):
                    mini_batch = self.buffer.sample(min(self.buffer.current_size, self.args.batch_size))
                    self.agents.train(mini_batch, train_steps)
                    train_steps += 1
        plt.cla()
        plt.subplot(2, 1, 1)
        plt.plot(range(len(win_rates)), win_rates)
        plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('win_rate')
        plt.subplot(2, 1, 2)
        plt.plot(range(len(episode_rewards)), episode_rewards)
        plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('episode_rewards')
        plt.savefig(self.save_path + '/plt_{}.png'.format(num), format='png')
        np.save(self.save_path + '/win_rates_{}'.format(num), win_rates)
        np.save(self.save_path + '/episode_rewards_{}'.format(num), episode_rewards)
    def evaluate(self):
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward = self.rolloutWorker.generate_episode(evaluate=True)
            episode_rewards += episode_reward
            if episode_reward > self.args.threshold:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch

    def evaluate_sparse(self):
        win_number = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward = self.evaluateWorker.generate_episode(evaluate=True)
            result = 'win' if episode_reward > 0 else 'defeat'
            print('Epoch {}: {}'.format(epoch, result))
            if episode_reward > 0:
                win_number += 1
        self.env_evaluate.close()
        return win_number / self.args.evaluate_epoch
def train(train_env, agent_action_fn, eval_mode=False):
    action_space = train_env.action_space
    obs_space = train_env.observation_space

    # ######## instantiate actor, critic, replay buffer, uo-process #########
    # feed online with state. feed target with next_state.
    online_state_inputs = tf.placeholder(tf.float32,
                                         shape=(None, obs_space.shape[0]),
                                         name="online_state_inputs")
    target_state_inputs = tf.placeholder(tf.float32,
                                         shape=online_state_inputs.shape,
                                         name="target_state_inputs")

    # inputs to q_net for training q.
    online_action_inputs_training_q = tf.placeholder(tf.float32,
                                                     shape=(None, action_space.shape[0]),
                                                     name='online_action_batch_inputs')
    # condition bool scalar to switch action inputs to online q.
    # feed True: training q.
    # feed False: training policy.
    cond_training_q = tf.placeholder(tf.bool, shape=[], name='cond_training_q')

    terminated_inputs = tf.placeholder(tf.float32, shape=(None), name='terminated_inputs')
    reward_inputs = tf.placeholder(tf.float32, shape=(None), name='rewards_inputs')

    # for summary text
    summary_text_tensor = tf.convert_to_tensor(str('summary_text'), preferred_dtype=string)
    tf.summary.text(name='summary_text',
                    tensor=summary_text_tensor,
                    collections=[DDPG_CFG.log_summary_keys])

    # instantiate actor, critic.
    actor = Actor(action_dim=action_space.shape[0],
                  online_state_inputs=online_state_inputs,
                  target_state_inputs=target_state_inputs,
                  input_normalizer=DDPG_CFG.actor_input_normalizer,
                  input_norm_params=DDPG_CFG.actor_input_norm_params,
                  n_fc_units=DDPG_CFG.actor_n_fc_units,
                  fc_activations=DDPG_CFG.actor_fc_activations,
                  fc_initializers=DDPG_CFG.actor_fc_initializers,
                  fc_normalizers=DDPG_CFG.actor_fc_normalizers,
                  fc_norm_params=DDPG_CFG.actor_fc_norm_params,
                  fc_regularizers=DDPG_CFG.actor_fc_regularizers,
                  output_layer_initializer=DDPG_CFG.actor_output_layer_initializer,
                  output_layer_regularizer=None,
                  output_normalizers=DDPG_CFG.actor_output_layer_normalizers,
                  output_norm_params=DDPG_CFG.actor_output_layer_norm_params,
                  output_bound_fns=DDPG_CFG.actor_output_bound_fns,
                  learning_rate=DDPG_CFG.actor_learning_rate,
                  is_training=is_training)

    critic = Critic(online_state_inputs=online_state_inputs,
                    target_state_inputs=target_state_inputs,
                    input_normalizer=DDPG_CFG.critic_input_normalizer,
                    input_norm_params=DDPG_CFG.critic_input_norm_params,
                    online_action_inputs_training_q=online_action_inputs_training_q,
                    online_action_inputs_training_policy=actor.online_action_outputs_tensor,
                    cond_training_q=cond_training_q,
                    target_action_inputs=actor.target_action_outputs_tensor,
                    n_fc_units=DDPG_CFG.critic_n_fc_units,
                    fc_activations=DDPG_CFG.critic_fc_activations,
                    fc_initializers=DDPG_CFG.critic_fc_initializers,
                    fc_normalizers=DDPG_CFG.critic_fc_normalizers,
                    fc_norm_params=DDPG_CFG.critic_fc_norm_params,
                    fc_regularizers=DDPG_CFG.critic_fc_regularizers,
                    output_layer_initializer=DDPG_CFG.critic_output_layer_initializer,
                    output_layer_regularizer=None,
                    learning_rate=DDPG_CFG.critic_learning_rate)

    # track updates.
    global_step_tensor = tf.train.create_global_step()

    # build whole graph
    copy_online_to_target_op, train_online_policy_op, train_online_q_op, update_target_op, saver \
        = build_ddpg_graph(actor, critic, reward_inputs, terminated_inputs, global_step_tensor)

    # we save the replay buffer data to files.
    replay_buffer = ReplayBuffer(buffer_size=DDPG_CFG.replay_buff_size,
                                 save_segment_size=DDPG_CFG.replay_buff_save_segment_size,
                                 save_path=DDPG_CFG.replay_buffer_file_path,
                                 seed=DDPG_CFG.random_seed)
    if DDPG_CFG.load_replay_buffer_set:
        replay_buffer.load(DDPG_CFG.replay_buffer_file_path)

    sess = tf.Session(graph=tf.get_default_graph())
    summary_writer = tf.summary.FileWriter(logdir=os.path.join(DDPG_CFG.log_dir, "train"),
                                           graph=sess.graph)
    log_summary_op = tf.summary.merge_all(key=DDPG_CFG.log_summary_keys)

    sess.run(fetches=[tf.global_variables_initializer()])

    # copy init params from online to target
    sess.run(fetches=[copy_online_to_target_op])

    # Load a previous checkpoint if it exists
    latest_checkpoint = tf.train.latest_checkpoint(DDPG_CFG.checkpoint_dir)
    if latest_checkpoint:
        tf.logging.info("==== Loading model checkpoint: {}".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)
    elif eval_mode:
        raise FileNotFoundError(
            '== in evaluation mode, we need check point file which can not be found.===')

    # ###### start training #########
    obs = train_env.reset()
    transition = preprocess_low_dim(obs)

    n_episodes = 1
    if not eval_mode:
        for step in range(1, DDPG_CFG.num_training_steps):
            # replace with new transition
            policy_out = sess.run(fetches=[actor.online_action_outputs_tensor],
                                  feed_dict={online_state_inputs: transition.next_state[np.newaxis, :],
                                             is_training: False})[0]
            transition = agent_action_fn(policy_out, replay_buffer, train_env)

            if step % 200 == 0:
                tf.logging.info(' +++++++++++++++++++ global_step:{} action:{}'
                                ' reward:{} term:{}'.format(step, transition.action,
                                                            transition.reward, transition.terminated))
            if step < 10:  # feed some transitions in buffer.
                continue

            # ++++ sample mini-batch and train. ++++
            state_batch, action_batch, reward_batch, next_state_batch, terminated_batch = \
                replay_buffer.sample_batch(DDPG_CFG.batch_size)

            # ---- 1. train policy. -----------
            sess.run(fetches=[train_online_policy_op],
                     feed_dict={online_state_inputs: state_batch,
                                cond_training_q: False,
                                online_action_inputs_training_q: action_batch,  # feed but not used.
                                is_training: True})

            # ---- 2. train q. --------------
            sess.run(fetches=[train_online_q_op],
                     feed_dict={online_state_inputs: state_batch,
                                cond_training_q: True,
                                online_action_inputs_training_q: action_batch,
                                target_state_inputs: next_state_batch,
                                reward_inputs: reward_batch,
                                terminated_inputs: terminated_batch,
                                is_training: True})

            # ----- 3. update target ---------
            sess.run(fetches=[update_target_op], feed_dict=None)

            # do evaluation after eval_freq steps:
            if step % DDPG_CFG.eval_freq == 0:  # and step > DDPG_CFG.eval_freq:
                evaluate(env=train_env,
                         num_eval_steps=DDPG_CFG.num_eval_steps,
                         preprocess_fn=preprocess_low_dim,
                         estimate_fn=lambda state: sess.run(
                             fetches=[actor.online_action_outputs_tensor],
                             feed_dict={online_state_inputs: state,
                                        is_training: False}),
                         summary_writer=summary_writer,
                         saver=saver,
                         sess=sess,
                         global_step=step,
                         log_summary_op=log_summary_op,
                         summary_text_tensor=summary_text_tensor)

            if transition.terminated:
                transition = preprocess_low_dim(train_env.reset())
                n_episodes += 1
                continue  # begin new episode
    else:  # eval mode
        evaluate(env=train_env,
                 num_eval_steps=DDPG_CFG.eval_steps_after_training,
                 preprocess_fn=preprocess_low_dim,
                 estimate_fn=lambda state: sess.run(
                     fetches=[actor.online_action_outputs_tensor],
                     feed_dict={online_state_inputs: state,
                                is_training: False}),
                 summary_writer=summary_writer,
                 saver=None,
                 sess=sess,
                 global_step=0,
                 log_summary_op=log_summary_op,
                 summary_text_tensor=summary_text_tensor)

    sess.close()
    train_env.close()
class Runner:
    def __init__(self, env, args):
        self.env = env
        if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if args.learn and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
                and args.alg.find('reinforce') == -1:  # these 3 algorithms are on-policy
            if args.use_per:
                self.buffer = PrioritizedReplayBuffer(args)
            else:
                self.buffer = ReplayBuffer(args)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []

        # used to save the plt figures and pkl files
        self.save_path = self.args.result_dir + '/' + args.map + '/'
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        self.file_name = self.save_path + str(args.env_name) + '_' + str(args.n_agents) + '_' + \
            str(args.map_size) + '_' + args.name_time

    def run(self, num):
        train_steps = 0
        episode_rewards = 0
        fixed_rewards = 0
        st = time.time()
        plot_rewards = []
        # print('Run {} start'.format(num))
        for epoch in range(self.args.n_epoch):
            # print('Run {}, train epoch {}'.format(num, epoch))
            # if epoch % self.args.evaluate_cycle == 0:
            #     win_rate, episode_reward = self.evaluate()
            #     # print('win_rate is ', win_rate)
            #     self.win_rates.append(win_rate)
            #     self.episode_rewards.append(episode_reward)
            #     print(episode_reward)
            #     # self.plt(num)

            episodes = []
            # collect self.args.n_episodes episodes
            for episode_idx in range(self.args.n_episodes):
                if self.args.use_ja:
                    if self.args.use_v1:
                        episode, episode_reward, rate, fixed_reward = \
                            self.rolloutWorker.generate_episode_ja_v2(episode_idx)
                    else:
                        episode, episode_reward, rate, fixed_reward = \
                            self.rolloutWorker.generate_episode_ja_v3(episode_idx)
                else:
                    episode, episode_reward, rate, fixed_reward = \
                        self.rolloutWorker.generate_episode(episode_idx)
                episodes.append(episode)
                episode_rewards += episode_reward
                fixed_rewards += fixed_reward
                plot_rewards.append(episode_reward)

            if epoch % self.args.evaluate_cycle == 0:
                t = time.time() - st
                st = time.time()
                epr = round(episode_rewards / self.args.evaluate_cycle, 2)
                fr = round(fixed_rewards / self.args.evaluate_cycle, 2)
                print('train epoch {}, reward {}, time {}, rate {}'.format(epoch, [epr, fr], t, rate))
                # wandb.log({"reward": epr, "test_reward": epr})
                episode_rewards = 0
                fixed_rewards = 0
                with open(self.file_name, 'wb') as fp:
                    pickle.dump(plot_rewards, fp)

            # every entry of an episode is a four-dimensional array of shape
            # (1, episode_len, n_agents, feature_dim); concatenate the entries of all episodes below
            episode_batch = episodes[0]
            episodes.pop(0)
            for episode in episodes:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate((episode_batch[key], episode[key]), axis=0)

            if self.args.alg.find('coma') > -1 or self.args.alg.find('central_v') > -1 \
                    or self.args.alg.find('reinforce') > -1:
                self.agents.train(episode_batch, train_steps, self.rolloutWorker.epsilon)
                train_steps += 1
            elif not self.args.load_model:
                self.buffer.store_episode(episode_batch)
                for train_step in range(self.args.train_steps):
                    # mini_batch = self.buffer.sample(min(self.buffer.current_size, self.args.batch_size))
                    # # print(mini_batch['terminated'])
                    # # print(train_steps)
                    # dq = self.agents.train(mini_batch, train_steps)
                    if self.args.use_per:
                        mini_batch, idxs = self.buffer.sample(
                            min(self.buffer.current_size, self.args.batch_size))
                        dq = self.agents.train(mini_batch, train_steps)
                        self.buffer.update_priorities(idxs, dq)
                    else:
                        mini_batch = self.buffer.sample(
                            min(self.buffer.current_size, self.args.batch_size))
                        dq = self.agents.train(mini_batch, train_steps)
                    train_steps += 1
        # self.plt(num)

    def evaluate(self):
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward, win_tag = self.rolloutWorker.generate_episode(epoch, evaluate=True)
            episode_rewards += episode_reward
            if win_tag:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch
def __init__(self, config):
    self.config = config
    self.network_freq = 125  # self.config.conf['HLC-frequency']
    self.reward_decay = 1.0
    self.reward_scale = config.conf['reward-scale']
    self.max_time_per_train_episode = 10  # self.config.conf['max-train-time']
    self.max_step_per_train_episode = int(self.max_time_per_train_episode * self.network_freq)
    self.max_time_per_test_episode = 10  # self.config.conf['max-test-time']  # 16
    self.max_step_per_test_episode = int(self.max_time_per_test_episode * self.network_freq)

    env_name = 'Walker2DBulletEnv-v0'  # 'AntBulletEnv-v0'  # 'Walker2DBulletEnv-v0'  # 'HumanoidBulletEnv-v0'
    self.env = gym.make(env_name)
    # self.env.render()
    print(self.env.observation_space)
    print(self.env.action_space)

    self.config.conf['state-dim'] = self.env.observation_space.shape[0]
    self.config.conf['action-dim'] = self.env.action_space.shape[0]
    self.config.conf['actor-logstd-initial'] = np.zeros((1, self.config.conf['action-dim']))
    self.config.conf['actor-logstd-bounds'] = np.ones((2, self.config.conf['action-dim']))
    self.config.conf['actor-output-bounds'] = np.ones((2, self.config.conf['action-dim']))
    self.config.conf['actor-output-bounds'][0][:] = -1 * np.ones(self.config.conf['action-dim'],)
    self.config.conf['actor-output-bounds'][1][:] = 1 * np.ones(self.config.conf['action-dim'],)
    self.config.conf['actor-logstd-initial'] *= np.log(1.0)  # np.log(min(std*0.25, 1.0))  # 0.5
    self.config.conf['actor-logstd-bounds'][0] *= np.log(0.2)
    self.config.conf['actor-logstd-bounds'][1] *= np.log(1.0)  # 0.6

    self.agent = Agent(self.env, self.config)
    self.episode_count = 0
    self.step_count = 0
    self.train_iter_count = 0
    self.best_reward = 0
    self.best_episode = 0
    self.best_train_iter = 0

    # load weight from previous network
    # dir_path = 'record/2017_12_04_15.20.44/no_force'  # '2017_05_29_18.23.49/with_force'
    # create new network
    dir_path = 'TRPO/record/' + '3D/' + env_name + '/' + datetime.now().strftime('%Y_%m_%d_%H.%M.%S')
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    if not os.path.exists(dir_path + '/saved_actor_networks'):
        os.makedirs(dir_path + '/saved_actor_networks')
    if not os.path.exists(dir_path + '/saved_critic_networks'):
        os.makedirs(dir_path + '/saved_critic_networks')
    self.logging = logger(dir_path)
    config.save_configuration(dir_path)
    config.record_configuration(dir_path)
    config.print_configuration()
    self.agent.load_weight(dir_path)
    self.dir_path = dir_path

    self.on_policy_paths = []
    self.off_policy_paths = []
    self.buffer = ReplayBuffer(self.config.conf['replay-buffer-size'])

    self.force = [0, 0, 0]
    self.force_chest = [0, 0, 0]  # max(0, force_chest[1] - 300 * 1.0 / EXPLORE)]
    self.force_pelvis = [0, 0, 0]
def __init__(self, config):
    self.config = config
    self.PD_freq = self.config.conf['LLC-frequency']
    self.Physics_freq = self.config.conf['Physics-frequency']
    self.network_freq = self.config.conf['HLC-frequency']
    self.sampling_skip = int(self.PD_freq / self.network_freq)
    self.reward_decay = 1.0
    self.reward_scale = config.conf['reward-scale']
    self.reward_scale = self.reward_scale / float(self.sampling_skip)  # /10.0  # normalizing reward to 1
    self.max_time_per_train_episode = self.config.conf['max-train-time']
    self.max_step_per_train_episode = int(self.max_time_per_train_episode * self.network_freq)
    self.max_time_per_test_episode = self.config.conf['max-test-time']  # 16
    self.max_step_per_test_episode = int(self.max_time_per_test_episode * self.network_freq)

    self.train_external_force_disturbance = True
    if self.train_external_force_disturbance == True:
        path_str = 'with_external_force_disturbance/'
    else:
        path_str = 'without_external_force_disturbance/'
    self.test_external_force_disturbance = True

    self.env = Valkyrie(max_time=self.max_time_per_train_episode,
                        renders=False,
                        initial_gap_time=0.5,
                        PD_freq=self.PD_freq,
                        Physics_freq=self.Physics_freq,
                        Kp=config.conf['Kp'],
                        Kd=config.conf['Kd'],
                        bullet_default_PD=config.conf['bullet-default-PD'],
                        controlled_joints_list=config.conf['controlled-joints'])
    config.conf['state-dim'] = self.env.stateNumber

    self.agent = Agent(self.env, self.config)
    self.episode_count = 0
    self.step_count = 0
    self.train_iter_count = 0
    self.best_reward = 0
    self.best_episode = 0
    self.best_train_iter = 0

    self.control = Control(self.config, self.env)

    # load weight from previous network
    # dir_path = 'record/2017_12_04_15.20.44/no_force'  # '2017_05_29_18.23.49/with_force'
    # create new network
    dir_path = 'TRPO/record/' + '3D_push/' + path_str + datetime.now().strftime('%Y_%m_%d_%H.%M.%S')
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    if not os.path.exists(dir_path + '/saved_actor_networks'):
        os.makedirs(dir_path + '/saved_actor_networks')
    if not os.path.exists(dir_path + '/saved_critic_networks'):
        os.makedirs(dir_path + '/saved_critic_networks')
    self.logging = logger(dir_path)
    config.save_configuration(dir_path)
    config.record_configuration(dir_path)
    config.print_configuration()
    self.agent.load_weight(dir_path)
    self.dir_path = dir_path

    self.on_policy_paths = []
    self.off_policy_paths = []
    self.buffer = ReplayBuffer(self.config.conf['replay-buffer-size'])

    self.force = [0, 0, 0]
    self.force_chest = [0, 0, 0]  # max(0, force_chest[1] - 300 * 1.0 / EXPLORE)]
    self.force_pelvis = [0, 0, 0]