def __init__(self, state_size, action_size, seed, n_hidden_units=128, n_layers=3):
    '''
    Create the local/target actor and critic networks, their optimizers,
    the exploration noise process and the replay buffer.

    Params:
        state_size: forwarded to the Actor and Critic constructors
        action_size: forwarded to the Actor, Critic and OUNoise constructors
        seed: seeds Python's `random` module and is forwarded to every
            component created here
        n_hidden_units: accepted but not used by this constructor
        n_layers: accepted but not used by this constructor
    '''
    self.state_size = state_size
    self.action_size = action_size
    # random.seed() returns None, so seed the RNG first and keep the value
    # itself instead of storing the (always None) return value.
    random.seed(seed)
    self.seed = seed

    # actor (local network is trained, target network is soft-updated)
    self.actor = Actor(state_size, action_size, seed).to(device)
    self.actor_target = Actor(state_size, action_size, seed).to(device)
    self.actor_opt = optim.Adam(self.actor.parameters(), lr=1e-4)

    # critic
    self.critic = Critic(state_size, action_size, seed).to(device)
    self.critic_target = Critic(state_size, action_size, seed).to(device)
    self.critic_opt = optim.Adam(self.critic.parameters(), lr=3e-4,
                                 weight_decay=0.0001)

    # exploration noise added to actions
    self.noise = OUNoise(action_size, seed)

    # experience replay
    self.replay = ReplayBuffer(seed)
def init_policy(args, env):
    """Build a TD3Policy (one actor, two critics) configured from ``args``.

    Each network gets its own Adam optimizer; the actor uses
    ``args.actor_lr`` and both critics use ``args.critic_lr``.
    ``env`` is accepted but not used here.
    """
    dev = args.device
    net_kwargs = dict(layer=None,
                      state_shape=args.state_shape,
                      action_shape=args.action_shape,
                      device=dev)

    actor = Actor(action_range=args.action_range, **net_kwargs).to(dev)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)

    # Build the twin critics one after the other, each with its optimizer.
    critic_pairs = []
    for _ in range(2):
        q_net = Critic(**net_kwargs).to(dev)
        q_optim = torch.optim.Adam(q_net.parameters(), lr=args.critic_lr)
        critic_pairs.append((q_net, q_optim))
    (critic1, critic1_optim), (critic2, critic2_optim) = critic_pairs

    exploration = GaussianNoise(sigma=args.exploration_noise)
    return TD3Policy(
        actor, actor_optim,
        critic1, critic1_optim,
        critic2, critic2_optim,
        args.tau, args.gamma, exploration,
        args.policy_noise, args.update_actor_freq, args.noise_clip,
        args.action_range,
        reward_normalization=args.rew_norm,
        ignore_done=args.ignore_done,
        estimation_step=args.n_step)
def test_a2c(args=get_args()):
    """Train an A2C policy on ``args.task`` and assert the reward threshold.

    Builds vectorized train/test environments, a shared feature ``Net``
    with actor and critic heads, runs ``onpolicy_trainer`` and asserts that
    the best reward reaches ``env.spec.reward_threshold``. When executed as
    a script it also prints the result and rolls out one episode.
    """
    torch.set_num_threads(1)  # for poor CPU
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # you can also use tianshou.env.SubprocVectorEnv
    train_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    test_envs = VectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.layer_num, args.state_shape, device=args.device)
    actor = Actor(net, args.action_shape).to(args.device)
    critic = Critic(net).to(args.device)
    # The same `net` is passed to both heads. If each registers it as a
    # submodule (presumably; confirm against Actor/Critic), concatenating
    # the two parameter lists would give the optimizer every shared tensor
    # twice. De-duplicate while keeping the original order.
    params = list(dict.fromkeys(
        list(actor.parameters()) + list(critic.parameters())))
    optim = torch.optim.Adam(params, lr=args.lr)
    dist = torch.distributions.Categorical
    policy = A2CPolicy(
        actor, critic, optim, dist, args.gamma, gae_lambda=args.gae_lambda,
        vf_coef=args.vf_coef, ent_coef=args.ent_coef,
        max_grad_norm=args.max_grad_norm)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    log_path = os.path.join(args.logdir, args.task, 'a2c')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = onpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
        args.test_num, args.batch_size, stop_fn=stop_fn, save_fn=save_fn,
        writer=writer)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def test_ddpg(args=get_args()):
    """Train DDPG on ``args.task`` and assert the reward threshold is met.

    The test environments run in subprocesses, the training environments
    in-process. When executed as a script, the result is printed and a
    single episode is rolled out afterwards.
    """
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        # override the threshold used by stop_fn for this task
        env.spec.reward_threshold = -250
    obs_space, act_space = env.observation_space, env.action_space
    args.state_shape = obs_space.shape or obs_space.n
    args.action_shape = act_space.shape or act_space.n
    args.max_action = act_space.high[0]

    def make_env():
        return gym.make(args.task)

    train_envs = VectorEnv([make_env for _ in range(args.training_num)])
    test_envs = SubprocVectorEnv([make_env for _ in range(args.test_num)])

    # seed everything before any network is created
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)

    # model
    dev = args.device
    actor = Actor(args.layer_num, args.state_shape, args.action_shape,
                  args.max_action, dev).to(dev)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic = Critic(args.layer_num, args.state_shape, args.action_shape,
                    dev).to(dev)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    action_bounds = [act_space.low[0], act_space.high[0]]
    policy = DDPGPolicy(
        actor, actor_optim, critic, critic_optim,
        args.tau, args.gamma, args.exploration_noise, action_bounds,
        reward_normalization=True, ignore_done=True)

    # collectors
    replay = ReplayBuffer(args.buffer_size)
    train_collector = Collector(policy, train_envs, replay)
    test_collector = Collector(policy, test_envs)

    # log
    writer = SummaryWriter(args.logdir + '/' + 'ddpg')

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector,
        args.epoch, args.step_per_epoch, args.collect_per_step,
        args.test_num, args.batch_size,
        stop_fn=stop_fn, writer=writer, task=args.task)
    assert stop_fn(result['best_reward'])
    for used_collector in (train_collector, test_collector):
        used_collector.close()

    if __name__ == '__main__':
        pprint.pprint(result)
        # Roll out one episode to watch the trained policy.
        watch_env = gym.make(args.task)
        watcher = Collector(policy, watch_env)
        result = watcher.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        watcher.close()
def __init__(self, n_feature, n_action, lr_A=0.001, lr_C=0.01, GAMMA=0.1):
    """Create the actor and critic networks with one Adam optimizer each.

    n_feature and n_action are forwarded to the Actor (the Critic receives
    only n_feature); lr_A / lr_C are the actor / critic learning rates and
    GAMMA is stored for later use.
    """
    self.n_feature, self.n_action, self.GAMMA = n_feature, n_action, GAMMA

    # networks: the actor is built before the critic
    actor_net = Actor(n_feature, n_action)
    critic_net = Critic(n_feature)
    self.actor, self.critic = actor_net, critic_net

    # one optimizer per network
    self.optimizer_actor = optim.Adam(actor_net.parameters(), lr=lr_A)
    self.optimizer_critic = optim.Adam(critic_net.parameters(), lr=lr_C)

    # histories, filled in during training
    self.cost_his, self.value_his = [], []
def test(args=get_args()):
    """Render ten QuadcopterEnv episodes driven by the saved 'lqr' Actor.

    The weights are read from ``<args.logdir>/lqr/policy.pth``;
    ``args.model_path`` is set to that directory as a side effect.
    """
    env = QuadcopterEnv()
    dev = args.device
    model = Actor(None, env.observation_space.shape, env.action_space.shape,
                  [-1, 1], dev).to(dev)

    args.model_path = os.path.join(args.logdir, 'lqr')
    weights_file = os.path.join(args.model_path, 'policy.pth')
    model.load_state_dict(torch.load(weights_file, map_location=dev))

    for _ in range(10):
        obs = env.reset()
        env.render()
        while True:
            # The model's first output, with the batch dimension stripped,
            # is used as the action.
            batched_obs = obs.reshape((1, -1))
            act = model(batched_obs)[0].detach().cpu().numpy()[0]
            obs, _reward, done, _info = env.step(act)
            env.render()
            if done:
                break
def test(args=get_args()):
    """Load the saved DDPG policy and render one episode with its actor.

    The checkpoint is read from ``<args.logdir>/<args.task>/ddpg/policy.pth``.
    The networks are rebuilt with the same layer sizes as hard-coded below
    before the state dict is loaded.
    """
    torch.set_num_threads(1)  # we just need only one thread for NN
    env = gym_make()
    model_path = os.path.join(args.logdir, args.task, 'ddpg/policy.pth')
    layer = [1024, 512, 512, 512]
    # Prefer the GPU, but fall back to the CPU so a checkpoint can still be
    # replayed on machines without CUDA (map_location below remaps tensors).
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    state_shape = env.observation_space.shape or env.observation_space.n
    action_shape = env.action_space.shape or env.action_space.n
    action_range = [env.action_space.low, env.action_space.high]
    actor = Actor(
        layer, state_shape, action_shape,
        action_range, device
    ).to(device)
    critic = Critic(
        layer, state_shape, action_shape, device
    ).to(device)
    # The optimizers are only needed to construct the policy object.
    actor_optim = torch.optim.Adam(actor.parameters())
    critic_optim = torch.optim.Adam(critic.parameters())
    policy = DDPGPolicy(
        actor, actor_optim, critic, critic_optim,
        action_range=action_range)
    policy.load_state_dict(torch.load(model_path, map_location=device))
    obs = env.reset()
    env.render()
    print(env.goal)
    done = False
    while not done:
        action, _ = policy.actor(obs.reshape(1, -1), eps=0.01)
        action = action.detach().cpu().numpy()[0]
        obs, _reward, done, _info = env.step(action)
        env.render()
def test(args=get_args()):
    """Render ten DubinEnv episodes driven by the saved 'lqr' Actor.

    Only the 'dynamics' entry of each observation is fed to the model.
    The weights are read from ``<args.logdir>/lqr/policy.pth``;
    ``args.model_path`` is set to that directory as a side effect.
    """
    env = DubinEnv()
    dev = args.device
    dynamics_shape = env.observation_space['dynamics'].shape
    model = Actor(None, dynamics_shape, env.action_space.shape,
                  [-1, 1], dev).to(dev)

    args.model_path = os.path.join(args.logdir, 'lqr')
    weights_file = os.path.join(args.model_path, 'policy.pth')
    model.load_state_dict(torch.load(weights_file, map_location=dev))

    for _ in range(10):
        env.reset()
        obs = env._obs()  # the observation is fetched explicitly after reset
        env.render()
        while True:
            features = obs['dynamics'].reshape((1, -1))
            act = model(features)[0].detach().cpu().numpy()[0]
            obs, _reward, done, _info = env.step(act)
            env.render()
            if done:
                break
def init_policy(args, env):
    """Build a PPOPolicy with a DiagGaussian action distribution.

    Actor and critic are separate networks trained by one shared Adam
    optimizer. Every ``torch.nn.Linear`` layer gets orthogonal weights and
    zero biases. ``env`` is accepted but not used here.
    """
    dev = args.device
    actor = Actor(layer=None,
                  state_shape=args.state_shape,
                  action_shape=args.action_shape,
                  action_range=args.action_range,
                  device=dev).to(dev)
    critic = Critic(layer=None,
                    state_shape=args.state_shape,
                    action_shape=args.action_shape,
                    device=dev).to(dev)

    # orthogonal initialization, actor layers first, then critic layers
    for network in (actor, critic):
        for module in network.modules():
            if not isinstance(module, torch.nn.Linear):
                continue
            torch.nn.init.orthogonal_(module.weight)
            torch.nn.init.zeros_(module.bias)

    joint_params = [*actor.parameters(), *critic.parameters()]
    optim = torch.optim.Adam(joint_params, lr=args.lr)

    # dual_clip is intentionally not passed: per the original note it caused
    # a monotonically increasing log_std. action_range is not passed either:
    # with clipped actions PPO did not converge.
    ppo_options = dict(
        max_grad_norm=args.max_grad_norm,
        eps_clip=args.eps_clip,
        vf_coef=args.vf_coef,
        ent_coef=args.ent_coef,
        reward_normalization=args.rew_norm,
        value_clip=args.value_clip,
        gae_lambda=args.gae_lambda)
    return PPOPolicy(actor, critic, optim, DiagGaussian, args.gamma,
                     **ppo_options)
def load_model_ddpg(model_path, user_embeddings_path, item_embeddings_path,
                    input_dim, action_dim, hidden_size, device):
    """Rebuild an Actor from saved embeddings and weights, in eval mode.

    Both embedding files are read with ``np.load``; the state dict at
    ``model_path`` is loaded with ``map_location=device``.
    NOTE(review): the model itself is never moved to ``device`` here.
    Confirm that callers expect that.
    """
    loaded = []
    for path in (user_embeddings_path, item_embeddings_path):
        with open(path, "rb") as handle:
            loaded.append(np.load(handle))
    user_embeddings, item_embeddings = loaded

    model = Actor(input_dim, action_dim, hidden_size,
                  user_embeddings, item_embeddings)
    weights = torch.load(model_path, map_location=device)
    model.load_state_dict(weights)
    model.eval()
    return model
class DDPGAgent:
    '''
    DDPG agent holding local/target actor and critic networks, an
    exploration noise process and a replay buffer.
    '''

    def __init__(self, state_size, action_size, seed, n_hidden_units=128, n_layers=3):
        '''
        Create networks, optimizers, noise process and replay buffer.

        Params:
            state_size: forwarded to the Actor and Critic constructors
            action_size: forwarded to the Actor, Critic and OUNoise constructors
            seed: seeds Python's `random` module and is forwarded to every
                component created here
            n_hidden_units: accepted but not used by this constructor
            n_layers: accepted but not used by this constructor
        '''
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed first and keep the value itself.
        random.seed(seed)
        self.seed = seed

        # actor
        self.actor = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_opt = optim.Adam(self.actor.parameters(), lr=1e-4)

        # critic
        self.critic = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_opt = optim.Adam(self.critic.parameters(), lr=3e-4,
                                     weight_decay=0.0001)

        # will add noise
        self.noise = OUNoise(action_size, seed)

        # experience replay
        self.replay = ReplayBuffer(seed)

    def act(self, state, noise=True):
        '''
        Returns actions taken.

        Params:
            state (numpy array): current state; a batch dimension is added
            noise (bool): add a sample from the noise process when True
        Returns the action clipped to [-1, 1].
        '''
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # evaluate in eval mode without tracking gradients, then switch back
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).cpu().data.numpy()
        self.actor.train()

        if noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        ''' Reset the exploration noise process. '''
        self.noise.reset()

    def step(self, state, action, reward, next_state, done):
        '''
        Save experiences into replay and sample if replay contains enough
        experiences
        '''
        self.replay.add(state, action, reward, next_state, done)

        if self.replay.len() > self.replay.batch_size:
            experiences = self.replay.sample()
            self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):
        '''
        Update policy and value parameters using given batch of experience
        tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, n_s, done) tuples
            gamma (float): discount factor
        '''
        states, actions, rewards, next_states, dones = experiences

        # update critic:
        # The bootstrap target is a constant for the critic loss, so build it
        # without autograd. This also keeps gradients out of the target
        # networks.
        with torch.no_grad():
            next_actions = self.actor_target(next_states)
            next_Q_targets = self.critic_target(next_states, next_actions)
            # use the `gamma` argument, not the module-level constant
            Q_targets = rewards + (gamma * next_Q_targets * (1 - dones))

        # compute critic loss
        Q_expected = self.critic(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # minimize loss; the graph is not reused, so it need not be retained
        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        # update actor:
        # compute actor loss from a fresh forward pass
        action_predictions = self.actor(states)
        actor_loss = -self.critic(states, action_predictions).mean()

        # minimize actor loss
        self.actor_opt.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        # update target networks
        self.soft_update(self.critic, self.critic_target, TAU)
        self.soft_update(self.actor, self.actor_target, TAU)

    def soft_update(self, local, target, tau):
        '''
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params:
            local: PyTorch model (weights will be copied from)
            target: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        '''
        for target_param, local_param in zip(target.parameters(),
                                             local.parameters()):
            target_param.data.copy_(
                tau * local_param.data + (1.0 - tau) * target_param.data)
def train(args=get_args()):
    """Fit an Actor to recorded trajectories with a mean-squared-error loss.

    Reads ``states`` / ``acts`` arrays from the pickle file
    ``lqr_quadcopter`` in the working directory, shuffles them once and
    trains for ``args.epoch`` epochs in mini-batches of ``args.batch_size``.
    The per-epoch loss is logged to ``<args.logdir>/lqr``. Every 50 epochs
    the model is checkpointed to ``policy.pth`` there if the summed epoch
    loss is the best seen so far.
    """
    # NOTE: unpickling executes arbitrary code; only load trusted files.
    with open('lqr_quadcopter', 'rb') as traj_file:
        trajs = pickle.load(traj_file)
    states, acts = trajs['states'], trajs['acts']
    print(*(np.max(abs(states[:, k])) for k in range(3)),
          *(np.max(abs(acts[:, k])) for k in range(4)))
    env = QuadcopterEnv()
    assert len(states) == len(acts), 'X, y sizes mismatch'
    n_samples = len(states)
    print(n_samples)

    # shuffle
    indices = np.arange(n_samples, dtype=int)
    np.random.shuffle(indices)

    # convert training data into tensors
    states = torch.tensor(states[indices], device=args.device,
                          dtype=torch.float)
    acts = torch.tensor(acts[indices], device=args.device,
                        dtype=torch.float)

    model = Actor(None, env.observation_space.shape, env.action_space.shape,
                  [-1, 1], args.device).to(args.device)
    loss = nn.MSELoss()
    optimizer = torch.optim.Adagrad(model.parameters())

    # Train the Models
    best_epoch = None
    best_loss = None
    args.model_path = os.path.join(args.logdir, 'lqr')
    writer = SummaryWriter(log_dir=args.model_path)
    try:
        for epoch in range(1, args.epoch + 1):
            loss_train = 0.0
            for start in range(0, n_samples, args.batch_size):
                # PyTorch accumulates gradients, so clear them per batch.
                model.zero_grad()
                # Slicing clamps at the end, so the last batch may be short.
                stop = start + args.batch_size
                b_states, b_acts = states[start:stop], acts[start:stop]
                c_acts = model(b_states)[0]
                loss_ = loss(c_acts, b_acts)
                loss_.backward()
                optimizer.step()
                # Accumulate a plain float; summing the tensors would keep
                # every batch's autograd graph alive for the whole epoch.
                loss_train += loss_.item()
            writer.add_scalar('Loss/train',
                              loss_train / (n_samples / args.batch_size),
                              epoch)

            # Save the models
            if epoch % 50 == 0:
                print("loss: {} in #{}".format(loss_train, epoch))
                if best_epoch is None or loss_train < best_loss:
                    best_loss, best_epoch = loss_train, epoch
                    print('best_loss: {} in #{}'.format(best_loss,
                                                        best_epoch))
                if best_epoch == epoch:
                    torch.save(model.state_dict(),
                               os.path.join(args.model_path, 'policy.pth'))
    finally:
        # flush and release the event file even if training fails
        writer.close()
def train(args=get_args()):
    """Train a DDPG policy on the environment produced by ``gym_make``.

    Shapes, action range and layer sizes are written onto ``args``. The
    policy is checkpointed through ``save_fn`` to
    ``<args.logdir>/<args.task>/ddpg/policy.pth``. When executed as a
    script, one episode is rolled out afterwards.
    """
    torch.set_num_threads(1)  # we just need only one thread for NN
    env = gym_make()
    obs_space, act_space = env.observation_space, env.action_space
    args.state_shape = obs_space.shape or obs_space.n
    args.action_shape = act_space.shape or act_space.n
    args.action_range = [act_space.low[0], act_space.high[0]]
    args.layer = [1024, 512, 512, 512]

    def make_env():
        return gym_make()

    train_envs = VectorEnv([make_env for _ in range(args.training_num)])
    test_envs = VectorEnv([make_env for _ in range(args.test_num)])

    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)

    # model
    dev = args.device
    actor = Actor(args.layer, args.state_shape, args.action_shape,
                  args.action_range, dev).to(dev)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic = Critic(args.layer, args.state_shape, args.action_shape,
                    dev).to(dev)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    policy = DDPGPolicy(
        actor, actor_optim, critic, critic_optim,
        args.tau, args.gamma, args.exploration_noise,
        args.action_range,
        reward_normalization=args.rew_norm, ignore_done=True)

    # collector
    replay = ReplayBuffer(args.buffer_size)
    train_collector = Collector(policy, train_envs, replay)
    test_collector = Collector(policy, test_envs)

    # log
    log_path = os.path.join(args.logdir, args.task, 'ddpg')
    writer = SummaryWriter(log_path)
    # Checkpoint location. Resuming from an existing file is not enabled.
    model_path = os.path.join(log_path, 'policy.pth')

    def save_fn(policy):
        torch.save(policy.state_dict(), model_path)

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector,
        args.epoch, args.step_per_epoch, args.collect_per_step,
        args.test_num, args.batch_size,
        save_fn=save_fn, writer=writer)
    for used_collector in (train_collector, test_collector):
        used_collector.close()

    if __name__ == '__main__':
        # Roll out one episode to watch the trained policy.
        watch_env = gym_make()
        watcher = Collector(policy, watch_env)
        result = watcher.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        watcher.close()
def test_sac_with_il(args=get_args()):
    """Train SAC, then train an imitation policy on the same collector.

    Phase 1 trains a SACPolicy and asserts the reward threshold is met.
    Phase 2 builds an ImitationPolicy around a fresh Actor, resets the
    training collector and trains again with a fifth of the steps per
    epoch. Both phases use the same ``stop_fn`` / ``save_fn`` / writer.
    """
    torch.set_num_threads(1)  # we just need only one thread for NN
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -250
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    args.max_action = env.action_space.high[0]

    # you can also use tianshou.env.SubprocVectorEnv
    def make_env():
        return gym.make(args.task)

    train_envs = VectorEnv([make_env for _ in range(args.training_num)])
    test_envs = VectorEnv([make_env for _ in range(args.test_num)])

    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)

    # model: one stochastic actor and two critics, each with its own Adam
    dev = args.device
    actor = ActorProb(args.layer_num, args.state_shape, args.action_shape,
                      args.max_action, dev).to(dev)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic_pairs = []
    for _ in range(2):
        q_net = Critic(args.layer_num, args.state_shape, args.action_shape,
                       dev).to(dev)
        q_optim = torch.optim.Adam(q_net.parameters(), lr=args.critic_lr)
        critic_pairs.append((q_net, q_optim))
    (critic1, critic1_optim), (critic2, critic2_optim) = critic_pairs
    action_bounds = [env.action_space.low[0], env.action_space.high[0]]
    policy = SACPolicy(
        actor, actor_optim, critic1, critic1_optim, critic2, critic2_optim,
        args.tau, args.gamma, args.alpha, action_bounds,
        reward_normalization=args.rew_norm,
        ignore_done=args.ignore_done,
        estimation_step=args.n_step)

    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)

    # log
    log_path = os.path.join(args.logdir, args.task, 'sac')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        # reads whatever `env` is bound to when called
        return x >= env.spec.reward_threshold

    callbacks = dict(stop_fn=stop_fn, save_fn=save_fn, writer=writer)

    # trainer (SAC phase)
    result = offpolicy_trainer(
        policy, train_collector, test_collector,
        args.epoch, args.step_per_epoch, args.collect_per_step,
        args.test_num, args.batch_size, **callbacks)
    assert stop_fn(result['best_reward'])
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()

    # here we define an imitation collector with a trivial policy
    if args.task == 'Pendulum-v0':
        env.spec.reward_threshold = -300  # lower the goal
    net = Actor(1, args.state_shape, args.action_shape,
                args.max_action, dev).to(dev)
    optim = torch.optim.Adam(net.parameters(), lr=args.il_lr)
    il_policy = ImitationPolicy(net, optim, mode='continuous')
    il_test_collector = Collector(il_policy, test_envs)
    train_collector.reset()

    # trainer (imitation phase, fewer steps per epoch)
    result = offpolicy_trainer(
        il_policy, train_collector, il_test_collector,
        args.epoch, args.step_per_epoch // 5, args.collect_per_step,
        args.test_num, args.batch_size, **callbacks)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    il_test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(il_policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def test_ppo(args=get_args()):
    """Train a PPO policy on ``args.task`` and assert the reward threshold.

    Uses subprocess-based vectorized environments, a shared feature ``Net``
    with actor and critic heads and a Categorical action distribution.
    When executed as a script it also prints the result and rolls out one
    episode.
    """
    env = gym.make(args.task)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    train_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.training_num)])
    test_envs = SubprocVectorEnv(
        [lambda: gym.make(args.task) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # model
    net = Net(args.layer_num, args.state_shape, device=args.device)
    actor = Actor(net, args.action_shape).to(args.device)
    critic = Critic(net).to(args.device)
    # The same `net` is passed to both heads. If each registers it as a
    # submodule (presumably; confirm against Actor/Critic), concatenating
    # the two parameter lists would give the optimizer every shared tensor
    # twice. De-duplicate while keeping the original order.
    params = list(dict.fromkeys(
        list(actor.parameters()) + list(critic.parameters())))
    optim = torch.optim.Adam(params, lr=args.lr)
    dist = torch.distributions.Categorical
    policy = PPOPolicy(
        actor, critic, optim, dist, args.gamma,
        max_grad_norm=args.max_grad_norm,
        eps_clip=args.eps_clip,
        vf_coef=args.vf_coef,
        ent_coef=args.ent_coef,
        action_range=None)
    # collector
    train_collector = Collector(
        policy, train_envs, ReplayBuffer(args.buffer_size))
    test_collector = Collector(policy, test_envs)
    # log
    writer = SummaryWriter(args.logdir + '/' + 'ppo')

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = onpolicy_trainer(
        policy, train_collector, test_collector, args.epoch,
        args.step_per_epoch, args.collect_per_step, args.repeat_per_collect,
        args.test_num, args.batch_size, stop_fn=stop_fn, writer=writer,
        task=args.task)
    assert stop_fn(result['best_reward'])
    train_collector.close()
    test_collector.close()
    if __name__ == '__main__':
        pprint.pprint(result)
        # Let's watch its performance!
        env = gym.make(args.task)
        collector = Collector(policy, env)
        result = collector.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        collector.close()
def test_ddpg(args=get_args()):
    """Train DDPG with Gaussian exploration noise and assert the threshold.

    The policy is checkpointed through ``save_fn`` to
    ``<args.logdir>/<args.task>/ddpg/policy.pth``. When executed as a
    script, the result is printed and one episode is rolled out afterwards.
    """
    torch.set_num_threads(1)  # we just need only one thread for NN
    env = gym.make(args.task)
    if args.task == 'Pendulum-v0':
        # override the threshold used by stop_fn for this task
        env.spec.reward_threshold = -250
    obs_space, act_space = env.observation_space, env.action_space
    args.state_shape = obs_space.shape or obs_space.n
    args.action_shape = act_space.shape or act_space.n
    args.max_action = act_space.high[0]

    # you can also use tianshou.env.SubprocVectorEnv
    def make_env():
        return gym.make(args.task)

    train_envs = VectorEnv([make_env for _ in range(args.training_num)])
    test_envs = VectorEnv([make_env for _ in range(args.test_num)])

    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)

    # model
    dev = args.device
    actor = Actor(args.layer_num, args.state_shape, args.action_shape,
                  args.max_action, dev).to(dev)
    actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic = Critic(args.layer_num, args.state_shape, args.action_shape,
                    dev).to(dev)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=args.critic_lr)
    exploration = GaussianNoise(sigma=args.exploration_noise)
    action_bounds = [act_space.low[0], act_space.high[0]]
    policy = DDPGPolicy(
        actor, actor_optim, critic, critic_optim,
        args.tau, args.gamma, exploration, action_bounds,
        reward_normalization=args.rew_norm,
        ignore_done=args.ignore_done,
        estimation_step=args.n_step)

    # collector
    replay = ReplayBuffer(args.buffer_size)
    train_collector = Collector(policy, train_envs, replay)
    test_collector = Collector(policy, test_envs)

    # log
    log_path = os.path.join(args.logdir, args.task, 'ddpg')
    writer = SummaryWriter(log_path)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(x):
        return x >= env.spec.reward_threshold

    # trainer
    result = offpolicy_trainer(
        policy, train_collector, test_collector,
        args.epoch, args.step_per_epoch, args.collect_per_step,
        args.test_num, args.batch_size,
        stop_fn=stop_fn, save_fn=save_fn, writer=writer)
    assert stop_fn(result['best_reward'])
    for used_collector in (train_collector, test_collector):
        used_collector.close()

    if __name__ == '__main__':
        pprint.pprint(result)
        # Roll out one episode to watch the trained policy.
        watch_env = gym.make(args.task)
        watcher = Collector(policy, watch_env)
        result = watcher.collect(n_episode=1, render=args.render)
        print(f'Final reward: {result["rew"]}, length: {result["len"]}')
        watcher.close()
class Actor_Critic:
    """One-step actor-critic learner with separate actor and critic networks.

    The critic is trained on a one-step TD target and the actor on the
    policy-gradient loss weighted by the TD error the critic returns.
    """

    def __init__(self, n_feature, n_action, lr_A=0.001, lr_C=0.01, GAMMA=0.1):
        """Create both networks and one Adam optimizer for each.

        n_feature / n_action are forwarded to the Actor (the Critic receives
        only n_feature); lr_A / lr_C are the actor / critic learning rates
        and GAMMA is the discount factor used in the TD target.
        """
        self.n_feature = n_feature
        self.n_action = n_action
        self.GAMMA = GAMMA

        self.actor = Actor(self.n_feature, self.n_action)
        self.critic = Critic(self.n_feature)

        self.optimizer_actor = optim.Adam(params=self.actor.parameters(),
                                          lr=lr_A)
        self.optimizer_critic = optim.Adam(params=self.critic.parameters(),
                                           lr=lr_C)

        self.cost_his = []   # critic loss per critic_learn call
        self.value_his = []  # actor loss per actor_learn call

    def actor_learn(self, s, a, td_error):
        """Take one policy-gradient step for action index ``a`` in state ``s``.

        ``td_error`` weights the negative log-probability of the action.
        Returns the loss tensor, which is also stored as ``self.exp_v``.
        """
        s = torch.Tensor(s[np.newaxis, :])  # batch of one
        action_prob = self.actor(s)  # indexed below as [batch, action]
        log_prob = torch.log(action_prob[0, a])  # a is an action index
        self.exp_v = torch.mean(-1 * log_prob * td_error)

        self.optimizer_actor.zero_grad()
        self.exp_v.backward()
        self.optimizer_actor.step()
        self.value_his.append(self.exp_v.item())
        return self.exp_v

    def choose_action(self, s):
        """Sample an action index from the actor's output for state ``s``."""
        s = torch.Tensor(s[np.newaxis, :])
        # Sampling needs no gradients, so skip building the autograd graph.
        with torch.no_grad():
            probs = self.actor(s)
        return np.random.choice(range(probs.shape[1]),
                                p=probs.numpy().ravel())

    def critic_learn(self, s, r, s_):
        """Take one TD(0) step on the critic and return the signed TD error.

        The bootstrap target ``r + GAMMA * V(s_)`` is treated as a constant,
        so no gradient flows through ``V(s_)``. The squared error is
        minimized and recorded in ``cost_his``. The returned value is the
        signed error ``target - V(s)`` computed before the update, so that
        ``actor_learn`` can both raise and lower action probabilities. A
        squared error would be non-negative and lose that sign.
        """
        s, s_ = torch.Tensor(s[np.newaxis, :]), torch.Tensor(s_[np.newaxis, :])
        with torch.no_grad():
            target = r + self.GAMMA * self.critic(s_)  # part of Q target
        v = self.critic(s)

        loss = F.mse_loss(v, target)
        self.cost_his.append(loss.item())

        self.optimizer_critic.zero_grad()
        loss.backward()
        self.optimizer_critic.step()

        return (target - v).mean().item()

    def plot_cost(self):
        """Plot the recorded critic and actor losses over training steps."""
        import matplotlib.pyplot as plt
        fig, (ax1, ax2) = plt.subplots(2, 1)
        ax1.plot(np.arange(len(self.cost_his)), self.cost_his)
        ax1.set_ylabel('Critic TD error')
        ax2.plot(np.arange(len(self.value_his)), self.value_his)
        ax2.set_ylabel('Actor value')
        ax2.set_xlabel('training steps')
        plt.show()