import numpy as np

# ReplayBuffer, Transition and verify_output_path are provided elsewhere in the project.


class Agent:
    """Abstract base class for agents; subclasses implement action selection,
    Q-value computation, optimization and model saving."""

    def __init__(self, state_size, num_actions, mode, buffer_size=None, **_kwargs):
        assert mode in ('train', 'test')
        self.mode = mode
        self.state_size = state_size
        self.num_actions = num_actions
        self.buffer = ReplayBuffer(buffer_size)

    def get_action(self, state: np.ndarray):
        raise NotImplementedError

    def get_q_values(self, state: np.ndarray) -> np.ndarray:
        raise NotImplementedError

    def optimize(self):
        raise NotImplementedError

    def save_model(self, model_save_path: str):
        raise NotImplementedError

    def store_transition(self, transition):
        self.buffer.append(transition)

    def save_history(self, path):
        # Dump the stored transitions as one array per Transition field.
        verify_output_path(path)
        history = {
            field: np.array([getattr(transition, field) for transition in self.buffer.data])
            for field in Transition._fields
        }
        np.savez(path, **history)
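
# A minimal usage sketch (not part of the original code): a toy subclass that
# fills in the abstract methods with random/no-op behaviour, just to show how
# the Agent interface above is meant to be subclassed. Everything in this
# RandomAgent is illustrative.
class RandomAgent(Agent):
    def get_action(self, state: np.ndarray):
        # pick a uniformly random action
        return np.random.randint(self.num_actions)

    def get_q_values(self, state: np.ndarray) -> np.ndarray:
        # a random agent has no learned values; return zeros as a placeholder
        return np.zeros(self.num_actions)

    def optimize(self):
        pass  # nothing to learn

    def save_model(self, model_save_path: str):
        pass  # nothing to persist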
import argparse
import os

import gym
import numpy as np
import tensorflow as tf  # TensorFlow 1.x API

# ReplayBuffer, VariationalQNetwork, NoiseSampler and update_target come from
# the surrounding project.


def main():
    parser = argparse.ArgumentParser(description='VDQN')
    parser.add_argument('--seed', type=int, default=100)
    parser.add_argument('--env', type=str, default='CartPole-v0',
                        help='Name of the OpenAI Gym environment')
    parser.add_argument('--logdir', type=str, default='')
    parser.add_argument('--episodes', type=int, default=200)
    parser.add_argument('--target-update-period', type=int, default=100)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--gamma', type=float, default=.99)
    args = parser.parse_args()

    #### HYPERPARAMETERS
    episodes = args.episodes
    envname = args.env
    seed = args.seed
    tf.set_random_seed(seed)
    np.random.seed(seed)
    hiddendict = [100, 100]
    sigma = 0.01
    Wpriorsigma = [10000] * 2
    bpriorsigma = [10000] * 2
    batchsize = 64
    buffersize = 1000000
    initialsize = 500
    tau = 1.0
    target_update_period = args.target_update_period
    lr_VI = args.lr
    gamma = args.gamma
    totalstep = 0
    reward_scale = 1

    #### MAIN ITERATIONS
    logdir = (args.logdir + 'VDQN/' + envname + '/lr_' + str(args.lr)
              + '_episodes' + str(args.episodes))
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    with tf.Session() as sess:
        ### INITIALIZATION
        env = gym.make(envname)
        obssize = env.observation_space.low.size
        actsize = env.action_space.n
        replaybuffer = ReplayBuffer(buffersize)
        Qactionnet = VariationalQNetwork(obssize, actsize, hiddendict, sess=sess,
                                         scope='principle',
                                         optimizer=tf.train.AdamOptimizer(lr_VI))
        Qtargetnet = VariationalQNetwork(obssize, actsize, hiddendict, sess=sess,
                                         scope='target')
        noisesampler = NoiseSampler(Qactionnet.Wshape, Qactionnet.bshape)
        sess.run(tf.global_variables_initializer())
        update_target(Qtargetnet, Qactionnet)

        ### RECORD
        VIlossrecord = []
        Bellmanlossrecord = []
        rewardrecord = []

        ### ITERATIONS
        for episode in range(episodes):
            # start a new episode
            obs = env.reset()
            done = False
            rsum = 0
            while not done:
                # sample a noise realization for the variational weights
                Wnoise, bnoise = noisesampler.sample(1)
                # compute Q values and select the greedy action
                Qvalue = Qactionnet.compute_Qvalue(obs[None], Wnoise, bnoise)
                action = np.argmax(Qvalue.flatten())
                # step the environment
                nextobs, reward, done, _ = env.step(action)
                # record the experience
                done_ = 1 if done else 0
                reward_ = reward * reward_scale
                experience = [(obs, action, reward_, done_, nextobs)]
                # append experience to the buffer (popleft evicts old entries)
                replaybuffer.append(experience)
                replaybuffer.popleft()
                # update bookkeeping
                obs = nextobs
                totalstep += 1
                rsum += reward

                if replaybuffer.currentsize >= initialsize:
                    # sample a minibatch
                    batch_obs, batch_act, batch_reward, batch_done, batch_nextobs = \
                        replaybuffer.sample(batchsize)
                    # sample noise for computing the target
                    Wnoise, bnoise = noisesampler.sample(batchsize)
                    # compute the target value
                    Qall = Qtargetnet.compute_Qvalue(batch_nextobs, Wnoise, bnoise)
                    Qtarget = gamma * np.max(Qall, axis=1) * (1 - batch_done) + batch_reward
                    # update the principal network by variational inference
                    VIloss = Qactionnet.train_on_sample(batch_obs, batch_act, Qtarget)
                    # compute the Bellman error loss, reusing the same noise sample
                    # (resample with noisesampler.sample(batchsize) if desired)
                    Wnoise_new, bnoise_new = Wnoise, bnoise
                    Qpred = Qactionnet.compute_Qvalue(batch_obs, Wnoise_new, bnoise_new)
                    Qpredact = Qpred[np.arange(batchsize), batch_act]
                    Bellmanloss = np.mean((Qpredact - Qtarget) ** 2)
                    # record the losses
                    VIlossrecord.append(VIloss['loss'])
                    Bellmanlossrecord.append(Bellmanloss)

                    if (totalstep + 1) % target_update_period == 0:
                        update_target(Qtargetnet, Qactionnet)
                        print("update target")

                if done:
                    break

            # record the episode return
            rewardrecord.append(rsum)

            ### TRAIN
            meanVIloss = (np.mean(VIlossrecord[-10:])
                          if len(VIlossrecord) > 10 else float('nan'))
            meanbellmanloss = (np.mean(Bellmanlossrecord[-10:])
                               if len(Bellmanlossrecord) > 10 else float('nan'))
            meanreward = np.mean(rewardrecord[-10:])
            print("episode %d buffer size %d meanVIloss %f meanbellmanloss %f meanreward %f"
                  % (episode, replaybuffer.currentsize, meanVIloss, meanbellmanloss, meanreward))
            if (1 + episode) % 5 == 0:
                np.save(logdir + '/VIloss_' + str(seed), VIlossrecord)
                np.save(logdir + '/bellmanloss_' + str(seed), Bellmanlossrecord)
                np.save(logdir + '/reward_' + str(seed), rewardrecord)
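
# Hedged sketch (assumption): `update_target` is called in main() but is not
# defined in this excerpt. Given how it is used (a hard copy from the
# 'principle' network to the 'target' network every `target_update_period`
# steps), a plausible TF1-style implementation assigns each variable in the
# source scope to its counterpart in the target scope. The network arguments
# are accepted to match the call site; the real code may instead expose the
# variables directly on the network objects.
def update_target(target_net, action_net):
    source_vars = sorted(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='principle'),
                         key=lambda v: v.name)
    target_vars = sorted(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target'),
                         key=lambda v: v.name)
    tf.get_default_session().run([t.assign(s) for s, t in zip(source_vars, target_vars)])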
import copy

import gym
import torch

# Agent, ReplayBuffer and DefaultNN are provided elsewhere in the project.


class DDPGAgent(Agent):
    def __init__(self, state_space, action_space, device,
                 actor_lr=0.000025, critic_lr=0.00025, tau=0.001, gamma=0.99,
                 max_size=1000000, layer1_size=200, layer2_size=150,
                 batch_size=64, noise_std=0.1, name="DDPG"):
        ### NEW: the action space is now continuous
        assert isinstance(action_space, gym.spaces.Box)
        super().__init__(state_space, action_space, device=device, name=name)
        self.gamma = gamma
        self.tau = tau
        self.replay_buffer = ReplayBuffer(max_size, self.device)
        self.batch_size = batch_size
        self.actor = DefaultNN(actor_lr, self.state_size, layer1_size, layer2_size,
                               self.nb_actions, self.device, last_activation=torch.tanh)
        self.critic = DefaultNN(critic_lr, self.state_size + self.nb_actions,
                                layer1_size, layer2_size, 1, self.device)
        self.target_actor = copy.deepcopy(self.actor)
        self.target_critic = copy.deepcopy(self.critic)
        # Gaussian exploration noise added to the deterministic policy output
        self.normal_distribution = torch.distributions.normal.Normal(
            torch.zeros(self.nb_actions),
            torch.full((self.nb_actions,), noise_std))

    def action(self, observation):
        with torch.no_grad():
            observation = torch.tensor(observation, dtype=torch.float).to(self.device)
            actor_output = self.actor.forward(observation)
            noise = self.normal_distribution.sample().to(self.device)
            action = actor_output + noise
            return action.cpu().detach().numpy()

    def on_action_stop(self, action, new_state, reward, done):
        self.replay_buffer.append(self.last_state, action, reward, new_state, done)
        self.learn()
        super().on_action_stop(action, new_state, reward, done)

    def learn(self):
        if len(self.replay_buffer) > self.batch_size:
            states, actions, rewards, new_states, dones = \
                self.replay_buffer.sample(self.batch_size)

            # Critic update: regress Q(s, a) towards r + gamma * (1 - done) * Q'(s', pi'(s'))
            with torch.no_grad():
                target_actions = self.target_actor.forward(new_states)
                critic_value_ = self.target_critic.forward(
                    torch.cat((new_states, target_actions), dim=-1))
            critic_value = self.critic.forward(torch.cat((states, actions), dim=-1))
            target = (rewards + self.gamma * (1 - dones) * critic_value_.squeeze()) \
                .view(self.batch_size, 1)

            self.critic.optimizer.zero_grad()
            critic_loss = torch.nn.functional.mse_loss(critic_value, target)
            critic_loss.backward()
            self.critic.optimizer.step()

            # Actor update: maximise Q(s, pi(s)) by minimising its negative
            self.actor.optimizer.zero_grad()
            actions = self.actor.forward(states)
            actor_loss = -self.critic.forward(torch.cat((states, actions), dim=-1))
            actor_loss = torch.mean(actor_loss)
            actor_loss.backward()
            self.actor.optimizer.step()

            # Polyak-average the target networks towards the online networks
            self.target_critic.converge_to(self.critic, tau=self.tau)
            self.target_actor.converge_to(self.actor, tau=self.tau)
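
# Hedged sketch (assumption): `DefaultNN.converge_to`, used above for the target
# networks, is not defined in this excerpt. The standard DDPG soft (Polyak)
# update it presumably implements is
#     theta_target <- tau * theta_online + (1 - tau) * theta_target,
# which, assuming DefaultNN is a torch.nn.Module, could look like:
def converge_to(self, other, tau):
    with torch.no_grad():
        for p_target, p_online in zip(self.parameters(), other.parameters()):
            p_target.mul_(1.0 - tau).add_(p_online, alpha=tau)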
# s, rews, buffer, actor, env, A (the action dimension) and exp are assumed to
# be defined in the surrounding context of this excerpt.
for hparam in hparams.trials(5):
    exp.add_argparse_meta(hparam)
    for timestep in range(hparam.num_steps):
        # Gaussian exploration noise, halved every 1000 steps
        noise = Normal(
            Variable(torch.zeros(A)),
            hparam.noise_factor * Variable(torch.ones(A)),
        )
        if timestep % 1000 == 0:
            hparam.noise_factor /= 2
        a = actor(s) + noise.sample()
        succ, r, done, _ = env.step(a.data.numpy())
        succ = np_to_var(succ)
        buffer.append(Step(s, a, r, succ, done))
        rews.append(r)
        s = np_to_var(env.reset()) if done else succ
        if done:
            exp.add_metric_row({"Timestep": timestep + 1, "Loss": -sum(rews)})
            rews = []
        if len(buffer) >= hparam.batch_size:
            states, actions, rewards, succ_states, dones = format_batch(
                buffer.sample(hparam.batch_size))
            td_estims = get_critic_train_data(succ_states, rewards, dones)
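
# Hedged sketch (assumption): `format_batch` is not shown in this excerpt. Since
# `buffer.sample(...)` returns a list of `Step(s, a, r, succ, done)` records and
# the call site unpacks five batched tensors, a plausible implementation simply
# stacks each field:
def format_batch(steps):
    states = torch.stack([step.s for step in steps])
    actions = torch.stack([step.a for step in steps])
    rewards = torch.tensor([step.r for step in steps], dtype=torch.float)
    succ_states = torch.stack([step.succ for step in steps])
    dones = torch.tensor([float(step.done) for step in steps])
    return states, actions, rewards, succ_states, dones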