def train(env):
    value_net = Critic(1290, 128, 256, params['critic_weight_init']).to(device)
    policy_net = Actor(1290, 128, 256, params['actor_weight_init']).to(device)
    target_value_net = Critic(1290, 128, 256).to(device)
    target_policy_net = Actor(1290, 128, 256).to(device)

    # Switching off dropout layers in the target networks
    target_value_net.eval()
    target_policy_net.eval()

    # soft_tau=1.0 performs a hard copy of the online weights into the targets
    softUpdate(value_net, target_value_net, soft_tau=1.0)
    softUpdate(policy_net, target_policy_net, soft_tau=1.0)

    value_optimizer = optimizer.Ranger(value_net.parameters(),
                                       lr=params['value_lr'], weight_decay=1e-2)
    policy_optimizer = optimizer.Ranger(policy_net.parameters(),
                                        lr=params['policy_lr'], weight_decay=1e-5)
    value_criterion = nn.MSELoss()

    loss = {
        'test':  {'value': [], 'policy': [], 'step': []},
        'train': {'value': [], 'policy': [], 'step': []},
    }
    plotter = Plotter(loss, [['value', 'policy']])

    step = 0
    plot_every = 10
    for epoch in range(100):
        print("Epoch: {}".format(epoch + 1))
        for batch in env.train_dataloader:
            loss, value_net, policy_net, target_value_net, target_policy_net, \
                value_optimizer, policy_optimizer = ddpg(
                    value_net, policy_net, target_value_net, target_policy_net,
                    value_optimizer, policy_optimizer, batch, params, step=step)
            plotter.log_losses(loss)
            step += 1
            if step % plot_every == 0:
                print('step', step)
                test_loss = run_tests(env, step, value_net, policy_net,
                                      target_value_net, target_policy_net,
                                      value_optimizer, policy_optimizer, plotter)
                plotter.log_losses(test_loss, test=True)
                plotter.plot_loss()
            if step > 1500:
                return  # stop training after 1500 steps (was `assert False`)
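# The training loop above relies on helpers (softUpdate, ddpg, run_tests, Plotter) defined
# elsewhere in that project. As a point of reference, a minimal sketch of the Polyak soft
# update it calls with soft_tau=1.0 (i.e. a hard copy of the online weights into the target
# network) could look like this; the name and signature mirror the call site above, but the
# body is an assumption, not that project's actual implementation.
import torch

def softUpdate(net, target_net, soft_tau=1e-3):
    """Polyak averaging: theta_target <- tau * theta + (1 - tau) * theta_target."""
    with torch.no_grad():
        for target_param, param in zip(target_net.parameters(), net.parameters()):
            target_param.data.copy_(soft_tau * param.data +
                                    (1.0 - soft_tau) * target_param.data)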
class DDPG(object): def __init__(self, nb_status, nb_actions, args, writer): self.clip_actor_grad = args.clip_actor_grad self.nb_status = nb_status * args.window_length self.nb_actions = nb_actions self.discrete = args.discrete self.pic = args.pic self.writer = writer self.select_time = 0 if self.pic: self.nb_status = args.pic_status # Create Actor and Critic Network net_cfg = { 'hidden1':args.hidden1, 'hidden2':args.hidden2, 'use_bn':args.bn, 'init_method':args.init_method } if args.pic: self.cnn = CNN(1, args.pic_status) self.cnn_target = CNN(1, args.pic_status) self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate) self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_optim = Adam(self.actor.parameters(), lr=args.prate) self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) if args.pic: hard_update(self.cnn_target, self.cnn) #Create replay buffer self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length) self.random_process = Myrandom(size=nb_actions) # Hyper-parameters self.batch_size = args.batch_size self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.use_cuda = args.cuda # if self.use_cuda: self.cuda() def normalize(self, pic): pic = pic.swapaxes(0, 2).swapaxes(1, 2) return pic def update_policy(self): # Sample batch state_batch, action_batch, reward_batch, \ next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size) # Prepare for the target q batch if self.pic: state_batch = np.array([self.normalize(x) for x in state_batch]) state_batch = to_tensor(state_batch, volatile=True) state_batch = self.cnn(state_batch) next_state_batch = np.array([self.normalize(x) for x in next_state_batch]) next_state_batch = to_tensor(next_state_batch, volatile=True) next_state_batch = self.cnn_target(next_state_batch) next_q_values = self.critic_target([ next_state_batch, self.actor_target(next_state_batch) ]) else: next_q_values = self.critic_target([ to_tensor(next_state_batch, volatile=True), self.actor_target(to_tensor(next_state_batch, volatile=True)), ]) # print('batch of picture is ok') next_q_values.volatile = False target_q_batch = to_tensor(reward_batch) + \ self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values # Critic update self.critic.zero_grad() if self.pic: self.cnn.zero_grad() if self.pic: state_batch.volatile = False q_batch = self.critic([state_batch, to_tensor(action_batch)]) else: q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)]) # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float)) value_loss = criterion(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() if self.pic: self.cnn_optim.step() self.actor.zero_grad() if self.pic: self.cnn.zero_grad() if self.pic: state_batch.volatile = False policy_loss = -self.critic([ state_batch, self.actor(state_batch) ]) else: policy_loss = -self.critic([ to_tensor(state_batch), self.actor(to_tensor(state_batch)) ]) policy_loss = policy_loss.mean() policy_loss.backward() if 
self.clip_actor_grad is not None: torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad)) if self.writer != None: mean_policy_grad = np.array(np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters()])) #print(mean_policy_grad) self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time) self.actor_optim.step() if self.pic: self.cnn_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) if self.pic: soft_update(self.cnn_target, self.cnn, self.tau) return -policy_loss, value_loss def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() if(self.pic): self.cnn.eval() self.cnn_target.eval() def train(self): self.actor.train() self.actor_target.train() self.critic.train() self.critic_target.train() if(self.pic): self.cnn.train() self.cnn_target.train() def cuda(self): self.cnn.cuda() self.cnn_target.cuda() self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def observe(self, r_t, s_t1, done): self.memory.append([self.s_t, self.a_t, r_t, s_t1, done]) self.s_t = s_t1 def random_action(self, fix=False): action = np.random.uniform(-1.,1.,self.nb_actions) self.a_t = action if self.discrete and fix == False: action = action.argmax() # if self.pic: # action = np.concatenate((softmax(action[:16]), softmax(action[16:]))) return action def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0): self.eval() if self.pic: s_t = self.normalize(s_t) s_t = self.cnn(to_tensor(np.array([s_t]))) if self.pic: action = to_numpy( self.actor_target(s_t) ).squeeze(0) else: action = to_numpy( self.actor(to_tensor(np.array([s_t]))) ).squeeze(0) self.train() noise_level = noise_level * max(self.epsilon, 0) if np.random.uniform(0, 1) < noise_level: action = self.random_action(fix=True) # episilon greedy if decay_epsilon: self.epsilon -= self.depsilon self.a_t = action if return_fix: return action if self.discrete: return action.argmax() else: return action def reset(self, obs): self.s_t = obs self.random_process.reset_status() def load_weights(self, output, num=1): if output is None: return self.actor.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num)) ) self.actor_target.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num)) ) self.critic.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num)) ) self.critic_target.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num)) ) def save_model(self, output, num): if self.use_cuda: self.cnn.cpu() self.actor.cpu() self.critic.cpu() torch.save( self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num) ) torch.save( self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num) ) if self.use_cuda: self.cnn.cuda() self.actor.cuda() self.critic.cuda()
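# The class above targets the pre-0.4 PyTorch API: tensors created with `volatile=True` and
# `.volatile = False` set on the result. On current PyTorch the equivalent target-Q
# computation is wrapped in torch.no_grad(); a minimal sketch using the same list-style
# critic call as above (reward_batch and terminal_batch assumed to be float tensors of 0/1):
import torch

def compute_target_q(critic_target, actor_target, reward_batch, next_state_batch,
                     terminal_batch, discount):
    with torch.no_grad():  # replaces volatile=True: no graph is built for the target nets
        next_actions = actor_target(next_state_batch)
        next_q = critic_target([next_state_batch, next_actions])
        # y = r + gamma * (1 - done) * Q'(s', mu'(s'))
        target_q = reward_batch + discount * (1.0 - terminal_batch) * next_q
    return target_q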
class DDPG(object): def __init__(self, nb_status, nb_actions, args): self.num_actor = 3 self.nb_status = nb_status * args.window_length self.nb_actions = nb_actions self.discrete = args.discrete self.pic = args.pic if self.pic: self.nb_status = args.pic_status # Create Actor and Critic Network net_cfg = { 'hidden1': args.hidden1, 'hidden2': args.hidden2, 'use_bn': args.bn } if args.pic: self.cnn = CNN(3, args.pic_status) self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate) self.actors = [ Actor(self.nb_status, self.nb_actions) for _ in range(self.num_actor) ] self.actor_targets = [ Actor(self.nb_status, self.nb_actions) for _ in range(self.num_actor) ] self.actor_optims = [ Adam(self.actors[i].parameters(), lr=args.prate) for i in range(self.num_actor) ] self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) for i in range(self.num_actor): hard_update( self.actor_targets[i], self.actors[i]) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) #Create replay buffer self.memory = rpm( args.rmsize ) # SequentialMemory(limit=args.rmsize, window_length=args.window_length) self.random_process = Myrandom(size=nb_actions) # Hyper-parameters self.batch_size = args.batch_size self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.use_cuda = args.cuda # if self.use_cuda: self.cuda() def normalize(self, pic): pic = pic.swapaxes(0, 2).swapaxes(1, 2) return pic def update_policy(self, train_actor=True): # Sample batch state_batch, action_batch, reward_batch, \ next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size) # Prepare for the target q batch if self.pic: state_batch = np.array([self.normalize(x) for x in state_batch]) state_batch = to_tensor(state_batch, volatile=True) print('label 1') print('size = ', state_batch.shape) state_batch = self.cnn(state_batch) print('label 2') next_state_batch = np.array( [self.normalize(x) for x in next_state_batch]) next_state_batch = to_tensor(next_state_batch, volatile=True) next_state_batch = self.cnn(next_state_batch) next_q_values = self.critic_target( [next_state_batch, self.actor_target(next_state_batch)]) else: index = np.random.randint(low=0, high=self.num_actor) next_q_values = self.critic_target([ to_tensor(next_state_batch, volatile=True), self.actor_targets[index](to_tensor(next_state_batch, volatile=True)), ]) # print('batch of picture is ok') next_q_values.volatile = False target_q_batch = to_tensor(reward_batch) + \ self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values # Critic update self.critic.zero_grad() if self.pic: self.cnn.zero_grad() if self.pic: state_batch.volatile = False q_batch = self.critic([state_batch, to_tensor(action_batch)]) else: q_batch = self.critic( [to_tensor(state_batch), to_tensor(action_batch)]) # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float)) value_loss = criterion(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() if self.pic: self.cnn_optim.step() sum_policy_loss = 0 for i in range(self.num_actor): self.actors[i].zero_grad() policy_loss = -self.critic([ to_tensor(state_batch), self.actors[i](to_tensor(state_batch)) ]) policy_loss = policy_loss.mean() policy_loss.backward() if train_actor: 
self.actor_optims[i].step() sum_policy_loss += policy_loss # Target update soft_update(self.actor_targets[i], self.actors[i], self.tau) soft_update(self.critic_target, self.critic, self.tau) return -sum_policy_loss / self.num_actor, value_loss def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def train(self): self.actor.train() self.actor_target.train() self.critic.train() self.critic_target.train() def cuda(self): for i in range(self.num_actor): self.actors[i].cuda() self.actor_targets[i].cuda() self.critic.cuda() self.critic_target.cuda() def observe(self, r_t, s_t1, done): self.memory.append([self.s_t, self.a_t, r_t, s_t1, done]) self.s_t = s_t1 def random_action(self): action = np.random.uniform(-1., 1., self.nb_actions) self.a_t = action if self.discrete: return action.argmax() else: return action def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0): actions = [] status = [] tot_score = [] for i in range(self.num_actor): action = to_numpy(self.actors[i](to_tensor( np.array([s_t]), volatile=True))).squeeze(0) noise_level = noise_level * max(self.epsilon, 0) action = action + self.random_process.sample() * noise_level status.append(s_t) actions.append(action) tot_score.append(0.) scores = self.critic([ to_tensor(np.array(status), volatile=True), to_tensor(np.array(actions), volatile=True) ]) for j in range(self.num_actor): tot_score[j] += scores.data[j][0] best = np.array(tot_score).argmax() if decay_epsilon: self.epsilon -= self.depsilon self.a_t = actions[best] return actions[best] def reset(self, obs): self.s_t = obs self.random_process.reset_status() def load_weights(self, output, num=0): if output is None: return self.actor.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num))) self.actor_target.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num))) self.critic.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num))) self.critic_target.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num))) def save_model(self, output, num): if self.use_cuda: self.actor.cpu() self.critic.cpu() torch.save(self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num)) torch.save(self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num)) if self.use_cuda: self.actor.cuda() self.critic.cuda()
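# `rpm(args.rmsize)` above is this project's replay memory; its definition is not included
# here. A minimal stand-in with the same append/sample_batch interface (a list-like
# transition in, per-field numpy batches out, matching how update_policy unpacks it) might
# look like the sketch below; the real rpm class may differ.
import random
import numpy as np

class rpm:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.pos = 0

    def append(self, transition):  # transition = [s_t, a_t, r_t, s_t1, done]
        if len(self.buffer) < self.capacity:
            self.buffer.append(transition)
        else:
            self.buffer[self.pos] = transition  # overwrite the oldest entry (ring buffer)
        self.pos = (self.pos + 1) % self.capacity

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones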
class DDPG(object): def __init__(self, nb_states, nb_actions, args): if args.seed > 0: self.seed(args.seed) self.nb_states = nb_states self.nb_actions = nb_actions # Create Actor and Critic Network net_cfg = { "hidden1": args.hidden1, "hidden2": args.hidden2, "init_w": args.init_w, } self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg) self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg) self.actor_optim = Adam(self.actor.parameters(), lr=args.prate) self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) hard_update( self.actor_target, self.actor ) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) # Create replay buffer self.memory = SequentialMemory( limit=args.rmsize, window_length=args.window_length ) self.random_process = OrnsteinUhlenbeckProcess( size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma ) # Hyper-parameters self.batch_size = args.bsize self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.is_training = True # if USE_CUDA: self.cuda() def update_policy(self): # Sample batch ( state_batch, action_batch, reward_batch, next_state_batch, terminal_batch, ) = self.memory.sample_and_split(self.batch_size) # Prepare for the target q batch next_q_values = self.critic_target( [ to_tensor(next_state_batch, volatile=True), self.actor_target(to_tensor(next_state_batch, volatile=True)), ] ) # next_q_values.volatile = False target_q_batch = ( to_tensor(reward_batch) + self.discount * to_tensor(terminal_batch.astype(np.float)) * next_q_values ) # Critic update self.critic.zero_grad() q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)]) value_loss = criterion(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() # Actor update self.actor.zero_grad() policy_loss = -self.critic( [to_tensor(state_batch), self.actor(to_tensor(state_batch))] ) policy_loss = policy_loss.mean() policy_loss.backward() self.actor_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def cuda(self): self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def observe(self, r_t, s_t1, done): if self.is_training: self.memory.append(self.s_t, self.a_t, r_t, done) self.s_t = s_t1 def random_action(self): action = np.random.uniform(-1.0, 1.0, self.nb_actions) self.a_t = action return action def select_action(self, s_t, decay_epsilon=True): action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0) action += self.is_training * max(self.epsilon, 0) * self.random_process.sample() action = np.clip(action, -1.0, 1.0) if decay_epsilon: self.epsilon -= self.depsilon self.a_t = action return action def reset(self, obs): self.s_t = obs self.random_process.reset_states() def load_weights(self, output): if output is None: return self.actor.load_state_dict(torch.load("{}/actor.pkl".format(output))) self.critic.load_state_dict(torch.load("{}/critic.pkl".format(output))) def save_model(self, output): torch.save(self.actor.state_dict(), "{}/actor.pkl".format(output)) torch.save(self.critic.state_dict(), 
"{}/critic.pkl".format(output)) def seed(self, s): torch.manual_seed(s) if USE_CUDA: torch.cuda.manual_seed(s)
class DDPG_Agent: def __init__(self, state_size, action_size, seed, index=0, num_agents=2): """Initialize an Agent object. Params ====== state_size (int): Dimension of each state action_size (int): Dimension of each action seed (int): Random seed index (int): Index assigned to the agent num_agents (int): Number of agents in the environment """ self.state_size = state_size # State size self.action_size = action_size # Action size self.seed = torch.manual_seed(seed) # Random seed self.index = index # Index of this agent, not used at the moment self.tau = TAU # Parameter for soft weight update self.num_updates = N_UPDATES # Number of updates to perform when updating self.num_agents = num_agents # Number of agents in the environment self.tstep = 0 # Simulation step (modulo (%) UPDATE_EVERY) self.gamma = GAMMA # Gamma for the reward discount self.alpha = ALPHA # PER: toggle prioritization (0..1) # Set up actor and critic networks self.actor_local = Actor(state_size, action_size, seed).to(device) self.critic_local = Critic(state_size, action_size, seed).to(device) self.actor_target = Actor(state_size, action_size, seed).to(device) self.critic_target = Critic(state_size, action_size, seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Ornstein-Uhlenbeck noise self.noise = OUNoise((1, action_size), seed) # Replay buffer self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, self.alpha) # act and act_targets similar to exercises and MADDPG Lab def act(self, states, noise=1.0): """Returns actions for given state as per current policy. Params ====== state [n_agents, state_size]: current state noise (float): control whether or not noise is added """ # Uncomment if state is numpy array instead of tensor states = torch.from_numpy(states).float().to(device) actions = np.zeros((1, self.action_size)) # Put model into evaluation mode self.actor_local.eval() # Get actions for current state, transformed from probabilities with torch.no_grad(): actions = self.actor_local(states).cpu().data.numpy() # Put actor back into training mode self.actor_local.train() # Ornstein-Uhlenbeck noise addition actions += noise * self.noise.sample() # Transform probability into valid action ranges return np.clip(actions, -1, 1) def step(self, states, actions, rewards, next_states, dones, beta): """Save experience in replay memory, use random samples from buffer to learn. PARAMS ====== states: [n_agents, state_size] current state actions: [n_agents, action_size] taken action rewards: [n_agents] earned reward next_states:[n_agents, state_size] next state dones: [n_agents] Whether episode has finished beta: [0..1] PER: toggles correction for importance weights (0 - no corrections, 1 - full correction) """ # ------------------------------------------------------------------ # Save experience in replay memory - slightly more effort due to Prioritization # We need to calculate priorities for the experience tuple. 
# This is in our case (Q_expected - Q_target)**2 # ----------------------------------------------------------------- # Set all networks to evaluation mode self.actor_target.eval() self.critic_target.eval() self.critic_local.eval() state = torch.from_numpy(states).float().to(device) next_state = torch.from_numpy(next_states).float().to(device) action = torch.from_numpy(actions).float().to(device) #reward = torch.from_numpy(rewards).float().to(device) #done = torch.from_numpy(dones).float().to(device) with torch.no_grad(): next_actions = self.actor_target(state) own_action = action[:, self.index * self.action_size:(self.index + 1) * self.action_size] if self.index: # Agent 1 next_actions_agent = torch.cat((own_action, next_actions), dim=1) else: # Agent 0: flipped order next_actions_agent = torch.cat((next_actions, own_action), dim=1) # Predicted Q value from Critic target network Q_targets_next = self.critic_target(next_state, next_actions_agent).float() #print(f"Type Q_t_n: {type(Q_targets_next)}") #print(f"Type gamma: {type(self.gamma)}") #print(f"Type dones: {type(dones)}") Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) Q_expected = self.critic_local(state, action) # Use error between Q_expected and Q_targets as priority in buffer error = (Q_expected - Q_targets)**2 self.memory.add(state, action, rewards, next_state, dones, error) # Set all networks back to training mode self.actor_target.train() self.critic_target.train() self.critic_local.train() # ------------------------------------------------------------------ # Usual learning procedure # ----------------------------------------------------------------- # Learn every UPDATE_EVERY time steps self.tstep = (self.tstep + 1) % UPDATE_EVERY # If UPDATE_EVERY and enough samples are available in memory, get random subset and learn if self.tstep == 0 and len(self.memory) > BATCH_SIZE: for _ in range(self.num_updates): experiences = self.memory.sample(beta) self.learn(experiences) def reset(self): """Reset the noise parameter of the agent.""" self.noise.reset() def learn(self, experiences): """Update value parameters using given batch of experience tuples. 
Update according to Q_targets = r + gamma * critic_target(next_state, actor_target(next_state)) According to the lessons: actor_target (state) gives action critic_target (state, action) gives Q-value Params ====== experiences (Tuple[torch.Variable]): tuple of states states visited actions actions taken by all agents rewards rewards received next states all next states dones whether or not a final state is reached weights weights of the experiences indices indices of the experiences """ # Load experiences from sample states, actions, rewards, next_states, dones, weights_cur, indices = experiences # ------------------- update critic ------------------- # # Get next actions via actor network next_actions = self.actor_target(next_states) # Stack action together with action of the agent own_actions = actions[:, self.index * self.action_size:(self.index + 1) * self.action_size] if self.index: # Agent 1 next_actions_agent = torch.cat((own_actions, next_actions), dim=1) else: # Agent 0: flipped order next_actions_agent = torch.cat((next_actions, own_actions), dim=1) # Predicted Q value from Critic target network Q_targets_next = self.critic_target(next_states, next_actions_agent) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) Q_expected = self.critic_local(states, actions) # Update priorities in ReplayBuffer loss = (Q_expected - Q_targets).pow(2).reshape( weights_cur.shape) * weights_cur self.memory.update(indices, loss.data.cpu().numpy()) # Compute critic loss critic_loss = F.mse_loss(Q_expected, Q_targets) self.critic_optimizer.zero_grad() critic_loss.backward() # Clip gradients #torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), GRAD_CLIPPING) self.critic_optimizer.step() # ------------------- update actor ------------------- # actions_expected = self.actor_local(states) # Stack action together with action of the agent own_actions = actions[:, self.index * self.action_size:(self.index + 1) * self.action_size] if self.index: # Agent 1: actions_expected_agent = torch.cat((own_actions, actions_expected), dim=1) else: # Agent 0: flipped order actions_expected_agent = torch.cat((actions_expected, own_actions), dim=1) # Compute actor loss based on expectation from actions_expected actor_loss = -self.critic_local(states, actions_expected_agent).mean() self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # Update target networks self.target_soft_update(self.critic_local, self.critic_target) self.target_soft_update(self.actor_local, self.actor_target) def target_soft_update(self, local_model, target_model): """Soft update model parameters for actor and critic of all MADDPG agents. 
θ_target = τ*θ_local + (1 - τ)*θ_target """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data) def save(self, filename): """Saves the agent to the local workplace Params ====== filename (string): where to save the weights """ checkpoint = { 'input_size': self.state_size, 'output_size': self.action_size, 'actor_hidden_layers': [ each.out_features for each in self.actor_local.hidden_layers if each._get_name() != 'BatchNorm1d' ], 'actor_state_dict': self.actor_local.state_dict(), 'critic_hidden_layers': [ each.out_features for each in self.critic_local.hidden_layers if each._get_name() != 'BatchNorm1d' ], 'critic_state_dict': self.critic_local.state_dict() } torch.save(checkpoint, filename) def load_weights(self, filename): """ Load weights to update agent's actor and critic networks. Expected is a format like the one produced by self.save() Params ====== filename (string): where to load data from. """ checkpoint = torch.load(filename) if not checkpoint['input_size'] == self.state_size: print( f"Error when loading weights from checkpoint {filename}: input size {checkpoint['input_size']} doesn't match state size of agent {self.state_size}" ) return None if not checkpoint['output_size'] == self.action_size: print( f"Error when loading weights from checkpoint {filename}: output size {checkpoint['output_size']} doesn't match action space size of agent {self.action_size}" ) return None my_actor_hidden_layers = [ each.out_features for each in self.actor_local.hidden_layers if each._get_name() != 'BatchNorm1d' ] if not checkpoint['actor_hidden_layers'] == my_actor_hidden_layers: print( f"Error when loading weights from checkpoint {filename}: actor hidden layers {checkpoint['actor_hidden_layers']} don't match agent's actor hidden layers {my_actor_hidden_layers}" ) return None my_critic_hidden_layers = [ each.out_features for each in self.critic_local.hidden_layers if each._get_name() != 'BatchNorm1d' ] if not checkpoint['critic_hidden_layers'] == my_critic_hidden_layers: print( f"Error when loading weights from checkpoint {filename}: critic hidden layers {checkpoint['critic_hidden_layers']} don't match agent's critic hidden layers {my_critic_hidden_layers}" ) return None self.actor_local.load_state_dict(checkpoint['actor_state_dict']) self.critic_local.load_state_dict(checkpoint['critic_state_dict'])
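# The agent above stores the squared TD error as each transition's priority and scales the
# per-sample critic loss by importance weights returned from the PrioritizedReplayBuffer.
# That buffer is not shown; the standard prioritized-replay arithmetic it is assumed to use
# (Schaul et al. 2015: p_i^alpha sampling probabilities, (N * P(i))^(-beta) corrections) is
# sketched below for reference.
import numpy as np

def per_probabilities(priorities, alpha=0.6, eps=1e-5):
    """Turn raw priorities (e.g. squared TD errors) into sampling probabilities."""
    scaled = (np.asarray(priorities) + eps) ** alpha
    return scaled / scaled.sum()

def per_importance_weights(probs_of_sampled, buffer_size, beta=0.4):
    """Importance-sampling weights correcting for non-uniform sampling, normalized to <= 1."""
    weights = (buffer_size * probs_of_sampled) ** (-beta)
    return weights / weights.max()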
def main(): env = gym.make(args.env_name) env.seed(args.seed) torch.manual_seed(args.seed) num_inputs = env.observation_space.shape[0] num_actions = env.action_space.shape[0] running_state = ZFilter((num_inputs, ), clip=5) # huh? # oh wow. ZFilter is exactly what I do in capstone project, removing "badtimes" print('state size:', num_inputs) print('action size:', num_actions) #load agent stuff actor = Actor(num_inputs, num_actions, args) critic = Critic(num_inputs, args) discrim = Discriminator(num_inputs + num_actions, args) actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate) critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, weight_decay=args.l2_rate) discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate) # load demonstrations expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb")) demonstrations = np.array(expert_demo) print("demonstrations.shape", demonstrations.shape) writer = SummaryWriter(args.logdir) #if you aren't starting from scratch, load in this if args.load_model is not None: saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) ckpt = torch.load(saved_ckpt_path) # initialize everything actor.load_state_dict(ckpt['actor']) critic.load_state_dict(ckpt['critic']) discrim.load_state_dict(ckpt['discrim']) running_state.rs.n = ckpt['z_filter_n'] running_state.rs.mean = ckpt['z_filter_m'] running_state.rs.sum_square = ckpt['z_filter_s'] print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n)) # if no old model no worries, start training. episodes = 0 train_discrim_flag = True for iter in range(args.max_iter_num): # for i total trajectories actor.eval(), critic.eval() memory = deque() steps = 0 scores = [] while steps < args.total_sample_size: # sample trajectories (batch size) state = env.reset() score = 0 state = running_state( state) #uh.. again ZFilter related, cleans the state for _ in range(10000): #run through environment if args.render: env.render() steps += 1 mu, std = actor(torch.Tensor(state).unsqueeze( 0)) #pass state through actor network action = get_action(mu, std)[0] #compute random action next_state, reward, done, _ = env.step(action) #take a step irl_reward = get_reward( discrim, state, action ) #infer what the reward of this action is based on discriminator's get reward if done: mask = 0 else: mask = 1 #if done, save this, memory.append([state, action, irl_reward, mask]) next_state = running_state( next_state) #save cleaned next state state = next_state #and set to current state, score += reward #add total reward if done: break #actual sampling done here episodes += 1 scores.append(score) score_avg = np.mean(scores) #how this model did, print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg)) writer.add_scalar('log/score', float(score_avg), iter) #logg actor.train(), critic.train(), discrim.train() #now train if train_discrim_flag: #if this batch optimizes discrim/reward, # for training the discriminator expert_acc, learner_acc = train_discrim( discrim, memory, discrim_optim, demonstrations, args) # see comments in train_model. print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100)) if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen: train_discrim_flag = False #now restart, train policy. 
#for training actor critic train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args) # no output, see comments in train_model if iter % 100: score_avg = int(score_avg) model_path = os.path.join(os.getcwd(), 'save_model') if not os.path.isdir(model_path): os.makedirs(model_path) ckpt_path = os.path.join(model_path, 'ckpt_' + str(score_avg) + '.pth.tar') save_checkpoint( { 'actor': actor.state_dict(), 'critic': critic.state_dict(), 'discrim': discrim.state_dict(), 'z_filter_n': running_state.rs.n, 'z_filter_m': running_state.rs.mean, 'z_filter_s': running_state.rs.sum_square, 'args': args, 'score': score_avg }, filename=ckpt_path)
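# ZFilter((num_inputs,), clip=5) above normalizes observations with running statistics
# before they reach the actor; its implementation lives elsewhere in that repo. A minimal
# sketch of what such a filter typically does (Welford-style running mean/variance, then
# clipping) follows. The attribute names (rs.n, rs.mean, rs.sum_square) mirror the
# checkpoint fields used above, but the exact class is an assumption.
import numpy as np

class RunningStat:
    def __init__(self, shape):
        self.n = 0
        self.mean = np.zeros(shape)
        self.sum_square = np.zeros(shape)  # sum of squared deviations from the mean

    def push(self, x):
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.sum_square += delta * (x - self.mean)

    @property
    def std(self):
        var = self.sum_square / (self.n - 1) if self.n > 1 else np.ones_like(self.mean)
        return np.sqrt(var)

class ZFilter:
    def __init__(self, shape, clip=5.0):
        self.rs = RunningStat(shape)
        self.clip = clip

    def __call__(self, x):
        self.rs.push(x)
        z = (x - self.rs.mean) / (self.rs.std + 1e-8)
        return np.clip(z, -self.clip, self.clip)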
class DDPG(object): def __init__(self, args, nb_states, nb_actions): USE_CUDA = torch.cuda.is_available() if args.seed > 0: self.seed(args.seed) self.nb_states = nb_states self.nb_actions= nb_actions self.gpu_ids = [i for i in range(args.gpu_nums)] if USE_CUDA and args.gpu_nums > 0 else [-1] self.gpu_used = True if self.gpu_ids[0] >= 0 else False net_cfg = { 'hidden1':args.hidden1, 'hidden2':args.hidden2, 'init_w':args.init_w } self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg).double() self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg).double() self.actor_optim = Adam(self.actor.parameters(), lr=args.p_lr, weight_decay=args.weight_decay) self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg).double() self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg).double() self.critic_optim = Adam(self.critic.parameters(), lr=args.c_lr, weight_decay=args.weight_decay) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) #Create replay buffer self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length) self.random_process = OrnsteinUhlenbeckProcess(size=self.nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma) # Hyper-parameters self.batch_size = args.bsize self.tau_update = args.tau_update self.gamma = args.gamma # Linear decay rate of exploration policy self.depsilon = 1.0 / args.epsilon # initial exploration rate self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.is_training = True self.continious_action_space = False def update_policy(self): pass def cuda_convert(self): if len(self.gpu_ids) == 1: if self.gpu_ids[0] >= 0: with torch.cuda.device(self.gpu_ids[0]): print('model cuda converted') self.cuda() if len(self.gpu_ids) > 1: self.data_parallel() self.cuda() self.to_device() print('model cuda converted and paralleled') def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def cuda(self): self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def data_parallel(self): self.actor = nn.DataParallel(self.actor, device_ids=self.gpu_ids) self.actor_target = nn.DataParallel(self.actor_target, device_ids=self.gpu_ids) self.critic = nn.DataParallel(self.critic, device_ids=self.gpu_ids) self.critic_target = nn.DataParallel(self.critic_target, device_ids=self.gpu_ids) def to_device(self): self.actor.to(torch.device('cuda:{}'.format(self.gpu_ids[0]))) self.actor_target.to(torch.device('cuda:{}'.format(self.gpu_ids[0]))) self.critic.to(torch.device('cuda:{}'.format(self.gpu_ids[0]))) self.critic_target.to(torch.device('cuda:{}'.format(self.gpu_ids[0]))) def observe(self, r_t, s_t1, done): if self.is_training: self.memory.append(self.s_t, self.a_t, r_t, done) self.s_t = s_t1 def random_action(self): action = np.random.uniform(-1.,1.,self.nb_actions) # self.a_t = action return action def select_action(self, s_t, decay_epsilon=True): # proto action action = to_numpy( self.actor(to_tensor(np.array([s_t]), gpu_used=self.gpu_used, gpu_0=self.gpu_ids[0])), gpu_used=self.gpu_used ).squeeze(0) action += self.is_training * max(self.epsilon, 0) * self.random_process.sample() action = np.clip(action, -1., 1.) 
if decay_epsilon: self.epsilon -= self.depsilon # self.a_t = action return action def reset(self, s_t): self.s_t = s_t self.random_process.reset_states() def load_weights(self, dir): if dir is None: return if self.gpu_used: # load all tensors to GPU (gpu_id) ml = lambda storage, loc: storage.cuda(self.gpu_ids) else: # load all tensors to CPU ml = lambda storage, loc: storage self.actor.load_state_dict( torch.load('output/{}/actor.pkl'.format(dir), map_location=ml) ) self.critic.load_state_dict( torch.load('output/{}/critic.pkl'.format(dir), map_location=ml) ) print('model weights loaded') def save_model(self,output): if len(self.gpu_ids) == 1 and self.gpu_ids[0] > 0: with torch.cuda.device(self.gpu_ids[0]): torch.save( self.actor.state_dict(), '{}/actor.pt'.format(output) ) torch.save( self.critic.state_dict(), '{}/critic.pt'.format(output) ) elif len(self.gpu_ids) > 1: torch.save(self.actor.module.state_dict(), '{}/actor.pt'.format(output) ) torch.save(self.actor.module.state_dict(), '{}/critic.pt'.format(output) ) else: torch.save( self.actor.state_dict(), '{}/actor.pt'.format(output) ) torch.save( self.critic.state_dict(), '{}/critic.pt'.format(output) ) def seed(self,seed): torch.manual_seed(seed) if len(self.gpu_ids) > 0: torch.cuda.manual_seed_all(seed)
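# Note that save_model above, in its multi-GPU (DataParallel) branch, writes
# self.actor.module.state_dict() to both actor.pt and critic.pt, and load_weights builds its
# map_location with storage.cuda(self.gpu_ids), passing a list where a single device index is
# expected. A hedged, corrected sketch of that save/load pair under simplified path
# assumptions (not the repo's exact file layout):
import torch

def save_model(actor, critic, output, data_parallel=False):
    # with DataParallel the underlying network sits in .module
    actor_sd = actor.module.state_dict() if data_parallel else actor.state_dict()
    critic_sd = critic.module.state_dict() if data_parallel else critic.state_dict()
    torch.save(actor_sd, '{}/actor.pt'.format(output))
    torch.save(critic_sd, '{}/critic.pt'.format(output))

def load_weights(actor, critic, output, gpu_id=-1):
    # map_location receives a single device index; gpu_id < 0 keeps the tensors on CPU
    ml = (lambda storage, loc: storage.cuda(gpu_id)) if gpu_id >= 0 \
        else (lambda storage, loc: storage)
    actor.load_state_dict(torch.load('{}/actor.pt'.format(output), map_location=ml))
    critic.load_state_dict(torch.load('{}/critic.pt'.format(output), map_location=ml))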
class DDPG(object): def __init__(self, nb_status, nb_actions, args, writer): self.clip_actor_grad = args.clip_actor_grad self.nb_status = nb_status * args.window_length self.nb_actions = nb_actions self.writer = writer self.select_time = 0 # Create Actor and Critic Network net_cfg = { 'hidden1':args.hidden1, 'hidden2':args.hidden2, 'init_method':args.init_method } self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_optim = Adam(self.actor.parameters(), lr=args.prate) self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) #Create replay buffer self.memory = rpm(args.rmsize) self.random_process = Myrandom(size=nb_actions) # Hyper-parameters self.batch_size = args.batch_size self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.use_cuda = args.cuda # if self.use_cuda: self.cuda() def update_policy(self, train_actor = True): # Sample batch state_batch, action_batch, reward_batch, \ next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size) # Prepare for the target q batch next_q_values = self.critic_target([ to_tensor(next_state_batch, volatile=True), self.actor_target(to_tensor(next_state_batch, volatile=True)), ]) # print('batch of picture is ok') next_q_values.volatile = False target_q_batch = to_tensor(reward_batch) + \ self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values # Critic update self.critic.zero_grad() q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)]) # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float)) value_loss = nn.MSELoss()(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() self.actor.zero_grad() policy_loss = -self.critic([ to_tensor(state_batch), self.actor(to_tensor(state_batch)) ]) policy_loss = policy_loss.mean() policy_loss.backward() if self.clip_actor_grad is not None: torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad)) if self.writer != None: mean_policy_grad = np.array(np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters()])) #print(mean_policy_grad) self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time) if train_actor: self.actor_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) return -policy_loss, value_loss def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def train(self): self.actor.train() self.actor_target.train() self.critic.train() self.critic_target.train() def cuda(self): self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def observe(self, r_t, s_t1, done): self.memory.append([self.s_t, self.a_t, r_t, s_t1, done]) self.s_t = s_t1 def random_action(self): action = np.random.uniform(-1.,1.,self.nb_actions) self.a_t = action return action def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0): self.eval() # print(s_t.shape) action = 
to_numpy( self.actor(to_tensor(np.array([s_t]))) ).squeeze(0) self.train() noise_level = noise_level * max(self.epsilon, 0) action = action * (1 - noise_level) + (self.random_process.sample() * noise_level) action = np.clip(action, -1., 1.) if decay_epsilon: self.epsilon -= self.depsilon self.a_t = action return action def reset(self, obs): self.s_t = obs self.random_process.reset_status() def load_weights(self, output, num=1): if output is None: return self.actor.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num)) ) self.actor_target.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num)) ) self.critic.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num)) ) self.critic_target.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num)) ) def save_model(self, output, num): if self.use_cuda: self.actor.cpu() self.critic.cpu() torch.save( self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num) ) torch.save( self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num) ) if self.use_cuda: self.actor.cuda() self.critic.cuda()
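# The actor update above clips gradients with torch.nn.utils.clip_grad_norm (deprecated in
# favour of clip_grad_norm_) and logs the mean per-parameter gradient norm to TensorBoard.
# A sketch of the same step on current PyTorch; `writer` is assumed to be a
# torch.utils.tensorboard SummaryWriter and `step` a global counter.
import numpy as np
import torch

def clip_and_log_actor_grads(actor, writer=None, clip_norm=None, step=0):
    if clip_norm is not None:
        torch.nn.utils.clip_grad_norm_(actor.parameters(), float(clip_norm))
    if writer is not None:
        grad_norms = [p.grad.detach().norm().item()
                      for p in actor.parameters() if p.grad is not None]
        writer.add_scalar('train/mean_policy_grad', float(np.mean(grad_norms)), step)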
class DDPG(object): def __init__(self, nb_status, nb_actions, args, writer): self.clip_actor_grad = args.clip_actor_grad self.nb_status = nb_status * args.window_length self.nb_actions = nb_actions self.writer = writer self.select_time = 0 # Create Actor and Critic Network net_cfg = { 'hidden1': args.hidden1, 'hidden2': args.hidden2, 'init_method': args.init_method } self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_optim = Adam(self.actor.parameters(), lr=args.prate) self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) #Create replay buffer self.memory = rpm(args.rmsize) self.random_process = Myrandom(size=nb_actions) # Hyper-parameters self.batch_size = args.batch_size self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.use_cuda = args.cuda # if self.use_cuda: self.cuda() def update_policy(self, train_actor=True): # Sample batch state_batch, action_batch, reward_batch, \ next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size) # Prepare for the target q batch next_q_values = self.critic_target([ to_tensor(next_state_batch, volatile=True), self.actor_target(to_tensor(next_state_batch, volatile=True)), ]) # print('batch of picture is ok') next_q_values.volatile = False target_q_batch = to_tensor(reward_batch) + \ self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values # Critic update self.critic.zero_grad() q_batch = self.critic( [to_tensor(state_batch), to_tensor(action_batch)]) # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float)) value_loss = nn.MSELoss()(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() self.actor.zero_grad() policy_loss = -self.critic( [to_tensor(state_batch), self.actor(to_tensor(state_batch))]) policy_loss = policy_loss.mean() policy_loss.backward() if self.clip_actor_grad is not None: torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad)) if self.writer != None: mean_policy_grad = np.array( np.mean([ np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters() ])) #print(mean_policy_grad) self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time) if train_actor: self.actor_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) return -policy_loss, value_loss def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def train(self): self.actor.train() self.actor_target.train() self.critic.train() self.critic_target.train() def cuda(self): self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def observe(self, r_t, s_t1, done): self.memory.append([self.s_t, self.a_t, r_t, s_t1, done]) self.s_t = s_t1 def random_action(self): action = np.random.uniform(-1., 1., self.nb_actions) self.a_t = action return action def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0): self.eval() # print(s_t.shape) 
action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0) self.train() noise_level = noise_level * max(self.epsilon, 0) action = action * (1 - noise_level) + (self.random_process.sample() * noise_level) action = np.clip(action, -1., 1.) if decay_epsilon: self.epsilon -= self.depsilon self.a_t = action return action def reset(self, obs): self.s_t = obs self.random_process.reset_status() def load_weights(self, output, num=1): if output is None: return self.actor.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num))) self.actor_target.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num))) self.critic.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num))) self.critic_target.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num))) def save_model(self, output, num): if self.use_cuda: self.actor.cpu() self.critic.cpu() torch.save(self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num)) torch.save(self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num)) if self.use_cuda: self.actor.cuda() self.critic.cuda()
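# The preceding DDPG variants expose the same reset / random_action / select_action /
# observe / update_policy interface. A minimal interaction loop that drives such an agent
# against a Gym-style environment is sketched below; `env`, `warmup` and `max_steps` are
# illustrative assumptions, not part of the original code.
def run_training(agent, env, max_steps=10000, warmup=100):
    obs = env.reset()
    agent.reset(obs)
    for step in range(max_steps):
        # random warm-up fills the replay memory before learning starts
        action = agent.random_action() if step < warmup else agent.select_action(obs)
        next_obs, reward, done, _ = env.step(action)
        agent.observe(reward, next_obs, done)
        if step >= warmup:
            policy_loss, value_loss = agent.update_policy()
        obs = next_obs
        if done:
            obs = env.reset()
            agent.reset(obs)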
class DDPG: def __init__(self, env, args): ob_space = env.observation_space goal_dim = env.goal_dim ob_dim = ob_space.shape[0] self.ob_dim = ob_dim self.ac_dim = ac_dim = 7 self.goal_dim = goal_dim self.num_iters = args.num_iters self.random_prob = args.random_prob self.tau = args.tau self.reward_scale = args.reward_scale self.gamma = args.gamma self.log_interval = args.log_interval self.save_interval = args.save_interval self.rollout_steps = args.rollout_steps self.env = env self.batch_size = args.batch_size self.train_steps = args.train_steps self.closest_dist = np.inf self.warmup_iter = args.warmup_iter self.max_grad_norm = args.max_grad_norm self.use_her = args.her self.k_future = args.k_future self.model_dir = os.path.join(args.save_dir, 'model') self.pretrain_dir = args.pretrain_dir os.makedirs(self.model_dir, exist_ok=True) self.global_step = 0 self.actor = Actor(ob_dim=ob_dim, act_dim=ac_dim, hid1_dim=args.hid1_dim, hid2_dim=args.hid2_dim, hid3_dim=args.hid3_dim, init_method=args.init_method) self.critic = Critic(ob_dim=ob_dim, act_dim=ac_dim, hid1_dim=args.hid1_dim, hid2_dim=args.hid2_dim, hid3_dim=args.hid3_dim, init_method=args.init_method) if args.resume or args.test or args.pretrain_dir is not None: self.load_model(args.resume_step, pretrain_dir=args.pretrain_dir) if not args.test: self.actor_target = Actor(ob_dim=ob_dim, act_dim=ac_dim, hid1_dim=args.hid1_dim, hid2_dim=args.hid2_dim, hid3_dim=args.hid3_dim, init_method=args.init_method) self.critic_target = Critic(ob_dim=ob_dim, act_dim=ac_dim, hid1_dim=args.hid1_dim, hid2_dim=args.hid2_dim, hid3_dim=args.hid3_dim, init_method=args.init_method) self.actor_optim = self.construct_optim(self.actor, lr=args.actor_lr) cri_w_decay = args.critic_weight_decay self.critic_optim = self.construct_optim(self.critic, lr=args.critic_lr, weight_decay=cri_w_decay) self.hard_update(self.actor_target, self.actor) self.hard_update(self.critic_target, self.critic) self.actor_target.eval() self.critic_target.eval() if args.noise_type == 'ou_noise': mu = np.zeros(ac_dim) sigma = float(args.ou_noise_std) * np.ones(ac_dim) self.action_noise = OrnsteinUhlenbeckActionNoise(mu=mu, sigma=sigma) elif args.noise_type == 'uniform': low_limit = args.uniform_noise_low high_limit = args.uniform_noise_high dec_step = args.max_noise_dec_step self.action_noise = UniformNoise(low_limit=low_limit, high_limit=high_limit, dec_step=dec_step) elif args.noise_type == 'gaussian': mu = np.zeros(ac_dim) sigma = args.normal_noise_std * np.ones(ac_dim) self.action_noise = NormalActionNoise(mu=mu, sigma=sigma) self.memory = Memory(limit=int(args.memory_limit), action_shape=(int(ac_dim), ), observation_shape=(int(ob_dim), )) self.critic_loss = nn.MSELoss() self.ob_norm = args.ob_norm if self.ob_norm: self.obs_oms = OnlineMeanStd(shape=(1, ob_dim)) else: self.obs_oms = None self.cuda() def test(self, render=False, record=True, slow_t=0): dist, succ_rate = self.rollout(render=render, record=record, slow_t=slow_t) print('Final step distance: ', dist) def train(self): self.net_mode(train=True) tfirststart = time.time() epoch_episode_rewards = deque(maxlen=1) epoch_episode_steps = deque(maxlen=1) total_rollout_steps = 0 for epoch in range(self.global_step, self.num_iters): episode_reward = 0 episode_step = 0 self.action_noise.reset() obs = self.env.reset() obs = obs[0] epoch_actor_losses = [] epoch_critic_losses = [] if self.use_her: ep_experi = { 'obs': [], 'act': [], 'reward': [], 'new_obs': [], 'ach_goals': [], 'done': [] } for t_rollout in range(self.rollout_steps): 
total_rollout_steps += 1 ran = np.random.random(1)[0] if self.pretrain_dir is None and epoch < self.warmup_iter or \ ran < self.random_prob: act = self.random_action().flatten() else: act = self.policy(obs).flatten() new_obs, r, done, info = self.env.step(act) ach_goals = new_obs[1].copy() new_obs = new_obs[0].copy() episode_reward += r episode_step += 1 self.memory.append(obs, act, r * self.reward_scale, new_obs, ach_goals, done) if self.use_her: ep_experi['obs'].append(obs) ep_experi['act'].append(act) ep_experi['reward'].append(r * self.reward_scale) ep_experi['new_obs'].append(new_obs) ep_experi['ach_goals'].append(ach_goals) ep_experi['done'].append(done) if self.ob_norm: self.obs_oms.update(new_obs) obs = new_obs epoch_episode_rewards.append(episode_reward) epoch_episode_steps.append(episode_step) if self.use_her: for t in range(episode_step - self.k_future): ob = ep_experi['obs'][t] act = ep_experi['act'][t] new_ob = ep_experi['new_obs'][t] ach_goal = ep_experi['ach_goals'][t] k_futures = np.random.choice(np.arange( t + 1, episode_step), self.k_future - 1, replace=False) k_futures = np.concatenate((np.array([t]), k_futures)) for future in k_futures: new_goal = ep_experi['ach_goals'][future] her_ob = np.concatenate( (ob[:-self.goal_dim], new_goal), axis=0) her_new_ob = np.concatenate( (new_ob[:-self.goal_dim], new_goal), axis=0) res = self.env.cal_reward(ach_goal.copy(), new_goal, act) her_reward, _, done = res self.memory.append(her_ob, act, her_reward * self.reward_scale, her_new_ob, ach_goal.copy(), done) self.global_step += 1 if epoch >= self.warmup_iter: for t_train in range(self.train_steps): act_loss, cri_loss = self.train_net() epoch_critic_losses.append(cri_loss) epoch_actor_losses.append(act_loss) if epoch % self.log_interval == 0: tnow = time.time() stats = {} if self.ob_norm: stats['ob_oms_mean'] = safemean(self.obs_oms.mean.numpy()) stats['ob_oms_std'] = safemean(self.obs_oms.std.numpy()) stats['total_rollout_steps'] = total_rollout_steps stats['rollout/return'] = safemean( [rew for rew in epoch_episode_rewards]) stats['rollout/ep_steps'] = safemean( [l for l in epoch_episode_steps]) if epoch >= self.warmup_iter: stats['actor_loss'] = np.mean(epoch_actor_losses) stats['critic_loss'] = np.mean(epoch_critic_losses) stats['epoch'] = epoch stats['actor_lr'] = self.actor_optim.param_groups[0]['lr'] stats['critic_lr'] = self.critic_optim.param_groups[0]['lr'] stats['time_elapsed'] = tnow - tfirststart for name, value in stats.items(): logger.logkv(name, value) logger.dumpkvs() if (epoch == 0 or epoch >= self.warmup_iter) and \ self.save_interval and\ epoch % self.save_interval == 0 and \ logger.get_dir(): mean_final_dist, succ_rate = self.rollout() logger.logkv('epoch', epoch) logger.logkv('test/total_rollout_steps', total_rollout_steps) logger.logkv('test/mean_final_dist', mean_final_dist) logger.logkv('test/succ_rate', succ_rate) tra_mean_dist, tra_succ_rate = self.rollout(train_test=True) logger.logkv('train/mean_final_dist', tra_mean_dist) logger.logkv('train/succ_rate', tra_succ_rate) # self.log_model_weights() logger.dumpkvs() if mean_final_dist < self.closest_dist: self.closest_dist = mean_final_dist is_best = True else: is_best = False self.save_model(is_best=is_best, step=self.global_step) def train_net(self): batch_data = self.memory.sample(batch_size=self.batch_size) for key, value in batch_data.items(): batch_data[key] = torch.from_numpy(value) obs0_t = batch_data['obs0'] obs1_t = batch_data['obs1'] obs0_t = self.normalize(obs0_t, self.obs_oms) obs1_t = 
self.normalize(obs1_t, self.obs_oms) obs0 = Variable(obs0_t).float().cuda() with torch.no_grad(): vol_obs1 = Variable(obs1_t).float().cuda() rewards = Variable(batch_data['rewards']).float().cuda() actions = Variable(batch_data['actions']).float().cuda() terminals = Variable(batch_data['terminals1']).float().cuda() cri_q_val = self.critic(obs0, actions) with torch.no_grad(): target_net_act = self.actor_target(vol_obs1) target_net_q_val = self.critic_target(vol_obs1, target_net_act) # target_net_q_val.volatile = False target_q_label = rewards target_q_label += self.gamma * target_net_q_val * (1 - terminals) target_q_label = target_q_label.detach() self.actor.zero_grad() self.critic.zero_grad() cri_loss = self.critic_loss(cri_q_val, target_q_label) cri_loss.backward() if self.max_grad_norm is not None: torch.nn.utils.clip_grad_norm(self.critic.parameters(), self.max_grad_norm) self.critic_optim.step() self.critic.zero_grad() self.actor.zero_grad() net_act = self.actor(obs0) net_q_val = self.critic(obs0, net_act) act_loss = -net_q_val.mean() act_loss.backward() if self.max_grad_norm is not None: torch.nn.utils.clip_grad_norm(self.actor.parameters(), self.max_grad_norm) self.actor_optim.step() self.soft_update(self.actor_target, self.actor, self.tau) self.soft_update(self.critic_target, self.critic, self.tau) return act_loss.cpu().data.numpy(), cri_loss.cpu().data.numpy() def normalize(self, x, stats): if stats is None: return x return (x - stats.mean) / stats.std def denormalize(self, x, stats): if stats is None: return x return x * stats.std + stats.mean def net_mode(self, train=True): if train: self.actor.train() self.critic.train() else: self.actor.eval() self.critic.eval() def load_model(self, step=None, pretrain_dir=None): model_dir = self.model_dir if pretrain_dir is not None: ckpt_file = os.path.join(self.pretrain_dir, 'model_best.pth') else: if step is None: ckpt_file = os.path.join(model_dir, 'model_best.pth') else: ckpt_file = os.path.join(model_dir, 'ckpt_{:08d}.pth'.format(step)) if not os.path.isfile(ckpt_file): raise ValueError("No checkpoint found at '{}'".format(ckpt_file)) mutils.print_yellow('Loading checkpoint {}'.format(ckpt_file)) checkpoint = torch.load(ckpt_file) if pretrain_dir is not None: actor_dict = self.actor.state_dict() critic_dict = self.critic.state_dict() actor_pretrained_dict = { k: v for k, v in checkpoint['actor_state_dict'].items() if k in actor_dict } critic_pretrained_dict = { k: v for k, v in checkpoint['critic_state_dict'].items() if k in critic_dict } actor_dict.update(actor_pretrained_dict) critic_dict.update(critic_pretrained_dict) self.actor.load_state_dict(actor_dict) self.critic.load_state_dict(critic_dict) self.global_step = 0 else: self.actor.load_state_dict(checkpoint['actor_state_dict']) self.critic.load_state_dict(checkpoint['critic_state_dict']) self.global_step = checkpoint['global_step'] if step is None: mutils.print_yellow('Checkpoint step: {}' ''.format(checkpoint['ckpt_step'])) self.warmup_iter += self.global_step mutils.print_yellow('Checkpoint loaded...') def save_model(self, is_best, step=None): if step is None: step = self.global_step ckpt_file = os.path.join(self.model_dir, 'ckpt_{:08d}.pth'.format(step)) data_to_save = { 'ckpt_step': step, 'global_step': self.global_step, 'actor_state_dict': self.actor.state_dict(), 'actor_optimizer': self.actor_optim.state_dict(), 'critic_state_dict': self.critic.state_dict(), 'critic_optimizer': self.critic_optim.state_dict() } mutils.print_yellow('Saving checkpoint: %s' % ckpt_file) 
torch.save(data_to_save, ckpt_file) if is_best: torch.save(data_to_save, os.path.join(self.model_dir, 'model_best.pth')) def rollout(self, train_test=False, render=False, record=False, slow_t=0): test_conditions = self.env.train_test_conditions \ if train_test else self.env.test_conditions done_num = 0 final_dist = [] episode_length = [] for idx in range(test_conditions): if train_test: obs = self.env.train_test_reset(cond=idx) else: obs = self.env.test_reset(cond=idx) for t_rollout in range(self.rollout_steps): obs = obs[0].copy() act = self.policy(obs, stochastic=False).flatten() obs, r, done, info = self.env.step(act) if render: self.env.render() if slow_t > 0: time.sleep(slow_t) if done: done_num += 1 break if record: print('dist: ', info['dist']) final_dist.append(info['dist']) episode_length.append(t_rollout) final_dist = np.array(final_dist) mean_final_dist = np.mean(final_dist) succ_rate = done_num / float(test_conditions) if record: with open('./test_data.json', 'w') as f: json.dump(final_dist.tolist(), f) print('\nDist statistics:') print("Minimum: {0:9.4f} Maximum: {1:9.4f}" "".format(np.min(final_dist), np.max(final_dist))) print("Mean: {0:9.4f}".format(mean_final_dist)) print("Standard Deviation: {0:9.4f}".format(np.std(final_dist))) print("Median: {0:9.4f}".format(np.median(final_dist))) print("First quartile: {0:9.4f}" "".format(np.percentile(final_dist, 25))) print("Third quartile: {0:9.4f}" "".format(np.percentile(final_dist, 75))) print('Success rate:', succ_rate) if render: while True: self.env.render() return mean_final_dist, succ_rate def log_model_weights(self): for name, param in self.actor.named_parameters(): logger.logkv('actor/' + name, param.clone().cpu().data.numpy()) for name, param in self.actor_target.named_parameters(): logger.logkv('actor_target/' + name, param.clone().cpu().data.numpy()) for name, param in self.critic.named_parameters(): logger.logkv('critic/' + name, param.clone().cpu().data.numpy()) for name, param in self.critic_target.named_parameters(): logger.logkv('critic_target/' + name, param.clone().cpu().data.numpy()) def random_action(self): act = np.random.uniform(-1., 1., self.ac_dim) return act def policy(self, obs, stochastic=True): self.actor.eval() ob = Variable(torch.from_numpy(obs)).float().cuda().view(1, -1) act = self.actor(ob) act = act.cpu().data.numpy() if stochastic: act = self.action_noise(act) self.actor.train() return act def cuda(self): self.critic.cuda() self.actor.cuda() if hasattr(self, 'critic_target'): self.critic_target.cuda() self.actor_target.cuda() self.critic_loss.cuda() def construct_optim(self, net, lr, weight_decay=None): if weight_decay is None: weight_decay = 0 params = mutils.add_weight_decay([net], weight_decay=weight_decay) optimizer = optim.Adam(params, lr=lr, weight_decay=weight_decay) return optimizer def soft_update(self, target, source, tau): for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau) def hard_update(self, target, source): for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(param.data)
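# The HER branch in train() above relabels each stored transition with goals achieved later
# in the same episode (the "future" strategy) and recomputes the reward through
# env.cal_reward. A compact sketch of that relabeling, assuming the observation ends with
# the goal vector as in the code above (ob[:-goal_dim] is the goal-free part); the names
# here are illustrative only.
import numpy as np

def her_relabel(episode, goal_dim, k_future, compute_reward):
    """episode: list of dicts with keys obs, act, new_obs, ach_goal. Returns relabeled tuples."""
    relabeled = []
    T = len(episode)
    for t in range(T - 1):
        future_idx = np.random.choice(np.arange(t + 1, T),
                                      min(k_future, T - t - 1), replace=False)
        for f in future_idx:
            new_goal = episode[f]['ach_goal']
            step = episode[t]
            her_ob = np.concatenate((step['obs'][:-goal_dim], new_goal))
            her_new_ob = np.concatenate((step['new_obs'][:-goal_dim], new_goal))
            reward, done = compute_reward(step['ach_goal'], new_goal, step['act'])
            relabeled.append((her_ob, step['act'], reward, her_new_ob, done))
    return relabeled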
dp = True model = Actor(7, 1, 1) critic = Critic(7, 1) # Adjust model to load: # model.load_state_dict(torch.load('Models/' + str(evaluate_episode_number) + '_actor.pt')) model_name = 'actor_RUN-8_user-bestmodel_MEps-100_Bsize-128_LRac-1e-05_LRcr-0.006_Tau-0.001_maxBuf-20000_explRt-0.2.pt' model.load_state_dict(torch.load('Models/' + model_name)) model.eval() # model_name = '140_critic_user-hyperLR_MEps-200_Bsize-128_LRac-0.001_LRcr-0.001_Tau-0.001_maxBuf-20000_explRt-0.2.pt' model_name = model_name.replace('actor', 'critic') # critic.load_state_dict(torch.load('Models/' + str(evaluate_episode_number) + '_critic.pt')) critic.load_state_dict(torch.load('Models/' + model_name)) critic.eval() # random model, does not need actor model since policy is random random_rewards, random_actions = random_policy() # test policy with trained actor model policy_rewards, policy_actions = test_policy(model, critic) DP_rewards = [] DP_actions = [] if dp == True: f = open('results.pckl', 'rb') DP_actions = pickle.load(f) f.close() DP_rewards, DP_actions = test_policy_DP(DP_actions.x) # print("The mean and variance for normalizing the rewards") # print(np.mean(DP_rewards), np.std(DP_rewards)) make_stats(policy_rewards, policy_actions, random_rewards, random_actions,
def main(): env = gym.make(args.env_name) env.seed(args.seed) torch.manual_seed(args.seed) num_inputs = env.observation_space.shape[0] num_actions = env.action_space.shape[0] running_state = ZFilter((num_inputs,), clip=5) print('state size:', num_inputs) print('action size:', num_actions) actor = Actor(num_inputs, num_actions, args) critic = Critic(num_inputs, args) discrim = Discriminator(num_inputs + num_actions, args) actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate) critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, weight_decay=args.l2_rate) discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate) # load demonstrations expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb")) demonstrations = np.array(expert_demo) print("demonstrations.shape", demonstrations.shape) writer = SummaryWriter(args.logdir) if args.load_model is not None: saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) ckpt = torch.load(saved_ckpt_path) actor.load_state_dict(ckpt['actor']) critic.load_state_dict(ckpt['critic']) discrim.load_state_dict(ckpt['discrim']) running_state.rs.n = ckpt['z_filter_n'] running_state.rs.mean = ckpt['z_filter_m'] running_state.rs.sum_square = ckpt['z_filter_s'] print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n)) episodes = 0 train_discrim_flag = True for iter in range(args.max_iter_num): actor.eval(), critic.eval() memory = deque() steps = 0 scores = [] while steps < args.total_sample_size: state = env.reset() score = 0 state = running_state(state) for _ in range(10000): if args.render: env.render() steps += 1 mu, std = actor(torch.Tensor(state).unsqueeze(0)) action = get_action(mu, std)[0] next_state, reward, done, _ = env.step(action) irl_reward = get_reward(discrim, state, action) if done: mask = 0 else: mask = 1 memory.append([state, action, irl_reward, mask]) next_state = running_state(next_state) state = next_state score += reward if done: break episodes += 1 scores.append(score) score_avg = np.mean(scores) print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg)) writer.add_scalar('log/score', float(score_avg), iter) actor.train(), critic.train(), discrim.train() if train_discrim_flag: expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim, demonstrations, args) print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100)) if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen: train_discrim_flag = False train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args) if iter % 100: score_avg = int(score_avg) model_path = os.path.join(os.getcwd(),'save_model') if not os.path.isdir(model_path): os.makedirs(model_path) ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar') save_checkpoint({ 'actor': actor.state_dict(), 'critic': critic.state_dict(), 'discrim': discrim.state_dict(), 'z_filter_n':running_state.rs.n, 'z_filter_m': running_state.rs.mean, 'z_filter_s': running_state.rs.sum_square, 'args': args, 'score': score_avg }, filename=ckpt_path)
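# The loop above swaps the environment reward for irl_reward = get_reward(discrim, state, action),
# but get_reward is not defined in this file. A minimal sketch of one common GAIL convention
# (an assumption, not necessarily this project's helper): the discriminator ends in a sigmoid
# and the imitation reward is -log(1 - D(s, a)), which grows as the learner fools the discriminator.
import numpy as np
import torch

def get_reward(discrim, state, action):
    state = torch.as_tensor(state, dtype=torch.float32)
    action = torch.as_tensor(action, dtype=torch.float32)
    state_action = torch.cat([state, action]).unsqueeze(0)
    with torch.no_grad():
        d = discrim(state_action)  # assumed to be a probability in (0, 1)
    return -np.log(1.0 - d.item() + 1e-8)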
def main(): env = gym.make(args.env_name) env.seed(args.seed) torch.manual_seed(args.seed) num_inputs = env.observation_space.shape[0] num_actions = env.action_space.shape[0] running_state = ZFilter((num_inputs,), clip=5) print('state size:', num_inputs) print('action size:', num_actions) actor = Actor(num_inputs, num_actions, args) critic = Critic(num_inputs, args) actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate) critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, weight_decay=args.l2_rate) writer = SummaryWriter(comment="-ppo_iter-" + str(args.max_iter_num)) if args.load_model is not None: saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) ckpt = torch.load(saved_ckpt_path) actor.load_state_dict(ckpt['actor']) critic.load_state_dict(ckpt['critic']) running_state.rs.n = ckpt['z_filter_n'] running_state.rs.mean = ckpt['z_filter_m'] running_state.rs.sum_square = ckpt['z_filter_s'] print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n)) episodes = 0 for iter in range(args.max_iter_num): actor.eval(), critic.eval() memory = deque() steps = 0 scores = [] while steps < args.total_sample_size: state = env.reset() score = 0 state = running_state(state) for _ in range(10000): if args.render: env.render() steps += 1 mu, std = actor(torch.Tensor(state).unsqueeze(0)) action = get_action(mu, std)[0] next_state, reward, done, _ = env.step(action) if done: mask = 0 else: mask = 1 memory.append([state, action, reward, mask]) next_state = running_state(next_state) state = next_state score += reward if done: break episodes += 1 scores.append(score) score_avg = np.mean(scores) print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg)) writer.add_scalar('log/score', float(score_avg), iter) actor.train(), critic.train() train_model(actor, critic, memory, actor_optim, critic_optim, args) if iter % 100: score_avg = int(score_avg) model_path = os.path.join(os.getcwd(),'save_model') if not os.path.isdir(model_path): os.makedirs(model_path) ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar') save_checkpoint({ 'actor': actor.state_dict(), 'critic': critic.state_dict(), 'z_filter_n':running_state.rs.n, 'z_filter_m': running_state.rs.mean, 'z_filter_s': running_state.rs.sum_square, 'args': args, 'score': score_avg }, filename=ckpt_path)
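# Both training loops above call get_action(mu, std) on the actor's output but do not define it.
# A minimal sketch, assuming the actor returns the mean and standard deviation of a diagonal
# Gaussian policy; the indexing get_action(mu, std)[0] then picks the single sampled action.
import torch

def get_action(mu, std):
    # draw one sample per row of mu and hand it back as a NumPy array
    action = torch.normal(mu, std)
    return action.detach().numpy()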
class DDPG(object): def __init__(self, nb_states, nb_actions, args): if args.seed > 0: self.seed(args.seed) self.nb_states = nb_states self.nb_actions = nb_actions actor_net_cfg = { 'hidden1': 32, 'hidden2': 32, 'hidden3': 32, 'init_w': args.init_w } critic_net_cfg = { 'hidden1': 64, 'hidden2': 64, 'hidden3': 64, 'init_w': args.init_w } self.actor = Actor(self.nb_states, self.nb_actions, **actor_net_cfg) self.actor_target = Actor(self.nb_states, self.nb_actions, **actor_net_cfg) self.actor_optim = Adam(self.actor.parameters(), lr=args.prate) self.critic = Critic(self.nb_states, self.nb_actions, **critic_net_cfg) self.critic_target = Critic(self.nb_states, self.nb_actions, **critic_net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) #Create replay buffer self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length) self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma) # Hyper-parameters self.batch_size = args.bsize self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.is_training = True self.best_reward = -10 def update_policy(self, shared_model, args): # Sample batch state_batch, action_batch, reward_batch, \ next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size, shared=args.use_more_states, num_states=args.num_states) # Prepare for the target q batch next_q_values = self.critic_target([ to_tensor(next_state_batch, volatile=True), self.actor_target(to_tensor(next_state_batch, volatile=True)), ]) next_q_values.volatile = False target_q_batch = to_tensor(reward_batch) + \ self.discount*to_tensor(terminal_batch.astype(np.float))*next_q_values # Critic update self.critic_optim.zero_grad() q_batch = self.critic( [to_tensor(state_batch), to_tensor(action_batch)]) value_loss = criterion(q_batch, target_q_batch) value_loss.backward() if args.shared: ensure_shared_grads(self.critic, shared_model.critic) self.critic_optim.step() # Actor update self.actor_optim.zero_grad() policy_loss = -self.critic( [to_tensor(state_batch), self.actor(to_tensor(state_batch))]) policy_loss = policy_loss.mean() policy_loss.backward() if args.shared: ensure_shared_grads(self.actor, shared_model.actor) self.actor_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def share_memory(self): self.critic.share_memory() self.actor.share_memory() def add_optim(self, actor_optim, critic_optim): self.actor_optim = actor_optim self.critic_optim = critic_optim def observe(self, r_t, s_t1, done): if self.is_training: self.memory.append(self.s_t, self.a_t, r_t, done) self.s_t = s_t1 def update_models(self, agent): self.actor = deepcopy(agent.actor) self.actor_target = deepcopy(agent.actor_target) self.critic = deepcopy(agent.critic) self.critic_target = deepcopy(agent.critic_target) self.actor_optim = deepcopy(agent.actor_optim) self.critic_optim = deepcopy(agent.critic_optim) def random_action(self): action = np.random.uniform(-1., 1., self.nb_actions) self.a_t = action return action def train(self): self.critic.train() self.actor.train() def 
state_dict(self): return [ self.actor.state_dict(), self.actor_target.state_dict(), self.critic.state_dict(), self.critic_target.state_dict() ] def load_state_dict(self, list_of_dicts): self.actor.load_state_dict(list_of_dicts[0]) self.actor_target.load_state_dict(list_of_dicts[1]) self.critic.load_state_dict(list_of_dicts[2]) self.critic_target.load_state_dict(list_of_dicts[3]) def select_action(self, s_t, decay_epsilon=True): action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0) action += self.is_training * max(self.epsilon, 0) * self.random_process.sample() action = np.clip(action, -1., 1.) if decay_epsilon: self.epsilon -= self.depsilon self.a_t = action return action def reset(self, obs): self.s_t = obs self.random_process.reset_states() def load_weights(self, output): if output is None: return self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output))) self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output))) def save_model(self, output): torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output)) torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output)) def seed(self, s): torch.manual_seed(s)
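# This agent draws exploration noise from an OrnsteinUhlenbeckProcess with theta/mu/sigma
# arguments and calls sample() and reset_states() on it. A compact stand-in showing how such
# temporally correlated noise is typically generated; the exact implementation here is an
# illustrative assumption, not the project's own class.
import numpy as np

class OrnsteinUhlenbeckNoise:
    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2, dt=1e-2):
        self.size, self.theta, self.mu, self.sigma, self.dt = size, theta, mu, sigma, dt
        self.reset_states()

    def reset_states(self):
        self.x_prev = np.ones(self.size) * self.mu

    def sample(self):
        # mean-reverting random walk: pulled toward mu, perturbed by Gaussian noise
        dx = self.theta * (self.mu - self.x_prev) * self.dt \
             + self.sigma * np.sqrt(self.dt) * np.random.randn(self.size)
        self.x_prev = self.x_prev + dx
        return self.x_prev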
class DDPG(object): def __init__(self, nb_states, nb_actions, args, discrete, use_cuda=False): if args.seed > 0: self.seed(args.seed) self.nb_states = nb_states self.nb_actions = nb_actions self.discrete = discrete # Create Actor and Critic Network net_cfg = { 'hidden1':args.hidden1, 'hidden2':args.hidden2, 'init_w':args.init_w } self.actor = Actor(self.nb_states * args.window_length, self.nb_actions, **net_cfg) self.actor_target = Actor(self.nb_states * args.window_length, self.nb_actions, **net_cfg) self.actor_optim = Adam(self.actor.parameters(), lr=args.prate) self.critic = Critic(self.nb_states * args.window_length, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_states * args.window_length, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) #Create replay buffer self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length) self.random_process = Myrandom(size=nb_actions) # Hyper-parameters self.batch_size = args.bsize self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.use_cuda = use_cuda # if self.use_cuda: self.cuda() def update_policy(self, train_actor = True): # Sample batch state_batch, action_batch, reward_batch, \ next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size) # state_batch, action_batch, reward_batch, \ # next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size) # Prepare for the target q batch next_q_values = self.critic_target([ to_tensor(next_state_batch, volatile=True), self.actor_target(to_tensor(next_state_batch, volatile=True)), ]) next_q_values.volatile = False target_q_batch = to_tensor(reward_batch) + \ self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values # Critic update self.critic.zero_grad() q_batch = self.critic([ to_tensor(state_batch), to_tensor(action_batch) ]) value_loss = criterion(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() self.actor.zero_grad() policy_loss = -self.critic([ to_tensor(state_batch), self.actor(to_tensor(state_batch)) ]) policy_loss = policy_loss.mean() policy_loss.backward() if train_actor == True: self.actor_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) return -policy_loss, value_loss def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def cuda(self): print("use cuda") self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def observe(self, r_t, s_t1, done): self.memory.append([self.s_t, self.a_t, r_t, s_t1, done]) self.s_t = s_t1 def random_action(self): action = np.random.uniform(-1.,1.,self.nb_actions) self.a_t = action if self.discrete: return action.argmax() else: return action def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=1): action = to_numpy( self.actor(to_tensor(np.array([s_t]))) ).squeeze(0) # print(self.random_process.sample(), action) noise_level = noise_level * max(self.epsilon, 0) action = action * (1 - noise_level) + (self.random_process.sample() * noise_level) # print(max(self.epsilon, 0) * self.random_process.sample() * noise_level, noise_level) action = 
np.clip(action, -1., 1.) # print(action) if decay_epsilon: self.epsilon -= self.depsilon self.a_t = action if return_fix: return action if self.discrete: return action.argmax() else: return action def reset(self, obs): self.s_t = obs self.random_process.reset_states() def load_weights(self, output): if output is None: return self.actor.load_state_dict( torch.load('{}/actor.pkl'.format(output)) ) self.critic.load_state_dict( torch.load('{}/critic.pkl'.format(output)) ) def save_model(self, output): if self.use_cuda: self.actor.cpu() self.critic.cpu() torch.save( self.actor.state_dict(), '{}/actor.pkl'.format(output) ) torch.save( self.critic.state_dict(), '{}/critic.pkl'.format(output) ) if self.use_cuda: self.actor.cuda() self.critic.cuda() def seed(self,s): torch.manual_seed(s) if self.use_cuda: torch.cuda.manual_seed(s)
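# The DDPG variants in this collection lean on module-level helpers (to_tensor, to_numpy,
# soft_update, hard_update, criterion) that are not shown. Minimal versions consistent with
# how they are called above, written against current PyTorch where the old volatile flag is
# a no-op and torch.no_grad() is used instead; treat these as a sketch, not the originals.
import numpy as np
import torch
import torch.nn as nn

criterion = nn.MSELoss()  # the loss the update_policy methods call as criterion(...)

def to_tensor(ndarray, volatile=False, dtype=torch.float32):
    # `volatile` is kept only for signature compatibility with the calls above
    return torch.as_tensor(np.asarray(ndarray), dtype=dtype)

def to_numpy(tensor):
    return tensor.detach().cpu().numpy()

def soft_update(target, source, tau):
    # Polyak averaging: theta_target <- tau * theta_source + (1 - tau) * theta_target
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(t_param.data * (1.0 - tau) + s_param.data * tau)

def hard_update(target, source):
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)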
class DDPG(object): def __init__(self, nb_status, nb_actions, args): self.num_actor = 3 self.nb_status = nb_status * args.window_length self.nb_actions = nb_actions self.discrete = args.discrete self.pic = args.pic if self.pic: self.nb_status = args.pic_status # Create Actor and Critic Network net_cfg = { 'hidden1':args.hidden1, 'hidden2':args.hidden2, 'use_bn':args.bn } if args.pic: self.cnn = CNN(3, args.pic_status) self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate) self.actors = [Actor(self.nb_status, self.nb_actions) for _ in range(self.num_actor)] self.actor_targets = [Actor(self.nb_status, self.nb_actions) for _ in range(self.num_actor)] self.actor_optims = [Adam(self.actors[i].parameters(), lr=args.prate) for i in range(self.num_actor)] self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) for i in range(self.num_actor): hard_update(self.actor_targets[i], self.actors[i]) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) #Create replay buffer self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length) self.random_process = Myrandom(size=nb_actions) # Hyper-parameters self.batch_size = args.batch_size self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.use_cuda = args.cuda # if self.use_cuda: self.cuda() def normalize(self, pic): pic = pic.swapaxes(0, 2).swapaxes(1, 2) return pic def update_policy(self, train_actor = True): # Sample batch state_batch, action_batch, reward_batch, \ next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size) # Prepare for the target q batch if self.pic: state_batch = np.array([self.normalize(x) for x in state_batch]) state_batch = to_tensor(state_batch, volatile=True) print('label 1') print('size = ', state_batch.shape) state_batch = self.cnn(state_batch) print('label 2') next_state_batch = np.array([self.normalize(x) for x in next_state_batch]) next_state_batch = to_tensor(next_state_batch, volatile=True) next_state_batch = self.cnn(next_state_batch) next_q_values = self.critic_target([ next_state_batch, self.actor_target(next_state_batch) ]) else: index = np.random.randint(low=0, high=self.num_actor) next_q_values = self.critic_target([ to_tensor(next_state_batch, volatile=True), self.actor_targets[index](to_tensor(next_state_batch, volatile=True)), ]) # print('batch of picture is ok') next_q_values.volatile = False target_q_batch = to_tensor(reward_batch) + \ self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values # Critic update self.critic.zero_grad() if self.pic: self.cnn.zero_grad() if self.pic: state_batch.volatile = False q_batch = self.critic([state_batch, to_tensor(action_batch)]) else: q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)]) # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float)) value_loss = criterion(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() if self.pic: self.cnn_optim.step() sum_policy_loss = 0 for i in range(self.num_actor): self.actors[i].zero_grad() policy_loss = -self.critic([ to_tensor(state_batch), self.actors[i](to_tensor(state_batch)) ]) policy_loss = policy_loss.mean() policy_loss.backward() if train_actor: 
self.actor_optims[i].step() sum_policy_loss += policy_loss # Target update soft_update(self.actor_targets[i], self.actors[i], self.tau) soft_update(self.critic_target, self.critic, self.tau) return -sum_policy_loss / self.num_actor, value_loss def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def train(self): self.actor.train() self.actor_target.train() self.critic.train() self.critic_target.train() def cuda(self): for i in range(self.num_actor): self.actors[i].cuda() self.actor_targets[i].cuda() self.critic.cuda() self.critic_target.cuda() def observe(self, r_t, s_t1, done): self.memory.append([self.s_t, self.a_t, r_t, s_t1, done]) self.s_t = s_t1 def random_action(self): action = np.random.uniform(-1.,1.,self.nb_actions) self.a_t = action if self.discrete: return action.argmax() else: return action def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0): actions = [] status = [] tot_score = [] for i in range(self.num_actor): action = to_numpy(self.actors[i](to_tensor(np.array([s_t]), volatile=True))).squeeze(0) noise_level = noise_level * max(self.epsilon, 0) action = action + self.random_process.sample() * noise_level status.append(s_t) actions.append(action) tot_score.append(0.) scores = self.critic([to_tensor(np.array(status), volatile=True), to_tensor(np.array(actions), volatile=True)]) for j in range(self.num_actor): tot_score[j] += scores.data[j][0] best = np.array(tot_score).argmax() if decay_epsilon: self.epsilon -= self.depsilon self.a_t = actions[best] return actions[best] def reset(self, obs): self.s_t = obs self.random_process.reset_status() def load_weights(self, output, num=0): if output is None: return for i in range(self.num_actor): actor = self.actors[i] actor_target = self.actor_targets[i] actor.load_state_dict( torch.load('{}/actor{}_{}.pkl'.format(output, num, i)) ) actor_target.load_state_dict( torch.load('{}/actor{}_{}.pkl'.format(output, num, i)) ) self.critic.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num)) ) self.critic_target.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num)) ) def save_model(self, output, num): if self.use_cuda: for i in range(self.num_actor): self.actors[i].cpu() self.critic.cpu() for i in range(self.num_actor): torch.save( self.actors[i].state_dict(), '{}/actor{}_{}.pkl'.format(output, num, i) ) torch.save( self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num) ) if self.use_cuda: for i in range(self.num_actor): self.actors[i].cuda() self.critic.cuda()
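# This ensemble agent and the earlier variants store transitions in rpm(args.rmsize),
# appending [s, a, r, s1, done] lists and drawing them with sample_batch(batch_size).
# A minimal ring-buffer sketch consistent with that usage; the real rpm class may differ.
import random
import numpy as np

class rpm:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.pos = 0

    def append(self, transition):
        # overwrite the oldest entry once the buffer is full
        if len(self.buffer) < self.capacity:
            self.buffer.append(transition)
        else:
            self.buffer[self.pos] = transition
        self.pos = (self.pos + 1) % self.capacity

    def sample_batch(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)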
class DDPG(object): def __init__(self, nb_states, nb_actions): self.nb_states = nb_states self.nb_actions = nb_actions # Create Actor and Critic Network self.actor = Actor(self.nb_states, self.nb_actions) self.actor_target = Actor(self.nb_states, self.nb_actions) self.actor_optim = Adam(self.actor.parameters(), lr=ACTOR_LR) self.critic = Critic(self.nb_states, self.nb_actions) self.critic_target = Critic(self.nb_states, self.nb_actions) self.critic_optim = Adam(self.critic.parameters(), lr=CRITIC_LR) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) #Create replay buffer self.memory = SequentialMemory(limit=MEMORY_SIZE, window_length=HISTORY_LEN) self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=OU_THETA, mu=OU_MU, sigma=OU_SIGMA) # Hyper-parameters self.batch_size = BATCH_SIZE self.tau = TAU self.discount = GAMMA self.depsilon = 1.0 / DEPSILON self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.is_training = True if USE_CUDA: self.cuda() def update_policy(self): # Sample batch state_batch, action_batch, reward_batch, \ next_state_batch, terminal_batch = self.memory.sample_and_split(self.batch_size) # Prepare for the target q batch next_q_values = self.critic_target([ to_tensor(next_state_batch, volatile=True), self.actor_target(to_tensor(next_state_batch, volatile=True)), ])[:, 0] next_q_values.volatile = False target_q_batch = to_tensor(reward_batch) + \ self.discount*to_tensor(terminal_batch.astype(np.float))*next_q_values # Critic update self.critic.zero_grad() q_batch = self.critic( [to_tensor(state_batch), to_tensor(action_batch)]) value_loss = criterion(q_batch, target_q_batch) value_loss.backward() torch.nn.utils.clip_grad_norm(self.critic.parameters(), 10.0) for p in self.critic.parameters(): p.data.add_(-CRITIC_LR, p.grad.data) self.critic_optim.step() # Actor update self.actor.zero_grad() policy_loss = -self.critic( [to_tensor(state_batch), self.actor(to_tensor(state_batch))]) policy_loss = policy_loss.mean() policy_loss.backward() torch.nn.utils.clip_grad_norm(self.actor.parameters(), 10.0) for p in self.actor.parameters(): p.data.add_(-ACTOR_LR, p.grad.data) self.actor_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def cuda(self): self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def observe(self, r_t, s_t1, done): if self.is_training: self.memory.append(self.s_t, self.a_t, r_t, done) self.s_t = s_t1 def random_action(self): action = np.random.uniform(-1., 1., self.nb_actions) self.a_t = action return action def select_action(self, s_t, decay_epsilon=True): action = to_numpy(self.actor(to_tensor(np.array([s_t]))))[0] ou = self.random_process.sample() prGreen('eps:{}, act:{}, random:{}'.format(self.epsilon, action, ou)) action += self.is_training * max(self.epsilon, 0) * ou action = np.clip(action, -1., 1.) 
if decay_epsilon: self.epsilon -= self.depsilon self.a_t = action return action def reset(self, obs): self.s_t = obs self.random_process.reset_states() def load_weights(self, output): if output is None: return self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output))) self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output))) def save_model(self, output): torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output)) torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output)) def seed(self, s): torch.manual_seed(s) if USE_CUDA: torch.cuda.manual_seed(s)
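# The update_policy above uses the deprecated torch.nn.utils.clip_grad_norm and the two-argument
# form of Tensor.add_ for an extra manual parameter step. The current-API equivalent of the
# clip-then-step sequence looks like this (dummy critic and data, for illustration only):
import torch
import torch.nn as nn

critic = nn.Linear(4, 1)
critic_optim = torch.optim.Adam(critic.parameters(), lr=1e-3)

value_loss = critic(torch.randn(8, 4)).pow(2).mean()
critic_optim.zero_grad()
value_loss.backward()
torch.nn.utils.clip_grad_norm_(critic.parameters(), 10.0)  # note the trailing underscore
critic_optim.step()
# The original's manual step, p.data.add_(-CRITIC_LR, p.grad.data), is now written
# p.data.add_(p.grad.data, alpha=-CRITIC_LR); applying it in addition to the optimizer
# step performs two updates from the same gradients.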
class Agent(object): def __init__(self, nb_states, nb_actions, args): if args.seed > 0: self.seed(args.seed) self.nb_states = nb_states self.nb_actions = nb_actions # Create Actor and Critic Network self.actor = Actor(self.nb_states, self.nb_actions, args.init_w) self.actor_target = Actor(self.nb_states, self.nb_actions, args.init_w) self.critic = Critic(self.nb_states, self.nb_actions, args.init_w) self.critic_target = Critic(self.nb_states, self.nb_actions, args.init_w) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) #Create replay buffer self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma) # Hyper-parameters self.batch_size = args.bsize self.trajectory_length = args.trajectory_length self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.is_training = True # if USE_CUDA: self.cuda() def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def random_action(self): action = np.random.uniform(-1., 1., self.nb_actions) return action def select_action(self, state, noise_enable=True, decay_epsilon=True): action, _ = self.actor(to_tensor(np.array([state]))) action = to_numpy(action).squeeze(0) if noise_enable == True: action += self.is_training * max(self.epsilon, 0) * self.random_process.sample() action = np.clip(action, -1., 1.) if decay_epsilon: self.epsilon -= self.depsilon return action def reset_lstm_hidden_state(self, done=True): self.actor.reset_lstm_hidden_state(done) def reset(self): self.random_process.reset_states() def cuda(self): self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def load_weights(self, output): if output is None: return False self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output))) self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output))) return True def save_model(self, output): if not os.path.exists(output): os.mkdir(output) torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output)) torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))
class Agent(): """Interacts with and learns from the environment""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr = LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size,action_size,random_seed).to(device) self.critic_target = Critic(state_size,action_size,random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr = LR_CRITIC, weight_decay = WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size,random_seed) # Replay Buffer self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) self.counter = 0 # Make sure target is with the same weight as the source found on slack self.hard_update(self.actor_target, self.actor_local) self.hard_update(self.critic_target, self.critic_local) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for state,action,reward,next_state,done in zip(state, action, reward, next_state, done): self.memory.add(state, action, reward, next_state, done) self.counter+=1 # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE and self.counter%10==0: experience = self.memory.sample() self.learn(experience, GAMMA) def act(self, state, add_noise=True): """Return actions for given state as per current policy.""" #Save experience / reward state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experience, gamma): """Update policy and value parameters using given batch of experience tuples Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state,action) -> Q-value Params ====== experience (Torch[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ state, action, reward, next_state, done = experience # ============================== Update Critic =================================# # Get predicted next-state actions and Q values from target models self.actor_target.eval() ## there is no point is saving gradient self.critic_target.eval() actions_next = self.actor_target(next_state) Q_target_next = self.critic_target(next_state,actions_next) # Compute Q targets for current states (y_i) Q_targets = reward + (gamma*Q_target_next*(1-done)) ## Compute Critic Loss Q_expected = self.critic_local(state,action) critic_loss = F.mse_loss(Q_expected, Q_targets) ## Minize the loss self.critic_optimizer.zero_grad() critic_loss.backward() # torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ============================== Update Actor =================================# ## Compute actor loss action_pred = self.actor_local(state) actor_loss = 
-(self.critic_local(state,action_pred).mean()) ## maximize the critic's Q estimate of the actor's actions #print(Q_targets.size(),self.critic_local(state,action_pred).size()) # actor_loss = -(torch.mean(Q_targets-self.critic_local(state,action_pred))) ## We can compute the actor loss directly this way, without collecting whole ## trajectories (noisy Monte Carlo estimates of the return), because the action ## space is continuous and differentiable, so the gradient flows through the ## Q-value estimated by the critic. # Minimize loss self.actor_optimizer.zero_grad() actor_loss.backward() # del actor_loss self.actor_optimizer.step() # ========================== Update target network =================================# self.soft_update(self.critic_local,self.critic_target,TAU) self.soft_update(self.actor_local,self.actor_target,TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param,local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1-tau)*target_param.data) ## add noise to weights # local_param.data.copy_(local_param.data + self.noise.sample()[3]) def hard_update(self, target, source): for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_shape, action_size, num_agents, buffer_size, batch_size, gamma, tau, learning_rate_actor, learning_rate_critic, device, update_every=1, random_seed=42): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents acting in the environment buffer_size (int): replay buffer size batch_size (int): minibatch size gamma (float): discount factor tau (float): used for soft update of target parameters learning_rate_actor (float): learning rate for the actor learning_rate_critic (float): learning rate for the critic device (torch.Device): pytorch device update_every (int): how many time steps between network updates seed (int): random seed """ self.state_shape = state_shape self.action_size = action_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.device = device self.update_every = update_every self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(action_size, random_seed).to(device) self.actor_target = Actor(action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=learning_rate_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(action_size, random_seed).to(device) self.critic_target = Critic(action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=learning_rate_critic, weight_decay=0) # Noise process self.noise = OUNoise(size=action_size, seed=random_seed) # Replay memory self.memory = ReplayBuffer(action_size, buffer_size, batch_size, device=device, seed=random_seed) # Initialize time step (for updating every self.update_every steps) self.t_step = 0 def add(self, state, action, reward, next_state, done): """Add a new experience to memory.""" next_state_torch = torch.from_numpy(next_state).float().to(self.device) reward_torch = torch.from_numpy(np.array(reward)).float().to( self.device) done_torch = torch.from_numpy(np.array(done).astype( np.uint8)).float().to(self.device) state_torch = torch.from_numpy(state).float().to(self.device) action_torch = torch.from_numpy(action).float().to(self.device) self.actor_target.eval() self.critic_target.eval() self.critic_local.eval() with torch.no_grad(): action_next = self.actor_target(next_state_torch) Q_target_next = self.critic_target(next_state_torch, action_next) Q_target = reward_torch + (self.gamma * Q_target_next * (1 - done_torch)) Q_expected = self.critic_local(state_torch, action_torch) self.actor_local.train() self.critic_target.train() self.critic_local.train() #Error used in prioritized replay buffer error = (Q_expected - Q_target).squeeze().cpu().data.numpy() #Adding experiences to prioritized replay buffer #for i in np.arange(len(reward)): self.memory.add(error, state, action, reward, next_state, done) def step(self, state, action, reward, next_state, done): """Save experience in replay memory.""" # Save experience / reward self.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: if len(self.memory) > self.batch_size: experiences, idxs, is_weights = self.memory.sample() self.learn(experiences, idxs, is_weights) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = 
torch.from_numpy(state).float().to(self.device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, idxs, is_weights): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) #critic_loss = F.mse_loss(Q_expected, Q_targets) critic_loss = (torch.from_numpy(is_weights).float().to(self.device) * F.mse_loss(Q_expected, Q_targets)).mean() # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() #gradient clipping #torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) #.......................update priorities in prioritized replay buffer.......# #Calculate errors used in prioritized replay buffer errors = (Q_expected - Q_targets).squeeze().cpu().data.numpy() # update priority for i in range(self.batch_size): idx = idxs[i] self.memory.update(idx, errors[i]) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
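# The agent above hands TD errors to memory.add(error, ...), scales the critic loss by
# is_weights, and refreshes priorities with memory.update(idx, error). In proportional
# prioritized replay those quantities are usually derived as below; this is a sketch of
# the standard convention, not this project's ReplayBuffer implementation.
import numpy as np

alpha, beta, eps = 0.6, 0.4, 1e-5

def priority_from_error(td_error):
    # larger TD error -> larger sampling priority
    return (np.abs(td_error) + eps) ** alpha

def is_weight(priority, priorities_sum, buffer_len):
    # importance-sampling correction for the non-uniform sampling probability
    p_sample = priority / priorities_sum
    return (buffer_len * p_sample) ** (-beta)  # usually normalised by the batch maximum

# example numbers
prios = priority_from_error(np.array([0.1, 1.0, 3.0]))
weights = is_weight(prios, prios.sum(), buffer_len=3)
print(prios, weights / weights.max())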
def main(): expert_demo = pickle.load(open('./Expert dataset 1/expert_20x20_1.p', "rb")) demonstrations = np.array(expert_demo[0]) print("demonstrations.shape", demonstrations.shape) print(expert_demo[1]) print(expert_demo[0]) print(np.array(expert_demo[0]).shape) # expert_x = int(expert_demo[1][0]) # expert_y = int(expert_demo[1][1]) expert_x = int(expert_demo[0][0]) expert_y = int(expert_demo[0][1]) env = Env(expert_x, expert_y) # env.seed(args.seed) # torch.manual_seed(args.seed) num_inputs = 6 num_actions = 8 running_state = ZFilter((num_inputs,), clip=5) print('state size:', num_inputs) print('action size:', num_actions) actor = Actor(num_inputs, num_actions, args) critic = Critic(num_inputs, args) vdb = VDB(num_inputs + num_actions, args) actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate) critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, weight_decay=args.l2_rate) vdb_optim = optim.Adam(vdb.parameters(), lr=args.learning_rate) # load demonstrations k = 1 writer = SummaryWriter(args.logdir) if args.load_model is not None: saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) ckpt = torch.load(saved_ckpt_path) actor.load_state_dict(ckpt['actor']) critic.load_state_dict(ckpt['critic']) vdb.load_state_dict(ckpt['vdb']) running_state.rs.n = ckpt['z_filter_n'] running_state.rs.mean = ckpt['z_filter_m'] running_state.rs.sum_square = ckpt['z_filter_s'] print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n)) episodes = 0 train_discrim_flag = True for iter in range(args.max_iter_num): # expert_demo = pickle.load(open('./paper/{}.p'.format((iter+1)%expert_sample_size), "rb")) print(iter) expert_demo = pickle.load(open('./Expert dataset 1/expert_20x20_{}.p'.format(np.random.randint(1,50)), "rb")) tmp = expert_demo.pop(-1) demonstrations = np.array(expert_demo) print(demonstrations, demonstrations.shape) tot_sample_size = len(demonstrations) + 10 ########################## actor.eval(), critic.eval() memory = deque() steps = 0 scores = [] # while steps < args.total_sample_size: while steps < tot_sample_size: # env.delete_graph() state = env.reset() # time.sleep(1) score = 0 # state = running_state(state) state1 = state for _ in range((tot_sample_size+1)*2): if args.render: env.render() steps += 1 mu, std = actor(torch.Tensor(state).unsqueeze(0)) action2 = np.argmax(get_action(mu, std)[0]) action = get_action(mu, std)[0] next_state, reward, done, _ = env.step(action2) irl_reward = get_reward(vdb, state, action) # ###### for video recording # if iter > 11500 : # time.sleep(0.015) # ##### if done: mask = 0 else: mask = 1 memory.append([state, action, irl_reward, mask]) # next_state = running_state(next_state) state = next_state score += reward if done: break ########################## env.draw_graph() env.render() ########################## episodes += 1 scores.append(score) score_avg = np.mean(scores) print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg)) writer.add_scalar('log/score', float(score_avg), iter) actor.train(), critic.train(), vdb.train() if train_discrim_flag: expert_acc, learner_acc = train_vdb(vdb, memory, vdb_optim, demonstrations, 0, args) print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100)) if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen: train_discrim_flag = False train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args) if iter % 100: score_avg = int(score_avg) model_path = os.path.join(os.getcwd(),'save_model') if not
os.path.isdir(model_path): os.makedirs(model_path) ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar') save_checkpoint({ 'actor': actor.state_dict(), 'critic': critic.state_dict(), 'vdb': vdb.state_dict(), 'z_filter_n':running_state.rs.n, 'z_filter_m': running_state.rs.mean, 'z_filter_s': running_state.rs.sum_square, 'args': args, 'score': score_avg }, filename=ckpt_path) #### score_avg = int(score_avg) model_path = os.path.join(os.getcwd(), 'save_model') if not os.path.isdir(model_path): os.makedirs(model_path) ckpt_path = os.path.join(model_path, 'ckpt_' + 'last_model' + '.pth.tar') save_checkpoint({ 'actor': actor.state_dict(), 'critic': critic.state_dict(), 'vdb': vdb.state_dict(), 'z_filter_n': running_state.rs.n, 'z_filter_m': running_state.rs.mean, 'z_filter_s': running_state.rs.sum_square, 'args': args, 'score': score_avg }, filename=ckpt_path)
class Agent(object): """ Interacts with and learns from the environment. """ def __init__(self, state_size, action_size, num_agents, seed=0, buffer_size=int(1e6), actor_lr=1e-4, actor_hidden_sizes=(128, 256), actor_weight_decay=0, critic_lr=1e-4, critic_hidden_sizes=(128, 256, 128), critic_weight_decay=0, batch_size=128, gamma=0.99, tau=1e-3): """ Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents to train seed (int): random seed, default value is 0 buffer_size (int): buffer size of experience memory, default value is 100000 actor_lr (float): learning rate of actor model, default value is 1e-4 actor_lr (float): learning rate of actor model, default value is 1e-4 actor_hidden_sizes (tuple): size of hidden layer of actor model, default value is (128, 256) critic_lr (float): learning rate of critic model, default value is 1e-4 critic_hidden_sizes (tuple): size of hidden layer of critic model, default value is (128, 256, 128) batch_size (int): mini-batch size gamma (float): discount factor tau (float): interpolation parameter """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = seed self.batch_size = batch_size # mini-batch size self.gamma = gamma # discount factor self.tau = tau # for soft update of target parameters # Actor Network self.actor_local = Actor(state_size, action_size, seed, hidden_units=actor_hidden_sizes).to(DEVICE) self.actor_target = Actor(state_size, action_size, seed, hidden_units=actor_hidden_sizes).to(DEVICE) self.actor_target.eval() self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=actor_lr, weight_decay=actor_weight_decay) # Critic Network self.critic_local = Critic(state_size, action_size, seed, hidden_units=critic_hidden_sizes).to(DEVICE) self.critic_target = Critic(state_size, action_size, seed, hidden_units=critic_hidden_sizes).to(DEVICE) self.critic_target.eval() self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=critic_lr, weight_decay=critic_weight_decay) # Noise process self.noise = OUNoise((num_agents, action_size), seed) # Replay memory self.memory = ReplyBuffer(buffer_size=buffer_size, seed=seed) # copy parameters of the local model to the target model self.soft_update(self.critic_local, self.critic_target, 1.) self.soft_update(self.actor_local, self.actor_target, 1.) self.seed = random.seed(seed) np.random.seed(seed) self.reset() def reset(self): self.noise.reset() def act(self, state, add_noise=True): # actions = np.random.randn(self.num_agents, self.action_size) # actions = np.clip(actions, -1, 1) state = torch.from_numpy(state).float().to(DEVICE) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def step(self, states, actions, rewards, next_states, dones): """ Save experience in replay memory, and use random sample from buffer to learn. 
""" # Save experience / reward for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample(batch_size=self.batch_size) self.learn(experiences, self.gamma) def learn(self, experiences, gamma, last_action_loss=None): """ Update policy and experiences parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-experiences Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ------- update critic ------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) q_targets = rewards + (gamma * q_targets_next * (1 - dones)) q_targets = q_targets.detach() # Compute critic loss q_expected = self.critic_local(states, actions) assert q_expected.shape == q_targets.shape critic_loss = F.mse_loss(q_expected, q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1.0) # clip the gradient (Udacity) self.critic_optimizer.step() # ------- update actor ------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # update target networks self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) return actor_loss.item(), critic_loss.item() def soft_update(self, local_model, target_model, tau): """ Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.detach_() target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def save(self): """ Save model state """ torch.save(self.actor_local.state_dict(), "checkpoints/checkpoint_actor.pth") torch.save(self.actor_target.state_dict(), "checkpoints/checkpoint_actor_target.pth") torch.save(self.critic_local.state_dict(), "checkpoints/checkpoint_critic.pth") torch.save(self.critic_target.state_dict(), "checkpoints/checkpoint_critic_target.pth") def load(self): """ Load model state """ self.actor_local.load_state_dict(torch.load("checkpoints/checkpoint_actor.pth", map_location=lambda storage, loc: storage)) self.actor_target.load_state_dict(torch.load("checkpoints/checkpoint_actor_target.pth", map_location=lambda storage, loc: storage)) self.critic_local.load_state_dict(torch.load("checkpoints/checkpoint_critic.pth", map_location=lambda storage, loc: storage)) self.critic_target.load_state_dict(torch.load("checkpoints/checkpoint_critic_target.pth", map_location=lambda storage, loc: storage)) def __str__(self): return f"{str(self.actor_local)}\n{str(self.critic_local)}"
class Agent: def __init__(self, env_name, n_iter, n_states, action_bounds, n_actions, lr): self.env_name = env_name self.n_iter = n_iter self.action_bounds = action_bounds self.n_actions = n_actions self.n_states = n_states self.device = torch.device("cpu") self.lr = lr self.current_policy = Actor(n_states=self.n_states, n_actions=self.n_actions).to(self.device) self.critic = Critic(n_states=self.n_states).to(self.device) self.actor_optimizer = Adam(self.current_policy.parameters(), lr=self.lr, eps=1e-5) self.critic_optimizer = Adam(self.critic.parameters(), lr=self.lr, eps=1e-5) self.critic_loss = torch.nn.MSELoss() self.scheduler = lambda step: max(1.0 - float(step / self.n_iter), 0) self.actor_scheduler = LambdaLR(self.actor_optimizer, lr_lambda=self.scheduler) self.critic_scheduler = LambdaLR(self.actor_optimizer, lr_lambda=self.scheduler) def choose_dist(self, state): state = np.expand_dims(state, 0) state = from_numpy(state).float().to(self.device) with torch.no_grad(): dist = self.current_policy(state) # action *= self.action_bounds[1] # action = np.clip(action, self.action_bounds[0], self.action_bounds[1]) return dist def get_value(self, state): state = np.expand_dims(state, 0) state = from_numpy(state).float().to(self.device) with torch.no_grad(): value = self.critic(state) return value.detach().cpu().numpy() def optimize(self, actor_loss, critic_loss): self.actor_optimizer.zero_grad() actor_loss.backward() # torch.nn.utils.clip_grad_norm_(self.current_policy.parameters(), 0.5) # torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5) self.actor_optimizer.step() self.critic_optimizer.zero_grad() critic_loss.backward() # torch.nn.utils.clip_grad_norm_(self.current_policy.parameters(), 0.5) # torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5) self.critic_optimizer.step() def schedule_lr(self): # self.total_scheduler.step() self.actor_scheduler.step() self.critic_scheduler.step() def save_weights(self, iteration, state_rms): torch.save( { "current_policy_state_dict": self.current_policy.state_dict(), "critic_state_dict": self.critic.state_dict(), "actor_optimizer_state_dict": self.actor_optimizer.state_dict(), "critic_optimizer_state_dict": self.critic_optimizer.state_dict(), "actor_scheduler_state_dict": self.actor_scheduler.state_dict(), "critic_scheduler_state_dict": self.critic_scheduler.state_dict(), "iteration": iteration, "state_rms_mean": state_rms.mean, "state_rms_var": state_rms.var, "state_rms_count": state_rms.count }, self.env_name + "_weights.pth") def load_weights(self): checkpoint = torch.load(self.env_name + "_weights.pth") self.current_policy.load_state_dict( checkpoint["current_policy_state_dict"]) self.critic.load_state_dict(checkpoint["critic_state_dict"]) self.actor_optimizer.load_state_dict( checkpoint["actor_optimizer_state_dict"]) self.critic_optimizer.load_state_dict( checkpoint["critic_optimizer_state_dict"]) self.actor_scheduler.load_state_dict( checkpoint["actor_scheduler_state_dict"]) self.critic_scheduler.load_state_dict( checkpoint["critic_scheduler_state_dict"]) iteration = checkpoint["iteration"] state_rms_mean = checkpoint["state_rms_mean"] state_rms_var = checkpoint["state_rms_var"] return iteration, state_rms_mean, state_rms_var def set_to_eval_mode(self): self.current_policy.eval() self.critic.eval() def set_to_train_mode(self): self.current_policy.train() self.critic.train()
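# choose_dist() above returns the distribution produced by the current policy; the surrounding
# PPO code is expected to sample from it and keep the log-probability for the surrogate ratio.
# A small illustration with a stand-in diagonal Normal in place of agent.choose_dist(state):
import torch
from torch.distributions import Normal

dist = Normal(torch.zeros(1, 4), torch.ones(1, 4))  # stand-in for the actor's output
action = dist.sample()
log_prob = dist.log_prob(action).sum(-1)  # summed over action dimensions
print(action.shape, log_prob.shape)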
critic.load_state_dict(ckpt['critic']) running_state.rs.n = ckpt['z_filter_n'] running_state.rs.mean = ckpt['z_filter_m'] running_state.rs.sum_square = ckpt['z_filter_s'] print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n)) actor_optim = optim.Adam(actor.parameters(), lr=hp.actor_lr) critic_optim = optim.Adam(critic.parameters(), lr=hp.critic_lr, weight_decay=hp.l2_rate) episodes = 0 for iter in range(15000): actor.eval(), critic.eval() memory = deque() steps = 0 scores = [] while steps < 2048: episodes += 1 state = env.reset() state = running_state(state) score = 0 for _ in range(10000): if args.render: env.render() steps += 1 mu, std, _ = actor(torch.Tensor(state).unsqueeze(0))
def training(opt): # ~~~~~~~~~~~~~~~~~~~ hyper parameters ~~~~~~~~~~~~~~~~~~~ # EPOCHS = opt.epochs CHANNELS = 1 H, W = 64, 64 work_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') FEATURE_D = 128 Z_DIM = 100 BATCH_SIZE = opt.batch_size # ~~~~~~~~~~~~~~~~~~~ as per WGAN paper ~~~~~~~~~~~~~~~~~~~ # lr = opt.lr CRITIC_TRAIN_STEPS = 5 WEIGHT_CLIP = 0.01 print(f"Epochs: {EPOCHS} | lr: {lr} | batch size: {BATCH_SIZE} |" + f" device: {work_device}") # ~~~~~~~~~~~ creating directories for weights ~~~~~~~~~~~ # if opt.logs: log_dir = Path(f'{opt.logs}').resolve() if log_dir.exists(): shutil.rmtree(str(log_dir)) if opt.weights: Weight_dir = Path(f'{opt.weights}').resolve() if not Weight_dir.exists(): Weight_dir.mkdir() # ~~~~~~~~~~~~~~~~~~~ loading the dataset ~~~~~~~~~~~~~~~~~~~ # trans = transforms.Compose([ transforms.Resize((H, W)), transforms.ToTensor(), transforms.Normalize((0.5, ), (0.5, )) ]) MNIST_data = MNIST(str(opt.data_dir), True, transform=trans, download=True) loader = DataLoader( MNIST_data, BATCH_SIZE, True, num_workers=2, pin_memory=True, ) # ~~~~~~~~~~~~~~~~~~~ creating tensorboard variables ~~~~~~~~~~~~~~~~~~~ # writer_fake = SummaryWriter(f"{str(log_dir)}/fake") writer_real = SummaryWriter(f"{str(log_dir)}/real") loss_writer = SummaryWriter(f"{str(log_dir)}/loss") # ~~~~~~~~~~~~~~~~~~~ loading the model ~~~~~~~~~~~~~~~~~~~ # critic = Critic(img_channels=CHANNELS, feature_d=FEATURE_D).to(work_device) gen = Faker(Z_DIM, CHANNELS, FEATURE_D).to(work_device) if opt.resume: if Path(Weight_dir / 'critic.pth').exists(): critic.load_state_dict( torch.load(str(Weight_dir / 'critic.pth'), map_location=work_device)) if Path(Weight_dir / 'generator.pth').exists(): gen.load_state_dict( torch.load(str(Weight_dir / 'generator.pth'), map_location=work_device)) # ~~~~~~~~~~~~~~~~~~~ create optimizers ~~~~~~~~~~~~~~~~~~~ # critic_optim = optim.RMSprop(critic.parameters(), lr) gen_optim = optim.RMSprop(gen.parameters(), lr) # ~~~~~~~~~~~~~~~~~~~ training loop ~~~~~~~~~~~~~~~~~~~ # # loss variables C_loss_prev = math.inf G_loss_prev = math.inf C_loss = 0 G_loss = 0 C_loss_avg = 0 G_loss_avg = 0 print_gpu_details() # setting the models to train mode critic.train() gen.train() for epoch in range(EPOCHS): # reset the average loss to zero C_loss_avg = 0 G_loss_avg = 0 print_memory_utilization() for batch_idx, (real, _) in enumerate(tqdm(loader)): real = real.to(work_device) fixed_noise = torch.rand(real.shape[0], Z_DIM, 1, 1).to(work_device) # ~~~~~~~~~~~~~~~~~~~ critic loop ~~~~~~~~~~~~~~~~~~~ # fake = gen(fixed_noise) # dim of (N,1,W,H); kept attached to the graph so the generator step below receives gradients (the critic steps use fake.detach()) for _ in range(CRITIC_TRAIN_STEPS): critic.zero_grad() # ~~~~~~~~~~~ weight clipping as per WGAN paper ~~~~~~~~~~ # for p in critic.parameters(): p.data.clamp_(-WEIGHT_CLIP, WEIGHT_CLIP) # ~~~~~~~~~~~~~~~~~~~ forward ~~~~~~~~~~~~~~~~~~~ # # make it one dimensional array real_predict = critic(real).view(-1) # make it one dimensional array fake_predict = critic(fake.detach()).view(-1) # ~~~~~~~~~~~~~~~~~~~ loss ~~~~~~~~~~~~~~~~~~~ # # critic maximises E[critic(real)] - E[critic(fake)], so minimise the negative C_loss = -(torch.mean(real_predict) - torch.mean(fake_predict)) C_loss_avg += C_loss.item() # ~~~~~~~~~~~~~~~~~~~ backward ~~~~~~~~~~~~~~~~~~~ # C_loss.backward() critic_optim.step() # ~~~~~~~~~~~~~~~~~~~ generator loop ~~~~~~~~~~~~~~~~~~~ # gen.zero_grad() # ~~~~~~~~~~~~~~~~~~~ forward ~~~~~~~~~~~~~~~~~~~ # # make it one dimensional array fake_predict = critic(fake).view(-1) # ~~~~~~~~~~~~~~~~~~~ loss ~~~~~~~~~~~~~~~~~~~ # G_loss = -(torch.mean(fake_predict)) G_loss_avg += G_loss.item() # ~~~~~~~~~~~~~~~~~~~ backward ~~~~~~~~~~~~~~~~~~~ # G_loss.backward() gen_optim.step() # ~~~~~~~~~~~~~~~~~~~ loading the tensorboard ~~~~~~~~~~~~~~~~~~~ # # will execute at every 50 steps if (batch_idx + 1) % 50 == 0: # ~~~~~~~~~~~~ calculate average loss ~~~~~~~~~~~~~ # C_loss_avg_ = C_loss_avg / (CRITIC_TRAIN_STEPS * (batch_idx + 1)) G_loss_avg_ = G_loss_avg / (batch_idx + 1) print(f"Epoch [{epoch}/{EPOCHS}] | batch {batch_idx + 1} | " + f"Loss C: {C_loss_avg_:.4f}, loss G: {G_loss_avg_:.4f}") # ~~~~~~~~~~~~ send data to tensorboard ~~~~~~~~~~~~~ # with torch.no_grad(): critic.eval() gen.eval() if BATCH_SIZE > 32: fake = gen(fixed_noise[:32]).reshape( -1, CHANNELS, H, W) data = real[:32].reshape(-1, CHANNELS, H, W) else: fake = gen(fixed_noise).reshape(-1, CHANNELS, H, W) data = real.reshape(-1, CHANNELS, H, W) img_grid_fake = torchvision.utils.make_grid(fake, normalize=True) img_grid_real = torchvision.utils.make_grid(data, normalize=True) step = epoch * len(loader) + batch_idx + 1 # monotonically increasing global step writer_fake.add_image("Mnist Fake Images", img_grid_fake, global_step=step) writer_real.add_image("Mnist Real Images", img_grid_real, global_step=step) loss_writer.add_scalar('Critic', C_loss, global_step=step) loss_writer.add_scalar('generator', G_loss, global_step=step) # changing back the model to train mode critic.train() gen.train() # ~~~~~~~~~~~~~~~~~~~ saving the weights ~~~~~~~~~~~~~~~~~~~ # if opt.weights: if C_loss_prev > C_loss_avg: C_loss_prev = C_loss_avg weight_path = str(Weight_dir / 'critic.pth') torch.save(critic.state_dict(), weight_path) if G_loss_prev > G_loss_avg: G_loss_prev = G_loss_avg weight_path = str(Weight_dir / 'generator.pth') torch.save(gen.state_dict(), weight_path)
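For context, a minimal command-line wrapper for the training(opt) routine above might look like the sketch below; the flag names mirror the opt attributes the function reads (epochs, batch_size, lr, logs, weights, data_dir, resume), but the parser and its defaults are illustrative assumptions rather than part of this project.

import argparse

# Hypothetical CLI entry point for training(opt); flag names follow the attributes
# accessed above, default values are illustrative only.
def parse_opt():
    parser = argparse.ArgumentParser(description='WGAN on MNIST (sketch)')
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--lr', type=float, default=5e-5)         # RMSprop lr from the WGAN paper
    parser.add_argument('--logs', type=str, default='logs')       # TensorBoard run directory
    parser.add_argument('--weights', type=str, default='weights') # checkpoint directory
    parser.add_argument('--data_dir', type=str, default='data')
    parser.add_argument('--resume', action='store_true')
    return parser.parse_args()


if __name__ == '__main__':
    training(parse_opt())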
class Agent(): def __init__(self, test=False): # device if torch.cuda.is_available(): self.device = torch.device('cuda') else: self.device = torch.device('cpu') ######################################### """ Some hand tune config(for developing) """ self.discrete = False self.action_dim = 1 self.state_dim = 3 self.batch_size = 100 self.action_low = -2 self.action_high = 2 ########################################## self.P_online = Actor(state_dim=self.state_dim, action_size=self.action_dim).to(self.device) self.P_target = Actor(state_dim=self.state_dim, action_size=self.action_dim).to(self.device) self.P_target.load_state_dict(self.P_online.state_dict()) self.Q_online = Critic(state_size=self.state_dim, action_size=self.action_dim).to(self.device) self.Q_target = Critic(state_size=self.state_dim, action_size=self.action_dim).to(self.device) self.Q_target.load_state_dict(self.Q_online.state_dict()) # discounted reward self.gamma = 0.99 self.eps = 0.25 # optimizer self.q_optimizer = torch.optim.Adam(self.Q_online.parameters(), lr=1e-3) self.p_optimizer = torch.optim.Adam(self.P_online.parameters(), lr=1e-3) # saved rewards and actions self.replay_buffer = ReplayBuffer() # noise self.noise = Noise(DELTA, SIGMA, OU_A, OU_MU) # Initialize noise self.ou_level = 0. self.ep_step = 0 def act(self, state, test=False): if not test: with torch.no_grad(): # boring type casting state = ((torch.from_numpy(state)).unsqueeze(0)).float().to( self.device) action = self.P_online(state) # continuous output a = action.data.cpu().numpy() # if self.ep_step < 200: # self.ou_level = self.noise.ornstein_uhlenbeck_level(self.ou_level) # a = a + self.ou_level if self.discrete: action = np.argmax(a) return a, action else: if self.ep_step < 200: self.ou_level = self.noise.ornstein_uhlenbeck_level( self.ou_level) action = np.clip(a + self.ou_level, self.action_low, self.action_high) return action, action def collect_data(self, state, action, reward, next_state, done): self.replay_buffer.push( torch.from_numpy(state).float().unsqueeze(0), torch.from_numpy(action).float(), torch.tensor([reward]).float().unsqueeze(0), torch.from_numpy(next_state).float().unsqueeze(0), torch.tensor([done]).float().unsqueeze(0)) def clear_data(self): raise NotImplementedError("Circular Queue don't need this function") def update(self): if len(self.replay_buffer) < self.batch_size: return states, actions, rewards, next_states, dones = self.replay_buffer.sample( batch_size=self.batch_size, device=self.device) # discounted rewards # rewards = torch.from_numpy(discount((rewards.view(rewards.shape[0])).cpu().numpy())).float().to(self.device) ### debug shape : ok #===============================Critic Update=============================== self.Q_online.train() Q = self.Q_online((states, actions)) with torch.no_grad(): # don't need backprop for target value self.Q_target.eval() self.P_target.eval() target = rewards + self.gamma * (1 - dones) * self.Q_target( (next_states, self.P_target(next_states))) critic_loss_fn = torch.nn.MSELoss() critic_loss = critic_loss_fn(Q, target).mean() # update self.q_optimizer.zero_grad() critic_loss.backward() self.q_optimizer.step() # print("critic loss", critic_loss.item()) #===============================Actor Update=============================== # fix online_critic , update online_actor self.Q_online.eval() for p in self.Q_online.parameters(): p.requires_grad = False for p in self.P_online.parameters(): p.requires_grad = True policy_loss = -self.Q_online((states, self.P_online(states))) policy_loss = 
policy_loss.mean() self.p_optimizer.zero_grad() policy_loss.backward() self.p_optimizer.step() # print("policy loss", policy_loss.item()) for p in self.Q_online.parameters(): p.requires_grad = True #===============================Target Update=============================== soft_update(self.Q_target, self.Q_online, tau=1e-3) soft_update(self.P_target, self.P_online, tau=1e-3) self.eps -= EPSILON_DECAY if self.eps <= 0: self.eps = 0
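The agent above calls a module-level soft_update(target, source, tau) that is not shown in this excerpt; a plausible Polyak-averaging implementation matching that call order is sketched below (an assumption, not the project's own helper).

import torch

# Polyak averaging: target <- tau * source + (1 - tau) * target.
# Argument order matches the calls above, e.g. soft_update(Q_target, Q_online, tau=1e-3).
def soft_update(target_net, source_net, tau=1e-3):
    with torch.no_grad():
        for t_param, s_param in zip(target_net.parameters(), source_net.parameters()):
            t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)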
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, num_agents=20): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ print("Running on: " + str(device)) self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(seed) self.eps = EPS_START self.eps_decay = 0.0005 # Actor network self.actor_local = Actor(state_size, action_size, seed).to(device) self.actor_target = Actor(state_size, action_size, seed).to(device) self.actor_optim = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic network self.critic_local = Critic(state_size, action_size, seed).to(device) self.critic_target = Critic(state_size, action_size, seed).to(device) self.critic_optim = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 self.noise = OUNoise((num_agents, action_size), seed) def step(self, state, action, reward, next_state, done, agent_id): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) self.t_step += 1 # Learn every UPDATE_EVERY time steps. if (self.t_step % UPDATE_EVERY) == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: for _ in range(LEARN_NUM): experiences = self.memory.sample() self.learn(experiences, GAMMA, agent_id) def act(self, states, add_noise=True): """Returns actions for given state as per current policy.""" states = torch.from_numpy(states).float().to(device) actions = np.zeros((self.num_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): for i, state in enumerate(states): actions[i, :] = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: actions += self.eps * self.noise.sample() return np.clip(actions, -1, 1) def learn(self, experiences, gamma, agent_id): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ------------------- update critic network ------------------- # target_actions = self.actor_target.forward(next_states) # Construct next actions vector relative to the agent if agent_id == 0: target_actions = torch.cat((target_actions, actions[:, 2:]), dim=1) else: target_actions = torch.cat((actions[:, :2], target_actions), dim=1) next_critic_value = self.critic_target.forward(next_states, target_actions) critic_value = self.critic_local.forward(states, actions) # Q targets for current state # If the episode is over, the reward from the future state will not be incorporated Q_targets = rewards + (gamma * next_critic_value * (1 - dones)) critic_loss = F.mse_loss(critic_value, Q_targets) # Minimizing loss self.critic_local.train() self.critic_optim.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optim.step() self.critic_local.eval() # ------------------- update actor network ------------------- # self.actor_local.train() self.actor_optim.zero_grad() mu = self.actor_local.forward(states) # Construct mu vector relative to each agent if agent_id == 0: mu = torch.cat((mu, actions[:, 2:]), dim=1) else: mu = torch.cat((actions[:, :2], mu), dim=1) actor_loss = -self.critic_local(states, mu).mean() actor_loss.backward() self.actor_optim.step() self.actor_local.eval() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) # update noise decay parameter self.eps -= self.eps_decay self.eps = max(self.eps, EPS_FINAL) self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def reset(self): self.noise.reset()
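OUNoise is constructed here as OUNoise((num_agents, action_size), seed) but is not defined in this excerpt; a typical Ornstein-Uhlenbeck process used for DDPG exploration is sketched below, with mu, theta and sigma defaults that are assumptions.

import copy
import random

import numpy as np

# Ornstein-Uhlenbeck process; `size` may be an int or a tuple such as (num_agents, action_size).
class OUNoise:
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state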
generator_net = Generator(1290, 128, 512).to(device) value_net1 = Critic(1290, 128, 256, init_w=8e-1).to(device) value_net2 = Critic(1290, 128, 256, init_w=8e-1).to(device) perturbator_net = Perturbator(1290, 128, 256, init_w=27e-2).to(device) target_value_net1 = Critic(1290, 128, 256).to(device) target_value_net2 = Critic(1290, 128, 256).to(device) target_perturbator_net = Perturbator(1290, 128, 256).to(device) ad = AnomalyDetector().to(device) ad.load_state_dict(torch.load('trained/anomaly.pt')) ad.eval() target_perturbator_net.eval() target_value_net1.eval() target_value_net2.eval() soft_update(value_net1, target_value_net1, soft_tau=1.0) soft_update(value_net2, target_value_net2, soft_tau=1.0) soft_update(perturbator_net, target_perturbator_net, soft_tau=1.0) # optimizer.Ranger below can be swapped for optim.Adam or RAdam value_optimizer1 = optimizer.Ranger(value_net1.parameters(), lr=params['value_lr'], k=10) value_optimizer2 = optimizer.Ranger(value_net2.parameters(), lr=params['value_lr'], k=10) perturbator_optimizer = optimizer.Ranger(perturbator_net.parameters(), lr=params['perturbator_lr'], weight_decay=1e-3, k=10) generator_optimizer = optimizer.Ranger(generator_net.parameters(), lr=params['generator_lr'], k=10) loss = { 'train': {'value': [], 'perturbator': [], 'generator': [], 'step': []}, 'test': {'value': [], 'perturbator': [], 'generator': [], 'step': []},
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) print(1) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done, time_step, agent_list): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward if time_step % 2: for idx in range(state.shape[0]): self.memory.add(state[idx], action[idx], reward[idx], next_state[idx], done[idx]) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: if time_step % 2 == 0: for agent in agent_list: experiences = self.memory.sample() agent.learn(experiences, GAMMA) #import ipdb; ipdb.set_trace() #experiences = self.memory.sample() #self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" actions = [] for idx in range(state.shape[0]): state = state.float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local( state[idx, ...].unsqueeze(0)).cpu().data.numpy() if add_noise: action += self.noise.sample() actions.append(np.clip(action, -1, 1)) return np.asarray(actions) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models self.critic_local.train() with torch.no_grad(): actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) #rewards = rewards - rewards.mean() #rewards = rewards / rewards.std() Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() self.critic_local.eval() # ---------------------------- update actor ---------------------------- # # Compute actor loss self.actor_local.train() actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # include q value normalization for actor to learn faster #actor_actions = -self.critic_local(states, actions_pred) #actor_no_mean = actor_actions - actor_actions.mean() #actor_loss = (actor_no_mean/actor_no_mean.std()).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1) self.actor_optimizer.step() self.actor_local.eval() # ----------------------- update target networks ----------------------- # with torch.no_grad(): self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
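Both agents above rely on a ReplayBuffer(action_size, buffer_size, batch_size, seed) exposing add(), sample() and __len__(); a uniform-sampling sketch compatible with those calls follows. It assumes a module-level `device`, as elsewhere in this code.

import random
from collections import deque, namedtuple

import numpy as np
import torch

# Uniform experience replay; sample() returns batched float tensors on `device`.
class ReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in batch])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in batch])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in batch])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in batch])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in batch]).astype(np.uint8)).float().to(device)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)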
def main(): expert_demo = pickle.load(open('./Ree1_expert.p', "rb")) # Ree1 : action 1 # Ree2 : action 100 # Ree3 : action 50 # Ree4 : action 10 # Ree5 : action 4 # Ree6 : action 0.5 # print('expert_demo_shape : ', np.array(expert_demo).shape) expert_x = int(expert_demo[1][0]) expert_y = int(expert_demo[1][1]) env = Env(expert_x, expert_y) # env = Env(0,0) # env.seed(args.seed) torch.manual_seed(args.seed) num_inputs = 2 num_actions = 8 running_state = ZFilter((num_inputs, ), clip=5) print('state size:', num_inputs) print('action size:', num_actions) actor = Actor(num_inputs, num_actions, args) critic = Critic(num_inputs, args) discrim = Discriminator(num_inputs + num_actions, args) actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate) critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, weight_decay=args.l2_rate) discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate) # load demonstrations # expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb")) demonstrations = np.array(expert_demo[0]) # print("demonstrations.shape", demonstrations.shape) writer = SummaryWriter(args.logdir) if args.load_model is not None: saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) ckpt = torch.load(saved_ckpt_path) actor.load_state_dict(ckpt['actor']) critic.load_state_dict(ckpt['critic']) discrim.load_state_dict(ckpt['discrim']) running_state.rs.n = ckpt['z_filter_n'] running_state.rs.mean = ckpt['z_filter_m'] running_state.rs.sum_square = ckpt['z_filter_s'] print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n)) episodes = 0 train_discrim_flag = True for iter in range(args.max_iter_num): actor.eval(), critic.eval() memory = deque() steps = 0 scores = [] while steps < args.total_sample_size: state = env.reset() score = 0 state = running_state(state) for _ in range(1000): if args.render: env.render() steps += 1 mu, std = actor(torch.Tensor(state).unsqueeze(0)) action2 = np.argmax(get_action(mu, std)[0]) action = get_action(mu, std)[0] next_state, reward, done, _ = env.step(action2) # next_state, reward, done, _ = env.step(action) irl_reward = get_reward(discrim, state, action) if done: mask = 0 else: mask = 1 memory.append([state, action, irl_reward, mask]) next_state = running_state(next_state) state = next_state score += reward if done: break episodes += 1 scores.append(score) score_avg = np.mean(scores) print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg)) writer.add_scalar('log/score', float(score_avg), iter) actor.train(), critic.train(), discrim.train() if train_discrim_flag: expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim, demonstrations, args) print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100)) temp_learner.append(learner_acc * 100) temp_expert.append(expert_acc * 100) if ((expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen and iter % 55 == 0) or iter % 50 == 0): # train_discrim_flag = False plt.plot(temp_learner, label='learner') plt.plot(temp_expert, label='expert') plt.xlabel('Episode') plt.ylabel('Accuracy') plt.xticks([]) plt.legend() plt.savefig('accuracy{}.png'.format(iter)) # plt.show() model_path = 'C:/Users/USER/9 GAIL/lets-do-irl/mujoco/gail' ckpt_path = os.path.join(model_path, 'ckpt_' + str(score_avg) + '.pth.tar') print("check path", ckpt_path) save_checkpoint( { 'actor': actor.state_dict(), 'critic': critic.state_dict(), 'discrim': discrim.state_dict(), 'z_filter_n': running_state.rs.n, 
'z_filter_m': running_state.rs.mean, 'z_filter_s': running_state.rs.sum_square, 'args': args, 'score': score_avg }, filename=ckpt_path) train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args) if iter % 100 == 0: score_avg = int(score_avg) model_path = os.path.join(os.getcwd(), 'save_model') if not os.path.isdir(model_path): os.makedirs(model_path) model_path = 'C:/Users/USER/9 GAIL/lets-do-irl/mujoco/gail' ckpt_path = os.path.join(model_path, 'ckpt_' + str(score_avg) + '.pth.tar') save_checkpoint( { 'actor': actor.state_dict(), 'critic': critic.state_dict(), 'discrim': discrim.state_dict(), 'z_filter_n': running_state.rs.n, 'z_filter_m': running_state.rs.mean, 'z_filter_s': running_state.rs.sum_square, 'args': args, 'score': score_avg }, filename=ckpt_path) plt.plot(temp_learner) plt.plot(temp_expert) plt.xlabel('Episode') plt.ylabel('Accuracy') plt.xticks([]) plt.savefig('accuracy.png')
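save_checkpoint is not defined in this excerpt; a minimal wrapper consistent with the keyword call save_checkpoint({...}, filename=ckpt_path) used above would be:

import torch

# Minimal checkpoint helper; simply serialises the state dict with torch.save.
def save_checkpoint(state, filename='checkpoint.pth.tar'):
    torch.save(state, filename)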
class DDPG(object): def __init__(self, nb_states, nb_actions, args): self.nb_states = nb_states self.nb_actions = nb_actions self.discrete = args.discrete net_config = { 'hidden1' : args.hidden1, 'hidden2' : args.hidden2 } # Actor and Critic initialization self.actor = Actor(self.nb_states, self.nb_actions, **net_config) self.actor_target = Actor(self.nb_states, self.nb_actions, **net_config) self.actor_optim = Adam(self.actor.parameters(), lr=args.actor_lr) self.critic = Critic(self.nb_states, self.nb_actions, **net_config) self.critic_target = Critic(self.nb_states, self.nb_actions, **net_config) self.critic_optim = Adam(self.critic.parameters(), lr=args.critic_lr) hard_update(self.critic_target, self.critic) hard_update(self.actor_target, self.actor) # Replay Buffer and noise self.memory = ReplayBuffer(args.memory_size) self.noise = OrnsteinUhlenbeckProcess(mu=np.zeros(nb_actions), sigma=float(0.2) * np.ones(nb_actions)) self.last_state = None self.last_action = None # Hyper parameters self.batch_size = args.batch_size self.tau = args.tau self.discount = args.discount # CUDA self.use_cuda = args.cuda if self.use_cuda: self.cuda() def cuda(self): self.actor.to(device) self.actor_target.to(device) self.critic.to(device) self.critic_target.to(device) def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def train(self): self.actor.train() self.actor_target.train() self.critic.train() self.critic_target.train() def reset(self, obs): self.last_state = obs self.noise.reset() def observe(self, reward, state, done): self.memory.append([self.last_state, self.last_action, reward, state, done]) self.last_state = state def random_action(self): action = np.random.uniform(-1., 1., self.nb_actions) self.last_action = action return action.argmax() if self.discrete else action def select_action(self, state, apply_noise=False): self.eval() action = to_numpy(self.actor(to_tensor(np.array([state]), device=device))).squeeze(0) self.train() if apply_noise: action = action + self.noise.sample() action = np.clip(action, -1., 1.) 
self.last_action = action #print('action:', action, 'output:', action.argmax()) return action.argmax() if self.discrete else action def update_policy(self): state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size) state = to_tensor(np.array(state_batch), device=device) action = to_tensor(np.array(action_batch), device=device) next_state = to_tensor(np.array(next_state_batch), device=device) # compute target Q value next_q_value = self.critic_target([next_state, self.actor_target(next_state)]) target_q_value = to_tensor(reward_batch, device=device) \ + self.discount * to_tensor((1 - terminal_batch.astype(np.float32)), device=device) * next_q_value # Critic and Actor update self.critic.zero_grad() with torch.set_grad_enabled(True): q_values = self.critic([state, action]) critic_loss = criterion(q_values, target_q_value.detach()) critic_loss.backward() self.critic_optim.step() self.actor.zero_grad() with torch.set_grad_enabled(True): policy_loss = -self.critic([state.detach(), self.actor(state)]).mean() policy_loss.backward() self.actor_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) return to_numpy(-policy_loss), to_numpy(critic_loss), to_numpy(q_values.mean()) def save_model(self, output, num=1): if self.use_cuda: self.actor.to(torch.device("cpu")) self.critic.to(torch.device("cpu")) torch.save(self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num)) torch.save(self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num)) if self.use_cuda: self.actor.to(device) self.critic.to(device) def load_model(self, output, num=1): self.actor.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num))) self.actor_target.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num))) self.critic.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num))) self.critic_target.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num))) if self.use_cuda: self.cuda()
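This DDPG class leans on a few module-level helpers (criterion, to_tensor, to_numpy, hard_update) that are not shown; plausible minimal versions are sketched below, assuming a global `device`. soft_update would follow the same Polyak-averaging pattern sketched after the earlier Agent class.

import torch
import torch.nn as nn

# Plausible minimal utilities for the DDPG class above; the real project may define them differently.
criterion = nn.MSELoss()

def to_tensor(ndarray, device=torch.device('cpu'), dtype=torch.float32):
    return torch.as_tensor(ndarray, dtype=dtype, device=device)

def to_numpy(tensor):
    return tensor.detach().cpu().numpy()

def hard_update(target_net, source_net):
    # Copy weights exactly so target and online networks start identical.
    target_net.load_state_dict(source_net.state_dict())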
class DDPG(object): def __init__(self, nb_status, nb_actions, args, writer): self.clip_actor_grad = args.clip_actor_grad self.nb_status = nb_status * args.window_length self.nb_actions = nb_actions self.discrete = args.discrete self.pic = args.pic self.writer = writer self.select_time = 0 if self.pic: self.nb_status = args.pic_status # Create Actor and Critic Network net_cfg = { 'hidden1': args.hidden1, 'hidden2': args.hidden2, 'use_bn': args.bn, 'init_method': args.init_method } if args.pic: self.cnn = CNN(1, args.pic_status) self.cnn_target = CNN(1, args.pic_status) self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate) self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_optim = Adam(self.actor.parameters(), lr=args.prate) self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) if args.pic: hard_update(self.cnn_target, self.cnn) #Create replay buffer self.memory = rpm( args.rmsize ) # SequentialMemory(limit=args.rmsize, window_length=args.window_length) self.random_process = Myrandom(size=nb_actions) # Hyper-parameters self.batch_size = args.batch_size self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.use_cuda = args.cuda # if self.use_cuda: self.cuda() def normalize(self, pic): pic = pic.swapaxes(0, 2).swapaxes(1, 2) return pic def update_policy(self): # Sample batch state_batch, action_batch, reward_batch, \ next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size) # Prepare for the target q batch if self.pic: state_batch = np.array([self.normalize(x) for x in state_batch]) state_batch = to_tensor(state_batch, volatile=True) state_batch = self.cnn(state_batch) next_state_batch = np.array( [self.normalize(x) for x in next_state_batch]) next_state_batch = to_tensor(next_state_batch, volatile=True) next_state_batch = self.cnn_target(next_state_batch) next_q_values = self.critic_target( [next_state_batch, self.actor_target(next_state_batch)]) else: next_q_values = self.critic_target([ to_tensor(next_state_batch, volatile=True), self.actor_target(to_tensor(next_state_batch, volatile=True)), ]) # print('batch of picture is ok') next_q_values.volatile = False target_q_batch = to_tensor(reward_batch) + \ self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values # Critic update self.critic.zero_grad() if self.pic: self.cnn.zero_grad() if self.pic: state_batch.volatile = False q_batch = self.critic([state_batch, to_tensor(action_batch)]) else: q_batch = self.critic( [to_tensor(state_batch), to_tensor(action_batch)]) # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float)) value_loss = criterion(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() if self.pic: self.cnn_optim.step() self.actor.zero_grad() if self.pic: self.cnn.zero_grad() if self.pic: state_batch.volatile = False policy_loss = -self.critic([state_batch, self.actor(state_batch)]) else: policy_loss = -self.critic( [to_tensor(state_batch), self.actor(to_tensor(state_batch))]) policy_loss = policy_loss.mean() policy_loss.backward() if 
self.clip_actor_grad is not None: torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad)) if self.writer != None: mean_policy_grad = np.array( np.mean([ np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters() ])) #print(mean_policy_grad) self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time) self.actor_optim.step() if self.pic: self.cnn_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) if self.pic: soft_update(self.cnn_target, self.cnn, self.tau) return -policy_loss, value_loss def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() if (self.pic): self.cnn.eval() self.cnn_target.eval() def train(self): self.actor.train() self.actor_target.train() self.critic.train() self.critic_target.train() if (self.pic): self.cnn.train() self.cnn_target.train() def cuda(self): self.cnn.cuda() self.cnn_target.cuda() self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def observe(self, r_t, s_t1, done): self.memory.append([self.s_t, self.a_t, r_t, s_t1, done]) self.s_t = s_t1 def random_action(self, fix=False): action = np.random.uniform(-1., 1., self.nb_actions) self.a_t = action if self.discrete and fix == False: action = action.argmax() if self.pic: action = np.concatenate( (softmax(action[:16]), softmax(action[16:]))) return action def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0): self.eval() if self.pic: s_t = self.normalize(s_t) s_t = self.cnn(to_tensor(np.array([s_t]))) if self.pic: action = to_numpy(self.actor_target(s_t)).squeeze(0) else: action = to_numpy(self.actor(to_tensor(np.array([s_t ])))).squeeze(0) self.train() noise_level = noise_level * max(self.epsilon, 0) if np.random.uniform(0, 1) < noise_level: action = (action + self.random_action(fix=True)) / 2. # episilon greedy if decay_epsilon: self.epsilon -= self.depsilon self.a_t = action if return_fix: return action if self.discrete: return action.argmax() else: return action def reset(self, obs): self.s_t = obs self.random_process.reset_status() def load_weights(self, output, num=1): if output is None: return self.actor.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num))) self.actor_target.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num))) self.critic.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num))) self.critic_target.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num))) def save_model(self, output, num): if self.use_cuda: self.cnn.cpu() self.actor.cpu() self.critic.cpu() torch.save(self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num)) torch.save(self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num)) if self.use_cuda: self.cnn.cuda() self.actor.cuda() self.critic.cuda()
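The update_policy above still uses the pre-0.4 volatile flag, which current PyTorch ignores; a sketch of the equivalent target-Q computation under torch.no_grad() is shown below, with critic_target, actor_target and discount passed in as stand-ins for the corresponding attributes of the class.

import numpy as np
import torch

# Modern replacement for the volatile=True / .volatile = False pattern above:
# build the bootstrap target inside torch.no_grad() so no graph is kept.
def compute_target_q(critic_target, actor_target, reward_batch, next_state_batch,
                     terminal_batch, discount):
    rewards = torch.as_tensor(reward_batch, dtype=torch.float32)
    not_done = torch.as_tensor(1.0 - terminal_batch.astype(np.float32))
    with torch.no_grad():
        next_states = torch.as_tensor(next_state_batch, dtype=torch.float32)
        next_q = critic_target([next_states, actor_target(next_states)])
    return rewards + discount * not_done * next_q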
class Agent: """Initeracts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed, cfg): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ buffer_size = cfg["Agent"]["Buffer_size"] batch_size = cfg["Agent"]["Batch_size"] gamma = cfg["Agent"]["Gamma"] tau = cfg["Agent"]["Tau"] lr_actor = cfg["Agent"]["Lr_actor"] lr_critic = cfg["Agent"]["Lr_critic"] noise_decay = cfg["Agent"]["Noise_decay"] weight_decay = cfg["Agent"]["Weight_decay"] update_every = cfg["Agent"]["Update_every"] noise_min = cfg["Agent"]["Noise_min"] noise_initial = cfg["Agent"]["Noise_initial"] action_clip = cfg["Agent"]["Action_clip"] # Attach some configuration parameters self.state_size = state_size self.action_size = action_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.update_every = update_every self.action_clip = action_clip # Actor Networks both Local and Target. self.actor_local = Actor(state_size, action_size, random_seed, cfg).to(device) self.actor_target = Actor(state_size, action_size, random_seed, cfg).to(device) self.actor_noise = ActorNoise(state_size, action_size, random_seed, cfg).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor) # Critic Networks both Local and Target. self.critic_local = Critic(state_size, action_size, random_seed, cfg).to(device) self.critic_target = Critic(state_size, action_size, random_seed, cfg).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=weight_decay) # Noise process self.noise = OUNoise(action_size, random_seed, cfg) self.noise_modulation = noise_initial self.noise_decay = noise_decay self.noise_min = noise_min # Replay memory # self._memory = Memory(capacity=buffer_size, seed=random_seed) self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed) # Count number of steps self.n_steps = 0 def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" self.memory.add(state, action, reward, next_state, done) # Learn if enough samples are available in memory if len(self.memory ) > self.batch_size and self.n_steps % self.update_every == 0: experiences = self.memory.sample() self.learn(experiences) self.noise_modulation *= self.noise_decay self.noise_modulation = max(self.noise_modulation, self.noise_min) self.n_steps += 1 def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() action = self.actor_local(state).cpu().data.numpy() if add_noise: # action += self.noise_modulation * self.noise.sample() self.actor_noise.reset_parameters() self.actor_noise.eval() self.hard_update(self.actor_local, self.actor_noise, self.noise_modulation) action = self.actor_noise(state).cpu().data.numpy() self.actor_noise.train() self.actor_local.train() return np.clip(action, -self.action_clip, self.action_clip) def reset(self): self.n_steps = 0 self.noise.reset() def learn(self, experiences): """Update policy and value parameters given batch of experience tuples. 
Q_targets = r + gamma * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Update critic # Get predicted next-state actions and Q-values from target models. self.actor_target.eval() self.critic_target.eval() with torch.no_grad(): actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # torch.no_grad() keeps actor_target and critic_target out of the graph; eval()/train() only toggle dropout and batch-norm. self.actor_target.train() self.critic_target.train() # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() # Clear gradient critic_loss.backward() # Backpropagation torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # Update parameters # Update actor actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() # Clear gradient actor_loss.backward() # Backpropagation # torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1) self.actor_optimizer.step() # Update parameters # Now we update the target networks self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. theta_target = tau * theta_local + (1 - tau) * theta_target Params ====== local_model: PyTorch model (weight source) target_model: PyTorch model (weight destination) """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def hard_update(self, local_model, noise_model, noise_modulation): """Perturb model parameters with scaled noise weights. theta_noise = theta_local + noise_modulation * theta_noise Params ====== local_model: PyTorch model (weight source) noise_model: PyTorch model (weight destination) """ for noise_param, local_param in zip(noise_model.parameters(), local_model.parameters()): noise_param.data.copy_(local_param.data + noise_modulation * noise_param.data)
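A driver loop for the parameter-noise Agent above could look like the sketch below; the environment name, episode budget, classic 4-tuple gym step API and the cfg layout are all assumptions.

import gym
import numpy as np

# Illustrative training driver for Agent(state_size, action_size, seed, cfg) above.
def run(cfg, n_episodes=200, max_steps=1000, env_name='Pendulum-v1', seed=0):
    env = gym.make(env_name)
    agent = Agent(env.observation_space.shape[0], env.action_space.shape[0], seed, cfg)
    scores = []
    for episode in range(1, n_episodes + 1):
        state = env.reset()
        agent.reset()
        score = 0.0
        for _ in range(max_steps):
            action = agent.act(state)                        # perturbed-actor action
            next_state, reward, done, _ = env.step(action)   # classic gym API assumed
            agent.step(state, action, reward, next_state, done)
            state, score = next_state, score + reward
            if done:
                break
        scores.append(score)
        print(f'Episode {episode}\taverage score over last 100: {np.mean(scores[-100:]):.2f}')
    return scores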