# where it is shared
print("Creating optimizers...")
optimizer = torch.optim.RMSprop(DQN_main.parameters())
# optimizer_next_object = torch.optim.RMSprop(DQN_next_object_main.parameters())
# optimizer_predicate = torch.optim.RMSprop(DQN_predicate_main.parameters())
# optimizer_attribute = torch.optim.RMSprop(DQN_attribute_main.parameters())
print("Done!")

# define loss functions
loss_fn = nn.MSELoss()
# loss_fn_predicate = nn.MSELoss()
# loss_fn_next_object = nn.MSELoss()

# create replay buffer
print("Creating replay buffer...")
replay_buffer = ReplayMemory(replay_buffer_capacity,
                             replay_buffer_minimum_number_samples)
print("Done!")

# load skip thought model
# skip_thought_model = skipthoughts.load_model()
# skip_thought_encoder = skipthoughts.Encoder(skip_thought_model)

# load train data samples
if args.train:
    train_data_samples = json.load(open(args.train_data))
    train_dataset = VGDataset(train_data_samples, args.images_dir)
    train_data_loader = DataLoader(dataset=train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate)
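# --- Hedged sketch (not part of the original project) -------------------------
# The two-argument ReplayMemory(capacity, minimum_number_samples) constructed
# above is defined elsewhere; a minimal buffer with that interface is assumed
# to look roughly like the class below (names and behaviour are assumptions,
# not the project's actual implementation).
import random
from collections import deque


class MinimalReplayMemory(object):
    """Fixed-size buffer that only allows sampling once it holds enough transitions."""

    def __init__(self, capacity, minimum_number_samples):
        self.memory = deque(maxlen=capacity)
        self.minimum_number_samples = minimum_number_samples

    def push(self, transition):
        self.memory.append(transition)

    def can_sample(self):
        return len(self.memory) >= self.minimum_number_samples

    def sample(self, batch_size):
        return random.sample(list(self.memory), batch_size)
# -------------------------------------------------------------------------------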
class Agent(object):
    def __init__(self, n_states, n_actions, hidden_dim):
        """Agent class that chooses actions and trains the Q-networks.

        Args:
            n_states (int): input dimension
            n_actions (int): output dimension
            hidden_dim (int): hidden dimension
        """
        self.q_local = QNetwork(n_states, n_actions, hidden_dim=16).to(device)
        self.q_target = QNetwork(n_states, n_actions, hidden_dim=16).to(device)
        self.mse_loss = torch.nn.MSELoss()
        self.optim = optim.Adam(self.q_local.parameters(), lr=LEARNING_RATE)
        self.n_states = n_states
        self.n_actions = n_actions

        # ReplayMemory: trajectories are saved here
        self.replay_memory = ReplayMemory(10000)

    def get_action(self, state, eps, check_eps=True):
        """Returns an action.

        Args:
            state: 2-D tensor of shape (n, input_dim)
            eps (float): epsilon for epsilon-greedy exploration

        Returns:
            int: action index
        """
        global steps_done
        sample = random.random()

        if not check_eps or sample > eps:
            with torch.no_grad():
                # max(1) returns the largest value in each row; its second
                # element is the index of that value, i.e. the action with the
                # largest expected reward. (Variable is deprecated, so the
                # tensor is used directly.)
                return self.q_local(state.type(FloatTensor)).data.max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.n_actions)]],
                                device=device)

    def learn(self, experiences, gamma):
        """Prepare a minibatch and train on it.

        Args:
            experiences (List[Transition]): minibatch of `Transition`
            gamma (float): discount rate for Q_target
        """
        if len(self.replay_memory.memory) < BATCH_SIZE:
            return

        transitions = self.replay_memory.sample(BATCH_SIZE)
        batch = Transition(*zip(*transitions))

        states = torch.cat(batch.state)
        actions = torch.cat(batch.action)
        rewards = torch.cat(batch.reward)
        next_states = torch.cat(batch.next_state)
        dones = torch.cat(batch.done)

        # Double DQN: use the local model to choose the greedy action for the
        # next state, and the target model to evaluate that action.
        Q_max_action = self.q_local(next_states).detach().max(1)[1].unsqueeze(1)
        Q_targets_next = self.q_target(next_states).gather(1, Q_max_action).reshape(-1)

        # Compute the expected Q values
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Q(s_t, a): the model computes Q(s_t), then we select the columns of
        # the actions that were actually taken in the batch.
        Q_expected = self.q_local(states).gather(1, actions)

        self.optim.zero_grad()
        loss = self.mse_loss(Q_expected, Q_targets.unsqueeze(1))

        # backpropagate the loss through the network
        loss.backward()
        self.optim.step()
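# --- Hedged usage sketch (not from the original project) ----------------------
# A minimal training loop for the Agent above. The Gymnasium-style env API, the
# replay_memory.push(...) signature, and the periodic hard copy into q_target
# are assumptions; the class itself never syncs its target network.
def train(agent, env, n_episodes=500, eps=0.1, gamma=0.99, target_sync=10):
    for episode in range(n_episodes):
        obs, _ = env.reset()
        state = torch.tensor([obs], dtype=torch.float32, device=device)
        done = False
        while not done:
            action = agent.get_action(state, eps)
            obs, reward, terminated, truncated, _ = env.step(action.item())
            done = terminated or truncated
            next_state = torch.tensor([obs], dtype=torch.float32, device=device)
            agent.replay_memory.push(
                state, action,
                torch.tensor([reward], dtype=torch.float32, device=device),
                next_state,
                torch.tensor([float(done)], dtype=torch.float32, device=device))
            agent.learn(None, gamma)  # learn() samples from replay_memory itself
            state = next_state
        if episode % target_sync == 0:
            agent.q_target.load_state_dict(agent.q_local.state_dict())
# -------------------------------------------------------------------------------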
class ATOC_trainer(object):
    def __init__(self, gamma, tau, actor_hidden_size, critic_hidden_size,
                 observation_space, action_space, args):
        self.num_inputs = observation_space.shape[0]
        self.action_space = action_space
        self.actor_hidden_size = actor_hidden_size
        self.critic_hidden_size = critic_hidden_size
        self.comm_hidden_size = actor_hidden_size // 2
        self.gamma = gamma
        self.tau = tau
        self.args = args

        # replay (queue) used for the attention-unit update
        self.queue = queue.Queue()

        # Define actor part 1
        self.actor_p1 = ActorPart1(self.num_inputs, actor_hidden_size).to(device)
        self.actor_target_p1 = ActorPart1(self.num_inputs, actor_hidden_size).to(device)

        # the attention unit is not trained end-to-end
        self.atten = AttentionUnit(actor_hidden_size, actor_hidden_size).to(device)
        self.atten_optim = Adam(self.atten.parameters(), lr=self.args.actor_lr)

        # Define the communication channel
        self.comm = CommunicationChannel(actor_hidden_size,
                                         self.comm_hidden_size).to(device)
        self.comm_target = CommunicationChannel(actor_hidden_size,
                                                self.comm_hidden_size).to(device)
        self.comm_optim = Adam(self.comm.parameters(), lr=self.args.actor_lr)

        # Define actor part 2
        # input -- [thoughts, integrated thoughts]
        self.actor_p2 = ActorPart2(actor_hidden_size + self.comm_hidden_size * 2,
                                   self.action_space, actor_hidden_size).to(device)
        self.actor_target_p2 = ActorPart2(actor_hidden_size + self.comm_hidden_size * 2,
                                          self.action_space, actor_hidden_size).to(device)
        self.actor_optim = Adam([{
            'params': self.actor_p1.parameters(),
            'lr': self.args.actor_lr
        }, {
            'params': self.actor_p2.parameters(),
            'lr': self.args.actor_lr
        }])

        self.critic = Critic(self.num_inputs, self.action_space,
                             critic_hidden_size).to(device)
        self.critic_target = Critic(self.num_inputs, self.action_space,
                                    critic_hidden_size).to(device)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.args.critic_lr)

        # Make sure the targets start with the same weights
        hard_update(self.actor_target_p1, self.actor_p1)
        hard_update(self.comm_target, self.comm)
        hard_update(self.actor_target_p2, self.actor_p2)
        hard_update(self.critic_target, self.critic)

        # Create the replay buffer and exploration noise
        self.memory = ReplayMemory(args.memory_size)
        self.random_process = OrnsteinUhlenbeckProcess(size=action_space.n,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

    def update_thoughts(self, thoughts, C):
        batch_size = 1
        nagents = thoughts.shape[0]
        thoughts = thoughts.clone().detach()

        for index in range(nagents):
            if not C[index, index]:
                continue
            # collect the thoughts of agent index's neighbours
            input_comm = []
            for j in range(nagents):
                if C[index, j]:
                    input_comm.append(thoughts[j])
            input_comm = torch.stack(input_comm, dim=0).unsqueeze(0)  # (1, m, actor_hidden_size)

            # run the communication channel to integrate the thoughts
            hidden_state = self.initHidden(batch_size)
            integrated_thoughts, _ = self.comm(input_comm, hidden_state)  # (1, m, 2*comm_hidden_size)
            integrated_thoughts = integrated_thoughts.squeeze()

            # write the integrated thoughts back to the group
            thoughts[C[index]] = integrated_thoughts

        return thoughts

    def select_action(self, thoughts, inter_thoughts, C, action_noise=True):
        nagents = thoughts.shape[0]

        # merge individual thoughts and integrated thoughts
        is_comm = C.any(dim=0)  # (nagents,)

        # agents without communication are padded with zeros
        for i in range(nagents):
            if not is_comm[i]:
                inter_thoughts[i] = 0

        # TODO: [integrated_thoughts, individual_thoughts] ???
        # (nagents, actor_hidden_size + 2*comm_hidden_size)
        input_actor2 = torch.cat((thoughts, inter_thoughts), dim=-1)

        # feed part II of the actor
        actor2_action = self.actor_p2(input_actor2)
        action = actor2_action.data.numpy()
        return action

    def calc_delta_Q(self, obs_n, action_n, thoughts, C):
        obs_n = torch.FloatTensor(obs_n).to(device)
        action_n = torch.FloatTensor(action_n).to(device)
        nagents = obs_n.shape[0]

        for index in range(nagents):
            group_Q = []
            actual_group_Q = []
            if not C[index, index]:
                continue
            for j in range(nagents):
                if not C[index, j]:
                    continue
                h_j = torch.cat((thoughts[j], torch.zeros_like(thoughts[j])),
                                dim=-1).unsqueeze(0)
                action_j = self.actor_p2(h_j)  # (1, action_shape)
                actual_action_j = action_n[j].unsqueeze(0)  # (1, action_shape)
                Q_j = self.critic(obs_n[j].unsqueeze(0), action_j)  # (1, 1)
                actual_Q_j = self.critic(obs_n[j].unsqueeze(0), actual_action_j)
                group_Q.append(Q_j.squeeze())
                actual_group_Q.append(actual_Q_j.squeeze())

            group_Q = torch.stack(group_Q, dim=0)
            actual_group_Q = torch.stack(actual_group_Q, dim=0)  # (m,)
            delta_Q = actual_group_Q.mean() - group_Q.mean()

            # store the thought and delta_Q for the attention-unit update
            h_i = thoughts[index].data.numpy()  # (actor_hidden_size,)
            delta_Q = delta_Q.data.numpy()  # scalar
            self.queue.put((h_i, delta_Q))

    def update_parameters(self):
        batch_size = self.args.batch_size
        batch = self.memory.sample(batch_size)

        obs_n_batch = torch.FloatTensor(batch.obs_n).to(device)  # (batch_size, nagents, obs_shape)
        action_n_batch = torch.FloatTensor(batch.action_n).to(device)  # (batch_size, nagents, action_shape)
        reward_n_batch = torch.FloatTensor(batch.reward_n).unsqueeze(-1).to(device)  # (batch_size, nagents, 1)
        next_obs_n_batch = torch.FloatTensor(batch.next_obs_n).to(device)  # (batch_size, nagents, obs_shape)
        C_batch = torch.BoolTensor(batch.C).to(device)  # (batch_size, nagents, nagents)
        nagents = obs_n_batch.shape[1]

        # -----------------------------------------------------------------------------------------
        # sample agents without communication
        # -----------------------------------------------------------------------------------------
        # True --> communication, False --> no communication
        # TODO: uncomment this block
        # ind = C_batch.any(dim=1)  # (batch_size, nagents)
        # obs_n = obs_n_batch[ind == False]
        # action_n = action_n_batch[ind == False]
        # reward_n = reward_n_batch[ind == False]
        # next_obs_n = next_obs_n_batch[ind == False]  # (sample_agents, shape)

        # # update critic
        # thoughts_n = self.actor_target_p1(next_obs_n)  # (sample_agents, actor_hidden_size)
        # padding = torch.zeros(thoughts_n.shape[0], 2*self.comm_hidden_size)
        # input_target_actor2 = torch.cat((thoughts_n, padding), dim=-1)  # (sample_agents, hidden_size)
        # next_action_n = self.actor_target_p2(input_target_actor2)  # (sample_agents, action_shape)
        # next_Q_n = self.critic_target(next_obs_n, next_action_n)  # (sample_agents, 1)
        # target_Q_n = reward_n + (self.gamma * next_Q_n).detach()  # (sample_agents, 1)
        # Q_n = self.critic(obs_n, action_n)  # (sample_agents, 1)
        # value_loss = F.mse_loss(target_Q_n, Q_n)
        # self.critic_optim.zero_grad()
        # value_loss.backward()
        # self.critic_optim.step()

        # # update actor
        # thoughts_actor = self.actor_p1(obs_n)
        # padding_actor = torch.zeros(thoughts_actor.shape[0], 2*self.comm_hidden_size)
        # input_actor2 = torch.cat((thoughts_actor, padding_actor), dim=-1)
        # action_n_actor = self.actor_p2(input_actor2)
        # policy_loss = -self.critic(obs_n, action_n_actor)
        # policy_loss = policy_loss.mean()
        # self.actor_optim.zero_grad()
        # policy_loss.backward()
        # self.actor_optim.step()
        # -----------------------------------------------------------------------------------------
        # sample agents with communication
        # -----------------------------------------------------------------------------------------
        # update critic
        target_Q = []
        Q = []
        for batch_index in range(batch_size):
            is_comm = C_batch[batch_index].any(dim=0)  # (nagents,)
            next_thoughts_n = self.actor_target_p1(
                next_obs_n_batch[batch_index])  # (nagents, actor_hidden_size)

            # communication
            padding = next_thoughts_n.clone().detach()
            for agent_i in range(nagents):
                if not C_batch[batch_index, agent_i, agent_i]:
                    continue
                thoughts_m = padding[C_batch[batch_index, agent_i]].unsqueeze(
                    0)  # (1, m, actor_hidden_size)
                hidden_state = self.initHidden(1)
                inter_thoughts, _ = self.comm_target(
                    thoughts_m, hidden_state)  # (1, m, 2*comm_hidden_size)
                inter_thoughts = inter_thoughts.squeeze()  # (m, 2*comm_hidden_size)

                # write the integrated thoughts back into the clone -- intra-group communication
                # TODO: Can this avoid the in-place operation?
                padding = padding.clone()
                padding[C_batch[batch_index, agent_i]] = inter_thoughts

            # select actions for the m agents with communication
            padding[~is_comm] = 0.0
            input_target_actor2 = torch.cat(
                (next_thoughts_n, padding),
                dim=-1)  # (nagents, actor_hidden_size + 2*comm_hidden_size)
            next_action_n = self.actor_target_p2(
                input_target_actor2)  # (nagents, action_shape)

            next_obs_m = next_obs_n_batch[batch_index, is_comm]  # (m, obs_shape)
            next_action_m = next_action_n[is_comm]  # (m, action_shape)
            next_Q_m = self.critic_target(next_obs_m, next_action_m)  # (m, 1)
            reward_m = reward_n_batch[batch_index, is_comm]  # (m, 1)
            target_Q_m = reward_m + (self.gamma * next_Q_m).detach()  # (m, 1)

            obs_m = obs_n_batch[batch_index, is_comm]
            action_m = action_n_batch[batch_index, is_comm]
            Q_m = self.critic(obs_m, action_m)

            target_Q.append(target_Q_m)
            Q.append(Q_m)

        target_Q = torch.stack(target_Q, dim=0)
        Q = torch.stack(Q, dim=0)
        critic_loss = F.mse_loss(target_Q, Q)
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

        # update actor and communication channel
        actor_loss = []
        for batch_index in range(batch_size):
            is_comm = C_batch[batch_index].any(dim=0)  # (nagents,)
            thoughts_n = self.actor_p1(
                obs_n_batch[batch_index])  # (nagents, actor_hidden_size)

            # communication
            padding = thoughts_n.clone().detach()
            for agent_i in range(nagents):
                if not C_batch[batch_index, agent_i, agent_i]:
                    continue
                thoughts_m = padding[C_batch[batch_index, agent_i]].unsqueeze(
                    0)  # (1, m, actor_hidden_size)
                hidden_state = self.initHidden(1)
                inter_thoughts, _ = self.comm(
                    thoughts_m, hidden_state)  # (1, m, 2*comm_hidden_size)
                inter_thoughts = inter_thoughts.squeeze()

                # TODO: Can this avoid the in-place operation and still pass the gradient?
                padding = padding.clone()
                padding[C_batch[batch_index, agent_i]] = inter_thoughts

            # select actions for the m agents with communication
            padding[~is_comm] = 0.0
            input_actor2 = torch.cat(
                (thoughts_n, padding),
                dim=-1)  # (nagents, actor_hidden_size + 2*comm_hidden_size)
            action_n = self.actor_p2(input_actor2)  # (nagents, action_shape)

            action_m = action_n[is_comm]  # (m, action_shape)
            obs_m = obs_n_batch[batch_index, is_comm]  # (m, obs_shape)
            actor_loss_batch = -self.critic(obs_m, action_m)  # (m, 1)
            actor_loss.append(actor_loss_batch)

        actor_loss = torch.stack(actor_loss, dim=0)  # (batch_size, m, 1)
        actor_loss = actor_loss.mean()
        self.actor_optim.zero_grad()
        self.comm_optim.zero_grad()
        actor_loss.backward()
        self.actor_optim.step()
        self.comm_optim.step()

        soft_update(self.actor_target_p1, self.actor_p1, self.tau)
        soft_update(self.actor_target_p2, self.actor_p2, self.tau)
        soft_update(self.comm_target, self.comm, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return critic_loss.item(), actor_loss.item()

    def update_attention_unit(self):
        h_i_batch = []
        delta_Q_batch = []
        while not self.queue.empty():
            h_i, delta_Q = self.queue.get()
            h_i_batch.append(h_i)
            delta_Q_batch.append(delta_Q)
        print("delta_Q_batch values", delta_Q_batch)

        h_i_batch = torch.FloatTensor(h_i_batch).to(device)  # (batch_size, actor_hidden_size)
        delta_Q_batch = torch.FloatTensor(delta_Q_batch).to(device)  # (batch_size,)

        p_i = self.atten(h_i_batch)  # (batch_size, 1)
        p_i = p_i.squeeze()

        # min-max normalization
        delta_Q_batch = (delta_Q_batch - delta_Q_batch.min()) / (
            delta_Q_batch.max() - delta_Q_batch.min())

        # binary-cross-entropy-style loss; reduced to a scalar so backward() works
        loss = (-delta_Q_batch * torch.log(p_i) -
                (1 - delta_Q_batch) * torch.log(1 - p_i)).mean()
        self.atten_optim.zero_grad()
        loss.backward()
        self.atten_optim.step()

    def get_thoughts(self, obs_n):
        obs_n_tensor = torch.FloatTensor(obs_n).to(device)  # (nagents, obs_shape)
        thoughts = self.actor_p1(obs_n_tensor)
        return thoughts

    def initiate_group(self, obs_n, m, thoughts):
        obs_n = np.array(obs_n)
        nagents = obs_n.shape[0]

        # decide whether to initiate communication
        atten_out = self.atten(thoughts)  # (nagents, 1)
        is_comm = (atten_out > 0.5).squeeze()  # (nagents,)
        C = torch.zeros(nagents, nagents).bool()

        # relative positions of the other agents
        other_pos = (obs_n[:, -(nagents - 1) * 2:]).reshape(
            -1, nagents - 1, 2)  # (nagents, nagents-1, 2)
        other_dist = np.sqrt(np.sum(np.square(other_pos), axis=-1))  # (nagents, nagents-1)

        # insert each agent's own (zero) distance into other_dist -> total_dist
        total_dist = []
        for i in range(nagents):
            total_dist.append(np.insert(other_dist[i], obj=i, values=0.0))
        total_dist = np.stack(total_dist)  # (nagents, nagents)

        # the ids of the top-m closest agents (including itself)
        sorted_idx = np.argsort(total_dist, axis=-1)
        assert m <= nagents
        neighbour_m = sorted_idx[:, :m]  # (nagents, m)

        for index, comm in enumerate(is_comm):
            if comm:
                C[index, neighbour_m[index]] = True

        # TODO: test the other parts of this project without the attention unit
        C = torch.zeros(nagents, nagents)
        C[0] = 1
        C = C.bool()
        return C

    def initHidden(self, batch_size):
        return torch.zeros((2 * 1, batch_size, self.comm_hidden_size))

    def save_model(self, env_name, suffix=""):
        if not os.path.exists('models/'):
            os.makedirs('models/')
        save_path = "models/ddpg_{}_{}".format(env_name, suffix)
        model = {
            'actor_p1': self.actor_p1.state_dict(),
            'actor_target_p1': self.actor_target_p1.state_dict(),
            'actor_p2': self.actor_p2.state_dict(),
            'actor_target_p2': self.actor_target_p2.state_dict(),
            'critic': self.critic.state_dict(),
            'critic_target': self.critic_target.state_dict(),
            'comm': self.comm.state_dict(),
            'comm_target': self.comm_target.state_dict(),
            'atten': self.atten.state_dict()
        }
        torch.save(model, save_path)
        print('Saving models to {}'.format(save_path))

    def load_model(self, env_name, suffix=""):
        load_path = "models/ddpg_{}_{}".format(env_name, suffix)
        print('Loading models from {}'.format(load_path))
        model = torch.load(load_path)
        self.actor_p1.load_state_dict(model['actor_p1'])
        self.actor_target_p1.load_state_dict(model['actor_target_p1'])
        self.actor_p2.load_state_dict(model['actor_p2'])
        self.actor_target_p2.load_state_dict(model['actor_target_p2'])
        self.critic.load_state_dict(model['critic'])
        self.critic_target.load_state_dict(model['critic_target'])
        self.comm.load_state_dict(model['comm'])
        self.comm_target.load_state_dict(model['comm_target'])
        self.atten.load_state_dict(model['atten'])
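# --- Hedged sketch (not part of the original project) -------------------------
# hard_update / soft_update are called by the trainer but defined elsewhere;
# the standard definitions below (full copy, and Polyak averaging with factor
# tau) match the call signatures used above and are what the code appears to
# assume.
def hard_update(target, source):
    """Copy the source network's parameters into the target network."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    """target <- tau * source + (1 - tau) * target."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)
# -------------------------------------------------------------------------------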
class Agent(object):
    def __init__(self, n_states, n_actions, hidden_dim, lr, device):
        """Agent class that chooses actions and trains the Q-networks.

        Args:
            n_states (int): input dimension
            n_actions (int): output dimension
            hidden_dim (int): hidden dimension
        """
        self.device = device
        self.q_local = QNetwork(n_states, n_actions, hidden_dim=16).to(self.device)
        self.q_target = QNetwork(n_states, n_actions, hidden_dim=16).to(self.device)
        self.mse_loss = torch.nn.MSELoss()
        self.optim = optim.Adam(self.q_local.parameters(), lr=lr)
        self.n_states = n_states
        self.n_actions = n_actions

        # ReplayMemory: trajectories are saved here
        self.replay_memory = ReplayMemory(10000)

    def get_action(self, state, eps, check_eps=True):
        """Returns an action.

        Args:
            state: 2-D tensor of shape (n, input_dim)
            eps (float): epsilon for epsilon-greedy exploration

        Returns:
            int: action index
        """
        global steps_done
        sample = random.random()

        if not check_eps or sample > eps:
            with torch.no_grad():
                return self.q_local(state.type(FloatTensor)).data.max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.n_actions)]],
                                device=self.device)

    def learn(self, experiences, gamma):
        """Prepare a minibatch and train on it.

        Args:
            experiences (List[Transition]): batch of `Transition`
            gamma (float): discount rate for Q_target
        """
        if len(self.replay_memory.memory) < BATCH_SIZE:
            return

        transitions = self.replay_memory.sample(BATCH_SIZE)
        batch = Transition(*zip(*transitions))

        states = torch.cat(batch.state)
        actions = torch.cat(batch.action)
        rewards = torch.cat(batch.reward)
        next_states = torch.cat(batch.next_state)
        dones = torch.cat(batch.done)

        # Q(s_t, a): the model computes Q(s_t), then we select the columns of
        # the actions that were actually taken, according to q_local (the
        # current estimate).
        Q_expected = self.q_local(states).gather(1, actions)

        Q_targets_next = self.q_target(next_states).detach().max(1)[0]

        # Compute the expected Q values
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        self.q_local.train(mode=True)
        self.optim.zero_grad()
        loss = self.mse_loss(Q_expected, Q_targets.unsqueeze(1))

        # backpropagate the loss through the network
        loss.backward()
        self.optim.step()

    def soft_update(self, local_model, target_model, tau):
        """Polyak update; tau (float) is the interpolation parameter."""
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, local, target):
        for target_param, param in zip(target.parameters(), local.parameters()):
            target_param.data.copy_(param.data)
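# --- Hedged sketch (not part of the original project) -------------------------
# Both Agent classes index `replay_memory.memory` and call `sample`, which
# matches the ReplayMemory from the PyTorch DQN tutorial. A minimal version of
# what they appear to assume, with a Transition namedtuple holding the five
# fields unpacked in learn():
import random
from collections import deque, namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'next_state', 'done'))


class ReplayMemory(object):
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(list(self.memory), batch_size)

    def __len__(self):
        return len(self.memory)
# -------------------------------------------------------------------------------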
# Network sizes are taken directly from the environment: the state is the flat
# observation vector returned by env.observation(), and the action size comes
# from the environment's action space.
state_size = torch.tensor(env.observation()).shape[0]
act_size = env.action_space.shape[0]
n_actions = 3

policy_net = DQN(state_size, act_size).to(device)
target_net = DQN(state_size, act_size).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = torch.optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(10000)

steps_done = 0


def converter(observation):
    state = torch.tensor(observation).float().to(device)
    return state


def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
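# --- Hedged sketch (not from the original file) --------------------------------
# select_action() is cut off above; the usual tail of this epsilon-greedy
# pattern (as in the standard PyTorch DQN example) is sketched here as a
# separate helper. The real continuation in the project may differ; act_size is
# assumed to be the number of discrete actions the network scores.
def epsilon_greedy(state, eps_threshold):
    if random.random() > eps_threshold:
        with torch.no_grad():
            # greedy action: index of the largest predicted Q-value
            return policy_net(state.unsqueeze(0)).max(1)[1].view(1, 1)
    return torch.tensor([[random.randrange(act_size)]],
                        device=device, dtype=torch.long)
# -------------------------------------------------------------------------------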
vis = Visdom()
win_score = None
win_actor_score = None
win_critic_loss = None

actor = Actor(state_size * 2, action_size * 2).to(device)
actor_target = Actor(state_size * 2, action_size * 2).to(device)
critic = Critic(state_size * 2, n_action=action_size * 2).to(device)
critic_target = Critic(state_size * 2, n_action=action_size * 2).to(device)

for target_param, param in zip(critic_target.parameters(), critic.parameters()):
    target_param.data.copy_(param.data)
for target_param, param in zip(actor_target.parameters(), actor.parameters()):
    target_param.data.copy_(param.data)

replay_buffer = ReplayMemory(args.replay_capacity)
criterion = nn.MSELoss()
optim_critic = torch.optim.Adam(critic.parameters(),
                                lr=args.lr_critic,
                                weight_decay=args.weight_decay_critic)
optim_actor = torch.optim.Adam(actor.parameters(), lr=args.lr_actor)

loss_critic = []
score_actor = []
score = 0
steps = 0
noise_std = args.noise_std_start

for i in range(args.episodes):
    env_info = env.reset(train_mode=True)[brain_name]
    state = torch.from_numpy(