def __init__(self, args):
    self.n_actions = args.n_actions
    self.n_agents = args.n_agents
    self.state_shape = args.state_shape
    self.obs_shape = args.obs_shape
    input_shape = self.obs_shape
    # determine the RNN input dimension from the arguments
    if args.last_action:
        input_shape += self.n_actions
    if args.reuse_network:
        input_shape += self.n_agents

    if args.alg == 'vdn':
        self.policy = VDN(args)
    elif args.alg == 'qmix':
        # self.policy = QMIX(args)
        self.policy = rnn_policy(input_shape, args)
        if args.cuda:
            self.policy.cuda()
    # elif args.alg == 'coma':
    #     self.policy = COMA(args)
    # elif args.alg == 'qtran_alt':
    #     self.policy = QtranAlt(args)
    # elif args.alg == 'qtran_base':
    #     self.policy = QtranBase(args)
    # elif args.alg == 'maven':
    #     self.policy = MAVEN(args)
    # elif args.alg == 'central_v':
    #     self.policy = CentralV(args)
    # elif args.alg == 'reinforce':
    #     self.policy = Reinforce(args)
    # else:
    #     raise Exception("No such algorithm")
    self.args = args
    print('Init Agents')
def __init__(self, args):
    self.n_actions = args.n_actions
    self.n_agents = args.n_agents
    self.state_shape = args.state_shape
    self.obs_shape = args.obs_shape
    if args.alg == 'vdn':
        self.policy = VDN(args)
    elif args.alg == 'qmix':
        self.policy = QMIX(args)
    elif args.alg == 'coma':
        self.policy = COMA(args)
    elif args.alg == 'qtran_alt':
        self.policy = QtranAlt(args)
    elif args.alg == 'qtran_base':
        self.policy = QtranBase(args)
    elif args.alg == 'maven':
        self.policy = MAVEN(args)
    elif args.alg == 'central_v':
        self.policy = CentralV(args)
    elif args.alg == 'reinforce':
        self.policy = Reinforce(args)
    else:
        raise Exception("No such algorithm")
    self.args = args
    print('Init Agents')
def __init__(self, args):
    self.n_actions = args.n_actions
    self.n_agents = args.n_agents
    self.state_shape = args.state_shape
    self.obs_shape = args.obs_shape
    if args.alg == 'vdn':
        from policy.vdn import VDN
        self.policy = VDN(args)
    elif args.alg == 'iql':
        from policy.iql import IQL
        self.policy = IQL(args)
    elif args.alg == 'qmix':
        from policy.qmix import QMIX
        self.policy = QMIX(args)
    elif args.alg == 'coma':
        from policy.coma import COMA
        self.policy = COMA(args)
    elif args.alg == 'qtran_alt':
        from policy.qtran_alt import QtranAlt
        self.policy = QtranAlt(args)
    elif args.alg == 'qtran_base':
        from policy.qtran_base import QtranBase
        self.policy = QtranBase(args)
    elif args.alg == 'maven':
        from policy.maven import MAVEN
        self.policy = MAVEN(args)
    elif args.alg == 'central_v':
        from policy.central_v import CentralV
        self.policy = CentralV(args)
    elif args.alg == 'reinforce':
        from policy.reinforce import Reinforce
        self.policy = Reinforce(args)
    else:
        raise Exception("No such algorithm")
    self.args = args
def __init__(self, args):
    self.n_actions = args.n_actions
    self.n_agents = args.n_agents
    self.state_shape = args.state_shape
    self.obs_shape = args.obs_shape
    if args.alg == 'vdn':
        self.policy = VDN(args)
    else:
        self.policy = QMIX(args)
    self.args = args
def __init__(self, args):
    self.n_actions = args.n_actions
    self.n_agents = args.n_agents
    self.state_shape = args.state_shape
    self.obs_shape = args.obs_shape
    input_shape = self.obs_shape
    # determine the RNN input dimension from the arguments
    if args.last_action:
        input_shape += self.n_actions
    if args.reuse_network:
        input_shape += self.n_agents

    if args.alg == 'vdn':
        self.policy = VDN(args)
    elif args.alg == 'qmix':
        # self.policy = QMIX(args)
        self.policy = rnn_policy(input_shape, args)
        if args.cuda:
            self.policy.cuda()
    self.args = args
    print('Init Agents')
def __init__(self, args):
    self.n_actions = args.n_actions
    self.n_agents = args.n_agents * 2
    self.state_shape = args.state_shape
    self.obs_shape = args.obs_shape
    self.idact_shape = args.id_dim + args.n_actions
    self.search_actions = np.eye(args.n_actions)
    self.search_ids = np.zeros(self.n_agents)
    if args.alg == 'vdn':
        self.policy = VDN(args)
    elif args.alg == 'qmix':
        self.policy = QMIX(args)
    elif args.alg == 'ours':
        self.policy = OURS(args)
    elif args.alg == 'coma':
        self.policy = COMA(args)
    elif args.alg == 'qtran_alt':
        self.policy = QtranAlt(args)
    elif args.alg == 'qtran_base':
        self.policy = QtranBase(args)
    elif args.alg == 'maven':
        self.policy = MAVEN(args)
    elif args.alg == 'central_v':
        self.policy = CentralV(args)
    elif args.alg == 'reinforce':
        self.policy = Reinforce(args)
    else:
        raise Exception("No such algorithm")

    if args.use_fixed_model:
        # build a pretrained, non-learning VDN policy (epsilon = 0, no exploration)
        # that acts as the fixed model on the 'battle' map
        args_goal_a = get_common_args()
        args_goal_a.load_model = True
        args_goal_a = get_mixer_args(args_goal_a)
        args_goal_a.learn = False
        args_goal_a.epsilon = 0  # 1
        args_goal_a.min_epsilon = 0
        args_goal_a.map = 'battle'
        args_goal_a.n_actions = args.n_actions
        args_goal_a.episode_limit = args.episode_limit
        args_goal_a.n_agents = args.n_agents
        args_goal_a.state_shape = args.state_shape
        args_goal_a.feature_shape = args.feature_shape
        args_goal_a.view_shape = args.view_shape
        args_goal_a.obs_shape = args.obs_shape
        args_goal_a.real_view_shape = args.real_view_shape
        args_goal_a.load_num = args.load_num
        args_goal_a.use_ja = False
        args_goal_a.mlp_hidden_dim = [512, 512]
        self.fixed_policy = VDN_F(args_goal_a)

    self.args = args
    print('Init Agents')
def __init__(self, args):
    self.n_actions = args.n_actions
    self.n_agents = args.n_agents
    self.state_shape = args.state_shape
    self.obs_shape = args.obs_shape
    if args.alg == 'vdn':
        self.policy = VDN(args)
    elif args.alg == 'qmix':
        self.policy = QMIX(args)
    elif args.alg == 'coma':
        self.policy = COMA(args)
    elif args.alg == 'qtran_alt':
        self.policy = QtranAlt(args)
    elif args.alg == 'qtran_base':
        self.policy = QtranBase(args)
    else:
        raise Exception("No such algorithm")
    self.args = args
import numpy as np
import torch
import torch.nn.functional as F
from torch.distributions import Categorical

# VDN and rnn_policy come from this project's policy/network modules
# (their import lines are not shown in the original snippet).


class Agents:
    def __init__(self, args):
        self.n_actions = args.n_actions
        self.n_agents = args.n_agents
        self.state_shape = args.state_shape
        self.obs_shape = args.obs_shape
        input_shape = self.obs_shape
        # determine the RNN input dimension from the arguments
        if args.last_action:
            input_shape += self.n_actions
        if args.reuse_network:
            input_shape += self.n_agents

        if args.alg == 'vdn':
            self.policy = VDN(args)
        elif args.alg == 'qmix':
            # self.policy = QMIX(args)
            self.policy = rnn_policy(input_shape, args)
            if args.cuda:
                self.policy.cuda()
        self.args = args
        print('Init Agents')

    def init_hidden(self, episode_num):
        # initialize an eval hidden state for every agent in every episode
        self.policy_eval_hidden = torch.zeros(
            (episode_num, self.n_agents, self.args.rnn_hidden_dim))
        # self.target_hidden = torch.zeros((episode_num, self.n_agents, self.args.rnn_hidden_dim))

    def choose_action(self, obs, last_action, agent_num, avail_actions, epsilon,
                      maven_z=None, evaluate=False):
        inputs = obs.copy()
        avail_actions_ind = np.nonzero(avail_actions)[0]  # indices of actions that can be chosen

        # transform agent_num into a one-hot vector
        agent_id = np.zeros(self.n_agents)
        agent_id[agent_num] = 1.

        if self.args.last_action:
            inputs = np.hstack((inputs, last_action))
        if self.args.reuse_network:
            inputs = np.hstack((inputs, agent_id))
        hidden_state = self.policy_eval_hidden[:, agent_num, :]

        # transform the shape of inputs from (42,) to (1, 42)
        inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
        avail_actions = torch.tensor(avail_actions, dtype=torch.float32).unsqueeze(0)
        if self.args.cuda:
            inputs = inputs.cuda()
            hidden_state = hidden_state.cuda()

        # get q values from the RNN policy
        if self.args.alg == 'maven':
            maven_z = torch.tensor(maven_z, dtype=torch.float32).unsqueeze(0)
            if self.args.cuda:
                maven_z = maven_z.cuda()
            agent_outs, self.policy.eval_hidden[:, agent_num, :] = self.policy.eval_rnn(
                inputs, hidden_state, maven_z)
        else:
            # agent_outs, self.policy.eval_hidden[:, agent_num, :] = self.policy.eval_rnn(inputs, hidden_state)
            if torch.isnan(hidden_state).any():
                # debugging hook: pause briefly when the hidden state contains NaNs
                import time
                time.sleep(1)
            agent_outs, self.policy_eval_hidden[:, agent_num, :] = self.policy(
                inputs, hidden_state)
            # agent_outs = torch.tanh(agent_outs)  # TODO

        # choose an action from the q values
        if self.args.alg == 'coma' or self.args.alg == 'central_v' or self.args.alg == 'reinforce':
            action = self._choose_action_from_softmax(
                agent_outs.cpu(), avail_actions, epsilon, evaluate)
        else:  # qmix
            agent_outs[avail_actions == 0.0] = -float("inf")  # mask out unavailable actions
            if evaluate:
                agent_outs = F.softmax(agent_outs / self.args.temp, dim=1)  # probability distribution
                action = agent_outs.max(dim=1, keepdim=False)[1]  # greedy action indices
                # action = np.random.choice(np.arange(agent_outs.shape[-1]), 1,
                #                           p=agent_outs.squeeze(0).detach().cpu().numpy())
                # (action is an integer sampled according to the probability distribution)
            else:
                if torch.isnan(agent_outs).any():
                    # debugging hook: pause briefly when the outputs contain NaNs
                    import time
                    time.sleep(1)
                agent_outs = F.softmax(agent_outs / self.args.temp, dim=1)  # probability distribution
                # action is an integer sampled according to the probability distribution
                action = np.random.choice(np.arange(agent_outs.shape[-1]), 1,
                                          p=agent_outs.squeeze(0).detach().cpu().numpy())
                # while int(action) not in avail_actions_ind:
                #     print(f'agent_outs:{agent_outs}')
                #     print(f'{action} not in {avail_actions_ind}')
                #     # action = agent_outs.max(dim=1, keepdim=False)[1]
                #     action = np.random.choice(avail_actions_ind)
        return action

    def _choose_action_from_softmax(self, inputs, avail_actions, epsilon, evaluate=False):
        """
        :param inputs: q values of all actions
        """
        action_num = avail_actions.sum(dim=1, keepdim=True).float().repeat(
            1, avail_actions.shape[-1])  # number of available actions
        # first convert the actor network output into a probability distribution via softmax
        prob = torch.nn.functional.softmax(inputs, dim=-1)
        # add epsilon noise
        prob = ((1 - epsilon) * prob + torch.ones_like(prob) * epsilon / action_num)
        prob[avail_actions == 0] = 0.0  # unavailable actions get probability 0
        """
        After zeroing the probabilities of unavailable actions, prob no longer sums to 1.
        No normalization is needed here because torch.distributions.Categorical normalizes
        internally. Note that Categorical is not used during training, so the probabilities
        of the executed actions must be re-normalized there.
        """
        if epsilon == 0 and evaluate:
            action = torch.argmax(prob)
        else:
            action = Categorical(prob).sample().long()
        return action

    def _get_max_episode_len(self, batch):
        terminated = batch['terminated']
        episode_num = terminated.shape[0]
        max_episode_len = 0
        for episode_idx in range(episode_num):
            for transition_idx in range(self.args.episode_limit):
                if terminated[episode_idx, transition_idx, 0] == 1:
                    if transition_idx + 1 >= max_episode_len:
                        max_episode_len = transition_idx + 1
                    break
        return max_episode_len

    def train(self, batch, train_step, epsilon=None):  # coma needs epsilon for training
        # different episodes have different lengths, so get the max length in the batch
        max_episode_len = self._get_max_episode_len(batch)
        for key in batch.keys():
            if key != 'z':
                batch[key] = batch[key][:, :max_episode_len]
        self.policy.learn(batch, max_episode_len, train_step, epsilon)
        if train_step > 0 and train_step % self.args.save_cycle == 0:
            self.policy.save_model(train_step)
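# --- Usage sketch (illustrative, not part of the original code) ---
# A minimal rollout loop showing how the Agents class above is typically driven:
# reset the hidden states once per episode, call choose_action per agent per step,
# and feed the joint action to the environment. `env` is assumed to expose a
# SMAC-style interface (get_obs, get_avail_agent_actions, step) and `args` the
# fields referenced in __init__ (n_agents, n_actions, ...); both are assumptions
# introduced here for illustration only.
import numpy as np


def run_episode_sketch(agents, env, args, epsilon=0.05):
    agents.init_hidden(episode_num=1)                         # reset RNN hidden states
    last_action = np.zeros((args.n_agents, args.n_actions))   # one-hot of previous actions
    terminated = False
    while not terminated:
        obs = env.get_obs()                                   # per-agent observations
        actions = []
        for agent_num in range(args.n_agents):
            avail = env.get_avail_agent_actions(agent_num)    # action mask for this agent
            a = agents.choose_action(obs[agent_num], last_action[agent_num],
                                     agent_num, avail, epsilon, evaluate=False)
            actions.append(int(a))
            last_action[agent_num] = np.eye(args.n_actions)[int(a)]
        _, terminated, _ = env.step(actions)                  # assumed (reward, terminated, info)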