def __init__(self, conf, qnet_conf, mixer_gnn_conf, mixer_ff_conf, brain_conf, buffer_conf):
    """Assemble the QMIX agent: Q-network + mixer (with optional target and
    second copies), the learning brain, and the N-step replay buffer.

    Args:
        conf: agent-level config; ``conf.agent_conf['use_target']`` and
            ``conf.agent_conf['use_clipped_q']`` gate the extra network copies.
        qnet_conf: config for ``MultiStepInputQnet``.
        mixer_gnn_conf, mixer_ff_conf: configs for ``QMixer``.
        brain_conf: forwarded to ``QMixBrain``.
        buffer_conf: ``buffer_conf.memory_conf`` is expanded into
            ``NstepInputMemory`` keyword arguments.
    """
    super(QmixAgent, self).__init__()
    self.conf = conf
    agent_conf = self.conf.agent_conf

    def _fresh_pair():
        # One (qnet, mixer) pair; qnet is constructed before the mixer so the
        # parameter-initialization order matches construction of the main pair.
        return MultiStepInputQnet(qnet_conf), QMixer(mixer_gnn_conf, mixer_ff_conf)

    qnet, mixer = _fresh_pair()
    # Target copies only exist when the config asks for them.
    qnet_target, mixer_target = _fresh_pair() if agent_conf['use_target'] else (None, None)
    # Second pair for clipped double-Q style updates.
    qnet2, mixer2 = _fresh_pair() if agent_conf['use_clipped_q'] else (None, None)

    self.brain = QMixBrain(conf=brain_conf,
                           qnet=qnet,
                           mixer=mixer,
                           qnet_target=qnet_target,
                           mixer_target=mixer_target,
                           qnet2=qnet2,
                           mixer2=mixer2)
    self.buffer = NstepInputMemory(**buffer_conf.memory_conf)
def __init__(self, conf, qnet_conf, mixer_gnn_conf, mixer_ff_conf, sup_mixer_conf, brain_conf, buffer_conf, soft_assignment=False):
    """Assemble the hierarchical QMIX agent: hierarchical Q-network + SupQmixer
    (with optional target and second copies), brain, and replay buffer.

    Args:
        conf: agent-level config; ``conf.agent_conf['use_target']`` and
            ``conf.agent_conf['use_clipped_q']`` gate the extra network copies.
        qnet_conf: config for ``HierarchicalMultiStepInputQnet``; its
            ``qnet_actor_conf['node_input_dim']`` sizes the SupQmixer input.
        mixer_gnn_conf, mixer_ff_conf: forwarded into the hierarchical Q-network.
        sup_mixer_conf: config for ``SupQmixer``.
        brain_conf: forwarded to ``HierarchicalQmixBrain``.
        buffer_conf: ``buffer_conf.memory_conf`` is expanded into
            ``NstepInputMemory`` keyword arguments.
        soft_assignment: forwarded to every ``HierarchicalMultiStepInputQnet``.
    """
    super(HierarchicalQmixAgent, self).__init__()
    self.conf = conf
    agent_conf = self.conf.agent_conf

    def _fresh_pair():
        # One (qnet, mixer) pair, constructed in the same order as the main
        # pair so parameter-initialization order is stable.
        net = HierarchicalMultiStepInputQnet(qnet_conf,
                                             mixer_gnn_conf,
                                             mixer_ff_conf,
                                             soft_assignment=soft_assignment)
        mix = SupQmixer(input_dim=qnet_conf.qnet_actor_conf['node_input_dim'],
                        conf=sup_mixer_conf)
        return net, mix

    qnet, mixer = _fresh_pair()
    # Target copies only exist when the config asks for them.
    qnet_target, mixer_target = _fresh_pair() if agent_conf['use_target'] else (None, None)
    # Second pair for clipped double-Q style updates.
    qnet2, mixer2 = _fresh_pair() if agent_conf['use_clipped_q'] else (None, None)

    self.brain = HierarchicalQmixBrain(conf=brain_conf,
                                       qnet=qnet,
                                       mixer=mixer,
                                       qnet_target=qnet_target,
                                       mixer_target=mixer_target,
                                       qnet2=qnet2,
                                       mixer2=mixer2)
    self.buffer = NstepInputMemory(**buffer_conf.memory_conf)
def __init__(self, conf, network_conf, brain_conf, buffer_conf, use_attention=True, use_hierarchical_actor=False):
    """Assemble the actor-critic agent: actor, critic, optional target and
    double-Q critic copies, brain, and N-step replay buffer.

    Args:
        conf: agent-level config; ``conf.module_conf['use_target']`` and
            ``conf.module_conf['use_double_q']`` gate the extra critic copies.
        network_conf: shared config for every ``MultiStepInputActor`` instance
            (the critic reuses the same network class/config as the actor).
        brain_conf: forwarded to both the base class and
            ``MultiStepActorCriticBrain``.
        buffer_conf: forwarded to the base class; ``buffer_conf.memory_conf``
            is expanded into ``NstepInputMemory`` keyword arguments.
        use_attention, use_hierarchical_actor: forwarded to every network copy.
    """
    super(MultiStepActorCriticAgent, self).__init__(brain_conf=brain_conf,
                                                    buffer_conf=buffer_conf)
    self.conf = conf
    module_conf = self.conf.module_conf

    def _fresh_net():
        # All four roles (actor / critic / targets) share one construction recipe.
        return MultiStepInputActor(network_conf,
                                   use_attention=use_attention,
                                   use_hierarchical_actor=use_hierarchical_actor)

    actor = _fresh_net()
    critic = _fresh_net()
    critic_target = _fresh_net() if module_conf['use_target'] else None
    if module_conf['use_double_q']:
        critic2 = _fresh_net()
        critic2_target = _fresh_net()
    else:
        critic2 = None
        critic2_target = None

    self.brain = MultiStepActorCriticBrain(actor=actor,
                                           critic=critic,
                                           conf=brain_conf,
                                           critic_target=critic_target,
                                           critic2=critic2,
                                           critic2_target=critic2_target)
    self.buffer = NstepInputMemory(**buffer_conf.memory_conf)
def __init__(self, conf, network_conf, brain_conf, buffer_conf, use_attention=True, use_hierarchical_actor=False):
    """Assemble a plain policy-gradient agent: one actor network, a
    ``MultiStepPolicyGradientBrain``, and an N-step replay buffer.

    Args:
        conf: agent-level config (stored; not otherwise read here).
        network_conf: config for the ``MultiStepInputActor``.
        brain_conf: forwarded to both the base class and the brain.
        buffer_conf: forwarded to the base class; ``buffer_conf.memory_conf``
            is expanded into ``NstepInputMemory`` keyword arguments.
        use_attention, use_hierarchical_actor: forwarded to the actor network.
    """
    # NOTE(review): super() names MultiStepActorCriticAgent while the brain is a
    # MultiStepPolicyGradientBrain — presumably this class subclasses the
    # actor-critic agent; confirm against the (not visible) class header.
    super(MultiStepActorCriticAgent, self).__init__(brain_conf=brain_conf,
                                                    buffer_conf=buffer_conf)
    self.conf = conf
    policy_net = MultiStepInputActor(network_conf,
                                     use_attention=use_attention,
                                     use_hierarchical_actor=use_hierarchical_actor)
    self.brain = MultiStepPolicyGradientBrain(actor=policy_net, conf=brain_conf)
    self.buffer = NstepInputMemory(**buffer_conf.memory_conf)
class QmixAgent(torch.nn.Module):
    """QMIX agent over DGL graphs.

    Owns the per-unit Q-network and mixing network (plus optional target and
    second copies), a ``QMixBrain`` that implements the learning rule, and an
    N-step replay buffer. Action selection and fitting operate on graph-encoded
    StarCraft II-style observations.
    """

    def __init__(self, conf, qnet_conf, mixer_gnn_conf, mixer_ff_conf, brain_conf, buffer_conf):
        """Build networks, brain, and buffer from the given configs.

        Args:
            conf: agent-level config; reads ``conf.agent_conf['use_target']``
                and ``conf.agent_conf['use_clipped_q']`` here, and
                ``conf.fit_conf`` in :meth:`fit`.
            qnet_conf: config for ``MultiStepInputQnet``.
            mixer_gnn_conf, mixer_ff_conf: configs for ``QMixer``.
            brain_conf: forwarded to ``QMixBrain``.
            buffer_conf: ``buffer_conf.memory_conf`` is expanded into
                ``NstepInputMemory`` keyword arguments.
        """
        super(QmixAgent, self).__init__()
        self.conf = conf
        qnet = MultiStepInputQnet(qnet_conf)
        mixer = QMixer(mixer_gnn_conf, mixer_ff_conf)
        # Target copies are instantiated only when requested by config.
        if self.conf.agent_conf['use_target']:
            qnet_target = MultiStepInputQnet(qnet_conf)
            mixer_target = QMixer(mixer_gnn_conf, mixer_ff_conf)
        else:
            qnet_target = None
            mixer_target = None
        # Second Q-network/mixer pair for clipped double-Q style updates.
        if self.conf.agent_conf['use_clipped_q']:
            qnet2 = MultiStepInputQnet(qnet_conf)
            mixer2 = QMixer(mixer_gnn_conf, mixer_ff_conf)
        else:
            qnet2 = None
            mixer2 = None
        self.brain = QMixBrain(conf=brain_conf,
                               qnet=qnet,
                               mixer=mixer,
                               qnet_target=qnet_target,
                               mixer_target=mixer_target,
                               qnet2=qnet2,
                               mixer2=mixer2)
        self.buffer = NstepInputMemory(**buffer_conf.memory_conf)

    def get_action(self, hist_graph, curr_graph, tag2unit_dict):
        """Choose actions for a single current graph given a history batch.

        Args:
            hist_graph: batched DGL graph of the last ``batch_size`` history
                steps; its batch size is used as the number of time steps.
            curr_graph: a single (non-batched) ``dgl.DGLGraph`` observation.
            tag2unit_dict: mapping from unit tags to game units, used to turn
                network actions into SC2 actions.

        Returns:
            ``(nn_actions, sc2_actions, info_dict)``.
        """
        assert isinstance(
            curr_graph,
            dgl.DGLGraph), "get action is designed to work on a single graph!"
        num_time_steps = hist_graph.batch_size
        # Pop node features so the brain receives graphs and features
        # separately; the features are restored before returning.
        hist_node_feature = hist_graph.ndata.pop('node_feature')
        curr_node_feature = curr_graph.ndata.pop('node_feature')
        maximum_num_enemy = get_largest_number_of_enemy_nodes([curr_graph])
        nn_actions, info_dict = self.brain.get_action(
            num_time_steps, hist_graph, hist_node_feature, curr_graph,
            curr_node_feature, maximum_num_enemy)
        ally_tags = info_dict['ally_tags']
        enemy_tags = info_dict['enemy_tags']
        sc2_actions = nn_action_to_sc2_action(nn_actions=nn_actions,
                                              ally_tags=ally_tags,
                                              enemy_tags=enemy_tags,
                                              tag2unit_dict=tag2unit_dict)
        # Restore the popped features so the caller's graphs are unchanged.
        hist_graph.ndata['node_feature'] = hist_node_feature
        curr_graph.ndata['node_feature'] = curr_node_feature
        return nn_actions, sc2_actions, info_dict

    def sample_noise(self):
        """Resample noise on every NoisyLinear submodule (noisy exploration)."""
        for m in self.modules():
            if isinstance(m, NoisyLinear):
                m.sample_noise()

    def remove_noise(self):
        """Remove noise from every NoisyLinear submodule (deterministic eval)."""
        for m in self.modules():
            if isinstance(m, NoisyLinear):
                m.remove_noise()

    def fit(self, device='cpu'):
        """Sample a batch from the replay buffer and run one brain update.

        The prefix 'c' indicates *current*-time-step inputs and the prefix 'n'
        indicates *next*-time-step inputs.

        Expected specs (bs = batch_size, nt = hist_num_time_steps):
            'h_graph' = list of graph lists
                [[g_(0,0), g_(0,1), ..., g_(0,nt)],
                 [g_(1,0), g_(1,1), ..., g_(1,nt)],
                 ...,
                 [g_(bs,0), ..., g_(bs,nt)]]
            'graph' = list of graphs [g_(0), g_(1), ..., g_(bs)]

        Args:
            device: 'cpu' or anything else, in which case every tensor/graph
                is moved to CUDA.  # NOTE(review): any non-'cpu' value maps to
                # the default 'cuda' device — confirm multi-GPU is not expected.

        Returns:
            The dict returned by ``self.brain.fit``.
        """
        fit_conf = self.conf.fit_conf
        batch_size = fit_conf['batch_size']
        hist_num_time_steps = fit_conf['hist_num_time_steps']
        c_h_graph, c_graph, actions, rewards, n_h_graph, n_graph, dones = self.buffer.sample(
            batch_size)
        c_maximum_num_enemy = get_largest_number_of_enemy_nodes(c_graph)
        n_maximum_num_enemy = get_largest_number_of_enemy_nodes(n_graph)
        # Batching graphs: flatten the per-sample history lists, then batch.
        list_c_h_graph = [g for L in c_h_graph for g in L]
        list_n_h_graph = [g for L in n_h_graph for g in L]
        c_hist_graph = dgl.batch(list_c_h_graph)
        n_hist_graph = dgl.batch(list_n_h_graph)
        c_curr_graph = dgl.batch(c_graph)
        n_curr_graph = dgl.batch(n_graph)
        # Cast actions to one long tensor.
        actions = torch.cat(actions).long()
        # Prepare rewards.
        rewards = torch.Tensor(rewards)
        # Prepare dones.
        dones = torch.Tensor(dones)
        if device != 'cpu':
            # NOTE(review): graph .to(...) results are not reassigned — this
            # relies on DGL moving graphs in place; newer DGL versions return a
            # new graph instead. Confirm against the pinned DGL version.
            c_hist_graph.to(torch.device('cuda'))
            n_hist_graph.to(torch.device('cuda'))
            c_curr_graph.to(torch.device('cuda'))
            n_curr_graph.to(torch.device('cuda'))
            actions = actions.to(torch.device('cuda'))
            rewards = rewards.to(torch.device('cuda'))
            dones = dones.to(torch.device('cuda'))
        # Pop features so the brain receives graphs and features separately.
        c_hist_feature = c_hist_graph.ndata.pop('node_feature')
        c_curr_feature = c_curr_graph.ndata.pop('node_feature')
        n_hist_feature = n_hist_graph.ndata.pop('node_feature')
        n_curr_feature = n_curr_graph.ndata.pop('node_feature')
        fit_return_dict = self.brain.fit(
            num_time_steps=hist_num_time_steps,
            c_hist_graph=c_hist_graph,
            c_hist_feature=c_hist_feature,
            c_curr_graph=c_curr_graph,
            c_curr_feature=c_curr_feature,
            c_maximum_num_enemy=c_maximum_num_enemy,
            n_hist_graph=n_hist_graph,
            n_hist_feature=n_hist_feature,
            n_curr_graph=n_curr_graph,
            n_curr_feature=n_curr_feature,
            n_maximum_num_enemy=n_maximum_num_enemy,
            actions=actions,
            rewards=rewards,
            dones=dones)
        return fit_return_dict
# --- Script fragment: build brain, experience spec, buffer, and agent, then
# --- start the rollout loop. The loop body is truncated in this chunk.
brain_hyper_param = get_hyper_param_dict()
brain = MultiStepSharedActorCriticBrain(actor_critic=actor_critic,
                                        hist_encoder=hist_encoder,
                                        curr_encoder=curr_encoder,
                                        hyper_params=brain_hyper_param)
# Experience tuple: only the last four fields get defaults ('state' must
# always be supplied explicitly).
# NOTE(review): namedtuple defaults are evaluated once, here — every instance
# built without explicit args shares these same four list objects; confirm
# the buffer never mutates them in place.
sample_spec = namedtuple(
    'exp_args', ["state", "action", "reward", "next_state", "done"],
    defaults=tuple([list() for _ in range(4)]))
num_hist_steps = 5  # history window length shared by buffer and history manager
buffer = NstepInputMemory(N=num_hist_steps,
                          max_n_episodes=100,
                          spec=sample_spec,
                          gamma=1.0,
                          max_traj_len=40)
# NOTE(review): this call passes brain/buffer directly, unlike the config-based
# __init__ signatures elsewhere in this file — presumably a different agent
# version; confirm which constructor is current.
agent = MultiStepActorCriticAgent(brain=brain, buffer=buffer)
init_graph = env.observe()['g']
history_manager = HistoryManager(n_hist_steps=num_hist_steps,
                                 init_graph=init_graph)
done_cnt = 0
iters = 0
while True:
    # print("Itertation : {} ".format(iters))
    curr_state_dict = env.observe()
    hist_graph = history_manager.get_hist()
    # NOTE(review): the remainder of this loop body is outside this chunk.
class MultiStepActorCriticAgent(AgentBase):
    """Actor-critic agent over DGL graph observations.

    Owns the actor and critic networks (plus optional target and double-Q
    copies), a ``MultiStepActorCriticBrain`` implementing the learning rule,
    and an N-step replay buffer.
    """

    def __init__(self, conf, network_conf, brain_conf, buffer_conf, use_attention=True, use_hierarchical_actor=False):
        """Build networks, brain, and buffer from the given configs.

        Args:
            conf: agent-level config; reads ``conf.module_conf['use_target']``
                and ``conf.module_conf['use_double_q']`` here, and
                ``conf.fit_conf`` in :meth:`fit`.
            network_conf: shared config for every ``MultiStepInputActor``
                (the critic reuses the actor's network class).
            brain_conf: forwarded to the base class and the brain.
            buffer_conf: forwarded to the base class;
                ``buffer_conf.memory_conf`` is expanded into
                ``NstepInputMemory`` keyword arguments.
            use_attention, use_hierarchical_actor: forwarded to each network.
        """
        super(MultiStepActorCriticAgent, self).__init__(brain_conf=brain_conf,
                                                        buffer_conf=buffer_conf)
        self.conf = conf
        actor = MultiStepInputActor(
            network_conf,
            use_attention=use_attention,
            use_hierarchical_actor=use_hierarchical_actor)
        critic = MultiStepInputActor(
            network_conf,
            use_attention=use_attention,
            use_hierarchical_actor=use_hierarchical_actor)
        # Target critic only when requested by config.
        if self.conf.module_conf['use_target']:
            critic_target = MultiStepInputActor(
                network_conf,
                use_attention=use_attention,
                use_hierarchical_actor=use_hierarchical_actor)
        else:
            critic_target = None
        # Second critic (and its target) for double-Q updates.
        if self.conf.module_conf['use_double_q']:
            critic2 = MultiStepInputActor(
                network_conf,
                use_attention=use_attention,
                use_hierarchical_actor=use_hierarchical_actor)
            critic2_target = MultiStepInputActor(
                network_conf,
                use_attention=use_attention,
                use_hierarchical_actor=use_hierarchical_actor)
        else:
            critic2 = None
            critic2_target = None
        self.brain = MultiStepActorCriticBrain(actor=actor,
                                               critic=critic,
                                               conf=brain_conf,
                                               critic_target=critic_target,
                                               critic2=critic2,
                                               critic2_target=critic2_target)
        self.buffer = NstepInputMemory(**buffer_conf.memory_conf)

    def get_action(self, hist_graph, curr_graph, tag2unit_dict):
        """Choose actions for a single current graph given a history batch.

        Args:
            hist_graph: batched DGL graph of history steps; its batch size is
                used as the number of time steps.
            curr_graph: a single (non-batched) ``dgl.DGLGraph`` observation.
            tag2unit_dict: mapping from unit tags to game units.

        Returns:
            ``(nn_actions, sc2_actions)``.
        """
        assert isinstance(
            curr_graph,
            dgl.DGLGraph), "get action is designed to work on a single graph!"
        num_time_steps = hist_graph.batch_size
        # Pop features so the brain receives graphs and features separately;
        # they are restored before returning.
        hist_node_feature = hist_graph.ndata.pop('node_feature')
        curr_node_feature = curr_graph.ndata.pop('node_feature')
        maximum_num_enemy = get_largest_number_of_enemy_nodes([curr_graph])
        nn_actions, info_dict = self.brain.get_action(
            num_time_steps, hist_graph, hist_node_feature, curr_graph,
            curr_node_feature, maximum_num_enemy)
        ally_tags = info_dict['ally_tags']
        enemy_tags = info_dict['enemy_tags']
        sc2_actions = nn_action_to_sc2_action(nn_actions=nn_actions,
                                              ally_tags=ally_tags,
                                              enemy_tags=enemy_tags,
                                              tag2unit_dict=tag2unit_dict)
        hist_graph.ndata['node_feature'] = hist_node_feature
        curr_graph.ndata['node_feature'] = curr_node_feature
        return nn_actions, sc2_actions

    def forward(self, *args, **kwargs):
        # nn.Module hook is intentionally a no-op; use get_action/fit instead.
        return None

    def fit(self, device='cpu'):
        """Sample a batch from the replay buffer and run one brain update.

        The prefix 'c' indicates *current*-time-step inputs and the prefix 'n'
        indicates *next*-time-step inputs.

        Expected specs (bs = batch_size, nt = hist_num_time_steps):
            'h_graph' = list of graph lists
                [[g_(0,0), g_(0,1), ..., g_(0,nt)],
                 [g_(1,0), g_(1,1), ..., g_(1,nt)],
                 ...,
                 [g_(bs,0), ..., g_(bs,nt)]]
            'graph' = list of graphs [g_(0), g_(1), ..., g_(bs)]

        Args:
            device: 'cpu' or anything else, in which case every tensor/graph
                is moved to CUDA.

        Returns:
            The dict returned by ``self.brain.fit``.
        """
        fit_conf = self.conf.fit_conf
        batch_size = fit_conf['batch_size']
        hist_num_time_steps = fit_conf['hist_num_time_steps']
        c_h_graph, c_graph, actions, rewards, n_h_graph, n_graph, dones = self.buffer.sample(
            batch_size)
        c_maximum_num_enemy = get_largest_number_of_enemy_nodes(c_graph)
        n_maximum_num_enemy = get_largest_number_of_enemy_nodes(n_graph)
        # Cast actions to one long tensor.
        actions = torch.cat(actions).long()
        # 'c_graph' is now a list of graphs; count ally units per sample so
        # scalar reward/done can be expanded to one entry per ally unit.
        c_ally_units = [
            len(get_filtered_node_index_by_type(graph, NODE_ALLY))
            for graph in c_graph
        ]
        c_ally_units = torch.Tensor(c_ally_units).long()
        # Prepare rewards: repeat each sample's reward once per ally unit.
        rewards = torch.Tensor(rewards)
        rewards = rewards.repeat_interleave(c_ally_units, dim=0)
        # Prepare dones the same way.
        dones = torch.Tensor(dones)
        dones = dones.repeat_interleave(c_ally_units, dim=0)
        # Batching graphs: flatten the per-sample history lists, then batch.
        list_c_h_graph = [g for L in c_h_graph for g in L]
        list_n_h_graph = [g for L in n_h_graph for g in L]
        c_hist_graph = dgl.batch(list_c_h_graph)
        n_hist_graph = dgl.batch(list_n_h_graph)
        c_curr_graph = dgl.batch(c_graph)
        n_curr_graph = dgl.batch(n_graph)
        if device != 'cpu':
            # NOTE(review): graph .to(...) results are not reassigned — this
            # relies on DGL moving graphs in place; newer DGL versions return
            # a new graph instead. Confirm against the pinned DGL version.
            c_hist_graph.to(torch.device('cuda'))
            n_hist_graph.to(torch.device('cuda'))
            c_curr_graph.to(torch.device('cuda'))
            n_curr_graph.to(torch.device('cuda'))
            actions = actions.to(torch.device('cuda'))
            rewards = rewards.to(torch.device('cuda'))
            dones = dones.to(torch.device('cuda'))
        # Pop features so the brain receives graphs and features separately.
        c_hist_feature = c_hist_graph.ndata.pop('node_feature')
        c_curr_feature = c_curr_graph.ndata.pop('node_feature')
        n_hist_feature = n_hist_graph.ndata.pop('node_feature')
        n_curr_feature = n_curr_graph.ndata.pop('node_feature')
        fit_return_dict = self.brain.fit(
            num_time_steps=hist_num_time_steps,
            c_hist_graph=c_hist_graph,
            c_hist_feature=c_hist_feature,
            c_curr_graph=c_curr_graph,
            c_curr_feature=c_curr_feature,
            c_maximum_num_enemy=c_maximum_num_enemy,
            n_hist_graph=n_hist_graph,
            n_hist_feature=n_hist_feature,
            n_curr_graph=n_curr_graph,
            n_curr_feature=n_curr_feature,
            n_maximum_num_enemy=n_maximum_num_enemy,
            actions=actions,
            rewards=rewards,
            dones=dones)
        return fit_return_dict