def main(): env = StarCraft2Env(map_name="8m") env_info = env.get_env_info() n_actions = env_info["n_actions"] n_agents = env_info["n_agents"] n_episodes = 10 for e in range(n_episodes): env.reset() terminated = False episode_reward = 0 while not terminated: obs = env.get_obs() state = env.get_state() actions = [] for agent_id in range(n_agents): avail_actions = env.get_avail_agent_actions(agent_id) avail_actions_ind = np.nonzero(avail_actions)[0] action = np.random.choice(avail_actions_ind) actions.append(action) reward, terminated, _ = env.step(actions) episode_reward += reward print("Total reward in episode {} = {}".format(e, episode_reward)) env.close()
def __init__(self, name="3m", **kwargs): gym.Env.__init__(self) self.seed = kwargs.pop('seed', None) self.reward_sparse = kwargs.get('reward_sparse', False) self.use_central_value = kwargs.pop('central_value', False) self.random_invalid_step = kwargs.pop('random_invalid_step', False) self.replay_save_freq = kwargs.pop('replay_save_freq', 10000) self.apply_agent_ids = kwargs.pop('apply_agent_ids', False) self.env = StarCraft2Env(map_name=name, seed=self.seed, **kwargs) self.env_info = self.env.get_env_info() self._game_num = 0 self.n_actions = self.env_info["n_actions"] self.n_agents = self.env_info["n_agents"] self.action_space = gym.spaces.Discrete(self.n_actions) one_hot_agents = 0 if self.apply_agent_ids: one_hot_agents = self.n_agents self.observation_space = gym.spaces.Box( low=0, high=1, shape=(self.env_info['obs_shape'] + one_hot_agents, ), dtype=np.float32) self.state_space = gym.spaces.Box( low=0, high=1, shape=(self.env_info['state_shape'], ), dtype=np.float32) self.obs_dict = {}
def __init__(self, env, args):
    self.env = env
    # Evaluate the algorithm on a sparse-reward environment:
    # win = 1, loss = -1, every other step = 0
    self.env_evaluate = StarCraft2Env(map_name=args.map,
                                      step_mul=args.step_mul,
                                      difficulty=args.difficulty,
                                      game_version=args.game_version,
                                      seed=args.seed,
                                      replay_dir=args.replay_dir,
                                      reward_sparse=True,
                                      reward_scale=False)

    if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication
        self.agents = CommAgents(args)
        self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        self.evaluateWorker = CommRolloutWorker(self.env_evaluate, self.agents, args)
    else:  # no communication
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)

    if args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
            and args.alg.find('reinforce') == -1:
        # COMA, central-V and REINFORCE are on-policy,
        # so only the other algorithms get a replay buffer
        self.buffer = ReplayBuffer(args)
    self.args = args

    # directory for saving the plt and pkl files
    self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
def heuristic_run(n_episodes, map_name, env_args):
    env_args['map_name'] = map_name
    pprint.pprint(env_args)
    env = StarCraft2Env(**env_args)
    wins = 0
    with trange(n_episodes) as t:
        for i in t:
            env.reset()
            terminated = False
            episode_reward = 0
            while not terminated:
                actions = []
                for agent_id in range(env.n_agents):
                    avail_actions = env.get_avail_agent_actions(agent_id)
                    avail_actions_ind = np.nonzero(avail_actions)[0]
                    action = np.random.choice(avail_actions_ind)
                    # _, haction_num = env.get_agent_action_heuristic(agent_id, action)
                    actions.append(action)
                reward, terminated, info = env.step(actions)
                # 'battle_won' only appears in info once a battle has ended
                wins += info.get('battle_won', 0)
            t.set_postfix(win_rate=wins / (i + 1.))
    env.close()
    print("\n")
    print("In {} episodes, heuristic ai wins {}; win rate is {}".format(
        n_episodes, wins, float(wins) / float(n_episodes)))
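# Example invocation of the function above. Any StarCraft2Env constructor
# kwargs can be supplied through env_args; "8m" and difficulty "7" are
# illustrative values, not fixed by the original code.
if __name__ == "__main__":
    heuristic_run(n_episodes=100, map_name="8m", env_args={"difficulty": "7"})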
def __init__(self, **smac_args): """Create a new multi-agent StarCraft env compatible with RLlib. Arguments: smac_args (dict): Arguments to pass to the underlying smac.env.starcraft.StarCraft2Env instance. Examples: >>> from smac_rllib import RLlibStarCraft2Env >>> env = RLlibStarCraft2Env(map_name="8m") >>> print(env.reset()) """ self._env = StarCraft2Env(**smac_args) self.horizon = self._env.episode_limit self.nbr_agents = self._env.n_agents self._ready_agents = [] self.observation_space = Dict({ "obs": Box(-1, 1, shape=(self.nbr_agents, self._env.get_obs_size(),)), "avail_actions": Box(0, 1, shape=(self.nbr_agents, self._env.get_total_actions(),)), "state": Box(-float('inf'), float('inf'), shape=(self._env.get_state_size(),)), "battle_won": Box(0,1, shape=(1,), dtype=np.bool), "dead_allies": Box(0,self.nbr_agents, shape=(1,), dtype=np.int), "dead_enemies": Box(0, int(1e3), shape=(1,), dtype=np.int) }) self.action_space = MultiDiscrete([self._env.get_total_actions()] * self.nbr_agents)
def __init__(self, env, args):
    self.env = env
    # Evaluate the algorithm on a sparse-reward environment:
    # win = 1, loss = -1, every other step = 0
    self.env_evaluate = StarCraft2Env(map_name=args.map,
                                      step_mul=args.step_mul,
                                      difficulty=args.difficulty,
                                      game_version=args.game_version,
                                      seed=args.seed,
                                      replay_dir=args.replay_dir,
                                      reward_sparse=True,
                                      reward_scale=False)

    if args.alg == 'commnet_coma':
        self.agents = CommNetAgents(args)
        self.rolloutWorker = CommNetRolloutWorker(env, self.agents, args)
        self.evaluateWorker = CommNetRolloutWorker(self.env_evaluate, self.agents, args)
    else:
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)

    if args.alg != 'coma' and args.alg != 'commnet_coma':
        self.buffer = ReplayBuffer(args)
    self.args = args

    # directory for saving the plt and pkl files
    self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
def create_environment(_):
    """Returns a starcraft environment."""
    task = FLAGS.task_name
    logging.info('Creating environment: %s', task)
    env = StarCraft2Env(map_name=task, replay_dir=FLAGS.replay_dir)
    return env_wrappers.SCWrapper(env)
def prepare_env_and_agents():
    # base path to save results in a dedicated directory every launch
    launch_time = datetime.datetime.now()
    save_path_base = pathlib.Path(RESULT_PATH_BASE)
    save_path_base = save_path_base / f'{MODEL_NAME}' \
                                      f'_d{launch_time:%Y_%m_%d}' \
                                      f'_t{launch_time:%H_%M_%S}'
    tb_path = save_path_base / 'tensorboard'
    tb_writer = tensorboardX.SummaryWriter(str(tb_path.resolve()))

    # derive the eps decay schedule and exploration length from N_EPISODE
    num_exploration_ep = int(N_EPISODE * EXPLORATION_FRAQ)
    save_freq = min(20, N_EPISODE // 15)
    eps_decay_eps = N_EPISODE * EPS_TIME_FRACTION

    # prepare env
    env = StarCraft2Env(map_name=MAP_NAME,
                        seed=SEED,
                        reward_only_positive=False,
                        obs_timestep_number=True,
                        reward_scale_rate=200)

    # prepare agents
    agents: List[Agent] = prepare_agents(
        env,
        eps_decay_eps,
    )  # tb_writer)
    return save_freq, agents, env, num_exploration_ep, save_path_base, tb_writer
def main():
    tb_writer = tensorboardX.SummaryWriter(RESULT_PATH_BASE + 'tensorboard')
    env = StarCraft2Env(map_name=MAP_NAME,
                        seed=SEED,
                        reward_only_positive=False,
                        obs_timestep_number=True,
                        reward_scale_rate=200)
    if PRESET == 'q_table':
        # env = StarCraft2Env(map_name="2m2zFOX", difficulty="1", seed=SEED)
        evaluator = evaluate.SCAbsPosEvaluator(env)
        toolbox = prepare_env(individuals.AgentwiseQTable, evaluator)
    elif PRESET == 'dqn':
        evaluator = evaluate.SCNativeEvaluator(env, tb_writer)
        toolbox = prepare_env(individuals.AgentwiseFullyConnected, evaluator)
    else:
        raise NotImplementedError(f'Preset {PRESET} for genetic learn is not available')

    pop = toolbox.population(n=POPULATION)
    hof = tools.HallOfFame(1)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("std", np.std)
    stats.register("min", np.min)
    stats.register("max", np.max)

    algorithms.eaSimple(pop, toolbox, cxpb=0.7, mutpb=0.2,
                        ngen=NUM_GENERATIONS, stats=stats, halloffame=hof)
    save_top_individual(hof)
    print(f'Num of evaluations (episodes): {evaluator.evaluation_counter}')
    if EVALUATE_TOP:
        print("results of evaluation of top individual")
        evaluator.evaluate_single(hof.items[0], 0)
    return pop, stats, hof
def __init__(self, name="3m", replay_save_freq=4000, **kwargs): gym.Env.__init__(self) self.seed = kwargs.pop('seed', None) self.reward_sparse = kwargs.pop('reward_sparse', False) self.env = StarCraft2Env(map_name=name, seed=self.seed, reward_sparse=self.reward_sparse) self.env_info = self.env.get_env_info() self.replay_save_freq = replay_save_freq self._game_num = 0 self.n_actions = self.env_info["n_actions"] self.n_agents = self.env_info["n_agents"] self.action_space = gym.spaces.Discrete(self.n_actions) self.observation_space = gym.spaces.Box( low=0, high=1, shape=(self.env_info['obs_shape'], ), dtype=np.float32) self.central_state_space = gym.spaces.Box( low=0, high=1, shape=(self.env_info['state_shape'], ), dtype=np.float32) self.random_invalid_step = kwargs.pop('random_invalid_step', False)
def __init__(self):
    self.rnnagent = RNNAgent()
    self.epsilon = config.epsilon_start
    # per-step epsilon decrement for linear annealing
    self.epsilon_deg = (config.epsilon_start - config.epsilon_finish) \
        / config.epsilon_anneal_time
    self.env = StarCraft2Env(map_name=config.map_name)
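# A sketch of the linear annealing step implied by epsilon_deg above: epsilon
# shrinks from epsilon_start to epsilon_finish over epsilon_anneal_time steps.
# (Hypothetical helper; the actual update lives elsewhere in the original code.)
def decay_epsilon(self):
    self.epsilon = max(config.epsilon_finish, self.epsilon - self.epsilon_deg)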
def __init__( self, map_name="8m", step_mul=None, move_amount=2, difficulty="7", game_version=None, seed=None, continuing_episode=False, obs_all_health=True, obs_own_health=True, obs_last_action=False, obs_pathing_grid=False, obs_terrain_height=False, obs_instead_of_state=False, state_last_action=True, reward_sparse=False, reward_only_positive=True, reward_death_value=10, reward_win=200, reward_defeat=0, reward_negative_scale=0.5, reward_scale=True, reward_scale_rate=20, replay_dir="", replay_prefix="", window_size_x=1920, window_size_y=1200, debug=False, ): self.env = StarCraft2Env(map_name=map_name, step_mul=step_mul, move_amount=move_amount, difficulty=difficulty, \ game_version=game_version, seed=seed, continuing_episode=continuing_episode, \ obs_all_health=obs_all_health, obs_own_health=obs_own_health, obs_last_action=obs_last_action, \ obs_pathing_grid=obs_pathing_grid, obs_terrain_height=obs_terrain_height, \ obs_instead_of_state=obs_instead_of_state, state_last_action=state_last_action, \ reward_sparse=reward_sparse, reward_only_positive=reward_only_positive, \ reward_death_value=reward_death_value, reward_win=reward_win, reward_defeat=reward_defeat, \ reward_negative_scale=reward_negative_scale, reward_scale=reward_scale, reward_scale_rate=reward_scale_rate, \ replay_dir=replay_dir, replay_prefix=replay_prefix, window_size_x=window_size_x, window_size_y=window_size_y, \ debug=debug) env_info = self.env.get_env_info() num_actions = env_info['n_actions'] self.n = env_info['n_agents'] self.state_shape = env_info['state_shape'] # Configure action space self.action_space = [] self.observation_space = [] for _ in range(self.n): self.action_space.append(spaces.Discrete(num_actions)) self.observation_space.append( spaces.Box(low=-1.0, high=1.0, shape=(self.env.get_obs_size(), ), dtype=np.float32)) self.state = None
def main(model_path):
    env = StarCraft2Env(map_name="8m",
                        window_size_x=1920 / 3,
                        window_size_y=1080 / 3)
    loaded_model = load_model(model_path)
    mean_wr = test_model(loaded_model, env, num_runs=50)
    return mean_wr
def init_env(self, env_info):
    logging.debug("init env with: {}".format(env_info))
    print(env_info)
    sys.stdout.flush()
    _info = env_info.copy()
    if "agent_num" in _info:
        _info.pop("agent_num")
    return StarCraft2Env(**_info)
def load_smac_env(env_config: Dict[str, Any]) -> StarCraft2Env:
    """Loads a smac environment given a config dict.

    Also, the possible agents in the environment are set."""
    env = StarCraft2Env(**env_config)
    env.possible_agents = list(range(env.n_agents))
    return env
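# Example usage of the loader above. The config keys mirror StarCraft2Env's
# constructor arguments; "3m" fields three allied units, so possible_agents
# becomes [0, 1, 2].
env = load_smac_env({"map_name": "3m", "difficulty": "7"})
print(env.possible_agents)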
def run(self):
    self.log = logging.getLogger('StarCraftII')
    message = "start rl algorithm"
    self.log.info(message)
    Signal.get_signal().emit_signal_str(message)
    env = StarCraft2Env(
        map_name="3m",
        window_size_x=1418,
        window_size_y=890,
        window_loc=(5, 155),
    )
    env_info = env.get_env_info()

    n_actions = env_info["n_actions"]
    n_agents = env_info["n_agents"]
    message = "n_actions : {}".format(n_actions)
    self.log.info(message)
    Signal.get_signal().emit_signal_str(message)
    message = "n_agents : {}".format(n_agents)
    self.log.info(message)
    Signal.get_signal().emit_signal_str(message)

    n_episodes = 100

    for e in range(n_episodes):
        env.reset()
        terminated = False
        episode_reward = 0
        if self.stop:
            break

        while not terminated:
            obs = env.get_obs()
            state = env.get_state()
            if globalInformation.get_value(strings.IS_STOP):
                self.stop = True
                break

            actions = []
            for agent_id in range(n_agents):
                avail_actions = env.get_avail_agent_actions(agent_id)
                avail_actions_ind = np.nonzero(avail_actions)[0]
                action = np.random.choice(avail_actions_ind)
                actions.append(action)

            reward, terminated, _ = env.step(actions)
            episode_reward += reward

        message = "Total reward in episode {} = {}".format(e, episode_reward)
        self.log.info(message)
        Signal.get_signal().emit_signal_str(message)

    env.close()
    Signal.get_signal().emit_signal_gameover()
def prepare_agents(env: StarCraft2Env, eps_decay_steps, tb_writer=None):
    env_info = env.get_env_info()
    n_agents = env_info['n_agents']
    agents: List[Agent] = []
    n_actions = env_info['n_actions']
    n_features = env.get_obs_size()
    for i in range(n_agents):
        agents.append(
            Agent(i, n_features, n_actions, eps_decay_steps,
                  LR, TARGET_UPDATE, MEMORY_SIZE,
                  batch_size=BATCH_SIZE,
                  tb_writer=tb_writer,
                  discount=DISCOUNT))
    return agents
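# Example: build one independent Agent per SMAC unit with the factory above.
# The constants (LR, TARGET_UPDATE, MEMORY_SIZE, BATCH_SIZE, DISCOUNT) come
# from the original module; eps_decay_steps=50_000 is an illustrative value.
env = StarCraft2Env(map_name="8m")
agents = prepare_agents(env, eps_decay_steps=50_000)
print(len(agents))  # one Agent per allied unit on the map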
def main():
    config = deepcopy(QMixConfig)
    env = StarCraft2Env(
        map_name=config['scenario'], difficulty=config['difficulty'])
    env = SC2EnvWrapper(env)
    config['episode_limit'] = env.episode_limit
    config['obs_shape'] = env.obs_shape
    config['state_shape'] = env.state_shape
    config['n_agents'] = env.n_agents
    config['n_actions'] = env.n_actions

    rpm = EpisodeReplayBuffer(config['replay_buffer_size'])
    agent_model = RNNModel(config)
    qmixer_model = QMixerModel(config)
    algorithm = QMIX(agent_model, qmixer_model, config)
    qmix_agent = QMixAgent(algorithm, config)

    while rpm.count < config['memory_warmup_size']:
        train_reward, train_step, train_is_win, train_loss, train_td_error \
            = run_train_episode(env, qmix_agent, rpm, config)

    total_steps = 0
    last_test_step = -1e10
    while total_steps < config['training_steps']:
        train_reward, train_step, train_is_win, train_loss, train_td_error \
            = run_train_episode(env, qmix_agent, rpm, config)
        total_steps += train_step

        if total_steps - last_test_step >= config['test_steps']:
            last_test_step = total_steps
            eval_is_win_buffer = []
            eval_reward_buffer = []
            eval_steps_buffer = []
            for _ in range(3):
                eval_reward, eval_step, eval_is_win = run_evaluate_episode(
                    env, qmix_agent)
                eval_reward_buffer.append(eval_reward)
                eval_steps_buffer.append(eval_step)
                eval_is_win_buffer.append(eval_is_win)

            summary.add_scalar('train_loss', train_loss, total_steps)
            summary.add_scalar('eval_reward', np.mean(eval_reward_buffer),
                               total_steps)
            summary.add_scalar('eval_steps', np.mean(eval_steps_buffer),
                               total_steps)
            summary.add_scalar('eval_win_rate', np.mean(eval_is_win_buffer),
                               total_steps)
            summary.add_scalar('exploration', qmix_agent.exploration,
                               total_steps)
            summary.add_scalar('replay_buffer_size', rpm.count, total_steps)
            summary.add_scalar('target_update_count',
                               qmix_agent.target_update_count, total_steps)
            summary.add_scalar('train_td_error', train_td_error, total_steps)
def find_best_model(model_path, model_num):
    args = get_common_args()
    if args.alg == 'coma':
        args = get_coma_args(args)
        rnn_suffix = 'rnn_params.pkl'
        critic_suffix = 'critic_params.pkl'
        policy = COMA
    elif args.alg == 'qmix':
        args = get_mixer_args(args)
        rnn_suffix = 'rnn_net_params.pkl'
        critic_suffix = 'qmix_net_params.pkl'
        policy = QMIX
    elif args.alg == 'vdn':
        args = get_mixer_args(args)
        rnn_suffix = 'rnn_net_params.pkl'
        critic_suffix = 'vdn_net_params.pkl'
        policy = VDN
    else:
        raise Exception("Not finished")

    env = StarCraft2Env(map_name=args.map,
                        step_mul=args.step_mul,
                        difficulty=args.difficulty,
                        game_version=args.game_version,
                        replay_dir=args.replay_dir)
    env_info = env.get_env_info()
    args.n_actions = env_info["n_actions"]
    args.n_agents = env_info["n_agents"]
    args.state_shape = env_info["state_shape"]
    args.obs_shape = env_info["obs_shape"]
    args.episode_limit = env_info["episode_limit"]
    args.evaluate_epoch = 100
    runner = Runner(env, args)

    max_win_rate = 0
    max_win_rate_idx = 0
    for num in range(model_num):
        critic_path = model_path + '/' + str(num) + '_' + critic_suffix
        rnn_path = model_path + '/' + str(num) + '_' + rnn_suffix
        if os.path.exists(critic_path) and os.path.exists(rnn_path):
            # temporarily rename the checkpoint so the policy loader finds it
            os.rename(critic_path, model_path + '/' + critic_suffix)
            os.rename(rnn_path, model_path + '/' + rnn_suffix)
            runner.agents.policy = policy(args)
            win_rate = runner.evaluate_sparse()
            if win_rate > max_win_rate:
                max_win_rate = win_rate
                max_win_rate_idx = num
            # restore the original checkpoint names
            os.rename(model_path + '/' + critic_suffix, critic_path)
            os.rename(model_path + '/' + rnn_suffix, rnn_path)
            print('The win rate of {} is {}'.format(num, win_rate))

    print('The max win rate is {}, model index is {}'.format(
        max_win_rate, max_win_rate_idx))
def main():
    args = parser.parse_args()
    env = StarCraft2Env(map_name="15z3m_drm",
                        seed=SEED,
                        reward_only_positive=False,
                        obs_timestep_number=True,
                        reward_scale_rate=200,
                        realtime=False)
    evaluator = evaluate.SCNativeEvaluator(env)
    top_individual = read_last_individual(args, evaluator)
    with torch.no_grad():
        evaluator.evaluate_single(top_individual, 50)
    return 0
def __init__(self, **smac_args): """Create a new multi-agent StarCraft env compatible with RLlib. Arguments: smac_args (dict): Arguments to pass to the underlying smac.env.starcraft.StarCraft2Env instance. Examples: >>> from smac.examples.rllib import RLlibStarCraft2Env >>> env = RLlibStarCraft2Env(map_name="8m") >>> print(env.reset()) """ self._env = StarCraft2Env(**smac_args) self._ready_agents = [] self.observation_space = Dict({ "obs": Box(-1, 1, shape=(self._env.get_obs_size(),)), "action_mask": Box(0, 1, shape=(self._env.get_total_actions(),)), }) self.action_space = Discrete(self._env.get_total_actions())
def main():
    random.seed(42)
    np.random.seed(42)
    fnames = load_latest_q_table()
    if fnames:
        top_individual = individuals.AgentwiseQTable.load(fnames[0])
    else:
        raise FileNotFoundError("Found no individuals")
    random.seed(42)
    np.random.seed(42)
    env = StarCraft2Env(map_name="2m2zFOX", difficulty="1", seed=42,
                        realtime=False)
    evaluator = evaluate.SCAbsPosEvaluator(env)
    evaluator.evaluate_single(top_individual)
    return 0
def __init__(self, env, args):
    self.env = env
    self.agents = Agents(args)
    self.rolloutWorker = RolloutWorker(env, self.agents, args)
    self.buffer = ReplayBuffer(args)
    self.args = args
    self.epsilon = args.epsilon

    # Evaluate the algorithm on a sparse-reward environment:
    # win = 1, loss = -1, every other step = 0
    self.env_evaluate = StarCraft2Env(map_name=args.map,
                                      step_mul=args.step_mul,
                                      difficulty=args.difficulty,
                                      game_version=args.game_version,
                                      seed=args.seed,
                                      replay_dir=args.replay_dir,
                                      reward_sparse=True,
                                      reward_scale=False)
    self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)
def run(self):
    self.log = logging.getLogger('StarCraftII')
    for i in range(8):
        args = get_common_args()
        map_name = globalInformation.get_value(strings.TYPE_MAP)
        alg_name = globalInformation.get_value(strings.TYPE_POLICY)
        if map_name is not None:
            args.map = map_name
        if alg_name is not None:
            args.alg = alg_name
        args.evaluate_epoch = 100
        if args.alg == 'coma':
            args = get_coma_args(args)
        elif args.alg == 'commnet_coma':
            args = get_commnet_args(args)
        else:
            args = get_mixer_args(args)

        env = StarCraft2Env(map_name=args.map,
                            step_mul=args.step_mul,
                            difficulty=args.difficulty,
                            game_version=args.game_version,
                            replay_dir=args.replay_dir,
                            window_size_x=1418,
                            window_size_y=890,
                            window_loc=(5, 155))
        env_info = env.get_env_info()
        args.n_actions = env_info["n_actions"]
        args.n_agents = env_info["n_agents"]
        args.state_shape = env_info["state_shape"]
        args.obs_shape = env_info["obs_shape"]
        args.episode_limit = env_info["episode_limit"]
        runner = Runner(env, args)
        if args.learn:
            runner.run(i)
        else:
            win_rate = runner.evaluate_sparse()
            message = 'The win rate of {} is {}'.format(args.alg, win_rate)
            self.log.info(message)
            Signal.get_signal().emit_signal_str(message)
            break
        env.close()
def __init__(self, map_name, seed=123, step_mul=8, difficulty='7',
             game_version=None, replay_dir=""):
    self.env = StarCraft2Env(map_name=map_name,
                             step_mul=step_mul,
                             seed=seed,  # forward the constructor argument
                             difficulty=difficulty,
                             game_version=game_version,
                             replay_dir=replay_dir)
    env_info = self.env.get_env_info()
    self.observation_space = [
        env_info["obs_shape"] for i in range(env_info["n_agents"])
    ]
    self.action_space = [
        env_info["n_actions"] for i in range(env_info["n_agents"])
    ]
    self.agent_types = ['agent' for i in range(env_info["n_agents"])]
def make_environment(
    evaluation: bool = False,
    map_name: str = "3m",
    random_seed: Optional[int] = None,
    **kwargs: Any,
) -> dm_env.Environment:
    """Wraps a StarCraft II environment.

    Args:
        map_name: str, name of the micromanagement level.

    Returns:
        A StarCraft II SMAC environment wrapped as a DeepMind environment.
    """
    del evaluation

    env = StarCraft2Env(map_name=map_name, seed=random_seed, **kwargs)

    # wrap the StarCraft II environment
    environment = SMACEnvWrapper(env)

    return environment
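# Example: construct the wrapped environment and step it through the standard
# dm_env interface (observation_spec and reset are part of dm_env.Environment;
# SMACEnvWrapper itself is defined elsewhere in the original codebase):
environment = make_environment(map_name="3m", random_seed=42)
print(environment.observation_spec())
timestep = environment.reset()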
def get_env(arg):
    if arg.map == 'matrix_2':  # 210000
        return Matrix_game2Env()
    elif arg.map == 'matrix_3':
        return Matrix_game3Env(n_agents=2, n_actions=3, episode_limit=1,
                               obs_last_action=False, state_last_action=False,
                               print_rew=False, is_print=False)
    elif 'mmdp-' in arg.map:
        length = int(re.findall(r'\d+\.\d+|\d+', arg.map)[-1])
        return uni_mmdp_Env(episode_limit=length)
    elif arg.map == 'go_orderly':
        return EnvGoOrderly(map_size=6, num_agent=3)
    else:
        # set up the environment; these are also the defaults used in pymarl
        return StarCraft2Env(map_name=arg.map,
                             difficulty=arg.difficulty,
                             step_mul=arg.step_mul,
                             replay_dir=arg.replay_dir)
def main(): env = StarCraft2Env(map_name="8m", window_size_x=1920/3, window_size_y=1080/3) env_info = env.get_env_info() n_actions = env_info["n_actions"] n_agents = env_info["n_agents"] n_episodes = 30 for e in range(n_episodes): env.reset() terminated = False episode_reward = 0 while not terminated: obs = env.get_obs() state = env.get_state() actions = [] for agent_id in range(n_agents): avail_actions = env.get_avail_agent_actions(agent_id) avail_actions_ind = np.nonzero(avail_actions)[0] action = np.random.choice(avail_actions_ind) actions.append(action) reward, terminated, info = env.step(actions) if terminated: won = True if info['battle_won'] else False print("Battle result : {}".format(won)) episode_reward += reward print("Total reward in episode {} = {}".format(e, episode_reward)) env.close()
def main(): """The StarCraft II environment for decentralised multi-agent micromanagement scenarios.""" '''difficulty ="1" is VeryEasy''' #replay_dir="D:\StarCraft II\Replays\smacfox" env = StarCraft2Env(map_name="2m2zFOX", difficulty="1") '''env_info= {'state_shape': 48, 'obs_shape': 30, 'n_actions': 9, 'n_agents': 3, 'episode_limit': 60}''' env_info = env.get_env_info() #print("env_info = ", env_info) """Returns the size of the observation.""" """obssize = 10""" """obs= [array([ 1. , 1. , 1. , 1. , 1. , 0.63521415, 0.63517255, -0.00726997, 0.06666667, 0.06666667], dtype=float32)]""" obssize = env.get_obs_size() #print("obssize = ", obssize) ###################################################################### """ ready_agents = [] #observation_space= Dict(action_mask:Box(9,), obs:Box(30,)) observation_space = Dict({ "obs": Box(-1, 1, shape=(env.get_obs_size())), "action_mask": Box(0, 1, shape=(env.get_total_actions())) }) #print ("observation_space=", observation_space) #action_space= Discrete(9) action_space = Discrete(env.get_total_actions()) #print ("action_space=", action_space) """ ######################################################################## n_actions = env_info["n_actions"] #print ("n_actions=", n_actions) n_agents = env_info["n_agents"] n_episodes = 100 # количество эпизодов lapan = 20 alpha = 0.5 #learning rate sayon - 0.5 больш - 0.9 Lapan = 0.2 gamma = 0.9 #discount factor sayon - 0.9 больш - 0.5 lapan = 0.9 epsilon = 0.7 #e-greedy sayon - 0.3 больш - 0.7 lapan = = 1.0 (100% random actions) bonusrewardsize = 10 # for fire - action 6 n_statesFox = 16 # количество состояний нашего мира-сетки #n_statesFox1 = 16 # количество состояний нашего мира-сетки n_actionsFox = 7 # вводим свое количество действий, которые понадобятся Q_table = np.zeros([n_agents, n_statesFox, n_actions]) #задаем пустую q таблицу #Q_table1 = np.zeros([n_statesFox1, n_actionsFox]) #Q_table = np.zeros([32, n_actions]) #print (Q_table) for e in range(n_episodes): #print("n_episode = ", e) """Reset the environment. Required after each full episode.Returns initial observations and states.""" env.reset() ''' Battle is over terminated = True''' terminated = False episode_reward = 0 #n_steps = 1 #пока не берем это количество шагов для уменьгения награды за долгий поиск """ # вывод в файл fileobj = open("файл.txt", "wt") print("text",file=fileobj) fileobj.close() """ #динамический epsilon - только при большом количестве эпизодов имеет смысл!!! 
if e % 15 == 0: epsilon += (1 - epsilon) * 10 / n_episodes print("epsilon = ", epsilon) #stoprun = [0,0,0,0,0] while not terminated: """Returns observation for agent_id.""" obs = env.get_obs() #print ("obs=", obs) """Returns the global state.""" #state = env.get_state() actions = [] action = 0 bonusreward = np.zeros([n_agents]) stateFox = np.zeros([n_agents]) '''agent_id= 0, agent_id= 1''' for agent_id in range(n_agents): #получаем характеристики юнита unit = env.get_unit_by_id(agent_id) #получаем состояние по координатам юнита stateFox[agent_id] = get_stateFox(agent_id, unit.pos.x, unit.pos.y) #print ("agent_id =", agent_id) #print ("stateFox[agent_id] =", stateFox[agent_id]) ''' tag = unit.tag #много разных характеристик юнита x = unit.pos.x y = unit.pos.y ''' """Returns the available actions for agent_id.""" """avail_actions= [0, 1, 1, 1, 1, 1, 0, 0, 0]""" avail_actions = env.get_avail_agent_actions(agent_id) '''Функция nonzero() возвращает индексы ненулевых элементов массива.''' """avail_actions_ind of agent_id == 0: [1 2 3 4 5]""" avail_actions_ind = np.nonzero(avail_actions)[0] # выбираем действие action = select_actionFox(agent_id, stateFox[agent_id], avail_actions_ind, n_actionsFox, epsilon, Q_table) if action == 6 or action == 7: bonusreward[agent_id] += bonusrewardsize #print ('bonusreward[agent_id]=', bonusreward[agent_id]) #собираем действия от разных агентов actions.append(action) ###############_Бежим вправо и стреляем_################################ """ if is_possible_action(avail_actions_ind, 6) == True: action = 6 else: if is_possible_action(avail_actions_ind, 4) == True: action = 4 else: action = np.random.choice(avail_actions_ind) #Случайная выборка из значений заданного одномерного массива """ ##################################################################### """Функция append() добавляет элементы в конец массива.""" #print("agent_id=",agent_id,"avail_actions_ind=", avail_actions_ind, "action = ", action, "actions = ", actions) #f.write(agent_id) #f.write(avail_actions_ind) #собираем действия от разных агентов #actions.append(action) #как узнать куда стрелять? в определенного человека? #как узнать что делают другие агенты? самому создавать для них глобальное состояние #раз я ими управляю? """A single environment step. Returns reward, terminated, info.""" reward, terminated, _ = env.step(actions) #print ('actions=', actions) #print ('bonusreward[0]=', bonusreward[0]) #print ('bonusreward[1]=', bonusreward[1]) reward += (bonusreward[0] + bonusreward[1]) episode_reward += reward ###################_Обучаем_############################################## for agent_id in range(n_agents): #получаем характеристики юнита unit = env.get_unit_by_id(agent_id) #получаем состояние по координатам юнита stateFox_next = get_stateFox(agent_id, unit.pos.x, unit.pos.y) stateFoxint = int(stateFox[agent_id]) Q_table[agent_id, stateFoxint, action] = Q_table[agent_id, stateFoxint, action] + alpha * \ (reward + gamma * np.max(Q_table[agent_id, stateFox_next, :]) - Q_table[agent_id, stateFoxint, action]) ########################################################################## #Total reward in episode 4 = 20.0 print("Total reward in episode {} = {}".format(e, episode_reward)) #get_stats()= {'battles_won': 2, 'battles_game': 5, 'battles_draw': 0, 'win_rate': 0.4, 'timeouts': 0, 'restarts': 0} print("get_stats()=", env.get_stats()) #env.save_replay() """Save a replay.""" """"Close StarCraft II.""" "" env.close() print(Q_table) with open("se20.pkl", 'wb') as f: pickle.dump(Q_table, f)
def __init__(self, max_cycles, **smac_args):
    EzPickle.__init__(self, max_cycles, **smac_args)
    env = StarCraft2Env(**smac_args)
    super().__init__(env, max_cycles)