Code Example #1
    def __init__(self, **kwargs):  # batch_size=None,
        # Unpack arguments from sacred
        args = kwargs.get('env_args', kwargs)
        if isinstance(args, dict):
            args = convert(args)

        # Define the agents
        self.n_agents = 2

        self.episode_limit = args.steps
        # print(args)
        if getattr(args, 'state_type', None):
            print('args.state_type', args.state_type)
            self.state_type = args.state_type
        else:
            self.state_type = 'obs'

        # print('self.state_type', self.state_type)
        # Define the internal state
        self.steps = 0

        r_matrix = [[1, 1], [1, 1]]
        self.payoff_values = [r_matrix for _ in range(self.episode_limit)]
        self.final_step_diff = [[1, 1], [1, 4]]

        self.branches = 4
        self.branch = 0
        self.state_num = self.branches * (self.episode_limit + 1)

        self.n_actions = len(self.payoff_values[0])

        self.good_branches = args.good_branches
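
All of the constructors on this page pass their argument dict through a convert helper so that settings can be read via attribute access (args.steps, args.good_branches, ...). The helper itself is not shown here; a minimal sketch that satisfies this usage, assuming a flat dict whose keys are valid identifiers (pymarl-style dict2namedtuple utilities work this way), is:

from collections import namedtuple

def convert(dictionary):
    # Hypothetical stand-in for the `convert` imported by the snippets on this page:
    # build a read-only, attribute-accessible view of a flat config dict.
    return namedtuple("GenericDict", dictionary.keys())(**dictionary)

The getattr(args, "seed", None)-style lookups in the later examples work equally well against this namedtuple or a plain types.SimpleNamespace.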
Code Example #2
    def __init__(self, batch_size=None, **kwargs):
        # Unpack arguments from sacred
        args = kwargs["env_args"]
        if isinstance(args, dict):
            args = convert(args)
        self.args = args

        if getattr(args, "seed", None) is not None:
            self.seed = args.seed
            self.rs = np.random.RandomState(self.seed) # initialise numpy random state
Code Example #3
    def __init__(self, args):

        if isinstance(args, dict):
            args = convert(args)

        if isinstance(args.env_args, dict):
            args.env_args = convert(args.env_args)

        self.args = args

        # Unpack arguments from sacred

        self.device = self.args.device

        self.bs = self.args.batch_size_run

        # Randomness is handled differently here than in the rest of the code: this environment keeps its own NumPy generator.
        self.rng = default_rng(seed=self.args.env_args.seed)
        self.extra_action = self.args.env_args.extra_action

        # Define the agents
        self.n_players = 2
        self.n_agents = 2
        if self.extra_action:
            self.size_A = 3
        else:
            self.size_A = 2

        self.n_obs = self.size_A

        self.n_actions = self.size_A

        self.r_success = self.args.env_args.r_success
        self.r_failure = self.args.env_args.r_failure

        self.n_comm_steps = self.args.env_args.n_comm_steps

        self.obs_size = 2

        self.state_size = 5

        self.episode_limit = self.args.env_args.episode_limit
Code Example #4
    def __init__(self, batch_size=None, **kwargs):
        # Unpack arguments from sacred
        args = kwargs["env_args"]
        if isinstance(args, dict):
            args = convert(args)

        # Define the agents and actions
        self.n_agents = 2
        self.n_actions = 3
        self.episode_limit = 1

        self.payoff_matrix = np.array([[8, -12, -12],
                                       [-12, 0, 0],
                                       [-12, 0, 0]])

        self.state = np.ones(5)
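
The constructor above only stores the 3x3 payoff matrix of a one-shot matrix game (episode_limit = 1). The reward presumably comes from indexing that matrix with the two agents' joint action; a hedged sketch of a matching step method (name and return convention are assumptions modelled on the other environments here, not the original code):

    def step(self, actions):
        # Both agents receive the same payoff for their joint action.
        reward = self.payoff_matrix[actions[0], actions[1]]
        terminated = True  # one-shot game: episode_limit == 1
        return reward, terminated, {}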
Code Example #5
    def __init__(self, batch_size=None, **kwargs):
        # Unpack arguments from sacred
        args = kwargs["env_args"]
        if isinstance(args, dict):
            args = convert(args)

        # Define the agents
        self.n_agents = 2

        self.episode_limit = args.steps

        # Define the internal state
        self.steps = 0

        r_matrix = [[1, 1], [1, 1]]
        self.payoff_values = [r_matrix for _ in range(self.episode_limit)]
        self.final_step_diff = [[1, 1], [1, 4]]

        self.branches = 4
        self.branch = 0

        self.n_actions = len(self.payoff_values[0])

        self.good_branches = args.good_branches
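
One way to read these fields: on every intermediate step the flat matrix pays 1 for any joint action, while on the final step final_step_diff pays 4 only when both agents pick their second action. A hedged sketch of that lookup (the real environment also tracks which of the good_branches was taken, which is omitted here):

    def payoff(self, action_1, action_2):
        if self.steps < self.episode_limit - 1:
            # Intermediate steps: every joint action is worth 1.
            return self.payoff_values[self.steps][action_1][action_2]
        # Final step: only the (1, 1) joint action pays 4 instead of 1.
        return self.final_step_diff[action_1][action_2]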
Code Example #6
def main(_config, _run):
    config = convert(_config)
    _id = _run._id

    # Logging stuff
    logger = logging.getLogger("Main")
    if config.mongo:
        logging.disable(logging.WARNING)
    configure_stats_logging(
        str(_id) + "_" + config.name,
        log_interval=config.log_interval,
        sacred_info=_run.info,
        use_tb=config.tb,
    )
    stats = get_stats()

    logger.critical("ID: {}".format(_id))
    # Update config with environment specific information
    env = gym.make(config.env)
    num_actions = env.action_space.n
    config = config._replace(num_actions=num_actions)
    state_shape = env.observation_space.shape
    config = config._replace(state_shape=state_shape)
    # Wrap env
    env = EnvWrapper(env, debug=True, args=config)

    # Log the config
    config_str = "Config:\n\n"
    for k, v in sorted(config._asdict().items()):
        config_str += "     {}: {}\n".format(k, v)
    logger.critical(config_str)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.critical("Device: {}".format(device.type))

    # Make agent and target agent
    agent = get_model(config.agent)(config)
    target_agent = get_model(config.agent)(config)
    target_agent.load_state_dict(agent.state_dict())
    agent.to(device)
    target_agent.to(device)

    # Pseudocount stuff
    count_model = None
    if config.count_rewards:
        if config.atari_count:
            count_model = AtariCount(config)
        elif config.rnd_net_count:
            # assert config.count_state_only_rewards
            count_model = RndNetworkDistill(config, device)
        elif config.dora_count:
            count_model = DoraCount(config, device)
        else:
            count_model = PseudoCount(config)

    # Make action selector
    action_selector = None
    if config.action_selector == "eps_greedy":
        action_selector = eps_greedy.EpsGreedy(config)
    elif config.action_selector == "optimistic_action":
        action_selector = optimistic_action.OptimisticAction(
            count_model, config)
    elif config.action_selector == "bsp":
        action_selector = bsp_action.BSPAction(config)
    else:
        raise Exception("{} is not an Action Selector!".format(
            config.action_selector))

    # Make replay buffer
    # Check if the obs dtype of the environment is an int
    obs_dtype = getattr(env.wrapped_env, "obs_dtype", np.float32)
    obs_scaling = getattr(env.wrapped_env, "obs_scaling", 1)
    replay_buffer = ReplayBuffer(size=config.buffer_size,
                                 frame_history_len=config.past_frames_input,
                                 obs_dtype=obs_dtype,
                                 obs_scaling=obs_scaling,
                                 args=config)

    if config.dora_count:
        dora_buffer = ReplayBuffer(size=config.batch_size * 4,
                                   frame_history_len=config.past_frames_input,
                                   obs_dtype=obs_dtype,
                                   obs_scaling=obs_scaling,
                                   args=config)

    # Make trainer
    trainer = None
    if config.trainer == "DQN":
        trainer = DQNTrainer(agent=agent,
                             target_agent=target_agent,
                             args=config,
                             count_model=count_model,
                             buffer=replay_buffer)
    else:
        raise Exception("{} is not a supported Trainer!".format(config.trainer))
    testing_buffer = ReplayBuffer(size=(config.past_frames_input + 1),
                                  frame_history_len=config.past_frames_input,
                                  args=config)

    # Testing stuff
    testing_env = EnvWrapper(env=gym.make(config.env), debug=True, args=config)
    if config.test_augmented:
        assert config.action_selector == "optimistic_action"

    # Player Positions
    positions = set()
    action_positions = set()

    T = 0
    start_time = time.time()
    last_time = start_time

    # Lots of code duplication :(
    logging.critical("Filling buffer with {:,} random experiences.".format(
        config.buffer_burn_in))
    state = env.reset()
    assert config.buffer_burn_in == 0  # with buffer_burn_in fixed at 0, the burn-in loop below never runs
    for t in range(config.buffer_burn_in):
        buffer_idx = replay_buffer.store_frame(state)
        stacked_states = replay_buffer.encode_recent_observation()
        tensor_state = torch.tensor(stacked_states, device=device).unsqueeze(0)
        action = np.random.randint(config.num_actions)
        next_state, reward, terminated, info = env.step(action)
        terminal_to_store = terminated
        if "Steps_Termination" in info and info["Steps_Termination"]:
            terminal_to_store = False

        intrinsic_reward = 0
        pseudo_count = 0
        if config.count_rewards:
            pseudo_count = count_model.visit(tensor_state, action)
            if getattr(count_model, "reward_directly", False):
                intrinsic_reward = pseudo_count
            else:
                count_bonus = config.count_beta / sqrt(pseudo_count)
                intrinsic_reward = count_bonus

        replay_buffer.store_effect(buffer_idx, action,
                                   reward - config.reward_baseline,
                                   intrinsic_reward, terminal_to_store,
                                   pseudo_count)
        state = next_state
        if terminated:
            state = env.reset()
            logger.warning("Random action burn in t: {:,}".format(t))

    state = env.reset()
    episode = 0
    episode_reward = 0
    intrinsic_episode_reward = 0
    episode_length = 0
    env_positive_reward = 0
    max_episode_reward = 0
    if config.bsp:
        bsp_k = np.random.randint(config.bsp_k)
        action_selector.update_k(bsp_k)

    logging.critical("Beginning training.")

    while T < config.t_max:

        # Store the current state
        buffer_idx = replay_buffer.store_frame(state)
        if config.dora_count:
            dora_idx = dora_buffer.store_frame(state)

        # Get the stacked input vector
        stacked_states = replay_buffer.encode_recent_observation()

        # Get output from agent
        with torch.no_grad():
            tensor_state = torch.tensor(stacked_states,
                                        device=device).unsqueeze(0)
            agent_output = agent(tensor_state)
            # agent_output = agent(torch.Tensor(stacked_states).unsqueeze(0))

        # Select action
        action, action_info = action_selector.select_actions(
            agent_output, T, info={"state": tensor_state})

        # Take an environment step
        next_state, reward, terminated, info = env.step(action)
        T += 1
        stats.update_t(T)
        episode_reward += reward
        episode_length += 1
        terminal_to_store = terminated
        if "Steps_Termination" in info and info["Steps_Termination"]:
            logger.warning("Terminating because of episode limit")
            terminal_to_store = False

        # Log whether a positive reward was ever received from the environment (i.e. whether the goal was found)
        if reward > 0.1:
            env_positive_reward = 1
        stats.update_stats("Positive_Reward", env_positive_reward)

        # Calculate count based intrinsic motivation
        intrinsic_reward = 0
        pseudo_count = 0
        if config.count_rewards:
            pseudo_count = count_model.visit(tensor_state, action)
            if getattr(count_model, "reward_directly", False):
                # The count-model is giving us the intrinsic reward directly
                intrinsic_reward = pseudo_count[0]
            else:
                # Count-model is giving us the pseudo-count
                count_bonus = config.count_beta / sqrt(pseudo_count)
                intrinsic_reward = count_bonus
            intrinsic_episode_reward += intrinsic_reward

        # Render training
        if config.render_train_env:
            debug_info = {}
            debug_info.update(action_info)
            env.render(debug_info=debug_info)

        # Add what happened to the buffer
        replay_buffer.store_effect(buffer_idx, action,
                                   reward - config.reward_baseline,
                                   intrinsic_reward, terminal_to_store,
                                   pseudo_count)
        if config.dora_count:
            dora_buffer.store_effect(dora_idx, action,
                                     reward - config.reward_baseline,
                                     intrinsic_reward, terminal_to_store,
                                     pseudo_count)

        # Update state
        state = next_state

        # If terminated
        if terminated:
            # If we terminated due to episode limit, we need to add the current state in
            if "Steps_Termination" in info and info["Steps_Termination"]:
                buffer_idx = replay_buffer.store_frame(state)
                replay_buffer.store_effect(buffer_idx,
                                           0,
                                           0,
                                           0,
                                           True,
                                           0,
                                           dont_sample=True)
                if config.dora_count:
                    dora_idx = dora_buffer.store_frame(state)
                    dora_buffer.store_effect(dora_idx,
                                             0,
                                             0,
                                             0,
                                             True,
                                             0,
                                             dont_sample=True)

            logger.warning("T: {:,}, Episode Reward: {:.2f}".format(
                T, episode_reward))
            state = env.reset()
            max_episode_reward = max(max_episode_reward, episode_reward)
            stats.update_stats("Episode Reward", episode_reward)
            stats.update_stats("Max Episode Reward", max_episode_reward)
            stats.update_stats("Episode Length", episode_length)
            stats.update_stats("Intrin Eps Reward", intrinsic_episode_reward)
            episode_reward = 0
            episode_length = 0
            intrinsic_episode_reward = 0
            episode += 1
            stats.update_stats("Episode", episode)
            if config.bsp:
                bsp_k = np.random.randint(config.bsp_k)
                action_selector.update_k(bsp_k)

        # Train if possible
        for _ in range(config.training_iters):
            sampled_batch = None

            if T % config.update_freq != 0:
                # Only train every update_freq timesteps
                continue
            if replay_buffer.can_sample(config.batch_size):
                sampled_batch = replay_buffer.sample(config.batch_size,
                                                     nstep=config.n_step)

            if sampled_batch is not None:
                trainer.train(sampled_batch)

            if config.dora_count:
                if dora_buffer.can_sample(config.batch_size):
                    sampled_batch = dora_buffer.sample(config.batch_size,
                                                       nstep=config.n_step)
                if sampled_batch is not None:
                    count_model.train(sampled_batch)

        # Update target networks if necessary
        if T % config.target_update_interval == 0:
            trainer.update_target_agent()
            if config.dora_count:
                count_model.update_target_agent()

        # Logging
        if config.bsp:
            agent_output = agent_output[:, :, bsp_k]
        q_vals_numpy = agent_output.detach().cpu()[0].numpy()
        if num_actions < 20:
            for action_id in range(config.num_actions):
                stats.update_stats("Q-Value_{}".format(action_id),
                                   q_vals_numpy[action_id])
        else:
            stats.update_stats("Q-Value_Mean", np.mean(q_vals_numpy))
        player_pos = env.log_visitation()
        positions.add(player_pos)
        action_positions.add((player_pos, action))
        stats.update_stats("States Visited", len(positions))
        stats.update_stats("State_Actions Visited", len(action_positions))
        stats.update_stats("Player Position", player_pos)
        # Log all env stats returned
        for k, v in info.items():
            if k != "Steps_Termination":
                stats.update_stats(k, v)

        if config.save_count_gifs > 0 and T % config.save_count_gifs == 0:
            if count_model is not None:
                state_action_counts, count_nums = env.count_state_action_space(
                    count_model)
                if state_action_counts is not None:
                    save_image(state_action_counts,
                               image_name="SA_Counts__{}_Size__{}_T".format(
                                   config.count_size, T),
                               direc_name="State_Action_Counts")
                    save_sa_count_vals(count_nums,
                                       name="SA_PCounts__{}_Size__{}_T".format(
                                           config.count_size, T),
                                       direc_name="Sa_Count_Estimates")

                actual_counts = env.state_counts()
                if actual_counts is not None:
                    save_actual_counts(actual_counts,
                                       name="Counts__{}_T".format(T),
                                       direc_name="Actual_Counts")

                q_val_img, q_vals = env.q_value_estimates(count_model, agent)
                if q_val_img is not None:
                    save_image(q_val_img,
                               image_name="Q_Vals__{}_Size__{}_T".format(
                                   config.count_size, T),
                               direc_name="Q_Value_Estimates")
                if q_vals is not None:
                    save_q_vals(q_vals,
                                name="Q_Vals__{}_Size__{}_T".format(
                                    config.count_size, T),
                                direc_name="Q_Value_Estimates")

        # Testing
        with torch.no_grad():
            if T % config.testing_interval == 0:

                prefixes = [""]
                if config.test_augmented:
                    prefixes += ["Aug_"]

                for prefix in prefixes:
                    total_test_reward = 0
                    total_test_length = 0
                    for _ in range(config.test_episodes):
                        test_episode_reward = 0
                        test_episode_length = 0
                        test_state = testing_env.reset()
                        test_env_terminated = False

                        while not test_env_terminated:
                            test_buffer_idx = testing_buffer.store_frame(
                                test_state)
                            stacked_test_states = testing_buffer.encode_recent_observation(
                            )
                            test_tensor_state = torch.tensor(
                                stacked_test_states,
                                device=device).unsqueeze(0)
                            testing_agent_output = agent(test_tensor_state)

                            if prefix == "Aug_" or config.bsp:
                                test_action, _ = action_selector.select_actions(
                                    testing_agent_output,
                                    T,
                                    info={"state": test_tensor_state},
                                    testing=True)
                            else:
                                test_action = get_test_action(
                                    testing_agent_output, config)

                            next_test_state, test_reward, test_env_terminated, _ = testing_env.step(
                                test_action)
                            if config.render_test_env:
                                testing_env.render()

                            test_episode_length += 1
                            test_episode_reward += test_reward

                            testing_buffer.store_effect(
                                test_buffer_idx, test_action, test_reward, 0,
                                test_env_terminated, 0)

                            test_state = next_test_state

                        total_test_length += test_episode_length
                        total_test_reward += test_episode_reward

                    mean_test_reward = total_test_reward / config.test_episodes
                    mean_test_length = total_test_length / config.test_episodes

                    logger.error(
                        "{}Testing -- T: {:,}/{:,}, Test Reward: {:.2f}, Test Length: {:,}"
                        .format(prefix, T, config.t_max, mean_test_reward,
                                mean_test_length))

                    stats.update_stats("{}Test Reward".format(prefix),
                                       mean_test_reward,
                                       always_log=True)
                    stats.update_stats("{}Test Episode Length".format(prefix),
                                       mean_test_length,
                                       always_log=True)

                logger.error("Estimated time left: {}. Time passed: {}".format(
                    time_left(last_time, T - config.testing_interval,
                              T, config.t_max),
                    time_str(time.time() - start_time)))
                last_time = time.time()

        if T % (config.log_interval * 4) == 0:
            stats.print_stats()

    logger.critical("Closing envs")
    env.close()
    testing_env.close()

    logger.critical("Finished training.")

    if client is not None:  # `client` is expected to be a module-level pymongo client (not defined in this snippet)
        logger.critical("Attempting to close pymongo client")
        client.close()
        logger.critical("Pymongo client closed")

    logger.critical("Exiting")
Code Example #7
    def __init__(self, **kwargs):

        self.debug_launcher = False
        self.port_in_use = False
        self.debug_inputs = False
        self.debug_rewards = False

        if self.debug_launcher:
            print("INIT")

        args = kwargs["env_args"]
        if isinstance(args, dict):
            args = convert(args)

        self._add_deepcopy_support()

        # Read arguments
        self.map_name = args.map_name
        assert map_present(self.map_name), \
            "map {} not in map registry! please add.".format(self.map_name)
        map_params = convert(get_map_params(self.map_name))
        self.map_type = map_params.map_type
        self.n_agents = map_params.n_agents
        self.n_enemies = map_params.n_enemies
        self._agent_race = map_params.agent_race
        self._bot_race = map_params.bot_race
        self.zealot_id = 65
        self.dragoon_id = 66
        self.episode_limit = map_params.limit
        self.micro_battles = map_params.micro_battles

        self._move_amount = args.move_amount
        self._step_mul = args.step_mul
        self.state_last_action = args.state_last_action

        # Rewards args
        self.reward_only_positive = args.reward_only_positive
        self.reward_negative_scale = args.reward_negative_scale
        self.reward_death_value = args.reward_death_value
        self.reward_win = args.reward_win
        self.reward_scale = args.reward_scale
        self.reward_scale_rate = args.reward_scale_rate

        # Other
        self.seed = args.seed
        self.heuristic = args.heuristic
        self.measure_fps = args.measure_fps
        self.continuing_episode = args.continuing_episode

        self.hostname = args.hostname
        self.port = portpicker.pick_unused_port()

        self.n_actions_no_attack = 6
        self.n_actions = self.n_actions_no_attack + self.n_enemies
        self.max_reward = self.n_enemies * self.reward_death_value + self.reward_win

        for tc_dir in ["/install/torchcraft"]:
            if os.path.isdir(tc_dir):
                os.environ['TCPATH'] = tc_dir

        if sys.platform == 'linux':
            os.environ['SC1PATH'] = os.path.join(os.getcwd(), '3rdparty',
                                                 'StarCraftI', 'linux')
            self.env_file_type = 'so'
        elif sys.platform == 'darwin':
            os.environ['SC1PATH'] = os.path.join(os.getcwd(), '3rdparty',
                                                 'StarCraftI', 'mac')
            self.env_file_type = 'dylib'

        # Check if server has already been launched on this port
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            s.bind((socket.gethostbyname(socket.gethostname()), self.port))
            self.port_in_use = False
        except socket.error as e:
            if e.errno == errno.EADDRINUSE:
                # Port is already in use
                self.port_in_use = True
                print("Exception error: Port {} already in use. \n".format(
                    self.port, e))
            else:
                # Something else raised the socket.error exception
                print(e)
        s.close()

        # For single-batch testing in BWAPILauncher rendering
        # self.port = 11111  # Needs to be commented out when using more than one SC1 instance

        if self.debug_launcher:
            print("BEFORE LAUNCH SERVER")

        # Launch the server
        if not self.port_in_use:
            self._launch_server()

        if self.debug_launcher:
            print("BEFORE LAUNCH CLIENT")

        # Launch the game
        self._launch_client()

        if self.debug_launcher:
            print("AFTER LAUNCH CLIENT")

        self.map_x = self._obs.map_size[0]
        self.map_y = self._obs.map_size[1]
        self.map_play_area_min = [int(0), int(0)]
        self.map_play_area_max = [self.map_x, self.map_y]
        self.max_distance_x = self.map_x
        self.max_distance_y = self.map_y

        self._episode_count = -1
        self._total_steps = 0

        self.battles_won = 0
        self.battles_game = 0
        self.timeouts = 0
        self.force_restarts = 0
Code Example #8
File: starcraft2.py Project: shariqiqbal2810/pymarl
    def __init__(self, **kwargs):
        args = kwargs["env_args"]
        if isinstance(args, dict):
            args = convert(args)
        # Read arguments
        self.map_name = args.map_name
        assert map_present(self.map_name), \
            "map {} not in map registry! please add.".format(self.map_name)
        map_params = convert(get_map_params(self.map_name))
        self.n_agents = map_params.n_agents
        self.n_enemies = map_params.n_enemies
        self.episode_limit = map_params.limit
        self._move_amount = args.move_amount
        self._step_mul = args.step_mul
        self.difficulty = args.difficulty
        # Observations and state
        self.obs_own_health = args.obs_own_health
        self.obs_all_health = args.obs_all_health
        self.obs_instead_of_state = args.obs_instead_of_state
        self.state_last_action = args.state_last_action
        if self.obs_all_health:
            self.obs_own_health = True
        # Rewards args
        self.reward_sparse = args.reward_sparse
        self.reward_only_positive = args.reward_only_positive
        self.reward_negative_scale = args.reward_negative_scale
        self.reward_death_value = args.reward_death_value
        self.reward_win = args.reward_win
        self.reward_defeat = args.reward_defeat
        self.reward_scale = args.reward_scale
        self.reward_scale_rate = args.reward_scale_rate
        # Other
        self.continuing_episode = args.continuing_episode
        self.seed = args.seed
        self.heuristic = args.heuristic
        self.window_size = (1920, 1200)
        self.save_replay_prefix = args.save_replay_prefix

        # For sanity check
        self.debug_inputs = False
        self.debug_rewards = False

        # Actions
        self.n_actions_no_attack = 6
        self.n_actions = self.n_actions_no_attack + self.n_enemies

        # Map info
        self._agent_race = map_params.a_race
        self._bot_race = map_params.b_race
        self.shield_bits_ally = 1 if self._agent_race == "P" else 0
        self.shield_bits_enemy = 1 if self._bot_race == "P" else 0
        self.unit_type_bits = map_params.unit_type_bits
        self.map_type = map_params.map_type

        if sys.platform == 'linux':
            os.environ.setdefault(
                'SC2PATH', os.path.join(os.getcwd(), "3rdparty",
                                        'StarCraftII'))
            self.game_version = args.game_version
        else:
            # Can be derived automatically
            self.game_version = None

        # Launch the game
        self._launch()

        self.max_reward = self.n_enemies * self.reward_death_value + self.reward_win
        self._game_info = self.controller.game_info()
        self._map_info = self._game_info.start_raw
        self.map_x = self._map_info.map_size.x
        self.map_y = self._map_info.map_size.y
        self.map_play_area_min = self._map_info.playable_area.p0
        self.map_play_area_max = self._map_info.playable_area.p1
        self.max_distance_x = self.map_play_area_max.x - self.map_play_area_min.x
        self.max_distance_y = self.map_play_area_max.y - self.map_play_area_min.y
        self.terrain_height = np.array(list(
            self._map_info.terrain_height.data)).reshape(
                self.map_x, self.map_y)
        self.pathing_grid = np.array(list(
            self._map_info.pathing_grid.data)).reshape(self.map_x, self.map_y)

        self._episode_count = 0
        self._total_steps = 0

        self.battles_won = 0
        self.battles_game = 0
        self.timeouts = 0
        self.force_restarts = 0

        self.last_stats = None
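
The attribute reads above implicitly define the env_args schema this environment expects. A hedged example of such a config, with illustrative placeholder values rather than the repository defaults:

env_args = {
    "map_name": "3m",            # must be present in the map registry
    "difficulty": "7",
    "move_amount": 2,
    "step_mul": 8,
    "obs_own_health": True,
    "obs_all_health": True,
    "obs_instead_of_state": False,
    "state_last_action": True,
    "reward_sparse": False,
    "reward_only_positive": True,
    "reward_negative_scale": 0.5,
    "reward_death_value": 10,
    "reward_win": 200,
    "reward_defeat": 0,
    "reward_scale": True,
    "reward_scale_rate": 20,
    "continuing_episode": False,
    "seed": 1,
    "heuristic": False,
    "save_replay_prefix": "",
    "game_version": "4.6.2",     # only consulted on Linux in this constructor
}
env = StarCraft2Env(env_args=env_args)  # class name assumed from the file name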
Code Example #9
    def __init__(self, batch_size=None, **kwargs):
        # Unpack arguments from sacred
        args = kwargs["env_args"]
        if isinstance(args, dict):
            args = convert(args)
        self.args = args
        self.print_caught_prey = getattr(args, "print_caught_prey", False)
        self.print_frozen_agents = getattr(args, "print_frozen_agents", False)

        # Add-on for graph interface
        self.state_as_graph = args.state_as_graph
        if self.state_as_graph:
            self.absolute_distance = getattr(args, "absolute_distance", False)
            self.normalise_distance = getattr(args, "normalise_distance",
                                              False)
            self.add_walls = getattr(args, "add_walls", False)
            self.prey_relational = getattr(args, "prey_relational", True)

        # Add-on for goat-hunts (which like to climb mountains)
        self.mountain_slope = getattr(args, "mountain_slope", 0.0)
        self.capture_conditions = getattr(args, "capture_conditions", [0, 1])
        self.mountain_spawn = getattr(args, "mountain_spawn", False)
        self.mountain_agent_row = getattr(args, "mountain_agent_row", -1)

        # Downwards compatibility of batch_mode
        self.batch_mode = batch_size is not None
        self.batch_size = batch_size if self.batch_mode else 1

        # Define the environment grid
        self.truncate_episodes = getattr(args, "truncate_episodes", True)
        self.observe_ids = getattr(args, "observe_ids", False)
        self.intersection_global_view = getattr(args,
                                                "intersection_global_view",
                                                False)
        self.intersection_unknown = getattr(args, "intersection_unknown",
                                            False)
        self.directed_observations = getattr(args, "directed_observations",
                                             False)
        self.directed_cone_narrow = getattr(args, "directed_cone_narrow", True)
        self.directed_exta_actions = getattr(args, "directed_exta_actions",
                                             True)
        self.random_ghosts = getattr(args, "random_ghosts", False)
        self.random_ghosts_prob = getattr(args, "random_ghosts_prob", 0.5)
        self.random_ghosts_mul = getattr(args, "random_ghosts_mul", -1.0)
        self.random_ghosts_random_indicator = getattr(
            args, "random_ghosts_indicator", False)
        self.observe_state = getattr(args, "observe_state", False)
        self.observe_walls = getattr(args, "observe_walls", True)
        self.observe_one_hot = getattr(args, "observe_one_hot", False)
        self.n_feats = (5 if self.observe_one_hot else
                        3) + (1 if self.random_ghosts else 0)
        self.toroidal = args.toroidal
        shape = args.world_shape
        self.x_max, self.y_max = shape
        self.state_size = self.x_max * self.y_max * self.n_feats
        self.env_max = np.asarray(shape, dtype=int_type)
        self.grid_shape = np.asarray(shape, dtype=int_type)
        self.grid = np.zeros(
            (self.batch_size, self.x_max, self.y_max, self.n_feats),
            dtype=float_type)
        # 0=agents, 1=stag, 2=hare, [3=wall, 4=unknown], [-1=ghost-indicator]

        if self.random_ghosts:
            self.ghost_indicator = False  # indicates whether the prey is a ghost (True) or not (False)
            self.ghost_indicator_potential_positions = np.asarray(
                [[0, 0], [0, self.x_max - 1], [self.y_max - 1, 0],
                 [self.y_max - 1, self.x_max - 1]],
                dtype=int_type)
            self.ghost_indicator_pos = [
                0, 0
            ]  # position of the indicator whether prey is a ghost (-1) or not (+1)

        # Define the agents and their action space
        self.capture_action = getattr(args, "capture_action", False)
        self.capture_action_conditions = getattr(args,
                                                 "capture_action_conditions",
                                                 (2, 1))
        self.actions = np.asarray([[0, 1], [1, 0], [0, -1], [-1, 0], [0, 0],
                                   [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]],
                                  dtype=int_type)
        self.action_names = [
            "right", "down", "left", "up", "stay", "catch", 'look-right',
            'look-down', 'look-left', 'look-up'
        ]
        self.agent_move_block = np.asarray(getattr(args, "agent_move_block",
                                                   [0]),
                                           dtype=int_type)
        self.n_actions = 10 if self.directed_observations and self.directed_exta_actions \
            else (6 if self.capture_action else 5)
        self.n_agents = args.n_agents
        self.n_stags = args.n_stags
        self.p_stags_rest = args.p_stags_rest
        self.n_hare = args.n_hare
        self.p_hare_rest = args.p_hare_rest
        self.n_prey = self.n_stags + self.n_hare
        self.agent_obs = args.agent_obs
        self.agent_obs_dim = np.asarray(self.agent_obs, dtype=int_type)

        if self.observe_state:
            # The size of the global state as observation (with one additional position feature)
            self.obs_size = int(self.state_size +
                                self.grid_shape[0] * self.grid_shape[1])
        elif self.directed_observations and self.directed_cone_narrow:
            # The size of the visible observation cones for this option
            self.obs_size = self.n_feats * (2 * args.agent_obs[0] -
                                            1) * (2 * args.agent_obs[1] - 1)
        else:
            # The agent-centric observation size
            self.obs_size = self.n_feats * (2 * args.agent_obs[0] +
                                            1) * (2 * args.agent_obs[1] + 1)

        # Define the episode and rewards
        self.episode_limit = args.episode_limit
        self.time_reward = getattr(args, "reward_time", -0.1)
        self.collision_reward = getattr(args, "reward_collision", 0.0)
        self.capture_hare_reward = getattr(args, "reward_hare", 1.0)
        self.capture_stag_reward = getattr(args, "reward_stag", 2.0)
        self.miscapture_punishment = float(
            getattr(args, "miscapture_punishment", -self.capture_stag_reward))
        self.capture_terminal = getattr(args, "capture_terminal", True)
        self.capture_freezes = getattr(args, "capture_freezes", True)
        self.remove_frozen = getattr(args, "remove_frozen", False)

        # Define the internal state
        self.agents = np.zeros((self.n_agents, self.batch_size, 2),
                               dtype=int_type)
        self.agents_not_frozen = np.ones((self.n_agents, self.batch_size),
                                         dtype=int_type)
        self.agents_orientation = np.zeros(
            (self.n_agents, self.batch_size),
            dtype=int_type)  # use action_labels 0..3
        self.prey = np.zeros((self.n_prey, self.batch_size, 2), dtype=int_type)
        self.prey_alive = np.zeros((self.n_prey, self.batch_size),
                                   dtype=int_type)
        self.prey_type = np.ones((self.n_prey, self.batch_size),
                                 dtype=int_type)  # fill with stag (1)
        self.prey_type[self.n_stags:, :] = 2  # set hares to 2
        self.steps = 0
        self.sum_rewards = 0
        self.reset()

        self.made_screen = False
        self.scaling = 5
Code Example #10
File: stag_hunt.py Project: yyf17/dcg
        'reward_hare': 1,
        'reward_stag': 10,
        'reward_collision': 0.0,
        'reward_time': -0.1,
        'capture_terminal': True,
        'episode_limit': 200,
        'n_stags': 2,
        'p_stags_rest': 0.1,
        'n_hare': 4,
        'p_hare_rest': 0.5,
        'n_agents': 4,
        'agent_obs': (2, 2),
        'state_as_graph': False,
        'print_caught_prey': True
    }
    env_args = convert(env_args)
    print(env_args)

    env = StagHunt(env_args=env_args)
    [all_obs, state] = env.reset()
    print("Env is ", "batched" if env.batch_mode else "not batched")

    if False:
        grid = state.reshape((6, 6, 3))
        for i in range(grid.shape[2]):
            print(grid[:, :, i], '\n')

    if False:
        print(state)
        for i in range(env.n_agents):
            print(all_obs[i])
Code Example #11
    def __init__(self, **kwargs):

        args = kwargs["env_args"]
        if isinstance(args, dict):
            args = convert(args)

        self.flounderl_delegate_if_zero_ck = getattr(kwargs["args"], "flounderl_delegate_if_zero_ck", False)

        self.map_param_registry = kwargs.get("map_param_registry", map_param_registry)
        # Read arguments
        self.map_name = args.map_name
        assert self.map_name in map_param_registry, \
            "map {} not in map registry! please add.".format(self.map_name)
        self.n_agents = map_param_registry[self.map_name]["n_agents"]
        self.n_enemies = map_param_registry[self.map_name]["n_enemies"]
        self.episode_limit = map_param_registry[self.map_name]["limit"]
        self._move_amount = args.move_amount
        self._step_mul = args.step_mul
        self.difficulty = args.difficulty
        self.state_last_action = args.state_last_action
        # Rewards args
        self.reward_only_positive = args.reward_only_positive
        self.reward_negative_scale = args.reward_negative_scale
        self.reward_death_value = args.reward_death_value
        self.reward_damage_coef = args.reward_damage_coef
        self.reward_win = args.reward_win
        self.reward_scale = args.reward_scale
        self.reward_scale_rate = args.reward_scale_rate
        # Other
        self.seed = args.seed
        self.heuristic = args.heuristic
        self.measure_fps = args.measure_fps
        self.obs_ignore_ally = args.obs_ignore_ally if hasattr(args, "obs_ignore_ally") else False
        self.obs_instead_of_state = args.obs_instead_of_state if hasattr(args, "obs_instead_of_state") else False
        self.window_size = (1920, 1200)

        self.debug_inputs = False
        self.debug_rewards = False
        self.debug_action_result = False

        self.n_actions_no_attack = 6
        self.n_actions = self.n_actions_no_attack + self.n_enemies

        self.fully_observable = args.fully_observable if hasattr(args, "fully_observable") else False
        self.relax_pairwise_aa = args.relax_pairwise_aa if hasattr(args, "relax_pairwise_aa") else False

        self.continuing_episode = args.continuing_episode

        self.map_settings()

        if sys.platform == 'linux':
            self.game_version = args.game_version if hasattr(args, "game_version") else "3.16.1"
            if os.path.exists(os.path.join(os.getcwd(), "3rdparty", 'StarCraftII__{}'.format(self.game_version))):
                os.environ['SC2PATH'] = os.path.join(os.getcwd(), "3rdparty", 'StarCraftII__{}'.format(self.game_version))
            else:
                os.environ['SC2PATH'] = os.path.join(os.getcwd(), "3rdparty", 'StarCraftII')
        else:
            self.game_version = "4.3.2"


        # Launch the game
        self._launch()

        self.max_reward = self.n_enemies * self.reward_death_value + self.reward_win
        self._game_info = self.controller.game_info()
        self.map_x = self._game_info.start_raw.map_size.x
        self.map_y = self._game_info.start_raw.map_size.y
        self.map_play_area_min = self._game_info.start_raw.playable_area.p0
        self.map_play_area_max = self._game_info.start_raw.playable_area.p1
        self.max_distance_x = self.map_play_area_max.x - self.map_play_area_min.x
        self.max_distance_y = self.map_play_area_max.y - self.map_play_area_min.y

        self._episode_count = 0
        self._total_steps = 0

        self.battles_won = 0
        self.battles_game = 0
        self.timeouts = 0
        self.force_restarts = 0
Code Example #12
File: stag_hunt.py Project: johnson7788/pymarl2
    def __init__(self, batch_size=None, **kwargs):
        # Unpack all arguments
        args = kwargs
        if isinstance(args, dict):
            args = convert(args)
        self.args = args
        self.print_caught_prey = getattr(args, "print_caught_prey", False)
        self.print_frozen_agents = getattr(args, "print_frozen_agents", False)

        # Add-on for the graph interface  # add-on 1
        self.state_as_graph = args.state_as_graph
        if self.state_as_graph:
            self.absolute_distance = getattr(args, "absolute_distance", False)
            self.normalise_distance = getattr(args, "normalise_distance",
                                              False)
            self.add_walls = getattr(args, "add_walls", False)
            self.prey_relational = getattr(args, "prey_relational", True)

        # Add-on for goat hunts (which like to climb mountains)  # add-on 2
        self.mountain_slope = getattr(args, "mountain_slope", 0.0)
        self.capture_conditions = getattr(args, "capture_conditions", [0, 1])
        self.mountain_spawn = getattr(args, "mountain_spawn", False)
        self.mountain_agent_row = getattr(args, "mountain_agent_row", -1)

        # Downwards compatibility of batch_mode
        self.batch_mode = batch_size is not None
        self.batch_size = batch_size if self.batch_mode else 1

        # Define the environment grid
        self.truncate_episodes = getattr(args, "truncate_episodes", True)
        self.observe_ids = getattr(args, "observe_ids", False)
        self.intersection_global_view = getattr(args,
                                                "intersection_global_view",
                                                False)
        self.intersection_unknown = getattr(args, "intersection_unknown",
                                            False)
        self.directed_observations = getattr(args, "directed_observations",
                                             False)
        self.directed_cone_narrow = getattr(args, "directed_cone_narrow", True)
        self.directed_exta_actions = getattr(args, "directed_exta_actions",
                                             True)
        self.random_ghosts = getattr(args, "random_ghosts", False)
        self.random_ghosts_prob = getattr(args, "random_ghosts_prob", 0.5)
        self.random_ghosts_mul = getattr(args, "random_ghosts_mul", -1.0)
        self.random_ghosts_random_indicator = getattr(
            args, "random_ghosts_indicator", False)
        self.observe_state = getattr(args, "observe_state", False)
        self.observe_walls = getattr(args, "observe_walls", True)
        self.observe_one_hot = getattr(args, "observe_one_hot", False)
        self.n_feats = (5 if self.observe_one_hot else
                        3) + (1 if self.random_ghosts else 0)
        self.toroidal = args.toroidal
        shape = args.world_shape
        self.x_max, self.y_max = shape
        self.state_size = self.x_max * self.y_max * self.n_feats  # e.g. state_size: 300
        self.env_max = np.asarray(shape, dtype=int_type)
        self.grid_shape = np.asarray(shape, dtype=int_type)
        self.grid = np.zeros(
            (self.batch_size, self.x_max, self.y_max, self.n_feats),
            dtype=float_type)
        # 0=agents, 1=stag, 2=hare, [3=wall, 4=unknown], [-1=ghost-indicator]
        # If True, prey can randomly turn into ghosts (yielding a negated reward), indicated by a feature in one corner
        if self.random_ghosts:
            self.ghost_indicator = False  # indicates whether the prey is a ghost (True) or not (False)
            self.ghost_indicator_potential_positions = np.asarray(
                [[0, 0], [0, self.x_max - 1], [self.y_max - 1, 0],
                 [self.y_max - 1, self.x_max - 1]],
                dtype=int_type)
            self.ghost_indicator_pos = [
                0, 0
            ]  # position of the indicator whether prey is a ghost (-1) or not (+1)

        # Define the agents and their action space
        self.capture_action = getattr(args, "capture_action", False)
        self.capture_action_conditions = getattr(args,
                                                 "capture_action_conditions",
                                                 (2, 1))
        self.actions = np.asarray([[0, 1], [1, 0], [0, -1], [-1, 0], [0, 0],
                                   [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]],
                                  dtype=int_type)
        self.action_names = [
            "right", "down", "left", "up", "stay", "catch", 'look-right',
            'look-down', 'look-left', 'look-up'
        ]
        self.agent_move_block = np.asarray(getattr(args, "agent_move_block",
                                                   [0]),
                                           dtype=int_type)
        self.n_actions = 10 if self.directed_observations and self.directed_exta_actions \
            else (6 if self.capture_action else 5)
        self.n_agents = args.n_agents
        self.n_stags = args.n_stags
        self.p_stags_rest = args.p_stags_rest
        self.n_hare = args.n_hare
        self.p_hare_rest = args.p_hare_rest
        self.n_prey = self.n_stags + self.n_hare
        self.agent_obs = args.agent_obs
        self.agent_obs_dim = np.asarray(self.agent_obs, dtype=int_type)

        if self.observe_state:
            # The size of the global state as observation (with one additional position feature)
            self.obs_size = int(self.state_size +
                                self.grid_shape[0] * self.grid_shape[1])
        elif self.directed_observations and self.directed_cone_narrow:
            # The size of the visible observation cones for this option
            self.obs_size = self.n_feats * (2 * args.agent_obs[0] -
                                            1) * (2 * args.agent_obs[1] - 1)
        else:
            # The agent-centric observation size (dimension of the observation space, e.g. 75)
            self.obs_size = self.n_feats * (2 * args.agent_obs[0] +
                                            1) * (2 * args.agent_obs[1] + 1)

        # Define the episode and rewards
        self.episode_limit = args.episode_limit
        self.time_reward = getattr(args, "reward_time", -0.1)
        self.collision_reward = getattr(args, "reward_collision", 0.0)
        self.capture_hare_reward = getattr(args, "reward_hare", 1.0)
        self.capture_stag_reward = getattr(args, "reward_stag", 2.0)
        self.miscapture_punishment = float(
            getattr(args, "miscapture_punishment", -self.capture_stag_reward))
        self.capture_terminal = getattr(args, "capture_terminal", True)
        self.capture_freezes = getattr(args, "capture_freezes", True)
        self.remove_frozen = getattr(args, "remove_frozen", False)

        # Define the internal state; the agents are the hunters
        self.agents = np.zeros((self.n_agents, self.batch_size, 2),
                               dtype=int_type)
        self.agents_not_frozen = np.ones((self.n_agents, self.batch_size),
                                         dtype=int_type)
        self.agents_orientation = np.zeros(
            (self.n_agents, self.batch_size),
            dtype=int_type)  # use action_labels 0..3
        # Prey includes hares and stags
        self.prey = np.zeros((self.n_prey, self.batch_size, 2), dtype=int_type)
        self.prey_alive = np.zeros((self.n_prey, self.batch_size),
                                   dtype=int_type)
        self.prey_type = np.ones((self.n_prey, self.batch_size),
                                 dtype=int_type)  # fill with stag (1)
        self.prey_type[self.n_stags:, :] = 2  # set hares to 2
        self.steps = 0
        self.sum_rewards = 0
        self.reset()

        self.made_screen = False
        self.scaling = 5
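
The sizes mentioned in the inline comments (state_size: 300, an observation dimension of 75) follow directly from the formulas in this constructor. For example, with observe_one_hot=False, random_ghosts=False, an assumed world_shape=(10, 10) and agent_obs=(2, 2):

n_feats = 3                                   # 3 base features, no one-hot, no ghost indicator
x_max, y_max = 10, 10                         # assumed world_shape behind the "300" comment
obs_x, obs_y = 2, 2                           # agent_obs

state_size = x_max * y_max * n_feats                     # 10 * 10 * 3 = 300
obs_size = n_feats * (2 * obs_x + 1) * (2 * obs_y + 1)   # 3 * 5 * 5 = 75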
Code Example #13
File: run.py Project: wwxFromTju/mackrl
def run(_run, _config, _log, pymongo_client):

    # check args sanity
    _config = args_sanity_check(_config, _log)

    # convert _config dict to GenericDict objects (which cannot be overwritten later)
    args = convert(_config)
    _log.info("Experiment Parameters:")
    experiment_params = pprint.pformat(_config,
                                       indent=4,
                                       width=1)
    _log.info("\n\n" + experiment_params + "\n")

    import os
    _log.info("OS ENVIRON KEYS: {}".format(os.environ))

    if _config.get("debug_mode", None) is not None:
        _log.warning("ATTENTION DEBUG MODE: {}".format(_config["debug_mode"]))

    # ----- configure logging
    # configure tensorboard logger
    unique_token = "{}__{}".format(args.name, datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    if args.use_tensorboard:
        import tensorboard
        if tensorboard:
            from tensorboard_logger import configure, log_value
        import os
        from os.path import dirname, abspath
        file_tb_path = os.path.join(dirname(dirname(abspath(__file__))), "tb_logs")
        configure(os.path.join(file_tb_path, "{}").format(unique_token))

    # configure trajectory logger


    # set up logging object to be passed on from now on
    logging_struct = SN(py_logger=_log,
                        sacred_log_scalar_fn=partial(append_scalar, run=_run))
    if args.use_tensorboard:
        logging_struct.tensorboard_log_scalar_fn=log_value

    if hasattr(args, "use_hdf_logger") and args.use_hdf_logger:
        logging_struct.hdf_logger = HDFLogger(path=args.local_results_path, name=args.name, logging_struct=logging_struct)

    # ----- execute runners
    # run framework in run_mode selected
    if args.run_mode in ["parallel_subproc"]:
        run_parallel(args=args, _run=_run, _logging_struct=logging_struct, unique_token=unique_token)
    else:
        run_sequential(args=args, _run=_run, _logging_struct=logging_struct, unique_token=unique_token)

    #Clean up after finishing
    print("Exiting Main")

    if pymongo_client is not None: #args.use_mongodb:
        print("Attempting to close mongodb client")
        pymongo_client.close()
        print("Mongodb client closed")

    print("Stopping all threads")
    for t in threading.enumerate():
        if t.name != "MainThread":
            print("Thread {} is alive! Is daemon: {}".format(t.name, t.daemon))
            t.join(timeout=1)
            print("Thread joined")

    print("Exiting script")

    # Making sure framework really exits
    os._exit(os.EX_OK)
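
logging_struct is just a types.SimpleNamespace (SN) bundling whichever logging backends were configured, so downstream runners do not need to know which ones exist. A hedged sketch of how such a struct might be consumed (this helper is illustrative, not taken from mackrl):

def log_scalar(logging_struct, name, value, t):
    # The plain python logger is always attached.
    logging_struct.py_logger.info("{}: {} (t={})".format(name, value, t))
    # Tensorboard is only attached when args.use_tensorboard is set, so guard on it;
    # tensorboard_logger's log_value takes (name, value, step).
    if hasattr(logging_struct, "tensorboard_log_scalar_fn"):
        logging_struct.tensorboard_log_scalar_fn(name, value, t)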
Code Example #14
File: gfootball.py Project: conglu1997/MAVEN
    def __init__(self, batch_size=None, **kwargs):
        # Unpack arguments from sacred
        args = kwargs["env_args"]
        if isinstance(args, dict):
            args = convert(args)

        # Primary config
        self.scenario = getattr(args, "scenario", "11_vs_11_stochastic")
        self.game_visibility = getattr(args, "game_visibility", "full")
        self.representation = getattr(args, "representation", "simple115")
        self.full_obs_flag = getattr(args, "full_obs", False)
        self.view_angle = getattr(args, "view_angle", 160)
        self.rewards = getattr(args, "rewards", "scoring")

        # Video dumping config
        self.write_full_episode_dumps = getattr(args, "write_full_episode_dumps", False)
        self.write_video = getattr(args, "write_video", False)
        self.dump_frequency = getattr(args, "dump_frequency", 1)
        self.logdir = getattr(args, "logdir", "episode_dumps")

        # Environment modifiers
        self.move_goalkeeper = getattr(args, "move_goalkeeper", False)
        self.difficulty_override = getattr(args, "env_difficulty", -1)

        # Secondary config
        scenario_config = {"11_vs_11_stochastic": {"n_agents": 11},
                           "academy_empty_goal_close": {"n_agents": 1},
                           "academy_empty_goal": {"n_agents": 1},
                           "academy_run_to_score": {"n_agents": 1},
                           "academy_run_to_score_with_keeper": {"n_agents": 1},
                           "academy_pass_and_shoot_with_keeper": {"n_agents": 2},
                           "academy_run_pass_and_shoot_with_keeper": {"n_agents": 2},
                           "academy_3_vs_1_with_keeper": {"n_agents": 3},
                           "academy_corner": {"n_agents": 1},
                           "academy_counterattack_easy": {"n_agents": 4},
                           "academy_single_goal_versus_lazy": {"n_agents": 11}
                           }
        if getattr(args, "n_agents", -1) == -1:
            self.n_agents = scenario_config[self.scenario]["n_agents"]
        else:
            assert args.n_agents <= scenario_config[self.scenario]["n_agents"], \
                "Scenario only supports up to {} agents - you supplied {}!".format(
                    scenario_config[self.scenario]["n_agents"], args.n_agents)
            self.n_agents = args.n_agents

        self.episode_limit = args.episode_limit if getattr(args, "episode_limit",
                                                           -1) != -1 else 1000  # TODO: Look up correct episode length!
        self.observation_reference_frame = getattr(args, "observation_reference_frame", "fixed")

        self.action_set = getattr(args, "action_set", 'default')
        # Either the default action set (19 actions) or the non-sticky set (14 actions)
        self.n_actions = 19 if self.action_set != 'non_sticky' else 14

        self.env = football_env.create_environment(
            env_name=self.scenario,
            render=False,
            number_of_left_players_agent_controls=self.n_agents,
            representation=self.representation,
            rewards=self.rewards,
            write_full_episode_dumps=self.write_full_episode_dumps,
            write_video=self.write_video,
            dump_frequency=self.dump_frequency,
            logdir=self.logdir,
            # po_view_cone_xy_opening=self.view_angle,
            # full_obs_flag=self.full_obs_flag,
            action_set=self.action_set,
        )

        self.reset()

        self.obs_size = self.observations[0].shape

        self.state = self.env.get_global_state()
        self.state_size = self.state.shape
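
As with the other wrappers on this page, the constructor is driven entirely by env_args. A hedged minimal config for one of the academy scenarios (values are illustrative placeholders; the class name is assumed from the file name):

env_args = {
    "scenario": "academy_3_vs_1_with_keeper",
    "representation": "simple115",
    "rewards": "scoring",
    "episode_limit": 150,   # leaving this out (or -1) falls back to 1000 above
    "n_agents": -1,         # -1 means "use the scenario default", i.e. 3 here
    "write_video": False,
}
env = GFootballEnv(env_args=env_args)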