Example 1
    def create_env(self, env_kwargs):
        def thunk():
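            # imported only for its side effects (e.g. registering the custom environment)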
            import experiments.test_lstm_a2c
            return RewardCollector(gym.make(**env_kwargs))

        env = AsyncVectorEnv([thunk] * self.num_processes)
        self.validation_env = SyncVectorEnv([thunk])
        return env
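
The pattern above is the standard way to drive AsyncVectorEnv: it takes a list of zero-argument callables, and each worker process calls its callable to build a fresh environment. Below is a minimal, self-contained sketch of the same idea using the built-in CartPole-v1 environment; the helper and variable names (make_thunk, train_env, eval_env) are illustrative and not taken from the example above.

import gym
from gym.vector import AsyncVectorEnv, SyncVectorEnv


def make_thunk(env_id):
    # Return a zero-argument callable; the environment is only constructed
    # when a worker (or the main process, for SyncVectorEnv) calls it.
    def thunk():
        return gym.make(env_id)
    return thunk


train_env = AsyncVectorEnv([make_thunk("CartPole-v1")] * 4)  # 4 subprocess workers
eval_env = SyncVectorEnv([make_thunk("CartPole-v1")])        # single in-process copy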
Example 2
        def test_config(n_envs, base_env, use_torch, use_logger, return_info):
            config = 'n_envs' + str(n_envs) + '-base_env' + str(base_env) \
                    + '-torch' + str(use_torch) + '-logger' + str(use_logger) \
                    + '-info' + str(return_info)
            if isinstance(base_env, str):
                env = vec_env = gym.vector.make(base_env, num_envs=n_envs)
            else:

                def make_env():
                    env = base_env()
                    return env

                env_fns = [make_env for _ in range(n_envs)]
                env = vec_env = AsyncVectorEnv(env_fns)

            if use_logger:
                env = envs.Logger(env, interval=5, logger=self.logger)

            if use_torch:
                env = envs.Torch(env)
                policy = lambda x: ch.totensor(vec_env.action_space.sample())
            else:
                policy = lambda x: vec_env.action_space.sample()

            if return_info:
                agent = lambda x: (policy(x), {'policy': policy(x)[0]})
            else:
                agent = policy

            # Gather experience
            env = envs.Runner(env)
            replay = env.run(agent, steps=NUM_STEPS)

            # Pre-compute some shapes
            shape = (NUM_STEPS, n_envs)
            state_shape = vec_env.observation_space.sample()[0]
            if isinstance(state_shape, (int, float)):
                state_shape = tuple()
            else:
                state_shape = state_shape.shape
            action_shape = vec_env.action_space.sample()[0]
            if isinstance(action_shape, (int, float)):
                action_shape = (1, )
            else:
                action_shape = action_shape.shape
            done_shape = tuple()

            # Check shapes
            states = replay.state()
            self.assertEqual(states.shape, shape + state_shape, config)
            actions = replay.action()
            self.assertEqual(actions.shape, shape + action_shape, config)
            dones = replay.done()
            self.assertEqual(dones.shape, shape + done_shape, config)
            if return_info:
                policies = replay.policy()
                self.assertEqual(policies.shape, (NUM_STEPS, ) + action_shape,
                                 config)
Example 3
def create_unreal_env(num_processes, kwargs):
    def thunk(env_kwargs):
        env = gym.make(**env_kwargs)
        env = RewardCollector(env)
        env = TransposeImage(env)
        env = ScaledFloatFrame(env)
        env = UnrealEnvBaseWrapper(env)
        return env

    return (AsyncVectorEnv([lambda: thunk(kwargs) for _ in range(num_processes)]),
            SyncVectorEnv([lambda: thunk(kwargs)]))
Example 4
    def create_env(self, env):
        class W(gym.ObservationWrapper):
            def observation(self, o):
                return o.astype(np.float32)

        env_kwargs = env

        def _thunk():
            env = gym.make(**env_kwargs)
            env = RewardCollector(env)
            env = gym.wrappers.TransformReward(env, lambda r: 0.01 * r)
            env = W(env)
            return env

        self.validation_environment = SyncVectorEnv([_thunk])
        return AsyncVectorEnv([_thunk for _ in range(self.num_processes)])
Example 5
        def _init():
            env = gym.make(
                env_id, seed=seed + rank,
                effective_max_num_players=effective_max_num_players,
                init_num_players=effective_max_num_players,
                with_shuffle=with_shuffle,
                gnn_input=gnn_input
            )
            return env

        return _init


    env = AsyncVectorEnv(
        [make_env('Adhoc-Foraging-8x8-3f-v0', i,
                  args['seed'], num_players_train, False, True)
         for i in range(args['num_envs'])]
    )

    # Save init agent model parameters.
    save_dirs = os.path.join(directory, 'params_0')
    agent.save_parameters(save_dirs)

    # Evaluate initial model performance in training environment
    avgs = []
    num_dones, per_worker_rew = [0] * args['num_envs'], [0] * args['num_envs']
    agent.reset()
    env_eval = AsyncVectorEnv(
        [make_env('Adhoc-Foraging-8x8-3f-v0', i,
                  args['eval_init_seed'], num_players_train, False, True)
         for i in range(args['num_envs'])]
Example 6
                 implicit_max_player_num=3,
                 with_shuffling=False):
        def _init():
            env = gym.make(env_id,
                           seed=seed + rank,
                           num_players=num_players,
                           close_penalty=close_penalty,
                           implicit_max_player_num=implicit_max_player_num,
                           with_shuffling=with_shuffling)
            return env

        return _init

    num_players = args['num_players']
    env = AsyncVectorEnv([
        make_env('Adhoc-wolfpack-v5', i, num_players, args['seed'],
                 args['close_penalty']) for i in range(args['num_envs'])
    ])

    # Save initial model parameters.
    save_dirs = os.path.join(directory, 'params_0')
    agent.save_parameters(save_dirs)

    # Evaluate initial model performance in training environment
    avgs = []
    for ep_val_num in range(args['eval_eps']):
        num_players = args['num_players']
        agent.reset()
        steps = 0
        avg_total_rewards = 0.0
        env_eval = AsyncVectorEnv([
            make_env('Adhoc-wolfpack-v5', i, num_players, 2000,
Example 7
def main():
    n_envs = len(os.sched_getaffinity(0))
    factory = FallingEnvFactory()
    # factory = HalfCheetahEnvFactory()
    # factory = HumanoidFallingEnvFactory()
    env: Env = factory.make_env()
    envs: VectorEnv = AsyncVectorEnv([factory.make_env for _ in range(n_envs)])
    env_container = EnvContainer(env, envs)

    state_dim, = env.observation_space.shape
    action_dim, = env.action_space.shape
    relu = nn.ReLU()
    tanh = nn.Tanh()
    identity = nn.Identity()

    actor = ProbMLPConstantLogStd(state_dim, action_dim, [256, 256], relu, tanh, -1.0)
    critic = MultiLayerPerceptron(state_dim, 1, [256, 256], relu, identity)
    scaler_ = StandardScaler()
    print("Fit scaler")
    env.reset()
    state_seq = []
    for _ in tqdm(range(512)):
        action = env.action_space.sample()
        state, _, done, _ = env.step(action)
        state_seq.append(state)
        if done:
            env.reset()
    state_seq = np.stack(state_seq)
    scaler_.fit(state_seq)
    scaler = ScalerNet(scaler_)

    module_dict = ModuleDict()
    module_dict.set(ModuleKey.actor, actor)
    module_dict.set(ModuleKey.scaler, scaler)
    module_dict.set(ModuleKey.critic, critic)

    action_getter: ActionGetter = ActionGetterModule(actor, scaler)
    sample_collector: SampleCollector = SampleCollectorV0(env_container, action_getter, 2048, 1)

    mse_loss = nn.MSELoss()
    critic_tensor_inserter: TensorInserter = \
        TensorInserterTensorize(ArrayKey.states, TensorKey.states_tensor) + \
        TensorInserterTensorize(ArrayKey.log_probs, TensorKey.log_probs_tensor) + \
        TensorInserterTensorize(ArrayKey.cumulative_rewards, TensorKey.cumulative_rewards_tensor) + \
        TensorInserterForward(TensorKey.states_tensor, ModuleKey.scaler, TensorKey.states_tensor) + \
        TensorInserterForward(TensorKey.states_tensor, ModuleKey.critic, TensorKey.cumulative_reward_predictions_tensor)
    critic_loss_calculator: LossCalculator = \
        LossCalculatorInputTarget(TensorKey.cumulative_reward_predictions_tensor, TensorKey.cumulative_rewards_tensor,
                                  mse_loss)

    actor_tensor_inserter: TensorInserter = \
        TensorInserterTensorize(ArrayKey.states, TensorKey.states_tensor) + \
        TensorInserterTensorize(ArrayKey.actions, TensorKey.actions_tensor) + \
        TensorInserterTensorize(ArrayKey.log_probs, TensorKey.log_probs_tensor) + \
        TensorInserterTensorize(ArrayKey.cumulative_rewards, TensorKey.cumulative_rewards_tensor) + \
        TensorInserterForward(TensorKey.states_tensor, ModuleKey.scaler, TensorKey.states_tensor) + \
        TensorInserterForward(TensorKey.states_tensor, ModuleKey.critic,
                              TensorKey.cumulative_reward_predictions_tensor) + \
        TensorInserterLambda([TensorKey.cumulative_rewards_tensor, TensorKey.cumulative_reward_predictions_tensor],
                             lambda x, y: x - y, TensorKey.advantages_tensor) + \
        TensorInserterModuleLambda(ModuleKey.actor, [TensorKey.states_tensor, TensorKey.actions_tensor],
                                   lambda actor, state, action: actor.get_log_prob(state, action),
                                   TensorKey.new_log_probs_tensor) + \
        TensorInserterLambda([TensorKey.new_log_probs_tensor, TensorKey.log_probs_tensor, TensorKey.advantages_tensor],
                             get_ppo_surrogate_tensor, TensorKey.ppo_surrogates_tensor)

    actor_loss_calculator: LossCalculator = \
        LossCalculatorLambda([TensorKey.ppo_surrogates_tensor], lambda x: -torch.mean(x))

    actor_optimizer = RAdam(params=actor.parameters(), lr=3e-4)
    actor_updater: ModuleUpdater = ModuleUpdaterOptimizer(actor_optimizer)
    critic_optimizer = RAdam(params=critic.parameters(), lr=3e-4)
    critic_updater: ModuleUpdater = ModuleUpdaterOptimizer(critic_optimizer)

    actor_trainee = Trainee([actor], actor_updater, actor_tensor_inserter, actor_loss_calculator, 10)
    critic_trainee = Trainee([critic], critic_updater, critic_tensor_inserter, critic_loss_calculator, 10)

    trainer = RLTrainer(sample_collector, [critic_trainee, actor_trainee], 100000, 128)
    trainer.train(module_dict)
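
For context, the actor loss above is the negative mean of a PPO surrogate computed from new log-probabilities, stored log-probabilities, and advantages. A common clipped form that a helper like get_ppo_surrogate_tensor typically computes is sketched below; the clipping constant 0.2 is an assumption, not taken from this code.

import torch


def clipped_ppo_surrogate(new_log_probs, old_log_probs, advantages, clip_eps=0.2):
    # Standard clipped PPO objective: take the elementwise minimum of the
    # unclipped and clipped probability-ratio terms.
    ratio = torch.exp(new_log_probs - old_log_probs)
    clipped_ratio = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps)
    return torch.min(ratio * advantages, clipped_ratio * advantages)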
Example 8
    def make_env(args, rank, num_agents=5, active_agents=3, freeze_multiplier=80, team_mode="guard",
                 reward_scheme="sparse", seed=100):
        def _init():
            return make_open_env(
                args, args['num_env_steps'], num_agents, active_agents, freeze_multiplier,
                team_mode=team_mode, reward_scheme=reward_scheme, seed=int(seed + 1000 * rank)
            )

        return _init


    num_players_train = args['num_players_train']
    num_players_test = args['num_players_test']

    env = AsyncVectorEnv([
        make_env(args, i, active_agents=num_players_train, seed=args['seed'], reward_scheme=args["reward_type"]) for i
        in range(args['num_envs'])
    ])

    env_eval = AsyncVectorEnv([
        make_env(args, i, active_agents=num_players_train, seed=args['eval_init_seed'],
                 reward_scheme=args["reward_type"]) for i in range(args['num_envs'])
    ])

    env_eval2 = AsyncVectorEnv([
        make_env(args, i, active_agents=num_players_test, seed=args['eval_init_seed'],
                 reward_scheme=args["reward_type"]) for i in range(args['num_envs'])
    ])

    today = date.today()
    d1 = today.strftime("%d_%m_%Y")
Example 9
from gym.vector import AsyncVectorEnv

from agents.dqn.dqn import DQN
from agents.ppo.ppo import PPO
from utils.runner import train, make_env

if __name__ == "__main__":
    env_fns = [make_env() for _ in range(8)]
    env = AsyncVectorEnv(env_fns)

    agent = PPO(env.single_observation_space, env.single_action_space)

    returns = train(agent, env, 3000000, 500)
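
A vectorized env like the one above batches everything along a leading axis of size num_envs: single_observation_space and single_action_space describe one sub-environment, while observation_space and action_space describe the whole batch. A rough random-action rollout, assuming the pre-0.26 gym API (reset returns only observations, step returns a 4-tuple) used throughout these examples:

import gym
from gym.vector import AsyncVectorEnv

env = AsyncVectorEnv([lambda: gym.make("CartPole-v1") for _ in range(8)])
obs = env.reset()                                # shape: (8,) + single observation shape
for _ in range(100):
    actions = env.action_space.sample()          # one action per sub-environment
    obs, rewards, dones, infos = env.step(actions)
    # sub-environments that finish an episode are reset automatically
env.close()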
Example 10
        def test_config(n_envs, n_episodes, base_env, use_torch, use_logger,
                        return_info, retry):
            config = 'n_envs' + str(n_envs) + '-n_eps' + str(n_episodes) \
                    + '-base_env' + str(base_env) \
                    + '-torch' + str(use_torch) + '-logger' + str(use_logger) \
                    + '-info' + str(return_info)
            if isinstance(base_env, str):
                env = vec_env = gym.vector.make(base_env, num_envs=n_envs)
            else:

                def make_env():
                    env = base_env()
                    return env

                env_fns = [make_env for _ in range(n_envs)]
                env = vec_env = AsyncVectorEnv(env_fns)

            if use_logger:
                env = envs.Logger(env, interval=5, logger=self.logger)

            if use_torch:
                env = envs.Torch(env)
                policy = lambda x: ch.totensor(vec_env.action_space.sample())
            else:
                policy = lambda x: vec_env.action_space.sample()

            if return_info:
                agent = lambda x: (policy(x), {
                    'policy': policy(x)[0],
                    'act': policy(x)
                })
            else:
                agent = policy

            # Gather experience
            env = envs.Runner(env)
            replay = env.run(agent, episodes=n_episodes)
            if retry:
                replay = env.run(agent, episodes=n_episodes)

            # Pre-compute some shapes
            shape = (len(replay), )
            state_shape = vec_env.observation_space.sample().shape[1:]
            action_shape = np.array(vec_env.action_space.sample())[0].shape
            if len(action_shape) == 0:
                action_shape = (1, )
            done_shape = (1, )

            # Check shapes
            states = replay.state()
            self.assertEqual(states.shape, shape + state_shape, config)
            actions = replay.action()
            self.assertEqual(actions.shape, shape + action_shape, config)
            dones = replay.done()
            self.assertEqual(dones.shape, shape + done_shape, config)
            if return_info:
                policies = replay.policy()
                self.assertEqual(policies.shape, shape + action_shape, config)
                acts = replay.act()
                self.assertEqual(acts.shape,
                                 (len(replay), n_envs) + action_shape, config)
Example 11
                                 num_agents,
                                 active_agents,
                                 freeze_multiplier,
                                 team_mode=team_mode,
                                 reward_scheme=reward_scheme,
                                 seed=int(seed + 1000 * rank))

        return _init

    num_players_train = args['num_players_train']
    num_players_test = args['num_players_test']

    env = AsyncVectorEnv([
        make_env(args,
                 i,
                 active_agents=num_players_train,
                 seed=args['seed'],
                 reward_scheme="sparse") for i in range(8)
    ])

    args["device"] = "cpu"
    writer = None

    for idx in range(101):
        agent = MRFAgent(args=args, writer=writer, added_u_dim=0)
        load_dir = args['loading_dir'] + str(idx)
        agent.load_parameters(load_dir)

        obs_list = []

        agent.reset()