Example #1
import retro
import numpy as np

# PPO, convertToAction and plot_learning_curve come from the surrounding project.


def main():
    epochs = int(1e8)
    env = retro.make('ContraForce-Nes')
    n_actions = env.action_space.n
    # the agent picks one of the 2**n_actions button combinations each step
    agent = PPO(input_dims=(224, 240), n_actions=2**n_actions)
    n_steps = 0
    N = 100
    learn_iters = 0
    score_history = []
    for epoch in range(epochs):
        observation = env.reset()
        done = False
        score = 0
        while not done:
            env.render()
            action, prob, value = agent.choose_action(observation)
            print(action)
            observation_, reward, done, info = env.step(
                convertToAction(action))
            agent.remember(observation, action, value, prob, reward, done)
            score += reward
            n_steps += 1
            if n_steps % N == 0:
                agent.learn()
                learn_iters += 1
            observation = observation_
        score_history.append(score)
        avg_score = np.mean(score_history[-100:])
        if epoch % 10 == 0:
            print('epoch: {}, score: {:.2f}, avg_score: {:.2f}'.format(
                epoch, score, avg_score))
    x = [i + 1 for i in range(len(score_history))]
    plot_learning_curve(x, score_history, 'MountainCar1.png')
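The snippet above relies on a project-specific convertToAction helper that maps the agent's discrete action index onto the MultiBinary button array that gym-retro's env.step() expects. A minimal sketch of such a helper, assuming the index simply encodes the button combination bit by bit (the encoding and the default button count are assumptions, not taken from the original code):

def convertToAction(index, n_buttons=9):
    # Decode a discrete action index into the MultiBinary button vector that
    # gym-retro's env.step() expects; n_buttons should equal env.action_space.n
    # (9 is assumed here for a NES game).
    return [(index >> bit) & 1 for bit in range(n_buttons)]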
Example #2
    def __init__(self,
                 action_space,
                 observation_shape,
                 n_envs,
                 n_agents,
                 combine_states=False,
                 gamma=0.99,
                 horizon=128,
                 gae_lambda=0.95,
                 epochs=12,
                 epsilon=0.2,
                 learning_rate=0.0001):

        print("MultiPPO agent:")
        print("\tNumber of sub-agents: {}".format(n_agents))
        self._n_envs = n_envs
        print("\tNumber of environments: {}".format(self._n_envs))

        # State preprocessor
        state_processor = unite_states if combine_states else noop_states
        self._state_preprocessor, observation_shape = state_processor(
            n_agents,
            n_envs,
            observation_shape,
        )

        # Create agents
        assert len(observation_shape) == 1
        self._agents = [
            PPO(
                action_space,
                observation_shape,
                n_envs=n_envs,
                gamma=gamma,
                horizon=horizon,
                gae_lambda=gae_lambda,
                epochs=epochs,
                epsilon=epsilon,
                learning_rate=learning_rate,
            ) for _ in range(n_agents)
        ]
        self._action_space = action_space
        self._observation_shape = observation_shape
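The MultiPPO constructor above depends on two preprocessor factories, noop_states and unite_states, that are not shown. A minimal sketch under the behaviour implied by the call site, i.e. each factory returns a (preprocessor, observation_shape) pair, with unite_states concatenating the per-agent observations into one joint flat vector (the names are reused from the snippet; the bodies are assumptions):

import numpy as np

def noop_states(n_agents, n_envs, observation_shape):
    # Keep per-agent observations exactly as they are.
    return (lambda states: states), observation_shape

def unite_states(n_agents, n_envs, observation_shape):
    # Give every sub-agent the concatenation of all agents' flat observations.
    combined_shape = (observation_shape[0] * n_agents,)
    def preprocess(states):
        # states: (n_agents, n_envs, obs_dim) -> (n_agents, n_envs, obs_dim * n_agents)
        joint = np.concatenate(states, axis=-1)       # (n_envs, obs_dim * n_agents)
        return np.stack([joint] * n_agents, axis=0)   # same joint view for each agent
    return preprocess, combined_shape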
Example #3
# A2C, PPO and DQN are imported elsewhere in this project.
def pick_algorithm(cfg, **kwargs):
    if cfg['game']['algorithm'] == 'a2c':
        return A2C(
            env=kwargs['env'],
            num_agents=kwargs['agents'],
            gamma=0.99,
            hidden_size=2**6,
            l_rate=1e-4,
            n_inputs_n=[kwargs['env'].flatten_observation_space_n[j].shape[0]
                        for j in range(kwargs['agents'])],
            n_outputs=kwargs['env'].action_space.n,
        )
    elif cfg['game']['algorithm'] == 'ppo':
        return PPO(
            env=kwargs['env'],
            num_agents=kwargs['agents'],
            gamma=0.99,
            hidden_size=2**6,
            l_rate=1e-4,
            n_inputs_n=[kwargs['env'].flatten_observation_space_n[j].shape[0]
                        for j in range(kwargs['agents'])],
            n_outputs=kwargs['env'].action_space.n,
            betas=(0.9, 0.999),
            K_epochs=4,
            eps_clip=0.2,
            update_timestep=round(kwargs['env'].cfg['duration'] * 0.04),
        )
    elif cfg['game']['algorithm'] == 'dqn':
        return DQN(
            env=kwargs['env'],
            num_agents=kwargs['agents'],
            gamma=0.99,
            epsilon=1.0,
            epsilon_min=0.1,
            epsilon_max=1.0,
            batch_size=32,
            n_inputs_n=[kwargs['env'].flatten_observation_space_n[j].shape[0]
                        for j in range(kwargs['agents'])],
            n_outputs=kwargs['env'].action_space.n,
        )
    else:
        raise NotImplementedError(
            f"Given algorithm (`{cfg['game']['algorithm']}`) is not implemented yet!")
Example #4
import json

import gym
import numpy as np
import torch

# QLearning, Reinforce, ActorCritic, PPO, MultiPPO, StateNormWrapper and
# HeuristicAgentLunarLander are defined elsewhere in this project.


def create_agent(env, args):

    action_space = env.action_space
    observation_shape = env.observation_space.shape
    print("Action space: {}".format(action_space))
    print("Observation space: {}".format(env.observation_space))

    agent_type = args["agent"]
    baseline = args["baseline"]
    baseline_learning_rate = args["baseline_learning_rate"]
    gamma = args["gamma"]
    learning_rate = args["learning_rate"]

    if agent_type == "qlearning":
        return QLearning(
                action_size=action_space.n,
                observation_shape=observation_shape,
                beta_decay=args["beta_decay"],
                gamma=gamma,
                learning_rate=learning_rate,
                soft=args["soft"],
                dueling=args["dueling"],
                double=args["double"],
                noisy=args["noisy"],
                priority=args["priority"],
                replay_buffer_size=args["replay_buffer_size"],
                min_replay_buffer_size=args["min_replay_buffer_size"],
                target_update_freq=args["target_update_freq"],
                train_freq=args["train_freq"],
                tau=args["tau"],
                batch_size=args["batch_size"],
                epsilon_start=args["epsilon_start"],
                epsilon_end=args["epsilon_end"],
                epsilon_decay=args["epsilon_decay"])
    elif agent_type == "reinforce":
        return Reinforce(
                action_size=action_space.n,
                observation_shape=observation_shape,
                gamma=gamma,
                learning_rate=learning_rate,
                baseline=baseline,
                baseline_learning_rate=baseline_learning_rate)
    elif agent_type == "actor-critic":
        return ActorCritic(
                action_size=action_space.n,
                observation_shape=observation_shape,
                gamma=gamma,
                learning_rate=learning_rate)
    elif agent_type == "ppo":
        return PPO(
                action_space=action_space,
                observation_shape=observation_shape,
                n_envs=env.n_envs,
                gamma=gamma,
                horizon=args["horizon"],
                epochs=args["ppo_epochs"],
                gae_lambda=args["gae_lambda"],
                learning_rate=learning_rate)
    elif agent_type == 'multippo':
        return MultiPPO(
                action_space=action_space,
                observation_shape=observation_shape,
                n_envs=env.n_envs,
                n_agents=env.n_agents,
                gamma=gamma,
                horizon=args["horizon"],
                epochs=args["ppo_epochs"],
                gae_lambda=args["gae_lambda"],
                learning_rate=learning_rate)
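For reference, a sketch of the args dictionary that the "ppo" branch of create_agent reads; the key names are taken from the lookups above, the values are illustrative assumptions:

args = {
    "agent": "ppo",
    "baseline": None,                  # read before the branch, unused by PPO
    "baseline_learning_rate": 1e-3,    # likewise read unconditionally
    "gamma": 0.99,
    "learning_rate": 3e-4,
    "horizon": 128,
    "ppo_epochs": 12,
    "gae_lambda": 0.95,
}
agent = create_agent(env, args)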
def run(EnvName,
        rl_confs,
        mode=None,
        episodes=1000,
        t_horizon=1000,
        model_path=None,
        log_path=None):
    env = StateNormWrapper(gym.make(EnvName),
                           file_name="./rl/rl.json")  # for state normalization
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n  # discrete
    model = PPO(state_dim, action_dim, rl_confs["General"]["policy_approx"],
                rl_confs[EnvName]["learner_args"],
                **rl_confs[EnvName]["alg_confs"]).to(
                    torch.device(rl_confs[EnvName]["learner_args"]["device"]))
    print_interval = 20
    if mode == 'test':
        model.load_model(model_path)
    rewards_list = []
    for n_epi in range(episodes):
        s = env.reset()
        done = False
        reward = 0.0
        step = 0
        while not done and step < t_horizon:
            if mode == 'train':
                a, prob = model.choose_action(s)
            else:
                a = model.choose_action(s, Greedy=True)
                # a, prob=model.choose_action(s)

            s_prime, r, done, info = env.step(a)

            if mode == 'test':
                env.render()
            else:
                model.put_data(
                    (s, a, r / 100.0, s_prime, prob[a].item(), done))
                # model.put_data((s, a, r, s_prime, prob[a].item(), done))

            s = s_prime

            reward += r
            step += 1
            if done:
                break
        if mode == 'train':
            model.train_net()
            if n_epi % print_interval == 0 and n_epi != 0:
                # plot(rewards_list)
                np.save(log_path, rewards_list)
                torch.save(model.state_dict(), model_path)
                print("# of episode :{}, reward : {:.1f}, episode length: {}".
                      format(n_epi, reward, step))
        else:
            print(
                "# of episode :{}, reward : {:.1f}, episode length: {}".format(
                    n_epi, reward, step))
        rewards_list.append(reward)
    env.close()


def norm_state(env):
    # Normalize previously collected states with pre-computed statistics.
    # il_confs and state_stats are assumed to be loaded elsewhere in this project
    # (e.g. the data-collection config and saved normalization stats).
    states_data_path = il_confs["data_collect_confs"]["data_path"] + env.spec.id.split("-")[0].lower() + '/state'
    states = np.load(states_data_path + '.npy')
    mean = state_stats['mean']
    std = state_stats['std']
    states = (states - mean) / std

    np.save(states_data_path + '_norm', states)
    

if __name__ == '__main__':
    EnvName = 'CartPole-v1'
    # EnvName = 'LunarLander-v2'

    env = gym.make(EnvName)
    if EnvName == 'LunarLander-v2':  # the heuristic agent exists for LunarLander
        agent = HeuristicAgentLunarLander(env, Continuous=False)
    elif EnvName == 'CartPole-v1':  # no heuristic agent for CartPole, so use a well-trained RL agent
        filename = "./mlp/mlp_rl_train.json"
        with open(filename, "r") as read_file:
            rl_confs = json.load(read_file)  # hyperparameters for rl training
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n  # discrete
        agent = PPO(state_dim, action_dim, 'MLP', rl_confs[EnvName]["learner_args"],
                    **rl_confs[EnvName]["alg_confs"]).to(
                        torch.device(rl_confs[EnvName]["learner_args"]["device"]))
        agent.load_model(rl_confs[EnvName]["train_confs"]["model_path"])

    # collect_demo(env, agent, render=False, collect_data = False)
    norm_state(env)
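
The run() function above wraps the environment in a project-specific StateNormWrapper that loads normalization statistics from a JSON file. A minimal sketch of what such a wrapper might look like, assuming the file stores per-dimension mean and std under those key names (the file layout and key names are assumptions, not part of the original code):

import json

import gym
import numpy as np


class StateNormWrapper(gym.ObservationWrapper):
    """Normalize observations with pre-computed per-dimension statistics."""

    def __init__(self, env, file_name):
        super().__init__(env)
        with open(file_name, "r") as f:
            stats = json.load(f)  # assumed layout: {"mean": [...], "std": [...]}
        self.mean = np.asarray(stats["mean"], dtype=np.float32)
        self.std = np.asarray(stats["std"], dtype=np.float32)

    def observation(self, obs):
        return (obs - self.mean) / (self.std + 1e-8)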