def main():
    epochs = int(1e8)
    env = retro.make('ContraForce-Nes')
    n_actions = env.action_space.n          # number of controller buttons
    agent = PPO(input_dims=(224, 240), n_actions=2**n_actions)  # one discrete action per button combination
    n_steps = 0
    N = 100                                 # run a learning step every N environment steps
    learn_iters = 0
    score_history = []

    for epoch in range(epochs):
        observation = env.reset()
        done = False
        score = 0
        while not done:
            env.render()
            action, prob, value = agent.choose_action(observation)
            print(action)
            observation_, reward, done, info = env.step(convertToAction(action))
            agent.remember(observation, action, value, prob, reward, done)
            score += reward
            n_steps += 1
            if n_steps % N == 0:
                agent.learn()
                learn_iters += 1
            observation = observation_

        score_history.append(score)
        avg_score = np.mean(score_history[-100:])
        if epoch % 10 == 0:
            print('epoch: {}, score: {:.2f}, avg_score: {:.2f}'.format(
                epoch, score, avg_score))

    x = [i + 1 for i in range(len(score_history))]
    plot_learning_curve(x, score_history, 'MountainCar1.png')
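convertToAction is called in the loop above but not defined in this snippet. A minimal sketch, assuming the agent emits an integer index over all 2**n_buttons button combinations and gym-retro expects a MultiBinary 0/1 array:

def convertToAction(action_index, n_buttons=9):
    # Hypothetical helper: 9 is the typical NES button count in gym-retro, but
    # env.action_space.n above is the authoritative value.
    # Unpack bit i of the index into button i's on/off flag.
    return [(action_index >> i) & 1 for i in range(n_buttons)]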
def __init__(self, action_space, observation_shape, n_envs, n_agents,
             combine_states=False, gamma=0.99, horizon=128, gae_lambda=0.95,
             epochs=12, epsilon=0.2, learning_rate=0.0001):
    print("MultiPPO agent:")
    print("\tNumber of sub-agents: {}".format(n_agents))
    self._n_envs = n_envs
    print("\tNumber of environments: {}".format(self._n_envs))

    # State preprocessor
    state_processor = unite_states if combine_states else noop_states
    self._state_preprocessor, observation_shape = state_processor(
        n_agents,
        n_envs,
        observation_shape,
    )

    # Create agents
    assert len(observation_shape) == 1
    self._agents = [
        PPO(
            action_space,
            observation_shape,
            n_envs=n_envs,
            gamma=gamma,
            horizon=horizon,
            gae_lambda=gae_lambda,
            epochs=epochs,
            epsilon=epsilon,
            learning_rate=learning_rate,
        )
        for _ in range(n_agents)
    ]

    self._action_space = action_space
    self._observation_shape = observation_shape
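unite_states and noop_states are referenced above but not shown here. A minimal sketch of the assumed contract, where each factory returns a (preprocess_fn, observation_shape) pair and per-agent observations are flat vectors:

import numpy as np

def noop_states(n_agents, n_envs, observation_shape):
    # Each sub-agent keeps its own observation unchanged.
    return (lambda states: states), observation_shape

def unite_states(n_agents, n_envs, observation_shape):
    # Every sub-agent sees the concatenation of all agents' observations,
    # so the flat observation dimension grows by a factor of n_agents.
    def preprocess(states):
        joint = np.concatenate(states, axis=-1)
        return [joint for _ in range(n_agents)]
    return preprocess, (observation_shape[0] * n_agents,)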
def pick_algorithm(cfg, **kwargs):
    if cfg['game']['algorithm'] == 'a2c':
        return A2C(
            env=kwargs['env'],
            num_agents=kwargs['agents'],
            gamma=0.99,
            hidden_size=2**6,
            l_rate=1e-4,
            n_inputs_n=[kwargs['env'].flatten_observation_space_n[j].shape[0]
                        for j in range(kwargs['agents'])],
            n_outputs=kwargs['env'].action_space.n,
        )
    elif cfg['game']['algorithm'] == 'ppo':
        return PPO(
            env=kwargs['env'],
            num_agents=kwargs['agents'],
            gamma=0.99,
            hidden_size=2**6,
            l_rate=1e-4,
            n_inputs_n=[kwargs['env'].flatten_observation_space_n[j].shape[0]
                        for j in range(kwargs['agents'])],
            n_outputs=kwargs['env'].action_space.n,
            betas=(0.9, 0.999),
            K_epochs=4,
            eps_clip=0.2,
            update_timestep=round(kwargs['env'].cfg['duration'] * 0.04),
        )
    elif cfg['game']['algorithm'] == 'dqn':
        return DQN(
            env=kwargs['env'],
            num_agents=kwargs['agents'],
            gamma=0.99,
            epsilon=1.0,
            epsilon_min=0.1,
            epsilon_max=1.0,
            batch_size=32,
            n_inputs_n=[kwargs['env'].flatten_observation_space_n[j].shape[0]
                        for j in range(kwargs['agents'])],
            n_outputs=kwargs['env'].action_space.n,
        )
    else:
        raise NotImplementedError(
            f"Given algorithm (`{cfg['game']['algorithm']}`) is not implemented yet!")
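A usage sketch for pick_algorithm; the cfg layout and the env/agents keyword arguments are inferred from the lookups in the factory, and the environment object is assumed to expose flatten_observation_space_n, action_space, and a cfg dict with a 'duration' key:

cfg = {'game': {'algorithm': 'ppo'}}
algorithm = pick_algorithm(cfg, env=env, agents=2)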
def create_agent(env, args):
    action_space = env.action_space
    observation_shape = env.observation_space.shape
    print("Action space: {}".format(action_space))
    print("Observation space: {}".format(env.observation_space))

    agent_type = args["agent"]
    baseline = args["baseline"]
    baseline_learning_rate = args["baseline_learning_rate"]
    gamma = args["gamma"]
    learning_rate = args["learning_rate"]

    if agent_type == "qlearning":
        return QLearning(
            action_size=action_space.n,
            observation_shape=observation_shape,
            beta_decay=args["beta_decay"],
            gamma=gamma,
            learning_rate=learning_rate,
            soft=args["soft"],
            dueling=args["dueling"],
            double=args["double"],
            noisy=args["noisy"],
            priority=args["priority"],
            replay_buffer_size=args["replay_buffer_size"],
            min_replay_buffer_size=args["min_replay_buffer_size"],
            target_update_freq=args["target_update_freq"],
            train_freq=args["train_freq"],
            tau=args["tau"],
            batch_size=args["batch_size"],
            epsilon_start=args["epsilon_start"],
            epsilon_end=args["epsilon_end"],
            epsilon_decay=args["epsilon_decay"])
    elif agent_type == "reinforce":
        return Reinforce(
            action_size=action_space.n,
            observation_shape=observation_shape,
            gamma=gamma,
            learning_rate=learning_rate,
            baseline=baseline,
            baseline_learning_rate=baseline_learning_rate)
    elif agent_type == "actor-critic":
        return ActorCritic(
            action_size=action_space.n,
            observation_shape=observation_shape,
            gamma=gamma,
            learning_rate=learning_rate)
    elif agent_type == "ppo":
        return PPO(
            action_space=action_space,
            observation_shape=observation_shape,
            n_envs=env.n_envs,
            gamma=gamma,
            horizon=args["horizon"],
            epochs=args["ppo_epochs"],
            gae_lambda=args["gae_lambda"],
            learning_rate=learning_rate)
    elif agent_type == 'multippo':
        return MultiPPO(
            action_space=action_space,
            observation_shape=observation_shape,
            n_envs=env.n_envs,
            n_agents=env.n_agents,
            gamma=gamma,
            horizon=args["horizon"],
            epochs=args["ppo_epochs"],
            gae_lambda=args["gae_lambda"],
            learning_rate=learning_rate)
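An illustrative call for the "ppo" branch; the key names mirror the lookups above, but the values are placeholder assumptions rather than the project's defaults:

args = {
    "agent": "ppo",
    "baseline": None,              # unused by the PPO branch, but read unconditionally above
    "baseline_learning_rate": 1e-3,
    "gamma": 0.99,
    "learning_rate": 3e-4,
    "horizon": 128,
    "ppo_epochs": 4,
    "gae_lambda": 0.95,
}
agent = create_agent(env, args)    # env must also provide n_envs for this branch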
def run(EnvName, rl_confs, mode=None, episodes=1000, t_horizon=1000,
        model_path=None, log_path=None):
    env = StateNormWrapper(gym.make(EnvName), file_name="./rl/rl.json")  # for state normalization
    env = gym.make(EnvName)  # NOTE: this plain env replaces the wrapped one above, so normalization is not applied
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n  # discrete
    model = PPO(state_dim, action_dim, rl_confs["General"]["policy_approx"],
                rl_confs[EnvName]["learner_args"],
                **rl_confs[EnvName]["alg_confs"]).to(
                    torch.device(rl_confs[EnvName]["learner_args"]["device"]))
    print_interval = 20

    if mode == 'test':
        model.load_model(model_path)

    rewards_list = []
    for n_epi in range(episodes):
        s = env.reset()
        done = False
        reward = 0.0
        step = 0
        while not done and step < t_horizon:
            if mode == 'train':
                a, prob = model.choose_action(s)
            else:
                a = model.choose_action(s, Greedy=True)
                # a, prob = model.choose_action(s)
            s_prime, r, done, info = env.step(a)
            if mode == 'test':
                env.render()
            else:
                model.put_data(
                    (s, a, r / 100.0, s_prime, prob[a].item(), done))
                # model.put_data((s, a, r, s_prime, prob[a].item(), done))
            s = s_prime
            reward += r
            step += 1
            if done:
                break

        if mode == 'train':
            model.train_net()
            if n_epi % print_interval == 0 and n_epi != 0:
                # plot(rewards_list)
                np.save(log_path, rewards_list)
                torch.save(model.state_dict(), model_path)
                print("# of episode :{}, reward : {:.1f}, episode length: {}".format(
                    n_epi, reward, step))
        else:
            print("# of episode :{}, reward : {:.1f}, episode length: {}".format(
                n_epi, reward, step))
        rewards_list.append(reward)
    env.close()
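A usage sketch for run; the config path, model path, and log path are placeholders, and the JSON is assumed to contain the "General" section plus per-environment "learner_args" and "alg_confs" read above:

import json

with open("./mlp/mlp_rl_train.json", "r") as f:   # placeholder config path
    rl_confs = json.load(f)
run('CartPole-v1', rl_confs, mode='train', episodes=1000, t_horizon=1000,
    model_path='./model/ppo_cartpole', log_path='./log/ppo_cartpole_rewards')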
    states_data_path = il_confs["data_collect_confs"]["data_path"] + env.spec.id.split("-")[0].lower() + '/state'
    states = np.load(states_data_path + '.npy')
    mean = state_stats['mean']
    std = state_stats['std']
    states = (states - mean) / std
    np.save(states_data_path + '_norm', states)


if __name__ == '__main__':
    EnvName = 'CartPole-v1'
    # EnvName = 'LunarLander-v2'
    env = gym.make(EnvName)
    if EnvName == 'LunarLander-v2':
        # the heuristic agent exists for LunarLander
        agent = HeuristicAgentLunarLander(env, Continuous=False)
    elif EnvName == 'CartPole-v1':
        # no heuristic agent for CartPole, so use a well-trained RL agent
        filename = "./mlp/mlp_rl_train.json"
        with open(filename, "r") as read_file:
            rl_confs = json.load(read_file)  # hyperparameters for rl training
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n  # discrete
        agent = PPO(state_dim, action_dim, 'MLP', rl_confs[EnvName]["learner_args"],
                    **rl_confs[EnvName]["alg_confs"]).to(
                        torch.device(rl_confs[EnvName]["learner_args"]["device"]))
        agent.load_model(rl_confs[EnvName]["train_confs"]["model_path"])

    # collect_demo(env, agent, render=False, collect_data=False)
    norm_state(env)
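An illustrative sanity check on the output of norm_state; the path below is a placeholder and must match states_data_path + '_norm.npy' written above:

import numpy as np

norm_states = np.load('./data/cartpole/state_norm.npy')  # placeholder path
print(norm_states.mean(axis=0))  # expected to be roughly 0 per dimension
print(norm_states.std(axis=0))   # expected to be roughly 1 per dimension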