import pybullet_envs
import gym
import numpy as np
from sac_torch import Agent

if __name__ == '__main__':
    env = gym.make('InvertedPendulumBulletEnv-v0')
    # print(env.action_space.shape[0])
    agent = Agent(input_dims=env.observation_space.shape[0], env=env,
                  n_actions=env.action_space.shape[0])
    n_games = 200
    best_score = env.reward_range[0]
    score_history = []
    load_checkpoints = False
    if load_checkpoints:
        agent.load_models()
        env.render(mode='human')

    for i in range(n_games):
        observation = env.reset()
        done = False
        score = 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.remember(observation, action, reward, observation_, done)
            # if not load_checkpoints:
            agent.learn()
            observation = observation_  # advance to the next state
import numpy as np
from sac_torch import Agent             # assumed import; the snippet omits it
from simple_bandit import SimpleBandit  # assumed import; the snippet omits it

# np.random.seed(42)

def one_hot_single_value(cur_val, total_vals):
    """Converts cur_val into a one-hot vector of size total_vals."""
    x = [0] * total_vals
    x[cur_val] = 1
    return x

if __name__ == '__main__':
    env = SimpleBandit(num_states=10, num_actions=4, max_trajectory_len=20)
    n_states, n_actions = env.state_space.n, env.action_space.n
    agent = Agent(input_dims=[env.state_space.n], env=env,
                  n_actions=env.action_space.n)
    n_games = 2000
    # Uncomment this line and do a `mkdir tmp && mkdir video` if you want to
    # record video of the agent playing the game.
    # env = wrappers.Monitor(env, 'tmp/video', video_callable=lambda episode_id: True, force=True)
    filename = 'simple_bandit.png'
    figure_file = 'plots/' + filename
    best_score = env.reward_range[0]
    score_history = []
    load_checkpoint = False
    if load_checkpoint:
        agent.load_models()
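# A minimal sketch (an assumption, not the project's actual SimpleBandit) of
# the environment interface the script above relies on: discrete `state_space`
# and `action_space`, a `reward_range`, and episodes capped at
# `max_trajectory_len`. The Bernoulli reward per (state, action) pair is
# chosen only to make the sketch runnable.
import numpy as np
from gym import spaces

class SimpleBandit:
    def __init__(self, num_states, num_actions, max_trajectory_len):
        self.state_space = spaces.Discrete(num_states)
        self.action_space = spaces.Discrete(num_actions)
        self.reward_range = (0.0, 1.0)
        self.max_trajectory_len = max_trajectory_len
        # One success probability per (state, action) pair.
        self.p = np.random.rand(num_states, num_actions)
        self.state, self.t = 0, 0

    def _obs(self):
        # One-hot encoding of the current state, matching one_hot_single_value.
        return one_hot_single_value(self.state, self.state_space.n)

    def reset(self):
        self.state, self.t = self.state_space.sample(), 0
        return self._obs()

    def step(self, action):
        reward = float(np.random.rand() < self.p[self.state, action])
        self.state = self.state_space.sample()
        self.t += 1
        done = self.t >= self.max_trajectory_len
        return self._obs(), reward, done, {}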
def main():
    Hyper.init()
    # See wrapper code for the environment in atari_image.py.
    env = make_env(Constants.env_id)
    Hyper.n_actions = env.action_space.n
    shape = env.observation_space.shape
    agent = Agent(input_dims=shape, env=env, n_actions=env.action_space.n)
    filename = f"{Constants.env_id}_games{Hyper.n_games}_alpha{Hyper.alpha}.png"
    figure_file = f'plots/{filename}'
    best_ave_score = env.reward_range[0]
    best_score = 0
    score_history = []
    load_checkpoint = False
    if load_checkpoint:
        agent.load_models()
        env.render(mode='human')

    total_steps = 0
    game_id = 0
    for i in range(Hyper.n_games):
        game_id += 1
        if game_id % 20 == 0:
            # Scale up the learning rates every 20 games.
            Hyper.alpha = Hyper.alpha * 1.2
            Hyper.beta = Hyper.beta * 1.2
        observation = env.reset()
        done = False
        steps = 0
        score = 0
        while not done:
            # Sample an action from the policy.
            action = agent.choose_action(observation)
            # Sample a transition from the environment.
            new_observation, reward, done, info = env.step(action)
            steps += 1
            total_steps += 1
            # Store the transition in the replay buffer.
            agent.remember(observation, action, reward, new_observation, done)
            if not load_checkpoint:
                agent.learn()
            score += reward
            observation = new_observation
        score_history.append(score)
        avg_score = np.mean(score_history[-100:])
        if score > best_score:
            best_score = score
        if avg_score > best_ave_score:
            best_ave_score = avg_score
            if not load_checkpoint:
                agent.save_models()
        episode = i + 1
        print(f"episode {episode}: score {score}, best score {best_score}, "
              f"best avg score {best_ave_score}, trailing 100 games avg {avg_score}, "
              f"steps {steps}, total steps {total_steps}")
    print(f"total number of steps taken: {total_steps}")
    if not load_checkpoint:
        x = [i + 1 for i in range(Hyper.n_games)]
        plot_learning_curve(x, score_history, figure_file)
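# A minimal sketch (an assumption) of the plot_learning_curve helper called in
# main(); the project's own utility may differ. It plots the trailing-100-game
# running average of the scores and saves the figure to figure_file.
import matplotlib.pyplot as plt
import numpy as np

def plot_learning_curve(x, scores, figure_file):
    running_avg = np.zeros(len(scores))
    for i in range(len(running_avg)):
        # Average of the (up to) 100 most recent scores ending at game i.
        running_avg[i] = np.mean(scores[max(0, i - 99):i + 1])
    plt.plot(x, running_avg)
    plt.title('Running average of previous 100 scores')
    plt.savefig(figure_file)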
rewardmodel = RewardModel(model_cuda, n_actions, n_spaces, 1, 512, 3,
                          ensemble_size=ensemble_size)
modelloss = []
rewards = []
agent_cuda = args[5]
agent = Agent(agent_cuda, alpha=0.0003, beta=0.0003, reward_scale=2,
              env_id=env_id, input_dims=env.observation_space.shape,
              tau=0.005, env=env, batch_size=256,
              layer1_size=256, layer2_size=256, n_actions=n_actions)
horizon = int(args[9])
num_control_samples = 100
num_elite = 30
grad_steps = 10
mpc = MPCController(agent_cuda, env, horizon=horizon,
                    num_control_samples=num_control_samples,
                    num_elite=num_elite, agent=agent,
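# A hedged sketch of the cross-entropy-method (CEM) planning loop that
# num_control_samples and num_elite above typically parameterize; the actual
# MPCController in this project may differ. `evaluate` stands in for a learned
# model rolling out an action sequence and returning its predicted return
# (an assumed callable, not part of the original code).
import numpy as np

def cem_plan(evaluate, horizon, action_dim,
             num_control_samples=100, num_elite=30, n_iters=5):
    mean = np.zeros((horizon, action_dim))
    std = np.ones((horizon, action_dim))
    for _ in range(n_iters):
        # Sample candidate action sequences from the current distribution.
        samples = mean + std * np.random.randn(num_control_samples,
                                               horizon, action_dim)
        returns = np.array([evaluate(s) for s in samples])
        # Refit the distribution to the top num_elite sequences.
        elite = samples[np.argsort(returns)[-num_elite:]]
        mean, std = elite.mean(axis=0), elite.std(axis=0)
    # Receding horizon: execute only the first action of the best plan.
    return mean[0]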
import random

import gym
import matplotlib.pyplot as plt
import numpy as np
import pybullet_envs  # registers the Bullet Gym environments
import torch

from sac_torch import Agent

seed = 18095048
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

env_name = 'InvertedPendulumBulletEnv-v0'
env = gym.make(env_name)
if seed is not None:
    env.seed(seed)
state = env.reset()

# Initialize the model.
model = Agent(input_dims=env.observation_space.shape[0], env=env,
              n_actions=env.action_space.shape[0])

mean_rewards = []
for i in range(100):
    print('game ' + str(i))
    rewards = [model.train_on_env(env) for _ in range(100)]
    mean_rewards.append(np.mean(rewards))
    print("mean reward: %.3f" % np.mean(rewards))

plt.figure(figsize=[9, 6])
plt.title("Mean reward per 100 games")
plt.plot(mean_rewards)
plt.grid()
# plt.show()
plt.savefig('plots/SAC_learning_curve.png')
plt.close()
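# A hedged sketch of what the train_on_env call above is assumed to do (the
# real method lives on the Agent in sac_torch and may differ): play one
# episode, learn online from each transition, and return the episode score.
def train_on_env(agent, env):
    observation = env.reset()
    done, score = False, 0.0
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        agent.remember(observation, action, reward, observation_, done)
        agent.learn()
        score += reward
        observation = observation_
    return score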