def main():
    env = gym.make("PongNoFrameskip-v4")
    # Remove ScaledFloatFrame wrapper, re-use if needed.
    from atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame
    env = ScaledFloatFrame(wrap_dqn(env))

    model = cnn_to_dist(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
                        hiddens=[256],
                        num_atoms=50,
                        dueling=True)
def main():
    env = gym.make("PongNoFrameskip-v4")
    # Remove ScaledFloatFrame wrapper, re-use if needed.
    from atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame
    env = ScaledFloatFrame(wrap_dqn(env))

    # Previous non-distributional DQN setup, kept for reference:
    # model = cnn_to_mlp(
    #     convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
    #     hiddens=[256],
    #     dueling=True
    # )
    # act = learn(
    #     env,
    #     q_func=model,
    #     lr=1e-4,
    #     max_timesteps=2000000,
    #     buffer_size=10000,
    #     exploration_fraction=0.1,
    #     exploration_final_eps=0.01,
    #     train_freq=4,
    #     learning_starts=10000,
    #     target_network_update_freq=1000,
    #     gamma=0.99,
    #     prioritized_replay=False
    # )

    # Distributional variant: the network outputs num_atoms probabilities per action.
    num_atoms = 51
    model = cnn_to_dist(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[512],
        num_atoms=num_atoms,
        dueling=False
    )

    act = dist_learn(
        env,
        q_dist_func=model,
        num_atoms=num_atoms,
        V_max=10.0,
        lr=1e-4,
        max_timesteps=20000000,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=False
    )
    act.save("pong_model.pkl")
    env.close()
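# --- Hedged sketch, not part of the original script: how a categorical value
# distribution (as produced by a num_atoms-output head like cnn_to_dist above)
# collapses back to scalar Q-values. The support range is an assumption here:
# V_min is taken as -V_max to mirror the V_max=10.0 argument, and the helper
# name `expected_q_values` is illustrative only.

import numpy as np


def expected_q_values(atom_probs, v_min=-10.0, v_max=10.0):
    """Collapse per-action atom probabilities (shape [num_actions, num_atoms])
    into scalar Q-values by taking the expectation over the fixed support."""
    num_atoms = atom_probs.shape[-1]
    support = np.linspace(v_min, v_max, num_atoms)   # atoms z_1 .. z_N
    return (atom_probs * support).sum(axis=-1)       # Q(s, a) = sum_i z_i * p_i(s, a)


# Example: uniform distributions give the midpoint of the support for every action.
probs = np.full((6, 51), 1.0 / 51)                   # 6 Pong actions, 51 atoms
print(expected_q_values(probs))                      # ~[0. 0. 0. 0. 0. 0.]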
def main():
    # env = gym.make("CartPole-v0")
    # env = gym.make("CartPole-v1")
    # env = gym.make("Acrobot-v1")
    # env = gym.make("MountainCar-v0")
    # env = gym.make("FrozenLake-v0")
    # env = gym.make("FrozenLake8x8-v0")
    env = gym.make("PongNoFrameskip-v4")
    env = ScaledFloatFrame(wrap_dqn(env))

    # robShape = (2,)
    # robShape = (3,)
    # robShape = (200,)
    # robShape = (16,)
    # robShape = (64,)
    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)
        # return U.BatchInput(robShape, name=name)

    # # these params are specific to mountaincar
    # def getOneHotObs(obs):
    #     obsFraction = (obs[0] + 1.2) / 1.8
    #     idx1 = np.int32(np.trunc(obsFraction * 100))
    #     obsFraction = (obs[1] + 0.07) / 0.14
    #     idx2 = np.int32(np.trunc(obsFraction * 100))
    #     ident = np.identity(100)
    #     return np.r_[ident[idx1, :], ident[idx2, :]]

    # these params are specific to frozenlake
    def getOneHotObs(obs):
        # ident = np.identity(16)
        ident = np.identity(64)
        return ident[obs, :]

    # model = models.mlp([32])
    # model = models.mlp([64])
    # model = models.mlp([64], layer_norm=True)
    # model = models.mlp([16, 16])
    model = models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
                              hiddens=[256],
                              dueling=True)

    # parameters
    q_func = model
    # lr = 1e-3
    lr = 1e-4
    max_timesteps = 2000000
    # max_timesteps = 100000
    # max_timesteps = 50000
    # buffer_size = 50000
    buffer_size = 100000
    exploration_fraction = 0.1
    # exploration_fraction = 0.3
    exploration_final_eps = 0.01
    # exploration_final_eps = 0.02
    # exploration_final_eps = 0.1
    # train_freq = 1
    train_freq = 4
    batch_size = 32
    print_freq = 10
    checkpoint_freq = 10000
    # learning_starts = 1000
    learning_starts = 10000
    # gamma = 1.0
    gamma = 0.99
    # target_network_update_freq = 500
    target_network_update_freq = 1000
    # prioritized_replay = False
    prioritized_replay = True
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    num_cpu = 16

    # # try mountaincar w/ different input dimensions
    # inputDims = [50, 2]

    sess = U.make_session(num_cpu)
    sess.__enter__()

    act, train, update_target, debug = build_graph.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    # obs = np.r_[env.reset(), 0]
    # obs = getOneHotObs(obs)

    # with tempfile.TemporaryDirectory() as td:
    model_saved = False
    # model_file = os.path.join(td, "model")

    for t in range(max_timesteps):
        # Take action and update exploration to the newest value
        action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
        new_obs, rew, done, _ = env.step(action)
        # new_obs = getOneHotObs(new_obs)
        # new_obs = np.r_[new_obs, 0]

        # Store transition in the replay buffer.
        replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            # obs = getOneHotObs(obs)
            # obs = np.r_[obs, 0]
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
            td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network periodically.
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            # if done:
            print("steps: " + str(t) +
                  ", episodes: " + str(num_episodes) +
                  ", mean 100 episode reward: " + str(mean_100ep_reward) +
                  ", % time spent exploring: " + str(int(100 * exploration.value(t))))
            # if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            #     logger.record_tabular("steps", t)
            #     logger.record_tabular("episodes", num_episodes)
            #     logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
            #     logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
            #     logger.dump_tabular()

    plt.plot(episode_rewards)
    plt.show()
    sess.close()  # close the TF session entered via sess.__enter__() above
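# --- Hedged sketch, not part of the original script: the epsilon decay the loop
# above relies on. This is a minimal standalone re-implementation of a linear
# schedule, not the library's own LinearSchedule class; it decays epsilon from
# 1.0 to exploration_final_eps over the first exploration_fraction * max_timesteps
# steps and then holds it constant.

def linear_epsilon(t, schedule_timesteps, initial_p=1.0, final_p=0.01):
    """Linearly interpolate from initial_p to final_p, then stay at final_p."""
    fraction = min(float(t) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)


# With the settings above (exploration_fraction=0.1, max_timesteps=2000000):
schedule_timesteps = int(0.1 * 2000000)
print(linear_epsilon(0, schedule_timesteps))        # 1.0
print(linear_epsilon(100000, schedule_timesteps))   # 0.505
print(linear_epsilon(500000, schedule_timesteps))   # 0.01 (floor reached)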
#!/usr/bin/env python
import gym
import numpy as np

from atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame


def wrap_train(env):
    from atari_wrappers import wrap_deepmind, FrameStack
    env = wrap_deepmind(env, episode_life=False, clip_rewards=False)
    env = FrameStack(env, 4)
    return env


env = gym.make("PongNoFrameskip-v4")
env = ScaledFloatFrame(wrap_dqn(env))
# env = wrap_train(env)

obs = env.reset()
print(env.observation_space)
print(env.action_space)
print(len(obs), len(obs[0]), len(obs[0][0]))

action = env.action_space.sample()
print(action)
# print(len(observation))

# for _ in range(1000):
#     # env.render()
#     action = env.action_space.sample()  # your agent here (this takes random actions)
#     observation, reward, done, info = env.step(action)
#     print(action)
#     if done:
#         observation = env.reset()
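# --- Hedged sketch, not part of the original script: a runnable version of the
# random-action loop commented out above, resetting the environment when an
# episode ends. The step count of 1000 is an arbitrary illustrative choice.
total_reward = 0.0
observation = env.reset()
for _ in range(1000):
    # env.render()  # uncomment to watch the random agent play
    action = env.action_space.sample()  # random policy
    observation, reward, done, info = env.step(action)
    total_reward += reward
    if done:
        print("episode reward:", total_reward)
        total_reward = 0.0
        observation = env.reset()
env.close()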