def main(): env = gym.make("PongNoFrameskip-v4") env = ScaledFloatFrame(wrap_dqn(env)) model = distdeepq.models.cnn_to_dist_mlp(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[256], dueling=False) act = distdeepq.learn( env, p_dist_func=model, lr=1e-4, max_timesteps=2000000, # max_timesteps=100000, buffer_size=10000, exploration_fraction=0.1, exploration_final_eps=0.01, train_freq=4, learning_starts=10000, target_network_update_freq=1000, gamma=0.99, prioritized_replay=False, dist_params={ 'Vmin': -10, 'Vmax': 10, 'nb_atoms': 51 }) act.save("pong_model.pkl") env.close()
def main(): env = gym.make("PongNoFrameskip-v4") env = ScaledFloatFrame(wrap_dqn(env)) act = deepq.load("pong_model.pkl") while True: obs, done = env.reset(), False episode_rew = 0 while not done: env.render() obs, rew, done, _ = env.step(act(obs[None])[0]) episode_rew += rew print("Episode reward", episode_rew)
def main(): env = gym.make("PongNoFrameskip-v4") env = ScaledFloatFrame(wrap_dqn(env)) model = deepq.models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[256], dueling=True) act = simple.learn(env, q_func=model, lr=1e-4, max_timesteps=200000, buffer_size=10000, exploration_fraction=0.1, exploration_final_eps=0.01, train_freq=4, learning_starts=10000, target_network_update_freq=1000, gamma=0.99, prioritized_replay=True, tf_log_dir='./log') act.save("pong_model.pkl") env.close()
def main(): env = gym.make("PongNoFrameskip-v4") env = ScaledFloatFrame(wrap_dqn(env)) model = deepq.models.cnn_to_mlp( convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[256], dueling=True ) act = deepq.learn( env, q_func=model, lr=1e-4, max_timesteps=2000000, buffer_size=10000, exploration_fraction=0.1, exploration_final_eps=0.01, train_freq=4, learning_starts=10000, target_network_update_freq=1000, gamma=0.99, prioritized_replay=True ) act.save("pong_model.pkl") env.close()
def main(): env = gym.make("PongNoFrameskip-v4") env = ScaledFloatFrame(wrap_dqn(env)) act = DeepqWithGaze.load("pong_model.pkl") while True: obs, done = env.reset(), False episode_rew = 0 while not done: env.render() obs, rew, done, _ = env.step(act(obs[None])[0]) episode_rew += rew print("Episode reward", episode_rew)
def main(): env = gym.make("BreakoutNoFrameskip-v4") env = ScaledFloatFrame(wrap_dqn(env)) agent = DQN(env) agent.update_target() episodes_rewards = [0] * 100 avg_rewards = [] skip_rewards = [] step_num = 0 for episode in range(EPISODE): goal = 0 img_buf = deque() state = env.reset() while True: action = agent.egreedy_action(state) next_state, reward, done, _ = env.step(action) # env.render() # time.sleep(0.01) agent.perceive(state, action, reward, next_state, done, step_num) goal += reward step_num += 1 state = next_state if done: episodes_rewards.pop(0) episodes_rewards.append(goal) break # print "Current reward:", goal," Step number:", step_num print("Episode: ", episode, " Last 100 episode average reward: ", np.average(episodes_rewards), " Toal step number: ", step_num, " eps: ", agent.epsilon) if step_num > 2000000: break if episode % 50 == 0: skip_rewards.append(goal) if episode % 100 == 0: avg_rewards.append(np.average(episodes_rewards)) out_file = open("avg_rewards.pkl",'wb') out_file1 = open("skip_rewards.pkl",'wb') pickle.dump(avg_rewards, out_file) pickle.dump(skip_rewards, out_file1) out_file.close() out_file1.close() agent.saver.save(agent.session, 'saved_networks/' + 'network' + '-dqn', global_step=episode) env.close()
def play(): env = gym.make("BreakoutNoFrameskip-v4") env = ScaledFloatFrame(wrap_dqn(env)) agent = DQN(env) for episode in range(TEST): goal = 0 step_num = 0 state = env.reset() while True: action = agent.action(state) next_state, reward, done, _ = env.step(action) step_num += 1 env.render() time.sleep(0.01) goal += reward state = next_state if done or step_num > MAX_STEP_PER_EPISODE: print("Episode: ", episode, " Total reward: ", goal) break
#import sys #sys.path.append("../gym") #import gym import gym from baselines import deepq from baselines.common.atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame # Create the Baseline game environment # TRAIN env = gym.make("PongNoFrameskip-v4") env = ScaledFloatFrame(wrap_dqn(env)) model = deepq.models.cnn_to_mlp( convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[256], dueling=True ) act = deepq.learn( env, q_func=model, lr=1e-4, max_timesteps=2000000, buffer_size=10000, exploration_fraction=0.1, exploration_final_eps=0.01, train_freq=4, learning_starts=10000, target_network_update_freq=1000,
def train():
    from linear_schedule import Linear

    ledger = defaultdict(lambda: MovingAverage(Reporting.reward_average))
    M.config(file=os.path.join(RUN.log_directory, RUN.log_file))
    M.diff()
    with U.make_session(RUN.num_cpu), Logger(RUN.log_directory) as logger, \
            contextify(gym.make(G.env_name)) as env:
        env = ScaledFloatFrame(wrap_dqn(env))
        if G.seed is not None:
            env.seed(G.seed)
        logger.log_params(G=vars(G), RUN=vars(RUN), Reporting=vars(Reporting))
        inputs = TrainInputs(action_space=env.action_space, observation_space=env.observation_space)
        trainer = QTrainer(inputs=inputs, action_space=env.action_space,
                           observation_space=env.observation_space)
        if G.prioritized_replay:
            replay_buffer = PrioritizedReplayBuffer(size=G.buffer_size, alpha=G.alpha)
        else:
            replay_buffer = ReplayBuffer(size=G.buffer_size)

        class schedules:
            # note: it is important to have this start from the beginning.
            eps = Linear(G.n_timesteps * G.exploration_fraction, 1, G.final_eps)
            if G.prioritized_replay:
                beta = Linear(G.n_timesteps - G.learning_start, G.beta_start, G.beta_end)

        U.initialize()
        trainer.update_target()
        x = np.array(env.reset())
        ep_ind = 0
        M.tic('episode')
        for t_step in range(G.n_timesteps):
            # schedules
            eps = 0 if G.param_noise else schedules.eps[t_step]
            if G.prioritized_replay:
                beta = schedules.beta[t_step - G.learning_start]

            x0 = x
            M.tic('sample', silent=True)
            (action, *_), action_q, q = trainer.runner.act([x], eps)
            x, rew, done, info = env.step(action)
            ledger['action_q_value'].append(action_q.max())
            ledger['action_q_value/mean'].append(action_q.mean())
            ledger['action_q_value/var'].append(action_q.var())
            ledger['q_value'].append(q.max())
            ledger['q_value/mean'].append(q.mean())
            ledger['q_value/var'].append(q.var())
            ledger['timing/sample'].append(M.toc('sample', silent=True))
            # note: adding the sample to the buffer is identical between the prioritized and the standard replay strategy.
            replay_buffer.add(s0=x0, action=action, reward=rew, s1=x, done=float(done))

            logger.log(t_step, {
                'q_value': ledger['q_value'].latest,
                'q_value/mean': ledger['q_value/mean'].latest,
                'q_value/var': ledger['q_value/var'].latest,
                'q_value/action': ledger['action_q_value'].latest,
                'q_value/action/mean': ledger['action_q_value/mean'].latest,
                'q_value/action/var': ledger['action_q_value/var'].latest
            }, action=action, eps=eps, silent=True)
            if G.prioritized_replay:
                logger.log(t_step, beta=beta, silent=True)

            if done:
                ledger['timing/episode'].append(M.split('episode', silent=True))
                ep_ind += 1
                x = np.array(env.reset())
                ledger['rewards'].append(info['total_reward'])
                silent = (ep_ind % Reporting.print_interval != 0)
                logger.log(t_step, timestep=t_step, episode=green(ep_ind),
                           total_reward=ledger['rewards'].latest,
                           episode_length=info['timesteps'], silent=silent)
                logger.log(t_step, {
                    'total_reward/mean': yellow(ledger['rewards'].mean, lambda v: f"{v:.1f}"),
                    'total_reward/max': yellow(ledger['rewards'].max, lambda v: f"{v:.1f}"),
                    "time_spent_exploring": default(eps, percent),
                    "timing/episode": green(ledger['timing/episode'].latest, sec),
                    "timing/episode/mean": green(ledger['timing/episode'].mean, sec),
                }, silent=silent)
                try:
                    logger.log(t_step, {
                        "timing/sample": default(ledger['timing/sample'].latest, sec),
                        "timing/sample/mean": default(ledger['timing/sample'].mean, sec),
                        "timing/train": default(ledger['timing/train'].latest, sec),
                        "timing/train/mean": green(ledger['timing/train'].mean, sec),
                        "timing/log_histogram": default(ledger['timing/log_histogram'].latest, sec),
                        "timing/log_histogram/mean": default(ledger['timing/log_histogram'].mean, sec)
                    }, silent=silent)
                    if G.prioritized_replay:
                        logger.log(t_step, {
                            "timing/update_priorities": default(ledger['timing/update_priorities'].latest, sec),
                            "timing/update_priorities/mean": default(ledger['timing/update_priorities'].mean, sec)
                        }, silent=silent)
                except Exception as e:
                    pass
                if G.prioritized_replay:
                    logger.log(t_step, {"replay_beta": default(beta, lambda v: f"{v:.2f}")}, silent=silent)

            # note: learn here.
            if t_step >= G.learning_start and t_step % G.learn_interval == 0:
                if G.prioritized_replay:
                    experiences, weights, indices = replay_buffer.sample(G.replay_batch_size, beta)
                    logger.log_histogram(t_step, weights=weights)
                else:
                    experiences, weights = replay_buffer.sample(G.replay_batch_size), None
                M.tic('train', silent=True)
                x0s, actions, rewards, x1s, dones = zip(*experiences)
                td_error_val, loss_val = trainer.train(s0s=x0s, actions=actions, rewards=rewards,
                                                       s1s=x1s, dones=dones, sample_weights=weights)
                ledger['timing/train'].append(M.toc('train', silent=True))
                M.tic('log_histogram', silent=True)
                logger.log_histogram(t_step, td_error=td_error_val)
                ledger['timing/log_histogram'].append(M.toc('log_histogram', silent=True))
                if G.prioritized_replay:
                    M.tic('update_priorities', silent=True)
                    new_priorities = np.abs(td_error_val) + eps
                    replay_buffer.update_priorities(indices, new_priorities)
                    ledger['timing/update_priorities'].append(M.toc('update_priorities', silent=True))

            if t_step % G.target_network_update_interval == 0:
                trainer.update_target()

            if t_step % Reporting.checkpoint_interval == 0:
                U.save_state(os.path.join(RUN.log_directory, RUN.checkpoint))
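The epsilon and beta annealing above uses a Linear schedule object indexed by the current timestep (schedules.eps[t_step]). The linear_schedule module is not shown here; a minimal sketch of a compatible class, assuming it interpolates from the start value to the end value over the given horizon and stays at the end value afterwards:

class Linear:
    """Linearly interpolate from `start` to `end` over `horizon` steps,
    clamping to `end` afterwards. Indexed as schedule[t]."""

    def __init__(self, horizon, start, end):
        self.horizon, self.start, self.end = horizon, start, end

    def __getitem__(self, t):
        fraction = min(max(t, 0) / self.horizon, 1.0)
        return self.start + fraction * (self.end - self.start)

# e.g. eps = Linear(100000, 1.0, 0.01); eps[0] == 1.0, eps[100000] == 0.01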