def train_model(num_frames):
    env = make_atari('PongNoFrameskip-v4')
    env = wrap_deepmind(env, episode_life=True, frame_stack=True)
    train_results = results.results(globals())
    cumulative_frames = 0
    best_score = -50
    games = 0
    full_loss = []
    rewards = []
    while cumulative_frames < num_frames:
        state = env.reset()
        done = False
        cum_reward = 0
        cum_loss = []
        while not done:
            action = select_action(
                torch.tensor(np.array(state).reshape(-1, 4, HEIGHT, WIDTH)).to(device),
                cumulative_frames)
            next_state, reward, done, _ = env.step(action)
            memory.add(state, action, reward, next_state, done)
            state = next_state
            if cumulative_frames % TRAIN_FREQUENCY == 0 and cumulative_frames > LEARNING_STARTS:
                loss = optimize_model(cumulative_frames)
                cum_loss.append(loss)
            cum_reward += reward
            cumulative_frames += 1
            if cumulative_frames % TARGET_UPDATE == 0:
                target_net.load_state_dict(policy_net.state_dict())
        if best_score < cum_reward:
            best_score = cum_reward
        if len(cum_loss) == 0:
            full_loss.append(0)
        else:
            full_loss.append(np.mean(cum_loss))
        rewards.append(cum_reward)
        games += 1
        if games % 10 == 0:
            print("=============================================")
            print("Game: {} | Frame {}".format(games, cumulative_frames))
            print("Final reward: {}".format(cum_reward))
            print("Epsilon after: {}".format(EPSILON))
            print("Best High Score: {}".format(best_score))
            print("Avg Loss Last 100 games: {}".format(np.mean(full_loss[-100:])))
            print("Avg Reward Last 100 games: {}".format(np.mean(rewards[-100:])))
        train_results.record(cumulative_frames, games, EPSILON, cum_reward, full_loss[-1])
        if np.mean(rewards[-100:]) >= 18 and cumulative_frames > LEARNING_STARTS:
            break
    torch.save(target_net.state_dict(), PATH)
    train_results.close()
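# NOTE: optimize_model() is called above but not shown in this excerpt. The
# sketch below is a hypothetical reconstruction of a standard one-step DQN
# update, assuming the memory, policy_net, target_net, device, BATCH_SIZE,
# and GAMMA names used in train_model, plus an `optimizer` over
# policy_net.parameters(); the actual implementation may differ.
import torch
import torch.nn.functional as F

def optimize_model(step):
    states, actions, rewards, next_states, dones = memory.sample(BATCH_SIZE)
    states = torch.tensor(states, dtype=torch.float32, device=device)
    actions = torch.tensor(actions, dtype=torch.int64, device=device)
    rewards = torch.tensor(rewards, dtype=torch.float32, device=device)
    next_states = torch.tensor(next_states, dtype=torch.float32, device=device)
    dones = torch.tensor(dones, dtype=torch.float32, device=device)
    # Q(s, a) for the actions actually taken.
    q_values = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
    # Bootstrapped one-step target from the frozen target network.
    with torch.no_grad():
        next_q = target_net(next_states).max(1)[0]
        target = rewards + GAMMA * next_q * (1 - dones)
    loss = F.smooth_l1_loss(q_values, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()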
def record():
    '''
    This function generates a gif file for a single episode. This process
    may take some time. To watch the non-stop game play, please run the
    test() function.
    '''
    save_path = SAVE_DIR + ENV_NAME + "/" + AUXILIARY_TASK + "/"
    figure_path = FIGURE_DIR + ENV_NAME + "/" + AUXILIARY_TASK + "/"
    list_obs = []
    list_reward = []
    obs_mean_std = np.load(save_path + "obs_mean_std.npz")
    obs_mean = obs_mean_std["obs_mean"]
    obs_std = obs_mean_std["obs_std"]
    # Create environment.
    env = make_atari(ENV_NAME)
    obs_space = env.observation_space
    action_space = env.action_space
    # Build models.
    policy = Policy(obs_space, action_space, is_training=False)
    with tf.Session() as sess:
        # Load variables.
        saver_policy = tf.train.Saver(policy.trainable_variables)
        saver_policy.restore(sess, save_path + "policy")
        total_reward = 0
        obs = env.reset()
        while True:
            list_obs.append(obs)
            list_reward.append(total_reward)
            env.render()
            # Normalize observation.
            obs = (obs - obs_mean) / obs_std
            # Get action.
            action = sess.run(
                policy.action,
                feed_dict={policy.Obs: np.reshape(obs, [1, 1, *obs.shape])})
            action = np.squeeze(action, (0, 1))
            # Interact with the environment.
            obs, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                list_obs.append(obs)
                list_reward.append(total_reward)
                break
        env.close()
    # Record the gameplay.
    imageio.mimsave(
        figure_path + "gameplay.gif",
        [plot_obs(obs, reward) for obs, reward in zip(list_obs, list_reward)],
        fps=30)
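# NOTE: plot_obs() used by record() is not shown in this excerpt. Below is a
# minimal matplotlib-based stand-in (the signature and layout are
# assumptions): it draws the raw frame with the running reward as the title
# and returns an RGB array suitable for imageio.mimsave.
import numpy as np
import matplotlib
matplotlib.use("Agg")  # render off-screen
import matplotlib.pyplot as plt

def plot_obs(obs, reward):
    fig, ax = plt.subplots()
    ax.imshow(obs)
    ax.set_title("Reward: {}".format(reward))
    ax.axis("off")
    fig.canvas.draw()
    # Convert the rendered figure into an RGB array.
    frame = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
    frame = frame.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    plt.close(fig)
    return frame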
def test():
    '''
    This function visualizes the game play. The environment will be reset
    immediately and the game will not be recorded. To record the game
    play, please run the record() function.
    '''
    save_path = SAVE_DIR + ENV_NAME + "/" + AUXILIARY_TASK + "/"
    obs_mean_std = np.load(save_path + "obs_mean_std.npz")
    obs_mean = obs_mean_std["obs_mean"]
    obs_std = obs_mean_std["obs_std"]
    # Create environment.
    env = make_atari(ENV_NAME)
    obs_space = env.observation_space
    action_space = env.action_space
    # Build models.
    policy = Policy(obs_space, action_space, is_training=False)
    with tf.Session() as sess:
        # Load variables.
        saver_policy = tf.train.Saver(policy.trainable_variables)
        saver_policy.restore(sess, save_path + "policy")
        total_step = 0
        total_reward = 0
        while True:
            # Get observation.
            if total_step == 0:
                obs = env.reset()
            else:
                obs = obs_next
            obs = (obs - obs_mean) / obs_std
            env.render()
            # Get action.
            action = sess.run(
                policy.action,
                feed_dict={policy.Obs: np.reshape(obs, [1, 1, *obs.shape])})
            action = np.squeeze(action, (0, 1))
            # Interact with the environment.
            obs_next, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                # Reset environment.
                print("Episodic reward: ", total_reward, sep="")
                obs_next = env.reset()
                total_reward = 0
            # Update step counter.
            total_step += 1
        env.close()
def inference(episodes, model, env_name):
    env = make_atari(env_name)
    env = wrap_deepmind(env, episode_life=True, frame_stack=True)
    for _ in range(episodes):
        observation = env.reset()
        done = False
        while not done:
            time.sleep(0.05)
            env.render()
            observation = torch.tensor(
                np.array(observation).reshape(-1, 4, HEIGHT, WIDTH)).to(device)
            with torch.no_grad():
                action = model(observation).max(1)[1].item()
            observation, reward, done, _ = env.step(action)
            if reward != 0:
                print(reward)
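# Hypothetical usage of inference(): load the checkpoint written by
# train_model() and watch the agent play. The DQN constructor signature here
# is an assumption; adapt it to the actual model class.
policy = DQN(num_actions=6).to(device)  # Pong exposes 6 discrete actions
policy.load_state_dict(torch.load(PATH, map_location=device))
policy.eval()
inference(episodes=5, model=policy, env_name='PongNoFrameskip-v4')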
    dh[eph <= 0] = 0  # backprop relu
    dW1 = np.dot(dh.T, epx)
    return {'W1': dW1, 'W2': dW2}


if __name__ == '__main__':
    # hyperparameters
    H = 200  # number of hidden layer neurons
    batch_size = 10  # every how many episodes to do a param update?
    learning_rate = 1e-4
    gamma = 0.99  # discount factor for reward
    decay_rate = 0.99  # decay factor for RMSProp leaky sum of grad^2
    resume = False  # resume from previous checkpoint?  #!!!!!
    render = True  #!!!!!

    env = make_atari(sys.argv[1])
    num_actions = env.action_space.n
    env = wrap_deepmind(env)

    # model initialization
    D = 84 * 84  # input dimensionality: 84x84 grid
    if resume:
        print('RESUMING')  #!!!!!
        model = pickle.load(open('save.p', 'rb'))
    else:
        model = {}
        model['W1'] = np.random.randn(H, D) / np.sqrt(D)  # "Xavier" initialization
        model['W2'] = np.random.randn(num_actions, H) / np.sqrt(H)
    grad_buffer = {k: np.zeros_like(v)
                   for k, v in model.items()}  # buffers that add up gradients over a batch
name = "pong_nn_0" env_name = "PongNoFrameskip-v4" if len(sys.argv) > 1: env_name = sys.argv[1] name = sys.argv[2] mode_file = name + ".h5" npy = name + ".npy" frame_name = name + "_frames.npy" max_frames = int(1e7) windows = 50 learn_delay = 80000 env = make_atari(env_name) env = wrap_deepmind(env) print(env.action_space) print(env.observation_space, env.observation_space.shape) agent = DQN_NN(env.action_space.n) fs = [] avg_reward = [] best_avg_reward = -math.inf rs = deque(maxlen=windows) frames = 0 if load: try: agent.q_network.load_weights(mode_file) agent.q_target.load_weights(mode_file)
# run `python -i test.py` for testing stuff in shell
import torch
import numpy as np
import gym
from wrappers import make_atari, wrap_deepmind
from utils import LinearSchedule, Replay

env = wrap_deepmind(make_atari('BreakoutNoFrameskip-v4'))
state = env.reset()
state = np.array(state)
replay = Replay(50, 3, False)
for i in range(100):
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)
    replay.add(state, action, reward, next_state, done)
    state = next_state
s, a, r, ns, d = replay.sample_tensor()
    return p, h  # return probability of taking action 2, and hidden state


def policy_backward(eph, epdlogp):
    """ backward pass. (eph is array of intermediate hidden states) """
    dW2 = np.dot(eph.T, epdlogp).ravel()
    dh = np.outer(epdlogp, model['W2'])
    dh[eph <= 0] = 0  # backprop relu
    dW1 = np.dot(dh.T, epx)
    return {'W1': dW1, 'W2': dW2}


if __name__ == '__main__':
    start_time = time.time()
    env = make_atari("PongNoFrameskip-v4")
    env = wrap_deepmind(env)
    observation = env.reset()
    prev_x = None  # used in computing the difference frame
    xs, hs, dlogps, drs = [], [], [], []
    running_reward = None
    reward_sum = 0
    episode_number = 0
    reward_log = open('pg pv4w 1e-4 500 no' + str(sys.argv[1]) + '.txt', 'w')
    while episode_number < 500:
        # if render: env.render()
        # preprocess the observation, set input to network to be difference image
        cur_x = observation.astype(np.float32).ravel()
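# NOTE: the drs reward buffer above is normally converted into discounted
# returns before policy_backward() is called. That helper is not shown in
# this excerpt; the sketch below follows the common Karpathy-style
# implementation and uses the gamma hyperparameter defined earlier
# (resetting the running sum on nonzero reward is Pong-specific).
def discount_rewards(r):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(r.size)):
        if r[t] != 0:
            running_add = 0  # a nonzero reward marks a game boundary in Pong
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r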
def get_env():
    env = make_atari("PongNoFrameskip-v4")
    env = wrap_deepmind(env)
    env = wrap_pytorch(env)
    return env
def get_env(env_id, frame_stack):
    env = make_atari(env_id)
    env = wrap_deepmind(env, frame_stack=frame_stack)
    env = wrap_pytorch(env)
    return env
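# Quick smoke test for get_env(), assuming the baselines-style wrappers these
# snippets import: wrap_pytorch conventionally moves channels first, so a
# stacked observation should come back as (4, 84, 84).
import numpy as np

env = get_env("PongNoFrameskip-v4", frame_stack=True)
obs = env.reset()
print(np.array(obs).shape)  # e.g. (4, 84, 84) with frame stacking
obs, reward, done, info = env.step(env.action_space.sample())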
def train():
    save_path = SAVE_DIR + ENV_NAME + "/" + AUXILIARY_TASK + "/"
    figure_path = FIGURE_DIR + ENV_NAME + "/" + AUXILIARY_TASK + "/"
    # Create folders.
    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.isdir(figure_path):
        os.makedirs(figure_path)
    # Get observation space and action space.
    env = make_atari(ENV_NAME)
    obs_space = env.observation_space
    action_space = env.action_space
    # Estimate the mean and standard deviation of observations.
    env.reset()
    list_obs = []
    for _ in range(RANDOM_STEP):
        action = action_space.sample()
        obs, _, done, _ = env.step(action)
        if done:
            obs = env.reset()
        list_obs.append(obs)
    obs_mean = np.mean(list_obs, 0)
    obs_std = np.mean(np.std(list_obs, 0))
    np.savez_compressed(save_path + "obs_mean_std",
                        obs_mean=obs_mean,
                        obs_std=obs_std)
    env.close()
    del env
    # Build models.
    dynamics = Dynamics(obs_space,
                        action_space,
                        auxiliary_task=AUXILIARY_TASK,
                        is_training=True)
    policy = Policy(obs_space, action_space, is_training=True)
    variables_initializer = tf.global_variables_initializer()
    # Create environments.
    par_env = ParallelEnvironment(
        [make_atari(ENV_NAME) for _ in range(NUM_ENV)])
    with tf.Session() as sess:
        # Initialize variables.
        sess.run(variables_initializer)
        saver_dynamics = tf.train.Saver(dynamics.trainable_variables)
        saver_policy = tf.train.Saver(policy.trainable_variables)
        # Initialize the running estimate of rewards.
        sum_reward = np.zeros(NUM_ENV)
        reward_mean = 0.0
        reward_std = 1.0
        reward_count = 0
        # Initialize the counters.
        total_rollout_step = 0
        total_update_step = 0
        total_frame = 0
        # Initialize the recording of highest rewards.
        done_first = np.zeros(NUM_ENV)
        sum_ext_reward = np.zeros((NUM_ENV, ROLLOUT_STEP))
        list_highest_reward = []
        num_batch = int(np.ceil(NUM_ENV / BATCH_SIZE))
        # Each while loop performs a rollout, which first interacts with the
        # environment and then updates the network.
        while total_frame < MAX_FRAME:
            # Initialize buffers.
            buffer_obs = np.zeros(
                (NUM_ENV, ROLLOUT_STEP + 1, *obs_space.shape))
            buffer_action = np.zeros((NUM_ENV, ROLLOUT_STEP))
            buffer_ext_reward = np.zeros((NUM_ENV, ROLLOUT_STEP))
            buffer_done = np.zeros((NUM_ENV, ROLLOUT_STEP))
            buffer_log_prob = np.zeros((NUM_ENV, ROLLOUT_STEP))
            buffer_v = np.zeros((NUM_ENV, ROLLOUT_STEP + 1))
            buffer_int_reward = np.zeros((NUM_ENV, ROLLOUT_STEP))
            buffer_reward = np.zeros((NUM_ENV, ROLLOUT_STEP))
            buffer_sum_reward = np.zeros((NUM_ENV, ROLLOUT_STEP))
            buffer_adv = np.zeros((NUM_ENV, ROLLOUT_STEP))
            buffer_v_target = np.zeros((NUM_ENV, ROLLOUT_STEP))
            # Interact with the environment for ROLLOUT_STEP steps.
            for step in range(ROLLOUT_STEP):
                # Get observation.
                if total_frame == 0:
                    obs = par_env.reset()
                else:
                    obs, _, _, _ = par_env.get_last_response()
                obs = (obs - obs_mean) / obs_std
                # Sample action.
                action, log_prob, v = sess.run(
                    [policy.sampled_action, policy.sampled_log_prob, policy.v],
                    feed_dict={policy.Obs: np.expand_dims(obs, 1)})
                action = np.squeeze(action, 1)
                log_prob = np.squeeze(log_prob, 1)
                v = np.squeeze(v, 1)
                # Interact with the environment.
                obs_next, extrinsic_reward, done, _ = par_env.step(action)
                # Update buffers.
                buffer_obs[:, step] = obs
                buffer_action[:, step] = action
                buffer_ext_reward[:, step] = extrinsic_reward
                buffer_done[:, step] = done
                buffer_log_prob[:, step] = log_prob
                buffer_v[:, step] = v
                if step == ROLLOUT_STEP - 1:
                    # Extra operations for the last time step.
                    obs_next = (obs_next - obs_mean) / obs_std
                    v_next = sess.run(
                        policy.v,
                        feed_dict={policy.Obs: np.expand_dims(obs_next, 1)})
                    v_next = np.squeeze(v_next, 1)
                    buffer_obs[:, step + 1] = obs_next
                    buffer_v[:, step + 1] = v_next
                # Update frame counter.
                total_frame += NUM_ENV
            # Get the highest reward.
            for step in range(ROLLOUT_STEP):
                done_prev = done_first if step == 0 else buffer_done[:, step - 1]
                sum_ext_reward[:, step] = buffer_ext_reward[:, step] + (
                    1 - done_prev) * sum_ext_reward[:, step - 1]
            done_first[:] = buffer_done[:, ROLLOUT_STEP - 1]
            highest_reward = np.amax(sum_ext_reward)
            list_highest_reward.append(highest_reward)
            # Compute the intrinsic reward.
            buffer_int_reward[:] = sess.run(
                dynamics.intrinsic_reward,
                feed_dict={
                    dynamics.Obs: buffer_obs[:, :-1],
                    dynamics.ObsNext: buffer_obs[:, 1:],
                    dynamics.Action: buffer_action
                })
            # The total reward is a mixture of extrinsic reward and intrinsic reward.
            buffer_reward[:] = COEF_EXT_REWARD * np.clip(
                buffer_ext_reward, -1.0, 1.0) + COEF_INT_REWARD * buffer_int_reward
            # Normalize reward by dividing it by a running estimate of the
            # standard deviation of the sum of discounted rewards.
            # 1. Compute the sum of discounted rewards.
            for step in range(ROLLOUT_STEP):
                sum_reward = buffer_reward[:, step] + GAMMA * sum_reward
                buffer_sum_reward[:, step] = sum_reward
            # 2. Compute mean and standard deviation of the sum of discounted rewards.
            reward_batch_mean = np.mean(buffer_sum_reward)
            reward_batch_std = np.std(buffer_sum_reward)
            reward_batch_count = np.size(buffer_sum_reward)
            # 3. Update the running estimate of standard deviation.
            reward_mean, reward_std, reward_count = average_mean_std(
                reward_mean, reward_std, reward_count, reward_batch_mean,
                reward_batch_std, reward_batch_count)
            # 4. Normalize reward.
            buffer_reward = buffer_reward / reward_std
            # Compute advantage with GAE:
            # - gae_adv_t = sum((gamma * lambda)^l * adv_(t+l)) over l in [0, inf)
            # - adv_t = r_t + gamma * v_(t+1) - v_t
            adv = buffer_reward + GAMMA * buffer_v[:, 1:] - buffer_v[:, :-1]
            sum_adv = np.zeros(NUM_ENV)
            for step in range(ROLLOUT_STEP - 1, -1, -1):
                sum_adv = adv[:, step] + GAMMA * LAMBDA * sum_adv
                buffer_adv[:, step] = sum_adv
            # Compute target value.
            buffer_v_target[:] = buffer_adv + buffer_v[:, :-1]
            # Normalize advantage with zero mean and unit variance.
            adv_mean = np.mean(buffer_adv)
            adv_std = np.std(buffer_adv)
            buffer_adv = (buffer_adv - adv_mean) / adv_std
            # Update networks.
            for epoch in range(EPOCH):
                random_id = np.arange(NUM_ENV)
                np.random.shuffle(random_id)
                for i in range(num_batch):
                    batch_id = random_id[i * BATCH_SIZE:
                                         np.minimum(NUM_ENV, (i + 1) * BATCH_SIZE)]
                    _, auxiliary_loss, dyna_loss = sess.run(
                        [dynamics.train_op, dynamics.auxiliary_loss,
                         dynamics.dyna_loss],
                        feed_dict={
                            dynamics.Obs: buffer_obs[batch_id, :-1],
                            dynamics.ObsNext: buffer_obs[batch_id, 1:],
                            dynamics.Action: buffer_action[batch_id]
                        })
                    _, value_loss, pg_loss, entropy_loss = sess.run(
                        [policy.train_op, policy.value_loss, policy.pg_loss,
                         policy.entropy_loss],
                        feed_dict={
                            policy.Obs: buffer_obs[batch_id, :-1],
                            policy.Action: buffer_action[batch_id],
                            policy.Adv: buffer_adv[batch_id],
                            policy.VTarget: buffer_v_target[batch_id],
                            policy.LogProbOld: buffer_log_prob[batch_id]
                        })
                    total_update_step += 1
            # Update rollout step.
            total_rollout_step += 1
            # Only print the last update step.
print("Rollout Step ", total_rollout_step, ", Total Frame ", total_frame, ", Update Step ", total_update_step, ":", sep="") print(" Auxiliary Loss = ", format(auxiliary_loss, ".6f"), ", Dynamics Loss = ", format(dyna_loss, ".6f"), ", Value Loss = ", format(value_loss, ".6f"), ", Policy Loss = ", format(pg_loss, ".6f"), sep="") print(" Highest Reward = ", highest_reward, sep="") if total_rollout_step % AUTOSAVE_STEP == 0: # Save network parameters. saver_dynamics.save(sess, save_path + "dynamics") saver_policy.save(sess, save_path + "policy") # Plot reward. interval = NUM_ENV * ROLLOUT_STEP list_frame = list( range(interval, (total_rollout_step + 1) * interval, interval)) plot_reward(list_frame, list_highest_reward, figure_path) # Save network parameters. saver_dynamics.save(sess, save_path + "dynamics") saver_policy.save(sess, save_path + "policy") # Plot reward. interval = NUM_ENV * ROLLOUT_STEP list_frame = list(range(interval, total_frame + interval, interval)) plot_reward(list_frame, list_highest_reward, figure_path) par_env.close()
        if render:
            time_to_sleep = wait_time - (time.time() - start_time)
            if time_to_sleep > 0:
                time.sleep(time_to_sleep)
    return total_reward


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda",
                        default=False,
                        action="store_true",
                        help="Render on graphics card (cuda:0).")
    parser.add_argument("--env",
                        default=ENV_NAME,
                        help="Name of the environment, default=" + ENV_NAME)
    parser.add_argument("-m", "--model", help="DQN model file to load.")
    args = parser.parse_args()
    device = torch.device(GRAPHICS_CARD if args.cuda else "cpu")

    env = wrappers.make_atari(args.env)
    env = wrappers.wrap_deepmind(env, episode_life=False, clip_rewards=False,
                                 frame_stack=True)
    net = model.DQN(4, env.action_space.n).to(device)
    net.load_state_dict(torch.load(args.model))

    score = play(env, net, True, device)
    print(f"Score: {score}")
def select_action(state, number_actions):
    eps = random.random()
    if eps < epsilon:
        action = random.randrange(number_actions)
    else:
        net_input = torch.from_numpy(state).to(device, torch.float32).unsqueeze(0)
        score = net(net_input)
        action = score.max(dim=1)[1].to(torch.int64).item()
    return action


# Build environment
env = make_atari('PongNoFrameskip-v4', stack=2)
env = wrap_pytorch(env)
env = gym.wrappers.Monitor(env,
                           directory='./movie',
                           force=True,
                           video_callable=lambda x: True)
number_actions = env.action_space.n

# Separate target net & policy net
input_shape = env.reset().shape
net = QNet(input_shape, number_actions)
net.load_state_dict(torch.load(model))
net.eval().to(device)

for episode in range(10):
    state = env.reset()
def train_dqn(env_name, save_path, double=False, dueling=False, notebook=False):
    env = wrap_deepmind(make_atari(env_name))
    num_actions = env.action_space.n
    print('Num actions: {}'.format(num_actions))

    if dueling:
        model = DuelingNet(out_size=num_actions)
        target_model = DuelingNet(out_size=num_actions)
    else:
        model = DQN(out_size=num_actions)
        target_model = DQN(out_size=num_actions)
    criterion = nn.SmoothL1Loss()
    print('Created models')

    cuda = False
    if torch.cuda.is_available():
        cuda = True
        model = model.cuda()
        target_model = target_model.cuda()
        print('GPU: {}'.format(torch.cuda.get_device_name(0)))

    model.apply(init_weights)
    target_model.apply(init_weights)
    optimizer = optim.Adam(model.parameters())  # , lr=0.00001)
    print('Initialized models')

    schedule = LinearSchedule(P.start_eps, P.end_eps, P.steps_eps)
    replay = Replay(P.replay_size, P.batch_size, cuda)

    state = env.reset()
    num_updates = 0
    eps_reward = 0
    rewards = []
    losses = []

    # populate replay with random policy
    print('Populating replay')
    for i in tqdm(range(P.replay_start_size), desc='Populating replay'):
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)
        replay.add(state, action, reward, next_state, done)
        state = next_state
        if done:
            state = env.reset()

    print('Starting training')
    state = env.reset()
    last_eps = 0  # index of the first episode since the last progress print
    for i in tqdm(range(P.num_steps), desc='Total steps'):
        if schedule.choose_random():
            action = env.action_space.sample()
        else:
            model_input = torch.from_numpy(
                np.array(state)[None, :]).type(torch.FloatTensor)
            if cuda:
                model_input = model_input.cuda()
            q_values = model(model_input)
            action = int(q_values.argmax(1)[0])

        next_state, reward, done, _ = env.step(action)
        eps_reward += reward
        replay.add(state, action, reward, next_state, done)
        state = next_state

        if i % P.update_freq == 0:
            loss = compute_loss(replay, optimizer, model, target_model,
                                P.gamma, criterion, double)
            num_updates += 1
            if num_updates % P.target_update_freq == 0:
                target_model.load_state_dict(model.state_dict())

        if done:
            rewards.append(eps_reward)
            losses.append(loss.item())
            eps_reward = 0
            state = env.reset()

        if i % P.print_every == 0 and i > 0:
            print('Step: {}'.format(i))
            print('Average episode reward: {}'.format(
                sum(rewards[last_eps:]) / len(rewards[last_eps:])))
            print('Loss: {}'.format(
                sum(losses[last_eps:]) / len(losses[last_eps:])))
            last_eps = len(losses)

        if i % P.plot_every == 0 and i > 0:
            plot(i, rewards, losses, notebook, save_path)

        if i % P.save_every == 0 and i > 0:
            torch.save(model, 'experiments/{}/{}_model'.format(save_path, i))
            pickle.dump(
                losses,
                open("experiments/{}/{}_losses.p".format(save_path, i), "wb"))
            pickle.dump(
                rewards,
                open("experiments/{}/{}_rewards.p".format(save_path, i), "wb"))
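# NOTE: compute_loss() is called by train_dqn() but not shown in this
# excerpt. The sketch below shows one possible shape of that helper,
# including the double-DQN branch selected by the `double` flag; it assumes
# replay.sample() returns batched tensors on the right device. The actual
# implementation may differ.
import torch

def compute_loss(replay, optimizer, model, target_model, gamma, criterion, double):
    states, actions, rewards, next_states, dones = replay.sample()
    # Q(s, a) for the actions actually taken.
    q_values = model(states).gather(1, actions.long().unsqueeze(1)).squeeze(1)
    with torch.no_grad():
        if double:
            # Double DQN: online net picks the action, target net evaluates it.
            next_actions = model(next_states).argmax(1, keepdim=True)
            next_q = target_model(next_states).gather(1, next_actions).squeeze(1)
        else:
            next_q = target_model(next_states).max(1)[0]
        target = rewards + gamma * next_q * (1 - dones.float())
    loss = criterion(q_values, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss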
config = Config()
config.env = args.env
config.gamma = 0.99
config.epsilon = 1
config.epsilon_min = 0.01
config.eps_decay = 30000
config.frames = 2000000
config.learning_rate = 1e-4
config.max_buff = 300000
config.update_interval = 2000
config.batch_size = 32
config.print_interval = 5000
config.checkpoint_interval = 50000

# wrap the env
env = make_atari(config.env)
env = wrap_deepmind(env)
env = wrap_pytorch(env)
config.action_dim = env.action_space.n
config.state_shape = env.observation_space.shape

if args.train:
    agent = DDQNAgent(config)
    trainer = Trainer(agent, env, config)
    trainer.train()
elif args.test:
    agent = DDQNAgent(config, training=False)
    tester = Tester(agent, env, args.model_path)
    tester.test()
def main():
    env_id = "PongNoFrameskip-v4"
    env = make_atari(env_id)
    env = wrap_deepmind(env)
    env = wrap_pytorch(env)

    observation_space = env.observation_space.shape
    action_space = env.action_space.n
    model = CnnDQN(observation_space, action_space)
    if USE_CUDA:
        model = model.cuda()
    optimizer = optim.Adam(model.parameters())

    replay_buffer = ReplayBuffer(1000)
    batch_size = 32
    gamma = 0.99
    replay_initial = 100
    num_frames = 14000

    losses = []
    all_rewards = []
    x_axis1 = []
    x_axis2 = []
    episode_reward = 0

    epsilon_start = 1.0
    epsilon_final = 0.01
    epsilon_decay = 30000
    # The exploration rate decays as the number of frames grows.
    epsilon_by_frame = lambda frame_idx: epsilon_final + (
        epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

    state = env.reset()
    for frame_idx in range(1, num_frames + 1):
        # Render the game.
        env.render()
        epsilon = epsilon_by_frame(frame_idx)
        action = model.act(state, epsilon)
        next_state, reward, done, _ = env.step(action)
        replay_buffer.push(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward

        if done:
            state = env.reset()
            x_axis1.append(frame_idx)
            all_rewards.append(episode_reward)
            episode_reward = 0

        if frame_idx + 1 > replay_initial:
            loss = compute_td_loss(model, optimizer, replay_buffer, gamma, batch_size)
            x_axis2.append(frame_idx)
            losses.append(np.array(loss.data.cpu()))

        if frame_idx % 100 == 0:
            plt.figure(1)
            plt.subplot(121)
            plt.plot(x_axis1, all_rewards)
            plt.subplot(122)
            plt.plot(x_axis2, losses)
            plt.show()

    env.close()
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 30000
num_frames = 1000000
batch_size = 32
learning_rate = 0.0001

# create environment
# env_id = "PongNoFrameskip-v4"
# env_id = 'SpaceInvadersNoFrameskip-v4'
# env_id = 'MsPacmanNoFrameskip-v4'
# env_id = 'VideoPinballNoFrameskip-v4'
# env_id = 'MontezumaRevengeNoFrameskip-v4'
# env_id = 'QbertNoFrameskip-v4'
env_id = sys.argv[1]
env = make_atari(env_id)
# env = gym.wrappers.Monitor(env, 'stats', video_callable=lambda episode_id: False, force=True, resume=False)
env = wrap_deepmind(env)
env = wrap_pytorch(env)

# create networks
current_model = CnnDQN(env.observation_space.shape, env.action_space.n)
target_model = CnnDQN(env.observation_space.shape, env.action_space.n)
if USE_CUDA:
    current_model = current_model.cuda()
    target_model = target_model.cuda()

# setup optimizer
optimizer = optim.Adam(current_model.parameters(), lr=learning_rate)

# initialize replay memory
        s = env.reset()
    return rewards


if __name__ == '__main__':
    logger.configure(f'{C.env_id}/logs_{time_stamp}')
    for k, v in C._asdict().items():
        logger.record_tabular(k, v)
    logger.dump_tabular()

    max_reward = tf.placeholder(tf.float32, name='max_reward')
    mean_reward = tf.placeholder(tf.float32, name='mean_reward')
    max_summary = tf.summary.scalar('max_rew', max_reward)
    mean_summary = tf.summary.scalar('mean_rew', mean_reward)

    with create_session(0) as sess:
        eval_env = make_atari(C.env_id, 113, 'eval')()
        envs = SubprocVecEnv(
            [make_atari(C.env_id, r + 1, 'train') for r in range(4)])
        model = Model(eval_env.observation_space.shape, eval_env.action_space.n)
        runner = Runner(envs, model.policy, nb_rollout=C.nb_rollout)
        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(
            './{}/summary/{}'.format(C.env_id, time_stamp), sess.graph)
        for i in range(C.iterations):
            if i % C.eval_freq == 0:
                rewards = evaluate(eval_env, model.policy, C.eval_episodes)
                logger.log(
                    f'Step: {i} | Max reward: {np.max(rewards)} | '
                    f'Mean reward: {np.mean(rewards):.2f} | '
                    f'Std: {np.std(rewards):.2f}')