def main():
    """Train a DQN agent on PongDeterministic-v4.

    Builds the CNN agent, then for each episode stacks 4 preprocessed
    frames into a state, steps the environment with the agent's action,
    and stores the (state, action, clipped reward, next state, done)
    transition in the replay memory.

    NOTE: this fragment only collects experience; the learning step is
    presumably performed elsewhere (or later in the loop) — confirm.
    """
    env = gym.make('PongDeterministic-v4')
    # Dimension ordering: 'th' (channels first): 4 stacked 210x160 frames.
    network_input_shape = (4, 210, 160)
    actions = env.action_space.n
    agent = DQN_CNN(actions, network_input_shape,
                    learning_rate=LEARNING_RATE,
                    discount_factor=DISCOUNT_FACTOR)
    frame_counter = 0

    for ep in range(1, EPISODES + 1):
        # Start episode
        score = 0
        # Observe reward and initialize first state
        obs = preprocess_observation(env.reset())
        # Initialize the first state with the same 4 images
        current_state = np.array([obs, obs, obs, obs])
        frame_counter += 1

        for t in range(MAX_STEPS):
            # Select an action using the DQA
            action = agent.act(np.asarray([current_state]))
            # BUGFIX: removed a leftover `pdb.set_trace()` debugger
            # breakpoint that halted execution on every single step.
            # Observe reward and next state
            obs, reward, done, info = env.step(action)
            obs = preprocess_observation(obs)
            next_state = get_next_state(current_state, obs)
            frame_counter += 1
            # Store transition in replay memory; reward is clipped to
            # [-1, 1] as is standard for DQN on Atari.
            clipped_reward = np.clip(reward, -1, 1)
            agent.add_experience(np.asarray([current_state]), action,
                                 clipped_reward, np.asarray([next_state]),
                                 done)
# Script-level training setup (fragment — the inner loop continues beyond
# this chunk, so this span is documented in place rather than rewritten).
episode = 0        # completed-episode counter for the training loop
frame_counter = 0  # total environment frames seen across all episodes

# NOTE(review): mid-file import — convention places this at the top of the
# file; `imshow`, `waitKey`, `imwrite` appear unused in the visible code.
from cv2 import VideoWriter, VideoWriter_fourcc, imshow, waitKey, imwrite
# 30 FPS XVID recording of raw 160x210 frames; isColor=False => grayscale.
vid = VideoWriter('demo_breakout.avi', VideoWriter_fourcc(*"XVID"),
                  float(30), (160, 210), False)

if args.train:
    # Main loop
    while episode < args.max_episodes:
        # Start episode
        logger.log("Episode %d" % episode)
        score = 0
        # Observe reward and initialize first state
        obs = utils.preprocess_observation(env.reset())
        # Initialize the first state with the same 4 images
        current_state = np.array([obs, obs, obs, obs])
        # Main episode loop
        t = 0
        frame_counter += 1
        while t < args.max_episode_length:
            # Stop the episode if it takes too long
            # NOTE(review): DQA.quit() is called without a break/return
            # here — presumably quit() terminates the process; confirm.
            if frame_counter > args.max_frames_number:
                DQA.quit()
            # Select an action using the DQA
            action = DQA.get_action(np.asarray([current_state]))
def evaluate(DQA, args, logger, env):
    """Evaluate the agent on `env` until args.validation_frames frames elapse.

    Each episode starts with args.initial_random_actions forced-random
    actions, after which the agent acts greedily (testing=True). Episode
    length and score are appended to 'evaluation.csv'. If the mean over
    the collected [length, score] rows beats the global best
    (max_mean_score), the network weights are saved with suffix '_best'.

    Args:
        DQA: the deep-Q agent (must expose get_action and DQN.save).
        args: parsed CLI arguments (validation_frames, video, ...).
        logger: project logger with to_csv().
        env: a gym-style environment.

    Returns:
        np.ndarray of shape (2,): [length, score] of one randomly chosen
        best-scoring episode.
    """
    global max_mean_score
    evaluation_csv = 'evaluation.csv'
    logger.to_csv(evaluation_csv, 'length,score')

    scores = list()
    frame_counter = 0
    # BUGFIX: `episode` was initialized inside the outer loop, resetting
    # the counter on every environment restart so each episode printed
    # as "Episode 1 end". Initialize it once here instead.
    episode = 0

    while frame_counter < args.validation_frames:
        remaining_random_actions = args.initial_random_actions
        obs = utils.preprocess_observation(env.reset())
        frame_counter += 1
        # Initialize the first state with the same 4 images
        current_state = np.array([obs, obs, obs, obs])
        t = 0
        score = 0
        # Start episode
        while True:
            # Render the game if video output is not suppressed
            if args.video:
                env.render()
            action = DQA.get_action(np.asarray([current_state]),
                                    testing=True,
                                    force_random=remaining_random_actions > 0)
            obs, reward, done, info = env.step(action)
            obs = utils.preprocess_observation(obs)
            current_state = utils.get_next_state(current_state, obs)
            if remaining_random_actions > 0:
                remaining_random_actions -= 1
            score += reward
            t += 1
            frame_counter += 1
            # End episode
            if done or t > args.max_episode_length:
                episode += 1
                print('Episode %d end\n---------------\nFrame counter: %d\n'
                      % (episode, frame_counter))
                print('Length: %d, Score: %.1f\n\n' % (t, score))
                # Save episode data in the evaluation csv
                logger.to_csv(evaluation_csv, [t, score])
                break
        scores.append([t, score])

    scores = np.asarray(scores)
    # Pick uniformly among the episodes that share the maximum score.
    max_indices = np.argwhere(scores[:, 1] == np.max(scores[:, 1])).ravel()
    max_idx = np.random.choice(max_indices)

    # Save best model.
    # NOTE(review): np.mean(scores) averages over BOTH columns (lengths
    # and scores) — possibly np.mean(scores[:, 1]) was intended; left
    # unchanged to preserve the existing best-model selection behavior.
    if max_mean_score < np.mean(scores):
        max_mean_score = np.mean(scores)
        DQA.DQN.save(append='_best')

    return scores[max_idx, :].ravel()
def evaluate(DQA, args, logger):
    """Evaluate the agent on a fresh env until args.validation_frames elapse.

    Creates its own environment from args.environment, then runs episodes
    with args.initial_random_actions forced-random actions followed by
    greedy (testing=True) play. Episode length and score go to
    'evaluation.csv'; if the mean over the collected [length, score] rows
    beats the global best (max_mean_score), weights are saved as '_best'.

    Args:
        DQA: the deep-Q agent (must expose get_action and DQN.save).
        args: parsed CLI arguments (environment, validation_frames, ...).
        logger: project logger with to_csv().

    Returns:
        np.ndarray of shape (2,): [length, score] of one randomly chosen
        best-scoring episode.
    """
    global max_mean_score
    evaluation_csv = 'evaluation.csv'
    logger.to_csv(evaluation_csv, 'length,score')
    env = gym.make(args.environment)

    scores = list()
    frame_counter = 0
    # BUGFIX: `episode` was initialized inside the outer loop, resetting
    # the counter on every environment restart so each episode printed
    # as "Episode 1 end". Initialize it once here instead.
    episode = 0

    while frame_counter < args.validation_frames:
        remaining_random_actions = args.initial_random_actions
        obs = utils.preprocess_observation(env.reset())
        frame_counter += 1
        # Initialize the first state with the same 4 images
        current_state = np.array([obs, obs, obs, obs])
        t = 0
        score = 0
        # Start episode
        while True:
            # Render the game if video output is not suppressed
            if args.video:
                env.render()
            action = DQA.get_action(np.asarray([current_state]),
                                    testing=True,
                                    force_random=remaining_random_actions > 0)
            obs, reward, done, info = env.step(action)
            obs = utils.preprocess_observation(obs)
            current_state = utils.get_next_state(current_state, obs)
            if remaining_random_actions > 0:
                remaining_random_actions -= 1
            score += reward
            t += 1
            frame_counter += 1
            # End episode
            if done or t > args.max_episode_length:
                episode += 1
                print('Episode %d end\n---------------\nFrame counter: %d\n'
                      % (episode, frame_counter))
                # BUGFIX: format string had a stray newline before the
                # comma ('Length: %d\n, Score: %f'); fixed to match the
                # sibling evaluate() implementation.
                print('Length: %d, Score: %.1f\n\n' % (t, score))
                # Save episode data in the evaluation csv
                logger.to_csv(evaluation_csv, [t, score])
                break
        scores.append([t, score])

    scores = np.asarray(scores)
    # Pick uniformly among the episodes that share the maximum score.
    max_indices = np.argwhere(scores[:, 1] == np.max(scores[:, 1])).ravel()
    max_idx = np.random.choice(max_indices)

    # Save best model.
    # NOTE(review): np.mean(scores) averages over BOTH columns (lengths
    # and scores) — possibly np.mean(scores[:, 1]) was intended; left
    # unchanged to preserve the existing best-model selection behavior.
    if max_mean_score < np.mean(scores):
        max_mean_score = np.mean(scores)
        DQA.DQN.save(append='_best')

    return scores[max_idx, :].ravel()
# Script-level training setup (fragment — the inner loop continues beyond
# this chunk, so this span is documented in place rather than rewritten).
# Write CSV headers for the evaluation and test logs.
logger.to_csv(eval_csv, 'length,score')
logger.to_csv(test_csv, 'avg_score,avg_Q')
# Set counters
episode = 0        # completed-episode counter for the training loop
frame_counter = 0  # total environment frames seen across all episodes
if args.train:
    # Main loop
    while episode < args.max_episodes:
        # Start episode
        logger.log("Episode %d" % episode)
        score = 0
        # Observe reward and initialize first state
        obs = utils.preprocess_observation(env.reset())
        # Initialize the first state with the same 4 images
        current_state = np.array([obs, obs, obs, obs])
        # Main episode loop
        t = 0
        frame_counter += 1
        while t < args.max_episode_length:
            # Stop the episode if it takes too long
            # NOTE(review): DQA.quit() is called without a break/return
            # here — presumably quit() terminates the process; confirm.
            if frame_counter > args.max_frames_number:
                DQA.quit()
            # Render the game
            if args.video:
                env.render()