# env = monitor = Monitor(env, 'data/monitor', force=True, video_callable=lambda i: i % 1 != 0)
env = FrameStack(env, num_stack, False)

episode_count = 1
reward = 0
done = False
score = 0.0
print_interval = 3

for n_epi in range(100):
    s = env.reset()
    done = False
    while not done:
        for t in range(T_horizon):
            env.render()
            # Reshape the stacked frames into the model's input shape
            s = np.array(s).reshape(shape)
            od = model(torch.from_numpy(s).float())
            prob = od['pi']
            # Sample an action from the policy distribution
            m = Categorical(prob)
            a = m.sample().item()
            s_prime, r, done, info = env.step(a)
            # Store the transition (state, action, scaled reward, next state,
            # probability of the chosen action, done flag) for the on-policy update
            trn = (s.reshape(shape0), a, r / 100.0, np.array(s_prime),
                   prob[0][a].item(), done)
            model.put_data(trn)
            s = s_prime
            score += r
            if done:
                break
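# --- Hypothetical sketch, not the author's actual model ---
# The rollout loop above only assumes that `model(x)` returns a dict with action
# probabilities under the key 'pi', and that `model.put_data(...)` buffers a
# (state, action, reward, next_state, prob_a, done) tuple for a later on-policy
# update. A minimal policy network matching that interface (layer sizes are
# assumptions, not taken from the original code) could look like this:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PolicyModel(nn.Module):
    def __init__(self, input_dim, n_actions):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.fc_pi = nn.Linear(256, n_actions)
        self.buffer = []  # rollout buffer filled by put_data()

    def forward(self, x):
        h = F.relu(self.fc1(x))
        pi = F.softmax(self.fc_pi(h), dim=-1)
        return {'pi': pi}

    def put_data(self, transition):
        # Keep one transition tuple until the next update consumes the buffer
        self.buffer.append(transition)

# Example instantiation matching the loop above (shape and env come from earlier code):
# model = PolicyModel(input_dim=int(np.prod(shape)), n_actions=env.action_space.n)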
def main(args):
    env = gym.make(args.env)
    # Rescale images to 42x42 and turn them into greyscale
    env = AtariPreprocessing(env, screen_size=42, grayscale_obs=True,
                             noop_max=1, terminal_on_life_loss=True)

    # A quick trick to give the agent some sense of history/motion:
    # feed it N successive frames instead of just one.
    # This deque stores the N last frames to do this.
    state_stacker = deque(maxlen=FRAME_STACK_SIZE)
    # Rewards of the last 100 episodes, for a running average
    episode_rewards = deque(maxlen=100)

    # Build models according to the image shape and the number of
    # available actions. If we are evaluating, load an existing model instead.
    state_shape = RESOLUTION + (FRAME_STACK_SIZE, )
    model = None
    target_model = None
    if not args.evaluate:
        # Construct new models
        model, target_model = build_models(state_shape, env.action_space.n)
    else:
        # Load existing model
        model = keras.models.load_model(args.model_path)

    # Initialize replay memory (if training)
    replay_memory = None
    if not args.evaluate:
        replay_memory = ReplayMemory(REPLAY_SIZE, state_shape)

    # Open log file if we want to output results
    log_file = None
    if args.log is not None:
        log_file = open(args.log, "w")

    # Main training loop
    step_ctr = 0
    # Running totals over all per-step mean Q-values seen during training
    q_values_counter = 0
    q_values_summation = 0
    while step_ctr < args.steps:
        terminal = False
        episode_reward = 0
        # Keep track of losses and per-step mean Q-values of this episode
        losses = []
        episode_q_values = []

        # Reset frame stacker to empty frames
        state_stacker.clear()
        for i in range(FRAME_STACK_SIZE):
            state_stacker.append(np.zeros(RESOLUTION + (1, )))

        s1 = env.reset()
        # Preprocess state
        s1 = preprocess_state(s1, state_stacker)

        while not terminal:
            action, q_values = get_action(s1, model, env.action_space.n)
            # Store this step's mean Q-value so we can report per-episode
            # and running averages later (and plot them if needed)
            episode_q_values.append(np.mean(q_values))

            s2, reward, terminal, info = env.step(action)
            s2 = preprocess_state(s2, state_stacker)
            step_ctr += 1
            # Count episodic reward
            episode_reward += reward

            if args.show:
                env.render()

            # Skip training/replay-memory work if we are evaluating
            if not args.evaluate:
                # Store the experience to replay memory
                replay_memory.add_experience(s1, action, reward, s2, terminal)
                # Check if we should update or save the model
                if (step_ctr % UPDATE_RATE) == 0:
                    if replay_memory.num_total > SAMPLES_TILL_TRAIN:
                        losses.append(
                            update_model(model, target_model, replay_memory))
                if (step_ctr % TARGET_UPDATE_RATE) == 0:
                    update_target_model(model, target_model)
                if (step_ctr % SAVE_MODEL_EVERY_STEPS) == 0:
                    model.save(args.model_path)

            # s2 becomes s1 for the next iteration
            s1 = s2

            # If we want to limit fps, sleep a little bit
            if args.limit_fps:
                sleep(1 / 35.0)

        # Track rewards over the last 100 episodes
        episode_rewards.append(episode_reward)

        # To avoid division by zero
        if len(losses) == 0:
            losses.append(0.0)

        # Episode statistics:
        # 1) average training loss of this episode,
        # 2) average reward over the last 100 episodes,
        # 3) average Q-value of this episode and its running average.
        average_loss = np.average(losses)
        print('Average Q-value of this episode: ', np.average(episode_q_values))
        q_values_counter += len(episode_q_values)
        q_values_summation += np.sum(episode_q_values)
        print('Average loss: ', average_loss)
        print('Average reward over the last 100 episodes: ',
              np.average(episode_rewards))
        running_average_q_values = q_values_summation / q_values_counter
        print('Running average Q-value: ', running_average_q_values)

        # Legend:
        # - Episode reward: reward from the previous episode
        # - Steps: total number of agent steps taken in this training
        # - Avg loss: average training loss of the previous episode
        s = "Episode reward: {:.1f}\tSteps: {}\tAvg loss: {:.5f}".format(
            episode_reward,
            step_ctr,
            average_loss,
        )

        # Print our log message
        print(s)
        # If we have a log file, print it there as well
        if log_file is not None:
            log_file.write(s + "\n")

    if log_file is not None:
        log_file.close()
    env.close()
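# --- Hypothetical sketch of the frame-stacking preprocessing (defined elsewhere) ---
# `preprocess_state` is not shown in this excerpt. Based on how it is used above
# (a deque of FRAME_STACK_SIZE single-channel frames, and a model input of shape
# RESOLUTION + (FRAME_STACK_SIZE,)), a minimal version could plausibly look like
# this; the scaling to [0, 1] is an assumption, not the author's confirmed code.
from collections import deque
import numpy as np

RESOLUTION = (42, 42)
FRAME_STACK_SIZE = 4

def preprocess_state(frame, state_stacker):
    # AtariPreprocessing with grayscale_obs=True returns a (42, 42) uint8 image;
    # scale it and add a channel axis so it matches the zero frames used to
    # initialize the deque.
    frame = frame.astype(np.float32) / 255.0
    frame = frame.reshape(RESOLUTION + (1, ))
    # Push the newest frame; the deque drops the oldest one automatically.
    state_stacker.append(frame)
    # Stack the last N frames along the channel axis -> shape (42, 42, N).
    return np.concatenate(list(state_stacker), axis=-1)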