def main():
    env = gym.make('CartPole-v1')
    agent = QNetwork(env.observation_space, env.action_space)
    frame = 0
    while frame < MAX_NUM_FRAMES:
        state = env.reset()
        for t in range(MAX_EPISODE_DURATION):
            env.render()
            action = agent.act(state)
            next_state, reward, done, info = env.step(action)
            if done:
                # done doesn't return a negative reward...
                reward *= -1
            frame += 1
            if done or frame == MAX_NUM_FRAMES:
                break
            state = next_state
    env.close()
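The listing assumes `gym` has been imported and that the two loop limits are defined at module level. The values below are placeholders for illustration, not the actual settings used here:

import gym

# Placeholder values -- the actual constants are defined elsewhere in the post.
MAX_NUM_FRAMES = 10_000       # total environment steps to run
MAX_EPISODE_DURATION = 500    # CartPole-v1 truncates episodes at 500 steps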
def main():
    env = gym.make('CartPole-v1')
    replay_memory = ReplayMemory()
    agent = QNetwork(env.observation_space, env.action_space)
    get_training_batch = make_training_transformer(env.observation_space.shape, agent)
    frame = 0
    acc_loss = 0
    acc_state_value = 0
    while frame < MAX_NUM_FRAMES:
        state = env.reset()
        for t in range(MAX_EPISODE_DURATION):
            if take_random_action(frame):
                action = env.action_space.sample()  # pick a random action (explore)
            else:
                action = agent.act(state)
            next_state, reward, done, info = env.step(action)
            if done:
                # the environment doesn't return a negative reward on done...
                reward *= -1
            experience = (state, action, reward, next_state, done)
            replay_memory.append(experience)
            frame += 1

            # Sample a batch of past experiences and run one training step.
            experience_samples = replay_memory.sample(BATCH_SIZE)
            state_batch, qs_batch = get_training_batch(experience_samples)
            acc_state_value += np.mean(qs_batch)
            loss = agent.training_step(state_batch, qs_batch)
            acc_loss += loss

            if frame % FRAMES_PER_SAVE == 0:
                model_filename = f"ckpt-loss={loss:.4f}"
                agent.save_model(model_filename)

            if frame % FRAMES_PER_PRINT == 0:
                print(f"Frame: {frame}")
                avg_loss = acc_loss / FRAMES_PER_PRINT
                avg_state_value = acc_state_value / FRAMES_PER_PRINT
                print(
                    f"avg loss: {avg_loss:.4f}; avg value: {avg_state_value:.2f}"
                )
                acc_loss = 0
                acc_state_value = 0

            if done or frame == MAX_NUM_FRAMES:
                break
            state = next_state
    env.close()
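The training loop leans on two helpers the listing doesn't show: `take_random_action`, which decides when to explore, and `make_training_transformer`, which turns sampled experiences into training targets. The sketch below is one plausible way those pieces could look; the constants (`EPSILON_START`, `EPSILON_END`, `EPSILON_DECAY_FRAMES`, `GAMMA`), the linear decay schedule, and the `agent.predict` method are assumptions for illustration, not part of the original code.

import random
import numpy as np

# Hypothetical constants -- not taken from the original listing.
EPSILON_START = 1.0
EPSILON_END = 0.05
EPSILON_DECAY_FRAMES = 50_000
GAMMA = 0.99


def take_random_action(frame):
    """Epsilon-greedy exploration with a linear decay over frames (assumed schedule)."""
    epsilon = max(
        EPSILON_END,
        EPSILON_START - (EPSILON_START - EPSILON_END) * frame / EPSILON_DECAY_FRAMES,
    )
    return random.random() < epsilon


def make_training_transformer(state_shape, agent):
    """Return a function that converts sampled experiences into (states, Q-targets).

    Assumes agent.predict(states) returns a (batch, n_actions) array of Q-values.
    """
    def get_training_batch(experience_samples):
        states, actions, rewards, next_states, dones = map(np.array, zip(*experience_samples))
        qs = agent.predict(states)            # current Q-value estimates
        next_qs = agent.predict(next_states)  # Q-values used for bootstrapping
        # Standard Q-learning target: r + gamma * max_a' Q(s', a'), with no bootstrap at terminal states.
        targets = rewards + GAMMA * np.max(next_qs, axis=1) * (1.0 - dones.astype(np.float32))
        # Only the taken action's Q-value is replaced, so the loss ignores the other actions.
        qs[np.arange(len(actions)), actions] = targets
        return states, qs

    return get_training_batch

With this shape of transformer, the network is trained to regress its own Q-value predictions toward the one-step TD targets for the actions that were actually taken.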