def do_q_learning(env, reward_function, train_episodes, figure=False):
    alpha = 0.01
    gamma = 0.9
    epsilon = 0.1
    policy = DQNPolicy(env, lr=alpha, gamma=gamma, input=2, output=4)  # 4 actions output: up, right, down, left
    replay_buffer = ReplayBuffer()

    # Play with a random policy and see
    # run_current_policy(env.env, policy)

    agg_interval = 100
    avg_history = {'episodes': [], 'timesteps': [], 'reward': []}

    # Train the network to predict actions for each of the states
    for episode_i in range(train_episodes):
        episode_timestep = 0
        episode_reward = 0.0

        env.__init__()  # todo : the first current state should be 0
        cur_state = env.cur_state
        counter = 0
        done = False

        while not done:
            # Let each episode be of 30 steps
            counter += 1
            done = counter >= 30  # todo : check if this line is working

            action = policy.select_action(cur_state.reshape(1, -1), epsilon)

            # take action in the environment
            next_state = env.step(action)
            reward = reward_function(next_state)

            # add the transition to replay buffer
            replay_buffer.add(cur_state, action, next_state, reward, done)

            # sample minibatch of transitions from the replay buffer
            # the sampling is done every timestep and not every episode
            sample_transitions = replay_buffer.sample()

            # update the policy using the sampled transitions
            policy.update_policy(**sample_transitions)

            episode_reward += reward
            episode_timestep += 1
            cur_state = next_state

        avg_history['episodes'].append(episode_i + 1)
        avg_history['timesteps'].append(episode_timestep)
        avg_history['reward'].append(episode_reward)
        learning_policy_progress.update()

    if figure:
        plt.plot(avg_history['episodes'], avg_history['reward'])
        plt.title('Reward')
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.show()

    return policy.q_model
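# In[]:

# The training loop above assumes a ReplayBuffer whose sample() returns a dict
# that can be unpacked straight into policy.update_policy(**...). The class
# below is a minimal sketch under that assumption; the key names ('states',
# 'actions', 'next_states', 'rewards', 'dones'), the capacity, and the batch
# size are hypothetical and must match the real update_policy signature used
# in this project.
import random
from collections import deque

import numpy as np


class MinimalReplayBuffer:
    def __init__(self, capacity=10000, batch_size=32):
        self.buffer = deque(maxlen=capacity)  # drop oldest transitions when full
        self.batch_size = batch_size

    def add(self, state, action, next_state, reward, done):
        # store one transition tuple
        self.buffer.append((state, action, next_state, reward, done))

    def sample(self):
        # sample a minibatch (smaller early in training, before the buffer fills up)
        batch = random.sample(self.buffer, min(len(self.buffer), self.batch_size))
        states, actions, next_states, rewards, dones = map(np.array, zip(*batch))
        return {'states': states, 'actions': actions, 'next_states': next_states,
                'rewards': rewards, 'dones': dones}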
pbar_cp = tqdm(total=cp_train_episodes)

# In[]:

# Train the network to predict actions for each of the states
for episode_i in range(cp_start_episode, cp_start_episode + cp_train_episodes):
    episode_timestep = 0
    episode_reward = 0.0
    done = False

    cur_state = cp_env.reset()

    while not done:
        # select action
        action = cp_policy.select_action(cur_state.reshape(1, -1), cp_epsilon)

        # take action in the environment
        next_state, reward, done, info = cp_env.step(action)

        # add the transition to replay buffer
        replay_buffer.add(cur_state, action, next_state, reward, done)

        # sample minibatch of transitions from the replay buffer
        # the sampling is done every timestep and not every episode
        sample_transitions = replay_buffer.sample()

        # update the policy using the sampled transitions
        cp_policy.update_policy(**sample_transitions)

        episode_reward += reward
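# In[]:

# cp_policy.update_policy(**sample_transitions) is where the Q-network is
# actually trained. The function below sketches one plausible implementation of
# that update (a standard DQN temporal-difference step), assuming the Q-model
# is a PyTorch network; the parameter names mirror the hypothetical replay
# buffer sketch above and are not taken from the project's DQNPolicy.
import torch
import torch.nn.functional as F


def dqn_update(q_model, optimizer, gamma, states, actions, next_states, rewards, dones):
    states = torch.as_tensor(states, dtype=torch.float32)
    actions = torch.as_tensor(actions, dtype=torch.int64)
    next_states = torch.as_tensor(next_states, dtype=torch.float32)
    rewards = torch.as_tensor(rewards, dtype=torch.float32)
    dones = torch.as_tensor(dones, dtype=torch.float32)

    # Q(s, a) for the actions that were actually taken
    q_values = q_model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    # TD target: r + gamma * max_a' Q(s', a'), with no bootstrap on terminal states
    with torch.no_grad():
        next_q = q_model(next_states).max(dim=1).values
        target = rewards + gamma * next_q * (1.0 - dones)

    # regress the predicted Q-values toward the TD target
    loss = F.mse_loss(q_values, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()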
# play with a random policy
# run_current_policy(env_policy, env, env.reset())

# In[]:

history = dict({'reward': list(), 'timesteps': list(), 'episodes': list()})

for episode in range(total_train_episodes):
    done = False
    # print('Epoch :', episode + 1)
    ep_reward = 0
    ep_timesteps = 0
    cur_state = env.reset()
    epsilon = max(epsilon, epsilon_min)
    max_position = -99

    while not done:
        action = env_policy.select_action(cur_state.reshape(1, -1), epsilon)
        next_state, reward, done, _ = env.step(action)

        # Visualize the status
        if episode % mod_episode == 0:
            env.render()

        # Keep track of max position
        if next_state[0] > max_position:
            max_position = next_state[0]

        # Adjust reward for task completion
        if next_state[0] >= 0.5:
            reward += 10

        replay_buffer.add(cur_state, action, next_state, reward, done)
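# In[]:

# The excerpt above only clamps epsilon from below at epsilon_min; the decay
# step itself is not shown in this snippet. A common companion, sketched here
# with hypothetical constants, is a multiplicative decay applied once per
# episode so exploration shrinks gradually over training.
def decay_epsilon(epsilon, epsilon_min=0.01, decay_rate=0.995):
    # multiply by decay_rate each episode, but never go below epsilon_min
    return max(epsilon * decay_rate, epsilon_min)


# usage inside the episode loop (hypothetical):
# epsilon = decay_epsilon(epsilon, epsilon_min)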