    def run_sarsa(self,
                  max_number_of_episodes=100,
                  interactive=False,
                  display_frequency=1):

        # repeat for each episode
        for episode_number in range(max_number_of_episodes):

            # initialize state
            state = self.env.reset()

            done = False  # used to indicate terminal state
            R = 0  # used to display accumulated rewards for an episode
            t = 0  # used to display accumulated steps for an episode, i.e. episode length

            # choose action from state using policy derived from Q
            action = self.agent.act(state)

            # repeat for each step of episode, until state is terminal
            while not done:

                t += 1  # increase step counter - for display

                # take action, observe reward and next state
                next_state, reward, done, _ = self.env.step(action)

                # choose next action from next state using policy derived from Q
                next_action = self.agent.act(next_state)

                # agent learn (SARSA update)
                self.agent.learn(state, action, reward, next_state,
                                 next_action)

                # state <- next state, action <- next_action
                state = next_state
                action = next_action

                R += reward  # accumulate reward - for display

                # if interactive display, show update for each step
                if interactive:
                    self.update_display_step()

            self.episode_length = np.append(
                self.episode_length, t)  # keep episode length - for display
            self.episode_reward = np.append(
                self.episode_reward, R)  # keep episode reward - for display

            # if interactive display, show update for the episode
            if interactive:
                self.update_display_episode()

        # if not interactive display, show graph at the end
        if not interactive:
            self.fig.clf()
            stats = plotting.EpisodeStats(
                episode_lengths=self.episode_length,
                episode_rewards=self.episode_reward,
                episode_running_variance=np.zeros(max_number_of_episodes))
            plotting.plot_episode_stats(stats, display_frequency)
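
The loop above delegates the actual update rule to self.agent.act and self.agent.learn. For reference, a minimal tabular agent implementing the SARSA update that such a call presumably performs is sketched below; the Q-table layout and the alpha/gamma/epsilon defaults are illustrative assumptions, not taken from this project.

import numpy as np
from collections import defaultdict

class TabularSarsaAgent:
    """Illustrative tabular SARSA agent; not the agent class used above."""

    def __init__(self, n_actions, alpha=0.1, gamma=0.99, epsilon=0.1):
        # Q maps (hashable) states to an array of action values
        self.Q = defaultdict(lambda: np.zeros(n_actions))
        self.n_actions = n_actions
        self.alpha, self.gamma, self.epsilon = alpha, gamma, epsilon

    def act(self, state):
        # epsilon-greedy action selection from the current Q estimates
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.n_actions)
        return int(np.argmax(self.Q[state]))

    def learn(self, state, action, reward, next_state, next_action):
        # on-policy TD(0): the target bootstraps from the action actually chosen next
        td_target = reward + self.gamma * self.Q[next_state][next_action]
        self.Q[state][action] += self.alpha * (td_target - self.Q[state][action])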
Code Example #2
def policy_f(env, scaler, featurizer, print_ep_lens):
    '''
    Main calling function for generating the expert policy.
    ** Read the multi-line comment at the start of this file to gain understanding.

    Args:
        env: Gym environment.
        scaler: Mean and variance of the state values.
        featurizer: The container used for generating expert trajectories.
        print_ep_lens: [Bool] Print each iteration with the number of time steps required for completion.

    Returns:
        a) Plots statistics of mountain car learning with the inbuilt rewards of the gym environment,
        so that we can compare results with mountain car learning on the learnt reward function.
        b) Returns the "Demonstration By Expert" (DBE) policy.
    '''
    estimator = Estimator(env, scaler, featurizer)
    stats = q_learning_best_policy(env,
                                   estimator,
                                   200,
                                   epsilon=0.0,
                                   print_ep_lens=False)
    print("___Plotting Learning Stats of the Agent____")
    plotting.plot_cost_to_go_mountain_car(env, estimator)
    plotting.plot_episode_stats(stats, smoothing_window=25)
    final_policy = greedy_policy(estimator, env.action_space.n)
    return final_policy, estimator
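
A possible call site for policy_f, assuming the common scikit-learn StandardScaler / RBFSampler preprocessing used for MountainCar-v0; the construction below is a sketch of what the surrounding script likely does, not code taken from it.

import gym
import numpy as np
import sklearn.pipeline
import sklearn.preprocessing
from sklearn.kernel_approximation import RBFSampler

env = gym.make("MountainCar-v0")

# fit the scaler on a sample of raw observations
observation_examples = np.array([env.observation_space.sample() for _ in range(10000)])
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(observation_examples)

# RBF features of the scaled observations, at a couple of kernel widths
featurizer = sklearn.pipeline.FeatureUnion([
    ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
    ("rbf2", RBFSampler(gamma=1.0, n_components=100)),
])
featurizer.fit(scaler.transform(observation_examples))

expert_policy, expert_estimator = policy_f(env, scaler, featurizer, print_ep_lens=False)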
Code Example #3
File: experiment.py    Project: sircodesalittle/mdp
    def run(self):
        print('\tValue Iteration')
        vi_agent = ValueIterationAgent(self.env, self.gamma)
        print('\t\tAverage reward: ' + str(np.mean(vi_agent.scores)))
        print('\t\tConvergence step: ' + str(vi_agent.convergence))
        print('\t\tPolicy: ' + str(vi_agent.policy))

        print('\tPolicy Iteration')
        self.env.reset()
        pi_agent = PolicyIterationAgent(self.env, self.gamma)
        print('\t\tAverage reward: ' + str(np.mean(pi_agent.scores)))
        print('\t\tConvergence step: ' + str(pi_agent.convergence))
        print('\t\tPolicy: ' + str(pi_agent.policy))

        print('\tQ Learning')
        self.env.reset()
        ql_agent = QLearningAgent(self.env)
        q, stats = ql_agent.q_learning(self.env, 500)
        plotting.plot_episode_stats(stats, experiment_name=self.name)
Code Example #4
File: a2cpytorch.py    Project: SKRohit/MOVE37
		for t in count():
			env.render()
			action = actor_critic.choose_action(state)
			next_state, reward, done, _ = env.step(action)
			next_state = torch.Tensor(next_state)

			steps.append(Transition(state=state, action=action, reward=reward, next_state=next_state, done=done))
			stats.episode_rewards[i_episode] += reward

			#calculate total loss
			total_loss = actor_critic.loss_func(state, action, reward, next_state, gamma)  

			ac_optim.zero_grad()
			total_loss.backward()
			ac_optim.step()

			print("\rStep {} @ Episode {}/{} ({})".format(t, i_episode + 1, n_episodes, stats.episode_rewards[i_episode - 1]), end="")
			if done:
				stats.episode_lengths[i_episode] = t
				break
			state = next_state
	return stats, steps

if __name__ == '__main__':

	gamma, num_episodes = args.gamma, args.episodes
	stats, steps = ac_train(env,actor_critic,ac_optim,num_episodes,gamma)

	#Plot 3 plots: episode_reward vs time, episode_length vs time, episode_number vs time
	plot_episode_stats(stats)
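
For context, a one-step actor-critic loss of the kind actor_critic.loss_func appears to compute is sketched below; the shared-body network, the MSE critic loss, and all hyperparameters are illustrative assumptions, not this project's implementation.

import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyActorCritic(nn.Module):
	"""Illustrative shared-body actor-critic; sizes are arbitrary."""

	def __init__(self, obs_dim, n_actions, hidden=128):
		super().__init__()
		self.body = nn.Sequential(nn.Linear(obs_dim, hidden), nn.ReLU())
		self.policy_head = nn.Linear(hidden, n_actions)
		self.value_head = nn.Linear(hidden, 1)

	def choose_action(self, state):
		logits = self.policy_head(self.body(state))
		return torch.distributions.Categorical(logits=logits).sample().item()

	def loss_func(self, state, action, reward, next_state, gamma):
		# one-step TD target bootstrapped from the critic's value of the next state
		value = self.value_head(self.body(state)).squeeze(-1)
		next_value = self.value_head(self.body(next_state)).squeeze(-1).detach()
		td_target = reward + gamma * next_value
		advantage = (td_target - value).detach()

		logits = self.policy_head(self.body(state))
		log_prob = torch.distributions.Categorical(logits=logits).log_prob(torch.as_tensor(action))

		actor_loss = -log_prob * advantage          # policy-gradient term
		critic_loss = F.mse_loss(value, td_target)  # value-regression term
		return actor_loss + critic_loss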
Code Example #5
File: run_exp.py    Project: dhfromkorea/mighty-rl
def main():
    logging.info("define environment and basis function")
    env_id = "MountainCar-v0"
    env = gym.envs.make(env_id)
    logging.info("env_id: {}".format(env_id))
    action_list = range(env.action_space.n)

    # linear basis func
    p_linear = 3
    q_linear = 3
    phi_linear = simple_phi
    psi_linear = phi_linear

    # radial basis (gaussian) fn
    p_rbf = 100
    q_rbf = 100
    phi_rbf = get_basis_function(env_id)
    psi_rbf = phi_rbf

    # this is specific to mountaincar-v0
    init_s_sampler = lambda: [np.random.uniform(-0.6, -0.4), 0.0]

    # 2. define hyperparams
    gamma = 0.95
    n_trial = 2
    n_iteration = 10
    # @note: hard-coded
    # this has to be sufficiently large to avoid Monte Carlo variance issues
    sample_size_mc = 10**2
    #p = p_linear
    #q = q_linear
    #phi = phi_linear
    #psi = psi_linear
    p = p_rbf
    q = q_rbf
    phi = phi_rbf
    psi = psi_rbf
    precision = 1e-4
    use_slack = False
    # @note: reward may have to be scaled to work with slack penalty
    slack_penalty = 1e-3
    eps = 0.0001
    #eps = 0
    # this should be large to account for the varying initial state
    mu_sample_size = 50

    logging.info("collect a batch of data (D) from pi_expert (and some noise)")
    pi_exp = NearExpertPolicy()
    pi_random = get_random_policy()

    # preprocessing D in numpy array for k
    logging.info("apprenticeship learning starts")
    logging.info("feature dim:\n{}".format(phi))

    mu_exp = AL.estimate_mu(env=env,
                            pi_eval=pi_exp,
                            mu_sample_size=sample_size_mc,
                            phi=phi,
                            gamma=gamma,
                            return_epi_len=False)
    #mu_mc_list = estimate_mu_mc(env, pi_exp, phi_linear, gamma, sample_size_mc)
    #mu_mc_list = estimate_mu_mc(env, pi_exp, phi_rbf, gamma, sample_size_mc)
    #mu_exp = np.mean(mu_mc_list, axis=0)

    pi_init = pi_random

    mdp_solver = LinearQ3(env=env,
                          phi=phi,
                          action_list=action_list,
                          n_episode=100,
                          epsilon=0.0,
                          gamma=gamma)

    al = AL(env=env,
            pi_init=pi_init,
            action_list=action_list,
            p=p,
            q=q,
            phi=phi,
            psi=psi,
            gamma=gamma,
            eps=eps,
            mu_exp=mu_exp,
            init_s_sampler=init_s_sampler,
            mu_sample_size=mu_sample_size,
            precision=precision,
            mdp_solver=mdp_solver,
            use_slack=use_slack,
            slack_penalty=slack_penalty)

    results = al.run(n_trial=n_trial, n_iteration=n_iteration)

    # 5. post-process results (plotting)
    pi_irl = results["policy_best"][0]
    weight_irl = results["weight_best"][0]
    margin_v = results["margin_v"][0]
    margin_mu = results["margin_mu"][0]
    weight = results["weight"][0]

    state_dim = env.observation_space.shape[0]
    # discrete action
    action_dim = 1
    n_action = env.action_space.n
    sim = Simulator(env, state_dim=state_dim, action_dim=action_dim)

    D_irl, stats = sim.simulate(pi_irl,
                                n_trial=1,
                                n_episode=15,
                                return_stats=True)

    plotting.plot_cost_to_go_mountain_car(env, pi_irl._estimator)
    plotting.plot_episode_stats(stats, smoothing_window=5)

    np.save("data/D_irl.npy".format(time()), D_irl)
    np.save("data/margin_v.npy".format(time()), margin_v)
    np.save("data/margin_mu.npy".format(time()), margin_mu)
    np.save("data/weight.npy".format(time()), weight)
    np.save("data/weight_best.npy".format(time()), weight_irl)
    print("D_irl shape{}".format(D_irl.shape))

    with open("data/res_{}".format(time()), "wb") as f:
        pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)
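
For reference, the Monte Carlo feature-expectation estimate requested from AL.estimate_mu above can be sketched as follows; the rollout loop and the signature are assumptions about that routine, not its actual code.

import numpy as np

def estimate_mu_mc(env, policy, phi, gamma, n_rollouts, max_steps=500):
    """Monte Carlo estimate of mu = E[sum_t gamma^t * phi(s_t)] under `policy`."""
    mus = []
    for _ in range(n_rollouts):
        state = env.reset()
        mu = np.zeros_like(np.asarray(phi(state), dtype=float))
        discount = 1.0
        for _ in range(max_steps):
            mu += discount * np.asarray(phi(state), dtype=float)
            action = policy(state)
            state, _, done, _ = env.step(action)
            discount *= gamma
            if done:
                break
        mus.append(mu)
    return np.mean(mus, axis=0)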
Code Example #6
            next_state, reward, end, _ = env.step(action)
            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)),
                                           p=next_action_probs)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            q_values_next = estimator.predict(next_state)
            td_target = reward + discount_factor * q_values_next[next_action]

            estimator.update(state, action, td_target)

            if i_episode % 10 == 0:
                print("\rStep {} @ Episode {}/{} ({})".format(
                    t, i_episode + 1, num_episodes, reward))

            if end:
                break

            state = next_state
            action = next_action
    return stats


estimator = FunctionApproximator()
stats = sarsa(env, estimator, 200, epsilon=0.0)

plotting.plot_cost_to_go_mountain_car(env, estimator)
plotting.plot_episode_stats(stats, smoothing_window=25)
Code Example #7
            replay_memory_size=1024,
            #                                    replay_memory_init_size=50000,
            replay_memory_init_size=128,
            #                                    update_target_estimator_every=10000,
            update_target_estimator_every=500,
            epsilon_start=1.0,
            epsilon_end=0.1,
            #                                    epsilon_decay_steps=500000,
            epsilon_decay_steps=800,
            discount_factor=0.99,
            #                                    batch_size=32):
            batch_size=32):
        print("\nEpisode Reward: {}".format(stats.episode_rewards[-1]))

q_estimator.save_states(experiment_dir + "/lstmvis")

ep_length, ep_reward, t_steps = plotting.plot_episode_stats(stats,
                                                            smoothing_window=5,
                                                            noshow=True)
ep_length.savefig(experiment_dir + '/ep_length.png')
ep_reward.savefig(experiment_dir + '/ep_reward.png')
t_steps.savefig(experiment_dir + '/t_steps.png')

if (os.path.exists("./log.txt")):
    copyfile("./log.txt", experiment_dir + "/log.txt")
    os.remove("./log.txt")

Code Example #8
            print("\rStep {} @ Episode {}/{} ({})".format(
                t, i_episode+1, num_episodes, stats.episode_rewards[i_episode-1]))

            if done:
                break

            state = next_state

    return stats


tf.reset_default_graph()
global_step = tf.Variable(0, name="global_step", trainable=False)
policy_estimator = PolicyEstimator(learning_rate=0.001)
value_estimator = ValueEstimator(learning_rate=0.1)

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    stats = actor_critic(env, policy_estimator, value_estimator, 50, discount_factor=0.95)

plotting.plot_episode_stats(stats, smoothing_window=25)
Code Example #9
            action = np.random.choice(np.arange( 
                      len(action_probabilities)), 
                       p = action_probabilities) 
   
            # take action and get reward, transit to next state 
            next_state, reward, done, _ = env.step(action) 
   
            # Update statistics 
            stats.episode_rewards[ith_episode] += reward 
            stats.episode_lengths[ith_episode] = t 
               
            # TD Update 
            best_next_action = np.argmax(Q[next_state])     
            td_target = reward + discount_factor * Q[next_state][best_next_action] 
            td_delta = td_target - Q[state][action] 
            Q[state][action] += alpha * td_delta 
   
            # done is True if episode terminated    
            if done: 
                break
                   
            state = next_state 
       
    return Q, stats


Q, stats = qLearning(env, 1000)

plotting.plot_episode_stats(stats) 

Code Example #10
def run_discrete(environment_name, mapping=None, shape=None):
    problem = gym.make(environment_name)
    print('== {} =='.format(environment_name))
    print('Actions:', problem.action_space.n)
    print('States:', problem.observation_space.n)
    print(problem.desc)
    print()

    if environment_name == 'TaxiEnv-v1':
        print('== Value Iteration ==')
        value_policy, iters = value_iteration_local(problem)
        print('Iterations:', iters)
        print()

        print('== Policy Iteration ==')
        policy, iters = policy_iteration_local(problem)
        print('Iterations:', iters)
        print()

        diff = sum([
            abs(x - y)
            for x, y in zip(policy.flatten(), value_policy.flatten())
        ])
        if diff > 0:
            print('Discrepancy:', diff)
            print()

        if shape is not None:
            print('== Policy ==')
            print_policy(value_policy, mapping, shape)
            print_policy(policy, mapping, shape)
            print()

        taxi_q_learning()
    else:
        print('== Value Iteration ==')
        value_policy_local, iters = value_iteration_local(problem)
        value_policy, Vi, iters, time = value_iteration(problem)
        print('Iterations:', iters)
        print()

        print('== Policy Iteration ==')
        policy_local, iters = policy_iteration_local(problem)
        policy, V, iters, time = policy_iteration(problem)
        print('Iterations:', iters)
        print()

        visualize_policy(value_policy, environment_name, problem.desc.shape,
                         'Optimal policy - Modified transition model')
        visualize_value(Vi, environment_name, problem.desc.shape,
                        'Value estimates - Modified transition model')

        diff = sum([
            abs(x - y)
            for x, y in zip(policy.flatten(), value_policy.flatten())
        ])
        if diff > 0:
            print('Discrepancy:', diff)
            print()

        if shape is not None:
            print('== Policy ==')
            print_policy(value_policy_local, mapping, shape)
            print_policy(policy_local, mapping, shape)
            print()

        frozenlake_q_learning()

        Q, stats, Nsa, final_policy = q_learning(problem, 'greedy', 1000)

        plotting.plot_episode_stats(stats)

    return policy
Code Example #11
File: simulation.py    Project: jsparent/ai-rl
    def run_qlearning(self,
                      interactive=False,
                      display_frequency=1,
                      save_model_each_n_episodes=50,
                      time_penalty=0,
                      life_penalty=0):
        self.start_run()

        BYTE_VIE = 57  # RAM byte holding the remaining number of lives ("vie" = life)

        # repeat for each episode
        for episode_number in range(self.EPISODES):

            # initialize state
            state = self.env.reset()

            done = False  # used to indicate terminal state
            R = 0  # used to display accumulated rewards for an episode
            t = 0  # used to display accumulated steps for an episode, i.e. episode length

            # repeat for each step of episode, until state is terminal
            while not done:

                t += 1  # increase step counter - for display

                # choose action from state using policy derived from Q
                action = self.agent.act(state)

                # take action, observe reward and next state
                next_state, reward, done, _ = self.env.step(action)

                # Penalize doing nothing, to avoid episodes that drag on
                learning_reward = reward

                # After discussions with Mikael, do not penalize inaction or lost lives
                if learning_reward == 0:
                    vies_actuelles = state[BYTE_VIE]
                    vies_apres = next_state[BYTE_VIE]
                    if vies_apres < vies_actuelles:
                        #print("vie perdue")
                        learning_reward = -life_penalty  # heavily penalize losing a life
                    else:
                        learning_reward = -time_penalty  # lightly penalize inaction

                # agent learn (Q-Learning update)
                if self.training:
                    self.agent.learn(state, action, learning_reward,
                                     next_state, done)

                # state <- next state
                state = next_state

                R += reward  # accumulate reward - for display

                # show display update for each step
                # (the interactive check below is commented out, so this runs on every step)
                #if self.training and interactive:
                self.update_display_step(t)

                # If cancel requested, exit
                if self.agent.isCancelRequested():
                    self.agent.log("*** Arrêt demandé détecté ***",
                                   flushBuffer=True)
                    break

            self.episode_length = np.append(
                self.episode_length, t)  # keep episode length - for display
            self.episode_reward = np.append(
                self.episode_reward, R)  # keep episode reward - for display

            if R > 0:
                self.agent.log(
                    f"Épisode {episode_number+1}/{self.EPISODES}: R={R}, Steps={t}",
                    doPrint=not interactive)

            # Update image of highest score only
            if interactive:
                if R >= self.high_score:
                    self.update_display_step()
                self.update_display_episode()

            if R > self.high_score:
                self.high_score = R
                self.agent.log(
                    f"\tNouveau meilleur score à: {self.high_score}, épisode #{episode_number + 1}",
                    flushBuffer=True)

            # Save the model every n episodes
            if self.training and save_model_each_n_episodes is not None and (
                    episode_number + 1) % save_model_each_n_episodes == 0:
                self.agent.saveModel()

            # if interactive display, show update for the episode
            if not self.training and interactive:
                self.update_display_episode()

            # If cancel requested, exit
            if self.agent.isCancelRequested():
                self.agent.log("*** Arrêt demandé par l'usager ***",
                               flushBuffer=True)
                break

        # if interactive, show a final episode update; otherwise plot the summary graph
        if interactive:
            self.update_display_episode()

        else:
            self.fig.clf()
            stats = plotting.EpisodeStats(episode_lengths=self.episode_length,
                                          episode_rewards=self.episode_reward,
                                          episode_running_variance=np.zeros(
                                              self.EPISODES))
            plotting.plot_episode_stats(stats, display_frequency)

        self.agent.log("")
        self.agent.log(f"Fin des épisodes")
        self.agent.log(f"Meilleur score obtenu: {self.high_score}")
        self.agent.log(
            f"Durée moyenne: {round(np.average(self.episode_length), 1)} actions"
        )
        self.agent.log(
            f"Score moyen: {round(np.average(self.episode_reward), 2)} points")
        self.agent.log("", flushBuffer=True)

        self.end_run()
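
The reward-shaping logic buried in the loop above can be restated as a small standalone helper for readability; the function name and signature below are illustrative only and do not exist in the project.

def shape_reward(reward, state, next_state, byte_vie=57, life_penalty=0, time_penalty=0):
    """Keep non-zero game rewards; otherwise penalize a lost life more than mere inaction."""
    if reward != 0:
        return reward
    if next_state[byte_vie] < state[byte_vie]:
        return -life_penalty  # a life was lost on this step
    return -time_penalty      # nothing happened: mild time penalty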
Code Example #12
                reward_ = self._update_states()
                #print 'New State', self.old_state
                stats.episode_rewards[i_episode] += reward_
                stats.episode_lengths[i_episode] = i

                if self.old_state == 15 or self.old_state == 0:
                    print('Ith episode, episode len:', i_episode, i)
                    break
                free_energy_2 = self.update_action(update=True)
                diff = reward_ + self.discount_factor * free_energy_2 - free_energy_1
                self._update_action_weights(diff)
                self._update_state_weights(diff)
        return stats


'''
rbm = RBM(nagent=3, nstate=16, nhid=20, naction=4)
stats1 = rbm.gibbs_sampling(100)

rbm = RBM(nagent=3, nstate=16, nhid=50, naction=4)
stats2 = rbm.gibbs_sampling(100)
'''
rbm = RBM(nagent=1, nstate=16, nhid=100, naction=4)
stats3 = rbm.gibbs_sampling(100)

Q, stats4 = sarsa(rbm.env, 100)
plotting.plot_episode_stats(stats3, stats4)
import ipdb

ipdb.set_trace()
Code Example #13
    #             stats.episode_rewards[i_episode] += reward
    #             stats.episode_lossbag[i_episode] += data_overflow
    #             if t>=4000:
    #                 break
    #             state = next_state
    #             total_t += 1

    stats_c = plotting.EpisodeStats(episode_transbag=np.zeros(num_episodes),
                                    episode_rewards=np.zeros(num_episodes),
                                    episode_lossbag=np.zeros(num_episodes))
    _ = env.reset_test()
    for i_episode in range(num_episodes):

        _ = env.reset_test()
        for t in itertools.count():
            best_action = greedyselect_known(env.S)
            _, reward, _, data_overflow, data_trans = env.step(best_action)
            stats_c.episode_transbag[i_episode] += data_trans
            stats_c.episode_rewards[i_episode] += reward
            stats_c.episode_lossbag[i_episode] += data_overflow
            if t >= 1000:
                break
        print("\rEpisode {} / {} , lossbag {}  D {}  ".format(
            i_episode + 1, num_episodes, stats_c.episode_lossbag[i_episode],
            np.sum(env.S[:, 1])),
              end="")
        sys.stdout.flush()
    print("\n")
    print(np.mean(stats_c.episode_lossbag))
    plotting.plot_episode_stats(stats, stats_comp=stats_c)
Code Example #14
def createEpsilonGreedyPolicy(Q, epsilon, num_actions):
  """
  Creates an epsilon-greedy policy based
  on a given Q-function and epsilon.

  Returns a function that takes the state
  as input and returns the probabilities
  for each action in the form of a numpy array
  of the length of the action space (set of possible actions).
  """
  def policyFunction(state):
    # uniform exploration probability plus a bonus on the greedy action
    Action_probabilities = np.ones(num_actions, dtype=float) * epsilon / num_actions
    best_action = np.argmax(Q[state])
    Action_probabilities[best_action] += (1.0 - epsilon)
    return Action_probabilities
  return policyFunction

#Step 4: Build Q-Learning Model
def qLearning(env, num_episodes, discount_factor=1.0, alpha=0.3, epsilon=0.1):
  """
  Q-Learning algorithm: off-policy TD control.
  Finds the optimal greedy policy while following
  an epsilon-greedy behaviour policy.
  """

  # Action-value function:
  # a nested dictionary that maps
  # state -> (action -> action-value).
  Q = defaultdict(lambda: np.zeros(env.action_space.n))

  # Keeps track of useful statistics
  stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                episode_rewards=np.zeros(num_episodes))
  
  # Create an epsilon greedy policy function 
  # appropriately for environment action space 
  policy = createEpsilonGreedyPolicy(Q, epsilon, env.action_space.n)
  
  # For every episode
  for ith_episode in range(num_episodes):
    state = env.reset()
    
    # Reset the environment and pick the first action
    for t in itertools.count():
      
      # get probabilities of all actions from current state    
      action_probabilities = policy(state)
      
      # choose action according to  
      # the probability distribution 
      action = np.random.choice(np.arange(len(action_probabilities)),
                                p = action_probabilities)
      
      # take action and get reward, transit to next state
      next_state, reward, done, _ = env.step(action)
      
      # Update statistics
      stats.episode_rewards[ith_episode] += reward
      stats.episode_lengths[ith_episode] = t
      
      # TD Update
      best_next_action = np.argmax(Q[next_state])
      td_target= reward + discount_factor * Q[next_state][best_next_action]
      td_delta = td_target - Q[state][action]
      Q[state][action] += alpha * td_delta
      
      # done is True if episode terminated
      if done:
        break
      
      state = next_state
  return Q, stats

#Step 5: Train the model
Q, stats = qLearning(env,2000)

#Step 6: Plot important statistics
plotting.plot_episode_stats(stats)

env.render()

"""**After many trials, the network was able to reach the highest reward of 1 (the target state). With this Q-Learning method, it doesn't seem getting the result faster compared to Q-Table method. However, this Q-learning method can give us nice visualization about statistics.**"""
Code Example #15
def main():
    estimator = Estimator()
    stats = expected_sarsa(env, estimator, 100, epsilon=0.0)
    plotting.plot_cost_to_go_mountain_car(env, estimator)
    plotting.plot_episode_stats(stats, smoothing_window=25)
Code Example #16
            # TD Update
            # TODO: is this correct? seems like old qval should be scaled by 1-alpha, but
            #       maybe it's all the same.
            best_next_action = np.argmax(Q[next_state])
            td_target = reward + discount_factor * Q[next_state][best_next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            if verbose_mode:
                env.render("episode: ({},{})  rewards:{}".format(
                    ith_episode, t, stats.episode_rewards[ith_episode]))

            # done is True if episode terminated
            if done or t > MAX_TIME_STEPS:
                break

            state = next_state

    return Q, stats

#
# Experiments
#

# basic training
outputPath = ""
actionVal, stats = qLearning(windyGridEnv, nEpisodes, True, discount_factor=GAMMA, alpha=ALPHA, epsilon=EPSILON)
fig = plotting.plot_episode_stats(stats, smoothing_window = nEpisodes // 10)
fig.savefig(os.path.join(outputPath, "basic_training_results.png"))

print("la fin")
Code Example #17
File: learn.py    Project: Alfeezy/learn-to-fly
			# Update statistics 
			stats.episode_rewards[ith_episode] += reward 
			stats.episode_lengths[ith_episode] = t 
			
			# TD Update 
			best_next_action = np.argmax(Q[next_state])	 
			td_target = reward + discount_factor * Q[next_state][best_next_action] 
			td_delta = td_target - Q[state][action] 
			Q[state][action] += alpha * td_delta 

			# done is True if episode terminated 
			if done: 
				count += 1

			# stop once the done state has been reached more than 5 times
			if count > 5:
				break

			# won't go forever
			if t > 1000:
				break
				
			state = next_state

	return Q, stats

Q, stats = qLearning(env, 1000, alpha=a, discount_factor=d, epsilon=e) 

plotting.plot_episode_stats(stats, a, d, e)