def run_sarsa(self, max_number_of_episodes=100, interactive=False, display_frequency=1):

    # repeat for each episode
    for episode_number in range(max_number_of_episodes):

        # initialize state
        state = self.env.reset()

        done = False  # used to indicate terminal state
        R = 0         # used to display accumulated rewards for an episode
        t = 0         # used to display accumulated steps for an episode i.e episode length

        # choose action from state using policy derived from Q
        action = self.agent.act(state)

        # repeat for each step of episode, until state is terminal
        while not done:

            t += 1  # increase step counter - for display

            # take action, observe reward and next state
            next_state, reward, done, _ = self.env.step(action)

            # choose next action from next state using policy derived from Q
            next_action = self.agent.act(next_state)

            # agent learn (SARSA update)
            self.agent.learn(state, action, reward, next_state, next_action)

            # state <- next state, action <- next_action
            state = next_state
            action = next_action

            R += reward  # accumulate reward - for display

            # if interactive display, show update for each step
            if interactive:
                self.update_display_step()

        self.episode_length = np.append(self.episode_length, t)  # keep episode length - for display
        self.episode_reward = np.append(self.episode_reward, R)  # keep episode reward - for display

        # if interactive display, show update for the episode
        if interactive:
            self.update_display_episode()

    # if not interactive display, show graph at the end
    if not interactive:
        self.fig.clf()
        stats = plotting.EpisodeStats(
            episode_lengths=self.episode_length,
            episode_rewards=self.episode_reward,
            episode_running_variance=np.zeros(max_number_of_episodes))
        plotting.plot_episode_stats(stats, display_frequency)
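# A minimal tabular sketch of the SARSA rule that self.agent.learn() presumably applies above.
# This is an assumption for illustration only, not this project's Agent implementation:
#     Q(s, a) <- Q(s, a) + alpha * (r + gamma * Q(s', a') - Q(s, a))
from collections import defaultdict
import numpy as np

def sarsa_update_sketch(Q, state, action, reward, next_state, next_action, alpha=0.1, gamma=0.99):
    # on-policy TD target: bootstrap from the action actually chosen in the next state
    td_target = reward + gamma * Q[next_state][next_action]
    Q[state][action] += alpha * (td_target - Q[state][action])

# usage sketch: Q maps state -> array of action values, updated once per environment step
Q_demo = defaultdict(lambda: np.zeros(2))
sarsa_update_sketch(Q_demo, state=0, action=1, reward=1.0, next_state=1, next_action=0)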
def policy_f(env, scaler, featurizer, print_ep_lens):
    '''
    Main calling function for generating the expert policy.
    ** Read the multi-line comment at the start of this file for background. **

    Args:
        env: Gym environment.
        scaler: Mean and variance of the state values.
        featurizer: The container used for generating expert trajectories.
        print_ep_lens: [bool] Print each iteration with the number of time steps required for completion.

    Returns:
        a) Plots statistics of mountain-car learning with the built-in rewards of the gym
           environment, so results can be compared against mountain-car learning with the
           learnt reward function.
        b) The "Demonstration By Expert" (DBE) policy.
    '''
    estimator = Estimator(env, scaler, featurizer)
    stats = q_learning_best_policy(env, estimator, 200, epsilon=0.0, print_ep_lens=False)

    print("___Plotting Learning Stats of the Agent____")
    plotting.plot_cost_to_go_mountain_car(env, estimator)
    plotting.plot_episode_stats(stats, smoothing_window=25)

    final_policy = greedy_policy(estimator, env.action_space.n)
    return final_policy, estimator
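# A minimal sketch -- an assumption, not this repository's greedy_policy -- of how a deterministic
# greedy policy can be derived from a Q-value estimator such as the one returned above. It assumes
# the estimator exposes predict(state) returning one Q-value per action, as in the SARSA snippet
# further below.
import numpy as np

def greedy_policy_sketch(estimator, num_actions):
    """Return a function mapping a state to the index of the highest-valued action."""
    def policy(state):
        q_values = np.asarray(estimator.predict(state))[:num_actions]
        return int(np.argmax(q_values))
    return policy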
def run(self):
    print('\tValue Iteration')
    vi_agent = ValueIterationAgent(self.env, self.gamma)
    print('\t\tAverage reward: ' + str(np.mean(vi_agent.scores)))
    print('\t\tConvergence step: ' + str(vi_agent.convergence))
    print('\t\tPolicy: ' + str(vi_agent.policy))

    print('\tPolicy Iteration')
    self.env.reset()
    pi_agent = PolicyIterationAgent(self.env, self.gamma)
    print('\t\tAverage reward: ' + str(np.mean(pi_agent.scores)))
    print('\t\tConvergence step: ' + str(pi_agent.convergence))
    print('\t\tPolicy: ' + str(pi_agent.policy))

    print('\tQ Learning')
    self.env.reset()
    ql_agent = QLearningAgent(self.env)
    q, stats = ql_agent.q_learning(self.env, 500)
    plotting.plot_episode_stats(stats, experiment_name=self.name)
        for t in count():
            env.render()
            action = actor_critic.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            next_state = torch.Tensor(next_state)
            steps.append(Transition(state=state, action=action, reward=reward,
                                    next_state=next_state, done=done))
            stats.episode_rewards[i_episode] += reward

            # calculate total loss
            total_loss = actor_critic.loss_func(state, action, reward, next_state, gamma)

            ac_optim.zero_grad()
            total_loss.backward()
            ac_optim.step()

            print("\rStep {} @ Episode {}/{} ({})".format(
                t, i_episode + 1, n_episodes, stats.episode_rewards[i_episode - 1]), end="")

            if done:
                stats.episode_lengths[i_episode] = t
                break

            state = next_state

    return stats, steps


if __name__ == '__main__':
    gamma, num_episodes = args.gamma, args.episodes
    stats, steps = ac_train(env, actor_critic, ac_optim, num_episodes, gamma)

    # Plot 3 plots: episode_reward vs time, episode_length vs time, episode_number vs time
    plot_episode_stats(stats)
def main(): logging.info("define environment and basis function") env_id = "MountainCar-v0" env = gym.envs.make(env_id) logging.info("env_id: {}".format(env_id)) action_list = range(env.action_space.n) # linear basis func p_linear = 3 q_linear = 3 phi_linear = simple_phi psi_linear = phi_linear # radial basis (gaussian) fn p_rbf = 100 q_rbf = 100 phi_rbf = get_basis_function(env_id) psi_rbf = phi_rbf # this is specific to mountaincar-v0 init_s_sampler = lambda: [np.random.uniform(-0.4, -0.6), 0.0] # 2. define hyperparams gamma = 0.95 n_trial = 2 n_iteration = 10 # @note: hard-coded # this's gotta be sufficiently large to avoid mc variance issue sample_size_mc = 10**2 #p = p_linear #q = q_linear #phi = phi_linear #psi = psi_linear p = p_rbf q = q_rbf phi = phi_rbf psi = psi_rbf precision = 1e-4 use_slack = False # @note: reward may have to be scaled to work with slack penalty slack_penalty = 1e-3 eps = 0.0001 #eps = 0 # this should be large to account for varying init sate mu_sample_size = 50 logging.info("collect a batch of data (D) from pi_expert (and some noise)") pi_exp = NearExpertPolicy() pi_random = get_random_policy() # preprocessing D in numpy array for k logging.info("apprenticeship learning starts") logging.info("feature dim:\n{}".format(phi)) mu_exp = AL.estimate_mu(env=env, pi_eval=pi_exp, mu_sample_size=sample_size_mc, phi=phi, gamma=gamma, return_epi_len=False) #mu_mc_list = estimate_mu_mc(env, pi_exp, phi_linear, gamma, sample_size_mc) #mu_mc_list = estimate_mu_mc(env, pi_exp, phi_rbf, gamma, sample_size_mc) #mu_exp = np.mean(mu_mc_list, axis=0) pi_init = pi_random mdp_solver = LinearQ3(env=env, phi=phi, action_list=action_list, n_episode=100, epsilon=0.0, gamma=gamma) al = AL(env=env, pi_init=pi_init, action_list=action_list, p=p, q=q, phi=phi, psi=psi, gamma=gamma, eps=eps, mu_exp=mu_exp, init_s_sampler=init_s_sampler, mu_sample_size=mu_sample_size, precision=precision, mdp_solver=mdp_solver, use_slack=use_slack, slack_penalty=slack_penalty) results = al.run(n_trial=n_trial, n_iteration=n_iteration) # 5. post-process results (plotting) pi_irl = results["policy_best"][0] weight_irl = results["weight_best"][0] margin_v = results["margin_v"][0] margin_mu = results["margin_mu"][0] weight = results["weight"][0] state_dim = env.observation_space.shape[0] # discrete action action_dim = 1 n_action = env.action_space.n sim = Simulator(env, state_dim=state_dim, action_dim=action_dim) D_irl, stats = sim.simulate(pi_irl, n_trial=1, n_episode=15, return_stats=True) plotting.plot_cost_to_go_mountain_car(env, pi_irl._estimator) plotting.plot_episode_stats(stats, smoothing_window=5) np.save("data/D_irl.npy".format(time()), D_irl) np.save("data/margin_v.npy".format(time()), margin_v) np.save("data/margin_mu.npy".format(time()), margin_mu) np.save("data/weight.npy".format(time()), weight) np.save("data/weight_best.npy".format(time()), weight_irl) print("D_irl shape{}".format(D_irl.shape)) with open("data/res_{}".format(time()), "wb") as f: pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)
            next_state, reward, end, _ = env.step(action)

            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)),
                                           p=next_action_probs)

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            q_values_next = estimator.predict(next_state)
            td_target = reward + discount_factor * q_values_next[next_action]
            estimator.update(state, action, td_target)

            if i_episode % 10 == 0:
                print("\rStep {} @ Episode {}/{} ({})".format(
                    t, i_episode + 1, num_episodes, reward))

            if end:
                break

            state = next_state
            action = next_action

    return stats


estimator = FunctionApproximator()
stats = sarsa(env, estimator, 200, epsilon=0.0)

plotting.plot_cost_to_go_mountain_car(env, estimator)
plotting.plot_episode_stats(stats, smoothing_window=25)
                  replay_memory_size=1024,
                  # replay_memory_init_size=50000,
                  replay_memory_init_size=128,
                  # update_target_estimator_every=10000,
                  update_target_estimator_every=500,
                  epsilon_start=1.0,
                  epsilon_end=0.1,
                  # epsilon_decay_steps=500000,
                  epsilon_decay_steps=800,
                  discount_factor=0.99,
                  # batch_size=32):
                  batch_size=32):

    print("\nEpisode Reward: {}".format(stats.episode_rewards[-1]))

q_estimator.save_states(experiment_dir + "/lstmvis")

ep_length, ep_reward, t_steps = plotting.plot_episode_stats(stats, smoothing_window=5, noshow=True)
ep_length.savefig(experiment_dir + '/ep_length.png')
ep_reward.savefig(experiment_dir + '/ep_reward.png')
t_steps.savefig(experiment_dir + '/t_steps.png')

if os.path.exists("./log.txt"):
    copyfile("./log.txt", experiment_dir + "/log.txt")
    os.remove("./log.txt")
print("\rStep {} @ Episode {}/{} ({})".format( t, i_episode+1, num_episodes, stats.episode_rewards[i_episode-1])) if done: break state = next_state return stats tf.reset_default_graph() global_step = tf.Variable(0, name="global_step", trainable=False) policy_estimator = PolicyEstimator(learning_rate=0.001) value_estimator = ValueEstimator(learning_rate=0.1) with tf.Session() as sess: sess.run(tf.initialize_all_variables()) stats = actor_critic(env, policy_estimator, value_estimator, 50, discount_factor=0.95) plotting.plot_episode_stats(stats, smoothing_window=25)
            action = np.random.choice(np.arange(len(action_probabilities)),
                                      p=action_probabilities)

            # take action and get reward, transit to next state
            next_state, reward, done, _ = env.step(action)

            # Update statistics
            stats.episode_rewards[ith_episode] += reward
            stats.episode_lengths[ith_episode] = t

            # TD Update
            best_next_action = np.argmax(Q[next_state])
            td_target = reward + discount_factor * Q[next_state][best_next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            # done is True if episode terminated
            if done:
                break

            state = next_state

    return Q, stats


Q, stats = qLearning(env, 1000)

plotting.plot_episode_stats(stats)
def run_discrete(environment_name, mapping=None, shape=None):
    problem = gym.make(environment_name)
    print('== {} =='.format(environment_name))
    print('Actions:', problem.action_space.n)
    print('States:', problem.observation_space.n)
    print(problem.desc)
    print()

    if environment_name == 'TaxiEnv-v1':
        print('== Value Iteration ==')
        value_policy, iters = value_iteration_local(problem)
        print('Iterations:', iters)
        print()

        print('== Policy Iteration ==')
        policy, iters = policy_iteration_local(problem)
        print('Iterations:', iters)
        print()

        diff = sum([abs(x - y) for x, y in zip(policy.flatten(), value_policy.flatten())])
        if diff > 0:
            print('Discrepancy:', diff)
            print()

        if shape is not None:
            print('== Policy ==')
            print_policy(value_policy, mapping, shape)
            print_policy(policy, mapping, shape)
            print()

        taxi_q_learning()
    else:
        print('== Value Iteration ==')
        value_policy_local, iters = value_iteration_local(problem)
        value_policy, Vi, iters, time = value_iteration(problem)
        print('Iterations:', iters)
        print()

        print('== Policy Iteration ==')
        policy_local, iters = policy_iteration_local(problem)
        policy, V, iters, time = policy_iteration(problem)
        print('Iterations:', iters)
        print()

        visualize_policy(value_policy, environment_name, problem.desc.shape,
                         'Optimal policy - Modified transition model')
        visualize_value(Vi, environment_name, problem.desc.shape,
                        'Value estimates - Modified transition model')

        diff = sum([abs(x - y) for x, y in zip(policy.flatten(), value_policy.flatten())])
        if diff > 0:
            print('Discrepancy:', diff)
            print()

        if shape is not None:
            print('== Policy ==')
            print_policy(value_policy_local, mapping, shape)
            print_policy(policy_local, mapping, shape)
            print()

        frozenlake_q_learning()
        Q, stats, Nsa, final_policy = q_learning(problem, 'greedy', 1000)
        plotting.plot_episode_stats(stats)

    return policy
def run_qlearning(self, interactive=False, display_frequency=1,
                  save_model_each_n_episodes=50, time_penalty=0, life_penalty=0):
    self.start_run()

    BYTE_VIE = 57  # RAM byte holding the number of remaining lives

    # repeat for each episode
    for episode_number in range(self.EPISODES):

        # initialize state
        state = self.env.reset()

        done = False  # used to indicate terminal state
        R = 0         # used to display accumulated rewards for an episode
        t = 0         # used to display accumulated steps for an episode i.e. episode length

        # repeat for each step of episode, until state is terminal
        while not done:

            t += 1  # increase step counter - for display

            # choose action from state using policy derived from Q
            action = self.agent.act(state)

            # take action, observe reward and next state
            next_state, reward, done, _ = self.env.step(action)

            # Penalize doing nothing, to avoid episodes that drag on
            learning_reward = reward
            # After discussion with Mikael: do not penalize inaction or lost lives
            if learning_reward == 0:
                vies_actuelles = state[BYTE_VIE]
                vies_apres = next_state[BYTE_VIE]
                if vies_apres < vies_actuelles:
                    #print("life lost")
                    learning_reward = -life_penalty  # heavily penalize losing a life
                else:
                    learning_reward = -time_penalty  # lightly penalize inaction

            # agent learn (Q-Learning update)
            if self.training:
                self.agent.learn(state, action, learning_reward, next_state, done)

            # state <- next state
            state = next_state

            R += reward  # accumulate reward - for display

            # if interactive display, show update for each step
            #if self.training and interactive: self.update_display_step(t)

            # If cancel requested, exit
            if self.agent.isCancelRequested():
                self.agent.log("*** Stop request detected ***", flushBuffer=True)
                break

        self.episode_length = np.append(self.episode_length, t)  # keep episode length - for display
        self.episode_reward = np.append(self.episode_reward, R)  # keep episode reward - for display

        if R > 0:
            self.agent.log(
                f"Episode {episode_number+1}/{self.EPISODES}: R={R}, Steps={t}",
                doPrint=not interactive)

        # Update image of highest score only
        if interactive:
            if R >= self.high_score:
                self.update_display_step()
                self.update_display_episode()

        if R > self.high_score:
            self.high_score = R
            self.agent.log(
                f"\tNew best score: {self.high_score}, episode #{episode_number + 1}",
                flushBuffer=True)

        # Save the model every n episodes
        if self.training and save_model_each_n_episodes is not None and \
                (episode_number + 1) % save_model_each_n_episodes == 0:
            self.agent.saveModel()

        # if interactive display, show update for the episode
        if not self.training and interactive:
            self.update_display_episode()

        # If cancel requested, exit
        if self.agent.isCancelRequested():
            self.agent.log("*** Stop requested by the user ***", flushBuffer=True)
            break

    # if interactive display, show graph at the end
    if interactive:
        self.update_display_episode()
    else:
        self.fig.clf()
        stats = plotting.EpisodeStats(episode_lengths=self.episode_length,
                                      episode_rewards=self.episode_reward,
                                      episode_running_variance=np.zeros(self.EPISODES))
        plotting.plot_episode_stats(stats, display_frequency)

    self.agent.log("")
    self.agent.log("End of episodes")
    self.agent.log(f"Best score obtained: {self.high_score}")
    self.agent.log(f"Average episode length: {round(np.average(self.episode_length), 1)} actions")
    self.agent.log(f"Average score: {round(np.average(self.episode_reward), 2)} points")
    self.agent.log("", flushBuffer=True)

    self.end_run()
            reward_ = self._update_states()
            #print('New State', self.old_state)
            stats.episode_rewards[i_episode] += reward_
            stats.episode_lengths[i_episode] = i

            if self.old_state == 15 or self.old_state == 0:
                print('Ith episode, episode len', i_episode, i)
                break

            free_energy_2 = self.update_action(update=True)
            diff = reward_ + self.discount_factor * free_energy_2 - free_energy_1
            self._update_action_weights(diff)
            self._update_state_weights(diff)

        return stats


'''
rbm = RBM(nagent=3, nstate=16, nhid=20, naction=4)
stats1 = rbm.gibbs_sampling(100)
rbm = RBM(nagent=3, nstate=16, nhid=50, naction=4)
stats2 = rbm.gibbs_sampling(100)
'''

rbm = RBM(nagent=1, nstate=16, nhid=100, naction=4)
stats3 = rbm.gibbs_sampling(100)

Q, stats4 = sarsa(rbm.env, 100)

plotting.plot_episode_stats(stats3, stats4)

import ipdb
ipdb.set_trace()
#             stats.episode_rewards[i_episode] += reward
#             stats.episode_lossbag[i_episode] += data_overflow
#             if t >= 4000:
#                 break
#             state = next_state
#             total_t += 1

stats_c = plotting.EpisodeStats(episode_transbag=np.zeros(num_episodes),
                                episode_rewards=np.zeros(num_episodes),
                                episode_lossbag=np.zeros(num_episodes))

_ = env.reset_test()
for i_episode in range(num_episodes):
    _ = env.reset_test()
    for t in itertools.count():
        best_action = greedyselect_known(env.S)
        _, reward, _, data_overflow, data_trans = env.step(best_action)
        stats_c.episode_transbag[i_episode] += data_trans
        stats_c.episode_rewards[i_episode] += reward
        stats_c.episode_lossbag[i_episode] += data_overflow
        if t >= 1000:
            break
    print("\rEpisode {} / {} , lossbag {} D {} ".format(
        i_episode + 1, num_episodes, stats_c.episode_lossbag[i_episode],
        np.sum(env.S[:, 1])), end="")
    sys.stdout.flush()

print("\n")
print(np.mean(stats_c.episode_lossbag))

plotting.plot_episode_stats(stats, stats_comp=stats_c)
def createEpsilonGreedyPolicy(Q, epsilon, num_actions):
    """
    Creates an epsilon-greedy policy based on a given Q-function and epsilon.

    Returns a function that takes the state as an input and returns the
    probabilities for each action in the form of a numpy array with the
    length of the action space (set of possible actions).
    """
    def policyFunction(state):
        Action_probabilities = np.ones(num_actions, dtype=float) * epsilon / num_actions
        best_action = np.argmax(Q[state])
        Action_probabilities[best_action] += (1.0 - epsilon)
        return Action_probabilities

    return policyFunction


# Step 4: Build Q-Learning Model
def qLearning(env, num_episodes, discount_factor=1.0, alpha=0.3, epsilon=0.1):
    """
    Q-Learning algorithm: Off-policy TD control.
    Finds the optimal greedy policy while improving by following an epsilon-greedy policy.
    """
    # Action value function
    # A nested dictionary that maps
    # state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # Create an epsilon greedy policy function
    # appropriately for environment action space
    policy = createEpsilonGreedyPolicy(Q, epsilon, env.action_space.n)

    # For every episode
    for ith_episode in range(num_episodes):

        # Reset the environment and pick the first action
        state = env.reset()

        for t in itertools.count():

            # get probabilities of all actions from current state
            action_probabilities = policy(state)

            # choose action according to
            # the probability distribution
            action = np.random.choice(np.arange(len(action_probabilities)),
                                      p=action_probabilities)

            # take action and get reward, transit to next state
            next_state, reward, done, _ = env.step(action)

            # Update statistics
            stats.episode_rewards[ith_episode] += reward
            stats.episode_lengths[ith_episode] = t

            # TD Update
            best_next_action = np.argmax(Q[next_state])
            td_target = reward + discount_factor * Q[next_state][best_next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            # done is True if episode terminated
            if done:
                break

            state = next_state

    return Q, stats


# Step 5: Train the model
Q, stats = qLearning(env, 2000)

# Step 6: Plot important statistics
plotting.plot_episode_stats(stats)

env.render()

"""**After many trials, the network was able to reach the highest reward of 1 (the target state).
With this Q-Learning method, the result does not seem to arrive faster than with the Q-table
method; however, it does give us a nice visualization of the training statistics.**"""
def main():
    estimator = Estimator()
    stats = expected_sarsa(env, estimator, 100, epsilon=0.0)

    plotting.plot_cost_to_go_mountain_car(env, estimator)
    plotting.plot_episode_stats(stats, smoothing_window=25)
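# A minimal sketch -- an assumption, not this repository's expected_sarsa -- of the Expected SARSA
# target used with a function approximator: instead of sampling the next action as SARSA does, it
# averages over the epsilon-greedy policy, target = r + gamma * sum_a pi(a|s') * Q(s', a).
import numpy as np

def expected_sarsa_target(estimator, next_state, reward, discount_factor, epsilon, num_actions):
    q_next = np.asarray(estimator.predict(next_state))   # Q(s', a) for every action
    probs = np.ones(num_actions) * epsilon / num_actions  # epsilon-greedy action probabilities
    probs[np.argmax(q_next)] += 1.0 - epsilon
    return reward + discount_factor * np.dot(probs, q_next)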
            # TD Update
            # TODO: is this correct? seems like old qval should be scaled by 1-alpha, but
            # maybe it's all the same.
            best_next_action = np.argmax(Q[next_state])
            td_target = reward + discount_factor * Q[next_state][best_next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            if verbose_mode:
                env.render("episode: ({},{}) rewards:{}".format(
                    ith_episode, t, stats.episode_rewards[ith_episode]))

            # done is True if episode terminated
            if done or t > MAX_TIME_STEPS:
                break

            state = next_state

    return Q, stats


#
# Experiments
#

# basic training
outputPath = ""
actionVal, stats = qLearning(windyGridEnv, nEpisodes, True,
                             discount_factor=GAMMA, alpha=ALPHA, epsilon=EPSILON)

fig = plotting.plot_episode_stats(stats, smoothing_window=nEpisodes // 10)
fig.savefig(os.path.join(outputPath, "basic_training_results.png"))

print("the end")
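# Note on the TODO above: the incremental update is algebraically the same as explicitly mixing
# with (1 - alpha), since Q + alpha * (td_target - Q) == (1 - alpha) * Q + alpha * td_target.
# Tiny numeric check of that identity (illustrative only, not part of the experiment):
old_q, target, alpha_demo = 2.0, 5.0, 0.3
assert abs((old_q + alpha_demo * (target - old_q))
           - ((1 - alpha_demo) * old_q + alpha_demo * target)) < 1e-12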
            # Update statistics
            stats.episode_rewards[ith_episode] += reward
            stats.episode_lengths[ith_episode] = t

            # TD Update
            best_next_action = np.argmax(Q[next_state])
            td_target = reward + discount_factor * Q[next_state][best_next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            # done is True if episode terminated
            if done:
                count += 1
                # stop once the done state has been reached more than 5 times
                if count > 5:
                    break

            # won't go forever
            if t > 1000:
                break

            state = next_state

    return Q, stats


Q, stats = qLearning(env, 1000, alpha=a, discount_factor=d, epsilon=e)

plotting.plot_episode_stats(stats, a, d, e)