def main():
    env = gym.envs.make("MountainCar-v0")

    # Feature Preprocessing: Normalize to zero mean and unit variance
    # We use a few samples from the observation space to do this
    observation_examples = np.array(
        [env.observation_space.sample() for x in range(10000)])
    scaler = sklearn.preprocessing.StandardScaler()
    scaler.fit(observation_examples)

    # Used to convert a state to a featurized representation.
    # We use RBF kernels with different variances to cover different parts of the space
    featurizer = sklearn.pipeline.FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=100))
    ])
    featurizer.fit(scaler.transform(observation_examples))

    estimator = Estimator(env, scaler, featurizer)

    # Note: For the Mountain Car we don't actually need an epsilon > 0.0
    # because our initial estimate for all states is too "optimistic" which leads
    # to the exploration of all states.
    stats = q_learning(env, estimator, 100, epsilon=0.0)

    plotting.plot_cost_to_go_mountain_car(env, estimator)
    plotting.plot_episode_stats(stats, smoothing_window=25)
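
The Estimator constructed here is not shown in this example; a minimal sketch of one plausible construction, assuming one SGDRegressor per action over the 400-dimensional RBF features built above (class and method names are hypothetical, not this example's actual API):

import numpy as np
from sklearn.linear_model import SGDRegressor

class LinearQEstimator:
    """Hypothetical stand-in for the Estimator above: one linear model per action."""

    def __init__(self, env, scaler, featurizer):
        self.scaler = scaler
        self.featurizer = featurizer
        self.models = []
        for _ in range(env.action_space.n):
            model = SGDRegressor(learning_rate="constant")
            # Prime the model so predict() works before the first real update.
            model.partial_fit([self.featurize(env.observation_space.sample())], [0.0])
            self.models.append(model)

    def featurize(self, state):
        scaled = self.scaler.transform([state])       # zero mean, unit variance
        return self.featurizer.transform(scaled)[0]   # 4 x 100 RBF components -> shape (400,)

    def predict(self, state):
        features = self.featurize(state)
        return np.array([m.predict([features])[0] for m in self.models])

    def update(self, state, action, td_target):
        features = self.featurize(state)
        self.models[action].partial_fit([features], [td_target])
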
Example 2
    def run_qlearning(self,
                      max_number_of_episodes=100,
                      interactive=False,
                      display_frequency=1):
        if interactive:
            plt.ion()
            plt.show()
        else:
            plt.close()

        # repeat for each episode
        for episode_number in range(max_number_of_episodes):

            # initialize state
            state = self.env.reset()

            done = False  # used to indicate terminal state
            R = 0  # used to display accumulated rewards for an episode
            t = 0  # used to display accumulated steps for an episode, i.e. episode length

            # repeat for each step of episode, until state is terminal
            while not done:

                t += 1  # increase step counter - for display

                # choose action from state using policy derived from Q
                action = self.agent.act(state)

                # take action, observe reward and next state
                next_state, reward, done, _ = self.env.step(action)

                # agent learn (Q-Learning update)
                self.agent.learn(state, action, reward, next_state, done)

                # state <- next state
                state = next_state

                R += reward  # accumulate reward - for display

                # if interactive display, show update for each step
                if interactive:
                    self.update_display_step()

            self.episode_length = np.append(
                self.episode_length, t)  # keep episode length - for display
            self.episode_reward = np.append(
                self.episode_reward, R)  # keep episode reward - for display

            # if interactive display, show update for the episode
            if interactive:
                self.update_display_episode()

        # if not interactive display, show graph at the end
        if not interactive:
            self.fig.clf()
            stats = plotting.EpisodeStats(
                episode_lengths=self.episode_length,
                episode_rewards=self.episode_reward,
                episode_running_variance=np.zeros(max_number_of_episodes))
            plotting.plot_episode_stats(stats, display_frequency)
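
The agent's act/learn pair used above is the standard tabular Q-learning interface; a minimal sketch of what such an agent could look like (a hedged illustration, not this class's actual implementation; hyperparameter values are placeholders):

import numpy as np
from collections import defaultdict

class TabularQAgent:
    """Sketch of an act/learn agent for discrete-state environments."""

    def __init__(self, n_actions, alpha=0.5, gamma=1.0, epsilon=0.1):
        self.Q = defaultdict(lambda: np.zeros(n_actions))
        self.n_actions = n_actions
        self.alpha, self.gamma, self.epsilon = alpha, gamma, epsilon

    def act(self, state):
        # choose action from state using an epsilon-greedy policy derived from Q
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.n_actions)
        return int(np.argmax(self.Q[state]))

    def learn(self, state, action, reward, next_state, done):
        # Q-Learning update: bootstrap from the greedy value of the next state
        target = reward + (0.0 if done else self.gamma * np.max(self.Q[next_state]))
        self.Q[state][action] += self.alpha * (target - self.Q[state][action])
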
Example 3
def main():

    print "PolyRL Q Learning"

    #discretizing the action space
    action_space = np.linspace(env.min_action, env.max_action, num=10)
    w_param = np.random.normal(size=(400, action_space.shape[0] + 1))

    print "Action Space", action_space

    num_episodes = 200
    smoothing_window = 100
    stats_poly_q_learning = poly_rl_q_learning(env,
                                               w_param,
                                               num_episodes,
                                               epsilon=0.1)
    rewards_smoothed_stats_poly_q_learning = pd.Series(
        stats_poly_q_learning.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_smoothed_stats_poly_q_learning
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Persistence_Length_Exploration/Results/'
        + 'Trial_PolyRL' + '.npy', cum_rwd)
    plotting.plot_episode_stats(stats_poly_q_learning)
    env.close()
def main():

    print "Tree Backup (lambda)"

    theta = np.zeros(shape=(400, env.action_space.n))

    num_episodes = 1000
    smoothing_window = 1

    # stats_sarsa_tb_lambda, cumulative_errors = q_sigma_lambda_on_policy_static_sigma(env, theta, num_episodes)
    # rewards_smoothed_stats_tb_lambda = pd.Series(stats_sarsa_tb_lambda.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()
    # cum_rwd = rewards_smoothed_stats_tb_lambda
    # cum_err = cumulative_errors
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Linear_Approximator/Eligibility_Traces/Accumulating_Traces/WindyGrid_Results/'  + 'Q_sigma_lambda_OnPolicy_Static_Sigma_RBF_Cum_Rwd_2' + '.npy', cum_rwd)
    # np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Linear_Approximator/Eligibility_Traces/Accumulating_Traces/WindyGrid_Results/'  + 'Q_sigma_lambda_OnPolicy_Static_Sigma_RBF_Cum_Err_2' + '.npy', cum_err)
    # plotting.plot_episode_stats(stats_sarsa_tb_lambda)
    # env.close()

    stats_sarsa_tb_lambda, cumulative_errors = q_sigma_lambda_on_policy_dynamic_sigma(
        env, theta, num_episodes)
    rewards_smoothed_stats_tb_lambda = pd.Series(
        stats_sarsa_tb_lambda.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_smoothed_stats_tb_lambda
    cum_err = cumulative_errors
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Linear_Approximator/Eligibility_Traces/Accumulating_Traces/WindyGrid_Results/'
        + 'Q_sigma_lambda_OnPolicy_Dynamic_Sigma_RBF_Cum_Rwd_2' + '.npy',
        cum_rwd)
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Linear_Approximator/Eligibility_Traces/Accumulating_Traces/WindyGrid_Results/'
        + 'Q_sigma_lambda_OnPolicy_Dynamic_Sigma_RBF_Cum_Err_2' + '.npy',
        cum_err)
    plotting.plot_episode_stats(stats_sarsa_tb_lambda)
    env.close()
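
Both runs above treat Q(s, a) as a linear function of a 400-dimensional feature vector, Q(s, a) = phi(s) . w[:, a]. A minimal sketch of the epsilon-greedy action selection this implies (the standalone function and its argument names are assumptions, not this code's actual helpers):

import numpy as np

def epsilon_greedy_linear(phi_s, w, epsilon):
    # phi_s: feature vector of shape (400,); w: weights of shape (400, n_actions)
    q_values = phi_s.dot(w)
    n_actions = q_values.shape[0]
    probs = np.ones(n_actions) * epsilon / n_actions
    probs[np.argmax(q_values)] += 1.0 - epsilon
    return np.random.choice(n_actions, p=probs)
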
Example 5
def test_td_control_method(env):
    """
    plot_episode_stats([test_expected_sarsa_method(env),test_n_setps_expected_sarsa_method(env),test_off_policy_n_steps_sarsa(env),
                        test_n_steps_sarsa_method(env),test_qlearning_method(env),test_sarsa_lambda_method(env),test_q_lambda_method(env)])
    
    """
    plot_episode_stats([
        test_qlearning_method(env),
        test_q_lambda_method(env),
        test_double_q_learning_method(env),
        test_dynaQ_method_trival(env),
        test_dynaQ_method_priority(env)
    ])
Example 6
def main():
    env = gym.make('MountainCar-v0')
    outdir = './experiment-results'
    # env = wrappers.Monitor(env, directory=outdir, force=True)

    # Keeps track of useful statistics
    num_episodes = 300
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # Feature Preprocessing: Normalize to zero mean and unit variance
    # We use a few samples from the observation space to do this
    observation_examples = np.array(
        [env.observation_space.sample() for x in range(10000)])
    scaler = sklearn.preprocessing.StandardScaler()
    scaler.fit(observation_examples)

    # Used to convert a state to a featurized representation.
    # We use RBF kernels with different variances to cover different parts of the space
    featurizer = sklearn.pipeline.FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=100))
    ])
    featurizer.fit(scaler.transform(observation_examples))

    agent = Agent(env.action_space.n,
                  scaler,
                  featurizer,
                  env.observation_space.sample(),
                  epsilon=0,
                  gamma=1)

    for i_episode in range(num_episodes):
        print("\rEpisode {}/{} ({})".format(
            i_episode + 1, num_episodes, stats.episode_rewards[i_episode - 1]),
              end="")
        sys.stdout.flush()

        state = env.reset()
        action = agent.set_initial_state(state)

        for t in itertools.count():
            next_state, reward, done, info = env.step(action)
            action = agent.act(next_state, reward)

            # book-keeping
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            if done:
                break

    env.close()
    # gym.upload(outdir, api_key='sk_9YxUhFDaT5XSahcLut47w')

    plotting.plot_cost_to_go_mountain_car(env, agent.Q)
    plotting.plot_episode_stats(stats, smoothing_window=25)
Example 7
def main():
    matplotlib.style.use('ggplot')

    env = gym.envs.make("MountainCar-v0")

    num_episodes = 100

    estimator_q_learning = tile_coding_estimator.Estimator(env)
    statistics_q_learning = plotting.EpisodeStats(
        "q_learning",
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    # Note: For the Mountain Car we don't actually need an epsilon > 0.0
    # because our initial estimate for all states is too "optimistic" which leads
    # to the exploration of all states.
    q_learning_tile_coding.q_learning(env,
                                      estimator_q_learning,
                                      num_episodes,
                                      statistics_q_learning,
                                      epsilon=0.0)
    plotting.plot_cost_to_go_mountain_car(env, estimator_q_learning)

    estimator_sarsa = tile_coding_estimator.Estimator(env)
    statistics_sarsa = plotting.EpisodeStats(
        "sarsa",
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    # Note: For the Mountain Car we don't actually need an epsilon > 0.0
    # because our initial estimate for all states is too "optimistic" which leads
    # to the exploration of all states.
    sarsa_tile_coding.sarsa(env,
                            estimator_sarsa,
                            num_episodes,
                            statistics_sarsa,
                            epsilon=0.0)
    plotting.plot_cost_to_go_mountain_car(env, estimator_sarsa)

    estimator_expected_sarsa = tile_coding_estimator.Estimator(env)
    statistics_expected_sarsa = plotting.EpisodeStats(
        "expected_sarsa",
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    # Note: For the Mountain Car we don't actually need an epsilon > 0.0
    # because our initial estimate for all states is too "optimistic" which leads
    # to the exploration of all states.
    expected_sarsa_tile_coding.expected_sarsa(env,
                                              estimator_expected_sarsa,
                                              num_episodes,
                                              statistics_expected_sarsa,
                                              epsilon=0.0)
    plotting.plot_cost_to_go_mountain_car(env, estimator_expected_sarsa)

    plotting.plot_episode_stats(
        [statistics_q_learning, statistics_sarsa, statistics_expected_sarsa],
        smoothing_window=25)
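
The three methods compared here differ only in how they bootstrap from the next state; a sketch of the three targets, given the next state's action values and an epsilon-greedy behaviour policy (the standalone function and its names are assumptions for illustration):

import numpy as np

def td_targets(reward, q_next, next_action, epsilon, gamma=1.0):
    n_actions = len(q_next)
    probs = np.ones(n_actions) * epsilon / n_actions
    probs[np.argmax(q_next)] += 1.0 - epsilon
    q_learning_target = reward + gamma * np.max(q_next)         # greedy (off-policy) bootstrap
    sarsa_target = reward + gamma * q_next[next_action]         # sampled on-policy action
    expected_sarsa_target = reward + gamma * probs.dot(q_next)  # expectation under the policy
    return q_learning_target, sarsa_target, expected_sarsa_target
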
Example 8
def main():
    estimator = Estimator()
    num_episodes = 5000
    stats = q_learning(env, estimator, num_episodes, epsilon=0.1)

    cum_rwd = save_cum_rwd(stats, smoothing_window=1)

    plotting.plot_episode_stats(stats)
    env.close()
Example 9
def test_approximation_control_method(env):

    episode_stats = [
        test_approximation_control_sarsa(env),
        test_approximation_control_expected_sarsa(env),
        test_approximation_control_q_learning(env)
    ]

    plotting.plot_episode_stats(episode_stats)
    plotting.plot_3d_q_value(env, episode_stats)
def main():
    Q, stats = sarsa(env, 50000)
    # plotting.plot_episode_stats(stats)
    # V = defaultdict(float)

    # for state, actions in Q.items():
    # 	action_value = np.max(actions)
    # 	V[state] = action_value

    # plotting.plot_value_function(V, title="Optimal Value Function")

    plotting.plot_episode_stats(stats)
Example 11
def main():
    global num_of_load
    global num_of_dump
    global num_of_return
    global state
    global old_state
    global old_time
    global Mean_TD_Error
    global Iterations
    global nTrucks

    BucketA_capacity = 1.5
    BucketB_capacity = 1.0
    Truck1_capacity = 6
    Truck2_capacity = 3
    Truck1_speed = 15.0
    Truck2_speed = 20.0
    Truck1_speedRatio = Truck1_speed / (Truck1_speed + Truck2_speed)
    Truck2_speedRatio = Truck2_speed / (Truck1_speed + Truck2_speed)

    #run session (initialise tf global vars)
    sess.run(init)

    num_episodes = 200
    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes),
                                  episode_loss=np.zeros(num_episodes))
    for i_episode in range(num_episodes):
        #reset global vars
        num_of_load = 0
        num_of_dump = 0
        num_of_return = 0
        state = np.zeros(12)
        old_state = np.zeros((nTrucks, 12))
        old_time = np.zeros(nTrucks)
        Mean_TD_Error = 0
        Iterations = 0
        # Print out which episode we're on, useful for debugging.
        if i_episode % 1 == 0:
            print "\rEpisode: ", i_episode + 1, " / ", num_episodes
        #run simulation
        run_sim(nTrucks, BucketA_capacity, BucketB_capacity, Truck1_capacity,
                Truck2_capacity, Truck1_speedRatio, Truck2_speedRatio)
        stats.episode_lengths[i_episode] = Hrs[i_episode]
        stats.episode_rewards[i_episode] = ProdRate[i_episode]
        stats.episode_loss[i_episode] = abs(Mean_TD_Error)
    plotting.plot_episode_stats(stats,
                                name='Linear_Qlearning',
                                smoothing_window=20)
Example 12
def main():
	theta = np.random.normal(size=(400,env.action_space.n))
	num_episodes = 200

	print "Running for Total Episodes", num_episodes

	smoothing_window = 1

	stats_q_sigma_off_policy = Q_Sigma_Off_Policy_2_Step(env, theta, num_episodes, epsilon=0.1)
	rewards_smoothed_stats_q_sigma_off_policy = pd.Series(stats_q_sigma_off_policy.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()	
	cum_rwd = rewards_smoothed_stats_q_sigma_off_policy
	np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Linear_Approximator/raw_results/'  + 'Off_Policy_Q_Sigma_2_step' + '.npy', cum_rwd)
	plotting.plot_episode_stats(stats_q_sigma_off_policy)
	env.close()
def main():

	print "SARSA(lambda)"

	theta = np.zeros(shape=(total_states))

	num_episodes = 5000
	smoothing_window = 1

	stats_sarsa_lambda = sarsa_lambda(env, theta, num_episodes)
	rewards_smoothed_stats_sarsa_lambda = pd.Series(stats_sarsa_lambda.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()	
	cum_rwd = rewards_smoothed_stats_sarsa_lambda
	np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Linear_Approximator/Eligibility_Traces/Accumulating_Traces/Cliff_Walking_Results/'  + 'sarsa_lambda_rbf' + '.npy', cum_rwd)
	plotting.plot_episode_stats(stats_sarsa_lambda)
	env.close()
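
sarsa_lambda here saves its results under an "Accumulating_Traces" directory, which suggests accumulating eligibility traces over the linear weights; a sketch of one such update step, with every symbol an assumption rather than this code's actual variables:

import numpy as np

def sarsa_lambda_step(theta, trace, phi_sa, phi_next_sa, reward, alpha, gamma, lam, done):
    # linear approximation: Q(s, a) = phi(s, a) . theta
    q = phi_sa.dot(theta)
    q_next = 0.0 if done else phi_next_sa.dot(theta)
    delta = reward + gamma * q_next - q       # TD error
    trace = gamma * lam * trace + phi_sa      # accumulating eligibility trace
    theta = theta + alpha * delta * trace     # trace-weighted gradient step
    return theta, trace
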
Example 14
def main():

    tf.reset_default_graph()

    global_step = tf.Variable(0, name="global_step", trainable=False)
    policy_estimator = PolicyEstimator()
    value_estimator = ValueEstimator()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Note, due to randomness in the policy the number of episodes you need to learn a good
        # policy may vary. ~300 seemed to work well for me.
        stats = actor_critic(env, policy_estimator, value_estimator, 300)

    plotting.plot_episode_stats(stats, smoothing_window=25)
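
The actor and critic above are driven by the same one-step TD quantities; a sketch of how these are typically formed (a hedged illustration, not the estimators' actual API):

def actor_critic_targets(reward, value_next, value_current, discount_factor=1.0, done=False):
    # the critic regresses toward td_target; the actor scales its log-prob gradient by td_error
    td_target = reward + (0.0 if done else discount_factor * value_next)
    td_error = td_target - value_current
    return td_target, td_error
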
Example 15
def main():

    print "Q Learning"
    theta = np.random.normal(size=(400, env.action_space.n))
    num_episodes = 2000
    smoothing_window = 200
    stats_q_learning = q_learning(env, theta, num_episodes, epsilon=0.1)
    rewards_smoothed_stats_q_learning = pd.Series(
        stats_q_learning.episode_rewards).rolling(
            smoothing_window, min_periods=smoothing_window).mean()
    cum_rwd = rewards_smoothed_stats_q_learning
    np.save(
        '/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Persistence_Length_Exploration/Results/'
        + 'Trial_Q_Learning' + '.npy', cum_rwd)
    plotting.plot_episode_stats(stats_q_learning)
    env.close()
Example 16
def q_learning_cliff_walking(Q,
                             env,
                             eps=200,
                             epsilon=0.05,
                             alpha=0.5,
                             discount_factor=1.0,
                             timesteps=800):
    nS = env.nS
    nA = env.nA
    episode_reward = []
    episode_length = []
    init_epsilon = epsilon
    for ep in range(eps):
        epsilon -= (init_epsilon * 0.0005)
        epsilon = max(epsilon, 0)
        policy = fn_policy(Q, env, epsilon=epsilon)
        current_state = env.reset()
        current_action = np.random.choice(np.arange(nA),
                                          p=policy[current_state])
        total_reward = 0
        for ts in range(timesteps):
            next_state, reward, done, prob = env.step(current_action)
            total_reward += reward
            next_action = np.random.choice(np.arange(nA), p=policy[next_state])
            next_greedy_action = np.argmax(Q[next_state])
            Q[current_state][
                current_action] = Q[current_state][current_action] + alpha * (
                    reward +
                    (discount_factor * Q[next_state][next_greedy_action]) -
                    Q[current_state][current_action])
            next_action_array = np.ones(nA) * epsilon / nA
            next_action_array[np.argmax(Q[current_state])] += (1 - epsilon)
            policy[current_state] = next_action_array
            if done:
                print(
                    "Episode {} ended after {} timesteps with total reward of {}"
                    .format(ep, ts, total_reward))
                episode_length.append(ts)
                episode_reward.append(total_reward)
                break
            current_state = next_state
            current_action = next_action
    stats = plotting.EpisodeStats(episode_lengths=episode_length,
                                  episode_rewards=episode_reward)
    plotting.plot_episode_stats(stats)
def main():

    tf.reset_default_graph()

    global_step = tf.Variable(0, name="global_step", trainable=False)
    policy_estimator = PolicyEstimator()
    value_estimator = ValueEstimator(env)

    num_episodes = 1000

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        stats = actor_critic(env, policy_estimator, value_estimator,
                             num_episodes)

    plotting.plot_episode_stats(stats, smoothing_window=25)

    env.close()
def main():

	print "True Online Tree Backup (lambda)"

	
	# theta = np.random.normal(size=(400))
	theta = np.zeros(shape=(400, env.action_space.n))

	num_episodes = 1000
	smoothing_window = 1

	stats_sarsa_tb_lambda, cumulative_errors = true_online_tree_backup_lambda(env, theta, num_episodes)
	rewards_smoothed_stats_tb_lambda = pd.Series(stats_sarsa_tb_lambda.episode_rewards).rolling(smoothing_window, min_periods=smoothing_window).mean()	
	cum_rwd = rewards_smoothed_stats_tb_lambda
	cum_err = cumulative_errors
	np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Linear_Approximator/Eligibility_Traces/Accumulating_Traces/WindyGrid_Results/'  + 'True_Online_Tree_Backup_RBF_Cum_Rwd_2' + '.npy', cum_rwd)
	np.save('/Users/Riashat/Documents/PhD_Research/BASIC_ALGORITHMS/My_Implementations/Project_652/Code/Linear_Approximator/Eligibility_Traces/Accumulating_Traces/WindyGrid_Results/'  + 'True_Online_Tree_Backup_RBF_Cum_Err_2' + '.npy', cum_err)
	plotting.plot_episode_stats(stats_sarsa_tb_lambda)
	env.close()
Example 19
def main():

    env = CliffWalkingEnv()
    num_episodes = 500

    tf.reset_default_graph()
    tf.Variable(0, name="global_step", trainable=False)

    policyEstimatorReinforce = policy_estimator.PolicyEstimator(
        env, scope="Policy_Estimator_Reinforce")
    valueEstimatorReinforce = value_estimator.ValueEstimator(
        env, scope="Value_Estimator_Reinforce")
    statistics_reinforce = plotting.EpisodeStats(
        "Reinforce",
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    policyEstimatorAC = policy_estimator.PolicyEstimator(
        env, scope="Policy_Estimator_AC")
    valueEstimatorAC = value_estimator.ValueEstimator(
        env, scope="Value_Estimator_AC")
    statistics_ac = plotting.EpisodeStats(
        "AC",
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        # Note, due to randomness in the policy the number of episodes you need to learn a good
        # policy may vary. ~2000-5000 seemed to work well for me.
        reinforce.reinforce(env,
                            statistics_reinforce,
                            policyEstimatorReinforce,
                            valueEstimatorReinforce,
                            num_episodes,
                            discount_factor=1.0)
        actor_critic.actor_critic(env, statistics_ac, policyEstimatorAC,
                                  valueEstimatorAC, num_episodes)

    plotting.plot_episode_stats([statistics_reinforce, statistics_ac],
                                smoothing_window=25)
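
REINFORCE weights each step's log-probability gradient by the discounted return from that step (minus the value estimator's prediction when it is used as a baseline); a minimal sketch of the return computation it relies on (function name is hypothetical):

def discounted_returns(rewards, discount_factor=1.0):
    returns, g = [], 0.0
    for r in reversed(rewards):
        g = r + discount_factor * g
        returns.append(g)
    return list(reversed(returns))
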
Example 20
def main():  
    parser = argparse.ArgumentParser(description='Run Reinforcement Learning at an Office in Tsinghua University')
    parser.add_argument('--env', default='band_control-v0', help='Environment name')
    parser.add_argument('-o', '--output', default='office-QN-Rh', help='Directory to save data to')
    parser.add_argument('--num', default=500, help='Number of Episodes')
    parser.add_argument('--gamma', default=0.95, help='Discount Factor')
    parser.add_argument('--alpha', default=0.5, help='Constant step-size parameter')
    parser.add_argument('--epsilon', default=0.05, help='Epsilon greedy policy')
    parser.add_argument('--epsilon_min', default=0.05, help='Minimum value epsilon can decay to')
    parser.add_argument('--epsilon_decay', default=0.9, help='Epsilon decay factor')
    parser.add_argument('--batch_size', default=32, help='Sampling batch size')
    parser.add_argument('--lr', default=0.001, help='Learning rate')


    args = parser.parse_args()

    output = get_output_folder(args.output, args.env)

  

    #create environment
    print(args.env)
    env = gym.make(args.env)

    ################# tabular Q learning ##########
    #### change the environment to use _process_state_table before running this
    # Q, stats = QL.q_learning(env, int(args.num), float(args.gamma), float(args.alpha), float(args.epsilon), 
    #      float(args.epsilon_min),  float(args.epsilon_decay), output)
    # plotting.plot_episode_stats(stats, smoothing_window=1)

    # print(Q)

    ############### Q learning with Neural network approximation and fixed target ################
    #### change the environment to use _process_state_DDQN before running this

    state_size = env.nS
    action_size = env.nA
    agent = QN.QNAgent(state_size, action_size, float(args.gamma), float(args.lr))
    stats = QN.q_learning(env, agent, int(args.num), int(args.batch_size),
        float(args.epsilon), float(args.epsilon_min), float(args.epsilon_decay), output)
    plotting.plot_episode_stats(stats, smoothing_window=1)
def main():
    estimator = Estimator()
    number_of_episodes = 1000
    print('Two Step Sarsa')
    stats_sarsa_2_step = sarsa_2_step_TD(env,
                                         estimator,
                                         number_of_episodes,
                                         discount_factor=1.0,
                                         epsilon=0.015,
                                         epsilon_decay=1.0)
    plotting.plot_episode_stats(stats_sarsa_2_step, smoothing_window=25)
    print('Two Step Q Learning')
    stats_Q = Q_learning_2_step_TD(env,
                                   estimator,
                                   number_of_episodes,
                                   discount_factor=1,
                                   epsilon=0.015,
                                   epsilon_decay=1.0)
    print('Two Step Tree Backup')
    stats_tree = two_step_tree_backup(env,
                                      estimator,
                                      number_of_episodes,
                                      discount_factor=1.0,
                                      epsilon=0.1)
    print('SARSA')
    stats_sarsa = sarsa(env,
                        estimator,
                        number_of_episodes,
                        discount_factor=1.0,
                        epsilon=0.015,
                        epsilon_decay=1.0)
    print('Expected SARSA')
    stats_expected_sarsa = expected_sarsa(env,
                                          estimator,
                                          number_of_episodes,
                                          discount_factor=1.0,
                                          epsilon=0.015,
                                          epsilon_decay=1.0)
    plot_episode_stats(stats_sarsa_2_step, stats_Q, stats_tree, stats_sarsa,
                       stats_expected_sarsa)
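
The two-step methods above extend the one-step targets by one extra real reward before bootstrapping; a sketch of the two-step SARSA target (symbols are assumptions), where q_next2 is Q(s_{t+2}, a_{t+2}):

def two_step_sarsa_target(r_t, r_t1, q_next2, gamma=1.0):
    return r_t + gamma * r_t1 + gamma ** 2 * q_next2

# Two-step Q-learning bootstraps with max_a Q(s_{t+2}, a) instead of the sampled value,
# and two-step tree backup replaces sampled values with expectations under the policy.
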
Example 22
def sarsa_WindyGridWorld(Q,
                         env,
                         eps=200,
                         discount_factor=1.0,
                         alpha=0.5,
                         epsilon=0.05):
    nS = env.nS
    nA = env.nA
    episode_lengths = []
    episode_rewards = []
    for ep in range(eps):
        timesteps = 700
        current_state = env.reset()
        policy = fn_policy(env, Q, epsilon=epsilon)
        action_arr = policy[current_state]
        action = np.random.choice(np.arange(nA), p=action_arr)
        total_reward = 0
        for ts in range(timesteps):
            next_state, reward, done, prob = env.step(action)
            total_reward += reward
            next_action_arr = policy[next_state]
            next_action = np.random.choice(np.arange(nA), p=next_action_arr)
            Q[current_state][action] = Q[current_state][action] + alpha * (
                reward + (discount_factor * Q[next_state][next_action]) -
                Q[current_state][action])
            act_arr = np.ones(nA) * epsilon / nA
            act_arr[np.argmax(Q[current_state])] += (1 - epsilon)
            policy[current_state] = act_arr
            if done:
                print("Episode {} ended after {} timesteps".format(ep, ts))
                episode_lengths.append(ts)
                episode_rewards.append(total_reward)
                break
            current_state = next_state
            action = next_action

    stats = plotting.EpisodeStats(episode_lengths=np.array(episode_lengths),
                                  episode_rewards=np.array(episode_rewards))
    plotting.plot_episode_stats(stats)
def main():
    Q, stats = q_learning(env, 300)
    plotting.plot_episode_stats(stats)
Example 24
import sys

if "../" not in sys.path:
    sys.path.append("../")

from lib.envs.cliff_walking import CliffWalkingEnv
from lib import plotting
from agents import QLearningAgent
import numpy as np

env_shape = (4, 12)
start_position = (3, 0)
end_positions = [(3, 11)]
cliff = tuple((3, i + 1) for i in range(10))

env = CliffWalkingEnv(env_shape, start_position, end_positions, cliff)
n_actions = env.action_space.n
agent = QLearningAgent(alpha=0.5,
                       epsilon=0.1,
                       discount=0.99,
                       n_actions=n_actions)

agent.train(env,
            n_episodes=1000,
            t_max=10**3,
            verbose=True,
            verbose_per_episode=500)

plotting.draw_policy(env, agent)
plotting.plot_episode_stats(agent)
                if update_time >= 0:
                    action_state_update_time = env_list[update_time][1]
                    evaluated_state_index = update_time + self.n - 1
                    if evaluated_state_index < len(states):
                        state_update_time = states[evaluated_state_index]
                        action_state_update_time.update(
                            0,
                            state_update_time.get_actions(),
                            time_step=update_time)
                    else:
                        action_state_update_time.update(0,
                                                        None,
                                                        time_step=update_time)
                if update_time == T - 1:
                    a_ss = [a_s for _, a_s in env_list]
                    for a_s in a_ss:
                        a_s.clear_reward_calculator()
                    break
        return stats


if __name__ == '__main__':
    q_learning = NStepSarsa(CliffWalkingEnv(), 1)
    stats = q_learning.run(200, get_learning_rate=lambda x1, x2: 1)
    plotting.plot_episode_stats(stats)
    q_learning.show_one_episode()
    # q_learning = NStepSarsa(WindyGridworldEnv(), 8)
    # stats = q_learning.run(50000)
    # plotting.plot_episode_stats(stats)
    # q_learning.show_one_episode()
            # next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)
            # td_target = reward + discount_factor * q_values_next[next_action]

            # Update the function approximator using our target
            estimator.update(state, action, td_target)
            #plt.figure()
            plt.clf()
            plt.imshow(env.render(mode='rgb_array'))

            print("\rStep {} @ Episode {}/{} ({})".format(
                t, i_episode + 1, num_episodes, last_reward)),
            sys.stdout.flush()

            if done:
                break

            state = next_state

    return stats


estimator = Estimator()

# Note: For the Mountain Car we don't actually need an epsilon > 0.0
# because our initial estimate for all states is too "optimistic" which leads
# to the exploration of all states.
stats = q_learning(env, estimator, 100, epsilon=0.0)

plotting.plot_cost_to_go_mountain_car(env, estimator)
plotting.plot_episode_stats(stats, smoothing_window=25)
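
On why epsilon can be 0.0 here: MountainCar returns a reward of -1 every step, so any state-action value the estimator has actually been trained on drifts negative, while untried estimates stay near the "optimistic" initial value of roughly zero and win the greedy argmax. A toy illustration of that effect:

import numpy as np

q = np.zeros(3)                # optimistic initial estimates for 3 actions
q[0] += 0.5 * (-1.0 - q[0])    # one update after trying action 0 with reward -1
print(np.argmax(q))            # the greedy choice now moves on to an untried action -> 1
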
Example 27
                                    target_estimator=target_estimator,
                                    state_processor=state_processor,
                                    experiment_dir=experiment_dir,
                                    num_episodes=3000,
                                    replay_memory_size=200000,
                                    replay_memory_init_size=20000,
                                    update_target_estimator_every=10000,
                                    epsilon_start=1,
                                    epsilon_end=0.1,
                                    epsilon_decay_steps=500000,
                                    discount_factor=0.99,
                                    batch_size=32,
                                    record_video_every=50):
        episode_reward_array.append(stats.episode_rewards[-1])
        if num_episode % 50 == 0:
            fig1, fig2, fig3, fig4 = plotting.plot_episode_stats(stats, smoothing_window=10, noshow=True)
            fig1.savefig(experiment_dir + '/episode_length.jpg')
            fig2.savefig(experiment_dir + '/reward.jpg')
            fig3.savefig(experiment_dir + '/episode_per_t.jpg')
            fig4.savefig(experiment_dir + '/episode_reward.jpg')
            np.savetxt(experiment_dir + '/episode_reward.txt', episode_reward_array, newline=" ")
        num_episode += 1
        avg_50_reward.append(np.average(stats.episode_rewards[max(0, num_episode - 50):]))
        print("\nEpisode Reward: {} , Last 50 average: {}".format(stats.episode_rewards[-1], np.average(
            stats.episode_rewards[max(0, num_episode - 50):])))




Example 28
 def plot(self, stats):
     plotting.plot_episode_stats(stats)
import sys
from collections import defaultdict

import matplotlib
import numpy as np

sys.path.append('/home/ornot/GymRL')
from algorithm import mc_online_policy_control, q_learning, sarsa, expected_sarsa
from env import windy_gridworld
from lib import plotting

matplotlib.style.use('ggplot')

env = windy_gridworld.WindyGridworldEnv()

num_episodes = 200

# TD on-policy (SARSA)
statistics_sara = plotting.EpisodeStats("sara",
                                        episode_lengths=np.zeros(num_episodes),
                                        episode_rewards=np.zeros(num_episodes))
Q_sara = sarsa.sarsa(env, num_episodes, statistics_sara)

# TD off-policy (Q-learning)
statistics_q_learning = plotting.EpisodeStats(
    "q_learning",
    episode_lengths=np.zeros(num_episodes),
    episode_rewards=np.zeros(num_episodes))
Q_learning = q_learning.q_learning(env, num_episodes, statistics_q_learning)

plotting.plot_episode_stats([statistics_sarsa, statistics_q_learning])
def main():
    Q, stats = sarsa(env, 500)
    plotting.plot_episode_stats(stats)
            # update current state
            state = new_state
            
            if terminated:
                break
    
    return stats




estimator = Estimator()




# Note: For the Mountain Car we don't actually need an epsilon > 0.0
# because our initial estimate for all states is too "optimistic" which leads
# to the exploration of all states.
stats = q_learning(env, estimator, 100, epsilon=0.0)




plotting.plot_cost_to_go_mountain_car(env, estimator)
plotting.plot_episode_stats(stats, smoothing_window=25)