def compareMethods():
    import gym
    import numpy as np
    import matplotlib.pyplot as plt

    env = gym.make('GridWorld-v0')
    policy = EGreedyPolicyVFA(0.1)
    VFA = LinearVFA()
    feature = Featurize()

    training_episodes = 1000
    n_plot_points = 100
    eps_benchmark = 100
    fixedHorizon = 20

    # Initialize agents
    alpha1 = 0.4
    agent1 = LeastSquaresTD(env, policy, VFA, feature, alpha1, horizon=fixedHorizon)
    alpha2 = 0.4
    lamda2 = 0.8
    agent2 = LSTDlamda(env, policy, VFA, feature, alpha2, lamda2, horizon=fixedHorizon)
    alpha3 = 0.4
    agent3 = LSTDQ(env, policy, VFA, feature, alpha3, horizon=fixedHorizon)
    alpha4 = 0.4
    agent4 = LSPITD(env, policy, VFA, feature, alpha4, horizon=fixedHorizon)
    agents = [agent1, agent2, agent3, agent4]

    eps_per_point = int(training_episodes / n_plot_points)
    benchmark_data = np.zeros((4, n_plot_points))

    # Benchmark agents without training
    for agent_i in range(4):
        benchmark_data[agent_i][0] = agents[agent_i].benchmark(eps_benchmark)

    # Train and benchmark agents
    for point_i in range(1, n_plot_points):
        for agent_i in range(4):
            print('Agent ' + str(agent_i) + ', Episode ' + str((point_i + 1) * eps_per_point))
            agents[agent_i].train(eps_per_point)
            benchmark_data[agent_i][point_i] = agents[agent_i].benchmark(eps_benchmark)

    # Plot results
    plt.figure(figsize=(12, 10))
    xaxis = [eps_per_point * (i + 1) for i in range(n_plot_points)]
    title1 = 'LSTD(0), a = ' + str(alpha1)
    title2 = 'LSTD(lamda), a = ' + str(alpha2) + ', l = ' + str(lamda2)
    title3 = 'LSTDQ, a = ' + str(alpha3)
    title4 = 'LSPITD, a = ' + str(alpha4)
    titles = [title1, title2, title3, title4]
    for i in range(4):
        plt.subplot(221 + i)
        plt.plot(xaxis, benchmark_data[i])
        plt.xlabel('Training episodes')
        plt.ylabel('Average reward per episode')
        plt.title(titles[i])
    plt.show()
class AdvanAC(AgentQAC):
    def setUpCritic(self, featDim, nS, nA):
        self.VFAcritic.setUpWeights(featDim)  # Initialize weights of the critic VFA
        # In order to compute the advantage function it is necessary to keep a
        # second set of weights, so that both Q(s,a) and V(s) are approximated.
        # The VFA chosen here is a linear combination of features.
        self.VFAstateval = LinearVFA()
        self.VFAstateval.setUpWeights((self.nS, 1))
        self.kappa = self.beta * 0.4  # Step size parameter

    def step(self, state, action):
        # Take A, observe R and S'
        state_prime, reward, done, info = self.env.step(action)
        # Choose A' using a policy derived from S'
        action_prime = self.policy.getAction(self.featurize, state_prime)

        if self.learn:
            # Compute the pertinent feature vectors
            features = self.featurize.featureStateAction(state, action)
            features_prime = self.featurize.featureStateAction(state_prime, action_prime)
            features_state = self.featurize.featureState(state)
            features_stateprime = self.featurize.featureState(state_prime)

            # Compute the value of the features via value function approximation
            value = self.VFAcritic.getValue(features)
            value_prime = self.VFAcritic.getValue(features_prime)
            value_state = self.VFAstateval.getValue(features_state)
            value_stateprime = self.VFAstateval.getValue(features_stateprime)

            delta_q = reward + self.gamma * value_prime - value
            delta_v = reward + self.gamma * value_stateprime - value_state

            # Actor update
            advantage = value - value_state
            gradient = self.policy.getGradient(self.featurize, state, action)
            delta_theta = self.alpha * gradient * advantage
            self.policy.updateWeightsDelta(delta_theta)

            # Critic update
            delta_weight = self.beta * delta_q * features
            self.VFAcritic.updateWeightsDelta(delta_weight)

            # State value function update
            delta_stateWeight = self.kappa * delta_v * features_state
            self.VFAstateval.updateWeightsDelta(delta_stateWeight)

        return state_prime, action_prime, reward, done
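
# A minimal, self-contained sketch of the update rule implemented in
# AdvanAC.step above, using plain numpy. The feature vectors, step sizes and
# the stand-in policy score gradient below are illustrative assumptions,
# not part of this repo's API.
def _advantage_update_sketch():
    import numpy as np
    # Toy linear critics: Q(s,a) ~ w_q . phi(s,a) and V(s) ~ w_v . phi(s)
    phi_sa, phi_sa_prime = np.array([1.0, 0.0]), np.array([0.0, 1.0])
    phi_s, phi_s_prime = np.array([1.0]), np.array([0.5])
    w_q, w_v, theta = np.zeros(2), np.zeros(1), np.zeros(2)
    alpha, beta, kappa, gamma, reward = 0.2, 0.1, 0.04, 1.0, 1.0

    q, q_prime = w_q @ phi_sa, w_q @ phi_sa_prime
    v, v_prime = w_v @ phi_s, w_v @ phi_s_prime
    delta_q = reward + gamma * q_prime - q   # TD error for Q(s,a)
    delta_v = reward + gamma * v_prime - v   # TD error for V(s)
    advantage = q - v                        # A(s,a) = Q(s,a) - V(s)

    score = phi_sa                           # stand-in for policy.getGradient(...)
    theta += alpha * score * advantage       # actor update
    w_q += beta * delta_q * phi_sa           # critic (Q) update
    w_v += kappa * delta_v * phi_s           # state-value (V) update
    return theta, w_q, w_v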
def compareMethods():
    import gym
    import numpy as np
    import matplotlib.pyplot as plt
    import gym_gridworlds

    env = gym.make('Gridworld-v0')
    epsilon = 0.1
    policyVFA = EGreedyPolicyVFA(epsilon)
    VFA = LinearVFA()
    feature = Featurize()

    init_train_model = 0  # No previous knowledge about the model
    H = 20
    training_episodes = 200
    n_plot_points = 100
    eps_benchmark = 100

    # Initialize agents
    alpha1 = 0.4
    plan1 = 20
    agent1 = DynaQ(env, policyVFA, VFA, feature, init_train_model, plan1,
                   alpha1, horizon=H)
    alpha4 = 0.4
    beta4 = 0.4
    plan4 = 20
    agent4 = Dyna2(env, policyVFA, LinearVFA(), VFA, feature, init_train_model,
                   plan4, alpha4, beta4, horizon=H)
    agent4.model.addTerminalStates([0, 15])
    agents = [agent1, agent4]

    eps_per_point = int(training_episodes / n_plot_points)
    benchmark_data = np.zeros((2, n_plot_points))

    # Benchmark agents without training
    for agent_i in range(2):
        benchmark_data[agent_i][0] = agents[agent_i].benchmark(eps_benchmark)

    # Train and benchmark agents, replotting after every point
    plt.figure(figsize=(10, 5))
    plt.ion()
    for point_i in range(1, n_plot_points):
        print('DynaQ, Episode ' + str((point_i + 1) * eps_per_point))
        agents[0].train(eps_per_point)
        benchmark_data[0][point_i] = agents[0].benchmark(eps_benchmark)
        print('Dyna2, Episode ' + str((point_i + 1) * eps_per_point))
        agents[1].train(eps_per_point)
        benchmark_data[1][point_i] = agents[1].benchmark(eps_benchmark)

        # Plot results
        plt.clf()  # Clear the previous plot
        xaxis = [eps_per_point * (i + 1) for i in range(n_plot_points)]
        titles = ['DynaQ', 'Dyna2']
        for i in range(2):
            plt.subplot(121 + i)
            plt.plot(xaxis, benchmark_data[i])
            plt.xlabel('Training episodes')
            plt.ylabel('Average reward per episode')
            plt.title(titles[i])
        plt.pause(0.001)  # Pause briefly so the figure has time to redraw
    plt.ioff()  # Turn off interactive plotting
    plt.show()
class Dyna2(Agent):
    def __init__(self, env, policy, VFAshort, VFAlong, featurize, train_eps,
                 planning, alpha, beta, gamma=1, horizon=100, verbosity=0):
        # Inputs:
        # -env: openAI gym environment object
        # -policy: object containing a policy from which to sample actions
        # -VFAshort: object containing the value function approximator for the
        #    short-term memory
        # -VFAlong: object containing the value function approximator for the
        #    long-term memory
        # -featurize: object which featurizes states
        # -train_eps: number of random episodes used to generate experience to
        #    train the model initially
        # -planning: number of planning steps
        # -alpha: step size parameter for the long-term memory value function update
        # -beta: step size parameter for the short-term memory value function update
        # -gamma: reward discount-rate parameter
        # -horizon: finite horizon steps
        # -verbosity: if TRUE, prints additional information to screen
        self.env = env
        self.policy = policy
        self.VFAshort = VFAshort
        self.VFAlong = VFAlong
        self.featurize = featurize
        self.planning = planning
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.horizon = horizon
        self.verbosity = verbosity

        self.nS = env.observation_space.n  # Number of states
        self.nA = env.action_space.n  # Number of actions
        self.policy.setNActions(self.nA)
        self.featurize.set_nSnA(self.nS, self.nA)
        # Dimensions of the feature vector
        self.featDim = featurize.featureStateAction(0, 0).shape
        # Initialize the weights of the short- and long-term memory VFAs
        self.VFAshort.setUpWeights(self.featDim)
        self.VFAlong.setUpWeights(self.featDim)

        # Q(s,a) is approximated through linear value function approximation,
        # with weights equal to the sum of the weights of the short- and
        # long-term memory VFAs.
        self.QVFA = LinearVFA()
        self.updateQ()  # Initialize QVFA

        # Initially prevent the agent from learning
        self.learn = 0

        # Initialize the model as a table lookup model
        self.model = TableLookupModel(self.nS, self.nA)
        self.model_learn = 0

        # Uncomment for previous random exploration in order to improve the initial model
        # self.trainModel(train_eps)

    def trainModel(self, train_eps):
        self.model_learn = 1  # Model will be learnt
        self.preventlearn()  # Value function will not be learnt
        self.runEpisodes(train_eps)
        self.model_learn = 0

    def updateQ(self):
        weights_short = self.VFAshort.getWeights()
        weights_long = self.VFAlong.getWeights()
        # Assuming that both VFAs use the same featurize function
        Qweights = weights_long + weights_short
        self.QVFA.setWeights(Qweights)

    # Computes a single episode.
    # Returns the episode return.
    def episode(self):
        episodeReward = 0

        # Clear short-term memory: re-initialize the weights of its VFA
        self.VFAshort.setUpWeights(self.featDim)

        state = self.env.reset()  # Initialize S
        if self.learn:
            self.search(state)  # Search in order to update short-term memory
        self.updateQ()  # Take into account the previous search in the Q VFA
        # Pick A
        action = self.policy.getAction(self.QVFA, self.featurize, state)

        # Repeat for each step of the episode
        for t in range(self.horizon):
            # Take action A, observe R, S'
            state, action, reward, done = self.step(state, action)
            # Update the total episode return
            episodeReward += reward
            # Finish the loop if S' is a terminal state
            if done:
                break

        # Update the policy parameters if the agent is learning
        if self.learn:
            self.policy.episodeUpdate()
        return episodeReward

    def search(self, state):
        for ep in range(self.planning):
            s = state  # Initialize S
            self.updateQ()
            a = self.policy.getAction(self.QVFA, self.featurize, s)  # Pick A
            for k in range(self.horizon):
                s_prime = self.model.sampleStatePrime(s, a)  # Get expected S'
                r = self.model.sampleReward(s, a)  # Get expected R
                self.updateQ()  # Update QVFA
                # Pick A' using QVFA and S'
                a_prime = self.policy.getAction(self.QVFA, self.featurize, s_prime)
                # Update short-term memory weights
                self.TDupdateShort(s, a, r, s_prime, a_prime)
                # Finish the simulated episode if S' is terminal
                if self.model.isTerminal(s_prime):
                    break
                s = s_prime
                a = a_prime

    def step(self, state, action):
        # Take A, observe R and S'
        state_prime, reward, done, info = self.env.step(action)

        # Update the model with the new experience
        if self.learn or self.model_learn:
            experience = (state, action, reward, state_prime)
            self.model.addExperience(experience)

        self.search(state_prime)  # Search tree
        # Pick A'
        action_prime = self.policy.getAction(self.QVFA, self.featurize, state_prime)

        # Update long-term weights
        if self.learn:
            self.TDupdateLong(state, action, reward, state_prime, action_prime)

        return state_prime, action_prime, reward, done

    def getValueMemory(self, features):
        value_short = self.VFAshort.getValue(features)  # Short-term memory value
        value_long = self.VFAlong.getValue(features)  # Long-term memory value
        # The memory value is considered as the sum of short- and long-term memory
        return value_short + value_long

    def TDupdateShort(self, state, action, reward, state_prime, action_prime):
        # Compute the pertinent feature vectors
        features = self.featurize.featureStateAction(state, action)
        features_prime = self.featurize.featureStateAction(state_prime, action_prime)
        # Compute the value of the features via function approximation
        value = self.getValueMemory(features)
        value_prime = self.getValueMemory(features_prime)
        # Obtain delta weight
        delta_w = (self.beta * (reward + self.gamma * value_prime - value)
                   * self.VFAshort.getGradient(features))
        self.VFAshort.updateWeightsDelta(delta_w)

    def TDupdateLong(self, state, action, reward, state_prime, action_prime):
        # Compute the pertinent feature vectors
        features = self.featurize.featureStateAction(state, action)
        features_prime = self.featurize.featureStateAction(state_prime, action_prime)
        # Compute the value of the features via function approximation
        value = self.VFAlong.getValue(features)
        value_prime = self.VFAlong.getValue(features_prime)
        # Obtain delta weight
        delta_w = (self.alpha * (reward + self.gamma * value_prime - value)
                   * self.VFAlong.getGradient(features))
        self.VFAlong.updateWeightsDelta(delta_w)
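
# The core idea of Dyna2 is that the acting value function sums the two
# memories: Q(s,a) ~ (w_long + w_short) . phi(s,a). Below is a minimal numpy
# sketch of the identity that updateQ and getValueMemory rely on; the feature
# vector and weights are made-up toy values, not values from this repo.
def _dyna2_memory_sketch():
    import numpy as np
    phi = np.array([1.0, 0.0, 1.0])       # toy feature vector for some (s, a)
    w_long = np.array([0.5, 0.2, 0.1])    # long-term memory weights
    w_short = np.array([0.1, 0.0, -0.3])  # short-term memory weights (cleared each episode)
    q_weights = w_long + w_short          # what updateQ() stores in the Q VFA
    value = q_weights @ phi               # same quantity getValueMemory(phi) returns
    assert np.isclose(value, w_long @ phi + w_short @ phi)
    return value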
def compareMethods():
    import gym
    import numpy as np
    import matplotlib.pyplot as plt

    env = gym.make('GridWorld-v0')
    policy = SoftmaxPolicyVFA(1)
    feature = Featurize()

    training_episodes = 1000
    n_plot_points = 100
    eps_benchmark = 100
    fixedHorizon = 20

    # Initialize agents
    alpha1 = 0.2
    beta1 = 0.1
    agent1 = QAC(env, policy, LinearVFA(), feature, alpha1, beta1, horizon=fixedHorizon)
    alpha2 = 0.2
    beta2 = 0.1
    agent2 = AdvanAC(env, policy, LinearVFA(), feature, alpha2, beta2, horizon=fixedHorizon)
    alpha3 = 0.2
    beta3 = 0.1
    agent3 = TDAC(env, policy, LinearVFA(), feature, alpha3, beta3, horizon=fixedHorizon)
    alpha4 = 0.2
    beta4 = 0.1
    lamda4 = 0.4
    agent4 = TDlamdaAC(env, policy, LinearVFA(), feature, alpha4, beta4, lamda4, horizon=fixedHorizon)
    alpha5 = 0.2
    beta5 = 0.1
    agent5 = NaturalAC(env, policy, LinearVFA(), feature, alpha5, beta5, horizon=fixedHorizon)
    agents = [agent1, agent2, agent3, agent4, agent5]

    eps_per_point = int(training_episodes / n_plot_points)
    benchmark_data = np.zeros((5, n_plot_points))

    # Benchmark agents without training
    for agent_i in range(5):
        benchmark_data[agent_i][0] = agents[agent_i].benchmark(eps_benchmark)

    # Train and benchmark agents
    for point_i in range(1, n_plot_points):
        for agent_i in range(5):
            print('Agent ' + str(agent_i + 1) + ', Episode ' + str((point_i + 1) * eps_per_point))
            agents[agent_i].train(eps_per_point)
            benchmark_data[agent_i][point_i] = agents[agent_i].benchmark(eps_benchmark)

    # Plot results
    plt.figure(figsize=(16, 10))
    xaxis = [eps_per_point * (i + 1) for i in range(n_plot_points)]
    title1 = 'QAC, a = ' + str(alpha1) + ', b = ' + str(beta1)
    title2 = 'Advantage AC, a = ' + str(alpha2) + ', b = ' + str(beta2)
    title3 = 'TDAC, a = ' + str(alpha3) + ', b = ' + str(beta3)
    title4 = 'TD(lamda)AC, a = ' + str(alpha4) + ', b = ' + str(beta4) + ', l = ' + str(lamda4)
    title5 = 'Natural AC, a = ' + str(alpha5) + ', b = ' + str(beta5)
    titles = [title1, title2, title3, title4, title5]
    for i in range(5):
        plt.subplot(231 + i)
        plt.plot(xaxis, benchmark_data[i])
        plt.xlabel('Training episodes')
        plt.ylabel('Average reward per episode')
        plt.title(titles[i])
    plt.show()
def compareMethods():
    import gym
    import numpy as np
    import matplotlib.pyplot as plt
    import gym_gridworlds

    env = gym.make('Gridworld-v0')
    epsilon = 0.1
    policyVFA = EGreedyPolicyVFA(epsilon)
    policyTab = EGreedyPolicyTabular(epsilon)
    VFA = LinearVFA()
    feature = Featurize()

    init_train_model = 0  # No previous knowledge about the model
    H = 20
    training_episodes = 200
    n_plot_points = 100
    eps_benchmark = 100

    # Initialize agents
    alpha1 = 0.4
    plan1 = 20
    agent1 = DynaQ(env, policyVFA, VFA, feature, init_train_model, plan1,
                   alpha1, horizon=H)
    alpha2 = 0.4
    plan2 = 20
    agent2 = MCTreeSearch(env, policyTab, init_train_model, plan2, alpha2, horizon=H)
    alpha3 = 0.4
    plan3 = 20
    agent3 = TDTreeSearch(env, policyVFA, VFA, feature, init_train_model, plan3,
                          alpha3, horizon=H)
    agent3.model.addTerminalStates([0, 15])
    alpha4 = 0.4
    beta4 = 0.2
    plan4 = 20
    agent4 = Dyna2(env, policyVFA, LinearVFA(), VFA, feature, init_train_model,
                   plan4, alpha4, beta4, horizon=H)
    agent4.model.addTerminalStates([0, 15])
    agents = [agent1, agent2, agent3, agent4]

    eps_per_point = int(training_episodes / n_plot_points)
    benchmark_data = np.zeros((4, n_plot_points))

    # Benchmark agents without training
    for agent_i in range(4):
        benchmark_data[agent_i][0] = agents[agent_i].benchmark(eps_benchmark)

    # Train and benchmark agents
    for point_i in range(1, n_plot_points):
        for agent_i in range(4):
            print('Agent ' + str(agent_i) + ', Episode ' + str((point_i + 1) * eps_per_point))
            agents[agent_i].train(eps_per_point)
            benchmark_data[agent_i][point_i] = agents[agent_i].benchmark(eps_benchmark)

    # Plot results
    plt.figure(figsize=(16, 10))
    xaxis = [eps_per_point * (i + 1) for i in range(n_plot_points)]
    title1 = 'DynaQ, n = ' + str(plan1) + ', a = ' + str(alpha1)
    title2 = 'MCTS, n = ' + str(plan2) + ', a = ' + str(alpha2)
    title3 = 'TDTS, n = ' + str(plan3) + ', a = ' + str(alpha3)
    title4 = 'Dyna2, n = ' + str(plan4) + ', a = ' + str(alpha4) + ', b = ' + str(beta4)
    titles = [title1, title2, title3, title4]
    for i in range(4):
        plt.subplot(221 + i)
        plt.plot(xaxis, benchmark_data[i])
        plt.xlabel('Training episodes')
        plt.ylabel('Average reward per episode')
        plt.title(titles[i])
    plt.show()
def compareMethods():
    import gym
    import numpy as np
    import matplotlib.pyplot as plt

    env = gym.make('GridWorld-v0')
    policy = EGreedyPolicyVFA(0.1)
    VFA = LinearVFA()
    feature = Featurize()

    training_episodes = 400
    n_plot_points = 100
    eps_benchmark = 100

    # Initialize agents
    alpha1 = 0.4
    agent1 = TD(env, policy, VFA, feature, alpha1, horizon=20)
    alpha2 = 0.4
    lamda2 = 0.8
    agent2 = TDlamda(env, policy, VFA, feature, alpha2, lamda2, horizon=20)
    alpha3 = 0.4
    beta3 = 0.2
    agent3 = GradientTD2(env, policy, VFA, feature, alpha3, beta=beta3, horizon=20)
    alpha4 = 0.4
    agent4 = GradientQlearning(env, policy, VFA, feature, alpha4, horizon=20)
    alpha5 = 0.4
    beta5 = 0.2
    agent5 = RLSTD(env, policy, VFA, feature, alpha5, beta=beta5, horizon=20)
    agents = [agent1, agent2, agent3, agent4, agent5]

    eps_per_point = int(training_episodes / n_plot_points)
    benchmark_data = np.zeros((5, n_plot_points))

    # Benchmark agents without training
    for agent_i in range(5):
        benchmark_data[agent_i][0] = agents[agent_i].benchmark(eps_benchmark)

    # Train and benchmark agents
    for point_i in range(1, n_plot_points):
        for agent_i in range(5):
            print('Agent ' + str(agent_i) + ', Episode ' + str((point_i + 1) * eps_per_point))
            agents[agent_i].train(eps_per_point)
            benchmark_data[agent_i][point_i] = agents[agent_i].benchmark(eps_benchmark)

    # Plot results
    plt.figure(figsize=(16, 10))
    xaxis = [eps_per_point * (i + 1) for i in range(n_plot_points)]
    title1 = 'VFA TD, a = ' + str(alpha1)
    title2 = 'VFA TD(lamda), a = ' + str(alpha2) + ', l = ' + str(lamda2)
    title3 = 'GTD2, a = ' + str(alpha3) + ', b = ' + str(beta3)
    title4 = 'Gradient Q, a = ' + str(alpha4)
    title5 = 'RLSTD, a = ' + str(alpha5) + ', b = ' + str(beta5)
    titles = [title1, title2, title3, title4, title5]
    for i in range(5):
        plt.subplot(231 + i)
        plt.plot(xaxis, benchmark_data[i])
        plt.xlabel('Training episodes')
        plt.ylabel('Average reward per episode')
        plt.title(titles[i])
    plt.show()