Example #1
    def setUpCritic(self, featDim, nS, nA):
        self.VFAcritic.setUpWeights(featDim) # Initialize the critic's VFA weights

        # Computing the advantage function requires a second set of weights,
        # so that both Q(s,a) and V(s) can be approximated.
        # The VFA chosen here is a linear combination of features.
        self.VFAstateval = LinearVFA()
        self.VFAstateval.setUpWeights((self.nS, 1))
        self.kappa = self.beta * 0.4 # Step size parameter
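The comment above is the key idea: with two linear approximators, one for Q(s,a) and one for V(s), the advantage estimate reduces to a difference of two dot products. The sketch below only illustrates that arithmetic; the names advantage, w_q, w_v, phi_sa and phi_s are not part of this codebase.

import numpy as np

def advantage(w_q, w_v, phi_sa, phi_s):
    # A(s, a) = Q(s, a) - V(s), with both values estimated by linear VFAs:
    # Q(s, a) ~ w_q . phi(s, a) and V(s) ~ w_v . phi(s)
    q_hat = float(np.dot(w_q.ravel(), phi_sa.ravel()))
    v_hat = float(np.dot(w_v.ravel(), phi_s.ravel()))
    return q_hat - v_hat

# Tiny usage example with made-up dimensions (indicator features)
w_q, w_v = np.zeros(8), np.zeros(4)
phi_sa, phi_s = np.eye(8)[3], np.eye(4)[1]
print(advantage(w_q, w_v, phi_sa, phi_s))  # 0.0 before any learning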
Example #2
def compareMethods():
    import gym
    import numpy as np
    import matplotlib.pyplot as plt

    env = gym.make('GridWorld-v0')
    policy = EGreedyPolicyVFA(0.1)
    VFA = LinearVFA()
    feature = Featurize()

    training_episodes = 1000
    n_plot_points = 100
    eps_benchmark = 100
    fixedHorizon = 20

    # Initialize agents
    alpha1 = 0.4
    agent1 = LeastSquaresTD(env, policy, VFA, feature, alpha1, horizon = fixedHorizon)

    alpha2 = 0.4
    lamda2 = 0.8
    agent2 = LSTDlamda(env, policy, VFA, feature, alpha2, lamda2, horizon = fixedHorizon)

    alpha3 = 0.4
    agent3 = LSTDQ(env, policy, VFA, feature, alpha3, horizon = fixedHorizon)

    alpha4 = 0.4
    agent4 = LSPITD(env, policy, VFA, feature, alpha4, horizon = fixedHorizon)

    agents = [agent1, agent2, agent3, agent4]

    eps_per_point = int(training_episodes / n_plot_points)
    benchmark_data = np.zeros((4, n_plot_points))
    # Benchmark agents without training
    for agent_i in range(4):
        benchmark_data[agent_i][0] = agents[agent_i].benchmark(eps_benchmark)
    # Train and benchmark agents
    for point_i in range(1, n_plot_points):
        for agent_i in range(4):
            print('Agent ' + str(agent_i) + ', Episode ' + str((point_i+1)*eps_per_point))
            agents[agent_i].train(eps_per_point)
            benchmark_data[agent_i][point_i] = agents[agent_i].benchmark(eps_benchmark)

    # Plot results
    plt.figure(figsize=(12, 10))
    xaxis = [eps_per_point*(i+1) for i in range(n_plot_points)]
    title1 = 'LSTD(0), a = ' + str(alpha1)
    title2 = 'LSTD(lamda), a = ' + str(alpha2) + ', l = ' + str(lamda2)
    title3 = 'LSTDQ, a = ' + str(alpha3)
    title4 = 'LSPITD, a = ' + str(alpha4)
    titles = [title1, title2, title3, title4]
    for i in range(4):
        plt.subplot(221+i)
        plt.plot(xaxis, benchmark_data[i])
        plt.xlabel('Training episodes')
        plt.ylabel('Average reward per episode')
        plt.title(titles[i])
    plt.show()
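Every script in this collection scores agents with agent.benchmark(eps_benchmark), which returns the average reward per episode under the current policy and is what gets plotted. The repository's own implementation is not shown in these snippets; a minimal free-standing sketch of the idea, assuming the classic gym API (reset() returns a state, step(a) returns (state, reward, done, info)) and a choose_action callable, could look like this:

def benchmark(env, choose_action, n_episodes, horizon):
    # Run n_episodes without learning and return the average reward per episode
    total_reward = 0.0
    for _ in range(n_episodes):
        state = env.reset()
        for _ in range(horizon):
            state, reward, done, _ = env.step(choose_action(state))
            total_reward += reward
            if done:
                break
    return total_reward / n_episodes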
Example #3
def compareMethods():
    import gym
    import gym_gridworlds
    import numpy as np
    import matplotlib.pyplot as plt
    env = gym.make('Gridworld-v0')

    epsilon = 0.1
    policyVFA = EGreedyPolicyVFA(epsilon)
    policyTab = EGreedyPolicyTabular(epsilon)
    VFA = LinearVFA()
    feature = Featurize()
    init_train_model = 0  # No previous knowledge about model
    H = 20

    training_episodes = 200
    n_plot_points = 100
    eps_benchmark = 100

    # Initialize agents
    alpha1 = 0.4
    plan1 = 20
    agent1 = DynaQ(env,
                   policyVFA,
                   VFA,
                   feature,
                   init_train_model,
                   plan1,
                   alpha1,
                   horizon=H)

    alpha4 = 0.4
    beta4 = 0.4
    plan4 = 20
    agent4 = Dyna2(env,
                   policyVFA,
                   LinearVFA(),
                   VFA,
                   feature,
                   init_train_model,
                   plan4,
                   alpha4,
                   beta4,
                   horizon=H)
    agent4.model.addTerminalStates([0, 15])

    agents = [agent1, agent4]

    eps_per_point = int(training_episodes / n_plot_points)
    benchmark_data = np.zeros((2, n_plot_points))
    # Benchmark agents without training
    for agent_i in range(2):
        benchmark_data[agent_i][0] = agents[agent_i].benchmark(eps_benchmark)
    # Train and benchmark agents

    plt.figure(figsize=(10, 5))
    plt.ion()
    for point_i in range(1, n_plot_points):

        print('DynaQ' + ', Episode ' + str((point_i + 1) * eps_per_point))
        agents[0].train(eps_per_point)
        benchmark_data[0][point_i] = agents[0].benchmark(eps_benchmark)
        print('Dyna2' + ', Episode ' + str((point_i + 1) * eps_per_point))
        agents[1].train(eps_per_point)
        benchmark_data[1][point_i] = agents[1].benchmark(eps_benchmark)

        # Plot results
        plt.clf()  # Clear the previous plot
        fig = plt.gcf()  # Get the current figure
        xaxis = [eps_per_point * (i + 1) for i in range(n_plot_points)]
        title1 = 'DynaQ'
        title4 = 'Dyna2'
        titles = [title1, title4]
        for i in range(2):
            plt.subplot(121 + i)
            plt.plot(xaxis, benchmark_data[i])
            plt.xlabel('Training episodes')
            plt.ylabel('Average reward per episode')
            plt.title(titles[i])
        plt.pause(0.001)  # Brief pause so the figure has time to render
        plt.ioff()  # Turn interactive mode back off
    plt.show()
Example #4
    def __init__(self,
                 env,
                 policy,
                 VFAshort,
                 VFAlong,
                 featurize,
                 train_eps,
                 planning,
                 alpha,
                 beta,
                 gamma=1,
                 horizon=100,
                 verbosity=0):
        # Inputs:
        #   -env: openAI gym environment object
        #   -policy: object containing a policy from which to sample actions
        #   -VFAshort: object containing the value function approximator for the
        #       short-term memory
        #   -VFAlong: object containing the value function approximator for the
        #       long-term memory
        #   -featurize: object which featurizes states
        #   -train_eps: number of random episodes used to generate experience
        #       for training the model initially
        #   -planning: number of planning steps
        #   -alpha: step size parameter for the long-term memory value function update
        #   -beta: step size parameter for the short-term memory value function update
        #   -gamma: reward discount-rate parameter
        #   -horizon: finite horizon steps
        #   -verbosity: if TRUE, prints to screen additional information

        self.env = env
        self.policy = policy
        self.VFAshort = VFAshort
        self.VFAlong = VFAlong
        self.featurize = featurize
        self.planning = planning
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.horizon = horizon
        self.verbosity = verbosity

        self.nS = env.observation_space.n  # Number of states
        self.nA = env.action_space.n  # Number of actions
        self.policy.setNActions(self.nA)
        self.featurize.set_nSnA(self.nS, self.nA)
        # Dimensions of the feature vector
        self.featDim = featurize.featureStateAction(0, 0).shape
        # Initialize weights for the short-term memory VFA
        self.VFAshort.setUpWeights(self.featDim)
        # Initialize weights for the long-term memory VFA
        self.VFAlong.setUpWeights(self.featDim)
        # Q(s,a) is approximated through linear value function approximation,
        # with weights equal to the sum of the weights of the short- and
        # long-term memory VFAs.
        self.QVFA = LinearVFA()
        self.updateQ()  # Initialize QVFA

        # Initially prevent agent from learning
        self.learn = 0

        # Initialize model
        # Initialize the model as a table lookup model
        self.model = TableLookupModel(self.nS, self.nA)
        self.model_learn = 0

        # Random exploration up front to improve the initial model
        self.trainModel(train_eps)
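The updateQ() call above works because, for linear approximators over the same features, summing the weight vectors sums the value estimates, so the Q used for acting is just the short-term plus long-term weights. A minimal sketch of that identity (the helper name combine_linear_weights and the weight layout are assumptions, not this repository's API):

import numpy as np

def combine_linear_weights(w_short, w_long):
    # For linear VFAs sharing phi(s, a):
    # (w_short + w_long) . phi = w_short . phi + w_long . phi
    return np.asarray(w_short) + np.asarray(w_long)

# Usage: the combined estimate equals the sum of the two memories' estimates
phi = np.eye(6)[2]                       # indicator feature for some (s, a) pair
w_short, w_long = np.full(6, 0.1), np.full(6, 0.5)
w_q = combine_linear_weights(w_short, w_long)
assert np.isclose(np.dot(w_q, phi), np.dot(w_short, phi) + np.dot(w_long, phi))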
Example #5
def compareMethods():
    import gym
    import numpy as np
    import matplotlib.pyplot as plt

    env = gym.make('GridWorld-v0')
    policy = SoftmaxPolicyVFA(1)
    feature = Featurize()

    training_episodes = 1000
    n_plot_points = 100
    eps_benchmark = 100
    fixedHorizon = 20

    # Initialize agents
    alpha1 = 0.2
    beta1 = 0.1
    agent1 = QAC(env, policy, LinearVFA(), feature, alpha1, beta1, horizon = fixedHorizon)

    alpha2 = 0.2
    beta2 = 0.1
    agent2 = AdvanAC(env, policy, LinearVFA(), feature, alpha2, beta2, horizon = fixedHorizon)

    alpha3 = 0.2
    beta3 = 0.1
    agent3 = TDAC(env, policy, LinearVFA(), feature, alpha3, beta3, horizon = fixedHorizon)

    alpha4 = 0.2
    beta4 = 0.1
    lamda4 = 0.4
    agent4 = TDlamdaAC(env, policy, LinearVFA(), feature, alpha4, beta4, lamda4, horizon = fixedHorizon)

    alpha5 = 0.2
    beta5 = 0.1
    agent5 = NaturalAC(env, policy, LinearVFA(), feature, alpha5, beta5, horizon = fixedHorizon)
    agents = [agent1, agent2, agent3, agent4, agent5]

    eps_per_point = int(training_episodes / n_plot_points)
    benchmark_data = np.zeros((5, n_plot_points))
    # Benchmark agents without training
    for agent_i in range(5):
        benchmark_data[agent_i][0] = agents[agent_i].benchmark(eps_benchmark)
    # Train and benchmark agents
    for point_i in range(1, n_plot_points):
        for agent_i in range(5):
            print('Agent ' + str(agent_i+1) + ', Episode ' + str((point_i+1)*eps_per_point))
            agents[agent_i].train(eps_per_point)
            benchmark_data[agent_i][point_i] = agents[agent_i].benchmark(eps_benchmark)

    # Plot results
    plt.figure(figsize=(16, 10))
    xaxis = [eps_per_point*(i+1) for i in range(n_plot_points)]
    title1 = 'QAC, a = ' + str(alpha1) + ' b = ' + str(beta1)
    title2 = 'Advantage AC, a = ' + str(alpha2) + ' b = ' + str(beta2)
    title3 = 'TDAC, a = ' + str(alpha3) + ' b = ' + str(beta3)
    title4 = 'TD(lamda)AC, a = ' + str(alpha4) + ' b = ' + str(beta4) + ' l = ' + str(lamda4)
    title5 = 'Natural AC, a = ' + str(alpha5) + ' b = ' + str(beta5)
    titles = [title1, title2, title3, title4, title5]
    for i in range(5):
        plt.subplot(231+i)
        plt.plot(xaxis, benchmark_data[i])
        plt.xlabel('Training episodes')
        plt.ylabel('Average reward per episode')
        plt.title(titles[i])
    plt.show()
Example #6
def compareMethods():
    import gym
    import gym_gridworlds
    import numpy as np
    import matplotlib.pyplot as plt
    env = gym.make('Gridworld-v0')

    epsilon = 0.1
    policyVFA = EGreedyPolicyVFA(epsilon)
    policyTab = EGreedyPolicyTabular(epsilon)
    VFA = LinearVFA()
    feature = Featurize()
    init_train_model = 0  # No previous knowledge about model
    H = 20

    training_episodes = 200
    n_plot_points = 100
    eps_benchmark = 100

    # Initialize agents
    alpha1 = 0.4
    plan1 = 20
    agent1 = DynaQ(env,
                   policyVFA,
                   VFA,
                   feature,
                   init_train_model,
                   plan1,
                   alpha1,
                   horizon=H)

    alpha2 = 0.4
    plan2 = 20
    agent2 = MCTreeSearch(env,
                          policyTab,
                          init_train_model,
                          plan2,
                          alpha2,
                          horizon=H)

    alpha3 = 0.4
    plan3 = 20
    agent3 = TDTreeSearch(env,
                          policyVFA,
                          VFA,
                          feature,
                          init_train_model,
                          plan3,
                          alpha3,
                          horizon=H)
    agent3.model.addTerminalStates([0, 15])

    alpha4 = 0.4
    beta4 = 0.2
    plan4 = 20
    agent4 = Dyna2(env,
                   policyVFA,
                   LinearVFA(),
                   VFA,
                   feature,
                   init_train_model,
                   plan4,
                   alpha4,
                   beta4,
                   horizon=H)
    agent4.model.addTerminalStates([0, 15])

    agents = [agent1, agent2, agent3, agent4]

    eps_per_point = int(training_episodes / n_plot_points)
    benchmark_data = np.zeros((4, n_plot_points))
    # Benchmark agents without training
    for agent_i in range(4):
        benchmark_data[agent_i][0] = agents[agent_i].benchmark(eps_benchmark)
    # Train and benchmark agents
    for point_i in range(1, n_plot_points):
        for agent_i in range(4):
            print('Agent ' + str(agent_i) + ', Episode ' +
                  str((point_i + 1) * eps_per_point))
            agents[agent_i].train(eps_per_point)
            benchmark_data[agent_i][point_i] = agents[agent_i].benchmark(
                eps_benchmark)

    # Plot results
    plt.figure(figsize=(16, 10))
    xaxis = [eps_per_point * (i + 1) for i in range(n_plot_points)]
    title1 = 'DynaQ, n = ' + str(plan1) + ', a = ' + str(alpha1)
    title2 = 'MCTS, n = ' + str(plan2) + ', a = ' + str(alpha2)
    title3 = 'TDTS, n = ' + str(plan3) + ', a = ' + str(alpha3)
    title4 = 'Dyna2, n = ' + str(plan4) + ', a = ' + str(
        alpha4) + ', b = ' + str(beta4)
    titles = [title1, title2, title3, title4]
    for i in range(4):
        plt.subplot(221 + i)
        plt.plot(xaxis, benchmark_data[i])
        plt.xlabel('Training episodes')
        plt.ylabel('Average reward per episode')
        plt.title(titles[i])
    plt.show()
Example #7
def compareMethods():
    import gym
    import numpy as np
    import matplotlib.pyplot as plt

    env = gym.make('GridWorld-v0')
    policy = EGreedyPolicyVFA(0.1)
    VFA = LinearVFA()
    feature = Featurize()

    training_episodes = 400
    n_plot_points = 100
    eps_benchmark = 100

    # Initialize agents
    alpha1 = 0.4
    agent1 = TD(env, policy, VFA, feature, alpha1, horizon=20)

    alpha2 = 0.4
    lamda2 = 0.8
    agent2 = TDlamda(env, policy, VFA, feature, alpha2, lamda2, horizon=20)

    alpha3 = 0.4
    beta3 = 0.2
    agent3 = GradientTD2(env,
                         policy,
                         VFA,
                         feature,
                         alpha3,
                         beta=beta3,
                         horizon=20)

    alpha4 = 0.4
    agent4 = GradientQlearning(env, policy, VFA, feature, alpha4, horizon=20)

    alpha5 = 0.4
    beta5 = 0.2
    agent5 = RLSTD(env, policy, VFA, feature, alpha5, beta=beta5, horizon=20)

    agents = [agent1, agent2, agent3, agent4, agent5]

    eps_per_point = int(training_episodes / n_plot_points)
    benchmark_data = np.zeros((5, n_plot_points))
    # Benchmark agents without training
    for agent_i in range(5):
        benchmark_data[agent_i][0] = agents[agent_i].benchmark(eps_benchmark)
    # Train and benchmark agents
    for point_i in range(1, n_plot_points):
        for agent_i in range(5):
            print('Agent ' + str(agent_i) + ', Episode ' +
                  str((point_i + 1) * eps_per_point))
            agents[agent_i].train(eps_per_point)
            benchmark_data[agent_i][point_i] = agents[agent_i].benchmark(
                eps_benchmark)

    # Plot results
    plt.figure(figsize=(16, 10))
    xaxis = [eps_per_point * (i + 1) for i in range(n_plot_points)]
    title1 = 'VFA TD, a = ' + str(alpha1)
    title2 = 'VFA TD(lamda), a = ' + str(alpha2) + ', l = ' + str(lamda2)
    title3 = 'GTD2, a = ' + str(alpha3) + ', b = ' + str(beta3)
    title4 = 'Gradient Q, a = ' + str(alpha4)
    title5 = 'RLSTD, a = ' + str(alpha5) + ', b = ' + str(beta5)
    titles = [title1, title2, title3, title4, title5]
    for i in range(5):
        plt.subplot(231 + i)
        plt.plot(xaxis, benchmark_data[i])
        plt.xlabel('Training episodes')
        plt.ylabel('Average reward per episode')
        plt.title(titles[i])
    plt.show()