def compareMethods():
    import gym
    import numpy as np
    import matplotlib.pyplot as plt

    env = gym.make('GridWorld-v0')
    policy = EGreedyPolicyVFA(0.1)
    VFA = LinearVFA()
    feature = Featurize()

    training_episodes = 1000
    n_plot_points = 100
    eps_benchmark = 100
    fixedHorizon = 20

    # Initialize agents
    alpha1 = 0.4
    agent1 = LeastSquaresTD(env, policy, VFA, feature, alpha1, horizon=fixedHorizon)
    alpha2 = 0.4
    lamda2 = 0.8
    agent2 = LSTDlamda(env, policy, VFA, feature, alpha2, lamda2, horizon=fixedHorizon)
    alpha3 = 0.4
    agent3 = LSTDQ(env, policy, VFA, feature, alpha3, horizon=fixedHorizon)
    alpha4 = 0.4
    agent4 = LSPITD(env, policy, VFA, feature, alpha4, horizon=fixedHorizon)
    agents = [agent1, agent2, agent3, agent4]

    eps_per_point = int(training_episodes / n_plot_points)
    benchmark_data = np.zeros((4, n_plot_points))

    # Benchmark agents without training
    for agent_i in range(4):
        benchmark_data[agent_i][0] = agents[agent_i].benchmark(eps_benchmark)

    # Train and benchmark agents
    for point_i in range(1, n_plot_points):
        for agent_i in range(4):
            print('Agent ' + str(agent_i) + ', Episode ' + str((point_i + 1) * eps_per_point))
            agents[agent_i].train(eps_per_point)
            benchmark_data[agent_i][point_i] = agents[agent_i].benchmark(eps_benchmark)

    # Plot results
    plt.figure(figsize=(12, 10))
    xaxis = [eps_per_point * (i + 1) for i in range(n_plot_points)]
    title1 = 'LSTD(0), a = ' + str(alpha1)
    title2 = 'LSTD(lamda), a = ' + str(alpha2) + ', l = ' + str(lamda2)
    title3 = 'LSTDQ, a = ' + str(alpha3)
    title4 = 'LSPITD, a = ' + str(alpha4)
    titles = [title1, title2, title3, title4]
    for i in range(4):
        plt.subplot(221 + i)
        plt.plot(xaxis, benchmark_data[i])
        plt.xlabel('Training episodes')
        plt.ylabel('Average reward per episode')
        plt.title(titles[i])
    plt.show()
class AdvanAC(AgentQAC):
    def setUpCritic(self, featDim, nS, nA):
        self.VFAcritic.setUpWeights(featDim)  # Initialize weights of the critic VFA
        # In order to compute the advantage function it is necessary to keep a
        # second set of weights, so that both Q(s,a) and V(s) are approximated.
        # The VFA chosen here is a linear combination of features.
        self.VFAstateval = LinearVFA()
        self.VFAstateval.setUpWeights((self.nS, 1))
        self.kappa = self.beta * 0.4  # Step size parameter

    def step(self, state, action):
        # Take A, observe R and S'
        state_prime, reward, done, info = self.env.step(action)
        # Choose A' using a policy derived from S'
        action_prime = self.policy.getAction(self.featurize, state_prime)

        if self.learn:
            # Compute the pertinent feature vectors
            features = self.featurize.featureStateAction(state, action)
            features_prime = self.featurize.featureStateAction(state_prime, action_prime)
            features_state = self.featurize.featureState(state)
            features_stateprime = self.featurize.featureState(state_prime)

            # Compute the value of the features via value function approximation
            value = self.VFAcritic.getValue(features)
            value_prime = self.VFAcritic.getValue(features_prime)
            value_state = self.VFAstateval.getValue(features_state)
            value_stateprime = self.VFAstateval.getValue(features_stateprime)

            delta_q = reward + self.gamma * value_prime - value
            delta_v = reward + self.gamma * value_stateprime - value_state

            # Actor update
            advantage = value - value_state
            gradient = self.policy.getGradient(self.featurize, state, action)
            delta_theta = self.alpha * gradient * advantage
            self.policy.updateWeightsDelta(delta_theta)

            # Critic update
            delta_weight = self.beta * delta_q * features
            self.VFAcritic.updateWeightsDelta(delta_weight)

            # State value function update
            delta_stateWeight = self.kappa * delta_v * features_state
            self.VFAstateval.updateWeightsDelta(delta_stateWeight)

        return state_prime, action_prime, reward, done
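
# A minimal, self-contained sketch of the update rule implemented in
# AdvanAC.step above, using plain numpy. The feature vectors, step sizes and
# the stand-in policy score gradient below are illustrative assumptions,
# not part of this repo's API.
def _advantage_update_sketch():
    import numpy as np
    # Toy linear critics: Q(s,a) ~ w_q . phi(s,a) and V(s) ~ w_v . phi(s)
    phi_sa, phi_sa_prime = np.array([1.0, 0.0]), np.array([0.0, 1.0])
    phi_s, phi_s_prime = np.array([1.0]), np.array([0.5])
    w_q, w_v, theta = np.zeros(2), np.zeros(1), np.zeros(2)
    alpha, beta, kappa, gamma, reward = 0.2, 0.1, 0.04, 1.0, 1.0

    q, q_prime = w_q @ phi_sa, w_q @ phi_sa_prime
    v, v_prime = w_v @ phi_s, w_v @ phi_s_prime
    delta_q = reward + gamma * q_prime - q   # TD error for Q(s,a)
    delta_v = reward + gamma * v_prime - v   # TD error for V(s)
    advantage = q - v                        # A(s,a) = Q(s,a) - V(s)

    score = phi_sa                           # stand-in for policy.getGradient(...)
    theta += alpha * score * advantage       # actor update
    w_q += beta * delta_q * phi_sa           # critic (Q) update
    w_v += kappa * delta_v * phi_s           # state-value (V) update
    return theta, w_q, w_v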
def compareMethods():
    import gym
    import numpy as np
    import matplotlib.pyplot as plt
    import gym_gridworlds

    env = gym.make('Gridworld-v0')
    epsilon = 0.1
    policyVFA = EGreedyPolicyVFA(epsilon)
    VFA = LinearVFA()
    feature = Featurize()

    init_train_model = 0  # No previous knowledge about the model
    H = 20
    training_episodes = 200
    n_plot_points = 100
    eps_benchmark = 100

    # Initialize agents
    alpha1 = 0.4
    plan1 = 20
    agent1 = DynaQ(env, policyVFA, VFA, feature, init_train_model, plan1,
                   alpha1, horizon=H)
    alpha4 = 0.4
    beta4 = 0.4
    plan4 = 20
    agent4 = Dyna2(env, policyVFA, LinearVFA(), VFA, feature, init_train_model,
                   plan4, alpha4, beta4, horizon=H)
    agent4.model.addTerminalStates([0, 15])
    agents = [agent1, agent4]

    eps_per_point = int(training_episodes / n_plot_points)
    benchmark_data = np.zeros((2, n_plot_points))

    # Benchmark agents without training
    for agent_i in range(2):
        benchmark_data[agent_i][0] = agents[agent_i].benchmark(eps_benchmark)

    # Train and benchmark agents, replotting after every point
    plt.figure(figsize=(10, 5))
    plt.ion()
    for point_i in range(1, n_plot_points):
        print('DynaQ, Episode ' + str((point_i + 1) * eps_per_point))
        agents[0].train(eps_per_point)
        benchmark_data[0][point_i] = agents[0].benchmark(eps_benchmark)
        print('Dyna2, Episode ' + str((point_i + 1) * eps_per_point))
        agents[1].train(eps_per_point)
        benchmark_data[1][point_i] = agents[1].benchmark(eps_benchmark)

        # Plot results
        plt.clf()  # Clear the previous plot
        xaxis = [eps_per_point * (i + 1) for i in range(n_plot_points)]
        titles = ['DynaQ', 'Dyna2']
        for i in range(2):
            plt.subplot(121 + i)
            plt.plot(xaxis, benchmark_data[i])
            plt.xlabel('Training episodes')
            plt.ylabel('Average reward per episode')
            plt.title(titles[i])
        plt.pause(0.001)  # Pause briefly so the figure has time to redraw
    plt.ioff()  # Turn off interactive plotting
    plt.show()
class Dyna2(Agent):
    def __init__(self, env, policy, VFAshort, VFAlong, featurize, train_eps,
                 planning, alpha, beta, gamma=1, horizon=100, verbosity=0):
        # Inputs:
        # -env: openAI gym environment object
        # -policy: object containing a policy from which to sample actions
        # -VFAshort: object containing the value function approximator for the
        #    short-term memory
        # -VFAlong: object containing the value function approximator for the
        #    long-term memory
        # -featurize: object which featurizes states
        # -train_eps: number of random episodes used to generate experience to
        #    train the model initially
        # -planning: number of planning steps
        # -alpha: step size parameter for the long-term memory value function update
        # -beta: step size parameter for the short-term memory value function update
        # -gamma: reward discount-rate parameter
        # -horizon: finite horizon steps
        # -verbosity: if TRUE, prints additional information to screen
        self.env = env
        self.policy = policy
        self.VFAshort = VFAshort
        self.VFAlong = VFAlong
        self.featurize = featurize
        self.planning = planning
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.horizon = horizon
        self.verbosity = verbosity

        self.nS = env.observation_space.n  # Number of states
        self.nA = env.action_space.n  # Number of actions
        self.policy.setNActions(self.nA)
        self.featurize.set_nSnA(self.nS, self.nA)
        # Dimensions of the feature vector
        self.featDim = featurize.featureStateAction(0, 0).shape
        # Initialize the weights of the short- and long-term memory VFAs
        self.VFAshort.setUpWeights(self.featDim)
        self.VFAlong.setUpWeights(self.featDim)

        # Q(s,a) is approximated through linear value function approximation,
        # with weights equal to the sum of the weights of the short- and
        # long-term memory VFAs.
        self.QVFA = LinearVFA()
        self.updateQ()  # Initialize QVFA

        # Initially prevent the agent from learning
        self.learn = 0

        # Initialize the model as a table lookup model
        self.model = TableLookupModel(self.nS, self.nA)
        self.model_learn = 0

        # Uncomment for previous random exploration in order to improve the initial model
        # self.trainModel(train_eps)

    def trainModel(self, train_eps):
        self.model_learn = 1  # Model will be learnt
        self.preventlearn()  # Value function will not be learnt
        self.runEpisodes(train_eps)
        self.model_learn = 0

    def updateQ(self):
        weights_short = self.VFAshort.getWeights()
        weights_long = self.VFAlong.getWeights()
        # Assuming that both VFAs use the same featurize function
        Qweights = weights_long + weights_short
        self.QVFA.setWeights(Qweights)

    # Computes a single episode.
    # Returns the episode return.
    def episode(self):
        episodeReward = 0

        # Clear short-term memory: re-initialize the weights of its VFA
        self.VFAshort.setUpWeights(self.featDim)

        state = self.env.reset()  # Initialize S
        if self.learn:
            self.search(state)  # Search in order to update short-term memory
        self.updateQ()  # Take into account the previous search in the Q VFA
        # Pick A
        action = self.policy.getAction(self.QVFA, self.featurize, state)

        # Repeat for each step of the episode
        for t in range(self.horizon):
            # Take action A, observe R, S'
            state, action, reward, done = self.step(state, action)
            # Update the total episode return
            episodeReward += reward
            # Finish the loop if S' is a terminal state
            if done:
                break

        # Update the policy parameters if the agent is learning
        if self.learn:
            self.policy.episodeUpdate()
        return episodeReward

    def search(self, state):
        for ep in range(self.planning):
            s = state  # Initialize S
            self.updateQ()
            a = self.policy.getAction(self.QVFA, self.featurize, s)  # Pick A
            for k in range(self.horizon):
                s_prime = self.model.sampleStatePrime(s, a)  # Get expected S'
                r = self.model.sampleReward(s, a)  # Get expected R
                self.updateQ()  # Update QVFA
                # Pick A' using QVFA and S'
                a_prime = self.policy.getAction(self.QVFA, self.featurize, s_prime)
                # Update short-term memory weights
                self.TDupdateShort(s, a, r, s_prime, a_prime)
                # Finish the simulated episode if S' is terminal
                if self.model.isTerminal(s_prime):
                    break
                s = s_prime
                a = a_prime

    def step(self, state, action):
        # Take A, observe R and S'
        state_prime, reward, done, info = self.env.step(action)

        # Update the model with the new experience
        if self.learn or self.model_learn:
            experience = (state, action, reward, state_prime)
            self.model.addExperience(experience)

        self.search(state_prime)  # Search tree
        # Pick A'
        action_prime = self.policy.getAction(self.QVFA, self.featurize, state_prime)

        # Update long-term weights
        if self.learn:
            self.TDupdateLong(state, action, reward, state_prime, action_prime)

        return state_prime, action_prime, reward, done

    def getValueMemory(self, features):
        value_short = self.VFAshort.getValue(features)  # Short-term memory value
        value_long = self.VFAlong.getValue(features)  # Long-term memory value
        # The memory value is considered as the sum of short- and long-term memory
        return value_short + value_long

    def TDupdateShort(self, state, action, reward, state_prime, action_prime):
        # Compute the pertinent feature vectors
        features = self.featurize.featureStateAction(state, action)
        features_prime = self.featurize.featureStateAction(state_prime, action_prime)
        # Compute the value of the features via function approximation
        value = self.getValueMemory(features)
        value_prime = self.getValueMemory(features_prime)
        # Obtain delta weight
        delta_w = (self.beta * (reward + self.gamma * value_prime - value)
                   * self.VFAshort.getGradient(features))
        self.VFAshort.updateWeightsDelta(delta_w)

    def TDupdateLong(self, state, action, reward, state_prime, action_prime):
        # Compute the pertinent feature vectors
        features = self.featurize.featureStateAction(state, action)
        features_prime = self.featurize.featureStateAction(state_prime, action_prime)
        # Compute the value of the features via function approximation
        value = self.VFAlong.getValue(features)
        value_prime = self.VFAlong.getValue(features_prime)
        # Obtain delta weight
        delta_w = (self.alpha * (reward + self.gamma * value_prime - value)
                   * self.VFAlong.getGradient(features))
        self.VFAlong.updateWeightsDelta(delta_w)
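
# The core idea of Dyna2 is that the acting value function sums the two
# memories: Q(s,a) ~ (w_long + w_short) . phi(s,a). Below is a minimal numpy
# sketch of the identity that updateQ and getValueMemory rely on; the feature
# vector and weights are made-up toy values, not values from this repo.
def _dyna2_memory_sketch():
    import numpy as np
    phi = np.array([1.0, 0.0, 1.0])       # toy feature vector for some (s, a)
    w_long = np.array([0.5, 0.2, 0.1])    # long-term memory weights
    w_short = np.array([0.1, 0.0, -0.3])  # short-term memory weights (cleared each episode)
    q_weights = w_long + w_short          # what updateQ() stores in the Q VFA
    value = q_weights @ phi               # same quantity getValueMemory(phi) returns
    assert np.isclose(value, w_long @ phi + w_short @ phi)
    return value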
def compareMethods():
    import gym
    import numpy as np
    import matplotlib.pyplot as plt

    env = gym.make('GridWorld-v0')
    policy = SoftmaxPolicyVFA(1)
    feature = Featurize()

    training_episodes = 1000
    n_plot_points = 100
    eps_benchmark = 100
    fixedHorizon = 20

    # Initialize agents
    alpha1 = 0.2
    beta1 = 0.1
    agent1 = QAC(env, policy, LinearVFA(), feature, alpha1, beta1, horizon=fixedHorizon)
    alpha2 = 0.2
    beta2 = 0.1
    agent2 = AdvanAC(env, policy, LinearVFA(), feature, alpha2, beta2, horizon=fixedHorizon)
    alpha3 = 0.2
    beta3 = 0.1
    agent3 = TDAC(env, policy, LinearVFA(), feature, alpha3, beta3, horizon=fixedHorizon)
    alpha4 = 0.2
    beta4 = 0.1
    lamda4 = 0.4
    agent4 = TDlamdaAC(env, policy, LinearVFA(), feature, alpha4, beta4, lamda4, horizon=fixedHorizon)
    alpha5 = 0.2
    beta5 = 0.1
    agent5 = NaturalAC(env, policy, LinearVFA(), feature, alpha5, beta5, horizon=fixedHorizon)
    agents = [agent1, agent2, agent3, agent4, agent5]

    eps_per_point = int(training_episodes / n_plot_points)
    benchmark_data = np.zeros((5, n_plot_points))

    # Benchmark agents without training
    for agent_i in range(5):
        benchmark_data[agent_i][0] = agents[agent_i].benchmark(eps_benchmark)

    # Train and benchmark agents
    for point_i in range(1, n_plot_points):
        for agent_i in range(5):
            print('Agent ' + str(agent_i + 1) + ', Episode ' + str((point_i + 1) * eps_per_point))
            agents[agent_i].train(eps_per_point)
            benchmark_data[agent_i][point_i] = agents[agent_i].benchmark(eps_benchmark)

    # Plot results
    plt.figure(figsize=(16, 10))
    xaxis = [eps_per_point * (i + 1) for i in range(n_plot_points)]
    title1 = 'QAC, a = ' + str(alpha1) + ', b = ' + str(beta1)
    title2 = 'Advantage AC, a = ' + str(alpha2) + ', b = ' + str(beta2)
    title3 = 'TDAC, a = ' + str(alpha3) + ', b = ' + str(beta3)
    title4 = 'TD(lamda)AC, a = ' + str(alpha4) + ', b = ' + str(beta4) + ', l = ' + str(lamda4)
    title5 = 'Natural AC, a = ' + str(alpha5) + ', b = ' + str(beta5)
    titles = [title1, title2, title3, title4, title5]
    for i in range(5):
        plt.subplot(231 + i)
        plt.plot(xaxis, benchmark_data[i])
        plt.xlabel('Training episodes')
        plt.ylabel('Average reward per episode')
        plt.title(titles[i])
    plt.show()
def compareMethods():
    import gym
    import numpy as np
    import matplotlib.pyplot as plt
    import gym_gridworlds

    env = gym.make('Gridworld-v0')
    epsilon = 0.1
    policyVFA = EGreedyPolicyVFA(epsilon)
    policyTab = EGreedyPolicyTabular(epsilon)
    VFA = LinearVFA()
    feature = Featurize()

    init_train_model = 0  # No previous knowledge about the model
    H = 20
    training_episodes = 200
    n_plot_points = 100
    eps_benchmark = 100

    # Initialize agents
    alpha1 = 0.4
    plan1 = 20
    agent1 = DynaQ(env, policyVFA, VFA, feature, init_train_model, plan1,
                   alpha1, horizon=H)
    alpha2 = 0.4
    plan2 = 20
    agent2 = MCTreeSearch(env, policyTab, init_train_model, plan2, alpha2, horizon=H)
    alpha3 = 0.4
    plan3 = 20
    agent3 = TDTreeSearch(env, policyVFA, VFA, feature, init_train_model, plan3,
                          alpha3, horizon=H)
    agent3.model.addTerminalStates([0, 15])
    alpha4 = 0.4
    beta4 = 0.2
    plan4 = 20
    agent4 = Dyna2(env, policyVFA, LinearVFA(), VFA, feature, init_train_model,
                   plan4, alpha4, beta4, horizon=H)
    agent4.model.addTerminalStates([0, 15])
    agents = [agent1, agent2, agent3, agent4]

    eps_per_point = int(training_episodes / n_plot_points)
    benchmark_data = np.zeros((4, n_plot_points))

    # Benchmark agents without training
    for agent_i in range(4):
        benchmark_data[agent_i][0] = agents[agent_i].benchmark(eps_benchmark)

    # Train and benchmark agents
    for point_i in range(1, n_plot_points):
        for agent_i in range(4):
            print('Agent ' + str(agent_i) + ', Episode ' + str((point_i + 1) * eps_per_point))
            agents[agent_i].train(eps_per_point)
            benchmark_data[agent_i][point_i] = agents[agent_i].benchmark(eps_benchmark)

    # Plot results
    plt.figure(figsize=(16, 10))
    xaxis = [eps_per_point * (i + 1) for i in range(n_plot_points)]
    title1 = 'DynaQ, n = ' + str(plan1) + ', a = ' + str(alpha1)
    title2 = 'MCTS, n = ' + str(plan2) + ', a = ' + str(alpha2)
    title3 = 'TDTS, n = ' + str(plan3) + ', a = ' + str(alpha3)
    title4 = 'Dyna2, n = ' + str(plan4) + ', a = ' + str(alpha4) + ', b = ' + str(beta4)
    titles = [title1, title2, title3, title4]
    for i in range(4):
        plt.subplot(221 + i)
        plt.plot(xaxis, benchmark_data[i])
        plt.xlabel('Training episodes')
        plt.ylabel('Average reward per episode')
        plt.title(titles[i])
    plt.show()
def compareMethods():
    import gym
    import numpy as np
    import matplotlib.pyplot as plt

    env = gym.make('GridWorld-v0')
    policy = EGreedyPolicyVFA(0.1)
    VFA = LinearVFA()
    feature = Featurize()

    training_episodes = 400
    n_plot_points = 100
    eps_benchmark = 100

    # Initialize agents
    alpha1 = 0.4
    agent1 = TD(env, policy, VFA, feature, alpha1, horizon=20)
    alpha2 = 0.4
    lamda2 = 0.8
    agent2 = TDlamda(env, policy, VFA, feature, alpha2, lamda2, horizon=20)
    alpha3 = 0.4
    beta3 = 0.2
    agent3 = GradientTD2(env, policy, VFA, feature, alpha3, beta=beta3, horizon=20)
    alpha4 = 0.4
    agent4 = GradientQlearning(env, policy, VFA, feature, alpha4, horizon=20)
    alpha5 = 0.4
    beta5 = 0.2
    agent5 = RLSTD(env, policy, VFA, feature, alpha5, beta=beta5, horizon=20)
    agents = [agent1, agent2, agent3, agent4, agent5]

    eps_per_point = int(training_episodes / n_plot_points)
    benchmark_data = np.zeros((5, n_plot_points))

    # Benchmark agents without training
    for agent_i in range(5):
        benchmark_data[agent_i][0] = agents[agent_i].benchmark(eps_benchmark)

    # Train and benchmark agents
    for point_i in range(1, n_plot_points):
        for agent_i in range(5):
            print('Agent ' + str(agent_i) + ', Episode ' + str((point_i + 1) * eps_per_point))
            agents[agent_i].train(eps_per_point)
            benchmark_data[agent_i][point_i] = agents[agent_i].benchmark(eps_benchmark)

    # Plot results
    plt.figure(figsize=(16, 10))
    xaxis = [eps_per_point * (i + 1) for i in range(n_plot_points)]
    title1 = 'VFA TD, a = ' + str(alpha1)
    title2 = 'VFA TD(lamda), a = ' + str(alpha2) + ', l = ' + str(lamda2)
    title3 = 'GTD2, a = ' + str(alpha3) + ', b = ' + str(beta3)
    title4 = 'Gradient Q, a = ' + str(alpha4)
    title5 = 'RLSTD, a = ' + str(alpha5) + ', b = ' + str(beta5)
    titles = [title1, title2, title3, title4, title5]
    for i in range(5):
        plt.subplot(231 + i)
        plt.plot(xaxis, benchmark_data[i])
        plt.xlabel('Training episodes')
        plt.ylabel('Average reward per episode')
        plt.title(titles[i])
    plt.show()