import numpy as np

from pybrain.rl.agents.linearfa import LinearFA_Agent
from pybrain.rl.learners.valuebased.linearfa import LSPI
from pybrain.rl.experiments import EpisodicExperiment

from environment import Environment
from tasks import LSPIBalanceTask
from training import LinearFATraining

# Balance-only task: the agent controls steering only.
task = LSPIBalanceTask(only_steer=True)
learner = LSPI(task.nactions, task.outdim)

# Warm-start the learner from a previously saved weight vector.
# NOTE(review): hard-coded absolute path; consider a command-line argument.
theta = np.loadtxt('/home/fitze/Dropbox/stanford/21quarter/229cs/proj/data/balance_lspi_experimental_112011H17M18S/theta_800.dat')
learner._theta = theta

# TODO this LSPI does not have eligibility traces.
#learner._lambda = 0.95
task.discount = learner.rewardDiscount

agent = LinearFA_Agent(learner)
# The state has a huge number of dimensions, and the logging causes me to run
# out of memory. We needn't log, since learning is done online.
agent.logging = False
#learner.exploring = True

# Greedy, non-learning copy of the agent used to measure performance.
performance_agent = LinearFA_Agent(learner)
performance_agent.logging = False
performance_agent.greedy = True
performance_agent.learning = False

experiment = EpisodicExperiment(task, agent)

# TODO PyBrain says that the learning rate needs to decay, but I don't see that
# described in Randlov's paper.
# A higher number here means the learning rate decays slower.
learner.learningRateDecay = 100000
# NOTE increasing this number above from the default of 100 is what got the
# NOTE(review): LSPI and LinearFA_Agent are used below but were not imported
# in this chunk; imports added to make the script self-contained.
from pybrain.rl.agents.linearfa import LinearFA_Agent
from pybrain.rl.learners.valuebased.linearfa import LSPI
from pybrain.rl.experiments import EpisodicExperiment

from environment import Environment
from tasks import LSPIGotoTask
from training import LinearFATraining

# Goal location for the goto task.
x_g = 10
y_g = 30

task = LSPIGotoTask(
    butt_disturbance_amplitude=0.0000,
    randomInitState=False,
    five_actions=True,
    rewardType=1,
    x_goal=x_g,
    y_goal=y_g)
learner = LSPI(task.nactions, task.outdim, randomInit=False)

# TODO this LSPI does not have eligibility traces.
#learner._lambda = 0.95

# lagoudakis uses 0.8 discount factor
learner.rewardDiscount = 0.8
task.discount = learner.rewardDiscount

agent = LinearFA_Agent(learner)
# The state has a huge number of dimensions, and the logging causes me to run
# out of memory. We needn't log, since learning is done online.
agent.logging = False
agent.epsilonGreedy = True
#learner.exploring = True

performance_agent = LinearFA_Agent(learner)
from pybrain.rl.agents.linearfa import LinearFA_Agent
from pybrain.rl.learners.valuebased.linearfa import LSPI
from pybrain.rl.experiments import EpisodicExperiment

from environment import Environment
from tasks import LSPIBalanceTask
from training import LinearFATraining

# Build the balance task and an LSPI learner sized to it.
task = LSPIBalanceTask()
learner = LSPI(task.nactions, task.outdim)
# TODO this LSPI does not have eligibility traces.
#learner._lambda = 0.95
task.discount = learner.rewardDiscount

# Online learning agent.
agent = LinearFA_Agent(learner)
# The state has a huge number of dimensions, and the logging causes me to run
# out of memory. We needn't log, since learning is done online.
agent.logging = False

# Greedy, non-learning copy of the agent for performance evaluation.
performance_agent = LinearFA_Agent(learner)
performance_agent.logging = False
performance_agent.greedy = True
performance_agent.learning = False

experiment = EpisodicExperiment(task, agent)

# TODO PyBrain says that the learning rate needs to decay, but I don't see that
# described in Randlov's paper.
# A higher number here means the learning rate decays slower.
learner.learningRateDecay = 100000
# NOTE increasing this number above from the default of 100 is what got the
# learning to actually happen, and fixed the bug/issue where the performance
# agent's performance stopped improving.
from pybrain.rl.agents.linearfa import LinearFA_Agent
from pybrain.rl.learners.valuebased.linearfa import LSPI
from pybrain.rl.experiments import EpisodicExperiment

from environment import Environment
from tasks import LSPIGotoTask
from training import LinearFATraining

# Goal location for the goto task.
x_g = 10
y_g = 30

task = LSPIGotoTask(
    butt_disturbance_amplitude=0.0000,
    randomInitState=False,
    five_actions=True,
    rewardType=1,
    x_goal=x_g,
    y_goal=y_g)
learner = LSPI(task.nactions, task.outdim, randomInit=False)

# TODO this LSPI does not have eligibility traces.
#learner._lambda = 0.95

# lagoudakis uses 0.8 discount factor
learner.rewardDiscount = 0.8
task.discount = learner.rewardDiscount

# Online learning agent; epsilon-greedy exploration.
agent = LinearFA_Agent(learner)
# The state has a huge number of dimensions, and the logging causes me to run
# out of memory. We needn't log, since learning is done online.
agent.logging = False
agent.epsilonGreedy = True
#learner.exploring = True

# Greedy, non-learning copy of the agent for performance evaluation.
performance_agent = LinearFA_Agent(learner)
performance_agent.logging = False
performance_agent.greedy = True
performance_agent.learning = False
import numpy as np

from pybrain.rl.agents.linearfa import LinearFA_Agent
from pybrain.rl.learners.valuebased.linearfa import LSPI
from pybrain.rl.experiments import EpisodicExperiment
from pybrain.utilities import one_to_n

from environment import Environment
from tasks import LSPIBalanceTask
from training import LinearFATraining

task = LSPIBalanceTask()
learner = LSPI(task.nactions, task.outdim)
# TODO this LSPI does not have eligibility traces.
#learner._lambda = 0.95
task.discount = learner.rewardDiscount

agent = LinearFA_Agent(learner)
# The state has a huge number of dimensions, and the logging causes me to run
# out of memory. We needn't log, since learning is done online.
agent.logging = False

# TODO PyBrain says that the learning rate needs to decay, but I don't see that
# described in Randlov's paper.
# A higher number here means the learning rate decays slower.
learner.learningRateDecay = 100000
# NOTE increasing this number above from the default of 100 is what got the
# learning to actually happen, and fixed the bug/issue where the performance
# agent's performance stopped improving.

# Sweep over saved weight-vector checkpoints (theta_0.dat .. theta_2400.dat)
# and load each into the learner in turn.
# NOTE(review): hard-coded absolute path; consider a command-line argument.
for idx in np.arange(0, 2500, 100):
    theta = np.loadtxt('/home/fitze/Dropbox/stanford/21quarter/229cs/proj/data/balance_lspi_experimental_112011H17M18S/theta_%i.dat' % idx)
    learner._theta = theta
def getReward(self):
    """Reward: penalize increases in squared tilt and in distance to a
    fixed goal point.

    NOTE(review): reads ``self.env`` — this looks like a method of a task
    class whose header is outside this chunk; confirm the enclosing class
    (and its indentation) when merging.
    """
    # Fixed goal location in the plane (units per Environment — TODO confirm).
    target = np.array([30, 50])
    # Sensor indices 5 and 6 presumably are the front-wheel contact
    # coordinates (xf, yf) — TODO confirm against Environment.sensors.
    (_, _, _, _, _, xf, yf, _, _, _) = self.env.sensors
    dist_to_goal = np.linalg.norm(target - np.array([xf, yf]))
    # Change in squared tilt relative to the previous step.
    delta_tilt = self.env.getTilt()**2 - self.env.last_omega**2
    last_xf = self.env.last_xf
    last_yf = self.env.last_yf
    dist_to_goal_last = np.linalg.norm(target - np.array([last_xf, last_yf]))
    # Positive when the bicycle moved away from the goal this step.
    delta_dist = dist_to_goal - dist_to_goal_last
    return -delta_tilt - delta_dist * 0.01


task = LSPI_task()
learner = LSPI(9, 20)
task.rewardDiscount = 0.8
learner.rewardDiscount = 0.8

agent = LinearFA_Agent(learner)
agent.epsilonGreedy = True
exp = EpisodicExperiment(task, agent)
learner.learningRateDecay = 3000

# Greedy, non-learning agent used to evaluate performance.
max_agent = LinearFA_Agent(learner)
# BUG FIX: was 'max_agent.learnerning = False' — a typo that merely created
# a dead attribute and left learning enabled on the evaluation agent.
max_agent.learning = False
max_agent.greedy = True

task.env.saveWheelContactTrajectories(True)
plt.ion()
plt.figure(figsize=(8, 4))