def experiment(algorithm_class, decay_exp):
    """Train one TD agent on the chain MDP and record its Q table over time.

    Parameters
    ----------
    algorithm_class : agent class to instantiate (a TD algorithm taking a
        policy, the MDP info, and a ``learning_rate`` keyword).
    decay_exp : exponent of the exponentially decaying learning rate.

    Returns
    -------
    The sequence of Q tables collected by the callback after every step.
    """
    # Re-seed from the OS so repeated/parallel runs use independent streams.
    np.random.seed()

    # MDP: transition and reward models are loaded from precomputed arrays.
    transitions = np.load('chain_structure/p.npy')
    rewards = np.load('chain_structure/rew.npy')
    mdp = FiniteMDP(transitions, rewards, gamma=.9)

    # Fully exploratory epsilon-greedy policy (epsilon fixed at 1).
    pi = EpsGreedy(epsilon=Parameter(value=1.))

    # Agent with a per-state-action exponentially decaying learning rate.
    lr = ExponentialDecayParameter(value=1., decay_exp=decay_exp,
                                   size=mdp.info.size)
    agent = algorithm_class(pi, mdp.info, learning_rate=lr)

    # Callback snapshots the approximator's Q values at every step.
    collect_q = CollectQ(agent.approximator)
    core = Core(agent, mdp, [collect_q])

    # Train step-by-step (fit after every single transition).
    core.learn(n_steps=20000, n_steps_per_fit=1, quiet=True)

    return collect_q.get_values()
def experiment1(decay_exp, beta_type):
    """Train an RQ-Learning agent on the chain MDP, tracing Q and beta values.

    Parameters
    ----------
    decay_exp : exponent of the exponentially decaying alpha learning rate.
    beta_type : ``'Win'`` selects the windowed variance-increasing beta;
        any other value selects the plain variance-increasing beta.

    Returns
    -------
    ``(Qs, lr_1, lr_5)`` — the collected Q tables plus the beta parameter
    traces recorded for states 0 and 4.
    """
    # Re-seed from the OS so repeated/parallel runs use independent streams.
    np.random.seed()

    # MDP built from precomputed transition/reward arrays.
    mdp = FiniteMDP(np.load('p.npy'), np.load('rew.npy'), gamma=.9)

    # Fully exploratory epsilon-greedy policy (epsilon fixed at 1).
    pi = EpsGreedy(epsilon=Parameter(value=1))

    # Decaying alpha plus a variance-driven beta parameter.
    alpha = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                      size=mdp.info.size)
    if beta_type == 'Win':
        beta = WindowedVarianceIncreasingParameter(value=1,
                                                   size=mdp.info.size,
                                                   tol=10., window=50)
    else:
        beta = VarianceIncreasingParameter(value=1, size=mdp.info.size,
                                           tol=10.)

    agent = RQLearning(pi, mdp.info, {
        'algorithm_params': dict(learning_rate=alpha, beta=beta,
                                 off_policy=True),
        'fit_params': dict(),
    })

    # Callbacks: the Q table plus beta traces for states 0 and 4.
    collect_q = CollectQ(agent.Q)
    collect_lr_1 = CollectParameters(beta, np.array([0]))
    collect_lr_5 = CollectParameters(beta, np.array([4]))
    core = Core(agent, mdp, [collect_q, collect_lr_1, collect_lr_5])

    # Train step-by-step (fit after every single transition).
    core.learn(n_steps=20000, n_steps_per_fit=1, quiet=True)

    return (collect_q.get_values(), collect_lr_1.get_values(),
            collect_lr_5.get_values())