def experiment(algorithm_class, decay_exp):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(pi, mdp.info, **algorithm_params)

    # Algorithm
    start = mdp.convert_to_int(mdp._start, mdp._width)
    collect_max_Q = CollectMaxQ(agent.approximator, start)
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset, collect_max_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
def _fit_boosted(self, x):
    """
    Single fit iteration for boosted FQI.

    Args:
        x (list): the dataset.

    """
    state, action, reward, next_state, absorbing, _ = parse_dataset(x)
    if self._target is None:
        self._target = reward
    else:
        self._next_q += self.approximator.predict(next_state,
                                                  idx=self._idx - 1)
        if np.any(absorbing):
            self._next_q *= 1 - absorbing.reshape(-1, 1)

        max_q = np.max(self._next_q, axis=1)
        self._target = reward + self.mdp_info.gamma * max_q

    self._target -= self._prediction
    self._prediction += self._target

    self.approximator.fit(state, action, self._target, idx=self._idx,
                          **self._fit_params)

    self._idx += 1
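# --- Illustrative sketch (not part of the library code above) ---
# A minimal numpy example of the residual bookkeeping in _fit_boosted: each
# boosting iteration fits the difference between the Bellman target and the
# sum of the predictions of the models fitted so far. All numbers below are
# hypothetical toy data.
import numpy as np

gamma = 0.9
reward = np.array([0., 1., 0.5])
next_q = np.array([[1.0, 0.2], [0.0, 0.3], [0.5, 0.5]])   # Q(s', .) accumulated so far
absorbing = np.array([0, 1, 0])

next_q = next_q * (1 - absorbing.reshape(-1, 1))           # no bootstrap from absorbing states
bellman_target = reward + gamma * np.max(next_q, axis=1)

prediction = np.array([0.3, 0.8, 0.2])                     # sum of predictions of fitted models
residual_target = bellman_target - prediction              # what the next model is trained on
prediction = prediction + residual_target                  # equals bellman_target after the update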
def fit(self, dataset):
    phi_state, action, reward, phi_next_state, absorbing, _ = parse_dataset(
        dataset, self.phi)
    phi_state_action = get_action_features(phi_state, action,
                                           self.mdp_info.action_space.n)

    norm = np.inf
    while norm > self._epsilon:
        q = self.approximator.predict(phi_next_state)
        if np.any(absorbing):
            q *= 1 - absorbing.reshape(-1, 1)

        next_action = np.argmax(q, axis=1).reshape(-1, 1)
        phi_next_state_next_action = get_action_features(
            phi_next_state, next_action, self.mdp_info.action_space.n)

        tmp = phi_state_action - self.mdp_info.gamma * \
            phi_next_state_next_action
        self._A += phi_state_action.T.dot(tmp)
        self._b += (phi_state_action.T.dot(reward)).reshape(-1, 1)

        old_w = self.approximator.get_weights()
        if np.linalg.matrix_rank(self._A) == self._A.shape[1]:
            w = np.linalg.solve(self._A, self._b).ravel()
        else:
            w = np.linalg.pinv(self._A).dot(self._b).ravel()
        self.approximator.set_weights(w)

        norm = np.linalg.norm(w - old_w)
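# --- Illustrative sketch (not part of the library code above) ---
# The loop above accumulates the LSTDQ linear system A w = b with
# A = Phi^T (Phi - gamma * Phi') and b = Phi^T r. A standalone toy version,
# with hypothetical feature matrices, looks like this:
import numpy as np

gamma = 0.9
phi_sa = np.array([[1., 0.], [0., 1.], [1., 1.]])       # features of (s, a)
phi_next_sa = np.array([[0., 1.], [1., 0.], [0., 0.]])  # features of (s', greedy a')
r = np.array([0., 1., 0.5])

A = phi_sa.T.dot(phi_sa - gamma * phi_next_sa)
b = phi_sa.T.dot(r).reshape(-1, 1)

# Solve exactly when A has full column rank, otherwise use the pseudo-inverse,
# mirroring the branch in fit() above.
if np.linalg.matrix_rank(A) == A.shape[1]:
    w = np.linalg.solve(A, b).ravel()
else:
    w = np.linalg.pinv(A).dot(b).ravel()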
def _fit(self, x):
    """
    Single fit iteration: the dataset is split in two halves and each half
    is used to update one of the two estimators (double estimator).

    Args:
        x (list): the dataset.

    """
    state = list()
    action = list()
    reward = list()
    next_state = list()
    absorbing = list()

    half = len(x) // 2
    for i in range(2):
        s, a, r, ss, ab, _ = parse_dataset(x[i * half:(i + 1) * half])
        state.append(s)
        action.append(a)
        reward.append(r)
        next_state.append(ss)
        absorbing.append(ab)

    if self._target is None:
        self._target = reward
    else:
        for i in range(2):
            q_i = self.approximator.predict(next_state[i], idx=i)

            amax_q = np.expand_dims(np.argmax(q_i, axis=1), axis=1)
            max_q = self.approximator.predict(next_state[i], amax_q,
                                              idx=1 - i)
            if np.any(absorbing[i]):
                max_q *= 1 - absorbing[i]
            self._target[i] = reward[i] + self.mdp_info.gamma * max_q

    for i in range(2):
        self.approximator.fit(state[i], action[i], self._target[i], idx=i,
                              **self._fit_params)
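# --- Illustrative sketch (not part of the library code above) ---
# The double-estimator target used in _fit above, on hypothetical toy data:
# estimator i picks the greedy action and the other estimator (1 - i)
# evaluates it, which reduces the overestimation bias of a single max.
import numpy as np

gamma = 0.9
reward = np.array([0., 1.])
absorbing = np.array([0, 1])
q_i = np.array([[1.0, 2.0], [0.5, 0.1]])       # Q_i(s', .)      (selection)
q_other = np.array([[1.5, 0.5], [0.2, 0.3]])   # Q_{1-i}(s', .)  (evaluation)

amax = np.argmax(q_i, axis=1)
max_q = q_other[np.arange(len(amax)), amax]
max_q = max_q * (1 - absorbing)                # absorbing states contribute no bootstrap
target = reward + gamma * max_q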
def _fit(self, x):
    """
    Single fit iteration (older variant of the double-estimator update
    above, using the externally provided fit parameters dictionary).

    Args:
        x (list): the dataset.

    """
    state = list()
    action = list()
    reward = list()
    next_state = list()
    absorbing = list()

    half = len(x) // 2
    for i in range(2):
        s, a, r, ss, ab, _ = parse_dataset(x[i * half:(i + 1) * half])
        state.append(s)
        action.append(a)
        reward.append(r)
        next_state.append(ss)
        absorbing.append(ab)

    if self._target is None:
        self._target = reward
    else:
        for i in range(2):
            q_i = self.approximator.predict(next_state[i], idx=i)

            amax_q = np.expand_dims(np.argmax(q_i, axis=1), axis=1)
            max_q = self.approximator.predict(next_state[i], amax_q,
                                              idx=1 - i)
            if np.any(absorbing[i]):
                max_q *= 1 - absorbing[i]
            self._target[i] = reward[i] + self.mdp_info.gamma * max_q

    for i in range(2):
        self.approximator.fit(state[i], action[i], self._target[i], idx=i,
                              **self.params['fit_params'])
def fit(self, dataset):
    if not self._quiet:
        tqdm.write('Iteration ' + str(self._iter))

    x, u, r, xn, absorbing, last = parse_dataset(dataset)
    x = x.astype(np.float32)
    u = u.astype(np.float32)
    r = r.astype(np.float32)
    xn = xn.astype(np.float32)

    obs = torch.tensor(x, dtype=torch.float)
    act = torch.tensor(u, dtype=torch.float)
    v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last,
                                   self.mdp_info.gamma, self._lambda)
    np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
    adv = torch.tensor(np_adv, dtype=torch.float)

    old_pol_dist = self.policy.distribution_t(obs)
    old_log_p = old_pol_dist.log_prob(act)[:, None].detach()

    self._V.fit(x, v_target, **self._critic_fit_params)

    self._update_policy(obs, act, adv, old_log_p)

    # Print fit information
    self._print_fit_info(dataset, x, v_target, old_pol_dist)
    self._iter += 1
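# --- Illustrative sketch (not part of the library code above) ---
# A standalone numpy version of generalized advantage estimation (GAE) as
# used through compute_gae above: TD residuals are accumulated backwards with
# a gamma * lambda discount, restarting at episode boundaries. v and v_next
# below stand in for hypothetical value-function outputs; the exact handling
# of the absorbing/last flags in the library may differ slightly.
import numpy as np

gamma, lam = 0.99, 0.95
r = np.array([0., 0., 1.])
v = np.array([0.5, 0.6, 0.7])        # V(s_t)
v_next = np.array([0.6, 0.7, 0.0])   # V(s_{t+1})
absorbing = np.array([0, 0, 1])
last = np.array([0, 0, 1])

delta = r + gamma * v_next * (1 - absorbing) - v
adv = np.zeros_like(delta)
gae = 0.
for t in reversed(range(len(delta))):
    gae = delta[t] + gamma * lam * gae * (1 - last[t])
    adv[t] = gae
v_target = adv + v

# As in fit() above, the advantages are standardized before use:
adv = (adv - adv.mean()) / (adv.std() + 1e-8)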
def experiment_others(alg, decay_exp):
    np.random.seed()

    # MDP
    grid_map = "simple_gridmap.txt"
    mdp = GridWorldGenerator(grid_map=grid_map)

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    alpha = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                      size=mdp.info.size)
    algorithm_params = dict(learning_rate=alpha)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = alg(pi, mdp.info, agent_params)

    # Algorithm
    collect_max_Q = CollectMaxQ(agent.Q, mdp.convert_to_int(mdp._start,
                                                            mdp._width))
    collect_dataset = CollectDataset()
    callbacks = [collect_max_Q, collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
def fit(self, dataset):
    state, action, reward, next_state, absorbing, _ = parse_dataset(dataset)

    v, adv = compute_advantage_montecarlo(self._V, state, next_state,
                                          reward, absorbing,
                                          self.mdp_info.gamma)
    self._V.fit(state, v, **self._critic_fit_params)

    loss = self._loss(state, action, adv)
    self._optimize_actor_parameters(loss)
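# --- Illustrative sketch (not part of the library code above) ---
# A simplified numpy version of the Monte Carlo advantage used above:
# discounted returns are accumulated backwards (restarting after absorbing
# transitions) and the critic prediction is subtracted. v_pred is a
# hypothetical critic output; the library helper may handle truncated
# episodes differently.
import numpy as np

gamma = 0.99
r = np.array([0., 0., 1.])
absorbing = np.array([0, 0, 1])
v_pred = np.array([0.4, 0.5, 0.9])

returns = np.zeros_like(r)
g = 0.
for t in reversed(range(len(r))):
    g = r[t] + gamma * g * (1 - absorbing[t])
    returns[t] = g

adv = returns - v_pred   # advantage fed to the actor loss
v_target = returns       # regression target for the critic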
def experiment(decay_exp, windowed, tol):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    alpha = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                      size=mdp.info.size)
    if windowed:
        beta = WindowedVarianceIncreasingParameter(value=1,
                                                   size=mdp.info.size,
                                                   tol=tol, window=50)
    else:
        beta = VarianceIncreasingParameter(value=1, size=mdp.info.size,
                                           tol=tol)
    algorithm_params = dict(learning_rate=alpha, beta=beta, off_policy=True)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = RQLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_max_Q = CollectMaxQ(agent.Q, mdp.convert_to_int(mdp._start,
                                                            mdp._width))
    collect_dataset = CollectDataset()
    callbacks = [collect_max_Q, collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
def fit(self, dataset):
    if not self._quiet:
        tqdm.write('Iteration ' + str(self._iter))

    state, action, reward, next_state, absorbing, last = parse_dataset(dataset)
    x = state.astype(np.float32)
    u = action.astype(np.float32)
    r = reward.astype(np.float32)
    xn = next_state.astype(np.float32)

    obs = to_float_tensor(x, self.policy.use_cuda)
    act = to_float_tensor(u, self.policy.use_cuda)
    v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last,
                                   self.mdp_info.gamma, self._lambda)
    np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
    adv = to_float_tensor(np_adv, self.policy.use_cuda)

    # Policy update
    self._old_policy = deepcopy(self.policy)
    old_pol_dist = self._old_policy.distribution_t(obs)
    old_log_prob = self._old_policy.log_prob_t(obs, act).detach()

    zero_grad(self.policy.parameters())
    loss = self._compute_loss(obs, act, adv, old_log_prob)

    prev_loss = loss.item()

    # Compute Gradient
    loss.backward()
    g = get_gradient(self.policy.parameters())

    # Compute direction through conjugate gradient
    stepdir = self._conjugate_gradient(g, obs, old_pol_dist)

    # Line search
    self._line_search(obs, act, adv, old_log_prob, old_pol_dist, prev_loss,
                      stepdir)

    # VF update
    self._V.fit(x, v_target, **self._critic_fit_params)

    # Print fit information
    self._print_fit_info(dataset, x, v_target, old_pol_dist)
    self._iter += 1
def _fit(self, x):
    """
    Single fit iteration.

    Args:
        x (list): the dataset.

    """
    state, action, reward, next_state, absorbing, _ = parse_dataset(x)

    if self._target is None:
        self._target = reward
    else:
        q = self.approximator.predict(next_state)
        if np.any(absorbing):
            q *= 1 - absorbing.reshape(-1, 1)

        max_q = np.max(q, axis=1)
        self._target = reward + self.mdp_info.gamma * max_q

    self.approximator.fit(state, action, self._target, **self._fit_params)
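# --- Illustrative sketch (not part of the library code above) ---
# Fitted Q-iteration repeats the _fit step above: at each iteration the new
# regression target is r + gamma * max_a Q_k(s', a), where Q_k is the model
# from the previous iteration. A tabular toy version on hypothetical data:
import numpy as np

gamma = 0.9
# transitions: (state, action, reward, next_state, absorbing)
s = np.array([0, 0, 1, 1])
a = np.array([0, 1, 0, 1])
r = np.array([0., 1., 0., 1.])
ns = np.array([1, 1, 0, 0])
absorbing = np.array([0, 0, 0, 1])

Q = np.zeros((2, 2))
for _ in range(20):                       # FQI iterations
    next_q = Q[ns] * (1 - absorbing.reshape(-1, 1))
    target = r + gamma * np.max(next_q, axis=1)
    Q[s, a] = target                      # "fit" step: tabular assignment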
def fit(self, dataset):
    if not self._quiet:
        tqdm.write('Iteration ' + str(self._iter))

    state, action, reward, next_state, absorbing, last = parse_dataset(dataset)
    x = state.astype(np.float32)
    u = action.astype(np.float32)
    r = reward.astype(np.float32)
    xn = next_state.astype(np.float32)

    obs = torch.tensor(x, dtype=torch.float)
    act = torch.tensor(u, dtype=torch.float)
    v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last,
                                   self.mdp_info.gamma, self._lambda)
    np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
    adv = torch.tensor(np_adv, dtype=torch.float)

    # Policy update
    old_pol_dist = self.policy.distribution_t(obs)
    old_log_prob = self.policy.log_prob_t(obs, act).detach()

    self._zero_grad()
    loss = self._compute_loss(obs, act, adv, old_log_prob)

    prev_loss = loss.item()

    # Compute Gradient
    loss.backward(retain_graph=True)
    g = get_gradient(self.policy.parameters())

    # Compute direction through conjugate gradient
    stepdir = self._conjugate_gradient(g, obs, old_pol_dist)

    # Line search
    shs = .5 * stepdir.dot(self._fisher_vector_product(
        torch.from_numpy(stepdir), obs, old_pol_dist))
    lm = np.sqrt(shs / self._max_kl)
    fullstep = stepdir / lm
    stepsize = 1.

    theta_old = self.policy.get_weights()

    violation = True
    for _ in range(self._n_epochs_line_search):
        theta_new = theta_old + fullstep * stepsize
        self.policy.set_weights(theta_new)

        new_loss = self._compute_loss(obs, act, adv, old_log_prob)
        kl = self._compute_kl(obs, old_pol_dist)
        improve = new_loss - prev_loss
        if kl <= self._max_kl * 1.5 or improve >= 0:
            violation = False
            break
        stepsize *= .5

    if violation:
        self.policy.set_weights(theta_old)

    # VF update
    self._V.fit(x, v_target, **self._critic_fit_params)

    # Print fit information
    self._print_fit_info(dataset, x, v_target, old_pol_dist)
    self._iter += 1
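# --- Illustrative sketch (not part of the library code above) ---
# The step-size rescaling in the line search above: with s the conjugate
# gradient direction and H the Fisher matrix, shs = 0.5 * s^T H s, and the
# full step s / sqrt(shs / max_kl) makes the quadratic KL approximation
# 0.5 * step^T H step equal to max_kl. The numbers below are hypothetical.
import numpy as np

max_kl = 1e-2
H = np.array([[2.0, 0.0], [0.0, 1.0]])   # stand-in for the Fisher-vector product
s = np.array([0.3, -0.1])                # stand-in for the conjugate gradient direction

shs = .5 * s.dot(H.dot(s))
lm = np.sqrt(shs / max_kl)
fullstep = s / lm

# Check: the quadratic KL estimate of the full step equals max_kl.
assert np.isclose(.5 * fullstep.dot(H.dot(fullstep)), max_kl)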
def experiment(mdp, test_states, test_actions, test_q, names):
    np.random.seed()

    n_games = len(mdp)
    input_shape = [(m.info.observation_space.shape[0],) for m in mdp]
    n_actions_per_head = [(m.info.action_space.n,) for m in mdp]

    test_states = np.array([test_states]).repeat(len(mdp), 0).reshape(-1, 2)
    test_actions = np.array([test_actions]).repeat(len(mdp), 0).reshape(-1, 1)
    test_idxs = np.ones(len(test_states), dtype=int) * np.arange(
        len(mdp)).repeat(len(test_states) // len(mdp), 0)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedyMultiple(parameter=epsilon,
                           n_actions_per_head=n_actions_per_head)

    # Approximator
    optimizer = {'class': optim.Adam, 'params': dict()}
    loss = LossFunction(n_games)

    approximator_params = dict(network=Network,
                               input_shape=input_shape,
                               output_shape=n_actions_per_head,
                               optimizer=optimizer,
                               loss=loss,
                               features='sigmoid',
                               n_features=30,
                               use_cuda=True,
                               quiet=False)
    approximator = TorchApproximator

    dataset = list()
    len_datasets = list()
    for i in range(len(mdp)):
        d = pickle.load(open('dataset_%s.pkl' % names[i], 'rb'))
        len_datasets.append(len(d))
        dataset += d

    # Agent
    algorithm_params = dict(n_iterations=1,
                            n_actions_per_head=n_actions_per_head,
                            fit_params=dict(patience=100, epsilon=1e-6))
    agent = FQI(approximator, pi, mdp[0].info,
                approximator_params=approximator_params, **algorithm_params)

    qs = list()
    scores = list()

    idxs = list()
    for i, l in enumerate(len_datasets):
        idxs += (np.ones(l, dtype=int) * i).tolist()
    idxs = np.array(idxs)
    state, action, reward, next_state, absorbing, _ = parse_dataset(dataset)
    for _ in trange(50, dynamic_ncols=True, disable=False, leave=False):
        agent._fit(state, action, reward, next_state, absorbing, idxs)

        # Algorithm
        core = Core(agent, mdp)

        test_epsilon = Parameter(0.)
        pi.set_parameter(test_epsilon)
        dataset = core.evaluate(n_steps=100)

        qs.append(agent.approximator.predict(test_states, test_actions,
                                             idx=test_idxs))
        scores.append(np.mean(compute_J(dataset, mdp[0].info.gamma)))

    qs_hat = np.array(qs)
    avi_diff = list()
    for i in range(len(qs_hat)):
        avi_diff.append(np.linalg.norm(qs_hat[i] - test_q, ord=1) / len(test_q))

    print(avi_diff, scores)

    return avi_diff, scores