def fit(self, dataset):
    if not self._quiet:
        tqdm.write('Iteration ' + str(self._iter))

    x, u, r, xn, absorbing, last = parse_dataset(dataset)
    x = x.astype(np.float32)
    u = u.astype(np.float32)
    r = r.astype(np.float32)
    xn = xn.astype(np.float32)

    obs = to_float_tensor(x, self.policy.use_cuda)
    act = to_float_tensor(u, self.policy.use_cuda)
    v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last,
                                   self.mdp_info.gamma, self._lambda)
    np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
    adv = to_float_tensor(np_adv, self.policy.use_cuda)

    old_pol_dist = self.policy.distribution_t(obs)
    old_log_p = old_pol_dist.log_prob(act)[:, None].detach()

    self._V.fit(x, v_target, **self._critic_fit_params)

    self._update_policy(obs, act, adv, old_log_p)

    # Print fit information
    self._print_fit_info(dataset, x, v_target, old_pol_dist)
    self._iter += 1
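For reference, parse_dataset (from mushroom_rl.utils.dataset) splits a list of (state, action, reward, next_state, absorbing, last) samples into six aligned numpy arrays. A minimal standalone sketch, assuming that convention and hand-written transition values:

import numpy as np
from mushroom_rl.utils.dataset import parse_dataset

# Two illustrative transitions in the (s, a, r, s', absorbing, last) format
# produced by Core; the second one ends the episode.
dataset = [
    (np.array([0.0]), np.array([1.0]), 1.0, np.array([0.1]), False, False),
    (np.array([0.1]), np.array([0.0]), 0.0, np.array([0.2]), False, True),
]

s, a, r, ss, absorbing, last = parse_dataset(dataset)
print(s.shape, a.shape, r.shape)  # expected: (2, 1) (2, 1) (2,)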
def experiment(algorithm_class, exp):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialParameter(value=1, exp=.5,
                                   size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialParameter(value=1, exp=exp, size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(mdp.info, pi, **algorithm_params)

    # Algorithm
    start = mdp.convert_to_int(mdp._start, mdp._width)
    collect_max_Q = CollectMaxQ(agent.Q, start)
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset, collect_max_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get()

    return reward, max_Qs
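A hypothetical invocation of the grid-world experiment above. QLearning and DoubleQLearning are available in mushroom_rl.algorithms.value; the exp decay values here are illustrative, not the original settings:

from mushroom_rl.algorithms.value import QLearning, DoubleQLearning

# Illustrative learning-rate decay exponents; tune per algorithm.
reward_q, max_Qs_q = experiment(QLearning, exp=.8)
reward_dq, max_Qs_dq = experiment(DoubleQLearning, exp=.8)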
def fit(self, dataset):
    phi_state, action, reward, phi_next_state, absorbing, _ = parse_dataset(
        dataset, self.phi)
    phi_state_action = get_action_features(phi_state, action,
                                           self.mdp_info.action_space.n)

    norm = np.inf
    while norm > self._epsilon:
        q = self.approximator.predict(phi_next_state)
        if np.any(absorbing):
            q *= 1 - absorbing.reshape(-1, 1)

        next_action = np.argmax(q, axis=1).reshape(-1, 1)
        phi_next_state_next_action = get_action_features(
            phi_next_state, next_action, self.mdp_info.action_space.n)

        tmp = phi_state_action - self.mdp_info.gamma * \
            phi_next_state_next_action
        self._A += phi_state_action.T.dot(tmp)
        self._b += (phi_state_action.T.dot(reward)).reshape(-1, 1)

        old_w = self.approximator.get_weights()
        if np.linalg.matrix_rank(self._A) == self._A.shape[1]:
            w = np.linalg.solve(self._A, self._b).ravel()
        else:
            w = np.linalg.pinv(self._A).dot(self._b).ravel()
        self.approximator.set_weights(w)

        norm = np.linalg.norm(w - old_w)
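The weight update above solves A w = b, falling back to the Moore-Penrose pseudo-inverse when A is rank-deficient. A tiny standalone illustration of that fallback, using made-up matrices:

import numpy as np

A = np.array([[2.0, 0.0],
              [0.0, 0.0]])   # rank-deficient on purpose
b = np.array([[4.0], [0.0]])

if np.linalg.matrix_rank(A) == A.shape[1]:
    w = np.linalg.solve(A, b).ravel()
else:
    w = np.linalg.pinv(A).dot(b).ravel()
print(w)  # [2. 0.]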
def _fit_boosted(self, x):
    """
    Single fit iteration for boosted FQI.

    Args:
        x (list): the dataset.

    """
    state, action, reward, next_state, absorbing, _ = parse_dataset(x)
    if self._target is None:
        self._target = reward
    else:
        self._next_q += self.approximator.predict(next_state,
                                                  idx=self._idx - 1)
        if np.any(absorbing):
            self._next_q *= 1 - absorbing.reshape(-1, 1)

        max_q = np.max(self._next_q, axis=1)
        self._target = reward + self.mdp_info.gamma * max_q

    self._target -= self._prediction
    self._prediction += self._target

    self.approximator.fit(state, action, self._target, idx=self._idx,
                          **self._fit_params)

    self._idx += 1
def _fit(self, x):
    state = list()
    action = list()
    reward = list()
    next_state = list()
    absorbing = list()

    half = len(x) // 2
    for i in range(2):
        s, a, r, ss, ab, _ = parse_dataset(x[i * half:(i + 1) * half])
        state.append(s)
        action.append(a)
        reward.append(r)
        next_state.append(ss)
        absorbing.append(ab)

    if self._target is None:
        self._target = reward
    else:
        for i in range(2):
            q_i = self.approximator.predict(next_state[i], idx=i)

            amax_q = np.expand_dims(np.argmax(q_i, axis=1), axis=1)
            max_q = self.approximator.predict(next_state[i], amax_q,
                                              idx=1 - i)
            if np.any(absorbing[i]):
                max_q *= 1 - absorbing[i]
            self._target[i] = reward[i] + self.mdp_info.gamma * max_q

    for i in range(2):
        self.approximator.fit(state[i], action[i], self._target[i], idx=i,
                              **self._fit_params)
def fit(self, x):
    state, action, reward, next_state, absorbing, _ = parse_dataset(x)
    for _ in trange(self._n_iterations(), dynamic_ncols=True,
                    disable=self._quiet, leave=False):
        if self._target is None:
            self._target = reward
        else:
            self._next_q += self.approximator.predict(next_state,
                                                      idx=self._idx - 1)
            if np.any(absorbing):
                self._next_q *= 1 - absorbing.reshape(-1, 1)

            max_q = np.max(self._next_q, axis=1)
            self._target = reward + self.mdp_info.gamma * max_q

        self._target -= self._prediction
        self._prediction += self._target

        self.approximator.fit(state, action, self._target, idx=self._idx,
                              **self._fit_params)

        self._idx += 1
def compute_metrics(core, eval_params, agent_builder, cmp_E):
    """
    Function to compute the metrics.

    Args:
        core (Core): the core used to evaluate the agent;
        eval_params (dict): parameters for running the evaluation;
        agent_builder (AgentBuilder): the agent builder;
        cmp_E (bool): select if policy entropy should be computed.

    """
    dataset = core.evaluate(**eval_params)

    # Compute J
    J = np.mean(compute_J(dataset, core.mdp.info.gamma))

    # Compute R
    R = np.mean(compute_J(dataset))

    # Compute Q
    states = get_init_states(dataset)
    Q = agent_builder.compute_Q(agent=core.agent, states=states)

    # Compute Policy Entropy
    E = None
    if cmp_E:
        if agent_builder.compute_entropy_with_states:
            E = core.agent.policy.entropy(parse_dataset(dataset)[0])
        else:
            E = core.agent.policy.entropy()

    return J, R, Q, E
def fit(self, dataset):
    state, action, reward, next_state, absorbing, _ = parse_dataset(dataset)

    v, adv = compute_advantage_montecarlo(self._V, state, next_state,
                                          reward, absorbing,
                                          self.mdp_info.gamma)
    self._V.fit(state, v, **self._critic_fit_params)

    loss = self._loss(state, action, adv)
    self._optimize_actor_parameters(loss)
def fit(self, dataset):
    if not self._quiet:
        tqdm.write('Iteration ' + str(self._iter))

    state, action, reward, next_state, absorbing, last = parse_dataset(
        dataset)
    x = state.astype(np.float32)
    u = action.astype(np.float32)
    r = reward.astype(np.float32)
    xn = next_state.astype(np.float32)

    obs = to_float_tensor(x, self.policy.use_cuda)
    act = to_float_tensor(u, self.policy.use_cuda)
    v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last,
                                   self.mdp_info.gamma, self._lambda)
    np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
    adv = to_float_tensor(np_adv, self.policy.use_cuda)

    # Policy update
    self._old_policy = deepcopy(self.policy)
    old_pol_dist = self._old_policy.distribution_t(obs)
    old_log_prob = self._old_policy.log_prob_t(obs, act).detach()

    zero_grad(self.policy.parameters())
    loss = self._compute_loss(obs, act, adv, old_log_prob)

    prev_loss = loss.item()

    # Compute Gradient
    loss.backward()
    g = get_gradient(self.policy.parameters())

    # Compute direction through conjugate gradient
    stepdir = self._conjugate_gradient(g, obs, old_pol_dist)

    # Line search
    self._line_search(obs, act, adv, old_log_prob, old_pol_dist, prev_loss,
                      stepdir)

    # VF update
    self._V.fit(x, v_target, **self._critic_fit_params)

    # Print fit information
    self._print_fit_info(dataset, x, v_target, old_pol_dist)
    self._iter += 1
def _fit(self, x):
    """
    Single fit iteration.

    Args:
        x (list): the dataset.

    """
    state, action, reward, next_state, absorbing, _ = parse_dataset(x)
    if self._target is None:
        self._target = reward
    else:
        q = self.approximator.predict(next_state)
        if np.any(absorbing):
            q *= 1 - absorbing.reshape(-1, 1)

        max_q = np.max(q, axis=1)
        self._target = reward + self.mdp_info.gamma * max_q

    self.approximator.fit(state, action, self._target, **self._fit_params)
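The target construction above is the standard FQI bootstrap, target = r + gamma * max_a Q(s', a), with absorbing next states masked out so they contribute only the immediate reward. A small self-contained numpy check of that rule, using illustrative values:

import numpy as np

gamma = 0.99
reward = np.array([1.0, 0.0])
q_next = np.array([[0.5, 2.0],      # Q(s'_0, .)
                   [1.0, 3.0]])     # Q(s'_1, .)
absorbing = np.array([0, 1])        # second next state is terminal

q_next = q_next * (1 - absorbing.reshape(-1, 1))
target = reward + gamma * np.max(q_next, axis=1)
print(target)  # [2.98 0.  ]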
def experiment(alg, n_epochs, n_steps, n_episodes_test):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    gamma = 0.99
    habitat_root_path = Habitat.root_path()
    config_file = os.path.join(
        habitat_root_path,
        'habitat_baselines/config/rearrange/rl_pick.yaml')
    base_config_file = os.path.join(habitat_root_path,
                                    'configs/tasks/rearrange/pick.yaml')
    wrapper = 'HabitatRearrangeWrapper'
    mdp = Habitat(wrapper, config_file, base_config_file, gamma=gamma)

    # Settings
    initial_replay_size = 64
    max_replay_size = 50000
    batch_size = 64
    n_features = 64
    warmup_transitions = 100
    tau = 0.005
    lr_alpha = 3e-4

    use_cuda = torch.cuda.is_available()

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_mu_params = dict(network=ActorNetwork,
                           n_features=n_features,
                           input_shape=actor_input_shape,
                           output_shape=mdp.info.action_space.shape,
                           use_cuda=use_cuda)
    actor_sigma_params = dict(network=ActorNetwork,
                              n_features=n_features,
                              input_shape=actor_input_shape,
                              output_shape=mdp.info.action_space.shape,
                              use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam,
                       'params': {'lr': 3e-4}}

    critic_input_shape = actor_input_shape + mdp.info.action_space.shape
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=use_cuda)

    # Agent
    agent = alg(mdp.info, actor_mu_params, actor_sigma_params,
                actor_optimizer, critic_params, batch_size,
                initial_replay_size, max_replay_size, warmup_transitions, tau,
                lr_alpha, critic_fit_params=None)

    # Algorithm
    core = Core(agent, mdp)

    # RUN
    dataset = core.evaluate(n_episodes=n_episodes_test, render=False)
    s, *_ = parse_dataset(dataset)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy(s)
    logger.epoch_info(0, J=J, R=R, entropy=E)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    for n in trange(n_epochs, leave=False):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_episodes=n_episodes_test, render=False)
        s, *_ = parse_dataset(dataset)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy(s)
        logger.epoch_info(n + 1, J=J, R=R, entropy=E)

    logger.info('Press a button to visualize the robot')
    input()
    core.evaluate(n_episodes=5, render=True)
def experiment(alg, n_epochs, n_steps, n_steps_test):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    horizon = 200
    gamma = 0.99
    mdp = Gym('Pendulum-v1', horizon, gamma)

    # Settings
    initial_replay_size = 64
    max_replay_size = 50000
    batch_size = 64
    n_features = 64
    warmup_transitions = 100
    tau = 0.005
    lr_alpha = 3e-4

    use_cuda = torch.cuda.is_available()

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_mu_params = dict(network=ActorNetwork,
                           n_features=n_features,
                           input_shape=actor_input_shape,
                           output_shape=mdp.info.action_space.shape,
                           use_cuda=use_cuda)
    actor_sigma_params = dict(network=ActorNetwork,
                              n_features=n_features,
                              input_shape=actor_input_shape,
                              output_shape=mdp.info.action_space.shape,
                              use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam,
                       'params': {'lr': 3e-4}}

    critic_input_shape = (actor_input_shape[0] +
                          mdp.info.action_space.shape[0],)
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=use_cuda)

    # Agent
    agent = alg(mdp.info, actor_mu_params, actor_sigma_params,
                actor_optimizer, critic_params, batch_size,
                initial_replay_size, max_replay_size, warmup_transitions, tau,
                lr_alpha, critic_fit_params=None)

    # Algorithm
    core = Core(agent, mdp)

    # RUN
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    s, *_ = parse_dataset(dataset)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy(s)
    logger.epoch_info(0, J=J, R=R, entropy=E)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    for n in trange(n_epochs, leave=False):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        s, *_ = parse_dataset(dataset)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy(s)
        logger.epoch_info(n + 1, J=J, R=R, entropy=E)

    logger.info('Press a button to visualize pendulum')
    input()
    core.evaluate(n_episodes=5, render=True)
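A hypothetical entry point for the Pendulum experiment above, assuming SAC (from mushroom_rl.algorithms.actor_critic) is the intended algorithm; its constructor matches the argument list used in the agent setup. The training budget is illustrative:

if __name__ == '__main__':
    from mushroom_rl.algorithms.actor_critic import SAC

    # Illustrative budget; adjust epochs and step counts to available compute.
    experiment(alg=SAC, n_epochs=40, n_steps=1000, n_steps_test=1000)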