def runner(x):
    # Batch evaluation used by the population-based optimizer: `x` is a
    # (pop_size, n_params) array of policy parameter vectors. `self` is
    # captured from the enclosing method's scope (this is a nested closure,
    # not a bound method).
    costs_mean = []
    costs_std = []
    for i in range(x.shape[0]):
        self.policy.set_params(x[i])
        episodes = run_rollout(
            policy=self,
            env=self.env,
            n=self.n_evals_if_stochastic if self.stochastic else 1,
            eval=False,
            additional_keys=['costs', 'n_icu'],
        )
        costs_eps = np.array([
            np.sum(episodes[i_ep]['costs'], axis=0)
            for i_ep in range(self.n_evals_if_stochastic if self.stochastic else 1)
        ])
        costs_mean.append(costs_eps.mean(axis=0))
        costs_std.append(costs_eps.std(axis=0))
    return np.array(costs_mean), np.array(costs_std)

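# Illustrative sketch (not from the source): how a batch evaluator like `runner`
# can be plugged into a pymoo-style NSGA-II run. Everything below is an assumption
# for illustration (toy cost function, bounds, population size); pymoo import paths
# follow the >= 0.5 layout, and `result.X` / `result.F` mirror the `self.res.X` /
# `self.res.F` attributes read by the population-side `evaluate()` further down.
import numpy as np
from pymoo.core.problem import Problem
from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.optimize import minimize


class PolicyCostProblem(Problem):
    """Hypothetical wrapper: one decision vector = one set of policy weights."""

    def __init__(self, batch_evaluator, n_params, n_costs):
        super().__init__(n_var=n_params, n_obj=n_costs, xl=-1.0, xu=1.0)
        self.batch_evaluator = batch_evaluator  # e.g. a closure like `runner`

    def _evaluate(self, x, out, *args, **kwargs):
        # x has shape (pop_size, n_params); the evaluator returns per-policy mean costs.
        mean_costs, _ = self.batch_evaluator(x)
        out['F'] = mean_costs


def toy_evaluator(x):
    # Stand-in for `runner`: two synthetic, conflicting costs per parameter vector.
    f1 = np.sum(x ** 2, axis=1)
    f2 = np.sum((x - 1.0) ** 2, axis=1)
    return np.stack([f1, f2], axis=1), np.zeros((x.shape[0], 2))


if __name__ == '__main__':
    problem = PolicyCostProblem(toy_evaluator, n_params=10, n_costs=2)
    result = minimize(problem, NSGA2(pop_size=20), ('n_gen', 10), verbose=False)
    print(result.F.shape)  # non-dominated cost vectors, analogous to self.res.F
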
def evaluate(self, n=None, goal=None, best=None, reset_same_model=False):
    # run eval
    if n is None:
        n = self.n_evals_if_stochastic if self.env.unwrapped.stochastic else 1
    if self.goal_conditioned:
        if goal is not None:
            eval_goals = np.array([goal] * n)
        else:
            eval_goals = self.cost_function.get_eval_goals(n)
        n = eval_goals.shape[0]
    else:
        eval_goals = None
    eval_episodes = run_rollout(
        policy=self,
        env=self.env,
        n=n,
        goal=eval_goals,
        eval=True,
        reset_same_model=reset_same_model,
        additional_keys=('costs', 'constraints'),
    )
    new_logs, costs = self.compute_eval_score(eval_episodes, eval_goals)
    return new_logs, costs

def evaluate_pareto(self, load_model=True):
    # Build the Pareto front of cost trade-offs: sweep sampled goals for a
    # goal-conditioned policy, otherwise evaluate the single trained policy.
    if load_model:
        self.load_model(self.logdir + '/models/best_model.cp')
    if self.goal_conditioned:
        print('----------------\nForming pareto front')
        goals = sample_goals(self.pareto_size, self.cost_function.goal_dim)
        res = dict()
        costs_mean = []
        costs_std = []
        n = self.n_evals_if_stochastic if self.env.unwrapped.stochastic else 1
        for i_g, g in enumerate(goals):
            if (i_g + 1) % 20 == 0:
                print('\t{:.2f} %'.format((i_g + 1) / goals.shape[0] * 100))
            # Repeat the goal for each evaluation rollout, ensuring shape (n, goal_dim).
            gs = np.atleast_2d(np.array([g for _ in range(n)]))
            if gs.shape[0] != n:
                gs = gs.transpose()
            episodes = run_rollout(
                policy=self,
                env=self.env,
                n=n,
                goal=gs,
                eval=True,
                additional_keys=['costs'],
            )
            costs = np.array([np.array(e['costs']).sum(axis=0) for e in episodes])
            costs_mean.append(costs.mean(axis=0))
            costs_std.append(costs.std(axis=0))
        res['F_all'] = np.array(costs_mean)
        res['F_std_all'] = np.array(costs_std)
        res['G_all'] = goals
        # Keep only the non-dominated goal-conditioned policies.
        front_ids = compute_pareto_front(costs_mean)
        costs_mean = np.array(costs_mean)
        costs_std = np.array(costs_std)
        costs_std = costs_std[front_ids]
        costs_mean = costs_mean[front_ids]
        res['F'] = costs_mean
        res['F_std'] = costs_std
        with open(self.logdir + 'res_eval.pk', 'wb') as f:
            pickle.dump(res, f)
    else:
        print('----------------\nForming pareto front')
        res = dict()
        costs_mean = []
        costs_std = []
        n = self.n_evals_if_stochastic if self.env.unwrapped.stochastic else 1
        episodes = run_rollout(
            policy=self,
            env=self.env,
            n=n,
            eval=True,
            additional_keys=['costs'],
        )
        costs = np.array([np.array(e['costs']).sum(axis=0) for e in episodes])
        costs_mean.append(costs.mean(axis=0))
        costs_std.append(costs.std(axis=0))
        res['F'] = np.array(costs_mean)
        res['F_std'] = np.array(costs_std)
        for k in list(res.keys()):
            res[k + '_all'] = res[k]
        res['G_all'] = np.array([[self.cost_function.beta_default
                                  for _ in range(len(costs_mean))]])
        with open(self.logdir + 'res_eval.pk', 'wb') as f:
            pickle.dump(res, f)

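# `compute_pareto_front` is imported from elsewhere in the repo; the helper below is a
# hypothetical stand-in, shown only to make the filtering step above concrete
# (indices of non-dominated cost vectors, all costs being minimized). The actual
# implementation may differ.
import numpy as np


def compute_pareto_front_sketch(costs):
    """Return indices of non-dominated rows of `costs` (lower is better)."""
    costs = np.asarray(costs)
    front_ids = []
    for i, c in enumerate(costs):
        # `c` is dominated if another point is <= on every cost and < on at least one.
        dominated = np.any(np.all(costs <= c, axis=1) & np.any(costs < c, axis=1))
        if not dominated:
            front_ids.append(i)
    return np.array(front_ids, dtype=int)
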
def learn(self, num_train_steps):
    """
    Main training loop.

    Parameters
    ----------
    num_train_steps: int
        Number of training steps (environment steps).
    """
    while self.env_step_counter < num_train_steps:
        # Sample a new cost-mixture goal per episode when goal-conditioned.
        if self.goal_conditioned:
            goal = self.env.unwrapped.sample_cost_function_params()
        else:
            goal = None
        episodes = run_rollout(
            policy=self,
            env=self.env,
            n=1,
            goal=goal,
            eval=False,
            additional_keys=('costs', 'constraints'),
        )
        lengths = self.store_episodes(episodes)
        self.env_step_counter += np.sum(lengths)
        self.episode += 1
        self.aggregated_costs.append(np.sum(episodes[0]['aggregated_costs']))
        self.costs.append(np.sum(episodes[0]['costs'], axis=0))

        # Update the policy once the replay buffer holds at least one batch.
        if len(self.replay_buffer) > self.batch_size:
            update_losses = []
            for _ in range(int(np.sum(lengths) * 0.5)):
                update_losses.append(self.update())
            update_losses = np.array(update_losses)
            losses = update_losses.mean(axis=0)
        else:
            losses = [np.nan] * 2

        if self.episode % self.eval_and_log_every == 0:
            # Run evaluations
            new_logs, eval_costs = self.evaluate(
                n=self.n_evals_if_stochastic if self.stochastic else 1)
            # Compute train scores
            train_agg_cost = np.mean(self.aggregated_costs)
            train_costs = np.array(self.costs).mean(axis=0)
            self.log(self.episode, new_logs, losses, train_agg_cost, train_costs)
            # Reset training score tracking
            self.aggregated_costs = []
            self.costs = []

        if self.episode % self.save_policy_every == 0:
            self.save_model(self.logdir + '/models/policy_{}.cp'.format(self.episode))

    self.evaluate_pareto()
    print('Run has terminated successfully')

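# Illustrative driver (not from the source): how an agent exposing `learn` and
# `evaluate_pareto` might be run end to end. The class name, constructor arguments,
# and step budget are hypothetical; `learn` already ends with an `evaluate_pareto()`
# call, so the explicit call below simply re-evaluates from the best checkpoint.
#
# agent = GoalConditionedAgent(env=env, cost_function=cost_function, logdir='./runs/example')
# agent.learn(num_train_steps=100_000)    # trains, evaluating/logging every `eval_and_log_every` episodes
# agent.evaluate_pareto(load_model=True)  # reloads the best checkpoint and dumps res_eval.pk
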
def evaluate(self, n=None, all=False, best=False, goal=None, reset_same_model=False):
    # Evaluate the optimized population: re-evaluate the whole Pareto set (`all`),
    # the policy with the lowest aggregated normalized cost (`best`), the policy
    # whose costs are closest to a target trade-off (`goal`), or the currently
    # loaded policy (default).
    res = dict()
    if all:
        costs_mean = []
        costs_std = []
        for w in self.res.X:
            self.policy.set_params(w)
            episodes = run_rollout(
                policy=self,
                env=self.env,
                n=n,
                eval=True,
                reset_same_model=reset_same_model,
                additional_keys=['costs'],
            )
            costs = np.array([np.array(e['costs']).sum(axis=0) for e in episodes])
            costs_mean.append(costs.mean(axis=0))
            costs_std.append(costs.std(axis=0))
        front_ids = compute_pareto_front(costs_mean)
        costs_mean = np.array(costs_mean)
        costs_std = np.array(costs_std)
        costs_std = costs_std[front_ids]
        costs_mean = costs_mean[front_ids]
        weights = self.res.X[front_ids]
        res['F'] = costs_mean
        res['F_std'] = costs_std
        res['X'] = weights
        costs = costs_mean
    elif best:
        weights = self.res_eval['X']
        costs = self.res_eval['F']
        normalized_costs = np.array([
            c_f.scale(c) for c_f, c in zip(self.cost_function.costs,
                                           costs.transpose())
        ]).transpose()
        agg_cost = normalized_costs.sum(axis=1)
        ind_min = np.argmin(agg_cost)
        self.policy.set_params(weights[ind_min])
        episodes = run_rollout(
            policy=self,
            env=self.env,
            n=n,
            eval=True,
            additional_keys=['costs'],
        )
        costs = np.array([np.array(e['costs']).sum(axis=0) for e in episodes])
        # res['X'] = weights[ind_min]
        for i, c_m, c_std in zip(range(costs.shape[1]),
                                 costs.mean(axis=0),
                                 costs.std(axis=0)):
            res['C{} mean'.format(i)] = c_m
            res['C{} std'.format(i)] = c_std
    elif goal is not None:
        # Find the policy whose normalized costs are nearest to the target goal.
        nn_model = NearestNeighbors(n_neighbors=1)
        weights = self.res_eval['X']
        costs = self.res_eval['F']
        normalized_costs = np.array([
            c_f.scale(c) for c_f, c in zip(self.cost_function.costs,
                                           costs.transpose())
        ]).transpose()
        nn_model.fit(normalized_costs)
        normalized_goal = np.atleast_2d(np.array([
            c_f.scale(g) for c_f, g in zip(self.cost_function.costs, goal)
        ]))
        ind_nn = nn_model.kneighbors(normalized_goal,
                                     return_distance=False).flatten()
        self.policy.set_params(weights[ind_nn].flatten())
        episodes = run_rollout(
            policy=self,
            env=self.env,
            n=n,
            eval=True,
            additional_keys=['costs'],
        )
        costs = np.array([np.array(e['costs']).sum(axis=0) for e in episodes])
        res['X'] = weights[ind_nn]
        res['F'] = costs.mean(axis=0)
        res['F_std'] = costs.std(axis=0)
    else:
        episodes = run_rollout(
            policy=self,
            env=self.env,
            n=n,
            eval=True,
            additional_keys=['costs'],
        )
        costs = np.array([np.array(e['costs']).sum(axis=0) for e in episodes])
        for i, c_m, c_std in zip(range(costs.shape[1]),
                                 costs.mean(axis=0),
                                 costs.std(axis=0)):
            res['C{} mean'.format(i)] = c_m
            res['C{} std'.format(i)] = c_std
    return res, costs
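
# Hypothetical usage of the population-side `evaluate` above, assuming `agent` is an
# instance holding `res` / `res_eval` from a completed optimization run (names taken
# from the code); the goal vector is illustrative and must have one entry per cost.
#
# res_all, _ = agent.evaluate(n=30, all=True)         # re-evaluate and re-filter the whole front
# res_best, _ = agent.evaluate(n=30, best=True)       # policy with lowest aggregated normalized cost
# res_goal, _ = agent.evaluate(n=30, goal=[1e4, 50])  # policy nearest to a target cost trade-off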