def test_estimate():
    from environment import GridWorldEnv
    env = GridWorldEnv(grid=[
        [0, 0, 0, 1],
        [0, 0, 0, 0],
        [0, -1, 0, 0],
        [0, 0, 0, 0],
    ])

    # Train Teacher
    teacher = PolicyIterationPlanner(env)
    teacher.plan()

    trajectories = []
    print("Gather demonstrations of teacher.")
    for i in range(20):
        s = env.reset()
        done = False
        steps = [s]
        while not done:
            a = teacher.act(s)
            n_s, r, done, _ = env.step(a)
            steps.append(n_s)
            s = n_s
        trajectories.append(steps)

    print("Estimate reward.")
    irl = MaxEntIRL(env)
    rewards = irl.estimate(trajectories, epoch=100)
    print(rewards)
    env.plot_on_grid(rewards)
def __init__(self, env, eta=0.8, prior_mean=0.0, prior_scale=0.5):
    self.env = env
    self.planner = PolicyIterationPlanner(env)
    self.eta = eta
    self._mean = prior_mean
    self._scale = prior_scale
    self.prior_dist = scipy.stats.norm(loc=prior_mean, scale=prior_scale)
def main():
    grid = [
        [CellType.NORMAL, CellType.NORMAL, CellType.NORMAL, CellType.REWARD],
        [CellType.NORMAL, CellType.BLOCK, CellType.NORMAL, CellType.DAMAGE],
        [CellType.NORMAL, CellType.NORMAL, CellType.NORMAL, CellType.NORMAL],
    ]
    env = Environment(grid)
    planner = PolicyIterationPlanner(env)
    value_grid = planner.plan()
    for row in value_grid:
        print(", ".join([str(value) for value in row]))
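# CellType itself is not defined in these snippets. A minimal sketch of an enum consistent
# with the usage above; only the member names come from the code, the numeric values are
# assumptions for illustration.
from enum import Enum

class CellType(Enum):
    NORMAL = 0   # ordinary, passable cell
    REWARD = 1   # goal cell with positive reward
    DAMAGE = -1  # penalty cell with negative reward
    BLOCK = 9    # wall cell the agent cannot enter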
def main():
    grid = [
        [0, 0, 0, 1],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
    ]

    # Prepare Teacher
    env = Environment(grid)
    planner = PolicyIterationPlanner(env)
    planner.plan()

    # Execute IRL
    irl = LinerIRL()
    irl.estimate(env, planner)
    print(irl.rewards)

    # Plot Reward Map
    ncol = env.column_length
    nrow = env.row_length
    import matplotlib.pyplot as plt
    import matplotlib.cm as cm
    fig, ax = plt.subplots()
    reward_map = irl.rewards.reshape((nrow, ncol))
    ax.imshow(reward_map, cmap=cm.RdYlGn)
    ax.set_xticks(np.arange(ncol))
    ax.set_yticks(np.arange(nrow))
    fig.tight_layout()
    plt.show()
def post(self):
    data = tornado.escape.json_decode(self.request.body)
    grid = data["grid"]
    plan_type = data["plan"]
    move_prob = 0.8  # default value
    try:
        move_prob = float(data["prob"])
    except ValueError:
        pass

    env = Environment(grid, move_prob=move_prob)
    if plan_type == "value":
        planner = ValueIterationPlanner(env)
    elif plan_type == "policy":
        planner = PolicyIterationPlanner(env)

    result = planner.plan()
    planner.log.append(result)
    self.write({"log": planner.log})
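# For reference, a sketch of a client call to the handler above. Only the payload keys
# ("grid", "plan", "prob") come from the handler; the host, port, and "/plan" path are
# assumptions for illustration.
import json
import urllib.request

payload = {
    "grid": [[0, 0, 0, 1],
             [0, 0, 0, 0],
             [0, 0, 0, 0]],
    "plan": "policy",  # "value" selects ValueIterationPlanner instead
    "prob": "0.8",     # parsed with float(); a non-numeric value falls back to the 0.8 default
}
request = urllib.request.Request(
    "http://localhost:8888/plan",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST")
with urllib.request.urlopen(request) as response:
    print(json.loads(response.read()))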
def __init__(self, env):
    self.env = env
    self.planner = PolicyIterationPlanner(env)
class MaxEntIRL():

    def __init__(self, env):
        self.env = env
        self.planner = PolicyIterationPlanner(env)

    def estimate(self, trajectories, epoch=20, learning_rate=0.01, gamma=0.9):
        state_features = np.vstack(
            [self.env.state_to_feature(s) for s in self.env.states])
        theta = np.random.uniform(size=state_features.shape[1])
        teacher_features = self.calculate_expected_feature(trajectories)

        for e in tqdm(range(epoch)):
            # Estimate reward.
            rewards = state_features.dot(theta.T)

            # Optimize policy under estimated reward.
            self.planner.reward_func = lambda s: rewards[s]
            self.planner.plan(gamma=gamma)

            # Estimate feature under policy.
            features = self.expected_features_under_policy(
                self.planner.policy, trajectories)

            # Update to close to teacher.
            update = teacher_features - features.dot(state_features)
            theta += learning_rate * update

        estimated = state_features.dot(theta.T)
        estimated = estimated.reshape(self.env.shape)
        return estimated

    def calculate_expected_feature(self, trajectories):
        features = np.zeros(self.env.observation_space.n)
        for t in trajectories:
            for s in t:
                features[s] += 1

        features /= len(trajectories)
        return features

    def expected_features_under_policy(self, policy, trajectories):
        t_size = len(trajectories)
        states = self.env.states
        transition_probs = np.zeros((t_size, len(states)))

        initial_state_probs = np.zeros(len(states))
        for t in trajectories:
            initial_state_probs[t[0]] += 1
        initial_state_probs /= t_size
        transition_probs[0] = initial_state_probs

        for t in range(1, t_size):
            for prev_s in states:
                prev_prob = transition_probs[t - 1][prev_s]
                a = self.planner.act(prev_s)
                probs = self.env.transit_func(prev_s, a)
                for s in probs:
                    transition_probs[t][s] += prev_prob * probs[s]

        total = np.mean(transition_probs, axis=0)
        return total
class MaxEntIRL():

    def __init__(self, env):
        self.env = env
        self.planner = PolicyIterationPlanner(env)

    def estimate(self, trajectories, epoch=20, learning_rate=0.01, gamma=0.9):
        # Feature matrix F: here a 16x16 identity matrix (one one-hot feature per state).
        state_features = np.vstack(
            [self.env.state_to_feature(s) for s in self.env.states])
        # 1. Initialize the parameter vector theta.
        theta = np.random.uniform(size=state_features.shape[1])
        # 2. Convert the expert demonstrations into an expected feature vector.
        teacher_features = self.calculate_expected_feature(trajectories)

        for e in tqdm(range(epoch)):
            # Estimate reward.
            # 3. Reward of each state: R(s) = theta . F
            rewards = state_features.dot(theta.T)
            # Set the reward function induced by the current parameters.
            self.planner.reward_func = lambda s: rewards[s]
            # 4. Compute a policy for the current reward function.
            self.planner.plan(gamma=gamma)
            # 5. Get the expected feature vector under that policy.
            features = self.expected_features_under_policy(
                self.planner.policy, trajectories)
            # 6. Compute the gradient: mu_expert - mu(theta).
            update = teacher_features - features.dot(state_features)
            theta += learning_rate * update

        estimated = state_features.dot(theta.T)
        estimated = estimated.reshape(self.env.shape)
        return estimated

    def calculate_expected_feature(self, trajectories):
        '''
        Build the expected feature vector from the expert data.
        :param trajectories: expert trajectory data
        :return: expected feature vector
        '''
        features = np.zeros(self.env.observation_space.n)
        for t in trajectories:
            for s in t:
                features[s] += 1

        features /= len(trajectories)
        return features

    def expected_features_under_policy(self, policy, trajectories):
        '''
        Expected feature vector under the policy obtained from the
        reward function parameterized by theta.
        :param policy: policy
        :param trajectories: expert trajectories (list of trajectories x states)
        :return: occurrence probability of each state (16-dimensional list)
        '''
        # Number of expert trajectories; in this code t_size = 20.
        t_size = len(trajectories)
        states = self.env.states
        # State occurrence probabilities: number of trajectories x number of states.
        transition_probs = np.zeros((t_size, len(states)))

        # Occurrence probability of the initial states (visit count of each state).
        initial_state_probs = np.zeros(len(states))
        # Count the initial state of each trajectory.
        # Here every trajectory starts in state 12, so initial_state_probs[12] = 20
        # and all other entries are 0.
        for t in trajectories:
            initial_state_probs[t[0]] += 1
        # Divide by t_size to turn counts into frequencies.
        initial_state_probs /= t_size
        # The first row of the table is the initial-state distribution.
        transition_probs[0] = initial_state_probs

        # Propagate the distribution t_size - 1 times following the environment's
        # transition probabilities to obtain the state occurrence probabilities.
        # Open question from the original notes: why is t_size an adequate number of
        # propagation steps? Strictly speaking, the horizon should match the length
        # of the demonstrations; using the number of trajectories is a simplification.
        for t in range(1, t_size):
            # For each possible previous state prev_s (states 0-15), multiply its
            # occurrence probability by the transition probabilities and accumulate
            # to obtain this step's distribution:
            # mu_t(s') = sum_s P(s'|s, a) * mu_{t-1}(s)
            for prev_s in states:
                # Probability of being in prev_s one step earlier.
                prev_prob = transition_probs[t - 1][prev_s]
                # Action chosen in prev_s by the current policy.
                a = self.planner.act(prev_s)
                # Transition probabilities to each next state (16-dimensional).
                probs = self.env.transit_func(prev_s, a)
                # Multiply the previous occurrence probability by the transition probability.
                for s in probs:
                    transition_probs[t][s] += prev_prob * probs[s]

        # Average over the t_size steps to get the occurrence probability of each state.
        total = np.mean(transition_probs, axis=0)
        return total
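# The step-6 gradient is just the difference between the expert's expected features and the
# features expected under the current policy. A minimal numeric sketch with toy numbers
# (3 states, identity features; all values below are illustrative assumptions):
import numpy as np

state_features = np.eye(3)                      # F: one one-hot feature per state
theta = np.zeros(3)

teacher_features = np.array([0.5, 0.3, 0.2])    # mu_expert, measured from demonstrations
policy_visitation = np.array([0.4, 0.4, 0.2])   # state occurrence under the current policy

# Gradient ascent step: theta grows where the expert visits more often than the policy does.
update = teacher_features - policy_visitation.dot(state_features)
theta += 0.01 * update
print(theta)   # [ 0.001 -0.001  0.   ]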
class BayesianIRL():

    def __init__(self, env, eta=0.8, prior_mean=0.0, prior_scale=0.5):
        self.env = env
        self.planner = PolicyIterationPlanner(env)
        self.eta = eta
        self._mean = prior_mean
        self._scale = prior_scale
        self.prior_dist = scipy.stats.norm(loc=prior_mean, scale=prior_scale)

    def estimate(self, trajectories, epoch=50, gamma=0.3, learning_rate=0.1,
                 sigma=0.05, sample_size=20):
        num_states = len(self.env.states)
        reward = np.random.normal(size=num_states,
                                  loc=self._mean, scale=self._scale)

        def get_q(r, g):
            self.planner.reward_func = lambda s: r[s]
            V = self.planner.plan(g)
            Q = self.planner.policy_to_q(V, gamma)
            return Q

        for i in range(epoch):
            noises = np.random.randn(sample_size, num_states)
            scores = []
            for n in tqdm(noises):
                _reward = reward + sigma * n
                Q = get_q(_reward, gamma)

                # Calculate prior (sum of log prob; logpdf is vectorized over the reward vector).
                reward_prior = np.sum(self.prior_dist.logpdf(_reward))

                # Calculate likelihood.
                likelihood = self.calculate_likelihood(trajectories, Q)

                # Calculate posterior.
                posterior = likelihood + reward_prior
                scores.append(posterior)

            rate = learning_rate / (sample_size * sigma)
            scores = np.array(scores)
            normalized_scores = (scores - scores.mean()) / scores.std()
            noise = np.mean(noises * normalized_scores.reshape((-1, 1)), axis=0)
            reward = reward + rate * noise
            print("At iteration {} posterior={}.".format(i, scores.mean()))

        reward = reward.reshape(self.env.shape)
        return reward

    def calculate_likelihood(self, trajectories, Q):
        mean_log_prob = 0.0
        for t in trajectories:
            t_log_prob = 0.0
            for s, a in t:
                expert_value = self.eta * Q[s][a]
                total = [self.eta * Q[s][_a] for _a in self.env.actions]
                t_log_prob += (expert_value - logsumexp(total))
            mean_log_prob += t_log_prob
        mean_log_prob /= len(trajectories)
        return mean_log_prob
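# calculate_likelihood iterates over (state, action) pairs, so BayesianIRL needs demonstrations
# that record the teacher's actions, unlike the state-only trajectories used by MaxEntIRL.
# A usage sketch, assuming the same GridWorldEnv / PolicyIterationPlanner interfaces as in
# test_estimate above:
def test_bayesian_estimate():
    from environment import GridWorldEnv
    env = GridWorldEnv(grid=[
        [0, 0, 0, 1],
        [0, 0, 0, 0],
        [0, -1, 0, 0],
        [0, 0, 0, 0],
    ])
    teacher = PolicyIterationPlanner(env)
    teacher.plan()

    trajectories = []
    for i in range(20):
        s = env.reset()
        done = False
        steps = []
        while not done:
            a = teacher.act(s)
            steps.append((s, a))  # store the (state, action) pair the likelihood expects
            n_s, r, done, _ = env.step(a)
            s = n_s
        trajectories.append(steps)

    irl = BayesianIRL(env)
    rewards = irl.estimate(trajectories)
    print(rewards)
    env.plot_on_grid(rewards)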