import pickle as pkl

from airplane import Env
import maxent  # assumed import path for the MaxEnt IRL module used below


def train(discount, n_trajectories, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    Saves the recovered reward function to maxent_reward.pkl.

    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """
    trajectory_length = 268

    env = Env()
    trajectories = env.generate_trajectories(n_trajectories, trajectory_length,
                                             env.optimal_policy_deterministic)
    feature_matrix = env.feature_matrix()
    r = maxent.irl(feature_matrix, env.n_actions, discount,
                   env.transition_probability, trajectories, epochs,
                   learning_rate, "alpha_%d.pkl", "alpha_205.pkl", 205)
    pkl.dump(r, open("maxent_reward.pkl", 'wb'))
    return r
import pickle as pkl

import deep_maxent  # assumed import path for the deep MaxEnt IRL module used below


def main(discount, n_objects, n_colours, n_trajectories, epochs, learning_rate,
         structure):
    """
    Run deep maximum entropy inverse reinforcement learning on the objectworld
    MDP.

    Saves the recovered reward function to deep_maxent_reward.pkl.

    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    structure: Neural network structure. Tuple of hidden layer dimensions, e.g.,
        () is no neural network (linear maximum entropy) and (3, 4) is two
        hidden layers with dimensions 3 and 4.
    """
    trajectory_length = 268
    l1 = l2 = 0

    # n_objects and n_colours are chosen arbitrarily.
    env = Env(n_objects, n_colours)
    policy = env.get_policy()
    # NOTE: the number of trajectories is hardcoded to 2 here, ignoring
    # n_trajectories.
    trajectories = env.generate_trajectories(2, trajectory_length,
                                             lambda s: policy[s])
    feature_matrix = env.feature_matrix()
    r = deep_maxent.irl((feature_matrix.shape[1], ) + structure, feature_matrix,
                        env.n_actions, discount, env.transition_probability,
                        trajectories, epochs, learning_rate, l1=l1, l2=l2)
    pkl.dump(r, open('deep_maxent_reward.pkl', 'wb'))
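# Hypothetical usage sketch for main() above, mirroring the __main__ blocks in
# the other training scripts in this repo. All argument values here are
# illustrative only; the (3, 4) structure matches the docstring's example of
# two hidden layers.
if __name__ == '__main__':
    main(discount=0.9, n_objects=15, n_colours=2, n_trajectories=20,
         epochs=200, learning_rate=0.01, structure=(3, 4))
    r = pkl.load(open('deep_maxent_reward.pkl', 'rb'))
    print(r)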
import pickle as pkl

from airplane import Env
import linear_irl  # assumed import path for the LP IRL module used below


def train(grid_size, discount):
    """
    Run linear programming inverse reinforcement learning on the gridworld MDP.

    Saves the recovered reward function to lp_reward.pkl.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    """
    # grid_size is currently unused by this environment.
    env = Env(prepare_tp=True)
    r = linear_irl.irl(env.n_states, env.n_actions, env.transition_probability,
                       env.get_policy(), discount, 1, 5)
    pkl.dump(r, open("lp_reward.pkl", 'wb'))
import pickle as pkl
import numpy as np

from airplane import Env
from large_gradient_irl import LargeGradientIRL  # assumed module path


def train(discount, n_trajectories, epochs, learning_rate):
    """
    Run gradient-based (LargeGradientIRL) inverse reinforcement learning on the
    gridworld MDP.

    Saves the recovered reward function to lg_reward.pkl.

    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """
    trajectory_length = 268

    env = Env()
    trajectories = env.generate_trajectories(n_trajectories, trajectory_length,
                                             env.optimal_policy_deterministic)

    def feature_function(state):
        # One-hot feature vector with a single 1 at the state's index.
        feature = np.zeros(env.n_states)
        feature[state] = 1
        return feature

    def transitionProbability(state_code, action):
        # Map every candidate next state i to its transition probability
        # P(i | state_code, action).
        res = {}
        for i in range(env.n_states):
            res[i] = env._transition_probability(state_code, action, i)
        return res

    irl = LargeGradientIRL(env.n_actions, env.n_states, transitionProbability,
                           feature_function, discount, learning_rate,
                           trajectories, epochs)
    result = irl.gradientIterationIRL()
    reward = result[-1][0].reshape(env.n_states, )
    pkl.dump(result, open("lg_result.pkl", 'wb'))
    pkl.dump(reward, open("lg_reward.pkl", 'wb'))
    return reward
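# Hypothetical usage sketch for the large-gradient train() above; the argument
# values mirror the MaxEnt script's call train(0.01, 1, 400, 0.01) and are
# illustrative only.
if __name__ == '__main__':
    reward = train(0.01, 1, 400, 0.01)
    print(reward)  # train() also writes this reward to lg_reward.pkl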
if __name__ == '__main__':
    train(0.01, 1, 400, 0.01)

    rewards = pkl.load(open("maxent_reward.pkl", 'rb'))
    env = Env(prepare_tp=True)
    value = vi.value(env.get_policy(), env.n_states,
                     env.transition_probability, rewards, 0.3)
    opt_value = vi.optimal_value(env.n_states, env.n_actions,
                                 env.transition_probability, rewards, 0.3)
    pkl.dump(value, open("maxent_value.pkl", 'wb'))
    pkl.dump(opt_value, open("maxent_opt_value.pkl", 'wb'))

    value = pkl.load(open("maxent_value.pkl", 'rb'))
    opt_value = pkl.load(open("maxent_opt_value.pkl", 'rb'))

    status = validate(value)
    print(status)
    pkl.dump(status, open("maxent_status.pkl", 'wb'))

    status = validate(opt_value)
if __name__ == '__main__':
    # train(5, 0.2)  # uncomment to (re)compute lp_reward.pkl

    rewards = pkl.load(open("lp_reward.pkl", 'rb'))
    env = Env(prepare_tp=True)
    value = vi.value(env.get_policy(), env.n_states,
                     env.transition_probability, rewards, 0.3)
    opt_value = vi.optimal_value(env.n_states, env.n_actions,
                                 env.transition_probability, rewards, 0.3)
    pkl.dump(value, open("lp_value.pkl", 'wb'))
    pkl.dump(opt_value, open("lp_opt_value.pkl", 'wb'))

    status = validate(value)
    print(status)
    pkl.dump(status, open("lp_status.pkl", 'wb'))

    status = validate(opt_value)
    print(status)
import numpy as np

from airplane import Env


def validate(reward):
    env = Env(prepare_tp=False)

    def get_next_state_code_by_state(state):
        # Enumerate the successor (position, time) states reachable by each action.
        states = []
        next_time_code = env._next_time_hash_code(state[1])
        for i in range(env.n_actions):
            states.append((env._next_position(state[0], i), next_time_code))
        print(state, "next_state", states)
        return states

    def chose_action(ob):
        # Greedily pick an action whose successor state has the highest reward,
        # breaking ties uniformly at random.
        states_code = [
            env._get_state_code(s) for s in get_next_state_code_by_state(ob)
        ]
        rewards = np.array([reward[sc] for sc in states_code])
        max_actions = np.where(rewards == np.max(rewards))
        return np.random.choice(max_actions[0])

    def run(env):
        # Roll out the greedy policy until the episode terminates or the expert
        # trajectory length is reached, recording the visited states.
        env.reset()
        print("start running")
        i = 0
        img, observation, terminal = env.frame_step(env.do_nothing)
        states = [env._get_ob_str(observation, 0)]
        while not terminal:
            i += 1
            ob = env._get_ob_str(observation, i)
            action = chose_action(ob)
            img, observation_, terminal = env.frame_step(action)
            observation = observation_
            states.append(env._get_ob_str(observation, i + 1))
            if i >= len(env.expert_states):
                print("Done")
                break
            if terminal:
                print("Terminal: the airplane crashed")
                break
        return states

    length = len(env.expert_states)
    states = run(env)

    # Count the time steps where the rollout diverges from the expert trajectory.
    E_time = 0
    for rob, exp in zip(states, env.expert_states):
        if not (rob[0] == exp[0] and rob[1] == exp[1]):
            E_time += 1
    print("E_time", len(env.expert_states), len(states))
    # Steps missing from a rollout that ended early also count as errors.
    E_time += len(env.expert_states) - len(states)

    status = {
        "ErrorRate": E_time / length,
        "ErrorTime": E_time,
        "ResultLength": len(states),
        "ExpertLength": len(env.expert_states),
    }
    return status
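# Hypothetical usage sketch for validate(), following the pattern of the
# __main__ blocks above: load a previously saved reward vector and report how
# closely the greedy rollout tracks the expert trajectory. The pickle file name
# matches the one written by the MaxEnt training script.
if __name__ == '__main__':
    import pickle as pkl

    rewards = pkl.load(open("maxent_reward.pkl", 'rb'))
    status = validate(rewards)
    print(status)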
# Manual control of the airplane environment: type 0, 1, or any other key to
# step with that action, 'r' to reset, 's' to stop.
from PIL import Image
import pickle
import sys

from airplane import Env

gs = Env()
terminal = False
while not terminal:
    action = input()
    if action == "0":
        img, [x, y], terminal = gs.frame_step(0)
    elif action == '1':
        img, [x, y], terminal = gs.frame_step(1)
    elif action == 's':
        break
    elif action == 'r':
        gs.reset(hard=True)
        continue  # nothing to print after a reset
    else:  # any other input (e.g. '2') steps with action 2
        img, [x, y], terminal = gs.frame_step(2)
    print([x, y], terminal)
    if terminal:
        break