def setUp(self):
    grid = [['0', '0', '0', '0', '10'],
            ['0', 'x', '0', '0', '-10'],
            ['0', '0', '0', '0', '0']]
    self.grid = grid
    self.gw_deterministic = gridworld.GridWorld(grid, {(0, 4), (1, 4)}, 1)
    self.gw_non_deterministic = gridworld.GridWorld(grid, {(0, 4), (1, 4)}, 0.8)
def main():
    # Create environment
    env = gridworld.GridWorld(hard_version=False)

    # Initialize simulation
    s = env.reset()

    # Create log to store data from simulation
    log = {
        't': [0],
        's': [s],
        'a': [],
        'r': [],
    }

    # Simulate until episode is done
    done = False
    while not done:
        a = random.randrange(4)
        (s, r, done) = env.step(a)
        log['t'].append(log['t'][-1] + 1)
        log['s'].append(s)
        log['a'].append(a)
        log['r'].append(r)

    # Plot data and save to png file
    plt.plot(log['t'], log['s'])
    plt.plot(log['t'][:-1], log['a'])
    plt.plot(log['t'][:-1], log['r'])
    plt.legend(['s', 'a', 'r'])
    plt.savefig('results_gridworld.png')
def test_probabilities_gridworld(size=5):
    """
    Check transition-probabilities for GridWorld

    Args:
        size: The size of the world to be used for testing.
    """
    check_zero_probabilities(gridworld.GridWorld(size))
def setUp(self):
    grid = [['0', '0', '0', '1'],
            ['0', 'x', '0', '-1'],
            ['0', '0', '0', '0']]
    self.grid = grid
    self.gw_non_deterministic = gridworld.GridWorld(grid, {(0, 3), (1, 3)}, 0.8)
    self.agent = value_iteration.ValueIterationAgent(self.gw_non_deterministic, 0.9, 100)
def generate_random_grid(base, num_event_cells, period_range, bound,
                         mode='linear', stack=True, event_region=None):
    min_period, max_period = period_range
    free_spaces = np.argwhere(base == 0) if event_region is None else event_region
    cells = []
    for n in range(num_event_cells):
        obj = gridworld.Object(x=free_spaces[n, 1],
                               y=free_spaces[n, 0],
                               period=random.randint(min_period, max_period),
                               bound=bound)
        cells.append(obj)
    gw = gridworld.GridWorld(base, cells, person=None, viewable_distance=0,
                             mode=mode, stack=stack)
    return gw
def __init__(self, env_id=0, is_default=True, grid_size=[10, 10], state_size=[16, 16]):
    # Initialize gridworld environment
    self.gridworld = gw.GridWorld(size=grid_size, default=is_default)
    # Initialize gridworld matrix
    self.gridmatrix = self.gridworld.CreateGridWorld()
    # Initialize initial energy
    self.initial_energy = 20.0
    # Initialize control position
    self.control_position = self.gridworld.GetStartPoint()
    # Initialize agent state
    self.agent_state = gw.AgentState(self.control_position[0], self.control_position[1])
    # Initialize state size
    self.state_size = state_size
    # Initialize state generator
    self.stategenerator = StateGenerator(state_size=state_size, grid_size=grid_size)
    # Initialize field state size for get_state
    self.fstate_size = [5, 5]
    # Initialize previous parameters
    self.prev_grid_arr = self.gridworld.GetCurrentMatrix()
    self.prev_position = self.agent_state.GetCurrentPosition()
    # Initialize reward function
    self.rewardfunction = RewardFunction(pos_max=15, neg_min=-25)
    self.rewardfunction.set_delta_s(env_delta_s=self.gridworld.GetDeltaS())
    # Get gridworld endpoint
    self.endpoint = self.gridworld.GetEndPoint()
    # Initialize episode step count
    self.step_count = 0
    # Infinite resource environment parameters
    self.inf_resource = False
    self.p_terminate = -10
    self.max_steps = 200
    # Get instance id
    self.env_id = env_id
def main():
    # Create environment
    env = gridworld.GridWorld(hard_version=False)

    # Initialize simulation
    s = env.reset()

    # Create log to store data from simulation
    log = {
        't': [0],
        's': [s],
        'a': [],
        'r': [],
    }

    pi = np.ones((25, 4)) / 4
    val_iter = ValueIteration(env, pi, 0.001, 0.95)

    # Go through value iteration to find the optimal policy
    val_iter.iterate()
    pi, v = val_iter.get_policy()

    # Simulate until episode is done
    done = False
    while not done:
        a = np.argmax(pi[s])
        (s, r, done) = env.step(a)
        log['t'].append(log['t'][-1] + 1)
        log['s'].append(s)
        log['a'].append(a)
        log['r'].append(r)

    # Plot trajectory
    plt.plot(log['t'], log['s'])
    plt.plot(log['t'][:-1], log['a'])
    plt.plot(log['t'][:-1], log['r'])
    plt.legend(['s', 'a', 'r'])
    plt.title('Value Iteration Trajectory')
    plt.savefig('val_iter_gridworld.png')

    # Plot learning curve
    plt.figure()
    plt.plot(np.arange(val_iter.get_steps()), val_iter.get_means())
    plt.title('Value Iteration Learning Curve')
    plt.savefig('val_iter_means.png')

    # Visualize policy
    plt.figure()
    plt.pcolor(pi)
    plt.title('Value Iteration Policy')
    plt.savefig('val_iter_policy.png')
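# For reference: a minimal sketch of the Bellman backup that a ValueIteration
# helper like the one used above typically performs. Only the tolerance and gamma
# values come from the snippet; the function name, array shapes, and body below
# are assumptions for illustration, not the original implementation.
import numpy as np

def value_iteration_sketch(P, R, gamma=0.95, tol=1e-3):
    """Hypothetical tabular backup; P and R are assumed to have shape (S, A, S')."""
    num_states, num_actions, _ = P.shape
    V = np.zeros(num_states)
    while True:
        # Q[s, a] = sum_s' P[s, a, s'] * (R[s, a, s'] + gamma * V[s'])
        Q = np.einsum('sap,sap->sa', P, R + gamma * V[None, None, :])
        V_new = Q.max(axis=1)
        if np.max(np.abs(V_new - V)) < tol:
            return V_new, Q.argmax(axis=1)  # value function and greedy policy
        V = V_new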
def get_fidelity(height, model):
    input_size = height * 5 * 4
    grid = g.GridWorld(height)
    count = height * 5 - (height - 2)
    fidelity = 0
    for i in range(height):
        for j in range(5):
            # If not wall
            if not grid.state[(i, j)][2]:
                grid.place_player((i, j))
                q_value = model.predict(grid.state.reshape(1, input_size), batch_size=1)
                action = np.argmax(q_value)
                fidelity += grid.check_optimal_policy((i, j), action)
    print(fidelity / count)
    return fidelity / count
def main():
    # Initialize Gridworld environment
    env = gridworld.GridWorld()

    # Initialize REINFORCE algorithm
    reinforce = REINFORCE(args, env)

    # Run REINFORCE algorithm
    for episode in range(args.episodes):
        print('Episode ' + str(episode + 1) + '/' + str(args.episodes))
        reinforce.train()

    # Save results once training is finished
    reinforce.save_model()
def test_training(model, height=3, num_of_steps=10):
    grid = g.GridWorld(height)
    input_size = height * 5 * 4
    total_reward = 0
    print("Initial State:")
    print(grid.display_grid())

    # While game still in progress
    for i in range(num_of_steps):
        q_value = model.predict(grid.state.reshape(1, input_size), batch_size=1)
        # Take action with highest Q-value
        action = np.argmax(q_value)
        print('Move #: %s; Taking action: %s' % (i, action))
        grid.agent_move(action)
        grid.display_grid()
        reward = grid.get_reward()
        total_reward += reward
    print("Max steps reached, total reward: {}".format(total_reward))
def generate_random_grid(base, num_event_cells, period_range, bound,
                         mode='linear', stack=True, event_region=None,
                         extra_event_region=[]):
    min_period, max_period = period_range
    free_spaces = np.argwhere(base == 0) if event_region is None else event_region
    np.random.shuffle(free_spaces)
    cells = []
    for n in range(num_event_cells):
        obj = gridworld.Object(x=free_spaces[n, 1],
                               y=free_spaces[n, 0],
                               period=random.randint(min_period, max_period),
                               bound=bound)
        cells.append(obj)
    pos = (free_spaces[num_event_cells, 1], free_spaces[num_event_cells, 0])
    person = None
    if mode == "person":
        person = gridworld.Person((free_spaces[num_event_cells, 1],
                                   free_spaces[num_event_cells, 0]))
        cells = [
            gridworld.Object(x=free_spaces[n, 1],
                             y=free_spaces[n, 0],
                             period=random.randint(min_period, max_period),
                             bound=bound)
            for n in range(len(free_spaces))
        ]
    gw = gridworld.GridWorld(base, cells, person=person, initialpos=pos,
                             viewable_distance=0, mode=mode, stack=stack,
                             extra_event_region=extra_event_region)
    return gw
    raise NotImplementedError()

    # You shouldn't need to touch this part.
    c = cvx.matrix(c)
    G = cvx.matrix(G)
    h = cvx.matrix(h)
    sol = cvx.solvers.lp(c, G, h)

    R = np.asarray(sol["x"][:nS]).squeeze()
    return R


if __name__ == "__main__":
    env = gridworld.GridWorld(map_name='8x8')

    # Generate policy from Q3.2.1
    gamma = 0.9
    Vs, n_iter = rl.value_iteration(env, gamma)
    policy = rl.policy_from_value_function(env, Vs, gamma)

    T = env.generateTransitionMatrices()

    # Q3.3.5
    # Set R_max and l1 as you want.
    R = irl_lp(policy, T, gamma, R_max, l1)

    # You can test out your R by re-running VI with your new rewards as follows:
    # env_irl = gridworld.GridWorld(map_name='8x8', R=R)
    # Vs_irl, n_iter_irl = rl.value_iteration(env_irl, gamma)
def main():
    # Argument to initialize grid world
    if args.gridworld:
        # Create environment
        env = gridworld.GridWorld(hard_version=False)

        # Initializations
        P = env.p  # state transition probability
        R = env.r  # reward function
        V = np.zeros(env.num_states)  # state value function
        policy = np.zeros(env.num_states)
        Q = np.zeros((env.num_states, env.num_actions))  # state-action value function

        # Argument to run Value Iteration
        if args.value_iteration:
            V_optimal, policy_optimal, mean_VF_list, iterations = ValueIteration(
                args, P, R, V, policy, env)
            data = [V_optimal, policy_optimal, mean_VF_list, iterations]

            # Save data to pickle for deliverable generation
            if not os.path.isdir('./gridworld_data/'):
                os.makedirs('./gridworld_data/')
            filename = 'value_iteration_data.pkl'
            with open('gridworld_data/' + filename, 'wb') as f:
                pickle.dump(data, f)

        # Argument to run Policy Iteration
        if args.policy_iteration:
            V_optimal, policy_optimal, mean_VF_list, iterations = PolicyIteration(
                args, P, R, V, policy, env)
            data = [V_optimal, policy_optimal, mean_VF_list, iterations]

            # Save data to pickle for deliverable generation
            if not os.path.isdir('./gridworld_data/'):
                os.makedirs('./gridworld_data/')
            filename = 'policy_iteration_data.pkl'
            with open('gridworld_data/' + filename, 'wb') as f:
                pickle.dump(data, f)

        # Argument to run SARSA
        if args.SARSA:
            Q_optimal, policy_optimal, discounted_rewards = SARSA(args, Q, env)
            V_estimate = TD_Zero(args, V, policy_optimal, env)
            data = [Q_optimal, policy_optimal, discounted_rewards, V_estimate]

            # Save data to pickle for deliverable generation
            if not os.path.isdir('./gridworld_data/'):
                os.makedirs('./gridworld_data/')
            filename = 'SARSA_data_alpha=' + str(args.alpha) + '_epsilon=' + str(args.epsilon) + '.pkl'
            with open('gridworld_data/' + filename, 'wb') as f:
                pickle.dump(data, f)

        # Argument to run Q-learning
        if args.q_learning:
            Q_optimal, policy_optimal, discounted_rewards = Q_learning(args, Q, env)
            V_estimate = TD_Zero(args, V, policy_optimal, env)
            data = [Q_optimal, policy_optimal, discounted_rewards, V_estimate]

            # Save data to pickle for deliverable generation
            if not os.path.isdir('./gridworld_data/'):
                os.makedirs('./gridworld_data/')
            filename = 'Q_Learning_data_alpha=' + str(args.alpha) + '_epsilon=' + str(args.epsilon) + '.pkl'
            with open('gridworld_data/' + filename, 'wb') as f:
                pickle.dump(data, f)

    # Argument to initialize pendulum
    if args.pendulum:
        env = discrete_pendulum.Pendulum()

        # Initializations
        Q = np.zeros((env.num_states, env.num_actions))
        V = np.zeros(env.num_states)  # state value function

        # Argument to run SARSA
        if args.SARSA:
            Q_optimal, policy_optimal, discounted_rewards = SARSA(args, Q, env)
            V_estimate = TD_Zero(args, V, policy_optimal, env)
            data = [Q_optimal, policy_optimal, discounted_rewards, V_estimate]

            # Save data to pickle for deliverable generation
            if not os.path.isdir('./pendulum_data/'):
                os.makedirs('./pendulum_data/')
            filename = 'SARSA_data_alpha=' + str(args.alpha) + '_epsilon=' + str(args.epsilon) + '.pkl'
            with open('pendulum_data/' + filename, 'wb') as f:
                pickle.dump(data, f)

        # Argument to run Q-learning
        if args.q_learning:
            Q_optimal, policy_optimal, discounted_rewards = Q_learning(args, Q, env)
            V_estimate = TD_Zero(args, V, policy_optimal, env)
            data = [Q_optimal, policy_optimal, discounted_rewards, V_estimate]

            # Save data to pickle for deliverable generation
            if not os.path.isdir('./pendulum_data/'):
                os.makedirs('./pendulum_data/')
            filename = 'Q_Learning_data_alpha=' + str(args.alpha) + '_epsilon=' + str(args.epsilon) + '.pkl'
            with open('pendulum_data/' + filename, 'wb') as f:
                pickle.dump(data, f)
               new_action):
    # Fill in this function (see the sketch after this snippet)
    return Q_table


# -------------------- #
#   Create the Task    #
# -------------------- #

# Task Parameters
task_name = short_hallway
action_error_prob = .1
pit_reward = -500

task = gridworld.GridWorld(task_name,
                           action_error_prob=action_error_prob,
                           rewards={'*': 50, 'moved': -1, 'hit-wall': -1, 'X': pit_reward})
task.get_max_reward()

# ---------------- #
#   Run the Task   #
# ---------------- #

# Algorithm Parameters
alpha = .5
epsilon = .1
gamma = .99

state_count = task.num_states
action_count = task.num_actions
episode_count = 250
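# The stub above only says "Fill in this function"; here is a minimal sketch of
# the tabular SARSA update it presumably expects. The argument order mirrors the
# update_Q_SARSA call that appears later in this collection; the body itself is
# an assumption, not the original author's code.
def update_Q_SARSA(Q_table, alpha, gamma, state, action, reward, new_state, new_action):
    # On-policy TD(0) target: bootstrap on the action actually taken next.
    td_target = reward + gamma * Q_table[new_state, new_action]
    Q_table[state, action] += alpha * (td_target - Q_table[state, action])
    return Q_table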
def run_qlearning():
    gw = gridworld.GridWorld()
    plt = plot.Plot()
    qlearn.q_learning(gw, plt)
def run_experiment(size):
    print "size", size
    rows = size
    cols = size
    #reward = [[0,0,0,-1,0],[0,-1,0,-1,0],[1,-1,0,0,0]]  #true expert reward
    reward = np.reshape([np.random.randint(-10, 10) for _ in range(rows * cols)],
                        (rows, cols))  #true expert reward
    terminals = []  #no terminals, you can change this if you want
    gamma = 0.9  #discount factor for mdp
    grid = gridworld.GridWorld(reward, terminals, gamma)  #create grid world
    #print "expert reward"
    #util.print_reward(grid)

    pi_star, V_star = mdp_solver.policy_iteration(grid)  #solve for expert policy
    #print pi_star
    #print "expert policy"
    #util.print_policy(grid, pi_star)
    #print "expert value function"
    #util.print_grid(grid, np.reshape(V_star, (grid.rows, grid.cols)))
    Q_star = mdp_solver.calc_qvals(grid, pi_star, V_star, gamma)
    #print "expert Q-values"
    #print Q_star

    #give optimal action in each (non-terminal) state as demonstration
    #we can test giving demonstrations in some but not all states, or even noisy
    #demonstrations to see what happens if we want
    demo = [(state, np.argmax(Q_star[state, :]))
            for state in range(grid.num_states) if state not in terminals]
    #print "demonstration", demo

    ####### gradient descent starting from random guess at expert's reward
    reward_guess = np.reshape([np.random.randint(-10, 10) for _ in range(grid.num_states)],
                              (grid.rows, grid.cols))
    #create new mdp with reward_guess as reward
    mdp = gridworld.GridWorld(reward_guess, terminals, gamma)  #create markov chain

    start = timeit.default_timer()
    num_steps = 10
    c = 0.5  #we should experiment with step sizes
    print "----- gradient descent ------"
    for step in range(num_steps):
        #calculate optimal policy for current estimate of reward
        pi, V = mdp_solver.policy_iteration(mdp)
        #print "new policy"
        #print pi_star
        #calculate Q values for current estimate of reward
        Q = mdp_solver.calc_qvals(mdp, pi, V, gamma)
        #print "new Qvals"
        #print log-likelihood
        #print "log-likelihood posterior", birl.demo_log_likelihood(demo, Q)
        step_size = c / np.sqrt(step + 1)
        #print "stepsize", step_size
        #calculate gradient of posterior wrt reward
        grad = birl.calc_reward_gradient(demo, mdp, mdp.R, eta=1.0)
        #update reward
        R_new = mdp.R + step_size * grad
        #print "new reward"
        #print R_new
        #update mdp with new reward
        mdp.set_reward(R_new)
    stop = timeit.default_timer()

    #print "recovered reward"
    #util.print_reward(mdp)
    pi, V = mdp_solver.policy_iteration(mdp)
    #print "resulting optimal policy"
    #util.print_policy(mdp, pi)
    print "policy difference"
    #print np.linalg.norm(pi_star - pi)
    runtime = stop - start
    print "runtime for size", size, "=", runtime
    f = open("../results/runtime_size" + str(size) + ".txt", "w")
    f.write(str(runtime))
    f.close()
def test_probabilities_gridworld(size=5):
    """ Check transition-probabilities for GridWorld """
    check_zero_probabilities(gridworld.GridWorld(size))
plt.plot(avg_causality_importance, label='C + IS')
plt.plot(ADAM_avg_base, '--', label='ADAM: Base Model')
plt.plot(ADAM_avg_baseline_importance, '--', label='ADAM: BS + IS')
plt.plot(ADAM_avg_baseline_causality, '--', label='ADAM: BS + C')
plt.plot(ADAM_avg_baseline_causality_importance, '--', label='ADAM: BS + C + IS')
plt.plot(ADAM_avg_causality_importance, '', label='ADAM: C + IS')
plt.xlabel('Simulation Steps (10 Episodes/Step)')
plt.ylabel('Total Reward')
plt.title('Learning Curve Comparison (SGD vs ADAM): 10,000 Episodes')
plt.legend(bbox_to_anchor=(.975, 1.0), loc='upper left')
plt.savefig('./generated_results/SGD_ADAM_learning_curve_10000_episodes.png')
plt.show()

env = gridworld.GridWorld()
weights = base1_weights.detach().numpy()
policy = []
for s in range(env.num_states):
    policy.append(np.argmax(weights[s]))
policy = np.reshape(np.asarray(policy), (5, 5))

grid_x = [-0.5, 0.5, 1.5, 2.5, 3.5, 4.5]
grid_y = [-0.5, 0.5, 1.5, 2.5, 3.5, 4.5]
fig = plt.figure()
plt.imshow(policy, cmap='coolwarm')
plt.colorbar()
plt.xticks(grid_x)
def __init__(self, width, height, **kwargs):
    self._displayer = gridworld_displayer.PyGameDisplayer(width, height)
    self._gridworld = gridworld.GridWorld(width, height, **kwargs)
    self.prev_state = None
    self.state = self._gridworld.get_state()
        yield l[idx:idx + n]


############  end of helper functions  ###########
############  main starts here  ##############
if __name__ == '__main__':
    import gridworld
    from value_iteration import ValueIteration

    #setting env
    X, Y = 5, 5

    #setting reward
    grid_shape = [X, Y]
    reward = np.full(np.prod(grid_shape), 0.0)

    #setting expert
    env = gridworld.GridWorld(grid_shape, reward)
    gamma = 0.9  #0.99, 0.95, 0.90, 0.85, 0.80

    #traj =[[20, 21, 22, 23, 24, 19, 14, 9, 4], [20, 15, 10, 11, 6, 7, 2, 3, 4], [20, 15, 10, 5, 6, 7, 2, 3, 4], [20, 15, 10, 11, 6, 7, 2, 3, 4], [20, 21, 22, 23, 24, 19, 14, 9, 4], [20, 21, 16, 11, 12, 13, 8, 3, 4], [20, 15, 16, 17, 12, 7, 2, 3, 4], [20, 15, 10, 5, 6, 7, 8, 3, 4], [20, 15, 16, 11, 6, 7, 8, 9, 4], [20, 21, 16, 11, 6, 7, 8, 9, 4], [20, 15, 16, 17, 12, 13, 14, 9, 4], [20, 21, 22, 17, 12, 7, 8, 3, 4], [20, 15, 16, 17, 12, 13, 8, 3, 4], [20, 21, 16, 11, 6, 1, 2, 3, 4], [20, 21, 16, 17, 12, 7, 2, 3, 4], [20, 15, 10, 5, 6, 7, 2, 3, 4], [20, 15, 16, 11, 6, 7, 8, 3, 4], [20, 15, 16, 11, 6, 7, 2, 3, 4], [20, 21, 16, 11, 12, 7, 2, 3, 4], [20, 15, 10, 5, 0, 1, 2, 3, 4], [20, 21, 22, 17, 18, 13, 14, 9, 4], [20, 15, 10, 5, 6, 1, 2, 3, 4], [20, 21, 22, 17, 18, 19, 14, 9, 4], [20, 21, 22, 17, 12, 7, 2, 3, 4], [20, 15, 10, 11, 12, 7, 2, 3, 4], [20, 15, 16, 11, 6, 7, 8, 3, 4], [20, 21, 22, 23, 24, 19, 14, 9, 4], [20, 21, 16, 17, 18, 19, 14, 9, 4], [20, 15, 10, 11, 12, 7, 2, 3, 4], [20, 21, 22, 23, 18, 13, 14, 9, 4], [20, 21, 16, 17, 18, 19, 14, 9, 4], [20, 15, 16, 17, 12, 13, 8, 3, 4], [20, 15, 10, 5, 0, 1, 2, 3, 4], [20, 15, 16, 11, 6, 7, 8, 9, 4], [20, 15, 10, 11, 12, 13, 8, 9, 4], [20, 15, 16, 17, 18, 13, 14, 9, 4], [20, 21, 22, 17, 12, 13, 8, 3, 4], [20, 15, 16, 17, 18, 13, 8, 3, 4], [20, 21, 22, 17, 12, 7, 8, 3, 4], [20, 21, 22, 23, 18, 19, 14, 9, 4], [20, 15, 16, 17, 12, 13, 8, 3, 4], [20, 21, 16, 11, 6, 1, 2, 3, 4], [20, 15, 16, 11, 12, 13, 8, 9, 4], [20, 15, 10, 5, 0, 1, 2, 3, 4], [20, 21, 16, 11, 12, 13, 8, 3, 4], [20, 21, 22, 17, 18, 13, 14, 9, 4], [20, 15, 10, 11, 6, 7, 2, 3, 4], [20, 21, 22, 17, 18, 19, 14, 9, 4], [20, 15, 16, 11, 12, 13, 8, 3, 4], [20, 21, 22, 17, 12, 7, 2, 3, 4], [20, 21, 22, 17, 18, 19, 14, 9, 4], [20, 21, 16, 11, 6, 1, 2, 3, 4], [20, 21, 22, 17, 12, 13, 8, 3, 4], [20, 15, 16, 17, 18, 13, 8, 9, 4], [20, 15, 16, 11, 6, 7, 2, 3, 4], [20, 15, 10, 11, 12, 7, 8, 3, 4], [20, 15, 10, 5, 0, 1, 2, 3, 4], [20, 21, 16, 11, 12, 13, 8, 3, 4], [20, 15, 10, 11, 6, 1, 2, 3, 4], [20, 15, 10, 5, 6, 7, 8, 3, 4], [20, 21, 16, 11, 6, 7, 2, 3, 4], [20, 15, 10, 11, 6, 7, 8, 9, 4], [20, 21, 22, 23, 18, 19, 14, 9, 4], [20, 15, 10, 5, 0, 1, 2, 3, 4], [20, 21, 16, 17, 18, 19, 14, 9, 4], [20, 21, 16, 17, 12, 7, 2, 3, 4], [20, 21, 22, 23, 18, 13, 8, 3, 4], [20, 21, 16, 11, 6, 1, 2, 3, 4], [20, 15, 16, 17, 12, 13, 14, 9, 4], [20, 15, 16, 17, 18, 19, 14, 9, 4], [20, 21, 22, 17, 12, 13, 14, 9, 4], [20, 15, 10, 5, 0, 1, 2, 3, 4], [20, 15, 10, 11, 6, 7, 8, 9, 4], [20, 21, 22, 23, 18, 13, 8, 9, 4], [20, 21, 16, 17, 12, 13, 14, 9, 4], [20, 21, 16, 17, 18, 19, 14, 9, 4], [20, 15, 10, 11, 6, 7, 8, 3, 4], [20, 21, 16, 17, 18, 19, 14, 9, 4], [20, 21, 22, 17, 18, 13, 8, 9, 4], [20, 21, 16, 17, 12, 7, 2, 3, 4], [20, 15, 10, 11, 12, 13, 14, 9, 4], [20, 15, 16, 17, 18, 19, 14, 9, 4], [20, 21, 16, 17, 12, 7, 8, 9, 4], [20, 15, 16, 17, 18, 13, 8, 3, 4], [20, 21, 16, 11, 12, 7, 2, 3, 4], [20, 21, 16, 11, 12, 13, 8, 3, 4], [20, 15, 16, 11, 6, 7, 2, 3, 4], [20, 21, 22, 17, 18, 19, 14, 9, 4], [20, 21, 16, 11, 6, 7, 2, 3, 4], [20, 21, 22, 17, 12, 13, 8, 9, 4], [20, 21, 22, 17, 12, 7, 8, 9, 4], [20, 21, 22, 17, 18, 13, 8, 3, 4], [20, 21, 16, 11, 12, 7, 8, 9, 4], [20, 21, 22, 17, 12, 13, 14, 9, 4], [20, 15, 16, 17, 18, 19, 14, 9, 4], [20, 15, 16, 17, 12, 7, 2, 3, 4], [20, 15, 16, 11, 6, 1, 2, 3, 4], [20, 15, 16, 17, 18, 13, 8, 9, 4], [20, 21, 22, 17, 12, 7, 8, 9, 4], [20, 15, 10, 5, 6, 7, 2, 3, 4]]
    traj = [[20, 21, 22, 23, 18, 13, 8, 9, 4], [20, 21, 16, 17, 18, 13, 14, 9, 4], [20, 15, 10, 11, 12, 7, 2, 3, 4], [20, 15, 16, 17, 12, 13, 8, 3, 4], [20, 21, 16, 11, 12, 7, 8, 3, 4], [20, 15, 16, 17, 12, 7, 8, 3, 4], [20, 21, 22, 23, 24, 19, 14, 9, 4], [20, 15, 10, 5, 6, 7, 2, 3, 4], [20, 21, 16, 11, 6, 7, 2, 3, 4], [20, 21, 22, 23, 18, 13, 8, 9, 4], [20, 21, 22, 23, 24, 19, 14, 9, 4], [20, 21, 16, 17, 18, 13, 14, 9, 4], [20, 15, 10, 11, 12, 13, 14, 9, 4], [20, 21, 16, 11, 6, 1, 2, 3, 4], [20, 15, 16, 11, 12, 13, 14, 9, 4], [20, 15, 10, 5, 6, 1, 2, 3, 4],
### HELPER CODE ####

### INITIALIZE GRID ###
# Create the grid for Problem 2.
grid = ['..,..',
        '..,..',
        'o.,..',
        '.?,.*']

# Create the Task
# Task Parameters
task = gridworld.GridWorld(grid,
                           terminal_markers={'*', '?'},
                           rewards={'.': -1, '*': 50, '?': 5, ',': -50, 'o': -1})

# Algorithm Parameters
gamma = .75

state_count = task.num_states
action_count = task.num_actions
row_count = len(grid)
col_count = len(grid[0])

# -------------- #
#   Make Plots   #
# -------------- #
def execute_configuration(config=DEFAULT_CONFIG, row_index=0, column_index=0, height=1, width=1):
    task = gridworld.GridWorld(TASK_MAP[config['task_name']],
                               action_error_prob=config['action_error_prob'],
                               rewards={'*': 50, 'moved': -1, 'hit-wall': -1,
                                        'X': config['pit_reward']})
    task.get_max_reward()

    # Loop over some number of episodes
    episode_reward_set = np.zeros((config['rep_count'], config['episode_count']))
    for rep_iter in range(config['rep_count']):
        # Initialize the Q table
        Q_table = np.zeros((task.num_states, task.num_actions))

        # Loop until the episode is done
        for episode_iter in range(config['episode_count']):
            # Start the task
            task.reset()
            state = task.observe()
            action = policy(state, Q_table, task.num_actions, config['epsilon'])
            episode_reward_list = []
            task_iter = 0

            # Loop until done -- check when do we get the final state reward?
            while True:
                task_iter = task_iter + 1
                new_state, reward = task.perform_action(action)
                new_action = policy(new_state, Q_table, task.num_actions, config['epsilon'])

                # Update the Q_table.
                if config['method'] == 'sarsa':
                    Q_table = update_Q_SARSA(Q_table, config['alpha'], config['gamma'],
                                             state, action, reward, new_state, new_action)
                elif config['method'] == 'qlearning':
                    Q_table = update_Q_Learning(Q_table, config['alpha'], config['gamma'],
                                                state, action, reward, new_state)
                else:
                    sys.exit("Unrecognized algorithm %s. Consider adding support?"
                             % config['method'])

                # Store the data
                episode_reward_list.append(reward)

                # Stop if at goal, else update for the next iteration
                if task.is_terminal(state) or task_iter > config['episode_max_length']:
                    episode_reward_set[rep_iter, episode_iter] = np.sum(episode_reward_list)
                    break
                else:
                    state = new_state
                    action = new_action

    add_plot(config, Q_table, episode_reward_set, row_index, column_index, width, height)
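# update_Q_Learning is called above but not defined anywhere in this collection;
# below is a minimal sketch of the tabular off-policy update it presumably
# implements, matching only the call's argument order. Note it takes no
# new_action, since Q-learning bootstraps on the greedy next action. This is an
# assumed implementation, not the original author's code.
import numpy as np

def update_Q_Learning(Q_table, alpha, gamma, state, action, reward, new_state):
    # Off-policy TD(0) target: bootstrap on the best next action, not the one chosen.
    td_target = reward + gamma * np.max(Q_table[new_state, :])
    Q_table[state, action] += alpha * (td_target - Q_table[state, action])
    return Q_table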
    print(grid.display_grid())

    # While game still in progress
    for i in range(num_of_steps):
        q_value = model.predict(grid.state.reshape(1, input_size), batch_size=1)
        # Take action with highest Q-value
        action = np.argmax(q_value)
        print('Move #: %s; Taking action: %s' % (i, action))
        grid.agent_move(action)
        grid.display_grid()
        reward = grid.get_reward()
        total_reward += reward
    print("Max steps reached, total reward: {}".format(total_reward))


if __name__ == "__main__":
    if len(sys.argv) > 1:
        height = int(sys.argv[1])  # cast: height is used in arithmetic below
        env = g.GridWorld(height)
    else:
        height = 3
        env = g.GridWorld()
    num_of_steps = 14
    for index in range(5):
        model = model_init(height)
        f = training_easy(env, model, 3, height, num_of_steps)
        test_training(model)
        plt.plot(f[0], f[1])
    plt.show()
### HELPER CODE ####

### INITIALIZE GRID ###
# Create the grid for Problem 2.
grid = ['..,..',
        '..,..',
        'o.,..',
        '.?,.*']

# Create the Task
# Task Parameters
action_error_prob = .2
task = gridworld.GridWorld(grid,
                           action_error_prob=action_error_prob,
                           terminal_markers={'*', '?'},
                           rewards={'.': -1, '*': 50, '?': 5, ',': -50, 'o': -1})

# Algorithm Parameters
gamma = .75

state_count = task.num_states
action_count = task.num_actions
row_count = len(grid)
col_count = len(grid[0])

# -------------- #
#   Make Plots   #
# -------------- #
        for j in range(width):
            if isInt:
                sys.stdout.write("%6s" % str('%d' % printArray[(i * height) + j]) + " ")
            else:
                sys.stdout.write("%6s" % str('%02.2f' % printArray[(i * height) + j]) + " ")
        sys.stdout.write("\n\n")
    sys.stdout.flush()


if __name__ == "__main__":
    env = gridworld.GridWorld(map_name='8x8')

    # Generate policy from Q3.2.1
    gamma = 0.9
    Vs, n_iter = rl.value_iteration(env, gamma)
    policy = rl.policy_from_value_function(env, Vs, gamma)

    T = env.generateTransitionMatrices()

    # Q3.3.5
    # Set R_max and l1 as you want.
    R_max = 1
    l1 = 0.5
    R = irl_lp(policy, T, gamma, R_max, l1)

    printGridWorld("IRL-generated Rewards", R, 8, 8, False)
import numpy as np
import mdp_solver
import gridworld
import util
import birl

#gradient descent on reward
reward = [[0, 0, 0],
          [0, -1, 0],
          [1, -1, 0]]
terminals = [6]
gamma = 0.9
simple_world = gridworld.GridWorld(reward, terminals, gamma)
print "reward"
util.print_reward(simple_world)

pi_star, V_star = mdp_solver.policy_iteration(simple_world)
print "optimal policy"
util.print_policy(simple_world, pi_star)
Q_star = mdp_solver.calc_qvals(simple_world, pi_star, V_star, gamma)
print "q-vals"
print Q_star

#give optimal action in each state as demonstration
demo = [(state, np.argmax(Q_star[state, :]))
        for state in range(simple_world.num_states)]
print demo

#compute the gradient of R_guess
#TODO get an actual guess and update it towards real R
num_states = simple_world.num_states
num_actions = simple_world.num_actions
print "gradient"
import numpy as np
import mdp_solver
import gridworld
import util
import birl_optimized as birl

##test script for running gradient descent for bayesian inverse reinforcement learning
##domain is a simple grid world (see gridworld.py)
##TODO I haven't incorporated a prior so this really is more of a maximum likelihood
##rather than bayesian irl algorithm

reward = [[0, 0, 0, -1, 0, 0, 0],
          [0, -1, 0, -1, 0, -1, 0],
          [0, -1, 0, -1, 0, -1, 0],
          [1, -1, 0, 0, 0, -1, 0]]  #true expert reward
terminals = [21]  #terminal state, you can change this if you want
gamma = 0.95  #discount factor for mdp
grid = gridworld.GridWorld(reward, terminals, gamma)  #create grid world
print "expert reward"
util.print_reward(grid)

pi_star, V_star = mdp_solver.policy_iteration(grid)  #solve for expert policy
print pi_star
print "expert policy"
util.print_policy(grid, pi_star)
print "expert value function"
util.print_grid(grid, np.reshape(V_star, (grid.rows, grid.cols)))
Q_star = mdp_solver.calc_qvals(grid, pi_star, V_star, gamma)
print "expert Q-values"
print Q_star

#give optimal action in each (non-terminal) state as demonstration
#we can test giving demonstrations in some but not all states, or even noisy
#demonstrations to see what happens if we want
demo = [(state, np.argmax(Q_star[state, :]))
        for j in range(X.shape[1]):
            if text is not None:
                v = text[int(X[i, j])]
            else:
                v = X[i, j]
                factor = 10.0 * dec
                v = math.trunc(v * factor) / factor
            ax.text(j, i, v, ha="center", va="center", color="w")
    plt.savefig(f"{title}.png")
    # plt.show()
    plt.close()


if __name__ == "__main__":
    mapname = "8x8"
    env = gridworld.GridWorld(map_name=mapname)
    gw, gh = int(mapname[0]), int(mapname[-1])

    # Play around with these values if you want!
    gamma = 0.9
    alpha = 0.05
    n = 4
    action_names = ['L', 'D', 'R', 'U']

    # Q3.2.1
    print(f"\n** q3.2.1 value iteration")
    V_vi, n_iter = value_iteration(env, gamma)
    plot(V_vi.reshape(gw, gh), title='value_iteration')
    print(f"value iteration converged after {n_iter} steps")
    policy = policy_from_value_function(env, V_vi, gamma)
    plot(policy.reshape(gw, gh), title='policy_from_value_iteration', text=action_names)