# Shared third-party imports for the runPolicies variants below; the project
# classes (GridWorldModel, GridWorldGasEnv, GridWorldEnv, ValueIterationPlanner)
# are assumed to be imported from this repository's own modules.
import copy

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt


def runPolicies(demonstrations=10,
                super_iterations=100,
                sub_iterations=1000,
                learning_rate=1e-3,
                env_noise=0.1):
    m = GridWorldModel(2, (3, 1), (4, 1), 2)
    MAP_NAME = 'resources/GridWorldMaps/experiment4.txt'
    gmap = np.loadtxt(MAP_NAME, dtype=np.uint8)

    full_traj = []
    vis_traj = []

    # collect demonstration trajectories from a value-iteration expert
    for i in range(0, demonstrations):
        print("Traj", i)
        g = GridWorldGasEnv(copy.copy(gmap), noise=env_noise)
        g.generateRandomStartGoal()
        v = ValueIterationPlanner(g)
        traj = v.plan(max_depth=100)

        new_traj = []
        for t in traj:
            # one-hot encode the action taken at each step
            a = np.zeros(shape=(4, 1))
            a[t[1]] = 1
            new_traj.append((t[0], a))

        full_traj.append(new_traj)
        vis_traj.extend(new_traj)

    # g.visualizePlan(vis_traj, blank=True, filename="resources/results/exp4-trajs.png")

    # manually run the super-/sub-iteration training loop on the option model
    opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
    loss = m.getLossFunction()[0]
    train = opt.minimize(loss)
    init = tf.initialize_all_variables()

    # with m.sess as sess:
    m.sess.run(init)
    for it in range(super_iterations):
        print("Iteration", it)
        batch = m.sampleBatch(full_traj)
        for i in range(sub_iterations):
            m.sess.run(train, batch)

    actions = np.eye(4)
    g = GridWorldGasEnv(copy.copy(gmap), noise=0.0)

    # visualize each option's greedy policy on states with nearly full gas (> 75% of the limit)
    for i in range(m.k):
        states = g.getAllStates()
        policy_hash = {}
        trans_hash = {}
        for s in states:
            # print([m.evalpi(i, ns, actions[:, j]) for j in range(4)])
            l = [np.ravel(m.evalpi(i, [(s, actions[j, :])]))
                 for j in g.possibleActions(s)]
            if len(l) == 0:
                continue
            # print(i, s, l, m.evalpsi(i, ns))
            action = g.possibleActions(s)[np.argmax(l)]
            if s[2] / g.gas_limit > 0.75:
                policy_hash[s] = action
                # print(transitions[i].eval(np.array(ns)))
                trans_hash[s] = 0  # 1 - s[2]/g.gas_limit
        g.visualizePolicy(policy_hash,
                          trans_hash,
                          blank=True,
                          filename="resources/results/exp4-policy-" + str(i) + ".png")

    # and on states with nearly empty gas (< 25% of the limit)
    for i in range(m.k):
        states = g.getAllStates()
        policy_hash = {}
        trans_hash = {}
        for s in states:
            # print([m.evalpi(i, ns, actions[:, j]) for j in range(4)])
            l = [np.ravel(m.evalpi(i, [(s, actions[j, :])]))
                 for j in g.possibleActions(s)]
            if len(l) == 0:
                continue
            # print(i, s, l, m.evalpsi(i, ns))
            action = g.possibleActions(s)[np.argmax(l)]
            if s[2] / g.gas_limit < 0.25:
                policy_hash[s] = action
                # print(transitions[i].eval(np.array(ns)))
                trans_hash[s] = 0  # 1 - s[2]/g.gas_limit
        g.visualizePolicy(policy_hash,
                          trans_hash,
                          blank=True,
                          filename="resources/results/exp4-policy-" + str(i + m.k) + ".png")
def runPolicies(demonstrations=200,
                super_iterations=1000,
                sub_iterations=1,
                learning_rate=1e-3,
                env_noise=0.1):
    m = GridWorldModel(4)
    MAP_NAME = 'resources/GridWorldMaps/experiment2.txt'
    gmap = np.loadtxt(MAP_NAME, dtype=np.uint8)

    full_traj = []
    vis_traj = []

    for i in range(0, demonstrations):
        print("Traj", i)
        g = GridWorldEnv(copy.copy(gmap), noise=env_noise)
        g.generateRandomStartGoal()
        v = ValueIterationPlanner(g)
        traj = v.plan(max_depth=100)

        new_traj = []
        for t in traj:
            a = np.zeros(shape=(4, 1))
            a[t[1]] = 1
            new_traj.append((t[0], a))

        full_traj.append(new_traj)
        vis_traj.extend(new_traj)

    # g.visualizePlan(vis_traj, blank=True, filename="resources/results/exp2-trajs.png")

    opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
    m.train(opt, full_traj, super_iterations, sub_iterations)

    actions = np.eye(4)
    g = GridWorldEnv(copy.copy(gmap), noise=0.0)

    for i in range(m.k):
        states = g.getAllStates()
        policy_hash = {}
        trans_hash = {}
        for s in states:
            # print([m.evalpi(i, ns, actions[:, j]) for j in range(4)])
            l = [np.ravel(m.evalpi(i, [(s, actions[j, :])]))
                 for j in g.possibleActions(s)]
            if len(l) == 0:
                continue
            # print(i, s, l, m.evalpsi(i, ns))
            action = g.possibleActions(s)[np.argmax(l)]
            policy_hash[s] = action
            # print(transitions[i].eval(np.array(ns)))
            trans_hash[s] = np.ravel(m.evalpsi(i, [(s, actions[1, :])]))
        g.visualizePolicy(policy_hash,
                          trans_hash,
                          blank=True,
                          filename="resources/results/exp2-policy" + str(i) + ".png")
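# The variant above delegates the optimization loop to
# m.train(opt, full_traj, super_iterations, sub_iterations). Judging from the
# manual loop in the first variant, a train() method with this signature would
# look roughly like the sketch below. This is an illustration only, not the
# repository's actual implementation; getLossFunction() and sampleBatch() are
# used exactly as they appear in the first variant.
def _train_sketch(model, opt, full_traj, super_iterations, sub_iterations):
    loss = model.getLossFunction()[0]          # scalar loss over (state, action) pairs
    train_op = opt.minimize(loss)
    model.sess.run(tf.initialize_all_variables())
    for it in range(super_iterations):
        batch = model.sampleBatch(full_traj)   # resample a batch of demonstration steps
        for _ in range(sub_iterations):
            model.sess.run(train_op, batch)    # gradient steps on that batch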
def runPolicies(demonstrations=20,
                super_iterations=10000,
                sub_iterations=0,
                learning_rate=10,
                env_noise=0.3):
    m = GridWorldModel(2, statedim=(8, 9))
    MAP_NAME = 'resources/GridWorldMaps/experiment1.txt'
    gmap = np.loadtxt(MAP_NAME, dtype=np.uint8)

    full_traj = []
    vis_traj = []

    for i in range(0, demonstrations):
        print("Traj", i)
        g = GridWorldEnv(copy.copy(gmap), noise=env_noise)
        # print("Initialized")
        g.generateRandomStartGoal()
        start = np.argwhere(g.map == g.START)[0]
        goal = np.argwhere(g.map == g.GOAL)[0]

        # only keep trajectories that start in one room and end in the other
        while not ((inRoom1(start) and inRoom2(goal)) or
                   (inRoom2(start) and inRoom1(goal))):
            # print(inr)
            g.generateRandomStartGoal()
            start = np.argwhere(g.map == g.START)[0]
            goal = np.argwhere(g.map == g.GOAL)[0]

        print(np.argwhere(g.map == g.START), np.argwhere(g.map == g.GOAL))

        v = ValueIterationPlanner(g)
        traj = v.plan(max_depth=100)

        new_traj = []
        for t in traj:
            a = np.zeros(shape=(4, 1))
            s = np.zeros(shape=(8, 9))
            a[t[1]] = 1
            s[t[0][0], t[0][1]] = 1
            # s[2:4, 0] = np.argwhere(g.map == g.START)[0]
            # s[4:6, 0] = np.argwhere(g.map == g.GOAL)[0]
            new_traj.append((s, a))

        full_traj.append(new_traj)
        vis_traj.extend(new_traj)

    # raise ValueError("")
    # g.visualizePlan(vis_traj, blank=True, filename="resources/results/exp1-trajs.png")

    m.sess.run(tf.initialize_all_variables())

    with tf.variable_scope("optimizer"):
        opt = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
        m.train(opt, full_traj, super_iterations, sub_iterations)

    actions = np.eye(4)
    g = GridWorldEnv(copy.copy(gmap), noise=0.1)
    g.generateRandomStartGoal()

    for i in range(m.k):
        states = g.getAllStates()
        policy_hash = {}
        trans_hash = {}
        for s in states:
            t = np.zeros(shape=(8, 9))
            t[s[0], s[1]] = 1
            # t[2:4, 0] = np.argwhere(g.map == g.START)[0]
            # t[4:6, 0] = np.argwhere(g.map == g.GOAL)[0]
            l = [np.ravel(m.evalpi(i, [(t, actions[j, :])]))
                 for j in g.possibleActions(s)]
            if len(l) == 0:
                continue
            # print(i, s, l, m.evalpsi(i, ns))
            action = g.possibleActions(s)[np.argmax(l)]
            policy_hash[s] = action
            # print("Transition: ", m.evalpsi(i, [(t, actions[1, :])]), t)
            trans_hash[s] = np.ravel(m.evalpsi(i, [(t, actions[1, :])]))
        g.visualizePolicy(policy_hash,
                          trans_hash,
                          blank=True,
                          filename="resources/results/exp1-policy" + str(i) + ".png")
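# inRoom1 and inRoom2 are used above (and in the later experiment1 variant) to
# force the start and goal into different rooms, but they are not defined in
# this excerpt. A minimal sketch, assuming the 8 x 9 experiment1 map is split
# into a left and a right room by a single vertical wall; the wall column is a
# placeholder and must match the actual map file.
WALL_COL = 4  # hypothetical wall column, not taken from the repository


def inRoom1(pos):
    # pos is a [row, col] index as returned by np.argwhere
    return pos[1] < WALL_COL


def inRoom2(pos):
    return pos[1] > WALL_COL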
    # For each option, build a table of the greedy actions and termination
    # probabilities for every state in the gridworld, to inspect the learned options.
    # For each option i:
    #   policy_hash maps states to greedy actions for that option,
    #   trans_hash maps states to that option's termination probability.
    for s in states:
        # Not a problem here because trajectory states are defined to be
        # 11 x 11 grids with a one-hot state fill, but in HDQN that is
        # clearly not the case.
        t = np.zeros(shape=(gmap.shape[0], gmap.shape[1]))
        t[s[0], s[1]] = 1
        # t[2:4, 0] = np.argwhere(g.map == g.START)[0]
        # t[4:6, 0] = np.argwhere(g.map == g.GOAL)[0]
        l = [np.ravel(m.evalpi(i, [(t, actions[j, :])]))
             for j in g.possibleActions(s)]
        if len(l) == 0:
            continue
        # print(i, s, l, m.evalpsi(i, ns))
        action = g.possibleActions(s)[np.argmax(l)]
        policy_hash[s] = action
        trans_hash[s] = np.ravel(m.evalpsi(i, [(t, actions[1, :])]))
    g.visualizePolicy(policy_hash, trans_hash,
def runPolicies(demonstrations=20,
                super_iterations=1000,  # 10000
                sub_iterations=0,
                learning_rate=10,
                env_noise=0.3):
    m = GridWorldModel(4, statedim=(8, 9))
    MAP_NAME = 'resources/GridWorldMaps/experiment1.txt'
    gmap = np.loadtxt(MAP_NAME, dtype=np.uint8)

    full_traj = []
    vis_traj = []

    for i in range(0, demonstrations):
        print("Traj", i)
        g = GridWorldEnv(copy.copy(gmap), noise=env_noise)
        # print("Initialized")
        g.generateRandomStartGoal()
        start = np.argwhere(g.map == g.START)[0]
        goal = np.argwhere(g.map == g.GOAL)[0]

        # only keep trajectories that start in one room and end in the other
        while not ((inRoom1(start) and inRoom2(goal)) or
                   (inRoom2(start) and inRoom1(goal))):
            # print(inr)
            g.generateRandomStartGoal()
            start = np.argwhere(g.map == g.START)[0]
            goal = np.argwhere(g.map == g.GOAL)[0]

        print(np.argwhere(g.map == g.START), np.argwhere(g.map == g.GOAL))

        v = ValueIterationPlanner(g)
        traj = v.plan(max_depth=100)
        # the trajectory length depends on the start state, the goal state, and the planner output
        print(len(traj), 'length of the trajectory')

        new_traj = []
        for t in traj:
            # for each step of the planned trajectory, build a (state, action) pair
            a = np.zeros(shape=(4, 1))
            s = np.zeros(shape=(8, 9))
            a[t[1]] = 1
            s[t[0][0], t[0][1]] = 1
            # s[2:4, 0] = np.argwhere(g.map == g.START)[0]
            # s[4:6, 0] = np.argwhere(g.map == g.GOAL)[0]
            new_traj.append((s, a))

        full_traj.append(new_traj)
        vis_traj.extend(new_traj)

    print(np.shape(full_traj[0][0][1]), "full trajectory")

    # raise ValueError("")
    # g.visualizePlan(vis_traj, blank=True, filename="resources/results/exp1-trajs.png")

    m.sess.run(tf.initialize_all_variables())

    with tf.variable_scope("optimizer"):
        # define the optimizer and train on the full set of demonstrations
        opt = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
        closs, tloss = m.train(opt, full_traj, super_iterations, sub_iterations)

    print(closs, len(closs), 'this is closs')
    plt.plot(range(len(closs)), closs)
    plt.savefig('closs.png')
    plt.clf()  # clear the figure so the tloss curve is not drawn on top of the closs curve
    plt.plot(range(len(tloss)), tloss)
    plt.savefig('tloss.png')

    actions = np.eye(4)
    g = GridWorldEnv(copy.copy(gmap), noise=0.0)
    g.generateRandomStartGoal()

    for i in range(m.k):
        states = g.getAllStates()
        print('\n', states, '\n', 'this is all states', m.k)
        policy_hash = {}
        trans_hash = {}
        for s in states:
            t = np.zeros(shape=(8, 9))
            t[s[0], s[1]] = 1
            # t[2:4, 0] = np.argwhere(g.map == g.START)[0]
            # t[4:6, 0] = np.argwhere(g.map == g.GOAL)[0]
            # np.ravel flattens the returned action probabilities into a 1-D array
            l = [np.ravel(m.evalpi(i, [(t, actions[j, :])]))
                 for j in g.possibleActions(s)]
            print('\n', l, 'l', g.possibleActions(s), 'possible actions')
            if len(l) == 0:
                continue
            # print(i, s, l, m.evalpsi(i, ns))
            action = g.possibleActions(s)[np.argmax(l)]
            policy_hash[s] = action
            # print("Transition: ", m.evalpsi(i, [(t, actions[1, :])]), t)
            trans_hash[s] = np.ravel(m.evalpsi(i, [(t, actions[1, :])]))
        g.visualizePolicy(policy_hash,
                          trans_hash,
                          blank=True,
                          filename="resources/results/exp1-policy" + str(i) + ".png")
    policy_hash = {}
    trans_hash = {}

    # For each option, build a table of the greedy actions and termination
    # probabilities for every state in the gridworld, to inspect the learned options.
    # For each option i:
    #   policy_hash maps states to greedy actions for that option,
    #   trans_hash maps states to that option's termination probability.
    for s in states:
        t = np.zeros(shape=(gmap.shape[0], gmap.shape[1]))
        t[s[0], s[1]] = 1
        # t[2:4, 0] = np.argwhere(g.map == g.START)[0]
        # t[4:6, 0] = np.argwhere(g.map == g.GOAL)[0]
        l = [np.ravel(m.evalpi(i, [(t, actions[j, :])]))
             for j in g.possibleActions(s)]
        if len(l) == 0:
            continue
        # print(i, s, l, m.evalpsi(i, ns))
        action = g.possibleActions(s)[np.argmax(l)]
        policy_hash[s] = action
        trans_hash[s] = np.ravel(m.evalpsi(i, [(t, actions[1, :])]))

    g.visualizePolicy(policy_hash, trans_hash,
                      blank=True,
                      filename="resources/results/exp_stuff" + str(i) + ".png")
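# For context, the two fragments above are the body of the per-option loop that
# the full runPolicies variants use. A minimal wrapper, assuming m, g, and gmap
# are already constructed as in those variants, looks like the following sketch.
actions = np.eye(4)                  # one-hot encoding of the four grid actions
for i in range(m.k):                 # one policy/termination table per learned option
    states = g.getAllStates()
    policy_hash = {}
    trans_hash = {}
    # ... fill policy_hash and trans_hash as in the fragments above,
    #     then call g.visualizePolicy(...) for option i ...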