Example #1
def runPolicies(demonstrations=10,
                super_iterations=100,
                sub_iterations=1000,
                learning_rate=1e-3,
                env_noise=0.1):

    m = GridWorldModel(2, (3, 1), (4, 1), 2)

    MAP_NAME = 'resources/GridWorldMaps/experiment4.txt'
    gmap = np.loadtxt(MAP_NAME, dtype=np.uint8)
    full_traj = []
    vis_traj = []

    for i in range(0, demonstrations):
        print("Traj", i)
        g = GridWorldGasEnv(copy.copy(gmap), noise=env_noise)
        g.generateRandomStartGoal()
        v = ValueIterationPlanner(g)
        traj = v.plan(max_depth=100)

        new_traj = []
        for t in traj:
            a = np.zeros(shape=(4, 1))
            a[t[1]] = 1

            new_traj.append((t[0], a))

        full_traj.append(new_traj)
        vis_traj.extend(new_traj)

    # g.visualizePlan(vis_traj,blank=True, filename="resources/results/exp4-trajs.png")

    opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
    loss = m.getLossFunction()[0]
    train = opt.minimize(loss)
    init = tf.initialize_all_variables()

    #with m.sess as sess:
    m.sess.run(init)

    for it in range(super_iterations):
        print("Iteration", it)
        batch = m.sampleBatch(full_traj)
        for i in range(sub_iterations):
            m.sess.run(train, batch)

    actions = np.eye(4)

    g = GridWorldGasEnv(copy.copy(gmap), noise=0.0)

    for i in range(m.k):
        states = g.getAllStates()
        policy_hash = {}
        trans_hash = {}

        for s in states:

            #print([m.evalpi(i,ns, actions[:,j]) for j in range(4)])
            l = [
                np.ravel(m.evalpi(i, [(s, actions[j, :])]))
                for j in g.possibleActions(s)
            ]

            if len(l) == 0:
                continue

            #print(i, s,l, m.evalpsi(i,ns))
            action = g.possibleActions(s)[np.argmax(l)]

            if s[2] / g.gas_limit > 0.75:
                policy_hash[s] = action

            #print(transitions[i].eval(np.array(ns)))
            trans_hash[s] = 0  #1 - s[2]/g.gas_limit

        g.visualizePolicy(policy_hash,
                          trans_hash,
                          blank=True,
                          filename="resources/results/exp4-policy-" + str(i) +
                          ".png")

    for i in range(m.k):
        states = g.getAllStates()
        policy_hash = {}
        trans_hash = {}

        for s in states:

            #print([m.evalpi(i,ns, actions[:,j]) for j in range(4)])
            l = [
                np.ravel(m.evalpi(i, [(s, actions[j, :])]))
                for j in g.possibleActions(s)
            ]

            if len(l) == 0:
                continue

            #print(i, s,l, m.evalpsi(i,ns))
            action = g.possibleActions(s)[np.argmax(l)]

            if s[2] / g.gas_limit < 0.25:
                policy_hash[s] = action

            #print(transitions[i].eval(np.array(ns)))
            trans_hash[s] = 0  #1 - s[2]/g.gas_limit

        g.visualizePolicy(policy_hash,
                          trans_hash,
                          blank=True,
                          filename="resources/results/exp4-policy-" +
                          str(i + m.k) + ".png")
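Each demonstration step above is stored as a (state, one-hot action) pair before training. A minimal sketch of that action encoding, assuming planner steps of the form (state, action_index) and four discrete actions:

import numpy as np

def one_hot_action(action_index, n_actions=4):
    # column vector with a 1 at the chosen action, matching np.zeros(shape=(4, 1)) above
    a = np.zeros((n_actions, 1))
    a[action_index] = 1
    return a

# e.g. a planner step (state, 2) becomes (state, [[0], [0], [1], [0]])
step = ((3, 5), 2)
encoded = (step[0], one_hot_action(step[1]))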
Example #2
def runPolicies(demonstrations=200,
                super_iterations=1000,
                sub_iterations=1,
                learning_rate=1e-3,
                env_noise=0.1):

    m = GridWorldModel(4)

    MAP_NAME = 'resources/GridWorldMaps/experiment2.txt'
    gmap = np.loadtxt(MAP_NAME, dtype=np.uint8)
    full_traj = []
    vis_traj = []

    for i in range(0, demonstrations):
        print("Traj", i)
        g = GridWorldEnv(copy.copy(gmap), noise=env_noise)
        g.generateRandomStartGoal()
        v = ValueIterationPlanner(g)
        traj = v.plan(max_depth=100)

        new_traj = []
        for t in traj:
            a = np.zeros(shape=(4, 1))
            a[t[1]] = 1

            new_traj.append((t[0], a))

        full_traj.append(new_traj)
        vis_traj.extend(new_traj)

    #g.visualizePlan(vis_traj,blank=True, filename="resources/results/exp2-trajs.png")

    opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
    m.train(opt, full_traj, super_iterations, sub_iterations)

    actions = np.eye(4)

    g = GridWorldEnv(copy.copy(gmap), noise=0.0)

    for i in range(m.k):
        states = g.getAllStates()
        policy_hash = {}
        trans_hash = {}

        for s in states:

            #print([m.evalpi(i,ns, actions[:,j]) for j in range(4)])
            l = [
                np.ravel(m.evalpi(i, [(s, actions[j, :])]))
                for j in g.possibleActions(s)
            ]

            if len(l) == 0:
                continue

            #print(i, s,l, m.evalpsi(i,ns))
            action = g.possibleActions(s)[np.argmax(l)]

            policy_hash[s] = action

            #print(transitions[i].eval(np.array(ns)))
            trans_hash[s] = np.ravel(m.evalpsi(i, [(s, actions[1, :])]))

        g.visualizePolicy(policy_hash,
                          trans_hash,
                          blank=True,
                          filename="resources/results/exp2-policy" + str(i) +
                          ".png")
Example #3
def runPolicies(demonstrations=20,
		super_iterations=10000,
		sub_iterations=0,
		learning_rate=10,
		env_noise=0.3):

	m  = GridWorldModel(2, statedim=(8,9))

	MAP_NAME = 'resources/GridWorldMaps/experiment1.txt'
	gmap = np.loadtxt(MAP_NAME, dtype=np.uint8)
	full_traj = []
	vis_traj = []

	for i in range(0,demonstrations):
		print("Traj",i)
		g = GridWorldEnv(copy.copy(gmap), noise=env_noise)
		# print("Initialized")

		g.generateRandomStartGoal()	
		start = np.argwhere(g.map == g.START)[0]
		goal = np.argwhere(g.map == g.GOAL)[0]
		# generate trajectories that start in one room and end in the other
		while not ((inRoom1(start) and inRoom2(goal))  or\
				   (inRoom2(start) and inRoom1(goal))):
			# print(inr)
			g.generateRandomStartGoal()	
			start = np.argwhere(g.map == g.START)[0]
			goal = np.argwhere(g.map == g.GOAL)[0]


		print(np.argwhere(g.map == g.START), np.argwhere(g.map == g.GOAL))

		v = ValueIterationPlanner(g)
		traj = v.plan(max_depth=100)
		
		new_traj = []
		for t in traj:
			a = np.zeros(shape=(4,1))

			s = np.zeros(shape=(8,9))

			a[t[1]] = 1

			s[t[0][0],t[0][1]] = 1
			#s[2:4,0] = np.argwhere(g.map == g.START)[0]
			#s[4:6,0] = np.argwhere(g.map == g.GOAL)[0]

			new_traj.append((s,a))

		full_traj.append(new_traj)
		vis_traj.extend(new_traj)

	#raise ValueError("")

	#g.visualizePlan(vis_traj,blank=True, filename="resources/results/exp1-trajs.png")


	m.sess.run(tf.initialize_all_variables())

	with tf.variable_scope("optimizer"):
		opt = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)

		m.train(opt, full_traj, super_iterations, sub_iterations)

	actions = np.eye(4)


	g = GridWorldEnv(copy.copy(gmap), noise=0.1)
	g.generateRandomStartGoal()

	for i in range(m.k):
		states = g.getAllStates()
		policy_hash = {}
		trans_hash = {}

		for s in states:

			t = np.zeros(shape=(8,9))

			t[s[0],s[1]] = 1
			#t[2:4,0] = np.argwhere(g.map == g.START)[0]
			#t[4:6,0] = np.argwhere(g.map == g.GOAL)[0]


			l = [ np.ravel(m.evalpi(i, [(t, actions[j,:])] ))  for j in g.possibleActions(s)]

			if len(l) == 0:
				continue

			#print(i, s,l, m.evalpsi(i,ns))
			action = g.possibleActions(s)[np.argmax(l)]

			policy_hash[s] = action

			#print("Transition: ",m.evalpsi(i, [(t, actions[1,:])]), t)
			trans_hash[s] = np.ravel(m.evalpsi(i, [(t, actions[1,:])]))

		g.visualizePolicy(policy_hash, trans_hash, blank=True, filename="resources/results/exp1-policy"+str(i)+".png")
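Example #3 represents each state as an 8x9 grid with a single one-hot entry at the agent's (row, col) position, instead of passing the raw coordinates. A minimal sketch of that state encoding (the action half of each pair is encoded as in Example #1):

import numpy as np

def encode_state(row, col, grid_shape=(8, 9)):
    # one-hot grid with a single 1 at the agent's position, matching s[t[0][0], t[0][1]] = 1 above
    s = np.zeros(grid_shape)
    s[row, col] = 1
    return s

state_grid = encode_state(2, 3)   # pair this with the one-hot action vector from Example #1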
Example #4
    # For each option, build a table of the actions and termination probabilities
    # for every state in the gridworld, to understand the learned options:
    # policy_hash maps states to actions; trans_hash maps states to that option's
    # termination probabilities (see the policy-rendering sketch after this snippet).
    for s in states:
        # Not a problem here because trajectories are defined to be 11 x 11 grids with
        # a one-hot entry for the state, but in HDQN that is clearly not the case.
        t = np.zeros(shape=(gmap.shape[0], gmap.shape[1]))

        t[s[0], s[1]] = 1
        #t[2:4,0] = np.argwhere(g.map == g.START)[0]
        #t[4:6,0] = np.argwhere(g.map == g.GOAL)[0]

        l = [
            np.ravel(m.evalpi(i, [(t, actions[j, :])]))
            for j in g.possibleActions(s)
        ]

        if len(l) == 0:
            continue

        #print(i, s,l, m.evalpsi(i,ns))
        action = g.possibleActions(s)[np.argmax(l)]

        policy_hash[s] = action

        trans_hash[s] = np.ravel(m.evalpsi(i, [(t, actions[1, :])]))

    g.visualizePolicy(policy_hash,
                      trans_hash,
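One way to inspect what policy_hash holds for an option is to render it as an action grid. A minimal sketch, assuming states are keyed by (row, col) and using an illustrative action-to-symbol mapping:

import numpy as np

def render_policy(policy_hash, grid_shape, symbols=('^', 'v', '<', '>')):
    # policy_hash: {(row, col): action_index}; cells with no entry stay '.'
    grid = np.full(grid_shape, '.', dtype='<U1')
    for (row, col), action in policy_hash.items():
        grid[row, col] = symbols[action]
    return '\n'.join(' '.join(r) for r in grid)

print(render_policy({(0, 0): 3, (0, 1): 1}, (3, 4)))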
Example #5
def runPolicies(
        demonstrations=20,
        super_iterations=1000,  #10000
        sub_iterations=0,
        learning_rate=10,
        env_noise=0.3):

    m = GridWorldModel(4, statedim=(8, 9))

    MAP_NAME = 'resources/GridWorldMaps/experiment1.txt'
    gmap = np.loadtxt(MAP_NAME, dtype=np.uint8)
    full_traj = []
    vis_traj = []

    for i in range(0, demonstrations):
        print("Traj", i)
        g = GridWorldEnv(copy.copy(gmap), noise=env_noise)
        # print("Initialized")

        g.generateRandomStartGoal()
        start = np.argwhere(g.map == g.START)[0]
        goal = np.argwhere(g.map == g.GOAL)[0]
        # generate trajectories that start in one room and end in the other
        while not ((inRoom1(start) and inRoom2(goal))  or\
             (inRoom2(start) and inRoom1(goal))):
            # print(inr)
            g.generateRandomStartGoal()
            start = np.argwhere(g.map == g.START)[0]
            goal = np.argwhere(g.map == g.GOAL)[0]

        print(np.argwhere(g.map == g.START), np.argwhere(g.map == g.GOAL))

        v = ValueIterationPlanner(g)
        traj = v.plan(max_depth=100)

        print(len(traj), 'length of the trajectory')
    # the trajectory length depends on the start and goal states and on the planner output

        new_traj = []
        for t in traj:

            # iterate over each step of the trajectory that reached the goal
            a = np.zeros(shape=(4, 1))

            s = np.zeros(shape=(8, 9))

            a[t[1]] = 1

            s[t[0][0], t[0][1]] = 1
            #s[2:4,0] = np.argwhere(g.map == g.START)[0]
            #s[4:6,0] = np.argwhere(g.map == g.GOAL)[0]

            new_traj.append((s, a))

        full_traj.append(new_traj)
        vis_traj.extend(new_traj)
    print(np.shape(full_traj[0][0][1]), "full trajectory")
    #raise ValueError("")

    #g.visualizePlan(vis_traj,blank=True, filename="resources/results/exp1-trajs.png")

    m.sess.run(tf.initialize_all_variables())

    with tf.variable_scope("optimizer"):
        opt = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
        # define the optimizer and train on the full trajectory set (super_iterations, sub_iterations)
        closs, tloss = m.train(opt, full_traj, super_iterations,
                               sub_iterations)

    print(closs, len(closs), 'this is closs')
    plt.figure()
    plt.plot(range(len(closs)), closs)
    plt.savefig('closs.png')
    plt.figure()  # start a new figure so the tloss curve is not overlaid on the closs curve
    plt.plot(range(len(tloss)), tloss)
    plt.savefig('tloss.png')

    actions = np.eye(4)

    g = GridWorldEnv(copy.copy(gmap), noise=0.0)

    g.generateRandomStartGoal()

    for i in range(m.k):
        states = g.getAllStates()
        print('\n', states, '\n', 'this is all states', m.k)
        policy_hash = {}
        trans_hash = {}

        for s in states:

            t = np.zeros(shape=(8, 9))

            t[s[0], s[1]] = 1
            #t[2:4,0] = np.argwhere(g.map == g.START)[0]
            #t[4:6,0] = np.argwhere(g.map == g.GOAL)[0]

            # np.ravel flattens each evalpi output into a 1-D array
            l = [
                np.ravel(m.evalpi(i, [(t, actions[j, :])]))
                for j in g.possibleActions(s)
            ]
            print('\n', l, 'l', g.possibleActions(s), 'possible actions')

            if len(l) == 0:
                continue

            #print(i, s,l, m.evalpsi(i,ns))
            action = g.possibleActions(s)[np.argmax(l)]

            policy_hash[s] = action

            #print("Transition: ",m.evalpsi(i, [(t, actions[1,:])]), t)
            trans_hash[s] = np.ravel(m.evalpsi(i, [(t, actions[1, :])]))

        g.visualizePolicy(policy_hash,
                          trans_hash,
                          blank=True,
                          filename="resources/results/exp1-policy" + str(i) +
                          ".png")
Example #6
    policy_hash = {}
    trans_hash = {}

    # For each option, build a table of the actions and termination probabilities
    # for every state in the gridworld, to understand the learned options:
    # policy_hash maps states to actions; trans_hash maps states to that option's
    # termination probabilities (see the heat-map sketch after this snippet).
    for s in states:

        t = np.zeros(shape=(gmap.shape[0],gmap.shape[1]))

        t[s[0],s[1]] = 1
        #t[2:4,0] = np.argwhere(g.map == g.START)[0]
        #t[4:6,0] = np.argwhere(g.map == g.GOAL)[0]


        l = [ np.ravel(m.evalpi(i, [(t, actions[j,:])] ))  for j in g.possibleActions(s)]

        if len(l) == 0:
            continue

        #print(i, s,l, m.evalpsi(i,ns))
        action = g.possibleActions(s)[np.argmax(l)]

        policy_hash[s] = action

        trans_hash[s] = np.ravel(m.evalpsi(i, [(t, actions[1,:])]))

    g.visualizePolicy(policy_hash, trans_hash, blank=True, filename="resources/results/exp_stuff"+str(i)+".png")
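trans_hash maps each state to the option's termination probability, so it can also be rendered as a heat map over the grid rather than through visualizePolicy. A minimal matplotlib sketch, assuming states keyed by (row, col) and a hypothetical output filename:

import numpy as np
import matplotlib.pyplot as plt

def save_termination_map(trans_hash, grid_shape, filename='termination.png'):
    # trans_hash: {(row, col): termination probability} for a single option
    heat = np.zeros(grid_shape)
    for (row, col), p in trans_hash.items():
        heat[row, col] = float(np.ravel(p)[0])   # evalpsi outputs arrive as flat arrays
    plt.figure()
    plt.imshow(heat, vmin=0.0, vmax=1.0)
    plt.colorbar(label='termination probability')
    plt.savefig(filename)

save_termination_map({(1, 1): 0.9, (2, 3): 0.2}, (8, 9))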