Example #1
 def __init__(self, question, testDict):
     super(ApproximateQLearningTest, self).__init__(question, testDict)
     self.discount = float(testDict['discount'])
     self.grid = gridworld.Gridworld(parseGrid(testDict['grid']))
     if 'noise' in testDict: self.grid.setNoise(float(testDict['noise']))
     if 'livingReward' in testDict: self.grid.setLivingReward(float(testDict['livingReward']))
     self.grid = gridworld.Gridworld(parseGrid(testDict['grid']))
     self.env = gridworld.GridworldEnvironment(self.grid)
     self.epsilon = float(testDict['epsilon'])
     self.learningRate = float(testDict['learningRate'])
     self.extractor = 'IdentityExtractor'
     if 'extractor' in testDict:
         self.extractor = testDict['extractor']
     self.opts = {'actionFn': self.env.getPossibleActions, 'epsilon': self.epsilon, 'gamma': self.discount, 'alpha': self.learningRate}
     numExperiences = int(testDict['numExperiences'])
     maxPreExperiences = 10
     self.numsExperiencesForDisplay = list(range(min(numExperiences, maxPreExperiences)))
     self.testOutFile = testDict['test_out_file']
     if maxPreExperiences < numExperiences:
         self.numsExperiencesForDisplay.append(numExperiences)
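The constructor above reads every setting from testDict; a hypothetical dictionary covering the keys it consumes (the string values are illustrative and match the float()/int() casts above) could look like:

testDict = {
    'discount': '0.9',
    'grid': ...,                      # layout string handed to parseGrid
    'noise': '0.2',
    'livingReward': '0.0',
    'epsilon': '0.1',
    'learningRate': '0.5',
    'extractor': 'IdentityExtractor',
    'numExperiences': '5',
    'test_out_file': ...,             # output path chosen by the test framework
}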
    def __init__(self, question, testDict):
        super(EpsilonGreedyTest, self).__init__(question, testDict)
        self.discount = float(testDict['discount'])
        self.grid = gridworld.Gridworld(parseGrid(testDict['grid']))
        if 'noise' in testDict: self.grid.setNoise(float(testDict['noise']))
        if 'livingReward' in testDict:
            self.grid.setLivingReward(float(testDict['livingReward']))

        self.grid = gridworld.Gridworld(parseGrid(testDict['grid']))
        self.env = gridworld.GridworldEnvironment(self.grid)
        self.epsilon = float(testDict['epsilon'])
        self.learningRate = float(testDict['learningRate'])
        self.numExperiences = int(testDict['numExperiences'])
        self.numIterations = int(testDict['iterations'])
        self.opts = {
            'actionFn': self.env.getPossibleActions,
            'epsilon': self.epsilon,
            'gamma': self.discount,
            'alpha': self.learningRate
        }
 def __init__(self, question, testDict):
     super(ValueIterationTest, self).__init__(question, testDict)
     self.discount = float(testDict['discount'])
     self.grid = gridworld.Gridworld(parseGrid(testDict['grid']))
     iterations = int(testDict['valueIterations'])
     if 'noise' in testDict: self.grid.setNoise(float(testDict['noise']))
     if 'livingReward' in testDict: self.grid.setLivingReward(float(testDict['livingReward']))
     maxPreIterations = 10
     self.numsIterationsForDisplay = list(range(min(iterations, maxPreIterations)))
     self.testOutFile = testDict['test_out_file']
     if maxPreIterations < iterations:
         self.numsIterationsForDisplay.append(iterations)
Example #5
 def __init__(self, question, testDict):
     super(QLearningTest, self).__init__(question, testDict)
     self.discount = float(testDict['discount'])
     self.grid = gridworld.Gridworld(parseGrid(testDict['grid']))
     if 'noise' in testDict: self.grid.setNoise(float(testDict['noise']))
     if 'livingReward' in testDict: self.grid.setLivingReward(float(testDict['livingReward']))
     self.grid = gridworld.Gridworld(parseGrid(testDict['grid']))
     self.env = gridworld.GridworldEnvironment(self.grid)
     self.epsilon = float(testDict['epsilon'])
     self.learningRate = float(testDict['learningRate'])
     self.opts = {'actionFn': self.env.getPossibleActions, 'epsilon': self.epsilon, 'gamma': self.discount, 'alpha': self.learningRate}
     numExperiences = int(testDict['numExperiences'])
     maxPreExperiences = 10
     self.numsExperiencesForDisplay = list(range(min(numExperiences, maxPreExperiences)))
     self.testOutFile = testDict['test_out_file']
     if sys.platform == 'win32':
         _, question_name, test_name = testDict['test_out_file'].split('\\')
     else:
         _, question_name, test_name = testDict['test_out_file'].split('/')
     self.experiences = Experiences(test_name.split('.')[0])
     if maxPreExperiences < numExperiences:
         self.numsExperiencesForDisplay.append(numExperiences)
 def __init__(self, question, testDict):
     super(ValueIterationTest, self).__init__(question, testDict)
     self.discount = float(testDict["discount"])
     self.grid = gridworld.Gridworld(parseGrid(testDict["grid"]))
     iterations = int(testDict["valueIterations"])
     if "noise" in testDict:
         self.grid.setNoise(float(testDict["noise"]))
     if "livingReward" in testDict:
         self.grid.setLivingReward(float(testDict["livingReward"]))
     maxPreIterations = 10
     self.numsIterationsForDisplay = list(
         range(min(iterations, maxPreIterations)))
     self.testOutFile = testDict["test_out_file"]
     if maxPreIterations < iterations:
         self.numsIterationsForDisplay.append(iterations)
Example #7
def main(grid_size, discount, n_trajectories, learning_rate):

	wind = 0.3
	gw = gridworld.Gridworld(grid_size, wind, discount)

	ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
	r = Apprenticeship.irl(gw, n_trajectories, learning_rate)

	plt.subplot(1, 2, 1)
	plt.pcolor(ground_r.reshape((grid_size, grid_size)))
	plt.colorbar()
	plt.title("Groundtruth reward")
	plt.subplot(1, 2, 2)
	plt.pcolor(r.reshape((grid_size, grid_size)))
	plt.colorbar()
	plt.title("Recovered reward")
	plt.show()
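A minimal driver for the function above (a sketch: the parameter values are illustrative, and the snippet's own imports of gridworld, numpy, matplotlib and Apprenticeship are assumed to be in place):

if __name__ == '__main__':
    # Illustrative values only; the original snippet defines no defaults.
    main(grid_size=5, discount=0.9, n_trajectories=20, learning_rate=0.01)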
def lp_irl_gridworld(grid_size, discount):
    wind = 0.3
    traj_len = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)
    gt_reward = np.array([gw.reward(s) for s in range(gw.n_states)])
    policy = [gw.optimal_policy_deterministic(s) for s in range(gw.n_states)]
    r = lp_irl.compute_reward(gw.n_states, gw.n_actions,
                              gw.transition_probability, policy, gw.discount,
                              1, 5)

    plt.subplot(1, 2, 1)
    plt.pcolor(gt_reward.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
Example #9
def main(grid_size, discount, n_trajectories, epochs, learning_rate):

    wind = 0.3
    trajectory_length = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)

    #trajectories = gw.my_generate_trajectories(n_trajectories,trajectory_length,gw.optimal_policy)
    #trajectories = gw.my_generate_trajectories_some_without_goal(n_trajectories,trajectory_length,gw.optimal_policy)
    trajectories = gw.my_generate_trajectories_multiple(
        n_trajectories, trajectory_length, gw.optimal_policy)

    feature_matrix = gw.feature_matrix()
    #feature_matrix = gw.feature_matrix_goalVsOther()
    #feature_matrix = gw.feature_matrix_goalVsOtherTwo()
    #feature_matrix = gw.feature_matrix_goalVsOtherThree()

    #ground truth given by us as we know which states are good vs bad
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])

    #reward recovered using the IRL algorithm
    recovered_reward = maxent.irl(feature_matrix, gw.n_actions, discount,
                                  gw.transition_probability, trajectories,
                                  epochs, learning_rate)
    #let's standardise it
    scaler = StandardScaler()
    standardised_reward = scaler.fit_transform(recovered_reward.reshape(-1, 1))

    #print(recovered_reward)
    #print(standardised_reward)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(standardised_reward.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
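The standardisation step above is just a z-score over the recovered reward vector; a NumPy-only sketch of the same transform (not part of the original example) is:

import numpy as np

def standardise(reward):
    # Zero mean, unit variance; StandardScaler yields the same values,
    # just as an (n, 1) column instead of a flat vector.
    reward = np.asarray(reward, dtype=float).reshape(-1)
    return (reward - reward.mean()) / reward.std()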
Example #10
            l1=l1,
            l2=l2)
    scaler = StandardScaler()
    standardised_reward = scaler.fit_transform(recovered_reward.reshape(-1, 1))

    plot.plot(ground_r, standardised_reward, grid_size)


grid_size = 5
discount = 0.01
n_trajectories = 25
epochs = 700
learning_rate = 0.01
wind = 0.3
trajectory_length = 3 * grid_size
gw = gridworld.Gridworld(grid_size, wind, discount)
ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])

feature_matrix = gw.feature_matrix()
#feature_matrix = gw.feature_matrix_goalVsOther()
#feature_matrix = gw.feature_matrix_goalVsOtherTwo()
#feature_matrix = gw.feature_matrix_goalVsOtherThree()
feature_space = feature_matrix.shape[1]

#trajectories = gw.my_generate_trajectories(n_trajectories,trajectory_length,gw.optimal_policy)
trajectories = gw.my_generate_trajectories_some_without_goal(
    n_trajectories, trajectory_length, gw.optimal_policy)

n_states, d_states = feature_matrix.shape
no_of_iterations = 20
structure = (3, 3)
Example #11
    def do_turn(self, ants):
        # track all moves, prevent collisions
        orders = {}

        def do_move_direction(loc, direction):
            #Rrr destination takes care of wrapping around, returns the destination
            #issues the moving order
            new_loc = ants.destination(loc, direction)
            #Rrr orders is the dictionary of location of ants
            if (ants.unoccupied(new_loc) and new_loc not in orders):
                ants.issue_order((loc, direction))
                orders[new_loc] = loc
                return True
            else:
                return False

        targets = {}

        #ROHAN added the variable directn
        def do_move_location(loc, dirctn):
            #Rrr ants.direction takes a location and a destination and returns a list of the closest direction "as the crow flies".
            #If the target is up and to the left, it will return ['n', 'w'] and we should then try and move our ant one of the two directions.
            #If the target is directly down, it will return ['s'], which is a list of one item.
            directions = dirctn
            for direction in directions:
                if do_move_direction(loc, direction):
                    #targets[dest] = loc
                    return True
            return False

# --------------------------------starts from here--------------------------------------
# find close

        self.turn = self.turn + 1

        #MY HILLLLSSS
        for hill_loc in ants.my_hills():
            x, y = hill_loc
            self.grid[x][y] = self.MYHILL
            #Rrr The dummy entry doesn't need a from location, so we just set the value to None.
            #prevent stepping on own hill
            orders[hill_loc] = None
        #ENEMY HILLLSSSS
        for hill_loc, hill_owner in ants.enemy_hills():
            hillrow, hillcol = hill_loc
            self.grid[hillrow][hillcol] = self.ENEMYHILL

        #LAND, water food
        for i in range(ants.rows):
            for j in range(ants.cols):
                #if ((ants.visible((i,j))==True) or (self.grid[i][j]==(self.FOOD or self.ENEMYANTS or self.BOUNDARY2 or self.ENEMYANTS2))):
                #    self.grid[i][j]=' '
                #    self.gridu[i][j]='v'
                if ants.visible((i, j)) == True:
                    self.gridu[i][j] = 'v'
                    # clear visible cells unless they hold a hill or water
                    if self.grid[i][j] not in (self.MYHILL, self.MYHILL2,
                                               self.ENEMYHILL, self.WATER):
                        self.grid[i][j] = ' '

                # forget stale markers on cells that are no longer visible
                elif self.grid[i][j] in (self.FOOD, self.ENEMYANTS,
                                         self.BOUNDARY2, self.ENEMYANTS2,
                                         self.MYANTS):
                    self.grid[i][j] = ' '

                if ants.map[i][j] == -3:
                    self.grid[i][j] = self.FOOD
                elif ants.map[i][j] == -4:
                    self.grid[i][j] = self.WATER

                # if I can't see my hill, retreat to it urgently
                if self.grid[i][j] == self.MYHILL:
                    if ants.visible((i, j)) == False:
                        print >> sys.stderr, 'hill retreat', i, j
                        sys.stderr.flush()
                        self.grid[i][j] = self.MYHILL2
                    else:
                        self.grid[i][j] = self.MYHILL

                if self.grid[i][j] == self.ENEMYHILL:
                    print >> sys.stderr, 'hill attack!!!!!!!!!!!!!!!!!!', i, j
                    sys.stderr.flush()

        #MY ANTSSSSSSSSSS S
        num_ants = 0
        sx = 0
        sy = 0
        for ant_loc in ants.my_ants():
            antrow, antcol = ant_loc
            sx = sx + antrow
            sy = sy + antcol
            #self.grid[antrow][antcol]=self.MYANTS
            num_ants = num_ants + 1

        sx = int(sx / num_ants)
        sy = int(sy / num_ants)
        self.grid[sx][sy] = self.MYANTS

        ## change MODE: TODO tune this threshold (num_ants >= 0 is always true)
        if num_ants >= 0:  ##(ants.rows*ants.cols/200):
            self.BOUNDARY = self.BOUNDARY2
        else:
            self.BOUNDARY = ' '

        #ENEMYYYYYYYY ANTSSSSSSS
        for enemy_loc, enemy_owner in ants.enemy_ants():
            enemyrow, enemycol = enemy_loc
            #TO DO, if own ant concentration is good near enemy ant (enemy ant conc in the area), then a positive reward
            self.grid[enemyrow][enemycol] = self.ENEMYANTS

            #if they're near my base, retreat to base
            for hill_loc in ants.my_hills():
                x, y = hill_loc
                if ants.distance(hill_loc, enemy_loc) < 9.0:
                    self.grid[enemyrow][enemycol] = self.ENEMYANTS2

            #if i can surround em attack :) TODO: also check the enemy density
            surround = 0
            for ant_loc in ants.my_ants():
                antrow, antcol = ant_loc
                if ants.distance(ant_loc, enemy_loc) < 6.0:
                    surround = surround + 1

            if surround >= 3:
                self.grid[enemyrow][enemycol] = self.ENEMYANTS2
                print >> sys.stderr, 'ursurrounded attack: ', enemyrow, enemycol
                sys.stderr.flush()

#BOUNDARY EXPANDING !!!!!!!!!
        for i in range(ants.rows):
            for j in range(ants.cols):
                if (self.gridu[i][j] == 'v' and self.grid[i][j] == ' '):
                    if (ants.visible(ants.destination((i, j), 'n')) == False
                            or ants.visible(ants.destination(
                                (i, j), 'e')) == False
                            or ants.visible(ants.destination(
                                (i, j), 'w')) == False
                            or ants.visible(ants.destination(
                                (i, j), 's')) == False):
                        self.grid[i][j] = self.BOUNDARY

#-----------------------------------------------------------------------------------------VALUE ITERATION

#opts={'agent': 'value', 'discount': 0.9, 'iters': 200, 'noise': 0.01, 'livingReward': 0.0, 'epsilon': 0.0, 'pause': False, 'manual': False, 'quiet': True, 'episodes': 100, 'learningRate': 0.5, 'grid': 'BookGrid', 'gridSize': 150, 'speed': 1000.0, 'textDisplay': False}
        opts = {
            'livingReward': 0.0,
            'discount': 0.9,
            'iters': 300,
            'noise': 0.05,
            'epsilon': 0.0,
            'manual': False,
            'quiet': True,
            'agent': 'value',
            'pause': False,
            'episodes': 100,
            'learningRate': 0.5,
            'grid': 'BookGrid',
            'gridSize': 150,
            'speed': 1000.0,
            'textDisplay': False
        }

        mdp = gridworld.Gridworld(self.grid)
        mdp.setLivingReward(opts['livingReward'])
        mdp.setNoise(opts['noise'])
        env = gridworld.GridworldEnvironment(mdp)

        ###########################
        # GET THE AGENT
        ###########################

        #time_to_spare = (ants.turntime/1000.0) - (0.00064286 + 0.0000547619*num_ants + 0.0000065476*(num_ants*num_ants)) - 0.01
        if num_ants <= 60:
            time_to_spare = (ants.turntime / 1000.0) - 0.03
        else:
            time_to_spare = (ants.turntime / 1000.0) - (
                -0.003512 + 0.00047632 * num_ants - 0.00000105286 *
                (num_ants * num_ants)) - 0.005

        a = None
        a = valueIterationAgents.ValueIterationAgent(ants.turn_start_time,
                                                     time_to_spare, mdp,
                                                     opts['discount'],
                                                     opts['iters'])

        #TIME TIME TIME TIME
        #t1 = time.time()

        for ant_loc in ants.my_ants():
            antcol, antrow = ant_loc
            antcol = ants.rows - antcol - 1
            inverted_ant_loc = (antrow, antcol)
            if (a.getQValue(inverted_ant_loc, 'north') == a.getQValue(
                    inverted_ant_loc, 'south') == a.getQValue(
                        inverted_ant_loc, 'east') == a.getQValue(
                            inverted_ant_loc, 'west')):
                direct = random.choice('sewn')
                do_move_location(ant_loc, direct)
            elif a.getPolicy(inverted_ant_loc) == 'north':
                direct = 'n'
                do_move_location(ant_loc, direct)

            elif a.getPolicy(inverted_ant_loc) == 'south':
                direct = 's'
                do_move_location(ant_loc, direct)
            elif a.getPolicy(inverted_ant_loc) == 'east':
                direct = 'e'
                do_move_location(ant_loc, direct)
            elif a.getPolicy(inverted_ant_loc) == 'west':
                direct = 'w'
                do_move_location(ant_loc, direct)
            else:
                direct = random.choice('sewn')
                do_move_location(ant_loc, direct)

        #TIME 2 TIME 2 TIME 2
#t2 = time.time() - t1
        print >> sys.stderr, 'turn: ', self.turn, 'ants :', num_ants, 'spare:', time_to_spare, 'time:', (
            time.time() - ants.turn_start_time)
        sys.stderr.flush()

        # unblock own hill
        for hill_loc in ants.my_hills():
            if hill_loc in ants.my_ants() and hill_loc not in orders.values():
                for direction in ('s', 'e', 'w', 'n'):
                    if do_move_direction(hill_loc, direction):
                        break
Example #12
    sol = solvers.lp(matrix(c), matrix(A), matrix(b))
    rewards = sol['x'][:n_states]
    rewards = utils.normalize(rewards) * R_MAX

    return rewards


if __name__ == '__main__':

    print("\n*** Gridworld: Value Iteration demo ***\n")

    # Create gridworld
    trans_prob = 0.7
    size_grid = 10
    gamma = 0.5
    gw = gridworld.Gridworld(size_grid, trans_prob)

    # Convert the gridworld into finite discrete MDP format
    n_states, n_actions, p_trans, rewards, terminal_state_1d = gw.get_MDP_format()

    # Run the value iteration algorithm
    v_states = value_iteration.run_value_iteration(n_states, n_actions,
                                                   p_trans, rewards,
                                                   terminal_state_1d, gamma)
    v_states = np.reshape(v_states, gw.grid.shape, order='F')

    # Find the optimal policy
    policy_opt = value_iteration.get_optimal_policy(n_states, n_actions,
                                                    p_trans, rewards,
                                                    terminal_state_1d, gamma)
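A possible follow-up inside the same demo (a sketch that only reuses names defined above) lays the flat policy out on the grid next to the state values:

    # Sketch: reshape the per-state action indices onto the grid, like v_states.
    policy_grid = np.reshape(policy_opt, gw.grid.shape, order='F')
    print("State values:\n", v_states)
    print("Optimal policy (action indices):\n", policy_grid)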
Example #13
def main(grid_size, discount):
    """
    Run multi-agent linear programming inverse reinforcement learning on the gridworld MG.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MG discount factor. float.
    """

    play_num = 2

    gw = gridworld.Gridworld(play_num, grid_size, discount)
    act = np.array(gw.actions)

    policy_tu = [((0,1),(1,0)),((0,1),(0,1)),((0,1),(1,0)),((0,1),(0,-1)),\
            ((0,1),(0,1)),((-1,0),(0,1)),((-1,0),(-1,0)),((-1,0),(-1,0)),\
                ((-1,0),(0,1)),((-1,0),(0,1)),((-1,0),(0,1)),((-1,0),(0,1)),\
            ((-1,0),(1,0)),((-1,0),(-1,0)),((-1,0),(1,0)),((-1,0),(0,-1))]

    policy = np.zeros(gw.n_states, dtype=int)
    for i in range(gw.n_states):
        policy[i] = int(tu_action(act, policy_tu[i]))

    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])

    print(ground_r)

    r = linear_irl.irl(gw.n_players, gw.actions, gw.n_states, gw.n_actions,
                       gw.transition_probability, policy, gw.discount, 10, 0)

    print(r)

    a = ground_r[:, 0].reshape((4, 4))
    b = ground_r[:, 1].reshape((4, 4))
    a_1 = r[:16].reshape((4, 4))
    b_1 = r[16:].reshape((4, 4))

    print(b, b_1)

    fig, axes = plt.subplots(nrows=1, ncols=2)

    im1 = axes.flat[0].imshow(a)
    axes.flat[0].set_xlabel("B's square", fontsize=13)
    axes.flat[0].set_ylabel("A's square", fontsize=13)
    axes.flat[0].set_title("A's reward", fontsize=13)
    im2 = axes.flat[1].imshow(b)
    axes.flat[1].set_xlabel("B's square", fontsize=13)
    axes.flat[1].set_title("B's reward", fontsize=13)

    fig.subplots_adjust(right=0.8)
    cbar_ax = fig.add_axes([0.85, 0.25, 0.05, 0.5])
    fig.colorbar(im1, cax=cbar_ax)
    plt.show()

    plt.subplot(2, 2, 3)
    plt.pcolormesh(a_1)
    plt.colorbar()
    plt.title("a_1")
    plt.subplot(2, 2, 4)
    plt.pcolormesh(b_1)
    plt.colorbar()
    plt.title("b_1")
    plt.show()
Example #14
 def __init__(self, question, testDict):
     super(PolicyIterationTest, self).__init__(question, testDict)
     self.discount = float(testDict['discount'])
     self.grid = gridworld.Gridworld(parseGrid(testDict['grid']))
     if 'livingReward' in testDict: self.grid.setLivingReward(float(testDict['livingReward']))
 def test_manual_sums_to_one(self):
     """Tests issue #1 on GitHub."""
     gw = gridworld.Gridworld(5, 0.3, 0.2)
     self.assertTrue(
         np.isclose(gw.transition_probability.sum(axis=2), 1).all())
def make_random_gridworld():
    grid_size = rn.randint(2, 15)
    wind = rn.uniform(0.0, 1.0)
    discount = rn.uniform(0.0, 1.0)
    return gridworld.Gridworld(grid_size, wind, discount)
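A hedged property check in the spirit of the unit test above (it assumes make_random_gridworld and its rn/gridworld imports are in scope, with rn being numpy.random):

import numpy as np

def test_random_gridworld_sums_to_one():
    # Transition probabilities out of every (state, action) pair should sum to 1.
    gw = make_random_gridworld()
    assert np.isclose(gw.transition_probability.sum(axis=2), 1).all()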