def sample_ns_na(self, policy, action=None, start_trajectory=False):
        ''' Given a policy, sample the next state and next action along the trajectory followed by that policy.
        * Noise is added when selecting the action:
          with probability 1 - self.epsilon, follow the policy;
          with probability self.epsilon, pick a uniformly random action from the possible actions.
        * If start_trajectory is True, the initial state is sampled from the domain's s0() function;
          otherwise the given action is taken in the current state.
        '''
        if start_trajectory:
            ns, terminal, possible_actions = self.domain.s0()
        else:
            _, ns, terminal, possible_actions = self.domain.step(action)

        if np.random.rand() > self.epsilon:
            na = policy.pi(ns, terminal, possible_actions)
        else:
            na = randSet(possible_actions)

        return ns, na, terminal
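
A minimal sketch of how this sampler could drive a full rollout; the rollout helper, agent, and horizon names below are illustrative assumptions, not part of the original code:

def rollout(agent, policy, horizon=100):
    # 'agent' is assumed to expose sample_ns_na as defined above.
    s, a, terminal = agent.sample_ns_na(policy, start_trajectory=True)
    trajectory = [(s, a)]
    for _ in range(horizon):
        if terminal:
            break
        s, a, terminal = agent.sample_ns_na(policy, action=a)
        trajectory.append((s, a))
    return trajectory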
    def solve(self):
        """Solve the domain MDP."""

        # Used to report the total time taken by the process
        self.start_time = clock()
        bellmanUpdates = 0
        converged = False
        iteration = 0
        # Track the number of consecutive trajectories with very small observed
        # Bellman error
        converged_trajectories = 0
        while self.hasTime() and not converged:

            # Generate a new episode epsilon-greedily using the current value estimates
            max_Bellman_Error = 0
            step = 0
            terminal = False
            s, terminal, p_actions = self.domain.s0()
            a = self.representation.bestAction(
                s, terminal, p_actions
            ) if np.random.rand() > self.epsilon else randSet(p_actions)
            while not terminal and step < self.domain.episodeCap and self.hasTime():
                new_Q = self.representation.Q_oneStepLookAhead(
                    s, a, self.ns_samples)
                phi_s = self.representation.phi(s, terminal)
                phi_s_a = self.representation.phi_sa(s, terminal, a, phi_s)
                old_Q = np.dot(phi_s_a, self.representation.weight_vec)
                bellman_error = new_Q - old_Q

                # print s, old_Q, new_Q, bellman_error
                self.representation.weight_vec += self.alpha * bellman_error * phi_s_a
                bellmanUpdates += 1
                step += 1

                # Discover features if the representation has a discover method
                discover_func = getattr(
                    self.representation, 'discover', None
                )  # None is returned if the representation has no discover attribute
                if discover_func and callable(discover_func):
                    self.representation.discover(phi_s, bellman_error)

                max_Bellman_Error = max(max_Bellman_Error, abs(bellman_error))
                # Simulate new state and action on trajectory
                _, s, terminal, p_actions = self.domain.step(a)
                a = self.representation.bestAction(
                    s, terminal, p_actions
                ) if np.random.rand() > self.epsilon else randSet(p_actions)

            # check for convergence
            iteration += 1
            if max_Bellman_Error < self.convergence_threshold:
                converged_trajectories += 1
            else:
                converged_trajectories = 0
            performance_return, performance_steps, performance_term, performance_discounted_return = self.performanceRun()
            converged = converged_trajectories >= self.MIN_CONVERGED_TRAJECTORIES
            self.logger.info(
                'PI #%d [%s]: BellmanUpdates=%d, ||Bellman_Error||=%0.4f, Return=%0.4f, Steps=%d, Features=%d'
                % (iteration, hhmmss(deltaT(self.start_time)), bellmanUpdates,
                   max_Bellman_Error, performance_return, performance_steps,
                   self.representation.features_num))
            if self.show:
                self.domain.show(a, representation=self.representation, s=s)

            # store stats
            self.result["bellman_updates"].append(bellmanUpdates)
            self.result["return"].append(performance_return)
            self.result["planning_time"].append(deltaT(self.start_time))
            self.result["num_features"].append(
                self.representation.features_num)
            self.result["steps"].append(performance_steps)
            self.result["terminated"].append(performance_term)
            self.result["discounted_return"].append(
                performance_discounted_return)
            self.result["iteration"].append(iteration)

        if converged:
            self.logger.info('Converged!')
        super(TrajectoryBasedValueIteration, self).solve()
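
The core of each sweep above is a linear TD-style update toward a one-step Bellman lookahead target. A stripped-down sketch of that single update, with weights, features, and target_q as illustrative stand-ins for the representation internals:

import numpy as np

def bellman_update(weights, features, target_q, alpha):
    # One gradient-style update of a linear Q approximation.
    # weights and features are 1-D arrays of equal length;
    # target_q is the one-step Bellman lookahead estimate for (s, a).
    old_q = np.dot(features, weights)      # current estimate of Q(s, a)
    bellman_error = target_q - old_q       # Bellman / TD error
    weights = weights + alpha * bellman_error * features
    return weights, bellman_error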
Example 4
    def pi2(self, s, terminal, p_actions):
        domain = self.representation.domain
        if className(domain) not in self.supportedDomains:
            print("ERROR: There is no fixed policy defined for %s" %
                  className(domain))
            return None

        if className(domain) == 'GridWorld':
            # Actions are Up, Down, Left, Right
            if self.policyName not in self.gridWorldPolicyNames:
                print("Error: There is no GridWorld policy with name %s" %
                      self.policyName)
                return None

            if self.policyName == 'cw_circle':
                # Cycle through actions, starting with 0, causing agent to go
                # in loop
                if not hasattr(self, "curAction"):
                    # it doesn't exist yet, so initialize it [immediately
                    # incremented]
                    self.curAction = 0
                while self.curAction not in domain.possibleActions(s):
                    # We can't do something simple because of the order in which actions are defined
                    # must do switch statement
                    if self.curAction == 0:  # up
                        self.curAction = 3
                    elif self.curAction == 3:  # right
                        self.curAction = 1
                    elif self.curAction == 1:  # down
                        self.curAction = 2
                    elif self.curAction == 2:  # left
                        self.curAction = 0
                    else:
                        print(
                            'Something terrible happened...got an invalid action on GridWorld Fixed Policy'
                        )
    #                 self.curAction = self.curAction % domain.actions_num
            elif self.policyName == 'ccw_circle':
                # Cycle through actions, starting with 0, causing agent to go
                # in loop
                if not hasattr(self, "curAction"):
                    # it doesn't exist yet, so initialize it
                    self.curAction = 1
                while self.curAction not in domain.possibleActions(s):
                    # We can't do something simple because of the order in which actions are defined
                    # must do switch statement
                    if self.curAction == 3:  # right
                        self.curAction = 0
                    elif self.curAction == 0:  # up
                        self.curAction = 2
                    elif self.curAction == 2:  # left
                        self.curAction = 1
                    elif self.curAction == 1:  # down
                        self.curAction = 3
                    else:
                        print(
                            'Something terrible happened...got an invalid action on GridWorld Fixed Policy'
                        )

    #                 self.curAction = self.curAction % domain.actions_num

            else:
                print(
                    "Error: No policy defined with name %s, but listed in gridWorldPolicyNames"
                    % self.policyName)
                print(
                    "You need to create a switch statement for the policy name above, or remove it from gridWorldPolicyNames"
                )
                return None
            return self.curAction

# Cycle through actions, starting with 0, causing agent to go in other direction
#             if not hasattr(pi, "curAction"):
# pi.curAction = domain.actions_num-1  # it doesn't exist yet, so initialize it
#             if not(pi.curAction in domain.possibleActions(s)):
#                 pi.curAction -= 1
#                 if pi.curAction < 0: pi.curAction = domain.actions_num-1

        if className(domain) == 'InfCartPoleBalance':
            # Fixed policy rotate the pendulum in the opposite direction of the
            # thetadot
            theta, thetadot = s
            if thetadot > 0:
                return 2
            else:
                return 0
        if className(domain) == 'BlocksWorld':
            # Fixed policy for BlocksWorld = optimal policy: always pick the next
            # piece of the tower and move it onto the tower.
            # Policy: identify the top of the tower; with high probability move the
            # next piece onto the tower, otherwise take a random action.

            # Random Action with some probability
            # TODO fix isTerminal use here
            if self.random_state.rand() < .3 or domain.isTerminal():
                return randSet(domain.possibleActions(s))

            # non-Random Policy
            # next_block is the block that should be stacked on the top of the tower
            # wrong_block is the highest block stacked on the top of the next_block
            # Wrong_tower_block is the highest stacked on the top of the tower
            blocks = domain.blocks
            # Length of the tower assumed to be built correctly.
            correct_tower_size = 0
            while True:
                # Check the next block
                block = correct_tower_size
                if (block == 0 and domain.on_table(block, s)) or domain.on(
                        block, block - 1, s):
                    # This block is on the right position, check the next block
                    correct_tower_size += 1
                else:
                    # print s
                    # print "Incorrect block:", block
                    # The block is on the wrong place.
                    # 1. Check if the tower is empty => If not take one block from the tower and put it on the table
                    # 2. check to see if this wrong block is empty => If not put one block from its stack and put on the table
                    # 3. Otherwise move this block on the tower

                    ###################
                    # 1
                    ###################
                    # If the first block is in the wrong place, then the tower
                    # top which is table is empty by definition
                    if block != 0:
                        ideal_tower_top = block - 1
                        tower_top = domain.towerTop(ideal_tower_top, s)
                        if tower_top != ideal_tower_top:
                            # There is a wrong block there hence we should put
                            # it on the table first
                            return (
                                # put the top of the tower on the table since
                                # it is not correct
                                domain.getActionPutAonTable(tower_top))
                    ###################
                    # 2
                    ###################
                    block_top = domain.towerTop(block, s)
                    if block_top != block:
                        # The target block to be stacked is not empty
                        return domain.getActionPutAonTable(block_top)
                    ###################
                    # 3
                    ###################
                    if block == 0:
                        return domain.getActionPutAonTable(block)
                    else:
                        return domain.getActionPutAonB(block, block - 1)
        if className(domain) == 'IntruderMonitoring':
            # Each UAV assigns itself to a target.
            # Each UAV finds the danger zone closest to its target and moves toward it.
            # If the number of UAVs exceeds the number of targets, the rest hold position.
            # Move all agents based on the chosen actions.
            agents = np.array(s[:domain.NUMBER_OF_AGENTS * 2].reshape(-1, 2))
            targets = np.array(s[domain.NUMBER_OF_AGENTS * 2:].reshape(-1, 2))
            zones = domain.danger_zone_locations
            # Default action is hold
            actions = np.ones(len(agents), dtype=np.integer) * 4
            planned_agents_num = min(len(agents), len(targets))
            for i in range(planned_agents_num):
                # Find the closest zone (Manhattan distance) to the corresponding target
                target = targets[i, :]
                distances = np.sum(
                    np.abs(np.tile(target, (len(zones), 1)) - zones), axis=1)
                z_row, z_col = zones[np.argmin(distances), :]
                # find the valid action
                a_row, a_col = agents[i, :]
                a = 4  # hold as a default action
                if a_row > z_row:
                    a = 0  # up
                if a_row < z_row:
                    a = 1  # down
                if a_col > z_col:
                    a = 2  # left
                if a_col < z_col:
                    a = 3  # right
                actions[i] = a
#                print "Agent=", agents[i,:]
#                print "Target", target
#                print "Zone", zones[argmin(distances),:]
#                print "Action", a
#                print '============'
            return vec2id(actions, np.ones(len(agents), dtype=np.integer) * 5)
        if className(domain) == 'SystemAdministrator':
            # Select a broken computer and reset it
            brokenComputers = np.where(s == 0)[0]
            if len(brokenComputers):
                return randSet(brokenComputers)
            else:
                return domain.computers_num
        if className(domain) == 'MountainCar':
            # Accelerate in the direction of the valley
            # WORK IN PROGRESS
            x, xdot = s
            if xdot > 0:
                return 2
            else:
                return 0
        if className(domain) == 'PST':
            # One UAV stays at comm, the other n-1 stay at the target area. Whenever
            # fuel is too low to reach the base, they move back.
            print(s)
            s = domain.state2Struct(s)
            uavs = domain.NUM_UAV
            print(s)
            return vec2id(np.zeros(uavs), np.ones(uavs) * 3)
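
Both the IntruderMonitoring and PST branches above pack a per-agent action vector into a single joint-action id via vec2id. A plausible mixed-radix encoding of that kind is sketched below; this is an assumption about what such a helper does, not the library's actual implementation:

import numpy as np

def vec_to_id(vec, limits):
    # Map a vector of digits (vec[i] in range(limits[i])) to one integer
    # by treating it as a mixed-radix number. Illustrative only.
    idx = 0
    for digit, base in zip(vec, limits):
        idx = idx * int(base) + int(digit)
    return idx

# e.g. three agents with five actions each: [0, 4, 2] -> 0*25 + 4*5 + 2 = 22
assert vec_to_id(np.array([0, 4, 2]), np.array([5, 5, 5])) == 22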
Example 5
    def solve(self):
        """Solve the domain MDP."""

        # Used to report the total time taken by the process
        self.start_time = clock()
        bellmanUpdates = 0
        converged = False
        iteration = 0
        # Track the number of consecutive trajectories with very small observed
        # Bellman error
        converged_trajectories = 0
        while self.hasTime() and not converged:

            # Generate a new episode epsilon-greedily using the current value estimates
            max_Bellman_Error = 0
            step = 0
            terminal = False
            s, terminal, p_actions = self.domain.s0()
            a = self.representation.bestAction(
                s, terminal, p_actions
            ) if np.random.rand() > self.epsilon else randSet(p_actions)
            while not terminal and step < self.domain.episodeCap and self.hasTime():
                new_Q = self.representation.Q_oneStepLookAhead(s, a, self.ns_samples)
                phi_s = self.representation.phi(s, terminal)
                phi_s_a = self.representation.phi_sa(s, terminal, a, phi_s)
                old_Q = np.dot(phi_s_a, self.representation.weight_vec)
                bellman_error = new_Q - old_Q

                # print s, old_Q, new_Q, bellman_error
                self.representation.weight_vec += self.alpha * bellman_error * phi_s_a
                bellmanUpdates += 1
                step += 1

                # Discover features if the representation has a discover method
                discover_func = getattr(self.representation, 'discover', None)  # None is returned if the representation has no discover attribute
                if discover_func and callable(discover_func):
                    self.representation.discover(phi_s, bellman_error)

                max_Bellman_Error = max(max_Bellman_Error, abs(bellman_error))
                # Simulate new state and action on trajectory
                _, s, terminal, p_actions = self.domain.step(a)
                a = self.representation.bestAction(s, terminal, p_actions) if np.random.rand() > self.epsilon else randSet(p_actions)

            # check for convergence
            iteration += 1
            if max_Bellman_Error < self.convergence_threshold:
                converged_trajectories += 1
            else:
                converged_trajectories = 0
            performance_return, performance_steps, performance_term, performance_discounted_return = self.performanceRun()
            converged = converged_trajectories >= self.MIN_CONVERGED_TRAJECTORIES
            self.logger.info(
                'PI #%d [%s]: BellmanUpdates=%d, ||Bellman_Error||=%0.4f, Return=%0.4f, Steps=%d, Features=%d'
                % (iteration, hhmmss(deltaT(self.start_time)), bellmanUpdates,
                   max_Bellman_Error, performance_return, performance_steps,
                   self.representation.features_num))
            if self.show:
                self.domain.show(a, representation=self.representation, s=s)

            # store stats
            self.result["bellman_updates"].append(bellmanUpdates)
            self.result["return"].append(performance_return)
            self.result["planning_time"].append(deltaT(self.start_time))
            self.result["num_features"].append(self.representation.features_num)
            self.result["steps"].append(performance_steps)
            self.result["terminated"].append(performance_term)
            self.result["discounted_return"].append(performance_discounted_return)
            self.result["iteration"].append(iteration)

        if converged:
            self.logger.info('Converged!')
        super(TrajectoryBasedValueIteration, self).solve()
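
The solver probes the representation for an optional discover hook via getattr and callable, so any representation exposing such a method gets a chance to add features online. A minimal stub satisfying that duck-typed contract might look like this (class name and threshold are hypothetical):

class ExpandableRepresentation:
    # Hypothetical stub compatible with the discover hook used above.
    def __init__(self, features_num):
        self.features_num = features_num

    def discover(self, phi_s, bellman_error):
        # Placeholder heuristic, not the library's actual rule:
        # grow the feature set when the observed Bellman error is large.
        if abs(bellman_error) > 1.0:
            self.features_num += 1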
Example 6
    def pi2(self, s, terminal, p_actions):
        domain = self.representation.domain
        if className(domain) not in self.supportedDomains:
            print("ERROR: There is no fixed policy defined for %s" %
                  className(domain))
            return None

        if className(domain) == 'GridWorld':
            # Actions are Up, Down, Left, Right
            if self.policyName not in self.gridWorldPolicyNames:
                print("Error: There is no GridWorld policy with name %s" %
                      self.policyName)
                return None

            if self.policyName == 'cw_circle':
                # Cycle through actions, starting with 0, causing agent to go
                # in loop
                if not hasattr(self, "curAction"):
                    # it doesn't exist yet, so initialize it [immediately
                    # incremented]
                    self.curAction = 0
                while self.curAction not in domain.possibleActions(s):
                    # We can't do something simple because of the order in which actions are defined
                    # must do switch statement
                    if self.curAction == 0:  # up
                        self.curAction = 3
                    elif self.curAction == 3:  # right
                        self.curAction = 1
                    elif self.curAction == 1:  # down
                        self.curAction = 2
                    elif self.curAction == 2:  # left
                        self.curAction = 0
                    else:
                        print(
                            'Something terrible happened...got an invalid action on GridWorld Fixed Policy'
                        )
    #                 self.curAction = self.curAction % domain.actions_num
            elif self.policyName == 'ccw_circle':
                # Cycle through actions, starting with 0, causing agent to go
                # in loop
                if not hasattr(self, "curAction"):
                    # it doesn't exist yet, so initialize it
                    self.curAction = 1
                while self.curAction not in domain.possibleActions(s):
                    # We can't do something simple because of the order in which actions are defined
                    # must do switch statement
                    if self.curAction == 3:  # right
                        self.curAction = 0
                    elif self.curAction == 0:  # up
                        self.curAction = 2
                    elif self.curAction == 2:  # left
                        self.curAction = 1
                    elif self.curAction == 1:  # down
                        self.curAction = 3
                    else:
                        print(
                            'Something terrible happened...got an invalid action on GridWorld Fixed Policy'
                        )
    #                 self.curAction = self.curAction % domain.actions_num

            else:
                print "Error: No policy defined with name %s, but listed in gridWorldPolicyNames" % self.policyName
                print "You need to create a switch statement for the policy name above, or remove it from gridWorldPolicyNames"
                return None
            return self.curAction

# Cycle through actions, starting with 0, causing agent to go in other direction
#             if not hasattr(pi, "curAction"):
# pi.curAction = domain.actions_num-1  # it doesn't exist yet, so initialize it
#             if not(pi.curAction in domain.possibleActions(s)):
#                 pi.curAction -= 1
#                 if pi.curAction < 0: pi.curAction = domain.actions_num-1

        if className(domain) == 'InfCartPoleBalance':
            # Fixed policy rotate the pendulum in the opposite direction of the
            # thetadot
            theta, thetadot = s
            if thetadot > 0:
                return 2
            else:
                return 0
        if className(domain) == 'BlocksWorld':
            # Fixed policy for BlocksWorld = optimal policy: always pick the next
            # piece of the tower and move it onto the tower.
            # Policy: identify the top of the tower; with high probability move the
            # next piece onto the tower, otherwise take a random action.

            # Random Action with some probability
            # TODO fix isTerminal use here
            if np.random.rand() < .3 or domain.isTerminal():
                return randSet(domain.possibleActions(s))

            # non-Random Policy
            # next_block is the block that should be stacked on the top of the tower
            # wrong_block is the highest block stacked on the top of the next_block
            # Wrong_tower_block is the highest stacked on the top of the tower
            blocks = domain.blocks
            # Length of the tower assumed to be built correctly.
            correct_tower_size = 0
            while True:
                # Check the next block
                block = correct_tower_size
                if (block == 0 and domain.on_table(block, s)) or domain.on(block, block - 1, s):
                    # This block is on the right position, check the next block
                    correct_tower_size += 1
                else:
                    # print s
                    # print "Incorrect block:", block
                    # The block is on the wrong place.
                    # 1. Check if the tower is empty => If not take one block from the tower and put it on the table
                    # 2. check to see if this wrong block is empty => If not put one block from its stack and put on the table
                    # 3. Otherwise move this block on the tower

                    ###################
                    # 1
                    ###################
                    # If the first block is in the wrong place, then the tower
                    # top which is table is empty by definition
                    if block != 0:
                        ideal_tower_top = block - 1
                        tower_top = domain.towerTop(ideal_tower_top, s)
                        if tower_top != ideal_tower_top:
                            # There is a wrong block there hence we should put
                            # it on the table first
                            return (
                                # put the top of the tower on the table since
                                # it is not correct
                                domain.getActionPutAonTable(tower_top)
                            )
                    ###################
                    # 2
                    ###################
                    block_top = domain.towerTop(block, s)
                    if block_top != block:
                        # The target block to be stacked is not empty
                        return domain.getActionPutAonTable(block_top)
                    ###################
                    # 3
                    ###################
                    if block == 0:
                        return domain.getActionPutAonTable(block)
                    else:
                        return domain.getActionPutAonB(block, block - 1)
        if className(domain) == 'IntruderMonitoring':
            # Each UAV assigns itself to a target.
            # Each UAV finds the danger zone closest to its target and moves toward it.
            # If the number of UAVs exceeds the number of targets, the rest hold position.
            # Move all agents based on the chosen actions.
            agents = np.array(s[:domain.NUMBER_OF_AGENTS * 2].reshape(-1, 2))
            targets = np.array(s[domain.NUMBER_OF_AGENTS * 2:].reshape(-1, 2))
            zones = domain.danger_zone_locations
            # Default action is hold
            actions = np.ones(len(agents), dtype=np.integer) * 4
            planned_agents_num = min(len(agents), len(targets))
            for i in range(planned_agents_num):
                # Find the closest zone (Manhattan distance) to the corresponding target
                target = targets[i, :]
                distances = np.sum(
                    np.abs(np.tile(target, (len(zones), 1)) - zones), axis=1)
                z_row, z_col = zones[np.argmin(distances), :]
                # find the valid action
                a_row, a_col = agents[i, :]
                a = 4  # hold as a default action
                if a_row > z_row:
                    a = 0  # up
                if a_row < z_row:
                    a = 1  # down
                if a_col > z_col:
                    a = 2  # left
                if a_col < z_col:
                    a = 3  # right
                actions[i] = a
#                print "Agent=", agents[i,:]
#                print "Target", target
#                print "Zone", zones[argmin(distances),:]
#                print "Action", a
#                print '============'
            return vec2id(actions, np.ones(len(agents), dtype=np.integer) * 5)
        if className(domain) == 'SystemAdministrator':
            # Select a broken computer and reset it
            brokenComputers = np.where(s == 0)[0]
            if len(brokenComputers):
                return randSet(brokenComputers)
            else:
                return domain.computers_num
        if className(domain) == 'MountainCar':
            # Accelerate in the direction of the valley
            # WORK IN PROGRESS
            x, xdot = s
            if xdot > 0:
                return 2
            else:
                return 0
        if className(domain) == 'PST':
            # One UAV stays at comm, the other n-1 stay at the target area. Whenever
            # fuel is too low to reach the base, they move back.
            print(s)
            s = domain.state2Struct(s)
            uavs = domain.NUM_UAV
            print(s)
            return vec2id(np.zeros(uavs), np.ones(uavs) * 3)
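
A short sketch of how such a fixed policy might be evaluated on its domain; the helper below assumes the policy and domain interfaces used above (pi2, s0, step) and is illustrative only:

def evaluate_fixed_policy(policy, domain, max_steps=1000):
    # Roll out the fixed policy once and return the undiscounted return.
    total_reward = 0.0
    s, terminal, p_actions = domain.s0()
    for _ in range(max_steps):
        if terminal:
            break
        a = policy.pi2(s, terminal, p_actions)
        if a is None:  # unsupported domain or policy name
            break
        r, s, terminal, p_actions = domain.step(a)
        total_reward += r
    return total_reward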