Example #1
    def __init__(self, epsilon=0.1, alpha=0.5, gamma=0.9, discount=0.9):
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.discount = discount
        self.legalActions = Const.ACTIONS
        self.featExtractor = DangerExtractorInstance()
        self.featExtractor = SimpleExtractor()  # note: this reassignment overrides the DangerExtractorInstance above
        self.weights = [0] * self.featExtractor.featureNum
        self.game_state = None

        self.train_episodes = 10000

        self.env = Env()
Example #2
def mat_features(states, extractor=SimpleExtractor(), ftrs=None):
    """Transform a list of states in state matrices"""
    if isinstance(states, np.ndarray):
        m = list(map(functools.partial(extractor.getMatrixFeatures, features=ftrs), states))
        return m
    else:
        return [extractor.getMatrixFeatures(states, features=ftrs)]
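The docstring above is terse; concretely, when states is an np.ndarray the function maps extractor.getMatrixFeatures over every element, while any other input is treated as a single state and wrapped in a one-element list. Below is a minimal usage sketch with a hypothetical stub extractor, since the real SimpleExtractor.getMatrixFeatures is not shown in this snippet:

import numpy as np

class StubExtractor:
    """Hypothetical stand-in for SimpleExtractor: returns one 2x2 'matrix' per state."""
    def getMatrixFeatures(self, state, features=None):
        return np.full((2, 2), state)

print(len(mat_features(np.array([1, 2, 3]), extractor=StubExtractor())))  # 3 matrices, one per state
print(len(mat_features(7, extractor=StubExtractor())))                    # single state -> list of 1 matrix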
Example #3
    def __init__(self, **args):
        "You can initialize Q-values here..."
        ReinforcementAgent.__init__(self, **args)
        self.weights = util.Counter()

        #100 episodes of training
        self.weights['enemy'] = -10
        self.weights['bias'] = -10
        self.weights['bullet'] = -200
        self.weights['edge'] = -10
        self.weights['hitEnemy'] = 10
        self.weights['moveFoward'] = 10

        self.legalActions = Const.ACTIONS
        self.featExtractor = SimpleExtractor()
        self.lastAction = 1
Example #4
    def __init__(self,
                 epsilon=0.05,
                 gamma=0.8,
                 alpha=0.2,
                 numTraining=900,
                 extractor=SimpleExtractor(),
                 **args):
        "You can initialize Q-values here..."

        args['epsilon'] = epsilon
        args['gamma'] = gamma
        args['alpha'] = alpha
        args['numTraining'] = numTraining
        self.featExtractor = extractor
        self.index = 0  # This is always Pacman
        self.weights = CustomCounter()
        self.q_values = CustomCounter()
        self.lastAction = None
        ReinforcementAgent.__init__(self,
                                    epsilon=epsilon,
                                    gamma=gamma,
                                    alpha=alpha,
                                    numTraining=numTraining)

        "*** YOUR CODE HERE ***"
Example #5
def enhancedPacmanFeatures(state, action):
    """
    For each state, this function is called with each legal action.
    It should return a counter with { <feature name> : <feature value>, ... }
    """
    # features = util.Counter()
    "*** YOUR CODE HERE ***"
    # successor = state.generateSuccessor(0, action)
    # foodCount = successor.getFood().count()
    # features['foodCount'] = foodCount
    #features = neuralDistances(state, action)
    featureExtract = SimpleExtractor()
    features = featureExtract.getFeatures(state, action)
    #print("from enhanced")
    #print(features)
    #print("features: ",features.items())
    # it looks like 'capsule 0' is the problem; its value is a list of values, not an int
    return features
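The comment above flags that a feature such as 'capsule 0' can carry a list of values rather than a single number, which breaks a linear dot product with the weights. One possible workaround, shown as a hypothetical sketch with a plain dict (the real code would keep using util.Counter), is to expand each list-valued feature into indexed scalar entries:

def flatten_list_features(features):
    """Hypothetical helper: expand list-valued features into indexed scalars,
    e.g. {'capsule 0': [3, 5]} -> {'capsule 0[0]': 3, 'capsule 0[1]': 5}."""
    flat = {}
    for name, value in features.items():
        if isinstance(value, (list, tuple)):
            for i, v in enumerate(value):
                flat["%s[%d]" % (name, i)] = v
        else:
            flat[name] = value
    return flat

print(flatten_list_features({'capsule 0': [3, 5], 'foodCount': 12}))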
Example #6
    def __init__(self, epsilon=0.1, alpha=0.5, gamma=0.9, discount=0.9):
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.discount = discount
        self.legalActions = Const.ACTIONS
        self.featExtractor = SimpleExtractor()
        self.featExtractor2 = PositionExtractor()
        self.weights = util.Counter()
        self.game_state = None

        self.env = Env()

        self.input_num = self.featExtractor2.getFeatureNum()
        self.hidden_num = 100
        self.output_num = 6
        self.W1 = np.random.rand(self.input_num, self.hidden_num)
        self.W2 = np.random.rand(self.hidden_num, self.output_num)
Example #7
class QLearningAgent(ReinforcementAgent):
    def __init__(self, **args):
        "You can initialize Q-values here..."
        ReinforcementAgent.__init__(self, **args)
        self.weights = util.Counter()

        #100 episodes of training
        self.weights['enemy'] = -10
        self.weights['bias'] = -10
        self.weights['bullet'] = -200
        self.weights['edge'] = -10
        self.weights['hitEnemy'] = 10
        self.weights['moveFoward'] = 10

        self.legalActions = Const.ACTIONS
        self.featExtractor = SimpleExtractor()
        self.lastAction = 1

    def getQValue(self, state, action):
        """
          Returns Q(state,action)
          Should return 0.0 if we have never seen a state
          or the Q node value otherwise
        """
        "*** YOUR CODE HERE ***"
        qvalue = 0.0
        for feature_name, value in self.featExtractor.getFeatures(
                state, action).iteritems():
            qvalue += value * self.weights[feature_name]
        return qvalue

    def computeValueFromQValues(self, state):
        """
          Returns max_action Q(state,action)
          where the max is over legal actions.  Note that if
          there are no legal actions, which is the case at the
          terminal state, you should return a value of 0.0.
        """
        "*** YOUR CODE HERE ***"
        max_next_qvalue = None
        for nextAction in self.legalActions:
            next_qvalue = self.getQValue(state, nextAction)
            if max_next_qvalue is None or max_next_qvalue < next_qvalue:
                max_next_qvalue = next_qvalue
        if max_next_qvalue is None:
            max_next_qvalue = 0.0

        return max_next_qvalue

    def computeActionFromQValues(self, state):
        """
          Compute the best action to take in a state.  Note that if there
          are no legal actions, which is the case at the terminal state,
          you should return None.
        """
        "*** YOUR CODE HERE ***"

        max_qvalue = None
        for action in self.legalActions:
            qvalue = self.getQValue(state, action)
            if max_qvalue is None or max_qvalue < qvalue:
                max_qvalue = qvalue

        if max_qvalue is None:
            return None

        actions = []
        for action in self.legalActions:
            qvalue = self.getQValue(state, action)
            if qvalue == max_qvalue:
                actions.append(action)

        if max_qvalue is not None and len(actions) == 0:
            return self.legalActions[0]
        if len(actions) > 1:
            return Const.DO_NOTHING
        return random.choice(actions)

    def getAction(self, state):
        """
          Compute the action to take in the current state.  With
          probability self.epsilon, we should take a random action and
          take the best policy action otherwise.  Note that if there are
          no legal actions, which is the case at the terminal state, you
          should choose None as the action.

          HINT: You might want to use util.flipCoin(prob)
          HINT: To pick randomly from a list, use random.choice(list)
        """
        # Pick Action
        "*** YOUR CODE HERE ***"
        # Epsilon greedy
        if util.flipCoin(self.epsilon):
            self.lastAction = random.choice(self.legalActions)
        else:
            self.lastAction = self.computeActionFromQValues(state)
        return self.lastAction

    def update(self, state, action, nextState, reward):
        """
          The parent class calls this to observe a
          state = action => nextState and reward transition.
          You should do your Q-Value update here

          NOTE: You should never call this function,
          it will be called on your behalf
        """
        "*** YOUR CODE HERE ***"
        diff = reward + self.discount * self.computeValueFromQValues(
            nextState) - self.getQValue(state, action)
        for feature_name, feature_value in self.featExtractor.getFeatures(
                state, action).iteritems():
            self.weights[feature_name] += self.alpha * diff * feature_value

    def getPolicy(self, state):
        return self.computeActionFromQValues(state)

    def getValue(self, state):
        return self.computeValueFromQValues(state)

    def final(self, state):
        print("Training Done")
        print("Total episodes: " + str(self.episodesSoFar))
        f = open('train_weight.txt', 'w')
        for feature, weight in self.weights.iteritems():
            f.write(str(feature) + " " + str(weight) + "\n")
        f.close()  # you can omit in most cases as the destructor will call it

        plt.plot(self.episodeRewardsList)
        plt.ylabel("Episode Reward")
        plt.show()
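Example #7 above represents Q(s, a) as a dot product of weights and features and, in update, nudges every weight by alpha * diff * feature_value, where diff = reward + discount * max_a' Q(s', a') - Q(s, a). The snippet below replays that update once with plain dicts and made-up feature values, purely to make the arithmetic concrete; the numbers are illustrative, not taken from the tank game.

alpha, discount = 0.5, 0.9
weights = {'bullet': -200.0, 'enemy': -10.0, 'bias': -10.0}

def q_value(features, weights):
    """Linear Q-value: sum of weight * feature over all active features."""
    return sum(weights.get(name, 0.0) * value for name, value in features.items())

features_sa = {'bullet': 0.2, 'enemy': 0.5, 'bias': 1.0}  # hypothetical f(s, a)
reward = -50.0
max_next_q = -30.0                                         # hypothetical max_a' Q(s', a')

diff = reward + discount * max_next_q - q_value(features_sa, weights)   # -22.0 here
for name, value in features_sa.items():
    weights[name] += alpha * diff * value

print(weights)  # every active weight moves further negative, as expected for a bad transition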
Example #8
import numpy as np
import gym
import matplotlib.pyplot as plt
from tanks import Env
from const import Const
from featureExtractors import DangerExtractor
from featureExtractors import SimpleExtractor

# env = gym.make('CartPole-v0')
train_episodes = 100
learning_rate = 0.01
level_type = "minimal"
game_speed = 1000
env = Env(level_type, game_speed, train_episodes)
gamma = 0.99
featExtractor = SimpleExtractor()


def discount_rewards(r):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r
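A quick sanity check of discount_rewards, assuming the module-level gamma = 0.99 defined above: a reward of 1 earned only at the final step is discounted backwards through the episode.

r = np.array([0.0, 0.0, 1.0])
print(discount_rewards(r))  # -> [0.9801, 0.99, 1.0]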


class agent():
    def __init__(self, lr, s_size, a_size, h_size):
        # These lines establish the feed-forward part of the network. The agent takes a state and produces an action.
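The snippet ends mid-constructor, but the comment describes the intent: a feed-forward network that maps a state vector to an action. A minimal numpy sketch of such a forward pass (one hidden layer, softmax over actions) is given below; the layer sizes, ReLU activation, and weight initialization are assumptions for illustration, not the original architecture.

def forward_policy(state, W1, W2):
    """Hypothetical forward pass: state -> ReLU hidden layer -> action probabilities."""
    hidden = np.maximum(0.0, state @ W1)
    scores = hidden @ W2
    exp_scores = np.exp(scores - np.max(scores))  # numerically stable softmax
    return exp_scores / exp_scores.sum()

s_size, h_size, a_size = 8, 16, 4                 # assumed sizes
rng = np.random.default_rng(0)
W1 = rng.normal(scale=0.1, size=(s_size, h_size))
W2 = rng.normal(scale=0.1, size=(h_size, a_size))
probs = forward_policy(rng.normal(size=s_size), W1, W2)
action = np.random.choice(a_size, p=probs)        # sample an action from the policy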
Example #9
class LinearQAgent:
    def __init__(self, epsilon=0.1, alpha=0.5, gamma=0.9, discount=0.9):
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.discount = discount
        self.legalActions = Const.ACTIONS
        self.featExtractor = DangerExtractorInstance()
        self.featExtractor = SimpleExtractor()  # note: this reassignment overrides the DangerExtractorInstance above
        self.weights = [0] * self.featExtractor.featureNum
        self.game_state = None

        self.train_episodes = 10000

        self.env = Env()

    def getQValue(self, state, action):
        qvalue = 0.0
        for idx, value in enumerate(
                self.featExtractor.getFeatures(state, action)):
            qvalue += value * self.weights[idx]
        return qvalue

    def getMaxQValue(self, state):
        max_next_qvalue = None
        for nextAction in self.legalActions:
            next_qvalue = self.getQValue(state, nextAction)
            if max_next_qvalue is None or max_next_qvalue < next_qvalue:
                max_next_qvalue = next_qvalue
        if max_next_qvalue is None:
            max_next_qvalue = 0.0

        return max_next_qvalue

    def computeActionFromQValues(self, state):
        max_qvalue = None
        for action in self.legalActions:
            qvalue = self.getQValue(state, action)
            if max_qvalue is None or max_qvalue < qvalue:
                max_qvalue = qvalue

        if max_qvalue is None:
            return None

        actions = []
        for action in self.legalActions:
            qvalue = self.getQValue(state, action)
            if qvalue == max_qvalue:
                actions.append(action)

        if max_qvalue is not None and len(actions) == 0:
            return self.legalActions[0]
        if len(actions) > 1:
            return Const.DO_NOTHING
        return random.choice(actions)

    def getAction(self, state):
        if util.flipCoin(self.epsilon):
            return random.choice(self.legalActions)
        return self.computeActionFromQValues(state)

    def step(self):
        ''' @return reward, episode_over '''
        if self.game_state is None:
            self.game_state, reward, episode_over, _ = self.env.step(0)
        else:
            action = self.getAction(self.game_state)
            next_game_state, reward, episode_over, _ = self.env.step(action)
            diff = reward + self.discount * self.getMaxQValue(
                next_game_state) - self.getQValue(self.game_state, action)
            for idx, feature_value in enumerate(
                    self.featExtractor.getFeatures(self.game_state, action)):
                self.weights[idx] += self.alpha * diff * feature_value
            self.game_state = next_game_state
        return reward, episode_over

    def start_episode(self, episode_num):
        self.env.reset()
        total_reward = 0.0
        while True:
            self.env.render()
            reward, episode_over = self.step()
            total_reward += reward
            if episode_over: break
        return total_reward

    def start_train(self):
        episode_cnt = 0
        total_reward = []
        while episode_cnt < self.train_episodes:
            episode_cnt += 1
            total_reward.append(self.start_episode(episode_cnt))
            if episode_cnt % 10 == 0:
                print("Episode #" + str(episode_cnt))
                print("10 avg: ", np.mean(total_reward[-10:]))
Example #10
class PolicyGradientAgent(ReinforcementAgent):
    ''' Actor-Critic Agent '''
    def __init__(self, **args):
        "You can initialize Q-values here..."
        ReinforcementAgent.__init__(self, **args)
        self.q_value_weights = util.Counter()
        self.policy_weights = util.Counter()

        #learning rate for policy weights
        self.beta = 0.1

        self.legalActions = Const.ACTIONS
        self.featExtractor = SimpleExtractor()
        self.lastAction = 1

        #100 episodes of training
        self.q_value_weights['enemy'] = -10
        self.q_value_weights['bias'] = -10
        self.q_value_weights['bullet'] = -200
        self.q_value_weights['edge'] = -10
        self.q_value_weights['hitEnemy'] = 10
        self.q_value_weights['moveFoward'] = 10

    def getQValue(self, state, action):
        """
          Returns Q(state,action)
          Should return 0.0 if we have never seen a state
          or the Q node value otherwise
        """
        "*** YOUR CODE HERE ***"
        qvalue = 0.0
        for feature_name, value in self.featExtractor.getFeatures(
                state, action).iteritems():
            qvalue += value * self.q_value_weights[feature_name]
        return qvalue

    def computeValueFromQValues(self, state):
        """
          Returns max_action Q(state,action)
          where the max is over legal actions.  Note that if
          there are no legal actions, which is the case at the
          terminal state, you should return a value of 0.0.
        """
        "*** YOUR CODE HERE ***"
        max_next_qvalue = None
        for nextAction in self.legalActions:
            next_qvalue = self.getQValue(state, nextAction)
            if max_next_qvalue is None or max_next_qvalue < next_qvalue:
                max_next_qvalue = next_qvalue
        if max_next_qvalue is None:
            max_next_qvalue = 0.0

        return max_next_qvalue

    def computeActionFromQValues(self, state):
        """
          Compute the best action to take in a state.  Note that if there
          are no legal actions, which is the case at the terminal state,
          you should return None.
        """
        "*** YOUR CODE HERE ***"

        max_qvalue = None
        for action in self.legalActions:
            qvalue = self.getQValue(state, action)
            if max_qvalue is None or max_qvalue < qvalue:
                max_qvalue = qvalue

        if max_qvalue is None:
            return None

        actions = []
        for action in self.legalActions:
            qvalue = self.getQValue(state, action)
            if qvalue == max_qvalue:
                actions.append(action)

        if max_qvalue is not None and len(actions) == 0:
            return self.legalActions[0]
        if len(actions) > 1:
            return Const.DO_NOTHING
        return random.choice(actions)

    def getAction(self, state):
        """
          Compute the action to take in the current state.  With
          probability self.epsilon, we should take a random action and
          take the best policy action otherwise.  Note that if there are
          no legal actions, which is the case at the terminal state, you
          should choose None as the action.

          HINT: You might want to use util.flipCoin(prob)
          HINT: To pick randomly from a list, use random.choice(list)
        """

        # Pick Action
        "*** YOUR CODE HERE ***"
        # Epsilon greedy
        if util.flipCoin(self.epsilon):
            return random.choice(self.legalActions)

        max_policy_value = None
        max_action = -1
        for action, value in self.softmaxPolicy(state).iteritems():
            if max_policy_value is None or max_policy_value < value:
                max_action = action
                max_policy_value = value
        return max_action

    def update(self, state, action, nextState, reward):
        """
          The parent class calls this to observe a
          state = action => nextState and reward transition.
          You should do your Q-Value update here

          NOTE: You should never call this function,
          it will be called on your behalf
        """
        "*** YOUR CODE HERE ***"

        #update q value weights (omega)
        diff = reward + self.discount * self.computeValueFromQValues(
            nextState) - self.getQValue(state, action)
        for feature_name, feature_value in self.featExtractor.getFeatures(
                state, action).iteritems():
            self.q_value_weights[
                feature_name] += self.alpha * diff * feature_value

        #update policy weights (theta)

        expectedFeatureValues = util.Counter()
        for action in self.legalActions:
            for feature_name, value in self.featExtractor.getFeatures(
                    state, action).iteritems():
                expectedFeatureValues[feature_name] += value
        for feature_name, value in expectedFeatureValues.iteritems():
            expectedFeatureValues[feature_name] /= len(self.legalActions)

        for feature_name, value in self.featExtractor.getFeatures(
                state, action).iteritems():
            scoreFunc = value - expectedFeatureValues[feature_name]
            self.policy_weights[
                feature_name] += self.beta * scoreFunc * self.getQValue(
                    state, action)

    def softmaxPolicy(self, state):
        ''' return policy values using linear softmax '''
        softmaxValues = util.Counter()
        valueSum = 0.0
        for action in self.legalActions:
            policyValue = 0.0
            for feature_name, value in self.featExtractor.getFeatures(
                    state, action).iteritems():
                policyValue += value * self.policy_weights[feature_name]
            policyValue = math.exp(policyValue)
            softmaxValues[action] = policyValue
            valueSum += policyValue
        for action, val in softmaxValues.iteritems():
            softmaxValues[action] /= valueSum  #normalize
        return softmaxValues

    def getPolicy(self, state):
        return self.computeActionFromQValues(state)

    def getValue(self, state):
        return self.computeValueFromQValues(state)

    def final(self, state):
        print("Training Done")
        print("Total episodes: " + str(self.episodesSoFar))
        f = open('train_weight.txt', 'w')
        for feature, weight in self.q_value_weights.iteritems():
            f.write(str(feature) + " " + str(weight) + "\n")
        f.close()  # you can omit in most cases as the destructor will call it

        plt.plot(self.episodeRewardsList)
        plt.ylabel("Episode Reward")
        plt.show()

    #override
    def stopEpisode(self):
        """
          Called by environment when episode is done
        """
        if self.episodesSoFar < self.numTraining:
            self.accumTrainRewards += self.episodeRewards
        else:
            self.accumTestRewards += self.episodeRewards
        self.episodesSoFar += 1
        if self.episodesSoFar >= self.numTraining:
            # Take off the training wheels
            self.epsilon = 0.0  # no exploration
            self.alpha = 0.0  # no learning

        print("Agent Stop Episode")
        print(self.episodeRewards)
        self.episodeRewardsList.append(self.episodeRewards)
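Example #10's softmaxPolicy exponentiates a linear score per action (features dotted with the policy weights) and normalizes over all legal actions, and its policy update uses the score function f(s, a) minus the per-feature average over actions. The sketch below replays just the softmax step with plain dicts and hypothetical feature values and weights, to make the normalization concrete.

import math

policy_weights = {'bullet': -1.0, 'hitEnemy': 2.0}      # hypothetical theta
features_by_action = {                                   # hypothetical f(s, a) per action
    'forward': {'bullet': 0.1, 'hitEnemy': 0.0},
    'shoot':   {'bullet': 0.1, 'hitEnemy': 1.0},
}

def softmax_policy(features_by_action, policy_weights):
    scores = {a: math.exp(sum(policy_weights.get(n, 0.0) * v for n, v in f.items()))
              for a, f in features_by_action.items()}
    total = sum(scores.values())
    return {a: s / total for a, s in scores.items()}

print(softmax_policy(features_by_action, policy_weights))  # roughly {'forward': 0.12, 'shoot': 0.88}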