def mat_features(states, extractor=SimpleExtractor(), ftrs=None):
    """Transform a list of states into state matrices."""
    if isinstance(states, np.ndarray):
        m = list(map(functools.partial(extractor.getMatrixFeatures, features=ftrs), states))
        return m
    else:
        return [extractor.getMatrixFeatures(states, features=ftrs)]
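# A minimal, self-contained sketch of the dispatch mat_features performs: a single
# state is wrapped in a one-element list, while an np.ndarray of states is mapped
# element-wise. _StubExtractor and its 3x3 reshape are illustrative assumptions,
# not the real SimpleExtractor.
import functools
import numpy as np

class _StubExtractor:
    def getMatrixFeatures(self, state, features=None):
        # Pretend each state is 9 numbers that form a 3x3 feature matrix.
        return np.asarray(state).reshape(3, 3)

def _mat_features_sketch(states, extractor=_StubExtractor(), ftrs=None):
    if isinstance(states, np.ndarray):
        return list(map(functools.partial(extractor.getMatrixFeatures, features=ftrs), states))
    return [extractor.getMatrixFeatures(states, features=ftrs)]

single = list(range(9))              # one state  -> list with one 3x3 matrix
batch = np.arange(18).reshape(2, 9)  # two states -> list with two 3x3 matrices
assert len(_mat_features_sketch(single)) == 1
assert len(_mat_features_sketch(batch)) == 2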
def __init__(self, epsilon=0.05, gamma=0.8, alpha=0.2, numTraining=900,
             extractor=SimpleExtractor(), **args):
    "You can initialize Q-values here..."
    args['epsilon'] = epsilon
    args['gamma'] = gamma
    args['alpha'] = alpha
    args['numTraining'] = numTraining
    self.featExtractor = extractor
    self.index = 0  # This is always Pacman
    self.weights = CustomCounter()
    self.q_values = CustomCounter()
    self.lastAction = None
    ReinforcementAgent.__init__(self, epsilon=epsilon, gamma=gamma,
                                alpha=alpha, numTraining=numTraining)
    "*** YOUR CODE HERE ***"
def enhancedPacmanFeatures(state, action):
    """
    For each state, this function is called with each legal action.
    It should return a counter with { <feature name> : <feature value>, ... }
    """
    # features = util.Counter()
    "*** YOUR CODE HERE ***"
    # successor = state.generateSuccessor(0, action)
    # foodCount = successor.getFood().count()
    # features['foodCount'] = foodCount
    # features = neuralDistances(state, action)
    featureExtract = SimpleExtractor()
    features = featureExtract.getFeatures(state, action)
    # print("from enhanced")
    # print(features)
    # print("features: ", features.items())
    # It looks like 'capsule 0' is the problem; its value is a list of values, not an int.
    return features
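# The note above flags a feature ('capsule 0') whose value is a list rather than a
# scalar. One hedged workaround, sketched with a plain dict (util.Counter would work
# the same way), is to expand any list-valued feature into indexed scalar features
# before handing them to a linear Q-function. The feature names below are made up.
def _flatten_list_features(features):
    flat = {}
    for name, value in features.items():
        if isinstance(value, (list, tuple)):
            for i, v in enumerate(value):
                flat["%s %d" % (name, i)] = v   # e.g. 'capsule 0 0', 'capsule 0 1', ...
        else:
            flat[name] = value
    return flat

_example = {'foodCount': 3, 'capsule 0': [1.0, 2.5]}
print(_flatten_list_features(_example))
# {'foodCount': 3, 'capsule 0 0': 1.0, 'capsule 0 1': 2.5}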
def __init__(self, epsilon=0.1, alpha=0.5, gamma=0.9, discount=0.9):
    self.epsilon = epsilon
    self.alpha = alpha
    self.gamma = gamma
    self.discount = discount
    self.legalActions = Const.ACTIONS
    self.featExtractor = SimpleExtractor()
    self.featExtractor2 = PositionExtractor()
    self.weights = util.Counter()
    self.game_state = None
    self.env = Env()
    self.input_num = self.featExtractor2.getFeatureNum()
    self.hidden_num = 100
    self.output_num = 6
    self.W1 = np.random.rand(self.input_num, self.hidden_num)
    self.W2 = np.random.rand(self.hidden_num, self.output_num)
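# A minimal sketch of how a forward pass could use the W1 (input_num x 100) and
# W2 (100 x 6) matrices initialized above. The tanh hidden layer and softmax output
# are assumptions; the original snippet only initializes the weights.
import numpy as np

def _forward_sketch(features, W1, W2):
    hidden = np.tanh(features @ W1)              # (input_num,) -> (100,)
    scores = hidden @ W2                         # (100,) -> (6,) action scores
    exp_scores = np.exp(scores - scores.max())   # numerically stable softmax
    return exp_scores / exp_scores.sum()         # probabilities over 6 actions

_W1 = np.random.rand(8, 100)   # pretend getFeatureNum() returned 8
_W2 = np.random.rand(100, 6)
_probs = _forward_sketch(np.random.rand(8), _W1, _W2)
assert _probs.shape == (6,) and abs(_probs.sum() - 1.0) < 1e-9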
class QLearningAgent(ReinforcementAgent):
    def __init__(self, **args):
        "You can initialize Q-values here..."
        ReinforcementAgent.__init__(self, **args)
        self.weights = util.Counter()
        # 100 episodes of training
        self.weights['enemy'] = -10
        self.weights['bias'] = -10
        self.weights['bullet'] = -200
        self.weights['edge'] = -10
        self.weights['hitEnemy'] = 10
        self.weights['moveFoward'] = 10
        self.legalActions = Const.ACTIONS
        self.featExtractor = SimpleExtractor()
        self.lastAction = 1

    def getQValue(self, state, action):
        """
        Returns Q(state,action)
        Should return 0.0 if we have never seen a state
        or the Q node value otherwise
        """
        "*** YOUR CODE HERE ***"
        qvalue = 0.0
        for feature_name, value in self.featExtractor.getFeatures(state, action).items():
            qvalue += value * self.weights[feature_name]
        return qvalue

    def computeValueFromQValues(self, state):
        """
        Returns max_action Q(state,action) where the max is over legal actions.
        Note that if there are no legal actions, which is the case at the
        terminal state, you should return a value of 0.0.
        """
        "*** YOUR CODE HERE ***"
        max_next_qvalue = None
        for nextAction in self.legalActions:
            next_qvalue = self.getQValue(state, nextAction)
            if max_next_qvalue is None or max_next_qvalue < next_qvalue:
                max_next_qvalue = next_qvalue
        if max_next_qvalue is None:
            max_next_qvalue = 0.0
        return max_next_qvalue

    def computeActionFromQValues(self, state):
        """
        Compute the best action to take in a state. Note that if there
        are no legal actions, which is the case at the terminal state,
        you should return None.
        """
        "*** YOUR CODE HERE ***"
        max_qvalue = None
        for action in self.legalActions:
            qvalue = self.getQValue(state, action)
            if max_qvalue is None or max_qvalue < qvalue:
                max_qvalue = qvalue
        if max_qvalue is None:
            return None
        actions = []
        for action in self.legalActions:
            qvalue = self.getQValue(state, action)
            if qvalue == max_qvalue:
                actions.append(action)
        if max_qvalue is not None and len(actions) == 0:
            return self.legalActions[0]
        if len(actions) > 1:
            return Const.DO_NOTHING
        return random.choice(actions)

    def getAction(self, state):
        """
        Compute the action to take in the current state. With probability
        self.epsilon, we should take a random action and take the best
        policy action otherwise. Note that if there are no legal actions,
        which is the case at the terminal state, you should choose None
        as the action.

        HINT: You might want to use util.flipCoin(prob)
        HINT: To pick randomly from a list, use random.choice(list)
        """
        # Pick Action
        "*** YOUR CODE HERE ***"
        # Epsilon greedy
        if util.flipCoin(self.epsilon):
            self.lastAction = random.choice(self.legalActions)
        else:
            self.lastAction = self.computeActionFromQValues(state)
        return self.lastAction

    def update(self, state, action, nextState, reward):
        """
        The parent class calls this to observe a
        state = action => nextState and reward transition.
        You should do your Q-Value update here.

        NOTE: You should never call this function,
        it will be called on your behalf.
        """
        "*** YOUR CODE HERE ***"
        diff = (reward + self.discount * self.computeValueFromQValues(nextState)
                - self.getQValue(state, action))
        for feature_name, feature_value in self.featExtractor.getFeatures(state, action).items():
            self.weights[feature_name] += self.alpha * diff * feature_value

    def getPolicy(self, state):
        return self.computeActionFromQValues(state)

    def getValue(self, state):
        return self.computeValueFromQValues(state)

    def final(self, state):
        print("Training Done")
        print("Total episodes: " + str(self.episodesSoFar))
        f = open('train_weight.txt', 'w')
        for feature, weight in self.weights.items():
            f.write(str(feature) + " " + str(weight) + "\n")
        f.close()  # you can omit in most cases as the destructor will call it
        plt.plot(self.episodeRewardsList)
        plt.ylabel("Episode Reward")
        plt.show()
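# A worked numeric example of the approximate Q-learning update used in
# QLearningAgent.update above: diff = r + gamma * max_a' Q(s',a') - Q(s,a),
# then w_i += alpha * diff * f_i(s,a). Plain dicts stand in for util.Counter,
# and the feature values and reward are made up.
alpha, discount = 0.5, 0.9
weights = {'bullet': -200.0, 'moveFoward': 10.0}
features = {'bullet': 0.1, 'moveFoward': 1.0}              # f(s, a)
q_sa = sum(weights[k] * v for k, v in features.items())   # -200*0.1 + 10*1.0 = -10.0
reward, max_next_q = 5.0, -8.0                             # assumed r and max_a' Q(s', a')
diff = reward + discount * max_next_q - q_sa               # 5.0 + 0.9*(-8.0) - (-10.0) = 7.8
for name, value in features.items():
    weights[name] += alpha * diff * value
print(weights)  # approximately {'bullet': -199.61, 'moveFoward': 13.9}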
import numpy as np
import gym
import matplotlib.pyplot as plt

from tanks import Env
from const import Const
from featureExtractors import DangerExtractor
from featureExtractors import SimpleExtractor

# env = gym.make('CartPole-v0')
train_episodes = 100
learning_rate = 0.01
level_type = "minimal"
game_speed = 1000
env = Env(level_type, game_speed, train_episodes)
gamma = 0.99
featExtractor = SimpleExtractor()


def discount_rewards(r):
    """Take a 1D float array of rewards and compute the discounted reward."""
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r


class agent():
    def __init__(self, lr, s_size, a_size, h_size):
        # These lines established the feed-forward part of the network.
        # The agent takes a state and produces an action.
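# A small worked example of the discounted-return computation in discount_rewards
# above, with gamma = 0.99 and a made-up reward sequence: a single terminal reward
# of 1.0 propagates backwards as 0.9801, 0.99, 1.0.
_example_rewards = np.array([0.0, 0.0, 1.0])
# discount_rewards(_example_rewards) would return approximately
# array([0.9801, 0.99, 1.0])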
class LinearQAgent:
    def __init__(self, epsilon=0.1, alpha=0.5, gamma=0.9, discount=0.9):
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.discount = discount
        self.legalActions = Const.ACTIONS
        self.featExtractor = DangerExtractorInstance()  # overridden by SimpleExtractor on the next line
        self.featExtractor = SimpleExtractor()
        self.weights = [0] * self.featExtractor.featureNum
        self.game_state = None
        self.train_episodes = 10000
        self.env = Env()

    def getQValue(self, state, action):
        qvalue = 0.0
        for idx, value in enumerate(self.featExtractor.getFeatures(state, action)):
            qvalue += value * self.weights[idx]
        return qvalue

    def getMaxQValue(self, state):
        max_next_qvalue = None
        for nextAction in self.legalActions:
            next_qvalue = self.getQValue(state, nextAction)
            if max_next_qvalue is None or max_next_qvalue < next_qvalue:
                max_next_qvalue = next_qvalue
        if max_next_qvalue is None:
            max_next_qvalue = 0.0
        return max_next_qvalue

    def computeActionFromQValues(self, state):
        max_qvalue = None
        for action in self.legalActions:
            qvalue = self.getQValue(state, action)
            if max_qvalue is None or max_qvalue < qvalue:
                max_qvalue = qvalue
        if max_qvalue is None:
            return None
        actions = []
        for action in self.legalActions:
            qvalue = self.getQValue(state, action)
            if qvalue == max_qvalue:
                actions.append(action)
        if max_qvalue is not None and len(actions) == 0:
            return self.legalActions[0]
        if len(actions) > 1:
            return Const.DO_NOTHING
        return random.choice(actions)

    def getAction(self, state):
        if util.flipCoin(self.epsilon):
            return random.choice(self.legalActions)
        return self.computeActionFromQValues(state)

    def step(self):
        '''
        @return reward, episode_over
        '''
        if self.game_state is None:
            self.game_state, reward, episode_over, _ = self.env.step(0)
        else:
            action = self.getAction(self.game_state)
            next_game_state, reward, episode_over, _ = self.env.step(action)
            diff = (reward + self.discount * self.getMaxQValue(next_game_state)
                    - self.getQValue(self.game_state, action))
            for idx, feature_value in enumerate(self.featExtractor.getFeatures(self.game_state, action)):
                self.weights[idx] += self.alpha * diff * feature_value
            self.game_state = next_game_state
        return reward, episode_over

    def start_episode(self, episode_num):
        self.env.reset()
        total_reward = 0.0
        while True:
            self.env.render()
            reward, episode_over = self.step()
            total_reward += reward
            if episode_over:
                break
        return total_reward

    def start_train(self):
        episode_cnt = 0
        total_reward = []
        while episode_cnt < self.train_episodes:
            episode_cnt += 1
            total_reward.append(self.start_episode(episode_cnt))
            if episode_cnt % 10 == 0:
                print("Episode #" + str(episode_cnt))
                print("10 avg: ", np.mean(total_reward[-10:]))
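# A hedged usage sketch for LinearQAgent, assuming Env, Const, SimpleExtractor,
# DangerExtractorInstance and util are importable from this repo as in the imports
# above; the shortened episode count is only for a quick smoke test.
if __name__ == "__main__":
    linear_agent = LinearQAgent(epsilon=0.1, alpha=0.5, discount=0.9)
    linear_agent.train_episodes = 100   # shorten the default 10000-episode run
    linear_agent.start_train()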
class PolicyGradientAgent(ReinforcementAgent):
    '''
    Actor-Critic Agent
    '''

    def __init__(self, **args):
        "You can initialize Q-values here..."
        ReinforcementAgent.__init__(self, **args)
        self.q_value_weights = util.Counter()
        self.policy_weights = util.Counter()
        # learning rate for policy weights
        self.beta = 0.1
        self.legalActions = Const.ACTIONS
        self.featExtractor = SimpleExtractor()
        self.lastAction = 1
        # 100 episodes of training
        self.q_value_weights['enemy'] = -10
        self.q_value_weights['bias'] = -10
        self.q_value_weights['bullet'] = -200
        self.q_value_weights['edge'] = -10
        self.q_value_weights['hitEnemy'] = 10
        self.q_value_weights['moveFoward'] = 10

    def getQValue(self, state, action):
        """
        Returns Q(state,action)
        Should return 0.0 if we have never seen a state
        or the Q node value otherwise
        """
        "*** YOUR CODE HERE ***"
        qvalue = 0.0
        for feature_name, value in self.featExtractor.getFeatures(state, action).items():
            qvalue += value * self.q_value_weights[feature_name]
        return qvalue

    def computeValueFromQValues(self, state):
        """
        Returns max_action Q(state,action) where the max is over legal actions.
        Note that if there are no legal actions, which is the case at the
        terminal state, you should return a value of 0.0.
        """
        "*** YOUR CODE HERE ***"
        max_next_qvalue = None
        for nextAction in self.legalActions:
            next_qvalue = self.getQValue(state, nextAction)
            if max_next_qvalue is None or max_next_qvalue < next_qvalue:
                max_next_qvalue = next_qvalue
        if max_next_qvalue is None:
            max_next_qvalue = 0.0
        return max_next_qvalue

    def computeActionFromQValues(self, state):
        """
        Compute the best action to take in a state. Note that if there
        are no legal actions, which is the case at the terminal state,
        you should return None.
        """
        "*** YOUR CODE HERE ***"
        max_qvalue = None
        for action in self.legalActions:
            qvalue = self.getQValue(state, action)
            if max_qvalue is None or max_qvalue < qvalue:
                max_qvalue = qvalue
        if max_qvalue is None:
            return None
        actions = []
        for action in self.legalActions:
            qvalue = self.getQValue(state, action)
            if qvalue == max_qvalue:
                actions.append(action)
        if max_qvalue is not None and len(actions) == 0:
            return self.legalActions[0]
        if len(actions) > 1:
            return Const.DO_NOTHING
        return random.choice(actions)

    def getAction(self, state):
        """
        Compute the action to take in the current state. With probability
        self.epsilon, we should take a random action and take the best
        policy action otherwise. Note that if there are no legal actions,
        which is the case at the terminal state, you should choose None
        as the action.

        HINT: You might want to use util.flipCoin(prob)
        HINT: To pick randomly from a list, use random.choice(list)
        """
        # Pick Action
        "*** YOUR CODE HERE ***"
        # Epsilon greedy over the softmax policy
        if util.flipCoin(self.epsilon):
            return random.choice(self.legalActions)
        max_policy_value = None
        max_action = -1
        for action, value in self.softmaxPolicy(state).items():
            if max_policy_value is None or max_policy_value < value:
                max_action = action
                max_policy_value = value
        return max_action
        # NOTE: the block below is unreachable; it is the leftover epsilon-greedy
        # selection from the Q-learning version of getAction.
        if util.flipCoin(self.epsilon):
            self.lastAction = random.choice(self.legalActions)
        else:
            self.lastAction = self.computeActionFromQValues(state)
        return self.lastAction

    def update(self, state, action, nextState, reward):
        """
        The parent class calls this to observe a
        state = action => nextState and reward transition.
        You should do your Q-Value update here.

        NOTE: You should never call this function,
        it will be called on your behalf.
        """
        "*** YOUR CODE HERE ***"
        # Update Q-value weights (omega)
        diff = (reward + self.discount * self.computeValueFromQValues(nextState)
                - self.getQValue(state, action))
        for feature_name, feature_value in self.featExtractor.getFeatures(state, action).items():
            self.q_value_weights[feature_name] += self.alpha * diff * feature_value

        # Update policy weights (theta).
        # Use a separate loop variable so the 'action' argument is not overwritten
        # before the score-function update below.
        expectedFeatureValues = util.Counter()
        for legalAction in self.legalActions:
            for feature_name, value in self.featExtractor.getFeatures(state, legalAction).items():
                expectedFeatureValues[feature_name] += value
        for feature_name, value in expectedFeatureValues.items():
            expectedFeatureValues[feature_name] /= len(self.legalActions)
        for feature_name, value in self.featExtractor.getFeatures(state, action).items():
            scoreFunc = value - expectedFeatureValues[feature_name]
            self.policy_weights[feature_name] += self.beta * scoreFunc * self.getQValue(state, action)

    def softmaxPolicy(self, state):
        '''
        Return policy values using a linear softmax.
        '''
        softmaxValues = util.Counter()
        valueSum = 0.0
        for action in self.legalActions:
            policyValue = 0.0
            for feature_name, value in self.featExtractor.getFeatures(state, action).items():
                policyValue += value * self.policy_weights[feature_name]
            policyValue = math.exp(policyValue)
            softmaxValues[action] = policyValue
            valueSum += policyValue
        for action, val in softmaxValues.items():
            softmaxValues[action] /= valueSum  # normalize
        return softmaxValues

    def getPolicy(self, state):
        return self.computeActionFromQValues(state)

    def getValue(self, state):
        return self.computeValueFromQValues(state)

    def final(self, state):
        print("Training Done")
        print("Total episodes: " + str(self.episodesSoFar))
        f = open('train_weight.txt', 'w')
        for feature, weight in self.q_value_weights.items():
            f.write(str(feature) + " " + str(weight) + "\n")
        f.close()  # you can omit in most cases as the destructor will call it
        plt.plot(self.episodeRewardsList)
        plt.ylabel("Episode Reward")
        plt.show()

    # override
    def stopEpisode(self):
        """
        Called by environment when episode is done
        """
        if self.episodesSoFar < self.numTraining:
            self.accumTrainRewards += self.episodeRewards
        else:
            self.accumTestRewards += self.episodeRewards
        self.episodesSoFar += 1
        if self.episodesSoFar >= self.numTraining:
            # Take off the training wheels
            self.epsilon = 0.0  # no exploration
            self.alpha = 0.0    # no learning
        print("Agent Stop Episode")
        print(self.episodeRewards)
        self.episodeRewardsList.append(self.episodeRewards)
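# A self-contained numeric sketch of the linear softmax used in softmaxPolicy above:
# each action's preference is the dot product of its features with the policy
# weights, exponentiated and normalized. Feature values, weights, and action names
# are made up; plain dicts stand in for util.Counter.
import math

_policy_weights = {'bullet': -1.0, 'moveFoward': 0.5}
_features_per_action = {
    'UP':   {'bullet': 0.0, 'moveFoward': 1.0},
    'DOWN': {'bullet': 1.0, 'moveFoward': 0.0},
}
_prefs = {a: math.exp(sum(_policy_weights[k] * v for k, v in f.items()))
          for a, f in _features_per_action.items()}
_total = sum(_prefs.values())
_probs = {a: p / _total for a, p in _prefs.items()}
print(_probs)  # roughly {'UP': 0.82, 'DOWN': 0.18}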