def get_state(self):
    """Refresh ``self.state`` from the game while the score is 0 or 1.

    The current score is always read from ``cartpole`` and stored in
    ``self.score``.  Only when that score equals 0 or 1 is the state
    re-read and reshaped to ``[1, self.observation_space]``; for any other
    score ``self.state`` is left untouched.

    Returns:
        Always ``0``.
    """
    self.score = cartpole.get_score()

    # Scores 0 and 1 are both handled as "the game has just started".
    if self.score in (0, 1):
        raw_state = cartpole.get_state()
        self.state = np.reshape(raw_state, [1, self.observation_space])

    return 0
def get_new_state(self):
    """Observe the outcome of the last action, store it and replay.

    Reads the score, the end-of-game flag and the state from ``cartpole``
    (in that order).  The score becomes ``self.reward``, negated when the
    game has ended.  The transition is handed to ``self.remember`` and
    ``self.experience_replay`` is then called.

    Returns:
        The new state, reshaped to ``[1, self.observation_space]``; it is
        also stored as ``self.state`` and ``self.new_state``.
    """
    score = cartpole.get_score()
    terminal = cartpole.get_end()
    raw_state = cartpole.get_state()

    # Use one normalised flag for both decisions below.  Previously the
    # reward was flipped on truthiness while the score was recorded only on
    # ``terminal == True``, so a truthy non-bool flag would flip the reward
    # without recording the episode.
    game_over = bool(terminal)

    self.reward = -score if game_over else score
    self.new_state = np.reshape(raw_state, [1, self.observation_space])

    # The flag is stored exactly as the game reported it.
    self.remember(self.state, self.action_index, self.reward,
                  self.new_state, terminal)
    self.state = self.new_state
    self.experience_replay()

    if game_over:
        # NOTE(review): self.reward was negated above, so the value recorded
        # here is the negated final score -- confirm this is intended.
        self.scores.append(self.reward)

    return self.state
def get_new_state(self):
    """Read the game state after a decision and convert it to a string.

    The raw observation is kept in ``self.new_observation``.  It is passed
    through ``self.assign_bins`` (together with the module-level ``bins``)
    and then ``self.get_state_as_string``; the result is kept in
    ``self.new_state``.

    Returns:
        The value stored in ``self.new_state``.
    """
    self.new_observation = cartpole.get_state()
    binned = self.assign_bins(self.new_observation, bins)
    self.new_state = self.get_state_as_string(binned)
    return self.new_state
def get_state(self):
    """Read the current game state and convert it to a string key.

    The raw observation is kept in ``self.observation``.  It is passed
    through ``self.assign_bins`` (together with the module-level ``bins``)
    and then ``self.get_state_as_string``; the resulting string, meant to
    be used as a dictionary key, is kept in ``self.state``.

    Returns:
        The value stored in ``self.state``.
    """
    self.observation = cartpole.get_state()
    binned = self.assign_bins(self.observation, bins)
    self.state = self.get_state_as_string(binned)
    return self.state
def get_keys_pressed(self, reward):
    """Record the latest transition, possibly train, and choose the keys.

    Args:
        reward: Reward for the previous action; stored with the transition
            in ``self.observations``.

    Returns:
        The keys to press: ``[K_LEFT]`` for action index 0, ``[]`` for
        index 1 and ``[K_RIGHT]`` for any other index.
    """
    # Get the current state of the game.
    current_state = cartpole.get_state()

    # Store the transition: previous state, previous action, reward and the
    # state that followed.
    self.observations.append(
        [self.last_state, self.last_action, reward, current_state])

    # The collection is bounded: once it is too long, drop one entry.
    if len(self.observations) > self.max_obs_length:
        if self.rewards_frac() < 0.4:
            # Not enough rewarded entries: only remove a non-rewarded one.
            self.remove_bad_point()
        else:
            # Otherwise discard the oldest entry.
            self.observations = self.observations[1:]

    # Once enough observations have been collected, train -- unless the
    # score is still too low, in which case bad points are purged instead.
    if len(self.observations) > self.min_obs_steps:
        if cartpole.get_score() < 50:
            # Call form of print: valid on both Python 2 and Python 3.
            print("Initialization score is too low. Initializing again.")
            # Remove 50 bad points.
            for _ in range(50):
                self.remove_bad_point()
        else:
            self.train_model()

    # Reset the last state, and get the next action.
    self.last_state = current_state
    self.last_action, action_index = self.choose_next_action()

    # Outside the randomness-only regime, reduce the probability of a
    # random move.  The length is re-checked here on purpose: the purge
    # above may have shortened the collection.
    if (self.random_action_prob > self.final_random_prob
            and len(self.observations) > self.min_obs_steps):
        self.random_action_prob -= (
            (self.initial_random_prob - self.final_random_prob)
            / self.explore_steps)

    # Translate the action index into the keys to press.
    if action_index == 0:
        action = [K_LEFT]
    elif action_index == 1:
        action = []
    else:
        action = [K_RIGHT]

    return action
def get_keys_pressed(self, reward):
    """Record the latest transition and choose the keys to press.

    Args:
        reward: Reward for the previous action; stored with the transition
            in ``self.observations``.

    Returns:
        ``[K_LEFT]`` for action index 0, ``[K_RIGHT]`` for index 1, and
        ``[]`` (no key) for any other index.
    """
    # Get the current state of the game.
    current_state = cartpole.get_state()

    # The transition must pair the state seen on the previous call with the
    # current one.  ``self.last_state`` used to be overwritten with a fresh
    # reading right here, so both ends of every stored transition were the
    # current state.  On the very first call, when nothing has been
    # remembered yet, the current state is used for both ends.
    previous_state = getattr(self, "last_state", None)
    if previous_state is None:
        previous_state = current_state

    self.observations.append(
        [previous_state, self.last_action, reward, current_state])

    # Reset the last state, and get the next action.
    self.last_state = current_state
    self.last_action, action_index = self.choose_next_action()

    # Translate the action index into the keys to press.  An unexpected
    # index presses nothing instead of failing with UnboundLocalError.
    if action_index == 0:
        action = [K_LEFT]
    elif action_index == 1:
        action = [K_RIGHT]
    else:
        action = []

    return action
def __init__(self):
    """
    Plays CartPole by implementing a NN Q-learning strategy.

    Sets the learning hyper-parameters, reads the initial game state from
    ``cartpole`` and builds the network through ``self.build_model()``.
    """

    # The future discount rate.
    self.future_reward_discount = 1.0

    # The number of possible actions (left, right, no move)
    self.num_actions = 3

    # The probabilities of using a random move, instead of one
    # from the NN.
    self.initial_random_prob = 1.0
    self.final_random_prob = 0.05
    self.random_action_prob = self.initial_random_prob

    # Variables for holding information about the previous
    # timestep.
    self.last_score = 0
    self.last_state = cartpole.get_state()
    self.last_action = np.array([1.0, 0.0, 0.0])

    # Variables for dealing with pressed keys.
    self.keys_pressed = []
    self.last_keys_pressed = []

    # Build the neural network.
    self.build_model()

    # Size of the observations collection.
    self.max_obs_length = 6000
    self.observations = []

    # Number of observations to gather before starting training of
    # the NN.
    self.min_obs_steps = 3000

    # Number of observations over which to decrease the
    # probability of using a random move, rather than a move from
    # the NN.
    self.explore_steps = 5000

    # The mini-batch size.  Floor division keeps this an integer on every
    # Python version; true division would produce a float, which is not
    # usable as a sample count.
    self.mini_batch_size = self.max_obs_length // 8

    # Have we starting training the NN yet?
    self.started_training = False
def get_observation(self):
    """Read the game state, log it while training, and update the score.

    While ``self.training`` is true, one ``[observation, action_index]``
    pair is appended to ``self.game_memory``: the previously stored
    observation when there is one, otherwise the observation just read.
    ``self.prev_obseration`` is then replaced by the new observation.

    Returns:
        ``self.score`` after the score reported by ``cartpole`` has been
        added to it.
    """
    self.observation = cartpole.get_state()

    # Remember the environment together with the chosen action.
    if self.training == True:
        has_previous = len(self.prev_obseration) > 0
        remembered = self.prev_obseration if has_previous else self.observation
        self.game_memory.append([remembered, self.action_index])
        self.prev_obseration = self.observation

    self.reward = cartpole.get_score()
    self.score += self.reward
    return self.score
def get_keys_pressed(self, reward):
    """Read the game state, choose the next action and return its keys.

    The raw state is kept in ``self.current_state`` (and afterwards in
    ``self.last_state``); its binned string form is kept in ``self.state``
    before ``self.choose_next_action()`` is called.

    Args:
        reward: Not used by this method.

    Returns:
        ``[K_LEFT]`` for action index 0, ``[K_RIGHT]`` for index 1, and
        ``[]`` (no key) for any other index.  The list is also printed.
    """
    self.current_state = cartpole.get_state()
    self.state = self.get_state_as_string(
        self.assign_bins(self.current_state, bins))

    # Get the next action.
    action_index = self.choose_next_action()
    self.last_state = self.current_state

    # Translate the action index into the keys to press.  An unexpected
    # index presses nothing instead of failing with UnboundLocalError.
    if action_index == 0:
        action = [K_LEFT]
    elif action_index == 1:
        action = [K_RIGHT]
    else:
        action = []

    print(action)
    return action
def get_new_state(self):
    """Observe the game after an action and, when training, learn from it.

    The reward for the step is the change in the game's score since the
    previous call.  While training, the reward is recorded and, when the
    episode ends, the policy gradient for the whole episode is accumulated
    into ``self.grad_buffer``; the RMSProp weight update is applied when
    ``self.episode_number`` is a multiple of ``self.batch_size``.  When not
    training, the episode total is appended to ``self.play_scores`` once
    the game ends or the total reaches 200.

    Returns:
        ``self.observation``.  NOTE(review): this is not the reading just
        stored in ``self.new_observation`` -- confirm which one callers
        expect.
    """
    # Observe the current state of the game.
    self.new_observation = cartpole.get_state()

    # Reward is the score gained since the last call.
    self.reward = cartpole.get_score() - self.prev_score
    self.prev_score = cartpole.get_score()

    # Check whether the game ended.
    self.done = cartpole.get_end()
    self.reward_sum += self.reward

    if self.training:
        # Record the reward (must happen after the step so that it is the
        # reward for the previous action).
        self.drs.append(self.reward)

        if self.done:  # an episode finished
            # Stack together all inputs, hidden states, action gradients
            # and rewards for this episode.
            self.epx = np.vstack(self.xs)
            eph = np.vstack(self.hs)
            epdlogp = np.vstack(self.dlogps)
            epr = np.vstack(self.drs)
            # Reset the per-episode memory.
            self.xs, self.hs, self.dlogps, self.drs = [], [], [], []

            # Compute the discounted reward backwards through time.
            discounted_epr = self.discount_rewards(epr)
            # Standardise the rewards to be unit normal (helps control the
            # variance of the gradient estimator).
            discounted_epr = discounted_epr - np.mean(discounted_epr)
            discounted_epr /= np.std(discounted_epr)

            # Modulate the gradient with the advantage.
            epdlogp *= discounted_epr
            grad = self.policy_backward(eph, epdlogp)
            for k in self.model:
                self.grad_buffer[k] += grad[k]  # accumulate grad over batch

            # Perform the RMSProp parameter update every batch_size episodes.
            if self.episode_number % self.batch_size == 0:
                # ``items()`` works on Python 2 and 3.  No keys are added or
                # removed inside the loop, so iterating a view is safe.
                for k, v in self.model.items():
                    g = self.grad_buffer[k]  # gradient
                    self.rmsprop_cache[k] = (
                        self.decay_rate * self.rmsprop_cache[k]
                        + (1 - self.decay_rate) * g ** 2)
                    # Apply the step to the existing weights.  A plain
                    # assignment here would replace the learned weights
                    # with the step itself on every update.
                    self.model[k] += self.alpha * g / (
                        np.sqrt(self.rmsprop_cache[k]) + 1e-5)
                    # Reset the batch gradient buffer.
                    self.grad_buffer[k] = np.zeros_like(v)

            # Start a fresh episode.
            self.reward_sum = 0
            self.episode_number += 1
            self.prev_score = 0
    elif self.done or self.reward_sum >= 200:
        # Play mode: log the finished episode and reset the counters.
        self.play_scores.append(self.reward_sum)
        self.reward_sum = 0
        self.prev_score = 0

    return self.observation
def get_state(self):
    """Read the current game state from ``cartpole`` and remember it.

    Returns:
        The state just read; it is also stored in ``self.observation``.
    """
    current = cartpole.get_state()
    self.observation = current
    return self.observation