def visualize_trial(self, n_steps=200):
    """Do a trial without learning, with display.

    Parameters
    ----------
    n_steps -- number of steps to simulate for
    """
    # prepare for the visualization
    plb.ion()
    mv = mountaincar.MountainCarViewer(self.mountain_car)
    mv.create_figure(n_steps, n_steps)
    plb.draw()

    # make sure the mountain-car is reset
    self.mountain_car.reset()

    for n in range(n_steps):
        print('\rt =', self.mountain_car.t)
        sys.stdout.flush()

        # choose a random action
        self.mountain_car.apply_force(np.random.randint(3) - 1)

        # simulate the timestep
        self.mountain_car.simulate_timesteps(100, 0.01)

        # update the visualization
        mv.update_figure()
        plb.draw()

        # check for rewards
        if self.mountain_car.R > 0.0:
            print("\rreward obtained at t = ", self.mountain_car.t)
            break

def visualize_trial(self, n_steps, tau):
    """Do a trial without learning, with display.

    Parameters
    ----------
    n_steps -- number of steps to simulate for
    tau -- softmax temperature to use during the evaluation
    """
    print('Simulating for:')
    print(self.weights.sum(axis=1).sum(axis=1))

    # Initialize
    self.pos = None
    self.vel = None
    self.energy = None
    self.action = None
    self.activations = np.zeros((self.n_neurons, self.n_neurons))
    self.Q = np.multiply(self.weights,
                         self.activations).sum(axis=1).sum(axis=1)
    self.tau = tau  # lets us choose a lower temperature for evaluation

    # prepare for the visualization
    plb.ion()
    mv = mountaincar.MountainCarViewer(self.mountain_car)
    mv.create_figure(n_steps, n_steps)
    plb.draw()

    # make sure the mountain-car is reset
    self.mountain_car.reset()

    for n in range(n_steps):
        # uncomment for per-timestep logging
        # print('\rt =', self.mountain_car.t)
        # sys.stdout.flush()

        self._choose_action()
        self._update_state()

        # update the visualization
        mv.update_figure()
        plb.draw()

        # check for rewards
        if self.mountain_car.R > 0.0:
            print("\rreward obtained at t = ", self.mountain_car.t)
            break

def visualize_trial(self, agent=None, n_steps=200):
    """Do a trial without learning, with display.

    Parameters
    ----------
    agent -- agent whose policy picks the actions (defaults to self)
    n_steps -- number of steps to simulate for
    """
    # the original mixed self._next_action and agent._next_action, which
    # crashed when agent was left as None; fall back to self explicitly
    if agent is None:
        agent = self

    # prepare for the visualization
    plb.ion()
    mv = mountaincar.MountainCarViewer(self.mountain_car)
    mv.create_figure(n_steps, n_steps)
    plb.draw()

    # make sure the mountain-car is reset
    self.mountain_car.reset()

    # get the initial state
    state = self.mountain_car.state()

    # choose an action from the policy
    action = agent._next_action(state)

    for n in range(n_steps):
        # apply the action
        self.mountain_car.apply_force(action - 1)

        # simulate the time step
        self.mountain_car.simulate_timesteps(100, 0.01)

        # update the visualization
        mv.update_figure()
        plb.draw()

        # observe the reward and check for success
        reward = self.mountain_car.R
        if reward > 0.0:
            plb.savefig('Car_viz.png')
            print("\rreward obtained at t = ", self.mountain_car.t)
            break

        # get the next action
        state = self.mountain_car.state()
        action = agent._next_action(state)

    return self.mountain_car.t

def visualize_trial(self, n_steps=200):
    """Do a trial without learning, with display.

    Parameters
    ----------
    n_steps -- number of steps to simulate for
    """
    # prepare for the visualization
    plb.ion()
    mv = mountaincar.MountainCarViewer(self.mountain_car)
    mv.create_figure(n_steps, n_steps)
    plb.draw()
    plb.pause(1e-3)

    # make sure the mountain-car is reset
    self.mountain_car.reset()

    for n in range(n_steps):
        print('\rt =', self.mountain_car.t)
        sys.stdout.flush()

        # get the current state
        s = (self.mountain_car.x, self.mountain_car.x_d)

        # select the current action greedily from the Q-values
        action_index = self.get_action_index(*s, greedy=True)

        # perform the action
        self.mountain_car.apply_force(action_index - 1)

        # simulate the timestep
        self.mountain_car.simulate_timesteps(100, 0.01)

        # update the visualization
        mv.update_figure()
        plb.draw()
        plb.pause(1e-3)

        # check for rewards
        if self.mountain_car.R > 0.0:
            print("\rreward obtained at t = ", self.mountain_car.t)
            break

def initiate_trial(self, visual=False):
    # H5 data sets
    h5data = dataSets.generate_data_sets(self.filename, self.centers)
    time_to_reward = [0]
    steps_to_reward = [0]

    # prepare for the visualization
    if visual:
        plb.ion()
        plb.pause(0.0001)
        mv = mountaincar.MountainCarViewer(self.mountain_car)
        mv.create_figure(self.n_steps, self.n_steps)
        plb.show()

    for episode_count in np.arange(self.n_episodes):
        self.reset()
        self._parameter_settings(episode_count)

        for step_count in range(self.n_steps):
            if self.verbose:
                print("Episode: " + str(episode_count))
                print("Simulation Step: {0}".format(step_count))
                print("Mountain Car state: {0}".format(self.state))
                print("Grid Center index: {0}".format(self.index))
                print("Grid Center: {0}".format(self.centers[self.index]))

            self._learn()

            if visual:
                # update the visualization
                mv.update_figure()
                plb.show()
                plb.pause(0.0001)

            if self.mountain_car.R > 0.0:
                print("Reward obtained at t = " + str(self.mountain_car.t))
                steps_to_reward[0] = step_count
                break
            elif step_count == self.n_steps - 1:
                print("Maximum number of iterations reached. No reward.")
                steps_to_reward[0] = step_count

        time_to_reward[0] = self.mountain_car.t
        dataSets.generate_data_save(h5data, episode_count, time_to_reward,
                                    steps_to_reward, self.weights)

def visualize_trial(self, n_steps=200):
    """Do a trial without learning, with display.

    Parameters
    ----------
    n_steps -- number of steps to simulate for
    """
    # prepare for the visualization
    plb.ion()
    plb.pause(0.0001)
    mv = mountaincar.MountainCarViewer(self.mountain_car)
    mv.create_figure(n_steps, n_steps)
    plb.show()

    # make sure the mountain-car is reset
    self.mountain_car.reset()

    for n in range(n_steps):
        # the original passed sys.stdout.flush() as a print argument,
        # which printed a spurious None; flush separately instead
        print('\rt =', self.mountain_car.t)
        sys.stdout.flush()

        inputs = self._compute_inputs(self.mountain_car.x,
                                      self.mountain_car.x_d)
        outputs = self._compute_outputs(inputs, self.W)
        action = self._select_action(self._softmax(outputs, self.t))
        self.mountain_car.apply_force(action - 1)

        # simulate the timestep
        self.mountain_car.simulate_timesteps(100, 0.01)

        # update the visualization
        mv.update_figure()
        plb.show()
        plb.pause(0.0001)

        # check for rewards
        if self.mountain_car.R > 0.0:
            print("\rreward obtained at t = ", self.mountain_car.t)
            break

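# The snippet above relies on an input layer `_compute_inputs` that is not
# shown here. A minimal sketch of one common choice for this exercise -- a
# grid of Gaussian tuning curves over (position, velocity) -- follows. The
# helper name, grid layout, and sigma parameters are illustrative
# assumptions, not the original implementation.
import numpy as np

def gaussian_grid_inputs(x, x_d, centers_x, centers_v, sigma_x, sigma_v):
    """Responses of a grid of Gaussian neurons to the state (x, x_d).

    centers_x, centers_v -- 1-D arrays of grid-center coordinates
    sigma_x, sigma_v -- tuning widths along each dimension
    """
    # squared distances from the state to every grid center, broadcast
    # so the result covers the full (len(centers_x), len(centers_v)) grid
    dx2 = (x - centers_x[:, None]) ** 2 / sigma_x ** 2
    dv2 = (x_d - centers_v[None, :]) ** 2 / sigma_v ** 2
    return np.exp(-(dx2 + dv2))
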
def visualize_trial(self, n_steps=200):
    """Do a trial without learning, with display.

    Parameters
    ----------
    n_steps -- number of steps to simulate for
    """
    # prepare for the visualization
    plb.ion()
    mv = mountaincar.MountainCarViewer(self.mountain_car)
    mv.create_figure(n_steps, n_steps)
    plb.draw()

    # make sure the mountain-car is reset
    self.mountain_car.reset()

    for n in range(n_steps):
        sys.stdout.flush()

        # compute the Q-values and choose an action with a softmax
        # (the original comment said "random action", but this is a
        # temperature-controlled softmax over the action values)
        q, self.activations = self.action_values()
        p = np.exp(q / self.temperature) / np.sum(np.exp(q / self.temperature))
        self.a = np.random.choice(3, p=p)
        self.mountain_car.apply_force(self.a - 1)

        # simulate the timestep
        self.mountain_car.simulate_timesteps(100, 0.01)

        # update the visualization
        mv.update_figure()
        plb.draw()

        # check for rewards
        if self.mountain_car.R > 0.0:
            print("\rreward obtained at t = ", self.mountain_car.t)
            break

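# The inline softmax above can overflow for large |q| or a very small
# temperature. A numerically stable variant -- a suggested refinement, not
# part of the original code -- subtracts the maximum before exponentiating,
# which leaves the probabilities unchanged:
import numpy as np

def stable_softmax(q, temperature):
    """Softmax action probabilities, robust to large |q| / small tau."""
    z = q / temperature
    z = z - np.max(z)   # shift so the largest exponent is exactly 0
    e = np.exp(z)
    return e / e.sum()
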
def visualize_trial(agent, n_steps=200):
    """Do a trial without learning, with display.

    Parameters
    ----------
    agent -- agent whose policy drives the car
    n_steps -- number of steps to simulate for
    """
    # prepare for the visualization
    plb.ion()
    mv = mountaincar.MountainCarViewer(agent.mountain_car)
    mv.create_figure(n_steps=n_steps, max_time=n_steps)
    plb.draw()
    plb.pause(0.0001)

    # make sure the mountain-car is reset
    agent.mountain_car.reset()

    for n in range(n_steps):
        # choose an action from the agent's policy
        state = State(agent.mountain_car.x, agent.mountain_car.x_d)
        action = agent.choose_action(state)
        agent.mountain_car.apply_force(action)
        print("action", action)

        # simulate the timestep
        agent.mountain_car.simulate_timesteps(100, 0.01)

        # update the visualization
        mv.update_figure()
        plb.draw()
        plb.pause(0.0001)

        # check for rewards
        if agent.mountain_car.R > 0.0:
            print("reward obtained at t = ", agent.mountain_car.t)
            break

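# `State` is assumed to be defined elsewhere in this agent's module. One
# plausible minimal definition (an assumption for illustration, not the
# original) is a namedtuple over position and velocity:
from collections import namedtuple

State = namedtuple('State', ['x', 'x_d'])  # position, velocity

# usage: state = State(agent.mountain_car.x, agent.mountain_car.x_d)
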
def visualize_trial(self, n_steps=100):
    """Do a trial without learning, with display.

    Parameters
    ----------
    n_steps -- number of steps to simulate for
    """
    # prepare for the visualization
    plb.ion()
    mv = mountaincar.MountainCarViewer(self.mountain_car)
    mv.create_figure(n_steps, n_steps)
    plb.draw()

    # make sure the mountain-car is reset
    self.mountain_car.reset()

    # note: the original used the Python 2 builtins xrange and raw_input;
    # they are replaced by range and input for Python 3
    for n in range(n_steps):
        print('\rt =', self.mountain_car.t)
        input("Enter to continue...")
        sys.stdout.flush()

        reward = self.mountain_car.act(self.agent.act())
        self.agent.state = [self.mountain_car.x, self.mountain_car.vx]

        # update the visualization
        mv.update_figure()
        plb.draw()

        # check for rewards
        if reward > 0.0:
            print("\rreward obtained at t = ", self.mountain_car.t)
            break

def episode(self, n_steps=2000, tau=None, animation=False, fig=None):
    """Do an episode of at most `n_steps` steps.

    Also accepts the `tau` parameter, in case you want to update it.
    Optionally, you can specify a figure where to draw this episode.
    """
    # prepare for the visualization
    if animation:
        mv = mountaincar.MountainCarViewer(self.mountain_car)
        mv.create_figure(n_steps, n_steps, fig)

    # Initialisation:
    # ---------------
    self.mountain_car.reset()
    self.eligibility_trace = np.zeros_like(self.eligibility_trace)
    if tau is not None:
        self.tau = tau
    if self.tau < 1e-4:
        print("WARNING: tau is too small, so it has been replaced by 0.0001",
              "-- at t=%d" % len(self.escape_times),
              file=sys.stderr)
        sys.stderr.flush()
        self.tau = 1e-4

    # current state
    x = self.mountain_car.x
    v = self.mountain_car.x_d

    # representation of the current state, and current action
    action, state, Q_s_a = self.choose_action(x, v)
    Q_sp_ap = 0  # not yet known

    for n in range(n_steps):
        # use the current action to get to the next state, s'
        self.mountain_car.apply_force(action)
        self.mountain_car.simulate_timesteps(100, 0.01)
        x_prime = self.mountain_car.x
        v_prime = self.mountain_car.x_d

        # since this is SARSA, choose the next action assuming the same
        # policy is also followed in the next state
        action_prime, state_prime, Q_sp_ap = self.choose_action(x_prime,
                                                                v_prime)

        # update the weights based on the observations
        self.learn(state, action, Q_s_a, Q_sp_ap)

        # move to the next state
        Q_s_a = Q_sp_ap
        action = action_prime
        state = state_prime

        # update the visualization
        if animation:
            mv.update_figure(n)

        # stop when the goal is reached
        if self.mountain_car.R > 0.0:
            # print("reward obtained at t = ", self.mountain_car.t, end='\n\n')
            break

    # the episode is finished; save the escape time
    self.escape_times.append(self.mountain_car.t)

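# The `learn` call in the episode above is not shown. A minimal standalone
# sketch of the SARSA(lambda) update it presumably performs -- a TD error
# applied through decaying eligibility traces, matching the pattern used in
# the other snippets here -- follows. All parameter names, and the
# convention that `traces` is indexed by action, are illustrative
# assumptions, not the original implementation.
import numpy as np

def sarsa_lambda_update(weights, traces, r, Q_s_a, Q_sp_ap,
                        state_repr, action_idx, eta, gamma, lambda_):
    """One SARSA(lambda) step; returns the updated weights and traces."""
    delta = r + gamma * Q_sp_ap - Q_s_a        # TD error
    traces[action_idx] += state_repr           # bump the visited pair's trace
    weights = weights + eta * delta * traces   # move weights along all traces
    traces = traces * gamma * lambda_          # decay the traces
    return weights, traces
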
def one_run(self, n_steps=2000, visual_no_learn=False):
    # if visualisation is on, initialise the figure
    if visual_no_learn:
        plb.ion()
        mv = mountaincar.MountainCarViewer(self.mountain_car)
        mv.create_figure(n_steps, n_steps)
        plb.draw()
        plb.pause(0.001)

    # make sure the mountain-car is reset
    self.mountain_car.reset()

    # 1) Being in state s, choose action a according to the policy.
    #    The state is replaced by the r values (the responses of the neurons).
    self.state = S(self.mountain_car.x, self.mountain_car.x_d)
    self.state_response = self.compute_response(self.state)
    self.action = self.choose_action(self.state_response, self.tau(0))
    self.eligibilities = np.zeros((3, self.iN.shape[0], self.iN.shape[1]))

    for n in range(n_steps):
        # 2) Observe reward r and next state s'
        self.mountain_car.apply_force(self.ActDict[self.action])
        self.mountain_car.simulate_timesteps(100, 0.01)
        self.new_state = S(self.mountain_car.x, self.mountain_car.x_d)
        self.new_state_response = self.compute_response(self.new_state)

        # 3) Choose action a' in state s' according to the policy
        self.new_action = self.choose_action(self.new_state_response,
                                             self.tau(n + 1))

        if visual_no_learn:
            # update the visualization
            mv.update_figure()
            plb.draw()
            plb.pause(0.001)
        else:
            # 4) Update the weights
            # save the evolution of the weights
            self.weights_hist[0].append(self.weights.mean())
            self.weights_hist[1].append(self.weights.std())
            self.eligibilities[self.action] += self.state_response
            delta = (self.mountain_car.R
                     + self.gamma * self.Q(self.new_state_response,
                                           self.new_action)
                     - self.Q(self.state_response, self.action))
            self.weights = self.weights + self.eta * delta * self.eligibilities
            self.eligibilities *= self.gamma * self.lambd

        # 5) s' -> s; a' -> a
        self.state = self.new_state
        self.state_response = self.new_state_response
        self.action = self.new_action

        # check for rewards
        if self.mountain_car.R > 0.0:
            return n

    return n_steps

def learn(self, n_trials=100, n_steps=10000, verbose=0):
    self.verbose = verbose
    learning_curve = n_steps * np.ones(n_trials)

    for i in range(n_trials):
        if self.verbose:
            # prepare for visualization
            plb.ion()
            mv = mountaincar.MountainCarViewer(self.mc)
            mv.create_figure(n_steps, n_steps)
            plb.draw()

        # initialization for the new trial
        self.mc.reset()
        self.temp = self.temp0
        el_tr = np.zeros(self.net.W.shape)  # eligibility traces
        a, q, r = self.choose_action()

        # update the exploration temperature
        if self.temp_fun is not None:
            self.temp = self.temp_fun(self.temp0, 0, i, n_trials)

        for j in range(n_steps):
            # update the eligibility traces
            el_tr *= (self.reward_factor * self.el_tr_rate)
            el_tr[a, :] += r

            # simulate timesteps
            self.mc.simulate_timesteps(n=100, dt=0.01)

            # choose the next action
            a_prime, q_prime, r_prime = self.choose_action()

            # calculate the TD error
            delta = self.mc.R + (self.reward_factor * q_prime) - q

            # update the network weights
            deltaW = self.learn_rate * delta * el_tr
            self.net.W += deltaW

            # log
            if self.verbose:
                mv.update_figure()
                plb.draw()
                print("tau = {}".format(self.temp))
                print("a' = {}".format(a_prime))
                print("q' = {}".format(q_prime))
                print("delta = {}".format(delta))
                print("||deltaW|| = {}".format(np.linalg.norm(deltaW)))
                print("max(|deltaW|) = {}".format(np.max(np.abs(deltaW))))
                sys.stdout.flush()

            # swap the old variables for the new ones
            a = a_prime
            q = q_prime
            r = r_prime

            # check for rewards
            if self.mc.R > 0.0:
                if self.verbose:
                    print("\rGot reward at t = {}".format(self.mc.t))
                    sys.stdout.flush()
                learning_curve[i] = j
                break

        if verbose:
            input("Press ENTER to continue...")
            sys.stdout.flush()

    return learning_curve

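# `temp_fun` above is an injected temperature schedule called as
# temp_fun(temp0, temp_end, trial, n_trials). One plausible example -- an
# assumption for illustration, not the original -- anneals the softmax
# temperature exponentially from temp0 toward a small floor across trials:
import numpy as np

def exp_decay_temp(temp0, temp_end, trial, n_trials):
    """Exponentially anneal the exploration temperature across trials."""
    temp_min = max(temp_end, 1e-4)  # keep the softmax well-defined
    rate = np.log(temp0 / temp_min) / max(n_trials - 1, 1)
    return temp0 * np.exp(-rate * trial)
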