Code Example #1
    def visualize_trial(self, n_steps=200):
        """Do a trial without learning, with display.

        Parameters
        ----------
        n_steps -- number of steps to simulate for
        """
        
        # prepare for the visualization
        plb.ion()
        mv = mountaincar.MountainCarViewer(self.mountain_car)
        mv.create_figure(n_steps, n_steps)
        plb.draw()
            
        # make sure the mountain-car is reset
        self.mountain_car.reset()

        for n in range(n_steps):
            print('\rt =', self.mountain_car.t)
            sys.stdout.flush()
            
            # choose a random action
            self.mountain_car.apply_force(np.random.randint(3) - 1)
            # simulate the timestep
            self.mountain_car.simulate_timesteps(100, 0.01)

            # update the visualization
            mv.update_figure()
            plb.draw()            
            
            # check for rewards
            if self.mountain_car.R > 0.0:
                print("\rreward obtained at t = ", self.mountain_car.t)
                break
Code Example #2
File: NeuralAgent_v4.py  Project: vidit09/unsupNN
    def visualize_trial(self, n_steps, tau):
        """Do a trial without learning, with display.

        Parameters
        ----------
        n_steps -- number of steps to simulate for
        tau -- softmax temperature to use for this evaluation run
        """

        print('Simulating for:')
        print(self.weights.sum(axis=1).sum(axis=1))

        # Initialize
        self.pos = None
        self.vel = None
        self.energy = None
        self.action = None
        self.activations = np.zeros((self.n_neurons, self.n_neurons))
        self.Q = np.multiply(self.weights,
                             self.activations).sum(axis=1).sum(axis=1)

        self.tau = tau  # lets us choose a lower temperature for evaluation

        # prepare for the visualization
        plb.ion()
        mv = mountaincar.MountainCarViewer(self.mountain_car)
        mv.create_figure(n_steps, n_steps)
        plb.draw()

        # make sure the mountain-car is reset
        self.mountain_car.reset()

        for n in range(n_steps):
            # For time step showing
            #print('\rt =', self.mountain_car.t)
            #sys.stdout.flush()

            self._choose_action()

            self._update_state()

            # update the visualization
            mv.update_figure()
            plb.draw()

            # check for rewards
            if self.mountain_car.R > 0.0:
                print("\rreward obtained at t = ", self.mountain_car.t)
                break
Code Example #3
    def visualize_trial(self, agent=None, n_steps=200):
        """Do a trial without learning, with display.

        Parameters
        ----------
        agent -- agent whose policy supplies the actions (defaults to self)
        n_steps -- number of steps to simulate for
        """

        # prepare for the visualization
        plb.ion()
        mv = mountaincar.MountainCarViewer(self.mountain_car)
        mv.create_figure(n_steps, n_steps)
        plb.draw()

        # make sure the mountain-car is reset
        self.mountain_car.reset()

        # get the initial state
        state = self.mountain_car.state()

        # choose an action from the policy (fall back to this agent's own policy)
        if agent is None:
            agent = self
        action = agent._next_action(state)

        for n in range(n_steps):
            # Apply action
            self.mountain_car.apply_force(action - 1)

            # Simulate the time step
            self.mountain_car.simulate_timesteps(100, 0.01)

            # update the visualization
            mv.update_figure()
            plb.draw()

            # Observe the reward
            reward = self.mountain_car.R

            # check for rewards
            if reward > 0.0:
                plb.savefig('Car_viz.png')
                print("\rreward obtained at t = ", self.mountain_car.t)
                break

            # Get the next action
            state = self.mountain_car.state()
            action = agent._next_action(state)

        return self.mountain_car.t
Code Example #4
    def visualize_trial(self, n_steps=200):
        """Do a trial without learning, with display.

        Parameters
        ----------
        n_steps -- number of steps to simulate for
        """

        # prepare for the visualization
        plb.ion()
        mv = mountaincar.MountainCarViewer(self.mountain_car)
        mv.create_figure(n_steps, n_steps)
        plb.draw()
        plb.pause(1e-3)

        # make sure the mountain-car is reset
        self.mountain_car.reset()

        for n in (range(n_steps)):
            print('\rt =', self.mountain_car.t)
            sys.stdout.flush()

            # get current state
            s = (self.mountain_car.x, self.mountain_car.x_d)

            # select the current action greedily from the learned action values
            action_index = self.get_action_index(*s, greedy=True)

            # perform the action
            self.mountain_car.apply_force(action_index - 1)

            # simulate the timestep
            self.mountain_car.simulate_timesteps(100, 0.01)

            # update the visualization
            mv.update_figure()
            plb.draw()
            plb.pause(1e-3)

            # check for rewards
            if self.mountain_car.R > 0.0:
                print("\rreward obtained at t = ", self.mountain_car.t)
                break
Code Example #5
    def initiate_trial(self, visual=False):
        # H5 Data Sets
        h5data = dataSets.generate_data_sets(self.filename, self.centers)
        time_to_reward = [0]
        steps_to_reward = [0]

        # prepare for the visualization
        if visual:
            plb.ion()
            plb.pause(0.0001)
            mv = mountaincar.MountainCarViewer(self.mountain_car)
            mv.create_figure(self.n_steps, self.n_steps)
            plb.show()

        for episode_count in np.arange(self.n_episodes):
            self.reset()
            self._parameter_settings(episode_count)
            for step_count in range(self.n_steps):
                if self.verbose:
                    print("Episode: " + str(episode_count))
                    print("Simulation Step: {0}".format(step_count))
                    print("Mountain Car state: {0}".format(self.state))
                    print("Grid Center index: {0}".format(self.index))
                    print("Grid Center: {0}".format(self.centers[self.index]))
                self._learn()
                if visual:
                    # update the visualization
                    mv.update_figure()
                    plb.show()
                    plb.pause(0.0001)
                if self.mountain_car.R > 0.0:
                    print("Reward obtained at t = " + str(self.mountain_car.t))
                    steps_to_reward[0] = step_count
                    break
                elif step_count == self.n_steps - 1:
                    print("Maximum number of iterations reached.  No reward.")
                    steps_to_reward[0] = step_count
            time_to_reward[0] = self.mountain_car.t
            dataSets.generate_data_save(h5data, episode_count, time_to_reward,
                                        steps_to_reward, self.weights)
Code Example #6
    def visualize_trial(self, n_steps=200):
        """Do a trial without learning, with display.

        Parameters
        ----------
        n_steps -- number of steps to simulate for
        """
        
        # prepare for the visualization
        plb.ion()
        plb.pause(0.0001)
        mv = mountaincar.MountainCarViewer(self.mountain_car)
        mv.create_figure(n_steps, n_steps)
        plb.show()
            
        # make sure the mountain-car is reset
        self.mountain_car.reset()

        for n in range(n_steps):
            print('\rt =', self.mountain_car.t)
            sys.stdout.flush()
            
            inputs = self._compute_inputs(self.mountain_car.x, self.mountain_car.x_d)
            outputs = self._compute_outputs(inputs, self.W)
            action = self._select_action(self._softmax(outputs, self.t))
            self.mountain_car.apply_force(action - 1)
            # simulate the timestep
            self.mountain_car.simulate_timesteps(100, 0.01)

            # update the visualization
            mv.update_figure()
            plb.show()
            plb.pause(0.0001)

            # check for rewards
            if self.mountain_car.R > 0.0:
                print("\rreward obtained at t = ", self.mountain_car.t)
                break
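
Example #6 maps the raw state (x, x_d) to neuron activations via `_compute_inputs` before computing output values and a softmax. Below is a minimal sketch of the usual Gaussian (RBF) input layer for this task; the helper name `rbf_activations`, the grid ranges, and the widths are illustrative assumptions, not the project's own code.

import numpy as np

def rbf_activations(x, x_d, centers, sigma_x, sigma_xd):
    """Gaussian responses of input neurons whose preferred states are `centers` (N, 2)."""
    return np.exp(-((x - centers[:, 0]) / sigma_x) ** 2
                  - ((x_d - centers[:, 1]) / sigma_xd) ** 2)

# e.g. a 20 x 20 grid of centres (ranges and widths chosen purely for illustration)
xs, vs = np.meshgrid(np.linspace(-150, 30, 20), np.linspace(-15, 15, 20))
centers = np.column_stack([xs.ravel(), vs.ravel()])
r = rbf_activations(x=-60.0, x_d=0.0, centers=centers, sigma_x=9.5, sigma_xd=1.6)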
Code Example #7
    def visualize_trial(self, n_steps=200):
        """Do a trial without learning, with display.

        Parameters
        ----------
        n_steps -- number of steps to simulate for
        """

        # prepare for the visualization
        plb.ion()
        mv = mountaincar.MountainCarViewer(self.mountain_car)
        mv.create_figure(n_steps, n_steps)
        plb.draw()

        # make sure the mountain-car is reset
        self.mountain_car.reset()

        for n in range(n_steps):
            sys.stdout.flush()

            # compute the Q-values for the current state
            q, self.activations = self.action_values()
            p = np.exp(q / self.temperature) / sum(np.exp(
                q / self.temperature))
            # choose the action with a softmax
            self.a = np.random.choice(3, p=p)
            self.mountain_car.apply_force(self.a - 1)
            # simulate the timestep
            self.mountain_car.simulate_timesteps(100, 0.01)

            # update the visualization
            mv.update_figure()
            plb.draw()

            # check for rewards
            if self.mountain_car.R > 0.0:
                print("\rreward obtained at t = ", self.mountain_car.t)
                break
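
Example #7 exponentiates `q / self.temperature` directly, which can overflow for large action values or very small temperatures. A hedged sketch of a numerically stable variant follows; the standalone helper `stable_softmax` is an illustrative name, not part of the original project.

import numpy as np

def stable_softmax(q, temperature):
    # Shift by the maximum before exponentiating so np.exp cannot overflow;
    # the constant shift cancels out in the normalised probabilities.
    z = (q - np.max(q)) / temperature
    e = np.exp(z)
    return e / e.sum()

# e.g. draw one of three actions from Q-values with temperature 0.5
p = stable_softmax(np.array([1.0, 2.0, 0.5]), 0.5)
action = np.random.choice(3, p=p)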
Code Example #8
def visualize_trial(agent, n_steps=200):
    """Do a trial without learning, with display.

    Parameters
    ----------
    n_steps -- number of steps to simulate for
    """

    # prepare for the visualization
    plb.ion()
    mv = mountaincar.MountainCarViewer(agent.mountain_car)
    mv.create_figure(n_steps=n_steps, max_time=n_steps)
    plb.draw()
    plb.pause(0.0001)

    # make sure the mountain-car is reset
    agent.mountain_car.reset()

    for n in range(0, n_steps):

        # choose an action
        state = State(agent.mountain_car.x, agent.mountain_car.x_d)
        action = agent.choose_action(state)
        agent.mountain_car.apply_force(action)

        print("action", action)
        # simulate the timestep
        agent.mountain_car.simulate_timesteps(100, 0.01)

        # update the visualization
        mv.update_figure()
        plb.draw()
        plb.pause(0.0001)
        # check for rewards
        if agent.mountain_car.R > 0.0:
            print("reward obtained at t = ", agent.mountain_car.t)
            break
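
Because example #8 is a module-level function, it can drive any agent object that exposes a `mountain_car` attribute and a `choose_action(state)` method. A self-contained usage sketch with a stand-in random agent is shown below; it assumes the `mountaincar` module also provides a `MountainCar` class, as the viewer constructor suggests.

import numpy as np

class RandomAgent:
    """Minimal stand-in satisfying the interface visualize_trial expects."""

    def __init__(self):
        self.mountain_car = mountaincar.MountainCar()  # assumed class in the mountaincar module

    def choose_action(self, state):
        # apply a force of -1, 0 or +1 chosen uniformly at random
        return np.random.randint(3) - 1

visualize_trial(RandomAgent(), n_steps=200)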
Code Example #9
    def visualize_trial(self, n_steps=100):
        """
        Do a trial without learning, with display.

        Parameters
        ----------
        n_steps -- number of steps to simulate for
        """

        # prepare for the visualization
        plb.ion()
        mv = mountaincar.MountainCarViewer(self.mountain_car)
        mv.create_figure(n_steps, n_steps)
        plb.draw()

        # make sure the mountain-car is reset
        self.mountain_car.reset()

        for n in range(n_steps):

            print('\rt =', self.mountain_car.t)
            print("Enter to continue...")
            input()

            sys.stdout.flush()
            
            reward = self.mountain_car.act(self.agent.act())
            self.agent.state = [self.mountain_car.x, self.mountain_car.vx]
            
            # update the visualization
            mv.update_figure()
            plb.draw()
            
            # check for rewards
            if reward > 0.0:
                print("\rreward obtained at t = ", self.mountain_car.t)
                break
Code Example #10
File: starter.py  Project: cipri-tom/mountain-car
    def episode(self, n_steps=2000, tau=None, animation=False, fig=None):
        """ Do an episode of maximum `n_steps`
            This also accepts the `tau` parameter, in case you want to update it
            Optionally, you can specify a figure where to draw this episode
        """

        # prepare for the visualization
        if animation:
            mv = mountaincar.MountainCarViewer(self.mountain_car)
            mv.create_figure(n_steps, n_steps, fig)

        # Initialisation:
        # ---------------
        self.mountain_car.reset()
        self.eligibility_trace = np.zeros_like(self.eligibility_trace)

        if tau is not None:
            self.tau = tau
        if self.tau < 1e-4:
            print(
                "WARNING: Tau is too small so it has been replaced by 0.0001",
                "-- at t=%d" % len(self.escape_times),
                file=sys.stderr)
            sys.stderr.flush()
            self.tau = 1e-4

        # current state
        x = self.mountain_car.x
        v = self.mountain_car.x_d

        # representation of the current state, and current action
        action, state, Q_s_a = self.choose_action(x, v)
        Q_sp_ap = 0  # not yet known

        for n in range(n_steps):
            # Use current action to get to next state, s_prime
            self.mountain_car.apply_force(action)
            self.mountain_car.simulate_timesteps(100, 0.01)
            x_prime = self.mountain_car.x
            v_prime = self.mountain_car.x_d

            # Since this is SARSA, choose next action supposing you also use same policy in next state
            action_prime, state_prime, Q_sp_ap = self.choose_action(
                x_prime, v_prime)

            # update weights based on observations
            self.learn(state, action, Q_s_a, Q_sp_ap)

            # move to next state
            Q_s_a = Q_sp_ap
            action = action_prime
            state = state_prime

            # update the visualization
            if animation:
                mv.update_figure(n)

            # stop when goal was reached
            if self.mountain_car.R > 0.0:
                # print("reward obtained at t = ", self.mountain_car.t, end='\n\n')
                break

        # episode is finished, save number of steps
        self.escape_times.append(self.mountain_car.t)
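
Example #10 hides the actual SARSA(λ) update inside `self.learn`. The sketch below shows a minimal tabular version of that update; `alpha`, `gamma`, `lambda_` and the discretised `Q`/`e` arrays are illustrative assumptions, not the project's own data structures.

import numpy as np

def sarsa_lambda_update(Q, e, s, a, r, s_prime, a_prime,
                        alpha=0.1, gamma=0.95, lambda_=0.9):
    """One SARSA(lambda) step on tabular estimates Q with eligibility traces e."""
    delta = r + gamma * Q[s_prime, a_prime] - Q[s, a]  # TD error
    e[s, a] += 1.0                                     # accumulate the visited trace
    Q += alpha * delta * e                             # update every eligible pair in place
    e *= gamma * lambda_                               # decay all traces
    return delta

# e.g. with 100 discretised states and 3 actions
Q = np.zeros((100, 3))
e = np.zeros_like(Q)
sarsa_lambda_update(Q, e, s=0, a=1, r=0.0, s_prime=1, a_prime=2)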
Code Example #11
    def one_run(self, n_steps=2000, visual_no_learn=False):

        # if visualisation on, init figure
        if visual_no_learn:
            plb.ion()
            mv = mountaincar.MountainCarViewer(self.mountain_car)
            mv.create_figure(n_steps, n_steps)
            plb.draw()
            plb.pause(0.001)

        # make sure the mountain-car is reset
        self.mountain_car.reset()

        # 1) Being in state s, choose action a according to the policy
        # (the state is represented by the r values, i.e. the responses of the neurons)
        self.state = S(self.mountain_car.x, self.mountain_car.x_d)
        self.state_response = self.compute_response(self.state)
        self.action = self.choose_action(self.state_response, self.tau(0))

        self.eligibilities = np.zeros((3, self.iN.shape[0], self.iN.shape[1]))

        for n in range(n_steps):

            # 2) Observe reward r and next state s'
            self.mountain_car.apply_force(self.ActDict[self.action])
            self.mountain_car.simulate_timesteps(100, 0.01)

            self.new_state = S(self.mountain_car.x, self.mountain_car.x_d)
            self.new_state_response = self.compute_response(self.new_state)

            # 3) Choose action a' in state s' according to policy
            self.new_action = self.choose_action(self.new_state_response,
                                                 self.tau(n + 1))

            if visual_no_learn:
                # update the visualization
                mv.update_figure()
                plb.draw()
                plb.pause(0.001)
            else:
                # 4) Update weights
                self.weights_hist[0].append(self.weights.mean())
                self.weights_hist[1].append(
                    self.weights.std())  # save the evolution of the weight

                self.eligibilities[self.action] += self.state_response
                delta = self.mountain_car.R + self.gamma * self.Q(
                    self.new_state_response, self.new_action) - self.Q(
                        self.state_response, self.action)
                self.weights = self.weights + self.eta * delta * self.eligibilities
                self.eligibilities *= self.gamma * self.lambd

            # 5) s' -> s; a' -> a
            self.state = self.new_state
            self.state_response = self.new_state_response

            self.action = self.new_action

            # check for rewards
            if self.mountain_car.R > 0.0:
                return n

        return n_steps
Code Example #12
    def learn(self, n_trials=100, n_steps=10000, verbose=0):
        self.verbose = verbose
        learning_curve = n_steps * np.ones(n_trials)

        for i in range(n_trials):
            if self.verbose:
                # Prepare for visualization
                plb.ion()
                mv = mountaincar.MountainCarViewer(self.mc)
                mv.create_figure(n_steps, n_steps)
                plb.draw()

            # Initialization for new trial
            self.mc.reset()
            self.temp = self.temp0
            el_tr = np.zeros(self.net.W.shape)  # eligibility traces
            a, q, r = self.choose_action()

            # Update exploration temperature
            if self.temp_fun is not None:
                self.temp = self.temp_fun(self.temp0, 0, i, n_trials)

            for j in range(n_steps):

                # Update eligibility traces
                el_tr *= (self.reward_factor * self.el_tr_rate)
                el_tr[a, :] += r

                # Simulate timesteps
                self.mc.simulate_timesteps(n=100, dt=0.01)

                # Choose next action
                a_prime, q_prime, r_prime = self.choose_action()

                # Calculate TD error
                delta = self.mc.R + (self.reward_factor * q_prime) - q

                # Update network weights
                deltaW = self.learn_rate * delta * el_tr
                self.net.W += deltaW

                # Log
                if self.verbose:
                    mv.update_figure()
                    plb.draw()
                    print("tau = {}".format(self.temp))
                    print("a' = {}".format(a_prime))
                    print("q' = {}".format(q_prime))
                    print("delta = {}".format(delta))
                    print("||deltaW|| = {}".format(np.linalg.norm(deltaW)))
                    print("max(|deltaW|) = {}".format(np.max(np.abs(deltaW))))
                    sys.stdout.flush()

                # Replace the old variables with the new ones
                a = a_prime
                q = q_prime
                r = r_prime

                # Check for rewards
                if self.mc.R > 0.0:
                    if self.verbose:
                        print("\rGot reward at t = {}".format(self.mc.t))
                        sys.stdout.flush()
                    learning_curve[i] = j
                    break

            if verbose:
                input("Press ENTER to continue...")
                sys.stdout.flush()

        return learning_curve
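
Example #12 delegates the annealing of the exploration temperature to an external `temp_fun`, called as `temp_fun(temp0, 0, i, n_trials)`. One possible schedule matching that call pattern is sketched below; the exponential decay and the reading of the second argument as a floor temperature are assumptions, not the project's actual schedule.

def exp_decay_temperature(temp0, temp_min, trial, n_trials):
    """Anneal the softmax temperature from temp0 towards temp_min over n_trials trials."""
    temp_min = max(temp_min, 1e-4)         # keep the temperature strictly positive
    frac = trial / max(n_trials - 1, 1)    # 0 at the first trial, 1 at the last
    return temp0 * (temp_min / temp0) ** frac

# e.g. agent.temp_fun = exp_decay_temperature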