Example 1
    def learn(self, ipy_clear=False, max_episodes=100000000, max_pathlength=200):

        start_time = time.time()
        numeptotal = 0
        i = 0

        if self.enable_plots and self.stats is None:
            import matplotlib.pyplot as plt
            self.stats = {
                "tr":statbin(self.stats_rate),     # Total Reward
                "ft":statbin(self.stats_rate),     # Finishing Time
                "minvf":statbin(self.stats_rate),     # Min Value Fn
                "maxvf":statbin(self.stats_rate),     # Min Value Fn
            }

        for e in xrange(max_episodes):

            observation = self.env.reset()
            done = False
            total_reward = 0.0
            t = 0
            maxv = []
            minv = []

            obs = np.zeros( [self.nframes]+list(self.env.observation_space.shape) )
            new_obs = np.zeros( [self.nframes]+list(self.env.observation_space.shape) )
            obs[0,:] = observation

            while (not done) and (t<max_pathlength):
                t += 1
                self.env.render()
                action, values = self.act(obs)
                maxv.append(max(values.flatten()))
                minv.append(min(values.flatten()))

                new_observation, reward, done, info = self.env.step(action)
                new_obs[1:,:] = obs[:-1,:]   # shift the frame history back by one slot
                new_obs[0,:] = new_observation
                if not done and t == max_pathlength-1:
                    done = True

                do_update = (i%self.timesteps_per_batch==self.timesteps_per_batch-1)
                self.update_train( obs, action, reward, new_obs, done, do_update )

                obs[:,:] = new_obs[:,:]
                total_reward += reward
                i += 1

            print " * Episode %08d\tFrame %08d\tSamples: %08d\tTerminal: %08d\tReward: %d\tEpsilon: %f"%(e, t, len(self.observations), self.nterminal, total_reward, self.epsilon)
            if self.epsilon_schedule is not None:
                self.epsilon = self.epsilon_schedule(e, self.epsilon)

            if self.enable_plots:
                self.stats["tr"].add(total_reward)
                self.stats["ft"].add(t)
                self.stats["maxvf"].add(np.mean(maxv))
                self.stats["minvf"].add(np.mean(minv))

                if(e%self.stats_rate == self.stats_rate-1):
                    if ipy_clear:
                        from IPython import display
                        display.clear_output(wait=True)
                    fig = plt.figure(1)
                    fig.canvas.set_window_title("DDQN Training Stats for %s"%(self.env.__class__.__name__))
                    plt.clf()
                    plt.subplot(2,2,1)
                    self.stats["tr"].plot()
                    plt.title("Total Reward per Episode")
                    plt.xlabel("Episode")
                    plt.ylabel("Total Reward")
                    plt.legend(loc=2)
                    plt.subplot(2,2,2)
                    self.stats["ft"].plot()
                    plt.title("Finishing Time per Episode")
                    plt.xlabel("Episode")
                    plt.ylabel("Finishing Time")
                    plt.legend(loc=2)
                    plt.subplot(2,2,3)
                    self.stats["maxvf"].plot2(fill_col='lightblue', label='Avg Max VF')
                    self.stats["minvf"].plot2(fill_col='slategrey', label='Avg Min VF')
                    plt.title("Value Function Outputs")
                    plt.xlabel("Episode")
                    plt.ylabel("Value Fn")
                    plt.legend(loc=2)
                    ax = plt.subplot(2,2,4)
                    plt.plot(self.train_costs)
                    plt.title("Training Loss")
                    plt.xlabel("Training Epoch")
                    plt.ylabel("Loss")
                    try:
                        ax.set_yscale("log", nonposy='clip')
                        plt.tight_layout()
                    except:
                        pass
                    plt.show(block=False)
                    plt.draw()
                    plt.pause(0.001)
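
The only contract this loop places on epsilon_schedule is that, when set, it is a callable taking the episode index and the current epsilon and returning the new epsilon. A minimal sketch of a compatible schedule follows; the linear decay rate and the floor value are illustrative assumptions, not values taken from the source.

def linear_epsilon_schedule(episode, epsilon, decay=1e-4, epsilon_min=0.05):
    # Decay epsilon a little each episode, but never below epsilon_min.
    return max(epsilon_min, epsilon - decay)

# Hypothetical usage before calling learn():
#   agent.epsilon_schedule = linear_epsilon_schedule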
Example 2
    def train(self, ipy_clear=False, max_episodes=100000000, max_pathlength=200):

        rewards = statbin.statbin(10)
        observation = self.env.reset()
        prev_x = None # used in computing the difference frame
        xs,hs,dlogps,drs = [],[],[],[]
        running_reward = None
        reward_sum = 0
        episode_number = 0
        while True:
          if self.render: self.env.render()

          # preprocess the observation, set input to network to be difference image
          if self.preprocessor is not None:
              cur_x = self.preprocessor(observation)
          else:
              cur_x = observation

          x = cur_x - prev_x if prev_x is not None else np.zeros(self.input_dim, dtype='float32')
          x = x.flatten()
          prev_x = cur_x

          # forward the policy network and sample an action from the returned probability
          aprob = self.model.predict(x.reshape([1,self.input_dim]), batch_size=1).flatten()
          action = np.random.choice( self.env.action_space.n, 1, p=aprob/np.sum(aprob) )[0]

          # record various intermediates (needed later for backprop)
          xs.append(x) # observation

          # Harsh Grad ...
          y = np.zeros([self.env.action_space.n])
          y[action] = 1

          # Subtle Grad ...
#          y = aprob*0.9
#          y[action] = aprob[action] * 1.1

          dlogps.append(y) # grad that encourages the action that was taken
          #dlogps.append(y - aprob) # grad that encourages the action that was taken
          observation, reward, done, info = self.env.step(action)
          reward_sum += float(reward)

          drs.append(float(reward)) # record reward (has to be done after we call step() to get reward for previous action)

          if done: # an episode finished
            episode_number += 1

            # stack together all inputs, hidden states, action gradients, and rewards for this episode
            epx = np.vstack(xs)
            epdlogp = np.vstack(dlogps)
            epr = np.vstack(drs)
            xs,hs,dlogps,drs = [],[],[],[] # reset array memory

            # compute the discounted reward backwards through time
            discounted_epr = self.discount_rewards(epr)
            # standardize the rewards to be unit normal (helps control the gradient estimator variance)
            discounted_epr -= np.mean(discounted_epr)
            discounted_epr /= np.std(discounted_epr)

            epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)

            self.model.fit(epx, epdlogp,
                    nb_epoch=1, verbose=2, shuffle=True)

            # boring book-keeping
            running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
            rewards.add(reward_sum)
            print 'resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward)
            if episode_number % 100 == 0:
                self.save()
            reward_sum = 0
            observation = self.env.reset() # reset env
            prev_x = None

            if(self.enable_plots):
                plt.figure(1)
                #plt.plot(rewards)
                rewards.plot()
                plt.show(block=False)
                plt.draw()
                plt.pause(0.001)

          if reward != 0: # Pong has either +1 or -1 reward exactly when game ends.
            print ('ep %d: game finished, reward: %f' % (episode_number, reward)) + ('' if reward == -1 else ' !!!!!!!!')
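
discount_rewards is not shown in this listing; the loop only needs it to map the stacked per-step rewards to a per-step discounted return. A sketch of the usual backwards-accumulation form follows; the gamma default and the Pong-style reset of the running sum at every non-zero reward are assumptions, not taken from this code.

import numpy as np

def discount_rewards(r, gamma=0.99):
    # Walk backwards through the episode, accumulating a discounted running sum.
    discounted_r = np.zeros_like(r)
    running_add = 0.0
    for t in reversed(xrange(r.size)):
        if r[t] != 0:
            running_add = 0.0  # Pong-specific: a non-zero reward marks a game boundary
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r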
Example 3
    def learn(self,
              ipy_clear=False,
              max_episodes=100000000,
              max_pathlength=200):

        start_time = time.time()
        numeptotal = 0
        i = 0

        if self.enable_plots and self.stats is None:
            import matplotlib.pyplot as plt
            self.stats = {
                "tr": statbin(self.stats_rate),  # Total Reward
                "ft": statbin(self.stats_rate),  # Finishing Time
                "minvf": statbin(self.stats_rate),  # Min Value Fn
                "maxvf": statbin(self.stats_rate),  # Min Value Fn
            }

        for e in xrange(max_episodes):

            observation = self.env.reset()
            done = False
            total_reward = 0.0
            t = 0
            maxv = []
            minv = []

            obs = np.zeros([self.nframes] +
                           list(self.env.observation_space.shape))
            new_obs = np.zeros([self.nframes] +
                               list(self.env.observation_space.shape))
            obs[0, :] = observation

            while (not done) and (t < max_pathlength):
                t += 1
                self.env.render()
                action, values = self.act(obs)
                maxv.append(max(values.flatten()))
                minv.append(min(values.flatten()))

                new_observation, reward, done, info = self.env.step(action)
                new_obs[1:, :] = obs[:-1, :]  # shift the frame history back by one slot
                new_obs[0, :] = new_observation
                if not done and t == max_pathlength - 1:
                    done = True

                do_update = (
                    i % self.timesteps_per_batch == self.timesteps_per_batch -
                    1)
                self.update_train(obs, action, reward, new_obs, done,
                                  do_update)

                obs[:, :] = new_obs[:, :]
                total_reward += reward
                i += 1

            print " * Episode %08d\tFrame %08d\tSamples: %08d\tTerminal: %08d\tReward: %d\tEpsilon: %f" % (
                e, t, len(self.observations), self.nterminal, total_reward,
                self.epsilon)
            if self.epsilon_schedule is not None:
                self.epsilon = self.epsilon_schedule(e, self.epsilon)

            if self.enable_plots:
                self.stats["tr"].add(total_reward)
                self.stats["ft"].add(t)
                self.stats["maxvf"].add(np.mean(maxv))
                self.stats["minvf"].add(np.mean(minv))

                if (e % self.stats_rate == self.stats_rate - 1):
                    if ipy_clear:
                        from IPython import display
                        display.clear_output(wait=True)
                    fig = plt.figure(1)
                    fig.canvas.set_window_title("DDQN Training Stats for %s" %
                                                (self.env.__class__.__name__))
                    plt.clf()
                    plt.subplot(2, 2, 1)
                    self.stats["tr"].plot()
                    plt.title("Total Reward per Episode")
                    plt.xlabel("Episode")
                    plt.ylabel("Total Reward")
                    plt.legend(loc=2)
                    plt.subplot(2, 2, 2)
                    self.stats["ft"].plot()
                    plt.title("Finishing Time per Episode")
                    plt.xlabel("Episode")
                    plt.ylabel("Finishing Time")
                    plt.legend(loc=2)
                    plt.subplot(2, 2, 3)
                    self.stats["maxvf"].plot2(fill_col='lightblue',
                                              label='Avg Max VF')
                    self.stats["minvf"].plot2(fill_col='slategrey',
                                              label='Avg Min VF')
                    plt.title("Value Function Outputs")
                    plt.xlabel("Episode")
                    plt.ylabel("Value Fn")
                    plt.legend(loc=2)
                    ax = plt.subplot(2, 2, 4)
                    plt.plot(self.train_costs)
                    plt.title("Training Loss")
                    plt.xlabel("Training Epoch")
                    plt.ylabel("Loss")
                    try:
                        ax.set_yscale("log", nonposy='clip')
                        plt.tight_layout()
                    except:
                        pass
                    plt.show(block=False)
                    plt.draw()
                    plt.pause(0.001)
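
Both DQN listings assume a statbin helper exposing add, plot, and plot2(fill_col=..., label=...). A minimal sketch consistent with those call sites follows; only the method names and arguments come from the listings above, the bin-and-average internals are an assumption.

import numpy as np
import matplotlib.pyplot as plt

class statbin:
    def __init__(self, rate):
        self.rate = rate      # number of samples averaged into one bin
        self.values = []

    def add(self, value):
        self.values.append(value)

    def _binned_mean(self):
        n = (len(self.values) // self.rate) * self.rate
        if n == 0:
            return np.array([])
        return np.asarray(self.values[:n]).reshape(-1, self.rate).mean(axis=1)

    def plot(self, label='mean'):
        plt.plot(self._binned_mean(), label=label)

    def plot2(self, fill_col='lightblue', label='mean'):
        # Same curve as plot(), drawn with a filled region underneath it.
        m = self._binned_mean()
        plt.plot(m, label=label)
        plt.fill_between(np.arange(len(m)), m, color=fill_col, alpha=0.5)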
Example 4
    def train(self,
              ipy_clear=False,
              max_episodes=100000000,
              max_pathlength=200):

        rewards = statbin.statbin(10)
        observation = self.env.reset()
        prev_x = None  # used in computing the difference frame
        xs, hs, dlogps, drs = [], [], [], []
        running_reward = None
        reward_sum = 0
        episode_number = 0
        while True:
            if self.render: self.env.render()

            # preprocess the observation, set input to network to be difference image
            if self.preprocessor is not None:
                cur_x = self.preprocessor(observation)
            else:
                cur_x = observation

            x = cur_x - prev_x if prev_x is not None else np.zeros(
                self.input_dim, dtype='float32')
            x = x.flatten()
            prev_x = cur_x

            # forward the policy network and sample an action from the returned probability
            aprob = self.model.predict(x.reshape([1, self.input_dim]),
                                       batch_size=1).flatten()
            action = np.random.choice(self.env.action_space.n,
                                      1,
                                      p=aprob / np.sum(aprob))[0]

            # record various intermediates (needed later for backprop)
            xs.append(x)  # observation

            # Harsh Grad ...
            y = np.zeros([self.env.action_space.n])
            y[action] = 1

            # Subtle Grad ...
            #          y = aprob*0.9
            #          y[action] = aprob[action] * 1.1

            dlogps.append(y)  # grad that encourages the action that was taken
            #dlogps.append(y - aprob) # grad that encourages the action that was taken
            observation, reward, done, info = self.env.step(action)
            reward_sum += float(reward)

            drs.append(
                float(reward)
            )  # record reward (has to be done after we call step() to get reward for previous action)

            if done:  # an episode finished
                episode_number += 1

                # stack together all inputs, hidden states, action gradients, and rewards for this episode
                epx = np.vstack(xs)
                epdlogp = np.vstack(dlogps)
                epr = np.vstack(drs)
                xs, hs, dlogps, drs = [], [], [], []  # reset array memory

                # compute the discounted reward backwards through time
                discounted_epr = self.discount_rewards(epr)
                # standardize the rewards to be unit normal (helps control the gradient estimator variance)
                discounted_epr -= np.mean(discounted_epr)
                discounted_epr /= np.std(discounted_epr)

                epdlogp *= discounted_epr  # modulate the gradient with advantage (PG magic happens right here.)

                # experiment = Experiment(project_name='osh/kerlym')  # optional comet_ml tracking; requires "from comet_ml import Experiment"
                self.model.fit(epx,
                               epdlogp,
                               nb_epoch=1,
                               verbose=2,
                               shuffle=True)

                # boring book-keeping
                running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                rewards.add(reward_sum)
                print 'resetting env. episode reward total was %f. running mean: %f' % (
                    reward_sum, running_reward)
                if episode_number % 100 == 0:
                    self.save()
                reward_sum = 0
                observation = self.env.reset()  # reset env
                prev_x = None

                if (self.enable_plots):
                    plt.figure(1)
                    #plt.plot(rewards)
                    rewards.plot()
                    plt.show(block=False)
                    plt.draw()
                    plt.pause(0.001)

            if reward != 0:  # Pong has either +1 or -1 reward exactly when game ends.
                print('ep %d: game finished, reward: %f' %
                      (episode_number, reward)) + ('' if reward == -1 else
                                                   ' !!!!!!!!')
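
The policy-gradient loop treats preprocessor as an optional callable mapping a raw observation to the array the network consumes. In the Pong-from-pixels setting the comments allude to, a typical preprocessor crops, downsamples and binarizes the frame; the sketch below follows that pattern, but the crop bounds, the 80x80 output size and the background pixel values are assumptions about Pong, not taken from this listing.

import numpy as np

def pong_preprocessor(frame):
    # Crop the playing field, downsample by 2 and binarize to a float image.
    frame = frame[35:195]           # crop away the score bar and bottom border
    frame = frame[::2, ::2, 0]      # downsample by a factor of 2, keep one channel
    frame = frame.astype('float32')
    frame[frame == 144] = 0         # erase one background shade
    frame[frame == 109] = 0         # erase the other background shade
    frame[frame != 0] = 1           # paddles and ball become 1
    return frame                    # shape (80, 80); the loop flattens it afterwards

# Hypothetical usage: agent.preprocessor = pong_preprocessor, with input_dim = 80 * 80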