Example #1
import logging
import random

import numpy as np

# StateBuffer is a project-local helper that stacks the last history_length
# screens; it is assumed to be importable from the surrounding code base.
from state_buffer import StateBuffer

logger = logging.getLogger(__name__)


class Agent:
  def __init__(self, environment, replay_memory, deep_q_network, args):
    self.env = environment
    self.mem = replay_memory
    self.net = deep_q_network
    self.buf = StateBuffer(args)
    self.num_actions = self.env.numActions()
    self.random_starts = args.random_starts
    self.history_length = args.history_length

    self.exploration_rate_start = args.exploration_rate_start
    self.exploration_rate_end = args.exploration_rate_end
    self.exploration_decay_steps = args.exploration_decay_steps
    self.exploration_rate_test = args.exploration_rate_test
    self.total_train_steps = args.start_epoch * args.train_steps

    self.train_frequency = args.train_frequency
    self.train_repeat = args.train_repeat

    self.callback = None

  def _restartRandom(self):
    self.env.restart()
    # perform random number of dummy actions to produce more stochastic games
    for i in range(random.randint(self.history_length, self.random_starts) + 1):
      reward = self.env.act(0)
      screen = self.env.getScreen()
      terminal = self.env.isTerminal()
      assert not terminal, "terminal state occurred during random initialization"
      # add dummy states to buffer
      self.buf.add(screen)

  def _explorationRate(self):
    # calculate decaying exploration rate
    if self.total_train_steps < self.exploration_decay_steps:
      return self.exploration_rate_start - self.total_train_steps * (self.exploration_rate_start - self.exploration_rate_end) / self.exploration_decay_steps
    else:
      return self.exploration_rate_end

  def step(self, exploration_rate):
    # exploration rate determines the probability of random moves
    if random.random() < exploration_rate:
      action = random.randrange(self.num_actions)
      logger.debug("Random action = %d" % action)
    else:
      # otherwise choose action with highest Q-value
      state = self.buf.getStateMinibatch()
      # for convenience getStateMinibatch() returns minibatch
      # where first item is the current state
      qvalues = self.net.predict(state)
      assert len(qvalues[0]) == self.num_actions
      # choose highest Q-value of first state
      action = np.argmax(qvalues[0])
      logger.debug("Predicted action = %d" % action)

    # perform the action
    reward = self.env.act(action)
    screen = self.env.getScreen()
    terminal = self.env.isTerminal()

    # print reward
    if reward != 0:
      logger.debug("Reward: %d" % reward)

    # add screen to buffer
    self.buf.add(screen)

    # restart the game if over
    if terminal:
      logger.debug("Terminal state, restarting")
      self._restartRandom()

    # call callback to record statistics
    if self.callback:
      self.callback.on_step(action, reward, terminal, screen, exploration_rate)

    return action, reward, screen, terminal

  def play_random(self, random_steps):
    # play given number of steps
    for i in range(random_steps):
      # use exploration rate 1 = completely random
      self.step(1)

  def train(self, train_steps, epoch = 0):
    # do not do restart here, continue from testing
    #self._restartRandom()
    # play given number of steps
    for i in range(train_steps):
      # perform game step
      action, reward, screen, terminal = self.step(self._explorationRate())
      self.mem.add(action, reward, screen, terminal)
      # train after every train_frequency steps
      if self.mem.count > self.mem.batch_size and i % self.train_frequency == 0:
        # train for train_repeat times
        for j in range(self.train_repeat):
          #logger.info("i=%d, j=%d, mem.count=%d" % (i, j, self.mem.count))
          # sample minibatch
          minibatch = self.mem.getMinibatch()
          # train the network
          self.net.train(minibatch, epoch)
      # increase number of training steps for epsilon decay
      self.total_train_steps += 1

  def test(self, test_steps, epoch = 0):
    # just make sure there is history_length screens to form a state
    self._restartRandom()
    # play given number of steps
    for i in range(test_steps):
      # perform game step
      self.step(self.exploration_rate_test)

  def play(self, num_games):
    # just make sure there is history_length screens to form a state
    self._restartRandom()
    for i in range(num_games):
      # play until terminal state
      terminal = False
      while not terminal:
        action, reward, screen, terminal = self.step(self.exploration_rate_test)
        # add experiences to replay memory for visualization
        self.mem.add(action, reward, screen, terminal)
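
The _explorationRate method above implements a linear epsilon-decay schedule: the exploration rate falls from exploration_rate_start to exploration_rate_end over exploration_decay_steps training steps and then stays constant. A standalone sketch of the same formula, with illustrative default values that are not taken from the example:

def linear_epsilon(step, eps_start=1.0, eps_end=0.1, decay_steps=1000000):
    # Linearly anneal epsilon from eps_start down to eps_end over decay_steps steps.
    if step < decay_steps:
        return eps_start - step * (eps_start - eps_end) / decay_steps
    return eps_end

# quick sanity checks
assert linear_epsilon(0) == 1.0
assert abs(linear_epsilon(500000) - 0.55) < 1e-9
assert linear_epsilon(2000000) == 0.1
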
Example #2
import logging
import random

import numpy as np

# StateBuffer is a project-local helper that stacks the last history_length
# screens; it is assumed to be importable from the surrounding code base.
from state_buffer import StateBuffer

logger = logging.getLogger(__name__)


class Agent:
  def __init__(self, environment, replay_memory, deep_q_network, args):
    self.env = environment
    self.mem = replay_memory
    self.net = deep_q_network
    self.buf = StateBuffer(args)
    self.num_actions = self.env.numActions()
    self.random_starts = args.random_starts
    self.history_length = args.history_length

    self.exploration_rate_start = args.exploration_rate_start
    self.exploration_rate_end = args.exploration_rate_end
    self.exploration_decay_steps = args.exploration_decay_steps
    self.exploration_rate_test = args.exploration_rate_test
    self.total_train_steps = args.start_epoch * args.train_steps

    self.train_frequency = args.train_frequency
    self.train_repeat = args.train_repeat
    self.target_steps = args.target_steps

    self.callback = None

  def _restartRandom(self):
    self.env.restart()
    # perform random number of dummy actions to produce more stochastic games
    for i in range(random.randint(self.history_length, self.random_starts) + 1):
      reward = self.env.act(0)
      terminal = self.env.isTerminal()
      if terminal:
          self.env.restart()
      screen = self.env.getScreen()
      # add dummy states to buffer
      self.buf.add(screen)

  def _explorationRate(self):
    # calculate decaying exploration rate
    if self.total_train_steps < self.exploration_decay_steps:
      return self.exploration_rate_start - self.total_train_steps * (self.exploration_rate_start - self.exploration_rate_end) / self.exploration_decay_steps
    else:
      return self.exploration_rate_end

  def step(self, exploration_rate):
    # exploration rate determines the probability of random moves
    if random.random() < exploration_rate:
      action = random.randrange(self.num_actions)
      logger.debug("Random action = %d" % action)
    else:
      # otherwise choose action with highest Q-value
      state = self.buf.getStateMinibatch()
      # for convenience getStateMinibatch() returns minibatch
      # where first item is the current state
      qvalues = self.net.predict(state)
      assert len(qvalues[0]) == self.num_actions
      # choose highest Q-value of first state
      action = np.argmax(qvalues[0])
      logger.debug("Predicted action = %d" % action)

    # perform the action
    reward = self.env.act(action)
    screen = self.env.getScreen()
    terminal = self.env.isTerminal()

    # print reward
    if reward != 0:
      logger.debug("Reward: %d" % reward)

    # add screen to buffer
    self.buf.add(screen)

    # restart the game if over
    if terminal:
      logger.debug("Terminal state, restarting")
      self._restartRandom()

    # call callback to record statistics
    if self.callback:
      self.callback.on_step(action, reward, terminal, screen, exploration_rate)

    return action, reward, screen, terminal

  def play_random(self, random_steps):
    # call env.restart() first so that the environment is reset before the first step
    self.env.restart()
    # play given number of steps
    for i in range(random_steps):
      # use exploration rate 1 = completely random
      action, reward, screen, terminal = self.step(1)
      self.mem.add(action, reward, screen, terminal)

  def train(self, train_steps, epoch = 0):
    # do not do restart here, continue from testing
    #self._restartRandom()
    # play given number of steps
    for i in range(train_steps):
      # perform game step
      action, reward, screen, terminal = self.step(self._explorationRate())
      self.mem.add(action, reward, screen, terminal)
      # Update target network every target_steps steps
      if self.target_steps and i % self.target_steps == 0:
        self.net.update_target_network()
      # train after every train_frequency steps
      if self.mem.count > self.mem.batch_size and i % self.train_frequency == 0:
        # train for train_repeat times
        for j in range(self.train_repeat):
          # sample minibatch
          minibatch = self.mem.getMinibatch()
          # train the network
          self.net.train(minibatch, epoch)
      # increase number of training steps for epsilon decay
      self.total_train_steps += 1

  def test(self, test_steps, epoch = 0):
    # just make sure there is history_length screens to form a state
    self._restartRandom()
    # play given number of steps
    for i in range(test_steps):
      # perform game step
      self.step(self.exploration_rate_test)

  def play(self, num_games):
    # just make sure there is history_length screens to form a state
    self._restartRandom()
    for i in range(num_games):
      # play until terminal state
      terminal = False
      while not terminal:
        action, reward, screen, terminal = self.step(self.exploration_rate_test)
        # add experiences to replay memory for visualization
        self.mem.add(action, reward, screen, terminal)
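
Example #2 differs from Example #1 mainly in the periodic target-network update: every target_steps training steps it calls self.net.update_target_network(). A minimal, framework-agnostic sketch of what such a hard update typically does, assuming a hypothetical Q-network that stores its parameters as NumPy arrays (the class and attribute names below are illustrative, not the project's API):

import numpy as np

class TinyQNetwork:
    # Hypothetical Q-network that keeps its weights as plain NumPy arrays.
    def __init__(self, num_inputs, num_actions):
        rng = np.random.default_rng(0)
        # online (trained) parameters and a frozen copy used for target Q-values
        self.online_params = [rng.normal(size=(num_inputs, num_actions))]
        self.target_params = [p.copy() for p in self.online_params]

    def update_target_network(self):
        # hard update: copy the online weights into the target network
        self.target_params = [p.copy() for p in self.online_params]

net = TinyQNetwork(num_inputs=4, num_actions=3)
net.online_params[0] += 1.0  # pretend a training step changed the online weights
net.update_target_network()
assert all(np.array_equal(o, t) for o, t in zip(net.online_params, net.target_params))
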
Example #3
import logging
import random

import numpy as np

# StateBuffer is a project-local helper that stacks the last history_length
# screens; it is assumed to be importable from the surrounding code base.
from state_buffer import StateBuffer

logger = logging.getLogger(__name__)


class GymAgent(object):
    def __init__(self, env, net, replay_memory, exploration_strategy, args):
        # env is an already constructed Gym environment, e.g. gym.make("Breakout-v0")
        self.env = env
        self.net = net
        self.mem = replay_memory
        self.exploration_strategy = exploration_strategy
        self.buf = StateBuffer(args)
        self.history_length = args.history_length
        self.random_starts = args.random_starts
        self.exploration_train_strategy = args.exploration_train_strategy
        self.exploration_test_strategy = args.exploration_test_strategy
        self.train_net_frequency = args.train_net_frequency
        self.train_net_repeat = args.train_net_repeat

        self.callback = None

    def _restart_random(self):
        self.env.reset()
        # perform a random number of dummy actions to produce more stochastic games
        for t in range(random.randint(self.history_length, self.random_starts) + 1):
            self.mem.action = self.env.action_space.sample()
            (self.mem.observation, self.mem.reward,
             self.mem.done, self.mem.info) = self.env.step(self.mem.action)
            assert not self.mem.done, "done state occurred during random initialization"
            # add dummy states to the buffer
            self.buf.add(self.mem.observation)

    def act(self, exploration_strategy):
        # for a base agent class this could instead raise NotImplementedError
        if self.callback:
            self.callback.on_act_begin()
        # ask the exploration strategy for an action; None means exploit
        action = exploration_strategy()
        if action is not None:
            logger.debug("Explore action = {}".format(action))
        else:
            # otherwise choose the action with the highest Q-value
            state = self.buf.getStateMinibatch()
            # for convenience getStateMinibatch() returns a minibatch
            # where the first item is the current state
            qvalues = self.net.predict(state)
            assert len(qvalues[0]) == self.env.action_space.n
            # choose the highest Q-value of the first state
            action = np.argmax(qvalues[0])
            logger.debug("Predicted action = {}".format(action))

        # perform the action and update the replay memory
        self.mem.action = action
        (self.mem.observation, self.mem.reward,
         self.mem.done, self.mem.info) = self.env.step(self.mem.action)
        # add the screen to the buffer
        self.buf.add(self.mem.observation)
        # restart the game if over
        if self.mem.done:
            self._restart_random()
        # statistics/monitoring can be handled by the callback
        if self.callback:
            self.callback.on_act_end(action)
        return (action, self.mem.observation, self.mem.reward,
                self.mem.done, self.mem.info)

    def train(self, train_steps, episode=0):
        # do not restart here, continue from where testing left off
        # TODO: verify the buffer always holds history_length screens for the convnet
        #self._restart_random()
        # play the given number of steps
        for t in range(train_steps):
            # update the agent's replay memory regarding t
            self.mem.t = t
            # perform a game step
            self.act(self.exploration_train_strategy)
            # train after every train_net_frequency steps
            if self.mem.count > self.mem.batch_size and t % self.train_net_frequency == 0:
                # train for train_net_repeat times
                for j in range(self.train_net_repeat):
                    # sample a minibatch
                    minibatch = self.mem.getMinibatch()
                    # train the network
                    self.net.train(minibatch, episode)
            # restart the game if over
            if self.mem.done:
                # just make sure there are history_length screens to form a state
                # perform a random number of dummy actions to produce more stochastic games
                if t < random.randint(self.history_length, self.random_starts) + 1:
                    self.act(self.exploration_strategy.play_random)

    def test(self, test_steps, episode=0):
        # play the given number of steps
        for t in range(test_steps):
            # on the first step, remember where training left off and reset the environment
            if t == 0:
                test_start_t = self.mem.t
                self.env.reset()
            # update the agent's replay memory regarding t
            self.mem.t = test_start_t + t
            # just make sure there are history_length screens to form a state
            # perform a random number of dummy actions to produce more stochastic games
            if t < random.randint(self.history_length, self.random_starts) + 1:
                self.act(self.exploration_strategy.play_random)
            # perform a game step
            self.act(self.exploration_test_strategy)

    def play(self, num_games):
        for t in range(num_games):
            # just make sure there are history_length screens to form a state
            # perform a random number of dummy actions to produce more stochastic games
            if t < random.randint(self.history_length, self.random_starts) + 1:
                self.act(self.exploration_strategy.play_random)
            # play until a terminal state
            while not self.mem.done:
                self.act(self.exploration_test_strategy)
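
Example #3 delegates exploration to a callable strategy object: act() first asks the strategy for an action and only falls back to the greedy arg-max over the predicted Q-values when the strategy returns None. A minimal sketch of such a strategy for a Gym-style discrete action space (the class name and constructor arguments are illustrative, not part of the original project):

import random

class EpsilonGreedy:
    # Hypothetical exploration strategy: with probability epsilon return a random
    # action sampled from the action space, otherwise return None so that the
    # agent itself picks the greedy (arg-max Q) action.
    def __init__(self, action_space, epsilon=0.05):
        self.action_space = action_space
        self.epsilon = epsilon

    def __call__(self):
        if random.random() < self.epsilon:
            return self.action_space.sample()
        return None

# usage sketch: agent.act(EpsilonGreedy(env.action_space, epsilon=0.1))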