Example 1
    def run(self):
        # Rollout buffers for observations, TD targets, base actions,
        # spatial arguments (xy0, xy1), value estimates, and done flags.
        mb_obs, mb_td_targets, mb_base_actions, \
        mb_xy0, mb_xy1, \
        mb_values, mb_dones \
            = [], [], [], [], [], [], []

        mb_states = self.states
        for n in range(self.nsteps):
            # Policy over base actions (pi1), spatial argument policies
            # (pi_xy0, pi_xy1), value estimates, and model states.
            pi1, pi_xy0, pi_xy1, values, states = self.model.step(
                self.obs, self.states, self.dones)

            # Small uniform noise added before the argmax to add exploration
            # when picking base actions greedily.
            pi1_noise = np.random.random_sample((self.nenv, 2)) * 0.3

            # Greedy base action over the masked (available-action) policy.
            base_actions = np.argmax(pi1 * self.base_act_mask + pi1_noise,
                                     axis=1)

            # Decode flattened 32x32 screen indices into (x, y) coordinates.
            xy0 = np.argmax(pi_xy0, axis=1)
            x0 = (xy0 % 32).astype(int)
            y0 = (xy0 // 32).astype(int)

            xy1 = np.argmax(pi_xy1, axis=1)
            x1 = (xy1 % 32).astype(int)
            y1 = (xy1 // 32).astype(int)


            # Scripted-agent override: the first nscripts environments follow
            # a scripted policy instead of the learned one.

            for env_num in range(self.nenv):
                if (env_num >= self.nscripts):  # only for scripted agents
                    continue

                ob = self.obs[env_num, :, :, :]
                # The last observation channel is used as the player_relative map.
                player_relative = ob[:, :, -1]

                self.group_list[env_num] = common.update_group_list2(
                    self.control_groups[env_num])

                if len(self.action_queue[env_num]) == 0:
                    # Refill the action queue from the scripted TSP solver.
                    self.action_queue[env_num], self.group_id[env_num], self.dest_per_marine[env_num], self.xy_per_marine[env_num] = \
                      common.solve_tsp(player_relative,
                                       self.selected[env_num][0],
                                       self.group_list[env_num],
                                       self.group_id[env_num],
                                       self.dest_per_marine[env_num],
                                       self.xy_per_marine[env_num])

                # Default to no-op; overridden below if a scripted action is queued.
                base_actions[env_num] = 0
                x0[env_num] = 0
                y0[env_num] = 0
                x1[env_num] = 0
                y1[env_num] = 0

                if len(self.action_queue[env_num]) > 0:
                    action = self.action_queue[env_num].pop(0)
                    base_actions[env_num] = action.get("base_action", 0)

                    x0[env_num] = action.get("x0", 0)
                    y0[env_num] = action.get("y0", 0)
                    xy0[env_num] = y0[env_num] * 32 + x0[env_num]

                    x1[env_num] = action.get("x1", 0)
                    y1[env_num] = action.get("y1", 0)
                    xy1[env_num] = y1[env_num] * 32 + x1[env_num]

            # Keep only valid base actions, translate them to environment
            # action ids, and fetch the matching argument specs.
            base_actions = self.valid_base_action(base_actions)
            new_base_actions = self.trans_base_actions(base_actions)
            base_action_spec = self.env.action_spec(new_base_actions)

            actions = self.construct_action(
                base_actions,
                base_action_spec,
                x0,
                y0,
                x1,
                y1)

            mb_obs.append(np.copy(self.obs))
            mb_base_actions.append(base_actions)
            mb_xy0.append(xy0)
            mb_xy1.append(xy1)
            mb_values.append(values)
            mb_dones.append(self.dones)

            #print("final acitons : ", actions)
            obs, rewards, dones, available_actions, army_counts, control_groups, selected, xy_per_marine = self.env.step(
                actions=actions)
            self.army_counts = army_counts
            self.control_groups = control_groups
            self.selected = selected
            for env_num, data in enumerate(xy_per_marine):
                self.xy_per_marine[env_num] = data
            self.update_available(available_actions)

            self.states = states
            self.dones = dones
            for n, done in enumerate(dones):
                self.total_reward[n] += float(rewards[n])
                if done:
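                    # Episode finished: zero this environment's observation
                    # and log episode statistics.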
                    self.obs[n] = self.obs[n] * 0
                    self.episodes += 1
                    num_episodes = self.episodes
                    self.episode_rewards.append(self.total_reward[n])

                    mean_100ep_reward = round(
                        np.mean(self.episode_rewards[-101:-1]), 1)

                    if n < self.nscripts:  # scripted agents
                        self.episode_rewards_script.append(
                            self.total_reward[n])
                        mean_100ep_reward_script = round(
                            np.mean(self.episode_rewards_script[-101:-1]), 1)
                        nsml.report(
                            reward_script=self.total_reward[n],
                            mean_reward_script=mean_100ep_reward_script,
                            reward=self.total_reward[n],
                            mean_100ep_reward=mean_100ep_reward,
                            episodes=self.episodes,
                            step=self.episodes,
                            scope=locals())
                    else:
                        self.episode_rewards_a2c.append(self.total_reward[n])
                        mean_100ep_reward_a2c = round(
                            np.mean(self.episode_rewards_a2c[-101:-1]), 1)
                        nsml.report(reward_a2c=self.total_reward[n],
                                    mean_reward_a2c=mean_100ep_reward_a2c,
                                    reward=self.total_reward[n],
                                    mean_100ep_reward=mean_100ep_reward,
                                    episodes=self.episodes,
                                    step=self.episodes,
                                    scope=locals())

                    #print("env %s done! reward : %s mean_100ep_reward : %s " %
                    #      (n, self.total_reward[n], mean_100ep_reward))

                    # logger.record_tabular("reward", self.total_reward[n])
                    # logger.record_tabular("mean 100 episode reward",
                    #                       mean_100ep_reward)
                    # logger.record_tabular("episodes", self.episodes)

                    # logger.dump_tabular()

                    self.total_reward[n] = 0
                    self.group_list[n] = []

                    model = self.model
                    if self.callback is not None:
                        self.callback(locals(), globals())

            #print("rewards : ", rewards)
            #print("self.total_reward :", self.total_reward)
            self.update_obs(obs)
            mb_td_targets.append(rewards)
        mb_dones.append(self.dones)
        # Convert the batch of steps into a batch of rollouts:
        # swap (nsteps, nenv, ...) -> (nenv, nsteps, ...).
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(
            self.batch_ob_shape)
        mb_td_targets = np.asarray(mb_td_targets,
                                   dtype=np.float32).swapaxes(1, 0)
        mb_base_actions = np.asarray(mb_base_actions,
                                     dtype=np.int32).swapaxes(1, 0)

        mb_xy0 = np.asarray(mb_xy0, dtype=np.int32).swapaxes(1, 0)
        mb_xy1 = np.asarray(mb_xy1, dtype=np.int32).swapaxes(1, 0)

        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
        # Masks are the done flags before each step; dones are those after it.
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]
        last_values = self.model.value(self.obs, self.states,
                                       self.dones).tolist()
        # Discount rewards and bootstrap from the value function when the
        # rollout does not end at an episode boundary.
        for n, (rewards, dones,
                value) in enumerate(zip(mb_td_targets, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards + [value], dones + [0],
                                              self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            mb_td_targets[n] = rewards
        mb_td_targets = mb_td_targets.flatten()
        mb_base_actions = mb_base_actions.flatten()
        # mb_sub3_actions = mb_sub3_actions.flatten()
        # mb_sub4_actions = mb_sub4_actions.flatten()
        # mb_sub5_actions = mb_sub5_actions.flatten()
        # mb_sub6_actions = mb_sub6_actions.flatten()
        # mb_sub7_actions = mb_sub7_actions.flatten()
        # mb_sub8_actions = mb_sub8_actions.flatten()
        # mb_sub9_actions = mb_sub9_actions.flatten()
        # mb_sub10_actions = mb_sub10_actions.flatten()
        # mb_sub11_actions = mb_sub11_actions.flatten()
        # mb_sub12_actions = mb_sub12_actions.flatten()
        mb_xy0 = mb_xy0.flatten()
        # mb_y0 = mb_y0.flatten()
        mb_xy1 = mb_xy1.flatten()
        # mb_y1 = mb_y1.flatten()
        # mb_x2 = mb_x2.flatten()
        # mb_y2 = mb_y2.flatten()

        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()
        return mb_obs, mb_states, mb_td_targets, mb_masks, \
               mb_base_actions, mb_xy0, mb_xy1, mb_values
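
For reference, discount_with_dones is called by both examples but not defined in them. A minimal sketch of such a helper, assuming the baselines-style signature discount_with_dones(rewards, dones, gamma) where rewards and dones are per-step lists for a single environment, could look like this:

def discount_with_dones(rewards, dones, gamma):
    # Minimal sketch: n-step discounted returns that stop accumulating
    # at episode boundaries (done == 1).
    discounted = []
    running_return = 0.0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        running_return = reward + gamma * running_return * (1.0 - done)
        discounted.append(running_return)
    return discounted[::-1]

When the last step is not terminal, the runner appends the bootstrap value to rewards and a 0 to dones, then drops the final element of the result, which matches the rewards + [value] / [:-1] pattern used above.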
Example 2
  def run(self):
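    # Rollout buffers for observations, TD targets, base actions,
    # spatial arguments (xy0, xy1), value estimates, and done flags.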
    mb_obs, mb_td_targets, mb_base_actions, \
    mb_xy0, mb_xy1, \
    mb_values, mb_dones \
      = [], [], [], [], [], [], []

    mb_states = self.states
    for n in range(self.nsteps):
      # Policy over base actions (pi1), spatial argument policies
      # (pi_xy0, pi_xy1), value estimates, and model states.
      pi1, pi_xy0, pi_xy1, values, states = self.model.step(
          self.obs, self.states, self.dones)

      # Small uniform noise added before the argmax to add exploration
      # when picking base actions greedily.
      pi1_noise = np.random.random_sample((self.nenv, 3)) * 0.3

      # Greedy base action over the masked (available-action) policy.
      base_actions = np.argmax(
          pi1 * self.base_act_mask + pi1_noise, axis=1)

      # Decode flattened 32x32 screen indices into (x, y) coordinates.
      xy0 = np.argmax(pi_xy0, axis=1)
      x0 = (xy0 % 32).astype(int)
      y0 = (xy0 // 32).astype(int)

      xy1 = np.argmax(pi_xy1, axis=1)
      x1 = (xy1 % 32).astype(int)
      y1 = (xy1 // 32).astype(int)

      # Scripted-agent override: the first nscripts environments follow a
      # scripted policy instead of the learned one.

      for env_num in range(self.nenv):
        if env_num >= self.nscripts:  # only for scripted agents
          continue

        ob = self.obs[env_num, :, :, :]
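        # The last observation channel is used as the player_relative map.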
        player_relative = ob[:, :, -1]

        self.group_list[env_num] = common.update_group_list2(
            self.control_groups[env_num])

        if len(self.action_queue[env_num]) == 0:
          # Refill the action queue from the scripted TSP solver.
          self.action_queue[env_num], self.group_id[env_num], self.dest_per_marine[env_num], self.xy_per_marine[env_num] = \
            common.solve_tsp(player_relative,
                             self.selected[env_num][0],
                             self.group_list[env_num],
                             self.group_id[env_num],
                             self.dest_per_marine[env_num],
                             self.xy_per_marine[env_num])

        # Default to no-op; overridden below if a scripted action is queued.
        base_actions[env_num] = 0
        x0[env_num] = 0
        y0[env_num] = 0
        x1[env_num] = 0
        y1[env_num] = 0

        if len(self.action_queue[env_num]) > 0:
          action = self.action_queue[env_num].pop(0)
          base_actions[env_num] = action.get("base_action", 0)

          x0[env_num] = action.get("x0", 0)
          y0[env_num] = action.get("y0", 0)
          xy0[env_num] = y0[env_num] * 32 + x0[env_num]

          x1[env_num] = action.get("x1", 0)
          y1[env_num] = action.get("y1", 0)
          xy1[env_num] = y1[env_num] * 32 + x1[env_num]

      # Keep only valid base actions, translate them to environment
      # action ids, and fetch the matching argument specs.
      base_actions = self.valid_base_action(base_actions)
      new_base_actions = self.trans_base_actions(base_actions)
      base_action_spec = self.env.action_spec(new_base_actions)
      actions = self.construct_action(
          base_actions,
          base_action_spec,
          x0,
          y0,
          x1,
          y1
      )

      mb_obs.append(np.copy(self.obs))
      mb_base_actions.append(base_actions)

      mb_xy0.append(xy0)
      mb_xy1.append(xy1)
      mb_values.append(values)
      mb_dones.append(self.dones)

      #print("final acitons : ", actions)
      obs, rewards, dones,\
      available_actions, army_counts,\
      control_groups, selected, xy_per_marine\
      = self.env.step(
          actions=actions)
      self.army_counts = army_counts
      self.control_groups = control_groups
      self.selected = selected
      for env_num, data in enumerate(xy_per_marine):
        self.xy_per_marine[env_num] = data
      self.update_available(available_actions)

      self.states = states
      self.dones = dones
      mean_100ep_reward_a2c = 0
      for n, done in enumerate(dones):
        self.total_reward[n] += float(rewards[n])
        if done:
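          # Episode finished: zero this environment's observation
          # and log episode statistics.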
          self.obs[n] = self.obs[n] * 0
          self.episodes += 1
          num_episodes = self.episodes
          self.episode_rewards.append(self.total_reward[n])

          model = self.model
          mean_100ep_reward = round(
              np.mean(self.episode_rewards[-101:]), 1)
          if (n < self.nscripts):  # scripted agents
            self.episode_rewards_script.append(
                self.total_reward[n])
            mean_100ep_reward_script = round(
                np.mean(self.episode_rewards_script[-101:]), 1)
            nsml.report(
                reward_script=self.total_reward[n],
                mean_reward_script=mean_100ep_reward_script,
                reward=self.total_reward[n],
                mean_100ep_reward=mean_100ep_reward,
                episodes=self.episodes,
                step=self.episodes,
                scope=locals()
            )
          else:
            self.episode_rewards_a2c.append(self.total_reward[n])
            mean_100ep_reward_a2c = round(
                np.mean(self.episode_rewards_a2c[-101:]), 1)
            nsml.report(
                reward_a2c=self.total_reward[n],
                mean_reward_a2c=mean_100ep_reward_a2c,
                reward=self.total_reward[n],
                mean_100ep_reward=mean_100ep_reward,
                episodes=self.episodes,
                step=self.episodes,
                scope=locals()
            )
            print("mean_100ep_reward_a2c", mean_100ep_reward_a2c)

          if self.callback is not None:
            self.callback(locals(), globals())
          self.total_reward[n] = 0
          self.group_list[n] = []


      self.update_obs(obs)
      mb_td_targets.append(rewards)
    mb_dones.append(self.dones)
    # Convert the batch of steps into a batch of rollouts:
    # swap (nsteps, nenv, ...) -> (nenv, nsteps, ...).
    mb_obs = np.asarray(
        mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(
            self.batch_ob_shape)
    mb_td_targets = np.asarray(mb_td_targets, dtype=np.float32).swapaxes(1, 0)
    mb_base_actions = np.asarray(
        mb_base_actions, dtype=np.int32).swapaxes(1, 0)

    mb_xy0 = np.asarray(mb_xy0, dtype=np.int32).swapaxes(1, 0)
    mb_xy1 = np.asarray(mb_xy1, dtype=np.int32).swapaxes(1, 0)

    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
    # Masks are the done flags before each step; dones are those after it.
    mb_masks = mb_dones[:, :-1]
    mb_dones = mb_dones[:, 1:]
    last_values = self.model.value(self.obs, self.states,
                                   self.dones).tolist()
    # Discount rewards and bootstrap from the value function when the
    # rollout does not end at an episode boundary.
    for n, (rewards, dones, value) in enumerate(
        zip(mb_td_targets, mb_dones, last_values)):
      rewards = rewards.tolist()
      dones = dones.tolist()
      if dones[-1] == 0:
        rewards = discount_with_dones(rewards + [value], dones + [0],
                                      self.gamma)[:-1]
      else:
        rewards = discount_with_dones(rewards, dones, self.gamma)
      mb_td_targets[n] = rewards
    mb_td_targets = mb_td_targets.flatten()
    mb_base_actions = mb_base_actions.flatten()
    mb_xy0 = mb_xy0.flatten()
    mb_xy1 = mb_xy1.flatten()

    mb_values = mb_values.flatten()
    mb_masks = mb_masks.flatten()
    return mb_obs, mb_states, mb_td_targets, mb_masks, \
           mb_base_actions, mb_xy0, mb_xy1, mb_values