    def update_master_policy(self, rbs, disc, lr, cter):
        samples = random.sample(rbs, batch_size)
        minimaps = []
        screens = []
        infos = []
        next_minimaps = []
        next_screens = []
        next_infos = []
        actions = []
        rewards = []
        for i, [obs, _, action, _, next_obs] in enumerate(samples):
            minimap = np.array(obs.observation['minimap'], dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['screen'], dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            next_minimap = np.array(next_obs.observation['minimap'], dtype=np.float32)
            next_minimap = np.expand_dims(U.preprocess_minimap(next_minimap), axis=0)
            next_screen = np.array(next_obs.observation['screen'], dtype=np.float32)
            next_screen = np.expand_dims(U.preprocess_screen(next_screen), axis=0)
            next_info = np.zeros([1, self.isize], dtype=np.float32)
            next_info[0, next_obs.observation['available_actions']] = 1
            reward = next_obs.reward

            minimaps.append(minimap)
            screens.append(screen)
            infos.append(info)
            next_minimaps.append(next_minimap)
            next_screens.append(next_screen)
            next_infos.append(next_info)
            cur_action = np.zeros(num_subpolicies)
            cur_action[action] = 1
            actions.append(cur_action)
            rewards.append(reward)

        minimaps = np.concatenate(minimaps, axis=0)
        screens = np.concatenate(screens, axis=0)
        infos = np.concatenate(infos, axis=0)
        next_minimaps = np.concatenate(next_minimaps, axis=0)
        next_screens = np.concatenate(next_screens, axis=0)
        next_infos = np.concatenate(next_infos, axis=0)
        y_batch = []
        Qvalue_batch = self.sess_master.run(self.subpolicy_Q,
                                            feed_dict={self.minimap: next_minimaps,
                                                       self.screen: next_screens,
                                                       self.info: next_infos})
        for i in range(0, batch_size):
            terminal = samples[i][3]
            if terminal:
                y_batch.append(rewards[i])
            else:
                y_batch.append(rewards[i] + disc * np.max(Qvalue_batch[i]))

        self.sess_master.run(self.master_train_op, feed_dict={self.minimap: minimaps,
            self.screen: screens,
            self.info: infos,
            self.y_input: y_batch,
            self.action_input: actions,
            self.learning_rate: lr})
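The loop above unpacks each replay sample as [obs, _, action, _, next_obs], and the target loop reads the terminal flag from samples[i][3], so a stored transition is assumed to look like (obs, <unused>, action, terminal, next_obs). A minimal sketch of appending such a transition; the replay_buffer name, the None placeholder, and the use of next_obs.last() as the terminal flag are assumptions rather than part of the snippet:

# Hedged sketch of the assumed replay-buffer entry; update_master_policy above
# only reads indices 0, 2, 3 and 4 of each stored tuple.
transition = (obs, None, action, next_obs.last(), next_obs)
replay_buffer.append(transition)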
Example #2
def calc_pixel_change(obs2, obs1):
    screen1 = np.array(obs1.observation['screen'], dtype=np.float32)
    screen1 = np.expand_dims(U.preprocess_screen(screen1), axis=0)
    screen2 = np.array(obs2.observation['screen'], dtype=np.float32)
    screen2 = np.expand_dims(U.preprocess_screen(screen2), axis=0)

    screen1_avg = np.mean(screen1, axis=0)
    screen2_avg = np.mean(screen2, axis=0)

    d = np.absolute(screen2_avg[2:-2, 2:-2, :] - screen1_avg[2:-2, 2:-2, :])
    # d has shape (60, 60, 3); which channel to use doesn't matter here, since np.mean below averages over channels anyway.
    m = np.mean(d, 2)
    pc = self._subsample(m, 3)
    return pc
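The _subsample helper used above is not shown in this snippet. A minimal sketch of what it might look like, assuming it average-pools the 2-D change map into cells of the given width (as in UNREAL-style pixel-change rewards):

import numpy as np

def _subsample(a, average_width):
    # Average-pool a 2-D map into blocks of size average_width x average_width.
    s = average_width
    sh = (a.shape[0] // s, s, a.shape[1] // s, s)
    return a.reshape(sh).mean(axis=(1, 3))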
Example #3
    def getTrainFeedDict(self, obs, action, attributed_act_id):
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = U.preprocess_screen(screen)
        info = np.zeros([len(U.useful_actions)], dtype=np.float32)
        info[U.compressActions(obs.observation['available_actions'])] = 1
        valid_spatial_action = 0
        valid_action = np.zeros([len(U.useful_actions)], dtype=np.float32)

        custom_inputs = np.array(obs.observation.custom_inputs, dtype=np.float32)

        act_id = action.function
        net_act_id = attributed_act_id
        act_args = action.arguments

        player_relative = obs.observation.feature_screen.player_relative
        valid_actions = obs.observation["available_actions"]
        valid_actions = U.compressActions(valid_actions)
        valid_action[valid_actions] = 1

        args = actions.FUNCTIONS[act_id].args
        for arg, act_arg in zip(args, act_args):
            if arg.name in ('screen', 'minimap', 'screen2') and (not self.flags.force_focus_fire or (act_id != 12 and act_id != 2)):
                valid_spatial_action = 1

        return {
            self.screen: screen, # yes
            self.info: info, # yes
            self.custom_inputs: custom_inputs, #yes
            self.valid_spatial_action: valid_spatial_action, #yes
            self.valid_action: valid_action, # yes
        }
Example #4
    def step_low(self, ind_thread, obs, dir_high, act_id):
        # obs is the TimeStep passed in from the environment
        minimap = np.array(
            obs.observation['feature_minimap'], dtype=np.float32
        )  # the next four lines preprocess the minimap and screen features into the minimap and screen variables
        minimap = np.expand_dims(U.preprocess_minimap(minimap),
                                 axis=0)
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)

        # TODO: only use available actions
        info = np.zeros([1, self.isize],
                        dtype=np.float32)  # self.isize is the number of action functions
        info[0, obs.observation['available_actions']] = 1  # mark the currently available actions

        # minerals, army count, worker count
        info_plus = np.zeros([1, self.info_plus_size], dtype=np.float32)

        info_plus[0] = obs.observation.player.minerals, obs.observation[
            'player'][5], obs.observation['player'][6], obs.observation[
                'player'][4]

        # info now has size isize + info_plus_size
        info = np.concatenate((info, info_plus), axis=1)

        dir_high_usedToFeedLowNet = np.ones([1, 1], dtype=np.float32)
        dir_high_usedToFeedLowNet[0][0] = dir_high
        act_ID = np.ones([1, 1], dtype=np.float32)
        act_ID[0][0] = act_id

        feed = {
            self.minimap: minimap,
            self.screen: screen,
            self.info: info,
            self.dir_high_usedToFeedLowNet: dir_high_usedToFeedLowNet,
            self.act_id: act_ID
        }
        spatial_action_low = self.sess.run(  # type: Tensor("actor_low/Softmax:0", shape=(?, 4096), dtype=float32, device=/device:GPU:0)
            # [array([[0.00019935, 0.00025348, 0.00024519, ..., 0.00016189, 0.00016014, 0.00016842]], dtype=float32)]
            [self.spatial_action_low],
            feed_dict=feed)

        # Choose the position at which to apply the action
        # spatial_action_low = spatial_action_low.ravel()  # ravel() flattens the array to 1-D
        target = np.argmax(spatial_action_low)
        target = [int(target // self.ssize),
                  int(target % self.ssize)
                  ]  # (row, col) at which to apply the action; open question: if the action draws a selection box, where does the second corner come from?

        # if False:   # (question: what is this `if False` block for?)
        #   print(actions.FUNCTIONS[act_id].name, target)

        # Epsilon-greedy exploration: with probability epsilon[1] (0.2), randomly perturb the target position
        if self.training and np.random.rand() < self.epsilon[1]:
            dy = np.random.randint(-4, 5)
            target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
            dx = np.random.randint(-4, 5)
            target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

        return target[0], target[1]
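A worked example of the flat-index decomposition used above, assuming ssize = 64 (so the softmax output has 64 * 64 = 4096 entries):

ssize = 64
flat = 200                               # hypothetical argmax of spatial_action_low
row, col = flat // ssize, flat % ssize   # -> (3, 8)
assert row * ssize + col == flat         # the decomposition is invertible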
Example #5
    def step(self, obs):

        minimap = np.array(obs.observation['minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        # TODO: only use available actions
        action_indices = [
            0, 1, 2, 3, 5, 7, 12, 13, 14, 15, 18, 19, 20, 261, 274, 331, 332,
            333, 334, 451, 452, 453, 456
        ]
        valid_actions = list(
            set(obs.observation['available_actions']) & set(action_indices))
        # print("valid_actions",valid_actions)
        valid_actions_indices = [
            action_indices.index(i) for i in valid_actions
        ]
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, valid_actions_indices] = 1

        feed = {self.minimap: minimap, self.screen: screen, self.info: info}
        non_spatial_action, spatial_action = self.sess.run(
            [self.non_spatial_action, self.spatial_action], feed_dict=feed)

        # print("non_spatial_action",non_spatial_action.shape,len(non_spatial_action.ravel()))
        # print("spatial_action",spatial_action.ravel().shape,len(spatial_action.ravel()))
        # Select an action and a spatial target
        non_spatial_action = non_spatial_action.ravel()
        spatial_action = spatial_action.ravel()
        # valid_actions = obs.observation['available_actions']
        # act_id = valid_actions[np.argmax(non_spatial_action[valid_actions])]
        # print("valid",non_spatial_action[valid_actions_indices])

        act_id = valid_actions[np.argmax(
            non_spatial_action[valid_actions_indices])]
        # print("SELECTED",act_id)
        target = np.argmax(spatial_action)
        target = [int(target // self.ssize), int(target % self.ssize)]

        # Epsilon greedy exploration
        if self.training and np.random.rand() < self.epsilon[0]:
            act_id = np.random.choice(valid_actions)
        if self.training and np.random.rand() < self.epsilon[1]:
            dy = np.random.randint(-4, 5)
            target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
            dx = np.random.randint(-4, 5)
            target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

        # Set act_id and act_args
        act_args = []
        for arg in actions.FUNCTIONS[act_id].args:
            if arg.name in ('screen', 'minimap', 'screen2'):
                act_args.append([target[1], target[0]])
            else:
                act_args.append([0])  # TODO: Be careful
        return actions.FunctionCall(act_id, act_args)
Example #6
    def step(self, obs):
        minimap = np.array(obs.observation['feature_minimap'],
                           dtype=np.float32)
        # self.logger.info(minimap)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        # TODO: only use available actions
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1

        feed = {self.minimap: minimap, self.screen: screen, self.info: info}
        non_spatial_action, spatial_action = self.sess.run(
            [self.non_spatial_action, self.spatial_action], feed_dict=feed)

        # Select an action and a spatial target
        non_spatial_action = non_spatial_action.ravel()  # ravel flattens the output into a 1-D array
        spatial_action = spatial_action.ravel()
        valid_actions = obs.observation['available_actions']
        act_id = valid_actions[np.argmax(
            non_spatial_action[valid_actions]
        )]  # index of best valid non-spatial action
        target = np.argmax(spatial_action)
        target = [
            int(target // self.ssize),
            int(target % self.ssize)
        ]  # convert the flat argmax index into (row, col) on the ssize x ssize screen

        if False:  # disabled debug print
            self.logger.info(actions.FUNCTIONS[act_id].name, target)

        # Epsilon greedy exploration.
        if self.training and np.random.rand() < self.epsilon[0]:  # explore over actions with probability epsilon[0]
            act_id = np.random.choice(valid_actions)
        if self.training and np.random.rand() < self.epsilon[1]:  # perturb the target by up to 4 px rather than resampling over all pixels
            dy = np.random.randint(-4, 5)
            target[0] = int(max(0, min(self.ssize - 1, target[0] +
                                       dy)))  # max(0, min(31, target[0]+dy))
            dx = np.random.randint(-4, 5)
            target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

        # Set act_id and act_args
        act_args = []
        for arg in actions.FUNCTIONS[act_id].args:  # iterate over the args of the selected action
            if arg.name in ('screen', 'minimap', 'screen2'):
                act_args.append([target[1], target[0]])
            else:
                act_args.append([0])  # TODO: Be careful (???)
        return actions.FunctionCall(act_id, act_args)
Example #7
    def step(self, obs):
        """
        choose action
        get observation, return spatial, nonspatial action using RL
        obs = observation spec in lib/features.py : 218
        """
        # for v in tf.get_default_graph().as_graph_def().node:
        #     print(v.name)

        # obs.observation.feature_screen is (17, 64, 64)
        screen = np.array(obs.observation.feature_screen, dtype=np.float32)
        screen = np.expand_dims(preprocess_screen(screen), axis=0) # return (bs=1, channel=42, h=64, w=64)

        # get available actions
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1

        # run session to obtain spatial action output and non spatial action array
        non_spatial_action, spatial_action = self.sess.run(
            [self.non_spatial_action, self.spatial_action], feed_dict={self.screen: screen, self.info: info})

        # select action and spatial target
        non_spatial_action = non_spatial_action.ravel() # flatten
        spatial_action = spatial_action.ravel() # flatten
        valid_actions = obs.observation['available_actions']    # available action index
        act_id = valid_actions[np.argmax(non_spatial_action[valid_actions])]
        target = np.argmax(spatial_action)  # position to move
        target = [int(target // self.ssize), int(target % self.ssize)]

        # e-greedy action selection (IN THIS NETWORK, WE EXPLORE ONLY IF A RANDOM FRACTION IS ABOVE EPSILON)
        if np.random.random() > self.epsilon:
            # randomly select non-spatial action
            act_id = np.random.choice(valid_actions)

            # randomly select spatial action
            dy = np.random.randint(-4, 5)
            target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
            dx = np.random.randint(-4, 5)
            target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

        # Set act_id and act_args
        act_args = []
        for arg in actions.FUNCTIONS[act_id].args:
            if arg.name in ('screen', 'minimap', 'screen2'):    # in fact, only screen
                act_args.append([target[1], target[0]]) # y x to x y
            else:
                act_args.append([0])  # [0] means not queue

        # self.steps += 1
        # self.reward += obs.reward

        # print("return action with id: {} and args {} ".format(act_id, act_args))
        return actions.FunctionCall(act_id, act_args)
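Note that the exploration branch above fires when the random draw is greater than self.epsilon, which inverts the usual convention. For comparison only (not a correction of the snippet), a conventional epsilon-greedy check would look like:

if np.random.random() < self.epsilon:  # explore with probability epsilon
    act_id = np.random.choice(valid_actions)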
Example #8
    def step(self, obs):
        screen = np.array(obs.observation.feature_screen, dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        minimap = np.array(obs.observation.feature_minimap, dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        structured = np.zeros([1, self.structured_dimensions],
                              dtype=np.float32)
        structured[0, obs.observation.available_actions] = 1

        feed_dict = {
            self.screen_ph: screen,
            self.minimap_ph: minimap,
            self.structured_ph: structured
        }
        non_spatial_action, spatial_action = self.sess.run(
            [self.non_spatial_action, self.spatial_action],
            feed_dict=feed_dict)

        non_spatial_action = non_spatial_action.ravel()
        spatial_action = spatial_action.ravel()
        available_actions = obs.observation.available_actions
        action_id = available_actions[np.argmax(
            non_spatial_action[available_actions])]
        spatial_target = np.argmax(spatial_action)
        spatial_target = [
            int(spatial_target // self.screen_dimensions),
            int(spatial_target % self.screen_dimensions)
        ]

        # epsilon-greedy exploration
        if self.training and np.random.rand() < self.epsilon[0]:
            action_id = np.random.choice(available_actions)
        if self.training and np.random.rand() < self.epsilon[1]:
            delta_y, delta_x = np.random.randint(-4, 5), np.random.randint(-4, 5)
            spatial_target[0] = int(
                max(
                    0,
                    min(self.screen_dimensions - 1,
                        spatial_target[0] + delta_y)))
            spatial_target[1] = int(
                max(
                    0,
                    min(self.screen_dimensions - 1,
                        spatial_target[1] + delta_x)))

        action_args = []
        for arg in actions.FUNCTIONS[action_id].args:
            if arg.name in ('screen', 'minimap', 'screen2'):
                action_args.append([spatial_target[1], spatial_target[0]])
            else:
                action_args.append([0])
        return actions.FunctionCall(action_id, action_args)

    def get_cur_Q_action(self, obs):
        minimap = np.array(obs.observation['minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        # TODO: only use available actions
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1

        feed_master = {self.minimap: minimap,
                       self.screen: screen,
                       self.info: info}
        subpolicy_selected = np.argmax(self.sess_master.run(self.subpolicy_Q, feed_dict=feed_master), axis=1)[0]
        return subpolicy_selected

    def step(self, obs):
        minimap = np.array(obs.observation['minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        # TODO: only use available actions
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1

        subpolicy_index = self.get_cur_Q_action(obs)

        #print("Subpolicy Chosen is :"+str(subpolicy_index))
        feed = {self.minimap: minimap,
                self.screen: screen,
                self.info: info}
        cur_spatial_action, cur_non_spatial_action, _ = self.subpolicies[subpolicy_index]

        non_spatial_action, spatial_action = self.sess_master.run(
            [cur_non_spatial_action, cur_spatial_action],
            feed_dict=feed)

        # Select an action and a spatial target
        non_spatial_action = non_spatial_action.ravel()
        spatial_action = spatial_action.ravel()
        valid_actions = obs.observation['available_actions']
        act_id = valid_actions[np.argmax(non_spatial_action[valid_actions])]
        target = np.argmax(spatial_action)
        target = [int(target // self.ssize), int(target % self.ssize)]

        if False:
            print(actions.FUNCTIONS[act_id].name, target)

        # Epsilon greedy exploration
        if self.training and np.random.rand() < self.epsilon[0]:
            act_id = np.random.choice(valid_actions)
        if self.training and np.random.rand() < self.epsilon[1]:
            dy = np.random.randint(-4, 5)
            target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
            dx = np.random.randint(-4, 5)
            target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

        # Set act_id and act_args
        act_args = []
        for arg in actions.FUNCTIONS[act_id].args:
            if arg.name in ('screen', 'minimap', 'screen2'):
                act_args.append([target[1], target[0]])
            else:
                act_args.append([0])  # TODO: Be careful
        return actions.FunctionCall(act_id, act_args)
Example #11
    def getPredictFeedDict(self, obs, hState, cState):
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        info = np.zeros([1, len(U.useful_actions)], dtype=np.float32)
        info[0, U.compressActions(obs.observation['available_actions'])] = 1
        custom_inputs = np.expand_dims(
            np.array(obs.observation.custom_inputs, dtype=np.float32), axis=0)
        hState = np.expand_dims(np.array(hState), axis=0)
        cState = np.expand_dims(np.array(cState), axis=0)
        return {
            self.screen: screen,
            self.info: info,
            self.custom_inputs: custom_inputs,
            self.hStateInput: hState,
            self.cStateInput: cState
        }
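A minimal usage sketch for the feed above, assuming the recurrent state is carried between calls and starts as zero vectors; lstm_size, agent, sess and fetches are assumed names, not taken from the snippet:

hState = np.zeros([lstm_size], dtype=np.float32)  # assumed hidden-state width
cState = np.zeros([lstm_size], dtype=np.float32)
feed = agent.getPredictFeedDict(obs, hState, cState)
# e.g. policy, value, hState, cState = sess.run(fetches, feed_dict=feed)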
Example #12
    def step(self, obs):
        super(A3CAgent, self).step(obs)
        minimap = np.array(obs.observation['feature_minimap'],
                           dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        # TODO: only use available actions
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1  # mask for available actions

        feed = {self.minimap: minimap, self.screen: screen, self.info: info}
        non_spatial_action, spatial_action = self.sess.run(
            [self.non_spatial_action, self.spatial_action], feed_dict=feed)

        # Select an action and a spatial target
        non_spatial_action = non_spatial_action.ravel()
        spatial_action = spatial_action.ravel()
        valid_actions = obs.observation['available_actions']
        act_id = valid_actions[np.argmax(non_spatial_action[valid_actions])]
        target = np.argmax(spatial_action)
        target = [int(target // self.ssize), int(target % self.ssize)]

        if False:
            print(actions.FUNCTIONS[act_id].name, target)

        # Epsilon greedy exploration
        if self.training and np.random.rand() < self.epsilon[0]:
            act_id = np.random.choice(valid_actions)
        if self.training and np.random.rand() < self.epsilon[1]:
            dy = np.random.randint(-4, 5)
            target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
            dx = np.random.randint(-4, 5)
            target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

        # Set act_id and act_args
        act_args = []
        for arg in actions.FUNCTIONS[act_id].args:
            if arg.name in ('screen', 'minimap', 'screen2'):
                act_args.append([target[1],
                                 target[0]])  # target is stored as (y, x); pysc2 args expect (x, y)
            else:
                act_args.append([0])  #not queued TODO: Be careful
        return actions.FunctionCall(act_id, act_args)
Example #13
  def step(self, obs):
    minimap = np.array(obs.observation['minimap'], dtype=np.float32)
    minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
    screen = np.array(obs.observation['screen'], dtype=np.float32)
    screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
    # TODO: only use available actions
    info = np.zeros([1, self.isize], dtype=np.float32)
    info[0, obs.observation['available_actions']] = 1

    feed = {self.minimap: minimap,
            self.screen: screen,
            self.info: info}
    non_spatial_action, spatial_action = self.sess.run(
      [self.non_spatial_action, self.spatial_action],
      feed_dict=feed)

    # Select an action and a spatial target
    non_spatial_action = non_spatial_action.ravel()
    spatial_action = spatial_action.ravel()
    valid_actions = obs.observation['available_actions']
    act_id = valid_actions[np.argmax(non_spatial_action[valid_actions])]
    target = np.argmax(spatial_action)
    target = [int(target // self.ssize), int(target % self.ssize)]

    if False:
      print(actions.FUNCTIONS[act_id].name, target)

    # Epsilon greedy exploration
    if self.training and np.random.rand() < self.epsilon[0]:
      act_id = np.random.choice(valid_actions)
    if self.training and np.random.rand() < self.epsilon[1]:
      dy = np.random.randint(-4, 5)
      target[0] = int(max(0, min(self.ssize-1, target[0]+dy)))
      dx = np.random.randint(-4, 5)
      target[1] = int(max(0, min(self.ssize-1, target[1]+dx)))

    # Set act_id and act_args
    act_args = []
    for arg in actions.FUNCTIONS[act_id].args:
      if arg.name in ('screen', 'minimap', 'screen2'):
        act_args.append([target[1], target[0]])
      else:
        act_args.append([0])  # TODO: Be careful
    return actions.FunctionCall(act_id, act_args)
Example #14
    def step_high(self, obs):  # obs is the TimeStep passed in from the environment
        minimap = np.array(
            obs.observation['feature_minimap'], dtype=np.float32
        )  # the next four lines preprocess the minimap and screen features into the minimap and screen variables
        minimap = np.expand_dims(U.preprocess_minimap(minimap),
                                 axis=0)
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        # TODO: only use available actions

        info = np.zeros([1, self.isize],
                        dtype=np.float32)  # self.isize is the number of action functions
        info[0, obs.observation['available_actions']] = 1  # mark the currently available actions

        # minerals, army count, worker count
        info_plus = np.zeros([1, self.info_plus_size], dtype=np.float32)
        info_plus[0] = obs.observation.player.minerals, obs.observation[
            'player'][5], obs.observation['player'][6], obs.observation[
                'player'][4]

        # info now has size isize + info_plus_size
        info = np.concatenate((info, info_plus), axis=1)

        feed = {self.minimap: minimap, self.screen: screen, self.info: info}
        dir_high = self.sess.run([self.dir_high], feed_dict=feed)

        # Select the macro-action id

        # TODO (DHN): dir_high could first be filtered, e.g. by checking whether the hard-coded
        # micro-actions of each macro-action are in obs.observation['available_actions']
        # valid_dir_high = obs.observation['available_actions']

        dir_high_id = np.argmax(dir_high)  # id of the macro-action to execute (0-based)

        # if False:   # (question: what is this `if False` block for?)
        #   print(actions.FUNCTIONS[act_id].name, target)

        # Epsilon-greedy exploration: with probability epsilon[0] (0.05), pick a random macro-action (overriding dir_high_id)
        if self.training and np.random.rand() < self.epsilon[0]:
            dir_high_id = random.randint(0, num_macro_action - 1)

        return dir_high_id
Example #15
    def update(self, rbs, disc, lr, cter):
        # Compute R, which is value of the last observation

        obs = rbs[-1][-1]

        # Print out score on a test run through a full episode, don't update network on test run
        if self.test_run and obs.last():
            self.test_scores.append(obs.observation['score_cumulative'][0])
            # print("TEST SCORE: " + str(self.test_scores[-1]))

            return
        else:
            train_score = obs.observation['score_cumulative'][0]

        logger.info('Total game steps: %s', self.count_steps)
        self.count_steps += len(rbs)

        if obs.last():
            R = 0
        else:
            minimap = np.array(obs.observation['minimap'], dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['screen'], dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            action_indices = [
                0, 1, 2, 3, 5, 7, 12, 13, 14, 15, 18, 19, 20, 261, 274, 331,
                332, 333, 334, 451, 452, 453, 456
            ]
            valid_actions = list(
                set(obs.observation['available_actions'])
                & set(action_indices))
            valid_actions_indices = [
                action_indices.index(i) for i in valid_actions
            ]
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, valid_actions_indices] = 1

            feed = {
                self.minimap: minimap,
                self.screen: screen,
                self.info: info
            }
            R = self.sess.run(self.value, feed_dict=feed)[0]

        # Compute targets and masks
        minimaps = []
        screens = []
        infos = []

        value_target = np.zeros([len(rbs)], dtype=np.float32)
        value_target[-1] = R

        valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)
        spatial_action_selected = np.zeros([len(rbs), self.ssize**2],
                                           dtype=np.float32)
        valid_non_spatial_action = np.zeros([len(rbs), self.isize],
                                            dtype=np.float32)
        non_spatial_action_selected = np.zeros([len(rbs), self.isize],
                                               dtype=np.float32)

        rbs.reverse()
        for i, [obs, action, next_obs] in enumerate(rbs):
            minimap = np.array(obs.observation['minimap'], dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['screen'], dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            action_indices = [
                0, 1, 2, 3, 5, 7, 12, 13, 14, 15, 18, 19, 20, 261, 274, 331,
                332, 333, 334, 451, 452, 453, 456
            ]
            valid_actions = list(
                set(obs.observation['available_actions'])
                & set(action_indices))
            valid_actions_indices = [
                action_indices.index(i) for i in valid_actions
            ]
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, valid_actions_indices] = 1

            minimaps.append(minimap)
            screens.append(screen)
            infos.append(info)

            reward = obs.reward
            act_id = action.function
            act_args = action.arguments

            value_target[i] = reward + disc * value_target[i - 1]

            # valid_actions = obs.observation["available_actions"]
            valid_non_spatial_action[i, valid_actions_indices] = 1
            non_spatial_action_selected[i, action_indices.index(act_id)] = 1

            args = actions.FUNCTIONS[act_id].args
            for arg, act_arg in zip(args, act_args):
                if arg.name in ('screen', 'minimap', 'screen2'):
                    ind = act_arg[1] * self.ssize + act_arg[0]
                    valid_spatial_action[i] = 1
                    spatial_action_selected[i, ind] = 1

        minimaps = np.concatenate(minimaps, axis=0)
        screens = np.concatenate(screens, axis=0)
        infos = np.concatenate(infos, axis=0)

        # Train
        feed = {
            self.minimap: minimaps,
            self.screen: screens,
            self.info: infos,
            self.value_target: value_target,
            self.valid_spatial_action: valid_spatial_action,
            self.spatial_action_selected: spatial_action_selected,
            self.valid_non_spatial_action: valid_non_spatial_action,
            self.non_spatial_action_selected: non_spatial_action_selected,
            self.learning_rate: lr,
            self.train_score: train_score
        }
        _, summary = self.sess.run([self.train_op, self.summary_op],
                                   feed_dict=feed)
        self.summary_writer.add_summary(summary, cter)
Example #16
    def update(self, replay_buffer, gamma, learning_rate, step):
        obs = replay_buffer[-1][-1]
        if obs.last():
            reward = 0
        else:
            screen = np.array(obs.observation.feature_screen, dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            minimap = np.array(obs.observation.feature_minimap,
                               dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            structured = np.zeros([1, self.structured_dimensions],
                                  dtype=np.float32)
            structured[0, obs.observation.available_actions] = 1

            feed_dict = {
                self.screen_ph: screen,
                self.minimap_ph: minimap,
                self.structured_ph: structured
            }
            reward = self.sess.run(self.value, feed_dict=feed_dict)

        # compute targets and masks
        screens, minimaps, structureds = [], [], []
        target_value = np.zeros([len(replay_buffer)], dtype=np.float32)
        target_value[-1] = reward

        valid_non_spatial_action = np.zeros(
            [len(replay_buffer), len(actions.FUNCTIONS)], dtype=np.float32)
        sample_non_spatial_action = np.zeros(
            [len(replay_buffer), len(actions.FUNCTIONS)], dtype=np.float32)
        valid_spatial_action = np.zeros([len(replay_buffer)], dtype=np.float32)
        sample_spatial_action = np.zeros(
            [len(replay_buffer), self.screen_dimensions**2], dtype=np.float32)

        replay_buffer.reverse()
        for i, [obs, action, next_obs] in enumerate(replay_buffer):
            screen = np.array(obs.observation.feature_screen, dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            minimap = np.array(obs.observation.feature_minimap,
                               dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            structured = np.zeros([1, self.structured_dimensions],
                                  dtype=np.float32)
            structured[0, obs.observation.available_actions] = 1

            screens.append(screen)
            minimaps.append(minimap)
            structureds.append(structured)

            reward = obs.reward
            action_id = action.function
            action_args = action.arguments

            target_value[i] = reward + gamma * target_value[i - 1]

            available_actions = obs.observation.available_actions
            valid_non_spatial_action[i, available_actions] = 1
            sample_non_spatial_action[i, action_id] = 1

            args = actions.FUNCTIONS[action_id].args
            for arg, action_arg in zip(args, action_args):
                if arg.name in ('screen', 'minimap', 'screen2'):
                    spatial_action = action_arg[1] * self.screen_dimensions + action_arg[0]
                    valid_spatial_action[i] = 1
                    sample_spatial_action[i, spatial_action] = 1

        screens = np.concatenate(screens, axis=0)
        minimaps = np.concatenate(minimaps, axis=0)
        structureds = np.concatenate(structureds, axis=0)

        feed_dict = {
            self.screen_ph: screens,
            self.minimap_ph: minimaps,
            self.structured_ph: structureds,
            self.target_value_ph: target_value,
            self.valid_non_spatial_action_ph: valid_non_spatial_action,
            self.sample_non_spatial_action_ph: sample_non_spatial_action,
            self.valid_spatial_action_ph: valid_spatial_action,
            self.sample_spatial_action_ph: sample_spatial_action,
            self.learning_rate_ph: learning_rate
        }
        _, summary = self.sess.run([self.train_op, self.summary_op],
                                   feed_dict=feed_dict)
        self.summary_writer.add_summary(summary, step)
Example #17
    def learn(self):
        """when certain number of replay size reach, learn from minibatch replay"""
        print("\nstart learning...")

        # replace target net parameters
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.sess.run(self.replace_target_op)
            print('\nreplaced target net parameters...')

        # sample mini-batch
        sample_indices = np.random.choice(len(self.memory), size=self.batch_size)
        batch_memory = deque(list(np.array(self.memory)[sample_indices]))
        print("selecting minibatch of size: {}..." .format(len(batch_memory)))

        # extract s = [], a = [], s' = [], r = []
        screens = []
        screens_next = []
        infos = []
        infos_next = []
        rewards = []

        # actions
        valid_spatial_action = np.zeros([self.batch_size], dtype=np.float32)
        spatial_action_selected = np.zeros([self.batch_size, self.ssize ** 2], dtype=np.float32)
        valid_non_spatial_action = np.zeros([self.batch_size, len(actions.FUNCTIONS)], dtype=np.float32)
        non_spatial_action_selected = np.zeros([self.batch_size, len(actions.FUNCTIONS)], dtype=np.float32)

        for i, [obs, a, r, obs_] in enumerate(batch_memory):
            # s current state from obs
            screen = np.array(obs.observation.feature_screen, dtype=np.float32)
            screen = np.expand_dims(preprocess_screen(screen), axis=0)  # return (bs=1, channel=42, h=64, w=64)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            # s_ next state from obs_
            screen_next = np.array(obs_.observation.feature_screen, dtype=np.float32)
            screen_next = np.expand_dims(preprocess_screen(screen_next), axis=0)  # return (bs=1, channel=42, h=64, w=64)
            info_next = np.zeros([1, self.isize], dtype=np.float32)
            info_next[0, obs_.observation['available_actions']] = 1

            # append to s list, s_ list
            screens.append(screen)
            infos.append(info)
            screens_next.append(screen_next)
            infos_next.append(info_next)

            # get reward r
            rewards.append(r)

            # get action 'a'
            act_id = a.function
            act_args = a.arguments

            valid_actions = obs.observation["available_actions"]
            valid_non_spatial_action[i, valid_actions] = 1
            non_spatial_action_selected[i, act_id] = 1

            args = actions.FUNCTIONS[act_id].args
            for arg, act_arg in zip(args, act_args):
                if arg.name in ('screen', 'minimap', 'screen2'):
                    ind = act_arg[1] * self.ssize + act_arg[0]
                    valid_spatial_action[i] = 1
                    spatial_action_selected[i, ind] = 1

        screens = np.concatenate(screens, axis=0) # (32, size of s)
        infos = np.concatenate(infos, axis=0)
        screens_next = np.concatenate(screens_next, axis=0)
        infos_next = np.concatenate(infos_next, axis=0)

        rewards = np.transpose(np.array(rewards))  # shape (batch_size,)

        # get q_next = Q(s', a': theta) to calculate y
        q_next = self.sess.run(self.q_next, feed_dict={self.screen: screens_next, self.info: infos_next})
        # q_next = self.sess.run(self.q_eval, feed_dict={self.screen: screens_next, self.info: infos_next})
        q_target = rewards + self.gamma * q_next

        # train
        feed = {self.screen: screens,
                self.info: infos,
                self.q_target: q_target,
                self.valid_spatial_action: valid_spatial_action,
                self.spatial_action_selected: spatial_action_selected,
                self.valid_non_spatial_action: valid_non_spatial_action,
                self.non_spatial_action_selected: non_spatial_action_selected
                }

        # _, summary = self.sess.run([self.train_op, self.summary_op], feed_dict=feed)
        # self.summary_writer.add_summary(summary, self.learn_step_counter)
        _ = self.sess.run(self.train_op, feed_dict=feed)

        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1
        pass
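For comparison only: a textbook one-step DQN target bootstraps from the greedy next-state value and masks terminal transitions. The sketch below assumes q_next has shape [batch_size, n_actions] and that a hypothetical dones array (1.0 at terminal transitions, 0.0 otherwise) is available; neither assumption comes from the snippet above, which feeds rewards + gamma * q_next through unchanged:

q_max_next = np.max(q_next, axis=1)               # greedy bootstrap value per sample
y = rewards + gamma * (1.0 - dones) * q_max_next  # zero the bootstrap at terminals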
Example #18
  def update(self, rbs, disc, lr, cter):
    # Compute R, which is value of the last observation
    obs = rbs[-1][-1]
    if obs.last():
      R = 0
    else:
      minimap = np.array(obs.observation['minimap'], dtype=np.float32)
      minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
      screen = np.array(obs.observation['screen'], dtype=np.float32)
      screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
      info = np.zeros([1, self.isize], dtype=np.float32)
      info[0, obs.observation['available_actions']] = 1

      feed = {self.minimap: minimap,
              self.screen: screen,
              self.info: info}
      R = self.sess.run(self.value, feed_dict=feed)[0]

    # Compute targets and masks
    minimaps = []
    screens = []
    infos = []

    value_target = np.zeros([len(rbs)], dtype=np.float32)
    value_target[-1] = R

    valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)
    spatial_action_selected = np.zeros([len(rbs), self.ssize**2], dtype=np.float32)
    valid_non_spatial_action = np.zeros([len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)
    non_spatial_action_selected = np.zeros([len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)

    rbs.reverse()
    for i, [obs, action, next_obs] in enumerate(rbs):
      minimap = np.array(obs.observation['minimap'], dtype=np.float32)
      minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
      screen = np.array(obs.observation['screen'], dtype=np.float32)
      screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
      info = np.zeros([1, self.isize], dtype=np.float32)
      info[0, obs.observation['available_actions']] = 1

      minimaps.append(minimap)
      screens.append(screen)
      infos.append(info)

      reward = obs.reward
      act_id = action.function
      act_args = action.arguments

      value_target[i] = reward + disc * value_target[i-1]

      valid_actions = obs.observation["available_actions"]
      valid_non_spatial_action[i, valid_actions] = 1
      non_spatial_action_selected[i, act_id] = 1

      args = actions.FUNCTIONS[act_id].args
      for arg, act_arg in zip(args, act_args):
        if arg.name in ('screen', 'minimap', 'screen2'):
          ind = act_arg[1] * self.ssize + act_arg[0]
          valid_spatial_action[i] = 1
          spatial_action_selected[i, ind] = 1

    minimaps = np.concatenate(minimaps, axis=0)
    screens = np.concatenate(screens, axis=0)
    infos = np.concatenate(infos, axis=0)

    # Train
    feed = {self.minimap: minimaps,
            self.screen: screens,
            self.info: infos,
            self.value_target: value_target,
            self.valid_spatial_action: valid_spatial_action,
            self.spatial_action_selected: spatial_action_selected,
            self.valid_non_spatial_action: valid_non_spatial_action,
            self.non_spatial_action_selected: non_spatial_action_selected,
            self.learning_rate: lr}
    _, summary = self.sess.run([self.train_op, self.summary_op], feed_dict=feed)
    self.summary_writer.add_summary(summary, cter)
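Because rbs is reversed before the loop, value_target[i - 1] already holds the target of the chronologically later step, so a single forward pass over the reversed buffer computes discounted returns back from the bootstrap value R. A standalone sketch of the same computation with plain numpy and made-up rewards:

import numpy as np

rewards = [0.0, 1.0, 0.0, 2.0]   # oldest ... newest (hypothetical values)
R = 0.5                          # bootstrap value of the final observation
disc = 0.99

value_target = np.zeros(len(rewards), dtype=np.float32)
value_target[-1] = R             # seed read at i = 0, overwritten on the last iteration
for i, r in enumerate(rewards[::-1]):  # newest first, mirroring rbs.reverse()
    value_target[i] = r + disc * value_target[i - 1]
# value_target[i] is now the discounted return of the i-th newest step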
Example #19
    def step(self, obs):  # action selection is in here
        minimap = np.array(obs.observation['feature_minimap'],
                           dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        # TODO: only use available actions
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1

        feed = {self.minimap: minimap, self.screen: screen, self.info: info}

        # Select an action and a spatial target.
        valid_actions = np.zeros(self.isize, dtype=np.int32)
        valid_actions[obs.observation['available_actions']] = 1
        function_id_policy, spatial_policy = self.sess.run(
            [self.non_spatial_policy, self.spatial_policy], feed_dict=feed)

        # self.logger.info(f"spatial_policy unraveled: {spatial_policy}.")
        # self.logger.info(f":{spatial_policy.shape}.")

        function_id_policy = function_id_policy.ravel()  # ravel flattens the output into a 1-D array
        spatial_policy = spatial_policy.ravel()
        # self.logger.info(f"spatial_policy .raveled: {spatial_policy}")  # this will help with target below
        # self.logger.info(f":{spatial_policy.shape}.")
        function_id_policy *= valid_actions

        function_ids = np.arange(len(function_id_policy))
        function_id_policy /= np.sum(function_id_policy)
        #     act_id = valid_actions[np.argmax(non_spatial_policy[valid_actions])]
        act_id = np.random.choice(function_ids,
                                  p=np.squeeze(function_id_policy))
        target = np.argmax(spatial_policy)  # flat index into the ssize*ssize spatial map
        target = [int(target // self.ssize),
                  int(target % self.ssize)]  # convert the flat index to (row, col)

        if False:
            self.logger.info(
                f"if false: {actions.FUNCTIONS[act_id].name, target}")

        # Epsilon greedy exploration. Keeping this to see if it works
        # basically, if eps greedy: take the target and move it left/right and up/down 4 px
        if self.training and np.random.rand() < self.epsilon[1]:
            dy = np.random.randint(-4, 5)
            target[0] = int(max(
                0, min(self.ssize - 1, target[0] +
                       dy)))  # make sure target is within possible pxl range
            dx = np.random.randint(-4, 5)
            target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

        args = []
        # args: A list of the types of args passed to function_type
        for arg in actions.FUNCTIONS[act_id].args:
            if arg.name in ('screen', 'minimap', 'screen2'):
                # x_policy = self.sess.run(
                #     self.argument_policy[str(arg) + "x"],
                #     feed_dict=feed)

                # y_policy = self.sess.run(
                #     self.argument_policy[str(arg) + "y"],
                #     feed_dict=feed)

                # x_policy = np.squeeze(x_policy)
                # x_ids = np.arange(len(x_policy))
                # x = np.random.choice(x_ids, p=x_policy)

                # y_policy = np.squeeze(y_policy)
                # y_ids = np.arange(len(y_policy))
                # y = np.random.choice(y_ids, p=y_policy)
                # args.append([x, y])
                args.append([target[1], target[0]])
                # self.logger.info(f"target coords: {[target[1], target[0]]}")
            else:
                arg_policy = self.sess.run(self.argument_policy[str(arg)],
                                           feed_dict=feed)
                arg_policy = np.squeeze(arg_policy)
                arg_ids = np.arange(len(arg_policy))
                arg_index = np.random.choice(arg_ids, p=arg_policy)
                args.append([arg_index])
                # self.logger.info(f"arg: index: {arg_index}")


        #           args.append([0])

        # sizes: the max+1 of each of the dimensions this argument takes
        return actions.FunctionCall(
            act_id, args)  # each arg should be an int in [0, arg.size)
Example #20
    def step(self, obs):
        super(RandomAgent, self).step(obs)
        self.randomOrgreedy = False
        feature_screen = np.expand_dims(preprocess_screen(
            obs.observation.feature_screen),
                                        axis=0)
        feature_map = np.expand_dims(preprocess_minimap(
            obs.observation.feature_minimap),
                                     axis=0)
        info = np.zeros([1, self.action_size], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1
        feed_dict = {
            self.minimap: feature_map,
            self.screen: feature_screen,
            self.info: info
        }
        non_spatial_action, spatial_action = self.sess.run(
            [self.non_spatial_action, self.spatial_action],
            feed_dict=feed_dict)
        non_spatial_action = non_spatial_action.ravel()
        spatial_action = spatial_action.ravel()  #output shape 4096
        target = np.argmax(spatial_action)
        target = [
            int(target // self.minimap_size),
            int(target % self.minimap_size)
        ]
        valid_actions = obs.observation.available_actions
        act_id = valid_actions[np.argmax(non_spatial_action[valid_actions])]

        # print("available actions = " + str(obs.observation.available_actions))
        # function_id = numpy.random.choice(obs.observation.available_actions)
        # function_id = 1
        # print("function_id = " + str(function_id))
        # print("observation_spec " + str(self.obs_spec))
        # print("action_spec" + str((self.action_spec.functions)))
        # args = [[numpy.random.randint(0, size) for size in arg.sizes]
        # for arg in self.action_spec.functions[function_id].args]
        # print("function args = " + str(self.action_spec.functions[function_id].args))
        # for id in obs.observation.available_actions:
        #     for arg in self.action_spec.functions[id].args:
        #         ctr = 0
        #         for size in arg.sizes:
        #             ctr +=1
        #         if(ctr>2):
        #             print("function_id = " + str(id))

        if np.random.rand() < self.epsilon[0]:
            act_id = np.random.choice(valid_actions)
            self.randomOrgreedy = True
        if np.random.rand() < self.epsilon[1]:
            dy = np.random.randint(-4, 5)
            target[0] = int(max(0, min(self.screen_size - 1, target[0] + dy)))
            dx = np.random.randint(-4, 5)
            target[1] = int(max(0, min(self.screen_size - 1, target[1] + dx)))
        act_args = []
        for arg in self.action_spec.functions[act_id].args:
            if arg.name in ('screen', 'minimap', 'screen2'):
                act_args.append([target[1], target[0]])
            else:
                act_args.append([0])  # TODO: Be careful
        if (act_id != self.temp_act_id):
            self.temp_act_id = act_id
            if (self.randomOrgreedy):
                print("RANDOM")
            print("action " + str(actions.FUNCTIONS[act_id].name))
            print("target" + str(target))
        # print("args = " + str(args))
        # print("\n\n\n")
        return actions.FunctionCall(act_id, act_args)
Example #21
    def update(self, rbs, disc, lr, cter):
        # Compute R, which is value of the last observation
        obs = rbs[-1][-1]
        if obs.last():
            # obs[3]['score_cumulative'][0] or obs.reward
            R = obs[3]['score_cumulative'][0]

            # enums from https://github.com/Blizzard/s2client-api/blob/master/include/sc2api/sc2_typeenums.h
            _TERRAN_BARRACKS = 21
            _TERRAN_MARINE = 48
            _UNIT_TYPE = features.SCREEN_FEATURES.unit_type.index
            unit_type = obs.observation['feature_screen'][_UNIT_TYPE]

            barracks_y, barracks_x = (unit_type == _TERRAN_BARRACKS).nonzero()

            if barracks_x.any():
                print('Barracks detected')
                R += 1

            print('Episode reward: {}'.format(R))

        else:
            minimap = np.array(obs.observation['feature_minimap'],
                               dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['feature_screen'],
                              dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            feed = {
                self.minimap: minimap,
                self.screen: screen,
                self.info: info
            }
            R = self.sess.run(self.value, feed_dict=feed)[0]

        # Compute targets and masks
        minimaps = []
        screens = []
        infos = []

        value_target = np.zeros([len(rbs)], dtype=np.float32)
        value_target[-1] = R

        valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)
        spatial_action_selected = np.zeros([len(rbs), self.ssize**2],
                                           dtype=np.float32)
        valid_non_spatial_action = np.zeros(
            [len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)
        non_spatial_action_selected = np.zeros(
            [len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)

        rbs.reverse()
        for i, [obs, action, next_obs] in enumerate(rbs):
            minimap = np.array(obs.observation['feature_minimap'],
                               dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['feature_screen'],
                              dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            minimaps.append(minimap)
            screens.append(screen)
            infos.append(info)

            #reward = obs.reward
            reward = 0.25 * (next_obs.observation['score_cumulative'][0] -
                             obs.observation['score_cumulative'][0])
            act_id = action.function
            act_args = action.arguments

            value_target[i] = reward + disc * value_target[i - 1]

            valid_actions = obs.observation["available_actions"]
            valid_non_spatial_action[i, valid_actions] = 1
            non_spatial_action_selected[i, act_id] = 1

            args = actions.FUNCTIONS[act_id].args
            for arg, act_arg in zip(args, act_args):
                if arg.name in ('screen', 'minimap', 'screen2'):
                    ind = act_arg[1] * self.ssize + act_arg[0]
                    valid_spatial_action[i] = 1
                    spatial_action_selected[i, ind] = 1

        minimaps = np.concatenate(minimaps, axis=0)
        screens = np.concatenate(screens, axis=0)
        infos = np.concatenate(infos, axis=0)

        for minimaps, screens, infos, value_target, valid_spatial_action, \
                spatial_action_selected, valid_non_spatial_action, \
                non_spatial_action_selected in zip(*[
                    self.batch(mask, BATCH_SIZE) for mask in [
                        minimaps, screens, infos, value_target,
                        valid_spatial_action, spatial_action_selected,
                        valid_non_spatial_action, non_spatial_action_selected]]):

            # Train in batches

            feed = {
                self.minimap: minimaps,
                self.screen: screens,
                self.info: infos,
                self.value_target: value_target,
                self.valid_spatial_action: valid_spatial_action,
                self.spatial_action_selected: spatial_action_selected,
                self.valid_non_spatial_action: valid_non_spatial_action,
                self.non_spatial_action_selected: non_spatial_action_selected,
                self.learning_rate: lr
            }
            #print('Commiting {} replay samples'.format(len(minimaps)))
            _, summary = self.sess.run([self.train_op, self.summary_op],
                                       feed_dict=feed)
            self.summary_writer.add_summary(summary, cter)
Example #22
    def step(self, obs, use_unit_selector):
        minimap = np.array(obs.observation['minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1
        if self.init_counter == 0:
            self.init_counter += 1
            return actions.FunctionCall(7, [[1]])
        elif self.init_counter == 1:
            self.init_counter += 1
            return actions.FunctionCall(
                5, [[sc_ui.ActionMultiPanel.SingleSelect], [0]])
        elif self.init_counter == 2:
            self.init_counter += 1
            return actions.FunctionCall(4, [[1], [0]])
        elif self.init_counter == 3:
            self.init_counter += 1
            return actions.FunctionCall(7, [[1]])
        elif self.init_counter == 4:
            self.init_counter += 1
            return actions.FunctionCall(
                5, [[sc_ui.ActionMultiPanel.SingleSelect], [1]])
        elif self.init_counter == 5:
            self.init_counter += 1
            return actions.FunctionCall(4, [[1], [1]])
        elif use_unit_selector:
            unitSel = self.get_unit_sel_res(obs)
            if self.training and np.random.rand() < self.epsilon[0]:
                unitSel = np.random.randint(0, 4)
            if unitSel == num_units + 1:
                return actions.FunctionCall(7, [[1]])
            elif unitSel == num_units:
                feed = {
                    self.minimap: minimap,
                    self.screen: screen,
                    self.info: info
                }

                non_spatial_action, spatial_action = self.sess_master.run(
                    [self.non_spatial_action, self.spatial_action],
                    feed_dict=feed)

                # Select an action and a spatial target
                non_spatial_action = non_spatial_action.ravel()
                spatial_action = spatial_action.ravel()
                valid_actions = obs.observation['available_actions']
                act_id = valid_actions[np.argmax(
                    non_spatial_action[valid_actions])]
                target = np.argmax(spatial_action)
                target = [int(target // self.ssize), int(target % self.ssize)]

                # Epsilon greedy exploration
                if self.training and np.random.rand() < self.epsilon[0]:
                    act_id = np.random.choice(valid_actions)
                if self.training and np.random.rand() < self.epsilon[1]:
                    dy = np.random.randint(-4, 5)
                    target[0] = int(max(0, min(self.ssize - 1,
                                               target[0] + dy)))
                    dx = np.random.randint(-4, 5)
                    target[1] = int(max(0, min(self.ssize - 1,
                                               target[1] + dx)))

                # Set act_id and act_args
                act_args = []
                for arg in actions.FUNCTIONS[act_id].args:
                    if arg.name in ('screen', 'minimap', 'screen2'):
                        act_args.append([target[1], target[0]])
                    else:
                        act_args.append([0])  # TODO: Be careful
                return actions.FunctionCall(act_id, act_args)
            else:
                return actions.FunctionCall(4, [[0], [unitSel]])
        else:
            feed = {
                self.minimap: minimap,
                self.screen: screen,
                self.info: info
            }

            non_spatial_action, spatial_action = self.sess_master.run(
                [self.non_spatial_action, self.spatial_action], feed_dict=feed)

            # Select an action and a spatial target
            non_spatial_action = non_spatial_action.ravel()
            spatial_action = spatial_action.ravel()
            valid_actions = obs.observation['available_actions']
            act_id = valid_actions[np.argmax(
                non_spatial_action[valid_actions])]
            target = np.argmax(spatial_action)
            target = [int(target // self.ssize), int(target % self.ssize)]

            # Epsilon greedy exploration
            if self.training and np.random.rand() < self.epsilon[0]:
                act_id = np.random.choice(valid_actions)
            if self.training and np.random.rand() < self.epsilon[1]:
                dy = np.random.randint(-4, 5)
                target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
                dx = np.random.randint(-4, 5)
                target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

            # Set act_id and act_args
            act_args = []
            for arg in actions.FUNCTIONS[act_id].args:
                if arg.name in ('screen', 'minimap', 'screen2'):
                    act_args.append([target[1], target[0]])
                else:
                    act_args.append([0])  # TODO: Be careful
            return actions.FunctionCall(act_id, act_args)
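Throughout these agents a screen coordinate (x, y) is flattened into a single index y * ssize + x for the spatial policy head, and the argmax of the flat policy is decoded back with // and %. A small self-contained sketch of that round trip plus the epsilon-greedy jitter used above (ssize here is an assumed resolution):

import numpy as np

ssize = 64                      # assumed screen resolution
x, y = 10, 3                    # pysc2 spatial args are ordered [x, y]
ind = y * ssize + x             # same encoding as act_arg[1] * ssize + act_arg[0]

flat_policy = np.zeros(ssize * ssize, dtype=np.float32)
flat_policy[ind] = 1.0          # pretend the policy peaks at that pixel

target = int(np.argmax(flat_policy))
target = [target // ssize, target % ssize]   # decoded as [row, col] == [y, x]
assert target == [y, x]

# Epsilon-greedy jitter: nudge the target by up to 4 pixels, clipped to the screen.
dy, dx = np.random.randint(-4, 5, size=2)
target[0] = int(max(0, min(ssize - 1, target[0] + dy)))
target[1] = int(max(0, min(ssize - 1, target[1] + dx)))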
Beispiel #23
    def update_low(self, ind_thread, rbs, dhs, disc, lr_a, lr_c, cter,
                   macro_type, coord_type):

        # rbs (replay buffers) is the list of [last_timesteps[0], actions[0], timesteps[0]] entries
        # (one per step the agent took in the episode); see line 25 of run_loop for details.

        # Compute R, which is value of the last observation
        obs = rbs[-1][-1]  # the last element of rbs holds the current step's timesteps, so obs can be treated as the latest observation
        if obs.last():
            R = 0
        else:
            minimap = np.array(obs.observation['feature_minimap'],
                               dtype=np.float32)  # similar to lines 105-111
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['feature_screen'],
                              dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            info_plus = np.zeros([1, self.info_plus_size], dtype=np.float32)
            info_plus[0] = (obs.observation.player.minerals,
                            obs.observation['player'][5],
                            obs.observation['player'][6],
                            obs.observation['player'][4])
            # info now has size isize + info_plus_size
            info = np.concatenate((info, info_plus), axis=1)
            # print('info')
            # print(info)
            # print(info_plus)

            dir_high_usedToFeedLowNet = np.ones([1, 1], dtype=np.float32)
            dir_high_usedToFeedLowNet[0][0] = dhs[0]
            act_id = np.ones([1, 1], dtype=np.float32)
            # act_ID[0][0] = rbs[-1][1].function
            # We cannot use the action stored in rbs, because it may be no_op (a fallback issued when the
            # chosen action was not valid, to keep the game from crashing).
            # The act_id fed here should be the act_id computed by step_low instead.
            act_id[0][0] = GL.get_value(ind_thread, "act_id_micro")

            feed = {
                self.minimap: minimap,
                self.screen: screen,
                self.info: info,
                self.dir_high_usedToFeedLowNet: dir_high_usedToFeedLowNet,
                self.act_id: act_id,
            }
            R = self.sess.run(self.value_low, feed_dict=feed)[0]

        # Compute targets and masks
        minimaps = []
        screens = []
        infos = []
        dir_highs = []
        act_ids = []

        value_target = np.zeros([len(rbs)], dtype=np.float32)  # len(rbs) is the number of steps the agent took in the episode
        value_target[-1] = R

        valid_spatial_action = np.zeros(
            [len(rbs)], dtype=np.float32)  # whether each step needs a coordinate argument
        spatial_action_selected = np.zeros(
            [len(rbs), self.ssize**2],
            dtype=np.float32)  # whether the step needs a coordinate argument (dim 0) and which coordinate was chosen (dim 1)

        rbs.reverse()  # reverse first, similar to Morvan's A3C_continuous_action.py
        micro_isdone = GL.get_value(ind_thread, "micro_isdone")
        micro_isdone.reverse()

        sum_low_reward = GL.get_value(ind_thread, "sum_low_reward")
        for i, [obs, action, next_obs] in enumerate(rbs):  # one iteration per step the agent took in the episode
            minimap = np.array(obs.observation['feature_minimap'],
                               dtype=np.float32)  # similar to lines 105-111
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['feature_screen'],
                              dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            info_plus = np.zeros([1, self.info_plus_size], dtype=np.float32)
            info_plus[0] = (obs.observation.player.minerals,
                            obs.observation['player'][5],
                            obs.observation['player'][6],
                            obs.observation['player'][4])

            # info now has size isize + info_plus_size
            info = np.concatenate((info, info_plus), axis=1)

            minimaps.append(minimap)
            screens.append(screen)
            infos.append(info)

            dir_high_usedToFeedLowNet = np.ones([1, 1], dtype=np.float32)
            dir_high_usedToFeedLowNet[0][0] = dhs[i]
            act_ID = np.ones([1, 1], dtype=np.float32)
            # act_ID[0][0] = act_id
            # We cannot use the action stored in rbs, because it may be no_op (a fallback issued when the
            # chosen action was not valid, to keep the game from crashing).
            # The act_id fed here should be the act_id computed by step_low instead.
            act_ID[0][0] = GL.get_value(ind_thread, "act_id_micro")
            # dir_highs.append(dir_high_usedToFeedLowNet)
            # act_ids.append(act_ID)

            coord = [0, 0]
            # coord[0], coord[1] = [32, 32]
            coord[0], coord[1] = self.step_low(ind_thread, obs,
                                               dir_high_usedToFeedLowNet,
                                               act_ID)
            reward = low_reward(next_obs, obs, coord, micro_isdone[i],
                                macro_type, coord_type)
            sum_low_reward += reward
            GL.add_value_list(ind_thread, "low_reward_of_episode", reward)

            act_id = action.function  # id of the action the agent selected at this step
            act_args = action.arguments

            # See the diagram explaining gamma in Morvan's Q-Learning tutorial (the one with three pairs of
            # glasses): this recursion yields the value V_S of every state in the episode.
            value_target[i] = reward + disc * value_target[i - 1]
            # value is not reversed again afterwards (unlike Morvan's code), apparently because the other
            # inputs (minimap, screen, info, etc.) are also stored back-to-front. See lines 181-182.

            args = actions.FUNCTIONS[act_id].args
            for arg, act_arg in zip(args, act_args):
                if arg.name in ('screen', 'minimap', 'screen2'):
                    ind = act_arg[1] * self.ssize + act_arg[0]
                    valid_spatial_action[i] = 1
                    spatial_action_selected[i, ind] = 1

        GL.set_value(ind_thread, "sum_low_reward", sum_low_reward)

        minimaps = np.concatenate(minimaps, axis=0)
        screens = np.concatenate(screens, axis=0)
        infos = np.concatenate(infos, axis=0)

        # In practice the low net is updated one step at a time, so each parameter fed below contains only a single frame of data.

        # Train
        feed = {
            self.minimap: minimaps,
            self.screen: screens,
            self.info: infos,
            # self.dir_high_usedToFeedLowNet: dir_highs,
            self.dir_high_usedToFeedLowNet: dir_high_usedToFeedLowNet,
            # self.act_id: act_ids,
            self.act_id: act_ID,
            self.value_target_low: value_target,
            self.valid_spatial_action_low: valid_spatial_action,
            self.spatial_action_selected_low: spatial_action_selected,
            self.learning_rate_a_low: lr_a,
            self.learning_rate_c_low: lr_c
        }
        _, __, summary = self.sess.run(
            [self.update_a_low, self.update_c_low, self.summary_op_low],
            feed_dict=feed)
        self.summary_writer.add_summary(summary, cter)
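The value targets above are built with the usual reverse-and-accumulate trick: after rbs.reverse(), value_target[-1] holds the bootstrap value R, and value_target[i] = reward + disc * value_target[i - 1] fills in discounted returns from the end of the episode backwards. A toy sketch of just that recursion:

import numpy as np

rewards = [1.0, 0.0, 0.0, 2.0]     # rewards in time order
disc, R = 0.99, 0.5                # discount factor and bootstrap value

rewards = list(reversed(rewards))                  # mirrors rbs.reverse()
value_target = np.zeros(len(rewards), dtype=np.float32)
value_target[-1] = R                               # bootstrap seeds the recursion
for i, r in enumerate(rewards):
    value_target[i] = r + disc * value_target[i - 1]
# value_target is now in reverse time order, matching the reversed minimaps/screens/infos.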
Beispiel #24
  def update(self, rbs, disc, lr, cter):
    # Compute R, which is value of the last observation
    spatial_action = None
    non_spatial_action = None

    obs = rbs[-1][-1]
    if obs.last():
      R = 0
    else:
      minimap = np.array(obs.observation.feature_minimap, dtype=np.float32)
      minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
      screen = np.array(obs.observation.feature_screen, dtype=np.float32)
      screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
      info = np.zeros([1, self.isize], dtype=np.float32)
      info[0, obs.observation.available_actions] = 1

      # First get probabilities for each action, then greedily pick the largest to compute a Q-value
      # (one-hot vector over the softmax output).
      # Low confidence in this bootstrap; alternatively, just use the full episode, in which case R for
      # the last observation would simply be 0.
      feed = {self.minimap: minimap,
              self.screen: screen,
              self.info: info}
      spatial_action, non_spatial_action = self.sess.run([self.spatial_action, self.non_spatial_action], feed_dict=feed)

    # Compute targets and masks
    minimaps = []
    screens = []
    infos = []
    valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)
    spatial_action_selected = np.zeros([len(rbs), self.ssize**2], dtype=np.float32)
    valid_non_spatial_action = np.zeros([len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)
    non_spatial_action_selected = np.zeros([len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)

    rbs.reverse()
    for i, [obs, action, next_obs] in enumerate(rbs):
      minimap = np.array(obs.observation.feature_minimap, dtype=np.float32)
      minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
      screen = np.array(obs.observation.feature_screen, dtype=np.float32)
      screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
      info = np.zeros([1, self.isize], dtype=np.float32)
      #info[0, obs.observation['available_actions']] = 1
      info[0, obs.observation.available_actions] = 1

      minimaps.append(minimap)
      screens.append(screen)
      infos.append(info)

      act_id = action.function
      act_args = action.arguments

      #valid_actions = obs.observation["available_actions"]
      valid_actions = obs.observation.available_actions
      valid_non_spatial_action[i, valid_actions] = 1
      non_spatial_action_selected[i, act_id] = 1

      args = actions.FUNCTIONS[act_id].args
      for arg, act_arg in zip(args, act_args):
        if arg.name in ('screen', 'minimap', 'screen2'):
          ind = act_arg[1] * self.ssize + act_arg[0]
          valid_spatial_action[i] = 1
          spatial_action_selected[i, ind] = 1


    value_target = np.zeros([len(rbs)], dtype=np.float32)
    if spatial_action is not None:
      q_spatial = np.max(spatial_action * valid_spatial_action[0], axis=1)
      q_non_spatial = np.max(non_spatial_action * valid_non_spatial_action[0], axis=1)
      q_value = self.ispatial*q_spatial + q_non_spatial
      R = q_value[0]
      
    value_target[-1] = R

    for i, [obs, action, next_obs] in enumerate(rbs):
      reward = obs.reward
      value_target[i] = reward + disc * value_target[i-1]

    minimaps = np.concatenate(minimaps, axis=0)
    screens = np.concatenate(screens, axis=0)
    infos = np.concatenate(infos, axis=0)

    # Train
    feed = {self.minimap: minimaps,
            self.screen: screens,
            self.info: infos,
            self.value_target: value_target,
            self.valid_spatial_action: valid_spatial_action,
            self.spatial_action_selected: spatial_action_selected,
            self.valid_non_spatial_action: valid_non_spatial_action,
            self.non_spatial_action_selected: non_spatial_action_selected,
            self.learning_rate: lr}
    _, summary = self.sess.run([self.train_op, self.summary_op], feed_dict=feed)
    self.summary_writer.add_summary(summary, cter)
Beispiel #25
    def update_high(self, ind_thread, rbs, dhs, disc, lr_a, lr_c, cter):
        # rbs (replay buffers) is the list of [last_timesteps[0], actions[0], timesteps[0]] entries
        # (one per step seen during the update); see line 25 of run_loop for details.
        # dhs (dir_high_buffers) is the list of macro-action indices; e.g. with 5 macro actions,
        # dhs looks like [5, 4, 1, 2, 3, 4, 2, 1, ...].

        dir_high_selected = np.zeros(
            [len(rbs), num_macro_action],
            dtype=np.float32)  # one-hot of the macro direction chosen at each step (dim 0: step, dim 1: macro action)
        for i in range(len(rbs)):
            dir_high_selected[i, dhs[i][0] - 1] = 1

        # Compute R, which is value of the last observation
        obs = rbs[-1][-1]  # the last element of rbs holds the current step's timesteps, so obs can be treated as the latest observation
        if obs.last():
            R = 0
        else:
            minimap = np.array(obs.observation['feature_minimap'],
                               dtype=np.float32)  # similar to lines 105-111
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['feature_screen'],
                              dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            info_plus = np.zeros([1, self.info_plus_size], dtype=np.float32)
            info_plus[0] = (obs.observation.player.minerals,
                            obs.observation['player'][5],
                            obs.observation['player'][6],
                            obs.observation['player'][4])

            # info now has size isize + info_plus_size
            info = np.concatenate((info, info_plus), axis=1)

            feed = {
                self.minimap: minimap,
                self.screen: screen,
                self.info: info
            }
            R = self.sess.run(self.value_high, feed_dict=feed)[0]

        # Compute targets and masks
        minimaps = []
        screens = []
        infos = []

        value_target = np.zeros([len(rbs)], dtype=np.float32)  # len(rbs) is the number of steps the agent took in the episode
        value_target[-1] = R

        valid_spatial_action = np.zeros(
            [len(rbs)], dtype=np.float32)  # whether each step needs a coordinate argument
        spatial_action_selected = np.zeros(
            [len(rbs), self.ssize**2],
            dtype=np.float32)  # whether the step needs a coordinate argument (dim 0) and which coordinate was chosen (dim 1)

        rbs.reverse()  # reverse first, similar to Morvan's A3C_continuous_action.py
        micro_isdone = GL.get_value(ind_thread, "micro_isdone")
        micro_isdone.reverse()
        sum_high_reward = GL.get_value(ind_thread, "sum_high_reward")
        for i, [obs, action, next_obs] in enumerate(rbs):  # one iteration per step the agent took in the episode
            minimap = np.array(obs.observation['feature_minimap'],
                               dtype=np.float32)  # similar to lines 105-111
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['feature_screen'],
                              dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            info_plus = np.zeros([1, self.info_plus_size], dtype=np.float32)
            info_plus[0] = (obs.observation.player.minerals,
                            obs.observation['player'][5],
                            obs.observation['player'][6],
                            obs.observation['player'][4])

            # info now has size isize + info_plus_size
            info = np.concatenate((info, info_plus), axis=1)

            minimaps.append(minimap)
            screens.append(screen)
            infos.append(info)

            # reward = obs.reward
            reward = high_reward(ind_thread, next_obs, obs, action,
                                 micro_isdone[i])  # high-level reward designed by Xiangsen
            sum_high_reward += reward
            GL.add_value_list(ind_thread, "high_reward_of_episode", reward)
            act_id = action.function  # id of the action the agent selected at this step
            act_args = action.arguments

            # See the diagram explaining gamma in Morvan's Q-Learning tutorial (the one with three pairs of
            # glasses): this recursion yields the value V_S of every state in the episode.
            value_target[i] = reward + disc * value_target[i - 1]
            # value is not reversed again afterwards (unlike Morvan's code), apparently because the other
            # inputs (minimap, screen, info, etc.) are also stored back-to-front. See lines 181-182.

            args = actions.FUNCTIONS[act_id].args
            for arg, act_arg in zip(args, act_args):
                if arg.name in ('screen', 'minimap', 'screen2'):
                    ind = act_arg[1] * self.ssize + act_arg[0]
                    valid_spatial_action[i] = 1
                    spatial_action_selected[i, ind] = 1

        GL.set_value(ind_thread, "sum_high_reward", sum_high_reward)
        minimaps = np.concatenate(minimaps, axis=0)
        screens = np.concatenate(screens, axis=0)
        infos = np.concatenate(infos, axis=0)

        # Train
        feed = {
            self.minimap: minimaps,
            self.screen: screens,
            self.info: infos,
            self.value_target_high: value_target,
            self.dir_high_selected: dir_high_selected,
            self.learning_rate_a_high: lr_a,
            self.learning_rate_c_high: lr_c
        }
        _, __, summary = self.sess.run(
            [self.update_a_high, self.update_c_high, self.summary_op_high],
            feed_dict=feed)
        self.summary_writer.add_summary(summary, cter)

        GL.set_value(ind_thread, "micro_isdone", [])
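Both update_low and update_high widen the available-actions one-hot with a handful of scalar player statistics (info_plus) before feeding the network, so the corresponding placeholder must be sized isize + info_plus_size. A standalone sketch of that concatenation; the sizes and values here are illustrative only:

import numpy as np

isize = 573                # e.g. len(actions.FUNCTIONS); assumed value
info_plus_size = 4         # number of extra scalar features appended above

available_actions = [0, 1, 2, 7, 331]              # toy available action ids
info = np.zeros([1, isize], dtype=np.float32)
info[0, available_actions] = 1                     # one-hot of currently valid actions

info_plus = np.zeros([1, info_plus_size], dtype=np.float32)
info_plus[0] = (50, 12, 8, 15)                     # toy stand-ins for the 'player' entries

info = np.concatenate((info, info_plus), axis=1)   # final network input
assert info.shape == (1, isize + info_plus_size)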
Beispiel #26
	def step(self, obs):
		screen = np.array(obs.observation.feature_screen, dtype=np.float32)
		screen = np.expand_dims(utils.preprocess_screen(screen), axis=0)
		# np.expand_dims: expands the shape of the array, e.g.:
		# x = np.array([1,2])
		# x.shape
		# (2,)
		# y = np.expand_dims(x, axis=0)
		# y
		# array([[1, 2]])
		# y.shape
		# (1, 2)
		minimap = np.array(obs.observation.feature_minimap, dtype=np.float32)
		minimap = np.expand_dims(utils.preprocess_minimap(minimap), axis=0)
		structured = np.zeros([1, self.structured_dimensions], dtype=np.float32)
		structured[0, obs.observation.available_actions] = 1

		feed_dict = {
			self.screen: screen,
			self.minimap: minimap,
			self.structured: structured
		}
		
		non_spatial_action, spatial_action = self.sess.run(
			[self.non_spatial_action, self.spatial_action],
			feed_dict=feed_dict
		)

		non_spatial_action = non_spatial_action.ravel()
		spatial_action = spatial_action.ravel()
		# np.ravel: returns a contiguous flattened array, e.g.:
		# x = np.array([[1, 2, 3], [4, 5, 6]])
		# print(np.ravel(x))
		# [1 2 3 4 5 6]
		available_actions = obs.observation.available_actions
		action_id = 0
		spatial_target = []
		if self.mode == 'original_ac3':
			non_spatial_action = np.array(non_spatial_action[available_actions])
			non_spatial_action /= non_spatial_action.sum()
			x = np.random.choice(non_spatial_action, p=non_spatial_action)
			action_id = available_actions[np.where(non_spatial_action == x)[0][0]]
			spatial_target = random.choice(list(enumerate(spatial_action)))[0]
			# x = np.random.choice(spatial_action, p=spatial_action)
			# if len(np.where(spatial_action == x)[0]) > 1:
			# 	random = np.random.choice(len(np.where(spatial_action == x)[0]))
			# 	spatial_target = np.where(spatial_action == x)[0][random]
			# else:
			# 	spatial_target = np.where(spatial_action == x)[0][0]
			spatial_target = [int(spatial_target // self.resolution), int(spatial_target % self.resolution)]
		else:
			action_id = available_actions[np.argmax(non_spatial_action[available_actions])]
			spatial_target = np.argmax(spatial_action)
			spatial_target = [int(spatial_target // self.resolution), int(spatial_target % self.resolution)]

			# epsilon-greedy exploration
			if self.training and np.random.rand() < self.epsilon[0]:
				action_id = np.random.choice(available_actions)
			if self.training and np.random.rand() < self.epsilon[1]:
				delta_y, delta_x = np.random.randint(-4, 5), np.random.randint(-4, 5)
				spatial_target[0] = int(max(0, min(self.resolution - 1, spatial_target[0] + delta_y)))
				spatial_target[1] = int(max(0, min(self.resolution - 1, spatial_target[1] + delta_x)))

		action_args = []
		for arg in actions.FUNCTIONS[action_id].args:
			if arg.name in ('screen', 'minimap', 'screen2'):
				action_args.append([spatial_target[1], spatial_target[0]])
			else:
				action_args.append([0])
		return actions.FunctionCall(action_id, action_args)
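The 'original_ac3' branch above samples the non-spatial action from the policy restricted to available_actions. Locating the sampled probability with np.where can misfire when two actions share the same probability; the sketch below samples an index directly instead (the toy policy values are illustrative):

import numpy as np

policy = np.array([0.10, 0.40, 0.05, 0.30, 0.15], dtype=np.float32)  # toy non-spatial policy
available_actions = np.array([1, 3, 4])

masked = policy[available_actions]
masked = masked / masked.sum()        # renormalise over the valid subset

choice = np.random.choice(len(available_actions), p=masked)  # sample an index, not a value
action_id = int(available_actions[choice])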
Beispiel #27
    def step(self, obs):

        minimap = np.array(obs.observation['minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        # TODO: only use available actions
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1
        #print('info = ',info)

        feed = {self.minimap: minimap, self.screen: screen, self.info: info}
        non_spatial_action, spatial_action = self.sess.run(
            [self.non_spatial_action, self.spatial_action], feed_dict=feed)

        # Select an action and a spatial target
        non_spatial_action = non_spatial_action.ravel()
        spatial_action = spatial_action.ravel()
        valid_actions = obs.observation['available_actions']

        #print('valid_actions = ',valid_actions)
        #print('self.less_actions = ',self.less_actions)

        # find the index of each element of valid_actions within self.less_actions
        valid_actions_idx = []
        for i in range(len(valid_actions)):
            for j in range(len(self.less_actions)):
                if (self.less_actions[j] == valid_actions[i]):
                    valid_actions_idx.append(j)
        #valid_actions_idx = np.sort(valid_actions_idx)
        # np.argmax returns a position within valid_actions_idx, so map it back through
        # valid_actions_idx before indexing self.less_actions.
        act_id = int(self.less_actions[valid_actions_idx[np.argmax(
            non_spatial_action[valid_actions_idx])]])

        #print('valid_actions_idx = ',valid_actions_idx)
        #print('np.argmax(non_spatial_action[valid_actions_idx]) = ', np.argmax(non_spatial_action[valid_actions_idx]))

        #print('act_id = ',act_id)
        target = np.argmax(spatial_action)
        target = [int(target // self.ssize), int(target % self.ssize)]

        #if False:
        #      print(actions.FUNCTIONS[act_id].name, target)

        # Epsilon greedy exploration
        if self.training and np.random.rand() < self.epsilon[0]:
            act_id = np.random.choice(valid_actions)
        if self.training and np.random.rand() < self.epsilon[1]:
            dy = np.random.randint(-4, 5)
            target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
            dx = np.random.randint(-4, 5)
            target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

        # Set act_id and act_args
        act_args = []
        for arg in actions.FUNCTIONS[act_id].args:
            if arg.name in ('screen', 'minimap', 'screen2'):
                act_args.append([target[1], target[0]])
            else:
                act_args.append([0])  # TODO: Be careful
        if (not act_id in valid_actions):
            return actions.FunctionCall(_NOOP, [])

        return actions.FunctionCall(act_id, act_args)
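Because this agent's policy head only covers a reduced action set (self.less_actions), the currently valid action ids must be mapped to indices inside that set before masking the policy. The nested loops above can be replaced by a dictionary lookup; the sketch below uses an illustrative reduced set:

import numpy as np

less_actions = [0, 2, 7, 331, 332]                   # illustrative reduced action set
index_of = {act: i for i, act in enumerate(less_actions)}

valid_actions = np.array([0, 7, 331, 451])            # toy available_actions
valid_actions_idx = [index_of[a] for a in valid_actions if a in index_of]

non_spatial_action = np.random.rand(len(less_actions))  # toy policy over the reduced set
best = valid_actions_idx[int(np.argmax(non_spatial_action[valid_actions_idx]))]
act_id = less_actions[best]                             # back to a real pysc2 function id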
Beispiel #28
	def update(self, replay_buffer, learning_rate, step):
		obs = replay_buffer[-1][-1]
		if obs.last():
			reward = 0
		else:
			screen = np.array(obs.observation.feature_screen, dtype=np.float32)
			screen = np.expand_dims(utils.preprocess_screen(screen), axis=0)
			minimap = np.array(obs.observation.feature_minimap, dtype=np.float32)
			minimap = np.expand_dims(utils.preprocess_minimap(minimap), axis=0)
			structured = np.zeros([1, self.structured_dimensions], dtype=np.float32)
			structured[0, obs.observation.available_actions] = 1

			feed_dict = {
				self.screen: screen,
				self.minimap: minimap,
				self.structured: structured
			}
			reward = self.sess.run(self.value, feed_dict=feed_dict)

		#compute targets and masks
		screens, minimaps, structureds = [], [], []
		target_value = np.zeros([len(replay_buffer)], dtype=np.float32)
		target_value[-1] = reward

		valid_non_spatial_action = np.zeros([len(replay_buffer), len(actions.FUNCTIONS)], dtype=np.float32)
		non_spatial_action_selected = np.zeros([len(replay_buffer), len(actions.FUNCTIONS)], dtype=np.float32)
		valid_spatial_action = np.zeros([len(replay_buffer)], dtype=np.float32)
		spatial_action_selected = np.zeros([len(replay_buffer), self.resolution ** 2], dtype=np.float32)

		record_score = replay_buffer[-1][0].observation['score_cumulative'][0]
		summary = tf.Summary()
		summary.value.add(tag='episode_score', simple_value=record_score)
		print('train!! step %d: score = %f' % (step, record_score))
		self.summary_writer.add_summary(summary, step)

		replay_buffer.reverse()
		# reverse: returns None; it reverses the list elements in place.
		for i, [obs, action, next_obs] in enumerate(replay_buffer):
		# enumerate example:
		# seq = ['one', 'two', 'three']
		# for i, element in enumerate(seq):
		#     print(i, element)
		#	0 one
		#	1 two
		#	2 three
			screen = np.array(obs.observation.feature_screen, dtype=np.float32)
			screen = np.expand_dims(utils.preprocess_screen(screen), axis=0)
			minimap = np.array(obs.observation.feature_minimap, dtype=np.float32)
			minimap = np.expand_dims(utils.preprocess_minimap(minimap), axis=0)
			structured = np.zeros([1, self.structured_dimensions], dtype=np.float32)
			structured[0, obs.observation.available_actions] = 1

			screens.append(screen)
			minimaps.append(minimap)
			structureds.append(structured)

			reward = obs.reward
			action_id = action.function
			action_args = action.arguments

			target_value[i] = reward + self.discount * target_value[i - 1]

			available_actions = obs.observation.available_actions
			valid_non_spatial_action[i, available_actions] = 1
			non_spatial_action_selected[i, action_id] = 1

			args = actions.FUNCTIONS[action_id].args
			for arg, action_arg in zip(args, action_args):
				if arg.name in ('screen', 'minimap', 'screen2'):
					spatial_action = action_arg[1] * self.resolution + action_arg[0]
					valid_spatial_action[i] = 1
					spatial_action_selected[i, spatial_action] = 1

		screens = np.concatenate(screens, axis=0)
		minimaps = np.concatenate(minimaps, axis=0)
		structureds = np.concatenate(structureds, axis=0)

		feed_dict = {
			self.screen: screens,
			self.minimap: minimaps,
			self.structured: structureds,
			self.target_value: target_value,
			self.valid_non_spatial_action: valid_non_spatial_action,
			self.non_spatial_action_selected: non_spatial_action_selected,
			self.valid_spatial_action: valid_spatial_action,
			self.spatial_action_selected: spatial_action_selected,
			self.learning_rate: learning_rate
		}
		_, summary = self.sess.run([self.train_op, self.summary_op], feed_dict=feed_dict)
		self.summary_writer.add_summary(summary, step)
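The episode score above is written to TensorBoard with a hand-built tf.Summary protobuf rather than a graph op, which is handy for logging quantities that are only known in Python. A minimal TF1-style sketch of that pattern (the log directory is illustrative):

import tensorflow as tf  # TF1-style API, as used throughout these examples

summary_writer = tf.summary.FileWriter('/tmp/sc2_logs')  # illustrative log directory

record_score = 42.0                                      # e.g. score_cumulative[0]
summary = tf.Summary()
summary.value.add(tag='episode_score', simple_value=record_score)
summary_writer.add_summary(summary, global_step=0)
summary_writer.flush()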
Beispiel #29
    def update(self, rbs, replay_buffer, disc, lr, cter):
        # Compute R, which is value of the last observation
        buffer_size = len(replay_buffer)
        obs = rbs[-1][-1]
        if obs.last():
            R = 0
        else:
            minimap = np.array(obs.observation['minimap'], dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['screen'], dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            feed = {
                self.minimap: minimap,
                self.screen: screen,
                self.info: info
            }
            R = self.sess.run(self.value, feed_dict=feed)[0]

        # Compute targets and masks
        minimaps = []
        screens = []
        infos = []

        value_target = np.zeros([len(rbs)], dtype=np.float32)
        value_target[-1] = R

        valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)
        spatial_action_selected = np.zeros([len(rbs), self.ssize**2],
                                           dtype=np.float32)
        valid_non_spatial_action = np.zeros(
            [len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)
        non_spatial_action_selected = np.zeros(
            [len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)

        rbs.reverse()
        for i, [obs, action, pixel_change, next_obs] in enumerate(rbs):
            # added pixel change to update function, just directly put it into the feed dict
            minimap = np.array(obs.observation['minimap'], dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['screen'], dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            minimaps.append(minimap)
            screens.append(screen)
            infos.append(info)

            reward = obs.reward
            act_id = action.function
            act_args = action.arguments

            value_target[i] = reward + disc * value_target[i - 1]

            valid_actions = obs.observation["available_actions"]
            valid_non_spatial_action[i, valid_actions] = 1
            non_spatial_action_selected[i, act_id] = 1

            args = actions.FUNCTIONS[act_id].args
            for arg, act_arg in zip(args, act_args):
                if arg.name in ('screen', 'minimap', 'screen2'):
                    ind = act_arg[1] * self.ssize + act_arg[0]
                    valid_spatial_action[i] = 1
                    spatial_action_selected[i, ind] = 1

        minimaps = np.concatenate(minimaps, axis=0)
        screens = np.concatenate(screens, axis=0)
        infos = np.concatenate(infos, axis=0)

        # Train
        feed = {
            self.minimap: minimaps,
            self.screen: screens,
            self.pixel_change: pixel_change,
            self.info: infos,
            self.value_target: value_target,
            self.valid_spatial_action: valid_spatial_action,
            self.spatial_action_selected: spatial_action_selected,
            self.valid_non_spatial_action: valid_non_spatial_action,
            self.non_spatial_action_selected: non_spatial_action_selected,
            self.learning_rate: lr
        }
        # _, summary = self.sess.run([self.train_op, self.summary_op], feed_dict=feed)
        # self.summary_writer.add_summary(summary, cter)
        ######################################################################################
        # Update the pc network
        start_pos = np.random.randint(0, buffer_size - self.sequence_size - 1)
        #take care of terminals
        if replay_buffer[start_pos][-1].last():
            start_pos += 1
            # Assuming that there are no successive terminal frames.

        pc_experience_frames = []

        for i in range(self.sequence_size + 1):
            frame = replay_buffer[start_pos + i]
            pc_experience_frames.append(frame)
            if frame[-1].last():
                break
        # Reverse sequence to calculate from the last
        pc_experience_frames.reverse()

        batch_pc_si = []
        batch_pc_a = []
        batch_pc_R = []
        batch_pc_va = []
        pc_R = np.zeros([20, 20], dtype=np.float32)
        # Each pc frame is a [obs, action, pixel_change, next_obs] entry, so check terminality
        # on its final observation.
        if not pc_experience_frames[1][-1].last():
            # pc_R = self.run_pc_q_max(self.sess, pc_experience_frames[0].state)
            # def run_pc_q_max(self, sess, s_t):
            # Bootstrap from the newest frame of the sampled sequence rather than the stale
            # `obs` left over from the loop above (assumed intent, per the commented call).
            pc_obs = pc_experience_frames[0][0]
            minimap = np.array(pc_obs.observation['minimap'], dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(pc_obs.observation['screen'], dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            # TODO: only use available actions
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, pc_obs.observation['available_actions']] = 1
            s_feed = {
                self.pc_minimap: minimap,
                self.pc_screen: screen,
                self.pc_info: info
            }
            pc_R = self.sess.run(self.pc_q_max, s_feed)
        pc_valid_non_spatial_action = np.zeros(
            [len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)
        for i, [obs, action, pixel_change,
                next_obs] in enumerate(pc_experience_frames[1:]):
            pc_R = pixel_change + self.gamma_pc * pc_R
            a = np.zeros([self.action_size])
            a[action] = 1.0
            valid_actions = np.zeros((len(actions.FUNCTIONS)),
                                     dtype=np.float32)
            valid_actions_inds = obs.observation["available_actions"]
            valid_actions[valid_actions_inds] = 1
            # NOTE: `frame` still refers to the last entry appended in the sampling loop above;
            # the state of the *current* frame is presumably what should be stored here.
            batch_pc_si.append(frame.state)
            batch_pc_a.append(a)
            batch_pc_R.append(pc_R)
            batch_pc_va.append(valid_actions)

        batch_pc_si.reverse()
        batch_pc_a.reverse()
        batch_pc_R.reverse()
        batch_pc_va.reverse()
        pc_feed_dict = {
            self.pc_input: batch_pc_si,
            self.pc_a: batch_pc_a,
            self.pc_r: batch_pc_R,
            self.pc_valid_non_spatial_action: batch_pc_va
        }
        feed.update(pc_feed_dict)
        _, summary = self.sess.run([self.train_op, self.summary_op],
                                   feed_dict=feed)
        self.summary_writer.add_summary(summary, cter)
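The pixel-control branch accumulates its auxiliary target the same way as the scalar returns: starting from a bootstrap pc_q_max map, it walks the sampled sequence newest-to-oldest with pc_R = pixel_change + gamma_pc * pc_R and then restores time order. A toy sketch on 20x20 pixel-change maps:

import numpy as np

gamma_pc = 0.9
pixel_changes = [np.random.rand(20, 20).astype(np.float32) for _ in range(4)]  # toy sequence

pc_R = np.zeros([20, 20], dtype=np.float32)    # bootstrap (pc_q_max at a non-terminal frame)
batch_pc_R = []
for pixel_change in reversed(pixel_changes):   # newest to oldest, as in the loop above
    pc_R = pixel_change + gamma_pc * pc_R
    batch_pc_R.append(pc_R)
batch_pc_R.reverse()                           # back to time order before feeding the network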
def supervised_train(training_episode):
    # Initialization
    EPISODES, episode, max_average = 20000, 0, 50.0  # specific for pong

    while episode < training_episode:
        if episode < EPISODES:
            episode += 1

        replay = trajectory.Trajectory(
            '/media/kimbring2/Steam/StarCraftII/Replays/', 'Terran', 'Terran',
            2500)
        replay.get_random_trajectory()

        replay_index = 0
        home_replay_done = False

        home_replay_feature_screen_list, home_replay_feature_player_list, home_replay_available_actions_list = [], [], []
        home_replay_fn_id_list, home_replay_arg_ids_list = [], []
        home_replay_memory_state_list, home_replay_carry_state_list = [], []

        memory_state = np.zeros([1, 256], dtype=np.float32)
        carry_state = np.zeros([1, 256], dtype=np.float32)
        while not home_replay_done:
            home_replay_state = replay.home_trajectory[replay_index][0]
            home_replay_actions = replay.home_trajectory[replay_index][1]
            home_replay_done = replay.home_trajectory[replay_index][2]

            home_replay_feature_screen = home_replay_state['feature_screen']
            home_replay_feature_screen = preprocess_screen(
                home_replay_feature_screen)
            home_replay_feature_screen = np.transpose(
                home_replay_feature_screen, (1, 2, 0))

            home_replay_feature_player = home_replay_state['player']
            home_replay_feature_player = preprocess_player(
                home_replay_feature_player)

            home_replay_available_actions = home_replay_state[
                'available_actions']
            home_replay_available_actions = preprocess_available_actions(
                home_replay_available_actions)

            home_replay_feature_screen_array = np.array(
                [home_replay_feature_screen])
            home_replay_feature_player_array = np.array(
                [home_replay_feature_player])
            home_replay_available_actions_array = np.array(
                [home_replay_available_actions])

            home_replay_feature_screen_list.append(
                home_replay_feature_screen_array)
            home_replay_feature_player_list.append(
                home_replay_feature_player_array)
            home_replay_available_actions_list.append(
                home_replay_available_actions_array)
            home_replay_memory_state_list.append(memory_state)
            home_replay_carry_state_list.append(carry_state)

            home_replay_prediction = home_agent.act(
                home_replay_feature_screen_array,
                home_replay_feature_player_array,
                home_replay_available_actions_array, memory_state, carry_state)
            home_replay_next_memory_state = home_replay_prediction[3]
            home_replay_next_carry_state = home_replay_prediction[4]
            home_replay_action = random.choice(home_replay_actions)

            home_replay_fn_id = int(home_replay_action.function)
            home_replay_args_ids = dict()
            for arg_type in actions.TYPES:
                home_replay_args_ids[arg_type] = -1

            arg_index = 0
            for arg_type in FUNCTIONS._func_list[home_replay_fn_id].args:
                home_replay_args_ids[arg_type] = home_replay_action.arguments[
                    arg_index]
                arg_index += 1

            home_replay_fn_id_list.append(home_replay_fn_id)
            home_replay_arg_id_list = []
            for arg_type in home_replay_args_ids.keys():
                arg_id = home_replay_args_ids[arg_type]

                if type(arg_id) == list:
                    if len(arg_id) == 2:
                        arg_id = arg_id[0] * feature_screen_size + arg_id[1]
                    else:
                        arg_id = int(arg_id[0])

                home_replay_arg_id_list.append(arg_id)

            home_replay_arg_ids_list.append(
                np.array([home_replay_arg_id_list]))

            if home_replay_done == StepType.LAST:
                home_replay_done = True
            else:
                home_replay_done = False

            if home_replay_done:
                break

            replay_index += 1
            #print("replay_index: ", replay_index)
            if replay_index >= len(replay.home_trajectory) - 1:
                break

            memory_state = home_replay_next_memory_state
            carry_state = home_replay_next_carry_state
            if len(home_replay_feature_screen_list) == 16:
                if arguments.training == True:
                    home_agent.supervised_replay(
                        home_replay_feature_screen_list,
                        home_replay_feature_player_list,
                        home_replay_available_actions_list,
                        home_replay_fn_id_list, home_replay_arg_ids_list,
                        home_replay_memory_state_list,
                        home_replay_carry_state_list)

                home_replay_feature_screen_list, home_replay_feature_player_list, home_replay_available_actions_list = [], [], []
                home_replay_fn_id_list, home_replay_arg_ids_list = [], []
                home_replay_memory_state_list, home_replay_carry_state_list = [], []
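supervised_train flattens every replay action into a function id plus one integer per pysc2 argument type, with unused arguments set to -1 and spatial arguments packed into a single index. A self-contained sketch of that flattening for one FunctionCall (feature_screen_size and the example call are assumptions; note the original packs coordinates as arg[0] * size + arg[1], so the decoder must use the same orientation):

from pysc2.lib import actions

feature_screen_size = 32                         # assumed screen resolution

call = actions.FunctionCall(331, [[0], [5, 9]])  # illustrative Move_screen(queued, screen)
fn_id = int(call.function)

arg_ids = {arg_type: -1 for arg_type in actions.TYPES}       # -1 marks unused argument types
for arg_type, arg in zip(actions.FUNCTIONS[fn_id].args, call.arguments):
    if len(arg) == 2:                                         # spatial argument
        arg_ids[arg_type] = arg[0] * feature_screen_size + arg[1]
    else:
        arg_ids[arg_type] = int(arg[0])

flat = [arg_ids[arg_type] for arg_type in arg_ids]            # one entry per argument type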
Beispiel #31
    def update(self, rbs, disc, lr, cter):
        # Compute R, which is value of the last observation
        obs = rbs[-1][-1]
        if obs.last():
            R = 0
        else:
            minimap = np.array(obs.observation['feature_minimap'],
                               dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['feature_screen'],
                              dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            feed = {
                self.minimap: minimap,
                self.screen: screen,
                self.info: info
            }
            R = self.sess.run(self.value, feed_dict=feed)[0]

        # Compute targets and masks
        minimaps = []
        screens = []
        infos = []

        value_target = np.zeros([len(rbs)], dtype=np.float32)
        value_target[-1] = R

        valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)
        spatial_action_selected = np.zeros([len(rbs), self.ssize**2],
                                           dtype=np.float32)
        valid_non_spatial_action = np.zeros(
            [len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)
        non_spatial_action_selected = np.zeros(
            [len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)

        rbs.reverse()
        for i, [obs, action, next_obs] in enumerate(rbs):
            minimap = np.array(obs.observation['feature_minimap'],
                               dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['feature_screen'],
                              dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            minimaps.append(minimap)
            screens.append(screen)
            infos.append(info)

            self.reward = obs.reward
            act_id = action.function
            act_args = action.arguments

            value_target[i] = self.reward + disc * value_target[i - 1]

            valid_actions = obs.observation["available_actions"]
            valid_non_spatial_action[i, valid_actions] = 1
            non_spatial_action_selected[i, act_id] = 1

            args = actions.FUNCTIONS[act_id].args
            for arg, act_arg in zip(args, act_args):
                if arg.name in ('screen', 'minimap', 'screen2'):
                    ind = act_arg[1] * self.ssize + act_arg[0]
                    valid_spatial_action[i] = 1
                    spatial_action_selected[i, ind] = 1

        minimaps = np.concatenate(minimaps, axis=0)
        screens = np.concatenate(screens, axis=0)
        infos = np.concatenate(infos, axis=0)
        # Train
        feed = {
            self.minimap: minimaps,
            self.screen: screens,
            self.info: infos,
            self.value_target: value_target,
            self.valid_spatial_action: valid_spatial_action,
            self.spatial_action_selected: spatial_action_selected,
            self.valid_non_spatial_action: valid_non_spatial_action,
            self.non_spatial_action_selected: non_spatial_action_selected,
            self.learning_rate: lr,
            self.score: self.reward
        }  # will this work?
        _, summary = self.sess.run([self.train_op, self.summary_op],
                                   feed_dict=feed)
        self.summary_writer.add_summary(summary, cter)
def reinforcement_train(training_episode):
    score_list = []
    EPISODES, episode, max_average = 20000, 0, 5.0
    while episode < training_episode:
        # Reset episode
        home_score, home_done, SAVING = 0, False, ''
        opponent_score, opponent_done = 0, False
        state = env.reset()

        home_feature_screen_list, home_feature_player_list, home_feature_units_list, home_available_actions_list = [], [], [], []
        home_fn_id_list, home_arg_ids_list, home_rewards, home_dones = [], [], [], []
        home_memory_state_list, home_carry_state_list = [], []

        memory_state = np.zeros([1, 256], dtype=np.float32)
        carry_state = np.zeros([1, 256], dtype=np.float32)
        while not home_done:
            home_state = state[0]
            #opponent_state = state[1]

            home_feature_screen = home_state[3]['feature_screen']
            home_feature_screen = preprocess_screen(home_feature_screen)
            home_feature_screen = np.transpose(home_feature_screen, (1, 2, 0))

            home_feature_player = home_state[3]['player']
            home_feature_player = preprocess_player(home_feature_player)

            home_available_actions = home_state[3]['available_actions']
            home_available_actions = preprocess_available_actions(
                home_available_actions)

            home_feature_units = home_state[3]['feature_units']
            home_feature_units = preprocess_feature_units(
                home_feature_units, feature_screen_size)
            #print("home_feature_units.shape: ", home_feature_units.shape)

            home_feature_screen_array = np.array([home_feature_screen])
            home_feature_player_array = np.array([home_feature_player])
            home_feature_units_array = np.array([home_feature_units])
            home_available_actions_array = np.array([home_available_actions])

            home_feature_screen_list.append(home_feature_screen_array)
            home_feature_player_list.append(home_feature_player_array)
            home_feature_units_list.append(home_feature_units_array)
            home_available_actions_list.append([home_available_actions])
            home_memory_state_list.append(memory_state)
            home_carry_state_list.append(carry_state)

            home_prediction = home_agent.act(home_feature_screen_array,
                                             home_feature_player_array,
                                             home_feature_units_array,
                                             home_available_actions_array,
                                             memory_state, carry_state)
            home_fn_pi = home_prediction[0]
            home_arg_pis = home_prediction[1]
            home_next_memory_state = home_prediction[3]
            home_next_carry_state = home_prediction[4]

            home_fn_samples, home_arg_samples = sample_actions(
                home_available_actions, home_fn_pi, home_arg_pis)
            home_fn_id, home_arg_ids = mask_unused_argument_samples(
                home_fn_samples, home_arg_samples)
            home_fn_id_list.append(home_fn_id[0])

            home_arg_id_list = []
            for arg_type in home_arg_ids.keys():
                arg_id = home_arg_ids[arg_type]
                home_arg_id_list.append(arg_id)

            home_arg_ids_list.append(np.array([home_arg_id_list]))
            home_actions_list = actions_to_pysc2(home_fn_id, home_arg_ids,
                                                 (32, 32))

            actions_list = [home_actions_list, actions.FUNCTIONS.no_op()]
            next_state = env.step(actions_list)

            home_next_state = next_state[0]

            home_done = home_next_state[0]
            if home_done == StepType.LAST:
                home_done = True
            else:
                home_done = False

            state = next_state
            memory_state = home_next_memory_state
            carry_state = home_next_carry_state

            home_reward = float(home_next_state[1])
            home_rewards.append(home_reward)
            home_dones.append(home_done)

            home_score += home_reward
            state = next_state
            if len(home_feature_screen_list) == 16:
                if arguments.training == True:
                    home_agent.reinforcement_replay(
                        home_feature_screen_list, home_feature_player_list,
                        home_feature_units_list, home_available_actions_list,
                        home_fn_id_list, home_arg_ids_list, home_rewards,
                        home_dones, home_memory_state_list,
                        home_carry_state_list)

                home_feature_screen_list, home_feature_player_list, home_feature_units_list, home_available_actions_list = [], [], [], []
                home_fn_id_list, home_arg_ids_list, home_rewards, home_dones = [], [], [], []
                home_memory_state_list, home_carry_state_list = [], []

        score_list.append(home_score)
        average = sum(score_list) / len(score_list)

        PlotModel(home_score, episode)
        print("episode: {}/{}, score: {}, average: {:.2f} {}".format(
            episode, EPISODES, home_score, average, SAVING))
        if episode < EPISODES:
            episode += 1

    env.close()
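Both training loops here collect fixed-length chunks of 16 transitions (features, action ids, rewards, dones, LSTM states) and hand each chunk to a replay/update call before clearing the buffers. A compact sketch of that chunked-rollout pattern, with a hypothetical update function standing in for home_agent.reinforcement_replay:

import numpy as np

ROLLOUT_LEN = 16

def update_fn(screens, fn_ids, rewards, dones):
    # Hypothetical stand-in for home_agent.reinforcement_replay(...).
    print('updating on', len(screens), 'transitions')

screens, fn_ids, rewards, dones = [], [], [], []
for step in range(40):                               # toy environment loop
    screens.append(np.zeros((32, 32), dtype=np.float32))
    fn_ids.append(0)
    rewards.append(0.0)
    dones.append(False)
    if len(screens) == ROLLOUT_LEN:
        update_fn(screens, fn_ids, rewards, dones)
        screens, fn_ids, rewards, dones = [], [], [], []   # start the next chunk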