Example #1
    def update_master_policy(self, rbs, disc, lr, cter):
        samples = random.sample(rbs, batch_size)
        minimaps = []
        screens = []
        infos = []
        next_minimaps = []
        next_screens = []
        next_infos = []
        actions = []
        rewards = []
        for i, [obs, _, action, _, next_obs] in enumerate(samples):
            minimap = np.array(obs.observation['minimap'], dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['screen'], dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            next_minimap = np.array(next_obs.observation['minimap'], dtype=np.float32)
            next_minimap = np.expand_dims(U.preprocess_minimap(next_minimap), axis=0)
            next_screen = np.array(next_obs.observation['screen'], dtype=np.float32)  # use next_obs, not obs
            next_screen = np.expand_dims(U.preprocess_screen(next_screen), axis=0)
            next_info = np.zeros([1, self.isize], dtype=np.float32)
            next_info[0, next_obs.observation['available_actions']] = 1  # availability mask of the next observation
            reward = next_obs.reward

            minimaps.append(minimap)
            screens.append(screen)
            infos.append(info)
            next_minimaps.append(next_minimap)
            next_screens.append(next_screen)
            next_infos.append(next_info)
            cur_action = np.zeros(num_subpolicies)
            cur_action[action] = 1
            actions.append(cur_action)
            rewards.append(reward)

        minimaps = np.concatenate(minimaps, axis=0)
        screens = np.concatenate(screens, axis=0)
        infos = np.concatenate(infos, axis=0)
        next_minimaps = np.concatenate(next_minimaps, axis=0)
        next_screens = np.concatenate(next_screens, axis=0)
        next_infos = np.concatenate(next_infos, axis=0)
        y_batch = []
        Qvalue_batch = self.sess_master.run(self.subpolicy_Q,
                                            feed_dict={self.minimap: next_minimaps,
                                                       self.screen: next_screens,
                                                       self.info: next_infos})
        for i in range(0, batch_size):
            terminal = samples[i][3]  # the fourth element of each transition is the terminal flag
            if terminal:
                y_batch.append(rewards[i])
            else:
                y_batch.append(rewards[i] + disc * np.max(Qvalue_batch[i]))

        self.sess_master.run(self.master_train_op,
                             feed_dict={self.minimap: minimaps,
                                        self.screen: screens,
                                        self.info: infos,
                                        self.y_input: y_batch,
                                        self.action_input: actions,
                                        self.learning_rate: lr})
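
Note on the target construction above: update_master_policy is a one-step Q-learning update over sampled transitions, so the target is the reward alone at terminal transitions and r + disc * max_a Q(s', a) otherwise. A minimal NumPy sketch of just that target computation; the array names here are illustrative, not from the source:

    import numpy as np

    def q_learning_targets(rewards, terminals, next_q_values, disc):
        """y_i = r_i for terminal transitions, else r_i + disc * max_a Q(s'_i, a)."""
        rewards = np.asarray(rewards, dtype=np.float32)
        bootstrap = disc * np.max(next_q_values, axis=1)   # greedy value of the next state
        return rewards + np.where(terminals, 0.0, bootstrap)

    # Two transitions, the second one terminal:
    # q_learning_targets([1.0, 0.0], [False, True], np.array([[0.2, 0.5], [0.1, 0.3]]), 0.99)
    # -> [1.495, 0.0]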
Example #2
    def step_low(self, ind_thread, obs, dir_high, act_id):
        # obs is the timestep passed in by the environment
        minimap = np.array(
            obs.observation['feature_minimap'], dtype=np.float32
        )  # the next four lines preprocess the minimap and screen features into the minimap and screen variables
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)

        # TODO: only use available actions
        info = np.zeros([1, self.isize],
                        dtype=np.float32)  # self.isize is the number of action functions
        info[0, obs.observation['available_actions']] = 1  # info marks the currently available actions

        # minerals, army count, worker count
        info_plus = np.zeros([1, self.info_plus_size], dtype=np.float32)

        info_plus[0] = (obs.observation.player.minerals,
                        obs.observation['player'][5],
                        obs.observation['player'][6],
                        obs.observation['player'][4])

        # info now has size isize + info_plus_size
        info = np.concatenate((info, info_plus), axis=1)

        dir_high_usedToFeedLowNet = np.ones([1, 1], dtype=np.float32)
        dir_high_usedToFeedLowNet[0][0] = dir_high
        act_ID = np.ones([1, 1], dtype=np.float32)
        act_ID[0][0] = act_id

        feed = {
            self.minimap: minimap,
            self.screen: screen,
            self.info: info,
            self.dir_high_usedToFeedLowNet: dir_high_usedToFeedLowNet,
            self.act_id: act_ID
        }
        spatial_action_low = self.sess.run(  # dtype: Tensor("actor_low/Softmax:0", shape=(?, 4096), dtype=float32, device=/device:GPU:0)
            # e.g. [array([[0.00019935, 0.00025348, 0.00024519, ..., 0.00016189, 0.00016014, 0.00016842]], dtype=float32)]
            [self.spatial_action_low],
            feed_dict=feed)

        # Choose the position at which to apply the action
        # spatial_action_low = spatial_action_low.ravel()  # ravel() is a NumPy function that flattens the array
        target = np.argmax(spatial_action_low)
        target = [int(target // self.ssize),
                  int(target % self.ssize)
                  ]  # position at which to apply the action; open question: for box-select actions this gives only one corner

        # if False:  # permanently disabled debug print
        #   print(actions.FUNCTIONS[act_id].name, target)

        # Epsilon greedy exploration: with probability epsilon[1] (0.2), randomly perturb the chosen position
        if self.training and np.random.rand() < self.epsilon[1]:
            dy = np.random.randint(-4, 5)
            target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
            dx = np.random.randint(-4, 5)
            target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

        return target[0], target[1]
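
The coordinate handling in step_low recurs in nearly every step variant below: the spatial head outputs a flat distribution over ssize * ssize pixels, its argmax is unflattened into (row, column), and exploration merely jitters that point by up to 4 pixels. A standalone sketch of that logic, with illustrative ssize and epsilon values rather than the ones configured in the source:

    import numpy as np

    def pick_spatial_target(spatial_probs, ssize, epsilon=0.2, training=True):
        """Unflatten the argmax of a flat spatial distribution and optionally jitter it."""
        flat = int(np.argmax(np.asarray(spatial_probs).ravel()))
        target = [flat // ssize, flat % ssize]        # (row, column) on the ssize x ssize grid
        if training and np.random.rand() < epsilon:
            target[0] = int(np.clip(target[0] + np.random.randint(-4, 5), 0, ssize - 1))
            target[1] = int(np.clip(target[1] + np.random.randint(-4, 5), 0, ssize - 1))
        return target

    # e.g. pick_spatial_target(np.random.rand(64 * 64), ssize=64)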
Example #3
    def step(self, obs):

        minimap = np.array(obs.observation['minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        # TODO: only use available actions
        action_indices = [
            0, 1, 2, 3, 5, 7, 12, 13, 14, 15, 18, 19, 20, 261, 274, 331, 332,
            333, 334, 451, 452, 453, 456
        ]
        valid_actions = list(
            set(obs.observation['available_actions']) & set(action_indices))
        # print("valid_actions",valid_actions)
        valid_actions_indices = [
            action_indices.index(i) for i in valid_actions
        ]
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, valid_actions_indices] = 1

        feed = {self.minimap: minimap, self.screen: screen, self.info: info}
        non_spatial_action, spatial_action = self.sess.run(
            [self.non_spatial_action, self.spatial_action], feed_dict=feed)

        # print("non_spatial_action",non_spatial_action.shape,len(non_spatial_action.ravel()))
        # print("spatial_action",spatial_action.ravel().shape,len(spatial_action.ravel()))
        # Select an action and a spatial target
        non_spatial_action = non_spatial_action.ravel()
        spatial_action = spatial_action.ravel()
        # valid_actions = obs.observation['available_actions']
        # act_id = valid_actions[np.argmax(non_spatial_action[valid_actions])]
        # print("valid",non_spatial_action[valid_actions_indices])

        act_id = valid_actions[np.argmax(
            non_spatial_action[valid_actions_indices])]
        # print("SELECTED",act_id)
        target = np.argmax(spatial_action)
        target = [int(target // self.ssize), int(target % self.ssize)]

        # Epsilon greedy exploration
        if self.training and np.random.rand() < self.epsilon[0]:
            act_id = np.random.choice(valid_actions)
        if self.training and np.random.rand() < self.epsilon[1]:
            dy = np.random.randint(-4, 5)
            target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
            dx = np.random.randint(-4, 5)
            target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

        # Set act_id and act_args
        act_args = []
        for arg in actions.FUNCTIONS[act_id].args:
            if arg.name in ('screen', 'minimap', 'screen2'):
                act_args.append([target[1], target[0]])
            else:
                act_args.append([0])  # TODO: Be careful
        return actions.FunctionCall(act_id, act_args)
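
The restriction of the policy argmax to the currently available actions, done here through valid_actions_indices, is the key move shared with the later examples that index directly with obs.observation['available_actions']. A small sketch of that masking step in isolation (names are illustrative, not from the source):

    import numpy as np

    def pick_valid_action(non_spatial_probs, valid_actions):
        """Return the available action id with the highest policy probability."""
        valid_actions = np.asarray(valid_actions)
        best = np.argmax(non_spatial_probs[valid_actions])  # argmax over the valid subset
        return int(valid_actions[best])                     # map back to the original id space

    probs = np.array([0.05, 0.40, 0.10, 0.05, 0.05, 0.05, 0.05, 0.20, 0.03, 0.02])
    print(pick_valid_action(probs, [0, 2, 7]))  # -> 7, the best among the available ids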
Example #4
    def step(self, obs):
        screen = np.array(obs.observation.feature_screen, dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        minimap = np.array(obs.observation.feature_minimap, dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        structured = np.zeros([1, self.structured_dimensions],
                              dtype=np.float32)
        structured[0, obs.observation.available_actions] = 1

        feed_dict = {
            self.screen_ph: screen,
            self.minimap_ph: minimap,
            self.structured_ph: structured
        }
        non_spatial_action, spatial_action = self.sess.run(
            [self.non_spatial_action, self.spatial_action],
            feed_dict=feed_dict)

        non_spatial_action, spatial_action = non_spatial_action.ravel(
        ), spatial_action.ravel()
        available_actions = obs.observation.available_actions
        action_id = available_actions[np.argmax(
            non_spatial_action[available_actions])]
        spatial_target = np.argmax(spatial_action)
        spatial_target = [
            int(spatial_target // self.screen_dimensions),
            int(spatial_target % self.screen_dimensions)
        ]

        # epsilon-greedy exploration
        if self.training and np.random.rand() < self.epsilon[0]:
            action_id = np.random.choice(available_actions)
        if self.training and np.random.rand() < self.epsilon[1]:
            delta_y, delta_x = np.random.randint(-4,
                                                 5), np.random.randint(-4, 5)
            spatial_target[0] = int(
                max(
                    0,
                    min(self.screen_dimensions - 1,
                        spatial_target[0] + delta_y)))
            spatial_target[1] = int(
                max(
                    0,
                    min(self.screen_dimensions - 1,
                        spatial_target[1] + delta_x)))

        action_args = []
        for arg in actions.FUNCTIONS[action_id].args:
            if arg.name in ('screen', 'minimap', 'screen2'):
                action_args.append([spatial_target[1], spatial_target[0]])
            else:
                action_args.append([0])
        return actions.FunctionCall(action_id, action_args)
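
A recurring detail in these step methods is the argument order: FunctionCall expects spatial arguments as (x, y), i.e. (column, row), which is why target[1] is appended before target[0]; every non-spatial argument is defaulted to 0 (e.g. "now" / "not queued"). A hedged sketch of that argument assembly, assuming the same pysc2 actions module the snippets import:

    from pysc2.lib import actions

    def build_function_call(act_id, target_row, target_col):
        """Fill the argument list for a pysc2 FunctionCall: (x, y) for spatial args, [0] otherwise."""
        act_args = []
        for arg in actions.FUNCTIONS[act_id].args:
            if arg.name in ('screen', 'minimap', 'screen2'):
                act_args.append([target_col, target_row])  # pysc2 expects (x, y)
            else:
                act_args.append([0])                       # default for queued/select flags etc.
        return actions.FunctionCall(act_id, act_args)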
Example #5
    def step(self, obs):
        minimap = np.array(obs.observation['feature_minimap'],
                           dtype=np.float32)
        # self.logger.info(minimap)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        # TODO: only use available actions
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1

        feed = {self.minimap: minimap, self.screen: screen, self.info: info}
        non_spatial_action, spatial_action = self.sess.run(
            [self.non_spatial_action, self.spatial_action], feed_dict=feed)

        # Select an action and a spatial target
        non_spatial_action = non_spatial_action.ravel(
        )  # .ravel() flattens the output into a 1D array
        spatial_action = spatial_action.ravel()
        valid_actions = obs.observation['available_actions']
        act_id = valid_actions[np.argmax(
            non_spatial_action[valid_actions]
        )]  # id of the best valid non-spatial action
        target = np.argmax(spatial_action)
        target = [
            int(target // self.ssize),
            int(target % self.ssize)
        ]  # unflatten the argmax: idx // ssize gives the row, idx % ssize gives the column

        if False:  # permanently disabled debug logging
            self.logger.info(actions.FUNCTIONS[act_id].name, target)

        # Epsilon greedy exploration.
        if self.training and np.random.rand(
        ) < self.epsilon[0]:  # with probability epsilon[0], pick a random valid action
            act_id = np.random.choice(valid_actions)
        if self.training and np.random.rand(
        ) < self.epsilon[1]:  # only jitters the chosen target by up to 4 pixels, not a fresh pick over all pixels
            dy = np.random.randint(-4, 5)
            target[0] = int(max(0, min(self.ssize - 1, target[0] +
                                       dy)))  # clamp to [0, ssize - 1]
            dx = np.random.randint(-4, 5)
            target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

        # Set act_id and act_args
        act_args = []
        for arg in actions.FUNCTIONS[
                act_id].args:  # find args of indexed action
            if arg.name in ('screen', 'minimap', 'screen2'):
                act_args.append([target[1], target[0]])
            else:
                act_args.append([0])  # TODO: Be careful (???)
        return actions.FunctionCall(act_id, act_args)
Example #6
    def get_cur_Q_action(self, obs):
        minimap = np.array(obs.observation['minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        # TODO: only use available actions
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1

        feed_master = {self.minimap: minimap,
                       self.screen: screen,
                       self.info: info}
        subpolicy_selected = np.argmax(self.sess_master.run(self.subpolicy_Q, feed_dict=feed_master), axis=1)[0]
        return subpolicy_selected
Example #7
    def step(self, obs):
        minimap = np.array(obs.observation['minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        # TODO: only use available actions
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1

        subpolicy_index = self.get_cur_Q_action(obs)

        #print("Subpolicy Chosen is :"+str(subpolicy_index))
        feed = {self.minimap: minimap,
                self.screen: screen,
                self.info: info}
        cur_spatial_action, cur_non_spatial_action, _ = self.subpolicies[subpolicy_index]

        non_spatial_action, spatial_action = self.sess_master.run(
            [cur_non_spatial_action, cur_spatial_action],
            feed_dict=feed)

        # Select an action and a spatial target
        non_spatial_action = non_spatial_action.ravel()
        spatial_action = spatial_action.ravel()
        valid_actions = obs.observation['available_actions']
        act_id = valid_actions[np.argmax(non_spatial_action[valid_actions])]
        target = np.argmax(spatial_action)
        target = [int(target // self.ssize), int(target % self.ssize)]

        if False:
            print(actions.FUNCTIONS[act_id].name, target)

        # Epsilon greedy exploration
        if self.training and np.random.rand() < self.epsilon[0]:
            act_id = np.random.choice(valid_actions)
        if self.training and np.random.rand() < self.epsilon[1]:
            dy = np.random.randint(-4, 5)
            target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
            dx = np.random.randint(-4, 5)
            target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

        # Set act_id and act_args
        act_args = []
        for arg in actions.FUNCTIONS[act_id].args:
            if arg.name in ('screen', 'minimap', 'screen2'):
                act_args.append([target[1], target[0]])
            else:
                act_args.append([0])  # TODO: Be careful
        return actions.FunctionCall(act_id, act_args)
Example #8
    def step(self, obs):
        super(A3CAgent, self).step(obs)
        minimap = np.array(obs.observation['feature_minimap'],
                           dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        # TODO: only use available actions
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1  # mask for available actions

        feed = {self.minimap: minimap, self.screen: screen, self.info: info}
        non_spatial_action, spatial_action = self.sess.run(
            [self.non_spatial_action, self.spatial_action], feed_dict=feed)

        # Select an action and a spatial target
        non_spatial_action = non_spatial_action.ravel()
        spatial_action = spatial_action.ravel()
        valid_actions = obs.observation['available_actions']
        act_id = valid_actions[np.argmax(non_spatial_action[valid_actions])]
        target = np.argmax(spatial_action)
        target = [int(target // self.ssize), int(target % self.ssize)]

        if False:
            print(actions.FUNCTIONS[act_id].name, target)

        # Epsilon greedy exploration
        if self.training and np.random.rand() < self.epsilon[0]:
            act_id = np.random.choice(valid_actions)
        if self.training and np.random.rand() < self.epsilon[1]:
            dy = np.random.randint(-4, 5)
            target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
            dx = np.random.randint(-4, 5)
            target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

        # Set act_id and act_args
        act_args = []
        for arg in actions.FUNCTIONS[act_id].args:
            if arg.name in ('screen', 'minimap', 'screen2'):
                act_args.append([target[1],
                                 target[0]])  #use (y,x)->(height,width)
            else:
                act_args.append([0])  #not queued TODO: Be careful
        return actions.FunctionCall(act_id, act_args)
Example #9
  def step(self, obs):
    minimap = np.array(obs.observation['minimap'], dtype=np.float32)
    minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
    screen = np.array(obs.observation['screen'], dtype=np.float32)
    screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
    # TODO: only use available actions
    info = np.zeros([1, self.isize], dtype=np.float32)
    info[0, obs.observation['available_actions']] = 1

    feed = {self.minimap: minimap,
            self.screen: screen,
            self.info: info}
    non_spatial_action, spatial_action = self.sess.run(
      [self.non_spatial_action, self.spatial_action],
      feed_dict=feed)

    # Select an action and a spatial target
    non_spatial_action = non_spatial_action.ravel()
    spatial_action = spatial_action.ravel()
    valid_actions = obs.observation['available_actions']
    act_id = valid_actions[np.argmax(non_spatial_action[valid_actions])]
    target = np.argmax(spatial_action)
    target = [int(target // self.ssize), int(target % self.ssize)]

    if False:
      print(actions.FUNCTIONS[act_id].name, target)

    # Epsilon greedy exploration
    if self.training and np.random.rand() < self.epsilon[0]:
      act_id = np.random.choice(valid_actions)
    if self.training and np.random.rand() < self.epsilon[1]:
      dy = np.random.randint(-4, 5)
      target[0] = int(max(0, min(self.ssize-1, target[0]+dy)))
      dx = np.random.randint(-4, 5)
      target[1] = int(max(0, min(self.ssize-1, target[1]+dx)))

    # Set act_id and act_args
    act_args = []
    for arg in actions.FUNCTIONS[act_id].args:
      if arg.name in ('screen', 'minimap', 'screen2'):
        act_args.append([target[1], target[0]])
      else:
        act_args.append([0])  # TODO: Be careful
    return actions.FunctionCall(act_id, act_args)
Example #10
    def step_high(self, obs):  # obs is the timestep passed in by the environment
        minimap = np.array(
            obs.observation['feature_minimap'], dtype=np.float32
        )  # the next four lines preprocess the minimap and screen features into the minimap and screen variables
        minimap = np.expand_dims(U.preprocess_minimap(minimap),
                                 axis=0)
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        # TODO: only use available actions

        info = np.zeros([1, self.isize],
                        dtype=np.float32)  # self.isize is the number of action functions
        info[0, obs.observation['available_actions']] = 1  # info marks the currently available actions

        # minerals, army count, worker count
        info_plus = np.zeros([1, self.info_plus_size], dtype=np.float32)
        info_plus[0] = (obs.observation.player.minerals,
                        obs.observation['player'][5],
                        obs.observation['player'][6],
                        obs.observation['player'][4])

        # info now has size isize + info_plus_size
        info = np.concatenate((info, info_plus), axis=1)

        feed = {self.minimap: minimap, self.screen: screen, self.info: info}
        dir_high = self.sess.run([self.dir_high], feed_dict=feed)

        # Select the macro-action id

        # DHN TODO: dir_high could first be filtered, e.g. by checking whether the macro-action's
        # hard-coded micro-actions are in obs.observation['available_actions']
        # valid_dir_high = obs.observation['available_actions']

        dir_high_id = np.argmax(dir_high)  # id of the macro-action to execute (0-based)

        # if False:  # permanently disabled debug print
        #   print(actions.FUNCTIONS[act_id].name, target)

        # Epsilon greedy exploration: with probability epsilon[0] (0.05), pick a random macro-action
        # (overriding the dir_high_id chosen above)
        if self.training and np.random.rand() < self.epsilon[0]:
            dir_high_id = random.randint(0, num_macro_action - 1)

        return dir_high_id
Example #11
    def step(self, obs, use_unit_selector):
        minimap = np.array(obs.observation['minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1
        # hard-coded init sequence; in pysc2's FUNCTIONS table, 7 = select_army,
        # 5 = select_unit and 4 = select_control_group
        if self.init_counter == 0:
            self.init_counter += 1
            return actions.FunctionCall(7, [[1]])
        elif self.init_counter == 1:
            self.init_counter += 1
            return actions.FunctionCall(
                5, [[sc_ui.ActionMultiPanel.SingleSelect], [0]])
        elif self.init_counter == 2:
            self.init_counter += 1
            return actions.FunctionCall(4, [[1], [0]])
        elif self.init_counter == 3:
            self.init_counter += 1
            return actions.FunctionCall(7, [[1]])
        elif self.init_counter == 4:
            self.init_counter += 1
            return actions.FunctionCall(
                5, [[sc_ui.ActionMultiPanel.SingleSelect], [1]])
        elif self.init_counter == 5:
            self.init_counter += 1
            return actions.FunctionCall(4, [[1], [1]])
        elif use_unit_selector:
            unitSel = self.get_unit_sel_res(obs)
            if self.training and np.random.rand() < self.epsilon[0]:
                unitSel = np.random.randint(0, 4)
            if unitSel == num_units + 1:
                return actions.FunctionCall(7, [[1]])
            elif unitSel == num_units:
                feed = {
                    self.minimap: minimap,
                    self.screen: screen,
                    self.info: info
                }

                non_spatial_action, spatial_action = self.sess_master.run(
                    [self.non_spatial_action, self.spatial_action],
                    feed_dict=feed)

                # Select an action and a spatial target
                non_spatial_action = non_spatial_action.ravel()
                spatial_action = spatial_action.ravel()
                valid_actions = obs.observation['available_actions']
                act_id = valid_actions[np.argmax(
                    non_spatial_action[valid_actions])]
                target = np.argmax(spatial_action)
                target = [int(target // self.ssize), int(target % self.ssize)]

                # Epsilon greedy exploration
                if self.training and np.random.rand() < self.epsilon[0]:
                    act_id = np.random.choice(valid_actions)
                if self.training and np.random.rand() < self.epsilon[1]:
                    dy = np.random.randint(-4, 5)
                    target[0] = int(max(0, min(self.ssize - 1,
                                               target[0] + dy)))
                    dx = np.random.randint(-4, 5)
                    target[1] = int(max(0, min(self.ssize - 1,
                                               target[1] + dx)))

                # Set act_id and act_args
                act_args = []
                for arg in actions.FUNCTIONS[act_id].args:
                    if arg.name in ('screen', 'minimap', 'screen2'):
                        act_args.append([target[1], target[0]])
                    else:
                        act_args.append([0])  # TODO: Be careful
                return actions.FunctionCall(act_id, act_args)
            else:
                return actions.FunctionCall(4, [[0], [unitSel]])
        else:
            feed = {
                self.minimap: minimap,
                self.screen: screen,
                self.info: info
            }

            non_spatial_action, spatial_action = self.sess_master.run(
                [self.non_spatial_action, self.spatial_action], feed_dict=feed)

            # Select an action and a spatial target
            non_spatial_action = non_spatial_action.ravel()
            spatial_action = spatial_action.ravel()
            valid_actions = obs.observation['available_actions']
            act_id = valid_actions[np.argmax(
                non_spatial_action[valid_actions])]
            target = np.argmax(spatial_action)
            target = [int(target // self.ssize), int(target % self.ssize)]

            # Epsilon greedy exploration
            if self.training and np.random.rand() < self.epsilon[0]:
                act_id = np.random.choice(valid_actions)
            if self.training and np.random.rand() < self.epsilon[1]:
                dy = np.random.randint(-4, 5)
                target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
                dx = np.random.randint(-4, 5)
                target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

            # Set act_id and act_args
            act_args = []
            for arg in actions.FUNCTIONS[act_id].args:
                if arg.name in ('screen', 'minimap', 'screen2'):
                    act_args.append([target[1], target[0]])
                else:
                    act_args.append([0])  # TODO: Be careful
            return actions.FunctionCall(act_id, act_args)
Example #12
    def update_high(self, ind_thread, rbs, dhs, disc, lr_a, lr_c, cter):
        # rbs (replay buffers) is the list of [last_timesteps[0], actions[0], timesteps[0]] entries,
        # one per step taken since the last update; see run_loop line 25
        # dhs (dir_high_buffers) is the list of chosen macro-action indices, e.g. with 5 macro-actions
        # dhs looks like [5, 4, 1, 2, 3, 4, 2, 1, ...]

        dir_high_selected = np.zeros(
            [len(rbs), num_macro_action],
            dtype=np.float32)  # one-hot of which macro-action was selected at each step
        for i in range(len(rbs)):
            dir_high_selected[i, dhs[i][0] - 1] = 1

        # Compute R, which is the value of the last observation
        obs = rbs[-1][-1]  # the last element of rbs holds the current timestep, so obs can be treated as the timestep
        if obs.last():
            R = 0
        else:
            minimap = np.array(obs.observation['feature_minimap'],
                               dtype=np.float32)  # similar to lines 105-111
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['feature_screen'],
                              dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            info_plus = np.zeros([1, self.info_plus_size], dtype=np.float32)
            info_plus[0] = (obs.observation.player.minerals,
                            obs.observation['player'][5],
                            obs.observation['player'][6],
                            obs.observation['player'][4])

            # info now has size isize + info_plus_size
            info = np.concatenate((info, info_plus), axis=1)

            feed = {
                self.minimap: minimap,
                self.screen: screen,
                self.info: info
            }
            R = self.sess.run(self.value_high, feed_dict=feed)[0]

        # Compute targets and masks
        minimaps = []
        screens = []
        infos = []

        value_target = np.zeros(
            [len(rbs)], dtype=np.float32)  # len(rbs) is the number of steps the agent took in the episode
        value_target[-1] = R

        valid_spatial_action = np.zeros([len(rbs)],
                                        dtype=np.float32)  # whether each step needs a coordinate argument
        spatial_action_selected = np.zeros(
            [len(rbs), self.ssize**2],
            dtype=np.float32)  # first dim: whether each step needs a coordinate argument; second dim: which coordinate was chosen

        rbs.reverse()  # reverse first, similar to Morvan's A3C_continuous_action.py
        micro_isdone = GL.get_value(ind_thread, "micro_isdone")
        micro_isdone.reverse()
        sum_high_reward = GL.get_value(ind_thread, "sum_high_reward")
        for i, [obs, action,
                next_obs] in enumerate(rbs):  # one iteration per step the agent took in the episode
            minimap = np.array(obs.observation['feature_minimap'],
                               dtype=np.float32)  # similar to lines 105-111
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['feature_screen'],
                              dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            info_plus = np.zeros([1, self.info_plus_size], dtype=np.float32)
            info_plus[0] = (obs.observation.player.minerals,
                            obs.observation['player'][5],
                            obs.observation['player'][6],
                            obs.observation['player'][4])

            # info now has size isize + info_plus_size
            info = np.concatenate((info, info_plus), axis=1)

            minimaps.append(minimap)
            screens.append(screen)
            infos.append(info)

            # reward = obs.reward
            reward = high_reward(ind_thread, next_obs, obs, action,
                                 micro_isdone[i])  # shaped high-level reward designed by Xiangsen
            sum_high_reward += reward
            GL.add_value_list(ind_thread, "high_reward_of_episode", reward)
            act_id = action.function  # id of the action the agent chose at this step
            act_args = action.arguments

            value_target[i] = reward + disc * value_target[
                i - 1]  # gives the value V_s of each state in the episode; see the gamma figure in Morvan's Q-Learning tutorial
            # value_target is not reversed back afterwards, apparently because the other inputs
            # (minimap, screen, info, ...) are also stored back-to-front; see lines 181-182

            args = actions.FUNCTIONS[act_id].args
            for arg, act_arg in zip(args, act_args):
                if arg.name in ('screen', 'minimap', 'screen2'):
                    ind = act_arg[1] * self.ssize + act_arg[0]
                    valid_spatial_action[i] = 1
                    spatial_action_selected[i, ind] = 1

        GL.set_value(ind_thread, "sum_high_reward", sum_high_reward)
        minimaps = np.concatenate(minimaps, axis=0)
        screens = np.concatenate(screens, axis=0)
        infos = np.concatenate(infos, axis=0)

        # Train
        feed = {
            self.minimap: minimaps,
            self.screen: screens,
            self.info: infos,
            self.value_target_high: value_target,
            self.dir_high_selected: dir_high_selected,
            self.learning_rate_a_high: lr_a,
            self.learning_rate_c_high: lr_c
        }
        _, __, summary = self.sess.run(
            [self.update_a_high, self.update_c_high, self.summary_op_high],
            feed_dict=feed)
        self.summary_writer.add_summary(summary, cter)

        GL.set_value(ind_thread, "micro_isdone", [])
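
The value targets in update_high are ordinary discounted returns computed backwards: the buffer is reversed, the last array slot is seeded with the bootstrap value R, and each step adds its reward to the discounted previous target. A small sketch of just that recursion (function and argument names are illustrative, not from the source):

    import numpy as np

    def discounted_targets(rewards_reversed, bootstrap_value, disc):
        """value_target[i] = r_i + disc * value_target[i - 1], over a reversed trajectory."""
        value_target = np.zeros(len(rewards_reversed), dtype=np.float32)
        value_target[-1] = bootstrap_value          # seeds the recursion via the i - 1 == -1 index
        for i, r in enumerate(rewards_reversed):
            value_target[i] = r + disc * value_target[i - 1]
        return value_target

    # rewards in reverse time order (last step first), episode ended so no bootstrap:
    print(discounted_targets([1.0, 0.0, 0.0], 0.0, 0.9))  # -> [1.0, 0.9, 0.81]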
Example #13
    def update_low(self, ind_thread, rbs, dhs, disc, lr_a, lr_c, cter,
                   macro_type, coord_type):

        # rbs (replay buffers) is the list of [last_timesteps[0], actions[0], timesteps[0]] entries,
        # one per step the agent took in the episode; see run_loop line 25

        # Compute R, which is the value of the last observation
        obs = rbs[-1][-1]  # the last element of rbs holds the current timestep, so obs can be treated as the timestep
        if obs.last():
            R = 0
        else:
            minimap = np.array(obs.observation['feature_minimap'],
                               dtype=np.float32)  # similar to lines 105-111
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['feature_screen'],
                              dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            info_plus = np.zeros([1, self.info_plus_size], dtype=np.float32)
            info_plus[0] = (obs.observation.player.minerals,
                            obs.observation['player'][5],
                            obs.observation['player'][6],
                            obs.observation['player'][4])
            # info now has size isize + info_plus_size
            info = np.concatenate((info, info_plus), axis=1)
            # print('info')
            # print(info)
            # print(info_plus)

            dir_high_usedToFeedLowNet = np.ones([1, 1], dtype=np.float32)
            dir_high_usedToFeedLowNet[0][0] = dhs[0]
            act_id = np.ones([1, 1], dtype=np.float32)
            # act_ID[0][0] = rbs[-1][1].function
            # the action stored in rbs cannot be used here, because it may be a no_op inserted as a fallback
            # when the chosen action was invalid (to keep the game from crashing);
            # the act_id fed here should be the one computed by step_low
            act_id[0][0] = GL.get_value(ind_thread, "act_id_micro")

            feed = {
                self.minimap: minimap,
                self.screen: screen,
                self.info: info,
                self.dir_high_usedToFeedLowNet: dir_high_usedToFeedLowNet,
                self.act_id: act_id,
            }
            R = self.sess.run(self.value_low, feed_dict=feed)[0]

        # Compute targets and masks
        minimaps = []
        screens = []
        infos = []
        dir_highs = []
        act_ids = []

        value_target = np.zeros(
            [len(rbs)], dtype=np.float32)  # len(rbs) is the number of steps the agent took in the episode
        value_target[-1] = R

        valid_spatial_action = np.zeros([len(rbs)],
                                        dtype=np.float32)  # whether each step needs a coordinate argument
        spatial_action_selected = np.zeros(
            [len(rbs), self.ssize**2],
            dtype=np.float32)  # first dim: whether each step needs a coordinate argument; second dim: which coordinate was chosen

        rbs.reverse()  # reverse first, similar to Morvan's A3C_continuous_action.py
        micro_isdone = GL.get_value(ind_thread, "micro_isdone")
        micro_isdone.reverse()

        sum_low_reward = GL.get_value(ind_thread, "sum_low_reward")
        for i, [obs, action,
                next_obs] in enumerate(rbs):  # one iteration per step the agent took in the episode
            minimap = np.array(obs.observation['feature_minimap'],
                               dtype=np.float32)  # similar to lines 105-111
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['feature_screen'],
                              dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            info_plus = np.zeros([1, self.info_plus_size], dtype=np.float32)
            info_plus[0] = (obs.observation.player.minerals,
                            obs.observation['player'][5],
                            obs.observation['player'][6],
                            obs.observation['player'][4])

            # info now has size isize + info_plus_size
            info = np.concatenate((info, info_plus), axis=1)

            minimaps.append(minimap)
            screens.append(screen)
            infos.append(info)

            dir_high_usedToFeedLowNet = np.ones([1, 1], dtype=np.float32)
            dir_high_usedToFeedLowNet[0][0] = dhs[i]
            act_ID = np.ones([1, 1], dtype=np.float32)
            # act_ID[0][0] = act_id
            # the action stored in rbs cannot be used here, because it may be a no_op inserted as a fallback
            # when the chosen action was invalid (to keep the game from crashing);
            # the act_ID fed here should be the act_id computed by step_low
            act_ID[0][0] = GL.get_value(ind_thread, "act_id_micro")
            # dir_highs.append(dir_high_usedToFeedLowNet)
            # act_ids.append(act_ID)

            coord = [0, 0]
            # coord[0], coord[1] = [32, 32]
            coord[0], coord[1] = self.step_low(ind_thread, obs,
                                               dir_high_usedToFeedLowNet,
                                               act_ID)
            reward = low_reward(next_obs, obs, coord, micro_isdone[i],
                                macro_type, coord_type)
            sum_low_reward += reward
            GL.add_value_list(ind_thread, "low_reward_of_episode", reward)

            act_id = action.function  # id of the action the agent chose at this step
            act_args = action.arguments

            value_target[i] = reward + disc * value_target[
                i - 1]  # gives the value V_s of each state in the episode; see the gamma figure in Morvan's Q-Learning tutorial
            # value_target is not reversed back afterwards, apparently because the other inputs
            # (minimap, screen, info, ...) are also stored back-to-front; see lines 181-182

            args = actions.FUNCTIONS[act_id].args
            for arg, act_arg in zip(args, act_args):
                if arg.name in ('screen', 'minimap', 'screen2'):
                    ind = act_arg[1] * self.ssize + act_arg[0]
                    valid_spatial_action[i] = 1
                    spatial_action_selected[i, ind] = 1

        GL.set_value(ind_thread, "sum_low_reward", sum_low_reward)

        minimaps = np.concatenate(minimaps, axis=0)
        screens = np.concatenate(screens, axis=0)
        infos = np.concatenate(infos, axis=0)

        # in practice low_net uses single-step updates, so the feed below contains only one frame of data

        # Train
        feed = {
            self.minimap: minimaps,
            self.screen: screens,
            self.info: infos,
            # self.dir_high_usedToFeedLowNet: dir_highs,
            self.dir_high_usedToFeedLowNet: dir_high_usedToFeedLowNet,
            # self.act_id: act_ids,
            self.act_id: act_ID,
            self.value_target_low: value_target,
            self.valid_spatial_action_low: valid_spatial_action,
            self.spatial_action_selected_low: spatial_action_selected,
            self.learning_rate_a_low: lr_a,
            self.learning_rate_c_low: lr_c
        }
        _, __, summary = self.sess.run(
            [self.update_a_low, self.update_c_low, self.summary_op_low],
            feed_dict=feed)
        self.summary_writer.add_summary(summary, cter)
Example #14
    def update(self, rbs, disc, lr, cter):
        # Compute R, which is value of the last observation
        obs = rbs[-1][-1]
        if obs.last():
            R = 0
        else:
            minimap = np.array(obs.observation['minimap'], dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['screen'], dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            feed = {
                self.minimap: minimap,
                self.screen: screen,
                self.info: info
            }
            R = self.sess.run(self.value, feed_dict=feed)[0]

        # Compute targets and masks
        minimaps = []
        screens = []
        infos = []
        episode_reward = 0
        # [NEW] Alejandro
        episode_modified_reward = 0
        #################

        value_target = np.zeros([len(rbs)], dtype=np.float32)
        value_target[-1] = R

        valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)
        spatial_action_selected = np.zeros([len(rbs), self.ssize**2],
                                           dtype=np.float32)
        valid_non_spatial_action = np.zeros(
            [len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)
        non_spatial_action_selected = np.zeros(
            [len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)

        rbs.reverse()
        for i, [obs, action, next_obs] in enumerate(rbs):
            minimap = np.array(obs.observation['minimap'], dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['screen'], dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            minimaps.append(minimap)
            screens.append(screen)
            infos.append(info)

            episode_reward += obs.reward
            reward = obs.reward

            # [NEW] Alejandro
            r_modified = reward

            last_dist = min_distance_to_enemy(obs, minimap=True)
            curr_dist = min_distance_to_enemy(next_obs, minimap=True)

            if last_dist == INF and curr_dist < INF:
                print("Zergling discovered!")
                r_modified += 1.0  # Zergling discovered
            elif last_dist < INF and curr_dist == INF:
                if reward <= 0 and i:
                    print("The marines have lost all the Zerglings!")
                    r_modified -= 1.0  # don't flee!
            elif last_dist < INF and curr_dist < INF and reward <= 0:
                # if curr_dist < last_dist:
                # print("Approaching Zergling")
                # elif curr_dist > last_dist:
                # print("Getting away from Zergling")
                r_modified += (last_dist - curr_dist) / 10
                if isnan(r_modified): print("NaN at point A")

            if action.function == 1:
                prop_bef = proportion_visible_onscreen(obs)
                prop_aft = proportion_visible_onscreen(next_obs)
                print("Camera action. Previous %visible: {}."
                      " Current %visible: {}".format(100 * prop_bef,
                                                     100 * prop_aft))
                r_modified += 2 * (prop_aft - prop_bef)
                # units_bef = count_units(obs, minimap=False)
                # units_aft = count_units(next_obs, minimap=False)
                # print("Camera action. Previous #units: {}. Current #units: {}".format(
                # units_bef, units_aft))
                # r_modified += 0.5*(units_aft - units_bef)
                if isnan(r_modified): print("NaN at point B")

            episode_modified_reward += r_modified
            #################

            act_id = action.function
            act_args = action.arguments

            # [OLD]
            # value_target[i] = reward + disc * value_target[i-1]
            # [NEW] Alejandro
            value_target[i] = r_modified + disc * value_target[i - 1]
            #################

            valid_actions = obs.observation["available_actions"]
            valid_non_spatial_action[i, valid_actions] = 1
            non_spatial_action_selected[i, act_id] = 1

            args = actions.FUNCTIONS[act_id].args
            for arg, act_arg in zip(args, act_args):
                if arg.name in ('screen', 'minimap', 'screen2'):
                    ind = act_arg[1] * self.ssize + act_arg[0]
                    valid_spatial_action[i] = 1
                    spatial_action_selected[i, ind] = 1

        self.episode_rewards.append(episode_reward)
        # [NEW] Alejandro
        self.episode_modified_rewards.append(episode_modified_reward)
        #################
        minimaps = np.concatenate(minimaps, axis=0)
        screens = np.concatenate(screens, axis=0)
        infos = np.concatenate(infos, axis=0)

        # Train
        feed = {
            self.minimap: minimaps,
            self.screen: screens,
            self.info: infos,
            self.value_target: value_target,
            self.valid_spatial_action: valid_spatial_action,
            self.spatial_action_selected: spatial_action_selected,
            self.valid_non_spatial_action: valid_non_spatial_action,
            self.non_spatial_action_selected: non_spatial_action_selected,
            self.learning_rate: lr
        }
        _, summary = self.sess.run([self.train_op, self.summary_op],
                                   feed_dict=feed)

        # [Roig] New summary to save the reward
        self.summary_writer.add_summary(summary, cter)
        summ = tf.Summary()
        summ.value.add(tag='Reward', simple_value=float(episode_reward))
        #summ.value.add(tag='Mean Value', simple_value=float(np.mean(value_target)))
        self.summary_writer.add_summary(summ, cter)
        # [NEW] Alejandro: new summary to save the modified reward
        rmod_summ = tf.Summary()
        rmod_summ.value.add(tag="Modified reward",
                            simple_value=episode_modified_reward)
        self.summary_writer.add_summary(rmod_summ, cter)
        #################

        self.summary_writer.flush()
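
The [NEW] blocks above implement simple reward shaping: the environment reward gets a bonus for discovering a Zergling, a penalty for losing sight of them without a kill, and a term proportional to the decrease in distance. A compact sketch of the distance-based part alone; INF and the distance arguments stand in for the source's min_distance_to_enemy helper, and the names are illustrative:

    INF = float('inf')

    def shaped_reward(env_reward, last_dist, curr_dist):
        """Distance-based shaping in the spirit of the update above."""
        r = env_reward
        if last_dist == INF and curr_dist < INF:
            r += 1.0                                 # enemy discovered
        elif last_dist < INF and curr_dist == INF:
            if env_reward <= 0:
                r -= 1.0                             # lost sight of the enemy without a kill
        elif last_dist < INF and curr_dist < INF and env_reward <= 0:
            r += (last_dist - curr_dist) / 10.0      # small bonus for closing the distance
        return r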
Example #15
    def update(self, replay_buffer, gamma, learning_rate, step):
        obs = replay_buffer[-1][-1]
        if obs.last():
            reward = 0
        else:
            screen = np.array(obs.observation.feature_screen, dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            minimap = np.array(obs.observation.feature_minimap,
                               dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            structured = np.zeros([1, self.structured_dimensions],
                                  dtype=np.float32)
            structured[0, obs.observation.available_actions] = 1

            feed_dict = {
                self.screen_ph: screen,
                self.minimap_ph: minimap,
                self.structured_ph: structured
            }
            reward = self.sess.run(self.value, feed_dict=feed_dict)

        # compute targets and masks
        screens, minimaps, structureds = [], [], []
        target_value = np.zeros([len(replay_buffer)], dtype=np.float32)
        target_value[-1] = reward

        valid_non_spatial_action = np.zeros(
            [len(replay_buffer), len(actions.FUNCTIONS)], dtype=np.float32)
        sample_non_spatial_action = np.zeros(
            [len(replay_buffer), len(actions.FUNCTIONS)], dtype=np.float32)
        valid_spatial_action = np.zeros([len(replay_buffer)], dtype=np.float32)
        sample_spatial_action = np.zeros(
            [len(replay_buffer), self.screen_dimensions**2], dtype=np.float32)

        replay_buffer.reverse()
        for i, [obs, action, next_obs] in enumerate(replay_buffer):
            screen = np.array(obs.observation.feature_screen, dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            minimap = np.array(obs.observation.feature_minimap,
                               dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            structured = np.zeros([1, self.structured_dimensions],
                                  dtype=np.float32)
            structured[0, obs.observation.available_actions] = 1

            screens.append(screen)
            minimaps.append(minimap)
            structureds.append(structured)

            reward = obs.reward
            action_id = action.function
            action_args = action.arguments

            target_value[i] = reward + gamma * target_value[i - 1]

            available_actions = obs.observation.available_actions
            valid_non_spatial_action[i, available_actions] = 1
            sample_non_spatial_action[i, action_id] = 1

            args = actions.FUNCTIONS[action_id].args
            for arg, action_arg in zip(args, action_args):
                if arg.name in ('screen', 'minimap', 'screen2'):
                    spatial_action = action_arg[
                        1] * self.screen_dimensions + action_arg[0]
                    valid_spatial_action[i] = 1
                    sample_spatial_action[i, spatial_action] = 1

        screens = np.concatenate(screens, axis=0)
        minimaps = np.concatenate(minimaps, axis=0)
        structureds = np.concatenate(structureds, axis=0)

        feed_dict = {
            self.screen_ph: screens,
            self.minimap_ph: minimaps,
            self.structured_ph: structureds,
            self.target_value_ph: target_value,
            self.valid_non_spatial_action_ph: valid_non_spatial_action,
            self.sample_non_spatial_action_ph: sample_non_spatial_action,
            self.valid_spatial_action_ph: valid_spatial_action,
            self.sample_spatial_action_ph: sample_spatial_action,
            self.learning_rate_ph: learning_rate
        }
        _, summary = self.sess.run([self.train_op, self.summary_op],
                                   feed_dict=feed_dict)
        self.summary_writer.add_summary(summary, step)
Example #16
    def step(self, obs):  # action selection is in here
        minimap = np.array(obs.observation['feature_minimap'],
                           dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        # TODO: only use available actions
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1

        feed = {self.minimap: minimap, self.screen: screen, self.info: info}

        # Select an action and a spatial target.
        valid_actions = np.zeros(self.isize, dtype=np.int32)
        valid_actions[obs.observation['available_actions']] = 1
        function_id_policy, spatial_policy = self.sess.run(
            [self.non_spatial_policy, self.spatial_policy], feed_dict=feed)

        # self.logger.info(f"spatial_policy unraveled: {spatial_policy}.")
        # self.logger.info(f":{spatial_policy.shape}.")

        function_id_policy = function_id_policy.ravel(
        )  # .ravel() flattens the output into a 1D array
        spatial_policy = spatial_policy.ravel()
        # self.logger.info(f"spatial_policy .raveled: {spatial_policy}")  # this will help with target below
        # self.logger.info(f":{spatial_policy.shape}.")
        function_id_policy *= valid_actions

        function_ids = np.arange(len(function_id_policy))
        function_id_policy /= np.sum(function_id_policy)
        #     act_id = valid_actions[np.argmax(non_spatial_policy[valid_actions])]
        act_id = np.random.choice(function_ids,
                                  p=np.squeeze(function_id_policy))
        target = np.argmax(spatial_policy)  # flat index into the ssize * ssize spatial map
        target = [int(target // self.ssize),
                  int(target % self.ssize)]  # // gives the row, % gives the column

        if False:
            self.logger.info(
                f"if false: {actions.FUNCTIONS[act_id].name, target}")

        # Epsilon greedy exploration. Keeping this to see if it works
        # basically, if eps greedy: take the target and move it left/right and up/down 4 px
        if self.training and np.random.rand() < self.epsilon[1]:
            dy = np.random.randint(-4, 5)
            target[0] = int(max(
                0, min(self.ssize - 1, target[0] +
                       dy)))  # make sure target is within possible pxl range
            dx = np.random.randint(-4, 5)
            target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

        args = []
        # args: A list of the types of args passed to function_type
        for arg in actions.FUNCTIONS[act_id].args:
            if arg.name in ('screen', 'minimap', 'screen2'):
                # x_policy = self.sess.run(
                #     self.argument_policy[str(arg) + "x"],
                #     feed_dict=feed)

                # y_policy = self.sess.run(
                #     self.argument_policy[str(arg) + "y"],
                #     feed_dict=feed)

                # x_policy = np.squeeze(x_policy)
                # x_ids = np.arange(len(x_policy))
                # x = np.random.choice(x_ids, p=x_policy)

                # y_policy = np.squeeze(y_policy)
                # y_ids = np.arange(len(y_policy))
                # y = np.random.choice(y_ids, p=y_policy)
                # args.append([x, y])
                args.append([target[1], target[0]])
                # self.logger.info(f"target coords: {[target[1], target[0]]}")
            else:
                arg_policy = self.sess.run(self.argument_policy[str(arg)],
                                           feed_dict=feed)
                arg_policy = np.squeeze(arg_policy)
                arg_ids = np.arange(len(arg_policy))
                arg_index = np.random.choice(arg_ids, p=arg_policy)
                args.append([arg_index])
                # self.logger.info(f"arg: index: {arg_index}")


        #           args.append([0])
        # sizes: The max+1 of each of the dimensions this argument takes.
        return actions.FunctionCall(
            act_id, args)  #  args should be int from (0, arg.size)
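
Unlike the argmax-based agents above, this one samples the function id: the policy vector is multiplied by the availability mask, renormalized, and passed to np.random.choice. A minimal sketch of that masked sampling, with made-up inputs rather than values from the source:

    import numpy as np

    def sample_valid_action(function_probs, available_ids):
        """Zero out unavailable function ids, renormalize, then sample one id."""
        masked = np.zeros_like(function_probs)
        masked[available_ids] = function_probs[available_ids]
        masked /= masked.sum()                       # renormalize over the available ids only
        return int(np.random.choice(np.arange(len(masked)), p=masked))

    # e.g. sample_valid_action(np.array([0.1, 0.6, 0.3]), [0, 2]) returns 0 or 2, never 1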
Example #17
	def update(self, replay_buffer, learning_rate, step):
		obs = replay_buffer[-1][-1]
		if obs.last():
			reward = 0
		else:
			screen = np.array(obs.observation.feature_screen, dtype=np.float32)
			screen = np.expand_dims(utils.preprocess_screen(screen), axis=0)
			minimap = np.array(obs.observation.feature_minimap, dtype=np.float32)
			minimap = np.expand_dims(utils.preprocess_minimap(minimap), axis=0)
			structured = np.zeros([1, self.structured_dimensions], dtype=np.float32)
			structured[0, obs.observation.available_actions] = 1

			feed_dict = {
				self.screen: screen,
				self.minimap: minimap,
				self.structured: structured
			}
			reward = self.sess.run(self.value, feed_dict=feed_dict)

		#compute targets and masks
		screens, minimaps, structureds = [], [], []
		target_value = np.zeros([len(replay_buffer)], dtype=np.float32)
		target_value[-1] = reward

		valid_non_spatial_action = np.zeros([len(replay_buffer), len(actions.FUNCTIONS)], dtype=np.float32)
		non_spatial_action_selected = np.zeros([len(replay_buffer), len(actions.FUNCTIONS)], dtype=np.float32)
		valid_spatial_action = np.zeros([len(replay_buffer)], dtype=np.float32)
		spatial_action_selected = np.zeros([len(replay_buffer), self.resolution ** 2], dtype=np.float32)

		record_score = replay_buffer[-1][0].observation['score_cumulative'][0]
		summary = tf.Summary()
		summary.value.add(tag='episode_score', simple_value=record_score)
		print('train!! step %d: score = %f' % (step, record_score))
		self.summary_writer.add_summary(summary, step)

		replay_buffer.reverse()
		# reverse() has no return value; it reverses the list's elements in place
		for i, [obs, action, next_obs] in enumerate(replay_buffer):
		# seq = ['one', 'two', 'three']
		# for i, element in enumerate(seq):
		# print i, element
		#	0 one
		#	1 two
		#	2 three
			screen = np.array(obs.observation.feature_screen, dtype=np.float32)
			screen = np.expand_dims(utils.preprocess_screen(screen), axis=0)
			minimap = np.array(obs.observation.feature_minimap, dtype=np.float32)
			minimap = np.expand_dims(utils.preprocess_minimap(minimap), axis=0)
			structured = np.zeros([1, self.structured_dimensions], dtype=np.float32)
			structured[0, obs.observation.available_actions] = 1

			screens.append(screen)
			minimaps.append(minimap)
			structureds.append(structured)

			reward = obs.reward
			action_id = action.function
			action_args = action.arguments

			target_value[i] = reward + self.discount * target_value[i - 1]

			available_actions = obs.observation.available_actions
			valid_non_spatial_action[i, available_actions] = 1
			non_spatial_action_selected[i, action_id] = 1

			args = actions.FUNCTIONS[action_id].args
			for arg, action_arg in zip(args, action_args):
				if arg.name in ('screen', 'minimap', 'screen2'):
					spatial_action = action_arg[1] * self.resolution + action_arg[0]
					valid_spatial_action[i] = 1
					spatial_action_selected[i, spatial_action] = 1

		screens = np.concatenate(screens, axis=0)
		minimaps = np.concatenate(minimaps, axis=0)
		structureds = np.concatenate(structureds, axis=0)

		feed_dict = {
			self.screen: screens,
			self.minimap: minimaps,
			self.structured: structureds,
			self.target_value: target_value,
			self.valid_non_spatial_action: valid_non_spatial_action,
			self.non_spatial_action_selected: non_spatial_action_selected,
			self.valid_spatial_action: valid_spatial_action,
			self.spatial_action_selected: spatial_action_selected,
			self.learning_rate: learning_rate
		}
		_, summary = self.sess.run([self.train_op, self.summary_op], feed_dict=feed_dict)
		self.summary_writer.add_summary(summary, step)
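
The update above walks the reversed replay buffer and accumulates target_value[i] = reward + discount * target_value[i - 1], bootstrapping from the value of the final observation. A small self-contained sketch of that recursion on toy numbers (names and values are illustrative only):

import numpy as np

def discounted_targets(rewards, bootstrap_value, discount=0.99):
    # Mirrors the reversed-buffer loop: after reversing, the newest transition comes
    # first, and each target adds the discounted target of the previous (later-in-time)
    # entry; index -1 initially holds the bootstrap value.
    targets = np.zeros(len(rewards), dtype=np.float32)
    targets[-1] = bootstrap_value
    for i, r in enumerate(reversed(rewards)):
        targets[i] = r + discount * targets[i - 1]
    return targets  # ordered newest-first, like the reversed buffer

# Toy episode: rewards 0, 0, 1 (chronological) and a bootstrap value of 0.5.
print(discounted_targets([0.0, 0.0, 1.0], bootstrap_value=0.5))
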
Ejemplo n.º 18
0
    def step(self, obs, num_frames, global_episodes=-1):
        global script_step_cnt
        if num_frames == 1:
            script_step_cnt = 0
        cheater_rand = np.random.random()
        #print(num_frames, global_episodes)

        # With a probability that decays over episodes, act as the scripted agent, similar to supervised learning
        if FLAGS.map == 'MoveToBeacon' and FLAGS.teaching \
          and cheater_rand <= 0.5**(global_episodes/50) and self.training:

            print("Teaching at frame No. {}.".format(num_frames))
            FUNCTIONS = actions.FUNCTIONS
            if FUNCTIONS.Move_screen.id in obs.observation.available_actions:
                player_relative = obs.observation.feature_screen.player_relative
                beacon = _xy_locs(player_relative == _PLAYER_NEUTRAL)
                if not beacon:
                    return FUNCTIONS.no_op(), True
                beacon_center = np.mean(beacon, axis=0).round()
                return FUNCTIONS.Move_screen("now", beacon_center), True
            else:
                return FUNCTIONS.select_army("select"), True

        # Scripted teacher for the CollectMineralsAndGas task
        if FLAGS.map == 'CollectMineralsAndGas' and FLAGS.teaching \
          and cheater_rand <= 0.5**(global_episodes/50) and self.training:

            #print("CMAG teaching at frame No. {}.".format(num_frames))
            return self.TeacherCollectMineralsAndGas(obs), True

        # Otherwise take an epsilon-greedy step
        minimap = np.array(obs.observation['feature_minimap'],
                           dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['feature_screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        # TODO: only use available actions
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1

        feed = {self.minimap: minimap, self.screen: screen, self.info: info}
        non_spatial_action, spatial_action = self.sess.run(
            [self.non_spatial_action, self.spatial_action], feed_dict=feed)

        # Select an action and a spatial target
        non_spatial_action = non_spatial_action.ravel()
        spatial_action = spatial_action.ravel()
        valid_actions = obs.observation['available_actions']
        act_id = valid_actions[np.argmax(non_spatial_action[valid_actions])]
        target = np.argmax(spatial_action)
        target = [int(target // self.ssize), int(target % self.ssize)]

        # A small hand-crafted rule added for a MoveToBeacon training weakness:
        # the model tends to drift toward the corner where x and y are smallest and can
        # occasionally get stuck, so nudge x and y up slightly.
        if FLAGS.map == 'MoveToBeacon' and not self.training:
            target[0] += 1
            target[1] += 1

        if False:
            print(actions.FUNCTIONS[act_id].name, target)

        # Epsilon greedy exploration
        if self.training and np.random.rand() < self.epsilon[0]:
            act_id = np.random.choice(valid_actions)
        if self.training and np.random.rand() < self.epsilon[1]:
            dy = np.random.randint(-4, 5)
            target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
            dx = np.random.randint(-4, 5)
            target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

        # Set act_id and act_args
        act_args = []
        for arg in actions.FUNCTIONS[act_id].args:
            if arg.name in ('screen', 'minimap', 'screen2'):
                act_args.append([target[1], target[0]])
            else:
                act_args.append([0])  # TODO: Be careful
        return actions.FunctionCall(act_id, act_args), False
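
The teaching branch above hands control to the scripted agent with probability 0.5 ** (global_episodes / 50), so early episodes are almost always scripted and the probability halves every 50 episodes. A quick sketch of that schedule (purely illustrative):

def teaching_probability(global_episodes, half_life=50):
    # Probability of using the scripted teacher instead of the learned policy:
    # starts at 1.0 and halves every `half_life` episodes.
    return 0.5 ** (global_episodes / half_life)

for ep in (0, 50, 100, 200):
    print(ep, round(teaching_probability(ep), 4))  # 1.0, 0.5, 0.25, 0.0625
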
Ejemplo n.º 19
0
    def step(self, obs):
        super(RandomAgent, self).step(obs)
        self.randomOrgreedy = False
        feature_screen = np.expand_dims(preprocess_screen(
            obs.observation.feature_screen),
                                        axis=0)
        feature_map = np.expand_dims(preprocess_minimap(
            obs.observation.feature_minimap),
                                     axis=0)
        info = np.zeros([1, self.action_size], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1
        feed_dict = {
            self.minimap: feature_map,
            self.screen: feature_screen,
            self.info: info
        }
        non_spatial_action, spatial_action = self.sess.run(
            [self.non_spatial_action, self.spatial_action],
            feed_dict=feed_dict)
        non_spatial_action = non_spatial_action.ravel()
        spatial_action = spatial_action.ravel()  #output shape 4096
        target = np.argmax(spatial_action)
        target = [
            int(target // self.minimap_size),
            int(target % self.minimap_size)
        ]
        valid_actions = obs.observation.available_actions
        act_id = valid_actions[np.argmax(non_spatial_action[valid_actions])]

        # print("available actions = " + str(obs.observation.available_actions))
        # function_id = numpy.random.choice(obs.observation.available_actions)
        # function_id = 1
        # print("function_id = " + str(function_id))
        # print("observation_spec " + str(self.obs_spec))
        # print("action_spec" + str((self.action_spec.functions)))
        # args = [[numpy.random.randint(0, size) for size in arg.sizes]
        # for arg in self.action_spec.functions[function_id].args]
        # print("function args = " + str(self.action_spec.functions[function_id].args))
        # for id in obs.observation.available_actions:
        #     for arg in self.action_spec.functions[id].args:
        #         ctr = 0
        #         for size in arg.sizes:
        #             ctr +=1
        #         if(ctr>2):
        #             print("function_id = " + str(id))

        if np.random.rand() < self.epsilon[0]:
            act_id = np.random.choice(valid_actions)
            self.randomOrgreedy = True
        if np.random.rand() < self.epsilon[1]:
            dy = np.random.randint(-4, 5)
            target[0] = int(max(0, min(self.screen_size - 1, target[0] + dy)))
            dx = np.random.randint(-4, 5)
            target[1] = int(max(0, min(self.screen_size - 1, target[1] + dx)))
        act_args = []
        for arg in self.action_spec.functions[act_id].args:
            if arg.name in ('screen', 'minimap', 'screen2'):
                act_args.append([target[1], target[0]])
            else:
                act_args.append([0])  # TODO: Be careful
        if (act_id != self.temp_act_id):
            self.temp_act_id = act_id
            if (self.randomOrgreedy):
                print("RANDOM")
            print("action " + str(actions.FUNCTIONS[act_id].name))
            print("target" + str(target))
        # print("args = " + str(args))
        # print("\n\n\n")
        return actions.FunctionCall(act_id, act_args)
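
Several of these agents share the same two-part epsilon-greedy exploration: with probability epsilon[0] the chosen function id is replaced by a random available one, and with probability epsilon[1] the spatial target is jittered by up to +/-4 pixels and clipped to the screen. A compact, self-contained version of that logic (the epsilon values and screen size are example numbers, not the originals):

import numpy as np

def explore(act_id, target, valid_actions, epsilon=(0.05, 0.2), screen_size=64):
    # Two independent exploration steps, as in the step() methods above.
    if np.random.rand() < epsilon[0]:
        act_id = int(np.random.choice(valid_actions))  # random valid function id
    if np.random.rand() < epsilon[1]:
        dy, dx = np.random.randint(-4, 5), np.random.randint(-4, 5)
        target = [int(max(0, min(screen_size - 1, target[0] + dy))),
                  int(max(0, min(screen_size - 1, target[1] + dx)))]
    return act_id, target

print(explore(331, [10, 20], valid_actions=[0, 1, 7, 331]))
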
Ejemplo n.º 20
0
    def step(self, obs):

        minimap = np.array(obs.observation['minimap'], dtype=np.float32)
        minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
        screen = np.array(obs.observation['screen'], dtype=np.float32)
        screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
        # TODO: only use available actions
        info = np.zeros([1, self.isize], dtype=np.float32)
        info[0, obs.observation['available_actions']] = 1
        #print('info = ',info)

        feed = {self.minimap: minimap, self.screen: screen, self.info: info}
        non_spatial_action, spatial_action = self.sess.run(
            [self.non_spatial_action, self.spatial_action], feed_dict=feed)

        # Select an action and a spatial target
        non_spatial_action = non_spatial_action.ravel()
        spatial_action = spatial_action.ravel()
        valid_actions = obs.observation['available_actions']

        #print('valid_actions = ',valid_actions)
        #print('self.less_actions = ',self.less_actions)

        # Find, for each element of valid_actions, its index in self.less_actions
        valid_actions_idx = []
        for i in range(len(valid_actions)):
            for j in range(len(self.less_actions)):
                if (self.less_actions[j] == valid_actions[i]):
                    valid_actions_idx.append(j)
        #valid_actions_idx = np.sort(valid_actions_idx)
        # np.argmax returns a position within the valid subset, so map it back
        # through valid_actions_idx before indexing self.less_actions
        best = int(np.argmax(non_spatial_action[valid_actions_idx]))
        act_id = int(self.less_actions[valid_actions_idx[best]])

        #print('valid_actions_idx = ',valid_actions_idx)
        #print('np.argmax(non_spatial_action[valid_actions_idx]) = ', np.argmax(non_spatial_action[valid_actions_idx]))

        #print('act_id = ',act_id)
        target = np.argmax(spatial_action)
        target = [int(target // self.ssize), int(target % self.ssize)]

        #if False:
        #      print(actions.FUNCTIONS[act_id].name, target)

        # Epsilon greedy exploration
        if self.training and np.random.rand() < self.epsilon[0]:
            act_id = np.random.choice(valid_actions)
        if self.training and np.random.rand() < self.epsilon[1]:
            dy = np.random.randint(-4, 5)
            target[0] = int(max(0, min(self.ssize - 1, target[0] + dy)))
            dx = np.random.randint(-4, 5)
            target[1] = int(max(0, min(self.ssize - 1, target[1] + dx)))

        # Set act_id and act_args
        act_args = []
        for arg in actions.FUNCTIONS[act_id].args:
            if arg.name in ('screen', 'minimap', 'screen2'):
                act_args.append([target[1], target[0]])
            else:
                act_args.append([0])  # TODO: Be careful
        if (not act_id in valid_actions):
            return actions.FunctionCall(_NOOP, [])

        return actions.FunctionCall(act_id, act_args)
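
When the policy head only covers a reduced action list (self.less_actions above), np.argmax over the masked probabilities returns a position inside the valid subset, which must be mapped back through that subset before indexing the reduced list. A minimal sketch of that mapping (the action ids and probabilities are made up):

import numpy as np

less_actions = [0, 2, 7, 331, 451]              # reduced action space (function ids)
policy = np.array([0.1, 0.4, 0.05, 0.3, 0.15])  # policy over the reduced space

valid_actions = [7, 331]                                    # currently available ids
valid_idx = [less_actions.index(a) for a in valid_actions]  # -> [2, 3]

best = int(np.argmax(policy[valid_idx]))  # position *within* valid_idx
act_id = less_actions[valid_idx[best]]    # map back before indexing
assert act_id == 331                      # policy[3] = 0.3 beats policy[2] = 0.05
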
Ejemplo n.º 21
0
  def update(self, rbs, disc, lr, cter):
    # Compute R, which is value of the last observation
    spatial_action = None
    non_spatial_action = None

    obs = rbs[-1][-1]
    if obs.last():
      R = 0
    else:
      minimap = np.array(obs.observation.feature_minimap, dtype=np.float32)
      minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
      screen = np.array(obs.observation.feature_screen, dtype=np.float32)
      screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
      info = np.zeros([1, self.isize], dtype=np.float32)
      info[0, obs.observation.available_actions] = 1

      # First get the policy probabilities for each action, then greedily pick the largest
      # to estimate a bootstrap value for the last observation. If that estimate is not
      # trusted, use full episodes so the final R can simply be 0.
      feed = {self.minimap: minimap,
              self.screen: screen,
              self.info: info}
      spatial_action, non_spatial_action = self.sess.run([self.spatial_action, self.non_spatial_action], feed_dict=feed)

    # Compute targets and masks
    minimaps = []
    screens = []
    infos = []
    valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)
    spatial_action_selected = np.zeros([len(rbs), self.ssize**2], dtype=np.float32)
    valid_non_spatial_action = np.zeros([len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)
    non_spatial_action_selected = np.zeros([len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)

    rbs.reverse()
    for i, [obs, action, next_obs] in enumerate(rbs):
      minimap = np.array(obs.observation.feature_minimap, dtype=np.float32)
      minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
      screen = np.array(obs.observation.feature_screen, dtype=np.float32)
      screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
      info = np.zeros([1, self.isize], dtype=np.float32)
      #info[0, obs.observation['available_actions']] = 1
      info[0, obs.observation.available_actions] = 1

      minimaps.append(minimap)
      screens.append(screen)
      infos.append(info)

      act_id = action.function
      act_args = action.arguments

      #valid_actions = obs.observation["available_actions"]
      valid_actions = obs.observation.available_actions
      valid_non_spatial_action[i, valid_actions] = 1
      non_spatial_action_selected[i, act_id] = 1

      args = actions.FUNCTIONS[act_id].args
      for arg, act_arg in zip(args, act_args):
        if arg.name in ('screen', 'minimap', 'screen2'):
          ind = act_arg[1] * self.ssize + act_arg[0]
          valid_spatial_action[i] = 1
          spatial_action_selected[i, ind] = 1


    value_target = np.zeros([len(rbs)], dtype=np.float32)
    if spatial_action is not None:
      q_spatial = np.max(spatial_action * valid_spatial_action[0], axis=1)
      q_non_spatial = np.max(non_spatial_action * valid_non_spatial_action[0], axis=1)
      q_value = self.ispatial*q_spatial + q_non_spatial
      R = q_value[0]
      
    value_target[-1] = R

    for i, [obs, action, next_obs] in enumerate(rbs):
      reward = obs.reward
      value_target[i] = reward + disc * value_target[i-1]

    minimaps = np.concatenate(minimaps, axis=0)
    screens = np.concatenate(screens, axis=0)
    infos = np.concatenate(infos, axis=0)

    # Train
    feed = {self.minimap: minimaps,
            self.screen: screens,
            self.info: infos,
            self.value_target: value_target,
            self.valid_spatial_action: valid_spatial_action,
            self.spatial_action_selected: spatial_action_selected,
            self.valid_non_spatial_action: valid_non_spatial_action,
            self.non_spatial_action_selected: non_spatial_action_selected,
            self.learning_rate: lr}
    _, summary = self.sess.run([self.train_op, self.summary_op], feed_dict=feed)
    self.summary_writer.add_summary(summary, cter)
Ejemplo n.º 22
0
    def update(self, rbs, disc, lr, cter):
        # Compute R, which is value of the last observation
        obs = rbs[-1][-1]
        if obs.last():
            # obs[3]['score_cumulative'][0] or obs.reward
            R = obs[3]['score_cumulative'][0]

            # enums from https://github.com/Blizzard/s2client-api/blob/master/include/sc2api/sc2_typeenums.h
            _TERRAN_BARRACKS = 21
            _TERRAN_MARINE = 48
            _UNIT_TYPE = features.SCREEN_FEATURES.unit_type.index
            unit_type = obs.observation['feature_screen'][_UNIT_TYPE]

            barracks_y, barracks_x = (unit_type == _TERRAN_BARRACKS).nonzero()

            if barracks_x.any():
                print('Barracks detected')
                R += 1

            print('Episode reward: {}'.format(R))

        else:
            minimap = np.array(obs.observation['feature_minimap'],
                               dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['feature_screen'],
                              dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            feed = {
                self.minimap: minimap,
                self.screen: screen,
                self.info: info
            }
            R = self.sess.run(self.value, feed_dict=feed)[0]

        # Compute targets and masks
        minimaps = []
        screens = []
        infos = []

        value_target = np.zeros([len(rbs)], dtype=np.float32)
        value_target[-1] = R

        valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)
        spatial_action_selected = np.zeros([len(rbs), self.ssize**2],
                                           dtype=np.float32)
        valid_non_spatial_action = np.zeros(
            [len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)
        non_spatial_action_selected = np.zeros(
            [len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)

        rbs.reverse()
        for i, [obs, action, next_obs] in enumerate(rbs):
            minimap = np.array(obs.observation['feature_minimap'],
                               dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['feature_screen'],
                              dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            minimaps.append(minimap)
            screens.append(screen)
            infos.append(info)

            #reward = obs.reward
            reward = 0.25 * (next_obs.observation['score_cumulative'][0] -
                             obs.observation['score_cumulative'][0])
            act_id = action.function
            act_args = action.arguments

            value_target[i] = reward + disc * value_target[i - 1]

            valid_actions = obs.observation["available_actions"]
            valid_non_spatial_action[i, valid_actions] = 1
            non_spatial_action_selected[i, act_id] = 1

            args = actions.FUNCTIONS[act_id].args
            for arg, act_arg in zip(args, act_args):
                if arg.name in ('screen', 'minimap', 'screen2'):
                    ind = act_arg[1] * self.ssize + act_arg[0]
                    valid_spatial_action[i] = 1
                    spatial_action_selected[i, ind] = 1

        minimaps = np.concatenate(minimaps, axis=0)
        screens = np.concatenate(screens, axis=0)
        infos = np.concatenate(infos, axis=0)

        data = [minimaps, screens, infos, value_target,
                valid_spatial_action, spatial_action_selected,
                valid_non_spatial_action, non_spatial_action_selected]
        for (minimaps, screens, infos, value_target, valid_spatial_action, spatial_action_selected,
             valid_non_spatial_action, non_spatial_action_selected) \
                in zip(*[self.batch(mask, BATCH_SIZE) for mask in data]):

            # Train in batches

            feed = {
                self.minimap: minimaps,
                self.screen: screens,
                self.info: infos,
                self.value_target: value_target,
                self.valid_spatial_action: valid_spatial_action,
                self.spatial_action_selected: spatial_action_selected,
                self.valid_non_spatial_action: valid_non_spatial_action,
                self.non_spatial_action_selected: non_spatial_action_selected,
                self.learning_rate: lr
            }
            #print('Committing {} replay samples'.format(len(minimaps)))
            _, summary = self.sess.run([self.train_op, self.summary_op],
                                       feed_dict=feed)
            self.summary_writer.add_summary(summary, cter)
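
The batching loop above relies on a self.batch(data, BATCH_SIZE) helper that is not shown in this snippet; presumably it slices each array into aligned chunks along the first axis. A plausible standalone sketch of such a helper (the name and behaviour are assumptions, not the original implementation):

import numpy as np

def batch(data, batch_size):
    # Yield consecutive slices of `data` along axis 0, each at most `batch_size` long.
    # Applying this to several arrays of the same length keeps their rows aligned.
    for start in range(0, len(data), batch_size):
        yield data[start:start + batch_size]

values = np.arange(10, dtype=np.float32)
masks = np.arange(10) % 2
for v, m in zip(batch(values, 4), batch(masks, 4)):
    print(v, m)  # chunks of 4, 4 and 2 rows, still aligned
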
Ejemplo n.º 23
0
  def update(self, rbs, disc, lr, cter):
    # Compute R, which is value of the last observation
    obs = rbs[-1][-1]
    if obs.last():
      R = 0
    else:
      minimap = np.array(obs.observation['minimap'], dtype=np.float32)
      minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
      screen = np.array(obs.observation['screen'], dtype=np.float32)
      screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
      info = np.zeros([1, self.isize], dtype=np.float32)
      info[0, obs.observation['available_actions']] = 1

      feed = {self.minimap: minimap,
              self.screen: screen,
              self.info: info}
      R = self.sess.run(self.value, feed_dict=feed)[0]

    # Compute targets and masks
    minimaps = []
    screens = []
    infos = []

    value_target = np.zeros([len(rbs)], dtype=np.float32)
    value_target[-1] = R

    valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)
    spatial_action_selected = np.zeros([len(rbs), self.ssize**2], dtype=np.float32)
    valid_non_spatial_action = np.zeros([len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)
    non_spatial_action_selected = np.zeros([len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)

    rbs.reverse()
    for i, [obs, action, next_obs] in enumerate(rbs):
      minimap = np.array(obs.observation['minimap'], dtype=np.float32)
      minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
      screen = np.array(obs.observation['screen'], dtype=np.float32)
      screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
      info = np.zeros([1, self.isize], dtype=np.float32)
      info[0, obs.observation['available_actions']] = 1

      minimaps.append(minimap)
      screens.append(screen)
      infos.append(info)

      reward = obs.reward
      act_id = action.function
      act_args = action.arguments

      value_target[i] = reward + disc * value_target[i-1]

      valid_actions = obs.observation["available_actions"]
      valid_non_spatial_action[i, valid_actions] = 1
      non_spatial_action_selected[i, act_id] = 1

      args = actions.FUNCTIONS[act_id].args
      for arg, act_arg in zip(args, act_args):
        if arg.name in ('screen', 'minimap', 'screen2'):
          ind = act_arg[1] * self.ssize + act_arg[0]
          valid_spatial_action[i] = 1
          spatial_action_selected[i, ind] = 1

    minimaps = np.concatenate(minimaps, axis=0)
    screens = np.concatenate(screens, axis=0)
    infos = np.concatenate(infos, axis=0)

    # Train
    feed = {self.minimap: minimaps,
            self.screen: screens,
            self.info: infos,
            self.value_target: value_target,
            self.valid_spatial_action: valid_spatial_action,
            self.spatial_action_selected: spatial_action_selected,
            self.valid_non_spatial_action: valid_non_spatial_action,
            self.non_spatial_action_selected: non_spatial_action_selected,
            self.learning_rate: lr}
    _, summary = self.sess.run([self.train_op, self.summary_op], feed_dict=feed)
    self.summary_writer.add_summary(summary, cter)
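
The valid/selected masks built in these update functions are what turn the flat policy outputs into a log-probability of the executed action inside the loss. As a rough, hedged illustration of how such masks are typically combined (plain NumPy, not the original TensorFlow graph):

import numpy as np

def masked_log_prob(policy, selected, valid, eps=1e-10):
    # Log-probability of the selected action under a policy restricted to valid ids;
    # `selected` and `valid` are 0/1 masks with the same length as `policy`.
    masked = policy * valid
    masked = masked / max(masked.sum(), eps)  # renormalise over available actions
    return float(np.log(np.sum(masked * selected) + eps))

policy = np.array([0.2, 0.5, 0.3])
valid = np.array([1.0, 0.0, 1.0])     # action 1 unavailable
selected = np.array([0.0, 0.0, 1.0])  # action 2 was executed
advantage = 1.5                       # e.g. value_target minus the value estimate
policy_loss = -masked_log_prob(policy, selected, valid) * advantage
print(policy_loss)
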
Ejemplo n.º 24
0
    def update(self, rbs, replay_buffer, disc, lr, cter):
        # Compute R, which is value of the last observation
        buffer_size = len(replay_buffer)
        obs = rbs[-1][-1]
        if obs.last():
            R = 0
        else:
            minimap = np.array(obs.observation['minimap'], dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['screen'], dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            feed = {
                self.minimap: minimap,
                self.screen: screen,
                self.info: info
            }
            R = self.sess.run(self.value, feed_dict=feed)[0]

        # Compute targets and masks
        minimaps = []
        screens = []
        infos = []

        value_target = np.zeros([len(rbs)], dtype=np.float32)
        value_target[-1] = R

        valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)
        spatial_action_selected = np.zeros([len(rbs), self.ssize**2],
                                           dtype=np.float32)
        valid_non_spatial_action = np.zeros(
            [len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)
        non_spatial_action_selected = np.zeros(
            [len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)

        rbs.reverse()
        for i, [obs, action, pixel_change, next_obs] in enumerate(rbs):
            # added pixel change to update function, just directly put it into the feed dict
            minimap = np.array(obs.observation['minimap'], dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['screen'], dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            minimaps.append(minimap)
            screens.append(screen)
            infos.append(info)

            reward = obs.reward
            act_id = action.function
            act_args = action.arguments

            value_target[i] = reward + disc * value_target[i - 1]

            valid_actions = obs.observation["available_actions"]
            valid_non_spatial_action[i, valid_actions] = 1
            non_spatial_action_selected[i, act_id] = 1

            args = actions.FUNCTIONS[act_id].args
            for arg, act_arg in zip(args, act_args):
                if arg.name in ('screen', 'minimap', 'screen2'):
                    ind = act_arg[1] * self.ssize + act_arg[0]
                    valid_spatial_action[i] = 1
                    spatial_action_selected[i, ind] = 1

        minimaps = np.concatenate(minimaps, axis=0)
        screens = np.concatenate(screens, axis=0)
        infos = np.concatenate(infos, axis=0)

        # Train
        feed = {
            self.minimap: minimaps,
            self.screen: screens,
            self.pixel_change: pixel_change,
            self.info: infos,
            self.value_target: value_target,
            self.valid_spatial_action: valid_spatial_action,
            self.spatial_action_selected: spatial_action_selected,
            self.valid_non_spatial_action: valid_non_spatial_action,
            self.non_spatial_action_selected: non_spatial_action_selected,
            self.learning_rate: lr
        }
        # _, summary = self.sess.run([self.train_op, self.summary_op], feed_dict=feed)
        # self.summary_writer.add_summary(summary, cter)
        ######################################################################################
        # Update the pc network
        start_pos = np.random.randint(0, buffer_size - self.sequence_size - 1)
        #take care of terminals
        if replay_buffer[start_pos][-1].last():
            start_pos += 1
            # Assuming that there are no successive terminal frames.

        pc_experience_frames = []

        for i in range(self.sequence_size + 1):
            frame = replay_buffer[start_pos + i]
            pc_experience_frames.append(frame)
            if frame[-1].last():
                break
        # Reverse sequence to calculate from the last
        pc_experience_frames.reverse()

        batch_pc_si = []
        batch_pc_a = []
        batch_pc_R = []
        batch_pc_va = []
        pc_R = np.zeros([20, 20], dtype=np.float32)
        if not pc_experience_frames[1].last():
            # pc_R = self.run_pc_q_max(self.sess, pc_experience_frames[0].state)
            # def run_pc_q_max(self, sess, s_t):
            # Bootstrap pc_R from the observation of the newest frame in the sampled
            # sequence (pc_experience_frames[0]).
            boot_obs = pc_experience_frames[0][0]
            minimap = np.array(boot_obs.observation['minimap'], dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(boot_obs.observation['screen'], dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            # TODO: only use available actions
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, boot_obs.observation['available_actions']] = 1
            s_feed = {
                self.pc_minimap: minimap,
                self.pc_screen: screen,
                self.pc_info: info
            }
            pc_R = self.sess.run(self.pc_q_max, s_feed)
        pc_valid_non_spatial_action = np.zeros(
            [len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)
        for i, [obs, action, pixel_change,
                next_obs] in enumerate(pc_experience_frames[1:]):
            pc_R = pixel_change + self.gamma_pc * pc_R
            a = np.zeros([self.action_size])
            a[action] = 1.0
            valid_actions = np.zeros((len(actions.FUNCTIONS)),
                                     dtype=np.float32)
            valid_actions_inds = obs.observation["available_actions"]
            valid_actions[valid_actions_inds] = 1
            batch_pc_si.append(frame.state)
            batch_pc_a.append(a)
            batch_pc_R.append(pc_R)
            batch_pc_va.append(valid_actions)

        batch_pc_si.reverse()
        batch_pc_a.reverse()
        batch_pc_R.reverse()
        batch_pc_va.reverse()
        pc_feed_dict = {
            self.pc_input: batch_pc_si,
            self.pc_a: batch_pc_a,
            self.pc_r: batch_pc_R,
            self.pc_valid_non_spatial_action: batch_pc_va
        }
        feed.update(pc_feed_dict)
        _, summary = self.sess.run([self.train_op, self.summary_op],
                                   feed_dict=feed)
        self.summary_writer.add_summary(summary, cter)
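
The auxiliary branch above accumulates a discounted pixel-change return pc_R = pixel_change + gamma_pc * pc_R while walking a reversed frame sequence, in the spirit of the UNREAL pixel-control task. A toy sketch of that accumulation with scalar pixel changes (the real code works on 20x20 maps; the values and the 0.9 discount here are illustrative):

def pixel_change_returns(pixel_changes, bootstrap, gamma_pc=0.9):
    # Discounted pixel-change targets, given the changes newest frame first
    # (as after reverse()); returned in chronological order.
    pc_R = bootstrap
    returns = []
    for pc in pixel_changes:  # newest -> oldest
        pc_R = pc + gamma_pc * pc_R
        returns.append(pc_R)
    return list(reversed(returns))

# Three frames with pixel-change magnitudes 0.3, 0.0, 0.1 (newest first), bootstrap 0.2.
print(pixel_change_returns([0.3, 0.0, 0.1], bootstrap=0.2))
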
Ejemplo n.º 25
0
    def update(self, rbs, disc, lr, cter):
        # Compute R, which is value of the last observation

        obs = rbs[-1][-1]

        # Print out score on a test run through a full episode, don't update network on test run
        if self.test_run and obs.last():
            self.test_scores.append(obs.observation['score_cumulative'][0])
            # print("TEST SCORE: " + str(self.test_scores[-1]))

            return
        else:
            train_score = obs.observation['score_cumulative'][0]

        logger.info('Total game steps: %s', self.count_steps)
        self.count_steps += len(rbs)

        if obs.last():
            R = 0
        else:
            minimap = np.array(obs.observation['minimap'], dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['screen'], dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            action_indices = [
                0, 1, 2, 3, 5, 7, 12, 13, 14, 15, 18, 19, 20, 261, 274, 331,
                332, 333, 334, 451, 452, 453, 456
            ]
            valid_actions = list(
                set(obs.observation['available_actions'])
                & set(action_indices))
            valid_actions_indices = [
                action_indices.index(i) for i in valid_actions
            ]
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, valid_actions_indices] = 1

            feed = {
                self.minimap: minimap,
                self.screen: screen,
                self.info: info
            }
            R = self.sess.run(self.value, feed_dict=feed)[0]

        # Compute targets and masks
        minimaps = []
        screens = []
        infos = []

        value_target = np.zeros([len(rbs)], dtype=np.float32)
        value_target[-1] = R

        valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)
        spatial_action_selected = np.zeros([len(rbs), self.ssize**2],
                                           dtype=np.float32)
        valid_non_spatial_action = np.zeros([len(rbs), self.isize],
                                            dtype=np.float32)
        non_spatial_action_selected = np.zeros([len(rbs), self.isize],
                                               dtype=np.float32)

        rbs.reverse()
        for i, [obs, action, next_obs] in enumerate(rbs):
            minimap = np.array(obs.observation['minimap'], dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['screen'], dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            action_indices = [
                0, 1, 2, 3, 5, 7, 12, 13, 14, 15, 18, 19, 20, 261, 274, 331,
                332, 333, 334, 451, 452, 453, 456
            ]
            valid_actions = list(
                set(obs.observation['available_actions'])
                & set(action_indices))
            valid_actions_indices = [
                action_indices.index(i) for i in valid_actions
            ]
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, valid_actions_indices] = 1

            minimaps.append(minimap)
            screens.append(screen)
            infos.append(info)

            reward = obs.reward
            act_id = action.function
            act_args = action.arguments

            value_target[i] = reward + disc * value_target[i - 1]

            # valid_actions = obs.observation["available_actions"]
            valid_non_spatial_action[i, valid_actions_indices] = 1
            non_spatial_action_selected[i, action_indices.index(act_id)] = 1

            args = actions.FUNCTIONS[act_id].args
            for arg, act_arg in zip(args, act_args):
                if arg.name in ('screen', 'minimap', 'screen2'):
                    ind = act_arg[1] * self.ssize + act_arg[0]
                    valid_spatial_action[i] = 1
                    spatial_action_selected[i, ind] = 1

        minimaps = np.concatenate(minimaps, axis=0)
        screens = np.concatenate(screens, axis=0)
        infos = np.concatenate(infos, axis=0)

        # Train
        feed = {
            self.minimap: minimaps,
            self.screen: screens,
            self.info: infos,
            self.value_target: value_target,
            self.valid_spatial_action: valid_spatial_action,
            self.spatial_action_selected: spatial_action_selected,
            self.valid_non_spatial_action: valid_non_spatial_action,
            self.non_spatial_action_selected: non_spatial_action_selected,
            self.learning_rate: lr,
            self.train_score: train_score
        }
        _, summary = self.sess.run([self.train_op, self.summary_op],
                                   feed_dict=feed)
        self.summary_writer.add_summary(summary, cter)
Ejemplo n.º 26
0
    def update(self, rbs, disc, lr, cter):
        # Compute R, which is value of the last observation
        obs = rbs[-1][-1]
        if obs.last():
            R = 0
        else:
            minimap = np.array(obs.observation['feature_minimap'],
                               dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['feature_screen'],
                              dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            feed = {
                self.minimap: minimap,
                self.screen: screen,
                self.info: info
            }
            R = self.sess.run(self.value, feed_dict=feed)[0]

        # Compute targets and masks
        minimaps = []
        screens = []
        infos = []

        value_target = np.zeros([len(rbs)], dtype=np.float32)
        value_target[-1] = R

        valid_spatial_action = np.zeros([len(rbs)], dtype=np.float32)
        spatial_action_selected = np.zeros([len(rbs), self.ssize**2],
                                           dtype=np.float32)
        valid_non_spatial_action = np.zeros(
            [len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)
        non_spatial_action_selected = np.zeros(
            [len(rbs), len(actions.FUNCTIONS)], dtype=np.float32)

        rbs.reverse()
        for i, [obs, action, next_obs] in enumerate(rbs):
            minimap = np.array(obs.observation['feature_minimap'],
                               dtype=np.float32)
            minimap = np.expand_dims(U.preprocess_minimap(minimap), axis=0)
            screen = np.array(obs.observation['feature_screen'],
                              dtype=np.float32)
            screen = np.expand_dims(U.preprocess_screen(screen), axis=0)
            info = np.zeros([1, self.isize], dtype=np.float32)
            info[0, obs.observation['available_actions']] = 1

            minimaps.append(minimap)
            screens.append(screen)
            infos.append(info)

            self.reward = obs.reward
            act_id = action.function
            act_args = action.arguments

            value_target[i] = self.reward + disc * value_target[i - 1]

            valid_actions = obs.observation["available_actions"]
            valid_non_spatial_action[i, valid_actions] = 1
            non_spatial_action_selected[i, act_id] = 1

            args = actions.FUNCTIONS[act_id].args
            for arg, act_arg in zip(args, act_args):
                if arg.name in ('screen', 'minimap', 'screen2'):
                    ind = act_arg[1] * self.ssize + act_arg[0]
                    valid_spatial_action[i] = 1
                    spatial_action_selected[i, ind] = 1

        minimaps = np.concatenate(minimaps, axis=0)
        screens = np.concatenate(screens, axis=0)
        infos = np.concatenate(infos, axis=0)
        # Train
        feed = {
            self.minimap: minimaps,
            self.screen: screens,
            self.info: infos,
            self.value_target: value_target,
            self.valid_spatial_action: valid_spatial_action,
            self.spatial_action_selected: spatial_action_selected,
            self.valid_non_spatial_action: valid_non_spatial_action,
            self.non_spatial_action_selected: non_spatial_action_selected,
            self.learning_rate: lr,
            self.score: self.reward
        }  # will this work?
        _, summary = self.sess.run([self.train_op, self.summary_op],
                                   feed_dict=feed)
        self.summary_writer.add_summary(summary, cter)
Ejemplo n.º 27
0
	def step(self, obs):
		screen = np.array(obs.observation.feature_screen, dtype=np.float32)
		screen = np.expand_dims(utils.preprocess_screen(screen), axis=0)
		# np.expand_dims: expands the shape of an array by inserting a new axis, e.g.
		# x = np.array([1, 2])            # x.shape == (2,)
		# y = np.expand_dims(x, axis=0)   # y == array([[1, 2]]), y.shape == (1, 2)
		minimap = np.array(obs.observation.feature_minimap, dtype=np.float32)
		minimap = np.expand_dims(utils.preprocess_minimap(minimap), axis=0)
		structured = np.zeros([1, self.structured_dimensions], dtype=np.float32)
		structured[0, obs.observation.available_actions] = 1

		feed_dict = {
			self.screen: screen,
			self.minimap: minimap,
			self.structured: structured
		}
		
		non_spatial_action, spatial_action = self.sess.run(
			[self.non_spatial_action, self.spatial_action],
			feed_dict=feed_dict
		)

		non_spatial_action = non_spatial_action.ravel()
		spatial_action = spatial_action.ravel()
		# np.ravel: returns a contiguous flattened array, e.g.
		# np.ravel([[1, 2, 3], [4, 5, 6]]) -> array([1, 2, 3, 4, 5, 6])
		available_actions = obs.observation.available_actions
		action_id = 0
		spatial_target = []
		if self.mode == 'original_ac3':
			non_spatial_action = np.array(non_spatial_action[available_actions])
			non_spatial_action /= non_spatial_action.sum()
			x = np.random.choice(non_spatial_action, p=non_spatial_action)
			action_id = available_actions[np.where(non_spatial_action == x)[0][0]]
			spatial_target = random.choice(list(enumerate(spatial_action)))[0]
			# x = np.random.choice(spatial_action, p=spatial_action)
			# if len(np.where(spatial_action == x)[0]) > 1:
			# 	random = np.random.choice(len(np.where(spatial_action == x)[0]))
			# 	spatial_target = np.where(spatial_action == x)[0][random]
			# else:
			# 	spatial_target = np.where(spatial_action == x)[0][0]
			spatial_target = [int(spatial_target // self.resolution), int(spatial_target % self.resolution)]
		else:
			action_id = available_actions[np.argmax(non_spatial_action[available_actions])]
			spatial_target = np.argmax(spatial_action)
			spatial_target = [int(spatial_target // self.resolution), int(spatial_target % self.resolution)]

			# epsilon-greedy exploration
			if self.training and np.random.rand() < self.epsilon[0]:
				action_id = np.random.choice(available_actions)
			if self.training and np.random.rand() < self.epsilon[1]:
				delta_y, delta_x = np.random.randint(-4, 5), np.random.randint(-4, 5)
				spatial_target[0] = int(max(0, min(self.resolution -1, spatial_target[0] + delta_y)))
				spatial_target[1] = int(max(0, min(self.resolution -1, spatial_target[1] + delta_x)))

		action_args = []
		for arg in actions.FUNCTIONS[action_id].args:
			if arg.name in ('screen', 'minimap', 'screen2'):
				action_args.append([spatial_target[1], spatial_target[0]])
			else:
				action_args.append([0])
		return actions.FunctionCall(action_id, action_args)
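
The 'original_ac3' branch above samples a probability value with np.random.choice and then looks up its index with np.where, which is ambiguous when several actions share the same probability, and it picks the spatial target uniformly at random instead of from the spatial policy. A small hedged sketch of the more direct approach, sampling indices by probability (the function and variable names are illustrative, not part of the original agent):

import numpy as np

def sample_action(non_spatial_policy, spatial_policy, available_actions, resolution):
    # Sample a function id and a (y, x) target directly by index,
    # renormalising the non-spatial policy over the available actions.
    probs = non_spatial_policy[available_actions]
    probs = probs / probs.sum()
    action_id = int(np.random.choice(available_actions, p=probs))

    spatial_probs = spatial_policy / spatial_policy.sum()
    flat = int(np.random.choice(len(spatial_probs), p=spatial_probs))
    return action_id, [flat // resolution, flat % resolution]

policy = np.full(10, 0.1)        # uniform toy policy over 10 function ids
spatial = np.full(16, 1.0 / 16)  # flat 4x4 toy spatial policy
print(sample_action(policy, spatial, available_actions=[0, 3, 7], resolution=4))
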