    def nextAction(self, beliefstate):
        '''
        Select the next action.

        :param beliefstate: the current belief state
        :returns: the master action and the (int) summary action index
        '''
        if self.architecture != 'dip2':
            beliefVec = flatten_belief(beliefstate, self.domainUtil)
        else:
            dip_state = DIP_state(
                beliefstate.domainStates[beliefstate.currentdomain],
                self.domainString)
        execMask = self.summaryaction.getExecutableMask(
            beliefstate, self.lastSystemAction)
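        # execMask contains 0.0 for executable summary actions and a large
        # negative value (effectively -inf) for all others, so adding it to
        # the Q-values rules inadmissible actions out of the argmax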

        if self.exploration_type == 'e-greedy':
            # epsilon greedy
            if self.is_training and \
                    utils.Settings.random.rand() < self.epsilon:
                admissible = [i for i, x in enumerate(execMask) if x == 0.0]
                random.shuffle(admissible)
                nextaIdex = admissible[0]
            else:
                if self.architecture != 'dip' and self.architecture != 'dip2':
                    action_Q = self.dqn.predict(
                        np.reshape(
                            beliefVec,
                            (1, len(beliefVec))))  # + (1. / (1. + i + j))
                    admissible = np.add(action_Q, np.array(execMask))
                    logger.info('action Q...')
                    #print admissible.shape
                    #print admissible
                    nextaIdex = np.argmax(admissible)

                    # add current max Q to self.episode_ave_max_q
                    #print 'current maxQ', np.max(admissible)
                    self.episode_ave_max_q.append(np.max(admissible))
                elif self.architecture == 'dip2':
                    admissible = []
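                    # dip2: build a slot-specific belief vector for every
                    # action and query its Q-value with a separate forward pass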
                    for idx, v in enumerate(execMask):
                        action_name = self.actions.action_names[idx]
                        act_slot = 'general'
                        for slot in dip_state.slots:
                            if slot in action_name:
                                act_slot = slot
                        beliefVec = dip_state.get_beliefStateVec(act_slot)
                        action_Q = self.dqn.predict(
                            np.reshape(
                                beliefVec,
                                (1, len(beliefVec))))  # + (1. / (1. + i + j))
                        if v == 0:
                            admissible.append(action_Q[0][idx])
                        else:
                            admissible.append(v)
                    nextaIdex = np.argmax(admissible)
                    self.episode_ave_max_q.append(np.max(admissible))

                else:
                    admissible = []
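                    # dip: score each executable action separately by feeding
                    # a one-hot action encoding to the network together with
                    # the belief vector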
                    for idx, v in enumerate(execMask):
                        if v > -sys.maxint:
                            Action_idx = np.eye(self.action_dim,
                                                self.action_dim)[[idx]]
                            Qidx = self.dqn.predict_dip(
                                np.reshape(beliefVec, (1, len(beliefVec))),
                                Action_idx)
                            #print 'argmax Q',Qidx[0]
                            admissible.append(Qidx[0])
                        else:
                            admissible.append(-sys.maxint)
                    # action_Q = self.dqn.predict(np.reshape(beliefVec, (1, len(beliefVec))))# + (1. / (1. + i + j))
                    # admissible = np.add(action_Q, np.array(execMask))
                    logger.info('action Q...')
                    #print admissible
                    nextaIdex = np.argmax(admissible)

                    # add current max Q to self.episode_ave_max_q
                    #print 'current maxQ', np.max(admissible)
                    self.episode_ave_max_q.append(np.max(admissible))

        elif self.exploration_type == 'Boltzman':
            # softmax
            if not self.is_training:
                self.epsilon = 0.001
            # here self.epsilon serves as the softmax temperature
            action_Q = self.dqn.predict(
                np.reshape(beliefVec,
                           (1, len(beliefVec))))  # + (1. / (1. + i + j))
            action_Q_admissible = np.add(action_Q, np.array(
                execMask))  # enforce Q of inadmissible actions to be -inf

            action_prob = drlutils.softmax(action_Q_admissible / self.epsilon)
            logger.info('action Q...')
            #print action_Q_admissible
            logger.info('action prob...')
            #print action_prob
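            # sample a probability value and recover its index via argmax;
            # this assumes no two actions share the same probability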
            sampled_prob = np.random.choice(action_prob[0], p=action_prob[0])
            nextaIdex = np.argmax(action_prob[0] == sampled_prob)

        self.stats[nextaIdex] += 1
        summaryAct = self.action_names[nextaIdex]
        beliefstate = beliefstate.getDomainState(self.domainUtil.domainString)
        masterAct = self.summaryaction.Convert(beliefstate, summaryAct,
                                               self.lastSystemAction)
        return masterAct, nextaIdex
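
Both examples mask out inadmissible actions the same way: the executability
mask (0.0 for executable actions, effectively -inf otherwise) is added to the
network's scores before the argmax, and exploration only samples from the
executable set. The snippet below is a minimal standalone sketch of that
masked epsilon-greedy step; select_action, q_values, exec_mask and rng are
hypothetical names, not part of the policy classes above.

import numpy as np

def select_action(q_values, exec_mask, epsilon, rng=np.random):
    """Sketch of masked epsilon-greedy selection (assumed helper)."""
    if rng.rand() < epsilon:
        # explore: sample uniformly among the executable actions only
        admissible = [i for i, m in enumerate(exec_mask) if m == 0.0]
        return int(rng.choice(admissible))
    # exploit: adding the mask drives inadmissible actions towards -inf,
    # so the argmax can only pick an executable action
    return int(np.argmax(np.asarray(q_values) + np.asarray(exec_mask)))
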
Example #2
    def nextAction(self, beliefstate):
        '''
        Select the next action.

        :param beliefstate: the current belief state
        :returns: the master action and the (int) summary action index
        '''
        beliefVec = flatten_belief(beliefstate, self.domainUtil)

        execMask = self.summaryaction.getExecutableMask(
            beliefstate, self.lastSystemAction)

        if self.exploration_type == 'e-greedy':

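            # score every action with the ENAC policy network and mask out
            # inadmissible actions before taking the greedy choice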
            action_prob = self.enac.predict_policy(
                np.reshape(beliefVec, (1, len(beliefVec))))
            admissibleCnt = [i for i, x in enumerate(execMask) if x == 0.0]
            admissible = np.add(action_prob, np.array(execMask))
            greedyNextaIdex = np.argmax(admissible)

            # epsilon greedy
            if self.is_training and \
                    utils.Settings.random.rand() < self.epsilon:
                admissible = [i for i, x in enumerate(execMask) if x == 0.0]
                random.shuffle(admissible)
                nextaIdex = admissible[0]

                # Importance sampling
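                # mu_prob is the probability of the chosen action under the
                # epsilon-greedy behaviour policy: eps/|A| + (1 - eps) when it
                # matches the greedy action, eps/|A| otherwise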
                if nextaIdex == greedyNextaIdex:
                    self.mu_prob = self.epsilon / float(
                        self.action_dim) + 1 - self.epsilon
                else:
                    self.mu_prob = self.epsilon / float(self.action_dim)
            else:
                nextaIdex = greedyNextaIdex

                # add current max Q to self.episode_ave_max_q
                #print 'current maxQ', np.max(admissible)
                self.episode_ave_max_q.append(np.max(admissible))

                # Importance sampling
                self.mu_prob = self.epsilon / float(
                    self.action_dim) + 1 - self.epsilon

        elif self.exploration_type == 'Boltzman':
            # softmax
            if not self.is_training:
                self.epsilon = 0.001
            # here self.epsilon serves as the softmax temperature
            #action_prob, value = self.a2c.predict_action_value(np.reshape(beliefVec, (1, len(beliefVec))))# + (1. / (1. + i + j))
            action_prob = self.enac.predict_policy(
                np.reshape(beliefVec,
                           (1, len(beliefVec))))  # + (1. / (1. + i + j))
            action_Q_admissible = np.add(action_prob, np.array(
                execMask))  # push scores of inadmissible actions to -inf

            action_prob = drlutils.softmax(action_Q_admissible / self.epsilon)
            logger.info('action Q...')
            print(action_Q_admissible)
            logger.info('action prob...')
            print(action_prob)
            sampled_prob = np.random.choice(action_prob[0], p=action_prob[0])
            nextaIdex = np.argmax(action_prob[0] == sampled_prob)

        self.stats[nextaIdex] += 1
        summaryAct = self.summaryaction.action_names[nextaIdex]
        beliefstate = beliefstate.getDomainState(self.domainUtil.domainString)
        masterAct = self.summaryaction.Convert(beliefstate, summaryAct,
                                               self.lastSystemAction)
        return masterAct, nextaIdex
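
The Boltzmann branch turns the masked scores into a distribution with a
temperature-scaled softmax (drlutils.softmax(action_Q_admissible /
self.epsilon)) and then samples. Recovering the sampled index with
np.argmax(action_prob[0] == sampled_prob) relies on all probabilities being
distinct; sampling an index directly avoids that corner case. Below is a small
sketch with hypothetical names (boltzmann_action, scores, exec_mask) that
reimplements the softmax instead of calling drlutils.

import numpy as np

def boltzmann_action(scores, exec_mask, temperature, rng=np.random):
    """Sketch of Boltzmann (softmax) exploration over masked action scores."""
    masked = np.asarray(scores, dtype=float) + np.asarray(exec_mask, dtype=float)
    # temperature-scaled softmax; subtracting the max keeps exp() stable
    z = (masked - np.max(masked)) / temperature
    probs = np.exp(z) / np.sum(np.exp(z))
    # sample an action index directly instead of matching probability values
    return int(rng.choice(len(probs), p=probs))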