    def initialize_episode(self):
        self.episode_count += 1
        if self.training and self.episode_count%self.batch_size==0:
            self.num_updates += 1
            if self.num_updates>self.pol_start and self.num_updates%ANNEAL==0: self.anneal_lr()
            if self.num_updates < self.pol_start: loss = self.update(regime='SL')
            else: loss = self.update(regime='RL')
            if self.num_updates%DISPF==0: self._print_progress(loss)
            if self.num_updates%SAVEF==0: self.save_model(dialog_config.MODEL_PATH+self._name)

        self.state = {}
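        # pickle round-trip (protocol -1 = highest) below makes a deep copy of the database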
        self.state['database'] = pkl.loads(pkl.dumps(self.database,-1))
        self.state['prevact'] = 'begin@begin'
        self.state['inform_slots'] = self._init_beliefs()
        self.state['turn'] = 0
        self.state['num_requests'] = {s:0 for s in self.state['database'].slots}
        self.state['slot_tracker'] = set()
        self.state['dont_care'] = set()
        self.state['init_entropy'] = {}
        for s in dialog_config.inform_slots:
            s_p = self.state['inform_slots'][s]/self.state['inform_slots'][s].sum()
            self.state['init_entropy'][s] = tools.entropy_p(s_p)
        self.state['inputs'] = []
        self.state['actions'] = []
        self.state['rewards'] = []
        self.state['pol_state'] = np.zeros((1,self.n_hid)).astype('float32')
    def next(self, user_action, verbose=False):
        self._update_state(user_action['nl_sentence'], upd=self.upd, verbose=verbose)
        self.state['turn'] += 1

        db_status, db_index = self._check_db()
        N_db = len(db_index)
        H_slots = {}
        for s in dialog_config.inform_slots:
            s_p = self.state['inform_slots'][s]/self.state['inform_slots'][s].sum()
            H_slots[s] = tools.entropy_p(s_p)
        p_vector = np.zeros((self.in_size,)).astype('float32')
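        # Layout of the 'entropy' input, read off the index arithmetic that follows
        # (an illustrative summary, with K = len(dialog_config.inform_slots)):
        #   [0:K]   belief entropy of each informable slot
        #   [K:2K]  don't-care indicator for each slot
        #   [2K:3K] one-hot of the slot requested on the previous turn
        #   [-6:]   bucketed count of matching DB rows (see below)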
        if self.inputtype=='entropy':
            for i,s in enumerate(dialog_config.inform_slots):
                if s in H_slots: p_vector[i] = H_slots[s]
                p_vector[i+len(dialog_config.inform_slots)] = 1. if s in self.state['dont_care'] \
                        else 0.
            if self.state['turn']>1:
                pr_act = self.state['prevact'].split('@')
                act_id = dialog_config.inform_slots.index(pr_act[1])
                p_vector[2*len(dialog_config.inform_slots)+act_id] = 1.
            #p_vector[-1] = N_db/self.state['database'].N
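            # one-hot bucket over the number of matching rows: 0 -> index -6, 1 -> -5,
            # ..., 4 -> -2, and 5 or more matches all map to the last entry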
            if N_db<=5: p_vector[N_db-6] = 1.
            else: p_vector[-1] = 1.
        else:
            p_slots = self._dict2vec(self.state['inform_slots'])
            p_vector[:p_slots.shape[0]] = p_slots
            if self.state['turn']>1:
                pr_act = self.state['prevact'].split('@')
                act_id = dialog_config.inform_slots.index(pr_act[1])
                p_vector[p_slots.shape[0]+act_id] = 1.
            db_i_vector = np.zeros((self.database.N,)).astype('float32')
            db_i_vector[db_index] = 1.
            p_vector[-self.database.N:] = db_i_vector
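        # add leading batch and time axes -> shape (1, 1, in_size); presumably the
        # shape the recurrent policy network expects (an assumption about _prob_act)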
        p_vector = np.expand_dims(np.expand_dims(p_vector, axis=0), axis=0)
        p_vector = standardize(p_vector)

        if self.training and self.num_updates<self.pol_start:
            # act on policy but train on expert
            pp = np.zeros((len(dialog_config.inform_slots)+1,))
            for i,s in enumerate(dialog_config.inform_slots):
                pp[i] = H_slots[s]
            pp[-1] = N_db
            _, action = self._rule_act(pp, db_index)
            act, _, p_out = self._prob_act(p_vector, db_index, mode='sample')
        else:
            if self.training: act, action, p_out = self._prob_act(p_vector, db_index, mode='sample')
            else: act, action, p_out = self._prob_act(p_vector, db_index, mode='max')

        self.state['inputs'].append(p_vector[0,0,:])
        self.state['actions'].append(action)
        self.state['rewards'].append(user_action['reward'])
        self.state['pol_state'] = p_out

        act['posterior'] = np.zeros((len(self.database.labels),))
        if len(db_index)>0:
            act['posterior'][db_index] = 1./len(db_index)
        else:
            act['posterior'] = 1./len(self.database.labels)

        return act
    def initialize_episode(self):
        self.episode_count += 1
        if self.training and self.episode_count % self.batch_size == 0:
            self.num_updates += 1
            if self.num_updates > self.pol_start and self.num_updates % ANNEAL == 0:
                self.anneal_lr()
            if self.num_updates < self.pol_start:
                loss = self.update(regime='SL')
            else:
                loss = self.update(regime='RL')
            if self.num_updates % DISPF == 0: self._print_progress(loss)
            if self.num_updates % SAVEF == 0:
                self.save_model(dialog_config.MODEL_PATH + self._name)

        self.state = {}
        self.state['database'] = pkl.loads(pkl.dumps(self.database, -1))
        self.state['prevact'] = 'begin@begin'
        self.state['inform_slots'] = self._init_beliefs()
        self.state['turn'] = 0
        self.state['num_requests'] = {
            s: 0
            for s in self.state['database'].slots
        }
        self.state['slot_tracker'] = set()
        self.state['dont_care'] = set()
        self.state['init_entropy'] = {}
        for s in dialog_config.inform_slots:
            s_p = self.state['inform_slots'][s] / self.state['inform_slots'][
                s].sum()
            self.state['init_entropy'][s] = tools.entropy_p(s_p)
        self.state['inputs'] = []
        self.state['actions'] = []
        self.state['rewards'] = []
        self.state['pol_state'] = np.zeros((1, self.n_hid)).astype('float32')
    def next(self, user_action, verbose=False):
        self._update_state(user_action['nl_sentence'],
                           upd=self.upd,
                           verbose=verbose)
        self.state['turn'] += 1

        db_probs = self._check_db()
        H_db = tools.entropy_p(db_probs)
        H_slots = calc_entropies(self.state['inform_slots'], db_probs,
                                 self.state['database'])
        p_vector = np.zeros((self.in_size, )).astype('float32')
        if self.input_type == 'entropy':
            for i, s in enumerate(dialog_config.inform_slots):
                if s in H_slots: p_vector[i] = H_slots[s]
                p_vector[i+len(dialog_config.inform_slots)] = 1. if s in self.state['dont_care'] \
                        else 0.
            if self.state['turn'] > 1:
                pr_act = self.state['prevact'].split('@')
                act_id = dialog_config.inform_slots.index(pr_act[1])
                p_vector[2 * len(dialog_config.inform_slots) + act_id] = 1.
            p_vector[-1] = H_db
        else:
            p_slots = self._dict2vec(self.state['inform_slots'])
            p_vector[:p_slots.shape[0]] = p_slots
            if self.state['turn'] > 1:
                pr_act = self.state['prevact'].split('@')
                act_id = dialog_config.inform_slots.index(pr_act[1])
                p_vector[p_slots.shape[0] + act_id] = 1.
            p_vector[-self.database.N:] = db_probs
        p_vector = np.expand_dims(np.expand_dims(p_vector, axis=0), axis=0)
        p_vector = standardize(p_vector)

        if self.training and self.num_updates < self.pol_start:
            # act on policy but train on expert
            pp = np.zeros((len(dialog_config.inform_slots) + 1, ))
            for i, s in enumerate(dialog_config.inform_slots):
                pp[i] = H_slots[s]
            pp[-1] = H_db
            _, action = self._rule_act(pp, db_probs)
            act, _, p_out = self._prob_act(p_vector, db_probs, mode='sample')
        else:
            if self.training:
                act, action, p_out = self._prob_act(p_vector,
                                                    db_probs,
                                                    mode='sample')
            else:
                act, action, p_out = self._prob_act(p_vector,
                                                    db_probs,
                                                    mode='max')

        self.state['inputs'].append(p_vector[0, 0, :])
        self.state['actions'].append(action)
        self.state['rewards'].append(user_action['reward'])
        self.state['pol_state'] = p_out

        act['posterior'] = db_probs

        return act
    def next(self, user_action, verbose=False):
        self._update_state(user_action['nl_sentence'],
                           upd=self.upd,
                           verbose=verbose)
        self.state['turn'] += 1

        act = {}
        act['diaact'] = 'UNK'
        act['request_slots'] = {}
        act['target'] = []

        db_status, db_index = self._check_db()
        H_slots = {}
        for s in dialog_config.inform_slots:
            s_p = self.state['inform_slots'][s] / self.state['inform_slots'][
                s].sum()
            H_slots[s] = tools.entropy_p(s_p)
        sorted_entropies = sorted(H_slots.items(),
                                  key=operator.itemgetter(1),
                                  reverse=True)
        if verbose:
            print 'Agent slot belief entropies - '
            print ' '.join(
                ['%s:%.2f' % (k, v) for k, v in H_slots.iteritems()])

        if not db_status:
            # no match, some error, re-ask some slot
            act['diaact'] = 'request'
            request_slot = random.choice(self.state['inform_slots'].keys())
            act['request_slots'][request_slot] = 'UNK'
            self.state['prevact'] = 'request@%s' % request_slot
            self.state['num_requests'][request_slot] += 1
        elif len(db_status) == 1:
            act['diaact'] = 'inform'
            act['target'] = self._inform(db_index)
            self.state['prevact'] = 'inform@inform'
        else:
            req = False
            for (s, h) in sorted_entropies:
                if H_slots[s]<self.frac*self.state['init_entropy'][s] or H_slots[s]<self.ts or \
                        self.state['num_requests'][s] >= self.max_req:
                    continue
                act['diaact'] = 'request'
                act['request_slots'][s] = 'UNK'
                self.state['prevact'] = 'request@%s' % s
                self.state['num_requests'][s] += 1
                req = True
                break
            if not req:
                # agent confident about all slots, inform
                act['diaact'] = 'inform'
                act['target'] = self._inform(db_index)
                self.state['prevact'] = 'inform@inform'

        act['posterior'] = np.zeros((len(self.database.labels), ))
        act['posterior'][db_index] = 1. / len(db_index)

        return act
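    # A hypothetical, self-contained restatement of the request/inform rule used in
    # next() above: ask about the most uncertain slot that is still worth asking
    # about, otherwise inform. H_slots / init_entropy are dicts of entropies and
    # frac, ts, max_req mirror the agent's thresholds; this sketch only documents
    # the control flow, it is not the agent's actual helper.
    @staticmethod
    def _pick_request_slot(H_slots, init_entropy, num_requests, frac, ts, max_req):
        for s, h in sorted(H_slots.items(), key=lambda kv: kv[1], reverse=True):
            confident = h < frac * init_entropy[s] or h < ts
            if confident or num_requests[s] >= max_req:
                continue  # nothing to gain from asking about this slot (again)
            return s      # request the most uncertain, still-askable slot
        return None       # confident about every slot -> inform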
    def next(self, user_action, verbose=False):
        self.state['turn'] += 1

        p_vector = np.zeros((self.in_size, )).astype('float32')
        p_vector[:self.feat_extractor.n] = self.feat_extractor.featurize( \
                user_action['nl_sentence'])
        if self.state['turn'] > 1:
            pr_act = self.state['prevact'].split('@')
            assert pr_act[0] != 'inform', 'Agent called after informing!'
            act_id = dialog_config.inform_slots.index(pr_act[1])
            p_vector[self.feat_extractor.n + act_id] = 1
        p_vector = np.expand_dims(np.expand_dims(p_vector, axis=0), axis=0)
        p_vector = standardize(p_vector)

        p_targets = []
        phi_targets = []
        if self.training and self.num_updates < self.pol_start:
            self._update_state(user_action['nl_sentence'],
                               upd=self.upd,
                               verbose=verbose)
            db_probs = self._check_db()
            H_db = tools.entropy_p(db_probs)
            H_slots = calc_entropies(self.state['inform_slots'], db_probs,
                                     self.state['database'])

            # act on policy but train on expert
            pp = np.zeros((len(dialog_config.inform_slots) + 1, ))
            for i, s in enumerate(dialog_config.inform_slots):
                pp[i] = H_slots[s]
            pp[-1] = H_db
            pp = np.expand_dims(np.expand_dims(pp, axis=0), axis=0)
            _, action = self._rule_act(pp, db_probs)
            act, _, p_out, hid_out, p_db = self._prob_act(p_vector,
                                                          mode='sample')
            for s in dialog_config.inform_slots:
                p_s = self.state['inform_slots'][s] / self.state[
                    'inform_slots'][s].sum()
                p_targets.append(p_s)
                if s in self.state['dont_care']:
                    phi_targets.append(np.ones((1, )).astype('float32'))
                else:
                    phi_targets.append(np.zeros((1, )).astype('float32'))
        else:
            if self.training:
                act, action, p_out, hid_out, db_probs = self._prob_act(
                    p_vector, mode='sample')
            else:
                act, action, p_out, hid_out, db_probs = self._prob_act(
                    p_vector, mode='max')

        self._state_update(act, p_vector, action, user_action['reward'], p_out, hid_out, p_targets, \
                phi_targets)

        act['posterior'] = db_probs

        return act
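# A hypothetical stand-in for the feature-extractor interface the next() method
# above relies on: featurize() maps an utterance to a fixed-length float vector
# of length n. The real project code may build this vector differently (e.g.
# with n-grams); this sketch only illustrates the assumed contract.
import numpy as np

class BagOfWordsFeaturizer(object):
    def __init__(self, vocab):
        self.vocab = {w: i for i, w in enumerate(vocab)}
        self.n = len(vocab)

    def featurize(self, sentence):
        v = np.zeros((self.n,), dtype='float32')
        for w in sentence.lower().split():
            if w in self.vocab:
                v[self.vocab[w]] += 1.
        return v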
def calc_entropies(state, q, db):
    entropies = {}
    for s,c in state.iteritems():
        if s not in db.slots:
            entropies[s] = 0.
        else:
            p = (db.ids[s]*q).sum(axis=1)
            u = db.priors[s]*q[db.unks[s]].sum()
            c_tilde = p+u
            c_tilde = c_tilde/c_tilde.sum()
            entropies[s] = tools.entropy_p(c_tilde)
    return entropies
def calc_entropies(state, q, db):
    entropies = {}
    for s, c in state.iteritems():
        if s not in db.slots:
            entropies[s] = 0.
        else:
            p = (db.ids[s] * q).sum(axis=1)
            u = db.priors[s] * q[db.unks[s]].sum()
            c_tilde = p + u
            c_tilde = c_tilde / c_tilde.sum()
            entropies[s] = tools.entropy_p(c_tilde)
    return entropies
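# A minimal worked example of the arithmetic inside calc_entropies. TinyDB and
# entropy_p below are hypothetical stand-ins for the project's database object
# and tools.entropy_p; they only assume db.ids[s] is a (num_values x N) 0/1
# matrix, db.unks[s] indexes rows whose value for slot s is missing, and
# db.priors[s] is the prior over the slot's values.
import numpy as np

def entropy_p(p):
    # Shannon entropy of an already-normalized probability vector
    p = np.asarray(p, dtype='float64')
    nz = p > 0
    return float(-(p[nz] * np.log(p[nz])).sum())

class TinyDB(object):
    slots = ['cuisine']
    # 4 table rows: rows 0,1 = 'indian', row 2 = 'thai', row 3 = missing value
    ids = {'cuisine': np.array([[1., 1., 0., 0.],
                                [0., 0., 1., 0.]])}
    unks = {'cuisine': np.array([3])}
    priors = {'cuisine': np.array([0.5, 0.5])}

q = np.array([0.1, 0.2, 0.3, 0.4])                               # posterior over the 4 rows
p = (TinyDB.ids['cuisine'] * q).sum(axis=1)                      # mass per known value: [0.3, 0.3]
u = TinyDB.priors['cuisine'] * q[TinyDB.unks['cuisine']].sum()   # missing-row mass: [0.2, 0.2]
c_tilde = (p + u) / (p + u).sum()                                # value distribution: [0.5, 0.5]
print(entropy_p(c_tilde))                                        # ~0.693 == log(2)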
    def next(self, user_action, verbose=False):
        self._update_state(user_action['nl_sentence'],
                           upd=self.upd,
                           verbose=verbose)
        self.state['turn'] += 1

        act = {}
        act['diaact'] = 'UNK'
        act['request_slots'] = {}
        act['target'] = []

        db_probs = self._check_db()
        H_slots = {}
        for s in dialog_config.inform_slots:
            s_p = self.state['inform_slots'][s] / self.state['inform_slots'][
                s].sum()
            H_slots[s] = tools.entropy_p(s_p)
        if verbose:
            print 'Agent slot belief entropies - '
            print ' '.join(
                ['%s:%.2f' % (k, v) for k, v in H_slots.iteritems()])

        sorted_entropies = sorted(H_slots.items(),
                                  key=operator.itemgetter(1),
                                  reverse=True)
        req = False
        for (s, h) in sorted_entropies:
            if H_slots[s]<self.frac*self.state['init_entropy'][s] or H_slots[s]<self.ts or \
                    self.state['num_requests'][s] >= self.max_req:
                continue
            act['diaact'] = 'request'
            act['request_slots'][s] = 'UNK'
            self.state['prevact'] = 'request@%s' % s
            self.state['num_requests'][s] += 1
            req = True
            break
        if not req:
            # agent confident about all slots, inform
            act['diaact'] = 'inform'
            act['target'] = self._inform(db_probs)
            self.state['prevact'] = 'inform@inform'

        act['probs'] = [np.concatenate([self.state['inform_slots'][s]/ \
                self.state['inform_slots'][s].sum(), \
                np.asarray([float(self.state['database'].inv_counts[s][-1])/ \
                self.state['database'].N])]) \
                for s in dialog_config.inform_slots]
        act['phis'] = [
            1. if s in self.state['dont_care'] else 0.
            for s in dialog_config.inform_slots
        ]
        act['posterior'] = db_probs
        return act
    def next(self, user_action, verbose=False):
        self._update_state(user_action['nl_sentence'], upd=self.upd, verbose=verbose)
        self.state['turn'] += 1

        act = {}
        act['diaact'] = 'UNK'
        act['request_slots'] = {}
        act['target'] = []

        db_status, db_index = self._check_db()
        H_slots = {}
        for s in dialog_config.inform_slots:
            s_p = self.state['inform_slots'][s]/self.state['inform_slots'][s].sum()
            H_slots[s] = tools.entropy_p(s_p)
        sorted_entropies = sorted(H_slots.items(), key=operator.itemgetter(1), reverse=True)
        if verbose:
            print 'Agent slot belief entropies - '
            print ' '.join(['%s:%.2f' %(k,v) for k,v in H_slots.iteritems()])

        if not db_status:
            # no match, some error, re-ask some slot
            act['diaact'] = 'request'
            request_slot = random.choice(self.state['inform_slots'].keys())
            act['request_slots'][request_slot] = 'UNK'
            self.state['prevact'] = 'request@%s' %request_slot
            self.state['num_requests'][request_slot] += 1
        elif len(db_status)==1:
            act['diaact'] = 'inform'
            act['target'] = self._inform(db_index)
            self.state['prevact'] = 'inform@inform'
        else:
            req = False
            for (s,h) in sorted_entropies:
                if H_slots[s]<self.frac*self.state['init_entropy'][s] or H_slots[s]<self.ts or \
                        self.state['num_requests'][s] >= self.max_req:
                    continue
                act['diaact'] = 'request'
                act['request_slots'][s] = 'UNK'
                self.state['prevact'] = 'request@%s' %s
                self.state['num_requests'][s] += 1
                req = True
                break
            if not req:
                # agent confident about all slots, inform
                act['diaact'] = 'inform'
                act['target'] = self._inform(db_index)
                self.state['prevact'] = 'inform@inform'

        act['posterior'] = np.zeros((len(self.database.labels),))
        act['posterior'][db_index] = 1./len(db_index)

        return act
    def initialize_episode(self):
        self.state = {}
        self.state['database'] = pkl.loads(pkl.dumps(self.database,-1))
        self.state['prevact'] = 'begin@begin'
        self.state['inform_slots'] = self._init_beliefs()
        self.state['turn'] = 0
        self.state['init_entropy'] = {}
        for s in dialog_config.inform_slots:
            s_p = self.state['inform_slots'][s]/self.state['inform_slots'][s].sum()
            self.state['init_entropy'][s] = tools.entropy_p(s_p)
        self.state['num_requests'] = {s:0 for s in self.state['inform_slots'].keys()}
        self.state['slot_tracker'] = set()
        self.state['dont_care'] = set()
    def next(self, user_action, verbose=False):
        self.state['turn'] += 1

        p_vector = np.zeros((self.in_size,)).astype('float32')
        p_vector[:self.feat_extractor.n] = self.feat_extractor.featurize( \
                user_action['nl_sentence'])
        if self.state['turn']>1:
            pr_act = self.state['prevact'].split('@')
            assert pr_act[0]!='inform', 'Agent called after informing!'
            act_id = dialog_config.inform_slots.index(pr_act[1])
            p_vector[self.feat_extractor.n+act_id] = 1
        p_vector = np.expand_dims(np.expand_dims(p_vector, axis=0), axis=0)
        p_vector = standardize(p_vector)

        p_targets = []
        phi_targets = []
        if self.training and self.num_updates<self.pol_start:
            self._update_state(user_action['nl_sentence'], upd=self.upd, verbose=verbose)
            db_probs = self._check_db()
            H_db = tools.entropy_p(db_probs)
            H_slots = calc_entropies(self.state['inform_slots'], db_probs, self.state['database'])

            # act on policy but train on expert
            pp = np.zeros((len(dialog_config.inform_slots)+1,))
            for i,s in enumerate(dialog_config.inform_slots):
                pp[i] = H_slots[s]
            pp[-1] = H_db
            pp = np.expand_dims(np.expand_dims(pp, axis=0), axis=0)
            _, action = self._rule_act(pp, db_probs)
            act, _, p_out, hid_out, p_db = self._prob_act(p_vector, mode='sample')
            for s in dialog_config.inform_slots:
                p_s = self.state['inform_slots'][s]/self.state['inform_slots'][s].sum()
                p_targets.append(p_s)
                if s in self.state['dont_care']:
                    phi_targets.append(np.ones((1,)).astype('float32'))
                else:
                    phi_targets.append(np.zeros((1,)).astype('float32'))
        else:
            if self.training: act, action, p_out, hid_out, db_probs = self._prob_act(p_vector, mode='sample')
            else: act, action, p_out, hid_out, db_probs = self._prob_act(p_vector, mode='max')

        self._state_update(act, p_vector, action, user_action['reward'], p_out, hid_out, p_targets, \
                phi_targets)

        act['posterior'] = db_probs

        return act
    def next(self, user_action, verbose=False):
        self._update_state(user_action['nl_sentence'], upd=self.upd, verbose=verbose)
        self.state['turn'] += 1

        act = {}
        act['diaact'] = 'UNK'
        act['request_slots'] = {}
        act['target'] = []

        db_probs = self._check_db()
        H_db = tools.entropy_p(db_probs)
        H_slots = calc_entropies(self.state['inform_slots'], db_probs, self.state['database'])
        if verbose:
            print 'Agent DB entropy = ', H_db
            print 'Agent slot belief entropies - '
            print ' '.join(['%s:%.2f' %(k,v) for k,v in H_slots.iteritems()])

        if H_db < self.tr:
            # agent reasonable confident, inform
            act['diaact'] = 'inform'
            act['target'] = self._inform(db_probs)
        else:
            sorted_entropies = sorted(H_slots.items(), key=operator.itemgetter(1), reverse=True)
            req = False
            for (s,h) in sorted_entropies:
                if H_slots[s]<self.frac*self.state['init_entropy'][s] or H_slots[s]<self.ts or \
                        self.state['num_requests'][s] >= self.max_req:
                    continue
                act['diaact'] = 'request'
                act['request_slots'][s] = 'UNK'
                self.state['prevact'] = 'request@%s' %s
                self.state['num_requests'][s] += 1
                req = True
                break
            if not req:
                # agent confident about all slots, inform
                act['diaact'] = 'inform'
                act['target'] = self._inform(db_probs)
                self.state['prevact'] = 'inform@inform'

        act['probs'] = [np.concatenate([self.state['inform_slots'][s]/self.state['inform_slots'][s].sum(), \
                np.asarray([float(self.state['database'].inv_counts[s][-1])/self.state['database'].N])]) \
                for s in dialog_config.inform_slots]
        act['phis'] = [1. if s in self.state['dont_care'] else 0. for s in dialog_config.inform_slots]
        act['posterior'] = db_probs
        return act
    def initialize_episode(self):
        self.state = {}
        self.state['database'] = pkl.loads(pkl.dumps(self.database, -1))
        self.state['prevact'] = 'begin@begin'
        self.state['inform_slots'] = self._init_beliefs()
        self.state['turn'] = 0
        self.state['init_entropy'] = {}
        for s in dialog_config.inform_slots:
            s_p = self.state['inform_slots'][s] / self.state['inform_slots'][
                s].sum()
            self.state['init_entropy'][s] = tools.entropy_p(s_p)
        self.state['num_requests'] = {
            s: 0
            for s in self.state['inform_slots'].keys()
        }
        self.state['slot_tracker'] = set()
        self.state['dont_care'] = set()
def calc_entropies(state, q, db):
    '''
    How entropies are computed during SL (imitation) training; note that this
    differs from the computation used during RL!
    :param state: slot beliefs (only the slot names are used here)
    :param q: table probability, (N,)
    :param db: database
    :return: the entropy of each slot
    '''
    entropies = {}
    for s,c in state.iteritems():
        if s not in db.slots:
            entropies[s] = 0.
        else:
            p = (db.ids[s]*q).sum(axis=1)
            u = db.priors[s]*q[db.unks[s]].sum()
            c_tilde = p+u
            c_tilde = c_tilde/c_tilde.sum()
            entropies[s] = tools.entropy_p(c_tilde)
    return entropies
    def next(self, user_action, verbose=False):
        '''
        Get the next action based on rules.
        :param user_action: the new state after the user's input
        :param verbose: whether to print logs produced while the model runs (chatty mode)
        :return: an action dict carrying additional fields, including diaact, request_slots, target, p and q, etc.
        '''
        self.state['turn'] += 1

        # TODO: after switching to embeddings this whole block has to be rewritten;
        # in particular the in_size variable must change, and other surprises may turn up
        # TODO: results after switching to embeddings are poor; more models should be tried
        p_vector, seq_len = self.feat_extractor.featurize(
            user_action['nl_sentence'])
        p_vector = np.expand_dims(np.expand_dims(p_vector, axis=0), axis=0)
        p_vector = standardize(p_vector)

        # p_vector = np.zeros((self.in_size,)).astype('float32')   # (|Grams|+|Slots|, )
        # p_vector[:self.feat_extractor.n] = self.feat_extractor.featurize(user_action['nl_sentence'])
        # if self.state['turn']>1:
        #     pr_act = self.state['prevact'].split('@')
        #     assert pr_act[0]!='inform', 'Agent called after informing!'
        #     act_id = dialog_config.inform_slots.index(pr_act[1])
        #     p_vector[self.feat_extractor.n+act_id] = 1
        # p_vector = np.expand_dims(np.expand_dims(p_vector, axis=0), axis=0) # (1, 1, |Grams|+|Slots|)
        # p_vector = standardize(p_vector)

        p_targets = []
        phi_targets = []
        if self.training and self.num_updates < self.pol_start:
            self._update_state(user_action['nl_sentence'],
                               upd=self.upd,
                               verbose=verbose)
            db_probs = self._check_db()
            H_db = tools.entropy_p(db_probs)
            H_slots = calc_entropies(self.state['inform_slots'], db_probs,
                                     self.state['database'])

            # act on policy but train on expert
            pp = np.zeros((len(dialog_config.inform_slots) + 1, ))
            for i, s in enumerate(dialog_config.inform_slots):
                pp[i] = H_slots[s]
            pp[-1] = H_db
            pp = np.expand_dims(np.expand_dims(pp, axis=0),
                                axis=0)  # (1, 1, |Slots|)
            _, action = self._rule_act(pp, db_probs)
            act, _, p_out, hid_out, p_db = self._prob_act(p_vector,
                                                          mode='sample')
            for s in dialog_config.inform_slots:
                p_s = self.state['inform_slots'][s] / self.state[
                    'inform_slots'][s].sum()
                p_targets.append(p_s)
                if s in self.state['dont_care']:
                    phi_targets.append(np.ones((1, )).astype('float32'))
                else:
                    phi_targets.append(np.zeros((1, )).astype('float32'))
        else:
            if self.training:
                act, action, p_out, hid_out, db_probs = self._prob_act(
                    p_vector, mode='sample')
            else:
                act, action, p_out, hid_out, db_probs = self._prob_act(
                    p_vector, mode='max')

        # TODO: add the seq_len argument; note where to find this parameter
        self._state_update(act, p_vector, action, user_action['reward'], p_out,
                           hid_out, p_targets, phi_targets, seq_len)
        act['posterior'] = db_probs
        return act