def _build_vocab_from_db(self, corpus):
     try:
         # try reading the file as decoded (unicode) text first
         f = io.open(corpus, 'r')
         for line in f:
             elements = line.rstrip().split('\t')[1:]
             for ele in elements:
                 tokens = to_tokens(ele)
                 for i in range(len(tokens)):
                     for t in range(self.N):
                         if i-t<0: continue
                         ngram = '_'.join(tokens[i-t:i+1])
                         if ngram not in self.grams:
                             self.grams[ngram] = self.n
                             self.n += 1
         f.close()
     except UnicodeDecodeError:
         # fall back to raw byte strings if decoding fails
         f = open(corpus, 'r')
         for line in f:
             elements = line.rstrip().split('\t')[1:]
             for ele in elements:
                 tokens = to_tokens(ele)
                 for i in range(len(tokens)):
                     for t in range(self.N):
                         if i-t<0: continue
                         ngram = '_'.join(tokens[i-t:i+1])
                         if ngram not in self.grams:
                             self.grams[ngram] = self.n
                             self.n += 1
         f.close()
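The nested loops above slide a window of width 1..self.N over each token list and assign every new n-gram the next free index. A minimal standalone sketch of that indexing, with a plain integer N and a pre-tokenized line standing in for self.N and to_tokens():

    # toy [1, N]-gram indexing; N and the token list are stand-ins
    N = 2
    grams, n = {}, 0
    tokens = ['find', 'me', 'an', 'action', 'movie']
    for i in range(len(tokens)):
        for t in range(N):
            if i - t < 0:
                continue
            ngram = '_'.join(tokens[i - t:i + 1])
            if ngram not in grams:
                grams[ngram] = n   # first occurrence gets the next id
                n += 1
    print(grams['find_me'])  # 2: 'find' -> 0, 'me' -> 1, 'find_me' -> 2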
Example #2
 def _build_vocab_from_corpus(self, corpus):
     if not os.path.isfile(corpus): return
     try:
         f = io.open(corpus, 'r')
         for line in f:
             tokens = to_tokens(line.rstrip())
             for i in range(len(tokens)):
                 for t in range(self.N):
                     if i - t < 0: continue
                     ngram = '_'.join(tokens[i - t:i + 1])
                     if ngram not in self.grams:
                         self.grams[ngram] = self.n
                         self.n += 1
         f.close()
     except UnicodeDecodeError:
         f = open(corpus, 'r')
         for line in f:
             tokens = to_tokens(line.rstrip())
             for i in range(len(tokens)):
                 for t in range(self.N):
                     if i - t < 0: continue
                     ngram = '_'.join(tokens[i - t:i + 1])
                     if ngram not in self.grams:
                         self.grams[ngram] = self.n
                         self.n += 1
         f.close()
Example #3
 def _build_vocab_from_corpus(self, corpus):
     if not os.path.isfile(corpus): return
     try:
         f = io.open(corpus, 'r')
         for line in f:
             tokens = to_tokens(line.rstrip())
             for i in range(len(tokens)):
                 for t in range(self.N):
                     if i-t<0: continue
                     ngram = '_'.join(tokens[i-t:i+1])
                     if ngram not in self.grams:
                         self.grams[ngram] = self.n
                         self.n += 1
         f.close()
     except UnicodeDecodeError:
         f = open(corpus, 'r')
         for line in f:
             tokens = to_tokens(line.rstrip())
             for i in range(len(tokens)):
                 for t in range(self.N):
                     if i-t<0: continue
                     ngram = '_'.join(tokens[i-t:i+1])
                     if ngram not in self.grams:
                         self.grams[ngram] = self.n
                         self.n += 1
         f.close()
Example #4
 def _build_vocab_from_db(self, corpus):
     try:
         f = io.open(corpus, 'r')
         for line in f:
             elements = line.rstrip().split('\t')[1:]
             for ele in elements:
                 tokens = to_tokens(ele)
                 for i in range(len(tokens)):
                     for t in range(self.N):
                         if i - t < 0: continue
                         ngram = '_'.join(tokens[i - t:i + 1])
                         if ngram not in self.grams:
                             self.grams[ngram] = self.n
                             self.n += 1
         f.close()
     except UnicodeDecodeError:
         f = open(corpus, 'r')
         for line in f:
             elements = line.rstrip().split('\t')[1:]
             for ele in elements:
                 tokens = to_tokens(ele)
                 for i in range(len(tokens)):
                     for t in range(self.N):
                         if i - t < 0: continue
                         ngram = '_'.join(tokens[i - t:i + 1])
                         if ngram not in self.grams:
                             self.grams[ngram] = self.n
                             self.n += 1
         f.close()
Example #5
    def _corrupt_value(self, val):
        def _is_int(s):
            try:
                int(s)
                return True
            except ValueError:
                return False

        def _is_float(s):
            try:
                float(s)
                return True
            except ValueError:
                return False
        # no NLTK for Chinese
        # tokens = nltk.word_tokenize(val)
        tokens = to_tokens(val)
        if len(tokens)>1: tokens.pop(random.randrange(len(tokens)))  # randomly drop one token
        out = []
        for t in tokens:
            if _is_int(t):
                out.append(str(int(random.gauss(int(t),0.5))))
            elif _is_float(t):
                out.append('%.1f' %random.gauss(float(t),0.5))
            else:
                out.append(t)
        return ' '.join(out)
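For intuition, the numeric branches above replace each number with a Gaussian draw centred on its own value (sigma 0.5), so small perturbations are the most likely outcome while text tokens pass through unchanged. A quick, seeded illustration of those two calls:

    import random

    random.seed(7)  # fixed seed so the sketch is reproducible
    print(str(int(random.gauss(int('1995'), 0.5))))   # a year near 1995
    print('%.1f' % random.gauss(float('6.5'), 0.5))   # a rating near 6.5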
Example #6
    def _update_state(self, user_utterance, upd=UPD, verbose=False):
        prev_act, prev_slot = self.state['prevact'].split('@')

        s_t = to_tokens(user_utterance)
        slot_match = self._search_slots(s_t) # search slots
        val_match = self._search_values(s_t) # search values

        for slot, values in val_match.iteritems():
            requested = (prev_act=='request') and (prev_slot==slot)
            matched = (slot in slot_match)
            if not values:
                if requested: # asked for value but did not get it
                    self.state['database'].delete_slot(slot)
                    self.state['num_requests'][slot] = 1000
                    self.state['dont_care'].add(slot)
            else:
                for y, match in values.iteritems():
                    #y = self.movie_dict.dict[slot].index(val)
                    if verbose:
                        print 'Detected %s' %self.movie_dict.dict[slot][y], ' update = ', match
                    if matched and requested:
                        alpha = upd*(match + 1. + slot_match[slot])
                    elif matched and not requested:
                        alpha = upd*(match + slot_match[slot])
                    elif not matched and requested:
                        alpha = upd*(match + 1.)
                    else:
                        alpha = upd*match
                    self.state['inform_slots'][slot][y] += alpha
                self.state['slot_tracker'].add(slot)
Example #7
 def _search_values(self, s_t):
     '''
     Search the user input for slot values that occur in it; at this point everything operates on tokens.
     :param s_t: the tokenized user input
     :return: match statistics
     '''
     print('-' * 100 + "\nsearching values: ")
     for v in s_t:
         print(
             v.encode("utf8")
             if v is not None and type(v) == unicode else v)
     matches = {}
     for slot in self.state['database'].slots:
         matches[slot] = defaultdict(float)
         for ss in s_t:
             if ss in self.movie_dict.tokens[slot]:
                 for vi in self.movie_dict.tokens[slot][ss]:
                     matches[slot][vi] += 1.
         for vi, f in matches[slot].iteritems():
             val = self.movie_dict.dict[slot][vi]
              # the Chinese version dropped the NLTK bits
             # print(nltk.word_tokenize(val))
             # matches[slot][vi] = f/len(nltk.word_tokenize(val))
             matches[slot][vi] = f / len(to_tokens(val))
     print('-' * 100)
     return matches
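The final normalization divides the raw token hits by the token length of each candidate value, so a value only scores 1.0 when all of its tokens appear in the utterance. A toy version with plain dicts (the inverted index and value lengths here are hypothetical, mimicking the movie_dict.tokens index that _build_token_index in Example #8 below constructs):

    from collections import defaultdict

    # hypothetical inverted index: token -> indices of slot values containing it
    token_index = {'star': [0, 1], 'wars': [0], 'trek': [1]}
    value_len = {0: 2, 1: 2}  # 'star wars' and 'star trek' have 2 tokens each

    matches = defaultdict(float)
    for token in ['star', 'wars']:            # tokenized user utterance
        for vi in token_index.get(token, []):
            matches[vi] += 1.
    for vi in matches:
        matches[vi] /= value_len[vi]          # fraction of value tokens matched
    print(dict(matches))                      # {0: 1.0, 1: 0.5}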
Example #8
 def _build_token_index(self):
     self.tokens = {}
     for slot,vals in self.dict.iteritems():
         self.tokens[slot] = defaultdict(list)
         for vi,vv in enumerate(vals):
             w_v = to_tokens(vv)
             for w in w_v: self.tokens[slot][w].append(vi)
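A compact illustration of the inverted index this builds, with a hypothetical two-value slot and whitespace splitting standing in for to_tokens():

    from collections import defaultdict

    vals = ['star wars', 'star trek']   # hypothetical values of one slot
    index = defaultdict(list)
    for vi, vv in enumerate(vals):
        for w in vv.split():            # to_tokens() stand-in
            index[w].append(vi)         # token -> list of value indices
    print(dict(index))  # {'star': [0, 1], 'wars': [0], 'trek': [1]}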
Example #9
    def featurize(self, text):
        '''
        Build N-gram features for the text.
        :param text: natural-language text
        :return: a vector of length |Grams| whose entries count occurrences of each gram
        '''
        vec = np.zeros((len(self.grams), )).astype('float32')

        embeddings = []

        # after switching to Chinese, N-grams are no longer necessary; embeddings are used throughout
        for ngram in to_tokens(text):
            if ngram in self.grams:
                vec[self.grams[ngram]] += 1.
            if ngram in self.embedding_vocab_t2n:
                embeddings.append(
                    self.embedding_vectors[self.embedding_vocab_t2n[ngram]])
            else:
                embeddings.append(self.UNK_EMBEDDING)
        seq_size = len(embeddings)  # note: measured before truncation, so it can exceed seq_max_len
        while len(embeddings) < self.seq_max_len:
            embeddings.append(self.BAK_EMBEDDING)
        embeddings = embeddings[:self.seq_max_len]
        cat_embedding = torch.cat([x.view(1,
                                          x.size()[0]) for x in embeddings], 0)
        # TODO: switch embeddings to a plain 2-D list / 2-D numpy array
        return cat_embedding.numpy(), seq_size
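The pad-then-truncate step pins every utterance to exactly seq_max_len embedding rows before stacking. The same fixed-length logic in isolation (torch assumed; the 3-dim vectors and the BAK padding vector are stand-ins for the class attributes above):

    import torch

    seq_max_len = 4
    BAK = torch.zeros(3)                         # stand-in for self.BAK_EMBEDDING
    embeddings = [torch.rand(3), torch.rand(3)]  # two known-token vectors

    seq_size = len(embeddings)               # true length, taken before padding
    while len(embeddings) < seq_max_len:
        embeddings.append(BAK)               # pad short sequences
    embeddings = embeddings[:seq_max_len]    # truncate long ones
    mat = torch.cat([x.view(1, x.size()[0]) for x in embeddings], 0)
    print(mat.shape, seq_size)               # torch.Size([4, 3]) 2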
Example #10
    def _corrupt_value(self, val):
        def _is_int(s):
            try:
                int(s)
                return True
            except ValueError:
                return False

        def _is_float(s):
            try:
                float(s)
                return True
            except ValueError:
                return False

        # no NLTK for Chinese
        # tokens = nltk.word_tokenize(val)
        tokens = to_tokens(val)
        if len(tokens)>1: 
            tokens.pop(random.randrange(len(tokens)))
            out = set([' '.join(tokens)])
        else:
            t = tokens[0]
            out = set()
            if _is_int(t):
                pert = round(random.gauss(0,0.5))
                if pert>0: out.add('%d' %(int(t)+pert))
                out.add(t)
            elif _is_float(t):
                pert = random.gauss(0,0.5)
                if pert>0.05: out.add('%.1f' %(float(t)+pert))
                out.add(t)
            else:
                out.add(t)
        return out
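Note the contrast with Example #5: this variant returns a set of candidates rather than a single corrupted string, and in the single-token numeric branch the original token is always kept while a perturbed neighbour is added only when the Gaussian draw is positive. A seeded sketch of that branch, using a toy token and the same parameters:

    import random

    random.seed(0)
    t = '1995'
    out = {t}                          # the original value always survives
    pert = round(random.gauss(0, 0.5))
    if pert > 0:                       # only positive perturbations are added
        out.add('%d' % (int(t) + pert))
    print(out)                         # {'1995'} or {'1995', '1996'}, say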
Example #11
 def _vocab_search(self, text):
     tokens = to_tokens(text)
     for i in range(len(tokens)):
         for t in range(self.N):
             if i - t < 0: continue
             ngram = '_'.join(tokens[i - t:i + 1])
             if ngram in self.grams:
                 return True
     return False
Example #12
 def _vocab_search(self, text):
     tokens = to_tokens(text)
     for i in range(len(tokens)):
         for t in range(self.N):
             if i-t<0: continue
             ngram = '_'.join(tokens[i-t:i+1])
             if ngram in self.grams: 
                 return True
     return False
Example #13
 def featurize(self, text):
     vec = np.zeros((len(self.grams),)).astype('float32')
     tokens = to_tokens(text)
     for i in range(len(tokens)):
         for t in range(self.N):
             if i-t<0: continue
             ngram = '_'.join(tokens[i-t:i+1])
             if ngram in self.grams: 
                 vec[self.grams[ngram]] += 1.
     return vec
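Applied to a toy vocabulary, this featurize simply counts n-gram hits into the fixed slots of the vector. A minimal end-to-end run (hypothetical vocabulary, whitespace tokens, N = 2 assumed):

    import numpy as np

    grams = {'find': 0, 'movie': 1, 'find_movie': 2}   # hypothetical vocab
    vec = np.zeros((len(grams),)).astype('float32')
    tokens = ['find', 'movie']
    N = 2
    for i in range(len(tokens)):
        for t in range(N):
            if i - t < 0:
                continue
            ngram = '_'.join(tokens[i - t:i + 1])
            if ngram in grams:
                vec[grams[ngram]] += 1.
    print(vec)  # [1. 1. 1.]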
Example #14
 def _build_token_index(self):
     self.tokens = {}
      # structure of tokens: {slot_name: {word_token: [slot value IDs]}}
     for slot, vals in self.dict.iteritems():
         # print "db slot: {}\nslot values: {}".format(slot.encode("utf-8"), vals)
         self.tokens[slot] = defaultdict(list)
         for vi, vv in enumerate(vals):
             w_v = to_tokens(vv)
             for w in w_v:
                 self.tokens[slot][w].append(vi)
Example #15
 def featurize(self, text):
     vec = np.zeros((len(self.grams), )).astype('float32')
     tokens = to_tokens(text)
     for i in range(len(tokens)):
         for t in range(self.N):
             if i - t < 0: continue
             ngram = '_'.join(tokens[i - t:i + 1])
             if ngram in self.grams:
                 vec[self.grams[ngram]] += 1.
     return vec
Example #16
 def _vocab_search(self, text):
      '''
      Check, 2-gram style, whether the user input contains any keywords.
      :param text: the user's natural-language sentence
      :return: whether a keyword is present, i.e. whether the user's question is meaningful
      '''
     tokens = to_tokens(text)
     for i in range(len(tokens)):
         for t in range(self.N):
             if i-t<0: continue
             ngram = u''.join(tokens[i-t:i+1])
             if ngram in self.grams: 
                 return True
     return False
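Read as a single predicate, the search returns True as soon as any [1, N]-gram of the utterance is a known vocabulary key. A hedged sketch of the same check over a toy vocabulary (whitespace tokens, N = 2, and the '_' separator of the earlier English variants assumed; the Chinese version above joins tokens with no separator):

    N = 2
    grams = {'director': 0, 'release_year': 1}
    tokens = 'who is the director'.split()
    found = any('_'.join(tokens[i - t:i + 1]) in grams
                for i in range(len(tokens))
                for t in range(N) if i - t >= 0)
    print(found)  # True: the unigram 'director' is in the vocabulary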
Example #17
    def _build_vocab_from_corpus(self, corpus):
        '''
        Build the global vocabulary from the dialogue text and store it in grams; the N-gram set actually covers all [1, N]-grams.
        :param corpus: path to the dialogue text file
        :return: None
        '''

        # after switching to Chinese, N-grams are no longer necessary
        if not os.path.isfile(corpus): return
        with open(corpus, 'r') as f:
            for line in f:
                for ngram in to_tokens(line.strip()):
                    if ngram.strip() != "" and ngram not in self.grams:
                        self.grams[ngram] = self.n
                        # print(ngram.encode("utf8") if ngram is not None and type(ngram) == unicode else ngram)
                        self.n += 1
Example #18
    def _update_state(self, user_utterance, upd=UPD, verbose=False):
        '''
        Brute-force search the user input for keywords (tokens) and update the belief tracker (BT) state in place.
        :param user_utterance: the user input
        :param upd: the current number of training steps (model updates)
        :param verbose: whether to enable verbose output
        :return: None
        '''
        prev_act, prev_slot = self.state['prevact'].split('@')

        s_t = to_tokens(user_utterance)
        slot_match = self._search_slots(s_t)  # search slots
        val_match = self._search_values(s_t)  # search values

        for slot, values in val_match.iteritems():
            requested = (prev_act == 'request') and (prev_slot == slot)
            matched = (slot in slot_match)
            if not values:
                if requested:  # asked for value but did not get it; stop caring about this slot!
                    self.state['database'].delete_slot(slot)
                    self.state['num_requests'][slot] = 1000
                    self.state['dont_care'].add(slot)
            else:
                for y, match in values.iteritems():
                    # match records something like a support score
                    #y = self.movie_dict.dict[slot].index(val)
                    if verbose:
                        v = self.movie_dict.dict[slot][y]
                        print "检测到:{}, 更新置信度:{}".format(
                            v.encode("utf8") if v is not None
                            and type(v) == unicode else v, match)
                        # print 'Detected %s' %v.encode("utf8") if v is not None and type(v) == unicode else v, ' update = ', match
                    if matched and requested:
                        alpha = upd * (match + 1. + slot_match[slot])
                    elif matched and not requested:
                        alpha = upd * (match + slot_match[slot])
                    elif not matched and requested:
                        alpha = upd * (match + 1.)
                    else:
                        alpha = upd * match
                    self.state['inform_slots'][slot][y] += alpha
                    # inform_slots actually records how well each value of each slot is known and is used to compute p by hand; likewise dont_care is used to compute q by hand
                self.state['slot_tracker'].add(slot)
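To make the four alpha branches concrete: with a hypothetical update rate upd = 0.5, a value-level match score of 1.0 and a slot-level match score of 1.0, the evidence weights work out as follows:

    upd = 0.5          # hypothetical update rate (UPD above)
    match = 1.0        # value-level score from _search_values
    slot_match = 1.0   # slot-level score from _search_slots

    print(upd * (match + 1. + slot_match))  # 1.5  matched and requested
    print(upd * (match + slot_match))       # 1.0  matched, not requested
    print(upd * (match + 1.))               # 1.0  requested, not matched
    print(upd * match)                      # 0.5  neither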
Example #19
    def _build_vocab_from_db(self, corpus):
        '''
        Build the global vocabulary from the database and store it in grams; the N-gram set actually covers all [1, N]-grams.
        :param corpus: path to the database text file
        :return: None
        '''

        # in Chinese there is simply no need for N-grams!
        with open(corpus, 'r') as f:
            for line in f:
                elements = line.strip().split('\t')[1:]
                for ele in elements:
                    # if '·' in ele:
                    #     tokens = ele.split('·')
                    # else:
                    tokens = to_tokens(ele)
                    for ngram in tokens:
                        if ngram.strip() != "" and ngram not in self.grams:
                            # print(ngram.encode("utf8") if ngram is not None and type(ngram) == unicode else ngram)
                            self.grams[ngram] = self.n
                            self.n += 1
Example #20
 def _prepare_for_search(self):
     self.slot_tokens = {}
     for slot in self.slots:
         self.slot_tokens[slot] = to_tokens(slot)
Example #21
 def _prepare_for_search(self):
     self.slot_tokens = {}
     for slot in self.slots:
         self.slot_tokens[slot] = to_tokens(slot)