def _build_vocab_from_corpus(self, corpus):
    if not os.path.isfile(corpus):
        return

    def _index_ngrams(f):
        # Index every [1,N]-gram of each line into self.grams.
        for line in f:
            tokens = to_tokens(line.rstrip())
            for i in range(len(tokens)):
                for t in range(self.N):
                    if i - t < 0:
                        continue
                    ngram = '_'.join(tokens[i - t:i + 1])
                    if ngram not in self.grams:
                        self.grams[ngram] = self.n
                        self.n += 1

    try:
        f = io.open(corpus, 'r')
        _index_ngrams(f)
        f.close()
    except UnicodeDecodeError:
        # Fall back to the built-in open() when io.open cannot decode the file.
        f = open(corpus, 'r')
        _index_ngrams(f)
        f.close()
def _build_vocab_from_db(self, corpus):
    def _index_ngrams(f):
        # Index every [1,N]-gram of each slot value into self.grams.
        for line in f:
            # Rows are tab-separated; column 0 is the entity ID, the rest are slot values.
            elements = line.rstrip().split('\t')[1:]
            for ele in elements:
                tokens = to_tokens(ele)
                for i in range(len(tokens)):
                    for t in range(self.N):
                        if i - t < 0:
                            continue
                        ngram = '_'.join(tokens[i - t:i + 1])
                        if ngram not in self.grams:
                            self.grams[ngram] = self.n
                            self.n += 1

    try:
        f = io.open(corpus, 'r')
        _index_ngrams(f)
        f.close()
    except UnicodeDecodeError:
        # Fall back to the built-in open() when io.open cannot decode the file.
        f = open(corpus, 'r')
        _index_ngrams(f)
        f.close()
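# A minimal sketch of the [1,N]-gram windowing used above, pulled out of the
# class for illustration (the helper name and example tokens are not part of
# the original code):
def _all_ngrams(tokens, N):
    grams = []
    for i in range(len(tokens)):
        for t in range(N):
            if i - t < 0:
                continue
            grams.append('_'.join(tokens[i - t:i + 1]))
    return grams

# _all_ngrams(['tom', 'hanks'], 2) -> ['tom', 'hanks', 'tom_hanks']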
def _corrupt_value(self, val):
    def _is_int(s):
        try:
            int(s)
            return True
        except ValueError:
            return False

    def _is_float(s):
        try:
            float(s)
            return True
        except ValueError:
            return False

    # For Chinese we do not use NLTK:
    # tokens = nltk.word_tokenize(val)
    tokens = to_tokens(val)
    if len(tokens) > 1:
        # Drop a random token to simulate a partial mention.
        tokens.pop(random.randrange(len(tokens)))
    out = []
    for t in tokens:
        if _is_int(t):
            # Perturb numeric values with Gaussian noise.
            out.append(str(int(random.gauss(int(t), 0.5))))
        elif _is_float(t):
            out.append('%.1f' % random.gauss(float(t), 0.5))
        else:
            out.append(t)
    return ' '.join(out)
def _update_state(self, user_utterance, upd=UPD, verbose=False):
    prev_act, prev_slot = self.state['prevact'].split('@')
    s_t = to_tokens(user_utterance)
    slot_match = self._search_slots(s_t)  # search slots
    val_match = self._search_values(s_t)  # search values
    for slot, values in val_match.iteritems():
        requested = (prev_act == 'request') and (prev_slot == slot)
        matched = (slot in slot_match)
        if not values:
            if requested:
                # Asked for a value but did not get it.
                self.state['database'].delete_slot(slot)
                self.state['num_requests'][slot] = 1000
                self.state['dont_care'].add(slot)
        else:
            for y, match in values.iteritems():
                # y = self.movie_dict.dict[slot].index(val)
                if verbose:
                    print 'Detected %s' % self.movie_dict.dict[slot][y], ' update = ', match
                if matched and requested:
                    alpha = upd * (match + 1. + slot_match[slot])
                elif matched and not requested:
                    alpha = upd * (match + slot_match[slot])
                elif not matched and requested:
                    alpha = upd * (match + 1.)
                else:
                    alpha = upd * match
                self.state['inform_slots'][slot][y] += alpha
                self.state['slot_tracker'].add(slot)
def _search_values(self, s_t):
    '''
    Look up slot values that appear in the user input; this already operates on tokens.
    :param s_t: tokenized user input
    :return: per-slot match statistics
    '''
    print('-' * 100 + "\nsearching values: ")
    for v in s_t:
        print(v.encode("utf8") if v is not None and type(v) == unicode else v)
    matches = {}
    for slot in self.state['database'].slots:
        matches[slot] = defaultdict(float)
        for ss in s_t:
            if ss in self.movie_dict.tokens[slot]:
                for vi in self.movie_dict.tokens[slot][ss]:
                    matches[slot][vi] += 1.
        for vi, f in matches[slot].iteritems():
            val = self.movie_dict.dict[slot][vi]
            # The Chinese version drops the NLTK tokenizer:
            # matches[slot][vi] = f / len(nltk.word_tokenize(val))
            matches[slot][vi] = f / len(to_tokens(val))
    print('-' * 100)
    return matches
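# Worked example of the normalization above (slot name and value are illustrative):
#   s_t = [u'tom'] and movie_dict.dict['actor'][3] == u'tom hanks'
#   raw count  : matches['actor'][3] == 1.0
#   normalized : 1.0 / len(to_tokens(u'tom hanks')) == 0.5
# so a fully mentioned value scores 1.0 and a partial mention a fraction of it.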
def featurize(self, text):
    '''
    Build features for the text. The N-gram count vector is still filled in,
    but after the switch to Chinese only the embedding sequence is returned.
    :param text: natural-language text
    :return: (seq_max_len x embedding_dim matrix, true sequence length)
    '''
    vec = np.zeros((len(self.grams), )).astype('float32')
    embeddings = []
    # After switching to Chinese, N-grams are no longer necessary; embeddings
    # are used throughout.
    for ngram in to_tokens(text):
        if ngram in self.grams:
            vec[self.grams[ngram]] += 1.
        if ngram in self.embedding_vocab_t2n:
            embeddings.append(
                self.embedding_vectors[self.embedding_vocab_t2n[ngram]])
        else:
            embeddings.append(self.UNK_EMBEDDING)
    # True length, clamped since the matrix is truncated to seq_max_len below.
    seq_size = min(len(embeddings), self.seq_max_len)
    # Pad with the background embedding, then truncate to the fixed length.
    while len(embeddings) < self.seq_max_len:
        embeddings.append(self.BAK_EMBEDDING)
    embeddings = embeddings[:self.seq_max_len]
    cat_embedding = torch.cat([x.view(1, x.size()[0]) for x in embeddings], 0)
    # TODO: build the embedding matrix directly as a 2-D list / numpy array
    return cat_embedding.numpy(), seq_size
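# A usage sketch (the featurizer instance `feat`, the example utterance, and
# the GRU below are illustrative, not part of the original code): the padded
# matrix plus the true length are exactly what pack_padded_sequence expects.
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

mat, seq_size = feat.featurize(u'有 什么 电影')   # (seq_max_len, emb_dim), int
inputs = torch.from_numpy(mat).unsqueeze(0)       # (1, seq_max_len, emb_dim)
rnn = nn.GRU(input_size=inputs.size(2), hidden_size=64, batch_first=True)
packed = pack_padded_sequence(inputs, [seq_size], batch_first=True)
_, h_n = rnn(packed)                              # final hidden state (1, 1, 64)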
def _corrupt_value(self, val):
    def _is_int(s):
        try:
            int(s)
            return True
        except ValueError:
            return False

    def _is_float(s):
        try:
            float(s)
            return True
        except ValueError:
            return False

    # For Chinese we do not use NLTK:
    # tokens = nltk.word_tokenize(val)
    tokens = to_tokens(val)
    if len(tokens) > 1:
        # Multi-token value: drop one random token.
        tokens.pop(random.randrange(len(tokens)))
        out = set([' '.join(tokens)])
    else:
        t = tokens[0]
        out = set()
        if _is_int(t):
            # Numeric value: optionally add a Gaussian-perturbed variant.
            pert = round(random.gauss(0, 0.5))
            if pert > 0:
                out.add('%d' % (int(t) + pert))
            out.add(t)
        elif _is_float(t):
            pert = random.gauss(0, 0.5)
            if pert > 0.05:
                out.add('%.1f' % (float(t) + pert))
            out.add(t)
        else:
            out.add(t)
    return out
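# Example behaviour (stochastic; the inputs are illustrative): a multi-token
# value loses one token, a numeric value may gain a perturbed variant
# alongside the original.
#   _corrupt_value(u'tom hanks') -> set([u'tom']) or set([u'hanks'])
#   _corrupt_value(u'1995')      -> set(['1995']) or set(['1995', '1996'])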
def _vocab_search(self, text):
    tokens = to_tokens(text)
    for i in range(len(tokens)):
        for t in range(self.N):
            if i - t < 0:
                continue
            ngram = '_'.join(tokens[i - t:i + 1])
            if ngram in self.grams:
                return True
    return False
def _build_token_index(self):
    self.tokens = {}
    # Structure of tokens: {slot_name: {word_token: [slot value IDs]}}
    for slot, vals in self.dict.iteritems():
        # print "db slot: {}\nslot values: {}".format(slot.encode("utf-8"), vals)
        self.tokens[slot] = defaultdict(list)
        for vi, vv in enumerate(vals):
            w_v = to_tokens(vv)
            for w in w_v:
                self.tokens[slot][w].append(vi)
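# Illustrative shape of the resulting index (hypothetical slot values):
#   self.dict['actor'] == [u'tom hanks', u'tom cruise']
#   => self.tokens['actor'] == {u'tom': [0, 1], u'hanks': [0], u'cruise': [1]}
# i.e. each token maps to every value ID whose surface form contains it.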
def featurize(self, text):
    vec = np.zeros((len(self.grams), )).astype('float32')
    tokens = to_tokens(text)
    for i in range(len(tokens)):
        for t in range(self.N):
            if i - t < 0:
                continue
            ngram = '_'.join(tokens[i - t:i + 1])
            if ngram in self.grams:
                vec[self.grams[ngram]] += 1.
    return vec
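# Worked example with N = 2 (the vocabulary entries are illustrative): for the
# text 'tom hanks' the tokens are ['tom', 'hanks'] and the counted grams are
# 'tom', 'hanks' and 'tom_hanks'; each gram already present in self.grams
# bumps its slot in the |grams|-dimensional count vector by 1.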
def _vocab_search(self, text):
    '''
    Check, 2-gram style, whether the user input contains any known keyword.
    :param text: the user's natural-language sentence
    :return: whether a keyword is present, i.e. whether the input is meaningful
    '''
    tokens = to_tokens(text)
    for i in range(len(tokens)):
        for t in range(self.N):
            if i - t < 0:
                continue
            ngram = u''.join(tokens[i - t:i + 1])
            if ngram in self.grams:
                return True
    return False
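# Note the Chinese variant joins the window with u'' instead of '_': character
# tokens such as [u'北', u'京'] recombine into the keyword u'北京', so the
# 2-grams recover multi-character words that tokenization split apart.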
def _build_vocab_from_corpus(self, corpus):
    '''
    Build the global vocabulary from the dialogue corpus into self.grams;
    after the switch to Chinese every single token is an entry, so the
    original [1,N]-gram indexing is no longer needed.
    :param corpus: path to the dialogue text file
    :return: None
    '''
    if not os.path.isfile(corpus):
        return
    with open(corpus, 'r') as f:
        for line in f:
            for ngram in to_tokens(line.strip()):
                if ngram.strip() != "" and ngram not in self.grams:
                    self.grams[ngram] = self.n
                    # print(ngram.encode("utf8") if ngram is not None and type(ngram) == unicode else ngram)
                    self.n += 1
def _update_state(self, user_utterance, upd=UPD, verbose=False):
    '''
    Brute-force search the user utterance for keyword tokens and update the
    belief-tracker state in place.
    :param user_utterance: the user input
    :param upd: current number of training iterations (model updates)
    :param verbose: whether to enable verbose mode
    :return: None
    '''
    prev_act, prev_slot = self.state['prevact'].split('@')
    s_t = to_tokens(user_utterance)
    slot_match = self._search_slots(s_t)  # search slots
    val_match = self._search_values(s_t)  # search values
    for slot, values in val_match.iteritems():
        requested = (prev_act == 'request') and (prev_slot == slot)
        matched = (slot in slot_match)
        if not values:
            if requested:
                # Asked for the value but did not get it: stop caring about this slot.
                self.state['database'].delete_slot(slot)
                self.state['num_requests'][slot] = 1000
                self.state['dont_care'].add(slot)
        else:
            for y, match in values.iteritems():
                # `match` records something akin to a support score.
                # y = self.movie_dict.dict[slot].index(val)
                if verbose:
                    v = self.movie_dict.dict[slot][y]
                    print "Detected: {}, confidence update: {}".format(
                        v.encode("utf8") if v is not None and type(v) == unicode else v,
                        match)
                if matched and requested:
                    alpha = upd * (match + 1. + slot_match[slot])
                elif matched and not requested:
                    alpha = upd * (match + slot_match[slot])
                elif not matched and requested:
                    alpha = upd * (match + 1.)
                else:
                    alpha = upd * match
                self.state['inform_slots'][slot][y] += alpha
                # inform_slots records how well each value of each slot is known and is
                # used to hand-compute p; likewise dont_care is used to hand-compute q.
                self.state['slot_tracker'].add(slot)
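# The four alpha branches above collapse into a single expression (a sketch
# using the same names, not part of the original code):
#   alpha = upd * (match
#                  + (1. if requested else 0.)
#                  + (slot_match[slot] if matched else 0.))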
def _build_vocab_from_db(self, corpus):
    '''
    Build the global vocabulary from the database into self.grams; after the
    switch to Chinese every single token is an entry, so the original
    [1,N]-gram indexing is no longer needed.
    :param corpus: path to the database text file
    :return: None
    '''
    with open(corpus, 'r') as f:
        for line in f:
            elements = line.strip().split('\t')[1:]
            for ele in elements:
                # if '·' in ele:
                #     tokens = ele.split('·')
                # else:
                tokens = to_tokens(ele)
                for ngram in tokens:
                    if ngram.strip() != "" and ngram not in self.grams:
                        # print(ngram.encode("utf8") if ngram is not None and type(ngram) == unicode else ngram)
                        self.grams[ngram] = self.n
                        self.n += 1
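# The expected DB file layout (tab-separated; an illustrative row, not taken
# from the real data):
#   1\t周星驰\t喜剧\t1994
# Column 0 (the entity ID) is dropped by the [1:] slice; every remaining cell
# is tokenized and each non-empty token becomes a vocabulary entry.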
def _prepare_for_search(self):
    self.slot_tokens = {}
    for slot in self.slots:
        self.slot_tokens[slot] = to_tokens(slot)