def frequency_calculate(self):
    """Compute the generation frequency of every templated question.

    Exactly four questions are expected in ``self.questions``; each one is
    scored with ``self.question2probability``.

    Returns:
        list: one frequency value per question, in the order of
        ``self.questions``.

    Raises:
        Exception: if ``self.questions`` does not contain exactly four items.
    """
    # Compare by value. The previous ``is not 4`` tested object identity,
    # which is only correct by accident (CPython small-int caching) and
    # triggers a SyntaxWarning on Python 3.8+.
    if len(self.questions) != 4:
        raise Exception('模板数不为4,请重新配置!')

    fq_list = [self.question2probability(self.questions[i]) for i in range(4)]
    log('warning', '频度计算:{}-->{}'.format(self.questions, fq_list))
    return fq_list
def origin_to_train(self):
    """Convert the raw corpus into the training file.

    Reads ``self.origin_path`` line by line, passes every line through
    ``self.cut`` and writes the returned text to ``self.train_path``.
    Both files are handled as UTF-8.
    """
    # ``with`` guarantees both handles are closed even when ``self.cut`` or
    # a write raises; the previous explicit close() calls were skipped on
    # any error.
    with codecs.open(self.origin_path, 'r', encoding='utf-8') as f, \
            codecs.open(self.train_path, 'w', encoding='utf-8') as target:
        # Line numbers are 1-based and only used for progress logging.
        for line_num, line in enumerate(f, 1):
            log('warning', '--------processing {} line---------------'.format(line_num))
            target.write(self.cut(line))

    log('warning', '源数据转换训练集成功!')
def jac_list(self, ser1, ser2):
    """Score two equally long sequences element by element.

    For every position the two items are converted to ``str``, passed
    through the module-level ``cut`` callable, materialised as lists and
    handed to ``self.jaccard``.

    Args:
        ser1: first indexable sequence.
        ser2: second indexable sequence; must have the same length as
            ``ser1``.

    Returns:
        list: one ``self.jaccard`` result per position. When the lengths
        differ, a warning is logged and the return value of ``log`` is
        returned instead.
    """
    size = len(ser1)
    if size != len(ser2):
        return log('warning', '数据集长度不一致,请调整后再转换!')

    scores = []
    for pos in range(size):
        # Items are looked up by index (not iterated) on purpose, so the
        # lookup semantics of the incoming containers are unchanged.
        left_tokens = list(cut(str(ser1[pos])))
        right_tokens = list(cut(str(ser2[pos])))
        scores.append(self.jaccard(left_tokens, right_tokens))
    return scores
def ranking(self):
    """Score every candidate and return the candidates ordered by score.

    The score of candidate ``i`` blends two terms, weighted by
    ``self.lamda``:

    * ``e ** (-self.sim_list[i] * self.alpha)``
    * ``1 / (1 + e ** (-self.fre_list[i] * self.beta))``

    Returns:
        list: ``(index, score)`` tuples sorted by score, highest first.
    """
    score_list = []
    for pos in range(self.len):
        sim_term = e ** (-self.sim_list[pos] * self.alpha)
        fre_term = 1 / (1 + e ** (-self.fre_list[pos] * self.beta))
        score_list.append((1 - self.lamda) * sim_term + self.lamda * fre_term)

    # The sort is stable, so equal scores keep their original index order.
    rank_list = sorted(enumerate(score_list), key=lambda pair: pair[1], reverse=True)

    highest = max(score_list)
    lowest = min(score_list)
    log('warning', '最大值:{},最小值{},差值{}'.format(highest, lowest, highest - lowest))
    log('warning', '排序打分{}'.format(rank_list))
    return rank_list
def train(self):
    """Train the RNN language model and persist it with its vocabulary.

    The model is trained for ``num_epochs`` epochs over ``self.ids`` in
    windows of ``seq_length`` columns, using cross-entropy loss and Adam.
    Afterwards the whole model object is saved to ``self.model_path`` and a
    word -> index dictionary is pickled to ``self.dict_path``.

    Hyper-parameters (``embed_size``, ``hidden_size``, ``num_layers``,
    ``batch_size``, ``seq_length``, ``num_epochs``, ``learning_rate``) and
    ``device`` are read from the enclosing module.
    """
    model = RNNLM(self.vocab_size, embed_size, hidden_size, num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Train the model
    for epoch in range(num_epochs):
        # Fresh zero hidden and cell states at the start of every epoch.
        states = (torch.zeros(num_layers, batch_size, hidden_size).to(device),
                  torch.zeros(num_layers, batch_size, hidden_size).to(device))

        for i in range(0, self.ids.size(1) - seq_length, seq_length):
            # Targets are the inputs shifted one position to the right.
            inputs = self.ids[:, i:i + seq_length].to(device)
            targets = self.ids[:, (i + 1):(i + 1) + seq_length].to(device)

            # Forward pass; states are detached first via self.detach.
            states = self.detach(states)
            outputs, states = model(inputs, states)
            loss = criterion(outputs, targets.reshape(-1))

            # Backward and optimize
            model.zero_grad()
            loss.backward()
            # clip_grad_norm_ is the supported in-place API; the
            # underscore-less clip_grad_norm is deprecated.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()

            step = (i + 1) // seq_length
            if step % 100 == 0:
                loss_value = loss.item()
                log('warning', 'Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
                    .format(epoch + 1, num_epochs, step, self.num_batches, loss_value, np.exp(loss_value)))

    # Save the model checkpoints
    torch.save(model, self.model_path)

    # Invert idx2word (index -> word) into word -> index for later lookups.
    rnn_dict = {word: idx for idx, word in self.corpus.dictionary.idx2word.items()}
    with open(self.dict_path, mode='wb') as f:
        pickle.dump(rnn_dict, f)
def searchs2templates(s1='', s2='', s3='', rank_id=None):
    """Fill the question templates that match the number of search terms.

    Three, two or one non-empty leading terms select the module-level
    template lists ``x0_3``, ``x0_2`` or ``x0_1`` respectively. Templates that
    are split are expected to contain two whitespace-separated parts, which
    are wrapped around the last term.

    Keyword arguments:
        s1 -- str, search term 1
        s2 -- str, search term 2
        s3 -- str, search term 3
        rank_id -- index of the template chosen by ranking, or None

    Returns:
        The list ``[question1, question2, question3, question4]`` when
        ``rank_id`` is None, otherwise only the question at ``rank_id``.

    Raises:
        Exception: if ``s1`` is empty (no usable search term).
    """
    if s1 and s2 and s3:
        t_list = []
        for i in range(4):
            parts = x0_3[i].split()
            t_list.append(s1 + s2 + parts[0] + s3 + parts[1])
    elif s1 and s2:
        t_list = []
        for i in range(4):
            parts = x0_2[i].split()
            t_list.append(s1 + parts[0] + s2 + parts[1])
    elif s1:
        # Only the second single-term template wraps around the term; the
        # other three are plain suffixes.
        wrap = x0_1[1].split()
        t_list = [
            s1 + x0_1[0],
            wrap[0] + s1 + wrap[1],
            s1 + x0_1[2],
            s1 + x0_1[3],
        ]
    else:
        raise Exception('搜索词提取错误!')

    if rank_id is None:
        return t_list
    log('warning', '{} {} {} --> {}'.format(s1, s2, s3, t_list[rank_id]))
    return t_list[rank_id]
def origin_to_train(self):
    """Build the feature CSV used for training from the raw CSV.

    Reads ``self.origin_path``, drops every row containing a missing value,
    derives three feature columns and writes them together with ``label``
    to ``self.train_path``:

    * ``x1_len`` / ``x2_len`` -- length of ``str(x1)`` / ``str(x2)``
    * ``jac`` -- per-row result of ``self.jac_list(x1, x2)``

    Returns:
        Whatever ``log`` returns for the success message.
    """
    # Read the data source.
    origin = pd.read_csv(self.origin_path, encoding='utf-8')
    # Pre-processing: drop every row that contains a missing value.
    origin = origin.dropna()
    # Renumber the rows 0..n-1 so the positional Series built below align.
    origin.index = range(len(origin))

    # Feature columns.
    origin['x1_len'] = pd.Series([len(str(i)) for i in origin.x1])
    origin['x2_len'] = pd.Series([len(str(i)) for i in origin.x2])
    origin['jac'] = pd.Series(self.jac_list(origin.x1, origin.x2))

    # Write the training set. The former ``date_format=float`` argument was
    # dropped: ``date_format`` expects a strftime string and only affects
    # datetime values, so passing the ``float`` type was a misuse.
    origin.to_csv(self.train_path,
                  columns=['x1_len', 'x2_len', 'jac', 'label'],
                  encoding='utf-8')
    return log('warning', '源数据转换成训练集成功!')
def train(self):
    """Fit a linear SVM on the training CSV and save the fitted model.

    The file at ``self.train_path`` is loaded as a float matrix (header row
    skipped). Columns 1-3 are used as features and everything from column 4
    onward as the label; column 0 is discarded (presumably the row index
    written by the CSV export -- confirm against the producer). 60% of the
    rows are used for fitting, the rest for scoring.

    Returns:
        Whatever ``log`` returns for the message reporting test accuracy.
    """
    # Load the training set.
    dataset = np.loadtxt(self.train_path, dtype=float, delimiter=',', skiprows=1)

    # Feature columns (column 0 skipped) and label column(s).
    features = dataset[:, 1:4]
    labels = dataset[:, 4:]

    # Fixed random_state keeps the train/test split reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, random_state=1, train_size=0.6)

    # Classifier settings (kernel: 'linear' = linear, 'rbf' = Gaussian).
    classifier = svm.SVC(C=1, kernel='linear', gamma='auto', decision_function_shape='ovr')
    classifier.fit(x_train, y_train.ravel())

    # Persist the fitted model.
    joblib.dump(classifier, self.model_path)

    accuracy = round(classifier.score(x_test, y_test) * 100, 2)
    return log('warning', '训练SVM分类器成功!准确率为:{}%'.format(accuracy))
'政策法规', '弹性伸缩服务使用攻略', '云容灾', '云硬盘', '云硬盘备份', '弹性文件服务', '机器学习服务', '专属云', '通信平台云', '软件开发云使用攻略', 'MapReduce服务', '联络中心', '安全指数使用攻略', '云服务器备份', '数据查询服务', '云硬盘使用指南', '微服务云应用平台', 'OBS_Android_SDK', '新手指引', '云专线', '主机入侵检测', '综合上云迁移交付服务', '如何', '是什么', '有哪些', ] for i in topic_list: jieba.add_word(i.lower()) jieba.suggest_freq(i.lower(), True) log('info', '初始化用户词典完成!')
def correlation_calcuulate(self):
    """Reduce the search terms to a question or to the correlated terms.

    If any of ``self.x1``, ``self.x2``, ``self.x3`` is recognised by
    ``self.question``, that term is returned as-is. Otherwise the non-empty
    string terms are collected and filtered with ``self.corretale``:

    * one term    -> ``[t1]``
    * two terms   -> ``[t1, t2]`` if correlated, else ``[t2]``
    * three terms -> ``[t1, t2, t3]`` if both neighbouring pairs correlate,
      ``[t2, t3]`` if only the second pair does, ``[t1, t3]`` if only the
      first pair does, else ``[t3]``

    Returns:
        str: the question, when one of the terms is a question.
        list: the retained search terms otherwise.

    Raises:
        Exception: if no non-empty string term was supplied.
    """
    # A term that already is a question short-circuits everything else.
    for candidate in (self.x1, self.x2, self.x3):
        if self.question(candidate):
            log('warning',
                '生成问句:{} {} {}-->{}'.format(self.x1, self.x2, self.x3, candidate))
            return candidate

    # Keep the usable terms. Strings are compared by value (the previous
    # ``is not ''`` was an identity test) and checked with isinstance.
    terms = [
        term for term in (self.x1, self.x2, self.x3)
        if isinstance(term, str) and term != ''
    ]
    if not terms:
        raise Exception('请传入至少一个查询词')

    # From here on only the collected terms are used. Previously x1/x2/x3
    # were read directly, so an empty leading slot leaked '' into the result.
    if len(terms) == 1:
        only = terms[0]
        log('warning', '相关度计算:{}-->{}'.format(only, only))
        return [only]

    if len(terms) == 2:
        first, second = terms
        if self.corretale(first, second):
            log('warning',
                '相关度计算:{} {}-->{} {}'.format(first, second, first, second))
            return [first, second]
        log('warning', '相关度计算:{} {}-->{}'.format(first, second, second))
        return [second]

    first, second, third = terms
    # Evaluate each pair once instead of once per branch.
    head_related = self.corretale(first, second)
    tail_related = self.corretale(second, third)
    if head_related and tail_related:
        kept = [first, second, third]
    elif tail_related:
        kept = [second, third]
    elif head_related:
        kept = [first, third]
    else:
        kept = [third]

    # Joining with spaces renders the same text as the per-branch formats.
    log('warning',
        '相关度计算:{} {} {}-->{}'.format(first, second, third, ' '.join(kept)))
    return kept
t1 = set(t1) t2 = set(t2) t3 = set(t3) t4 = set(t4) t3.difference_update(t2) t4.difference_update(t2) t4.difference_update(t3) t1.difference_update(t2) t1.difference_update(t3) t1.difference_update(t4) log('info', '模板集初始化完成!') # 匹配模板 def templates(x1='', x2='', x3=''): """接受得到的搜索词,匹配相应搜索词个数的模板 Keyword arguments: x1 -- x1,搜索词1 x2 -- x2,搜索词2 x3 -- x3,搜索词3 Return: (搜索序列,模板号) """ if x1 and x2 and x3: s1 = x1 + ' ' + x2 + ' ' + x3