コード例 #1
0
    def frequency_calculate(self):
        """Compute the generation frequency of every templated question.

        Each entry of ``self.questions`` is passed to
        ``self.question2probability`` and the results are collected in order.

        Returns:
            list: one frequency value per question, in the order of
            ``self.questions``.

        Raises:
            Exception: if ``self.questions`` does not hold exactly 4 entries.
        """
        # Compare by value: ``is not 4`` tested object identity and only
        # worked by accident of CPython's small-integer cache.
        if len(self.questions) != 4:
            raise Exception('模板数不为4,请重新配置!')

        fq_list = [self.question2probability(question)
                   for question in self.questions]

        log('warning', '频度计算:{}-->{}'.format(self.questions, fq_list))

        return fq_list
コード例 #2
0
ファイル: train_models.py プロジェクト: pzs741/QGDT
 def origin_to_train(self):
     """Segment the raw corpus line by line and write it out as the training set.

     Reads ``self.origin_path`` (UTF-8), passes every line through
     ``self.cut`` and writes the result to ``self.train_path`` (UTF-8).
     Progress is logged with a 1-based line counter.
     """
     # Context managers guarantee both files are closed even when
     # ``self.cut`` or the write raises part-way through the corpus.
     with codecs.open(self.origin_path, 'r', encoding='utf-8') as source:
         with codecs.open(self.train_path, 'w', encoding='utf-8') as target:
             for line_num, line in enumerate(source, 1):
                 log('warning', '--------processing {} line---------------'.format(line_num))
                 target.write(self.cut(line))
     log('warning', '源数据转换训练集成功!')
コード例 #3
0
ファイル: train_models.py プロジェクト: pzs741/QGDT
 def jac_list(self, ser1, ser2):
     """Return the element-wise Jaccard scores of two equally long series.

     For every position, both entries are converted to ``str``, tokenised
     with ``cut`` and the two token lists are handed to ``self.jaccard``.

     Returns:
         list of ``self.jaccard`` results, or whatever ``log`` returns when
         the two series differ in length (a warning is logged in that case).
     """
     size = ser1.__len__()
     if size != ser2.__len__():
         return log('warning', '数据集长度不一致,请调整后再转换!')

     return [
         self.jaccard(list(cut(str(ser1[pos]))), list(cut(str(ser2[pos]))))
         for pos in range(size)
     ]
コード例 #4
0
    def ranking(self):
        """Score every candidate and rank the candidates by score.

        The score of candidate ``k`` blends two terms weighted by
        ``self.lamda``: ``e ** (-sim_list[k] * alpha)`` and
        ``1 / (1 + e ** (-fre_list[k] * beta))``.

        Returns:
            list of ``(index, score)`` tuples sorted by score, highest first.
        """
        score_list = []
        for pos in range(self.len):
            sim_term = e ** (-self.sim_list[pos]*self.alpha)
            fre_term = 1 / (1 + e ** (-self.fre_list[pos]*self.beta))
            score_list.append((1-self.lamda) * sim_term + self.lamda * fre_term)

        # sorted() is stable, exactly like list.sort() on the paired list.
        rank_list = sorted(enumerate(score_list),
                           key=lambda pair: pair[1], reverse=True)

        highest = max(score_list)
        lowest = min(score_list)
        log('warning',
            '最大值:{},最小值{},差值{}'.format(highest, lowest, highest - lowest))
        log('warning', '排序打分{}'.format(rank_list))
        return rank_list
コード例 #5
0
ファイル: train_models.py プロジェクト: pzs741/QGDT
    def train(self):
        """Train an ``RNNLM`` on ``self.ids`` and persist the model and vocabulary.

        Hyper-parameters (``embed_size``, ``hidden_size``, ``num_layers``,
        ``learning_rate``, ``num_epochs``, ``batch_size``, ``seq_length``) and
        ``device`` are read from names defined outside this method.

        Side effects:
            * saves the whole model object to ``self.model_path`` via
              ``torch.save``;
            * pickles a dict built from ``self.corpus.dictionary.idx2word``
              with keys and values swapped to ``self.dict_path``.
        """
        model = RNNLM(self.vocab_size, embed_size, hidden_size, num_layers).to(device)

        # Loss and optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

        # Train the model
        for epoch in range(num_epochs):
            # Set initial hidden and cell states (re-zeroed at the start of every epoch)
            states = (torch.zeros(num_layers, batch_size, hidden_size).to(device),
                      torch.zeros(num_layers, batch_size, hidden_size).to(device))

            # Walk over the token matrix in non-overlapping windows of seq_length columns
            for i in range(0, self.ids.size(1) - seq_length, seq_length):
                # Get mini-batch inputs and targets (targets are the inputs shifted by one column)
                inputs = self.ids[:, i:i + seq_length].to(device)
                targets = self.ids[:, (i + 1):(i + 1) + seq_length].to(device)

                # Forward pass
                # presumably self.detach cuts the states off the previous
                # window's graph so backprop stays within one window - TODO confirm
                states = self.detach(states)
                outputs, states = model(inputs, states)
                loss = criterion(outputs, targets.reshape(-1))

                # Backward and optimize
                model.zero_grad()
                loss.backward()
                # NOTE(review): if this is torch.nn.utils.clip_grad_norm, it is
                # deprecated in favour of clip_grad_norm_ - confirm the import.
                clip_grad_norm(model.parameters(), 0.5)
                optimizer.step()

                # Log loss and perplexity (exp of the loss) every 100 steps
                step = (i + 1) // seq_length
                if step % 100 == 0:
                    log('warning', 'Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
                        .format(epoch + 1, num_epochs, step, self.num_batches, loss.item(), np.exp(loss.item())))

        # Save the model checkpoints
        torch.save(model, self.model_path)
        # Inverted vocabulary: idx2word values become keys, idx2word keys become values
        rnn_dict = dict(zip(self.corpus.dictionary.idx2word.values(), self.corpus.dictionary.idx2word.keys()))
        with open(self.dict_path, mode='wb') as f:
            pickle.dump(rnn_dict, f)
コード例 #6
0
def _fill_template(template, prefix, middle):
    """Return ``prefix + head + middle + tail`` for a two-part template.

    ``template`` is split on whitespace; its first two pieces are used as
    ``head`` and ``tail``.
    """
    parts = template.split()
    return prefix + parts[0] + middle + parts[1]


def searchs2templates(s1='', s2='', s3='', rank_id=None):
    """Expand the search terms into the four question templates that fit them.

    The template family is chosen by how many leading terms are non-empty:
    ``x0_3`` for three terms, ``x0_2`` for two, ``x0_1`` for one.

    Keyword arguments:
    s1                  -- str, search term 1
    s2                  -- str, search term 2
    s3                  -- str, search term 3
    rank_id             -- index of the template picked by ranking, or None
    Return:
        the list of four questions when ``rank_id`` is None, otherwise only
        the question at position ``rank_id``
    Raises:
        Exception if ``s1`` is empty (no usable search term).
    """
    if s1 and s2 and s3:
        t_list = [_fill_template(x0_3[k], s1 + s2, s3) for k in range(4)]
    elif s1 and s2:
        t_list = [_fill_template(x0_2[k], s1, s2) for k in range(4)]
    elif s1:
        # Only the second single-term template wraps the term; the other
        # three are plain suffixes.
        t_list = [
            s1 + x0_1[0],
            _fill_template(x0_1[1], '', s1),
            s1 + x0_1[2],
            s1 + x0_1[3],
        ]
    else:
        raise Exception('搜索词提取错误!')

    # Identity test: ``== None`` can be hijacked by a custom __eq__.
    if rank_id is None:
        return t_list

    log('warning', '{} {} {} --> {}'.format(s1, s2, s3, t_list[rank_id]))
    return t_list[rank_id]
コード例 #7
0
ファイル: train_models.py プロジェクト: pzs741/QGDT
 def origin_to_train(self):
     """Turn the raw CSV at ``self.origin_path`` into the feature CSV at ``self.train_path``.

     Rows containing any missing value are dropped and the index is renumbered
     from 0. Three feature columns are added: the string lengths of ``x1`` and
     ``x2`` and the per-row result of ``self.jac_list``. Only ``x1_len``,
     ``x2_len``, ``jac`` and ``label`` are written out.

     Returns:
         whatever ``log`` returns for the success message.
     """
     frame = pd.read_csv(self.origin_path, encoding='utf-8').dropna()
     # Renumber rows so the new Series below align with the frame by position.
     frame.index = range(len(frame))

     for column in ('x1', 'x2'):
         frame[column + '_len'] = pd.Series([len(str(value)) for value in frame[column]])
     frame['jac'] = pd.Series(self.jac_list(frame.x1, frame.x2))

     frame.to_csv(
         self.train_path,
         columns=['x1_len', 'x2_len', 'jac', 'label'],
         encoding='utf-8',
         date_format=float,
     )
     return log('warning', '源数据转换成训练集成功!')
コード例 #8
0
ファイル: train_models.py プロジェクト: pzs741/QGDT
    def train(self):
        """Fit a linear-kernel SVM on the training CSV and save it to ``self.model_path``.

        The CSV is loaded without its header row. Columns before index 4 are
        the features (the first of them is dropped) and the remaining columns
        are the label. 60% of the rows are used for fitting; the rest are used
        to compute the accuracy reported in the log message.

        Returns:
            whatever ``log`` returns for the success message.
        """
        dataset = np.loadtxt(self.train_path, dtype=float, delimiter=',', skiprows=1)

        # Split features from labels at column 4, then drop the first feature column.
        features, labels = np.split(dataset, (4,), axis=1)
        features = features[:, 1:]

        x_train, x_test, y_train, y_test = train_test_split(
            features, labels, random_state=1, train_size=0.6)

        # Linear kernel ('rbf' would select the Gaussian kernel instead).
        clf = svm.SVC(C=1, kernel='linear', gamma='auto', decision_function_shape='ovr')
        clf.fit(x_train, y_train.ravel())

        joblib.dump(clf, self.model_path)

        accuracy = round(clf.score(x_test, y_test) * 100, 2)
        return log('warning', '训练SVM分类器成功!准确率为:{}%'.format(accuracy))
コード例 #9
0
ファイル: __init__.py プロジェクト: youstair/QGDT
    '政策法规',
    '弹性伸缩服务使用攻略',
    '云容灾',
    '云硬盘',
    '云硬盘备份',
    '弹性文件服务',
    '机器学习服务',
    '专属云',
    '通信平台云',
    '软件开发云使用攻略',
    'MapReduce服务',
    '联络中心',
    '安全指数使用攻略',
    '云服务器备份',
    '数据查询服务',
    '云硬盘使用指南',
    '微服务云应用平台',
    'OBS_Android_SDK',
    '新手指引',
    '云专线',
    '主机入侵检测',
    '综合上云迁移交付服务',
    '如何',
    '是什么',
    '有哪些',
]
# Register every topic (lower-cased) with jieba at import time: add it to the
# dictionary, then call suggest_freq with tune=True on the same word.
# presumably this keeps each topic from being split during segmentation - TODO confirm
for i in topic_list:
    jieba.add_word(i.lower())
    jieba.suggest_freq(i.lower(), True)
log('info', '初始化用户词典完成!')
コード例 #10
0
    def correlation_calcuulate(self):
        """Reduce the raw search terms to a question or to the relevant terms.

        If any of ``self.x1``, ``self.x2``, ``self.x3`` (checked in that
        order) is already a question according to ``self.question``, it is
        returned unchanged. Otherwise the non-empty string terms are counted
        and ``self.corretale`` decides which neighbouring terms are kept.

        Returns:
            str: the first term recognised as a question, or
            list: the search terms that were kept.

        Raises:
            Exception: if none of the three terms is a non-empty string.
        """
        # A term that is already a question short-circuits everything else.
        for candidate in (self.x1, self.x2, self.x3):
            if self.question(candidate):
                log('warning',
                    '生成问句:{} {} {}-->{}'.format(self.x1, self.x2, self.x3,
                                                candidate))
                return candidate

        # Keep only real, non-empty strings. Value comparison replaces the
        # original identity test ``is not ''``, which relied on interning.
        terms = [term for term in (self.x1, self.x2, self.x3)
                 if isinstance(term, str) and term != '']
        if not terms:
            raise Exception('请传入至少一个查询词')

        # NOTE(review): the branches below read self.x1/self.x2/self.x3
        # directly instead of ``terms``, i.e. they assume the terms are filled
        # in order (x1 first) - confirm against callers.
        if len(terms) == 1:
            log('warning', '相关度计算:{}-->{}'.format(self.x1, self.x1))
            return [self.x1]

        # Each pair is evaluated once; the original re-ran corretale for the
        # same pair in every elif arm.
        first_related = self.corretale(self.x1, self.x2)

        if len(terms) == 2:
            if first_related:
                log('warning',
                    '相关度计算:{} {}-->{} {}'.format(self.x1, self.x2,
                                                 self.x1, self.x2))
                return [self.x1, self.x2]
            log('warning',
                '相关度计算:{} {}-->{}'.format(self.x1, self.x2, self.x2))
            return [self.x2]

        # Three terms: combine the verdicts of (x1, x2) and (x2, x3).
        second_related = self.corretale(self.x2, self.x3)
        if first_related and second_related:
            log('warning',
                '相关度计算:{} {} {}-->{} {} {}'.format(self.x1, self.x2, self.x3,
                                                   self.x1, self.x2, self.x3))
            return [self.x1, self.x2, self.x3]
        if second_related:
            log('warning',
                '相关度计算:{} {} {}-->{} {}'.format(self.x1, self.x2, self.x3,
                                                self.x2, self.x3))
            return [self.x2, self.x3]
        if first_related:
            log('warning',
                '相关度计算:{} {} {}-->{} {}'.format(self.x1, self.x2, self.x3,
                                                self.x1, self.x3))
            return [self.x1, self.x3]
        log('warning',
            '相关度计算:{} {} {}-->{}'.format(self.x1, self.x2, self.x3,
                                         self.x3))
        return [self.x3]
コード例 #11
0
ファイル: similarity_calculate.py プロジェクト: youstair/QGDT
# Turn the four template collections into sets and make them pairwise
# disjoint. When an entry occurs in several sets it survives only in the
# highest-priority one: t2 first, then t3, then t4, then t1.
t1, t2, t3, t4 = set(t1), set(t2), set(t3), set(t4)

t3 -= t2
t4.difference_update(t2, t3)
t1.difference_update(t2, t3, t4)

log('info', '模板集初始化完成!')


# 匹配模板
def templates(x1='', x2='', x3=''):
    """接受得到的搜索词,匹配相应搜索词个数的模板

    Keyword arguments:
    x1                  -- x1,搜索词1
    x2                  -- x2,搜索词2
    x3                  -- x3,搜索词3
    Return:
        (搜索序列,模板号)
    """
    if x1 and x2 and x3:
        s1 = x1 + ' ' + x2 + ' ' + x3