Example 1
def _prepare_q_a(q_lines: list, a_lines: list, qa_dict: dict):
    for q, a in zip(q_lines, a_lines):
        q = q.strip()
        a = a.strip()
        qa_dict[q] = dict()
        # Extract named entities: segment by word, then keep every token whose POS tag is kc (course)
        ret = cut(q, by_character=False, with_pos=True)
        # [('产品经理', 'kc'), ('的', 'uj'), ('课程', 'n'), ('有', 'v'), ('什么', 'r'), ('特点', 'n'), ('?', 'x')]
        qa_dict[q]["entity"] = [i[0] for i in ret if i[1] == "kc" or i[1] == "shisu"]
        qa_dict[q]["q_cut_by_word"] = [i[0] for i in ret]
        qa_dict[q]["q_cut_by_char"] = cut(q, by_character=True)
        qa_dict[q]["answer"] = a
Example 2
def predict():
    s2s.eval()
    sentence = input("请输入句子: ")
    # Convert the sentence into a token sequence
    sentence = cut(sentence, by_character=by_char)
    s2s_input = config.s2s_input
    s2s_target = config.s2s_target
    seq_len = config.seq_len
    feature = s2s_input.transform(sentence, seq_len)
    # Build the feature and feature_length tensors
    feature = torch.LongTensor(feature).to(config.device).unsqueeze(0)
    feature_length = torch.LongTensor([min(len(sentence),
                                           seq_len)]).to(config.device)
    # Predict
    y_predict = s2s.evaluate(feature, feature_length)
    # Rearrange dimensions (make the output batch-first)
    y_predict = y_predict.permute(1, 0, 2)
    # Take the argmax over the last dimension as the prediction
    pred = y_predict.argmax(dim=-1)
    # Convert to a Python list
    pred = pred.squeeze().detach().numpy().tolist()
    # Convert the indices back into tokens
    pred = s2s_target.inverse_transform(pred)
    # Join the tokens and truncate at the EOS marker
    pred = "".join(pred).split("EOS")[0]
    print("预测结果为:", pred)
Example 3
def process_xiaohuangji(f_train, f_test):
    """处理小黄鸡语料"""
    num_train = 0
    num_test = 0
    ret = open(xiaohuangji_path, encoding='UTF-8').readlines()
    flag = 0
    for line in tqdm(ret, desc='小黄鸡'):
        # Sentences of length 1 should perhaps be dropped
        if line.startswith("E"):
            flag = 0
            continue
        elif line.startswith("M"):
            if flag == 0:
                line = line[1:].strip()
                flag = 1
            else:
                continue
        line_cuted = cut(line)
        if not keywords_in_line(line_cuted):
            line_cuted = " ".join(line_cuted) + "\t" + "__label__chat"
            if random.choice(flags) == 0:
                num_train += 1
                f_train.write(line_cuted + "\n")
            else:
                num_test += 1
                f_test.write(line_cuted + "\n")
    return num_train, num_test
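For context, the flag logic above implies the Xiaohuangji corpus alternates an "E" separator line with "M"-prefixed question/answer lines. A minimal, self-contained re-run of that parsing on an in-memory sample (the sample lines are made up):

sample = ["E\n", "M 你好\n", "M 你好呀\n", "E\n", "M 在吗\n", "M 在的\n"]
flag = 0
questions = []
for line in sample:
    if line.startswith("E"):
        flag = 0  # the next M line starts a new pair
        continue
    if line.startswith("M") and flag == 0:
        questions.append(line[1:].strip())  # keep only the first M of each pair
        flag = 1
print(questions)  # -> ['你好', '在吗']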
Example 4
def cut_file(file_path, out_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()
    out = ''
    for line in lines:
        # cut() returns a token list; join with spaces and keep the line break
        out += " ".join(cut(line)) + "\n"
    with open(out_path, 'w') as f:
        f.write(out)
Example 5
def process_byhand_data(file):
    num = 0
    for line in tqdm(open(byhand_path, "r", encoding="utf-8").readlines(),
                     desc="问答对"):
        line = line.strip()
        line_cut = cut(line)

        line_cut = " ".join(line_cut) + "\t" + "__label__QA"
        num += 1
        file.write(line_cut + "\n")
    return num
Example 6
 def predict(self, sentence):
     self.s2s.eval()
     # Convert the sentence into a token sequence
     sentence = cut(sentence, by_character=by_char)
     feature = config.s2s_input.transform(sentence, config.seq_len)
     # Build the feature and feature_length tensors
     feature = torch.LongTensor(feature).to(config.device).unsqueeze(0)
     feature_length = torch.LongTensor([min(len(sentence), config.seq_len)]).to(config.device)
     # Predict
     y_predict = self.s2s.evaluate_beam_search(feature, feature_length)
     # Pick one beam-search result at random, convert it back into a sentence, and return it
     return "".join(config.s2s_target.inverse_transform(random.choice(y_predict)))
Example 7
 def predict(self, sentence, recall_list):
     # Broadcast the user's question to the same length as recall_list
     sentence_list = [sentence] * len(recall_list)
     sentence_cut = [cut(i, by_character=True) for i in sentence_list]
     recall_cut = [cut(i, by_character=True) for i in recall_list]
     # [['python', '好', '学', '吗'], ['python', '好', '学', '吗'], ['python', '好', '学', '吗']]
     # [['python', '难', '吗'], ['蒋', '夏', '梦', '是', '谁'], ['c', '语', '言', '好', '就', '业', '吗']]
     sentence_cut = [
         self.ws.transform(i, config.seq_len) for i in sentence_cut
     ]
     recall_cut = [self.ws.transform(i, config.seq_len) for i in recall_cut]
     q1 = torch.LongTensor(sentence_cut)
     q2 = torch.LongTensor(recall_cut)
     out = self.model(q1, q2)  # [batch_size, 2]; the last column is the sentence-match probability
     value, index = torch.topk(out[:, -1], k=1, dim=0)
     value = value.item()
     index = index.item()
     # Apply the threshold
     if value > config.sort_threshold:  # above the threshold: return that question's answer
         return self.qa_dict[recall_list[index]]["answer"]
     else:
         return "这个问题我也还没学到啊!"
Example 8
 def predict(self, sentence: str):
     """
     输入问题,返回最相近的问题
     :param sentence: 要搜索的问题
     :return:
     """
     sentence_cut = [" ".join(cut(sentence, by_character=by_char))]
     # e.g. ['python 真的 很 简单 吗 ?', '什么 是 产品经理 ?'], tokens separated by spaces
     search_vector = self.vectorizer.transform(sentence_cut)
     search_results = self.search_index.search(
         search_vector,
         k=config.recall_nums,
         k_clusters=config.recall_clusters,
         num_indexes=2,
         return_distance=True)
     # [[('0.0', '蒋夏梦是谁?'), ('1.0', 'python真的很简单吗?'), ('1.0', '什么是产品经理?'), ('1.0', '什么样的人适合做产品经理呀?')]]
     final_result = list()
     # Filter by named entity
     # Extract the entities from the user's question
     sentence_cut_with_pos = cut(sentence,
                                 by_character=False,
                                 with_pos=True)
     q_entity = [
         i[0] for i in sentence_cut_with_pos
         if i[1] == "kc" or i[1] == "shisu"
     ]
     # Check whether a candidate shares an entity with the question
     for i in search_results:
         for j in i:
             matched_q = j[1]
             matched_q_entity = self.s2v.qa_dict[matched_q]["entity"]
             if len(set(matched_q_entity) & set(q_entity)) > 0:  # set intersection
                 final_result.append(matched_q)
     # If any candidates share an entity, return those questions
     if len(final_result) > 0:
         return final_result
     # Otherwise fall back to the unfiltered search results
     else:
         return [j[1] for i in search_results for j in i]
Example 9
def process_crawled_corpus(fout_train, fout_test, by_char):
    """处理爬取的数据"""
    num = 0
    fin = open(config.by_crawl_path, "r").readlines()
    for line in tqdm(fin, desc="Processing Crawled Corpus"):
        q = " ".join(cut(line, by_character=by_char)).strip()  # 分词
        q += "\t__label__QA"
        if random.randint(0, N + 1) == 0:
            fout_test.write(q + "\n")
        else:
            fout_train.write(q + "\n")
        num += 1
    return num
Example 10
def extract_and_cut_question(by_char=True):
    num = 0
    fin = open(config.by_hand_path, "r").read()  # 读取json文件
    fin_dic = json.loads(fin)  # 用json直接读也行
    with open(
            config.recall_corpus_by_char
            if by_char else config.recall_corpus_by_word, "w") as fout:
        for q_list_list in tqdm(fin_dic.values(),
                                desc="Processing Homemade Corpus"):  # lists nested inside lists
            for q_list in q_list_list:
                for q in q_list:
                    q = " ".join(cut(q, by_character=by_char)).strip()  # 分词
                    fout.write(q + "\n")
                    num += 1
    print(num)
Example 11
def process_crawled_data(f_train, f_test):
    """处理爬取的数据"""
    num_train = 0
    num_test = 0
    for line in tqdm(open(crawled_path, encoding='UTF-8').readlines(),
                     desc='crawled'):
        line_cuted = cut(line)
        line_cuted = " ".join(line_cuted).replace("\n",
                                                  "") + "\t" + "__label__QA"
        if random.choice(flags) == 0:
            num_train += 1
            f_train.write(line_cuted + "\n")
        else:
            num_test += 1
            f_test.write(line_cuted + "\n")
    return num_train, num_test
Example 12
def predict_beam_search():
    s2s.eval()
    sentence = input("请输入句子: ")
    # Convert the sentence into a token sequence
    sentence = cut(sentence, by_character=by_char)
    s2s_input = config.s2s_input
    s2s_target = config.s2s_target
    seq_len = config.seq_len
    feature = s2s_input.transform(sentence, seq_len)
    # Build the feature and feature_length tensors
    feature = torch.LongTensor(feature).to(config.device).unsqueeze(0)
    feature_length = torch.LongTensor([min(len(sentence),
                                           seq_len)]).to(config.device)
    # Predict
    y_predict = s2s.evaluate_beam_search(feature, feature_length)
    # Convert each beam result back into a sentence
    for i in y_predict:
        print("".join(s2s_target.inverse_transform(i)))
Example 13
def process_by_hand(fout_train, fout_test, by_char):
    """处理手工构造的句子"""
    num = 0
    fin = open(config.by_hand_path, "r").read()  # 读取json文件
    # fin_dic = eval(fin)  # 将文件转成字典
    fin_dic = json.loads(fin)  # 用json直接读也行
    for q_list_list in tqdm(fin_dic.values(),
                            desc="Processing Homemade Corpus"):  # lists nested inside lists
        for q_list in q_list_list:
            for q in q_list:
                if "校区" in q:
                    continue
                q = " ".join(cut(q, by_character=by_char)).strip()  # 分词
                q += "\t__label__QA"
                if random.randint(0, N + 1) == 0:
                    fout_test.write(q + "\n")
                else:
                    fout_train.write(q + "\n")
                num += 1
    return num
Example 14
def process_byhand_data(f_train, f_test):
    """处理手工构造的数据"""
    num_train = 0
    num_test = 0
    total_lines = json.loads(open(byhand_path, encoding='UTF-8').read())
    for key in total_lines:
        for lines in tqdm(total_lines[key], desc='byhand'):
            for line in lines:
                # Skip a few unwanted questions
                if "校区" in line:
                    continue
                line_cuted = cut(line)
                line_cuted = " ".join(line_cuted) + "\t" + "__label__QA"

                if random.choice(flags) == 0:
                    num_train += 1
                    f_train.write(line_cuted + "\n")
                else:
                    num_test += 1
                    f_test.write(line_cuted + "\n")
    return num_train, num_test
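Examples 10, 13, and 14 all walk the same hand-built JSON with a triple loop, which implies the file maps each key to a list of question lists. A hypothetical sample of that layout (all content made up):

by_hand_sample = {
    "python": [
        ["python好学吗?", "python难不难?"],
        ["学python有前途吗?"],
    ]
}
for q_list_list in by_hand_sample.values():  # each value: a list of lists
    for q_list in q_list_list:
        for q in q_list:
            print(q)  # each q is a single question string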
Example 15
def process_xiaohuangji(fout_train, fout_test, by_char):
    num = 0
    fin = open(config.xiaohuangji_path, "r").readlines()
    first_m_flag = True  # whether this is the first M line of a pair
    for line in tqdm(fin, desc="Processing Xiaohuangji Corpus"):
        if line.startswith("M"):  # 句子以M开头
            if first_m_flag:
            if not keywords_in_line(line):  # line contains none of the specified keywords
                line = line[2:].strip()  # strip the leading M
                if len(line) > 1:  # drop sentences of length 1
                        line_cut = " ".join(cut(
                            line,
                            by_character=by_char)).strip()  # 将分词后的结果,用空格连接在一起
                        line_cut += "\t__label__chat"  # 添加类别信息
                        # 将数据按照4:1的比例分为训练集和测试集
                        if random.randint(0, N + 1) == 0:
                            fout_test.write(line_cut + "\n")  # 加上换行
                        else:
                            fout_train.write(line_cut + "\n")
                        num += 1
            first_m_flag = not first_m_flag
    return num
Example 16
def process_xiaohuangji(file):
    # TODO sentences of length 1: consider dropping them
    num = 0
    flag = 0  # initialize so the first M line is handled even without a leading E
    for line in tqdm(open(xiaohuangji_path, "r", encoding="utf-8").readlines(),
                     desc="小黄鸡"):
        if line.startswith("E"):
            flag = 0
            continue
        elif line.startswith("M"):
            if flag == 0:
                line = line[1:].strip()
                if len(line) == 1:
                    continue
                flag = 1
            else:
                continue

        line_cut = cut(line)
        if not keywords_in_line(line_cut):
            line_cut = " ".join(line_cut) + "\t" + "__label__chat"
            num += 1
            file.write(line_cut + "\n")
    return num
Example 17
def prepare_xiaohuangji(by_char=False):
    """
    准备小黄鸡问答语料
    :param by_char: 是否按照字符切分
    """
    with open(config.xiaohuangji_path, mode="r", encoding="utf-8") as fin:
        with open(config.chatbot_input_by_char_path
                  if by_char else config.chatbot_input_by_word_path,
                  mode="w",
                  encoding="utf-8") as f_input:  # 存储问
            with open(config.chatbot_target_by_char_path
                      if by_char else config.chatbot_target_by_word_path,
                      mode="w",
                      encoding="utf-8") as f_target:  # 存储答
                text = fin.readlines()
                num = 0
                lines = list()  # temporary buffer for one QA pair
                for line in tqdm(text, desc="Processing Xiaohuangji Corpus"):
                    if line.startswith("E"):
                        continue
                    elif line.startswith("M"):
                        lines.append(replace_emoji(
                            line.strip()[2:]))  # strip the leading M and remove emoticons
                    if len(lines) == 2:
                        # Drop sentences matching the filter rules
                        lines = [
                            " ".join(cut(i, by_character=by_char)) + "\n"
                            for i in lines if not filter_line(i)
                        ]
                        # If both question and answer survive the filter, write the pair out
                        if len(lines) == 2:
                            f_input.write(lines[0])
                            f_target.write(lines[1])
                            num += 1
                        # Reset the buffer
                        lines = list()
                print("{} QA Pairs Write".format(num))
Example 18
from prepar_corpus.prepar_user_dict.test_user_dict import test_user_dict
from lib import cut
from lib import stopwords
if __name__ == '__main__':
    t = "python难不难,不是很难,哈,啊"
    print(cut(t, with_sg=False, use_stopwords=True))
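Every snippet above relies on a shared cut helper. A minimal sketch of what such a helper might look like, assuming it wraps jieba with a custom user dictionary (which would supply the kc/shisu POS tags seen in Examples 1 and 8) and an optional stopword filter; the parameter names follow the call sites above (some variants spell with_pos as with_sg), and the real project helper may differ:

import jieba
import jieba.posseg as psg

def cut(sentence, by_character=False, with_pos=False, use_stopwords=False,
        stopwords=frozenset()):
    """Tokenize sentence by word (default) or by character."""
    sentence = sentence.strip()
    if by_character:
        # Character-level split: keep every non-space character
        tokens = [c for c in sentence if c.strip()]
    elif with_pos:
        # Word-level split with POS tags, e.g. [('产品经理', 'kc'), ('的', 'uj')]
        tokens = [(p.word, p.flag) for p in psg.cut(sentence)]
    else:
        tokens = jieba.lcut(sentence)
    if use_stopwords:
        # Drop stopwords (compare the token text when POS tags are attached)
        tokens = [t for t in tokens
                  if (t[0] if with_pos else t) not in stopwords]
    return tokens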