# coding:utf8 # @Time : 18-7-9 上午10:29 # @Author : evilpsycho # @Mail : [email protected] import pysparnn.cluster_index as ci from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.externals.joblib import load, dump from chatbot.utils.log import get_logger logger = get_logger("TfidfQA") class TfidfQA(object): ''' 词频-逆文本频率 ''' def __init__(self, path=None): ''' :param path: 文件路径 ''' if path is None: self._tfidf = TfidfVectorizer() self._ci = None else: self._tfidf = load(path + ".tfidf") self._ci = load(path + ".cluster_index") def fit(self, queries, queries_idx): """
# coding:utf8 # @Time : 18-5-14 下午2:13 # @Author : evilpsycho # @Mail : [email protected] import os import sys from pathlib import Path sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import opencc from chatbot.utils.log import get_logger from chatbot.utils.wrapper import time_counter logger = get_logger("Text Preprocessing T2S") @time_counter def traditional2simple(input_path, output_path): """繁体转简体 :param input_path: :param output_path: :return: """ # input check assert os.path.exists(input_path) # output check path = Path(output_path).resolve() path.parent.mkdir(exist_ok=True) # convert
# @Time : 5/14/18 22:07 # @Author : evilpsycho # @Mail : [email protected] from pathlib import Path import sys sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import gensim import numpy as np from chatbot.utils.log import get_logger from chatbot.utils.wrapper import time_counter from chatbot.config.constant import PAD, PAD_IDX, UNK, UNK_IDX from chatbot.cparse.vocabulary import Vocabulary logger = get_logger("Word2vec") class Word2vecExt(object): def __init__(self): self.model = None def _input2sentences(self, inputs, **kwargs): if isinstance(inputs, str): path = Path(inputs).resolve() if path.is_dir(): sentences = gensim.models.word2vec.PathLineSentences( inputs, **kwargs) else: sentences = gensim.models.word2vec.LineSentence( inputs, **kwargs)
# coding:utf8 # @Time : 18-6-6 上午9:37 # @Author : evilpsycho # @Mail : [email protected] import datetime as dt from chatbot.preprocessing.text import cut from chatbot.core.context import Context from threading import Timer from chatbot.utils.log import get_logger from chatbot.utils.chat_record import chat_record TIMEOUT = 60 logger = get_logger("chatbot") class ChatBot(object): def __init__(self, vocab, label, intent_model, intent_rule, ner, intent2skills): self.contexts = dict() self.intent2skills = intent2skills self.intent_model = intent_model self.intent_rule = intent_rule self.ner = ner skills = [] for i, s in self.intent2skills.items(): if s not in skills: skills.append(s) self.skills = skills self.vocab = vocab self.label = label self._delete_timeout_context()
# @Author : evilpsycho # @Mail : [email protected] import pickle import numpy as np import torch from torch import nn import torch.nn.functional as F from sklearn.metrics import accuracy_score from chatbot.core.serializable import Serializable from chatbot.core.trainable import Trainable from chatbot.core.estimator import Estimator from chatbot.utils.path import MODEL_PATH from chatbot.utils.log import get_logger logger = get_logger("intent model") class BaseIntentModel(nn.Module, Serializable, Trainable, Estimator): def __init__(self, param: dict, opt=torch.optim.Adam, metric=accuracy_score, loss=F.cross_entropy, save_path="default"): super().__init__() self.param = param self.opt = None self.loss = loss self.metric = metric
# -*- coding: utf-8 -*- # @Time : 5/12/18 13:23 # @Author : evilpsycho # @Mail : [email protected] from chatbot.utils import path from chatbot.utils.log import get_logger from pathlib import Path import multiprocessing as mp import jieba CPU = mp.cpu_count() logger = get_logger("Text cut") SEG_VOCAB_PATH = Path(path.ROOT_PATH, "config", "vocab_jieba_seg").resolve().absolute() jieba.load_userdict(str(SEG_VOCAB_PATH)) jieba.initialize() def _cut2list(x): return list(jieba.cut(x)) def _cut2str(x): return " ".join(jieba.cut(x)) # @time_counter def cut(x, n_job=None, join=False): """ 分词功能,接收一个字符串,返回分词结果
# coding:utf8 # @Time : 18-6-11 下午2:41 # @Author : evilpsycho # @Mail : [email protected] import random import codecs from chatbot.core.skill import BaseSkill from chatbot.utils.path import ROOT_PATH from chatbot.utils.log import get_logger logger = get_logger("simple skill") def read_txt(path): """read txt :return: <list> """ # path = "D:\\Users\\tanmx\\chatbot\\Task-Oriented-Chatbot\\corpus\\skill\\GoodBye_response.txt" with open(path, "r", encoding='UTF-8') as f: txts = f.readlines() # remove chomp, blank sents = [item.strip().split(' ')[-1] for item in txts if len(item) > 1] return sents class LeaveMessage(BaseSkill): """LeaveMessage存储及回复封装 :param context: context :return: <String> 回复信息,context{user:,query} to txt
# @Time : 18-5-21 下午2:10 # @Author : evilpsycho # @Mail : [email protected] from chatbot.config.constant import UNDEFINE, UNDEFINE_IDX from chatbot.utils.log import get_logger from chatbot.cparse.dictionary import Dictionary from chatbot.utils.path import ROOT_PATH def get_intent_labels(name): with open(str(ROOT_PATH / "config" / name), "r") as f: labels = [l.rstrip("\n") for l in f.readlines()] return labels logger = get_logger(__name__) class IntentLabel(Dictionary): def __init__(self): # TODO: init class for config intent super().__init__() def init_from_config(self, name): intent_labels = get_intent_labels(name) for label in intent_labels: self._add_one(label) self.training = False def fit(self, x): """
# -*- coding: utf-8 -*- # @Time : 5/12/18 13:22 # @Author : evilpsycho # @Mail : [email protected] from gensim.corpora import WikiCorpus import sys from pathlib import Path sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) from chatbot.utils.log import get_logger from chatbot.utils.wrapper import time_counter logger = get_logger("Wiki Extract") @time_counter def wiki_extract(input_file, output_file): """wiki下载文件提取文本内容脚本 :param input_file: 原始文件路径 :param output_file: 提取文件路径 :return: None """ # 原始文件是否存在 assert Path(input_file).resolve().exists() # 提取文件路径不存在就新建 output_file_path = Path(output_file).resolve() output_file_path.parent.mkdir(exist_ok=True) logger.info("Start extract wiki ..") wiki = WikiCorpus(input_file, lemmatize=False) with open(output_file, "w", encoding="utf8") as f:
# coding:utf8 # @Time : 18-6-25 下午2:41 # @Author : evilpsycho # @Mail : [email protected] import sys sys.path.append("/home/zhouzr/project/Task-Oriented-Chatbot") from chatbot.utils.log import get_logger logger = get_logger("test") from chatbot.intent.models.fast_text import FastText from chatbot.intent.rules.rule_v1 import IntentRuleV1 from chatbot.ner.rules.rule_ner import NerRuleV1 from chatbot.cparse.label import IntentLabel from chatbot.cparse.vocabulary import Vocabulary from chatbot.skills.simple import SayHi, Thanks, Praise, Criticize, GoodBye, LeaveMessage, CompanyInfo, BusinessInfo from chatbot.skills.botQA import BotQA from chatbot.skills.data_query import TestDataQuery, DataInquiry from chatbot.skills.help import Help from chatbot.skills.safe import SafeResponse from chatbot.skills.tuling import Tuling from chatbot.skills.file_retrieval import FileRetrievalExt from chatbot.bot import ChatBot from chatbot.utils.path import MODEL_PATH from wxpy import Bot intent_model = FastText.load( str(MODEL_PATH / "v0.21" / "intent_model.FastText")) intent_rule = IntentRuleV1() ner = NerRuleV1() label = IntentLabel.load(str(MODEL_PATH / "v0.21" / "label")) vocab = Vocabulary.load(str(MODEL_PATH / "v0.21" / "vocab")) file_retrieval = FileRetrievalExt(
# coding:utf8 # @Time : 18-6-7 上午10:51 # @Author : evilpsycho # @Mail : [email protected] from chatbot.core.entity import TimeInterval, Location, Company, Tag from chatbot.ner.rules.company_ner import CompanyNer from chatbot.ner.rules.time_ner import TimeNer from chatbot.ner.rules.location_ner import LocationNer from chatbot.ner.rules.tag_ner import TagNer from chatbot.utils.log import get_logger logger = get_logger("NER") class NerRuleV1: def __init__(self): super().__init__() self.ner_company = CompanyNer() self.ner_loc = LocationNer() self.ner_time = TimeNer() self.ner_tag = TagNer() def extract(self, context): """ :param context: context["query"] :return: <dict of list> {"TimeInterval": ["", ""]} """ rst = {} ext_time = self.ner_time.extract(context)
# coding:utf8 # @Time : 18-5-15 下午4:16 # @Author : evilpsycho # @Mail : [email protected] from torch import nn import torch from torch.nn import functional as F from chatbot.intent.models.base_intent_model import BaseIntentModel from chatbot.utils.log import get_logger logger = get_logger("TextCNN") class TextCNN(BaseIntentModel): def __init__(self, param: dict): super().__init__(param) ci = 1 # input chanel size kernel_num = param['kernel_num'] # output chanel size kernel_size = param['kernel_size'] vocab_size = param['vocab_size'] embed_dim = param['embed_dim'] dropout = param['dropout'] class_num = param['class_num'] self.param = param self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=1) self.conv11 = nn.Conv2d(ci, kernel_num, (kernel_size[0], embed_dim)) self.conv12 = nn.Conv2d(ci, kernel_num, (kernel_size[1], embed_dim)) self.conv13 = nn.Conv2d(ci, kernel_num, (kernel_size[2], embed_dim)) self.dropout = nn.Dropout(dropout) self.fc1 = nn.Linear(len(kernel_size) * kernel_num, class_num)
# -*- coding: utf-8 -*-
# @Time   : 5/12/18 14:17
# @Author : evilpsycho
# @Mail   : [email protected]
import time
from functools import wraps
from chatbot.utils.log import get_logger

time_counter_logger = get_logger("Time Counter")


def time_counter(func):
    """Decorator that logs *func*'s wall-clock running time in minutes.

    :param func: the callable to wrap
    :return: a wrapper forwarding all arguments and the return value
        unchanged; the elapsed time is logged at INFO level as
        ``"<name> running X.XX min"``
    """
    @wraps(func)
    def time_it(*args, **kwargs):
        # perf_counter is monotonic: unlike time.time() the measurement
        # cannot jump (or go negative) when the system clock is adjusted.
        start = time.perf_counter()
        rst = func(*args, **kwargs)
        elapsed_min = (time.perf_counter() - start) / 60
        # lazy %-args: the message is only formatted if the record is emitted
        time_counter_logger.info("%s running %.2f min",
                                 func.__name__, elapsed_min)
        return rst
    return time_it