def train_with_gensim(self):
    """Load the word-vector text file into a gensim ``KeyedVectors`` model.

    ``self.status`` is set to 1 before loading and to 2 once the model has
    been stored on ``self.tc_wv_model``. A markdown notification is pushed
    before and after the load.
    """
    embedding_path = './Tencent_AILab_ChineseEmbedding.txt'

    self.status = 1
    start_notice = push.generate_rtx_markdown("gensim转子引擎开始加热")
    push.push_to_rtx(start_notice)

    # The embedding file is read in the plain-text word2vec format.
    self.tc_wv_model = KeyedVectors.load_word2vec_format(
        embedding_path,
        binary=False,
    )

    done_notice = push.generate_rtx_markdown("gensim转子引擎加热完毕")
    push.push_to_rtx(done_notice)
    self.status = 2
def switch_log_to_bot():
    """Flip the module-level ``log_to_bot`` flag and announce the new state.

    Returns:
        The literal string ``"succ"``.
    """
    global log_to_bot
    log_to_bot = not log_to_bot

    # Pick the announcement that matches the state we just switched to.
    notice = "bot调用日志已开启" if log_to_bot else "bot调用日志已关闭"
    push.push_to_rtx(push.generate_rtx_markdown(notice))
    return "succ"
def __init__(self):
    """Start in the untrained state and push a startup notification.

    ``self.status`` takes one of the following values:

    * 0 - not trained
    * 1 - gensim model is being trained
    * 2 - gensim model is available
    * 3 - annoy index is being trained
    * 4 - annoy index is available
    """
    self.status = 0
    startup_notice = push.generate_rtx_markdown("wordcalc出仓状态良好")
    push.push_to_rtx(startup_notice)
def do_word_calc():
    """Answer a word-arithmetic query and return the reply as a JSON string.

    For POST the query is the decoded request body; for every other method
    it is the ``q`` query parameter. The query is parsed into positive and
    negative terms, evaluated by ``word_calc.calc``, and one of several
    canned reply sentences is chosen at random.

    Returns:
        A JSON string with the keys ``ret``, ``msgtype``, ``text`` and
        ``pic``.
    """
    global word_calc, log_to_bot

    def build_reply(text):
        # Response envelope in the shape agreed with the caller.
        resp = {
            "ret": 0,
            "msgtype": "markdown",
            "text": text,
            "pic": "nopic"
        }
        return json.dumps(resp, indent=4, ensure_ascii=False)

    # Both GET and POST are accepted. Any other method that reaches this
    # view (Flask adds HEAD to GET routes automatically) is treated like GET
    # so that ``post_body`` is always bound.
    if request.method == 'POST':
        post_body = str(request.data.decode('utf-8'))
    else:
        # Default to "" so a missing ``q`` is not turned into the literal
        # string "None" and evaluated as if it were a real query.
        post_body = str(request.args.get("q", ""))

    if not post_body:
        return build_reply("喵喵喵?")

    # Parse the query into its positive and negative terms.
    pos, neg = parse_formular(post_body)

    # Evaluate with the gensim/annoy backend.
    result = word_calc.calc(pos, neg)
    if log_to_bot:
        push.push_to_rtx(
            push.generate_rtx_markdown(post_body + "=\r\n" + str(result)))

    # Drop candidates that merely repeat a word from the query itself.
    filtered_result = [
        item[0] for item in result
        if item[0] not in pos and item[0] not in neg
    ]

    # Build the candidate reply sentences.
    if not filtered_result:
        resp_choices = [
            "臣妾实在算不出啊",
            "算晕了,今天天气不错,用云计算试试?",
            "程序已崩溃",
            "爆炸倒计时: 3...2...1...",
        ]
    elif len(filtered_result) < 3:
        item = random.choice(filtered_result)
        resp_choices = [
            item,
            "也许等于" + item + "?",
            "答案是" + item + "~",
        ]
    else:
        # ``random.sample`` picks three distinct candidates; sampling with
        # replacement could repeat the same word inside one sentence.
        items = random.sample(filtered_result, k=3)
        resp_choices = [
            "大概也许是{0}、{1}或{2}".format(items[0], items[1], items[2]),
            "等于{0}".format(items[0]),
            "我算出来的结果是{0}".format(items[0]),
            "大概也许是{0}".format(items[0]),
            "答案是{0}、{1}或{2}".format(items[0], items[1], items[2]),
            "你是想说{0}和{1}吗?".format(items[0], items[1]),
        ]

    return build_reply(random.choice(resp_choices))
def train_with_annoy(self):
    """Build an ``AnnoyIndexer`` over the loaded word vectors and save it.

    ``self.status`` is set to 3 before the build and to 4 once the index has
    been stored on ``self.annoy_index`` and written to disk. A markdown
    notification is pushed before and after.
    """
    index_path = 'tc_index_genoy.index'

    self.status = 3
    start_notice = push.generate_rtx_markdown("annoy向量空间开始注水")
    push.push_to_rtx(start_notice)

    self.annoy_index = AnnoyIndexer(self.tc_wv_model, 200)

    # Export the trained index so that it can simply be loaded later on:
    #   annoy_index = AnnoyIndexer()
    #   annoy_index.load(fname)
    #   annoy_index.model = tc_wv_model
    self.annoy_index.save(index_path)

    done_notice = push.generate_rtx_markdown("annoy向量空间注水完毕")
    push.push_to_rtx(done_notice)
    self.status = 4
def do_push():
    """Fetch today's joke page, parse it, and push it as a card and markdown.

    If the page cannot be fetched or parsed (the helper returns ``None``),
    the failure is printed and nothing is pushed; previously execution fell
    through and handed ``None`` to the parser and push helpers.

    Returns:
        None.
    """
    ori_url = spider.get_today_joke_url()

    myhtml = spider.get_html_from_url(ori_url)
    if myhtml is None:
        print("spider failed")
        return

    myjokes = utils.get_qa_from_html(myhtml)
    if myjokes is None:
        print("parse failed")
        return

    # Build both payloads before sending either one.
    rtx_md = push.generate_rtx_markdown(myjokes)
    rtx_card = push.generate_rtx_cardinfo(myjokes, ori_url)
    push.push_to_rtx(rtx_card)
    push.push_to_rtx(rtx_md)
def train_with_annoy():
    """Run annoy training on the module-level ``word_calc`` object.

    A markdown notification is pushed before and after the training call.

    Returns:
        The literal string ``"succ"``.
    """
    global word_calc

    received_notice = push.generate_rtx_markdown("收到训练annoy请求")
    push.push_to_rtx(received_notice)

    word_calc.train_with_annoy()

    finished_notice = push.generate_rtx_markdown("annoy请求处理完毕")
    push.push_to_rtx(finished_notice)
    return "succ"
# 处理开关日志推送请求 @app.route('/switch', methods=('GET', 'POST')) def switch_log_to_bot(): global log_to_bot log_to_bot = not log_to_bot if (log_to_bot): push.push_to_rtx(push.generate_rtx_markdown("bot调用日志已开启")) else: push.push_to_rtx(push.generate_rtx_markdown("bot调用日志已关闭")) return "succ" # 处理训练annoy有损匹配模型请求 @app.route('/annoy', methods=('GET', 'POST')) def train_with_annoy(): global word_calc push.push_to_rtx(push.generate_rtx_markdown("收到训练annoy请求")) word_calc.train_with_annoy() push.push_to_rtx(push.generate_rtx_markdown("annoy请求处理完毕")) return "succ" return app if __name__ == "__main__": # 将数据导入 gensim word_calc.train_with_gensim() # 拉起 flask web 服务 push.push_to_rtx(push.generate_rtx_markdown("flask初号机已就位")) create_app().run(host='0.0.0.0', port=5000)
#!/usr/bin/env python # -*- coding: utf-8 -*- from gensim.models import KeyedVectors from collections import OrderedDict import json import time import push try: from gensim.similarities.index import AnnoyIndexer except ImportError: print('import gensim.annoy error') push.push_to_rtx(push.generate_rtx_markdown("gensim引导失败")) raise ValueError("anny indexer 加载失败") class WordCalc: def __init__(self): # 0: 未训练 # 1: 正在训练gensim版 # 2: gensim版可用 # 3: 正在训练annoy # 4: annoy版可用 self.status = 0 push.push_to_rtx(push.generate_rtx_markdown("wordcalc出仓状态良好")) def train_with_gensim(self): self.status = 1 push.push_to_rtx(push.generate_rtx_markdown("gensim转子引擎开始加热")) self.tc_wv_model = KeyedVectors.load_word2vec_format( './Tencent_AILab_ChineseEmbedding.txt', binary=False)
#!/usr/bin/env python # -*- coding: utf-8 -*- print('hello python') import push push.push_to_rtx(push.generate_rtx_markdown("python姿态测试正常"))