def test_text():
    cg = CorpusGraph()
    # Load the corpus model from a JSON file
    # cg.load_from_json()
    # Connect to MongoDB and build the corpus model
    cg.build_corpus()
    # Save the corpus model as a JSON file
    cg.save_as_json()
    tg = TextGraph()
    # Read sentences from MongoDB for tokenization
    # sentences = tg.get_sentences(isRandom=False)
    sentences = ["准许原告肖振明撤回起诉"]
    # Build the graph model for the list of sentences
    tg.build(sentences)
    # Fill in the edge weights
    tg.fill_edge(cg)
    # Emit the JSON file needed by the sentence graph; if path is None,
    # the JSON is returned instead of being written to disk
    tg.make_json(cg, path='./data/text.json')
def test_text():
    cg = CorpusGraph()
    cg.build_corpus()
    cg.get_sorted_neighbour('一')
    tg = TextGraph()
    sentences = tg.get_sentences(isRandom=False)
    tg.build(sentences)
    tg.fill_edge(cg)
    tg.make_json(cg)
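# A minimal sketch of the faster startup path hinted at in test_text()
# above: load the corpus written by cg.save_as_json() instead of
# rebuilding it from MongoDB. The function name is hypothetical; the
# sample sentence is the one used in test_text().
def test_text_from_json():
    cg = CorpusGraph()
    cg.load_from_json()  # reuse the corpus saved by cg.save_as_json()
    tg = TextGraph()
    tg.build(["准许原告肖振明撤回起诉"])
    tg.fill_edge(cg)
    print(tg.cut()[0])  # the segmented sentence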
def tokenize():
    if request.method == 'GET':
        tg = TextGraph()
        sentence = "没有输入"  # default used when no input is given
        # Read the sentence to tokenize from the query string
        if request.args.get('sentence', '') != "":
            sentence = request.args.get('sentence', '')
        tg.build([sentence])
        tg.fill_edge(cg)
        # For now only a single sentence is tokenized
        time_count(print_to_console=False)
        result = tg.cut()[0]
        time_count("tokenization finished")
        check_jieba = jieba_checker.check(sentence, result)
        time_count("jieba check finished")
        check_thulac = thulac_checker.check(sentence, result)
        time_count("thulac check finished")
        # jieba's and thulac's segmentation results and overlap ratios
        jieba_result = check_jieba["jieba_result"]
        jieba_overlap = check_jieba["overlap"]
        thulac_result = check_thulac["thulac_result"]
        thulac_overlap = check_thulac["overlap"]
        res = json.dumps(
            {
                "graph": tg.make_json(cg, path=None),
                "result": result,
                "jieba": {
                    "words": jieba_result,
                    "overlap": "%.2f" % jieba_overlap
                },
                "thulac": {
                    "words": thulac_result,
                    "overlap": "%.2f" % thulac_overlap
                }
            },
            ensure_ascii=False)
        print("server returned")
        return res
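# A minimal sketch of how the GET handler above could be wired into
# Flask. The '/tokenize' route path, the port, and the
# JiebaChecker/ThulacChecker constructor names are assumptions for
# illustration; only request, cg, jieba_checker, thulac_checker, and
# the tokenize() view itself come from the handler above.
from flask import Flask, request

app = Flask(__name__)
cg = CorpusGraph()
cg.build_corpus()                 # shared corpus graph for all requests
jieba_checker = JiebaChecker()    # hypothetical checker constructors
thulac_checker = ThulacChecker()

# Register the tokenize() view defined above under GET /tokenize
app.add_url_rule('/tokenize', 'tokenize', tokenize, methods=['GET'])

if __name__ == '__main__':
    app.run(port=5000)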
def tokenize(sentence):
    tg = TextGraph()
    tg.build([sentence])
    tg.fill_edge(cg)
    # For now only a single sentence is tokenized
    result = tg.cut()[0]
    check = checker.check(sentence, result)
    jieba_result = check["jieba_result"]
    overlap = check["overlap"]
    res = json.dumps(
        {"result": result, "jieba": jieba_result, "overlap": overlap},
        ensure_ascii=False)
    return res
def tokenize(sentence):
    tg = TextGraph()
    tg.build([sentence])
    tg.fill_edge(cg)
    # For now only a single sentence is tokenized
    result = tg.cut()[0]
    jieba_check = jieba_checker.check(sentence, result)
    thulac_check = thulac_checker.check(sentence, result)
    jieba_result = jieba_check["jieba_result"]
    jieba_overlap = jieba_check["overlap"]
    thulac_result = thulac_check["thulac_result"]
    thulac_overlap = thulac_check["overlap"]
    res = {
        "sentence": sentence,
        "result": result,
        "jieba": jieba_result,
        "jieba_overlap": jieba_overlap,
        "thulac": thulac_result,
        "thulac_overlap": thulac_overlap,
    }
    return res
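# Usage sketch for the dict-returning tokenize() above, assuming cg and
# the two checkers are initialized at module level as in the Flask
# sketch; the sample sentence comes from test_text(). The overlap fields
# are the agreement ratios reported by the checkers.
if __name__ == '__main__':
    out = tokenize("准许原告肖振明撤回起诉")
    print(out["result"])         # segmentation from the graph-based cutter
    print(out["jieba_overlap"])  # agreement ratio with jieba's result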