def test_text():
    """Build the corpus graph, export it, and dump a one-sentence text graph.

    Steps, in order: build the corpus model, save it as JSON, build a
    ``TextGraph`` for a single hard-coded sentence, fill in its edge
    weights from the corpus graph, and write the result to
    ``./data/text.json``.
    """
    cg = CorpusGraph()

    # Alternative: load the corpus model from a JSON file.
    # cg.load_from_json()

    # Connect to MongoDB and build the corpus model.
    cg.build_corpus()

    # Save the corpus model as a JSON file.
    cg.save_as_json()

    tg = TextGraph()

    # Alternative: read the sentences to segment from MongoDB.
    # sentences = tg.get_sentences(isRandom=False)
    sentences = ["准许原告肖振明撤回起诉"]

    # Build the graph model for the list of sentences.
    tg.build(sentences)

    # Fill in the edge weights.
    tg.fill_edge(cg)

    # Write the JSON file needed by the sentence graph; if path is None the
    # JSON is returned instead of being saved to disk.
    tg.make_json(cg, path='./data/text.json')
def test_text():
    """Build the corpus graph and emit the text-graph JSON for stored sentences.

    Builds the corpus model, queries the sorted neighbours of one character
    (the result is discarded), then builds a ``TextGraph`` from the sentences
    returned by ``get_sentences(isRandom=False)``, fills in its edge weights
    and calls ``make_json`` without a path.
    """
    cg = CorpusGraph()
    cg.build_corpus()
    cg.get_sorted_neighbour('一')

    # Debug output, kept disabled:
    # print("###############")
    # for cge in cg.corpus.edges:
    #     print(cge)
    #     break
    # print('###', cg.corpus['朝'])

    tg = TextGraph()
    sentences = tg.get_sentences(isRandom=False)
    tg.build(sentences)
    tg.fill_edge(cg)
    tg.make_json(cg)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from Network import CorpusGraph
from Network import TextGraph
from ResultReference import JiebaChecker
from ResultReference import ThulacChecker
from IO import DisIO

# Module-level corpus graph shared by every tokenize() call.
cg = CorpusGraph()
# cg.build_corpus()
# cg.save_as_json('./data/ten.json')
cg.load_from_json('./data/corpus_50k.json')

# Checkers that compare a segmentation result against jieba and thulac.
jieba_checker = JiebaChecker()
thulac_checker = ThulacChecker()


def tokenize(sentence):
    """Segment one sentence and compare the result with jieba and thulac.

    Args:
        sentence: The sentence to segment.
    """
    tg = TextGraph()
    tg.build([sentence])
    tg.fill_edge(cg)

    # For now only a single sentence is segmented, hence the [0].
    result = tg.cut()[0]

    jieba_check = jieba_checker.check(sentence, result)
    thulac_check = thulac_checker.check(sentence, result)

    jieba_result = jieba_check["jieba_result"]
    jieba_overlap = jieba_check["overlap"]
    thulac_result = thulac_check["thulac_result"]
    thulac_overlap = thulac_check["overlap"]
    # NOTE(review): the visible snippet ends here; whatever consumes these
    # values (e.g. a return statement) is outside this view — confirm.
from flask import Flask
from flask import request
from flask import send_from_directory
from flask import send_file
from IO import RemoteIO
from Network import CorpusGraph
from Network import TextGraph
from ResultReference import JiebaChecker, ThulacChecker
from utl import count as time_count
import os
import json

# Build the corpus graph model from a JSON file.
cg = CorpusGraph()
cg.load_from_json()

# Checkers used to proofread the segmentation results.
jieba_checker = JiebaChecker()
thulac_checker = ThulacChecker()

rio = RemoteIO()

app = Flask(__name__,
            template_folder='./presentation',
            static_folder='./presentation')


@app.route('/')
def hello_world():
    """Serve ``./presentation/WordLink.html`` for the site root."""
    return send_file('./presentation/WordLink.html')
def make_corpus():
    """Build the corpus graph, then load the corpus model from JSON."""
    cg = CorpusGraph()
    cg.build_corpus()
    # cg.save_as_json()
    # NOTE(review): loading right after building looks redundant — confirm
    # that this ordering is intended.
    cg.load_from_json()
import sys

from Network import CorpusGraph

# Command-line words; "build" together with "toJson" triggers a rebuild.
cmds = sys.argv

cg = CorpusGraph()

# NOTE(review): both calls are assumed to belong to the `if` body — the
# collapsed original does not show the indentation; confirm.
if "build" in cmds and "toJson" in cmds:
    cg.build_corpus()
    cg.save_as_json()