Example #1
def test_text():
    cg = CorpusGraph()

    # Load the corpus model from a JSON file
    # cg.load_from_json()

    # Connect to MongoDB and build the corpus model
    cg.build_corpus()

    # Save as a JSON file
    cg.save_as_json()

    tg = TextGraph()

    # Read sentences from MongoDB for segmentation
    # sentences = tg.get_sentences(isRandom=False)

    sentences = ["准许原告肖振明撤回起诉"]

    # Build the graph model for the list of sentences
    tg.build(sentences)

    # Fill in the edge weights
    tg.fill_edge(cg)

    # Write the JSON file for the sentence graph; if path is None, return the JSON instead of saving it to disk
    tg.make_json(cg, path='./data/text.json')
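Example #1 builds the corpus graph from MongoDB, builds a text graph for one sentence, and weights its edges from the corpus. Below is a minimal end-to-end sketch of the same pipeline that also prints the segmentation; that tg.cut() returns one word list per input sentence is inferred from Examples #3-#5 further down:

def segment_demo():
    # Sketch only: assumes CorpusGraph and TextGraph are importable and that
    # tg.cut() returns one word list per input sentence (see later examples).
    cg = CorpusGraph()
    cg.build_corpus()      # build the corpus model from MongoDB
    tg = TextGraph()
    tg.build(["准许原告肖振明撤回起诉"])
    tg.fill_edge(cg)       # weight the text-graph edges from the corpus
    print(tg.cut()[0])     # the segmented word list for the sentence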
Example #2
def test_text():
    cg = CorpusGraph()
    cg.build_corpus()
    # Sorted co-occurrence neighbours of the character '一' (result unused here)
    cg.get_sorted_neighbour('一')
    # print("###############")
    # for cge in cg.corpus.edges:
    #     print(cge)
    #     break
    # print('###', cg.corpus['朝'])

    tg = TextGraph()
    sentences = tg.get_sentences(isRandom=False)
    tg.build(sentences)
    tg.fill_edge(cg)
    tg.make_json(cg)
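The commented-out debug lines above suggest that cg.corpus is a networkx-style graph whose edges and per-node adjacency can be inspected directly. A small sketch under that assumption:

def inspect_corpus():
    # Assumption: cg.corpus behaves like a networkx graph, as the
    # commented-out debug code in Example #2 implies.
    cg = CorpusGraph()
    cg.build_corpus()
    for cge in list(cg.corpus.edges)[:5]:  # a few co-occurrence edges
        print(cge)
    print(cg.corpus['朝'])  # adjacency (weighted neighbours) of '朝'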
Example #3
def tokenize():
    # Assumes module-level globals from the app: cg (CorpusGraph),
    # jieba_checker, thulac_checker, time_count, json, and Flask's request.
    if request.method == 'GET':
        tg = TextGraph()
        sentence = "没有输入"

        # Get the sentence to segment from the query parameters
        if request.args.get('sentence', '') != "":
            sentence = request.args.get('sentence', '')
        tg.build([sentence])
        tg.fill_edge(cg)

        # For now, only segment a single sentence
        time_count(print_to_console=False)
        result = tg.cut()[0]
        time_count("segmentation done")
        check_jieba = jieba_checker.check(sentence, result)
        time_count("jieba segmentation done")
        check_thulac = thulac_checker.check(sentence, result)
        time_count("thulac segmentation done")

        # jieba's segmentation result and its overlap with ours
        jieba_result = check_jieba["jieba_result"]
        jieba_overlap = check_jieba["overlap"]

        thulac_result = check_thulac["thulac_result"]
        thulac_overlap = check_thulac["overlap"]
        res = json.dumps(
            {
                "graph": tg.make_json(cg, path=None),
                "result": result,
                "jieba": {
                    "words": jieba_result,
                    "overlap": "%.2f" % jieba_overlap
                },
                "thulac": {
                    "words": thulac_result,
                    "overlap": "%.2f" % thulac_overlap
                }
            },
            ensure_ascii=False)
        # print("json dumping")
        # res = json.dumps(
        #     {"graph": tg.make_json(cg, path=None), "result": result,
        #      "jieba": jieba_result, "jieba_overlap": jieba_overlap,
        #      },
        #     ensure_ascii=False)
        print("server returned")
        return res
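A hypothetical client call against this handler; the route path and port are assumptions (the @app.route decorator is not shown), but the response shape matches the json.dumps above:

import requests  # hypothetical client; route path and port are assumptions

resp = requests.get("http://localhost:5000/tokenize",
                    params={"sentence": "准许原告肖振明撤回起诉"})
data = resp.json()
print(data["result"])            # this segmenter's word list
print(data["jieba"]["overlap"])  # overlap with jieba, e.g. "0.83"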
Example #4
def tokenize(sentence):
    tg = TextGraph()
    tg.build([sentence])
    tg.fill_edge(cg)

    # For now, only segment a single sentence
    result = tg.cut()[0]
    check = checker.check(sentence, result)

    jieba_result = check["jieba_result"]
    overlap = check["overlap"]
    res = json.dumps(
        {"result": result, "jieba": jieba_result, "overlap": overlap},
        ensure_ascii=False)
    return res
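checker.check() returns jieba's segmentation together with an overlap score, but the metric itself is not shown in these examples. One plausible reading, sketched as the ratio of shared word boundaries (illustrative only, not necessarily the checker's actual formula):

def boundary_overlap(words_a, words_b):
    # Illustrative only: fraction of word boundaries two segmentations
    # share; not necessarily what checker.check() computes.
    def boundaries(words):
        cuts, pos = set(), 0
        for w in words:
            pos += len(w)
            cuts.add(pos)
        return cuts
    a, b = boundaries(words_a), boundaries(words_b)
    return len(a & b) / max(len(a | b), 1)

print(boundary_overlap(["准许", "原告", "肖振明", "撤回", "起诉"],
                       ["准许", "原告", "肖", "振明", "撤回", "起诉"]))  # 0.83...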
Example #5
def tokenize(sentence):
    tg = TextGraph()
    tg.build([sentence])
    tg.fill_edge(cg)

    # For now, only segment a single sentence
    result = tg.cut()[0]
    jieba_check = jieba_checker.check(sentence, result)
    thulac_check = thulac_checker.check(sentence, result)

    jieba_result = jieba_check["jieba_result"]
    jieba_overlap = jieba_check["overlap"]

    thulac_result = thulac_check["thulac_result"]
    thulac_overlap = thulac_check["overlap"]
    res = {"sentence": sentence, "result": result, "jieba": jieba_result, "jieba_overlap": jieba_overlap,"thulac":thulac_result,"thulac_overlap":thulac_overlap}
    return res
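Unlike Example #4, this variant returns a plain dict instead of a JSON string, leaving serialization to the caller. A hypothetical driver, assuming the same module-level globals (cg, jieba_checker, thulac_checker) and that json is imported:

info = tokenize("准许原告肖振明撤回起诉")
print(info["result"])
print("jieba overlap:  %.2f" % info["jieba_overlap"])
print("thulac overlap: %.2f" % info["thulac_overlap"])
print(json.dumps(info, ensure_ascii=False))  # serialize only when needed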