Example #1
def create_ftset_ex():
    allwsdict = '/users/wenny/nju/task/法条文书分析/故意杀人罪/2014'
    allwsdict = '/盗窃罪'
    testwsdict = '../data/testjtzs1w'
    testdata = '../data/1w篇_事实到法条'
    dir = os.listdir(allwsdict)

    print(len(dir))
    resultList = random.sample(range(len(dir)), 10000)  # sample 10000 document indices
    for i in resultList:
        wsname = dir[i]
        src = allwsdict + '/' + wsname
        target = testwsdict + '/' + wsname
        shutil.copy(src, target)

    dir = os.listdir(testwsdict)
    for wsname in dir:
        print(wsname)
        wspath = testwsdict + '/' + wsname
        # wstestpath = testdata + '/' + wsname.split('.')[0] + '.txt'
        if wsname.endswith('.xml'):
            ftmcls, ftnrls = getFTList(wspath)
            zipob = zip(ftmcls, ftnrls)
            cols = []
            for ftmc, ftnr in zipob:
                cols.append(str(ftmc) + ':' + str(ftnr))
            wsStrls = cutcontent(getSSMatchObject(wspath))
            createx(wsname, wsStrls, cols, [], testdata)
Example #2
def findft(dictpath):
    dir = os.listdir(dictpath)
    for ws in dir:
        wspath = dictpath + '/' + ws
        ftmcls, ftnrls = getFTList(wspath)
        for ft in ftmcls:
            if ft.find('《中华人民共和国刑事诉讼法》的解释第五百零五条') != -1:
                print(ws)
Example #3
def predictjlws(wspath):
    fj_model = jlCnn()
    jlStrls = cutcontent(getJLMatchObject(wspath))
    ftmcls, ftnrls = getFTList(wspath)
    # statute article to conclusion
    for jl in jlStrls:
        print(jl)
        for ft in ftnrls:
            print(ft)
            print(fj_model.predict(jl, ft))
Example #4
def predictssws(wspath):
    sf_model = ssCnn()
    ssStrls = cutcontent(getSSMatchObject(wspath))
    ftmcls, ftnrls = getFTList(wspath)
    outputss = []
    # fact to statute article
    for ss in ssStrls:
        print(ss)
        for ft in ftnrls:
            print(ft)
            print(sf_model.predict(ss, ft))
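In predictssws the list outputss is declared but never filled; the scores are only printed. A minimal hedged sketch of a variant that collects them into a (fact sentence x article) matrix instead; predictssws_matrix is hypothetical, and ssCnn, cutcontent, getSSMatchObject and getFTList are assumed to behave as in the examples above:

def predictssws_matrix(wspath):
    # Hypothetical variant (not the project's code): collect predictions instead of printing.
    sf_model = ssCnn()
    ssStrls = cutcontent(getSSMatchObject(wspath))
    ftmcls, ftnrls = getFTList(wspath)
    outputss = []
    for ss in ssStrls:
        row = [sf_model.predict(ss, ft) for ft in ftnrls]  # one score per statute article
        outputss.append(row)
    return outputss  # shape: (num fact sentences, num articles)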
Example #5
def createtest(wspath, wstestpath):
    ftmcls, ftnrls = getFTList(wspath)
    wsStrls = cutcontent(getJLMatchObject(wspath))

    with open(wstestpath, 'w', encoding='utf-8') as f:
        for i in range(len(ftmcls)):

            for wsStr in wsStrls:
                line = []
                line.append(ftmcls[i])
                line.append(ftnrls[i])
                line.append(wsStr)
                f.write('!@#'.join(line))
                f.write('\n')
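createtest writes one line per (article name, article content, conclusion sentence) triple, joined by '!@#'. A minimal hedged sketch of reading such a file back; read_testfile is a hypothetical helper, not part of the project:

def read_testfile(wstestpath):
    # Hypothetical reader for the '!@#'-delimited lines written by createtest.
    samples = []
    with open(wstestpath, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip('\n').split('!@#')
            if len(parts) == 3:
                ftmc, ftnr, jl = parts
                samples.append((ftmc, ftnr, jl))
    return samples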
Example #6
def setCor(dicpath, corpuspath, spwordpath):
    print('setCor:'+dicpath)
    filepathlist = os.listdir(dicpath)
    index = 0
    cxlist = ['x','p','nr','uj']
    with open(corpuspath,'w',encoding='UTF-8') as f:
        for filepath in filepathlist:
            print('index', index)
            index += 1
            ftmclist, ftnrlist = getFTList(os.path.join(dicpath, filepath))
            ftnr = '。'.join(ftnrlist)
            content = getQW(os.path.join(dicpath, filepath)).attrib['value'] + ftnr
            contentcut = pos.cut(content)
            content_filter = filterwordwithcx(contentcut, cxlist, spwordpath)
            for word in content_filter:
                f.write(word+' ')
            f.write('\n')
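setCor writes one whitespace-separated, POS-filtered document per line, the usual input format for embedding training. A minimal hedged sketch of training a word2vec model on such a corpus with gensim (4.x API); whether the project actually trains its model this way, and the output path, are assumptions:

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# Hypothetical training step: the corpus file is the one produced by setCor
# (Example #7 reads '../data/2014corpus.txt', so that path is reused here).
sentences = LineSentence('../data/2014corpus.txt')
model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)
model.save('../data/2014corpus.model')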
Example #7
def usekeys(wspath, model):
    outputdata_ss = []
    outputdata_jl = []

    ftmcls, ftnrls = getFTList(wspath)
    print('wsft..', ftmcls)
    ssStrls = cutcontent(getSSMatchObject(wspath))

    # pre-extract keywords for all conclusion sentences
    jlStrls = cutcontent(getJLMatchObject(wspath))

    # load the keyword JSON file
    keys = readkeysjson('../data/交通肇事罪.json')
    with open('../data/2014corpus.txt', 'r', encoding='utf-8') as f:
        vocab = list(f.read().split(' '))
    for ssStr in ssStrls:
        line = []
        mk = mapkey(ssStr, keys, model, vocab)
        for ft in ftnrls:
            f = 0
            for key in mk:
                if ft.find(key) != -1:
                    line.append(1)
                    f = 1
                    break
            if f == 0:
                line.append(0)
        outputdata_ss.append(line)

    # for jlStr in jlStrls:
    #     mk = mapkey(jlStr,keys,model,vocab)
    #     line = []
    #     for ft in ftnrls:
    #         f = 0
    #         for key in mk:
    #             if ft.find(key) > 0:
    #                 line.append(1)
    #                 f = 1
    #                 break
    #         if f == 0:
    #             line.append(0)
    #     outputdata_jl.append(line)
    return numpy.array(outputdata_ss).T
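usekeys appends one 0/1 entry per statute article for every fact sentence, so outputdata_ss has shape (fact sentences, articles); the final transpose flips it to (articles, fact sentences). A small hedged usage sketch; show_article_hits is a hypothetical helper and wspath/model are placeholders:

import numpy

def show_article_hits(wspath, model):
    # Hypothetical usage sketch for the matrix returned by usekeys.
    matrix = usekeys(wspath, model)            # shape: (num articles, num fact sentences)
    for article_idx, row in enumerate(matrix):
        hits = numpy.nonzero(row)[0]           # fact sentences whose keywords appear in this article
        print('article', article_idx, 'matched sentence indices', list(hits))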
Example #8
def traversews(wspath, model):
    # collect output data
    # outputdata_dict = {}
    outputdata_ss = []
    outputdata_jl = []

    ftmcls, ftnrls = getFTList(wspath)
    print('wsft..', ftmcls)
    wsStrls = cutcontent(getSSMatchObject(wspath))
    wsStrkeys = []
    # pre-extract keywords for all document sentences
    for wsStr in wsStrls:
        wsStrkeys.append(getnormalizeweigth(wsStr, False))  # get document keywords, [[...]]
    print('set ws sp length....', len(wsStrkeys))

    # pre-extract keywords for all conclusion sentences
    jlStrkeys = []
    jlStrls = cutcontent(getJLMatchObject(wspath))
    for jlStr in jlStrls:
        jlStrkeys.append(getnormalizeweigth(jlStr, False))
    print('set jl sp length....', len(jlStrkeys))

    # iterate over statute articles
    for i in range(len(ftnrls)):
        # per-article output rows
        ftdata_ss = []
        ftdata_jl = []

        ftnr = ftnrls[i]
        ftmc = ftmcls[i]
        print('ftnr', ftnr)
        ftnrArra = cutcontent(ftnr)
        print('ftnrArra length..', len(ftnrArra))

        # optimization: pre-compute the keys for each nr and store them in a dict
        nrkeysdict = {}
        for nr in ftnrArra:

            keys, weights = getnormalizeweigth(nr, True)
            nrkeysdict[nr] = [keys, weights]
            print(keys, weights)

        # compare every fact sentence against all article sentences
        for j in range(len(wsStrkeys)):
            wsStrkey = wsStrkeys[j]
            wsStr = wsStrls[j]
            smaxsum = distance(wsStrkey, ftnrArra, nrkeysdict, wsStr, model)
            if smaxsum > 0.25:
                ftdata_ss.append(1)
            else:
                ftdata_ss.append(0)
            print('ws nr', wsStr)
            print('ws keys', wsStrkey)
            print('ws ft max distance', smaxsum)

        # compare every conclusion sentence against all article sentences
        for j in range(len(jlStrkeys)):
            jlStrkey = jlStrkeys[j]
            jlStr = jlStrls[j]
            smaxsum = distance(jlStrkey, ftnrArra, nrkeysdict, jlStr, model)
            if smaxsum > 0.45:
                ftdata_jl.append(1)  # error1
            else:
                ftdata_jl.append(0)
        # outputdata_dict[ftmc] = [ftdata_ss,ftdata_jl]
        outputdata_ss.append(ftdata_ss)
        outputdata_jl.append(ftdata_jl)
    # return the article-to-fact mapping list and the article-to-conclusion mapping list
    return outputdata_ss, outputdata_jl
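The distance helper used by traversews is not included in these examples. Below is a minimal hedged sketch of one plausible implementation, assuming getnormalizeweigth returns a keyword list (plus a parallel weight list when its second argument is True) and that model is a word2vec-style, dict-like lookup from word to vector. The names distance_sketch, vec, and cosine, and the scoring formula itself, are assumptions rather than the project's actual code; the only constraint taken from the examples is that the result is compared against thresholds between 0.25 and 0.5.

import numpy

def distance_sketch(wsStrkey, ftnrArra, nrkeysdict, wsStr, model):
    # Hypothetical stand-in for the project's distance() helper (names and formula are assumptions).
    # wsStr is accepted to match the original call signature but is unused in this sketch.
    def vec(word):
        # model is assumed to be dict-like / word2vec-style: word -> numpy vector
        return model[word] if word in model else None

    def cosine(a, b):
        denom = numpy.linalg.norm(a) * numpy.linalg.norm(b)
        return float(numpy.dot(a, b) / denom) if denom else 0.0

    smax = 0.0
    for nr in ftnrArra:                          # each sentence of the statute article
        keys, weights = nrkeysdict[nr]           # pre-computed article keywords and weights
        total, wsum = 0.0, 0.0
        for key, weight in zip(keys, weights):
            kv = vec(key)
            if kv is None:
                continue
            # best similarity between this article keyword and any document-sentence keyword
            sims = [cosine(kv, v) for v in (vec(w) for w in wsStrkey) if v is not None]
            if sims:
                total += weight * max(sims)
                wsum += weight
        score = total / wsum if wsum else 0.0
        smax = max(smax, score)
    return smax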
Example #9
def traversews(expath, model):
    # collect output data

    outputdata_ss = []
    outputdata_jl = []

    ftmcls, ftnrls = getFTList(expath)
    print('wsft..', ftmcls)
    # wsStrls = cutcontent(getSSMatchObject(wspath))
    # wsStrkeys = []
    # pre-extract keywords for all document sentences
    # for wsStr in wsStrls:
    #     wsStrkeys.append(getnormalizeweigth(wsStr, False))  # h获取文书关键词,[[]]
    # print('set ws keys length....', len(wsStrkeys))

    # pre-extract keywords for all conclusion sentences
    jlStrkeys = []
    jlStrls = cutcontent(getJLMatchObject(expath))
    for jlStr in jlStrls:
        jlStrkeys.append(getnormalizeweigth(jlStr, False))
    # print('set jl keys length....', len(jlStrkeys))

    # iterate over statute articles
    for i in range(len(ftnrls)):
        # per-article output rows
        ftdata_ss = []
        ftdata_jl = []

        ftnr = ftnrls[i]
        ftmc = ftmcls[i]

        print('ftnr', ftnr)

        # load this article's LDA model
        # ft_corpus = '../LDAmodel/source/corpus/'+ftmc+'.txt'
        # ft_model = '../LDAmodel/source/model/'+ftmc+'_lda.model'
        # if os.path.exists(ft_model):
        #     dictionary,lda = getLDAmodel(ft_corpus,ft_model)

        ftnrArra = cutcontent(ftnr)

        this_ftkeys = getftkeys(ftmc)

        # optimization: pre-compute the keys for each nr and store them in a dict
        nrkeysdict = {}
        allftkeys = []
        for nr in ftnrArra:
            keys, weights = getnormalizeweigth(nr, True)
            print(keys)
            nrkeysdict[nr] = [keys, weights]
            #
            # if len(keys) >=4:
            #     allftkeys.extend([keys[3]])
            if len(keys) >= 3:
                allftkeys.extend([keys[2]])
            if len(keys) >= 2:
                allftkeys.extend([keys[1]])
            elif len(keys) == 1:
                allftkeys.extend([keys[0]])

        # compare every fact sentence against all article sentences
        # for i in range(len(wsStrkeys)):
        #     wsStrkey = wsStrkeys[i]
        #     wsStr = wsStrls[i]
        #     print('wsStr',wsStr)
        #     print('wsStrkeys',wsStrkey)
        # use the keys derived from the article via LDA =========================================
        # ftkeyflag = 0
        # for _ in allftkeys:
        #     if wsStr.count(_) > 0:
        #         ftkeyflag += 1
        #
        # if ftkeyflag >= 1:
        #     ftdata_ss.append(1)
        # else:
        #
        #     count = 0
        #     if len(this_ftkeys) > 0:
        #         for this_key in this_ftkeys:
        #             if wsStr.count(this_key) > 0:
        #                 count += 1
        #         if count > 0:
        #             if count > 5:
        #                 ftdata_ss.append(1)
        #             else:
        #                 smaxsum = distance(wsStrkey, ftnrArra, nrkeysdict, wsStr, model)
        #                 print('samxsum', smaxsum)
        #                 if smaxsum > 0.3:
        #                     ftdata_ss.append(1)
        #                 else:
        #                     ftdata_ss.append(0)
        #         else:
        #             ftdata_ss.append(0)
        #     else:
        #         smaxsum = distance(wsStrkey, ftnrArra, nrkeysdict, wsStr, model)
        #         print('samxsum', smaxsum)
        #         if smaxsum > 0.3:
        #             ftdata_ss.append(1)
        #         else:
        #             ftdata_ss.append(0)
        # use the keys derived from the article via LDA =========================================
        # if os.path.exists(ft_model) == True:
        #     tp = predict(dictionary, lda, wsStr)
        #     print('tp',tp)
        #     if tp > 0.95:
        #         ftdata_ss.append(1)
        #     # else:
        #     #     smaxsum = distance(wsStrkey, ftnrArra, nrkeysdict, wsStr, model)
        #     #     if smaxsum > 0.2:
        #     #         ftdata_ss.append(1)
        #     #     else:
        #     #         ftdata_ss.append(0)
        #
        #     elif tp > 0.85:
        #         smaxsum = distance(wsStrkey, ftnrArra, nrkeysdict, wsStr, model)
        #         print('samxsum',smaxsum)
        #         if smaxsum > 0.2:
        #             ftdata_ss.append(1)
        #         else:
        #             ftdata_ss.append(0)
        #     else:
        #         ftdata_ss.append(0)
        #
        # else:
        #     smaxsum = distance(wsStrkey, ftnrArra, nrkeysdict, wsStr, model)
        #     print('samxsum', smaxsum)
        #     if smaxsum > 0.2:
        #         ftdata_ss.append(1)
        #     else:
        #         ftdata_ss.append(0)

        # smaxsum = distance(wsStrkey, ftnrArra, nrkeysdict, wsStr, model)
        # if smaxsum > 0.3:
        #    ftdata_ss.append(1)
        # else:
        #     ftdata_ss.append(0)

        # compare every conclusion sentence against all article sentences
        for j in range(len(jlStrkeys)):
            jlStrkey = jlStrkeys[j]
            jlStr = jlStrls[j]
            smaxsum = distance(jlStrkey, ftnrArra, nrkeysdict, jlStr, model)
            if smaxsum > 0.5:
                ftdata_jl.append(1)  # error1
            else:
                ftdata_jl.append(0)

        # outputdata_ss.append(ftdata_ss)
        outputdata_jl.append(ftdata_jl)
    # return the article-to-conclusion mapping list (the article-to-fact mapping is disabled above)
    return outputdata_jl