import os
import random
import shutil

import numpy


def create_ftset_ex():
    # Source corpus of judgment documents (XML) and output dirs for the sampled test set.
    allwsdict = '/users/wenny/nju/task/法条文书分析/故意杀人罪/2014'
    # allwsdict = '/盗窃罪'  # leftover alternative corpus path; kept disabled so it no longer overrides the line above
    testwsdict = '../data/testjtzs1w'
    testdata = '../data/1w篇_事实到法条'
    filenames = os.listdir(allwsdict)
    print(len(filenames))
    # Sample 10,000 documents by index; range over len(filenames) instead of a
    # hard-coded 40000 so the indices cannot run past the directory listing.
    resultList = random.sample(range(len(filenames)), 10000)
    for i in resultList:
        wsname = filenames[i]
        src = allwsdict + '/' + wsname
        target = testwsdict + '/' + wsname
        shutil.copy(src, target)
    # Build one example file per sampled XML document: fact-section sentences
    # paired with the cited statute articles.
    for wsname in os.listdir(testwsdict):
        print(wsname)
        wspath = testwsdict + '/' + wsname
        # wstestpath = testdata + '/' + wsname.split('.')[0] + '.txt'
        if wsname.endswith('.xml'):
            ftmcls, ftnrls = getFTList(wspath)  # article names and article contents
            cols = []
            for ftmc, ftnr in zip(ftmcls, ftnrls):
                cols.append(str(ftmc) + ':' + str(ftnr))
            wsStrls = cutcontent(getSSMatchObject(wspath))  # fact sentences
            createx(wsname, wsStrls, cols, [], testdata)
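# A minimal driver sketch, assuming the source corpus path exists and the
# helpers above are defined in this module. _demo_create_ftset_ex and the
# makedirs guards are hypothetical additions for convenience, not part of
# the original flow.
def _demo_create_ftset_ex():
    os.makedirs('../data/testjtzs1w', exist_ok=True)
    os.makedirs('../data/1w篇_事实到法条', exist_ok=True)
    create_ftset_ex()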
def txttestset(dictpath, output):
    # Serialize every document in dictpath into one line of the output file:
    # filename, fact sentences, cited articles, and conclusion sentences,
    # separated by 'end!...' section markers, with '!@#' between sentences.
    content = ''
    for ws in os.listdir(dictpath):
        wspath = dictpath + '/' + ws
        ftls = getFTfromQW(wspath)                   # statute articles from the full text
        ssls = cutcontent(getSSMatchObject(wspath))  # fact sentences
        jlls = cutcontent(getJLMatchObject(wspath))  # conclusion sentences
        content += ws + 'end!fact:'
        content += '!@#'.join(ssls)
        content += 'end!ft:'
        content += '!@#'.join(ftls)
        content += 'end!jl:'
        content += '!@#'.join(jlls)
        content += '\n'
    with open(output, 'w', encoding='utf-8') as f:
        f.write(content)
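# A minimal parsing sketch that inverts the record format written by
# txttestset above. parse_testset_line is a hypothetical helper, not part of
# the original code; it assumes no sentence contains the marker strings.
def parse_testset_line(line):
    wsname, rest = line.split('end!fact:', 1)
    facts, rest = rest.split('end!ft:', 1)
    fts, jls = rest.split('end!jl:', 1)
    return wsname, facts.split('!@#'), fts.split('!@#'), jls.rstrip('\n').split('!@#')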
def predictjlws(wspath):
    # Article-to-conclusion prediction: score every (conclusion sentence,
    # article content) pair with the jlCnn model.
    fj_model = jlCnn()
    jlStrls = cutcontent(getJLMatchObject(wspath))
    ftmcls, ftnrls = getFTList(wspath)
    for jl in jlStrls:
        print(jl)
        for ft in ftnrls:
            print(ft)
            print(fj_model.predict(jl, ft))
def usekeys(wspath, model):
    # For each fact sentence, map it to keywords and mark which cited articles
    # contain at least one of them; returns an articles x sentences 0/1 matrix
    # (the per-sentence rows are transposed on return).
    outputdata_ss = []
    ftmcls, ftnrls = getFTList(wspath)
    print('wsft..', ftmcls)
    ssStrls = cutcontent(getSSMatchObject(wspath))
    # Conclusion sentences, extracted up front (their matching loop was
    # commented out in the original and is omitted here).
    jlStrls = cutcontent(getJLMatchObject(wspath))
    # Load the keyword JSON for the charge (traffic offense) and the vocabulary.
    keys = readkeysjson('../data/交通肇事罪.json')
    with open('../data/2014corpus.txt', 'r', encoding='utf-8') as f:
        vocab = list(f.read().split(' '))
    for ssStr in ssStrls:
        line = []
        mk = mapkey(ssStr, keys, model, vocab)
        for ft in ftnrls:
            f = 0
            for key in mk:
                if ft.find(key) >= 0:  # substring hit anywhere, including position 0
                    line.append(1)
                    f = 1
                    break
            if f == 0:
                line.append(0)
        outputdata_ss.append(line)
    return numpy.array(outputdata_ss).T
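# A small shape sketch for the value usekeys returns (hypothetical data, not
# from the original experiments): with 2 keyword-mapped sentences and 3
# articles, the transposed matrix is 3 x 2, and matrix[j][i] == 1 means
# sentence i hit article j.
def _demo_usekeys_shape():
    import numpy
    demo = numpy.array([[1, 0, 0],   # sentence 0 vs. articles 0..2
                        [0, 0, 1]])  # sentence 1 vs. articles 0..2
    print(demo.T.shape)  # (3, 2): rows are articles, columns are sentences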
def predictssws(wspath):
    # Fact-to-article prediction: score every (fact sentence, article content)
    # pair with the ssCnn model.
    sf_model = ssCnn()
    ssStrls = cutcontent(getSSMatchObject(wspath))
    ftmcls, ftnrls = getFTList(wspath)
    for ss in ssStrls:
        print(ss)
        for ft in ftnrls:
            print(ft)
            print(sf_model.predict(ss, ft))
def create_zjset_ex(testwsdict, expath):
    # Build example files pairing each document's fact sentences with its
    # evidence (证据) list.
    for index, wsname in enumerate(os.listdir(testwsdict)):
        print(index)
        wspath = testwsdict + '/' + wsname
        if str(wsname).endswith('.xml'):
            zjlist = getZJ(wspath)
            wsStrls = cutcontent(getSSMatchObject(wspath))
            createx(wsname, wsStrls, zjlist, [], expath)
def createtest(wspath, wstestpath):
    # Write one test line per (article, conclusion sentence) pair:
    # article name, article content, and sentence joined by '!@#'.
    ftmcls, ftnrls = getFTList(wspath)
    wsStrls = cutcontent(getJLMatchObject(wspath))
    with open(wstestpath, 'w', encoding='utf-8') as f:
        for i in range(len(ftmcls)):
            for wsStr in wsStrls:
                line = [ftmcls[i], ftnrls[i], wsStr]
                f.write('!@#'.join(line))
                f.write('\n')
def traversews(wspath, model):
    # For each cited article, collect a 0/1 row over fact sentences and a 0/1
    # row over conclusion sentences, based on keyword-distance thresholds.
    outputdata_ss = []
    outputdata_jl = []
    ftmcls, ftnrls = getFTList(wspath)
    print('wsft..', ftmcls)
    wsStrls = cutcontent(getSSMatchObject(wspath))
    # Pre-extract keywords for every fact sentence (a list of lists).
    wsStrkeys = []
    for wsStr in wsStrls:
        wsStrkeys.append(getnormalizeweigth(wsStr, False))
    print('set ws sp length....', len(wsStrkeys))
    # Pre-extract keywords for every conclusion sentence.
    jlStrkeys = []
    jlStrls = cutcontent(getJLMatchObject(wspath))
    for jlStr in jlStrls:
        jlStrkeys.append(getnormalizeweigth(jlStr, False))
    print('set jl sp length....', len(jlStrkeys))
    # Iterate over the cited statute articles.
    for i in range(len(ftnrls)):
        ftdata_ss = []
        ftdata_jl = []
        ftnr = ftnrls[i]
        ftmc = ftmcls[i]
        print('ftnr', ftnr)
        ftnrArra = cutcontent(ftnr)
        print('ftnrArra length..', len(ftnrArra))
        # Optimization: precompute the keywords/weights of each clause once
        # and cache them in a dict.
        nrkeysdict = {}
        for nr in ftnrArra:
            keys, weights = getnormalizeweigth(nr, True)
            nrkeysdict[nr] = [keys, weights]
            print(keys, weights)
        # Compare every fact sentence against each clause of this article
        # (inner index renamed to j so it does not shadow the article index i).
        for j in range(len(wsStrkeys)):
            wsStrkey = wsStrkeys[j]
            wsStr = wsStrls[j]
            smaxsum = distance(wsStrkey, ftnrArra, nrkeysdict, wsStr, model)
            if smaxsum > 0.25:
                ftdata_ss.append(1)
            else:
                ftdata_ss.append(0)
            print('ws nr', wsStr)
            print('ws keys', wsStrkey)
            print('ws ft max distance', smaxsum)
        # Compare every conclusion sentence against each clause of this article.
        for j in range(len(jlStrkeys)):
            jlStrkey = jlStrkeys[j]
            jlStr = jlStrls[j]
            smaxsum = distance(jlStrkey, ftnrArra, nrkeysdict, jlStr, model)
            if smaxsum > 0.45:
                ftdata_jl.append(1)
            else:
                ftdata_jl.append(0)
        outputdata_ss.append(ftdata_ss)
        outputdata_jl.append(ftdata_jl)
    # Return the article-to-facts mapping and the article-to-conclusions mapping.
    return outputdata_ss, outputdata_jl
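# distance() is defined elsewhere in the project. As a rough, hypothetical
# illustration of the keyword-overlap idea behind the thresholds above (an
# assumption, not the original implementation), a Jaccard similarity over two
# keyword lists could look like this:
def keyword_overlap(keys_a, keys_b):
    # 1.0 when the keyword sets are identical, 0.0 when disjoint or empty.
    sa, sb = set(keys_a), set(keys_b)
    if not sa or not sb:
        return 0.0
    return len(sa & sb) / len(sa | sb)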
def traversews_jl(wspath, model):
    # Conclusion-only variant of traversews (renamed so it does not shadow the
    # full version above; the original also mixed the parameter names expath
    # and wspath for the same document path). Returns, per article, a 0/1 row
    # over the document's conclusion sentences.
    outputdata_jl = []
    ftmcls, ftnrls = getFTList(wspath)
    print('wsft..', ftmcls)
    # Pre-extract keywords for every conclusion sentence.
    jlStrkeys = []
    jlStrls = cutcontent(getJLMatchObject(wspath))
    for jlStr in jlStrls:
        jlStrkeys.append(getnormalizeweigth(jlStr, False))
    # Iterate over the cited statute articles.
    for i in range(len(ftnrls)):
        ftdata_jl = []
        ftnr = ftnrls[i]
        ftmc = ftmcls[i]
        print('ftnr', ftnr)
        ftnrArra = cutcontent(ftnr)
        # Per-article keyword list; only consumed by the disabled
        # fact-sentence heuristics noted below.
        this_ftkeys = getftkeys(ftmc)
        # Optimization: precompute the keywords/weights of each clause and
        # cache them in a dict; also collect a few top keywords per clause.
        nrkeysdict = {}
        allftkeys = []
        for nr in ftnrArra:
            keys, weights = getnormalizeweigth(nr, True)
            print(keys)
            nrkeysdict[nr] = [keys, weights]
            if len(keys) >= 3:
                allftkeys.append(keys[2])
            if len(keys) >= 2:
                allftkeys.append(keys[1])
            elif len(keys) == 1:
                allftkeys.append(keys[0])
        # (The original kept large commented-out experiments here that matched
        # fact sentences via LDA topic probabilities, article-keyword counts,
        # and distance thresholds; only the conclusion loop below is active.)
        # Compare every conclusion sentence against each clause of this article.
        for j in range(len(jlStrkeys)):
            jlStrkey = jlStrkeys[j]
            jlStr = jlStrls[j]
            smaxsum = distance(jlStrkey, ftnrArra, nrkeysdict, jlStr, model)
            if smaxsum > 0.5:
                ftdata_jl.append(1)
            else:
                ftdata_jl.append(0)
        outputdata_jl.append(ftdata_jl)
    # Return the article-to-conclusions mapping.
    return outputdata_jl