def extract_result(text):
    """Extract "result" sentences from a civil-marriage judgement text.

    Splits *text* into sentences, scores each comma-separated clause against
    a dictionary of judge words, and collects (simplified) sentences whose
    clause has more than 30% judge-word overlap. Sentences mentioning a yuan
    amount ("...元") are collected as a fallback.

    Returns a list of the collected sentence strings.
    """
    # NOTE(review): `sepearate` looks like a typo of `seperate_sentences`
    # used elsewhere in this file — confirm which splitter is intended.
    sentences = sepearate(text)
    regex = ","
    file_path = Dir.resourceDir + "dict/civil_mariiage_result"
    judge_words = tools.readLines(file_path)
    result = {}
    for sentence in sentences:
        flag = False
        details_sentences = re.split(regex, sentence)
        for details_sentence in details_sentences:
            words = list(jieba.cut(details_sentence))
            # Guard: consecutive commas produce an empty clause, which the
            # original divided by zero on.
            if not words:
                continue
            count = 0
            for word in judge_words:
                if word in words:
                    count += 1
            # Clause qualifies when >30% of its words are judge words.
            if count / len(words) > 0.3:
                words_sent = list(jieba.cut(sentence))
                inter = set(words_sent).intersection(set(words))
                # Keep only the conclusion part after a "由此"/",故" marker.
                if "由此" in sentence:
                    sentence_simple = sentence[sentence.find("由此") + 2:]
                elif ",故" in sentence:
                    sentence_simple = sentence[sentence.find(",故") + 2:]
                else:
                    sentence_simple = sentence
                if sentence_simple not in result:
                    result[sentence_simple] = len(inter) / len(words_sent)
                flag = True
                break
            else:
                # Fallback: remember sentences that mention a yuan amount.
                if re.findall(r"\d*?元", sentence):
                    result[sentence] = "元"
        if flag:
            break
    return list(result.keys())
def extract_laws(text):
    """Extract law names (《...》 citations) referenced in *text*.

    A candidate span runs from the opening "《" up to (but excluding) "规定";
    trailing particles "的"/"之" are stripped, and when no article number
    ("第...") follows the closing "》" the span is truncated at "》". A span
    is kept when it matches the known-law dictionary or contains
    "中华人民共和国".
    """
    path = Dir.resourceDir + "dict\\LawsName"
    laws_name = tools.readLines(path)
    lines = seperate_sentences(text)
    # Keep only sentences containing an opening book-title mark.
    middle_result = [line for line in lines if "《" in line]
    result = []
    for line in middle_result:
        line = line[line.find("《"):]
        # NOTE(review): when "规定" is absent, find() returns -1 and the
        # slice drops the last character — behavior preserved from original.
        end = line.find("规定")
        words = line[:end]
        if words:
            # Strip a trailing particle "的"/"之".
            if words[-1] in ("的", "之"):
                words = words[:-1]
            index_end = words.find("》")
            # No article number within 10 chars after "》": cut at "》".
            if "第" not in words[index_end:index_end + 10]:
                words = words[:index_end + 1]
            if contain_laws_name(words, laws_name):
                result.append(words)
            elif "中华人民共和国" in words:
                result.append(words)
    return result
def extract_from_texts(text_dict, func):
    """Run extract_from_text over each file's text.

    Maps each file (via tools.get_filename) to its extraction result,
    omitting files whose extraction returns None.
    """
    extracted = {}
    for file, text in text_dict.items():
        outcome = extract_from_text(text, func)
        if outcome is not None:
            extracted[tools.get_filename(file)] = outcome
    return extracted
def observate_sentence_lenth(filepath, savepath=Dir.resourceDir + "\\ObservationResult\\result_"):
    """Tally per-document sentence counts and write a report.

    For each document loaded from *filepath*, counts its sentences (split on
    。/?/!), prints a count->frequency histogram to stdout, and writes one
    "<index>\t<count>" line per document to *savepath* (suffixed with the
    source name).
    """
    savepath += tools.get_filename(filepath) + ".txt"
    # NOTE(review): this indexes result[i] with integers, while other callers
    # treat tools.read_dir's result as a dict keyed by file — confirm.
    result = tools.read_dir(filepath)
    split_regex = "。|?|!"
    print(filepath)
    report_lines = []
    histogram = {}
    for i in range(len(result)):
        length = len(re.split(split_regex, result[i]))
        # dict.get avoids the explicit missing-key initialization.
        histogram[length] = histogram.get(length, 0) + 1
        report_lines.append(str(i) + "\t" + str(length) + "\n")
    for key, freq in histogram.items():
        print(str(key) + "\t" + str(freq))
    # join once instead of quadratic += accumulation.
    tools.write(savepath, "".join(report_lines))
def demo():
    """Extract laws/facts/results for each divorce judgement and dump a TSV report."""
    paths = [
        Dir.resourceDir + "\\摘要文书\\离婚纠纷"
        # ,
        # Dir.resourceDir+"\\摘要文书\\民间借贷纠纷"
    ]
    filepath = Dir.resourceDir + "/结果/离婚纠纷结果/result.txt"
    rows = []
    for path in paths:
        text_list = tools.read_dir(path)
        for file in text_list.keys():
            text = text_list[file]
            result_laws = extract_laws(text)
            result_fact = extract_fact(text)
            result_result = extract_result(text)
            rows.append(tools.get_filename(file) + "\t" + text + "\t"
                        + str(result_laws) + "\t" + str(result_fact) + "\t"
                        + str(result_result) + "\n")
    # Write once after all paths: the original reset its buffer and rewrote
    # the file inside the path loop, so with several paths listed only the
    # last path's output survived.
    tools.write(filepath, "".join(rows))
def demo():
    """Extract theft-case results for each judgement and dump a TSV report."""
    # NOTE(review): this redefines demo() from earlier in the file — the
    # earlier definition is shadowed; confirm both are meant to coexist.
    paths = [
        # Dir.resourceDir+"\\摘要文书\\故意伤害罪"
        # ,
        Dir.resourceDir + "\\摘要文书\\盗窃罪"
    ]
    content = ""
    for path in paths:
        for file, text in tools.read_dir(path).items():
            result = extract(text)
            fields = [tools.get_filename(file), text]
            # str(...)[1:-1] drops the surrounding list brackets.
            fields.extend(str(part)[1:-1] for part in result[:4])
            string = "\t".join(fields)
            content += string + "\n"
            print(string)
    filepath = Dir.resourceDir + "结果\\盗窃罪结果\\result.txt"
    tools.write(filepath, content)
def extract_result(text):
    """Extract crime names the court found established.

    A guilty name counts when, in some sentence, it is preceded by "构成"
    within the 10 preceding characters and either a comma follows within 2
    characters after it or it ends the sentence.

    Returns a de-duplicated list of crime names.
    """
    result = set()
    path1 = Dir.resourceDir + "dict\\guilty_name.txt"
    guilty_names = tools.readLines(path1)
    sentences = seperate_sentences(text)
    for sentence in sentences:
        for guilty_name in guilty_names:
            if guilty_name not in sentence:
                continue
            index = sentence.find(guilty_name)
            tail = index + len(guilty_name) + 2
            # NOTE(review): when index < 10 the slice start goes negative
            # and wraps from the end — preserved from the original; a
            # max(index - 10, 0) guard may be intended.
            if "构成" in sentence[index - 10:index] and (
                    "," in sentence[index:tail] or tail > len(sentence)):
                result.add(guilty_name)
    return list(result)
def extract_fact_from_guilty(text):
    """For each crime name, extract the fact clause naming the defendant.

    Returns entries "<crime>:<clause>" where the clause starts at "被告人"
    and runs up to the last comma before the crime name.
    """
    result = []
    sentences = seperate_sentences(text)
    # Hoisted out of the loop: the original re-read the dictionary file
    # once per sentence.
    path1 = Dir.resourceDir + "dict\\guilty_name.txt"
    guilty_names = tools.readLines(path1)
    for sentence in sentences:
        for guilty_name in guilty_names:
            if guilty_name in sentence:
                index = sentence.find(guilty_name)
                if "," in sentence[:index]:
                    index = sentence[:index].rfind(",")
                # NOTE(review): find() returns -1 when "被告人" is absent,
                # making the slice start from the end — confirm intended.
                start = sentence[:index].find("被告人")
                # Require a minimally long clause before accepting it.
                if index > start + 10:
                    result.append(guilty_name + ":" + str(sentence[start:index]))
    return result
def demo(func):
    """Run *func*-driven extraction over each corpus and print TSV rows."""
    # path = Dir.resourceDir+"\\摘要文书\\故意伤害罪"
    paths = [
        Dir.resourceDir + "\\摘要文书\\故意伤害罪"
        # ,Dir.resourceDir+"\\摘要文书\\离婚纠纷",
        # Dir.resourceDir+"\\摘要文书\\盗窃罪"
        # Dir.resourceDir+"\\摘要文书\\民间借贷纠纷"
    ]
    # (removed unused local `final_result` from the original)
    for path in paths:
        text_list = tools.read_dir(path)
        result = extract_from_texts(text_list, func)
        # Collect pieces and join once instead of quadratic += accumulation.
        pieces = []
        for res in result.keys():
            pieces.append(res + "\t")
            for tmp in result[res]:
                pieces.append(str(tmp) + "\t")
            pieces.append("\n")
        print("".join(pieces))
def loadWords():
    """Load the civil-marriage keyword dictionary as a list of lines."""
    # "\\" instead of "\c": "\c" is an invalid escape sequence (a
    # DeprecationWarning today, a SyntaxError in future Python); the
    # resulting path string is byte-identical.
    file_path = Dir.resourceDir + "dict\\civil_marriage"
    return tools.readLines(file_path)
idf = math.log( float(count) / float(reversed_index[word].__len__() + 1), math.e) if word not in word_idf.keys(): word_idf[word] = idf tfidf = {} for index in tf.keys(): if index not in tfidf.keys(): tfidf[index] = {} words = tf[index] for word in words.keys(): if word not in tfidf[index].keys(): tfidf[index][word] = [ words[word], word_idf[word], words[word] * word_idf[word] ] print(word, tfidf[index][word]) return tfidf file_dir = Dir.resourceDir + "摘要文书\\离婚纠纷\\" text_list = tools.read_dir(file_dir) combineTextAndGetTf(text_list) # result =gettfidf(text_list) # for key in result.keys(): # string ="" # for word in result[key].keys(): # string += word+":"+str(result[key][word]) # string+="##" # string+="\n" # print(string)