def insert_new_forestry_subject(subject_set):
    """Insert every subject in *subject_set* into new_forestry_subject,
    committing and logging after each row."""
    sql = '''insert into new_forestry_subject (subject) value (%s)'''
    cur = conn.cursor()
    for item in subject_set:
        cur.execute(sql, (item, ))
        conn.commit()
        print(item, 'insert success!')
def get_correlation_sentences():
    """Collect sentences matching the "本...由..." responsibility pattern
    from both article tables.

    Returns a list of (law_id, sentence) tuples.  Matches longer than 150
    characters are counted but excluded from the result (as in the
    original).  For article_1 rows the law id comes from the chapter->law
    mapping; article_2 rows carry the law id directly.

    Improvement: the two identical scan loops were deduplicated into one
    private helper parameterized by how the law id is derived.
    """
    cursor = conn.cursor()
    # Matches clauses like "本办法...由..." that state authority/responsibility.
    pattern_1 = "^本(办法|条例|规定|法|细则)(.*?)由(.*?)"
    sentences_list = list()
    count = 0

    def _collect(rows, law_id_of):
        nonlocal count
        for row in rows:
            law_id = law_id_of(row)
            contents = list(filter(None, str(row[2]).strip().split('\n')))
            for content in contents:
                stripped = content.strip()
                if re.findall(pattern_1, stripped):
                    count += 1
                    if len(stripped) > 150:  # too long to be a clean match
                        continue
                    sentences_list.append(tuple((law_id, stripped)))

    cursor.execute('''select * from article_1''')
    chapter_to_law_dict = get_chapter_law_mapping()
    _collect(cursor.fetchall(), lambda row: chapter_to_law_dict[row[3]])
    cursor.execute('''select * from article_2''')
    _collect(cursor.fetchall(), lambda row: row[3])
    print(count)
    return sentences_list
def get_relation_collection():
    """Group relation_classify rows by relation type.

    Returns a dict mapping each known relation type to a list of
    (law_id, class, chapter_id, sentence_id, parse_sentence,
    relation_type) tuples.
    """
    cursor = conn.cursor()
    cursor.execute('''select * from relation_classify''')
    buckets = {
        key: []
        for key in ('define', 'aim', 'application_scope', 'contain', 'duty',
                    'right', 'accord', 'forbid', 'punishment')
    }
    for row in cursor.fetchall():
        entry = tuple((row[1], row[2], row[3], row[4], row[6], row[7]))
        buckets[row[7]].append(entry)
    return buckets
def get_relation_by_type(relation_type):
    """Fetch all new_relation rows having the given relation_type.

    Bug fix: the parameter was passed as ``(relation_type)``, which is
    just the bare string (parentheses without a comma do not make a
    tuple).  DB-API drivers expect a sequence of parameters, so it is
    now passed as a one-element tuple.
    """
    cursor = conn.cursor()
    select_sql = '''select * from new_relation where relation_type = %s'''
    cursor.execute(select_sql, (relation_type,))
    return cursor.fetchall()
def relation_collection_expand(filter_colum, key, relation_type):
    """Copy semantic_role_label_result rows whose *filter_colum* contains
    *key* into relation_classify, tagged with *relation_type*.

    Rows whose parse_sentence contains '所有权' are excluded.  Returns
    the number of rows inserted.

    Improvements: the precomputed ``key_word`` pattern is now actually
    used (the original rebuilt it inline), and unused regex locals and
    misspelled intermediates were removed.
    """
    cursor = conn.cursor()
    key_word = '%' + key + '%'
    # NOTE: a column name cannot be bound as a DB-API parameter, so it is
    # interpolated directly -- filter_colum must come from trusted code only.
    select_srl_results = "select * from semantic_role_label_result where %s" % filter_colum \
        + " like %s and parse_sentence not like %s group by parse_sentence"
    insert_relation_classify = '''insert into relation_classify (law_id, class, chapter_id, sentence_id, complete_sentence, parse_sentence, relation_type, is_complex) value (%s, %s, %s, %s, %s, %s, %s, %s)'''
    count = 0
    cursor.execute(select_srl_results, (key_word, '%所有权%'))
    for res in cursor.fetchall():
        parse_sentence = str(res[6]).strip()
        count += 1
        cursor.execute(
            insert_relation_classify,
            (res[1], res[2], res[3], res[4], res[5], parse_sentence,
             relation_type, res[10]))
        conn.commit()
        print(relation_type, ' insert success')
    print(relation_type, count)
    return count
def search():
    """Match the request's query string against law names via shared
    segmented words.

    A law matches when it shares at least one word with a short query
    (<= 3 words) or at least three words otherwise.  Returns a
    JSON-style dict: matches sorted by descending match count, or a
    not-found message.
    """
    content = request.args.get('content', '')
    search_words = list(segmentor.segment(content))
    cursor = conn.cursor()
    cursor.execute('''select name, type from law''')
    law_info_dict = {}
    for law in cursor.fetchall():
        name_words = list(segmentor.segment(law[0]))
        match_count = sum(1 for w in name_words if w in search_words)
        if match_count >= 3 or (len(search_words) <= 3 and match_count >= 1):
            law_info_dict[law[0]] = match_count
    if not law_info_dict:
        return {'status': 0, 'message': '未找到相关法律法规!'}
    ranked = sorted(law_info_dict.items(), key=lambda x: x[1], reverse=True)
    return {'status': 200, 'data': ranked}
def query_and_match(query_sql, article_type):
    """Run *query_sql* and split the rows into 'accord' and 'detail'
    punishment matches.

    Rows matching the combined accord pattern take precedence over
    detail-only matches.  Returns (detail rows, accord rows,
    article_type) after printing both counts.
    """
    cursor = conn.cursor()
    cursor.execute(query_sql)
    rows = cursor.fetchall()
    results_detail, results_accord = [], []
    detail_count = accord_count = 0
    output_file = "C:\\Users\\dhz\\Desktop\\template\\punishment_content.txt"
    # NOTE(review): the file is opened (and thus created) but the only
    # write is commented out; kept to preserve the original side effect.
    with open(output_file, "a") as w:
        for row in rows:
            content = str(row[3]).strip()
            hit_detail = re.search(PUNISHMENT_PATTERN_DETAIL, content,
                                   re.M | re.I)
            hit_accord = re.search(PUNISHMENT_AND_ACCORD_PATTERN, content,
                                   re.M | re.I)
            if hit_accord:
                accord_count += 1
                results_accord.append(row)
            elif hit_detail:
                # w.write(content.replace("\n", "") + '\n')
                detail_count += 1
                results_detail.append(row)
    print(detail_count)
    print(accord_count)
    return results_detail, results_accord, article_type
def detail_content_parser():
    """Print key and first content line for the first 100 parsed law rows."""
    cursor = conn.cursor()
    cursor.execute(
        "select law_id, p_key, p_content from law_content_parse where id < 101")
    for row in cursor.fetchall():
        first_line = str(row[2]).split('\n')[0]
        print(row[1] + ': ' + first_line)
def complex_extraction():
    """Feed DP/SDP/SRL analyses of each complex sentence into the
    complex-sentence relation extraction core."""
    cursor = conn.cursor()
    cursor.execute(
        '''select complete_sentence, parse_sentence, class, sentence_id from dependency_parsing_result where is_complex = 1 and sentence_id > 9481 group by parse_sentence order by id'''
    )
    for row in cursor.fetchall():
        complete_sentence, parsing_sentence = row[0], row[1]
        # all three analyses are keyed by (parse_sentence, class, sentence_id)
        analysis_key = (row[1], row[2], row[3])
        cursor.execute(SELECT_DP_SQL, analysis_key)
        dp_results = cursor.fetchall()
        cursor.execute(SELECT_SDP_SQL, analysis_key)
        sdp_results = cursor.fetchall()
        cursor.execute(SELECT_SRL_SQL, analysis_key)
        srl_results = decode_srl_results(cursor.fetchall())
        # TODO: invoke the complex-sentence relation extraction core
        complex_extraction_core(dp_results, sdp_results, srl_results,
                                complete_sentence, parsing_sentence)
def parsing_and_semantic_analysis(relation_type, relation_collect):
    """Append SRL and DP analyses of each collected relation row to a
    per-relation-type text file under G:/analysis for manual review."""
    cursor = conn.cursor()
    output_path = "G:\\analysis\\" + relation_type + ".txt"
    with open(output_path, "a") as w:
        for res in relation_collect:
            law_id = res[1]
            article_class = res[2]
            chapter_id = res[3]
            sentence_id = res[4]
            parse_sentence = res[6]
            # NOTE(review): this rebinds the relation_type parameter on every
            # row; output_path above already captured the original argument.
            relation_type = res[7]
            w.write(parse_sentence + '\n')
            cursor.execute(SELECT_SRL_SQL, (law_id, article_class, chapter_id,
                                            sentence_id, parse_sentence))
            srl_results = cursor.fetchall()
            srl_dict = srl_for_verb(srl_results)
            # one line per verb: "<verb>:<role>\t<role>\t..."
            for verb in srl_dict:
                w.write(verb + ':')
                for role_label in srl_dict[verb]:
                    w.write(role_label + '\t')
                w.write('\n')
            w.write(
                '\n================================================================================\n'
            )
            cursor.execute(SELECT_DP_SQL, (law_id, article_class, chapter_id,
                                           sentence_id, parse_sentence))
            dp_results = cursor.fetchall()
            # dependency triples: head -------- relation ------- dependent
            for dp in dp_results:
                w.write(dp[7] + ' -------- ' + dp[8] + ' ------- ' + dp[9] + '\n')
            w.write(
                '\n********************************************************************************\n'
            )
def city_index(word):  # map a city name to its standard code
    """Normalize *word* (drop spaces and the '市' suffix), look it up in
    the city table, and return {normalized_name: code}.

    The '市' suffix is appended back unless the city's canonical name
    does not end with it.  Returns None when no city matches.
    """
    cursor = conn.cursor()
    cursor.execute('select * from city')  # canonical city list
    city_dict = dict()
    for city in cursor.fetchall():
        # strip spaces and '市' so lookups are uniform
        city_dict[city[2].replace(' ', '').replace('市', '')] = city[1]
    word = word.replace(' ', '').replace('市', '')
    cursor.execute("select name from city where name not like '%市'")
    special_city_list = [c[0] for c in cursor.fetchall()]
    if word not in city_dict:  # no matching city
        return None
    code = city_dict[word]
    if word not in special_city_list:
        word = word + '市'
    return {word: code}
def province_index(word):  # map a province name to its standard code
    """Normalize *word* (drop spaces, '省' and '市'), look it up in the
    province table, and return {normalized_name: code} with the correct
    suffix restored ('市' for municipalities, '特别行政区' for SARs,
    otherwise '省').  Returns None when no province matches.
    """
    cursor = conn.cursor()
    cursor.execute('select * from province')  # canonical province list
    province_dict = dict()
    for province in cursor.fetchall():
        # strip spaces, '省' and '市' so lookups are uniform
        normalized = province[2].replace(' ', '').replace('省', '').replace('市', '')
        province_dict[normalized] = province[1]
    word = word.replace(' ', '').replace('省', '').replace('市', '')
    if word not in province_dict:  # no matching province
        return None
    code = province_dict[word]
    if word in ['北京', '上海', '重庆', '天津']:
        word = word + '市'
    elif word in ['澳门', '香港']:
        word = word + '特别行政区'
    else:
        word = word + '省'
    return {word: code}
def subject_save(subject_set):
    """Persist every subject into forestry_subject, then report the total."""
    cursor = conn.cursor()
    sql = '''insert into forestry_subject (subject) value (%s)'''
    for item in subject_set:
        cursor.execute(sql, (item, ))
        conn.commit()
        print(item)
    print(len(subject_set))
def get_article_2_map_dict():
    """Return a mapping of article_2 id -> (a_key, law_id).

    NOTE(review): another function with this same name is defined later
    in this module and shadows this one at import time; rename one of
    them if both variants are needed.
    """
    cursor = conn.cursor()
    cursor.execute('''select id, a_key, law_id from article_2''')
    mapping = dict()
    for row in cursor.fetchall():
        mapping[row[0]] = tuple((row[1], row[2]))
    return mapping
def get_article_2_map_dict():
    """Return a mapping of article_2 id -> law_id.

    NOTE(review): this redefinition shadows an earlier function of the
    same name (which also returned a_key); rename one of them if both
    variants are needed.
    """
    cursor = conn.cursor()
    cursor.execute('''select id, law_id from article_2''')
    return {row[0]: row[1] for row in cursor.fetchall()}
def article_1_sentence_extract():  # split article_1 contents into sentences
    """Split article_1 contents into a head sentence plus clauses and
    persist them.

    Contents containing ':' are stored as a non-single head sentence
    (text before the colon) plus one article_1_clause row per non-empty
    line after the colon; other contents are stored as single sentences.

    Bug fix: the original except handlers concatenated the exception
    object directly into the log string, which raised TypeError inside
    the handler; the exception is now formatted with str().  The
    loop-invariant insert SQL is also hoisted out of the loop.
    """
    cursor = conn.cursor()
    cursor.execute("select * from article_1")
    articles = cursor.fetchall()
    insert_article_1_sentence_sql = '''insert into article_1_sentence (article_1_id, is_single, content) value (%s, %s, %s)'''
    insert_article_1_clause_sql = '''insert into article_1_clause (article_1_id, article_1_sentence_id, clause_content) value (%s, %s, %s)'''
    for article in articles:
        article_1_id = article[0]
        article_1_content = article[2]
        if ':' in article_1_content:
            is_single = 0
            article_1_sentence_content = str(article_1_content).split(
                ':')[0].replace(" ", "")
            try:
                cursor.execute(
                    insert_article_1_sentence_sql,
                    (article_1_id, is_single, article_1_sentence_content))
                conn.commit()
            except Exception as e:
                conn.rollback()
                print('\033[1;32;41m' + str(article_1_id) +
                      article_1_sentence_content + str(e) +
                      ': FAILED---------' + '\033[0m')
            article_1_clauses = str(article_1_content).split(':')[1].split("\n")
            # NOTE(review): max(id) is race-prone under concurrent inserts;
            # cursor.lastrowid would be safer -- kept for behavioral parity.
            cursor.execute(
                '''SELECT id from article_1_sentence where id = (SELECT max(id) FROM article_1_sentence);'''
            )
            sentence_id = cursor.fetchone()[0]
            for article_1_clause in article_1_clauses:
                if article_1_clause is not None and article_1_clause != '':
                    try:
                        cursor.execute(
                            insert_article_1_clause_sql,
                            (article_1_id, sentence_id,
                             str(article_1_clause).replace(" ", "")))
                        conn.commit()
                    except Exception as e:
                        conn.rollback()
                        print('\033[1;32;41m' + str(article_1_id) +
                              article_1_clause + str(e) +
                              ': FAILED---------' + '\033[0m')
            print(article[2] + '============================================SUCCESS')
        else:
            is_single = 1
            try:
                cursor.execute(insert_article_1_sentence_sql,
                               (article_1_id, is_single, article_1_content))
                conn.commit()
                print(article_1_content +
                      '=========================================SUCCESS')
            except Exception as e:
                conn.rollback()
                print('\033[1;32;41m' + str(article_1_id) + '--' + str(e) +
                      ': FAILED---------' + '\033[0m')
def test_pyltp_sentence_split():
    """Print the pyltp sentence split of the first parsed content that
    contains a full-width colon (only the first match is printed)."""
    cursor = conn.cursor()
    cursor.execute("select p_content from law_content_parse")
    printed = 0
    for row in cursor.fetchall():
        if ':' in row[0] and printed == 0:
            print('\n'.join(SentenceSplitter.split(row[0])))
            printed += 1
def wash_law(table_name):
    """Strip the first four characters from every subject containing
    'law' in *table_name*, committing row by row.

    The table name is interpolated directly (it cannot be bound as a
    DB-API parameter), so it must come from trusted code.
    """
    cursor = conn.cursor()
    select_sql = "select id, subject from %s" % table_name + " where subject like %s"
    cursor.execute(select_sql, ('%' + 'law' + '%', ))
    rows = cursor.fetchall()
    update_sql = "update %s " % table_name + "set subject = %s where id = %s"
    for row in rows:
        cleaned = row[1][4:]
        cursor.execute(update_sql, (cleaned, row[0]))
        conn.commit()
        print(row[0], 'success')
def get_law_aim():
    """Return {law_id: aim_content}, one aim per law (grouped by law_id)."""
    cursor = conn.cursor()
    cursor.execute('''select * from law_aim group by law_id''')
    return {row[1]: row[2] for row in cursor.fetchall()}
def create_test_txt():
    """Append colon-bearing sub-sentences of the first 500 non-single
    article sentences to the law_input test file."""
    cursor = conn.cursor()
    cursor.execute(
        '''select * from article_1_sentence where is_single = 0 limit 0, 500''')
    rows = cursor.fetchall()
    with open(r"C:\Users\dhz1216\Desktop\test\law_input.txt", "a") as w:
        for row in rows:
            for piece in SentenceSplitter.split(row[3]):
                if piece is not None and piece != '' and ':' in piece:
                    w.write(str(piece).strip() + '\n')
def update_law(location, location_code, location_level, law_id):
    """Update a law row's location fields, committing on success and
    rolling back on failure.

    Bug fix: the failure log concatenated the exception object directly
    into a string (TypeError inside the handler); it now uses str(e).
    """
    cursor = conn.cursor()
    update_sql = "update law set location = %s, location_code = %s, location_level = %s where id = %s"
    try:
        cursor.execute(update_sql,
                       (location, location_code, location_level, law_id))
        conn.commit()
        print(str(law_id) + '-----------------------UPDATE SUCCESS')
    except Exception as e:
        conn.rollback()
        print('\033[1;32;41m' + str(law_id) + ': FAILED---------' + str(e) +
              '\033[0m')
def relation_wash(define_relation_list, relation_type):
    """Normalize the subject/object of each extracted relation and persist it.

    For every relation: strips digits and dots, drops a leading
    "(x)"-style marker from the subject, trims leading '、' and trailing
    punctuation from both ends, inserts the washed relation via
    insert_new_relation_base_type, and records subjects shorter than 15
    characters into new_forestry_subject.

    Improvements over the original: removed unused regexes, SQL strings,
    an unused cursor and an unused law-aim lookup; replaced the chained
    endswith() checks with tuple arguments; renamed locals that shadowed
    the builtins ``id`` and ``object``.
    """
    digit_reg = "[1234567890.]+"      # digits/dots to strip everywhere
    paren_reg = "^[((]"              # leading half/full-width parenthesis
    trailing = (',', '、', ',', '。', ';', ':')  # punctuation to trim at the end
    subject_list = []
    for relation in define_relation_list:
        law_id = relation[1]
        chapter_id = relation[2]
        sentence_id = relation[3]
        parse_sentence = relation[4]
        subj = re.sub(digit_reg, '', relation[5])
        obj = re.sub(digit_reg, '', relation[7])
        if re.search(paren_reg, subj):
            # drop a "(x)"-style list marker (three characters)
            subj = subj[3:]
        if subj.startswith('、'):
            subj = subj[1:]
        if subj.endswith(trailing):
            subj = subj[:-1]
        if obj.startswith('、'):
            obj = obj[1:]
        if obj.endswith(trailing):
            obj = obj[:-1]
        if len(subj) < 15:  # only short phrases look like real subjects
            subject_list.append(subj)
        data = {
            'law_id': law_id,
            'chapter_id': chapter_id,
            'sentence_id': sentence_id,
            'parse_sentence': parse_sentence,
            'subject': subj,
            'relation': '权利/义务',
            'object': obj
        }
        insert_new_relation_base_type(relation_type, data)
    insert_new_forestry_subject(list(set(subject_list)))
def law_classify():  # classify law/regulation text files
    """Classify each law text file under dir_path by keyword, update the
    law's type, and record the (law_id, class_id) pair.

    Bug fixes: execute() now receives its parameter as a one-element
    tuple ((text_name,) instead of the bare string), and exceptions are
    formatted with str() instead of being concatenated directly (which
    raised TypeError inside the handlers).
    """
    dir_path = "C:\\Users\\dhz1216\\Desktop\\wenben"
    class_select_sql = "select * from law_class"  # classification table
    law_select_sql = "select id from law where text_name = %s"  # law id by text name
    update_sql = "update law set type = %s where id = %s"  # update law.type
    insert_sql = "insert into law_to_class (law_id, class_id) value (%s, %s)"  # mapping row
    cursor = conn.cursor()
    cursor.execute(class_select_sql)
    results = cursor.fetchall()
    class_id_dict = dict()       # class name -> class id
    class_keyword_dict = dict()  # class name -> keyword list
    for res in results:
        class_id_dict[res[1]] = res[0]
        class_keyword_dict[res[1]] = str(res[2]).split(',')
    for file in os.listdir(dir_path):
        text_name = file.split('.')[0]
        cursor.execute(law_select_sql, (text_name,))
        law = cursor.fetchone()
        if law is None:
            continue
        law_id = law[0]
        class_type = ''
        # first keyword hit wins
        for c in class_id_dict:
            for word in class_keyword_dict[c]:
                if word in text_name:
                    class_type = c
                    break
            if class_type:
                break
        if not class_type:
            class_type = '其他'
        class_id = class_id_dict[class_type]  # id of the chosen class
        try:
            cursor.execute(update_sql, (class_type, law_id))
            conn.commit()
            try:
                cursor.execute(insert_sql, (law_id, class_id))
                conn.commit()
            except Exception as e:
                conn.rollback()
                print('\033[1;32;41m' + text_name + str(e) +
                      ': INSERT FAILED---------' + '\033[0m')
            print(text_name + '--------success--------' + class_type)
        except Exception as e:
            conn.rollback()
            print('\033[1;32;41m' + text_name + str(e) +
                  ': UPDATE FAILED---------' + '\033[0m')
        print(text_name + '------------------' + class_type + str(class_id))
def write_to_file_for_observe():
    """Append the DP, SRL and SDP analyses of every distinct parse
    sentence to OUTPUT_FILE for manual observation."""
    query_for_parse_sentence = '''select complete_sentence, parse_sentence, class, sentence_id from dependency_parsing_result group by parse_sentence order by id'''
    cursor = conn.cursor()
    cursor.execute(query_for_parse_sentence)
    parse_sentences = cursor.fetchall()
    with open(OUTPUT_FILE, "a") as w:
        for parse_sentence in parse_sentences:
            # original sentence and its parsed form
            w.write("原句:" + parse_sentence[0] + '\n')
            w.write("解析:" + parse_sentence[1] + '\n')
            # all three analyses are keyed by (parse_sentence, class, sentence_id)
            cursor.execute(
                SELECT_DP_SQL,
                (parse_sentence[1], parse_sentence[2], parse_sentence[3]))
            dp_results = cursor.fetchall()
            cursor.execute(
                SELECT_SDP_SQL,
                (parse_sentence[1], parse_sentence[2], parse_sentence[3]))
            sdp_results = cursor.fetchall()
            cursor.execute(
                SELECT_SRL_SQL,
                (parse_sentence[1], parse_sentence[2], parse_sentence[3]))
            srl_results = decode_srl_results(cursor.fetchall())
            # dependency parsing section
            w.write(
                "-----------------------------依存句法分析结果---------------------------\n"
            )
            for dp in dp_results:
                front_word = dp[7]
                relation_name = dp[8]
                tail_word = dp[9]
                w.write("%s -----(%s)---- %s\n" %
                        (front_word, relation_name, tail_word))
            # semantic role labeling section: one line per verb
            w.write(
                "-----------------------------语义角色标注结果---------------------------\n"
            )
            for verb in srl_results:
                w.write(verb + ":\t")
                for role_info in srl_results[verb]:
                    w.write(role_info[0] + '-' + role_info[1] + '\t')
                w.write('\n')
            # semantic dependency parsing section
            w.write(
                "-----------------------------语义依存分析结果---------------------------\n"
            )
            for sdp in sdp_results:
                front_word = sdp[7]
                relation_name = sdp[8]
                tail_word = sdp[9]
                w.write("%s -----(%s)---- %s\n" %
                        (front_word, relation_name, tail_word))
            w.write(
                "\n********************************************************************************************\n"
            )
def accord_relation_process():
    """Print every distinct '/'-separated accord value found in accord_relation."""
    cursor = conn.cursor()
    cursor.execute('select * from accord_relation')
    accord_set = set()
    for row in cursor.fetchall():
        accord_set.update(str(row[5]).split('/'))
    for accord in accord_set:
        print(accord)
def query_for_subject():
    """Collect the distinct subjects (column 4) from every per-class
    '<class>_relation' table; returns them as a set."""
    cursor = conn.cursor()
    subjects = set()
    for class_type in SINGLE_RELATION_CLASS:
        cursor.execute('select * from %s' % (class_type + '_relation'))
        for relation in cursor.fetchall():
            subjects.add(relation[4])
    return subjects
def chapter_article_parser(file_path):  # parse "第xx条"-style texts
    """Parse a '第xx章/第xx条'-style law text, write the formatted result
    under the washing directory, and insert each (key, content) pair
    into law_content_parse.

    Bug fix: the text-name lookup now passes its parameter to execute()
    as a one-element tuple instead of a bare string.
    """
    file_name = file_path.split('\\')[-1]
    write_path = "C:\\Users\\dhz1216\\Desktop\\washing\\第一类\\" + file_name
    # look up the id and name for this text in the base law table
    law_name = file_name.split('.')[0]
    try:
        cursor = conn.cursor()
        select_sql = "select id from law where text_name = %s"
        cursor.execute(select_sql, (file_name.split('.')[0],))
        law_id = cursor.fetchone()[0]
    except Exception as e:
        print(e)
        return
    with open(file_path, "r", encoding='gbk', errors='ignore') as f:
        line = f.readline()
        pattern = re.compile("第(.*?)(?:章|条)")
        while line:
            if line.startswith('【法规全文】'):
                line = line.replace('【法规全文】', '')
                with open(write_path, "a") as w:
                    while line:
                        match = pattern.match(line.lstrip())
                        if match:
                            p_key = match.group()
                            p_content = line.replace(match.group(), '').lstrip()
                            line = f.readline()
                            # accumulate lines until the next 第x章/条 header
                            while line:
                                match = pattern.match(line.lstrip())
                                if not match:
                                    p_content = p_content + line
                                    line = f.readline()
                                else:
                                    break
                            w.write(p_key + ': ' + p_content + '\n')
                            insert_sql = "insert into law_content_parse (law_id, p_key, p_content, law_name) " \
                                         "value (%s, %s, %s, %s)"
                            try:
                                cursor.execute(insert_sql,
                                               (law_id, p_key, p_content, law_name))
                                conn.commit()
                                print(file_name + ': PARSE SUCCESS')
                            except Exception as e:
                                print(e)
                                conn.rollback()
                                print('\033[1;32;41m' + file_name +
                                      ': PARSE FAILED---------' + '\033[0m')
                        else:
                            line = f.readline()
            else:
                line = f.readline()
def dp_based_similarity_core():
    """Prototype: score the similarity of two hard-coded sentences from
    their dependency triples plus word2vec word similarities.

    Triples sharing a dependency relation contribute
    0.7 * mean(word similarities) / max_len when both word pairs are
    similar enough (> 0.35).

    Fix: removed a dead assignment to sentence2 -- the original assigned
    it twice in a row, so the first value was never used.  Also
    deduplicated the two identical triple-extraction loops into a
    private helper.
    """
    cursor = conn.cursor()
    model = Word2Vec.load('../model/forestry_law.model')
    cn_reg = '^[\u4e00-\u9fa5]+$'  # keep only pure-Chinese tokens
    select_sql = '''select * from dependency_parsing_result where parse_sentence = %s'''
    sentence1 = '各级林业主管部门负责木材经营加工的管理和监督。'
    sentence2 = '市园林主管部门负责监督和技术指导。'
    # sentence2 = '没有违法所得或者违法所得不足三万元的,并处三千元以上三万元以下罚款。'

    def _triples(sentence):
        # (head, relation, dependent) triples whose words are all Chinese
        cursor.execute(select_sql, (sentence, ))
        group = []
        for res in cursor.fetchall():
            front_word, relation, tail_word = res[7], res[8], res[9]
            if re.search(cn_reg, front_word) and re.search(cn_reg, tail_word):
                group.append(tuple((front_word, relation, tail_word)))
        return group

    group1 = _triples(sentence1)
    group2 = _triples(sentence2)
    max_len = max(len(group1), len(group2))
    print(max_len)
    sim_score = 0
    for pair1 in group1:
        for pair2 in group2:
            if pair1[1] == pair2[1]:
                # NOTE(review): model[word] raises KeyError for OOV words and
                # is deprecated in newer gensim (use model.wv) -- kept as-is.
                if model[pair1[0]].any() and model[pair1[2]].any() and model[
                        pair2[0]].any() and model[pair2[2]].any():
                    sim1 = model.similarity(pair1[0], pair2[0])
                    sim2 = model.similarity(pair1[2], pair2[2])
                    print(sim1)
                    print(sim2)
                    print('-----------------------------')
                    if sim1 > 0.35 and sim2 > 0.35:
                        sim_score = sim_score + 0.7 * (
                            (sim1 + sim2) / 2) / max_len
    print(sim_score)
def science_spot_parser(file_path):
    """Parse a scenic-spot ('风景名胜') law text into key/value sections,
    mirror them to the washing directory, and insert each section into
    law_content.

    Bug fix: the text-name lookup now passes its parameter to execute()
    as a one-element tuple instead of a bare string.
    """
    dir_path = "C:\\Users\\dhz1216\\Desktop\\washing\\风景名胜"
    file_name = file_path.split("\\")[-1]
    cursor = conn.cursor()
    select_sql = "select id, name from law where text_name = %s"
    cursor.execute(select_sql, (file_name.split('.')[0],))
    law_id = cursor.fetchone()[0]
    count = 0
    with open(file_path, "r", encoding='gbk', errors='ignore') as f:
        line = f.readline()
        while line:
            if line.startswith('【法规全文】'):
                with open(dir_path + "\\" + file_name, "a") as w:
                    line = line.replace('【法规全文】', '')
                    # line = f.readline()
                    while line:
                        # a full-width space separates the key from its content
                        if len(line.lstrip().split(' ')) > 1:
                            key_title = line.lstrip().split(' ')[0]
                            value_content = line.lstrip().split(' ')[1]
                            line = f.readline()
                            # accumulate continuation lines carrying no new key
                            while line:
                                if len(line.lstrip().split(' ')) <= 1:
                                    value_content = value_content + line.lstrip(
                                    ).split(' ')[0]
                                    line = f.readline()
                                else:
                                    break
                            w.write(key_title + ':' + value_content + '\n')
                            insert_sql = "insert into law_content (law_id, p_key, p_content, law_class) " \
                                         "value (%s, %s, %s, %s)"
                            try:
                                cursor.execute(
                                    insert_sql,
                                    (law_id, key_title, value_content, '风景名胜'))
                                conn.commit()
                                count = count + 1
                                print('\033[1;37;40m' + file_name +
                                      ': PARSE SUCCESS' + '\033[0m')
                            except Exception as e:
                                print(e)
                                conn.rollback()
                                print('\033[1;32;41m' + file_name +
                                      ': PARSE FAILED---------' + '\033[0m')
                        else:
                            line = f.readline()
            else:
                line = f.readline()
    print('共插入:' + str(count) + '条')
def filter_subject():
    """Copy the distinct subjects of new_forestry_subject into
    new_forestry_subject_final, committing and logging per row."""
    cursor = conn.cursor()
    cursor.execute('''select * from new_forestry_subject''')
    insert_sql = '''insert into new_forestry_subject_final (subject) value (%s)'''
    unique_subjects = list({row[1] for row in cursor.fetchall()})
    for s in unique_subjects:
        cursor.execute(insert_sql, (s, ))
        conn.commit()
        print(s, 'success!')