def update_law(location, location_code, location_level, law_id):
    """Update the location columns of one row in the ``law`` table.

    Uses the module-level ``conn`` connection. The statement is committed on
    success; on any error it is rolled back and a highlighted failure line is
    printed (the error is not re-raised).

    Args:
        location: Value written to ``law.location``.
        location_code: Value written to ``law.location_code``.
        location_level: Value written to ``law.location_level``.
        law_id: Value matched against ``law.id``.
    """
    cursor = conn.cursor()
    update_sql = ("update law set location = %s, location_code = %s, "
                  "location_level = %s where id = %s")
    try:
        cursor.execute(update_sql,
                       (location, location_code, location_level, law_id))
        conn.commit()
        print(str(law_id) + '-----------------------UPDATE SUCCESS')
    except Exception as e:
        conn.rollback()
        # str(e): concatenating the exception object itself raises TypeError
        # and would hide the real database error.
        print('\033[1;32;41m' + str(law_id) + ': FAILED---------' + str(e) +
              '\033[0m')
def law_classify():
    """Classify the law texts found in the text directory.

    Every row of ``law_class`` is loaded; column 0 is used as the category
    id, column 1 as the category name and column 2 as a comma-separated
    keyword list. For each file in the hard-coded directory, the part of the
    file name before the first '.' is looked up in ``law.text_name``. The
    first category with a keyword contained in that name is chosen, falling
    back to '其他' when none matches. The category name is written to
    ``law.type`` and a ``(law_id, class_id)`` row is inserted into
    ``law_to_class``. Each statement is committed separately; failures are
    rolled back and printed.

    Raises:
        KeyError: If the fallback category '其他' is needed but is not a
            category name in ``law_class``.
    """
    dir_path = "C:\\Users\\dhz1216\\Desktop\\wenben"
    class_select_sql = "select * from law_class"  # load the category table
    law_select_sql = "select id from law where text_name = %s"  # law id by text name
    update_sql = "update law set type = %s where id = %s"  # store the category name
    insert_sql = "insert into law_to_class (law_id, class_id) value (%s, %s)"  # law/category link

    cursor = conn.cursor()
    cursor.execute(class_select_sql)
    class_id_dict = {}  # category name -> category id
    class_keyword_dict = {}  # category name -> list of keywords
    for res in cursor.fetchall():
        class_id_dict[res[1]] = res[0]
        class_keyword_dict[res[1]] = str(res[2]).split(',')

    for file in os.listdir(dir_path):
        text_name = file.split('.')[0]
        # Parameters are passed as a one-element tuple, as DB-API expects.
        cursor.execute(law_select_sql, (text_name,))
        law = cursor.fetchone()
        if law is None:
            continue
        law_id = law[0]

        class_type = ''
        for c in class_id_dict:
            # Empty keywords are skipped: '' is a substring of every name and
            # would make the category match every file.
            if any(word and word in text_name
                   for word in class_keyword_dict[c]):
                class_type = c
                break
        if not class_type:
            class_type = '其他'
        class_id = class_id_dict[class_type]  # id of the chosen category

        try:
            cursor.execute(update_sql, (class_type, law_id))
            conn.commit()
            try:
                cursor.execute(insert_sql, (law_id, class_id))
                conn.commit()
            except Exception as e:
                conn.rollback()
                print('\033[1;32;41m' + text_name + str(e) +
                      ': INSERT FAILED---------' + '\033[0m')
            print(text_name + '--------success--------' + class_type)
        except Exception as e:
            conn.rollback()
            print('\033[1;32;41m' + text_name + str(e) +
                  ': UPDATE FAILED---------' + '\033[0m')
        print(text_name + '------------------' + class_type + str(class_id))
def chapter_article_parser(file_path):
    """Parse a law text organised by "第xx章" / "第xx条" headings.

    The law id is looked up by the file name (up to the first '.') in
    ``law.text_name``; if that lookup fails the error is printed and the
    function returns. Reading starts at the line beginning with
    '【法规全文】'. Every line that (after left-stripping) starts with a
    heading matched by the pattern opens a new section; following lines
    without a heading are appended to it. Each section is appended to a file
    under the hard-coded output directory and inserted into
    ``law_content_parse``.

    Args:
        file_path: Backslash-separated path of the GBK-encoded source text.
    """
    file_name = file_path.split('\\')[-1]
    write_path = "C:\\Users\\dhz1216\\Desktop\\washing\\第一类\\" + file_name
    # The law name doubles as the lookup key in law.text_name.
    law_name = file_name.split('.')[0]
    try:
        cursor = conn.cursor()
        select_sql = "select id from law where text_name = %s"
        # One-element tuple, as DB-API expects for positional parameters.
        cursor.execute(select_sql, (law_name,))
        law_id = cursor.fetchone()[0]
    except Exception as e:
        print(e)
        return

    pattern = re.compile("第(.*?)(?:章|条)")
    insert_sql = "insert into law_content_parse (law_id, p_key, p_content, law_name) " \
                 "value (%s, %s, %s, %s)"
    with open(file_path, "r", encoding='gbk', errors='ignore') as f:
        line = f.readline()
        while line:
            if not line.startswith('【法规全文】'):
                line = f.readline()
                continue
            line = line.replace('【法规全文】', '')
            with open(write_path, "a") as w:
                while line:
                    stripped = line.lstrip()
                    match = pattern.match(stripped)
                    if not match:
                        line = f.readline()
                        continue
                    p_key = match.group()
                    # Cut only the leading heading; str.replace would also
                    # delete later mentions of the same heading in the text.
                    p_content = stripped[match.end():].lstrip()
                    line = f.readline()
                    # Collect continuation lines up to the next heading.
                    while line and not pattern.match(line.lstrip()):
                        p_content = p_content + line
                        line = f.readline()
                    w.write(p_key + ': ' + p_content + '\n')
                    try:
                        cursor.execute(insert_sql,
                                       (law_id, p_key, p_content, law_name))
                        conn.commit()
                        print(file_name + ': PARSE SUCCESS')
                    except Exception as e:
                        print(e)
                        conn.rollback()
                        print('\033[1;32;41m' + file_name +
                              ': PARSE FAILED---------' + '\033[0m')
def science_spot_parser(file_path):
    """Parse a "key value" style text and store it with class '风景名胜'.

    The law id is looked up by the file name (up to the first '.') in
    ``law.text_name``. Reading starts at the line beginning with
    '【法规全文】'. A left-stripped line that splits on a single space into
    more than one part opens an entry (first part is the key, second part
    the start of the value); following lines that do not split are appended
    to the value. Each entry is appended to a file under the hard-coded
    output directory and inserted into ``law_content``. The number of
    successful inserts is printed at the end.

    Args:
        file_path: Backslash-separated path of the GBK-encoded source text.
    """
    dir_path = "C:\\Users\\dhz1216\\Desktop\\washing\\风景名胜"
    file_name = file_path.split("\\")[-1]
    cursor = conn.cursor()
    select_sql = "select id, name from law where text_name = %s"
    # One-element tuple, as DB-API expects for positional parameters.
    cursor.execute(select_sql, (file_name.split('.')[0],))
    law_row = cursor.fetchone()
    if law_row is None:
        # Without a matching law row there is nothing to attach entries to;
        # indexing None here would raise TypeError.
        print(file_name + ': NO MATCHING LAW, SKIPPED')
        return
    law_id = law_row[0]

    insert_sql = "insert into law_content (law_id, p_key, p_content, law_class) " \
                 "value (%s, %s, %s, %s)"
    count = 0
    with open(file_path, "r", encoding='gbk', errors='ignore') as f:
        line = f.readline()
        while line:
            if not line.startswith('【法规全文】'):
                line = f.readline()
                continue
            with open(dir_path + "\\" + file_name, "a") as w:
                line = line.replace('【法规全文】', '')
                while line:
                    parts = line.lstrip().split(' ')
                    if len(parts) <= 1:
                        line = f.readline()
                        continue
                    key_title = parts[0]
                    value_content = parts[1]
                    line = f.readline()
                    # Lines that do not split belong to the current value.
                    while line:
                        parts = line.lstrip().split(' ')
                        if len(parts) > 1:
                            break
                        value_content = value_content + parts[0]
                        line = f.readline()
                    w.write(key_title + ':' + value_content + '\n')
                    try:
                        cursor.execute(
                            insert_sql,
                            (law_id, key_title, value_content, '风景名胜'))
                        conn.commit()
                        count = count + 1
                        print('\033[1;37;40m' + file_name +
                              ': PARSE SUCCESS' + '\033[0m')
                    except Exception as e:
                        print(e)
                        conn.rollback()
                        print('\033[1;32;41m' + file_name +
                              ': PARSE FAILED---------' + '\033[0m')
    print('共插入:' + str(count) + '条')
def one_two_article_parser(file_path):
    """Parse a law text whose sections are detected by the title helper.

    The law id is looked up by the file name (up to the first '.') in
    ``law.text_name``; if that lookup fails the error is printed and the
    function returns. Reading starts at the line beginning with
    '【法规全文】'. A line for which ``is_cotain_one_two_title`` returns a
    value other than ``None`` opens a section with that value as its key;
    following lines for which it returns ``None`` are appended to the
    section. Each section is appended to a file under the hard-coded output
    directory and inserted into ``law_content_parse``.

    Args:
        file_path: Backslash-separated path of the GBK-encoded source text.
    """
    file_name = file_path.split('\\')[-1]
    write_path = "C:\\Users\\dhz1216\\Desktop\\washing\\第二类\\" + file_name
    # The law name doubles as the lookup key in law.text_name.
    law_name = file_name.split('.')[0]
    try:
        cursor = conn.cursor()
        select_sql = "select id from law where text_name = %s"
        # One-element tuple, as DB-API expects for positional parameters.
        cursor.execute(select_sql, (law_name,))
        law_id = cursor.fetchone()[0]
    except Exception as e:
        print(e)
        return

    insert_sql = "insert into law_content_parse (law_id, p_key, p_content, law_name) " \
                 "value (%s, %s, %s, %s)"
    with open(file_path, "r", encoding='gbk', errors='ignore') as f:
        line = f.readline()
        while line:
            if not line.startswith('【法规全文】'):
                line = f.readline()
                continue
            line = line.replace('【法规全文】', '')
            with open(write_path, "a") as w:
                while line:
                    # Call the helper once per line and reuse the result.
                    # NOTE(review): assumes the helper has no side effects.
                    p_key = is_cotain_one_two_title(line)
                    if p_key is None:
                        line = f.readline()
                        continue
                    p_content = line.lstrip().replace(p_key, '')
                    line = f.readline()
                    # Collect continuation lines up to the next title.
                    while line and is_cotain_one_two_title(line) is None:
                        p_content = p_content + line
                        line = f.readline()
                    w.write(p_key + ': ' + p_content + '\n')
                    try:
                        cursor.execute(insert_sql,
                                       (law_id, p_key, p_content, law_name))
                        conn.commit()
                        print(file_name + ': PARSE SUCCESS')
                    except Exception as e:
                        print(e)
                        conn.rollback()
                        print('\033[1;32;41m' + file_name +
                              ': PARSE FAILED---------' + '\033[0m')
def explain_relation_process(law_id, sentence_parse_info, content):
    """Store "organisation --relation--> law" triples for one sentence.

    For each verb the role entries are grouped by label. A verb is skipped
    when it has neither an 'A1' nor a 'C-A1' entry, or when it does not have
    exactly one 'A0' entry. Otherwise the relation name is the verb followed
    by an 'A1' / 'C-A1' text, the organisation is the 'A0' text with every
    '由' removed, and a row is inserted into ``response_to_explain``.

    Args:
        law_id: Id stored in the ``law_id`` column.
        sentence_parse_info: Mapping from verb to a list of
            ``(role_label, text)`` pairs, e.g.
            ``[('A1', '本办法'), ('A0', '由省财政厅'), ('C-A1', '解释')]``.
        content: Sentence text stored in the ``from_sentence`` column.
    """
    cursor = conn.cursor()
    insert_sql = '''insert into response_to_explain (law_id, responsibility, relation, from_sentence) value (%s, %s, %s, %s)'''
    for verb, verb_role_list in dict(sentence_parse_info).items():
        # Group the role texts by label, keeping their order.
        verb_role_dict = {}
        for role in verb_role_list:
            verb_role_dict.setdefault(role[0], []).append(role[1])

        has_a1 = 'A1' in verb_role_dict
        has_c_a1 = 'C-A1' in verb_role_dict
        if not has_a1 and not has_c_a1:
            continue
        if 'A0' not in verb_role_dict or len(verb_role_dict['A0']) > 1:
            continue

        law_name = ''
        relation_name = verb
        orgnization = verb_role_dict['A0'][0]
        a1_count = len(verb_role_dict['A1']) if has_a1 else 0
        if a1_count == 2:
            law_name = verb_role_dict['A1'][0]
            relation_name = relation_name + verb_role_dict['A1'][1]
        elif a1_count == 1 and has_c_a1:
            law_name = verb_role_dict['A1'][0]
            relation_name = relation_name + verb_role_dict['C-A1'][0]
        elif a1_count == 1:
            law_name = '本条例 | 本办法'
            relation_name = relation_name + verb_role_dict['A1'][0]
        elif not has_a1 and has_c_a1:
            law_name = '本条例 | 本办法'
            relation_name = relation_name + verb_role_dict['C-A1'][0]

        print("【%s】(%s ---%s--> %s)" %
              (str(law_id), law_name, relation_name, orgnization))
        orgnization = str(orgnization).replace('由', '')
        try:
            cursor.execute(insert_sql,
                           (law_id, orgnization, relation_name, content))
            conn.commit()
        except Exception as e:
            conn.rollback()
            # Report the real error, and convert law_id explicitly: the
            # success line above already treats it as a non-string value.
            print(e)
            print('\033[1;32;41m' + str(law_id) +
                  ': ------------PARSE FAILED---------' + '\033[0m')
def clause_strip():
    """Strip surrounding whitespace from the clause text of both clause tables.

    All rows of ``article_1_clause`` and ``article_2_clause`` are read first.
    For each row, column 3 is stripped and written back to ``clause_content``
    for the id in column 0. Rows are committed one by one; failures are
    rolled back and printed. Three separator lines are printed between the
    two tables.
    """
    select_article_1_sql = '''select * from article_1_clause'''
    select_article_2_sql = '''select * from article_2_clause'''
    update_article_1_sql = '''update article_1_clause set clause_content = %s where id = %s'''
    update_article_2_sql = '''update article_2_clause set clause_content = %s where id = %s'''
    separator = '========================================================================================================='

    cursor = conn.cursor()
    cursor.execute(select_article_1_sql)
    article_1_clauses = cursor.fetchall()
    cursor.execute(select_article_2_sql)
    article_2_clauses = cursor.fetchall()

    jobs = (
        (article_1_clauses, update_article_1_sql),
        (article_2_clauses, update_article_2_sql),
    )
    for position, (clauses, update_sql) in enumerate(jobs):
        if position:
            for _ in range(3):
                print(separator)
        for clause in clauses:
            clause_id = clause[0]
            if clause[3] is None:
                # str(None) would overwrite a NULL with the text 'None'.
                continue
            clause_content = str(clause[3]).strip()
            try:
                cursor.execute(update_sql, (clause_content, clause_id))
                conn.commit()
                print(
                    str(clause_id) + clause_content +
                    '------------------------------------SUCCESS')
            except Exception as e:
                conn.rollback()
                # str(e): concatenating the exception object raises TypeError.
                print('\033[1;32;41m' + str(clause_id) + '--' + str(e) +
                      ': FAILED---------' + '\033[0m')
def key_words_extract():
    """Extract keywords for every law text and store them in ``law.key_words``.

    Each file in the hard-coded directory is read as GBK text. Keywords are
    extracted with both ``analyse.textrank`` (top 3) and
    ``analyse.extract_tags`` (top 5). The intersection of the two results is
    used, or their union when the intersection is empty. The keywords are
    joined with ',' and written to the ``law`` row whose ``text_name`` equals
    the file name up to the first '.'. Files without a matching row are
    skipped.
    """
    dir_path = "C:\\Users\\dhz1216\\Desktop\\wenben\\"
    cursor = conn.cursor()
    select_sql = "select id from law where text_name = %s"
    update_sql = "update law set key_words = %s where id = %s"
    allowed_pos = ('n', 'ns', 'vn', 'v', 'nz')

    for file in os.listdir(dir_path):
        with open(dir_path + file, "r", encoding='gbk', errors='ignore') as f:
            text = f.read()
        text_name = file.split('.')[0]
        # One-element tuple, as DB-API expects for positional parameters.
        cursor.execute(select_sql, (text_name,))
        law = cursor.fetchone()
        if not law:
            continue
        law_id = law[0]

        key_words_textrank = set(
            analyse.textrank(text, topK=3, withWeight=False,
                             allowPOS=allowed_pos))
        key_words_tfidf = set(
            analyse.extract_tags(text, topK=5, withWeight=False,
                                 allowPOS=allowed_pos))
        # Prefer words both methods agree on; fall back to all candidates.
        key_words_list = list(key_words_textrank & key_words_tfidf)
        if not key_words_list:
            key_words_list = list(key_words_textrank | key_words_tfidf)
        key_words = ','.join(key_words_list)

        try:
            cursor.execute(update_sql, (key_words, law_id))
            conn.commit()
            print(text_name + '--------UPDATE SUCCESS')
        except Exception as e:
            conn.rollback()
            # str(e): concatenating the exception object raises TypeError.
            print('\033[1;32;41m' + text_name + ': PARSE FAILED---------' +
                  str(e) + '\033[0m')
def violate_punishment_save(is_contain_accord, violate_punishment_accord_info,
                            law_id):
    """Insert one violation/punishment record into ``violate_punishment_accord``.

    ``violate_punishment_accord_info`` must provide 'violate_info' and
    'punishment_info', and also 'accord_info' when ``is_contain_accord`` is
    1. From each of them, items 0-2 fill the chapter/article/sentence id
    columns and item 3 fills the content column. ``law_id`` is stored in
    every ``*_law_id`` column. The insert is committed on success; on any
    error it is rolled back and the error is printed.

    Args:
        is_contain_accord: 1 when the record also carries accord columns.
        violate_punishment_accord_info: Mapping described above.
        law_id: Id stored as violate/punishment/accord law id.
    """
    db_cursor = conn.cursor()
    with_accord = is_contain_accord == 1

    v_info = violate_punishment_accord_info['violate_info']
    p_info = violate_punishment_accord_info['punishment_info']
    if with_accord:
        a_info = violate_punishment_accord_info['accord_info']
        statement = (
            'insert into violate_punishment_accord (violate_law_id, '
            'violate_chapter_id, violate_article_id, violate_sentence_id, '
            'punishment_law_id, punishment_chapter_id, '
            'punishment_article_id, punishment_sentence_id, accord_law_id, '
            'accord_chapter_id, accord_article_id, accord_sentence_id, '
            'violate_content, punishment_content, accord_content, '
            'is_contain_accord) value (%s, %s, %s, %s, %s, %s, %s, %s, %s, '
            '%s, %s, %s, %s, %s, %s, %s)')
    else:
        statement = (
            'insert into violate_punishment_accord (violate_law_id, '
            'violate_chapter_id, violate_article_id, violate_sentence_id, '
            'punishment_law_id, punishment_chapter_id, '
            'punishment_article_id, punishment_sentence_id, '
            'violate_content, punishment_content, is_contain_accord) value '
            '(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)')

    try:
        # The parameter tuple is built inside the try block so that indexing
        # errors are handled the same way as database errors.
        if with_accord:
            values = (law_id, v_info[0], v_info[1], v_info[2],
                      law_id, p_info[0], p_info[1], p_info[2],
                      law_id, a_info[0], a_info[1], a_info[2],
                      v_info[3], p_info[3], a_info[3],
                      int(is_contain_accord))
        else:
            values = (law_id, v_info[0], v_info[1], v_info[2],
                      law_id, p_info[0], p_info[1], p_info[2],
                      v_info[3], p_info[3],
                      int(is_contain_accord))
        db_cursor.execute(statement, values)
        conn.commit()
    except Exception as err:
        conn.rollback()
        print(err)
def article_2_key_process():
    """Normalise ``article_2.a_key`` values to the form "第XX条".

    Every key that does not contain '条' has each '、' removed and is wrapped
    as '第' + key + '条'. Rows are updated and committed one by one; failures
    are rolled back and printed.
    """
    select_sql = "select id, a_key from article_2"
    update_sql = "update article_2 set a_key = %s where id = %s"
    cursor = conn.cursor()
    cursor.execute(select_sql)
    for article in cursor.fetchall():
        if '条' in article[1]:
            continue
        article_key = '第' + str(article[1]).replace('、', '') + '条'
        try:
            cursor.execute(update_sql, (article_key, article[0]))
            conn.commit()
            print(
                str(article[0]) + article_key +
                '--------------------UPDATE SUCCESS')
        except Exception as e:
            conn.rollback()
            # str(e): concatenating the exception object raises TypeError.
            print('\033[1;32;41m' + str(article[0]) + article_key + str(e) +
                  ': ARTICLE FAILED---------' + '\033[0m')
def class_one_sentences_extracte():
    """Split parsed "chapter/article" sections into rows of ``sentences``.

    Only rows of ``law_content_parse`` whose ``p_key`` matches the
    chapter/article pattern are handled. Content containing ':' is stored
    whole with ``is_single = 0``; other content is split on newlines and each
    non-empty piece is stored with ``is_single = 1``. The
    ``law_content_parse`` id is stored as ``title_id``.
    """
    heading = re.compile("第(.*?)(?:章|条)")  # recognises chapter/article keys
    db_cursor = conn.cursor()
    db_cursor.execute(
        "select id, law_id, p_key, p_content from law_content_parse")
    parsed_rows = db_cursor.fetchall()
    insert_sentence_sql = "insert into sentences (law_id, title_id, sentence, is_single) " \
                          "value (%s, %s, %s, %s)"

    for title_id, law_id, p_key, p_content in parsed_rows:
        if not heading.match(p_key):
            continue
        text = str(p_content)
        if ':' in text:
            # Stored as-is; flagged as not being a single sentence.
            pending = [(p_content, 0)]
        else:
            pending = [(piece, 1) for piece in text.split('\n') if piece]
        for value, single_flag in pending:
            try:
                db_cursor.execute(insert_sentence_sql,
                                  (law_id, title_id, value, single_flag))
                conn.commit()
                print(str(value) + '-----------Success')
            except Exception as err:
                print('\033[1;32;41m' + str(value) + ': FAILED---------' +
                      '\033[0m')
                conn.rollback()
                print(err)
def location_extract():
    """Derive a location for every law from its name and store it.

    The first city from ``city`` (names not ending in '市') that occurs in
    the law name is used. Otherwise the name is segmented and POS-tagged with
    the module-level ``segmentor`` / ``postagger``, and the first token
    tagged 'ns' is taken together with up to two directly following 'ns'
    tokens. A result of at most one character is replaced by
    '中华人民共和国'. The value is written to ``law.location``, one commit
    per row.
    """
    select_sql = "select id, name from law"
    update_sql = "update law set location = %s where id = %s"
    select_special_city_sql = "select name from city where name not like '%市'"

    cursor = conn.cursor()
    cursor.execute(select_sql)
    results = cursor.fetchall()
    cursor.execute(select_special_city_sql)
    special_city_list = [row[0] for row in cursor.fetchall()]

    for result in results:
        title = result[1]
        location = next(
            (city for city in special_city_list if city in title), '')
        if not location:
            words = list(segmentor.segment(title))
            postag = list(postagger.postag(words))
            for index, tag in enumerate(postag):
                if tag != 'ns':
                    continue
                # Extend over at most three consecutive 'ns' tokens, never
                # reading past the end of the tag list.
                end = index + 1
                while (end < len(postag) and end < index + 3
                       and postag[end] == 'ns'):
                    end += 1
                location = ''.join(words[index:end])
                break
        if len(location) <= 1:
            location = '中华人民共和国'
        try:
            cursor.execute(update_sql, (location, result[0]))
            conn.commit()
            print(str(result[0]) + result[1] + '-----------SUCCESS')
        except Exception as e:
            conn.rollback()
            # str(e): concatenating the exception object raises TypeError.
            print('\033[1;32;41m' + str(result[0]) + result[1] + str(e) +
                  '\033[0m')
def law_to_class_process():
    """Fill ``law.class_id`` with the id of the matching ``law_class`` row.

    For every law, its ``type`` value is looked up in ``law_class.type`` and
    the id found is written to ``law.class_id``. Rows are committed one by
    one; laws whose type has no ``law_class`` row are reported and skipped.
    """
    cursor = conn.cursor()
    law_select = "select id, type, name from law"
    class_select_sql = "select id from law_class where type = %s"
    update_sql = "update law set class_id = %s where id = %s"
    cursor.execute(law_select)
    for law in cursor.fetchall():
        law_id = law[0]
        law_type = law[1]
        # One-element tuple, as DB-API expects for positional parameters.
        cursor.execute(class_select_sql, (law_type,))
        class_info = cursor.fetchone()
        if class_info is None:
            # No category row: indexing None would abort the whole run.
            print('\033[1;32;41m' + str(law[2]) + ': PARSE FAILED---------' +
                  'no law_class row for type ' + str(law_type) + '\033[0m')
            continue
        class_id = class_info[0]
        try:
            cursor.execute(update_sql, (class_id, law_id))
            conn.commit()
            print(law[2] + '-----------------SUCCESS')
        except Exception as e:
            conn.rollback()
            # str(e): concatenating the exception object raises TypeError.
            print('\033[1;32;41m' + law[2] + ': PARSE FAILED---------' +
                  str(e) + '\033[0m')
def update_article():
    """Append ':' to the content of every non-single sentence row.

    ``article_1_sentence`` is processed first, then ``article_2_sentence``.
    For each row with ``is_single = 0``, column 3 plus ':' is written back to
    ``content`` for the id in column 0. Rows are committed one by one;
    failures are rolled back and printed. Running the function again appends
    another ':'.
    """
    select_article_1_sentence = '''select * from article_1_sentence where is_single = 0'''
    select_article_2_sentence = '''select * from article_2_sentence where is_single = 0'''
    update_article_1_sentence = '''update article_1_sentence set content = %s where id = %s'''
    update_article_2_sentence = '''update article_2_sentence set content = %s where id = %s'''
    separator = '============================================================================================='

    cursor = conn.cursor()
    jobs = (
        ('-1-', select_article_1_sentence, update_article_1_sentence),
        ('-2-', select_article_2_sentence, update_article_2_sentence),
    )
    for position, (label, select_sql, update_sql) in enumerate(jobs):
        if position:
            print('\n', separator, '\n')
        cursor.execute(select_sql)
        sentences = cursor.fetchall()
        for sentence in sentences:
            sentence_id = sentence[0]
            content = sentence[3] + ':'
            try:
                cursor.execute(update_sql, (content, sentence_id))
                conn.commit()
                print(str(sentence_id), label, content,
                      '------------------SUCCESS')
            except Exception as e:
                conn.rollback()
                # The label names the table being processed; failures in
                # article_1_sentence used to be reported as '-2-'.
                print('\033[1;32;41m', str(sentence_id), label, e,
                      '-------FAILED', '\033[0m')
def save_relation(relation_list, law_id, content_class, chapter_id,
                  sentence_id):
    """Insert extracted ``(subject, relation, object)`` triples.

    Each triple becomes one row of ``extract_relation``. ``is_contain`` is 1
    when the object equals the placeholder '根据章节条款信息补全list' and 0
    otherwise. Rows are committed one by one; failures are rolled back and
    printed.

    Args:
        relation_list: Iterable of indexable triples
            ``(subject, relation, object)``.
        law_id: Value for the ``law_id`` column.
        content_class: Value for the ``class`` column.
        chapter_id: Value for the ``chapter_id`` column.
        sentence_id: Value for the ``sentence_id`` column.
    """
    cursor = conn.cursor()
    insert_sql = '''insert into extract_relation (law_id, class, chapter_id, sentence_id, is_contain, subject, relation, object) value (%s, %s, %s, %s, %s, %s, %s, %s)'''
    for relation in relation_list:
        subject = relation[0]
        relation_name = relation[1]
        obj = relation[2]  # not named `object`, to keep the builtin usable
        is_contain = 1 if obj == '根据章节条款信息补全list' else 0
        try:
            cursor.execute(insert_sql,
                           (law_id, content_class, chapter_id, sentence_id,
                            is_contain, subject, relation_name, obj))
            conn.commit()
            print(subject, relation_name, obj, '--------saved--------')
        except Exception as e:
            conn.rollback()
            # Both the triple and the exception need str(): concatenating
            # them directly raises TypeError.
            print('\033[1;32;41m' + str(relation) + str(e) +
                  ': FAILED---------' + '\033[0m')
def article_1_sentence_extract():
    """Split ``article_1`` content into sentence and clause rows.

    Content without ':' is stored as one ``article_1_sentence`` row with
    ``is_single = 1``. Otherwise the text before the first ':' (spaces
    removed) is stored with ``is_single = 0``, and every non-empty line of
    the text after that ':' is stored in ``article_1_clause`` (spaces
    removed), linked to the highest ``article_1_sentence`` id. Column 0 of
    ``article_1`` is used as the id and column 2 as the content.
    """
    select_sql = "select * from article_1"
    insert_article_1_sentence_sql = '''insert into article_1_sentence (article_1_id, is_single, content) value (%s, %s, %s)'''
    insert_article_1_clause_sql = '''insert into article_1_clause (article_1_id, article_1_sentence_id, clause_content) value (%s, %s, %s)'''
    select_article1_sentence_id = '''SELECT id from article_1_sentence where id = (SELECT max(id) FROM article_1_sentence);'''

    cursor = conn.cursor()
    cursor.execute(select_sql)
    for article in cursor.fetchall():
        article_1_id = article[0]
        article_1_content = article[2]

        if ':' not in article_1_content:
            try:
                cursor.execute(insert_article_1_sentence_sql,
                               (article_1_id, 1, article_1_content))
                conn.commit()
                print(article_1_content +
                      '=========================================SUCCESS')
            except Exception as e:
                conn.rollback()
                print('\033[1;32;41m' + str(article_1_id) + '--' + str(e) +
                      ': FAILED---------' + '\033[0m')
            continue

        # partition keeps everything after the first ':'; split(':')[1]
        # silently dropped the text following any later ':'.
        head, _, tail = str(article_1_content).partition(':')
        article_1_sentence_content = head.replace(" ", "")
        try:
            cursor.execute(
                insert_article_1_sentence_sql,
                (article_1_id, 0, article_1_sentence_content))
            conn.commit()
        except Exception as e:
            conn.rollback()
            print('\033[1;32;41m' + str(article_1_id) +
                  article_1_sentence_content + str(e) +
                  ': FAILED---------' + '\033[0m')
            # Without the sentence row, max(id) would point at an unrelated
            # sentence, so the clauses are not stored.
            continue

        cursor.execute(select_article1_sentence_id)
        sentence_id = cursor.fetchone()[0]
        for article_1_clause in tail.split("\n"):
            if not article_1_clause:
                continue
            try:
                cursor.execute(
                    insert_article_1_clause_sql,
                    (article_1_id, sentence_id,
                     article_1_clause.replace(" ", "")))
                conn.commit()
            except Exception as e:
                conn.rollback()
                print('\033[1;32;41m' + str(article_1_id) +
                      article_1_clause + str(e) + ': FAILED---------' +
                      '\033[0m')
        print(article[2] +
              '============================================SUCCESS')
def law_province_code_update():
    """Derive ``law.province_code`` from ``law.location_code``.

    The province code is the first two characters of the location code
    followed by '0000', or '000000' when the location code is NULL. Rows are
    updated and committed one by one; failures are rolled back and printed.
    """
    select_sql = "select id, name, location, location_code from law"
    update_sql = "update law set province_code = %s where id = %s"
    cursor = conn.cursor()
    cursor.execute(select_sql)
    for law in cursor.fetchall():
        law_id = law[0]
        law_name = law[1]
        law_location_code = law[3]
        if law_location_code is not None:
            province_code = str(law_location_code)[0:2] + '0000'
        else:
            province_code = '000000'
        try:
            cursor.execute(update_sql, (province_code, law_id))
            conn.commit()
            print(law_name + '-------------------SUCCESS')
        except Exception as e:
            conn.rollback()
            # str(e): concatenating the exception object raises TypeError.
            print('\033[1;32;41m' + law_name + ': FAILED---------' + str(e) +
                  '\033[0m')
def complex_main_sentence_analysis():
    """Analyse the main clause of every non-single sentence and store the result.

    For each row of ``sentences`` with ``is_single = 0``, the text before the
    first ':' (plus a trailing ':') is segmented, POS-tagged,
    dependency-parsed and role-labelled with the module-level ``segmentor``,
    ``postagger``, ``parser`` and ``labeller``. The function then stores:

    * every role argument in ``role_label``;
    * every token tagged 'v' in ``verb``, flagged ``is_core`` when it is the
      predicate of a role;
    * ``is_head = 1`` on the ``verb`` row of the token whose arc relation is
      'HED';
    * every other arc that touches a core verb in ``words``, with the
      non-verb end recorded as 'head' or 'tail'.

    Each statement is committed separately; failures are rolled back and
    printed. The total run time is printed at the end.
    """
    start_time = time.time()
    select_sql = '''select * from sentences where is_single = 0'''
    insert_role_label_sql = '''insert into role_label (sentence_id, arg_name, arg_start, arg_end, core_verb_index) value (%s, %s, %s, %s, %s)'''
    insert_verb_sql = '''insert into verb (sentence_id, part_of_speech, loc_index, is_core) value (%s, %s, %s, %s)'''
    update_verb_sql = '''update verb set is_head = 1 where sentence_id = %s and loc_index = %s'''
    insert_words_sql = '''insert into words (sentence_id, part_of_speech, core_verb_index, relation, word, head_or_tail) value (%s, %s, %s, %s, %s, %s)'''

    cursor = conn.cursor()
    cursor.execute(select_sql)
    complex_sentences = cursor.fetchall()
    for sentence in complex_sentences:
        sentence_id = sentence[0]
        main_sentence = str(sentence[3]).strip().split(':')[0] + ':'
        origin_words = list(segmentor.segment(main_sentence))  # segmentation
        origin_postags = list(postagger.postag(origin_words))  # POS tagging
        arcs = parser.parse(origin_words, origin_postags)  # dependency parse
        roles = labeller.label(origin_words, origin_postags,
                               arcs)  # semantic role labelling
        print('语义角色标注--------', str(len(roles)))

        # Store the role arguments and remember the predicate indices.
        core_verb_list = []
        for role in roles:
            core_verb_list.append(role.index)
            for arg in role.arguments:
                arg_name = arg.name
                arg_start = arg.range.start
                arg_end = arg.range.end
                try:
                    cursor.execute(insert_role_label_sql,
                                   (sentence_id, arg_name, arg_start,
                                    arg_end, role.index))
                    conn.commit()
                    print(str(sentence_id), main_sentence, '-----------',
                          origin_words[role.index], arg_name,
                          '-------------SUCCESS')
                except Exception as e:
                    conn.rollback()
                    # str(e): concatenating the exception raises TypeError.
                    print('\033[1;32;41m' + str(sentence_id) +
                          main_sentence + str(e) +
                          ': ---------FAILED---------' + '\033[0m')

        # Store every verb token, marking the ones that are role predicates.
        print('提取动词信息-------------------------------------------')
        for index, postag in enumerate(origin_postags):
            if postag != 'v':
                continue
            is_core = 1 if index in core_verb_list else 0
            try:
                cursor.execute(insert_verb_sql,
                               (sentence_id, 'v', index, is_core))
                conn.commit()
                print(str(index), '--', origin_words[index],
                      '-----------------SUCCESS')
            except Exception as e:
                conn.rollback()
                print('\033[1;32;41m' + str(index) + origin_words[index] +
                      str(e) + ': ---------FAILED---------' + '\033[0m')

        # Store the relations between core verbs and the other tokens.
        arc_head = [a.head for a in arcs]
        arc_relation = [a.relation for a in arcs]
        # arc.head indexes into these ROOT-prefixed lists; subtracting 1
        # gives the position in origin_words.
        tree_node_list = ['ROOT'] + origin_words
        postags = ['NONE'] + origin_postags
        for i in range(len(arc_head)):
            j = arc_head[i]
            head_index = j - 1
            tail_index = i
            relation = arc_relation[i]
            if relation == 'HED':
                print('更新根动词情况:')
                try:
                    cursor.execute(update_verb_sql, (sentence_id, i))
                    conn.commit()
                    print('根动词-----index: ', str(i), '----',
                          origin_words[i], '-----UPDATE SUCCESS')
                except Exception as e:
                    conn.rollback()
                    print('\033[1;32;41m' + '根动词---' + str(i) +
                          origin_words[i] + str(e) +
                          ': ---------FAILED---------' + '\033[0m')
                continue
            if head_index in core_verb_list:
                # The head is a core verb: record the dependent token.
                part_of_speech = postags[i + 1]
                core_verb_index = head_index
                word = origin_words[i]
                loc = 'tail'
            elif tail_index in core_verb_list:
                # The dependent is a core verb: record its head token.
                part_of_speech = postags[j]
                core_verb_index = tail_index
                word = tree_node_list[j]
                loc = 'head'
            else:
                continue
            try:
                cursor.execute(insert_words_sql,
                               (sentence_id, part_of_speech,
                                core_verb_index, relation, word, loc))
                conn.commit()
                print(tree_node_list[j], postags[j], '----', origin_words[i],
                      postags[i + 1], relation, '-----SUCCESS')
            except Exception as e:
                conn.rollback()
                print('\033[1;32;41m', tree_node_list[j], postags[j], '----',
                      origin_words[i], postags[i + 1], relation,
                      '-----FAILED', e, '\033[0m')
        print(
            '\n',
            '===============================================================',
            '\n')
    end_time = time.time()
    print('处理', str(len(complex_sentences)), '条数据的总耗时为:',
          str(end_time - start_time), 's')
def chapter_article_process():
    """Split ``law_content_parse`` rows into chapter/article tables (method one).

    The original author marked this variant as flawed: all laws are
    processed as one sequence, so the end of one law and the start of the
    next are not detected (see the TODO below).

    Rows are walked in the order returned. A row whose key (column 2)
    matches "第...章" is inserted into ``chapter``; the following rows whose
    key matches "第...条" are inserted into ``article_1`` under the highest
    ``chapter`` id, until a row that is not an article is met. Any other row
    is inserted into ``article_2`` with the law id from column 1. The
    function returns at the first failed insert, after rolling back and
    printing it.
    """
    select_sql = "select * from law_content_parse"
    insert_chapter_sql = "insert into chapter (chapter_key, chapter_name, law_id) value (%s, %s, %s)"
    insert_article1_sql = "insert into article_1 (a_key, a_content, chapter_id) value (%s, %s, %s)"
    insert_article2_sql = "insert into article_2 (a_key, a_content, law_id) value (%s, %s, %s)"
    select_chapter_sql = 'SELECT id from chapter where id = (SELECT max(id) FROM chapter);'
    # Compiled once; they do not change between rows.
    pattern_chapter = re.compile("第(.*?)章")
    pattern_article = re.compile("第(.*?)条")

    cursor = conn.cursor()
    cursor.execute(select_sql)
    contents = cursor.fetchall()
    index = 0
    while index < len(contents):
        row = contents[index]
        if not pattern_chapter.match(row[2]):
            try:
                cursor.execute(insert_article2_sql, (row[2], row[3], row[1]))
                conn.commit()
                print(row[5] + '----' + row[2] + '----------------SUCCESS')
            except Exception as e:
                conn.rollback()
                # str(e): concatenating the exception raised TypeError
                # before the intended `return` was reached.
                print('\033[1;32;41m' + row[5] + str(e) +
                      ': ARTICLE FAILED---------' + '\033[0m')
                return
            index = index + 1
            continue

        # Chapter heading: store it, then attach the articles that follow.
        try:
            cursor.execute(insert_chapter_sql, (row[2], row[3], row[1]))
            conn.commit()
            print(row[5] + '----' + row[2] + '----------------SUCCESS')
        except Exception as e:
            conn.rollback()
            print('\033[1;32;41m' + row[5] + str(e) +
                  ': CHAPTER FAILED---------' + '\033[0m')
            return
        index = index + 1
        if index < len(contents):
            # No chapter is inserted inside the loop below, so one lookup of
            # the newest chapter id is enough.
            cursor.execute(select_chapter_sql)
            # TODO: detect here whether the next law has already been reached
            chapter_id = cursor.fetchone()[0]
        while index < len(contents):
            row = contents[index]
            if not pattern_article.match(row[2]):
                print('-----------------------------' + row[5] +
                      '----------------------------------')
                break
            try:
                cursor.execute(insert_article1_sql,
                               (row[2], row[3], chapter_id))
                conn.commit()
                print(row[5] + '----' + row[2] + '----------------SUCCESS')
            except Exception as e:
                conn.rollback()
                print('\033[1;32;41m' + row[5] + str(e) +
                      ': ARTICLE FAILED---------' + '\033[0m')
                return
            index = index + 1
def chapter_article_process_2():
    """Split ``law_content_parse`` rows into chapter/article tables (method two).

    The distinct ``law_id`` values are collected first and each law is
    processed on its own. Within a law, a row whose key (column 2) matches
    "第...章" is inserted into ``chapter``; the following rows whose key
    matches "第...条" are inserted into ``article_1`` under the highest
    ``chapter`` id, until a row that is not an article is met. Any other row
    is inserted into ``article_2`` with the law id. The function returns at
    the first failed insert, after rolling back and printing it.
    """
    cursor = conn.cursor()
    pattern_chapter = re.compile("第(.*?)章")
    pattern_article = re.compile("第(.*?)条")
    select_law_id_sql = "select law_id from law_content_parse group by law_id"
    select_law_content_sql = "select * from law_content_parse where law_id = %s"
    insert_chapter_sql = "insert into chapter (chapter_key, chapter_name, law_id) value (%s, %s, %s)"
    insert_article1_sql = "insert into article_1 (a_key, a_content, chapter_id) value (%s, %s, %s)"
    insert_article2_sql = "insert into article_2 (a_key, a_content, law_id) value (%s, %s, %s)"
    select_chapter_sql = 'SELECT id from chapter where id = (SELECT max(id) FROM chapter);'

    # Collect the law ids first, then handle one law at a time.
    cursor.execute(select_law_id_sql)
    law_id_list = [law[0] for law in cursor.fetchall()]

    for law_id in law_id_list:
        cursor.execute(select_law_content_sql, (law_id, ))
        contents = cursor.fetchall()
        index = 0
        while index < len(contents):
            row = contents[index]
            if not pattern_chapter.match(row[2]):
                try:
                    cursor.execute(insert_article2_sql,
                                   (row[2], row[3], law_id))
                    conn.commit()
                    print(row[5] + '----' + row[2] +
                          '----------------SUCCESS')
                except Exception as e:
                    conn.rollback()
                    # str(e): concatenating the exception raised TypeError
                    # before the intended `return` was reached.
                    print('\033[1;32;41m' + row[5] + str(e) +
                          ': ARTICLE FAILED---------' + '\033[0m')
                    return
                index = index + 1
                continue

            # Chapter heading: store it, then attach the articles that follow.
            try:
                cursor.execute(insert_chapter_sql, (row[2], row[3], law_id))
                conn.commit()
                print(row[5] + '----' + row[2] + '----------------SUCCESS')
            except Exception as e:
                conn.rollback()
                print('\033[1;32;41m' + row[5] + str(e) +
                      ': CHAPTER FAILED---------' + '\033[0m')
                return
            index = index + 1
            if index < len(contents):
                # No chapter is inserted inside the loop below, so one
                # lookup of the newest chapter id is enough.
                cursor.execute(select_chapter_sql)
                chapter_id = cursor.fetchone()[0]
            while index < len(contents):
                row = contents[index]
                if not pattern_article.match(row[2]):
                    print('-----------------------------' + row[5] +
                          '----------------------------------')
                    break
                try:
                    cursor.execute(insert_article1_sql,
                                   (row[2], row[3], chapter_id))
                    conn.commit()
                    print(row[5] + '----' + row[2] +
                          '----------------SUCCESS')
                except Exception as e:
                    conn.rollback()
                    print('\033[1;32;41m' + row[5] + str(e) +
                          ': ARTICLE FAILED---------' + '\033[0m')
                    return
                index = index + 1