def law_item_parse(item):
    """Parse one law article given as ``<p>...</p>`` markup into a template dict.

    The text is split on ``</p>`` after removing ``<p>`` and U+3000
    (ideographic space). The first fragment is treated as the lead sentence
    and every following fragment as an enumerated item. The lead sentence is
    sent to ``ltp_tool(..., 'srl')``; the returned ``'seg'`` and ``'role'``
    entries are used to pick a ``st.SentenceTemplate``, whose
    ``parse_items(items)`` result fills the returned dictionary.

    Args:
        item: Article text (a string containing ``<p>``/``</p>`` markup).

    Returns:
        dict: Empty when ``first_item_filter`` yields nothing or when no
        template could be built; otherwise it holds the keys
        ``'condition'``, ``'subject'``, ``'behavior'`` and ``'result'``.
    """
    templates = dict()
    # Bound up front: the template is only assigned inside the role loop, so
    # an empty role list used to raise UnboundLocalError at ``if template``.
    template = None

    # str.split(sep) always returns at least one element, so there is always
    # a lead fragment and no emptiness check is needed.
    lines = item.strip().replace('<p>', '').replace('\u3000', '').split('</p>')
    first_item = item_title_filter(lines[0])
    items = [number_zh_filter(it) for it in lines[1:]]

    ltp_result_dict = ltp_tool(first_item, 'srl')
    first_segs = first_item_filter(first_item)
    if not first_segs:
        return templates

    # Same guard as the later revision of this parser in this file: a falsy
    # SRL result yields no template instead of a TypeError on subscripting.
    if ltp_result_dict:
        seg = ltp_result_dict['seg']
        # ``id`` of the first token found in key1 (expected to be the keyword
        # list). It stays 0 when none is found.
        key_id = 0
        if seg:
            for token in seg:
                if token['word'] in key1:
                    key_id = token['id']
                    break

        roles = ltp_result_dict['role']
        if roles:
            # Does not depend on the role, so compute it once.
            result = remove_special_character(first_segs[1])
            # NOTE(review): the source layout was flattened; the final
            # ``else`` is read as the last arm of this if/elif chain -
            # confirm against the original file.
            for role in reversed(roles):
                role_type = role['type']
                beg = role['beg']
                end = role['end']
                if role_type == 'A0' and end < key_id:
                    # An A0 role ending before the keyword supplies the
                    # subject; the rest of the first segment becomes the
                    # condition.
                    sub = ''.join([n['word'] for n in seg[beg:end + 1]])
                    condition = remove_special_character(
                        first_segs[0].replace(sub, ''))
                    template = st.SentenceTemplate(subject=[sub],
                                                   condition=condition,
                                                   result=result,
                                                   flag=0)
                    break
                elif role_type == 'A1' and end < key_id:
                    segs_ = remove_special_character(first_segs[0])
                    template = st.SentenceTemplate(subject=[],
                                                   condition=segs_,
                                                   result=result,
                                                   flag=1)
                else:
                    # NOTE(review): the fallback passes the raw first_segs[1]
                    # rather than the cleaned ``result`` - kept as in the
                    # original.
                    template = st.SentenceTemplate(subject=[],
                                                   condition='',
                                                   result=first_segs[1],
                                                   flag=1)

    if template:
        condition, subject, behavior, result = template.parse_items(items)
        templates['condition'] = condition
        templates['subject'] = subject
        templates['behavior'] = behavior
        templates['result'] = result
    return templates
def law_item_parse_j(lines):
    """Parse one law article into condition / subject / behavior / result.

    The text is split on ``</p>`` after removing ``<p>`` and U+3000
    (ideographic space). The first fragment is the lead sentence; the
    following fragments are collected as enumerated items for as long as
    ``has_key_one`` accepts them. The lead sentence is sent to
    ``ltp_tool(..., 'srl')`` and its ``'seg'`` / ``'role'`` entries are used
    to build a ``st.SentenceTemplate``.

    NOTE(review): another ``def law_item_parse_j`` appears later in this
    file; if both are at module level, the later one rebinds the name and
    this version is unreachable - confirm which one is intended.

    Args:
        lines: Article text (a string containing ``<p>``/``</p>`` markup).

    Returns:
        dict: Empty when ``first_item_filter`` yields nothing or when no
        template could be built; otherwise it holds ``'condition'``,
        ``'subject'`` and ``'result'`` taken from the template, and
        ``'behavior'`` as the list of cleaned items.

    Side effects:
        Rebinds the module-level name ``template``.
    """
    global template
    # Reset on every call. Without this, a call whose SRL output selects no
    # template silently reused the template left by a previous call (or
    # raised NameError if the global had never been assigned).
    template = None
    templates = dict()

    # Split on </p>. str.split(sep) always returns at least one element, so
    # there is always a lead fragment.
    lines = lines.strip().replace('<p>', '').replace('\u3000',
                                                     '').split('</p>')
    # first_item is the explanatory text in front of the enumerated items;
    # item_title_filter is expected to strip the article heading from it.
    first_item = item_title_filter(lines[0])

    # Keep the leading run of fragments that has_key_one accepts and stop at
    # the first one it rejects.
    items = []
    for word in lines[1:]:
        if not has_key_one(word):
            break
        items.append(word)
    # Strip special characters and the numbering marker from each item.
    items = [
        number_zh_filter(remove_special_character(it)) for it in items
    ]

    # Semantic role labelling of the lead sentence. The result is read
    # through its 'seg' and 'role' entries; each role carries 'type', 'beg'
    # and 'end'.
    ltp_result_dict = ltp_tool(first_item, 'srl')
    # Splits the lead sentence into the part before and after the keyword.
    first_segs = first_item_filter(first_item)
    if not first_segs:
        return templates

    # Same guard as the later revision of this parser in this file: a falsy
    # SRL result yields no template instead of a TypeError on subscripting.
    if ltp_result_dict:
        seg = ltp_result_dict['seg']
        # ``id`` of the first token found in key1; stays 0 when none matches.
        key_id = 0
        if seg:
            for token in seg:
                if token['word'] in key1:
                    key_id = token['id']
                    break

        roles = ltp_result_dict['role']
        # What precedes key_id is the subject side; what follows is the
        # result.
        if roles:
            # Does not depend on the role, so compute it once.
            result = remove_special_character(first_segs[1])
            # Walk the roles in reverse; the loop stops at the first A0 that
            # ends before the keyword.
            # NOTE(review): the source layout was flattened; the final
            # ``else`` is read as the last arm of this if/elif chain -
            # confirm against the original file.
            for role in reversed(roles):
                role_type = role['type']
                beg = role['beg']
                end = role['end']
                if role_type == 'A0' and end < key_id:
                    sub = ''.join([n['word'] for n in seg[beg:end + 1]])
                    condition = remove_special_character(
                        first_segs[0].replace(sub, ''))
                    template = st.SentenceTemplate(subject=sub,
                                                   condition=condition,
                                                   result=result,
                                                   flag=0)
                    break
                elif role_type == 'A1' and end < key_id:
                    segs_ = remove_special_character(first_segs[0])
                    template = st.SentenceTemplate(subject='',
                                                   condition=segs_,
                                                   result=result,
                                                   flag=1)
                else:
                    template = st.SentenceTemplate(subject='',
                                                   condition='',
                                                   result=result,
                                                   flag=1)

    if template:
        # The items are used verbatim as behaviors; template.parse_items is
        # deliberately not called in this variant.
        templates['condition'] = template.condition
        templates['subject'] = template.subject
        templates['behavior'] = list(items)
        templates['result'] = template.result
    return templates
def law_item_parse_j(lines):
    """Parse one law article into condition / subject / key / behavior / result.

    The text is split on ``</p>`` after removing ``<p>`` and U+3000. The
    first fragment is the lead sentence; the following fragments are kept as
    enumerated items while any of ``has_key_one``, ``has_key_one_v2`` or
    ``has_key_one_v4`` accepts them. The lead sentence is passed to
    ``ltp_tool(..., 'srl')`` and a ``st.SentenceTemplate`` is chosen from its
    ``'seg'`` / ``'role'`` entries. Each item is then turned into a behavior
    and, when ``get_result_from_beh`` returns something, a per-item result.

    NOTE(review): the source layout of this function was flattened; the
    nesting below was reconstructed from syntax and should be confirmed
    against the original file. This definition reuses the name of an earlier
    ``law_item_parse_j`` in this file.

    Args:
        lines: Article text (a string containing ``<p>``/``</p>`` markup).

    Returns:
        dict: Empty when ``first_item_filter`` yields nothing or when
        ``template`` is falsy. Otherwise it holds ``'condition'``,
        ``'subject'``, ``'key'``, ``'behavior'`` and ``'result'``. When
        per-item results were collected, ``'condition'`` is
        ``template.condition + template.result`` and ``'result'`` is the list
        of per-item results; otherwise ``'result'`` is ``template.result``
        with the subject text removed.

    Side effects:
        Rebinds the module-level name ``template`` (it is not reset here, so
        a value from an earlier call can still be visible), may set
        ``template.condition`` to ``''``, and calls
        ``write_to_file_append(lines, 'model_1_error.txt')`` whenever one of
        the guarded sections raises.
    """
    global template
    templates = dict()
    # Split on </p>.
    result_list = []
    key_list = []
    last_behavior = ''  # reused when an item yields no behavior of its own (it refers to the same behavior)
    key_item = ''
    last_beh = ''  # e.g. for a lead sentence ending in "... and shall *do* the following", the verb is taken out and prepended to every behavior
    lines = lines.strip().replace('<p>', '').replace('\u3000',
                                                     '').split('</p>')
    # Non-empty input.
    if lines:
        # first_item is the explanatory text that precedes items (1) to (10).
        first_item = lines[0]
        # Strip every "Article N" heading.
        first_item = item_title_filter(first_item)
        items = []
        items_ap = []
        if len(lines) > 1:
            # Line 0 is usually "in any of the following circumstances ...".
            for i, word in enumerate(lines[1:]):
                if has_key_one(word) or has_key_one_v2(word) or has_key_one_v4(
                        word):  # has a parenthesised Chinese numeral, a bare Chinese numeral, or an Arabic numeral with an enumeration comma
                    items.append(word)
                else:
                    # NOTE(review): items_ap is assigned but never read in this function.
                    items_ap = lines[(i + 1):]
                    break
        # Strip the numbering markers.
        items = [
            number_zh_filter_plus(
                number_zh_filter(remove_special_character(it)))
            for it in items
        ]
        # Run semantic role labelling on the lead sentence.
        # The returned dict has two keys, 'role' and 'seg'; each role covers one or more words, delimited by beg and end.
        # 'type' is the role type; 'id' identifies the entity that this role serves within 'role'.
        ltp_result_dict = ltp_tool(first_item, 'srl')
        # ltp_jufa_dict = ltp_parse(first_item, 'parse')
        # Used to separate the subject side from the object side around "in any of the following circumstances".
        first_segs = first_item_filter(first_item)
        if not first_segs:
            return templates
        if ltp_result_dict:
            seg = ltp_result_dict['seg']
            key_id = 0
            if seg:
                for n in seg:
                    if n['word'] in key2 or n['word'] in key:
                        key_id = n['id']
                        key_item = n['word']
                        if key_item in key:  # keep scanning until a keyword from key2 is found
                            key_item = ''
                            continue
                        if key_item in key2:
                            break
            roles = ltp_result_dict['role']
            # What precedes key_id is the subject; everything after it is the result.
            if roles:
                # Iterate the roles in reverse.
                for role in roles[::-1]:
                    role_type = role['type']
                    beg = role['beg']
                    end = role['end']
                    result = remove_special_character(first_segs[1]).replace(
                        key_item, '')  # remove the key from result
                    # In effect this loops until an A0 is found; otherwise it keeps iterating.
                    # Handle the special case first.
                    if check_sub_v2(first_segs[0]):
                        try:
                            sub = ''
                            condition = remove_special_character(
                                (first_segs[0].replace(key_item, '')))
                            if second_item_filter(
                                    first_segs[0]
                            ):  # e.g. "Without approval of the autonomous county's transport authority, the following activities are prohibited on township roads and township road land:"
                                second_result = second_item_filter(
                                    first_segs[0])
                                condition = remove_special_character(
                                    second_result[0])
                                last_beh = second_result[1]
                            template = st.SentenceTemplate(subject=sub,
                                                           condition=condition,
                                                           result=result,
                                                           flag=0)
                        except Exception:
                            write_to_file_append(lines, 'model_1_error.txt')
                    elif role_type == 'A0' and end < key_id:
                        sub = ''.join([n['word'] for n in seg[beg:end + 1]])
                        if sub in sub_list_2:
                            sub = ''
                        condition = remove_special_character(
                            first_segs[0].replace(sub, ''))
                        if second_item_filter(condition):
                            second_result = second_item_filter(condition)
                            condition = remove_special_character(
                                second_result[0])
                            last_beh = second_result[1]
                        template = st.SentenceTemplate(subject=sub,
                                                       condition=condition,
                                                       result=result,
                                                       flag=0)
                        break
                    elif role_type == 'A1' and end < key_id:
                        segs_ = remove_special_character(first_segs[0])
                        template = st.SentenceTemplate(subject='',
                                                       condition=segs_,
                                                       result=result,
                                                       flag=1)
                        continue
                    # NOTE(review): read as the last arm of the if/elif chain above - confirm.
                    else:
                        template = st.SentenceTemplate(subject='',
                                                       condition='',
                                                       result=result,
                                                       flag=1)
        # NOTE(review): placed at the same level as `if ltp_result_dict:` - confirm.
        if template:
            # condition, subject, behavior, result = template.parse_items(items)
            beh = []
            for i, tiao in enumerate(items):
                try:
                    pre_behavior = ''
                    if get_sentence_key(tiao):
                        get_result = get_sentence_key(tiao)
                        key_list.append(''.join(
                            s for s in get_result[0]))
                        # key_list[i] is the key just appended, provided no earlier iteration failed before its append.
                        behavior = last_beh + remove_last_de(
                            tiao.replace(key_list[i], ''))
                        if filter_key_one_behv(
                                behavior
                        ) and template.subject not in sub_list:  # filter out the case where the behavior actually contains a result
                            filter_result = get_result_from_beh(
                                behavior)
                            if filter_result:
                                if filter_result[0]:
                                    pre_behavior = remove_last_de(
                                        remove_special_character(
                                            filter_result[0]))
                                if not pre_behavior:
                                    pre_behavior = last_behavior
                                beh.append(pre_behavior)
                                if filter_result[2]:
                                    result_list.append(
                                        filter_result[1] + filter_result[2])
                                else:
                                    result_list.append(
                                        filter_result[1])
                                last_behavior = pre_behavior
                        # NOTE(review): this else is read as pairing with the filter_key_one_behv test - confirm.
                        else:
                            beh.append(behavior)
                    else:
                        key_item = '' if key_item == '由' else key_item
                        key_list.append(key_item)
                        behavior = last_beh + remove_last_de(tiao)
                        if filter_key_one_behv(behavior) and filter_key_one_behv_plus(behavior) and template.subject not in sub_list \
                                and template.condition not in condition_list_2:  # filter out the case where the behavior actually contains a result
                            filter_result = get_result_from_beh(
                                behavior)
                            if filter_result:
                                if filter_result[0]:
                                    pre_behavior = remove_last_de(
                                        remove_special_character(
                                            filter_result[0]))
                                if not pre_behavior:
                                    pre_behavior = last_behavior
                                beh.append(pre_behavior)
                                if filter_result[2]:
                                    result_list.append(
                                        filter_result[1] + filter_result[2])
                                else:
                                    result_list.append(
                                        filter_result[1])
                                last_behavior = pre_behavior
                        # NOTE(review): this else is read as pairing with the filter_key_one_behv test - confirm.
                        else:
                            beh.append(behavior)
                except Exception:
                    write_to_file_append(lines, 'model_1_error.txt')
            # A condition of at most one character, or one listed in condition_list, is blanked.
            if len(template.condition
                   ) <= 1 or template.condition in condition_list:
                template.condition = ''
            if result_list:
                templates['condition'], templates['subject'], templates['key'], templates['behavior'], \
                    templates['result'] = \
                    template.condition+template.result, template.subject, key_list, beh, result_list
            else:
                templates['condition'], templates['subject'], templates['key'], templates['behavior'], templates['result'] = \
                    template.condition, template.subject, key_list, beh, template.result.replace(template.subject, '')
    return templates