# A bare enumerated item: a ", " or "và " separator followed by a number
# (optionally suffixed with one letter) or a single letter, and then
# whitespace or punctuation.  Group 1 is the separator.
_BARE_ENUM_ITEM_RE = re.compile(
    r"((\,\s)|(và\s))((\d+[a-zđ]{0,1})|([a-zđ]{1}))(?=(\s|\,|\.|\;))", re.U)


def rewriteString(string):
    """Repeat the unit word in front of every bare enumerated item.

    For example "điều 1, 2 và 3" becomes "điều 1, điều 2 và điều 3": for
    each separator + bare item, the word standing before the previous item
    is copied in front of the bare one.  The expanded text is then passed
    through ``writeDetail``.

    Returns ``None`` when more than 11 rewrite rounds would be needed; this
    keeps the original guard against runaway rewriting, so callers must
    handle ``None``.
    """
    rounds = 0
    found = _BARE_ENUM_ITEM_RE.search(string)
    while found is not None:
        if rounds > 10:
            return None

        sep_start = found.start()
        # End of the separator taken from the match itself.  The previous
        # hard-coded "+ 2" / "+ 4" offsets were UTF-8 byte lengths and are
        # wrong for unicode strings ("và " is 3 characters but 4 bytes).
        insert_at = found.end(1)

        # Step back onto the last character of the token before the
        # separator (a space sits between that token and "và").
        if string[sep_start] == ',':
            probe = sep_start - 1
        else:
            probe = sep_start - 2
        # A separator at the very start has no preceding token; clamp so
        # the slice below cannot wrap around with a negative index.
        probe = max(probe, 0)

        # Walk left: the first space ends the unit word, the second space
        # (or the start of the string) begins it.
        word_end = probe
        word_start = 0
        seen_space = False
        while probe > 0:
            if string[probe] == ' ':
                if not seen_space:
                    seen_space = True
                    word_end = probe
                else:
                    word_start = probe + 1
                    break
            probe -= 1

        string = (string[:insert_at] + string[word_start:word_end] + ' '
                  + string[insert_at:])
        rounds += 1
        found = _BARE_ENUM_ITEM_RE.search(string)
    return writeDetail(string)
# One suffix/point letter.  Written as an alternation rather than the
# class [a-zđ] so that "đ" is matched as a whole in both unicode strings
# and UTF-8 byte strings (where it is two bytes).
_REF_LETTER = r"(?:đ|[a-z])"

# "điểm x" that is not already followed by its "khoản".
_POINT_WITHOUT_CLAUSE_RE = re.compile(
    r"điểm\s" + _REF_LETTER + r"(?!\skhoản)", re.U)
_CLAUSE_REF_RE = re.compile(r"khoản\s\d+" + _REF_LETTER + r"?", re.U)

# "khoản N" that is not already followed by its "điều".  The number is
# matched through a lookahead + backreference so the regex engine cannot
# backtrack into it: otherwise "khoản 12 điều 5" would match as "khoản 1".
_CLAUSE_WITHOUT_ARTICLE_RE = re.compile(
    r"khoản\s(?=(\d+" + _REF_LETTER + r"?))\1(?!\sđiều)", re.U)
_ARTICLE_REF_RE = re.compile(r"điều\s\d+" + _REF_LETTER + r"?", re.U)


def _attach_following_parent(text, child_re, parent_re):
    """Append to each child reference the first parent reference after it.

    Scanning stops at the first child that has no parent reference to its
    right (later children cannot have one either).
    """
    pieces = []
    copied = 0
    for child in child_re.finditer(text):
        parent = parent_re.search(text, child.end())
        if parent is None:
            break
        pieces.append(text[copied:child.end()])
        pieces.append(" ")
        pieces.append(parent.group())
        copied = child.end()
    pieces.append(text[copied:])
    return "".join(pieces)


def writeDetail(string):
    """Complete partial legal references with the parent unit that follows.

    First every "điểm x" lacking a "khoản" gets the next "khoản N" appended,
    then every "khoản N" lacking a "điều" gets the next "điều M" appended,
    e.g. "điểm a, điểm b khoản 1 điều 5" becomes
    "điểm a khoản 1 điều 5, điểm b khoản 1 điều 5".

    Text without such references is returned unchanged.
    """
    string = _attach_following_parent(
        string, _POINT_WITHOUT_CLAUSE_RE, _CLAUSE_REF_RE)
    string = _attach_following_parent(
        string, _CLAUSE_WITHOUT_ARTICLE_RE, _ARTICLE_REF_RE)
    return string
# Document number such as "12/2005/QH11" or "36-CP".
_DOC_SYMBOL = r'[0-9]+(/[0-9]+)*((/|-)[A-ZĐƯ]+[0-9]*)+'
# Character that must follow a document number where it is searched for.
_DOC_SYMBOL_END = r'(\s|\_|\#|\*|\.|\\)'


def extract(doc_id="text", header_text="text", title="text"):
    """Yield ``[doc_id, modified_document, date]`` for amending documents.

    Nothing is yielded unless the lower-cased title contains "sửa đổi" or
    "bổ sung".  Otherwise exactly one row is produced, trying in order:

    1. a document number inside ``title``; the date is looked up in the
       title text after it;
    2. the title followed by a document number inside ``header_text``; the
       date is looked up in the header text after that match;
    3. the part of the title after the first "của " (empty if absent),
       with the date looked up after the last occurrence of the title in
       the header, or ``None`` when the title does not occur there.

    Header offsets come from the upper-cased header and are applied to the
    original ``header_text``; this assumes upper-casing keeps the length.
    """
    if not re.search(r'(sửa đổi|bổ sung)',
                     handle_string.toLowerCase(title), re.U):
        return

    flags = re.U | re.I

    in_title = re.search(_DOC_SYMBOL + _DOC_SYMBOL_END, title, flags)
    if in_title is not None:
        symbol = re.search(_DOC_SYMBOL, in_title.group(), flags).group()
        yield [doc_id, symbol, findDate(title[in_title.end(0):])]
        return

    upper_header = handle_string.toUpperCase(header_text)
    in_header = re.search(
        re.escape(handle_string.toUpperCase(title.strip()))
        + r'\s(SỐ\s)*' + _DOC_SYMBOL + _DOC_SYMBOL_END,
        upper_header, flags)
    if in_header is not None:
        symbol = re.search(_DOC_SYMBOL, in_header.group(0), flags).group(0)
        # The match offset belongs to the header, so the date is searched
        # in the header.  It used to slice ``title`` with this offset,
        # which lies past the end of the title.
        yield [doc_id, symbol, findDate(header_text[in_header.end():])]
        return

    modified = ""
    of_marker = re.search(r'của\s', title, flags)
    if of_marker is not None:
        modified = title[of_marker.end():]

    occurrences = list(re.finditer(
        re.escape(handle_string.toUpperCase(title)), upper_header, flags))
    if occurrences:
        # The last occurrence of the title in the header is the one used.
        yield [doc_id, modified,
               findDate(header_text[occurrences[-1].end():])]
    else:
        yield [doc_id, modified, None]
# A colon, optional whitespace / markdown markers, then an opening quote:
# the point where the quoted body of a provision starts.
_QUOTED_BODY_START_RE = re.compile(r"\:(\s|\n|\*|\_|\#)*(\“|\")", re.DOTALL)


def getTitle(string):
    """Return the text before the first colon that introduces a quotation.

    The whole string is returned when no such colon + opening quote exists.
    """
    opening = _QUOTED_BODY_START_RE.search(string)
    if opening is None:
        return string
    return string[:opening.start()]
def divTitle(string):
    """Split a title that mixes an amendment with an addition.

    When the title contains "bổ sung ... vào" and a ";" that follows
    "bổ sung" or "sửa đổi", it is cut after that ";" (the ";" stays with the
    first piece) and both pieces are returned.  In every other case the
    result is a one-element list holding the title unchanged, so the result
    is never empty.
    """
    if ";" in string and re.search(r"bổ\ssung\s.+vào", string, re.U):
        cut = re.search(r"(bổ\ssung|sửa\sđổi).+\;", string, re.U)
        # A ";" that only appears before the keywords gives no cut point;
        # fall through and keep the title instead of returning [].
        if cut is not None:
            return [string[:cut.end()], string[cut.end():]]
    return [string]
# Labels of structural units as they appear in a (lower-cased) title.
_PART_LABEL_RE = re.compile(
    r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+")
_CHAPTER_LABEL_RE = re.compile(r"(chương)\s([A-Z]|[0-9])+")
_SECTION_LABEL_RE = re.compile(r"(mục)\s([A-Z]|[0-9])+")
_ARTICLE_LABEL_RE = re.compile(r"điều [0-9]+\w*")
_ADDED_ARTICLE_LABEL_RE = re.compile(r"điều [0-9]+[A-zĐđ]*")
# Group 1 is the bare number / letter, i.e. the label without its
# "khoản " / "điểm " prefix.  A capture group replaces the former fixed
# "+ 8" skip, which was the prefix length in UTF-8 bytes only.
_CLAUSE_NUMBER_RE = re.compile(r"khoản\s([0-9]+\w*)")
_ADDED_CLAUSE_NUMBER_RE = re.compile(r"khoản\s([0-9]+)")
_ADDED_POINT_LETTER_RE = re.compile(r"điểm\s((?:đ|[A-Za-z])+)", re.U)

# Title wording that marks an addition rather than a replacement.
_ADDITION_TITLE_RE = re.compile(r"(.+vào.+)|(.+(sau|trước).{7,})", re.U)
_ADDITION_INTO_RE = re.compile(r"bổ\ssung\s.+(vào).{5}", re.U)
_ADDITION_BESIDE_RE = re.compile(r"bổ\ssung\s.+(sau|trước).{5}", re.U)

# A paragraph (after an escaped "\n") that opens with a quotation mark.
_QUOTED_PARAGRAPH_RE = re.compile(
    r"(\\n(\s|\_|\.|\*|\#)*\“(.(?!\“|\”))+.{2})|(\\n(\s|\_|\.|\*|\#)*\"(.(?!\"))+.{2})",
    re.DOTALL)


def _matched_label(pattern, text):
    """Return the text of the first match of ``pattern``, or None."""
    found = pattern.search(text)
    if found is None:
        return None
    return found.group(0)


def _captured_label(pattern, text):
    """Return group 1 of the first match of ``pattern``, or None."""
    found = pattern.search(text)
    if found is None:
        return None
    return found.group(1)


def _at_least_one(total):
    """Treat an empty level as one unnamed unit so deeper levels are visited."""
    if total == 0:
        return 1
    return total


def _enclosing_labels(text):
    """Return [part, chapter, section] labels found in ``text`` (None if absent)."""
    return [_matched_label(_PART_LABEL_RE, text),
            _matched_label(_CHAPTER_LABEL_RE, text),
            _matched_label(_SECTION_LABEL_RE, text)]


def _replaced_units(title, tree):
    """Yield [part, chapter, section, article, clause, point] names.

    Walks the parsed quoted text ``tree`` top-down.  The first named unit on
    each path whose name occurs in ``title`` is reported and its children
    are not visited; the names of its enclosing units are read from the
    title.  For clauses and points they are read from the title text that
    follows the clause / point reference.
    """
    for part_id in range(_at_least_one(divlaw.getTotalPart(tree))):
        part = divlaw.getPart(tree, part_id)
        if part['name'] != "":
            part_name = handle_string.toLowerCase(part['name'])
            if part_name in title:
                yield [part_name, None, None, None, None, None]
                continue

        total_chap = _at_least_one(divlaw.getTotalChapter(tree, part_id))
        for chap_id in range(total_chap):
            chap = divlaw.getChapter(tree, part_id, chap_id)
            if chap['name'] != "":
                chap_name = handle_string.toLowerCase(chap['name'])
                if chap_name in title:
                    yield [_matched_label(_PART_LABEL_RE, title),
                           chap_name, None, None, None, None]
                    continue

            total_sec = _at_least_one(
                divlaw.getTotalSection(tree, part_id, chap_id))
            for sec_id in range(total_sec):
                sec = divlaw.getSection(tree, part_id, chap_id, sec_id)
                if sec['name'] != "":
                    sec_name = handle_string.toLowerCase(sec['name'])
                    if sec_name in title:
                        yield [_matched_label(_PART_LABEL_RE, title),
                               _matched_label(_CHAPTER_LABEL_RE, title),
                               sec_name, None, None, None]
                        continue

                total_law = _at_least_one(
                    divlaw.getTotalLaw(tree, part_id, chap_id, sec_id))
                for law_pos in range(total_law):
                    law = divlaw.getLaw(
                        tree, part_id, chap_id, sec_id, law_pos)
                    if law['name'] != "":
                        law_name = handle_string.toLowerCase(law['name'])
                        if law_name in title:
                            yield (_enclosing_labels(title)
                                   + [law_name, None, None])
                            continue

                    total_item = _at_least_one(divlaw.getTotalItem(
                        tree, part_id, chap_id, sec_id, law_pos))
                    for item_id in range(total_item):
                        item = divlaw.getItem(
                            tree, part_id, chap_id, sec_id, law_pos, item_id)
                        if (item['name'] != ""
                                and 'khoản ' + item['name'] in title):
                            # The literal is known to be in the title, so
                            # this search cannot fail; the name is escaped
                            # because it is data, not a pattern.
                            ref = re.search(
                                r"khoản\s" + re.escape(item['name']),
                                title, re.U)
                            tail = title[ref.end():]
                            yield (_enclosing_labels(tail)
                                   + [_matched_label(_ARTICLE_LABEL_RE, tail),
                                      item['name'], None])
                            continue

                        total_point = _at_least_one(divlaw.getTotalPoint(
                            tree, part_id, chap_id, sec_id, law_pos, item_id))
                        for point_id in range(total_point):
                            point = divlaw.getPoint(
                                tree, part_id, chap_id, sec_id, law_pos,
                                item_id, point_id)
                            if (point['name'] != ""
                                    and 'điểm ' + point['name'] in title):
                                ref = re.search(
                                    r"điểm " + re.escape(point['name']),
                                    title, re.U)
                                tail = title[ref.end():]
                                yield (_enclosing_labels(tail)
                                       + [_matched_label(
                                              _ARTICLE_LABEL_RE, tail),
                                          _captured_label(
                                              _CLAUSE_NUMBER_RE, tail),
                                          point['name']])


def extract(law_id="text",
            part_index="int",
            chap_index="int",
            sec_index="int",
            law_index="int",
            item_index="int",
            point_index="int",
            numerical_symbol="text",
            titles="text",
            content="text",
            location_content="int",
            count="int"):
    """Yield the units of another law that an amending provision touches.

    Each row is a 14-element list: the seven input key columns (``law_id``,
    ``part_index`` ... ``point_index``) unchanged, then the part, chapter,
    section, article, clause and point names of the target, then
    ``type_modify``:

    * 1 - the title does not read as an addition; targets are the named
      units of the quoted text whose names occur in the title;
    * 2 - addition ("bổ sung ... vào"); the target is read from the title
      after the addition wording;
    * 3 / 4 - as 2, when "sau" / "trước" occurs before the target.

    ``numerical_symbol``, ``location_content`` and ``count`` are accepted
    but not used.

    Fixes relative to the earlier implementation:

    * the second "phần thứ" pattern had an unbalanced parenthesis and raised
      ``re.error`` whenever a title mentioned a part;
    * articles were iterated over the section count;
    * the article loop variable overwrote the ``law_index`` key column;
    * the chapter found for a section was stored in ``chap['name']``
      instead of being emitted;
    * the added point name was always an empty slice;
    * a ``None`` from ``rewriteString`` crashed; the lower-cased title is
      now used as a fallback.

    NOTE(review): the whole-content parse that was immediately overwritten
    by the per-quote parse is dropped; this presumes
    ``divlaw.divPartModifyLaw`` has no side effects - confirm.
    NOTE(review): the source had lost its indentation; the addition branch
    is read as a sibling of the replacement branch inside the per-quote
    loop, and each ``continue`` as belonging to the innermost ``if`` -
    confirm against the original layout.
    """
    lowered = handle_string.toLowerCase(titles)
    expanded = rewriteString(lowered)
    if expanded is None:
        # rewriteString gave up; keep working with the unexpanded title.
        expanded = lowered

    source_key = [law_id, part_index, chap_index, sec_index,
                  law_index, item_index, point_index]
    quote_starts = [found.start()
                    for found in _QUOTED_PARAGRAPH_RE.finditer(content)]

    for title in divTitle(expanded):
        type_modify = 2 if _ADDITION_TITLE_RE.search(title) else 1

        for j, begin in enumerate(quote_starts):
            if type_modify == 1:
                if j + 1 < len(quote_starts):
                    quoted = content[begin:quote_starts[j + 1]]
                else:
                    quoted = content[begin:]
                tree = divlaw.divPartModifyLaw(quoted)
                for names in _replaced_units(title, tree):
                    yield source_key + names + [type_modify]

            if type_modify == 2:
                # The target starts right after "vào" / "sau" / "trước";
                # the "sau|trước" position wins when both wordings match.
                start_index = 0
                into = _ADDITION_INTO_RE.search(title)
                if into is not None:
                    start_index = into.end() - 5
                beside = _ADDITION_BESIDE_RE.search(title)
                if beside is not None:
                    start_index = beside.end() - 5

                tail = title[start_index:]
                names = _enclosing_labels(tail) + [
                    _matched_label(_ADDED_ARTICLE_LABEL_RE, tail),
                    _captured_label(_ADDED_CLAUSE_NUMBER_RE, tail),
                    _captured_label(_ADDED_POINT_LETTER_RE, tail),
                ]

                # Once changed to 3 / 4 neither branch runs again for the
                # remaining quotes of this title.
                if 'sau' in title[:start_index]:
                    type_modify = 3
                elif 'trước' in title[:start_index]:
                    type_modify = 4
                yield source_key + names + [type_modify]
# One sentence: a run of characters none of which is followed by ". ",
# plus the two characters that close it.
_SENTENCE_RE = re.compile(r"(.(?!(\.\s)))+.{2}", re.I)


def extract(
        id="text",
        content="text",
        part_index="int",
        chap_index="int",
        sec_index="int",
        law_index="int",
        item_index="int",
        start_index="int",
        end_index="int",
):
    """Yield tokenized, POS-tagged sentences of ``content[start_index:end_index]``.

    Each row is ``[id, position, sentence_index, joined_tokens, tokens,
    pos_tags]``.  ``position`` is the 1-based
    "part_chapter_section_article_clause" key, with clause 0 when
    ``item_index`` is None.  ``sentence_index`` counts every segment
    examined, including segments that are skipped because one of their
    tokens is empty.

    The tokenizer and tagger now run once per segment instead of twice.

    NOTE(review): a non-empty line in which the sentence pattern finds no
    match yields nothing, exactly as before (the old fallback start offset
    was recorded but the segment count stayed 0) - confirm whether such
    lines should be emitted as one sentence.
    """
    sent_index = 0
    for line in content[start_index:end_index].split("\n"):
        if line == "":
            continue

        if item_index is None:
            clause_no = 0
        else:
            clause_no = item_index + 1
        position = "{}_{}_{}_{}_{}".format(
            part_index + 1, chap_index + 1, sec_index + 1, law_index + 1,
            clause_no)

        # Cut points: every sentence start, plus the sentence end whenever
        # more than 5 characters of the line remain after it.
        cuts = []
        for found in _SENTENCE_RE.finditer(line):
            cuts.append(found.start())
            if len(line) - found.end() > 5:
                cuts.append(found.end())

        for k, begin in enumerate(cuts):
            if k + 1 < len(cuts):
                segment = line[begin:cuts[k + 1]]
            else:
                segment = line[begin:]

            text = handle_string.to_unicode(segment).replace("\\", '')
            tagged = ViPosTagger.postagging(ViTokenizer.tokenize(text))
            pos_tag = tagged[1]
            tk = [token.encode('utf-8') for token in tagged[0]]

            sent_index += 1
            if '' in tk:
                continue
            yield [id, position, sent_index - 1, " ".join(tk), tk, pos_tag]
def extract( law_id ="text", type_modify = "int", content = "text", numerical_symbol = "text", position = "text", released_date = "text" ): titles = getTitle(content) if type_modify == 1: titles = handle_string.toLowerCase(titles) ### titles = rewriteString(titles) if titles is None: titles = "None" yield [ law_id , position, type_modify, "1", None, None, None, None, None, None, None, None, numerical_symbol, released_date ] a = divTitle(titles) for title in a: findType = re.finditer(r"(.+vào.+)|(.+(sau|trước)[^\:]{7,})",title,re.U) if divlaw.lenIterator(findType) > 0: type_modify = 8 match = re.finditer(r"(\n(\s|\_|\.|\*|\#)*\“(.(?!\“|\”))+.{2})|(\n(\s|\_|\.|\*|\#)*\"(.(?!\"))+.{2})", content,re.DOTALL) quotesIndex = [] for i in match: quotesIndex.append(i.start()) for j in range(len(quotesIndex)) : if type_modify == 1: divModify = divlaw.divPartModifyLaw(content) if j != (len(quotesIndex) - 1): divModify = divlaw.divPartModifyLaw(content[quotesIndex[j]:quotesIndex[j+1]]) else : divModify = divlaw.divPartModifyLaw(content[quotesIndex[j]:]) totalPart = divlaw.getTotalPart(divModify) if (totalPart == 0): totalPart = 1 for part_id in range(0,totalPart): part = divlaw.getPart(divModify,part_id) if part['name'] != "": part_name = handle_string.toLowerCase(part['name']) if part_name in title: yield[ law_id , position, type_modify, part_name, None, None, None, None, None, None, None, None, numerical_symbol, released_date ] continue totalChap = divlaw.getTotalChapter(divModify,part_id) if totalChap == 0: totalChap = 1 for chap_id in range(0,totalChap): chap = divlaw.getChapter(divModify,part_id,chap_id) if chap['name'] != "": chap_name = handle_string.toLowerCase(chap['name']) if chap_name in title: part_name = None findName = re.finditer(r"(phần 
thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title) for fN in findName: part_name = title[fN.span()[0]:fN.span()[1]] break yield[ law_id , position, type_modify, part_name, chap_name, None, None, None, None, None, None, None, numerical_symbol, released_date ] continue totalSec = divlaw.getTotalSection(divModify,part_id,chap_id) if totalSec == 0: totalSec = 1 for sec_id in range(0,totalSec): sec = divlaw.getSection(divModify, part_id, chap_id,sec_id) if sec['name'] != "": sec_name = handle_string.toLowerCase(sec['name']) if sec_name in title: part_name = None findName = re.finditer(r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title) for fN in findName: part_name = title[fN.span()[0]:fN.span()[1]] break chap_name = None findName = re.finditer(r"(chương)\s([A-Z]|[0-9])+",title) if 
divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(chương)\s([A-Z]|[0-9])+",title) for fN in findName: chap['name'] = title[fN.span()[0]:fN.span()[1]] break yield[ law_id , position, type_modify, part_name, chap_name, sec_name, None, None, None, None, None, None, numerical_symbol, released_date ] continue totalLaw = divlaw.getTotalLaw(divModify,part_id,chap_id,sec_id) if totalLaw == 0: totalLaw = 1 for law_index in range(0,totalLaw): law = divlaw.getLaw(divModify,part_id,chap_id,sec_id,law_index) if law['name'] != "": law_name = handle_string.toLowerCase(law['name']) if law_name in title: part_name = None findName = re.finditer(r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title) for fN in findName: part_name = title[fN.span()[0]:fN.span()[1]] break chap_name = None findName = re.finditer(r"(chương)\s([A-Z]|[0-9])+",title) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(chương)\s([A-Z]|[0-9])+",title) for fN in findName: chap_name = title[fN.span()[0]:fN.span()[1]] break sec_name = None findName = re.finditer(r"(mục)\s([A-Z]|[0-9])+",title) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(mục)\s([A-Z]|[0-9])+",title) for fN in findName: sec_name = title[fN.span()[0]:fN.span()[1]] break yield[ law_id , position, type_modify, part_name, chap_name, sec_name, law_name, None, None, None, None, None, numerical_symbol, released_date ] 
continue totalItem = divlaw.getTotalItem(divModify,part_id,chap_id,sec_id,law_index) if totalItem == 0: totalItem = 1 for item_id in range(0,totalItem): item = divlaw.getItem(divModify,part_id,chap_id,sec_id,law_index,item_id) if item['name'] != "": item_name = 'khoản ' + item['name'] if item_name in title: find_item_name = re.finditer(r"khoản\s"+item['name'],title,re.U) ex = getFirst(find_item_name) index_start = ex.end() part_name = None findName = re.finditer(r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title[index_start:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title[index_start:]) for fN in findName: part_name = title[index_start+fN.span()[0]:index_start+fN.span()[1]] break chap_name = None findName = re.finditer(r"(chương)\s([A-Z]|[0-9])+",title[index_start:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(chương)\s([A-Z]|[0-9])+",title[index_start:]) for fN in findName: chap_name = title[index_start+fN.span()[0]:index_start+fN.span()[1]] break sec_name = None findName = re.finditer(r"(mục)\s([A-Z]|[0-9])+",title[index_start:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(mục)\s([A-Z]|[0-9])+",title[index_start:]) for fN in findName: sec_name = title[index_start+fN.span()[0]:index_start+fN.span()[1]] break law_name = None findName = re.finditer(r"điều [0-9]+\w*",title[index_start:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"điều 
[0-9]+\w*",title[index_start:]) for fN in findName: law_name = title[index_start+fN.span()[0]:index_start+fN.span()[1]] break yield[ law_id , position, type_modify, part_name, chap_name, sec_name, law_name, item['name'], None, None, None, None, numerical_symbol, released_date ] continue totalPoint = divlaw.getTotalPoint(divModify,part_id,chap_id,sec_id,law_index,item_id) if totalPoint == 0: totalPoint = 1 for point_id in range(0,totalPoint): point = divlaw.getPoint(divModify,part_id,chap_id,sec_id,law_index,item_id,point_id) if point['name'] != "": point_name = 'điểm ' + point['name'] if point_name in title: find_point_name = re.finditer(r"điểm "+point['name'],title,re.U) index_start = getFirst(find_point_name).end() part_name = None findName = re.finditer(r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title[index_start:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title[index_start:]) for fN in findName: part_name = title[index_start+fN.span()[0]:index_start+fN.span()[1]] break chap_name = None findName = re.finditer(r"(chương)\s([A-Z]|[0-9])+",title[index_start:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(chương)\s([A-Z]|[0-9])+",title[index_start:]) for fN in findName: chap_name = title[index_start+fN.span()[0]:index_start+fN.span()[1]] break sec_name = None findName = re.finditer(r"(mục)\s([A-Z]|[0-9])+",title[index_start:]) if divlaw.lenIterator(findName)>0 : findName = 
re.finditer(r"(mục)\s([A-Z]|[0-9])+",title[index_start:]) for fN in findName: sec_name = title[index_start+fN.span()[0]:index_start+fN.span()[1]] break law_name = None findName = re.finditer(r"điều [0-9]+\w*",title[index_start:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"điều [0-9]+\w*",title[index_start:]) for fN in findName: law_name = title[index_start+fN.span()[0]:index_start+fN.span()[1]] break item_name = None findName = re.finditer(r"(?:khoản\s)[0-9]+\w*",title[index_start:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(?:khoản\s)[0-9]+\w*",title[index_start:]) for fN in findName: item_name = title[index_start+8+fN.span()[0]:index_start+fN.span()[1]] break yield[ law_id , position, type_modify, part_name, chap_name, sec_name, law_name, item_name, point['name'], None, None, None, numerical_symbol, released_date ] continue if type_modify == 8: start_index = 0 ft = re.finditer(r"bổ\ssung\s.+(vào).{5}",title,re.U) for i in ft : start_index = i.end() - 5 break ft = re.finditer(r"bổ\ssung\s.+(sau|trước).{5}",title,re.U) for i in ft : start_index = i.end() - 5 break part_name = None findName = re.finditer(r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title[start_index:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title[start_index:]) for fN in findName: part_name = title[start_index+fN.span()[0]:start_index+fN.span()[1]] break chap_name = None findName = 
re.finditer(r"(chương)\s([A-Z]|[0-9])+",title[start_index:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(chương)\s([A-Z]|[0-9])+",title[start_index:]) for fN in findName: chap_name = title[start_index+fN.span()[0]:start_index+fN.span()[1]] break sec_name = None findName = re.finditer(r"(mục)\s([A-Z]|[0-9])+",title[start_index:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(mục)\s([A-Z]|[0-9])+",title[start_index:]) for fN in findName: sec_name = title[start_index+fN.span()[0]:start_index+fN.span()[1]] break law_name = None findName = re.finditer(r"điều [0-9]+[A-zĐđ]*",title[start_index:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"điều [0-9]+[A-zĐđ]*",title[start_index:]) for fN in findName: law_name = title[start_index+fN.span()[0]:start_index+fN.span()[1]] break item_name = None findName = re.finditer(r"(khoản\s)[0-9]+",title[start_index:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(khoản\s)[0-9]+",title[start_index:]) for fN in findName: item_name = title[start_index+fN.span()[0] + 8:start_index+fN.span()[1]] break point_name = None temp = title findName = re.finditer(r"(điểm\s)[A-z]+",title[start_index:],re.U) if divlaw.lenIterator(findName) > 0 : findName = re.finditer(r"(điểm\s)[A-zđ]+",temp[start_index:],re.U) for fN in findName: point_name = temp[start_index+fN.span()[0]:start_index+fN.span()[0]] break if 'sau' in title[:start_index]: type_modify = 9 elif 'trước' in title[:start_index]: type_modify = 10 yield[ law_id , position, type_modify, part_name, chap_name, sec_name, law_name, item_name, point_name, None, None, None, numerical_symbol, released_date ] if type_modify == 2 : t = re.compile(r'(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(đ|Đ)iều\s((\w{1,5})|(\d{1,5}))') extract = t.finditer(content) if(lenIterator(extract)>0): for extract in t.finditer(content): temp_law = 
re.search(r'(đ|Đ)iều\s((\d{1,5})([a-zđ]|[A-Z])?)',content[extract.span()[0]:extract.span()[1]]) if(temp_law is not None): law = temp_law.group() else : law = None temp_item = re.search(r'(Khoản|khoản)\s(\w{1,5}|\d{1,5})',content[extract.span()[0]:extract.span()[1]]) if(temp_item is not None): item = temp_item.group()[8:] else : item = None temp_point = re.search(r'(đ|Đ)iểm\s(\w{1,5}|\d{1,5})',content[extract.span()[0]:extract.span()[1]]) if(temp_point is not None): point = temp_point.group()[8:] else : point = None yield[ law_id, position, type_modify, None, None, None, law, item, point, None, None, None, numerical_symbol, released_date ] else : yield[ law_id, position, type_modify, None, None, None, None, None, None, None, None, None, numerical_symbol, released_date ] if(type_modify == 3 ): p =re.compile(r'(B|b)ổ\ssung\s(cụm\s)*từ\s') for location in p.finditer(content): sub_content = content[location.span()[1]:len(content)] temp = p.finditer(sub_content) if(lenIterator(temp)>0): for temp in p.finditer(sub_content): sub_content = sub_content[0:temp.span()[0]] break temp_replace = re.search(r'(\“|\")(\s)*.+(\s)*(\”|\")\s.*sau\s(cụm\s)*từ\s',sub_content) if(temp_replace is not None): temp_from_replace = re.search(r'(\“|\")(\s)*.+(\s)*(\”|\")',temp_replace.group()) from_replace = temp_from_replace.group() temp_replace = re.search(r'sau\s(cụm\s)*từ\s(\“|\")(\s)*.+(\s)*(\”|\")',sub_content) temp_to_replace = re.search(r'(\“|\")(\s)*.+(\s)*(\”|\")',temp_replace.group()) to_replace = temp_to_replace.group() t = re.compile(r'(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(đ|Đ)iều\s((\w{1,5})|(\d{1,5}))') extract = t.finditer(sub_content,re.DOTALL) if(lenIterator(extract)>0): for extract in t.finditer(sub_content): temp_law = re.search(r'(đ|Đ)iều\s((\d{1,5})([a-zđ]|[A-Z])?)',sub_content[extract.span()[0]:extract.span()[1]]) if(temp_law is not None): law = 
temp_law.group() else : law = None temp_item = re.search(r'(Khoản|khoản)\s(\w{1,5}|\d{1,5})',sub_content[extract.span()[0]:extract.span()[1]]) if(temp_item is not None): item = temp_item.group()[8:] else : item = None temp_point = re.search(r'(đ|Đ)iểm\s(\w{1,5}|\d{1,5})',sub_content[extract.span()[0]:extract.span()[1]]) if(temp_point is not None): point = temp_point.group()[8:] else : point = None yield[ law_id, position, type_modify, None, None, None, law, item, point, sub_content, from_replace, to_replace, numerical_symbol, released_date ] else : yield[ law_id, position, type_modify, None, None, None, None, None, None, None, None, None, numerical_symbol, released_date ] if(type_modify == 4 ): p =re.compile(r'((t|T)hay\s)*(cụm\s)*từ\s') for location in p.finditer(content): sub_content = content[location.span()[1]:len(content)] temp = p.finditer(sub_content) if(lenIterator(temp)>0): for temp in p.finditer(sub_content): # sub_content_from : lấy cụm từ cần sửa đổi để tách sub_content_from = sub_content[0:temp.span()[1]] break temp_replace = re.search(r'(\“|\")(\s)*.+(\s)*(\”|\")\s.*(được\s)*(thay\s)*bằng\s(cụm\s)*từ',sub_content_from) if(temp_replace is not None): temp_from_replace = re.search(r'(\“|\")(\s)*.+(\s)*(\”|\")',temp_replace.group()) from_replace = temp_from_replace.group() temp_replace = re.search(r'(được\s)*(thay\s)*bằng\s(cụm\s)*từ\s(\“|\")(\s)*.+(\s)*(\”|\")',sub_content) if(temp_replace is not None): temp_to_replace = re.search(r'(\“|\")(\s)*.+(\s)*(\”|\")',temp_replace.group()) to_replace = temp_to_replace.group() t = re.compile(r'(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(đ|Đ)iều\s((\w{1,5})|(\d{1,5}))') extract = t.finditer(sub_content,re.DOTALL) if(lenIterator(extract)>0): for extract in t.finditer(sub_content): temp_law = re.search(r'(đ|Đ)iều\s((\d{1,5})([a-zđ]|[A-Z])?)',sub_content[extract.span()[0]:extract.span()[1]]) if(temp_law is 
not None): law = temp_law.group() else : law = None temp_item = re.search(r'(Khoản|khoản)\s(\w{1,5}|\d{1,5})',sub_content[extract.span()[0]:extract.span()[1]]) if(temp_item is not None): item = temp_item.group()[8:] else : item = None temp_point = re.search(r'(đ|Đ)iểm\s(\w{1,5}|\d{1,5})',sub_content[extract.span()[0]:extract.span()[1]]) if(temp_point is not None): point = temp_point.group()[8:] else : point = None yield[ law_id, position, type_modify, None, None, None, law, item, point, sub_content, from_replace, to_replace, numerical_symbol, released_date ] else : yield[ law_id, position, type_modify, content, None, None, None, None, None, None, None, None, numerical_symbol, released_date ] if(type_modify == 7): text_delete = re.search(r'(\“|\").+(\”|\")',content,re.M|re.I) if(text_delete is not None): # numerical_symbol = get_numerical_symbol(content) t = re.compile(r'(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(c|C)hương\s(\w{1,10}|\d{1,5})\s|(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(đ|Đ)iều\s((\w{1,5})|(\d{1,5}))') extract = t.finditer(content) if(lenIterator(extract)>0): for extract in t.finditer(content): temp_chapter = re.search(r'(c|C)hương\s(\w{1,10}|\d{1,5})',content[extract.span()[0]:extract.span()[1]]) if(temp_chapter is not None): chapter = temp_chapter.group() else: chapter = None temp_law = re.search(r'(đ|Đ)iều\s((\d{1,5})([a-zđ]|[A-Z])?)',content[extract.span()[0]:extract.span()[1]]) if(temp_law is not None): law = temp_law.group() else : law = None temp_item = re.search(r'(Khoản|khoản)\s(\w{1,5}|\d{1,5})',content[extract.span()[0]:extract.span()[1]]) if(temp_item is not None): item = 
temp_item.group()[8:] else : item = None temp_point = re.search(r'(đ|Đ)iểm\s(\w{1,5}|\d{1,5})',content[extract.span()[0]:extract.span()[1]]) if(temp_point is not None): point = temp_point.group()[8:] else : point = None yield[ law_id, position, type_modify, None, chapter, None, law, item, point, text_delete.group(), None, None, numerical_symbol, released_date ] else : t = re.compile(r'(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(c|C)hương\s(\w{1,10}|\d{1,5})\s|(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(đ|Đ)iều\s((\w{1,5})|(\d{1,5}))') extract = t.finditer(content) if(lenIterator(extract)>0): for extract in t.finditer(content): temp_chapter = re.search(r'(c|C)hương\s(\w{1,10}|\d{1,5})',content[extract.span()[0]:extract.span()[1]]) if(temp_chapter is not None): chapter = temp_chapter.group() else: chapter = None temp_law = re.search(r'(đ|Đ)iều\s((\d{1,5})([a-zđ]|[A-Z])?)',content[extract.span()[0]:extract.span()[1]]) if(temp_law is not None): law = temp_law.group() else : law = None temp_item = re.search(r'(Khoản|khoản)\s(\w{1,5}|\d{1,5})',content[extract.span()[0]:extract.span()[1]]) if(temp_item is not None): item = temp_item.group()[8:] else : item = None temp_point = re.search(r'(đ|Đ)iểm\s(\w{1,5}|\d{1,5})',content[extract.span()[0]:extract.span()[1]]) if(temp_point is not None): point = temp_point.group()[8:] else : point = None yield[ law_id, position, type_modify, None, chapter, None, law, item, point, "NA", None, None, numerical_symbol, released_date ] yield[ law_id, position, type_modify, None, None, None, None, None, None, None, None, None, numerical_symbol, released_date ] if(type_modify == 5): location = 
re.search('(t|T)ên của\s.*\sđược\s((s|S)ửa đổi\,\s)*((b|B)ổ sung\s)*',content) if(location is not None): sub_content = location.group() text = re.search('(\"|\").*(\"|\")',content) t = re.compile(r'(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(c|C)hương\s(\w{1,10}|\d{1,5})\s|(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(đ|Đ)iều\s((\w{1,5})|(\d{1,5}))') extract = t.finditer(sub_content) if(lenIterator(extract)>0): for extract in t.finditer(sub_content): temp_chapter = re.search(r'(c|C)hương\s(\w{1,10}|\d{1,5})',sub_content[extract.span()[0]:extract.span()[1]]) if(temp_chapter is not None): chapter = temp_chapter.group() else: chapter = None temp_law = re.search(r'(đ|Đ)iều\s((\d{1,5})([a-zđ]|[A-Z])?)',sub_content[extract.span()[0]:extract.span()[1]]) if(temp_law is not None): law = temp_law.group() else : law = None temp_item = re.search(r'(Khoản|khoản)\s(\w{1,5}|\d{1,5})',sub_content[extract.span()[0]:extract.span()[1]]) if(temp_item is not None): item = temp_item.group()[8:] else : item = None temp_point = re.search(r'(đ|Đ)iểm\s(\w{1,5}|\d{1,5})',sub_content[extract.span()[0]:extract.span()[1]]) if(temp_point is not None): point = temp_point.group()[8:] else : point = None yield[ law_id, position, type_modify, None, chapter, None, law, item, point, sub_content, None, text.group(), numerical_symbol, released_date ] else : yield[ law_id, position, type_modify, None, None, None, None, None, None, None, None, None, numerical_symbol, released_date ] if(type_modify == 6): text = re.search('(\“|\"|\").*(\”|\"|\")',content) if(text is not None): t = 
re.compile(r'(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(c|C)hương\s(\w{1,10}|\d{1,5})\s|(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(đ|Đ)iều\s((\w{1,5})|(\d{1,5}))') extract = t.finditer(content) if(lenIterator(extract)>0): for extract in t.finditer(content): temp_chapter = re.search(r'(c|C)hương\s(\w{1,10}|\d{1,5})',content[extract.span()[0]:extract.span()[1]]) if(temp_chapter is not None): chapter = temp_chapter.group() else: chapter = None temp_law = re.search(r'(đ|Đ)iều\s((\d{1,5})([a-zđ]|[A-Z])?)',content[extract.span()[0]:extract.span()[1]]) if(temp_law is not None): law = temp_law.group() else : law = None temp_item = re.search(r'(Khoản|khoản)\s(\w{1,5}|\d{1,5})',content[extract.span()[0]:extract.span()[1]]) if(temp_item is not None): item = temp_item.group()[8:] else : item = None temp_point = re.search(r'(đ|Đ)iểm\s(\w{1,5}|\d{1,5})',content[extract.span()[0]:extract.span()[1]]) if(temp_point is not None): point = temp_point.group()[8:] else : point = None yield[ law_id, position, type_modify, None, chapter, None, law, item, point, None, None, text.group(), numerical_symbol, released_date ] else : yield[ law_id, position, type_modify, None, None, None, None, None, None, None, None, None, numerical_symbol, released_date ]
def extract(mention_id="text", sentence_text="text", tokens="text[]",
            begin_exp="int", end_exp="int", begin_explain="int",
            end_explain="int", sentence_source="text[]",
            position_source="text[]"):
    """Yield weighted feature rows for one expression/explanation mention.

    Every row has the shape ``[mention_id, weight, feature_name]``.

    NOTE(review): the string defaults ("text", "int", "text[]") look like
    column-type declarations read by the calling framework -- confirm.

    Args:
        mention_id: Mention identifier. The part after its first underscore,
            up to and including its second-to-last underscore, is used as a
            position prefix when looking up source sentences.
        sentence_text: Not used by this function.
        tokens: Tokens of the sentence, indexed by the span arguments.
        begin_exp: Index of the first token of the expression.
        end_exp: Index of the last token of the expression (inclusive).
        begin_explain: Index of the first token of the explanation.
        end_explain: Index of the last token of the explanation (inclusive).
        sentence_source: Source sentences, indexed in parallel with
            ``position_source``.
        position_source: Position strings of the source sentences.

    Yields:
        * ``-10, "forbidden_word_1"`` once for each forbidden word found in
          the first two explanation tokens or the last two expression tokens.
        * ``-1, "forbidden_word_2"`` when the last expression token is a
          forbidden word.
        * ``-4, "forbidden_word_3" / "_4" / "_5"`` when the expression minus
          its last token contains "nếu", "đối_với" or "trường_hợp"
          (lower-case or capitalised).
        * ``1, "in_explain_words_law"`` for each source sentence whose
          position contains the prefix and whose text matches the
          "giải thích từ ngữ" pattern, only when the joined expression is
          shorter than 60 characters.
    """
    forbidden_word = (
        "nếu", "phải", "đó", "không", "được", "đã", "đồng_thời", "cần",
        "chỉ", 'cụ_thể', 'ai', 'đây',
    )
    lower = handle_string.toLowerCase

    for offset in range(2):
        # First two tokens of the explanation span.
        if begin_explain + offset <= end_explain:
            if lower(tokens[begin_explain + offset]) in forbidden_word:
                yield [mention_id, -10, "forbidden_word_1"]
        # Last two tokens of the expression span.
        if end_exp - offset >= begin_exp:
            if lower(tokens[end_exp - offset]) in forbidden_word:
                yield [mention_id, -10, "forbidden_word_1"]

    if lower(tokens[end_exp]) in forbidden_word:
        yield [mention_id, -1, "forbidden_word_2"]

    # The slice deliberately stops before end_exp, exactly as the original
    # membership tests did.
    phrase_head = tokens[begin_exp:end_exp]
    for plain, capitalised, feature in (
            ("nếu", "Nếu", "forbidden_word_3"),
            ("đối_với", "Đối_với", "forbidden_word_4"),
            ("trường_hợp", "Trường_hợp", "forbidden_word_5"),
    ):
        if plain in phrase_head or capitalised in phrase_head:
            yield [mention_id, -4, feature]

    # Position prefix: from just after the first '_' up to and including the
    # second-to-last '_'. Index 0 is never considered when scanning from the
    # right, hence the start argument of 1.
    first_sep = mention_id.find('_')
    if first_sep < 0:
        first_sep = len(mention_id)
    last_sep = mention_id.rfind('_', 1)
    prev_sep = mention_id.rfind('_', 1, last_sep) if last_sep > 0 else -1
    position_require = mention_id[first_sep + 1:max(prev_sep, 0) + 1]

    # range() instead of xrange() keeps this runnable on Python 2 and 3.
    explain_text = " ".join(
        tokens[k] for k in range(begin_exp, end_exp + 1))
    if len(explain_text) >= 60:
        return

    for idx, position in enumerate(position_source):
        if position_require not in position:
            continue
        # divlaw.lenIterator presumably counts the matches -- confirm.
        if divlaw.lenIterator(
                re.finditer(r"giải(\s|\_)thích(\s|\_)từ(\s|\_)ngữ",
                            sentence_source[idx], re.U | re.I)) > 0:
            yield [mention_id, 1, "in_explain_words_law"]
def extract(
        mention_id="text",
        sentence_text="text",
        tokens="text[]",
        begin_exp="int",
        end_exp="int",
        begin_explain="int",
        end_explain="int",
        sentence_source="text[]",
        position_source="text[]"
):
    """Yield ``[mention_id, weight, feature_name]`` rows for one mention.

    Checks performed, in order:

    * For step 0 and 1: the token at ``end_exp + 2 + step`` (while it does
      not pass ``end_explain``) and the token at ``end_exp - step`` (while it
      does not precede ``begin_exp``) each produce ``-10,
      "forbidden_word_1"`` when their lower-cased form is a stop word.
    * The token at ``end_exp`` produces ``-1, "forbidden_word_2"`` when it is
      a stop word.
    * "nếu" / "Nếu" inside ``tokens[begin_exp:end_exp]`` produces ``-1,
      "forbidden_word_3"``.
    * Every source sentence whose position string contains the prefix taken
      from ``mention_id`` and whose text matches ``Giải_thích\\stừ_ngữ``
      (case-insensitive) produces ``1, "in_explain_words_law"``.

    ``sentence_text`` is accepted but not read.
    """
    stop_words = (
        "nếu", "phải", "đó", "không", "được", "đã", "đồng_thời", "cần",
        "chỉ", 'cụ_thể',
    )

    for step in (0, 1):
        after_idx = end_exp + 2 + step
        if (after_idx <= end_explain
                and handle_string.toLowerCase(tokens[after_idx]) in stop_words):
            yield [mention_id, -10, "forbidden_word_1"]

        inside_idx = end_exp - step
        if (inside_idx >= begin_exp
                and handle_string.toLowerCase(tokens[inside_idx]) in stop_words):
            yield [mention_id, -10, "forbidden_word_1"]

    if handle_string.toLowerCase(tokens[end_exp]) in stop_words:
        yield [mention_id, -1, "forbidden_word_2"]

    head_tokens = tokens[begin_exp:end_exp]
    if "nếu" in head_tokens or "Nếu" in head_tokens:
        yield [mention_id, -1, "forbidden_word_3"]

    # Prefix = text after the first '_' through the second-to-last '_'
    # (inclusive). Scanning from the right never looks at index 0.
    left = mention_id.find('_')
    if left == -1:
        left = len(mention_id)
    rightmost = mention_id.rfind('_', 1)
    if rightmost == -1:
        right = 0
    else:
        right = mention_id.rfind('_', 1, rightmost)
        if right == -1:
            right = 0
    position_require = mention_id[left + 1:right + 1]

    for idx, position in enumerate(position_source):
        if position_require not in position:
            continue
        hits = re.finditer(r"Giải_thích\stừ_ngữ", sentence_source[idx],
                           re.U | re.I)
        if divlaw.lenIterator(hits) > 0:
            yield [mention_id, 1, "in_explain_words_law"]
def extract(law_id="text", totalLaw="int", law_content="text",
            totalItem="int", item_content="text", totalpoint="int",
            point_content="text", part_index="int", chap_index="int",
            sec_index="int", law_index="int", item_index="int",
            point_index="int"):
    """Yield title/content rows for an amending point, item or article.

    The text is inspected at up to three levels, most specific first
    (point, then item, then law). At each level the first occurrence of a
    colon followed by an opening quote (``: "`` or ``: “``, optionally
    separated by whitespace, a literal ``\\n`` sequence, ``*``, ``_`` or
    ``#``) splits the text: everything before it is the "title".
    ``get_numerical_symbol(title)`` (defined elsewhere) is then asked for a
    symbol.

    Rows have the shape::

        [law_id, part_index, chap_index, sec_index, law_index, item_index,
         point_index, numerical_symbol, title, content, start, count]

    where ``start`` is the offset of the colon match and ``count`` is
    ``divlaw.lenIterator`` applied to the quoted-paragraph matches in
    ``content`` (presumably the number of matches -- confirm).

    ``done`` tracks progress between the levels:

    * 0 -- nothing decided yet
    * 2 -- point title had no symbol; point start/title/content remembered
    * 3 -- point row emitted (currently unreachable, see note in the body)
    * 4 -- item title had no symbol; item start/title/content remembered
    * 5 -- item row emitted
    * 6 -- symbol for a pending point found in the item title
    * 7 -- symbol for a pending point/item found in the law title

    ``end`` is set to 1 when a consulted level has no colon+quote marker;
    the remaining levels are then not consulted.

    Args:
        law_id: Identifier copied into every row.
        totalLaw, totalItem, totalpoint: A level is consulted only when its
            total is greater than 0.
        law_content, item_content, point_content: Text of each level.
        part_index, chap_index, sec_index, law_index, item_index,
        point_index: Copied unchanged into every row.

    Yields:
        Zero or more rows as described above.
    """
    # Colon, optional filler, then an opening quote.
    p = re.compile(r'\:(\s|\\n|\*|\_|\#)*(\“|\")')
    # Quoted paragraph introduced by a literal "\n" sequence.
    quoted_paragraph = r"(\\n(\s|\_|\.|\*|\#)*\“(.(?!\“|\”))+.{2})|(\\n(\s|\_|\.|\*|\#)*\"(.(?!\"))+.{2})"
    numerical_sybol = None
    done = 0
    end = 0

    if totalpoint > 0:
        # p.search() replaces ``lenIterator(p.finditer(text, re.DOTALL)) > 0``
        # followed by a first-match loop. On a compiled pattern the second
        # positional argument is ``pos``, so the old pre-check silently
        # started at offset 16 (the value of re.DOTALL) and missed earlier
        # matches.
        get_content = p.search(point_content)
        if get_content is not None:
            start_point = get_content.start()
            title_point = point_content[0:start_point]
            numerical_sybol_point = get_numerical_symbol(title_point)
            if numerical_sybol_point is None:
                done = 2
                start = start_point
                title = title_point
                content = point_content
            # NOTE(review): numerical_sybol is still None at this point, so
            # this branch never runs. The row it builds uses
            # numerical_sybol_point, which suggests the test was meant to be
            # on that variable -- confirm before changing, since it alters
            # which rows are emitted.
            elif numerical_sybol is not None:
                done = 3
                match = re.finditer(quoted_paragraph, point_content, re.DOTALL)
                count = divlaw.lenIterator(match)
                yield [
                    law_id, part_index, chap_index, sec_index, law_index,
                    item_index, point_index, numerical_sybol_point,
                    title_point, point_content, start_point, count
                ]
        else:
            end = 1

    if totalItem > 0 and done in (0, 2) and end != 1:
        get_content = p.search(item_content)
        if get_content is not None:
            if done == 0:
                start_item = get_content.start()
                title_item = item_content[0:start_item]
                numerical_sybol_item = get_numerical_symbol(title_item)
                if numerical_sybol_item is None:
                    done = 4
                    start = start_item
                    title = title_item
                    content = item_content
                else:
                    done = 5
                    match = re.finditer(quoted_paragraph, item_content, re.DOTALL)
                    count = divlaw.lenIterator(match)
                    yield [
                        law_id, part_index, chap_index, sec_index, law_index,
                        item_index, point_index, numerical_sybol_item,
                        title_item, item_content, start_item, count
                    ]
            elif done == 2:
                start_item = get_content.start()
                title_item = item_content[0:start_item]
                # content switches to the item text here, while start and
                # title keep the values remembered from the point.
                content = item_content
                numerical_sybol_item = get_numerical_symbol(title_item)
                if numerical_sybol_item is not None:
                    numerical_sybol = numerical_sybol_item
                    done = 6
        else:
            end = 1

    if totalLaw > 0 and done in (0, 2, 4) and end != 1:
        get_content = p.search(law_content)
        if get_content is not None:
            start_law = get_content.start()
            title_law = law_content[0:start_law]
            numerical_sybol = get_numerical_symbol(title_law)
            if done == 0:
                match = re.finditer(quoted_paragraph, law_content, re.DOTALL)
                count = divlaw.lenIterator(match)
                yield [
                    law_id, part_index, chap_index, sec_index, law_index,
                    item_index, point_index, numerical_sybol, title_law,
                    law_content, start_law, count
                ]
            elif numerical_sybol is not None:
                # done is 2 or 4 here: a lower level is still pending.
                done = 7

    # Emit the pending lower-level row, with whatever symbol was (or was
    # not) found further up.
    if done in (2, 4, 6, 7):
        match = re.finditer(quoted_paragraph, content, re.DOTALL)
        count = divlaw.lenIterator(match)
        yield [
            law_id, part_index, chap_index, sec_index, law_index,
            item_index, point_index, numerical_sybol, title, content,
            start, count
        ]