def __init__(self, filepath, common_element=None):
    """Parse the HTML file at *filepath* and attach the element used while parsing.

    The parsed tree is stored on ``self.tree``.  When *common_element* is
    supplied it is reused as ``self.elem`` and re-pointed at *filepath* via
    ``set_filepath``; otherwise a new ``HRSElement`` is built from *filepath*.
    """
    self.tree = hrs_xml.parse_html_to_xml_tree(filepath)
    if common_element is not None:
        # Reuse the caller's element, re-pointed at this file.
        self.elem = common_element
        self.elem.set_filepath(filepath)
        return
    self.elem = HRSElement(filepath)
class HRSIndex(object):
    """Line-oriented parser for one HRS index HTML page.

    The page is parsed into an XML tree, the text of its ``/html/body/div/p``
    elements is normalised into a list of lines, and ``parse_all`` walks those
    lines while tracking the current position in the hierarchy (division,
    title, subtitle, chapter, article, part, subpart, section) on ``self.elem``.
    A deep copy of ``self.elem`` is appended to the result list for every
    chapter or section that is recognised.
    """

    # filepath is a pathlib.Path
    def __init__(self, filepath, common_element=None):
        """Parse *filepath* and set up the element that carries parse state.

        If *common_element* is given it is reused (after ``set_filepath``);
        otherwise a new ``HRSElement`` is created from *filepath*.
        """
        self.tree = hrs_xml.parse_html_to_xml_tree(filepath)
        if common_element is None:
            self.elem = HRSElement(filepath)
        else:
            self.elem = common_element
            self.elem.set_filepath(filepath)

    def get_elements(self):
        """Return the elements selected by the ``/html/body/div/p`` XPath."""
        return hrs_xml.get_elements_by_xpaths(self.tree, ["/html/body/div/p"])

    def get_header(self):
        """Return the first two space-separated words of the first element, upper-cased.

        Leading/trailing spaces, periods and non-breaking spaces are stripped
        from the result.
        """
        header = self.get_elements()[0].text
        header = header.split(" ")[:2]
        header = " ".join(header)
        header = header.upper()
        header = header.strip(" .\xa0")
        return header

    def get_lines(self):
        """Return the normalised text of every element that is not an ``XNotes*`` element.

        Empty strings are kept in the result; ``parse_line`` treats them as
        blank separator lines.
        """

        def get_element_text(e):
            text = str(e.text) if e.text is not None else ""
            # collapse multiple lines into single line
            text = text.replace("\n", " ")
            # replace multiple spaces with single space
            text = re.sub(" +", " ", text)
            # strip leading spaces and brackets
            text = text.lstrip(" []")
            # strip trailing spaces/brackets, but keep a trailing "[NEW]"/"[OLD]" marker
            while text and text[-1] in " ]" and text.upper()[-5:] not in ["[NEW]", "[OLD]"]:
                text = text[:-1]
            return text

        # An element without a class attribute yields None from get(); treat it
        # as "not XNotes" instead of crashing on None.startswith.
        elements = [e for e in self.get_elements() if not (e.get("class") or "").startswith("XNotes")]
        return [get_element_text(e) for e in elements]

    def debug_print_elements_in_classes(
        self, classes=("MsoNormal", "1Paragraph", "XNotes", "New", "RegularParagraphs", "XNotesHeading")
    ):
        """Print to stderr the url, class and text of every element whose class is in *classes*."""
        for e in self.get_elements():
            clazz = e.get("class")
            if clazz in classes:
                print('url={}\nclass="{}" txt = "{}"'.format(self.elem.url, clazz, e.text), file=sys.stderr)

    def parse_all(self):
        """Parse every line of the page and return the collected elements.

        ``parse_line`` is called until ``self.parse_line_num`` reaches the end
        of the line list (``parse_line`` advances the counter itself).  If it
        raises, ``DEBUG_OUT`` is switched to stderr, a context dump is printed
        and the exception is re-raised.  When ``DEBUG_OUT`` is already set the
        dump is printed after every line and the method waits on ``input()``:
        an empty reply steps to the next line, ``c`` turns debugging off, and
        anything else exits the process.

        Returns the list of ``capwords()`` results for the collected elements.
        """
        global DEBUG_OUT
        self.parse_line_num = 0
        self.parse = ParseMode()
        hrs_elems = []
        lines = self.get_lines()
        while self.parse_line_num < len(lines):
            # Snapshot the state before the line is consumed; only used for the debug dump.
            prev_elem = copy.deepcopy(self.elem)
            prev_append = self.parse.append()
            lineno = self.parse_line_num
            exception = None
            result = 0
            try:
                result = self.parse_line(lines, hrs_elems)
            except Exception as e:
                exception = e
            if exception:
                DEBUG_OUT = sys.stderr
            if DEBUG_OUT:
                print("\nparse_line(lineno={})".format(lineno), file=DEBUG_OUT)
                print("prev_append={}".format(prev_append), file=DEBUG_OUT)
                print("prev_elem = {}".format(prev_elem.json_txt()), file=DEBUG_OUT)
                # Show three lines of context on either side of the current line.
                for j in range(-3, 4):
                    i = lineno + j
                    if 0 <= i < len(lines):
                        jt = "**" if not j else "{:>2d}".format(j)
                        print("{:>5}({:>2}): {}".format(i, jt, lines[i]), file=DEBUG_OUT)
                print("self.elem = {}".format(self.elem.json_txt()), file=DEBUG_OUT)
                print("append={}".format(self.parse.append()), file=DEBUG_OUT)
                print("parse_mode={}".format(self.parse.mode_), file=DEBUG_OUT)
                print("result={}".format(result), file=DEBUG_OUT)
                if not exception:
                    cmd = input()
                    if len(cmd) == 0:
                        continue
                    if cmd == "c":
                        DEBUG_OUT = None
                    else:
                        sys.exit(1)
            if exception:
                raise exception
        hrs_elems = [e.capwords() for e in hrs_elems]
        return hrs_elems

    def parse_line(self, lines, hrs_elems):
        """Consume the line at ``self.parse_line_num`` (and sometimes the next one).

        Updates ``self.elem`` / ``self.parse`` and may append to or modify
        *hrs_elems*.  The integer returned identifies which branch handled the
        line; ``parse_all`` only uses it in its debug output.
        """
        line = lines[self.parse_line_num]
        lupper = line.upper()
        self.parse_line_num += 1

        # Begin SPECIAL CASES
        #
        # Text specific ifs for special cases where the general parsing code breaks
        chap_sec = "{}-{}".format(self.elem.chapter, self.elem.section)
        if chap_sec in ["235-2.5", "269-15"] and lupper == ParseMode.CHAPTER:
            hrs_elems[-1].section_text += " chapter"
            return 1
        if chap_sec in ["386-153", "386-154"] and lupper == "386-121(A)(1)":
            hrs_elems[-1].section_text += "386-121(a)(1)"
            return 2
        # Rewrite specific lines before general parsing.
        if line == "431-17 Referendum":
            line = "421-17 Referendum"
        if line == "409:2A-210 Express warranties":
            line = "490:2A-210 Express warranties"
        if line == "409:9-405 Modification of assigned contract":
            line = "490:9-405 Modification of assigned contract"
        if line == "463B-19.5 Suspension or denial of license for noncompliance":
            line = "436B-19.5 Suspension or denial of license for noncompliance"
        if line.startswith("490-7:501"):
            line = "490:7-501" + line[9:]
        # For these files, stop parsing (skip to the end) once a known marker line is reached.
        if (
            line == "DERIVATION TABLE OF CHAPTER 414 FROM"
            and self.elem.year in ["current"]
            and self.elem.filepath.name == "HRS_0414-.htm"
        ):
            self.parse_line_num = len(lines)
            return 3
        if line == "PREAMBLE" and self.elem.year in ["current"] and self.elem.filepath.name == "HRS_0084-.htm":
            self.parse_line_num = len(lines)
            return 4
        if (
            line == "Patients' bill of rights and responsibilities act, see chapter 432E."
            and self.elem.year in ["current"]
            and self.elem.filepath.name == "HRS_0327G-.htm"
        ):
            self.parse_line_num = len(lines)
            return 5
        if (
            line.startswith("Defendant lender's motion for summary judgment")
            and self.elem.year in ["current"]
            and self.elem.filepath.name == "HRS_0480-.htm"
        ):
            self.parse_line_num = len(lines)
            return 6
        # Skip the final line of these files.
        if self.parse_line_num == len(lines) and self.elem.year in ["current"]:
            if self.elem.filepath.name in (
                # http://www.capitol.hawaii.gov/hrscurrent/Vol03_Ch0121-0200D/HRS0174C/HRS_0174C-.htm
                # line: " Where Justice Flows Like Water: The Moon Court's Role in Illuminating Hawai‘i Water Law. 33 UH L. Rev. 537 (2011)."
                "HRS_0174C-.htm",
                # http://www.capitol.hawaii.gov/hrscurrent/Vol05_Ch0261-0319/HRS0302A/HRS_0302A-.htm
                # line: "Medical marijuana patient and caregiver protections, see §329-125.5."
                "HRS_0302A-.htm",
                # http://www.capitol.hawaii.gov/hrscurrent/Vol06_Ch0321-0344/HRS0323/HRS_0323-.htm
                # line: "Kahuku medical center; designation as a rural hospital, see §346-53.8."
                "HRS_0323-.htm",
                # www.capitol.hawaii.gov/hrscurrent/Vol12_Ch0501-0588/HRS0572B/HRS_0572B-.htm
                # line: "Continuity of rights; civil union and reciprocal beneficiary relationships, see §572-1.7."
                "HRS_0572B-.htm",
            ):
                return 7

        # Disabled rewrites, kept for reference:
        #   "5.12.2 State land mammal" -> "5-12.2 State land mammal"
        #   "8.18.5 Sakada Day"        -> "8-18.5 Sakada Day"

        # http://www.capitol.hawaii.gov/hrs2014/Vol04_Ch0201-0257/HRS0237/HRS_0237-.htm"
        # http://www.capitol.hawaii.gov/hrs2015/Vol04_Ch0201-0257/HRS0237/HRS_0237-.htm"
        if self.elem.year in ["2013", "2014", "2015"] and self.elem.section == "237-13.3":
            # Append this and next line to previous section text
            hrs_elems[-1].section_text += " " + line + " " + lines[self.parse_line_num]
            self.parse_line_num += 1
            self.elem.set("section", "")
            return 8
        # End SPECIAL CASES

        # Recompute: the special cases above may have rewritten the line.
        lupper = line.upper()
        line_empty = len(line) == 0

        if self.parse.mode() == ParseMode.SUBTITLE:
            if line_empty:
                self.parse.mode(ParseMode.CHAPTER)
                return 9
            # NOTE(review): the "." is unescaped, so it matches any character after
            # the digits (e.g. a space) -- left as-is; confirm that is intended.
            if re.match("^[0-9]+.", lupper):
                self.parse.mode(ParseMode.CHAPTER)

        if line_empty:
            # NOTE(review): this sets an attribute on self, not on self.parse; nothing
            # in this class reads it. Possibly meant self.parse.append(False) -- confirm.
            self.parse_append = False
            # NOTE(review): when leaving CHAPTER mode UNKNOWN is set twice, presumably so
            # the self.parse.mode_[-2] lookup below does not see CHAPTER -- confirm
            # against ParseMode.
            if self.parse.mode() == ParseMode.CHAPTER:
                self.parse.mode(ParseMode.UNKNOWN)
            self.parse.mode(ParseMode.UNKNOWN)

        # A non-empty line in UNKNOWN mode whose previous mode was one of these
        # switches to SUBPART mode.
        if (
            len(line)
            and self.parse.mode() == ParseMode.UNKNOWN
            and self.parse.mode_[-2]
            in [ParseMode.CHAPTER, ParseMode.SECTION, ParseMode.SUBPART, ParseMode.PART, ParseMode.ARTICLE]
        ):
            self.parse.mode(ParseMode.SUBPART, False)

        # Heading / keyword lines.
        if lupper.startswith("APPENDIX"):
            self.parse.mode(ParseMode.UNKNOWN)
            return 10
        elif lupper in [
            ParseMode.NOTE,
            ParseMode.CROSS_REFERENCES,
            ParseMode.RULES_OF_COURT,
            ParseMode.LAW_JOURNALS_AND_REVIEWS,
            ParseMode.CASE_NOTES,
            ParseMode.REVISION_NOTE,
            ParseMode.APPENDIX,
        ]:
            self.parse.mode(ParseMode.UNKNOWN)
            return 11
        elif re.match("^DIVISION [0-9]+.", lupper):
            assert self.parse.mode() in [ParseMode.DIVISION, ParseMode.UNKNOWN]
            self.elem.set_id_and_text("division", line)
            # NOTE(review): switches to TITLE (not DIVISION) mode -- kept as found; confirm.
            self.parse.mode(ParseMode.TITLE, True)
            return 12
        elif re.match("^TITLE [0-9]+", lupper):
            assert self.parse.mode() in [ParseMode.UNKNOWN, ParseMode.TITLE]
            self.elem.set_id_and_text("title", line)
            self.parse.mode(ParseMode.TITLE, True)
            return 13
        elif re.match("^SUBTITLE [0-9]+", lupper):
            assert self.parse.mode() in [ParseMode.UNKNOWN, ParseMode.SUBPART, ParseMode.TITLE]
            self.elem.set_id_and_text("subtitle", line)
            self.parse.mode(ParseMode.SUBTITLE, True)
            return 14
        elif re.match("^ARTICLE [0-9?IV]+", lupper):
            assert self.parse.mode() in [ParseMode.UNKNOWN, ParseMode.CHAPTER, ParseMode.SECTION, ParseMode.SUBPART]
            # ensure the id after ARTICLE terminates with a period (.) so set_id_and_text works
            parts = line.split(" ")
            assert len(parts) > 1
            if parts[1][-1] != ".":
                parts[1] += "."
            line = " ".join(parts)
            self.elem.set_id_and_text("article", line)
            self.parse.mode(ParseMode.ARTICLE, True)
            return 15
        elif re.match("^CHAPTER [0-9]+", lupper):
            assert self.parse.mode() in [
                ParseMode.CHAPTER,
                ParseMode.SUBPART,
                ParseMode.UNKNOWN,
            ], "CHAPTER tag found in invalid parse_mode"
            # The chapter heading continues on the next line; consume it too.
            line = line + ". " + lines[self.parse_line_num]
            self.parse_line_num += 1
            self.elem.set_id_and_text("chapter", line)
            self.parse.mode(ParseMode.CHAPTER, True)
            hrs_elems.append(copy.deepcopy(self.elem))
            HRSChapters.add(hrs_elems[-1])
            HRSChapters.fixup(self.elem)
            return 16
        elif lupper == ParseMode.CHAPTER:
            assert self.parse.mode() in [ParseMode.TITLE, ParseMode.UNKNOWN, ParseMode.SUBTITLE, ParseMode.CHAPTER]
            self.parse.mode(ParseMode.CHAPTER)
            return 17
        elif lupper == ParseMode.SECTION:
            self.parse.mode(ParseMode.SUBPART, False)
            return 18
        elif lupper.startswith(ParseMode.CROSS_REFERENCES):
            self.parse.mode(ParseMode.UNKNOWN, False)
            return 19
        elif lupper.startswith("PART "):
            line = line[5:]
            parts = line.replace(".", " ").split(" ", 1)
            self.elem.set("part", parts[0])
            self.elem.set("part_text", parts[1] if len(parts) > 1 else "")
            self.elem.part_text = self.elem.part_text.strip(" []")
            self.parse.mode(ParseMode.PART, True)
            return 20

        if line_empty:
            return 21

        # Body lines: meaning depends on the current mode and the append flag.
        if self.parse.mode() == ParseMode.TITLE and self.parse.append():
            self.elem.title_text += " " + line
            return 23
        elif self.parse.mode() == ParseMode.ARTICLE and self.parse.append():
            self.elem.article_text += " " + line
            self.elem.article_text = self.elem.article_text.strip(" []")
            return 24
        elif self.parse.mode() == ParseMode.SUBTITLE and self.parse.append():
            self.elem.subtitle_text += " " + line
            return 25
        elif self.parse.mode() == ParseMode.PART and self.parse.append():
            # A section line ends the part heading; anything else continues it.
            if not self.parse_section_line(line, hrs_elems):
                self.elem.part_text += " " + line
                self.elem.part_text = self.elem.part_text.strip(" []")
                return 26
            return 27
        elif self.parse.mode() in [ParseMode.CHAPTER]:
            # Non-numeric line while appending: continuation of the previous chapter text.
            if self.parse.append() and not line[0].isdigit() and len(hrs_elems) > 0:
                hrs_elems[-1].chapter_text += " " + line
                HRSChapters.fixup(hrs_elems[-1])
                return 22
            chapter, chapter_text = line.split(" ", 1)
            self.elem.set("chapter", chapter)
            self.elem.set("chapter_text", chapter_text)
            self.parse.append(True)
            hrs_elems.append(copy.deepcopy(self.elem))
            HRSChapters.add(hrs_elems[-1])
            HRSChapters.fixup(self.elem)
            return 28
        elif self.parse.mode() in [ParseMode.SECTION, ParseMode.SUBPART]:
            if self.parse_section_line(line, hrs_elems):
                return 30
            if self.parse.append() and self.parse.mode() in [ParseMode.SECTION]:
                assert len(hrs_elems) > 0
                # No joining space after a trailing hyphen; endswith is safe on empty text.
                space = "" if hrs_elems[-1].section_text.endswith("-") else " "
                hrs_elems[-1].section_text += space + line
                return 29
            if self.parse.append() and self.parse.mode() in [ParseMode.SUBPART]:
                assert len(hrs_elems) > 0
                self.elem.subpart += " " + line
                return 33
            self.elem.set("subpart", line)
            self.parse.append(True)
            return 31
        return 32

    def get_section_prefixes(self):
        """Return the prefixes a section citation of the current chapter may start with.

        Always ``"<chapter>-"`` and ``"<chapter>."``; when the element has an
        article, ``"<chapter>:<article>-"`` is added, with the article passed
        through ``maybe_convert_roman_numeral``.
        """
        prefixes = [self.elem.chapter + "-", self.elem.chapter + "."]
        if self.elem.article:
            prefixes.append(self.elem.chapter + ":" + maybe_convert_roman_numeral(self.elem.article) + "-")
        return prefixes

    def parse_section_line(self, line, hrs_elems):
        """Try to parse *line* as ``"<citation> <section text>"`` for the current chapter.

        On success the section, section text and citation are stored on
        ``self.elem``, the parse mode becomes SECTION with append enabled, a
        deep copy of ``self.elem`` is appended to *hrs_elems* and ``True`` is
        returned.  Returns ``False`` when the line has no space-separated
        citation or the citation does not start with one of the prefixes from
        ``get_section_prefixes``.
        """
        # partition never raises: an empty line or one without a space is simply
        # not a section line.
        citation, sep, section_text = line.partition(" ")
        if not sep:
            return False
        citation = citation.strip(" ,")
        line_upper = line.upper()
        section = ""
        for prefix in self.get_section_prefixes():
            if line_upper.startswith(prefix.upper()):
                section = citation[len(prefix) :]
                break
        if not section:
            return False
        self.elem.set("section", section)
        self.elem.set("section_text", section_text)
        self.elem.set("citation", citation)
        self.parse.mode(ParseMode.SECTION, True)
        hrs_elems.append(copy.deepcopy(self.elem))
        return True