def import_xml(self, filename):
    """Import one XML file into ``Law`` rows.

    The file is read, passed through ``unescape``, parsed with lxml and
    then stored as follows:

    * a level-0 title row (``section=""``, ``psection=""``) when the first
      ``<hdsupnest>`` heading looks like ``TITLE <n> - <name>``; created
      with ``get_or_create`` so an existing row is left untouched;
    * one row for the first ``<section>``, whose text is the first XHTML
      ``div`` produced by ``self.xslt``;
    * one row per ``//section/sectioncontent/text`` element;
    * the rows written by ``parse_psection`` for every
      ``//section/sectioncontent/psection`` element.

    Side effects: sets ``self.title`` and ``self.section`` and advances
    ``self.ordering``.

    Returns ``None``.  Returns early (after possibly setting
    ``self.title`` and storing the title row) when the root element has no
    ``titlenum`` attribute or the document contains no ``<section>``.
    """
    parser = lxml.etree.XMLParser(
        dtd_validation=False, load_dtd=False, resolve_entities=False,
        encoding="utf8")
    source = os.path.basename(filename)
    with open(filename) as f:
        file_contents = unescape(f.read())

    # Drop a leading XML declaration before handing the text to lxml
    # (presumably because the parser is given an explicit encoding --
    # TODO confirm).
    file_contents = re.sub(
        r'''^<\?xml version="1.0" encoding="UTF-8"\s*\??>''',
        '', file_contents)
    # NOTE(review): as written this replaces "&" with itself and therefore
    # does nothing.  It looks like it was meant to re-escape bare
    # ampersands left behind by unescape(); confirm the intended
    # replacement text before changing it.
    file_contents = file_contents.replace("&", "&")
    xml = lxml.etree.fromstring(file_contents, parser=parser)

    try:
        self.title = xml.attrib['titlenum'].lstrip('0')
    except KeyError:
        # Without a title number there is nothing to key the rows on.
        return

    # Title heading.  ``.text`` is None when the element has no leading
    # text of its own; skip the heading in that case instead of letting
    # re.match() raise TypeError.
    headings = xml.xpath('//hdsupnest')
    title_section = headings[0].text if headings else None
    if title_section:
        match = re.match(
            r"TITLE (?P<title>\w+)\s*(?:-|—)\s*(?P<name>\w+)",
            title_section)
        if match:
            Law.objects.get_or_create(
                title=match.group('title').lstrip('0'),
                section="",
                psection="",
                defaults={
                    'text': title_section,
                    'order': self.ordering,
                    'level': 0,
                })
            self.ordering += 1

    sections = xml.xpath('//section')
    if not sections:
        return
    # Only the first <section> supplies the section number.
    self.section = sections[0].attrib['num']

    # Reuse an existing section-level row if there is one.
    matches = Law.objects.filter(
        title=self.title, section=self.section, psection="")
    if matches:
        law = matches[0]
    else:
        law = Law(title=self.title, section=self.section, psection="")
    law.order = self.ordering
    body = self.xslt(xml).xpath(
        '//xhtml:body/xhtml:div',
        namespaces={'xhtml': 'http://www.w3.org/1999/xhtml'})[0]
    law.text = unicode(lxml.etree.tostring(body))
    law.source = source
    law.set_name()
    law.save()

    # Plain text blocks directly under the section content: each one is
    # always stored as a new row.
    for sect_text in xml.xpath('//section/sectioncontent/text'):
        self.ordering += 1
        l2 = Law(
            title=self.title,
            section=self.section,
            psection="",
            order=self.ordering,
            text=unicode(sect_text.xpath('string()')),
            source=source)
        l2.set_name()
        l2.save()

    # Top-level psections; each call starts with a fresh ``parts`` list.
    for psection in xml.xpath('//section/sectioncontent/psection'):
        self.parse_psection(psection, [], source)
def parse_psection(self, psection, parts, source):
    """Store one ``<psection>`` element and its nested psections.

    ``psection`` is the element to process, ``parts`` is a list that
    receives this element's ``enum`` string for the duration of the call
    (it is handed to ``Law.set_name`` and popped again before returning),
    and ``source`` is written to the ``source`` field of every row saved.

    Steps, in order:

    1. Resolve every ``text/aref/subref`` of type ``title``, ``sec`` or
       ``psec`` to a ``Law`` row, creating a row with ``order=0`` when
       none exists.
    2. Walk the direct children: each ``text``/``head`` child bumps
       ``self.ordering`` and is saved as a ``Law`` row; each ``psection``
       child is processed recursively.
    3. If any references were resolved, assign them to ``references`` on
       the first row found for this psection.
    """
    parts.append(psection.xpath('string(enum)'))
    psection_id = psection.attrib['id']

    # Get references
    ref_laws = []
    for aref in psection.xpath('text/aref'):
        for subref in aref.xpath('subref'):
            kind = subref.attrib['type']
            if kind == 'title':
                # NOTE(review): the group captures exactly one character
                # after the prefix -- confirm that is intended for
                # multi-character title numbers.
                found = re.match(
                    r"usc_sup_01_([^_])", subref.attrib['target'])
                if not found:
                    continue
                title = found.group(1).lstrip('0')
                section = ""
                ref_psec_id = ""
            elif kind in ('sec', 'psec'):
                found = re.match(
                    r"usc_sec_(?P<title>\d+)_(?P<section>[^-]+)-*(?P<section2>[0-9A-Za-z]*)-?(?:\#(?P<psection>\w+))?",
                    subref.attrib['target'])
                if not found:
                    continue
                title, sec1, sec2, ref_psec_id = found.groups()
                title = title.lstrip('0')
                section = sec1.lstrip('0') + sec2.rstrip('0')
                # The psection group is optional and may be None.
                ref_psec_id = ref_psec_id or ""
            else:
                # Other subref types are ignored.
                continue

            existing = Law.objects.filter(
                title=title, section=section, psection=ref_psec_id)
            if len(existing) > 0:
                target_law = existing[0]
            else:
                # Row for a law that has no matching entry yet.
                target_law = Law.objects.create(
                    title=title,
                    section=section,
                    psection=ref_psec_id,
                    order=0)
            ref_laws.append(target_law)

    for child in psection:
        tag = child.tag
        if tag == "psection":
            self.parse_psection(child, parts, source)
            continue
        if tag not in ("text", "head"):
            continue

        self.ordering += 1
        candidates = Law.objects.filter(
            title=self.title, section=self.section, psection=psection_id)
        # Reuse a row only when it is the single match and has no source
        # set; otherwise start a new row.
        reusable = len(candidates) == 1 and not candidates[0].source
        law = candidates[0] if reusable else Law(
            title=self.title, section=self.section, psection=psection_id)
        law.level = int(psection.attrib['lev'])
        law.text = unicode(child.xpath('string()') or "")
        law.order = self.ordering
        law.source = source
        law.set_name(parts)
        law.save()

    if ref_laws:
        first = Law.objects.filter(
            title=self.title,
            section=self.section,
            psection=psection_id)[0]
        first.references = ref_laws

    parts.pop()