コード例 #1
0
ファイル: scrape_US.py プロジェクト: se7enmilgram/legalcode
 def processTitle(self, title, parent):
     """Fill in the depth-1 code record for a title and process its chapters.

     The record is looked up (or created) from the title's ``identifier``
     attribute, attached to ``parent``, and headed with the concatenated
     text of the title's ``num`` and ``heading`` children. Every descendant
     ``chapter`` element that carries an ``identifier`` attribute is then
     handed to ``processChapter`` with this record as its parent.

     Args:
         title: parsed title element (indexed by attribute name and queried
             with CSS selectors).
         parent: object stored as the title record's ``parent``.
     """
     title_code = self.findOrCreateCode(title['identifier'])
     title_code.parent = parent
     title_code.depth = 1
     heading_text = title.num.text + title.heading.text
     title_code.pre = Text.make(heading_text)
     self.progress(label=title_code.cid)

     chapter_nodes = title.select("chapter[identifier]")
     chapter_count = len(chapter_nodes)
     log.debug('Processing %i chapters for title %s' % (chapter_count, title_code.pre.text))
     self.progress(i=chapter_count)
     for chapter_node in chapter_nodes:
         self.processChapter(chapter_node, title_code)
     self.progress()
コード例 #2
0
ファイル: scrape_US.py プロジェクト: se7enmilgram/legalcode
 def processAppendix(self, appendix, parent):
     """Store a title's appendix as a depth-2 record under that title.

     The owning title record is looked up from the appendix identifier with
     its last character dropped. A record with cid ``<title cid>/appendix``
     is then found or created, attached to the title, and headed with the
     appendix heading text. Every ``section`` element, followed by every
     ``courtRule`` element, found under the appendix is passed to
     ``processSection`` with that record as parent.

     Args:
         appendix: parsed appendix element.
         parent: not used by this method; the title record derived from the
             appendix identifier is used as the parent instead.
     """
     # assume the title can be identified by removing the 'a' from the appendix identifier
     title = self.findOrCreateCode(appendix['identifier'][:-1])
     log.debug('Processing appendix for %s' % title.cid)
     chapter_cid = title.cid + '/appendix'
     # Label progress with the cid of the record being filled. `appendix` is
     # the parsed element, so `appendix.cid` would be a child-element lookup
     # (None unless a <cid> child exists) rather than a code id.
     self.progress(label=chapter_cid)
     chapter = self.findOrCreateCode(chapter_cid)
     chapter.parent = title
     chapter.depth = 2
     chapter.pre = Text.make(appendix.heading.text)
     # Sections first, then court rules; both are processed the same way.
     sections = list(appendix("section"))
     sections.extend(appendix("courtRule"))
     if sections:
         self.progress(i=len(sections))
         for section in sections:
             self.processSection(section, chapter)
     # NOTE(review): processTitle/processChapter finish with a bare
     # self.progress() call; this method does not. Confirm whether that is
     # intentional before adding one.
コード例 #3
0
ファイル: scrape_US.py プロジェクト: se7enmilgram/legalcode
 def processChapter(self, chapter, parent):
     """Fill in the depth-2 code record for a chapter and process its sections.

     Chapters whose ``status`` attribute is 'omitted' or 'repealed' are
     skipped without creating a record or reporting progress. Otherwise the
     record is looked up (or created) from the chapter's ``identifier``,
     attached to ``parent``, and headed with the concatenated text of the
     chapter's ``num`` and ``heading`` children; every descendant
     ``section`` element carrying an ``identifier`` attribute is passed to
     ``processSection``.

     Args:
         chapter: parsed chapter element.
         parent: object stored as the chapter record's ``parent``.
     """
     # exclude omitted or repealed chapters
     is_excluded = chapter.has_attr('status') and chapter['status'] in ('omitted', 'repealed')
     if is_excluded:
         return

     chapter_code = self.findOrCreateCode(chapter['identifier'])
     chapter_code.depth = 2
     chapter_code.parent = parent
     heading_text = chapter.num.text + chapter.heading.text
     chapter_code.pre = Text.make(heading_text)
     self.progress(label=chapter_code.cid)

     section_nodes = chapter.select("section[identifier]")
     section_count = len(section_nodes)
     self.progress(i=section_count)
     if section_count:
         log.debug('Processing chapter %s with %i sections:' % (chapter_code.cid, section_count))
         for section_node in section_nodes:
             self.processSection(section_node, chapter_code)
     self.progress()
コード例 #4
0
ファイル: scrape_CA.py プロジェクト: se7enmilgram/legalcode
 def processAct(self, actcid, parent):
     """Scrape one act, store it and its sections under ``parent``, and commit.

     The act record is filled from the act's HTML page: the page title, the
     text of the first ``section.intro`` element, and the ``p#assentedDate``
     text up to its last '.'. Sections are then extracted in one of two ways:

     * HTML method: if the document body contains elements whose ``id``
       starts with 'h-', each one is treated as a section heading. The text
       of the classed sibling elements that follow it, up to the next such
       heading or a ``<section>`` element, becomes the section text, and a
       ``HistoricalNote`` sibling becomes the section's ``meta``. The text
       of the page's ``Schedule`` elements is stored as the act's ``post``.
     * XML method: otherwise the XML rendition is fetched and each
       ``<section>`` whose ``code`` attribute starts with 'se=' becomes a
       section, unless its text contains 'repealed' (case-insensitive).

     Args:
         actcid: identifier of the act to fetch and store.
         parent: object providing ``released`` and ``rev``; stored as the
             act's ``parent``.

     Returns:
         The populated act record.

     Raises:
         IndexError: if the page has no ``section.intro``, ``p#assentedDate``
             or ``div.docContents div`` element.
         Exception: any error raised while reading a section in the XML
             method is logged and re-raised.
     """
     log.info('Processing act: %s' % actcid)
     page = self.getActSoup(actcid)
     act = self.findOrCreateAct(parent.released, actcid, parent.rev)
     act.parent = parent
     act.cid = actcid
     act.released = parent.released
     act.rev = parent.rev
     act.depth = 1
     act.pre = Text.make(page.title.text)
     act.text = Text.make(page.select("section.intro")[0].text)
     act.meta = utf8(page.select("p#assentedDate")[0].text.rpartition('.')[0])
     doc = page.select("div.docContents div")[0]
     # Section headings are the elements whose id starts with this prefix.
     id_prefix = 'h-'
     sections = [i['id'] for i in doc.select('[id^={}]'.format(id_prefix))]
     # Sibling classes whose text is part of the section body.
     body_classes = ('Definition',
                     'Section',
                     'MarginalNote',
                     'ProvisionList',
                     'Part',
                     'Subheading',
                     'MarginalNoteDefinedTerm',
                     'ContinuedSectionSubsection',
                     'Oath')

     if sections:
         for secid in progress.bar(sections, label=act.cid):
             sec = self.findOrCreateSection(act.released, secid, act)
             # Kept in its own name so the page-level soup stays available
             # for the schedule lookup after the loop.
             heading = doc.select("[id=%s]" % secid)[0]
             sec.pre = Text.make(heading.text)
             sec.cid = secid
             sec.depth = 2
             sec.parent = act
             sec.released = act.released
             sec.rev = act.rev
             # NOTE(review): this empties the hidden elements only after
             # sec.pre has been read from the heading; confirm whether the
             # hidden text was meant to be excluded from sec.pre.
             for hidden in heading.select(".wb-invisible"):
                 hidden.clear()
             parts = []
             sib = heading.next_sibling
             # Walk following siblings until they run out (truthiness test,
             # as before) or the next heading / <section> element is reached.
             while sib:
                 if isinstance(sib, Tag) and sib.has_attr('class'):
                     classes = sib['class']
                     if sib.has_attr('id') and sib['id'].startswith(id_prefix):
                         break
                     if sib.name == 'section':
                         break
                     if (any(c in body_classes for c in classes)
                             or classes[0].startswith('indent')):
                         parts.append(sib.text)
                     elif classes[0] == 'HistoricalNote':
                         sec.meta = utf8(sib.text)
                     elif classes[0] in ('PITLink', 'nif'):
                         # Recognised but deliberately ignored.
                         pass
                     else:
                         log.info('Unhandled case in parsing section %s/%s' % (act.cid, secid))
                         log.debug(sib.name)
                         log.debug(sib.attrs)
                 sib = sib.next_sibling
             sec.text = Text.make(''.join(parts))
             sec.stored = now()
         # Schedules are looked up on the whole page, once. Previously the
         # lookup ran on a variable that had been rebound to the section
         # heading, so it only searched inside that heading element.
         schedules = page.select('div [class=Schedule]')
         act.post = Text.make(''.join(sched.text for sched in schedules))
         act.stored = now()
     else:
         # Alternative section method: switch to the XML version and pull
         # identifying information out of the `code` attribute. Anecdotally,
         # this seems to be needed for very small acts.
         log.info('Switching to alternate section method')
         xml = self.getActXMLSoup(act.cid)
         for section in xml.select("section[code^=se=]"):
             # Bound before the try so the error message can always name it.
             secid = section['code'].replace('=', '-').replace('"', '')
             try:
                 pre = ''
                 if section.label:
                     pre = section.label.text + ' '
                 if section.marginalnote:
                     pre = pre + section.marginalnote.text
                 text = section.select_one('text').text
             except Exception:
                 log.warn('ERROR in alternate parsing method for {}.{}'.format(act.cid, secid))
                 raise
             # Repealed sections are not stored.
             if 'repealed' in text.lower():
                 continue
             sec = self.findOrCreateSection(act.released, secid, act)
             sec.setPre(pre)
             sec.setText(text)
             sec.parent = act
             sec.depth = 2
             sec.released = act.released
             sec.rev = act.rev
             sec.cid = secid
     act.analyze()
     store.commit()
     return act