def scrape(self):
    acts, datemodified = self.getActs()
    lastroot = self.getRoot()
    # compare by value: 'is not' tests object identity, so two equal
    # release dates could still be treated as different
    if datemodified != lastroot.released:
        root = self.makeRoot(datemodified=datemodified)
    else:
        root = lastroot
    for act in acts:
        self.processAct(act, root)
    root.analyze()
    store.commit()
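# A minimal illustration of why the comparison above uses != rather than
# 'is not' (a sketch only, not part of the scraper; assumes the released/
# datemodified values are date-like objects):
#
#   >>> import datetime
#   >>> a = datetime.date(2020, 1, 1)
#   >>> b = datetime.date(2020, 1, 1)
#   >>> a == b
#   True
#   >>> a is b
#   False
#
# An identity comparison would rebuild the root on every run even when the
# release date is unchanged.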
def processFile(self, fn, root):
    log.debug('Processing file %s...' % fn)
    self.progress(label=fn)
    with open(fn, 'r') as f:
        soup = BS(f.read())
    if fn.lower().endswith('a.xml'):
        # we think this is an appendix, let's try to find one;
        # any appendix not identified in the form /us/usc/tN[aA] will NOT be processed
        appendices = soup.select('appendix[identifier$=a]')
        appendices += soup.select('appendix[identifier$=A]')
        self.progress(i=len(appendices))
        for appendix in appendices:
            # log the identifier of the appendix currently being processed
            # (not soup.appendix, which is always the first one in the file)
            log.debug('Processing appendix: %s@%s' % (appendix['identifier'], self.rp))
            self.processAppendix(appendix, root)
    else:
        self.processTitle(soup.title, root)
    log.debug('Finished processing file %s...' % fn)
    store.commit()
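# A sketch of how the appendix selectors above behave (the identifier values
# are illustrative; assumes bs4 with CSS select support):
#
#   >>> s = BS('<appendix identifier="/us/usc/t5a"/>'
#   ...        '<appendix identifier="/us/usc/t5"/>')
#   >>> [a['identifier'] for a in s.select('appendix[identifier$=a]')]
#   ['/us/usc/t5a']
#
# Only identifiers ending in 'a' (or 'A', via the second select) are treated
# as appendices; anything else in an *a.xml file is skipped.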
def scrape(self):
    if not self.rp:
        r = Cache.get(self.nation.cfg['entrypoint'])
        soup = BS(str(r))
        # find the current release point
        log.info("No release point specified, retrieving latest...")
        # this failed fantastically - we'll get the RP from the zipurl instead
        #self.rp = utf8(soup.findAll('h3', attrs={'class': 'releasepointinformation'})[0].text.split()[-1])
        # find the download url
        self.zipurl = self.nation.cfg['entrypoint'].rpartition('/')[0] + '/' + \
            soup.findAll('a', title='All USC Titles in XML')[0]['href']
        # new way to set the rp using the zipurl's filename
        self.rp = utf8(self.zipurl.rpartition('@')[-1].partition('.')[0])
        log.info("Found release point %s" % self.rp)
    else:
        log.info('Using specified release point %s...' % self.rp)
        # don't actually need this
        # rpurl = 'http://uscode.house.gov/download/releasepoints/us/pl/%s/%s/usc-rp@%s.htm' % (tuple(self.rp.split('-')) + (self.rp,))
        self.zipurl = 'http://uscode.house.gov/download/releasepoints/us/pl/%s/%s/xml_uscAll@%s.zip' % (tuple(self.rp.split('-')) + (self.rp,))
    log.debug("Using zipurl: %s" % self.zipurl)

    class FileNotThere(Exception): pass
    class XMLNotThere(Exception): pass
    class AllGood(Exception): pass

    filename = self.zipurl.rpartition('/')[-1]
    xmldir = self._workdir + os.sep + 'xml' + os.sep
    # check to see if we have xml that works
    # if we don't, check to see if we have a zip file
    #   if we don't, download it
    #   if we do, extract it
    # check the xml again; if it's good, proceed
    # if it's not, error out
    try:
        assert os.path.exists(xmldir + 'usc01.xml')
        with open(xmldir + 'usc01.xml', 'r') as f:
            soup = BS(f.read())
        xmlrp = soup.find('docpublicationname').text.split('@')[-1]
        # old way to get the rp, the new way is much better
        # xmlrp = soup.title.first("note", topic="miscellaneous").text.split()[-1]
        if xmlrp == self.rp:
            raise AllGood
        else:
            raise XMLNotThere
    except (XMLNotThere, AssertionError):
        # delete the directory if it exists
        if os.path.exists(xmldir):
            shutil.rmtree(xmldir)
        # if there's no zipfile, download it
        if not os.path.exists(self._workdir + os.sep + filename):
            log.info('No zipfile found for release point, downloading...')
            self.downloadFile(self.zipurl, filename)
        # now we should have a zipfile and no existing xmldir
        log.info('Extracting file %s...' % filename)
        with ZipFile(self._workdir + os.sep + filename, 'r') as zf:
            # older release points do not have an interior xml/ dir
            if not all(n.startswith('xml/') for n in zf.namelist()):
                zf.extractall(xmldir)
            else:
                zf.extractall(self._workdir)
        # double check the xml now...
        assert os.path.exists(xmldir + 'usc01.xml')
        # it may be problematic to rely on the RP information in the XML documents
        # provided: rp 113-21 (the earliest presently available) does not include
        # this in the docpublicationname meta tag
        #soup = BS(open(xmldir + 'usc01.xml', 'r').read())
        #xmlrp = soup.find('docpublicationname').text.split('@')[-1]
        #if xmlrp != self.rp:
        #    raise XMLNotThere('XML did not check out after extraction.')
    except AllGood:
        pass

    log.info('All checks passed...')
    root = self.findOrCreateRoot()
    xf = [xmldir + f for f in os.listdir(xmldir) if f.endswith('.xml')]
    xf.sort()
    log.info("Processing %i files..." % len(xf))
    self.bar = progress.Bar(label='US', expected_size=1000 * len(xf))
    self.progress(i=len(xf))
    for fn in xf:
        self.processFile(fn, root)
        self.progress(rollup=1000)
    log.info('Analyzing code...')
    self.progress(label="Analyzing")
    root.analyze(commit=True, bar=self.bar)
    store.commit()
    log.info('Scrape completed.')
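# Worked example of the zipurl construction above, using rp 113-21 (the
# earliest release point presently available):
#
#   >>> rp = '113-21'
#   >>> tuple(rp.split('-')) + (rp,)
#   ('113', '21', '113-21')
#
# which formats to
#   http://uscode.house.gov/download/releasepoints/us/pl/113/21/xml_uscAll@113-21.zip
# and rpartition('/')[-1] then yields the local filename 'xml_uscAll@113-21.zip'.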
def processAct(self, actcid, parent):
    log.info('Processing act: %s' % actcid)
    soup = self.getActSoup(actcid)
    act = self.findOrCreateAct(parent.released, actcid, parent.rev)
    act.parent = parent
    act.cid = actcid
    act.released = parent.released
    act.rev = parent.rev
    act.depth = 1
    act.pre = Text.make(soup.title.text)
    act.text = Text.make(soup.select("section.intro")[0].text)
    act.meta = utf8(soup.select("p#assentedDate")[0].text.rpartition('.')[0])
    doc = soup.select("div.docContents div")[0]
    # so much easier to use the CSS selector
    #sections = [i['id'] for i in doc.select("[id]") if i['id'].startswith('h-')]
    id_prefix = 'h-'
    sections = [i['id'] for i in doc.select('[id^="{}"]'.format(id_prefix))]
    classAndTag = lambda o: isinstance(o, Tag) and o.has_attr('class')
    if sections:
        for secid in progress.bar(sections, label=act.cid):
            sec = self.findOrCreateSection(act.released, secid, act)
            # use a distinct name for the section soup so the act-level
            # soup is still available for the schedule scan below
            secsoup = doc.select('[id="%s"]' % secid)[0]
            sec.pre = Text.make(secsoup.text)
            sec.cid = secid
            sec.depth = 2
            sec.parent = act
            sec.released = act.released
            sec.rev = act.rev
            stop = False
            sib = secsoup.nextSibling
            content = ""
            for t in secsoup.select(".wb-invisible"):
                t.clear()
            # walk the siblings following the section heading, accumulating
            # body text until the next section (or the end) is reached
            while not stop:
                if classAndTag(sib):
                    if sib.has_attr('id') and sib['id'].startswith('h-'):
                        stop = True
                    elif sib.name == 'section':
                        stop = True
                    elif any(c in ['Definition', 'Section', 'MarginalNote',
                                   'ProvisionList', 'Part', 'Subheading',
                                   'MarginalNoteDefinedTerm',
                                   'ContinuedSectionSubsection', 'Oath']
                             for c in sib['class']):
                        content += sib.text
                    elif sib['class'][0].startswith('indent'):
                        content += sib.text
                    elif sib['class'][0] == 'HistoricalNote':
                        sec.meta = utf8(sib.text)
                    elif sib['class'][0] in ['PITLink', 'nif']:
                        pass
                    else:
                        log.info('Unhandled case in parsing section %s/%s' % (act.cid, secid))
                        log.debug(sib.name)
                        log.debug(sib.attrs)
                if not sib or not sib.nextSibling:
                    stop = True
                if not stop:
                    sib = sib.nextSibling
            sec.text = Text.make(content)
            sec.stored = now()
        # scan the act-level soup (not the last section's) for schedules
        schedules = soup.select('div [class="Schedule"]')
        post = ''
        for sched in schedules:
            post += sched.text
        act.post = Text.make(post)
        act.stored = now()
    else:
        # alternative section method:
        # for this method we switch to the XML version and pull identifying
        # information out of the code= attribute. Anecdotally, this seems to
        # be needed for very small acts
        log.info('Switching to alternate section method')
        soup = self.getActXMLSoup(act.cid)
        sections = soup.select('section[code^="se="]')
        for section in sections:
            secid = None  # defined before the try so the log line below can't fail
            try:
                secid = section['code'].replace('=', '-').replace('"', '')
                pre = ''
                pre = section.label.text + ' ' if section.label else pre
                pre = pre + section.marginalnote.text if section.marginalnote else pre
                text = section.select_one('text').text
            except:
                log.warn('ERROR in alternate parsing method for {}.{}'.format(act.cid, secid))
                raise
            if 'repealed' in text.lower():
                pass
            else:
                sec = self.findOrCreateSection(act.released, secid, act)
                sec.setPre(pre)
                sec.setText(text)
                sec.parent = act
                sec.depth = 2
                sec.released = act.released
                sec.rev = act.rev
                sec.cid = secid
    act.analyze()
    store.commit()
    return act
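# A sketch of the secid derivation in the alternate method (the attribute
# value is illustrative; actual code= values come from the XML source): a
# <section> whose code attribute is 'se="h-2"' yields
#
#   >>> 'se="h-2"'.replace('=', '-').replace('"', '')
#   'se-h-2'
#
# i.e. '=' becomes '-' and the embedded quotes are stripped, producing a
# stable id to key findOrCreateSection on.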