def scrape(self):
    """Download, extract, and process a US Code XML release point.

    If ``self.rp`` (release point, e.g. ``113-21``) is unset, scrape the
    entry-point page to discover the latest release point and its zip URL;
    otherwise build the zip URL directly from ``self.rp``.  Then ensure a
    valid extracted XML tree exists under ``<workdir>/xml/`` (downloading
    and extracting the zipfile if needed), and feed every ``*.xml`` file to
    ``self.processFile`` before running the final analysis/commit pass.

    Side effects: network fetches via ``Cache``/``self.downloadFile``,
    deletion/creation of the ``xml`` work directory, and a data-store
    commit via ``store.commit()``.
    """
    if not self.rp:
        # No release point specified: scrape the entry-point page for the
        # "All USC Titles in XML" link and derive the RP from its filename.
        r = Cache.get(self.nation.cfg['entrypoint'])
        soup = BS(str(r))  # presumably BeautifulSoup — TODO confirm
        log.info("No release point specified, retreiving latest...")
        # Scraping the 'releasepointinformation' header failed fantastically;
        # we get the RP from the zipurl's filename instead (below).
        #self.rp = utf8(soup.findAll('h3', attrs={'class': 'releasepointinformation'})[0].text.split()[-1])
        # find the download url
        self.zipurl = self.nation.cfg['entrypoint'].rpartition('/')[0] + '/' + soup.findAll('a', title='All USC Titles in XML')[0]['href']
        # Set the rp from the zipurl's filename: '...xml_uscAll@113-21.zip' -> '113-21'.
        self.rp = utf8(self.zipurl.rpartition('@')[-1].partition('.')[0])
        # BUGFIX: this log line originally ran *before* self.rp was assigned,
        # so it always reported the old (empty) value.
        log.info("Found release point %s" % self.rp)
    else:
        log.info('Using specified release point %s...' % self.rp)
        # The RP page itself isn't needed — only the zip URL:
        # rpurl = 'http://uscode.house.gov/download/releasepoints/us/pl/%s/%s/usc-rp@%s.htm' % (tuple(self.rp.split()) + (self.rp,))
        self.zipurl = 'http://uscode.house.gov/download/releasepoints/us/pl/%s/%s/xml_uscAll@%s.zip' % (tuple(self.rp.split('-')) + (self.rp,))
    log.debug("Using zipurl: %s" % self.zipurl)

    # Local control-flow exceptions for the freshness check below.
    class XMLNotThere(Exception):
        pass

    class AllGood(Exception):
        pass

    filename = self.zipurl.rpartition('/')[-1]
    xmldir = self._workdir + os.sep + 'xml' + os.sep

    # Check to see if we have xml that works.
    # If we don't, check to see if we have a zip file:
    #   if we don't, download it; if we do, extract it.
    # Check the xml again; if it's good, proceed, otherwise error out.
    try:
        assert os.path.exists(xmldir + 'usc01.xml')
        # BUGFIX: the file handle was previously leaked (open().read()).
        with open(xmldir + os.sep + 'usc01.xml', 'r') as f:
            soup = BS(f.read())
        xmlrp = soup.find('docpublicationname').text.split('@')[-1]
        # old way to get rp, the new way is much better
        # xmlrp = soup.title.first("note", topic="miscellaneous").text.split()[-1]
        if xmlrp == self.rp:
            raise AllGood
        else:
            raise XMLNotThere
    except (XMLNotThere, AssertionError):
        # Stale or missing XML: delete the directory if it exists...
        if os.path.exists(xmldir):
            shutil.rmtree(xmldir)
        # ...and download the zipfile if we don't already have it.
        if not os.path.exists(self._workdir + os.sep + filename):
            log.info('No zipfile found for release point, downloading...')
            self.downloadFile(self.zipurl, filename)
        # Now we should have a zipfile and no existing xmldir.
        log.info('Extracting file %s...' % filename)
        # BUGFIX: the ZipFile handle was previously never closed.
        with ZipFile(self._workdir + os.sep + filename, 'r') as zf:
            # Older release points do not have an interior xml/ dir.
            if not all(n.startswith('xml/') for n in zf.namelist()):
                zf.extractall(xmldir)
            else:
                zf.extractall(self._workdir)
        # Double check the xml now...
        assert os.path.exists(xmldir + 'usc01.xml')
        # It may be problematic to rely on the RP information in the XML
        # documents provided: rp 113-21 (the earliest presently available)
        # does not include this in the docpublicationname meta tag.
        #soup = BS(open(xmldir + os.sep + 'usc01.xml', 'r').read())
        #xmlrp = soup.find('docpublicationname').text.split('@')[-1]
        #if xmlrp != self.rp:
        #    raise XMLNotThere('XML did not check out after extraction.')
    except AllGood:
        # Existing XML matches the requested release point — nothing to do.
        pass

    log.info('All checks passed...')
    xf = os.listdir(xmldir)
    root = self.findOrCreateRoot()
    xf = [xmldir + f for f in xf if f.endswith('.xml')]
    xf.sort()
    log.info("Processing %i files..." % len(xf))
    self.bar = progress.Bar(label='US', expected_size=1000 * len(xf))
    self.progress(i=len(xf))
    for fn in xf:
        self.processFile(fn, root)
        self.progress(rollup=1000)
    log.info('Analyzing code...')
    self.progress(label="Analyzing")
    root.analyze(commit=True, bar=self.bar)
    store.commit()
    log.info('Scrape completed.')
def geturl(self, url):
    """Fetch *url* via the shared ``Cache`` and return the raw response."""
    cached = Cache.get(url)
    return cached.resp