Example #1
 def scrape(self):
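     """Resolve the release point, download and extract the USC XML zip, then process each title file."""
     # note: this excerpt assumes helpers defined elsewhere in the module/class:
     # BS (presumably BeautifulSoup), Cache, log, utf8, progress and store,
     # plus os, shutil and zipfile.ZipFile from the standard library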
     if not self.rp:
         r = Cache.get(self.nation.cfg['entrypoint'])
         soup = BS(str(r))
         #find current release point
         log.info("No release point specified, retreiving latest...")
         # this failed fantastically - we'll get the RP from the zipurl instead
         #self.rp = utf8(soup.findAll('h3', attrs={'class': 'releasepointinformation'})[0].text.split()[-1])
         #find the download url
         self.zipurl = self.nation.cfg['entrypoint'].rpartition('/')[0] + '/' + soup.findAll('a', title='All USC Titles in XML')[0]['href']
         # new way to set the rp using the zipurl's filename (e.g. xml_uscAll@113-21.zip -> 113-21)
         self.rp = utf8(self.zipurl.rpartition('@')[-1].partition('.')[0])
         log.info("Found release point %s" % self.rp)
     else:
         log.info('Using specified release point %s...' % self.rp)
         # don't actually need this
         # rpurl = 'http://uscode.house.gov/download/releasepoints/us/pl/%s/%s/usc-rp@%s.htm' % (tuple(self.rp.split()) + (self.rp,))
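         # self.rp looks like '113-21'; splitting on '-' fills the two path segments and the full string names the zip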
         self.zipurl = 'http://uscode.house.gov/download/releasepoints/us/pl/%s/%s/xml_uscAll@%s.zip'  % (tuple(self.rp.split('-')) + (self.rp,))
     
     log.debug("Using zipurl: %s" % self.zipurl)
     
     class FileNotThere(Exception): pass
     class XMLNotThere(Exception): pass
     class AllGood(Exception): pass
     
     filename = self.zipurl.rpartition('/')[-1]
     xmldir = self._workdir + os.sep + 'xml' + os.sep
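     # filename is the zip's basename; the per-title XML files (usc01.xml, usc02.xml, ...) are expected under xmldir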
     
     # check to see if we have xml that works
     # if we don't check to see if we have a zip file
     # if we don't, download it
     # if we do, extract it
     # check the xml again, if it's good, proceed
     # if it's not, error out
     
     try:
         assert os.path.exists(xmldir + 'usc01.xml')
         soup = BS(open(xmldir + 'usc01.xml', 'r').read())
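         # docpublicationname ends with '@<release point>'; compare that suffix against the RP we expect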
         xmlrp = soup.find('docpublicationname').text.split('@')[-1]
         #old way to get rp, the new way is much better
         # xmlrp = soup.title.first("note", topic="miscellaneous").text.split()[-1]
         if xmlrp == self.rp:
             raise AllGood
         else:
             raise XMLNotThere
     except (XMLNotThere, AssertionError):
         # delete directory if it exists
         if os.path.exists(xmldir):
             shutil.rmtree(xmldir)
         # if there's no xml file, download it
         if not os.path.exists(self._workdir + os.sep + filename):
             log.info('No zipfile found for release point, downloading...')
             self.downloadFile(self.zipurl, filename)
         # now we should have a zipfile and no existing xmldir
         log.info('Extracting file %s...' % filename)
         zf = ZipFile(self._workdir + os.sep + filename, 'r')
         # older release points do not have an interior xml/ dir
         if not all(n.startswith('xml/') for n in zf.namelist()):
             zf.extractall(xmldir)
         else:
             zf.extractall(self._workdir)
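         # either way, the per-title XML files should now sit directly under xmldir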
         # double check the xml now...
         assert os.path.exists(xmldir + 'usc01.xml')
         # it may be problematic to rely on the RP information in the XML documents provided
         # rp 113-21 (the earliest presently available) does not include this in the 
         # docpublicationname meta tag
         #soup = BS(open(xmldir + os.sep + 'usc01.xml', 'r').read())
         #xmlrp = soup.find('docpublicationname').text.split('@')[-1]
         #if xmlrp != self.rp:
         #    raise XMLNotThere('XML did not check out after extraction.')
     except AllGood:
         pass
     
     log.info('All checks passed...')
     xf = os.listdir(xmldir)
     root = self.findOrCreateRoot()
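     # root is presumably the top-level node that processFile() attaches each parsed title to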
     xf = [xmldir + f for f in xf if f.endswith('.xml')]
     xf.sort()
     log.info("Processing %i files..." % len(xf))
     self.bar = progress.Bar(label='US', expected_size=1000*len(xf))
     self.progress( i=len(xf) )
     for fn in xf:
         self.processFile(fn, root)
         self.progress(rollup=1000)
     log.info('Analyzing code...')
     self.progress(label="Analyzing")
     root.analyze(commit=True, bar=self.bar)
     store.commit()
     log.info('Scrape completed.')
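
A standalone sketch of the release-point parsing used above; the URL is constructed from the pattern in the code (release point 113-21), not fetched from anywhere:

    zipurl = 'http://uscode.house.gov/download/releasepoints/us/pl/113/21/xml_uscAll@113-21.zip'
    rp = zipurl.rpartition('@')[-1].partition('.')[0]
    assert rp == '113-21'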
Example #2
 def geturl(self, url):
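     # Cache.get() is the same caching fetch helper used in scrape(); .resp is presumably the raw response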
     return Cache.get(url).resp