Beispiel #1
0
 def fetch(self, url):
     """Return the path of the cached copy of url, downloading it if needed.

     On a cache miss the method sleeps for ``self._sleep`` seconds,
     downloads the page, passes it through BeautifulSoup and stores the
     result UTF-8 encoded in the cache directory.  On a cache hit nothing
     is downloaded and no sleep happens.

     Args:
       url: A string URL to fetch.

     Returns:
       The string filename of the cached file (not an open file object).
     """
     fname = os.path.join(self._cachedir, self._formatter(url))
     if os.path.exists(fname):
         return fname

     # Throttle only real network requests; cache hits returned above.
     time.sleep(self._sleep)
     response = urllib.urlopen(url)
     try:
         html = response.read()
     finally:
         # Release the connection even when the read fails.
         response.close()

     # Parse before the cache file is created: if parsing raises, no empty
     # file is left behind to be mistaken for a valid cache entry later.
     text = unicode(BeautifulSoup(html))
     with codecs.open(fname, 'w', 'utf-8') as f:
         f.write(text)
     return fname
Beispiel #2
0
def run(sitemapurl, patt, cachedir, cachejournal, sleep=5):
    """Cache the pages listed in a sitemap and journal where each one landed.

    Every URL produced by ``extract_sitemap(sitemap, patt)`` is fetched
    through a ``CacheFetcher`` and recorded in the journal as one
    ``<cached filename>,<url>`` line.

    Args:
      sitemapurl: A string URL to an XML sitemap.
      patt: A string used for substring matching of the urls in the sitemap.
      cachedir: Directory used to cache downloaded HTML files.
      cachejournal: A string filename to store records about the
        cache directory. It is overwritten on every run and should be
        considered a tmp file.
      sleep: Integer amount of time to sleep between HTTP requests, in
        seconds.
    """
    fetcher = CacheFetcher(cachedir, filename_formatter, sleep)
    sitemap = urllib.urlopen(sitemapurl)
    try:
        with open(cachejournal, 'w') as journal:
            for url in extract_sitemap(sitemap, patt):
                fname = fetcher.fetch(url)
                journal.write('{0},{1}\n'.format(fname, url))
    finally:
        # Closed only after the loop: extract_sitemap may still be reading
        # from the response while URLs are being consumed.
        sitemap.close()