def postprocess(self):
    # build the list of urls that were set up with add_internal() that
    # do not have a parent (they form the base for the site)
    for url in self._internal_urls:
        link = self.linkMap[url].follow_link()
        if link is None:
            debugio.warn('base link %s redirects to nowhere' % url)
            continue
        # add the link to bases
        debugio.debug('crawler.postprocess(): adding %s to bases' % link.url)
        self.bases.append(link)
    # if we got no bases, just use the first internal one
    if len(self.bases) == 0:
        debugio.debug('crawler.postprocess(): fallback to adding %s to bases' % self._internal_urls[0])
        self.bases.append(self.linkMap[self._internal_urls[0]])
    # do a breadth-first traversal of the website to determine depth and
    # figure out page children
    tocheck = []
    for link in self.bases:
        link.depth = 0
        tocheck.append(link)
    # repeat until we have nothing more to check
    while len(tocheck) > 0:
        debugio.debug('crawler.postprocess(): items left to examine: %d' % len(tocheck))
        # choose a link from the tocheck list
        link = tocheck.pop(0)
        # figure out page children
        for child in link._pagechildren():
            # skip children already in our list or at the wrong depth
            if child in tocheck or child.depth != link.depth + 1:
                continue
            tocheck.append(child)
    # set some compatibility properties
    # TODO: figure out a better way to get this to the plugins
    self.base = self.bases[0].url

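# A minimal usage sketch for postprocess() (not part of the original module):
# it is meant to run after all pages have been fetched; the class name
# Crawler and the example url below are assumptions made for illustration.
site = Crawler()
site.add_internal('http://www.example.com/')
site.crawl()        # fetch all internal pages first
site.postprocess()  # then compute bases, depths and page children
# afterwards site.bases holds the base links (depth 0) and site.base
# holds the url of the first one, for use by the report plugins
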
def _maketxt(txt, encoding):
    """Return a unicode version of the specified string, doing the correct
    character conversions and replacing html entities with normal
    characters."""
    # try to decode with the given encoding
    if encoding:
        try:
            return htmlunescape(unicode(txt, encoding, 'replace'))
        except (LookupError, TypeError, ValueError), e:
            debugio.warn('page has unknown encoding: %s' % str(encoding))

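# A minimal usage sketch for _maketxt() (not part of the original module):
# the byte string and the 'iso-8859-1' encoding below are made-up example
# values, and htmlunescape() is assumed to replace entities such as '&amp;'
# with their literal characters.
example_text = _maketxt('caf\xe9 &amp; bar', 'iso-8859-1')
# example_text is now a unicode string, roughly u'caf\xe9 & bar'
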
def install_file(source, text=False):
    """Install the given file in the output directory.
    If the text flag is set to true it is assumed the file is text,
    translating line endings."""
    import shutil
    import urlparse
    # figure out the mode to open the file with
    mode = 'r'
    if text:
        mode += 'U'
    # check what kind of argument we are called with
    scheme = urlparse.urlsplit(source)[0]
    if scheme == 'file':
        # this is a file:/// url, translate to a normal path and open
        import urllib
        source = urllib.url2pathname(urlparse.urlsplit(source)[2])
    elif scheme == '' and os.path.isabs(source):
        # this is an absolute path, just open it as is
        pass
    elif scheme == '':
        # this is a relative path, try to fetch it from the python path
        for directory in sys.path:
            tst = os.path.join(directory, source)
            if os.path.isfile(tst):
                source = tst
                break
    # TODO: support more schemes here
    # figure out the destination name
    target = os.path.join(config.OUTPUT_DIR, os.path.basename(source))
    # test if source and target are the same
    source = os.path.realpath(source)
    if source == os.path.realpath(target):
        debugio.warn('attempt to overwrite %(fname)s with itself' % {'fname': source})
        return
    # open the input file
    sfp = None
    try:
        sfp = open(source, mode)
    except IOError, (errno, strerror):
        debugio.error('%(fname)s: %(strerror)s' % {
            'fname': source,
            'strerror': strerror})
        sys.exit(1)

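# A minimal usage sketch for install_file() (not part of the original
# module): copy a stylesheet that ships with the program into
# config.OUTPUT_DIR; the file name 'webcheck.css' is only an illustrative
# assumption, and text=True requests line-ending translation.
install_file('webcheck.css', text=True)
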
def crawl(self):
    """Crawl the website based on the urls specified with
    add_internal()."""
    # TODO: have some different scheme to crawl a site (e.g. separate
    # internal and external queues, threading, etc)
    tocheck = []
    for u in self._internal_urls:
        tocheck.append(self._get_link(u))
    # repeat until we have nothing more to check
    while len(tocheck) > 0:
        debugio.debug('crawler.crawl(): items left to check: %d' % len(tocheck))
        # choose a link from the tocheck list
        link = tocheck.pop(0)
        # skip the link if there is nothing to check
        if link.isyanked or link.isfetched:
            continue
        # fetch the link's contents
        link.fetch()
        # add children to tocheck
        for child in link.children:
            if not child.isyanked and not child.isfetched and child not in tocheck:
                tocheck.append(child)
        # add embedded content
        for embed in link.embedded:
            if not embed.isyanked and not embed.isfetched and embed not in tocheck:
                tocheck.append(embed)
        # sleep between requests if configured
        if config.WAIT_BETWEEN_REQUESTS > 0:
            debugio.debug('sleeping %s seconds' % config.WAIT_BETWEEN_REQUESTS)
            time.sleep(config.WAIT_BETWEEN_REQUESTS)
    # build the list of urls that were set up with add_internal() that
    # do not have a parent (they form the base for the site)
    bases = []
    for u in self._internal_urls:
        l = self.linkMap[u].follow_link()
        if l is None:
            debugio.warn('base link %s redirects to nowhere' % u)
            continue
        # add the link if it has no parent; always add the first link so
        # the list of bases is never empty
        if len(l.parents) == 0 or len(bases) == 0:
            debugio.debug('crawler.crawl(): adding %s to bases' % l.url)
            bases.append(l)
    # if we got no bases, just use the first internal one
    if len(bases) == 0:
        debugio.debug('crawler.crawl(): fallback to adding %s to bases' % self._internal_urls[0])
        bases.append(self.linkMap[self._internal_urls[0]])
    # do a breadth-first traversal of the website to determine depth and
    # figure out page children
    tocheck = []
    for link in bases:
        link.depth = 0
        tocheck.append(link)
    # repeat until we have nothing more to check
    while len(tocheck) > 0:
        debugio.debug('crawler.crawl(): items left to examine: %d' % len(tocheck))
        # choose a link from the tocheck list
        link = tocheck.pop(0)
        # figure out page children
        for child in link._pagechildren():
            # skip children already in our list or at the wrong depth
            if child in tocheck or child.depth != link.depth + 1:
                continue
            tocheck.append(child)
    # set some compatibility properties
    # TODO: figure out a better way to get this to the plugins
    self.base = bases[0].url
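
# A minimal sketch for this variant of crawl() (not part of the original
# module): here crawl() performs the base/depth postprocessing itself, so
# calling it alone is enough. The class name Crawler is an assumption, and
# setting config.WAIT_BETWEEN_REQUESTS directly is only an illustration of
# how the throttling option referenced above could be configured.
config.WAIT_BETWEEN_REQUESTS = 1   # wait one second between fetches
site = Crawler()
site.add_internal('http://www.example.com/')
site.crawl()
print site.base                    # url of the first base of the site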