Code example #1
def fetch(self):
    """Attempt to fetch the url (if isyanked is not True) and fill in link
    attributes (based on isinternal)."""
    # fully ignore links that should not be fetched
    if self.isyanked:
        debugio.info('  %s' % self.url)
        debugio.info('    ' + self.isyanked)
        return
    # see if we can import the proper module for this scheme
    schememodule = schemes.get_schememodule(self.scheme)
    if schememodule is None:
        self.isyanked = 'unsupported scheme (' + self.scheme + ')'
        self._ischanged = True
        debugio.info('  %s' % self.url)
        debugio.info('    ' + self.isyanked)
        return
    debugio.info('  %s' % self.url)
    content = schememodule.fetch(self, parsers.get_mimetypes())
    self.isfetched = True
    self._ischanged = True
    # skip parsing of content if we were returned nothing
    if content is None:
        return
    # find a parser for the content-type
    parsermodule = parsers.get_parsermodule(self.mimetype)
    if parsermodule is None:
        debugio.debug(
            'crawler.Link.fetch(): unsupported content-type: %s' %
            self.mimetype)
        return
    # parse the content
    parsermodule.parse(content, self)
Code example #2
def _get_robotparser(self, link):
    """Return the proper robots parser for the given url or None if one
    cannot be constructed. Robot parsers are cached per scheme and
    netloc."""
    # only some schemes have a meaningful robots.txt file
    if link.scheme != 'http' and link.scheme != 'https':
        debugio.debug('crawler._get_robotparser() called with unsupported scheme (%s)' % link.scheme)
        return None
    # split out the key part of the url
    location = urlparse.urlunsplit((link.scheme, link.netloc, '', '', ''))
    # try to create a new robotparser if we don't already have one
    if not self._robotparsers.has_key(location):
        import httplib
        debugio.info('  getting robots.txt for %s' % location)
        self._robotparsers[location] = None
        try:
            rp = robotparser.RobotFileParser()
            rp.set_url(urlparse.urlunsplit(
              (link.scheme, link.netloc, '/robots.txt', '', '') ))
            rp.read()
            self._robotparsers[location] = rp
        except (TypeError, IOError, httplib.HTTPException):
            # ignore any problems setting up robot parser
            pass
    return self._robotparsers[location]
Code example #3
def _get_robotparser(self, link):
    """Return the proper robots parser for the given url or None if one
    cannot be constructed. Robot parsers are cached per scheme and
    netloc."""
    # only some schemes have a meaningful robots.txt file
    if link.scheme != 'http' and link.scheme != 'https':
        debugio.debug(
            'crawler._get_robotparser() called with unsupported scheme (%s)'
            % link.scheme)
        return None
    # split out the key part of the url
    location = urlparse.urlunsplit((link.scheme, link.netloc, '', '', ''))
    # try to create a new robotparser if we don't already have one
    if not self._robotparsers.has_key(location):
        import httplib
        debugio.info('  getting robots.txt for %s' % location)
        self._robotparsers[location] = None
        try:
            rp = robotparser.RobotFileParser()
            rp.set_url(
                urlparse.urlunsplit(
                    (link.scheme, link.netloc, '/robots.txt', '', '')))
            rp.read()
            self._robotparsers[location] = rp
        except (TypeError, IOError, httplib.HTTPException):
            # ignore any problems setting up robot parser
            pass
    return self._robotparsers[location]
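A cached RobotFileParser is only useful once it is consulted before fetching a URL. The calling code is not included in these examples, but with the Python 2 robotparser module used above the check is a single can_fetch() call. The following standalone sketch uses an illustrative user-agent string and URLs, not values from webcheck:

import robotparser

# Standalone sketch (Python 2): consulting a robots.txt parser before fetching.
rp = robotparser.RobotFileParser()
rp.set_url('http://example.com/robots.txt')
rp.read()                      # fetch and parse robots.txt
if rp.can_fetch('webcheck', 'http://example.com/private/page.html'):
    print 'fetch allowed'
else:
    print 'disallowed by robots.txt'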
Code example #4
def fetch(self):
    """Attempt to fetch the url (if isyanked is not True) and fill in link
    attributes (based on isinternal)."""
    # fully ignore links that should not be fetched
    if self.isyanked:
        debugio.info('  %s' % self.url)
        debugio.info('    ' + self.isyanked)
        return
    # see if we can import the proper module for this scheme
    schememodule = schemes.get_schememodule(self.scheme)
    if schememodule is None:
        self.isyanked = 'unsupported scheme (' + self.scheme + ')'
        self._ischanged = True
        debugio.info('  %s' % self.url)
        debugio.info('    ' + self.isyanked)
        return
    debugio.info('  %s' % self.url)
    content = schememodule.fetch(self, parsers.get_mimetypes())
    self.isfetched = True
    self._ischanged = True
    # skip parsing of content if we were returned nothing
    if content is None:
        return
    # find a parser for the content-type
    parsermodule = parsers.get_parsermodule(self.mimetype)
    if parsermodule is None:
        debugio.debug('crawler.Link.fetch(): unsupported content-type: %s' % self.mimetype)
        return
    # parse the content
    parsermodule.parse(content, self)
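Link.fetch() above delegates the actual retrieval to whatever module schemes.get_schememodule() returns and calls schememodule.fetch(self, parsers.get_mimetypes()). The real scheme modules are not part of these examples; the sketch below is a hypothetical minimal module matching that call, and the urllib2 usage and the add_linkproblem() helper are assumptions rather than webcheck's actual implementation:

# Hypothetical scheme module sketch (Python 2); not webcheck's real http scheme code.
import urllib2

def fetch(link, acceptable_mimetypes):
    """Fetch link.url, fill in basic link attributes and return the content."""
    try:
        response = urllib2.urlopen(link.url)
    except urllib2.URLError, e:
        link.add_linkproblem(str(e))    # assumed helper for recording fetch problems
        return None
    # record the content-type so fetch()'s caller can pick a parser for it
    link.mimetype = response.info().gettype()
    if link.mimetype not in acceptable_mimetypes:
        return None
    return response.read()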
Code example #5
def generate(site):
    """Generate pages for plugins."""
    for p in config.PLUGINS:
        debugio.info('  ' + p)
        # import the plugin
        plugin = __import__('plugins.' + p, globals(), locals(), [p])
        # run the plugin
        plugin.generate(site)
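The loader above only requires that every name in config.PLUGINS resolves to a module in the plugins package exposing a generate(site) function. A hypothetical minimal plugin consistent with that contract could look like the sketch below; the module name and output file are made up, and plugins.open_file() is the helper that appears in Code example #10:

# Hypothetical plugin module, e.g. plugins/hello.py; names are illustrative only.
"""Minimal example report plugin."""

import plugins

def generate(site):
    """Write a trivial report page for the crawled site."""
    fp = plugins.open_file('hello.html', makebackup=True)
    fp.write('<title>example report</title>\n')
    fp.close()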
Code example #7
def deserialize(fp):
    """Read data from the file and construct objects from it.
    A new site instance is returned.
    After the site has been deserialized the crawl() and postprocess()
    functions should be called to regenerate the other link attributes."""
    import crawler
    site = crawler.Site()
    link = None
    while True:
        line = fp.readline()
        # check for end-of-file
        if not line:
            break
        # skip comments
        if _commentpattern.search(line):
            continue
        # skip empty lines
        if line.rstrip() == '':
            continue
        # find section header
        match = _sectionpattern.search(line)
        if match:
            link = site.get_link(match.group(1))
            debugio.info('  %s' % link.url)
            # clear some data that is annoying if we have duplicates
            link.anchors = []
            link.linkproblems = []
            link.pageproblems = []
            continue
        # check for key-value pair
        match = _keyvaluepattern.search(line)
        if match:
            key = match.group(1)
            value = match.group(2)
            if link is None:
                _deserialize_site(site, key, value)
            else:
                _deserialize_link(link, key, value)
            continue
        # fallthrough
        raise DeSerializeException('parse error')
    return site
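The loop above implies a simple INI-like file format: comment lines, a section header carrying a URL, and key=value pairs belonging to the preceding section. The pattern objects are presumably defined at module level in the same file but are not shown in these examples; hypothetical definitions consistent with the loop would be:

import re

# Hypothetical patterns, consistent with the deserialize() loop above;
# the exact expressions in webcheck's serialize module may differ.
_commentpattern = re.compile(r'^\s*#')           # '# comment'
_sectionpattern = re.compile(r'^\[(.*)\]\s*$')   # '[http://example.com/]'
_keyvaluepattern = re.compile(r'^(\w+)=(.*)$')   # 'key=value'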
Code example #9
def main():
    """Main program."""
    site = crawler.Site()
    # parse command-line arguments
    parse_args(site)
    # read serialized file
    if config.CONTINUE:
        fname = os.path.join(config.OUTPUT_DIR, 'webcheck.dat')
        debugio.info('reading stored crawler data....')
        try:
            fp = open(fname, 'r')
            site = serialize.deserialize(fp)
            fp.close()
        except IOError, (errno, strerror):
            debugio.error('%(fname)s: %(strerror)s' % {
                'fname': fname,
                'strerror': strerror
            })
            sys.exit(1)
        debugio.info('done.')
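Note that except IOError, (errno, strerror): is Python 2-only syntax that unpacks the exception's args tuple; all of the code in these examples is Python 2. Under Python 3 the same handler would be written against the exception object's attributes, roughly:

# Python 3 form of the same error handling (sketch; the surrounding examples stay Python 2).
try:
    fp = open(fname, 'r')
    site = serialize.deserialize(fp)
    fp.close()
except IOError as e:
    debugio.error('%(fname)s: %(strerror)s' % {
        'fname': fname,
        'strerror': e.strerror
    })
    sys.exit(1)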
Code example #10
        try:
            fp = open(fname, 'r')
            site = serialize.deserialize(fp)
            fp.close()
        except IOError, (errno, strerror):
            debugio.error('%(fname)s: %(strerror)s' % {
                'fname': fname,
                'strerror': strerror
            })
            sys.exit(1)
        debugio.info('done.')
    # create serialized file
    fp = plugins.open_file('webcheck.dat', makebackup=True)
    serialize.serialize_site(fp, site)
    # crawl through the website
    debugio.info('checking site....')
    site.crawl(fp)  # this will take a while
    debugio.info('done.')
    fp.close()
    # serialize the final state again
    fp = plugins.open_file('webcheck.dat', makebackup=True)
    serialize.serialize_site(fp, site)
    serialize.serialize_links(fp, site)
    fp.close()
    # do postprocessing (building site structure, etc)
    debugio.info('postprocessing....')
    site.postprocess()
    debugio.info('done.')
    # now we can write out the files
    # start with the frame-description page
    debugio.info('generating reports...')