Esempio n. 1
0
 def processLoadedPage(self):
     """Dispatch a fetched page according to its HTTP status code.

     To be called after the page is received from the network.

     Status 200 responses are routed by content:

     * MIME type listed in ``config.pagemimetypes``: the page is saved via
       ``pageoutput`` and added to ``pageparsequeue``.
     * URL suffix listed in ``config.documentsuffixes`` or MIME type listed
       in ``config.documentmimetypes``: the body is saved via
       ``documentoutput`` and the parent URL is added to ``pagefetchqueue``.
     * anything else is dropped without further action.

     The explicitly listed "no content", redirect, client-error and
     server-error codes are ignored.

     Returns:
         None in every handled case.

     Raises:
         RuntimeError: if the status code is not one of the known codes.
     """
     status = self.status

     if status == 200:
         # NOTE(review): the suffix is taken from the whole URL, so a query
         # string or fragment ends up inside ``ext`` -- confirm whether
         # that is intended.
         ext = os.path.splitext(self.url)[1].lower()
         mimetype = self.header.gettype()
         if mimetype in config.pagemimetypes:
             # add to parse queue & save
             pageoutput.save(self)
             pageparsequeue.add(self)
         elif ext in config.documentsuffixes or mimetype in config.documentmimetypes:
             # Single pre-formatted argument: prints the same text under
             # the Python 2 print statement and the Python 3 function.
             print("*** docsave %d" % len(self.body))
             documentoutput.save(self)
             pagefetchqueue.add(parenturl(self.url))
         return None

     if status == 204:               # No Content
         return None

     # XXX redirects could be handled much smarter
     redirects = (300,               # Multiple Choices
                  301,
                  302,               # Found - not really sure how to handle this
                  )
     client_errors = (400,
                      401,           # unauthorized
                      403,           # access denied
                      404,           # not found
                      405,           # method not supported
                      406,
                      408,
                      410,           # Gone
                      412,
                      414,           # Request URI Too Large
                      423,           # Locked
                      )
     server_errors = (500,           # internal error
                      501,
                      502,           # Bad Gateway
                      503,
                      504,           # Origin Server Timeout
                      505,
                      506,
                      510,
                      0,             # No header was sent
                      )
     if status in redirects or status in client_errors or status in server_errors:
         return None

     # Unknown status: dump the header for debugging, then fail loudly.
     print(vars(self.header))
     message = "unknown response code %r" % (status,)
     # log.error, not log.exception: no exception is being handled here, so
     # there is no traceback for the logger to attach.
     log.error(message)
     raise RuntimeError(message)
Esempio n. 2
0
        links = extractlinks.parseForUrlsInHtml(page.body, page.url)
    except Exception, msg:
        log.exception(msg)
        return None

    added = 0
    for link in links:
        if not dupelist.seen(link) and 'osdn.safaribooksonline.com' not in link:
            dummmy, ext = os.path.splitext(link)
            ext = ext.lower()
            if ext in config.documentsuffixes:
                    documentfetchqueue.add(link)
                    added += 1
            else:
                if ext not in config.nonpagesuffixes:
                    pagefetchqueue.add(link)
                    added += 1

    #log.debug('%d links (%d suitable for queue) in %.3fs extracted form %r' % (len(links), added, (time.time() - t), page.url))


    
def main():
    """Loop while the page-fetch, page-parse or document-fetch queue is non-empty.

    On each pass, if documents are queued, fetch_a_document() is called a
    number of times derived from the configured and active download counts;
    then a status line with queue sizes and download counters is logged.
    """
    # Keep running as long as any of the three queues still holds work.
    while len(pagefetchqueue) + len(pageparsequeue) + len(documentfetchqueue) > 0:

        if documentfetchqueue:
            # Debug output: the index list the loop below iterates over.
            print range(config.paralell_downloads - status.active_downloads - 2)
            # Call fetch_a_document() (paralell_downloads - active_downloads - 2)
            # times; a non-positive count gives an empty range, so nothing runs.
            # NOTE(review): the "- 2" presumably keeps two download slots free
            # for other fetches -- confirm.
            for i in range(config.paralell_downloads - status.active_downloads - 2):
                fetch_a_document()
        
        # One status line per pass: dupe-list size, queue lengths, and the
        # active / successful / failed download counters.
        log.info('dupe: %d, fetchqueue: %d, parsequeue: %d, docqueue: %d, active: %d, ok: %d, ko: %d' %(len(dupelist), len(pagefetchqueue), len(pageparsequeue), len(documentfetchqueue), status.active_downloads, status.download_successes, status.download_failures))