Example #1
 def crawled(self, url):
     """ Don't recrawl the same URLs """
     try:
         # Raises if no matching audit record exists for this URL
         item = CrawlAuditItem.get(url__exact=url)
         if item:
             return True
     except Exception:
         pass
     return False
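The `url__exact` lookup in `crawled` suggests that `CrawlAuditItem` is backed by a Django model. For comparison, a minimal sketch of the same "have we seen this URL?" check against a plain Django model is shown below; the `CrawlAudit` model and its module path are assumptions, not taken from the original.

 # Hedged sketch only: CrawlAudit and crawlaudit.models are hypothetical names.
 from crawlaudit.models import CrawlAudit

 def crawled(self, url):
     """Return True if an audit record already exists for this URL."""
     # QuerySet.exists() avoids loading the row and never raises DoesNotExist
     return CrawlAudit.objects.filter(url__exact=url).exists()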
Example #2
 def parse(self, response):
     """ Generator func - Check its html - ie has encoding """
     item = CrawlAuditItem()
     item['url'] = response.url
     item['metatype'] = response.meta
     if hasattr(response, 'encoding'):
         hxs = HtmlXPathSelector(response)
         links = hxs.select('//a/@href').extract()
         links = set(links)
         # Ignore a self-referencing link if present
         links.discard(response.url)
         item['links'] = len(links)
         # Flash, JavaScript and framesets can be external cookie sources
         embed = hxs.select('//embed/@src').extract()
         embed.extend(hxs.select('//object/@data').extract())
         embed.extend(hxs.select('//script/@src').extract())
         embed.extend(hxs.select('//frameset/@src').extract())
         embed = set(embed)
         for url in embed:
             # Store embedded scripts / flash since also source of cookies
             # can we save flash cookies? - maybe needs separate firefox grab of url
             if url.startswith('/'):
                 url = 'http://www.bris.ac.uk%s' % url
             elif not url.startswith('http'):
                 rurl = response.url
                 if not rurl.endswith('/'):
                     urlbits = response.url.split('/')
                     # Drop the last path segment so the relative URL resolves against its directory
                     rurl = '/'.join(urlbits[:-1]) + '/'
                 url = '%s%s' % (rurl, url)
             if not self.crawled(url):
                 # Record the embedded resource itself; no outbound links to count
                 newitem = CrawlAuditItem()
                 newitem['url'] = url
                 newitem['metatype'] = response.meta
                 newitem['links'] = 0
                 newitem.save()
         for url in links:
             if not self.crawled(url):
                 url = self.domain_check(url)
                 if url:
                     yield Request(url, callback=self.parse) 
         # Just save crawled pages not files/images
         try:
             item.save()
         except Exception:
             # Fall back to pushing the item straight through the pipeline
             self.pipe.process_item(item)
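Both methods appear to come from an old-style Scrapy spider (the `HtmlXPathSelector` API predates the newer `Selector`). A minimal sketch of the assumed surrounding class and imports follows; the class name, `allowed_domains`, `start_urls`, the module paths and the `pipe` wiring are assumptions, not from the original.

 # Hedged sketch of the assumed surrounding spider (Scrapy 0.x era API).
 from scrapy.spider import BaseSpider
 from scrapy.http import Request
 from scrapy.selector import HtmlXPathSelector

 from crawlaudit.items import CrawlAuditItem            # hypothetical module path
 from crawlaudit.pipelines import CrawlAuditPipeline    # hypothetical module path

 class CrawlAuditSpider(BaseSpider):
     name = 'crawlaudit'
     allowed_domains = ['bris.ac.uk']
     start_urls = ['http://www.bris.ac.uk/']

     def __init__(self, *args, **kwargs):
         super(CrawlAuditSpider, self).__init__(*args, **kwargs)
         # Used as a fallback in parse() when item.save() fails
         self.pipe = CrawlAuditPipeline()

     # crawled(), parse() and domain_check() as shown in the examples above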