Code Example #1
File: v-rawler.py  Project: mlarocca/V-rawler
    def get(self, url):

      ip = self.request.remote_addr  # Rate limiting: to prevent abuse, only one request per minute is served per IP address
      
      request = Request.gql("WHERE ip = :1", ip).get()  # Look for a previous request from the same IP (the bound parameter avoids interpolating raw input into the query)
      if request is not None:
        delta = request.is_allowed()
        
        if delta > 0:  # Too little time has passed since the previous request
          # self.error(408)  # Timeout Error
          self.response.set_status(408, "Your IP address issued a request less than 1 minute ago. Please wait %d seconds" % delta)
          return
      else:
        request = Request(ip=ip, page_crawled=url)
        request.put()  # put() is the standard db.Model persistence call (save() is a legacy alias)
      
      self.response.headers['Content-Type'] = 'application/json'
      handler = CrawlerHandler()

      # Check the cache first: crawling is expensive, so reuse a stored
      # snapshot of the site when one exists
      site_image = memcache.get(url)
      if site_image is None:
        home_page = handler.start_crawling(url, MAX_PAGE_DEPTH, MAX_PAGES_TO_CRAWL, 0.01)  # the 0.01s delay between requests slows the crawl slightly, but keeps it polite

        if home_page is None:
          self.error(400)  # Bad Request: the URL could not be crawled
          return
        else:
          site_image = handler.page_graph(home_page)
          memcache.set(url, site_image)  # Cache the site image for later requests
      
      self.__responde(site_image)  # project helper; presumably serializes site_image as the JSON response
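
The `Request` model and its `is_allowed` method are defined elsewhere in the V-rawler project. Below is a minimal sketch of what they might look like, inferred only from how the handler above uses them: the `ip` and `page_crawled` fields come from the snippet, while the `timestamp` property, the 60-second window, and the body of `is_allowed` are assumptions.

from datetime import datetime
from google.appengine.ext import db

MIN_INTERVAL = 60  # assumed rate-limit window, in seconds

class Request(db.Model):
  ip = db.StringProperty(required=True)           # field used by the handler
  page_crawled = db.StringProperty()              # field used by the handler
  timestamp = db.DateTimeProperty(auto_now=True)  # assumed: refreshed on every put()

  def is_allowed(self):
    # Assumed semantics: return how many seconds the caller still has to
    # wait; zero or negative means the request may proceed. When it is
    # allowed, refresh the stored timestamp so the window restarts.
    elapsed = (datetime.utcnow() - self.timestamp).total_seconds()
    delta = int(MIN_INTERVAL - elapsed)
    if delta <= 0:
      self.put()  # auto_now updates self.timestamp
    return delta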
Code Example #2
def test_list_resources(url, threads=1, max_page_depth=None, max_pages_to_crawl=None):
  handler = CrawlerHandler()
  home_page = handler.start_crawling(url, threads, max_page_depth, max_pages_to_crawl, 0)
  # Look up a page that doesn't exist (random suffix; needs `from random import random`)
  resources = handler.list_resources(home_page + str(random()))
  for s in resources.values():
    assert(len(s) == 0)
  # Look up a page that DOES exist
  resources = handler.list_resources(home_page)
  assert(len(reduce(lambda s1, s2: s1 | s2, resources.values())) > 0)  # at least some resource should be found
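
The set union in the final assert implies that `list_resources` returns a dict mapping resource types to sets of URLs. A hypothetical example of its shape, using the keys exercised in Code Example #4:

resources = {
  "images": set(["img_2.jpg", "http://mysite.me/img_1.jpg"]),
  "scripts": set(["/test.js"]),
  "videos": set(),
  "audios": set(),
}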
Code Example #3
def test_page_graph(url, threads=1, max_page_depth=None, max_pages_to_crawl=None):
  handler = CrawlerHandler()
  home_page = handler.start_crawling(url, threads, max_page_depth, max_pages_to_crawl, 0)
  # Look up a page that doesn't exist (random suffix; needs `from random import random`)
  pages_set = handler.page_graph(home_page + str(random()))
  assert(len(pages_set) == 0)
  # Look up a page that DOES exist: page_graph() with no argument should
  # default to the crawl's home page and return the same graph
  pages_set_1 = handler.page_graph(home_page)
  pages_set_2 = handler.page_graph()
  assert(pages_set_1 == pages_set_2)
  return pages_set_2
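
Judging from the lookups in Code Example #4 (graph[url]["resources"]["videos"]), `page_graph` returns a dict keyed by page URL whose values carry at least a per-page "resources" dict of the shape shown above. A hypothetical traversal (the URL is just a placeholder):

graph = test_page_graph("http://example.com", 2)
for page_url, page in graph.items():
  print page_url, len(page["resources"]["images"])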
Code Example #4
def test():
  handler = CrawlerHandler()
  # Crawling a URL without a scheme should fail and return None
  assert handler.start_crawling("www.news.ycombinator.com", 30, None, 20, 0) is None
  print test_crawler("http://repubblica.it", 30, None, 20)
  
  path = "/%s/tests" % os.getcwd().replace("\\", "/")
                   
  # Test cyclic references between 2 documents
  site_resources = test_crawler(urlunsplit(("file", path, "test_1.html", '', '')), 3)

  # Assertions on content
  assert("img_2.jpg" in site_resources["images"])
  assert("http://mysite.me/img_1.jpg" in site_resources["images"])
  assert("/test.js" in site_resources["scripts"])

  
  # Test cyclic references between 3 or more documents
  url_B = urlunsplit(("file", path, "test_B.html", '', ''))
  site_resources = test_crawler(url_B, 2)

  # Assertions on content
  assert("img_2.jpg" in site_resources["images"])
  assert("http://mysite.me/img_1.jpg" in site_resources["images"])
  assert("/test.js" in site_resources["scripts"])
  assert("img_3.jpg" in site_resources["images"])
  test_list_resources(url_B, 2)
  
  print test_page_graph(url_B, 2, 1)
  print test_page_graph(url_B, 2)

  graph = test_page_graph(url_B, 5)
  url1 = urlunsplit(("file", path, "test_1.html", '', ''))
  url2 = urlunsplit(("file", path, "test_1_copy.html", '', ''))

  # test_1_copy.html duplicates test_1.html, so both should map to
  # identical nodes in the page graph
  assert(graph[url1] == graph[url2])
  assert len(graph[url_B]["resources"]["videos"]) == 2
  print graph[url_B]["resources"]["audios"]
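
The driver above depends on local fixture files under ./tests (test_1.html, test_1_copy.html, test_B.html). Assuming the snippets live in a single module, a conventional entry point would be:

if __name__ == "__main__":
  test()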
Code Example #5
def test_crawler(url, threads=1, max_page_depth=None, max_pages_to_crawl=None):
  handler = CrawlerHandler()
  handler.start_crawling(url, threads, max_page_depth, max_pages_to_crawl, 0)
  return handler.list_resources()
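
A sample invocation, mirroring the live-site call in Code Example #4:

site_resources = test_crawler("http://repubblica.it", threads=30, max_pages_to_crawl=20)
for kind, urls in site_resources.items():
  print kind, len(urls)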