Example #1
0
 def test_incorrect_parse_xpath(self):
     """Parsing a page with the invalid XPath '//count()' must raise.

     ``assertRaises`` is used so the test fails when no ``XPathEvalError``
     surfaces, instead of passing vacuously.
     """
     s = Silk(self.io_loop)
     s.parse_url('//count()', LOCAL_URL % (LOCAL_PORT, 'index.html'), self.stop)
     # Only wait() is guarded: the error is expected to surface while the
     # scheduled work runs, not when parse_url() is called.
     with self.assertRaises(XPathEvalError):
         self.wait()
Example #2
0
 def test_simplehttpserver(self):
     """Check response codes: 200 for '/' and 404 for a missing page."""
     silk = Silk(self.io_loop)
     expectations = (
         ('/', 200),
         ('thisdoesnotexist.html', 404),
     )
     for path, expected_code in expectations:
         silk.get(LOCAL_URL % (LOCAL_PORT, path), self.stop)
         reply = self.wait()
         self.assertEqual(reply.code, expected_code)
Example #3
0
 def test_can_register_spiders(self):
     """Every spider passed to register() must appear in ``spiders``."""
     first, second = Spider(), Spider()
     silk = Silk(self.io_loop)
     silk.register(first)
     silk.register(second)
     for registered in (first, second):
         self.assertIn(registered, silk.spiders)
Example #4
0
 def test__find_urls(self):
     """Run Spider._find_urls on a fetched response and check its result."""
     silk = Silk(
         self.io_loop,
         allowed_domains=['www.dmoz.org'],
         fail_silent=False,
     )
     silk.get(LOCAL_URL % (LOCAL_PORT, 'index.html'), self.stop)
     page = self.wait()

     crawler = Spider()
     crawler._find_urls(page, self.stop)
     found = self.wait()

     expected = ['http://www.google.com', 'page1.html']
     # NOTE: this asserts that the two-item list is itself a member of the
     # value handed to the callback.
     self.assertIn(expected, found)
Example #5
0
 def test_parse(self):
     """Apply the '//text()' XPath to a fetched response and check the result.

     The parsed result must be a list whose joined text contains 'test'.
     """
     s = Silk(self.io_loop)
     s.get(LOCAL_URL % (LOCAL_PORT, 'index.html'), self.stop)
     response = self.wait()
     s.parse('//text()', response, self.stop)
     xpath_elements = self.wait()
     # Check the type directly: type(xpath_elements == 'list') evaluates to
     # the bool class, which is always truthy and so asserted nothing.
     self.assertIsInstance(xpath_elements, list)
     text_string = ''.join(xpath_elements)
     self.assertIn('test', text_string)
Example #6
0
 def test_spider_prints_urls_without_callback(self):
     """Crawl with a spider built from allow/deny patterns and no callback."""
     allowed = ['Python', 'Ruby']
     denied = ['Deutsch']
     crawler = Spider(allowed, denied, callback=None)

     silk = Silk(
         self.io_loop,
         allowed_domains=['www.dmoz.org'],
         fail_silent=False,
     )
     silk.register(crawler)
     start_url = 'http://www.dmoz.org/Computers/Programming/Languages/Python/Books/'
     silk.crawl(start_url, self.stop)
     # No assertions are made; the test only waits for the crawl callback.
     self.wait()
Example #7
0
 def test_domains_single_domain(self):
     """Allow only the 127.0.0.1 test host; a google.com request gets an empty body."""
     local_host = '127.0.0.1:%s' % LOCAL_PORT
     silk = Silk(self.io_loop, allowed_domains=[local_host])

     silk.get(LOCAL_URL % (LOCAL_PORT, 'index.html'), self.stop)
     allowed_reply = self.wait()
     self.assertIn("test paragraph", allowed_reply.body)

     # Silently fails and returns an empty body
     silk.get('http://google.com', self.stop)
     blocked_reply = self.wait()
     self.assertEqual(blocked_reply.body, '')
Example #8
0
 def test_add_requests(self):
     """Call add_request twice for the same URL; each body must contain 'dmoz'."""
     silk = Silk(
         self.io_loop,
         allowed_domains=['www.dmoz.org'],
         fail_silent=False,
     )
     books_url = 'http://www.dmoz.org/Computers/Programming/Languages/Python/Books/'
     for _ in range(2):
         silk.add_request(books_url, self.stop)
         reply = self.wait()
         self.assertIn('dmoz', reply.body)
Example #9
0
 def test_multiple_domains(self):
     """With two allowed domains, a get() to each must return a matching body."""
     silk = Silk(
         self.io_loop,
         allowed_domains=['www.dmoz.org', 'www.google.com'],
     )
     checks = (
         ('http://www.dmoz.org', "dmoz"),
         ('http://www.google.com', "Google"),
     )
     for url, marker in checks:
         silk.get(url, self.stop)
         reply = self.wait()
         self.assertIn(marker, reply.body)
Example #10
0
    def test_domains_fail_loudly(self):
        """With fail_silent=False, a request outside allowed_domains must raise.

        A request to the allowed domain is made first and its body checked;
        the request to 'http://google.com' must then raise
        ``ExternalDomainError``.
        """
        domains = [
            'www.dmoz.org',
        ]

        s = Silk(self.io_loop, allowed_domains=domains, fail_silent=False)
        s.get('http://www.dmoz.org', self.stop)
        response = self.wait()
        self.assertIn("dmoz", response.body)

        # assertRaises makes the test fail when nothing is raised; a plain
        # try/except would pass silently in that case.
        with self.assertRaises(ExternalDomainError) as raised:
            s.get('http://google.com', self.stop)
            self.wait()
        # Require the exact exception class, not a subclass.
        self.assertIs(type(raised.exception), ExternalDomainError)
Example #11
0
 def test_local_file_storage(self):
     """fetch_and_save() then get_local_file() for one URL must give equal bodies."""
     silk = Silk(self.io_loop)
     index_url = LOCAL_URL % (LOCAL_PORT, 'index.html')

     silk.fetch_and_save(index_url, self.stop)
     fetched = self.wait()
     silk.get_local_file(index_url, self.stop)
     stored = self.wait()

     self.assertEqual(fetched.body, stored.body)
     # Remove the local copy so it does not linger after the test.
     silk.delete_local_file(index_url)
Example #12
0
 def test_debug_setting(self):
     """
     With debug=True a plain get() should also save the file to local disk,
     so get_local_file() for the same URL must return an identical body.
     """
     silk = Silk(self.io_loop, debug=True)
     index_url = LOCAL_URL % (LOCAL_PORT, 'index.html')

     silk.get(index_url, self.stop)
     live = self.wait()
     silk.get_local_file(index_url, self.stop)
     cached = self.wait()

     self.assertEqual(live.body, cached.body)
     # Remove the local copy so it does not linger after the test.
     silk.delete_local_file(index_url)
Example #13
0
 def test_subdomain(self):
     """Only 'www.google.com' is allowed: the bare domain yields an empty body."""
     allowed = ['www.google.com']

     # Bare domain is not in the allowed list, so the body must be empty.
     silk = Silk(self.io_loop, allowed_domains=allowed)
     silk.get('http://google.com', self.stop)
     reply = self.wait()
     self.assertEqual(len(reply.body), 0)

     # A fresh instance with the same list accepts the 'www.' host.
     silk = Silk(self.io_loop, allowed_domains=allowed)
     silk.get('http://www.google.com', self.stop)
     reply = self.wait()
     self.assertIn('google', reply.body)
Example #14
0
 def test__crawl(self):
     """Register a default Spider and start a crawl of the 'index.html' URL."""
     crawler = Spider()
     silk = Silk(self.io_loop, allowed_domains=[''])
     silk.register(crawler)
     start_url = LOCAL_URL % (LOCAL_PORT, 'index.html')
     # NOTE: the test does not call self.wait() after scheduling the crawl.
     silk.crawl(start_url, self.stop)
Example #15
0
 def test_start(self):
     """Call start() on a Silk whose ``loop`` is set to IOLoop.instance()."""
     silk = Silk()
     # The loop is assigned after construction; Silk() is built with no arguments.
     silk.loop = IOLoop.instance()
     silk.start()
Example #16
0
 def test_parse_url(self):
     """parse_url() with the '//text()' XPath must hand a list to the callback."""
     s = Silk(self.io_loop)
     s.parse_url('//text()', LOCAL_URL % (LOCAL_PORT, 'index.html'), self.stop)
     xpath_elements = self.wait()
     # Check the type directly: type(xpath_elements == 'list') evaluates to
     # the bool class, which is always truthy and so asserted nothing.
     self.assertIsInstance(xpath_elements, list)
Example #17
0
 def test_get(self):
     """get() on the 'index.html' URL must return a body with 'Test paragraph'."""
     silk = Silk(self.io_loop)
     index_url = LOCAL_URL % (LOCAL_PORT, 'index.html')
     silk.get(index_url, self.stop)
     body = self.wait().body
     self.assertIn("Test paragraph", body)