response = urllib2.urlopen(req) data = response.read() return data except Exception as error: raise Exception("Error downloading %s:" % (url, error)) class AwesomeWikipediaTitleCrawler(Crawler): urls = [ ('get_title', '(?P<url>http\:\/\/en.wikipedia.org\/wiki\/(?P<name>.*))', ) ] downloader = 'DefaultDownloaderWithCustomUserAgent' # Downloader options with custom user agent. downloader_options = { 'headers': {'User-agent': 'Firefox'} } def action_get_title(self, data, **kwargs): try: document = document_fromstring(data) selector = CSSSelector('h1.firstHeading > span') return {'title': selector(document)[0].text} except Exception as e: print e crawler = datCrawl() crawler.register_downloader(DefaultDownloaderWithCustomUserAgent) crawler.register_crawler(AwesomeWikipediaTitleCrawler) print crawler.run("http://en.wikipedia.org/wiki/Python_(programming_language)") # returns {'title': 'Python (programming language)'}
def test_instance_check(self):
    # Constructing datCrawl with no arguments must yield a datCrawl instance.
    engine = datCrawl()
    is_expected_type = isinstance(engine, datCrawl)
    self.assertTrue(is_expected_type)
def test_running_full_crawler(self):
    # End-to-end check: register the Wikipedia title crawler, run it on URL,
    # and compare the 'title' entry of the result with the expected heading.
    engine = datCrawl()
    engine.register_crawler(AwesomeWikipediaTitleCrawler)
    outcome = engine.run(URL)
    extracted_title = outcome['title']
    self.assertEqual(extracted_title, 'Python')
def test_worker_instance(self):
    # core.worker(URL) must hand back a datCrawlWorker whose .url attribute
    # equals the URL it was created for.
    engine = datCrawl()
    engine.register_crawler(AwesomeWikipediaTitleCrawler)
    job = engine.worker(URL)
    is_worker = isinstance(job, datCrawlWorker)
    self.assertTrue(is_worker)
    self.assertEqual(URL, job.url)
def test_register_urls(self):
    # Registering a single (action, pattern, crawler name) triple must store
    # it as the first element of core.urls, equal to the original tuple.
    expected = ('action', 'http://www.google.es/', 'AwesomeGoogleCrawler')
    engine = datCrawl()
    engine.register_url(*expected)
    first_registered = engine.urls[0]
    self.assertEqual(first_registered, expected)