def test_register_downloader(self):
    """A registered downloader class is retrievable under its class name."""
    engine = datCrawl()
    engine.register_downloader(downloaders.DefaultDownloader)

    # The registry is looked up by the string 'DefaultDownloader' and must
    # hand back the very class that was registered.
    registered = engine.downloaders['DefaultDownloader']
    self.assertEqual(registered, downloaders.DefaultDownloader)
def test_downloader_receives_options(self):
    """Running a URL with the kwargs-returning downloader yields ``True``.

    NOTE(review): presumably ``CrawlerWithOptions`` forwards its options to
    ``DownloaderThatReturnKwargs`` and reports whether they arrived; both
    helpers are defined outside this view -- confirm against them.
    """
    engine = datCrawl()
    engine.register_downloader(DownloaderThatReturnKwargs)
    engine.register_crawler(CrawlerWithOptions)

    outcome = engine.run('http://google.es')
    # Strict equality with True is intentional: a merely truthy value
    # should not be accepted here.
    self.assertEqual(outcome, True)
def test_register_incorrect_downloader(self):
    """Registering ``object`` as a downloader raises DownloaderIsNotInstanceOfBase."""
    engine = datCrawl()
    # assertRaises invokes the callable with the trailing arguments itself,
    # so no wrapper function is needed around the call under test.
    self.assertRaises(
        DownloaderIsNotInstanceOfBase,
        engine.register_downloader,
        object,
    )
def test_cant_register_downloader_twice(self):
    """A second registration of the same downloader raises DownloaderAlreadyRegistered."""
    engine = datCrawl()
    downloader_cls = downloaders.DefaultDownloader

    # First registration is expected to succeed.
    engine.register_downloader(downloader_cls)

    # Registering the same class again must be rejected.
    self.assertRaises(
        DownloaderAlreadyRegistered,
        engine.register_downloader,
        downloader_cls,
    )
# NOTE(review): as laid out here this call runs before the example below and
# would end the interpreter first. It looks like the tail of a preceding
# script section -- confirm where it belongs.
exit()


class GithubNameCrawler(Crawler):
    """Example crawler that reads a repository name from a GitHub page.

    NOTE(review): ``Crawler``, ``fromstring`` and ``CSSSelector`` are
    imported outside this view (the latter two presumably from lxml) --
    confirm.
    """

    # Each entry pairs an action name with a URL regex. Presumably a URL
    # matching the pattern is dispatched to ``action_<name>`` -- verify
    # against the Crawler base class.
    # Raw string: the pattern contains ``\:``, ``\/`` and ``\.``, which are
    # not valid string escapes; the raw prefix keeps the exact same value
    # without relying on unknown escapes being passed through.
    urls = [
        (
            'get_name',
            r'(?P<url>https\:\/\/github\.com\/fmartingr\/(?P<name>.*))',
        ),
    ]
    downloader = 'DefaultDownloader'

    def action_get_name(self, data, **kwargs):
        """Return ``{'name': <text>}`` for the first ``.js-current-repository`` node.

        :param data: markup handed to ``fromstring`` for parsing.
        :param kwargs: accepted but not used by this action.
        :returns: a dict with the key ``'name'``, or ``None`` when parsing
            or the selector lookup fails (the error is printed, not raised).
        """
        try:
            document = fromstring(data)
            selector = CSSSelector('.js-current-repository')
            # [0] raises IndexError when nothing matches; that case is
            # reported by the handler below like any other failure.
            return {'name': selector(document)[0].text}
        except Exception as e:
            # Deliberately best-effort: report the problem and fall through
            # to a None result instead of aborting the crawl.
            print(e)
            return None


datcrawl = datCrawl()
datcrawl.register_downloader(DefaultDownloader)
datcrawl.register_crawler(GithubNameCrawler)
print(datcrawl.run("https://github.com/fmartingr/datCrawl"))  # returns {'name': 'datCrawl'}