Ejemplo n.º 1
0
 def test_register_downloader(self):
     # After registration the downloader class must be stored in the
     # core's `downloaders` mapping under the key 'DefaultDownloader'.
     crawler_core = datCrawl()
     downloader_cls = downloaders.DefaultDownloader
     crawler_core.register_downloader(downloader_cls)
     registered = crawler_core.downloaders['DefaultDownloader']
     self.assertEqual(registered, downloader_cls)
Ejemplo n.º 2
0
 def test_downloader_receives_options(self):
     # Registers the DownloaderThatReturnKwargs / CrawlerWithOptions pair
     # and expects running the crawl to yield exactly True.
     # NOTE(review): presumably True signals that the crawler's options
     # reached the downloader -- confirm against the fixture classes.
     crawler_core = datCrawl()
     crawler_core.register_downloader(DownloaderThatReturnKwargs)
     crawler_core.register_crawler(CrawlerWithOptions)
     outcome = crawler_core.run('http://google.es')
     self.assertEqual(outcome, True)
Ejemplo n.º 3
0
 def test_register_incorrect_downloader(self):
     # Registering plain `object` as a downloader must raise
     # DownloaderIsNotInstanceOfBase.
     crawler_core = datCrawl()
     self.assertRaises(
         DownloaderIsNotInstanceOfBase,
         crawler_core.register_downloader,
         object
     )
Ejemplo n.º 4
0
 def test_cant_register_downloader_twice(self):
     # The first registration succeeds; registering the same downloader
     # class a second time must raise DownloaderAlreadyRegistered.
     crawler_core = datCrawl()
     downloader_cls = downloaders.DefaultDownloader
     crawler_core.register_downloader(downloader_cls)
     self.assertRaises(
         DownloaderAlreadyRegistered,
         crawler_core.register_downloader,
         downloader_cls
     )
Ejemplo n.º 5
0
    exit()


class GithubNameCrawler(Crawler):
    """Crawler that reads the repository name from a GitHub page.

    The single URL pattern only matches repositories under
    ``https://github.com/fmartingr/`` and is paired with the
    ``get_name`` action.
    """

    # (action name, URL regex) pairs.
    # NOTE(review): presumably the framework maps 'get_name' to
    # `action_get_name` below -- confirm against the Crawler base class.
    # The pattern is a raw string so the backslashes reach the regex engine
    # unchanged without relying on Python passing through unknown escapes.
    urls = [
        (
            'get_name',
            r'(?P<url>https\:\/\/github\.com\/fmartingr\/(?P<name>.*))'
        )
    ]
    # Name of the downloader this crawler asks for.
    downloader = 'DefaultDownloader'

    def action_get_name(self, data, **kwargs):
        """Extract the repository name from downloaded page markup.

        Parses ``data`` with ``fromstring``, selects the first element
        matching the CSS class ``js-current-repository`` and returns its
        text.

        Args:
            data: Page markup to parse.
            **kwargs: Accepted but not used by this action.

        Returns:
            ``{'name': <element text>}`` on success, or ``None`` when
            parsing or the element lookup raises (the error is printed).
        """
        try:
            document = fromstring(data)
            selector = CSSSelector('.js-current-repository')
            # Indexing [0] raises IndexError when nothing matches; that is
            # handled below like any other failure.
            name = selector(document)[0].text
        except Exception as e:
            # Best-effort example: report the problem and signal failure
            # with None instead of propagating. The parenthesized form
            # prints the same text under Python 2 and Python 3.
            print(e)
            return None
        return {'name': name}


# Example driver: build a crawler core, register the downloader and the
# crawler defined above, then crawl one repository URL and print the result.
datcrawl = datCrawl()
datcrawl.register_downloader(DefaultDownloader)
datcrawl.register_crawler(GithubNameCrawler)
# Parenthesized so this line is valid syntax on both Python 2 and Python 3;
# with a single argument the printed text is identical.
print(datcrawl.run("https://github.com/fmartingr/datCrawl"))
# Expected output per the original example: {'name': 'datCrawl'}