def test_session_exception(self, req_mock):
    req_mock.get(self.url, exc=requests.exceptions.ConnectTimeout)
    crawler = self.get_crawler()
    with patch('dirhunt.crawler_url.CrawlerUrl.close') as m:
        crawler_url = CrawlerUrl(crawler, self.url)
        self.assertEqual(crawler_url.start(), crawler_url)
        self.assertEqual(crawler.current_processed_count, 1)
        m.assert_called_once()
def test_start(self):
    crawler = self.get_crawler()
    crawler.closing = True
    crawler_url = CrawlerUrl(crawler, self.url)
    crawler.processing[self.url] = crawler_url
    with requests_mock.mock() as m:
        m.get(self.url, headers={'Content-Type': 'text/html'})
        crawler_url.start()
    self.assertIn(self.url, crawler.processed)
    self.assertNotIn(self.url, crawler.processing)
def test_session_read_exception(self):
    crawler = self.get_crawler()
    crawler.sessions = Mock()
    crawler.sessions.get_session.return_value.get.return_value.status_code = 200
    crawler.sessions.get_session.return_value.get.return_value.raw.read.side_effect = \
        requests.exceptions.ConnectTimeout()
    with patch('dirhunt.crawler_url.CrawlerUrl.close') as m:
        crawler_url = CrawlerUrl(crawler, self.url)
        self.assertEqual(crawler_url.start(), crawler_url)
        self.assertEqual(crawler.current_processed_count, 1)
        m.assert_called_once()
def process(self, text, soup=None):
    self.crawler_url.crawler.add_url(
        CrawlerUrl(self.crawler_url.crawler, self.redirector, 3, self.crawler_url,
                   timeout=self.crawler_url.timeout))
def add_url(self, crawler_url, force=False):
    """Add url to queue"""
    if not isinstance(crawler_url, CrawlerUrl):
        crawler_url = CrawlerUrl(self, crawler_url, depth=self.depth, timeout=self.timeout)
    self.add_lock.acquire()
    url = crawler_url.url
    if not url.is_valid() or not url.only_domain or not self.in_domains(url.only_domain):
        self.add_lock.release()
        return
    if url.url in self.processing or url.url in self.processed:
        self.add_lock.release()
        return self.processing.get(url.url) or self.processed.get(url.url)
    fn = reraise_with_stack(crawler_url.start)
    if self.closing:
        self.add_lock.release()
        return
    if force:
        future = ThreadPoolExecutor(max_workers=1).submit(fn)
    else:
        future = self.submit(fn)
    self.processing[url.url] = future
    self.add_lock.release()
    return future
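
# A minimal, self-contained sketch of the pattern add_url implements above: a lock
# guards the "processing" and "processed" dicts so a given URL is only ever
# submitted to the executor once, and a duplicate add returns the existing entry.
# The names SketchQueue/work/add are illustrative only and are not dirhunt API.
from concurrent.futures import ThreadPoolExecutor
from threading import Lock


class SketchQueue(object):
    def __init__(self):
        self.executor = ThreadPoolExecutor(max_workers=4)
        self.lock = Lock()
        self.processing = {}
        self.processed = {}

    def work(self, url):
        # Stand-in for CrawlerUrl.start(): whatever processes the URL.
        return url

    def add(self, url):
        with self.lock:
            existing = self.processing.get(url) or self.processed.get(url)
            if existing is not None:
                # Duplicate: hand back the existing future instead of resubmitting.
                return existing
            future = self.executor.submit(self.work, url)
            self.processing[url] = future
            return future


queue = SketchQueue()
assert queue.add('http://example.com/') is queue.add('http://example.com/')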
def test_print_results_limit(self):
    crawler = self.get_crawler(limit=1)
    crawler.current_processed_count = 1
    crawler_url = CrawlerUrl(crawler, self.url)
    crawler.results.put(GenericProcessor(None, crawler_url))
    crawler.print_results()
    self.assertTrue(crawler.closing)
def add_init_urls(self, *urls):
    """Add urls to queue."""
    for crawler_url in urls:
        if not isinstance(crawler_url, CrawlerUrl):
            crawler_url = CrawlerUrl(self, crawler_url, depth=self.depth, timeout=self.timeout)
        self.add_domain(crawler_url.url.only_domain)
        self.add_url(crawler_url)
def resume(self, path):
    resume_data = json.load(open(path))
    file_version = resume_data.get('version')
    if file_version != __version__:
        raise IncompatibleVersionError(
            'Analysis file incompatible with the current version of dirhunt. '
            'Dirhunt version: {}. File version: {}'.format(__version__, file_version)
        )
    for data in resume_data['processed']:
        crawler_url_data = data['crawler_url']
        url = crawler_url_data['url']['address']
        crawler_url = CrawlerUrl(self, url, crawler_url_data['depth'], None,
                                 crawler_url_data['exists'], crawler_url_data['type'])
        crawler_url.flags = set(crawler_url_data['flags'])
        crawler_url.processor_data = data
        self.processed[url] = crawler_url
        self.echo(data['line'])
    for url in resume_data['processing']:
        self.add_url(url)
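
# A sketch of the shape a resume file would need to satisfy resume() above,
# inferred purely from the keys it reads (version, processed -> crawler_url ->
# url.address/depth/exists/type/flags plus line, and a processing list). The
# field values are illustrative; files written by dirhunt may carry more data.
import json

resume_sketch = {
    'version': '0.0.0',  # must equal __version__ or IncompatibleVersionError is raised
    'processed': [
        {
            'crawler_url': {
                'url': {'address': 'http://example.com/admin/'},
                'depth': 3,
                'exists': True,
                'type': 'directory',
                'flags': ['200'],
            },
            'line': '[200] http://example.com/admin/',
        },
    ],
    'processing': ['http://example.com/admin/login.php'],
}
print(json.dumps(resume_sketch, indent=2))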
def add_url(self, url, depth=3, **kwargs):
    if is_url_loop(url):
        return
    return self.crawler_url.crawler.add_url(
        CrawlerUrl(self.crawler_url.crawler, url, depth, self.crawler_url,
                   timeout=self.crawler_url.timeout, **kwargs))
def process(self, text, soup=None):
    links = [
        full_url_address(link.attrs.get('href'), self.crawler_url.url)
        for link in soup.find_all('a')
    ]
    for link in filter(lambda x: x.url.endswith('/'), links):
        self.crawler_url.crawler.add_url(
            CrawlerUrl(self.crawler_url.crawler, link, 3, self.crawler_url, type='directory',
                       timeout=self.crawler_url.timeout))
    self.files = [Url(link) for link in links]
def search_index_files(self):
    if self.crawler_url.type not in ['directory', None]:
        return
    crawler = self.crawler_url.crawler
    for index_file in INDEX_FILES:
        url = self.crawler_url.url.copy()
        url.set_children(index_file)
        future = self.crawler_url.crawler.add_url(
            CrawlerUrl(crawler, url, self.crawler_url.depth - 1, self, None, 'document',
                       timeout=self.crawler_url.timeout), True)
        if self.crawler_url.crawler.closing:
            return
        result = future.result()
        if result.exists:
            self.index_file = url
            break
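
# search_index_files probes well-known index documents directly under the
# directory, one at a time, and stops at the first one that exists. The
# candidate names below are assumptions for illustration; the real list lives
# in INDEX_FILES, and dirhunt builds the URLs with Url.copy()/set_children()
# rather than the standard-library urljoin used here.
from urllib.parse import urljoin

directory = 'http://example.com/admin/'
candidates = ['index.php', 'index.html']  # assumption: names like those in INDEX_FILES
print([urljoin(directory, name) for name in candidates])
# ['http://example.com/admin/index.php', 'http://example.com/admin/index.html']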
def links(self, soup):
    links = [
        full_url_address(link.attrs.get('href'), self.crawler_url.url)
        for link in soup.find_all('a')
    ]
    for link in filter(bool, links):
        url = Url(link)
        if not url.is_valid():
            continue
        depth = self.crawler_url.depth
        if url.domain != self.crawler_url.url.domain or \
                not url.path.startswith(self.crawler_url.url.directory_path):
            depth -= 1
        if depth <= 0:
            continue
        self.crawler_url.crawler.add_url(
            CrawlerUrl(self.crawler_url.crawler, link, depth, self.crawler_url,
                       timeout=self.crawler_url.timeout))
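
# The depth rule used in links() above, isolated as a hypothetical helper
# (next_depth is not a dirhunt function): a link keeps the parent's depth while
# it stays on the same domain and under the same directory path, otherwise it
# loses one level, and anything that would reach depth 0 is not queued.
def next_depth(depth, same_domain, under_directory_path):
    """Hypothetical helper mirroring the depth arithmetic in links()."""
    if not (same_domain and under_directory_path):
        depth -= 1
    return depth


assert next_depth(3, same_domain=True, under_directory_path=True) == 3
assert next_depth(3, same_domain=False, under_directory_path=True) == 2
assert next_depth(1, same_domain=True, under_directory_path=False) == 0  # would be skipped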
def assets(self, soup):
    assets = [
        full_url_address(link.attrs.get('href'), self.crawler_url.url)
        for link in soup.find_all('link')
    ]
    assets += [
        full_url_address(script.attrs.get('src'), self.crawler_url.url)
        for script in soup.find_all('script')
    ]
    assets += [
        full_url_address(img.attrs.get('src'), self.crawler_url.url)
        for img in soup.find_all('img')
    ]
    for asset in filter(bool, assets):
        self.analyze_asset(asset)
        self.crawler_url.crawler.add_url(
            CrawlerUrl(self.crawler_url.crawler, asset, 3, self.crawler_url, type='asset',
                       timeout=self.crawler_url.timeout))
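
# full_url_address is dirhunt's own helper; the role it plays in links()/assets()
# above, resolving possibly-relative href/src attributes against the page URL,
# can be illustrated with the standard library (illustration only, not dirhunt code).
from urllib.parse import urljoin

page = 'http://example.com/blog/post.html'
print(urljoin(page, '/static/app.js'))                  # http://example.com/static/app.js
print(urljoin(page, 'style.css'))                       # http://example.com/blog/style.css
print(urljoin(page, 'http://cdn.example.com/img.png'))  # absolute URLs pass through unchanged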
def test_create_report(self, m):
    crawler = self.get_crawler()
    crawler.results.put(
        GenericProcessor(None, CrawlerUrl(crawler, self.url)))
    crawler.create_report(crawler.get_resume_file())
    m.assert_called_once()
def get_crawler_url(self):
    crawler = self.get_crawler()
    return CrawlerUrl(crawler, self.url)
def test_print_results(self):
    crawler = self.get_crawler()
    crawler_url = CrawlerUrl(crawler, self.url)
    crawler.results.put(GenericProcessor(None, crawler_url))
    crawler.print_results()
def __init__(self, error, level='ERROR'):
    super(Error, self).__init__(None, CrawlerUrl(None, ''))
    self.error = error
    self.level = level
def test_add_url(self):
    crawler = self.get_crawler()
    crawler.domains.add('domain.com')
    crawler_url = CrawlerUrl(crawler, self.url)
    with patch.object(ThreadPoolExecutor, 'submit') as mock_method:
        crawler.add_url(crawler_url)