def test_session_exception(self, req_mock):
     req_mock.get(self.url, exc=requests.exceptions.ConnectTimeout)
     crawler = self.get_crawler()
     with patch('dirhunt.crawler_url.CrawlerUrl.close') as m:
         crawler_url = CrawlerUrl(crawler, self.url)
         self.assertEqual(crawler_url.start(), crawler_url)
         self.assertEqual(crawler.current_processed_count, 1)
         m.assert_called_once()
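Here requests_mock's exc= argument makes the stubbed GET raise ConnectTimeout as soon as the request is issued; the assertions then check that start() still returns the CrawlerUrl instance, that the crawler's current_processed_count advances, and that close() is called exactly once despite the failure.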
Example #2
 def test_start(self):
     crawler = self.get_crawler()
     crawler.closing = True
     crawler_url = CrawlerUrl(crawler, self.url)
     crawler.processing[self.url] = crawler_url
     with requests_mock.mock() as m:
         m.get(self.url, headers={'Content-Type': 'text/html'})
         crawler_url.start()
     self.assertIn(self.url, crawler.processed)
     self.assertNotIn(self.url, crawler.processing)
 def test_session_read_exception(self):
     crawler = self.get_crawler()
     crawler.sessions = Mock()
     crawler.sessions.get_session.return_value.get.return_value.status_code = 200
     crawler.sessions.get_session.return_value.get.return_value.raw.read.side_effect = \
         requests.exceptions.ConnectTimeout()
     with patch('dirhunt.crawler_url.CrawlerUrl.close') as m:
         crawler_url = CrawlerUrl(crawler, self.url)
         self.assertEqual(crawler_url.start(), crawler_url)
         self.assertEqual(crawler.current_processed_count, 1)
         m.assert_called_once()
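This variant stubs the session chain by hand: the mock path mirrors a call of the form crawler.sessions.get_session().get(url).raw.read(...), so the ConnectTimeout is raised while the response body is being read rather than when the connection is opened. The expectations are the same as in the connect-timeout test above: start() returns the instance, the processed counter advances, and close() is still called once.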
Example #4
 def process(self, text, soup=None):
     self.crawler_url.crawler.add_url(
         CrawlerUrl(self.crawler_url.crawler,
                    self.redirector,
                    3,
                    self.crawler_url,
                    timeout=self.crawler_url.timeout))
Example #5
    def add_url(self, crawler_url, force=False):
        """Add url to queue"""
        if not isinstance(crawler_url, CrawlerUrl):
            crawler_url = CrawlerUrl(self,
                                     crawler_url,
                                     depth=self.depth,
                                     timeout=self.timeout)
        self.add_lock.acquire()
        url = crawler_url.url
        # Drop URLs that are invalid or outside the configured domains.
        if not url.is_valid() or not url.only_domain or not self.in_domains(
                url.only_domain):
            self.add_lock.release()
            return
        # Already queued or already finished: hand back the existing entry.
        if url.url in self.processing or url.url in self.processed:
            self.add_lock.release()
            return self.processing.get(url.url) or self.processed.get(url.url)

        fn = reraise_with_stack(crawler_url.start)
        if self.closing:
            self.add_lock.release()
            return
        if force:
            # force=True runs the job on a dedicated single-worker executor
            # instead of the shared pool.
            future = ThreadPoolExecutor(max_workers=1).submit(fn)
        else:
            future = self.submit(fn)
        self.processing[url.url] = future
        self.add_lock.release()
        return future
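For orientation, a minimal calling sketch (the bare Crawler() construction is an assumption; the add_init_urls/add_url usage follows the examples on this page). add_url accepts either a plain URL string, which it wraps in a CrawlerUrl itself, or a ready-made CrawlerUrl, and returns the Future that tracks the URL:

    # Sketch only; Crawler() construction details are assumed here.
    crawler = Crawler()
    crawler.add_init_urls('http://example.com/')               # registers the domain, then queues the URL
    future = crawler.add_url('http://example.com/admin/')      # a bare string gets wrapped in CrawlerUrl
    forced = crawler.add_url(CrawlerUrl(crawler, 'http://example.com/static/'), force=True)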
Example #6
 def test_print_results_limit(self):
     crawler = self.get_crawler(limit=1)
     crawler.current_processed_count = 1
     crawler_url = CrawlerUrl(crawler, self.url)
     crawler.results.put(GenericProcessor(None, crawler_url))
     crawler.print_results()
     self.assertTrue(crawler.closing)
Example #7
 def add_init_urls(self, *urls):
     """Add urls to queue.
     """
     for crawler_url in urls:
         if not isinstance(crawler_url, CrawlerUrl):
             crawler_url = CrawlerUrl(self, crawler_url, depth=self.depth, timeout=self.timeout)
         self.add_domain(crawler_url.url.only_domain)
         self.add_url(crawler_url)
Example #8
 def resume(self, path):
     with open(path) as resume_file:
         resume_data = json.load(resume_file)
     file_version = resume_data.get('version')
     if file_version != __version__:
         raise IncompatibleVersionError(
             'Analysis file incompatible with the current version of dirhunt. '
             'Dirhunt version: {}. File version: {}'.format(__version__, file_version)
         )
     for data in resume_data['processed']:
         crawler_url_data = data['crawler_url']
         url = crawler_url_data['url']['address']
         crawler_url = CrawlerUrl(self, url, crawler_url_data['depth'], None, crawler_url_data['exists'],
                                  crawler_url_data['type'])
         crawler_url.flags = set(crawler_url_data['flags'])
         crawler_url.processor_data = data
         self.processed[url] = crawler_url
         self.echo(data['line'])
     for url in resume_data['processing']:
         self.add_url(url)
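After json.load, the structure that resume() walks looks roughly like this (a sketch: the key names come from the code above, the concrete values are only illustrative):

    resume_data = {
        'version': '...',                 # must match dirhunt's __version__
        'processed': [
            {
                'crawler_url': {
                    'url': {'address': 'http://example.com/'},
                    'depth': 3,
                    'exists': True,
                    'type': 'directory',
                    'flags': ['200'],     # illustrative value
                },
                'line': '...',            # the output line re-echoed on resume
            },
        ],
        'processing': ['http://example.com/admin/'],
    }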
Example #9
 def add_url(self, url, depth=3, **kwargs):
     if is_url_loop(url):
         return
     return self.crawler_url.crawler.add_url(
         CrawlerUrl(self.crawler_url.crawler,
                    url,
                    depth,
                    self.crawler_url,
                    timeout=self.crawler_url.timeout,
                    **kwargs))
Example #10
 def process(self, text, soup=None):
     links = [
         full_url_address(link.attrs.get('href'), self.crawler_url.url)
         for link in soup.find_all('a')
     ]
     for link in filter(lambda x: x.url.endswith('/'), links):
         self.crawler_url.crawler.add_url(
             CrawlerUrl(self.crawler_url.crawler,
                        link,
                        3,
                        self.crawler_url,
                        type='directory',
                        timeout=self.crawler_url.timeout))
     self.files = [Url(link) for link in links]
Example #11
 def search_index_files(self):
     if self.crawler_url.type not in ['directory', None]:
         return
     crawler = self.crawler_url.crawler
     for index_file in INDEX_FILES:
         url = self.crawler_url.url.copy()
         url.set_children(index_file)
         future = self.crawler_url.crawler.add_url(
             CrawlerUrl(crawler,
                        url,
                        self.crawler_url.depth - 1,
                        self,
                        None,
                        'document',
                        timeout=self.crawler_url.timeout), True)
         if self.crawler_url.crawler.closing:
             return
         result = future.result()
         if result.exists:
             self.index_file = url
             break
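Note the bare True passed as the second argument to add_url: as the add_url implementation in Example #5 shows, force=True submits the job to its own single-worker executor, so the blocking future.result() call above does not have to wait behind everything already queued in the shared pool.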
Example #12
 def links(self, soup):
     links = [
         full_url_address(link.attrs.get('href'), self.crawler_url.url)
         for link in soup.find_all('a')
     ]
     for link in filter(bool, links):
         url = Url(link)
         if not url.is_valid():
             continue
         depth = self.crawler_url.depth
         if url.domain != self.crawler_url.url.domain or \
                 not url.path.startswith(self.crawler_url.url.directory_path):
             depth -= 1
         if depth <= 0:
             continue
         self.crawler_url.crawler.add_url(
             CrawlerUrl(self.crawler_url.crawler,
                        link,
                        depth,
                        self.crawler_url,
                        timeout=self.crawler_url.timeout))
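Design note: links that leave the current domain, or that fall outside the directory of the page being processed, lose one depth level before being queued, and anything whose remaining depth would reach zero is skipped entirely; this keeps the crawl focused on the tree around the starting URLs rather than fanning out across the whole site.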
Example #13
 def assets(self, soup):
     assets = [
         full_url_address(link.attrs.get('href'), self.crawler_url.url)
         for link in soup.find_all('link')
     ]
     assets += [
         full_url_address(script.attrs.get('src'), self.crawler_url.url)
         for script in soup.find_all('script')
     ]
     assets += [
         full_url_address(img.attrs.get('src'), self.crawler_url.url)
         for img in soup.find_all('img')
     ]
     for asset in filter(bool, assets):
         self.analyze_asset(asset)
         self.crawler_url.crawler.add_url(
             CrawlerUrl(self.crawler_url.crawler,
                        asset,
                        3,
                        self.crawler_url,
                        type='asset',
                        timeout=self.crawler_url.timeout))
Example #14
 def test_create_report(self, m):
     crawler = self.get_crawler()
     crawler.results.put(
         GenericProcessor(None, CrawlerUrl(crawler, self.url)))
     crawler.create_report(crawler.get_resume_file())
     m.assert_called_once()
Example #15
 def get_crawler_url(self):
     crawler = self.get_crawler()
     return CrawlerUrl(crawler, self.url)
Example #16
 def test_print_results(self):
     crawler = self.get_crawler()
     crawler_url = CrawlerUrl(crawler, self.url)
     crawler.results.put(GenericProcessor(None, crawler_url))
     crawler.print_results()
Example #17
 def __init__(self, error, level='ERROR'):
     super(Error, self).__init__(None, CrawlerUrl(None, ''))
     self.error = error
     self.level = level
Example #18
 def test_add_url(self):
     crawler = self.get_crawler()
     crawler.domains.add('domain.com')
     crawler_url = CrawlerUrl(crawler, self.url)
     with patch.object(ThreadPoolExecutor, 'submit') as mock_method:
         crawler.add_url(crawler_url)
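The excerpt ends right after the add_url call; a natural closing assertion (an assumption, not shown above) would verify that the URL was actually handed to the pool:

    mock_method.assert_called_once()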