Code Example #1
    def test_print_results_limit(self):
        crawler = self.get_crawler(limit=1)
        crawler.current_processed_count = 1
        crawler_url = CrawlerUrl(crawler, self.url)
        crawler.results.put(GenericProcessor(None, crawler_url))
        crawler.print_results()
        self.assertTrue(crawler.closing)
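These tests lean on a get_crawler() helper and a self.url fixture that the excerpt does not show. A minimal sketch of such a harness, assuming get_crawler simply forwards keyword options like limit to dirhunt's Crawler (the helper body and the URL value here are assumptions, not dirhunt's actual test code):

    # Hypothetical test base; get_crawler and url are inferred from usage
    # in the examples on this page, not copied from dirhunt's test suite.
    import unittest
    from dirhunt.crawler import Crawler

    class CrawlerTestBase(unittest.TestCase):
        url = 'http://example.com/path/'

        def get_crawler(self, **kwargs):
            # Pass options such as limit=1 straight through to the Crawler.
            return Crawler(**kwargs)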
Code Example #2
    def start(self):
        from dirhunt.processors import get_processor, GenericProcessor, Error

        session = self.crawler.sessions.get_session()
        try:
            resp = session.get(self.url.url, stream=True, timeout=TIMEOUT, allow_redirects=False)
        except RequestException as e:
            self.crawler.results.put(Error(self, e))
            self.close()
            return self

        self.set_type(resp.headers.get('Content-Type'))
        self.flags.add(str(resp.status_code))
        text = ''
        soup = None

        if resp.status_code < 300 and self.maybe_directory():
            text = resp.raw.read(MAX_RESPONSE_SIZE, decode_content=True)
            soup = BeautifulSoup(text, 'html.parser')
        if self.maybe_directory():
            processor = get_processor(resp, text, self, soup) or GenericProcessor(resp, self)
            processor.process(text, soup)
            self.crawler.results.put(processor)
            self.flags.update(processor.flags)
        # TODO: We can look at processor.index_file. If it exists and is a 200, then the directory exists.
        if self.exists is None and resp.status_code < 404:
            self.exists = True
        self.add_self_directories(True if (not self.maybe_rewrite() and self.exists) else None,
                                  'directory' if not self.maybe_rewrite() else None)
        self.close()
        return self
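Since start() is an instance method of CrawlerUrl, exercising it directly only needs a crawler and a URL. A minimal driver sketch under that assumption (dirhunt itself schedules these calls on the crawler's worker pool rather than invoking them by hand):

    # Illustrative direct invocation; constructing Crawler with defaults is
    # an assumption for this sketch, not how dirhunt normally runs it.
    from dirhunt.crawler import Crawler
    from dirhunt.crawler_url import CrawlerUrl

    crawler = Crawler()
    crawler_url = CrawlerUrl(crawler, 'http://example.com/admin/')
    crawler_url.start()       # fetch, classify, and queue a processor result
    print(crawler_url.flags)  # e.g. {'200', 'directory'}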
Code Example #3
File: crawler_url.py Project: yotube113355/dirhunt
    def start(self):
        from dirhunt.processors import get_processor, GenericProcessor, Error, ProcessIndexOfRequest

        session = self.crawler.sessions.get_session()
        try:
            resp = session.get(self.url.url,
                               stream=True,
                               verify=False,
                               timeout=self.timeout,
                               allow_redirects=False)
        except RequestException as e:
            self.crawler.current_processed_count += 1
            self.crawler.results.put(Error(self, e))
            self.close()
            return self

        self.set_type(resp.headers.get('Content-Type'))
        self.flags.add(str(resp.status_code))

        text = ''
        soup = None
        processor = None
        if resp.status_code < 300 and self.must_be_downloaded(resp):
            try:
                text = resp.raw.read(MAX_RESPONSE_SIZE, decode_content=True)
            except (RequestException, ReadTimeoutError, socket.timeout) as e:
                self.crawler.current_processed_count += 1
                self.crawler.results.put(Error(self, e))
                self.close()
                return self
            soup = BeautifulSoup(text, 'html.parser') if resp.headers.get(
                'Content-Type') == 'text/html' else None
        if self.must_be_downloaded(resp):
            processor = get_processor(resp, text, self,
                                      soup) or GenericProcessor(resp, self)
            processor.process(text, soup)
            self.flags.update(processor.flags)
        if self.maybe_directory():
            self.crawler.results.put(processor)
        if processor is not None:
            self.processor_data = processor.json()
        if processor and isinstance(processor, ProcessIndexOfRequest):
            self.crawler.index_of_processors.append(processor)
        else:
            self.crawler.current_processed_count += 1
        # TODO: We can look at processor.index_file. If it exists and is a 200, then the directory exists.
        if self.exists is None and resp.status_code < 404:
            self.exists = True
        self.add_self_directories(
            True if (not self.maybe_rewrite() and self.exists) else None,
            'directory' if not self.maybe_rewrite() else None)
        self.close()
        return self
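Both versions of start() rely on a small processor protocol: construction from (response, crawler_url), a process(text, soup) method, a flags set, and, in the newer version, a json() serializer whose output is stored on processor_data. A sketch of that interface as inferred from the calls above (the class name and the json() payload are assumptions, not dirhunt's actual processor base class):

    # Processor shape inferred from how start() uses it; illustrative only.
    class SketchProcessor:
        def __init__(self, response, crawler_url):
            self.response = response
            self.crawler_url = crawler_url
            self.flags = set()

        def process(self, text, soup=None):
            # Inspect the body (and parsed soup, if any) and record findings.
            if soup is not None and soup.title:
                self.flags.add('has_title')

        def json(self):
            # Serializable summary; Code Example #3 keeps this on processor_data.
            return {'flags': sorted(self.flags)}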
Code Example #4
File: test_crawler.py Project: vishalkrtr/dirhunt
    def test_print_results(self):
        crawler = self.get_crawler()
        crawler_url = CrawlerUrl(crawler, self.url)
        crawler.results.put(GenericProcessor(None, crawler_url))
        crawler.print_results()
Code Example #5
    def test_create_report(self, m):
        crawler = self.get_crawler()
        crawler.results.put(
            GenericProcessor(None, CrawlerUrl(crawler, self.url)))
        crawler.create_report(crawler.get_resume_file())
        m.assert_called_once()
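The extra m parameter in test_create_report is the usual signature of a unittest.mock.patch decorator injecting a mock, which assert_called_once() then checks ran exactly once. A sketch of that wiring with a placeholder patch target (the excerpt does not reveal what dirhunt's test suite actually patches):

    # The dotted target string below is a placeholder, not dirhunt code.
    from unittest.mock import patch

    @patch('module.under.test.target')  # placeholder target
    def test_create_report(self, m):
        ...
        m.assert_called_once()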