Beispiel #1
0
 def _captcha_worker(self, update):
     """Send queued captchas as photo replies while the pusher flag is set.

     Every entry popped from ``self._captcha_queue`` is indexed as
     ``entry[0]`` (photo) and ``entry[1]`` (caption) and sent through
     ``update.message.reply_photo``. The queue is drained while holding
     ``self._lock``.

     :param update: object whose ``message.reply_photo`` does the sending.
     """
     pause = 0.5
     while self._pusher_on.is_set():
         with self._lock:
             while True:
                 if not self._captcha_queue:
                     break
                 try:
                     entry = self._captcha_queue.pop()
                     send_photo = update.message.reply_photo
                     send_photo(photo=entry[0], caption=entry[1])
                 except IndexError:
                     # IndexError (e.g. popping an empty queue) is ignored
                     # on purpose; the loop simply carries on.
                     pass
                 except Exception:
                     # Any other failure is logged and the worker keeps going.
                     self._log.exception('Captcha worker error')
                 shallow_sleep(pause)
         shallow_sleep(pause)
Beispiel #2
0
    def get_archive(self, type_, start=None):
        """Get archive from Zone-H by given archive type.

        Generator that walks the archive page by page, starting at ``start``
        (or ``START_PAGE`` when ``start`` is falsy), and yields every record
        produced by ``self._parser.get_records``.

        When domain filters are configured and a record's ``defaced_url``
        contains ``...`` but no ``/``, the value is replaced with
        ``defaced_url_full`` obtained from ``self._get_advanced_data``.

        On a captcha request the generator waits until ``captcha.is_active``
        becomes false; on a cookies error the cookies are re-initialised with
        ``force=True``. In both cases scraping restarts from the failed page.

        :param type_: archive type, passed to the API and captcha manager.
        :param start: page to start from; defaults to ``START_PAGE``.
        :raises exc.ScraperError: on any other error while reading records.
        """
        domains = _CONF['zoneh']['filters']['domains']
        self._api.init_cookies()
        page_queue = deque([start or START_PAGE])
        while page_queue:
            page_num = page_queue.pop()
            html_page = self._api.get_page(page_num, type_)
            next_page = None
            try:
                for record, next_page in self._parser.get_records(html_page):
                    url = record['defaced_url']
                    # Short-circuit: no URL inspection without domain filters.
                    if domains and '...' in url and '/' not in url:
                        data = self._get_advanced_data(record['mirror'])
                        record['defaced_url'] = data['defaced_url_full']
                        shallow_sleep(sleep_time())
                    yield record
            except exc.HTMLParserCaptchaRequest:
                captcha_manager.init_captcha(type_, page_num)
                while captcha.is_active:
                    shallow_sleep(1)
                yield from self.get_archive(type_, page_num)
                # The delegated call already walked every page from page_num
                # onwards. Falling through would re-queue a next_page that
                # was set before the failure and scrape those pages twice.
                return
            except exc.HTMLParserCookiesError:
                self._api.init_cookies(force=True)
                shallow_sleep(2)
                yield from self.get_archive(type_, page_num)
                # Same as above: the delegated call finished the archive.
                return
            except Exception as err:
                err_msg = 'Exception during getting record'
                _log.exception(err_msg)
                raise exc.ScraperError(err_msg) from err

            if next_page:
                page_queue.appendleft(next_page)
            shallow_sleep(sleep_time())
Beispiel #3
0
 def _pusher_worker(self, update):
     """Send queued records to the chat while the pusher flag is set.

     Records are popped from ``self._processor.push_queue`` under
     ``self._lock``, numbered sequentially, formatted with ``Formatter``
     and sent as an HTML reply with an 'Open mirror' inline button.

     A failure while formatting or sending one record is logged and the
     worker moves on to the next record, mirroring the error handling of
     ``_captcha_worker``.

     :param update: object whose ``message.reply_html`` does the sending.
     """
     rec_num = 0
     while self._pusher_on.is_set():
         with self._lock:
             while self._processor.push_queue:
                 # Guard only the pop: an IndexError raised later, during
                 # formatting or sending, must not be silently swallowed.
                 try:
                     record = self._processor.push_queue.pop()
                 except IndexError:
                     # Queue was emptied between the check and the pop.
                     break
                 rec_num += 1
                 try:
                     formatter = Formatter(record, rec_num)
                     rec_formatted = formatter.format()
                     button = InlineKeyboardButton(
                         'Open mirror', url=formatter.get_mirror_url())
                     reply_markup = InlineKeyboardMarkup([[button]])
                     update.message.reply_html(rec_formatted,
                                               reply_markup=reply_markup)
                 except Exception:
                     # Keep the worker alive; one bad record or a failed
                     # send should not stop delivery of the rest.
                     self._log.exception('Pusher worker error')
                 shallow_sleep(1)
         shallow_sleep(1)
Beispiel #4
0
    def _run(self):
        """Real thread run method.

        While ``self._run_trigger`` is set, each cycle (under ``self._lock``)
        sends the captcha if one is active and not yet sent, then drains
        ``self._push_queue``, numbering every record and handing it to
        ``self._process_record``.
        """
        rec_num = 0
        while self._run_trigger.is_set():
            with self._lock:
                self._log.debug('Captcha is active: %s', captcha.is_active)
                self._log.debug('Captcha is sent: %s', captcha.is_sent)
                if captcha.is_active and not captcha.is_sent:
                    self._send_captcha(self._update)

                while self._push_queue:
                    try:
                        record = self._push_queue.pop()
                    except IndexError:
                        # Nothing was popped, so there is no record to
                        # process. Continuing here would hit an unbound
                        # name or re-send the previous record.
                        break

                    rec_num += 1
                    self._process_record(record, rec_num)
                    shallow_sleep(1)
            shallow_sleep(1)
Beispiel #5
0
    def _worker(self):
        """Scrape the archive repeatedly while the processor flag is set.

        Each pass iterates ``self._scraper.get_archive`` and stops at the
        first record already present in ``self.temp_queue`` (or as soon as
        the processor flag is cleared). New records are logged as JSON,
        remembered in ``self.temp_queue`` and, when they satisfy
        ``self._filter``, queued in ``self.push_queue``. After a pass the
        worker waits ``self._rescan_period`` seconds, polling the flag so it
        can stop early.

        :raises zoneh.exceptions.ProcessorError: on any error during a pass.
        """
        while self._processor_on.is_set():
            try:
                for record in self._scraper.get_archive(_type=self._arch_type):
                    if (not self._processor_on.is_set()
                            or record in self.temp_queue):
                        break
                    self._log.debug(json.dumps(record))
                    self.temp_queue.appendleft(record)
                    if self._filter.satisfy(record):
                        self.push_queue.appendleft(record)

                # A monotonic clock keeps the wait immune to wall-clock
                # adjustments that could shorten or stretch the pause.
                deadline = time.monotonic() + self._rescan_period
                while time.monotonic() < deadline:
                    if not self._processor_on.is_set():
                        break
                    shallow_sleep(0.5)

            except Exception as err:
                err_msg = 'Processor thread received error ' \
                          'during handling scrape records'
                self._log.exception(err_msg)
                raise zoneh.exceptions.ProcessorError(err) from err
Beispiel #6
0
 def get_archive(self, _type, start=None):
     """Yield archive records of the given type, one page after another.

     The page URL template comes from ``const.ARCHIVE_TYPES[_type]['page']``
     and scraping begins at ``start`` (or ``const.START_PAGE`` when falsy).
     Session cookies are initialised first if the session has none.

     When domain filters are configured and a record's ``defaced_url``
     contains ``...`` but no ``/``, it is replaced with ``defaced_url_full``
     taken from ``self._get_advanced_data``.

     A captcha request sends the captcha, then waits until
     ``self.got_captcha`` is cleared; a cookies error purges and
     re-initialises the cookies. Both then restart from the failed page.

     :param _type: key into ``const.ARCHIVE_TYPES``.
     :param start: page to start from.
     :raises zoneh.exceptions.ScraperError: on any other error.
     """
     url_template = const.ARCHIVE_TYPES[_type]['page']
     domains = _CONF['zoneh']['filters']['domains']
     if not self._session.cookies:
         self._initialize_cookies()
     try:
         first_page = start or const.START_PAGE
         pending = deque([first_page])
         while pending:
             current = pending.pop()
             target = url_template.format(page_num=current)
             response = self._make_request(target)
             following = None
             parsed = self._parser.get_records(response.content)
             for record, following in parsed:
                 defaced = record['defaced_url']
                 truncated = all(
                     (domains, '...' in defaced, '/' not in defaced))
                 if truncated:
                     details = self._get_advanced_data(record['mirror'])
                     record['defaced_url'] = details['defaced_url_full']
                     shallow_sleep(sleep_time())
                 yield record
             if following:
                 pending.appendleft(following)
             shallow_sleep(sleep_time())
     except zoneh.exceptions.HTMLParserCaptchaRequest:
         self._send_captcha()
         self.got_captcha = True
         self._captcha_page = (_type, current)
         # Poll until the flag is cleared elsewhere, then retry this page.
         while self.got_captcha:
             shallow_sleep(1)
         yield from self.get_archive(_type, current)
     except zoneh.exceptions.HTMLParserCookiesError:
         # Start over with fresh cookies from the page that failed.
         self._purge_cookies()
         self._initialize_cookies()
         yield from self.get_archive(_type, current)
     except Exception:
         message = 'Exception during getting record'
         _LOG.exception(message)
         raise zoneh.exceptions.ScraperError(message)