Ejemplo n.º 1
0
    def _run(self):
        """
        Main loop of the warc writer thread.

        Polls ``self.recorded_url_q`` with a 0.5 second timeout. Each
        dequeued url that passes ``self._filter_accepts`` is optionally
        decorated with dedup info, written through ``self.writer_pool`` and
        handed to ``self._final_tasks``. When the queue is found empty the
        thread either records the idle time and gives the writer pool a
        chance to roll over, or, if ``self.stop`` is set, leaves the loop and
        closes the writers.

        Any ``Exception`` raised while processing is logged at critical
        level and the loop resumes after a 0.5 second pause.
        """
        while not self.stop.is_set():
            try:
                self.name = 'WarcWriterThread(tid={})'.format(warcprox.gettid())
                while True:
                    try:
                        if self.stop.is_set():
                            # shutting down: report how much is left to
                            # drain, but only at multiples of 50 to limit
                            # log noise
                            qsize = self.recorded_url_q.qsize()
                            if qsize % 50 == 0:
                                self.logger.info("%s urls left to write", qsize)

                        recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
                        self.idle = None
                        try:
                            if self._filter_accepts(recorded_url):
                                if self.dedup_db:
                                    warcprox.dedup.decorate_with_dedup_info(self.dedup_db,
                                            recorded_url, base32=self.options.base32)
                                records = self.writer_pool.write_records(recorded_url)
                                self._final_tasks(recorded_url, records)
                        finally:
                            # release resources in a timely fashion, also
                            # when writing failed: the url has already been
                            # dequeued and will not be retried
                            recorder = recorded_url.response_recorder
                            if recorder and recorder.tempfile:
                                recorder.tempfile.close()
                    except queue.Empty:
                        if self.stop.is_set():
                            break
                        self.idle = time.time()
                        self.writer_pool.maybe_idle_rollover()

                self.logger.info('WarcWriterThread shutting down')
                self.writer_pool.close_writers()
            except Exception:
                # Exception rather than a bare except, so SystemExit and
                # KeyboardInterrupt are not swallowed by the retry loop.
                # NOTE(review): if the error happens after self.stop was set,
                # the outer loop condition ends the thread without
                # close_writers() having been called -- confirm whether that
                # is acceptable.
                self.logger.critical("WarcWriterThread will try to continue after unexpected error", exc_info=True)
                time.sleep(0.5)
Ejemplo n.º 2
0
 def __init__(self, request, client_address, server):
     """
     Set up handler state, then delegate to ``BaseHTTPRequestHandler``.

     The current thread is renamed to carry the tid, the UTC start time
     and the client's address, so log lines can be tied to a connection.
     The socket timeout is taken from ``self._socket_timeout``.
     """
     tid = warcprox.gettid()
     started = datetime.datetime.utcnow().isoformat()
     client_host = client_address[0]
     client_port = client_address[1]
     thread_label = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(
         tid, started, client_host, client_port)
     threading.current_thread().name = thread_label

     self.is_connect = False
     self._headers_buffer = []
     request.settimeout(self._socket_timeout)

     # the attributes above are assigned before the base initializer runs
     http_server.BaseHTTPRequestHandler.__init__(
         self, request, client_address, server)
Ejemplo n.º 3
0
    def _run(self):
        """
        Main loop of the writer thread.

        Appends the tid from ``warcprox.gettid()`` to the thread name, then
        polls ``self.recorded_url_q`` with a 0.5 second timeout. Urls that
        pass ``self._filter_accepts`` are optionally decorated with dedup
        info and written through ``self.writer_pool``;
        ``self._final_tasks`` runs for every dequeued url and receives an
        empty list of records for urls that were filtered out. The loop ends
        once ``self.stop`` is set and the queue has been found empty, after
        which ``self._shutdown()`` is called.

        Error handling: an ``OSError`` with errno 28 (no space left on
        device) is treated as fatal -- it is logged, ``self._shutdown()`` is
        called and ``sys.exit(1)`` is raised. Any other ``Exception`` is
        logged at critical level and the loop resumes after 0.5 seconds.
        """
        self.name = '%s(tid=%s)' % (self.name, warcprox.gettid())
        while not self.stop.is_set():
            try:
                while True:
                    try:
                        if self.stop.is_set():
                            # shutting down: report drain progress, only at
                            # multiples of 50 to limit log noise
                            qsize = self.recorded_url_q.qsize()
                            if qsize % 50 == 0:
                                self.logger.info("%s urls left to write",
                                                 qsize)

                        recorded_url = self.recorded_url_q.get(block=True,
                                                               timeout=0.5)
                        try:
                            records = []
                            self.idle = None
                            if self._filter_accepts(recorded_url):
                                if self.dedup_db:
                                    warcprox.dedup.decorate_with_dedup_info(
                                        self.dedup_db,
                                        recorded_url,
                                        base32=self.options.base32)
                                records = self.writer_pool.write_records(
                                    recorded_url)

                            self._final_tasks(recorded_url, records)
                        finally:
                            # release resources in a timely fashion, also
                            # when processing failed: the url has already
                            # been dequeued and will not be retried
                            recorder = recorded_url.response_recorder
                            if recorder and recorder.tempfile:
                                recorder.tempfile.close()

                        self.writer_pool.maybe_idle_rollover()
                    except queue.Empty:
                        if self.stop.is_set():
                            break
                        self.idle = time.time()

                self.logger.info('WarcWriterThread shutting down')
                self._shutdown()
            except Exception as e:
                if isinstance(e, OSError) and e.errno == 28:
                    # OSError: [Errno 28] No space left on device
                    self.logger.critical(
                        'shutting down due to fatal problem: %s: %s',
                        e.__class__.__name__, e)
                    self._shutdown()
                    # NOTE(review): sys.exit() raises SystemExit; if _run
                    # executes in a non-main thread this ends only that
                    # thread -- confirm this is the intended effect
                    sys.exit(1)

                # NOTE(review): if the error happens after self.stop was set,
                # the outer loop condition ends the thread without
                # self._shutdown() having been called -- confirm
                self.logger.critical(
                    'WarcWriterThread will try to continue after unexpected '
                    'error',
                    exc_info=True)
                time.sleep(0.5)
Ejemplo n.º 4
0
 def _wrap_process_url(self, recorded_url):
     if not getattr(self.thread_local, 'name_set', False):
         threading.current_thread().name = 'WarcWriterThread(tid=%s)' % warcprox.gettid()
         self.thread_local.name_set = True
     if self.options.profile:
         import cProfile
         if not hasattr(self.thread_local, 'profiler'):
             self.thread_local.profiler = cProfile.Profile()
             tid = threading.current_thread().ident
             self.thread_profilers[tid] = self.thread_local.profiler
         self.thread_local.profiler.enable()
         self._process_url(recorded_url)
         self.thread_local.profiler.disable()
     else:
         self._process_url(recorded_url)
Ejemplo n.º 5
0
    def _run(self):
        """
        Writer thread loop: move recorded urls from the queue into warcs.

        The thread name gets the tid from ``warcprox.gettid()`` appended.
        ``self.recorded_url_q`` is then polled with a 0.5 second timeout;
        every dequeued url is passed to ``self._final_tasks`` together with
        the records written for it (an empty list if
        ``self._filter_accepts`` rejected it). When the queue is found empty
        and ``self.stop`` is set, the loop ends and ``self._shutdown()`` is
        called; otherwise the idle timestamp is recorded.

        An ``OSError`` with errno 28 (no space left on device) is fatal: it
        is logged, ``self._shutdown()`` is called and ``sys.exit(1)`` is
        raised. Any other ``Exception`` is logged at critical level and the
        loop resumes after 0.5 seconds.
        """
        self.name = '%s(tid=%s)'% (self.name, warcprox.gettid())
        while not self.stop.is_set():
            try:
                while True:
                    try:
                        if self.stop.is_set():
                            # draining during shutdown: log progress only at
                            # multiples of 50 to limit log noise
                            qsize = self.recorded_url_q.qsize()
                            if qsize % 50 == 0:
                                self.logger.info("%s urls left to write", qsize)

                        recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
                        try:
                            records = []
                            self.idle = None
                            if self._filter_accepts(recorded_url):
                                if self.dedup_db:
                                    warcprox.dedup.decorate_with_dedup_info(self.dedup_db,
                                            recorded_url, base32=self.options.base32)
                                records = self.writer_pool.write_records(recorded_url)

                            self._final_tasks(recorded_url, records)
                        finally:
                            # try to release resources in a timely fashion;
                            # done in a finally because a url whose
                            # processing failed is not retried, so nothing
                            # else would close its tempfile
                            recorder = recorded_url.response_recorder
                            if recorder and recorder.tempfile:
                                recorder.tempfile.close()

                        self.writer_pool.maybe_idle_rollover()
                    except queue.Empty:
                        if self.stop.is_set():
                            break
                        self.idle = time.time()

                self.logger.info('WarcWriterThread shutting down')
                self._shutdown()
            except Exception as e:
                if isinstance(e, OSError) and e.errno == 28:
                    # OSError: [Errno 28] No space left on device
                    self.logger.critical(
                            'shutting down due to fatal problem: %s: %s',
                            e.__class__.__name__, e)
                    self._shutdown()
                    # NOTE(review): sys.exit() raises SystemExit; if _run
                    # executes in a non-main thread this ends only that
                    # thread -- confirm this is the intended effect
                    sys.exit(1)

                # NOTE(review): when the error occurs after self.stop was
                # set, the outer loop condition ends the thread without
                # self._shutdown() having been called -- confirm
                self.logger.critical(
                    'WarcWriterThread will try to continue after unexpected '
                    'error', exc_info=True)
                time.sleep(0.5)
Ejemplo n.º 6
0
 def __init__(self, request, client_address, server):
     """
     Prepare per-connection state and hand off to the base handler.

     Renames the current thread so that it identifies the tid, the UTC
     time the handler started and the client's host and port, then
     applies a fixed 60 second timeout to the request socket.
     """
     name_fields = (
         warcprox.gettid(),
         datetime.datetime.utcnow().isoformat(),
         client_address[0],
         client_address[1],
     )
     threading.current_thread().name = (
         'MitmProxyHandler(tid={},started={},client={}:{})'.format(
             *name_fields))
     self.is_connect = False
     self._headers_buffer = []
     request.settimeout(60)  # XXX what value should this have?
     http_server.BaseHTTPRequestHandler.__init__(
         self, request, client_address, server)