def __init__(self, spider, socket_number):
    """
    Build the multicurl state, the pool of curl handles and the workers.

    Args:
        spider: spider object; it is stored on the instance as
            ``self.spider``
        socket_number: how many ``pycurl.Curl`` handles to create
            for the pool
    """
    self.spider = spider
    self.socket_number = socket_number

    self.multi = pycurl.CurlMulti()
    self.multi.handles = []

    # Bookkeeping containers: idle handles, in-flight requests keyed by
    # id(curl), and a usage counter per handle (also keyed by id(curl))
    self.freelist = []
    self.registry = {}
    self.connection_count = {}

    self.sigint_handler = PycurlSigintHandler()
    self.network_op_lock = Lock()

    # Fill the pool; every handle starts with a zero usage counter
    pool = [pycurl.Curl() for _ in six.moves.range(self.socket_number)]
    self.connection_count.update({id(handle): 0 for handle in pool})
    self.freelist.extend(pool)

    # Two workers: one feeds tasks into the transport, the other
    # drives the multicurl event loop
    self.spawner = self.create_worker(self.spawner_callback)
    self.async_loop = self.create_worker(self.async_loop_callback)
    self.register_workers(self.spawner, self.async_loop)
def test_record(self):
    """Everything written to stderr inside record() is captured."""
    chunks = ('one-1', 'two-2')
    recorder = PycurlSigintHandler()
    with recorder.record():
        # sys.stderr is looked up on every write on purpose: the
        # recorder is expected to capture writes made through it
        for chunk in chunks:
            sys.stderr.write(chunk)
    captured = recorder.get_output()
    self.assertEqual('one-1two-2', captured)
def request(self):
    """
    Run the prepared request on ``self.curl``.

    A ``pycurl.error`` is passed to ``build_grab_exception``; when that
    helper returns a truthy exception it is raised, otherwise the pycurl
    error is swallowed. Any other exception is re-raised as
    ``error.GrabInternalError`` with the original traceback.
    ``self.curl.grab_callback_interrupted`` is reset to ``False``
    whatever the outcome.
    """
    interrupt_guard = PycurlSigintHandler()
    try:
        with interrupt_guard.handle_sigint():
            self.curl.perform()
    except pycurl.error as curl_err:
        grab_err = build_grab_exception(curl_err, self.curl)
        # A falsy result means there is nothing to raise for this error
        if grab_err:
            raise grab_err  # pylint: disable=raising-bad-type
    except Exception as unexpected:  # pylint: disable=broad-except
        wrapped = error.GrabInternalError(unexpected)
        original_tb = sys.exc_info()[2]
        six.reraise(error.GrabInternalError, wrapped, original_tb)
    finally:
        self.curl.grab_callback_interrupted = False
def request(self):
    """
    Run the prepared request on ``self.curl`` and translate curl errors.

    Raises:
        error.GrabTimeoutError: pycurl error code 28
        error.GrabConnectionError: pycurl error code 7
        error.GrabAuthError: pycurl error code 67
        error.GrabTooManyRedirectsError: pycurl error code 47
        error.GrabCouldNotResolveHostError: pycurl error code 6
        error.GrabNetworkError: any other pycurl error code, including
            code 23 when no callback interruption was flagged
        error.GrabInternalError: any non-pycurl exception (the original
            traceback is preserved)
    """
    interrupt_guard = PycurlSigintHandler()
    try:
        with interrupt_guard.handle_sigint():
            self.curl.perform()
    except pycurl.error as curl_err:
        code = curl_err.args[0]
        # CURLE_WRITE_ERROR (23) is reported when writing received data
        # fails or when a write callback returns an error to libcurl.
        # It is expected, and must be ignored, if the
        # grab_callback_interrupted flag is set (this happens when the
        # nohead or nobody options are enabled).
        #
        # The same code is also raised when curl receives
        # KeyboardInterrupt while running a callback
        # (WRITEFUNCTION, HEADERFUNCTION, etc), see
        # https://github.com/pycurl/pycurl/issues/413
        expected_interruption = (
            code == 23
            and getattr(self.curl, 'grab_callback_interrupted', None) is True
        )
        if expected_interruption:
            # FIXME: is it set automatically?
            self.curl.grab_callback_interrupted = False
        else:
            specific_errors = {
                28: error.GrabTimeoutError,
                7: error.GrabConnectionError,
                67: error.GrabAuthError,
                47: error.GrabTooManyRedirectsError,
                6: error.GrabCouldNotResolveHostError,
            }
            # Codes without a dedicated class (23 included) fall back
            # to the generic network error
            exc_class = specific_errors.get(code, error.GrabNetworkError)
            raise exc_class(code, curl_err.args[1])
    except Exception as unexpected:  # pylint: disable=broad-except
        wrapped = error.GrabInternalError(unexpected)
        original_tb = sys.exc_info()[2]
        six.reraise(error.GrabInternalError, wrapped, original_tb)
def __init__(self, spider, socket_number):
    # pylint: disable=unused-argument
    """
    Prepare the multicurl object and a pool of idle curl handles.

    Args:
        spider: argument is not used in multicurl transport
        socket_number: how many ``pycurl.Curl`` handles to create
            for the pool
    """
    self.socket_number = socket_number

    self.multi = pycurl.CurlMulti()
    self.multi.handles = []

    # idle handles / in-flight requests / per-handle usage counters;
    # the two dicts are keyed by id(curl)
    self.freelist = []
    self.registry = {}
    self.connection_count = {}

    self.network_op_lock = Lock()
    self.sigint_handler = PycurlSigintHandler()

    # Create curl instances, each one starting with a zero usage counter
    handles = [pycurl.Curl() for _ in six.moves.range(self.socket_number)]
    self.connection_count.update(
        dict.fromkeys((id(handle) for handle in handles), 0))
    self.freelist.extend(handles)
class NetworkServiceMulticurl(BaseService):
    """
    Network service that runs spider requests through one pycurl
    multi object.

    Two workers are created: the spawner takes tasks from the spider
    queue, submits them to the transport and forwards finished results
    to the task dispatcher; the async loop repeatedly drives the
    multicurl object.

    Calls into the multi object are serialized with
    ``network_op_lock``. The lock is always taken with a ``with``
    statement so it is released on every exit path and ``release()``
    is never attempted on a lock that was not acquired.
    """

    def __init__(self, spider, socket_number):
        """
        Args:
            spider: spider object; the spawner worker uses it to fetch
                tasks, build Grab objects and dispatch results
            socket_number: number of ``pycurl.Curl`` handles to create
                for the pool
        """
        self.spider = spider
        self.socket_number = socket_number
        self.multi = pycurl.CurlMulti()
        self.multi.handles = []
        # Idle curl handles ready to take a task
        self.freelist = []
        # id(curl) -> {'grab', 'grab_config_backup', 'task'} for
        # requests currently in flight
        self.registry = {}
        # id(curl) -> number of requests started on that handle
        self.connection_count = {}
        self.sigint_handler = PycurlSigintHandler()
        self.network_op_lock = Lock()

        # Create curl instances
        for _ in six.moves.range(self.socket_number):
            curl = pycurl.Curl()
            self.connection_count[id(curl)] = 0
            self.freelist.append(curl)

        self.spawner = self.create_worker(self.spawner_callback)
        self.async_loop = self.create_worker(self.async_loop_callback)
        self.register_workers(self.spawner, self.async_loop)

    def async_loop_callback(self, worker):
        """Drive the multicurl object until the worker is stopped."""
        while not worker.stop_event.is_set():
            worker.process_pause_signal()
            self.process_handlers()
            time.sleep(0.01)

    def spawner_callback(self, worker):
        """
        Feed tasks into the transport and forward finished results.

        On every iteration: if a curl handle is free, take a task from
        the spider queue and either submit it or, when the task limits
        check fails, log the rejection and call the task's fallback
        handler (if any). Then push all available results to the task
        dispatcher input queue.
        """
        while not worker.stop_event.is_set():
            worker.process_pause_signal()
            if self.get_free_threads_number():
                task = self.spider.get_task_from_queue()
                if task is None or task is True:
                    # No task to process right now
                    time.sleep(0.1)
                else:
                    worker.is_busy_event.set()
                    try:
                        # pylint: disable=no-member
                        task.network_try_count += 1
                        # pylint: enable=no-member
                        is_valid, reason = self.spider.check_task_limits(
                            task)
                        if is_valid:
                            grab = self.spider.setup_grab_for_task(task)
                            self.spider.submit_task_to_transport(task, grab)
                        else:
                            self.spider.log_rejected_task(task, reason)
                            # pylint: disable=no-member
                            handler = task.get_fallback_handler(self.spider)
                            # pylint: enable=no-member
                            if handler:
                                handler(task)
                    finally:
                        worker.is_busy_event.clear()

            for result, task in self.iterate_results():
                self.spider.task_dispatcher.input_queue.put(
                    (result, task, None),
                )

    def ready_for_task(self):
        """Return the number of idle curl handles."""
        return len(self.freelist)

    def get_free_threads_number(self):
        """Return the number of idle curl handles."""
        return len(self.freelist)

    def get_active_threads_number(self):
        """Return the number of curl handles that are not idle."""
        return self.socket_number - len(self.freelist)

    def process_connection_count(self, curl):
        """
        Count one more use of ``curl`` and replace worn-out handles.

        Returns:
            The same handle, or a fresh ``pycurl.Curl`` (with its
            counter set to 1) once the handle has been used more than
            100 times.
        """
        curl_id = id(curl)
        self.connection_count[curl_id] += 1
        if self.connection_count[curl_id] > 100:
            del self.connection_count[curl_id]
            del curl
            new_curl = pycurl.Curl()
            self.connection_count[id(new_curl)] = 1
            return new_curl
        else:
            return curl

    def start_task_processing(self, task, grab, grab_config_backup):
        """
        Configure an idle curl handle for ``task`` and start the request.

        Raises:
            Exception: whatever ``grab.prepare_request()``, the
                ``setopt`` call or ``grab.log_request()`` raises; the
                handle is returned to the free list before re-raising.
        """
        curl = self.process_connection_count(self.freelist.pop())

        self.registry[id(curl)] = {
            'grab': grab,
            'grab_config_backup': grab_config_backup,
            'task': task,
        }
        grab.transport.curl = curl
        try:
            grab.prepare_request()
            # Enable pycurl built-in redirect processing
            # In non-spider mode Grab handles redirects itself
            # by parsing headers and following Location URLs
            # In spider mode that would require to create
            # new Task objects for each 30* redirect
            # Maybe that would be implemented in future
            # For now multicurl transport just uses builtin pycurl
            # ability to handle 30* redirects
            grab.transport.curl.setopt(
                pycurl.FOLLOWLOCATION,
                1 if grab.config['follow_location'] else 0)
            grab.log_request()
        except Exception:
            # If some error occurred while processing the request arguments
            # then we should put curl object back to free list
            del self.registry[id(curl)]
            self.freelist.append(curl)
            raise
        else:
            # Add configured curl instance to multi-curl processor
            with self.network_op_lock:
                with self.sigint_handler.handle_sigint():
                    self.multi.add_handle(curl)

    def process_handlers(self):
        """
        Wait for socket activity (if any sockets are registered) and
        let the multicurl object perform pending transfers.
        """
        with self.network_op_lock:
            with self.sigint_handler.handle_sigint():
                rlist, wlist, xlist = self.multi.fdset()
            if rlist or wlist or xlist:
                with self.sigint_handler.handle_sigint():
                    timeout = self.multi.timeout()
                if timeout and timeout > 0:
                    # multi.timeout() is divided by 1000 here, i.e. it is
                    # treated as milliseconds while select() takes seconds
                    select.select(rlist, wlist, xlist, timeout / 1000.0)
            while True:
                with self.sigint_handler.handle_sigint():
                    status, _ = self.multi.perform()
                if status != pycurl.E_CALL_MULTI_PERFORM:
                    break

    def iterate_results(self):
        """
        Yield ``(result, task)`` for every finished transfer.

        ``result`` is a dict with the keys ``ok``, ``ecode``, ``emsg``,
        ``error_abbr``, ``exc``, ``grab`` and ``grab_config_backup``.
        After a result has been consumed its curl handle is removed
        from the multi object, reset and returned to the free list.
        """
        while True:
            with self.network_op_lock:
                with self.sigint_handler.handle_sigint():
                    queued_messages, ok_list, fail_list = (
                        self.multi.info_read())

            results = []
            for curl in ok_list:
                results.append((True, curl, None, None, None))
            for curl, ecode, emsg in fail_list:
                curl.grab_callback_interrupted = False
                # Raise and catch the error so that
                # build_grab_exception receives an exception that was
                # actually raised
                try:
                    raise pycurl.error(ecode, emsg)
                except Exception as exc:  # pylint: disable=broad-except
                    grab_exc = build_grab_exception(exc, curl)
                # grab_exc could be None if the pycurl error
                # was expected (could be in case of
                # body_maxsize and other options)
                if grab_exc:
                    results.append((False, curl, ecode, emsg, grab_exc))
                else:
                    results.append((True, curl, None, None, None))

            for is_ok, curl, ecode, emsg, grab_exc in results:
                # FORMAT: {is_ok, grab, grab_config_backup, task,
                #          ecode, emsg, error_abbr, exc}
                curl_id = id(curl)
                task = self.registry[curl_id]['task']
                grab = self.registry[curl_id]['grab']
                grab_config_backup = (
                    self.registry[curl_id]['grab_config_backup'])

                with self.network_op_lock:
                    try:
                        grab.process_request_result()
                    except GrabTooManyRedirectsError:
                        ecode = ERROR_TOO_MANY_REDIRECTS
                        emsg = 'Too many meta refresh redirects'
                        is_ok = False

                grab.doc.error_code = ecode
                grab.doc.error_msg = emsg
                grab.exception = grab_exc

                # Free resources
                del self.registry[curl_id]
                grab.transport.curl = None

                if is_ok:
                    error_abbr = None
                else:
                    error_abbr = ERRNUM_TAG.get(ecode, 'unknown-%d' % ecode)
                yield {
                    'ok': is_ok,
                    'ecode': ecode,
                    'emsg': emsg,
                    'error_abbr': error_abbr,
                    'exc': grab_exc,
                    'grab': grab,
                    'grab_config_backup': grab_config_backup,
                }, task

                with self.network_op_lock:
                    with self.sigint_handler.handle_sigint():
                        self.multi.remove_handle(curl)
                curl.reset()
                self.freelist.append(curl)

            if not queued_messages:
                break
class MulticurlTransport(object):
    """
    Spider transport that runs requests through one pycurl multi object.

    Idle curl handles live in ``freelist``; requests in flight are
    tracked in ``registry`` keyed by ``id(curl)``.

    ``network_op_lock`` is always taken with a ``with`` statement so
    that it is released even when a curl call, ``select`` or the
    sigint handler raises; otherwise a single failure would leave the
    lock held forever and block every later call.
    """

    def __init__(self, spider, socket_number):
        """
        Args:
            spider: argument is not used in multicurl transport
            socket_number: number of ``pycurl.Curl`` handles to create
                for the pool
        """
        del spider  # is not used in this transport
        self.socket_number = socket_number
        self.multi = pycurl.CurlMulti()
        self.multi.handles = []
        # Idle curl handles ready to take a task
        self.freelist = []
        # id(curl) -> {'grab', 'grab_config_backup', 'task'} for
        # requests currently in flight
        self.registry = {}
        # id(curl) -> number of requests started on that handle
        self.connection_count = {}
        self.network_op_lock = Lock()
        self.sigint_handler = PycurlSigintHandler()

        # Create curl instances
        for _ in six.moves.range(self.socket_number):
            curl = pycurl.Curl()
            self.connection_count[id(curl)] = 0
            self.freelist.append(curl)

    def ready_for_task(self):
        """Return the number of idle curl handles."""
        return len(self.freelist)

    def get_free_threads_number(self):
        """Return the number of idle curl handles."""
        return len(self.freelist)

    def get_active_threads_number(self):
        """Return the number of curl handles that are not idle."""
        return self.socket_number - len(self.freelist)

    def process_connection_count(self, curl):
        """
        Count one more use of ``curl`` and replace worn-out handles.

        Returns:
            The same handle, or a fresh ``pycurl.Curl`` (with its
            counter set to 1) once the handle has been used more than
            100 times.
        """
        curl_id = id(curl)
        self.connection_count[curl_id] += 1
        if self.connection_count[curl_id] > 100:
            del self.connection_count[curl_id]
            del curl
            new_curl = pycurl.Curl()
            self.connection_count[id(new_curl)] = 1
            return new_curl
        else:
            return curl

    def start_task_processing(self, task, grab, grab_config_backup):
        """
        Configure an idle curl handle for ``task`` and start the request.

        The whole operation runs under ``network_op_lock``.

        Raises:
            Exception: whatever ``grab.prepare_request()``, the
                ``setopt`` call or ``grab.log_request()`` raises; the
                handle is returned to the free list before re-raising.
        """
        with self.network_op_lock:
            curl = self.process_connection_count(self.freelist.pop())

            self.registry[id(curl)] = {
                'grab': grab,
                'grab_config_backup': grab_config_backup,
                'task': task,
            }
            grab.transport.curl = curl
            try:
                grab.prepare_request()
                # Enable pycurl built-in redirect processing
                # In non-spider mode Grab handles redirects itself
                # by parsing headers and following Location URLs
                # In spider mode that would require to create
                # new Task objects for each 30* redirect
                # Maybe that would be implemented in future
                # For now multicurl transport just uses builtin pycurl
                # ability to handle 30* redirects
                grab.transport.curl.setopt(
                    pycurl.FOLLOWLOCATION,
                    1 if grab.config['follow_location'] else 0)
                grab.log_request()
            except Exception:
                # If some error occurred while processing the request
                # arguments then we should put curl object back to
                # free list
                del self.registry[id(curl)]
                self.freelist.append(curl)
                raise
            else:
                # Add configured curl instance to multi-curl processor
                with self.sigint_handler.handle_sigint():
                    self.multi.add_handle(curl)

    def process_handlers(self):
        """
        Wait for socket activity (if any sockets are registered) and
        let the multicurl object perform pending transfers.

        Runs under ``network_op_lock``; the lock is released even if
        one of the calls raises.
        """
        # Ok, frankly I have really bad understanding of
        # how to deal with multicurl sockets ;-)
        # It is a sort of miracle that Grab actually works
        with self.network_op_lock:
            with self.sigint_handler.handle_sigint():
                rlist, wlist, xlist = self.multi.fdset()
            if rlist or wlist or xlist:
                with self.sigint_handler.handle_sigint():
                    timeout = self.multi.timeout()
                if timeout and timeout > 0:
                    # multi.timeout() is divided by 1000 here, i.e. it is
                    # treated as milliseconds while select() takes seconds
                    select.select(rlist, wlist, xlist, timeout / 1000.0)
            while True:
                with self.sigint_handler.handle_sigint():
                    status, _ = self.multi.perform()
                if status != pycurl.E_CALL_MULTI_PERFORM:
                    break

    def iterate_results(self):
        """
        Yield a result dict for every finished transfer.

        Each dict has the keys ``ok``, ``ecode``, ``emsg``,
        ``error_abbr``, ``grab``, ``grab_config_backup`` and ``task``.
        After a result has been consumed its curl handle is removed
        from the multi object, reset and returned to the free list.
        """
        while True:
            with self.sigint_handler.handle_sigint():
                queued_messages, ok_list, fail_list = self.multi.info_read()

            results = []
            for curl in ok_list:
                results.append((True, curl, None, None))
            for curl, ecode, emsg in fail_list:
                # CURLE_WRITE_ERROR (23)
                # An error occurred when writing received data
                # to a local file, or an error was returned to libcurl
                # from a write callback.
                # This error should be ignored if the
                # grab_callback_interrupted flag is enabled (this
                # happens when nohead or nobody options are enabled)
                #
                # Also this error is raised when curl receives
                # KeyboardInterrupt while it is processing some
                # callback function (WRITEFUNCTION, HEADERFUNCTION, etc)
                if ecode == 23:
                    if curl.grab_callback_interrupted is True:
                        # FIXME: that flag should be set automatically
                        # FIXME: write tests to test this flag
                        curl.grab_callback_interrupted = False
                        results.append((True, curl, None, None))
                    else:
                        results.append((False, curl, ecode, emsg))
                else:
                    results.append((False, curl, ecode, emsg))

            for is_ok, curl, ecode, emsg in results:
                # FORMAT: {is_ok, grab, grab_config_backup, task, emsg}
                curl_id = id(curl)
                task = self.registry[curl_id]['task']
                grab = self.registry[curl_id]['grab']
                grab_config_backup = (
                    self.registry[curl_id]['grab_config_backup'])

                try:
                    grab.process_request_result()
                except GrabTooManyRedirectsError:
                    ecode = ERROR_TOO_MANY_REFRESH_REDIRECTS
                    emsg = 'Too many meta refresh redirects'
                    is_ok = False

                grab.doc.error_code = ecode
                grab.doc.error_msg = emsg

                # Free resources
                del self.registry[curl_id]
                grab.transport.curl = None

                if is_ok:
                    error_abbr = None
                else:
                    error_abbr = ERROR_ABBR.get(ecode, 'unknown-%d' % ecode)
                yield {
                    'ok': is_ok,
                    'ecode': ecode,
                    'emsg': emsg,
                    'error_abbr': error_abbr,
                    'grab': grab,
                    'grab_config_backup': grab_config_backup,
                    'task': task
                }

                with self.sigint_handler.handle_sigint():
                    self.multi.remove_handle(curl)
                curl.reset()
                self.freelist.append(curl)

            if not queued_messages:
                break
def test_use_stderr(self):
    """Write to stderr both before and inside a record() block."""
    recorder = PycurlSigintHandler()
    before, inside = 'FOO!', 'BAR!'
    sys.stderr.write(before)
    with recorder.record():
        # sys.stderr is looked up again here on purpose rather than
        # reusing a reference taken before entering record()
        sys.stderr.write(inside)