Esempio n. 1
0
    def __init__(self,
                 pool_size = 20,
                 pop_interval = 1,
                 request_interval = 0,
                 max_empty_retry = 2,
                 request_timeout = 10,
                 each_size_from_queue = 10,
                 max_failure_allowed = -1):
        """Configure the engine, its worker pool and its quit signal handlers.

        :param pool_size: size handed to ``Pool``.
        :param pop_interval: stored as ``self.pop_interval``.
        :param request_interval: stored as ``self.request_interval``.
        :param max_empty_retry: stored as ``self.max_empty_retry``.
        :param request_timeout: stored as ``self.request_timeout``.
        :param each_size_from_queue: stored as ``self.each_size_from_queue``.
        :param max_failure_allowed: stored as ``self.max_failure_allowed``.
        """
        # NOTE(review): monkey patching happens on every construction; gevent
        # recommends patching as early as possible in the process -- confirm
        # nothing network related is imported before the engine is built.
        from gevent import monkey
        monkey.patch_all()

        self.pop_interval = pop_interval
        self.request_interval = request_interval
        self.pool = Pool(pool_size)
        self.quit_event = Event()
        self.max_empty_retry = max_empty_retry
        self.request_timeout = request_timeout
        self.each_size_from_queue = each_size_from_queue
        self.user_agent_provider = UserAgentProvider()
        self.max_failure_allowed = max_failure_allowed
        self._request_failure = 0
        self.proxy_provider = None
        self.processor_manager = RequestEngine.ProcessorManager()
        self.before_each = []
        self.after_each = []

        # Recent gevent releases expose ``gevent.signal`` as a module, so the
        # callable must be taken from ``gevent.signal_handler``; older releases
        # only provide the callable ``gevent.signal``.
        register_signal = getattr(gevent, 'signal_handler', None) or gevent.signal
        for signum in (signal.SIGINT, signal.SIGQUIT, signal.SIGTERM):
            register_signal(signum, self.quit)
Esempio n. 2
0
class RequestEngine(object):
    """Pop requests from a queue and execute them on a gevent worker pool.

    Each request is run by ``_make_requests``: the request kwargs are
    decorated with a user agent and (optionally) a proxy, the "before"
    processors run, the HTTP call is made through ``requests.request`` and
    finally the "after" processors run.

    ``before_each`` and ``after_each`` hold the names of processors that run
    for every request.  They are lists that can also be called, so both
    ``engine.before_each.append('name')`` and ``engine.before_each('name')``
    register a processor.
    """

    class ProcessorManager(object):
        """Registry mapping processor names to callables."""

        def __init__(self):
            self._processor_map = {'default': None}

        def set(self, processor_name, value):
            """Register ``value`` under ``processor_name`` (overwrites)."""
            self._processor_map[processor_name] = value

        def route(self, processor_name, **kwargs):
            """Call the processor registered under ``processor_name``.

            ``None`` selects the ``'default'`` entry.  Returns the processor's
            return value, or ``None`` when the entry is not callable (this
            includes the initial ``None`` default).

            :raises KeyError: if ``processor_name`` was never registered.
            """
            name = 'default' if processor_name is None else processor_name
            processor = self._processor_map[name]
            if callable(processor):
                return processor(**kwargs)
            return None

    class _HookList(list):
        """List of processor names that can also be called to add names.

        Keeps ``engine.before_each`` usable as a plain list while making
        ``engine.before_each('a', 'b')`` work as a registration call.
        """

        def __call__(self, *processors):
            self.extend(processors)

    def __init__(self,
                 pool_size = 20,
                 pop_interval = 1,
                 request_interval = 0,
                 max_empty_retry = 2,
                 request_timeout = 10,
                 each_size_from_queue = 10,
                 max_failure_allowed = -1):
        """Configure the engine, its worker pool and its quit signal handlers.

        :param pool_size: size handed to ``Pool``.
        :param pop_interval: seconds slept at the end of each polling round.
        :param request_interval: seconds slept after spawning each request.
        :param max_empty_retry: number of empty pops after which the engine
            quits; ``-1`` disables the check.
        :param request_timeout: timeout used for a request unless its
            ``raw_info`` carries a ``_timeout`` attribute.
        :param each_size_from_queue: upper bound on requests popped per round.
        :param max_failure_allowed: number of failed requests after which the
            engine quits; ``-1`` disables the check.
        """
        # NOTE(review): monkey patching happens on every construction; gevent
        # recommends patching as early as possible in the process -- confirm
        # nothing network related is imported before the engine is built.
        from gevent import monkey
        monkey.patch_all()

        self.pop_interval = pop_interval
        self.request_interval = request_interval
        self.pool = Pool(pool_size)
        self.quit_event = Event()
        self.max_empty_retry = max_empty_retry
        self.request_timeout = request_timeout
        self.each_size_from_queue = each_size_from_queue
        self.user_agent_provider = UserAgentProvider()
        self.max_failure_allowed = max_failure_allowed
        self._request_failure = 0
        self.proxy_provider = None
        self.processor_manager = RequestEngine.ProcessorManager()
        # Callable lists: these instance attributes shadow the methods of the
        # same name, so they must themselves accept ``engine.before_each(...)``.
        self.before_each = RequestEngine._HookList()
        self.after_each = RequestEngine._HookList()

        # Recent gevent releases expose ``gevent.signal`` as a module, so the
        # callable must be taken from ``gevent.signal_handler``; older releases
        # only provide the callable ``gevent.signal``.
        register_signal = getattr(gevent, 'signal_handler', None) or gevent.signal
        for signum in (signal.SIGINT, signal.SIGQUIT, signal.SIGTERM):
            register_signal(signum, self.quit)

    def setup_request_queue(self, request_queue_ins):
        """Set the queue object that ``request`` pops from."""
        self.request_queue = request_queue_ins

    @property
    def active(self):
        """Whether the ``request`` loop is running (``False`` until started)."""
        if not hasattr(self, '_active'):
            self._active = False
        return self._active

    @active.setter
    def active(self, value):
        self._active = value

    def before_each(self, *processors):
        """Append processor names to the per-request "before" hooks.

        On instances this name is shadowed by the callable list assigned in
        ``__init__``; this method is only reached through the class, e.g.
        ``RequestEngine.before_each(engine, 'name')``.
        """
        self.before_each.extend(processors)

    def after_each(self, *processors):
        """Append processor names to the per-request "after" hooks.

        On instances this name is shadowed by the callable list assigned in
        ``__init__``; this method is only reached through the class, e.g.
        ``RequestEngine.after_each(engine, 'name')``.
        """
        self.after_each.extend(processors)

    def worker_count(self):
        """Return the number of pool slots currently in use."""
        return self.pool.size - self.pool.free_count()

    def quit(self):
        """Ask the ``request`` loop to stop once running workers finish."""
        self.quit_event.set()

    def request(self, override_req_args=None):
        """Run the polling loop until the engine is asked to quit.

        :param override_req_args: mapping merged into every request's kwargs
            before it is executed; defaults to an empty mapping.
        """
        if override_req_args is None:
            override_req_args = {}

        self.active = True
        empty_count = 0
        while True:
            if self.quit_event.is_set():
                logger.warning("Quiting Engine")
                # Wait for in-flight workers before reporting a clean exit.
                if self.pool.size != self.pool.free_count():
                    time.sleep(1)
                    continue

                self.active = False
                logger.warning("Engine Gracefully Quit")
                break

            if (self.max_failure_allowed != -1
                    and self._request_failure >= self.max_failure_allowed):
                logger.warning( "Exceed Max Failures Count. Engine Stopping ..." )
                self.quit()
                continue

            # Never pop more requests than there are free workers.
            this_time_size = min(self.pool.free_count(), self.each_size_from_queue)

            if this_time_size > 0:
                reqs = self.request_queue.pop(this_time_size)
                logger.info('Current free workers: '+str(self.pool.free_count()))
                if (reqs is not None) and (len(reqs) > 0):
                    for req in reqs:
                        self.pool.spawn(self._make_requests,
                                        request=req,
                                        override=override_req_args)
                        time.sleep(self.request_interval)
                else:
                    # empty_count is cumulative; it is not reset by a
                    # successful pop.
                    empty_count += 1
                    if (self.max_empty_retry != -1
                            and empty_count >= self.max_empty_retry):
                        logger.warning( "Exceed Max Empty. Engine Stopping ..." )
                        self.quit()
                        continue

            time.sleep(self.pop_interval)

    def setup_user_agent_provider(self, provider):
        """Replace the user agent provider (falsy disables the header)."""
        self.user_agent_provider = provider

    def setup_proxy_provider(self, provider):
        """Set the proxy provider (falsy disables proxy handling)."""
        self.proxy_provider = provider

    def register_processor(self, processor, name='default'):
        """Register ``processor`` under ``name`` in the processor manager."""
        self.processor_manager.set(name, processor)

    def _count_failure(self, already_counted):
        """Count a failure at most once per request; always returns True."""
        if not already_counted:
            self._request_failure += 1
        return True

    def _apply_user_agent(self, request):
        """Set the ``User-Agent`` header on ``request.kwargs`` if configured."""
        if not self.user_agent_provider:
            return
        user_agent = self.user_agent_provider.provide()
        request.kwargs.setdefault('headers', {})['User-Agent'] = user_agent

    def _apply_proxy(self, request):
        """Add a proxy to ``request.kwargs``; return the proxy used or None."""
        if not self.proxy_provider:
            return None

        proxy = self.proxy_provider.provide()
        if proxy is None:
            # A provider returning None means "do not use a proxy".
            logger.warning("No Using Proxy")
            return None

        _proxy = {'http': proxy.proxy, 'https': proxy.proxy}
        if 'proxies' in request.kwargs:
            request.kwargs['proxies'].update(_proxy)
        else:
            request.kwargs['proxies'] = _proxy
        logger.warning("Using Proxy: %s" % str(_proxy))
        return proxy

    def _make_requests(self, request, override):
        """Execute one request together with its before/after processors.

        :param request: object exposing ``kwargs`` (passed to
            ``requests.request``), ``processors`` (``None`` or a mapping with
            optional ``'before'`` / ``'after'`` names) and ``raw_info``.
        :param override: mapping merged into ``request.kwargs`` first.

        Exceptions raised by hooks or by the HTTP call are logged and counted
        (once per request) in ``self._request_failure``; they do not
        propagate.
        """
        data = {}  # Shared between the hooks of this request.
        failure_counted = False

        request.kwargs.update(override)
        self._apply_user_agent(request)
        proxy = self._apply_proxy(request)

        ar = None
        result = False
        processors = {'before': None, 'after': None}
        if request.processors is not None:
            processors.update(request.processors)

        before_each_hook_result = None
        try:
            logger.info("Executing before hook")
            before_each_hook_result = self.processor_manager.route(
                processor_name=processors['before'],
                request=request,
                extra=request.raw_info,
                data=data)

            for name in self.before_each:
                self.processor_manager.route(processor_name=name,
                                             request=request,
                                             extra=request.raw_info,
                                             data=data)
        except:  # deliberately broad: a failing hook must not kill the worker
            failure_counted = self._count_failure(failure_counted)
            logger.error("Exception while before hook execution: "+ traceback.format_exc())

        # Only a before hook returning a value equal to False cancels the
        # request; a hook that raised leaves the result at None and proceeds.
        if before_each_hook_result != False:  # noqa: E712
            try:
                logger.debug("Making request... (%s)" % str(request.kwargs))
                _timeout = getattr(request.raw_info, '_timeout', self.request_timeout)
                logger.debug("Timeout setting: %s" % _timeout)
                with gevent.Timeout(_timeout):
                    ar = requests.request(**request.kwargs)
                    ar.raw_info = request.raw_info
                    result = True
            except:  # bare on purpose: gevent.Timeout derives from BaseException
                failure_counted = self._count_failure(failure_counted)
                logger.error("Exception while requests execution: "+ traceback.format_exc())

            try:
                logger.info("Executing after hook")
                self.processor_manager.route(
                    processor_name=processors['after'],
                    response=ar,
                    request=request,
                    extra=request.raw_info,
                    result=result,
                    data=data)

                for name in self.after_each:
                    self.processor_manager.route(processor_name=name,
                                                 response=ar,
                                                 request=request,
                                                 extra=request.raw_info,
                                                 result=result,
                                                 data=data)

                # Let the proxy provider know how the proxy performed.
                if proxy:
                    self.proxy_provider.callback(proxy, result=result, response=ar, request=request)
            except:  # deliberately broad: a failing hook must not kill the worker
                failure_counted = self._count_failure(failure_counted)
                logger.error("Exception while after hook execution", exc_info=True)
Esempio n. 3
0
    if hasattr(settings, 'PROXY_PROVIDER'):
        proxy_provider = getattr(
            importlib.import_module(settings.PROXY_PROVIDER[0]),
            settings.PROXY_PROVIDER[1])
        request_engine.setup_proxy_provider(
            proxy_provider(**settings.PROXY_PROVIDER_ARGUMENTS))
        logger.info('Set up proxy provider')
    else:
        request_engine.setup_proxy_provider(CustomProxyProvider())

    if hasattr(settings, 'UA_PROVIDER'):
        ua_provider = getattr(importlib.import_module(settings.UA_PROVIDER[0]),
                              settings.UA_PROVIDER[1])
        request_engine.setup_user_agent_provider(ua_provider())
        logger.info('Set up UA provider')
    else:
        request_engine.setup_user_agent_provider(UserAgentProvider())

    request_engine.setup_request_queue(env.request_queue)
    env.downloader = request_engine

# Processors: import every module of the ``app.processors`` package so that
# each module's import-time code runs.

processors = importlib.import_module('app.processors')
for f in sorted(glob.glob(os.path.join(os.path.dirname(processors.__file__), "*.py"))):
    # The package itself is already imported above; importing it again as
    # ``app.processors.__init__`` would execute its body a second time.
    if os.path.basename(f) == '__init__.py':
        continue
    importlib.import_module('app.processors.' + os.path.basename(f)[:-3])


def start():
    """Run the request loop of the downloader stored on ``env``."""
    engine = env.downloader
    engine.request()