class InputThread(threading.Thread):
    """Daemon thread that reserves jobs from a beanstalk tube and queues them on a pool.

    Every reserved job is deleted from the tube right away and its body is
    handed to ``processor_pool`` together with ``_on_task_start`` (the work)
    and ``_on_task_finished`` (the callback that forwards results).

    NOTE(review): ``_on_task_finished`` calls ``self._output_msg`` which is
    not defined in the visible part of this class -- confirm it exists.
    """

    def __init__(self, conf, processor, proc_name=None):
        """Build the beanstalk connections and the worker pool.

        :param conf: mapping read as ``conf['beanstalk_conf']`` (``host``,
            ``port``, ``input_tube``, ``output_tube``), ``conf['log']`` and
            ``conf['server']`` (optional ``process_thread_num``, default 1).
        :param processor: object whose ``do_task(body)`` handles one job.
        :param proc_name: accepted but not used by this class.
        :raises Exception: if ``processor`` is ``None``.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True

        beanstalk_conf = conf['beanstalk_conf']
        # Two separate connections: one for reserving input, one for output.
        self.beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.input_tube = beanstalk_conf['input_tube']
        self.output_tube = beanstalk_conf['output_tube']

        # Fall back to a dedicated logger when the config carries a falsy one.
        self.log = conf['log']
        if not self.log:
            self.log = LogHandler("i_input_thread")

        self.processor = processor
        if self.processor is None:
            self.log.error("Processor not given !")
            raise Exception("Processor not given !")

        thread_num = conf['server'].get("process_thread_num", 1)
        self.processor_pool = ThreadPool(thread_num, {}, int(thread_num))
        # Serializes result forwarding across pool callbacks.
        self.wlock = threading.Lock()

    def stop(self):
        """Ask the reserve loop to exit and join the worker pool."""
        self.log.warning("stop input_thread")
        self.running = False
        self.processor_pool.join_all()

    def run(self):
        """Reserve jobs until ``running`` is cleared or no input tube is set."""
        self.running = True
        while self.running and self.input_tube:
            try:
                # The second argument is presumably a reserve timeout in
                # seconds -- TODO confirm against PyBeanstalk.reserve.
                job = self.beanstalk.reserve(self.input_tube, 3)
                if job is not None:
                    body = job.body
                    # The job is removed before it is processed, so a failure
                    # in the pool does not put it back on the tube.
                    job.delete()
                    self.processor_pool.queue_task(self._on_task_start, (body,), self._on_task_finished)

            except SocketError:
                time.sleep(30)
                self.log.error('beanstalk\tconnect\tfail\tstart\treconnect')
                try:
                    self.beanstalk.reconnect()
                    self.out_beanstalk.reconnect()
                    self.log.error('beanstalk\treconnect\tsuccess')
                except Exception:
                    self.log.error('beanstalk\treconnect\tfail')
            except Exception:
                # ``Exception`` rather than a bare ``except`` so SystemExit and
                # KeyboardInterrupt are not swallowed by the loop.
                self.log.error("not msg from:%s\tresult:%s" % (self.input_tube, str(traceback.format_exc())))

    def _on_task_start(self, task, **thread_locals):
        """Run ``processor.do_task(task)``; return its result, or ``None`` on error.

        Failures are logged with the full traceback instead of the deprecated
        ``e.message`` attribute, which is empty for many exceptions.
        """
        try:
            return self.processor.do_task(task)
        except Exception:
            self.log.error(traceback.format_exc())
        return None

    def _on_task_finished(self, task, **thread_locals):
        """Forward a task result (a non-empty string or a list of messages).

        Any other value is ignored. The lock is held through a ``with`` block
        so it is released even when ``_output_msg`` raises.
        """
        with self.wlock:
            if task and isinstance(task, basestring):
                self._output_msg(task)
            elif isinstance(task, list):
                for message in task:
                    self._output_msg(message)
# Example #2
# 0
class CrawlSelector(threading.Thread):
    """Daemon thread that asks a scheduler for requests and pushes them to beanstalk.

    Each request returned by ``scheduler.dispatch()`` is serialized with
    ``req_to_string`` and put on ``output_tube``.
    """

    def __init__(self, log, selector_conf, beanstalk_conf, scheduler=None):
        """Store the configuration and open the output beanstalk connection.

        :param log: logger exposing ``info`` and ``error``.
        :param selector_conf: mapping; ``select_seed_sleep_time`` is read in ``run``.
        :param beanstalk_conf: mapping with ``host``, ``port`` and ``output_tube``.
        :param scheduler: optional object exposing ``dispatch()``.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = False
        self.log = log

        # Download statistics.
        self.site_static = {}
        self.scheduler = scheduler
        self.download_req_num = 0

        # Downloader configuration. The per-downloader Thrift client setup
        # that used to increment this counter is disabled, so it stays 0.
        self.downloader_num = 0

        # Selector configuration.
        self.selector_conf = selector_conf

        # Beanstalk queue settings.
        self.beanstalk_conf = beanstalk_conf
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'],
                                         beanstalk_conf['port'])
        self.output_tube = beanstalk_conf['output_tube']
        self.wlock = threading.Lock()

    def req_to_string(self, req):
        """Serialize ``req`` via ``req.write`` into a binary-protocol buffer.

        :returns: the buffer contents, or ``""`` if serialization failed
            (the failure is logged with its traceback).
        """
        str_req = ""
        try:
            buf = TMemoryBuffer()
            protocol = TBinaryProtocol(buf)
            req.write(protocol)
            str_req = buf.getvalue()
        except Exception:
            self.log.error('crawled_failt\terror:%s' %
                           (traceback.format_exc()))
        return str_req

    def run(self):
        """Dispatch, serialize and enqueue requests until ``running`` is cleared."""
        self.running = True
        while self.running:
            reqs = None
            # Tracks the request being handled so the failure log can name it.
            url = None
            try:
                if self.scheduler:
                    reqs = self.scheduler.dispatch()
                if reqs:
                    for req in reqs:
                        url = getattr(req, 'url', None)
                        req_str = self.req_to_string(req)
                        if not req_str:
                            # Serialization failed and was already logged; do
                            # not enqueue an empty job or report it as started.
                            continue
                        self.out_beanstalk.put(self.output_tube, req_str)
                        self.log.info(
                            'start_crawl\turl:%s\tdownload_type:%s\tsession:%s'
                            % (req.url, req.download_type, req.session_commit))
                    # All requests handled; later errors are not URL-specific.
                    url = None
                time.sleep(self.selector_conf['select_seed_sleep_time'])
            except SocketError:
                time.sleep(30)
                self.log.error('beanstalk\tconnect\tfail\tstart\treconnect')
                try:
                    self.out_beanstalk.reconnect()
                    self.log.error('beanstalk\treconnect\tsuccess')
                except Exception:
                    self.log.error('beanstalk\treconnect\tfail')
            except Exception:
                self.log.error('crawled_failt\turl:%s\terror:%s' %
                               (url, traceback.format_exc()))
# Example #3
# 0
class InputThread(threading.Thread):
    """Daemon thread that reserves jobs from a beanstalk tube for a shared task pool.

    Reserved jobs are deleted from the tube and their bodies queued on
    ``process_pool``. The pool is expected to pass its thread-local objects
    (``processor``, optionally ``profiler``) as keyword arguments to the
    task and callback functions.

    NOTE(review): ``__on_task_finished`` calls ``self.__output_msg`` which is
    not defined in the visible part of this class -- confirm it exists.
    """

    def __init__(self, beanstalk_conf, log=None, process_pool=None):
        """Open the beanstalk connections and remember the pool.

        :param beanstalk_conf: mapping with ``host``, ``port``,
            ``input_tube`` and ``output_tube``.
        :param log: logger; must not be ``None`` despite the default.
        :param process_pool: task pool; must not be ``None`` despite the default.
        :raises AssertionError: if any argument is ``None`` (not checked
            when Python runs with ``-O``).
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True

        assert beanstalk_conf is not None
        assert log is not None
        assert process_pool is not None

        self.beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.input_tube = beanstalk_conf['input_tube']
        self.output_tube = beanstalk_conf['output_tube']
        # Only reachable for a logger that is falsy but not None.
        self.log = log
        if not self.log:
            self.log = LogHandler("i_input_thread")

        self.process_pool = process_pool
        # Serializes result forwarding across pool callbacks.
        self.t_lock = threading.Lock()

    def stop(self):
        """Stop reserving, wait for the pool to drain, then drop the input connection.

        Polls ``process_pool.get_task_num()`` every 5 seconds until it is
        not positive. Any failure is logged instead of raised.
        """
        self.log.warning("stop input_thread")
        self.running = False
        try:
            # NOTE: a disabled hook here used to call the pool processor's
            # ``save_status()`` once the pool had drained.
            while self.process_pool.get_task_num() > 0:
                self.log.info("wait tasks be consumed over, wait 5s")
                time.sleep(5)

            # Close the connection so no more data is accepted.
            self.beanstalk.__del__()
        except Exception as e:
            self.log.error("stop input_thread fail")
            self.log.exception(e)

    def run(self):
        """Reserve jobs until ``running`` is cleared or no input tube is set."""
        while self.running and self.input_tube:
            try:
                # The second argument is presumably a reserve timeout in
                # seconds -- TODO confirm against PyBeanstalk.reserve.
                job = self.beanstalk.reserve(self.input_tube, 30)
                if job is not None:
                    body = job.body
                    # Removed before processing: a failed task is not retried.
                    job.delete()

                    self.process_pool.queue_task(self.__on_task_start, (body,), self.__on_task_finished)
                    task_num = self.process_pool.get_task_num()
                    if task_num >= 50:
                        # Back off briefly while the pool has a large backlog.
                        self.log.info("place_processor\ttasks:%d" % task_num)
                        time.sleep(2)
                else:
                    self.log.info("not msg from:%s" % self.input_tube)
            except SocketError as e:
                time.sleep(30)
                self.log.error('beanstalk\tconnect\tfail\tstart\treconnect')
                self.log.exception(e)
                try:
                    self.beanstalk.reconnect()
                    self.out_beanstalk.reconnect()
                    self.log.error('beanstalk\treconnect\tsuccess')
                except Exception as reconnect_error:
                    self.log.error('beanstalk\treconnect\tfail')
                    self.log.exception(reconnect_error)
            except Exception as e:
                self.log.error("not msg from:%s\tresult:" % self.input_tube)
                self.log.exception(e)

    @staticmethod
    def __on_task_start(task, **thread_locals):
        """Run one task with the thread-local ``processor``.

        Calls ``profiler.begin()`` first when a ``profiler`` is supplied.

        :returns: ``processor.do_task(task)``, or ``None`` without a processor.
        """
        result = None
        if 'profiler' in thread_locals:
            thread_locals['profiler'].begin()
        if 'processor' in thread_locals:
            result = thread_locals['processor'].do_task(task)
        return result

    def __on_task_finished(self, result, **thread_locals):
        """Forward a task result (a non-empty string or a list of messages).

        Calls ``profiler.end()`` when a ``profiler`` is supplied. The lock is
        held through a ``with`` block so it is released even if the profiler
        or ``__output_msg`` raises.
        """
        with self.t_lock:
            processor = thread_locals.get('processor')
            if 'profiler' in thread_locals:
                thread_locals['profiler'].end()
            if result and isinstance(result, basestring):
                self.__output_msg(result, processor)
            elif isinstance(result, list):
                for message in result:
                    self.__output_msg(message, processor)