コード例 #1
0
class InputThreadNew(threading.Thread):
    """Thread that reserves jobs from a beanstalk tube and runs a processor.

    Every reserved job is deleted from the input tube before it is processed.
    A non-None processor result is handed to ``self.output_msg`` together
    with an optional topic id (``output_msg`` is not part of this excerpt).
    """

    def __init__(self, conf, processor=None, proc_name=None):
        """Open the beanstalk connections and build the tube routing table.

        Args:
            conf: Mapping whose ``'beanstalk_conf'`` entry provides ``host``,
                ``port``, ``input_tube`` and ``output_tube``.
            processor: Object exposing ``do_task(body)``. Required.
            proc_name: Optional name, stored only for logging.

        Raises:
            Exception: If ``processor`` is None.
        """
        threading.Thread.__init__(self)
        self.log = log
        # Validate first so a missing processor does not leave two freshly
        # opened beanstalk connections behind.
        if processor is None:
            self.log.error("Processor not given !")
            raise Exception("Processor not given !")
        self.processor = processor

        self.running = True
        self.proc_name = proc_name  # Only for logging
        beanstalk_conf = conf['beanstalk_conf']
        self.input_tube = beanstalk_conf['input_tube']
        self.beanstalk = PyBeanstalk(beanstalk_conf['host'],
                                     beanstalk_conf['port'])
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'],
                                         beanstalk_conf['port'])
        self.output_tube = beanstalk_conf['output_tube']
        self.topic_output_tubes = self._parse_output_tubes(self.output_tube)

    @staticmethod
    def _parse_output_tubes(output_tube):
        """Build the ``topic_id -> tubes`` routing table from the config value.

        ``output_tube`` is either a single tube name or a list of definitions
        of the form ``"name"`` or ``"name:id1,id2[:exclusive]"``.

        Example from the original author's note (the routing itself is
        applied by ``output_msg``, which is not shown here):

            ["default_out", "only_special_out:1,2,3:exclusive",
             "special_out:4", ":5:exclusive"]

            topic_id 1, 2, 3 -> only ``only_special_out``
            topic_id 4       -> ``special_out`` and ``default_out``
            topic_id 5       -> no queue at all
            anything else    -> ``default_out``

        Returns:
            dict: ``'default'`` maps to a list of tube names; every integer
            topic id maps to a list of ``(tube_name, exclusive)`` tuples.

        Raises:
            ValueError: If a topic id is not an integer.
        """
        routes = {'default': []}
        if not isinstance(output_tube, list):
            routes['default'].append(output_tube)
            return routes

        for tube_def in output_tube:
            elements = [part.strip() for part in tube_def.strip().split(':')]
            if len(elements) < 2:
                # Plain name without topic ids: a default tube.
                routes['default'].append(elements[0])
                continue
            tube_name = elements[0]
            exclusive = len(elements) == 3 and elements[2] == 'exclusive'
            for raw_id in elements[1].split(','):
                topic_id = int(raw_id.strip())
                routes.setdefault(topic_id, []).append((tube_name, exclusive))
        return routes

    def stop(self):
        """Ask the run loop to exit after its current iteration."""
        self.log.warning("stop input thread")
        self.running = False

    def run(self):
        """Reserve, delete and process jobs until stopped.

        Processor failures are logged and the loop continues. On a socket
        error the thread sleeps 30 seconds and then tries to reconnect both
        beanstalk connections.
        """
        self.log.debug("starting input thread")
        job_num = 0
        while self.running and self.input_tube:
            try:
                job = self.beanstalk.reserve(self.input_tube, 3)
                if not job:
                    self.log.debug(current_process().name +
                                   " : no msg from : %s" % (self.input_tube))
                    continue
                job_num += 1
                body = job.body
                resp = None
                # The job is removed before processing, as in the original.
                job.delete()
                if self.processor is not None:
                    topic_id = None
                    try:
                        # These two processor types return a
                        # (response, topic_id) pair instead of a bare response.
                        if type(self.processor).__name__ in (
                                'ExtractorProccessor',
                                'SingleSrcMergerProccessor'):
                            resp, topic_id = self.processor.do_task(body)
                        else:
                            resp = self.processor.do_task(body)
                    except Exception:
                        self.log.error("Process failed. " +
                                       traceback.format_exc())
                    if resp is not None:
                        self.output_msg(resp, topic_id)
            except SocketError:
                time.sleep(30)
                self.log.error('beanstalk\tconnect\tfail\tstart\treconnect')
                try:
                    self.beanstalk.reconnect()
                    self.out_beanstalk.reconnect()
                    self.log.error('beanstalk\treconnect\tsuccess')
                except Exception:
                    self.log.error('beanstalk\treconnect\tfail')
コード例 #2
0
ファイル: test_output.py プロジェクト: mylove1/crawler-2
#!/usr/bin/Python
# coding=utf-8
import sys

from thrift.protocol.TBinaryProtocol import TBinaryProtocol
from thrift.transport.TTransport import TMemoryBuffer

sys.path.append('..')
from bdp.i_crawler.i_extractor.ttypes import PageParseInfo
from i_util.pybeanstalk import PyBeanstalk

if __name__ == '__main__':
    # Debug helper: reserve one job from the 'extract_info_ws' tube, decode
    # its body as a thrift-serialized PageParseInfo, dump every field and
    # then delete the job.
    pybeanstalk = PyBeanstalk('101.201.100.58')
    try:
        rsp_info = PageParseInfo()
        job = pybeanstalk.reserve('extract_info_ws')
        # while True:
        if job:
            tMemory_o = TMemoryBuffer(job.body)
            tBinaryProtocol_o = TBinaryProtocol(tMemory_o)
            rsp_info.read(tBinaryProtocol_o)
            d = vars(rsp_info)
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(d)
            for k, v in d.items():
                print("%s %s" % (k, v))
            job.delete()
    except EOFError as e:
        print(e)
コード例 #3
0
ファイル: test_output.py プロジェクト: mylove1/crawler-2
from thrift.protocol.TBinaryProtocol import TBinaryProtocol
from thrift.transport.TTransport import TMemoryBuffer

sys.path.append('..')
from bdp.i_crawler.i_crawler_merge.ttypes import LinkAttr
from bdp.i_crawler.i_downloader.ttypes import DownLoadReq
from bdp.i_crawler.i_extractor.ttypes import PageParseInfo
from i_util.pybeanstalk import PyBeanstalk

if __name__ == '__main__':
    # Drain helper: keep reserving jobs from 'online_extract_info' and delete
    # them without decoding. Runs until interrupted or an error escapes.
    pybeanstalk = PyBeanstalk('10.25.114.50')
    try:
        #link_info = DownLoadReq();
        info = PageParseInfo()
        while True:
            #print pybeanstalk.stats_tube('download_req')
            job = pybeanstalk.reserve('online_extract_info')
            # Decoding is disabled; to inspect a job before deleting it:
            #     tMemory_o = TMemoryBuffer(job.body)
            #     tBinaryProtocol_o = TBinaryProtocol(tMemory_o)
            #     info.read(tBinaryProtocol_o)
            #     d = vars(info)
            #     for k, v in d.items():
            #         print("%s %s" % (k, v))
            # NOTE(review): assumes reserve() can return a falsy value when
            # no job is available - confirm against PyBeanstalk. Guarding
            # avoids an AttributeError on job.delete() in that case.
            if job:
                job.delete()
            #break;
    except EOFError as e:
        print(e)
コード例 #4
0
class InputThread(threading.Thread):
    """Daemon thread that feeds beanstalk jobs into a processing thread pool.

    Each reserved job is deleted from the input tube and its body is queued
    on ``self.processor_pool``. The pool runs ``_on_task_start`` and passes
    the result to ``_on_task_finished``, which forwards it to
    ``self._output_msg`` (not part of this excerpt).
    """

    def __init__(self, conf, processor, proc_name= None):
        """Create the connections, logger and worker pool.

        Args:
            conf: Mapping with ``'beanstalk_conf'`` (``host``, ``port``,
                ``input_tube``, ``output_tube``), ``'server'`` (optional
                ``process_thread_num``) and an optional ``'log'`` logger.
            processor: Object exposing ``do_task(task)``. Required.
            proc_name: Accepted for interface compatibility; unused here.

        Raises:
            Exception: If ``processor`` is None.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True

        # A missing or falsy 'log' entry falls back to a dedicated handler
        # instead of raising KeyError.
        self.log = conf.get('log')
        if not self.log:
            self.log = LogHandler("i_input_thread")

        # Validate before opening sockets so a bad call leaks nothing.
        self.processor = processor
        if self.processor is None:
            self.log.error("Processor not given !")
            raise Exception("Processor not given !")

        beanstalk_conf = conf['beanstalk_conf']
        self.beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.input_tube = beanstalk_conf['input_tube']
        self.output_tube = beanstalk_conf['output_tube']

        self.processor_pool = ThreadPool(conf['server'].get("process_thread_num", 1),
                                         {},
                                         int(conf['server'].get("process_thread_num", 1))
                                         )
        # Serializes output of finished tasks across pool threads.
        self.wlock = threading.Lock()

    def stop(self):
        """Stop the run loop and wait for the pool via ``join_all()``."""
        self.log.warning("stop input_thread")
        self.running = False
        self.processor_pool.join_all()

    def run(self):
        """Reserve jobs and queue their bodies on the pool until stopped.

        On a socket error the thread sleeps 30 seconds and tries to reconnect
        both connections. Any other exception is logged with its traceback
        and the loop continues.
        """
        job_num = 0
        self.running = True
        while self.running and self.input_tube:
            try:
                job = self.beanstalk.reserve(self.input_tube, 3)
                if job is not None:
                    job_num += 1
                    body = job.body
                    job.delete()
                    self.processor_pool.queue_task(self._on_task_start, (body,), self._on_task_finished)

            except SocketError:
                time.sleep(30)
                self.log.error('beanstalk\tconnect\tfail\tstart\treconnect')
                try:
                    self.beanstalk.reconnect()
                    self.out_beanstalk.reconnect()
                    self.log.error('beanstalk\treconnect\tsuccess')
                except Exception:
                    self.log.error('beanstalk\treconnect\tfail')
            except Exception:
                # Narrowed from a bare ``except`` so SystemExit and
                # KeyboardInterrupt are no longer swallowed.
                self.log.error("not msg from:%s\tresult:%s" % (self.input_tube, str(traceback.format_exc())))

    def _on_task_start(self, task, **thread_locals):
        """Run the processor on one task body.

        Returns:
            The value returned by ``processor.do_task(task)``, or None if it
            raised (the traceback is logged).
        """
        result = None
        try:
            result = self.processor.do_task(task)
        except Exception:
            # ``e.message`` is deprecated and often empty; log the traceback.
            self.log.error(traceback.format_exc())
        return result

    def _on_task_finished(self, task, **thread_locals):
        """Forward a finished task's result to ``_output_msg`` under the lock.

        A list is emitted element by element, a non-empty string is emitted
        as one message, and anything else is ignored.
        """
        # ``with`` guarantees the lock is released even if _output_msg raises;
        # otherwise every later finished task would block forever.
        with self.wlock:
            if isinstance(task, list):
                for message in task:
                    self._output_msg(message)
            elif task and isinstance(task, basestring):
                self._output_msg(task)
コード例 #5
0
#!/usr/bin/Python
# coding=utf-8
import sys

from thrift.protocol.TBinaryProtocol import TBinaryProtocol
from thrift.transport.TTransport import TMemoryBuffer

sys.path.append('..')
from bdp.i_crawler.i_downloader.ttypes import DownLoadRsp
from bdp.i_crawler.i_extractor.ttypes import PageParseInfo
from i_util.pybeanstalk import PyBeanstalk

if __name__ == '__main__':
    # Debug helper: reserve one job from the 'extract_info' tube, decode its
    # body as a thrift-serialized PageParseInfo and print it. The job is not
    # deleted by this script.
    pybeanstalk = PyBeanstalk('101.201.102.37')
    try:
        extractor_info = PageParseInfo()
        job = pybeanstalk.reserve('extract_info')
        # NOTE(review): assumes reserve() can return a falsy value when no
        # job is available - confirm against PyBeanstalk. Guarding avoids an
        # AttributeError on ``.body`` in that case.
        if job:
            tMemory_o = TMemoryBuffer(job.body)
            tBinaryProtocol_o = TBinaryProtocol(tMemory_o)
            extractor_info.read(tBinaryProtocol_o)
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(extractor_info)
    except EOFError as e:
        print(e)
コード例 #6
0
ファイル: input_thread.py プロジェクト: xtuyaowu/gsxt_crawler
class InputThread(threading.Thread):
    """Daemon thread that feeds beanstalk jobs into a shared processing pool.

    Each reserved job is deleted from the input tube and its body is queued
    on ``self.process_pool``. Results are forwarded to ``self.__output_msg``
    (not part of this excerpt) under a lock.
    """

    def __init__(self, beanstalk_conf, log=None, process_pool=None):
        """Open the beanstalk connections and remember the pool.

        Args:
            beanstalk_conf: Mapping with ``host``, ``port``, ``input_tube``
                and ``output_tube``.
            log: Logger; asserted to be non-None.
            process_pool: Pool exposing ``queue_task`` and ``get_task_num``;
                asserted to be non-None.

        Raises:
            AssertionError: If any argument is None (when asserts are enabled).
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True

        assert beanstalk_conf is not None
        assert log is not None
        assert process_pool is not None

        self.beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.input_tube = beanstalk_conf['input_tube']
        self.output_tube = beanstalk_conf['output_tube']
        self.log = log
        # Only reachable for a falsy non-None logger, or when asserts are
        # stripped with -O.
        if not self.log:
            self.log = LogHandler("i_input_thread")

        self.process_pool = process_pool
        # Serializes output of finished tasks across pool threads.
        self.t_lock = threading.Lock()

    def stop(self):
        """Stop the run loop, wait for queued tasks, then drop the connection.

        Polls ``process_pool.get_task_num()`` every 5 seconds until it reports
        no remaining tasks. Failures are logged, not raised.
        """
        self.log.warning("stop input_thread")
        self.running = False
        try:
            while True:
                if self.process_pool.get_task_num() <= 0:
                    # if 'processor' in self.process_pool.thread_local_constructors:
                    #     processor = self.process_pool.thread_local_constructors['processor'][1][1]
                    #     self.log.warning("prepare call scheduler_processor to stop scheduler")
                    #     processor.save_status()
                    break
                else:
                    self.log.info("wait tasks be consumed over, wait 5s")
                    time.sleep(5)

            self.beanstalk.__del__()  # close the connection so no more data is accepted
        except Exception as e:
            self.log.error("stop input_thread fail")
            self.log.exception(e)

    def run(self):
        """Reserve jobs and queue their bodies on the pool until stopped.

        When the pool reports 50 or more pending tasks the loop pauses for
        2 seconds after queueing. On a socket error the thread sleeps
        30 seconds and tries to reconnect. Other exceptions are logged and
        the loop continues.
        """
        job_num = 0
        while self.running and self.input_tube:
            try:
                job = self.beanstalk.reserve(self.input_tube, 30)
                if job is not None:
                    job_num += 1
                    body = job.body
                    job.delete()

                    self.process_pool.queue_task(self.__on_task_start, (body,), self.__on_task_finished)
                    task_num = self.process_pool.get_task_num()
                    if task_num >= 50:
                        self.log.info("place_processor\ttasks:%d" % task_num)
                        time.sleep(2)
                else:
                    self.log.info("not msg from:%s" % self.input_tube)
            except SocketError as e:
                time.sleep(30)
                self.log.error('beanstalk\tconnect\tfail\tstart\treconnect')
                self.log.exception(e)
                try:
                    self.beanstalk.reconnect()
                    self.out_beanstalk.reconnect()
                    self.log.error('beanstalk\treconnect\tsuccess')
                except Exception as e:
                    self.log.error('beanstalk\treconnect\tfail')
                    self.log.exception(e)
            except Exception as e:
                self.log.error("not msg from:%s\tresult:" % self.input_tube)
                self.log.exception(e)

    @staticmethod
    def __on_task_start(task, **thread_locals):
        """Run the thread-local processor on one task body.

        Calls ``begin()`` on a thread-local ``'profiler'`` if present.

        Returns:
            The result of ``thread_locals['processor'].do_task(task)``, or
            None when no ``'processor'`` thread-local is supplied.
        """
        result = None
        if 'profiler' in thread_locals:
            thread_locals['profiler'].begin()
        if 'processor' in thread_locals:
            result = thread_locals['processor'].do_task(task)
        return result

    def __on_task_finished(self, result, **thread_locals):
        """Forward a finished task's result to ``__output_msg`` under the lock.

        A list is emitted element by element, a non-empty string is emitted
        as one message, and anything else is ignored. Calls ``end()`` on a
        thread-local ``'profiler'`` if present.
        """
        # ``with`` guarantees the lock is released even if the profiler or
        # __output_msg raises; otherwise later tasks would block forever.
        with self.t_lock:
            processor = thread_locals.get('processor')
            if 'profiler' in thread_locals:
                thread_locals['profiler'].end()
            if isinstance(result, list):
                for message in result:
                    self.__output_msg(message, processor)
            elif result and isinstance(result, basestring):
                self.__output_msg(result, processor)