class InputThreadNew(threading.Thread):
    """Thread that reserves jobs from a beanstalk tube and hands them to a processor.

    Each reserved job is deleted from the queue *before* it is processed, so
    a job whose processing fails is not retried.  A non-None processor result
    is forwarded through ``self.output_msg(resp, topic_id)``.

    NOTE(review): ``output_msg`` is not defined in the part of the class
    visible here; it is presumably provided elsewhere -- confirm.
    """

    # Processor class names whose do_task() returns a (response, topic_id)
    # pair instead of a bare response.
    _TOPIC_AWARE_PROCESSORS = ('ExtractorProccessor',
                               'SingleSrcMergerProccessor')

    def __init__(self, conf, processor=None, proc_name=None):
        """Validate the processor, open the connections and parse the tubes.

        :param conf: mapping with a ``'beanstalk_conf'`` entry that provides
            ``host``, ``port``, ``input_tube`` and ``output_tube``.
        :param processor: object exposing ``do_task(body)``; required.
        :param proc_name: label kept on the instance, only for logging.
        :raises Exception: when ``processor`` is None.
        """
        threading.Thread.__init__(self)
        self.log = log
        # Validate first so that no beanstalk connection is opened (and
        # leaked) when the thread cannot be constructed anyway.
        if processor is None:
            log.error("Processor not given !")
            raise Exception("Processor not given !")
        self.processor = processor

        self.running = True
        self.proc_name = proc_name  # Only for logging

        beanstalk_conf = conf['beanstalk_conf']
        self.input_tube = beanstalk_conf['input_tube']
        self.beanstalk = PyBeanstalk(beanstalk_conf['host'],
                                     beanstalk_conf['port'])
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'],
                                         beanstalk_conf['port'])
        self.output_tube = beanstalk_conf['output_tube']
        self.topic_output_tubes = self._parse_output_tubes(self.output_tube)

    @staticmethod
    def _parse_output_tubes(output_tube):
        """Build the topic-id -> output-tube routing table.

        ``output_tube`` is either a single tube name or a list of tube
        definitions of the form ``name[:topic_ids[:exclusive]]``.  Example::

            output_tube = ["default_out",
                           "only_special_out:1,2,3:exclusive",
                           "special_out:4",
                           ":5:exclusive"]

        The original author's notes for that example:

        * topic_id 1, 2, 3 only use ``only_special_out``
        * topic_id 4 goes into ``special_out`` and ``default_out``
        * topic_id 5 does not enter any queue
        * every other topic_id uses the ``default_out`` queue

        :returns: dict whose ``'default'`` key maps to a list of tube names
            and whose integer topic-id keys map to lists of
            ``(tube_name, exclusive)`` tuples.
        :raises ValueError: when a topic id is not an integer.
        """
        routing = {'default': []}
        if not isinstance(output_tube, list):
            routing['default'].append(output_tube)
            return routing

        for tube_def in output_tube:
            tube_def = tube_def.strip()
            if ':' not in tube_def:
                routing['default'].append(tube_def)
                continue
            elements = [part.strip() for part in tube_def.split(':')]
            tube_name = elements[0]
            exclusive = len(elements) == 3 and elements[2] == 'exclusive'
            for raw_id in elements[1].split(','):
                topic_id = int(raw_id.strip())
                routing.setdefault(topic_id, []).append((tube_name, exclusive))
        return routing

    def stop(self):
        """Ask the run loop to exit after its current iteration."""
        self.log.warning("stop input thread")
        self.running = False

    def run(self):
        """Reserve, delete and process jobs until stopped.

        On a socket error the thread sleeps 30 seconds and then tries to
        reconnect both beanstalk connections.
        """
        self.log.debug("starting input thread")
        while self.running and self.input_tube:
            try:
                job = self.beanstalk.reserve(self.input_tube, 3)
                if not job:
                    self.log.debug(current_process().name +
                                   " : no msg from : %s" % (self.input_tube))
                    continue
                body = job.body
                # The job is removed before processing: at-most-once delivery.
                job.delete()
                self._handle_job(body)
            except SocketError:
                time.sleep(30)
                self.log.error('beanstalk\tconnect\tfail\tstart\treconnect')
                try:
                    self.beanstalk.reconnect()
                    self.out_beanstalk.reconnect()
                    self.log.error('beanstalk\treconnect\tsuccess')
                except Exception:
                    self.log.error('beanstalk\treconnect\tfail')

    def _handle_job(self, body):
        """Run the processor on one job body and forward any result."""
        if self.processor is None:
            return
        resp = None
        topic_id = None
        try:
            if type(self.processor).__name__ in self._TOPIC_AWARE_PROCESSORS:
                resp, topic_id = self.processor.do_task(body)
            else:
                resp = self.processor.do_task(body)
        except Exception:
            # A failing job must not kill the input thread.
            self.log.error("Process failed. " + traceback.format_exc())
        if resp is not None:
            self.output_msg(resp, topic_id)
#!/usr/bin/Python
# coding=utf-8
# Debug helper: reserve one job from the 'extract_info_ws' tube, decode its
# body as a thrift-serialized PageParseInfo, print it and delete the job.
import sys

from thrift.protocol.TBinaryProtocol import TBinaryProtocol
from thrift.transport.TTransport import TMemoryBuffer

# Must run before the project imports below so they can be resolved.
sys.path.append('..')

from bdp.i_crawler.i_extractor.ttypes import PageParseInfo
from i_util.pybeanstalk import PyBeanstalk

if __name__ == '__main__':
    pybeanstalk = PyBeanstalk('101.201.100.58')
    try:
        rsp_info = PageParseInfo()
        job = pybeanstalk.reserve('extract_info_ws')
        if job:
            tMemory_o = TMemoryBuffer(job.body)
            tBinaryProtocol_o = TBinaryProtocol(tMemory_o)
            rsp_info.read(tBinaryProtocol_o)
            d = vars(rsp_info)
            # Single-argument print(...) and %-formatting give the same
            # output as the old print statements and also parse on Python 3.
            print(d)
            for k, v in d.items():
                print("%s %s" % (k, v))
            job.delete()
    except EOFError as e:
        print(e)
# Maintenance helper: drain the 'online_extract_info' tube by reserving and
# deleting jobs until the queue returns no job.
import sys

from thrift.protocol.TBinaryProtocol import TBinaryProtocol
from thrift.transport.TTransport import TMemoryBuffer

# Must run before the project imports below so they can be resolved.
sys.path.append('..')

from bdp.i_crawler.i_crawler_merge.ttypes import LinkAttr
from bdp.i_crawler.i_downloader.ttypes import DownLoadReq
from bdp.i_crawler.i_extractor.ttypes import PageParseInfo
from i_util.pybeanstalk import PyBeanstalk

if __name__ == '__main__':
    pybeanstalk = PyBeanstalk('10.25.114.50')
    try:
        info = PageParseInfo()
        while True:
            job = pybeanstalk.reserve('online_extract_info')
            if job is None:
                # Nothing was reserved; stop instead of failing on job.delete().
                break
            # To inspect a job before dropping it, read job.body into `info`
            # through TBinaryProtocol(TMemoryBuffer(job.body)) and print the
            # items of vars(info).
            job.delete()
    except EOFError as e:
        print(e)
class InputThread(threading.Thread):
    """Daemon thread that reserves beanstalk jobs and runs them on a thread pool.

    Every reserved job is deleted from the queue and its body is queued on
    ``self.processor_pool``; the pool calls ``_on_task_start`` to process the
    body and ``_on_task_finished`` with the result.

    NOTE(review): ``_output_msg`` is not defined in the part of the class
    visible here; it is presumably provided elsewhere -- confirm.
    """

    def __init__(self, conf, processor, proc_name=None):
        """Set up logging, validate the processor and open the connections.

        :param conf: mapping with ``'beanstalk_conf'`` (``host``, ``port``,
            ``input_tube``, ``output_tube``), ``'log'`` and ``'server'``
            (optional ``process_thread_num``) entries.
        :param processor: object exposing ``do_task(task)``; required.
        :param proc_name: accepted for interface compatibility; unused here.
        :raises Exception: when ``processor`` is None.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True

        self.log = conf['log']
        if not self.log:
            self.log = LogHandler("i_input_thread")

        # Validate before opening any connection so nothing leaks when the
        # thread cannot be constructed.
        self.processor = processor
        if self.processor is None:
            self.log.error("Processor not given !")
            raise Exception("Processor not given !")

        beanstalk_conf = conf['beanstalk_conf']
        self.beanstalk = PyBeanstalk(beanstalk_conf['host'],
                                     beanstalk_conf['port'])
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'],
                                         beanstalk_conf['port'])
        self.input_tube = beanstalk_conf['input_tube']
        self.output_tube = beanstalk_conf['output_tube']

        thread_num = conf['server'].get("process_thread_num", 1)
        self.processor_pool = ThreadPool(thread_num, {}, int(thread_num))
        # Held while results are written out by _on_task_finished.
        self.wlock = threading.Lock()

    def stop(self):
        """Stop reserving new jobs and join the processing pool."""
        self.log.warning("stop input_thread")
        self.running = False
        self.processor_pool.join_all()

    def run(self):
        """Reserve jobs and queue them on the pool until stopped.

        On a socket error the thread sleeps 30 seconds and then tries to
        reconnect both beanstalk connections; any other error is logged and
        the loop continues.
        """
        self.running = True
        while self.running and self.input_tube:
            try:
                job = self.beanstalk.reserve(self.input_tube, 3)
                if job is not None:
                    body = job.body
                    # The job is removed before processing: a failed task
                    # is not retried.
                    job.delete()
                    self.processor_pool.queue_task(self._on_task_start,
                                                   (body,),
                                                   self._on_task_finished)
            except SocketError:
                time.sleep(30)
                self.log.error('beanstalk\tconnect\tfail\tstart\treconnect')
                try:
                    self.beanstalk.reconnect()
                    self.out_beanstalk.reconnect()
                    self.log.error('beanstalk\treconnect\tsuccess')
                except Exception:
                    self.log.error('beanstalk\treconnect\tfail')
            except Exception:
                self.log.error("not msg from:%s\tresult:%s" %
                               (self.input_tube, str(traceback.format_exc())))

    def _on_task_start(self, task, **thread_locals):
        """Run the processor on one task; return its result or None on error."""
        result = None
        try:
            result = self.processor.do_task(task)
        except Exception:
            # e.message is deprecated and frequently empty; the traceback
            # always carries the exception type, text and location.
            self.log.error(traceback.format_exc())
        return result

    def _on_task_finished(self, task, **thread_locals):
        """Forward a task result: a non-empty string or a list of messages.

        Any other result (None, empty string, other types) is ignored.
        """
        # `with` guarantees the lock is released even when _output_msg
        # raises; otherwise every later task would block forever.
        with self.wlock:
            if isinstance(task, list):
                for message in task:
                    self._output_msg(message)
            elif task and isinstance(task, basestring):
                self._output_msg(task)
#!/usr/bin/Python
# coding=utf-8
# Debug helper: reserve one job from the 'extract_info' tube, decode its body
# as a thrift-serialized PageParseInfo and print it.  The job is not deleted.
import sys

from thrift.protocol.TBinaryProtocol import TBinaryProtocol
from thrift.transport.TTransport import TMemoryBuffer

# Must run before the project imports below so they can be resolved.
sys.path.append('..')

from bdp.i_crawler.i_downloader.ttypes import DownLoadRsp
from bdp.i_crawler.i_extractor.ttypes import PageParseInfo
from i_util.pybeanstalk import PyBeanstalk

if __name__ == '__main__':
    pybeanstalk = PyBeanstalk('101.201.102.37')
    try:
        extractor_info = PageParseInfo()
        job = pybeanstalk.reserve('extract_info')
        # Guard against an empty reserve result before touching `.body`.
        if job:
            tMemory_o = TMemoryBuffer(job.body)
            tBinaryProtocol_o = TBinaryProtocol(tMemory_o)
            extractor_info.read(tBinaryProtocol_o)
            print(extractor_info)
    except EOFError as e:
        print(e)
class InputThread(threading.Thread):
    """Daemon thread that reserves beanstalk jobs and queues them on a pool.

    Every reserved job is deleted from the queue and its body is queued on
    ``self.process_pool``.  The pool calls ``__on_task_start`` and
    ``__on_task_finished`` with its thread-local objects as keyword
    arguments; the ``'processor'`` and ``'profiler'`` entries are used when
    present.

    NOTE(review): ``__output_msg`` is not defined in the part of the class
    visible here; it is presumably provided elsewhere -- confirm.
    """

    def __init__(self, beanstalk_conf, log=None, process_pool=None):
        """Open the beanstalk connections and remember the pool.

        :param beanstalk_conf: mapping with ``host``, ``port``,
            ``input_tube`` and ``output_tube``.
        :param log: logger; must not be None.
        :param process_pool: pool exposing ``queue_task`` and
            ``get_task_num``; must not be None.
        :raises AssertionError: when any of the three arguments is None.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True
        # Kept as asserts so callers keep seeing AssertionError.
        assert beanstalk_conf is not None
        assert log is not None
        assert process_pool is not None
        self.beanstalk = PyBeanstalk(beanstalk_conf['host'],
                                     beanstalk_conf['port'])
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'],
                                         beanstalk_conf['port'])
        self.input_tube = beanstalk_conf['input_tube']
        self.output_tube = beanstalk_conf['output_tube']
        self.log = log
        if not self.log:
            # Only reachable for a logger object that is not None but falsy.
            self.log = LogHandler("i_input_thread")
        self.process_pool = process_pool
        # Held while results are written out by __on_task_finished.
        self.t_lock = threading.Lock()

    def stop(self):
        """Stop reserving jobs, wait for queued tasks, then drop the connection."""
        self.log.warning("stop input_thread")
        self.running = False
        try:
            while self.process_pool.get_task_num() > 0:
                self.log.info("wait tasks be consumed over, wait 5s")
                time.sleep(5)
            # Original author's note: close the connection so that no more
            # data is accepted.
            self.beanstalk.__del__()
        except Exception as e:
            self.log.error("stop input_thread fail")
            self.log.exception(e)

    def run(self):
        """Reserve jobs and queue them on the pool until stopped.

        When the pool reports 50 or more pending tasks the loop sleeps two
        seconds before reserving again.  On a socket error the thread sleeps
        30 seconds and then tries to reconnect both connections; any other
        error is logged and the loop continues.
        """
        while self.running and self.input_tube:
            try:
                job = self.beanstalk.reserve(self.input_tube, 30)
                if job is not None:
                    body = job.body
                    # The job is removed before processing: a failed task
                    # is not retried.
                    job.delete()
                    self.process_pool.queue_task(self.__on_task_start,
                                                 (body,),
                                                 self.__on_task_finished)
                    task_num = self.process_pool.get_task_num()
                    if task_num >= 50:
                        self.log.info("place_processor\ttasks:%d" % task_num)
                        time.sleep(2)
                else:
                    self.log.info("not msg from:%s" % self.input_tube)
            except SocketError as e:
                time.sleep(30)
                self.log.error('beanstalk\tconnect\tfail\tstart\treconnect')
                self.log.exception(e)
                try:
                    self.beanstalk.reconnect()
                    self.out_beanstalk.reconnect()
                    self.log.error('beanstalk\treconnect\tsuccess')
                except Exception as e:
                    self.log.error('beanstalk\treconnect\tfail')
                    self.log.exception(e)
            except Exception as e:
                self.log.error("not msg from:%s\tresult:" % self.input_tube)
                self.log.exception(e)

    @staticmethod
    def __on_task_start(task, **thread_locals):
        """Process one task with the pool's thread-local processor.

        Calls ``begin()`` on the thread-local profiler when one is present.
        Returns the processor result, or None when no processor is supplied.
        """
        result = None
        if 'profiler' in thread_locals:
            thread_locals['profiler'].begin()
        if 'processor' in thread_locals:
            result = thread_locals['processor'].do_task(task)
        return result

    def __on_task_finished(self, result, **thread_locals):
        """Forward a task result: a non-empty string or a list of messages.

        Calls ``end()`` on the thread-local profiler when one is present.
        Any other result (None, empty string, other types) is ignored.
        """
        # `with` guarantees the lock is released even when the profiler or
        # __output_msg raises; otherwise every later task would block.
        with self.t_lock:
            processor = thread_locals.get('processor')
            if 'profiler' in thread_locals:
                thread_locals['profiler'].end()
            if isinstance(result, list):
                for message in result:
                    self.__output_msg(message, processor)
            elif result and isinstance(result, basestring):
                self.__output_msg(result, processor)