def __init__(self, beanstalk_conf, log=None, process_pool=None):
    """Initialise a daemon thread bound to an input and an output tube.

    Args:
        beanstalk_conf: mapping read for the keys ``host``, ``port``,
            ``input_tube`` and ``output_tube``.
        log: logger to use; when falsy, a ``LogHandler("i_input_thread")``
            is created instead.
        process_pool: stored unchanged on ``self.process_pool``.
    """
    threading.Thread.__init__(self)
    self.daemon = True
    self.running = True

    host = beanstalk_conf['host']
    port = beanstalk_conf['port']
    # Two independent client objects are built from the same host/port:
    # one kept as ``beanstalk``, one as ``out_beanstalk``.
    self.beanstalk = PyBeanstalk(host, port)
    self.out_beanstalk = PyBeanstalk(host, port)

    self.input_tube = beanstalk_conf['input_tube']
    self.output_tube = beanstalk_conf['output_tube']

    # Fall back to a default logger when the caller supplied none.
    self.log = log or LogHandler("i_input_thread")
    self.process_pool = process_pool
    self.wlock = threading.Lock()
def __init__(self, log, selector_conf, beanstalk_conf, scheduler=None):
    """Initialise the thread state, counters and the output beanstalk client.

    Args:
        log: logger stored on ``self.log``.
        selector_conf: selector configuration, stored unchanged.
        beanstalk_conf: mapping read for ``host``, ``port`` and
            ``output_tube``; also stored unchanged.
        scheduler: optional object stored on ``self.scheduler``.
    """
    threading.Thread.__init__(self)
    self.daemon = True
    # Unlike a freshly started worker, this thread begins in the
    # "not running" state.
    self.running = False
    self.log = log
    self.scheduler = scheduler

    # Download statistics.
    self.site_static = {}
    self.download_req_num = 0

    # Downloader configuration. The former registration of Thrift
    # downloaders from a ``downloader_conf`` list is disabled, so the
    # counter simply stays at zero here.
    self.downloader_num = 0

    # Selector configuration.
    self.selector_conf = selector_conf

    # Beanstalk queue settings.
    self.beanstalk_conf = beanstalk_conf
    host = beanstalk_conf['host']
    port = beanstalk_conf['port']
    self.out_beanstalk = PyBeanstalk(host, port)
    self.output_tube = beanstalk_conf['output_tube']
    self.wlock = threading.Lock()
def __init__(self, conf, convert, select_handler):
    """Store the collaborators and create the beanstalk client.

    Args:
        conf: mapping; ``conf['log']`` must exist and
            ``conf.get('beanstalk_conf')`` is read for ``host`` and ``port``.
        convert: stored unchanged on ``self.convert``.
        select_handler: stored unchanged on ``self.select_handler``.
    """
    self.conf = conf
    self.log = conf['log']
    self.convert = convert
    self.select_handler = select_handler

    # ``.get`` is used throughout, so a missing host/port is passed on as
    # ``None`` rather than raising ``KeyError`` here.
    beanstalk_conf = conf.get('beanstalk_conf')
    host = beanstalk_conf.get('host')
    port = beanstalk_conf.get('port')
    self.beanstalk = PyBeanstalk(host, port)
def __init__(self, conf, processor=None, proc_name=None):
    """Initialise the worker thread and build the topic -> tube routing table.

    Args:
        conf: mapping whose ``beanstalk_conf`` entry is read for ``host``,
            ``port``, ``input_tube`` and ``output_tube``.
        processor: required collaborator stored on ``self.processor``.
        proc_name: label stored on ``self.proc_name`` (only for logging).

    Raises:
        Exception: if ``processor`` is ``None``.

    Note:
        ``log`` is not a parameter of this method; it is resolved from the
        enclosing module scope, exactly as before.
    """
    threading.Thread.__init__(self)

    # Validate first: previously two beanstalk clients were created before
    # this check, so a bad call left them behind.
    if processor is None:
        log.error("Processor not given !")
        raise Exception("Processor not given !")
    self.processor = processor

    self.running = True
    self.proc_name = proc_name  # Only for logging

    beanstalk_conf = conf['beanstalk_conf']
    self.input_tube = beanstalk_conf['input_tube']
    self.beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
    self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
    self.output_tube = beanstalk_conf['output_tube']

    # Routing table: key 'default' holds plain tube names; every integer
    # topic id holds a list of (tube_name, exclusive) tuples.
    #
    # Each entry of a list-valued output_tube is "name[:ids[:exclusive]]":
    #   output_tube = ["default_out", "only_special_out:1,2,3:exclusive",
    #                  "special_out:4", ":5:exclusive"]
    # According to the original author's note, consumers of this table are
    # expected to route as follows (not enforced in this method):
    #   topic_id 1, 2, 3 -> only_special_out only
    #   topic_id 4       -> special_out and default_out
    #   topic_id 5       -> no queue at all
    #   anything else    -> default_out
    self.topic_output_tubes = {'default': []}
    default_tubes = self.topic_output_tubes['default']

    if isinstance(self.output_tube, list):
        for tube_def in self.output_tube:
            elements = [part.strip() for part in tube_def.strip().split(':')]
            if len(elements) < 2:
                # No topic list: this is a default tube.
                default_tubes.append(elements[0])
                continue
            tube_name = elements[0]
            topic_ids = [int(item.strip()) for item in elements[1].split(',')]
            exclusive = len(elements) == 3 and elements[2] == 'exclusive'
            for topic_id in topic_ids:
                self.topic_output_tubes.setdefault(topic_id, []).append(
                    (tube_name, exclusive))
    else:
        # A single (non-list) value is used as the one default tube.
        default_tubes.append(self.output_tube)

    self.log = log
def __init__(self, beanstalk_conf, log):
    """Create the internal queue, the beanstalk client and the daemon thread.

    Args:
        beanstalk_conf: mapping read for ``host`` and ``port``; also kept
            on ``self.beanstalk_conf``.
        log: logger stored on ``self._log``.
    """
    self._log = log
    self._queue = Queue()

    host, port = beanstalk_conf['host'], beanstalk_conf['port']
    self.beanstalk = PyBeanstalk(host, port)
    self.beanstalk_conf = beanstalk_conf

    # The Thread base class is initialised after the attributes above;
    # ``daemon`` may only be assigned once that has happened.
    threading.Thread.__init__(self)
    self.daemon = True
    self.running = True
def put_beanstalked(beanstalk_conf, log, rsp):
    """Serialise *rsp* and put it on the configured input tube.

    A new ``PyBeanstalk`` client is created on every call. A failing put is
    logged and swallowed; nothing is returned in either case.

    Args:
        beanstalk_conf: mapping read for ``host``, ``port`` and ``input_tube``.
        log: logger providing ``info`` and ``error``.
        rsp: object passed to ``to_string``; its ``url`` attribute is logged.
    """
    beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
    tube = beanstalk_conf['input_tube']
    str_page_info = to_string(log, rsp)
    try:
        beanstalk.put(tube, str_page_info)
    except Exception as e:
        # %-formatting instead of ``+``: concatenation raised a TypeError
        # inside this handler whenever ``rsp.url`` was not a string.
        # ``%r`` keeps the cause in the log without any encoding pitfalls.
        log.error('beanstalk put error url:%s\ttube:%s\terror:%r'
                  % (rsp.url, tube, e))
    else:
        log.info('beanstalk\turl:%s\ttube:%s' % (rsp.url, tube))
def __init__(self, site, url_pattern=".*", test=True, parser_id=None):
    """Open the beanstalk and MongoDB clients and record the run parameters.

    Args:
        site: stored on ``self.site``.
        url_pattern: stored on ``self.url_pattern`` (default ``".*"``).
        test: stored on ``self.test``.
        parser_id: stored on ``self.parser_id``.
    """
    # Kept from the original; the visible body never reads or assigns it.
    global logger

    # Remote services (host names and ports are fixed in code).
    self.beanstalk = PyBeanstalk("Crawler-Downloader1:Crawler-Downloader2", 11300)
    self.mongo_client = pymongo.MongoClient('Crawler-DataServer1', 40042)

    # Output side: tube name plus a bounded in-process queue.
    self.output_tube = 'download_rsp'
    self.output_queue = queue.Queue(maxsize=1000)

    # Caller-supplied parameters.
    self.site = site
    self.url_pattern = url_pattern
    self.test = test
    self.parser_id = parser_id
def deliver_req():
    """Forward requests from ``index_queue`` to the ``online_download_req`` tube.

    Runs forever. Each queue item is a ``(priority, reqs)`` pair; ``reqs`` is
    serialised with ``req_to_string`` and put on the tube, after which the
    loop pauses for six seconds.

    NOTE(review): the original text was collapsed onto one line; this assumes
    ``time.sleep(6)`` belonged to the loop body after the ``try`` block.
    Confirm against the real file.
    """
    out_beanstalk = PyBeanstalk('172.18.180.223', 11300)
    while True:
        try:
            # Block briefly instead of get_nowait() + continue, which spun
            # at full CPU for as long as the queue stayed empty.
            _priority, reqs = index_queue.get(timeout=1)
        except Empty:
            continue
        req_str = req_to_string(reqs)
        out_beanstalk.put('online_download_req', req_str)
        time.sleep(6)
def __init__(self, log, conf):
    """Store the logger/config and create the outgoing beanstalk client.

    Args:
        log: logger; must not be ``None``.
        conf: dict providing ``type_extractor_map``, ``smart_proxy_url`` and
            a ``beanstalk_conf`` mapping with ``host``, ``port`` and
            ``output_tube_scheduler``.

    Raises:
        AssertionError: if ``log`` is ``None`` or ``conf`` is not a dict
            (only when assertions are enabled).
    """
    self.log = log
    self.conf = conf
    assert log is not None
    assert isinstance(conf, dict)

    self.type_extractor_map = self.conf['type_extractor_map']
    self.smart_proxy_url = self.conf['smart_proxy_url']

    beanstalk_conf = self.conf['beanstalk_conf']
    self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'],
                                     beanstalk_conf['port'])
    self.output_tube_scheduler = beanstalk_conf['output_tube_scheduler']
def put_beanstalkd(self, tube_name, obj):
    """Serialise *obj* and put it on *tube_name*, logging the outcome.

    On a socket error the client is rebuilt from ``self.beanstalk_conf``;
    the current payload is not re-sent. Any other exception is logged with
    its traceback and swallowed.

    Args:
        tube_name: name of the target tube.
        obj: object handed to ``self.to_string``.
    """
    str_page_info = self.to_string(obj)
    try:
        self.beanstalk.put(tube_name, str_page_info)
        self._log.info('put beanstalk \ttube:%s success' % (tube_name, ))
    except SocketError as e:
        # Format the exception itself: ``e.message`` is empty for
        # (errno, text) style socket errors and does not exist on Python 3,
        # so the log used to lose the actual cause.
        self._log.error('beanstalk connect failed, {}'.format(e))
        self.beanstalk = PyBeanstalk(self.beanstalk_conf['host'],
                                     self.beanstalk_conf['port'])
    except Exception:
        # A failed put is an error, not routine information.
        self._log.error('beanstalk put tube{} error {}'.format(
            tube_name, str(traceback.format_exc())))
def __init__(self, conf, processor, proc_name=None):
    """Initialise the daemon thread, its beanstalk clients and worker pool.

    Args:
        conf: mapping providing ``beanstalk_conf`` (``host``, ``port``,
            ``input_tube``, ``output_tube``), ``log`` and ``server``
            (optional ``process_thread_num``, default 1).
        processor: required collaborator stored on ``self.processor``.
        proc_name: accepted but not used in this method.

    Raises:
        Exception: if ``processor`` is ``None``.
    """
    threading.Thread.__init__(self)
    self.daemon = True
    self.running = True

    beanstalk_conf = conf['beanstalk_conf']
    host, port = beanstalk_conf['host'], beanstalk_conf['port']
    self.beanstalk = PyBeanstalk(host, port)
    self.out_beanstalk = PyBeanstalk(host, port)
    self.input_tube = beanstalk_conf['input_tube']
    self.output_tube = beanstalk_conf['output_tube']

    # A falsy configured logger is replaced by a default one.
    self.log = conf['log'] or LogHandler("i_input_thread")

    self.processor = processor
    if self.processor is None:
        self.log.error("Processor not given !")
        raise Exception("Processor not given !")

    # The configured thread count is passed raw as the first argument and
    # int-converted as the third, as in the original call.
    thread_num = conf['server'].get("process_thread_num", 1)
    self.processor_pool = ThreadPool(thread_num, {}, int(thread_num))
    self.wlock = threading.Lock()
def __init__(self, conf):
    """Store the configuration and try to open the MongoDB and beanstalk clients.

    Connection setup is best-effort: a failure is logged with its traceback
    and construction still succeeds. In that case the affected client
    attribute(s) stay ``None``.

    Args:
        conf: mapping providing ``log``, ``beanstalk_conf`` (``host``,
            ``port``) and ``webpage_db`` (``host``, ``port``, ``db``,
            ``username``, ``password``).
    """
    self.log = conf['log']
    self.conf = conf
    self.beanstalk_conf = conf['beanstalk_conf']

    # Defined up front so a failed setup leaves explicit ``None`` values
    # rather than missing attributes.
    self.mongo_client_web = None
    self.beanstalk_client = None
    try:
        webpage_db = self.conf['webpage_db']
        self.mongo_client_web = PyMongo(
            webpage_db['host'],
            webpage_db['port'],
            webpage_db['db'],
            webpage_db['username'],
            webpage_db['password'])
        self.beanstalk_client = PyBeanstalk(self.beanstalk_conf['host'],
                                            self.beanstalk_conf['port'])
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        self.log.error(traceback.format_exc())
def thrput_task():
    """Push generated requests onto ``download_req`` and print the elapsed time.

    Makes 100 passes over the keys of the module-level ``pro`` mapping; for
    each key a request is built with ``getreq``, serialised with
    ``to_string`` and put on the tube. Per-item failures are printed and
    skipped.

    NOTE(review): the original text was collapsed onto one line; this assumes
    the timing print and ``closetransport`` ran once, after both loops.
    """
    input_tube = 'download_req'
    beanstalk = PyBeanstalk('101.201.102.37', 11300)
    _client, transport = getclient()
    start = time.time()
    try:
        for _ in range(100):
            for i in pro.keys():
                try:
                    req = getreq(proa=i)
                    beanstalk.put(input_tube, to_string(req))
                except Exception as e:
                    # ``e.message`` is often empty and is gone on Python 3;
                    # repr() always shows the type and the arguments.
                    print(repr(e))
        print('usetime:{}'.format(time.time() - start))
    finally:
        # Close the transport even if the run is interrupted.
        closetransport(transport)
import os import json import traceback import time from thrift.protocol.TBinaryProtocol import TBinaryProtocol from thrift.transport.TTransport import TMemoryBuffer sys.path.append('..') from i_util.pybeanstalk import PyBeanstalk from bdp.i_crawler.i_downloader.ttypes import DownLoadReq from i_util.logs import LogHandler import config log = LogHandler('re_crawler', console_out=True) beanstalk_conf = config.beanstalk_conf beanstalk_client = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port']) def req_to_string(req): str_req = "" try: tMemory_b = TMemoryBuffer() tBinaryProtocol_b = TBinaryProtocol(tMemory_b) req.write(tBinaryProtocol_b) str_req = tMemory_b.getvalue() except: log.error('crawled_failt\terror:%s' % (traceback.format_exc())) return str_req def create_download_req(url,
#!/usr/bin/Python
# coding=utf-8
import sys

from thrift.protocol.TBinaryProtocol import TBinaryProtocol
from thrift.transport.TTransport import TMemoryBuffer

sys.path.append('..')
from bdp.i_crawler.i_downloader.ttypes import DownLoadRsp
from bdp.i_crawler.i_extractor.ttypes import PageParseInfo
from i_util.pybeanstalk import PyBeanstalk

# Debug helper: reserve one job from the 'extract_info' tube, decode it as a
# PageParseInfo and print it. The job is not deleted.
if __name__ == '__main__':
    pybeanstalk = PyBeanstalk('101.201.102.37')
    try:
        extractor_info = PageParseInfo()
        job = pybeanstalk.reserve('extract_info')
        # Guard against a falsy result, as the sibling inspection script
        # does; previously ``.body`` was dereferenced unconditionally.
        # NOTE(review): presumably reserve() returns None on timeout; confirm.
        if job:
            tMemory_o = TMemoryBuffer(job.body)
            tBinaryProtocol_o = TBinaryProtocol(tMemory_o)
            extractor_info.read(tBinaryProtocol_o)
            print(extractor_info)
    except EOFError as e:
        print(e)
#!/usr/bin/Python
# coding=utf-8
import sys

from thrift.protocol.TBinaryProtocol import TBinaryProtocol
from thrift.transport.TTransport import TMemoryBuffer

sys.path.append('..')
from bdp.i_crawler.i_extractor.ttypes import PageParseInfo
from i_util.pybeanstalk import PyBeanstalk

# Debug helper: reserve one job from 'extract_info_ws', decode it as a
# PageParseInfo, dump its attributes and delete the job.
if __name__ == '__main__':
    pybeanstalk = PyBeanstalk('101.201.100.58')
    try:
        rsp_info = PageParseInfo()
        job = pybeanstalk.reserve('extract_info_ws')
        if job:
            transport = TMemoryBuffer(job.body)
            rsp_info.read(TBinaryProtocol(transport))
            d = vars(rsp_info)
            # Whole attribute dict first, then one "name value" line each.
            print(d)
            for k, v in d.items():
                print("%s %s" % (k, v))
            job.delete()
    except EOFError as e:
        print(e)
#!/usr/bin/Python # coding=utf-8 import sys from thrift.protocol.TBinaryProtocol import TBinaryProtocol from thrift.transport.TTransport import TMemoryBuffer sys.path.append('..') from bdp.i_crawler.i_crawler_merge.ttypes import LinkAttr from bdp.i_crawler.i_downloader.ttypes import DownLoadReq from bdp.i_crawler.i_extractor.ttypes import PageParseInfo from i_util.pybeanstalk import PyBeanstalk if __name__ == '__main__': pybeanstalk = PyBeanstalk('10.25.114.50') try: #link_info = DownLoadReq(); info = PageParseInfo() while True: #print pybeanstalk.stats_tube('download_req') job = pybeanstalk.reserve('online_extract_info') """ tMemory_o = TMemoryBuffer(job.body) tBinaryProtocol_o = TBinaryProtocol(tMemory_o) info.read(tBinaryProtocol_o) d = vars(info) for k,v in d.items(): print k,v """ job.delete() #break;