class InputThread(threading.Thread):
    """Daemon thread owning the beanstalk connections and a worker pool.

    Only construction and shutdown are defined in this block.
    """

    def __init__(self, beanstalk_conf, log=None, process_pool=None):
        """Open the beanstalk connections and remember tubes and pool.

        Args:
            beanstalk_conf: mapping providing 'host', 'port', 'input_tube'
                and 'output_tube'.
            log: logger; a LogHandler("i_input_thread") is created when falsy.
            process_pool: pool object queried by stop() for pending tasks.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True
        self.beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.input_tube = beanstalk_conf['input_tube']
        self.output_tube = beanstalk_conf['output_tube']
        self.log = log
        if not self.log:
            self.log = LogHandler("i_input_thread")
        self.process_pool = process_pool
        self.wlock = threading.Lock()

    def stop(self):
        """Stop the thread once the pool has no pending tasks.

        Polls the pool every 5 seconds until it reports zero tasks, asks the
        pool's registered 'processor' (if any) to save its status, then
        closes the input connection. Any failure is logged, not raised.
        """
        self.log.warning("stop input_thread")
        self.running = False
        try:
            while self.process_pool.get_task_num() != 0:
                self.log.info("wait tasks be consumed over, wait 5s")
                time.sleep(5)
            constructors = self.process_pool.thread_local_constructors
            # `in` replaces dict.has_key(), which no longer exists on Python 3.
            if 'processor' in constructors:
                processor = constructors['processor'][1][1]
                self.log.warning("prepare call scheduler_processor to stop scheduler")
                processor.save_status()
            # Close the connection so that no more data is accepted.
            self.beanstalk.__del__()
        except Exception as e:
            # Format the exception itself; `e.message` is deprecated and is
            # empty for exceptions built with more than one argument.
            self.log.error("stop input_thread fail:%s" % e)
def __init__(self, log, selector_conf, beanstalk_conf, scheduler=None):
    """Set up the selector thread state and its outgoing beanstalk connection.

    Args:
        log: logger stored on the instance.
        selector_conf: selector settings, stored as-is.
        beanstalk_conf: mapping providing 'host', 'port' and 'output_tube'.
        scheduler: optional scheduler object, stored as-is.
    """
    threading.Thread.__init__(self)
    self.daemon = True
    # The thread is created in the "not running" state.
    self.running = False
    self.log = log
    self.scheduler = scheduler

    # Download statistics.
    self.site_static = {}
    self.download_req_num = 0

    # Downloader bookkeeping. The code that used to build a list of
    # ThriftDownloader clients is disabled, so the count stays at zero.
    self.downloader_num = 0

    # Selector settings.
    self.selector_conf = selector_conf

    # Beanstalk queue settings.
    self.beanstalk_conf = beanstalk_conf
    host, port = beanstalk_conf['host'], beanstalk_conf['port']
    self.out_beanstalk = PyBeanstalk(host, port)
    self.output_tube = beanstalk_conf['output_tube']
    self.wlock = threading.Lock()
def put_beanstalked(beanstalk_conf, log, rsp):
    """Serialize ``rsp`` and put it on the configured input tube.

    A new PyBeanstalk connection is opened on every call. A failed put is
    logged and swallowed; nothing is returned.

    Args:
        beanstalk_conf: mapping providing 'host', 'port' and 'input_tube'.
        log: logger used for the success and failure messages.
        rsp: object with a ``url`` attribute, serialized via ``to_string``.
    """
    beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
    tube = beanstalk_conf['input_tube']
    str_page_info = to_string(log, rsp)
    try:
        beanstalk.put(tube, str_page_info)
        log.info('beanstalk\turl:%s\ttube:%s' % (rsp.url, tube))
    except Exception as e:
        # Report at error level and include the cause. %-formatting also
        # avoids a TypeError from `+` when url is not a string.
        log.error('beanstalk put error url:%s\ttube:%s\terror:%s' % (rsp.url, tube, e))
def __init__(self, beanstalk_conf, log):
    """Create the record queue and the beanstalk connection for this thread.

    Args:
        beanstalk_conf: mapping providing 'host' and 'port'; kept on the
            instance as ``beanstalk_conf``.
        log: logger stored as ``_log``.
    """
    threading.Thread.__init__(self)
    self.daemon = True
    self.running = True
    self._log = log
    # Records wait here until they are consumed.
    self._queue = Queue()
    self.beanstalk_conf = beanstalk_conf
    self.beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
def deliver_req():
    """Forward queued requests to the 'online_download_req' tube forever.

    Each item taken from ``index_queue`` is a ``(priority, reqs)`` pair;
    ``reqs`` is serialized with ``req_to_string`` and put on the tube.
    This function never returns.
    """
    out_beanstalk = PyBeanstalk('172.18.180.223', 11300)
    while True:
        try:
            # Wait for an item instead of polling with get_nowait(), which
            # spun at full CPU whenever the queue was empty.
            # NOTE(review): assumes index_queue follows the standard
            # Queue.get(timeout=...) API -- confirm.
            priority, reqs = index_queue.get(timeout=6)
        except Empty:
            continue
        out_beanstalk.put('online_download_req', req_to_string(reqs))
        # NOTE(review): the 6 s pause is kept after each delivery; confirm
        # this throttle is intended.
        time.sleep(6)
def put_beanstalkd(self, tube_name, obj):
    """Serialize ``obj`` and put it on ``tube_name``.

    On a socket error the connection is rebuilt from ``beanstalk_conf``;
    the current message is not retried. Any other failure is logged with
    its traceback. Nothing is raised to the caller for a failed put.

    Args:
        tube_name: name of the destination tube.
        obj: object passed to ``self.to_string`` for serialization.
    """
    str_page_info = self.to_string(obj)
    try:
        self.beanstalk.put(tube_name, str_page_info)
        self._log.info('put beanstalk \ttube:%s success' % (tube_name, ))
    except SocketError as e:
        # Format the exception itself: `e.message` is empty for errors
        # built from several arguments and does not exist on Python 3.
        self._log.error('beanstalk connect failed, {}'.format(e))
        self.beanstalk = PyBeanstalk(self.beanstalk_conf['host'],
                                     self.beanstalk_conf['port'])
    except Exception:
        # A failed put is an error, not an informational event.
        self._log.error('beanstalk put tube{} error {}'.format(
            tube_name, traceback.format_exc()))
def __init__(self, log, conf):
    """Store the logger and configuration and open the scheduler queue.

    Args:
        log: logger; must not be None.
        conf: dict providing 'type_extractor_map', 'smart_proxy_url' and a
            'beanstalk_conf' mapping with 'host', 'port' and
            'output_tube_scheduler'.

    Raises:
        AssertionError: if ``log`` is None or ``conf`` is not a dict.
    """
    self.log, self.conf = log, conf
    assert log is not None
    assert isinstance(conf, dict)

    self.type_extractor_map = conf['type_extractor_map']
    self.smart_proxy_url = conf['smart_proxy_url']

    queue_conf = conf['beanstalk_conf']
    self.out_beanstalk = PyBeanstalk(queue_conf['host'], queue_conf['port'])
    self.output_tube_scheduler = queue_conf['output_tube_scheduler']
class PutBeanstaldServer(threading.Thread):
    """Background thread that serializes queued records onto beanstalk tubes.

    Producers call save_record(); run() takes records off the internal queue
    and forwards them with put_beanstalkd().
    """

    def __init__(self, beanstalk_conf, log):
        """Create the record queue and the beanstalk connection.

        Args:
            beanstalk_conf: mapping providing 'host' and 'port'; also used
                by get_tube_by_name() for lookups.
            log: logger stored as ``_log``.
        """
        self._queue = Queue()
        self._log = log
        self.beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.beanstalk_conf = beanstalk_conf
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True

    def to_string(self, page_info):
        """Return ``page_info`` serialized to a string, or None on EOFError."""
        str_page_info = None
        try:
            tMemory_b = TMemoryBuffer()
            tBinaryProtocol_b = TBinaryServerProtocol(tMemory_b)
            page_info.write(tBinaryProtocol_b)
            str_page_info = tMemory_b.getvalue()
        except EOFError:
            self._log.warning("cann't write data to string")
        return str_page_info

    def put_beanstalkd(self, tube_name, obj):
        """Serialize ``obj`` and put it on ``tube_name``.

        On a socket error the connection is rebuilt; the current message is
        not retried. Other failures are logged with their traceback.
        """
        str_page_info = self.to_string(obj)
        try:
            self.beanstalk.put(tube_name, str_page_info)
            self._log.info('put beanstalk \ttube:%s success' % (tube_name, ))
        except SocketError as e:
            # Format the exception itself: `e.message` is empty for errors
            # built from several arguments and does not exist on Python 3.
            self._log.error('beanstalk connect failed, {}'.format(e))
            self.beanstalk = PyBeanstalk(self.beanstalk_conf['host'],
                                         self.beanstalk_conf['port'])
        except Exception:
            # A failed put is an error, not an informational event.
            self._log.error('beanstalk put tube{} error {}'.format(
                tube_name, traceback.format_exc()))

    def run(self):
        """Consume records from the queue forever, forwarding each one."""
        while True:
            record = self._queue.get()
            self._build_record_and_put(record)

    def get_tube_by_name(self, tube_name):
        """Return the configured value for ``tube_name``, or None if absent."""
        return self.beanstalk_conf.get(tube_name, None)

    def _build_record_and_put(self, data):
        """Forward ``data['obj']`` to ``data['tube_name']``.

        Records with a missing or falsy 'tube_name' or 'obj' are dropped.
        """
        tube_name = data.get('tube_name', None)
        if not tube_name:
            return
        obj = data.get('obj', None)
        if not obj:
            return
        self.put_beanstalkd(tube_name, obj)

    def save_record(self, data):
        """Queue ``data`` for delivery by the thread."""
        self._queue.put(data)
def __init__(self, conf):
    """Connect to the web-page MongoDB and to beanstalk.

    Connection failures are logged, not raised; the corresponding client
    attributes are then left as None.

    Args:
        conf: mapping providing 'log', 'beanstalk_conf' ('host', 'port')
            and 'webpage_db' ('host', 'port', 'db', 'username', 'password').
    """
    self.log = conf['log']
    self.conf = conf
    self.beanstalk_conf = conf['beanstalk_conf']
    # Define both clients up front so a failed connection leaves a
    # well-defined None instead of a missing attribute.
    self.mongo_client_web = None
    self.beanstalk_client = None
    try:
        web_db = self.conf['webpage_db']
        self.mongo_client_web = PyMongo(
            web_db['host'], web_db['port'], web_db['db'],
            web_db['username'], web_db['password'])
        self.beanstalk_client = PyBeanstalk(self.beanstalk_conf['host'],
                                            self.beanstalk_conf['port'])
    except Exception:
        # `except Exception` lets KeyboardInterrupt/SystemExit propagate,
        # which the previous bare `except:` swallowed.
        self.log.error(traceback.format_exc())
def __init__(self, beanstalk_conf, log=None, process_pool=None):
    """Open the input and output beanstalk connections for this thread.

    Args:
        beanstalk_conf: mapping providing 'host', 'port', 'input_tube'
            and 'output_tube'.
        log: logger; a LogHandler("i_input_thread") is created when falsy.
        process_pool: pool object stored as ``process_pool``.
    """
    threading.Thread.__init__(self)
    self.daemon = True
    self.running = True

    host = beanstalk_conf['host']
    port = beanstalk_conf['port']
    self.beanstalk = PyBeanstalk(host, port)
    self.out_beanstalk = PyBeanstalk(host, port)
    self.input_tube = beanstalk_conf['input_tube']
    self.output_tube = beanstalk_conf['output_tube']

    self.log = log if log else LogHandler("i_input_thread")
    self.process_pool = process_pool
    self.wlock = threading.Lock()
def __init__(self, conf, convert, select_handler):
    """Store the collaborators and open a beanstalk connection.

    Args:
        conf: mapping providing 'log' and a 'beanstalk_conf' mapping with
            'host' and 'port'.
        convert: stored as ``convert``.
        select_handler: stored as ``select_handler``.
    """
    self.conf = conf
    self.log = conf['log']
    self.convert = convert
    self.select_handler = select_handler

    queue_conf = conf.get('beanstalk_conf')
    host = queue_conf.get('host')
    port = queue_conf.get('port')
    self.beanstalk = PyBeanstalk(host, port)
def __init__(self, conf, processor=None, proc_name=None):
    """Open the beanstalk connections and build the topic-to-tube table.

    Args:
        conf: mapping whose 'beanstalk_conf' provides 'input_tube', 'host',
            'port' and 'output_tube' (a tube name or a list of definitions).
        processor: object that handles tasks; required.
        proc_name: label kept for logging only.

    Raises:
        Exception: if ``processor`` is None.
    """
    threading.Thread.__init__(self)
    self.running = True
    self.proc_name = proc_name  # Only for logging
    queue_conf = conf['beanstalk_conf']
    self.input_tube = queue_conf['input_tube']
    self.beanstalk = PyBeanstalk(queue_conf['host'], queue_conf['port'])
    self.out_beanstalk = PyBeanstalk(queue_conf['host'], queue_conf['port'])
    self.output_tube = queue_conf['output_tube']

    # Tube definitions have the form "name[:id,id,...[:exclusive]]", e.g.
    #   ["default_out", "only_special_out:1,2,3:exclusive",
    #    "special_out:4", ":5:exclusive"]
    # According to the original note: topics 1, 2 and 3 only use
    # only_special_out; topic 4 goes to special_out and default_out;
    # topic 5 is not queued at all; every other topic uses default_out.
    self.topic_output_tubes = {'default': []}
    if type(self.output_tube) is list:
        for raw_def in self.output_tube:
            stripped = raw_def.strip()
            if ':' not in stripped:
                # A bare name is a default tube.
                self.topic_output_tubes['default'].append(stripped)
                continue
            parts = [piece.strip() for piece in stripped.split(':')]
            tube_name = parts[0]
            topic_ids = [int(piece.strip()) for piece in parts[1].split(',')]
            exclusive = len(parts) == 3 and parts[2] == 'exclusive'
            for topic_id in topic_ids:
                self.topic_output_tubes.setdefault(topic_id, []).append(
                    (tube_name, exclusive))
    else:
        self.topic_output_tubes['default'].append(self.output_tube)

    # `log` is the module-level logger, not a parameter.
    self.log = log
    if processor is None:
        log.error("Processor not given !")
        raise Exception("Processor not given !")
    self.processor = processor
def __init__(self, site, url_pattern=".*", test=True, parser_id=None):
    """Store the selection parameters and open the queue and database clients.

    Args:
        site: stored as ``site``.
        url_pattern: stored as ``url_pattern``; defaults to ".*".
        test: stored as ``test``.
        parser_id: stored as ``parser_id``.
    """
    global logger
    # Caller-supplied parameters.
    self.site = site
    self.url_pattern = url_pattern
    self.test = test
    self.parser_id = parser_id

    # Hard-coded endpoints and output settings.
    self.beanstalk = PyBeanstalk("Crawler-Downloader1:Crawler-Downloader2", 11300)
    self.output_tube = 'download_rsp'
    self.output_queue = queue.Queue(maxsize=1000)
    self.mongo_client = pymongo.MongoClient('Crawler-DataServer1', 40042)
def thrput_task():
    """Push 100 rounds of generated requests onto 'download_req' and time it.

    For every key of ``pro`` a request is built with ``getreq``, serialized
    with ``to_string`` and put on the tube. Per-request failures are printed
    and skipped. The elapsed time is printed at the end and the transport
    obtained from ``getclient`` is always closed.
    """
    input_tube = 'download_req'
    beanstalk = PyBeanstalk('101.201.102.37', 11300)
    _client, transport = getclient()
    start = time.time()
    try:
        for _ in range(100):
            for proa in pro.keys():
                try:
                    req = getreq(proa=proa)
                    beanstalk.put(input_tube, to_string(req))
                except Exception as e:
                    # Single-argument print() behaves the same on Python 2
                    # and 3; `e.message` is deprecated and often empty.
                    print(e)
        print('usetime:{}'.format(time.time() - start))
    finally:
        # Release the transport even if something unexpected escapes.
        closetransport(transport)
def __init__(self, conf, processor, proc_name= None):
    """Open the beanstalk connections and create the processing pool.

    Args:
        conf: mapping providing 'beanstalk_conf' ('host', 'port',
            'input_tube', 'output_tube'), 'log' and 'server' (optionally
            holding 'process_thread_num', default 1).
        processor: object that handles tasks; required.
        proc_name: accepted but not used here.

    Raises:
        Exception: if ``processor`` is None.
    """
    threading.Thread.__init__(self)
    self.daemon = True
    self.running = True

    queue_conf = conf['beanstalk_conf']
    self.beanstalk = PyBeanstalk(queue_conf['host'], queue_conf['port'])
    self.out_beanstalk = PyBeanstalk(queue_conf['host'], queue_conf['port'])
    self.input_tube = queue_conf['input_tube']
    self.output_tube = queue_conf['output_tube']

    self.log = conf['log'] or LogHandler("i_input_thread")

    self.processor = processor
    if processor is None:
        self.log.error("Processor not given !")
        raise Exception("Processor not given !")

    thread_num = conf['server'].get("process_thread_num", 1)
    # The first argument is passed as configured; only the third is
    # coerced to int.
    self.processor_pool = ThreadPool(thread_num, {}, int(thread_num))
    self.wlock = threading.Lock()
class InputThread(threading.Thread):
    """Daemon thread that reserves beanstalk jobs and hands them to a pool."""

    def __init__(self, beanstalk_conf, log=None, process_pool=None):
        """Open the beanstalk connections and remember tubes and pool.

        Args:
            beanstalk_conf: mapping providing 'host', 'port', 'input_tube'
                and 'output_tube'; must not be None.
            log: logger; must not be None.
            process_pool: pool exposing queue_task() and get_task_num();
                must not be None.

        Raises:
            AssertionError: if any argument is None.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True
        assert beanstalk_conf is not None
        assert log is not None
        assert process_pool is not None
        self.beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.input_tube = beanstalk_conf['input_tube']
        self.output_tube = beanstalk_conf['output_tube']
        self.log = log
        if not self.log:
            self.log = LogHandler("i_input_thread")
        self.process_pool = process_pool
        self.t_lock = threading.Lock()

    def stop(self):
        """Stop reserving jobs, wait for the pool to drain, close the input.

        Polls the pool every 5 seconds until it reports no pending tasks.
        Failures are logged, not raised.
        """
        self.log.warning("stop input_thread")
        self.running = False
        try:
            while self.process_pool.get_task_num() > 0:
                self.log.info("wait tasks be consumed over, wait 5s")
                time.sleep(5)
            # Close the connection so that no more data is accepted.
            self.beanstalk.__del__()
        except Exception as e:
            self.log.error("stop input_thread fail")
            self.log.exception(e)

    def run(self):
        """Reserve jobs from the input tube and queue them on the pool.

        Each job is deleted from beanstalk before it is processed. When the
        pool holds 50 or more tasks the loop pauses for 2 seconds. On a
        socket error both connections are re-established after a 30 second
        wait.
        """
        while self.running and self.input_tube:
            try:
                job = self.beanstalk.reserve(self.input_tube, 30)
                if job is not None:
                    body = job.body
                    job.delete()
                    self.process_pool.queue_task(self.__on_task_start, (body,),
                                                 self.__on_task_finished)
                    task_num = self.process_pool.get_task_num()
                    if task_num >= 50:
                        # Back off while the pool is saturated.
                        self.log.info("place_processor\ttasks:%d" % task_num)
                        time.sleep(2)
                else:
                    self.log.info("not msg from:%s" % self.input_tube)
            except SocketError as e:
                time.sleep(30)
                self.log.error('beanstalk\tconnect\tfail\tstart\treconnect')
                self.log.exception(e)
                try:
                    self.beanstalk.reconnect()
                    self.out_beanstalk.reconnect()
                    self.log.error('beanstalk\treconnect\tsuccess')
                except Exception as reconnect_err:
                    self.log.error('beanstalk\treconnect\tfail')
                    self.log.exception(reconnect_err)
            except Exception as e:
                self.log.error("not msg from:%s\tresult:" % self.input_tube)
                self.log.exception(e)

    @staticmethod
    def __on_task_start(task, **thread_locals):
        """Run ``task`` through the thread-local processor.

        Starts the thread-local 'profiler' when one is present. Returns the
        processor's result, or None when no 'processor' is supplied.
        """
        result = None
        if 'profiler' in thread_locals:
            thread_locals['profiler'].begin()
        if 'processor' in thread_locals:
            result = thread_locals['processor'].do_task(task)
        return result

    def __on_task_finished(self, result, **thread_locals):
        """Emit the task result: a non-empty string or a list of messages.

        The lock is held through a ``with`` block so it is released even
        when emitting a message raises. The previous acquire()/release()
        pair left it locked forever in that case.
        """
        with self.t_lock:
            proccesor = thread_locals.get('processor')
            if 'profiler' in thread_locals:
                thread_locals['profiler'].end()
            if result and isinstance(result, basestring):
                self.__output_msg(result, proccesor)
            elif isinstance(result, list):
                for message in result:
                    self.__output_msg(message, proccesor)
class CrawlSelector(threading.Thread):
    """Daemon thread that pulls requests from a scheduler onto beanstalk."""

    def __init__(self, log, selector_conf, beanstalk_conf, scheduler=None):
        """Set up the selector state and its outgoing beanstalk connection.

        Args:
            log: logger stored on the instance.
            selector_conf: selector settings; run() reads
                'select_seed_sleep_time' from it.
            beanstalk_conf: mapping providing 'host', 'port', 'output_tube'.
            scheduler: optional object whose dispatch() returns requests.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = False
        self.log = log
        # Download statistics.
        self.site_static = {}
        self.scheduler = scheduler
        self.download_req_num = 0
        # Downloader bookkeeping; per-downloader client setup is disabled.
        self.downloader_num = 0
        # Selector settings.
        self.selector_conf = selector_conf
        # Beanstalk queue settings.
        self.beanstalk_conf = beanstalk_conf
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.output_tube = beanstalk_conf['output_tube']
        self.wlock = threading.Lock()

    def req_to_string(self, req):
        """Return ``req`` serialized to a string, or "" if that fails."""
        str_req = ""
        try:
            tMemory_b = TMemoryBuffer()
            tBinaryProtocol_b = TBinaryProtocol(tMemory_b)
            req.write(tBinaryProtocol_b)
            str_req = tMemory_b.getvalue()
        except Exception:
            self.log.error('crawled_failt\terror:%s' % (traceback.format_exc()))
        return str_req

    def run(self):
        """Dispatch requests from the scheduler until ``running`` is cleared.

        Each dispatched request is serialized and put on the output tube,
        then the loop sleeps for 'select_seed_sleep_time'. On a socket error
        the connection is re-established after a 30 second wait.
        """
        self.running = True
        while self.running:
            url = None
            try:
                reqs = self.scheduler.dispatch() if self.scheduler else None
                for req in reqs or ():
                    # Remember the url so a failure below is attributable.
                    url = getattr(req, 'url', None)
                    req_str = self.req_to_string(req)
                    if not req_str:
                        # Serialization failed (already logged); do not
                        # enqueue an empty payload.
                        continue
                    self.out_beanstalk.put(self.output_tube, req_str)
                    self.log.info(
                        'start_crawl\turl:%s\tdownload_type:%s\tsession:%s' %
                        (req.url, req.download_type, req.session_commit))
                time.sleep(self.selector_conf['select_seed_sleep_time'])
            except SocketError:
                time.sleep(30)
                self.log.error('beanstalk\tconnect\tfail\tstart\treconnect')
                try:
                    self.out_beanstalk.reconnect()
                    self.log.error('beanstalk\treconnect\tsuccess')
                except Exception:
                    self.log.error('beanstalk\treconnect\tfail')
            except Exception:
                self.log.error('crawled_failt\turl:%s\terror:%s' %
                               (url, traceback.format_exc()))
#!/usr/bin/Python
# coding=utf-8
import sys
from thrift.protocol.TBinaryProtocol import TBinaryProtocol
from thrift.transport.TTransport import TMemoryBuffer
sys.path.append('..')
from bdp.i_crawler.i_downloader.ttypes import DownLoadRsp
from bdp.i_crawler.i_extractor.ttypes import PageParseInfo
from i_util.pybeanstalk import PyBeanstalk

if __name__ == '__main__':
    # Debug helper: reserve one job from the 'extract_info' tube, decode it
    # as a PageParseInfo and print it. The job is not deleted.
    pybeanstalk = PyBeanstalk('101.201.102.37')
    try:
        extractor_info = PageParseInfo()
        job = pybeanstalk.reserve('extract_info')
        if job is None:
            # Guard against an empty reserve instead of failing on `.body`.
            print('no job reserved from extract_info')
        else:
            tMemory_o = TMemoryBuffer(job.body)
            tBinaryProtocol_o = TBinaryProtocol(tMemory_o)
            extractor_info.read(tBinaryProtocol_o)
            # Single-argument print() behaves the same on Python 2 and 3.
            print(extractor_info)
    except EOFError as e:
        print(e)
class SelectProcessor(object):
    """Reads stored web pages from MongoDB and republishes them as DownLoadRsp."""

    def __init__(self, conf):
        """Connect to the web-page MongoDB and to beanstalk.

        Connection failures are logged, not raised; the client attributes
        are then left as None.

        Args:
            conf: mapping providing 'log', 'beanstalk_conf' ('host', 'port',
                'output_tube') and 'webpage_db' ('host', 'port', 'db',
                'username', 'password').
        """
        self.log = conf['log']
        self.conf = conf
        self.beanstalk_conf = conf['beanstalk_conf']
        # Defined up front so a failed connection leaves None, not a
        # missing attribute.
        self.mongo_client_web = None
        self.beanstalk_client = None
        try:
            web_db = self.conf['webpage_db']
            self.mongo_client_web = PyMongo(
                web_db['host'], web_db['port'], web_db['db'],
                web_db['username'], web_db['password'])
            self.beanstalk_client = PyBeanstalk(self.beanstalk_conf['host'],
                                                self.beanstalk_conf['port'])
        except Exception:
            self.log.error(traceback.format_exc())

    def get_download_rsp(self, result):
        """Build a successful DownLoadRsp from a stored page document.

        Args:
            result: mapping with 'url' and 'content', and optionally
                'content_type' (default 'text/html').
        """
        url = result['url']
        content = result['content'].encode('utf-8')
        content_type = result.get('content_type', 'text/html')
        page_size = len(content)
        return DownLoadRsp(url=url,
                           download_time=int(time.time()),
                           status=0,
                           content_type=content_type,
                           page_size=page_size,
                           elapsed=100,
                           content=content,
                           redirect_url=url,
                           src_type='webpage',
                           http_code=200)

    # Batch query by url_format; the caller sends the results to the queue.
    def select_webpage(self, site, url_format, limit, start, extra_filter):
        """Return a cursor over stored pages of ``site``, or None.

        The collection is the first collection name that ``site`` ends
        with. None is returned when no collection matches or the query
        fails; failures are logged.
        """
        try:
            collection_names = self.mongo_client_web.get_collection_names()
            # TODO: i_util should provide a function that computes the
            # main domain.
            domain = ""
            for collection_name in collection_names:
                # A suffix match on "." + name implies a suffix match on
                # the name itself, so one test is enough.
                if site.endswith(collection_name):
                    domain = collection_name
                    break
            if domain:
                return self.mongo_client_web.select_by_url_format(
                    domain, site, url_format, limit, start, extra_filter)
        except Exception:
            self.log.error(
                "select_webpage\tsite:{0}\turl_format\t{1}\terror:{2}".format(
                    site, url_format, traceback.format_exc()))
        self.log.info(
            "select_webpage\tfinish\tsite:{0}\turl_format:{1}".format(
                site, url_format))
        return None

    def select_webpage_to_mq(self, condition):
        """Query pages matching ``condition`` and put them on the output tube.

        Args:
            condition: mapping with optional 'url_format', 'site', 'limit'
                (default -1), 'start' (default 0) and 'extra_filter'
                (default '{}'). Nothing is queried when 'site' is empty.
        """
        url_format = condition.get('url_format', "")
        site = condition.get('site', "")
        limit = int(condition.get('limit', -1))
        start = int(condition.get('start', 0))
        extra_filter = condition.get('extra_filter', '{}')
        self.log.info(
            "select_webpage_mq\tstart\tsite:{0}\turl_format:{1}".format(
                site, url_format))
        req_num = 0
        all_num = start
        if site:
            item_cursor = self.select_webpage(site, url_format, limit, start,
                                              extra_filter)
            if item_cursor:
                download_time = ""
                for item in item_cursor:
                    download_time = item.get("download_time", "")
                    all_num += 1
                    if item.get('content'):
                        download_rsp = self.get_download_rsp(item)
                        download_str = self.to_string(download_rsp)
                        # Only enqueue pages that serialized successfully.
                        if download_str is not None:
                            req_num += 1
                            self.beanstalk_client.put(
                                self.beanstalk_conf['output_tube'],
                                download_str)
                    if all_num % 100 == 1:
                        self.log.info(
                            "select_webpage_mq\trunning\tsite:{0}\turl_format:{1}\tall_num:{2}\treq_num:{3}\tdownload_time:{4}"
                            .format(site, url_format, all_num, req_num,
                                    download_time))
        self.log.info(
            "select_webpage_mq\tfinish\tsite:{0}\turl_format:{1}\treq_num:{2}".
            format(site, url_format, req_num))

    def select_webpage_to_list(self, condition):
        """Placeholder; always returns None."""
        return None

    # Query a single page by url; the caller sends it to the queue.
    def select_webpage_by_url(self, url):
        """Return a DownLoadRsp for ``url``.

        The response carries the stored content when a document with
        content is found; otherwise it is a status=1 response with no
        content. Lookup failures are logged.
        """
        self.log.info("select_webpage_by_url start\turl:{}".format(url))
        url = url_encode(url)
        download_result = DownLoadRsp(url=url,
                                      download_time=int(time.time()),
                                      status=1,
                                      content_type='text/html',
                                      page_size=0,
                                      elapsed=100,
                                      content=None,
                                      redirect_url=url,
                                      src_type='webpage',
                                      http_code=0)
        try:
            query_item = {'url': url}
            domain = get_url_info(url).get('domain')
            result = self.mongo_client_web.find_first(domain, query_item)
            if result and (result.get('content')):
                download_result = self.get_download_rsp(result)
        except Exception:
            self.log.error("select_webpage_by_url\turl\t{0}\terror:{1}".format(
                url, traceback.format_exc()))
        self.log.info("select_webpage_by_url finish\turl:{}".format(url))
        return download_result

    def to_string(self, link_info):
        """Return ``link_info`` serialized to a string, or None on EOFError."""
        str_entity = None
        try:
            tMemory_b = TMemoryBuffer()
            tBinaryProtocol_b = TBinaryProtocol.TBinaryProtocol(tMemory_b)
            link_info.write(tBinaryProtocol_b)
            str_entity = tMemory_b.getvalue()
        except EOFError:
            self.log.warning("can't write LinkAttr to string")
        return str_entity
class InputThread(threading.Thread):
    """Daemon thread that reserves beanstalk jobs and processes them in a pool."""

    def __init__(self, conf, processor, proc_name= None):
        """Open the beanstalk connections and create the processing pool.

        Args:
            conf: mapping providing 'beanstalk_conf' ('host', 'port',
                'input_tube', 'output_tube'), 'log' and 'server' (optionally
                holding 'process_thread_num', default 1).
            processor: object whose do_task() handles a job body; required.
            proc_name: accepted but not used here.

        Raises:
            Exception: if ``processor`` is None.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True
        self.beanstalk = PyBeanstalk(conf['beanstalk_conf']['host'], conf['beanstalk_conf']['port'])
        self.out_beanstalk = PyBeanstalk(conf['beanstalk_conf']['host'], conf['beanstalk_conf']['port'])
        self.input_tube = conf['beanstalk_conf']['input_tube']
        self.output_tube = conf['beanstalk_conf']['output_tube']
        self.log = conf['log']
        if not self.log:
            self.log = LogHandler("i_input_thread")
        self.processor = processor
        if self.processor is None:
            self.log.error("Processor not given !")
            raise Exception("Processor not given !")
        self.processor_pool = ThreadPool(conf['server'].get("process_thread_num", 1),
                                         {},
                                         int(conf['server'].get("process_thread_num", 1)))
        self.wlock = threading.Lock()

    def stop(self):
        """Stop reserving jobs and wait for the pool via join_all()."""
        self.log.warning("stop input_thread")
        self.running = False
        self.processor_pool.join_all()

    def run(self):
        """Reserve jobs from the input tube and queue them on the pool.

        Each job is deleted from beanstalk before it is processed. On a
        socket error both connections are re-established after a 30 second
        wait.
        """
        self.running = True
        while self.running and self.input_tube:
            try:
                job = self.beanstalk.reserve(self.input_tube, 3)
                if job is not None:
                    body = job.body
                    job.delete()
                    self.processor_pool.queue_task(self._on_task_start, (body,),
                                                   self._on_task_finished)
            except SocketError:
                time.sleep(30)
                self.log.error('beanstalk\tconnect\tfail\tstart\treconnect')
                try:
                    self.beanstalk.reconnect()
                    self.out_beanstalk.reconnect()
                    self.log.error('beanstalk\treconnect\tsuccess')
                except Exception:
                    self.log.error('beanstalk\treconnect\tfail')
            except Exception:
                # `except Exception` lets KeyboardInterrupt/SystemExit
                # propagate, which the previous bare `except:` swallowed.
                self.log.error("not msg from:%s\tresult:%s" %
                               (self.input_tube, str(traceback.format_exc())))

    def _on_task_start(self, task, **thread_locals):
        """Run ``task`` through the processor.

        Returns the processor's result, or None when it raises; the error
        is logged.
        """
        result = None
        try:
            result = self.processor.do_task(task)
        except Exception as e:
            # str(e) rather than `e.message`, which is deprecated, often
            # empty, and missing on Python 3.
            self.log.error(str(e))
        return result

    def _on_task_finished(self, task, **thread_locals):
        """Emit the task result: a non-empty string or a list of messages.

        The lock is held through a ``with`` block so it is released even
        when emitting a message raises.
        """
        with self.wlock:
            if task and isinstance(task, basestring):
                self._output_msg(task)
            elif isinstance(task, list):
                for message in task:
                    self._output_msg(message)
#!/usr/bin/Python # coding=utf-8 import sys from thrift.protocol.TBinaryProtocol import TBinaryProtocol from thrift.transport.TTransport import TMemoryBuffer sys.path.append('..') from bdp.i_crawler.i_crawler_merge.ttypes import LinkAttr from bdp.i_crawler.i_downloader.ttypes import DownLoadReq from bdp.i_crawler.i_extractor.ttypes import PageParseInfo from i_util.pybeanstalk import PyBeanstalk if __name__ == '__main__': pybeanstalk = PyBeanstalk('10.25.114.50') try: #link_info = DownLoadReq(); info = PageParseInfo() while True: #print pybeanstalk.stats_tube('download_req') job = pybeanstalk.reserve('online_extract_info') """ tMemory_o = TMemoryBuffer(job.body) tBinaryProtocol_o = TBinaryProtocol(tMemory_o) info.read(tBinaryProtocol_o) d = vars(info) for k,v in d.items(): print k,v """ job.delete() #break;
class InputThreadNew(threading.Thread):
    """Thread that reserves beanstalk jobs, processes them and emits results."""

    def __init__(self, conf, processor=None, proc_name=None):
        """Validate the processor, open connections, build the tube table.

        Args:
            conf: mapping whose 'beanstalk_conf' provides 'input_tube',
                'host', 'port' and 'output_tube' (a tube name or a list of
                tube definitions, see _parse_output_tubes()).
            processor: object whose do_task() handles a job body; required.
            proc_name: label kept for logging only.

        Raises:
            Exception: if ``processor`` is None.
        """
        threading.Thread.__init__(self)
        # `log` is the module-level logger, not a parameter.
        self.log = log
        # Validate before opening any connection so a misconfigured caller
        # does not leave sockets behind.
        if processor is None:
            log.error("Processor not given !")
            raise Exception("Processor not given !")
        self.processor = processor
        self.running = True
        self.proc_name = proc_name  # Only for logging
        self.input_tube = conf['beanstalk_conf']['input_tube']
        self.beanstalk = PyBeanstalk(conf['beanstalk_conf']['host'],
                                     conf['beanstalk_conf']['port'])
        self.out_beanstalk = PyBeanstalk(conf['beanstalk_conf']['host'],
                                         conf['beanstalk_conf']['port'])
        self.output_tube = conf['beanstalk_conf']['output_tube']
        self.topic_output_tubes = self._parse_output_tubes(self.output_tube)

    @staticmethod
    def _parse_output_tubes(output_tube):
        """Build the topic -> tubes table from the configured output tube(s).

        A definition has the form "name[:id,id,...[:exclusive]]", e.g.
            ["default_out", "only_special_out:1,2,3:exclusive",
             "special_out:4", ":5:exclusive"]
        According to the original note: topics 1, 2 and 3 only use
        only_special_out; topic 4 goes to special_out and default_out;
        topic 5 is not queued at all; every other topic uses default_out.

        Returns:
            dict mapping 'default' to a list of tube names and each topic
            id (int) to a list of (tube_name, exclusive) tuples.

        Raises:
            ValueError: if a topic id is not an integer.
        """
        routes = {'default': []}
        if not isinstance(output_tube, list):
            routes['default'].append(output_tube)
            return routes
        for tube_def in output_tube:
            elements = [part.strip() for part in tube_def.strip().split(':')]
            if len(elements) < 2:
                routes['default'].append(elements[0])
                continue
            tube_name = elements[0]
            topic_ids = [int(topic.strip()) for topic in elements[1].split(',')]
            exclusive = len(elements) == 3 and elements[2] == 'exclusive'
            for topic_id in topic_ids:
                routes.setdefault(topic_id, []).append((tube_name, exclusive))
        return routes

    def stop(self):
        """Ask the run loop to exit after its current iteration."""
        self.log.warning("stop input thread")
        self.running = False

    def run(self):
        """Reserve jobs, process them and pass non-None results to output_msg().

        Each job is deleted from beanstalk before it is processed. Processors
        named 'ExtractorProccessor' or 'SingleSrcMergerProccessor' return a
        (response, topic_id) pair; all others return only the response. On a
        socket error both connections are re-established after a 30 second
        wait.
        """
        self.log.debug("starting input thread")
        while self.running and self.input_tube:
            try:
                job = self.beanstalk.reserve(self.input_tube, 3)
                if job:
                    body = job.body
                    resp = None
                    job.delete()
                    if self.processor is not None:
                        topic_id = None
                        try:
                            if type(self.processor).__name__ in (
                                    'ExtractorProccessor',
                                    'SingleSrcMergerProccessor'):
                                resp, topic_id = self.processor.do_task(body)
                            else:
                                resp = self.processor.do_task(body)
                        except Exception:
                            self.log.error("Process failed. " + traceback.format_exc())
                        if resp is not None:
                            self.output_msg(resp, topic_id)
                else:
                    self.log.debug(current_process().name +
                                   " : no msg from : %s" % (self.input_tube))
            except SocketError:
                time.sleep(30)
                self.log.error('beanstalk\tconnect\tfail\tstart\treconnect')
                try:
                    self.beanstalk.reconnect()
                    self.out_beanstalk.reconnect()
                    self.log.error('beanstalk\treconnect\tsuccess')
                except Exception:
                    self.log.error('beanstalk\treconnect\tfail')
import os import json import traceback import time from thrift.protocol.TBinaryProtocol import TBinaryProtocol from thrift.transport.TTransport import TMemoryBuffer sys.path.append('..') from i_util.pybeanstalk import PyBeanstalk from bdp.i_crawler.i_downloader.ttypes import DownLoadReq from i_util.logs import LogHandler import config log = LogHandler('re_crawler', console_out=True) beanstalk_conf = config.beanstalk_conf beanstalk_client = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port']) def req_to_string(req): str_req = "" try: tMemory_b = TMemoryBuffer() tBinaryProtocol_b = TBinaryProtocol(tMemory_b) req.write(tBinaryProtocol_b) str_req = tMemory_b.getvalue() except: log.error('crawled_failt\terror:%s' % (traceback.format_exc())) return str_req def create_download_req(url,
class DownloaderProccessor(NormalProccessor):
    """Fetches a company page through the smart proxy and builds a DownLoadRsp."""

    def __init__(self, log, conf):
        """Store the logger and configuration and open the scheduler queue.

        Args:
            log: logger; must not be None.
            conf: dict providing 'type_extractor_map', 'smart_proxy_url' and
                a 'beanstalk_conf' mapping with 'host', 'port' and
                'output_tube_scheduler'.

        Raises:
            AssertionError: if ``log`` is None or ``conf`` is not a dict.
        """
        self.log = log
        self.conf = conf
        assert log is not None
        assert isinstance(conf, dict)
        self.type_extractor_map = self.conf['type_extractor_map']
        self.smart_proxy_url = self.conf['smart_proxy_url']
        self.out_beanstalk = PyBeanstalk(self.conf['beanstalk_conf']['host'],
                                         self.conf['beanstalk_conf']['port'])
        self.output_tube_scheduler = self.conf['beanstalk_conf']['output_tube_scheduler']

    def to_string(self, download_rsp):
        """Return ``download_rsp`` serialized to a string, or None on EOFError."""
        str_rsq = None
        try:
            t_memory_b = TMemoryBuffer()
            t_binary_protocol_b = TBinaryProtocol(t_memory_b)
            download_rsp.write(t_binary_protocol_b)
            str_rsq = t_memory_b.getvalue()
        except EOFError:
            self.log.warning("cann't write PageParseInfo to string")
        return str_rsq

    def do_task(self, body):
        """Handle one JSON download request and return a serialized DownLoadRsp.

        The request must carry '_type', 'name' and 'province'. The page is
        requested through the smart proxy; on success the crawl status is
        also put on the scheduler tube. Any failure, including a non-200
        reply, produces a serialized CRAWL_FAILT response rather than an
        exception.

        Args:
            body: JSON text of the download request.
        """
        try:
            download_req = json.loads(body)
            self.log.info("request_msg\t%s" % download_req)
            target_extractor_id = self.type_extractor_map[download_req['_type']]
            name = download_req['name'].encode('utf-8')
            target_url = 'http://%(site)s/gongshang_search?%(query)s' % {
                'site': prov_site_map[download_req['province']],
                'query': urllib.urlencode({
                    'name': name,
                    'original_query': json.dumps(download_req)
                })
            }
            self.log.info('请求代理企业名称: name = {name}'.format(name=name))
            # NOTE(review): no timeout is passed, so a stalled proxy blocks
            # this worker indefinitely -- consider making one configurable.
            response = requests.get(target_url, proxies={'http': self.smart_proxy_url})
            if response.status_code != 200:
                download_rsp = DownLoadRsp(status=CrawlStatus.CRAWL_FAILT, )
                return self.to_string(download_rsp)
            self.log.debug(response.text)
            resp_json = response.json()
            url = resp_json['url']
            # Assemble the DownLoadRsp fields.
            resp = dict()
            resp['url'] = str_obj(url)
            resp['download_time'] = resp_json.get('entitySrcDownloadTime', 0)
            resp['pages'] = []
            resp['content'] = str_obj(resp_json['html'])
            if resp['content'] is None:
                resp['content'] = '<html></html>'
            resp['data_extends'] = str_obj(json.dumps(resp_json['entity']))
            resp['parse_extends'] = str_obj(json.dumps({"parser_id": target_extractor_id}))
            resp['page_size'] = len(resp['content'])
            resp['content_type'] = 'text/html'
            resp['src_type'] = 'webpage'
            resp['http_code'] = response.status_code
            # total_seconds() covers the whole duration; `.microseconds` is
            # only the sub-second component, so it under-reported any
            # request that took longer than a second.
            resp['elapsed'] = int(response.elapsed.total_seconds() * 1000)
            resp['status'] = CrawlStatus.CRAWL_SUCCESS
            download_rsp = DownLoadRsp(**resp)
            self.log.info('发送到解析器的 name = {name} url = {url}'.format(name=name, url=resp['url']))
            # Report the crawl status to the gongshang scheduler tube.
            company_name = resp_json['entity'].get('company')
            self.out_beanstalk.put(self.output_tube_scheduler, json.dumps({
                'company': company_name,
                'crawl_online': resp_json['crawlStatus'].get('crawl_online'),
                'crawl_online_time': resp_json['crawlStatus'].get('crawl_online_time'),
                'query': resp_json['crawlSeed'],
            }))
            # A missing company must not turn an already-delivered result
            # into a failure response, so fall back to an empty name here.
            self.log.info('发送企业名称到工商调度消息队列: comapny = {company}'.format(
                company=(company_name or u'').encode('utf-8')))
            return self.to_string(download_rsp)
        except Exception as err:
            self.log.error("process failed, err[%s]" % (repr(err)))
            self.log.exception(err)
            download_rsp = DownLoadRsp(status=CrawlStatus.CRAWL_FAILT, )
            return self.to_string(download_rsp)

    def do_output(self, body):
        """Always report success; ``body`` is ignored."""
        return True
#!/usr/bin/Python
# coding=utf-8
import sys
from thrift.protocol.TBinaryProtocol import TBinaryProtocol
from thrift.transport.TTransport import TMemoryBuffer
sys.path.append('..')
from bdp.i_crawler.i_extractor.ttypes import PageParseInfo
from i_util.pybeanstalk import PyBeanstalk

if __name__ == '__main__':
    # Debug helper: reserve one job from 'extract_info_ws', decode it as a
    # PageParseInfo, dump its fields and delete the job.
    pybeanstalk = PyBeanstalk('101.201.100.58')
    try:
        rsp_info = PageParseInfo()
        job = pybeanstalk.reserve('extract_info_ws')
        if job:
            tMemory_o = TMemoryBuffer(job.body)
            tBinaryProtocol_o = TBinaryProtocol(tMemory_o)
            rsp_info.read(tBinaryProtocol_o)
            d = vars(rsp_info)
            # Single-argument print() behaves the same on Python 2 and 3.
            print(d)
            for k, v in d.items():
                print('%s %s' % (k, v))
            job.delete()
    except EOFError as e:
        print(e)