def __init__(self, beanstalk_conf, log=None, process_pool=None):
    """Initialise the thread with its beanstalk connections, tubes, logger and pool.

    :param beanstalk_conf: mapping read for the keys ``host``, ``port``,
        ``input_tube`` and ``output_tube``.
    :param log: logger to use; when falsy a ``LogHandler("i_input_thread")``
        is created instead.
    :param process_pool: stored as ``self.process_pool`` without further checks.
    """
    threading.Thread.__init__(self)
    self.daemon = True
    self.running = True

    # Two separate clients are opened against the same host/port.
    host, port = beanstalk_conf['host'], beanstalk_conf['port']
    self.beanstalk = PyBeanstalk(host, port)
    self.out_beanstalk = PyBeanstalk(host, port)

    self.input_tube = beanstalk_conf['input_tube']
    self.output_tube = beanstalk_conf['output_tube']

    self.log = log or LogHandler("i_input_thread")
    self.process_pool = process_pool
    self.wlock = threading.Lock()
class InputThread(threading.Thread):
    """Daemon thread that owns the beanstalk connections feeding a process pool."""

    def __init__(self, beanstalk_conf, log=None, process_pool=None):
        """Open the beanstalk connections and remember tubes, logger and pool.

        :param beanstalk_conf: mapping read for the keys ``host``, ``port``,
            ``input_tube`` and ``output_tube``.
        :param log: logger to use; when falsy a ``LogHandler("i_input_thread")``
            is created instead.
        :param process_pool: pool object; ``stop`` calls its ``get_task_num()``
            and reads its ``thread_local_constructors`` mapping.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True
        self.beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.input_tube = beanstalk_conf['input_tube']
        self.output_tube = beanstalk_conf['output_tube']
        self.log = log
        if not self.log:
            self.log = LogHandler("i_input_thread")
        self.process_pool = process_pool
        self.wlock = threading.Lock()

    def stop(self):
        """Clear the running flag, wait for the pool to drain, then close the input connection.

        Polls ``process_pool.get_task_num()`` every 5 seconds until it is 0.
        Once drained, if the pool has a ``'processor'`` entry in
        ``thread_local_constructors``, that processor's ``save_status()`` is
        called. Any exception raised along the way is logged, not propagated.
        """
        self.log.warning("stop input_thread")
        self.running = False
        try:
            while True:
                if self.process_pool.get_task_num() == 0:
                    constructors = self.process_pool.thread_local_constructors
                    if 'processor' in constructors:
                        processor = constructors['processor'][1][1]
                        self.log.warning("prepare call scheduler_processor to stop scheduler")
                        processor.save_status()
                    break
                else:
                    self.log.info("wait tasks be consumed over, wait 5s")
                    time.sleep(5)
            self.beanstalk.__del__()  # close the connection so no more data is accepted
        except Exception as e:
            # `e.message` is kept where it exists (Python 2); the exception
            # itself is formatted on interpreters that dropped the attribute.
            self.log.error("stop input_thread fail:%s" % getattr(e, 'message', e))
def __init__(self, conf, processor, proc_name=None):
    """Initialise the thread from a combined configuration mapping.

    :param conf: mapping read for ``beanstalk_conf`` (``host``, ``port``,
        ``input_tube``, ``output_tube``), ``log`` and ``server``
        (optional ``process_thread_num``, default 1).
    :param processor: object stored as ``self.processor``; must not be None.
    :param proc_name: accepted but not used by this method.
    :raises Exception: when ``processor`` is None.
    """
    threading.Thread.__init__(self)
    self.daemon = True
    self.running = True

    # Two separate clients are opened against the same host/port.
    queue_conf = conf['beanstalk_conf']
    self.beanstalk = PyBeanstalk(queue_conf['host'], queue_conf['port'])
    self.out_beanstalk = PyBeanstalk(queue_conf['host'], queue_conf['port'])
    self.input_tube = queue_conf['input_tube']
    self.output_tube = queue_conf['output_tube']

    # Fall back to a default logger when the configured one is falsy.
    self.log = conf['log'] or LogHandler("i_input_thread")

    self.processor = processor
    if processor is None:
        self.log.error("Processor not given !")
        raise Exception("Processor not given !")

    # The raw setting is passed first and its int() form last, as before.
    thread_num = conf['server'].get("process_thread_num", 1)
    self.processor_pool = ThreadPool(thread_num, {}, int(thread_num))
    self.wlock = threading.Lock()
def init_log(self, conf, level=logging.DEBUG, console_out=False, name=None):
    """Create ``self.log`` as a ``LogHandler`` with the requested level.

    :param conf: mapping; ``conf['server']['name']`` is the logger name when
        ``name`` is None.
    :param level: a ``logging`` level constant, or one of the strings
        ``debug`` / ``info`` / ``warning`` / ``error`` (case-insensitive).
        Any other string falls back to ``logging.DEBUG``.
    :param console_out: forwarded to ``LogHandler`` as ``console_out``.
    :param name: explicit logger name; overrides the name from ``conf``.
    """
    if isinstance(level, basestring):
        levels_by_name = {
            "debug": logging.DEBUG,
            "info": logging.INFO,
            "warning": logging.WARNING,
            "error": logging.ERROR,
        }
        level = levels_by_name.get(level.lower(), logging.DEBUG)

    logger_name = conf['server']['name'] if name is None else name
    self.log = LogHandler(logger_name, level, console_out=console_out)
selector = CrawlSelector(conf['log'], conf['selector_conf'], conf['beanstalk_conf'], None) selector.start() while True: time.sleep(20) selector.stop() if __name__ == '__main__': try: file_path = './scheduler.toml' opt, args = getopt.getopt(sys.argv[1:], 'f:', ['help']) for name, value in opt: if name == "-f": file_path = value elif name in ("-h", "--help"): usage() sys.exit() else: assert False, "unhandled option" with open(file_path, 'rb') as config: conf = pytoml.load(config) conf['log'] = LogHandler(conf['server']['name'] + str(conf['server']['port'])) main(conf) except getopt.GetoptError: sys.exit()
except Exception, e: self.log.error("report activate failed, because %s" % e.message) time.sleep(self.EXPIREDS) def stop(self): self.running = False if __name__ == "__main__": import logging from i_util.logs import LogHandler # log = LogHandler(config.logname, logging.DEBUG) log = LogHandler('test', logging.DEBUG) conf = { "backend": {"host": "101.201.102.37", "port": 6379, "password": "******"}, "server_name": "test", "server": {"host": "127.0.0.1", "port": 1001}, "log": log } obj = HeartbeatThread("extractor", conf=conf) obj.run()
# NOTE(review): the opening of this dict literal (its name and any earlier
# keys) lies outside the visible part of the file.
'port': 6379,
'database': 8,
'proxy_name': 'proxies',
'proxy_test_available': 24  # hours
}
# NOTE(review): presumably the address this service listens on, plus a debug
# switch - confirm against the code that reads SERVER.
SERVER = {'port': 8571, 'host': '0.0.0.0', 'debug': False}
# MongoDB connection settings (database 'task_collect').
MONGODB = {
    'host': '172.17.1.119',
    'port': 40042,
    'database': 'task_collect',
    'username': '******',
    'password': '******'
}
# Second set of connection settings on the same host/port, named 'app_data'.
# NOTE(review): uses the key 'name' where MONGODB uses 'database' - confirm
# that the consumer expects this.
final_data = {
    'host': '172.17.1.119',
    'port': 40042,
    'name': 'app_data',
    'username': '******',
    'password': '******'
}
# HTTP endpoint and query key for real-time crawling; how they are combined is
# not visible here.
realtime_crawl = {
    'api': 'http://172.18.180.225:9823/gsxt_info',
    'query': 'company'
}
from i_util.logs import LogHandler
# Module-level logger named "manager" at INFO level.
log = LogHandler("manager", loglevel=logging.INFO)
def main():
    """Entry point; creates the ``'select_webpage'`` logger.

    NOTE(review): only the logger construction is visible in this excerpt;
    the rest of the function body, if any, is outside the view.
    """
    logger = LogHandler('select_webpage')
reload(sys)
sys.setdefaultencoding("utf-8")
import os
import json
import traceback
import time
from thrift.protocol.TBinaryProtocol import TBinaryProtocol
from thrift.transport.TTransport import TMemoryBuffer
# The parent directory must be on sys.path before the project imports below.
sys.path.append('..')
from i_util.pybeanstalk import PyBeanstalk
from bdp.i_crawler.i_downloader.ttypes import DownLoadReq
from i_util.logs import LogHandler
import config

log = LogHandler('re_crawler', console_out=True)
beanstalk_conf = config.beanstalk_conf
beanstalk_client = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])


def req_to_string(req):
    """Serialise a thrift request object to a string with ``TBinaryProtocol``.

    :param req: object exposing ``write(protocol)``.
    :returns: the serialised bytes, or ``""`` when serialisation fails; the
        failure is logged with its traceback.
    """
    try:
        buf = TMemoryBuffer()
        req.write(TBinaryProtocol(buf))
        return buf.getvalue()
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt and SystemExit
        # are no longer swallowed and turned into an empty payload.
        log.error('crawled_failt\terror:%s' % (traceback.format_exc()))
        return ""
log.warning("cann't write DownLoadRsp to string") return str_page_info def put_beanstalked(beanstalk_conf, log, rsp): beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port']) tube = beanstalk_conf['input_tube'] str_page_info = to_string(log, rsp) try: beanstalk.put(tube, str_page_info) log.info('beanstalk\turl:%s\ttube:%s' % (rsp.url, tube)) except Exception as e: log.info('beanstalk put error url:' + rsp.url + '\ttube:' + tube) if __name__ == "__main__": url = 'dsfsdf' redirect_url = 'sdfdfsfsfs' status = 1 http_code = 200 rsp = DownLoadRsp(url=url, redirect_url=redirect_url, status=status, http_code=http_code) beanstalk_conf = {} beanstalk_conf['host'] = '101.201.102.37' beanstalk_conf['port'] = 11300 beanstalk_conf['input_tube'] = 'download_rsp_test' log = LogHandler('download_test') put_beanstalked(beanstalk_conf, log, rsp=rsp)
# -*- coding: utf-8 -*-
from conf import config
# logging is thread-safe
import logging
# logging.basicConfig(level = logging.DEBUG)
from i_util.logs import LogHandler
log = LogHandler(config.server['name']+str(config.server['port']), logging.INFO)
import MySQLdb


def get_mysqldb():
    """Open and return a new MySQLdb connection built from ``config.MYSQL``."""
    settings = config.MYSQL
    connection = MySQLdb.connect(
        host=settings['host'],
        port=settings['port'],
        user=settings['username'],
        passwd=settings['password'],
        db=settings['database'],
        charset='utf8'
    )
    return connection


def dbrecord_to_dict(descriptions, record):
    """Convert one MySQL tuple record into a dict.

    Each key is the first element of the matching entry in ``descriptions``
    and each value is the field at the same position in ``record``.
    """
    keys = (description[0] for description in descriptions)
    return dict(zip(keys, record))
class InputThread(threading.Thread):
    """Daemon thread that reserves jobs from a beanstalk tube and hands them to a thread pool."""

    def __init__(self, conf, processor, proc_name=None):
        """Open the beanstalk connections and build the worker pool.

        :param conf: mapping read for ``beanstalk_conf`` (``host``, ``port``,
            ``input_tube``, ``output_tube``), ``log`` and ``server``
            (optional ``process_thread_num``, default 1).
        :param processor: object whose ``do_task(task)`` is run for every job;
            must not be None.
        :param proc_name: accepted but not used by this method.
        :raises Exception: when ``processor`` is None.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True
        self.beanstalk = PyBeanstalk(conf['beanstalk_conf']['host'], conf['beanstalk_conf']['port'])
        self.out_beanstalk = PyBeanstalk(conf['beanstalk_conf']['host'], conf['beanstalk_conf']['port'])
        self.input_tube = conf['beanstalk_conf']['input_tube']
        self.output_tube = conf['beanstalk_conf']['output_tube']
        self.log = conf['log']
        if not self.log:
            self.log = LogHandler("i_input_thread")
        self.processor = processor
        if self.processor is None:
            self.log.error("Processor not given !")
            raise Exception("Processor not given !")
        self.processor_pool = ThreadPool(
            conf['server'].get("process_thread_num", 1),
            {},
            int(conf['server'].get("process_thread_num", 1)),
        )
        self.wlock = threading.Lock()

    def stop(self):
        """Clear the running flag and call ``join_all()`` on the pool."""
        self.log.warning("stop input_thread")
        self.running = False
        self.processor_pool.join_all()

    def run(self):
        """Reserve jobs from the input tube and queue them until stopped.

        Each reserved job is deleted from beanstalk before it is queued. On a
        ``SocketError`` the loop sleeps 30 seconds and then tries to reconnect
        both connections. Any other exception is logged and the loop continues.
        """
        job_num = 0
        self.running = True
        while self.running and self.input_tube:
            try:
                job = self.beanstalk.reserve(self.input_tube, 3)
                if job is not None:
                    job_num += 1
                    body = job.body
                    job.delete()
                    self.processor_pool.queue_task(self._on_task_start, (body,), self._on_task_finished)
            except SocketError as e:
                time.sleep(30)
                self.log.error('beanstalk\tconnect\tfail\tstart\treconnect')
                try:
                    self.beanstalk.reconnect()
                    self.out_beanstalk.reconnect()
                    self.log.error('beanstalk\treconnect\tsuccess')
                except Exception as e:
                    self.log.error('beanstalk\treconnect\tfail')
            except Exception:
                # Narrowed from a bare `except:` so interpreter-exit signals
                # are not swallowed by the polling loop.
                self.log.error("not msg from:%s\tresult:%s" % (self.input_tube, str(traceback.format_exc())))

    def _on_task_start(self, task, **thread_locals):
        """Run ``processor.do_task(task)``; return its result, or None if it raised."""
        try:
            return self.processor.do_task(task)
        except Exception as e:
            # `e.message` is kept where it exists (Python 2); the exception
            # itself is logged on interpreters that dropped the attribute.
            self.log.error(getattr(e, 'message', e))
            return None

    def _on_task_finished(self, task, **thread_locals):
        """Forward a finished task's result to ``_output_msg`` under ``wlock``.

        A non-empty string is sent as one message and a list is sent element
        by element; anything else is ignored. The lock is held through a
        ``with`` block so it is released even when ``_output_msg`` raises;
        previously an exception there left the lock held and blocked every
        later callback.
        """
        with self.wlock:
            if task and isinstance(task, basestring):
                self._output_msg(task)
            elif isinstance(task, list):
                for message in task:
                    self._output_msg(message)
class InputThread(threading.Thread):
    """Daemon thread that reserves jobs from a beanstalk tube and feeds a process pool."""

    def __init__(self, beanstalk_conf, log=None, process_pool=None):
        """Open the beanstalk connections and remember tubes, logger and pool.

        :param beanstalk_conf: mapping read for the keys ``host``, ``port``,
            ``input_tube`` and ``output_tube``.
        :param log: logger; asserted to be not None.
        :param process_pool: pool exposing ``queue_task`` and ``get_task_num``;
            asserted to be not None.
        :raises AssertionError: when any argument is None (unless asserts are
            disabled with ``-O``).
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True
        assert beanstalk_conf is not None
        assert log is not None
        assert process_pool is not None
        self.beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.input_tube = beanstalk_conf['input_tube']
        self.output_tube = beanstalk_conf['output_tube']
        self.log = log
        # Only reachable for a falsy-but-not-None logger, or when asserts are off.
        if not self.log:
            self.log = LogHandler("i_input_thread")
        self.process_pool = process_pool
        self.t_lock = threading.Lock()

    def stop(self):
        """Clear the running flag, wait for the pool to drain, then close the input connection.

        Polls ``process_pool.get_task_num()`` every 5 seconds until it is <= 0.
        Exceptions are logged, not propagated.
        """
        self.log.warning("stop input_thread")
        self.running = False
        try:
            while True:
                if self.process_pool.get_task_num() <= 0:
                    # NOTE: the processor.save_status() call that used to run
                    # here was already disabled (commented out) in the original.
                    break
                else:
                    self.log.info("wait tasks be consumed over, wait 5s")
                    time.sleep(5)
            self.beanstalk.__del__()  # close the connection so no more data is accepted
        except Exception as e:
            self.log.error("stop input_thread fail")
            self.log.exception(e)

    def run(self):
        """Reserve jobs from the input tube and queue them while ``running`` is set.

        Each reserved job is deleted from beanstalk before it is queued. When
        the pool reports 50 or more tasks the loop logs the count and sleeps
        2 seconds. On a ``SocketError`` it sleeps 30 seconds and then tries to
        reconnect both connections.
        """
        job_num = 0
        while self.running and self.input_tube:
            try:
                job = self.beanstalk.reserve(self.input_tube, 30)
                if job is not None:
                    job_num += 1
                    body = job.body
                    job.delete()
                    self.process_pool.queue_task(self.__on_task_start, (body,), self.__on_task_finished)
                    task_num = self.process_pool.get_task_num()
                    if task_num >= 50:
                        self.log.info("place_processor\ttasks:%d" % task_num)
                        time.sleep(2)
                else:
                    self.log.info("not msg from:%s" % self.input_tube)
            except SocketError as e:
                time.sleep(30)
                self.log.error('beanstalk\tconnect\tfail\tstart\treconnect')
                self.log.exception(e)
                try:
                    self.beanstalk.reconnect()
                    self.out_beanstalk.reconnect()
                    self.log.error('beanstalk\treconnect\tsuccess')
                except Exception as e:
                    self.log.error('beanstalk\treconnect\tfail')
                    self.log.exception(e)
            except Exception as e:
                self.log.error("not msg from:%s\tresult:" % self.input_tube)
                self.log.exception(e)

    @staticmethod
    def __on_task_start(task, **thread_locals):
        """Run the thread-local processor on ``task``.

        Calls ``profiler.begin()`` first when a ``profiler`` thread-local is
        present. Returns ``processor.do_task(task)``, or None when there is no
        ``processor`` thread-local.
        """
        result = None
        if 'profiler' in thread_locals:
            thread_locals['profiler'].begin()
        if 'processor' in thread_locals:
            result = thread_locals['processor'].do_task(task)
        return result

    def __on_task_finished(self, result, **thread_locals):
        """Forward a finished task's result to ``__output_msg`` under ``t_lock``.

        A non-empty string is sent as one message and a list is sent element
        by element, each together with the thread-local processor (or None).
        The lock is held through a ``with`` block so it is released even when
        the profiler or ``__output_msg`` raises; previously an exception there
        left the lock held and blocked every later callback.
        """
        with self.t_lock:
            processor = thread_locals.get('processor')
            if 'profiler' in thread_locals:
                thread_locals['profiler'].end()
            if result and isinstance(result, basestring):
                self.__output_msg(result, processor)
            elif isinstance(result, list):
                for message in result:
                    self.__output_msg(message, processor)
# NOTE(review): presumably the address this service binds to and its
# thread/process counts - confirm against the code that reads these values.
host = '127.0.0.1'
port = 12400
server_thread_num = 1
server_process_num = 1
process_thread_num = 1
# Earlier beanstalk settings, left commented out:
# beanstalk_conf = {
#     'host': '101.201.102.37',  # production beanstalk intranet IP
#     'port': 11300,  # production beanstalk intranet port
#     'input_tube': 'extract_to_crawl',
#     'output_tube': 'scheduler_info',
# }
# Active beanstalk settings.
# NOTE(review): the inline comments below say "production intranet" but the
# host is loopback - they look copied from the commented-out block above.
beanstalk_conf = {
    'host': '127.0.0.1',  # production beanstalk intranet IP
    'port': 11300,  # production beanstalk intranet port
    'input_tube': 'extract_to_crawl',
    'output_tube': 'scheduler_info',
}
# MySQL connection settings.
# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment or a secrets store.
mysql_host = '101.201.102.37'
mysql_user = '******'
mysql_passwd = 'haizhi@)'
mysql_db = 'cmb_crawl'
# Tube name for download output; where it is consumed is not visible here.
download_output_tube = 'czj_download_rsp'
from i_util.logs import LogHandler
# Module-level logger named "crawler_merge".
log = LogHandler("crawler_merge")
# -*- coding:utf-8 -*- import threading import traceback import MySQLdb from i_util.logs import LogHandler log = LogHandler('redisdb') class PyMySQL(object): FETCH_ONE = 0 FETCH_MANY = 1 FETCH_ALL = 2 SELECT_SQL_FORMAT = "SELECT %s FROM `%s` WHERE %s" INSERT_SQL_FORMAT = "INSERT INTO `%s` (%s) VALUES (%s)" UPDATE_SQL_FORMAT = "UPDATE `%s` SET %s WHERE %s" DETELE_SQL_FORMAT = "DELETE FROM `%s` WHERE %s" def __init__(self, host, post, database, username, password): self.conn = None self.host = host self.post = post self.database = database self.username = username self.password = password self._connect()
# signal.signal(signal.SIGINT, signal_process) # signal.signal(signal.SIGQUIT, signal_process) # signal.signal(signal.SIGUSR1, lambda a, b: profiling_signal_handler("downloader", a, b)) file_path = 'downloader_smart_test.toml' opt, args = getopt.getopt(sys.argv[1:], 'f:', ['help']) for name, value in opt: if name == "-f": file_path = value elif name in ("-h", "--help"): usaget() sys.exit() else: assert False, "unhandled option" if file_path is None or file_path.strip() == '': raise Exception('配置文件路径错误..') with open(file_path, 'rb') as fp: config = pytoml.load(fp) logger_name = config["local_server"].get('name') + str( config["local_server"].get('port')) logger = LogHandler(logger_name) config['log'] = logger logger.info(logger_name) logger.info('开始启动服务....') main(config) except getopt.GetoptError: raise Exception( '参数读取错误...argv = {argv}'.format(argv=" ".join(sys.argv)))