def register():
    # Create the node table if needed, then register this node's IP
    # unless it is already present.
    MYSQL.execute(
        SqlComment.CREATE.format(
            **dict(table='tbl_spiders', fields='ip CHAR(20) PRIMARY KEY')))
    nodes = MYSQL.select(
        SqlComment.SELECT.format(**dict(field='ip', table='tbl_spiders')))
    nodes = [node['ip'] for node in nodes]
    if APP_CONF['config']['localhost'] not in nodes:
        MYSQL.execute(
            SqlComment.INSERT.format(
                **dict(table='tbl_spiders',
                       fields='ip',
                       values='"' + APP_CONF['config']['localhost'] + '"')))
        LOGGER.info("Register node: " + APP_CONF['config']['localhost'])
        MYSQL.commit()
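# A minimal sketch of the SqlComment templates that register() formats,
# assuming they are plain format strings; the project's actual definitions
# may differ.
class SqlComment(object):
    CREATE = 'CREATE TABLE IF NOT EXISTS {table} ({fields})'
    SELECT = 'SELECT {field} FROM {table}'
    INSERT = 'INSERT INTO {table} ({fields}) VALUES ({values})'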
def on_request(ch, method, props, body):
    # Dispatch incoming RabbitMQ messages by correlation_id: master
    # confirmation ('notify'), crawl requests ('crawl'), and elections.
    if props.correlation_id == 'notify' and REDIS.get(
            'master_ip') == APP_CONF['config']['localhost']:
        LOGGER.info("New Master Confirmed.")
        # CLIENT.call_crawl()
        if SCHEDULER.get_job('crawl') is None:
            SCHEDULER.add_job(CLIENT.call_crawl,
                              'interval',
                              id='crawl',
                              minutes=10,
                              next_run_time=datetime.datetime.now())
    elif props.correlation_id == 'crawl':
        LOGGER.info("Starting Crawling.")
        crawl(body)
    elif props.correlation_id == 'elect' and body != APP_CONF['config']['localhost']:
        elect()
def call_crawl(self):
    # Only the elected master may broadcast start urls; if this node has
    # lost mastership, stop the local crawl schedule instead.
    if not self.check_master():
        if self.SCHEDULER.state == 1 and self.SCHEDULER.get_job('crawl') is not None:
            # self.SCHEDULER.remove_job('crawl')
            self.SCHEDULER.shutdown()
        return
    REDIS.set('start_crawl_time',
              datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'))
    LOGGER.info("Initializing start urls.")
    init_start_urls(APP_CONF['redis']['host'], APP_CONF['redis']['port'],
                    "start_urls")
    LOGGER.info("Requesting for spiders.")
    self.channel.basic_publish(
        exchange='fanout_start_urls',
        properties=pika.BasicProperties(
            # reply_to=self.callback_queue,
            correlation_id='crawl',
        ),
        routing_key='',
        body='')
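# A plausible sketch of init_start_urls(), which seeds the Redis
# "start_urls" list that crawl() consumes. The 'seed_urls' config key is
# an assumption; the real seed source lives elsewhere in the project.
import redis

def init_start_urls(host, port, key):
    r = redis.StrictRedis(host=host, port=port)
    for url in APP_CONF['config'].get('seed_urls', []):
        r.lpush(key, url)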
def crawl(body):
    # The shell treats anything after a space as a new spider argument,
    # so spaces must be stripped from the url list.
    # os.system('scrapy crawl news -a start_urls=%s' % str(urls).replace(' ', ''))
    # proxy_flag = True if body == 'true' else False
    # LOGGER.info("Enable Proxy: %s." % proxy_flag)
    while len(REDIS.lrange("start_urls", 0, -1)):
        spider_type = "news"
        url = REDIS.lpop('start_urls')
        LOGGER.info("URL: %s." % url)
        # Pick the spider whose key matches the url; default to "news".
        for k, v in SPIDER_TYPES_MAPPING.items():
            if k in url:
                spider_type = v
                break
        LOGGER.info("Start Crawling %s." % spider_type)
        # os.system('scrapy crawl %s -a start_url="%s"' % (spider_type, url))
        LOGGER.info("Crawling %s Finished." % spider_type)
    LOGGER.info("Awaiting RPC requests")
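# A hypothetical example of the SPIDER_TYPES_MAPPING that crawl() consults;
# the real keys and spider names are defined elsewhere in the project.
SPIDER_TYPES_MAPPING = {
    'weibo.com': 'weibo',
    'zhihu.com': 'zhihu',
}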
# -*- coding: utf-8 -*-
import datetime

from events_spider.utils.tools import LOGGER, APP_CONF, SCHEDULER
from rpc_client import RpcClient
from SimpleXMLRPCServer import SimpleXMLRPCServer

CLIENT = RpcClient()


def callback():
    # Invoked over XML-RPC once this node has been elected master.
    LOGGER.info("New Master Confirmed.")
    CLIENT.call()
    # SCHEDULER.add_job(CLIENT.call, 'interval', id='call',
    #                   minutes=APP_CONF['config']['crawl_frequency'],
    #                   next_run_time=datetime.datetime.now())
    LOGGER.info(SCHEDULER.get_jobs())


server = SimpleXMLRPCServer((APP_CONF['config']['localhost'], 8888))
server.register_function(callback, "call")
LOGGER.info("Awaiting Being Elected.")
server.serve_forever()
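# A sketch of how the client side might reach the server above, assuming
# Python 2's xmlrpclib and the same port 8888; the actual RpcClient that
# issues this call is defined elsewhere.
import xmlrpclib

def notify_master(master_ip):
    # Invokes the 'call' function registered on the newly elected master.
    proxy = xmlrpclib.ServerProxy('http://%s:8888/' % master_ip)
    proxy.call()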
def elect():
    global HAS_ELECTED
    if HAS_ELECTED:
        LOGGER.info('Elect Already.')
        return
    # Guard against re-entry while this election is still in progress.
    HAS_ELECTED = True
    # Notify the other nodes to start electing.
    CLIENT.call_elect()
    host = REDIS.get('master_ip')
    if host == APP_CONF['config']['localhost']:
        LOGGER.info("Ping " + host + ' Success.')
        REDIS.lpush('spiders_vote', APP_CONF['config']['localhost'] + ':0')
    elif host is not None:
        # host = host[0]['ip']
        result = os.system('ping -c 2 ' + host)
        if result == 0:
            LOGGER.info("Ping " + host + ' Success.')
            REDIS.lpush('spiders_vote', APP_CONF['config']['localhost'] + ':0')
        else:
            LOGGER.info("Ping " + host + ' Failed.')
            REDIS.lpush('spiders_vote', APP_CONF['config']['localhost'] + ':1')
        time.sleep(60)
        votes = REDIS.lrange('spiders_vote', 0, -1)
        ips = []
        score = 0
        for v in votes:
            ip, vote = v.split(':')
            ips.append(ip)
            score += int(vote)
        # The lock is released automatically after a fixed period.
        if LOCK.accquire_lock():
            LOGGER.info('Acquiring Lock.')
            # At least half of the nodes failed to ping the master.
            if score >= len(ips) / 2:
                # Randomly pick one of the voting hosts as the new master.
                master = random.choice(ips)
                LOGGER.info('Electing master: ' + master)
                REDIS.set('master_ip', master)
                # Notify the new master.
                if REDIS.get('master_ip') != APP_CONF['config']['localhost']:
                    CLIENT.call_notify()
                elif SCHEDULER.get_job('crawl') is None:
                    SCHEDULER.add_job(CLIENT.call_crawl,
                                      'interval',
                                      id='crawl',
                                      minutes=10,
                                      next_run_time=datetime.datetime.now())
            REDIS.delete('spiders_vote')
            # LOCK.relese_lock()
            LOGGER.info('Electing New Master Finished.')
    else:
        # No master recorded yet: pick one from all registered nodes.
        ips = MYSQL.select(
            SqlComment.SELECT.format(**dict(field='ip', table='tbl_spiders')))
        ips = [i['ip'] for i in ips]
        time.sleep(10)
        if LOCK.accquire_lock():
            LOGGER.info('Acquiring Lock.')
            master = random.choice(ips)
            LOGGER.info('Electing master: ' + master)
            REDIS.set('master_ip', master)
            # Notify the new master.
            if REDIS.get('master_ip') != APP_CONF['config']['localhost']:
                CLIENT.call_notify()
            elif SCHEDULER.get_job('crawl') is None:
                SCHEDULER.add_job(CLIENT.call_crawl,
                                  'interval',
                                  id='crawl',
                                  minutes=10,
                                  next_run_time=datetime.datetime.now())
            REDIS.delete('spiders_vote')
            # LOCK.relese_lock()
            LOGGER.info('Electing New Master Finished.')
    HAS_ELECTED = False
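# A minimal sketch of the auto-expiring Redis lock that elect() relies on,
# assuming SET NX + EX semantics; the project's actual Lock class (and its
# accquire_lock/relese_lock method spellings) is defined elsewhere.
class Lock(object):
    def __init__(self, redis_client, key='spiders_lock', ttl=120):
        self.redis = redis_client
        self.key = key
        self.ttl = ttl  # seconds after which the lock frees itself

    def accquire_lock(self):
        # Only one node can set the key; it expires automatically.
        return bool(self.redis.set(self.key, '1', nx=True, ex=self.ttl))

    def relese_lock(self):
        self.redis.delete(self.key)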
def monitor():
    # Trigger a new election when the master has not started a crawl
    # within 1.5x the configured crawl frequency.
    start_time = REDIS.get('start_crawl_time')
    if start_time is not None:
        start_time = datetime.datetime.strptime(start_time,
                                                "%Y-%m-%dT%H:%M:%S")
        delta = datetime.timedelta(
            minutes=APP_CONF['config']['crawl_frequency'] * 1.5)
        deadline = start_time + delta
    if start_time is None or datetime.datetime.now() > deadline:
        elect()


channel.basic_qos(prefetch_count=1)
channel.basic_consume(on_request, queue=queue_name, no_ack=True)

register()
SCHEDULER.add_job(monitor,
                  'interval',
                  id='monitor',
                  minutes=10,
                  next_run_time=datetime.datetime.now())
SCHEDULER.start()
# monitor()
LOGGER.info("Awaiting RPC requests")
channel.start_consuming()
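# A sketch of the pika wiring that `channel` and `queue_name` above assume:
# a fanout exchange with an exclusive per-node queue. The exchange name
# matches call_crawl(); the APP_CONF['rabbitmq'] host key is a guess.
import pika

connection = pika.BlockingConnection(
    pika.ConnectionParameters(host=APP_CONF['rabbitmq']['host']))
channel = connection.channel()
channel.exchange_declare(exchange='fanout_start_urls',
                         exchange_type='fanout')
result = channel.queue_declare(exclusive=True)
queue_name = result.method.queue
channel.queue_bind(exchange='fanout_start_urls', queue=queue_name)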