def add_node():
    """Join this machine to the redis crawl cluster.

    Queries the configured seed nodes for the current member list, asks
    every member to record our IP and switch to REHASHING (so it will
    redistribute its tasks over the new membership), then mirrors the
    member list locally and marks this node NORMAL.

    Raises:
        NoSeedsException: when ``mysettings.SEEDS`` is empty, or when no
            seed could be reached at all (the original code raised a
            NameError on an unbound ``cluster_list`` in that case).
    """
    ip = utils.get_local_ip()            # our address, as peers should see it
    r = metric.get_redis()               # redis on this node
    seeds = mysettings.SEEDS.split(',')  # bootstrap nodes to ask
    if not seeds:
        raise NoSeedsException()
    cluster_list = None
    for node in seeds:
        seed = metric.get_redis(host=node)
        try:
            # First reachable seed wins; its member list is authoritative.
            cluster_list = seed.smembers(mysettings.REDIS_CLUSTER_LIST_KEY)
            break
        except Exception:
            # Seed unreachable -- best effort, try the next one.
            pass
    if cluster_list is None:
        # Every seed failed; previously this fell through and crashed with
        # a NameError on the loop below.
        raise NoSeedsException()
    for node in cluster_list:
        # Tell each member about us and flag it for a re-hash; also copy
        # the membership into our own cluster list.
        d = metric.get_redis(host=node)
        d.sadd(mysettings.REDIS_CLUSTER_LIST_KEY, ip)
        d.set(mysettings.REDIS_CLUSTER_STATUS_KEY, mysettings.STATUS_REHASHING)
        r.sadd(mysettings.REDIS_CLUSTER_LIST_KEY, node)
    # Finally register ourselves locally and report a healthy status.
    r.sadd(mysettings.REDIS_CLUSTER_LIST_KEY, ip)
    r.set(mysettings.REDIS_CLUSTER_STATUS_KEY, mysettings.STATUS_NORMAL)
def add_node():
    """Join this machine to the redis crawl cluster (socket-IP variant).

    Same protocol as the ``utils.get_local_ip`` variant: fetch the member
    list from the first reachable seed, tell every member to add our IP
    and enter REHASHING, then mirror the list locally and go NORMAL.

    Raises:
        NoSeedsException: when ``mysettings.SEEDS`` is empty, or when no
            seed could be reached (previously an unbound ``cluster_list``
            produced a NameError instead).
    """
    #fqdn = socket.getfqdn()
    ip = socket.gethostbyname(socket.gethostname())
    r = metric.get_redis()
    seeds = mysettings.SEEDS.split(',')
    if not seeds:
        raise NoSeedsException()
    cluster_list = None
    for node in seeds:
        seed = metric.get_redis(host=node)
        try:
            # First reachable seed supplies the authoritative member list.
            cluster_list = seed.smembers(mysettings.REDIS_CLUSTER_LIST_KEY)
            break
        except Exception:
            # Seed down -- try the next one (best effort).
            pass
    if cluster_list is None:
        raise NoSeedsException()
    for node in cluster_list:
        # Announce ourselves to each member and ask it to re-hash.
        d = metric.get_redis(host=node)
        d.sadd(mysettings.REDIS_CLUSTER_LIST_KEY, ip)
        d.set(mysettings.REDIS_CLUSTER_STATUS_KEY, mysettings.STATUS_REHASHING)
        r.sadd(mysettings.REDIS_CLUSTER_LIST_KEY, node)
    r.sadd(mysettings.REDIS_CLUSTER_LIST_KEY, ip)
    #r.set(mysettings.REDIS_CLUSTER_STATUS_KEY,
    #      mysettings.STATUS_REHASHING)
    r.set(mysettings.REDIS_CLUSTER_STATUS_KEY, mysettings.STATUS_NORMAL)
def del_node():
    """Remove this machine from the cluster and hand its tasks back.

    Deletes our IP from every member's cluster list, then drains the
    local fingerprint set and task set, re-assigning each entry to the
    node chosen by rendezvous hashing over ``new_nodelist`` (presumably
    the membership without us -- TODO confirm against the caller).
    """
    r = metric.get_redis()
    # NOTE(review): the original declared ``global new_nodelist`` but only
    # ever read it, so the declaration was redundant and has been dropped.
    cluster_list = r.smembers(mysettings.REDIS_CLUSTER_LIST_KEY)
    ip = socket.gethostbyname(socket.gethostname())
    for node in cluster_list:
        d = metric.get_redis(host=node)
        d.srem(mysettings.REDIS_CLUSTER_LIST_KEY, ip)
    # The two drain loops were identical except for the key; factored out.
    _reassign_set(r, task_fingerprint_key)
    _reassign_set(r, task_key)


def _reassign_set(r, key):
    """Pop every member of ``key`` on ``r`` and re-add it on the node
    selected by rendezvous hashing over ``new_nodelist``."""
    while 1:
        val = r.spop(key)
        if not val:
            break
        host = rendezvous_hashing(val, new_nodelist)
        d = metric.get_redis(host=host)
        d.sadd(key, val)
def get_cluster_state():
    """Return the status value reported by every node in the cluster.

    Reads the member list from the local redis, then asks each member
    for its status key; the result is one status entry per member.
    """
    local = metric.get_redis()
    members = local.smembers(mysettings.REDIS_CLUSTER_LIST_KEY)
    return [
        metric.get_redis(host=node).get(mysettings.REDIS_CLUSTER_STATUS_KEY)
        for node in members
    ]
def new_task(url, get_fingerprint=lambda x: x):
    """Queue *url* on the cluster node that owns its fingerprint.

    The owner is picked by rendezvous hashing; the task is only added
    when its fingerprint is not already recorded there (de-duplication).
    """
    r = metric.get_redis()
    #cluster_list = r.smembers(mysettings.REDIS_CLUSTER_LIST_KEY)
    fp = get_fingerprint(url)
    owner = rendezvous_hashing(fp, cluster_list)
    node = metric.get_redis(host=owner)
    if not node.sismember(task_fingerprint_key, fp):
        node.sadd(task_key, url)
        node.sadd(task_fingerprint_key, fp)
def push(self, request):
    """Push *request* onto the distributed queue, de-duplicated by
    fingerprint; the target node is chosen by rendezvous hashing over
    ``self.cluster_list``."""
    fp = self._get_fingerprint(request)
    owner = add_task.rendezvous_hashing(fp, self.cluster_list)
    node = metric.get_redis(host=owner)
    if node.sismember(self.fingerprint_key, fp):
        return
    node.sadd(self.key, self._encode_request(request))
    node.sadd(self.fingerprint_key, fp)
def add_task(): r = metric.get_redis() pos = 1 end = 28010000 #end = 100000 limit = 10000 while pos <= end: cnt = int(r.scard(task_key)) if cnt < limit: print 'add tasks', pos pipeline = r.pipeline() for i in xrange(pos, pos + limit): exits = metric.get_metric( i, 'answer') or metric.get_metric(i) or metric.get_metric( i, '404') if not exits: pipeline.sadd(task_key, i) pipeline.execute() pos += limit time.sleep(0.3)
def tasks_generator(build_url=None):
    # Yield crawlable urls: pull random entries from the local task set,
    # fall back to the error-retry set, and stop only when both are empty.
    #
    # build_url: optional callable mapping a raw task value to a url;
    # defaults to identity.  Values already starting with 'http' are
    # yielded as-is.
    if not build_url:
        build_url = lambda x: x
    r = metric.get_redis()
    while 1:
        # Back off while the proxy pool is too small.
        cnt = int(r.scard(mysettings.REDIS_PROXY_POOL_KEY))
        if cnt < mysettings.LEAST_PROXY_NUM:
            time.sleep(mysettings.NO_PROXY_SLEEP_TIME)
            logger.error("no enough proxy sleep %s" %
                         mysettings.NO_PROXY_SLEEP_TIME)
            #stop when there is not enough proxy
            # NOTE(review): despite the comment above, control falls
            # through and a task is still fetched after the sleep --
            # confirm whether a `continue` was intended here.
        # srandmember does NOT remove the task; completion is handled
        # elsewhere (see the disabled dedupe block below).
        task = r.srandmember(task_key)
        #cluster_list = r.smembers(mysettings.REDIS_CLUSTER_LIST_KEY)
        #host = rendezvous_hashing(task, cluster_list)
        #d = metric.get_redis(host=host)
        #exits = metric.iscomplete(task, d)
        #if exits:
        #continue
        if task:
            if not str(task).startswith('http'):
                url = build_url(task)
            else:
                url = task
            yield url
        else:
            # TODO add to settings
            # yield utils.sleep(5)
            #time.sleep(5)
            # Task set empty: retry a previously failed url if any.
            url = r.spop(task_errors_key)
            if url:
                yield url
                continue
            # Both sets drained -> generator is exhausted.
            cnt = int(r.scard(task_key))
            if cnt == 0:
                break
def __init__(self, spider, key):
    """Bind the queue to *spider* and derive the redis key names.

    *key* is a template containing ``%(spider)s``, filled in from
    ``mysettings.SPIDER_NAME``.
    """
    self.spider = spider
    self.key = key % {'spider': mysettings.SPIDER_NAME}
    self.fingerprint_key = '%s:fingerprint' % self.key
    self.proxy_key = mysettings.REDIS_PROXY_POOL_KEY
    self.r = metric.get_redis()
def is_completed_task(fingerprint):
    """Return truthy when *fingerprint* is already recorded on the node
    that owns it (per rendezvous hashing over ``cluster_list``)."""
    owner = rendezvous_hashing(fingerprint, cluster_list)
    node = metric.get_redis(host=owner)
    return node.sismember(task_fingerprint_key, fingerprint)
def new_error_task(url):
    """Record *url* in the retry set so tasks_generator re-yields it later."""
    metric.get_redis().sadd(task_errors_key, url)
def sync_cluster(tasks_key, fingerprint_key, get_fingerprint, request_decode):
    """Redistribute queued tasks after a cluster membership change.

    At most once per ``SYNC_TIME_INTERAL`` seconds, checks whether any
    node reports REHASHING.  If so, runs a barrier protocol: announce
    READY, wait for all REHASHING flags to clear, drain our task and
    fingerprint sets into per-owner ``<key>:tmp`` sets on the nodes
    chosen by rendezvous hashing, announce DONE, wait for all READY
    flags to clear, then atomically rename the tmp sets into place and
    go back to NORMAL.

    Fix: the original built ``keys`` from the module-global ``task_key``
    instead of the ``tasks_key`` parameter, which was otherwise unused --
    an evident typo (its twin ``fingerprint_key`` IS taken from the
    parameter).

    :param tasks_key: redis set holding pending task payloads.
    :param fingerprint_key: redis set holding task fingerprints.
    :param get_fingerprint: maps a decoded request to its fingerprint.
    :param request_decode: decodes a raw task payload.
    :returns: 1 when a re-hash was performed, 0 otherwise.
    """
    global cluster_last_sync_time
    r = metric.get_redis()
    now_time = time.time()
    cluster_list = r.smembers(mysettings.REDIS_CLUSTER_LIST_KEY)
    if now_time - cluster_last_sync_time > mysettings.SYNC_TIME_INTERAL:
        states = get_cluster_state()
        if mysettings.STATUS_REHASHING in states:
            # Barrier 1: signal READY and wait until nobody is REHASHING.
            r.set(mysettings.REDIS_CLUSTER_STATUS_KEY,
                  mysettings.STATUS_READY)
            while 1:
                states = get_cluster_state()
                if mysettings.STATUS_REHASHING in states:
                    time.sleep(1)
                else:
                    break
            keys = [tasks_key, fingerprint_key]
            tmp_keys = [('%s:tmp' % key) for key in keys]
            # One buffered pipeline and a pending-op counter per member.
            dst = {
                host: metric.get_redis(host=host).pipeline()
                for host in cluster_list
            }
            cnt = {host: 0 for host in cluster_list}
            idx = 0
            for key, tmp_key in zip(keys, tmp_keys):
                val = r.spop(key)
                while val:
                    if idx == 0:
                        # Task payloads hash on their fingerprint.
                        host = rendezvous_hashing(
                            get_fingerprint(request_decode(val)),
                            cluster_list)
                    elif idx == 1:
                        # Fingerprints hash on themselves.
                        host = rendezvous_hashing(val, cluster_list)
                    dst[host].sadd(tmp_key, val)
                    cnt[host] += 1
                    # Flush each pipeline every ~100 buffered commands.
                    if cnt[host] > 100:
                        dst[host].execute()
                        cnt[host] = 0
                    val = r.spop(key)
                idx += 1
            for d in dst:
                dst[d].execute()
            # Barrier 2: signal DONE and wait until nobody is READY.
            r.set(mysettings.REDIS_CLUSTER_STATUS_KEY, mysettings.STATUS_DONE)
            while 1:
                states = get_cluster_state()
                if mysettings.STATUS_READY in states:
                    time.sleep(1)
                else:
                    break
            # Swap the rebuilt tmp sets into place; a missing tmp key
            # makes RENAME fail, which we deliberately ignore.
            p = r.pipeline()
            for i in xrange(len(keys)):
                try:
                    p.rename(tmp_keys[i], keys[i])
                except redis.exceptions.ResponseError:
                    pass
            try:
                p.execute()
            except redis.exceptions.ResponseError:
                pass
            r.set(mysettings.REDIS_CLUSTER_STATUS_KEY,
                  mysettings.STATUS_NORMAL)
            # NOTE(review): cluster_last_sync_time is NOT refreshed on
            # this path, so the next call re-checks immediately --
            # confirm whether that is intended.
            return 1
        cluster_last_sync_time = time.time()
    return 0
pass try: p.execute() except redis.exceptions.ResponseError: pass r.set(mysettings.REDIS_CLUSTER_STATUS_KEY, mysettings.STATUS_NORMAL) return 1 cluster_last_sync_time = time.time() return 0 r = metric.get_redis() cluster_list = r.smembers(mysettings.REDIS_CLUSTER_LIST_KEY) def new_task(url, get_fingerprint=lambda x: x): r = metric.get_redis() #cluster_list = r.smembers(mysettings.REDIS_CLUSTER_LIST_KEY) global cluster_list fingerprint = get_fingerprint(url) host = rendezvous_hashing(fingerprint, cluster_list) d = metric.get_redis(host=host) exits = d.sismember(task_fingerprint_key, fingerprint) if not exits: d.sadd(task_key, url)
import gevent.monkey from gevent.pool import Pool gevent.monkey.patch_all() import requests import re import socket import time IP = re.compile(r'\d{1,3}(?:\.\d{1,3}){3}(?::\d{1,6})?') import mysettings import metric import redis #r = redis.Redis(host=mysettings.REDIS_SERVER, port=int(mysettings.REDIS_PORT), db=0) r = metric.get_redis() from utils import retry @retry(0, 'proxy') def get_real_ip(proxy=None): url = 'http://www.baidu.com/s?wd=ip' resp = requests.get(url, proxies={'http': 'http://%s'%proxy} if proxy else {}, timeout=3, headers={'user-agent': 'Chrome',}) if resp.status_code != 200: raise Exception #real_ip = IP.findall(resp.content)[0] real_ip = IP.findall(resp.content[resp.content.find('我的ip地址'):])[0] print real_ip, proxy return real_ip