コード例 #1
0
def add_node():
    """Join this host to the redis cluster announced by the seed nodes.

    Contacts each configured seed until one returns the current cluster
    membership, registers this host's IP on every member (flagging them
    for re-hashing so they redistribute tasks), then marks this node
    as normal.

    Raises:
        NoSeedsException: if no seeds are configured, or none of them
            could be contacted.
    """
    ip = utils.get_local_ip()  # local IP of this node
    r = metric.get_redis()  # redis connection for this node

    seeds = mysettings.SEEDS.split(',')  # seed nodes to contact
    if not seeds:
        raise NoSeedsException()

    # Ask the seeds in turn for the current cluster membership; the first
    # reachable seed wins.
    cluster_list = None
    for node in seeds:
        seed = metric.get_redis(host=node)  # redis connection of this seed
        try:
            cluster_list = seed.smembers(mysettings.REDIS_CLUSTER_LIST_KEY)
            break
        except Exception:  # seed unreachable -- try the next one
            pass
    if cluster_list is None:
        # none of the seeds could be contacted
        raise NoSeedsException()

    for node in cluster_list:
        # Add our IP to every other node's cluster list and flag that node
        # for re-hashing (it will redistribute its tasks); also mirror each
        # node into our own cluster list.
        d = metric.get_redis(host=node)
        d.sadd(mysettings.REDIS_CLUSTER_LIST_KEY, ip)
        d.set(mysettings.REDIS_CLUSTER_STATUS_KEY, mysettings.STATUS_REHASHING)
        r.sadd(mysettings.REDIS_CLUSTER_LIST_KEY, node)

    # Finally register ourselves locally and start out in the normal state.
    r.sadd(mysettings.REDIS_CLUSTER_LIST_KEY, ip)
    r.set(mysettings.REDIS_CLUSTER_STATUS_KEY, mysettings.STATUS_NORMAL)
コード例 #2
0
ファイル: add_node.py プロジェクト: heicks/github-crawler
def add_node():
  """Join this host to the redis cluster announced by the seed nodes.

  Raises:
      NoSeedsException: if no seeds are configured, or none of them
          could be contacted.
  """
  #fqdn = socket.getfqdn()
  ip = socket.gethostbyname(socket.gethostname())
  r = metric.get_redis()

  seeds = mysettings.SEEDS.split(',')

  if not seeds:
    raise NoSeedsException()

  # Ask the seeds in turn for the current cluster membership; the first
  # reachable seed wins.
  cluster_list = None
  for node in seeds:
    seed = metric.get_redis(host=node)
    try:
      cluster_list = seed.smembers(mysettings.REDIS_CLUSTER_LIST_KEY)
      break
    except Exception:  # seed unreachable -- try the next one
      pass
  if cluster_list is None:
    # none of the seeds could be contacted
    raise NoSeedsException()

  for node in cluster_list:
    # Announce ourselves to every existing node and flag it for re-hashing;
    # mirror each node into our own cluster list.
    d = metric.get_redis(host=node)
    d.sadd(mysettings.REDIS_CLUSTER_LIST_KEY, ip)
    d.set(mysettings.REDIS_CLUSTER_STATUS_KEY,
        mysettings.STATUS_REHASHING)
    r.sadd(mysettings.REDIS_CLUSTER_LIST_KEY, node)

  r.sadd(mysettings.REDIS_CLUSTER_LIST_KEY, ip)
  # this node itself starts in the normal state
  r.set(mysettings.REDIS_CLUSTER_STATUS_KEY,
      mysettings.STATUS_NORMAL)
コード例 #3
0
ファイル: del_node.py プロジェクト: heicks/github-crawler
def _move_set_members(src, key):
  """Pop every member of *src*'s set *key* and re-add it under the same key
  on the node chosen by rendezvous hashing over new_nodelist."""
  while True:
    member = src.spop(key)
    if not member:
      break
    host = rendezvous_hashing(member, new_nodelist)
    metric.get_redis(host=host).sadd(key, member)


def del_node():
  """Remove this host from the cluster and redistribute its queued work.

  Drops this host's IP from every member's cluster list, then hands the
  local task fingerprints and tasks over to the remaining nodes (chosen
  via rendezvous hashing over the module-level ``new_nodelist``).
  """
  r = metric.get_redis()

  # Drop our IP from every node's cluster list.
  cluster_list = r.smembers(mysettings.REDIS_CLUSTER_LIST_KEY)
  ip = socket.gethostbyname(socket.gethostname())
  for node in cluster_list:
    d = metric.get_redis(host=node)
    d.srem(mysettings.REDIS_CLUSTER_LIST_KEY, ip)

  # Hand our queued fingerprints and tasks over to the remaining nodes.
  _move_set_members(r, task_fingerprint_key)
  _move_set_members(r, task_key)
コード例 #4
0
ファイル: add_node.py プロジェクト: heicks/github-crawler
def add_node():
    """Join this host to the redis cluster announced by the seed nodes.

    Raises:
        NoSeedsException: if no seeds are configured, or none of them
            could be contacted.
    """
    #fqdn = socket.getfqdn()
    ip = socket.gethostbyname(socket.gethostname())
    r = metric.get_redis()

    seeds = mysettings.SEEDS.split(',')

    if not seeds:
        raise NoSeedsException()

    # Ask the seeds in turn for the current cluster membership; the first
    # reachable seed wins.
    cluster_list = None
    for node in seeds:
        seed = metric.get_redis(host=node)
        try:
            cluster_list = seed.smembers(mysettings.REDIS_CLUSTER_LIST_KEY)
            break
        except Exception:  # seed unreachable -- try the next one
            pass
    if cluster_list is None:
        # none of the seeds could be contacted
        raise NoSeedsException()

    for node in cluster_list:
        # Announce ourselves to every existing node and flag it for
        # re-hashing; mirror each node into our own cluster list.
        d = metric.get_redis(host=node)
        d.sadd(mysettings.REDIS_CLUSTER_LIST_KEY, ip)
        d.set(mysettings.REDIS_CLUSTER_STATUS_KEY, mysettings.STATUS_REHASHING)
        r.sadd(mysettings.REDIS_CLUSTER_LIST_KEY, node)

    r.sadd(mysettings.REDIS_CLUSTER_LIST_KEY, ip)
    # this node itself starts in the normal state
    r.set(mysettings.REDIS_CLUSTER_STATUS_KEY, mysettings.STATUS_NORMAL)
コード例 #5
0
ファイル: add_task.py プロジェクト: heicks/github-crawler
def get_cluster_state():
    """Return the status value reported by every node in the cluster."""
    conn = metric.get_redis()
    members = conn.smembers(mysettings.REDIS_CLUSTER_LIST_KEY)
    # One status entry per cluster member, in set-iteration order.
    return [
        metric.get_redis(host=member).get(mysettings.REDIS_CLUSTER_STATUS_KEY)
        for member in members
    ]
コード例 #6
0
ファイル: add_task.py プロジェクト: heicks/github-crawler
def new_task(url, get_fingerprint=lambda x: x):
    """Queue *url* on the node that owns its fingerprint, unless that node
    has already seen the fingerprint."""
    local_conn = metric.get_redis()
    global cluster_list

    fp = get_fingerprint(url)
    owner = rendezvous_hashing(fp, cluster_list)
    owner_conn = metric.get_redis(host=owner)

    # Skip URLs whose fingerprint the owner node already recorded.
    if owner_conn.sismember(task_fingerprint_key, fp):
        return
    owner_conn.sadd(task_key, url)
    owner_conn.sadd(task_fingerprint_key, fp)
コード例 #7
0
  def push(self, request):
    """Enqueue *request* on the cluster node that owns its fingerprint,
    skipping requests whose fingerprint that node already recorded."""
    fp = self._get_fingerprint(request)

    owner = add_task.rendezvous_hashing(fp, self.cluster_list)
    conn = metric.get_redis(host=owner)
    # Only enqueue requests whose fingerprint is new on the owner node.
    if conn.sismember(self.fingerprint_key, fp):
      return
    conn.sadd(self.key, self._encode_request(request))
    conn.sadd(self.fingerprint_key, fp)
コード例 #8
0
ファイル: add_task.py プロジェクト: heicks/github-crawler
def add_task():
    """Seed the local task set with ids 1..end in batches, skipping ids that
    already have a metric recorded (presumably already crawled -- confirm),
    while keeping the pending backlog below *limit*."""
    r = metric.get_redis()
    pos = 1  # next id to enqueue
    end = 28010000  # highest id to enqueue
    #end = 100000
    limit = 10000  # batch size, and max backlog before pausing
    while pos <= end:
        cnt = int(r.scard(task_key))
        # Only refill when the backlog has drained below the limit.
        if cnt < limit:
            print 'add tasks', pos
            pipeline = r.pipeline()
            # Enqueue the next batch; an id is skipped when any metric
            # variant ('answer', default, or '404') already exists for it.
            for i in xrange(pos, pos + limit):
                exits = metric.get_metric(
                    i, 'answer') or metric.get_metric(i) or metric.get_metric(
                        i, '404')
                if not exits:
                    pipeline.sadd(task_key, i)
            pipeline.execute()
            pos += limit
        time.sleep(0.3)  # throttle the polling loop
コード例 #9
0
ファイル: add_task.py プロジェクト: heicks/github-crawler
def tasks_generator(build_url=None):
    """Yield crawlable URLs from the task set; retry failed URLs when the
    regular set is empty, and stop once everything has drained.

    *build_url* turns a bare task id into a URL (identity by default).
    """
    if not build_url:
        build_url = lambda x: x
    conn = metric.get_redis()
    while True:
        # Throttle while the proxy pool is too small to crawl safely.
        proxy_count = int(conn.scard(mysettings.REDIS_PROXY_POOL_KEY))
        if proxy_count < mysettings.LEAST_PROXY_NUM:
            time.sleep(mysettings.NO_PROXY_SLEEP_TIME)
            logger.error("no enough proxy sleep %s" %
                         mysettings.NO_PROXY_SLEEP_TIME)

        task = conn.srandmember(task_key)
        if task:
            # Bare identifiers are turned into URLs; full URLs pass through.
            yield task if str(task).startswith('http') else build_url(task)
            continue

        # No regular task: hand out a previously failed URL first.
        retry_url = conn.spop(task_errors_key)
        if retry_url:
            yield retry_url
            continue

        # Stop only once the task set is definitively empty.
        if int(conn.scard(task_key)) == 0:
            break
コード例 #10
0
 def __init__(self, spider, key):
   """Bind this queue to *spider*, expanding *key* with the configured
   spider name and opening the local redis connection."""
   self.spider = spider
   self.key = key % {'spider': mysettings.SPIDER_NAME}  # per-spider queue key
   self.proxy_key = mysettings.REDIS_PROXY_POOL_KEY
   self.fingerprint_key = '%s:fingerprint' % self.key  # dedup set for this queue
   self.r = metric.get_redis()  # local redis connection
コード例 #11
0
ファイル: add_task.py プロジェクト: heicks/github-crawler
def is_completed_task(fingerprint):
    """Return whether *fingerprint* is already recorded on the cluster node
    that owns it (per rendezvous hashing over the cached cluster list)."""
    global cluster_list
    owner = rendezvous_hashing(fingerprint, cluster_list)
    owner_conn = metric.get_redis(host=owner)
    return owner_conn.sismember(task_fingerprint_key, fingerprint)
コード例 #12
0
ファイル: add_task.py プロジェクト: heicks/github-crawler
def new_error_task(url):
    """Record *url* as a failed task so it can be retried later."""
    metric.get_redis().sadd(task_errors_key, url)
コード例 #13
0
ファイル: add_task.py プロジェクト: heicks/github-crawler
def sync_cluster(tasks_key, fingerprint_key, get_fingerprint, request_decode):
    """Periodically rebalance the local task/fingerprint sets across the
    cluster when any node is flagged STATUS_REHASHING.

    Protocol (driven by the shared status key on each node):
      1. advertise STATUS_READY, wait until no node is still REHASHING;
      2. drain the local sets into per-host pipelines keyed by rendezvous
         hashing, writing into '<key>:tmp' staging sets;
      3. advertise STATUS_DONE, wait until no node is still READY;
      4. atomically rename the staging sets over the real ones and return
         to STATUS_NORMAL.

    Returns 1 when a rebalance ran, 0 otherwise.

    NOTE(review): the *tasks_key* parameter is never used -- the global
    ``task_key`` is read instead; confirm whether that is intentional.
    """
    global cluster_last_sync_time
    r = metric.get_redis()
    now_time = time.time()
    cluster_list = r.smembers(mysettings.REDIS_CLUSTER_LIST_KEY)

    # Only attempt a sync once per SYNC_TIME_INTERAL seconds.
    if now_time - cluster_last_sync_time > mysettings.SYNC_TIME_INTERAL:
        states = get_cluster_state()
        if mysettings.STATUS_REHASHING in states:

            # Phase 1: signal readiness and wait for everyone else to stop
            # re-hashing.
            r.set(mysettings.REDIS_CLUSTER_STATUS_KEY, mysettings.STATUS_READY)

            while 1:
                states = get_cluster_state()
                if mysettings.STATUS_REHASHING in states:
                    time.sleep(1)
                else:
                    break

            # Phase 2: drain both local sets into per-host staging sets.
            keys = [task_key, fingerprint_key]
            tmp_keys = [('%s:tmp' % key) for key in keys]

            # One buffered pipeline and pending-write counter per host.
            dst = {
                host: metric.get_redis(host=host).pipeline()
                for host in cluster_list
            }
            cnt = {host: 0 for host in cluster_list}

            # idx 0 = tasks (hash the decoded request's fingerprint),
            # idx 1 = fingerprints (hash the raw value).
            idx = 0
            for key, tmp_key in zip(keys, tmp_keys):

                val = r.spop(key)
                while val:

                    if idx == 0:
                        host = rendezvous_hashing(
                            get_fingerprint(request_decode(val)), cluster_list)
                    elif idx == 1:
                        host = rendezvous_hashing(val, cluster_list)
                    dst[host].sadd(tmp_key, val)
                    cnt[host] += 1
                    # Flush each pipeline after ~100 buffered writes.
                    if cnt[host] > 100:
                        dst[host].execute()
                        cnt[host] = 0
                    val = r.spop(key)

                idx += 1

            # Flush whatever remains buffered in every pipeline.
            for d in dst:
                dst[d].execute()

            # Phase 3: signal completion and wait for the other nodes.
            r.set(mysettings.REDIS_CLUSTER_STATUS_KEY, mysettings.STATUS_DONE)
            while 1:
                states = get_cluster_state()
                if mysettings.STATUS_READY in states:
                    time.sleep(1)
                else:
                    break

            # Phase 4: promote the staging sets; a missing tmp key raises
            # ResponseError, which is deliberately ignored.
            p = r.pipeline()
            for i in xrange(len(keys)):
                try:
                    p.rename(tmp_keys[i], keys[i])
                except redis.exceptions.ResponseError:
                    pass

            try:
                p.execute()
            except redis.exceptions.ResponseError:
                pass
            r.set(mysettings.REDIS_CLUSTER_STATUS_KEY,
                  mysettings.STATUS_NORMAL)

            return 1

        cluster_last_sync_time = time.time()
    return 0
コード例 #14
0
ファイル: add_task.py プロジェクト: heicks/github-crawler
                    pass

            try:
                p.execute()
            except redis.exceptions.ResponseError:
                pass
            r.set(mysettings.REDIS_CLUSTER_STATUS_KEY,
                  mysettings.STATUS_NORMAL)

            return 1

        cluster_last_sync_time = time.time()
    return 0


# Module-level redis connection and cached cluster membership, shared by
# the task helpers in this module.
r = metric.get_redis()
cluster_list = r.smembers(mysettings.REDIS_CLUSTER_LIST_KEY)


def new_task(url, get_fingerprint=lambda x: x):
    """Queue *url* on the node that owns its fingerprint, unless that node
    has already seen the fingerprint.

    NOTE(review): unlike another version of this function in the codebase,
    the fingerprint itself is never added to task_fingerprint_key here, so
    the same URL could be enqueued repeatedly -- confirm whether a trailing
    ``d.sadd(task_fingerprint_key, fingerprint)`` was lost.
    """
    r = metric.get_redis()
    #cluster_list = r.smembers(mysettings.REDIS_CLUSTER_LIST_KEY)
    global cluster_list

    fingerprint = get_fingerprint(url)
    # Rendezvous hashing picks the single node responsible for this URL.
    host = rendezvous_hashing(fingerprint, cluster_list)
    d = metric.get_redis(host=host)
    exits = d.sismember(task_fingerprint_key, fingerprint)

    if not exits:
        d.sadd(task_key, url)
コード例 #15
0
ファイル: add_proxy.py プロジェクト: heicks/github-crawler
import gevent.monkey
from gevent.pool import Pool
gevent.monkey.patch_all()

import requests
import re
import socket
import time

# Matches a dotted IPv4 address with an optional :port suffix.
IP = re.compile(r'\d{1,3}(?:\.\d{1,3}){3}(?::\d{1,6})?')

import mysettings
import metric
import redis
#r = redis.Redis(host=mysettings.REDIS_SERVER, port=int(mysettings.REDIS_PORT), db=0)
r = metric.get_redis()  # shared redis connection for this module

from utils import retry

@retry(0, 'proxy')
def get_real_ip(proxy=None):
  """Return the externally visible IP address, optionally through *proxy*,
  by scraping a Baidu 'wd=ip' search result page.

  Raises (and lets the @retry decorator handle) a bare Exception on any
  non-200 response or a failed regex match.
  """
  url = 'http://www.baidu.com/s?wd=ip'
  resp = requests.get(url, 
      proxies={'http': 'http://%s'%proxy} if proxy else {}, 
      timeout=3, headers={'user-agent': 'Chrome',})
  if resp.status_code != 200:
    raise Exception  # unusable response; retry via the decorator
  # Scan only past the page's "my ip address" marker so the first match is
  # the reported external IP, not an unrelated address on the page.
  real_ip =  IP.findall(resp.content[resp.content.find('我的ip地址'):])[0]
  print real_ip, proxy

  return real_ip