def do_multi_processing_logic(spider_name, save_type, process_seq, img_path,
                              item_data_path):
    """Worker-process loop: drain scraped items from the spider's redis
    pipeline queue and append them to a per-process JSON-lines file.

    Args:
        spider_name: spider whose pipeline key (PIPELINE_KEY) is drained.
        save_type: output backend; only 'file' is supported.
        process_seq: index of this worker process, used in the file name
            so concurrent processes never share an output file.
        img_path: directory passed through to download_items() for images.
        item_data_path: directory where the .json output file is written.

    Runs forever; the process is expected to be terminated externally.
    """
    import json  # function-local: the file's top-level import block is elsewhere

    redis_srv_par = get_redis_svr_par()
    redis_srv = redis.StrictRedis(**redis_srv_par)
    redis_key = PIPELINE_KEY % {"spider": spider_name}
    logger.info(
        "[process id %d] start to download items in '%s' (server: %s, port: %s)",
        process_seq, redis_key,
        redis_srv_par.get('host') or 'localhost',
        redis_srv_par.get('port') or '6379')

    if save_type != 'file':
        # Unsupported backend: log (instead of the old bare print) and stop.
        logger.error("parameter(%s) error", save_type)
        return

    # Output file: <item_data_path>/<spider_name>_<process_seq>.json
    item_data_file = "%s/%s_%s.json" % (item_data_path, spider_name,
                                        str(process_seq))

    # `with` guarantees the handle is closed even if download_items()
    # raises; the old explicit close() sat unreachable after `while True`,
    # and the handle was named `file`, shadowing the builtin.
    with codecs.open(item_data_file, "a+", encoding="utf-8") as out:
        while True:
            items = download_items(redis_srv, redis_key, img_path)
            for item in items:
                # json.dumps escapes quotes, backslashes and control
                # characters correctly; the previous hand-rolled writer
                # only escaped double quotes, so any value containing a
                # backslash or newline produced invalid JSON, and None
                # was serialized as the string "None".
                out.write(json.dumps(item, ensure_ascii=False))
                out.write("\n")
            out.flush()
def update_ip_pool_simple(spider_name, link, logger):
    """Continuously mirror the shared proxy pool into this spider's pool.

    Every 60 seconds:
      * read the spider's own pool (members look like "ip:port||user-agent"),
      * read the shared pool "ipPool" from redis db=4 (plain "ip:port"),
      * remove spider-pool records whose ip left the shared pool,
      * add newly appeared ips, binding each to a random user-agent.

    Args:
        spider_name: fills in IP_POOL_KEY for the spider's own pool key.
        link: unused here; kept for signature parity with
            update_ip_pool_logic().
        logger: destination for diagnostics (replaces the old bare prints).

    Never returns.
    """
    # Writable side: the spider's own pool.
    redis_srv_par = get_redis_svr_par()
    logger.info("spider pool redis params: %s", redis_srv_par)
    redis_srv_wr = redis.StrictRedis(**redis_srv_par)
    ip_pool_key_wr = IP_POOL_KEY % {'name': spider_name}

    # Read-only side: the shared "ipPool" set lives in db=4.
    redis_srv_par = get_redis_svr_par()
    redis_srv_par['db'] = 4
    logger.info("shared pool redis params: %s", redis_srv_par)
    redis_srv_rd = redis.StrictRedis(**redis_srv_par)
    ip_pool_key_rd = "ipPool"

    while True:
        # Map bare "ip:port" -> full "ip:port||ua" record so stale
        # records can be SREM'd by their exact member value.
        indexs = {}
        for ip_port_ua in redis_srv_wr.smembers(ip_pool_key_wr):
            indexs[ip_port_ua.split("||")[0]] = ip_port_ua
        ip_pool_wr = set(indexs)

        ip_pool_rd = redis_srv_rd.smembers(ip_pool_key_rd)
        logger.info("shared pool members: %s", ip_pool_rd)

        # a - (a & b) == a - b: ips that dropped out of the shared pool.
        for stale_ip in ip_pool_wr - ip_pool_rd:
            redis_srv_wr.srem(ip_pool_key_wr, indexs[stale_ip])
        # b - (a & b) == b - a: new ips; attach a random user-agent.
        for ip in ip_pool_rd - ip_pool_wr:
            ua = random.choice(ua_pool) or ""
            redis_srv_wr.sadd(ip_pool_key_wr, "%s||%s" % (ip, ua))
        time.sleep(60)
# NOTE(review): removed non-Python artifact lines ("Esempio n. 3" / "0")
# left behind by code extraction; they made the file a SyntaxError.
def update_ip_pool_logic(spider_name, link, logger):
    """Build and maintain the spider's proxy-ip pool in redis.

    Loop:
      1. Fetch a batch of candidate "ip:port" proxies from the vendor API.
      2. Once at least 20 distinct candidates have accumulated, merge them
         with the current redis pool plus a small set of high-probability
         ips, and probe every merged ip by curling `link` through it
         (3-second timeout).
      3. Remove pool records whose ip failed the probe; add newly passing
         ips. Pool members look like "ip:port||user-agent" — the ua is
         bound to the ip so one ip does not rotate through many uas.
      4. Sleep 30 seconds and repeat.

    Args:
        spider_name: fills in IP_POOL_KEY for the spider's pool key.
        link: URL used by curl to verify each proxy actually works.
        logger: destination for progress/diagnostic messages.

    Never returns normally; exits only on an unexpected exception.
    """
    # Connect redis and resolve this spider's pool key.
    redis_srv_par = get_redis_svr_par()
    redis_srv = redis.StrictRedis(**redis_srv_par)
    ip_pool_key = IP_POOL_KEY % {'name': spider_name}

    # Ips known to succeed with high probability; re-probed every round.
    high_prob_ips = set(["211.141.64.50:80", "115.29.2.139:80"])

    cnt = 0        # completed probe rounds
    ips = set()    # candidates accumulated across vendor fetches
    stime = str(datetime.datetime.now())
    st_s = time.time()  # round start time, seconds
    while True:
        try:
            # Vendor API; the ports list is the provider's usable port set.
            url = ("http://ttvp.daxiangip.com/ip/?tid=559436287116377&delay=1"
                   "&category=2&foreign=none&ports=8080,3128,80,808,1080,8081,"
                   "8998,8123,8888,9999,8118,9797,3129,87,8083,8090,8799,8000,"
                   "9000,81,8088&num=20")

            req = urllib2.Request(url)
            res_data = urllib2.urlopen(req)
            ip_ports = res_data.read().split("\r\n")
            logger.info('%s|fetching raw (every turn) ip number: %d',
                        spider_name, len(ip_ports))
            for ip_port in ip_ports:
                # Sanity check: keep only "host:port" shaped entries.
                if len(ip_port.split(":")) != 2:
                    logger.error('error ip: %s', ip_port)
                    continue
                ips.add(ip_port)
            logger.info('%s|fetching true (cumulative turn) ip number: %d',
                        spider_name, len(ips))
            if len(ips) >= 20:
                # Current pool, indexed bare-ip -> full record so a failing
                # ip's exact "ip:port||ua" member can be SREM'd.
                indexs = {}
                for ip_port_ua in redis_srv.smembers(ip_pool_key):
                    indexs[ip_port_ua.split("||")[0]] = ip_port_ua
                ip_pool = set(indexs)
                m_ips = ip_pool | ips | high_prob_ips

                # Probe bookkeeping: each entry ends up 0 (failed) or 1 (ok).
                ip_pass_cnt = dict((ip_port, 0) for ip_port in m_ips)
                for seq, ip_port in enumerate(m_ips):
                    # curl emits only the HTTP status code; the 3s connect
                    # and total timeouts keep a dead proxy from stalling us.
                    cmd = ("curl --connect-timeout 3 -m 3 -o /dev/null -s "
                           "-w %{http_code} -x " + ip_port + " " + link)
                    logger.info(cmd)
                    ret_code = os.popen(cmd).read()
                    logger.info("%s ---- %d", ret_code, seq)
                    if ret_code != '200':
                        continue
                    ip_pass_cnt[ip_port] += 1

                # Float so the proportion below is exact under py2 division.
                success_num = 0.0
                for k, v in ip_pass_cnt.items():
                    logger.info("%s\t%d" % (k, v))
                    if v == 0:
                        if k in ip_pool:
                            # Drop the dead ip's full record from redis.
                            redis_srv.srem(ip_pool_key, indexs[k])
                        continue
                    success_num += 1
                    if k not in ip_pool:
                        # New working ip: bind a random user-agent and store.
                        ua = random.choice(ua_pool) or ""
                        redis_srv.sadd(ip_pool_key, "%s||%s" % (k, ua))

                logger.info(
                    "request num( %s ), success num(%d), proportion(%f)" %
                    (len(ip_pass_cnt), success_num,
                     success_num / len(ip_pass_cnt)))
                logger.info(
                    "request start time(%s), end time(%s), cost time(%s)" %
                    (stime, str(
                        datetime.datetime.now()), str(time.time() - st_s)))

                logger.info('===========================> round: %d', cnt)
                time.sleep(30)
                cnt += 1
                # Reset the per-round accumulators.
                ips = set()
                stime = str(datetime.datetime.now())
                st_s = time.time()
        except redis.exceptions.ConnectionError:
            # Transient redis failure: reconnect and keep looping.
            redis_srv = redis.StrictRedis(**redis_srv_par)
            logger.error('redis connection error, restart redis')
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are no longer swallowed; any other error ends the loop with a
            # full traceback in the log.
            logger.error(traceback.format_exc())
            logger.error('protection, cnt: %d' % cnt)
            break