# --- Code example #1 ---
    def __init__(self):
        """Acquire a Redis connection for this resource instance."""
        factory = RedisConnection(dbinfo=REDIS)
        self.redis_conn = factory.get_conn()
# --- Code example #2 ---
class Resource(object):
    """Harvests Kuaishou cookies through pooled proxy IPs.

    Proxy IPs are rotated out of a Redis list; each one drives a headless
    Chrome session that collects cookies from live.kuaishou.com and pushes
    them back into Redis for downstream consumers.
    """

    def __init__(self):
        # Redis holds both the proxy-IP pool and the harvested cookies.
        self.redis_conn = RedisConnection(dbinfo=REDIS).get_conn()

    def get_ip(self):
        """Rotate 3 proxy IPs out of the Redis pool and return them.

        Returns:
            list[str] of "host:port" strings, or None when the pool is
            empty or Redis fails (after a 30s back-off so callers don't spin).
        """
        try:
            if self.redis_conn.llen(HTTPS_PROXY_IP_POOL) > 0:
                # rpoplpush onto the same key rotates the list, so the
                # proxies are reused round-robin rather than consumed.
                return [
                    self.redis_conn.rpoplpush(
                        HTTPS_PROXY_IP_POOL,
                        HTTPS_PROXY_IP_POOL).decode()
                    for _ in range(3)
                ]
            logger.warning('获取Cookie时请求的IP池为空...')
            time.sleep(30)
        except Exception:  # was a bare except: don't mask SystemExit etc.
            logger.error('获取Cookie时请求IP池发生错误...')
            time.sleep(30)
        return None

    def verify_ip(self):
        """Check that a pooled proxy can fetch baidu.com.

        Returns:
            The working "host:port" proxy string, or False.
        """
        proxies = self.get_ip()
        if not proxies:
            return False
        # Bug fix: get_ip() returns a *list*; the original passed the whole
        # list to requests, which expects one proxy URL string per scheme.
        proxy = proxies[0]
        real_proxy = {'http': proxy, 'https': proxy}
        res = requests.get('http://www.baidu.com/', proxies=real_proxy)
        return proxy if res.status_code == 200 else False

    def get_webpage(self):
        """For each pooled proxy, open headless Chrome, harvest cookies
        from the Kuaishou home page, push them to Redis and warm the
        session by browsing a random profile page.
        """
        pros = self.get_ip()
        if not pros:
            return

        for proxy in pros:
            options = webdriver.ChromeOptions()
            options.binary_location = '/usr/bin/google-chrome-stable'
            # Headless-Linux flags
            options.add_argument('--headless')
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-extensions')
            options.add_argument('--disable-gpu')

            logger.info('使用IP %s ...' % (proxy))
            # NOTE: --proxy-server must not contain spaces around '='.
            options.add_argument("--proxy-server=http://%s" % (proxy))

            driver = webdriver.Chrome(
                executable_path=
                '/home/seeta/zhangyanchao/chromedriver_install/chromedriver',
                chrome_options=options)
            try:
                driver.get('https://live.kuaishou.com/')
                cookies = driver.get_cookies()
                logger.info(cookies)
                if '未连接到互联网' in driver.page_source or not cookies:
                    # Move on to the next pooled proxy; deliberately do NOT
                    # recurse into get_webpage() (risk of infinite loop).
                    logger.warning('访问快手首页出现问题, 重新请求新的IP并访问...')
                    continue

                cookie = {res['name']: res['value'] for res in cookies}
                # Stored in repr() form; downstream readers parse it back
                # with replace("'", '"') + json.loads.
                cookie_d = "{'cookie':" + str(cookie) + '}'
                logger.info(cookie_d)
                self.redis_conn.lpush('ks_cookies', cookie_d)

                people_li = [
                    'dagaoge666', 'meishi123458888', 'wangzulan',
                    '3xgghwn46skhkxa', 'sanda927', 'hs1590ai', 'xiaoyiyi',
                    '3xjb64qxiwbv2dm', 'huangbo666', 'Sanmei1997'
                ]
                driver.get('https://live.kuaishou.com/profile/%s' %
                           (random.choice(people_li)))
                logger.info('获取首页成功, 刷新页面...')
                for _ in range(2):
                    driver.refresh()
                time.sleep(5)
                logger.info('退出浏览器 ...')
            finally:
                # Always release the browser; the original leaked it when
                # a Selenium call raised mid-flight.
                driver.quit()

    def scan_verify_cookie_pool(self, cookie_pool):
        """Loop forever, refilling the cookie pool via get_webpage()
        whenever it drops below FILL_RESOURCE_THRESHOLD."""
        logger.info("%s代理池资源总数:%s" %
                    (cookie_pool, self.redis_conn.llen(cookie_pool)))
        while True:
            try:
                if self.redis_conn.llen(cookie_pool) < FILL_RESOURCE_THRESHOLD:
                    self.get_webpage()
                time.sleep(7)
                logger.info('检测代理池时间...')
            except Exception:  # was a bare except
                logger.error('获取页面时程序发生错误 ...')
                time.sleep(10)
# --- Code example #3 ---
class Resource(object):
    """Keeps the Redis HTTPS proxy-IP pool topped up from a vendor API."""

    def __init__(self):
        # Redis holds the proxy-IP pool this class maintains.
        self.redis_conn = RedisConnection(dbinfo=REDIS).get_conn()

    def request_proxy_ip(self, url, number):
        """Ask the proxy vendor for fresh proxies.

        Args:
            url: vendor API endpoint; the count parameter is appended here.
            number: how many proxies to request.

        Returns:
            The vendor's 'data' list on success, otherwise None.
        """
        # The query-parameter name differs per vendor; '&num=' targets
        # the currently-used vendor (芝麻代理).
        request_url = url + '&num=' + str(number)
        try:
            response = requests.get(request_url, headers=base_headers)
            if response.status_code == 200:
                result = json.loads(response.text)
                logger.info(result)
                if result.get('data'):
                    logger.info('%s 成功获取%s个代理' %
                                (time.asctime(), len(result['data'])))
                    return result['data']
                logger.error('%s 获取%s个代理失败,URL: %s' %
                             (time.asctime(), number, request_url))
        except Exception as e:
            logger.error('%s 获取%s个代理失败,URL: %s' %
                         (time.asctime(), number, request_url))
            logger.error('错误:%s' % e)
        return None

    def load_proxy_pool(self, protocol, proxies):
        """Push each proxy dict as an "ip:port" string onto the HTTPS pool.

        Only protocol == 'https' is supported; any other value is a no-op,
        matching the original behavior. The protocol check is hoisted out
        of the loop since it is loop-invariant.
        """
        if protocol != 'https':
            return
        for proxy in proxies:
            proxy_item = '%s:%s' % (proxy['ip'], proxy['port'])
            self.redis_conn.lpush(HTTPS_PROXY_IP_POOL, proxy_item)
            logger.info("Added new proxy %s:%s into proxy pool: %s" %
                        (proxy['ip'], proxy['port'], HTTPS_PROXY_IP_POOL))

    def get_new_proxy(self, number=PROXY_REQUEST_CHUNK):
        """Fetch `number` new proxies from the vendor and load them
        into the Redis pool."""
        logger.info("request new proxy IP")
        proxies = self.request_proxy_ip(PROXY_REQUEST_URL, number)
        if proxies:
            logger.info("load new proxy IP")
            self.load_proxy_pool('https', proxies)

    def scan_verify_proxy_pool(self, proxy_pool):
        """Loop forever, refilling `proxy_pool` whenever it drops below
        FILL_RESOURCE_THRESHOLD."""
        logger.info("%s代理池资源总数:%s" %
                    (proxy_pool, self.redis_conn.llen(proxy_pool)))
        while True:
            try:
                if self.redis_conn.llen(proxy_pool) < FILL_RESOURCE_THRESHOLD:
                    self.get_new_proxy()
                time.sleep(10)
                logger.info('检测代理池时间...')
            except Exception:
                # Was a bare except that swallowed errors silently; at
                # minimum record what happened before backing off.
                logger.error('扫描代理池时发生错误')
                time.sleep(30)

    def get_resource_pool_size(self):
        """Return the current length of the BXS resource pool list."""
        return self.redis_conn.llen(BXS_RESOURCE_POOL)
# --- Code example #4 ---
class Resource(object):
    """Re-validates harvested Kuaishou cookies through proxied headless
    Chrome sessions, sorting them into valid / invalid Redis pools.
    """

    def __init__(self):
        # Redis holds the proxy-IP pool and both cookie pools.
        self.redis_conn = RedisConnection(dbinfo=REDIS).get_conn()

    def get_ip(self):
        """Rotate one proxy IP out of the Redis pool.

        Returns:
            list[str] with a single "host:port" entry, or None when the
            pool is empty or Redis fails (after a 30s back-off).
        """
        try:
            if self.redis_conn.llen(HTTPS_PROXY_IP_POOL) > 0:
                # rpoplpush onto the same key rotates instead of consuming.
                return [
                    self.redis_conn.rpoplpush(
                        HTTPS_PROXY_IP_POOL,
                        HTTPS_PROXY_IP_POOL).decode()
                    for _ in range(1)
                ]
            logging.warning('获取Cookie时请求的IP池为空...')
            time.sleep(30)
        except Exception:  # was a bare except
            logging.warning('获取Cookie时请求IP池发生错误...')
            time.sleep(30)
        return None

    def verify_ip(self):
        """Check that a pooled proxy can fetch baidu.com.

        Returns:
            The working "host:port" proxy string, or False.
        """
        proxies = self.get_ip()
        if not proxies:
            return False
        # Bug fix: get_ip() returns a *list*; requests expects one proxy
        # URL per scheme, so use the single rotated entry.
        proxy = proxies[0]
        real_proxy = {'http': proxy, 'https': proxy}
        res = requests.get('http://www.baidu.com/', proxies=real_proxy)
        return proxy if res.status_code == 200 else False

    def _discard_cookie(self, cookiess, reason):
        """Log `reason` and push the cookie set into the invalid pool."""
        logging.warning(reason)
        logging.warning('把 Cookie 重新放入 invalid_cookies 池中...')
        # Bug fix: always store the str() form; the original pushed the
        # raw dict in one branch, which redis-py cannot serialize.
        self.redis_conn.lpush(INVALID_KS_COOKIE_POOL, str(cookiess))

    def get_webpage(self, cookiess):
        """Replay `cookiess` through a proxied headless-Chrome session.

        Valid cookie sets are pushed back to HTTPS_PROXY_COOKIE_POOL;
        broken ones (or any Selenium failure) go to INVALID_KS_COOKIE_POOL.

        Args:
            cookiess: dict with a 'cookie' key mapping cookie names to
                values (as produced by the harvesting example).
        """
        pros = self.get_ip()
        if not pros:
            return

        for proxy in pros:
            options = webdriver.ChromeOptions()
            options.binary_location = '/usr/bin/google-chrome-stable'
            # Headless-Linux flags
            options.add_argument('--headless')
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-extensions')
            options.add_argument('--disable-gpu')
            # NOTE: --proxy-server must not contain spaces around '='.
            options.add_argument("--proxy-server=http://%s" % (proxy))

            driver = webdriver.Chrome(
                executable_path=
                '/home/seeta/zhangyanchao/chromedriver_install/chromedriver',
                chrome_options=options)
            try:
                # Some page must be loaded first so Selenium knows which
                # site the cookies belong to.
                driver.get("http://httpbin.org/ip")
                if '未连接到互联网' in driver.page_source:
                    self._discard_cookie(cookiess,
                                         '访问快手首页出现问题, 关闭浏览器...')
                    continue

                expires = str(self.get_time())
                for k, v in cookiess['cookie'].items():
                    driver.add_cookie({
                        'domain': '.kuaishou.com',
                        'name': k,
                        'value': v,
                        'path': '/',
                        'expires': expires
                    })

                people_li = [
                    'dagaoge666', 'TS-J0315J', 'meishi123458888',
                    'lol1314666', 'travelers', 'dingdang660',
                    'xue66666', 'wangzulan', '3xgghwn46skhkxa',
                    'sanda927', 'hs1590ai', 'xiaoyiyi',
                    '3xjb64qxiwbv2dm', 'huangbo666', 'Sanmei1997'
                ]
                driver.get('https://live.kuaishou.com/profile/%s' %
                           (random.choice(people_li)))

                if '未连接到互联网' in driver.page_source:
                    self._discard_cookie(cookiess,
                                         '访问快手首页出现问题, 关闭浏览器...')
                    continue

                for _ in range(2):
                    time.sleep(0.3)
                    driver.refresh()

                # Cookie survived the replay: return it to the live pool.
                logging.info('使用 %s 刷新成功, 放入 正常Cookies池中...' %
                             (cookiess))
                self.redis_conn.lpush(HTTPS_PROXY_COOKIE_POOL,
                                      str(cookiess))
                time.sleep(4)
            except Exception as e:
                self._discard_cookie(cookiess, '赋值 Cookie 时发生错误 %s' % (e))
            finally:
                # Always release the browser; the original leaked it when
                # driver.get() itself raised before the inner try block.
                driver.quit()

    def scan_verify_cookie_pool(self, cookie_pool):
        """Loop forever, popping cookie sets from `cookie_pool` and
        re-validating each one via get_webpage()."""
        logger.info("%s代理池资源总数:%s" %
                    (cookie_pool, self.redis_conn.llen(cookie_pool)))
        while True:
            try:
                if self.redis_conn.llen(cookie_pool) > 0:
                    # Entries are stored via str(dict); convert the repr's
                    # single quotes so json.loads can parse it back.
                    cookiess = json.loads(
                        self.redis_conn.rpop(cookie_pool).decode().replace(
                            "'", '"'))
                    self.get_webpage(cookiess)
                else:
                    time.sleep(3)
                    logging.warning('检测代理池时间...')
            except Exception as e:
                logging.warning('扫描 Cookie 池时发生错误 %s' % (e))
                time.sleep(3)

    def get_time(self):
        """Return the POSIX timestamp (float, sub-second precision) for
        365 days from now, used as the cookie expiry.

        Equivalent to the original strptime/mktime/microsecond round-trip,
        computed directly via datetime.timestamp().
        """
        expiry = datetime.datetime.now() + datetime.timedelta(days=365)
        return expiry.timestamp()