def main():
    while True:
        origin_proxy_data = deserializate_pickle_object(
            redis_cli.get(_key) or dumps([]))
        while len(origin_proxy_data) < MAX_PROXY_NUM:
            lg.info('Pool already holds proxy_num: {}'.format(len(origin_proxy_data)))
            get_proxy_process_data()
            # Re-read the pool from redis
            origin_proxy_data = deserializate_pickle_object(
                redis_cli.get(_key) or dumps([]))
        else:
            lg.info('Target reached! Sleeping for {}s...'.format(WAIT_TIME))
            sleep(WAIT_TIME)

        check_all_proxy(origin_proxy_data)
def _get_proxies() -> dict:
    '''
    Pick a random high-anonymity proxy (fails with very high probability, be patient!)
    :return:
    '''
    global ori_ip_list

    proxy_list = deserializate_pickle_object(
        redis_cli.get(_h_key) or dumps([]))
    proxies = choice(proxy_list) if len(proxy_list) > 0 else None
    if proxies is not None:
        ip, port = proxies['ip'], proxies['port']
        proxies = {
            'http': 'http://{}:{}'.format(ip, port),
            'https': 'https://{}:{}'.format(ip, port),
        }
        lg.info('Crawling via proxy {} ...'.format(proxies['http']))
    else:
        if ori_ip_list == []:
            # Bootstrap from the start-up ip sources
            for url in start_up_ip_url_list:
                tmp = get_start_up_ip_list(url)
                ori_ip_list += tmp
            if ori_ip_list == []:
                ori_ip_list = _get_66_ip_list()

        if ori_ip_list == []:
            # Nothing to fall back to, so crawl without a proxy
            lg.info('Crawling with the local ip...')
        else:
            ori_ip_list = list(set(ori_ip_list))
            proxies = {
                'http': 'http://{}'.format(choice(ori_ip_list)),
            }
            lg.info('Crawling via proxy {} ...'.format(proxies['http']))

    return proxies or {}  # return {} instead of None
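# A minimal usage sketch for _get_proxies(): the dict it returns plugs straight
# into requests' `proxies` parameter, and an empty dict simply means "crawl with
# the local ip". `fetch_with_random_proxy` and `target_url` are illustrative
# names, not part of the project.
import requests

def fetch_with_random_proxy(target_url: str) -> str:
    proxies = _get_proxies()
    try:
        resp = requests.get(target_url, proxies=proxies, timeout=10)
        return resp.text
    except requests.RequestException:
        # The docstring warns that failures are the common case;
        # callers should simply retry with a fresh proxy.
        return ''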
def _get_all_ip_proxy(self, _k=high_proxy_list_key_name) -> list:
    '''
    Get all ip proxies
    :param _k: the original (un-hashed) key name
    :return:
    '''
    _ = deserializate_pickle_object(
        self.redis_cli.get(name=get_uuid3(_k)) or dumps([]))

    return _
def _write_into_redis(res):
    '''
    Read the stored proxies and append the newly collected ones
    :param res:
    :return:
    '''
    origin_data = redis_cli.get(_key) or dumps([])  # if get returns None, fall back to []
    old = deserializate_pickle_object(origin_data)
    old += res
    redis_cli.set(name=_key, value=dumps(old))

    return True
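# _write_into_redis is a non-atomic read-modify-write: two workers appending at
# the same time can overwrite each other's proxies. A minimal sketch of the same
# append guarded by an optimistic WATCH/MULTI transaction, assuming redis_cli is
# (or wraps) a plain redis-py client exposing pipeline():
import redis

def _write_into_redis_atomic(res):
    with redis_cli.pipeline() as pipe:
        while True:
            try:
                pipe.watch(_key)  # fail the transaction if _key changes underneath us
                old = deserializate_pickle_object(pipe.get(_key) or dumps([]))
                old += res
                pipe.multi()
                pipe.set(name=_key, value=dumps(old))
                pipe.execute()
                return True
            except redis.WatchError:
                continue  # another writer got in first; re-read and retry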
def main():
    global time_str

    while True:
        origin_proxy_data = list_remove_repeat_dict(
            target=deserializate_pickle_object(
                redis_cli.get(_key) or dumps([])),
            repeat_key='ip')
        while len(origin_proxy_data) < MAX_PROXY_NUM:
            print('\r' + _get_simulate_log_info() +
                  'Ip Pools --->>> stored proxy_num (anonymity unknown): {}'.format(
                      len(origin_proxy_data)),
                  end='',
                  flush=True)
            get_proxy_process_data()
            # Re-read the pool from redis
            origin_proxy_data = list_remove_repeat_dict(
                target=deserializate_pickle_object(
                    redis_cli.get(_key) or dumps([])),
                repeat_key='ip')
        else:
            print()
            lg.info('Target reached! Sleeping for {}s...'.format(WAIT_TIME))
            sleep(WAIT_TIME)

        lg.info('Async checking all_proxy (anonymity unknown)...')
        origin_proxy_data = list_remove_repeat_dict(
            target=origin_proxy_data,
            repeat_key='ip')
        check_all_proxy(origin_proxy_data, redis_key_name=_key, delete_score=88)

        # Drop dead entries so the pool always holds fresh, usable high-anonymity proxies
        high_origin_proxy_list = list_remove_repeat_dict(
            target=deserializate_pickle_object(
                redis_cli.get(_h_key) or dumps([])),
            repeat_key='ip')
        lg.info('Async checking high_proxy (high anonymity) status...')
        check_all_proxy(high_origin_proxy_list, redis_key_name=_h_key, delete_score=MIN_SCORE)
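# `list_remove_repeat_dict` comes from fzutils. A minimal sketch of the behavior
# the calls above rely on (dedup a list of dicts by one key, keeping the first
# occurrence), written as an assumption for illustration, not fzutils' actual code:
def list_remove_repeat_dict_sketch(target: list, repeat_key: str) -> list:
    seen, unique = set(), []
    for d in target:
        if d.get(repeat_key) not in seen:
            seen.add(d.get(repeat_key))
            unique.append(d)
    return unique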
def _get_random_ip_proxy(self) -> str:
    '''
    Get one random proxy
    :return: format: 'http://175.6.2.174:8088'
    '''
    _ = deserializate_pickle_object(
        self.redis_cli.get(name=self._k) or dumps([]))
    if _ == []:
        return ''

    random_proxy = choice(_)

    return 'http://{}:{}'.format(
        random_proxy.get('ip'), random_proxy.get('port'))
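# Usage sketch for _get_random_ip_proxy(): the 'http://ip:port' string it
# returns slots straight into requests' proxies mapping. `pool` stands for an
# instance of the class defining the method; `demo_request` and the url are
# illustrative, not part of the project.
import requests

def demo_request(pool, url='https://httpbin.org/ip') -> str:
    proxy = pool._get_random_ip_proxy()
    proxies = {'http': proxy, 'https': proxy} if proxy != '' else {}
    return requests.get(url, proxies=proxies, timeout=10).text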
def _handle_tasks_result_list(**kwargs):
    all_res = kwargs.get('all', [])  # renamed from `all` to avoid shadowing the builtin

    origin_data = redis_cli.get(_key) or dumps([])  # if get returns None, fall back to []
    old = deserializate_pickle_object(origin_data)
    for res_content in all_res:
        if res_content != []:
            old += res_content

    old = list_remove_repeat_dict(target=old, repeat_key='ip')
    old = serialize_obj_item_2_dict(old)  # convert to dict so ProxyItem survives deserialization
    redis_cli.set(name=_key, value=dumps(old))

    return True
def read_celery_tasks_result_info(celery_id_list: list) -> list:
    '''
    Read the results of the given celery tasks
    :param celery_id_list:
    :return:
    '''
    res = []
    for item in celery_id_list:
        # Read the record stored by celery's redis result backend
        _k = 'celery-task-meta-' + str(item.id)
        result = deserializate_pickle_object(
            redis_cli.get(_k) or dumps({}))  # guard against a missing key
        if result.get('status', '') == 'SUCCESS':
            res.append(result.get('result', []))
        else:
            lg.info('Failed to fetch the value for key {}!'.format(_k))

    return res
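# Usage sketch for read_celery_tasks_result_info(). `collect_proxy` and
# `gather_new_proxies` are hypothetical; what matters is that .delay() returns
# AsyncResult objects carrying the .id used above. Unpickling the raw backend
# record only works when the celery result backend is redis and
# result_serializer is 'pickle'.
def gather_new_proxies(url_list: list) -> list:
    async_results = [collect_proxy.delay(url) for url in url_list]
    sleep(10)  # crude wait; a real caller would poll AsyncResult.status instead
    return read_celery_tasks_result_info(async_results)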
def _get_proxies() -> dict:
    '''
    Pick one random proxy
    :return:
    '''
    origin_data = redis_cli.get(_key) or dumps([])
    proxy_list = deserializate_pickle_object(origin_data)
    proxies = choice(proxy_list) if len(proxy_list) > 0 else None
    if proxies is not None:
        proxies = {
            'http': 'http://{}:{}'.format(proxies['ip'], proxies['port']),
        }
        lg.info('Crawling via proxy {} ...'.format(proxies['http']))
    else:
        lg.info('First crawl uses the local ip...')

    return proxies or {}  # return {} instead of None
def write_hign_proxy_info_2_redis(one_proxy_info):
    '''Write a newly verified high-anonymity ip into redis'''
    old_h_proxy_list = deserializate_pickle_object(
        redis_cli.get(name=_h_key) or dumps([]))
    old_ip_list = [i.get('ip') for i in old_h_proxy_list]
    if one_proxy_info.get('ip') not in old_ip_list:
        old_score = one_proxy_info.get('score')
        one_proxy_info.update({
            # Bonus points for passing the high-anonymity check
            'score': old_score + 5,
        })
        old_h_proxy_list.append(one_proxy_info)
        old_h_proxy_list = serialize_obj_item_2_dict(
            old_h_proxy_list)  # convert to dict so ProxyItem survives deserialization
        redis_cli.set(name=_h_key, value=dumps(old_h_proxy_list))
    else:
        pass

    return None
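# The dict shape write_hign_proxy_info_2_redis expects, inferred from the
# .get('ip') / .get('score') calls above. The values are illustrative only
# (the ip:port pair is reused from the _get_random_ip_proxy docstring):
example_proxy_info = {
    'ip': '175.6.2.174',  # used to dedup against the ips already stored
    'port': 8088,
    'score': 95,          # bumped by +5 before being persisted
}
# write_hign_proxy_info_2_redis(example_proxy_info)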
def _get_proxies() -> dict:
    '''
    Pick a random high-anonymity proxy (fails with very high probability, be patient!)
    :return:
    '''
    proxy_list = deserializate_pickle_object(
        redis_cli.get(_h_key) or dumps([]))
    proxies = choice(proxy_list) if len(proxy_list) > 0 else None
    if proxies is not None:
        proxies = {
            'http': 'http://{}:{}'.format(proxies['ip'], proxies['port']),
        }
        lg.info('Crawling via proxy {} ...'.format(proxies['http']))
    else:
        lg.info('First crawl uses the local ip...')
        # Fall back to 66ip's free high-anonymity ips
        # if a_66_ip == []:
        #     _get_66_ip_list()
        # proxies = {
        #     'http': 'http://{}'.format(choice(a_66_ip)),
        # }
        # lg.info('Crawling via proxy {} ...'.format(proxies['http']))

    return proxies or {}  # return {} instead of None
def _get_all_ip_proxy(self) -> list:
    '''Get all ip proxies'''
    _ = deserializate_pickle_object(
        self.redis_cli.get(name=self._k) or dumps([]))

    return _
    else:
        logger.error('get_goods_data returned an empty dict!')
        return None

    return _

if __name__ == '__main__':
    url = 'https://item.taobao.com/item.htm?id=534498954634'
    _r = get_tb_process_data(tb_object=tb, url=url)
    # logger.info(_r.get(timeout=2))
    _r.get(timeout=2)
    print('task id: {0}, status: {1}'.format(_r.id, _r.status))

    # Fetch the result from redis
    pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)
    redis_cli = redis.StrictRedis(connection_pool=pool)
    _k = 'celery-task-meta-' + str(_r)
    # Deserialize the pickled python object stored in redis
    result = deserializate_pickle_object(redis_cli.get(_k))
    if result.get('status', '') == 'SUCCESS':
        result = result.get('result', '{}')
        print(result)
    else:
        print('Fetch failed!')
# coding:utf-8

'''
@author = super_fazai
@File : utils.py
@connect : [email protected]
'''

from fzutils.sql_utils import BaseRedisCli
from fzutils.safe_utils import get_uuid3
from fzutils.data.pickle_utils import deserializate_pickle_object
from fzutils.linux_utils import kill_process_by_name
from fzutils.time_utils import get_shanghai_time
from fzutils.common_utils import get_random_int_number
from fzutils.common_utils import retry
from pprint import pprint
from pickle import dumps
from time import sleep
from random import choice

from settings import high_proxy_list_key_name

# print(get_uuid3('proxy_tasks'))
# print(get_uuid3(high_proxy_list_key_name))

_ = BaseRedisCli()
# Dump the current proxy pool for inspection
pprint(
    deserializate_pickle_object(
        _.get('5e421d78-a394-3b44-aae1-fd86aa127255') or dumps([])))

# Kill the celery workers
kill_process_by_name(process_name='celery')