def get_session(name, password, need_verify):
    url, yundama_obj, cid, session = do_login(name, password, need_verify)

    # retry while the captcha recognition service reports an error
    while url == 'pinerror' and yundama_obj is not None:
        yundama_obj.report_error(cid)
        url, yundama_obj, cid, session = do_login(name, password, need_verify)

    if url != '':
        rs_cont = session.get(url, headers=headers)
        login_info = rs_cont.text

        u_pattern = r'"uniqueid":"(.*)",'
        m = re.search(u_pattern, login_info)
        if m and m.group(1):
            # Visit an arbitrary page to check whether the account works. This check
            # depends on external conditions, but no better way was found (a problematic
            # account can sometimes still visit its own homepage, so checking the
            # account's own homepage is not a reliable validity test).
            check_url = 'http://weibo.com/p/1005051764222885/info?mod=pedit_more'
            resp = session.get(check_url, headers=headers)

            # experiments show that accounts without phone verification cannot be recovered once frozen...
            if is_403(resp.text):
                other.error('account {} has been frozen'.format(name))
                crawler.warning('account {} has been frozen'.format(name))
                freeze_account(name, 0)
                return None
            other.info('the login account is {}'.format(name))
            Cookies.store_cookies(name, session.cookies.get_dict())
            return session

    other.error('login failed for account {}'.format(name))
    return None
def get_session(name, password):
    proxy = getip.getIP("")
    url, yundama_obj, cid, session = do_login(name, password, proxy)

    if url != '':
        rs_cont = session.get(url, headers=headers, proxies=proxy)
        login_info = rs_cont.text

        u_pattern = r'"uniqueid":"(.*)",'
        m = re.search(u_pattern, login_info)
        if m and m.group(1):
            # check if account is valid
            check_url = 'http://weibo.com/2671109275/about'
            resp = session.get(check_url, headers=headers, proxies=proxy)

            if is_403(resp.text):
                other.error('account {} has been forbidden'.format(name))
                LoginInfoOper.freeze_account(name, 0)
                return None
            other.info('Login successful! The login account is {}'.format(name))
            Cookies.store_cookies(name, session.cookies.get_dict(), proxy['http'])
            return session

    other.error('login failed for {}'.format(name))
    return None
def excute_login_task():
    infos = login_info.get_login_info()
    # clear all stacked login tasks before each login round
    Cookies.check_login_task()
    log.crawler.info('The login task is starting...')
    for info in infos:
        app.send_task('tasks.login.login_task', args=(info.name, info.password),
                      queue='login_queue', routing_key='for_login')
        time.sleep(10)
def excute_login_task():
    infos = login_info.get_login_info()
    # clear all stacked login celery_tasks before each login round
    Cookies.check_login_task()
    log.crawler.info('The excute_login_task is starting...')
    for info in infos:
        celery.send_task('celery_tasks.weibo.login.login_task',
                         args=(info.name, info.password, info.source),
                         queue='login_task', routing_key='login_task')
        time.sleep(10)
def execute_login_task():
    infos = LoginInfoOper.get_login_info()
    # clear all stacked login tasks before each login round
    Cookies.check_login_task()
    crawler.info('The login task is starting...')
    if len(infos) <= 0:
        crawler.info('no login accounts available')
    for info in infos:
        # app.send_task('tasks.login.login_task', args=(info.name, info.password),
        #               queue='login_queue', routing_key='for_login')
        login_task(info.name, info.password)
        time.sleep(10)
def execute_login_task():
    # fetch all weibo accounts that need to log in
    infos = LoginInfoOper.get_login_info()
    # clear all stacked login tasks before each login round
    Cookies.check_login_task()
    crawler.info('The login task is starting...')
    for info in infos:
        # send args to the named task to start it; the `queue` argument routes the
        # message through that queue, and the routing key is given by `routing_key`
        app.send_task('tasks.login.login_task', args=(info.name, info.password),
                      queue='login_queue', routing_key='for_login')
        time.sleep(10)
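# For reference, a minimal sketch of the worker-side Celery app and task that the
# dispatchers above send to. The module path 'tasks.login', the broker URL and the
# delegation to get_session() are assumptions for illustration, not the project's
# confirmed layout.
from celery import Celery

app = Celery('weibo_login', broker='redis://localhost:6379/5')  # assumed broker URL

@app.task(name='tasks.login.login_task')
def login_task(name, password):
    # delegate to the session/login helper defined elsewhere in this module
    get_session(name, password)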
def test_delete_cookies(self):
    """ delete according to key """
    from db.redis_db import Cookies
    r = Cookies.delete_cookies('18708103033')
    self.assertEqual(r, True)
def get_page(url, need_login=True):
    """
    :param url: url to be crawled
    :param need_login: True if the url requires login, else False
    :return: return '' if an exception happens or status_code != 200
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in cookies pool, please find out the reason')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)
        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # slow down to avoid being banned
            time.sleep(interal)

        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exceptions happen when crawling {}, specific info: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max tries for {}, check the url in redis db2'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
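# A plausible sketch of the send_email() alarm used in get_page() above, built on
# the standard smtplib; the SMTP host and addresses are placeholders, not the
# project's real configuration.
import smtplib
from email.mime.text import MIMEText

def send_email():
    msg = MIMEText('The cookie pool is empty and the crawler is about to stop.', 'plain', 'utf-8')
    msg['Subject'] = 'weibo crawler alarm'
    msg['From'] = 'crawler@example.com'  # placeholder sender
    msg['To'] = 'admin@example.com'      # placeholder receiver
    with smtplib.SMTP('smtp.example.com', 25) as server:  # placeholder SMTP host
        server.send_message(msg)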
def get_session(name, password):
    url, yundama_obj, cid, session = do_login(name, password)

    if url != '':
        rs_cont = session.get(url, headers=headers)
        login_info = rs_cont.text

        u_pattern = r'"uniqueid":"(.*)",'
        m = re.search(u_pattern, login_info)
        if m and m.group(1):
            # visit weibo's official account page to check the account works
            check_url = 'http://weibo.com/2671109275/about'
            resp = session.get(check_url, headers=headers)

            # experiments show that accounts without phone verification cannot be recovered once frozen...
            if is_403(resp.text):
                other.error('account {} has been frozen'.format(name))
                freeze_account(name, 0)
                return None
            other.info('the login account is {}'.format(name))
            Cookies.store_cookies(name, session.cookies.get_dict())
            return session

    other.error('login failed for account {}'.format(name))
    return None
def login(self):
    user_name = input('please input the login name: ')
    password = input('please input the password: ')

    # decide the login method by the account format
    if re.match(r'\d{4}-\d{7}|1[34578]\d{9}', user_name):
        print('current login method: phone number')
        post_url = 'https://www.zhihu.com/login/phone_num'
        post_data = {
            'phone_num': user_name,
            'password': password,
            '_xsrf': self.getXsrf()
        }
    else:
        print('current login method: email')
        post_url = 'https://www.zhihu.com/login/email'
        post_data = {
            'email': user_name,
            'password': password,
            '_xsrf': self.getXsrf()
        }
    try:
        login_page = self.session.post(post_url, headers=headers, data=post_data)
        print(login_page.status_code)
        print('login without a captcha succeeded!')
    except Exception:
        print('login without a captcha failed!')
        post_data['captcha'] = self.getCaptcha()
        login_page = self.session.post(post_url, headers=headers, data=post_data)

    Cookies.store_cookies(user_name, self.session.cookies.get_dict())
    # the response is JSON; json.loads is safer than eval here
    login_code = json.loads(login_page.text)
    print(login_code['msg'])
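# A plausible sketch of the getXsrf() helper called in login() above: fetch the
# zhihu homepage and pull the _xsrf token out of the hidden form field. The exact
# markup is an assumption based on how such tokens are usually embedded, not the
# project's confirmed implementation.
def getXsrf(self):
    index_page = self.session.get('https://www.zhihu.com', headers=headers)
    match = re.search(r'name="_xsrf" value="([^"]+)"', index_page.text)
    return match.group(1) if match else ''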
def excute_login_task():
    infos = login_info.get_login_info()
    log.crawler.info('the login round is starting')
    for info in infos:
        try:
            rs = Cookies.check_login_task(info.name)
        except KeyError:
            log.crawler.warning('please check that the worker is started and login_queue is specified')
        else:
            if not rs:
                app.send_task('tasks.login.login_task',
                              args=(info.name, info.password, info.need_verify),
                              queue='login_queue', routing_key='for_login')
def get_page(url, user_verify=True):
    """
    :param url: url to crawl
    :param user_verify: True for pages that may show a captcha (requests for weibo or user info),
                        False for the ajax urls used to crawl reposts (they never show a captcha)
    :return: the response text; return '' on 404, 403 or any other exception
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:
        # switch to cookies different from the last ones on every retry
        name_cookies = Cookies.fetch_cookies()

        if name_cookies is None:
            crawler.error('no cookie in the cookies pool, please check the accounts and the login task. The crawler exits.')
            os._exit(0)

        if name_cookies == latest_name_cookies:
            continue

        latest_name_cookies = name_cookies

        try:
            resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)
            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after each fetch to lower the risk of being banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or is_403(page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('{url} seems to be 404'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exceptions happen when crawling {}, specific info: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max retries reached for {}, check the url in the redis failure queue'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
def test_store_and_fetch_cookies(self):
    assert Cookies.fetch_cookies() is None
    Cookies.store_cookies(FAKE_STR, FAKE_STR)
    assert Cookies.fetch_cookies() is not None
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to be crawled
    :param user_verify: if it's an ajax url, the value is False, else True
    :param need_login: True if the url requires login, else False
    :return: return '' if an exception happens or status_code != 200
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in cookies pool, please find out the reason')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)
        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} has been banned'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # slow down to avoid being banned
            time.sleep(interal)

            if user_verify:
                if is_banned(resp.url) or is_403(page):
                    crawler.warning('account {} has been banned'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if 'verifybmobile' in resp.url:
                    crawler.warning('account {} has been locked, you should use your phone to unlock it'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], -1)
                    Cookies.delete_cookies(name_cookies[0])
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('{url} seems to be 404'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exceptions happen when crawling {}, specific info: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max tries for {}, check the url in redis db2'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
def send_personal_message(target_uid, adver_message, user_verify=True, need_login=True):
    """
    :param target_uid: uid that receives the direct message
    :param adver_message: message text to send
    :param user_verify: if it's an ajax url, the value is False, else True
    :param need_login: True if the request requires login, else False
    :return: return '' if an exception happens or status_code != 200
    """
    crawler.info('the send_personal_message uid is {uid}'.format(uid=str(target_uid)))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in cookies pool, please find out the reason')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

            # check adver_timers before sending with this account
            if int(name_cookies[3]) >= int(adver_timers):
                continue
        try:
            if need_login:
                resp = requests.post('http://api.weibo.com/webim/2/direct_messages/new.json?source=' + str(name_cookies[2]),
                                     data={'text': adver_message, 'uid': str(target_uid)},
                                     cookies=name_cookies[1], headers=personal_message_headers)

                if "error" in resp.text:
                    crawler.warning('account {} has been banned, resp.text is: {}'.format(name_cookies[0], resp.text))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
                else:
                    # update adver_times
                    Cookies.store_cookies(name_cookies[0], name_cookies[1], name_cookies[2], 1)
                    return None
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exceptions happen when send_personal_message {}, specific info: {}'.format(target_uid, e))
            count += 1
            time.sleep(excp_interal)
        else:
            return None

    crawler.warning('max tries for {}, check the target_uid in redis db2'.format(target_uid))
    return ''
def test_del_cookies(self):
    Cookies.delete_cookies(FAKE_STR)
    assert Cookies.fetch_cookies() is None
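# A minimal sketch of the Cookies operations exercised by the tests above,
# assuming a redis hash keyed by account name; the db index and hash key are
# placeholders, not the project's confirmed schema.
import json
import random
import redis

cookies_con = redis.StrictRedis(host='localhost', port=6379, db=1)  # assumed db

class Cookies(object):
    @classmethod
    def store_cookies(cls, name, cookies):
        cookies_con.hset('account', name, json.dumps(cookies))

    @classmethod
    def fetch_cookies(cls):
        accounts = cookies_con.hgetall('account')
        if not accounts:
            return None
        name = random.choice(list(accounts.keys()))
        return name.decode(), json.loads(accounts[name])

    @classmethod
    def delete_cookies(cls, name):
        cookies_con.hdel('account', name)
        return True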
def get_page(url, auth_level=2, is_ajax=False, need_proxy=False):
    """
    :param url: url to crawl
    :param auth_level: 0 stands for nothing needed, 1 stands for no login but cookies needed, 2 stands for login needed
    :param is_ajax: whether the request is ajax
    :param need_proxy: whether the request needs an http/https proxy
    :return: response text; returns '' when an exception is raised
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < MAX_RETRIES:
        if auth_level == 2:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('No cookie in cookies pool. Maybe all accounts are banned, or all cookies are expired')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

            # There is no difference between http and https addresses.
            proxy = {'http': name_cookies[2], 'https': name_cookies[2]}
        else:
            proxy = getip.getIPWithoutLogin('')
            # if proxy['http'] is None:
            #     crawler.warning('No available ip in ip pools. Using local ip instead.')

        try:
            if auth_level == 2:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=TIME_OUT, verify=False,
                                    proxies=proxy)
            elif auth_level == 1:
                resp = requests.get(url, headers=headers, cookies=COOKIES, timeout=TIME_OUT, verify=False,
                                    proxies=proxy)
            else:
                resp = requests.get(url, headers=headers, timeout=TIME_OUT, verify=False, proxies=proxy)
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('Exceptions are raised when crawling {}. Here are details: {}'.format(url, e))
            count += 1
            time.sleep(EXCP_INTERAL)
            continue

        if resp.status_code == 414:
            crawler.warning('This ip has been blocked by weibo system')
            if not need_proxy:
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

        if resp.text:
            page = resp.text.encode('utf-8', 'ignore').decode('utf-8')
        else:
            count += 1
            continue

        if auth_level == 2:
            # slow down to avoid being banned
            time.sleep(INTERAL)

            if is_banned(resp.url) or is_403(page):
                crawler.warning('Account {} has been banned'.format(name_cookies[0]))
                LoginInfoOper.freeze_account(name_cookies[0], 0)
                Cookies.delete_cookies(name_cookies[0])
                count += 1
                continue

        if not is_ajax and not is_complete(page):
            count += 1
            continue

        if is_404(page):
            crawler.warning('{} seems to be 404'.format(url))
            return ''

        Urls.store_crawl_url(url, 1)
        return page

    Urls.store_crawl_url(url, 0)
    return ''
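# A short usage sketch for the three auth levels of get_page() above; the urls
# and uid are placeholders, not endpoints confirmed by this project.
profile_page = get_page('http://weibo.com/u/1234567890', auth_level=2)  # login cookies required
hot_page = get_page('http://weibo.com/hot/weibo', auth_level=1)         # anonymous cookies only
ajax_part = get_page('http://weibo.com/p/aj/v6/mblog/mbloglist', auth_level=0, is_ajax=True)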
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to crawl
    :param user_verify: True for pages that may show a captcha (requests for weibo or user info),
                        False for the ajax urls used to crawl reposts (they never show a captcha)
    :param need_login: whether the page requires login; skipping login reduces the pressure on accounts
    :return: the response text; return '' on 404, 403 or any other exception
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:
        if need_login:
            # Switch to cookies different from the last ones on every retry;
            # if there is only one account, the same cookies are allowed.
            name_cookies, cookies_count = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookie in the cookies pool, checking for usable accounts')
                rs = get_login_info()

                # log in with healthy accounts; if none is usable, stop the celery workers
                if len(rs) == 0:
                    crawler.error('no account is usable, please check the account health')
                    # kill all celery-related processes
                    if 'win32' in sys.platform:
                        os.popen('taskkill /F /IM "celery*"')
                    else:
                        os.popen('pkill -f "celery"')
                else:
                    crawler.info('fetching new cookies...')
                    login.excute_login_task()
                    time.sleep(10)

            # switch cookies on retry only when the pool holds more than one
            if cookies_count > 1 and name_cookies == latest_name_cookies:
                continue

            latest_name_cookies = name_cookies
        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} is abnormal'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after each fetch to lower the risk of being banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or is_403(page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('{url} seems to be 404'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exceptions happen when crawling {}, specific info: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max retries reached for {}, check the url in the redis failure queue'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
def get_session(name, password):
    session = requests.Session()
    su = get_encodename(name)
    server_data = get_server_data(su, session)
    servertime = server_data["servertime"]
    nonce = server_data['nonce']
    rsakv = server_data["rsakv"]
    pubkey = server_data["pubkey"]
    sp = get_password(password, servertime, nonce, pubkey)

    # the post data can be captured with a packet sniffer
    data = {
        'encoding': 'UTF-8',
        'entry': 'weibo',
        'from': '',
        'gateway': '1',
        'nonce': nonce,
        'pagerefer': "",
        'prelt': 67,
        'pwencode': 'rsa2',
        "returntype": "META",
        'rsakv': rsakv,
        'savestate': '7',
        'servertime': servertime,
        'service': 'miniblog',
        'sp': sp,
        'sr': '1920*1080',
        'su': su,
        'useticket': '1',
        'vsnf': '1',
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack'
    }
    post_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)'
    url = get_redirect(data, post_url, session)

    if url != '':
        rs_cont = session.get(url, headers=headers)
        login_info = rs_cont.text

        u_pattern = r'"uniqueid":"(.*)",'
        m = re.search(u_pattern, login_info)
        if m and m.group(1):
            # Visit an arbitrary page to check whether the account works. This check
            # depends on external conditions, but no better way was found (a problematic
            # account can sometimes still visit its own homepage, so checking the
            # account's own homepage is not a reliable validity test).
            check_url = 'http://weibo.com/p/1005051764222885/info?mod=pedit_more'
            resp = session.get(check_url, headers=headers)

            if is_403(resp.text):
                other.error('account {} has been frozen'.format(name))
                crawler.warning('account {} has been frozen'.format(name))
                freeze_account(name)
                return None
            other.info('the login account is {}'.format(name))
            Cookies.store_cookies(name, session.cookies.get_dict())
            return session

        other.error('login failed for account {}'.format(name))
    return None
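# A plausible reconstruction of the get_password() helper used above. Weibo's SSO
# encrypts "servertime\tnonce\npassword" with the RSA public key returned by the
# prelogin request (exponent 0x10001); this sketch uses the third-party `rsa`
# package and is not necessarily the project's exact code.
import binascii
import rsa

def get_password(password, servertime, nonce, pubkey):
    key = rsa.PublicKey(int(pubkey, 16), 65537)
    message = '{}\t{}\n{}'.format(servertime, nonce, password).encode('utf-8')
    return binascii.b2a_hex(rsa.encrypt(message, key)).decode('utf-8')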
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to crawl
    :param user_verify: True for pages that may show a captcha (requests for weibo or user info),
                        False for the ajax urls used to crawl reposts (they never show a captcha)
    :param need_login: whether the page requires login; skipping login reduces the pressure on accounts
    :return: the response text; return '' on 404, 403 or any other exception
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:
        if need_login:
            # switch to cookies different from the last ones on every retry
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookie in the cookies pool, checking for usable accounts')
                rs = get_login_info()
                if len(rs) == 0:
                    crawler.error('no account is usable, please check the account health')
                    # kill all celery-related processes
                    if 'win32' in sys.platform:
                        os.popen('taskkill /F /IM "celery*"')
                    else:
                        os.popen('pkill -f "celery"')
                else:
                    # If usable accounts exist, log them in. A local call is used here,
                    # which may not be reasonable: if the login queue is not on this
                    # machine the call has no effect, but with a network call it is
                    # still unclear how to keep login off nodes in unusual locations,
                    # unless a better way around the remote-login restriction is found.
                    # TODO weigh a network call against calling login.get_session() directly; this is probably not reasonable
                    # Captchas on login nodes are ignored for now; for large-scale logins,
                    # login_queue nodes should sit where the accounts usually log in.
                    crawler.info('fetching new cookies...')
                    login.excute_login_task()
                    time.sleep(10)

            if name_cookies == latest_name_cookies:
                continue

            latest_name_cookies = name_cookies
        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} is abnormal'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after each fetch to lower the risk of being banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or is_403(page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('{url} seems to be 404'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exceptions happen when crawling {}, specific info: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max retries reached for {}, check the url in the redis failure queue'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to crawl
    :param user_verify: True for pages that may show a captcha (requests for weibo or user info),
                        False for the ajax urls used to crawl reposts (they never show a captcha)
    :param need_login: whether the page requires login; skipping login reduces the pressure on accounts
    :return: the response text; return '' on 404, 403 or any other exception
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookie in the cookies pool, please check the accounts')
                other.warning('shutting down the crawler...')
                if 'win32' in sys.platform:
                    os.popen('taskkill /F /IM "celery*"')
                else:
                    os.popen('pkill -f "celery"')
        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} is abnormal'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after each fetch to lower the risk of being banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or 'userblock' in resp.url or is_403(page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if 'verifybmobile' in resp.url:
                    crawler.warning('account {} has been locked, use the phone to unlock it'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], -1)
                    Cookies.delete_cookies(name_cookies[0])
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('{url} seems to be 404'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exceptions happen when crawling {}, specific info: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('max retries reached for {}, check the url in the redis failure queue'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
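# Plausible sketches of the page checkers used throughout (is_complete, is_403,
# is_404, is_banned). is_banned() mirrors the url markers checked inline in the
# earlier variants; the string tests in the others are assumptions, not the
# project's confirmed implementations.
def is_complete(page):
    # a truncated weibo response usually lacks the closing html tag
    return page.rstrip().endswith('</html>')

def is_403(page):
    return '403 Forbidden' in page

def is_404(page):
    return '404' in page and 'Not Found' in page

def is_banned(url):
    return 'unfreeze' in url or 'accessdeny' in url or 'userblock' in url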