def time_limit(*args, **kargs):
    try:
        return func(*args, **kargs)
    except Exception as e:
        crawler.error('failed to crawl {url}, details: {e}, stack: {stack}'.format(
            url=args[0], e=e, stack=format_tb(e.__traceback__)[0]))
        return ''
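# A minimal sketch of the enclosing decorator this inner function presumably
# belongs to: `func` and `crawler` are free variables here, so they must come
# from an outer wrapper and a module-level logger. `crawl_guard` is a
# hypothetical name; the logger setup below is an assumption.
import logging
from functools import wraps
from traceback import format_tb

crawler = logging.getLogger('crawler')  # assumed module-level logger

def crawl_guard(func):
    @wraps(func)
    def time_limit(*args, **kargs):
        try:
            return func(*args, **kargs)
        except Exception as e:
            crawler.error('failed to crawl {url}, details: {e}, stack: {stack}'.format(
                url=args[0], e=e, stack=format_tb(e.__traceback__)[0]))
            return ''
    return time_limit

# Usage: the wrapped function's first positional argument is expected to be the url.
# @crawl_guard
# def get_page(url, session): ...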
def save_data(self, data):
    tasks = []
    score = 0  # publish_time of the last parsed item, returned as the paging cursor
    for item in data:
        try:
            dic = {}
            uuid = item.get("uuid")
            dic["uuid"] = uuid
            dic["url"] = f"https://www.infoq.cn/article/{uuid}"
            dic["title"] = item.get("article_title")
            dic["cover"] = item.get("article_cover")
            dic["summary"] = item.get("article_summary")
            author = item.get("author")
            if author:
                dic["author"] = author[0].get("nickname")
            else:
                dic["author"] = item.get("no_author", "").split(":")[-1]
            score = item.get("publish_time")
            dic["publish_time"] = datetime.datetime.utcfromtimestamp(
                score / 1000).strftime("%Y-%m-%d %H:%M:%S")
            dic["tags"] = ",".join(
                [topic.get("name") for topic in item.get("topic")])
            translate = item.get("translator")
            dic["translator"] = dic["author"]
            if translate:
                dic["translator"] = translate[0].get("nickname")
            dic["status"] = 0
            dic["update_time"] = datetime.datetime.now().strftime(
                "%Y-%m-%d %H:%M:%S")
            tasks.append(dic)
        except IndexError as e:
            crawler.error(f"failed to parse item: {e}")
    Mongo().save_data(tasks)
    crawler.info(f"added {len(tasks)} records to mongodb")
    return score
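# For reference, the item shape save_data() expects, reconstructed from the
# .get() calls above; the values are illustrative placeholders, not real API
# output, and the "no_author" format is an assumption based on the split(":").
sample_item = {
    "uuid": "AbCd1234",
    "article_title": "...",
    "article_cover": "...",
    "article_summary": "...",
    "author": [{"nickname": "..."}],      # may be absent; then no_author is used
    "no_author": "author:somebody",
    "publish_time": 1546300800000,        # epoch milliseconds
    "topic": [{"name": "python"}, {"name": "crawler"}],
    "translator": [{"nickname": "..."}],  # optional; defaults to the author
}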
def get_redirect(name, data, post_url, session):
    """
    :param name: login user name
    :param data: form data to post; the invariant parts can be determined by capturing packets
    :param post_url: post address
    :param session:
    :return: the next url the server asks us to request; if captcha solving
             failed, return a sentinel string so the caller can handle it specially
    """
    logining_page = session.post(post_url, data=data, headers=headers)
    login_loop = logining_page.content.decode("GBK")

    # if the account or password is wrong, set the account status to 2
    if 'retcode=101' in login_loop:
        crawler.error('invalid password for account {}'.format(name))
        freeze_account(name, 2)
        return ''

    if 'retcode=2070' in login_loop:
        crawler.error('invalid verification code')
        return 'pinerror'

    if '正在登录' in login_loop or 'Signing in' in login_loop:
        pa = r'location\.replace\([\'"](.*?)[\'"]\)'
        return re.findall(pa, login_loop)[0]
    else:
        return ''
async def fetch(item, session, retry_index=0):
    try:
        refer = item.get("url")
        name = item.get("title")
        uuid = item.get("uuid")
        md5name = hashlib.md5(name.encode("utf-8")).hexdigest()  # image file name
        item["md5name"] = md5name
        data = {"uuid": uuid}
        headers["Referer"] = refer
        if retry_index == 0:
            await MotorBase().change_status(uuid, item, 1)  # mark as downloading
        with async_timeout.timeout(60):
            async with session.post(url=base_url, headers=headers,
                                    data=json.dumps(data)) as req:
                res_status = req.status
                if res_status == 200:
                    jsondata = await req.json()
                    await get_content(jsondata, item)
                    await MotorBase().change_status(uuid, item, 2)  # download succeeded
    except Exception as e:
        # NOTE: retries recurse without an upper bound; a permanently failing
        # item will keep retrying
        crawler.error(f'Retry times: {retry_index + 1}, error: {e}')
        retry_index += 1
        return await fetch(item, session, retry_index)
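# A minimal driver sketch for the fetch() coroutine above. The queueing step is
# an assumption: the source only shows MotorBase().change_status(), so
# get_detail_task() is a hypothetical method that yields pending items.
import asyncio
import aiohttp

async def main():
    async with aiohttp.ClientSession() as session:
        items = await MotorBase().get_detail_task({"status": 0})  # hypothetical
        await asyncio.gather(*(fetch(item, session) for item in items))

if __name__ == '__main__':
    asyncio.run(main())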
def time_limit(url, session, *k):
    try:
        return func(url, session, *k)
    except Exception as e:
        crawler.error('failed to crawl {url}, details: {e}, stack: {stack}'.format(
            url=url, e=e, stack=format_tb(e.__traceback__)[0]))
        return None
def get_redirect(name, data, post_url, session):
    logining_page = session.post(post_url, data=data, headers=headers)
    login_loop = logining_page.content.decode("GBK")

    # if the account or password is wrong, set the account status to 2
    if 'retcode=101' in login_loop:
        crawler.error('invalid password for {}, please check your account '
                      'and password'.format(name))
        freeze_account(name, 2)
        return ''

    if 'retcode=2070' in login_loop:
        crawler.error('invalid verification code')
        return 'pinerror'

    if 'retcode=4049' in login_loop:
        crawler.warning('account {} needs a verification code to log in'.format(name))
        return 'login_need_pincode'

    if '正在登录' in login_loop or 'Signing in' in login_loop:
        pa = r'location\.replace\([\'"](.*?)[\'"]\)'
        return re.findall(pa, login_loop)[0]
    else:
        return ''
def time_limit(*args, **kargs):
    try:
        return func(*args, **kargs)
    except Exception as e:
        crawler.error('failed to crawl {url}, here are the details: {e}, '
                      'stack is {stack}'.format(
                          url=args[0], e=e, stack=format_tb(e.__traceback__)[0]))
        return ''
def log(*args, **kwargs):
    try:
        if f:
            crawler.info(f"{func.__name__} is running")
        return func(*args, **kwargs)
    except Exception:
        crawler.error(
            f"{func.__name__} raised an error, details:\n{traceback.format_exc()}"
        )
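# A sketch of the decorator factory this inner function is assumed to come
# from: `f` reads like a verbosity flag closed over from the outer scope. Both
# the factory name `logged` and the flag semantics are assumptions; `crawler`
# is the module-level logger used throughout these snippets.
import traceback
from functools import wraps

def logged(f=True):
    def deco(func):
        @wraps(func)
        def log(*args, **kwargs):
            try:
                if f:
                    crawler.info(f"{func.__name__} is running")
                return func(*args, **kwargs)
            except Exception:
                crawler.error(
                    f"{func.__name__} raised an error, details:\n"
                    f"{traceback.format_exc()}")
        return log
    return deco

# Usage:
# @logged(f=True)
# def parse(url): ...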
async def get_list_info(self, url, source):
    '''
    To keep the extracted fields consistent, parse the page card by card.
    :param url: url of the current page
    :param source: page source
    :return:
    '''
    div_xpath = "//div[@class='cards cards_layout_text-only']/div"
    div_node_list = self.xpath(source, div_xpath)
    tasks = []
    t_append = tasks.append
    for div_node in div_node_list:
        try:
            dic = {}
            dic["obj_id"] = self.xpath(div_node, "@data-object-id")[0]
            dic["artist"] = self.xpath(
                div_node, ".//div[@class='card_body']/h4/span/a", "text")[0]
            dic["title"] = \
                self.xpath(div_node, ".//div[@class='card_body']/h4/a[@class='search_result_title ']", "text")[0]
            _detail_url = \
                self.xpath(div_node, ".//div[@class='card_body']/h4/a[@class='search_result_title ']", "href")[0]
            dic["detail_url"] = urljoin(BASE_URL, _detail_url)
            card_info_xpath = ".//div[@class='card_body']/p[@class='card_info']"
            dic["label"] = self.xpath(div_node, f"{card_info_xpath}/a", "text")[0]
            dic["catalog_number"] = \
                self.xpath(div_node, f"{card_info_xpath}/span[@class='card_release_catalog_number']", "text")[0]
            dic["format"] = self.xpath(
                div_node, f"{card_info_xpath}/span[@class='card_release_format']", "text")[0]
            dic["year"] = self.xpath(
                div_node, f"{card_info_xpath}/span[@class='card_release_year']", "text")[0]
            dic["country"] = self.xpath(
                div_node, f"{card_info_xpath}/span[@class='card_release_country']", "text")[0]
            dic["url"] = url
            dic["page_index"] = 1
            dic["status"] = 0
            dic["crawler_time"] = datetime.datetime.now().strftime(
                "%Y-%m-%d %H:%M:%S")
            t_append(dic)
        except IndexError:
            # e.g. https://www.discogs.com/search/?layout=sm&country_exact=Unknown&format_exact=Cassette&limit=100&year=2000&style_exact=House&page=1&decade=2000
            crawler.error(f"failed to parse, the url is: {url}")
    await MotorOperation().save_data(tasks)
    # set the seed url's status to 2, meaning crawled successfully
    condition = {"url": url}
    await MotorOperation().change_status(condition, status_code=2)
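# The self.xpath() helper is not shown in the source; below is a plausible
# lxml-based sketch, assuming the (source, expression, optional data_type)
# signature used above: "text" collects text content, any other value is read
# as an attribute name (e.g. "href"), and None returns the raw xpath result.
from lxml import etree

def xpath(self, source, expression, data_type=None):
    # accept either a raw html string or an already-parsed element
    node = source if isinstance(source, etree._Element) else etree.HTML(source)
    if data_type == "text":
        return [n.xpath("string(.)").strip() for n in node.xpath(expression)]
    if data_type:  # e.g. "href": read the attribute off each matched element
        return [n.get(data_type) for n in node.xpath(expression)]
    return node.xpath(expression)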
def _get_total_page(wb_mid):
    page = 1
    ajax_url = base_url.format(mid=wb_mid, currpage=page)
    source = get_page(ajax_url, False)

    if source == '':
        crawler.error('failed to crawl the repost url {}'.format(ajax_url))
        return 0

    crawler.info('the repost info url is {}'.format(ajax_url))

    try:
        repost_json = json.loads(source)
        total_page = int(repost_json['data']['page']['totalpage'])
    except Exception as why:
        parser.error('failed to parse the repost info from {url} as json, '
                     'details: {why}'.format(url=ajax_url, why=why))
        return 0
    else:
        return total_page
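# For reference, the response shape _get_total_page() expects, reconstructed
# from the key path repost_json['data']['page']['totalpage'] above; the value
# is illustrative only:
# {
#     "data": {
#         "page": {
#             "totalpage": 42
#         }
#     }
# }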
def get_redirect(name, data, post_url, session):
    logining_page = session.post(post_url, data=data, headers=headers)
    login_loop = logining_page.content.decode("GBK")

    # if the account or password is wrong, set the account status to 2
    if 'retcode=101' in login_loop:
        crawler.error('invalid password for {}, please check your account '
                      'and password'.format(name))
        freeze_account(name, 2)
        return ''

    if 'retcode=2070' in login_loop:
        crawler.error('invalid verification code')
        return 'pinerror'

    if 'retcode=4049' in login_loop:
        crawler.warning('account {} needs a verification code to log in'.format(name))
        return 'login_need_pincode'

    if '正在登录' in login_loop or 'Signing in' in login_loop:
        pa = r'location\.replace\([\'"](.*?)[\'"]\)'
        return re.findall(pa, login_loop)[0]
    else:
        return ''
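# A sketch of how get_redirect() fits the overall login flow, assuming a
# requests.Session and a prepared post form; the final GET of the redirect url
# is what actually sets the logged-in cookies. `build_post_data` and
# `login_post_url` are hypothetical names for pieces not shown here.
import requests

def login(name, password):
    session = requests.session()
    data = build_post_data(name, password)      # hypothetical: captured form fields
    redirect_url = get_redirect(name, data, login_post_url, session)
    if redirect_url in ('', 'pinerror', 'login_need_pincode'):
        return None                             # failed or needs manual handling
    session.get(redirect_url, headers=headers)  # complete the login redirect
    return session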
def get_page(url, session, headers, user_verify=True):
    """
    :param user_verify: whether the page may show a verification code (the 403
           on search pages is not parsed yet); False when crawling the ajax
           repost links
    """
    crawler.info('crawling url {url}'.format(url=url))
    try:
        page = session.get(url, headers=headers, timeout=time_out, verify=False).text. \
            encode('utf-8', 'ignore').decode('utf-8')
        time.sleep(interal)

        if user_verify:
            if is_403(page):
                crawler.warning('this account has been frozen')
                crawler.info('crawling finished at {curtime}'.format(curtime=time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime())))
                exit(-1)

            if is_404(page):
                crawler.warning('the url {url} does not exist'.format(url=url))
                return ''

            if not is_complete(page):
                time.sleep(excp_interal)
                try:
                    page = session.get(url, headers=headers, timeout=time_out, verify=False).text. \
                        encode('utf-8', 'ignore').decode('utf-8')
                except Exception as why:
                    crawler.error(why)
                    return ''
    except requests.exceptions.ReadTimeout:
        crawler.warning('connection to the target server timed out while '
                        'crawling {url}'.format(url=url))
        time.sleep(excp_interal)
        return ''
    except requests.exceptions.ConnectionError as e:
        crawler.warning('the target server refused the connection, the program '
                        'sleeps for 1 minute, details: {e}'.format(e=e))
        time.sleep(excp_interal)
        return ''
    else:
        return page
def get_page(url, user_verify=True):
    """
    :param url: url to crawl
    :param user_verify: whether the page may show a verification code (ajax
           links never do; requests for weibo or user info pages may); False
           when crawling the ajax repost links
    :return: the response body; returns an empty string on 404, 403 or any
             other exception
    """
    crawler.info('crawling url {url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:
        # switch to a cookie different from the last one on every retry
        name_cookies = Cookies.fetch_cookies()

        if name_cookies is None:
            crawler.error('no cookies in the cookie pool, please check the accounts '
                          'and the login task. The crawler is exiting.')
            os._exit(0)

        if name_cookies == latest_name_cookies:
            continue

        latest_name_cookies = name_cookies

        try:
            resp = requests.get(url, headers=headers, cookies=name_cookies[1],
                                timeout=time_out, verify=False)
            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after every fetch to lower the risk of the account being banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or is_403(page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('the url {url} does not exist'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception while crawling {}, details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('reached the max number of retries for {}; check the url in '
                    'the redis failure queue to find out why'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to crawl
    :param user_verify: whether the page may show a verification code (ajax
           links never do; requests for weibo or user info pages may); False
           when crawling the ajax repost links
    :param need_login: whether the page requires a login, so that some account
           pressure can be saved
    :return: the response body; returns an empty string on 404, 403 or any
             other exception
    """
    crawler.info('crawling url {url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:
        if need_login:
            # switch to a cookie different from the last one on every retry
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in the cookie pool, checking for usable accounts')
                rs = get_login_info()
                if len(rs) == 0:
                    crawler.error('no account is usable, please check the account health')
                    # kill all celery processes
                    if 'win32' in sys.platform:
                        os.popen('taskkill /F /IM "celery*"')
                    else:
                        os.popen('pkill -f "celery"')
                else:
                    # If a usable account exists, log in with it. A local call is
                    # used here, which may not be very reasonable: if the login
                    # queue is not on this machine the call has no effect, while
                    # with a network call it is unclear how to keep logins off
                    # nodes in unusual locations. A better way around the
                    # remote-login restriction is still to be found.
                    # TODO weigh a network call against calling
                    # login.get_session() directly; the current approach is
                    # probably not very reasonable.
                    # Verification codes on login nodes are not handled for now;
                    # for large-scale account logins, the login_queue nodes
                    # should be placed where the accounts usually log in.
                    crawler.info('fetching new cookies...')
                    login.excute_login_task()
                    time.sleep(10)
                continue  # retry with fresh cookies instead of using None below

            if name_cookies == latest_name_cookies:
                continue

            latest_name_cookies = name_cookies

        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1],
                                    timeout=time_out, verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} is abnormal'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after every fetch to lower the risk of the account being banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or is_403(page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('the url {url} does not exist'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception while crawling {}, details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('reached the max number of retries for {}; check the url in '
                    'the redis failure queue to find out why'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: url to crawl
    :param user_verify: whether the page may show a verification code (ajax
           links never do; requests for weibo or user info pages may); False
           when crawling the ajax repost links
    :param need_login: whether the page requires a login, so that some account
           pressure can be saved
    :return: the response body; returns an empty string on 404, 403 or any
             other exception
    """
    crawler.info('crawling url {url}'.format(url=url))
    count = 0

    while count < max_retries:
        if need_login:
            # switch cookies on every retry; if there is only one account,
            # reusing the same cookie is allowed
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookies in the cookie pool, checking for usable accounts')
                rs = get_login_info()
                # log in with an account in normal state; if none is usable,
                # stop the celery workers
                if len(rs) == 0:
                    crawler.error('no account is usable, please check the account health')
                    # kill all celery processes
                    if 'win32' in sys.platform:
                        os.popen('taskkill /F /IM "celery*"')
                    else:
                        os.popen('pkill -f "celery"')
                else:
                    crawler.info('fetching new cookies...')
                    login.excute_login_task()
                    time.sleep(10)
                continue  # retry with fresh cookies instead of using None below

        try:
            if need_login:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1],
                                    timeout=time_out, verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} is abnormal'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url, headers=headers, timeout=time_out, verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after every fetch to lower the risk of the account being banned
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or \
                        'userblock' in resp.url or is_403(page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if 'verifybmobile' in resp.url:
                    crawler.warning('account {} is locked and needs to be '
                                    'unlocked by phone'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], -1)
                    Cookies.delete_cookies(name_cookies[0])
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('the url {url} does not exist'.format(url=url))
                    return ''
        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception while crawling {}, details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)
        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('reached the max number of retries for {}; check the url in '
                    'the redis failure queue to find out why'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
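# Usage sketch: elsewhere in these snippets the ajax repost url is fetched as
# get_page(ajax_url, False), i.e. without captcha checking; skipping login for
# such public ajax endpoints (need_login=False) is an assumption here.
source = get_page(ajax_url, user_verify=False, need_login=False)
if source == '':
    crawler.warning('giving up on {}'.format(ajax_url))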