def bank_page_list(page=5):
    """
    Collect the titles and links shown on the CBIRC listing pages.

    Listing pages follow the pattern
    http://www.cbirc.gov.cn/cn/list/9103/910305/ybjfjcf/1.html

    The very first request is sent without a cookie. The first ``key=value``
    pair of the ``Set-Cookie`` header in that response is stored in the
    request headers and reused for every later request.

    :param page: int, number of the last listing page to fetch; pages 1
        through ``page`` (inclusive) are requested, a value below 1 fetches
        nothing
    :return: tuple ``(temp_df, cbirc_headers)``; ``temp_df`` is a
        pd.DataFrame with the titles in column 0 and the links in column 1,
        ``cbirc_headers`` is the header dict (cookie included) that was used
    :raises KeyError: when the first response carries no ``Set-Cookie`` header
    """
    # NOTE(review): the visible part of this file defines bank_page_list a
    # second time further down; that later definition replaces this one.
    url_template = "http://www.cbirc.gov.cn/cn/list/9103/910305/ybjfjcf/{}.html"
    big_url_list = []
    big_title_list = []
    # work on a copy so the module-level header template is never modified
    cbirc_headers = cbirc_headers_without_cookie.copy()
    need_cookie = True
    # range() excludes its stop value, hence "+ 1" to really reach `page`
    for i_page in range(1, page + 1):
        print(i_page)
        main_url = url_template.format(i_page)
        if need_cookie:
            # cookie-less probe request: only its Set-Cookie header is used
            res = requests.get(main_url, headers=cbirc_headers)
            cbirc_headers.update(
                {"Cookie": res.headers["Set-Cookie"].split(";")[0]}
            )
            need_cookie = False
        res = requests.get(main_url, headers=cbirc_headers)
        soup = BeautifulSoup(res.text, "lxml")
        # every element with class "zwbg-2" wraps one <a> holding title + link
        for item in soup.find_all(attrs={"class": "zwbg-2"}):
            link = item.find("a")
            big_url_list.append(link["href"])
            big_title_list.append(link.get_text())
    temp_df = pd.DataFrame([big_title_list, big_url_list]).T
    return temp_df, cbirc_headers
def _cbirc_clearance_cookie(challenge_html):
    """
    Compute the second cookie from the JavaScript served in ``challenge_html``.

    The first ``<script>`` of the page is run in two stages through execjs;
    the string rewrites below are order-sensitive and must stay in sequence.

    :param challenge_html: str, body of the response that carries the script
    :return: str, first ``key=value`` pair of the cookie string the script
        builds
    """
    # SECURITY NOTE: this executes JavaScript delivered by the remote server
    # inside a local JS runtime; only use it against a site you trust.
    soup = BeautifulSoup(challenge_html, "lxml")
    # Stage 1: wrap the script in a function and swap "eval" for "return" so
    # that calling it hands back the generated code instead of running it.
    res_html = (
        "function getClearance(){" + soup.find_all("script")[0].get_text() + "};"
    )
    res_html = res_html.replace("</script>", "")
    res_html = res_html.replace("eval", "return")
    res_html = res_html.replace("<script>", "")
    ctx = execjs.compile(res_html)
    generated_js = ctx.call("getClearance")
    # Stage 2: keep only the expression assigned to document.cookie (up to
    # the "Path=/;'" marker) and return it from a new function.
    cookie_expr = generated_js.split("document.cookie")[1].split("Path=/;'")[0]
    over_js = "function getClearance2(){var a" + cookie_expr + "Path=/;';return a;};"
    # these window.* lookups are replaced by empty strings - presumably they
    # are headless-browser probes that do not exist outside a browser
    over_js = over_js.replace("window.headless", "''")
    over_js = over_js.replace("window['_p'+'hantom']", "''")
    over_js = over_js.replace("window['__p'+'hantom'+'as']", "''")
    over_js = over_js.replace("window['callP'+'hantom']", "''")
    # undo the stage-1 rewrite inside the generated code
    over_js = over_js.replace("return(", "eval(")
    # the text from the first "docum" through the first ".href" (presumably a
    # document...href lookup) is replaced by the literal site address
    href_expr = over_js[over_js.find("docum"):over_js.find(".href") + 5]
    over_js = over_js.replace(href_expr, "'http://www.cbirc.gov.cn/'")
    ctx = execjs.compile(over_js)
    return ctx.call("getClearance2").split(";")[0]


def bank_page_list(page=5):
    """
    Collect the titles and links shown on the CBIRC listing pages.

    Listing pages follow the pattern
    http://www.cbirc.gov.cn/cn/list/9103/910305/ybjfjcf/1.html

    Before the first page is read, two cookies are obtained: one from the
    ``Set-Cookie`` header of a cookie-less request, and one computed from the
    JavaScript in the response to a second request. Both are joined with
    ``;`` into the ``Cookie`` header used for every listing request.

    :param page: int, number of the last listing page to fetch; pages 1
        through ``page`` (inclusive) are requested, a value below 1 fetches
        nothing
    :return: tuple ``(temp_df, cbirc_headers)``; ``temp_df`` is a
        pd.DataFrame with the titles in column 0 and the links in column 1,
        ``cbirc_headers`` is the header dict (cookies included) that was used
    :raises KeyError: when the first response carries no ``Set-Cookie`` header
    """
    url_template = "http://www.cbirc.gov.cn/cn/list/9103/910305/ybjfjcf/{}.html"
    big_url_list = []
    big_title_list = []
    # work on a copy so the module-level header template is never modified
    cbirc_headers = cbirc_headers_without_cookie.copy()
    need_cookie = True
    # range() excludes its stop value, hence "+ 1" to really reach `page`
    for i_page in range(1, page + 1):
        print(i_page)
        main_url = url_template.format(i_page)
        if need_cookie:
            # request 1 (no cookie): only its Set-Cookie header is used
            res = requests.get(main_url, headers=cbirc_headers)
            temp_cookie = res.headers["Set-Cookie"].split(";")[0]
            cbirc_headers.update({"Cookie": temp_cookie})
            # request 2 (first cookie): its body carries the script from
            # which the second cookie is computed
            res = requests.get(main_url, headers=cbirc_headers)
            cookie_2 = _cbirc_clearance_cookie(res.text)
            cbirc_headers.update({"Cookie": temp_cookie + ";" + cookie_2})
            need_cookie = False
        res = requests.get(main_url, headers=cbirc_headers)
        soup = BeautifulSoup(res.text, "lxml")
        # every element with class "zwbg-2" wraps one <a> holding title + link
        for item in soup.find_all(attrs={"class": "zwbg-2"}):
            link = item.find("a")
            big_url_list.append(link["href"])
            big_title_list.append(link.get_text())
    temp_df = pd.DataFrame([big_title_list, big_url_list]).T
    return temp_df, cbirc_headers