import execjs
import pandas as pd
import requests
from bs4 import BeautifulSoup

# NOTE: assumption: the original module defines this cookie-less header
# template elsewhere; a browser-like User-Agent is the minimum the site expects.
cbirc_headers_without_cookie_2019 = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
}


def bank_page_list(page=5):
    """
    Fetch the requested number of pages of penalty announcements from
    http://www.cbirc.gov.cn/cn/list/9103/910305/ybjfjcf/1.html
    :param page: int, fetch pages 1 through page
    :return: pd.DataFrame of announcement titles and URLs (the caller can save
        it as a CSV file), plus the headers carrying the clearance cookies
    """
    big_url_list = []
    big_title_list = []
    flag = True
    cbirc_headers = cbirc_headers_without_cookie_2019.copy()
    for i_page in range(1, page + 1):  # pages 1 through page inclusive
        print(i_page)
        main_url = "http://www.cbirc.gov.cn/cn/list/9103/910305/ybjfjcf/{}.html".format(i_page)
        if flag:
            # The first response only carries the anti-crawler challenge:
            # an initial cookie in the Set-Cookie header plus an obfuscated
            # JavaScript snippet that computes a second clearance cookie.
            res = requests.get(main_url, headers=cbirc_headers)
            temp_cookie = res.headers["Set-Cookie"].split(";")[0]
            cbirc_headers.update({"Cookie": temp_cookie})
            res = requests.get(main_url, headers=cbirc_headers)
            soup = BeautifulSoup(res.text, "lxml")
            # Wrap the challenge script in a function and turn its eval into
            # a return, so execjs can hand back the de-obfuscated source.
            res_html = (
                "function getClearance(){"
                + soup.find_all("script")[0].get_text()
                + "};"
            )
            res_html = res_html.replace("</script>", "")
            res_html = res_html.replace("eval", "return")
            res_html = res_html.replace("<script>", "")
            ctx = execjs.compile(res_html)
            # The de-obfuscated script assigns the clearance cookie through
            # either firstChild.cookie or document.cookie; extract that
            # assignment into a callable function (one of the two must match).
            if "firstChild.cookie" in ctx.call("getClearance"):
                over_js = (
                    "function getClearance2(){var a"
                    + ctx.call("getClearance").split("firstChild.cookie")[1].split("Path=/;'")[0]
                    + "Path=/;';return a;};"
                )
            if "document.cookie" in ctx.call("getClearance"):
                over_js = (
                    "function getClearance2(){var a"
                    + ctx.call("getClearance").split("document.cookie")[1].split("Path=/;'")[0]
                    + "Path=/;';return a;};"
                )
            # Neutralize the headless-browser detection hooks, then restore
            # the eval so the cookie value is actually computed.
            over_js = over_js.replace("window.headless", "''")
            over_js = over_js.replace("window['_p'+'hantom']", "''")
            over_js = over_js.replace("window['__p'+'hantom'+'as']", "''")
            over_js = over_js.replace("window['callP'+'hantom']", "''")
            over_js = over_js.replace("return(", "eval(")
            ctx = execjs.compile(over_js)
            cookie_2 = ctx.call("getClearance2").split(";")[0]
            # Both cookies together clear the challenge; reuse them below.
            cbirc_headers.update({"Cookie": temp_cookie + ";" + cookie_2})
            res = requests.get(main_url, headers=cbirc_headers)
            soup = BeautifulSoup(res.text, "lxml")
            url_list = [
                item.find("a")["href"]
                for item in soup.find_all(attrs={"class": "zwbg-2"})
            ]
            title_list = [
                item.find("a").get_text()
                for item in soup.find_all(attrs={"class": "zwbg-2"})
            ]
            big_url_list.extend(url_list)
            big_title_list.extend(title_list)
            flag = False
        else:
            # Subsequent pages reuse the clearance cookies obtained above.
            res = requests.get(main_url, headers=cbirc_headers)
            soup = BeautifulSoup(res.text, "lxml")
            url_list = [
                item.find("a")["href"]
                for item in soup.find_all(attrs={"class": "zwbg-2"})
            ]
            title_list = [
                item.find("a").get_text()
                for item in soup.find_all(attrs={"class": "zwbg-2"})
            ]
            big_url_list.extend(url_list)
            big_title_list.extend(title_list)
    temp_df = pd.DataFrame([big_title_list, big_url_list]).T
    return temp_df, cbirc_headers
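
# A minimal usage sketch, assuming the CBIRC site is reachable and still
# serves this challenge format. The column names and output filename are
# illustrative; the docstring mentions saving the result as a CSV file,
# which the function leaves to the caller.
if __name__ == "__main__":
    penalty_df, cbirc_headers_with_cookie = bank_page_list(page=5)
    penalty_df.columns = ["title", "url"]
    # utf-8-sig keeps the Chinese titles readable when opened in Excel.
    penalty_df.to_csv("cbirc_penalty_list.csv", index=False, encoding="utf-8-sig")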