def get_us_stock_name() -> pd.DataFrame:
    """
    U.S. stocks' English name, Chinese name and symbol.

    Use the ``symbol`` column as input to the follow-up quote functions.
    http://finance.sina.com.cn/stock/usstock/sector.shtml
    :return: stocks' English name, Chinese name and symbol
    :rtype: pandas.DataFrame
    """
    frames = []
    page_count = get_us_page_count()
    for page in tqdm(range(1, page_count + 1)):
        us_js_decode = "US_CategoryService.getList?page={}&num=20&sort=&asc=0&market=&id=".format(
            page)
        js_code = py_mini_racer.MiniRacer()
        js_code.eval(js_hash_text)
        # run Sina's JS hash routine to build the signed request token
        dict_list = js_code.call("d", us_js_decode)
        us_sina_stock_dict_payload.update({"page": "{}".format(page)})
        res = requests.get(us_sina_stock_list_url.format(dict_list),
                           params=us_sina_stock_dict_payload)
        # strip the JSONP wrapper "...({...});" down to the JSON object
        data_json = json.loads(
            res.text[res.text.find("({") + 1:res.text.rfind(");")])
        frames.append(pd.DataFrame(data_json["data"]))
    # BUG FIX: DataFrame.append was removed in pandas 2.0; a single concat
    # is also O(n) instead of O(n^2) across pages.
    if not frames:
        return pd.DataFrame(columns=["name", "cname", "symbol"])
    big_df = pd.concat(frames, ignore_index=True)
    return big_df[["name", "cname", "symbol"]]
async def _get_us_stock_name_async(
        request_per_batch: int = 15) -> pd.DataFrame:
    """
    Async variant of ``get_us_stock_name``: fetch all Sina US-stock list
    pages in batches of concurrent requests, retrying failed pages in
    subsequent batches.

    :param request_per_batch: max number of concurrent requests per batch
    :return: stocks' English name, Chinese name and symbol
    :rtype: pandas.DataFrame
    """
    count_per_page = 20
    frames = []
    page_count = get_us_page_count(count_per_page)
    start = time.time()
    with tqdm(total=page_count) as pbar:
        all_pages = range(1, page_count + 1)
        finished_pages = []
        while len(finished_pages) < page_count:
            to_req_pages = [x for x in all_pages if x not in finished_pages]
            request_per_batch = min(request_per_batch, len(to_req_pages))
            tasks = {}
            for page in to_req_pages[:request_per_batch]:
                us_js_decode = f"US_CategoryService.getList?page={page}&num={count_per_page}&sort=&asc=0&market=&id="
                js_code = py_mini_racer.MiniRacer()
                js_code.eval(js_hash_text)
                # run Sina's JS hash routine to build the signed request token
                dict_list = js_code.call("d", us_js_decode)
                us_sina_stock_dict_payload.update(
                    {"page": "{}".format(page)})
                tasks[page] = asyncio.create_task(
                    request(us_sina_stock_list_url.format(dict_list),
                            us_sina_stock_dict_payload))
            # BUG FIX: `await task` outside the try/except below propagated
            # ConnectionError before the handler could see it; gather with
            # return_exceptions keeps each failure inside its task so
            # task.result() re-raises it where it is actually handled.
            await asyncio.gather(*tasks.values(), return_exceptions=True)
            n_failed_req = 0
            for page_no, task in tasks.items():
                try:
                    res = task.result()  # re-raises the task's exception, if any
                    # strip the JSONP wrapper "...({...});" to the JSON object
                    data_json = json.loads(res[res.find("({") +
                                               1:res.rfind(");")])
                    frames.append(pd.DataFrame(data_json["data"]))
                    finished_pages.append(page_no)
                    pbar.update(1)
                except requests.exceptions.ConnectionError as ident:
                    n_failed_req += 1
                    print(
                        f'{ident} page_no={page_no}, sleep for longer time and try in next batch'
                    )
            tasks.clear()
            # back off harder when Sina's anti-spider rejected requests
            interval_time = 10 if n_failed_req >= 1 else 3
            # BUG FIX: time.sleep blocked the event loop inside an async
            # function; asyncio.sleep yields control while waiting.
            await asyncio.sleep(interval_time)
    end = time.time()
    print('Cost time:', end - start)
    # BUG FIX: DataFrame.append was removed in pandas 2.0; concat once.
    if not frames:
        return pd.DataFrame(columns=["name", "cname", "symbol"])
    big_df = pd.concat(frames, ignore_index=True)
    return big_df[["name", "cname", "symbol"]]
def get_us_page_count() -> int:
    """
    Total number of pages (20 records per page) in Sina's US-stock list.

    NOTE(review): this file also defines a parameterized
    ``get_us_page_count(count_per_page)``; in Python the later definition
    wins at import time, so these duplicates should be merged.
    :return: total page count
    :rtype: int
    """
    page = "1"
    us_js_decode = f"US_CategoryService.getList?page={page}&num=20&sort=&asc=0&market=&id="
    js_code = execjs.compile(js_hash_text)
    # run Sina's JS hash routine to build the signed request token
    dict_list = js_code.call("d", us_js_decode)
    us_sina_stock_dict_payload.update({"page": "{}".format(page)})
    res = requests.get(us_sina_stock_list_url.format(dict_list),
                       params=us_sina_stock_dict_payload)
    # strip the JSONP wrapper "...({...});" down to the JSON object
    data_json = json.loads(
        res.text[res.text.find("({") + 1:res.text.rfind(");")])
    # BUG FIX: `isinstance(count / 20, int)` is always False because `/`
    # returns a float, so the original added a spurious extra page whenever
    # the count was an exact multiple of 20. math.ceil covers both cases
    # (and matches the parameterized variant of this function).
    return math.ceil(int(data_json["count"]) / 20)
def stock_us_spot() -> pd.DataFrame:
    """
    Sina Finance real-time quotes for all US stocks (execjs variant).

    NOTE(review): this file also defines another ``stock_us_spot`` using
    py_mini_racer; in Python the later definition wins at import time, so
    these duplicates should be merged.
    :return: real-time quotes for all US stocks
    :rtype: pandas.DataFrame
    """
    frames = []
    page_count = get_us_page_count()
    for page in tqdm(range(1, page_count + 1)):
        us_js_decode = "US_CategoryService.getList?page={}&num=20&sort=&asc=0&market=&id=".format(
            page)
        js_code = execjs.compile(js_hash_text)
        # run Sina's JS hash routine to build the signed request token
        dict_list = js_code.call("d", us_js_decode)
        us_sina_stock_dict_payload.update({"page": "{}".format(page)})
        res = requests.get(us_sina_stock_list_url.format(dict_list),
                           params=us_sina_stock_dict_payload)
        # strip the JSONP wrapper "...({...});" down to the JSON object
        data_json = json.loads(
            res.text[res.text.find("({") + 1:res.text.rfind(");")])
        frames.append(pd.DataFrame(data_json["data"]))
    # BUG FIX: DataFrame.append was removed in pandas 2.0; a single concat
    # is also O(n) instead of O(n^2) across pages.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
def get_us_page_count(count_per_page: int = 20) -> int:
    """
    Sina Finance - US stocks - total page count.

    http://finance.sina.com.cn/stock/usstock/sector.shtml
    :param count_per_page: number of records requested per page
    :return: total number of pages of US stocks
    :rtype: int
    """
    first_page = "1"
    decode_arg = (
        "US_CategoryService.getList?page=" + first_page
        + "&num=" + str(count_per_page) + "&sort=&asc=0&market=&id="
    )
    us_sina_stock_dict_payload.update({"num": str(count_per_page)})
    ctx = py_mini_racer.MiniRacer()
    ctx.eval(js_hash_text)
    # run Sina's JS hash routine to build the signed request token
    hashed = ctx.call("d", decode_arg)
    us_sina_stock_dict_payload.update({"page": str(first_page)})
    resp = requests.get(us_sina_stock_list_url.format(hashed),
                        params=us_sina_stock_dict_payload)
    body = resp.text
    # strip the JSONP wrapper "...({...});" down to the JSON object
    payload = json.loads(body[body.find("({") + 1:body.rfind(");")])
    total_records = int(payload["count"])
    return math.ceil(total_records / count_per_page)
def stock_us_spot() -> pd.DataFrame:
    """
    Sina Finance real-time quotes for all US stocks; note the data is
    delayed by 15 minutes.

    :return: real-time quotes for all US stocks
    :rtype: pandas.DataFrame
    """
    frames = []
    page_count = get_us_page_count()
    for page in tqdm(range(1, page_count + 1)):
        us_js_decode = "US_CategoryService.getList?page={}&num=20&sort=&asc=0&market=&id=".format(
            page)
        js_code = py_mini_racer.MiniRacer()
        js_code.eval(js_hash_text)
        # run Sina's JS hash routine to build the signed request token
        dict_list = js_code.call("d", us_js_decode)
        us_sina_stock_dict_payload.update({"page": "{}".format(page)})
        res = requests.get(us_sina_stock_list_url.format(dict_list),
                           params=us_sina_stock_dict_payload)
        # strip the JSONP wrapper "...({...});" down to the JSON object
        data_json = json.loads(
            res.text[res.text.find("({") + 1:res.text.rfind(");")])
        frames.append(pd.DataFrame(data_json["data"]))
    # BUG FIX: DataFrame.append was removed in pandas 2.0; a single concat
    # is also O(n) instead of O(n^2) across pages.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)