Esempio n. 1
0
def get_us_stock_name() -> pd.DataFrame:
    """
    Fetch the English name, Chinese name and symbol of the U.S. stocks
    listed at http://finance.sina.com.cn/stock/usstock/sector.shtml

    The ``symbol`` column is the value to feed into the follow-up functions.

    Side effect: the ``"page"`` entry of the module-level
    ``us_sina_stock_dict_payload`` is overwritten before every request.

    :return: columns ``name``, ``cname`` and ``symbol``, one row per stock;
        an empty frame with those columns when no page is reported
    :rtype: pandas.DataFrame
    """
    columns = ["name", "cname", "symbol"]
    page_count = get_us_page_count()
    # Collect one frame per page and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and re-copied the
    # accumulated frame on every call.
    frames = []
    for page in tqdm(range(1, page_count + 1)):
        us_js_decode = "US_CategoryService.getList?page={}&num=20&sort=&asc=0&market=&id=".format(
            page)
        js_code = py_mini_racer.MiniRacer()
        js_code.eval(js_hash_text)
        # Run the JS routine "d" on the query string; its result is
        # substituted into the URL template below.
        dict_list = js_code.call("d", us_js_decode)
        us_sina_stock_dict_payload.update({"page": "{}".format(page)})
        res = requests.get(us_sina_stock_list_url.format(dict_list),
                           params=us_sina_stock_dict_payload)
        # Keep only the text between the "(" of the first "({" and the
        # last ");", which is the JSON document.
        data_json = json.loads(res.text[res.text.find("({") +
                                        1:res.text.rfind(");")])
        frames.append(pd.DataFrame(data_json["data"]))
    if not frames:
        # Nothing was fetched: return an empty frame with the expected
        # columns instead of failing on the column selection.
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)[columns]
Esempio n. 2
0
async def _get_us_stock_name_async(
        request_per_batch: int = 15) -> pd.DataFrame:
    """
    Asynchronously fetch the name, cname and symbol of the U.S. stocks,
    requesting the pages in concurrent batches.

    Every batch is awaited in full before the next one is started. Pages
    whose request raises ``requests.exceptions.ConnectionError`` are put
    back in the queue and retried in a later batch (there is no retry
    limit); any other exception propagates to the caller.

    Side effect: the ``"page"`` entry of the module-level
    ``us_sina_stock_dict_payload`` is overwritten for every page.

    :param request_per_batch: maximum number of concurrent requests per
        batch; values below 1 are treated as 1
    :return: columns ``name``, ``cname`` and ``symbol``, rows in page
        order; an empty frame with those columns when no page is reported
    :rtype: pandas.DataFrame
    """
    count_per_page = 20
    columns = ["name", "cname", "symbol"]
    page_count = get_us_page_count(count_per_page)
    # A non-positive batch size would never make progress.
    batch_size = max(1, request_per_batch)
    page_frames = {}

    start = time.time()
    with tqdm(total=page_count) as pbar:
        pending_pages = list(range(1, page_count + 1))
        while pending_pages:
            batch = pending_pages[:batch_size]
            pending_pages = pending_pages[batch_size:]

            tasks = {}
            for page in batch:
                us_js_decode = f"US_CategoryService.getList?page={page}&num={count_per_page}&sort=&asc=0&market=&id="
                js_code = py_mini_racer.MiniRacer()
                js_code.eval(js_hash_text)
                # Run the JS routine "d" on the query string; its result
                # is substituted into the URL template below.
                dict_list = js_code.call("d", us_js_decode)
                us_sina_stock_dict_payload.update(
                    {"page": "{}".format(page)})
                # Pass a snapshot of the payload: the coroutine only starts
                # running once this function awaits, and by then the shared
                # dict already holds the page number of a later request.
                tasks[page] = asyncio.create_task(
                    request(us_sina_stock_list_url.format(dict_list),
                            dict(us_sina_stock_dict_payload)))

            failed_pages = []
            for page_no, task in tasks.items():
                try:
                    # Awaiting inside the try block is what lets a failed
                    # request reach the retry handler below.
                    res = await task
                except requests.exceptions.ConnectionError as ident:
                    failed_pages.append(page_no)
                    print(
                        f'{ident} page_no={page_no}, sleep for longer time and try in next batch'
                    )
                    continue
                # Keep only the text between the "(" of the first "({" and
                # the last ");", which is the JSON document.
                data_json = json.loads(res[res.find("({") +
                                           1:res.rfind(");")])
                page_frames[page_no] = pd.DataFrame(data_json["data"])
                pbar.update(1)

            # Failed pages are retried after the pages still waiting.
            pending_pages.extend(failed_pages)
            if pending_pages:
                # Pause between batches, and longer after a failure, to stay
                # below the site's anti-spider limits. asyncio.sleep keeps
                # the event loop responsive, unlike time.sleep.
                interval_time = 10 if failed_pages else 3
                await asyncio.sleep(interval_time)

    end = time.time()
    print('Cost time:', end - start)

    if not page_frames:
        # Nothing was fetched: return an empty frame with the expected
        # columns instead of failing on the column selection.
        return pd.DataFrame(columns=columns)
    ordered_frames = [page_frames[page_no] for page_no in sorted(page_frames)]
    return pd.concat(ordered_frames, ignore_index=True)[columns]
Esempio n. 3
0
def get_us_page_count() -> int:
    """
    Return the number of 20-row pages needed to list all U.S. stocks.

    Requests page 1 of the stock list and derives the page total from the
    ``count`` field of the response.

    Side effect: the ``"page"`` entry of the module-level
    ``us_sina_stock_dict_payload`` is set to ``"1"``.

    :return: total number of pages
    :rtype: int
    """
    page = "1"
    us_js_decode = f"US_CategoryService.getList?page={page}&num=20&sort=&asc=0&market=&id="
    js_code = execjs.compile(js_hash_text)
    # Run the JS routine "d" on the query string; its result is substituted
    # into the URL template below.
    dict_list = js_code.call("d", us_js_decode)
    us_sina_stock_dict_payload.update({"page": "{}".format(page)})
    res = requests.get(us_sina_stock_list_url.format(dict_list),
                       params=us_sina_stock_dict_payload)
    # Keep only the text between the "(" of the first "({" and the last
    # ");", which is the JSON document.
    data_json = json.loads(res.text[res.text.find("({") +
                                    1:res.text.rfind(");")])
    # Integer ceiling division. True division always yields a float, so an
    # isinstance(..., int) test on it can never detect an exact multiple of
    # 20 and would report one page too many.
    full_pages, remainder = divmod(int(data_json["count"]), 20)
    page_count = full_pages + (1 if remainder else 0)
    return page_count
Esempio n. 4
0
def stock_us_spot():
    """
    Download every page of the U.S. stock list and return all rows.

    Side effect: the ``"page"`` entry of the module-level
    ``us_sina_stock_dict_payload`` is overwritten before every request.

    :return: all rows of all pages, with every column of the response's
        ``data`` field; an empty frame when no page is reported
    :rtype: pandas.DataFrame
    """
    page_count = get_us_page_count()
    # The JS source is the same for every page, so compile it once.
    js_code = execjs.compile(js_hash_text)
    # Collect one frame per page and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and re-copied the
    # accumulated frame on every call.
    frames = []
    for page in tqdm(range(1, page_count + 1)):
        us_js_decode = "US_CategoryService.getList?page={}&num=20&sort=&asc=0&market=&id=".format(
            page)
        # Run the JS routine "d" on the query string; its result is
        # substituted into the URL template below.
        dict_list = js_code.call("d", us_js_decode)
        us_sina_stock_dict_payload.update({"page": "{}".format(page)})
        res = requests.get(us_sina_stock_list_url.format(dict_list),
                           params=us_sina_stock_dict_payload)
        # Keep only the text between the "(" of the first "({" and the
        # last ");", which is the JSON document.
        data_json = json.loads(res.text[res.text.find("({") +
                                        1:res.text.rfind(");")])
        frames.append(pd.DataFrame(data_json["data"]))
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
Esempio n. 5
0
def get_us_page_count(count_per_page: int = 20) -> int:
    """
    Sina Finance - U.S. stocks - total number of pages
    http://finance.sina.com.cn/stock/usstock/sector.shtml

    Side effect: the ``"num"`` and ``"page"`` entries of the module-level
    ``us_sina_stock_dict_payload`` are overwritten.

    :param count_per_page: number of rows requested per page
    :return: total number of U.S. stock pages
    :rtype: int
    """
    page = "1"
    us_js_decode = f"US_CategoryService.getList?page={page}&num={count_per_page}&sort=&asc=0&market=&id="
    us_sina_stock_dict_payload["num"] = f"{count_per_page}"

    # Run the JS routine "d" on the query string; its result is substituted
    # into the URL template below.
    engine = py_mini_racer.MiniRacer()
    engine.eval(js_hash_text)
    url_token = engine.call("d", us_js_decode)

    us_sina_stock_dict_payload["page"] = f"{page}"
    target_url = us_sina_stock_list_url.format(url_token)
    response = requests.get(target_url, params=us_sina_stock_dict_payload)

    # Keep only the text between the "(" of the first "({" and the last
    # ");", which is the JSON document.
    body = response.text
    json_start = body.find("({") + 1
    json_end = body.rfind(");")
    parsed = json.loads(body[json_start:json_end])

    total_rows = int(parsed["count"])
    return math.ceil(total_rows / count_per_page)
Esempio n. 6
0
def stock_us_spot() -> pd.DataFrame:
    """
    Sina Finance - data of all U.S. stocks; note the 15-minute delay

    Side effect: the ``"page"`` entry of the module-level
    ``us_sina_stock_dict_payload`` is overwritten before every request.

    :return: quotes of all U.S. stocks, with every column of the response's
        ``data`` field; an empty frame when no page is reported
    :rtype: pandas.DataFrame
    """
    page_count = get_us_page_count()
    # Collect one frame per page and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and re-copied the
    # accumulated frame on every call.
    frames = []
    for page in tqdm(range(1, page_count + 1)):
        us_js_decode = "US_CategoryService.getList?page={}&num=20&sort=&asc=0&market=&id=".format(
            page)
        js_code = py_mini_racer.MiniRacer()
        js_code.eval(js_hash_text)
        # Run the JS routine "d" on the query string; its result is
        # substituted into the URL template below.
        dict_list = js_code.call("d", us_js_decode)
        us_sina_stock_dict_payload.update({"page": "{}".format(page)})
        res = requests.get(us_sina_stock_list_url.format(dict_list),
                           params=us_sina_stock_dict_payload)
        # Keep only the text between the "(" of the first "({" and the
        # last ");", which is the JSON document.
        data_json = json.loads(res.text[res.text.find("({") +
                                        1:res.text.rfind(");")])
        frames.append(pd.DataFrame(data_json["data"]))
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)