def get_minutes_data(stock_code, k_time, num=300):
    # Fetch minute-level candles from the Tencent mkline API; k_time is the candle period
    # in minutes and is capped at 60.
    if k_time > 60:
        k_time = 60
    url = 'http://ifzq.gtimg.cn/appstock/app/kline/mkline?param=%s,m%s,,%s&_var=m%s_today&r=0.%s'
    url = url % (stock_code, k_time, num, k_time, _random())
    req = Download(url)
    content = req.get_html_text()

    # === Convert the response text into a dict
    content = content.split('=', maxsplit=1)[-1]
    content = json.loads(content)

    # === Convert the data into a DataFrame
    k_data = content['data'][stock_code]['m' + str(k_time)]
    df = pd.DataFrame(k_data)

    # === Tidy up the data
    rename_dict = {0: 'candle_end_time', 1: 'open', 2: 'close', 3: 'high', 4: 'low', 5: 'amount'}  # amount is in lots (手)
    df.rename(columns=rename_dict, inplace=True)
    df['candle_end_time'] = df['candle_end_time'].apply(
        lambda x: '%s-%s-%s %s:%s' % (x[0:4], x[4:6], x[6:8], x[8:10], x[10:12]))
    df['candle_end_time'] = pd.to_datetime(df['candle_end_time'])
    df = df[['candle_end_time', 'open', 'high', 'low', 'close', 'amount']]
    return df
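# A minimal usage sketch (not part of the original module): _demo_get_minutes_data is a
# hypothetical helper that assumes the Download class and _random() used above work as-is
# and that the Tencent mkline endpoint is reachable.
def _demo_get_minutes_data():
    # 100 thirty-minute candles for the Shanghai Composite index; k_time above 60 is capped to 60
    df_30m = get_minutes_data('sh000001', k_time=30, num=100)
    print(df_30m.tail())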
def get_list_from_js():
    sk_download = Download(data_url)
    text = sk_download.get_html_text()
    # Each match is a (6-digit code, following quoted field) pair pulled out of the JS list
    pattern = re.compile(r'.*?"(\d{6})".*?"(.*?)".*?;', re.S)
    lst = re.findall(pattern, text)
    return lst
def get_content_from_internet(url, max_try_num=10, sleep_time=5):
    """
    Fetch a web page via the Download helper, retrying on failure.
    :param url: the URL to fetch
    :param max_try_num: maximum number of attempts
    :param sleep_time: seconds to pause after a failed attempt
    :return: the fetched page content
    """
    get_success = False  # whether the content was fetched successfully
    # Fetch the content
    req = Download(url)
    for i in range(max_try_num):
        content = req.get_html_text()
        if content != '':
            get_success = True  # fetched successfully
            break
        else:
            print('Failed to fetch data, attempt:', i + 1)
            time.sleep(sleep_time)
    # Check whether the content was fetched successfully
    if get_success:
        return content
    else:
        raise ValueError('Fetching the page kept failing and the retry limit was reached; stopping the program, please investigate')
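# A minimal usage sketch (hypothetical, not part of the original module): fetch a page with
# a tighter retry budget; the URL below is only a placeholder.
def _demo_get_content_from_internet():
    # Try at most 3 times, pausing 2 seconds between failed attempts
    html = get_content_from_internet('http://example.com', max_try_num=3, sleep_time=2)
    print(len(html))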
def process_func(url, queue):
    # Worker used by the multiprocessing pool in get_list_from_tencent: download one ranking
    # page and push every stock code found on it (e.g. 'sh600000', 'sz000001') onto the queue.
    sk_download = Download(url)
    text = sk_download.get_html_text()
    pattern = re.compile(r's[hz]\d{6}')
    results = re.findall(pattern, text)
    for data in results:
        queue.put(data)
        time.sleep(0.1)
    print('%s is ok' % url)
def get_day_data(stock_code, k_type, num=600):
    '''
    stock_code = 'sh000001'  # regular stock e.g. sz000001, index sh000001, ETF sh510500
    k_type = 'day'           # day, week, month for daily, weekly, monthly candles
    num = 30000              # stocks are capped at 640; indexes, ETFs, etc. have no limit
    '''
    # Build the URL
    url = 'http://web.ifzq.gtimg.cn/appstock/app/fqkline/get?_var=kline_%sqfq&param=%s,%s,,,%s,qfq&r=0.%s'
    url = url % (k_type, stock_code, k_type, num, _random())
    req = Download(url)
    content = req.get_html_text()
    print(content)

    # === Convert the response text into a dict
    content = content.split('=', maxsplit=1)[-1]
    content = json.loads(content)

    # === Convert the data into a DataFrame
    k_data = content['data'][stock_code]
    if k_type in k_data:
        k_data = k_data[k_type]
    elif 'qfq' + k_type in k_data:  # 'qfq' is short for forward-adjusted prices
        k_data = k_data['qfq' + k_type]
    else:
        raise ValueError('None of the expected keys exist in the dict, please check the data')
    df = pd.DataFrame(k_data)

    # === Tidy up the data
    rename_dict = {0: 'candle_end_time', 1: 'open', 2: 'close', 3: 'high', 4: 'low',
                   5: 'amount', 6: 'info'}  # amount is in lots (手), so the data is not very precise
    df.rename(columns=rename_dict, inplace=True)
    df['candle_end_time'] = pd.to_datetime(df['candle_end_time'])
    if 'info' not in df:
        df['info'] = None
    df = df[['candle_end_time', 'open', 'high', 'low', 'close', 'amount', 'info']]
    print(df)
    return df
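# A minimal usage sketch (hypothetical, not part of the original module): the codes and
# limits below follow the docstring of get_day_data; assumes Download and _random() work.
def _demo_get_day_data():
    # Daily forward-adjusted candles for a regular stock (stocks are capped at 640 rows)
    df_day = get_day_data('sz000001', k_type='day', num=600)
    # Weekly candles for an index (indexes and ETFs have no row limit)
    df_week = get_day_data('sh000001', k_type='week', num=1000)
    print(df_day.shape, df_week.shape)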
def get_stock_data(code):
    url = sina_get_stock_url + code
    sk_data = dict()
    req = Download(url)
    text = req.get_html_text()
    if text == '':
        return None
    results = re.findall(r'"(.*?)"', text, re.S)
    data = results[0].split(',')
    if code[:2] == 'sz' or code[:2] == 'sh':
        # Mainland (sh/sz) codes: field layout of the Sina quote string
        sk_data.update({'name': data[0]})
        sk_data.update({'open': data[1]})
        sk_data.update({'close': data[2]})
        sk_data.update({'cur_price': data[3]})
        sk_data.update({'high': data[4]})
        sk_data.update({'low': data[5]})
        sk_data.update({'number': data[8]})
        sk_data.update({'money': data[9]})
        sk_data.update({'flush_time': data[30] + ' ' + data[31]})
        cur = float(data[3])
        close = float(data[2])
    else:
        # Other codes use a different field layout
        print(data)
        sk_data.update({'name': data[0]})
        sk_data.update({'open': data[2]})
        sk_data.update({'close': data[5]})
        sk_data.update({'cur_price': data[8]})
        sk_data.update({'high': data[3]})
        sk_data.update({'low': data[4]})
        sk_data.update({'number': data[13]})
        sk_data.update({'money': data[14]})
        sk_data.update({'flush_time': data[17]})
        cur = float(data[8])
        close = float(data[5])
    # Percentage change of cur_price relative to the close field, rounded to 2 decimals
    p_change = round(100 * (cur - close) / close, 2)
    sk_data.update({'p_change': p_change})
    return sk_data
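# A minimal usage sketch (hypothetical, not part of the original module): assumes
# sina_get_stock_url and Download are defined as used in get_stock_data above.
def _demo_get_stock_data():
    quote = get_stock_data('sh600000')
    if quote is not None:
        print(quote['name'], quote['cur_price'], quote['p_change'])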
def get_real_time_data(codes):
    # ===== Fetch the data
    url = sina_get_stock_url + ','.join(codes)
    req = Download(url)
    content = req.get_html_text()
    if content == '':
        return None

    # ===== Convert the data into a DataFrame
    content = content.strip()  # strip leading/trailing spaces, newlines, etc.
    data_line = content.split('\n')  # one stock per line
    data_line = [i.replace('var hq_str_', '').split(',') for i in data_line]
    df = pd.DataFrame(data_line, dtype='float')

    # ===== Tidy up the DataFrame
    df[0] = df[0].str.split('="')
    df['code'] = df[0].str[0].str.strip()
    df['name'] = df[0].str[-1].str.strip()
    df['date'] = df[30] + ' ' + df[31]  # stock-market candles are conventionally labelled with the candle's end time
    df['date'] = pd.to_datetime(df['date'])
    rename_dict = {1: 'open', 2: 'preclose', 3: 'close', 4: 'high', 5: 'low', 6: 'buy1',
                   7: 'sell1', 8: 'amount', 9: 'volume', 32: 'status'}  # amount is in shares, volume is in yuan
    df.rename(columns=rename_dict, inplace=True)
    df['status'] = df['status'].astype(str).str.strip('";')
    df = df[['code', 'name', 'date', 'open', 'high', 'low', 'close', 'preclose',
             'amount', 'volume', 'buy1', 'sell1', 'status']]
    return df
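# A minimal usage sketch (hypothetical, not part of the original module): request several
# codes in one call; assumes sina_get_stock_url and Download are defined as used above.
def _demo_get_real_time_data():
    df_quotes = get_real_time_data(['sh000001', 'sz000001', 'sh510500'])
    if df_quotes is not None:
        print(df_quotes[['code', 'name', 'close', 'preclose', 'amount']])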
def get_list_from_tencent():
    params = {
        'appn': 'rank',
        't': 'ranka/chr',
        'p': '0',
        'o': '-1',
        'l': '80',
        'v': 'list_data'
    }
    data = urllib.parse.urlencode(params)
    url = tencent_url + data
    sk_download = Download(url)
    text = sk_download.get_html_text()
    if text == '':
        print('get text fail..url: %s' % url)
        return None
    # The first page reports how many pages there are; page through them with the 'p' parameter
    total = re.search(r'.*?total:(\d+),', text).group(1)
    print(int(total))
    pool = multiprocessing.Pool(processes=10)
    queue = multiprocessing.Manager().Queue()
    for index in range(0, int(total)):
        params['p'] = index
        data = urllib.parse.urlencode(params)
        url = tencent_url + data
        print('add %s to pool' % url)
        pool.apply_async(process_func, (url, queue))
    pool.close()
    pool.join()
    # Drain the queue, dropping duplicate codes
    lst = []
    print(queue.qsize())
    for i in range(queue.qsize()):
        data = queue.get()
        if data not in lst:
            lst.append(data)
    return lst
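# A minimal usage sketch (hypothetical, not part of the original module): assumes tencent_url
# and Download are defined as used above and that the ranking endpoint still reports 'total'.
if __name__ == '__main__':
    code_list = get_list_from_tencent()
    if code_list:
        print('got %d codes, e.g. %s' % (len(code_list), code_list[:5]))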