# Imports assumed by this excerpt (they may already appear earlier in the
# original module); `cons as ct` is tushare's constants module.
import re
import json
import time
from io import StringIO
import lxml.html
from lxml import etree
import pandas as pd
from pandas import DataFrame
from bs4 import BeautifulSoup
try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib2 import urlopen  # Python 2


def _today_ticks(symbol, tdate, pageNo, retry_count, pause):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            html = lxml.html.parse(ct.TODAY_TICKS_URL % (ct.P_TYPE['http'],
                                                         ct.DOMAINS['vsf'],
                                                         ct.PAGES['t_ticks'],
                                                         symbol, tdate, pageNo))
            res = html.xpath('//table[@id="datatbl"]/tbody/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>' % sarr
            sarr = sarr.replace('--', '0')  # Sina marks missing values as '--'
            df = pd.read_html(StringIO(sarr), parse_dates=False)[0]
            df.columns = ct.TODAY_TICK_COLUMNS
            df['pchange'] = df['pchange'].map(lambda x: x.replace('%', ''))
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
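# A minimal usage sketch for _today_ticks (not in the original module): the
# symbol, date and paging values below are illustrative assumptions.
def _demo_today_ticks():
    # One page of intraday ticks for a Shenzhen-listed symbol on a past date.
    df = _today_ticks('sz000001', '2016-01-08', 1, retry_count=3, pause=0.001)
    print(df.head())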
def _parsing_sina_dd_price_json(url):
    """
    Parse one page of today's block-trade (dadan) quotes, returned as JSON.
    Parameters
    ------
        url : the paged Sina quote URL to fetch
    return
    -------
    DataFrame
        all of today's trade records for the page
    """
    ct._write_console()
    text = _get_url_data(url)
    if len(text) < 10:
        return ''
    # Sina returns pseudo-JSON with unquoted keys; quote them so json can parse it.
    reg = re.compile(r'\,(.*?)\:')
    text = reg.sub(r',"\1":', text.decode('gbk') if ct.PY3 else text)
    text = text.replace('"{symbol', '{"code')
    text = text.replace('{symbol', '{"code"')
    if ct.PY3:
        jstr = json.dumps(text)
    else:
        jstr = json.dumps(text, encoding='GBK')
    js = json.loads(jstr)
    df = pd.DataFrame(pd.read_json(js, dtype={'code': object}),
                      columns=ct.DAY_REAL_DD_COLUMNS)
    df = df.drop('symbol', axis=1)
    df = df[df.volume > 0]  # DataFrame.ix is gone from modern pandas
    return df
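# Hedged sketch (not in the original module): illustrates what the regex in
# _parsing_sina_dd_price_json does to Sina's bare-key pseudo-JSON. The sample
# payload is an assumption modeled on the substitutions above, not captured
# from the live endpoint.
def _demo_quote_sina_keys():
    sample = '[{symbol:"sz000001",name:"PAB",volume:12000}]'
    quoted = re.compile(r'\,(.*?)\:').sub(r',"\1":', sample)
    quoted = quoted.replace('{symbol', '{"code"')
    print(quoted)  # -> [{"code":"sz000001","name":"PAB","volume":12000}]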
def get_sina_all_dd(vol='0', type='0', retry_count=3, pause=0.001):
    """
    Fetch all of today's Sina block-trade (dadan) data.
    NOTE: this lxml-based draft is shadowed by the BeautifulSoup-based
    redefinition of the same name below.
    Parameters
    ------
        vol : string
            volume bucket, e.g. 0=400, 1=1k, 2=2k, 3=5k, 4=1w (lots)
        type : string
            filter bucket keyed into ct.DD_TYPE_List,
            e.g. 0=5T, 1=10, 2=20, 3=50, 4=100T
        retry_count : int, default 3
            number of retries after network failures
        pause : float, default 0.001
            seconds to sleep between retries, to avoid hammering the server
    return
    -------
    DataFrame
        today's block trades: code, name, time, price, volume,
        previous price, type (buy / sell / neutral)
    """
    if len(vol) != 1 or len(type) != 1:
        return None
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            ct._write_console()
            url = ct.SINA_DD_VRatio % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                       ct.PAGES['sinadd_all'],
                                       ct.DD_VOL_List[vol], ct.DD_TYPE_List[type])
            html = lxml.html.parse(url)
            res = html.xpath('//div[@class="main"]/div[@id="divListTemplate"]'
                             '/table/tbody/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = '<table>%s</table>' % ''.join(sarr)
            sarr = sarr.replace('--', '0')
            df = pd.read_html(StringIO(sarr), parse_dates=False)[0]
            df.columns = ct.TODAY_TICK_COLUMNS  # column mapping per the draft's commented intent
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
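# Hedged reconstructions (assumptions, not from the original source): the
# redefinition below relies on a `status_dict` mapping and a `get_sina_url`
# helper that are not part of this excerpt. These sketches are inferred from
# how they are called; the Chinese status labels and the pageCount query
# parameter name are guesses.
status_dict = {
    u'买盘': 'buy',        # assumed Sina label for buy-side trades
    u'卖盘': 'sell',       # assumed Sina label for sell-side trades
    u'中性盘': 'neutral',  # assumed Sina label for neutral trades
}


def get_sina_url(vol, type, pageCount=None):
    # Same URL the lxml draft above builds from the ct constants.
    url = ct.SINA_DD_VRatio % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                               ct.PAGES['sinadd_all'],
                               ct.DD_VOL_List[vol], ct.DD_TYPE_List[type])
    if pageCount is not None:
        url += '&num=%s' % pageCount  # query parameter name is an assumption
    return url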
def get_sina_all_dd(vol="0", type="0", retry_count=3, pause=0.001): if len(vol) != 1 or len(type) != 1: return None else: print ("Vol:%s Type:%s" % (ct.DD_VOL_List[vol], ct.DD_TYPE_List[type])) # symbol = _code_to_symbol(code) for _ in range(retry_count): time.sleep(pause) try: ct._write_console() url = get_sina_url(vol, type) # url= ct.SINA_DD_VRatio % (ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['sinadd_all'],ct.DD_VOL_List[vol], ct.DD_TYPE_List[type]) page = urllib2.urlopen(url) html_doc = page.read() # print (html_doc) # soup = BeautifulSoup(html_doc,fromEncoding='gb18030') # print html_doc pageCount = re.findall('fillCount"\]\((\d+)', html_doc, re.S) if len(pageCount) > 0: start_t = time.time() pageCount = pageCount[0] if int(pageCount) > 100: if int(pageCount) > 10000: print "BigBig:", pageCount pageCount = "10000" print "AllBig:", pageCount html_doc = urllib2.urlopen(get_sina_url(vol, type, pageCount=pageCount)).read() print (time.time() - start_t) soup = BeautifulSoup(html_doc, "lxml") print (time.time() - start_t) # abc= (soup.find_all('script',type="text/javascript")) # print(len(abc)) # print (abc[4].text).strip().find('window["fillCount"]') # print abc[4].contents # pageCount= soup.find_all(string=re.compile('fillCount\"\]\((\d+)')) # pageCount=re.findall('(\d+)',pageCount[0]) # sys.exit(0) # print soup.find_all('__stringHtmlPages') # sys.exit(0) # soup = BeautifulSoup(html_doc.decode('gb2312','ignore')) # print soup.find_all('div', id="divListTemplate") # for i in soup.find_all('tr',attrs={"class": "gray"."class":""}): alldata = {} dict_data = {} # print soup.find_all('div',id='divListTemplate') row = soup.find_all("div", id="divListTemplate") sdata = [] if len(row) >= 1: """ colums:CHN name """ # firstCells = row[0].find('tr') # th_cells = firstCells.find_all('th') # td_cells = firstCells.find_all('td') # m_name=th_cells[0].find(text=True) # m_code=th_cells[1].find(text=True) # m_time=th_cells[2].find(text=True) # m_status=th_cells[3].find(text=True) # m_detail=th_cells[4].find(text=True) # m_price=td_cells[0].find(text=True) # m_vol=td_cells[1].find(text=True) # m_pre_p=td_cells[2].find(text=True) # print "m_name:",m_name,m_pre_p for tag in row[0].find_all("tr", attrs={"class": True}): # print tag th_cells = tag.find_all("th") td_cells = tag.find_all("td") m_name = th_cells[0].find(text=True) m_code = th_cells[1].find(text=True) m_time = th_cells[2].find(text=True) # m_detail=(th_cells[4]).find('a')["href"] #detail_url m_price = td_cells[0].find(text=True) m_vol = float(td_cells[1].find(text=True).replace(",", "")) * 100 m_pre_p = td_cells[2].find(text=True) m_status_t = th_cells[3].find(text=True) if m_status_t in status_dict.keys(): m_status = status_dict[m_status_t] # print m_status sdata.append( { "code": m_code, "time": m_time, "vol": m_vol, "price": m_price, "pre_p": m_pre_p, "status": m_status, "name": m_name, } ) # sdata.append({'code':m_code,'time':m_time,'vol':m_vol,'price':m_price,'pre_p':m_pre_p,'detail':m_detail,'status':m_status,'name':m_name}) # print sdata # print m_name # break # pd = DataFrame(sdata,columns=['code','time','vol','price','pre_p','detail','status','name']) df = DataFrame(sdata, columns=["code", "time", "vol", "price", "pre_p", "status", "name"]) # for row in soup.find_all('tr',attrs={"class":"gray","class":""}): except Exception as e: print "Except:", (e) else: return df raise IOError(ct.NETWORK_URL_ERROR_MSG)