# Imports these functions rely on (normally collected at the top of the
# module). ct, cct, status_dict and get_sina_url are package-local helpers
# assumed to be defined elsewhere in this file/package.
import json
import re
import time
import lxml.html
from lxml import etree
import pandas as pd
from pandas import DataFrame
from bs4 import BeautifulSoup
try:
    from cStringIO import StringIO   # Python 2
except ImportError:
    from io import StringIO          # Python 3
try:
    from urllib2 import urlopen      # Python 2
except ImportError:
    from urllib.request import urlopen   # Python 3


def _today_ticks(symbol, tdate, pageNo, retry_count, pause):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            html = lxml.html.parse(ct.TODAY_TICKS_URL %
                                   (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                    ct.PAGES['t_ticks'], symbol, tdate,
                                    pageNo))
            res = html.xpath('//table[@id="datatbl"]/tbody/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = '<table>%s</table>' % ''.join(sarr)
            sarr = sarr.replace('--', '0')   # placeholder dashes -> numeric 0
            df = pd.read_html(StringIO(sarr), parse_dates=False)[0]
            df.columns = ct.TODAY_TICK_COLUMNS
            df['pchange'] = df['pchange'].map(lambda x: x.replace('%', ''))
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
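# Usage sketch (illustrative values; the real column set comes from
# ct.TODAY_TICK_COLUMNS, defined elsewhere in the package):
#
#   df = _today_ticks('sh600000', '2016-01-08', 1, retry_count=3, pause=0.001)
#   print(df.head())
#
# symbol must already carry Sina's exchange prefix (e.g. 'sh600000'); public
# wrappers normally build it with a _code_to_symbol-style helper first.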
def _parsing_sina_dd_price_json(url):
    """
    Parse one page of Sina's intraday big-deal (大单) feed, which is served
    as pseudo-JSON with unquoted keys.
    Parameters
    ------
        url: fully built Sina page URL for one page of data
    return
    -------
        DataFrame of that page's trade records, or '' when the response is
        too short to contain data
    """
    ct._write_console()
    text = cct.get_url_data(url)
    if len(text) < 10:
        return ''
    # Quote the bare keys (",key:" -> ',"key":') so the payload becomes
    # strict JSON, and rename the leading symbol key to code.
    reg = re.compile(r'\,(.*?)\:')
    text = reg.sub(r',"\1":', text.decode('gbk') if ct.PY3 else text)
    text = text.replace('"{symbol', '{"code')
    text = text.replace('{symbol', '{"code"')
    # (The former json.dumps/json.loads round-trip was a no-op on a string
    # and has been dropped.)
    df = pd.DataFrame(pd.read_json(text, dtype={'code': object}),
                      columns=ct.DAY_REAL_DD_COLUMNS)
    df = df.drop('symbol', axis=1)
    df = df[df.volume > 0]   # .ix is deprecated; plain boolean indexing
    return df
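# A minimal, self-contained sketch of the key-quoting step above. Sina
# returns JavaScript object literals with bare keys; the regex rewrites
# ",key:" into ',"key":' so a strict JSON parser accepts the payload. The
# default sample string is illustrative, not real feed output.
def _demo_quote_sina_keys(raw='[{symbol:"sh600000",name:"demo",volume:12000}]'):
    fixed = re.sub(r'\,(.*?)\:', r',"\1":', raw)   # quote keys after commas
    fixed = fixed.replace('{symbol', '{"code"')    # quote/rename the first key
    return json.loads(fixed)   # -> [{'code': 'sh600000', 'name': 'demo', 'volume': 12000}]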
def get_sina_all_dd(vol='0', type='0', retry_count=3, pause=0.001):
    if len(vol) != 1 or len(type) != 1:
        return None
    print("Vol:%s Type:%s" % (ct.DD_VOL_List[vol], ct.DD_TYPE_List[type]))
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            ct._write_console()
            url = get_sina_url(vol, type)
            html_doc = urlopen(url).read()
            if ct.PY3:
                html_doc = html_doc.decode('gbk', 'ignore')  # Sina serves GBK
            # The page embeds the total row count as window["fillCount"](N);
            # when there are many rows, re-request with an explicit
            # pageCount, capped at 10000 to keep the response bounded.
            pageCount = re.findall(r'fillCount\"\]\((\d+)', html_doc, re.S)
            if len(pageCount) > 0:
                pageCount = pageCount[0]
                if int(pageCount) > 100:
                    if int(pageCount) > 10000:
                        print("BigBig:%s" % pageCount)
                        pageCount = '10000'
                    print("AllBig:%s" % pageCount)
                    html_doc = urlopen(get_sina_url(vol, type,
                                                    pageCount=pageCount)).read()
                    if ct.PY3:
                        html_doc = html_doc.decode('gbk', 'ignore')
            soup = BeautifulSoup(html_doc, "lxml")
            sdata = []
            row = soup.find_all('div', id='divListTemplate')
            if len(row) >= 1:
                # Data rows carry a class attribute ("gray" or ""); header
                # rows do not, so filter on the presence of class.
                for tag in row[0].find_all('tr', attrs={"class": True}):
                    th_cells = tag.find_all('th')
                    td_cells = tag.find_all('td')
                    m_name = th_cells[0].find(text=True)
                    m_code = th_cells[1].find(text=True)
                    m_time = th_cells[2].find(text=True)
                    m_price = td_cells[0].find(text=True)
                    # volume is quoted in lots (手); convert to shares
                    m_vol = float(td_cells[1].find(text=True)
                                  .replace(',', '')) * 100
                    m_pre_p = td_cells[2].find(text=True)
                    m_status_t = th_cells[3].find(text=True)
                    # Map the CHN status label; fall back to the raw text so
                    # a missing key cannot leak the previous row's status.
                    m_status = status_dict.get(m_status_t, m_status_t)
                    sdata.append({'code': m_code, 'time': m_time,
                                  'vol': m_vol, 'price': m_price,
                                  'pre_p': m_pre_p, 'status': m_status,
                                  'name': m_name})
            df = DataFrame(sdata, columns=['code', 'time', 'vol', 'price',
                                           'pre_p', 'status', 'name'])
        except Exception as e:
            print("Except:%s" % e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
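# Usage sketch, assuming ct.DD_VOL_List / ct.DD_TYPE_List are the
# single-character-indexed label tables this package defines elsewhere:
#
#   df = get_sina_all_dd(vol='0', type='0')
#   if df is not None:
#       print(df.head())
#
# vol and type must be one character each (indexes into those tables);
# anything longer returns None by design.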
def get_lhb_dd(retry_count=3, pause=0.001):
    """Work in progress: fetch the LHB (龙虎榜, daily top-billboard) trade
    detail page from eastmoney. The row parsing below was carried over from
    get_sina_all_dd and has not been adapted to eastmoney's markup
    (<table>/<thead class="h101">), so this will normally return an empty
    DataFrame until the selectors are updated."""
    lhburl = 'http://data.eastmoney.com/stock/tradedetail.html'
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            ct._write_console()
            html_doc = cct.get_url_data_R(lhburl)
            soup = BeautifulSoup(html_doc, "lxml")
            # type="tab1" filters on an HTML `type` attribute, which the
            # eastmoney tables may not carry; this selector still needs work.
            abc = soup.find_all('table', type="tab1")
            print(len(abc))
            sdata = []
            row = soup.find_all('div', id='divListTemplate')
            if len(row) >= 1:
                for tag in row[0].find_all('tr', attrs={"class": True}):
                    th_cells = tag.find_all('th')
                    td_cells = tag.find_all('td')
                    m_name = th_cells[0].find(text=True)
                    m_code = th_cells[1].find(text=True)
                    m_time = th_cells[2].find(text=True)
                    m_price = td_cells[0].find(text=True)
                    m_vol = float(td_cells[1].find(text=True)
                                  .replace(',', '')) * 100
                    m_pre_p = td_cells[2].find(text=True)
                    m_status_t = th_cells[3].find(text=True)
                    m_status = status_dict.get(m_status_t, m_status_t)
                    sdata.append({'code': m_code, 'time': m_time,
                                  'vol': m_vol, 'price': m_price,
                                  'pre_p': m_pre_p, 'status': m_status,
                                  'name': m_name})
            df = DataFrame(sdata, columns=['code', 'time', 'vol', 'price',
                                           'pre_p', 'status', 'name'])
        except Exception as e:
            print("Except:%s" % e)
            import traceback
            traceback.print_exc()
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
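# get_lhb_dd is unfinished: until its selectors are adapted to eastmoney's
# layout it will typically parse zero rows. A cautious caller treats an
# empty frame as "no data":
#
#   df = get_lhb_dd()
#   if df is None or df.empty:
#       print('no LHB trade detail parsed')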