# -*- coding: utf-8 -*-
import json
import random
import re
import time

from bs4 import BeautifulSoup

import httptool  # project-local HTTP helper; getResponseHtml(url) returns the page body or None


def crawler_pages(district, url):
    try:
        html = httptool.getResponseHtml(url)
        soup = BeautifulSoup(html)
        # total page count is the denominator of the "x/y" pager text
        totalPages = int(soup.find(name="span", attrs={'class': 'fy_text'}).getText().split('/')[1])
        # alternative: derive the page count from the total record count
        # totalNumber = int(soup.find(name="span", attrs={'class': 'number orange'}).getText())
        # totalPages = (totalNumber / 20) if totalNumber % 20 == 0 else (totalNumber / 20 + 1)
        # loop over the pages to collect communities
        shequlist = []
        for pn in range(totalPages):
            parts = url.split('_')
            parts[-3] = str(pn + 1)  # the page number sits third from the end of the URL
            url = '_'.join(parts)
            print url
            shequlist = shequlist + crawl_page(district, url)
            time.sleep(random.randint(1, 3))
        return shequlist
    except Exception as e:
        print "Exception in crawler_pages ", e
        return []  # keep list concatenation in callers safe
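# httptool is a project-local module that is not shown in this file; the
# crawlers only rely on httptool.getResponseHtml(url) returning the page body,
# or None on failure. A rough stand-in with the same contract, should the
# module be missing -- the User-Agent and timeout here are assumptions:
def _get_response_html(url):
    import urllib2
    try:
        req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        return urllib2.urlopen(req, timeout=10).read()
    except Exception:
        return None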
def fetch_detail(dt, tik, counter):
    url = 'http://data.eastmoney.com/stock/lhb,#dt#,#tik#.html'.replace('#dt#', dt).replace('#tik#', tik)
    html = httptool.getResponseHtml(url)
    sleeptime = random.randint(1, 3)
    time.sleep(sleeptime)
    print dt, tik, sleeptime, counter
    if html is None:
        print 'bad response :', dt, tik
        return [], []
    soup = BeautifulSoup(html)
    buy_tab = soup.find(name="table", attrs={'id': 'tab-2'})
    sell_tab = soup.find(name="table", attrs={'id': 'tab-4'})
    buy_rank = parse_table(buy_tab, dt, tik, 'buy')
    sell_rank = parse_table(sell_tab, dt, tik, 'sell')
    return buy_rank, sell_rank
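# parse_table is called above but not defined in this file. A minimal sketch of
# what it presumably does -- flatten one buy/sell ranking table into rows tagged
# with date, ticker, and direction; the exact column layout is an assumption:
def parse_table(tab, dt, tik, direction):
    rows = []
    if tab is None:
        return rows
    for tr in tab.findAll('tr'):
        cols = [td.getText() for td in tr.findAll('td')]
        if cols:  # header rows carry <th> cells only and yield an empty list
            rows.append([dt, tik, direction] + cols)
    return rows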
def fetch_winner_list2(dt='2016-12-30'):
    print 'start to get ', dt
    url = "http://datainterface3.eastmoney.com//EM_DataCenter_V3/api/LHBGGDRTJ/GetLHBGGDRTJ?tkn=eastmoney&mkt=0&dateNum=&startDateTime=#{dt}&endDateTime=#{dt}&sortRule=1&sortColumn=&pageNum=1&pageSize=200&cfg=lhbggdrtj".replace('#{dt}', dt)
    result = httptool.getResponseHtml(url)
    '''
    sample response:
    {"Message":"","Status":0,"Data":[{"TableName":"RptLhbXQMap","TotalPage":1,"ConsumeMSecond":0,"SplitSymbol":"|",
    "FieldName":"SCode,SName,ClosePrice,Chgradio,Dchratio,JmMoney,Turnover,Ntransac,Ctypedes,Oldid,Smoney,BMoney,ZeMoney,Tdate,JmRate,ZeRate,Ltsz,Rchange1dc,Rchange1do,
    Rchange2dc,Rchange2do,Rchange3dc,Rchange3do,Rchange5dc,Rchange5do,Rchange10dc,Rchange10do,Rchange15dc,Rchange15do,Rchange20dc,Rchange20do,Rchange30dc,Rchange30do,
    Rchange1m,Rchange3m,Rchange6m,Rchange1y,SumCount,JGBSumCount,JGSSumCount,JGBMoney,JGSMoney,JGJMMoney,DP",
    "Data":["000538|云南白药|76.15|9.9957||9084847.3|161382715|2119274|日涨幅偏离值达到7%的前五只证券|2445780|96344827.7|105429675|201774502.7|2016-12-30|5.63|125.03|79302033544.5|||||||||||||||||||||||||||实力游资买入,成功率72.18%",
    "000612|焦作万方|11.55|8.0449||210253710.31|850814366|75774853|日涨幅偏离值达到7%的前五只证券|2445781|165166791.68|375420501.99|540587293.67|2016-12-30|24.71|63.54|11448045389.55|||||||||||||||||||||||||||买一主买,成功率42.69%",
    "000635|英力特|30.72|-9.9912||-64527768.79|996865377|31485498|日跌幅偏离值达到7%的前五只证券|2445782|110720856.95|46193088.16|156913945.11|2016-12-30|-6.47|15.74|9310851133.44|||||||||||||||||||||||||||实力游资卖出,成功率11.67%"
    '''
    result = json.loads(result)
    seculist = result['Data'][0]['Data']
    lhb = []
    related_securities = []
    counter = 0
    for secuinfo in seculist:
        sf = secuinfo.split('|')
        secu = str(sf[0])
        name = sf[1]
        close = sf[2]
        chg = sf[3]  # change percentage
        dp = sf[-1]  # commentary
        jm = sf[5]  # LHB net buy amount, needs 1000
        mr = sf[11]  # LHB buy amount
        mc = sf[10]  # LHB sell amount
        ze = sf[12]  # LHB turnover
        turn = sf[6]  # total market turnover
        jmrate = sf[14]  # net buy amount as a share of total turnover
        zerate = sf[15]  # LHB turnover as a share of total turnover
        turn_rate = sf[4]  # turnover rate
        ltsz = sf[17]  # free-float market cap
        list_reason = sf[8]  # reason for making the list
        lhb.append([dt, secu, name.encode('GBK'), close, chg, dp.encode('GBK'),
                    jm, mr, mc, ze, turn, jmrate, zerate, turn_rate, ltsz,
                    list_reason.encode('GBK')])
        counter = counter + 1
        buy, sell = fetch_detail(dt, secu, counter)
        related_securities.extend(buy)
        related_securities.extend(sell)
    return lhb, related_securities
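# A small usage sketch for the collector above: aggregate several trading days
# into one pair of result lists. The assumption is that 'dates' holds
# 'YYYY-MM-DD' strings for days the market was open.
def collect_winner_lists(dates):
    all_lhb, all_related = [], []
    for dt in dates:
        lhb, related = fetch_winner_list2(dt)
        all_lhb.extend(lhb)
        all_related.extend(related)
    return all_lhb, all_related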
def crawlerjob(url):
    try:
        html = httptool.getResponseHtml(url)
        soup = BeautifulSoup(html)
        list_dist = soup.find(name="li", attrs={'id': 'hlist_21'})
        dist = list_dist.findAll(name="a", attrs={'class': ''})
        # loop over the district links
        for e in dist:
            crawler_shangquan(e.getText(), url, e.get('href'))
            time.sleep(random.randint(1, 10))
    except Exception as e:
        print "Exception in crawlerjob ", e
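# A minimal entry-point sketch for the housing crawl above. The start URL is a
# placeholder assumption (the real listing-site root is not shown in this
# file); it must contain '/housing/' for the href substitution in
# crawler_shangquan to work.
def run_housing_crawl():
    starturl = 'http://www.example.com/housing/'  # hypothetical entry page
    crawlerjob(starturl)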
def crawl_page(district, url):
    try:
        html = httptool.getResponseHtml(url)
        soup = BeautifulSoup(html)
        houselistdom = soup.find(name="ul", attrs={'id': 'houselist'})
        houselist = houselistdom.findAll('dt')
        shequlist = []
        for h in houselist:
            # split the title text on its bracket delimiters
            shequlist.append(h.getText().replace('[', ',').replace(']', ',').split(','))
        return shequlist
    except Exception as e:
        print "Exception in crawl_page ", e
        return []  # keep list concatenation in callers safe
def fetch_winner_list():
    # older variant of fetch_winner_list2: scrapes the JSON literal embedded in
    # the page's JavaScript instead of hitting the data API
    dt = '2016-12-30'
    url = 'http://data.eastmoney.com/stock/tradedetail/#dt#.html'.replace('#dt#', dt)
    html = httptool.getResponseHtml(url)
    data_re = re.compile('var data_tab_1=(.*?);')
    default_tab = data_re.findall(html)
    result = json.loads(unicode(default_tab[0], 'GBK'))
    # sample record:
    # {u'Rchange1do': u'', u'Chgradio': u'9.9957', u'Rchange3m': u'9.99566662', u'Rchange10do': u'', u'Rchange1dc': u'',
    # u'JD': u'\u5b9e\u529b\u6e38\u8d44\u4e70\u5165\uff0c\u6210\u529f\u738772.18%', u'Rchange20dc': u'', u'Rchange10dc': u'', u'Rchange5do': u'',
    # u'ZeRate': u'125.03', u'Rchange20do': u'', u'Rchange1y': u'3.41536031', u'Rchange5dc': u'', u'JGSMoney': u'', u'JmMoney': u'9084847.3',
    # u'Ctypedes': u'\u65e5\u6da8\u5e45\u504f\u79bb\u503c\u8fbe\u52307%\u7684\u524d\u4e94\u53ea\u8bc1\u5238', u'Rchange2do': u'',
    # u'JGBMoney': u'', u'Rchange3do': u'', u'Rchange30do': u'', u'Ntransac': u'2119274', u'Oldid': u'2445780', u'Rchange15dc': u'', u'Rchange15do': u'',
    # u'Turnover': u'161382715', u'Rchange3dc': u'', u'Rchange2dc': u'', u'JmRate': u'5.63', u'ClosePrice': u'76.15', u'SName': u'\u4e91\u5357\u767d\u836f',
    # u'Rchange6m': u'18.68765586', u'Tdate': u'2016-12-30', u'Rchange1m': u'9.99566662', u'SCode': u'000538', u'Smoney': u'96344827.7', u'Bmoney': u'105429675',
    # u'ZeMoney': u'201774502.7', u'Dchratio': u'0.204', u'JGSSumCount': u'', u'DP': u'\u5b9e\u529b\u6e38\u8d44\u4e70\u5165\uff0c\u6210\u529f\u738772.18%',
    # u'JGBSumCount': u'', u'SumCount': u'', u'Ltsz': u'79302033544.5', u'Rchange30dc': u'', u'JGJMMoney': u''}
    counter = 0
    for tik in result['data']:
        secu = tik['SCode']
        name = tik['SName']
        close = tik['ClosePrice']
        chg = tik['Chgradio']  # change percentage
        dp = tik['DP']  # commentary
        jm = tik['JmMoney']  # LHB net buy amount, needs 1000
        mr = tik['Bmoney']  # LHB buy amount
        mc = tik['Smoney']  # LHB sell amount
        ze = tik['ZeMoney']  # LHB turnover
        turn = tik['Turnover']  # total market turnover
        jmrate = tik['JmRate']  # net buy amount as a share of total turnover
        zerate = tik['ZeRate']  # LHB turnover as a share of total turnover
        turn_rate = tik['Dchratio']  # turnover rate
        ltsz = tik['Ltsz']  # free-float market cap
        list_reason = tik['Ctypedes']  # reason for making the list
        counter += 1
        # the original call omitted the counter argument that fetch_detail expects
        fetch_detail(dt, secu, counter)
        print secu, name, close, chg, dp, jm, mr, mc, ze, turn, jmrate, zerate, turn_rate, ltsz, list_reason
        break  # process only the first record
def crawler_shangquan(district, baseurl, href):
    try:
        # the original referenced an undefined 'url' here; 'baseurl' is the parameter
        html = httptool.getResponseHtml(baseurl.replace('/housing/', href))
        soup = BeautifulSoup(html)
        list_shangquan = soup.find(name="li", attrs={'class': 'shangquan'})
        shq = list_shangquan.findAll(name="a", attrs={'class': ''})
        # loop over the business-district links and collect communities
        shequlist = []
        for e in shq:
            shequlist = shequlist + crawler_pages(e.getText(), baseurl.replace('/housing/', e.get('href')))
            time.sleep(random.randint(1, 5))
        write_to_excel(district, shequlist)
    except Exception as e:
        print "Exception in crawler_shangquan ", e
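# write_to_excel is called above but not defined in this file. A minimal sketch
# with xlwt (the Python 2 era .xls writer); one sheet per district and the
# output path are assumptions:
import xlwt

def write_to_excel(district, shequlist):
    wb = xlwt.Workbook()
    sheet = wb.add_sheet(district)
    for r, row in enumerate(shequlist):
        for c, value in enumerate(row):
            sheet.write(r, c, value)
    wb.save(district + '.xls')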
def craw_investor_info(investorid, url):
    invests = []
    try:
        html = httptool.getResponseHtml(baseurl + url)  # baseurl is assumed to be a module-level constant
        soup = BeautifulSoup(html)
        investinfo_html = soup.find(name="table", attrs={'class': 'detailsList mTop'})
        investinfo_rows = investinfo_html.findAll(name="tr", recursive=False)
        size = len(investinfo_rows)
        if size > 1:
            investinfo_rows = investinfo_rows[1:size]  # drop the header row
        for row in investinfo_rows:
            cols = row.findAll(name="td", recursive=False)
            # each row nests a grouping table, which in turn nests a table of investment events
            t2 = cols[2].find(name="table", attrs={'class': 'detailsList mTop'})
            group_rows = t2.findAll(name="tr", recursive=False)
            for grouprow in group_rows:
                group_cols = grouprow.findAll(name="td", recursive=False)
                t3 = group_cols[3].find(name="table", attrs={'class': 'detailsList mTop'})
                invest_times = t3.findAll(name="tr", recursive=False)
                for inv in invest_times:
                    inv_cols = inv.findAll(name="td", recursive=False)
                    investinfo = [investorid]
                    investinfo.append(group_cols[0].getText())
                    investinfo.append(group_cols[1].getText())
                    investinfo.append(group_cols[2].getText())
                    investinfo.append(inv_cols[0].getText())
                    investinfo.append(inv_cols[1].getText())
                    investinfo.append(inv_cols[2].getText())
                    invests.append(list(investinfo))
    except Exception as e:
        print "Exception in craw_investor_info ", e
    return invests  # callers concatenate this list, so always return it
def crawlcompany(companyid, company, url):
    baseinfo = []
    investors = []
    investorinfo = []
    changeitems = []
    members = []
    punishs = []
    try:
        html = httptool.getResponseHtml(url)
        soup = BeautifulSoup(html)
        if company == '_':
            name = soup.find(name="h2")
            company = name.getText().replace(' ', '').strip()
        #=======================================================================
        # baseinfo
        #=======================================================================
        profile_html = soup.find(name="table", attrs={'class': 'detailsList mTop'})
        profile_info = profile_html.findAll(name="td", attrs={'class': 'left'})
        baseprofile = [companyid, company]
        for e in profile_info:
            baseprofile.append(e.getText())
        baseinfo.append(list(baseprofile))
        print 'start to craw investors'
        #=======================================================================
        # investors
        #=======================================================================
        investor_html = soup.find(name="table", attrs={'class': 'detailsList mTop', 'id': 'investor'})
        investor_rows = investor_html.findAll(name="tr")
        size = len(investor_rows)
        if size > 3:
            investor_rows = investor_rows[2:size - 1]  # drop the two header rows and the footer
        rowid = 1
        for row in investor_rows:
            cols = row.findAll(name="td")
            investor = [companyid]
            investorid = companyid + '_' + str(rowid)
            investor.append(investorid)
            for i in range(4):
                investor.append(cols[i].getText())
            investors.append(list(investor))
            alink = cols[4].find(name="a", attrs={'target': '_blank'})
            investorinfo = investorinfo + craw_investor_info(investorid, alink.get('href'))
            rowid = rowid + 1
        print 'start to craw changeitems'
        #=======================================================================
        # changeitems
        #=======================================================================
        changeitems_html = soup.find(name="table", attrs={'class': 'detailsList mTop', 'id': 'changeItem'})
        changeitems_rows = changeitems_html.findAll(name="tr")
        size = len(changeitems_rows)
        if size > 3:
            changeitems_rows = changeitems_rows[2:size - 1]
        for row in changeitems_rows:
            cols = row.findAll(name="td")
            changeinfo = [companyid]
            for i in range(4):
                changeinfo.append(cols[i].getText())
            changeitems.append(list(changeinfo))
        print 'start to craw members'
        #=======================================================================
        # members
        #=======================================================================
        members_html = soup.find(name="table", attrs={'class': 'detailsList mTop', 'id': 'member'})
        members_rows = members_html.findAll(name="tr")
        size = len(members_rows)
        if size > 3:
            members_rows = members_rows[2:size - 1]
        for row in members_rows:
            cols = row.findAll(name="td")
            memberinfo = [companyid]
            for i in range(3):
                memberinfo.append(cols[i].getText())
            members.append(list(memberinfo))
            if len(cols) == 6:  # two members per row: the right half holds a second record
                memberinfo = [companyid]
                for i in range(3):
                    memberinfo.append(cols[i + 3].getText())
                members.append(list(memberinfo))
        print 'start to craw punish'
        #=======================================================================
        # punish
        #=======================================================================
        punish_div = soup.find(name="div", attrs={'id': 'punishDiv'})
        punish_html = punish_div.find(name="table", attrs={'class': 'detailsList mTop'})
        punish_rows = punish_html.findAll(name="tr")
        size = len(punish_rows)
        if size > 2:
            punish_rows = punish_rows[2:size]
        for row in punish_rows:
            cols = row.findAll(name="td")
            punishinfo = [companyid]
            for i in range(7):
                punishinfo.append(cols[i].getText())
            punishs.append(punishinfo)
    except Exception as e:
        print "Exception in crawlcompany ", e
    # the original collected these lists without returning them; hand them back so callers can persist them
    return baseinfo, investors, investorinfo, changeitems, members, punishs
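# A minimal driver sketch for the company crawler: baseurl (read as a module
# global by craw_investor_info) and the company id/URL are placeholder
# assumptions, not real endpoints.
if __name__ == '__main__':
    baseurl = 'http://example-registry.com'  # hypothetical site root
    tables = crawlcompany('C0001', '_', baseurl + '/company/C0001.html')
    for t in tables:
        print len(t)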