def __init__(self, docfile):
    """Set up the parser with a target .docx path and its own HTTP client."""
    HTMLParser.__init__(self)
    self.docfile = docfile    # output path for the generated document
    self.doc = Document(docfile)
    self.myclient = HTMLClient()
    self.text = ''            # text accumulated between tags
    self.title = False        # True while inside a heading tag
    self.isdes = False        # True while inside a <script> tag (text skipped)
    self.picList = []         # temp image files to delete when done
def getname(self, sno):
    """Return the stock name for code *sno* via the Sina quote API.

    Returns '' when the page cannot be fetched or carries no payload.
    """
    page = HTMLClient().GetPage('http://hq.sinajs.cn/list=' + str(self.surfix(sno)), 'gbk')
    if not page:
        return ''
    payload = page.split('"')[1]
    if not payload.strip():
        return ''
    return payload.split(',')[0]
class XQHTMLParser(HTMLParser):
    """HTML parser that converts a Xueqiu article page into a .docx file.

    Heading tags become level-2 document headings, text runs become
    paragraphs, <img> tags are downloaded and embedded, and the contents
    of <script> tags are skipped.
    """

    def __init__(self, docfile):
        HTMLParser.__init__(self)
        self.docfile = docfile    # output .docx path
        self.doc = Document(docfile)
        self.myclient = HTMLClient()
        self.text = ''            # pending text for the current paragraph
        self.title = False        # True while inside a heading tag
        self.isdes = False        # True while inside a <script> tag
        self.picList = []         # downloaded temp images to clean up

    def handle_starttag(self, tag, attrs):
        self.title = False
        self.isdes = False
        if re.match(r'h(\d)', tag):
            self.title = True
        if tag == "img":
            for variable, value in attrs:
                if variable != "src":
                    continue
                # Strip the '!<suffix>' part to fetch the full-size image.
                picdata = self.myclient.GetPic(value.split('!')[0])
                if picdata is None:  # fix: was `== None`
                    continue
                pictmp = value.split('/')[-1].split('!')[0]
                picfix = value.split('/')[-1].split('!')[-1]
                # `with` closes the file; the explicit close() was redundant.
                with open(pictmp, 'wb') as pic:
                    pic.write(bytes(picdata))
                try:
                    # 'c...' suffix appears to mark large images; scale
                    # those down — TODO confirm against Xueqiu image URLs.
                    if picfix.startswith('c'):
                        self.doc.add_picture(pictmp, width=Inches(4.5))
                    else:
                        self.doc.add_picture(pictmp)
                except docx.image.exceptions.UnexpectedEndOfFileError as e:
                    print(e)
                self.picList.append(pictmp)
        if tag == 'script':
            self.isdes = True

    def handle_data(self, data):
        if self.title:
            # Flush any pending paragraph before emitting the heading.
            if self.text != '':
                self.doc.add_paragraph(self.text)
                self.text = ''
            self.doc.add_heading(data, level=2)
        # NOTE: heading text is also appended to the running paragraph
        # (original behavior, preserved).
        if not self.isdes:
            self.text += data

    def handle_endtag(self, tag):
        # Any closing tag acts as a paragraph boundary.
        if self.text != '':
            self.doc.add_paragraph(self.text)
            self.text = ''

    def complete(self, html):
        """Parse *html*, save the document, and delete the temp images."""
        self.feed(html)
        self.doc.save(self.docfile)
        for item in self.picList:
            if os.path.exists(item):
                os.remove(item)
def getrtdata(self, sno):
    """Fetch real-time Shenzhen (sz) quote data for stock code *sno*.

    Returns a dict of quote fields, or '' when the page cannot be
    fetched or carries no payload.
    """
    webdata = HTMLClient().GetPage('http://hq.sinajs.cn/list=sz' + str(sno), 'gbk')
    if not webdata:
        return ''
    webdata = webdata.split('"')[1]
    if not webdata.strip():
        return ''
    data = webdata.split(',')
    # Field order follows the Sina hq API response payload.
    keys = ('name', 'openprice', 'yesprice', 'parprice', 'hprice', 'lprice',
            'jb1', 'js1', 'amount', 'mamount',
            'b1', 'b1price', 'b2', 'b2price', 'b3', 'b3price',
            'b4', 'b4price', 'b5', 'b5price',
            's1', 's1price', 's2', 's2price', 's3', 's3price',
            's4', 's4price', 's5', 's5price',
            'date', 'time')
    stockdata = dict(zip(keys, data))
    # BUG FIX: 'curprice' was printed but never assigned, so this method
    # always raised KeyError. data[3] is the current price in the Sina
    # payload (kept under 'parprice' too for backward compatibility).
    stockdata["curprice"] = data[3]
    print(stockdata["curprice"])
    return stockdata
def __init__(self):
    # Dedicated HTTP client used for all page fetches by this object.
    self.myclient = HTMLClient()
class XQ_Spider:
    """Spider that scrapes a Xueqiu user's original posts into HTML or .doc."""

    def __init__(self):
        self.myclient = HTMLClient()

    def Get_Json(self, page):
        """Extract the embedded status-JSON fragments from a profile page."""
        myparser = Simple_Parser()
        return myparser.feed(str(page), 'SNB.data.req_isBrick = 0;', "SNB.data.statusType")

    def Get_Div(self, page):
        """Extract the article body <div> fragments from an article page."""
        myparser = Simple_Parser()
        return myparser.feed(str(page), '<div class="status-content">', "</div>")

    def Get_Url(self, userid):
        """Collect article URLs for *userid*, walking every listing page.

        Retweets (retweet_status_id != 0) are skipped.
        """
        pg = 1
        maxpg = 1000
        urlList = []
        while True:
            mypage = self.myclient.GetPage("http://xueqiu.com/" + userid + '?page=' + str(pg))
            if mypage is None:  # fix: was `== None`
                # NOTE(review): a permanently failing page is retried
                # forever (original behavior) — consider a retry cap.
                continue
            # Fix: reuse self instead of instantiating a fresh XQ_Spider.
            xq_json = self.Get_Json(mypage)
            for item in xq_json:
                s = item.find('{')
                e = item.rfind('}')
                content = item[s:e + 1]
                xml_content = json.loads(content)
                maxpg = xml_content["maxPage"]
                for status in xml_content["statuses"]:
                    # Only original posts; retweets carry a non-zero id.
                    if status['retweet_status_id'] == 0:
                        urlList.append(str(status['target']))
            pg += 1
            if pg > maxpg:
                break
        return urlList

    def Get_HTML(self, userid):
        """Write all of *userid*'s articles into one UTF-8 HTML file."""
        urlList = self.Get_Url(userid)
        # `with` closes the file; the explicit close() was redundant.
        with open("xq_article_" + userid + ".html", 'wb') as xqfile:
            xqfile.write(b'<head>')
            xqfile.write(b'<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">')
            xqfile.write(b'</head>')
            for url in urlList:
                if url is None:
                    continue
                mypage = self.myclient.GetPage("http://xueqiu.com" + url)
                if mypage is None:
                    continue
                for ctx in self.Get_Div(mypage):
                    xqfile.write(bytes(ctx, 'utf-8'))

    def Get_Doc(self, userid):
        """Write all of *userid*'s articles into a Word document."""
        urlList = self.Get_Url(userid)
        h2d = HTML2Doc()
        h2d.open('xq_' + userid + '.doc')
        for url in urlList:
            if url is None:
                continue
            print("download from " + url)
            mypage = self.myclient.GetPage("http://xueqiu.com" + url)
            # BUG FIX: a failed fetch previously crashed Get_Div (no None
            # guard, unlike Get_HTML).
            if mypage is None:
                continue
            for ctx in self.Get_Div(mypage):
                h2d.write(ctx)
        print("Done!")
# BUG FIX: Simple_Parser was used below but never imported (NameError at
# runtime). NOTE(review): assuming it lives beside HTMLClient in
# tools.Simple_WebCatcher — confirm the module path.
from tools.Simple_WebCatcher import HTMLClient, Simple_Parser


class JX3_Spider:
    """Spider for the JX3 (xw.jx3.xoyo.com) news listing page."""

    def Get_News(self, page):
        """Extract the news-list <div> fragments from *page*."""
        myparser = Simple_Parser()
        return myparser.feed(page, u'<div class="news_list news_list02">', u'</div>')

    def Get_CSS(self, page):
        """Extract the <link ... /> stylesheet tags from *page*."""
        myparser = Simple_Parser()
        return myparser.feed(page, u'<link ', u'/>')


if __name__ == '__main__':
    myclient = HTMLClient()
    mypage = myclient.GetPage("http://xw.jx3.xoyo.com/news/")
    jx3_spider = JX3_Spider()
    jx3_news = jx3_spider.Get_News(mypage)
    jx3_css = jx3_spider.Get_CSS(mypage)
    infile = input('>')  # pause before writing (original behavior)
    # `with` closes the file; the explicit close() was redundant.
    # NOTE(review): <head> is never closed in the output (original behavior).
    with open("jx3_news.html", 'wb') as jx3file:
        jx3file.write(b'<head>')
        jx3file.write(
            b'<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">'
        )
        for item in jx3_css:
            jx3file.write(bytes(item, 'utf-8'))
        for item in jx3_news:
            jx3file.write(bytes(item, 'utf-8'))