def __init__(self, docfile):
     """Initialise the parser base class and this object's working state.

     ``docfile`` is stored on the instance and also handed to ``Document()``
     to create ``self.doc``.
     """
     HTMLParser.__init__(self)
     self.docfile = docfile
     self.doc = Document(docfile)
     self.myclient = HTMLClient()
     # Start with no buffered text, both flags off and an empty picture list.
     self.text = ''
     self.title = self.isdes = False
     self.picList = list()
 def getname(self, sno):
     """Return the first comma-separated field of the quote line for ``sno``.

     ``sno`` is mapped through ``self.surfix`` and the result is appended to
     the ``http://hq.sinajs.cn/list=`` URL, which is fetched with ``'gbk'``
     as the second ``GetPage`` argument (presumably the response encoding).

     Returns:
         The text before the first comma of the double-quoted payload, or
         ``''`` when the page is empty, contains no double-quoted section,
         or the quoted section is blank.
     """
     symbol = self.surfix(sno)
     page = HTMLClient().GetPage('http://hq.sinajs.cn/list=' + str(symbol), 'gbk')
     if not page:
         return ''
     # The payload is whatever sits between the first pair of double quotes.
     pieces = page.split('"')
     if len(pieces) < 2:
         # No quoted section at all: report "no data" like the other empty
         # cases instead of raising IndexError.
         return ''
     payload = pieces[1]
     if not payload.strip():
         return ''
     return payload.split(',')[0]
class XQHTMLParser(HTMLParser):
    """HTML parser that replays fed markup into a ``Document`` object.

    Text is buffered and flushed as paragraphs, ``h<digit>`` tags produce
    level-2 headings and ``<img>`` tags are downloaded through
    ``HTMLClient.GetPic`` and embedded with ``add_picture``.  Downloaded
    images are written to the current working directory under the last path
    segment of their URL and are deleted again by ``complete()``.
    """

    def __init__(self, docfile):
        """Create the parser.

        Args:
            docfile: Passed to ``Document()`` and later to ``self.doc.save``.
        """
        super().__init__()
        self.docfile = docfile
        self.doc = Document(docfile)
        self.myclient = HTMLClient()
        self.text = ''        # text collected since the last flush
        self.title = False    # True while the most recent start tag was h<digit>
        self.isdes = False    # True while the most recent start tag was <script>
        self.picList = []     # temporary image files to delete in complete()

    def handle_starttag(self, tag, attrs):
        """Update the heading/script flags and embed any ``<img>`` source."""
        # Both flags describe only the most recent start tag, so they are
        # recomputed on every call.
        self.title = re.match(r'h(\d)', tag) is not None
        self.isdes = tag == 'script'
        if tag == "img":
            for name, value in attrs:
                # A bare ``src`` attribute has value None; there is nothing
                # to download in that case.
                if name == "src" and value:
                    self._add_picture(value)

    def _add_picture(self, src):
        """Download ``src``, store it as a temporary file and embed it."""
        # Everything from the first '!' on is dropped from the URL that is
        # actually fetched.
        picdata = self.myclient.GetPic(src.split('!')[0])
        if picdata is None:
            return
        basename = src.split('/')[-1]
        pictmp = basename.split('!')[0]
        # Text after the last '!' (the whole basename when there is no '!').
        picfix = basename.split('!')[-1]
        with open(pictmp, 'wb') as pic:
            pic.write(bytes(picdata))
        # Register the file before embedding so that complete() removes it
        # even when add_picture raises.
        self.picList.append(pictmp)
        try:
            if picfix[0:1] == 'c':
                self.doc.add_picture(pictmp, width=Inches(4.5))
            else:
                self.doc.add_picture(pictmp)
        except docx.image.exceptions.UnexpectedEndOfFileError as e:
            # A truncated image is reported and skipped; parsing continues.
            print(e)

    def handle_data(self, data):
        """Emit ``data`` as a heading and/or buffer it as paragraph text."""
        if self.title:
            # Flush pending text so the heading lands after it.
            if self.text:
                self.doc.add_paragraph(self.text)
            self.text = ''
            self.doc.add_heading(data, level=2)
        # NOTE(review): heading text is also buffered here and is therefore
        # written again as a paragraph at the next end tag -- confirm whether
        # that duplication is intended.
        if not self.isdes:
            self.text += data

    def handle_endtag(self, tag):
        """Flush the buffered text as one paragraph on any end tag."""
        if self.text:
            self.doc.add_paragraph(self.text)
            self.text = ''

    def complete(self, html):
        """Parse ``html``, save the document and delete temporary images.

        The temporary image files are removed even when parsing or saving
        raises; the exception still propagates to the caller.
        """
        try:
            self.feed(html)
            self.doc.save(self.docfile)
        finally:
            for item in self.picList:
                if os.path.exists(item):
                    os.remove(item)
 def getrtdata(self, sno):
     """Fetch the quote line for ``sz<sno>`` and return it as a dict.

     The page at ``http://hq.sinajs.cn/list=sz<sno>`` is fetched with
     ``'gbk'`` as the second ``GetPage`` argument (presumably the response
     encoding).  The double-quoted payload is split on commas and its first
     32 fields are stored, in order, under the names listed in
     ``field_names``.

     Returns:
         A dict of field name -> raw string value, or ``''`` when the page
         is empty, has no double-quoted payload, the payload is blank, or it
         holds fewer than 32 fields.
     """
     # Positional field names: field_names[i] labels the i-th payload field.
     field_names = (
         "name", "openprice", "yesprice", "parprice", "hprice", "lprice",
         "jb1", "js1", "amount", "mamount",
         "b1", "b1price", "b2", "b2price", "b3", "b3price",
         "b4", "b4price", "b5", "b5price",
         "s1", "s1price", "s2", "s2price", "s3", "s3price",
         "s4", "s4price", "s5", "s5price",
         "date", "time",
     )
     webdata = HTMLClient().GetPage('http://hq.sinajs.cn/list=sz' + str(sno), 'gbk')
     if not webdata:
         return ''
     pieces = webdata.split('"')
     # Missing or blank quoted section: same "no data" result as an empty page.
     if len(pieces) < 2 or not pieces[1].strip():
         return ''
     data = pieces[1].split(',')
     if len(data) < len(field_names):
         # A truncated payload would otherwise raise IndexError.
         return ''
     stockdata = dict(zip(field_names, data))
     # The dict never had a "curprice" key, so the old print always raised
     # KeyError.  NOTE(review): "parprice" (field 3) is presumably the value
     # that was meant to be shown -- confirm.
     print(stockdata["parprice"])
     return stockdata
 def __init__(self):
     """Create the ``HTMLClient`` instance kept on ``self.myclient``."""
     client = HTMLClient()
     self.myclient = client
class XQ_Spider:
    """Collects the status pages of a xueqiu.com user as HTML or a document."""

    # Consecutive failed downloads of one listing page tolerated before
    # Get_Url stops crawling.
    MAX_PAGE_RETRIES = 5

    def __init__(self):
        self.myclient = HTMLClient()

    def Get_Json(self, page):
        """Run ``Simple_Parser.feed`` on ``str(page)`` with the JSON markers.

        The markers are ``'SNB.data.req_isBrick = 0;'`` and
        ``"SNB.data.statusType"``; the parser result is returned unchanged
        (Get_Url iterates over it, treating each item as a string).
        """
        myparser = Simple_Parser()
        return myparser.feed(str(page), 'SNB.data.req_isBrick = 0;', "SNB.data.statusType")

    def Get_Div(self, page):
        """Run ``Simple_Parser.feed`` on ``str(page)`` with the status-content div markers."""
        myparser = Simple_Parser()
        return myparser.feed(str(page), '<div class="status-content">', "</div>")

    def Get_Url(self, userid):
        """Return the ``target`` of every non-retweet status of ``userid``.

        Listing pages are requested one after another starting at page 1.
        The last page number is taken from the ``maxPage`` value of the JSON
        embedded in each page (1000 until a page has been parsed).  A page
        whose download returns None is retried; after ``MAX_PAGE_RETRIES``
        consecutive failures the crawl stops and the targets collected so
        far are returned.
        """
        pg = 1
        maxpg = 1000
        failures = 0
        urlList = []
        while pg <= maxpg:
            mypage = self.myclient.GetPage("http://xueqiu.com/" + userid + '?page=' + str(pg))
            if mypage is None:
                # Retry the same page, but not forever: an unbounded retry
                # spins endlessly when the site stays unreachable.
                failures += 1
                if failures >= self.MAX_PAGE_RETRIES:
                    print("giving up on page " + str(pg))
                    break
                continue
            failures = 0

            for item in self.Get_Json(mypage):
                # Keep only the outermost {...} part of the extracted snippet.
                s = item.find('{')
                e = item.rfind('}')
                xml_content = json.loads(item[s:e + 1])
                maxpg = xml_content["maxPage"]
                for status in xml_content["statuses"]:
                    # Statuses with a non-zero retweet_status_id are skipped.
                    if status['retweet_status_id'] == 0:
                        urlList.append(str(status['target']))
            pg += 1
        return urlList

    def Get_HTML(self, userid):
        """Write the status-content divs of all of ``userid``'s statuses to
        ``xq_article_<userid>.html``, preceded by a UTF-8 ``<head>`` section.

        Statuses whose page cannot be downloaded are skipped.
        """
        urlList = self.Get_Url(userid)
        with open("xq_article_" + userid + ".html", 'wb') as xqfile:
            xqfile.write(b'<head>')
            xqfile.write(b'<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">')
            xqfile.write(b'</head>')
            for url in urlList:
                if url is None:
                    continue
                mypage = self.myclient.GetPage("http://xueqiu.com" + url)
                if mypage is None:
                    continue
                for ctx in self.Get_Div(mypage):
                    xqfile.write(bytes(ctx, 'utf-8'))

    def Get_Doc(self, userid):
        """Feed the status-content divs of all of ``userid``'s statuses into
        an ``HTML2Doc`` opened on ``xq_<userid>.doc``.

        Statuses whose page cannot be downloaded are skipped, as in Get_HTML.
        """
        urlList = self.Get_Url(userid)
        h2d = HTML2Doc()
        h2d.open('xq_' + userid + '.doc')
        # NOTE(review): nothing here closes or saves ``h2d`` -- confirm
        # whether HTML2Doc needs an explicit call for that.
        for url in urlList:
            if url is None:
                continue
            print("download from" + url)
            mypage = self.myclient.GetPage("http://xueqiu.com" + url)
            if mypage is None:
                continue
            for ctx in self.Get_Div(mypage):
                h2d.write(ctx)
            print("Done!")
# Example #7
# 0
from tools.Simple_WebCatcher import HTMLClient


class JX3_Spider:
    """Extracts pieces of a fetched page by delegating to ``Simple_Parser``."""

    def Get_News(self, page):
        """Return ``Simple_Parser.feed`` output for the news-list div markers."""
        opening = u'<div class="news_list news_list02">'
        closing = u'</div>'
        return Simple_Parser().feed(page, opening, closing)

    def Get_CSS(self, page):
        """Return ``Simple_Parser.feed`` output for the ``<link `` ... ``/>`` markers."""
        opening = u'<link '
        closing = u'/>'
        return Simple_Parser().feed(page, opening, closing)


if __name__ == '__main__':
    # Download the news index page, extract its <link .../> tags and its
    # news-list div, and write both to jx3_news.html.
    myclient = HTMLClient()
    mypage = myclient.GetPage("http://xw.jx3.xoyo.com/news/")
    if mypage is None:
        # Other GetPage callers in this file guard against a None result;
        # stop with a clear message rather than failing inside the parser.
        raise SystemExit("failed to download http://xw.jx3.xoyo.com/news/")
    jx3_spider = JX3_Spider()
    jx3_news = jx3_spider.Get_News(mypage)
    jx3_css = jx3_spider.Get_CSS(mypage)
    # Wait for the user to press Enter; the typed text itself is not used.
    input('>')
    with open("jx3_news.html", 'wb') as jx3file:
        jx3file.write(b'<head>')
        jx3file.write(
            b'<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">'
        )
        for item in jx3_css:
            jx3file.write(bytes(item, 'utf-8'))
        # Close the head section before the body content is written.
        jx3file.write(b'</head>')
        for item in jx3_news:
            jx3file.write(bytes(item, 'utf-8'))