def __init__(self, docfile):
     """Initialise the HTMLParser base and this instance's working state.

     Args:
         docfile: Kept on the instance as ``self.docfile`` and also passed
             to ``Document`` to build ``self.doc``.
     """
     # Explicit base-class initialisation; the enclosing class header is not
     # visible in this excerpt.
     HTMLParser.__init__(self)
     self.docfile = docfile
     # Document and HTMLClient are defined outside this excerpt.
     self.doc = Document(docfile)
     self.myclient = HTMLClient()
     # NOTE(review): presumably a buffer for text gathered while parsing -
     # confirm against the handler methods.
     self.text = ''
     # NOTE(review): these two look like "currently inside a title /
     # description" flags - confirm against the handler methods.
     self.title = False
     self.isdes = False
     # NOTE(review): presumably collects picture references - confirm.
     self.picList=[]
 def __init__(self):
     """Create an ``HTMLClient`` and keep it on the instance as ``myclient``."""
     self.myclient = HTMLClient()
Example #3
0
from tools.Simple_WebCatcher import HTMLClient


class JX3_Spider:
    """Extracts selected markup fragments from a JX3 news page.

    Each method builds a fresh ``Simple_Parser`` and hands it the page
    together with a pair of marker strings; whatever ``feed`` returns is
    passed straight back to the caller.
    """

    def Get_News(self, page):
        """Return the parser's result for the news-list ``<div>`` markers.

        Args:
            page: Page content forwarded unchanged to ``Simple_Parser.feed``.

        Returns:
            The value returned by ``Simple_Parser.feed``.
            NOTE(review): presumably an iterable of matched strings - confirm
            against ``Simple_Parser``.
        """
        opening_marker = u'<div class="news_list news_list02">'
        closing_marker = u'</div>'
        extractor = Simple_Parser()
        news_fragments = extractor.feed(page, opening_marker, closing_marker)
        return news_fragments

    def Get_CSS(self, page):
        """Return the parser's result for the ``<link ... />`` markers.

        Args:
            page: Page content forwarded unchanged to ``Simple_Parser.feed``.

        Returns:
            The value returned by ``Simple_Parser.feed``.
            NOTE(review): presumably an iterable of matched strings - confirm
            against ``Simple_Parser``.
        """
        opening_marker = u'<link '
        closing_marker = u'/>'
        extractor = Simple_Parser()
        link_fragments = extractor.feed(page, opening_marker, closing_marker)
        return link_fragments


if __name__ == '__main__':
    # Fetch the news index page and extract the two fragments to save.
    myclient = HTMLClient()
    mypage = myclient.GetPage("http://xw.jx3.xoyo.com/news/")
    jx3_spider = JX3_Spider()
    jx3_news = jx3_spider.Get_News(mypage)
    jx3_css = jx3_spider.Get_CSS(mypage)

    # The typed text is never used, so it is not bound to a name; the prompt
    # itself is kept so the script still waits for the user before writing.
    input('>')

    # Binary mode with explicit UTF-8 encoding keeps the bytes on disk in
    # agreement with the charset declared in the <meta> tag below.
    # The ``with`` block closes the file on exit, so no explicit close().
    with open("jx3_news.html", 'wb') as jx3file:
        jx3file.write(b'<head>')
        jx3file.write(
            b'<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">'
        )
        # Stylesheet <link> fragments first, then the news-list markup.
        jx3file.writelines(item.encode('utf-8') for item in jx3_css)
        jx3file.writelines(item.encode('utf-8') for item in jx3_news)