def get(self):
    """Queue the bilingual ("中英对照") version of the current page.

    Calls ``self.extract`` with the markers that surround the ``href``
    of the ``class="pn"`` anchor, formats the result into the
    module-level ``HTTP`` template and hands the URL to ``spider.put``.

    NOTE(review): assumes ``self.extract`` returns the text between the
    two markers and a falsy value (``None``/``''``) when they are not
    found -- confirm against the helper's definition.
    """
    link = self.extract(
        'class="pn" href="',
        '" target=""> 中英对照'
    )
    if not link:
        # No bilingual link on this page: formatting a missing value
        # into HTTP would queue a bogus URL (e.g. ".../None").
        return
    spider.put(HTTP % link)
# NOTE(review): the commented-out statements below are disabled code kept
# from the original file; they look like an older page-collecting handler
# body and an RSS writer -- confirm whether they can be removed.
#
# tid = int(self.get_argument('tid'))
# print tid, name
# self.page.append((tid, self.request.url, name, html))
#
# @classmethod
# def write(cls):
#     page = cls.page
#     page.sort(key=itemgetter(0), reverse=True)
#     with open(join(PREFIX, 'ecocn_org.xml'), 'w') as rss:
#         rss.write(
#             cls.template.render(
#                 rss_title='经济学人 . 中文网',
#                 rss_link='http://www.ecocn.org',
#                 li=[
#                     dict(
#                         link=link,
#                         title=title,
#                         txt=txt
#                     ) for id, link, title, txt in cls.page
#                 ]
#             )
#         )
#

if __name__ == '__main__':
    # First URL handed to the spider: the portal list page for catid=1.
    seed_url = 'http://www.ecocn.org/portal.php?mod=list&catid=1'
    # 10 concurrent fetch threads; page read timeout of 30 seconds.
    thread_count = 10
    read_timeout = 30

    spider.put(seed_url)
    spider.run(thread_count, read_timeout)
    # NOTE(review): the only `write` visible in this part of the file is
    # the disabled one above -- confirm `forum.write` is defined elsewhere.
    forum.write()