def getContent(self, text, folder):
    """Scrape every article linked from a board index page and save each one.

    Parses *text* (the already-decoded HTML of a board index page) for
    article links, fetches each article, and writes its title, URL and
    body text to a file named after the article title inside *folder*.

    Args:
        text:   HTML source of the board index page.
        folder: Destination directory for the per-article text files.

    Returns:
        0 if an article title cannot be printed (original early-exit
        behavior preserved); otherwise None after all articles are done.
        Per-article file errors are reported and skipped.
    """
    links_p = re.compile(r'<td class="td6"><a href=bbstcon\?board=(.*?)>')
    result = links_p.findall(text)
    url_board = 'http://bbs.sysu.edu.cn/bbstcon?board='
    for link_suffix in result:
        each_page_link = url_board + link_suffix
        print(each_page_link)
        content = requests.get(each_page_link, headers=self.header)
        # The BBS serves GBK-encoded pages; force the decode accordingly.
        content.encoding = 'gbk'
        html = etree.HTML(content.text)
        title = html.xpath('//title/text()')[0]
        try:
            print(title)
        except UnicodeError:
            # Console encoding cannot represent the title; bail out as before.
            print("Can't decode title, return")
            return 0
        # Strip the site suffix from the page title to form the filename.
        filename = re.sub(u' - 逸仙时空BBS', '', title)
        filename = Toolkit.filename_filter(filename)
        f_fullpath = os.path.join(folder, filename)
        try:
            Toolkit.save2filecn(f_fullpath, title)
            Toolkit.save2filecn(f_fullpath, '\n\n*******************\n\n')
            Toolkit.save2filecn(f_fullpath, each_page_link)
            Toolkit.save2filecn(f_fullpath, '\n\n*******************\n\n')
        except (IOError, OSError):
            # BUGFIX: original said "go to next article" but returned 0,
            # aborting the whole crawl on a single bad filename.
            print(each_page_link)
            print("Create file error, go to next article")
            continue
        # Each matching <td> holds part of the article body;
        # xpath('string(.)') flattens a cell to plain text.
        for cell in html.xpath('//td[@class="border content2"]'):
            Toolkit.save2filecn(f_fullpath, cell.xpath('string(.)'))
        # Be polite to the server between article fetches.
        time.sleep(5)
def bbs_filename_check():
    """Smoke check for title-to-filename conversion.

    Fetches one known article page, decodes it as GBK, prints its
    <title> text, then prints the filesystem-safe name produced by
    Toolkit.filename_filter for that title.
    """
    url = 'http://bbs.sysu.edu.cn/bbstcon?board=Love&file=M.1104508652.A'
    response = requests.get(url, headers={'User-Agent': agent})
    response.encoding = 'gbk'
    document = etree.HTML(response.text)
    page_title = document.xpath('//title/text()')[0]
    print(page_title)
    safe_name = Toolkit.filename_filter(page_title)
    print(safe_name)