def getDetailPage(self):
    """Fetch the detail page of every listed article and save it to a file.

    Each line read from ``self.saveFile`` is treated as ``title|url``.
    For every such line the page at ``url`` is fetched, the date, source,
    review block and body paragraphs are selected from it, and the result
    is written to ``data/finance<id>.txt``, where ``<id>`` is the part of
    the url after its first comma with the last six characters removed.

    Lines that do not contain a ``|`` separator are skipped.
    """
    for entry in FileUtil.readlines(self.saveFile):
        fields = entry.split('|')
        if len(fields) < 2:
            # Blank or malformed line: there is no url to fetch.
            continue
        title = fields[0]  # title
        url = fields[1]  # url
        print(url)

        page = HttpUtil.getPage(url)

        # Extract date / source from the info spans.
        date = ''
        source = ''
        for span in HtmlUtil.select_all(page, '.newText .Info span'):
            if span is None:
                continue
            text = str(span)
            if "年" in text:
                date = text
            if "来源" in text:
                source = text

        content_review = HtmlUtil.select_v(page, '#ContentBody .c_review')
        if content_review is None:
            content_review = ''
        paragraphs = HtmlUtil.select_text(page, '#ContentBody p')

        # Write the record to its own file.
        newFile = "data/finance" + url.split(',')[1][:-6] + ".txt"
        # NOTE(review): presumably creates/empties the file before the
        # appends below -- confirm against FileUtil.put.
        FileUtil.put(newFile, '')
        FileUtil.appendline(newFile, title + "\n")
        # NOTE(review): url is written without an extra "\n", unlike the
        # other fields; presumably it still carries the line ending from
        # readlines -- confirm.
        FileUtil.appendline(newFile, url)
        FileUtil.appendline(newFile, date + "\n")
        FileUtil.appendline(newFile, source + "\n")
        FileUtil.appendline(newFile, content_review + "\n")
        for paragraph in paragraphs:
            try:
                FileUtil.appendline(newFile, str(paragraph))
            except Exception:
                # Best effort: skip a paragraph that cannot be converted
                # or written, but let KeyboardInterrupt/SystemExit through.
                continue
def getDetailPage(self):
    """Print the url field of every line in the ``TimeUtil.prefix()`` file.

    The file name is ``TimeUtil.prefix() + ".txt"``. Each line is split on
    ``|`` and its second field is printed.
    """
    entries = FileUtil.readlines(TimeUtil.prefix() + ".txt")
    for entry in entries:
        fields = entry.split('|')
        print(fields[1])