def test_leaves_original_file_after_aborted_overwrite(self):
    self.mkfile(self.filename, 'yo!')
    f = savefile.SaveFile(self.filename, 'w')
    f.write('foo')
    f.abort()
    self.assertEqual(os.listdir(self.tempdir), [self.basename])
    self.assertEqual(self.cat(self.filename), 'yo!')

def test_overwrites_existing_file(self):
    self.mkfile(self.filename, 'yo!')
    f = savefile.SaveFile(self.filename, 'w')
    f.write('foo')
    f.close()
    self.assertEqual(os.listdir(self.tempdir), [self.basename])
    self.assertEqual(self.cat(self.filename), 'foo')

def test_saves_normally_with_exception_within_with(self):
    try:
        with savefile.SaveFile(self.filename, 'w') as f:
            f.write('foo')
            raise Exception()
    except Exception:
        pass
    self.assertEqual(os.listdir(self.tempdir), [self.basename])
    self.assertEqual(self.cat(self.filename), 'foo')
def start_spider(self):
    # print("Start crawling!")
    savepath = select_path()
    entytext = self.entry_baseurl.get()
    if entytext == "":
        _msgBox()
    # print("Input keyword: " + entytext)
    self.progresbar.config(maximum=int(self.entry_num.get()))
    self.progresbar.pack()
    urlbase = baseurl.replace("KEYWORD", urllib.request.quote(entytext))
    count = 0
    barvalue = 0
    endnum = int(self.entry_num.get())
    while count < endnum:
        # fetch the list of post links for this page
        text = bs4tool.geturls(urlbase, count)
        count = count + 50
        urls = text.split('\n')
        self.progresbar.config(value=barvalue)
        self.progresbar.update()
        for url in urls:
            # skip empty lines
            if url == '':
                continue
            # build the stored filename and strip illegal characters
            filename = url.split("\t")[0].replace('/', '') + '_' + url.split('\t')[1]
            filename = filename.replace('?', '').replace('=', '')
            # full URL of the post
            url = childbaseurl + url.split("\t")[0]
            print(url + ' -> start download ')
            # res = bs4tool.gettext(url)  # formatted text only
            # fetch the raw HTML
            res = bs4tool.gethtml(url)
            # save the content
            savefile.SaveFile(res, savepath + "/" + filename + '.html').save()
            # update the progress bar
            barvalue = barvalue + 1
            self.progresbar.config(value=barvalue)
            self.progresbar.update()
    self.progresbar.config(value=endnum)
    self.progresbar.update()
# -*- coding: utf-8 -*-
import savefile
import bs4tool

baseurl = 'http://tieba.baidu.com/f?kw=%E7%9F%B3%E5%AE%B6%E5%BA%84%E9%93%81%E9%81%93%E5%A4%A7%E5%AD%A6&ie=utf-8&pn='
childbaseurl = 'http://tieba.baidu.com'

count = 0
endnum = 100
text = ''
while count <= endnum:
    # fetch the list of post links for this page
    text = text + bs4tool.geturls(baseurl, count)
    count = count + 50

urls = text.split('\n')
for url in urls:
    # skip empty lines so url.split('\t')[1] does not raise IndexError
    if url == '':
        continue
    # build the stored filename and strip illegal characters
    filename = url.split("\t")[0].replace('/', '') + '_' + url.split('\t')[1]
    filename = filename.replace('?', '').replace('=', '')
    # full URL of the post
    url = childbaseurl + url.split("\t")[0]
    print(url + ' -> start download ')
    text = bs4tool.gettext(url)
    savefile.SaveFile(text, 'tiezi/' + filename + '.html').save()
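# NOTE: the crawler code above uses a different SaveFile interface from the
# test suite below: it passes the content and the target path to the
# constructor and then calls .save(). A minimal sketch of what such a saver
# might look like, assuming it simply writes the fetched HTML to the given
# path as UTF-8 (the actual savefile module in this repository may differ):

class SaveFile:

    def __init__(self, text, path):
        self.text = text
        self.path = path

    def save(self):
        # write the fetched HTML to disk as UTF-8
        with open(self.path, 'w', encoding='utf-8') as f:
            f.write(self.text)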
def test_saves_normally_with_with(self):
    with savefile.SaveFile(self.filename, 'w') as f:
        f.write('foo')
    self.assertEqual(os.listdir(self.tempdir), [self.basename])
    self.assertEqual(self.cat(self.filename), 'foo')

def test_leaves_no_file_after_aborted_new_file(self):
    f = savefile.SaveFile(self.filename, 'w')
    f.write('foo')
    f.abort()
    self.assertEqual(os.listdir(self.tempdir), [])

def test_saves_new_file(self):
    f = savefile.SaveFile(self.filename, 'w')
    f.write('foo')
    f.close()
    self.assertEqual(os.listdir(self.tempdir), [self.basename])
    self.assertEqual(self.cat(self.filename), 'foo')

def test_sets_name_to_temporary_name(self):
    f = savefile.SaveFile(self.filename, 'w')
    self.assertNotEqual(f.name, self.filename)

def test_sets_real_filename(self):
    f = savefile.SaveFile(self.filename, 'w')
    self.assertEqual(f.real_filename, self.filename)
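# A minimal sketch of the atomic-save pattern the tests above exercise,
# assuming the interface they show: write to a temporary file in the same
# directory, rename it over the real filename on close, and delete it on
# abort. The context manager saves on exit even when the with-block raises,
# which is what test_saves_normally_with_exception_within_with expects.
# This is an illustration only, not the repository's actual implementation.

import os
import tempfile


class SaveFile:

    def __init__(self, filename, mode):
        self.real_filename = filename
        dirname = os.path.dirname(filename) or '.'
        # create the temporary file next to the target so the final rename
        # stays on the same filesystem (and is therefore atomic on POSIX)
        fd, self.name = tempfile.mkstemp(dir=dirname)
        os.close(fd)
        self._file = open(self.name, mode)

    def write(self, data):
        self._file.write(data)

    def close(self):
        # commit: close and rename the temporary file over the real one
        self._file.close()
        os.rename(self.name, self.real_filename)

    def abort(self):
        # discard: remove the temporary file, leaving any original untouched
        self._file.close()
        os.remove(self.name)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False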