Beispiel #1
0
 def test_leaves_original_file_after_aborted_overwrite(self):
     # Start from an existing file, begin overwriting it, then abort:
     # the directory must still hold only that file, with its old content.
     original_content = 'yo!'
     self.mkfile(self.filename, original_content)

     handle = savefile.SaveFile(self.filename, 'w')
     handle.write('foo')
     handle.abort()

     entries = os.listdir(self.tempdir)
     self.assertEqual(entries, [self.basename])
     self.assertEqual(self.cat(self.filename), original_content)
Beispiel #2
0
 def test_overwrites_existing_file(self):
     # Start from an existing file, write new data and close normally:
     # the directory must hold only that file, now with the new content.
     new_content = 'foo'
     self.mkfile(self.filename, 'yo!')

     handle = savefile.SaveFile(self.filename, 'w')
     handle.write(new_content)
     handle.close()

     entries = os.listdir(self.tempdir)
     self.assertEqual(entries, [self.basename])
     self.assertEqual(self.cat(self.filename), new_content)
Beispiel #3
0
 def test_saves_normally_with_exception_within_with(self):
     # The exception raised inside the ``with`` body is swallowed on
     # purpose; the assertions below then check that the data written
     # before the raise still ended up in the target file.
     try:
         with savefile.SaveFile(self.filename, 'w') as out:
             out.write('foo')
             raise Exception()
     except Exception:
         pass

     entries = os.listdir(self.tempdir)
     saved = self.cat(self.filename)
     self.assertEqual(entries, [self.basename])
     self.assertEqual(saved, 'foo')
Beispiel #4
0
 def start_spider(self):
     """Download every post listed for the entered keyword.

     Reads the keyword from ``self.entry_baseurl`` and the paging bound
     from ``self.entry_num``, fetches link listings with
     ``bs4tool.geturls`` in offset steps of 50, stores each post's HTML
     through ``savefile.SaveFile(...).save()`` under the folder returned
     by ``select_path()``, and advances ``self.progresbar`` once per
     saved post.

     Raises:
         ValueError: if the ``entry_num`` text cannot be parsed by ``int``.
     """
     savepath = select_path()
     entytext = self.entry_baseurl.get()
     if entytext == "":
         # No keyword entered: raise the message box (presumably a
         # warning dialog -- confirm against _msgBox) and stop instead of
         # crawling with an empty search term.
         _msgBox()
         return

     # Parse the bound once; it drives both the progress bar and the loop.
     endnum = int(self.entry_num.get())
     self.progresbar.config(maximum=endnum)
     self.progresbar.pack()
     urlbase = baseurl.replace("KEYWORD", urllib.request.quote(entytext))
     count = 0
     barvalue = 0
     while count < endnum:
         # Fetch the link listing for the current offset.
         text = bs4tool.geturls(urlbase, count)
         count = count + 50
         self.progresbar.config(value=barvalue)
         self.progresbar.update()
         for line in text.split('\n'):
             fields = line.split("\t")
             # Skip blank lines and lines lacking the second tab-separated
             # field; indexing fields[1] on them would raise IndexError.
             if len(fields) < 2:
                 continue
             # Build the storage file name and strip characters that are
             # illegal in file names.
             filename = fields[0].replace('/', '') + '_' + fields[1]
             filename = filename.replace('?', '').replace('=', '')
             # Absolute address of the post.
             url = childbaseurl + fields[0]
             print(url + ' -> start download ')
             # Fetch the content (bs4tool.gettext(url) is the alternative
             # the original author noted for formatted content).
             res = bs4tool.gethtml(url)
             # Save the content.
             savefile.SaveFile(res,
                               savepath + "/" + filename + '.html').save()
             # Advance the progress bar.
             barvalue = barvalue + 1
             self.progresbar.config(value=barvalue)
             self.progresbar.update()
     # Force the bar to its final position once every page was handled.
     self.progresbar.config(value=endnum)
     self.progresbar.update()
# -*- coding=utf-8 -*-

import savefile
import bs4tool

# Listing URL; the page offset is appended to the trailing ``pn=`` parameter.
baseurl = 'http://tieba.baidu.com/f?kw=%E7%9F%B3%E5%AE%B6%E5%BA%84%E9%93%81%E9%81%93%E5%A4%A7%E5%AD%A6&ie=utf-8&pn='
# Prefix that turns a relative post path into an absolute URL.
childbaseurl = 'http://tieba.baidu.com'
count = 0
endnum = 100
while count <= endnum:
    # Fetch the links of the current page only. The listing is kept in its
    # own variable so the downloaded post body can no longer overwrite it,
    # and earlier pages are not processed again on later iterations.
    text = bs4tool.geturls(baseurl, count)
    count = count + 50
    urls = text.split('\n')
    for url in urls:
        fields = url.split('\t')
        # Skip blank lines and lines lacking the second tab-separated
        # field; indexing fields[1] on them would raise IndexError.
        if len(fields) < 2:
            continue
        # Build the storage file name and strip '?' and '='.
        filename = fields[0].replace('/', '') + '_' + fields[1]
        filename = filename.replace('?', '').replace('=', '')
        # Absolute link of the post.
        url = childbaseurl + fields[0]
        print(url+' -> start download ')
        content = bs4tool.gettext(url)
        savefile.SaveFile(content, 'tiezi/'+filename+'.html').save()
Beispiel #6
0
 def test_saves_normally_with_with(self):
     # Write through the context-manager form, then check that the
     # directory holds exactly the target file with the written data.
     with savefile.SaveFile(self.filename, 'w') as out:
         out.write('foo')

     entries = os.listdir(self.tempdir)
     saved = self.cat(self.filename)
     self.assertEqual(entries, [self.basename])
     self.assertEqual(saved, 'foo')
Beispiel #7
0
 def test_leaves_no_file_after_aborted_new_file(self):
     # Write to a brand-new target and abort: the directory must stay empty.
     handle = savefile.SaveFile(self.filename, 'w')
     handle.write('foo')
     handle.abort()

     entries = os.listdir(self.tempdir)
     self.assertEqual(entries, [])
Beispiel #8
0
 def test_saves_new_file(self):
     # Write to a brand-new target and close normally: the directory must
     # hold exactly the target file with the written data.
     payload = 'foo'
     handle = savefile.SaveFile(self.filename, 'w')
     handle.write(payload)
     handle.close()

     entries = os.listdir(self.tempdir)
     self.assertEqual(entries, [self.basename])
     self.assertEqual(self.cat(self.filename), payload)
Beispiel #9
0
 def test_sets_name_to_temporary_name(self):
     # The object's ``name`` must differ from the requested target name.
     f = savefile.SaveFile(self.filename, 'w')
     try:
         self.assertNotEqual(f.name, self.filename)
     finally:
         # Release the opened file even when the assertion fails, so the
         # test does not leak a handle or leave a stray file behind.
         f.abort()
Beispiel #10
0
 def test_sets_real_filename(self):
     # ``real_filename`` must report the target name passed to SaveFile.
     f = savefile.SaveFile(self.filename, 'w')
     try:
         self.assertEqual(f.real_filename, self.filename)
     finally:
         # Release the opened file even when the assertion fails, so the
         # test does not leak a handle or leave a stray file behind.
         f.abort()