import os
import re
import time
import urllib.parse

from tool.gethtml import getHtml
# daili, readexcel, listfiles and getBinaryHtml are helpers defined elsewhere
# in this project and are assumed to be imported from its own modules.


def catchbooklist(requreip=0, v=0, lockprefix='lock'):
    """
    Parameters:
    requreip   -- whether to use a proxy (default: no)
    v          -- seconds to pause between requests to mimic a human
                  (default: 0, i.e. no throttling)
    lockprefix -- suffix of the lock file that marks a tag as fully crawled
    """
    # start timing
    start = time.perf_counter()
    taglist = readexcel('web/booktag.xlsx')  # read the tag list
    daili0 = daili()                         # list of proxy IPs
    changeip = 0                             # index into the proxy list
    # crawl every tag
    for i in range(1, len(taglist)):
        kinds = taglist[i][0]               # top-level category
        tagname = taglist[i][1]             # tag name
        tag = urllib.parse.quote(tagname)   # URL-encode the Chinese tag name
        mulu0 = 'web/' + kinds
        # create the category folder if it does not exist yet
        if not os.path.exists(mulu0):
            print('Creating category folder: ' + mulu0)
            os.makedirs(mulu0)
        mulu = mulu0 + '/' + tagname
        # create the tag folder that will hold the downloaded pages
        if not os.path.exists(mulu):
            print('Creating tag folder: ' + mulu)
            os.makedirs(mulu)
        # when restarting after a network failure, check whether this tag is already locked
        ok = listfiles(mulu, '.' + lockprefix)
        if ok:
            print('Category: ' + kinds + '----Tag: ' + tagname + '----already finished')
            continue
        url = 'http://www.douban.com/tag/' + tag + '/book?start='  # base URL
        pagesize = 15   # 15 books per page
        page = 0        # page counter
        while True:
            # page to crawl
            site = url + str(page * pagesize)
            # local file name, e.g. web/<category>/<tag>/0.html
            src = mulu + '/' + str(page * pagesize) + '.html'
            # skip the fetch if the file already exists, to save time
            if not os.path.exists(src):
                print('About to crawl: ' + site + ' Category: ' + kinds + '----Tag: ' + tagname)
                iprefuse = 1  # set to 0 once the fetch succeeds
                # retry until the fetch succeeds
                while iprefuse:
                    try:
                        daili1 = daili0[changeip]  # current proxy IP
                        # throttle the crawler
                        if v:
                            a = time.perf_counter()
                            time.sleep(v)
                            b = time.perf_counter()
                            print('Paused for: ' + str(b - a))
                        if requreip == 0:
                            # no proxy needed
                            webcontent = getHtml(site).encode('utf-8')  # fetch
                            # print(webcontent.decode('utf-8', 'ignore'))
                            # a missing <dl> tag means we are past the last page
                            notnull = re.search(r'<dl>', webcontent.decode('utf-8', 'ignore'))
                            iprefuse = 0  # success, stop retrying
                        else:
                            # use a proxy
                            print('Proxy: ' + daili1)
                            webcontent = getBinaryHtml(site, daili1)
                            # print(webcontent.decode('utf-8', 'ignore'))
                            notnull = re.search(r'<dl>', webcontent.decode('utf-8', 'ignore'))
                            print(notnull)
                            iprefuse = 0
                    except Exception as e:
                        print(e)
                        if requreip:
                            changeip = changeip + 1      # move to the next proxy
                            if changeip == len(daili0):  # wrap around at the end of the proxy list
                                changeip = 0
                            print('Switching proxy: ' + daili0[changeip])
                        else:
                            print("IP blocked")
                            raise
                # no <dl> tag means this tag has been fully crawled
                if notnull:
                    webfile = open(src, 'wb')
                    webfile.write(webcontent)
                    webfile.close()
                    print("Crawled: " + site + ' Category: ' + kinds + '----Tag: ' + tagname)
                else:
                    # create a lock file to mark the tag as finished
                    lock = open(src.replace('html', lockprefix), 'w')
                    # date format reference: http://blog.csdn.net/caisini_vc/article/details/5619954
                    finish = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    lock.write('Finished crawling at: ' + finish)
                    lock.close()
                    print("Finished tag: " + tagname)
                    break
            page = page + 1  # next page
    # stop timing
    end = time.perf_counter()
    print("Total crawl time: %.03f seconds" % (end - start))
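# A hedged sketch (not the project's actual helper) of what readexcel() above
# is assumed to return: one list per spreadsheet row, where column 0 is the
# top-level category and column 1 is the tag name, with row 0 as a header
# (catchbooklist starts iterating at index 1). Shown with openpyxl purely for
# illustration; the helper name and example values are assumptions.
from openpyxl import load_workbook


def readexcel_sketch(path):
    """Return the spreadsheet as a list of rows, each row a list of cell values."""
    wb = load_workbook(path, read_only=True)
    ws = wb.active
    return [[cell.value for cell in row] for row in ws.iter_rows()]

# taglist = readexcel_sketch('web/booktag.xlsx')
# taglist[1] -> e.g. ['文学', '小说']   (illustrative values only)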
# Relies on the same stdlib modules and project helpers as catchbooklist,
# plus Mysql, validateTitle and the module-level web504 list defined elsewhere
# in this project.
def catchbook(requreip=0, v=0, startbook=0):
    """
    Parameters:
    requreip  -- whether to use a proxy (default: no)
    v         -- seconds to pause between requests to mimic a human
                 (default: 0, i.e. no throttling)
    startbook -- offset at which to start querying the book table
    """
    # start timing
    start = time.perf_counter()
    webe = []  # book ids that failed with 404, flushed to book/book.txt in batches
    selecttotal = 'select count(distinct bookno) from booktag'
    selectsql = 'SELECT bookname,bookkind,bookno FROM booktag group by bookno'
    database = Mysql(host="localhost", user="******", pwd="6833066", db="doubanbook")
    total = database.ExecQuery(selecttotal)  # total number of distinct books
    total = int(total[0][0])
    daili0 = daili()  # list of proxy IPs
    dailino = 0       # number of proxy switches so far
    changeip = 0      # index into the proxy list
    # crawl the books in batches of 100
    while startbook < total + 100:
        selectsql1 = selectsql + ' limit ' + str(startbook) + ',100'
        taglist = database.ExecQuery(selectsql1)
        for i in range(0, len(taglist)):
            try:
                bookname = taglist[i][0]
                kinds = taglist[i][1]   # category
                bookno = taglist[i][2]  # book id
                url = 'http://book.douban.com/subject/' + bookno  # page to crawl
                # e.g. http://book.douban.com/subject/25862578
            except:
                raise
            mulu0 = 'book/' + kinds
            # create the category folder if it does not exist yet
            if not os.path.exists(mulu0):
                print('Creating category folder: ' + mulu0)
                os.makedirs(mulu0)
            # skip the fetch if the file already exists, to save time
            try:
                filename = mulu0 + '/' + bookno + validateTitle(bookname) + '.html'
                if os.path.exists(filename):
                    print(filename + ': already exists')
                    continue
                elif bookno in web504:  # ids known to fail with a 504 error
                    print('----' * 5)
                    print("504 error, skipping: " + bookno)
                    print('----' * 5)
                    continue
                else:
                    # print("-" * 50)
                    print('About to crawl: ' + url + ' Category: ' + kinds)
            except:
                print(filename + " invalid file name")
                continue
            iprefuse = 1  # set to 0 once the fetch succeeds
            # retry until the fetch succeeds
            while iprefuse:
                try:
                    daili1 = daili0[changeip]  # current proxy IP
                    # throttle the crawler
                    if v:
                        a = time.perf_counter()
                        time.sleep(v)
                        b = time.perf_counter()
                        print('Paused for: ' + str(b - a))
                    if requreip == 0:
                        # no proxy; the fetch is time-limited to cope with 504 errors
                        webcontent = getHtml(url).encode('utf-8')
                        notnull = re.search(r'<div class="top-nav-doubanapp">',
                                            webcontent.decode('utf-8', 'ignore'))
                        if not notnull:
                            raise Exception("fetched page is not a valid book page: " + filename)
                        webfile = open(filename, 'wb')
                        webfile.write(webcontent)
                        webfile.close()
                        print("Crawled: " + url + ' Category: ' + kinds)
                        iprefuse = 0  # success, stop retrying
                    else:
                        # use a proxy
                        print('Proxy: ' + daili1)
                        webcontent = getBinaryHtml(url, daili1)
                        notnull = re.search(r'<div class="top-nav-doubanapp">',
                                            webcontent.decode('utf-8', 'ignore'))
                        if not notnull:
                            raise Exception("fetched page is not a valid book page: " + filename)
                        webfile = open(filename, 'wb')
                        webfile.write(webcontent)
                        webfile.close()
                        print("Crawled: " + url + ' Category: ' + kinds)
                        iprefuse = 0
                        dailino = dailino + 1
                        print('Proxy switches so far: ' + str(dailino))
                        if dailino > 20:
                            dailino = 0
                            requreip = 0  # fall back to direct connections after too many proxy switches
                # except urllib.error.URLError as e:
                except Exception as e:
                    print(url)
                    if hasattr(e, 'code'):
                        print('Page missing or request timed out.')
                        print('Error code:', e.code)
                        if e.code == 404:
                            print('404 error, skipping')
                            webe.append(bookno)
                            break
                    elif hasattr(e, 'reason'):
                        print("Could not reach the host.")
                        print('Reason: ', e.reason)
                    print(e)
                    if requreip:
                        changeip = changeip + 1      # move to the next proxy
                        if changeip == len(daili0):  # wrap around at the end of the proxy list
                            changeip = 0
                        print('Switching proxy: ' + daili0[changeip])
                        dailino = dailino + 1
                        print('Proxy switches so far: ' + str(dailino))
                        if dailino > 20:
                            dailino = 0
                            requreip = 0  # fall back to direct connections after too many proxy switches
                    else:
                        print("IP blocked or network down")
                        requreip = 1  # switch to using proxies
        print('Crawled ' + str(startbook + 100) + ' books so far')
        print()
        print()
        print()
        startbook = startbook + 100
        # flush the accumulated 404 ids to disk in batches
        if len(webe) > 20:
            print(webe)
            webep = open("book/book.txt", 'a+')
            webep.write(','.join(webe) + '\n')
            webep.close()
            webe = []
    # stop timing
    end = time.perf_counter()
    print("Total crawl time: %.03f seconds" % (end - start))
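# Minimal usage sketch (an assumption, not the author's actual run script):
# how the two crawl entry points above might be invoked, assuming both
# functions are importable in the same module. The keyword values below are
# illustrative.
if __name__ == '__main__':
    # crawl the per-tag book list pages first, throttled to roughly 1 s per request
    catchbooklist(requreip=0, v=1, lockprefix='lock')
    # then crawl each book's detail page using the ids stored in the database
    catchbook(requreip=0, v=1, startbook=0)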
# booklist = book(open("web/douban250.html", 'rb').read())
# print(booklist)

# build the list of pages to crawl
urllist = []  # pages to crawl
url = 'http://book.douban.com/top250?start='  # base URL
page = 10      # crawl 10 pages in total
pagesize = 25  # 25 books per page
for i in range(page):
    urllist.append(url + str(i * pagesize))
# print(urllist)

# crawl the book-list pages one by one
bookslist = []
for url in urllist:
    html_doc = getHtml(url)  # returns the page as a 'utf-8' text document
    bookslist.append(book(html_doc))

# # Save to Excel
# w = Workbook()               # create a workbook
# ws = w.add_sheet('Books')    # create a worksheet
# ws.write(0, 0, 'Top 250 most popular books')
# ws.write(1, 0, 'No.')
# ws.write(1, 1, 'Title')
# ws.write(1, 2, 'Alternate title')
# ws.write(1, 3, 'Link')
# ws.write(1, 4, 'Cover')
# ws.write(1, 5, 'Publication info')
# ws.write(1, 6, 'Stars')
# ws.write(1, 7, 'Number of reviews')
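# A hedged sketch (not in the original) of how the commented-out Excel export
# above could be completed with xlwt. It assumes book() returns one list of
# books per crawled page, each book being a list of the eight field values
# matching the header columns; that shape is an assumption, not confirmed by
# this file. The output path is hypothetical.
from xlwt import Workbook

w = Workbook()                    # create a workbook
ws = w.add_sheet('Books')         # create a worksheet
ws.write(0, 0, 'Top 250 most popular books')
headers = ['No.', 'Title', 'Alternate title', 'Link', 'Cover',
           'Publication info', 'Stars', 'Number of reviews']
for col, name in enumerate(headers):
    ws.write(1, col, name)
row = 2
for pagebooks in bookslist:       # assumed: one list of books per crawled page
    for fields in pagebooks:      # assumed: one list of field values per book
        for col, value in enumerate(fields):
            ws.write(row, col, value)
        row += 1
w.save('web/douban250.xls')       # hypothetical output file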
# -*- coding:utf-8 -*-
from tool.gethtml import getHtml
import bookdeal

# fetch the tag index page
tag = getHtml('http://book.douban.com/tag/')
file = open('web/booktag.html', 'wb')
file.write(tag.encode())
file.close()

# fetch a book-list page for testing
tag1 = getHtml("http://www.douban.com/tag/%E5%B0%8F%E8%AF%B4/book")
file1 = open('web/books.html', 'wb')
file1.write(tag1.encode())
file1.close()

# fetch a single book page for testing
tag3 = getHtml("http://book.douban.com/subject/25862578/?from=tag_all")
file2 = open('web/book.html', 'wb')
file2.write(tag3.encode())
file2.close()

print("Done")