Example #1
def catchbooklist(requreip=0, v=0, lockprefix='lock'):
	"""
	Parameters:
	requreip -- whether to fetch through a proxy (default: no)
	v -- pause between requests in seconds to mimic a human (default: no pause)
	lockprefix -- file suffix used for the "finished" lock file
	"""
	# Start timing
	start = time.perf_counter()
	taglist = readexcel('web/booktag.xlsx')  # Read the tag list
	daili0 = daili()  # Array of proxy IPs
	changeip = 0  # Index into the proxy IP array
	# Crawl each tag in turn
	for i in range(1, len(taglist)):
		kinds = taglist[i][0]  # Top-level category
		tagname = taglist[i][1]  # Tag name
		tag = urllib.parse.quote(tagname)  # Percent-encode the Chinese tag for the URL
		mulu0 = 'web/' + kinds
		# Create the category folder unless it already exists
		if not os.path.exists(mulu0):
			print('Creating category folder: ' + mulu0)
			os.makedirs(mulu0)

		mulu = mulu0 + '/' + tagname
		# Create the tag folder (where the pages are stored) unless it already exists
		if not os.path.exists(mulu):
			print('Creating tag folder: ' + mulu)
			os.makedirs(mulu)

		# When restarting after a network interruption, a lock file marks tags that are done
		ok = listfiles(mulu, '.' + lockprefix)
		if ok:
			print('Category: ' + kinds + '----Tag: ' + tagname + '----already crawled')
			continue
		url = 'http://www.douban.com/tag/' + tag + '/book?start='  # Base URL
		pagesize = 15  # 15 books per page
		page = 0  # Page counter
		while True:
			# Page to crawl
			site = url + str(page * pagesize)

			# Build the local file name, e.g. web/小说/0.html
			src = mulu + '/' + str(page * pagesize) + '.html'

			# Skip pages that already exist on disk to save time
			if not os.path.exists(src):
				# Fetch the page and write it to the local file
				print('About to crawl: ' + site + ' Category: ' + kinds + '----Tag: ' + tagname)
				iprefuse = 1  # Set to 0 once the fetch succeeds
				# Retry until the fetch succeeds
				while iprefuse:
					try:
						daili1 = daili0[changeip]  # Current proxy IP
						# Crawl-speed throttle
						if v:
							a = time.perf_counter()
							time.sleep(v)
							b = time.perf_counter()
							print('Paused for: ' + str(b - a))
						if requreip == 0:  # No proxy
							webcontent = getHtml(site).encode('utf-8')  # Fetch
							# A <dl> tag is present on every non-empty list page
							notnull = re.search(r'<dl>', webcontent.decode('utf-8', 'ignore'))
							iprefuse = 0  # Fetched; clear the retry flag
						else:  # Through a proxy
							print('Proxy: ' + daili1)
							webcontent = getBinaryHtml(site, daili1)
							notnull = re.search(r'<dl>', webcontent.decode('utf-8', 'ignore'))
							print(notnull)
							iprefuse = 0
					except Exception as e:
						print(e)
						if requreip:
							changeip = changeip + 1  # Advance the proxy index
							if changeip == len(daili0):  # Wrap around at the end of the array
								changeip = 0
							print('Switching proxy: ' + daili0[changeip])
						else:
							print('IP has been blocked')
							raise

				# No <dl> tag means we ran past the last page, so this tag is finished
				if notnull:
					webfile = open(src, 'wb')
					webfile.write(webcontent)
					webfile.close()
					print('Crawled: ' + site + ' Category: ' + kinds + '----Tag: ' + tagname)
				else:
					# Write a lock file to mark this tag as finished
					lock = open(src.replace('html', lockprefix), 'w')
					# Timestamp format via http://blog.csdn.net/caisini_vc/article/details/5619954
					finish = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
					lock.write('Finished at: ' + finish)
					lock.close()
					print('Finished crawling: ' + tagname)
					break
			page = page + 1  # Next page
	# Stop timing
	end = time.perf_counter()
	print('Total crawl time: %.03f seconds' % (end - start))
Example #2
def catchbook(requreip=0, v=0, startbook=0):
	"""
	Parameters:
	requreip -- whether to fetch through a proxy (default: no)
	v -- pause between requests in seconds to mimic a human (default: no pause)
	startbook -- row offset to start the query from (default: 0)
	"""
	# Start timing
	start = time.perf_counter()
	webe = []  # Book numbers that came back 404
	selecttotal = 'select count(distinct bookno) from booktag'
	selectsql = 'SELECT bookname,bookkind,bookno FROM booktag group by bookno'
	database = Mysql(host="localhost", user="******", pwd="6833066", db="doubanbook")
	total = database.ExecQuery(selecttotal)  # Total record count
	total = int(total[0][0])
	daili0 = daili()  # Array of proxy IPs
	dailino = 0  # Number of proxied fetches so far
	changeip = 0  # Index into the proxy IP array
	# Crawl the books in batches of 100
	while startbook < total + 100:
		selectsql1 = selectsql + ' limit ' + str(startbook) + ',100'
		taglist = database.ExecQuery(selectsql1)
		for i in range(0, len(taglist)):
			try:
				bookname = taglist[i][0]
				kinds = taglist[i][1]  # Category
				bookno = taglist[i][2]  # Book number
				url = 'http://book.douban.com/subject/' + bookno  # Page to crawl
				# e.g. http://book.douban.com/subject/25862578
			except:
				raise
			mulu0 = 'book/' + kinds
			# Create the category folder unless it already exists
			if not os.path.exists(mulu0):
				print('Creating category folder: ' + mulu0)
				os.makedirs(mulu0)
			# Skip books that already exist on disk to save time
			try:
				filename = mulu0 + '/' + bookno + validateTitle(bookname) + '.html'
				if os.path.exists(filename):
					print(filename + ': already exists')
					continue
				elif bookno in web504:
					# Known 504 offender; skip it
					print('----' * 5)
					print('504 error, skipping: ' + bookno)
					print('----' * 5)
					continue
				else:
					print('About to crawl: ' + url + ' Category: ' + kinds)
			except:
				print('Bad file name for book: ' + bookno)
				continue
			iprefuse = 1  # Set to 0 once the fetch succeeds
			# Retry until the fetch succeeds
			while iprefuse:
				try:
					daili1 = daili0[changeip]  # Current proxy IP
					# Crawl-speed throttle
					if v:
						a = time.perf_counter()
						time.sleep(v)
						b = time.perf_counter()
						print('Paused for: ' + str(b - a))
					if requreip == 0:  # No proxy
						webcontent = getHtml(url).encode('utf-8')  # Fetch with a timeout to catch 504s
						notnull = re.search(r'<div class="top-nav-doubanapp">', webcontent.decode('utf-8', 'ignore'))
						if not notnull:
							raise Exception('Fetched page is not the expected page: ' + filename)
						webfile = open(filename, 'wb')
						webfile.write(webcontent)
						webfile.close()
						print('Crawled: ' + url + ' Category: ' + kinds)
						iprefuse = 0  # Fetched; clear the retry flag
					else:  # Through a proxy
						print('Proxy: ' + daili1)
						webcontent = getBinaryHtml(url, daili1)
						notnull = re.search(r'<div class="top-nav-doubanapp">', webcontent.decode('utf-8', 'ignore'))
						if not notnull:
							raise Exception('Fetched page is not the expected page: ' + filename)
						webfile = open(filename, 'wb')
						webfile.write(webcontent)
						webfile.close()
						print('Crawled: ' + url + ' Category: ' + kinds)
						iprefuse = 0
						dailino = dailino + 1
						print('Proxied fetches so far: ' + str(dailino))
						if dailino > 20:
							dailino = 0
							requreip = 0  # Fall back to direct fetching after 20 proxied requests
				except Exception as e:
					print(url)
					if hasattr(e, 'code'):
						print('Page missing or the request took too long.')
						print('Error code:', e.code)
						if e.code == 404:
							print('404 error, ignoring')
							webe.append(bookno)
							break
					elif hasattr(e, 'reason'):
						print('Could not reach the host.')
						print('Reason: ', e.reason)
					print(e)
					if requreip:
						changeip = changeip + 1  # Advance the proxy index
						if changeip == len(daili0):  # Wrap around at the end of the array
							changeip = 0
						print('Switching proxy: ' + daili0[changeip])
						dailino = dailino + 1
						print('Proxy switches so far: ' + str(dailino))
						if dailino > 20:
							dailino = 0
							requreip = 0  # Fall back to direct fetching after 20 proxy switches
					else:
						print('IP blocked or network down')
						requreip = 1  # Switch to proxies
		print('Crawled ' + str(startbook + 100) + ' books so far')
		print()
		startbook = startbook + 100
		# Flush the collected 404 book numbers to disk once there are more than 20
		if len(webe) > 20:
			print(webe)
			webep = open("book/book.txt", 'a+')
			webep.write(','.join(webe) + '\n')
			webep.close()
			webe = []
	# Stop timing
	end = time.perf_counter()
	print('Total crawl time: %.03f seconds' % (end - start))
Example #3
# booklist = book(open("web/douban250.html",'rb').read())
# print(booklist)

# Build the list of pages to crawl
urllist = []                                     # Pages to crawl
url = 'http://book.douban.com/top250?start='     # Base URL
page = 10                                        # Crawl 10 pages in total
pagesize = 25                                    # 25 books per page
for i in range(page):
    urllist.append(url + str(i * pagesize))
# print(urllist)

# Fetch the book-list pages one by one
bookslist = []
for url in urllist:
    html_doc = getHtml(url)  # Returns the page as utf-8 text
    bookslist.append(book(html_doc))


# # Save to Excel
# w = Workbook()     # Create a workbook
# ws = w.add_sheet('Books')     # Create a worksheet
# ws.write(0, 0, 'Top 250 books')
# ws.write(1, 0, 'No.')
# ws.write(1, 1, 'Title')
# ws.write(1, 2, 'Alternate title')
# ws.write(1, 3, 'Link')
# ws.write(1, 4, 'Cover')
# ws.write(1, 5, 'Publication info')
# ws.write(1, 6, 'Stars')
# ws.write(1, 7, 'Reviews')
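The Excel export is left commented out above. A runnable completion sketch, assuming the xlwt package (which the Workbook/add_sheet calls suggest) and that book() returns one list of field tuples per page:

from xlwt import Workbook

w = Workbook()             # Create a workbook
ws = w.add_sheet('Books')  # Create a worksheet
ws.write(0, 0, 'Top 250 books')
headers = ['No.', 'Title', 'Alternate title', 'Link', 'Cover',
           'Publication info', 'Stars', 'Reviews']
for col, header in enumerate(headers):
    ws.write(1, col, header)
row = 2
for pagebooks in bookslist:      # One entry per crawled page
    for entry in pagebooks:      # Assumed: one tuple of fields per book
        ws.write(row, 0, row - 1)  # Running index
        for col, value in enumerate(entry, start=1):
            ws.write(row, col, value)
        row += 1
w.save('web/douban250.xls')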
Example #4
# -*- coding:utf-8 -*-
from tool.gethtml import getHtml
import bookdeal

# Fetch the tag index page
tag = getHtml('http://book.douban.com/tag/')
file = open('web/booktag.html', 'wb')
file.write(tag.encode())
file.close()

# Fetch a list page for testing
tag1 = getHtml("http://www.douban.com/tag/%E5%B0%8F%E8%AF%B4/book")
file1 = open('web/books.html', 'wb')
file1.write(tag1.encode())
file1.close()

# Fetch a book page for testing
tag3 = getHtml("http://book.douban.com/subject/25862578/?from=tag_all")
file2 = open('web/book.html', 'wb')
file2.write(tag3.encode())
file2.close()
print("Done")