import os
import time

from bs4 import BeautifulSoup

import bookdeal

# writeexcel / readexcel / listfiles are helper functions from this project
# (defined elsewhere, not shown in this section).


def booktag(url_content, path='web/booktag.xlsx'):
    """Extract every tag from the tag-cloud page and write the list to Excel."""
    soup = BeautifulSoup(url_content, 'html.parser')  # start parsing
    tag_blocks = soup.select('div#content div.article div div')
    # print(tag_blocks[0])
    taglist = [['标签类别', '标签名', '链接', '图书数']]
    for block in tag_blocks:
        # print(str(block))
        soup1 = BeautifulSoup(str(block), 'html.parser')
        title = soup1.find('a', attrs={'class': 'tag-title-wrapper'})
        tagtype = title['name']  # tag category
        # print(tagtype)
        # links = soup1.findAll('a', attrs={'class': 'tag'})
        links = soup1.findAll('a')   # links[0] is the category title itself
        counts = soup1.findAll('b')  # one <b> (book count) per tag
        # print(links)
        for i in range(len(counts)):
            tag = links[i + 1].string       # tag name
            taglink = links[i + 1]['href']  # tag URL
            tagnum = counts[i].string       # number of books under the tag
            taglist.append([tagtype, tag, taglink, tagnum])
    print(taglist)
    writeexcel(path, taglist)
    print("写入EXCEL成功")
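
# A minimal usage sketch (not part of the original project): booktag() expects
# the raw HTML of Douban's tag-cloud page grouped by category.  The URL and the
# User-Agent header below are assumptions; the project's own crawler module
# (not shown in this section) may fetch the page differently.
def fetch_tag_page(url='https://book.douban.com/tag/?view=type'):
    """Fetch the tag-cloud page and return its HTML, or None on failure."""
    import requests  # assumed dependency, only needed for this sketch

    resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    if resp.status_code == 200:
        return resp.text
    return None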
def dealbooklist():
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    putplace = 'books'
    # create the output directory for the per-tag Excel files if needed
    if not os.path.exists(putplace):
        print('新建图书提取存放excel处:' + putplace)
        os.makedirs(putplace)
    taglist = readexcel('web/booktag.xlsx')  # read the tag list
    del taglist[0]  # drop the header row
    # for each tag
    for tag in taglist:
        # books are grouped into one folder per tag category
        mulu = putplace + '/' + tag[0]
        if not os.path.exists(mulu):
            os.makedirs(mulu)
        excelpath = mulu + '/' + tag[1] + '.xlsx'
        # skip tags whose Excel file has already been produced
        if os.path.exists(excelpath):
            print(excelpath + '已经存在')
            continue
        tagbooks = []  # all books found for this tag
        path = 'web/' + tag[0] + '/' + tag[1]  # folder holding the crawled pages
        print('本地提取:' + path)
        # walk the HTML files that were crawled earlier
        files = listfiles(path)
        for i in files:
            file = path + '/' + i
            print('提取:' + file)
            with open(file, 'rb') as f:
                content = f.read()
            book = bookdeal.manybook(content)  # extract the book list
            for j in book:
                # print('提取:' + ','.join(j))
                tagbooks.append(j)
        # write the result to a local Excel file
        booksattr = ['书籍名', 'URL入口', '图片地址', '出版信息', '评价星数']
        tagbooks.insert(0, booksattr)
        writeexcel(excelpath, tagbooks)
        print('写入成功:' + excelpath)
    end = time.perf_counter()
    print("提取图书列表总共运行时间 : %.03f 秒" % (end - start))
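
# The project defines writeexcel / readexcel / listfiles elsewhere; they are
# not part of this section.  The versions below are only a minimal
# openpyxl-based sketch of the row-list convention used above (each sheet is a
# list of rows, each row a list of cell values), so the two functions above can
# run standalone.  Prefer the project's own helpers if they are available.
from openpyxl import Workbook, load_workbook


def writeexcel(path, rows):
    """Write a list of rows to an .xlsx file."""
    wb = Workbook()
    ws = wb.active
    for row in rows:
        ws.append(row)
    wb.save(path)


def readexcel(path):
    """Read an .xlsx file back into a list of rows."""
    wb = load_workbook(path)
    return [list(row) for row in wb.active.iter_rows(values_only=True)]


def listfiles(path):
    """Return the names of the plain files (crawled HTML pages) in a folder."""
    return [name for name in os.listdir(path)
            if os.path.isfile(os.path.join(path, name))]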
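
# Hypothetical entry point tying the two stages together (an assumption, not
# the project's own driver).  It presumes the per-tag listing pages have
# already been crawled into web/<category>/<tag>/ by the project's crawler,
# which is not shown in this section; only then does dealbooklist() have
# anything to extract.
if __name__ == '__main__':
    html = fetch_tag_page()
    if html:
        booktag(html)  # stage 1: tag list -> web/booktag.xlsx
    dealbooklist()     # stage 2: local listing pages -> books/<category>/<tag>.xlsx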