def get_article(index_url):
    '''Collect the URL of every original post on the current page and
    return a dict of post titles, folder names and post URLs.'''
    title_list = []
    path_list = []
    url_list = []
    html = down.get(index_url, 3)
    html.encoding = 'gbk'
    all_font = BeautifulSoup(html.text, 'lxml').find_all(color="green")
    for font in all_font:
        title = font.get_text()
        print(u'Found post:', title)  # a bit of feedback, otherwise the run is too quiet
        path = str(title).replace('?', '')  # some titles contain '?', which Windows does not allow in folder names
        href = font.parent['href']
        if dagaier_collection.find_one({'主题页面': href}):  # skip topics that are already in the database
            print(u'This page has already been crawled')
        else:
            # append to all three lists together so they stay index-aligned for get_pic()
            title_list.append(title)
            path_list.append(path)
            mkdir(path)  # create the folder; note that path here is derived from the title
            url_list.append('https://liuyouba.tk/' + href)
    articles = dict(title=title_list, name=path_list, urls=url_list)
    return articles
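# get_article() above relies on a mkdir() helper that is not shown. A minimal
# sketch of what it is assumed to do: create the per-post folder under the same
# base directory that SavePic() below writes into (the base path is taken from
# SavePic(); everything else is an assumption).
import os

def mkdir(path):
    full_path = os.path.join('E:/result/dagaier/', path.strip())
    if not os.path.exists(full_path):
        os.makedirs(full_path)
        print(u'Created folder:', path)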
def img(self, url):
    girl = down.get(url, 3)
    img_soup = BeautifulSoup(girl.text, 'lxml')
    img_url = img_soup.find('div', class_='main-image').find('img')['src']
    self.img_urls.append(img_url)
    name = img_url[-9:-4]
    self.save(img_url, name)
def save(self, img_url):
    name = img_url[-9:-4]
    print(u'Saving:', img_url)
    img = down.get(img_url, 3)
    with open(name + '.jpg', 'wb') as f:  # 'wb' rather than 'ab': appending would corrupt the image on a re-run
        f.write(img.content)
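# All of these snippets fetch pages through down.get(url, retries). The `down`
# middleware is not shown; a minimal sketch of a requests wrapper with the same
# call shape (main() below calls it without a retry count, hence the default;
# the timeout value is an assumption):
import requests

def get(url, retries=3):
    '''Fetch a URL, retrying up to `retries` times on network errors.'''
    for attempt in range(retries):
        try:
            return requests.get(url, timeout=10)
        except requests.RequestException:
            print(u'Request failed, retry {}/{}: {}'.format(attempt + 1, retries, url))
    raise IOError('all retries failed for ' + url)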
def start(url):
    response = down.get(url, 3)
    Soup = BeautifulSoup(response.text, 'lxml')
    imgs_list = Soup.find('ul', id='pins').select('span > a')
    for imgs in imgs_list:
        title = imgs.get_text()
        href = imgs['href']
        spider_queue.push(href, title)
def start(url):
    response = down.get(url, 3)
    all_a = BeautifulSoup(response.text, 'lxml').find('div', class_='all').find_all('a')
    for a in all_a:
        title = a.get_text()
        href = a['href']
        spider_queue.push(href, title)
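# Both start() variants above hand work off to spider_queue.push(url, title).
# The queue implementation is not shown; a minimal in-memory stand-in with the
# same push() interface (the real project may well use Redis or another shared
# store for distributed crawling):
from collections import deque

class SpiderQueue(object):
    def __init__(self):
        self._tasks = deque()

    def push(self, url, title):
        self._tasks.append((url, title))  # enqueue one topic page together with its title

    def pop(self):
        return self._tasks.popleft() if self._tasks else None

spider_queue = SpiderQueue()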
def html(self, href):
    img_a_girl_html = down.get(href, 3)
    img_a_girl = BeautifulSoup(img_a_girl_html.text, 'lxml')
    # the second-to-last <span> in the page navigation holds the page count
    max_span = img_a_girl.find('div', class_='pagenavi').find_all('span')[-2].get_text()
    self.img(href)  # page 1 lives at the bare topic URL
    for page in range(2, int(max_span) + 1):
        page_url = href + '/' + str(page)
        self.img(page_url)
def html(self, href):
    html = down.get(href, 3)
    max_span = BeautifulSoup(html.text, 'lxml').find_all('span')[-2].get_text()
    page_num = 0  # counter used to tell when the last image is being downloaded
    for page in range(1, int(max_span) + 1):
        page_num += 1  # bumped once per loop; when page_num equals max_span we are on the last image
        page_url = href + '/' + str(page)
        self.img(page_url, max_span, page_num)  # pass both values on to the next function
def SavePic(item):
    '''Save one fetched image to disk via the `down` middleware.'''
    url = item.url
    filename = item.name
    basepath = 'E:/result/dagaier/'
    content = down.get(url, 3).content
    with open(basepath + filename, 'wb') as f:
        f.write(content)
    print('File {} downloaded'.format(basepath + filename))
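# SavePic() above reads item.url and item.name, and get_pic() below fills them
# in. The Item class itself is not shown; judging by that usage it is a plain
# two-field container:
class Item(object):
    def __init__(self):
        self.url = ''   # remote image address
        self.name = ''  # relative save path, e.g. '<folder>/<n>.jpg'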
def all_url(self, url):
    html = down.get(url, 3)
    all_a = BeautifulSoup(html.text, 'lxml').find('div', class_='all').find_all('a')
    for a in all_a:
        title = a.get_text()
        self.title = title  # keep the current topic title on self
        print(u'Saving:', title)
        path = str(title).replace("?", '_')
        self.mkdir(path)
        os.chdir("C:\\mzitu\\" + path)
        href = a['href']
        self.url = href  # keep the current page address on self
        if self.meizitu_collection.find_one({'主题页面': href}):  # skip topics that are already in the database
            print(u'This page has already been crawled')
        else:
            self.html(href)
def img(self, page_url, max_span, page_num):  # receives the two values passed down from html()
    img_html = down.get(page_url, 3)
    img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src']
    self.img_urls.append(img_url)  # every address found by the range(1, int(max_span) + 1) loop is collected here
    if int(max_span) == page_num:  # last image: save it, then write the whole record to the database
        self.save(img_url)
        post = {  # the record is a plain dict; the Chinese keys name the fields: title, topic page, image URLs, fetch time
            '标题': self.title,
            '主题页面': self.url,
            '图片地址': self.img_urls,
            '获取时间': datetime.datetime.now()
        }
        self.meizitu_collection.save(post)  # write the record to MongoDB
        print(u'Inserted into the database')
    else:  # not the last image yet: just save it
        self.save(img_url)
def get_pic(articles):
    '''Open each post URL, find the image addresses,
    collect them in a list of items and return it.'''
    articles_urls = []  # holds the Item objects to download
    # split the dict into post links, folder names and titles
    article_list = articles['urls']
    basedir = articles['name']
    mongo_title = articles['title']
    pathNum = 0
    for url in article_list:
        img_html = down.get(url, 3)
        img_urls = BeautifulSoup(img_html.text, 'lxml').find_all('input', type='image')
        post = {  # the record is a plain dict; the Chinese keys name the fields: title, topic page, image URLs, fetch time
            '标题': mongo_title[pathNum],
            '主题页面': url,
            '图片地址': [img['src'] for img in img_urls],  # find_all() returns a list, so collect every src
            '获取时间': datetime.datetime.now()
        }
        dagaier_collection.save(post)  # write the record to MongoDB
        print(u'Inserted into the database')
        picNum = 1
        for img_url in img_urls:
            item = Item()
            item.url = img_url['src']
            item.name = str(basedir[pathNum]) + '/' + str(picNum) + '.jpg'
            articles_urls.append(item)
            picNum += 1
        pathNum += 1
    return articles_urls
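# How the dagaier pieces above fit together end to end (a sketch; the exact
# index-page URL is an assumption):
if __name__ == '__main__':
    articles = get_article('https://liuyouba.tk/')  # hypothetical index page
    for item in get_pic(articles):
        SavePic(item)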
def all_url(self, url):
    html = down.get(url, 3)
    Soup = BeautifulSoup(html.text, 'lxml')
    imgs_list = Soup.find('ul', id='pins').select('span > a')
    for imgs in imgs_list:
        title = imgs.get_text()
        self.title = title
        self.mkdir(title)
        href = imgs['href']
        self.url = href
        if self.mzitu_collection.find_one({'主题页面': href}):
            print('This topic has already been crawled')
        else:
            self.html(href)
            post = {
                '标题': self.title,
                '主题页面': self.url,
                '图片地址': self.img_urls,
                '获取时间': datetime.datetime.now()
            }
            self.mzitu_collection.save(post)
            # reset per-topic state before the next iteration
            self.title = ''
            self.url = ''
            self.img_urls = []
def save(self, img_url, name):
    img = down.get(img_url, 3)
    print('Saving image: ' + name + '.jpg')
    with open(name + '.jpg', 'wb') as f:  # 'wb' rather than 'ab': appending would corrupt the image on a re-run
        f.write(img.content)
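# The methods above (all_url, html, img, save) belong to a spider class whose
# definition is not shown. A minimal skeleton of the state they rely on, using
# pymongo for the collection (the connection details are assumptions; note the
# snippets call collection.save(), which is the old pymongo API):
import pymongo

class Mzitu(object):
    def __init__(self):
        client = pymongo.MongoClient('localhost', 27017)  # assumed local MongoDB
        db = client['meizitu']
        self.meizitu_collection = db['meizitu_collection']
        self.title = ''     # title of the topic currently being crawled
        self.url = ''       # page address of the current topic
        self.img_urls = []  # image URLs collected for the current topic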
def main():
    if len(sys.argv) > 1:
        username = sys.argv[1]
    else:
        username = input('type the username you want to crawl: ')
    printWithTime('connecting database...')
    connect('douban_album')
    link = 'https://www.douban.com/people/' + username
    # http get
    main_page = down.get(link)
    # parse
    main_tree = html.fromstring(main_page.text)
    name = main_tree.xpath('//div[@class="user-info"]/div[@class="pl"]/text()')[0].strip()
    intro = main_tree.xpath('//span[@id="intro_display"]/text()')
    photo_link = link.rstrip('/') + '/photos'
    try:
        os.mkdir('data')
    except FileExistsError:
        pass
    os.chdir('data')
    try:
        os.mkdir(name)
    except FileExistsError:
        pass
    os.chdir(name)
    try:
        os.mkdir('albums')
    except FileExistsError:
        pass
    os.chdir('albums')
    # db
    try:
        user = userRecord.objects.get(username=username)
    except DoesNotExist:
        # not found -> create one
        user = userRecord(username=username)
        user.save()
    albumDict = {}
    i = 0
    while True:
        printWithTime('get album link at page ' + str(i) + '...')
        photo_page = down.get(photo_link)
        time.sleep(1)
        photo_tree = html.fromstring(photo_page.text)
        albumList = photo_tree.xpath('//div[@class="wr"]/div[@class="albumlst"]')
        for album in albumList:
            link = album.xpath('.//div[@class="pl2"]/a/@href')[0]
            title = album.xpath('.//div[@class="pl2"]/a/text()')[0]
            # album filter: get the total number of pictures in the album
            total = int(album.xpath('.//span[@class="pl"]/text()')[0].split('张')[0].strip())
            albumID = int(link.split('/')[-2])
            if total == albumRecord.objects(albumID=albumID, status=True).count():
                printWithTime("jump album {}".format(title))
                continue  # next album
            # otherwise the album is not finished yet, so keep it
            albumDict[title] = [link, total]
        try:
            i += 1
            photo_link = photo_tree.xpath('//link[@rel="next"]/@href')[0]
        except IndexError:
            # the end of the album pages
            break
    for title, linkAndTotal in albumDict.items():
        getAllPhotoInAlbum(linkAndTotal[0], title, linkAndTotal[1])
    printWithTime("Mission Complete!")
    os.chdir('../../..')
def getAllPhotoInAlbum(albumLink, title, total):
    photoIDList = []
    albumID = int(albumLink.split('/')[-2])
    i = 0
    printWithTime('{} begin...'.format(title))
    if total == albumRecord.objects(albumID=albumID).count():
        # every photo (including status == False) is already in the db: skip collecting
        pass
    else:
        # otherwise keep collecting and add the photos that are not in the db yet
        while True:
            printWithTime('get photo link at page ' + str(i) + '...')
            page = down.get(albumLink)
            time.sleep(1)
            tree = html.fromstring(page.text)
            photoListInOnePage = tree.xpath('//a[@class="photolst_photo"]/@href')
            for photo in photoListInOnePage:
                picID = photo.split('/')[-2]
                try:
                    albumRecord.objects.get(albumID=albumID, picID=picID)
                except DoesNotExist:
                    # not in db yet: add it with status=False (i.e. not downloaded)
                    temp = albumRecord(albumID=albumID, picID=picID, status=False)
                    temp.save()
                else:
                    # already in db: skip
                    continue
            try:
                albumLink = tree.xpath('//link[@rel="next"]/@href')[0]
                if albumLink[-3:] == 'sep':
                    # distinguishes the 'next page' of the album from the 'next page' of the reviews
                    break
                i += 1
            except IndexError:
                break
    for queryResult in albumRecord.objects(albumID=albumID, status=False):
        photoIDList.append(queryResult['picID'])
    # todo: display download percentage
    # todo: append the album to the user record (nonlocal-variable error in Python 3.5)
    for i, picID in enumerate(tqdm(photoIDList)):
        photoURL = 'https://img3.doubanio.com/view/photo/l/public/p{}.webp'.format(picID)
        try:
            os.mkdir(title)
        except FileExistsError:
            pass
        filename = title + '/' + str(picID) + '.webp'
        with open(filename, 'wb') as f:
            print('Download ' + photoURL.split('/')[-1] + ' of album "' + title + '"')
            f.write(down.get(photoURL).content)
        # convert the downloaded .webp to .jpg and drop the original
        temp_img = Image.open(filename).convert("RGB")
        temp_img.save(title + '/' + str(picID) + '.jpg')
        os.remove(filename)
        time.sleep(1)
        # mark the photo as downloaded
        albumRecord.objects(albumID=albumID, picID=picID).update_one(status=True)
    printWithTime('done with album: ' + title)
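# main() and getAllPhotoInAlbum() above depend on two mongoengine documents and
# a printWithTime() helper that are not shown. Minimal sketches matching how
# they are used (the field types are assumptions):
import datetime
from mongoengine import Document, StringField, IntField, BooleanField

class userRecord(Document):
    username = StringField(required=True)

class albumRecord(Document):
    albumID = IntField()
    picID = StringField()                 # taken from a URL segment, so kept as a string
    status = BooleanField(default=False)  # True once the photo has been downloaded

def printWithTime(msg):
    print('[{}] {}'.format(datetime.datetime.now().strftime('%H:%M:%S'), msg))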