def html(self, href):
    html = request.get(href, 3)
    html_soup = BeautifulSoup(html.text, 'lxml')
    # The second-to-last <span> in the page navigation holds the last page number.
    max_span = html_soup.find('div', class_="pagenavi").find_all('span')[-2].get_text()
    for page in range(1, int(max_span) + 1):
        page_url = href + '/' + str(page)
        self.img(page_url)
        time.sleep(3)

def start(start_url):
    start_html = request.get(start_url, 3)
    soup = BeautifulSoup(start_html.text, 'lxml')
    li_list = soup.find('div', {'class': 'all'}).find_all('a')
    for li in li_list:
        title = li.get_text()
        url = li['href']
        spider_queue.push(url, title)

def all_url(self, start_url):
    start_html = request.get(start_url, 3)
    soup = BeautifulSoup(start_html.text, 'lxml')
    li_list = soup.find('div', {'class': 'all'}).find_all('a')
    for li in li_list:
        title = li.get_text()
        print(u'Saving:', title)
        path = str(title).replace('?', '_')
        self.mkdir(path)  # call mkdir to create the folder
        os.chdir('/home/rising/图片/meizitu/' + path)  # switch into that directory
        href = li['href']
        self.html(href)

def all_url(self, url):
    html = request.get(url, 3)  ## changed here (notice `self` is gone?)
    all_a = BeautifulSoup(html.text, 'lxml').find('div', class_='all').find_all('a')
    for a in all_a[1:]:
        title = a.get_text()
        self.title = title
        print(u'Saving:', title)  ## a little feedback so it isn't too dull
        # One title contains a '?', which Windows does not allow in folder
        # names, so replace it.
        path = str(title).replace("?", ' ')
        self.mkdir(path)
        os.chdir('E:\\图片\\mzitu\\' + path)
        href = a['href']
        self.url = href  ## keep the topic page address on self.url
        # Skip topics already in the database; run the else branch otherwise.
        if self.meizitu_collection.find_one({'topic_page': href}):
            print(u'This page has already been crawled')
        else:
            self.html(href)

def get(self, url, timeout=10, num_retries=5):
    print('Fetching:', url)
    UA = random.choice(self.user_agent)
    headers = {'User-Agent': UA}
    try:
        IP = ''.join(str(random.choice(self.iplist)).strip())
        proxy = {'http': IP}
        return requests.get(url, headers=headers, proxies=proxy, timeout=timeout)
    except requests.exceptions.RequestException:
        if num_retries > 0:
            print('Fetch failed, retrying in 10s, attempts left:', num_retries)
            time.sleep(10)
            return self.get(url, timeout, num_retries - 1)
        else:
            print('The proxies are no good! Dropping the proxy')
            return requests.get(url, headers=headers, timeout=timeout)

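# Nearly every snippet here calls `request.get(url, 3)`. A minimal sketch of
# the shared downloader module they appear to assume -- the module name
# `Download`, the `user_agent`/`iplist` attributes, and their values are
# assumptions, not confirmed by the source:
import random
import time

import requests


class download(object):
    def __init__(self):
        # Assumed: a pool of User-Agent strings and HTTP proxies to rotate
        # through; real lists would be much longer.
        self.user_agent = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
        ]
        self.iplist = ['127.0.0.1:8080']  # placeholder proxy list

    def get(self, url, timeout=10, num_retries=5):
        # Same retry-with-proxy logic as the method shown above.
        ...


# Shared instance the spiders import, e.g. `from Download import request`.
request = download()
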
def pageurl_crawler():
    while True:
        try:
            (url, name) = img_queue.pop()
            print(url)
        except KeyError:
            print('The queue is empty')
            break
        else:
            title = img_queue.pop_title_(url)
            path = str(title).replace('?', '')
            mkdir(path)
            os.chdir('C:\\Data\\' + path)
            html = request.get(url, 3)
            html_soup = BeautifulSoup(html.text, 'lxml')
            img = html_soup.find('div', id='i3').find('img')
            img_url = img['src']
            print(u'Got the image link')
            save(img_url, name)
            img_queue.complete(url)

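# `spider_queue` and `img_queue` are never defined in these snippets. A
# minimal sketch of the Redis-backed queue interface they appear to assume
# (class name, key layout, and the exact pop()/pop_title_() semantics are
# assumptions): push(url, title) enqueues work, pop() raises KeyError when
# the queue is empty, and complete(url) marks an item as done.
import redis


class RedisQueue(object):
    def __init__(self, name, host='localhost', port=6379):
        self.db = redis.Redis(host=host, port=port)
        self.name = name

    def push(self, url, title):
        self.db.hset(self.name + ':titles', url, title)  # remember the title
        self.db.lpush(self.name, url)                    # enqueue the url

    def pop(self):
        url = self.db.rpop(self.name)
        if url is None:
            raise KeyError('queue is empty')
        url = url.decode()
        return url, self.pop_title_(url)

    def pop_title_(self, url):
        title = self.db.hget(self.name + ':titles', url)
        return title.decode() if title else ''

    def complete(self, url):
        self.db.sadd(self.name + ':done', url)  # record finished urls


spider_queue = RedisQueue('spider')
img_queue = RedisQueue('img')
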
def all_url(self, url):
    html = request.get(url, 3)
    html.encoding = html.apparent_encoding
    Soup = BeautifulSoup(html.text, 'lxml')
    all_p = Soup.find('div', class_='box mtop').find_all('div', class_='p')
    for p in all_p:
        a = p.find('a')
        title = a.find('img').attrs["alt"]
        self.title = title
        print(u'Saving:', title)
        href = a['href']
        self.url = href
        if self.meizitu_collection.find_one({'topic_page': href}):
            print(u'This page has already been crawled')
        else:
            # '?' and '/' are not allowed in folder names, so replace them.
            path0 = str(title).replace("?", "_")
            path = str(path0).replace(r'/', "-")
            self.mkdir(path)
            os.chdir("E:\\meitulu\\" + path)
            self.html(href)

def save(self, url):
    html = self.get(url)
    soup = BeautifulSoup(html, 'lxml')
    tag_all = soup.find_all('div', class_='text')
    for tag in tag_all:
        img_url = tag.find('img')['src']
        # Normalize to a bare host/path: strip an absolute 'http:' scheme or
        # a protocol-relative '//' prefix, then re-add 'http://' below.
        # (The original used Python 2's cmp() with inverted logic.)
        if img_url.startswith('http:'):
            img_url = img_url[7:]
        elif img_url.startswith('//'):
            img_url = img_url[2:]
        else:
            continue
        [filename, filetype] = img_url.split('/')[-1].split('.')
        with open('/media/wangs/Docs/pic/jiandan/{}.{}'.format(filename, filetype), 'wb') as img:
            img.write(request.get('http://' + img_url, 3).content)

def img(self, page_url, max_span, page_num):  # extra parameters passed in from above
    img_html = request.get(page_url, 3)  ## changed here (dropped `self`; 3 is the timeout)
    img_url = BeautifulSoup(img_html.text, 'lxml').find(
        'div', class_='main-image').find('img')['src']
    # Every image address fetched by `for page in range(1, int(max_span) + 1)`
    # is appended to the img_urls list initialized earlier.
    self.img_urls.append(img_url)
    # When the two parameters max_span and page_num are equal this is the last
    # image: download it and write the record to the database.
    if int(max_span) == page_num:
        self.save(img_url)
        post = {  # build a dict for the record
            'title': self.title,
            'topic_page': self.url,
            'image_urls': self.img_urls,
            'fetched_at': datetime.datetime.now()
        }
        self.meizitu_collection.save(post)  # write the post to the database
        print(u'Inserted into the database')
    else:  # max_span != page_num: just download
        self.save(img_url)

def img(self, page_url, max_span, page_num):
    img_html = request.get(page_url, 3)
    img_url = BeautifulSoup(img_html.text, 'lxml').find(
        'div', class_='main-image').find('img')['src']
    # Append every image address to the list on each loop.
    self.img_urls.append(img_url)
    print('image_url:', img_url)
    # When max_span equals page_num this is the last image: download it one
    # last time and store the record in MongoDB.
    if int(max_span) == page_num:
        self.save(img_url)
        # Build the dict to store in MongoDB.
        post = {
            'title': self.title,
            'topic_page': self.url,
            'image_urls': self.img_urls,
            'fetched_at': datetime.datetime.now()
        }
        # Write the record to MongoDB.
        self.meizitu_collection.save(post)
        print(post, 'stored in MongoDB')
    else:
        self.save(img_url)

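# `self.meizitu_collection` is never initialized in these snippets. A minimal
# sketch of the MongoDB setup they appear to assume (the client address and
# the database/collection names are assumptions). Note that Collection.save()
# is deprecated in modern pymongo; insert_one()/replace_one() are the current
# equivalents.
import pymongo

client = pymongo.MongoClient('localhost', 27017)
db = client['meizitu']
meizitu_collection = db['meizitu_collection']

# e.g. inside the spider's __init__:
#     self.meizitu_collection = meizitu_collection
#     self.img_urls = []
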
def all_url(self, url):
    ## Pass the album index address to request; it returns a response.
    html = request.get(url, 3)
    all_a = BeautifulSoup(html.text, 'lxml').find('div', class_='all').find_all('a')
    for a in all_a:
        title = a.get_text()
        self.title = title
        # One title contains a '?', which Windows does not allow in folder
        # names, so replace it.
        path = str(title).replace('?', '_')
        # Call mkdir to create the folder; here path stands in for the title.
        self.mkdir(path)
        os.chdir("D:\\mzitu\\" + path)
        print('Saving:', title, 'path:', 'D:\\mzitu\\' + path)
        href = a['href']
        # Keep the topic page address on self.url.
        self.url = href
        ## Pass href (the album address) on to html(), unless it was already crawled.
        print('href:', href)
        if self.meizitu_collection.find_one({'topic_page': href}):
            print(href, 'already crawled')
        else:
            self.html(href)

def save(self, img_url):
    name = img_url[-9:-4]  # last few characters of the URL become the file name
    print(u'Saving', img_url)
    img = request.get(img_url, 3)
    with open(name + '.jpg', 'ab') as f:
        f.write(img.content)

def img(self, page_url):
    img_html = request.get(page_url, 3)
    img_soup = BeautifulSoup(img_html.text, 'lxml')
    img_url = img_soup.find('div', class_="main-image").find('img')['src']
    self.save(img_url)

def get(self, url):
    html = request.get(url, 3)
    return html.text

def request(self, url):
    return request.get(url, self.timeout)

def img(self, page_url):
    img_html = request.get(page_url, 3)
    # img_url = Download.get(self, page_url, 3)
    img_url = BeautifulSoup(img_html.text, 'lxml').find(
        'div', class_='main-image').find('img')['src']
    self.save(img_url)

def download_img(self, img_url, referer):
    name = img_url[-9:-4] + '.jpg'
    # img = self.request(img_url)
    img = request.get(img_url, 3, referer=referer)
    with open(name, 'wb') as f:
        f.write(img.content)

def get_img(self, page_url, referer):
    headers = request.build_headers(referer=referer)
    img_html = request.get(page_url, headers, 10)
    src = BeautifulSoup(img_html.text, 'lxml').find(
        'div', class_='main-image').find('img')['src']
    self.save_img(src, referer)

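# The two snippets above pass a Referer along with each request: mzitu-style
# sites reject image downloads whose Referer does not point back at the album
# page (anti-hotlinking). `build_headers` is not defined anywhere in these
# snippets; a minimal sketch of what it might look like, shown as a plain
# function although the code above calls it on the shared `request` object
# (the User-Agent value is a placeholder, not from the source):
def build_headers(referer):
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/62.0.3202.94 Safari/537.36',
        'Referer': referer,  # the album page this image belongs to
    }
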
def save(self, img_url):
    name = img_url[-9:-4]
    img = request.get(img_url, 3)
    with open(name + '.jpg', 'ab') as f:
        f.write(img.content)

headers = {
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'jandan.net',
    'Referer': 'http://jandan.net/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
}
url = 'http://jandan.net/ooxx/page-{}#comments'.format(3)
html = request.get(url, 3).text
# Locate the min.js bundle referenced by the page.
j = re.search(
    r'.*<script\ssrc=\"\/\/(cdn.jandan.net\/static\/min.*?)\"><\/script>.*',
    html)
jsFileUrl = "http://" + j.group(1)
jsFile = requests.get(jsFileUrl, headers=headers).text
index = 0


class jiandan():
    def get(self, url):
        html = request.get(url, 3)
        return html.text

    def save(self, url):

def img(self, page_url):
    img_html = request.get(page_url, 3)  ## changed here (notice `self` is gone?)
    img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src']
    self.save(img_url)

def html(self, href):
    html = request.get(href, 3)  ## changed here (notice `self` is gone?)
    max_span = BeautifulSoup(html.text, 'lxml').find_all('span')[10].get_text()
    for page in range(1, int(max_span) + 1):
        page_url = href + '/' + str(page)
        self.img(page_url)

def save(self, img_url):  # this function saves the image
    name = img_url[-9:-4]
    img = request.get(img_url, 3)
    # Open the file in binary mode -- the 'b' flag is required for media files!
    f = open(name + '.jpg', 'ab')
    f.write(img.content)  # use .content for binary/media responses
    f.close()

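# A minimal usage sketch tying the pieces together. The class name Mzitu and
# the start URL are assumptions -- the snippets above only show individual
# methods, never the class definition or entry point:
if __name__ == '__main__':
    spider = Mzitu()  # hypothetical class holding all_url/html/img/save
    spider.all_url('http://www.mzitu.com/all')  # assumed album index URL
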