def __init__(self, url):
    """Classify *url* and split it into a base URL and a start offset.

    ``self.category`` is whatever ``parse_url`` (defined elsewhere)
    returns for *url*.  When ``self.is_valid()`` is false, ``baseurl`` and
    ``start`` are set to ``None`` and ``scheme``/``netloc`` are left
    unset.  Otherwise the URL is split with ``urlparse`` and ``baseurl``
    is the URL rebuilt from scheme, netloc and path only.

    ``start`` is ``None`` unless ``self.category <= 3``; in that case it
    is 0, or, when the URL has a query string, the integer that follows
    the last ``=`` in the query.

    Raises:
        ValueError: if a non-empty query contains no ``=`` or the text
            after its last ``=`` is not an integer.
    """
    self.category = parse_url(url)

    # Invalid URLs carry no base URL and no offset.
    if not self.is_valid():
        self.baseurl = None
        self.start = None
        return

    scheme, netloc, path, _params, query, _fragment = urlparse(url)
    self.scheme = scheme
    self.netloc = netloc
    # Drop params, query and fragment from the rebuilt URL.
    self.baseurl = urlunparse((scheme, netloc, path, "", "", ""))

    self.start = None
    # NOTE(review): the query lookup is assumed to apply only to
    # categories <= 3 (source layout was flattened) - confirm.
    if self.category <= 3:
        self.start = int(query[query.rindex("=") + 1:]) if query else 0
def __init__(self, url):
    """Store the category, base URL and start offset derived from *url*.

    The category comes from ``parse_url`` (defined elsewhere).  If
    ``self.is_valid()`` is true, ``scheme``, ``netloc`` and ``baseurl``
    (the URL without params, query or fragment) are filled in from
    ``urlparse``; otherwise only ``baseurl`` and ``start`` are set, both
    to ``None``.

    For categories <= 3 ``start`` is 0 unless a query string is present,
    in which case it is the integer after the query's last ``=``.  For
    other categories ``start`` is ``None``.

    Raises:
        ValueError: if a non-empty query has no ``=`` or its trailing
            value is not an integer.
    """
    self.category = parse_url(url)
    if self.is_valid():
        parts = urlparse(url)
        # Positions 0, 1, 2 and 4 are scheme, netloc, path and query.
        scheme, netloc, path, query = parts[0], parts[1], parts[2], parts[4]
        self.scheme = scheme
        self.netloc = netloc
        self.baseurl = urlunparse((scheme, netloc, path, "", "", ""))

        start = None
        # NOTE(review): the query check is assumed to be nested under the
        # category check (source layout was flattened) - confirm.
        if self.category <= 3:
            start = 0
            if query:
                last_value = query[query.rindex("=") + 1:]
                start = int(last_value)
        self.start = start
    else:
        self.baseurl = None
        self.start = None
def run(self):
    """Fetch the listing page by page and save the items of each page.

    The offset starts at 0 and grows by 18 per request.  Each pass formats
    ``self.temp_url`` with the offset, prints the URL, hands it to
    ``parse_url`` (defined elsewhere; presumably returns the response
    text), extracts ``(content_list, total)`` via
    ``self.get_content_list`` and passes the items to
    ``self.save_content_list``.  ``total`` starts at the placeholder 10
    and is replaced by the extracted value after every request.
    """
    page_size = 18
    offset = 0
    total = 10  # placeholder until the first response supplies a value
    # NOTE(review): with this bound a request is still issued for an
    # offset in [total, total + page_size) - confirm that is intended.
    while offset < total + page_size:
        # 1. Build the URL for the current offset.
        page_url = self.temp_url.format(offset)
        print(page_url)
        # 2. Send the request and obtain the response.
        response_text = parse_url(page_url)
        # 3. Extract the data (note: called through self).
        items, total = self.get_content_list(response_text)
        # 4. Save.
        self.save_content_list(items)
        # 5. Move to the next page; steps 2-5 repeat.
        offset += page_size
def run(self):
    """Walk through the paginated listing, saving every page's items.

    Offsets advance in steps of 18 from 0.  For each offset the URL is
    built from ``self.temp_url``, printed, and passed to ``parse_url``
    (defined elsewhere; presumably returns the response text).  The result
    goes through ``self.get_content_list``, which yields the items and the
    total used for the loop bound; the items are then given to
    ``self.save_content_list``.
    """
    offset, total = 0, 100  # 100 is only a placeholder for the first pass
    while True:
        # NOTE(review): the bound is total + 18, so an offset at or just
        # past total is still requested - confirm that is intended.
        if not (offset < total + 18):
            break
        # 1. Start URL for this page.
        url = self.temp_url.format(offset)
        print(url)
        # 2. Send the request and get the response.
        response_text = parse_url(url)
        # 3. Extract the data.
        items, total = self.get_content_list(response_text)
        # 4. Save.
        self.save_content_list(items)
        # 5. Build the next page's offset.
        offset += 18
def run(self):
    """Main loop: request each page, extract its items and save them.

    The position starts at 0 and increases by 20 per request.  Every pass
    formats ``self.temp_url`` with the position, prints the URL, sends it
    through ``parse_url`` (defined elsewhere; presumably returns the
    response text) and ``self.get_contentf_list``, then saves the
    extracted items with ``self.save_content_list``.  The total returned
    by the extraction step replaces the initial placeholder of 100.
    """
    step = 20
    position = 0
    expected_total = 100  # placeholder until a response provides the total
    # NOTE(review): the bound is expected_total + step, which still issues
    # a request once position has reached expected_total - confirm.
    while position < expected_total + step:
        # 1. Start URL.
        target = self.temp_url.format(position)
        print(target)
        # 2. Send the request, 3. extract the data.
        extracted = self.get_contentf_list(parse_url(target))
        content, expected_total = extracted
        # 4. Save.
        self.save_content_list(content)
        # 5. Next page; steps 2-5 repeat.
        position += step
def run(self):
    """Request pages 18 items apart and save what each one contains.

    ``self.temp_url`` is formatted with the current offset and passed to
    ``parse_url`` (defined elsewhere; presumably returns the response
    text).  ``self.get_content_list`` turns that into the page's items
    plus a total that drives the loop bound, and
    ``self.save_content_list`` receives the items.
    """
    offset, total = 0, 100  # total is a placeholder for the first pass
    # NOTE(review): the bound is total + 18, so one request is made with
    # an offset at or beyond total - confirm that is intended.
    while offset < total + 18:
        page = parse_url(self.temp_url.format(offset))
        items, total = self.get_content_list(page)
        self.save_content_list(items)
        offset += 18
def run(self):
    """Main loop: fetch and save pages for offsets 340, 360, ... 1180.

    Both bounds are hard-coded and the upper bound is never updated from
    the responses.  For each offset the URL built from ``self.temp_url``
    is passed to ``parse_url`` (defined elsewhere; presumably returns the
    response text), the items are extracted with
    ``self.get_contentf_list`` and saved with ``self.save_content_list``.
    A progress banner containing the next offset is printed afterwards.
    """
    first_offset = 340
    stop_offset = 1200
    page_size = 20
    for offset in range(first_offset, stop_offset, page_size):
        # 1. Start URL.
        url = self.temp_url.format(offset)
        # 2. Send the request and get the response.
        response_text = parse_url(url)
        # 3. Extract the data.
        items = self.get_contentf_list(response_text)
        # 4. Save.
        self.save_content_list(items)
        # 5. Report progress using the offset of the next page.
        # NOTE(review): the banner is assumed to be printed once per page
        # (source layout was flattened) - confirm.
        print(
            "************************已经成功爬取%d条数据*************************"
            % (offset + page_size))
def run(self):
    """Main loop: request, extract and save pages until past the total.

    Each pass prints the current offset, formats ``self.temp_url`` with
    it, passes the URL to ``parse_url`` (defined elsewhere; presumably
    returns the response text) and unpacks
    ``(count, start, subject_collection_items, total)`` from
    ``self.get_data``.  The items go to ``self.save_data``.  The offset
    for the next pass is the extracted ``start`` plus ``count``; the loop
    ends once that offset exceeds ``total + count``.
    """
    offset = 0
    while True:
        print(offset)  # e.g. 0 -> 18 -> 36 -> 54

        # Steps: 1 build the start URL, 2 send the request, 3 extract the
        # data, 4 save, 5 compute the next offset, 6 decide whether to
        # loop again.

        # 1. URL for the current offset.
        page_url = self.temp_url.format(offset)

        # 2. Send the request and get the response.  (An earlier variant
        # called requests.get directly with Referer and User-Agent
        # headers and decoded response.content.)
        response_text = parse_url(page_url)

        # 3. Extract the data; the offset is taken from the extracted
        # values rather than from the local counter.
        page_size, offset, items, total = self.get_data(response_text)

        # 4. Save.
        self.save_data(items)

        # 5. Offset of the next page.
        offset = offset + page_size

        # 6. Stop once the offset has passed total + page_size.
        # NOTE(review): this bound still allows requests whose offset is
        # at or beyond total - confirm that is intended.
        if offset > total + page_size:
            break
def run(self):
    """Main loop: page through the listing and save every page's items.

    Starting from offset 0 and stepping by 18, each pass formats
    ``self.temp_url``, prints the resulting URL, passes it to
    ``parse_url`` (defined elsewhere; presumably returns the response
    text), extracts ``(content_list, total)`` with
    ``self.get_content_list`` and saves the items with
    ``self.save_content_list``.  The extracted total replaces the initial
    placeholder of 100.
    """
    page_size = 18
    next_offset = 0
    total = 100  # placeholder until the first response supplies a value
    # NOTE(review): the bound is total + page_size, so a request is still
    # made once next_offset has reached total - confirm.
    while next_offset < total + page_size:
        # 1. Start URL.
        request_url = self.temp_url.format(next_offset)
        print(request_url)
        # 2. Send the request and get the response.
        body = parse_url(request_url)
        # 3. Extract the data.
        page_items, total = self.get_content_list(body)
        # 4. Save.
        self.save_content_list(page_items)
        # 5. Next page; steps 2-5 repeat.
        next_offset = next_offset + page_size
def run(self):
    """Main logic: fetch, extract and save one page per iteration.

    Steps per iteration:
      1. build the start URL from ``self.temp_url`` and the offset
      2. send the request via ``parse_url`` (defined elsewhere;
         presumably returns the response text)
      3. extract the items and the total with ``self.get_contentf_list``
      4. save the items with ``self.save_content_lis``
      5. advance the offset by 18 and repeat steps 2-5

    The URL and the extracted total are printed on every iteration.
    """
    offset, total = 0, 100  # 100 is a placeholder for the first pass
    while True:
        # NOTE(review): the bound is total + 18, so an offset at or just
        # past total is still requested - confirm that is intended.
        if not (offset < total + 18):
            break
        # 1. Start URL.
        page_url = self.temp_url.format(offset)
        print(page_url)
        # 2. Send the request and get the response.
        response_text = parse_url(page_url)
        # 3. Extract the data.
        items, total = self.get_contentf_list(response_text)
        print(total)
        # 4. Save.
        self.save_content_lis(items)
        # 5. Next page's offset.
        offset += 18