Ejemplo n.º 1
0
 def __init__(self, url):
     """Classify *url* and precompute its base URL and pagination offset."""
     self.category = parse_url(url)
     if not self.is_valid():
         # Unrecognized URL: leave navigation fields unset.
         self.baseurl = None
         self.start = None
         return
     parts = urlparse(url)
     self.scheme = parts.scheme
     self.netloc = parts.netloc
     # Base URL keeps scheme/netloc/path and drops params, query and fragment.
     self.baseurl = urlunparse((parts.scheme, parts.netloc, parts.path, "", "", ""))
     self.start = None
     if self.category <= 3:
         # Default offset is 0; a query string overrides it with the value
         # following the last '=' (e.g. "...?start=40" -> 40).
         self.start = 0
         if parts.query:
             self.start = int(parts.query[parts.query.rindex("=") + 1 :])
Ejemplo n.º 2
0
	def __init__(self, url):
		"""Classify *url* and precompute its base URL and pagination offset."""
		self.category = parse_url(url)
		if not self.is_valid():
			# Unrecognized URL: leave navigation fields unset.
			self.baseurl = None
			self.start = None
			return
		parts = urlparse(url)
		self.scheme = parts.scheme
		self.netloc = parts.netloc
		# Base URL keeps scheme/netloc/path and drops params, query and fragment.
		self.baseurl = urlunparse((parts.scheme, parts.netloc, parts.path, '', '', ''))
		self.start = None
		if self.category <= 3:
			# Default offset is 0; a query string overrides it with the value
			# following the last '=' (e.g. "...?start=40" -> 40).
			self.start = 0
			if parts.query:
				self.start = int(parts.query[parts.query.rindex('=')+1:])
Ejemplo n.º 3
0
 def run(self):
     """Crawl pages in steps of 18 until the offset passes the reported total."""
     offset = 0
     total = 10
     while offset < total + 18:
         # Build this page's URL from the template.
         page_url = self.temp_url.format(offset)
         print(page_url)
         # Fetch the page content.
         html_str = parse_url(page_url)
         # Extract the items and the (possibly updated) total count.
         content_list, total = self.get_content_list(html_str)
         # Persist the extracted items.
         self.save_content_list(content_list)
         # Advance to the next page.
         offset += 18
Ejemplo n.º 4
0
 def run(self):
     """Crawl pages in steps of 18 until the offset passes the reported total."""
     offset = 0
     total = 100
     while offset < total + 18:
         # Build this page's URL from the template.
         page_url = self.temp_url.format(offset)
         print(page_url)
         # Fetch the page content.
         html_str = parse_url(page_url)
         # Extract the items and the (possibly updated) total count.
         content_list, total = self.get_content_list(html_str)
         # Persist the extracted items.
         self.save_content_list(content_list)
         # Advance to the next page.
         offset += 18
Ejemplo n.º 5
0
    def run(self):
        """Main loop: crawl pages in steps of 20 until past the reported total."""
        offset = 0
        total = 100
        while offset < total + 20:
            # Build this page's URL from the template.
            page_url = self.temp_url.format(offset)
            print(page_url)
            # Fetch the page content.
            html_str = parse_url(page_url)
            # Extract the items and the (possibly updated) total count.
            # NOTE(review): 'get_contentf_list' looks like a typo for
            # 'get_content_list' — confirm against the enclosing class.
            content_list, total = self.get_contentf_list(html_str)
            # Persist the extracted items.
            self.save_content_list(content_list)
            # Advance to the next page.
            offset += 20
    def run(self):
        """Crawl pages in steps of 18 until the offset passes the reported total."""
        offset = 0
        total = 100
        while offset < total + 18:
            # Build the start URL for this offset.
            start_url = self.temp_url.format(offset)
            # Fetch the page content.
            html_str = parse_url(start_url)
            # Extract the items and the (possibly updated) total count.
            content_list, total = self.get_content_list(html_str)
            # Persist the extracted items.
            self.save_content_list(content_list)
            # Advance to the next page.
            offset += 18
Ejemplo n.º 7
0
 def run(self):
     """Main loop: crawl from offset 340 to 1200 in steps of 20, reporting progress."""
     offset = 340
     total = 1200
     while offset < total:
         # Build this page's URL from the template.
         page_url = self.temp_url.format(offset)
         # Fetch the page content.
         html_str = parse_url(page_url)
         # Extract the items from the response.
         # NOTE(review): 'get_contentf_list' looks like a typo for
         # 'get_content_list' — confirm against the enclosing class.
         content_list = self.get_contentf_list(html_str)
         # Persist the extracted items.
         self.save_content_list(content_list)
         # Advance to the next page and report progress.
         offset += 20
         print(
             "************************已经成功爬取%d条数据*************************"
             % (offset))
Ejemplo n.º 8
0
    def run(self):
        """Main loop: page through the feed until the offset passes the last page.

        Steps: build the start URL, fetch, extract (count, next offset, items,
        total), save, then stop once the offset exceeds total + count.
        """
        start = 0
        while True:
            print(start)  # current offset, e.g. 0 -> 18 -> 36 -> ...

            # 1. Build the start URL for this offset.
            start_url = self.temp_url.format(start)

            # 2. Fetch the page content.
            html_str = parse_url(start_url)

            # 3. Extract page size, server-reported offset, items and total.
            count, start, subject_collection_items, total = self.get_data(
                html_str)

            # 4. Persist the extracted items.
            self.save_data(subject_collection_items)

            # 5. Advance to the next page.
            start += count

            # 6. Stop once we are past the last page.
            if start > total + count:
                break
Ejemplo n.º 9
0
    def run(self):
        """Main loop: crawl pages in steps of 18 until past the reported total."""
        offset = 0
        total = 100

        while offset < total + 18:
            # Build the start URL for this offset.
            start_url = self.temp_url.format(offset)
            print(start_url)

            # Fetch the page content.
            html_str = parse_url(start_url)

            # Extract the items and the (possibly updated) total count.
            content_list, total = self.get_content_list(html_str)

            # Persist the extracted items.
            self.save_content_list(content_list)

            # Advance to the next page.
            offset += 18
Ejemplo n.º 10
0
 def run(self):
     """Main loop: crawl pages in steps of 18 until past the reported total.

     Steps: build the start URL, fetch, extract, save, then advance the
     offset and repeat.
     """
     offset = 0
     total = 100
     while offset < total + 18:
         # 1. Build the start URL for this offset.
         start_url = self.temp_url.format(offset)
         print(start_url)
         # 2. Fetch the page content.
         html_str = parse_url(start_url)
         # 3. Extract items and the (possibly updated) total count.
         # NOTE(review): 'get_contentf_list' and 'save_content_lis' below look
         # like typos for 'get_content_list'/'save_content_list' — confirm
         # against the enclosing class before renaming.
         content_list, total = self.get_contentf_list(html_str)
         print(total)
         # 4. Persist the extracted items.
         self.save_content_lis(content_list)
         # 5. Advance to the next page.
         offset += 18