Ejemplo n.º 1
0
	def run(self):
		spider_start_time = str(datetime.now()).split('.')[0]
		print spider_start_time, 'time to spider start!'
		proxy_manager = ProxyManager()
		page = get_html(BASE_URL)
		page = unicode(page,'GBK').encode('UTF-8')
		page_count = self.get_page_count(page)
		page_count_time = str(datetime.now()).split('.')[0]
		print page_count_time, 'get page count:', page_count
		default_ip = get_default_ip()
		if page_count != 0:
			last_proxy = None
			for i in xrange(1, page_count):
				page = get_html(URL_HEADER + str(i) + URL_END, last_proxy)
				proxy_list = filte(page)
				for proxy in proxy_list:
					if proxy.anonymous_type == '高匿':
						check_result = check_anonymous(proxy, default_ip)
						spider_time = str(datetime.now()).split('.')[0]
						if check_result[0]:
							proxy.delay_time = check_result[1]
							proxy.created_time = str(datetime.now()).split('.')[0]
							proxy.is_in_china = 2
							proxy_manager.add_proxy(proxy, spider_time)
							last_proxy = proxy
						else:
							pass
Ejemplo n.º 2
0
	def run(self):
		spider_start_time = str(datetime.now()).split('.')[0]
		print spider_start_time, 'time to spider start!'
		proxy_manager = ProxyManager()
		last_proxy = None
		for url in self.urls:
			page = get_html(url)
			page_count = self.get_page_count(page)
			page_count_time = str(datetime.now()).split('.')[0]
			print page_count_time, 'get page count:', page_count
			default_ip = get_default_ip()
			for i in xrange(1, page_count):
				page = get_html(url + str(i))
				proxy_list = filte(page)
				for proxy in proxy_list:
					if proxy.anonymous_type == '高匿':
						check_result = check_anonymous(proxy, default_ip)
						spider_time = str(datetime.now()).split('.')[0]
						if check_result[0]:
							proxy.delay_time = check_result[1]
							proxy.created_time = str(datetime.now()).split('.')[0]
							proxy.is_in_china = 0
							if url.endswith(CHINA_ANONYMOUS) or url.endswith(CHINA_NORMAL):
								proxy.is_in_china = 1
							proxy_manager.add_proxy(proxy, spider_time)
							last_proxy = proxy
						else:
							pass
Ejemplo n.º 3
0
 def run(self):
     spider_start_time = str(datetime.now()).split('.')[0]
     print spider_start_time, 'time to spider start!'
     proxy_manager = ProxyManager()
     page = get_html(BASE_URL)
     page = unicode(page, 'GBK').encode('UTF-8')
     page_count = self.get_page_count(page)
     page_count_time = str(datetime.now()).split('.')[0]
     print page_count_time, 'get page count:', page_count
     default_ip = get_default_ip()
     if page_count != 0:
         last_proxy = None
         for i in xrange(1, page_count):
             page = get_html(URL_HEADER + str(i) + URL_END, last_proxy)
             proxy_list = filte(page)
             for proxy in proxy_list:
                 if proxy.anonymous_type == '高匿':
                     check_result = check_anonymous(proxy, default_ip)
                     spider_time = str(datetime.now()).split('.')[0]
                     if check_result[0]:
                         proxy.delay_time = check_result[1]
                         proxy.created_time = str(
                             datetime.now()).split('.')[0]
                         proxy.is_in_china = 2
                         proxy_manager.add_proxy(proxy, spider_time)
                         last_proxy = proxy
                     else:
                         pass
Ejemplo n.º 4
0
 def run(self):
     spider_start_time = str(datetime.now()).split('.')[0]
     print spider_start_time, 'time to spider start!'
     proxy_manager = ProxyManager()
     last_proxy = None
     for url in self.urls:
         page = get_html(url)
         page_count = self.get_page_count(page)
         page_count_time = str(datetime.now()).split('.')[0]
         print page_count_time, 'get page count:', page_count
         default_ip = get_default_ip()
         for i in xrange(1, page_count):
             page = get_html(url + str(i))
             proxy_list = filte(page)
             for proxy in proxy_list:
                 if proxy.anonymous_type == '高匿':
                     check_result = check_anonymous(proxy, default_ip)
                     spider_time = str(datetime.now()).split('.')[0]
                     if check_result[0]:
                         proxy.delay_time = check_result[1]
                         proxy.created_time = str(
                             datetime.now()).split('.')[0]
                         proxy.is_in_china = 0
                         if url.endswith(CHINA_ANONYMOUS) or url.endswith(
                                 CHINA_NORMAL):
                             proxy.is_in_china = 1
                         proxy_manager.add_proxy(proxy, spider_time)
                         last_proxy = proxy
                     else:
                         pass