Example 1
    def verify_proxy_ip(self):
        '''
        Verify whether ALL proxy ips in the database have expired;
        expired ones get status=1, marking them for deletion.
        '''
        p = SqliteHelper()
        result = p.db_select_all_for_verify()
        if result:
            for pro in result:
                pid = pro[0]
                aa_show = 'verify {0}-{1}:{2}'.format(pro[3], pro[1], pro[2])
                print(aa_show)
                proxyman.info(aa_show)

                p_ip = {
                    "{0}".format(pro[3]):
                    "http://{0}:{1}".format(pro[1], pro[2])
                }
                res = self.check_proxy_ip(p_ip)
                if not res:
                    # this proxy ip is no longer usable
                    sign_show = 'proxy ip [{0}] is unusable, marking it for deletion'.format(
                        pro[1])
                    print(sign_show)
                    proxyman.info(sign_show)
                    # mark as pending deletion
                    p.db_update_for_status(pid, 1)

                Shelper.makeSleep(3, False)

        else:
            res_show = 'no proxy ips pending verification were found in the database'
            print(res_show)
            proxyman.info(res_show)
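
The `p_ip` dict built above follows the `proxies` mapping that `requests` expects, and `check_proxy_ip` (shown in full in Example 10) compares the ip seen through the proxy with the ip seen directly. A condensed sketch of that check, assuming the behaviour of the Example 10 implementation:

    import requests

    def check_proxy_ip(proxyip, timeout=10):
        # proxyip follows requests' proxies format, e.g. {"http": "http://1.2.3.4:8080"}
        try:
            # our apparent public ip, as seen through the proxy
            masked = requests.get("http://icanhazip.com", timeout=timeout,
                                  proxies=proxyip).text.strip()
            # our real public ip, fetched without the proxy
            direct = requests.get("http://icanhazip.com", timeout=timeout).text.strip()
            # the proxy is usable (and anonymous) only if the two differ
            return masked != direct
        except requests.RequestException:
            return False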
Example 2
    def do_GET(self):  # must stay do_GET: BaseHTTPRequestHandler dispatches on 'do_' + the uppercase HTTP method
        """Handle GET requests that fetch or delete proxy ips."""
        ip_dict = {}  # previously named dict

        parsed_path = urllib.parse.urlparse(self.path)  # urlparse.urlparse in Python 2
        # build the response for the request
        try:
            query = urllib.parse.unquote(parsed_path.query)  # urllib.unquote in Python 2
            # print(query)  # debug output
            logger.info("query %s" % query)  # log the query string
            # pull the key/value pairs out of the query string
            if query.find('&') != -1:
                param_list = query.split('&')
                for param in param_list:
                    ip_dict[param.split('=')[0]] = param.split('=')[1]
            else:
                ip_dict[query.split('=')[0]] = query.split('=')[1]

            sql_helper = SqliteHelper()
            # handle requests that delete a proxy
            if 'delete' in ip_dict:   # previously: dict.has_key('delete')
                condition = "ip='" + ip_dict['ip'] + "' AND port=" + ip_dict['port']
                sql_helper.delete(SqliteHelper.tableName, condition)
                self.send_response(200)
                self.end_headers()
                self.wfile.write("Success delete proxy: " + ip_dict['ip'] + ":" + ip_dict['port'])
            else:
                str_count = ''
                conditions = []
                for key in ip_dict:
                    if key == 'count':
                        str_count = 'LIMIT 0,%s' % ip_dict[key]
                    if key == 'country' or key == 'area':
                        conditions.append(key + " LIKE '" + ip_dict[key] + "%'")
                    elif key == 'types' or key == 'protocol':  # country/area are already handled above
                        conditions.append(key + "=" + ip_dict[key])
                # joining also covers the single-condition case; an empty list yields ''
                conditions = ' AND '.join(conditions)
                result = sql_helper.select(sql_helper.tableName, conditions, str_count)
                # print type(result)
                # for r in  result:
                #     print r
                data = [{'ip': item[0], 'port': item[1]} for item in result]  # valid ip rows to return
                data = json.dumps(data)  # serialize to json
                self.send_response(200)
                self.end_headers()
                self.wfile.write(data.encode('utf-8'))  # wfile expects bytes on Python 3
        except Exception as e:
            logger.warning(str(e))
            self.send_response(404)
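
The handler above splits the query string by hand and concatenates request values straight into its SQL. A minimal sketch of the same parsing and delete using the standard library's query parser and sqlite3 placeholders; the database path and table name are hypothetical, since SqliteHelper's internals are not shown here:

    import sqlite3
    from urllib.parse import urlparse, parse_qs

    def handle_delete(path, db_path='proxy.db', table='proxys'):
        # parse_qs replaces the manual split('&') / split('=') loop
        params = {k: v[0] for k, v in parse_qs(urlparse(path).query).items()}
        if 'delete' in params:
            conn = sqlite3.connect(db_path)
            # values are bound as placeholders; only the constant table name is interpolated
            conn.execute("DELETE FROM %s WHERE ip=? AND port=?" % table,
                         (params['ip'], params['port']))
            conn.commit()
            conn.close()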
Example 3
    def do_GET(self):
        """
        """
        dict = {}

        parsed_path = urlparse.urlparse(self.path)
        try:
            query = urllib.unquote(parsed_path.query)
            logger.info("query %s" % query)
            if query.find('&') != -1:
                params = query.split('&')
                for param in params:
                    dict[param.split('=')[0]] = param.split('=')[1]
            else:
                dict[query.split('=')[0]] = query.split('=')[1]

            sqlHelper = SqliteHelper()
            # handle requests that delete a proxy
            if dict.has_key('delete'):
                condition = "ip='" + dict['ip'] + "' AND port=" + dict['port']
                sqlHelper.delete(SqliteHelper.tableName, condition)
                self.send_response(200)
                self.end_headers()
                self.wfile.write("Success delete proxy: " + dict['ip'] + ":" +
                                 dict['port'])
            else:
                str_count = ''
                conditions = []
                for key in dict:
                    if key == 'count':
                        str_count = 'LIMIT 0,%s' % dict[key]
                    if key == 'country' or key == 'area':
                        conditions.append(key + " LIKE '" + dict[key] + "%'")
                    elif key == 'types' or key == 'protocol':  # country/area are already handled above
                        conditions.append(key + "=" + dict[key])
                # joining also covers the single-condition case; an empty list yields ''
                conditions = ' AND '.join(conditions)
                result = sqlHelper.select(sqlHelper.tableName, conditions,
                                          str_count)
                # print type(result)
                # for r in  result:
                #     print r
                data = [{'ip': item[0], 'port': item[1]} for item in result]
                data = json.dumps(data)
                self.send_response(200)
                self.end_headers()
                self.wfile.write(data)
        except Exception as e:
            logger.warning(str(e))
            self.send_response(404)
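
For context, these do_GET handlers presumably live in a BaseHTTPRequestHandler subclass served roughly like this (Python 2 module names, matching the example above; the address and port are made up):

    from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler

    class ProxyHandler(BaseHTTPRequestHandler):
        def do_GET(self):
            # ... body as in the example above ...
            self.send_response(200)
            self.end_headers()

    server = HTTPServer(('0.0.0.0', 8000), ProxyHandler)
    server.serve_forever()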
Example 4
    def do_GET(self):
        """
        """
        dict = {}

        parsed_path = urlparse.urlparse(self.path)
        try:
            query = urllib.unquote(parsed_path.query)
            print query
            if query.find('&') != -1:
                params = query.split('&')
                for param in params:
                    dict[param.split('=')[0]] = param.split('=')[1]
            else:
                dict[query.split('=')[0]] = query.split('=')[1]
            str_count = ''
            conditions = []
            for key in dict:
                if key == 'count':
                    str_count = 'LIMIT 0,%s' % dict[key]
                if key == 'country' or key == 'area':
                    conditions.append(key + " LIKE '" + dict[key] + "%'")
                elif key == 'types' or key == 'protocol':  # country/area are already handled above
                    conditions.append(key + "=" + dict[key])
            # joining also covers the single-condition case; an empty list yields ''
            conditions = ' AND '.join(conditions)
            sqlHelper = SqliteHelper()
            result = sqlHelper.select(sqlHelper.tableName, conditions,
                                      str_count)
            # print type(result)
            # for r in  result:
            #     print r
            print result
            data = json.dumps(result)
            self.send_response(200)
            self.end_headers()
            self.wfile.write(data)
        except Exception as e:
            print e
            self.send_response(404)
Example 5
    def do_GET(self):
        """
        """
        dict = {}

        parsed_path = urlparse.urlparse(self.path)
        try:
            query = urllib.unquote(parsed_path.query)
            print query
            if query.find('&') != -1:
                params = query.split('&')
                for param in params:
                    dict[param.split('=')[0]] = param.split('=')[1]
            else:
                dict[query.split('=')[0]] = query.split('=')[1]
            str_count = ''
            conditions = []
            for key in dict:
                if key == 'count':
                    str_count = 'LIMIT 0,%s' % dict[key]
                if key == 'country' or key == 'area':
                    conditions.append(key + " LIKE '" + dict[key] + "%'")
                elif key == 'types' or key == 'protocol':  # country/area are already handled above
                    conditions.append(key + "=" + dict[key])
            # joining also covers the single-condition case; an empty list yields ''
            conditions = ' AND '.join(conditions)
            sqlHelper = SqliteHelper()
            result = sqlHelper.select(sqlHelper.tableName, conditions, str_count)
            # print type(result)
            # for r in  result:
            #     print r
            print result
            data = json.dumps(result)
            self.send_response(200)
            self.end_headers()
            self.wfile.write(data)
        except Exception as e:
            print e
            self.send_response(404)
Example 6
    def run(self):
        while True:
            logger.info("Start to run spider")
            sqlHelper = SqliteHelper()
            logger.info('Start to run validator')
            validator = Validator(sqlHelper)
            count = validator.run_db()
            logger.info('Finished running validator, count=%s' % count)
            if count[0] < MINNUM:
                proxys = self.crawl_pool.map(self.crawl, parserList)
                # at this point proxys looks like [[{},{},{}],[{},{},{}]]
                # print proxys
                # flatten it first, then deduplicate:

                proxys_tmp = []
                for proxy in proxys:
                    proxys_tmp.extend(proxy)

                proxys = proxys_tmp
                logger.info('first_proxys: %s' % len(proxys))
                # now proxys looks like [{},{},{},{},{},{}]
                proxys_tmp = None
                # deduplicate:
                proxys = [
                    dict(t)
                    for t in set([tuple(proxy.items()) for proxy in proxys])
                ]
                logger.info('end_proxy: %s' % len(proxys))
                logger.info('spider proxys: %s' % type(proxys))
                proxys = validator.run_list(proxys)  # proxies that passed validation

                sqlHelper.batch_insert(sqlHelper.tableName, proxys)

                logger.info('success ip: %s' % sqlHelper.selectCount())
                sqlHelper.close()
            logger.info('Finished running spider')
            time.sleep(UPDATE_TIME)
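
The comprehension above is the usual trick for deduplicating a list of dicts: dicts are unhashable, so each one is converted to a hashable tuple of items, collapsed through a set, and converted back. A standalone illustration with made-up values (note that the set does not preserve order):

    proxys = [
        {'ip': '1.1.1.1', 'port': 80},
        {'ip': '2.2.2.2', 'port': 8080},
        {'ip': '1.1.1.1', 'port': 80},   # duplicate
    ]
    unique = [dict(t) for t in set(tuple(p.items()) for p in proxys)]
    print(len(unique))  # 2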
Example 7
    def run(self):
        while True:
            print 'spider beginning -------'
            sqlHelper = SqliteHelper()
            print 'validator beginning -------'
            validator = Validator(sqlHelper)
            count = validator.run_db()
            print 'validator end ----count=%s'%count
            if count[0]< MINNUM:
                proxys = self.crawl_pool.map(self.crawl,parserList)
                # at this point proxys looks like [[{},{},{}],[{},{},{}]]
                # print proxys
                # flatten it first, then deduplicate:

                proxys_tmp = []
                for proxy in proxys:
                    proxys_tmp.extend(proxy)

                proxys = proxys_tmp
                print 'first_proxys--%s' % len(proxys)
                # now proxys looks like [{},{},{},{},{},{}]
                proxys_tmp = None
                # deduplicate:
                proxys = [dict(t) for t in set([tuple(proxy.items()) for proxy in proxys])]
                print 'end_proxys--%s' % len(proxys)
                print 'spider proxys -------%s' % type(proxys)
                proxys = validator.run_list(proxys)  # proxies that passed validation


                sqlHelper.batch_insert(sqlHelper.tableName,proxys)


                print 'success ip =%s'%sqlHelper.selectCount()
                sqlHelper.close()
            print 'spider end -------'
            time.sleep(UPDATE_TIME)
Example 8
    def get_proxy_ip_mimiip(self, urlFormat, tmpName, maxPageNo=1, urlType=1):
        """[获取代理ip for mimiip]
		
		Arguments:
			urlFormat {[type]} -- [链接]
			tmpName {[type]} -- [模板目录]
		
		Keyword Arguments:
			maxPageNo {number} -- [最大页码] (default: {1})
			urlType {number} -- [1:静态页  2:动态页] (default: {1})
		"""

        extra = Extractor()
        extra.setXsltFromFile(tmpName)
        doSpider = Spider()
        p = SqliteHelper()

        if maxPageNo <= 1:
            maxPageNo = 1
        maxPageNo += 1

        for page in range(1, maxPageNo):
            url = urlFormat.format(page)
            # url='http://www.mimiip.com/gngao/{0}'.format(page)

            html_dom = doSpider.getContent(url, urlType)
            op_xml = extra.extractHtmlDomtoXml(html_dom)
            op_json = doSpider.xmlToJson(op_xml)
            # proxyman.info(op_json)
            # print(op_json)
            # return False
            # break

            # parse the converted json
            obj = json.loads(op_json)
            proxy_list = []
            if obj['proxyshow']:
                for ps in obj['proxyshow']['item']:
                    proxy_dict = {}

                    proxy_dict['xip'] = ps['xip']
                    proxy_dict['xport'] = ps['xport']
                    proxy_dict['xaddr'] = ps['xaddr'].replace('\n', '')
                    proxy_dict['xlevel'] = ps['xlevel']
                    proxy_dict['xprotocal'] = ps['xprotocal'].lower()
                    proxy_list.append(proxy_dict)

            proxy_list_ok = []
            # iterate and verify whether each proxy ip is usable
            for pro in proxy_list:
                aa_show = 'the {0}-{1}:{2} for {3}'.format(
                    pro['xprotocal'], pro['xip'], pro['xport'],
                    pro['xaddr'].encode('utf-8'))
                print(aa_show)
                proxyman.info(aa_show)

                p_ip = {
                    "{0}".format(pro['xprotocal']):
                    "http://{0}:{1}".format(pro['xip'], pro['xport'])
                }
                res = self.check_proxy_ip(p_ip)
                if res:
                    proxy_list_ok.append(pro)

            # insert the verified proxy ips into the database
            count = p.db_insert_for_proxyip(proxy_list_ok)
            print('inserted %d ips successfully' % (count))

            # rest briefly after finishing each page
            Shelper.makeSleep(5)
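
A hypothetical call, assuming this method lives on the ProxySpider class shown in Example 10; the url pattern comes from the commented-out line above, while the template filename is made up:

    spider = ProxySpider()
    # the page number is substituted into {0}; crawl pages 1..3 of the static list
    spider.get_proxy_ip_mimiip('http://www.mimiip.com/gngao/{0}',
                               'templates/mimiip.xslt', maxPageNo=3, urlType=1)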
Example 9
	def __init__(self):
		self.SQLdb=SqliteHelper()
		self.proxyman=shelper.setLog('spider')
Example 10
class ProxySpider(object):
	"""docstring for ProxySpider"""
	def __init__(self):
		self.SQLdb=SqliteHelper()
		self.proxyman=shelper.setLog('spider')

	# fetch proxy ips from mimiip
	def get_proxy_ip_mimiip(self,urlFormat,tmpName,maxPageNo=1,urlType=1):
		"""[获取代理ip for mimiip]
		
		[注意:该方法为示例方法,用于演示抓取站点代理IP列表及验证到保存入库的完整过程]

		Arguments:
			urlFormat {[type]} -- [链接]
			tmpName {[type]} -- [模板目录]
		
		Keyword Arguments:
			maxPageNo {number} -- [最大页码] (default: {1})
			urlType {number} -- [1:静态页  2:动态页] (default: {1})
		"""
		
		extra=Extractor()
		extra.setXsltFromFile(tmpName)
		doSpider=Spider()
		p=SqliteHelper()

		if maxPageNo <= 1:
			maxPageNo=1
		maxPageNo+=1

		for page in range(1,maxPageNo):
			url=urlFormat.format(page)
			# url='http://www.mimiip.com/gngao/{0}'.format(page)

			html_dom=doSpider.getContent(url,urlType)
			op_xml=extra.extractHtmlDomtoXml(html_dom)
			op_json=doSpider.xmlToJson(op_xml)

			# proxyman.info(op_json)
			# print(op_json)
			# return False
			# break

			# parse the converted json
			obj=json.loads(op_json)
			proxy_list=[]
			if obj['proxyshow']:
				for ps in obj['proxyshow']['item']:
					proxy_dict={}

					proxy_dict['xip']=ps['xip']
					proxy_dict['xport']=ps['xport']
					proxy_dict['xaddr']=ps['xaddr'].replace('\n','')
					proxy_dict['xlevel']=ps['xlevel']
					proxy_dict['xprotocal']=ps['xprotocal'].lower()
					proxy_list.append(proxy_dict)

			proxy_list_ok=[]
			# iterate and verify whether each proxy ip is usable
			for pro in proxy_list:
				aa_show='the {0}-{1}:{2} for {3}'.format(pro['xprotocal'],pro['xip'],pro['xport'],pro['xaddr'].encode('utf-8'))
				print(aa_show)
				self.proxyman.info(aa_show)

				p_ip={"{0}".format(pro['xprotocal']):"http://{0}:{1}".format(pro['xip'],pro['xport'])}
				res=self.check_proxy_ip(p_ip)
				if res:
					proxy_list_ok.append(pro)

			# insert the verified proxy ips into the database
			count = p.db_insert_for_proxyip(proxy_list_ok)
			print('inserted %d ips successfully' %(count))

			# rest briefly after finishing each page
			shelper.makeSleep(5)


	# fetch proxy ips
	def get_proxy_ip(self,funcSite,urlFormat,tmpName,maxPageNo=1,urlType=1):
		"""[获取某站点下的代理ip]
		
		[通过指定抓取站点链接,指定xslt模板文件的方式来抓取指定代理站点下的可用的高匿代理IP]
		
		Arguments:
			funcSite {[type]} -- [针对于指定站点解析json数据的方法]
			urlFormat {[type]} -- [指定站点url,页码部分为“{0}”]
			tmpName {[type]} -- [指定站点的xslt模板]
		
		Keyword Arguments:
			maxPageNo {number} -- [最大页码] (default: {1})
			urlType {number} -- [站点html类型 1静态  2动态] (default: {1})
		"""
		extra=Extractor()
		extra.setXsltFromFile(tmpName)
		doSpider=Spider()

		if maxPageNo <= 1:
			maxPageNo=1
		maxPageNo+=1
		try:
			for page in range(1,maxPageNo):
				url=urlFormat.format(page)
				# fetch the html content of one page
				page_html_dom=doSpider.getContent(url,urlType)
				page_xml=extra.extractHtmlDomtoXml(page_html_dom)
				page_json_data=doSpider.xmlToJson(page_xml)

				# **************************************
				# Debug html
				# page_htmlStr=doSpider.htmlStr
				# self.proxyman.info(page_htmlStr)

				# Debug jsondata
				self.proxyman.info(page_json_data)
				# print(page_json_data)
				# **************************************

				# parse the extracted content with the site-specific parser to get the crawled proxy ips
				page_proxy_list=funcSite(page_json_data)

				# filter the proxy ips by verifying their availability
				page_proxy_list_ok=self.available_proxy_ip(page_proxy_list)

				# save the verified proxy ips to the database
				self.save_proxy_ip(page_proxy_list_ok)

				# rest briefly after finishing each page
				shelper.makeSleep(5)
		except Exception as e:
			err_show='[get_proxy_ip]--error-{0}'.format(str(e))
			print(err_show)
			self.proxyman.error(err_show)
		finally:
			fina_show='[get_proxy_ip]--The work is done'
			print(fina_show)
			self.proxyman.info(fina_show)  # informational, not an error


	# verify proxy ip availability
	def available_proxy_ip(self,proxyList):
		"""[Availability check]

		[Iterates over the list, verifying whether each proxy ip is usable]

		Arguments:
			proxyList {[list]} -- [proxy ips awaiting verification]

		Returns:
			[list] -- [proxy ips that passed verification]
		"""
		
		proxy_list_ok=[]
		try:
			for pro in proxyList:
				aa_show='the {0}-{1}:{2} for {3}'.format(pro['xprotocal'],pro['xip'],pro['xport'],pro['xaddr'].encode('utf-8'))
				print(aa_show)
				self.proxyman.info(aa_show)

				# {"http":"http://102.168.5.103:8080"}
				p_ip={"{0}".format(pro['xprotocal']):"http://{0}:{1}".format(pro['xip'],pro['xport'])}
				# decide whether the proxy ip is usable by comparing a direct request with a proxied one
				res=self.check_proxy_ip(p_ip)
				if res:
					proxy_list_ok.append(pro)
		except Exception as e:
			err_show='[available_proxy_ip]--error-{0}'.format(str(e))
			print(err_show)
			self.proxyman.error(err_show)
		finally:
			return proxy_list_ok


	# save the verified proxy ips to the database
	def save_proxy_ip(self,proxyList):
		"""[Save to the database]

		[Inserts the verified proxy ips into the database]

		Arguments:
			proxyList {[list]} -- [verified proxy ips]
		"""
		count = self.SQLdb.db_insert_for_proxyip(proxyList)
		print('inserted %d ips successfully' %(count))
		self.proxyman.info('inserted %d ips successfully' %(count))


	# verify whether a given proxy ip is usable
	def check_proxy_ip(self,proxyip):
		"""[Verify whether a proxy ip is usable]

		[proxyip format: {"http":"http://120.52.73.97:8081"}]

		Arguments:
			proxyip {[dict]} -- [proxy ip dict awaiting verification]

		Returns:
			bool -- [whether it passed]
		"""

		s = requests.Session()
		a = requests.adapters.HTTPAdapter(max_retries=3)
		b = requests.adapters.HTTPAdapter(max_retries=3)
		s.mount('http://', a)
		s.mount('https://', b)

		the_checked_ip=list(proxyip.values())[0]  # list() so this also works on Python 3

		try:
			MaskedIP = s.get("http://icanhazip.com", timeout=10, proxies=proxyip).text.strip()  # .text (str), not .content (bytes), so re.match works on Python 3

			# use a regex to check that the returned content really is an ip
			pattern=r"^(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])$"
			pattern_res=re.match(pattern,MaskedIP)
			if not pattern_res:
				res_show='the returned content is not an ip'
				print(res_show)
				self.proxyman.error('Result Content is Not Ip')
				return False

			mask_ip=pattern_res.group(0)

			# direct request, without the proxy
			OriginalIP = requests.get("http://icanhazip.com", timeout=30).text.strip()

			ip_show='original_ip is [{0}] -- mask_ip is [{1}]'.format(OriginalIP,mask_ip)
			print(ip_show)

			if OriginalIP != mask_ip:
				print('Proxy IP ok')
				self.proxyman.info('the mask ip [{0}] and return ip [{1}] is {2}'.format(the_checked_ip,mask_ip,'[OK]'))
				return True
			else:
				print('Not Anonymous')
				self.proxyman.info('the mask ip [{0}] and return ip [{1}] is {2}'.format(the_checked_ip,mask_ip,'Not Anonymous'))
				return False
		except requests.exceptions.Timeout:
			print('the request timeout')
			self.proxyman.error('Timeout')
			return False
		except Exception as e:
			print('the request failed')
			self.proxyman.error('Error: {0}'.format(str(e)))  # include the reason in the log
			return False

	# check whether the proxy ips in the database are still usable and mark them accordingly
	def verify_proxy_ip(self):
		"""[Check whether proxy ips have expired]

		[Verifies whether ALL proxy ips in the database have expired; expired ones get status=1, marking them for deletion]
		"""
		# fetch the full proxy ip list from the database
		result=self.SQLdb.db_select_all_for_verify()
		if result:
			for pro in result:
				pid=pro[0]
				aa_show='verify {0}-{1}:{2}'.format(pro[3],pro[1],pro[2])
				print(aa_show)
				self.proxyman.info(aa_show)

				p_ip={"{0}".format(pro[3]):"http://{0}:{1}".format(pro[1],pro[2])}
				res=self.check_proxy_ip(p_ip)
				if not res:
					# this proxy ip is no longer usable
					sign_show='proxy ip [{0}] is unusable, marking it for deletion'.format(pro[1])
					print(sign_show)
					self.proxyman.info(sign_show)
					# mark as pending deletion in the database
					self.SQLdb.db_update_for_status(pid,1)

				shelper.makeSleep(3,False)

		else:
			res_show='no proxy ips pending verification were found in the database'
			print(res_show)
			self.proxyman.info(res_show)
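
To tie the pieces together, a hypothetical end-to-end use of ProxySpider; the parser function, url, and template name are made up, and funcSite only has to turn the extracted json into the xip/xport/xaddr/xlevel/xprotocal dicts the verifier expects:

    import json

    def parse_some_site(page_json_data):
        # hypothetical parser: adapt the extracted json to the expected dict shape
        obj = json.loads(page_json_data)
        rows = obj.get('proxyshow', {}).get('item', [])
        return [{
            'xip': r['xip'],
            'xport': r['xport'],
            'xaddr': r['xaddr'].replace('\n', ''),
            'xlevel': r['xlevel'],
            'xprotocal': r['xprotocal'].lower(),
        } for r in rows]

    spider = ProxySpider()
    # crawl, verify, and store pages 1..2, then re-check everything already stored
    spider.get_proxy_ip(parse_some_site, 'http://example.com/proxies/{0}',
                        'templates/some_site.xslt', maxPageNo=2)
    spider.verify_proxy_ip()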