# -*- coding: utf-8 -*-
import json
import re

import requests

# Project-local helpers; the module paths below are assumptions inferred
# from how these names are used in this file.
from extractor import Extractor
from spider import Spider
from sqlitehelper import SqliteHelper
import shelper


class ProxySpider(object):
    """Crawls proxy sites, validates the scraped proxy IPs, and stores the
    usable ones in a sqlite database."""

    def __init__(self):
        self.SQLdb = SqliteHelper()
        self.proxyman = shelper.setLog('spider')

    # Fetch proxy IPs from mimiip
    def get_proxy_ip_mimiip(self, urlFormat, tmpName, maxPageNo=1, urlType=1):
        """Fetch proxy IPs from mimiip.

        Note: this is a sample method that demonstrates the full pipeline of
        scraping a site's proxy IP list, validating it, and saving it to the
        database.

        Arguments:
            urlFormat {str} -- URL template
            tmpName {str} -- xslt template path

        Keyword Arguments:
            maxPageNo {int} -- maximum page number (default: {1})
            urlType {int} -- 1: static page, 2: dynamic page (default: {1})
        """
        extra = Extractor()
        extra.setXsltFromFile(tmpName)
        doSpider = Spider()
        p = SqliteHelper()
        if maxPageNo <= 1:
            maxPageNo = 1
        maxPageNo += 1
        for page in range(1, maxPageNo):
            url = urlFormat.format(page)
            # url = 'http://www.mimiip.com/gngao/{0}'.format(page)
            html_dom = doSpider.getContent(url, urlType)
            op_xml = extra.extractHtmlDomtoXml(html_dom)
            op_json = doSpider.xmlToJson(op_xml)
            # self.proxyman.info(op_json)
            # print(op_json)
            # return False
            # break
            # Parse the converted JSON
            obj = json.loads(op_json)
            proxy_list = []
            if obj['proxyshow']:
                for ps in obj['proxyshow']['item']:
                    proxy_dict = {}
                    proxy_dict['xip'] = ps['xip']
                    proxy_dict['xport'] = ps['xport']
                    proxy_dict['xaddr'] = ps['xaddr'].replace('\n', '')
                    proxy_dict['xlevel'] = ps['xlevel']
                    proxy_dict['xprotocal'] = ps['xprotocal'].lower()
                    proxy_list.append(proxy_dict)
            proxy_list_ok = []
            # Iterate and verify whether each proxy IP is usable
            for pro in proxy_list:
                aa_show = 'the {0}-{1}:{2} for {3}'.format(
                    pro['xprotocal'], pro['xip'], pro['xport'], pro['xaddr'])
                print(aa_show)
                self.proxyman.info(aa_show)
                p_ip = {
                    pro['xprotocal']:
                        'http://{0}:{1}'.format(pro['xip'], pro['xport'])
                }
                res = self.check_proxy_ip(p_ip)
                if res:
                    proxy_list_ok.append(pro)
            # Save the filtered, verified proxy IPs to the database
            count = p.db_insert_for_proxyip(proxy_list_ok)
            print('insert %d ips success' % count)
            # Rest briefly after finishing a page
            shelper.makeSleep(5)

    # Fetch proxy IPs from a given site
    def get_proxy_ip(self, funcSite, urlFormat, tmpName, maxPageNo=1,
                     urlType=1):
        """Fetch proxy IPs from a given site.

        Given the site's URL template and an xslt template file, crawls the
        usable high-anonymity proxy IPs of the specified proxy site.

        Arguments:
            funcSite {callable} -- function that parses the site's JSON data
            urlFormat {str} -- site URL; the page-number slot is "{0}"
            tmpName {str} -- the site's xslt template

        Keyword Arguments:
            maxPageNo {int} -- maximum page number (default: {1})
            urlType {int} -- site HTML type: 1 static, 2 dynamic (default: {1})
        """
        extra = Extractor()
        extra.setXsltFromFile(tmpName)
        doSpider = Spider()
        if maxPageNo <= 1:
            maxPageNo = 1
        maxPageNo += 1
        try:
            for page in range(1, maxPageNo):
                url = urlFormat.format(page)
                # Fetch the page's HTML content
                page_html_dom = doSpider.getContent(url, urlType)
                page_xml = extra.extractHtmlDomtoXml(page_html_dom)
                page_json_data = doSpider.xmlToJson(page_xml)
                # **************************************
                # Debug html
                # page_htmlStr = doSpider.htmlStr
                # self.proxyman.info(page_htmlStr)
                # Debug json data
                self.proxyman.info(page_json_data)
                # print(page_json_data)
                # **************************************
                # Parse the scraped content with the site-specific parser to
                # obtain the list of scraped proxy IPs
                page_proxy_list = funcSite(page_json_data)
                # Filter the proxy IPs by verifying their availability
                page_proxy_list_ok = self.availabile_proxy_ip(page_proxy_list)
                # Save the verified proxy IPs to the database
                self.save_proxy_ip(page_proxy_list_ok)
                # Rest briefly after finishing a page
                shelper.makeSleep(5)
        except Exception as e:
            err_show = '[get_proxy_ip]--error-{0}'.format(str(e))
            print(err_show)
            self.proxyman.error(err_show)
        finally:
            fina_show = '[get_proxy_ip]--The work is Done'
            print(fina_show)
            self.proxyman.info(fina_show)

    # Proxy IP availability check
    def availabile_proxy_ip(self, proxyList):
        """Availability check.

        Iterates over the proxy IPs and verifies whether each one is usable.

        Arguments:
            proxyList {list} -- proxy IPs to verify

        Returns:
            list -- proxy IPs that passed verification
        """
        proxy_list_ok = []
        try:
            for pro in proxyList:
                aa_show = 'the {0}-{1}:{2} for {3}'.format(
                    pro['xprotocal'], pro['xip'], pro['xport'], pro['xaddr'])
                print(aa_show)
                self.proxyman.info(aa_show)
                # e.g. {"http": "http://102.168.5.103:8080"}
                p_ip = {
                    pro['xprotocal']:
                        'http://{0}:{1}'.format(pro['xip'], pro['xport'])
                }
                # Decide whether the proxy is usable by comparing a direct
                # request against a request made through the proxy
                res = self.check_proxy_ip(p_ip)
                if res:
                    proxy_list_ok.append(pro)
        except Exception as e:
            err_show = '[availabile_proxy_ip]--error-{0}'.format(str(e))
            print(err_show)
            self.proxyman.error(err_show)
        finally:
            return proxy_list_ok

    # Save the verified proxy IPs to the database
    def save_proxy_ip(self, proxyList):
        """Save to the database.

        Inserts the verified proxy IPs into the database.

        Arguments:
            proxyList {list} -- verified proxy IPs
        """
        count = self.SQLdb.db_insert_for_proxyip(proxyList)
        print('insert %d ips success' % count)
        self.proxyman.info('insert %d ips success' % count)

    # Verify whether a given proxy IP is usable
    def check_proxy_ip(self, proxyip):
        """Verify whether a proxy IP is usable.

        proxyip format: {"http": "http://120.52.73.97:8081"}

        Arguments:
            proxyip {dict} -- the proxy IP to verify

        Returns:
            bool -- whether it passed
        """
        s = requests.Session()
        a = requests.adapters.HTTPAdapter(max_retries=3)
        b = requests.adapters.HTTPAdapter(max_retries=3)
        s.mount('http://', a)
        s.mount('https://', b)
        the_checked_ip = list(proxyip.values())[0]
        try:
            MaskedIP = s.get('http://icanhazip.com', timeout=10,
                             proxies=proxyip).text.strip()
            # Use a regex to check that the response body really is an IP
            pattern = (r"^(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\."
                       r"(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\."
                       r"(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\."
                       r"(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])$")
            pattern_res = re.match(pattern, MaskedIP)
            if not pattern_res:
                res_show = 'return result is not ip'
                print(res_show)
                self.proxyman.error('Result Content is Not Ip')
                return False
            mask_ip = pattern_res.group(0)
            # Direct request, without the proxy
            OrigionalIP = requests.get('http://icanhazip.com',
                                       timeout=30).text.strip()
            ip_show = 'origional_ip is [{0}] -- mask_ip is [{1}]'.format(
                OrigionalIP, mask_ip)
            print(ip_show)
            if OrigionalIP != mask_ip:
                print('Proxy IP ok')
                self.proxyman.info(
                    'the mask ip【{0}】and return ip【{1}】is {2}'.format(
                        the_checked_ip, mask_ip, '【OK】'))
                return True
            else:
                print('Not Anonymous')
                self.proxyman.info(
                    'the mask ip【{0}】and return ip【{1}】is {2}'.format(
                        the_checked_ip, mask_ip, 'Not Anonymous'))
                return False
        except requests.exceptions.Timeout:
            print('the request timeout')
            self.proxyman.error('Timeout')
            return False
        except Exception as e:
            print('the request error')
            self.proxyman.error('Error: {0}'.format(str(e)))
            return False

    # Check whether the proxy IPs in the database are still usable and
    # flag the dead ones
    def verify_proxy_ip(self):
        """Check whether proxy IPs have expired.

        Verifies whether ALL proxy IPs in the database have expired; an
        expired proxy is updated to status=1, marking it for deletion.
        """
        # Fetch the full proxy IP list from the database
        result = self.SQLdb.db_select_all_for_verify()
        if result:
            for pro in result:
                pid = pro[0]
                aa_show = 'verify {0}-{1}:{2}'.format(pro[3], pro[1], pro[2])
                print(aa_show)
                self.proxyman.info(aa_show)
                p_ip = {pro[3]: 'http://{0}:{1}'.format(pro[1], pro[2])}
                res = self.check_proxy_ip(p_ip)
                if not res:
                    # This proxy IP is no longer usable
                    sign_show = ('proxy ip【{0}】can not be used, '
                                 'flagged for deletion').format(pro[1])
                    print(sign_show)
                    self.proxyman.info(sign_show)
                    # Mark it for deletion in the database
                    self.SQLdb.db_update_for_status(pid, 1)
                shelper.makeSleep(3, False)
        else:
            res_show = 'no proxy ips to verify were fetched from the database'
            print(res_show)
            self.proxyman.info(res_show)
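
# ---------------------------------------------------------------------------
# Example (sketch): a site-specific parser that could be passed to
# ProxySpider.get_proxy_ip() as `funcSite`. The field names
# (xip/xport/xaddr/xlevel/xprotocal) and the 'proxyshow' -> 'item' JSON
# layout are taken from get_proxy_ip_mimiip() above; other sites would need
# their own layout.
def parse_mimiip_page(page_json_data):
    obj = json.loads(page_json_data)
    proxy_list = []
    if obj.get('proxyshow'):
        items = obj['proxyshow']['item']
        # An xml-to-json conversion may yield a single dict instead of a
        # list when the page holds only one row (assumption)
        if isinstance(items, dict):
            items = [items]
        for ps in items:
            proxy_list.append({
                'xip': ps['xip'],
                'xport': ps['xport'],
                'xaddr': ps['xaddr'].replace('\n', ''),
                'xlevel': ps['xlevel'],
                'xprotocal': ps['xprotocal'].lower(),
            })
    return proxy_list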
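
# ---------------------------------------------------------------------------
# Usage sketch tying the pieces together. The xslt template filename
# ('mimiip.xml') is a placeholder; the URL template comes from the comment
# in get_proxy_ip_mimiip() above.
if __name__ == '__main__':
    spider = ProxySpider()
    # Crawl pages 1-2, verify each scraped proxy, and store the usable ones
    spider.get_proxy_ip(
        parse_mimiip_page,                  # site-specific JSON parser
        'http://www.mimiip.com/gngao/{0}',  # page URL template, {0} = page no.
        'mimiip.xml',                       # hypothetical xslt template path
        maxPageNo=2,
        urlType=1)
    # Re-check the stored proxies and flag dead ones for deletion
    spider.verify_proxy_ip()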