def _get_66_ip_list():
    '''
    Fetch a batch of high-anonymity proxy IPs from www.66ip.cn.

    Side effect: stores the parsed list in the module-level global ``a_66_ip``.

    :return: list of 'ip:port' strings; [] when the page could not be parsed
    '''
    global a_66_ip
    headers = {
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Referer': 'http://www.66ip.cn/nm.html',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    params = (
        ('getnum', ''),
        ('isp', '0'),
        ('anonymoustype', '3'),   # 3 => high anonymity
        ('start', ''),
        ('ports', ''),
        ('export', ''),
        ('ipaddress', ''),
        ('area', '0'),
        ('proxytype', '2'),
        ('api', '66ip'),
    )
    response = requests.get(
        'http://www.66ip.cn/nmtq.php',
        headers=headers,
        params=params,
        cookies=None)
    # The site serves GBK-encoded HTML.
    body = Requests._wash_html(response.content.decode('gbk'))
    try:
        part = re.compile(r'</script>(.*)</div>').findall(body)[0]
    except IndexError:
        part = ''
    # Strip embedded <script> tags and trailing wrapper markup.
    part = re.compile(r'<script>.*?</script>|</div>.*</div>').sub('', part)
    ip_list = delete_list_null_str(part.split('<br />'))
    # fix: `ip_list if ip_list != [] else []` was a no-op ternary — when
    # ip_list == [] both branches yield [], so it always equals ip_list.
    a_66_ip = ip_list
    return ip_list
def _get_66_ip_list():
    '''
    Fetch high-anonymity proxy IPs from www.66ip.cn (session-based variant).

    NOTE(review): this redefines the earlier `_get_66_ip_list` in this file —
    only this definition is visible to callers after module import; confirm
    the duplicate above is intentional dead code.

    Side effect: stores the parsed list in the module-level global
    ``ori_ip_list``.

    :return: list of 'ip:port' strings; [] on request failure or empty page
    '''
    global ori_ip_list
    params = (
        ('getnum', ''),
        ('isp', '0'),
        ('anonymoustype', '3'),   # 3 => high anonymity
        ('start', ''),
        ('ports', ''),
        ('export', ''),
        ('ipaddress', ''),
        ('area', '0'),
        ('proxytype', '2'),
        ('api', '66ip'),
    )
    with session() as s:
        try:
            response = s.get(
                'http://www.66ip.cn/nmtq.php',
                headers=_get_base_headers(),
                params=params,
                cookies=None)
        except Exception:
            # Best-effort fetch: any network error yields an empty result.
            return []
        # The site serves GBK-encoded HTML.
        body = Requests._wash_html(response.content.decode('gbk'))
    try:
        part = re.compile(r'</script>(.*)</div>').findall(body)[0]
    except IndexError:
        part = ''
    # Strip embedded <script> tags and trailing wrapper markup.
    part = re.compile(r'<script>.*?</script>|</div>.*</div>').sub('', part)
    ip_list = delete_list_null_str(part.split('<br />'))
    # fix: removed the no-op `if ip_list != [] else []` ternary.
    ori_ip_list = ip_list
    return ip_list
def _get_proxy(self, random_parser_list_item_index, proxy_url) -> list:
    '''
    spiders: fetch one proxy-listing page and parse out high-anonymity
    proxies.

    :param random_parser_list_item_index: index into the global parser_list
        selecting the site-specific selectors and charset.
    :param proxy_url: listing-page URL to crawl.
    :return: list of proxy dicts (ip/port/ip_type/anonymity/score/
        last_check_time); [] on any fetch or parse failure.
    '''
    def parse_body(body):
        '''Parse the washed page body into a list of proxy dicts.'''
        def _get_ip(**kwargs) -> str:
            '''Extract and validate an IPv4 address from one table row.'''
            tr = kwargs['tr']
            ip_selector = kwargs['ip_selector']
            ip = parse_field(parser=ip_selector, target_obj=tr)
            assert ip != '', 'ip为空值!'
            # Some sites obfuscate the ip with inline scripts — strip them.
            # fix: regex patterns are now raw strings (`'\d'` in a plain
            # string is a DeprecationWarning/SyntaxWarning on modern Python).
            ip = re.compile(r'<script .*?</script>').sub('', ip)
            if re.compile(r'\d+').findall(ip) == []:
                # No digits at all => not an ip address.
                raise NotIpException
            lg.info(str(ip))
            ip = re.compile(r'\d+\.\d+\.\d+\.\d+').findall(ip)[0]
            assert ip != '', 'ip为空值!'
            return ip

        def _get_port(**kwargs) -> str:
            '''Extract the port string from one table row.'''
            tr = kwargs['tr']
            port_selector = kwargs['port_selector']
            port = parse_field(parser=port_selector, target_obj=tr)
            assert port != '', 'port为空值!'
            return port

        def _get_ip_type(**kwargs) -> str:
            '''Return the proxy scheme; deliberately always 'http' for now.'''
            tr = kwargs['tr']
            ip_type_selector = kwargs['ip_type_selector']
            # Parsed but unused — kept for parity with the selector config;
            # the original commented-out logic mapped 'HTTP' -> http/https.
            ip_type = parse_field(parser=ip_type_selector, target_obj=tr)
            return 'http'

        proxy_list = []
        parser_obj = parser_list[random_parser_list_item_index]
        try:
            part_selector = parser_obj.get('part', {})
            assert part_selector != {}, '获取到part为空值!'
            position = parser_obj.get('position', {})
            assert position != {}, '获取到position为空dict!'
            ip_selector = position.get('ip', {})
            assert ip_selector != {}, '获取到ip_selector为空dict!'
            port_selector = position.get('port', {})
            assert port_selector != {}, '获取到port_selector为空dict!'
            # ip_type selector may legitimately be absent (None allowed).
            ip_type_selector = position.get('ip_type', None)
        except AssertionError:
            return []

        for tr in parse_field(parser=part_selector, target_obj=body, is_first=False):
            try:
                ip = _get_ip(tr=tr, ip_selector=ip_selector)
                port = _get_port(tr=tr, port_selector=port_selector)
                ip_type = _get_ip_type(tr=tr, ip_type_selector=ip_type_selector)
            except NotIpException:
                continue
            except IndexError:
                lg.error('获取ip时索引异常!跳过!')
                continue
            except Exception:
                # fix: was `except (AssertionError, Exception)` — Exception
                # already covers AssertionError, so the tuple was redundant.
                lg.error('遇到错误:', exc_info=True)
                continue
            o = ProxyItem()
            o['ip'] = ip
            try:
                o['port'] = int(port)
            except Exception:
                lg.error('int转换port时出错!跳过!')
                continue
            o['ip_type'] = ip_type
            o['anonymity'] = 1
            o['score'] = 100
            o['last_check_time'] = str(get_shanghai_time())
            # o['country'] = ''
            # o['city'] = ''
            proxy_list.append(dict(o))
        return proxy_list

    # Fetch through a randomly chosen already-collected proxy; falls back to
    # the local ip on the very first crawl (when no proxies exist yet).
    try:
        encoding = parser_list[random_parser_list_item_index].get('charset')
        proxies = _get_proxies()
        with session() as s:
            response = s.get(
                url=proxy_url,
                headers=_get_base_headers(),
                params=None,
                cookies=None,
                proxies=proxies,
                timeout=CHECK_PROXY_TIMEOUT)
        try:
            body = response.content.decode(encoding)
        except UnicodeDecodeError:
            # Declared charset was wrong — let requests guess instead.
            body = response.text
        body = Requests._wash_html(body)
    except (ConnectTimeout, ProxyError, ReadTimeout, ConnectionError, TooManyRedirects) as e:
        lg.error('遇到错误: {}'.format(e.args[0]))
        return []
    except Exception:
        lg.error('遇到错误:', exc_info=True)
        return []

    res = parse_body(body)
    if res == []:
        lg.error('html页面解析错误!跳过!')
    return res