def crawl(self, *args, **kwargs):
    theader = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
    }
    html = requests.get(self.url, timeout=5, headers=theader).text
    ll = self.pattern1.findall(html)
    for l in ll:
        detail_url = l
        detail_html = requests.get(self.host + detail_url[2:], timeout=10, headers=theader).text
        if detail_html:
            ips = self.pattern2.findall(detail_html)
            if len(ips) == 0:
                ips = self.pattern4.findall(detail_html)
            for proxy in ips:
                ip = proxy[0]
                port = proxy[1]
                if not str(port).isdigit() or len(port) > 5:
                    port = self._get_port(port)
                if port:
                    proxy = ProxyItem(self.site, ip, port)
                    self.write(proxy)
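# `self._get_port` is called above but not shown in this section. A minimal,
# hypothetical sketch of what it might do -- recover a numeric port from an
# obfuscated table cell (e.g. digits wrapped in decoy markup) -- assuming the
# real helper has the same one-argument shape; the actual implementation may
# differ.
import re

_PORT_DIGITS = re.compile(r'\d+')

def _get_port(self, raw_port):
    # Keep only digit runs, then return the first plausible TCP port.
    for digits in _PORT_DIGITS.findall(str(raw_port)):
        if len(digits) <= 5 and 0 < int(digits) <= 65535:
            return digits
    return None  # no usable port found; the caller skips this row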
def crawl(self, *args, **kwargs):
    theader = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
    }
    # Deduplicate the detail-page links found on the index page.
    url_map = {}
    html = requests.get(self.base_url, timeout=5, headers=theader).text
    ll = self.base_pattern.findall(html)
    for l in ll:
        url_map[l] = 1
    for k, v in url_map.items():
        proxy_url = self.base_url + k
        proxy_html = requests.get(proxy_url, timeout=5, headers=theader).text
        matchers = self.proxy_pattern.findall(proxy_html)
        for proxy in matchers:
            print(proxy)
            ip = proxy[0]
            port = proxy[1]
            proxy = ProxyItem(self.site, ip, port)
            self.write(proxy)
def crawl(self, *args, **kwargs):
    for page in range(1, 20):
        html = requests.get(self.url + str(page), timeout=5).text
        ll = self.pattern1.findall(html)
        for l in ll:
            ip = l[0]
            port = l[1]
            proxy = ProxyItem(self.site, ip, port)
            self.write(proxy)
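# `self.pattern1` is defined elsewhere; the loops above only assume it yields
# (ip, port) tuples. Assuming the target pages list proxies in a plain HTML
# table, a pattern of roughly this shape would satisfy that contract (the
# markup details are illustrative, not taken from the source):
import re

pattern1 = re.compile(
    r'<td>\s*(\d{1,3}(?:\.\d{1,3}){3})\s*</td>\s*'  # group 1: the IP address
    r'<td>\s*(\d{1,5})\s*</td>'                     # group 2: the port
)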
def parse_body(body):
    '''Parse the url body.'''
    def _get_ip_type(ip_type):
        '''Resolve the ip_type.'''
        # return 'http' if ip_type == 'HTTP' else 'https'
        return 'http'  # always return 'http'

    _ = []
    parser_obj = parser_list[random_parser_list_item_index]
    try:
        part_selector = parser_obj.get('part', '')
        assert part_selector != '', 'Got an empty part!'
        position = parser_obj.get('position', {})
        assert position != {}, 'Got an empty position dict!'
        ip_selector = position.get('ip', '')
        assert ip_selector != '', 'Got an empty ip_selector!'
        port_selector = position.get('port', '')
        assert port_selector != '', 'Got an empty port_selector!'
        ip_type_selector = position.get('ip_type', '')
        assert ip_type_selector != '', 'Got an empty ip_type_selector!'
    except AssertionError:
        return []

    for tr in Selector(text=body).css(part_selector).extract():
        o = ProxyItem()
        try:
            ip = Selector(text=tr).css(
                '{} ::text'.format(ip_selector)).extract_first()
            if re.compile(r'\d+').findall(ip) == []:
                # Skip rows that do not contain an IP address.
                continue
            assert ip != '', 'ip is empty!'
            port = Selector(text=tr).css(
                '{} ::text'.format(port_selector)).extract_first()
            assert port != '', 'port is empty!'
            ip_type = Selector(text=tr).css(
                '{} ::text'.format(ip_type_selector)).extract_first()
            assert ip_type != '', 'ip_type is empty!'
            ip_type = _get_ip_type(ip_type)
        except (AssertionError, Exception):
            lg.error('Encountered an error:', exc_info=True)
            continue
        o['ip'] = ip
        try:
            o['port'] = int(port)
        except Exception:
            lg.error('Error converting port to int! Skipping!')
            continue
        o['ip_type'] = ip_type
        o['anonymity'] = 1
        o['score'] = 100
        o['last_check_time'] = str(get_shanghai_time())
        # lg.info('[+] {}:{}'.format(ip, port))
        _.append(o)

    return _
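# `parser_list` is defined elsewhere. From the .get() calls above, each entry
# is a dict of CSS selectors shaped roughly like this; the selector values
# here are illustrative placeholders, not taken from the source:
parser_list = [
    {
        'part': 'table tr',                  # selects one row per proxy
        'position': {
            'ip': 'td:nth-child(1)',         # cell holding the IP
            'port': 'td:nth-child(2)',       # cell holding the port
            'ip_type': 'td:nth-child(4)',    # cell holding http/https
        },
    },
    # ... one entry per target site
]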
def crawl(self, *args, **kwargs):
    theader = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
    }
    html = requests.get(self.url, timeout=5, headers=theader).text
    ll = self.pattern1.findall(html)
    for l in ll:
        ip = l[0]
        port = l[1]
        proxy = ProxyItem(self.site, ip, port)
        self.write(proxy)
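# The crawl methods above construct ProxyItem(self.site, ip, port) and hand
# the item to self.write(); neither is shown in this section. A minimal
# sketch of the assumed contract -- the field names and the queue/DB sink are
# inferred from the call sites, not from the source:
class ProxyItem:
    def __init__(self, site, ip, port):
        self.site = site  # which crawler found the proxy
        self.ip = ip
        self.port = port

# def write(self, proxy):
#     ...  # e.g. push the item into a validation queue or database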
async def _parse_ori_proxy_list_data(self, **kwargs) -> list:
    """
    Parse the raw proxy_list data.
    :return:
    """
    all_proxies = []
    data = kwargs.get('data', {})
    area = kwargs.get('area', '')
    id = kwargs.get('id')
    try:
        this_rule = await self._dynamic_get_new_dict_rule(data=data, area=area, id=id)
        proxy_list = await self._get_ori_proxy_list(
            parser=this_rule['proxy_list'],
            target_obj=data)
    except Exception as e:
        print(e)
        return all_proxies

    for item in proxy_list:
        try:
            this_rule = await self._dynamic_get_new_dict_rule(data=item, area=area, id=id)
            ip = await self._get_ip(parser=this_rule['ip'], target_obj=item)
            port = await self._get_port(parser=this_rule['port'], target_obj=item)
        except Exception as e:
            print(e)
            continue
        proxy_item = ProxyItem()
        proxy_item['ip'] = ip
        proxy_item['port'] = port
        proxy_item['agency_agreement'] = 'https'
        proxy_item['score'] = self.score
        proxy_item['check_time'] = get_shanghai_time()
        all_proxies.append(dict(proxy_item))

    return all_proxies
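# `_dynamic_get_new_dict_rule` is defined elsewhere; the subscripts above
# only imply that it returns a dict with 'proxy_list', 'ip', and 'port'
# parser specs. A sketch of that assumed shape -- the inner spec format is
# illustrative, not taken from the source:
this_rule = {
    'proxy_list': {'parser': 'json_path', 'value': 'data'},  # where the list lives
    'ip': {'parser': 'json_path', 'value': 'ip'},            # per-item ip field
    'port': {'parser': 'json_path', 'value': 'port'},        # per-item port field
}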
def parse_body(body):
    '''Parse the url body.'''
    def _get_ip(**kwargs) -> str:
        tr = kwargs['tr']
        ip_selector = kwargs['ip_selector']
        ip = parse_field(parser=ip_selector, target_obj=tr)
        assert ip != '', 'ip is empty!'
        # Strip any inline <script> blocks before matching.
        ip = re.compile(r'<script .*?</script>').sub('', ip)
        if re.compile(r'\d+').findall(ip) == []:
            # Not an IP address.
            raise NotIpException
        lg.info(str(ip))
        ip = re.compile(r'\d+\.\d+\.\d+\.\d+').findall(ip)[0]
        assert ip != '', 'ip is empty!'
        return ip

    def _get_port(**kwargs) -> str:
        tr = kwargs['tr']
        port_selector = kwargs['port_selector']
        port = parse_field(parser=port_selector, target_obj=tr)
        assert port != '', 'port is empty!'
        return port

    def _get_ip_type(**kwargs) -> str:
        '''Resolve the ip_type.'''
        tr = kwargs['tr']
        ip_type_selector = kwargs['ip_type_selector']
        ip_type = parse_field(parser=ip_type_selector, target_obj=tr)
        # May be empty
        # assert ip_type != '', 'ip_type is empty!'
        # return 'http' if ip_type == 'HTTP' else 'https'
        return 'http'  # always return 'http'

    _ = []
    parser_obj = parser_list[random_parser_list_item_index]
    try:
        part_selector = parser_obj.get('part', {})
        assert part_selector != {}, 'Got an empty part!'
        position = parser_obj.get('position', {})
        assert position != {}, 'Got an empty position dict!'
        ip_selector = position.get('ip', {})
        assert ip_selector != {}, 'Got an empty ip_selector dict!'
        port_selector = position.get('port', {})
        assert port_selector != {}, 'Got an empty port_selector dict!'
        # May be None
        ip_type_selector = position.get('ip_type', None)
        # assert ip_type_selector is not None, 'Got a None ip_type_selector!'
    except AssertionError:
        return []

    for tr in parse_field(parser=part_selector, target_obj=body, is_first=False):
        try:
            ip = _get_ip(tr=tr, ip_selector=ip_selector)
            port = _get_port(tr=tr, port_selector=port_selector)
            ip_type = _get_ip_type(tr=tr, ip_type_selector=ip_type_selector)
        except NotIpException:
            continue
        except IndexError:
            lg.error('Index error while extracting ip! Skipping!')
            continue
        except (AssertionError, Exception):
            lg.error('Encountered an error:', exc_info=True)
            continue

        o = ProxyItem()
        o['ip'] = ip
        try:
            o['port'] = int(port)
        except Exception:
            lg.error('Error converting port to int! Skipping!')
            continue
        o['ip_type'] = ip_type
        o['anonymity'] = 1
        o['score'] = 100
        o['last_check_time'] = str(get_shanghai_time())
        # o['country'] = ''
        # o['city'] = ''
        # lg.info('[+] {}:{}'.format(ip, port))
        _.append(dict(o))

    return _
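# `parse_field` is imported from elsewhere. From the call sites above it takes
# a selector spec, a target string, and an `is_first` switch that toggles
# between the first match and all matches. A hypothetical CSS-only sketch,
# assuming the spec is a dict like {'selector': 'td.ip ::text'} (the real
# helper likely also supports regex or json rules):
from scrapy.selector import Selector

def parse_field(parser, target_obj, is_first=True):
    sel = Selector(text=target_obj).css(parser['selector'])
    if is_first:
        # Return '' rather than None so callers can assert on emptiness.
        return sel.extract_first() or ''
    return sel.extract()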