Example #1
    def crawl(self, *args, **kwargs):
        theader = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
        }

        html = requests.get(self.url, timeout=5, headers=theader).text
        # Each match is a relative link ('./...') to a per-proxy detail page.
        for detail_url in self.pattern1.findall(html):
            # Drop the leading './' before joining with the host.
            detail_html = requests.get(self.host + detail_url[2:],
                                       timeout=10,
                                       headers=theader).text
            if detail_html:
                ips = self.pattern2.findall(detail_html)
                if len(ips) == 0:
                    # Fall back to the alternative page layout.
                    ips = self.pattern4.findall(detail_html)
                for match in ips:
                    ip, port = match[0], match[1]
                    # Some pages obfuscate the port; decode it if needed.
                    if not str(port).isdigit() or len(port) > 5:
                        port = self._get_port(port)
                    if port:
                        proxy = ProxyItem(self.site, ip, port)
                        self.write(proxy)
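
Example #1 references several attributes that the surrounding class must define (`self.url`, `self.host`, `self.site`, and the compiled patterns). A minimal sketch of that context, with purely illustrative names and regexes that are not taken from the original project:

    import re
    import requests

    class DetailPageCrawler:
        site = 'example-site'                   # hypothetical site label
        host = 'http://proxy.example.com/'
        url = 'http://proxy.example.com/list'
        # Relative links ('./...') to the per-proxy detail pages.
        pattern1 = re.compile(r'href="(\./detail/\d+\.html)"')
        # Primary layout: (ip, port) in adjacent table cells.
        pattern2 = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>\s*<td>(\d+)</td>')
        # Fallback layout tried when pattern2 matches nothing.
        pattern4 = re.compile(r'(\d+\.\d+\.\d+\.\d+):(\d+)')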
Example #2
    def crawl(self, *args, **kwargs):
        theader = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
        }

        # Dict used as a set to deduplicate the detail-page links.
        url_map = {}
        html = requests.get(self.base_url, timeout=5, headers=theader).text
        for link in self.base_pattern.findall(html):
            url_map[link] = 1

        for path in url_map:
            proxy_url = self.base_url + path
            proxy_html = requests.get(proxy_url, timeout=5,
                                      headers=theader).text
            for match in self.proxy_pattern.findall(proxy_html):
                print(match)
                ip, port = match[0], match[1]
                proxy = ProxyItem(self.site, ip, port)
                self.write(proxy)
Example #3
    def crawl(self, *args, **kwargs):
        # Walk pages 1..19; the page number is appended to the base URL.
        for page in range(1, 20):
            html = requests.get(self.url + str(page), timeout=5).text
            for match in self.pattern1.findall(html):
                ip, port = match[0], match[1]
                proxy = ProxyItem(self.site, ip, port)
                self.write(proxy)
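
Example #3 builds each page URL by appending the page number to `self.url`, and `self.pattern1` must capture `(ip, port)` tuples for the `match[0]`/`match[1]` indexing to work. A hypothetical configuration, assuming a plain HTML table layout:

    import re

    url = 'http://proxy.example.com/free/page/'  # page number is appended
    pattern1 = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>\s*<td>(\d+)</td>')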
Example #4
    def parse_body(body):
        '''Parse the URL body.'''
        def _get_ip_type(ip_type):
            '''Resolve the ip_type.'''
            # return 'http' if ip_type == 'HTTP' else 'https'
            return 'http'  # always treat proxies as 'http'

        _ = []  # collected ProxyItem objects
        parser_obj = parser_list[random_parser_list_item_index]
        try:
            part_selector = parser_obj.get('part', '')
            assert part_selector != '', 'Got an empty part selector!'
            position = parser_obj.get('position', {})
            assert position != {}, 'Got an empty position dict!'
            ip_selector = position.get('ip', '')
            assert ip_selector != '', 'Got an empty ip_selector!'
            port_selector = position.get('port', '')
            assert port_selector != '', 'Got an empty port_selector!'
            ip_type_selector = position.get('ip_type', '')
            assert ip_type_selector != '', 'Got an empty ip_type_selector!'
        except AssertionError:
            return []

        for tr in Selector(text=body).css(part_selector).extract():
            o = ProxyItem()
            try:
                ip = Selector(text=tr).css(
                    '{} ::text'.format(ip_selector)).extract_first()
                if re.compile(r'\d+').findall(ip) == []:  # skip rows that hold no IP
                    continue
                assert ip != '', 'ip is empty!'
                port = Selector(text=tr).css(
                    '{} ::text'.format(port_selector)).extract_first()
                assert port != '', 'port is empty!'
                ip_type = Selector(text=tr).css(
                    '{} ::text'.format(ip_type_selector)).extract_first()
                assert ip_type != '', 'ip_type is empty!'
                ip_type = _get_ip_type(ip_type)
            except (AssertionError, Exception):
                lg.error('Encountered an error:', exc_info=True)
                continue
            o['ip'] = ip
            try:
                o['port'] = int(port)
            except Exception:
                lg.error('Failed to convert port to int! Skipping!')
                continue
            o['ip_type'] = ip_type
            o['anonymity'] = 1
            o['score'] = 100
            o['last_check_time'] = str(get_shanghai_time())
            # lg.info('[+] {}:{}'.format(ip, port))
            _.append(o)

        return _
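
`parse_body` reads its selectors from a randomly chosen `parser_list` entry; the real list lives elsewhere in the project. A hypothetical entry with the keys the function expects ('part' plus a 'position' dict), using CSS selector strings as the `Selector(...).css(...)` calls require:

    # Illustrative only; selectors depend on the target site's markup.
    parser_list = [{
        'part': 'table tr',                # one fragment per proxy row
        'position': {
            'ip': 'td:nth-child(1)',       # cell holding the IP address
            'port': 'td:nth-child(2)',     # cell holding the port
            'ip_type': 'td:nth-child(3)',  # cell holding HTTP/HTTPS
        },
    }]
    random_parser_list_item_index = 0      # would normally be randomized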
Example #5
    def crawl(self, *args, **kwargs):
        theader = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
        }

        html = requests.get(self.url, timeout=5, headers=theader).text
        # Each match is an (ip, port) tuple.
        for match in self.pattern1.findall(html):
            ip, port = match[0], match[1]
            proxy = ProxyItem(self.site, ip, port)
            self.write(proxy)
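
The crawl-style examples (#1, #2, #3, #5) construct `ProxyItem` positionally with `(site, ip, port)`, unlike the `parse_body` examples, which assign fields by key, so they evidently use a different item class. A plausible shape, reconstructed only from these call sites:

    class ProxyItem(object):
        '''Illustrative guess; the real class lives in the host project.'''
        def __init__(self, site, ip, port):
            self.site = site
            self.ip = ip
            self.port = port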
Example #6
    async def _parse_ori_proxy_list_data(self, **kwargs) -> list:
        """
        Parse the raw proxy_list data.
        :return:
        """
        all_proxies = []
        data = kwargs.get('data', {})
        area = kwargs.get('area', '')
        id = kwargs.get('id')

        try:
            # Pick the parsing rule that matches this area/id combination.
            this_rule = await self._dynamic_get_new_dict_rule(data=data,
                                                              area=area,
                                                              id=id)
            proxy_list = await self._get_ori_proxy_list(
                parser=this_rule['proxy_list'], target_obj=data)
        except Exception as e:
            print(e)
            return all_proxies

        for item in proxy_list:
            try:
                this_rule = await self._dynamic_get_new_dict_rule(data=item,
                                                                  area=area,
                                                                  id=id)
                ip = await self._get_ip(parser=this_rule['ip'],
                                        target_obj=item)
                port = await self._get_port(parser=this_rule['port'],
                                            target_obj=item)
            except Exception as e:
                print(e)
                continue
            proxy_item = ProxyItem()
            proxy_item['ip'] = ip
            proxy_item['port'] = port
            proxy_item['agency_agreement'] = 'https'
            proxy_item['score'] = self.score
            proxy_item['check_time'] = get_shanghai_time()
            all_proxies.append(dict(proxy_item))

        return all_proxies
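
`_parse_ori_proxy_list_data` is a coroutine, so it has to be awaited from an event loop. A minimal, hypothetical driver (the spider class, payload, and argument values are all assumptions):

    import asyncio

    async def main():
        spider = SomeProxySpider()   # hypothetical concrete spider class
        raw = {}                     # placeholder for the fetched JSON payload
        proxies = await spider._parse_ori_proxy_list_data(data=raw,
                                                          area='',
                                                          id=0)
        print(proxies)

    asyncio.run(main())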
Example #7
    def parse_body(body):
        '''Parse the URL body.'''
        def _get_ip(**kwargs) -> str:
            tr = kwargs['tr']
            ip_selector = kwargs['ip_selector']

            ip = parse_field(parser=ip_selector, target_obj=tr)
            assert ip != '', 'ip is empty!'
            # Strip inline <script> blocks some sites inject around the IP.
            ip = re.compile(r'<script .*?</script>').sub('', ip)
            if re.compile(r'\d+').findall(ip) == []:  # not an IP address
                raise NotIpException

            lg.info(str(ip))
            ip = re.compile(r'\d+\.\d+\.\d+\.\d+').findall(ip)[0]
            assert ip != '', 'ip is empty!'

            return ip

        def _get_port(**kwargs) -> str:
            tr = kwargs['tr']
            port_selector = kwargs['port_selector']

            port = parse_field(parser=port_selector, target_obj=tr)
            assert port != '', 'port is empty!'

            return port

        def _get_ip_type(**kwargs) -> str:
            '''Resolve the ip_type.'''
            tr = kwargs['tr']
            ip_type_selector = kwargs['ip_type_selector']

            ip_type = parse_field(parser=ip_type_selector, target_obj=tr)
            # May be empty.
            # assert ip_type != '', 'ip_type is empty!'
            # return 'http' if ip_type == 'HTTP' else 'https'

            return 'http'  # always treat proxies as 'http'

        _ = []  # collected proxy dicts
        parser_obj = parser_list[random_parser_list_item_index]
        try:
            part_selector = parser_obj.get('part', {})
            assert part_selector != {}, 'Got an empty part selector!'
            position = parser_obj.get('position', {})
            assert position != {}, 'Got an empty position dict!'
            ip_selector = position.get('ip', {})
            assert ip_selector != {}, 'Got an empty ip_selector dict!'
            port_selector = position.get('port', {})
            assert port_selector != {}, 'Got an empty port_selector dict!'
            # May be None.
            ip_type_selector = position.get('ip_type', None)
            # assert ip_type_selector is not None, 'Got a None ip_type_selector!'
        except AssertionError:
            return []

        for tr in parse_field(parser=part_selector,
                              target_obj=body,
                              is_first=False):
            try:
                ip = _get_ip(tr=tr, ip_selector=ip_selector)
                port = _get_port(tr=tr, port_selector=port_selector)
                ip_type = _get_ip_type(tr=tr,
                                       ip_type_selector=ip_type_selector)
            except NotIpException:
                continue
            except IndexError:
                lg.error('Index error while extracting the ip! Skipping!')
                continue
            except (AssertionError, Exception):
                lg.error('Encountered an error:', exc_info=True)
                continue

            o = ProxyItem()
            o['ip'] = ip
            try:
                o['port'] = int(port)
            except Exception:
                lg.error('Failed to convert port to int! Skipping!')
                continue
            o['ip_type'] = ip_type
            o['anonymity'] = 1
            o['score'] = 100
            o['last_check_time'] = str(get_shanghai_time())
            # o['country'] = ''
            # o['city'] = ''
            # lg.info('[+] {}:{}'.format(ip, port))
            _.append(dict(o))

        return _
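
Examples #4 and #7 fill `ProxyItem` by key, which matches a scrapy-style `Item`. A hedged reconstruction declaring exactly the fields those two examples assign (plus the two that appear only in commented-out code):

    import scrapy

    class ProxyItem(scrapy.Item):
        '''Illustrative guess based on the keys assigned above.'''
        ip = scrapy.Field()
        port = scrapy.Field()
        ip_type = scrapy.Field()
        anonymity = scrapy.Field()
        score = scrapy.Field()
        last_check_time = scrapy.Field()
        country = scrapy.Field()  # referenced only in commented-out code
        city = scrapy.Field()     # referenced only in commented-out code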