Example #1
0
 def __init__(self):
     """Bind an ``IPAddresss`` built from ``QQWRY_PATH`` to ``self.ips``."""
     lookup = IPAddresss(QQWRY_PATH)
     self.ips = lookup
Example #2
0
class Html_Parser(object):
    '''
    Turn fetched proxy-list pages into lists of proxy dicts.

    Every parser method takes the page text plus a ``parser`` config dict and
    returns a list of dicts with the keys ``ip``, ``port``, ``types``,
    ``protocol``, ``country``, ``area`` and ``speed``.
    '''

    def __init__(self):
        # Address lookup helper; this class only uses its str2ip() and
        # getIpAddr() methods.
        self.ips = IPAddresss(QQWRY_PATH)

    def parse(self, response, parser):
        '''
        Dispatch ``response`` to the parser selected by ``parser['type']``.

        :param response: page content to parse
        :param parser: config dict; ``type`` is ``'xpath'``, ``'regular'`` or
            ``'module'`` (the latter names a method through ``moduleName``)
        :return: list of proxy dicts, or None for an unknown ``type`` or an
            unknown ``moduleName``
        '''
        kind = parser['type']
        if kind == 'xpath':
            return self.XpathPraser(response, parser)
        if kind == 'regular':
            return self.RegularPraser(response, parser)
        if kind == 'module':
            handler = getattr(self, parser['moduleName'], None)
            # A misspelled moduleName used to end in ``None(...)`` and a
            # TypeError; treat it like any other unsupported parser.
            if not callable(handler):
                return None
            return handler(response, parser)
        return None

    def AuthCountry(self, addr):
        '''
        Tell whether ``addr`` mentions one of the ``CHINA_AREA`` entries.

        :param addr: address text to inspect
        :return: True if any entry of ``CHINA_AREA`` occurs in ``addr``
        '''
        return any(text_(area) in addr for area in CHINA_AREA)

    def _build_proxy(self, ip, port):
        '''Look up the address of ``ip`` and assemble one proxy dict.'''
        addr = self.ips.getIpAddr(self.ips.str2ip(ip))
        at_home = text_('省') in addr or self.AuthCountry(addr)
        return {
            'ip': ip,
            'port': port,
            # The type advertised by the sites is unreliable, so a default
            # is stored here; it is meant to be checked later.
            'types': text_('Transparent'),
            'protocol': text_("http"),
            'country': text_('home') if at_home else text_('abroad'),
            'area': addr,
            'speed': 100
        }

    def XpathPraser(self, response, parser):
        '''
        Extract proxies with XPath expressions.

        ``parser['pattern']`` selects the row nodes; ``parser['position']``
        holds the per-row XPath expressions for ``ip`` and ``port``.

        :param response: HTML text
        :param parser: config dict with ``pattern`` and ``position``
        :return: list of proxy dicts; rows that cannot be parsed are skipped
        '''
        proxylist = []
        root = etree.HTML(response)
        for row in root.xpath(parser['pattern']):
            try:
                ip = row.xpath(parser['position']['ip'])[0].text
                port = row.xpath(parser['position']['port'])[0].text
                # int() lives inside the guard so that one non-numeric port
                # skips that row instead of aborting the whole page.
                proxylist.append(self._build_proxy(ip, int(port)))
            except Exception:
                continue
        return proxylist

    def RegularPraser(self, response, parser):
        '''
        Extract proxies with a regular expression.

        ``parser['pattern']`` is the regex; ``parser['position']`` maps
        ``ip`` and ``port`` to indexes into each match.

        :param response: page text
        :param parser: config dict with ``pattern`` and ``position``
        :return: list of proxy dicts; ``port`` is left exactly as matched
            (CnproxyPraser relies on receiving the raw text)
        '''
        proxylist = []
        pattern = re.compile(parser['pattern'])
        for match in pattern.findall(response):
            try:
                ip = match[parser['position']['ip']]
                port = match[parser['position']['port']]
                proxylist.append(self._build_proxy(ip, port))
            except Exception:
                continue
        return proxylist

    def CnproxyPraser(self, response, parser):
        '''
        Run RegularPraser and decode the letter-encoded ports.

        Each port arrives as letters joined by ``+``; every letter maps to
        one digit through ``chardict``.

        :param response: page text
        :param parser: config dict forwarded to RegularPraser
        :return: list of proxy dicts with integer ports; entries whose port
            cannot be decoded are dropped
        '''
        chardict = {
            'v': '3',
            'm': '4',
            'a': '2',
            'l': '9',
            'q': '0',
            'b': '5',
            'i': '7',
            'w': '6',
            'r': '8',
            'c': '1'
        }

        decoded = []
        for proxy in self.RegularPraser(response, parser):
            try:
                digits = ''.join(
                    chardict[ch] for ch in proxy['port'] if ch != '+')
                proxy['port'] = int(digits)
            except (KeyError, ValueError):
                # Unknown letter or empty port: skip this entry rather than
                # losing every proxy on the page.
                continue
            decoded.append(proxy)
        return decoded

    def proxy_listPraser(self, response, parser):
        '''
        Extract proxies published as base64 text inside ``Proxy('...')``.

        :param response: page text
        :param parser: config dict with the regex under ``pattern``
        :return: list of proxy dicts (empty when nothing matches)
        '''
        proxylist = []
        pattern = re.compile(parser['pattern'])
        for match in pattern.findall(response):
            try:
                raw = base64.b64decode(
                    match.replace("Proxy('", "").replace("')", ""))
                # On Python 3 b64decode returns bytes, and splitting bytes
                # with a str separator raises TypeError, which made every
                # match get skipped. Decode to text first.
                if not isinstance(raw, str):
                    raw = raw.decode('utf-8')
                ip, port = raw.split(':')[:2]
                proxylist.append(self._build_proxy(ip, int(port)))
            except Exception:
                continue
        return proxylist
Example #3
0
class Html_Parser(object):
    '''
    Turn fetched proxy-list pages into lists of proxy dicts.

    Each proxy dict has the keys ``ip``, ``port``, ``type`` (0 = high
    anonymity, 1 = otherwise), ``protocol`` (1 when the protocol text
    mentions https, else 0), ``country``, ``area`` and ``speed``.
    '''

    def __init__(self):
        # Address lookup helper; this class only uses its str2ip() and
        # getIpAddr() methods.
        self.ips = IPAddresss(QQWRY_PATH)

    def parse(self, response, parser):
        '''
        Dispatch ``response`` to the parser selected by ``parser['type']``.

        :param response: page content to parse
        :param parser: config dict; ``type`` is ``'xpath'``, ``'regular'`` or
            ``'module'`` (the latter names a method through ``moduleName``)
        :return: list of proxy dicts, or None for an unknown ``type`` or an
            unknown ``moduleName``
        '''
        kind = parser['type']
        if kind == 'xpath':
            return self.XpathPraser(response, parser)
        if kind == 'regular':
            return self.RegularPraser(response, parser)
        if kind == 'module':
            handler = getattr(self, parser['moduleName'], None)
            # A misspelled moduleName used to end in ``None(...)`` and a
            # TypeError; treat it like any other unsupported parser.
            if not callable(handler):
                return None
            return handler(response, parser)
        return None

    def AuthCountry(self, addr):
        '''
        Tell whether ``addr`` mentions one of the ``CHINA_AREA`` entries.

        :param addr: address text to inspect
        :return: True if any entry of ``CHINA_AREA`` occurs in ``addr``
        '''
        return any(addr.find(area) != -1 for area in CHINA_AREA)

    def XpathPraser(self, response, parser):
        '''
        Extract proxies with XPath expressions.

        ``parser['pattern']`` selects the row nodes; ``parser['postion']``
        holds the per-row XPath expressions for ``ip``, ``port``, ``type``
        and ``protocol`` (an empty ``protocol`` expression means "not
        available").

        :param response: HTML text
        :param parser: config dict with ``pattern`` and ``postion``
        :return: list of proxy dicts; rows that cannot be parsed are skipped
        '''
        proxylist = []
        root = etree.HTML(response)
        for row in root.xpath(parser['pattern']):
            try:
                fields = parser['postion']
                ip = row.xpath(fields['ip'])[0].text
                # Converted inside the guard so that one non-numeric port
                # skips that row instead of aborting the whole page.
                port = int(row.xpath(fields['port'])[0].text)
                type_text = row.xpath(fields['type'])[0].text
                anonymity = 0 if type_text.find(u'高匿') != -1 else 1
                protocol = 0
                if len(fields['protocol']) > 0:
                    protocol_text = row.xpath(fields['protocol'])[0].text
                    if protocol_text.lower().find('https') != -1:
                        protocol = 1
                addr = self.ips.getIpAddr(self.ips.str2ip(ip))
                if addr.find(u'省') != -1 or self.AuthCountry(addr):
                    country, area = u'中国', addr
                else:
                    country, area = addr, ''
            except Exception:
                continue

            proxy = {
                'ip': ip,
                'port': port,
                'type': anonymity,
                'protocol': protocol,
                'country': country,
                'area': area,
                'speed': 100
            }
            logger.info("Fetch proxy %s", proxy)
            proxylist.append(proxy)

        return proxylist
Example #4
0
 def __init__(self):
     """Bind an ``IPAddresss`` built from ``QQWRY_PATH`` to ``self.ips``."""
     lookup = IPAddresss(QQWRY_PATH)
     self.ips = lookup
Example #5
0
class Html_Parser(object):
    '''
    Turn fetched proxy-list pages into lists of proxy dicts.

    Each proxy dict has the keys ``ip``, ``port``, ``type`` (0 = high
    anonymity, 1 = otherwise), ``protocol`` (1 when the protocol text
    mentions https, else 0), ``country``, ``area``, ``speed``,
    ``createtime`` and ``updatetime``.
    '''

    def __init__(self):
        # Address lookup helper; this class only uses its str2ip() and
        # getIpAddr() methods.
        self.ips = IPAddresss(QQWRY_PATH)

    def parse(self, response, parser):
        '''
        Extract proxies from ``response`` with XPath expressions.

        ``parser['pattern']`` selects the row nodes; ``parser['postion']``
        holds the per-row XPath expressions for ``ip``, ``port``, ``type``
        and ``protocol`` (an empty ``protocol`` expression means "not
        available").

        :param response: HTML text
        :param parser: config dict; only ``type == 'xpath'`` is handled
        :return: list of proxy dicts, or None when ``type`` is not
            ``'xpath'``; rows that fail to parse are printed and skipped
        '''
        if parser['type'] != 'xpath':
            # No other strategy is implemented in this class.
            return None

        proxylist = []
        root = etree.HTML(response)
        for row in root.xpath(parser['pattern']):
            try:
                fields = parser['postion']
                ip = row.xpath(fields['ip'])[0].text
                port = row.xpath(fields['port'])[0].text
                type_text = row.xpath(fields['type'])[0].text
                anonymity = 0 if type_text.find(u'高匿') != -1 else 1
                protocol = 0
                if len(fields['protocol']) > 0:
                    protocol_text = row.xpath(fields['protocol'])[0].text
                    if protocol_text.lower().find('https') != -1:
                        protocol = 1
                addr = self.ips.getIpAddr(self.ips.str2ip(ip))
                if addr.find(u'省') != -1 or self.AuthCountry(addr):
                    country, area = u'中国', addr
                else:
                    country, area = addr, ''
                # One timestamp for both fields so a new record starts with
                # createtime == updatetime.
                now = datetime.datetime.now()
                proxylist.append({
                    'ip': ip,
                    'port': int(port),
                    'type': anonymity,
                    'protocol': protocol,
                    'country': country,
                    'area': area,
                    'speed': 100,
                    'createtime': now,
                    'updatetime': now
                })
            except Exception as err:
                # Call form of print works on both Python 2 and Python 3.
                print(err)

        return proxylist

    def AuthCountry(self, addr):
        '''
        Tell whether ``addr`` mentions one of the ``CHINA_AREA`` entries.

        :param addr: address text to inspect
        :return: True if any entry of ``CHINA_AREA`` occurs in ``addr``
        '''
        return any(addr.find(area) != -1 for area in CHINA_AREA)
Example #6
0
class Html_Parser(object):
    """
    Turn fetched proxy-list pages into lists of proxy dicts.

    Every parser method takes the page text plus a ``parser`` config dict and
    returns a list of dicts with the keys ``ip``, ``port``, ``types``,
    ``protocol``, ``country``, ``area`` and ``speed``.
    """

    def __init__(self):
        # Address lookup helper; this class only uses its str2ip() and
        # getIpAddr() methods.
        self.ips = IPAddresss(QQWRY_PATH)

    def parse(self, response, parser):
        """
        Dispatch ``response`` to the parser selected by ``parser["type"]``.

        :param response: page content to parse
        :param parser: config dict; ``type`` is ``"xpath"``, ``"regular"`` or
            ``"module"`` (the latter names a method through ``moduleName``)
        :return: list of proxy dicts, or None for an unknown ``type`` or an
            unknown ``moduleName``
        """
        kind = parser["type"]
        if kind == "xpath":
            return self.XpathPraser(response, parser)
        if kind == "regular":
            return self.RegularPraser(response, parser)
        if kind == "module":
            handler = getattr(self, parser["moduleName"], None)
            # A misspelled moduleName used to end in ``None(...)`` and a
            # TypeError; treat it like any other unsupported parser.
            if not callable(handler):
                return None
            return handler(response, parser)
        return None

    def AuthCountry(self, addr):
        """
        Tell whether ``addr`` mentions one of the ``CHINA_AREA`` entries.

        :param addr: address text to inspect
        :return: True if any entry of ``CHINA_AREA`` occurs in ``addr``
        """
        return any(text_(area) in addr for area in CHINA_AREA)

    def _build_proxy(self, ip, port):
        """Look up the address of ``ip`` and assemble one proxy dict."""
        addr = self.ips.getIpAddr(self.ips.str2ip(ip))
        domestic = text_("省") in addr or self.AuthCountry(addr)
        return {
            "ip": ip,
            "port": port,
            # The type advertised by the sites is unreliable, so the default
            # 0 is stored here; it is meant to be checked later.
            "types": 0,
            "protocol": 0,
            "country": text_("国内") if domestic else text_("国外"),
            "area": addr,
            "speed": 100,
        }

    def XpathPraser(self, response, parser):
        """
        Extract proxies with XPath expressions.

        ``parser["pattern"]`` selects the row nodes; ``parser["position"]``
        holds the per-row XPath expressions for ``ip`` and ``port``.

        :param response: HTML text
        :param parser: config dict with ``pattern`` and ``position``
        :return: list of proxy dicts; rows that cannot be parsed are skipped
        """
        proxylist = []
        root = etree.HTML(response)
        for row in root.xpath(parser["pattern"]):
            try:
                ip = row.xpath(parser["position"]["ip"])[0].text
                port = row.xpath(parser["position"]["port"])[0].text
                # int() lives inside the guard so that one non-numeric port
                # skips that row instead of aborting the whole page.
                proxylist.append(self._build_proxy(ip, int(port)))
            except Exception:
                continue
        return proxylist

    def RegularPraser(self, response, parser):
        """
        Extract proxies with a regular expression.

        ``parser["pattern"]`` is the regex; ``parser["position"]`` maps
        ``ip`` and ``port`` to indexes into each match.

        :param response: page text
        :param parser: config dict with ``pattern`` and ``position``
        :return: list of proxy dicts; ``port`` is left exactly as matched
            (CnproxyPraser relies on receiving the raw text)
        """
        proxylist = []
        pattern = re.compile(parser["pattern"])
        for match in pattern.findall(response):
            try:
                ip = match[parser["position"]["ip"]]
                port = match[parser["position"]["port"]]
                proxylist.append(self._build_proxy(ip, port))
            except Exception:
                continue
        return proxylist

    def CnproxyPraser(self, response, parser):
        """
        Run RegularPraser and decode the letter-encoded ports.

        Each port arrives as letters joined by ``+``; every letter maps to
        one digit through ``chardict``.

        :param response: page text
        :param parser: config dict forwarded to RegularPraser
        :return: list of proxy dicts with integer ports; entries whose port
            cannot be decoded are dropped
        """
        chardict = {
            "v": "3",
            "m": "4",
            "a": "2",
            "l": "9",
            "q": "0",
            "b": "5",
            "i": "7",
            "w": "6",
            "r": "8",
            "c": "1",
        }

        decoded = []
        for proxy in self.RegularPraser(response, parser):
            try:
                digits = "".join(
                    chardict[ch] for ch in proxy["port"] if ch != "+"
                )
                proxy["port"] = int(digits)
            except (KeyError, ValueError):
                # Unknown letter or empty port: skip this entry rather than
                # losing every proxy on the page.
                continue
            decoded.append(proxy)
        return decoded

    def proxy_listPraser(self, response, parser):
        """
        Extract proxies published as base64 text inside ``Proxy('...')``.

        :param response: page text
        :param parser: config dict with the regex under ``pattern``
        :return: list of proxy dicts (empty when nothing matches)
        """
        proxylist = []
        pattern = re.compile(parser["pattern"])
        for match in pattern.findall(response):
            try:
                raw = base64.b64decode(
                    match.replace("Proxy('", "").replace("')", "")
                )
                # On Python 3 b64decode returns bytes, and splitting bytes
                # with a str separator raises TypeError, which made every
                # match get skipped. Decode to text first.
                if not isinstance(raw, str):
                    raw = raw.decode("utf-8")
                ip, port = raw.split(":")[:2]
                proxylist.append(self._build_proxy(ip, int(port)))
            except Exception:
                continue
        return proxylist
Example #7
0
class Html_Parser(object):
    '''
    Turn fetched proxy-list pages into lists of proxy dicts.

    Every parser method takes the page text plus a ``parser`` config dict and
    returns a list of dicts with the keys ``ip``, ``port``, ``types``,
    ``protocol``, ``country``, ``area`` and ``speed``.
    '''

    def __init__(self):
        # Address lookup helper; this class only uses its str2ip() and
        # getIpAddr() methods.
        self.ips = IPAddresss(QQWRY_PATH)

    def parse(self, response, parser):
        '''
        Dispatch ``response`` to the parser selected by ``parser['type']``.

        :param response: page content to parse
        :param parser: config dict; ``type`` is ``'xpath'``, ``'regular'`` or
            ``'module'`` (the latter names a method through ``moduleName``)
        :return: list of proxy dicts, or None for an unknown ``type`` or an
            unknown ``moduleName``
        '''
        strategy = parser['type']
        if strategy == 'xpath':
            return self.XpathPraser(response, parser)
        if strategy == 'regular':
            return self.RegularPraser(response, parser)
        if strategy == 'module':
            method = getattr(self, parser['moduleName'], None)
            # A misspelled moduleName used to end in ``None(...)`` and a
            # TypeError; treat it like any other unsupported parser.
            if not callable(method):
                return None
            return method(response, parser)
        return None

    def AuthCountry(self, addr):
        '''
        Tell whether ``addr`` mentions one of the ``CHINA_AREA`` entries.

        :param addr: address text to inspect
        :return: True if any entry of ``CHINA_AREA`` occurs in ``addr``
        '''
        return any(text_(area) in addr for area in CHINA_AREA)

    def _describe(self, ip, port):
        '''Look up the address of ``ip`` and assemble one proxy dict.'''
        addr = self.ips.getIpAddr(self.ips.str2ip(ip))
        if text_('省') in addr or self.AuthCountry(addr):
            country = text_('国内')
        else:
            country = text_('国外')
        # 'types' and 'protocol' are stored as the default 0: the values
        # advertised by the sites are unreliable and are checked later.
        return {'ip': ip, 'port': port, 'types': 0, 'protocol': 0,
                'country': country, 'area': addr, 'speed': 100}

    def XpathPraser(self, response, parser):
        '''
        Extract proxies with XPath expressions.

        ``parser['pattern']`` selects the row nodes; ``parser['position']``
        holds the per-row XPath expressions for ``ip`` and ``port``.

        :param response: HTML text
        :param parser: config dict with ``pattern`` and ``position``
        :return: list of proxy dicts; rows that cannot be parsed are skipped
        '''
        proxylist = []
        root = etree.HTML(response)
        for node in root.xpath(parser['pattern']):
            try:
                ip = node.xpath(parser['position']['ip'])[0].text
                port = node.xpath(parser['position']['port'])[0].text
                # int() lives inside the guard so that one non-numeric port
                # skips that row instead of aborting the whole page.
                entry = self._describe(ip, int(port))
            except Exception:
                continue
            proxylist.append(entry)
        return proxylist

    def RegularPraser(self, response, parser):
        '''
        Extract proxies with a regular expression.

        ``parser['pattern']`` is the regex; ``parser['position']`` maps
        ``ip`` and ``port`` to indexes into each match.

        :param response: page text
        :param parser: config dict with ``pattern`` and ``position``
        :return: list of proxy dicts; ``port`` is left exactly as matched
            (CnproxyPraser relies on receiving the raw text)
        '''
        proxylist = []
        pattern = re.compile(parser['pattern'])
        for match in pattern.findall(response):
            try:
                ip = match[parser['position']['ip']]
                port = match[parser['position']['port']]
                entry = self._describe(ip, port)
            except Exception:
                continue
            proxylist.append(entry)
        return proxylist

    def CnproxyPraser(self, response, parser):
        '''
        Run RegularPraser and decode the letter-encoded ports.

        Each port arrives as letters joined by ``+``; every letter maps to
        one digit through ``chardict``.

        :param response: page text
        :param parser: config dict forwarded to RegularPraser
        :return: list of proxy dicts with integer ports; entries whose port
            cannot be decoded are dropped
        '''
        chardict = {'v': '3', 'm': '4', 'a': '2', 'l': '9', 'q': '0',
                    'b': '5', 'i': '7', 'w': '6', 'r': '8', 'c': '1'}

        usable = []
        for proxy in self.RegularPraser(response, parser):
            try:
                digits = ''.join(
                    chardict[ch] for ch in proxy['port'] if ch != '+')
                proxy['port'] = int(digits)
            except (KeyError, ValueError):
                # Unknown letter or empty port: skip this entry rather than
                # losing every proxy on the page.
                continue
            usable.append(proxy)
        return usable

    def proxy_listPraser(self, response, parser):
        '''
        Extract proxies published as base64 text inside ``Proxy('...')``.

        :param response: page text
        :param parser: config dict with the regex under ``pattern``
        :return: list of proxy dicts (empty when nothing matches)
        '''
        proxylist = []
        pattern = re.compile(parser['pattern'])
        for match in pattern.findall(response):
            try:
                raw = base64.b64decode(
                    match.replace("Proxy('", "").replace("')", ""))
                # On Python 3 b64decode returns bytes, and splitting bytes
                # with a str separator raises TypeError, which made every
                # match get skipped. Decode to text first.
                if not isinstance(raw, str):
                    raw = raw.decode('utf-8')
                ip, port = raw.split(':')[:2]
                entry = self._describe(ip, int(port))
            except Exception:
                continue
            proxylist.append(entry)
        return proxylist
Example #8
0
class Html_Parser(object):
    '''
    Turn fetched proxy-list pages into lists of proxy dicts.

    Each proxy dict has the keys ``ip``, ``port``, ``type`` (0 = high
    anonymity, 1 = otherwise), ``protocol`` (1 when the protocol text
    mentions https, else 0), ``country``, ``area`` and ``speed``.
    '''

    def __init__(self):
        # Address lookup helper; this class only uses its str2ip() and
        # getIpAddr() methods.
        self.ips = IPAddresss(QQWRY_PATH)

    def parse(self, response, parser):
        '''
        Dispatch ``response`` to the parser selected by ``parser['type']``.

        :param response: page content to parse
        :param parser: config dict; ``type`` is ``'xpath'``, ``'regular'`` or
            ``'module'`` (the latter names a method through ``moduleName``)
        :return: list of proxy dicts, or None for an unknown ``type`` or an
            unknown ``moduleName``
        '''
        kind = parser['type']
        if kind == 'xpath':
            return self.XpathPraser(response, parser)
        if kind == 'regular':
            return self.RegularPraser(response, parser)
        if kind == 'module':
            handler = getattr(self, parser['moduleName'], None)
            # A misspelled moduleName used to end in ``None(...)`` and a
            # TypeError; treat it like any other unsupported parser.
            if not callable(handler):
                return None
            return handler(response, parser)
        return None

    def AuthCountry(self, addr):
        '''
        Tell whether ``addr`` mentions one of the ``CHINA_AREA`` entries.

        :param addr: address text to inspect
        :return: True if any entry of ``CHINA_AREA`` occurs in ``addr``
        '''
        return any(addr.find(area) != -1 for area in CHINA_AREA)

    def _locate(self, ip):
        '''Return the ``(country, area)`` pair stored for ``ip``.'''
        addr = self.ips.getIpAddr(self.ips.str2ip(ip))
        if addr.find(u'省') != -1 or self.AuthCountry(addr):
            return u'中国', addr
        return addr, ''

    def XpathPraser(self, response, parser):
        '''
        Extract proxies with XPath expressions.

        ``parser['pattern']`` selects the row nodes; ``parser['postion']``
        holds the per-row XPath expressions for ``ip``, ``port``, ``type``
        and ``protocol`` (an empty ``protocol`` expression means "not
        available").

        :param response: HTML text
        :param parser: config dict with ``pattern`` and ``postion``
        :return: list of proxy dicts; rows with a missing node, empty text
            or non-numeric port are logged and skipped
        :raises KeyError: if ``parser`` lacks a required key
        '''
        proxylist = []
        root = etree.HTML(response)
        for row in root.xpath(parser['pattern']):
            fields = parser['postion']
            try:
                ip = row.xpath(fields['ip'])[0].text
                port = int(row.xpath(fields['port'])[0].text)
                type_text = row.xpath(fields['type'])[0].text
                anonymity = 0 if type_text.find(u'高匿') != -1 else 1
                protocol = 0
                if len(fields['protocol']) > 0:
                    protocol_text = row.xpath(fields['protocol'])[0].text
                    if protocol_text.lower().find('https') != -1:
                        protocol = 1
                country, area = self._locate(ip)
            except (IndexError, AttributeError, TypeError, ValueError) as err:
                # Only row-level problems are caught here, so one odd row
                # (e.g. a cell without the expected node) no longer aborts
                # the page. Config errors such as KeyError still propagate.
                logger.warning("Skip malformed proxy row: %s", err)
                continue

            proxy = {
                'ip': ip,
                'port': port,
                'type': anonymity,
                'protocol': protocol,
                'country': country,
                'area': area,
                'speed': 100
            }
            logger.info("Fetch proxy %s", proxy)
            proxylist.append(proxy)

        return proxylist

    def RegularPraser(self, response, parser):
        '''
        Extract proxies with a regular expression.

        ``parser['pattern']`` is the regex; ``parser['postion']`` maps
        ``ip``, ``port`` and ``protocol`` to indexes into each match. A
        ``protocol`` index that is not greater than 0 means "not available".

        :param response: page text
        :param parser: config dict with ``pattern`` and ``postion``
        :return: list of proxy dicts; ``port`` is left exactly as matched
            (CnproxyPraser relies on receiving the raw text)
        '''
        proxylist = []
        pattern = re.compile(parser['pattern'])
        for match in pattern.findall(response):
            logging.info(str(match))
            fields = parser['postion']
            ip = match[fields['ip']]
            port = match[fields['port']]
            protocol = 0
            if fields['protocol'] > 0:
                if match[fields['protocol']].lower().find('https') != -1:
                    protocol = 1
            country, area = self._locate(ip)
            proxy = {
                'ip': ip,
                'port': port,
                # The type advertised by the sites is unreliable, so the
                # default 0 is stored here; it is meant to be checked later.
                'type': 0,
                'protocol': protocol,
                'country': country,
                'area': area,
                'speed': 100
            }
            logger.info("Fetch proxy %s", proxy)
            proxylist.append(proxy)
        return proxylist

    def CnproxyPraser(self, response, parser):
        '''
        Run RegularPraser and decode the letter-encoded ports.

        Each port arrives as letters joined by ``+``; every letter maps to
        one digit through ``chardict``.

        :param response: page text
        :param parser: config dict forwarded to RegularPraser
        :return: list of proxy dicts with integer ports; entries whose port
            cannot be decoded are dropped
        '''
        chardict = {
            'v': '3',
            'm': '4',
            'a': '2',
            'l': '9',
            'q': '0',
            'b': '5',
            'i': '7',
            'w': '6',
            'r': '8',
            'c': '1'
        }

        decoded = []
        for proxy in self.RegularPraser(response, parser):
            try:
                digits = ''.join(
                    chardict[ch] for ch in proxy['port'] if ch != '+')
                proxy['port'] = int(digits)
            except (KeyError, ValueError):
                # Unknown letter or empty port: skip this entry rather than
                # losing every proxy on the page.
                continue
            decoded.append(proxy)
        return decoded
Example #9
0
class Html_Parser(object):
    """HTML parser that turns fetched proxy-list pages into proxy records.

    Every ``*Praser`` method returns a list of dicts with the keys ``ip``,
    ``port``, ``t_way``, ``protocol``, ``country``, ``t_service``, ``addr``,
    ``attr`` and ``score``.  Rows that cannot be decoded are skipped, so the
    result is always a (possibly empty) list.
    """

    # Letter-to-digit substitution used to decode ports in CnproxyPraser.
    _CNPROXY_DIGITS = {
        'v': '3',
        'm': '4',
        'a': '2',
        'l': '9',
        'q': '0',
        'b': '5',
        'i': '7',
        'w': '6',
        'r': '8',
        'c': '1',
    }

    def __init__(self):
        # Lookup object used to map an IP to an address string.
        self.ips = IPAddresss(QQWRY_PATH)

    def parse(self, response, parser):
        """Dispatch to the parsing strategy named by ``parser['type']``.

        :param response: page text to parse
        :param parser: configuration dict; ``type`` is ``'xpath'``,
            ``'regular'`` or ``'module'`` (the latter names a method of this
            class in ``moduleName``)
        :return: list of proxy dicts, or None for an unknown type or an
            unknown ``moduleName``
        """
        kind = parser['type']
        if kind == 'xpath':
            return self.XpathPraser(response, parser)
        if kind == 'regular':
            return self.RegularPraser(response, parser)
        if kind == 'module':
            handler = getattr(self, parser['moduleName'], None)
            # A missing handler used to be called as None(...) and crash.
            if not callable(handler):
                return None
            return handler(response, parser)
        return None

    def AuthCountry(self, addr):
        """Return True if ``addr`` contains any entry of ``CHINA_AREA``."""
        return any(text_(area) in addr for area in CHINA_AREA)

    def addrcut(self, addr):
        """Split an address string into a country label and a short address.

        Domestic input: province + city + service, or city + service.
        Foreign input: region + service.

        :param addr: address string returned by the IP lookup
        :return: ``(country, addr)`` tuple
        """
        province = text_('省')
        if province in addr or self.AuthCountry(addr):
            country = text_('国内')
            # Keep what precedes the city marker, then drop the province.
            addr = addr.split(text_('市'))[0]
            if province in addr:
                addr = addr.split(province)[1]
            else:
                addr = addr[:2]
        else:
            country = text_('国外')
            # Strip the trailing two characters (taken as t_service elsewhere).
            addr = addr[:-2]
        return country, addr

    def _build_proxy(self, ip, port):
        """Build one proxy record; raises if the IP or port is unusable."""
        location = self.ips.getIpAddr(self.ips.str2ip(ip))
        country, addr = self.addrcut(location)
        return {
            'ip': ip,
            'port': int(port),
            't_way': 0,
            'protocol': 0,
            'country': country,
            # Last two characters of the raw address; presumably the
            # service/ISP name -- confirm against the lookup data.
            't_service': location[-2:],
            'addr': addr,
            'attr': 0,
            'score': 0,
        }

    def _regex_proxies(self, response, parser, decode_port):
        """Extract records with a regex; ``decode_port`` converts the raw port."""
        proxylist = []
        for match in re.compile(parser['pattern']).findall(response):
            try:
                ip = match[parser['position']['ip']]
                port = decode_port(match[parser['position']['port']])
                proxylist.append(self._build_proxy(ip, port))
            except Exception:
                # Best effort: skip rows that cannot be decoded or located.
                continue
        return proxylist

    @classmethod
    def _decode_cnproxy_port(cls, port):
        """Decode a letter-obfuscated port such as ``'+r+q+r+q'`` to an int."""
        table = cls._CNPROXY_DIGITS
        return int(''.join(table[ch] for ch in port if ch != '+'))

    def XpathPraser(self, response, parser):
        """Parse ``response`` with the XPath expressions in ``parser``.

        :param response: HTML text
        :param parser: dict with ``pattern`` (row XPath) and ``position``
            (``ip`` / ``port`` XPaths relative to a row)
        :return: list of proxy dicts; malformed rows are skipped
        """
        proxylist = []
        root = etree.HTML(response)
        for row in root.xpath(parser['pattern']):
            try:
                ip = row.xpath(parser['position']['ip'])[0].text
                port = row.xpath(parser['position']['port'])[0].text
                # int(port) now happens inside the try, so a non-numeric
                # port skips the row instead of aborting the whole page.
                proxylist.append(self._build_proxy(ip, port))
            except Exception:
                continue
        return proxylist

    def RegularPraser(self, response, parser):
        """Parse ``response`` with the regular expression in ``parser``.

        :param response: page text
        :param parser: dict with ``pattern`` (regex) and ``position``
            (indexes of ``ip`` / ``port`` within each match)
        :return: list of proxy dicts; malformed rows are skipped
        """
        return self._regex_proxies(response, parser, int)

    def CnproxyPraser(self, response, parser):
        """Regex parsing for pages whose ports are letter-obfuscated.

        The raw port is decoded before it is converted to an int; going
        through ``RegularPraser`` first would fail on the letters.

        :param response: page text
        :param parser: same configuration as ``RegularPraser``
        :return: list of proxy dicts with integer ports
        """
        return self._regex_proxies(response, parser, self._decode_cnproxy_port)

    def proxy_listPraser(self, response, parser):
        """Parse pages that embed base64 ``ip:port`` inside ``Proxy('...')``.

        :param response: page text
        :param parser: dict with ``pattern`` matching each ``Proxy('...')``
        :return: list of proxy dicts (empty when nothing matches)
        """
        proxylist = []
        for match in re.compile(parser['pattern']).findall(response):
            try:
                encoded = match.replace("Proxy('", "").replace("')", "")
                # b64decode returns bytes; decode before splitting on ':'.
                fields = base64.b64decode(encoded).decode('utf-8').split(':')
                proxylist.append(self._build_proxy(fields[0], fields[1]))
            except Exception:
                continue
        return proxylist
Example #10
0
class Html_Parser(object):
    """Turn fetched proxy-list pages into lists of proxy records.

    Each record is a dict with the keys ``ip``, ``port``, ``country`` and
    ``area``.  Rows that cannot be decoded are skipped.
    """

    # Letter-to-digit substitution used to decode ports in CnproxyPraser.
    _PORT_LETTERS = {
        'v': '3',
        'm': '4',
        'a': '2',
        'l': '9',
        'q': '0',
        'b': '5',
        'i': '7',
        'w': '6',
        'r': '8',
        'c': '1',
    }

    def __init__(self):
        # Lookup object used to map an IP to an address string.
        self.ips = IPAddresss(QQWRY_PATH)

    def parse(self, response, parser):
        """Dispatch to the parsing strategy named by ``parser['type']``.

        :param response: the response text
        :param parser: configuration dict; ``type`` selects the strategy
            (``'xpath'``, ``'regular'`` or ``'module'``, the latter naming a
            method of this class in ``moduleName``)
        :return: list of proxy dicts, or None for an unknown type or an
            unknown ``moduleName``
        """
        kind = parser['type']
        if kind == 'xpath':
            return self.XpathPraser(response, parser)
        if kind == 'regular':
            return self.RegularPraser(response, parser)
        if kind == 'module':
            handler = getattr(self, parser['moduleName'], None)
            # A missing handler used to be called as None(...) and crash.
            if not callable(handler):
                return None
            return handler(response, parser)
        return None

    @staticmethod
    def auth_country(addr):
        """Return True if ``addr`` contains any entry of ``CHINA_AREA``.

        :param addr: address string
        :return: bool
        """
        return any(text_(area) in addr for area in CHINA_AREA)

    def parse_ip_to_addr(self, ip):
        """Look up ``ip`` and classify its address as domestic or foreign.

        :param ip: IP address string
        :return: ``(country, area)`` where ``area`` is the looked-up address
        """
        addr = self.ips.getIpAddr(self.ips.str2ip(ip))
        # text_() keeps the comparison unicode-safe, as in auth_country.
        domestic = text_('省') in addr or self.auth_country(addr)
        country = text_('国内') if domestic else text_('国外')
        return country, addr

    def XpathPraser(self, response, parser):
        """Parse ``response`` with the XPath expressions in ``parser``.

        :param response: HTML text
        :param parser: dict with ``pattern`` (row XPath) and ``position``
            (``ip`` / ``port`` XPaths relative to a row)
        :return: list of proxy dicts with integer ports
        """
        proxy_list = []
        root = etree.HTML(response)
        for row in root.xpath(parser['pattern']):
            try:
                ip = row.xpath(parser['position']['ip'])[0].text
                # Converted inside the try so a non-numeric port skips the
                # row instead of aborting the whole page.
                port = int(row.xpath(parser['position']['port'])[0].text)
                country, area = self.parse_ip_to_addr(ip)
            except Exception:
                continue
            proxy_list.append({
                'ip': ip,
                'port': port,
                'country': country,
                'area': area,
            })
        return proxy_list

    def RegularPraser(self, response, parser):
        """Parse ``response`` with the regular expression in ``parser``.

        The port is returned exactly as matched (not converted to int);
        ``CnproxyPraser`` relies on receiving the raw string.

        :param response: page text
        :param parser: dict with ``pattern`` (regex) and ``position``
            (indexes of ``ip`` / ``port`` within each match)
        :return: list of proxy dicts
        """
        proxy_list = []
        for match in re.compile(parser['pattern']).findall(response):
            try:
                ip = match[parser['position']['ip']]
                port = match[parser['position']['port']]
                country, area = self.parse_ip_to_addr(ip)
            except Exception:
                continue
            proxy_list.append({
                'ip': ip,
                'port': port,
                'country': country,
                'area': area,
            })
        return proxy_list

    def CnproxyPraser(self, response, parser):
        """Run ``RegularPraser`` and decode the letter-obfuscated ports.

        Every letter of a port stands for one digit; ``+`` is ignored.

        :param response: page text
        :param parser: same configuration as ``RegularPraser``
        :return: the ``RegularPraser`` list with integer ports
        """
        proxy_list = self.RegularPraser(response, parser)
        table = self._PORT_LETTERS
        for proxy in proxy_list:
            digits = [table[ch] for ch in proxy['port'] if ch != '+']
            proxy['port'] = int(''.join(digits))
        return proxy_list

    def proxy_listPraser(self, response, parser):
        """Parse pages that embed base64 ``ip:port`` inside ``Proxy('...')``.

        :param response: page text
        :param parser: dict with ``pattern`` matching each ``Proxy('...')``
        :return: list of proxy dicts (empty when nothing matches)
        """
        proxy_list = []
        for match in re.compile(parser['pattern']).findall(response):
            try:
                encoded = match.replace("Proxy('", "").replace("')", "")
                # b64decode returns bytes; decode before splitting on ':'.
                fields = base64.b64decode(encoded).decode('utf-8').split(':')
                ip = fields[0]
                port = int(fields[1])
                country, area = self.parse_ip_to_addr(ip)
            except Exception:
                continue
            proxy_list.append({
                'ip': ip,
                'port': port,
                'country': country,
                'area': area,
            })
        return proxy_list
Example #11
0
class Html_Parser(object):
    """Parse fetched proxy-list pages into proxy records (xpath only)."""

    def __init__(self):
        # Lookup object used to map an IP to an address string.
        self.ips = IPAddresss(QQWRY_PATH)

    def parse(self, response, parser):
        """Extract proxy records from ``response`` using XPath.

        Record fields: ip, port, type (0 high anonymity, 1 transparent),
        protocol (0 http, 1 https), country, area (province/city) and a
        fixed ``speed`` of 100.  Each record is also printed.

        :param response: the response (HTML text)
        :param parser: configuration dict; ``type`` is the parsing method,
            ``pattern`` the row XPath and ``postion`` the per-field XPaths
        :return: list of proxy dicts, or None when ``type`` is not "xpath"
        """
        if parser["type"] != "xpath":
            # Only the xpath strategy is implemented in this class.
            return None

        proxylist = []
        root = etree.HTML(response)
        for row in root.xpath(parser["pattern"]):
            position = parser["postion"]
            ip = row.xpath(position["ip"])[0].text
            port = row.xpath(position["port"])[0].text
            anonymity = row.xpath(position["type"])[0].text
            # 0 = high anonymity, 1 = anything else (transparent).
            proxy_type = 0 if anonymity.find(u"高匿") != -1 else 1

            # 0 = http, 1 = https; an empty protocol XPath means http.
            protocol = 0
            if len(position["protocol"]) > 0:
                scheme = row.xpath(position["protocol"])[0].text
                if scheme.lower().find("https") != -1:
                    protocol = 1

            addr = self.ips.getIpAddr(self.ips.str2ip(ip))
            if addr.find(u"省") != -1 or self.AuthCountry(addr):
                country, area = u"中国", addr
            else:
                country, area = addr, ""

            proxy = {
                "ip": ip,
                "port": int(port),
                "type": proxy_type,
                "protocol": protocol,
                "country": country,
                "area": area,
                "speed": 100,
            }
            # Call form works on both Python 2 and Python 3.
            print(proxy)
            proxylist.append(proxy)

        return proxylist

    def AuthCountry(self, addr):
        """Return True if ``addr`` contains any entry of ``CHINA_AREA``.

        :param addr: address string
        :return: bool
        """
        return any(addr.find(area) != -1 for area in CHINA_AREA)
Example #12
0
class Html_Parser(object):
    """Turn fetched proxy-list pages into lists of proxy records.

    Each record is a dict with the keys ``ip``, ``port``, ``types``,
    ``protocol``, ``country``, ``area`` and ``speed``.  Rows that cannot be
    decoded are skipped.
    """

    def __init__(self):
        # Lookup object used to map an IP to an address string.
        self.ips = IPAddresss(QQWRY_PATH)

    def parse(self, response, parser):
        """Dispatch to the parsing strategy named by ``parser['type']``.

        :param response: page text to parse
        :param parser: configuration dict; ``type`` is ``'xpath'``,
            ``'regular'`` or ``'module'`` (the latter names a method of this
            class in ``moduleName``)
        :return: list of proxy dicts, or None for an unknown type or an
            unknown ``moduleName``
        """
        kind = parser['type']
        if kind == 'xpath':
            return self.XpathPraser(response, parser)
        if kind == 'regular':
            return self.RegularPraser(response, parser)
        if kind == 'module':
            handler = getattr(self, parser['moduleName'], None)
            # A missing handler used to be called as None(...) and crash.
            if not callable(handler):
                return None
            return handler(response, parser)
        return None

    def AuthCountry(self, addr):
        """Return True if ``addr`` contains any entry of ``CHINA_AREA``.

        The previous version returned False right after testing the first
        entry; every entry is now checked.
        """
        return any(text_(area) in addr for area in CHINA_AREA)

    def _locate(self, ip):
        """Look up ``ip`` and return ``(country, area)`` for its address."""
        addr = self.ips.getIpAddr(self.ips.str2ip(ip))
        if text_('省') in addr or self.AuthCountry(addr):
            return text_('国内'), addr
        return text_('国外'), addr

    @staticmethod
    def _record(ip, port, country, area):
        """Assemble one proxy dict with the fixed default fields."""
        return {
            'ip': ip,
            'port': port,
            'types': 0,
            'protocol': 0,
            'country': country,
            'area': area,
            'speed': 100,
        }

    def XpathPraser(self, response, parser):
        """Parse ``response`` with the XPath expressions in ``parser``.

        :param response: HTML text
        :param parser: dict with ``pattern`` (row XPath) and ``position``
            (``ip`` / ``port`` XPaths relative to a row)
        :return: list of proxy dicts with integer ports
        """
        proxylist = []
        root = etree.HTML(response)
        for row in root.xpath(parser['pattern']):
            try:
                ip = row.xpath(parser['position']['ip'])[0].text
                # Converted inside the try so a non-numeric port skips the
                # row instead of aborting the whole page.
                port = int(row.xpath(parser['position']['port'])[0].text)
                country, area = self._locate(ip)
                # Debug output kept from the original implementation.
                print(area)
            except Exception:
                continue
            proxylist.append(self._record(ip, port, country, area))
        return proxylist

    def RegularPraser(self, response, parser):
        """Parse ``response`` with the regular expression in ``parser``.

        The port is returned exactly as matched (not converted to int);
        ``CnproxyPraser`` relies on receiving the raw string.

        :param response: page text
        :param parser: dict with ``pattern`` (regex) and ``position``
            (indexes of ``ip`` / ``port`` within each match)
        :return: list with one proxy dict per usable match
        """
        proxylist = []
        for match in re.compile(parser['pattern']).findall(response):
            try:
                ip = match[parser['position']['ip']]
                port = match[parser['position']['port']]
                country, area = self._locate(ip)
            except Exception:
                continue
            # Append per match; this used to sit outside the loop, keeping
            # only the last row and failing when there were no matches.
            proxylist.append(self._record(ip, port, country, area))
        return proxylist

    def CnproxyPraser(self, response, parser):
        """Run ``RegularPraser`` and decode the letter-obfuscated ports.

        Every letter of a port stands for one digit; ``+`` is ignored.

        :param response: page text
        :param parser: same configuration as ``RegularPraser``
        :return: the ``RegularPraser`` list with integer ports
        """
        letter_to_digit = {
            'v': '3',
            'm': '4',
            'a': '2',
            'l': '9',
            'q': '0',
            'b': '5',
            'i': '7',
            'w': '6',
            'r': '8',
            'c': '1',
        }
        proxylist = self.RegularPraser(response, parser)
        for proxy in proxylist:
            digits = [letter_to_digit[ch] for ch in proxy['port'] if ch != '+']
            proxy['port'] = int(''.join(digits))
        return proxylist

    def proxy_listPraser(self, response, parser):
        """Parse pages that embed base64 ``ip:port`` inside ``Proxy('...')``.

        :param response: page text
        :param parser: dict with ``pattern`` matching each ``Proxy('...')``
        :return: list of proxy dicts (empty when nothing matches)
        """
        proxylist = []
        for match in re.compile(parser['pattern']).findall(response):
            try:
                encoded = match.replace("Proxy('", "").replace("')", "")
                # b64decode returns bytes; decode before splitting on ':'.
                fields = base64.b64decode(encoded).decode('utf-8').split(':')
                ip = fields[0]
                port = int(fields[1])
                country, area = self._locate(ip)
            except Exception:
                continue
            proxylist.append(self._record(ip, port, country, area))
        return proxylist