Ejemplo n.º 1
0
    def XpathPraser(self, response, parser):
        '''
        Extract proxy entries from an HTML page using XPath expressions.

        :param response: HTML text of the page to parse
        :param parser: dict with a ``'pattern'`` XPath selecting one node per
            proxy and a ``'position'`` dict whose ``'ip'`` and ``'port'``
            XPaths are evaluated relative to each selected node
        :return: list of proxy dicts with the keys ``ip``, ``port``, ``types``,
            ``protocol``, ``country``, ``area`` and ``speed``
        '''
        proxylist = []
        root = etree.HTML(response)
        for node in root.xpath(parser['pattern']):
            try:
                ip = node.xpath(parser['position']['ip'])[0].text
                # Convert inside the try so a malformed port only skips this
                # entry instead of aborting the whole page.
                port = int(node.xpath(parser['position']['port'])[0].text)
                addr = self.ips.getIpAddr(self.ips.str2ip(ip))
                if text_('省') in addr or self.AuthCountry(addr):
                    country = text_('国内')
                else:
                    country = text_('国外')
            except Exception:
                # Best effort: a row that cannot be parsed or located is skipped.
                continue
            # types: 0 = high anonymity, 1 = transparent; protocol: 0 = http.
            # Both are left at their defaults here.
            proxylist.append({'ip': ip, 'port': port, 'types': 0, 'protocol': 0,
                              'country': country, 'area': addr, 'speed': 100})
        return proxylist
Ejemplo n.º 2
0
 def proxy_listPraser(self, response, parser):
     """
     Parse proxies that a page embeds as base64-encoded ``Proxy('...')`` tokens.

     :param response: page text to scan
     :param parser: dict whose ``'pattern'`` entry is the regex used to find
         the tokens; each match has ``Proxy('`` and ``')`` stripped and the
         remainder is base64-decoded and split on ``:`` into ip and port
     :return: list of proxy dicts; empty when nothing matches or parses
     """
     proxylist = []
     pattern = re.compile(parser['pattern'])
     for match in pattern.findall(response):
         try:
             ip_port = base64.b64decode(match.replace("Proxy('", "").replace("')", ""))
             # b64decode yields bytes on Python 3, and splitting bytes on a
             # str separator raises TypeError; decode to text first.
             if isinstance(ip_port, bytes):
                 ip_port = ip_port.decode('utf-8')
             fields = ip_port.split(':')
             ip = fields[0]
             # Converted inside the try so a malformed port only skips this entry.
             port = int(fields[1])
             addr = self.ips.getIpAddr(self.ips.str2ip(ip))
             if text_('省') in addr or self.AuthCountry(addr):
                 country = text_('国内')
             else:
                 country = text_('国外')
         except Exception:
             # Best effort: skip tokens that cannot be decoded or located.
             continue
         proxylist.append({'ip': ip, 'port': port, 'types': 0, 'protocol': 0,
                           'country': country, 'area': addr, 'speed': 100})
     # Always return a list, even when the page contains no matches.
     return proxylist
Ejemplo n.º 3
0
	def getAddr(self,offset,ip=0):
		"""
		Read the country and area strings for the record at ``offset``.

		The byte at ``offset + 4`` selects how the record is laid out:
		1 means the country is reached through a 3-byte offset (and may
		itself be redirected once more when the byte there is 2), 2 means
		only the country is redirected and the area follows at
		``offset + 8``, anything else means the strings are stored inline.

		:param offset: position of the record inside ``self.ipdb``
		:param ip: not used by this method
		:return: country text, a single space, then area text
		"""
		self.ipdb.seek(offset + 4)
		country = text_("")
		area = text_("")
		flag = struct.unpack('B', self.ipdb.read(1))[0]

		if flag == 1:
			redirect = self.getLong3()
			self.ipdb.seek(redirect)
			inner = struct.unpack('B', self.ipdb.read(1))[0]
			if inner != 2:
				country = self.getString(redirect)
			else:
				country = self.getString(self.getLong3())
				# Reposition just past the nested redirect before reading the area.
				self.ipdb.seek(redirect + 4)
			area = self.getAreaAddr()
		elif flag == 2:
			country = self.getString(self.getLong3())
			area = self.getAreaAddr(offset + 8)
		else:
			country = self.getString(offset + 4)
			area = self.getAreaAddr()

		return country + text_(" ") + area
Ejemplo n.º 4
0
	def getIpAddr(self,ip):
		"""
		Find the address text for the numeric ``ip`` by bisecting the index.

		:param ip: integer IP value compared against the index range bounds
		:return: address text, or the '未找到该IP的地址' message when ``ip``
			falls outside the selected range
		"""
		low, high = 0, self.indexCount - 1
		while high - low > 1:
			mid = (low + high) // 2
			self.setIpRange(mid)
			if ip < self.curStartIp:
				high = mid
				continue
			low = mid
			if ip == self.curStartIp:
				break

		self.setIpRange(low)

		# For 255.255.255.x the range at the upper bound is used instead.
		if (ip & 0xffffff00) == 0xffffff00:
			self.setIpRange(high)

		if not (self.curStartIp <= ip <= self.curEndIp):
			return text_('未找到该IP的地址')
		return text_(self.getAddr(self.curEndIpOffset))
Ejemplo n.º 5
0
 def proxy_listPraser(self, response, parser):
     """
     Collect proxies published as base64-encoded ``Proxy('...')`` tokens.

     :param response: text of the page to scan
     :param parser: dict; ``parser['pattern']`` is the regex locating tokens.
         ``Proxy('`` and ``')`` are removed from each match and the rest is
         base64-decoded to an ``ip:port`` string
     :return: list of proxy dicts (an empty list when no token is usable)
     """
     proxylist = []
     for token in re.compile(parser['pattern']).findall(response):
         try:
             raw = base64.b64decode(token.replace("Proxy('", "").replace("')", ""))
             # On Python 3 b64decode returns bytes; split(':') with a str
             # separator would raise TypeError, so convert to text first.
             if isinstance(raw, bytes):
                 raw = raw.decode('utf-8')
             pieces = raw.split(':')
             ip = pieces[0]
             # int() runs inside the try so a bad port skips only this token.
             port = int(pieces[1])
             addr = self.ips.getIpAddr(self.ips.str2ip(ip))
             domestic = text_('省') in addr or self.AuthCountry(addr)
             country = text_('国内') if domestic else text_('国外')
         except Exception:
             # Best effort: undecodable or unlocatable tokens are ignored.
             continue
         proxylist.append({'ip': ip, 'port': port, 'types': 0, 'protocol': 0,
                           'country': country, 'area': addr, 'speed': 100})
     # A list is returned even when the pattern matched nothing.
     return proxylist
Ejemplo n.º 6
0
    def XpathPraser(self, response, parser):
        '''
        Parse a proxy listing page with XPath.

        :param response: HTML text of the page
        :param parser: dict holding the row XPath under ``'pattern'`` and the
            per-row ``'ip'`` / ``'port'`` XPaths under ``'position'``
        :return: list of dicts with keys ``ip``, ``port``, ``types``,
            ``protocol``, ``country``, ``area``, ``speed``
        '''
        proxylist = []
        root = etree.HTML(response)
        rows = root.xpath(parser['pattern'])
        for row in rows:
            try:
                position = parser['position']
                ip = row.xpath(position['ip'])[0].text
                port_text = row.xpath(position['port'])[0].text
                addr = self.ips.getIpAddr(self.ips.str2ip(ip))
                domestic = text_('省') in addr or self.AuthCountry(addr)
                country = text_('国内') if domestic else text_('国外')
                # Done inside the try: a non-numeric or missing port drops
                # this row rather than raising out of the parser.
                port = int(port_text)
            except Exception:
                # Rows that cannot be parsed or geolocated are skipped.
                continue
            # Field meanings: types 0 = high anonymity, 1 = transparent;
            # protocol 0 = http. Defaults are used for both.
            proxylist.append({'ip': ip, 'port': port, 'types': 0, 'protocol': 0,
                              'country': country, 'area': addr, 'speed': 100})
        return proxylist
Ejemplo n.º 7
0
    def RegularPraser(self, response, parser):
        """
        Extract proxies from ``response`` with a regular expression.

        :param response: text to search
        :param parser: dict providing the regex under ``"pattern"`` and, under
            ``"position"``, the indexes of the ip and port inside each match
        :return: list of proxy dicts
        """
        collected = []
        for hit in re.compile(parser["pattern"]).findall(response):
            try:
                ip = hit[parser["position"]["ip"]]
                port = hit[parser["position"]["port"]]
                addr = self.ips.getIpAddr(self.ips.str2ip(ip))
                domestic = text_("省") in addr or self.AuthCountry(addr)
                country = text_("国内") if domestic else text_("国外")
            except Exception:
                continue

            # The type a site advertises is unreliable, so type and protocol
            # keep their default of 0 and are checked later.
            collected.append(
                {
                    "ip": ip,
                    "port": port,
                    "types": 0,
                    "protocol": 0,
                    "country": country,
                    "area": addr,
                    "speed": 100,
                }
            )
        return collected
Ejemplo n.º 8
0
    def RegularPraser(self, response, parser):
        '''
        Parse proxies out of a page using a regular expression.

        :param response: page text to search
        :param parser: dict with the regex in ``'pattern'`` and a
            ``'position'`` dict giving the ``'ip'`` and ``'port'`` indexes
            within each match
        :return: list of proxy dicts
        '''
        results = []
        regex = re.compile(parser['pattern'])
        for groups in regex.findall(response):
            try:
                where = parser['position']
                ip, port = groups[where['ip']], groups[where['port']]
                location = self.ips.getIpAddr(self.ips.str2ip(ip))
                if text_('省') in location or self.AuthCountry(location):
                    label = text_('国内')
                else:
                    label = text_('国外')
            except Exception:
                continue

            # Site-reported types are not trustworthy; the defaults (0) are
            # stored and verified later.
            entry = dict(ip=ip, port=port, types=0, protocol=0,
                         country=label, area=location, speed=100)
            results.append(entry)
        return results
Ejemplo n.º 9
0
    def parse_ip_to_addr(self, ip):
        """
        Look up ``ip`` and classify the result as domestic or foreign.

        :param ip: value passed to ``self.ips.str2ip`` before the lookup
        :return: ``(country, area)`` where ``country`` is the '国内' or '国外'
            label and ``area`` is the address text returned by the lookup
        """
        area = self.ips.getIpAddr(self.ips.str2ip(ip))
        domestic = '省' in area or self.auth_country(area)
        label = text_('国内') if domestic else text_('国外')
        return label, area
Ejemplo n.º 10
0
 def addrcut(self, addr):
     '''
     Shorten an address string and label it domestic or foreign.

     Domestic addresses keep the text before the first '市'; if that text
     contains '省' the part after it is used, otherwise its first two
     characters. Foreign addresses lose their last two characters.

     :param addr: address text
     :return: ``(country, addr)`` with the label and the shortened text
     '''
     if not (text_('省') in addr or self.AuthCountry(addr)):
         return text_('国外'), addr[:-2]

     label = text_('国内')
     before_city = addr.split('市')[0]
     if '省' in before_city:
         return label, before_city.split('省')[1]
     return label, before_city[:2]
Ejemplo n.º 11
0
 def AuthCountry(self, addr):
     """
     Report whether ``addr`` mentions any entry of ``CHINA_AREA``.

     :param addr: address text to inspect
     :return: True if some ``CHINA_AREA`` entry occurs in ``addr``, else False
     """
     return any(text_(name) in addr for name in CHINA_AREA)
Ejemplo n.º 12
0
 def auth_country(addr):
     """
     Check whether an address contains one of the ``CHINA_AREA`` names.

     :param addr: address text to inspect
     :return: True when at least one ``CHINA_AREA`` entry is a substring of
         ``addr``, False otherwise
     """
     hits = (text_(region) in addr for region in CHINA_AREA)
     return any(hits)
Ejemplo n.º 13
0
    def XpathParser(self, response, parser):
        """
        Parse proxies out of an HTML page with XPath.

        Geolocation is not performed here, so ``country`` and ``area`` are
        left as empty text.

        :param response: page content
        :param parser: dict with the row XPath under ``'pattern'`` and the
            per-row ``'ip'`` / ``'port'`` XPaths under ``'position'``
        :return: list of proxy dicts
        """
        proxylist = []
        # Parse the page content with lxml.etree.
        root = etree.HTML(response)
        rows = root.xpath(parser['pattern'])  # every proxy row on the page
        for row in rows:
            try:
                ip = row.xpath(parser['position']['ip'])[0].text
                # Converted inside the try so a malformed port is reported and
                # skipped instead of raising out of the parser.
                port = int(row.xpath(parser['position']['port'])[0].text)
                country = text_('')
                area = text_('')
            except Exception as e:
                print(e)
                print("XpathParser Error")
                continue

            proxylist.append({
                'ip': ip,
                'port': port,
                'types': 0,
                'protocol': 0,
                'country': country,
                'area': area,
                'speed': 100
            })
        return proxylist
Ejemplo n.º 14
0
 def AuthCountry(self, addr):
     '''
     Tell whether the address contains any ``CHINA_AREA`` name.

     :param addr: address text to inspect
     :return: True on the first ``CHINA_AREA`` entry found in ``addr``,
         False when none is found
     '''
     return any(text_(entry) in addr for entry in CHINA_AREA)
Ejemplo n.º 15
0
    def RegularPraser(self, response, parser):
        '''
        Run the configured regular expression over a page and build proxies.

        :param response: page text to search
        :param parser: dict; ``'pattern'`` is the regex and ``'position'``
            maps ``'ip'`` and ``'port'`` to their index in each match
        :return: list of proxy dicts
        '''
        output = []
        for found in re.compile(parser['pattern']).findall(response):
            try:
                ip = found[parser['position']['ip']]
                port = found[parser['position']['port']]
                addr = self.ips.getIpAddr(self.ips.str2ip(ip))
                inside = text_('省') in addr or self.AuthCountry(addr)
                country = text_('国内' if inside else '国外')
            except Exception:
                continue

            # Types published by sites are unreliable: store the default 0 for
            # type and protocol; they are checked later.
            output.append({'ip': ip, 'port': port, 'types': 0, 'protocol': 0,
                           'country': country, 'area': addr, 'speed': 100})
        return output
Ejemplo n.º 16
0
 def AuthCountry(self, addr):
     '''
     Decide whether ``addr`` names one of the ``CHINA_AREA`` entries.

     :param addr: address text to inspect
     :return: True if any ``CHINA_AREA`` entry is contained in ``addr``,
         otherwise False
     '''
     candidates = (text_(place) for place in CHINA_AREA)
     return any(candidate in addr for candidate in candidates)
Ejemplo n.º 17
0
    def XpathPraser(self, response, parser):
        """
        Build proxy entries from an HTML page using XPath selectors.

        :param response: HTML text of the page
        :param parser: dict with ``"pattern"`` (XPath selecting the proxy
            rows) and ``"position"`` (dict of ``"ip"`` and ``"port"`` XPaths
            applied to each row)
        :return: list of proxy dicts
        """
        proxylist = []
        root = etree.HTML(response)
        for element in root.xpath(parser["pattern"]):
            try:
                ip = element.xpath(parser["position"]["ip"])[0].text
                # int() is applied inside the try so an unparsable port skips
                # the row instead of propagating out of the loop.
                port = int(element.xpath(parser["position"]["port"])[0].text)
                addr = self.ips.getIpAddr(self.ips.str2ip(ip))
                if text_("省") in addr or self.AuthCountry(addr):
                    country = text_("国内")
                else:
                    country = text_("国外")
            except Exception:
                # Best effort: unparsable rows are ignored.
                continue
            # types: 0 = high anonymity, 1 = transparent; protocol: 0 = http.
            proxylist.append(
                {
                    "ip": ip,
                    "port": port,
                    "types": 0,
                    "protocol": 0,
                    "country": country,
                    "area": addr,
                    "speed": 100,
                }
            )
        return proxylist
Ejemplo n.º 18
0
 def dump(self, first, last):
     """
     Log the index entries numbered ``first`` up to, not including, ``last``.

     ``last`` is capped at ``self.indexCount``. Each entry is 7 bytes read
     from ``self.firstIndex + index * 7``.

     :param first: first index entry to log
     :param last: index entry at which to stop
     """
     stop = min(last, self.indexCount)
     for index in range(first, stop):
         self.ipdb.seek(self.firstIndex + index * 7)
         # NOTE(review): "IHB" uses native byte order - confirm it matches the file.
         ip, low16, high8 = struct.unpack("IHB", self.ipdb.read(7))
         # The record offset is stored as a 16-bit part plus an 8-bit high part.
         address = self.getAddr((high8 << 16) + low16)
         # Convert GBK to UTF-8.
         address = text_(address, 'gbk').encode("utf-8")
         logger.info("%d %s %s" % (index, self.ip2str(ip), address))
Ejemplo n.º 19
0
 def proxy_listParser(self, response, parser):
     """
     Parse proxies embedded in a page as base64-encoded ``Proxy('...')`` tokens.

     Geolocation is not performed here, so ``country`` and ``area`` are left
     as empty text.

     :param response: page text to scan
     :param parser: dict whose ``'pattern'`` regex locates the tokens
     :return: list of proxy dicts; empty when nothing matches or parses
     """
     proxylist = []
     pattern = re.compile(parser['pattern'])
     for match in pattern.findall(response):
         try:
             # Base64-decode the token, then turn the bytes into str.
             ip_port = base64.b64decode(
                 match.replace("Proxy('", "").replace("')", "")).decode()
             fields = ip_port.split(':')
             ip = fields[0]
             # Converted inside the try so a malformed port is reported and
             # skipped instead of raising out of the parser.
             port = int(fields[1])
             country = text_('')
             area = text_('')
         except Exception as e:
             print(e)
             continue
         proxylist.append({
             'ip': ip,
             'port': port,
             'types': 0,
             'protocol': 0,
             'country': country,
             'area': area,
             'speed': 100
         })
     # Always return a list, even when the page contains no matches.
     return proxylist
Ejemplo n.º 20
0
 def dump(self, first, last):
     """
     Write index entries ``first`` .. ``last - 1`` to the log.

     The upper bound never exceeds ``self.indexCount``. For every entry the
     7-byte index record is read, its address is resolved with ``getAddr``
     and one log line is emitted.

     :param first: first index entry to log
     :param last: index entry at which to stop
     """
     if self.indexCount < last:
         last = self.indexCount
     record_size = 7
     for index in range(first, last):
         position = self.firstIndex + index * record_size
         self.ipdb.seek(position)
         record = self.ipdb.read(record_size)
         # NOTE(review): "IHB" is native byte order - confirm against the file format.
         fields = struct.unpack("IHB", record)
         ip, offset_low, offset_high = fields
         address = self.getAddr(offset_low + (offset_high << 16))
         # Convert GBK to UTF-8.
         address = text_(address, 'gbk').encode("utf-8")
         logger.info("%d %s %s" % (index, self.ip2str(ip), address))
Ejemplo n.º 21
0
    def RegularPraser(self, response, parser):
        """
        Extract proxies from a page with a regular expression.

        :param response: page text to search
        :param parser: dict with the regex in ``'pattern'`` and a
            ``'position'`` dict giving the ``'ip'`` and ``'port'`` indexes
            within each match
        :return: list of proxy dicts, one per match that could be processed
        """
        proxylist = []

        pattern = re.compile(parser['pattern'])

        for match in pattern.findall(response):
            try:
                ip = match[parser['position']['ip']]
                port = match[parser['position']['port']]
                addr = self.ips.getIpAddr(self.ips.str2ip(ip))
                if text_('省') in addr or self.AuthCountry(addr):
                    country = text_('国内')
                else:
                    country = text_('国外')
            except Exception:
                # Best effort: skip matches that cannot be processed.
                continue

            # Append inside the loop so every match is kept. Appending after
            # the loop kept only the last one and failed on an unbound name
            # when there were no matches.
            proxylist.append({
                'ip': ip,
                'port': port,
                'types': 0,
                'protocol': 0,
                'country': country,
                'area': addr,
                'speed': 100
            })

        return proxylist
Ejemplo n.º 22
0
    def RegularParser(self, response, parser):
        '''
        Extract proxies from a page with a regular expression.

        Geolocation is not performed here, so ``country`` and ``area`` are
        left as empty text.

        :param response: page text to search
        :param parser: dict with the regex in ``'pattern'`` and a
            ``'position'`` dict giving the ``'ip'`` and ``'port'`` indexes
            within each match
        :return: list of proxy dicts
        '''
        proxylist = []
        pattern = re.compile(parser['pattern'])
        for match in pattern.findall(response):
            try:
                ip = match[parser['position']['ip']]
                # The key is 'position'. A misspelled key raises KeyError for
                # every match, which the handler below would swallow.
                port = match[parser['position']['port']]
                country = text_('')
                area = text_('')
            except Exception as e:
                print(e)
                continue

            # Types published by sites are unreliable, so the default 0 is
            # stored for type and protocol and checked later.
            proxylist.append({
                'ip': ip,
                'port': port,
                'types': 0,
                'protocol': 0,
                'country': country,
                'area': area,
                'speed': 100
            })
        return proxylist
Ejemplo n.º 23
0
    def XpathPraser(self, response, parser):
        """
        Parse a proxy listing page with XPath and geolocate each entry.

        :param response: HTML text of the page
        :param parser: dict with the row XPath under ``'pattern'`` and the
            per-row ``'ip'`` / ``'port'`` XPaths under ``'position'``
        :return: list of proxy dicts
        """
        proxylist = []
        root = etree.HTML(response)

        for row in root.xpath(parser['pattern']):
            try:
                ip = row.xpath(parser['position']['ip'])[0].text
                port_text = row.xpath(parser['position']['port'])[0].text
                addr = self.ips.getIpAddr(self.ips.str2ip(ip))

                print(addr)

                if text_('省') in addr or self.AuthCountry(addr):
                    country = text_('国内')
                else:
                    country = text_('国外')
                # Converted inside the try so a malformed port skips only
                # this row instead of aborting the whole page.
                port = int(port_text)
            except Exception:
                # Best effort: rows that fail to parse are skipped.
                continue

            proxylist.append({
                'ip': ip,
                'port': port,
                'types': 0,
                'protocol': 0,
                'country': country,
                'area': addr,
                'speed': 100
            })

        return proxylist
Ejemplo n.º 24
0
 def proxy_listPraser(self, response, parser):
     """
     Decode proxies that a page lists as base64 ``Proxy('...')`` tokens.

     :param response: page text to scan
     :param parser: dict; ``parser["pattern"]`` is the regex matching each
         token, whose ``Proxy('`` / ``')`` wrapper is stripped before the
         payload is base64-decoded and split on ``:``
     :return: list of proxy dicts, empty if no token could be used
     """
     proxylist = []
     pattern = re.compile(parser["pattern"])
     for match in pattern.findall(response):
         try:
             payload = match.replace("Proxy('", "").replace("')", "")
             ip_port = base64.b64decode(payload)
             # bytes on Python 3: decode before splitting on a str separator,
             # otherwise split() raises TypeError and the entry is lost.
             if isinstance(ip_port, bytes):
                 ip_port = ip_port.decode("utf-8")
             parts = ip_port.split(":")
             ip = parts[0]
             # Inside the try so a non-numeric port only drops this entry.
             port = int(parts[1])
             addr = self.ips.getIpAddr(self.ips.str2ip(ip))
             if text_("省") in addr or self.AuthCountry(addr):
                 country = text_("国内")
             else:
                 country = text_("国外")
         except Exception:
             # Best effort: skip tokens that cannot be decoded or located.
             continue
         proxylist.append(
             {
                 "ip": ip,
                 "port": port,
                 "types": 0,
                 "protocol": 0,
                 "country": country,
                 "area": addr,
                 "speed": 100,
             }
         )
     # Return the list unconditionally so callers never receive None.
     return proxylist
Ejemplo n.º 25
0
 def getAddr(self, offset, ip=0):
     """
     Return "<country> <area>" for the record stored at ``offset``.

     One byte read at ``offset + 4`` decides where the strings live:
     0x01 follows a 3-byte offset to the country (which may point onward
     once more when the byte found there is 0x02), 0x02 follows a 3-byte
     offset for the country only and reads the area at ``offset + 8``,
     and any other value reads both strings in place.

     :param offset: position of the record inside ``self.ipdb``
     :param ip: not used by this method
     :return: country text and area text joined by one space
     """
     self.ipdb.seek(offset + 4)
     country_text = text_("")
     area_text = text_("")
     mode = struct.unpack('B', self.ipdb.read(1))[0]
     if mode == 0x01:
         target = self.getLong3()
         self.ipdb.seek(target)
         nested_mode = struct.unpack('B', self.ipdb.read(1))[0]
         if nested_mode == 0x02:
             country_text = self.getString(self.getLong3())
             # Move past the nested pointer so the area is read from there.
             self.ipdb.seek(target + 4)
         else:
             country_text = self.getString(target)
         area_text = self.getAreaAddr()
     elif mode == 0x02:
         country_text = self.getString(self.getLong3())
         area_text = self.getAreaAddr(offset + 8)
     else:
         country_text = self.getString(offset + 4)
         area_text = self.getAreaAddr()
     separator = text_(" ")
     return country_text + separator + area_text
Ejemplo n.º 26
0
 def getIpAddr(self, ip):
     """
     Locate ``ip`` in the index by bisection and return its address text.

     :param ip: integer IP value compared against the index range bounds
     :return: address text for the range containing ``ip``, or the
         "未找到该IP的地址" message when no range contains it
     """
     lo, hi = 0, self.indexCount - 1
     while hi - lo > 1:
         mid = (lo + hi) // 2
         self.setIpRange(mid)
         if ip < self.curStartIp:
             hi = mid
             continue
         lo = mid
         if ip == self.curStartIp:
             break
     self.setIpRange(lo)
     # 255.255.255.x holds version information; use the upper range for it.
     if (ip & 0xffffff00) == 0xffffff00:
         self.setIpRange(hi)
     if not (self.curStartIp <= ip <= self.curEndIp):
         return text_("未找到该IP的地址")
     return text_(self.getAddr(self.curEndIpOffset))
Ejemplo n.º 27
0
 def getIpAddr(self, ip):
     """
     Binary-search the index for ``ip`` and return the matching address.

     :param ip: integer IP value compared against the index range bounds
     :return: the address text, or "未找到该IP的地址" when ``ip`` is outside
         the range that the search settles on
     """
     left = 0
     right = self.indexCount - 1
     while right - left > 1:
         middle = (left + right) // 2
         self.setIpRange(middle)
         if ip == self.curStartIp:
             left = middle
             break
         elif ip > self.curStartIp:
             left = middle
         else:
             right = middle
     self.setIpRange(left)

     # Addresses of the form 255.255.255.x carry version information and
     # are served from the upper bound's range.
     is_version_block = (ip & 0xffffff00) == 0xffffff00
     if is_version_block:
         self.setIpRange(right)

     if self.curStartIp <= ip <= self.curEndIp:
         found = self.getAddr(self.curEndIpOffset)
         return text_(found)
     return text_("未找到该IP的地址")
Ejemplo n.º 28
0
    def XpathPraser(self, response, parser):
        '''
        Parse a proxy listing page with XPath, logging progress and failures.

        :param response: HTML text of the page
        :param parser: dict with the row XPath under ``'pattern'`` and the
            per-row ``'ip'`` / ``'port'`` XPaths under ``'position'``; the
            first result of each is converted with ``str``
        :return: list of proxy dicts
        '''
        proxylist = []
        root = etree.HTML(response)
        proxys = root.xpath(parser['pattern'])
        logger.info("xpath parser identified ip size:{}", len(proxys))
        for proxy in proxys:
            try:
                ip = str(proxy.xpath(parser['position']['ip'])[0])
                port_text = str(proxy.xpath(parser['position']['port'])[0])
                logger.info("ip:{}, type:{}", ip, type(ip))
                addr = self.ips.getIpAddr(self.ips.str2ip(ip))
                if text_('省') in addr or self.AuthCountry(addr):
                    country = text_('国内')
                else:
                    country = text_('国外')
                # Converted inside the try so a malformed port is logged and
                # skipped like any other parse failure.
                port = int(port_text)
            except Exception:
                info = traceback.format_exc()
                logger.info("parse ip/port error:{}", info)
                continue
            # types: 0 = high anonymity, 1 = transparent; protocol: 0 = http.
            proxylist.append({'ip': ip, 'port': port, 'types': 0, 'protocol': 0,
                              'country': country, 'area': addr, 'speed': 100})
        return proxylist
Ejemplo n.º 29
0
 def AuthCountry(self, addr):
     '''
     Tell domestic from foreign addresses.

     :param addr: address text to inspect
     :return: True if any ``CHINA_AREA`` entry occurs in ``addr``, else False
     '''
     return any(text_(zone) in addr for zone in CHINA_AREA)