def XpathPraser(self, response, parser):
    """
    Extract proxy entries from an HTML page using XPath expressions.

    :param response: HTML text handed to ``etree.HTML``.
    :param parser: dict with a ``'pattern'`` XPath selecting one node per
        proxy, and a ``'position'`` dict whose ``'ip'`` and ``'port'`` XPaths
        are evaluated relative to each selected node.
    :return: list of dicts with the keys ``ip``, ``port``, ``types``,
        ``protocol``, ``country``, ``area`` and ``speed``.
    """
    proxylist = []
    root = etree.HTML(response)
    for node in root.xpath(parser['pattern']):
        try:
            ip = node.xpath(parser['position']['ip'])[0].text
            # Convert inside the guarded section so that a missing or
            # non-numeric port only skips this entry instead of aborting
            # the parse of the whole page.
            port = int(node.xpath(parser['position']['port'])[0].text)
            addr = self.ips.getIpAddr(self.ips.str2ip(ip))
            if text_('省') in addr or self.AuthCountry(addr):
                country = text_('国内')
            else:
                country = text_('国外')
        except Exception:
            # Best effort: entries that cannot be parsed or located are dropped.
            continue
        proxylist.append({
            'ip': ip,
            'port': port,
            # Anonymity type: 0 = high anonymity, 1 = transparent.
            'types': 0,
            # Protocol: 0 = http, 1 = https/http.
            'protocol': 0,
            'country': country,
            'area': addr,
            'speed': 100,
        })
    return proxylist
def proxy_listPraser(self, response, parser):
    """
    Extract proxies from a page that embeds them as ``Proxy('<base64>')``.

    Every regex match has its ``Proxy('`` / ``')`` wrapper removed and the
    remainder base64-decoded into an ``ip:port`` string.

    :param response: page text to search.
    :param parser: dict whose ``'pattern'`` entry is the regular expression.
    :return: list of dicts with the keys ``ip``, ``port``, ``types``,
        ``protocol``, ``country``, ``area`` and ``speed``.
    """
    proxylist = []
    for match in re.compile(parser['pattern']).findall(response):
        try:
            encoded = match.replace("Proxy('", "").replace("')", "")
            # b64decode returns bytes; splitting bytes on a str separator
            # raises TypeError, which silently discarded every match.
            ip_port = base64.b64decode(encoded).decode('utf-8')
            fields = ip_port.split(':')
            ip = fields[0]
            # Converted here so a malformed port skips only this entry.
            port = int(fields[1])
            addr = self.ips.getIpAddr(self.ips.str2ip(ip))
            if text_('省') in addr or self.AuthCountry(addr):
                country = text_('国内')
            else:
                country = text_('国外')
        except Exception:
            # Best effort: entries that cannot be decoded or located are dropped.
            continue
        proxylist.append({
            'ip': ip,
            'port': port,
            'types': 0,
            'protocol': 0,
            'country': country,
            'area': addr,
            'speed': 100,
        })
    return proxylist
def getAddr(self, offset, ip=0):
    """
    Read the country and area strings for the record at ``offset``.

    The byte at ``offset + 4`` selects the layout: ``0x01`` and ``0x02``
    mean the country string is reached through a 3-byte offset read with
    ``getLong3`` (presumably redirect markers - confirm against the
    database format); any other value means the string starts at
    ``offset + 4`` itself.

    :param offset: position of the record in ``self.ipdb``.
    :param ip: unused.
    :return: country string, a single space, and area string, concatenated.
    """
    self.ipdb.seek(offset + 4)
    (mode,) = struct.unpack('B', self.ipdb.read(1))
    if mode == 0x01:
        country_pos = self.getLong3()
        self.ipdb.seek(country_pos)
        (inner_mode,) = struct.unpack('B', self.ipdb.read(1))
        if inner_mode == 0x02:
            country = self.getString(self.getLong3())
            # Reposition so the area is read from just after the country field.
            self.ipdb.seek(country_pos + 4)
        else:
            country = self.getString(country_pos)
        area = self.getAreaAddr()
    elif mode == 0x02:
        country = self.getString(self.getLong3())
        area = self.getAreaAddr(offset + 8)
    else:
        country = self.getString(offset + 4)
        area = self.getAreaAddr()
    return country + text_(" ") + area
def getIpAddr(self, ip):
    """
    Find the address text for a numeric IP by binary search over the index.

    :param ip: IP address as an integer.
    :return: the address text, or the "address not found" message when the
        IP is outside the range that was located.
    """
    lo = 0
    hi = self.indexCount - 1
    # Narrow [lo, hi] until the two bounds are adjacent.
    while lo < hi - 1:
        mid = int((lo + hi) / 2)
        self.setIpRange(mid)
        if ip == self.curStartIp:
            lo = mid
            break
        elif ip > self.curStartIp:
            lo = mid
        else:
            hi = mid
    self.setIpRange(lo)
    # Addresses in 255.255.255.x are served by the upper bound entry.
    if (ip & 0xffffff00) == 0xffffff00:
        self.setIpRange(hi)
    if not (self.curStartIp <= ip <= self.curEndIp):
        return text_('未找到该IP的地址')
    return text_(self.getAddr(self.curEndIpOffset))
def RegularPraser(self, response, parser):
    """
    Extract proxy entries from page text with a regular expression.

    :param response: page text to search.
    :param parser: dict with a ``"pattern"`` regex and a ``"position"`` dict
        giving the indexes of the ip and port inside each match.
    :return: list of dicts with the keys ``ip``, ``port``, ``types``,
        ``protocol``, ``country``, ``area`` and ``speed``.
    """
    results = []
    found = re.compile(parser["pattern"]).findall(response)
    if found is None:
        return results
    for groups in found:
        try:
            ip = groups[parser["position"]["ip"]]
            port = groups[parser["position"]["port"]]
            addr = self.ips.getIpAddr(self.ips.str2ip(ip))
            domestic = text_("省") in addr or self.AuthCountry(addr)
            country = text_("国内") if domestic else text_("国外")
        except Exception:
            # Entries that cannot be read or located are skipped.
            continue
        results.append({
            "ip": ip,
            "port": port,
            # The type reported by sites is unreliable, so the default is
            # kept here and it is detected later.
            "types": 0,
            # Protocol detection from the page is disabled; default to 0.
            "protocol": 0,
            "country": country,
            "area": addr,
            "speed": 100,
        })
    return results
def RegularPraser(self, response, parser):
    """
    Parse proxies out of ``response`` using a regular expression.

    :param response: page text to search.
    :param parser: dict holding the regex under ``'pattern'`` and, under
        ``'position'``, the indexes of the ip and port within a match.
    :return: list of proxy dicts (``ip``, ``port``, ``types``, ``protocol``,
        ``country``, ``area``, ``speed``).
    """
    proxylist = []
    regex = re.compile(parser['pattern'])
    hits = regex.findall(response)
    if hits is not None:
        for hit in hits:
            try:
                ip = hit[parser['position']['ip']]
                port = hit[parser['position']['port']]
                addr = self.ips.getIpAddr(self.ips.str2ip(ip))
                if text_('省') in addr or self.AuthCountry(addr):
                    country = text_('国内')
                else:
                    country = text_('国外')
            except Exception:
                # Skip anything that fails to parse or to resolve.
                continue
            # Type and protocol stay at their defaults (0); the type reported
            # by sites is unreliable and is detected later.
            entry = dict(
                ip=ip,
                port=port,
                types=0,
                protocol=0,
                country=country,
                area=addr,
                speed=100,
            )
            proxylist.append(entry)
    return proxylist
def parse_ip_to_addr(self, ip):
    """
    Look up ``ip`` and label its address as domestic or foreign.

    :param ip: IP address string, converted with ``self.ips.str2ip``.
    :return: ``(country, area)`` where ``area`` is the looked-up address and
        ``country`` is the domestic or foreign label.
    """
    area = self.ips.getIpAddr(self.ips.str2ip(ip))
    is_domestic = '省' in area or self.auth_country(area)
    country = text_('国内') if is_domestic else text_('国外')
    return country, area
def addrcut(self, addr):
    """
    Cut an address string down to a short location.

    Per the original note, domestic addresses look like
    province + city + service or city + service, and foreign ones like
    region + service.

    :param addr: address text.
    :return: ``(country, addr)`` with the domestic/foreign label and the
        shortened address.
    """
    if not (text_('省') in addr or self.AuthCountry(addr)):
        # Foreign: drop the last two characters.
        return text_('国外'), addr[:-2]
    country = text_('国内')
    before_city = addr.split('市')[0]
    if '省' in before_city:
        # Keep what follows the province marker.
        return country, before_city.split('省')[1]
    return country, before_city[:2]
def AuthCountry(self, addr):
    """
    Tell whether ``addr`` mentions any entry of ``CHINA_AREA``.

    :param addr: address text to inspect.
    :return: True as soon as one area name is found in ``addr``, else False.
    """
    return any(text_(name) in addr for name in CHINA_AREA)
def auth_country(addr):
    """
    Check whether the address contains one of the ``CHINA_AREA`` names.

    :param addr: address text to inspect.
    :return: True if any area name occurs in ``addr``, otherwise False.
    """
    # Lazy generator, so checking stops at the first hit.
    hits = (text_(name) in addr for name in CHINA_AREA)
    return any(hits)
def XpathParser(self, response, parser):
    """
    Parse a page with XPath and return the proxies it lists.

    :param response: page content.
    :param parser: dict with a ``'pattern'`` XPath selecting one node per
        proxy, and a ``'position'`` dict with the ``'ip'`` and ``'port'``
        XPaths evaluated on each node.
    :return: list of dicts with the keys ``ip``, ``port``, ``types``,
        ``protocol``, ``country``, ``area`` and ``speed``.
    """
    proxylist = []
    # Parse the page content with lxml.etree.
    root = etree.HTML(response)
    # One node per proxy entry on the page.
    for node in root.xpath(parser['pattern']):
        try:
            ip = node.xpath(parser['position']['ip'])[0].text
            # Converted inside the guarded section so a missing or
            # non-numeric port skips this entry instead of aborting the page.
            port = int(node.xpath(parser['position']['port'])[0].text)
            # The geolocation lookup is disabled here, so both fields
            # stay empty.
            country = text_('')
            area = text_('')
        except Exception as e:
            print(e)
            print("XpathParser Error")
            continue
        proxylist.append({
            'ip': ip,
            'port': port,
            'types': 0,
            'protocol': 0,
            'country': country,
            'area': area,
            'speed': 100,
        })
    return proxylist
def AuthCountry(self, addr):
    """
    Decide whether the address matches one of the ``CHINA_AREA`` names.

    :param addr: address text to inspect.
    :return: True when an area name is contained in ``addr``, else False.
    """
    for region in CHINA_AREA:
        if text_(region) not in addr:
            continue
        return True
    return False
def RegularPraser(self, response, parser):
    """
    Collect proxies from ``response`` by regular-expression matching.

    :param response: page text to search.
    :param parser: dict with the regex under ``'pattern'`` and the match
        indexes of ip and port under ``'position'``.
    :return: list of proxy dicts (``ip``, ``port``, ``types``, ``protocol``,
        ``country``, ``area``, ``speed``).
    """
    collected = []
    all_matches = re.compile(parser['pattern']).findall(response)
    if all_matches is None:
        return collected
    for item in all_matches:
        try:
            where = parser['position']
            ip = item[where['ip']]
            port = item[where['port']]
            addr = self.ips.getIpAddr(self.ips.str2ip(ip))
            in_china = text_('省') in addr or self.AuthCountry(addr)
            if in_china:
                country = text_('国内')
            else:
                country = text_('国外')
        except Exception:
            # Unparseable or unresolvable entries are ignored.
            continue
        # The type reported by sites is unreliable, so type and protocol
        # keep the default 0 and are detected later.
        collected.append({
            'ip': ip,
            'port': port,
            'types': 0,
            'protocol': 0,
            'country': country,
            'area': addr,
            'speed': 100,
        })
    return collected
def XpathPraser(self, response, parser):
    """
    Parse proxies from an HTML page by XPath.

    :param response: HTML text handed to ``etree.HTML``.
    :param parser: dict with a ``"pattern"`` XPath selecting one node per
        proxy, and a ``"position"`` dict whose ``"ip"`` and ``"port"`` XPaths
        are evaluated relative to each node.
    :return: list of dicts with the keys ``ip``, ``port``, ``types``,
        ``protocol``, ``country``, ``area`` and ``speed``.
    """
    proxylist = []
    root = etree.HTML(response)
    for node in root.xpath(parser["pattern"]):
        try:
            ip = node.xpath(parser["position"]["ip"])[0].text
            port_text = node.xpath(parser["position"]["port"])[0].text
            # int() used to run after the try block, so one bad port value
            # raised out of the loop and lost the whole page.
            port = int(port_text)
            addr = self.ips.getIpAddr(self.ips.str2ip(ip))
            domestic = text_("省") in addr or self.AuthCountry(addr)
            country = text_("国内") if domestic else text_("国外")
        except Exception:
            # Best effort: drop entries that cannot be parsed or located.
            continue
        proxylist.append({
            "ip": ip,
            "port": port,
            # Anonymity type: 0 = high anonymity, 1 = transparent.
            "types": 0,
            # Protocol: 0 = http, 1 = https/http.
            "protocol": 0,
            "country": country,
            "area": addr,
            "speed": 100,
        })
    return proxylist
def dump(self, first, last):
    """
    Log the address of every index entry in ``[first, last)``.

    :param first: first index entry to log.
    :param last: end of the range (exclusive); clamped to
        ``self.indexCount``.
    """
    last = min(last, self.indexCount)
    for index in range(first, last):
        # Index entries are 7 bytes each, starting at ``self.firstIndex``.
        self.ipdb.seek(self.firstIndex + index * 7)
        buf = self.ipdb.read(7)
        # The record offset is rebuilt below as low 16 bits + high 8 bits,
        # i.e. a little-endian 3-byte value, so decode explicitly as
        # little-endian instead of relying on the host's native byte order
        # and alignment.
        ip, of1, of2 = struct.unpack("<IHB", buf)
        address = self.getAddr(of1 + (of2 << 16))
        # Convert GBK to UTF-8.
        address = text_(address, 'gbk').encode("utf-8")
        logger.info("%d %s %s" % (index, self.ip2str(ip), address))
def proxy_listParser(self, response, parser):
    """
    Extract proxies from a page that embeds them as ``Proxy('<base64>')``.

    :param response: page text to search.
    :param parser: dict whose ``'pattern'`` entry is the regular expression.
    :return: list of dicts with the keys ``ip``, ``port``, ``types``,
        ``protocol``, ``country``, ``area`` and ``speed``.
    """
    proxylist = []
    for match in re.compile(parser['pattern']).findall(response):
        try:
            # Strip the Proxy('...') wrapper and base64-decode the payload.
            raw = base64.b64decode(
                match.replace("Proxy('", "").replace("')", ""))
            # The decoded payload is bytes; turn it into str before splitting.
            ip_port = raw.decode()
            fields = ip_port.split(':')
            ip = fields[0]
            # Converted inside the guarded section so a malformed port
            # skips this entry instead of aborting the whole page.
            port = int(fields[1])
            # The geolocation lookup is disabled here, so both fields
            # stay empty.
            country = text_('')
            area = text_('')
        except Exception as e:
            print(e)
            continue
        proxylist.append({
            'ip': ip,
            'port': port,
            'types': 0,
            'protocol': 0,
            'country': country,
            'area': area,
            'speed': 100,
        })
    return proxylist
def RegularPraser(self, response, parser):
    """
    Pull proxies out of ``response`` with the configured regular expression.

    :param response: page text to search.
    :param parser: dict with the regex under ``'pattern'`` and the match
        indexes of ip and port under ``'position'``.
    :return: list of proxy dicts (``ip``, ``port``, ``types``, ``protocol``,
        ``country``, ``area``, ``speed``).
    """
    output = []
    candidates = re.compile(parser['pattern']).findall(response)
    if candidates is not None:
        for candidate in candidates:
            try:
                ip = candidate[parser['position']['ip']]
                port = candidate[parser['position']['port']]
                location = self.ips.getIpAddr(self.ips.str2ip(ip))
                if not (text_('省') in location
                        or self.AuthCountry(location)):
                    country = text_('国外')
                else:
                    country = text_('国内')
            except Exception:
                # Skip entries that fail to parse or resolve.
                continue
            record = {'ip': ip, 'port': port}
            record['types'] = 0
            record['protocol'] = 0
            record['country'] = country
            record['area'] = location
            record['speed'] = 100
            output.append(record)
    return output
def RegularParser(self, response, parser):
    """
    Extract proxy entries from page text with a regular expression.

    :param response: page text to search.
    :param parser: dict with a ``'pattern'`` regex and a ``'position'`` dict
        giving the indexes of the ip and port inside each match.
    :return: list of dicts with the keys ``ip``, ``port``, ``types``,
        ``protocol``, ``country``, ``area`` and ``speed``.
    """
    proxylist = []
    for match in re.compile(parser['pattern']).findall(response):
        try:
            ip = match[parser['position']['ip']]
            # The key was misspelled 'postiont'; the resulting KeyError was
            # swallowed below, so no proxy was ever returned.
            port = match[parser['position']['port']]
            # The geolocation lookup is disabled here, so both fields
            # stay empty.
            country = text_('')
            area = text_('')
        except Exception as e:
            print(e)
            continue
        proxylist.append({
            'ip': ip,
            'port': port,
            # The type reported by sites is unreliable, so the default is
            # kept here and it is detected later.
            'types': 0,
            'protocol': 0,
            'country': country,
            'area': area,
            'speed': 100,
        })
    return proxylist
def XpathPraser(self, response, parser):
    """
    Extract proxy entries from an HTML page using XPath expressions.

    :param response: HTML text handed to ``etree.HTML``.
    :param parser: dict with a ``'pattern'`` XPath selecting one node per
        proxy, and a ``'position'`` dict whose ``'ip'`` and ``'port'`` XPaths
        are evaluated relative to each selected node.
    :return: list of dicts with the keys ``ip``, ``port``, ``types``,
        ``protocol``, ``country``, ``area`` and ``speed``.
    """
    proxylist = []
    root = etree.HTML(response)
    for node in root.xpath(parser['pattern']):
        try:
            ip = node.xpath(parser['position']['ip'])[0].text
            port_text = node.xpath(parser['position']['port'])[0].text
            addr = self.ips.getIpAddr(self.ips.str2ip(ip))
            # Prints the resolved address, as the original code did.
            print(addr)
            # Converted inside the guarded section so a missing or
            # non-numeric port skips this entry instead of aborting the page.
            port = int(port_text)
            if text_('省') in addr or self.AuthCountry(addr):
                country = text_('国内')
            else:
                country = text_('国外')
        except Exception:
            # Best effort: entries that cannot be parsed or located are dropped.
            continue
        proxylist.append({
            'ip': ip,
            'port': port,
            'types': 0,
            'protocol': 0,
            'country': country,
            'area': addr,
            'speed': 100,
        })
    return proxylist
def proxy_listPraser(self, response, parser):
    """
    Parse proxies that a page embeds as base64 inside ``Proxy('...')``.

    :param response: page text to search.
    :param parser: dict whose ``"pattern"`` entry is the regular expression.
    :return: list of dicts with the keys ``ip``, ``port``, ``types``,
        ``protocol``, ``country``, ``area`` and ``speed``.
    """
    proxylist = []
    pattern = re.compile(parser["pattern"])
    for match in pattern.findall(response):
        try:
            payload = match.replace("Proxy('", "").replace("')", "")
            # b64decode returns bytes, and bytes.split(":") raises TypeError;
            # that error was swallowed, so every match was dropped.
            # Decode to text before splitting.
            ip_port = base64.b64decode(payload).decode("utf-8")
            pieces = ip_port.split(":")
            ip = pieces[0]
            # Converted here so a malformed port skips only this entry.
            port = int(pieces[1])
            addr = self.ips.getIpAddr(self.ips.str2ip(ip))
            domestic = text_("省") in addr or self.AuthCountry(addr)
            country = text_("国内") if domestic else text_("国外")
        except Exception:
            # Best effort: drop entries that cannot be decoded or located.
            continue
        proxylist.append({
            "ip": ip,
            "port": port,
            "types": 0,
            "protocol": 0,
            "country": country,
            "area": addr,
            "speed": 100,
        })
    return proxylist
def getAddr(self, offset, ip=0):
    """
    Return "<country> <area>" for the record stored at ``offset``.

    A flag byte read at ``offset + 4`` decides where the country string is
    taken from; the area is then read with ``getAreaAddr``.

    :param offset: position of the record in ``self.ipdb``.
    :param ip: unused.
    :return: country string, a single space, and area string, concatenated.
    """
    db = self.ipdb
    db.seek(offset + 4)
    flag = struct.unpack('B', db.read(1))[0]
    if flag == 0x01:
        # The country data sits at a 3-byte offset read from the file.
        target = self.getLong3()
        db.seek(target)
        nested_flag = struct.unpack('B', db.read(1))[0]
        if nested_flag == 0x02:
            # Flag 0x02 at the target: the string is one more offset away.
            country_text = self.getString(self.getLong3())
            db.seek(target + 4)
        else:
            country_text = self.getString(target)
        area_text = self.getAreaAddr()
    elif flag == 0x02:
        country_text = self.getString(self.getLong3())
        area_text = self.getAreaAddr(offset + 8)
    else:
        # No flag: the country string starts right at offset + 4.
        country_text = self.getString(offset + 4)
        area_text = self.getAreaAddr()
    return country_text + text_(" ") + area_text
def getIpAddr(self, ip):
    """
    Binary-search the index for ``ip`` and return its address text.

    :param ip: IP address as an integer.
    :return: the address text, or the "address not found" message when the
        IP falls outside the located range.
    """
    low, high = 0, self.indexCount - 1
    while low < high - 1:
        middle = int((low + high) / 2)
        self.setIpRange(middle)
        if ip == self.curStartIp:
            low = middle
            break
        if ip > self.curStartIp:
            low = middle
        else:
            high = middle
    self.setIpRange(low)
    # Version information, 255.255.255.x: ugly but useful.
    top_block = 0xffffff00
    if ip & top_block == top_block:
        self.setIpRange(high)
    if self.curStartIp <= ip <= self.curEndIp:
        # Normalise to text (original note: convert GBK to UTF-8).
        return text_(self.getAddr(self.curEndIpOffset))
    return text_("未找到该IP的地址")
def XpathPraser(self, response, parser):
    """
    Extract proxy entries from an HTML page using XPath expressions.

    :param response: HTML text handed to ``etree.HTML``.
    :param parser: dict with a ``'pattern'`` XPath selecting one node per
        proxy, and a ``'position'`` dict whose ``'ip'`` and ``'port'`` XPaths
        are evaluated relative to each node; their first result is converted
        with ``str``.
    :return: list of dicts with the keys ``ip``, ``port``, ``types``,
        ``protocol``, ``country``, ``area`` and ``speed``.
    """
    proxylist = []
    root = etree.HTML(response)
    proxys = root.xpath(parser['pattern'])
    logger.info("xpath parser identified ip size:{}", len(proxys))
    for node in proxys:
        try:
            ip = str(node.xpath(parser['position']['ip'])[0])
            port_text = str(node.xpath(parser['position']['port'])[0])
            logger.info("ip:{}, type:{}", ip, type(ip))
            # Converted inside the guarded section so a non-numeric port is
            # logged and skipped instead of aborting the whole page.
            port = int(port_text)
            addr = self.ips.getIpAddr(self.ips.str2ip(ip))
            if text_('省') in addr or self.AuthCountry(addr):
                country = text_('国内')
            else:
                country = text_('国外')
        except Exception:
            info = traceback.format_exc()
            logger.info("parse ip/port error:{}", info)
            continue
        proxylist.append({
            'ip': ip,
            'port': port,
            # Anonymity type: 0 = high anonymity, 1 = transparent.
            'types': 0,
            # Protocol: 0 = http, 1 = https/http.
            'protocol': 0,
            'country': country,
            'area': addr,
            'speed': 100,
        })
    return proxylist
def AuthCountry(self, addr):
    """
    Classify an address as domestic or foreign.

    :param addr: address text to inspect.
    :return: True if any ``CHINA_AREA`` name occurs in ``addr`` (domestic),
        False otherwise.
    """
    found = False
    for place in CHINA_AREA:
        if text_(place) in addr:
            found = True
            break
    return found