def RegularPraser(self, response, parser):
    '''
    Extract proxies from a page using a regular expression.

    :param response: text searched with ``parser['pattern']``.
    :param parser: dict providing ``'pattern'`` (regex source) and
        ``'position'``, whose ``'ip'`` and ``'port'`` entries index into
        each match returned by ``findall``.
    :return: list of proxy dicts with the keys ``ip``, ``port``, ``types``,
        ``protocol``, ``country``, ``area`` and ``speed``. Matches that
        raise while being processed are skipped.
    '''
    found = []
    # findall always returns a list, so an empty result simply skips the loop.
    for match in re.compile(parser['pattern']).findall(response):
        try:
            where = parser['position']
            ip = match[where['ip']]
            # NOTE(review): the port is kept exactly as matched (no int
            # conversion happens in this parser).
            port = match[where['port']]
            addr = self.ips.getIpAddr(self.ips.str2ip(ip))
            # An address naming a province, or one recognised by
            # AuthCountry, is labelled domestic; anything else foreign.
            domestic = text_('省') in addr or self.AuthCountry(addr)
            country = text_('国内') if domestic else text_('国外')
        except Exception:
            # Best effort: a match that cannot be processed is dropped.
            continue
        found.append({
            'ip': ip,
            'port': port,
            # The type advertised by the site is unreliable, so the default
            # is kept here; it gets checked later.
            'types': 0,
            # Protocol detection from the page is disabled; default to 0.
            'protocol': 0,
            'country': country,
            'area': addr,
            'speed': 100,
        })
    return found
def AuthCountry(self, addr):
    '''
    Tell whether an address mentions one of the CHINA_AREA entries.

    :param addr: address text to inspect.
    :return: True as soon as any entry of CHINA_AREA (converted with
        ``text_``) occurs in ``addr``, otherwise False.
    '''
    return any(text_(name) in addr for name in CHINA_AREA)
def XpathPraser(self, response, parser):
    '''
    Extract proxies from an HTML page using xpath expressions.

    :param response: HTML text handed to ``etree.HTML``.
    :param parser: dict providing ``'pattern'`` (xpath selecting one node
        per proxy) and ``'position'``, whose ``'ip'`` and ``'port'``
        entries are xpaths evaluated against each selected node; the text
        of the first result is used.
    :return: list of proxy dicts with the keys ip, port (int), types
        (0 = high anonymity, 1 = transparent), protocol (0 = http,
        1 = https/http), country, area and speed. Rows that cannot be
        processed, including rows whose port is not an integer, are
        skipped.
    '''
    proxylist = []
    root = etree.HTML(response)
    for node in root.xpath(parser['pattern']):
        try:
            ip = node.xpath(parser['position']['ip'])[0].text
            # Convert inside the try block: a missing or non-numeric port
            # must only drop this row, not abort the whole page.
            port = int(node.xpath(parser['position']['port'])[0].text)
            addr = self.ips.getIpAddr(self.ips.str2ip(ip))
            # An address naming a province, or one recognised by
            # AuthCountry, is labelled domestic; anything else foreign.
            if text_('省') in addr or self.AuthCountry(addr):
                country = text_('国内')
            else:
                country = text_('国外')
        except Exception:
            # Best effort: a row that cannot be processed is dropped.
            continue
        proxylist.append({
            'ip': ip,
            'port': port,
            # Type and protocol are not read from the page; both default to 0.
            'types': 0,
            'protocol': 0,
            'country': country,
            'area': addr,
            'speed': 100,
        })
    return proxylist
def dump(self, first, last):
    '''
    Log the index records numbered ``first`` up to, but excluding, ``last``.

    :param first: number of the first index record to log.
    :param last: exclusive upper bound; capped at ``self.indexCount``.
    :return: None. One line per record is written through ``logger.info``.
    '''
    stop = min(last, self.indexCount)
    for index in range(first, stop):
        # Every index record occupies 7 bytes, starting at firstIndex.
        self.ipdb.seek(self.firstIndex + index * 7)
        record = self.ipdb.read(7)
        ip, of1, of2 = struct.unpack("IHB", record)
        # The address offset is stored in two parts: of1 supplies the low
        # 16 bits and of2 the bits above them.
        located = self.getAddr(of1 + (of2 << 16))
        # Convert GBK to utf-8.
        encoded = text_(located, 'gbk').encode("utf-8")
        logger.info("%d %s %s" % (index, self.ip2str(ip), encoded))
def getIpAddr(self, ip):
    '''
    Look up the address text for an IP by binary search over the index.

    :param ip: the IP as an integer (it is compared with
        ``self.curStartIp`` and masked with ``0xffffff00``).
    :return: the text returned by ``getAddr`` for the matching range, or
        the "address not found" message when ``ip`` lies outside the
        selected range.

    NOTE(review): relies on ``setIpRange(i)`` refreshing ``curStartIp``,
    ``curEndIp`` and ``curEndIpOffset`` for index ``i`` - confirm against
    its definition.
    '''
    lo = 0
    hi = self.indexCount - 1
    # Narrow [lo, hi] until the two bounds are adjacent.
    while lo < hi - 1:
        mid = (lo + hi) // 2
        self.setIpRange(mid)
        start = self.curStartIp
        if ip < start:
            hi = mid
            continue
        lo = mid
        if ip == start:
            # Exact hit on the start of a range; stop searching.
            break
    self.setIpRange(lo)
    # Version information lives at 255.255.255.X; ugly but useful.
    if (ip & 0xffffff00) == 0xffffff00:
        self.setIpRange(hi)
    if not (self.curStartIp <= ip <= self.curEndIp):
        return text_("未找到该IP的地址")
    # Normalise whatever getAddr returned through text_.
    return text_(self.getAddr(self.curEndIpOffset))
def getAddr(self, offset, ip=0):
    '''
    Read the country and area strings of the record at ``offset``.

    The byte at ``offset + 4`` selects how the record is laid out:
    0x01 means a 3-byte offset follows that points to the country data
    (which may itself start with a 0x02 redirect), 0x02 means a 3-byte
    offset to the country string follows and the area data sits at
    ``offset + 8``, and any other value means the country string starts
    right at ``offset + 4``.

    The order of seeks and reads matters: ``getLong3`` and the
    argument-less ``getAreaAddr`` are called without a position, so they
    presumably continue from the current file position.

    :param offset: position of the record inside ``self.ipdb``.
    :param ip: unused by this method.
    :return: country text, a single space and area text, concatenated.
    '''
    self.ipdb.seek(offset + 4)
    mode = struct.unpack('B', self.ipdb.read(1))[0]
    if mode == 0x01:
        # Country data is stored elsewhere; follow the 3-byte offset.
        country_pos = self.getLong3()
        self.ipdb.seek(country_pos)
        redirect = struct.unpack('B', self.ipdb.read(1))[0]
        if redirect == 0x02:
            # Second level of indirection for the country string.
            country = self.getString(self.getLong3())
            # Reposition just past the flag byte and its 3-byte offset
            # so the area data is read from there.
            self.ipdb.seek(country_pos + 4)
        else:
            country = self.getString(country_pos)
        area = self.getAreaAddr()
    elif mode == 0x02:
        country = self.getString(self.getLong3())
        area = self.getAreaAddr(offset + 8)
    else:
        country = self.getString(offset + 4)
        area = self.getAreaAddr()
    return country + text_(" ") + area
def proxy_listPraser(self, response, parser):
    '''
    Extract proxies published as base64-encoded ``Proxy('...')`` entries.

    Each regex match has its ``Proxy('`` / ``')`` wrapper removed, is
    base64-decoded and is then split on ``:`` into ip and port.

    :param response: text searched with ``parser['pattern']``.
    :param parser: dict providing ``'pattern'`` (regex source).
    :return: list of proxy dicts with the keys ip, port (int), types,
        protocol, country, area and speed. Matches that cannot be decoded
        or processed, including ones with a non-integer port, are skipped.
    '''
    proxylist = []
    for match in re.compile(parser['pattern']).findall(response):
        try:
            decoded = base64.b64decode(
                match.replace("Proxy('", "").replace("')", ""))
            # On Python 3 b64decode returns bytes, and splitting bytes on
            # a str separator raises TypeError, which used to discard
            # every match. Decode to text first; on Python 2 the result is
            # already ``str`` and is left untouched.
            if not isinstance(decoded, str):
                decoded = decoded.decode('utf-8')
            parts = decoded.split(':')
            ip = parts[0]
            # Convert inside the try block: a malformed port must only
            # drop this match, not abort the whole page.
            port = int(parts[1])
            addr = self.ips.getIpAddr(self.ips.str2ip(ip))
            # An address naming a province, or one recognised by
            # AuthCountry, is labelled domestic; anything else foreign.
            if text_('省') in addr or self.AuthCountry(addr):
                country = text_('国内')
            else:
                country = text_('国外')
        except Exception:
            # Best effort: a match that cannot be processed is dropped.
            continue
        proxylist.append({
            'ip': ip,
            'port': port,
            # Type and protocol are not read from the page; both default to 0.
            'types': 0,
            'protocol': 0,
            'country': country,
            'area': addr,
            'speed': 100,
        })
    return proxylist