def __init__(self):
    """Store an ``IPAddresss`` built from ``QQWRY_PATH`` on ``self.ips``."""
    ip_lookup = IPAddresss(QQWRY_PATH)
    self.ips = ip_lookup
class Html_Parser(object):
    """Turn fetched proxy-list pages into lists of proxy dicts.

    A parsing rule is a dict with at least ``type`` and ``pattern``;
    ``position`` says where the ip and port sit inside every matched row.
    """

    def __init__(self):
        # Lookup object used below to resolve an ip to an address string.
        self.ips = IPAddresss(QQWRY_PATH)

    def parse(self, response, parser):
        """Dispatch ``response`` to the parser selected by ``parser['type']``.

        :param response: page content to parse
        :param parser: rule dict; ``type`` is ``'xpath'``, ``'regular'`` or
            ``'module'`` (``moduleName`` then names a method of this class)
        :return: list of proxy dicts, or ``None`` when the type or the module
            name is unknown
        """
        kind = parser['type']
        if kind == 'xpath':
            return self.XpathPraser(response, parser)
        if kind == 'regular':
            return self.RegularPraser(response, parser)
        if kind == 'module':
            handler = getattr(self, parser['moduleName'], None)
            # An unknown module name used to end in "NoneType is not callable".
            if handler is None:
                return None
            return handler(response, parser)
        return None

    def AuthCountry(self, addr):
        """Return True when ``addr`` contains any entry of ``CHINA_AREA``."""
        return any(text_(area) in addr for area in CHINA_AREA)

    def _make_proxy(self, ip, port):
        """Build one proxy dict; ``port`` is stored exactly as passed in."""
        addr = self.ips.getIpAddr(self.ips.str2ip(ip))
        if text_('省') in addr or self.AuthCountry(addr):
            country = text_('home')
        else:
            country = text_('abroad')
        return {
            'ip': ip,
            'port': port,
            # The type reported by sites is unreliable, so a default is
            # stored (the original note says it is checked later).
            'types': text_('Transparent'),
            'protocol': text_("http"),
            'country': country,
            'area': addr,
            'speed': 100,
        }

    def XpathPraser(self, response, parser):
        """Parse ``response`` with the XPath rule in ``parser``.

        :param response: HTML text
        :param parser: rule with ``pattern`` (row XPath) and ``position``
            (``ip`` / ``port`` XPaths relative to a row)
        :return: list of proxy dicts with an integer ``port``
        """
        proxylist = []
        root = etree.HTML(response)
        for row in root.xpath(parser['pattern']):
            try:
                position = parser['position']
                ip = row.xpath(position['ip'])[0].text
                # Converted inside the try so one malformed port skips the
                # row instead of aborting the whole page.
                port = int(row.xpath(position['port'])[0].text)
                record = self._make_proxy(ip, port)
            except Exception:
                # Best effort: rows that cannot be read are skipped.
                continue
            proxylist.append(record)
        return proxylist

    def RegularPraser(self, response, parser):
        """Parse ``response`` with the regular expression in ``parser``.

        :param response: page text
        :param parser: rule with ``pattern`` (regex) and ``position``
            (indexes of ip and port inside each match)
        :return: list of proxy dicts; ``port`` stays the matched text because
            ``CnproxyPraser`` decodes it afterwards
        """
        proxylist = []
        for match in re.compile(parser['pattern']).findall(response):
            try:
                position = parser['position']
                record = self._make_proxy(
                    match[position['ip']], match[position['port']])
            except Exception:
                continue
            proxylist.append(record)
        return proxylist

    def CnproxyPraser(self, response, parser):
        """Run ``RegularPraser`` and decode the letter-obfuscated ports.

        Every port character other than ``+`` is mapped to a digit; an
        unmapped character raises ``KeyError``.
        """
        proxylist = self.RegularPraser(response, parser)
        chardict = {
            'v': '3', 'm': '4', 'a': '2', 'l': '9', 'q': '0',
            'b': '5', 'i': '7', 'w': '6', 'r': '8', 'c': '1',
        }
        for proxy in proxylist:
            digits = ''.join(
                chardict[char] for char in proxy['port'] if char != '+')
            proxy['port'] = int(digits)
        return proxylist

    def proxy_listPraser(self, response, parser):
        """Parse pages whose rows are ``Proxy('<base64 of ip:port>')``.

        :return: list of proxy dicts with an integer ``port``
        """
        proxylist = []
        for match in re.compile(parser['pattern']).findall(response):
            try:
                payload = match.replace("Proxy('", "").replace("')", "")
                decoded = base64.b64decode(payload)
                # Python 3 returns bytes here; splitting bytes on a str
                # separator raised and silently dropped every row.
                if not isinstance(decoded, str):
                    decoded = decoded.decode('utf-8')
                pieces = decoded.split(':')
                record = self._make_proxy(pieces[0], int(pieces[1]))
            except Exception:
                continue
            proxylist.append(record)
        return proxylist
class Html_Parser(object):
    """Turn fetched proxy-list pages into lists of proxy dicts."""

    def __init__(self):
        # Lookup object used below to resolve an ip to an address string.
        self.ips = IPAddresss(QQWRY_PATH)

    def parse(self, response, parser):
        """Dispatch ``response`` to the parser selected by ``parser['type']``.

        :param response: page content to parse
        :param parser: rule dict; ``type`` is ``'xpath'``, ``'regular'`` or
            ``'module'`` (``moduleName`` then names a method of this class)
        :return: list of proxy dicts, or ``None`` when the type or the module
            name is unknown
        """
        kind = parser['type']
        if kind == 'xpath':
            return self.XpathPraser(response, parser)
        if kind == 'regular':
            return self.RegularPraser(response, parser)
        if kind == 'module':
            handler = getattr(self, parser['moduleName'], None)
            # An unknown module name used to end in "NoneType is not callable".
            if handler is None:
                return None
            return handler(response, parser)
        return None

    def AuthCountry(self, addr):
        """Return True when ``addr`` contains any entry of ``CHINA_AREA``."""
        return any(area in addr for area in CHINA_AREA)

    def XpathPraser(self, response, parser):
        """Parse ``response`` with the XPath rule in ``parser``.

        Record fields: ip, port, type (0 high-anonymity, 1 transparent),
        protocol (0 http, 1 https), country, area, speed.

        :param response: HTML text
        :param parser: rule with ``pattern`` (row XPath) and ``postion``
            (``ip`` / ``port`` / ``type`` / ``protocol`` XPaths per row)
        :return: list of proxy dicts
        """
        proxylist = []
        root = etree.HTML(response)
        for row in root.xpath(parser['pattern']):
            try:
                position = parser['postion']
                ip = row.xpath(position['ip'])[0].text
                # Converted inside the try so one malformed port skips the
                # row instead of aborting the whole page.
                port = int(row.xpath(position['port'])[0].text)
                anonymity = row.xpath(position['type'])[0].text
                proxy_type = 0 if u'高匿' in anonymity else 1
                protocol = 0
                # An empty protocol XPath means the page has no such column.
                if len(position['protocol']) > 0:
                    scheme = row.xpath(position['protocol'])[0].text
                    if 'https' in scheme.lower():
                        protocol = 1
                addr = self.ips.getIpAddr(self.ips.str2ip(ip))
                if u'省' in addr or self.AuthCountry(addr):
                    country, area = u'中国', addr
                else:
                    country, area = addr, ''
            except Exception:
                # Best effort: rows that cannot be read are skipped.
                continue
            proxy = {
                'ip': ip,
                'port': port,
                'type': proxy_type,
                'protocol': protocol,
                'country': country,
                'area': area,
                'speed': 100,
            }
            logger.info("Fetch proxy %s", str(proxy))
            proxylist.append(proxy)
        return proxylist
class Html_Parser(object):
    """Turn fetched proxy-list pages into lists of proxy dicts."""

    def __init__(self):
        # Lookup object used below to resolve an ip to an address string.
        self.ips = IPAddresss(QQWRY_PATH)

    def parse(self, response, parser):
        """Parse ``response`` with the XPath rule in ``parser``.

        Record fields: ip, port, type (0 high-anonymity, 1 transparent),
        protocol (0 http, 1 https), country, area, speed, createtime,
        updatetime.

        :param response: HTML text
        :param parser: rule dict; only ``type == 'xpath'`` is handled, with
            ``pattern`` (row XPath) and ``postion`` (per-row field XPaths)
        :return: list of proxy dicts, or ``None`` for any other rule type
        """
        if parser['type'] != 'xpath':
            return None
        proxylist = []
        root = etree.HTML(response)
        for row in root.xpath(parser['pattern']):
            try:
                position = parser['postion']
                ip = row.xpath(position['ip'])[0].text
                port = row.xpath(position['port'])[0].text
                anonymity = row.xpath(position['type'])[0].text
                proxy_type = 0 if u'高匿' in anonymity else 1
                protocol = 0
                # An empty protocol XPath means the page has no such column.
                if len(position['protocol']) > 0:
                    scheme = row.xpath(position['protocol'])[0].text
                    if 'https' in scheme.lower():
                        protocol = 1
                addr = self.ips.getIpAddr(self.ips.str2ip(ip))
                if u'省' in addr or self.AuthCountry(addr):
                    country, area = u'中国', addr
                else:
                    country, area = addr, ''
                # Read the clock once so both timestamps of a record agree.
                now = datetime.datetime.now()
                proxylist.append({
                    'ip': ip,
                    'port': int(port),
                    'type': proxy_type,
                    'protocol': protocol,
                    'country': country,
                    'area': area,
                    'speed': 100,
                    'createtime': now,
                    'updatetime': now,
                })
            except Exception as err:
                # Best effort: report the row's error and keep going.
                print(err)
        return proxylist

    def AuthCountry(self, addr):
        """Return True when ``addr`` contains any entry of ``CHINA_AREA``."""
        return any(area in addr for area in CHINA_AREA)
class Html_Parser(object):
    """Turn fetched proxy-list pages into lists of proxy dicts.

    A parsing rule is a dict with at least ``type`` and ``pattern``;
    ``position`` says where the ip and port sit inside every matched row.
    """

    def __init__(self):
        # Lookup object used below to resolve an ip to an address string.
        self.ips = IPAddresss(QQWRY_PATH)

    def parse(self, response, parser):
        """Dispatch ``response`` to the parser selected by ``parser["type"]``.

        :param response: page content to parse
        :param parser: rule dict; ``type`` is ``"xpath"``, ``"regular"`` or
            ``"module"`` (``moduleName`` then names a method of this class)
        :return: list of proxy dicts, or ``None`` when the type or the module
            name is unknown
        """
        kind = parser["type"]
        if kind == "xpath":
            return self.XpathPraser(response, parser)
        if kind == "regular":
            return self.RegularPraser(response, parser)
        if kind == "module":
            handler = getattr(self, parser["moduleName"], None)
            # An unknown module name used to end in "NoneType is not callable".
            if handler is None:
                return None
            return handler(response, parser)
        return None

    def AuthCountry(self, addr):
        """Return True when ``addr`` contains any entry of ``CHINA_AREA``."""
        return any(text_(area) in addr for area in CHINA_AREA)

    def _locate(self, ip):
        """Return ``(country, area)`` for ``ip``; area is the looked-up address."""
        addr = self.ips.getIpAddr(self.ips.str2ip(ip))
        if text_("省") in addr or self.AuthCountry(addr):
            return text_("国内"), addr
        return text_("国外"), addr

    @staticmethod
    def _record(ip, port, country, area):
        """Build one proxy dict; ``port`` is stored exactly as passed in."""
        return {
            "ip": ip,
            "port": port,
            # The type reported by sites is unreliable, so the default 0 is
            # stored (the original note says it is checked later).
            "types": 0,
            "protocol": 0,
            "country": country,
            "area": area,
            "speed": 100,
        }

    def XpathPraser(self, response, parser):
        """Parse ``response`` with the XPath rule in ``parser``.

        :param response: HTML text
        :param parser: rule with ``pattern`` (row XPath) and ``position``
            (``ip`` / ``port`` XPaths relative to a row)
        :return: list of proxy dicts with an integer ``port``
        """
        proxylist = []
        root = etree.HTML(response)
        for row in root.xpath(parser["pattern"]):
            try:
                position = parser["position"]
                ip = row.xpath(position["ip"])[0].text
                # Converted inside the try so one malformed port skips the
                # row instead of aborting the whole page.
                port = int(row.xpath(position["port"])[0].text)
                country, area = self._locate(ip)
            except Exception:
                # Best effort: rows that cannot be read are skipped.
                continue
            proxylist.append(self._record(ip, port, country, area))
        return proxylist

    def RegularPraser(self, response, parser):
        """Parse ``response`` with the regular expression in ``parser``.

        :param response: page text
        :param parser: rule with ``pattern`` (regex) and ``position``
            (indexes of ip and port inside each match)
        :return: list of proxy dicts; ``port`` stays the matched text because
            ``CnproxyPraser`` decodes it afterwards
        """
        proxylist = []
        for match in re.compile(parser["pattern"]).findall(response):
            try:
                position = parser["position"]
                ip = match[position["ip"]]
                port = match[position["port"]]
                country, area = self._locate(ip)
            except Exception:
                continue
            proxylist.append(self._record(ip, port, country, area))
        return proxylist

    def CnproxyPraser(self, response, parser):
        """Run ``RegularPraser`` and decode the letter-obfuscated ports.

        Every port character other than ``+`` is mapped to a digit; an
        unmapped character raises ``KeyError``.
        """
        proxylist = self.RegularPraser(response, parser)
        chardict = {
            "v": "3", "m": "4", "a": "2", "l": "9", "q": "0",
            "b": "5", "i": "7", "w": "6", "r": "8", "c": "1",
        }
        for proxy in proxylist:
            digits = "".join(
                chardict[char] for char in proxy["port"] if char != "+"
            )
            proxy["port"] = int(digits)
        return proxylist

    def proxy_listPraser(self, response, parser):
        """Parse pages whose rows are ``Proxy('<base64 of ip:port>')``.

        :return: list of proxy dicts with an integer ``port``
        """
        proxylist = []
        for match in re.compile(parser["pattern"]).findall(response):
            try:
                payload = match.replace("Proxy('", "").replace("')", "")
                decoded = base64.b64decode(payload)
                # Python 3 returns bytes here; splitting bytes on a str
                # separator raised and silently dropped every row.
                if not isinstance(decoded, str):
                    decoded = decoded.decode("utf-8")
                pieces = decoded.split(":")
                ip, port = pieces[0], int(pieces[1])
                country, area = self._locate(ip)
            except Exception:
                continue
            proxylist.append(self._record(ip, port, country, area))
        return proxylist
class Html_Parser(object):
    """Turn fetched proxy-list pages into lists of proxy dicts.

    A parsing rule is a dict with at least ``type`` and ``pattern``;
    ``position`` says where the ip and port sit inside every matched row.
    """

    def __init__(self):
        # Lookup object used below to resolve an ip to an address string.
        self.ips = IPAddresss(QQWRY_PATH)

    def parse(self, response, parser):
        """Dispatch ``response`` to the parser selected by ``parser['type']``.

        :param response: page content to parse
        :param parser: rule dict; ``type`` is ``'xpath'``, ``'regular'`` or
            ``'module'`` (``moduleName`` then names a method of this class)
        :return: list of proxy dicts, or ``None`` when the type or the module
            name is unknown
        """
        kind = parser['type']
        if kind == 'xpath':
            return self.XpathPraser(response, parser)
        if kind == 'regular':
            return self.RegularPraser(response, parser)
        if kind == 'module':
            handler = getattr(self, parser['moduleName'], None)
            # An unknown module name used to end in "NoneType is not callable".
            if handler is None:
                return None
            return handler(response, parser)
        return None

    def AuthCountry(self, addr):
        """Return True when ``addr`` contains any entry of ``CHINA_AREA``."""
        return any(text_(area) in addr for area in CHINA_AREA)

    def _locate(self, ip):
        """Return ``(country, area)`` for ``ip``; area is the looked-up address."""
        addr = self.ips.getIpAddr(self.ips.str2ip(ip))
        if text_('省') in addr or self.AuthCountry(addr):
            return text_('国内'), addr
        return text_('国外'), addr

    @staticmethod
    def _record(ip, port, country, area):
        """Build one proxy dict; ``port`` is stored exactly as passed in."""
        # 'types' and 'protocol' are fixed defaults: the type reported by
        # sites is unreliable (the original note says it is checked later).
        return {'ip': ip, 'port': port, 'types': 0, 'protocol': 0,
                'country': country, 'area': area, 'speed': 100}

    def XpathPraser(self, response, parser):
        """Parse ``response`` with the XPath rule in ``parser``.

        :param response: HTML text
        :param parser: rule with ``pattern`` (row XPath) and ``position``
            (``ip`` / ``port`` XPaths relative to a row)
        :return: list of proxy dicts with an integer ``port``
        """
        proxylist = []
        root = etree.HTML(response)
        for row in root.xpath(parser['pattern']):
            try:
                position = parser['position']
                ip = row.xpath(position['ip'])[0].text
                # Converted inside the try so one malformed port skips the
                # row instead of aborting the whole page.
                port = int(row.xpath(position['port'])[0].text)
                country, area = self._locate(ip)
            except Exception:
                # Best effort: rows that cannot be read are skipped.
                continue
            proxylist.append(self._record(ip, port, country, area))
        return proxylist

    def RegularPraser(self, response, parser):
        """Parse ``response`` with the regular expression in ``parser``.

        :param response: page text
        :param parser: rule with ``pattern`` (regex) and ``position``
            (indexes of ip and port inside each match)
        :return: list of proxy dicts; ``port`` stays the matched text because
            ``CnproxyPraser`` decodes it afterwards
        """
        proxylist = []
        for match in re.compile(parser['pattern']).findall(response):
            try:
                position = parser['position']
                ip = match[position['ip']]
                port = match[position['port']]
                country, area = self._locate(ip)
            except Exception:
                continue
            proxylist.append(self._record(ip, port, country, area))
        return proxylist

    def CnproxyPraser(self, response, parser):
        """Run ``RegularPraser`` and decode the letter-obfuscated ports.

        Every port character other than ``+`` is mapped to a digit; an
        unmapped character raises ``KeyError``.
        """
        proxylist = self.RegularPraser(response, parser)
        chardict = {'v': '3', 'm': '4', 'a': '2', 'l': '9', 'q': '0',
                    'b': '5', 'i': '7', 'w': '6', 'r': '8', 'c': '1'}
        for proxy in proxylist:
            digits = ''.join(
                chardict[char] for char in proxy['port'] if char != '+')
            proxy['port'] = int(digits)
        return proxylist

    def proxy_listPraser(self, response, parser):
        """Parse pages whose rows are ``Proxy('<base64 of ip:port>')``.

        :return: list of proxy dicts with an integer ``port``
        """
        proxylist = []
        for match in re.compile(parser['pattern']).findall(response):
            try:
                payload = match.replace("Proxy('", "").replace("')", "")
                decoded = base64.b64decode(payload)
                # Python 3 returns bytes here; splitting bytes on a str
                # separator raised and silently dropped every row.
                if not isinstance(decoded, str):
                    decoded = decoded.decode('utf-8')
                pieces = decoded.split(':')
                ip, port = pieces[0], int(pieces[1])
                country, area = self._locate(ip)
            except Exception:
                continue
            proxylist.append(self._record(ip, port, country, area))
        return proxylist
class Html_Parser(object):
    """Turn fetched proxy-list pages into lists of proxy dicts.

    Record fields: ip, port, type (0 high-anonymity, 1 transparent),
    protocol (0 http, 1 https), country, area, speed.
    """

    def __init__(self):
        # Lookup object used below to resolve an ip to an address string.
        self.ips = IPAddresss(QQWRY_PATH)

    def parse(self, response, parser):
        """Dispatch ``response`` to the parser selected by ``parser['type']``.

        :param response: page content to parse
        :param parser: rule dict; ``type`` is ``'xpath'``, ``'regular'`` or
            ``'module'`` (``moduleName`` then names a method of this class)
        :return: list of proxy dicts, or ``None`` when the type or the module
            name is unknown
        """
        kind = parser['type']
        if kind == 'xpath':
            return self.XpathPraser(response, parser)
        if kind == 'regular':
            return self.RegularPraser(response, parser)
        if kind == 'module':
            handler = getattr(self, parser['moduleName'], None)
            # An unknown module name used to end in "NoneType is not callable".
            if handler is None:
                return None
            return handler(response, parser)
        return None

    def AuthCountry(self, addr):
        """Return True when ``addr`` contains any entry of ``CHINA_AREA``."""
        return any(area in addr for area in CHINA_AREA)

    def _country_area(self, ip):
        """Return ``(country, area)`` for ``ip`` from the address lookup."""
        addr = self.ips.getIpAddr(self.ips.str2ip(ip))
        if u'省' in addr or self.AuthCountry(addr):
            return u'中国', addr
        # Outside the known areas the whole address is used as the country.
        return addr, ''

    def XpathPraser(self, response, parser):
        """Parse ``response`` with the XPath rule in ``parser``.

        Any row that cannot be read raises; nothing is skipped here.

        :param response: HTML text
        :param parser: rule with ``pattern`` (row XPath) and ``postion``
            (``ip`` / ``port`` / ``type`` / ``protocol`` XPaths per row)
        :return: list of proxy dicts with an integer ``port``
        """
        proxylist = []
        root = etree.HTML(response)
        for row in root.xpath(parser['pattern']):
            position = parser['postion']
            ip = row.xpath(position['ip'])[0].text
            port = row.xpath(position['port'])[0].text
            anonymity = row.xpath(position['type'])[0].text
            proxy_type = 0 if u'高匿' in anonymity else 1
            protocol = 0
            # An empty protocol XPath means the page has no such column.
            if len(position['protocol']) > 0:
                scheme = row.xpath(position['protocol'])[0].text
                if 'https' in scheme.lower():
                    protocol = 1
            country, area = self._country_area(ip)
            proxy = {
                'ip': ip,
                'port': int(port),
                'type': proxy_type,
                'protocol': protocol,
                'country': country,
                'area': area,
                'speed': 100,
            }
            logger.info("Fetch proxy %s", str(proxy))
            proxylist.append(proxy)
        return proxylist

    def RegularPraser(self, response, parser):
        """Parse ``response`` with the regular expression in ``parser``.

        :param response: page text
        :param parser: rule with ``pattern`` (regex) and ``postion``
            (indexes of ip, port and protocol inside each match; a protocol
            index that is not positive means "no protocol group")
        :return: list of proxy dicts; ``port`` stays the matched text because
            ``CnproxyPraser`` decodes it afterwards
        """
        proxylist = []
        for match in re.compile(parser['pattern']).findall(response):
            logging.info(str(match))
            position = parser['postion']
            ip = match[position['ip']]
            port = match[position['port']]
            protocol = 0
            if position['protocol'] > 0:
                if 'https' in match[position['protocol']].lower():
                    protocol = 1
            country, area = self._country_area(ip)
            proxy = {
                'ip': ip,
                'port': port,
                # The type reported by sites is unreliable, so the default 0
                # is stored (the original note says it is checked later).
                'type': 0,
                'protocol': protocol,
                'country': country,
                'area': area,
                'speed': 100,
            }
            logger.info("Fetch proxy %s", str(proxy))
            proxylist.append(proxy)
        return proxylist

    def CnproxyPraser(self, response, parser):
        """Run ``RegularPraser`` and decode the letter-obfuscated ports.

        Every port character other than ``+`` is mapped to a digit; an
        unmapped character raises ``KeyError``.
        """
        proxylist = self.RegularPraser(response, parser)
        chardict = {
            'v': '3', 'm': '4', 'a': '2', 'l': '9', 'q': '0',
            'b': '5', 'i': '7', 'w': '6', 'r': '8', 'c': '1',
        }
        for proxy in proxylist:
            digits = ''.join(
                chardict[char] for char in proxy['port'] if char != '+')
            proxy['port'] = int(digits)
        return proxylist
class Html_Parser(object):
    """HTML parser that turns proxy-list pages into lists of proxy dicts."""

    def __init__(self):
        # Lookup object used below to resolve an ip to an address string.
        self.ips = IPAddresss(QQWRY_PATH)

    def parse(self, response, parser):
        """Dispatch ``response`` to the parser selected by ``parser['type']``.

        :param response: page content to parse
        :param parser: rule dict; ``type`` is ``'xpath'``, ``'regular'`` or
            ``'module'`` (``moduleName`` then names a method of this class)
        :return: list of proxy dicts, or ``None`` when the type or the module
            name is unknown
        """
        kind = parser['type']
        if kind == 'xpath':
            return self.XpathPraser(response, parser)
        if kind == 'regular':
            return self.RegularPraser(response, parser)
        if kind == 'module':
            handler = getattr(self, parser['moduleName'], None)
            # An unknown module name used to end in "NoneType is not callable".
            if handler is None:
                return None
            return handler(response, parser)
        return None

    def AuthCountry(self, addr):
        """Return True when ``addr`` contains any entry of ``CHINA_AREA``."""
        return any(text_(area) in addr for area in CHINA_AREA)

    def addrcut(self, addr):
        """Split a looked-up address into ``(country, short_addr)``.

        Domestic addresses look like province + city + service or
        city + service; foreign ones like region + service (per the original
        note). The last two characters are treated as the service part.
        """
        province, city = text_('省'), text_('市')
        if province in addr or self.AuthCountry(addr):
            country = text_('国内')
            addr = addr.split(city)[0]
            # text_ keeps the separators the same string type as ``addr``,
            # matching every other literal in this class.
            if province in addr:
                addr = addr.split(province)[1]
            else:
                addr = addr[:2]
        else:
            country = text_('国外')
            addr = addr[:-2]
        return country, addr

    def _make_proxy(self, ip, port):
        """Build one proxy dict; ``port`` is stored exactly as passed in."""
        full_addr = self.ips.getIpAddr(self.ips.str2ip(ip))
        country, short_addr = self.addrcut(full_addr)
        return {
            'ip': ip,
            'port': port,
            't_way': 0,
            'protocol': 0,
            'country': country,
            't_service': full_addr[-2:],
            'addr': short_addr,
            'attr': 0,
            'score': 0,
        }

    def _regex_proxies(self, response, parser, convert_port):
        """Collect regex matches, turning each port with ``convert_port``."""
        proxylist = []
        for match in re.compile(parser['pattern']).findall(response):
            try:
                position = parser['position']
                port = convert_port(match[position['port']])
                record = self._make_proxy(match[position['ip']], port)
            except Exception:
                # Best effort: rows that cannot be read are skipped.
                continue
            proxylist.append(record)
        return proxylist

    @staticmethod
    def _decode_cnproxy_port(port):
        """Map a letter-obfuscated port such as ``'v+m+a'`` to an int."""
        chardict = {
            'v': '3', 'm': '4', 'a': '2', 'l': '9', 'q': '0',
            'b': '5', 'i': '7', 'w': '6', 'r': '8', 'c': '1',
        }
        return int(''.join(chardict[char] for char in port if char != '+'))

    def XpathPraser(self, response, parser):
        """Parse ``response`` with the XPath rule in ``parser``.

        :param response: HTML text
        :param parser: rule with ``pattern`` (row XPath) and ``position``
            (``ip`` / ``port`` XPaths relative to a row)
        :return: list of proxy dicts with an integer ``port``
        """
        proxylist = []
        root = etree.HTML(response)
        for row in root.xpath(parser['pattern']):
            try:
                position = parser['position']
                ip = row.xpath(position['ip'])[0].text
                # Converted inside the try so one malformed port skips the
                # row instead of aborting the whole page.
                port = int(row.xpath(position['port'])[0].text)
                record = self._make_proxy(ip, port)
            except Exception:
                continue
            proxylist.append(record)
        return proxylist

    def RegularPraser(self, response, parser):
        """Parse ``response`` with the regular expression in ``parser``.

        :return: list of proxy dicts with an integer ``port``; rows whose
            port is not numeric are skipped
        """
        return self._regex_proxies(response, parser, int)

    def CnproxyPraser(self, response, parser):
        """Parse regex rows whose ports are letter-obfuscated.

        The port is decoded from the raw match rather than from
        ``RegularPraser`` output: that method converts ports with ``int``,
        which cannot read the obfuscated text.
        """
        return self._regex_proxies(
            response, parser, self._decode_cnproxy_port)

    def proxy_listPraser(self, response, parser):
        """Parse pages whose rows are ``Proxy('<base64 of ip:port>')``.

        :return: list of proxy dicts with an integer ``port``
        """
        proxylist = []
        for match in re.compile(parser['pattern']).findall(response):
            try:
                payload = match.replace("Proxy('", "").replace("')", "")
                decoded = base64.b64decode(payload)
                # Python 3 returns bytes here; splitting bytes on a str
                # separator raised and silently dropped every row.
                if not isinstance(decoded, str):
                    decoded = decoded.decode('utf-8')
                pieces = decoded.split(':')
                record = self._make_proxy(pieces[0], int(pieces[1]))
            except Exception:
                continue
            proxylist.append(record)
        return proxylist
class Html_Parser(object):
    """Turn fetched proxy-list pages into lists of proxy dicts.

    Each record carries ``ip``, ``port``, ``country`` and ``area``.
    """

    def __init__(self):
        # Lookup object used below to resolve an ip to an address string.
        self.ips = IPAddresss(QQWRY_PATH)

    def parse(self, response, parser):
        """Dispatch ``response`` to the parser selected by ``parser['type']``.

        :param response: page content to parse
        :param parser: rule dict; ``type`` is ``'xpath'``, ``'regular'`` or
            ``'module'`` (``moduleName`` then names a method of this class)
        :return: list of proxy dicts, or ``None`` when the type or the module
            name is unknown
        """
        kind = parser['type']
        if kind == 'xpath':
            return self.XpathPraser(response, parser)
        if kind == 'regular':
            return self.RegularPraser(response, parser)
        if kind == 'module':
            handler = getattr(self, parser['moduleName'], None)
            # An unknown module name used to end in "NoneType is not callable".
            if handler is None:
                return None
            return handler(response, parser)
        return None

    @staticmethod
    def auth_country(addr):
        """Return True when ``addr`` contains any entry of ``CHINA_AREA``."""
        return any(text_(area) in addr for area in CHINA_AREA)

    def parse_ip_to_addr(self, ip):
        """Return ``(country, area)`` for ``ip``; area is the looked-up address."""
        addr = self.ips.getIpAddr(self.ips.str2ip(ip))
        # text_ keeps the needle the same string type as the other literals
        # compared against ``addr`` in this class.
        if text_('省') in addr or self.auth_country(addr):
            return text_('国内'), addr
        return text_('国外'), addr

    def XpathPraser(self, response, parser):
        """Parse ``response`` with the XPath rule in ``parser``.

        :param response: HTML text
        :param parser: rule with ``pattern`` (row XPath) and ``position``
            (``ip`` / ``port`` XPaths relative to a row)
        :return: list of proxy dicts with an integer ``port``
        """
        proxy_list = []
        root = etree.HTML(response)
        for row in root.xpath(parser['pattern']):
            try:
                position = parser['position']
                ip = row.xpath(position['ip'])[0].text
                # Converted inside the try so one malformed port skips the
                # row instead of aborting the whole page.
                port = int(row.xpath(position['port'])[0].text)
                country, area = self.parse_ip_to_addr(ip)
            except Exception:
                # Best effort: rows that cannot be read are skipped.
                continue
            proxy_list.append(
                {'ip': ip, 'port': port, 'country': country, 'area': area})
        return proxy_list

    def RegularPraser(self, response, parser):
        """Parse ``response`` with the regular expression in ``parser``.

        :param response: page text
        :param parser: rule with ``pattern`` (regex) and ``position``
            (indexes of ip and port inside each match)
        :return: list of proxy dicts; ``port`` stays the matched text because
            ``CnproxyPraser`` decodes it afterwards
        """
        proxy_list = []
        for match in re.compile(parser['pattern']).findall(response):
            try:
                position = parser['position']
                ip = match[position['ip']]
                port = match[position['port']]
                country, area = self.parse_ip_to_addr(ip)
            except Exception:
                continue
            proxy_list.append(
                {'ip': ip, 'port': port, 'country': country, 'area': area})
        return proxy_list

    def CnproxyPraser(self, response, parser):
        """Run ``RegularPraser`` and decode the letter-obfuscated ports.

        Every port character other than ``+`` is mapped to a digit; an
        unmapped character raises ``KeyError``.

        :param response: page text
        :param parser: rule forwarded to ``RegularPraser``
        :return: the same proxy dicts with integer ports
        """
        proxy_list = self.RegularPraser(response, parser)
        char_dict = {
            'v': '3', 'm': '4', 'a': '2', 'l': '9', 'q': '0',
            'b': '5', 'i': '7', 'w': '6', 'r': '8', 'c': '1',
        }
        for proxy in proxy_list:
            digits = ''.join(
                char_dict[char] for char in proxy['port'] if char != '+')
            proxy['port'] = int(digits)
        return proxy_list

    def proxy_listPraser(self, response, parser):
        """Parse pages whose rows are ``Proxy('<base64 of ip:port>')``.

        :return: list of proxy dicts with an integer ``port``
        """
        proxy_list = []
        for match in re.compile(parser['pattern']).findall(response):
            try:
                payload = match.replace("Proxy('", "").replace("')", "")
                decoded = base64.b64decode(payload)
                # Python 3 returns bytes here; splitting bytes on a str
                # separator raised and silently dropped every row.
                if not isinstance(decoded, str):
                    decoded = decoded.decode('utf-8')
                pieces = decoded.split(':')
                ip, port = pieces[0], int(pieces[1])
                country, area = self.parse_ip_to_addr(ip)
            except Exception:
                continue
            proxy_list.append(
                {'ip': ip, 'port': port, 'country': country, 'area': area})
        return proxy_list
class Html_Parser(object):
    """Turn fetched proxy-list pages into lists of proxy dicts."""

    def __init__(self):
        # Lookup object used below to resolve an ip to an address string.
        self.ips = IPAddresss(QQWRY_PATH)

    def parse(self, response, parser):
        """Parse ``response`` with the XPath rule in ``parser``.

        Record fields: ip, port, type (0 high-anonymity, 1 transparent),
        protocol (0 http, 1 https), country, area, speed. Any row that cannot
        be read raises; nothing is skipped here.

        :param response: HTML text
        :param parser: rule dict; only ``type == "xpath"`` is handled, with
            ``pattern`` (row XPath) and ``postion`` (per-row field XPaths)
        :return: list of proxy dicts, or ``None`` for any other rule type
        """
        if parser["type"] != "xpath":
            return None
        proxylist = []
        root = etree.HTML(response)
        for row in root.xpath(parser["pattern"]):
            position = parser["postion"]
            ip = row.xpath(position["ip"])[0].text
            port = row.xpath(position["port"])[0].text
            anonymity = row.xpath(position["type"])[0].text
            proxy_type = 0 if u"高匿" in anonymity else 1
            protocol = 0
            # An empty protocol XPath means the page has no such column.
            if len(position["protocol"]) > 0:
                scheme = row.xpath(position["protocol"])[0].text
                if "https" in scheme.lower():
                    protocol = 1
            addr = self.ips.getIpAddr(self.ips.str2ip(ip))
            if u"省" in addr or self.AuthCountry(addr):
                country, area = u"中国", addr
            else:
                # Outside the known areas the whole address is the country.
                country, area = addr, ""
            proxy = {
                "ip": ip,
                "port": int(port),
                "type": proxy_type,
                "protocol": protocol,
                "country": country,
                "area": area,
                "speed": 100,
            }
            # Function-call form parses on both Python 2 and Python 3.
            print(proxy)
            proxylist.append(proxy)
        return proxylist

    def AuthCountry(self, addr):
        """Return True when ``addr`` contains any entry of ``CHINA_AREA``."""
        return any(area in addr for area in CHINA_AREA)
class Html_Parser(object):
    """Turn fetched proxy-list pages into lists of proxy dicts.

    A parsing rule is a dict with at least ``type`` and ``pattern``;
    ``position`` says where the ip and port sit inside every matched row.
    """

    def __init__(self):
        # Lookup object used below to resolve an ip to an address string.
        self.ips = IPAddresss(QQWRY_PATH)

    def parse(self, response, parser):
        """Dispatch ``response`` to the parser selected by ``parser['type']``.

        Returns a list of proxy dicts, or ``None`` when the type or the
        ``moduleName`` of a ``'module'`` rule is unknown.
        """
        kind = parser['type']
        if kind == 'xpath':
            return self.XpathPraser(response, parser)
        if kind == 'regular':
            return self.RegularPraser(response, parser)
        if kind == 'module':
            handler = getattr(self, parser['moduleName'], None)
            # An unknown module name used to end in "NoneType is not callable".
            if handler is None:
                return None
            return handler(response, parser)
        return None

    def AuthCountry(self, addr):
        """Return True when ``addr`` contains any entry of ``CHINA_AREA``."""
        return any(text_(area) in addr for area in CHINA_AREA)

    def _locate(self, ip):
        """Return ``(country, area)`` for ``ip``; area is the looked-up address."""
        addr = self.ips.getIpAddr(self.ips.str2ip(ip))
        if text_('省') in addr or self.AuthCountry(addr):
            return text_('国内'), addr
        return text_('国外'), addr

    @staticmethod
    def _record(ip, port, country, area):
        """Build one proxy dict; ``port`` is stored exactly as passed in."""
        return {
            'ip': ip,
            'port': port,
            'types': 0,
            'protocol': 0,
            'country': country,
            'area': area,
            'speed': 100,
        }

    def XpathPraser(self, response, parser):
        """Parse ``response`` with the XPath rule in ``parser``.

        Returns proxy dicts with an integer ``port``; unreadable rows are
        skipped.
        """
        proxylist = []
        root = etree.HTML(response)
        for row in root.xpath(parser['pattern']):
            try:
                position = parser['position']
                ip = row.xpath(position['ip'])[0].text
                # Converted inside the try so one malformed port skips the
                # row instead of aborting the whole page.
                port = int(row.xpath(position['port'])[0].text)
                country, area = self._locate(ip)
                # Same text the original printed: area is the raw address.
                print(area)
            except Exception:
                continue
            proxylist.append(self._record(ip, port, country, area))
        return proxylist

    def RegularPraser(self, response, parser):
        """Parse ``response`` with the regular expression in ``parser``.

        ``port`` stays the matched text because ``CnproxyPraser`` decodes it
        afterwards.
        """
        proxylist = []
        for match in re.compile(parser['pattern']).findall(response):
            try:
                position = parser['position']
                ip = match[position['ip']]
                port = match[position['port']]
                country, area = self._locate(ip)
            except Exception:
                continue
            proxylist.append(self._record(ip, port, country, area))
        return proxylist

    def CnproxyPraser(self, response, parser):
        """Run ``RegularPraser`` and decode the letter-obfuscated ports.

        Every port character other than ``+`` is mapped to a digit; an
        unmapped character raises ``KeyError``.
        """
        proxylist = self.RegularPraser(response, parser)
        chardict = {
            'v': '3', 'm': '4', 'a': '2', 'l': '9', 'q': '0',
            'b': '5', 'i': '7', 'w': '6', 'r': '8', 'c': '1',
        }
        for proxy in proxylist:
            digits = ''.join(
                chardict[char] for char in proxy['port'] if char != '+')
            proxy['port'] = int(digits)
        return proxylist

    def proxy_listPraser(self, response, parser):
        """Parse pages whose rows are ``Proxy('<base64 of ip:port>')``."""
        proxylist = []
        for match in re.compile(parser['pattern']).findall(response):
            try:
                payload = match.replace("Proxy('", "").replace("')", "")
                decoded = base64.b64decode(payload)
                # Python 3 returns bytes here; splitting bytes on a str
                # separator raised and silently dropped every row.
                if not isinstance(decoded, str):
                    decoded = decoded.decode('utf-8')
                pieces = decoded.split(':')
                ip, port = pieces[0], int(pieces[1])
                country, area = self._locate(ip)
            except Exception:
                continue
            proxylist.append(self._record(ip, port, country, area))
        return proxylist