def max(self, proxy: Proxy) -> int:
    """
    Set the score of the given proxy to the maximum value.

    :param proxy: proxy
    :return: result of the zadd operation (new score)
    """
    logger.info(f'{proxy.string()} is valid, set to {PROXY_SCORE_MAX}')
    member = proxy.string()
    # redis-py 2.x and 3.x take zadd arguments in different forms
    if IS_REDIS_VERSION_2:
        return self.db.zadd(REDIS_KEY, PROXY_SCORE_MAX, member)
    return self.db.zadd(REDIS_KEY, {member: PROXY_SCORE_MAX})
def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int:
    """
    Add a proxy to the sorted set with an initial score.

    :param proxy: proxy, ip:port, like 8.8.8.8:88
    :param score: int score
    :return: result
    """
    # guard clause: reject anything that does not look like host:port
    if not is_valid_proxy(f'{proxy.host}:{proxy.port}'):
        logger.info(f'invalid proxy {proxy}, throw it')
        return
    # guard clause: never reset the score of a proxy already tracked
    if self.exists(proxy):
        return
    if IS_REDIS_VERSION_2:
        return self.db.zadd(REDIS_KEY, score, proxy.string())
    return self.db.zadd(REDIS_KEY, {proxy.string(): score})
def decrease(self, proxy: Proxy):
    """
    Decrease the score of an invalid proxy by 1 and remove it once
    the score drops to (or below) the minimum.

    :param proxy: proxy
    :return: None
    """
    # zincrby returns the member's new score in both redis-py 2.x and
    # 3.x, so the separate zscore round trip the old code made was
    # redundant and race-prone (another client could change the score
    # in between); use the returned value directly.
    if IS_REDIS_VERSION_2:
        # redis-py 2.x argument order: (name, value, amount)
        score = self.db.zincrby(REDIS_KEY, proxy.string(), -1)
    else:
        # redis-py 3.x argument order: (name, amount, value)
        score = self.db.zincrby(REDIS_KEY, -1, proxy.string())
    logger.info(f'{proxy.string()} score decrease 1, current {score}')
    if score <= PROXY_SCORE_MIN:
        logger.info(f'{proxy.string()} current score {score}, remove')
        self.db.zrem(REDIS_KEY, proxy.string())
def exists(self, proxy: Proxy) -> bool:
    """
    Check whether the proxy is already in the sorted set.

    :param proxy: proxy
    :return: if exists, bool
    """
    # `x is not None` is the idiomatic spelling; the original
    # `not x is None` parses identically but reads ambiguously.
    return self.db.zscore(REDIS_KEY, proxy.string()) is not None
def parse(self, html):
    """
    Parse the html to extract proxies.

    The 'ip:port' text sits in the tail of each <br> element under
    the '.cont' container.
    :param html: page content
    :return: generator of Proxy
    """
    doc = pq(html)
    trs = doc('.cont br').items()
    for tr in trs:
        line = tr[0].tail
        # a <br> with no trailing text has tail None; re.search would
        # raise TypeError on it, so skip such nodes
        if not line:
            continue
        match = re.search(r'(\d+\.\d+\.\d+\.\d+):(\d+)', line)
        if match:
            host = match.group(1)
            # cast to int for consistency with the other parsers,
            # which all yield integer ports
            port = int(match.group(2))
            yield Proxy(host=host, port=port)
def convert_proxy_or_proxies(data):
    """
    Convert a str or list of str into valid Proxy object(s).

    :param data: 'host:port' string or list of such strings
    :return: Proxy, list of Proxy, or None for empty input
    """
    if not data:
        return None
    if isinstance(data, list):
        proxies = []
        for entry in data:
            entry = entry.strip()
            # silently drop entries that are not valid proxies
            if is_valid_proxy(entry):
                host, port = entry.split(':')
                proxies.append(Proxy(host=host, port=int(port)))
        return proxies
    if isinstance(data, str) and is_valid_proxy(data):
        host, port = data.split(':')
        return Proxy(host=host, port=int(port))
# compiled once at import time; the old code rebuilt and recompiled this
# pattern on every call, in two separate copy-pasted branches
_PROXY_PATTERN = re.compile(
    r'((?P<username>\S*?)\:(?P<password>\S*?)@)?(?P<ip>[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3})\:(?P<port>\d*)'
)


def _convert_one(item):
    """Parse one '[user:pass@]ip:port' string into a Proxy."""
    groups = _PROXY_PATTERN.search(item).groupdict()
    return Proxy(host=groups['ip'], port=int(groups['port']),
                 username=groups['username'], password=groups['password'])


def convert_proxy_or_proxies(data):
    """
    Convert a str or list of str to valid proxies or proxy.

    :param data: '[user:pass@]ip:port' string or list of such strings
    :return: Proxy, list of Proxy, or None for empty input
    """
    if not data:
        return None
    # if list of proxies
    if isinstance(data, list):
        result = []
        for item in data:
            # skip invalid item
            item = item.strip()
            if not is_valid_proxy(item):
                continue
            result.append(_convert_one(item))
        return result
    if isinstance(data, str) and is_valid_proxy(data):
        return _convert_one(data)
def parse(self, html):
    """
    Parse the html to extract proxies.

    :return: generator of Proxy
    """
    for entry in pq(html)('.wlist ul.l2').items():
        # first span holds the host, second span the port
        host = entry.find('span:first-child').text()
        port = int(entry.find('span:nth-child(2)').text())
        yield Proxy(host=host, port=port)
def parse(self, html):
    """
    Parse the html to extract proxies.

    :return: generator of Proxy
    """
    # raw string for the regex: \s in a non-raw literal only works by
    # accident and triggers invalid-escape warnings on newer Pythons.
    # \s* also matches newlines between the cells.
    ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
    re_ip_address = ip_address.findall(html)
    for address, port in re_ip_address:
        proxy = Proxy(host=address.strip(), port=int(port.strip()))
        yield proxy
def parse(self, html):
    """
    Parse the html to extract proxies.

    :return: generator of Proxy
    """
    # :gt(0) skips the table's header row
    rows = pq(html)('.containerbox table tr:gt(0)').items()
    for row in rows:
        host = row.find('td:nth-child(1)').text()
        port = int(row.find('td:nth-child(2)').text())
        yield Proxy(host=host, port=port)
def parse(self, html):
    """
    Parse the html to extract proxies.

    :return: generator of Proxy
    """
    doc = pq(html)
    trs = doc('#ip_list .tr.ip_tr').items()
    for tr in trs:
        host = tr.find('div:nth-child(1)').text()
        # cast to int for consistency with the other parsers, which
        # all yield integer ports
        port = int(tr.find('div:nth-child(2)').text())
        yield Proxy(host=host, port=port)
def parse(self, html):
    """
    Parse the html to extract proxies.

    :return: generator of Proxy
    """
    etree_html = etree.HTML(html)
    ip_ports = etree_html.xpath("//tbody/tr/td[1]/text()")
    for ip_port in ip_ports:
        # partition once instead of twice: head is the host, tail
        # (everything after the first ':') is the port
        host, _, port = ip_port.partition(":")
        yield Proxy(host=host, port=port)
def parse(self, html):
    """
    Parse the html to extract proxies.

    :return: generator of Proxy
    """
    # raw string for the regex (avoids invalid-escape warnings);
    # matches runs of digits/dots/colons immediately before a <br>
    ip_address = re.compile(r'([\d:\.]*)<br>')
    hosts_ports = ip_address.findall(html)
    for addr in hosts_ports:
        addr_split = addr.split(':')
        # only yield entries that are exactly 'host:port'
        if len(addr_split) == 2:
            host, port = addr_split
            yield Proxy(host=host, port=port)
def parse(self, html):
    """
    Parse a newline-delimited JSON response to extract proxies.

    :return: generator of Proxy
    """
    for raw_line in html.split('\n'):
        # skip blank lines (e.g. the trailing newline)
        if not raw_line:
            continue
        record = json.loads(raw_line)
        yield Proxy(host=record['host'], port=record['port'])
def parse(self, html):
    """
    Parse the html to extract high-anonymity proxies.

    :return: generator of Proxy
    """
    rows = pq(html)('#ip_list tr:contains(高匿)').items()
    for row in rows:
        country = row.find('td.country').text()
        # keep only rows whose anonymity column is exactly '高匿'
        if country and country.strip() == '高匿':
            host = row.find('td:nth-child(2)').text()
            port = int(row.find('td:nth-child(3)').text())
            yield Proxy(host=host, port=port)
def parse(self, html):
    """
    Parse the html to extract proxies.

    :return: generator of Proxy
    """
    find_tr = re.compile(r'<tr>(.*?)</tr>', re.S)
    trs = find_tr.findall(html)
    # compile the cell patterns once, not once per row as before;
    # raw strings avoid invalid-escape warnings for \s and \d
    find_ip = re.compile(r'<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
    find_port = re.compile(r'<td>\s+(\d+)\s+</td>', re.S)
    # index 0 is the table header row, so skip it
    for tr in trs[1:]:
        re_ip_address = find_ip.findall(tr)
        re_port = find_port.findall(tr)
        for address, port in zip(re_ip_address, re_port):
            proxy = Proxy(host=address.strip(), port=int(port.strip()))
            yield proxy
def parse(self, html):
    """
    Parse the html to extract proxies.

    Each '.ip' cell appears to contain the address split across child
    elements, some of which are decoys hidden with inline
    'display:none' styles; only the visible fragments concatenate to
    the real address text.
    :return: generator of Proxy
    """
    doc = pq(html)('.ip').items()
    for td in doc:
        trs = td.children()
        ip_str = ''
        for tr in trs:
            attrib = tr.attrib
            # skip fragments hidden via style="...none..." — these are
            # not part of the real address
            if 'style' in attrib and 'none' in tr.attrib['style']:
                continue
            ip_str += '' if not tr.text else tr.text
        addr_split = ip_str.split(':')
        if (len(addr_split) == 2):
            # visible text was exactly 'host:port'
            host = addr_split[0]
            port = addr_split[1]
            yield Proxy(host=host, port=port)
        else:
            # no single ':' in the concatenated text: take the last
            # child's text as the port and strip it from the string.
            # NOTE(review): replace() removes *every* occurrence of the
            # port substring from ip_str, not just the trailing one —
            # confirm this cannot corrupt the host part
            port = trs[-1].text
            host = ip_str.replace(port, '')
            yield Proxy(host=host, port=port)
def parse(self, html):
    """
    Parse a JSON API response to extract proxies.

    :return: generator of Proxy
    """
    result = json.loads(html)
    # non-zero code means the API reported an error; yield nothing
    if result['code'] != 0:
        return
    # NOTE(review): this binds a *local* MAX_PAGE that is never read
    # again; if the intent was to update a module-level pagination
    # constant it would need a `global MAX_PAGE` declaration — confirm
    MAX_PAGE = int(result['data']['last_page'])
    hosts_ports = result['data']['data']
    for ip_address in hosts_ports:
        if(ip_address):
            host = ip_address['ip']
            port = ip_address['port']
            yield Proxy(host=host, port=port)
def run_tester(host='96.113.165.182', port='3128'):
    """
    Run the tester against a single proxy.

    Generalized from the previous hard-coded address: host/port are now
    parameters whose defaults preserve the old behavior.
    :param host: proxy host to test
    :param port: proxy port to test
    :return: None
    """
    tasks = [tester.test(Proxy(host=host, port=port))]
    tester.loop.run_until_complete(asyncio.wait(tasks))