def max(self, proxy: Proxy) -> int:
    """
    Mark a proxy as valid by storing it with PROXY_SCORE_MAX.

    :param proxy: proxy whose score is overwritten
    :return: whatever ``zadd`` on the underlying client returns
    """
    logger.info(f'{proxy.string()} is valid, set to {PROXY_SCORE_MAX}')
    if not IS_REDIS_VERSION_2:
        # mapping form: {member: score}
        return self.db.zadd(REDIS_KEY, {proxy.string(): PROXY_SCORE_MAX})
    # positional form used when IS_REDIS_VERSION_2 is set: score, then member
    return self.db.zadd(REDIS_KEY, PROXY_SCORE_MAX, proxy.string())
def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int:
    """
    Store a proxy with the given score unless it is invalid or already stored.

    :param proxy: proxy, ip:port, like 8.8.8.8:88
    :param score: score to store the proxy with
    :return: result of ``zadd``; None when the proxy is rejected or
             already exists
    """
    address = f'{proxy.host}:{proxy.port}'
    # reject addresses that do not pass validation
    if not is_valid_proxy(address):
        logger.info(f'invalid proxy {proxy}, throw it')
        return
    # an already stored proxy keeps its current score
    if self.exists(proxy):
        return None
    if IS_REDIS_VERSION_2:
        return self.db.zadd(REDIS_KEY, score, proxy.string())
    return self.db.zadd(REDIS_KEY, {proxy.string(): score})
def decrease(self, proxy: Proxy) -> int:
    """
    Decrease the score of a proxy by one and remove the proxy when the
    resulting score is not above PROXY_SCORE_MIN.

    :param proxy: proxy whose validation failed
    :return: score of the proxy after the decrement
    """
    member = proxy.string()
    # ZINCRBY replies with the updated score, so no separate zscore
    # round trip is needed to learn the new value.
    if IS_REDIS_VERSION_2:
        score = self.db.zincrby(REDIS_KEY, member, -1)
    else:
        score = self.db.zincrby(REDIS_KEY, -1, member)
    logger.info(f'{proxy.string()} score decrease 1, current {score}')
    # drop the proxy once its score has fallen to the minimum or below
    if score <= PROXY_SCORE_MIN:
        logger.info(f'{proxy.string()} current score {score}, remove')
        self.db.zrem(REDIS_KEY, member)
    return score
def exists(self, proxy: Proxy) -> bool:
    """
    Tell whether a proxy is currently stored.

    :param proxy: proxy to look up
    :return: True when ``zscore`` reports a score for the proxy,
             False when it reports None
    """
    score = self.db.zscore(REDIS_KEY, proxy.string())
    return score is not None
def decrease(self, proxy: Proxy) -> int:
    """
    Decrease the score of a proxy by one, or delete the proxy when its
    score is not above PROXY_SCORE_MIN (or it has no score at all).

    :param proxy: proxy whose score is lowered
    :return: result of ``zincrby`` when the score was decreased,
             otherwise result of ``zrem``
    """
    score = self.db.zscore(REDIS_KEY, proxy.string())
    # Compare against None explicitly: a stored score of 0 is falsy, and a
    # plain truthiness test would delete such a proxy even when
    # PROXY_SCORE_MIN is below zero.
    if score is not None and score > PROXY_SCORE_MIN:
        logger.info(f'{proxy.string()} current score {score}, decrease 1')
        if IS_REDIS_VERSION_2:
            return self.db.zincrby(REDIS_KEY, proxy.string(), -1)
        return self.db.zincrby(REDIS_KEY, -1, proxy.string())
    # missing score, or score already at/below the minimum: delete the proxy
    logger.info(f'{proxy.string()} current score {score}, remove')
    return self.db.zrem(REDIS_KEY, proxy.string())
def parse(self, html):
    """
    Parse an html page and yield the proxies found in the text that
    follows each ``<br>`` inside ``.cont``.

    :param html: html text of the page
    :return: generator of Proxy, one per ``host:port`` match
    """
    doc = pq(html)
    for br in doc('.cont br').items():
        # The text after a <br> is its tail; it is None when nothing
        # follows the tag, and re.search would raise TypeError on None.
        line = br[0].tail
        if not line:
            continue
        match = re.search(r'(\d+\.\d+\.\d+\.\d+):(\d+)', line)
        if match:
            yield Proxy(host=match.group(1), port=match.group(2))
def parse(self, html):
    """
    Extract proxies from the rows selected by
    ``.containerbox table tr:gt(0)``.

    :param html: html text of the page
    :return: generator of Proxy; host is the text of the first cell,
             port the integer value of the second cell
    """
    rows = pq(html)('.containerbox table tr:gt(0)')
    for row in rows.items():
        host_text = row.find('td:nth-child(1)').text()
        port_text = row.find('td:nth-child(2)').text()
        yield Proxy(host=host_text, port=int(port_text))
def parse(self, html):
    """
    Extract proxies from the ``.wlist ul.l2`` entries of the page.

    :param html: html text of the page
    :return: generator of Proxy; host is the text of the first span,
             port the integer value of the second span
    """
    entries = pq(html)('.wlist ul.l2')
    for entry in entries.items():
        host_text = entry.find('span:first-child').text()
        port_text = entry.find('span:nth-child(2)').text()
        yield Proxy(host=host_text, port=int(port_text))
def parse(self, html):
    """
    Extract proxies from table rows whose cells are tagged with
    ``data-title="IP"`` and ``data-title="PORT"``.

    :param html: html text of the page
    :return: generator of Proxy for every row where both cells have text
    """
    for row in pq(html)('table tr').items():
        host = row.find('td[data-title="IP"]').text()
        port = row.find('td[data-title="PORT"]').text()
        # rows lacking either value are skipped
        if not (host and port):
            continue
        yield Proxy(host=host, port=port)
def parse(self, html):
    """
    Parse an html page and yield a proxy for every ``<tr>`` whose first
    two ``<td>`` cells are matched by the pattern.

    :param html: html text of the page
    :return: generator of Proxy; host is the stripped first cell,
             port the integer value of the stripped second cell
    """
    # Raw string so that \s reaches the regex engine instead of being an
    # invalid string escape; \s* allows whitespace, including line breaks,
    # between the tags.
    row_pattern = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
    for address, port in row_pattern.findall(html):
        yield Proxy(host=address.strip(), port=int(port.strip()))
def convert_proxy_or_proxies(data):
    """
    Convert a proxy string, or a list of proxy strings, into Proxy objects.

    :param data: a ``host:port`` string or a list of such strings
    :return: for a list, the list of Proxy built from the items that pass
             validation after stripping; for a valid string, a single
             Proxy; otherwise None
    """
    if not data:
        return None

    def build(address):
        # split a validated "host:port" string into a Proxy
        host, port = address.split(':')
        return Proxy(host=host, port=int(port))

    if isinstance(data, list):
        stripped = (entry.strip() for entry in data)
        # invalid entries are dropped silently
        return [build(entry) for entry in stripped if is_valid_proxy(entry)]
    if isinstance(data, str) and is_valid_proxy(data):
        return build(data)
    return None
def parse(self, html):
    """
    Extract proxies from the text of the first cell of every table row.

    :param html: html text of the page
    :return: generator of Proxy; host is the text before the first ``:``
             and port the text after it
    """
    tree = etree.HTML(html)
    for cell_text in tree.xpath("//tbody/tr/td[1]/text()"):
        host, _, port = cell_text.partition(":")
        yield Proxy(host=host, port=port)
def parse(self, html):
    """
    Extract proxies from the ``#ip_list`` rows whose ``td.country`` text
    is exactly '高匿' ("high anonymity").

    :param html: html text of the page
    :return: generator of Proxy; host is the text of the second cell,
             port the integer value of the third cell
    """
    rows = pq(html)('#ip_list tr:contains(高匿)')
    for row in rows.items():
        country = row.find('td.country').text()
        # the selector only checks that the row contains the text somewhere;
        # confirm it is the value of the country cell itself
        if country and country.strip() == '高匿':
            host_text = row.find('td:nth-child(2)').text()
            port_text = row.find('td:nth-child(3)').text()
            yield Proxy(host=host_text, port=int(port_text))
def parse(self, html):
    """
    Parse an html page and yield the proxies found in its ``<tr>`` rows.

    The first matched row is skipped. Within each remaining row the
    matched addresses and ports are paired in order.

    :param html: html text of the page
    :return: generator of Proxy with stripped host and integer port
    """
    # Patterns are compiled once, outside the row loop. The ones containing
    # \s, \d and \. are raw strings so those sequences reach the regex
    # engine instead of being invalid string escapes.
    row_pattern = re.compile('<tr>(.*?)</tr>', re.S)
    ip_pattern = re.compile(r'<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
    port_pattern = re.compile(r'<td>\s+(\d+)\s+</td>', re.S)
    for row in row_pattern.findall(html)[1:]:
        addresses = ip_pattern.findall(row)
        ports = port_pattern.findall(row)
        for address, port in zip(addresses, ports):
            yield Proxy(host=address.strip(), port=int(port.strip()))