def run(self):
    """
    Load proxies from the static proxy file, then from every crawler.

    Returns immediately when ``self.is_full()`` is true. Otherwise every
    non-blank, non-comment line of ``staticproxy.txt`` is searched for an
    optional ``username:password@`` prefix followed by ``ip:port``, and each
    match is stored through ``self.redis.add``. Afterwards every proxy
    yielded by each crawler in ``self.crawlers`` is printed and stored the
    same way.

    :return: None
    """
    if self.is_full():
        return
    proxyfile = "staticproxy.txt"
    with open(proxyfile, 'r') as fh:
        proxylines = fh.readlines()
    logger.info(f'read {proxyfile}')
    # Compiled once: the pattern does not depend on the line being parsed.
    pattern = re.compile(
        r'((?P<username>\S*?)\:(?P<password>\S*?)@)?(?P<ip>[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3})\:(?P<port>\d*)'
    )
    for line in proxylines:
        entry = line.strip()
        # Stripping before the "#" test also skips indented comment lines.
        if not entry or entry.startswith("#"):
            continue
        match = pattern.search(entry)
        if not match:
            continue
        fields = match.groupdict()
        proxy = Proxy(host=fields['ip'],
                      port=fields['port'],
                      username=fields['username'],
                      password=fields['password'])
        logger.info("getproxy " + proxy.string())
        self.redis.add(proxy)
    for crawler in self.crawlers:
        logger.info(f'crawler {crawler} to get proxy')
        for proxy in crawler.crawl():
            print(proxy.string())
            self.redis.add(proxy)
def max(self, proxy: Proxy) -> int:
    """
    Set the score of a proxy to PROXY_SCORE_MAX.

    :param proxy: proxy whose score is overwritten
    :return: the value returned by ``self.db.zadd``
    """
    logger.info(f'{proxy.string()} is valid, set to {PROXY_SCORE_MAX}')
    if not IS_REDIS_VERSION_2:
        # Mapping form: {member: score}.
        return self.db.zadd(REDIS_KEY, {proxy.string(): PROXY_SCORE_MAX})
    # With IS_REDIS_VERSION_2 set, score and member are passed positionally.
    return self.db.zadd(REDIS_KEY, PROXY_SCORE_MAX, proxy.string())
def max(self, proxy: Proxy) -> int:
    """
    Raise the proxy's score to the maximum value.

    :param proxy: proxy to update
    :return: the value returned by ``self.db.zadd``
    """
    logger.info(f'{proxy.string()} is valid, set to {PROXY_SCORE_MAX}')
    # Pick the argument layout that matches the IS_REDIS_VERSION_2 flag,
    # then issue a single zadd call.
    if IS_REDIS_VERSION_2:
        zadd_args = (PROXY_SCORE_MAX, proxy.string())
    else:
        zadd_args = ({proxy.string(): PROXY_SCORE_MAX},)
    return self.db.zadd(REDIS_KEY, *zadd_args)
def max(self, proxy: Proxy) -> int:
    """
    Mark a proxy as valid by writing PROXY_SCORE_MAX as its score.

    :param proxy: proxy to update
    :return: the value returned by ``self.db.zadd``
    """
    logger.info(f'{proxy.string()} is valid, set to {PROXY_SCORE_MAX}')
    return (
        self.db.zadd(REDIS_KEY, PROXY_SCORE_MAX, proxy.string())
        if IS_REDIS_VERSION_2
        else self.db.zadd(REDIS_KEY, {proxy.string(): PROXY_SCORE_MAX})
    )
def decrease(self, proxy: Proxy) -> int:
    """
    Decrease the score of a proxy by one and delete the proxy once its
    score is no longer above PROXY_SCORE_MIN.

    :param proxy: proxy
    :return: the score read back after the decrement
    """
    if IS_REDIS_VERSION_2:
        self.db.zincrby(REDIS_KEY, proxy.string(), -1)
    else:
        self.db.zincrby(REDIS_KEY, -1, proxy.string())
    score = self.db.zscore(REDIS_KEY, proxy.string())
    # Remove only when the score has reached the minimum; an unconditional
    # removal would make the decrement above meaningless.
    if score is None or score <= PROXY_SCORE_MIN:
        logger.info(f'{proxy.string()} curent score {score}, remove')
        self.db.zrem(REDIS_KEY, proxy.string())
    return score
def add(self, proxy: Proxy, score=PROXY_SCORE_INIT):
    """
    Store a proxy with the given score unless it is invalid or already stored.

    :param proxy: proxy to store
    :param score: score to assign, PROXY_SCORE_INIT by default
    :return: the value returned by ``self.db.zadd``, or None when nothing is written
    """
    address = f'{proxy.host}:{proxy.port}'
    if not is_valid_proxy(address):
        logger.info(f'invalid proxy {proxy}, throw it')
        return None
    if self.exists(proxy):
        # Already stored: leave the existing entry untouched.
        return None
    if not IS_REDIS_VERSION_2:
        return self.db.zadd(REDIS_KEY, {proxy.string(): score})
    return self.db.zadd(REDIS_KEY, score, proxy.string())
def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int:
    """
    Add a proxy with an initial score.

    Nothing is written when ``host:port`` fails ``is_valid_proxy`` or when
    ``self.exists`` reports the proxy as already stored.

    :param proxy: proxy to add
    :param score: score to store, PROXY_SCORE_INIT by default
    :return: the value returned by ``self.db.zadd``, or None when nothing is written
    """
    valid = is_valid_proxy(f'{proxy.host}:{proxy.port}')
    if valid and not self.exists(proxy):
        if IS_REDIS_VERSION_2:
            return self.db.zadd(REDIS_KEY, score, proxy.string())
        return self.db.zadd(REDIS_KEY, {proxy.string(): score})
    if not valid:
        logger.info(f'invalid proxy {proxy}, throw it')
    return None
def exists(self, proxy: Proxy) -> bool:
    """
    Tell whether a proxy is stored under REDIS_KEY.

    :param proxy: proxy to look up
    :return: True when ``self.db.zscore`` yields a score, False when it yields None
    """
    stored_score = self.db.zscore(REDIS_KEY, proxy.string())
    return stored_score is not None
def exists(self, proxy: Proxy) -> bool:
    """
    Check whether the proxy has a score recorded under REDIS_KEY.

    :param proxy: proxy to check
    :return: False when the score lookup returns None, True otherwise
    """
    return self.db.zscore(REDIS_KEY, proxy.string()) is not None
def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int:
    """
    Write a proxy and its score under REDIS_KEY.

    No existence check is made before writing; only the ``host:port``
    validity test can prevent the write.

    :param proxy: proxy to add
    :param score: score to store, PROXY_SCORE_INIT by default
    :return: the value returned by ``self.db.zadd``, or None for an invalid proxy
    """
    if is_valid_proxy(f'{proxy.host}:{proxy.port}'):
        if IS_REDIS_VERSION_2:
            return self.db.zadd(REDIS_KEY, score, proxy.string())
        return self.db.zadd(REDIS_KEY, {proxy.string(): score})
    logger.info(f'invalid proxy {proxy}, throw it')
    return None
def parse(self, html):
    """
    Yield a Proxy for each ``ip:port`` found in the text that follows a
    ``br`` element inside ``.cont``.

    Entries without trailing text or without an ``ip:port`` pair are
    skipped. The port is taken in full rather than cut to four characters.

    :param html: page source to parse
    :return: generator of Proxy objects (host and port are strings)
    """
    # Function-scope import: the original block did not rely on ``re``.
    import re

    address = re.compile(r'(\d+\.\d+\.\d+\.\d+):(\d+)')
    doc = pq(html)
    for br in doc('.cont br').items():
        line = br[0].tail
        if not line:
            # A br with no trailing text has tail None.
            continue
        found = address.search(line)
        if found:
            yield Proxy(host=found.group(1), port=found.group(2))
def parse(self, html):
    """
    Decode a JSON payload and yield a Proxy for every entry stored under a
    key that contains the text ``proxy``.

    :param html: JSON text
    :return: generator of Proxy objects with integer ports
    """
    payload = json.loads(html)
    # Select the matching keys up front, before any proxy is produced.
    matching = [name for name in payload if 'proxy' in name]
    for name in matching:
        for entry in payload[name]:
            yield Proxy(host=entry['ip'], port=int(str(entry['port'])))
def decrease(self, proxy: Proxy) -> int:
    """
    Lower a proxy's score by one, or delete the proxy when its score is
    missing, zero, or not above PROXY_SCORE_MIN.

    :param proxy: proxy to update
    :return: the value returned by ``zincrby`` when decremented, or by ``zrem`` when removed
    """
    score = self.db.zscore(REDIS_KEY, proxy.string())
    # Guard clause: handle removal first.
    if not score or not (score > PROXY_SCORE_MIN):
        logger.info(f'{proxy.string()} current score {score}, remove')
        return self.db.zrem(REDIS_KEY, proxy.string())
    logger.info(f'{proxy.string()} current score {score}, decrease 1')
    if not IS_REDIS_VERSION_2:
        return self.db.zincrby(REDIS_KEY, -1, proxy.string())
    return self.db.zincrby(REDIS_KEY, proxy.string(), -1)
def decrease(self, proxy: Proxy) -> int:
    """
    Decrement the proxy's score while it is above PROXY_SCORE_MIN;
    otherwise remove the proxy.

    :param proxy: proxy
    :return: the value returned by ``zincrby`` or ``zrem``
    """
    score = self.db.zscore(REDIS_KEY, proxy.string())
    if score and score > PROXY_SCORE_MIN:
        logger.info(f'{proxy.string()} current score {score}, decrease 1')
        # Argument order for zincrby follows the IS_REDIS_VERSION_2 flag.
        step_args = (proxy.string(), -1) if IS_REDIS_VERSION_2 else (-1, proxy.string())
        return self.db.zincrby(REDIS_KEY, *step_args)
    logger.info(f'{proxy.string()} current score {score}, remove')
    return self.db.zrem(REDIS_KEY, proxy.string())
def parse(self, html):
    """
    Yield a Proxy for every table row that holds a host and a numeric port.

    The selector matches table elements, so the rows of each matched table
    are walked one by one. Rows with an empty first cell or a second cell
    that is not an integer are skipped.

    :param html: page source to parse
    :return: generator of Proxy objects with integer ports
    """
    doc = pq(html)
    tables = doc('#main > div > div:nth-child(1) > table ').items()
    for table in tables:
        # Reading the cells per row keeps one host/port pair per Proxy;
        # reading them on the whole table would join every row's text.
        for tr in table.find('tr').items():
            host = tr.find('td:nth-child(1)').text()
            port_text = tr.find('td:nth-child(2)').text()
            if not host:
                continue
            try:
                port = int(port_text)
            except ValueError:
                continue
            yield Proxy(host=host, port=port)
def parse(self, html):
    """
    Yield a Proxy for each row of the table inside ``div.layui-form``.

    The first cell supplies the host and the second cell the port, which
    is converted to an integer.

    :param html: page source to parse
    :return: generator of Proxy objects
    """
    rows = pq(html)('div.layui-form > table > tbody > tr')
    for row in rows.items():
        yield Proxy(
            host=row.find('td:nth-child(1)').text(),
            port=int(row.find('td:nth-child(2)').text()),
        )
def decrease(self, proxy: Proxy) -> int:
    """
    Decrease the score of a proxy; if it is not above PROXY_SCORE_MIN,
    delete the proxy instead.

    :param proxy: proxy
    :return: the value returned by ``zincrby`` when decremented, or by ``zrem`` when removed
    """
    score = self.db.zscore(REDIS_KEY, proxy.string())
    # current score is larger than PROXY_SCORE_MIN
    if score and score > PROXY_SCORE_MIN:
        logger.info(f'{proxy.string()} current score {score}, decrease 1')
        if IS_REDIS_VERSION_2:
            return self.db.zincrby(REDIS_KEY, proxy.string(), -1)
        return self.db.zincrby(REDIS_KEY, -1, proxy.string())
    # otherwise delete proxy
    logger.info(f'{proxy.string()} current score {score}, remove')
    # Use the same string member that the score lookup and increment use;
    # passing the Proxy object itself does not name the stored member.
    return self.db.zrem(REDIS_KEY, proxy.string())
def parse(self, html):
    """
    Yield a Proxy for every table row after the first inside ``.containerbox``.

    The first cell supplies the host and the second cell the port, which
    is converted to an integer.

    :param html: page source to parse
    :return: generator of Proxy objects
    """
    doc = pq(html)
    # ``:gt(0)`` (no space) skips the first row; ``items()`` iterates the
    # matched rows as PyQuery objects.
    trs = doc('.containerbox table tr:gt(0)').items()
    for tr in trs:
        host = tr.find('td:nth-child(1)').text()
        port = int(tr.find('td:nth-child(2)').text())
        yield Proxy(host=host, port=port)
def parse(self, html):
    """
    Yield a Proxy for each ``ip:port`` found in the text that follows a
    ``br`` element inside ``.cont``.

    Entries with no trailing text or no ``ip:port`` pair are skipped.

    :param html: page source to parse
    :return: generator of Proxy objects (host and port are strings)
    """
    # Compiled once, outside the loop.
    address = re.compile(r'(\d+\.\d+\.\d+\.\d+):(\d+)')
    doc = pq(html)
    for br in doc('.cont br').items():
        line = br[0].tail
        if not line:
            # tail is None when nothing follows the element; searching
            # None would raise TypeError.
            continue
        match = address.search(line)
        if match:
            yield Proxy(host=match.group(1), port=match.group(2))
def parse(self, html):
    """
    Yield a Proxy for each ``tr`` with class ``odd`` whose second and third
    cells both contain text.

    :param html: page source to parse
    :return: generator of Proxy objects (host and port are strings)
    """
    rows = pq(html)('tr[class="odd"]')
    for row in rows.items():
        ip_text = row.find('td:nth-child(2)').text()
        port_text = row.find('td:nth-child(3)').text()
        if not (ip_text and port_text):
            continue
        yield Proxy(host=ip_text, port=port_text)
def parse(self, html):
    """
    Yield a Proxy for each body row of the ``.layui-table`` table.

    Host and port are the ``text`` of the first and second ``td div``
    elements found in the row.

    :param html: page source to parse
    :return: generator of Proxy objects
    """
    rows = pq(html)('.layui-table tbody tr ')
    for row in rows.items():
        cells = row.find('td div')
        # Indexing yields the underlying elements, whose ``text`` is an
        # attribute rather than a method.
        host = cells[0].text
        port = cells[1].text
        yield Proxy(host=host, port=port)
def parse(self, html):
    """
    Yield a Proxy for every table row after the first inside ``.containerbox``.

    :param html: page source to parse
    :return: generator of Proxy objects with integer ports
    """
    document = pq(html)
    for row in document('.containerbox table tr:gt(0)').items():
        host_cell = row.find('td:nth-child(1)')
        host = host_cell.text()
        port_cell = row.find('td:nth-child(2)')
        yield Proxy(host=host, port=int(port_cell.text()))
def parse(self, html):
    """
    Yield a Proxy for each ``ul.l2`` entry inside ``.wlist``.

    The first ``span`` child supplies the host and the second the port,
    which is converted to an integer.

    :param html: page source to parse
    :return: generator of Proxy objects
    """
    entries = pq(html)('.wlist ul.l2')
    for entry in entries.items():
        yield Proxy(
            host=entry.find('span:first-child').text(),
            port=int(entry.find('span:nth-child(2)').text()),
        )
def parse(self, html):
    """
    Yield a Proxy for every table row whose first two cells hold an
    address and a port.

    :param html: page source to scan
    :return: generator of Proxy objects with integer ports
    """
    # Raw string so that "\s" reaches the regex engine without being read
    # as an (invalid) Python string escape. \s* spans the whitespace and
    # line breaks between the tags.
    ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
    for address, port in ip_address.findall(html):
        yield Proxy(host=address.strip(), port=int(port.strip()))
def convert_proxy_or_proxies(data):
    """
    Convert one ``host:port`` string, or a list of them, into Proxy objects.

    Surrounding whitespace is removed before an entry is validated and
    split, so the resulting host and port never carry stray whitespace.

    :param data: a proxy string or a list of proxy strings
    :return: a list of Proxy for list input (invalid items are dropped),
             a single Proxy for a valid string, otherwise None
    """
    if not data:
        return None
    if isinstance(data, list):
        result = []
        for item in data:
            # str.strip returns a new string; keep it so that validation
            # and the split both see the trimmed text.
            item = item.strip()
            if not is_valid_proxy(item):
                continue
            host, port = item.split(':')
            result.append(Proxy(host=host, port=port))
        return result
    if isinstance(data, str):
        # Validate and split the same trimmed text.
        data = data.strip()
        if is_valid_proxy(data):
            host, port = data.split(':')
            return Proxy(host=host, port=port)
    return None
def parse(self, html):
    """
    Yield a Proxy for each ``.tr.ip_tr`` entry under ``#ip_list``.

    The first and second ``div`` children supply host and port; both are
    passed on as text.

    :param html: page source to parse
    :return: generator of Proxy objects
    """
    entries = pq(html)('#ip_list .tr.ip_tr')
    for entry in entries.items():
        yield Proxy(
            host=entry.find('div:nth-child(1)').text(),
            port=entry.find('div:nth-child(2)').text(),
        )
def parse(self, html):
    """
    Yield a Proxy for each table row that has non-empty cells titled
    ``IP`` and ``PORT``.

    :param html: page source to parse
    :return: generator of Proxy objects (host and port are strings)
    """
    rows = pq(html)('table tr')
    for row in rows.items():
        ip_text = row.find('td[data-title="IP"]').text()
        port_text = row.find('td[data-title="PORT"]').text()
        if not (ip_text and port_text):
            # Rows lacking either cell are not proxy entries.
            continue
        yield Proxy(host=ip_text, port=port_text)
def parse(self, html):
    """
    Yield a Proxy for each ``host:port`` line in the text of ``.cont``.

    Anything from the first ``@`` onward is ignored. Lines that are blank,
    lack a ``host:port`` pair, or have a non-integer port are skipped.

    :param html: page source to parse
    :return: generator of Proxy objects with integer ports
    """
    doc = pq(html)
    for content in doc('.cont').text().split("\n"):
        # partition keeps the whole line when there is no "@"; slicing
        # with find() == -1 would silently drop the last character.
        address = content.partition("@")[0].strip()
        host, separator, port_text = address.partition(":")
        if not separator or not host:
            continue
        try:
            port = int(port_text)
        except ValueError:
            continue
        yield Proxy(host=host, port=port)
def parse(self, html):
    """
    Yield a Proxy for the text of each first table cell, split at the
    first colon into host and port.

    :param html: page source to parse
    :return: generator of Proxy objects (host and port are strings)
    """
    tree = etree.HTML(html)
    for cell_text in tree.xpath("//tbody/tr/td[1]/text()"):
        # Without a colon the port comes out as an empty string.
        host, _, port = cell_text.partition(":")
        yield Proxy(host=host, port=port)
def parse(self, html):
    """
    Yield a Proxy for each table body row.

    The host is the text of the link in the first cell with surrounding
    double quotes removed; the port is the second cell's text converted
    to an integer.

    :param html: page source to parse
    :return: generator of Proxy objects
    """
    tree = etree.HTML(html)
    for row in tree.xpath('//table/tbody/tr'):
        raw_host = row.xpath('./td[1]/a/text()')[0]
        raw_port = row.xpath('./td[2]/text()')[0]
        yield Proxy(host=raw_host.strip('"'), port=int(raw_port.strip()))