class ProxyPoolScraper:
    """Scrape raw proxy records out of an HTML table with id="list".

    Relies on an externally provided ``WebParser`` for fetching and on a
    ``self.logger`` attribute that is not assigned here — presumably injected
    by a decorator or mixin elsewhere in the project (TODO confirm).
    """

    def __init__(self, url, bs_parser="lxml"):
        # Page to scrape and the BeautifulSoup backend to parse it with.
        self.parser = WebParser(url)
        self.bs_parser = bs_parser

    def get_proxy_stream(self, limit):
        """Yield ``ProxyRecord`` objects for up to ``limit`` table rows.

        Rows with no ``<td>`` cells (e.g. the header row) clean up to an
        empty list: they are logged but not yielded, so fewer than
        ``limit`` records may be produced.
        """
        raw_records = self.extract_table_raw_records()
        # Slice BEFORE cleaning: the original cleaned every row in the
        # table and then discarded all but the first `limit` — wasted work
        # with identical results, since rows are cleaned independently.
        for record in map(self._clear_up_record, raw_records[:limit]):
            self.logger.info(f"Proxy record: {record}")
            if record:
                yield ProxyRecord(*record)

    def extract_table_raw_records(self):
        """Return every ``<tr>`` element of the page's ``id="list"`` table."""
        content = self.parser.get_content()
        soup_object = BeautifulSoup(content, self.bs_parser)
        return (
            soup_object
            .find(id="list")
            .find_all("tr")
        )

    def _clear_up_record(self, raw_record):
        """Extract the text of each ``<td>`` cell in one table row."""
        return [val.text for val in raw_record.find_all("td")]
class ProxyPoolValidator:
    """Single-shot proxy check: can the proxy fetch the reference URL?

    NOTE(review): a second, multi-check ``ProxyPoolValidator`` appears later
    in this file and would shadow this one if both live at module level —
    verify only one is meant to be active.
    """

    def __init__(self, url, timeout=10):
        self.timeout = timeout
        # Rotating headers reduces the chance of the probe being blocked.
        self.parser = WebParser(url, rotate_header=True)

    def validate_proxy(self, proxy_record):
        """Probe once through ``proxy_record.proxy``; return a ProxyStatus."""
        response = self.parser.get_content(
            timeout=self.timeout,
            proxies=proxy_record.proxy,
        )
        # A None response means the fetch failed through this proxy.
        proxy_status = ProxyStatus(proxy_record.proxy, response is not None)
        self.logger.info(f"Proxy status: {proxy_status}")
        return proxy_status
class NewsProducer:
    """Pull an RSS feed (optionally via a proxy) and emit formatted entries."""

    def __init__(self, rss_feed):
        # Rotating headers reduces the chance of the feed host blocking us.
        self.parser = WebParser(rss_feed, rotate_header=True)
        self.formatter = NewsFormatter()

    def _extract_news_feed_items(self, proxies):
        """Download the feed through ``proxies`` and return its parsed items."""
        raw_feed = self.parser.get_content(proxies=proxies)
        return atoma.parse_rss_bytes(raw_feed).items

    def get_news_stream(self, proxies):
        """Yield each feed item after running it through the formatter."""
        for item in self._extract_news_feed_items(proxies):
            yield self.formatter.format_entry(item)
class ProxyPoolValidator:
    """Validate a proxy by probing a reference URL several times.

    Health is the fraction of probes that returned content; the proxy is
    considered valid when health exceeds ``health_threshold``. Relies on a
    ``self.logger`` attribute that is not assigned here — presumably
    injected elsewhere in the project (TODO confirm).
    """

    def __init__(self, url, timeout=10, checks=3, sleep_interval=0.1,
                 health_threshold=0.66):
        self.timeout = timeout
        self.checks = checks
        self.sleep_interval = sleep_interval
        # Was a hard-coded 0.66 in the validity test; parameterized so the
        # strictness can be tuned per pool. Default preserves old behavior
        # (with checks=3, 2 of 3 successes → health ≈ 0.667 → valid).
        self.health_threshold = health_threshold
        self.parser = WebParser(url, rotate_header=True)

    def validate_proxy(self, proxy_record):
        """Probe ``self.checks`` times through the proxy; return ProxyStatus."""
        successes = 0
        for attempt in range(self.checks):
            content = self.parser.get_content(
                timeout=self.timeout,
                proxies=proxy_record.proxy,
            )
            successes += int(content is not None)
            # Pause between probes only — the original also slept after the
            # final probe, delaying every validation by one extra interval.
            if attempt < self.checks - 1:
                time.sleep(self.sleep_interval)
        health = successes / self.checks
        proxy_status = ProxyStatus(
            proxy=proxy_record.proxy,
            health=health,
            is_valid=health > self.health_threshold,
        )
        self.logger.info(f"Proxy status: {proxy_status}")
        return proxy_status