import logging

from bs4 import BeautifulSoup

# WebParser and ProxyRecord are defined elsewhere in the surrounding project.


class ProxyPoolScraper:
    def __init__(self, url, bs_parser="lxml"):
        self.parser = WebParser(url)
        self.bs_parser = bs_parser
        # The methods below log via self.logger, which the snippet never set up;
        # a standard-library logger is assumed here.
        self.logger = logging.getLogger(self.__class__.__name__)

    def get_proxy_stream(self, limit):
        # Yield up to `limit` non-empty table rows as ProxyRecord objects.
        raw_records = self.extract_table_raw_records()
        clean_records = list(
            map(self._clear_up_record, raw_records)
        )
        for record in clean_records[:limit]:
            self.logger.info(f"Proxy record: {record}")
            if record:
                yield ProxyRecord(*record)

    def extract_table_raw_records(self):
        # Fetch the page and return every <tr> inside the element with id="list".
        content = self.parser.get_content()
        soup_object = BeautifulSoup(content, self.bs_parser)
        return (
            soup_object
            .find(id="list")
            .find_all("tr")
        )

    def _clear_up_record(self, raw_record):
        # Reduce a table row to the plain text of its <td> cells.
        return [
            val.text for val
            in raw_record.find_all("td")
        ]
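A minimal usage sketch for the scraper above, assuming WebParser and ProxyRecord are available from the surrounding project; the proxy-list URL is only a placeholder for a page whose table carries id="list".

scraper = ProxyPoolScraper("https://example.com/proxy-list")  # placeholder URL
for proxy_record in scraper.get_proxy_stream(limit=20):
    print(proxy_record)  # each record wraps the cell text of one table row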
class ProxyPoolValidator:
    def __init__(self, url, timeout=10):
        self.timeout = timeout
        self.parser = WebParser(url, rotate_header=True)
        # Assumed logger setup, as above; the original may wire this up elsewhere.
        self.logger = logging.getLogger(self.__class__.__name__)

    def validate_proxy(self, proxy_record):
        # A single fetch through the proxy; it is reported healthy if any content came back.
        content = self.parser.get_content(timeout=self.timeout,
                                          proxies=proxy_record.proxy)
        proxy_status = ProxyStatus(proxy_record.proxy, content is not None)
        self.logger.info(f"Proxy status: {proxy_status}")
        return proxy_status
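A sketch of wiring the scraper and this validator together; the list URL and the check URL are placeholders, not values taken from the examples.

scraper = ProxyPoolScraper("https://example.com/proxy-list")        # placeholder URL
validator = ProxyPoolValidator("https://example.com/health-check")  # hypothetical check URL

for proxy_record in scraper.get_proxy_stream(limit=10):
    status = validator.validate_proxy(proxy_record)  # ProxyStatus records whether content came back
    print(status)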
Example #3
import atoma


class NewsProducer:
    def __init__(self, rss_feed):
        self.parser = WebParser(rss_feed, rotate_header=True)
        self.formatter = NewsFormatter()  # NewsFormatter comes from the surrounding project

    def _extract_news_feed_items(self, proxies):
        # Download the RSS feed through the given proxies and parse it with atoma.
        content = self.parser.get_content(proxies=proxies)
        news_feed = atoma.parse_rss_bytes(content)
        return news_feed.items

    def get_news_stream(self, proxies):
        news_feed_items = self._extract_news_feed_items(proxies)
        for entry in news_feed_items:
            formatted_entry = self.formatter.format_entry(entry)
            yield formatted_entry
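A usage sketch, assuming the proxies argument is the requests-style mapping that WebParser presumably forwards to its HTTP call; the feed URL and proxy addresses are placeholders.

producer = NewsProducer("https://example.com/feed.rss")  # placeholder RSS URL
proxies = {                                              # hypothetical requests-style mapping
    "http": "http://10.10.1.10:3128",
    "https": "http://10.10.1.10:1080",
}

for article in producer.get_news_stream(proxies):
    print(article)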
import time


class ProxyPoolValidator:
    def __init__(self, url, timeout=10, checks=3, sleep_interval=0.1):
        self.timeout = timeout
        self.checks = checks
        self.sleep_interval = sleep_interval
        self.parser = WebParser(url, rotate_header=True)
        # Assumed logger setup, as above; the original may wire this up elsewhere.
        self.logger = logging.getLogger(self.__class__.__name__)

    def validate_proxy(self, proxy_record):
        # Probe the proxy several times, recording each success (1) or failure (0).
        consecutive_checks = []
        for _ in range(self.checks):
            content = self.parser.get_content(timeout=self.timeout,
                                              proxies=proxy_record.proxy)
            time.sleep(self.sleep_interval)
            consecutive_checks.append(int(content is not None))

        # Health is the fraction of successful checks; the proxy counts as valid
        # only if more than two thirds of them succeeded.
        health = sum(consecutive_checks) / self.checks
        proxy_status = ProxyStatus(proxy=proxy_record.proxy,
                                   health=health,
                                   is_valid=health > 0.66)
        self.logger.info(f"Proxy status: {proxy_status}")
        return proxy_status
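Because this version makes several timed requests per proxy, validating a large pool serially is slow. Below is a sketch, not part of the original examples, that fans the checks out over the standard library's ThreadPoolExecutor, again with placeholder URLs.

from concurrent.futures import ThreadPoolExecutor

scraper = ProxyPoolScraper("https://example.com/proxy-list")        # placeholder URL
validator = ProxyPoolValidator("https://example.com/health-check")  # hypothetical check URL

with ThreadPoolExecutor(max_workers=16) as executor:
    statuses = list(executor.map(
        validator.validate_proxy,
        scraper.get_proxy_stream(limit=100),
    ))

# Keep only proxies whose health cleared the 0.66 threshold above.
valid_proxies = [status.proxy for status in statuses if status.is_valid]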