Example #1
# Imports assumed to follow the ProxyPool project layout these snippets come from
from loguru import logger
from proxypool.storages.redis import RedisClient
from proxypool.setting import PROXY_NUMBER_MAX
from proxypool.crawlers import __all__ as crawlers_cls


class Getter(object):
    """
    getter of proxy pool
    """
    def __init__(self):
        """
        init db and crawlers
        """
        self.redis = RedisClient()
        self.crawlers_cls = crawlers_cls
        self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls]

    def is_full(self):
        """
        check if the proxy pool is full
        :return: bool
        """
        return self.redis.count() >= PROXY_NUMBER_MAX

    @logger.catch
    def run(self):
        """
        run crawlers to get proxies
        :return:
        """
        if self.is_full():
            return
        for crawler in self.crawlers:
            logger.info(f'crawling {crawler} to get proxies')
            for proxy in crawler.crawl():
                self.redis.add(proxy)
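
A minimal usage sketch, assuming the imports above (in the full project a scheduler calls run() periodically; this just fills the pool once):

if __name__ == '__main__':
    getter = Getter()
    # returns immediately if the pool is full; otherwise runs every crawler once
    getter.run()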
Example #2
# Imports assumed, as in Example #1
from loguru import logger
from proxypool.storages.redis import RedisClient
from proxypool.setting import PROXY_NUMBER_MAX
from proxypool.crawlers import __all__ as crawlers_cls


class Getter(object):
    """
    getter of proxy pool
    """

    def __init__(self):
        """
        init db and crawlers
        """
        self.redis = RedisClient()
        self.crawlers_cls = crawlers_cls
        self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls]

    def is_full(self):
        """
        check if the proxy pool is full
        :return: bool
        """
        return self.redis.count() >= PROXY_NUMBER_MAX

    @logger.catch
    def run(self):
        """
        run crawlers to get proxies
        :return:
        """
        if self.is_full():
            return
        for crawler in self.crawlers:
            for proxy in crawler.crawl():
                self.redis.add(proxy)
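
Example #2 is Example #1 without the per-crawler log line. Both rely on loguru's @logger.catch decorator, which logs any exception raised inside run() rather than letting it propagate. A standalone sketch of that behavior, separate from the proxy pool code:

from loguru import logger

@logger.catch
def flaky():
    raise ValueError('crawler failed')

flaky()  # the traceback is logged; by default the exception is not re-raised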
Example #3
# Imports assumed, as above, plus re and the Proxy schema used below
import re

from loguru import logger
from proxypool.storages.redis import RedisClient
from proxypool.schemas import Proxy
from proxypool.setting import PROXY_NUMBER_MAX
from proxypool.crawlers import __all__ as crawlers_cls


class Getter(object):
    """
    getter of proxy pool
    """
    def __init__(self):
        """
        init db and crawlers
        """
        self.redis = RedisClient()
        self.crawlers_cls = crawlers_cls
        self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls]

    def is_full(self):
        """
        check if the proxy pool is full
        :return: bool
        """
        return self.redis.count() >= PROXY_NUMBER_MAX

    @logger.catch
    def run(self):
        """
        run crawlers to get proxies
        :return:
        """
        if self.is_full():
            return
        proxyfile = "staticproxy.txt"
        with open(proxyfile, 'r') as fh:
            proxylines = fh.readlines()
        logger.info(f'read {proxyfile}')
        # matches an optional "username:password@" prefix followed by "ip:port"
        pattern = re.compile(
            r'((?P<username>\S*?)\:(?P<password>\S*?)@)?(?P<ip>[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3})\:(?P<port>\d*)'
        )
        for line in proxylines:
            # skip blank lines and comments
            if line.strip() != "" and not line.startswith("#"):
                line = line.replace("\r\n", "").replace("\n", "")
                match = pattern.search(line)
                if match:
                    groups = match.groupdict()
                    proxy = Proxy(host=groups['ip'],
                                  port=groups['port'],
                                  username=groups['username'],
                                  password=groups['password'])
                    logger.info('get static proxy ' + proxy.string())
                    self.redis.add(proxy)

        for crawler in self.crawlers:
            logger.info(f'crawling {crawler} to get proxies')
            for proxy in crawler.crawl():
                print(proxy.string())
                self.redis.add(proxy)
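
The parser in Example #3 expects one proxy per line, with optional credentials before an "@". A hypothetical staticproxy.txt that the regex above would accept (these contents are illustrative, not from the source):

# comment lines and blank lines are skipped
127.0.0.1:7890
user1:secret@203.0.113.5:8080

For lines without credentials, username and password come back as None from match.groupdict().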