Ejemplo n.º 1
0
    def run(self):
        """
        Collect proxies from the static proxy file and from every crawler.

        Returns immediately when ``self.is_full()`` is true. Otherwise every
        non-comment line of ``staticproxy.txt`` that contains
        ``[username:password@]ip:port`` is turned into a ``Proxy`` and handed
        to ``self.redis.add``; afterwards each proxy yielded by the crawlers
        in ``self.crawlers`` is added the same way.
        :return: None
        """
        if self.is_full():
            return
        proxyfile = "staticproxy.txt"
        # compiled once rather than once per line; the port needs >= 1 digit
        pattern = re.compile(
            r'((?P<username>\S*?)\:(?P<password>\S*?)@)?(?P<ip>[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3})\:(?P<port>\d+)'
        )
        try:
            with open(proxyfile, 'r') as fh:
                proxylines = fh.readlines()
        except FileNotFoundError:
            # a missing static list must not prevent the crawlers from running
            logger.warning(f'{proxyfile} not found, skip static proxies')
            proxylines = []
        else:
            logger.info(f'read {proxyfile}')
        for line in proxylines:
            line = line.strip()
            # skip blank lines and comments, including indented comments
            if not line or line.startswith("#"):
                continue
            match = pattern.search(line)
            if match is None:
                continue
            proxy = Proxy(host=match.group('ip'),
                          port=match.group('port'),
                          username=match.group('username'),
                          password=match.group('password'))
            logger.info("getproxy " + proxy.string())
            self.redis.add(proxy)

        for crawler in self.crawlers:
            logger.info(f'crawler {crawler} to get proxy')
            for proxy in crawler.crawl():
                print(proxy.string())
                self.redis.add(proxy)
Ejemplo n.º 2
0
 def max(self, proxy: Proxy) -> int:
     """
     Set the score of a proxy to PROXY_SCORE_MAX.
     :param proxy: proxy whose score is raised
     :return: value returned by the underlying ``zadd`` call
     """
     member = proxy.string()
     logger.info(f'{member} is valid, set to {PROXY_SCORE_MAX}')
     # IS_REDIS_VERSION_2 selects the positional (key, score, member) call;
     # otherwise zadd receives a {member: score} mapping
     if not IS_REDIS_VERSION_2:
         return self.db.zadd(REDIS_KEY, {member: PROXY_SCORE_MAX})
     return self.db.zadd(REDIS_KEY, PROXY_SCORE_MAX, member)
Ejemplo n.º 3
0
 def max(self, proxy: Proxy) -> int:
     """
     Raise the score of a proxy to the maximum (PROXY_SCORE_MAX).
     :param proxy: proxy to update
     :return: value returned by the underlying ``zadd`` call
     """
     target = proxy.string()
     logger.info(f'{target} is valid, set to {PROXY_SCORE_MAX}')
     # the argument layout of zadd depends on IS_REDIS_VERSION_2
     if IS_REDIS_VERSION_2:
         zadd_args = (PROXY_SCORE_MAX, target)
     else:
         zadd_args = ({target: PROXY_SCORE_MAX},)
     return self.db.zadd(REDIS_KEY, *zadd_args)
Ejemplo n.º 4
0
 def max(self, proxy: Proxy) -> int:
     """
     Mark a proxy as valid by storing it with PROXY_SCORE_MAX.
     :param proxy: proxy to update
     :return: value returned by the underlying ``zadd`` call
     """
     proxy_str = proxy.string()
     logger.info(f'{proxy_str} is valid, set to {PROXY_SCORE_MAX}')
     if IS_REDIS_VERSION_2:
         # positional form: key, score, member
         outcome = self.db.zadd(REDIS_KEY, PROXY_SCORE_MAX, proxy_str)
     else:
         # mapping form: key, {member: score}
         outcome = self.db.zadd(REDIS_KEY, {proxy_str: PROXY_SCORE_MAX})
     return outcome
Ejemplo n.º 5
0
 def decrease(self, proxy: Proxy) -> int:
     """
     decrease score of proxy by 1; if the new score is not larger than
     PROXY_SCORE_MIN, delete the proxy
     :param proxy: proxy
     :return: new score as read back with ``zscore`` after the decrement
     """
     # the argument order of zincrby depends on IS_REDIS_VERSION_2
     if IS_REDIS_VERSION_2:
         self.db.zincrby(REDIS_KEY, proxy.string(), -1)
     else:
         self.db.zincrby(REDIS_KEY, -1, proxy.string())
     score = self.db.zscore(REDIS_KEY, proxy.string())
     logger.info(f'{proxy.string()} score decrease 1, current {score}')
     # remove only when the score has reached the minimum; a None score is
     # routed to the removal branch as well so it is never compared to an int
     if score is None or score <= PROXY_SCORE_MIN:
         logger.info(f'{proxy.string()} current score {score}, remove')
         self.db.zrem(REDIS_KEY, proxy.string())
     return score
Ejemplo n.º 6
0
 def add(self, proxy: Proxy, score=PROXY_SCORE_INIT):
     """
     Add a proxy with the given score unless it is invalid or already stored.
     :param proxy: proxy to add
     :param score: score to store, PROXY_SCORE_INIT by default
     :return: result of ``zadd``; None when the proxy is invalid or exists
     """
     address = f'{proxy.host}:{proxy.port}'
     if not is_valid_proxy(address):
         logger.info(f'invalid proxy {proxy}, throw it')
         return
     if self.exists(proxy):
         # already stored: leave its score untouched
         return
     member = proxy.string()
     if IS_REDIS_VERSION_2:
         return self.db.zadd(REDIS_KEY, score, member)
     return self.db.zadd(REDIS_KEY, {member: score})
Ejemplo n.º 7
0
 def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int:
     """
     Store a new proxy with its initial score.
     :param proxy: proxy to add; validated as ``host:port``
     :param score: score to store, PROXY_SCORE_INIT by default
     :return: result of ``zadd``; None when the proxy is invalid or exists
     """
     host_port = f'{proxy.host}:{proxy.port}'
     if not is_valid_proxy(host_port):
         logger.info(f'invalid proxy {proxy}, throw it')
         return
     already_stored = self.exists(proxy)
     if not already_stored:
         key = proxy.string()
         if not IS_REDIS_VERSION_2:
             return self.db.zadd(REDIS_KEY, {key: score})
         return self.db.zadd(REDIS_KEY, score, key)
Ejemplo n.º 8
0
 def exists(self, proxy: Proxy) -> bool:
     """
     Check whether a proxy is stored under REDIS_KEY.
     :param proxy: proxy to look up
     :return: True when ``zscore`` returns a value for the proxy, else False
     """
     stored_score = self.db.zscore(REDIS_KEY, proxy.string())
     return stored_score is not None
Ejemplo n.º 9
0
 def exists(self, proxy: Proxy) -> bool:
     """
     Tell whether the proxy is present in the sorted set.
     :param proxy: proxy to look up
     :return: False when ``zscore`` returns None for the proxy, else True
     """
     member = proxy.string()
     return self.db.zscore(REDIS_KEY, member) is not None
Ejemplo n.º 10
0
 def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int:
     """
     Add a proxy to redis and set its initial score.
     :param proxy: proxy to add; validated as ``host:port``
     :param score: score to store, PROXY_SCORE_INIT by default
     :type score: int
     :return: result of ``zadd``; None when the proxy is invalid
     """
     candidate = f'{proxy.host}:{proxy.port}'
     if not is_valid_proxy(candidate):
         logger.info(f'invalid proxy {proxy}, throw it')
         return
     # NOTE: there is no existence check here, so zadd is issued even when
     # the proxy is already in the sorted set
     member = proxy.string()
     if not IS_REDIS_VERSION_2:
         return self.db.zadd(REDIS_KEY, {member: score})
     return self.db.zadd(REDIS_KEY, score, member)
Ejemplo n.º 11
0
 def parse(self, html):
     """
     parse html file to get proxies

     Each proxy is read from the text that follows a ``<br>`` element
     inside ``.cont``; text without an ``ip:port`` pair is skipped.
     :param html: page source
     :return: generator of Proxy
     """
     import re  # local import keeps this block self-contained

     # capture the whole port instead of a fixed number of characters
     pattern = re.compile(r'(\d+\.\d+\.\d+\.\d+):(\d+)')
     doc = pq(html)
     for br in doc('.cont br').items():
         line = br[0].tail
         if not line:
             # a <br> with nothing after it has no tail text
             continue
         match = pattern.search(line)
         if match is None:
             continue
         yield Proxy(host=match.group(1), port=match.group(2))
Ejemplo n.º 12
0
 def parse(self, html):
     """
     Parse a JSON document and yield one Proxy per listed entry.

     Only top-level keys containing ``'proxy'`` are read; each entry under
     such a key supplies ``ip`` and ``port``.
     :param html: JSON text
     :return: generator of Proxy
     """
     payload = json.loads(html)
     for name in payload:
         if 'proxy' not in name:
             continue
         for entry in payload[name]:
             yield Proxy(host=entry['ip'], port=int(str(entry['port'])))
Ejemplo n.º 13
0
 def decrease(self, proxy: Proxy) -> int:
     """
     Lower the score of a proxy by one, or remove the proxy when its
     current score is missing, zero, or not above PROXY_SCORE_MIN.
     :param proxy: proxy to penalise
     :return: result of ``zincrby`` when decremented, of ``zrem`` when removed
     """
     member = proxy.string()
     current = self.db.zscore(REDIS_KEY, member)
     # removal branch first: no usable score, or score not above the minimum
     if not current or not current > PROXY_SCORE_MIN:
         logger.info(f'{member} current score {current}, remove')
         return self.db.zrem(REDIS_KEY, member)
     logger.info(f'{member} current score {current}, decrease 1')
     if IS_REDIS_VERSION_2:
         return self.db.zincrby(REDIS_KEY, member, -1)
     return self.db.zincrby(REDIS_KEY, -1, member)
Ejemplo n.º 14
0
 def decrease(self, proxy: Proxy) -> int:
     """
     Decrease the score of a proxy; delete it when the score is missing,
     zero, or not larger than PROXY_SCORE_MIN.
     :param proxy: proxy
     :return: result of ``zincrby`` when decremented, of ``zrem`` when removed
     """
     proxy_str = proxy.string()
     score = self.db.zscore(REDIS_KEY, proxy_str)
     still_usable = bool(score) and score > PROXY_SCORE_MIN
     if not still_usable:
         # current score is not above the minimum
         logger.info(f'{proxy_str} current score {score}, remove')
         return self.db.zrem(REDIS_KEY, proxy_str)
     # current score is above the minimum
     logger.info(f'{proxy_str} current score {score}, decrease 1')
     if not IS_REDIS_VERSION_2:
         return self.db.zincrby(REDIS_KEY, -1, proxy_str)
     return self.db.zincrby(REDIS_KEY, proxy_str, -1)
Ejemplo n.º 15
0
 def parse(self, html):
     """
     Parse html and yield one Proxy per element matched by the selector.
     :param html: page source
     :return: generator of Proxy
     """
     # NOTE(review): the selector ends at <table>, not at table rows -
     # confirm against the target page that this is intended
     matched = pq(html)('#main > div > div:nth-child(1) > table ')
     for node in matched.items():
         host_text = node.find('td:nth-child(1)').text()
         port_text = node.find('td:nth-child(2)').text()
         yield Proxy(host=host_text, port=int(port_text))
Ejemplo n.º 16
0
    def parse(self, html):
        """
        Parse html and yield one Proxy per table row.

        The first cell of a row is used as host, the second as integer port.
        :param html: page source
        :return: generator of Proxy
        """
        rows = pq(html)('div.layui-form > table > tbody > tr')
        for row in rows.items():
            host_text = row.find('td:nth-child(1)').text()
            port_text = row.find('td:nth-child(2)').text()
            yield Proxy(host=host_text, port=int(port_text))
Ejemplo n.º 17
0
 def decrease(self, proxy: Proxy) -> int:
     """
     decrease score of proxy, if not larger than PROXY_SCORE_MIN, delete it
     :param proxy: proxy
     :return: result of ``zincrby`` when decremented, of ``zrem`` when removed
     """
     member = proxy.string()
     score = self.db.zscore(REDIS_KEY, member)
     # current score is larger than PROXY_SCORE_MIN
     if score and score > PROXY_SCORE_MIN:
         logger.info(f'{member} current score {score}, decrease 1')
         if IS_REDIS_VERSION_2:
             return self.db.zincrby(REDIS_KEY, member, -1)
         return self.db.zincrby(REDIS_KEY, -1, member)
     # otherwise delete proxy; the member is the same string form used by
     # zscore/zincrby above, not the Proxy object itself
     logger.info(f'{member} current score {score}, remove')
     return self.db.zrem(REDIS_KEY, member)
Ejemplo n.º 18
0
 def parse(self, html):
     """
     parse html file to get proxies

     Every table row after the first inside ``.containerbox`` is read: the
     first cell is used as host, the second as integer port.
     :param html: page source
     :return: generator of Proxy
     """
     doc = pq(html)
     # :gt(0) skips the first row; .items() iterates the matched rows
     trs = doc('.containerbox table tr:gt(0)').items()
     for tr in trs:
         host = tr.find('td:nth-child(1)').text()
         port = int(tr.find('td:nth-child(2)').text())
         yield Proxy(host=host, port=port)
Ejemplo n.º 19
0
 def parse(self, html):
     """
     parse html file to get proxies

     Each proxy is read from the text that follows a ``<br>`` element
     inside ``.cont``; text without an ``ip:port`` pair is skipped.
     :param html: page source
     :return: generator of Proxy
     """
     # compiled once for the whole page
     pattern = re.compile(r'(\d+\.\d+\.\d+\.\d+):(\d+)')
     doc = pq(html)
     for br in doc('.cont br').items():
         line = br[0].tail
         if not line:
             # a <br> with nothing after it has no tail text to search
             continue
         match = pattern.search(line)
         if match:
             yield Proxy(host=match.group(1), port=match.group(2))
Ejemplo n.º 20
0
 def parse(self, html):
     """
     Parse html and yield a Proxy for each ``tr`` with class ``odd``.

     The second cell is used as host and the third as port; rows where
     either text is empty are skipped.
     :param html: page source
     :return: generator of Proxy
     """
     rows = pq(html)('tr[class="odd"]')
     for row in rows.items():
         host_text = row.find('td:nth-child(2)').text()
         port_text = row.find('td:nth-child(3)').text()
         if not (host_text and port_text):
             continue
         yield Proxy(host=host_text, port=port_text)
Ejemplo n.º 21
0
 def parse(self, html):
     """
     Parse html and yield one Proxy per row of ``.layui-table``.

     Host and port are the texts of the first and second ``td div``
     elements found in the row.
     :param html: page source
     :return: generator of Proxy
     """
     for row in pq(html)('.layui-table tbody tr ').items():
         host_text = row.find('td div')[0].text
         port_text = row.find('td div')[1].text
         yield Proxy(host=host_text, port=port_text)
Ejemplo n.º 22
0
 def parse(self, html):
     """
     Parse html and yield one Proxy per table row inside ``.containerbox``.

     Rows are selected with ``tr:gt(0)``; the first cell is used as host,
     the second as integer port.
     :param html: page source
     :return: generator of Proxy
     """
     rows = pq(html)('.containerbox table tr:gt(0)')
     for row in rows.items():
         host_text = row.find('td:nth-child(1)').text()
         port_text = row.find('td:nth-child(2)').text()
         yield Proxy(host=host_text, port=int(port_text))
Ejemplo n.º 23
0
 def parse(self, html):
     """
     Parse html and yield one Proxy per ``.wlist ul.l2`` element.

     The first ``span`` is used as host, the second as integer port.
     :param html: page source
     :return: generator of Proxy
     """
     entries = pq(html)('.wlist ul.l2')
     for entry in entries.items():
         host_text = entry.find('span:first-child').text()
         port_text = entry.find('span:nth-child(2)').text()
         yield Proxy(host=host_text, port=int(port_text))
Ejemplo n.º 24
0
 def parse(self, html):
     """
     parse html file to get proxies

     The first two ``<td>`` cells of every ``<tr>`` are used as host and
     integer port.
     :param html: page source
     :return: generator of Proxy
     """
     # raw string so \s reaches the regex engine instead of being an invalid
     # string escape; \s* also spans line breaks between the tags
     ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
     for address, port in ip_address.findall(html):
         yield Proxy(host=address.strip(), port=int(port.strip()))
Ejemplo n.º 25
0
def convert_proxy_or_proxies(data):
    """
    Convert a proxy string, or a list of proxy strings, into Proxy objects.

    Surrounding whitespace is removed from every entry before it is
    validated and split into host and port.
    :param data: ``host:port`` string, or list of such strings
    :type data: list or str
    :return: list of Proxy for list input (invalid entries are skipped),
        a single Proxy for a valid string, otherwise None
    """
    if not data:
        return None
    if isinstance(data, list):
        result = []
        for item in data:
            # str.strip returns a new string, so the result must be kept
            item = item.strip()
            if not is_valid_proxy(item):
                continue
            host, port = item.split(':')
            result.append(Proxy(host=host, port=port))
        return result
    if isinstance(data, str):
        # validate and split the same, stripped value
        data = data.strip()
        if is_valid_proxy(data):
            host, port = data.split(':')
            return Proxy(host=host, port=port)
    return None
Ejemplo n.º 26
0
 def parse(self, html):
     """
     Parse html and yield one Proxy per ``#ip_list .tr.ip_tr`` element.

     The texts of the first and second ``div`` children are used as host
     and port.
     :param html: page source
     :return: generator of Proxy
     """
     rows = pq(html)('#ip_list .tr.ip_tr')
     for row in rows.items():
         host_text = row.find('div:nth-child(1)').text()
         port_text = row.find('div:nth-child(2)').text()
         yield Proxy(host=host_text, port=port_text)
Ejemplo n.º 27
0
 def parse(self, html):
     """
     Parse html and yield a Proxy for each table row that has both an
     ``IP`` and a ``PORT`` cell (matched by ``data-title``).
     :param html: page source
     :return: generator of Proxy
     """
     rows = pq(html)('table tr')
     for row in rows.items():
         host_text = row.find('td[data-title="IP"]').text()
         port_text = row.find('td[data-title="PORT"]').text()
         if not (host_text and port_text):
             # rows without both cells are skipped
             continue
         yield Proxy(host=host_text, port=port_text)
Ejemplo n.º 28
0
 def parse(self, html):
     """
     parse html file to get proxies

     The text of ``.cont`` is read line by line. On each line the part
     before the first ``@`` (the whole line when there is none) must be
     ``host:port`` with a numeric port; other lines are skipped.
     :param html: page source
     :return: generator of Proxy
     """
     doc = pq(html)
     contents = doc('.cont').text()
     for content in contents.split("\n"):
         # partition keeps the full text when "@" is absent
         address = content.partition("@")[0].strip()
         host, sep, port = address.partition(":")
         port = port.strip()
         if not sep or not host or not port.isdecimal():
             # blank or non-proxy line: skip it rather than abort the page
             continue
         yield Proxy(host=host, port=int(port))
Ejemplo n.º 29
0
    def parse(self, html):
        """
        Parse html and yield one Proxy per text node found in the first
        cell of a ``tbody`` row.

        Each text is split at the first ``:`` into host and port; the port
        is passed on as a string.
        :param html: page source
        :return: generator of Proxy
        """
        tree = etree.HTML(html)
        for cell_text in tree.xpath("//tbody/tr/td[1]/text()"):
            host_part, _, port_part = cell_text.partition(":")
            yield Proxy(host=host_part, port=port_part)
Ejemplo n.º 30
0
 def parse(self, html):
     """
     Parse html and yield one Proxy per ``//table/tbody/tr`` row.

     The host is the first link text of the first cell with surrounding
     double quotes removed; the port is the first text of the second cell
     converted to int.
     :param html: page source
     :return: generator of Proxy
     """
     root = etree.HTML(html)
     for row in root.xpath('//table/tbody/tr'):
         host_text = row.xpath('./td[1]/a/text()')[0]
         port_text = row.xpath('./td[2]/text()')[0]
         yield Proxy(host=host_text.strip('"'), port=int(port_text.strip()))