Example #1
0
class SubscribesCleaner(CoroutineSpeedup):
    """解耦清洗插件:国内IP调用很可能出现性能滑坡"""
    def __init__(self, debug=False, kill_target: str = None):
        super(SubscribesCleaner, self).__init__()
        self.debug = debug
        self.keys = [REDIS_SECRET_KEY.format(s) for s in CRAWLER_SEQUENCE]
        self.rc = RedisClient().get_driver()
        self.kill_ = kill_target

    def offload_task(self):
        for key_ in self.keys:
            for sub, _ in self.rc.hgetall(key_).items():
                self.work_q.put_nowait([sub, key_])

    def _del_subs(self, key_: str, subs: str, err_: str = '') -> None:
        self.rc.hdel(key_, subs)
        logger.debug(f'>> Detach -> {subs} -- {err_}')

    def control_driver(self, sub_info: List[str]):
        """

        @param sub_info: [subs,key_secret_class]
        @return:
        """
        try:
            # 解耦指定簇
            if self.kill_ and self.kill_ in sub_info[0]:
                self._del_subs(sub_info[-1], sub_info[0], "target")

            else:
                # 解析订阅
                node_info: dict = subs2node(sub_info[0], False)
                # 打印debug信息
                if self.debug:
                    print(
                        f"check -- {node_info['subs']} -- {node_info['node'].__len__()}"
                    )
                # 订阅解耦
                if node_info['node'].__len__() <= 4:
                    self._del_subs(sub_info[-1], sub_info[0], "decouple")

        except UnicodeDecodeError or TypeError as e:
            logger.debug(
                f"Retry put the subscribe({sub_info}) to work queue -- {e}")

            # 单个链接重试3次,标记超时链接
            if self.temp_cache.get(sub_info[0]):
                self.temp_cache[sub_info[0]] += 1
            else:
                self.temp_cache[sub_info[0]] = 1
            if self.temp_cache[sub_info[0]] <= 3:
                self.work_q.put_nowait(sub_info)
            else:
                self._del_subs(sub_info[-1], sub_info[0], e)

        except SystemExit:
            logger.critical("请关闭系统代理后再执行订阅清洗操作")
        except Exception as e:
            logger.warning(f"{sub_info} -- {e}")
            self._del_subs(sub_info[-1], sub_info[0])
def detach(subscribe, beat_sync=False):
    """

    @param subscribe:
    @param beat_sync: 是否立即删除, True:立即删除,False:节拍同步,随ddt删除
    @return:
    """
    from faker import Faker
    from urllib.parse import urlparse

    # 清洗出订阅中的token
    token = urlparse(subscribe).path

    r = RedisClient().get_driver()

    # 遍历所有任务类型
    for task in CRAWLER_SEQUENCE:
        # 遍历某种类型的链接池
        for sub in r.hgetall(REDIS_SECRET_KEY.format(task)).items():
            # 匹配用户token
            if token == urlparse(sub[0]).path:
                # 若节拍同步,立即移除订阅
                if beat_sync:
                    r.hdel(REDIS_SECRET_KEY.format(task), sub[0])
                    logger.debug(f'>> Detach -> {sub[0]}')
                # 否则将订阅过期时间标记为过期,该链接将随下一波任一节点的ddt任务被删除
                else:
                    r.hset(REDIS_SECRET_KEY.format(task), sub[0],
                           str(Faker().past_datetime()))
                break
Example #3
0
class SubscribesCleaner(CoroutineSpeedup):
    """解耦清洗插件:国内IP调用很可能出现性能滑坡"""
    def __init__(self, debug=False, kill_target: str = None):
        super(SubscribesCleaner, self).__init__()
        self.debug = debug
        self.keys = [REDIS_SECRET_KEY.format(s) for s in CRAWLER_SEQUENCE]
        self.rc = RedisClient().get_driver()
        self.kill_ = kill_target

    def offload_task(self):
        for key_ in self.keys:
            for sub, _ in self.rc.hgetall(key_).items():
                self.work_q.put_nowait([sub, key_])

    def _del_subs(self, key_: str, subs: str, err_) -> None:
        self.rc.hdel(key_, subs)
        # logger.debug(f'>> Detach -> {subs} -- {err_}')
        print(Fore.BLUE, f"[{datetime.now()}] detach -> {subs} {err_}")

    def control_driver(self, sub_info: List[str], threshold: int = 4):
        """

        :param sub_info: [subs,key_secret_class]
        :param threshold: 解耦置信阈值 小于或等于这个值的订阅将被剔除
        :return:
        """
        try:
            # 针对指定订阅源进行清洗工作
            if self.kill_ and self.kill_ in sub_info[0]:
                self._del_subs(sub_info[-1], sub_info[0],
                               "target active removal")
            else:
                # 解析订阅
                node_info: dict = subs2node(sub_info[0])
                # 订阅解耦
                if node_info['node'].__len__() <= threshold:
                    self._del_subs(sub_info[-1], sub_info[0],
                                   "decouple active removal")
                elif self.debug:
                    print(
                        Fore.WHITE,
                        f"[{datetime.now()}] valid -- {node_info['subs']} -- {len(node_info['node'])}"
                    )

        except (UnicodeDecodeError, TypeError) as e:
            # 对于已标记“解析错误”的订阅 更新其请求次数
            if self.temp_cache.get(sub_info[0]):
                self.temp_cache[sub_info[0]] += 1
            # 否则标记为“解析错误”的订阅
            else:
                print(Fore.YELLOW,
                      f"[{datetime.now()}] recheck -- {sub_info[0]}")
                self.temp_cache[sub_info[0]] = 1
            # 若链接重试次数少于3次 重添加至任务队列尾部
            if self.temp_cache[sub_info[0]] <= 3:
                self.work_q.put_nowait(sub_info)
            # 若链接重试次数大于3次 剔除
            else:
                self._del_subs(sub_info[-1], sub_info[0], e)
        except SystemExit:
            warnings.warn("请关闭系统代理后部署订阅清洗任务")
        except Exception as e:
            logger.warning(f"{sub_info} -- {e}")
            self._del_subs(sub_info[-1], sub_info[0], e)
Example #4
0
class SubscribesCleaner(lsu):
    """解耦清洗插件:国内IP调用很可能出现性能滑坡"""
    def __init__(self, debug=False, kill_target: str = None):
        super(SubscribesCleaner, self).__init__()
        self.debug = debug
        self.keys = [REDIS_SECRET_KEY.format(s) for s in CRAWLER_SEQUENCE]
        self.rc = RedisClient().get_driver()
        self.kill_ = kill_target

    def offload_task(self):
        for key_ in self.keys:
            for sub, _ in self.rc.hgetall(key_).items():
                self.work_q.put_nowait([sub, key_])

    def killer(self):
        """
        @todo redis批量移除或移动hash
        @return:
        """
        if self.apollo:
            for kill_ in self.apollo:
                self.rc.hdel(kill_[0], kill_[-1])
                logger.debug(f'>> Detach -> {kill_[-1]}')

    def control_driver(self, sub_info: List[str]):
        """

        @param sub_info: [subs,key_secret_class]
        @return:
        """
        try:
            # 解耦指定簇
            if self.kill_ and self.kill_ in sub_info[0]:
                self.apollo.append([sub_info[-1], sub_info[0]])
            else:
                # 解析订阅
                node_info: dict = subs2node(sub_info[0], False)
                # 打印debug信息
                if self.debug:
                    print(
                        f"check -- {node_info['subs']} -- {node_info['node'].__len__()}"
                    )
                # 订阅解耦
                if node_info['node'].__len__() <= 3:
                    self.apollo.append([sub_info[-1], sub_info[0]])
        except UnicodeDecodeError or TypeError as e:
            logger.debug(
                f"Retry put the subscribe({sub_info}) to work queue -- {e}")

            # 单个链接重试3次,标记超时链接
            if self.temp_cache.get(sub_info[0]):
                self.temp_cache[sub_info[0]] += 1
            else:
                self.temp_cache[sub_info[0]] = 1
            if self.temp_cache[sub_info[0]] <= 3:
                self.work_q.put_nowait(sub_info)
            else:
                self.apollo.append([sub_info[-1], sub_info[0]])

        except Exception as e:
            logger.warning(f"{sub_info} -- {e}")
            self.apollo.append([sub_info[-1], sub_info[0]])
Example #5
0
class SubscribesCleaner(CoroutineSpeedup):
    """解耦清洗插件:国内IP调用很可能出现性能滑坡"""
    def __init__(self, debug=False, kill_target: str = None):
        super(SubscribesCleaner, self).__init__()
        self.debug = debug
        self.keys = [REDIS_SECRET_KEY.format(s) for s in CRAWLER_SEQUENCE]
        self.rc = RedisClient().get_driver()
        self.kill_ = kill_target

    def offload_task(self):
        for key_ in self.keys:
            try:
                for sub, _ in self.rc.hgetall(key_).items():
                    self.work_q.put_nowait([sub, key_])
            except redis_error.ResponseError:
                logger.critical("Link pool is broken down.")

    def _del_subs(self, key_: str, subs: str, err_) -> None:
        try:
            self.rc.hdel(key_, subs)
            terminal_echo(f"detach -> {subs} {err_}", 3)
        except redis_error.ConnectionError:
            logger.critical(
                "<SubscribeCleaner> The local network communication is abnormal."
            )

    def control_driver(self, sub_info: List[str], threshold: int = 4):
        """

        :param sub_info: [subs,key_secret_class]
        :param threshold: 解耦置信阈值 小于或等于这个值的订阅将被剔除
        :return:
        """
        super(SubscribesCleaner, self).control_driver(task=sub_info)
        try:
            # 针对指定订阅源进行清洗工作
            if self.kill_ and self.kill_ in sub_info[0]:
                self._del_subs(sub_info[-1], sub_info[0],
                               "target active removal")
            else:
                # 解析订阅
                node_info: dict = subs2node(sub_info[0])
                # 订阅解耦
                if node_info['node'].__len__() <= threshold:
                    self._del_subs(sub_info[-1], sub_info[0],
                                   "decouple active removal")
                elif self.debug:
                    terminal_echo(
                        f"valid -- {node_info['subs']} -- {len(node_info['node'])}",
                        1)
        except (UnicodeDecodeError, TypeError) as e:
            # 对于已标记“解析错误”的订阅 更新其请求次数
            if self.temp_cache.get(sub_info[0]):
                self.temp_cache[sub_info[0]] += 1
            # 否则标记为“解析错误”的订阅
            else:
                terminal_echo(f"recheck -- {sub_info[0]}", 2)
                self.temp_cache[sub_info[0]] = 1
            # 若链接重试次数少于3次 重添加至任务队列尾部
            if self.temp_cache[sub_info[0]] <= 3:
                self.work_q.put_nowait(sub_info)
            # 若链接重试次数大于3次 剔除
            else:
                self._del_subs(sub_info[-1], sub_info[0], e)
        except SystemExit:
            warnings.warn("请关闭系统代理后部署订阅清洗任务")
        except Exception as e:
            logger.warning(f"{sub_info} -- {e}")
            self._del_subs(sub_info[-1], sub_info[0], e)

    def killer(self):
        if not self.debug:
            logger.success("<SubscribesCleaner> --> decouple compete.")