Beispiel #1
0
 def _filling_default_idown_cmd(self, icmd: IdownCmd) -> str:
     """
     Complete a task's cmd with the default configuration before storing it.

     The default cmd string is read from ``self.get_default_idown_cmd()``,
     wrapped in an ``IdownCmd`` and passed to ``icmd.fill_defcmd``. According
     to the original author's note, the cmd is complete once ``fill_defcmd``
     has been called.

     :param icmd: the task's own (possibly partial) cmd; filled in place.
     :return: ``icmd.filled_cmd_str`` after the defaults were applied.
     """
     defaults = IdownCmd(self.get_default_idown_cmd().get("cmd"))
     icmd.fill_defcmd(defaults)
     return icmd.filled_cmd_str
Beispiel #2
0
    def _get_pause_tasks(self):
        """
        Collect paused tasks whose download switch has been turned back on.

        Queries the tasks whose ``taskstatus`` equals
        ``ETaskStatus.TemporarilyStop.value``, parses each row's ``cmd`` and
        keeps the rows whose ``switch_control.download_switch`` is 1. Rows
        without a ``switch_control`` stay paused. Rows whose cmd cannot be
        parsed are logged and skipped (the original author notes that a paused
        task is expected to carry a complete cmd, so a broken one is dropped).

        :return: list of task rows that should resume downloading; an empty
                 list when the query yields nothing.
        """
        pause_task_list = []
        pause_task = self._sqlfunc.query_task(
            SqlConditions(
                SqlCondition(
                    colname="taskstatus",
                    val=ETaskStatus.TemporarilyStop.value,
                    comb=ESqlComb.Or,
                ), ))
        # Truthiness also covers a None result, which len() would choke on.
        if not pause_task:
            return pause_task_list
        for pt in pause_task:
            try:
                # The download switch lives inside the task's cmd settings.
                cmd = IdownCmd(pt.get("cmd"))
                # No switch settings: leave the task in its paused state.
                if cmd.switch_control is None:
                    continue

                # 1 means downloading has been switched on.
                if int(cmd.switch_control.download_switch) == 1:
                    pause_task_list.append(pt)
            except Exception:
                # Only ordinary errors are swallowed, so KeyboardInterrupt and
                # SystemExit still propagate.
                self._logger.error(
                    f"Get pause task error, err:{traceback.format_exc()}")
        return pause_task_list
Beispiel #3
0
 def _get_stop_sign(self):
     """
     Poll the database for this task's download switch and update the stop flag.

     Loops while ``self._running`` is truthy. On every pass the task's cmd is
     looked up by ``batchid``/``taskid``; when the cmd carries a
     ``switch_control.download_switch`` value, ``self._stop_sign`` is set to
     ``True`` if that value is 0 and ``False`` otherwise. Errors are logged
     and the loop keeps going.

     :return: None; the result is published through ``self._stop_sign``.
     """
     sql = """
     SELECT cmdid, cmd FROM task 
     LEFT OUTER JOIN idowncmd USING (cmdid)
     WHERE batchid=? and taskid=?
     """
     pars = (self.task.batchid, self.task.taskid)
     while self._running:
         try:
             res = self._sqlfunc.query_task_by_sql(sql, pars)
             if len(res) == 0 or res[0].get("cmd") is None:
                 continue
             cmd = IdownCmd(res[0].get("cmd"))
             if (cmd.switch_control is not None
                     and cmd.switch_control.download_switch is not None):
                 self._stop_sign = int(
                     cmd.switch_control.download_switch) == 0
         except Exception:
             # Only ordinary errors are swallowed, so KeyboardInterrupt and
             # SystemExit can still end this polling thread.
             self._logger.error(
                 f"Something wrong when get stopsign,err:{traceback.format_exc()}"
             )
         finally:
             # Runs on every pass, including the `continue` above, so the
             # database is never queried more often than once per 2 seconds.
             time.sleep(2)
Beispiel #4
0
    def __init__(self):
        """
        Set up the cookie queue, collaborators, default cmd and dealing queue.
        """
        self._cookie_queue = Queue()
        self._sqlfunc = DbManager
        self._cookie_keeper = SpiderCookieKeep()
        self._logger: MsLogger = MsLogManager.get_logger("CookieKeepAlive")

        # Default configuration, used to fill in tasks with a partial cmd.
        default_row = self._sqlfunc.get_default_idown_cmd()
        self.d_cmd = IdownCmd(default_row.get("cmd"))

        # Tasks currently being processed, and the lock created alongside it.
        self._dealing_queue: dict = {}
        self._dealing_queue_locker = threading.Lock()
 def __init__(self):
     """
     Set up the dealing queue, the per-task-type spider handlers and the
     default cmd.
     """
     # Tasks currently being processed, and the lock created alongside it.
     self._spider_manage_queue_dict: dict = {}
     self._spider_manage_dealing_queue_locker = threading.Lock()
     self._logger: MsLogger = MsLogManager.get_logger("SpiderManagerAllot")

     # One handler object per kind of spider work.
     self._batch_login_test = SpiderBatchLoginTest()
     self._download_task_store = SpiderDownloadTaskStore()
     self._login_only = SpiderLoginOnly()
     self._logout = SpiderLogout()
     self._online_check = SpiderOnlineCheck()
     self._register_check = SpiderRegisterCheck()
     self._store_vercode = SpiderStoreInput()

     # Default configuration.
     default_row = DbManager.get_default_idown_cmd()
     self.defcmd: IdownCmd = IdownCmd(default_row.get("cmd"))
Beispiel #6
0
    def _task_usercfg_filter(self, l_e_time: int, cmdstr: str):
        """
        Decide from the user's settings whether a task is due for another
        cyclic download.

        The task qualifies only when all of the following hold:
        its ``stratagy.circulation_mode`` is 2, its
        ``switch_control.download_switch`` is 1, and at least
        ``stratagy.interval`` seconds have passed since ``l_e_time``.
        The monitor switch is not consulted here, and the download time
        window is (per the original author's note) checked when the task is
        fetched.

        :param l_e_time: unix timestamp of the previous run.
        :param cmdstr: the task's cmd string; ``None`` or ``""`` means the
                       default cmd ``self.d_cmd`` is used as-is.
        :return: ``True`` when the task should be downloaded again.
        """
        now_unix = int(
            datetime.now(pytz.timezone("Asia/Shanghai")).timestamp())

        if cmdstr is not None and cmdstr != "":
            # The task's own settings may be partial; fill in the defaults.
            cmd: IdownCmd = IdownCmd(cmdstr)
            cmd.fill_defcmd(self.d_cmd)
        else:
            # No cmd on the task: use the default configuration.
            cmd = self.d_cmd

        # Only cyclic tasks (mode 2) are downloaded repeatedly.
        if int(cmd.stratagy.circulation_mode) != 2:
            return False

        # Downloading must be switched on.
        if int(cmd.switch_control.download_switch) != 1:
            return False

        # Due once the configured interval has elapsed since the last run.
        return now_unix - int(l_e_time) >= int(cmd.stratagy.interval)
Beispiel #7
0
    def __init__(self):
        """
        Set up the task queues, concurrency setting, downloader, dealing
        queue and default cmd.
        """
        self._sqlfunc = DbManager

        # Separate priority queues for new tasks and for cyclic tasks.
        self._new_task_queue = PriorityQueue()
        self._cycle_task_queue = PriorityQueue()

        # Number of tasks allowed to run concurrently, taken from the config.
        self._concur_num = clienttaskconfig.concurrent_number

        self._spider_download = SpiderDownload()
        self._logger: MsLogger = MsLogManager.get_logger("TaskDownload")

        # Tasks currently being processed, and the lock created alongside it.
        self._dealing_queue: dict = {}
        self._dealing_queue_locker = threading.Lock()

        # Default configuration, used to fill in tasks with a partial cmd.
        default_row = self._sqlfunc.get_default_idown_cmd()
        self.d_cmd = IdownCmd(default_row.get("cmd"))
Beispiel #8
0
    def _judge_task_to_queue(self, task_dict):
        """
        判断task中的队列是否满足cookie保活的条件
        :return:
        """
        res = False
        cookiealive = task_dict.get("cookiealive")
        cookielastkeeptime = task_dict.get("cookielastkeeptime")
        # 查看是否满足cookie保活状态, 这个条件得待定,万一以后登录更新了cookie
        # 但是cookie保活状态没有更新那么也应该尝试下,但是应该打一句日志
        # cookie 已经失活就不去更新了
        # if int(cookiealive) != 1:
        # self._logger.info('Cookie has already lost effectiveness, may be cookie has been update,so try again')
        # return res
        cmd_str = task_dict.get("cmd")
        if cmd_str is None:
            cmd: IdownCmd = self.d_cmd
        else:
            cmd: IdownCmd = IdownCmd(cmd_str)
            cmd.fill_defcmd(self.d_cmd)
        # 1、修改流程,如果任务没开监控那么就不去进行cookie保活
        if int(cmd.switch_control.monitor_switch) != 1:
            return False
        # 2、开了监控,并且是第一次进行cookie保活
        if cookiealive is None and cookielastkeeptime is None:
            return True

        # 3、如果cookie已经失活那么就不再进入保活队列
        if cookiealive is not None and int(cookiealive) != 1:
            return False

        # 这里如果任务不是一个循环任务那么就不需要进行cookie保活
        # if int(cmd.stratagy.circulation_mode) != 2:
        #   return res
        # 4、最后判断下如果到了保活时间那么就开始进行保活
        unixtime_now = int(
            datetime.now(pytz.timezone("Asia/Shanghai")).timestamp())
        if unixtime_now - int(cookielastkeeptime) >= int(
                cmd.stratagy.cookie_keeplive):
            res = True
        return res
Beispiel #9
0
 def _taskparse(self, data: dict, file_suffix: str):
     """
     用于判断不同的任务类型
     目前有:idowntask
             idowncmd
     后面应该根据文件的后缀来判断文件类型
     所以后面这个方法要改
     add by judy 2019/06/11
     :param data:
     :return:
     """
     if file_suffix is None:
         raise Exception(
             "To distinguish file types, file_suffix can not be None.")
     # 单独处理an_dns的数据 add by judy 2020/03/04
     # if file_suffix == 'an_dns_client':
     #     return DnsData(data)
     # 初始化数据要增加clientid,这样无论是task,和idowncmd就会有clientid了
     data['clientid'] = basic_client_config.clientid
     if file_suffix == 'idown_task':
         return Task(data)
     elif file_suffix == 'idown_cmd':
         return IdownCmd.parse_from_dict(data)
     elif file_suffix == 'iscan_task':
         return IscanTask(data)
     elif file_suffix == 'iscout_task':
         return IscoutTask(data)
     elif file_suffix == 'automated_task':
         return AutomatedTask.create_from_dict(data)
     # -------------------------------------这些东西目前改了,用后缀来判断任务类型,这样更准确些
     # if data.get('taskid') is not None:
     #     taskid 不为空目前一定是task
     # return Task(data)
     # elif data.get('taskid') is None and data.get('cmdid') is not None:
     #     没有taskid但是有cmdid
     # return IdownCmd.parse_from_dict(data)
     else:
         # 希望别走到这,走到这就说明这个任务解析错了
         self._logger.error('Unkown task type')
Beispiel #10
0
    def _process_task_execution_time(self, q_task: dict):
        """
        处理刚从数据库查出数据,检测任务是否在有效期
        和任务是否满足在执行时间段
        :param q_task:
        :return:
        """
        q_cmdid = q_task.get("cmdid")
        is_effective = True
        if q_cmdid is None or q_cmdid == "":
            # 这里的true表示默认的任务时没有任务活性和时间限制的
            # 目前来看是这样,如果使用时出问题再修改吧,judy190603
            return True
        cmd = IdownCmd(q_task.get("cmd"))
        # 因为任务里面保存的设置可能不是完整的,所以需要使用默认设置补齐
        cmd.fill_defcmd(self.d_cmd)
        # 每个任务都去判断一下并发数
        if int(cmd.stratagy.concur_num) != self._concur_num:
            self._concur_num = int(cmd.stratagy.concur_num)
        # 统一使用东8区的时间
        beijing = pytz.timezone("Asia/Shanghai")
        now_datetime = datetime.now(beijing)
        now_time = now_datetime.time()
        try:
            if cmd.stratagy.time_start is not None:
                task_time_start = datetime.strptime(cmd.stratagy.time_start,
                                                    "%Y-%m-%d %H:%M:%S")
                if now_datetime >= beijing.localize(task_time_start):
                    is_effective = True
                else:
                    return False
            if cmd.stratagy.time_end is not None:
                task_time_end = datetime.strptime(cmd.stratagy.time_end,
                                                  "%Y-%m-%d %H:%M:%S")
                if now_datetime <= beijing.localize(task_time_end):
                    is_effective = True
                else:
                    return False
            # ----------------------------------------------上面的判断为任务是否在有效时间
            if len(cmd.stratagy.period) == 0:
                return is_effective
            # 这里如果没有时间限制那么应该直接返回true

            for t_p in cmd.stratagy.period:
                t_p_list = t_p.split("-")
                # 处理界面发错了,列表里有个"null"
                try:
                    if (datetime.strptime(t_p_list[0], "%H:%M:%S").time() <=
                            now_time <= datetime.strptime(
                                t_p_list[1], "%H:%M:%S").time()):
                        is_effective = True
                        break
                    else:
                        is_effective = False
                except Exception:
                    is_effective = True
            # ---------------------------------------------上面为判断任务是否在执行时间内

        except:
            self._logger.error(
                f"Determine the effective and execution time of the task error, err:{traceback.format_exc()}"
            )
        return is_effective