Exemple #1
0
    def __init__(
        self,
        stddatatype: EStandardDataType,
        platform: str,
        cmdstatus: ECommandStatus,
        cmdrcvmsg: str = None,
        currenttime: str = None,
    ):
        """Initialise both output base classes and store the command status.

        :param stddatatype: data type forwarded to ``OutputData.__init__``.
        :param platform: platform value forwarded to ``OutputData.__init__``.
        :param cmdstatus: an ``ECommandStatus`` member, or a value accepted by
            ``ECommandStatus(int(cmdstatus))``. Anything that cannot be
            converted leaves the stored status as ``None``.
        :param cmdrcvmsg: optional message; kept only when it is a non-empty
            string, otherwise ``None`` is stored.
        :param currenttime: optional timestamp string; when it is ``None`` or
            empty, the current Asia/Shanghai time formatted as
            ``%Y-%m-%d %H:%M:%S`` is stored instead.
        """
        OutputData.__init__(self, platform, stddatatype)
        OutputDataSeg.__init__(self)

        # Resolve the status: accept enum members directly, otherwise try a
        # best-effort conversion through int().
        status = None
        if isinstance(cmdstatus, ECommandStatus):
            status = cmdstatus
        else:
            try:
                status = ECommandStatus(int(cmdstatus))
            except Exception:
                # Unconvertible input is tolerated; the status stays None.
                status = None
        self._cmdstatus: ECommandStatus = status

        has_message = isinstance(cmdrcvmsg, str) and cmdrcvmsg != ''
        self._cmdrcvmsg: str = cmdrcvmsg if has_message else None

        # Current time as a formatted string; a caller-supplied value wins.
        shanghai_now = datetime.now(pytz.timezone('Asia/Shanghai'))
        now_text = shanghai_now.strftime('%Y-%m-%d %H:%M:%S')
        use_given = currenttime is not None and currenttime != ''
        self._time: str = currenttime if use_given else now_text
Exemple #2
0
    def __init__(self, task: IscoutTask, platform: str, url: str, source: str,
                 rsctp: EResourceType):
        """Validate the arguments and initialise the resource/output bases.

        :param task: the owning ``IscoutTask``.
        :param platform: platform string forwarded to ``OutputData.__init__``.
        :param url: resource URL forwarded to ``Resource.__init__``.
        :param source: source string stored on the instance.
        :param rsctp: resource type forwarded to ``Resource.__init__``.
        :raises Exception: when ``task`` is not an ``IscoutTask`` or when
            ``platform``, ``url`` or ``source`` is not a ``str``.
        """
        # Checked in order, so the first offending argument is the one reported.
        requirements = (
            (task, IscoutTask, "Invalid IscoutTask"),
            (platform, str, "Invalid platform"),
            (url, str, "Invalid url"),
            (source, str, "Invalid source"),
        )
        for value, expected_type, message in requirements:
            if not isinstance(value, expected_type):
                raise Exception(message)

        Resource.__init__(self, url, rsctp)
        OutputData.__init__(
            self,
            platform,
            EStandardDataType.IScoutNetworkResource,
        )
        OutputDataSeg.__init__(self)

        self._task: IscoutTask = task
        self._source: str = source

        # Descriptive fields start unset.
        self.resourceid: str = None
        self.filename: str = None
        self.extension: str = None
        self.remark: str = None

        # No stream is attached at construction time.
        self.stream = None
Exemple #3
0
    def __init__(self, task: IscoutTask, level: int, parentobj: str,
                 parentobjtype: EObjectType, url: str, keyword: str):
        """Validate the arguments and initialise the output base classes.

        :param task: the owning ``IscoutTask``; its ``platform`` is forwarded
            to ``OutputData.__init__``.
        :param level: stored as given; it is not validated here.
        :param parentobj: stored as given; it is not validated here.
        :param parentobjtype: must be an ``EObjectType``.
        :param url: must be a non-empty string.
        :param keyword: must be a non-empty string.
        :raises Exception: when ``task``, ``parentobjtype``, ``url`` or
            ``keyword`` fails its check.
        """
        if not isinstance(task, IscoutTask):
            raise Exception("Invalid iscouttask")
        if not isinstance(parentobjtype, EObjectType):
            raise Exception("Invalid parentobjtype")
        # Both text arguments must be real, non-empty strings.
        for text, message in ((url, "Invalid url"),
                              (keyword, "Invalid keyword")):
            if not (isinstance(text, str) and text != ''):
                raise Exception(message)

        OutputData.__init__(
            self,
            task.platform,
            EStandardDataType.IScoutScreenShotSE,
        )
        OutputDataSeg.__init__(self)

        self._task: IscoutTask = task
        self._level: int = level
        self._parentobj: str = parentobj
        self.parentobjtype: EObjectType = parentobjtype
        self.url = url
        self.keyword: str = keyword

        # No stream is attached at construction time.
        self._stream: io.RawIOBase = None
Exemple #4
0
    def delete_complete_file(self, succ: bool, data: OutputData):
        """Callback that deletes the file behind ``data`` once it has been read.

        Nothing is deleted unless ``data`` exposes both ``isdeleteable`` and
        ``filepath_telegram``, ``isdeleteable`` is truthy and the path is not
        ``None``. The stream returned by ``data.get_stream()`` is closed
        before the file is unlinked.

        :param succ: outcome flag supplied by the caller; not used here.
        :param data: the output data whose backing file may be removed.
            ``filepath_telegram`` must offer ``exists()`` and ``unlink()``
            (presumably a ``pathlib.Path`` -- confirm against the producer).
        :return: ``False`` only when an exception was raised while closing
            the stream or deleting the file; ``True`` in every other case,
            including when there was nothing to delete.
        """
        try:
            if not hasattr(data, "isdeleteable") or not hasattr(
                    data, "filepath_telegram"):
                return True
            if not data.isdeleteable:
                return True
            if data.filepath_telegram is None:
                self._logger.error("Telegram deleteable filepath is None")
                return True

            # Close the stream first so the file is no longer held open.
            stm = data.get_stream()
            if stm is not None and not stm.closed:
                stm.close()
            if not data.filepath_telegram.exists():
                self._logger.error(
                    "Telegram deletable file is not found: {}".format(
                        data.filepath_telegram))
                return True

            data.filepath_telegram.unlink()
        except Exception as ex:
            # Catch Exception rather than everything, so KeyboardInterrupt and
            # SystemExit still propagate, and record why the deletion failed.
            self._logger.error(
                "Delete telegram file failed: {!r}".format(ex))
            return False
        return True
Exemple #5
0
    def output_to_file(cls, data: OutputData, datastd: OutputDataConfig,
                       targetdir: str) -> bool:
        """Write ``data`` into ``targetdir`` according to ``datastd``.

        When ``data.get_stream()`` returns a readable ``io.IOBase``, a single
        header produced by ``_get_single_bs`` is written together with the
        stream. Otherwise every chunk from ``_get_mutiple_bs`` is written as
        its own file, stopping at the first chunk that fails.

        :param data: the ``OutputData`` to write.
        :param datastd: the ``OutputDataConfig`` describing the data.
        :param targetdir: destination directory passed to ``_output_to_file``.
        :return: ``True`` when every write succeeded; ``False`` on invalid
            arguments, on a failed write, or when an exception was logged.
        """
        try:
            if not isinstance(data, OutputData):
                cls._logger.error("Invalid OutputData object: {}".format(data))
                return False

            if not isinstance(datastd, OutputDataConfig):
                cls._logger.error(
                    "Invalid OutputDataStandard: {}".format(datastd))
                return False

            stream = data.get_stream()

            # Stream-backed data: one header plus the stream body.
            if isinstance(stream, io.IOBase) and stream.readable():
                header: bytes = cls._get_single_bs(data, datastd, stream)
                return cls._output_to_file(data, header, datastd, targetdir,
                                           stream)

            # Segment-only data: all() stops at the first failed chunk.
            return all(
                cls._output_to_file(data, chunk, datastd, targetdir)
                for chunk in cls._get_mutiple_bs(data, datastd))

        except Exception:
            cls._logger.error("Output data error: {} {}\n{}".format(
                data._platform, datastd._uniquename, traceback.format_exc()))
        return False
Exemple #6
0
    def __init__(self,
                 suffix,
                 datatype: EStandardDataType,
                 task: Task,
                 apptype: int,
                 clientid: str,
                 is_muti_seg: bool = False):
        """Initialise the base classes and the per-instance download state.

        :param suffix: file suffix; must be a non-empty string.
        :param datatype: data type forwarded to ``OutputData.__init__``.
        :param task: task forwarded to ``UniqueData.__init__``.
        :param apptype: application type forwarded to ``UniqueData.__init__``.
        :param clientid: client identifier; must be a non-empty string.
        :param is_muti_seg: multi-segment flag; any non-bool value is
            replaced by ``False``.
        :raises Exception: when ``clientid`` or ``suffix`` is not a non-empty
            string (``clientid`` is checked first).
        """
        UniqueData.__init__(self, task, apptype)
        OutputData.__init__(self, self._task.platform, datatype)
        OutputDataSeg.__init__(self)

        if not (isinstance(clientid, str) and clientid != ""):
            raise Exception("Invalid param 'clientid' for FeedDataBase")
        self._clientid: str = clientid

        # Timestamp in the UTC+8 (Asia/Shanghai) zone.
        shanghai = pytz.timezone('Asia/Shanghai')
        self.time = datetime.datetime.now(shanghai).strftime(
            '%Y-%m-%d %H:%M:%S')

        if not (isinstance(suffix, str) and suffix != ""):
            raise Exception("Suffix is invalid.")

        self._is_muti_seg: bool = (
            is_muti_seg if isinstance(is_muti_seg, bool) else False)

        # File suffix.
        self._suffix: str = suffix

        # Internal multi-segment data and the lock created alongside it.
        self.__innerdatas: list = []
        self.__innerdata_locker = threading.Lock()

        # Data stream downloaded from the network.
        self._io_stream = None
        # Size of the downloaded stream, used for file-size filtering.
        self.stream_length = 0
        # Free-form notes recorded during crawling.
        self.remarks = None
Exemple #7
0
    def output(self, data: OutputData, datastd: OutputDataConfig) -> bool:
        """Check ``data`` against ``datastd`` and output it.

        When ``data.get_stream()`` returns a readable ``io.IOBase``, a single
        header from ``_get_single_bs`` is output together with the stream.
        Otherwise every chunk from ``_get_mutiple_bs`` is output; a failed
        chunk marks the result as failed but the remaining chunks are still
        attempted. Afterwards ``data.on_complete(res, data)`` is invoked when
        it is callable.

        NOTE(review): the original summary calls this "asynchronous output";
        nothing in this method is asynchronous, so any asynchrony must live
        in ``_output_sub`` -- confirm.

        :param data: the data object to output.
        :param datastd: the data standard that corresponds to ``data``.
        :return: ``True`` when the output succeeded, ``False`` otherwise
            (invalid arguments, a failed chunk, or an exception raised
            before the result was determined).
        """
        res: bool = False
        try:
            if not isinstance(data, OutputData):
                self._logger.error(
                    "Invalid OutputData object: {}".format(data))
                return res

            if not isinstance(datastd, OutputDataConfig):
                self._logger.error(
                    "Invalid OutputDataStandard: {}".format(datastd))
                return res

            stm = data.get_stream()

            # isinstance() is the idiomatic type test and matches the check
            # used by the sibling output_to_file().
            if isinstance(stm, io.IOBase) and stm.readable():
                b: bytes = self._get_single_bs(data, datastd, stm)
                res = self._output_sub(data, b, datastd, stm)
            else:
                succ: bool = True
                for b in self._get_mutiple_bs(data, datastd):
                    # Keep going after a failure so later chunks still get
                    # their chance to be output.
                    if not self._output_sub(data, b, datastd):
                        succ = False
                res = succ

            if callable(data.on_complete):
                data.on_complete(res, data)

        except Exception:
            self._logger.error("Output data error: {} {}\n{}".format(
                data._platform, datastd._uniquename, traceback.format_exc()))
        return res
Exemple #8
0
    def _get_single_bs(
            cls,
            data: OutputData,
            datastd: OutputDataConfig,
            stm: io.RawIOBase,
            enc: str = "utf-8",
    ) -> bytes:
        """Build the header bytes for data that carries a stream body.

        Only the first segment returned by ``data.get_output_segs()`` is
        used: its fields are parsed with ``_parse_fields`` and serialised
        with ``_fields_to_bytes``.

        :param data: the ``OutputData`` being output.
        :param datastd: the data standard passed to ``_parse_fields``.
        :param stm: the attached stream; must not be ``None`` and must be
            readable.
        :param enc: encoding passed to ``_fields_to_bytes``.
        :return: the serialised bytes, or ``None`` when the arguments are
            invalid, there is no segment, the fields are invalid, the
            serialised result is empty, or an exception was logged.
        """
        try:
            if not isinstance(data,
                              OutputData) or stm is None or not stm.readable():
                cls._logger.error(
                    "Invalid OutputData object or stream for output single")
                return None

            for seg in data.get_output_segs():
                fields: dict = cls._parse_fields(seg, datastd)
                if not isinstance(fields, dict) or len(fields) < 1:
                    cls._logger.error(
                        "Invalid fields after check output segment fields:\nplatform:{}\ndatatype:{}"
                        .format(data._platform, data._datatype.name))
                    return None
                bs: bytes = cls._fields_to_bytes(fields, enc)
                # Test for emptiness. The previous `not any(bs)` looked for a
                # non-zero byte and so rejected payloads made only of NULs.
                if not bs:
                    return None

                # Deliberately return after the first segment.
                return bs

        except Exception:
            cls._logger.error(
                "Output single data segment error:\nplatform:{}\ndatatype:{}\nerror:{}"
                .format(data._platform, data._datatype.name,
                        traceback.format_exc()))
        # Reached when there were no segments or an error was logged.
        return None
Exemple #9
0
    def _get_mutiple_bs(
            cls,
            data: OutputData,
            datastd: OutputDataConfig,
            enc: str = "utf-8",
            maxsegcount: int = 1000,
    ) -> iter:
        """Yield the serialised segments of ``data`` in batches of bytes.

        This is a generator. Each segment from ``data.get_output_segs()`` is
        parsed with ``_parse_fields`` and serialised with
        ``_fields_to_bytes``; the results are concatenated and yielded once
        ``maxsegcount`` segments have been collected, and again at the end
        for any remainder. Segments with invalid fields or an empty
        serialisation are skipped. Errors are logged rather than raised.

        :param data: the ``OutputData`` whose segments are output.
        :param datastd: the data standard; nothing is yielded when its
            ``_enable`` flag is falsy.
        :param enc: encoding passed to ``_fields_to_bytes``.
        :param maxsegcount: number of segments per yielded batch.
        :return: an iterator of ``bytes`` batches. The generator's own return
            value (``True`` when the standard is disabled, ``False`` when a
            non-``OutputDataSeg`` item stops iteration) only surfaces on
            ``StopIteration`` and is ignored by ``for`` loops.
        """
        segcount = 0
        # A bytearray grows in place, avoiding a full copy per segment.
        segbs = bytearray()
        try:
            if not datastd._enable:
                cls._logger.debug(
                    "Data standard '{}' in platform '{}' is not enabled, data won't output"
                    .format(datastd._datatype.name, datastd.owner._platform))
                return True
            for seg in data.get_output_segs():
                try:
                    # Build the output bytes for this segment.
                    try:
                        if not isinstance(seg, OutputDataSeg):
                            cls._logger.error(
                                "Invalid OutputDataSeg object: {}".format(seg))
                            return False
                        # Check that the segment's fields are valid.
                        fields = cls._parse_fields(seg, datastd)
                        if not isinstance(fields, dict) or len(fields) < 1:
                            continue
                        bs = cls._fields_to_bytes(fields, enc)
                        # Test for emptiness. The previous `not any(bs)`
                        # looked for a non-zero byte and dropped all-NUL
                        # payloads.
                        if not bs:
                            continue

                        segbs += bs
                        segcount += 1
                    except Exception:
                        cls._logger.error(
                            "Check segment fields validation failed:\nplatform:{}\ndatatype:{}\nerror:{}"
                            .format(
                                data._platform,
                                data._datatype.name,
                                traceback.format_exc(),
                            ))

                    if segcount < maxsegcount:
                        continue

                    # The batch reached the segment limit: emit it.
                    try:
                        yield bytes(segbs)
                    except Exception:
                        cls._logger.error(
                            "Output mutiple segments error: {}".format(
                                traceback.format_exc()))
                    finally:
                        # Always start a fresh batch, even if the consumer
                        # raised into the generator.
                        segbs = bytearray()
                        segcount = 0

                except Exception:
                    cls._logger.error(
                        "Check output data segment failed:\nplatform:{}\ndatatype:{}\nerror:{}"
                        .format(data._platform, data._datatype.name,
                                traceback.format_exc()))

            # Emit whatever is left once every segment has been visited.
            if segbs:
                yield bytes(segbs)

        except Exception:
            cls._logger.error(
                "Output mutiple data segment error:\nplatform:{}\ndatatype:{}\nerror:{}"
                .format(data._platform, data._datatype.name,
                        traceback.format_exc()))
Exemple #10
0
    def _output_to_file(
        cls,
        data: OutputData,
        bs: bytes,
        datastd: OutputDataConfig,
        targetdir: str,
        stm: io.RawIOBase = None,
    ) -> bool:
        """输出到指定目录\n
        bs:要输出的数据\n
        datastd:数据对应的数据标准,用于构建文件名等\n
        targetdir:目标目录\n
        stm: 附带的数据流"""
        res: bool = False
        tmppath: str = None
        outfi: str = None
        try:
            with cls.tmpdir_locker:
                # 临时路径
                tmppath: str = cls._get_datapath(cls._tmpdir, datastd)
                if not isinstance(tmppath, str) or tmppath == "":
                    return res

                with open(tmppath, mode="wb") as fs:
                    fs.write(bs)
                    if not stm is None and stm.readable():
                        # stm.readinto(fs)
                        readlen = 1024 * 1024 * 1024
                        while True:
                            buf = stm.read(readlen)
                            if buf is None:
                                break
                            readcount = len(buf)
                            fs.write(buf)
                            if readcount < readlen:
                                break

            # 加了一个验证步骤..
            # 后面如果要搞扩展输出方式,
            # 应吧输出到临时,和输出到目标分成两个函数,
            # 在两个函数调用的中间加一个验证步骤,各自实现

            if not data.validate_file(tmppath):
                # 不打日志了,错误数据直接不输出
                cls._logger.debug("Corrupted data: {}".format(tmppath))
                if os.path.isfile(tmppath):
                    os.remove(tmppath)
                return res

            with cls.outdir_locker:
                outfi: str = cls._get_datapath(targetdir, datastd)
                shutil.move(tmppath, outfi)
            res = True
        except Exception:
            if not tmppath is None and tmppath != "" and os.path.isfile(
                    tmppath):
                os.remove(tmppath)
            if not outfi is None and outfi != "" and os.path.isfile(outfi):
                os.remove(outfi)

            cls._logger.error("Output data segments sub error: {}".format(
                traceback.format_exc()))
        return res