Example #1
0
 def to_mp3(cls,
            input_file,
            output_file,
            output_ac,
            output_ar,
            input_ac=None,
            input_ar=None,
            input_format=None,
            input_acodec=None):
     """
     Convert an audio file to mp3.

     Checks ``output_file`` with ``cls.is_mp3`` (non-strict mode) and then
     forwards every argument to ``cls._format_convert``.

     :param str input_file: path of the file to convert
     :param str output_file: path of the mp3 file to produce
     :param int output_ac: output channel count
     :param int output_ar: output sample rate
     :param int input_ac: input channel count
     :param int input_ar: input sample rate
     :param str input_format: input format
     :param str input_acodec: input codec
     :return: None
     """
     target_is_mp3 = cls.is_mp3(output_file, is_strict=False)
     Assert.is_true(target_is_mp3, '不是 mp3 拓展文件:{0}'.format(output_file))

     convert_kwargs = dict(input_file=input_file,
                           output_file=output_file,
                           input_format=input_format,
                           input_acodec=input_acodec,
                           input_ac=input_ac,
                           input_ar=input_ar,
                           output_ac=output_ac,
                           output_ar=output_ar)
     cls._format_convert(**convert_kwargs)
Example #2
0
    def lexer(self, text, ne_list=None, url=None):
        """
        Run lexical analysis on ``text`` through the remote lexer endpoint.

        :param text: text to analyse; sent as the ``text`` field of the JSON
            request body
        :param ne_list: named-entity types to keep. Result items whose ``ne``
            is not in this list are dropped. ``None`` or an empty list
            disables the filtering.
        :param url: URL template with an ``access_token`` placeholder;
            falls back to ``self.lexer_url`` when not given
        :return: the deserialized lexer result
        :rtype: LexerRes
        :raises MyError: when the HTTP status is not ``codes.ok``
        """
        # ``None`` replaces the former mutable ``[]`` default so that no list
        # object is shared between calls.
        ne_list = [] if ne_list is None else ne_list

        url = url or getattr(self, 'lexer_url', None)
        url = url.format(access_token=self.token)
        res = requests.post(url=url,
                            data=json.dumps(obj={'text': text}),
                            headers=ContentType.JSON_UTF8.value)

        # Validate the response before touching any of its attributes;
        # otherwise the ``None`` guard could never trigger.
        if res is None or res.status_code != codes.ok:
            raise MyError(code=codes.failed, msg='百度词法分析请求失败.')
        res.encoding = encodes.Unicode.UTF_8.value

        lexer_res, errors = LexerRes.LexerResSchema().load(
            res.json())  # type: LexerRes
        Assert.is_true(is_empty(errors), errors)

        if is_not_empty(ne_list):
            # Filter by entity type in a single pass. Slice assignment keeps
            # the same list object on ``lexer_res``.
            lexer_res.items[:] = [
                item for item in lexer_res.items if item.ne in ne_list
            ]
        return lexer_res
Example #3
0
 def to_wav(cls,
            input_file,
            output_file,
            output_ac,
            output_ar,
            input_ac=None,
            input_ar=None,
            input_format=None,
            input_acodec=None):
     """
     Convert an audio file to wav.

     Checks ``output_file`` with ``cls.is_wav`` (non-strict mode) and then
     forwards every argument to ``cls._format_convert``.

     :param input_file: path of the file to convert
     :param output_file: path of the wav file to produce
     :param output_ac: output channel count
     :param output_ar: output sample rate
     :param input_ac: input channel count
     :param input_ar: input sample rate
     :param input_format: input format
     :param input_acodec: input codec
     :return: None
     """
     target_is_wav = cls.is_wav(output_file, is_strict=False)
     Assert.is_true(target_is_wav, '不是 wav 拓展文件:{0}'.format(output_file))

     convert_kwargs = dict(input_file=input_file,
                           output_file=output_file,
                           input_format=input_format,
                           input_acodec=input_acodec,
                           input_ac=input_ac,
                           input_ar=input_ar,
                           output_ac=output_ac,
                           output_ar=output_ar)
     cls._format_convert(**convert_kwargs)
Example #4
0
    def get_asr_progress(self):
        """
        Query the processing progress of the current transcription task.

        Posts ``app_id``, ``signa``, ``ts`` and ``task_id`` to
        ``self.asr_progress_url``.

        :return: an ``AsrInfo``. When the HTTP request fails it is built
            locally with ``ok=codes.failed``. When the service reports
            success, its ``data`` field is replaced by the parsed
            ``AsrProgress`` details.
        """
        url = getattr(self, 'asr_progress_url', None)
        data = dict(app_id=self.app_id,
                    signa=self.signa,
                    ts=self.ts,
                    task_id=self.task_id)
        res = requests.post(url=url, data=data)

        # Check the response before using any of its attributes; the guard
        # was unreachable while ``res.encoding`` was assigned ahead of it.
        if res is None or res.status_code != codes.ok:
            return AsrInfo(
                ok=codes.failed,
                err_no=res.status_code if res is not None else codes.bad,
                failed='查询音频转写处理进度请求失败')
        res.encoding = encodes.Unicode.UTF_8.value

        progress_info, errors = AsrInfo.AsrInfoSchema().load(res.json())
        Assert.is_true(is_empty(errors), errors)
        if progress_info.ok == codes.success:
            # ``data`` arrives as a JSON string; decode it and load it into
            # the progress schema.
            progress_details, errors = AsrProgress.AsrProgressSchema().load(
                json.loads(progress_info.data))
            Assert.is_true(is_empty(errors), errors)
            progress_info.data = progress_details
        return progress_info
Example #5
0
    def __init__(self, key):
        """
        Store the AES key after checking that it is not empty.

        :param key: secret key; checked with ``is_not_empty`` before use
        """
        key_present = is_not_empty(key)
        Assert.is_true(key_present, 'AES 秘钥不能为空')
        self.key = key
Example #6
0
    def probe(cls, file_path, is_ffprobe=True):
        """
        Parse the parameters of an audio file.

        Runs ``ffmpeg.probe`` on the file, merges the first ``streams`` entry
        with the ``format`` section, adds a ``file_name`` key and loads the
        merged mapping through ``AudioSchema``.

        :param file_path: path of the audio file; must be an existing file
        :param is_ffprobe: when false, ``format_name`` of the result is
            overwritten with the extension taken from the file path
        :return: the parsed audio description
        :rtype: Audio
        """
        file_exists = os.path.isfile(file_path)
        Assert.is_true(file_exists, '文件不存在:{0}'.format(file_path))
        _dir, base_name, extension = FileUtil.get_path_name_ext(file_path)

        probed = ffmpeg.probe(file_path)
        # Start from the first stream and layer the container info on top.
        audio_json = probed.get('streams')[0]
        audio_json.update(probed.get('format'))
        audio_json['file_name'] = '{name}.{ext}'.format(name=base_name,
                                                         ext=extension)

        audio, errors = AudioSchema().load(audio_json)
        Assert.is_true(is_empty(errors), errors)

        if is_ffprobe:
            return audio
        # Caller asked for the extension instead of the probed format name.
        audio.format_name = extension
        return audio
Example #7
0
 def get_file_list(self, path):
     """
     List the entries of a remote directory.

     :param path: directory to list on the share ``self.service_name``
     :return: whatever ``self.listPath`` returns for that directory; per
         the original note the result also contains the ``.`` and ``..``
         entries
     """
     connected = self.is_success()
     Assert.is_true(assert_condition=connected, assert_msg='SMB连接失败...')
     share = self.service_name
     return self.listPath(service_name=share, path=path)
Example #8
0
 def del_file(file_path):
     """
     Delete a file.

     :param file_path: path of the file to remove; must be an existing file
     :return: None
     """
     is_regular_file = os.path.isfile(file_path)
     missing_msg = '文件不存在:{0}'.format(file_path)
     Assert.is_true(is_regular_file, missing_msg)
     os.remove(file_path)
Example #9
0
 def get_file_create_time(file_path):
     """
     Return the creation time of a file.

     :param file_path: path of the file; must be an existing file
     :return: the value of ``os.path.getctime`` for the file
     """
     is_regular_file = os.path.isfile(file_path)
     missing_msg = '文件不存在, path: {0}'.format(file_path)
     Assert.is_true(is_regular_file, missing_msg)
     created_at = os.path.getctime(file_path)
     return created_at
Example #10
0
 def get_file_size(file_path):
     """
     Return the size of a file in MB, rounded to two decimals.

     :param file_path: path of the file; must be an existing file
     :return: ``os.path.getsize`` divided by 1024 * 1024, rounded to 2 digits
     """
     is_regular_file = os.path.isfile(file_path)
     missing_msg = '文件不存在, path: {0}'.format(file_path)
     Assert.is_true(is_regular_file, missing_msg)

     bytes_per_mb = float(1024 * 1024)
     size_in_bytes = os.path.getsize(file_path)
     return round(size_in_bytes / bytes_per_mb, 2)
Example #11
0
 def get_img_pixel(path):
     """
     Return the pixel dimensions of an image.

     :param path: path of the image; must be an existing file
     :return: the ``size`` attribute of the opened image
     """
     image_exists = os.path.isfile(path)
     Assert.is_true(image_exists, '图片不存在:{0}'.format(path))
     with Image.open(path) as image:
         dimensions = image.size
     return dimensions
Example #12
0
 def get_file_access_time(file_path):
     """
     Return the last access time of a file.

     :param file_path: path of the file; must be an existing file
     :return: the value of ``os.path.getatime`` for the file
     """
     is_regular_file = os.path.isfile(file_path)
     missing_msg = '文件不存在, path: {0}'.format(file_path)
     Assert.is_true(is_regular_file, missing_msg)
     accessed_at = os.path.getatime(file_path)
     return accessed_at
Example #13
0
 def upload_file(self, local_file, remote_file):
     """
     Upload a local file to the server.

     :param local_file: path of the local file, opened for binary reading
     :param remote_file: destination path on the share ``self.service_name``
     :return: None
     """
     connected = self.is_success()
     Assert.is_true(assert_condition=connected, assert_msg='SMB连接失败...')
     with open(local_file, 'rb') as source:
         self.storeFile(service_name=self.service_name,
                        path=remote_file,
                        file_obj=source)
Example #14
0
 def down_file(self, local_file, remote_file):
     """
     Download a file from the server to a local path.

     :param local_file: path of the local file, opened for binary writing
     :param remote_file: source path on the share ``self.service_name``
     :return: None
     """
     connected = self.is_success()
     Assert.is_true(assert_condition=connected, assert_msg='SMB连接失败...')
     with open(local_file, 'wb') as target:
         self.retrieveFile(service_name=self.service_name,
                           path=remote_file,
                           file_obj=target)
Example #15
0
 def creat_dirs(dir_path):
     """
     Create a directory, including missing parents, and return its path.

     :param dir_path: directory path; surrounding whitespace is stripped
         before use. Empty and whitespace-only values are rejected.
     :return: the stripped directory path
     """
     # Validate the stripped value: a whitespace-only path used to pass the
     # check and then fail inside ``os.makedirs('')``.
     Assert.is_true(dir_path and dir_path.strip(), '目录不能为空')
     dir_path = dir_path.strip()
     if not os.path.exists(dir_path):
         # ``exist_ok`` tolerates the directory being created by someone
         # else between the existence check and this call.
         os.makedirs(dir_path, exist_ok=True)
     # Also catches the case where the path exists but is not a directory.
     Assert.is_true(os.path.isdir(dir_path), '创建目录失败:{0}'.format(dir_path))
     return dir_path
Example #16
0
 def loss_less(in_path, out_path, format=FileFormat.JPEG.value):
     """
     Re-save an image after converting it with ``ImgChannel.RGB.value`` and
     attaching EXIF data built from an empty dict.

     :param in_path: path of the source image; must be an existing file
     :param out_path: destination handed to the image's ``save`` method
     :param format: image format handed to the image's ``save`` method
     :return: None
     """
     source_exists = os.path.isfile(in_path)
     Assert.is_true(source_exists, '图片不存在:{0}'.format(in_path))
     with Image.open(in_path) as source:
         converted = source.convert(ImgChannel.RGB.value)
         converted.save(out_path, format, exif=piexif.dump({}))
Example #17
0
 def copy_file(cls, old_file, new_file):
     """
     Copy a file into a directory, keeping its file name.

     :param old_file: path of the source file; must be an existing file
     :param new_file: destination directory; created when missing
     :return: the path of the copy after ``cls.normcase``
     """
     source_exists = os.path.isfile(old_file)
     Assert.is_true(source_exists, '文件不存在:{0}'.format(old_file))

     source_name = os.path.basename(old_file)
     FileUtil.creat_dirs(new_file)
     copied_path = os.path.join(new_file, source_name)
     shutil.copyfile(src=old_file, dst=copied_path)
     return cls.normcase(copied_path)
Example #18
0
 def get_path_name_ext(file_path, is_strict=True):
     """
     Split a file path into directory, base name and extension.

     :param file_path: path to split
     :param bool is_strict: when true, the path must be an existing file
     :return: tuple ``(directory, name, extension)``; the extension carries
         no dot
     """
     if is_strict:
         file_exists = os.path.isfile(file_path)
         Assert.is_true(file_exists, '文件不存在, path: {0}'.format(file_path))

     directory, full_name = os.path.split(file_path)
     stem, dotted_ext = os.path.splitext(full_name)
     extension = dotted_ext.strip(os.curdir)
     return directory, stem, extension
Example #19
0
 def get_files_by_suffix2(dir, suffixs):
     """
     Collect the files directly inside a directory that match given suffixes.

     :param dir: directory to search; must be an existing directory
     :param suffixs: iterable of suffixes without the leading dot; each one
         is globbed as ``*.<suffix>``
     :return: list of matching paths, grouped in the order of ``suffixs``
     """
     dir_exists = os.path.isdir(dir)
     Assert.is_true(assert_condition=dir_exists,
                    assert_msg='路径无效:{0}'.format(dir))
     return [
         matched
         for suffix in suffixs
         for matched in glob.glob(pathname=os.path.join(dir, '*.' + suffix))
     ]
Example #20
0
    def _format_convert(cls,
                        input_file,
                        output_file,
                        output_ac,
                        output_ar,
                        input_ac=None,
                        input_ar=None,
                        input_format=None,
                        input_acodec=None,
                        output_format=None,
                        output_acodec=None):
        """
        Convert an audio file from one format to another with ffmpeg.

        The output directory is created when the output path has a
        directory part. An existing output file is overwritten.

        :param input_file: path of the file to convert
        :param output_file: path of the file to produce
        :param output_ac: output channel count
        :param output_ar: output sample rate
        :param input_ac: input channel count (required for pcm input)
        :param input_ar: input sample rate (required for pcm input)
        :param input_format: input format (required for pcm input)
        :param input_acodec: input codec (required for pcm input)
        :param output_format: output format
        :param output_acodec: output codec
        :return: None
        """
        output_dir, _, _ = FileUtil.get_path_name_ext(output_file,
                                                      is_strict=False)
        # An output path with no directory part yields an empty directory
        # here. There is nothing to create in that case, and passing ''
        # on would fail the non-empty check in the directory helper.
        if output_dir:
            FileUtil.creat_dirs(output_dir)

        if cls.is_pcm(input_file):
            # pcm input needs its audio parameters stated explicitly.
            Assert.is_true((is_not_empty(input_ac) and is_not_empty(input_ar)
                            and is_not_empty(input_format)
                            and is_not_empty(input_acodec)), 'pcm 缺失文件参数')

        source = ffmpeg.input(
            input_file,
            **cls._file_args(input_format, input_acodec, input_ac, input_ar))
        target = source.output(
            output_file,
            **cls._file_args(output_format, output_acodec, output_ac,
                             output_ar))
        # The captured output was never used, so it is no longer bound.
        target.run(capture_stdout=False, overwrite_output=True)
Example #21
0
    def img_compress(path, threshold=4):
        """
        Encode an image as base64, shrinking it first when it is large.

        When width * height exceeds ``threshold * 1024 * 1024`` the image is
        resized to a width of 1024 with the height scaled proportionally;
        otherwise it is encoded unchanged.

        :param path: path of the image; must be an existing file
        :param threshold: size threshold, multiplied by 1024 * 1024 before
            it is compared with the pixel count
        :return: the result of ``pil_to_base64`` for the (possibly resized)
            image
        """
        from utils.encodes import pil_to_base64
        size_limit = threshold * 1024 * 1024
        Assert.is_true(os.path.isfile(path), '图片不存在:{0}'.format(path))
        width, height = ImgUtil.get_img_pixel(path)

        with Image.open(path) as image:
            if width * height > size_limit:
                # Fixed target width; keep the aspect ratio for the height.
                scaled_height = int(1024 * height * 1.0 / width)
                image = image.resize((1024, scaled_height))
            return pil_to_base64(image)
Example #22
0
    def _asr_prepare(self, **kwargs):
        """
        Send the pre-processing request of an audio transcription.

        The keyword arguments are sent as form data to
        ``self.asr_prepare_url`` together with ``app_id``, ``signa`` and
        ``ts`` taken from the instance.

        :param kwargs: additional form fields of the request
        :return: an ``AsrInfo``; when the HTTP request fails it is built
            locally with ``ok=codes.failed``
        """
        url = getattr(self, 'asr_prepare_url', None)

        kwargs.update(dict(app_id=self.app_id, signa=self.signa, ts=self.ts))

        res = requests.post(url=url, data=kwargs)

        # Check the response before using any of its attributes; the guard
        # was unreachable while ``res.encoding`` was assigned ahead of it.
        if res is None or res.status_code != codes.ok:
            return AsrInfo(
                ok=codes.failed,
                err_no=res.status_code if res is not None else codes.bad,
                failed='音频转写预处理请求失败')
        res.encoding = encodes.Unicode.UTF_8.value

        prepare_info, errors = AsrInfo.AsrInfoSchema().load(res.json())
        Assert.is_true(is_empty(errors), errors)
        return prepare_info
Example #23
0
    def get_asr_result(self):
        """
        Query the result of the current transcription task.

        Posts ``app_id``, ``signa``, ``ts`` and ``task_id`` to
        ``self.asr_result_url``.

        :return: an ``AsrInfo``; when the HTTP request fails it is built
            locally with ``ok=codes.failed``
        """
        url = getattr(self, 'asr_result_url', None)
        data = dict(app_id=self.app_id,
                    signa=self.signa,
                    ts=self.ts,
                    task_id=self.task_id)
        res = requests.post(url=url, data=data)

        # Check the response before using any of its attributes; the guard
        # was unreachable while ``res.encoding`` was assigned ahead of it.
        if res is None or res.status_code != codes.ok:
            return AsrInfo(
                ok=codes.failed,
                err_no=res.status_code if res is not None else codes.bad,
                failed='查询音频转写结果请求失败')
        res.encoding = encodes.Unicode.UTF_8.value

        result_info, errors = AsrInfo.AsrInfoSchema().load(res.json())
        Assert.is_true(is_empty(errors), errors)
        return result_info
Example #24
0
    def _asr_upload(self, upload_file_path):
        """
        Upload the audio file to the transcription service slice by slice.

        The file is read in binary chunks of ``self.__FILE_PIECE_SIZE`` and
        every chunk is posted to ``self.asr_upload_url`` with the next slice
        id from a ``SliceIdGenerator``.

        :param upload_file_path: path of the audio file to upload
        :return: ``None`` when every slice was accepted; an ``AsrInfo`` with
            ``ok=codes.failed`` as soon as one HTTP request fails
        """
        url = getattr(self, 'asr_upload_url', None)
        sig = SliceIdGenerator()

        with open(upload_file_path, 'rb') as f:
            while True:
                content = f.read(self.__FILE_PIECE_SIZE)
                if not content:
                    # End of file reached.
                    break

                data = dict(app_id=self.app_id,
                            signa=self.signa,
                            ts=self.ts,
                            task_id=self.task_id,
                            slice_id=sig.get_next_slice_id())

                res = requests.post(url=url,
                                    data=data,
                                    files={'content': content})

                # Check the response before using any of its attributes.
                if res is None or res.status_code != codes.ok:
                    return AsrInfo(ok=codes.failed,
                                   err_no=res.status_code
                                   if res is not None else codes.bad,
                                   failed='音频转写文件分片上传请求失败')
                res.encoding = encodes.Unicode.UTF_8.value

                upload_info, errors = AsrInfo.AsrInfoSchema().load(res.json())
                Assert.is_true(is_empty(errors), errors)

                # The service rejected this slice.
                # NOTE(review): message second, error number third, which is
                # the order every other Assert.is_true call uses; the two
                # were swapped here. Confirm against Assert.is_true.
                Assert.is_true(upload_info.ok == codes.success,
                               '分片上传失败:{0}'.format(upload_info.failed),
                               upload_info.err_no)
Example #25
0
    def asr(self,
            file_path,
            has_participle=False,
            has_seperate=True,
            speaker_number=2):
        """
        Submit an audio file for transcription.

        Runs the three submission steps in order: pre-processing, sliced
        upload and merge. Each step is asserted to have succeeded before the
        next one starts. The task id returned by the pre-processing step is
        stored in ``self.task_id``.

        :param str file_path: path of the audio file
        :param bool has_participle: whether the transcription result should
            contain word segmentation information
        :param bool has_seperate: whether the transcription result should
            contain speaker separation information
        :param int speaker_number: number of speakers, 0-10; 0 lets the
            service decide
        :return: None
        """
        from utils.file import Audio as BasicAudio
        # The flags are sent as the strings 'true' / 'false'.
        has_participle = 'true' if has_participle else 'false'
        has_seperate = 'true' if has_seperate else 'false'
        audio_file = BasicAudio.probe(file_path)

        # Pre-processing
        prepare_info = self._asr_prepare(
            file_len=audio_file.size,
            file_name=audio_file.file_name,
            slice_num=math.ceil(audio_file.size / self.__FILE_PIECE_SIZE),
            has_participle=has_participle,
            has_seperate=has_seperate,
            speaker_number=speaker_number)
        Assert.is_true(prepare_info.ok == codes.success, prepare_info.failed,
                       prepare_info.err_no)
        # Pre-processing succeeded: remember the task id.
        self.task_id = prepare_info.data

        # Sliced upload. The upload step returns a failure AsrInfo when a
        # request fails and None otherwise. That result used to be dropped,
        # so a failed upload still went on to the merge step.
        upload_info = self._asr_upload(file_path)
        if upload_info is not None:
            Assert.is_true(upload_info.ok == codes.success,
                           upload_info.failed, upload_info.err_no)

        # Merge
        merge_info = self._asr_merge()
        Assert.is_true(merge_info.ok == codes.success, merge_info.failed,
                       merge_info.err_no)
Example #26
0
 def to_pcm(cls,
            input_file,
            output_file,
            output_ac,
            output_ar,
            input_ac=None,
            input_ar=None,
            input_format=None,
            input_acodec=None,
            output_format=AudioFormat.S16LE.value,
            output_acodec=AudioCodecs.PCM_S16LE.value):
     """
     Convert an audio file to pcm.

     Checks ``output_file`` with ``cls.is_pcm`` (non-strict mode) and then
     forwards every argument to ``cls._format_convert``.

     :param str input_file: path of the file to convert
     :param str output_file: path of the pcm file to produce
     :param int output_ac: output channel count
     :param int output_ar: output sample rate
     :param int input_ac: input channel count (needed for pcm input)
     :param int input_ar: input sample rate (needed for pcm input)
     :param str input_format: input format (needed for pcm input)
     :param str input_acodec: input codec (needed for pcm input)
     :param str output_format: output format
     :param str output_acodec: output codec
     :return: None
     """
     target_is_pcm = cls.is_pcm(output_file, is_strict=False)
     Assert.is_true(target_is_pcm, '不是 pcm 拓展文件:{0}'.format(output_file))

     convert_kwargs = dict(input_file=input_file,
                           output_file=output_file,
                           input_format=input_format,
                           input_acodec=input_acodec,
                           input_ac=input_ac,
                           input_ar=input_ar,
                           output_format=output_format,
                           output_acodec=output_acodec,
                           output_ac=output_ac,
                           output_ar=output_ar)
     cls._format_convert(**convert_kwargs)
Example #27
0
    def get_md5_path(file_path,
                     block_size=64 * 1024,
                     hexadecimal=True,
                     b64=False):
        """
        Compute the md5 of a file, reading it block by block.

        :param file_path: path of the file; must be an existing file
        :param block_size: size passed to each ``read`` call
        :param hexadecimal: forwarded to ``hash_code``; use hexadecimal
            output, otherwise binary
        :param b64: forwarded to ``hash_code``; base64-encode the result
        :return: the result of ``hash_code`` for the finished md5 object
        """
        from utils.encodes import hash_code

        file_exists = os.path.isfile(file_path)
        Assert.is_true(file_exists, '文件不存在, path: {0}'.format(file_path))
        with open(file_path, 'rb') as stream:
            digest = hashlib.md5()
            # ``iter`` with a sentinel stops at the first empty read.
            for chunk in iter(lambda: stream.read(block_size), b''):
                digest.update(chunk)
            return hash_code(digest, hexadecimal, b64)
Example #28
0
    def pdf_to_pic(path,
                   pic_dir,
                   format=FileFormat.JPG.value,
                   loss=True,
                   gamma=True,
                   zoom=210,
                   min_size=1.50,
                   max_size=15.0):
        """
        Render every page of a PDF to an image file.

        Each page is written to ``<pic_dir>/<page number>.<format>``. When
        the written file is not larger than ``min_size`` the page is rendered
        again at twice the zoom; when it is not smaller than ``max_size`` it
        is rendered again at two thirds of the zoom. A page that fails is
        recorded with an ``error_msg`` and its image file, if one was
        written, is removed.

        :param path: path of the PDF; must be an existing file
        :param pic_dir: directory that receives the page images
        :param format: file extension of the page images
        :param bool loss: compress each page image with pngquant
        :param bool gamma: apply gamma correction to each page image
        :param int zoom: render zoom in percent (divided by 100 to get the
            matrix scale)
        :param min_size: lower file-size threshold, compared with the value
            of ``FileUtil.get_file_size``
        :param max_size: upper file-size threshold, compared with the value
            of ``FileUtil.get_file_size``
        :return: tuple ``(page_num, success_num, fail_num, detail_info)``.
            ``detail_info['images']`` holds one dict per processed page with
            ``page_code`` and either ``img_path`` or ``error_msg``;
            ``detail_info['error_msg']`` is set when the document as a whole
            could not be processed.
        """
        Assert.is_true(os.path.isfile(path), '文件不存在, path: {0}'.format(path))
        page_num, success_num, fail_num = 0, 0, 0
        detail_info = {'images': []}
        pdf = None
        # Set up the image compressor.
        pngquant = PngQuant(min_quality=80,
                            max_quality=100,
                            tmp_file=os.path.join(pic_dir, 'quant.tmp.png'))

        def render_page(page, zoom_percent, target_path):
            # Rasterize one page at the given zoom and write it to disk.
            scale = zoom_percent / 100.0
            trans = fitz.Matrix(scale, scale).preRotate(0)
            pm = page.getPixmap(matrix=trans, alpha=False)
            pm.writeImage(target_path)

        try:
            FileUtil.creat_dirs(pic_dir)
            pdf = fitz.Document(path)
            page_num = pdf.pageCount

            for pg in range(page_num):
                pm_dict = {'page_code': pg + 1}
                # Tracked outside the inner ``try`` so the failure handler
                # knows which file to remove. The handler used to look the
                # path up in ``pm_dict``, where it only exists after
                # success, so failed images were never deleted.
                page_path = ''
                try:
                    page = pdf[pg]  # type: Page
                    page_path = FileUtil.path_join(pic_dir, '{0}.{1}'.format(
                        (pg + 1), format))
                    render_page(page, zoom, page_path)

                    file_size = FileUtil.get_file_size(page_path)
                    if file_size <= min_size:
                        # Below the lower threshold: render again, larger.
                        render_page(page, zoom * 2, page_path)
                    elif file_size >= max_size:
                        # Above the upper threshold: render again, smaller.
                        render_page(page, zoom * 2 / 3, page_path)

                    if gamma:
                        # Gamma correction
                        Enhancer().enhance(page_path, False, False, False,
                                           False, True)

                    if loss:
                        # Near-lossless compression of the image
                        pngquant.quant_image(page_path)

                    pm_dict['img_path'] = page_path
                    success_num = success_num + 1
                except Exception as e:
                    pm_dict.pop('img_path', None)
                    pm_dict['error_msg'] = repr(e) or 'pdf转图片失败'
                    fail_num = fail_num + 1
                    if page_path and os.path.isfile(page_path):
                        # Processing failed: remove the failed page image.
                        FileUtil.del_file(page_path)
                finally:
                    detail_info.get('images').append(pm_dict)
        except Exception as e1:
            detail_info['error_msg'] = repr(e1) or '处理pdf失败'
        finally:
            # Close the document whenever it was opened.
            if pdf is not None:
                pdf.close()
        # Pages that were never reached count as failed.
        if success_num + fail_num != page_num:
            fail_num = page_num - success_num
        return page_num, success_num, fail_num, detail_info