Example #1
0
def _record_generated_file(request, file_name, file_path, file_type,
                           folder_name):
    """Create a FileInfo row for a freshly written file and store its size."""
    FileInfo(file_name=file_name,
             file_path=file_path,
             file_type=file_type,
             load_user=get_user(request),
             is_personal=int(B),
             folder_name=folder_name).save()
    size_in_bytes = os.path.getsize(file_path)
    # The size is stored in KB; a non-empty file below 1 KB is recorded as 1.
    FileInfo.objects.filter(file_path=file_path).update(
        file_size=1 if 0 < size_in_bytes < 1024 else size_in_bytes / 1024)


def _delete_selected(file_infos):
    """Remove every selected entry from disk and from the database."""
    for file_info in file_infos:
        if file_info.file_type == 'FOLDER':
            delete_all(file_info.file_path)
        else:
            try:
                os.remove(file_info.file_path)
            except FileNotFoundError:
                # The file is already gone from disk; still drop the stale
                # record, otherwise it could never be deleted.
                pass
        file_info.delete()


def _merge_selected_pdfs(request, file_infos):
    """Concatenate the selected PDFs into one file in their folder.

    Returns True when a merged file was written, False when nothing was
    selected.
    """
    selected = list(file_infos)
    if not selected:
        return False

    # The merged file is named after the folder of the last selected entry
    # and written into the folder of the first one.
    output_name = '整合-' + selected[-1].folder_name.split('\\')[-1]
    folder_name = selected[0].folder_name
    output_path = folder_name + '\\' + output_name + '.pdf'

    writer = PdfFileWriter()
    opened_sources = []
    try:
        for file_info in selected:
            source = open(file_info.file_path, "rb")
            opened_sources.append(source)
            reader = PdfFileReader(source)
            page_count = reader.getNumPages()
            print("页数:%d" % page_count)
            for page_index in range(page_count):
                writer.addPage(reader.getPage(page_index))
        with open(output_path, "wb") as output_stream:
            writer.write(output_stream)
    finally:
        # The reader pulls page data from its stream lazily, so the sources
        # have to stay open until the merged file has been written.
        for source in opened_sources:
            source.close()

    _record_generated_file(request, output_name + '.pdf', output_path, 'pdf',
                           folder_name)
    return True


def _pdf_to_doc(request, file_info):
    """Extract the horizontal text boxes of one PDF into a ``.doc`` text file.

    Raises PDFTextExtractionNotAllowed when the document forbids extraction.
    """
    # splitext strips only the final extension, so dots in directory names
    # or in the file name itself no longer truncate the output path.
    out_file = os.path.splitext(file_info.file_path)[0] + ".doc"
    with open(file_info.file_path, 'rb') as pdf_stream:
        # Wire the parser and the document object to each other.
        parser = PDFParser(pdf_stream)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialized without a password.
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        num_page = num_image = num_curve = num_text_box = 0
        # Open the output once (append mode, as before) instead of once per
        # text box; this also guarantees the file exists for the size lookup
        # even when the PDF contains no text.
        with open(out_file, 'a', encoding='utf-8') as text_out:
            for page in doc.get_pages():
                num_page += 1
                interpreter.process_page(page)
                for item in device.get_result():
                    if isinstance(item, LTImage):
                        num_image += 1
                    if isinstance(item, LTCurve):
                        num_curve += 1
                    if isinstance(item, LTTextBoxHorizontal):
                        num_text_box += 1
                        text_out.write(item.get_text())
                        text_out.write('\n')

    print('对象数量:\n', '页面数:%s\n' % num_page, '图片数:%s\n' % num_image,
          '曲线数:%s\n' % num_curve,
          '水平文本框:%s\n' % num_text_box)
    _record_generated_file(
        request,
        os.path.splitext(file_info.file_name)[0] + '.doc',
        out_file, 'doc', file_info.folder_name)


def _word_docs_to_pdf(request, file_infos):
    """Convert the selected Word documents to PDF through Word automation."""
    selected = list(file_infos)
    if not selected:
        return
    pythoncom.CoInitialize()
    # One Word instance serves the whole batch and is quit afterwards, so no
    # Word process is left behind per converted file.
    word = win32com.client.Dispatch('Word.Application')
    try:
        for file_info in selected:
            out_file = os.path.splitext(file_info.file_path)[0] + ".pdf"
            doc = word.Documents.Open(file_info.file_path)
            try:
                # 17 is wdFormatPDF in Word's WdSaveFormat enumeration.
                doc.SaveAs(out_file, FileFormat=17)
            finally:
                doc.Close()
            # NOTE(review): presumably gives Word time to finish writing the
            # file before its size is read - confirm whether still needed.
            time.sleep(1)
            _record_generated_file(
                request,
                os.path.splitext(file_info.file_name)[0] + '.pdf',
                out_file, 'pdf', file_info.folder_name)
    finally:
        word.Quit()


def postlist_son(request):
    """Handle a batch action on the entries selected in a sub-folder listing.

    The ids of the selected entries come from the ``d2p_list`` POST field.
    The action is picked by which key is present in the POST data:

    * ``delete_list``   - delete the selected files and folders
    * ``mpdf_list``     - merge the selected PDFs into one PDF
    * ``download_list`` - send the first selected file as an attachment
    * ``tj_list``       - extract text from the selected PDFs into .doc files
    * anything else     - convert the selected Word documents to PDF

    Returns the file response for a download, otherwise a redirect to the
    ``fileserver:list1`` view. ``a`` and ``B`` are names defined outside this
    function.
    """
    selected_ids = request.POST.getlist("d2p_list")
    file_infos = FileInfo.objects.filter(id__in=selected_ids)

    if 'delete_list' in request.POST:
        _delete_selected(file_infos)
        messages.success(request, "文件删除成功!")
    elif 'mpdf_list' in request.POST:
        if _merge_selected_pdfs(request, file_infos):
            messages.success(request, "PDF合并成功!")
        else:
            # Previously an empty selection crashed the request.
            messages.warning(request, "未选择任何PDF文件!")
    elif 'download_list' in request.POST:
        # Only the first selected file is served: the response is returned
        # from inside the loop. FileResponse takes ownership of the handle.
        for file_info in file_infos:
            response = FileResponse(open(file_info.file_path, 'rb'))
            response['Content-Disposition'] = (
                'attachment;filename="%s"' % urlquote(file_info.file_name))
            return response
    elif 'tj_list' in request.POST:
        for file_info in file_infos:
            _pdf_to_doc(request, file_info)
        messages.success(request, "DOC转换成功!")
    else:
        _word_docs_to_pdf(request, file_infos)
        messages.success(request, "PDF转换成功!")
    return HttpResponseRedirect(reverse('fileserver:list1', args=[a]))
Example #2
0
                        interpreter.process_page(page)
                        # 接受该页面的LTPage对象
                        layout = device.get_result()
                        # 这里layout是一个LTPage对象 里面存放着 这个page解析出的各种对象
                        # 一般包括LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal 等等
                        # 想要获取文本就获得对象的text属性,
                        for x in layout:
                            if (isinstance(x, LTTextBoxHorizontal)):
                                with open(full_name1, 'ab') as f:
                                    results = x.get_text()
                                    f.write((results + '\r\n').encode('utf-8'))
            else:
                if a[1] == '.doc':
                    doc = word.Documents.Open(full_name)
                    doc.SaveAs(FileFormat=7, FileName=full_name1)
                    doc.Close()
                    textdoc = readfile(full_name1)
                    textdoc = textdoc.decode("GBK")
                    savefile(full_name1, textdoc.encode("utf-8"))
                else:
                    print(corpus_path + "存在无法识别的文件,请检查文件夹内容")
                    sys.exit(-1)
word.Quit()  # quit Word

# Segment the test set into words.
# To pass the path on the command line instead, change the lines below.
# corpus_path = sys.argv[2]  # path of the unsegmented classified corpus (absolute path)
# corpus_path = "./测试数据/"  # path of the unsegmented classified corpus (relative path)
seg_path = path1 + "/model/测试数据分词后/"  # path of the segmented classified corpus
corpus_segment2(corpus_path1, seg_path)