def _selected_files(request):
    """Return the FileInfo rows whose ids were submitted in the ``d2p_list`` field.

    The queryset is materialised into a list so callers can index it and
    iterate it repeatedly without issuing further queries.
    """
    return list(FileInfo.objects.filter(id__in=request.POST.getlist("d2p_list")))


def _register_generated_file(request, file_name, file_path, file_type, folder_name):
    """Save a FileInfo row for a freshly generated file and record its size.

    The stored size is the byte size divided by 1024; files between 1 and
    1023 bytes are stored as 1 so they do not show up as a fraction.
    ``B`` is resolved from module scope (not visible in this chunk).
    """
    record = FileInfo(file_name=file_name,
                      file_path=file_path,
                      file_type=file_type,
                      load_user=get_user(request),
                      is_personal=int(B),
                      folder_name=folder_name)
    record.save()
    size_in_bytes = os.path.getsize(file_path)
    # Same as the original: every row pointing at this path gets the new size.
    FileInfo.objects.filter(file_path=file_path).update(
        file_size=1 if 0 < size_in_bytes < 1024 else size_in_bytes / 1024)


def _delete_selected(request):
    """Delete the selected entries from disk and from the database."""
    for file_info in _selected_files(request):
        if file_info.file_type == 'FOLDER':
            # delete_all is defined elsewhere in the project; presumably it
            # removes the folder recursively -- TODO confirm.
            delete_all(file_info.file_path)
        else:
            os.remove(file_info.file_path)
        file_info.delete()
    messages.success(request, "文件删除成功!")


def _merge_selected_pdfs(request):
    """Concatenate the selected PDFs into one file and register the result.

    The merged file is written into the folder of the first selected file and
    is named after the last path component of the last selected file's folder.
    """
    from contextlib import ExitStack

    file_infos = _selected_files(request)
    if not file_infos:
        # Previously this crashed with IndexError on file_infos[0].
        messages.warning(request, "请先选择要合并的PDF文件!")
        return

    writer = PdfFileWriter()
    output_name = '整合-' + file_infos[-1].folder_name.split('\\')[-1]
    target_folder = file_infos[0].folder_name
    output_path = target_folder + '\\' + output_name + '.pdf'

    # The source streams must stay open until the writer has flushed, because
    # the reader pulls page data lazily; ExitStack closes all of them afterwards.
    with ExitStack() as stack:
        for file_info in file_infos:
            source = stack.enter_context(open(file_info.file_path, "rb"))
            reader = PdfFileReader(source)
            page_count = reader.getNumPages()
            print("页数:%d" % page_count)
            for page_index in range(page_count):
                writer.addPage(reader.getPage(page_index))
        with open(output_path, "wb") as output_stream:
            writer.write(output_stream)

    _register_generated_file(request, output_name + '.pdf', output_path, 'pdf',
                             target_folder)
    messages.success(request, "PDF合并成功!")


def _download_first_selected(request):
    """Build a download response for the first selected file.

    Only one file can be sent per HTTP response, so any further selected files
    are ignored. Returns ``None`` when nothing was selected.
    """
    file_infos = _selected_files(request)
    if not file_infos:
        return None
    file_info = file_infos[0]
    # FileResponse takes ownership of the handle and closes it with the response.
    response = FileResponse(open(file_info.file_path, 'rb'))
    response['Content-Disposition'] = 'attachment;filename="%s"' % urlquote(
        file_info.file_name)
    return response


def _convert_selected_pdfs_to_doc(request):
    """Extract the text of each selected PDF into a sibling ``.doc`` file.

    The output is plain UTF-8 text (one horizontal text box per line) appended
    to the ``.doc`` path; a FileInfo row is registered for every output.

    Raises:
        PDFTextExtractionNotAllowed: if a PDF forbids text extraction.
    """
    for file_info in _selected_files(request):
        in_file = file_info.file_path
        # splitext only strips the final extension, so dots in directory or
        # base names no longer truncate the output path.
        out_file = os.path.splitext(in_file)[0] + ".doc"
        num_page = num_image = num_curve = num_TextBoxHorizontal = 0

        with open(in_file, 'rb') as fp:
            # Wire the parser and the document object together.
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            # No password is supplied.
            doc.initialize()
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed

            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # Opened once per document (append mode, as before) so the file
            # exists even when the PDF contains no text boxes.
            with open(out_file, 'a', encoding='utf-8') as text_out:
                for page in doc.get_pages():
                    num_page += 1
                    interpreter.process_page(page)
                    layout = device.get_result()
                    for element in layout:
                        if isinstance(element, LTImage):
                            num_image += 1
                        if isinstance(element, LTCurve):
                            num_curve += 1
                        if isinstance(element, LTTextBoxHorizontal):
                            num_TextBoxHorizontal += 1
                            text_out.write(element.get_text())
                            text_out.write('\n')

        print('对象数量:\n', '页面数:%s\n' % num_page, '图片数:%s\n' % num_image,
              '曲线数:%s\n' % num_curve, '水平文本框:%s\n' % num_TextBoxHorizontal)
        _register_generated_file(
            request, os.path.splitext(file_info.file_name)[0] + '.doc', out_file,
            'doc', file_info.folder_name)
    messages.success(request, "DOC转换成功!")


def _convert_selected_docs_to_pdf(request):
    """Convert each selected Word document to a sibling ``.pdf`` via Word COM."""
    file_infos = _selected_files(request)
    if file_infos:
        pythoncom.CoInitialize()
        word = win32com.client.Dispatch('Word.Application')
        try:
            for file_info in file_infos:
                in_file = file_info.file_path
                out_file = os.path.splitext(in_file)[0] + ".pdf"
                doc = word.Documents.Open(in_file)
                try:
                    # 17 is Word's wdFormatPDF save format.
                    doc.SaveAs(out_file, FileFormat=17)
                finally:
                    doc.Close()
                # Kept from the original; presumably gives Word time to finish
                # writing the file before its size is read -- TODO confirm.
                time.sleep(1)
                _register_generated_file(
                    request, os.path.splitext(file_info.file_name)[0] + '.pdf',
                    out_file, 'pdf', file_info.folder_name)
        finally:
            # Without this every request left a Word process running.
            word.Quit()
    messages.success(request, "PDF转换成功!")


def postlist_son(request):
    """Handle the batch actions posted from the sub-folder file list.

    The action is chosen by which submit key is present in ``request.POST``:

    * ``delete_list``   -- delete the selected files/folders.
    * ``mpdf_list``     -- merge the selected PDFs into one PDF.
    * ``download_list`` -- download the first selected file.
    * ``tj_list``       -- extract text from the selected PDFs into ``.doc`` files.
    * anything else     -- convert the selected Word documents to PDF.

    The selected rows are identified by the ``d2p_list`` form field.

    Returns:
        The file download response for ``download_list`` (when something was
        selected); otherwise a redirect to ``fileserver:list1``. The redirect
        argument ``a`` is resolved from module scope (not visible in this chunk).
    """
    post = request.POST
    if 'delete_list' in post:
        _delete_selected(request)
    elif 'mpdf_list' in post:
        _merge_selected_pdfs(request)
    elif 'download_list' in post:
        response = _download_first_selected(request)
        if response is not None:
            return response
    elif 'tj_list' in post:
        _convert_selected_pdfs_to_doc(request)
    else:
        _convert_selected_docs_to_pdf(request)
    return HttpResponseRedirect(reverse('fileserver:list1', args=[a]))
interpreter.process_page(page) # 接受该页面的LTPage对象 layout = device.get_result() # 这里layout是一个LTPage对象 里面存放着 这个page解析出的各种对象 # 一般包括LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal 等等 # 想要获取文本就获得对象的text属性, for x in layout: if (isinstance(x, LTTextBoxHorizontal)): with open(full_name1, 'ab') as f: results = x.get_text() f.write((results + '\r\n').encode('utf-8')) else: if a[1] == '.doc': doc = word.Documents.Open(full_name) doc.SaveAs(FileFormat=7, FileName=full_name1) doc.Close() textdoc = readfile(full_name1) textdoc = textdoc.decode("GBK") savefile(full_name1, textdoc.encode("utf-8")) else: print(corpus_path + "存在无法识别的文件,请检查文件夹内容") sys.exit(-1) word.Quit() # 退出word # 对测试集进行分词 # 若采用命令行传入参数方式,改下面 # corpus_path = sys.argv[2] # 未分词分类语料库路径(绝对路径) # corpus_path = "./测试数据/" # 未分词分类语料库路径(相对路径) seg_path = path1 + "/model/测试数据分词后/" # 分词后分类语料库路径 corpus_segment2(corpus_path1, seg_path)