def docx2pdf(doc_fn, pdf_fn):
    """Convert a Word document to PDF through a Word automation instance.

    The conversion is skipped when ``pdf_fn`` already exists.

    Args:
        doc_fn: Path of the source Word document.
        pdf_fn: Path of the PDF file to create.

    Returns:
        None.
    """
    log.info("Converting: %s --> %s" %(os.path.basename(doc_fn), os.path.basename(pdf_fn)))
    if os.path.exists(pdf_fn):
        log.info("File exist and skip.")
        return

    # NOTE(review): `client` is presumably win32com.client -- confirm at the
    # import site. DispatchEx starts a dedicated Word instance, so it must be
    # shut down explicitly or a WINWORD process stays alive after every call.
    word = client.DispatchEx("Word.Application")
    try:
        doc_trans = word.Documents.Open(doc_fn)
        try:
            # 17 is the wdFormatPDF member of Word's WdSaveFormat enumeration.
            doc_trans.SaveAs(pdf_fn, FileFormat=17)
        finally:
            # Close the document even when the export fails.
            doc_trans.Close()
    finally:
        word.Quit()
def translate_doc(doc_raw, doc_trans, trans_cache):
    """Translate the paragraphs of a DOCX file and save the result.

    The function works in two passes. The first pass collects paragraph
    texts that are not in ``trans_cache`` yet, sends them to
    ``translate_caiyun`` in batches of 100 paragraphs and stores the results
    in the cache. The second pass replaces the text of every paragraph that
    has a cached translation. Nothing is done when ``doc_trans`` already
    exists.

    Args:
        doc_raw: Path of the source DOCX file.
        doc_trans: Path where the translated DOCX file is saved.
        trans_cache: Mapping from source text to translated text. It is
            read and updated in place.

    Returns:
        None.
    """
    if os.path.exists(doc_trans):
        log.info("File %s exists, translate_doc will be skipped." % (doc_trans))
        return

    doc = docx.Document(doc_raw)
    process_paragraphs = doc.paragraphs

    log.info("Translating from caiyun...")
    for batch_par in tqdm(batch(process_paragraphs, size=100)):
        source = []
        # Texts already queued in this batch. The cache is only filled after
        # the request returns, so without this set a text repeated inside one
        # batch would be sent to the translator more than once.
        queued = set()
        for par in batch_par:
            en_text = para2text(par)
            if not en_text or en_text in queued or en_text in trans_cache:
                continue
            queued.add(en_text)
            source.append(en_text)
        if not source:
            continue
        translated = translate_caiyun(source)
        for src, target in zip(source, translated):
            trans_cache[src] = target

    log.info("Translate local...")
    for par in tqdm(process_paragraphs):
        en_text = para2text(par)
        # Paragraphs without a cached translation keep their original text.
        if en_text and en_text in trans_cache:
            par.text = trans_cache[en_text]

    log.info("Translate done and save.")
    doc.save(doc_trans)
def merge_pages(pdfTransFn1, pdfTransFn2, dst_fn, page_nums1=None, page_nums2=None):
    """Combine pages of two PDF files into a single output PDF.

    Args:
        pdfTransFn1: Path of the first input PDF (side ``"first"``).
        pdfTransFn2: Path of the second input PDF (side ``"second"``).
        dst_fn: Path of the merged PDF to write.
        page_nums1: Optional list of ``(side, page_numbers)`` tuples for the
            first file. Defaults to one ``("first", [i])`` entry per page.
        page_nums2: Optional list of ``(side, page_numbers)`` tuples for the
            second file. Defaults to one ``("second", [i])`` entry per page.

    Returns:
        None.

    Raises:
        ValueError: If an entry carries a side tag other than ``"first"`` or
            ``"second"``.
    """
    # NOTE(review): the second positional parameter of PdfFileReader is
    # `strict`, not a file mode, so 'rb' acts as a truthy `strict` value.
    # Left unchanged to keep the current parsing behaviour.
    infile1 = PdfFileReader(pdfTransFn1, 'rb')
    infile2 = PdfFileReader(pdfTransFn2, 'rb')

    if page_nums1 is None:
        page_nums1 = [("first", [i]) for i in range(infile1.getNumPages())]
    if page_nums2 is None:
        page_nums2 = [("second", [i]) for i in range(infile2.getNumPages())]

    # NOTE(review): cross_iter presumably interleaves the two lists; confirm
    # against its definition.
    pages_sides = cross_iter(page_nums1, page_nums2)
    log.info("Merge Pages: %d+%d --> %s" % (len(page_nums1), len(page_nums2), dst_fn))

    output = PdfFileWriter()
    for (side, page_nums) in pages_sides:
        if side == "first":
            pages = get_page_from_nums(infile1, page_nums)
        elif side == "second":
            pages = get_page_from_nums(infile2, page_nums)
        else:
            # ValueError is still an Exception, so existing handlers keep
            # working; the message now names the offending tag.
            raise ValueError("Unknown side %r; expected 'first' or 'second'" % (side,))
        for p in pages:
            output.addPage(p)

    # TODO: bookmarks are not carried over; the original plan was to copy
    # only the first file's outline.
    #
    # Notes on the PyPDF2 "'latin-1' codec can't encode characters" error:
    # - Appending a blank page to odd-page PDFs with Python:
    #   https://zhuanlan.zhihu.com/p/34246341
    #   A commenter there suggests changing the 'latin-1' encoding at line
    #   238 of PyPDF2\utils.py, but that approach hurts compatibility.
    # - Alternative approach: https://www.codenong.com/cs105218309/
    with open(dst_fn, 'wb') as f:
        output.write(f)
def translate_doc(doc_raw, doc_trans, trans_cache):
    """Translate every paragraph of a DOCX file and write the result.

    Paragraph texts missing from ``trans_cache`` are sent to
    ``translate_caiyun`` in batches of 100 paragraphs and the answers are
    stored in the cache. Afterwards each paragraph whose text has a cached
    translation is overwritten with it, and the document is saved.

    Args:
        doc_raw: Path of the source DOCX file.
        doc_trans: Path where the translated DOCX file is saved.
        trans_cache: Mapping from source text to translated text. It is
            read and updated in place.

    Returns:
        None.
    """
    document = docx.Document(doc_raw)
    paragraphs = document.paragraphs

    log.info("Translating from caiyun...")
    for chunk in tqdm(batch(paragraphs, size=100)):
        # Keep only non-empty texts that have no cached translation yet.
        pending = [
            text
            for text in (para2text(item) for item in chunk)
            if len(text) != 0 and text not in trans_cache
        ]
        if pending:
            answers = translate_caiyun(pending)
            for original, translation in zip(pending, answers):
                trans_cache[original] = translation

    log.info("Translate local...")
    for item in tqdm(paragraphs):
        text = para2text(item)
        if len(text) != 0 and text in trans_cache:
            item.text = trans_cache[text]

    log.info("Translate done and save.")
    document.save(doc_trans)
def convert_pdf_to_docx_v2(inputFileName, outputFileName):
    """Convert a PDF to DOCX with the PDF2Word SDK (text-box, multi-column setup).

    The conversion is skipped when ``outputFileName`` already exists.

    Args:
        inputFileName: Path of the source PDF.
        outputFileName: Path of the DOCX file to create.

    Returns:
        None.

    Raises:
        SystemExit: With exit status 1 when the SDK raises
            ``PDF2WordException``.
    """
    log.info("Converting: %s --> %s" % (os.path.basename(inputFileName), os.path.basename(outputFileName)))
    if os.path.exists(outputFileName):
        log.info("File Exist, skip...")
        return

    pdf2word = PDF2Word.PDF2Word()
    try:
        pdf2word.setConversionMethod(PDF2Word.optConversionMethod.CNV_METHOD_USE_TEXTBOXES)
        pdf2word.setOutputDocumentFormat(PDF2Word.optOutputDocumentFormat.OPT_OUTPUT_DOCX_VIA_OFFICE)
        pdf2word.setDocumentType(PDF2Word.optDocumentType.DOCTYPE_MULTI_COLUMN)
        pdf2word.setAdjustSpacing(True)
        # NOTE(review): the trailing "", 0, -1 are presumably password and
        # page range -- confirm against the SDK reference.
        pdf2word.ConvertToWord(inputFileName, outputFileName, "", 0, -1)
    except PDF2Word.PDF2WordException as ex:
        # A bare sys.exit() would end the process with status 0, which
        # reports a failed conversion as success to the calling shell.
        log.error(ex)
        sys.exit(1)
def convert_pdf_to_docx_v1(pdf_fn, word_fn):
    """Convert a PDF to DOCX with the PDF2Word SDK (plain DOCX output setup).

    The conversion is skipped when ``word_fn`` already exists.

    Args:
        pdf_fn: Path of the source PDF.
        word_fn: Path of the DOCX file to create.

    Returns:
        None.

    Raises:
        SystemExit: With exit status 1 when the SDK raises
            ``PDF2WordException``.
    """
    log.info("Converting: %s --> %s" % (os.path.basename(pdf_fn), os.path.basename(word_fn)))
    if os.path.exists(word_fn):
        log.info("File Exist, skip...")
        return

    pdf2word = PDF2Word.PDF2Word()
    try:
        pdf2word.setOutputDocumentFormat(PDF2Word.optOutputDocumentFormat.OPT_OUTPUT_DOCX)
        pdf2word.setConnectHyphens(True)
        pdf2word.setShrinkCharacterSpacingToPreventWrap(True)
        # NOTE(review): 600000 is presumably milliseconds (10 minutes) --
        # confirm the unit in the SDK reference.
        pdf2word.setFileConversionTimeout(600000)
        # NOTE(review): the trailing "", 0, -1 are presumably password and
        # page range -- confirm against the SDK reference.
        pdf2word.ConvertToWord(pdf_fn, word_fn, "", 0, -1)
    except PDF2Word.PDF2WordException as ex:
        # A bare sys.exit() would end the process with status 0, which
        # reports a failed conversion as success to the calling shell.
        log.error(ex)
        sys.exit(1)
def remove_first_page(pdf_fn, dst_fn):
    """Write a copy of a PDF that omits its first page.

    When ``dst_fn`` already exists nothing is written and the path is
    returned as is.

    Args:
        pdf_fn: Path of the source PDF.
        dst_fn: Path of the PDF to create.

    Returns:
        ``dst_fn``.
    """
    src_name = os.path.basename(pdf_fn)
    dst_name = os.path.basename(dst_fn)
    log.info("Converting: %s --> %s" % (src_name, dst_name))

    if not os.path.exists(dst_fn):
        reader = PdfFileReader(pdf_fn, 'rb')
        writer = PdfFileWriter()
        # Start at index 1 so that page 0 is left out.
        for page_index in range(1, reader.getNumPages()):
            writer.addPage(reader.getPage(page_index))
        with open(dst_fn, 'wb') as out_file:
            writer.write(out_file)
        log.info("Done.")
    else:
        log.info("File Exist, skip...")

    return dst_fn
check_file_exists(inputFileName) check_dir_exists(outputDirName) cache_trans = dc.Cache(outputDirName) if mode not in SUPPORTED_MODES: print("Supported Modes: %s" %(",".join(SUPPORTED_MODES))) print("Check help for how to choose modes.") sys.exit() # 第0步:将原始PDF复制到目标目录 inputBaseNameWithoutExt = os.path.basename(inputFileName).replace(".pdf", "") if mode.lower().startswith("single"): # 第1步:将PDF分为2个,以应对该共享软件每次间隔1页留空白,方法是将PDF的第1页删掉 pdfFileName1 = os.path.join(outputDirName, "%s_1_RawPart1.pdf" %(inputBaseNameWithoutExt)) pdfFileName2 = os.path.join(outputDirName, "%s_1_RawPart2.pdf" % (inputBaseNameWithoutExt)) log.info("%s, %s" %(pdfFileName1, pdfFileName2)) if not os.path.exists(pdfFileName1): copyfile(inputFileName, pdfFileName1) remove_first_page(pdfFileName1, pdfFileName2) # 第2步:分别将这2个PDF转为DOCX docFileName1 = os.path.join(outputDirName, "%s_2_RawPart1.docx" %(inputBaseNameWithoutExt)) docFileName2 = os.path.join(outputDirName, "%s_2_RawPart2.docx" %(inputBaseNameWithoutExt)) log.info("%s, %s" %(docFileName1, docFileName2)) convert_pdf_to_docx(pdfFileName1, docFileName1) convert_pdf_to_docx(pdfFileName2, docFileName2) # 第3步:将这2个DOCX翻译,仍然保存为DOCX docTransFn1 = os.path.join(outputDirName, "%s_3_TranslatedPart1.docx" %(inputBaseNameWithoutExt)) docTransFn2 = os.path.join(outputDirName, "%s_3_TranslatedPart2.docx" %(inputBaseNameWithoutExt)) log.info("%s, %s" %(docTransFn1, docTransFn2))