Example #1
0
def docx2pdf(doc_fn, pdf_fn):
    """Convert a Word document to PDF by automating Microsoft Word.

    Does nothing (apart from logging) when ``pdf_fn`` already exists.

    Args:
        doc_fn: Path of the source Word document.
        pdf_fn: Path of the PDF file to create.

    Returns:
        None.
    """
    log.info("Converting: %s --> %s" %(os.path.basename(doc_fn), os.path.basename(pdf_fn)))
    if os.path.exists(pdf_fn):
        log.info("File exist and skip.")
        return
    # DispatchEx starts a dedicated Word instance, so quitting it below does
    # not touch any Word window the user may already have open.
    word = client.DispatchEx("Word.Application")
    try:
        doc_trans = word.Documents.Open(doc_fn)
        try:
            # 17 is wdFormatPDF in Word's WdSaveFormat enumeration.
            doc_trans.SaveAs(pdf_fn, FileFormat=17)
        finally:
            doc_trans.Close()
    finally:
        # Without Quit() every call leaves a background WINWORD process alive.
        word.Quit()
Example #2
0
def translate_doc(doc_raw, doc_trans, trans_cache):
    """Translate the paragraphs of a DOCX file and save the result.

    Works in two passes: first every paragraph text that is not yet in
    ``trans_cache`` is sent to ``translate_caiyun`` in batches and the
    results are stored in the cache; then every paragraph whose text has a
    cache entry is overwritten with its translation.

    Does nothing (apart from logging) when ``doc_trans`` already exists.

    Args:
        doc_raw: Path of the source DOCX file.
        doc_trans: Path where the translated DOCX is written.
        trans_cache: Mapping from source text to translated text. It is
            read with ``in`` / ``[]`` and updated in place.

    Returns:
        None.
    """
    if os.path.exists(doc_trans):
        log.info("File %s exists, translate_doc will be skipped." %
                 (doc_trans))
        return
    doc = docx.Document(doc_raw)
    process_paragraphs = doc.paragraphs

    log.info("Translating from caiyun...")
    # Iterate the batches directly: the enumerate() indices were never used,
    # and wrapping in enumerate() hides any length from tqdm.
    for batch_par in tqdm(batch(process_paragraphs, size=100)):
        texts = (para2text(par) for par in batch_par)
        # Only non-empty texts that are not cached yet need a remote call.
        source = [
            en_text for en_text in texts
            if len(en_text) != 0 and en_text not in trans_cache
        ]
        if len(source) == 0:
            continue
        translated = translate_caiyun(source)
        for src, target in zip(source, translated):
            trans_cache[src] = target

    log.info("Translate local...")
    for par in tqdm(process_paragraphs):
        en_text = para2text(par)
        if len(en_text) == 0:
            continue
        if en_text in trans_cache:
            # NOTE(review): if this is python-docx, assigning .text replaces
            # the paragraph's runs, dropping run-level formatting -- confirm
            # that this is acceptable.
            par.text = trans_cache[en_text]

    log.info("Translate done and save.")
    doc.save(doc_trans)
Example #3
0
def merge_pages(pdfTransFn1,
                pdfTransFn2,
                dst_fn,
                page_nums1=None,
                page_nums2=None):
    """Merge pages taken from two PDF files into a single PDF.

    Unlike the other steps, an existing ``dst_fn`` is not skipped; it is
    overwritten.

    Args:
        pdfTransFn1: Path of the first input PDF.
        pdfTransFn2: Path of the second input PDF.
        dst_fn: Path of the merged PDF to write.
        page_nums1: Sequence of ``("first", [page numbers])`` entries.
            Defaults to one entry per page of the first PDF.
        page_nums2: Sequence of ``("second", [page numbers])`` entries.
            Defaults to one entry per page of the second PDF.

    Returns:
        None.

    Raises:
        ValueError: If an entry produced by ``cross_iter`` names a side
            other than ``"first"`` or ``"second"``.
    """
    # The second positional parameter of PdfFileReader is `strict`, not a
    # file mode: the previous 'rb' argument was simply a truthy `strict`.
    # Spell that out so the behaviour stays the same and the call is honest.
    infile1 = PdfFileReader(pdfTransFn1, strict=True)
    infile2 = PdfFileReader(pdfTransFn2, strict=True)
    if page_nums1 is None:
        page_nums1 = [("first", [i]) for i in range(infile1.getNumPages())]
    if page_nums2 is None:
        page_nums2 = [("second", [i]) for i in range(infile2.getNumPages())]
    # Presumably cross_iter interleaves the two sequences -- TODO confirm.
    pages_sides = cross_iter(page_nums1, page_nums2)

    log.info("Merge Pages: %d+%d --> %s" %
             (len(page_nums1), len(page_nums2), dst_fn))

    output = PdfFileWriter()
    for (side, page_nums) in pages_sides:
        if side == "first":
            pages = get_page_from_nums(infile1, page_nums)
        elif side == "second":
            pages = get_page_from_nums(infile2, page_nums)
        else:
            raise ValueError(
                "Unknown side %r; expected 'first' or 'second'" % (side,))
        for p in pages:
            output.addPage(p)

    # TODO: bookmarks (outlines) are not carried over to the merged file.
    # The idea was to copy only the bookmarks of the first input file.

    # Background on a PyPDF2 encoding error hit when writing:
    # "Processing PDF with Python: appending a blank page to a PDF with an
    # odd page count" - https://zhuanlan.zhihu.com/p/34246341
    # A commenter there suggests changing the 'latin-1' encoding on line 238
    # of PyPDF2\utils.py to 'utf-8'. That hurts compatibility, so the
    # approach from the article below is preferred instead:
    # "PyPDF2 encoding problem: 'latin-1' codec can't encode characters in
    # position 8-11: ordinal not in range(256)"
    # https://www.codenong.com/cs105218309/
    with open(dst_fn, 'wb') as f:
        output.write(f)
Example #4
0
def translate_doc(doc_raw, doc_trans, trans_cache):
    """Translate every paragraph of a DOCX file and write the result.

    Pass 1 collects, batch by batch, the non-empty paragraph texts missing
    from ``trans_cache``, sends them to ``translate_caiyun`` and stores the
    answers in the cache. Pass 2 overwrites each paragraph whose text has a
    cache entry with its translation. The document is then saved.

    Args:
        doc_raw: Path of the source DOCX file.
        doc_trans: Path where the translated DOCX is written.
        trans_cache: Mapping from source text to translated text. It is
            read with ``in`` / ``[]`` and updated in place.

    Returns:
        None.
    """
    document = docx.Document(doc_raw)
    paragraphs = document.paragraphs

    # Pass 1: fill the cache with translations for texts not seen before.
    log.info("Translating from caiyun...")
    for chunk in tqdm(batch(paragraphs, size=100)):
        pending = []
        for paragraph in chunk:
            text = para2text(paragraph)
            if len(text) > 0 and text not in trans_cache:
                pending.append(text)
        if len(pending) > 0:
            results = translate_caiyun(pending)
            for original, result in zip(pending, results):
                trans_cache[original] = result

    # Pass 2: apply cached translations to the document paragraphs.
    log.info("Translate local...")
    for paragraph in tqdm(paragraphs):
        text = para2text(paragraph)
        if len(text) > 0 and text in trans_cache:
            paragraph.text = trans_cache[text]

    log.info("Translate done and save.")
    document.save(doc_trans)
Example #5
0
def convert_pdf_to_docx_v2(inputFileName, outputFileName):
    """Convert a PDF to DOCX with PDF2Word, using text boxes via Office.

    Does nothing (apart from logging) when ``outputFileName`` already
    exists. On a ``PDF2WordException`` the error is logged and the process
    exits with a non-zero status.

    Args:
        inputFileName: Path of the source PDF.
        outputFileName: Path of the DOCX file to create.

    Returns:
        None.

    Raises:
        SystemExit: With status 1 when the conversion fails.
    """
    log.info("Converting: %s --> %s" % (os.path.basename(inputFileName), os.path.basename(outputFileName)))
    if os.path.exists(outputFileName):
        log.info("File Exist, skip...")
        return
    pdf2word = PDF2Word.PDF2Word()
    try:
        pdf2word.setConversionMethod(PDF2Word.optConversionMethod.CNV_METHOD_USE_TEXTBOXES)
        pdf2word.setOutputDocumentFormat(PDF2Word.optOutputDocumentFormat.OPT_OUTPUT_DOCX_VIA_OFFICE)
        pdf2word.setDocumentType(PDF2Word.optDocumentType.DOCTYPE_MULTI_COLUMN)
        pdf2word.setAdjustSpacing(True)
        # NOTE(review): the trailing arguments are presumably password,
        # first page and last page (-1 = to the end) -- confirm against the
        # PDF2Word API.
        pdf2word.ConvertToWord(inputFileName, outputFileName, "", 0, -1)
    except PDF2Word.PDF2WordException as ex:
        # A failed conversion is fatal: report it as an error and exit with
        # a non-zero status so callers do not mistake it for success.
        log.error(ex)
        sys.exit(1)
Example #6
0
def convert_pdf_to_docx_v1(pdf_fn, word_fn):
    """Convert a PDF to DOCX with PDF2Word, using its native DOCX output.

    Does nothing (apart from logging) when ``word_fn`` already exists. On a
    ``PDF2WordException`` the error is logged and the process exits with a
    non-zero status.

    Args:
        pdf_fn: Path of the source PDF.
        word_fn: Path of the DOCX file to create.

    Returns:
        None.

    Raises:
        SystemExit: With status 1 when the conversion fails.
    """
    log.info("Converting: %s --> %s" % (os.path.basename(pdf_fn), os.path.basename(word_fn)))
    if os.path.exists(word_fn):
        log.info("File Exist, skip...")
        return
    pdf2word = PDF2Word.PDF2Word()
    try:
        pdf2word.setOutputDocumentFormat(PDF2Word.optOutputDocumentFormat.OPT_OUTPUT_DOCX)
        pdf2word.setConnectHyphens(True)
        pdf2word.setShrinkCharacterSpacingToPreventWrap(True)
        # NOTE(review): timeout unit is presumably milliseconds (10 min) --
        # confirm against the PDF2Word API.
        pdf2word.setFileConversionTimeout(600000)
        # NOTE(review): the trailing arguments are presumably password,
        # first page and last page (-1 = to the end) -- confirm.
        pdf2word.ConvertToWord(pdf_fn, word_fn, "", 0, -1)
    except PDF2Word.PDF2WordException as ex:
        # A failed conversion is fatal: report it as an error and exit with
        # a non-zero status so callers do not mistake it for success.
        log.error(ex)
        sys.exit(1)
Example #7
0
def remove_first_page(pdf_fn, dst_fn):
    """Write a copy of a PDF with its first page removed.

    When ``dst_fn`` already exists nothing is written and the existing path
    is returned.

    Args:
        pdf_fn: Path of the source PDF.
        dst_fn: Path of the PDF to create.

    Returns:
        ``dst_fn``.
    """
    log.info("Converting: %s --> %s" %(os.path.basename(pdf_fn), os.path.basename(dst_fn)))
    if os.path.exists(dst_fn):
        log.info("File Exist, skip...")
        return dst_fn

    # The second positional parameter of PdfFileReader is `strict`, not a
    # file mode: the previous 'rb' argument was simply a truthy `strict`.
    infile = PdfFileReader(pdf_fn, strict=True)
    output = PdfFileWriter()

    # Start at index 1 so that page 0 (the first page) is left out.
    for i in range(1, infile.getNumPages()):
        output.addPage(infile.getPage(i))

    with open(dst_fn, 'wb') as f:
        output.write(f)
    log.info("Done.")
    return dst_fn
Example #8
0
# Driver fragment: validates the inputs, opens the translation cache and,
# for "single" modes, prepares the intermediate file names and runs the
# PDF -> DOCX steps. The names used here (inputFileName, outputDirName, mode,
# SUPPORTED_MODES, ...) are defined outside this fragment.
check_file_exists(inputFileName)
check_dir_exists(outputDirName)
# The translation cache lives in the output directory.
cache_trans = dc.Cache(outputDirName)
# NOTE(review): this membership test is case-sensitive, while the mode check
# further down lower-cases `mode` first -- confirm this is intended.
if mode not in SUPPORTED_MODES:
    print("Supported Modes: %s" %(",".join(SUPPORTED_MODES)))
    print("Check help for how to choose modes.")
    # NOTE(review): exits with status 0 even though the mode was rejected.
    sys.exit()

# Step 0: copy the original PDF into the output directory
# NOTE(review): replace() removes every ".pdf" occurrence in the name, not
# only a trailing extension, and is case-sensitive -- verify.
inputBaseNameWithoutExt = os.path.basename(inputFileName).replace(".pdf", "")

if mode.lower().startswith("single"):
    # Step 1: split the PDF into two variants to work around the shareware
    # leaving a blank page at every other page; the second variant is made
    # by removing the first page of the PDF
    pdfFileName1 = os.path.join(outputDirName, "%s_1_RawPart1.pdf" %(inputBaseNameWithoutExt))
    pdfFileName2 = os.path.join(outputDirName, "%s_1_RawPart2.pdf" % (inputBaseNameWithoutExt))
    log.info("%s, %s" %(pdfFileName1, pdfFileName2))
    if not os.path.exists(pdfFileName1):
        copyfile(inputFileName, pdfFileName1)
    remove_first_page(pdfFileName1, pdfFileName2)

    # Step 2: convert each of the two PDFs to DOCX
    docFileName1 = os.path.join(outputDirName, "%s_2_RawPart1.docx" %(inputBaseNameWithoutExt))
    docFileName2 = os.path.join(outputDirName, "%s_2_RawPart2.docx" %(inputBaseNameWithoutExt))
    log.info("%s, %s" %(docFileName1, docFileName2))
    convert_pdf_to_docx(pdfFileName1, docFileName1)
    convert_pdf_to_docx(pdfFileName2, docFileName2)

    # Step 3: translate the two DOCX files, still saving them as DOCX
    docTransFn1 = os.path.join(outputDirName, "%s_3_TranslatedPart1.docx" %(inputBaseNameWithoutExt))
    docTransFn2 = os.path.join(outputDirName, "%s_3_TranslatedPart2.docx" %(inputBaseNameWithoutExt))
    log.info("%s, %s" %(docTransFn1, docTransFn2))