def ConvertImageToPDF(image_path, docx_path, dest_path, wordapp):
    """Convert an image to a PDF by way of an intermediate Word document.

    The image is inserted into a new python-docx document, which is saved to
    ``docx_path``. That file is then opened through the Word COM application
    ``wordapp`` and saved as a PDF to ``dest_path``.

    Args:
        image_path: Path of the image to embed.
        docx_path: Path where the intermediate ``.docx`` file is written.
        dest_path: Path of the PDF to produce. If it already exists the
            function returns immediately without doing anything.
        wordapp: Word application COM object; only ``Documents.Open`` is
            called on it here.

    Returns:
        None. Failures while building the document are printed and swallowed;
        failures while saving the ``.docx`` or exporting the PDF propagate.
    """
    if os.path.exists(dest_path):
        return

    try:
        doc = Document()  # document object based on the default template
        # The picture is always scaled to a fixed 6 x 8 inch box.
        doc.add_picture(image_path, width=Inches(6), height=Inches(8))
    except Exception as e:
        # Best effort: remove any leftover output files before giving up.
        for leftover in (docx_path, dest_path):
            if os.path.exists(leftover):
                os.remove(leftover)
        print('Error:' + image_path, e)
        return
    doc.save(docx_path)

    # Value passed as the file format to SaveAs to request PDF output.
    wdFormatPDF = 17

    # python-docx saved the file relative to *this* process's working
    # directory; hand Word absolute paths so it opens that same file and
    # writes the PDF where the caller expects it.
    word_doc = wordapp.Documents.Open(os.path.abspath(docx_path))
    try:
        word_doc.SaveAs(os.path.abspath(dest_path), wdFormatPDF)
        print(dest_path)
    finally:
        # Always release the document, even when the export fails.
        word_doc.Close()
def get_text(file_name):
    """Return the text content of a file, chosen by its extension.

    Handled extensions (as reported by ``get_file_ext``):

    * none, ``txt``, ``rst``, ``text``, ``adoc`` -- read as UTF-8 text.
    * ``rtf`` -- read as UTF-8 and passed through ``striprtf``.
    * ``pdf`` -- output of ``extract_text`` followed by the per-page text
      from PyPDF2, joined with newlines.
    * ``docx`` -- paragraph texts joined with newlines.
    * ``doc`` -- read through Word COM automation on Windows, otherwise
      converted to text with LibreOffice (macOS install path).

    Args:
        file_name: Path of the file to read; it is made absolute first.

    Returns:
        The extracted text, or ``""`` when the file name starts with ``~``,
        the extension is unknown, or reading failed (the error is printed).
    """
    file_name = os.path.abspath(file_name)
    actual_file_name = os.path.basename(file_name)
    if actual_file_name.startswith("~"):
        # Names starting with "~" are skipped -- presumably editor/Office
        # temporary files; confirm against callers.
        return ""
    print(file_name)
    ext = get_file_ext(file_name)

    if ext is None or ext in ["txt", "rst", "text", "adoc"]:
        try:
            with codecs.open(file_name, "r", "utf-8") as f:
                return f.read()
        except Exception:
            print("File could not be read ", file_name)
            traceback.print_exc()
    elif ext == "rtf":
        try:
            with codecs.open(file_name, "r", "utf-8") as f:
                return striprtf(f.read())
        except Exception:
            print("File could not be read ", file_name)
            traceback.print_exc()
    elif ext in ["pdf"]:
        # Text from both extractors is kept: extract_text first, then
        # whatever PyPDF2 yields for each page.
        full_text = [extract_text(file_name)]
        with open(file_name, 'rb') as f:
            reader = PyPDF2.PdfFileReader(f)
            for pageNumber in range(reader.numPages):
                page = reader.getPage(pageNumber)
                try:
                    full_text.append(page.extractText())
                except Exception:
                    # A single unreadable page must not discard the rest.
                    print("Error PDF reader ", file_name, pageNumber)
                    traceback.print_exc()
        return "\n".join(full_text)
    elif ext in ["docx"]:
        full_text = []
        try:
            doc = Document(file_name)
            for para in doc.paragraphs:
                full_text.append(para.text)
        except Exception:
            traceback.print_exc()
        return '\n'.join(full_text)
    elif ext in ["doc"]:
        if os.name == 'nt':
            import win32com.client
            word = win32com.client.Dispatch("Word.Application")
            word.visible = False
            doc = word.Documents.Open(file_name)
            try:
                return doc.Range().Text
            finally:
                # Close without saving so the document is not left open in
                # Word; the Word application itself is left running.
                doc.Close(False)

        import subprocess

        # The converted file is read from the current working directory as
        # "<base name>.txt" (no --outdir is passed to soffice).
        fileX = os.path.splitext(actual_file_name)[0] + ".txt"
        try:
            # Argument list instead of a shell string: paths containing
            # spaces or shell metacharacters are passed through verbatim.
            subprocess.run(
                [
                    "/Applications/LibreOffice.app/Contents/MacOS/soffice",
                    "--headless",
                    "--convert-to",
                    "txt:Text",
                    file_name,
                ],
                check=False,
            )
            with codecs.open(fileX, "r", "utf-8") as f:
                return f.read()
        except Exception:
            print("File could not be read ", fileX)
            traceback.print_exc()
    else:
        print("Unknown file extension", file_name)

    # Reached for unknown extensions and for any branch whose read failed.
    return ""