class Docx(IdAble):
    """An unpacked ``.docx`` package that can be edited, merged and re-zipped.

    On construction the archive is extracted into a fresh directory under
    ``TEMP_BASE_DIR`` and the main XML parts (document, content types,
    numbering, relationships, styles) are parsed. ``save`` writes the parts
    back to that directory and zips it; ``close`` deletes the directory.

    The instance can be used as a context manager, in which case ``close`` is
    called automatically on exit.
    """

    def __init__(self, path):
        """Extract the archive at *path* and parse its main parts.

        Args:
            path: File system path of the ``.docx`` archive (must be a ``str``).

        Raises:
            TypeError: If *path* is not a string (``None`` included).
        """
        import shutil

        super().__init__()
        if not isinstance(path, str):
            raise TypeError("Path is not allowed None")
        # exist_ok avoids the check-then-create race between concurrent users.
        os.makedirs(TEMP_BASE_DIR, exist_ok=True)

        self.document = None
        self.content_types = None
        self.relationships = None
        self.numbering = None
        self.styles = None

        # Every instance gets its own extraction directory.
        self.base_dir = uuid1().hex
        self.file_path = os.path.join(TEMP_BASE_DIR, self.base_dir)
        try:
            with ZipFile(path) as archive:
                os.mkdir(self.file_path)
                archive.extractall(self.file_path)
            self.get_document()
            self.get_content_types()
            self.get_numbering()
            self.get_relationships()
            self.get_styles()
        except Exception:
            # The caller never receives the instance, so it could not call
            # close(); remove the partially populated directory here.
            shutil.rmtree(self.file_path, ignore_errors=True)
            raise

    def __enter__(self):
        """Return the instance itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Remove the extraction directory; exceptions are not suppressed."""
        self.close()

    def _read_part(self, relative_path):
        """Parse the extracted file at *relative_path* as XML with BeautifulSoup."""
        part_path = os.path.join(self.file_path, relative_path)
        with open(part_path, encoding="UTF-8") as f:
            return BeautifulSoup(f.read(), "xml")

    def _write_part(self, relative_path, dom):
        """Write ``str(dom)`` to the extracted file at *relative_path*."""
        part_path = os.path.join(self.file_path, relative_path)
        with open(part_path, mode="w", encoding="UTF-8") as f:
            f.write(str(dom))

    def get_numbering(self):
        """Return the numbering part, parsing ``word/numbering.xml`` on first use.

        When the archive has no numbering file an argument-less ``Numbering()``
        is created instead.
        """
        if self.numbering is not None:
            return self.numbering
        if os.path.exists(os.path.join(self.file_path, "word/numbering.xml")):
            self.numbering = Numbering(self._read_part("word/numbering.xml"))
        else:
            self.numbering = Numbering()
        return self.numbering

    def get_document(self):
        """Return the document part, parsing ``word/document.xml`` on first use."""
        if self.document is None:
            self.document = Document(self._read_part("word/document.xml"))
        return self.document

    def get_relationships(self):
        """Return the relationships part, parsing ``word/_rels/document.xml.rels`` on first use."""
        if self.relationships is None:
            self.relationships = Relationships(
                self._read_part("word/_rels/document.xml.rels"))
        return self.relationships

    def get_content_types(self):
        """Return the content-types part, parsing ``[Content_Types].xml`` on first use."""
        if self.content_types is None:
            self.content_types = ContentTypes(
                self._read_part("[Content_Types].xml"))
        return self.content_types

    def get_styles(self):
        """Return the styles part, parsing ``word/styles.xml`` on first use."""
        if self.styles is None:
            self.styles = Styles(self._read_part("word/styles.xml"))
        return self.styles

    def extract_media_files(self, path):
        """Copy the files named by the relationships' file mapping into *path*.

        Each key of the mapping is a source path relative to this package's
        ``word`` directory; the matching value is the destination path
        relative to *path*. Missing destination directories are created.
        Sources that do not exist as regular files are skipped.
        """
        import shutil

        file_mapping = self.get_relationships().get_file_mapping()
        base_dir = os.path.join(self.file_path, "word")
        for source_name, target_name in file_mapping.items():
            from_file = os.path.join(base_dir, source_name)
            to_file = os.path.join(path, target_name)
            os.makedirs(os.path.dirname(to_file), exist_ok=True)
            if not os.path.isfile(from_file):
                # Best effort, as with the shell ``cp`` this replaces: a
                # mapping entry without a backing file is not an error.
                continue
            # shutil.copy is immune to spaces/metacharacters in file names.
            shutil.copy(from_file, to_file)

    def merge(self, doc, page=False):
        """Merge the parts of another ``Docx`` into this one.

        Args:
            doc: The source ``Docx``. Its parts are asked to generate ids from
                ``doc.id`` / ``doc.num`` before merging (both attributes are
                presumably provided by ``IdAble`` -- confirm), and its media
                files are copied into this package's ``word`` directory.
            page: Forwarded unchanged to the document part's ``merge``.

        Raises:
            TypeError: If *doc* is not a ``Docx``.
        """
        if not isinstance(doc, Docx):
            raise TypeError("merge parameter is not docx")

        self.get_content_types().merge_content_types(doc.get_content_types())

        source_relationships = doc.get_relationships()
        source_relationships.generate_id(doc.id)
        # Media is copied after the ids are generated so the file mapping
        # reflects whatever generate_id changed.
        doc.extract_media_files(os.path.join(self.file_path, "word"))
        self.get_relationships().merge_relationships(source_relationships)

        source_styles = doc.get_styles()
        source_styles.generate_id(doc.id)
        self.get_styles().merge(source_styles)

        source_numberings = doc.get_numbering()
        source_numberings.generate_id(doc.num)
        self.get_numbering().merge(source_numberings)

        source_document = doc.get_document()
        source_document.generate_id(doc.id, doc.num)
        self.get_document().merge(source_document, page)

    def save(self, name):
        """Write all parts back to disk and zip the package to *name*.

        Args:
            name: Path of the ``.docx`` file to create (overwritten if present).
        """
        import zipfile

        self._save_document()
        self._save_content_types()
        self._save_relationships()
        self._save_numbering()
        self._save_styles()

        # The context manager closes the archive even if a write fails.
        with ZipFile(name, "w", compression=zipfile.ZIP_DEFLATED) as archive:
            for base, _children, files in os.walk(self.file_path):
                for f in files:
                    real_path = os.path.join(base, f)
                    # Archive names are relative to the extraction directory.
                    zip_path = os.path.relpath(real_path, self.file_path)
                    archive.write(real_path, zip_path)

    def _save_document(self):
        """Write the document part to ``word/document.xml``."""
        self._write_part("word/document.xml", self.document.get_dom())

    def _save_content_types(self):
        """Write the content-types part to ``[Content_Types].xml``."""
        self._write_part("[Content_Types].xml", self.content_types.get_dom())

    def _save_relationships(self):
        """Write the relationships part to ``word/_rels/document.xml.rels``."""
        self._write_part("word/_rels/document.xml.rels",
                         self.relationships.get_dom())

    def _save_numbering(self):
        """Write the numbering part to ``word/numbering.xml`` if it has a DOM."""
        numbering = self.numbering.get_dom()
        if not numbering:
            # Nothing to write (e.g. the package never had numbering).
            return
        self._write_part("word/numbering.xml", numbering)

    def _save_styles(self):
        """Write the styles part to ``word/styles.xml``."""
        self._write_part("word/styles.xml", self.styles.get_dom())

    def append_paragraph(self, text, align="left"):
        """Append a paragraph with *text* and alignment *align* to the document."""
        self.document.append_paragraph(text, align)

    def append_picture(self, filepath, align="left"):
        """Copy the image at *filepath* into the package and append it to the document.

        Does nothing when *filepath* does not exist. The text after the last
        ``.`` in *filepath* is registered as a content-type extension and used
        to create the relationship that names the copied file.

        Args:
            filepath: Path of the image file to embed.
            align: Alignment forwarded to the document part.
        """
        import shutil

        if not os.path.exists(filepath):
            return
        media_dir = os.path.join(self.file_path, "word/media")
        os.makedirs(media_dir, exist_ok=True)

        suffix = filepath.split(".")[-1]
        self.content_types.append_extension(suffix)
        id_file = self.relationships.append_relationship(suffix)
        file_path = os.path.join(
            self.file_path,
            "word/media/{filename}".format(filename=id_file["filename"]))
        shutil.copy(filepath, file_path)

        img = Image.open(file_path)
        try:
            width, height = img.size
        finally:
            img.close()
        # 6350 is the per-pixel scale factor used for the picture extent
        # (presumably EMUs per pixel -- confirm against Document.append_picture).
        self.document.append_picture(
            id_file["rid"], width * 6350, height * 6350, align)

    def close(self):
        """Delete the extraction directory; safe to call when it is already gone."""
        import shutil

        shutil.rmtree(self.file_path, ignore_errors=True)