def get_relationships(self):
    """Return the document relationships, loading them on first use.

    A truthy ``self.relationships`` is returned unchanged.  Otherwise
    ``word/_rels/document.xml.rels`` below ``self.file_path`` is read as
    UTF-8, parsed with BeautifulSoup's ``"xml"`` parser, wrapped in
    ``Relationships``, stored on ``self.relationships`` and returned.
    """
    cached = self.relationships
    if cached:
        return cached

    rels_path = os.path.join(self.file_path, "word/_rels/document.xml.rels")
    with open(rels_path, encoding="UTF-8") as handle:
        raw_xml = handle.read()

    self.relationships = Relationships(BeautifulSoup(raw_xml, "xml"))
    return self.relationships
def _write_chartsheet_rels_files(self): # Write the chartsheet .rels files for links to drawing files. index = 0 for worksheet in self.workbook.worksheets(): if not worksheet.is_chartsheet: continue index += 1 external_links = worksheet.external_drawing_links if not external_links: continue # Create the chartsheet .rels xlsx_dir. rels = Relationships() for link_data in external_links: rels._add_worksheet_relationship(*link_data) # Create .rels file such as /xl/chartsheets/_rels/sheet1.xml.rels. rels._set_xml_writer(self._filename('xl/chartsheets/_rels/sheet' + str(index) + '.xml.rels')) rels._assemble_xml_file()
def _write_worksheet_rels_files(self): # Write data such as hyperlinks or drawings. index = 0 for worksheet in self.workbook.worksheets(): if worksheet.is_chartsheet: continue index += 1 external_links = (worksheet.external_hyper_links + worksheet.external_drawing_links + worksheet.external_vml_links + worksheet.external_table_links + worksheet.external_comment_links) if not external_links: continue # Create the worksheet .rels dirs. rels = Relationships() for link_data in external_links: rels._add_worksheet_relationship(*link_data) # Create .rels file such as /xl/worksheets/_rels/sheet1.xml.rels. rels._set_xml_writer(self._filename('xl/worksheets/_rels/sheet' + str(index) + '.xml.rels')) rels._assemble_xml_file()
def _write_vml_drawing_rels_file(self, worksheet, index):
    """Write the vmlDrawing .rels file for one worksheet.

    Per the original note this covers worksheets with images in headers
    or footers.  Each entry of ``worksheet.vml_drawing_links`` is
    unpacked into a document relationship, and the result is written to
    ``xl/drawings/_rels/vmlDrawing<index>.vml.rels``.
    """
    rels = Relationships()
    for link in worksheet.vml_drawing_links:
        rels._add_document_relationship(*link)

    # E.g. xl/drawings/_rels/vmlDrawing1.vml.rels.
    rels_name = 'xl/drawings/_rels/vmlDrawing' + str(index) + '.vml.rels'
    rels._set_xml_writer(self._filename(rels_name))
    rels._assemble_xml_file()
def _find_relationships(self, list_tagged, global_entities):
    """Collect relationships between named entities in tagged sentences.

    ``list_tagged`` is iterated as a collection of groups of sentences;
    each sentence is a sequence of items where ``item[0]`` is handled as
    the token text, ``item[1]`` as its tag and ``item[2]`` is passed to
    ``_search_parent_entity`` together with ``global_entities``.

    Every relationship added is the 5-tuple
    ``(relation, id1, last_entity[0], id2, item[0])``.

    Returns the ``Relationships`` container that was filled.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source - confirm it against the original file.
    """
    relationships = Relationships()
    # Tags that end the relation currently being tracked.
    relation_stops_type = ['CONJ', 'WPRO', ',', '(', ')']
    relationship_stop_words = ['ex']
    for tagged in list_tagged:
        # NOTE(review): index_sentence is never read.
        for index_sentence, sentence in enumerate(tagged):
            # Tracking state is reset for every sentence.
            last_entity = None
            last_entity_index = 0
            last_relation = None
            for index, item in enumerate(sentence):
                # In order to avoid stop words
                # (single-character tokens are skipped as well).
                if( len(item[0]) == 1 or item[0].lower() in relationship_stop_words):
                    continue
                # to get the entity already identified
                elif( item[1] == 'NE'):
                    # In order to build the relationship
                    if(last_entity is not None and self._contain_main_entity(last_entity[0], item[0])):
                        # to build a relationship with anything between entities
                        # just if there is only one token between entities
                        # (and that token is longer than one character).
                        if(index-last_entity_index == 2 and len(sentence[index-1][0])>1 ):
                            id1 = self._search_parent_entity(last_entity[2], global_entities).id()
                            id2 = self._search_parent_entity(item[2], global_entities).id()
                            relation = (sentence[index-1][0], id1, last_entity[0], id2, item[0])
                            relationships.add(relation)
                        # In order to build a relationship from relation already identified
                        elif(last_relation is not None):
                            id1 = self._search_parent_entity(last_entity[2], global_entities).id()
                            id2 = self._search_parent_entity(item[2], global_entities).id()
                            relation = (last_relation, id1, last_entity[0], id2, item[0])
                            relationships.add(relation)
                    # The current entity becomes the left-hand side for
                    # whatever entity follows it.
                    last_entity = item
                    last_entity_index = index
                    last_relation = None
                # In order to get just relationships between entities
                if(last_entity is None):
                    continue
                # In order to get relationship composed by verb and noun
                # NOTE(review): 'N' in item[1] is also true for the 'NE'
                # tag, so with this nesting an entity recorded just above
                # reaches this branch too - confirm that is intended.
                elif('N' in item[1]):
                    last_relation = self._compose_verb_noun(sentence, index, last_entity_index, relation_stops_type)
                # In order to get relationship composed by one or more verbs
                elif('VB' in item[1]):
                    last_relation = self._get_composed_verbs(sentence, index, last_entity_index, relation_stops_type)
                # In order to break relationships
                elif(item[1] in relation_stops_type):
                    last_relation = None
                    last_entity = None
                    last_entity_index = 0
                # In order to remove relationships if a conjunction is found
                # (the test itself is: first element of last_relation is
                # upper-case).
                if last_relation is not None and last_relation[0].isupper():
                    last_relation = None
    return relationships
def _write_drawing_rels_files(self): # Write the drawing .rels files for worksheets with charts or drawings. index = 0 for worksheet in self.workbook.worksheets(): if not worksheet.drawing_links: continue index += 1 # Create the drawing .rels xlsx_dir. rels = Relationships() for drawing_data in worksheet.drawing_links: rels._add_document_relationship(*drawing_data) # Create .rels file such as /xl/drawings/_rels/sheet1.xml.rels. rels._set_xml_writer(self._filename('xl/drawings/_rels/drawing' + str(index) + '.xml.rels')) rels._assemble_xml_file()
def _write_workbook_rels_file(self):
    """Write the xl/_rels/workbook.xml.rels file.

    Relationships are added in this order: one per sheet in workbook
    order (chartsheets and worksheets are numbered independently, each
    from 1), then the theme, the styles, the shared strings part when
    the string table count is truthy, and the VBA project when present.
    """
    rels = Relationships()

    chartsheet_count = 0
    worksheet_count = 0
    for sheet in self.workbook.worksheets():
        if not sheet.is_chartsheet:
            worksheet_count += 1
            rel_type = '/worksheet'
            target = 'worksheets/sheet' + str(worksheet_count) + '.xml'
        else:
            chartsheet_count += 1
            rel_type = '/chartsheet'
            target = 'chartsheets/sheet' + str(chartsheet_count) + '.xml'
        rels._add_document_relationship(rel_type, target)

    rels._add_document_relationship('/theme', 'theme/theme1.xml')
    rels._add_document_relationship('/styles', 'styles.xml')

    # Add the sharedString rel if there is string data in the workbook.
    has_strings = self.workbook.str_table.count
    if has_strings:
        rels._add_document_relationship('/sharedStrings', 'sharedStrings.xml')

    # Add vbaProject if present.
    has_vba = self.workbook.vba_project
    if has_vba:
        rels._add_ms_package_relationship('/vbaProject', 'vbaProject.bin')

    rels_name = 'xl/_rels/workbook.xml.rels'
    rels._set_xml_writer(self._filename(rels_name))
    rels._assemble_xml_file()
def _write_root_rels_file(self):
    """Write the _rels/.rels xml file.

    Three relationships are added, in order: the office document
    (``xl/workbook.xml``), the core properties (``docProps/core.xml``,
    added as a package relationship) and the extended properties
    (``docProps/app.xml``).
    """
    rels = Relationships()

    # (adder, relationship type, target) in the order they are written.
    entries = (
        (rels._add_document_relationship,
         '/officeDocument', 'xl/workbook.xml'),
        (rels._add_package_relationship,
         '/metadata/core-properties', 'docProps/core.xml'),
        (rels._add_document_relationship,
         '/extended-properties', 'docProps/app.xml'),
    )
    for add_relationship, rel_type, target in entries:
        add_relationship(rel_type, target)

    rels._set_xml_writer(self._filename('_rels/.rels'))
    rels._assemble_xml_file()
class Docx(IdAble):
    """A .docx archive unpacked into a private working directory.

    The archive is extracted to ``TEMP_BASE_DIR/<uuid hex>`` and its
    main parts (document, content types, numbering, relationships and
    styles) are parsed eagerly.  ``save`` writes the parts back and
    zips the directory; ``close`` removes the directory.
    """

    def __init__(self, path):
        """Extract the archive at ``path`` and parse its parts.

        Raises:
            TypeError: if ``path`` is not a ``str``.

        Errors from opening, extracting or parsing the archive propagate
        after the working directory has been removed again.
        """
        super(Docx, self).__init__()
        if not isinstance(path, str):
            raise TypeError(
                "path must be a str, got {0}".format(type(path).__name__))

        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(TEMP_BASE_DIR, exist_ok=True)

        self.document = None
        self.content_types = None
        self.relationships = None
        self.numbering = None
        self.styles = None
        self.base_dir = uuid1().hex

        # Open the archive before creating the directory so an unreadable
        # file leaves nothing behind on disk.
        with ZipFile(path) as archive:
            self.file_path = os.path.join(TEMP_BASE_DIR, self.base_dir)
            os.mkdir(self.file_path)
            try:
                archive.extractall(self.file_path)
                self.get_document()
                self.get_content_types()
                self.get_numbering()
                self.get_relationships()
                self.get_styles()
            except Exception:
                # The caller never receives this instance, so it could
                # not call close() itself.
                self.close()
                raise

    def _parse_part(self, relative_path):
        """Read an extracted part as UTF-8 and parse it with the "xml" parser."""
        part_path = os.path.join(self.file_path, relative_path)
        with open(part_path, encoding="UTF-8") as f:
            return BeautifulSoup(f.read(), "xml")

    def _write_part(self, relative_path, dom):
        """Overwrite an extracted part with ``str(dom)`` encoded as UTF-8."""
        part_path = os.path.join(self.file_path, relative_path)
        with open(part_path, mode="w", encoding="UTF-8") as f:
            f.write(str(dom))

    def get_numbering(self):
        """Return the numbering part, loading it on first use.

        When ``word/numbering.xml`` does not exist an empty
        ``Numbering()`` is used instead.
        """
        if self.numbering:
            return self.numbering
        if not os.path.exists(os.path.join(self.file_path, "word/numbering.xml")):
            self.numbering = Numbering()
            return self.numbering
        self.numbering = Numbering(self._parse_part("word/numbering.xml"))
        return self.numbering

    def get_document(self):
        """Return the main document part, loading it on first use."""
        if self.document:
            return self.document
        self.document = Document(self._parse_part("word/document.xml"))
        return self.document

    def get_relationships(self):
        """Return the document relationships, loading them on first use."""
        if self.relationships:
            return self.relationships
        self.relationships = Relationships(
            self._parse_part("word/_rels/document.xml.rels"))
        return self.relationships

    def get_content_types(self):
        """Return the content types part, loading it on first use."""
        if self.content_types:
            return self.content_types
        self.content_types = ContentTypes(self._parse_part("[Content_Types].xml"))
        return self.content_types

    def get_styles(self):
        """Return the styles part, loading it on first use."""
        if self.styles:
            return self.styles
        self.styles = Styles(self._parse_part("word/styles.xml"))
        return self.styles

    def extract_media_files(self, path):
        """Copy the files named by the relationships' file mapping to ``path``.

        Each mapping key is resolved against this document's ``word``
        directory and copied to ``path`` joined with the mapped value;
        missing destination directories are created.
        """
        import shutil

        file_mapping = self.get_relationships().get_file_mapping()
        base_dir = os.path.join(self.file_path, "word")
        for source_name, target_name in file_mapping.items():
            from_file = os.path.join(base_dir, source_name)
            # The previous shell "cp" ignored failures, so keep skipping
            # sources that are not regular files instead of raising.
            if not os.path.isfile(from_file):
                continue
            to_file = os.path.join(path, target_name)
            dir_name = os.path.dirname(to_file)
            if dir_name:
                os.makedirs(dir_name, exist_ok=True)
            # shutil.copy needs no shell: paths with spaces or shell
            # metacharacters are copied verbatim.
            shutil.copy(from_file, to_file)

    def merge(self, doc, page=False):
        """Merge another ``Docx`` into this one.

        Content types, relationships (including media files), styles,
        numbering and the document body of ``doc`` are merged in that
        order.  The source parts get fresh ids derived from ``doc.id``
        and ``doc.num`` first (presumably provided by ``IdAble`` -
        confirm).  ``page`` is forwarded to the document merge.

        Raises:
            TypeError: if ``doc`` is not a ``Docx``.
        """
        if not isinstance(doc, Docx):
            raise TypeError("merge parameter is not docx")

        source_content_types = doc.get_content_types()
        self.get_content_types().merge_content_types(source_content_types)

        source_relationships = doc.get_relationships()
        source_relationships.generate_id(doc.id)
        doc.extract_media_files(os.path.join(self.file_path, "word"))
        self.get_relationships().merge_relationships(source_relationships)

        source_styles = doc.get_styles()
        source_styles.generate_id(doc.id)
        self.styles.merge(source_styles)

        source_numberings = doc.get_numbering()
        source_numberings.generate_id(doc.num)
        self.numbering.merge(source_numberings)

        source_document = doc.get_document()
        source_document.generate_id(doc.id, doc.num)
        self.get_document().merge(source_document, page)

    def save(self, name):
        """Write all parts back to disk and zip the directory to ``name``."""
        import zipfile

        self._save_document()
        self._save_content_types()
        self._save_relationships()
        self._save_numbering()
        self._save_styles()

        # The context manager finalises the archive even if a write fails.
        with ZipFile(name, "w", compression=zipfile.ZIP_DEFLATED) as archive:
            for base, _children, files in os.walk(self.file_path):
                for f in files:
                    real_path = os.path.join(base, f)
                    # Archive names are relative to the working directory.
                    zip_path = os.path.relpath(real_path, self.file_path)
                    archive.write(real_path, zip_path)

    def _save_document(self):
        """Write the document DOM to ``word/document.xml``."""
        self._write_part("word/document.xml", self.document.get_dom())

    def _save_content_types(self):
        """Write the content types DOM to ``[Content_Types].xml``."""
        self._write_part("[Content_Types].xml", self.content_types.get_dom())

    def _save_relationships(self):
        """Write the relationships DOM to ``word/_rels/document.xml.rels``."""
        self._write_part(
            "word/_rels/document.xml.rels", self.relationships.get_dom())

    def _save_numbering(self):
        """Write the numbering DOM to ``word/numbering.xml`` unless it is falsy."""
        numbering = self.numbering.get_dom()
        if not numbering:
            return
        self._write_part("word/numbering.xml", numbering)

    def _save_styles(self):
        """Write the styles DOM to ``word/styles.xml``."""
        self._write_part("word/styles.xml", self.styles.get_dom())

    def append_paragraph(self, text, align="left"):
        """Append a paragraph to the document; arguments are forwarded as-is."""
        self.document.append_paragraph(text, align)

    def append_picture(self, filepath, align="left"):
        """Copy the image at ``filepath`` into the document and append it.

        Does nothing when ``filepath`` does not exist.  The text after
        the last ``.`` of ``filepath`` is registered as a content-type
        extension and used to create the relationship; the image's pixel
        size multiplied by 6350 is passed on as the picture extent.
        """
        import shutil

        if not os.path.exists(filepath):
            return
        media_dir = os.path.join(self.file_path, "word/media")
        os.makedirs(media_dir, exist_ok=True)

        suffix = filepath.split(".")[-1]
        self.content_types.append_extension(suffix)
        id_file = self.relationships.append_relationship(suffix)
        file_path = os.path.join(
            self.file_path,
            "word/media/{filename}".format(filename=id_file["filename"]))
        # No shell involved, so any characters in the paths are safe.
        shutil.copy(filepath, file_path)

        img = Image.open(file_path)
        try:
            width, height = img.size
        finally:
            img.close()
        self.document.append_picture(
            id_file["rid"], width * 6350, height * 6350, align)

    def close(self):
        """Remove the working directory; a missing directory is not an error."""
        import shutil

        shutil.rmtree(self.file_path, ignore_errors=True)
from relationships import Relationships
from relationshipstats import RelationshipStats
from history import History

# Create the table of every model at import time, in declaration order.
# fail_silently=True is passed to each create_table() call.
for _model in (Relationships, RelationshipStats, History):
    _model.create_table(fail_silently=True)