def process_text_group(self, path):
    with open(path) as f:
        metadata = XmlCtsTextgroupMetadata.parse(resource=f)
    urn = str(metadata.urn)
    if urn in self.inventory:
        self.inventory[urn].update(metadata)
    else:
        self.dispatcher.dispatch(metadata, path=path)
    return metadata

def process_text_group(self, path):
    with open(path) as f:
        metadata = XmlCtsTextgroupMetadata.parse(resource=f)
    urn = str(metadata.urn)
    if urn in self.inventory["default"].textgroups:
        try:
            metadata = self.inventory[urn].update(metadata)
        except UnknownCollection as e:
            if self.RAISE_ON_UNKNOWN_COLLECTION:
                raise e
            self.logger.warning(f"Unknown text group: {e}")
            try:
                self.dispatcher.dispatch(metadata, path=path)
            except Exception:
                pass
    else:
        self.dispatcher.dispatch(metadata, path=path)
    return metadata

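# Hedged usage sketch for the two process_text_group variants above: walk a CapiTainS
# corpus layout and feed every textgroup-level __cts__.xml to the resolver. The helper
# name ingest_textgroups, the resolver argument, and the strict flag are illustrative
# assumptions; only process_text_group and RAISE_ON_UNKNOWN_COLLECTION come from the
# code above.
from glob import glob

def ingest_textgroups(resolver, base_folder, strict=False):
    resolver.RAISE_ON_UNKNOWN_COLLECTION = strict  # re-raise unknown collections instead of warning
    parsed = []
    for cts_path in sorted(glob("{base}/data/*/__cts__.xml".format(base=base_folder))):
        parsed.append(resolver.process_text_group(cts_path))
    return parsed
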
def getMetadata(self, objectId=None, **filters):
    """ Request metadata about a text or a collection

    :param objectId: Object Identifier to filter on
    :type objectId: str
    :param filters: Kwargs parameters.
    :type filters: dict
    :return: Collection
    """
    if objectId is None:
        return self.inventory
    elif objectId in self.inventory.children.keys():
        return self.inventory[objectId]
    texts, _, _ = self.__getTextMetadata__(urn=objectId)

    # We store the inventory names and, if there is only one, we recreate that inventory
    inv_names = [text.parent.parent.parent.id for text in texts]
    if len(set(inv_names)) == 1:
        inventory = XmlCtsTextInventoryMetadata(name=inv_names[0])
    else:
        inventory = XmlCtsTextInventoryMetadata()

    # For each text we found using the filter
    for text in texts:
        tg_urn = str(text.parent.parent.urn)
        wk_urn = str(text.parent.urn)
        txt_urn = str(text.urn)
        # If we need to generate a textgroup object
        if tg_urn not in inventory.textgroups:
            XmlCtsTextgroupMetadata(urn=tg_urn, parent=inventory)
        # If we need to generate a work object
        if wk_urn not in inventory.textgroups[tg_urn].works:
            XmlCtsWorkMetadata(urn=wk_urn, parent=inventory.textgroups[tg_urn])

        if isinstance(text, XmlCtsEditionMetadata):
            x = XmlCtsEditionMetadata(urn=txt_urn, parent=inventory.textgroups[tg_urn].works[wk_urn])
            x.citation = text.citation
        elif isinstance(text, XmlCtsTranslationMetadata):
            x = XmlCtsTranslationMetadata(urn=txt_urn, parent=inventory.textgroups[tg_urn].works[wk_urn], lang=text.lang)
            x.citation = text.citation
        elif isinstance(text, XmlCtsCommentaryMetadata):
            x = XmlCtsCommentaryMetadata(urn=txt_urn, parent=inventory.textgroups[tg_urn].works[wk_urn], lang=text.lang)
            x.citation = text.citation

    return inventory[objectId]

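# Hedged usage sketch for getMetadata above, assuming a resolver whose inventory has
# already been populated by parse(); the URNs are illustrative values, not taken from
# this code.
full_inventory = resolver.getMetadata()  # no objectId: return the whole inventory
textgroup = resolver.getMetadata(objectId="urn:cts:latinLit:phi0959")  # direct child of the inventory
# Any deeper identifier goes through __getTextMetadata__ and a rebuilt, filtered inventory:
edition = resolver.getMetadata(objectId="urn:cts:latinLit:phi0959.phi001.perseus-lat2")
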
def parse(self, resource):
    """ Parse a list of directories and read them into a collection

    :param resource: List of folders
    :return: An inventory resource and a list of CtsTextMetadata metadata-objects
    """
    for folder in resource:
        textgroups = glob("{base_folder}/data/*/__cts__.xml".format(base_folder=folder))
        for __cts__ in textgroups:
            try:
                with io.open(__cts__) as __xml__:
                    textgroup = XmlCtsTextgroupMetadata.parse(
                        resource=__xml__
                    )
                    tg_urn = str(textgroup.urn)
                if tg_urn in self.inventory:
                    self.inventory[tg_urn].update(textgroup)
                else:
                    self.dispatcher.dispatch(textgroup, path=__cts__)

                for __subcts__ in glob("{parent}/*/__cts__.xml".format(parent=os.path.dirname(__cts__))):
                    with io.open(__subcts__) as __xml__:
                        work = XmlCtsWorkMetadata.parse(
                            resource=__xml__,
                            parent=self.inventory[tg_urn]
                        )
                        work_urn = str(work.urn)
                        if work_urn in self.inventory[tg_urn].works:
                            self.inventory[work_urn].update(work)

                    for __textkey__ in work.texts:
                        __text__ = self.inventory[__textkey__]
                        __text__.path = "{directory}/{textgroup}.{work}.{version}.xml".format(
                            directory=os.path.dirname(__subcts__),
                            textgroup=__text__.urn.textgroup,
                            work=__text__.urn.work,
                            version=__text__.urn.version
                        )

                        if os.path.isfile(__text__.path):
                            try:
                                with io.open(__text__.path) as f:
                                    t = CapitainsCtsText(resource=self.xmlparse(f))
                                    cites = list()
                                    for cite in [c for c in t.citation][::-1]:
                                        if len(cites) >= 1:
                                            cites.append(XmlCtsCitation(
                                                xpath=cite.xpath.replace("'", '"'),
                                                scope=cite.scope.replace("'", '"'),
                                                name=cite.name,
                                                child=cites[-1]
                                            ))
                                        else:
                                            cites.append(XmlCtsCitation(
                                                xpath=cite.xpath.replace("'", '"'),
                                                scope=cite.scope.replace("'", '"'),
                                                name=cite.name
                                            ))
                                del t
                                __text__.citation = cites[-1]
                                self.logger.info("%s has been parsed ", __text__.path)
                                if __text__.citation.isEmpty() is False:
                                    self.texts.append(__text__)
                                else:
                                    self.logger.error("%s has no passages", __text__.path)
                            except Exception:
                                self.logger.error(
                                    "%s does not accept parsing at some level (most probably citation) ",
                                    __text__.path
                                )
                        else:
                            self.logger.error("%s is not present", __text__.path)
            except UndispatchedTextError as E:
                self.logger.error("Error dispatching %s ", __cts__)
                if self.RAISE_ON_UNDISPATCHED is True:
                    raise E
            except Exception as E:
                self.logger.error("Error parsing %s ", __cts__)

    return self.inventory, self.texts

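# The citation handling in parse() above rebuilds the citation scheme bottom-up: levels
# are visited deepest-first and each shallower level wraps the previous one as its child,
# so the last element of the list is the top-level citation. A minimal standalone sketch
# of that idea; chain_citations is a hypothetical helper, not part of the resolver.
def chain_citations(levels, citation_cls):
    """levels: citation levels ordered deepest-first; returns the top-level citation."""
    chained = None
    for level in levels:
        chained = citation_cls(
            xpath=level.xpath.replace("'", '"'),
            scope=level.scope.replace("'", '"'),
            name=level.name,
            child=chained
        )
    return chained
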
def parse(self, resource=None):
    # NOTE: Extracted from 2dae321722c06fe8873c5f06b3f8fdbd45f643c2
    """Parse a list of directories and load them into the collection

    :param resource: List of folders
    """
    if resource is None:
        resource = self.__resources__
    removing = []
    # start sv-metadata customization
    repo_urn_lookup = defaultdict()
    # end sv-metadata customization
    for folder in resource:
        # start sv-metadata customization
        repo_metadata = self.extract_sv_metadata(folder)
        repo_metadata["texts"] = []
        # end sv-metadata customization
        textgroups = glob("{base_folder}/data/*/__cts__.xml".format(base_folder=folder))
        for __cts__ in textgroups:
            try:
                with open(__cts__) as __xml__:
                    textgroup = TextGroup.parse(resource=__xml__)
                    tg_urn = str(textgroup.urn)
                if tg_urn in self.dispatcher.collection:
                    self.dispatcher.collection[tg_urn].update(textgroup)
                else:
                    self.dispatcher.dispatch(textgroup, path=__cts__)

                for __subcts__ in glob("{parent}/*/__cts__.xml".format(parent=os.path.dirname(__cts__))):
                    with open(__subcts__) as __xml__:
                        work = Work.parse(
                            resource=__xml__,
                            parent=self.dispatcher.collection[tg_urn]
                        )
                        work_urn = str(work.urn)
                        if work_urn in self.dispatcher.collection[tg_urn].works:
                            self.dispatcher.collection[work_urn].update(work)

                    for __textkey__ in work.texts:
                        __text__ = self.dispatcher.collection[__textkey__]
                        __text__.path = "{directory}/{textgroup}.{work}.{version}.xml".format(
                            directory=os.path.dirname(__subcts__),
                            textgroup=__text__.urn.textgroup,
                            work=__text__.urn.work,
                            version=__text__.urn.version
                        )
                        if os.path.isfile(__text__.path):
                            try:
                                t = self.read(__textkey__, __text__.path)
                                cites = list()
                                for cite in [c for c in t.citation][::-1]:
                                    if len(cites) >= 1:
                                        cites.append(Citation(
                                            xpath=cite.xpath.replace("'", '"'),
                                            scope=cite.scope.replace("'", '"'),
                                            name=cite.name,
                                            child=cites[-1]
                                        ))
                                    else:
                                        cites.append(Citation(
                                            xpath=cite.xpath.replace("'", '"'),
                                            scope=cite.scope.replace("'", '"'),
                                            name=cite.name
                                        ))
                                del t
                                __text__.citation = cites[-1]
                                self.logger.info("%s has been parsed ", __text__.path)
                                if __text__.citation.isEmpty() is True:
                                    removing.append(__textkey__)
                                    self.logger.error("%s has no passages", __text__.path)
                            except Exception as E:
                                removing.append(__textkey__)
                                self.logger.error(
                                    "%s does not accept parsing at some level (most probably citation) ",
                                    __text__.path
                                )
                            else:
                                # start sv-metadata customization
                                repo_metadata["texts"].append(str(__text__.urn))
                                # end sv-metadata customization
                        else:
                            removing.append(__textkey__)
                            self.logger.error("%s is not present", __text__.path)
            except UndispatchedTextError as E:
                self.logger.error("Error dispatching %s ", __cts__)
                if self.RAISE_ON_UNDISPATCHED is True:
                    raise E
            except Exception as E:
                self.logger.error("Error parsing %s ", __cts__)
        # start sv-metadata customization
        if repo_metadata.get("repo"):
            repo_urn_lookup[repo_metadata["repo"]] = repo_metadata
        # end sv-metadata customization

    for removable in removing:
        del self.dispatcher.collection[removable]

    removing = []
    if self.REMOVE_EMPTY is True:
        # Find resources with no readable descendants
        for item in self.dispatcher.collection.descendants:
            if item.readable != True and len(item.readableDescendants) == 0:
                removing.append(item.id)
        # Remove them only if they have not been removed before
        for removable in removing:
            if removable in self.dispatcher.collection:
                del self.dispatcher.collection[removable]

    # @@@ write out our own "inventory"
    corpus_metadata_path = Path(root_dir_path, ".scaife-viewer.json")
    json.dump(list(repo_urn_lookup.values()), open(corpus_metadata_path, "w"), indent=2)

    self.inventory = self.dispatcher.collection
    return self.inventory

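# Hedged sketch of the shape of the ".scaife-viewer.json" file written at the end of the
# sv-metadata parse() above: a JSON list with one object per repository, holding whatever
# extract_sv_metadata returned (at least a "repo" key) plus the "texts" list of URNs that
# resolved successfully. The loader name and return shape here are assumptions.
import json
from pathlib import Path

def load_corpus_metadata(root_dir_path):
    corpus_metadata_path = Path(root_dir_path, ".scaife-viewer.json")
    with open(corpus_metadata_path) as f:
        return {entry["repo"]: entry for entry in json.load(f)}
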
def parse(self, resource=None):
    """ Parse a list of directories and load them into the collection

    :param resource: List of folders
    """
    if resource is None:
        resource = self.__resources__
    removing = []
    for folder in resource:
        textgroups = glob("{base_folder}/data/*/__cts__.xml".format(base_folder=folder))
        for __cts__ in textgroups:
            try:
                with open(__cts__) as __xml__:
                    textgroup = TextGroup.parse(
                        resource=__xml__
                    )
                    tg_urn = str(textgroup.urn)
                if tg_urn in self.dispatcher.collection:
                    self.dispatcher.collection[tg_urn].update(textgroup)
                else:
                    self.dispatcher.dispatch(textgroup, path=__cts__)

                for __subcts__ in glob("{parent}/*/__cts__.xml".format(parent=os.path.dirname(__cts__))):
                    with open(__subcts__) as __xml__:
                        work = Work.parse(
                            resource=__xml__,
                            parent=self.dispatcher.collection[tg_urn]
                        )
                        work_urn = str(work.urn)
                        if work_urn in self.dispatcher.collection[tg_urn].works:
                            self.dispatcher.collection[work_urn].update(work)

                    for __textkey__ in work.texts:
                        __text__ = self.dispatcher.collection[__textkey__]
                        __text__.path = "{directory}/{textgroup}.{work}.{version}.xml".format(
                            directory=os.path.dirname(__subcts__),
                            textgroup=__text__.urn.textgroup,
                            work=__text__.urn.work,
                            version=__text__.urn.version
                        )
                        if os.path.isfile(__text__.path):
                            try:
                                t = self.read(__textkey__, __text__.path)
                                cites = list()
                                for cite in [c for c in t.citation][::-1]:
                                    if len(cites) >= 1:
                                        cites.append(Citation(
                                            xpath=cite.xpath.replace("'", '"'),
                                            scope=cite.scope.replace("'", '"'),
                                            name=cite.name,
                                            child=cites[-1]
                                        ))
                                    else:
                                        cites.append(Citation(
                                            xpath=cite.xpath.replace("'", '"'),
                                            scope=cite.scope.replace("'", '"'),
                                            name=cite.name
                                        ))
                                del t
                                __text__.citation = cites[-1]
                                self.logger.info("%s has been parsed ", __text__.path)
                                if __text__.citation.isEmpty() is True:
                                    removing.append(__textkey__)
                                    self.logger.error("%s has no passages", __text__.path)
                            except Exception as E:
                                removing.append(__textkey__)
                                self.logger.error(
                                    "%s does not accept parsing at some level (most probably citation) ",
                                    __text__.path
                                )
                        else:
                            removing.append(__textkey__)
                            self.logger.error("%s is not present", __text__.path)
            except MyCapytain.errors.UndispatchedTextError as E:
                self.logger.error("Error dispatching %s ", __cts__)
                if self.RAISE_ON_UNDISPATCHED is True:
                    raise UndispatchedTextError(E)
            except Exception as E:
                self.logger.error("Error parsing %s ", __cts__)

    for removable in removing:
        del self.dispatcher.collection[removable]

    removing = []
    if self.REMOVE_EMPTY is True:
        # Find resources with no readable descendants
        for item in self.dispatcher.collection.descendants:
            if item.readable != True and len(item.readableDescendants) == 0:
                removing.append(item.id)
        # Remove them only if they have not been removed before
        for removable in removing:
            if removable in self.dispatcher.collection:
                del self.dispatcher.collection[removable]

    self.inventory = self.dispatcher.collection
    return self.inventory

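# Hedged usage sketch for the parse() variant above. LocalCtsResolver is a hypothetical
# resolver class assumed to carry this parse() method plus the attributes it references
# (__resources__, dispatcher, logger, read, REMOVE_EMPTY, RAISE_ON_UNDISPATCHED); the
# corpus path is illustrative.
resolver = LocalCtsResolver(resource=["./corpora/canonical-latinLit"])
resolver.RAISE_ON_UNDISPATCHED = False  # log dispatch errors instead of raising
resolver.REMOVE_EMPTY = True            # prune collections with no readable descendants
inventory = resolver.parse()
print(len(inventory.readableDescendants), "readable texts in the inventory")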