Exemple #1
0
 def process_work(self, text_group_metadata, path):
     """Parse the work metadata file at ``path`` and merge it into the inventory.

     The file is parsed with the text group's inventory entry as parent.
     If the parsed work URN is already listed in that text group's
     ``works``, the existing inventory entry is updated with the parsed
     metadata.

     :param text_group_metadata: Text group metadata; only its ``urn`` is read
     :param path: Path of the work metadata file to open
     :return: The freshly parsed work metadata object
     """
     group_key = str(text_group_metadata.urn)
     with open(path) as handle:
         parsed = XmlCtsWorkMetadata.parse(
             resource=handle, parent=self.inventory[group_key]
         )
     work_key = str(parsed.urn)
     # Nothing to merge when the text group does not list this work
     if work_key not in self.inventory[group_key].works:
         return parsed
     self.inventory[work_key].update(parsed)
     return parsed
Exemple #2
0
 def process_work(self, text_group_metadata, path):
     """Parse the work metadata file at ``path`` under ``text_group_metadata``.

     When the parsed work URN is listed in ``text_group_metadata.works``,
     the inventory entry for that URN is updated with the parsed metadata
     and the result of ``update`` replaces the parsed object.

     :param text_group_metadata: Parent text group metadata object
     :param path: Path of the work metadata file to open
     :return: The result of ``update`` when the merge ran successfully,
         otherwise the parsed metadata
     :raises UnknownCollection: Re-raised from the inventory lookup/update
         only when ``RAISE_ON_UNKNOWN_COLLECTION`` is truthy; otherwise the
         error is logged as a warning
     """
     with open(path) as handle:
         parsed = XmlCtsWorkMetadata.parse(resource=handle, parent=text_group_metadata)

     work_key = str(parsed.urn)
     if work_key not in text_group_metadata.works:
         return parsed

     try:
         parsed = self.inventory[work_key].update(parsed)
     except UnknownCollection as err:
         if self.RAISE_ON_UNKNOWN_COLLECTION:
             raise err
         self.logger.warning(f"Unknown work: {err}")
     return parsed
Exemple #3
0
    def getMetadata(self, objectId=None, **filters):
        """ Request metadata about a text or a collection

        Without an identifier the whole inventory is returned; an identifier
        that is a direct child of the inventory is returned from it as is.
        Otherwise a fresh inventory is rebuilt around the texts matching
        ``objectId`` and the requested object is read from that copy.

        :param objectId: Object Identifier to filter on
        :type objectId: str
        :param filters: Kwargs parameters (accepted but not read by this method)
        :type filters: dict
        :return: Collection
        """
        if objectId is None:
            return self.inventory
        if objectId in self.inventory.children.keys():
            return self.inventory[objectId]

        matches, _, _ = self.__getTextMetadata__(urn=objectId)

        # The rebuilt inventory keeps the original name only when every match
        # comes from one and the same inventory
        source_names = [match.parent.parent.parent.id for match in matches]
        inventory_kwargs = {"name": source_names[0]} if len(set(source_names)) == 1 else {}
        inventory = XmlCtsTextInventoryMetadata(**inventory_kwargs)

        for match in matches:
            tg_urn = str(match.parent.parent.urn)
            wk_urn = str(match.parent.urn)
            txt_urn = str(match.urn)

            # Create the missing ancestors; the constructors are only given
            # their parent (presumably they register themselves on it - confirm)
            if tg_urn not in inventory.textgroups:
                XmlCtsTextgroupMetadata(urn=tg_urn, parent=inventory)
            if wk_urn not in inventory.textgroups[tg_urn].works:
                XmlCtsWorkMetadata(urn=wk_urn, parent=inventory.textgroups[tg_urn])

            if isinstance(match, XmlCtsEditionMetadata):
                clone = XmlCtsEditionMetadata(
                    urn=txt_urn,
                    parent=inventory.textgroups[tg_urn].works[wk_urn]
                )
            elif isinstance(match, XmlCtsTranslationMetadata):
                clone = XmlCtsTranslationMetadata(
                    urn=txt_urn,
                    parent=inventory.textgroups[tg_urn].works[wk_urn],
                    lang=match.lang
                )
            elif isinstance(match, XmlCtsCommentaryMetadata):
                clone = XmlCtsCommentaryMetadata(
                    urn=txt_urn,
                    parent=inventory.textgroups[tg_urn].works[wk_urn],
                    lang=match.lang
                )
            else:
                # Any other kind of text is not copied
                continue
            clone.citation = match.citation

        return inventory[objectId]
Exemple #4
0
    def parse(self, resource):
        """ Parse a list of directories and read them into a collection

        In every folder, each ``data/*/__cts__.xml`` file is parsed as a
        textgroup and each ``__cts__.xml`` one directory below it as a work.
        For every text of a work the matching XML file is opened, its
        citation scheme is copied onto the text metadata, and the text is
        appended to ``self.texts`` when that citation is not empty.
        Problems are logged; only a dispatching error may be re-raised.

        :param resource: List of folders
        :return: An inventory resource and a list of CtsTextMetadata metadata-objects
        :raises UndispatchedTextError: Only when ``RAISE_ON_UNDISPATCHED`` is True
        """
        for folder in resource:
            for tg_path in glob("{base_folder}/data/*/__cts__.xml".format(base_folder=folder)):
                try:
                    with io.open(tg_path) as tg_file:
                        textgroup = XmlCtsTextgroupMetadata.parse(resource=tg_file)
                        tg_urn = str(textgroup.urn)
                    if tg_urn not in self.inventory:
                        self.dispatcher.dispatch(textgroup, path=tg_path)
                    else:
                        self.inventory[tg_urn].update(textgroup)

                    work_pattern = "{parent}/*/__cts__.xml".format(parent=os.path.dirname(tg_path))
                    for work_path in glob(work_pattern):
                        with io.open(work_path) as work_file:
                            work = XmlCtsWorkMetadata.parse(
                                resource=work_file,
                                parent=self.inventory[tg_urn]
                            )
                            work_urn = str(work.urn)
                            if work_urn in self.inventory[tg_urn].works:
                                self.inventory[work_urn].update(work)

                        work_dir = os.path.dirname(work_path)
                        for text_key in work.texts:
                            text_meta = self.inventory[text_key]
                            text_meta.path = "{directory}/{textgroup}.{work}.{version}.xml".format(
                                directory=work_dir,
                                textgroup=text_meta.urn.textgroup,
                                work=text_meta.urn.work,
                                version=text_meta.urn.version
                            )
                            if not os.path.isfile(text_meta.path):
                                self.logger.error("%s is not present", text_meta.path)
                                continue

                            try:
                                with io.open(text_meta.path) as text_file:
                                    parsed_text = CapitainsCtsText(resource=self.xmlparse(text_file))
                                    chain = []
                                    # Walk the citation levels in reverse so that each
                                    # new level can take the previous one as its child
                                    for level in reversed([c for c in parsed_text.citation]):
                                        level_kwargs = {
                                            "xpath": level.xpath.replace("'", '"'),
                                            "scope": level.scope.replace("'", '"'),
                                            "name": level.name,
                                        }
                                        if chain:
                                            level_kwargs["child"] = chain[-1]
                                        chain.append(XmlCtsCitation(**level_kwargs))
                                    del parsed_text
                                # An empty chain raises IndexError here, which the
                                # handler below reports as a parsing problem
                                text_meta.citation = chain[-1]
                                self.logger.info("%s has been parsed ", text_meta.path)
                                if text_meta.citation.isEmpty() is False:
                                    self.texts.append(text_meta)
                                else:
                                    self.logger.error("%s has no passages", text_meta.path)
                            except Exception:
                                self.logger.error(
                                    "%s does not accept parsing at some level (most probably citation) ",
                                    text_meta.path
                                )
                except UndispatchedTextError as dispatch_error:
                    self.logger.error("Error dispatching %s ", tg_path)
                    if self.RAISE_ON_UNDISPATCHED is True:
                        raise dispatch_error
                except Exception:
                    self.logger.error("Error parsing %s ", tg_path)

        return self.inventory, self.texts
    def parse(self, resource=None):
        # NOTE: Extracted from 2dae321722c06fe8873c5f06b3f8fdbd45f643c2
        """Parse a list of directories into the dispatcher's collection.

        In every folder, each ``data/*/__cts__.xml`` file is parsed as a
        textgroup and each ``__cts__.xml`` one directory below it as a work.
        Every text of a work is read and its citation scheme is copied onto
        the text metadata. Texts that are missing on disk, fail to parse or
        have an empty citation are removed from the collection afterwards.
        When ``REMOVE_EMPTY`` is True, items that are neither readable nor
        have readable descendants are removed as well.

        As an sv-metadata customization, the per-folder metadata returned by
        ``extract_sv_metadata`` is extended with the URNs of the texts that
        parsed without raising, and the list of those records is written as
        JSON to ``.scaife-viewer.json``.

        :param resource: List of folders; defaults to ``self.__resources__``
        :return: The dispatcher's collection, also stored as ``self.inventory``
        :raises UndispatchedTextError: Only when ``RAISE_ON_UNDISPATCHED`` is True
        """
        if resource is None:
            resource = self.__resources__
        removing = []
        # start sv-metadata customization
        # A plain dict is enough: the lookup is only ever written by key and
        # read back through ``values()``, no default factory is involved.
        repo_urn_lookup = {}
        # end sv-metadata customization
        for folder in resource:
            # start sv-metadata customization
            repo_metadata = self.extract_sv_metadata(folder)
            repo_metadata["texts"] = []
            # end sv-metadata customization

            textgroups = glob(
                "{base_folder}/data/*/__cts__.xml".format(base_folder=folder))
            for __cts__ in textgroups:
                try:
                    with open(__cts__) as __xml__:
                        textgroup = TextGroup.parse(resource=__xml__)
                        tg_urn = str(textgroup.urn)
                    if tg_urn in self.dispatcher.collection:
                        self.dispatcher.collection[tg_urn].update(textgroup)
                    else:
                        self.dispatcher.dispatch(textgroup, path=__cts__)

                    for __subcts__ in glob("{parent}/*/__cts__.xml".format(
                            parent=os.path.dirname(__cts__))):
                        with open(__subcts__) as __xml__:
                            work = Work.parse(
                                resource=__xml__,
                                parent=self.dispatcher.collection[tg_urn])
                            work_urn = str(work.urn)
                            if work_urn in self.dispatcher.collection[
                                    tg_urn].works:
                                self.dispatcher.collection[work_urn].update(
                                    work)

                        for __textkey__ in work.texts:
                            __text__ = self.dispatcher.collection[__textkey__]
                            __text__.path = "{directory}/{textgroup}.{work}.{version}.xml".format(
                                directory=os.path.dirname(__subcts__),
                                textgroup=__text__.urn.textgroup,
                                work=__text__.urn.work,
                                version=__text__.urn.version)
                            if os.path.isfile(__text__.path):
                                try:
                                    t = self.read(__textkey__, __text__.path)
                                    cites = list()
                                    # Walk the citation levels in reverse so each new
                                    # level can take the previous one as its child
                                    for cite in [c for c in t.citation][::-1]:
                                        if len(cites) >= 1:
                                            cites.append(
                                                Citation(
                                                    xpath=cite.xpath.replace(
                                                        "'", '"'),
                                                    scope=cite.scope.replace(
                                                        "'", '"'),
                                                    name=cite.name,
                                                    child=cites[-1]))
                                        else:
                                            cites.append(
                                                Citation(
                                                    xpath=cite.xpath.replace(
                                                        "'", '"'),
                                                    scope=cite.scope.replace(
                                                        "'", '"'),
                                                    name=cite.name))
                                    del t
                                    # An empty ``cites`` raises IndexError here and is
                                    # handled as a parsing failure below
                                    __text__.citation = cites[-1]
                                    self.logger.info("%s has been parsed ",
                                                     __text__.path)
                                    if __text__.citation.isEmpty() is True:
                                        removing.append(__textkey__)
                                        self.logger.error(
                                            "%s has no passages",
                                            __text__.path)
                                except Exception:
                                    removing.append(__textkey__)
                                    self.logger.error(
                                        "%s does not accept parsing at some level (most probably citation) ",
                                        __text__.path)
                                else:
                                    # start sv-metadata customization
                                    # Recorded whenever parsing did not raise, which
                                    # includes texts flagged above as having no passages
                                    repo_metadata["texts"].append(
                                        str(__text__.urn))
                                    # end sv-metadata customization
                            else:
                                removing.append(__textkey__)
                                self.logger.error("%s is not present",
                                                  __text__.path)
                except UndispatchedTextError as E:
                    self.logger.error("Error dispatching %s ", __cts__)
                    if self.RAISE_ON_UNDISPATCHED is True:
                        raise E
                except Exception:
                    self.logger.error("Error parsing %s ", __cts__)

            # start sv-metadata customization
            if repo_metadata.get("repo"):
                repo_urn_lookup[repo_metadata["repo"]] = repo_metadata
            # end sv-metadata customization

        for removable in removing:
            del self.dispatcher.collection[removable]

        removing = []

        if self.REMOVE_EMPTY is True:
            # Find resource with no readable descendants
            for item in self.dispatcher.collection.descendants:
                if item.readable != True and len(
                        item.readableDescendants) == 0:
                    removing.append(item.id)

            # Remove them only if they have not been removed before
            for removable in removing:
                if removable in self.dispatcher.collection:
                    del self.dispatcher.collection[removable]

        # @@@ write out our own "inventory"
        # NOTE(review): ``root_dir_path`` is not defined inside this method;
        # presumably it is a module-level name - confirm.
        corpus_metadata_path = Path(root_dir_path, ".scaife-viewer.json")
        # Use a context manager so the file is flushed and closed
        # deterministically instead of whenever the handle is collected.
        with open(corpus_metadata_path, "w") as corpus_metadata_file:
            json.dump(list(repo_urn_lookup.values()),
                      corpus_metadata_file,
                      indent=2)

        self.inventory = self.dispatcher.collection
        return self.inventory
            tgid = "cil004-{}00".format(i)
            i = int(i + "00")
        else:
            i = 0
            tgid = "pages"

        urn = "urn:cts:pompei:{}.{}.manfred-lat1".format(tgid, text_id.strip())

        text = p.xpath(".//br")[-1].tail.replace("&lt;", "<").replace(
            "\n", "").replace("&gt;", ">")
        epi_converter.reset()

        text_converted = epi_converter.convert(text)
        text_xml = template.render(title=text_id, xml=text_converted, urn=urn)

        work = XmlCtsWorkMetadata(urn=(URN(urn)).upTo(URN.WORK))
        work.set_cts_property("title", text_id, lang="eng")

        for ident in additional_ids:
            work.metadata.add(DC.term("identifier"), ident)
        if text_image is not None:
            work.metadata.add(DCTERMS.term("isFormatOf"), text_image)
        if trismegistos is not None:
            work.metadata.add(SAWS.term("identifier"),
                              "www.trismegistos.org/text/" + trismegistos[1:])
        if trismegistos_place is not None:
            work.metadata.add(
                SAWS.term("isLocatedAt"),
                "http://www.trismegistos.org/place/" + trismegistos_place)
        if placename is not None:
            work.metadata.add(SAWS.term("isLocatedAt"), placename)
Exemple #7
0
    def parse(self, resource=None):
        """ Parse a list of directories into the dispatcher's collection

        In every folder, each ``data/*/__cts__.xml`` file is parsed as a
        textgroup and each ``__cts__.xml`` one directory below it as a work.
        Every text of a work is read and its citation scheme is copied onto
        the text metadata. Texts that are missing on disk, fail to parse or
        have an empty citation are removed from the collection afterwards.
        When ``REMOVE_EMPTY`` is True, items that are neither readable nor
        have readable descendants are removed as well.

        :param resource: List of folders; defaults to ``self.__resources__``
        :return: The dispatcher's collection, also stored as ``self.inventory``
        """
        if resource is None:
            resource = self.__resources__

        unusable_texts = []
        for folder in resource:
            for tg_path in glob("{base_folder}/data/*/__cts__.xml".format(base_folder=folder)):
                try:
                    with open(tg_path) as tg_file:
                        textgroup = TextGroup.parse(resource=tg_file)
                        tg_urn = str(textgroup.urn)
                    if tg_urn not in self.dispatcher.collection:
                        self.dispatcher.dispatch(textgroup, path=tg_path)
                    else:
                        self.dispatcher.collection[tg_urn].update(textgroup)

                    work_pattern = "{parent}/*/__cts__.xml".format(parent=os.path.dirname(tg_path))
                    for work_path in glob(work_pattern):
                        with open(work_path) as work_file:
                            work = Work.parse(
                                resource=work_file,
                                parent=self.dispatcher.collection[tg_urn]
                            )
                            work_urn = str(work.urn)
                            if work_urn in self.dispatcher.collection[tg_urn].works:
                                self.dispatcher.collection[work_urn].update(work)

                        work_dir = os.path.dirname(work_path)
                        for text_key in work.texts:
                            text_meta = self.dispatcher.collection[text_key]
                            text_meta.path = "{directory}/{textgroup}.{work}.{version}.xml".format(
                                directory=work_dir,
                                textgroup=text_meta.urn.textgroup,
                                work=text_meta.urn.work,
                                version=text_meta.urn.version
                            )
                            if not os.path.isfile(text_meta.path):
                                unusable_texts.append(text_key)
                                self.logger.error("%s is not present", text_meta.path)
                                continue

                            try:
                                parsed_text = self.read(text_key, text_meta.path)
                                chain = []
                                # Walk the citation levels in reverse so that each
                                # new level can take the previous one as its child
                                for level in reversed([c for c in parsed_text.citation]):
                                    level_kwargs = {
                                        "xpath": level.xpath.replace("'", '"'),
                                        "scope": level.scope.replace("'", '"'),
                                        "name": level.name,
                                    }
                                    if chain:
                                        level_kwargs["child"] = chain[-1]
                                    chain.append(Citation(**level_kwargs))
                                del parsed_text
                                # An empty chain raises IndexError here, which the
                                # handler below reports as a parsing problem
                                text_meta.citation = chain[-1]
                                self.logger.info("%s has been parsed ", text_meta.path)
                                if text_meta.citation.isEmpty() is True:
                                    unusable_texts.append(text_key)
                                    self.logger.error("%s has no passages", text_meta.path)
                            except Exception:
                                unusable_texts.append(text_key)
                                self.logger.error(
                                    "%s does not accept parsing at some level (most probably citation) ",
                                    text_meta.path
                                )
                except MyCapytain.errors.UndispatchedTextError as dispatch_error:
                    self.logger.error("Error dispatching %s ", tg_path)
                    if self.RAISE_ON_UNDISPATCHED is True:
                        # Re-raised wrapped in the UndispatchedTextError name in scope here
                        raise UndispatchedTextError(dispatch_error)
                except Exception:
                    self.logger.error("Error parsing %s ", tg_path)

        for text_key in unusable_texts:
            del self.dispatcher.collection[text_key]

        if self.REMOVE_EMPTY is True:
            # Collect first, delete afterwards: resources with no readable descendants
            empty_ids = [
                item.id
                for item in self.dispatcher.collection.descendants
                if item.readable != True and len(item.readableDescendants) == 0
            ]

            # Remove them only if they have not been removed before
            for item_id in empty_ids:
                if item_id in self.dispatcher.collection:
                    del self.dispatcher.collection[item_id]

        self.inventory = self.dispatcher.collection
        return self.inventory