コード例 #1
0
ファイル: Dataset.py プロジェクト: sgml/dipper
class Dataset:
    """
    Produce the metadata about a dataset,
    following the example laid out here:
    http://htmlpreview.github.io/?
    https://github.com/joejimbo/HCLSDatasetDescriptions/blob/master/Overview.html#appendix_1
    (mind the wrap: the two lines above form one URL)

    Every triple is written to ``self.graph``; ``getGraph()`` returns it.
    """
    def __init__(
            self,
            identifier,  # name? should be Archive url via Source
            title,
            url,
            ingest_desc=None,
            license_url=None,
            data_rights=None,
            graph_type='rdf_graph',  # rdf_graph, streamed_graph
            file_handle=None):
        """
        Create the graph for this dataset and write its base triples.

        :param identifier: subject of every dataset triple; also written as
            the dcterms:identifier literal and passed to the graph constructor
        :param title: dcterms:title literal; when None the identifier is used
        :param url: object of the foaf:page triple (skipped when None)
        :param ingest_desc: optional description added through the model
        :param license_url: object of the dcterms:license triple
            (skipped when None)
        :param data_rights: dcterms:rights literal (skipped when None)
        :param graph_type: None, 'rdf_graph' or 'streamed_graph'
        :param file_handle: forwarded to StreamedGraph only
        :raises ValueError: if graph_type is not one of the accepted values
        """
        if graph_type is None:
            self.graph = RDFGraph(None, identifier)
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(True,
                                       identifier,
                                       file_handle=file_handle)
        elif graph_type == 'rdf_graph':
            self.graph = RDFGraph(True, identifier)
        else:
            # Previously self.graph was silently left unset here and the
            # constructor died a few lines later with an AttributeError.
            raise ValueError(
                "unknown graph_type %r; expected None, 'rdf_graph' "
                "or 'streamed_graph'" % (graph_type,))

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        # TODO: move hard coded curies to translation table calls
        self.identifier = identifier
        self.title = identifier if title is None else title
        self.version = None
        self.date_issued = None

        # The date_accessed value is later used as a literal of properties
        # such as dcterms:issued, which needs to conform to the xsd:dateTime
        # format.
        # TODO ... we need to have a talk about typed literals and SPARQL
        self.date_accessed = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

        self.citation = set()
        self.license_url = license_url
        self.model.addType(self.identifier, 'dctypes:Dataset')
        # Use self.title (not the raw argument) so the identifier fallback
        # chosen above is what actually lands in the graph.
        self.graph.addTriple(
            self.identifier, 'dcterms:title', self.title, True)
        self.graph.addTriple(self.identifier, 'dcterms:identifier', identifier,
                             True)
        if url is not None:
            self.graph.addTriple(self.identifier, 'foaf:page', url)
        # maybe in the future add the logo here:
        # schemaorg:logo  <uri>
        # TODO add the license info
        # FIXME:Temporarily making this in IF statement,
        #  can revert after all current resources are updated.
        if license_url is not None:
            self.graph.addTriple(self.identifier, 'dcterms:license',
                                 license_url)
        else:
            LOG.debug('No license provided.')
        if data_rights is not None:
            self.graph.addTriple(self.identifier,
                                 'dcterms:rights',
                                 data_rights,
                                 object_is_literal=True)
        else:
            LOG.debug('No rights provided.')

        if ingest_desc is not None:
            self.model.addDescription(self.identifier, ingest_desc)

    def setVersion(self, date_issued, version_id=None):
        """
        Legacy function...
            should use the other set_* for version and date

        as of 2016-10-20  used in:

        dipper/sources/HPOAnnotations.py 139:
        dipper/sources/CTD.py             99:
        dipper/sources/BioGrid.py        100:
        dipper/sources/MGI.py            255:
        dipper/sources/EOM.py             93:
        dipper/sources/Coriell.py        200:
        dipper/sources/MMRRC.py           77:

        # TODO set as deprecated

        Behaviour:
          - both arguments None: an error is logged and nothing is written
          - date_issued given: the issued date is recorded
          - version_id given: it becomes the version, otherwise the version
            is derived from the date

        :param date_issued: issue date of the source data, or None
        :param version_id: explicit version string, or None
        :return: None

        """

        if date_issued is None and version_id is None:
            LOG.error("date or version not set!")
            # TODO throw error
            return

        if date_issued is not None:
            self.set_date_issued(date_issued)

        # The version triples are written exactly once; the former control
        # flow wrote them twice when only version_id was supplied.
        if version_id is not None:
            self.set_version_by_num(version_id)
        else:
            self.set_version_by_date(date_issued)

        LOG.info("set version to %s", self.version)

    def set_date_issued(self, date_issued):
        """
        Remember the issue date and write it as a dcterms:issued literal.

        :param date_issued: the date value to store and emit
        :return: None
        """
        self.date_issued = date_issued
        self.graph.addTriple(self.identifier,
                             'dcterms:issued',
                             date_issued,
                             object_is_literal=True)
        LOG.info("setting date to %s", date_issued)

    def set_version_by_date(self, date_issued=None):
        """
        This will set the version by the date supplied,
        the date already stored in the dataset description,
        or by the download date (today)
        :param date_issued: date to use; falls back as described above
        :return: None
        """

        if date_issued is not None:
            dat = date_issued
        elif self.date_issued is not None:
            dat = self.date_issued
        else:
            dat = self.date_accessed
            LOG.info(
                "No date supplied, using download timestamp for date_issued")
        LOG.info("setting version by date to: %s", dat)
        self.set_version_by_num(dat)

    def set_version_by_num(self, version_num):
        """
        Set ``self.version`` and write the version triples.

        The version resource is the plain concatenation of the dataset
        identifier and ``version_num``. When ``version_num`` differs from the
        access timestamp, a second resource named after that timestamp is
        also described.

        :param version_num: version string appended to the identifier
        :return: None
        """
        self.version = self.identifier + version_num
        self.graph.addTriple(self.version, 'dcterms:isVersionOf',
                             self.identifier)
        self.graph.addTriple(self.version,
                             'pav:version',
                             version_num,
                             object_is_literal=True)

        LOG.info("setting version to %s", self.version)

        # set the monarch-generated-version of the resource-version
        # TODO sync this up with the ontology version
        if version_num != self.date_accessed:
            dipperized_version = ':' + str(self.date_accessed)
            self.graph.addTriple(dipperized_version, 'dcterms:isVersionOf',
                                 "MonarchData:" + self.identifier +
                                 ".ttl")  # fix suffix
            self.graph.addTriple(dipperized_version,
                                 'pav:version',
                                 self.date_accessed,
                                 object_is_literal=True)
            self.graph.addTriple(dipperized_version,
                                 'dcterms:issued',
                                 self.date_accessed,
                                 object_is_literal=True,
                                 literal_type="xsd:dateTime")

    def setFileAccessUrl(self, url, is_object_literal=False):
        """
        Write a dcat:accessURL triple for the dataset.

        :param url: the access URL
        :param is_object_literal: forwarded to addTriple as the literal flag
        """
        self.graph.addTriple(self.identifier, 'dcat:accessURL', url,
                             is_object_literal)

    def getGraph(self):
        """Return the graph that holds the dataset triples."""
        return self.graph

    def set_license(self, license_url):
        """
        Store the license URL on the instance.

        Note: this only updates the attribute; no triple is written here.
        """
        self.license_url = license_url

    def get_license(self):
        """Return the stored license URL (may be None)."""
        return self.license_url

    def set_citation(self, citation_id):
        """
        Add a citation id to the set of citations kept on the instance.

        :param citation_id: identifier of the citation
        """
        self.citation.add(citation_id)
        # TODO
        # model.addTriple(self.identifier, 'cito:citeAsAuthority', citation_id)
コード例 #2
0
ファイル: Dataset.py プロジェクト: lwinfree/dipper
class Dataset:
    """
    Produce the metadata about a dataset,
    following the example laid out here:
    http://htmlpreview.github.io/?
    https://github.com/joejimbo/HCLSDatasetDescriptions/blob/master/Overview.html#appendix_1
    (mind the wrap: the two lines above form one URL)

    Every triple is written to ``self.graph``; ``getGraph()`` returns it.
    """
    def __init__(self,
                 identifier,
                 title,
                 url,
                 description=None,
                 license_url=None,
                 data_rights=None,
                 graph_type=None,
                 file_handle=None):
        """
        Create the graph for this dataset and write its base triples.

        :param identifier: written as the dct:identifier literal; prefixed
            with ':' it becomes the subject of every dataset triple
        :param title: dct:title literal
        :param url: object of the foaf:page triple
        :param description: optional description added through the model
        :param license_url: object of the dct:license triple
            (skipped when None)
        :param data_rights: dct:rights literal (skipped when None)
        :param graph_type: None, 'rdf_graph' or 'streamed_graph'
        :param file_handle: forwarded to StreamedGraph only
        :raises ValueError: if graph_type is not one of the accepted values
        """
        if graph_type is None or graph_type == 'rdf_graph':
            self.graph = RDFGraph()
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(True, file_handle=file_handle)
        else:
            # Previously self.graph was silently left unset here and the
            # constructor died on the next line with an AttributeError.
            raise ValueError(
                "unknown graph_type %r; expected None, 'rdf_graph' "
                "or 'streamed_graph'" % (graph_type,))
        self.model = Model(self.graph)
        self.identifier = ':' + identifier
        self.version = None
        self.date_issued = None

        # The date_accessed value is later used as an object literal of
        # properties such as dct:issued, which needs to conform to the
        # xsd:dateTime format.
        self.date_accessed = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

        self.citation = set()
        self.license = license_url
        self.model.addType(self.identifier, 'dctypes:Dataset')
        self.graph.addTriple(self.identifier, 'dct:title', title, True)
        self.graph.addTriple(self.identifier,
                             'dct:identifier',
                             identifier,
                             object_is_literal=True)
        self.graph.addTriple(self.identifier, 'foaf:page', url)
        # maybe in the future add the logo here:
        # schemaorg:logo <http://www.ebi.ac.uk/rdf/sites/ebi.ac.uk.rdf/files/resize/images/rdf/chembl_service_logo-146x48.gif> .

        # TODO add the licence info
        # FIXME:Temporarily making this in IF statement,
        #  can revert after all current resources are updated.
        if license_url is not None:
            self.graph.addTriple(self.identifier, 'dct:license', license_url)
        else:
            logger.debug('No license provided.')
        if data_rights is not None:
            self.graph.addTriple(self.identifier,
                                 'dct:rights',
                                 data_rights,
                                 object_is_literal=True)
        else:
            logger.debug('No rights provided.')

        if description is not None:
            self.model.addDescription(self.identifier, description)

    def setVersion(self, date_issued, version_id=None):
        """
        Legacy function...
            should use the other set_* for version and date

        as of 2016-10-20  used in:

        dipper/sources/HPOAnnotations.py 139:
        dipper/sources/CTD.py             99:
        dipper/sources/BioGrid.py        100:
        dipper/sources/MGI.py            255:
        dipper/sources/EOM.py             93:
        dipper/sources/Coriell.py        200:
        dipper/sources/MMRRC.py           77:

        # TODO set as deprecated

        Behaviour:
          - both arguments None: an error is logged and nothing is written
          - date_issued given: the issued date is recorded
          - version_id given: it becomes the version, otherwise the version
            is derived from the date

        :param date_issued: issue date of the source data, or None
        :param version_id: explicit version string, or None
        :return: None

        """

        if date_issued is None and version_id is None:
            logger.error("date or version not set!")
            # TODO throw error
            return

        if date_issued is not None:
            self.set_date_issued(date_issued)

        # The version triples are written exactly once; the former control
        # flow wrote them twice when only version_id was supplied.
        if version_id is not None:
            self.set_version_by_num(version_id)
        else:
            self.set_version_by_date(date_issued)

        logger.info("set version to %s", self.version)

    def set_date_issued(self, date_issued):
        """
        Remember the issue date and write it as a dct:issued literal.

        :param date_issued: the date value to store and emit
        :return: None
        """
        self.date_issued = date_issued
        self.graph.addTriple(self.identifier,
                             'dct:issued',
                             date_issued,
                             object_is_literal=True)
        logger.info("setting date to %s", date_issued)

    def set_version_by_date(self, date_issued=None):
        """
        This will set the version by the date supplied,
        the date already stored in the dataset description,
        or by the download date (today)
        :param date_issued: date to use; falls back as described above
        :return: None
        """

        if date_issued is not None:
            version_date = date_issued
        elif self.date_issued is not None:
            version_date = self.date_issued
        else:
            version_date = self.date_accessed
            logger.info("No date supplied for setting version; "
                        "using download timestamp for date_issued")

        logger.info("setting version by date")
        self.set_version_by_num(version_date)

    def set_version_by_num(self, version_num):
        """
        Set ``self.version`` and write the version triples.

        The version resource is the plain concatenation of the dataset
        identifier and ``version_num``. When ``version_num`` differs from the
        access timestamp, a second resource named after that timestamp is
        written as a version of the first.

        :param version_num: version string appended to the identifier
        :return: None
        """
        self.version = self.identifier + version_num
        self.graph.addTriple(self.version, 'dct:isVersionOf', self.identifier)
        self.graph.addTriple(self.version,
                             'pav:version',
                             version_num,
                             object_is_literal=True)

        logger.info("setting version to %s", self.version)

        # set the monarch-generated-version of the resource-version
        # TODO sync this up with the ontology version
        if version_num != self.date_accessed:
            dipperized_version = ':' + str(self.date_accessed)
            self.graph.addTriple(dipperized_version, 'dct:isVersionOf',
                                 self.version)
            self.graph.addTriple(dipperized_version,
                                 'pav:version',
                                 self.date_accessed,
                                 object_is_literal=True)
            self.graph.addTriple(dipperized_version,
                                 'dct:issued',
                                 self.date_accessed,
                                 object_is_literal=True,
                                 literal_type="xsd:dateTime")

    def setFileAccessUrl(self, url, is_object_literal=False):
        """
        Write a dcat:accessURL triple for the dataset.

        :param url: the access URL
        :param is_object_literal: forwarded to addTriple as the literal flag
        """
        self.graph.addTriple(self.identifier, 'dcat:accessURL', url,
                             is_object_literal)

    def getGraph(self):
        """Return the graph that holds the dataset triples."""
        return self.graph

    def set_license(self, license):
        """
        Store the license on the instance.

        Note: this only updates the attribute; no triple is written here.
        """
        self.license = license

    def get_license(self):
        """Return the stored license (may be None)."""
        return self.license

    def set_citation(self, citation_id):
        """
        Add a citation id to the set of citations kept on the instance.

        :param citation_id: identifier of the citation
        """
        self.citation.add(citation_id)
        # TODO
        # model.addTriple(self.identifier, 'cito:citeAsAuthority', citation_id)
コード例 #3
0
# Fold the triples of the parent, child and equivalence graphs into the
# disease graph, in that order.
for bbop_graph in (parent_graph, child_graph, eq_graph):
    disease_graph = add_triples_from_bbop(bbop_graph, disease_graph)

# Output locations: the bare stub, and the stub enriched with labels.
mondo_stub = './output/mondo-stub.xml'
mondo_stub_ttl = './output/mondo-stub.ttl'
mondo_stub_lbl = './output/mondo-stub-wlabels.xml'
mondo_stub_ttl_lbl = './output/mondo-stub-wlabels.ttl'

# The label-free stub has to be written before any label triple is added.
disease_graph.serialize(mondo_stub, 'xml')
disease_graph.serialize(mondo_stub_ttl, 'ttl')

# Attach an rdfs:label literal for every node of the three graphs.
for bbop_graph in (parent_graph, child_graph, eq_graph):
    for node in bbop_graph.nodes:
        disease_graph.addTriple(node.id, 'rdfs:label', node.label, True)

disease_graph.serialize(mondo_stub_lbl, 'xml')
disease_graph.serialize(mondo_stub_ttl_lbl, 'ttl')

data_graph = RDFGraph()
sg = SciGraph(SCIGRAPH_DATA)


# Get all children + sickle cell
コード例 #4
0
ファイル: Dataset.py プロジェクト: tegar9000/dipper-1
class Dataset:
    """
     This class produces metadata about a dataset that is compliant with the
     HCLS dataset specification:
     https://www.w3.org/TR/2015/NOTE-hcls-dataset-20150514/#s4_4

     Summary level: The summary level provides a description of a dataset that is
     independent of a specific version or format. (e.g. the Monarch ingest of CTD)
     CURIE for this is something like MonarchData:[SOURCE IDENTIFIER]

     Version level: The version level captures version-specific characteristics of a
     dataset. (e.g. the 01-02-2018 ingest of CTD)
     CURIE for this is something like MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP]

     Distribution level: The distribution level captures metadata about a specific form
     and version of a dataset (e.g. turtle file for 01-02-2018 ingest of CTD). There is
     a [distribution level resource] for each different downloadable file we emit,
     i.e. one for the TTL file, one for the ntriples file, etc.
     CURIE for this is like MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP].ttl
     or
     MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP].nt
     or
     MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP].[whatever file format]

     We write out at least the following triples:

     SUMMARY LEVEL TRIPLES:
     [summary level resource] - rdf:type -> dctypes:Dataset
     [summary level resource] - dc:title -> title (literal)
     [summary level resource] - dc:description -> description (literal)
                                                (use docstring from Source class)
     [summary level resource] - dc:source -> [source web page, e.g. omim.org]
     [summary level resource] - schema:logo -> [source logo IRI]
     [summary level resource] - dc:publisher -> monarchinitiative.org
        n.b: about summary level resource triples:
        -- HCLS spec says we "should" link to our logo and web page, but I'm not,
        because it would confuse the issue of whether we are pointing to our logo/page
        or the logo/page of the data source for this ingest. Same below for
        [version level resource] and [distribution level resource] - I'm not linking to
        our page/logo down there either.
        - spec says we "should" include summary level triples describing Update
        frequency and SPARQL endpoint but I'm omitting this for now, because these are
        not clearly defined at the moment

     VERSION LEVEL TRIPLES:
     [version level resource] - rdf:type -> dctypes:Dataset
     [version level resource] - dc:title -> version title (literal)
     [version level resource] - dc:description -> version description (literal)
     [version level resource] - dc:created -> ingest timestamp [ISO 8601 compliant]
     [version level resource] - pav:version -> ingest timestamp (same one above)
     [version level resource] - dc:creator	-> monarchinitiative.org
     [version level resource] - dc:publisher -> monarchinitiative.org
     [version level resource] - dc:isVersionOf -> [summary level resource]
     [version level resource] - dc:source -> [source file 1 IRI]
     [version level resource] - dc:source -> [source file 2 IRI]
     ...

     [source file 1 IRI] - pav:retrievedOn -> [download date timestamp]
     [source file 1 IRI] - pav:version -> [source version (if set, optional)]
     [source file 2 IRI] - pav:retrievedOn -> [download date timestamp]
     [source file 2 IRI] - pav:version -> [source version (if set, optional)]
     ...

     [version level resource] - pav:createdWith -> [Dipper github URI]
     [version level resource] - void:dataset -> [distribution level resource]

     [version level resource] - cito:citesAsAuthority -> [citation id 1]
     [version level resource] - cito:citesAsAuthority -> [citation id 2]
     [version level resource] - cito:citesAsAuthority -> [citation id 3]

        n.b: about version level resource triples:
        - spec says we "should" include Date of issue/dc:issued triple, but I'm not
        because it is redundant with this triple above:
        [version level resource] - dc:created -> time stamp
        and would introduce ambiguity and confusion if the two disagree. Same below
        for [distribution level resource] - dc:created -> time stamp below
        Also omitting:
          - triples linking to our logo and page, see above.
          - License/dc:license triple, because we will make this triple via the
            [distribution level resource] below
          - Language/dc:language triple b/c it seems superfluous. Same below for
            [distribution level resource] - no language triple.
        - [version level resource] - pav:version triple is also a bit redundant
        with the pav:version triple below, but the spec requires both these triples
        - I'm omitting the [version level resource] -> pav:previousVersion because
        Dipper doesn't know this info for certain at run time. Same below for
        [distribution level resource] - pav:previousVersion.


     DISTRIBUTION LEVEL TRIPLES:
     [distribution level resource] - rdf:type -> dctypes:Dataset
     [distribution level resource] - rdf:type -> dcat:Distribution
     [distribution level resource] - dc:title -> distribution title (literal)
     [distribution level resource] - dc:description -> distribution description (lit.)
     [distribution level resource] - dc:created -> ingest timestamp[ISO 8601 compliant]
     [distribution level resource] - pav:version -> ingest timestamp (same as above)
     [distribution level resource] - dc:creator -> monarchinitiative.org
     [distribution level resource] - dc:publisher -> monarchinitiative.org
     [distribution level resource] - dc:license -> [license info, if available
                    otherwise indicate unknown]
     [distribution level resource] - dc:rights -> [data rights IRI]
     [distribution level resource] - pav:createdWith -> [Dipper github URI]
     [distribution level resource] - dc:format -> [IRI of ttl|nt|whatever spec]
     [distribution level resource] - dcat:downloadURL -> [ttl|nt URI]
     [distribution level resource] - void:triples -> [triples count (literal)]
     [distribution level resource] - void:entities -> [entities count (literal)]
     [distribution level resource] - void:distinctSubjects -> [subject count (literal)]
     [distribution level resource] - void:distinctObjects -> [object count (literal)]
     [distribution level resource] - void:properties -> [properties count (literal)]
     ...

        n.b: about distribution level resource triples:
        - omitting Vocabularies used/void:vocabulary and Standards
        used/dc:conformsTo triples, because they are described in the ttl file
        - also omitting Example identifier/idot:exampleIdentifier and
        Example resource/void:exampleResource, because we don't really have one
        canonical example of either - they're all very different.
        - [distribution level resource] - dc:created should have the exact same
        time stamp as this triple above:
        [version level resource] - dc:created -> time stamp
        - this [distribution level resource] - pav:version triple should have the
        same object as [version level resource] - pav:version triple above
        - Data source provenance/dc:source triples are above in the
        [version level resource]
        - omitting Byte size/dc:byteSize, RDF File URL/void:dataDump, and
        Linkset/void:subset triples because they probably aren't necessary for MI right
        now
        - these triples "should" be emitted, but we will do this in a later iteration:
        # of classes	void:classPartition	IRI
        # of literals	void:classPartition	IRI
        # of RDF graphs	void:classPartition	IRI

     Note: Do not use blank nodes in the dataset graph. This dataset graph is added to
     the main Dipper graph in Source.write() like so

        $ mainGraph = mainGraph + datasetGraph

     which apparently in theory could lead to blank node ID collisions between the two
     graphs.

     Note also that this implementation currently does not support producing metadata
     for StreamedGraph graphs (see dipper/graph/StreamedGraph.py). StreamedGraph is
     currently not being used for any ingests, so this isn't a problem. There was
     talk of using StreamedGraph for a rewrite/refactor of the Clinvar ingest, which
     would probably require adding support here for StreamedGraph's.
    """
    def __init__(
            self,
            identifier,
            data_release_version,
            ingest_name,
            ingest_title,
            ingest_url,
            ingest_logo=None,
            ingest_description=None,
            license_url=None,
            data_rights=None,
            graph_type='rdf_graph',  # rdf_graph, streamed_graph
            file_handle=None,
            distribution_type='ttl',
            dataset_curie_prefix='MonarchArchive'):
        """
        Build the dataset graph and write the summary, version and
        distribution level triples.

        :param identifier: source identifier; joined to dataset_curie_prefix
            with ':' to form self.identifier and the value handed to the
            graph constructor
        :param data_release_version: release version used in the version and
            distribution CURIEs and in the download URL; when None, today's
            date formatted as YYYYMMDD is used
        :param ingest_name: file stem used in the download URL
        :param ingest_title: title literal; when None the dataset CURIE
            (prefix:identifier) is used
        :param ingest_url: source page of the ingest (triple skipped if None)
        :param ingest_logo: logo file name, appended to the
            'MonarchLogoRepo' entry of the curie map; when None no logo
            triple is written
        :param ingest_description: optional description literal
        :param license_url: license IRI kept for the distribution triples
        :param data_rights: data rights kept for the distribution triples
        :param graph_type: None, 'rdf_graph' or 'streamed_graph'
        :param file_handle: forwarded to StreamedGraph only
        :param distribution_type: file extension of the distribution
        :param dataset_curie_prefix: prefix of all dataset CURIEs
        :raises ValueError: if graph_type is not one of the accepted values
        """
        # prefix:identifier is needed for the graph, self.identifier and the
        # default title, so build it once.
        dataset_curie = ':'.join([dataset_curie_prefix, identifier])

        if graph_type is None:
            self.graph = RDFGraph(None, dataset_curie)
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(True,
                                       dataset_curie,
                                       file_handle=file_handle)
        elif graph_type == 'rdf_graph':
            self.graph = RDFGraph(True, dataset_curie)
        else:
            # Previously self.graph was silently left unset here and the
            # constructor died a few lines later with an AttributeError.
            raise ValueError(
                "unknown graph_type %r; expected None, 'rdf_graph' "
                "or 'streamed_graph'" % (graph_type,))

        if data_release_version is not None:
            self.data_release_version = data_release_version
        else:
            self.data_release_version = datetime.today().strftime("%Y%m%d")

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        self.identifier = dataset_curie
        self.citation = set()

        self.ingest_name = ingest_name
        self.ingest_title = ingest_title
        if self.ingest_title is None:
            self.ingest_title = dataset_curie

        self.ingest_url = ingest_url
        # ingest_logo defaults to None; concatenating None onto the repo
        # prefix used to raise TypeError for every caller relying on the
        # default. Without a logo, no logo IRI is built (and no triple is
        # written in _set_summary_level_triples).
        if ingest_logo is None:
            self.ingest_logo = None
        else:
            self.ingest_logo = \
                self.curie_map.get('MonarchLogoRepo') + ingest_logo
        self.ingest_description = ingest_description

        self.date_issued = None

        self.license_url = license_url
        self.data_rights = data_rights
        self.distribution_type = distribution_type

        # set HCLS resource CURIEs
        self.summary_level_curie = ':'.join(
            [dataset_curie_prefix, '#' + identifier])
        self.version_level_curie = \
            dataset_curie_prefix + ':' + \
            self.data_release_version + \
            '/#' + identifier
        self.distribution_level_turtle_curie = \
            dataset_curie_prefix + ':' + \
            self.data_release_version + \
            '/rdf/' + \
            identifier + "." + self.distribution_type

        # The following might seem a little odd, but we need to set downloadURLs this
        # way in order for them to point to where they will end up in archive.MI.org as
        # of Sept 2019. URL is:
        #  https://archive.MI.org/[release version]/[dist type]/[source].[dist type]
        self.download_url = \
            self.curie_map.get("MonarchArchive") + self.data_release_version + \
            "/rdf/" + self.ingest_name + "." + self.distribution_type

        self._set_summary_level_triples()
        self._set_version_level_triples()
        self._set_distribution_level_triples()

    def _set_summary_level_triples(self):
        """
        Write the triples describing the summary level resource: its type,
        title, publisher, logo (when one was supplied), identifier and,
        when available, its source page and description.
        """
        self.model.addType(self.summary_level_curie, self.globaltt['Dataset'])
        self.graph.addTriple(self.summary_level_curie, self.globaltt['title'],
                             self.ingest_title, True)
        self.model.addTriple(self.summary_level_curie,
                             self.globaltt['Publisher'],
                             self.curie_map.get(""))
        # The logo is optional; skip the triple rather than emit a None object.
        if self.ingest_logo is not None:
            self.model.addTriple(self.summary_level_curie, "schema:logo",
                                 self.ingest_logo)
        self.graph.addTriple(self.summary_level_curie,
                             self.globaltt['identifier'],
                             self.summary_level_curie)
        if self.ingest_url is not None:
            self.graph.addTriple(self.summary_level_curie,
                                 self.globaltt["Source"], self.ingest_url)
        if self.ingest_description is not None:
            self.model.addDescription(self.summary_level_curie,
                                      self.ingest_description)

    def _set_version_level_triples(self):
        """
        Write the triples describing the version level resource: type,
        title, optional description, creation date, version, creator,
        publisher, and the links to the summary and distribution resources.
        """
        subject = self.version_level_curie
        terms = self.globaltt
        add_triple = self.graph.addTriple

        self.model.addType(subject, terms['Dataset'])

        version_title = \
            self.ingest_title + " Monarch version " + self.data_release_version
        add_triple(subject, terms['title'], version_title, True)

        if self.ingest_description is not None:
            self.model.addDescription(subject, self.ingest_description)

        # Creation date is "today" at the time this method runs.
        add_triple(
            subject,
            terms['Date Created'],
            Literal(datetime.today().strftime("%Y%m%d"), datatype=XSD.date))
        add_triple(
            subject,
            terms['version'],
            Literal(self.data_release_version, datatype=XSD.date))

        # eval's to MI.org
        add_triple(subject, terms['creator'], self.curie_map.get(""))
        add_triple(subject, terms['Publisher'], self.curie_map.get(""))

        # Link the version to its summary and to its distribution.
        add_triple(subject,
                   terms['isVersionOf'],
                   self.summary_level_curie,
                   object_is_literal=False)
        add_triple(subject,
                   terms['distribution'],
                   self.distribution_level_turtle_curie,
                   object_is_literal=False)

    def _set_distribution_level_triples(self):
        """
        Write the triples describing the distribution level resource: both
        types, title, optional description, version, creation date, creator,
        publisher, tool, format, download URL, license and optional rights.
        Finishes by calling ``self._declare_as_ontology()``.
        """
        subject = self.distribution_level_turtle_curie
        terms = self.globaltt
        add_triple = self.graph.addTriple

        self.model.addType(subject, terms['Dataset'])
        self.model.addType(subject, terms['Distribution'])

        distribution_title = \
            self.ingest_title + " distribution " + self.distribution_type
        add_triple(subject, terms['title'], distribution_title, True)

        if self.ingest_description is not None:
            self.model.addDescription(subject, self.ingest_description)

        add_triple(
            subject,
            terms['version'],
            Literal(self.data_release_version, datatype=XSD.date))
        # Creation date is "today" at the time this method runs.
        add_triple(
            subject,
            terms['Date Created'],
            Literal(datetime.today().strftime("%Y%m%d"), datatype=XSD.date))

        # eval's to MI.org
        add_triple(subject, terms['creator'], self.curie_map.get(""))
        add_triple(subject, terms['Publisher'], self.curie_map.get(""))

        add_triple(subject,
                   terms['created_with'],
                   "https://github.com/monarch-initiative/dipper")
        # NOTE(review): the format IRI is the Turtle spec regardless of
        # self.distribution_type - confirm this is intended.
        add_triple(subject,
                   terms['format'],
                   "https://www.w3.org/TR/turtle/")
        add_triple(subject, terms['downloadURL'], self.download_url)

        # A license triple is always written; an explicit "unknown license"
        # IRI stands in when none was supplied.
        if self.license_url is not None:
            add_triple(subject, terms['license'], self.license_url)
        else:
            add_triple(
                subject, terms['license'],
                'https://project-open-data.cio.gov/unknown-license/')

        if self.data_rights is not None:
            add_triple(subject, terms['rights'], self.data_rights)

        self._declare_as_ontology()

    def set_ingest_source_file_version_num(self, file_iri, version):
        """
        Record the version of a remote file or resource used in the ingest.

        The triple written is:

        file_iri - 'pav:version' -> version

        where version is added as an untyped literal.

        Note: for a version that is a date or timestamp, use
        set_ingest_source_file_version_date() instead.

        :param file_iri: a remote file or resource used in ingest
        :param version: a number or string (e.g. v1.2.3) that the source
        (OMIM, CTD) uses to refer to this version of the file/resource used
        during the ingest
        :return: None
        """
        version_predicate = self.globaltt['version']
        self.graph.addTriple(
            file_iri, version_predicate, version, object_is_literal=True)

    def set_ingest_source_file_version_date(self,
                                            file_iri,
                                            date,
                                            datatype=XSD.date):
        """
        Record a date (or timestamp) as the version that the source
        (OMIM, CTD, whatever) uses for the remote file/resource that was
        used in the ingest.

        The triple written to the dataset graph is:

        file_iri - globaltt['version'] -> date or timestamp

        The object is handed to addTriple() as a literal whose literal type
        is [datatype] (XSD date unless the caller overrides it).

        Note: if file_iri was retrieved using get_files(), then the following
        triple was created and you might not need this method:

        file_iri - 'pav:retrievedOn' -> download date

        :param file_iri: a remote file or resource used in ingest
        :param date: a date in YYYYMMDD format that the source (OMIM, CTD)
        uses to refer to this version of the file/resource used during the
        ingest. A timestamp can be used instead by passing a different
        datatype (below)
        :param datatype: an XSD literal datatype, default is XSD.date
        :return: None
        """
        version_predicate = self.globaltt['version']
        self.graph.addTriple(
            file_iri, version_predicate, date,
            object_is_literal=True, literal_type=datatype)

    def set_ingest_source_file_version_retrieved_on(self,
                                                    file_iri,
                                                    date,
                                                    datatype=XSD.date):
        """
        Record the date on which a remote file/resource (from OMIM, CTD, etc)
        was retrieved.

        The triple written to the dataset graph is:

        file_iri - globaltt['retrieved_on'] -> date or timestamp

        The object is handed to addTriple() as a literal whose literal type
        is [datatype] (XSD date unless the caller overrides it).

        Note: if file_iri was retrieved using get_files(), then the following
        triple was created and you might not need this method:

        file_iri - 'pav:retrievedOn' -> download date

        :param file_iri: a remote file or resource used in ingest
        :param date: the retrieval date in YYYYMMDD format. A timestamp can
        be used instead by passing a different datatype (below)
        :param datatype: an XSD literal datatype, default is XSD.date
        :return: None
        """
        retrieved_predicate = self.globaltt['retrieved_on']
        self.graph.addTriple(
            file_iri, retrieved_predicate, date,
            object_is_literal=True, literal_type=datatype)

    def set_ingest_source(self, url, predicate=None, is_object_literal=False):
        """
        Write a triple to the dataset graph stating that the ingest used the
        file or resource at [url].

        Triple emitted is version_level_curie [predicate] [url], with the
        subject categorised as blv.terms['DataSetVersion'].

        This triple is likely to be redundant if Source.get_files() is used to
        retrieve the remote files/resources, since this triple should also be
        emitted as files/resources are being retrieved. This method is provided
        as a convenience for sources that do their own downloading of files.

        :param url: a remote resource used as a source during ingest
        :param predicate: the predicate to use for the triple; when None,
                globaltt["Source"] is used.
                From spec (https://www.w3.org/TR/2015/NOTE-hcls-dataset-20150514/)
                "Use dc:source when the source dataset was used in whole or in part.
                Use pav:retrievedFrom when the source dataset was used in whole and was
                not modified from its original distribution. Use prov:wasDerivedFrom
                when the source dataset was in whole or in part and was modified from
                its original distribution."
        :param is_object_literal: forwarded to addTriple() as
                object_is_literal [False]
        :return: None
        """
        source_predicate = (
            self.globaltt["Source"] if predicate is None else predicate)
        self.graph.addTriple(self.version_level_curie,
                             source_predicate,
                             url,
                             object_is_literal=is_object_literal,
                             subject_category=blv.terms['DataSetVersion'])

    def get_graph(self):
        """
        Hand back the graph that holds this dataset's triples.

        :return: the object stored in self.graph
        """
        dataset_graph = self.graph
        return dataset_graph

    def get_license(self):
        """
        Hand back the license recorded for this dataset.

        :return: the value stored in self.license_url
        """
        license_info = self.license_url
        return license_info

    def set_citation(self, citation_id):
        """
        Register a citation for this dataset.

        [citation_id] is added to the self.citation set, and the triple
        version level curie - globaltt['citesAsAuthority'] -> [citation_id]
        is written to the dataset graph.

        :param citation_id: identifier of the cited resource
        :return: None
        """
        self.citation.add(citation_id)
        cites_predicate = self.globaltt['citesAsAuthority']
        self.graph.addTriple(
            self.version_level_curie, cites_predicate, citation_id)

    def _declare_as_ontology(self, version_info=None):
        """
        Add OWL ontology declaration triples for this dataset.

        Using a Model built on self.graph, this:
          - declares the summary level curie as an ontology,
          - adds an OWL version IRI: summary level curie -> version level curie,
          - and, only when [version_info] is given, adds OWL version info to
            the distribution level turtle curie.

        NOTE(review): the declaration is made on the summary level curie while
        the version info goes on the distribution level curie -- confirm this
        split is intended.

        TEC: I am not convinced dipper reformatting external data as RDF triples
        makes an OWL ontology (nor that it should be considered a goal).

        Proper ontologies are built by ontologists. Dipper reformats data
        and annotates/decorates it with a minimal set of carefully arranged
        terms drawn from multiple proper ontologies.
        Which allows the whole (dipper's RDF triples and parent ontologies)
        to function as a single ontology we can reason over when combined
        in a store such as SciGraph.

        Including more than the minimal ontological terms in dipper's RDF
        output constitutes a liability as it allows greater divergence
        between dipper artifacts and the proper ontologies.

        :param version_info: a string describing version info for the ontology
        :return: None

        """
        ontology_model = Model(self.graph)
        ontology_model.addOntologyDeclaration(self.summary_level_curie)
        ontology_model.addOWLVersionIRI(
            self.summary_level_curie, self.version_level_curie)
        if version_info is None:
            return
        ontology_model.addOWLVersionInfo(
            self.distribution_level_turtle_curie, version_info)

    @staticmethod
    def make_id(long_string, prefix='MONARCH'):
        """
        Build a DETERMINISTIC identifier from the digest of a string.

        The result is [prefix] and Dataset.hash_id([long_string]) joined
        by a colon.
        Duplicated from Source.py to avoid circular imports.

        :param long_string: string to use to generate identifier
        :param prefix: prefix to prepend to identifier [MONARCH]
        :return: the prefixed identifier
        """
        id_parts = [prefix, Dataset.hash_id(long_string)]
        return ':'.join(id_parts)

    @staticmethod
    def hash_id(word):  # same as graph/GraphUtils.digest_id(wordage)
        """
        Turn a string into a short, deterministic hash.

        The string is UTF-8 encoded and digested with sha1; the result is
        the letter 'b' followed by characters 1 through 19 of the hex digest.
        Duplicated from Source.py.

        :param word: str string to be hashed
        :return: hash of id
        """
        hex_digest = hashlib.sha1(word.encode('utf-8')).hexdigest()
        return 'b' + hex_digest[1:20]
コード例 #5
0
ファイル: Dataset.py プロジェクト: DoctorBud/dipper
class Dataset:
    """
    Produce the metadata about a dataset,
    following the example laid out here:
    http://htmlpreview.github.io/?
    https://github.com/joejimbo/HCLSDatasetDescriptions/blob/master/Overview.html#appendix_1
    (mind the wrap)

    Triples are written to self.graph when the dataset is constructed and
    as the set_* methods are called.
    """

    def __init__(self, identifier, title, url, description=None,
                 license_url=None, data_rights=None, graph_type=None,
                 file_handle=None):
        """
        Create the graph for this dataset and add its core metadata triples.

        :param identifier: local identifier of the dataset; stored on the
            instance prefixed with ':' and also written as a literal
        :param title: written as the dct:title literal
        :param url: written as the object of foaf:page
        :param description: optional, added via Model.addDescription()
        :param license_url: optional, written as the object of dct:license
        :param data_rights: optional, written as the dct:rights literal
        :param graph_type: None, 'streamed_graph' or 'rdf_graph'
        :param file_handle: handed to StreamedGraph for 'streamed_graph'
        :raises ValueError: if graph_type is not one of the supported values
        """
        if graph_type is None:
            self.graph = RDFGraph(None, identifier)
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(True, file_handle=file_handle)
        elif graph_type == 'rdf_graph':
            self.graph = RDFGraph()
        else:
            # Without this, self.graph would never be assigned and the
            # failure would surface later as an opaque AttributeError.
            raise ValueError(
                "unknown graph_type {!r}; expected None, "
                "'streamed_graph' or 'rdf_graph'".format(graph_type))
        self.model = Model(self.graph)
        self.identifier = ':' + identifier
        self.version = None
        self.date_issued = None

        # The data_accesed value is later used as an literal of properties
        # such as dct:issued, which needs to conform xsd:dateTime format.
        # TODO ... we need to have a talk about typed literals and SPARQL
        self.date_accessed = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

        self.citation = set()
        self.license = license_url
        self.model.addType(self.identifier, 'dctypes:Dataset')
        self.graph.addTriple(self.identifier, 'dct:title', title, True)
        self.graph.addTriple(
            self.identifier, 'dct:identifier',
            identifier, object_is_literal=True)
        self.graph.addTriple(self.identifier, 'foaf:page', url)
        # maybe in the future add the logo here:
        # schemaorg:logo <http://www.ebi.ac.uk/rdf/sites/ebi.ac.uk.rdf/files/resize/images/rdf/chembl_service_logo-146x48.gif> .

        # TODO add the licence info
        # FIXME:Temporarily making this in IF statement,
        #  can revert after all current resources are updated.
        if license_url is not None:
            self.graph.addTriple(
                self.identifier, 'dct:license', license_url)
        else:
            logger.debug('No license provided.')
        if data_rights is not None:
            self.graph.addTriple(
                self.identifier, 'dct:rights',
                data_rights, object_is_literal=True)
        else:
            logger.debug('No rights provided.')

        if description is not None:
            self.model.addDescription(self.identifier, description)

    def setVersion(self, date_issued, version_id=None):
        """
        Legacy function...
            should use the other set_* for version and date

        as of 2016-10-20  used in:

        dipper/sources/HPOAnnotations.py 139:
        dipper/sources/CTD.py             99:
        dipper/sources/BioGrid.py        100:
        dipper/sources/MGI.py            255:
        dipper/sources/EOM.py             93:
        dipper/sources/Coriell.py        200:
        dipper/sources/MMRRC.py           77:

        # TODO set as deprecated

        When [date_issued] is given it is recorded with set_date_issued().
        The version is then set exactly once: from [version_id] when given,
        otherwise from the date. When neither argument is given an error is
        logged and nothing is written.

        :param date_issued: issue date of the dataset, or None
        :param version_id: version identifier of the dataset, or None
        :return: None

        """

        if date_issued is None and version_id is None:
            logger.error("date or version not set!")
            # TODO throw error
            return

        if date_issued is not None:
            self.set_date_issued(date_issued)

        # Set the version a single time; setting it in both the date/version
        # dispatch and here would write the version triples twice.
        if version_id is not None:
            self.set_version_by_num(version_id)
        else:
            self.set_version_by_date(date_issued)

        logger.info("set version to %s", self.version)

    def set_date_issued(self, date_issued):
        """
        Store the issue date and write it as the dct:issued literal.

        :param date_issued: the date the dataset was issued
        :return: None
        """
        self.date_issued = date_issued
        self.graph.addTriple(
            self.identifier, 'dct:issued', date_issued, object_is_literal=True)
        logger.info("setting date to %s", date_issued)

    def set_version_by_date(self, date_issued=None):
        """
        This will set the version by the date supplied,
        the date already stored in the dataset description,
        or by the download date (today)
        :param date_issued:
        :return: None
        """

        if date_issued is not None:
            version_date = date_issued
        elif self.date_issued is not None:
            version_date = self.date_issued
        else:
            version_date = self.date_accessed
            logger.info(
                "No date supplied for setting version; "
                "using download timestamp for date_issued")

        logger.info("setting version by date")
        self.set_version_by_num(version_date)

    def set_version_by_num(self, version_num):
        """
        Set self.version to the dataset identifier followed by [version_num]
        and write the dct:isVersionOf and pav:version triples for it.

        When [version_num] differs from the access timestamp, a further
        resource named after that timestamp is written as a version of
        self.version, with its own pav:version and dct:issued literals.

        :param version_num: version string appended to the identifier
        :return: None
        """
        self.version = self.identifier+version_num
        self.graph.addTriple(self.version, 'dct:isVersionOf', self.identifier)
        self.graph.addTriple(self.version, 'pav:version', version_num,
                             object_is_literal=True)

        logger.info("setting version to %s", self.version)

        # set the monarch-generated-version of the resource-version
        # TODO sync this up with the ontology version
        if version_num != self.date_accessed:
            dipperized_version = ':' + str(self.date_accessed)
            self.graph.addTriple(
                dipperized_version, 'dct:isVersionOf',
                self.version)
            self.graph.addTriple(
                dipperized_version, 'pav:version',
                self.date_accessed, object_is_literal=True)
            self.graph.addTriple(
                dipperized_version, 'dct:issued',
                self.date_accessed, object_is_literal=True,
                literal_type="xsd:dateTime")

    def setFileAccessUrl(self, url, is_object_literal=False):
        """
        Write the dcat:accessURL triple for this dataset.

        :param url: the access URL
        :param is_object_literal: forwarded to addTriple() as its fourth
            positional argument [False]
        :return: None
        """
        self.graph.addTriple(self.identifier, 'dcat:accessURL',
                             url, is_object_literal)

    def getGraph(self):
        """
        :return: the graph holding this dataset's triples
        """
        return self.graph

    def set_license(self, license):
        """
        Replace the stored license; no triple is written.

        :param license: the new license value
        :return: None
        """
        self.license = license

    def get_license(self):
        """
        :return: the stored license value
        """
        return self.license

    def set_citation(self, citation_id):
        """
        Add [citation_id] to the set of citations; no triple is written yet.

        :param citation_id: identifier of the cited resource
        :return: None
        """
        self.citation.add(citation_id)
        # TODO
        # model.addTriple(self.identifier, 'cito:citeAsAuthority', citation_id)
コード例 #6
0
ファイル: test_rdfgraph.py プロジェクト: tegar9000/dipper-1
class RDFGraphTestCase(unittest.TestCase):
    def setUp(self):
        """
        Build a fresh RDFGraph, a CurieUtil over the curie map, and the
        URIs shared by the category tests.
        """
        self.graph = RDFGraph()
        self.cutil = CurieUtil(curie_map.get())
        expand = self.cutil.get_uri

        # stuff to make test triples
        self.test_cat_subj = "http://www.google.com"
        self.test_cat_default_pred = expand("biolink:category")
        self.test_cat_nondefault_pred = expand("rdf:type")
        self.test_cat_default_category = expand("biolink:NamedThing")
        self.test_cat_nondefault_category = expand("biolink:Gene")
        self.test_cat_type = expand("rdf:type")
        self.test_cat_class = expand("rdf:class")

    def tearDown(self):
        """
        Drop the reference to the graph held in self.graph.
        """
        self.graph = None

    def test_add_triple_makes_triple(self):
        """
        addTriple() should leave at least one triple in the graph
        """
        triple_args = {
            'subject_id': self.test_cat_subj,
            'predicate_id': "rdf:type",
            'obj': "rdf:class",
        }
        self.graph.addTriple(**triple_args)
        triple_count = len(self.graph)
        self.assertTrue(
            triple_count > 0, "addTriples() didn't make >=1 triple")

    def test_add_triple_subject_category_assignment(self):
        """
        addTriple() given a subject_category should produce exactly one
        category triple for the subject, pointing at that category
        """
        self.graph.addTriple(
            subject_id=self.test_cat_subj,
            predicate_id="rdf:comment",
            obj="website",
            subject_category=self.test_cat_nondefault_category)

        # every triple <subject, default category predicate, anything>
        pattern = (URIRef(self.test_cat_subj),
                   URIRef(self.test_cat_default_pred),
                   None)
        category_triples = list(self.graph.triples(pattern))

        self.assertEqual(
            len(category_triples), 1,
            "addTriples() didn't make exactly one triple subject category")
        self.assertEqual(
            category_triples[0][2],
            URIRef(self.test_cat_nondefault_category),
            "addTriples() didn't assign the right triple subject category")

    def test_add_triple_object_category_assignment(self):
        """
        addTriple() given an object_category should produce exactly one
        category triple for the object, pointing at that category
        """
        self.graph.addTriple(
            subject_id=self.test_cat_subj,
            predicate_id=self.test_cat_type,
            obj=self.test_cat_class,
            object_category=self.test_cat_nondefault_category)

        # every triple <object, default category predicate, anything>
        pattern = (URIRef(self.test_cat_class),
                   URIRef(self.test_cat_default_pred),
                   None)
        category_triples = list(self.graph.triples(pattern))

        self.assertEqual(
            len(category_triples), 1,
            "addTriples() didn't make exactly one triple object category")
        self.assertEqual(
            category_triples[0][2],
            URIRef(self.test_cat_nondefault_category),
            "addTriples() didn't assign the right triple object category")

    def read_graph_from_turtle_file(self, f):
        """
        Parse the given turtle file into a new RDFGraph and assert that the
        graph is not empty afterwards.  A simple parsing test.

        :param f: the turtle file to parse
        :return: None

        """
        turtle_graph = RDFGraph()
        abs_path = os.path.abspath(f)
        logger.info("Testing reading turtle file from %s", abs_path)
        turtle_graph.parse(f, format="turtle")
        node_count = len(turtle_graph)
        logger.info('Found %s graph nodes in %s', node_count, abs_path)
        self.assertTrue(node_count > 0, "No nodes found in " + abs_path)

    def read_graph_into_owl(self, f):
        """
        Check that a ttl file can be parsed by owltools.

        Runs ``owltools <f>`` as a subprocess (this expects owltools to be
        accessible from the command line), logs an error when it exits with
        a non-zero status, and asserts that the exit status is zero.

        :param f: file of ttl
        :return: None
        """

        import subprocess

        # call() hands back the exit status instead of raising
        # CalledProcessError on failure, so a failing run actually reaches
        # the log message and the assertion below.
        status = subprocess.call(["owltools", f], stderr=subprocess.STDOUT)
        # returns zero is success!
        if status != 0:
            logger.error('finished verifying with owltools with status %s',
                         status)
        self.assertEqual(status, 0)

    def test_make_category_triple_default(self):
        """
        _make_category_triple() with only a subject should add one triple
        carrying the default predicate and the default category
        """
        self.graph._make_category_triple(self.test_cat_subj)

        all_triples = list(self.graph.triples((None, None, None)))
        self.assertEqual(len(all_triples), 1,
                         "method didn't make exactly one triple")

        only_triple = all_triples[0]
        expectations = (
            (0, self.test_cat_subj, "didn't assign correct subject"),
            (1, self.test_cat_default_pred, "didn't assign correct predicate"),
            (2, self.test_cat_default_category,
             "didn't assign correct category"),
        )
        for position, expected, message in expectations:
            self.assertEqual(only_triple[position], URIRef(expected), message)

    def test_make_category_triple_non_default_category(self):
        """
        _make_category_triple() given an explicit category should add one
        triple whose object is that category
        """
        self.graph._make_category_triple(
            self.test_cat_subj, self.test_cat_nondefault_category)

        all_triples = list(self.graph.triples((None, None, None)))
        self.assertEqual(len(all_triples), 1,
                         "method didn't make exactly one triple")

        wanted_category = URIRef(self.test_cat_nondefault_category)
        self.assertEqual(wanted_category,
                         all_triples[0][2],
                         "didn't assign correct (non-default) category")

    def test_make_category_triple_non_default_pred(self):
        """
        _make_category_triple() given an explicit predicate should add one
        triple that uses that predicate
        """
        self.graph._make_category_triple(
            self.test_cat_subj,
            self.test_cat_default_category,
            predicate=self.test_cat_nondefault_pred)

        all_triples = list(self.graph.triples((None, None, None)))
        self.assertEqual(len(all_triples), 1,
                         "method didn't make exactly one triple")

        wanted_predicate = URIRef(self.test_cat_nondefault_pred)
        self.assertEqual(wanted_predicate, all_triples[0][1],
                         "didn't assign correct (non-default) category")

    def test_make_category_triple_category_none_should_emit_named_thing(self):
        """
        _make_category_triple() called with category=None should add one
        triple whose object is the default category
        """
        self.graph._make_category_triple(self.test_cat_subj, category=None)

        all_triples = list(self.graph.triples((None, None, None)))
        self.assertEqual(len(all_triples), 1,
                         "method didn't make exactly one triple")

        wanted_category = URIRef(self.test_cat_default_category)
        self.assertEqual(wanted_category, all_triples[0][2],
                         "didn't assign correct default category")

    def test_is_literal(self):
        """
        _is_literal() should accept a plain value as a literal and reject
        a CURIE and http/https/ftp URLs
        """
        self.assertTrue(self.graph._is_literal("1"))

        non_literals = (
            "foo:bar",
            "http://www.zombo.com/",
            "https://www.zombo.com/",
            "ftp://ftp.1000genomes.ebi.ac.uk/",
        )
        for candidate in non_literals:
            self.assertTrue(not self.graph._is_literal(candidate))