Example #1
0
    def __init__(self,
                 identifier,
                 title,
                 url,
                 description=None,
                 license_url=None,
                 data_rights=None,
                 graph_type=None,
                 file_handle=None):
        """
        Describe a source dataset as a 'dctypes:Dataset' node in a new graph.

        :param identifier: bare identifier; stored with a ':' CURIE prefix
        :param title: dct:title literal for the dataset node
        :param url: foaf:page of the dataset
        :param description: optional free-text description
        :param license_url: optional dct:license IRI (our license)
        :param data_rights: optional dct:rights literal (their license page)
        :param graph_type: None or 'rdf_graph' for an in-memory graph,
            'streamed_graph' to stream triples to file_handle
        :param file_handle: destination handle for streamed graphs
        :raises ValueError: if graph_type is not a supported value
        """
        # None defaults to an in-memory RDF graph, same as 'rdf_graph'
        if graph_type is None or graph_type == 'rdf_graph':
            self.graph = RDFGraph()
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(True, file_handle=file_handle)
        else:
            # fail fast; previously self.graph was left unset and the next
            # line raised a less informative AttributeError
            raise ValueError(
                '{} graph type not supported; '
                'valid types: rdf_graph, streamed_graph'.format(graph_type))
        self.model = Model(self.graph)
        self.identifier = ':' + identifier
        self.version = None
        self.date_issued = None

        # The date_accessed value is later used as an object literal of
        # properties such as dct:issued, which needs to conform to the
        # xsd:dateTime format.
        self.date_accessed = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

        self.citation = set()
        self.license = license_url
        self.model.addType(self.identifier, 'dctypes:Dataset')
        self.graph.addTriple(self.identifier, 'dct:title', title, True)
        self.graph.addTriple(self.identifier,
                             'dct:identifier',
                             identifier,
                             object_is_literal=True)
        self.graph.addTriple(self.identifier, 'foaf:page', url)
        # maybe in the future add the logo here:
        # schemaorg:logo <http://www.ebi.ac.uk/rdf/sites/ebi.ac.uk.rdf/files/resize/images/rdf/chembl_service_logo-146x48.gif> .

        # TODO add the licence info
        # FIXME:Temporarily making this in IF statement,
        #  can revert after all current resources are updated.
        if license_url is not None:
            self.graph.addTriple(self.identifier, 'dct:license', license_url)
        else:
            logger.debug('No license provided.')
        if data_rights is not None:
            self.graph.addTriple(self.identifier,
                                 'dct:rights',
                                 data_rights,
                                 object_is_literal=True)
        else:
            logger.debug('No rights provided.')

        if description is not None:
            self.model.addDescription(self.identifier, description)
Example #2
0
    def __init__(
            self,
            graph_type='rdf_graph',     # or streamed_graph
            are_bnodes_skized=False,    # typically True
            data_release_version=None,
            name=None,                  # identifier; make an URI for nquads
            ingest_title=None,
            ingest_url=None,
            ingest_logo=None,     # this should be the name of file on 'MonarchLogoRepo'
            ingest_description=None,
            license_url=None,           # only if it is _our_ lic
            data_rights=None,           # their page that points to their current lic
            file_handle=None,
    ):
        """
        Set up per-source working state: common test identifiers, local
        translation table, raw/output directories, the main and test
        graphs, and the Dataset metadata node.

        :raises ValueError: if graph_type is not a supported value
        """

        # pull in the common test identifiers
        self.all_test_ids = self.open_and_parse_yaml('../../resources/test_ids.yaml')

        self.graph_type = graph_type
        self.are_bnodes_skized = are_bnodes_skized
        self.data_release_version = data_release_version
        self.ingest_title = ingest_title
        self.ingest_url = ingest_url
        self.ingest_logo = ingest_logo
        self.ingest_description = ingest_description
        self.license_url = license_url
        self.data_rights = data_rights
        self.localtt = self.load_local_translationtable(name)

        self.remote_file_timestamps = dict()

        # NOTE(review): if both `name` and whoami() are None, self.name is
        # never assigned and the LOG call below fails — confirm whoami()
        # always returns a value
        if name is not None:
            self.name = name.lower()
        elif self.whoami() is not None:
            self.name = self.whoami().lower()

        LOG.info("Processing Source \"%s\"", self.name)
        self.test_only = False
        self.path = ""
        # to be used to store a subset of data for testing downstream.
        self.triple_count = 0
        self.outdir = 'out'
        self.testdir = 'tests'

        self.rawdir = '/'.join(('raw', self.name))
        # BUG FIX: use self.name (always assigned/lower-cased, possibly from
        # whoami()) rather than the raw `name` argument, which may be None
        self.testname = self.name + "_test"
        self.testfile = '/'.join((self.outdir, self.testname + ".ttl"))
        self.datasetfile = None

        # if raw data dir doesn't exist, create it
        if not os.path.exists(self.rawdir):
            os.makedirs(self.rawdir)
            raw_pth = os.path.abspath(self.rawdir)
            LOG.info("creating raw directory for %s at %s", self.name, raw_pth)
        # else:  # raw data dir does  exist. maybe should consider what is in it?

        # if output dir doesn't exist, create it
        if not os.path.exists(self.outdir):
            os.makedirs(self.outdir)
            LOG.info("created output directory %s", os.path.abspath(self.outdir))
        out_pth = os.path.abspath(self.outdir)

        LOG.info("Creating Test graph %s", self.testname)
        # note: tools such as protege need skolemized blank nodes
        self.testgraph = RDFGraph(True, self.testname)

        if graph_type == 'rdf_graph':
            graph_id = ':MONARCH_' + str(self.name) + "_" + \
                datetime.now().isoformat(' ').split()[0]

            LOG.info("Creating graph  %s", graph_id)
            self.graph = RDFGraph(are_bnodes_skized, graph_id)

        elif graph_type == 'streamed_graph':
            # need to expand on export formats
            # TODO the destination file is never explicitly closed;
            # it currently relies on process exit
            dest_file = open(out_pth + '/' + self.name + '.nt', 'w')
            self.graph = StreamedGraph(are_bnodes_skized, dest_file)
            # leave test files as turtle (better human readability)
        else:
            LOG.error(
                "%s graph type not supported\n"
                "valid types: rdf_graph, streamed_graph", graph_type)
            # fail fast; execution previously continued and crashed below
            # with an AttributeError on the unset self.graph
            raise ValueError("graph type not supported: %s" % graph_type)

        # pull in global ontology mapping datastructures
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid

        self.curie_map = self.graph.curie_map
        # self.prefix_base = {v: k for k, v in self.curie_map.items()}

        # will be set to True if the intention is
        # to only process and write the test data
        self.test_only = False
        self.test_mode = False

        # prefer the subclass docstring as the ingest description when set
        if self.ingest_description and getdoc(self) is not None:
            self.ingest_description = getdoc(self)

        self.dataset = Dataset(
            identifier=self.name,
            data_release_version=self.data_release_version,
            ingest_name=self.name,
            ingest_title=self.ingest_title,
            ingest_url=self.ingest_url,
            ingest_logo=self.ingest_logo,
            ingest_description=self.ingest_description,   # description
            license_url=self.license_url,    # only _OUR_ lic
            data_rights=self.data_rights,    # tries to point to others lics
            graph_type=graph_type,
            file_handle=file_handle
        )

        # see jenkins file   human, mouse, zebrafish, fly, worm        rat
        self.COMMON_TAXON = ['9606', '10090', '7955', '7227', '6239']  # '10116'
Example #3
0
    def __init__(
            self,
            identifier,  # name? should be Archive url via Source
            title,
            url,
            ingest_desc=None,
            license_url=None,
            data_rights=None,
            graph_type='rdf_graph',  # rdf_graph, streamed_graph
            file_handle=None):
        """
        Describe a source dataset as a 'dctypes:Dataset' node.

        :param identifier: dataset identifier; also used to name the graph
        :param title: dcterms:title literal; identifier is used when None
        :param url: foaf:page of the dataset (triple skipped when None)
        :param ingest_desc: optional free-text description
        :param license_url: optional dcterms:license IRI (our license)
        :param data_rights: optional dcterms:rights literal (their page)
        :param graph_type: 'rdf_graph' (default), 'streamed_graph', or None
        :param file_handle: destination handle for streamed graphs
        """

        # NOTE(review): an unrecognized graph_type leaves self.graph unset,
        # so the Model(self.graph) call below raises AttributeError —
        # confirm callers only pass the documented values
        if graph_type is None:
            self.graph = RDFGraph(None, identifier)
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(True,
                                       identifier,
                                       file_handle=file_handle)
        elif graph_type == 'rdf_graph':
            self.graph = RDFGraph(True, identifier)

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        # TODO: move hard coded curies to translation table calls
        self.identifier = identifier
        # fall back to the identifier when no human-readable title is given
        if title is None:
            self.title = identifier
        else:
            self.title = title
        self.version = None
        self.date_issued = None

        # The data_accesed value is later used as an literal of properties
        # such as dcterms:issued, which needs to conform xsd:dateTime format.
        # TODO ... we need to have a talk about typed literals and SPARQL
        self.date_accessed = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

        self.citation = set()
        self.license_url = license_url
        self.model.addType(self.identifier, 'dctypes:Dataset')
        self.graph.addTriple(self.identifier, 'dcterms:title', title, True)
        self.graph.addTriple(self.identifier, 'dcterms:identifier', identifier,
                             True)
        if url is not None:
            self.graph.addTriple(self.identifier, 'foaf:page', url)
        # maybe in the future add the logo here:
        # schemaorg:logo  <uri>
        # TODO add the license info
        # FIXME:Temporarily making this in IF statement,
        #  can revert after all current resources are updated.
        if license_url is not None:
            self.graph.addTriple(self.identifier, 'dcterms:license',
                                 license_url)
        else:
            LOG.debug('No license provided.')
        if data_rights is not None:
            self.graph.addTriple(self.identifier,
                                 'dcterms:rights',
                                 data_rights,
                                 object_is_literal=True)
        else:
            LOG.debug('No rights provided.')

        if ingest_desc is not None:
            self.model.addDescription(self.identifier, ingest_desc)
        return
Example #4
0
    def __init__(self, graph_type, are_bnodes_skized=False, name=None):
        """
        Set up a source: directories, per-source file paths, and the main
        and test graphs.

        :param graph_type: 'rdf_graph' or 'streamed_graph'
        :param are_bnodes_skized: skolemize blank nodes in the main graph
        :param name: optional source name; drives the per-source file paths
        :raises ValueError: if graph_type is unsupported, or if
            streamed_graph output is requested without a name
        """

        self.graph_type = graph_type
        self.are_bnodes_skized = are_bnodes_skized

        if name is not None:
            logger.info("Processing Source \"%s\"", name)
        self.testOnly = False
        self.name = name
        self.path = ""
        # to be used to store a subset of data for testing downstream.
        self.triple_count = 0
        self.outdir = 'out'
        self.testdir = 'tests'
        self.rawdir = 'raw'
        self.dataset = None
        # set to True if you want to materialize identifiers for BNodes

        if self.name is not None:
            self.rawdir = '/'.join((self.rawdir, self.name))

            # BUG FIX: restore self.outfile; its assignment had been
            # commented out but the streamed_graph branch below still
            # references it, which raised AttributeError
            self.outfile = '/'.join((self.outdir, self.name + ".ttl"))
            logger.info("Setting outfile to %s", self.outfile)

            self.testfile = '/'.join((self.outdir, self.name + "_test.ttl"))
            logger.info("Setting testfile to %s", self.testfile)

            self.datasetfile = '/'.join(
                (self.outdir, self.name + '_dataset.ttl'))
            logger.info("Setting dataset file to %s", self.datasetfile)

        # if raw data dir doesn't exist, create it
        if not os.path.exists(self.rawdir):
            os.makedirs(self.rawdir)
            p = os.path.abspath(self.rawdir)
            logger.info("creating raw directory for %s at %s", self.name, p)

        # if output dir doesn't exist, create it
        if not os.path.exists(self.outdir):
            os.makedirs(self.outdir)
            p = os.path.abspath(self.outdir)
            logger.info("created output directory %s", p)

        if graph_type == 'rdf_graph':
            self.graph = RDFGraph(are_bnodes_skized)  # TODO named graph IRI?
            self.testgraph = RDFGraph(True)
        elif graph_type == 'streamed_graph':
            # the .nt destinations derive from the per-source paths,
            # which only exist when a name was given
            if name is None:
                raise ValueError(
                    "a source name is required for streamed_graph output")
            # TODO these files are never explicitly closed
            source_file = open(self.outfile.replace(".ttl", ".nt"), 'w')
            test_file = open(self.testfile.replace(".ttl", ".nt"), 'w')
            self.graph = StreamedGraph(are_bnodes_skized, source_file)
            self.testgraph = StreamedGraph(are_bnodes_skized, test_file)
        else:
            logger.error(
                "{} graph type not supported\n"
                "valid types: rdf_graph, streamed_graph".format(graph_type))
            # fail fast; execution previously continued and crashed in the
            # declareAsOntology loop on the unset graphs
            raise ValueError(
                "graph type not supported: {}".format(graph_type))

        # will be set to True if the intention is
        # to only process and write the test data
        self.testOnly = False
        self.testMode = False

        for g in [self.graph, self.testgraph]:
            self.declareAsOntology(g)
Example #5
0
    def __init__(
            self,
            graph_type='rdf_graph',  # or streamed_graph
            are_bnodes_skized=False,  # typically True
            name=None,  # identifier; make an IRI for nquads
            ingest_title=None,
            ingest_url=None,
            license_url=None,  # only if it is _our_ lic
            data_rights=None,  # external page that points to their current lic
            file_handle=None):
        """
        Set up per-source working state: common test identifiers, local
        translation table, raw/output directories, the main and test
        graphs, and the Dataset metadata node.

        :raises ValueError: if graph_type is not a supported value
        """

        # pull in the common test identifiers
        self.all_test_ids = self.open_and_parse_yaml(
            '../../resources/test_ids.yaml')

        self.graph_type = graph_type
        self.are_bnodes_skized = are_bnodes_skized
        self.ingest_url = ingest_url
        self.ingest_title = ingest_title
        self.localtt = self.load_local_translationtable(name)

        # NOTE(review): if both `name` and whoami() are None, self.name is
        # never assigned and the LOG call below fails
        if name is not None:
            self.name = name.lower()
        elif self.whoami() is not None:
            self.name = self.whoami().lower()

        LOG.info("Processing Source \"%s\"", self.name)
        self.test_only = False
        self.path = ""
        # to be used to store a subset of data for testing downstream.
        self.triple_count = 0
        self.outdir = 'out'
        self.testdir = 'tests'
        self.rawdir = '/'.join(('raw', self.name))
        # BUG FIX: use self.name (always assigned/lower-cased, possibly from
        # whoami()) rather than the raw `name` argument, which may be None
        self.testname = self.name + "_test"
        self.testfile = '/'.join((self.outdir, self.testname + ".ttl"))
        self.datasetfile = None

        # still need to pull in file suffix  -- this is a curie not a url
        self.archive_url = 'MonarchArchive:' + 'ttl/' + self.name + '.ttl'

        # if raw data dir doesn't exist, create it
        if not os.path.exists(self.rawdir):
            os.makedirs(self.rawdir)
            LOG.info(
                "creating raw directory for %s at %s",
                self.name, os.path.abspath(self.rawdir))

        # if output dir doesn't exist, create it
        if not os.path.exists(self.outdir):
            os.makedirs(self.outdir)
            LOG.info(
                "created output directory %s", os.path.abspath(self.outdir))
        # BUG FIX: compute the output path unconditionally; it was only
        # assigned inside the mkdir branch, so a pre-existing directory left
        # it unbound and the streamed_graph open() below raised NameError
        out_pth = os.path.abspath(self.outdir)

        LOG.info("Creating Test graph %s", self.testname)
        # note: tools such as protege need skolemized blank nodes
        self.testgraph = RDFGraph(True, self.testname)

        if graph_type == 'rdf_graph':
            graph_id = ':MONARCH_' + str(self.name) + "_" + \
                datetime.now().isoformat(' ').split()[0]

            LOG.info("Creating graph  %s", graph_id)
            self.graph = RDFGraph(are_bnodes_skized, graph_id)

        elif graph_type == 'streamed_graph':
            # need to expand on export formats
            # TODO the destination file is never explicitly closed
            dest_file = open(out_pth + '/' + self.name + '.nt', 'w')
            self.graph = StreamedGraph(are_bnodes_skized, dest_file)
            # leave test files as turtle (better human readability)
        else:
            LOG.error(
                "%s graph type not supported\n"
                "valid types: rdf_graph, streamed_graph", graph_type)
            # fail fast; execution previously continued and crashed below
            # with an AttributeError on the unset self.graph
            raise ValueError("graph type not supported: %s" % graph_type)

        # pull in global ontology mapping datastructures
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid

        self.curie_map = self.graph.curie_map
        # self.prefix_base = {v: k for k, v in self.curie_map.items()}

        # will be set to True if the intention is
        # to only process and write the test data
        self.test_only = False
        self.test_mode = False

        # this may eventually support Bagits
        self.dataset = Dataset(
            self.archive_url,
            self.ingest_title,
            self.ingest_url,
            None,  # description
            license_url,  # only _OUR_ lic
            data_rights,  # tries to point to others lics
            graph_type,
            file_handle)

        for graph in [self.graph, self.testgraph]:
            self.declareAsOntology(graph)
Example #6
0
    def __init__(
            self,
            identifier,
            data_release_version,
            ingest_name,
            ingest_title,
            ingest_url,
            ingest_logo=None,
            ingest_description=None,
            license_url=None,
            data_rights=None,
            graph_type='rdf_graph',  # rdf_graph, streamed_graph
            file_handle=None,
            distribution_type='ttl',
            dataset_curie_prefix='MonarchArchive'):
        """
        Build the HCLS-style dataset description (summary, version, and
        distribution level resources) for one ingest.

        :param identifier: ingest identifier; combined with the curie prefix
        :param data_release_version: release stamp; today (YYYYMMDD) if None
        :param ingest_name: source name used in the download URL
        :param ingest_title: human title; falls back to the dataset CURIE
        :param ingest_url: landing page of the ingested source
        :param ingest_logo: optional file name on 'MonarchLogoRepo'
        :param ingest_description: optional free-text description
        :param license_url: optional license IRI (ours)
        :param data_rights: optional rights page (theirs)
        :param graph_type: 'rdf_graph' (default), 'streamed_graph', or None
        :param file_handle: destination handle for streamed graphs
        :param distribution_type: distribution file suffix, e.g. 'ttl'
        :param dataset_curie_prefix: CURIE prefix for dataset identifiers
        :raises ValueError: if graph_type is not a supported value
        """
        # hoist the repeatedly-built dataset CURIE
        dataset_curie = ':'.join([dataset_curie_prefix, identifier])

        if graph_type is None:
            self.graph = RDFGraph(None, dataset_curie)
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(True, dataset_curie,
                                       file_handle=file_handle)
        elif graph_type == 'rdf_graph':
            self.graph = RDFGraph(True, dataset_curie)
        else:
            # fail fast; previously self.graph was left unset and
            # Model(self.graph) raised a less informative AttributeError
            raise ValueError(
                "{} graph type not supported; "
                "valid types: rdf_graph, streamed_graph".format(graph_type))

        if data_release_version is not None:
            self.data_release_version = data_release_version
        else:
            # default release stamp is today's date
            self.data_release_version = datetime.today().strftime("%Y%m%d")

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        self.identifier = dataset_curie
        self.citation = set()

        self.ingest_name = ingest_name
        self.ingest_title = ingest_title
        if self.ingest_title is None:
            self.ingest_title = dataset_curie

        self.ingest_url = ingest_url
        # BUG FIX: ingest_logo defaults to None; concatenating None onto the
        # logo-repo base raised TypeError whenever the default was used
        if ingest_logo is not None:
            self.ingest_logo = \
                self.curie_map.get('MonarchLogoRepo') + ingest_logo
        else:
            self.ingest_logo = None
        self.ingest_description = ingest_description

        self.date_issued = None

        self.license_url = license_url
        self.data_rights = data_rights
        self.distribution_type = distribution_type

        # set HCLS resource CURIEs
        self.summary_level_curie = ':'.join(
            [dataset_curie_prefix, '#' + identifier])
        self.version_level_curie = \
            dataset_curie_prefix + ':' + \
            self.data_release_version + \
            '/#' + identifier
        self.distribution_level_turtle_curie = \
            dataset_curie_prefix + ':' + \
            self.data_release_version + \
            '/rdf/' + \
            identifier + "." + self.distribution_type

        # The following might seem a little odd, but we need to set downloadURLs this
        # way in order for them to point to where they will end up in archive.MI.org as
        # of Sept 2019. URL is:
        #  https://archive.MI.org/[release version]/[dist type]/[source].[dist type]
        self.download_url = \
            self.curie_map.get("MonarchArchive") + self.data_release_version + \
            "/rdf/" + self.ingest_name + "." + self.distribution_type

        self._set_summary_level_triples()
        self._set_version_level_triples()
        self._set_distribution_level_triples()
Example #7
0
    def __init__(
        self,
        graph_type='rdf_graph',     # or streamed_graph
        are_bnodes_skized=False,    # typically True
        name=None,                  # identifier; make an IRI for nquads
        ingest_title=None,
        ingest_url=None,
        license_url=None,
        data_rights=None,
        file_handle=None
    ):
        """
        Set up per-source working state: local translation table, raw and
        output directories, the main and test graphs, and the Dataset
        metadata node.

        :raises ValueError: if graph_type is not a supported value
        """

        self.graph_type = graph_type
        self.are_bnodes_skized = are_bnodes_skized
        self.ingest_url = ingest_url
        self.ingest_title = ingest_title
        self.localtt = self.load_local_translationtable(name)

        if name is not None:
            self.name = name
        else:
            self.name = self.whoami().lower()

        LOG.info("Processing Source \"%s\"", self.name)
        self.testOnly = False
        self.path = ""
        # to be used to store a subset of data for testing downstream.
        self.triple_count = 0
        self.outdir = 'out'
        self.testdir = 'tests'
        self.rawdir = '/'.join(('raw', self.name))
        # BUG FIX: use self.name (always assigned, possibly from whoami())
        # rather than the raw `name` argument, which may be None
        self.testname = self.name + "_test"
        self.testfile = '/'.join((self.outdir, self.testname + ".ttl"))

        # still need to pull in file suffix
        self.archive_url = 'MonarchArchive:' + 'ttl/' + self.name + '.ttl'

        # if raw data dir doesn't exist, create it
        if not os.path.exists(self.rawdir):
            os.makedirs(self.rawdir)
            pth = os.path.abspath(self.rawdir)
            LOG.info("creating raw directory for %s at %s", self.name, pth)

        # if output dir doesn't exist, create it
        if not os.path.exists(self.outdir):
            os.makedirs(self.outdir)
            LOG.info(
                "created output directory %s", os.path.abspath(self.outdir))

        LOG.info("Creating Test graph %s", self.testname)
        # note: tools such as protege need skolemized blank nodes
        self.testgraph = RDFGraph(True, self.testname)

        if graph_type == 'rdf_graph':
            graph_id = ':MONARCH_' + str(self.name) + "_" + \
                datetime.now().isoformat(' ').split()[0]

            LOG.info("Creating graph  %s", graph_id)
            self.graph = RDFGraph(are_bnodes_skized, graph_id)

        elif graph_type == 'streamed_graph':
            # BUG FIX: self.outfile is never assigned in this __init__, so
            # the original open(self.outfile...) raised AttributeError;
            # build the .nt path from outdir and the source name directly.
            # TODO the file is never explicitly closed
            source_file = open(
                '/'.join((self.outdir, self.name + ".nt")), 'w')
            self.graph = StreamedGraph(are_bnodes_skized, source_file)
            # leave test files as turtle (better human readability)
        else:
            LOG.error(
                "{} graph type not supported\n"
                "valid types: rdf_graph, streamed_graph".format(graph_type))
            # fail fast; execution previously continued and crashed below
            # with an AttributeError on the unset self.graph
            raise ValueError(
                "graph type not supported: {}".format(graph_type))

        # pull in global ontology mapping datastructures
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        #
        self.curie_map = self.graph.curie_map

        # will be set to True if the intention is
        # to only process and write the test data
        self.testOnly = False
        self.testMode = False

        # this may eventually support Bagits
        self.dataset = Dataset(
            self.archive_url,
            self.ingest_title,
            self.ingest_url,
            None,    # description
            license_url,
            data_rights,
            graph_type,
            file_handle
        )

        for g in [self.graph, self.testgraph]:
            self.declareAsOntology(g)