Example #1
class Checker(object):
    def __init__(self, input_dir, output_dir=None, tmp_dir=None):
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.tmp_dir = tmp_dir
        self.storer = Storer()
        self.name = self.__class__.__name__
        self.repok = Reporter(prefix="[%s - INFO] " % self.name)
        self.repok.new_article()
        self.reper = Reporter(prefix="[%s - ERROR] " % self.name)
        self.reper.new_article()

    def process(self):
        for cur_dir, cur_subdir, cur_files in os.walk(self.input_dir):
            for cur_file in cur_files:
                self.repok.new_article()
                self.reper.new_article()
                cur_rdf_path = cur_dir + os.sep + cur_file
                try:
                    self.repok.add_sentence("Processing '%s'" % cur_rdf_path)
                    g = self.storer.load(cur_rdf_path, tmp_dir=self.tmp_dir)
                    if self.output_dir is None:
                        self.repok.add_sentence("The RDF graph has been converted in TRIG as follows:\n%s"
                                                % g.serialize(format="trig"))
                    else:
                        if not os.path.exists(self.output_dir):
                            os.makedirs(self.output_dir)
                        output_file = self.output_dir + os.sep + "converted_" + cur_file + ".trig"
                        g.serialize(output_file, format="trig")
                        self.repok.add_sentence("The RDF graph has been stored in %s"
                                                % output_file)
                except Exception:
                    self.reper.add_sentence("The file '%s' doesn't contain RDF statements"
                                            % cur_rdf_path, False)
Example #2
class Checker(object):
    def __init__(self, input_dir, output_dir=None, tmp_dir=None):
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.tmp_dir = tmp_dir
        self.storer = Storer()
        self.name = self.__class__.__name__
        self.repok = Reporter(prefix="[%s - INFO] " % self.name)
        self.repok.new_article()
        self.reper = Reporter(prefix="[%s - ERROR] " % self.name)
        self.reper.new_article()

    def process(self):
        for cur_dir, cur_subdir, cur_files in os.walk(self.input_dir):
            for cur_file in cur_files:
                self.repok.new_article()
                self.reper.new_article()
                cur_rdf_path = cur_dir + os.sep + cur_file
                try:
                    self.repok.add_sentence("Processing '%s'" % cur_rdf_path)
                    g = self.storer.load(cur_rdf_path, tmp_dir=self.tmp_dir)
                    if self.output_dir is None:
                        self.repok.add_sentence(
                            "The RDF graph has been converted in TRIG as follows:\n%s"
                            % g.serialize(format="trig"))
                    else:
                        if not os.path.exists(self.output_dir):
                            os.makedirs(self.output_dir)
                        output_file = self.output_dir + os.sep + "converted_" + cur_file + ".trig"
                        g.serialize(output_file, format="trig")
                        self.repok.add_sentence(
                            "The RDF graph has been stored in %s" % output_file)
                except Exception:
                    self.reper.add_sentence(
                        "The file '%s' doesn't contain RDF statements" % cur_rdf_path,
                        False)
Example #3
    s = Storer(context_map={context_path: context_file_path})

    all_files = []
    if args.rec:
        for cur_dir, cur_subdir, cur_files in os.walk(args.i_dir):
            for cur_file in cur_files:
                if cur_file.endswith(".json"):
                    all_files += [cur_dir + os.sep + cur_file]
    else:
        for cur_file in os.listdir(args.i_dir):
            if cur_file.endswith(".json"):
                all_files += [args.i_dir + os.sep + cur_file]

    for rdf_path in all_files:
        cur_g = s.load(rdf_path, tmp_dir=temp_dir_for_rdf_loading)
        try:
            for o in cur_g.objects(None, GraphEntity.has_identifier):
                o_local_path = str(o).replace(base_iri, base_dir) + ".json"
                id_g = s.load(o_local_path, tmp_dir=temp_dir_for_rdf_loading)
                for v in id_g.objects(None, GraphEntity.has_literal_value):
                    if v not in id_doc:
                        id_doc[v] = []
                    id_doc[v] += [rdf_path]
        except Exception:
            pass  # Skip files that cannot be loaded as RDF

    result = []
    for id_lit in id_doc:
        cur_list = id_doc[id_lit]
        if len(cur_list) > 1:
Example #4
class DatasetHandler(object):
    DCTERMS = Namespace("http://purl.org/dc/terms/")
    DCAT = Namespace("http://www.w3.org/ns/dcat#")
    VOID = Namespace("http://rdfs.org/ns/void#")
    MTT = Namespace("https://w3id.org/spar/mediatype/text/")
    DBR = Namespace("http://dbpedia.org/resource/")

    dataset = DCAT.Dataset
    datafile = DCAT.Distribution

    title = DCTERMS.title
    description = DCTERMS.description
    issued = DCTERMS.issued
    modified = DCTERMS.modified
    keyword = DCAT.keyword
    subject = DCAT.theme
    landing_page = DCAT.landingPage
    subset = VOID.subset
    sparql_endpoint = VOID.sparqlEndpoint
    distribution = DCAT.distribution
    license = DCTERMS.license
    download_url = DCAT.downloadURL
    media_type = DCAT.mediaType
    byte_size = DCAT.byteSize
    label = RDFS.label
    a = RDF.type
    turtle = MTT.turtle
    bibliographic_database = DBR.Bibliographic_database
    open_access = DBR.Open_access
    scholarly_communication = DBR.Scholarly_communication
    citations = DBR.Citation

    def __init__(self, tp_url_real, context_path, context_file_path,
                 base_iri, base_dir, info_dir, dataset_home, tmp_dir, triplestore_url=None):
        self.tp_url = triplestore_url
        self.base_iri = base_iri
        self.base_dir = base_dir
        self.info_dir = info_dir
        self.context_path = context_path
        self.dataset_home = URIRef(dataset_home)
        self.tmp_dir = tmp_dir
        self.tp_res = URIRef(tp_url_real)
        self.repok = Reporter(prefix="[DatasetHandler: INFO] ")
        self.reperr = Reporter(prefix="[DatasetHandler: ERROR] ")
        self.st = Storer(context_map={context_path: context_file_path},
                         repok=self.repok, reperr=self.reperr)
        self.st.set_preface_query(
            u"DELETE { ?res <%s> ?date } WHERE { ?res a <%s> ; <%s> ?date }" %
            (str(DatasetHandler.modified), str(DatasetHandler.dataset), str(DatasetHandler.modified)))

    # /START Create Literal
    def create_label(self, g, res, string):
        return create_literal(g, res, RDFS.label, string)

    def create_publication_date(self, g, res, string):
        return create_literal(g, res, self.issued, string, XSD.dateTime)

    def update_modification_date(self, g, res, string):
        g.remove((res, self.modified, None))
        return create_literal(g, res, self.modified, string, XSD.dateTime)

    def create_title(self, g, res, string):
        return create_literal(g, res, self.title, string)

    def create_description(self, g, res, string):
        return create_literal(g, res, self.description, string)

    def create_keyword(self, g, res, string):
        return create_literal(g, res, self.keyword, string)

    def create_byte_size(self, g, res, string):
        return create_literal(g, res, self.byte_size, string, XSD.decimal)
    # /END Create Literal

    # /START Create Complex Attributes
    def has_subject(self, g, res, obj):
        g.add((res, self.subject, obj))

    def has_landing_page(self, g, res, obj):
        g.add((res, self.landing_page, obj))

    def has_subset(self, g, res, obj):
        g.add((res, self.subset, obj))

    def has_sparql_endpoint(self, g, res, obj):
        g.add((res, self.sparql_endpoint, obj))

    def has_distribution(self, g, res, obj):
        g.add((res, self.distribution, obj))

    def has_license(self, g, res, obj):
        g.add((res, self.license, obj))

    def has_download_url(self, g, res, obj):
        g.add((res, self.download_url, obj))

    def has_media_type(self, g, res, obj):
        g.add((res, self.media_type, obj))
    # /END Create Complex Attributes

    # /START Types
    def dataset_type(self, g, res):
        create_type(g, res, self.dataset)

    def distribution_type(self, g, res):
        create_type(g, res, self.datafile)
    # /END Types

    def update_dataset_info(self, graph_set):
        cur_time = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
        subgraphs_to_update = set()
        all_graphs = []

        for g in graph_set.graphs():
            cur_id = g.identifier
            if cur_id not in subgraphs_to_update:
                subgraphs_to_update.add(cur_id)
                cur_dataset_res = URIRef(cur_id)
                cur_dataset = self.get_dataset_graph(cur_dataset_res, cur_time)
                self.update_modification_date(cur_dataset, cur_dataset_res, cur_time)
                all_graphs += [cur_dataset]

        if subgraphs_to_update:
            cur_occ_res = URIRef(self.base_iri)
            cur_occ = self.get_dataset_graph(cur_occ_res, cur_time)
            self.update_modification_date(cur_occ, cur_occ_res, cur_time)

            for subgraph_id in subgraphs_to_update:
                self.has_subset(cur_occ, cur_occ_res, URIRef(subgraph_id))
            all_graphs += [cur_occ]

        if all_graphs:  # Store everything and upload to triplestore
            if self.tp_url is None:
                self.st.store_all(
                    self.base_dir, self.base_iri, self.context_path,
                    self.tmp_dir, all_graphs, True)
            else:
                self.st.upload_and_store(
                    self.base_dir, self.tp_url, self.base_iri, self.context_path,
                    self.tmp_dir, all_graphs, True)

    def get_dataset_graph(self, res, cur_time):
        dataset_path = self.get_metadata_path_from_resource(res)
        if os.path.exists(dataset_path):
            return list(self.st.load(dataset_path, tmp_dir=self.tmp_dir).contexts())[0]
        else:
            dataset_label = "OCC"
            dataset_title = "The OpenCitations Corpus"
            dataset_description = "The OpenCitations Corpus is an open repository of scholarly " \
                                  "citation data made available under a Creative Commons public " \
                                  "domain dedication, which provides in RDF accurate citation " \
                                  "information (bibliographic references) harvested from the " \
                                  "scholarly literature (described using the SPAR Ontologies) " \
                                  "that others may freely build upon, enhance and reuse for any " \
                                  "purpose, without restriction under copyright or database law."
            if re.search("/../$", str(res)) is not None:
                g = Graph(identifier=str(res))
                dataset_short_name = str(res)[-3:-1]
                dataset_name = GraphSet.labels[dataset_short_name]
                dataset_title += ": %s dataset" % dataset_name.title()
                dataset_description += " This sub-dataset contains all the '%s' resources." % \
                                       dataset_name
                dataset_label += " / %s" % dataset_short_name
                self.create_keyword(g, res, dataset_name)
            else:
                g = Graph()
                self.has_landing_page(g, res, self.dataset_home)
                self.has_sparql_endpoint(g, res, self.tp_res)
            self.dataset_type(g, res)
            self.create_label(g, res, dataset_label)
            self.create_title(g, res, dataset_title)
            self.create_description(g, res, dataset_description)
            self.create_publication_date(g, res, cur_time)
            self.create_keyword(g, res, "OCC")
            self.create_keyword(g, res, "OpenCitations")
            self.create_keyword(g, res, "OpenCitations Corpus")
            self.create_keyword(g, res, "SPAR Ontologies")
            self.create_keyword(g, res, "bibliographic references")
            self.create_keyword(g, res, "citations")
            self.has_subject(g, res, self.bibliographic_database)
            self.has_subject(g, res, self.scholarly_communication)
            self.has_subject(g, res, self.open_access)
            self.has_subject(g, res, self.citations)

            return g

    def get_metadata_path_from_resource(self, dataset_res):
        return self.get_metadata_path_from_iri(str(dataset_res))

    def get_metadata_path_from_iri(self, dataset_iri):
        return re.sub("^%s" % self.base_iri, self.base_dir, dataset_iri) + "index.json"
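The mapping from a dataset IRI to its metadata file in get_metadata_path_from_iri is a plain prefix substitution followed by an "index.json" suffix. The following self-contained sketch illustrates it; the base_iri and base_dir values are invented for the example.

import re

base_iri = "https://w3id.org/oc/corpus/"  # hypothetical IRI prefix
base_dir = "/srv/oc/corpus/"              # hypothetical local root

def metadata_path(dataset_iri):
    # Swap the IRI prefix for the local directory, then point at the
    # per-dataset index file, mirroring get_metadata_path_from_iri.
    return re.sub("^%s" % base_iri, base_dir, dataset_iri) + "index.json"

print(metadata_path("https://w3id.org/oc/corpus/br/"))
# -> /srv/oc/corpus/br/index.json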
Example #5
class DatasetHandler(object):
    DCTERMS = Namespace("http://purl.org/dc/terms/")
    DCAT = Namespace("http://www.w3.org/ns/dcat#")
    VOID = Namespace("http://rdfs.org/ns/void#")
    MTT = Namespace("https://w3id.org/spar/mediatype/text/")
    DBR = Namespace("http://dbpedia.org/resource/")

    dataset = DCAT.Dataset
    datafile = DCAT.Distribution

    title = DCTERMS.title
    description = DCTERMS.description
    issued = DCTERMS.issued
    modified = DCTERMS.modified
    keyword = DCAT.keyword
    subject = DCAT.theme
    landing_page = DCAT.landingPage
    subset = VOID.subset
    sparql_endpoint = VOID.sparqlEndpoint
    distribution = DCAT.distribution
    license = DCTERMS.license
    download_url = DCAT.downloadURL
    media_type = DCAT.mediaType
    byte_size = DCAT.byteSize
    label = RDFS.label
    a = RDF.type
    turtle = MTT.turtle
    bibliographic_database = DBR.Bibliographic_database
    open_access = DBR.Open_access
    scholarly_communication = DBR.Scholarly_communication
    citations = DBR.Citation

    def __init__(self,
                 tp_url_real,
                 context_path,
                 context_file_path,
                 base_iri,
                 base_dir,
                 info_dir,
                 dataset_home,
                 tmp_dir,
                 triplestore_url=None):
        self.tp_url = triplestore_url
        self.base_iri = base_iri
        self.base_dir = base_dir
        self.info_dir = info_dir
        self.context_path = context_path
        self.dataset_home = URIRef(dataset_home)
        self.tmp_dir = tmp_dir
        self.tp_res = URIRef(tp_url_real)
        self.repok = Reporter(prefix="[DatasetHandler: INFO] ")
        self.reperr = Reporter(prefix="[DatasetHandler: ERROR] ")
        self.st = Storer(context_map={context_path: context_file_path},
                         repok=self.repok,
                         reperr=self.reperr)
        self.st.set_preface_query(
            u"DELETE { ?res <%s> ?date } WHERE { ?res a <%s> ; <%s> ?date }" %
            (str(DatasetHandler.modified), str(
                DatasetHandler.dataset), str(DatasetHandler.modified)))

    # /START Create Literal
    def create_label(self, g, res, string):
        return create_literal(g, res, RDFS.label, string)

    def create_publication_date(self, g, res, string):
        return create_literal(g, res, self.issued, string, XSD.dateTime)

    def update_modification_date(self, g, res, string):
        g.remove((res, self.modified, None))
        return create_literal(g, res, self.modified, string, XSD.dateTime)

    def create_title(self, g, res, string):
        return create_literal(g, res, self.title, string)

    def create_description(self, g, res, string):
        return create_literal(g, res, self.description, string)

    def create_keyword(self, g, res, string):
        return create_literal(g, res, self.keyword, string)

    def create_byte_size(self, g, res, string):
        return create_literal(g, res, self.byte_size, string, XSD.decimal)

    # /END Create Literal

    # /START Create Complex Attributes
    def has_subject(self, g, res, obj):
        g.add((res, self.subject, obj))

    def has_landing_page(self, g, res, obj):
        g.add((res, self.landing_page, obj))

    def has_subset(self, g, res, obj):
        g.add((res, self.subset, obj))

    def has_sparql_endpoint(self, g, res, obj):
        g.add((res, self.sparql_endpoint, obj))

    def has_distribution(self, g, res, obj):
        g.add((res, self.distribution, obj))

    def has_license(self, g, res, obj):
        g.add((res, self.license, obj))

    def has_download_url(self, g, res, obj):
        g.add((res, self.download_url, obj))

    def has_media_type(self, g, res, obj):
        g.add((res, self.media_type, obj))

    # /END Create Complex Attributes

    # /START Types
    def dataset_type(self, g, res):
        create_type(g, res, self.dataset)

    def distribution_type(self, g, res):
        create_type(g, res, self.datafile)

    # /END Types

    def update_dataset_info(self, graph_set):
        cur_time = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
        subgraphs_to_update = set()
        all_graphs = []

        for g in graph_set.graphs():
            cur_id = g.identifier
            if cur_id not in subgraphs_to_update:
                subgraphs_to_update.add(cur_id)
                cur_dataset_res = URIRef(cur_id)
                cur_dataset = self.get_dataset_graph(cur_dataset_res, cur_time)
                self.update_modification_date(cur_dataset, cur_dataset_res,
                                              cur_time)
                all_graphs += [cur_dataset]

        if subgraphs_to_update:
            cur_occ_res = URIRef(self.base_iri)
            cur_occ = self.get_dataset_graph(cur_occ_res, cur_time)
            self.update_modification_date(cur_occ, cur_occ_res, cur_time)

            for subgraph_id in subgraphs_to_update:
                self.has_subset(cur_occ, cur_occ_res, URIRef(subgraph_id))
            all_graphs += [cur_occ]

        if all_graphs:  # Store everything and upload to triplestore
            if self.tp_url is None:
                self.st.store_all(self.base_dir, self.base_iri,
                                  self.context_path, self.tmp_dir, all_graphs,
                                  True)
            else:
                self.st.upload_and_store(self.base_dir, self.tp_url,
                                         self.base_iri, self.context_path,
                                         self.tmp_dir, all_graphs, True)

    def get_dataset_graph(self, res, cur_time):
        dataset_path = self.get_metadata_path_from_resource(res)
        if os.path.exists(dataset_path):
            return list(
                self.st.load(dataset_path, tmp_dir=self.tmp_dir).contexts())[0]
        else:
            dataset_label = "OCC"
            dataset_title = "The OpenCitations Corpus"
            dataset_description = "The OpenCitations Corpus is an open repository of scholarly " \
                                  "citation data made available under a Creative Commons public " \
                                  "domain dedication, which provides in RDF accurate citation " \
                                  "information (bibliographic references) harvested from the " \
                                  "scholarly literature (described using the SPAR Ontologies) " \
                                  "that others may freely build upon, enhance and reuse for any " \
                                  "purpose, without restriction under copyright or database law."
            if re.search("/../$", str(res)) is not None:
                g = Graph(identifier=str(res))
                dataset_short_name = str(res)[-3:-1]
                dataset_name = GraphSet.labels[dataset_short_name]
                dataset_title += ": %s dataset" % dataset_name.title()
                dataset_description += " This sub-dataset contains all the '%s' resources." % \
                                       dataset_name
                dataset_label += " / %s" % dataset_short_name
                self.create_keyword(g, res, dataset_name)
            else:
                g = Graph()
                self.has_landing_page(g, res, self.dataset_home)
                self.has_sparql_endpoint(g, res, self.tp_res)
            self.dataset_type(g, res)
            self.create_label(g, res, dataset_label)
            self.create_title(g, res, dataset_title)
            self.create_description(g, res, dataset_description)
            self.create_publication_date(g, res, cur_time)
            self.create_keyword(g, res, "OCC")
            self.create_keyword(g, res, "OpenCitations")
            self.create_keyword(g, res, "OpenCitations Corpus")
            self.create_keyword(g, res, "SPAR Ontologies")
            self.create_keyword(g, res, "bibliographic references")
            self.create_keyword(g, res, "citations")
            self.has_subject(g, res, self.bibliographic_database)
            self.has_subject(g, res, self.scholarly_communication)
            self.has_subject(g, res, self.open_access)
            self.has_subject(g, res, self.citations)

            return g

    def get_metadata_path_from_resource(self, dataset_res):
        return self.get_metadata_path_from_iri(str(dataset_res))

    def get_metadata_path_from_iri(self, dataset_iri):
        return re.sub("^%s" % self.base_iri, self.base_dir,
                      dataset_iri) + "index.json"
Example #6
class ResourceFinder(object):

    def __init__(self, g_set=None, ts_url=None, base_dir=None, base_iri=None,
                 tmp_dir=None, context_map={}):
        self.g = Graph()
        self.base_dir = base_dir
        self.base_iri = base_iri
        self.storer = Storer(context_map=context_map)
        self.tmp_dir = tmp_dir
        self.name = "SPACIN " + self.__class__.__name__
        self.loaded = set()
        if g_set is not None:
            self.update_graph_set(g_set)
        if ts_url is None:
            self.ts = None
        else:
            self.ts = ConjunctiveGraph('SPARQLUpdateStore')
            self.ts.open((ts_url, ts_url))

    def add_prov_triples_in_filesystem(self, res_iri, prov_entity_type=None):
        if self.base_dir is not None and self.base_iri is not None:
            cur_file_path = find_paths(res_iri, self.base_dir, self.base_iri,
                                       dir_split_number, items_per_file)[1]
            if cur_file_path.endswith("index.json"):
                cur_path = cur_file_path.replace("index.json", "") + "prov"
            else:
                cur_path = cur_file_path[:-5] + os.sep + "prov"

            file_list = []
            if os.path.isdir(cur_path):
                for cur_dir, cur_subdir, cur_files in os.walk(cur_path):
                    for cur_file in cur_files:
                        if cur_file.endswith(".json") and \
                           (prov_entity_type is None or cur_file.startswith(prov_entity_type)):
                            file_list += [cur_dir + os.sep + cur_file]

            for file_path in file_list:
                if file_path not in self.loaded:
                    self.loaded.add(file_path)
                    cur_g = self.storer.load(file_path, tmp_dir=self.tmp_dir)
                    self.add_triples_in_graph(cur_g)

    def add_triples_in_graph(self, g):
        if g is not None:
            for s, p, o in g.triples((None, None, None)):
                self.g.add((s, p, o))

    def update_graph_set(self, g_set):
        for g in g_set.graphs():
            self.add_triples_in_graph(g)

    def retrieve(self, id_dict):
        for id_type in id_dict:
            for id_string in id_dict[id_type]:
                res = self.__id_with_type(id_string, id_type)
                if res is not None:
                    return res

    def retrieve_provenance_agent_from_name(self, string):
        query = """
            SELECT DISTINCT ?pa WHERE {
              ?pa a <%s> ;
                <%s> "%s"
            } LIMIT 1
        """ % (ProvEntity.prov_agent,
               GraphEntity.name, string)
        return self.__query(query)

    def retrieve_from_orcid(self, string):
        return self.__id_with_type(string, GraphEntity.orcid)

    def retrieve_citing_from_doi(self, string):
        return self.__id_with_type(
            string.lower(), GraphEntity.doi, "?res <%s> ?cited" % GraphEntity.cites)

    def retrieve_citing_from_pmid(self, string):
        return self.__id_with_type(
            string, GraphEntity.pmid, "?res <%s> ?cited" % GraphEntity.cites)

    def retrieve_citing_from_pmcid(self, string):
        return self.__id_with_type(
            string, GraphEntity.pmcid, "?res <%s> ?cited" % GraphEntity.cites)

    def retrieve_citing_from_url(self, string):
        return self.__id_with_type(
            string.lower(), GraphEntity.url, "?res <%s> ?cited" % GraphEntity.cites)

    def retrieve_from_doi(self, string):
        return self.__id_with_type(string.lower(), GraphEntity.doi)

    def retrieve_from_pmid(self, string):
        return self.__id_with_type(string, GraphEntity.pmid)

    def retrieve_from_pmcid(self, string):
        return self.__id_with_type(string, GraphEntity.pmcid)

    def retrieve_from_url(self, string):
        return self.__id_with_type(string.lower(), GraphEntity.url)

    def retrieve_from_issn(self, string):
        return self.__id_with_type(string, GraphEntity.issn)

    def retrieve_from_isbn(self, string):
        return self.__id_with_type(string, GraphEntity.isbn)

    def retrieve_issue_from_journal(self, id_dict, issue_id, volume_id):
        if volume_id is None:
            return self.__retrieve_from_journal(id_dict, GraphEntity.journal_issue, issue_id)
        else:
            retrieved_volume = self.retrieve_volume_from_journal(id_dict, volume_id)
            if retrieved_volume is not None:
                query = """
                    SELECT DISTINCT ?br WHERE {
                        ?br a <%s> ;
                            <%s> <%s> ;
                            <%s> "%s"
                    } LIMIT 1
                """ % (GraphEntity.journal_issue,
                       GraphEntity.part_of, str(retrieved_volume),
                       GraphEntity.has_sequence_identifier, issue_id)
                return self.__query(query)

    def retrieve_volume_from_journal(self, id_dict, volume_id):
        return self.__retrieve_from_journal(id_dict, GraphEntity.journal_volume, volume_id)

    def retrieve_br_url(self, res, string):
        return self.__retrieve_res_id_by_type(res, string.lower(), GraphEntity.url)

    def retrieve_br_doi(self, res, string):
        return self.__retrieve_res_id_by_type(res, string.lower(), GraphEntity.doi)

    def retrieve_br_pmid(self, res, string):
        return self.__retrieve_res_id_by_type(res, string, GraphEntity.pmid)

    def retrieve_br_pmcid(self, res, string):
        return self.__retrieve_res_id_by_type(res, string, GraphEntity.pmcid)

    def retrieve_last_snapshot(self, prov_subj):
        query = """
            SELECT DISTINCT ?se WHERE {
                ?se <%s> <%s> .
                FILTER NOT EXISTS {?se <%s> ?ca }
            } LIMIT 1
        """ % (ProvEntity.specialization_of, str(prov_subj),
               ProvEntity.was_invalidated_by)
        return self.__query(query)

    def __retrieve_res_id_by_type(self, res, id_string, id_type):
        if id_string is not None:
            query = """
            SELECT DISTINCT ?id WHERE {
                <%s> <%s> ?id .
                ?id <%s> <%s> ;
                    <%s> "%s"
            }""" % (
                res, GraphEntity.has_identifier,
                GraphEntity.uses_identifier_scheme, id_type,
                GraphEntity.has_literal_value, id_string)
            return self.__query(query)

    def __retrieve_from_journal(self, id_dict, part_type, part_seq_id):
        for id_type in id_dict:
            for id_string in id_dict[id_type]:
                query = """
                SELECT DISTINCT ?res WHERE {
                    ?j <%s> ?id .
                    ?id
                        <%s> <%s> ;
                        <%s> "%s" .
                    ?res a <%s> ;
                        <%s>+ ?j ;
                        <%s> "%s"
                }""" % (
                    GraphEntity.has_identifier,
                    GraphEntity.uses_identifier_scheme, id_type,
                    GraphEntity.has_literal_value, id_string,
                    part_type,
                    GraphEntity.part_of,
                    GraphEntity.has_sequence_identifier, part_seq_id
                )

                res = self.__query(query)
                if res is not None:
                    return res

    def __id_with_type(self, id_string, id_type, extras=""):
        query = """
        SELECT DISTINCT ?res WHERE {
            ?res <%s> ?id .
            ?id
                <%s> <%s> ;
                <%s> "%s" .
                %s
        }""" % (
            GraphEntity.has_identifier,
            GraphEntity.uses_identifier_scheme, id_type,
            GraphEntity.has_literal_value, id_string, extras)

        return self.__query(query)

    def __query(self, query):
        if self.ts is not None:
            result = self.ts.query(query)
            for res, in result:
                return res

        # If nothing has been returned, check if there is something
        # in the current graph set
        result = self.g.query(query)
        for res, in result:
            return res
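The lookups above all reduce to the SPARQL pattern built in __id_with_type: find a resource whose identifier uses a given scheme and carries a given literal value. The following standalone sketch runs that pattern over a small in-memory rdflib graph; the two namespaces stand in for the GraphEntity terms, which are defined elsewhere in the codebase and assumed here to follow the DataCite and literal-reification vocabularies.

from rdflib import Graph, Literal, Namespace, URIRef

# Assumed vocabulary; the real values live in GraphEntity (not shown here).
DATACITE = Namespace("http://purl.org/spar/datacite/")
LITERAL = Namespace("http://www.essepuntato.it/2010/06/literalreification/")

g = Graph()
br = URIRef("http://example.org/br/1")
br_id = URIRef("http://example.org/id/1")
g.add((br, DATACITE.hasIdentifier, br_id))
g.add((br_id, DATACITE.usesIdentifierScheme, DATACITE.doi))
g.add((br_id, LITERAL.hasLiteralValue, Literal("10.1000/example")))

# Same query shape as __id_with_type, with the identifiers inlined.
query = """
SELECT DISTINCT ?res WHERE {
    ?res <%s> ?id .
    ?id <%s> <%s> ;
        <%s> "%s"
}""" % (DATACITE.hasIdentifier,
        DATACITE.usesIdentifierScheme, DATACITE.doi,
        LITERAL.hasLiteralValue, "10.1000/example")

for res, in g.query(query):
    print(res)  # -> http://example.org/br/1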
Example #7
class ResourceFinder(object):
    def __init__(self,
                 g_set=None,
                 ts_url=None,
                 base_dir=None,
                 base_iri=None,
                 tmp_dir=None,
                 context_map={}):
        self.g = Graph()
        self.base_dir = base_dir
        self.base_iri = base_iri
        self.storer = Storer(context_map=context_map)
        self.tmp_dir = tmp_dir
        self.name = "SPACIN " + self.__class__.__name__
        self.loaded = set()
        if g_set is not None:
            self.update_graph_set(g_set)
        if ts_url is None:
            self.ts = None
        else:
            self.ts = ConjunctiveGraph('SPARQLUpdateStore')
            self.ts.open((ts_url, ts_url))

    def add_prov_triples_in_filesystem(self, res_iri, prov_entity_type=None):
        if self.base_dir is not None and self.base_iri is not None:
            cur_file_path = find_paths(res_iri, self.base_dir, self.base_iri,
                                       dir_split_number, items_per_file)[1]
            if cur_file_path.endswith("index.json"):
                cur_path = cur_file_path.replace("index.json", "") + "prov"
            else:
                cur_path = cur_file_path[:-5] + os.sep + "prov"

            file_list = []
            if os.path.isdir(cur_path):
                for cur_dir, cur_subdir, cur_files in os.walk(cur_path):
                    for cur_file in cur_files:
                        if cur_file.endswith(".json") and \
                           (prov_entity_type is None or cur_file.startswith(prov_entity_type)):
                            file_list += [cur_dir + os.sep + cur_file]

            for file_path in file_list:
                if file_path not in self.loaded:
                    self.loaded.add(file_path)
                    cur_g = self.storer.load(file_path, tmp_dir=self.tmp_dir)
                    self.add_triples_in_graph(cur_g)

    def add_triples_in_graph(self, g):
        if g is not None:
            for s, p, o in g.triples((None, None, None)):
                self.g.add((s, p, o))

    def update_graph_set(self, g_set):
        for g in g_set.graphs():
            self.add_triples_in_graph(g)

    def retrieve(self, id_dict):
        for id_type in id_dict:
            for id_string in id_dict[id_type]:
                res = self.__id_with_type(id_string, id_type)
                if res is not None:
                    return res

    def retrieve_provenance_agent_from_name(self, string):
        query = """
            SELECT DISTINCT ?pa WHERE {
              ?pa a <%s> ;
                <%s> "%s"
            } LIMIT 1
        """ % (ProvEntity.prov_agent, GraphEntity.name, string)
        return self.__query(query)

    def retrieve_from_orcid(self, string):
        return self.__id_with_type(string, GraphEntity.orcid)

    def retrieve_citing_from_doi(self, string):
        return self.__id_with_type(string.lower(), GraphEntity.doi,
                                   "?res <%s> ?cited" % GraphEntity.cites)

    def retrieve_citing_from_pmid(self, string):
        return self.__id_with_type(string, GraphEntity.pmid,
                                   "?res <%s> ?cited" % GraphEntity.cites)

    def retrieve_citing_from_pmcid(self, string):
        return self.__id_with_type(string, GraphEntity.pmcid,
                                   "?res <%s> ?cited" % GraphEntity.cites)

    def retrieve_citing_from_url(self, string):
        return self.__id_with_type(string.lower(), GraphEntity.url,
                                   "?res <%s> ?cited" % GraphEntity.cites)

    def retrieve_from_doi(self, string):
        return self.__id_with_type(string.lower(), GraphEntity.doi)

    def retrieve_from_pmid(self, string):
        return self.__id_with_type(string, GraphEntity.pmid)

    def retrieve_from_pmcid(self, string):
        return self.__id_with_type(string, GraphEntity.pmcid)

    def retrieve_from_url(self, string):
        return self.__id_with_type(string.lower(), GraphEntity.url)

    def retrieve_from_issn(self, string):
        return self.__id_with_type(string, GraphEntity.issn)

    def retrieve_from_isbn(self, string):
        return self.__id_with_type(string, GraphEntity.isbn)

    def retrieve_issue_from_journal(self, id_dict, issue_id, volume_id):
        if volume_id is None:
            return self.__retrieve_from_journal(id_dict,
                                                GraphEntity.journal_issue,
                                                issue_id)
        else:
            retrieved_volume = self.retrieve_volume_from_journal(
                id_dict, volume_id)
            if retrieved_volume is not None:
                query = """
                    SELECT DISTINCT ?br WHERE {
                        ?br a <%s> ;
                            <%s> <%s> ;
                            <%s> "%s"
                    } LIMIT 1
                """ % (GraphEntity.journal_issue, GraphEntity.part_of,
                       str(retrieved_volume),
                       GraphEntity.has_sequence_identifier, issue_id)
                return self.__query(query)

    def retrieve_volume_from_journal(self, id_dict, volume_id):
        return self.__retrieve_from_journal(id_dict,
                                            GraphEntity.journal_volume,
                                            volume_id)

    def retrieve_br_url(self, res, string):
        return self.__retrieve_res_id_by_type(res, string.lower(),
                                              GraphEntity.url)

    def retrieve_br_doi(self, res, string):
        return self.__retrieve_res_id_by_type(res, string.lower(),
                                              GraphEntity.doi)

    def retrieve_br_pmid(self, res, string):
        return self.__retrieve_res_id_by_type(res, string, GraphEntity.pmid)

    def retrieve_br_pmcid(self, res, string):
        return self.__retrieve_res_id_by_type(res, string, GraphEntity.pmcid)

    def retrieve_last_snapshot(self, prov_subj):
        query = """
            SELECT DISTINCT ?se WHERE {
                ?se <%s> <%s> .
                FILTER NOT EXISTS {?se <%s> ?ca }
            } LIMIT 1
        """ % (ProvEntity.specialization_of, str(prov_subj),
               ProvEntity.was_invalidated_by)
        return self.__query(query)

    def __retrieve_res_id_by_type(self, res, id_string, id_type):
        if id_string is not None:
            query = """
            SELECT DISTINCT ?id WHERE {
                <%s> <%s> ?id .
                ?id <%s> <%s> ;
                    <%s> "%s"
            }""" % (res, GraphEntity.has_identifier,
                    GraphEntity.uses_identifier_scheme, id_type,
                    GraphEntity.has_literal_value, id_string)
            return self.__query(query)

    def __retrieve_from_journal(self, id_dict, part_type, part_seq_id):
        for id_type in id_dict:
            for id_string in id_dict[id_type]:
                query = """
                SELECT DISTINCT ?res WHERE {
                    ?j <%s> ?id .
                    ?id
                        <%s> <%s> ;
                        <%s> "%s" .
                    ?res a <%s> ;
                        <%s>+ ?j ;
                        <%s> "%s"
                }""" % (GraphEntity.has_identifier,
                        GraphEntity.uses_identifier_scheme, id_type,
                        GraphEntity.has_literal_value, id_string, part_type,
                        GraphEntity.part_of,
                        GraphEntity.has_sequence_identifier, part_seq_id)

                res = self.__query(query)
                if res is not None:
                    return res

    def __id_with_type(self, id_string, id_type, extras=""):
        query = """
        SELECT DISTINCT ?res WHERE {
            ?res <%s> ?id .
            ?id
                <%s> <%s> ;
                <%s> "%s" .
                %s
        }""" % (GraphEntity.has_identifier, GraphEntity.uses_identifier_scheme,
                id_type, GraphEntity.has_literal_value, id_string, extras)

        return self.__query(query)

    def __query(self, query):
        if self.ts is not None:
            result = self.ts.query(query)
            for res, in result:
                return res

        # If nothing has been returned, check if there is something
        # in the current graph set
        result = self.g.query(query)
        for res, in result:
            return res
Example #8
    s = Storer(context_map={context_path: context_file_path})

    all_files = []
    if args.rec:
        for cur_dir, cur_subdir, cur_files in os.walk(args.i_dir):
            for cur_file in cur_files:
                if cur_file.endswith(".json"):
                    all_files += [cur_dir + os.sep + cur_file]
    else:
        for cur_file in os.listdir(args.i_dir):
            if cur_file.endswith(".json"):
                all_files += [args.i_dir + os.sep + cur_file]

    for rdf_path in all_files:
        cur_g = s.load(rdf_path, tmp_dir=temp_dir_for_rdf_loading)
        try:
            for o in cur_g.objects(None, GraphEntity.has_identifier):
                o_local_path = str(o).replace(base_iri, base_dir) + ".json"
                id_g = s.load(o_local_path, tmp_dir=temp_dir_for_rdf_loading)
                for v in id_g.objects(None, GraphEntity.has_literal_value):
                    if v not in id_doc:
                        id_doc[v] = []
                    id_doc[v] += [rdf_path]
        except Exception:
            pass  # Skip files that cannot be loaded as RDF

    result = []
    for id_lit in id_doc:
        cur_list = id_doc[id_lit]
        if len(cur_list) > 1: