Exemple #1
0
 def __init__(self,
              g_set=None,
              ts_url=None,
              base_dir=None,
              base_iri=None,
              default_dir="_",
              tmp_dir=None,
              context_map=None,
              dir_split=0,
              n_file_item=1):
     """Initialise the local graph, the storer and the optional triplestore link.

     :param g_set: when not None, handed to ``update_graph_set``.
     :param ts_url: when not None, a ``ConjunctiveGraph`` backed by a
         ``SPARQLUpdateStore`` is opened with this URL used for both
         endpoints; otherwise ``self.ts`` is None.
     :param base_dir: stored as ``self.base_dir``.
     :param base_iri: stored as ``self.base_iri``.
     :param default_dir: stored as ``self.default_dir``.
     :param tmp_dir: stored as ``self.tmp_dir``.
     :param context_map: mapping forwarded to ``Storer``; an empty dict is
         created when omitted.
     :param dir_split: stored as ``self.dir_split``.
     :param n_file_item: stored as ``self.n_file_item``.
     """
     # Build a fresh dict per call: a literal `{}` default is created once
     # and would be shared by every instance built without a context map.
     if context_map is None:
         context_map = {}

     self.g = Graph()
     self.base_dir = base_dir
     self.base_iri = base_iri
     self.storer = Storer(context_map=context_map)
     self.tmp_dir = tmp_dir
     self.dir_split = dir_split
     self.n_file_item = n_file_item
     self.name = "SPACIN " + self.__class__.__name__
     # Paths already handed to the storer for loading.
     self.loaded = set()
     self.default_dir = default_dir
     if g_set is not None:
         self.update_graph_set(g_set)
     if ts_url is None:
         self.ts = None
     else:
         self.ts = ConjunctiveGraph('SPARQLUpdateStore')
         self.ts.open((ts_url, ts_url))
Exemple #2
0
def fix_reference(timestamp, accept, citing, cited, reference):
    """Replace the text of a bibliographic reference and, if rejected, its target.

    The reference linking ``citing`` to ``cited`` is looked up through the
    resource finder; its stored content is scheduled for removal and the
    unquoted ``reference`` text is scheduled for addition. When ``accept``
    is the string ``"false"`` the cited resource is also handled:

    * if the reference's modification date equals ``timestamp``, every triple
      of the cited resource is scheduled for removal, except the one typing
      it as ``GraphEntity.expression``;
    * otherwise a new bibliographic resource is created, provenance is
      generated and stored, and the ``cites`` / ``references`` links are
      moved from the old cited resource to the new one.

    :param timestamp: value convertible with ``int()``, in seconds.
    :param accept: the string ``"false"`` triggers the cited-resource branch.
    :param citing: suffix appended to ``base_iri`` for the citing resource.
    :param cited: suffix appended to ``base_iri`` for the cited resource.
    :param reference: URL-quoted replacement text of the reference.
    :return: ``(timestamp, accept, citing, cited, quoted_previous_text)``.
    """
    rf, of, cp, cdh = create_resources()
    storer = Storer(cp.graph_set(),
                    context_map={context_path: context_file_path},
                    dir_split=dir_split_number,
                    n_file_item=items_per_file,
                    default_dir=default_dir)

    r_text = unquote(reference)

    g_add_be = Graph(identifier=base_iri + "be/")
    g_remove_be = Graph(identifier=base_iri + "be/")
    g_add_br = Graph(identifier=base_iri + "br/")
    g_remove_br = Graph(identifier=base_iri + "br/")

    ref_res = rf.retrieve_reference(base_iri + citing, base_iri + cited)
    g_add_be.add((ref_res, GraphEntity.has_content, Literal(r_text)))
    ref_res_text = rf.retrieve_reference_text(ref_res)
    g_remove_be.add((ref_res, GraphEntity.has_content, ref_res_text))

    if accept == "false":
        citing_res = URIRef(base_iri + citing)
        cited_res = URIRef(base_iri + cited)
        cur_time = datetime.fromtimestamp(
            int(timestamp)).strftime('%Y-%m-%dT%H:%M:%S')
        mod_date = str(rf.retrieve_modification_date(ref_res))

        if cur_time == mod_date:  # It didn't exist before
            _, cur_file_path = storer.dir_and_file_paths(
                g_remove_br, base_dir, base_iri)
            cur_g = storer.load(cur_file_path)
            # The loop variables must not reuse the storer's name: the
            # storer is still needed for the update calls below.
            for subj, pred, obj in cur_g.triples((cited_res, None, None)):
                if pred != RDF.type or obj != GraphEntity.expression:
                    # Graph.add takes a single (s, p, o) tuple.
                    g_remove_br.add((subj, pred, obj))

        else:  # It exists already
            new_cited = URIRef(
                str(cp.graph_set().add_br(cp.name, doi_curator,
                                          bcite_base_iri)))
            gen_prov_and_store_data(cp, rf, timestamp)
            g_remove_br.add((citing_res, GraphEntity.cites, cited_res))
            g_remove_be.add((ref_res, GraphEntity.references, cited_res))

            g_add_br.add((citing_res, GraphEntity.cites, new_cited))
            g_add_be.add((ref_res, GraphEntity.references, new_cited))

    storer.update(g_add_be, g_remove_be, base_dir, base_iri, context_path,
                  temp_dir_for_rdf_loading)
    storer.update(g_add_br, g_remove_br, base_dir, base_iri, context_path,
                  temp_dir_for_rdf_loading)
    storer.update_all([g_add_br, g_add_be], [g_remove_br, g_remove_be],
                      triplestore_url, base_dir)

    return timestamp, accept, citing, cited, quote(ref_res_text)
Exemple #3
0
    def setUp(self):
        """Prepare a one-triple "br/" graph (``self.g``) and a Storer (``self.s``)."""
        graph_iri = base_iri + "br/"
        sample_graph = Graph(identifier=graph_iri)
        subject = URIRef(base_iri + "br/022201")
        maker = URIRef(base_iri + "ra/011101")
        sample_graph.add((subject, FOAF.maker, maker))
        self.g = sample_graph

        # No graph set is given to the storer: the test passes graphs explicitly.
        storer_options = {
            "context_map": {context_path: context_file_path},
            "dir_split": dir_split_number,
            "n_file_item": items_per_file,
            "default_dir": default_dir,
        }
        self.s = Storer(None, **storer_options)
Exemple #4
0
 def __init__(self, tp_url_real, context_path, context_file_path,
              base_iri, base_dir, info_dir, dataset_home, tmp_dir, triplestore_url=None):
     """Store the dataset settings and prepare reporters and a Storer.

     ``tp_url_real`` and ``dataset_home`` are wrapped in ``URIRef``;
     ``triplestore_url`` is kept as ``self.tp_url``. The Storer is given a
     preface query that deletes the existing ``DatasetHandler.modified``
     value of every resource typed ``DatasetHandler.dataset``.
     """
     # Plain settings, kept as received.
     self.tp_url = triplestore_url
     self.base_iri, self.base_dir = base_iri, base_dir
     self.info_dir = info_dir
     self.context_path = context_path
     self.dataset_home = URIRef(dataset_home)
     self.tmp_dir = tmp_dir
     self.tp_res = URIRef(tp_url_real)

     # Reporters for informational and error messages, shared with the storer.
     self.repok = Reporter(prefix="[DatasetHandler: INFO] ")
     self.reperr = Reporter(prefix="[DatasetHandler: ERROR] ")
     self.st = Storer(context_map={context_path: context_file_path},
                      repok=self.repok, reperr=self.reperr)

     modified_iri = str(DatasetHandler.modified)
     dataset_iri = str(DatasetHandler.dataset)
     preface = (
         u"DELETE { ?res <%s> ?date } WHERE { ?res a <%s> ; <%s> ?date }" %
         (modified_iri, dataset_iri, modified_iri))
     self.st.set_preface_query(preface)
Exemple #5
0
def gen_prov_and_store_data(cp, rf, timestamp):
    """Generate provenance for ``cp``'s graph set, then upload and store everything.

    :param cp: object exposing ``graph_set()``.
    :param rf: resource finder handed to ``ProvSet``.
    :param timestamp: value accepted by ``float()``; truncated to an integer
        and passed to ``generate_provenance``.
    """
    def build_storer(graph_set, **extra):
        # Each storer receives its own, newly created context map.
        return Storer(graph_set,
                      context_map={context_path: context_file_path},
                      dir_split=dir_split_number,
                      n_file_item=items_per_file,
                      **extra)

    prov_set = ProvSet(cp.graph_set(), base_iri, context_path, default_dir,
                       full_info_dir, rf, dir_split_number, items_per_file, "")
    generation_time = int(float(timestamp))
    prov_set.generate_provenance(generation_time)

    # Store all the data
    data_storer = build_storer(cp.graph_set(), default_dir=default_dir)
    # The provenance storer is created without an explicit default_dir.
    provenance_storer = build_storer(prov_set)

    upload_args = (base_dir, triplestore_url, base_iri, context_path,
                   temp_dir_for_rdf_loading)
    data_storer.upload_and_store(*upload_args)
    provenance_storer.upload_and_store(*upload_args)
Exemple #6
0
def store_all(gs):
    """Generate provenance for ``gs``, store data and provenance, refresh dataset info.

    :param gs: graph set passed to ``ProvSet``, ``Storer`` and
        ``DatasetHandler.update_dataset_info``.
    """
    def build_storer(graph_set):
        # Each storer receives its own, newly created context map.
        return Storer(graph_set,
                      context_map={context_path: context_file_path},
                      dir_split=dir_split_number,
                      n_file_item=items_per_file,
                      default_dir=default_dir)

    finder = ResourceFinder(base_dir=base_dir,
                            base_iri=base_iri,
                            tmp_dir=temp_dir_for_rdf_loading,
                            context_map={context_path: context_file_path},
                            dir_split=dir_split_number,
                            n_file_item=items_per_file,
                            default_dir=default_dir)
    # Prefix set to "" so as to avoid it for prov data
    prov_set = ProvSet(gs, base_iri, context_path, default_dir,
                       full_info_dir, finder, dir_split_number,
                       items_per_file, "")
    prov_set.generate_provenance()

    # entity_count is not defined in this function; it is read from the
    # enclosing scope.
    print("Store the data for %s entities." % str(entity_count))
    data_storer = build_storer(gs)
    provenance_storer = build_storer(prov_set)

    store_args = (base_dir, base_iri, context_path, temp_dir_for_rdf_loading)
    data_storer.store_all(*store_args)
    provenance_storer.store_all(*store_args)

    print("Update the dataset description.")
    handler = DatasetHandler(triplestore_url_real, context_path,
                             context_file_path, base_iri, base_dir,
                             full_info_dir, dataset_home,
                             temp_dir_for_rdf_loading)
    handler.update_dataset_info(gs)
Exemple #7
0
def update_all(g_set, remove_entity, full_info_dir):
    """Generate non-insert provenance for ``g_set`` and store it with removal enabled.

    :param g_set: graph set to process.
    :param remove_entity: forwarded to ``generate_provenance``.
    :param full_info_dir: info directory handed to ``ProvSet``; the dataset
        handler below is given an empty string instead.
    """
    def build_storer(graph_set):
        # Each storer receives its own, newly created context map.
        return Storer(graph_set,
                      context_map={context_path: context_file_path},
                      dir_split=dir_split_number,
                      n_file_item=items_per_file,
                      default_dir=default_dir)

    finder = ResourceFinder(base_dir=base_dir,
                            base_iri=base_iri,
                            tmp_dir=temp_dir_for_rdf_loading,
                            context_map={context_path: context_file_path},
                            dir_split=dir_split_number,
                            n_file_item=items_per_file,
                            default_dir=default_dir)
    prov_set = ProvSet(g_set, base_iri, context_path, default_dir,
                       full_info_dir, finder, dir_split_number,
                       items_per_file, "")
    prov_set.generate_provenance(do_insert=False, remove_entity=remove_entity)

    data_storer = build_storer(g_set)
    provenance_storer = build_storer(prov_set)

    store_args = (base_dir, base_iri, context_path, temp_dir_for_rdf_loading)
    # Only the data storer is asked to remove data.
    data_storer.store_all(*store_args, remove_data=True)
    provenance_storer.store_all(*store_args)

    handler = DatasetHandler(triplestore_url_real, context_path,
                             context_file_path, base_iri, base_dir, "",
                             dataset_home, temp_dir_for_rdf_loading)
    handler.update_dataset_info(g_set)
Exemple #8
0
class StorerTest(unittest.TestCase):
    """Exercise ``Storer.store`` on a graph holding a single triple."""

    def setUp(self):
        """Prepare a one-triple "br/" graph (``self.g``) and a Storer (``self.s``)."""
        graph_iri = base_iri + "br/"
        sample_graph = Graph(identifier=graph_iri)
        subject = URIRef(base_iri + "br/022201")
        maker = URIRef(base_iri + "ra/011101")
        sample_graph.add((subject, FOAF.maker, maker))
        self.g = sample_graph

        # No graph set is given to the storer: the test passes graphs explicitly.
        storer_options = {
            "context_map": {context_path: context_file_path},
            "dir_split": dir_split_number,
            "n_file_item": items_per_file,
            "default_dir": default_dir,
        }
        self.s = Storer(None, **storer_options)

    def test_store(self):
        """Call ``store`` with ``store_now=False`` and print the first entry returned."""
        stored = self.s.store(self.g,
                              base_dir,
                              base_iri,
                              context_path,
                              temp_dir_for_rdf_loading,
                              store_now=False)
        first_key = list(stored.keys())[0]
        first_graph = list(stored.values())[0]
        print(first_key, first_graph.serialize(format="nquads"))
Exemple #9
0
class ResourceFinder(object):
    """Find resources by running SPARQL queries on a triplestore and a local graph.

    Every lookup builds a SELECT query and hands it to ``__query``, which asks
    the optional remote store (``self.ts``) first and then the in-memory graph
    ``self.g``. The local graph is filled from graph sets and from provenance
    files read from disk.

    NOTE: identifier strings and IRIs are interpolated into the query text
    as-is, without escaping.
    """

    def __init__(self,
                 g_set=None,
                 ts_url=None,
                 base_dir=None,
                 base_iri=None,
                 default_dir="_",
                 tmp_dir=None,
                 context_map=None,
                 dir_split=0,
                 n_file_item=1):
        """Initialise the local graph, the storer and the optional triplestore link.

        :param g_set: when not None, its graphs are copied into ``self.g``.
        :param ts_url: when not None, a ``ConjunctiveGraph`` backed by a
            ``SPARQLUpdateStore`` is opened with this URL used for both
            endpoints; otherwise ``self.ts`` is None.
        :param base_dir: base directory used to locate provenance files.
        :param base_iri: base IRI used to locate provenance files.
        :param default_dir: passed to ``find_paths``.
        :param tmp_dir: passed to ``Storer.load`` as ``tmp_dir``.
        :param context_map: mapping forwarded to ``Storer``; an empty dict is
            created when omitted.
        :param dir_split: passed to ``find_paths``.
        :param n_file_item: passed to ``find_paths``.
        """
        # Build a fresh dict per call: a literal `{}` default is created once
        # and would be shared by every instance built without a context map.
        if context_map is None:
            context_map = {}

        self.g = Graph()
        self.base_dir = base_dir
        self.base_iri = base_iri
        self.storer = Storer(context_map=context_map)
        self.tmp_dir = tmp_dir
        self.dir_split = dir_split
        self.n_file_item = n_file_item
        self.name = "SPACIN " + self.__class__.__name__
        # Provenance file paths already loaded into self.g.
        self.loaded = set()
        self.default_dir = default_dir
        if g_set is not None:
            self.update_graph_set(g_set)
        if ts_url is None:
            self.ts = None
        else:
            self.ts = ConjunctiveGraph('SPARQLUpdateStore')
            self.ts.open((ts_url, ts_url))

    def add_prov_triples_in_filesystem(self, res_iri, prov_entity_type=None):
        """Load into ``self.g`` the provenance files stored next to ``res_iri``.

        Nothing happens unless both ``base_dir`` and ``base_iri`` are set.
        Files under the resource's "prov" directory are loaded once each
        (tracked in ``self.loaded``) when their name ends with ".json" and,
        if ``prov_entity_type`` is given, starts with it.
        """
        if self.base_dir is None or self.base_iri is None:
            return

        cur_file_path = find_paths(res_iri, self.base_dir, self.base_iri,
                                   self.default_dir, self.dir_split,
                                   self.n_file_item)[1]
        if cur_file_path.endswith("index.json"):
            cur_path = cur_file_path.replace("index.json", "") + "prov"
        else:
            # Drop the last five characters of the path (presumably the
            # ".json" extension) to obtain the sibling directory.
            cur_path = cur_file_path[:-5] + os.sep + "prov"

        file_list = []
        if os.path.isdir(cur_path):
            for cur_dir, _cur_subdir, cur_files in os.walk(cur_path):
                for cur_file in cur_files:
                    if cur_file.endswith(".json") and \
                       (prov_entity_type is None or
                            cur_file.startswith(prov_entity_type)):
                        file_list.append(cur_dir + os.sep + cur_file)

        for file_path in file_list:
            if file_path not in self.loaded:
                self.loaded.add(file_path)
                cur_g = self.storer.load(file_path, tmp_dir=self.tmp_dir)
                self.add_triples_in_graph(cur_g)

    def add_triples_in_graph(self, g):
        """Copy every triple of ``g`` into ``self.g``; ``None`` is ignored."""
        if g is not None:
            for s, p, o in g.triples((None, None, None)):
                self.g.add((s, p, o))

    def update_graph_set(self, g_set):
        """Copy the triples of every graph in ``g_set`` into ``self.g``."""
        for g in g_set.graphs():
            self.add_triples_in_graph(g)

    def retrieve(self, id_dict):
        """Return the first resource matching any identifier in ``id_dict``.

        :param id_dict: mapping from identifier scheme to an iterable of
            identifier strings.
        :return: the first non-None lookup result, or None.
        """
        for id_type in id_dict:
            for id_string in id_dict[id_type]:
                res = self.__id_with_type(id_string, id_type)
                if res is not None:
                    return res

    def retrieve_provenance_agent_from_name(self, string):
        """Return a provenance agent whose name is ``string``, or None."""
        query = """
            SELECT DISTINCT ?pa WHERE {
              ?pa a <%s> ;
                <%s> "%s"
            } LIMIT 1
        """ % (ProvEntity.prov_agent, GraphEntity.name, string)
        return self.__query(query)

    def retrieve_reference(self, citing_res, cited_res):
        """Return the reference contained in ``citing_res`` that points to ``cited_res``."""
        query = """
            SELECT DISTINCT ?res WHERE {
                <%s> <%s> ?res .
                ?res <%s> <%s> 
            }""" % (citing_res, GraphEntity.contains_reference,
                    GraphEntity.references, cited_res)

        return self.__query(query)

    def retrieve_reference_text(self, ref_res):
        """Return the ``has_content`` value of the reference ``ref_res``, or None."""
        query = """
            SELECT DISTINCT ?res WHERE {
                <%s> <%s> ?res 
            }""" % (ref_res, GraphEntity.has_content)

        return self.__query(query)

    def retrieve_from_orcid(self, string):
        """Return the resource identified by the ORCID ``string``, or None."""
        return self.__id_with_type(string, GraphEntity.orcid)

    def retrieve_modification_date(self, res_iri):
        """Return the generation time of a non-invalidated snapshot of ``res_iri``.

        The snapshot is reached through the inverse of
        ``ProvEntity.specialization_of`` and must have no
        ``ProvEntity.invalidated_at_time`` value.
        """
        # All three patterns must share the same ?snapshot variable so the
        # generation time belongs to a snapshot of res_iri.
        query = """
                SELECT DISTINCT ?res WHERE {
                    <%s> ^<%s> ?snapshot .
                    FILTER NOT EXISTS { ?snapshot <%s> ?inv_date }
                    ?snapshot <%s> ?res
                }""" % (res_iri, ProvEntity.specialization_of,
                        ProvEntity.invalidated_at_time,
                        ProvEntity.generated_at_time)

        return self.__query(query)

    def retrieve_entity(self, string, type):
        """Return the IRI ``string`` if it is typed as ``type``, else None."""
        query = """
                SELECT DISTINCT ?res WHERE {
                    BIND(iri("%s") as ?res) .
                    ?res a <%s> 
                }""" % (string, str(type))

        return self.__query(query)

    def retrieve_citing_from_doi(self, string):
        """Return a resource with the lower-cased DOI ``string`` that cites something."""
        return self.__id_with_type(string.lower(), GraphEntity.doi,
                                   "?res <%s> ?cited" % GraphEntity.cites)

    def retrieve_citing_from_pmid(self, string):
        """Return a resource with the PMID ``string`` that cites something."""
        return self.__id_with_type(string, GraphEntity.pmid,
                                   "?res <%s> ?cited" % GraphEntity.cites)

    def retrieve_citing_from_pmcid(self, string):
        """Return a resource with the PMCID ``string`` that cites something."""
        return self.__id_with_type(string, GraphEntity.pmcid,
                                   "?res <%s> ?cited" % GraphEntity.cites)

    def retrieve_citing_from_url(self, string):
        """Return a resource with the lower-cased URL ``string`` that cites something."""
        return self.__id_with_type(string.lower(), GraphEntity.url,
                                   "?res <%s> ?cited" % GraphEntity.cites)

    def retrieve_from_doi(self, string):
        """Return the resource identified by the lower-cased DOI ``string``."""
        return self.__id_with_type(string.lower(), GraphEntity.doi)

    def retrieve_from_pmid(self, string):
        """Return the resource identified by the PMID ``string``."""
        return self.__id_with_type(string, GraphEntity.pmid)

    def retrieve_from_pmcid(self, string):
        """Return the resource identified by the PMCID ``string``."""
        return self.__id_with_type(string, GraphEntity.pmcid)

    def retrieve_from_url(self, string):
        """Return the resource identified by the lower-cased URL ``string``."""
        return self.__id_with_type(string.lower(), GraphEntity.url)

    def retrieve_from_issn(self, string):
        """Return the resource identified by the ISSN ``string``."""
        return self.__id_with_type(string, GraphEntity.issn)

    def retrieve_from_isbn(self, string):
        """Return the resource identified by the ISBN ``string``."""
        return self.__id_with_type(string, GraphEntity.isbn)

    def retrieve_issue_from_journal(self, id_dict, issue_id, volume_id):
        """Return the issue ``issue_id`` of the journal identified by ``id_dict``.

        Without a ``volume_id`` the issue is searched anywhere below the
        journal; otherwise the volume is resolved first and the issue must be
        part of it. Returns None when the volume or the issue is not found.
        """
        if volume_id is None:
            return self.__retrieve_from_journal(id_dict,
                                                GraphEntity.journal_issue,
                                                issue_id)

        retrieved_volume = self.retrieve_volume_from_journal(
            id_dict, volume_id)
        if retrieved_volume is None:
            return None

        query = """
                    SELECT DISTINCT ?br WHERE {
                        ?br a <%s> ;
                            <%s> <%s> ;
                            <%s> "%s"
                    } LIMIT 1
                """ % (GraphEntity.journal_issue, GraphEntity.part_of,
                       str(retrieved_volume),
                       GraphEntity.has_sequence_identifier, issue_id)
        return self.__query(query)

    def retrieve_volume_from_journal(self, id_dict, volume_id):
        """Return the volume ``volume_id`` of the journal identified by ``id_dict``."""
        return self.__retrieve_from_journal(id_dict,
                                            GraphEntity.journal_volume,
                                            volume_id)

    def retrieve_url_string(self, res):
        """Return the literal value of a URL identifier of ``res``, or None."""
        return self.__retrieve_res_id_string(res, GraphEntity.url)

    def retrieve_doi_string(self, res):
        """Return the literal value of a DOI identifier of ``res``, or None."""
        return self.__retrieve_res_id_string(res, GraphEntity.doi)

    def retrieve_pmid_string(self, res):
        """Return the literal value of a PMID identifier of ``res``, or None."""
        return self.__retrieve_res_id_string(res, GraphEntity.pmid)

    def retrieve_pmcid_string(self, res):
        """Return the literal value of a PMCID identifier of ``res``, or None."""
        return self.__retrieve_res_id_string(res, GraphEntity.pmcid)

    def retrieve_br_url(self, res, string):
        """Return the URL identifier resource of ``res`` with value ``string`` (lower-cased)."""
        return self.__retrieve_res_id_by_type(res, string.lower(),
                                              GraphEntity.url)

    def retrieve_br_doi(self, res, string):
        """Return the DOI identifier resource of ``res`` with value ``string`` (lower-cased)."""
        return self.__retrieve_res_id_by_type(res, string.lower(),
                                              GraphEntity.doi)

    def retrieve_br_pmid(self, res, string):
        """Return the PMID identifier resource of ``res`` with value ``string``."""
        return self.__retrieve_res_id_by_type(res, string, GraphEntity.pmid)

    def retrieve_br_pmcid(self, res, string):
        """Return the PMCID identifier resource of ``res`` with value ``string``."""
        return self.__retrieve_res_id_by_type(res, string, GraphEntity.pmcid)

    def retrieve_last_snapshot(self, prov_subj):
        """Return a snapshot of ``prov_subj`` that has no ``was_invalidated_by`` value."""
        query = """
            SELECT DISTINCT ?se WHERE {
                ?se <%s> <%s> .
                FILTER NOT EXISTS {?se <%s> ?ca }
            } LIMIT 1
        """ % (ProvEntity.specialization_of, str(prov_subj),
               ProvEntity.was_invalidated_by)
        return self.__query(query)

    def __retrieve_res_id_string(self, res, id_type):
        """Return the literal value of an identifier of ``res`` using scheme ``id_type``."""
        query = """
        SELECT DISTINCT ?id WHERE {
            <%s> <%s> [
                <%s> <%s> ;
                <%s> ?id
            ]
        }""" % (res, GraphEntity.has_identifier,
                GraphEntity.uses_identifier_scheme, id_type,
                GraphEntity.has_literal_value)
        return self.__query(query)

    def __retrieve_res_id_by_type(self, res, id_string, id_type):
        """Return the identifier resource of ``res`` with the given scheme and value.

        Returns None without querying when ``id_string`` is None.
        """
        if id_string is None:
            return None

        query = """
            SELECT DISTINCT ?id WHERE {
                <%s> <%s> ?id .
                ?id <%s> <%s> ;
                    <%s> "%s"
            }""" % (res, GraphEntity.has_identifier,
                    GraphEntity.uses_identifier_scheme, id_type,
                    GraphEntity.has_literal_value, id_string)
        return self.__query(query)

    def __retrieve_from_journal(self, id_dict, part_type, part_seq_id):
        """Return the journal part of type ``part_type`` numbered ``part_seq_id``.

        The journal is identified by any of the identifiers in ``id_dict``;
        each one is tried in turn until a query yields a result.
        """
        for id_type in id_dict:
            for id_string in id_dict[id_type]:
                query = """
                SELECT DISTINCT ?res WHERE {
                    ?j <%s> ?id .
                    ?id
                        <%s> <%s> ;
                        <%s> "%s" .
                    ?res a <%s> ;
                        <%s>+ ?j ;
                        <%s> "%s"
                }""" % (GraphEntity.has_identifier,
                        GraphEntity.uses_identifier_scheme, id_type,
                        GraphEntity.has_literal_value, id_string, part_type,
                        GraphEntity.part_of,
                        GraphEntity.has_sequence_identifier, part_seq_id)

                # Keep trying the remaining identifiers when this one yields
                # nothing, as retrieve() does, instead of giving up at once.
                res = self.__query(query)
                if res is not None:
                    return res
        return None

    def __id_with_type(self, id_string, id_type, extras=""):
        """Return a resource having an identifier ``id_string`` of scheme ``id_type``.

        :param extras: additional graph pattern text appended inside the
            WHERE clause.
        """
        query = """
        SELECT DISTINCT ?res WHERE {
            ?res <%s> ?id .
            ?id
                <%s> <%s> ;
                <%s> "%s" .
                %s
        }""" % (GraphEntity.has_identifier, GraphEntity.uses_identifier_scheme,
                id_type, GraphEntity.has_literal_value, id_string, extras)

        return self.__query(query)

    def __query(self, query):
        """Return the first binding of the first result row, or None.

        The triplestore is queried first when configured; the local graph is
        used when it gives no row.
        """
        if self.ts is not None:
            result = self.ts.query(query)
            for res, in result:
                return res

        # If nothing has been returned, check if there is something
        # in the current graph set
        result = self.g.query(query)
        for res, in result:
            return res
        return None
Exemple #10
0
    def __init__(self,
                 g_set=None,
                 ts_url=None,
                 base_dir=None,
                 base_iri=None,
                 default_dir="_",
                 tmp_dir=None,
                 context_map=None,
                 dir_split=0,
                 n_file_item=1):
        """Initialise graph, storer, optional triplestore link and lookup caches.

        :param g_set: when not None, handed to ``update_graph_set``.
        :param ts_url: when not None, a ``ConjunctiveGraph`` backed by a
            ``SPARQLUpdateStore`` is opened with this URL used for both
            endpoints and its namespace bindings are emptied; otherwise
            ``self.ts`` is None.
        :param base_dir: stored as ``self.base_dir``.
        :param base_iri: stored as ``self.base_iri``.
        :param default_dir: stored as ``self.default_dir``.
        :param tmp_dir: stored as ``self.tmp_dir``.
        :param context_map: mapping forwarded to ``Storer``; an empty dict is
            created when omitted.
        :param dir_split: stored as ``self.dir_split``.
        :param n_file_item: stored as ``self.n_file_item``.
        """
        # Build a fresh dict per call: a literal `{}` default is created once
        # and would be shared by every instance built without a context map.
        if context_map is None:
            context_map = {}

        self.g = Graph()
        self.base_dir = base_dir
        self.base_iri = base_iri
        self.storer = Storer(context_map=context_map)
        self.tmp_dir = tmp_dir
        self.dir_split = dir_split
        self.n_file_item = n_file_item
        self.name = "SPACIN " + self.__class__.__name__
        self.loaded = set()
        self.default_dir = default_dir
        self.index_for_graph_set = 0

        if g_set is not None:
            self.update_graph_set(g_set)
        if ts_url is None:
            self.ts = None
        else:
            self.ts = ConjunctiveGraph('SPARQLUpdateStore')
            self.ts.open((ts_url, ts_url))
            self.ts.namespace_manager.store.nsBindings = {}

        # Used to search by identifier (e.g. a DOI) and get the resource
        self.doi_store = {}
        self.orcid_store = {}
        self.pmid_store = {}
        self.pmcid_store = {}
        self.url_store = {}
        self.issn_store = {}
        self.isbn_store = {}
        self.crossref_store = {}

        # Used in __retrieve_res_id_string() when you query for the {res}
        # and want to get the literal values of its ids
        self.doi_store_type = {}
        self.orcid_store_type = {}
        self.pmid_store_type = {}
        self.pmcid_store_type = {}
        self.url_store_type = {}
        self.issn_store_type = {}
        self.isbn_store_type = {}
        self.crossref_store_type = {}

        # Used in __retrieve_res_id_by_type() when you query for the
        # {res}_{id_literal} and want to get the id's URI,
        #
        # e.g. calling
        #               cur_id = self.rf.retrieve_br_url(cur_res.res, extracted_url)
        # in crossrefproc.py
        self.doi_store_type_id = {}
        self.orcid_store_type_id = {}
        self.pmid_store_type_id = {}
        self.pmcid_store_type_id = {}
        self.url_store_type_id = {}
        self.issn_store_type_id = {}
        self.isbn_store_type_id = {}
        self.crossref_store_type_id = {}

        # Used in __retrieve_from_journal() where you query for
        # {id_type}_{id_string}_{part_seq_id} and get the res
        # e.g. http://purl.org/spar/datacite/issn_1388-0209_58
        # ISSN_1388-0209_volume_58
        self.from_journal_volume = {}
        self.from_issue_partof_journal = {}

        # Caching blazegraph queries
        self.cache = {}
        self.cache_local = {}
Exemple #11
0
class ResourceFinder(object):
    def __init__(self,
                 g_set=None,
                 ts_url=None,
                 base_dir=None,
                 base_iri=None,
                 default_dir="_",
                 tmp_dir=None,
                 context_map=None,
                 dir_split=0,
                 n_file_item=1):
        """Initialise graph, storer, optional triplestore link and lookup caches.

        :param g_set: when not None, handed to ``update_graph_set``.
        :param ts_url: when not None, a ``ConjunctiveGraph`` backed by a
            ``SPARQLUpdateStore`` is opened with this URL used for both
            endpoints and its namespace bindings are emptied; otherwise
            ``self.ts`` is None.
        :param base_dir: base directory used to locate provenance files.
        :param base_iri: base IRI used to locate provenance files.
        :param default_dir: passed to ``find_paths``.
        :param tmp_dir: passed to ``Storer.load`` as ``tmp_dir``.
        :param context_map: mapping forwarded to ``Storer``; an empty dict is
            created when omitted.
        :param dir_split: passed to ``find_paths``.
        :param n_file_item: passed to ``find_paths``.
        """
        # Build a fresh dict per call: a literal `{}` default is created once
        # and would be shared by every instance built without a context map.
        if context_map is None:
            context_map = {}

        self.g = Graph()
        self.base_dir = base_dir
        self.base_iri = base_iri
        self.storer = Storer(context_map=context_map)
        self.tmp_dir = tmp_dir
        self.dir_split = dir_split
        self.n_file_item = n_file_item
        self.name = "SPACIN " + self.__class__.__name__
        # Provenance file paths already handed to the storer for loading.
        self.loaded = set()
        self.default_dir = default_dir
        self.index_for_graph_set = 0

        if g_set is not None:
            self.update_graph_set(g_set)
        if ts_url is None:
            self.ts = None
        else:
            self.ts = ConjunctiveGraph('SPARQLUpdateStore')
            self.ts.open((ts_url, ts_url))
            self.ts.namespace_manager.store.nsBindings = {}

        # Used to search by identifier (e.g. a DOI) and get the resource
        self.doi_store = {}
        self.orcid_store = {}
        self.pmid_store = {}
        self.pmcid_store = {}
        self.url_store = {}
        self.issn_store = {}
        self.isbn_store = {}
        self.crossref_store = {}

        # Used in __retrieve_res_id_string() when you query for the {res}
        # and want to get the literal values of its ids
        self.doi_store_type = {}
        self.orcid_store_type = {}
        self.pmid_store_type = {}
        self.pmcid_store_type = {}
        self.url_store_type = {}
        self.issn_store_type = {}
        self.isbn_store_type = {}
        self.crossref_store_type = {}

        # Used in __retrieve_res_id_by_type() when you query for the
        # {res}_{id_literal} and want to get the id's URI,
        #
        # e.g. calling
        #               cur_id = self.rf.retrieve_br_url(cur_res.res, extracted_url)
        # in crossrefproc.py
        self.doi_store_type_id = {}
        self.orcid_store_type_id = {}
        self.pmid_store_type_id = {}
        self.pmcid_store_type_id = {}
        self.url_store_type_id = {}
        self.issn_store_type_id = {}
        self.isbn_store_type_id = {}
        self.crossref_store_type_id = {}

        # Used in __retrieve_from_journal() where you query for
        # {id_type}_{id_string}_{part_seq_id} and get the res
        # e.g. http://purl.org/spar/datacite/issn_1388-0209_58
        # ISSN_1388-0209_volume_58
        self.from_journal_volume = {}
        self.from_issue_partof_journal = {}

        # Caching blazegraph queries
        self.cache = {}
        self.cache_local = {}

    def add_prov_triples_in_filesystem(self, res_iri, prov_entity_type=None):
        """Have the storer load the provenance files stored next to ``res_iri``.

        Nothing happens unless both ``base_dir`` and ``base_iri`` are set.
        Files under the resource's "prov" directory are loaded once each
        (tracked in ``self.loaded``) when their name ends with ".json" or
        ".ttl" and, if ``prov_entity_type`` is given, starts with it. The
        graph returned by the storer is not used here.
        """
        if self.base_dir is None or self.base_iri is None:
            return

        data_file = find_paths(res_iri, self.base_dir, self.base_iri,
                               self.default_dir, self.dir_split,
                               self.n_file_item)[1]
        if data_file.endswith("index.json"):
            prov_root = data_file.replace("index.json", "") + "prov"
        else:
            # Drop the last five characters of the path (presumably the
            # ".json" extension) to obtain the sibling directory.
            prov_root = data_file[:-5] + os.sep + "prov"

        candidates = []
        if os.path.isdir(prov_root):
            for folder, _subfolders, names in os.walk(prov_root):
                candidates.extend(
                    folder + os.sep + name
                    for name in names
                    if name.endswith((".json", ".ttl")) and
                    (prov_entity_type is None or
                     name.startswith(prov_entity_type)))

        for path in candidates:
            if path in self.loaded:
                continue
            self.loaded.add(path)
            self.storer.load(path, tmp_dir=self.tmp_dir)

    def add_triples_in_graph(self, g):
        """Deprecated: do nothing and return None.

        ``g`` is ignored. The former behaviour, copying every triple of a
        non-None ``g`` into ``self.g``, sat after the unconditional return
        and was never executed.
        """
        return None

    def update_graph_set(self, g_set):
        """Deprecated: do nothing and return None.

        ``g_set`` is ignored. The former behaviour, feeding the graphs of
        ``g_set`` from position ``self.index_for_graph_set`` onwards to
        ``add_triples_in_graph`` while advancing that index, sat after the
        unconditional return and was never executed.
        """
        return None

    def retrieve(self, id_dict, typ='both'):
        """Return the first resource matching any identifier in ``id_dict``.

        :param id_dict: mapping from identifier scheme to an iterable of
            identifier strings.
        :param typ: forwarded unchanged to ``__id_with_type``.
        :return: the first non-None lookup result, or None.
        """
        # Lazy generator: lookups run one at a time and stop at the first hit.
        lookups = (self.__id_with_type(value, scheme, typ=typ)
                   for scheme in id_dict
                   for value in id_dict[scheme])
        return next((found for found in lookups if found is not None), None)

    def retrieve_from_orcid(self, string, typ='both'):
        """Look up the resource identified by the ORCID ``string``.

        ``typ`` is forwarded unchanged to ``__id_with_type``.
        """
        scheme = GraphEntity.orcid
        return self.__id_with_type(string, scheme, typ=typ)

    def retrieve_entity(self, string, typ='both'):
        """Query for the IRI ``string`` constrained by a type pattern.

        NOTE(review): this method has no parameter or local named ``type``,
        so ``str(type)`` below formats whatever ``type`` resolves to outside
        the method (the builtin, unless the module rebinds the name) into the
        ``?res a <...>`` pattern. The entity class to match was presumably
        meant to be supplied by the caller; confirm against the call sites
        before changing it.

        :param string: IRI text bound to ``?res`` through ``BIND(iri(...))``.
        :param typ: forwarded to ``__query`` as ``typ``.
        :return: whatever ``__query`` returns for the built query.
        """
        query = """
                SELECT DISTINCT ?res WHERE {{
                    BIND(iri("{}") as ?res) .
                    ?res a <{}>
                }}""".format(string, str(type))
        return self.__query(query, typ=typ)

    def retrieve_citing_from_doi(self, string, typ='only_blazegraph'):
        """Look up a resource with the lower-cased DOI ``string`` that cites something.

        ``typ`` is forwarded unchanged to ``__id_with_type``.
        """
        citing_pattern = "?res <%s> ?cited" % GraphEntity.cites
        doi = string.lower()
        return self.__id_with_type(doi, GraphEntity.doi, citing_pattern, typ)

    def retrieve_citing_from_pmid(self, string, typ='only_blazegraph'):
        """Look up a resource with the PMID ``string`` that cites something.

        ``typ`` is forwarded unchanged to ``__id_with_type``.
        """
        citing_pattern = "?res <%s> ?cited" % GraphEntity.cites
        return self.__id_with_type(string, GraphEntity.pmid, citing_pattern,
                                   typ)

    def retrieve_citing_from_pmcid(self, string, typ='only_blazegraph'):
        """Look up a resource with the PMCID ``string`` that cites something.

        ``typ`` is forwarded unchanged to ``__id_with_type``.
        """
        citing_pattern = "?res <%s> ?cited" % GraphEntity.cites
        return self.__id_with_type(string, GraphEntity.pmcid, citing_pattern,
                                   typ)

    def retrieve_citing_from_url(self, string, typ='only_blazegraph'):
        """Look up a resource with the lower-cased URL ``string`` that cites something.

        ``typ`` is forwarded unchanged to ``__id_with_type``.
        """
        citing_pattern = "?res <%s> ?cited" % GraphEntity.cites
        url = string.lower()
        return self.__id_with_type(url, GraphEntity.url, citing_pattern, typ)

    def retrieve_from_doi(self, string, typ='both'):
        """Look up the resource identified by the lower-cased DOI ``string``.

        ``typ`` is forwarded unchanged to ``__id_with_type``.
        """
        doi = string.lower()
        return self.__id_with_type(doi, GraphEntity.doi, typ=typ)

    def retrieve_from_pmid(self, string, typ='both'):
        """Look up the resource identified by the PMID ``string``.

        ``typ`` is forwarded unchanged to ``__id_with_type``.
        """
        scheme = GraphEntity.pmid
        return self.__id_with_type(string, scheme, typ=typ)

    def retrieve_from_pmcid(self, string, typ='both'):
        """Look up the resource identified by the PMCID ``string``.

        ``typ`` is forwarded unchanged to ``__id_with_type``.
        """
        scheme = GraphEntity.pmcid
        return self.__id_with_type(string, scheme, typ=typ)

    def retrieve_from_url(self, string, typ='both'):
        """Look up the resource identified by the lower-cased URL ``string``.

        ``typ`` is forwarded unchanged to ``__id_with_type``.
        """
        url = string.lower()
        return self.__id_with_type(url, GraphEntity.url, typ=typ)

    def retrieve_from_crossref(self, string, typ='both'):
        """Look up the resource identified by the Crossref identifier ``string``.

        ``typ`` is forwarded unchanged to ``__id_with_type``.
        """
        scheme = GraphEntity.crossref
        return self.__id_with_type(string, scheme, typ=typ)

    def retrieve_from_issn(self, string, typ='both'):
        """Look up the resource identified by the ISSN ``string``.

        ``typ`` is forwarded unchanged to ``__id_with_type``.
        """
        scheme = GraphEntity.issn
        return self.__id_with_type(string, scheme, typ=typ)

    def retrieve_from_isbn(self, string, typ='both'):
        """Look up the resource identified by the ISBN ``string``.

        ``typ`` is forwarded unchanged to ``__id_with_type``.
        """
        scheme = GraphEntity.isbn
        return self.__id_with_type(string, scheme, typ=typ)

    def retrieve_issue_from_journal(self, id_dict, issue_id, volume_id):
        """Return the issue ``issue_id`` of the journal identified by ``id_dict``.

        The journal is resolved with ``retrieve``; None is returned when it is
        not found. An entry of ``self.from_issue_partof_journal`` keyed by
        ``(journal, volume_id, issue_id)`` is returned when present. Otherwise
        a query is run: with no ``volume_id`` the issue must be part of the
        journal directly, else it must be part of a volume numbered
        ``volume_id`` that is part of the journal.
        """
        journal = self.retrieve(id_dict, 'both')
        if journal is None:
            return None

        known_issue = self.from_issue_partof_journal.get(
            (journal, volume_id, issue_id))
        if known_issue is not None:
            return known_issue

        # The query text is kept exactly as before, whitespace included.
        if volume_id is None:
            query = """
                            SELECT DISTINCT ?br WHERE {{
                                ?br a <{}> ;
                                    <{}> <{}> ;
                                    <{}> "{}"
                            }} LIMIT 1
                        """.format(GraphEntity.journal_issue,
                                   GraphEntity.part_of, journal,
                                   GraphEntity.has_sequence_identifier,
                                   issue_id)
        else:
            query = """
                            SELECT DISTINCT ?br WHERE {{
                                ?br a <{}> ;
                                    <{}> [
                                        a <{}> ;
                                        <{}> <{}> ;
                                        <{}> "{}" 
                                    ] ;
                                    <{}> "{}" . 
                            }} LIMIT 1
                        """.format(
                GraphEntity.journal_issue, GraphEntity.part_of,
                GraphEntity.journal_volume, GraphEntity.part_of,
                journal, GraphEntity.has_sequence_identifier,
                volume_id, GraphEntity.has_sequence_identifier,
                issue_id)
        return self.__query(query)

    def retrieve_volume_from_journal(self, id_dict, volume_id):
        """Find the volume ``volume_id`` of the journal described by ``id_dict``.

        The journal is resolved with ``self.retrieve(id_dict, 'both')``. The
        local ``from_journal_volume`` map, keyed by ``(journal, volume_id)``,
        is consulted first; on a miss a SPARQL query is sent through
        ``self.__query``.

        Returns ``None`` when the journal cannot be resolved or when nothing
        matches.
        """
        journal = self.retrieve(id_dict, 'both')
        if journal is None:
            return None

        known_volume = self.from_journal_volume.get((journal, volume_id))
        if known_volume is not None:
            return known_volume

        # Template kept verbatim: the rendered text is what gets sent to
        # (and cached for) the triplestore.
        template = """
                        SELECT DISTINCT ?br WHERE {{
                            ?br a <{}> ;
                                <{}> <{}> ;
                                <{}> "{}"
                        }} LIMIT 1
                    """
        query = template.format(GraphEntity.journal_volume,
                                GraphEntity.part_of, journal,
                                GraphEntity.has_sequence_identifier,
                                volume_id)
        return self.__query(query)

    def retrieve_url_string(self, res, typ):
        """Return the URL literal recorded for ``res``, or ``None``.

        Delegates to the per-scheme literal lookup with the scheme fixed to
        ``GraphEntity.url``; ``typ`` is forwarded unchanged.
        """
        scheme = GraphEntity.url
        return self.__retrieve_res_id_string(res, scheme, typ)

    def retrieve_doi_string(self, res, typ):
        """Return the DOI literal recorded for ``res``, or ``None``.

        Delegates to the per-scheme literal lookup with the scheme fixed to
        ``GraphEntity.doi``; ``typ`` is forwarded unchanged.
        """
        scheme = GraphEntity.doi
        return self.__retrieve_res_id_string(res, scheme, typ)

    def retrieve_pmid_string(self, res, typ):
        """Return the PMID literal recorded for ``res``, or ``None``.

        Delegates to the per-scheme literal lookup with the scheme fixed to
        ``GraphEntity.pmid``; ``typ`` is forwarded unchanged.
        """
        scheme = GraphEntity.pmid
        return self.__retrieve_res_id_string(res, scheme, typ)

    def retrieve_pmcid_string(self, res, typ):
        """Return the PMCID literal recorded for ``res``, or ``None``.

        Delegates to the per-scheme literal lookup with the scheme fixed to
        ``GraphEntity.pmcid``; ``typ`` is forwarded unchanged.
        """
        scheme = GraphEntity.pmcid
        return self.__retrieve_res_id_string(res, scheme, typ)

    def retrieve_br_url(self, res, string, typ):
        """Return the identifier of ``res`` whose URL literal is ``string``.

        ``string`` is lower-cased before the lookup; the scheme is fixed to
        ``GraphEntity.url`` and ``typ`` is forwarded unchanged.
        """
        lowered = string.lower()
        return self.__retrieve_res_id_by_type(
            res, lowered, GraphEntity.url, typ)

    def retrieve_br_doi(self, res, string, typ):
        """Return the identifier of ``res`` whose DOI literal is ``string``.

        ``string`` is lower-cased before the lookup; the scheme is fixed to
        ``GraphEntity.doi`` and ``typ`` is forwarded unchanged.
        """
        lowered = string.lower()
        return self.__retrieve_res_id_by_type(
            res, lowered, GraphEntity.doi, typ)

    def retrieve_br_pmid(self, res, string, typ):
        """Return the identifier of ``res`` whose PMID literal is ``string``.

        The scheme is fixed to ``GraphEntity.pmid``; ``string`` and ``typ``
        are forwarded unchanged.
        """
        scheme = GraphEntity.pmid
        return self.__retrieve_res_id_by_type(res, string, scheme, typ)

    def retrieve_br_pmcid(self, res, string, typ):
        """Return the identifier of ``res`` whose PMCID literal is ``string``.

        The scheme is fixed to ``GraphEntity.pmcid``; ``string`` and ``typ``
        are forwarded unchanged.
        """
        scheme = GraphEntity.pmcid
        return self.__retrieve_res_id_by_type(res, string, scheme, typ)

    def retrieve_last_snapshot(self, prov_subj):
        """Return a snapshot of ``prov_subj`` that has not been invalidated.

        Queries for an entity linked to ``prov_subj`` through
        ``ProvEntity.specialization_of`` that has no
        ``ProvEntity.was_invalidated_by`` statement, limited to one result.
        The query goes through ``self.__query``.
        """
        # Template kept verbatim: the rendered text is what gets sent to
        # (and cached for) the triplestore.
        template = '''
            SELECT DISTINCT ?se WHERE {{
                ?se <{}> <{}> .
                FILTER NOT EXISTS {{?se <{}> ?ca }}
            }} LIMIT 1
        '''
        subject_iri = str(prov_subj)
        query = template.format(ProvEntity.specialization_of, subject_iri,
                                ProvEntity.was_invalidated_by)
        return self.__query(query)

    def __retrieve_res_id_string(self, input_res, id_type, typ):
        """Return the literal value of the ``id_type`` identifier of a resource.

        The per-scheme local map (``<scheme>_store_type``) is consulted first.
        ISSN and ISBN entries are lists, so their first element is returned.
        On a local miss, and unless ``typ`` is ``'only_local'``, the
        triplestore is queried through ``self.__query_blazegraph``.

        ``input_res`` is either a ``GraphEntity`` (its ``res`` is used) or a
        value passed to ``URIRef``. ``id_type`` is the IRI of the identifier
        scheme. Returns ``None`` when either of them is ``None`` or when
        nothing is found.
        """
        if id_type is None or input_res is None:
            return None

        if type(input_res) is GraphEntity:
            res = input_res.res
        else:
            res = URIRef(input_res)

        scheme = str(id_type)
        list_schemes = ('http://purl.org/spar/datacite/issn',
                        'http://purl.org/spar/datacite/isbn')

        # Scheme IRI -> name of the attribute holding its local map. The
        # attribute is resolved lazily so only the matching map is touched.
        store_attr = {
            'http://purl.org/spar/datacite/url': 'url_store_type',
            'http://purl.org/spar/datacite/doi': 'doi_store_type',
            'http://purl.org/spar/datacite/orcid': 'orcid_store_type',
            'http://purl.org/spar/datacite/pmid': 'pmid_store_type',
            'http://purl.org/spar/datacite/pmcid': 'pmcid_store_type',
            'http://purl.org/spar/datacite/issn': 'issn_store_type',
            'http://purl.org/spar/datacite/isbn': 'isbn_store_type',
            'http://purl.org/spar/datacite/crossref': 'crossref_store_type',
        }.get(scheme)

        # First check if locally there's something. A scheme without a local
        # map skips this step instead of failing on an unassigned ``store``.
        if store_attr is not None:
            store = getattr(self, store_attr)
            if res in store:
                if scheme in list_schemes:
                    return store[res][0]
                return store[res]

        if typ != 'only_local':
            query = '''
                SELECT DISTINCT ?id WHERE {{
                    <{}> <{}> [
                        <{}> <{}> ;
                        <{}> ?id
                    ]
                }}'''.format(res, GraphEntity.has_identifier,
                             GraphEntity.uses_identifier_scheme, id_type,
                             GraphEntity.has_literal_value)
            return self.__query_blazegraph(query, typ)

        return None

    def __retrieve_res_id_by_type(self, input_res, id_string, id_type, typ):
        """Return the identifier of a resource matching a scheme and a literal.

        The per-scheme local map (``<scheme>_store_type_id``), keyed by
        ``(resource, id_string)``, is consulted first. On a local miss, and
        unless ``typ`` is ``'only_local'``, the triplestore is queried through
        ``self.__query_blazegraph``.

        ``input_res`` is either a ``GraphEntity`` (its ``res`` is used) or a
        value passed to ``URIRef``. Returns ``None`` when ``id_string`` is
        ``None`` or when nothing is found.
        """
        if type(input_res) is GraphEntity:
            res = input_res.res
        else:
            res = URIRef(input_res)

        # First check if locally there's something.
        # The guard tests ``id_string``; it previously tested the builtin
        # ``id``, which is never None.
        if id_type is not None and id_string is not None:
            # Scheme IRI -> name of the attribute holding its local map,
            # resolved lazily so only the matching map is touched.
            store_attr = {
                'http://purl.org/spar/datacite/url': 'url_store_type_id',
                'http://purl.org/spar/datacite/doi': 'doi_store_type_id',
                'http://purl.org/spar/datacite/orcid': 'orcid_store_type_id',
                'http://purl.org/spar/datacite/pmid': 'pmid_store_type_id',
                'http://purl.org/spar/datacite/pmcid': 'pmcid_store_type_id',
                'http://purl.org/spar/datacite/issn': 'issn_store_type_id',
                'http://purl.org/spar/datacite/isbn': 'isbn_store_type_id',
                'http://purl.org/spar/datacite/crossref':
                    'crossref_store_type_id',
            }.get(str(id_type))

            # A scheme without a local map skips the local step instead of
            # failing on an unassigned ``store``.
            if store_attr is not None:
                store = getattr(self, store_attr)
                key = (res, id_string)
                if key in store:
                    return store[key]

        if id_string is not None and typ != 'only_local':
            query = '''
            SELECT DISTINCT ?id WHERE {{
                <{}> <{}> ?id .
                ?id <{}> <{}> ;
                    <{}> "{}"
            }}'''.format(res, GraphEntity.has_identifier,
                         GraphEntity.uses_identifier_scheme, id_type,
                         GraphEntity.has_literal_value, id_string)

            return self.__query_blazegraph(query)

        return None

    # TODO REMOVE
    def retrieve_res_id_by_type(self, input_res, id_string, id_type, typ):
        """Public entry point for the identifier-by-scheme-and-literal lookup.

        This method used to carry a line-for-line copy of
        ``__retrieve_res_id_by_type``; it now forwards to that helper so the
        lookup logic lives in one place. All four arguments are passed
        through unchanged and the helper's result is returned as is.
        """
        return self.__retrieve_res_id_by_type(input_res, id_string, id_type,
                                              typ)

    def add_id_to_store(self,
                        input_res,
                        input_id,
                        extracted_id,
                        store_type_id,
                        store_type,
                        store,
                        is_list=False):
        """Record an identifier of a resource in the three local lookup maps.

        ``input_res`` and ``input_id`` are each either a ``GraphEntity`` (its
        ``res`` is used) or a value passed to ``URIRef``. ``extracted_id`` is
        the identifier literal.

        The maps are updated together, and only when the entry is new:

        * ``store_type_id[(resource, literal)] = identifier resource``
        * ``store_type[resource] = literal``, or a list of literals when
          ``is_list`` is true
        * ``store[literal] = resource``

        Nothing is written when any input is ``None``, when the
        ``(resource, literal)`` pair or the literal is already known, or when
        ``is_list`` is false and the resource already has a literal.
        """
        # Guard before converting: the check used to run only after
        # URIRef(...) had been applied, so it could not protect against a
        # None input. The sibling add_issue_to_store / add_volume_to_store
        # methods already guard first.
        if input_res is None or input_id is None or extracted_id is None:
            return

        if type(input_res) is GraphEntity:
            cur_res = input_res.res
        else:
            cur_res = URIRef(input_res)

        if type(input_id) is GraphEntity:
            cur_id = input_id.res
        else:
            cur_id = URIRef(input_id)

        # A GraphEntity may still expose no resource.
        if cur_res is None or cur_id is None:
            return

        # Check that the local stores do not already contain the elements.
        if (cur_res, extracted_id) in store_type_id or extracted_id in store:
            return
        # Original author's note (translated from Italian): this
        # single-valued check is meant to be removed.
        if not is_list and cur_res in store_type:
            return

        # Add it
        store_type_id[(cur_res, extracted_id)] = cur_id
        if is_list:
            known = store_type.get(cur_res)
            if known is None:
                store_type[cur_res] = [extracted_id]
            elif extracted_id not in known:
                known.append(extracted_id)
        else:
            store_type[cur_res] = extracted_id
        store[extracted_id] = cur_res

    def add_doi_to_store(self, input_res, input_id, extracted_id):
        """Record a DOI through ``add_id_to_store`` using the DOI maps."""
        return self.add_id_to_store(
            input_res, input_id, extracted_id,
            store_type_id=self.doi_store_type_id,
            store_type=self.doi_store_type,
            store=self.doi_store)

    def add_url_to_store(self, input_res, input_id, extracted_id):
        """Record a URL through ``add_id_to_store`` using the URL maps."""
        return self.add_id_to_store(
            input_res, input_id, extracted_id,
            store_type_id=self.url_store_type_id,
            store_type=self.url_store_type,
            store=self.url_store)

    def add_pmid_to_store(self, input_res, input_id, extracted_id):
        """Record a PMID through ``add_id_to_store`` using the PMID maps."""
        return self.add_id_to_store(
            input_res, input_id, extracted_id,
            store_type_id=self.pmid_store_type_id,
            store_type=self.pmid_store_type,
            store=self.pmid_store)

    def add_pmcid_to_store(self, input_res, input_id, extracted_id):
        """Record a PMCID through ``add_id_to_store`` using the PMCID maps."""
        return self.add_id_to_store(
            input_res, input_id, extracted_id,
            store_type_id=self.pmcid_store_type_id,
            store_type=self.pmcid_store_type,
            store=self.pmcid_store)

    def add_crossref_to_store(self, input_res, input_id, extracted_id):
        """Record a Crossref id through ``add_id_to_store`` using the Crossref maps."""
        return self.add_id_to_store(
            input_res, input_id, extracted_id,
            store_type_id=self.crossref_store_type_id,
            store_type=self.crossref_store_type,
            store=self.crossref_store)

    def add_orcid_to_store(self, input_res, input_id, extracted_id):
        """Record an ORCID through ``add_id_to_store`` using the ORCID maps."""
        return self.add_id_to_store(
            input_res, input_id, extracted_id,
            store_type_id=self.orcid_store_type_id,
            store_type=self.orcid_store_type,
            store=self.orcid_store)

    def add_isbn_to_store(self, input_res, input_id, extracted_id):
        """Record an ISBN through ``add_id_to_store`` using the ISBN maps.

        ``is_list`` is enabled, so a resource may accumulate several ISBNs.
        """
        return self.add_id_to_store(
            input_res, input_id, extracted_id,
            store_type_id=self.isbn_store_type_id,
            store_type=self.isbn_store_type,
            store=self.isbn_store,
            is_list=True)

    def add_issn_to_store(self, input_res, input_id, extracted_id):
        """Record an ISSN through ``add_id_to_store`` using the ISSN maps.

        ``is_list`` is enabled, so a resource may accumulate several ISSNs.
        """
        return self.add_id_to_store(
            input_res, input_id, extracted_id,
            store_type_id=self.issn_store_type_id,
            store_type=self.issn_store_type,
            store=self.issn_store,
            is_list=True)

    def add_issue_to_store(self, input_jou, volume, issue, input_id):
        """Remember which resource is issue ``issue`` of a journal.

        The entry is written to ``from_issue_partof_journal`` under the key
        ``(journal, volume, issue)`` unless that key is already present.
        ``input_jou`` and ``input_id`` are each either a ``GraphEntity`` (its
        ``res`` is used) or a value passed to ``URIRef``. Nothing happens
        when the journal, the issue or the id is ``None``; ``volume`` is not
        checked.
        """
        if input_jou is None or issue is None or input_id is None:
            return

        jou_br = (input_jou.res if type(input_jou) is GraphEntity
                  else URIRef(input_jou))
        cur_id = (input_id.res if type(input_id) is GraphEntity
                  else URIRef(input_id))

        # Keep the first resource registered for a given key.
        self.from_issue_partof_journal.setdefault((jou_br, volume, issue),
                                                  cur_id)

    def add_volume_to_store(self, input_jou, input_id, volume):
        """Remember which resource is volume ``volume`` of a journal.

        The entry is written to ``from_journal_volume`` under the key
        ``(journal, volume)`` unless that key is already present.
        ``input_jou`` and ``input_id`` are each either a ``GraphEntity`` (its
        ``res`` is used) or a value passed to ``URIRef``. Nothing happens
        when any of the three arguments is ``None``.
        """
        if input_jou is None or volume is None or input_id is None:
            return

        jou_br = (input_jou.res if type(input_jou) is GraphEntity
                  else URIRef(input_jou))
        cur_id = (input_id.res if type(input_id) is GraphEntity
                  else URIRef(input_id))

        # Keep the first resource registered for a given key.
        self.from_journal_volume.setdefault((jou_br, volume), cur_id)

    def __id_with_type(self, id_string, id_type, extras="", typ='both'):
        """Return the resource having ``id_string`` as its ``id_type`` identifier.

        The per-scheme local map (``<scheme>_store``) is checked first, unless
        ``typ`` is ``'only_blazegraph'``. If nothing is found locally and
        ``typ`` is not ``'only_local'``, a SPARQL query is sent through
        ``self.__query``.

        ``id_type`` is the IRI of the identifier scheme. ``extras`` is text
        inserted verbatim into the WHERE clause of the query. Returns
        ``None`` when nothing is found.
        """
        # First check if locally there's something
        if (typ != 'only_blazegraph' and id_type is not None
                and id_string is not None):
            # Scheme IRI -> name of the attribute holding its local map,
            # resolved lazily so only the matching map is touched.
            store_attr = {
                'http://purl.org/spar/datacite/url': 'url_store',
                'http://purl.org/spar/datacite/doi': 'doi_store',
                'http://purl.org/spar/datacite/orcid': 'orcid_store',
                'http://purl.org/spar/datacite/pmid': 'pmid_store',
                'http://purl.org/spar/datacite/pmcid': 'pmcid_store',
                'http://purl.org/spar/datacite/issn': 'issn_store',
                'http://purl.org/spar/datacite/isbn': 'isbn_store',
                'http://purl.org/spar/datacite/crossref': 'crossref_store',
            }.get(str(id_type))

            # A scheme without a local map skips the local step; the old
            # if/elif chain left ``store`` unassigned and raised
            # UnboundLocalError here.
            if store_attr is not None:
                store = getattr(self, store_attr)
                if id_string in store:
                    return store[id_string]

        # If nothing found, query blazegraph
        if typ != 'only_local':
            query = '''SELECT DISTINCT ?res WHERE {{ ?res <{}> ?id .
                ?id <{}> <{}> ;
                    <{}> "{}" .
                {}
            }}'''.format(GraphEntity.has_identifier,
                         GraphEntity.uses_identifier_scheme, id_type,
                         GraphEntity.has_literal_value, id_string, extras)

            return self.__query(query, typ=typ)

        return None

    def __query(self, query, typ='only_blazegraph'):
        """Run ``query`` on the triplestore when one is set and ``typ`` allows.

        The query is forwarded to ``self.__query_blazegraph`` only if
        ``self.ts`` is set and ``typ`` is ``'both'`` or ``'only_blazegraph'``;
        otherwise ``None`` is returned.
        """
        remote_allowed = typ in ('both', 'only_blazegraph')
        if self.ts is None or not remote_allowed:
            return None
        return self.__query_blazegraph(query)

    def __query_blazegraph(self, query, typ=None):
        """Return the first binding of ``query``, memoised in ``self.cache``.

        Returns ``None`` when no triplestore is set or the query has no rows.
        Only hits are cached, keyed by the query text. Each row is expected
        to hold exactly one value. ``typ`` is accepted but not used.
        """
        if self.ts is None:
            return None

        if query in self.cache:
            return self.cache[query]

        for row in self.ts.query(query):
            (first,) = row
            self.cache[query] = first
            return first

        return None

    def __query_local(self, query):
        """Return the first binding of ``query`` run on the local graph.

        Deprecated. Non-empty results are memoised in ``self.cache_local``,
        keyed by the query text. Each row is expected to hold exactly one
        value; ``None`` is returned when there are no rows.
        """
        if query in self.cache_local:
            rows = self.cache_local[query]
        else:
            rows = self.g.query(query)
            # Only non-empty results are worth remembering.
            if rows is not None and len(rows):
                self.cache_local[query] = rows

        for (first,) in rows:
            return first
Exemple #12
0
                        if found:
                            break
                        elif tentative == 5:
                            print(
                                "Process stopped at DOI '%s' due to exceptions"
                                % cur_doi)
                            exit(0)

            if found:
                to_remove[URIRef(base_iri + sub("^g(..):(.+)$", "\\1/\\2", br))] = \
                    [URIRef(iden) for iden in
                     [base_iri + sub("^g(..):(.+)$", "\\1/\\2", r_id["r"]) for r_id in id_list]]

        s = Storer(context_map={context_path: context_file_path},
                   dir_split=dir_split_number,
                   n_file_item=items_per_file,
                   default_dir=default_dir)

        for full_info_dir in info_dirs:
            br_iri = []
            br_files = {}
            id_files = {}
            update_br = GraphSet(base_iri, context_path)
            remove_id = GraphSet(base_iri, context_path)

            print("\n\nSupplier directory '%s'" % full_info_dir)
            to_remove = info_dirs[full_info_dir]
            br_counter = 0

            for br in to_remove:
                if br_counter == 10:  # Write everything on disk
Exemple #13
0
class DatasetHandler(object):
    """Create, refresh and persist dataset-level metadata graphs.

    One metadata graph is kept for the whole corpus (identified by
    ``base_iri``) and one for each sub-dataset. The graphs use DCAT, VoID and
    Dublin Core Terms properties and are written through a ``Storer``, and
    uploaded too when a triplestore URL is configured.
    """

    DCTERMS = Namespace("http://purl.org/dc/terms/")
    DCAT = Namespace("http://www.w3.org/ns/dcat#")
    VOID = Namespace("http://rdfs.org/ns/void#")
    MTT = Namespace("https://w3id.org/spar/mediatype/text/")
    DBR = Namespace("http://dbpedia.org/resource/")

    # Classes
    dataset = DCAT.Dataset
    datafile = DCAT.Distribution

    # Properties
    title = DCTERMS.title
    description = DCTERMS.description
    issued = DCTERMS.issued
    modified = DCTERMS.modified
    keyword = DCAT.keyword
    subject = DCAT.theme
    landing_page = DCAT.landingPage
    subset = VOID.subset
    sparql_endpoint = VOID.sparqlEndpoint
    distribution = DCAT.distribution
    license = DCTERMS.license
    download_url = DCAT.downloadURL
    media_type = DCAT.mediaType
    # The DCAT term is camelCase ``byteSize``; ``byte_size`` is not part of
    # the vocabulary.
    byte_size = DCAT.byteSize
    label = RDFS.label
    a = RDF.type

    # Individuals used as object values
    turtle = MTT.turtle
    bibliographic_database = DBR.Bibliographic_database
    open_access = DBR.Open_access
    scholary_communication = DBR.Scholarly_communication
    citations = DBR.Citation

    def __init__(self, tp_url_real, context_path, context_file_path,
                 base_iri, base_dir, info_dir, dataset_home, tmp_dir, triplestore_url=None):
        """Store the configuration and prepare the ``Storer``.

        ``tp_url_real`` is recorded as the SPARQL endpoint of the corpus and
        ``dataset_home`` as its landing page. ``triplestore_url`` is the
        upload target; when ``None`` the metadata is only written to disk.
        ``base_iri`` / ``base_dir`` map dataset IRIs to metadata file paths.
        """
        self.tp_url = triplestore_url
        self.base_iri = base_iri
        self.base_dir = base_dir
        self.info_dir = info_dir
        self.context_path = context_path
        self.dataset_home = URIRef(dataset_home)
        self.tmp_dir = tmp_dir
        self.tp_res = URIRef(tp_url_real)
        self.repok = Reporter(prefix="[DatasetHandler: INFO] ")
        self.reperr = Reporter(prefix="[DatasetHandler: ERROR] ")
        self.st = Storer(context_map={context_path: context_file_path},
                         repok=self.repok, reperr=self.reperr)
        # Preface query handed to the Storer: it deletes the modification
        # date of every dataset resource.
        self.st.set_preface_query(
            u"DELETE { ?res <%s> ?date } WHERE { ?res a <%s> ; <%s> ?date }" %
            (str(DatasetHandler.modified), str(DatasetHandler.dataset), str(DatasetHandler.modified)))

    # /START Create Literal
    def create_label(self, g, res, string):
        """Add ``string`` as the rdfs:label of ``res`` in ``g``."""
        return create_literal(g, res, RDFS.label, string)

    def create_publication_date(self, g, res, string):
        """Add ``string`` as the xsd:dateTime issue date of ``res`` in ``g``."""
        return create_literal(g, res, self.issued, string, XSD.dateTime)

    def update_modification_date(self, g, res, string):
        """Replace every modification date of ``res`` in ``g`` with ``string``."""
        g.remove((res, self.modified, None))
        return create_literal(g, res, self.modified, string, XSD.dateTime)

    def create_title(self, g, res, string):
        """Add ``string`` as the title of ``res`` in ``g``."""
        return create_literal(g, res, self.title, string)

    def create_description(self, g, res, string):
        """Add ``string`` as the description of ``res`` in ``g``."""
        return create_literal(g, res, self.description, string)

    def create_keyword(self, g, res, string):
        """Add ``string`` as a keyword of ``res`` in ``g``."""
        return create_literal(g, res, self.keyword, string)

    def create_byte_size(self, g, res, string):
        """Add ``string`` as the xsd:decimal byte size of ``res`` in ``g``."""
        return create_literal(g, res, self.byte_size, string, XSD.decimal)
    # /END Create Literal

    # /START Create Complex Attributes
    def has_subject(self, g, res, obj):
        """Link ``res`` to the theme ``obj`` in ``g``."""
        g.add((res, self.subject, obj))

    def has_landing_page(self, g, res, obj):
        """Link ``res`` to its landing page ``obj`` in ``g``."""
        g.add((res, self.landing_page, obj))

    def has_subset(self, g, res, obj):
        """Declare ``obj`` a subset of ``res`` in ``g``."""
        g.add((res, self.subset, obj))

    def has_sparql_endpoint(self, g, res, obj):
        """Link ``res`` to its SPARQL endpoint ``obj`` in ``g``."""
        g.add((res, self.sparql_endpoint, obj))

    def has_distribution(self, g, res, obj):
        """Link ``res`` to the distribution ``obj`` in ``g``."""
        g.add((res, self.distribution, obj))

    def has_license(self, g, res, obj):
        """Link ``res`` to the license ``obj`` in ``g``."""
        g.add((res, self.license, obj))

    def has_download_url(self, g, res, obj):
        """Link ``res`` to the download URL ``obj`` in ``g``."""
        g.add((res, self.download_url, obj))

    def has_media_type(self, g, res, obj):
        """Link ``res`` to the media type ``obj`` in ``g``."""
        g.add((res, self.media_type, obj))
    # /END Create Complex Attributes

    # /START Types
    def dataset_type(self, g, res):
        """Type ``res`` as a dcat:Dataset in ``g``."""
        create_type(g, res, self.dataset)

    def distribution_type(self, g, res):
        """Type ``res`` as a dcat:Distribution in ``g``."""
        create_type(g, res, self.datafile)
    # /END Types

    def update_dataset_info(self, graph_set):
        """Refresh the metadata of every dataset touched by ``graph_set``.

        Each distinct graph identifier in ``graph_set`` is treated as a
        sub-dataset: its metadata graph is loaded or created and its
        modification date is set to the current time. If at least one
        sub-dataset was touched, the corpus-level graph is refreshed too and
        linked to each of them as a subset. The resulting graphs are stored,
        and uploaded when a triplestore URL was configured.
        """
        cur_time = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
        subgraphs_to_update = set()
        all_graphs = []

        for g in graph_set.graphs():
            cur_id = g.identifier
            if cur_id in subgraphs_to_update:
                continue  # this sub-dataset was already refreshed
            subgraphs_to_update.add(cur_id)
            cur_dataset_res = URIRef(cur_id)
            cur_dataset = self.get_dataset_graph(cur_dataset_res, cur_time)
            self.update_modification_date(cur_dataset, cur_dataset_res, cur_time)
            all_graphs.append(cur_dataset)

        if subgraphs_to_update:
            cur_occ_res = URIRef(self.base_iri)
            cur_occ = self.get_dataset_graph(cur_occ_res, cur_time)
            self.update_modification_date(cur_occ, cur_occ_res, cur_time)

            for subgraph_id in subgraphs_to_update:
                self.has_subset(cur_occ, cur_occ_res, URIRef(subgraph_id))
            all_graphs.append(cur_occ)

        if all_graphs:  # Store everything and upload to triplestore
            if self.tp_url is None:
                self.st.store_all(
                    self.base_dir, self.base_iri, self.context_path,
                    self.tmp_dir, all_graphs, True)
            else:
                self.st.upload_and_store(
                    self.base_dir, self.tp_url, self.base_iri, self.context_path,
                    self.tmp_dir, all_graphs, True)

    def get_dataset_graph(self, res, cur_time):
        """Return the metadata graph describing the dataset ``res``.

        If a metadata file already exists for ``res`` it is loaded and its
        first context is returned. Otherwise a new graph is built with the
        default label, title, description, keywords and themes, using
        ``cur_time`` as the publication date. An IRI ending in
        ``/<two characters>/`` is treated as a sub-dataset and gets the
        matching ``GraphSet.labels`` name; any other IRI is treated as the
        corpus itself and gets the landing page and SPARQL endpoint.
        """
        dataset_path = self.get_metadata_path_from_resource(res)
        if os.path.exists(dataset_path):
            return list(self.st.load(dataset_path, tmp_dir=self.tmp_dir).contexts())[0]

        dataset_label = "ccc"
        dataset_title = "The Citations in Context Corpus"
        dataset_description = "The Citations in Context Corpus is an open repository of scholarly " \
                              "citation data made available under a Creative Commons public " \
                              "domain dedication, which provides in RDF accurate citation " \
                              "information (bibliographic references) harvested from the " \
                              "scholarly literature (described using the SPAR Ontologies) " \
                              "that others may freely build upon, enhance and reuse for any " \
                              "purpose, without restriction under copyright or database law."
        if re.search("/../$", str(res)) is not None:
            g = Graph(identifier=str(res))
            # The two characters before the trailing slash name the sub-dataset.
            dataset_short_name = str(res)[-3:-1]
            dataset_name = GraphSet.labels[dataset_short_name]
            dataset_title += ": %s dataset" % dataset_name.title()
            dataset_description += " This sub-dataset contains all the '%s' resources." % \
                                   dataset_name
            dataset_label += " / %s" % dataset_short_name
            self.create_keyword(g, res, dataset_name)
        else:
            g = Graph()
            self.has_landing_page(g, res, self.dataset_home)
            self.has_sparql_endpoint(g, res, self.tp_res)

        self.dataset_type(g, res)
        self.create_label(g, res, dataset_label)
        self.create_title(g, res, dataset_title)
        self.create_description(g, res, dataset_description)
        self.create_publication_date(g, res, cur_time)
        for keyword in ("OCC", "ccc", "OpenCitations",
                        "Citations in Context Corpus", "SPAR Ontologies",
                        "bibliographic references", "citations"):
            self.create_keyword(g, res, keyword)
        for theme in (self.bibliographic_database,
                      self.scholary_communication,
                      self.open_access,
                      self.citations):
            self.has_subject(g, res, theme)

        return g

    def get_metadata_path_from_resource(self, dataset_res):
        """Return the metadata file path for the dataset resource."""
        return self.get_metadata_path_from_iri(str(dataset_res))

    def get_metadata_path_from_iri(self, dataset_iri):
        """Return the metadata file path for ``dataset_iri``.

        A leading ``base_iri`` is replaced by ``base_dir`` and ``index.json``
        is appended. An IRI that does not start with ``base_iri`` is only
        suffixed.
        """
        # Plain prefix replacement. The previous ``re.sub`` treated the IRI
        # as a regular expression and the directory as a replacement
        # template, so metacharacters in the IRI or backslashes in the
        # directory were misinterpreted.
        if dataset_iri.startswith(self.base_iri):
            dataset_iri = self.base_dir + dataset_iri[len(self.base_iri):]
        return dataset_iri + "index.json"
Exemple #14
0
                                        base_iri=base_iri,
                                        tmp_dir=temp_dir_for_rdf_loading,
                                        context_map={
                                            context_path: context_file_path
                                        },
                                        dir_split=dir_split_number,
                                        n_file_item=items_per_file,
                                        default_dir=default_dir),
                                    dir_split_number, items_per_file,
                                    supplier_prefix)
                                prov.generate_provenance()

                                res_storer = Storer(result,
                                                    context_map={
                                                        context_path:
                                                        context_file_path
                                                    },
                                                    dir_split=dir_split_number,
                                                    n_file_item=items_per_file,
                                                    default_dir=default_dir)

                                prov_storer = Storer(
                                    prov,
                                    context_map={
                                        context_path: context_file_path
                                    },
                                    dir_split=dir_split_number,
                                    n_file_item=items_per_file)

                                if do_parallel:
                                    base_share_dir = sharing_dir + sep + real_dir + \
                                                     datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + sep