import os

# Storer and Reporter are assumed to be available from the surrounding
# codebase (they are not part of the standard library or rdflib).


class Checker(object):
    def __init__(self, input_dir, output_dir=None, tmp_dir=None):
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.tmp_dir = tmp_dir
        self.storer = Storer()
        self.name = self.__class__.__name__
        self.repok = Reporter(prefix="[%s - INFO] " % self.name)
        self.repok.new_article()
        self.reper = Reporter(prefix="[%s - ERROR] " % self.name)
        self.reper.new_article()

    def process(self):
        for cur_dir, cur_subdir, cur_files in os.walk(self.input_dir):
            for cur_file in cur_files:
                self.repok.new_article()
                self.reper.new_article()
                cur_rdf_path = cur_dir + os.sep + cur_file
                try:
                    self.repok.add_sentence("Processing '%s'" % cur_rdf_path)
                    g = self.storer.load(cur_rdf_path, tmp_dir=self.tmp_dir)
                    if self.output_dir is None:
                        self.repok.add_sentence(
                            "The RDF graph has been converted into TriG as follows:\n%s"
                            % g.serialize(format="trig"))
                    else:
                        if not os.path.exists(self.output_dir):
                            os.makedirs(self.output_dir)
                        # Note: the output keeps a ".ttl" suffix even though
                        # the serialisation format is TriG
                        output_file = \
                            self.output_dir + os.sep + "converted_" + cur_file + ".ttl"
                        # Serialise to the output file first, then report the path
                        g.serialize(output_file, format="trig")
                        self.repok.add_sentence(
                            "The RDF graph has been stored in %s" % output_file)
                except Exception:
                    self.reper.add_sentence(
                        "The file '%s' doesn't contain RDF statements" % cur_rdf_path,
                        False)
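# A minimal usage sketch for Checker, assuming Storer and Reporter behave as
# in the rest of the codebase; the directory names below are hypothetical.
checker = Checker("./rdf_input", output_dir="./rdf_output", tmp_dir="./tmp")
checker.process()  # converts every loadable RDF file found under ./rdf_input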
import os

# Excerpt from a duplicate-identifier check script: "args" (an argparse
# result), "context_path", "context_file_path", "temp_dir_for_rdf_loading",
# "base_iri", "base_dir" and GraphEntity are assumed to be defined earlier.
s = Storer(context_map={context_path: context_file_path})

# Collect all the JSON(-LD) files to check, recursively if requested
all_files = []
if args.rec:
    for cur_dir, cur_subdir, cur_files in os.walk(args.i_dir):
        for cur_file in cur_files:
            if cur_file.endswith(".json"):
                all_files += [cur_dir + os.sep + cur_file]
else:
    for cur_file in os.listdir(args.i_dir):
        if cur_file.endswith(".json"):
            all_files += [args.i_dir + os.sep + cur_file]

# Map each identifier literal to the list of files that refer to it
id_doc = {}
for rdf_path in all_files:
    cur_g = s.load(rdf_path, tmp_dir=temp_dir_for_rdf_loading)
    try:
        for o in cur_g.objects(None, GraphEntity.has_identifier):
            o_local_path = str(o).replace(base_iri, base_dir) + ".json"
            id_g = s.load(o_local_path, tmp_dir=temp_dir_for_rdf_loading)
            for v in id_g.objects(None, GraphEntity.has_literal_value):
                if v not in id_doc:
                    id_doc[v] = []
                id_doc[v] += [rdf_path]
    except Exception:
        pass  # Skip identifiers whose local file cannot be loaded

result = []
for id_lit in id_doc:
    cur_list = id_doc[id_lit]
    if len(cur_list) > 1:
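        # Hypothetical completion (the excerpt stops at the line above): keep
        # only the identifier literals shared by more than one document.
        result += [(id_lit, cur_list)]

# Hypothetical reporting step: print each duplicated identifier together with
# the files that use it.
for id_lit, file_list in result:
    print("'%s' appears in: %s" % (id_lit, ", ".join(file_list)))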
import os
import re
from datetime import datetime

from rdflib import Graph, Namespace, URIRef
from rdflib.namespace import RDF, RDFS, XSD

# Reporter, Storer, GraphSet, create_literal and create_type are assumed to
# be available from the surrounding codebase.


class DatasetHandler(object):
    DCTERMS = Namespace("http://purl.org/dc/terms/")
    DCAT = Namespace("http://www.w3.org/ns/dcat#")
    VOID = Namespace("http://rdfs.org/ns/void#")
    MTT = Namespace("https://w3id.org/spar/mediatype/text/")
    DBR = Namespace("http://dbpedia.org/resource/")

    dataset = DCAT.Dataset
    datafile = DCAT.Distribution

    title = DCTERMS.title
    description = DCTERMS.description
    issued = DCTERMS.issued
    modified = DCTERMS.modified
    keyword = DCAT.keyword
    subject = DCAT.theme
    landing_page = DCAT.landingPage
    subset = VOID.subset
    sparql_endpoint = VOID.sparqlEndpoint
    distribution = DCAT.distribution
    license = DCTERMS.license
    download_url = DCAT.downloadURL
    media_type = DCAT.mediaType
    byte_size = DCAT.byteSize  # the DCAT term is "byteSize", not "byte_size"
    label = RDFS.label
    a = RDF.type

    turtle = MTT.turtle
    bibliographic_database = DBR.Bibliographic_database
    open_access = DBR.Open_access
    scholarly_communication = DBR.Scholarly_communication
    citations = DBR.Citation

    def __init__(self, tp_url_real, context_path, context_file_path, base_iri,
                 base_dir, info_dir, dataset_home, tmp_dir, triplestore_url=None):
        self.tp_url = triplestore_url
        self.base_iri = base_iri
        self.base_dir = base_dir
        self.info_dir = info_dir
        self.context_path = context_path
        self.dataset_home = URIRef(dataset_home)
        self.tmp_dir = tmp_dir
        self.tp_res = URIRef(tp_url_real)
        self.repok = Reporter(prefix="[DatasetHandler: INFO] ")
        self.reperr = Reporter(prefix="[DatasetHandler: ERROR] ")
        self.st = Storer(context_map={context_path: context_file_path},
                         repok=self.repok, reperr=self.reperr)
        # Drop any stale modification date before new ones are stored
        self.st.set_preface_query(
            u"DELETE { ?res <%s> ?date } WHERE { ?res a <%s> ; <%s> ?date }" %
            (str(DatasetHandler.modified), str(DatasetHandler.dataset),
             str(DatasetHandler.modified)))

    # /START Create Literal
    def create_label(self, g, res, string):
        return create_literal(g, res, RDFS.label, string)

    def create_publication_date(self, g, res, string):
        return create_literal(g, res, self.issued, string, XSD.dateTime)

    def update_modification_date(self, g, res, string):
        g.remove((res, self.modified, None))
        return create_literal(g, res, self.modified, string, XSD.dateTime)

    def create_title(self, g, res, string):
        return create_literal(g, res, self.title, string)

    def create_description(self, g, res, string):
        return create_literal(g, res, self.description, string)

    def create_keyword(self, g, res, string):
        return create_literal(g, res, self.keyword, string)

    def create_byte_size(self, g, res, string):
        return create_literal(g, res, self.byte_size, string, XSD.decimal)
    # /END Create Literal

    # /START Create Complex Attributes
    def has_subject(self, g, res, obj):
        g.add((res, self.subject, obj))

    def has_landing_page(self, g, res, obj):
        g.add((res, self.landing_page, obj))

    def has_subset(self, g, res, obj):
        g.add((res, self.subset, obj))

    def has_sparql_endpoint(self, g, res, obj):
        g.add((res, self.sparql_endpoint, obj))

    def has_distribution(self, g, res, obj):
        g.add((res, self.distribution, obj))

    def has_license(self, g, res, obj):
        g.add((res, self.license, obj))

    def has_download_url(self, g, res, obj):
        g.add((res, self.download_url, obj))

    def has_media_type(self, g, res, obj):
        g.add((res, self.media_type, obj))
    # /END Create Complex Attributes

    # /START Types
    def dataset_type(self, g, res):
        create_type(g, res, self.dataset)

    def distribution_type(self, g, res):
        create_type(g, res, self.datafile)
    # /END Types

    def update_dataset_info(self, graph_set):
        cur_time = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
        subgraphs_to_update = set()
        all_graphs = []

        for g in graph_set.graphs():
            cur_id = g.identifier
            if cur_id not in subgraphs_to_update:
                subgraphs_to_update.add(cur_id)
                cur_dataset_res = URIRef(cur_id)
                cur_dataset = self.get_dataset_graph(cur_dataset_res, cur_time)
                self.update_modification_date(cur_dataset, cur_dataset_res, cur_time)
                all_graphs += [cur_dataset]

        if subgraphs_to_update:
            cur_occ_res = URIRef(self.base_iri)
            cur_occ = self.get_dataset_graph(cur_occ_res, cur_time)
            self.update_modification_date(cur_occ, cur_occ_res, cur_time)
            for subgraph_id in subgraphs_to_update:
                self.has_subset(cur_occ, cur_occ_res, URIRef(subgraph_id))
            all_graphs += [cur_occ]

        if all_graphs:  # Store everything and upload to the triplestore
            if self.tp_url is None:
                self.st.store_all(
                    self.base_dir, self.base_iri, self.context_path,
                    self.tmp_dir, all_graphs, True)
            else:
                self.st.upload_and_store(
                    self.base_dir, self.tp_url, self.base_iri, self.context_path,
                    self.tmp_dir, all_graphs, True)

    def get_dataset_graph(self, res, cur_time):
        dataset_path = self.get_metadata_path_from_resource(res)
        if os.path.exists(dataset_path):
            return list(self.st.load(dataset_path, tmp_dir=self.tmp_dir).contexts())[0]
        else:
            dataset_label = "OCC"
            dataset_title = "The OpenCitations Corpus"
            dataset_description = \
                "The OpenCitations Corpus is an open repository of scholarly " \
                "citation data made available under a Creative Commons public " \
                "domain dedication, which provides in RDF accurate citation " \
                "information (bibliographic references) harvested from the " \
                "scholarly literature (described using the SPAR Ontologies) " \
                "that others may freely build upon, enhance and reuse for any " \
                "purpose, without restriction under copyright or database law."
            if re.search("/../$", str(res)) is not None:
                g = Graph(identifier=str(res))
                dataset_short_name = str(res)[-3:-1]
                dataset_name = GraphSet.labels[dataset_short_name]
                dataset_title += ": %s dataset" % dataset_name.title()
                dataset_description += \
                    " This sub-dataset contains all the '%s' resources." % \
                    dataset_name
                dataset_label += " / %s" % dataset_short_name
                self.create_keyword(g, res, dataset_name)
            else:
                g = Graph()
                self.has_landing_page(g, res, self.dataset_home)
                self.has_sparql_endpoint(g, res, self.tp_res)
            self.dataset_type(g, res)
            self.create_label(g, res, dataset_label)
            self.create_title(g, res, dataset_title)
            self.create_description(g, res, dataset_description)
            self.create_publication_date(g, res, cur_time)
            self.create_keyword(g, res, "OCC")
            self.create_keyword(g, res, "OpenCitations")
            self.create_keyword(g, res, "OpenCitations Corpus")
            self.create_keyword(g, res, "SPAR Ontologies")
            self.create_keyword(g, res, "bibliographic references")
            self.create_keyword(g, res, "citations")
            self.has_subject(g, res, self.bibliographic_database)
            self.has_subject(g, res, self.scholarly_communication)
            self.has_subject(g, res, self.open_access)
            self.has_subject(g, res, self.citations)
            return g

    def get_metadata_path_from_resource(self, dataset_res):
        return self.get_metadata_path_from_iri(str(dataset_res))

    def get_metadata_path_from_iri(self, dataset_iri):
        return re.sub("^%s" % self.base_iri, self.base_dir, dataset_iri) + "index.json"
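# A minimal usage sketch for DatasetHandler. "graph_set" is assumed to be a
# populated GraphSet from the surrounding codebase; every URL and path below
# is hypothetical. With triplestore_url left as None, update_dataset_info()
# refreshes dcterms:modified on every touched dataset, links each sub-dataset
# to the main one via void:subset, and stores the result on the filesystem.
handler = DatasetHandler(
    tp_url_real="http://localhost:3000/blazegraph/sparql",
    context_path="corpus/context.json",
    context_file_path="./context.json",
    base_iri="https://w3id.org/oc/corpus/",
    base_dir="./corpus/",
    info_dir="./id-counter/",
    dataset_home="http://opencitations.net/",
    tmp_dir="./tmp/")
handler.update_dataset_info(graph_set)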
import os

from rdflib import ConjunctiveGraph, Graph

# Storer, GraphEntity, ProvEntity, find_paths, dir_split_number and
# items_per_file are assumed to be available from the surrounding codebase.


class ResourceFinder(object):

    def __init__(self, g_set=None, ts_url=None, base_dir=None, base_iri=None,
                 tmp_dir=None, context_map={}):
        self.g = Graph()
        self.base_dir = base_dir
        self.base_iri = base_iri
        self.storer = Storer(context_map=context_map)
        self.tmp_dir = tmp_dir
        self.name = "SPACIN " + self.__class__.__name__
        self.loaded = set()
        if g_set is not None:
            self.update_graph_set(g_set)
        if ts_url is None:
            self.ts = None
        else:
            self.ts = ConjunctiveGraph('SPARQLUpdateStore')
            self.ts.open((ts_url, ts_url))

    def add_prov_triples_in_filesystem(self, res_iri, prov_entity_type=None):
        if self.base_dir is not None and self.base_iri is not None:
            cur_file_path = find_paths(
                res_iri, self.base_dir, self.base_iri,
                dir_split_number, items_per_file)[1]
            if cur_file_path.endswith("index.json"):
                cur_path = cur_file_path.replace("index.json", "") + "prov"
            else:
                cur_path = cur_file_path[:-5] + os.sep + "prov"

            file_list = []
            if os.path.isdir(cur_path):
                for cur_dir, cur_subdir, cur_files in os.walk(cur_path):
                    for cur_file in cur_files:
                        if cur_file.endswith(".json") and (
                                prov_entity_type is None or
                                cur_file.startswith(prov_entity_type)):
                            file_list += [cur_dir + os.sep + cur_file]

            for file_path in file_list:
                if file_path not in self.loaded:
                    self.loaded.add(file_path)
                    cur_g = self.storer.load(file_path, tmp_dir=self.tmp_dir)
                    self.add_triples_in_graph(cur_g)

    def add_triples_in_graph(self, g):
        if g is not None:
            for s, p, o in g.triples((None, None, None)):
                self.g.add((s, p, o))

    def update_graph_set(self, g_set):
        for g in g_set.graphs():
            self.add_triples_in_graph(g)

    def retrieve(self, id_dict):
        for id_type in id_dict:
            for id_string in id_dict[id_type]:
                res = self.__id_with_type(id_string, id_type)
                if res is not None:
                    return res

    def retrieve_provenance_agent_from_name(self, string):
        query = """
            SELECT DISTINCT ?pa WHERE {
                ?pa a <%s> ;
                    <%s> "%s"
            } LIMIT 1
        """ % (ProvEntity.prov_agent, GraphEntity.name, string)
        return self.__query(query)

    def retrieve_from_orcid(self, string):
        return self.__id_with_type(string, GraphEntity.orcid)

    def retrieve_citing_from_doi(self, string):
        return self.__id_with_type(
            string.lower(), GraphEntity.doi,
            "?res <%s> ?cited" % GraphEntity.cites)

    def retrieve_citing_from_pmid(self, string):
        return self.__id_with_type(
            string, GraphEntity.pmid,
            "?res <%s> ?cited" % GraphEntity.cites)

    def retrieve_citing_from_pmcid(self, string):
        return self.__id_with_type(
            string, GraphEntity.pmcid,
            "?res <%s> ?cited" % GraphEntity.cites)

    def retrieve_citing_from_url(self, string):
        return self.__id_with_type(
            string.lower(), GraphEntity.url,
            "?res <%s> ?cited" % GraphEntity.cites)

    def retrieve_from_doi(self, string):
        return self.__id_with_type(string.lower(), GraphEntity.doi)

    def retrieve_from_pmid(self, string):
        return self.__id_with_type(string, GraphEntity.pmid)

    def retrieve_from_pmcid(self, string):
        return self.__id_with_type(string, GraphEntity.pmcid)

    def retrieve_from_url(self, string):
        return self.__id_with_type(string.lower(), GraphEntity.url)

    def retrieve_from_issn(self, string):
        return self.__id_with_type(string, GraphEntity.issn)

    def retrieve_from_isbn(self, string):
        return self.__id_with_type(string, GraphEntity.isbn)

    def retrieve_issue_from_journal(self, id_dict, issue_id, volume_id):
        if volume_id is None:
            return self.__retrieve_from_journal(
                id_dict, GraphEntity.journal_issue, issue_id)
        else:
            retrieved_volume = self.retrieve_volume_from_journal(id_dict, volume_id)
            if retrieved_volume is not None:
                query = """
                    SELECT DISTINCT ?br WHERE {
                        ?br a <%s> ;
                            <%s> <%s> ;
                            <%s> "%s"
                    } LIMIT 1
                """ % (GraphEntity.journal_issue, GraphEntity.part_of,
                       str(retrieved_volume),
                       GraphEntity.has_sequence_identifier, issue_id)
                return self.__query(query)

    def retrieve_volume_from_journal(self, id_dict, volume_id):
        return self.__retrieve_from_journal(
            id_dict, GraphEntity.journal_volume, volume_id)

    def retrieve_br_url(self, res, string):
        return self.__retrieve_res_id_by_type(res, string.lower(), GraphEntity.url)

    def retrieve_br_doi(self, res, string):
        return self.__retrieve_res_id_by_type(res, string.lower(), GraphEntity.doi)

    def retrieve_br_pmid(self, res, string):
        return self.__retrieve_res_id_by_type(res, string, GraphEntity.pmid)

    def retrieve_br_pmcid(self, res, string):
        return self.__retrieve_res_id_by_type(res, string, GraphEntity.pmcid)

    def retrieve_last_snapshot(self, prov_subj):
        query = """
            SELECT DISTINCT ?se WHERE {
                ?se <%s> <%s> .
                FILTER NOT EXISTS { ?se <%s> ?ca }
            } LIMIT 1
        """ % (ProvEntity.specialization_of, str(prov_subj),
               ProvEntity.was_invalidated_by)
        return self.__query(query)

    def __retrieve_res_id_by_type(self, res, id_string, id_type):
        if id_string is not None:
            query = """
                SELECT DISTINCT ?id WHERE {
                    <%s> <%s> ?id .
                    ?id <%s> <%s> ;
                        <%s> "%s"
                }""" % (res, GraphEntity.has_identifier,
                        GraphEntity.uses_identifier_scheme, id_type,
                        GraphEntity.has_literal_value, id_string)
            return self.__query(query)

    def __retrieve_from_journal(self, id_dict, part_type, part_seq_id):
        for id_type in id_dict:
            for id_string in id_dict[id_type]:
                query = """
                    SELECT DISTINCT ?res WHERE {
                        ?j <%s> ?id .
                        ?id <%s> <%s> ;
                            <%s> "%s" .
                        ?res a <%s> ;
                             <%s>+ ?j ;
                             <%s> "%s"
                    }""" % (GraphEntity.has_identifier,
                            GraphEntity.uses_identifier_scheme, id_type,
                            GraphEntity.has_literal_value, id_string,
                            part_type, GraphEntity.part_of,
                            GraphEntity.has_sequence_identifier, part_seq_id)
                return self.__query(query)

    def __id_with_type(self, id_string, id_type, extras=""):
        query = """
            SELECT DISTINCT ?res WHERE {
                ?res <%s> ?id .
                ?id <%s> <%s> ;
                    <%s> "%s" .
                %s
            }""" % (GraphEntity.has_identifier,
                    GraphEntity.uses_identifier_scheme, id_type,
                    GraphEntity.has_literal_value, id_string, extras)
        return self.__query(query)

    def __query(self, query):
        if self.ts is not None:
            result = self.ts.query(query)
            for res, in result:
                return res

        # If nothing has been returned, check if there is something
        # in the current graph set
        result = self.g.query(query)
        for res, in result:
            return res
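# A minimal usage sketch for ResourceFinder: look up a bibliographic resource
# by one of its identifiers. The endpoint URL and DOI value are hypothetical.
# The id_dict maps an identifier scheme (e.g. GraphEntity.doi) to a list of
# literal values, matching the way retrieve() iterates over it; the query is
# answered from the triplestore first, then from the local graph.
finder = ResourceFinder(ts_url="http://localhost:3000/blazegraph/sparql")
res = finder.retrieve({GraphEntity.doi: ["10.1000/example.123"]})
if res is None:
    # No match found: callers typically go on to create a new entity
    # for this identifier.
    pass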