def load(rdf_iri_string, tmp_dir=None):
    res_dir, rdf_file_path = \
        find_paths(rdf_iri_string, args.base + os.sep,
                   "https://w3id.org/oc/corpus/", 10000, 1000)

    cur_graph = None

    if os.path.isfile(rdf_file_path):
        try:
            cur_graph = __load_graph(rdf_file_path)
        except IOError:
            # Parsing in place failed: retry once from a temporary copy,
            # if a temporary directory was provided
            if tmp_dir is not None:
                current_file_path = tmp_dir + os.sep + "tmp_rdf_file.rdf"
                shutil.copyfile(rdf_file_path, current_file_path)
                try:
                    cur_graph = __load_graph(current_file_path)
                except IOError as e:
                    reperr.add_sentence(
                        "[2] It was impossible to handle the format used for "
                        "storing the file (stored in the temporary path) '%s'. "
                        "Additional details: %s" % (current_file_path, str(e)))
                os.remove(current_file_path)
            else:
                reperr.add_sentence(
                    "[3] It was impossible to try to load the file from the "
                    "temporary path '%s' since that has not been specified in "
                    "advance" % rdf_file_path)
    else:
        reperr.add_sentence(
            "[4] The file specified ('%s') doesn't exist." % rdf_file_path)

    return cur_graph
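# A self-contained sketch of the fallback pattern used in 'load' above: if
# parsing a file fails in place, copy it to a scratch path, retry once, and
# always clean the copy up. 'parse_rdf' is a hypothetical stand-in for the
# module's __load_graph, and unlike the original this version re-raises on
# the second failure instead of logging it.
import os
import shutil

def load_with_retry(path, tmp_dir, parse_rdf):
    try:
        return parse_rdf(path)
    except IOError:
        tmp_path = os.path.join(tmp_dir, "tmp_rdf_file.rdf")
        shutil.copyfile(path, tmp_path)
        try:
            return parse_rdf(tmp_path)
        finally:
            os.remove(tmp_path)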
def __store_graph(cur_g, rdf_iri_string, d_dir):
    dest_file = None  # defined up front so the except clause below can use it
    try:
        res_dir, dest_file = \
            find_paths(rdf_iri_string, args.base + os.sep,
                       "https://w3id.org/oc/corpus/", 10000, 1000)

        dest_dir = res_dir.replace(args.base + os.sep, d_dir + os.sep)
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)

        cur_file = dest_file.replace(res_dir, dest_dir)
        # Reuse the graph already stored in the destination file, if any
        if os.path.exists(cur_file):
            c_graph = __load_graph(cur_file)
        else:
            c_graph = ConjunctiveGraph()

        # Replace the named graph of 'cur_g' with its current triples
        c_graph.remove_context(c_graph.get_context(cur_g.identifier))
        c_graph.addN([item + (cur_g.identifier,) for item in list(cur_g)])

        with open(dest_file.replace(res_dir, dest_dir), "w") as f:
            cur_json_ld = json.loads(
                c_graph.serialize(format="json-ld", context=context_json))
            # Reference the context by URL instead of embedding it
            cur_json_ld["@context"] = context_path
            json.dump(cur_json_ld, f, indent=4)

        # repok.add_sentence("File '%s' added." % cur_file)
        return dest_file
    except Exception as e:
        reperr.add_sentence(
            "[5] It was impossible to store the RDF statements in %s. %s" %
            (dest_file, str(e)))
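# A minimal, runnable sketch of the JSON-LD step in __store_graph: serialize
# with a full context object, then swap the inline "@context" for a short
# context URL before writing. It assumes rdflib >= 6 (built-in JSON-LD
# support); 'context_json' and 'context_path' here are illustrative
# stand-ins for the corpus context document and its public IRI.
import json
from rdflib import Graph, Literal, URIRef

g = Graph()
g.add((URIRef("https://w3id.org/oc/corpus/br/1"),
       URIRef("http://purl.org/dc/terms/title"),
       Literal("An example title")))

context_json = {"@context": {"title": "http://purl.org/dc/terms/title"}}
context_path = "https://w3id.org/oc/corpus/context.json"

doc = json.loads(g.serialize(format="json-ld", context=context_json))
doc["@context"] = context_path  # keep the stored file small
print(json.dumps(doc, indent=4))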
def _add_prov(self, short_name, prov_type, res, resp_agent, prov_subject=None):
    if prov_subject is None:
        # Global provenance, stored alongside the whole corpus
        g_prov = self.base_iri + "prov/"
        prov_info_path = \
            g_prov.replace(self.base_iri, self.info_dir) + short_name + ".txt"
    else:
        # Entity-specific provenance, stored next to the entity's own file
        g_prov = str(prov_subject) + "/prov/"
        res_file_path = \
            find_paths(str(prov_subject), self.info_dir, self.base_iri,
                       dir_split_number, items_per_file)[1][:-5]
        prov_info_path = res_file_path + "/prov/" + short_name + ".txt"
    return self._add(g_prov, prov_type, res, resp_agent, None, None,
                     prov_info_path, short_name,
                     [] if prov_subject is None else [prov_subject])
def store(self, cur_g, base_dir, base_iri, context_path, tmp_dir=None,
          override=False, already_processed={}, store_now=True):
    # Note: the mutable default for 'already_processed' is shared across
    # all calls that do not pass their own dictionary.
    self.repok.new_article()
    self.reperr.new_article()

    if len(cur_g) > 0:
        cur_subject = set(cur_g.subjects(None, None)).pop()
        cur_dir_path, cur_file_path = find_paths(
            str(cur_subject), base_dir, base_iri, self.dir_split, self.n_file_item)

        try:
            if not os.path.exists(cur_dir_path):
                os.makedirs(cur_dir_path)

            final_g = ConjunctiveGraph()
            final_g.addN([item + (cur_g.identifier,) for item in list(cur_g)])

            # Merging the data
            if not override:
                if cur_file_path in already_processed:
                    stored_g = already_processed[cur_file_path]
                    stored_g.addN(final_g.quads((None, None, None, None)))
                    final_g = stored_g
                elif os.path.exists(cur_file_path):
                    # This is a conjunctive graph that contains all the triples
                    # (and graphs) the file is actually defining - they could be
                    # more than those using 'cur_subject' as subject.
                    final_g = self.load(cur_file_path, cur_g, tmp_dir)

            already_processed[cur_file_path] = final_g

            if store_now:
                self.__store_in_file(final_g, cur_file_path, context_path)

            return already_processed
        except Exception as e:
            self.reperr.add_sentence(
                "[5] It was impossible to store the RDF statements in %s. %s" %
                (cur_file_path, str(e)))

    return None
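# A runnable sketch of the merge branch above: quads from a newly built graph
# are poured into the ConjunctiveGraph already associated with the target
# file, so repeated 'store' calls accumulate into one graph per file. It
# assumes rdflib >= 6; the IRIs are invented examples.
from rdflib import ConjunctiveGraph, Literal, URIRef

ctx = URIRef("https://w3id.org/oc/corpus/br/")

stored_g = ConjunctiveGraph()

new_g = ConjunctiveGraph()
new_g.get_context(ctx).add((URIRef("https://w3id.org/oc/corpus/br/1"),
                            URIRef("http://purl.org/dc/terms/title"),
                            Literal("An example title")))

# Same call as in 'store': copy every quad, preserving its named graph
stored_g.addN(new_g.quads((None, None, None, None)))
assert len(stored_g.get_context(ctx)) == 1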
def add_prov_triples_in_filesystem(self, res_iri, prov_entity_type=None):
    if self.base_dir is not None and self.base_iri is not None:
        cur_file_path = find_paths(res_iri, self.base_dir, self.base_iri,
                                   dir_split_number, items_per_file)[1]
        if cur_file_path.endswith("index.json"):
            cur_path = cur_file_path.replace("index.json", "") + "prov"
        else:
            cur_path = cur_file_path[:-5] + os.sep + "prov"

        # Collect all provenance files matching the requested entity type
        file_list = []
        if os.path.isdir(cur_path):
            for cur_dir, cur_subdir, cur_files in os.walk(cur_path):
                for cur_file in cur_files:
                    if cur_file.endswith(".json") and \
                            (prov_entity_type is None or
                             cur_file.startswith(prov_entity_type)):
                        file_list += [cur_dir + os.sep + cur_file]

        # Load each provenance file only once
        for file_path in file_list:
            if file_path not in self.loaded:
                self.loaded.add(file_path)
                cur_g = self.storer.load(file_path, tmp_dir=self.tmp_dir)
                self.add_triples_in_graph(cur_g)
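# A self-contained sketch of the scan performed above: walk a provenance
# directory and gather '.json' files, optionally filtered by a prov entity
# prefix (for instance 'se' for snapshot entities - that prefix value is an
# assumption, not taken from the original code).
import os

def list_prov_files(prov_dir, prov_entity_type=None):
    found = []
    for cur_dir, _, cur_files in os.walk(prov_dir):
        for cur_file in cur_files:
            if cur_file.endswith(".json") and \
                    (prov_entity_type is None or
                     cur_file.startswith(prov_entity_type)):
                found.append(os.path.join(cur_dir, cur_file))
    return found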
with open(args.context) as f, open(args.input) as g:
    context_json = json.load(f)
    csv_reader = csv.reader(g)
    for res, n_mod, m_type in csv_reader:
        last_res = res
        prov_entity = URIRef(res)

        if args.id:
            # Move from the provenance entity to the entity it specializes,
            # then collect the files of all that entity's identifiers
            prov_g = load(res)
            prov_entity_g = get_entity_graph(res, prov_g)
            spec_entity = prov_entity_g.value(prov_entity, PROV.specializationOf)

            res_g = load(str(spec_entity))
            res_entity_g = get_entity_graph(spec_entity, res_g)
            for id_entity in [o for s, p, o in list(
                    res_entity_g.triples((spec_entity, DATACITE.hasIdentifier, None)))]:
                rdf_dir, rdf_file_path = find_paths(
                    id_entity, args.base + os.sep,
                    "https://w3id.org/oc/corpus/", 10000, 1000)
                result.add(rdf_file_path)
        else:
            repok.add_sentence("Processing '%s'" % res)
            prov_g = load(res)
            spec_entity_iri = res.split("/prov/")[0]
            prov_entity_g = get_entity_graph(res, prov_g, True)
            generation_dates = [o for s, p, o in list(prov_entity_g.triples(
                (None, PROV.generatedAtTime, None)))]
            sources = [o for s, p, o in list(prov_entity_g.triples(
                (None, PROV.hadPrimarySource, None)))]
            # Get all identifiers creation dates and sources