import os

# Reporter and Storer are provided by other supporting modules of this repository.


class Checker(object):
    def __init__(self, input_dir, output_dir=None, tmp_dir=None):
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.tmp_dir = tmp_dir
        self.storer = Storer()
        self.name = self.__class__.__name__
        self.repok = Reporter(prefix="[%s - INFO] " % self.name)
        self.repok.new_article()
        self.reper = Reporter(prefix="[%s - ERROR] " % self.name)
        self.reper.new_article()

    def process(self):
        # Walk the input directory and try to load every file as RDF
        for cur_dir, cur_subdir, cur_files in os.walk(self.input_dir):
            for cur_file in cur_files:
                self.repok.new_article()
                self.reper.new_article()

                cur_rdf_path = cur_dir + os.sep + cur_file
                try:
                    self.repok.add_sentence("Processing '%s'" % cur_rdf_path)
                    g = self.storer.load(cur_rdf_path, tmp_dir=self.tmp_dir)

                    if self.output_dir is None:
                        self.repok.add_sentence(
                            "The RDF graph has been converted into TriG as follows:\n%s" %
                            g.serialize(format="trig"))
                    else:
                        if not os.path.exists(self.output_dir):
                            os.makedirs(self.output_dir)
                        output_file = \
                            self.output_dir + os.sep + "converted_" + cur_file + ".ttl"
                        g.serialize(output_file, format="trig")
                        self.repok.add_sentence(
                            "The RDF graph has been stored in '%s'" % output_file)
                except Exception:
                    self.reper.add_sentence(
                        "The file '%s' doesn't contain RDF statements" % cur_rdf_path, False)
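# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): it runs the
# checker over a directory of RDF files. The directory names below are
# hypothetical examples, not values taken from this repository.
if __name__ == "__main__":
    checker = Checker("rdf_input", output_dir="rdf_converted", tmp_dir="tmp")
    checker.process()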
last_res = res
prov_entity = URIRef(res)

if args.id:
    prov_g = load(res)
    prov_entity_g = get_entity_graph(res, prov_g)
    spec_entity = prov_entity_g.value(prov_entity, PROV.specializationOf)

    res_g = load(str(spec_entity))
    res_entity_g = get_entity_graph(spec_entity, res_g)

    for id_entity in [o for s, p, o in list(
            res_entity_g.triples((spec_entity, DATACITE.hasIdentifier, None)))]:
        rdf_dir, rdf_file_path = find_paths(
            id_entity, args.base + os.sep, "https://w3id.org/oc/corpus/", 10000, 1000)
        result.add(rdf_file_path)
else:
    repok.add_sentence("Processing '%s'" % res)
    prov_g = load(res)
    spec_entity_iri = res.split("/prov/")[0]
    prov_entity_g = get_entity_graph(res, prov_g, True)

    generation_dates = [o for s, p, o in list(prov_entity_g.triples(
        (None, PROV.generatedAtTime, None)))]
    sources = [o for s, p, o in list(prov_entity_g.triples(
        (None, PROV.hadPrimarySource, None)))]

    # Get all the identifiers, creation dates and sources
    spec_entity = URIRef(spec_entity_iri)
    res_g = load(str(spec_entity))
    res_entity_g = get_entity_graph(spec_entity, res_g)
help="The max number of resources a file can contain.") arg_parser.add_argument("-t", "--tmp_dir", dest="tmp_dir", help="The directory for easing the RDF loading.") arg_parser.add_argument("-c", "--context", dest="context", required=True, help="The JSON-LD context to use.") args = arg_parser.parse_args() with open(args.context) as f: context_json = json.load(f) if do_file_exist(args.input): res_count = 0 dir_count = 0 new_dir = None repok.add_sentence("Organize all files in directory each containing at " "most %s resources" % args.dir_split) new_dirs = [] new_files = [] while True: res_count += 1 if res_count > dir_count: dir_count += long(args.dir_split) new_dir = args.input + os.sep + "n_" + str(dir_count) new_dirs += [new_dir] src_dir = args.input + os.sep + str(res_count) dst_dir = new_dir + os.sep + str(res_count) src_file = src_dir + ".json" if os.path.exists(src_file): try: if os.path.exists(src_dir): os.renames(src_dir, dst_dir)
if "invalidated_by" in cur_graph: cur_invalidated_by = cur_graph["invalidated_by"] if isinstance(cur_invalidated_by, list): se_generated_by += cur_invalidated_by else: se_generated_by += [cur_invalidated_by] generated = sorted(list(set(generated))) se_generated_by = sorted(list(set(se_generated_by))) sen_string = item["iri"] + "se/1" + ",[%s]," % str(len(generated)) for ca_item in cur_ca["@graph"]: found = False for cur_ca_graph in ca_item["@graph"]: if cur_ca_graph["iri"] in se_generated_by: found = True all_descs = cur_ca_graph["description"] descs = all_descs if isinstance(all_descs, list) else [all_descs] for desc in descs: if "citation data and new identifiers" in desc: sen_string += "[CIT+ID]" elif "citation data" in desc: sen_string += "[CIT]" elif "new identifiers" in desc: sen_string += "[ID]" if found: rep.add_sentence(sen_string) break rep.write_file(args.o_file)
arg_parser.add_argument("-o", "--output", dest="output", required=True, help="The output file.") arg_parser.add_argument("-t", "--tmp_dir", dest="tmp_dir", help="The directory for easing the RDF loading.") arg_parser.add_argument("-c", "--context", dest="context", required=True, help="The JSON-LD context to use.") args = arg_parser.parse_args() with open(args.context) as f: context_json = json.load(f) repok = Reporter(True, prefix="[create_nq.py: INFO] ") reperr = Reporter(True, prefix="[create_nq.py: ERROR] ") repok.new_article() reperr.new_article() for cur_dir, cur_subdir, cur_files in os.walk(args.input): with open(args.output, 'a') as f: for cur_file in cur_files: if cur_file.endswith(".json"): cur_g = ConjunctiveGraph() cur_g = load(cur_g, cur_dir + os.sep + cur_file, args.tmp_dir) nt_strings = cur_g.serialize(format="nquads") f.write(nt_strings) repok.add_sentence("Done.") if not reperr.is_empty(): reperr.write_file("create_nq.rep.%s.err.txt" % ( re.sub("_+", "_", re.sub("[\.%s/]" % os.sep, "_", args.input))))
import io
import os
import json
import shutil
from datetime import datetime

from rdflib import ConjunctiveGraph, BNode
from SPARQLWrapper import SPARQLWrapper

# Reporter and find_paths are provided by other supporting modules of this repository.


class Storer(object):

    def __init__(self, graph_set=None, repok=None, reperr=None,
                 context_map={}, dir_split=0, n_file_item=1):
        self.dir_split = dir_split
        self.n_file_item = n_file_item
        self.context_map = context_map
        for context_url in context_map:
            context_file_path = context_map[context_url]
            with open(context_file_path) as f:
                context_json = json.load(f)
                self.context_map[context_url] = context_json

        if graph_set is None:
            self.g = []
        else:
            self.g = graph_set.graphs()

        if repok is None:
            self.repok = Reporter(prefix="[Storer: INFO] ")
        else:
            self.repok = repok

        if reperr is None:
            self.reperr = Reporter(prefix="[Storer: ERROR] ")
        else:
            self.reperr = reperr

        self.preface_query = ""

    def store_all(self, base_dir, base_iri, context_path, tmp_dir=None, g_set=[], override=False):
        for g in g_set:
            self.g += [g]

        self.repok.new_article()
        self.reperr.new_article()

        self.repok.add_sentence("Starting the process")

        processed_graphs = {}
        for cur_g in self.g:
            processed_graphs = self.store(cur_g, base_dir, base_iri, context_path, tmp_dir,
                                          override, processed_graphs, False)

        stored_graph_path = []
        for cur_file_path in processed_graphs:
            stored_graph_path += [cur_file_path]
            self.__store_in_file(processed_graphs[cur_file_path], cur_file_path, context_path)

        return stored_graph_path

    def upload_and_store(self, base_dir, triplestore_url, base_iri, context_path,
                         tmp_dir=None, g_set=[], override=False):
        stored_graph_path = self.store_all(base_dir, base_iri, context_path,
                                           tmp_dir, g_set, override)

        # If some graphs were not stored properly, none of them is uploaded to the
        # triplestore. However, we highlight those that could have been added in
        # principle by creating a ".notuploaded" marker for them.
        if None in stored_graph_path:
            for file_path in stored_graph_path:
                # Create a marker for the file not uploaded to the triplestore
                open("%s.notuploaded" % file_path, "w").close()
                self.reperr.add_sentence("[6] "
                                         "The statements in the JSON-LD file '%s' were not "
                                         "uploaded into the triplestore." % file_path)
        else:  # All the files have been stored
            self.upload_all(self.g, triplestore_url, base_dir)

    def __query(self, query_string, triplestore_url, n_statements=None, base_dir=None):
        if query_string != "":
            try:
                tp = SPARQLWrapper(triplestore_url)
                tp.setMethod('POST')
                tp.setQuery(query_string)
                tp.query()

                if n_statements is None:
                    self.repok.add_sentence(
                        "Triplestore updated by means of a SPARQL Update query.")
                else:
                    self.repok.add_sentence(
                        "Triplestore updated with %s more RDF statements." % n_statements)

                return True

            except Exception as e:
                self.reperr.add_sentence("[1] "
                                         "Graph was not loaded into the "
                                         "triplestore due to communication problems: %s" % str(e))
                if base_dir is not None:
                    # Save the failed SPARQL Update query for later inspection
                    tp_err_dir = base_dir + os.sep + "tp_err"
                    if not os.path.exists(tp_err_dir):
                        os.makedirs(tp_err_dir)
                    cur_file_err = tp_err_dir + os.sep + \
                        datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f_not_uploaded.txt')
                    with io.open(cur_file_err, "w", encoding="utf-8") as f:
                        f.write(query_string)

        return False

    def upload_all(self, all_g, triplestore_url, base_dir):
        result = True

        self.repok.new_article()
        self.reperr.new_article()

        query_string = None
        total_new_statements = None

        # Group the INSERT DATA queries in batches of ten graphs each
        for idx, cur_g in enumerate(all_g):
            cur_idx = idx % 10
            if cur_idx == 0:
                if query_string is not None:
                    result &= self.__query(query_string, triplestore_url,
                                           total_new_statements, base_dir)
                query_string = u""
                total_new_statements = 0
            else:
                query_string += u" ; "

            total_new_statements += len(cur_g)
            query_string += self.get_preface_query(cur_g) + Storer._make_insert_query(cur_g)

        if query_string is not None and query_string != "":
            result &= self.__query(query_string, triplestore_url, total_new_statements, base_dir)

        return result

    def execute_upload_query(self, query_string, triplestore_url):
        self.repok.new_article()
        self.reperr.new_article()

        return self.__query(query_string, triplestore_url)

    def upload(self, cur_g, triplestore_url):
        self.repok.new_article()
        self.reperr.new_article()

        query_string = Storer._make_insert_query(cur_g)

        return self.__query(query_string, triplestore_url, len(cur_g))

    def set_preface_query(self, query_string):
        self.preface_query = query_string

    def get_preface_query(self, cur_g):
        if self.preface_query != "":
            if type(cur_g.identifier) is BNode:
                return u"CLEAR DEFAULT ; "
            else:
                return u"WITH <%s> " % str(cur_g.identifier) + self.preface_query + " ; "
        else:
            return ""

    @staticmethod
    def _make_insert_query(cur_g):
        if type(cur_g.identifier) is BNode:
            return u"INSERT DATA { %s }" % cur_g.serialize(format="nt")
        else:
            return u"INSERT DATA { GRAPH <%s> { %s } }" % \
                   (str(cur_g.identifier), cur_g.serialize(format="nt"))

    def __store_in_file(self, cur_g, cur_file_path, context_path):
        cur_json_ld = json.loads(
            cur_g.serialize(format="json-ld", context=self.__get_context(context_path)))

        if isinstance(cur_json_ld, dict):
            cur_json_ld["@context"] = context_path
        else:  # it is a list
            for item in cur_json_ld:
                item["@context"] = context_path

        with open(cur_file_path, "w") as f:
            json.dump(cur_json_ld, f, indent=4)

        self.repok.add_sentence("File '%s' added." % cur_file_path)

    def store(self, cur_g, base_dir, base_iri, context_path, tmp_dir=None,
              override=False, already_processed={}, store_now=True):
        self.repok.new_article()
        self.reperr.new_article()

        if len(cur_g) > 0:
            cur_subject = set(cur_g.subjects(None, None)).pop()
            cur_dir_path, cur_file_path = find_paths(
                str(cur_subject), base_dir, base_iri, self.dir_split, self.n_file_item)

            try:
                if not os.path.exists(cur_dir_path):
                    os.makedirs(cur_dir_path)

                final_g = ConjunctiveGraph()
                final_g.addN([item + (cur_g.identifier,) for item in list(cur_g)])

                # Merging the data
                if not override:
                    if cur_file_path in already_processed:
                        stored_g = already_processed[cur_file_path]
                        stored_g.addN(final_g.quads((None, None, None, None)))
                        final_g = stored_g
                    elif os.path.exists(cur_file_path):
                        # This is a conjunctive graph that contains all the triples (and graphs)
                        # the file actually defines - they could be more than those using
                        # 'cur_subject' as subject.
                        final_g = self.load(cur_file_path, cur_g, tmp_dir)

                already_processed[cur_file_path] = final_g

                if store_now:
                    self.__store_in_file(final_g, cur_file_path, context_path)

                return already_processed
            except Exception as e:
                self.reperr.add_sentence(
                    "[5] It was impossible to store the RDF statements in %s. %s" %
                    (cur_file_path, str(e)))

        return None

    def __get_context(self, context_url):
        if context_url in self.context_map:
            return self.context_map[context_url]
        else:
            return context_url

    def __get_first_context(self):
        for context_url in self.context_map:
            return self.context_map[context_url]

    def load(self, rdf_file_path, cur_graph=None, tmp_dir=None):
        self.repok.new_article()
        self.reperr.new_article()

        if os.path.isfile(rdf_file_path):
            try:
                cur_graph = self.__load_graph(rdf_file_path, cur_graph)
            except IOError:
                if tmp_dir is not None:
                    # Retry the parsing from a local copy in the temporary directory
                    current_file_path = tmp_dir + os.sep + "tmp_rdf_file.rdf"
                    shutil.copyfile(rdf_file_path, current_file_path)
                    try:
                        cur_graph = self.__load_graph(current_file_path, cur_graph)
                    except IOError as e:
                        self.reperr.add_sentence("[2] "
                                                 "It was impossible to handle the format used for "
                                                 "storing the file (stored in the temporary path) '%s'. "
                                                 "Additional details: %s" % (current_file_path, str(e)))
                    os.remove(current_file_path)
                else:
                    self.reperr.add_sentence("[3] "
                                             "It was impossible to try to load the file from the "
                                             "temporary path '%s' since that has not been specified in "
                                             "advance" % rdf_file_path)
        else:
            self.reperr.add_sentence("[4] "
                                     "The file specified ('%s') doesn't exist." % rdf_file_path)

        return cur_graph

    def __load_graph(self, file_path, cur_graph=None):
        formats = ["json-ld", "rdfxml", "turtle", "trig"]

        current_graph = ConjunctiveGraph()

        if cur_graph is not None:
            current_graph.parse(data=cur_graph.serialize(format="trig"), format="trig")

        errors = ""
        for cur_format in formats:
            try:
                if cur_format == "json-ld":
                    with open(file_path) as f:
                        json_ld_file = json.load(f)
                        if isinstance(json_ld_file, dict):
                            json_ld_file = [json_ld_file]

                        for json_ld_resource in json_ld_file:
                            # Trick to force the use of a pre-loaded context if the format
                            # specified is JSON-LD
                            context_json = None
                            if "@context" in json_ld_resource:
                                cur_context = json_ld_resource["@context"]
                                if cur_context in self.context_map:
                                    context_json = self.__get_context(cur_context)["@context"]
                                    json_ld_resource["@context"] = context_json

                            current_graph.parse(data=json.dumps(json_ld_resource),
                                                format=cur_format)
                else:
                    current_graph.parse(file_path, format=cur_format)

                return current_graph
            except Exception as e:
                errors = " | " + str(e)  # Try another format

        raise IOError("1", "It was impossible to handle the format used for storing the file "
                           "'%s'%s" % (file_path, errors))
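# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): it prints the
# SPARQL INSERT DATA query that upload() would send for a small named graph.
# The IRIs below are hypothetical examples, not values taken from this repository.
if __name__ == "__main__":
    from rdflib import Graph, URIRef, RDF

    example_g = Graph(identifier=URIRef("https://example.org/graph/1"))
    example_g.add((URIRef("https://example.org/br/1"), RDF.type,
                   URIRef("http://purl.org/spar/fabio/Expression")))

    print(Storer._make_insert_query(example_g))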