Example #1
0
class Checker(object):
    def __init__(self, input_dir, output_dir=None, tmp_dir=None):
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.tmp_dir = tmp_dir
        self.storer = Storer()
        self.name = self.__class__.__name__
        self.repok = Reporter(prefix="[%s - INFO] " % self.name)
        self.repok.new_article()
        self.reper = Reporter(prefix="[%s - ERROR] " % self.name)
        self.reper.new_article()

    def process(self):
        for cur_dir, cur_subdir, cur_files in os.walk(self.input_dir):
            for cur_file in cur_files:
                self.repok.new_article()
                self.reper.new_article()
                cur_rdf_path = cur_dir + os.sep + cur_file
                try:
                    self.repok.add_sentence("Processing '%s'" % cur_rdf_path)
                    g = self.storer.load(cur_rdf_path, tmp_dir=self.tmp_dir)
                    if self.output_dir is None:
                        self.repok.add_sentence("The RDF graph has been converted in TRIG as follows:\n%s"
                                                % g.serialize(format="trig"))
                    else:
                        if not os.path.exists(self.output_dir):
                            os.makedirs(self.output_dir)
                        output_file = self.output_dir + os.sep + "converted_" + cur_file + ".trig"
                        g.serialize(destination=output_file, format="trig")
                        self.repok.add_sentence("The RDF graph has been stored in %s" % output_file)
                except Exception:
                    self.reper.add_sentence("The file '%s' doesn't contain RDF statements" % cur_rdf_path, False)
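
A minimal sketch of how the Checker above might be driven from the command line; Reporter and Storer are assumed to come from the same package, and the option names below are illustrative rather than taken from the original script.

# Illustrative driver for the Checker class above (option names are assumptions).
import argparse

if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser("checker.py")
    arg_parser.add_argument("-i", "--input", dest="input", required=True,
                            help="The directory containing the RDF files to check.")
    arg_parser.add_argument("-o", "--output", dest="output",
                            help="The directory where to store the converted files.")
    arg_parser.add_argument("-t", "--tmp_dir", dest="tmp_dir",
                            help="The directory for easing the RDF loading.")
    args = arg_parser.parse_args()

    Checker(args.input, args.output, args.tmp_dir).process()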
Example #2
0
class Checker(object):
    def __init__(self, input_dir, output_dir=None, tmp_dir=None):
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.tmp_dir = tmp_dir
        self.storer = Storer()
        self.name = self.__class__.__name__
        self.repok = Reporter(prefix="[%s - INFO] " % self.name)
        self.repok.new_article()
        self.reper = Reporter(prefix="[%s - ERROR] " % self.name)
        self.reper.new_article()

    def process(self):
        for cur_dir, cur_subdir, cur_files in os.walk(self.input_dir):
            for cur_file in cur_files:
                self.repok.new_article()
                self.reper.new_article()
                cur_rdf_path = cur_dir + os.sep + cur_file
                try:
                    self.repok.add_sentence("Processing '%s'" % cur_rdf_path)
                    g = self.storer.load(cur_rdf_path, tmp_dir=self.tmp_dir)
                    if self.output_dir is None:
                        self.repok.add_sentence(
                            "The RDF graph has been converted in TRIG as follows:\n%s"
                            % g.serialize(format="trig"))
                    else:
                        if not os.path.exists(self.output_dir):
                            os.makedirs(self.output_dir)
                        output_file = self.output_dir + os.sep + "converted_" + cur_file + ".trig"
                        g.serialize(destination=output_file, format="trig")
                        self.repok.add_sentence(
                            "The RDF graph has been stored in %s" % output_file)
                except Exception:
                    self.reper.add_sentence(
                        "The file '%s' doesn't contain RDF statements" % cur_rdf_path, False)
Example #3
0
         last_res = res
         prov_entity = URIRef(res)
         
         if args.id:
             prov_g = load(res)
             prov_entity_g = get_entity_graph(res, prov_g)
             spec_entity = prov_entity_g.value(prov_entity, PROV.specializationOf)
             res_g = load(str(spec_entity))
             res_entity_g = get_entity_graph(spec_entity, res_g)
             for id_entity in [o for s, p, o in list(
                     res_entity_g.triples((spec_entity, DATACITE.hasIdentifier, None)))]:
                 rdf_dir, rdf_file_path = find_paths(
                     id_entity, args.base + os.sep, "https://w3id.org/oc/corpus/", 10000, 1000)
                 result.add(rdf_file_path)
         else:
             repok.add_sentence("Processing '%s'" % res)
             
             prov_g = load(res)
             spec_entity_iri = res.split("/prov/")[0]
             prov_entity_g = get_entity_graph(res, prov_g, True)
 
             generation_dates = [o for s, p, o in
                                 list(prov_entity_g.triples(
                                     (None, PROV.generatedAtTime, None)))]
             sources = [o for s, p, o in
                        list(prov_entity_g.triples((None, PROV.hadPrimarySource, None)))]
             
             # Get all identifiers creation dates and sources
             spec_entity = URIRef(spec_entity_iri)
             res_g = load(str(spec_entity))
             res_entity_g = get_entity_graph(spec_entity, res_g)
                            help="The max number of resources a file can contain.")
    arg_parser.add_argument("-t", "--tmp_dir", dest="tmp_dir",
                            help="The directory for easing the RDF loading.")
    arg_parser.add_argument("-c", "--context", dest="context", required=True,
                            help="The JSON-LD context to use.")

    args = arg_parser.parse_args()

    with open(args.context) as f:
        context_json = json.load(f)

    if do_file_exist(args.input):
        res_count = 0
        dir_count = 0
        new_dir = None
        repok.add_sentence("Organize all files in directory each containing at "
                           "most %s resources" % args.dir_split)
        new_dirs = []
        new_files = []
        while True:
            res_count += 1
            if res_count > dir_count:
                dir_count += long(args.dir_split)
                new_dir = args.input + os.sep + "n_" + str(dir_count)
                new_dirs += [new_dir]
            src_dir = args.input + os.sep + str(res_count)
            dst_dir = new_dir + os.sep + str(res_count)
            src_file = src_dir + ".json"
            if os.path.exists(src_file):
                try:
                    if os.path.exists(src_dir):
                        os.renames(src_dir, dst_dir)
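
The loop above buckets sequentially numbered resources into "n_<k>" directories, where k is the smallest multiple of dir_split greater than or equal to the resource number. A self-contained sketch of that bucketing rule, with a hypothetical helper name:

import os

def split_dir_for(res_count, dir_split, base_dir):
    # Smallest multiple of dir_split that is >= res_count, mirroring the
    # dir_count bookkeeping in the loop above (hypothetical helper).
    dir_count = ((res_count + dir_split - 1) // dir_split) * dir_split
    return base_dir + os.sep + "n_" + str(dir_count)

# With dir_split = 10000, resources 1..10000 land in 'corpus/n_10000',
# while resource 10001 goes to 'corpus/n_20000'.
print(split_dir_for(1, 10000, "corpus"))
print(split_dir_for(10001, 10000, "corpus"))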
Example #5
0
                        if "invalidated_by" in cur_graph:
                            cur_invalidated_by = cur_graph["invalidated_by"]
                            if isinstance(cur_invalidated_by, list):
                                se_generated_by += cur_invalidated_by
                            else:
                                se_generated_by += [cur_invalidated_by]
                        
                generated = sorted(list(set(generated)))
                se_generated_by = sorted(list(set(se_generated_by)))
                sen_string = item["iri"] + "se/1" + ",[%s]," % str(len(generated))
                        
                for ca_item in cur_ca["@graph"]:
                    found = False
                    for cur_ca_graph in ca_item["@graph"]:
                        if cur_ca_graph["iri"] in se_generated_by:
                            found = True
                            all_descs = cur_ca_graph["description"]
                            descs = all_descs if isinstance(all_descs, list) else [all_descs]
                            for desc in descs:
                                if "citation data and new identifiers" in desc:
                                    sen_string += "[CIT+ID]"
                                elif "citation data" in desc:
                                    sen_string += "[CIT]"
                                elif "new identifiers" in desc:
                                    sen_string += "[ID]"
                    if found:
                        rep.add_sentence(sen_string)
                        break

    rep.write_file(args.o_file)
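
The innermost branch of Example #5 tags each curatorial activity by matching phrases in its description; a standalone version of that classification, with a hypothetical function name, is sketched below. The combined phrase has to be tested first because it contains both of the other phrases.

def classify_description(desc):
    # Same substring rules as in the loop above; order matters.
    if "citation data and new identifiers" in desc:
        return "[CIT+ID]"
    elif "citation data" in desc:
        return "[CIT]"
    elif "new identifiers" in desc:
        return "[ID]"
    return ""

print(classify_description("added citation data"))  # prints [CIT]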
Example #6
0
    arg_parser.add_argument("-o", "--output", dest="output", required=True,
                            help="The output file.")
    arg_parser.add_argument("-t", "--tmp_dir", dest="tmp_dir",
                            help="The directory for easing the RDF loading.")
    arg_parser.add_argument("-c", "--context", dest="context", required=True,
                            help="The JSON-LD context to use.")

    args = arg_parser.parse_args()

    with open(args.context) as f:
        context_json = json.load(f)

    repok = Reporter(True, prefix="[create_nq.py: INFO] ")
    reperr = Reporter(True, prefix="[create_nq.py: ERROR] ")
    repok.new_article()
    reperr.new_article()

    for cur_dir, cur_subdir, cur_files in os.walk(args.input):
        with open(args.output, 'a') as f:
            for cur_file in cur_files:
                if cur_file.endswith(".json"):
                    cur_g = ConjunctiveGraph()
                    cur_g = load(cur_g, cur_dir + os.sep + cur_file, args.tmp_dir)
                    nt_strings = cur_g.serialize(format="nquads")
                    f.write(nt_strings)

    repok.add_sentence("Done.")
    if not reperr.is_empty():
        reperr.write_file("create_nq.rep.%s.err.txt" % (
            re.sub("_+", "_", re.sub("[\.%s/]" % os.sep, "_", args.input))))
Example #7
0
import io
import json
import os
import shutil
from datetime import datetime

from rdflib import ConjunctiveGraph, BNode
from SPARQLWrapper import SPARQLWrapper

# Reporter and find_paths are assumed to be provided elsewhere in the same package.


class Storer(object):

    def __init__(self, graph_set=None, repok=None, reperr=None,
                 context_map={}, dir_split=0, n_file_item=1):
        self.dir_split = dir_split
        self.n_file_item = n_file_item
        self.context_map = context_map
        for context_url in context_map:
            context_file_path = context_map[context_url]
            with open(context_file_path) as f:
                context_json = json.load(f)
                self.context_map[context_url] = context_json

        if graph_set is None:
            self.g = []
        else:
            self.g = graph_set.graphs()
        if repok is None:
            self.repok = Reporter(prefix="[Storer: INFO] ")
        else:
            self.repok = repok
        if reperr is None:
            self.reperr = Reporter(prefix="[Storer: ERROR] ")
        else:
            self.reperr = reperr
        self.preface_query = ""

    def store_all(self, base_dir, base_iri, context_path, tmp_dir=None, g_set=[], override=False):
        for g in g_set:
            self.g += [g]

        self.repok.new_article()
        self.reperr.new_article()

        self.repok.add_sentence("Starting the process")

        processed_graphs = {}
        for cur_g in self.g:
            processed_graphs = self.store(cur_g, base_dir, base_iri, context_path, tmp_dir,
                                          override, processed_graphs, False)

        stored_graph_path = []
        for cur_file_path in processed_graphs:
            stored_graph_path += [cur_file_path]
            self.__store_in_file(processed_graphs[cur_file_path], cur_file_path, context_path)

        return stored_graph_path

    def upload_and_store(self, base_dir, triplestore_url, base_iri, context_path,
                         tmp_dir=None, g_set=[], override=False):

        stored_graph_path = self.store_all(base_dir, base_iri, context_path, tmp_dir, g_set, override)

        # If some graphs were not stored properly, none of them will be uploaded to the
        # triplestore, but we highlight those that could have been added in principle by
        # marking them with a ".notuploaded" file
        if None in stored_graph_path:
            for file_path in stored_graph_path:
                # Create a marker for the file not uploaded in the triplestore
                open("%s.notuploaded" % file_path, "w").close()
                self.reperr.add_sentence("[6] "
                                         "The statements of in the JSON-LD file '%s' were not "
                                         "uploaded into the triplestore." % file_path)
        else:  # All the files have been stored
            self.upload_all(self.g, triplestore_url, base_dir)

    def __query(self, query_string, triplestore_url, n_statements=None, base_dir=None):
        if query_string != "":
            try:
                tp = SPARQLWrapper(triplestore_url)
                tp.setMethod('POST')
                tp.setQuery(query_string)
                tp.query()

                if n_statements is None:
                    self.repok.add_sentence(
                        "Triplestore updated by means of a SPARQL Update query.")
                else:
                    self.repok.add_sentence(
                        "Triplestore updated with %s more RDF statements." % n_statements)

                return True

            except Exception as e:
                self.reperr.add_sentence("[1] "
                                         "Graph was not loaded into the "
                                         "triplestore due to communication problems: %s" % str(e))
                if base_dir is not None:
                    tp_err_dir = base_dir + os.sep + "tp_err"
                    if not os.path.exists(tp_err_dir):
                        os.makedirs(tp_err_dir)
                    cur_file_err = tp_err_dir + os.sep + \
                                   datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f_not_uploaded.txt')
                    with io.open(cur_file_err, "w", encoding="utf-8") as f:
                        f.write(query_string)

        return False

    def upload_all(self, all_g, triplestore_url, base_dir):
        result = True

        self.repok.new_article()
        self.reperr.new_article()

        query_string = None
        total_new_statements = None

        # Graphs are sent to the triplestore in batches of ten, merged into a
        # single SPARQL Update request.
        for idx, cur_g in enumerate(all_g):
            cur_idx = idx % 10
            if cur_idx == 0:
                if query_string is not None:
                    result &= self.__query(query_string, triplestore_url, total_new_statements, base_dir)
                query_string = u""
                total_new_statements = 0
            else:
                query_string += u" ; "
            total_new_statements += len(cur_g)

            query_string += self.get_preface_query(cur_g) + Storer._make_insert_query(cur_g)

        if query_string is not None and query_string != "":
            result &= self.__query(query_string, triplestore_url, total_new_statements, base_dir)

        return result

    def execute_upload_query(self, query_string, triplestore_url):
        self.repok.new_article()
        self.reperr.new_article()

        return self.__query(query_string, triplestore_url)

    def upload(self, cur_g, triplestore_url):
        self.repok.new_article()
        self.reperr.new_article()

        query_string = Storer._make_insert_query(cur_g)

        return self.__query(query_string, triplestore_url, len(cur_g))

    def set_preface_query(self, query_string):
        self.preface_query = query_string

    def get_preface_query(self, cur_g):
        if self.preface_query != "":
            if type(cur_g.identifier) is BNode:
                return u"CLEAR DEFAULT ; "
            else:
                return u"WITH <%s> " % str(cur_g.identifier) + self.preface_query + " ; "
        else:
            return ""

    @staticmethod
    def _make_insert_query(cur_g):
        if type(cur_g.identifier) is BNode:
            return u"INSERT DATA { %s }" % cur_g.serialize(format="nt")
        else:
            return u"INSERT DATA { GRAPH <%s> { %s } }" % \
                   (str(cur_g.identifier), cur_g.serialize(format="nt"))
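
    # For a named graph, the INSERT DATA query built above looks roughly like the
    # following (IRIs and the literal are illustrative, not from the original code):
    #   INSERT DATA { GRAPH <https://w3id.org/oc/corpus/br/> {
    #       <https://w3id.org/oc/corpus/br/1> <http://purl.org/dc/terms/title> "A title" . } }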

    def __store_in_file(self, cur_g, cur_file_path, context_path):
        cur_json_ld = json.loads(
            cur_g.serialize(format="json-ld", context=self.__get_context(context_path)))

        if isinstance(cur_json_ld, dict):
            cur_json_ld["@context"] = context_path
        else:  # it is a list
            for item in cur_json_ld:
                item["@context"] = context_path

        with open(cur_file_path, "w") as f:
            json.dump(cur_json_ld, f, indent=4)

        self.repok.add_sentence("File '%s' added." % cur_file_path)

    def store(self, cur_g, base_dir, base_iri, context_path, tmp_dir=None,
              override=False, already_processed={}, store_now=True):
        self.repok.new_article()
        self.reperr.new_article()

        if len(cur_g) > 0:
            cur_subject = set(cur_g.subjects(None, None)).pop()
            cur_dir_path, cur_file_path = find_paths(
                str(cur_subject), base_dir, base_iri, self.dir_split, self.n_file_item)

            try:
                if not os.path.exists(cur_dir_path):
                    os.makedirs(cur_dir_path)

                final_g = ConjunctiveGraph()
                final_g.addN([item + (cur_g.identifier,) for item in list(cur_g)])

                # Merging the data
                if not override:
                    if cur_file_path in already_processed:
                        stored_g = already_processed[cur_file_path]
                        stored_g.addN(final_g.quads((None, None, None, None)))
                        final_g = stored_g
                    elif os.path.exists(cur_file_path):
                        # This is a conjunctive graph that contains all the triples
                        # (and graphs) the file actually defines; they could be more
                        # than those having 'cur_subject' as their subject.
                        final_g = self.load(cur_file_path, cur_g, tmp_dir)

                already_processed[cur_file_path] = final_g

                if store_now:
                    self.__store_in_file(final_g, cur_file_path, context_path)

                return already_processed
            except Exception as e:
                self.reperr.add_sentence("[5] It was impossible to store the RDF statements in %s. %s" %
                                         (cur_file_path, str(e)))

        return None

    def __get_context(self, context_url):
        if context_url in self.context_map:
            return self.context_map[context_url]
        else:
            return context_url

    def __get_first_context(self):
        for context_url in self.context_map:
            return self.context_map[context_url]

    def load(self, rdf_file_path, cur_graph=None, tmp_dir=None):
        self.repok.new_article()
        self.reperr.new_article()

        if os.path.isfile(rdf_file_path):
            try:
                cur_graph = self.__load_graph(rdf_file_path, cur_graph)
            except IOError:
                if tmp_dir is not None:
                    current_file_path = tmp_dir + os.sep + "tmp_rdf_file.rdf"
                    shutil.copyfile(rdf_file_path, current_file_path)
                    try:
                        cur_graph = self.__load_graph(current_file_path, cur_graph)
                    except IOError as e:
                        self.reperr.add_sentence("[2] "
                                                 "It was impossible to handle the format used for "
                                                 "storing the file (stored in the temporary path) '%s'. "
                                                 "Additional details: %s"
                                                 % (current_file_path, str(e)))
                    os.remove(current_file_path)
                else:
                    self.reperr.add_sentence("[3] "
                                             "It was impossible to try to load the file from the "
                                             "temporary path '%s' since that has not been specified in "
                                             "advance" % rdf_file_path)
        else:
            self.reperr.add_sentence("[4] "
                                     "The file specified ('%s') doesn't exist."
                                     % rdf_file_path)

        return cur_graph

    def __load_graph(self, file_path, cur_graph=None):
        formats = ["json-ld", "rdfxml", "turtle", "trig"]

        current_graph = ConjunctiveGraph()

        if cur_graph is not None:
            current_graph.parse(data=cur_graph.serialize(format="trig"), format="trig")

        errors = ""
        for cur_format in formats:
            try:
                if cur_format == "json-ld":
                    with open(file_path) as f:
                        json_ld_file = json.load(f)
                        if isinstance(json_ld_file, dict):
                            json_ld_file = [json_ld_file]

                        for json_ld_resource in json_ld_file:
                            # Trick to force the use of a pre-loaded context if the format
                            # specified is JSON-LD
                            context_json = None
                            if "@context" in json_ld_resource:
                                cur_context = json_ld_resource["@context"]
                                if cur_context in self.context_map:
                                    context_json = self.__get_context(cur_context)["@context"]
                                    json_ld_resource["@context"] = context_json

                            current_graph.parse(data=json.dumps(json_ld_resource), format=cur_format)
                else:
                    current_graph.parse(file_path, format=cur_format)

                return current_graph
            except Exception as e:
                errors = " | " + str(e)  # Try another format

        raise IOError("1", "It was impossible to handle the format used for storing the file '%s'%s" %
                      (file_path, errors))
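
A minimal usage sketch of the Storer above: it persists one small named graph to disk and uploads it to a SPARQL endpoint. The IRIs, paths, endpoint URL and the dir_split/n_file_item values are illustrative assumptions, and find_paths is expected to be available from the surrounding package.

# Illustrative usage of the Storer class above (all concrete values are assumptions).
import os
from rdflib import ConjunctiveGraph, URIRef, Literal, RDFS

g = ConjunctiveGraph(identifier=URIRef("https://w3id.org/oc/corpus/br/"))
g.add((URIRef("https://w3id.org/oc/corpus/br/1"),
       RDFS.label, Literal("A bibliographic resource")))

storer = Storer(dir_split=10000, n_file_item=1000)
# Writes the graph as JSON-LD under 'corpus/'; find_paths decides the exact file path.
storer.store(g, "corpus" + os.sep, "https://w3id.org/oc/corpus/",
             "https://w3id.org/oc/corpus/context.json")
# Pushes the same statements to a (hypothetical) local SPARQL Update endpoint.
storer.upload(g, "http://localhost:9999/blazegraph/sparql")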