Example #1
0
def load(rdf_iri_string, tmp_dir=None):
    res_dir, rdf_file_path = \
        find_paths(rdf_iri_string, args.base + os.sep, "https://w3id.org/oc/corpus/", 10000, 1000)
    
    cur_graph = None

    if os.path.isfile(rdf_file_path):
        try:
            cur_graph = __load_graph(rdf_file_path)
        except IOError:
            if tmp_dir is not None:
                current_file_path = tmp_dir + os.sep + "tmp_rdf_file.rdf"
                shutil.copyfile(rdf_file_path, current_file_path)
                try:
                    cur_graph = __load_graph(current_file_path)
                except IOError as e:
                    reperr.add_sentence("[2] "
                                        "It was impossible to handle the format used for "
                                        "storing the file (stored in the temporary path) '%s'. "
                                        "Additional details: %s"
                                        % (current_file_path, str(e)))
                # Remove the temporary copy whether or not the load succeeded
                os.remove(current_file_path)
            else:
                reperr.add_sentence("[3] "
                                    "It was impossible to try to load the file from the "
                                    "temporary path '%s' since that has not been specified in "
                                    "advance" % rdf_file_path)
    else:
        reperr.add_sentence("[4] "
                            "The file specified ('%s') doesn't exist."
                            % rdf_file_path)

    return cur_graph
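A minimal usage sketch for the load function above; the IRI and temporary directory are hypothetical, and the module-level names it relies on (args, reperr, find_paths, __load_graph) are assumed to be defined as in Example #1.

# Hedged usage sketch: the IRI and tmp_dir values are made up for illustration.
cur_graph = load("https://w3id.org/oc/corpus/br/1", tmp_dir="/tmp/oc_tmp")
if cur_graph is not None:
    print("Loaded %d triples" % len(cur_graph))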
Example #2
0
def __store_graph(cur_g, rdf_iri_string, d_dir):
    # The destination paths are computed before the 'try' so that 'dest_file'
    # is always defined when the error handler below builds its message.
    res_dir, dest_file = \
        find_paths(rdf_iri_string, args.base + os.sep, "https://w3id.org/oc/corpus/", 10000, 1000)

    try:
        
        dest_dir = res_dir.replace(args.base + os.sep, d_dir + os.sep)
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
        
        cur_file = dest_file.replace(res_dir, dest_dir)
        if os.path.exists(cur_file):
            c_graph = __load_graph(cur_file)
        else:
            c_graph = ConjunctiveGraph()

        c_graph.remove_context(c_graph.get_context(cur_g.identifier))
        c_graph.addN([item + (cur_g.identifier,) for item in list(cur_g)])
        
        with open(cur_file, "w") as f:
            cur_json_ld = json.loads(c_graph.serialize(format="json-ld", context=context_json))
            cur_json_ld["@context"] = context_path
            json.dump(cur_json_ld, f, indent=4)
        # repok.add_sentence("File '%s' added." % cur_file)
        return dest_file
    except Exception as e:
        reperr.add_sentence("[5] It was impossible to store the RDF statements in %s. %s" %
                            (dest_file, str(e)))
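The core of __store_graph is the merge step on the ConjunctiveGraph: any named graph already stored under the incoming graph's identifier is dropped, and the incoming triples are re-added under that identifier. Below is a self-contained sketch of the same idiom with rdflib, written with get_context()/add() rather than the addN() call used above; the identifier and triple are hypothetical.

from rdflib import ConjunctiveGraph, Graph, Literal, RDFS, URIRef

graph_id = URIRef("https://w3id.org/oc/corpus/br/")  # hypothetical graph identifier
incoming = Graph(identifier=graph_id)
incoming.add((URIRef("https://w3id.org/oc/corpus/br/1"), RDFS.label, Literal("example")))

store = ConjunctiveGraph()
# Drop whatever was previously stored under this identifier...
store.remove_context(store.get_context(graph_id))
# ...then re-add the incoming triples as the named graph with that identifier.
target = store.get_context(graph_id)
for triple in incoming:
    target.add(triple)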
Example #3
0
    def _add_prov(self, short_name, prov_type, res, resp_agent, prov_subject=None):
        if prov_subject is None:
            g_prov = self.base_iri + "prov/"
            prov_info_path = g_prov.replace(self.base_iri, self.info_dir) + short_name + ".txt"
        else:
            g_prov = str(prov_subject) + "/prov/"
            res_file_path = \
                find_paths(str(prov_subject), self.info_dir, self.base_iri,
                           dir_split_number, items_per_file)[1][:-5]
            prov_info_path = res_file_path + "/prov/" + short_name + ".txt"
        return self._add(g_prov, prov_type, res, resp_agent, None, None,
                         prov_info_path, short_name, [] if prov_subject is None else [prov_subject])
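For illustration, a hedged walk-through of the two branches above with hypothetical values; the shape of the path returned by find_paths is assumed here, not taken from the actual implementation.

import os

base_iri = "https://w3id.org/oc/corpus/"   # hypothetical stand-in for self.base_iri
info_dir = "corpus" + os.sep               # ... and for self.info_dir
short_name = "se"                          # hypothetical provenance short name

# Branch 1: prov_subject is None -> the provenance info file sits directly under info_dir.
g_prov = base_iri + "prov/"
prov_info_path = g_prov.replace(base_iri, info_dir) + short_name + ".txt"
# e.g. "corpus/prov/se.txt"

# Branch 2: prov_subject given -> the ".json" suffix of the subject's file path
# (find_paths(...)[1], assumed here to look like "corpus/br/10000.json") is
# stripped and the provenance file is nested underneath it.
res_file_path = (info_dir + "br" + os.sep + "10000.json")[:-5]
prov_info_path = res_file_path + "/prov/" + short_name + ".txt"
# e.g. "corpus/br/10000/prov/se.txt"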
Example #4
0
    def _add_prov(self, short_name, prov_type, res, resp_agent, prov_subject=None):
        if prov_subject is None:
            g_prov = self.base_iri + "prov/"
            prov_info_path = g_prov.replace(self.base_iri, self.info_dir) + short_name + ".txt"
        else:
            g_prov = str(prov_subject) + "/prov/"
            res_file_path = \
                find_paths(str(prov_subject), self.info_dir, self.base_iri,
                           dir_split_number, items_per_file)[1][:-5]
            prov_info_path = res_file_path + "/prov/" + short_name + ".txt"
        return self._add(g_prov, prov_type, res, resp_agent, None, None,
                         prov_info_path, short_name, [] if prov_subject is None else [prov_subject])
Example #5
0
    def store(self, cur_g, base_dir, base_iri, context_path, tmp_dir=None,
              override=False, already_processed={}, store_now=True):
        self.repok.new_article()
        self.reperr.new_article()

        if len(cur_g) > 0:
            cur_subject = set(cur_g.subjects(None, None)).pop()
            cur_dir_path, cur_file_path = find_paths(
                str(cur_subject), base_dir, base_iri, self.dir_split, self.n_file_item)

            try:
                if not os.path.exists(cur_dir_path):
                    os.makedirs(cur_dir_path)

                final_g = ConjunctiveGraph()
                final_g.addN([item + (cur_g.identifier,) for item in list(cur_g)])

                # Merging the data
                if not override:
                    if cur_file_path in already_processed:
                        stored_g = already_processed[cur_file_path]
                        stored_g.addN(final_g.quads((None, None, None, None)))
                        final_g = stored_g
                    elif os.path.exists(cur_file_path):
                        # This is a conjunctive graph that contains all the triples (and graphs)
                        # that the file actually defines - they could be more than those using
                        # 'cur_subject' as subject.
                        final_g = self.load(cur_file_path, cur_g, tmp_dir)

                already_processed[cur_file_path] = final_g

                if store_now:
                    self.__store_in_file(final_g, cur_file_path, context_path)

                return already_processed
            except Exception as e:
                self.reperr.add_sentence("[5] It was impossible to store the RDF statements in %s. %s" %
                                         (cur_file_path, str(e)))

        return None
Example #6
0
    def add_prov_triples_in_filesystem(self, res_iri, prov_entity_type=None):
        if self.base_dir is not None and self.base_iri is not None:
            cur_file_path = find_paths(res_iri, self.base_dir, self.base_iri,
                                       dir_split_number, items_per_file)[1]
            if cur_file_path.endswith("index.json"):
                cur_path = cur_file_path.replace("index.json", "") + "prov"
            else:
                cur_path = cur_file_path[:-5] + os.sep + "prov"

            file_list = []
            if os.path.isdir(cur_path):
                for cur_dir, cur_subdir, cur_files in os.walk(cur_path):
                    for cur_file in cur_files:
                        if cur_file.endswith(".json") and \
                           (prov_entity_type is None or cur_file.startswith(prov_entity_type)):
                            file_list += [cur_dir + os.sep + cur_file]

            for file_path in file_list:
                if file_path not in self.loaded:
                    self.loaded.add(file_path)
                    cur_g = self.storer.load(file_path, tmp_dir=self.tmp_dir)
                    self.add_triples_in_graph(cur_g)
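The directory scan inside this method is a reusable pattern on its own; here is a self-contained sketch, with hypothetical names, that collects every ".json" file under a directory and optionally keeps only those whose filename starts with a given prefix.

import os

def list_prov_files(prov_dir, prefix=None):
    # Walk 'prov_dir' recursively; keep the ".json" files, optionally only those
    # whose name starts with 'prefix' (e.g. a provenance entity type).
    found = []
    if os.path.isdir(prov_dir):
        for cur_dir, _, cur_files in os.walk(prov_dir):
            for cur_file in cur_files:
                if cur_file.endswith(".json") and (prefix is None or cur_file.startswith(prefix)):
                    found.append(os.path.join(cur_dir, cur_file))
    return found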
Example #7
0
    def add_prov_triples_in_filesystem(self, res_iri, prov_entity_type=None):
        if self.base_dir is not None and self.base_iri is not None:
            cur_file_path = find_paths(res_iri, self.base_dir, self.base_iri,
                                       dir_split_number, items_per_file)[1]
            if cur_file_path.endswith("index.json"):
                cur_path = cur_file_path.replace("index.json", "") + "prov"
            else:
                cur_path = cur_file_path[:-5] + os.sep + "prov"

            file_list = []
            if os.path.isdir(cur_path):
                for cur_dir, cur_subdir, cur_files in os.walk(cur_path):
                    for cur_file in cur_files:
                        if cur_file.endswith(".json") and \
                           (prov_entity_type is None or cur_file.startswith(prov_entity_type)):
                            file_list += [cur_dir + os.sep + cur_file]

            for file_path in file_list:
                if file_path not in self.loaded:
                    self.loaded.add(file_path)
                    cur_g = self.storer.load(file_path, tmp_dir=self.tmp_dir)
                    self.add_triples_in_graph(cur_g)
Example #8
0
with open(args.context) as f, open(args.input) as g:
    context_json = json.load(f)
    csv_reader = csv.reader(g)
    for res, n_mod, m_type in csv_reader:
        last_res = res
        prov_entity = URIRef(res)

        if args.id:
            prov_g = load(res)
            prov_entity_g = get_entity_graph(res, prov_g)
            spec_entity = prov_entity_g.value(prov_entity, PROV.specializationOf)
            res_g = load(str(spec_entity))
            res_entity_g = get_entity_graph(spec_entity, res_g)
            for id_entity in [o for s, p, o in list(
                    res_entity_g.triples((spec_entity, DATACITE.hasIdentifier, None)))]:
                rdf_dir, rdf_file_path = find_paths(
                    id_entity, args.base + os.sep, "https://w3id.org/oc/corpus/", 10000, 1000)
                result.add(rdf_file_path)
        else:
            repok.add_sentence("Processing '%s'" % res)

            prov_g = load(res)
            spec_entity_iri = res.split("/prov/")[0]
            prov_entity_g = get_entity_graph(res, prov_g, True)

            generation_dates = [o for s, p, o in
                                list(prov_entity_g.triples(
                                    (None, PROV.generatedAtTime, None)))]
            sources = [o for s, p, o in
                       list(prov_entity_g.triples((None, PROV.hadPrimarySource, None)))]

            # Get all identifiers' creation dates and sources