Ejemplo n.º 1
0
    def __init__(self,
                 base_iri,
                 context_base,
                 info_dir,
                 entries,
                 n_file_item,
                 supplier_prefix,
                 agent_id=None):
        """Set up the processor state from an optional metadata container.

        Every metadata attribute starts as None and is overwritten only
        when the key of the same name is present in `entries`. The "doi"
        and "url" values are lower-cased before being stored.

        :param base_iri: forwarded to GraphSet (first argument).
        :param context_base: forwarded to GraphSet (second argument).
        :param info_dir: forwarded to GraphSet (third argument).
        :param entries: container supporting `in` and key lookup, or None
            to leave every metadata attribute unset.
        :param n_file_item: forwarded to GraphSet (fourth argument).
        :param supplier_prefix: forwarded to GraphSet (fifth argument).
        :param agent_id: stored as-is in `self.id`.
        """
        # Keys copied verbatim into the attribute of the same name; the
        # order matches the order in which they are looked up.
        copied_keys = ("occ", "doi", "pmid", "pmcid", "url",
                       "curator", "source", "source_provider")
        # Subset of the keys above whose value is lower-cased first.
        lowered_keys = ("doi", "url")

        for field in copied_keys + ("entries", "reference_pointers"):
            setattr(self, field, None)

        if entries is not None:
            for field in copied_keys:
                if field in entries:
                    value = entries[field]
                    if field in lowered_keys:
                        value = value.lower()
                    setattr(self, field, value)

            # "reference_pointers" is only read when "references" exists.
            if "references" in entries:
                self.entries = entries["references"]
                if "reference_pointers" in entries:
                    self.reference_pointers = entries["reference_pointers"]

        self.name = "SPACIN " + self.__class__.__name__
        # The graph set is created with wanted_label=False.
        self.g_set = GraphSet(base_iri, context_base, info_dir,
                              n_file_item, supplier_prefix,
                              wanted_label=False)
        self.id = agent_id

        # One reporter for informational messages, one for errors; each
        # one gets new_article() called right after creation.
        info_prefix = "[%s - INFO] " % self.name
        self.repok = Reporter(prefix=info_prefix)
        self.repok.new_article()
        error_prefix = "[%s - ERROR] " % self.name
        self.reperr = Reporter(prefix=error_prefix)
        self.reperr.new_article()
Ejemplo n.º 2
0
 def __init__(self, graph_set=None, orcid_finder=None, resource_finder=None):
     """Initialise the handler with its graph set and optional finders.

     :param graph_set: object stored in `self.g_set`. When omitted (or
         None) a new GraphSet is created for this instance with the
         local test settings below. It used to be created once, when the
         function was defined, and was therefore shared by all the
         instances relying on the default.
     :param orcid_finder: stored as-is in `self.of`.
     :param resource_finder: stored as-is in `self.rf`.
     """
     if graph_set is None:
         # Built per call so that instances never share one default
         # graph set and nothing is constructed at import time.
         graph_set = GraphSet("http://localhost:8000/corpus/", "corpus/context.json",
                              "test/share/id-counter/_/", 1000, "")
     self.name = "Crossref Data Handler"
     self.id = "Crossref"
     self.of = orcid_finder
     self.rf = resource_finder
     self.g_set = graph_set
Ejemplo n.º 3
0
            g.remove((s, p, o))

        resource_to_remove = set()
        resource_to_remove.add(URIRef(item))

        while len(resource_to_remove):
            res = resource_to_remove.pop()
            for s, p, o in g.triples((res, None, None)):
                g.remove((s, p, o))
                if type(o) is URIRef and "/br/" not in str(o):
                    resource_to_remove.add(o)

    full_info_dir = info_dir + args.prefix + sep

    print("Generate data compliant with the OCDM.")
    gs = GraphSet(base_iri, context_path)
    entity_count = 1000
    counter = 0
    for s in g.subjects():
        if counter == entity_count:
            store_all(gs)
            counter = 0
            gs = GraphSet(base_iri, context_path)

        with open(args.done, "a") as f:
            s_string = str(s)
            if s_string not in done:
                entity = None
                if "/ar/" in s_string:
                    entity = gs.add_ar(agent_name,
                                       source_agent=args.source_agent,
Ejemplo n.º 4
0
            if found:
                to_remove[URIRef(base_iri + sub("^g(..):(.+)$", "\\1/\\2", br))] = \
                    [URIRef(iden) for iden in
                     [base_iri + sub("^g(..):(.+)$", "\\1/\\2", r_id["r"]) for r_id in id_list]]

        s = Storer(context_map={context_path: context_file_path},
                   dir_split=dir_split_number,
                   n_file_item=items_per_file,
                   default_dir=default_dir)

        for full_info_dir in info_dirs:
            br_iri = []
            br_files = {}
            id_files = {}
            update_br = GraphSet(base_iri, context_path)
            remove_id = GraphSet(base_iri, context_path)

            print("\n\nSupplier directory '%s'" % full_info_dir)
            to_remove = info_dirs[full_info_dir]
            br_counter = 0

            for br in to_remove:
                if br_counter == 10:  # Write everything on disk
                    br_counter = 0
                    write_to_disk(update_br, remove_id, full_info_dir, br_iri)
                    # Initialize all the variables again
                    br_iri = []
                    br_files = {}
                    id_files = {}
                    update_br = GraphSet(base_iri, context_path)
Ejemplo n.º 5
0
class FormatProcessor(object):
    """This class is the abstract one for any kind of processors.

    It stores the metadata found in `entries`, creates the GraphSet
    returned by `graph_set()` plus two Reporter objects (info and error),
    and offers static helpers to pull a DOI or a URL out of a string.
    """

    # The patterns are raw strings so that every backslash reaches the
    # regex engine unchanged, without relying on Python keeping unknown
    # string escapes such as "\." (which triggers a warning).
    # NOTE: the "A-z" range also covers the ASCII characters between "Z"
    # and "a" ([ \ ] ^ _ `). It is kept unchanged on purpose: extract_url
    # strips the backslashes that http_pattern is able to capture.
    #
    # Group 1 is the DOI; only identifiers starting with "10." match, and
    # one character outside "A-z", "0-9" and "." must come before it.
    doi_pattern = r'[^A-z0-9\.](10\.[0-9]+(\.[0-9]+)*/[^%"# \?<>{}\^\[\]`\|\\+]+)'
    # Group 1 is the whole URL, scheme included.
    http_pattern = r"(https?://([A-z]|[0-9]|%|&|\?|/|\.|_|~|-|:)+)"

    def __init__(self,
                 base_iri,
                 context_base,
                 info_dir,
                 entries,
                 n_file_item,
                 supplier_prefix,
                 agent_id=None):
        """Store the metadata in `entries` and create graph set and reporters.

        :param base_iri: forwarded to GraphSet (first argument).
        :param context_base: forwarded to GraphSet (second argument).
        :param info_dir: forwarded to GraphSet (third argument).
        :param entries: container supporting `in` and key lookup, or None
            to leave every metadata attribute set to None.
        :param n_file_item: forwarded to GraphSet (fourth argument).
        :param supplier_prefix: forwarded to GraphSet (fifth argument).
        :param agent_id: stored as-is in `self.id`.
        """
        self.occ = None
        self.doi = None
        self.pmid = None
        self.pmcid = None
        self.url = None
        self.curator = None
        self.source = None
        self.source_provider = None
        self.entries = None
        self.reference_pointers = None

        if entries is not None:
            if "occ" in entries:
                self.occ = entries["occ"]
            if "doi" in entries:
                self.doi = entries["doi"].lower()
            if "pmid" in entries:
                self.pmid = entries["pmid"]
            if "pmcid" in entries:
                self.pmcid = entries["pmcid"]
            if "url" in entries:
                self.url = entries["url"].lower()
            if "curator" in entries:
                self.curator = entries["curator"]
            if "source" in entries:
                self.source = entries["source"]
            if "source_provider" in entries:
                self.source_provider = entries["source_provider"]
            if "references" in entries:
                self.entries = entries["references"]
                # Pointers are only read when references are present.
                if "reference_pointers" in entries:
                    self.reference_pointers = entries["reference_pointers"]

        self.name = "SPACIN " + self.__class__.__name__
        self.g_set = GraphSet(base_iri,
                              context_base,
                              info_dir,
                              n_file_item,
                              supplier_prefix,
                              wanted_label=False)  # added no label param
        self.id = agent_id
        self.repok = Reporter(prefix="[%s - INFO] " % self.name)
        self.repok.new_article()
        self.reperr = Reporter(prefix="[%s - ERROR] " % self.name)
        self.reperr.new_article()

    def process(self):
        """Do nothing here; the actual processing lives in the subclasses."""
        pass  # Implemented in the subclasses

    def graph_set(self):
        """Return the GraphSet created by the constructor."""
        return self.g_set

    def graphs(self):
        """Return whatever the `graphs()` method of the graph set returns."""
        return self.g_set.graphs()

    def message(self, mess):
        """Return `mess` formatted through the "%s" placeholder."""
        return "%s" % mess

    @staticmethod
    def clean_entry(entry):
        """Replace every ":" in `entry` with ",", then apply `sa` and `quote`."""
        return quote(sa(re.sub(":", ",", entry)))

    @staticmethod
    def extract_data(string, pattern):
        """Return group 1 of the first match of `pattern` in `string`.

        None is returned when `string` is None or nothing matches.
        """
        if string is not None:
            result = re.search(pattern, string)
            if result:
                return result.group(1)

    @staticmethod
    def extract_doi(string):
        """Return the first DOI found in `string`, or None.

        One trailing "." or "," is dropped from the extracted value. As
        `doi_pattern` needs a character before the DOI, a DOI placed at
        the very beginning of `string` is not found.
        """
        if string is not None:
            result = FormatProcessor.extract_data(string,
                                                  FormatProcessor.doi_pattern)
            if result:
                result = re.sub(r"(\.|,)?$", "", result)

            return result

    @staticmethod
    def extract_url(string):
        """Return the first http(s) URL found in `string`, or None.

        A trailing "/", "." or "/." is dropped, then every backslash is
        removed from the extracted value.
        """
        if string is not None:
            result = FormatProcessor.extract_data(string,
                                                  FormatProcessor.http_pattern)
            if result:
                result = re.sub(r"\\", "", re.sub(r"/?\.?$", "", result))

            return result
Ejemplo n.º 6
0
 def __new_ge(self):
     """Return a GraphEntity built on a new Graph and a new GraphSet.

     The entity receives `self.fe` as its second argument; the graph set
     is created from the module-level settings with an empty string as
     last argument.
     """
     # Same evaluation order as a single nested call: graph, `fe`, set.
     empty_graph = Graph()
     second_arg = self.fe
     fresh_set = GraphSet(base_iri, context_path, info_dir, items_per_file, "")
     return GraphEntity(empty_graph, second_arg, g_set=fresh_set)