Example #1
    def _process_action(self, action):
        """Dispatch one bulk API action: purge orphans, disable or drop a dataset, or index a record."""
        try:
            self.spec = action["dataset"]
            process_verb = action["action"]
            record = None
            if process_verb in ["clear_orphans"]:
                purge_date = action.get("modification_date")
                if purge_date:
                    orphans_removed = RDFRecord.remove_orphans(spec=self.spec, timestamp=purge_date)
                    logger.info("Deleted {} orphans for {} before {}".format(orphans_removed, self.spec, purge_date))
            elif process_verb in ["disable_index"]:
                RDFRecord.delete_from_index(self.spec)
                logger.info("Deleted dataset {} from index. ".format(self.spec))
            elif process_verb in ["drop_dataset"]:
                RDFRecord.delete_from_index(self.spec)
                DataSet.objects.filter(spec=self.spec).delete()
                logger.info("Deleted dataset {} from index. ".format(self.spec))
            else:
                record_graph_uri = action["graphUri"]
                graph_ntriples = action["graph"]
                acceptance_mode = action.get("acceptanceMode", "false")
                acceptance = True if acceptance_mode is not None and acceptance_mode.lower() in ["true"] else False
                content_hash = action.get("contentHash", None)
                from lod.utils.resolver import ElasticSearchRDFRecord

                record = ElasticSearchRDFRecord(spec=self.spec, rdf_string=graph_ntriples)
                try:
                    # Parse as RDF/XML when the payload looks like an RDF/XML document.
                    rdf_format = "xml" if "<rdf:RDF" in graph_ntriples else record.DEFAULT_RDF_FORMAT
                    record.from_rdf_string(
                        rdf_string=graph_ntriples, named_graph=record_graph_uri, input_format=rdf_format
                    )
                except ParseError as e:
                    self.rdf_errors.append((e, action))
                    logger.error("Unable to parse RDF for action %s: %s", action, e)
                    return None
                self.records_stored += 1
                # Queue the Elasticsearch bulk action, keyed on (hub_id, content_hash).
                self.es_actions[(record.hub_id, content_hash)] = record.create_es_action(
                    action=process_verb,
                    store=self.store,
                    context=True,
                    flat=True,
                    exclude_fields=None,
                    acceptance=acceptance,
                    doc_type="void_edmrecord",
                    record_type="mdr",
                    content_hash=content_hash,
                )
                # When triples are stored as well, queue the matching SPARQL update query.
                if settings.RDF_STORE_TRIPLES:
                    self.sparql_update_queries[(record.hub_id, content_hash)] = record.create_sparql_update_query(
                        acceptance=acceptance
                    )
            return record
        except KeyError as ke:
            self.json_errors.append((ke, action))
            self.records_with_errors += 1
            return None
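
For orientation, here is a minimal sketch of the kind of action payload this method consumes. Only the key names are grounded in the code above; the dataset name, URIs, hash value, and the processor variable are illustrative assumptions, not taken from the source.

    # Hypothetical payload; only the key names come from _process_action above.
    action = {
        "action": "index",
        "dataset": "museum-collection",              # assumed spec name
        "graphUri": "http://example.org/graph/123",  # assumed named graph URI
        "graph": '<http://example.org/r/123> <http://purl.org/dc/elements/1.1/title> "A title" .',
        "acceptanceMode": "false",
        "contentHash": "0c1f0a",                     # assumed content hash
    }
    record = processor._process_action(action)  # processor: an assumed instance of this class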
Example #2
    def process_narthex_file(self, spec, store=None, acceptance=False, path=None, console=False):
        """Stream a Narthex processed file, indexing each record and syncing the triple store."""

        start = datetime.now()

        if not store:
            store = rdfstore.get_rdfstore()

        if not path:
            processed_fname = self.get_narthex_processed_fname()
        else:
            processed_fname = path
        print("started processing {} for dataset {}".format(processed_fname, spec))

        with open(processed_fname, 'r') as f:
            rdf_record = []
            lines = 0
            records = 0
            stored = 0
            new = 0
            not_orphaned = []
            sparql_update_queries = []
            es_actions = []
            # Records touched in this run get a fresh timestamp; anything untouched
            # is removed as an orphan after the loop (see remove_orphans below).

            for line in f:
                lines += 1
                # Marker lines delimit individual records and carry the named graph and content hash.
                exists, named_graph, content_hash = self.is_line_marker(line)
                if exists:
                    new += 1
                    records += 1
                    triples = " ".join(rdf_record)
                    record = ElasticSearchRDFRecord(rdf_string=triples, spec=spec)
                    try:
                        record.from_rdf_string(named_graph=named_graph, rdf_string=triples, input_format="xml")
                        es_actions.append(
                            record.create_es_action(doc_type="void_edmrecord", record_type="mdr", context=True)
                        )
                    except Exception as ex:
                        if console:
                            print("problem with {} for spec {} caused by {}".format(triples, spec, ex))
                        else:
                            logger.error("problem with {} for spec {} caused by {}".format(triples, spec, ex))
                    rdf_record[:] = []  # reset the line buffer for the next record
                    if settings.RDF_STORE_TRIPLES:
                        sparql_update_queries.append(
                            record.create_sparql_update_query(acceptance=acceptance)
                        )
                    nr_sparql_updates = len(sparql_update_queries)
                    # Flush queued SPARQL updates to the triple store in batches of 50.
                    if settings.RDF_STORE_TRIPLES and nr_sparql_updates > 0 and nr_sparql_updates % 50 == 0:
                        store.update("\n".join(sparql_update_queries))
                        sparql_update_queries[:] = []
                    if records % 100 == 0 and records > 0:
                        logger.info("processed {} records of {} at {}".format(records, spec, ctime()))
                        if console:
                            print("processed {} records of {} at {}".format(records, spec, ctime()))
                        if len(es_actions) > 100:
                            self.bulk_index(es_actions, spec)
                            es_actions[:] = []
                else:
                    rdf_record.append(line)
            # store the remaining bulk items
            self.bulk_index(es_actions, spec)
            if settings.RDF_STORE_TRIPLES and len(sparql_update_queries) > 0:
                store.update("\n".join(sparql_update_queries))
            logger.info(
                "Dataset {}: records inserted {}, records same content hash {}, lines parsed {}, total records processed {}".format(
                    spec, new, stored, lines, records)
            )
            print("Finished loading {spec} with {lines} and {records} in {seconds}\n".format(
                spec=spec,
                lines=lines,
                records=records,
                seconds=datetime.now() - start
            ))

            # Purge records not refreshed in this run (timestamp before start) as orphans.
            RDFRecord.remove_orphans(spec, start.isoformat())
            return lines, records
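
And a hedged usage sketch for this loader. The keyword arguments are the ones defined above; the class name, spec, and file path are assumptions made for illustration.

    # Hypothetical driver; NarthexBulkLoader, the spec, and the path are assumed names.
    loader = NarthexBulkLoader()
    lines, records = loader.process_narthex_file(
        spec="museum-collection",                      # assumed dataset spec
        acceptance=False,
        path="/data/museum-collection/processed.xml",  # assumed file location
        console=True,                                  # echo progress to stdout as well as the log
    )
    print("parsed {} lines into {} records".format(lines, records))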