def bulk_index(self, es_actions):
    """Bulk index the given ElasticSearch actions and log the outcome."""
    logger.debug(self.es_actions)
    nr, errors = helpers.bulk(get_es(), es_actions)
    if nr > 0 and not errors:
        logger.info("Indexed records: {}".format(nr))
        return True
    elif errors:
        logger.error("Something went wrong with bulk index: {}".format(errors))
        return False
    return False
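
# A minimal sketch of the action dicts that helpers.bulk() consumes, assuming the
# standard elasticsearch-py bulk action format (the field values are illustrative,
# not taken from this module):
#
#   {
#       "_op_type": "index",
#       "_index": settings.SITE_NAME,
#       "_id": "record-id",
#       "_source": {"system": {"content_hash": "abc123"}},
#   }
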
def diff_by_content_hash(self):
    """Diff the scheduled actions against the index on (id, content_hash).

    Returns only the actions and SPARQL updates for records whose content
    hash is new or has changed since the last indexing run.
    """
    ids = [{"_id": key[0]} for key in self.es_actions.keys()]
    mget_ids = get_es().mget(
        body={"docs": ids},
        index=settings.SITE_NAME,
        _source_include=["system.content_hash"]
    )
    index_sets = {
        (doc.get("_id"), doc["_source"].get("system", {"content_hash": None}).get("content_hash"))
        for doc in mget_ids.get("docs") if doc["found"]
    }
    new_records = set(self.es_actions.keys()).difference(index_sets)
    self.records_stored = len(new_records)
    self.records_already_stored = len(ids) - self.records_stored
    es_actions = [es_action for k, es_action in self.es_actions.items() if k in new_records]
    sparql_updates = [sparql_update for k, sparql_update in self.sparql_update_queries.items()
                      if k in new_records]
    return es_actions, sparql_updates
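
# Sketch of the set difference performed above, with made-up values: a record is
# only re-indexed when its (id, content_hash) pair is absent from the index.
#
#   local = {("id-1", "hash-a"), ("id-2", "hash-b")}
#   indexed = {("id-1", "hash-a"), ("id-2", "hash-old")}
#   local.difference(indexed)  # -> {("id-2", "hash-b")}: only id-2 is re-indexed
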
def reindex_dataset(ds, acceptance=False):
    """ Reindex the dataset for elasticsearch from the edm_record cache.

    :param acceptance:
    :param ds: the DataSet
    :return: records processed
    """
    ds.processed_records = 0
    ds.save()

    def process_records():
        for edm in EDMRecord.objects.filter(dataset=ds):
            yield edm.create_es_action(
                index=settings.SITE_NAME,
                exclude_fields=ds.excluded_index_fields.names(),
                context=False,
                acceptance=acceptance
            )

    response = helpers.bulk(get_es(), actions=process_records())
    return response
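
# Example usage (the spec value and field lookup are illustrative). helpers.bulk()
# returns a (successful_actions, errors) tuple, which reindex_dataset passes through:
#
#   ds = DataSet.objects.get(spec="my-dataset")
#   indexed, errors = reindex_dataset(ds, acceptance=False)
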
def synchronise_dataset_records(store, dataset_graph_uri=None, ds=None, index=settings.SITE_NAME):
    """Iterate over all records that are out of sync for a dataset and update
    them in the index and database.
    """
    if not ds and dataset_graph_uri:
        ds = DataSet.get_dataset_from_graph(dataset_graph_uri=dataset_graph_uri, store=store)
    elif ds and not dataset_graph_uri:
        dataset_graph_uri = ds.document_uri
    elif not dataset_graph_uri and not ds:
        raise ValueError("Unable to find dataset due to missing value in dataset_graph_uri and/or ds")
    logger.info("Graph uri to synchronise: {}".format(dataset_graph_uri))
    # materialize nodes
    # ore:aggregates + remove ore:isAggregatedBy
    graph_list = get_out_of_sync_dataset_record_graph_uris(dataset_graph_uri, store, 200)
    if not ds.stay_in_sync:
        logger.warning("Should not start synchronisation for {} when it is marked as not staying in sync".format(ds.spec))
        return 0
    elif ds.has_sync_error:
        logger.warning("Can't start synchronisation of {} due to previous sync error.".format(ds.spec))
        return 0
    ds.has_sync_error = False
    ds.sync_error_message = None
    ds.records_in_sync = False
    ds.processed_records = 0
    ds.save()
    records_processed = 0
    graph_uri = None  # last record seen; referenced in the error handler below
    try:
        valid_records = ds.valid
        while len(graph_list) > 0:
            actions = []
            # todo: use the graphs instead of the URIs
            for graph_uri in graph_list:
                synchronise_record(graph_uri, ds, store, actions, index=index)
            # index actions
            logger.info("number of actions scheduled: {}".format(len(actions)))
            response = helpers.bulk(client=get_es(), actions=actions, stats_only=True)
            records_processed += len(graph_list)
            logger.info("processed {}/{} for {}".format(records_processed, valid_records, ds.spec))
            logger.debug("ElasticSearch bulk update: {}".format(response))
            update_switch = [
                QueryType.remove_insert.format(
                    named_graph=g,
                    remove="?s <http://schemas.delving.eu/narthex/terms/synced> false",
                    insert="?s <http://schemas.delving.eu/narthex/terms/synced> true"
                )
                for g in graph_list
            ]
            response = store.update(query="\n".join(update_switch))
            logger.debug("SPARQL update succeeded: {}".format(response))
            ds = DataSet.objects.get(id=ds.id)
            ds.processed_records = records_processed
            ds.save()
            graph_list = get_out_of_sync_dataset_record_graph_uris(dataset_graph_uri, store, 200)
        ds.process_key = None
        ds.records_in_sync = True
        ds.dataset_type = DataSetType.aggregated
        if not ds.oai_pmh.real > 0:
            ds.oai_pmh = OaiPmhPublished.none
        ds.save()
        logger.info("Finished synchronising {} records from dataset: {}".format(records_processed, dataset_graph_uri))
    except Exception as e:
        logger.error("Unable to index all records for dataset {} due to {} at record {}.".format(ds.spec, e, graph_uri))
        ds.sync_error_message = "{} with error: {}".format(graph_uri, e)
        ds.has_sync_error = True
        ds.process_key = None
        ds.save()
        logger.warning("Only indexed {} of {} valid records for dataset {}".format(records_processed, valid_records, ds.spec))
    return records_processed
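
# Example usage (illustrative; `store` is whatever RDF store client the caller
# already holds, as elsewhere in this module):
#
#   processed = synchronise_dataset_records(store=store, ds=ds)
#   logger.info("synchronised {} records for {}".format(processed, ds.spec))
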