def format_proxy(proxy, collection, job_id=None):
    """Apply final denormalisations to the index."""
    proxy.context = {}
    data = proxy.to_full_dict()
    data['collection_id'] = collection.id
    data['job_id'] = job_id
    # Index both normalised name fingerprints and the raw names themselves.
    names = ensure_list(data.get('names'))
    prints = {fingerprints.generate(name) for name in names}
    prints.update(names)
    data['fingerprints'] = [fp for fp in prints if fp is not None]
    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    index_text = properties.pop('indexText', [])
    index_text.extend(prints)
    index_text.append(collection.label)
    data['text'] = index_text
    # Default to the collection timestamp; an explicit `indexUpdatedAt`
    # property overrides it.
    data['updated_at'] = collection.updated_at
    for value in properties.pop('indexUpdatedAt', []):
        data['updated_at'] = value
    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }
def delete_entity(entity_id, exclude=None, sync=False):
    """Delete an entity from the index."""
    # When a schema is excluded, keep the copy living in its write index.
    skip_index = None
    if exclude is not None:
        skip_index = entities_write_index(exclude)
    for doc in entities_by_ids(entity_id, excludes="*"):
        index = doc.get("_index")
        if index != skip_index:
            delete_safe(index, entity_id)
def delete_entity(entity_id, exclude=None, sync=False):
    """Delete an entity from the index."""
    # When a schema is excluded, keep the copy living in its write index.
    skip = entities_write_index(exclude) if exclude is not None else None
    for doc in entities_by_ids(entity_id, excludes='*'):
        index = doc.get('_index')
        if index == skip:
            continue
        es.delete(index=index, id=entity_id, refresh=refresh_sync(sync))
def format_proxy(proxy, collection):
    """Apply final denormalisations to an entity proxy before indexing.

    Returns a bulk-action dict (``_id``/``_index``/``_source``) for the
    entity, or ``None`` when the schema is abstract and cannot be indexed.
    """
    # Abstract entities can appear when profile fragments for a missing entity
    # are present.
    if proxy.schema.abstract:
        return None
    data = proxy.to_full_dict()
    data["schemata"] = list(proxy.schema.names)
    data["caption"] = proxy.caption
    # Index both normalised name fingerprints and the raw names themselves.
    names = data.get("names", [])
    fps = {fingerprints.generate(name) for name in names}
    fps.update(names)
    data["fingerprints"] = [fp for fp in fps if fp is not None]
    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get("properties")
    data["text"] = properties.pop("indexText", [])
    # Integer casting of numeric-typed properties for range queries/sorting.
    numeric = {}
    for prop in proxy.iterprops():
        if prop.type in NUMERIC_TYPES:
            values = proxy.get(prop)
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric["dates"] = _numeric_values(registry.date, data.get("dates"))
    data["numeric"] = numeric
    # Context data - from aleph system, not followthemoney.
    data["collection_id"] = collection.id
    data["role_id"] = first(data.get("role_id"))
    data["profile_id"] = first(data.get("profile_id"))
    data["mutable"] = max(ensure_list(data.get("mutable")), default=False)
    data["origin"] = ensure_list(data.get("origin"))
    # Logical simplifications of dates: keep the earliest creation time and
    # the latest update time; fall back to created_at when never updated.
    created_at = ensure_list(data.get("created_at"))
    if created_at:
        data["created_at"] = min(created_at)
    updated_at = ensure_list(data.get("updated_at")) or created_at
    if updated_at:
        data["updated_at"] = max(updated_at)
    entity_id = data.pop("id")
    return {
        "_id": entity_id,
        "_index": entities_write_index(proxy.schema),
        "_source": data,
    }
def index_single(obj, proxy, data, texts, sync=False):
    """Indexing aspects common to entities and documents."""
    body = finalize_index(proxy, data, texts)
    body['bulk'] = False
    body['collection_id'] = obj.collection_id
    body['created_at'] = obj.created_at
    body['updated_at'] = obj.updated_at
    index = entities_write_index(proxy.schema)
    if settings.ENTITIES_INDEX_SPLIT:
        # Remove stale copies of this entity from other schema indexes.
        delete_entity(obj.id, exclude=proxy.schema, sync=False)
    return index_safe(index, obj.id, body, refresh=refresh_sync(sync))
def _index_updates(collection_id, entities):
    """Look up existing index documents and generate an updated form.

    This is necessary to make the index accumulative, i.e. if an entity
    or link gets indexed twice with different field values, it'll add up
    the different field values into a single record. This is to avoid
    overwriting the document and losing field values. An alternative
    solution would be to implement this in Groovy on the ES.

    Returns a list of bulk actions (deletes for stale per-schema copies,
    followed by the merged index action per entity).
    """
    common = {
        'collection_id': collection_id,
        'updated_at': datetime.utcnow(),
        'bulk': True,
    }
    timestamps = {}
    indexes = defaultdict(list)
    if not entities:
        return []
    for result in entities_by_ids(list(entities.keys())):
        if int(result.get('collection_id')) != collection_id:
            raise RuntimeError("Key collision between collections.")
        existing = model.get_proxy(result)
        indexes[existing.id].append(result.get('_index'))
        # Merge the already-indexed values into the new fragment so that
        # previously stored field values are not lost.
        entities[existing.id].merge(existing)
        timestamps[existing.id] = result.get('created_at')
    actions = []
    for entity_id, entity in entities.items():
        context = dict(common)
        # Preserve the original creation time when the entity was seen before.
        context['created_at'] = timestamps.get(entity_id)
        body = finalize_index(entity, context, [])
        index = entities_write_index(entity.schema)
        # Delete copies of the entity living in other schema indexes.
        for other in indexes.get(entity_id, []):
            if other != index:
                actions.append({
                    '_id': entity_id,
                    '_index': other,
                    '_type': 'doc',
                    '_op_type': 'delete',
                })
        actions.append({
            '_id': entity_id,
            '_index': index,
            '_type': 'doc',
            '_source': body,
        })
    return actions
def format_proxy(proxy, collection, extra):
    """Apply final denormalisations to the index."""
    proxy.context = {}
    proxy = collection.ns.apply(proxy)
    # Pull `indexUpdatedAt` before constructing `data`, so that it doesn't
    # creep into `data['dates']` and mess up date sorting afterwards
    updated_values = proxy.pop('indexUpdatedAt', quiet=True)
    data = proxy.to_full_dict()
    data['collection_id'] = collection.id
    data['schemata'] = list(proxy.schema.names)
    # Index both normalised name fingerprints and the raw names themselves.
    names = ensure_list(data.get('names'))
    prints = {fingerprints.generate(name) for name in names}
    prints.update(names)
    data['fingerprints'] = [fp for fp in prints if fp is not None]
    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    index_text = properties.pop('indexText', [])
    index_text.extend(prints)
    data['text'] = index_text
    data['updated_at'] = collection.updated_at
    for value in updated_values:
        data['updated_at'] = value
    # integer casting
    numeric = {}
    for name, values in properties.items():
        prop = proxy.schema.get(name)
        if prop.type in NUMERIC_TYPES:
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric['dates'] = _numeric_values(registry.date, data.get('dates'))
    data['numeric'] = numeric
    # add possible overrides
    data.update(extra)
    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }
def format_proxy(proxy, collection):
    """Apply final denormalisations to an entity proxy before indexing.

    Returns a bulk-action dict (``_id``/``_index``/``_source``).
    """
    data = proxy.to_full_dict()
    data["schemata"] = list(proxy.schema.names)
    # Index both normalised name fingerprints and the raw names themselves.
    names = data.get("names", [])
    fps = {fingerprints.generate(name) for name in names}
    fps.update(names)
    data["fingerprints"] = [fp for fp in fps if fp is not None]
    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get("properties")
    text = properties.pop("indexText", [])
    text.extend(fps)
    data["text"] = text
    # Integer casting of numeric-typed properties for range queries/sorting.
    numeric = {}
    for prop in proxy.iterprops():
        if prop.type in NUMERIC_TYPES:
            values = proxy.get(prop)
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric["dates"] = _numeric_values(registry.date, data.get("dates"))
    data["numeric"] = numeric
    # Context data - from aleph system, not followthemoney.
    # FIXME: Can there ever really be multiple role_ids?
    data["role_id"] = first(data.get("role_id"))
    data["mutable"] = max(ensure_list(data.get("mutable")), default=False)
    data["origin"] = ensure_list(data.get("origin"))
    # Default updated_at to created_at when no update timestamp is present.
    created_at = data.get("created_at")
    if created_at:
        data["updated_at"] = data.get("updated_at", created_at)
    data["collection_id"] = collection.id
    entity_id = data.pop("id")
    return {
        "_id": entity_id,
        "_index": entities_write_index(data.get("schema")),
        "_source": data,
    }
def format_proxy(proxy, collection):
    """Apply final denormalisations to the index."""
    data = proxy.to_full_dict()
    data['schemata'] = list(proxy.schema.names)
    # Index both normalised name fingerprints and the raw names themselves.
    names = ensure_list(data.get('names'))
    prints = {fingerprints.generate(name) for name in names}
    prints.update(names)
    data['fingerprints'] = [fp for fp in prints if fp is not None]
    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    index_text = properties.pop('indexText', [])
    index_text.extend(prints)
    data['text'] = index_text
    # integer casting
    numeric = {}
    for prop in proxy.iterprops():
        if prop.type not in NUMERIC_TYPES:
            continue
        numeric[prop.name] = _numeric_values(prop.type, proxy.get(prop))
    # also cast group field for dates
    numeric['dates'] = _numeric_values(registry.date, data.get('dates'))
    data['numeric'] = numeric
    # Context data - from aleph system, not followthemoney.
    now = iso_text(datetime.utcnow())
    data['created_at'] = min(ensure_list(data.get('created_at')), default=now)
    # NOTE(review): `min` keeps the *oldest* update timestamp here — confirm
    # this is intended rather than `max` (newest), which other variants use.
    data['updated_at'] = min(ensure_list(data.get('updated_at')), default=now)
    # FIXME: Can there ever really be multiple role_ids?
    data['role_id'] = first(data.get('role_id'))
    data['mutable'] = max(ensure_list(data.get('mutable')), default=False)
    data['origin'] = ensure_list(data.get('origin'))
    data['collection_id'] = collection.id
    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }
def format_proxy(proxy, collection, job_id=None):
    """Apply final denormalisations to the index."""
    proxy.context = {}
    proxy = collection.ns.apply(proxy)
    data = proxy.to_full_dict()
    data['collection_id'] = collection.id
    data['job_id'] = job_id
    # Index both normalised name fingerprints and the raw names themselves.
    names = ensure_list(data.get('names'))
    prints = {fingerprints.generate(name) for name in names}
    prints.update(names)
    data['fingerprints'] = [fp for fp in prints if fp is not None]
    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    index_text = properties.pop('indexText', [])
    index_text.extend(prints)
    index_text.append(collection.label)
    data['text'] = index_text
    # Default to the collection timestamp; an explicit `indexUpdatedAt`
    # property overrides it.
    data['updated_at'] = collection.updated_at
    for value in properties.pop('indexUpdatedAt', []):
        data['updated_at'] = value
    # integer casting
    numeric = {}
    for name, values in properties.items():
        prop = proxy.schema.get(name)
        if prop.type in NUMERIC_TYPES:
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric['dates'] = _numeric_values(registry.date, data.get('dates'))
    data['numeric'] = numeric
    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }
def delete_entity(entity_id, exclude=None, sync=False):
    """Delete an entity from the index."""
    # When a schema is excluded, keep the copy living in its write index.
    skip = entities_write_index(exclude) if exclude is not None else None
    for doc in entities_by_ids(entity_id, excludes='*'):
        index = doc.get('_index')
        if index == skip:
            continue
        try:
            es.delete(index=index, id=entity_id, refresh=refresh_sync(sync))
            query = {'term': {'entities': entity_id}}
            query_delete(entities_read_index(), query, sync=sync)
        except NotFoundError:
            # This is expected in some cases. For example, when 2 Things are
            # connected by an Interval and all the 3 entities get deleted
            # simultaneously, Aleph tries to delete the Interval thrice due to
            # recursive deletion of adjacent entities. ElasticSearch throws a
            # 404 in that case.
            # In those cases, we want to skip both the `es.delete` step and
            # the `query_delete` step.
            log.warning("Delete failed for entity %s - not found", entity_id)
def index_operation(data):
    """Apply final denormalisations to the index.

    Returns an ``(entity_id, index, body)`` tuple ready for bulk indexing.
    """
    data['bulk'] = data.get('bulk', False)
    # Index both normalised name fingerprints and the raw names themselves.
    names = ensure_list(data.get('names'))
    fps = {fingerprints.generate(name) for name in names}
    fps.update(names)
    data['fingerprints'] = [fp for fp in fps if fp is not None]
    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    texts = data.pop('text', [])
    texts.extend(data.get('properties', {}).pop('indexText', []))
    texts.extend(fps)
    data['text'] = texts
    # Default the creation timestamp to the update timestamp.
    if not data.get('created_at'):
        data['created_at'] = data.get('updated_at')
    entity_id = str(data.pop('id'))
    # Drop any stale index reference carried over from a previous read.
    data.pop('_index', None)
    index = entities_write_index(data.get('schema'))
    return entity_id, index, data