def entities_by_ids(ids, schemata=None, cached=False, includes=None, excludes=None):
    """Iterate over unpacked entities based on a search for the given
    entity IDs."""
    ids = ensure_list(ids)
    if not len(ids):
        return
    index = entities_read_index(schema=schemata)
    query = {'ids': {'values': ids}}
    # query = {'bool': {'filter': query}}
    query = {
        'query': query,
        '_source': _source_spec(includes, excludes),
        'size': MAX_PAGE
    }
    result = es.search(index=index, body=query)
    for doc in result.get('hits', {}).get('hits', []):
        entity = unpack_result(doc)
        if entity is not None:
            if cached:
                _cache_entity(entity)
            yield entity

def iter_entities(authz=None, collection_id=None, schemata=None,
                  includes=None, excludes=None):
    """Scan all entities matching the given criteria."""
    filters = []
    if authz is not None:
        filters.append(authz_query(authz))
    if collection_id is not None:
        filters.append({'term': {'collection_id': collection_id}})
    if ensure_list(schemata):
        filters.append({'terms': {'schemata': ensure_list(schemata)}})
    query = {
        'query': {'bool': {'filter': filters}},
        '_source': _source_spec(includes, excludes)
    }
    index = entities_read_index(schema=schemata)
    for res in scan(es, index=index, query=query, scroll='1410m'):
        entity = unpack_result(res)
        if entity is not None:
            yield entity

def _query_item(entity):
    """Cross-reference an entity or document, given as an indexed document."""
    query = match_query(entity)
    if query == none_query():
        return
    query = {
        "query": query,
        "size": 100,
        "_source": {"includes": PROXY_INCLUDES},
    }
    matchable = list(entity.schema.matchable_schemata)
    index = entities_read_index(schema=matchable)
    result = es.search(index=index, body=query)
    for doc in result.get("hits").get("hits"):
        doc = unpack_result(doc)
        if doc is None:
            continue
        match = model.get_proxy(doc)
        score = compare(model, entity, match)
        if score >= SCORE_CUTOFF:
            log.debug("Match: %s <[%.2f]> %s", entity.caption, score, match.caption)
            yield score, entity, doc.get("collection_id"), match

def iter_entities(authz=None, collection_id=None, schemata=None,
                  includes=None, excludes=None):
    """Scan all entities matching the given criteria."""
    filters = []
    if authz is not None:
        filters.append(authz_query(authz))
    if collection_id is not None:
        filters.append({'term': {'collection_id': collection_id}})
    if ensure_list(schemata):
        filters.append({'terms': {'schemata': ensure_list(schemata)}})
    source = {}
    if ensure_list(includes):
        source['includes'] = ensure_list(includes)
    if ensure_list(excludes):
        source['excludes'] = ensure_list(excludes)
    query = {
        'query': {'bool': {'filter': filters}},
        'sort': ['_doc'],
        '_source': source
    }
    for res in scan(es, index=entities_index(), query=query, scroll='1410m'):
        yield unpack_result(res)

def entities_by_ids(ids, schemata=None, cached=False, includes=None, excludes=None):
    """Iterate over unpacked entities based on a search for the given
    entity IDs."""
    ids = ensure_list(ids)
    if not len(ids):
        return
    index = entities_read_index(schema=schemata)
    query = {'ids': {'values': ids}}
    # query = {'bool': {'filter': query}}
    query = {
        'query': query,
        '_source': _source_spec(includes, excludes),
        'size': MAX_PAGE
    }
    result = es.search(index=index, body=query)
    for doc in result.get('hits', {}).get('hits', []):
        entity = unpack_result(doc)
        if entity is not None:
            # Cache entities only briefly to avoid filling up the cache:
            if cached:
                key = cache.object_key(Entity, entity.get('id'))
                cache.set_complex(key, entity, expire=60 * 60)
            yield entity

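# Usage sketch for entities_by_ids(), with hypothetical entity IDs: fetch a
# batch of entities in one search round-trip. With cached=True, each result
# is also written to the short-lived cache used above.
for entity in entities_by_ids(['id-1', 'id-2'], schemata='Thing', cached=True):
    print(entity.get('id'), entity.get('schema'))
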
def check_alert(alert_id):
    """Check an alert against the entities index and notify its owner of
    any new matches."""
    alert = Alert.by_id(alert_id)
    if alert is None or alert.role is None:
        return
    log.info("Check alert [%s]: %s", alert.id, alert.query)
    authz = Authz.from_role(alert.role)
    query = alert_query(alert, authz)
    index = entities_read_index(schema=Entity.THING)
    result = es.search(index=index, body=query)
    for doc in result.get('hits').get('hits', []):
        entity = unpack_result(doc)
        if entity is None:
            continue
        log.info('Alert [%s]: %s', alert.query, entity.get('name'))
        params = {
            'alert': alert,
            'role': alert.role,
            'entity': entity.get('id')
        }
        publish(Events.MATCH_ALERT, params=params, channels=[alert.role])
        db.session.flush()
    alert.update()
    db.session.commit()
    db.session.close()

def __init__(self, request, query, parser=None, schema=None):
    """Attach the underlying entity and match documents to each
    cross-referencing result."""
    super(MatchQueryResult, self).__init__(request, query,
                                           parser=parser,
                                           schema=schema)
    ids = set()
    for match in self.results:
        ids.add(match.match_id)
        ids.add(match.entity_id)
    ids = {'ids': list(ids)}
    result = es.mget(index=es_index, doc_type=TYPE_ENTITY, body=ids)
    for doc in result.get('docs', []):
        entity = unpack_result(doc)
        if entity is None:
            continue
        for match in self.results:
            if match.match_id == entity['id']:
                match.match = entity
            if match.entity_id == entity['id']:
                match.entity = entity
    # Do not return results if the entity has been removed in the
    # meantime. Not sure this is the ideal way of doing this, as it'll
    # mess with pagination counts etc.
    for match in list(self.results):
        if not hasattr(match, 'match') or not hasattr(match, 'entity'):
            self.results.remove(match)

def check_alert(alert_id):
    """Check an alert against the entities index and notify its owner of
    any new matches."""
    alert = Alert.by_id(alert_id)
    if alert is None or alert.role is None:
        return
    if not alert.role.is_alertable:
        return
    authz = Authz.from_role(alert.role)
    query = alert_query(alert, authz)
    index = entities_read_index(schema=Entity.THING)
    result = es.search(index=index, body=query)
    for doc in result.get('hits').get('hits', []):
        entity = unpack_result(doc)
        if entity is None:
            continue
        log.info('Alert [%s]: %s', alert.query, entity.get('name'))
        params = {
            'alert': alert,
            'role': alert.role,
            'entity': entity
        }
        publish(Events.MATCH_ALERT,
                actor_id=entity.get('uploader_id'),
                params=params)
    alert.update()
    db.session.commit()
    db.session.close()

def iter_entities(
    authz=None,
    collection_id=None,
    schemata=None,
    includes=PROXY_INCLUDES,
    excludes=None,
    filters=None,
    sort=None,
):
    """Scan all entities matching the given criteria."""
    query = {
        "query": _entities_query(filters, authz, collection_id, schemata),
        "_source": _source_spec(includes, excludes),
    }
    preserve_order = False
    if sort is not None:
        query["sort"] = ensure_list(sort)
        preserve_order = True
    index = entities_read_index(schema=schemata)
    for res in scan(
        es,
        index=index,
        query=query,
        timeout=MAX_TIMEOUT,
        request_timeout=MAX_REQUEST_TIMEOUT,
        preserve_order=preserve_order,
    ):
        entity = unpack_result(res)
        if entity is not None:
            yield entity

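# Usage sketch for the generator above: stream all "Person" entities of a
# single collection without materialising the full result set. The
# collection_id value and the handle_entity() callback are hypothetical.
for entity in iter_entities(collection_id=17, schemata=["Person"]):
    handle_entity(entity)
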
def iter_matches(collection, authz):
    """Scan all matching xref results; sorting is not supported."""
    filters = [{'term': {'collection_id': collection.id}},
               authz_query(authz, field='match_collection_id')]
    query = {'query': {'bool': {'filter': filters}}}
    for res in scan(es, index=xref_index(), query=query):
        yield unpack_result(res)

def check_alert(alert_id):
    """Check an alert against the entities index and notify its owner of
    any new matches; delete the alert if its query is invalid."""
    alert = Alert.by_id(alert_id)
    if alert is None or alert.role is None:
        return
    log.info("Check alert [%s]: %s", alert.id, alert.query)
    authz = Authz.from_role(alert.role)
    try:
        query = alert_query(alert, authz)
        index = entities_read_index(schema=Entity.THING)
        result = es.search(index=index, body=query)
    except RequestError as re:
        log.error("Invalid query [%s]: %r", alert.query, re.error)
        alert.delete()
        db.session.commit()
        return
    for doc in result.get("hits").get("hits", []):
        entity = unpack_result(doc)
        if entity is None:
            continue
        log.info("Alert [%s]: %s", alert.query, entity.get("id"))
        params = {
            "alert": alert,
            "role": alert.role,
            "entity": entity.get("id"),
            "collection": entity.get("collection_id"),
        }
        channels = [alert.role]
        # channels.append(channel_tag(collection_id, Collection))
        publish(Events.MATCH_ALERT, params=params, channels=channels)
    alert.update()
    db.session.commit()

def get_collection(collection_id):
    """Fetch a collection from the index."""
    result = es.get(index=collections_index(),
                    doc_type='doc',
                    id=collection_id,
                    ignore=[404],
                    _source_exclude=['text'])
    return unpack_result(result)

def xref_collection(collection_id, other_id=None):
    """Cross-reference all the entities and documents in a collection."""
    query = {'term': {'collection_id': collection_id}}
    query = {'query': query,
             '_source': {'excludes': ['text', 'properties.*']}}
    scanner = scan(es, index=entities_index(), query=query, scroll='30m')
    for idx, res in enumerate(scanner):
        res = unpack_result(res)
        _xref_item(res, collection_id=other_id)

def get_entity(entity_id):
    """Fetch an entity from the index."""
    result = es.get(index=entities_index(),
                    doc_type='doc',
                    id=entity_id,
                    ignore=[404],
                    _source_exclude=['text'])
    return unpack_result(result)

def get_xref(xref_id, collection_id=None):
    """Get an xref match combo by its ID."""
    filters = [{"ids": {"values": [xref_id]}}]
    if collection_id is not None:
        filters.append({"term": {"collection_id": collection_id}})
    query = {"query": {"bool": {"filter": filters}}, "size": 1}
    result = es.search(index=xref_index(), body=query)
    for doc in result.get("hits", {}).get("hits", []):
        return unpack_result(doc)

def __init__(self, request, parser, result, schema=None):
    """Wrap an executed Elasticsearch response, unpacking its hits into
    results."""
    super(SearchQueryResult, self).__init__(request,
                                            parser=parser,
                                            schema=schema)
    self.result = result
    hits = self.result.get('hits', {})
    self.total = hits.get('total')
    for doc in hits.get('hits', []):
        self.results.append(unpack_result(doc))

def get_xref(xref_id, collection_id=None):
    """Get an xref match combo by its ID."""
    filters = [{'ids': {'values': [xref_id]}}]
    if collection_id is not None:
        filters.append({'term': {'collection_id': collection_id}})
    query = {'query': {'bool': {'filter': filters}}, 'size': 1}
    result = es.search(index=xref_index(), body=query)
    for doc in result.get('hits', {}).get('hits', []):
        return unpack_result(doc)

def iter_matches(collection, authz):
    """Scan all matching xref results; sorting is not supported."""
    filters = [
        {"term": {"collection_id": collection.id}},
        authz_query(authz, field="match_collection_id"),
    ]
    query = {"query": {"bool": {"filter": filters}}}
    for res in scan(es, index=xref_index(), query=query):
        yield unpack_result(res)

def get_entity(entity_id):
    """Fetch an entity from the index."""
    result = es.get(index=es_index, doc_type=TYPE_ENTITY, id=entity_id,
                    ignore=[404])
    entity = unpack_result(result)
    if entity is not None:
        entity.pop('text', None)
    return entity

def get_document(document_id):
    """Fetch a document from the index."""
    result = es.get(index=es_index, doc_type=TYPE_DOCUMENT, id=document_id,
                    ignore=[404])
    document = unpack_result(result)
    if document is not None:
        document.pop('text', None)
    return document

def iter_records(document_id=None, collection_id=None):
    """Scan all records matching the given criteria."""
    filters = []
    if document_id is not None:
        filters.append({'term': {'document_id': document_id}})
    if collection_id is not None:
        filters.append({'term': {'collection_id': collection_id}})
    query = {'query': {'bool': {'filter': filters}}, 'sort': ['_doc']}
    for res in scan(es, index=records_index(), query=query, scroll='1410m'):
        yield unpack_result(res)

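# Usage sketch (hypothetical document_id): iterate over every indexed row of
# one tabular document; the shape of each record dict depends on what
# unpack_result() returns for the records index.
for record in iter_records(document_id=42):
    print(record)
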
def __init__(self, request, parser, result):
    """Wrap an executed Elasticsearch response, unpacking its hits into
    results."""
    super(SearchQueryResult, self).__init__(request, parser=parser)
    self.result = result
    hits = self.result.get('hits', {})
    self.total = hits.get('total')
    for doc in hits.get('hits', []):
        # log.info("Res: %s", pformat(doc))
        doc = unpack_result(doc)
        if doc is not None:
            self.results.append(doc)

def _counted_msearch(queries, authz, limit=0):
    """Run batched queries to count or retrieve entities with certain
    property values."""
    # The default case for this is that we want to retrieve only the
    # counts for a bunch of filtered sub-queries. In this case, we can
    # group the queries by the affected index.
    # In some cases, the expand API wants to actually retrieve entities.
    # Then, we need to make one query per filter.
    grouped = {}
    for (index, key), query in sorted(queries.items()):
        group = index if limit == 0 else (index, key)
        if group not in grouped:
            grouped[group] = {
                "index": index,
                "filters": [query],
                "counts": {key: query},
            }
        else:
            grouped[group]["filters"].append(query)
            grouped[group]["counts"][key] = query

    log.debug("Counts: %s queries, %s groups", len(queries), len(grouped))
    body = []
    for group in grouped.values():
        body.append({"index": group.get("index")})
        filters = group.get("filters")
        if limit == 0 and len(filters) > 1:
            filters = [{"bool": {"should": filters, "minimum_should_match": 1}}]
        filters.append(authz_query(authz))
        query = {
            "size": limit,
            "query": {"bool": {"filter": filters}},
            "aggs": {"counts": {"filters": {"filters": group.get("counts")}}},
            "_source": ENTITY_SOURCE,
        }
        body.append(query)

    counts = {}
    # FIXME: This doesn't actually retain context on which query a particular
    # entity is a result from. Doesn't matter if all we do in the end is stuff
    # everything into an FtMGraph and then traverse for adjacency.
    entities = []
    if not len(body):
        return entities, counts

    response = es.msearch(body=body)
    for resp in response.get("responses", []):
        for result in resp.get("hits", {}).get("hits", []):
            entities.append(unpack_result(result))
        buckets = resp.get("aggregations", {}).get("counts", {}).get("buckets", {})
        for key, count in buckets.items():
            counts[key] = count.get("doc_count", 0)
    return entities, counts

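# Usage sketch for the batched helper above. The (index, key) tuple keys
# mirror how the function groups its input; the filter body and the authz
# object are illustrative assumptions. With limit=0, no entities are
# returned and only the per-key counts are populated.
queries = {
    (entities_read_index(schema="Person"), "names"): {
        "term": {"properties.name": "John Doe"}
    },
}
entities, counts = _counted_msearch(queries, authz, limit=0)
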
def iter_entities(authz=None, collection_id=None, schemata=None,
                  includes=None, excludes=None):
    """Scan all entities matching the given criteria."""
    query = {
        'query': _entities_query(authz, collection_id, schemata),
        '_source': _source_spec(includes, excludes)
    }
    index = entities_read_index(schema=schemata)
    for res in scan(es, index=index, query=query, scroll='1410m'):
        entity = unpack_result(res)
        if entity is not None:
            yield entity

def __init__(self, request, parser, result):
    """Wrap an executed Elasticsearch response, unpacking its hits into
    results."""
    super(SearchQueryResult, self).__init__(request, parser=parser)
    self.result = result
    hits = self.result.get('hits', {})
    total = hits.get('total', {})
    self.total = total.get('value')
    self.total_type = total.get('relation')
    for doc in hits.get('hits', []):
        # log.info("Res: %s", pformat(doc))
        doc = unpack_result(doc)
        if doc is not None:
            self.results.append(doc)

def __init__(self, request, parser, result):
    """Wrap an executed Elasticsearch response, unpacking its hits into
    results."""
    super(SearchQueryResult, self).__init__(request, parser=parser)
    self.result = result
    hits = self.result.get("hits", {})
    total = hits.get("total", {})
    self.total = total.get("value")
    self.total_type = total.get("relation")
    for doc in hits.get("hits", []):
        # log.info("Res: %s", pformat(doc))
        doc = unpack_result(doc)
        if doc is not None:
            self.results.append(doc)

def entity_matches(result):
    """Format search hits as (Freebase-style) reconciliation candidates."""
    for doc in result.get('hits').get('hits'):
        entity = unpack_result(doc)
        proxy = model.get_proxy(entity)
        yield {
            'id': proxy.id,
            'name': proxy.caption,
            'n:type': get_freebase_type(proxy.schema),
            'type': [get_freebase_type(proxy.schema)],
            'r:score': doc.get('_score'),
            'uri': entity_url(proxy.id, _relative=True),
            'match': False
        }

def _resolve_index(self, cache):
    """Resolve all index-backed keys in the cache with a single mget call."""
    query = []
    for (type_, id_) in cache.keys():
        if type_ == Role:
            continue
        query.append({'_index': type_, '_id': id_})
    if not len(query):
        return
    results = es.mget(body={'docs': query}, _source_exclude=['text'])
    for doc in results['docs']:
        cache[(doc['_index'], doc['_id'])] = unpack_result(doc)

def entity_matches(result):
    """Format search hits as (Freebase-style) reconciliation candidates."""
    for doc in result.get("hits").get("hits"):
        entity = unpack_result(doc)
        proxy = model.get_proxy(entity)
        yield {
            "id": proxy.id,
            "name": proxy.caption,
            "n:type": get_freebase_type(proxy.schema),
            "type": [get_freebase_type(proxy.schema)],
            "r:score": doc.get("_score"),
            "uri": entity_url(proxy.id, _relative=True),
            "match": False,
        }

def get_entity(entity_id):
    """Fetch an entity from the index."""
    if entity_id is None:
        return None
    for index in entities_index_list():
        result = es.get(index=index, doc_type='doc', id=entity_id,
                        ignore=[404], _source_exclude=['text'])
        result = unpack_result(result)
        if result is not None:
            return result

def render_notification(stub, notification):
    """Generate a text version of the notification, suitable for use in an
    email or text message."""
    from aleph.logic import resolver

    notification = unpack_result(notification)
    event = Events.get(notification.get("event"))
    if event is None:
        return
    for name, clazz, value in _iter_params(notification, event):
        resolver.queue(stub, clazz, value)
    resolver.resolve(stub)
    plain = str(event.template)
    html = str(event.template)
    for name, clazz, value in _iter_params(notification, event):
        data = resolver.get(stub, clazz, value)
        if data is None:
            return
        link, title = None, None
        if clazz == Role:
            title = data.get("label")
        elif clazz == Alert:
            title = data.get("query")
        elif clazz == Collection:
            title = data.get("label")
            link = collection_url(value)
        elif clazz == Entity:
            proxy = model.get_proxy(data)
            title = proxy.caption
            link = entity_url(value)
        elif clazz == EntitySet:
            title = data.label
            link = entityset_url(data.id)
        elif clazz == Export:
            title = data.get("label")
            link = url_for("exports_api.download", export_id=data.get("id"))
        template = "{{%s}}" % name
        html = html.replace(template, html_link(title, link))
        plain = plain.replace(template, "'%s'" % title)
        if name == event.link_to:
            plain = "%s (%s)" % (plain, link)
    return {"plain": plain, "html": html}

def iter_entities(authz=None, collection_id=None, schemata=None,
                  includes=None, excludes=None):
    """Scan all entities matching the given criteria."""
    filters = []
    if authz is not None:
        filters.append(authz_query(authz))
    if collection_id is not None:
        filters.append({'term': {'collection_id': collection_id}})
    if ensure_list(schemata):
        filters.append({'terms': {'schemata': ensure_list(schemata)}})
    query = {
        'query': {'bool': {'filter': filters}},
        '_source': _source_spec(includes, excludes)
    }
    index = entities_read_index(schema=schemata)
    for res in scan(es, index=index, query=query, scroll='1410m'):
        entity = unpack_result(res)
        if entity is not None:
            yield entity

def xref_item(proxy, collection_ids=None):
    """Cross-reference an entity or document, given as an indexed document."""
    query = match_query(proxy, collection_ids=collection_ids)
    if query == none_query():
        return
    query = {
        'query': query,
        'size': 100,
        '_source': {'includes': ['schema', 'properties', 'collection_id']}
    }
    matchable = list(proxy.schema.matchable_schemata)
    index = entities_read_index(schema=matchable)
    result = es.search(index=index, body=query)
    results = result.get('hits').get('hits')
    for doc in results:
        doc = unpack_result(doc)
        if doc is not None:
            other = model.get_proxy(doc)
            score = compare(model, proxy, other)
            if score >= SCORE_CUTOFF:
                yield score, doc.get('collection_id'), other
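
# Usage sketch: cross-reference a single entity against other collections and
# log its strong matches. `proxy` is assumed to be a followthemoney
# EntityProxy obtained elsewhere (e.g. via model.get_proxy()).
for score, collection_id, other in xref_item(proxy):
    log.info('Match [%.2f] in %s: %s', score, collection_id, other.caption)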