def entity_tags(entity, authz=None):
    """Count index-wide mentions of the tag values found on an entity.

    One facet is built for every sufficiently specific value of the name,
    email, identifier, IBAN, phone and address types, and all facets are
    resolved with a single ``_filters_faceted_query`` call.

    Yields ``(field, value, total)`` tuples for every value whose facet
    count is greater than one.
    NOTE(review): the count presumably includes the entity itself, hence
    the ``> 1`` threshold — confirm against ``_filters_faceted_query``.
    """
    proxy = model.get_proxy(entity)
    thing = model.get(Entity.THING)
    tag_types = (
        registry.name,
        registry.email,
        registry.identifier,
        registry.iban,
        registry.phone,
        registry.address,
    )

    facets = []
    for tag_type in tag_types:
        # Types without a group have no field to facet on.
        if tag_type.group is None:
            continue
        for position, value in enumerate(proxy.get_type_values(tag_type)):
            # Skip values too unspecific to be a meaningful tag. The
            # position still advances so aliases stay tied to the value's
            # index in the full value list.
            if tag_type.specificity(value) < 0.1:
                continue
            # Only indexes of Thing-derived schemata are searched.
            candidates = [
                schema
                for schema in model.get_type_schemata(tag_type)
                if schema.is_a(thing)
            ]
            index = entities_read_index(candidates)
            alias = '%s_%s' % (tag_type.name, position)
            facets.append(
                (index, alias, tag_type.group, tag_type.group, value)
            )

    counts = _filters_faceted_query(facets, authz=authz)
    for _index, alias, field, _group, value in facets:
        total = counts.get(alias, 0)
        if total > 1:
            yield (field, value, total)
def entity_tags(entity, authz):
    """Count index-wide mentions of the tag values found on an entity.

    Builds one facet per sufficiently specific name, email, identifier,
    IBAN, phone or address value, resolves them all through
    ``_filters_faceted_query(authz, facets)``, and yields
    ``(field, value, total)`` for every value counted more than once.
    """
    proxy = model.get_proxy(entity)
    thing = model.get(Entity.THING)
    tag_types = (
        registry.name,
        registry.email,
        registry.identifier,
        registry.iban,
        registry.phone,
        registry.address,
    )

    facets = []
    for tag_type in tag_types:
        group = tag_type.group
        # A type without a group cannot be faceted on.
        if group is None:
            continue
        for position, value in enumerate(proxy.get_type_values(tag_type)):
            # Values that are too generic are not worth counting; the
            # enumerate position keeps advancing regardless.
            if tag_type.specificity(value) < 0.1:
                continue
            # Restrict the search to indexes of Thing-derived schemata.
            schemata = model.get_type_schemata(tag_type)
            things = [schema for schema in schemata if schema.is_a(thing)]
            facet = (
                entities_read_index(things),
                '%s_%s' % (tag_type.name, position),
                group,
                group,
                value,
            )
            facets.append(facet)

    counts = _filters_faceted_query(authz, facets)
    for _index, alias, field, _group, value in facets:
        total = counts.get(alias, 0)
        if total > 1:
            yield (field, value, total)
def get_collection_facet(collection_id, facet, refresh=False):
    """Return value counts for one facet field within a collection.

    The result is a dict with ``values`` (bucket key -> document count, for
    up to 300 terms) and ``total`` (the cardinality aggregation of the
    field). It is cached under a key derived from the collection and the
    facet; a cached entry is returned unless ``refresh`` is true.
    """
    key = cache.object_key(Collection, collection_id, facet)
    cached = cache.get_complex(key)
    if not refresh and cached is not None:
        return cached

    # Aggregation-only request: no hits are fetched (size 0).
    body = {
        'size': 0,
        'query': {
            'bool': {
                'filter': [{'term': {'collection_id': collection_id}}],
            },
        },
        'aggs': {
            'values': {'terms': {'field': facet, 'size': 300}},
            'total': {'cardinality': {'field': facet}},
        },
    }

    # When the facet names a registered type group, search only the
    # schemata for that type; otherwise pass an empty set.
    facet_type = registry.groups.get(facet)
    if facet_type is None:
        schemata = set()
    else:
        schemata = model.get_type_schemata(facet_type)

    result = es.search(
        index=entities_read_index(schema=schemata),
        body=body,
        request_timeout=3600,
        timeout='20m',
    )
    aggregations = result.get('aggregations')
    buckets = aggregations.get('values').get('buckets', [])
    values = {bucket['key']: bucket['doc_count'] for bucket in buckets}
    data = {
        'values': values,
        'total': aggregations.get('total').get('value', 0),
    }
    cache.set_complex(key, data, expires=cache.EXPIRE)
    return data
def checksums_count(checksums):
    """Query how many documents mention each of the given checksums.

    Args:
        checksums: iterable of checksum values. One-shot iterators are
            accepted; the input is materialised before use.

    Yields:
        ``(checksum, total)`` pairs in input order, where ``total`` is the
        hit count reported for that checksum (0 when the response carries
        no total).
    """
    # The checksums are walked twice (to build the request and to pair up
    # the responses), so a generator argument must be materialised first.
    checksums = list(checksums)
    if not checksums:
        # Nothing to look up: avoid sending an msearch with an empty body.
        return

    schemata = model.get_type_schemata(registry.checksum)
    index = entities_read_index(schemata)

    # msearch takes alternating header / search-body entries.
    body = []
    for checksum in checksums:
        body.append({"index": index})
        body.append({"size": 0, "query": {"term": {"checksums": checksum}}})

    results = es.msearch(body=body)
    for checksum, result in zip(checksums, results.get("responses", [])):
        total = result.get("hits", {}).get("total", {}).get("value", 0)
        yield checksum, total
def __init__(self, query, node, prop=None, limit=0, count=False):
    """Set up a lookup around ``node``, optionally scoped to ``prop``.

    The node's proxy is added to the query's graph. With a property, the
    index comes from the property's schema, the filter targets
    ``properties.<name>`` and ``id`` becomes the property's qualified name.
    Without one, the index covers the schemata of the node's type, the
    filter targets the type's group field and ``id`` stays the node id.
    """
    self.graph = query.graph
    self.graph.add(node.proxy)

    self.node = node
    self.id = node.id
    # A falsy limit (None, 0) is normalised to 0.
    self.limit = limit or 0
    self.count = count
    self.entities = []
    self.prop = prop

    if prop is None:
        # No property given: match the value on the type's group field
        # across every schema that uses this type.
        type_schemata = model.get_type_schemata(self.node.type)
        self.index = entities_read_index(type_schemata)
        self.filter = field_filter_query(node.type.group, node.value)
        return

    # Property given: match the value on that specific property only.
    self.index = entities_read_index(prop.schema)
    self.filter = field_filter_query(
        'properties.%s' % prop.name, node.value
    )
    self.id = prop.qname
def entity_tags(entity, authz):
    """Count how often an entity's tag values occur in other entities.

    For every sufficiently specific name, email, identifier, IBAN, phone
    and address value, one size-0 search is queued. Each search is filtered
    by ``authz`` and excludes the entity itself by id. All searches go out
    in a single msearch request.

    Args:
        entity: mapping describing the entity; its ``id`` is read with
            ``entity.get('id')``.
        authz: authorisation context passed to ``authz_query``.

    Yields:
        ``(field, value, total)`` for every value with at least one hit.
    """
    proxy = model.get_proxy(entity)
    Thing = model.get(Entity.THING)
    types = [registry.name, registry.email, registry.identifier,
             registry.iban, registry.phone, registry.address]
    entity_id = entity.get('id')
    pivots = []
    queries = []
    # Go through all the tags which apply to this entity, and find how
    # often they've been mentioned in other entities.
    for type_ in types:
        if type_.group is None:
            continue
        for value in proxy.get_type_values(type_):
            # Values that are too generic make poor tags.
            if type_.specificity(value) < 0.1:
                continue
            # Only indexes of Thing-derived schemata are searched.
            schemata = model.get_type_schemata(type_)
            schemata = [s for s in schemata if s.is_a(Thing)]
            index = entities_read_index(schemata)
            # msearch takes alternating header / search-body entries.
            queries.append({'index': index})
            queries.append({
                'size': 0,
                'query': {
                    'bool': {
                        'filter': [
                            authz_query(authz),
                            field_filter_query(type_.group, value),
                        ],
                        'must_not': [
                            {'ids': {'values': [entity_id]}},
                        ],
                    }
                }
            })
            pivots.append((type_.group, value))

    if not queries:
        # Nothing to search for; skip the msearch round-trip.
        return

    res = es.msearch(body=queries)
    for (field, value), resp in zip(pivots, res.get('responses', [])):
        total = resp.get('hits', {}).get('total')
        # Elasticsearch 7+ reports the total as an object with a ``value``
        # key, while older versions return a plain integer. Accept both so
        # the numeric comparison below cannot hit a dict.
        if isinstance(total, dict):
            total = total.get('value')
        if total is not None and total > 0:
            yield (field, value, total)
def entity_tags(proxy, authz, prop_types=DEFAULT_TAGS):
    """Count other mentions of a proxy's values for the given types.

    Every property value whose type is in ``prop_types`` and whose
    specificity exceeds 0.1 is counted via ``_counted_msearch``. Values
    counted more than once are returned as dicts with ``id``, ``field``,
    ``value`` and ``count`` (the raw count minus one), sorted by ``count``
    in descending order.
    """
    values = {
        (prop.type, value)
        for prop, value in proxy.itervalues()
        if prop.type in prop_types and prop.specificity(value) > 0.1
    }
    type_names = [t.name for t in prop_types]
    log.debug("Tags[%s]: %s values", type_names, len(values))

    lookup = {}
    queries = {}
    for tag_type, value in values:
        node_id = tag_type.node_id(value)
        lookup[node_id] = (tag_type, value)
        # Further mentions are only looked for in indexes of schemata
        # that are things.
        things = [
            schema
            for schema in model.get_type_schemata(tag_type)
            if schema.is_a(Entity.THING)
        ]
        index = entities_read_index(things)
        queries[(index, node_id)] = field_filter_query(tag_type.group, value)

    _, counts = _counted_msearch(queries, authz)

    results = []
    for node_id, count in counts.items():
        if count <= 1:
            continue
        tag_type, value = lookup[node_id]
        results.append(
            {
                "id": node_id,
                "field": tag_type.group,
                "value": value,
                # One is subtracted from the raw count before reporting.
                "count": count - 1,
            }
        )
    return sorted(results, key=lambda item: item["count"], reverse=True)
def test_model_type_schemata(self):
    """The IBAN type's schemata include BankAccount but not CourtCase."""
    iban_schemata = model.get_type_schemata(registry.iban)

    bank_account = model.get('BankAccount')
    assert bank_account in iban_schemata, iban_schemata

    court_case = model.get('CourtCase')
    assert court_case not in iban_schemata, iban_schemata