Example #1
def _filters_faceted_query(authz, queries):
    indexed = {}
    for (idx, alias, filter_) in queries:
        indexed[idx] = indexed.get(idx, {})
        indexed[idx][alias] = filter_

    queries = []
    for (idx, filters) in indexed.items():
        queries.append({'index': idx})
        queries.append({
            'size': 0,
            'query': {'bool': {'filter': [authz_query(authz)]}},
            'aggs': {'counters': {'filters': {'filters': filters}}}
        })

    results = {}
    if not len(queries):
        return results

    res = es.msearch(body=queries)
    for resp in res.get('responses', []):
        aggs = resp.get('aggregations', {}).get('counters', {})
        for alias, value in aggs.get('buckets', {}).items():
            results[alias] = value.get('doc_count', results.get(alias, 0))
    return results
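
For reference, a minimal invocation sketch for Example #1. The tuple shape (index, alias, filter) follows the unpacking in the loop above; the index names, aliases, filter dicts and the authz object below are placeholders, not values from the original code.

# Hypothetical usage of _filters_faceted_query (Example #1).
# Each tuple is (index name, aggregation alias, Elasticsearch filter dict);
# `authz` is whatever object authz_query() expects in this codebase.
queries = [
    ('entities-index', 'people', {'term': {'schemata': 'Person'}}),
    ('entities-index', 'companies', {'term': {'schemata': 'Company'}}),
    ('documents-index', 'documents', {'term': {'schemata': 'Document'}}),
]
counts = _filters_faceted_query(authz, queries)
# counts maps each alias to its doc_count, e.g. {'people': 152, ...}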
Example #2
def execute_documents_query(args, q):
    """ Execute the query and return a set of results. """
    result = es.search(index=es_index, doc_type=TYPE_DOCUMENT, body=q)
    hits = result.get('hits', {})
    output = {
        'status': 'ok',
        'results': [],
        'offset': q['from'],
        'limit': q['size'],
        'total': hits.get('total'),
        'next': None,
        'facets': {},
        'watchlists': {}
    }
    convert_aggregations(result, output, args)
    next_offset = output['offset'] + output['limit']
    if output['total'] > next_offset:
        params = {'offset': next_offset}
        for k, v in args.iterlists():
            if k in ['offset']:
                continue
            params[k] = v
        output['next'] = url_for('search.query', **params)

    sub_queries = []
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        document['score'] = doc.get('_score')
        document['records'] = {'results': [], 'total': 0}

        sq = records_query(document['id'], args)
        if sq is not None:
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))

        document['api_url'] = url_for('document.view',
                                      document_id=doc.get('_id'))
        document['data_url'] = url_for('document.file',
                                       document_id=doc.get('_id'))
        output['results'].append(document)

    if len(sub_queries):
        res = es.msearch(index=es_index, doc_type=TYPE_RECORD,
                         body='\n'.join(sub_queries))
        for doc in output['results']:
            for sq in res.get('responses', []):
                sqhits = sq.get('hits', {})
                for hit in sqhits.get('hits', []):
                    record = hit.get('_source')
                    if doc['id'] != record.get('document_id'):
                        continue
                    record['score'] = hit.get('_score')
                    record['text'] = hit.get('highlight', {}).get('text')
                    doc['records']['results'].append(record)
                    doc['records']['total'] = sqhits.get('total', 0)

    return output
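
A note on the request body: the examples build the msearch payload in two interchangeable ways. Example #1 appends plain dicts to a list, while Example #2 joins json.dumps() output with newlines. The sketch below shows both forms side by side; the match_all query is only a placeholder, and it assumes an elasticsearch-py client that accepts either a list of dicts or a newline-delimited JSON string as the body.

import json

# msearch bodies are header/query pairs. An empty header dict means
# "use the index given to es.msearch()".
header = {}
query = {'size': 0, 'query': {'match_all': {}}}

# Form used in Examples #1, #3, #4: a list of alternating dicts.
body_as_list = [header, query]

# Form used in Examples #2, #12, #13: newline-delimited JSON.
body_as_string = '\n'.join([json.dumps(header), json.dumps(query)])

# Either can be passed as body= to es.msearch() (assumption: recent client).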
Example #3
def _counted_msearch(queries, authz, limit=0):
    """Run batched queries to count or retrieve entities with certain
    property values."""
    # The default case for this is that we want to retrieve only the
    # counts for a bunch of filtered sub-queries. In this case, we can
    # group the queries by the affected index.
    # In some cases, the expand API wants to actually retrieve entities.
    # Then, we need to make one query per filter.
    grouped = {}
    for (index, key), query in sorted(queries.items()):
        group = index if limit == 0 else (index, key)
        if group not in grouped:
            grouped[group] = {
                "index": index,
                "filters": [query],
                "counts": {key: query},
            }
        else:
            grouped[group]["filters"].append(query)
            grouped[group]["counts"][key] = query

    log.debug("Counts: %s queries, %s groups", len(queries), len(grouped))

    body = []
    for group in grouped.values():
        body.append({"index": group.get("index")})
        filters = group.get("filters")
        if limit == 0 and len(filters) > 1:
            filters = [{"bool": {"should": filters, "minimum_should_match": 1}}]
        filters.append(authz_query(authz))
        query = {
            "size": limit,
            "query": {"bool": {"filter": filters}},
            "aggs": {"counts": {"filters": {"filters": group.get("counts")}}},
            "_source": ENTITY_SOURCE,
        }
        body.append(query)

    counts = {}
    # FIXME: This doesn't actually retain context on which query a particular
    # entity is a result from. Doesn't matter if all we do in the end is stuff
    # everything into an FtMGraph and then traverse for adjacency.
    entities = []

    if not len(body):
        return entities, counts

    response = es.msearch(body=body)
    for resp in response.get("responses", []):
        for result in resp.get("hits", {}).get("hits", []):
            entities.append(unpack_result(result))
        buckets = resp.get("aggregations", {}).get("counts", {}).get("buckets", {})
        for key, count in buckets.items():
            counts[key] = count.get("doc_count", 0)
    return entities, counts
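
To make the grouping comment in _counted_msearch concrete, here is a toy illustration; the index names, keys and filters are invented for the example.

# Toy data for the grouping in Example #3: keys are (index, key),
# values are Elasticsearch filter dicts. All names are placeholders.
queries = {
    ('entities-v1', 'names:acme'): {'term': {'names': 'acme'}},
    ('entities-v1', 'names:apex'): {'term': {'names': 'apex'}},
    ('documents-v1', 'names:acme'): {'term': {'names': 'acme'}},
}
# limit == 0 (count-only): groups are keyed by index alone, so two
# msearch sub-requests are built ('entities-v1' with both filters
# OR-ed together, and 'documents-v1').
# limit > 0 (retrieve entities): groups are keyed by (index, key),
# so three sub-requests are built, one per filter.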
Example #4
def entity_tags(entity, authz):
    """Do a search on tags of an entity."""
    # NOTE: This must also work for documents.
    FIELDS = [
        'names',
        'emails',
        'phones',
        'addresses',
        'identifiers'
    ]
    pivots = []
    queries = []
    # Go through all the tags which apply to this entity, and find how
    # often they've been mentioned in other entities.
    for field in FIELDS:
        for value in entity.get(field, []):
            if value is None:
                continue
            queries.append({})
            queries.append({
                'size': 0,
                'query': {
                    'bool': {
                        'filter': [
                            authz_query(authz),
                            field_filter_query(field, value)
                        ],
                        'must_not': [
                            {'ids': {'values': [entity.get('id')]}},
                        ]
                    }
                }
            })
            pivots.append((field, value))

    if not len(queries):
        return []

    res = es.msearch(index=entities_index(), body=queries)
    results = []
    for (field, value), resp in zip(pivots, res.get('responses', [])):
        total = resp.get('hits', {}).get('total')
        if total is not None and total > 0:
            qvalue = quote(value.encode('utf-8'))
            key = ('filter:%s' % field, qvalue)
            results.append({
                'id': query_string([key]),
                'value': value,
                'field': field,
                'count': total
            })

    results.sort(key=lambda p: p['count'], reverse=True)
    return results
Example #5
def checksums_count(checksums):
    """Query how many documents mention a checksum."""
    schemata = model.get_type_schemata(registry.checksum)
    index = entities_read_index(schemata)
    body = []
    for checksum in checksums:
        body.append({"index": index})
        query = {"term": {"checksums": checksum}}
        body.append({"size": 0, "query": query})
    results = es.msearch(body=body)
    for checksum, result in zip(checksums, results.get("responses", [])):
        total = result.get("hits", {}).get("total", {}).get("value", 0)
        yield checksum, total
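
Example #5 is a generator, and it already reads the Elasticsearch 7 style hits.total object ({'value': N, ...}), whereas the older examples above treat total as a plain integer. A small usage sketch, with placeholder SHA-1 digests:

# Hypothetical call to checksums_count (Example #5); the digests are
# placeholders, not real document hashes.
wanted = [
    '356a192b7913b04c54574d18c28d46e6395428ab',
    'da4b9237bacccdf19c0760cab7aec4a8359010b0',
]
for checksum, total in checksums_count(wanted):
    print(checksum, 'is mentioned by', total, 'entities')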
Example #6
def entity_references(entity, authz):
    """Given a particular entity, find all the references to it from other
    entities, grouped by the property where they are used."""
    schema = model[entity.get('schema')]

    # Generate all the possible mention locations.
    properties = []
    queries = []
    for prop in model.properties:
        if not prop.is_entity:
            continue
        if not schema.is_a(prop.range):
            continue

        field = 'properties.%s' % prop.name
        queries.append({})
        queries.append({
            'size': 0,
            'query': {
                'bool': {
                    'filter': [
                        authz_query(authz),
                        {
                            'term': {
                                'schemata': prop.schema.name
                            }
                        },
                        {
                            'term': {
                                field: entity.get('id')
                            }
                        },
                    ]
                }
            }
        })
        properties.append(prop)

    # Run a count search (with schema facet?)
    res = es.msearch(index=entities_index(), body=queries)
    results = []
    for prop, resp in zip(properties, res.get('responses', [])):
        total = resp.get('hits', {}).get('total')
        if total is not None and total > 0:
            results.append({
                'count': total,
                'property': prop,
                'schema': prop.schema.name
            })
    return results
Example #7
def _filters_faceted_query(facets, authz=None):
    filters = {}
    indexed = {}
    for (idx, alias, group, field, value) in facets:
        indexed[idx] = indexed.get(idx, {})
        indexed[idx][alias] = field_filter_query(field, value)
        filters[idx] = filters.get(idx, {})
        filters[idx][group] = filters[idx].get(group, [])
        filters[idx][group].append(value)

    queries = []
    for (idx, facets) in indexed.items():
        shoulds = []
        for field, values in filters[idx].items():
            shoulds.append(field_filter_query(field, values))
        query = []
        if authz is not None:
            query.append(authz_query(authz))
        query = {
            'bool': {
                'should': shoulds,
                'filter': query,
                'minimum_should_match': 1
            }
        }
        queries.append({'index': idx})
        queries.append({
            'size': 0,
            'query': query,
            'aggs': {
                'counters': {
                    'filters': {
                        'filters': facets
                    }
                }
            }
        })

    results = {}
    if not len(queries):
        return results

    res = es.msearch(body=queries)
    for resp in res.get('responses', []):
        aggs = resp.get('aggregations', {}).get('counters', {})
        for alias, value in aggs.get('buckets', {}).items():
            results[alias] = value.get('doc_count', results.get(alias, 0))
    return results
Example #8
def entity_tags(entity, authz):
    """Do a search on tags of an entity."""
    proxy = model.get_proxy(entity)
    Thing = model.get(Entity.THING)
    types = [registry.name, registry.email, registry.identifier,
             registry.iban, registry.phone, registry.address]
    pivots = []
    queries = []
    # Go through all the tags which apply to this entity, and find how
    # often they've been mentioned in other entities.
    for type_ in types:
        if type_.group is None:
            continue
        for value in proxy.get_type_values(type_):
            if type_.specificity(value) < 0.1:
                continue
            schemata = model.get_type_schemata(type_)
            schemata = [s for s in schemata if s.is_a(Thing)]
            index = entities_read_index(schemata)
            queries.append({'index': index})
            queries.append({
                'size': 0,
                'query': {
                    'bool': {
                        'filter': [
                            authz_query(authz),
                            field_filter_query(type_.group, value)
                        ],
                        'must_not': [
                            {'ids': {'values': [entity.get('id')]}},
                        ]
                    }
                }
            })
            pivots.append((type_.group, value))

    if not len(queries):
        return

    res = es.msearch(body=queries)
    for (field, value), resp in zip(pivots, res.get('responses', [])):
        total = resp.get('hits', {}).get('total')
        if total is not None and total > 0:
            yield (field, value, total)
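
Unlike Example #4, which returns a sorted list of dicts, Examples #8 and #9 yield (field, value, total) tuples. A caller could rebuild a comparable structure like this (a sketch; the surrounding view code is assumed, not part of the source):

# Sketch: turn the tuples yielded by entity_tags (Examples #8/#9)
# into result dicts similar to Example #4. `entity` and `authz` are
# placeholders supplied by the caller.
tags = [
    {'field': field, 'value': value, 'count': total}
    for (field, value, total) in entity_tags(entity, authz)
]
tags.sort(key=lambda t: t['count'], reverse=True)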
Example #9
def entity_tags(entity, authz):
    """Do a search on tags of an entity."""
    # NOTE: This must also work for documents.
    FIELDS = [
        'names',
        'emails',
        'phones',
        'addresses',
        'identifiers'
    ]
    pivots = []
    queries = []
    # Go through all the tags which apply to this entity, and find how
    # often they've been mentioned in other entities.
    for field in FIELDS:
        for value in entity.get(field, []):
            if value is None or not len(value):
                continue
            queries.append({})
            queries.append({
                'size': 0,
                'query': {
                    'bool': {
                        'filter': [
                            authz_query(authz),
                            field_filter_query(field, value)
                        ],
                        'must_not': [
                            {'ids': {'values': [entity.get('id')]}},
                        ]
                    }
                }
            })
            pivots.append((field, value))

    if not len(queries):
        return

    res = es.msearch(index=entities_read_index(), body=queries)
    for (field, value), resp in zip(pivots, res.get('responses', [])):
        total = resp.get('hits', {}).get('total')
        if total is not None and total > 0:
            yield (field, value, total)
Example #10
def execute(self):
    "Run queries and dingle apart the returned responses."
    queries = self.compile()
    if not len(queries):
        return []
    body = []
    for (_, index, query) in queries:
        body.append({'index': index})
        body.append(query)
    results = es.msearch(body=body)
    responses = results.get('responses', [])
    for ((patterns, _, _), result) in zip(queries, responses):
        hits = result.get('hits', {}).get('hits', [])
        results = [unpack_result(r) for r in hits]
        aggs = result.get('aggregations', {}).get('counters', {})
        counters = aggs.get('buckets', {})
        for pattern in patterns:
            count = counters.get(pattern.id, {}).get('doc_count')
            pattern.apply(count, results)
    return self.patterns
Example #11
def _filters_faceted_query(authz, facets):
    filters = {}
    indexed = {}
    for (idx, alias, group, field, value) in facets:
        indexed[idx] = indexed.get(idx, {})
        indexed[idx][alias] = field_filter_query(field, value)
        filters[idx] = filters.get(idx, {})
        filters[idx][group] = filters[idx].get(group, [])
        filters[idx][group].append(value)

    queries = []
    for (idx, facets) in indexed.items():
        shoulds = []
        for field, values in filters[idx].items():
            shoulds.append(field_filter_query(field, values))
        query = {
            'bool': {
                'should': shoulds,
                'filter': [authz_query(authz)],
                'minimum_should_match': 1
            }
        }
        queries.append({'index': idx})
        queries.append({
            'size': 0,
            'query': query,
            'aggs': {'counters': {'filters': {'filters': facets}}}
        })

    results = {}
    if not len(queries):
        return results

    res = es.msearch(body=queries)
    for resp in res.get('responses', []):
        aggs = resp.get('aggregations', {}).get('counters', {})
        for alias, value in aggs.get('buckets', {}).items():
            results[alias] = value.get('doc_count', results.get(alias, 0))
    return results
Example #12
def entities_query(state, fields=None, facets=True, doc_counts=False):
    """Parse a user query string, compose and execute a query."""
    if state.has_text:
        q = {
            "query_string": {
                "query": state.text,
                "fields": ['name^5', 'names^2', 'text'],
                "default_operator": "AND",
                "use_dis_max": True
            }
        }
    else:
        q = match_all()

    if state.raw_query:
        q = {"bool": {"must": [q, state.raw_query]}}

    q = authz_filter(q, state.authz, roles=True)

    aggs = {'scoped': {'global': {}, 'aggs': {}}}
    if facets:
        facets = list(state.facet_names)
        if 'collections' in facets:
            aggs = facet_collections(state, q, aggs)
            facets.remove('collections')
        aggs = aggregate(state, q, aggs, facets)

    if state.sort == 'doc_count':
        sort = [{'doc_count': 'desc'}, '_score']
    elif state.sort == 'score':
        sort = ['_score', {'name_sort': 'asc'}]
    else:
        sort = [{'name_sort': 'asc'}]

    # pprint(q)
    q = {
        'sort': sort,
        'query': filter_query(q, state.filters),
        'aggregations': aggs,
        'size': state.limit,
        'from': state.offset,
        '_source': fields or DEFAULT_FIELDS
    }

    result, hits, output = execute_basic(TYPE_ENTITY, q)
    output['facets'] = parse_facet_result(state, result)
    sub_queries = []
    for doc in hits.get('hits', []):
        entity = doc.get('_source')
        entity['id'] = doc.get('_id')
        entity['score'] = doc.get('_score')
        entity['api_url'] = url_for('entities_api.view', id=doc.get('_id'))
        output['results'].append(entity)

        sq = {'term': {'entities.id': entity['id']}}
        sq = add_filter(
            sq, {'terms': {
                'collection_id': state.authz.collections_read
            }})
        sq = {'size': 0, 'query': sq}
        sub_queries.append(json.dumps({}))
        sub_queries.append(json.dumps(sq))

    if doc_counts and len(sub_queries):
        # Get the number of matching documents for each entity.
        body = '\n'.join(sub_queries)
        res = es.msearch(index=es_index, doc_type=TYPE_DOCUMENT, body=body)
        for (entity, res) in zip(output['results'], res.get('responses')):
            entity['doc_count'] = res.get('hits', {}).get('total')

    return output
Example #13
def documents_query(state, fields=None, facets=True, since=None):
    """Parse a user query string, compose and execute a query."""
    # This used to be several functions, but it's actually incredibly
    # procedural and so it's been linearised into one function. To really
    # clean this up, I think it should be based around an object model of
    # some sort.
    q = text_query(state.text)

    if state.raw_query:
        q = {"bool": {"must": [q, state.raw_query]}}

    q = authz_filter(q, state.authz, roles=False)

    # Used by alerting to find only updated results:
    if since is not None:
        q = add_filter(q, {"range": {"created_at": {"gt": since}}})

    # Sorting
    if state.sort == 'newest':
        sort = [{'dates': 'desc'}, {'created_at': 'desc'}, '_score']
    elif state.sort == 'oldest':
        sort = [{'dates': 'asc'}, {'created_at': 'asc'}, '_score']
    else:
        sort = ['_score']

    # TODO: find a better way to handle "slightly special" aggregations like
    # entities and collections.
    aggs = {'scoped': {'global': {}, 'aggs': {}}}
    if facets:
        facets = list(state.facet_names)
        if 'collections' in facets:
            aggs = facet_collections(state, q, aggs)
            facets.remove('collections')
        if 'entities' in facets:
            aggs = facet_entities(state, aggs)
            facets.remove('entities')
        aggs = aggregate(state, q, aggs, facets)

    # allow plug-ins to post-process the query.
    signals.document_query_process.send(q=q, state=state)

    q = {
        'sort': sort,
        'size': state.limit,
        'from': state.offset,
        'query': filter_query(q, state.filters),
        'aggregations': aggs,
        '_source': fields or DEFAULT_FIELDS
    }
    result, hits, output = execute_basic(TYPE_DOCUMENT, q)

    # This will add labels and other contextual information.
    output['facets'] = parse_facet_result(state, result)

    # After the main query has run, a sub-query will be run for each returned
    # result in order to find relevant records for result highlighting.
    sub_shoulds = records_query_shoulds(state)
    sub_queries = []
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        document['score'] = doc.get('_score')
        document['records'] = {'results': [], 'total': 0}
        collection_id = document.get('collection_id')
        try:
            # FIXME: there's got to be a nicer way of doing this....
            document['public'] = state.authz.collection_public(collection_id)
        except Exception:
            document['public'] = None

        sq = records_query_internal(document['id'], sub_shoulds)
        if sq is not None:
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))

        output['results'].append(document)

    if not len(sub_queries):
        return output

    body = '\n'.join(sub_queries)
    res = es.msearch(index=es_index, doc_type=TYPE_RECORD, body=body)
    for doc in output['results']:
        for sq in res.get('responses', []):
            sqhits = sq.get('hits', {})
            doc['records']['total'] = sqhits.get('total', 0)
            for hit in sqhits.get('hits', []):
                record = hit.get('_source')
                if doc['id'] != record['document_id']:
                    continue
                hlt = hit.get('highlight', {})
                texts = hlt.get('text', []) or hlt.get('text_latin', [])
                texts = [clean_highlight(t) for t in texts]
                texts = [t for t in texts if len(t)]
                if len(texts):
                    record['text'] = texts[0]
                    doc['records']['results'].append(record)

    return output