Beispiel #1
0
def execute_entities_query(args, query, doc_counts=False):
    """Execute the query and return a set of results."""
    result, hits, output = execute_basic(TYPE_ENTITY, query)
    convert_entity_aggregations(result, output, args)
    sub_queries = []
    for doc in hits.get('hits', []):
        entity = doc.get('_source')
        entity['id'] = doc.get('_id')
        entity['score'] = doc.get('_score')
        entity['api_url'] = url_for('entities_api.view', id=doc.get('_id'))
        output['results'].append(entity)

        sq = {'term': {'entities.uuid': entity['id']}}
        sq = authz_sources_filter(sq)
        sq = {'size': 0, 'query': sq}
        sub_queries.append(json.dumps({}))
        sub_queries.append(json.dumps(sq))

    if doc_counts and len(sub_queries):
        res = get_es().msearch(index=get_es_index(),
                               doc_type=TYPE_DOCUMENT,
                               body='\n'.join(sub_queries))
        for (entity, res) in zip(output['results'], res.get('responses')):
            entity['doc_count'] = res.get('hits', {}).get('total')
    return output
Beispiel #2
0
def alert_query(alert):
    """Execute the query and return a set of results."""
    q = text_query(alert.query_text)
    q = authz_filter(q)
    if alert.entity_id:
        q = filter_query(q, [('entities.id', alert.entity_id)], OR_FIELDS)
    if alert.notified_at:
        q = add_filter(q, {"range": {"created_at": {"gt": alert.notified_at}}})
    q = {'query': q, 'size': 150}

    result, hits, output = execute_basic(TYPE_DOCUMENT, q)
    collections = {}
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        document['collections'] = []
        for coll in document['collection_id']:
            if coll not in authz.collections(authz.READ):
                continue
            if coll not in collections:
                collections[coll] = Collection.by_id(coll)
            if collections[coll] is None:
                continue
            document['collections'].append(collections[coll])
        document['records'] = {'results': [], 'total': 0}
        output['results'].append(document)
    return output
Beispiel #3
0
def execute_documents_query(args, query):
    """Execute the query and return a set of results."""
    begin_time = time.time()
    result, hits, output = execute_basic(TYPE_DOCUMENT, query)
    query_duration = (time.time() - begin_time) * 1000
    log.debug('Query ES time: %.5fms', query_duration)
    convert_document_aggregations(result, output, args)
    query_duration = (time.time() - begin_time) * 1000
    log.debug('Post-facet accumulated: %.5fms', query_duration)
    sub_shoulds = records_query_shoulds(args)

    sub_queries = []
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        document['score'] = doc.get('_score')
        document['records'] = {'results': [], 'total': 0}

        # TODO: restore entity highlighting somehow.
        sq = records_query_internal(document['id'], sub_shoulds)
        if sq is not None:
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))

        output['results'].append(document)

    run_sub_queries(output, sub_queries)
    query_duration = (time.time() - begin_time) * 1000
    log.debug('Post-subquery accumulated: %.5fms', query_duration)
    return output
Beispiel #4
0
def execute_documents_query(args, query):
    """Execute the query and return a set of results."""
    begin_time = time.time()
    result, hits, output = execute_basic(TYPE_DOCUMENT, query)
    query_duration = (time.time() - begin_time) * 1000
    log.debug('Query ES time: %.5fms', query_duration)
    convert_document_aggregations(result, output, args)
    query_duration = (time.time() - begin_time) * 1000
    log.debug('Post-facet accumulated: %.5fms', query_duration)
    sub_shoulds = records_query_shoulds(args)

    sub_queries = []
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        document['score'] = doc.get('_score')
        document['records'] = {'results': [], 'total': 0}

        # TODO: restore entity highlighting somehow.
        sq = records_query_internal(document['id'], sub_shoulds)
        if sq is not None:
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))

        output['results'].append(document)

    run_sub_queries(output, sub_queries)
    query_duration = (time.time() - begin_time) * 1000
    log.debug('Post-subquery accumulated: %.5fms', query_duration)
    return output
Beispiel #5
0
def alert_query(alert):
    """Execute the query and return a set of results."""
    q = text_query(alert.query_text)
    q = authz_sources_filter(q)
    if alert.entity_id:
        q = filter_query(q, [('entities.uuid', alert.entity_id)], OR_FIELDS)
    if alert.notified_at:
        q = add_filter(q, {"range": {"created_at": {"gt": alert.notified_at}}})
    q = {'query': q, 'size': 150}

    result, hits, output = execute_basic(TYPE_DOCUMENT, q)
    sub_queries = []
    sources = {}
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        source_id = document['source_id']
        if source_id not in sources:
            sources[source_id] = Source.by_id(source_id)
        if sources[source_id] is None:
            continue
        document['source'] = sources[source_id]
        document['records'] = {'results': [], 'total': 0}

        sq = records_query(document['id'], alert.to_query(), size=1)
        if sq is not None:
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))
        output['results'].append(document)

    run_sub_queries(output, sub_queries)
    return output
Beispiel #6
0
def execute_tabular_query(query):
    """Execute a query against records and return a set of results."""
    result, hits, output = execute_basic(TYPE_RECORD, query)
    for rec in hits.get('hits', []):
        record = rec.get('_source').get('raw')
        record['_id'] = rec.get('_source', {}).get('row_id')
        output['results'].append(record)
    return output
Beispiel #7
0
def leads_query(collection_id, state):
    q = {'term': {'entity_collection_id': collection_id}}
    q = authz_filter(q, state.authz, roles=True)
    aggs = {'scoped': {'global': {}, 'aggs': {}}}
    facets = list(state.facet_names)
    if 'collections' in facets:
        aggs = facet_collections(state, q, aggs)
        facets.remove('collections')
    aggs = aggregate(state, q, aggs, facets)

    q = {
        'sort': [{
            'judgement': 'asc'
        }, {
            'score': 'desc'
        }, {
            'match_id': 'asc'
        }],
        'query': filter_query(q, state.filters),
        'aggregations': aggs,
        'size': state.limit,
        'from': state.offset
    }
    result, hits, output = execute_basic(TYPE_LEAD, q)
    output['facets'] = parse_facet_result(state, result)
    entities = set([])
    for doc in hits.get('hits', []):
        link = doc.get('_source')
        link['id'] = doc.get('_id')
        entities.add(link.get('entity_id'))
        entities.add(link.get('match_id'))
        output['results'].append(link)

    q = {'terms': {'_id': list(entities)}}
    q = {'query': q, 'size': len(entities) + 2}
    _, hits, _ = execute_basic(TYPE_ENTITY, q)
    for doc in hits.get('hits', []):
        entity = doc.get('_source')
        entity['id'] = doc.get('_id')
        for result in output['results']:
            if result.get('match_id') == entity['id']:
                result['match'] = entity
            if result.get('entity_id') == entity['id']:
                result['entity'] = entity
    return output
Beispiel #8
0
def execute_records_query(query):
    """Execute a query against records and return a set of results."""
    result, hits, output = execute_basic(TYPE_RECORD, query)
    for rec in hits.get('hits', []):
        record = rec.get('_source')
        record['score'] = rec.get('_score')
        record['text'] = rec.get('highlight', {}).get('text')
        output['results'].append(record)
    return output
Beispiel #9
0
def execute_records_query(query):
    """Execute a query against records and return a set of results."""
    result, hits, output = execute_basic(TYPE_RECORD, query)
    for rec in hits.get('hits', []):
        record = rec.get('_source')
        record['score'] = rec.get('_score')
        record['text'] = rec.get('highlight', {}).get('text')
        output['results'].append(record)
    return output
Beispiel #10
0
def execute_records_query(query):
    """Execute a query against records and return a set of results."""
    result, hits, output = execute_basic(TYPE_RECORD, query)
    for rec in hits.get("hits", []):
        record = rec.get("_source")
        record["score"] = rec.get("_score")
        record["text"] = rec.get("highlight", {}).get("text")
        output["results"].append(record)
    return output
Beispiel #11
0
def links_query(origin, state):
    """Parse a user query string, compose and execute a query."""
    if state.has_text:
        q = {
            "query_string": {
                "query": state.text,
                "fields": ['name^5', 'names^2', 'text'],
                "default_operator": "AND",
                "use_dis_max": True
            }
        }
    else:
        q = match_all()
    ids = origin.get('ids') or [origin.get('id')]
    q = add_filter(q, {'terms': {'origin.id': ids}})
    q = authz_filter(q, state.authz, roles=True)

    aggs = {'scoped': {'global': {}, 'aggs': {}}}
    aggs = aggregate(state, q, aggs, state.facet_names)

    if state.sort == 'score':
        sort = ['_score']
    else:
        sort = [{
            'properties.start_date': 'desc'
        }, {
            'properties.end_date': 'desc'
        }]

    q = {
        'sort': sort,
        'query': filter_query(q, state.filters),
        'aggregations': aggs,
        'size': state.limit,
        'from': state.offset,
        '_source': DEFAULT_FIELDS
    }

    result, hits, output = execute_basic(TYPE_LINK, q)
    output['facets'] = parse_facet_result(state, result)
    for doc in hits.get('hits', []):
        link = doc.get('_source')
        link['id'] = doc.get('_id')
        link['score'] = doc.get('_score')
        output['results'].append(link)
    return output
Beispiel #12
0
def alert_query(alert):
    """Execute the query and return a set of results."""
    q = text_query(alert.query_text)
    q = authz_filter(q)
    if alert.entity_id:
        q = filter_query(q, [('entities.id', alert.entity_id)], OR_FIELDS)
    if alert.notified_at:
        q = add_filter(q, {
            "range": {
                "created_at": {
                    "gt": alert.notified_at
                }
            }
        })
    q = {
        'query': q,
        'size': 150
    }

    result, hits, output = execute_basic(TYPE_DOCUMENT, q)
    sub_queries = []
    collections = {}
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        document['collections'] = []
        for coll in document['collection_id']:
            if coll not in authz.collections(authz.READ):
                continue
            if coll not in collections:
                collections[coll] = Collection.by_id(coll)
            if collections[coll] is None:
                continue
            document['collections'].append(collections[coll])
        document['records'] = {'results': [], 'total': 0}

        sq = records_query(document['id'], alert.to_query(), size=1)
        if sq is not None:
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))
        output['results'].append(document)

    run_sub_queries(output, sub_queries)
    return output
def execute_records_query(document_id, state, query):
    """Execute a query against records and return a set of results."""
    result, hits, output = execute_basic(TYPE_RECORD, query)
    rows = []
    for rec in hits.get('hits', []):
        record = rec.get('_source')
        record['score'] = rec.get('_score')
        if record.get('row_id'):
            rows.append((record.get('sheet'), record.get('row_id')))

        for text in rec.get('highlight', {}).get('text', []):
            record['text'] = text
        output['results'].append(record)

    for record in DocumentRecord.find_rows(document_id, rows):
        for res in output['results']:
            if res['sheet'] == record.sheet and res['row_id'] == record.row_id:
                res['data'] = record.data
    return output
Beispiel #14
0
def execute_records_query(document_id, state, query):
    """Execute a query against records and return a set of results."""
    result, hits, output = execute_basic(TYPE_RECORD, query)
    ids = []
    for rec in hits.get('hits', []):
        record = rec.get('_source')
        record['score'] = rec.get('_score')
        record['id'] = int(rec.get('_id'))
        ids.append(rec.get('_id'))
        for text in rec.get('highlight', {}).get('text', []):
            record['text'] = text
        output['results'].append(record)

    for record in DocumentRecord.find_records(document_id, ids):
        for result in output['results']:
            if result['id'] == record.id:
                result['data'] = record.data
                result['text'] = record.text
    return output
Beispiel #15
0
def alert_query(alert):
    """Execute the query and return a set of results."""
    q = text_query(alert.query_text)
    q = authz_sources_filter(q)
    if alert.entity_id:
        q = filter_query(q, [('entities.uuid', alert.entity_id)], OR_FIELDS)
    if alert.notified_at:
        q = add_filter(q, {
            "range": {
                "created_at": {
                    "gt": alert.notified_at
                }
            }
        })
    q = {
        'query': q,
        'size': 150
    }

    result, hits, output = execute_basic(TYPE_DOCUMENT, q)
    sub_queries = []
    sources = {}
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        source_id = document['source_id']
        if source_id not in sources:
            sources[source_id] = Source.by_id(source_id)
        if sources[source_id] is None:
            continue
        document['source'] = sources[source_id]
        document['records'] = {'results': [], 'total': 0}

        sq = records_query(document['id'], alert.to_query(), size=1)
        if sq is not None:
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))
        output['results'].append(document)

    run_sub_queries(output, sub_queries)
    return output
Beispiel #16
0
def execute_documents_query(args, query):
    """Execute the query and return a set of results."""
    result, hits, output = execute_basic(TYPE_DOCUMENT, query)
    convert_document_aggregations(result, output, args)
    sub_queries = []
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        document['score'] = doc.get('_score')
        document['records'] = {'results': [], 'total': 0}

        sq = records_query(document['id'], args)
        if sq is not None:
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))

        document['api_url'] = url_for('documents_api.view',
                                      document_id=doc.get('_id'))
        document['data_url'] = url_for('documents_api.file',
                                       document_id=doc.get('_id'))
        output['results'].append(document)

    run_sub_queries(output, sub_queries)
    return output
Beispiel #17
0
def execute_documents_alert_query(args, query):
    """Execute the query and return a set of results."""
    if not isinstance(args, MultiDict):
        args = MultiDict(args)
    query['size'] = 50
    result, hits, output = execute_basic(TYPE_DOCUMENT, query)
    convert_aggregations(result, output, args)
    sub_queries = []
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        for source in output['sources']['values']:
            if source['id'] == document['source_id']:
                document['source'] = source
        document['records'] = {'results': [], 'total': 0}

        sq = records_query(document['id'], args, size=1)
        if sq is not None:
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))
        output['results'].append(document)

    run_sub_queries(output, sub_queries)
    return output
Beispiel #18
0
def execute_documents_query(args, query):
    """Execute the query and return a set of results."""
    result, hits, output = execute_basic(TYPE_DOCUMENT, query)
    convert_aggregations(result, output, args)
    sub_queries = []
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        document['score'] = doc.get('_score')
        document['records'] = {'results': [], 'total': 0}

        sq = records_query(document['id'], args)
        if sq is not None:
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))

        document['api_url'] = url_for('documents_api.view',
                                      document_id=doc.get('_id'))
        document['data_url'] = url_for('documents_api.file',
                                       document_id=doc.get('_id'))
        output['results'].append(document)

    run_sub_queries(output, sub_queries)
    return output
Beispiel #19
0
def new_doc_count():
    #XXX this needs to be cached
    query = {'query': {'range': {'created_at': {'gte': 'now-7d/d'}}}}
    res = execute_basic('document', query)
    total = res[1].get('total')
    return jsonify({'week': total})
Beispiel #20
0
def entities_query(state, fields=None, facets=True, doc_counts=False):
    """Parse a user query string, compose and execute a query."""
    if state.has_text:
        q = {
            "query_string": {
                "query": state.text,
                "fields": ['name^5', 'names^2', 'text'],
                "default_operator": "AND",
                "use_dis_max": True
            }
        }
    else:
        q = match_all()

    if state.raw_query:
        q = {"bool": {"must": [q, state.raw_query]}}

    q = authz_filter(q, state.authz, roles=True)

    aggs = {'scoped': {'global': {}, 'aggs': {}}}
    if facets:
        facets = list(state.facet_names)
        if 'collections' in facets:
            aggs = facet_collections(state, q, aggs)
            facets.remove('collections')
        aggs = aggregate(state, q, aggs, facets)

    if state.sort == 'doc_count':
        sort = [{'doc_count': 'desc'}, '_score']
    elif state.sort == 'score':
        sort = ['_score', {'name_sort': 'asc'}]
    else:
        sort = [{'name_sort': 'asc'}]

    # pprint(q)
    q = {
        'sort': sort,
        'query': filter_query(q, state.filters),
        'aggregations': aggs,
        'size': state.limit,
        'from': state.offset,
        '_source': fields or DEFAULT_FIELDS
    }

    result, hits, output = execute_basic(TYPE_ENTITY, q)
    output['facets'] = parse_facet_result(state, result)
    sub_queries = []
    for doc in hits.get('hits', []):
        entity = doc.get('_source')
        entity['id'] = doc.get('_id')
        entity['score'] = doc.get('_score')
        entity['api_url'] = url_for('entities_api.view', id=doc.get('_id'))
        output['results'].append(entity)

        sq = {'term': {'entities.id': entity['id']}}
        sq = add_filter(
            sq, {'terms': {
                'collection_id': state.authz.collections_read
            }})
        sq = {'size': 0, 'query': sq}
        sub_queries.append(json.dumps({}))
        sub_queries.append(json.dumps(sq))

    if doc_counts and len(sub_queries):
        # Get the number of matching documents for each entity.
        body = '\n'.join(sub_queries)
        res = es.msearch(index=es_index, doc_type=TYPE_DOCUMENT, body=body)
        for (entity, res) in zip(output['results'], res.get('responses')):
            entity['doc_count'] = res.get('hits', {}).get('total')

    return output
Beispiel #21
0
def documents_query(state, fields=None, facets=True, since=None):
    """Parse a user query string, compose and execute a query."""
    # This used to be several functions, but it's actually incredibly
    # procedural and so it's been linearised into one function. To really
    # clean this up, I think it should be based around an object model of
    # some sort.
    q = text_query(state.text)

    if state.raw_query:
        q = {"bool": {"must": [q, state.raw_query]}}

    q = authz_filter(q, state.authz, roles=False)

    # Used by alerting to find only updated results:
    if since is not None:
        q = add_filter(q, {"range": {"created_at": {"gt": since}}})

    # Sorting
    if state.sort == 'newest':
        sort = [{'dates': 'desc'}, {'created_at': 'desc'}, '_score']
    if state.sort == 'oldest':
        sort = [{'dates': 'asc'}, {'created_at': 'asc'}, '_score']
    else:
        sort = ['_score']

    # TODO: find a better way to handle "slightly special" aggregations like
    # entities and collections.
    aggs = {'scoped': {'global': {}, 'aggs': {}}}
    if facets:
        facets = list(state.facet_names)
        if 'collections' in facets:
            aggs = facet_collections(state, q, aggs)
            facets.remove('collections')
        if 'entities' in facets:
            aggs = facet_entities(state, aggs)
            facets.remove('entities')
        aggs = aggregate(state, q, aggs, facets)

    # allow plug-ins to post-process the query.
    signals.document_query_process.send(q=q, state=state)

    q = {
        'sort': sort,
        'size': state.limit,
        'from': state.offset,
        'query': filter_query(q, state.filters),
        'aggregations': aggs,
        '_source': fields or DEFAULT_FIELDS
    }
    result, hits, output = execute_basic(TYPE_DOCUMENT, q)

    # This will add labels and other contextual information.
    output['facets'] = parse_facet_result(state, result)

    # After the main query has run, a sub-query will be run for each returned
    # result in order to find relevant records for result highlighting.
    sub_shoulds = records_query_shoulds(state)
    sub_queries = []
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        document['score'] = doc.get('_score')
        document['records'] = {'results': [], 'total': 0}
        collection_id = document.get('collection_id')
        try:
            # FIXME: there's got to be a nicer way of doing this....
            document['public'] = state.authz.collection_public(collection_id)
        except:
            document['public'] = None

        sq = records_query_internal(document['id'], sub_shoulds)
        if sq is not None:
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))

        output['results'].append(document)

    if not len(sub_queries):
        return output

    body = '\n'.join(sub_queries)
    res = es.msearch(index=es_index, doc_type=TYPE_RECORD, body=body)
    for doc in output['results']:
        for sq in res.get('responses', []):
            sqhits = sq.get('hits', {})
            doc['records']['total'] = sqhits.get('total', 0)
            for hit in sqhits.get('hits', {}):
                record = hit.get('_source')
                if doc['id'] != record['document_id']:
                    continue
                hlt = hit.get('highlight', {})
                texts = hlt.get('text', []) or hlt.get('text_latin', [])
                texts = [clean_highlight(t) for t in texts]
                texts = [t for t in texts if len(t)]
                if len(texts):
                    record['text'] = texts[0]
                    doc['records']['results'].append(record)

    return output