Python clean_highlight Examples, aleph.search.util.clean_highlight Python Examples

Example #1

0

Show file

def run_sub_queries(output, sub_queries):
    if len(sub_queries):
        res = get_es().msearch(index=get_es_index(),
                               doc_type=TYPE_RECORD,
                               body='\n'.join(sub_queries))
        for doc in output['results']:
            for sq in res.get('responses', []):
                sqhits = sq.get('hits', {})
                for hit in sqhits.get('hits', {}):
                    record = hit.get('_source')
                    if doc['id'] != record.get('document_id'):
                        continue
                    record['score'] = hit.get('_score')
                    highlights = hit.get('highlight', {})
                    if len(highlights.get('text', [])):
                        record['text'] = highlights.get('text')
                    elif len(highlights.get('text_latin', [])):
                        record['text'] = highlights.get('text_latin', [])
                    else:
                        continue
                    record['text'] = [
                        clean_highlight(t) for t in record['text']
                    ]
                    doc['records']['results'].append(record)
                    doc['records']['total'] = sqhits.get('total', 0)

Example #2

0

Show file

File: documents.py Project: stefanw/aleph

def run_sub_queries(output, sub_queries):
    if len(sub_queries):
        res = get_es().msearch(index=get_es_index(), doc_type=TYPE_RECORD,
                               body='\n'.join(sub_queries))
        for doc in output['results']:
            for sq in res.get('responses', []):
                sqhits = sq.get('hits', {})
                for hit in sqhits.get('hits', {}):
                    record = hit.get('_source')
                    if doc['id'] != record.get('document_id'):
                        continue
                    record['score'] = hit.get('_score')
                    highlights = hit.get('highlight', {})
                    if len(highlights.get('text', [])):
                        record['text'] = highlights.get('text')
                    elif len(highlights.get('text_latin', [])):
                        record['text'] = highlights.get('text_latin', [])
                    else:
                        continue
                    record['text'] = [clean_highlight(t) for t in record['text']]
                    doc['records']['results'].append(record)
                    doc['records']['total'] = sqhits.get('total', 0)

Example #3

0

Show file

File: documents.py Project: wethepeopleonline/aleph

def documents_query(state, fields=None, facets=True, since=None):
    """Parse a user query string, compose and execute a query."""
    # This used to be several functions, but it's actually incredibly
    # procedural and so it's been linearised into one function. To really
    # clean this up, I think it should be based around an object model of
    # some sort.
    q = text_query(state.text)

    if state.raw_query:
        q = {"bool": {"must": [q, state.raw_query]}}

    q = authz_filter(q, state.authz, roles=False)

    # Used by alerting to find only updated results:
    if since is not None:
        q = add_filter(q, {"range": {"created_at": {"gt": since}}})

    # Sorting
    if state.sort == 'newest':
        sort = [{'dates': 'desc'}, {'created_at': 'desc'}, '_score']
    if state.sort == 'oldest':
        sort = [{'dates': 'asc'}, {'created_at': 'asc'}, '_score']
    else:
        sort = ['_score']

    # TODO: find a better way to handle "slightly special" aggregations like
    # entities and collections.
    aggs = {'scoped': {'global': {}, 'aggs': {}}}
    if facets:
        facets = list(state.facet_names)
        if 'collections' in facets:
            aggs = facet_collections(state, q, aggs)
            facets.remove('collections')
        if 'entities' in facets:
            aggs = facet_entities(state, aggs)
            facets.remove('entities')
        aggs = aggregate(state, q, aggs, facets)

    # allow plug-ins to post-process the query.
    signals.document_query_process.send(q=q, state=state)

    q = {
        'sort': sort,
        'size': state.limit,
        'from': state.offset,
        'query': filter_query(q, state.filters),
        'aggregations': aggs,
        '_source': fields or DEFAULT_FIELDS
    }
    result, hits, output = execute_basic(TYPE_DOCUMENT, q)

    # This will add labels and other contextual information.
    output['facets'] = parse_facet_result(state, result)

    # After the main query has run, a sub-query will be run for each returned
    # result in order to find relevant records for result highlighting.
    sub_shoulds = records_query_shoulds(state)
    sub_queries = []
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        document['score'] = doc.get('_score')
        document['records'] = {'results': [], 'total': 0}
        collection_id = document.get('collection_id')
        try:
            # FIXME: there's got to be a nicer way of doing this....
            document['public'] = state.authz.collection_public(collection_id)
        except:
            document['public'] = None

        sq = records_query_internal(document['id'], sub_shoulds)
        if sq is not None:
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))

        output['results'].append(document)

    if not len(sub_queries):
        return output

    body = '\n'.join(sub_queries)
    res = es.msearch(index=es_index, doc_type=TYPE_RECORD, body=body)
    for doc in output['results']:
        for sq in res.get('responses', []):
            sqhits = sq.get('hits', {})
            doc['records']['total'] = sqhits.get('total', 0)
            for hit in sqhits.get('hits', {}):
                record = hit.get('_source')
                if doc['id'] != record['document_id']:
                    continue
                hlt = hit.get('highlight', {})
                texts = hlt.get('text', []) or hlt.get('text_latin', [])
                texts = [clean_highlight(t) for t in texts]
                texts = [t for t in texts if len(t)]
                if len(texts):
                    record['text'] = texts[0]
                    doc['records']['results'].append(record)

    return output