Example #1
File: __init__.py Project: 01-/aleph
def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    try:
        log.info("Index document: %r", document)
        data = document.to_index_dict()
        data['entities'] = generate_entities(document)
        data['title_latin'] = latinize_text(data.get('title'))
        data['summary_latin'] = latinize_text(data.get('summary'))
        get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT, body=data,
                       id=document.id)

        clear_children(document)
        if document.type == Document.TYPE_TEXT:
            bulk(get_es(), generate_pages(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)

        if document.type == Document.TYPE_TABULAR:
            bulk(get_es(), generate_records(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INDEX, component=__name__,
                          document_id=document.id, meta=document.meta,
                          source_id=document.source_id, exception=ex)
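
The helper functions used above (Document.by_id, generate_entities, latinize_text, get_es, get_es_index) are defined elsewhere in aleph. As a rough orientation, latinize_text can be thought of as an ASCII transliteration step; the stand-in below is an assumption for illustration only, not aleph's actual implementation.

import unicodedata

def latinize_text(text):
    # Hypothetical stand-in: decompose accented characters and drop the
    # combining marks, leaving a rough ASCII approximation for search.
    # aleph's real latinize_text may behave differently.
    if text is None:
        return None
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ascii', 'ignore').decode('ascii')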
Example #2
def delete_document(document_id):
    clear_records(document_id)
    try:
        get_es().delete(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                        id=document_id)
    except NotFoundError:
        pass
Example #3
File: documents.py Project: tomjie/aleph
def delete_document(document_id):
    clear_records(document_id)
    try:
        get_es().delete(index=get_es_index(),
                        doc_type=TYPE_DOCUMENT,
                        id=document_id)
    except NotFoundError:
        pass
Example #4
File: __init__.py Project: 01-/aleph
def init_search():
    log.info("Creating ElasticSearch index and uploading mapping...")
    get_es().indices.create(get_es_index(), body={
        'mappings': {
            TYPE_DOCUMENT: DOCUMENT_MAPPING,
            TYPE_RECORD: RECORD_MAPPING
        }
    })
Example #5
File: admin.py Project: stefanw/aleph
def init_search():
    log.info("Creating ElasticSearch index and uploading mapping...")
    get_es().indices.create(get_es_index(), body={
        'mappings': {
            TYPE_DOCUMENT: DOCUMENT_MAPPING,
            TYPE_RECORD: RECORD_MAPPING
        }
    })
    get_es().indices.open(index=get_es_index())
Example #6
def init_search():
    log.info("Creating ElasticSearch index and uploading mapping...")
    get_es().indices.create(get_es_index(), body={
        'mappings': {
            TYPE_DOCUMENT: DOCUMENT_MAPPING,
            TYPE_RECORD: RECORD_MAPPING,
            TYPE_ENTITY: ENTITY_MAPPING
        }
    }, ignore=[400, 404])
    get_es().indices.open(index=get_es_index(), ignore=[400, 404])
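
DOCUMENT_MAPPING, RECORD_MAPPING and ENTITY_MAPPING are module-level constants defined elsewhere in aleph. For orientation only, a mapping for this 2.x-era Elasticsearch API is a plain dict of field definitions; the field names below are illustrative assumptions, not aleph's real schema.

DOCUMENT_MAPPING = {
    'properties': {
        'title': {'type': 'string'},
        'title_latin': {'type': 'string'},
        'summary': {'type': 'string'},
        'source_id': {'type': 'integer'},
        'entities': {
            'properties': {
                'uuid': {'type': 'string', 'index': 'not_analyzed'}
            }
        }
    }
}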
Example #7
def index_document(document, index_records=True):
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT, body=data,
                   id=document.id)

    if index_records:
        clear_records(document.id)
        bulk_op(generate_records(document))
Example #8
def index_entity(entity):
    """Index an entity."""
    data = entity.to_dict()
    data.pop('id', None)
    data['doc_count'] = get_count(entity)
    data['terms'] = entity.terms
    data['terms_latin'] = [latinize_text(t) for t in entity.terms]
    data['name_latin'] = latinize_text(data.get('name'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    data['description_latin'] = latinize_text(data.get('description'))
    data = expand_json(data)
    get_es().index(index=get_es_index(), doc_type=TYPE_ENTITY,
                   id=entity.id, body=data)
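
A possible call site for index_entity, assuming an Entity model with a by_id lookup analogous to Document.by_id (that lookup is an assumption here); flush_es, shown in a later example, makes the change immediately visible to searches.

def reindex_entity(entity_id):
    # Hypothetical usage sketch; Entity.by_id is assumed, not shown above.
    entity = Entity.by_id(entity_id)
    if entity is not None:
        index_entity(entity)
        flush_es()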
Example #9
File: documents.py Project: tomjie/aleph
def index_document(document, index_records=True):
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(),
                   doc_type=TYPE_DOCUMENT,
                   body=data,
                   id=document.id)

    if index_records:
        clear_records(document.id)
        bulk_op(generate_records(document))
Example #10
File: __init__.py Project: andkamau/aleph
def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT, body=data,
                   id=document.id)

    clear_records(document)
    bulk_op(generate_records(document))
Example #11
def init_search():
    log.info("Creating ElasticSearch index and uploading mapping...")
    get_es().indices.create(get_es_index(),
                            body={
                                'mappings': {
                                    TYPE_DOCUMENT: DOCUMENT_MAPPING,
                                    TYPE_RECORD: RECORD_MAPPING,
                                    TYPE_ENTITY: ENTITY_MAPPING
                                },
                                'settings': {
                                    'number_of_shards': 10,
                                    'number_of_replicas': 1,
                                }
                            })
    get_es().indices.open(index=get_es_index())
Example #12
def suggest_entities(prefix, min_count=0, schemas=None, size=5):
    """Auto-complete API."""
    options = []
    if prefix is not None and len(prefix.strip()):
        q = {'match_phrase_prefix': {'terms': prefix.strip()}}
        if min_count > 0:
            q = add_filter(q, {'range': {'doc_count': {'gte': min_count}}})
        if schemas is not None and len(schemas):
            q = add_filter(q, {'terms': {'$schema': schemas}})
        q = {
            'size': size,
            'sort': [{
                'doc_count': 'desc'
            }, '_score'],
            'query': authz_filter(q),
            '_source': ['name', '$schema', 'terms', 'doc_count']
        }
        ref = latinize_text(prefix)
        result = get_es().search(index=get_es_index(),
                                 doc_type=TYPE_ENTITY,
                                 body=q)
        for res in result.get('hits', {}).get('hits', []):
            ent = res.get('_source')
            terms = [latinize_text(t) for t in ent.pop('terms', [])]
            ent['match'] = ref in terms
            ent['score'] = res.get('_score')
            ent['id'] = res.get('_id')
            options.append(ent)
    return {'prefix': prefix, 'results': options}
Example #13
def suggest_entities(prefix, min_count=0, schemas=None, size=5):
    """Auto-complete API."""
    options = []
    if prefix is not None and len(prefix.strip()):
        q = {
            'match_phrase_prefix': {'terms': prefix.strip()}
        }
        if min_count > 0:
            q = add_filter(q, {'range': {'doc_count': {'gte': min_count}}})
        if schemas is not None and len(schemas):
            q = add_filter(q, {'terms': {'$schema': schemas}})
        q = {
            'size': size,
            'sort': [{'doc_count': 'desc'}, '_score'],
            'query': authz_filter(q),
            '_source': ['name', '$schema', 'terms', 'doc_count']
        }
        ref = latinize_text(prefix)
        result = get_es().search(index=get_es_index(), doc_type=TYPE_ENTITY,
                                 body=q)
        for res in result.get('hits', {}).get('hits', []):
            ent = res.get('_source')
            terms = [latinize_text(t) for t in ent.pop('terms', [])]
            ent['match'] = ref in terms
            ent['score'] = res.get('_score')
            ent['id'] = res.get('_id')
            options.append(ent)
    return {
        'prefix': prefix,
        'results': options
    }
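
add_filter itself is not shown in these examples. One plausible shape for it, assuming the 2.x-style query DSL used above, is to wrap the existing query and the new filter together in a bool clause; aleph's actual helper may differ.

def add_filter(q, flt):
    # Sketch only: combine the current query with one extra filter clause.
    return {
        'bool': {
            'must': [q],
            'filter': [flt]
        }
    }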
Example #14
File: __init__.py Project: 01-/aleph
def delete_source(source_id):
    """Delete all documents from a particular source."""
    q = {'query': {'term': {'source_id': source_id}}}

    def deletes():
        q['_source'] = ['document_id']
        for res in scan(get_es(), query=q, index=get_es_index(),
                        doc_type=[TYPE_RECORD]):
            yield {
                '_op_type': 'delete',
                '_index': get_es_index(),
                '_parent': res.get('_source', {}).get('document_id'),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }

        q['_source'] = []
        for res in scan(get_es(), query=q, index=get_es_index(),
                        doc_type=[TYPE_DOCUMENT]):
            yield {
                '_op_type': 'delete',
                '_index': get_es_index(),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }

    try:
        bulk(get_es(), deletes(), stats_only=True, chunk_size=2000,
             request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)
Example #15
File: peek.py Project: nivertech/aleph
def peek_query(args):
    if not isinstance(args, MultiDict):
        args = MultiDict(args)
    text = args.get('q', '').strip()
    q = text_query(text)

    filters = parse_filters(args)
    for entity in args.getlist('entity'):
        filters.append(('entities.id', entity))

    q = filter_query(q, filters, [])
    q = add_filter(q, {
        'not': {
            'terms': {
                'collection_id': authz.collections(authz.READ)
            }
        }
    })
    q = {
        'query': q,
        'size': 0,
        'aggregations': {
            'collections': {
                'terms': {'field': 'collection_id', 'size': 30}
            }
        },
        '_source': False
    }
    # import json
    # print json.dumps(q, indent=2)
    result = get_es().search(index=get_es_index(), body=q,
                             doc_type=TYPE_DOCUMENT)

    aggs = result.get('aggregations', {}).get('collections', {})
    buckets = aggs.get('buckets', [])
    q = Collection.all_by_ids([b['key'] for b in buckets])
    q = q.filter(Collection.creator_id != None)  # noqa
    objs = {o.id: o for o in q.all()}
    roles = {}
    for bucket in buckets:
        collection = objs.get(bucket.get('key'))
        if collection is None or collection.private:
            continue
        if collection.creator_id in roles:
            roles[collection.creator_id]['total'] += bucket.get('doc_count')
        else:
            roles[collection.creator_id] = {
                'name': collection.creator.name,
                'email': collection.creator.email,
                'total': bucket.get('doc_count')
            }

    roles = sorted(roles.values(), key=lambda r: r['total'], reverse=True)
    roles = [format_total(r) for r in roles]
    total = result.get('hits', {}).get('total')
    return format_total({
        'roles': roles,
        'active': total > 0,
        'total': total
    })
Example #16
def suggest_entities(args):
    """Auto-complete API."""
    text = args.get('prefix')
    min_count = int(args.get('min_count', 0))
    options = []
    if text is not None and len(text.strip()):
        q = {
            'bool': {
                'must': [
                    {'match_phrase_prefix': {'terms': text.strip()}},
                    {'range': {'doc_count': {'gte': min_count}}}
                ]
            }
        }
        q = {
            'size': 5,
            'sort': [{'doc_count': 'desc'}, '_score'],
            'query': authz_collections_filter(q),
            '_source': ['name', '$schema', 'terms', 'doc_count']
        }
        ref = latinize_text(text)
        result = get_es().search(index=get_es_index(), doc_type=TYPE_ENTITY,
                                 body=q)
        for res in result.get('hits', {}).get('hits', []):
            ent = res.get('_source')
            terms = [latinize_text(t) for t in ent.pop('terms', [])]
            ent['match'] = ref in terms
            ent['id'] = res.get('_id')
            options.append(ent)
    return {
        'text': text,
        'results': options
    }
Example #17
def run_sub_queries(output, sub_queries):
    if len(sub_queries):
        res = get_es().msearch(index=get_es_index(),
                               doc_type=TYPE_RECORD,
                               body='\n'.join(sub_queries))
        for doc in output['results']:
            for sq in res.get('responses', []):
                sqhits = sq.get('hits', {})
                for hit in sqhits.get('hits', {}):
                    record = hit.get('_source')
                    if doc['id'] != record.get('document_id'):
                        continue
                    record['score'] = hit.get('_score')
                    highlights = hit.get('highlight', {})
                    if len(highlights.get('text', [])):
                        record['text'] = highlights.get('text')
                    elif len(highlights.get('text_latin', [])):
                        record['text'] = highlights.get('text_latin', [])
                    else:
                        continue
                    record['text'] = [
                        clean_highlight(t) for t in record['text']
                    ]
                    doc['records']['results'].append(record)
                    doc['records']['total'] = sqhits.get('total', 0)
Example #18
File: __init__.py Project: stefanw/aleph
def delete_source(source_id):
    """Delete all documents from a particular source."""
    q = {'query': {'term': {'source_id': source_id}}, '_source': False}

    def deletes():
        for res in scan(get_es(), query=q, index=get_es_index(),
                        doc_type=[TYPE_RECORD]):
            yield {
                '_op_type': 'delete',
                '_index': get_es_index(),
                '_parent': res.get('_parent'),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }
        for res in scan(get_es(), query=q, index=get_es_index(),
                        doc_type=[TYPE_DOCUMENT]):
            yield {
                '_op_type': 'delete',
                '_index': get_es_index(),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }

    try:
        bulk(get_es(), deletes(), stats_only=True, chunk_size=2000,
             request_timeout=60.0)
    except Exception:
        log.debug("Failed to clear documents: %r", source_id)
Example #19
def delete_source(source_id):
    """Delete all documents from a particular source."""
    q = {'query': {'term': {'source_id': source_id}}, '_source': False}

    def deletes():
        for res in scan(get_es(),
                        query=q,
                        index=get_es_index(),
                        doc_type=[TYPE_RECORD]):
            yield {
                '_op_type': 'delete',
                '_index': get_es_index(),
                '_parent': res.get('_parent'),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }
        for res in scan(get_es(),
                        query=q,
                        index=get_es_index(),
                        doc_type=[TYPE_DOCUMENT]):
            yield {
                '_op_type': 'delete',
                '_index': get_es_index(),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }

    try:
        bulk(get_es(),
             deletes(),
             stats_only=True,
             chunk_size=2000,
             request_timeout=60.0)
    except Exception:
        log.debug("Failed to clear documents: %r", source_id)
Example #20
File: peek.py Project: tomjie/aleph
def peek_query(args):
    if not isinstance(args, MultiDict):
        args = MultiDict(args)
    text = args.get('q', '').strip()
    q = text_query(text)

    filters = parse_filters(args)
    for entity in args.getlist('entity'):
        filters.append(('entities.id', entity))

    q = filter_query(q, filters, [])
    q = add_filter(
        q,
        {'not': {
            'terms': {
                'collection_id': authz.collections(authz.READ)
            }
        }})
    q = {
        'query': q,
        'size': 0,
        'aggregations': {
            'collections': {
                'terms': {
                    'field': 'collection_id',
                    'size': 30
                }
            }
        },
        '_source': False
    }
    # import json
    # print json.dumps(q, indent=2)
    result = get_es().search(index=get_es_index(),
                             body=q,
                             doc_type=TYPE_DOCUMENT)

    aggs = result.get('aggregations', {}).get('collections', {})
    buckets = aggs.get('buckets', [])
    q = Collection.all_by_ids([b['key'] for b in buckets])
    q = q.filter(Collection.creator_id != None)  # noqa
    objs = {o.id: o for o in q.all()}
    roles = {}
    for bucket in buckets:
        collection = objs.get(bucket.get('key'))
        if collection is None or collection.private:
            continue
        if collection.creator_id in roles:
            roles[collection.creator_id]['total'] += bucket.get('doc_count')
        else:
            roles[collection.creator_id] = {
                'name': collection.creator.name,
                'email': collection.creator.email,
                'total': bucket.get('doc_count')
            }

    roles = sorted(roles.values(), key=lambda r: r['total'], reverse=True)
    roles = [format_total(r) for r in roles]
    total = result.get('hits', {}).get('total')
    return format_total({'roles': roles, 'active': total > 0, 'total': total})
Example #21
def execute_entities_query(args, query, doc_counts=False):
    """Execute the query and return a set of results."""
    result, hits, output = execute_basic(TYPE_ENTITY, query)
    convert_entity_aggregations(result, output, args)
    sub_queries = []
    for doc in hits.get('hits', []):
        entity = doc.get('_source')
        entity['id'] = doc.get('_id')
        entity['score'] = doc.get('_score')
        entity['api_url'] = url_for('entities_api.view', id=doc.get('_id'))
        output['results'].append(entity)

        sq = {'term': {'entities.uuid': entity['id']}}
        sq = authz_sources_filter(sq)
        sq = {'size': 0, 'query': sq}
        sub_queries.append(json.dumps({}))
        sub_queries.append(json.dumps(sq))

    if doc_counts and len(sub_queries):
        res = get_es().msearch(index=get_es_index(),
                               doc_type=TYPE_DOCUMENT,
                               body='\n'.join(sub_queries))
        for (entity, res) in zip(output['results'], res.get('responses')):
            entity['doc_count'] = res.get('hits', {}).get('total')
    return output
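
msearch takes a newline-delimited body in which every sub-search is a header line followed by a query line; the empty json.dumps({}) above is that header (index and doc_type then come from the msearch call itself). The same assembly in isolation:

import json

def build_msearch_body(queries):
    # Each sub-search contributes an empty header line plus a query line.
    lines = []
    for q in queries:
        lines.append(json.dumps({}))
        lines.append(json.dumps(q))
    return '\n'.join(lines)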
Example #22
def delete_entity_references(entity_id):
    q = {'query': {'term': {'entities.uuid': entity_id}}}

    def updates():
        for res in scan(get_es(),
                        query=q,
                        index=get_es_index(),
                        doc_type=[TYPE_DOCUMENT]):
            entities = []
            for ent in res.get('_source').get('entities'):
                if ent['uuid'] != entity_id:
                    entities.append(ent)
            body = res.get('_source')
            body['entities'] = entities
            yield {
                '_id': res['_id'],
                '_type': res['_type'],
                '_index': res['_index'],
                '_source': body
            }

    try:
        bulk(get_es(),
             updates(),
             stats_only=True,
             chunk_size=100,
             request_timeout=120.0)
    except Exception:
        log.debug("Failed to clear entity refs: %r", entity_id)
Example #23
def scan_entity_mentions(entity):
    """Find mentions of a given entity in all records."""
    shoulds = []
    for term in entity.regex_terms:
        shoulds.append(text_query_string(term))

    query = {
        'query': {
            'bool': {
                'should': shoulds,
                "minimum_should_match": 1
            }
        },
        'sort': [{
            'document_id': 'desc'
        }],
        '_source': ['document_id', 'text']
    }
    for res in scan(get_es(),
                    query=query,
                    index=get_es_index(),
                    doc_type=[TYPE_RECORD]):
        text = res.get('_source').get('text')
        texts = text if isinstance(text, list) else [text]
        for text in texts:
            yield (res.get('_source').get('document_id'), text)
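
text_query_string is defined elsewhere in aleph; a plausible stand-in, assuming it builds a query_string clause over the record text fields (the field names are assumptions):

def text_query_string(term, fields=('text', 'text_latin')):
    # Sketch only: match the term against the record text fields.
    return {
        'query_string': {
            'query': term,
            'fields': list(fields),
            'default_operator': 'AND'
        }
    }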
Example #24
def clear_records(document_id):
    """Delete all records associated with the given document."""
    q = {'query': {'term': {'document_id': document_id}}, '_source': False}

    def gen_deletes():
        for res in scan(get_es(),
                        query=q,
                        index=get_es_index(),
                        doc_type=[TYPE_RECORD]):
            yield {
                '_op_type': 'delete',
                '_index': get_es_index(),
                '_parent': res.get('_parent'),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }

    try:
        bulk(get_es(),
             gen_deletes(),
             stats_only=True,
             chunk_size=2000,
             request_timeout=600.0)
    except Exception:
        log.debug("Failed to clear previous index: %r", document_id)
Example #25
File: __init__.py Project: stefanw/aleph
def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT, body=data,
                   id=document.id)

    clear_records(document)
    bulk(get_es(), generate_records(document), stats_only=True,
         chunk_size=2000, request_timeout=60.0)
Example #26
def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(),
                   doc_type=TYPE_DOCUMENT,
                   body=data,
                   id=document.id)

    clear_records(document)
    bulk_op(generate_records(document))
Example #27
def get_count(entity):
    """Inaccurate, as it does not reflect auth."""
    q = {'term': {'entities.uuid': entity.id}}
    q = {'size': 0, 'query': q}
    result = get_es().search(index=get_es_index(),
                             doc_type=TYPE_DOCUMENT,
                             body=q)
    return result.get('hits', {}).get('total', 0)
Example #28
def get_count(entity):
    """Inaccurate, as it does not reflect auth."""
    q = {'term': {'entities.uuid': entity.id}}
    q = {'size': 0, 'query': q}
    result = get_es().search(index=get_es_index(),
                             doc_type=TYPE_DOCUMENT,
                             body=q)
    return result.get('hits', {}).get('total', 0)
Example #29
File: util.py Project: OpenOil-UG/aleph
def bulk_op(iter):
    try:
        bulk(get_es(),
             iter,
             stats_only=True,
             chunk_size=1000,
             request_timeout=220.0)
    except Exception as ex:
        log.debug("Bulk operation failed: %r", ex)
Example #30
File: __init__.py Project: andkamau/aleph
def deletes():
    for res in scan(get_es(), query=q, index=get_es_index(),
                    doc_type=[TYPE_RECORD]):
        yield {
            '_op_type': 'delete',
            '_index': get_es_index(),
            '_parent': res.get('_parent'),
            '_type': res.get('_type'),
            '_id': res.get('_id')
        }
    for res in scan(get_es(), query=q, index=get_es_index(),
                    doc_type=[TYPE_DOCUMENT]):
        yield {
            '_op_type': 'delete',
            '_index': get_es_index(),
            '_type': res.get('_type'),
            '_id': res.get('_id')
        }
Example #31
def gen_deletes():
    for res in scan(get_es(), query=q, index=get_es_index(),
                    doc_type=[TYPE_RECORD]):
        yield {
            '_op_type': 'delete',
            '_index': get_es_index(),
            '_parent': res.get('_parent'),
            '_type': res.get('_type'),
            '_id': res.get('_id')
        }
Example #32
def update_entity_references(entity_id, max_query=1000):
    """Same as above but runs in bulk for a particular entity."""
    q = db.session.query(Reference.document_id)
    q = q.filter(Reference.entity_id == entity_id)
    q = q.filter(Entity.id == entity_id)
    q = q.filter(Entity.state == Entity.STATE_ACTIVE)
    q = q.filter(collection_entity_table.c.entity_id == Entity.id)
    q = q.add_column(collection_entity_table.c.collection_id)
    references = defaultdict(list)
    for row in q:
        references[str(row.document_id)].append(row.collection_id)

    ids = list(references.keys())
    for i in range(0, len(ids), max_query):
        q = {'query': {'ids': {'values': ids[i:i + max_query]}}}
        bulk_op(document_updates(q, entity_id, references))

    log.info("Clearing ES cache...")
    get_es().indices.clear_cache(index=get_es_index())
Example #33
File: __init__.py Project: 01-/aleph
def gen_deletes():
    for res in scan(get_es(), query=q, index=get_es_index(),
                    doc_type=[TYPE_RECORD]):
        yield {
            '_op_type': 'delete',
            '_index': get_es_index(),
            '_parent': res.get('_source', {}).get('document_id'),
            '_type': res.get('_type'),
            '_id': res.get('_id')
        }
Example #34
File: __init__.py Project: 01-/aleph
def deletes():
    q['_source'] = ['document_id']
    for res in scan(get_es(), query=q, index=get_es_index(),
                    doc_type=[TYPE_RECORD]):
        yield {
            '_op_type': 'delete',
            '_index': get_es_index(),
            '_parent': res.get('_source', {}).get('document_id'),
            '_type': res.get('_type'),
            '_id': res.get('_id')
        }

    q['_source'] = []
    for res in scan(get_es(), query=q, index=get_es_index(),
                    doc_type=[TYPE_DOCUMENT]):
        yield {
            '_op_type': 'delete',
            '_index': get_es_index(),
            '_type': res.get('_type'),
            '_id': res.get('_id')
        }
Example #35
File: __init__.py Project: stefanw/aleph
def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(),
                   doc_type=TYPE_DOCUMENT,
                   body=data,
                   id=document.id)

    clear_records(document)
    bulk(get_es(),
         generate_records(document),
         stats_only=True,
         chunk_size=2000,
         request_timeout=60.0)
Example #36
def similar_entities(entity, args, collections):
    """Merge suggestions API."""
    shoulds = []
    for term in entity.terms:
        shoulds.append({
            'multi_match': {
                "fields": ["name^50", "terms^25", "summary^5"],
                "query": term,
                "fuzziness": 2
            }
        })
        shoulds.append({
            'multi_match': {
                "fields": ["name_latin^10", "terms_latin^5", "summary_latin"],
                "query": latinize_text(term),
                "fuzziness": 2
            }
        })

    q = {
        "bool": {
            "should": shoulds,
            "must_not": {
                "ids": {
                    "values": [entity.id]
                }
            },
            "must": {
                "terms": {
                    "collection_id": collections
                }
            },
            "minimum_should_match": 1
        }
    }
    q = {
        'size': 10,
        'query': authz_filter(q),
        '_source': DEFAULT_FIELDS
    }
    options = []
    result = get_es().search(index=get_es_index(), doc_type=TYPE_ENTITY,
                             body=q)
    for res in result.get('hits', {}).get('hits', []):
        entity = res.get('_source')
        entity['id'] = res.get('_id')
        entity['score'] = res.get('_score')
        entity['api_url'] = url_for('entities_api.view', id=res.get('_id'))
        options.append(entity)
    return {
        'results': options
    }
Example #37
def upgrade_search():
    """Add any missing properties to the index mappings."""
    get_es().indices.put_mapping(index=get_es_index(), body=DOCUMENT_MAPPING,
                                 doc_type=TYPE_DOCUMENT)
    get_es().indices.put_mapping(index=get_es_index(), body=RECORD_MAPPING,
                                 doc_type=TYPE_RECORD)
    get_es().indices.put_mapping(index=get_es_index(), body=ENTITY_MAPPING,
                                 doc_type=TYPE_ENTITY)
Example #38
def execute_basic(doc_type, query):
    """Common part of running a particular query."""
    result = get_es().search(index=get_es_index(), doc_type=doc_type,
                             body=query)
    hits = result.get('hits', {})
    output = {
        'status': 'ok',
        'results': [],
        'offset': query.get('from', 0),
        'limit': query.get('size'),
        'total': hits.get('total'),
        'next': None
    }
    return result, hits, output
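
A hypothetical caller, showing how the (result, hits, output) triple returned by execute_basic is typically consumed; the 'title' field used in the query is an assumption.

def search_documents_by_title(title, size=20):
    query = {
        'from': 0,
        'size': size,
        'query': {'match': {'title': title}}
    }
    result, hits, output = execute_basic(TYPE_DOCUMENT, query)
    for hit in hits.get('hits', []):
        doc = hit.get('_source')
        doc['id'] = hit.get('_id')
        doc['score'] = hit.get('_score')
        output['results'].append(doc)
    return output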
Example #39
File: util.py Project: adamchainz/aleph
def execute_basic(doc_type, query):
    """Common part of running a particular query."""
    result = get_es().search(index=get_es_index(), doc_type=doc_type,
                             body=query)
    hits = result.get('hits', {})
    output = {
        'status': 'ok',
        'results': [],
        'offset': query.get('from', 0),
        'limit': query.get('size'),
        'total': hits.get('total'),
        'next': None
    }
    return result, hits, output
Example #40
def updates():
    for res in scan(get_es(), query=q, index=get_es_index(),
                    doc_type=[TYPE_DOCUMENT]):
        entities = []
        for ent in res.get('_source').get('entities'):
            if ent['uuid'] != entity_id:
                entities.append(ent)
        body = res.get('_source')
        body['entities'] = entities
        yield {
            '_id': res['_id'],
            '_type': res['_type'],
            '_index': res['_index'],
            '_source': body
        }
Example #41
def scan_entity_mentions(entity):
    """Find mentions of a given entity in all records."""
    shoulds = []
    for term in entity.regex_terms:
        shoulds.append(text_query_string(term))

    query = {
        "query": {"bool": {"should": shoulds, "minimum_should_match": 1}},
        "sort": [{"document_id": "desc"}],
        "_source": ["document_id", "text"],
    }
    for res in scan(get_es(), query=query, index=get_es_index(), doc_type=[TYPE_RECORD]):
        text = res.get("_source").get("text")
        texts = text if isinstance(text, list) else [text]
        for text in texts:
            yield (res.get("_source").get("document_id"), text)
Example #42
def similar_entities(entity, args):
    """Merge suggestions API."""
    shoulds = []
    for term in entity.terms:
        shoulds.append({
            'multi_match': {
                "fields": ["name^50", "terms^25", "summary^5"],
                "query": term,
                "fuzziness": 2
            }
        })
        shoulds.append({
            'multi_match': {
                "fields": ["name_latin^10", "terms_latin^5", "summary_latin"],
                "query": latinize_text(term),
                "fuzziness": 2
            }
        })

    q = {
        "bool": {
            "should": shoulds,
            "must_not": {
                "ids": {
                    "values": [entity.id]
                }
            },
            "minimum_should_match": 1
        }
    }
    q = {
        'size': 10,
        'query': authz_collections_filter(q),
        '_source': DEFAULT_FIELDS
    }
    options = []
    result = get_es().search(index=get_es_index(), doc_type=TYPE_ENTITY,
                             body=q)
    for res in result.get('hits', {}).get('hits', []):
        entity = res.get('_source')
        entity['id'] = res.get('_id')
        entity['score'] = res.get('_score')
        entity['api_url'] = url_for('entities_api.view', id=res.get('_id'))
        options.append(entity)
    return {
        'results': options
    }
Example #43
File: admin.py Project: tomjie/aleph
def upgrade_search():
    """Add any missing properties to the index mappings."""
    get_es().indices.put_mapping(index=get_es_index(),
                                 body=DOCUMENT_MAPPING,
                                 doc_type=TYPE_DOCUMENT)
    get_es().indices.put_mapping(index=get_es_index(),
                                 body=RECORD_MAPPING,
                                 doc_type=TYPE_RECORD)
    get_es().indices.put_mapping(index=get_es_index(),
                                 body=ENTITY_MAPPING,
                                 doc_type=TYPE_ENTITY)
Example #44
def updates():
    for res in scan(get_es(),
                    query=q,
                    index=get_es_index(),
                    doc_type=[TYPE_DOCUMENT]):
        entities = []
        for ent in res.get('_source').get('entities'):
            if ent['uuid'] != entity_id:
                entities.append(ent)
        body = res.get('_source')
        body['entities'] = entities
        yield {
            '_id': res['_id'],
            '_type': res['_type'],
            '_index': res['_index'],
            '_source': body
        }
Example #45
File: records.py Project: adamchainz/aleph
def scan_entity_mentions(entity):
    """Find mentions of a given entity in all records."""
    shoulds = []
    for term in entity.regex_terms:
        shoulds.append(text_query_string(term))

    query = {
        'query': {
            'bool': {'should': shoulds, "minimum_should_match": 1}
        },
        'sort': [{'document_id': 'desc'}],
        '_source': ['document_id', 'text']
    }
    for res in scan(get_es(), query=query, index=get_es_index(),
                    doc_type=[TYPE_RECORD]):
        text = res.get('_source').get('text')
        texts = text if isinstance(text, list) else [text]
        for text in texts:
            yield (res.get('_source').get('document_id'), text)
Example #46
File: __init__.py Project: 01-/aleph
def clear_children(document):
    """Delete all records associated with the given document."""
    q = {'query': {'term': {'document_id': document.id}},
         '_source': ['_id', 'document_id']}

    def gen_deletes():
        for res in scan(get_es(), query=q, index=get_es_index(),
                        doc_type=[TYPE_RECORD]):
            yield {
                '_op_type': 'delete',
                '_index': get_es_index(),
                '_parent': res.get('_source', {}).get('document_id'),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }

    try:
        bulk(get_es(), gen_deletes(), stats_only=True, chunk_size=2000,
             request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)
Example #47
def clear_records(document_id):
    """Delete all records associated with the given document."""
    q = {'query': {'term': {'document_id': document_id}},
         '_source': False}

    def gen_deletes():
        for res in scan(get_es(), query=q, index=get_es_index(),
                        doc_type=[TYPE_RECORD]):
            yield {
                '_op_type': 'delete',
                '_index': get_es_index(),
                '_parent': res.get('_parent'),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }

    try:
        bulk(get_es(), gen_deletes(), stats_only=True, chunk_size=2000,
             request_timeout=600.0)
    except (Exception, NotFoundError):
        log.debug("Failed to clear previous index: %r", document_id)
Example #48
File: documents.py Project: stefanw/aleph
def run_sub_queries(output, sub_queries):
    if len(sub_queries):
        res = get_es().msearch(index=get_es_index(), doc_type=TYPE_RECORD,
                               body='\n'.join(sub_queries))
        for doc in output['results']:
            for sq in res.get('responses', []):
                sqhits = sq.get('hits', {})
                for hit in sqhits.get('hits', {}):
                    record = hit.get('_source')
                    if doc['id'] != record.get('document_id'):
                        continue
                    record['score'] = hit.get('_score')
                    highlights = hit.get('highlight', {})
                    if len(highlights.get('text', [])):
                        record['text'] = highlights.get('text')
                    elif len(highlights.get('text_latin', [])):
                        record['text'] = highlights.get('text_latin', [])
                    else:
                        continue
                    record['text'] = [clean_highlight(t) for t in record['text']]
                    doc['records']['results'].append(record)
                    doc['records']['total'] = sqhits.get('total', 0)
Example #49
def document_updates(q, entity_id, references=None):
    scanner = scan(get_es(), query=q, index=get_es_index(),
                   doc_type=[TYPE_DOCUMENT])
    for res in scanner:
        body = res.get('_source')
        entities = []
        if references is not None:
            entities.append({
                'id': entity_id,
                'collection_id': references[res['_id']]
            })
        for ent in res.get('_source').get('entities'):
            if ent['id'] != entity_id:
                entities.append(ent)
        body['entities'] = entities
        yield {
            '_op_type': 'update',
            '_id': res['_id'],
            '_type': res['_type'],
            '_index': res['_index'],
            'doc': body
        }
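
document_updates is consumed by update_entity_references (Example #32), which batches the matching document ids and hands the resulting generator to bulk_op:

    bulk_op(document_updates(q, entity_id, references))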
Example #50
def delete_entity_references(entity_id):
    q = {'query': {'term': {'entities.uuid': entity_id}}}

    def updates():
        for res in scan(get_es(), query=q, index=get_es_index(),
                        doc_type=[TYPE_DOCUMENT]):
            entities = []
            for ent in res.get('_source').get('entities'):
                if ent['uuid'] != entity_id:
                    entities.append(ent)
            body = res.get('_source')
            body['entities'] = entities
            yield {
                '_id': res['_id'],
                '_type': res['_type'],
                '_index': res['_index'],
                '_source': body
            }
    try:
        bulk(get_es(), updates(), stats_only=True, chunk_size=100,
             request_timeout=120.0)
    except Exception:
        log.debug("Failed to clear entity refs: %r", entity_id)
Example #51
File: __init__.py Project: correctiv/aleph
def scan_iter(query):
    """Scan the results of a query. No pagination is applied."""
    return scan(get_es(), query=query, index=get_es_index(),
                doc_type=[TYPE_DOCUMENT])
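
A hypothetical caller, iterating over every matching document without paging; process() is a placeholder for whatever the caller does with each hit.

    for hit in scan_iter({'query': {'match_all': {}}}):
        process(hit.get('_source'))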
Example #52
File: admin.py Project: tomjie/aleph
def delete_index():
    get_es().indices.delete(get_es_index(), ignore=[404])
Example #53
File: admin.py Project: tomjie/aleph
def optimize_search():
    """Run a full index restructure. May take a while."""
    get_es().indices.optimize(index=get_es_index())
Example #54
def delete_entity(entity_id):
    """Delete an entity from the index."""
    get_es().delete(index=get_es_index(),
                    doc_type=TYPE_ENTITY,
                    id=entity_id,
                    ignore=[404])
Example #55
File: util.py Project: tomjie/aleph
def flush_es():
    """Run a refresh to apply all indexing changes."""
    get_es().indices.refresh(index=get_es_index())
Example #56
def delete_entity(entity_id):
    """Delete an entity from the index."""
    get_es().delete(index=get_es_index(), doc_type=TYPE_ENTITY, id=entity_id,
                    ignore=[404])