import json
import logging
from collections import defaultdict

from elasticsearch import NotFoundError
from elasticsearch.helpers import bulk, scan

# Assumed project-internal imports: get_es, get_es_index, the TYPE_* and
# *_MAPPING constants, Document, Collection, Entity, Reference, db,
# latinize_text, expand_json, the authz/query helpers and the process
# module are expected to come from the surrounding application (which
# appears to be Flask-based: MultiDict and url_for are used below).

log = logging.getLogger(__name__)


def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    try:
        log.info("Index document: %r", document)
        data = document.to_index_dict()
        data['entities'] = generate_entities(document)
        data['title_latin'] = latinize_text(data.get('title'))
        data['summary_latin'] = latinize_text(data.get('summary'))
        get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                       body=data, id=document.id)
        clear_children(document)
        if document.type == Document.TYPE_TEXT:
            bulk(get_es(), generate_pages(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
        if document.type == Document.TYPE_TABULAR:
            bulk(get_es(), generate_records(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INDEX, component=__name__,
                          document_id=document.id, meta=document.meta,
                          source_id=document.source_id, exception=ex)
def delete_document(document_id):
    clear_records(document_id)
    try:
        get_es().delete(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                        id=document_id)
    except NotFoundError:
        pass
def init_search(): log.info("Creating ElasticSearch index and uploading mapping...") get_es().indices.create(get_es_index(), body={ 'mappings': { TYPE_DOCUMENT: DOCUMENT_MAPPING, TYPE_RECORD: RECORD_MAPPING } })
def init_search(): log.info("Creating ElasticSearch index and uploading mapping...") get_es().indices.create(get_es_index(), body={ 'mappings': { TYPE_DOCUMENT: DOCUMENT_MAPPING, TYPE_RECORD: RECORD_MAPPING } }) get_es().indices.open(index=get_es_index())
def init_search(): log.info("Creating ElasticSearch index and uploading mapping...") get_es().indices.create(get_es_index(), body={ 'mappings': { TYPE_DOCUMENT: DOCUMENT_MAPPING, TYPE_RECORD: RECORD_MAPPING, TYPE_ENTITY: ENTITY_MAPPING } }, ignore=[400, 404]) get_es().indices.open(index=get_es_index(), ignore=[400, 404])
def index_document(document, index_records=True):
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                   body=data, id=document.id)
    if index_records:
        clear_records(document.id)
        bulk_op(generate_records(document))
def index_entity(entity):
    """Index an entity."""
    data = entity.to_dict()
    data.pop('id', None)
    data['doc_count'] = get_count(entity)
    data['terms'] = entity.terms
    data['terms_latin'] = [latinize_text(t) for t in entity.terms]
    data['name_latin'] = latinize_text(data.get('name'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    data['description_latin'] = latinize_text(data.get('description'))
    data = expand_json(data)
    get_es().index(index=get_es_index(), doc_type=TYPE_ENTITY,
                   id=entity.id, body=data)
def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                   body=data, id=document.id)
    clear_records(document.id)
    bulk_op(generate_records(document))
def init_search(): log.info("Creating ElasticSearch index and uploading mapping...") get_es().indices.create(get_es_index(), body={ 'mappings': { TYPE_DOCUMENT: DOCUMENT_MAPPING, TYPE_RECORD: RECORD_MAPPING, TYPE_ENTITY: ENTITY_MAPPING }, 'settings': { 'number_of_shards': 10, 'number_of_replicas': 1, } }) get_es().indices.open(index=get_es_index())
def suggest_entities(prefix, min_count=0, schemas=None, size=5):
    """Auto-complete API."""
    options = []
    if prefix is not None and len(prefix.strip()):
        q = {'match_phrase_prefix': {'terms': prefix.strip()}}
        if min_count > 0:
            q = add_filter(q, {'range': {'doc_count': {'gte': min_count}}})
        if schemas is not None and len(schemas):
            q = add_filter(q, {'terms': {'$schema': schemas}})
        q = {
            'size': size,
            'sort': [{'doc_count': 'desc'}, '_score'],
            'query': authz_filter(q),
            '_source': ['name', '$schema', 'terms', 'doc_count']
        }
        ref = latinize_text(prefix)
        result = get_es().search(index=get_es_index(),
                                 doc_type=TYPE_ENTITY, body=q)
        for res in result.get('hits', {}).get('hits', []):
            ent = res.get('_source')
            terms = [latinize_text(t) for t in ent.pop('terms', [])]
            ent['match'] = ref in terms
            ent['score'] = res.get('_score')
            ent['id'] = res.get('_id')
            options.append(ent)
    return {'prefix': prefix, 'results': options}
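# A minimal usage sketch for suggest_entities. The prefix and schema values
# below are hypothetical; the fields read back mirror the dict assembled
# above (name, $schema, doc_count, plus the computed match/score/id).
def _example_suggest():
    res = suggest_entities('Acme', min_count=5,
                           schemas=['/entity/company.json#'])
    for option in res['results']:
        print(option['name'], option['doc_count'], option['match'])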
def delete_source(source_id):
    """Delete all documents from a particular source."""
    q = {'query': {'term': {'source_id': source_id}}}

    def deletes():
        q['_source'] = ['document_id']
        for res in scan(get_es(), query=q, index=get_es_index(),
                        doc_type=[TYPE_RECORD]):
            yield {
                '_op_type': 'delete',
                '_index': get_es_index(),
                '_parent': res.get('_source', {}).get('document_id'),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }
        q['_source'] = []
        for res in scan(get_es(), query=q, index=get_es_index(),
                        doc_type=[TYPE_DOCUMENT]):
            yield {
                '_op_type': 'delete',
                '_index': get_es_index(),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }

    try:
        bulk(get_es(), deletes(), stats_only=True, chunk_size=2000,
             request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)
def peek_query(args):
    if not isinstance(args, MultiDict):
        args = MultiDict(args)
    text = args.get('q', '').strip()
    q = text_query(text)
    filters = parse_filters(args)
    for entity in args.getlist('entity'):
        filters.append(('entities.id', entity))
    q = filter_query(q, filters, [])
    q = add_filter(q, {
        'not': {
            'terms': {
                'collection_id': authz.collections(authz.READ)
            }
        }
    })
    q = {
        'query': q,
        'size': 0,
        'aggregations': {
            'collections': {
                'terms': {'field': 'collection_id', 'size': 30}
            }
        },
        '_source': False
    }
    # import json
    # print json.dumps(q, indent=2)
    result = get_es().search(index=get_es_index(), body=q,
                             doc_type=TYPE_DOCUMENT)
    aggs = result.get('aggregations', {}).get('collections', {})
    buckets = aggs.get('buckets', [])
    q = Collection.all_by_ids([b['key'] for b in buckets])
    q = q.filter(Collection.creator_id != None)  # noqa
    objs = {o.id: o for o in q.all()}
    roles = {}
    for bucket in buckets:
        collection = objs.get(bucket.get('key'))
        if collection is None or collection.private:
            continue
        if collection.creator_id in roles:
            roles[collection.creator_id]['total'] += bucket.get('doc_count')
        else:
            roles[collection.creator_id] = {
                'name': collection.creator.name,
                'email': collection.creator.email,
                'total': bucket.get('doc_count')
            }
    roles = sorted(roles.values(), key=lambda r: r['total'], reverse=True)
    roles = [format_total(r) for r in roles]
    total = result.get('hits', {}).get('total')
    return format_total({
        'roles': roles,
        'active': total > 0,
        'total': total
    })
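# Usage sketch: peek_query takes request-style args (a MultiDict or a plain
# dict) and reports, per collection creator, how many matching documents sit
# in collections the current user cannot read. The query string below is a
# hypothetical example.
#
#     summary = peek_query({'q': 'offshore'})
#     summary['active']  # True when unreadable collections contain matches
#     summary['roles']   # creators of those collections, by match count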
def suggest_entities(args):
    """Auto-complete API."""
    text = args.get('prefix')
    min_count = int(args.get('min_count', 0))
    options = []
    if text is not None and len(text.strip()):
        q = {
            'bool': {
                'must': [
                    {'match_phrase_prefix': {'terms': text.strip()}},
                    {'range': {'doc_count': {'gte': min_count}}}
                ]
            }
        }
        q = {
            'size': 5,
            'sort': [{'doc_count': 'desc'}, '_score'],
            'query': authz_collections_filter(q),
            '_source': ['name', '$schema', 'terms', 'doc_count']
        }
        ref = latinize_text(text)
        result = get_es().search(index=get_es_index(),
                                 doc_type=TYPE_ENTITY, body=q)
        for res in result.get('hits', {}).get('hits', []):
            ent = res.get('_source')
            terms = [latinize_text(t) for t in ent.pop('terms', [])]
            ent['match'] = ref in terms
            ent['id'] = res.get('_id')
            options.append(ent)
    return {'text': text, 'results': options}
def run_sub_queries(output, sub_queries):
    if len(sub_queries):
        res = get_es().msearch(index=get_es_index(), doc_type=TYPE_RECORD,
                               body='\n'.join(sub_queries))
        for doc in output['results']:
            for sq in res.get('responses', []):
                sqhits = sq.get('hits', {})
                for hit in sqhits.get('hits', []):
                    record = hit.get('_source')
                    if doc['id'] != record.get('document_id'):
                        continue
                    record['score'] = hit.get('_score')
                    highlights = hit.get('highlight', {})
                    if len(highlights.get('text', [])):
                        record['text'] = highlights.get('text')
                    elif len(highlights.get('text_latin', [])):
                        record['text'] = highlights.get('text_latin', [])
                    else:
                        continue
                    record['text'] = [clean_highlight(t)
                                      for t in record['text']]
                    doc['records']['results'].append(record)
                    doc['records']['total'] = sqhits.get('total', 0)
def delete_source(source_id):
    """Delete all documents from a particular source."""
    q = {'query': {'term': {'source_id': source_id}}, '_source': False}

    def deletes():
        for res in scan(get_es(), query=q, index=get_es_index(),
                        doc_type=[TYPE_RECORD]):
            yield {
                '_op_type': 'delete',
                '_index': get_es_index(),
                '_parent': res.get('_parent'),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }
        for res in scan(get_es(), query=q, index=get_es_index(),
                        doc_type=[TYPE_DOCUMENT]):
            yield {
                '_op_type': 'delete',
                '_index': get_es_index(),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }

    try:
        bulk(get_es(), deletes(), stats_only=True, chunk_size=2000,
             request_timeout=60.0)
    except Exception:
        log.debug("Failed to clear documents: %r", source_id)
def execute_entities_query(args, query, doc_counts=False):
    """Execute the query and return a set of results."""
    result, hits, output = execute_basic(TYPE_ENTITY, query)
    convert_entity_aggregations(result, output, args)
    sub_queries = []
    for doc in hits.get('hits', []):
        entity = doc.get('_source')
        entity['id'] = doc.get('_id')
        entity['score'] = doc.get('_score')
        entity['api_url'] = url_for('entities_api.view', id=doc.get('_id'))
        output['results'].append(entity)
        sq = {'term': {'entities.uuid': entity['id']}}
        sq = authz_sources_filter(sq)
        sq = {'size': 0, 'query': sq}
        sub_queries.append(json.dumps({}))
        sub_queries.append(json.dumps(sq))
    if doc_counts and len(sub_queries):
        res = get_es().msearch(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                               body='\n'.join(sub_queries))
        for (entity, resp) in zip(output['results'], res.get('responses')):
            entity['doc_count'] = resp.get('hits', {}).get('total')
    return output
def delete_entity_references(entity_id):
    q = {'query': {'term': {'entities.uuid': entity_id}}}

    def updates():
        for res in scan(get_es(), query=q, index=get_es_index(),
                        doc_type=[TYPE_DOCUMENT]):
            entities = []
            for ent in res.get('_source').get('entities'):
                if ent['uuid'] != entity_id:
                    entities.append(ent)
            body = res.get('_source')
            body['entities'] = entities
            yield {
                '_id': res['_id'],
                '_type': res['_type'],
                '_index': res['_index'],
                '_source': body
            }

    try:
        bulk(get_es(), updates(), stats_only=True, chunk_size=100,
             request_timeout=120.0)
    except Exception:
        log.debug("Failed to clear entity refs: %r", entity_id)
def scan_entity_mentions(entity):
    """Find mentions of a given entity in all records."""
    shoulds = []
    for term in entity.regex_terms:
        shoulds.append(text_query_string(term))
    query = {
        'query': {
            'bool': {
                'should': shoulds,
                'minimum_should_match': 1
            }
        },
        'sort': [{'document_id': 'desc'}],
        '_source': ['document_id', 'text']
    }
    for res in scan(get_es(), query=query, index=get_es_index(),
                    doc_type=[TYPE_RECORD]):
        text = res.get('_source').get('text')
        texts = text if isinstance(text, list) else [text]
        for text in texts:
            yield (res.get('_source').get('document_id'), text)
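# Usage sketch: scan_entity_mentions yields (document_id, text) tuples, one
# per record text matching any of the entity's regex_terms. The aggregation
# below is illustrative only.
def _count_mentions_by_document(entity):
    counts = {}
    for document_id, text in scan_entity_mentions(entity):
        counts[document_id] = counts.get(document_id, 0) + 1
    return counts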
def clear_records(document_id):
    """Delete all records associated with the given document."""
    q = {'query': {'term': {'document_id': document_id}}, '_source': False}

    def gen_deletes():
        for res in scan(get_es(), query=q, index=get_es_index(),
                        doc_type=[TYPE_RECORD]):
            yield {
                '_op_type': 'delete',
                '_index': get_es_index(),
                '_parent': res.get('_parent'),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }

    try:
        bulk(get_es(), gen_deletes(), stats_only=True, chunk_size=2000,
             request_timeout=600.0)
    except Exception:
        log.debug("Failed to clear previous index: %r", document_id)
def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                   body=data, id=document.id)
    clear_records(document.id)
    bulk(get_es(), generate_records(document), stats_only=True,
         chunk_size=2000, request_timeout=60.0)
def get_count(entity):
    """Inaccurate, as it does not reflect auth."""
    q = {'term': {'entities.uuid': entity.id}}
    q = {'size': 0, 'query': q}
    result = get_es().search(index=get_es_index(),
                             doc_type=TYPE_DOCUMENT, body=q)
    return result.get('hits', {}).get('total', 0)
def bulk_op(iterable):
    try:
        bulk(get_es(), iterable, stats_only=True, chunk_size=1000,
             request_timeout=220.0)
    except Exception as ex:
        log.debug("Bulk operation failed: %r", ex)
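# bulk_op simply wraps elasticsearch.helpers.bulk around any iterable of
# action dicts. A sketch of a hand-built delete action stream, mirroring the
# generators elsewhere in this module; the query argument is a hypothetical
# example.
def _delete_by_query(q):
    def actions():
        for res in scan(get_es(), query=q, index=get_es_index(),
                        doc_type=[TYPE_DOCUMENT]):
            yield {
                '_op_type': 'delete',
                '_index': get_es_index(),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }

    bulk_op(actions())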
def deletes():
    for res in scan(get_es(), query=q, index=get_es_index(),
                    doc_type=[TYPE_RECORD]):
        yield {
            '_op_type': 'delete',
            '_index': get_es_index(),
            '_parent': res.get('_parent'),
            '_type': res.get('_type'),
            '_id': res.get('_id')
        }
    for res in scan(get_es(), query=q, index=get_es_index(),
                    doc_type=[TYPE_DOCUMENT]):
        yield {
            '_op_type': 'delete',
            '_index': get_es_index(),
            '_type': res.get('_type'),
            '_id': res.get('_id')
        }
def gen_deletes():
    for res in scan(get_es(), query=q, index=get_es_index(),
                    doc_type=[TYPE_RECORD]):
        yield {
            '_op_type': 'delete',
            '_index': get_es_index(),
            '_parent': res.get('_parent'),
            '_type': res.get('_type'),
            '_id': res.get('_id')
        }
def update_entity_references(entity_id, max_query=1000):
    """Same as above but runs in bulk for a particular entity."""
    q = db.session.query(Reference.document_id)
    q = q.filter(Reference.entity_id == entity_id)
    q = q.filter(Entity.id == entity_id)
    q = q.filter(Entity.state == Entity.STATE_ACTIVE)
    q = q.filter(collection_entity_table.c.entity_id == Entity.id)
    q = q.add_column(collection_entity_table.c.collection_id)
    references = defaultdict(list)
    for row in q:
        references[str(row.document_id)].append(row.collection_id)
    ids = list(references.keys())
    for i in range(0, len(ids), max_query):
        q = {'query': {'ids': {'values': ids[i:i + max_query]}}}
        bulk_op(document_updates(q, entity_id, references))
    log.info("Clearing ES cache...")
    get_es().indices.clear_cache(index=get_es_index())
def gen_deletes():
    for res in scan(get_es(), query=q, index=get_es_index(),
                    doc_type=[TYPE_RECORD]):
        yield {
            '_op_type': 'delete',
            '_index': get_es_index(),
            '_parent': res.get('_source', {}).get('document_id'),
            '_type': res.get('_type'),
            '_id': res.get('_id')
        }
def deletes():
    q['_source'] = ['document_id']
    for res in scan(get_es(), query=q, index=get_es_index(),
                    doc_type=[TYPE_RECORD]):
        yield {
            '_op_type': 'delete',
            '_index': get_es_index(),
            '_parent': res.get('_source', {}).get('document_id'),
            '_type': res.get('_type'),
            '_id': res.get('_id')
        }
    q['_source'] = []
    for res in scan(get_es(), query=q, index=get_es_index(),
                    doc_type=[TYPE_DOCUMENT]):
        yield {
            '_op_type': 'delete',
            '_index': get_es_index(),
            '_type': res.get('_type'),
            '_id': res.get('_id')
        }
def similar_entities(entity, args, collections):
    """Merge suggestions API."""
    shoulds = []
    for term in entity.terms:
        shoulds.append({
            'multi_match': {
                'fields': ['name^50', 'terms^25', 'summary^5'],
                'query': term,
                'fuzziness': 2
            }
        })
        shoulds.append({
            'multi_match': {
                'fields': ['name_latin^10', 'terms_latin^5', 'summary_latin'],
                'query': latinize_text(term),
                'fuzziness': 2
            }
        })
    q = {
        'bool': {
            'should': shoulds,
            'must_not': {'ids': {'values': [entity.id]}},
            'must': {'terms': {'collection_id': collections}},
            'minimum_should_match': 1
        }
    }
    q = {
        'size': 10,
        'query': authz_filter(q),
        '_source': DEFAULT_FIELDS
    }
    options = []
    result = get_es().search(index=get_es_index(),
                             doc_type=TYPE_ENTITY, body=q)
    for res in result.get('hits', {}).get('hits', []):
        ent = res.get('_source')
        ent['id'] = res.get('_id')
        ent['score'] = res.get('_score')
        ent['api_url'] = url_for('entities_api.view', id=res.get('_id'))
        options.append(ent)
    return {'results': options}
def upgrade_search():
    """Add any missing properties to the index mappings."""
    get_es().indices.put_mapping(index=get_es_index(),
                                 body=DOCUMENT_MAPPING,
                                 doc_type=TYPE_DOCUMENT)
    get_es().indices.put_mapping(index=get_es_index(),
                                 body=RECORD_MAPPING,
                                 doc_type=TYPE_RECORD)
    get_es().indices.put_mapping(index=get_es_index(),
                                 body=ENTITY_MAPPING,
                                 doc_type=TYPE_ENTITY)
def execute_basic(doc_type, query):
    """Common part of running a particular query."""
    result = get_es().search(index=get_es_index(), doc_type=doc_type,
                             body=query)
    hits = result.get('hits', {})
    output = {
        'status': 'ok',
        'results': [],
        'offset': query.get('from', 0),
        'limit': query.get('size'),
        'total': hits.get('total'),
        'next': None
    }
    return result, hits, output
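# Usage sketch: execute_basic pairs the raw ES response with a result
# envelope whose offset/limit echo the query's from/size. The query values
# below are hypothetical; callers append formatted hits to
# output['results'].
def _first_page_of_documents():
    query = {'query': {'match_all': {}}, 'from': 0, 'size': 30}
    result, hits, output = execute_basic(TYPE_DOCUMENT, query)
    # output is {'status': 'ok', 'offset': 0, 'limit': 30,
    # 'total': <hit count>, 'results': [], 'next': None} at this point.
    return output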
def updates():
    for res in scan(get_es(), query=q, index=get_es_index(),
                    doc_type=[TYPE_DOCUMENT]):
        entities = []
        for ent in res.get('_source').get('entities'):
            if ent['uuid'] != entity_id:
                entities.append(ent)
        body = res.get('_source')
        body['entities'] = entities
        yield {
            '_id': res['_id'],
            '_type': res['_type'],
            '_index': res['_index'],
            '_source': body
        }
def similar_entities(entity, args):
    """Merge suggestions API."""
    shoulds = []
    for term in entity.terms:
        shoulds.append({
            'multi_match': {
                'fields': ['name^50', 'terms^25', 'summary^5'],
                'query': term,
                'fuzziness': 2
            }
        })
        shoulds.append({
            'multi_match': {
                'fields': ['name_latin^10', 'terms_latin^5', 'summary_latin'],
                'query': latinize_text(term),
                'fuzziness': 2
            }
        })
    q = {
        'bool': {
            'should': shoulds,
            'must_not': {'ids': {'values': [entity.id]}},
            'minimum_should_match': 1
        }
    }
    q = {
        'size': 10,
        'query': authz_collections_filter(q),
        '_source': DEFAULT_FIELDS
    }
    options = []
    result = get_es().search(index=get_es_index(),
                             doc_type=TYPE_ENTITY, body=q)
    for res in result.get('hits', {}).get('hits', []):
        ent = res.get('_source')
        ent['id'] = res.get('_id')
        ent['score'] = res.get('_score')
        ent['api_url'] = url_for('entities_api.view', id=res.get('_id'))
        options.append(ent)
    return {'results': options}
def clear_children(document):
    """Delete all records associated with the given document."""
    q = {'query': {'term': {'document_id': document.id}},
         '_source': ['_id', 'document_id']}

    def gen_deletes():
        for res in scan(get_es(), query=q, index=get_es_index(),
                        doc_type=[TYPE_RECORD]):
            yield {
                '_op_type': 'delete',
                '_index': get_es_index(),
                '_parent': res.get('_source', {}).get('document_id'),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }

    try:
        bulk(get_es(), gen_deletes(), stats_only=True, chunk_size=2000,
             request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)
def document_updates(q, entity_id, references=None):
    scanner = scan(get_es(), query=q, index=get_es_index(),
                   doc_type=[TYPE_DOCUMENT])
    for res in scanner:
        body = res.get('_source')
        entities = []
        if references is not None:
            entities.append({
                'id': entity_id,
                'collection_id': references[res['_id']]
            })
        for ent in res.get('_source').get('entities'):
            if ent['id'] != entity_id:
                entities.append(ent)
        body['entities'] = entities
        yield {
            '_op_type': 'update',
            '_id': res['_id'],
            '_type': res['_type'],
            '_index': res['_index'],
            'doc': body
        }
def scan_iter(query):
    """Scan the results of a query. No pagination is applied."""
    return scan(get_es(), query=query, index=get_es_index(),
                doc_type=[TYPE_DOCUMENT])
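# Usage sketch: scan_iter streams every matching document without
# pagination, which suits full exports. The match_all query is illustrative.
def _iter_all_document_ids():
    for res in scan_iter({'query': {'match_all': {}}}):
        yield res.get('_id')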
def delete_index():
    get_es().indices.delete(get_es_index(), ignore=[404])
def optimize_search():
    """Run a full index restructure. May take a while."""
    get_es().indices.optimize(index=get_es_index())
def delete_entity(entity_id):
    """Delete an entity from the index."""
    get_es().delete(index=get_es_index(), doc_type=TYPE_ENTITY,
                    id=entity_id, ignore=[404])
def flush_es():
    """Run a refresh to apply all indexing changes."""
    get_es().indices.refresh(index=get_es_index())
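# A sketch of how the administrative helpers above might be combined to
# rebuild the index from scratch; the ordering is an assumption, not a
# documented procedure.
def _reset_index():
    delete_index()    # drop the index, ignoring a 404 if it is absent
    init_search()     # recreate it and upload the type mappings
    upgrade_search()  # idempotently apply any mapping additions
    flush_es()        # refresh so subsequent searches see the changes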