def index_items(documents, object_type, **kwargs):
    """
    Index items from an iterable of Elasticsearch documents

    Args:
        documents (iterable of dict): An iterable of Elasticsearch documents to index
        object_type (str): the ES object type
    """
    conn = get_conn()
    # bulk will also break an iterable into chunks. However we should do this here so that
    # we can use the same documents when indexing to multiple aliases.
    for chunk in chunks(documents, chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE):
        for alias in get_active_aliases(conn, [object_type]):
            _, errors = bulk(
                conn,
                chunk,
                index=alias,
                doc_type=GLOBAL_DOC_TYPE,
                chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
                **kwargs,
            )
            if len(errors) > 0:
                raise ReindexException(f"Error during bulk {object_type} insert: {errors}")
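# The `chunks` helper used above is what lets the same materialized chunk be indexed
# into multiple aliases. A minimal sketch of such a helper, assuming it yields lists
# (the project's real implementation may differ):
from itertools import islice


def chunks(iterable, *, chunk_size):
    """Yield successive lists of at most `chunk_size` items from `iterable` (sketch)"""
    iterator = iter(iterable)
    chunk = list(islice(iterator, chunk_size))
    while chunk:
        yield chunk
        chunk = list(islice(iterator, chunk_size))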
def create_backing_index(object_type):
    """
    Start the reindexing process by creating a new backing index and pointing the
    reindex alias toward it

    Args:
        object_type (str): The object type for the index (post, comment, etc)

    Returns:
        str: The new backing index
    """
    conn = get_conn()
    # Create new backing index for reindex
    new_backing_index = make_backing_index_name(object_type)
    # Clear away temp alias so we can reuse it, and create mappings
    clear_and_create_index(index_name=new_backing_index, object_type=object_type)
    temp_alias = get_reindexing_alias_name(object_type)
    if conn.indices.exists_alias(name=temp_alias):
        # Remove the alias from any indices it points to; the indices themselves are kept
        indices = conn.indices.get_alias(temp_alias).keys()
        for index in indices:
            conn.indices.delete_alias(index=index, name=temp_alias)
    # Point temp_alias toward new backing index
    conn.indices.put_alias(index=new_backing_index, name=temp_alias)
    return new_backing_index
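# `make_backing_index_name` is assumed to produce a unique name per call so each
# reindex gets a fresh backing index. A hypothetical sketch; the prefix and naming
# scheme are guesses (`settings.ELASTICSEARCH_INDEX` is an assumed setting):
from uuid import uuid4


def make_backing_index_name(object_type):
    """Return a unique backing-index name for an object type (sketch)"""
    return "{prefix}_{object_type}_{suffix}".format(
        prefix=settings.ELASTICSEARCH_INDEX,
        object_type=object_type,
        suffix=uuid4().hex,
    )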
def _index_chunk(chunk, *, index):
    """
    Add/update a list of records in Elasticsearch

    Args:
        chunk (list): List of serialized items to index
        index (str): An Elasticsearch index

    Returns:
        int: Number of items inserted into Elasticsearch
    """
    conn = get_conn(verify_indices=[index])
    insert_count, errors = bulk(
        conn,
        chunk,
        index=index,
        doc_type=GLOBAL_DOC_TYPE,
    )
    if len(errors) > 0:
        raise ReindexException("Error during bulk insert: {errors}".format(errors=errors))
    refresh_index(index)
    return insert_count
def _search_percolate_queries(program_enrollment):
    """
    Find all PercolateQuery ids whose queries match a user document

    Args:
        program_enrollment (ProgramEnrollment): A ProgramEnrollment

    Returns:
        list of int: A list of PercolateQuery ids
    """
    conn = get_conn()
    percolate_index = get_default_alias(PERCOLATE_INDEX_TYPE)
    doc = serialize_program_enrolled_user(program_enrollment)
    if not doc:
        return []
    # We don't need this to search for percolator queries and
    # it causes a dynamic mapping failure so we need to remove it
    del doc['_id']
    body = {"query": {"percolate": {"field": "query", "document": doc}}}
    result = conn.search(percolate_index, GLOBAL_DOC_TYPE, body=body)
    failures = result.get('_shards', {}).get('failures', [])
    if len(failures) > 0:
        raise PercolateException("Failed to percolate: {}".format(failures))
    return [int(row['_id']) for row in result['hits']['hits']]
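# A hedged usage sketch for the percolate lookup: map the matched ids back to
# PercolateQuery rows and act on each. The caller shown here is illustrative only:
def handle_matching_queries(program_enrollment):
    """Hypothetical caller that reacts to queries matching an enrollment (sketch)"""
    query_ids = _search_percolate_queries(program_enrollment)
    for percolate_query in PercolateQuery.objects.filter(id__in=query_ids):
        # e.g. queue a notification for the query's owner
        log.info("Enrollment %s matched query %s", program_enrollment, percolate_query)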
def document_needs_updating(enrollment):
    """
    Get the document from elasticsearch and see if it matches what's in the database

    Args:
        enrollment (ProgramEnrollment): A program enrollment

    Returns:
        bool: True if the document needs to be updated via reindex
    """
    index = get_default_alias(PRIVATE_ENROLLMENT_INDEX_TYPE)
    conn = get_conn()
    try:
        document = conn.get(index=index, doc_type=GLOBAL_DOC_TYPE, id=enrollment.id)
    except NotFoundError:
        return True
    serialized_enrollment = serialize_program_enrolled_user(enrollment)
    del serialized_enrollment['_id']
    source = document['_source']
    if serialized_enrollment != source:
        # Convert OrderedDict to dict
        reserialized_enrollment = json.loads(json.dumps(serialized_enrollment))
        diff = make_patch(source, reserialized_enrollment).patch
        serialized_diff = json.dumps(diff, indent=" ")
        log.info("Difference found for enrollment %s: %s", enrollment, serialized_diff)
        return True
    return False
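# Illustrative use of document_needs_updating: scan all enrollments and collect the
# ones whose indexed documents are stale (a sketch, e.g. for a management command):
def find_stale_enrollment_ids():
    """Return ids of enrollments whose ES documents are out of date (sketch)"""
    return [
        enrollment.id
        for enrollment in ProgramEnrollment.objects.iterator()
        if document_needs_updating(enrollment)
    ]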
def document_needs_updating(enrollment):
    """
    Get the document from elasticsearch and see if it matches what's in the database

    Args:
        enrollment (ProgramEnrollment): A program enrollment

    Returns:
        bool: True if the document needs to be updated via reindex
    """
    index = get_default_alias(PRIVATE_ENROLLMENT_INDEX_TYPE)
    conn = get_conn()
    try:
        document = conn.get(index=index, id=enrollment.id)
    except NotFoundError:
        return True
    serialized_enrollment = serialize_program_enrolled_user(enrollment)
    del serialized_enrollment['_id']
    source = document['_source']
    if serialized_enrollment != source:
        # Convert OrderedDict to dict
        reserialized_enrollment = json.loads(json.dumps(serialized_enrollment))
        diff = make_patch(source, reserialized_enrollment).patch
        serialized_diff = json.dumps(diff, indent=" ")
        log.info("Difference found for enrollment %s: %s", enrollment, serialized_diff)
        return True
    return False
def clear_and_create_index(*, index_name=None, skip_mapping=False, object_type=None):
    """
    Wipe and recreate index and mapping. No indexing is done.

    Args:
        index_name (str): The name of the index to clear
        skip_mapping (bool): If true, don't set any mapping
        object_type (str): The type of document (post, comment)
    """
    if object_type not in VALID_OBJECT_TYPES:
        raise ValueError(
            "A valid object type must be specified when clearing and creating an index"
        )
    conn = get_conn()
    if conn.indices.exists(index_name):
        conn.indices.delete(index_name)
    index_create_data = {
        "settings": {
            "analysis": {
                "analyzer": {
                    "folding": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": [
                            "lowercase",
                            "asciifolding",  # remove accents if we use folding analyzer
                        ],
                    },
                    "trigram": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["lowercase", "shingle"],
                    },
                },
                "filter": {
                    "shingle": {
                        "type": "shingle",
                        "min_shingle_size": 2,
                        "max_shingle_size": 3,
                    }
                },
            }
        }
    }
    if not skip_mapping:
        index_create_data["mappings"] = {
            GLOBAL_DOC_TYPE: {
                "properties": MAPPING[object_type]
            }
        }
    # from https://www.elastic.co/guide/en/elasticsearch/guide/current/asciifolding-token-filter.html
    conn.indices.create(index_name, body=index_create_data)
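# Example call for the object-type variant above; a minimal sketch assuming "post"
# is in VALID_OBJECT_TYPES and the index name itself is arbitrary:
clear_and_create_index(
    index_name="post_backing_index_example",
    object_type="post",
)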
def execute_search(search_obj):
    """
    Executes a search against ES after checking the connection

    Args:
        search_obj (Search): elasticsearch_dsl Search object

    Returns:
        elasticsearch_dsl.result.Response: ES response
    """
    # make sure there is a live connection
    if search_obj._index is None:  # pylint: disable=protected-access
        # If you're seeing this it means you're creating Search() without using
        # create_search_obj which sets important fields like the index.
        raise ImproperlyConfigured("search object is missing an index")
    get_conn()
    return search_obj.execute()
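# `create_search_obj`, referenced in the comment above, is what binds the index to the
# Search object. A minimal sketch assuming elasticsearch_dsl; the real factory likely
# also applies per-user filters and sorting:
from elasticsearch_dsl import Search


def create_search_obj():
    """Return a Search object bound to the default alias (sketch)"""
    return Search(index=get_default_alias(PRIVATE_ENROLLMENT_INDEX_TYPE))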
def delete_indices():
    """
    Drop all the indices. Used in testing.
    """
    conn = get_conn(verify=False)
    for index_type in ALL_INDEX_TYPES:
        aliases = get_aliases(index_type)
        for alias in aliases:
            if conn.indices.exists(alias):
                conn.indices.delete(alias)
def delete_indices():
    """
    Drop all the indices. Used in testing.
    """
    conn = get_conn(verify=False)
    for index_type in ALL_INDEX_TYPES:
        aliases = get_aliases(index_type)
        for alias in aliases:
            if conn.indices.exists(alias):
                conn.indices.delete_alias(index=INDEX_WILDCARD, name=alias)
def scan_search(search_obj):
    """
    Executes a scan search after checking the connection and returns a generator
    that will iterate over all the documents matching the query.

    Args:
        search_obj (Search): elasticsearch_dsl Search object

    Returns:
        generator of dict: A generator that will iterate over all the documents
            matching the query
    """
    # make sure there is a live connection
    if search_obj._index is None:  # pylint: disable=protected-access
        # If you're seeing this it means you're creating Search() without using
        # create_search_obj which sets important fields like the index.
        raise ImproperlyConfigured("search object is missing an index")
    get_conn()
    return search_obj.scan()
def switch_indices(backing_index, object_type):
    """
    Switch the default index to point to the backing index, and delete the reindex alias

    Args:
        backing_index (str): The backing index of the reindex alias
        object_type (str): The object type for the index (post, comment, etc)
    """
    conn = get_conn()
    actions = []
    old_backing_indexes = []
    default_alias = get_default_alias_name(object_type)
    global_alias = get_default_alias_name(ALIAS_ALL_INDICES)
    if conn.indices.exists_alias(name=default_alias):
        # Should only be one backing index in normal circumstances
        old_backing_indexes = list(conn.indices.get_alias(name=default_alias).keys())
        for index in old_backing_indexes:
            actions.extend([
                {"remove": {"index": index, "alias": default_alias}},
                {"remove": {"index": index, "alias": global_alias}},
            ])
    actions.extend([
        {"add": {"index": backing_index, "alias": default_alias}},
        {"add": {"index": backing_index, "alias": global_alias}},
    ])
    conn.indices.update_aliases({"actions": actions})
    refresh_index(backing_index)
    for index in old_backing_indexes:
        conn.indices.delete(index)
    # Finally, remove the link to the reindexing alias
    conn.indices.delete_alias(name=get_reindexing_alias_name(object_type), index=backing_index)
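# Taken together, create_backing_index, index_items, and switch_indices implement a
# zero-downtime reindex for one object type. A hedged sketch of the flow; `documents`
# and the orchestration shown here are illustrative, not the project's actual task wiring:
def reindex_posts(documents):
    """Hypothetical reindex orchestration for the "post" type (sketch)"""
    backing_index = create_backing_index("post")
    # While the reindexing alias exists, get_active_aliases includes it, so writes
    # reach both the old default index and the new backing index
    index_items(documents, "post")
    switch_indices(backing_index, "post")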
def _delete_item(document_id, *, index):
    """
    Helper function to delete a document

    Args:
        document_id (int): A document id
        index (str): An Elasticsearch index
    """
    conn = get_conn(verify_indices=[index])
    try:
        conn.delete(index=index, doc_type=GLOBAL_DOC_TYPE, id=document_id)
    except NotFoundError:
        # Item is already gone
        pass
def create_document(doc_id, data):
    """
    Makes a request to ES to create a new document

    Args:
        doc_id (str): The ES document id
        data (dict): Full ES document data
    """
    conn = get_conn()
    for alias in get_active_aliases(conn, [data["object_type"]]):
        conn.create(index=alias, doc_type=GLOBAL_DOC_TYPE, body=data, id=doc_id)
def clear_and_create_index(index_name, *, index_type, skip_mapping=False):
    """
    Wipe and recreate index and mapping. No indexing is done.

    Args:
        index_name (str): The name of the index
        index_type (str): The index type, used to pick the mapping
        skip_mapping (bool): If true, don't set any mapping
    """
    if index_type == PERCOLATE_INDEX_TYPE:
        mapping = PERCOLATE_MAPPING
    elif index_type == PRIVATE_ENROLLMENT_INDEX_TYPE:
        mapping = PRIVATE_ENROLLMENT_MAPPING
    elif index_type == PUBLIC_ENROLLMENT_INDEX_TYPE:
        mapping = PUBLIC_ENROLLMENT_MAPPING
    else:
        raise Exception("Unknown index type")
    conn = get_conn(verify=False)
    if conn.indices.exists(index_name):
        conn.indices.delete(index_name)
    # from https://www.elastic.co/guide/en/elasticsearch/guide/current/asciifolding-token-filter.html
    conn.indices.create(index_name, body={
        'settings': {
            'index': {
                'number_of_shards': settings.ELASTICSEARCH_SHARD_COUNT,
            },
            'analysis': {
                'analyzer': {
                    'folding': {
                        'type': 'custom',
                        'tokenizer': 'standard',
                        'filter': [
                            'lowercase',
                            'asciifolding',  # remove accents if we use folding analyzer
                        ]
                    }
                }
            }
        },
        'mappings': {
            **({} if skip_mapping else mapping),
        }
    })
def update_field_values_by_query(query, field_dict, object_types=None):
    """
    Makes a request to ES to use the update_by_query API to update one or more field
    values for all documents that match the given query.

    Args:
        query (dict): A dict representing an ES query
        field_dict (dict): dictionary of fields with values to update
        object_types (list of str): The object types to query (post, comment, etc)
    """
    sources = []
    params = {}
    for field_name, field_value in field_dict.items():
        new_param = "new_value_{}".format(field_name)
        sources.append("ctx._source['{}'] = params.{}".format(field_name, new_param))
        params.update({new_param: field_value})
    if not object_types:
        object_types = VALID_OBJECT_TYPES
    conn = get_conn()
    for alias in get_active_aliases(conn, object_types):
        es_response = conn.update_by_query(  # pylint: disable=unexpected-keyword-arg
            index=alias,
            doc_type=GLOBAL_DOC_TYPE,
            conflicts=UPDATE_CONFLICT_SETTING,
            body={
                "script": {
                    "source": ";".join(sources),
                    "lang": SCRIPTING_LANG,
                    "params": params,
                },
                **query,
            },
        )
        # Our policy for document update-related version conflicts right now is to log them
        # and allow the app to continue as normal.
        num_version_conflicts = es_response.get("version_conflicts", 0)
        if num_version_conflicts > 0:
            log.error(
                "Update By Query API request resulted in %s version conflict(s) (alias: %s, query: %s)",
                num_version_conflicts,
                alias,
                query,
            )
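# Example: flag every comment by one author as removed. A sketch; the field name and
# query shape are illustrative. The field_dict below compiles to the painless script
# "ctx._source['removed'] = params.new_value_removed":
update_field_values_by_query(
    query={"query": {"term": {"author_id": "example_user"}}},
    field_dict={"removed": True},
    object_types=["comment"],
)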
def delete_document(doc_id, object_type, **kwargs):
    """
    Makes a request to ES to delete a document

    Args:
        doc_id (str): The ES document id
        object_type (str): The object type
        kwargs (dict): optional parameters for the request
    """
    conn = get_conn()
    for alias in get_active_aliases(conn, [object_type]):
        try:
            conn.delete(index=alias, doc_type=GLOBAL_DOC_TYPE, id=doc_id, params=kwargs)
        except NotFoundError:
            log.debug("Tried to delete an ES document that didn't exist, doc_id: '%s'", doc_id)
def clear_and_create_index(index_name, *, index_type, skip_mapping=False):
    """
    Wipe and recreate index and mapping. No indexing is done.

    Args:
        index_name (str): The name of the index
        index_type (str): The index type, used to pick the mapping
        skip_mapping (bool): If true, don't set any mapping
    """
    if index_type == PERCOLATE_INDEX_TYPE:
        mapping = PERCOLATE_MAPPING
    elif index_type == PRIVATE_ENROLLMENT_INDEX_TYPE:
        mapping = PRIVATE_ENROLLMENT_MAPPING
    elif index_type == PUBLIC_ENROLLMENT_INDEX_TYPE:
        mapping = PUBLIC_ENROLLMENT_MAPPING
    else:
        raise Exception("Unknown index type")
    conn = get_conn(verify=False)
    if conn.indices.exists(index_name):
        conn.indices.delete(index_name)
    # from https://www.elastic.co/guide/en/elasticsearch/guide/current/asciifolding-token-filter.html
    conn.indices.create(index_name, body={
        'settings': {
            'analysis': {
                'analyzer': {
                    'folding': {
                        'type': 'custom',
                        'tokenizer': 'standard',
                        'filter': [
                            'lowercase',
                            'asciifolding',  # remove accents if we use folding analyzer
                        ]
                    }
                }
            }
        },
        'mappings': {
            **({} if skip_mapping else mapping),
        }
    })
def _search_percolate_queries(program_enrollment):
    """
    Find all PercolateQuery ids whose queries match a user document

    Args:
        program_enrollment (ProgramEnrollment): A ProgramEnrollment

    Returns:
        list of int: A list of PercolateQuery ids
    """
    conn = get_conn()
    percolate_index = get_default_alias(PERCOLATE_INDEX_TYPE)
    doc = serialize_program_enrolled_user(program_enrollment)
    if not doc:
        return []
    # We don't need this to search for percolator queries and
    # it causes a dynamic mapping failure so we need to remove it
    del doc['_id']
    result = conn.percolate(percolate_index, GLOBAL_DOC_TYPE, body={"doc": doc})
    failures = result.get('_shards', {}).get('failures', [])
    if len(failures) > 0:
        raise PercolateException("Failed to percolate: {}".format(failures))
    return [int(row['_id']) for row in result['matches']]
def _update_document_by_id(doc_id, body, object_type, *, retry_on_conflict=0, **kwargs):
    """
    Makes a request to ES to update an existing document

    Args:
        doc_id (str): The ES document id
        body (dict): ES update operation body
        object_type (str): The object type to update (post, comment, etc)
        retry_on_conflict (int): Number of times to retry if there's a conflict (default=0)
        kwargs (dict): Optional kwargs to be passed to ElasticSearch
    """
    conn = get_conn()
    for alias in get_active_aliases(conn, [object_type]):
        try:
            conn.update(
                index=alias,
                doc_type=GLOBAL_DOC_TYPE,
                body=body,
                id=doc_id,
                params={"retry_on_conflict": retry_on_conflict, **kwargs},
            )
        # Our policy for document update-related version conflicts right now is to log them
        # and allow the app to continue as normal.
        except ConflictError:
            log.error(
                "Update API request resulted in a version conflict (alias: %s, doc id: %s)",
                alias,
                doc_id,
            )
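# Example partial update through the helper above: the Update API expects the changed
# fields under a "doc" key. The doc id and values here are illustrative:
_update_document_by_id(
    "comment_abc123",
    {"doc": {"score": 42, "removed": False}},
    "comment",
    retry_on_conflict=3,
)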
def refresh_index(index):
    """
    Refresh the Elasticsearch index
    """
    get_conn(verify_indices=[index]).indices.refresh(index=index)
def recreate_index():
    """
    Wipe and recreate index and mapping, and index all items.
    """
    conn = get_conn(verify=False)
    # Create new backing index for reindex
    new_backing_public_index = make_backing_index_name()
    new_backing_private_index = make_backing_index_name()
    new_backing_percolate_index = make_backing_index_name()
    backing_index_tuples = [
        (new_backing_public_index, PUBLIC_ENROLLMENT_INDEX_TYPE),
        (new_backing_private_index, PRIVATE_ENROLLMENT_INDEX_TYPE),
        (new_backing_percolate_index, PERCOLATE_INDEX_TYPE),
    ]
    for backing_index, index_type in backing_index_tuples:
        # Clear away temp alias so we can reuse it, and create mappings
        clear_and_create_index(backing_index, index_type=index_type)
        temp_alias = make_alias_name(index_type, is_reindexing=True)
        if conn.indices.exists_alias(name=temp_alias):
            # Deletes both alias and backing indexes
            conn.indices.delete(temp_alias)
        # Point temp_alias toward new backing index
        conn.indices.put_alias(index=backing_index, name=temp_alias)

    # Do the indexing on the temp index
    start = now_in_utc()
    try:
        enrollment_count = ProgramEnrollment.objects.count()
        log.info("Indexing %d program enrollments...", enrollment_count)
        index_program_enrolled_users(
            ProgramEnrollment.objects.iterator(),
            public_indices=[new_backing_public_index],
            private_indices=[new_backing_private_index],
        )
        log.info(
            "Indexing %d percolator queries...",
            PercolateQuery.objects.exclude(is_deleted=True).count(),
        )
        _index_chunks(
            _get_percolate_documents(PercolateQuery.objects.exclude(is_deleted=True).iterator()),
            index=new_backing_percolate_index,
        )

        # Point default alias to new index and delete the old backing index, if any
        log.info("Done with temporary index. Pointing default aliases to newly created backing indexes...")
        for new_backing_index, index_type in backing_index_tuples:
            actions = []
            old_backing_indexes = []
            default_alias = make_alias_name(index_type, is_reindexing=False)
            if conn.indices.exists_alias(name=default_alias):
                # Should only be one backing index in normal circumstances
                old_backing_indexes = list(conn.indices.get_alias(name=default_alias).keys())
                for index in old_backing_indexes:
                    actions.append({
                        "remove": {
                            "index": index,
                            "alias": default_alias,
                        }
                    })
            actions.append({
                "add": {
                    "index": new_backing_index,
                    "alias": default_alias,
                },
            })
            conn.indices.update_aliases({"actions": actions})
            refresh_index(new_backing_index)
            for index in old_backing_indexes:
                conn.indices.delete(index)
    finally:
        for new_backing_index, index_type in backing_index_tuples:
            temp_alias = make_alias_name(index_type, is_reindexing=True)
            conn.indices.delete_alias(name=temp_alias, index=new_backing_index)
    end = now_in_utc()
    log.info("recreate_index took %d seconds", (end - start).total_seconds())
def recreate_index():
    """
    Wipe and recreate index and mapping, and index all items.
    """
    conn = get_conn(verify=False)
    # Create new backing index for reindex
    new_backing_public_index = make_backing_index_name()
    new_backing_private_index = make_backing_index_name()
    new_backing_percolate_index = make_backing_index_name()
    backing_index_tuples = [
        (new_backing_public_index, PUBLIC_ENROLLMENT_INDEX_TYPE),
        (new_backing_private_index, PRIVATE_ENROLLMENT_INDEX_TYPE),
        (new_backing_percolate_index, PERCOLATE_INDEX_TYPE),
    ]
    for backing_index, index_type in backing_index_tuples:
        # Clear away temp alias so we can reuse it, and create mappings
        clear_and_create_index(backing_index, index_type=index_type)
        temp_alias = make_alias_name(index_type, is_reindexing=True)
        if conn.indices.exists_alias(name=temp_alias):
            # Remove the temp alias from all indices; the backing indices themselves are kept
            conn.indices.delete_alias(index=INDEX_WILDCARD, name=temp_alias)
        # Point temp_alias toward new backing index
        conn.indices.put_alias(index=backing_index, name=temp_alias)

    # Do the indexing on the temp index
    start = now_in_utc()
    try:
        enrollment_count = ProgramEnrollment.objects.count()
        log.info("Indexing %d program enrollments...", enrollment_count)
        index_program_enrolled_users(
            ProgramEnrollment.objects.iterator(),
            public_indices=[new_backing_public_index],
            private_indices=[new_backing_private_index],
        )
        log.info(
            "Indexing %d percolator queries...",
            PercolateQuery.objects.exclude(is_deleted=True).count(),
        )
        _index_chunks(
            _get_percolate_documents(PercolateQuery.objects.exclude(is_deleted=True).iterator()),
            index=new_backing_percolate_index,
        )

        # Point default alias to new index and delete the old backing index, if any
        log.info("Done with temporary index. Pointing default aliases to newly created backing indexes...")
        for new_backing_index, index_type in backing_index_tuples:
            actions = []
            old_backing_indexes = []
            default_alias = make_alias_name(index_type, is_reindexing=False)
            if conn.indices.exists_alias(name=default_alias):
                # Should only be one backing index in normal circumstances
                old_backing_indexes = list(conn.indices.get_alias(name=default_alias).keys())
                for index in old_backing_indexes:
                    actions.append({
                        "remove": {
                            "index": index,
                            "alias": default_alias,
                        }
                    })
            actions.append({
                "add": {
                    "index": new_backing_index,
                    "alias": default_alias,
                },
            })
            conn.indices.update_aliases({"actions": actions})
            refresh_index(new_backing_index)
            for index in old_backing_indexes:
                conn.indices.delete(index)
    finally:
        for new_backing_index, index_type in backing_index_tuples:
            temp_alias = make_alias_name(index_type, is_reindexing=True)
            conn.indices.delete_alias(name=temp_alias, index=new_backing_index)
    end = now_in_utc()
    log.info("recreate_index took %d seconds", (end - start).total_seconds())
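# `_get_percolate_documents`, used during recreate_index, is assumed to turn
# PercolateQuery rows into ES documents whose `_id` is the database id, so percolate
# hits can be mapped back to rows. A hypothetical sketch assuming PercolateQuery.query
# already holds the full query body; the real serializer may differ:
def _get_percolate_documents(percolate_queries):
    """Yield ES percolate documents for PercolateQuery rows (sketch)"""
    return (
        {"_id": percolate_query.id, **percolate_query.query}
        for percolate_query in percolate_queries
    )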