def test_chunked(self):
    # chunking nothing yields nothing.
    eq_(list(chunked([], 1)), [])

    # chunking list where len(list) < n
    eq_(list(chunked([1], 10)), [(1,)])

    # chunking a list where len(list) == n
    eq_(list(chunked([1, 2], 2)), [(1, 2)])

    # chunking list where len(list) > n
    eq_(list(chunked([1, 2, 3, 4, 5], 2)),
        [(1, 2), (3, 4), (5,)])
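# The chunked() helper under test isn't defined in this section. A
# minimal sketch consistent with the assertions above -- tuples of up
# to n items, with a short final chunk -- assuming nothing beyond the
# stdlib; the real helper may differ in details:
from itertools import islice


def chunked(iterable, n):
    """Yield tuples of up to n items from iterable."""
    iterable = iter(iterable)
    while True:
        chunk = tuple(islice(iterable, n))
        if not chunk:
            break
        yield chunk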
def index_chunk(cls, id_list, reraise=False):
    """Index a chunk of documents.

    :arg cls: The MappingType class.
    :arg id_list: Iterable of ids of that MappingType to index.
    :arg reraise: False if you want errors to be swallowed and True
        if you want errors to be thrown.

    """
    # Note: This bulk indexes in batches of 80. I didn't arrive at
    # this number through a proper scientific method. It's possible
    # there's a better number. It takes a while to fiddle with,
    # though. Probably best to expose the number as an environment
    # variable, then run a script that takes timings for
    # --criticalmass, runs overnight and returns a more "optimal"
    # number.
    for ids in chunked(id_list, 80):
        documents = []
        for id_ in ids:
            try:
                documents.append(cls.extract_document(id_))

            except UnindexMeBro:
                # extract_document throws this in cases where we need
                # to remove the item from the index.
                cls.unindex(id_)

            except Exception:
                log.exception('Unable to extract/index document (id: %d)',
                              id_)
                if reraise:
                    raise

        if documents:
            cls.bulk_index(documents, id_field='document_id')
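# The note above suggests exposing the batch size as an environment
# variable instead of hard-coding 80. A hypothetical version of that
# tweak (the ES_INDEX_BATCH_SIZE name is an assumption, not something
# in the codebase):
import os

INDEX_BATCH_SIZE = int(os.environ.get('ES_INDEX_BATCH_SIZE', '80'))

# ...and then inside index_chunk:
#     for ids in chunked(id_list, INDEX_BATCH_SIZE):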
def es_status_cmd(checkindex=False):
    """Shows elastic search index status"""
    try:
        try:
            read_doctype_stats = get_doctype_stats(READ_INDEX)
        except ESIndexMissingException:
            read_doctype_stats = None

        if READ_INDEX == WRITE_INDEX:
            write_doctype_stats = read_doctype_stats
        else:
            try:
                write_doctype_stats = get_doctype_stats(WRITE_INDEX)
            except ESIndexMissingException:
                write_doctype_stats = None

        indexes = get_indexes(all_indexes=True)
    except ESMaxRetryError:
        log.error("Your elasticsearch process is not running or ES_HOSTS "
                  "is set wrong in your settings_local.py file.")
        return

    log.info("Settings:")
    log.info("  ES_HOSTS            : %s", settings.ES_HOSTS)
    log.info("  ES_INDEX_PREFIX     : %s", settings.ES_INDEX_PREFIX)
    log.info("  ES_LIVE_INDEXING    : %s", settings.ES_LIVE_INDEXING)
    log.info("  ES_INDEXES          : %s", settings.ES_INDEXES)
    log.info("  ES_WRITE_INDEXES    : %s", settings.ES_WRITE_INDEXES)

    log.info("Index stats:")

    if indexes:
        log.info("  List of indexes:")
        for name, count in indexes:
            read_write = []
            if name == READ_INDEX:
                read_write.append("READ")
            if name == WRITE_INDEX:
                read_write.append("WRITE")
            log.info("    %-20s: %s %s", name, count,
                     "/".join(read_write))
    else:
        log.info("  There are no %s indexes.", settings.ES_INDEX_PREFIX)

    if read_doctype_stats is None:
        log.info("  Read index does not exist. (%s)", READ_INDEX)
    else:
        log.info("  Read index (%s):", READ_INDEX)
        for name, count in read_doctype_stats.items():
            log.info("    %-20s: %d", name, count)

    if READ_INDEX != WRITE_INDEX:
        if write_doctype_stats is None:
            log.info("  Write index does not exist. (%s)", WRITE_INDEX)
        else:
            log.info("  Write index (%s):", WRITE_INDEX)
            for name, count in write_doctype_stats.items():
                log.info("    %-20s: %d", name, count)
    else:
        log.info("  Write index is same as read index.")

    if checkindex:
        # Go through the index and verify everything
        log.info("Checking index contents....")

        missing_docs = 0

        for cls, id_list in get_indexable():
            for id_group in chunked(id_list, 100):
                doc_list = get_documents(cls, id_group)

                if len(id_group) != len(doc_list):
                    doc_list_ids = [doc["id"] for doc in doc_list]
                    for id_ in id_group:
                        if id_ not in doc_list_ids:
                            log.info("   Missing %s %s",
                                     cls.get_model_name(), id_)
                            missing_docs += 1

        if missing_docs:
            print "There were %d missing_docs" % missing_docs
def es_status_cmd(checkindex=False, log=log):
    """Shows elastic search index status"""
    try:
        read_doctype_stats = get_doctype_stats(READ_INDEX)
    except ES_EXCEPTIONS:
        read_doctype_stats = None

    if READ_INDEX == WRITE_INDEX:
        write_doctype_stats = read_doctype_stats
    else:
        try:
            write_doctype_stats = get_doctype_stats(WRITE_INDEX)
        except ES_EXCEPTIONS:
            write_doctype_stats = None

    try:
        indexes = get_indexes(all_indexes=True)
    except ES_EXCEPTIONS:
        log.error('Your elasticsearch process is not running or ES_URLS '
                  'is set wrong in your settings_local.py file.')
        return

    log.info('Settings:')
    log.info('  ES_URLS               : %s', settings.ES_URLS)
    log.info('  ES_INDEX_PREFIX       : %s', settings.ES_INDEX_PREFIX)
    log.info('  ES_LIVE_INDEXING      : %s', settings.ES_LIVE_INDEXING)
    log.info('  ES_INDEXES            : %s', settings.ES_INDEXES)
    log.info('  ES_WRITE_INDEXES      : %s', settings.ES_WRITE_INDEXES)

    log.info('Index stats:')

    if indexes:
        log.info('  List of indexes:')
        for name, count in sorted(indexes):
            read_write = []
            if name == READ_INDEX:
                read_write.append('READ')
            if name == WRITE_INDEX:
                read_write.append('WRITE')
            log.info('    %-22s: %s %s', name, count,
                     '/'.join(read_write))
    else:
        log.info('  There are no %s indexes.', settings.ES_INDEX_PREFIX)

    if read_doctype_stats is None:
        log.info('  Read index does not exist. (%s)', READ_INDEX)
    else:
        log.info('  Read index (%s):', READ_INDEX)
        for name, count in sorted(read_doctype_stats.items()):
            log.info('    %-22s: %d', name, count)

    if READ_INDEX != WRITE_INDEX:
        if write_doctype_stats is None:
            log.info('  Write index does not exist. (%s)', WRITE_INDEX)
        else:
            log.info('  Write index (%s):', WRITE_INDEX)
            for name, count in sorted(write_doctype_stats.items()):
                log.info('    %-22s: %d', name, count)
    else:
        log.info('  Write index is same as read index.')

    if checkindex:
        # Go through the index and verify everything
        log.info('Checking index contents....')

        missing_docs = 0

        for cls, id_list in get_indexable():
            for id_group in chunked(id_list, 100):
                doc_list = get_documents(cls, id_group)

                if len(id_group) != len(doc_list):
                    doc_list_ids = [doc['id'] for doc in doc_list]
                    for id_ in id_group:
                        if id_ not in doc_list_ids:
                            log.info('   Missing %s %s',
                                     cls.get_model_name(), id_)
                            missing_docs += 1

        if missing_docs:
            print 'There were %d missing_docs' % missing_docs
def es_reindex_cmd(percent=100, delete=False, mapping_types=None,
                   criticalmass=False, log=log):
    """Rebuild ElasticSearch indexes

    :arg percent: 1 to 100--the percentage of the db to index
    :arg delete: whether or not to wipe the index before reindexing
    :arg mapping_types: list of mapping types to index
    :arg criticalmass: whether or not to index just a critical mass
        of things
    :arg log: the logger to use

    """
    es = get_es()

    try:
        get_doctype_stats(WRITE_INDEX)
    except ES_EXCEPTIONS:
        if not delete:
            log.error('The index does not exist. You must specify --delete.')
            return

    if delete:
        log.info('wiping and recreating %s....', WRITE_INDEX)
        recreate_index(es=es)

    if criticalmass:
        # The critical mass is defined as the entire KB plus the most
        # recent 15k questions (which is about how many questions
        # there were created in the last 180 days). We build that
        # indexable here.

        # Get only questions and wiki document stuff.
        all_indexable = get_indexable(
            mapping_types=['questions_question', 'wiki_document'])

        # The first item is questions because we specified that
        # order. Old questions don't show up in searches, so we nix
        # them by reversing the list (ordered by id ascending) and
        # slicing it.
        all_indexable[0] = (all_indexable[0][0],
                            list(reversed(all_indexable[0][1]))[:15000])

    elif mapping_types:
        all_indexable = get_indexable(percent, mapping_types)

    else:
        all_indexable = get_indexable(percent)

    # We're doing a lot of indexing, so we get the refresh_interval
    # currently in the index, then nix refreshing. Later we'll restore
    # it.
    old_refresh = get_index_settings(WRITE_INDEX).get(
        'index.refresh_interval', '1s')

    # Disable automatic refreshing
    es.update_settings(WRITE_INDEX, {'index': {'refresh_interval': '-1'}})

    log.info('using index: %s', WRITE_INDEX)

    start_time = time.time()
    for cls, indexable in all_indexable:
        cls_start_time = time.time()
        total = len(indexable)

        if total == 0:
            continue

        log.info('reindexing %s. %s to index....',
                 cls.get_mapping_type_name(), total)

        i = 0
        for chunk in chunked(indexable, 1000):
            chunk_start_time = time.time()
            index_chunk(cls, chunk)

            i += len(chunk)
            time_to_go = (total - i) * ((time.time() - cls_start_time) / i)
            per_1000 = (time.time() - cls_start_time) / (i / 1000.0)
            this_1000 = time.time() - chunk_start_time

            log.info('   %s/%s %s... (%s/1000 avg, %s ETA)',
                     i, total,
                     format_time(this_1000),
                     format_time(per_1000),
                     format_time(time_to_go))

        delta_time = time.time() - cls_start_time
        log.info('   done! (%s total, %s/1000 avg)',
                 format_time(delta_time),
                 format_time(delta_time / (total / 1000.0)))

    # Re-enable automatic refreshing
    es.update_settings(
        WRITE_INDEX, {'index': {'refresh_interval': old_refresh}})

    delta_time = time.time() - start_time
    log.info('done! (%s total)', format_time(delta_time))
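# The refresh_interval disable/restore pair above isn't protected by
# try/finally, so a failure mid-reindex would leave refreshing off. A
# hedged sketch of a context manager built from the same
# get_index_settings/update_settings calls already shown (the
# refresh_disabled name is made up):
from contextlib import contextmanager


@contextmanager
def refresh_disabled(es, index):
    """Disable index refresh for a bulk run, then restore the old
    interval even if indexing raises."""
    old_refresh = get_index_settings(index).get(
        'index.refresh_interval', '1s')
    es.update_settings(index, {'index': {'refresh_interval': '-1'}})
    try:
        yield
    finally:
        es.update_settings(
            index, {'index': {'refresh_interval': old_refresh}})

# The indexing loop above could then run inside
# "with refresh_disabled(es, WRITE_INDEX):".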
def es_status_cmd(checkindex=False, log=log):
    """Shows elastic search index status"""
    try:
        read_doctype_stats = get_doctype_stats(READ_INDEX)
    except ES_EXCEPTIONS:
        read_doctype_stats = None

    if READ_INDEX == WRITE_INDEX:
        write_doctype_stats = read_doctype_stats
    else:
        try:
            write_doctype_stats = get_doctype_stats(WRITE_INDEX)
        except ES_EXCEPTIONS:
            write_doctype_stats = None

    try:
        indexes = get_indexes(all_indexes=True)
    except ES_EXCEPTIONS:
        log.error('Your elasticsearch process is not running or ES_URLS '
                  'is set wrong in your settings_local.py file.')
        return

    log.info('Settings:')
    log.info('  ES_URLS             : %s', settings.ES_URLS)
    log.info('  ES_INDEX_PREFIX     : %s', settings.ES_INDEX_PREFIX)
    log.info('  ES_LIVE_INDEXING    : %s', settings.ES_LIVE_INDEXING)
    log.info('  ES_INDEXES          : %s', settings.ES_INDEXES)
    log.info('  ES_WRITE_INDEXES    : %s', settings.ES_WRITE_INDEXES)

    log.info('Index stats:')

    if indexes:
        log.info('  List of indexes:')
        for name, count in indexes:
            read_write = []
            if name == READ_INDEX:
                read_write.append('READ')
            if name == WRITE_INDEX:
                read_write.append('WRITE')
            log.info('    %-20s: %s %s', name, count,
                     '/'.join(read_write))
    else:
        log.info('  There are no %s indexes.', settings.ES_INDEX_PREFIX)

    if read_doctype_stats is None:
        log.info('  Read index does not exist. (%s)', READ_INDEX)
    else:
        log.info('  Read index (%s):', READ_INDEX)
        for name, count in read_doctype_stats.items():
            log.info('    %-20s: %d', name, count)

    if READ_INDEX != WRITE_INDEX:
        if write_doctype_stats is None:
            log.info('  Write index does not exist. (%s)', WRITE_INDEX)
        else:
            log.info('  Write index (%s):', WRITE_INDEX)
            for name, count in write_doctype_stats.items():
                log.info('    %-20s: %d', name, count)
    else:
        log.info('  Write index is same as read index.')

    if checkindex:
        # Go through the index and verify everything
        log.info('Checking index contents....')

        missing_docs = 0

        for cls, id_list in get_indexable():
            for id_group in chunked(id_list, 100):
                doc_list = get_documents(cls, id_group)

                if len(id_group) != len(doc_list):
                    doc_list_ids = [doc['id'] for doc in doc_list]
                    for id_ in id_group:
                        if id_ not in doc_list_ids:
                            log.info('   Missing %s %s',
                                     cls.get_model_name(), id_)
                            missing_docs += 1

        if missing_docs:
            print 'There were %d missing_docs' % missing_docs
def es_reindex_cmd(percent=100, delete=False, models=None,
                   criticalmass=False, log=log):
    """Rebuild ElasticSearch indexes

    :arg percent: 1 to 100--the percentage of the db to index
    :arg delete: whether or not to wipe the index before reindexing
    :arg models: list of search model names to index
    :arg criticalmass: whether or not to index just a critical mass
        of things
    :arg log: the logger to use

    """
    es = get_es()

    try:
        get_doctype_stats(WRITE_INDEX)
    except ES_EXCEPTIONS:
        if not delete:
            log.error('The index does not exist. You must specify --delete.')
            return

    if delete:
        log.info('wiping and recreating %s....', WRITE_INDEX)
        recreate_index(es=es)

    if criticalmass:
        # The critical mass is defined as the entire KB plus the most
        # recent 15k questions (which is about how many questions
        # there were created in the last 180 days). We build that
        # indexable here.

        # Get only questions and wiki document stuff.
        all_indexable = get_indexable(
            search_models=['questions_question', 'wiki_document'])

        # The first item is questions because we specified that
        # order. Old questions don't show up in searches, so we nix
        # them by reversing the list (ordered by id ascending) and
        # slicing it.
        all_indexable[0] = (all_indexable[0][0],
                            list(reversed(all_indexable[0][1]))[:15000])

    elif models:
        all_indexable = get_indexable(percent, models)

    else:
        all_indexable = get_indexable(percent)

    start_time = time.time()
    for cls, indexable in all_indexable:
        cls_start_time = time.time()
        total = len(indexable)

        if total == 0:
            continue

        log.info('reindex %s into %s index. %s to index....',
                 cls.get_model_name(), WRITE_INDEX, total)

        i = 0
        for chunk in chunked(indexable, 1000):
            index_chunk(cls, chunk)

            i += len(chunk)
            time_to_go = (total - i) * ((time.time() - start_time) / i)
            per_1000 = (time.time() - start_time) / (i / 1000.0)
            log.info('%s/%s... (%s to go, %s per 1000 docs)',
                     i, total, format_time(time_to_go),
                     format_time(per_1000))

            # We call this every 1000 or so because we're
            # essentially loading the whole db and if DEBUG=True,
            # then Django saves every sql statement which causes
            # our memory to go up up up. So we reset it and that
            # makes things happier even in DEBUG environments.
            reset_queries()

        delta_time = time.time() - cls_start_time
        log.info('done! (%s, %s per 1000 docs)',
                 format_time(delta_time),
                 format_time(delta_time / (total / 1000.0)))

    delta_time = time.time() - start_time
    log.info('done! (total time: %s)', format_time(delta_time))
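# The reversed-and-sliced indexable in the criticalmass branch above
# is easy to misread. A toy illustration of the same idiom: ids are
# ordered ascending, so reversing and slicing keeps the newest N.
ids = [1, 2, 3, 4, 5]                # ordered by id ascending (oldest first)
newest_two = list(reversed(ids))[:2]
assert newest_two == [5, 4]          # the slice keeps the most recent ids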
def es_reindex_cmd(percent=100, delete=False, models=None):
    """Rebuild ElasticSearch indexes

    :arg percent: 1 to 100--the percentage of the db to index
    :arg delete: whether or not to wipe the index before reindexing
    :arg models: list of search model names to index

    """
    es = get_indexing_es()

    try:
        get_doctype_stats(WRITE_INDEX)
    except ESIndexMissingException:
        if not delete:
            log.error('The index does not exist. You must specify --delete.')
            return

    if delete:
        log.info('wiping and recreating %s....', WRITE_INDEX)
        recreate_index(es=es)

    if models:
        indexable = get_indexable(percent, models)
    else:
        indexable = get_indexable(percent)

    start_time = time.time()
    for cls, indexable in indexable:
        cls_start_time = time.time()
        total = len(indexable)

        if total == 0:
            continue

        log.info('reindex %s into %s index. %s to index....',
                 cls.get_model_name(), WRITE_INDEX, total)

        i = 0
        for chunk in chunked(indexable, 1000):
            index_chunk(cls, chunk, es=es)

            i += len(chunk)
            time_to_go = (total - i) * ((time.time() - start_time) / i)
            per_1000 = (time.time() - start_time) / (i / 1000.0)
            log.info('%s/%s... (%s to go, %s per 1000 docs)',
                     i, total, format_time(time_to_go),
                     format_time(per_1000))

            # We call this every 1000 or so because we're
            # essentially loading the whole db and if DEBUG=True,
            # then Django saves every sql statement which causes
            # our memory to go up up up. So we reset it and that
            # makes things happier even in DEBUG environments.
            reset_queries()

        delta_time = time.time() - cls_start_time
        log.info('done! (%s, %s per 1000 docs)',
                 format_time(delta_time),
                 format_time(delta_time / (total / 1000.0)))

    delta_time = time.time() - start_time
    log.info('done! (total time: %s)', format_time(delta_time))
def handle_reindex(request):
    """Calculates and kicks off indexing tasks"""
    write_index = es_utils.WRITE_INDEX

    # This is truthy if the user wants us to delete and recreate
    # the index first.
    delete_index_first = bool(request.POST.get('delete_index'))

    if delete_index_first:
        # Coming from the delete form, so we reindex all models.
        models_to_index = None
    else:
        # Coming from the reindex form, so we reindex whatever we're
        # told.
        models_to_index = [name.replace('check_', '')
                           for name in request.POST.keys()
                           if name.startswith('check_')]

    # TODO: If this gets fux0rd, then it's possible this could be
    # non-zero and we really want to just ignore it. Need the ability
    # to ignore it.
    try:
        client = redis_client('default')
        val = client.get(OUTSTANDING_INDEX_CHUNKS)
        if val is not None and int(val) > 0:
            raise ReindexError('There are %s outstanding chunks.' % val)

        # We don't know how many chunks we're building, but we do want
        # to make sure another reindex request doesn't slide in here
        # and kick off a bunch of chunks.
        #
        # There is a race condition here.
        client.set(OUTSTANDING_INDEX_CHUNKS, 1)
    except RedisError:
        log.warning('Redis not running. Cannot check if there are '
                    'outstanding tasks.')

    batch_id = create_batch_id()

    # Break up all the things we want to index into chunks. This
    # chunkifies by class, then by chunk size.
    chunks = []
    for cls, indexable in get_indexable(search_models=models_to_index):
        chunks.extend(
            (cls, chunk) for chunk in chunked(indexable, CHUNK_SIZE))

    if delete_index_first:
        # The previous lines do a lot of work and take some time to
        # execute. So we wait until here to wipe and rebuild the
        # index. That reduces the time that there is no index by a
        # little.
        recreate_index()

    chunks_count = len(chunks)

    try:
        client = redis_client('default')
        client.set(OUTSTANDING_INDEX_CHUNKS, chunks_count)
    except RedisError:
        log.warning('Redis not running. Can\'t denote outstanding tasks.')

    for chunk in chunks:
        index_chunk_task.delay(write_index, batch_id, chunk)

    return HttpResponseRedirect(request.path)
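# The comment above concedes a race between the get() check and the
# set(). A hypothetical way to close it with redis-py's atomic SETNX,
# assuming the same redis_client helper (this is a sketch, not the
# project's actual fix):
client = redis_client('default')
if not client.setnx(OUTSTANDING_INDEX_CHUNKS, 1):
    # Another request already claimed the counter, so bail out.
    raise ReindexError('A reindex appears to be in progress already.')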