import logging
import time

from django.conf import settings
from django.db import reset_queries

# Assumption: es_utils is the project's ElasticSearch helper module
# (get_indexing_es, get_es, format_time, ...); adjust the import path
# to match where it lives.
from search import es_utils

# Assumption: logger name for illustration; the original module
# defines its own log.
log = logging.getLogger('search.es')


# Intended to be used as a classmethod on a searchable model (hence
# the ``cls`` argument).
def index_all(cls, percent=100):
    """Reindexes all the objects for this model.

    Yields number of documents done.

    Note: This can get run from the command line, so we log stuff to
    let the user know what's going on.

    :arg percent: The percentage of objects to index. Defaults to
        100--e.g. all of them.

    """
    es = es_utils.get_indexing_es()

    doc_type = cls._meta.db_table
    index = settings.ES_WRITE_INDEXES['default']

    start_time = time.time()

    indexable_qs = cls.get_indexable()

    log.info('reindex %s into %s index', doc_type, index)
    log.info('iterating through %s....', doc_type)

    total = indexable_qs.count()
    to_index = int(total * (percent / 100.0))
    log.info('total %s: %s (to be indexed: %s)', doc_type, total, to_index)

    if to_index == 0:
        log.info('done!')
        return

    total = to_index

    for t, obj_id in enumerate(indexable_qs):
        # >= so we index exactly to_index documents (> indexed one
        # extra when percent < 100).
        if t >= total:
            break

        if t % 1000 == 0 and t > 0:
            time_to_go = (total - t) * ((time.time() - start_time) / t)
            per_1000 = (time.time() - start_time) / (t / 1000.0)
            log.info('%s/%s... (%s to go, %s per 1000 docs)', t, total,
                     es_utils.format_time(time_to_go),
                     es_utils.format_time(per_1000))

            # We call this every 1000 or so because we're
            # essentially loading the whole db and if DEBUG=True,
            # then Django saves every sql statement which causes
            # our memory to go up up up. So we reset it and that
            # makes things happier even in DEBUG environments.
            reset_queries()

        if t % settings.ES_FLUSH_BULK_EVERY == 0:
            # We built the ES with this setting, but it doesn't
            # actually do anything with it unless we call
            # flush_bulk which causes it to check its bulk_size
            # and flush it if it's too big.
            es.flush_bulk()

        try:
            cls.index(cls.extract_document(obj_id), bulk=True, es=es)
        except Exception:
            log.exception('Unable to extract/index document (id: %d)',
                          obj_id)

        yield t

    es.flush_bulk(forced=True)

    delta_time = time.time() - start_time
    log.info('done! (%s, %s per 1000 docs)',
             es_utils.format_time(delta_time),
             es_utils.format_time(delta_time / (total / 1000.0)))
    es.refresh()
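

# Usage sketch: index_all is a generator, so a caller -- e.g. a
# management command -- has to iterate it to drive the indexing. This
# helper and the Question model named in the example call are
# assumptions for illustration, not code from the project; any model
# providing the get_indexable/extract_document/index hooks would work.
def reindex_model(model_cls, percent=100):
    """Drive index_all for model_cls, printing coarse progress."""
    for done in index_all(model_cls, percent=percent):
        if done and done % 10000 == 0:
            print('indexed %s documents so far' % done)

# e.g. reindex_model(Question, percent=50)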


# A second variant of the same routine: instead of get_indexable and
# ES_WRITE_INDEXES, it asks the model for its index, re-creates
# non-default indexes from scratch, pushes the mapping, and then pulls
# objects out of the db one id at a time.
def index_all(cls, percent=100):
    """Reindexes all the objects for this model.

    Yields number of documents done.

    Note: This can get run from the command line, so we log stuff to
    let the user know what's going on.

    :arg percent: The percentage of objects to index. Defaults to
        100--e.g. all of them.

    """
    es = es_utils.get_es()

    doc_type = cls._meta.db_table
    index = cls.get_es_index()

    if index != settings.ES_INDEXES.get('default'):
        # If this doctype isn't using the default index, then this
        # doctype is responsible for deleting and re-creating the
        # index.
        es.delete_index_if_exists(index)
        es.create_index(index)

    start_time = time.time()

    log.info('reindex %s into %s index', doc_type, index)

    log.info('setting up mapping....')
    mapping = cls.get_mapping()
    es.put_mapping(doc_type, mapping, index)

    log.info('iterating through %s....', doc_type)
    total = cls.objects.count()
    to_index = int(total * (percent / 100.0))
    log.info('total %s: %s (to be indexed: %s)', doc_type, total, to_index)
    total = to_index

    # Some models have a gazillion instances. So we want to go
    # through them one at a time in a way that doesn't pull all
    # the data into memory all at once. So we iterate through ids
    # and pull the objects one at a time.
    qs = cls.objects.order_by('id').values_list('id', flat=True)

    for t, obj_id in enumerate(qs.iterator()):
        # >= so we index exactly to_index documents and do nothing
        # when to_index is 0.
        if t >= total:
            break

        obj = cls.objects.get(pk=obj_id)

        if t % 1000 == 0 and t > 0:
            time_to_go = (total - t) * ((time.time() - start_time) / t)
            log.info('%s/%s... (%s to go)', t, total,
                     es_utils.format_time(time_to_go))

            # We call this every 1000 or so because we're
            # essentially loading the whole db and if DEBUG=True,
            # then Django saves every sql statement which causes
            # our memory to go up up up. So we reset it and that
            # makes things happier even in DEBUG environments.
            reset_queries()

        if t % settings.ES_FLUSH_BULK_EVERY == 0:
            # We built the ES with this setting, but it doesn't
            # actually do anything with it unless we call
            # flush_bulk which causes it to check its bulk_size
            # and flush it if it's too big.
            es.flush_bulk()

        try:
            cls.index(obj.extract_document(), bulk=True, es=es)
        except Exception:
            log.exception('Unable to extract/index document (id: %d)',
                          obj.id)

        yield t

    es.flush_bulk(forced=True)

    end_time = time.time()
    log.info('done! (%s)', es_utils.format_time(end_time - start_time))
    es.refresh()
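

from django.db import models


# Sketch of the model-side hooks the first variant expects. This is an
# illustration, not code from the project: the hook names come from the
# calls above (get_indexable, extract_document), but the model, its
# fields, and the method bodies are assumptions showing one plausible
# shape for them. The remaining hooks (get_es_index, get_mapping,
# index) would be filled in the same way.
class SearchableQuestion(models.Model):  # hypothetical model
    title = models.CharField(max_length=255)
    content = models.TextField()

    @classmethod
    def get_indexable(cls):
        # Ids of everything that should be in the index; a flat
        # values_list keeps memory use down for big tables.
        return cls.objects.order_by('id').values_list('id', flat=True)

    @classmethod
    def extract_document(cls, obj_id):
        # Flatten one row into the dict that gets indexed.
        obj = cls.objects.get(pk=obj_id)
        return {'id': obj.id, 'title': obj.title, 'content': obj.content}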