def bulk_index(cls, documents, id_field='id', es=None):
    """Adds or updates a batch of documents.

    :arg documents: List of Python dicts representing individual
        documents to be added to the index

        .. Note::

           This must be serializable into JSON.

    :arg id_field: The name of the field to use as the document id.
        This defaults to 'id'.

    :arg es: The `ElasticSearch` to use. If you don't specify an
        `ElasticSearch`, it'll use `es_utils.get_es()`.

    .. Note::

       If you need the documents available for searches immediately,
       make sure to refresh the index by calling ``refresh_index()``.

    FIXME: This is copied from Indexable.

    """
    if es is None:
        es = es_utils.get_es()

    es.bulk_index(cls.get_index(), cls.get_mapping_type_name(),
                  documents, id_field)
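# Hypothetical usage sketch, not part of the original source: assuming a
# mapping-type class that carries bulk_index() above, this is roughly how a
# caller might push a small batch of documents and then refresh the index so
# they are searchable right away. The sample documents and the
# refresh_index() call are illustrative assumptions based on the docstring.
def _example_bulk_index(mapping_type_cls):
    documents = [
        {'id': 1, 'title': u'How do I reset my password?'},
        {'id': 2, 'title': u'Firefox crashes on startup'},
    ]
    # Every value in these dicts must be JSON-serializable.
    mapping_type_cls.bulk_index(documents, id_field='id')
    # Refresh so the new documents show up in searches immediately
    # (see the note in the docstring above).
    mapping_type_cls.refresh_index()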
def auto_lock_old_questions():
    """Locks all questions that were created over 180 days ago"""
    # Set up logging so it doesn't send Ricky email.
    logging.basicConfig(level=logging.ERROR)

    # Get a list of ids of questions we're going to go change. We need
    # a list of ids so that we can feed it to the update, but then
    # also know what we need to update in the index.
    days_180 = datetime.now() - timedelta(days=180)
    q_ids = list(Question.objects.filter(is_locked=False)
                                 .filter(created__lte=days_180)
                                 .values_list('id', flat=True))

    if q_ids:
        log.info('Updating %d questions', len(q_ids))

        sql = """
            UPDATE questions_question
            SET is_locked = 1
            WHERE id IN (%s)
            """ % ','.join(map(str, q_ids))

        cursor = connection.cursor()
        cursor.execute(sql)
        transaction.commit_unless_managed()

        if settings.ES_LIVE_INDEXING:
            try:
                es = get_es()

                # So... the first time this runs, it'll handle 160K
                # questions or so which stresses everything. Thus we
                # do it in chunks because otherwise this won't work.
                #
                # After we've done this for the first time, we can nix
                # the chunking code.
                from search.utils import chunked

                for chunk in chunked(q_ids, 1000):
                    # Fetch all the documents we need to update.
                    es_docs = get_documents(Question, chunk)

                    log.info('Updating %d index documents', len(es_docs))

                    # For each document, update the data and stick it
                    # back in the index.
                    for doc in es_docs:
                        doc[u'question_is_locked'] = True
                        Question.index(doc, bulk=True, es=es)

                    es.flush_bulk(forced=True)

                es.refresh(WRITE_INDEX, timesleep=0)
            except ES_EXCEPTIONS:
                # Something happened with ES, so let's push index updating
                # into an index_task which retries when it fails because
                # of ES issues.
                index_task.delay(Question, q_ids)
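# A minimal sketch of the chunked() helper imported from search.utils above,
# assuming it just slices an iterable into fixed-size lists; the real
# implementation may differ.
def chunked_sketch(iterable, chunk_size):
    items = list(iterable)
    for start in range(0, len(items), chunk_size):
        yield items[start:start + chunk_size]

# e.g. list(chunked_sketch([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]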
def index(cls, document, id_=None, force_insert=False, es=None):
    """Indexes a single document"""
    if not settings.ES_LIVE_INDEXING:
        return

    if es is None:
        es = es_utils.get_es()

    es.index(cls.get_index(), cls.get_mapping_type_name(),
             document, id=document['document_id'],
             force_insert=force_insert)
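# Hypothetical usage sketch, not part of the original source: index() above
# ignores the id_ argument and uses the document's own 'document_id' field as
# the ES document id, so the extracted document must carry one. The post-save
# wiring and extract_document() call here are illustrative assumptions
# (extract_document() is the same hook index_all() uses below).
def _example_index_on_save(instance):
    if settings.ES_LIVE_INDEXING:
        document = instance.extract_document()
        instance.__class__.index(document, force_insert=False)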
def unindex(cls, id_, es=None):
    """Removes a document from the index"""
    if not settings.ES_LIVE_INDEXING:
        return

    if es is None:
        es = es_utils.get_es()

    try:
        es.delete(es_utils.WRITE_INDEX, es_utils.SUMO_DOCTYPE,
                  cls.get_document_id(id_))
    except ElasticHttpNotFoundError:
        # Ignore the case where we try to delete something that's
        # not there.
        pass
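# Hypothetical usage sketch, not part of the original source: unindex() is
# the sort of thing a post_delete signal handler would call so the index
# stays in sync when rows are deleted. The handler signature is an
# illustrative assumption.
def _example_unindex_on_delete(sender, instance, **kwargs):
    # Deleting a document that was never indexed is silently ignored,
    # thanks to the ElasticHttpNotFoundError handling above.
    sender.unindex(instance.id)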
def index_all(cls, percent=100):
    """Reindexes all the objects for this model.

    Yields number of documents done.

    Note: This can get run from the command line, so we log stuff
    to let the user know what's going on.

    :arg percent: The percentage of questions to index. Defaults to
        100--e.g. all of them.

    """
    es = es_utils.get_es()

    doc_type = cls._meta.db_table
    index = cls.get_es_index()

    if index != settings.ES_INDEXES.get('default'):
        # If this doctype isn't using the default index, then this
        # doctype is responsible for deleting and re-creating the
        # index.
        es.delete_index_if_exists(index)
        es.create_index(index)

    start_time = time.time()

    log.info('reindex %s into %s index', doc_type, index)

    log.info('setting up mapping....')
    mapping = cls.get_mapping()
    es.put_mapping(doc_type, mapping, index)

    log.info('iterating through %s....', doc_type)
    total = cls.objects.count()
    to_index = int(total * (percent / 100.0))
    log.info('total %s: %s (to be indexed: %s)', doc_type, total, to_index)
    total = to_index

    # Some models have a gazillion instances. So we want to go
    # through them one at a time in a way that doesn't pull all
    # the data into memory all at once. So we iterate through ids
    # and pull the objects one at a time.
    qs = cls.objects.order_by('id').values_list('id', flat=True)

    for t, obj_id in enumerate(qs.iterator()):
        if t > total:
            break

        obj = cls.objects.get(pk=obj_id)

        if t % 1000 == 0 and t > 0:
            time_to_go = (total - t) * ((time.time() - start_time) / t)
            log.info('%s/%s... (%s to go)', t, total,
                     es_utils.format_time(time_to_go))

            # We call this every 1000 or so because we're
            # essentially loading the whole db and if DEBUG=True,
            # then Django saves every sql statement which causes
            # our memory to go up up up. So we reset it and that
            # makes things happier even in DEBUG environments.
            reset_queries()

        if t % settings.ES_FLUSH_BULK_EVERY == 0:
            # We built the ES with this setting, but it doesn't
            # actually do anything with it unless we call
            # flush_bulk which causes it to check its bulk_size
            # and flush it if it's too big.
            es.flush_bulk()

        try:
            cls.index(obj.extract_document(), bulk=True, es=es)
        except Exception:
            log.exception('Unable to extract/index document (id: %d)',
                          obj.id)

        yield t

    es.flush_bulk(forced=True)
    end_time = time.time()
    log.info('done! (%s)', es_utils.format_time(end_time - start_time))

    es.refresh()
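# Hypothetical usage sketch, not part of the original source: index_all() is
# a generator that yields a running count, so a caller (for example a reindex
# management command) can drive it and report progress. The model class,
# percent value, and log interval are illustrative assumptions.
def _example_reindex(model_cls, percent=10):
    for count in model_cls.index_all(percent=percent):
        if count and count % 5000 == 0:
            log.info('%s: %s documents indexed so far',
                     model_cls.__name__, count)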