Example #1
    def bulk_index(cls, documents, id_field='id', es=None):
        """Adds or updates a batch of documents.

        :arg documents: List of Python dicts representing individual
            documents to be added to the index

            .. Note::

               This must be serializable into JSON.

        :arg id_field: The name of the field to use as the document
            id. This defaults to 'id'.

        :arg es: The `ElasticSearch` to use. If you don't specify an
            `ElasticSearch`, it'll use `es_utils.get_es()`.

        .. Note::

           If you need the documents available for searches
           immediately, make sure to refresh the index by calling
           ``refresh_index()``.


        FIXME: This is copied from Indexable.

        """
        if es is None:
            es = es_utils.get_es()

        es.bulk_index(cls.get_index(),
                      cls.get_mapping_type_name(),
                      documents,
                      id_field)
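
A minimal usage sketch for the method above. The mapping type class
name, the document contents, and where ``refresh_index()`` lives are
assumptions for illustration; ``refresh_index()`` itself is the call
the docstring's note refers to:

    # Hypothetical caller: index two documents in one bulk request,
    # then refresh the index so they are searchable immediately.
    docs = [
        {'id': 1, 'title': 'How do I reset my password?'},
        {'id': 2, 'title': 'Firefox will not start'},
    ]
    QuestionMappingType.bulk_index(docs, id_field='id')
    QuestionMappingType.refresh_index()
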
Example #2
    def bulk_index(cls, documents, id_field='id', es=None):
        """Adds or updates a batch of documents.

        :arg documents: List of Python dicts representing individual
            documents to be added to the index

            .. Note::

               This must be serializable into JSON.

        :arg id_field: The name of the field to use as the document
            id. This defaults to 'id'.

        :arg es: The `ElasticSearch` to use. If you don't specify an
            `ElasticSearch`, it'll use `es_utils.get_es()`.

        .. Note::

           If you need the documents available for searches
           immediately, make sure to refresh the index by calling
           ``refresh_index()``.


        FIXME: This is copied from Indexable.

        """
        if es is None:
            es = es_utils.get_es()

        es.bulk_index(cls.get_index(), cls.get_mapping_type_name(), documents,
                      id_field)
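
The docstring's note that documents must be JSON-serializable matters
in practice for types like datetime. A sketch of coercing such a value
before indexing; the field names and mapping type class are assumed:

    from datetime import datetime

    # Hypothetical pre-serialization step: JSON has no datetime type,
    # so convert the value to a string representation first.
    doc = {'id': 3, 'created': datetime(2012, 5, 1)}
    doc['created'] = doc['created'].isoformat()
    QuestionMappingType.bulk_index([doc])
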
Example #3
def auto_lock_old_questions():
    """Locks all questions that were created over 180 days ago"""
    # Set up logging so it doesn't send Ricky email.
    logging.basicConfig(level=logging.ERROR)

    # Get a list of ids of questions we're going to go change. We need
    # a list of ids so that we can feed it to the update, but then
    # also know what we need to update in the index.
    days_180 = datetime.now() - timedelta(days=180)
    q_ids = list(Question.objects.filter(is_locked=False)
                                 .filter(created__lte=days_180)
                                 .values_list('id', flat=True))

    if q_ids:
        log.info('Updating %d questions', len(q_ids))

        sql = """
            UPDATE questions_question
            SET is_locked = 1
            WHERE id IN (%s)
            """ % ','.join(map(str, q_ids))

        cursor = connection.cursor()
        cursor.execute(sql)
        transaction.commit_unless_managed()

        if settings.ES_LIVE_INDEXING:
            try:
                es = get_es()

                # So... the first time this runs, it'll handle 160K
                # questions or so which stresses everything. Thus we
                # do it in chunks because otherwise this won't work.
                #
                # After we've done this for the first time, we can nix
                # the chunking code.

                from search.utils import chunked
                for chunk in chunked(q_ids, 1000):

                    # Fetch all the documents we need to update.
                    es_docs = get_documents(Question, chunk)

                    log.info('Updating %d index documents', len(es_docs))

                    # For each document, update the data and stick it
                    # back in the index.
                    for doc in es_docs:
                        doc[u'question_is_locked'] = True
                        Question.index(doc, bulk=True, es=es)

                    es.flush_bulk(forced=True)
                    es.refresh(WRITE_INDEX, timesleep=0)

            except ES_EXCEPTIONS:
                # Something happened with ES, so let's push index updating
                # into an index_task which retries when it fails because
                # of ES issues.
                index_task.delay(Question, q_ids)
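
The ``chunked()`` helper imported from search.utils above splits the id
list into fixed-size batches. A minimal sketch of such a generator (the
actual search.utils implementation may differ):

    def chunked(items, n):
        # Yield successive n-sized slices of a list.
        for i in range(0, len(items), n):
            yield items[i:i + n]
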
Example #4
    def index(cls, document, id_=None, force_insert=False, es=None):
        """Indexes a single document"""
        if not settings.ES_LIVE_INDEXING:
            return

        if es is None:
            es = es_utils.get_es()

        es.index(cls.get_index(),
                 cls.get_mapping_type_name(),
                 document,
                 id=document['document_id'],
                 force_insert=force_insert)
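
A usage sketch for the method above. Note that this implementation
ignores the ``id_`` argument and reads the document id from the
``document_id`` key instead; the class and field values here are
assumed:

    # Hypothetical caller: the document must carry its own id in
    # 'document_id' because the id_ parameter is not used.
    doc = {'document_id': 42, 'title': 'Some document'}
    DocumentMappingType.index(doc)
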
Example #5
    def index(cls, document, id_=None, force_insert=False, es=None):
        """Indexes a single document"""
        if not settings.ES_LIVE_INDEXING:
            return

        if es is None:
            es = es_utils.get_es()

        es.index(
            cls.get_index(),
            cls.get_mapping_type_name(),
            document,
            id=document['document_id'],
            force_insert=force_insert)
Example #6
    def unindex(cls, id_, es=None):
        """Removes a document from the index"""
        if not settings.ES_LIVE_INDEXING:
            return

        if es is None:
            es = es_utils.get_es()

        try:
            es.delete(es_utils.WRITE_INDEX, es_utils.SUMO_DOCTYPE,
                      cls.get_document_id(id_))

        except ElasticHttpNotFoundError:
            # Ignore the case where we try to delete something that's
            # not there.
            pass
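
A usage sketch for ``unindex()``; the mapping type class is assumed.
Deleting an id that is not in the index is a no-op because the
``ElasticHttpNotFoundError`` is swallowed:

    # Hypothetical caller: remove the document for model instance 42.
    DocumentMappingType.unindex(42)
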
Example #7
    def index_all(cls, percent=100):
        """Reindexes all the objects for this model.

        Yields number of documents done.

        Note: This can get run from the command line, so we log stuff
        to let the user know what's going on.

        :arg percent: The percentage of objects to index. Defaults to
            100, i.e. all of them.

        """
        es = es_utils.get_es()

        doc_type = cls._meta.db_table
        index = cls.get_es_index()

        if index != settings.ES_INDEXES.get('default'):
            # If this doctype isn't using the default index, then this
            # doctype is responsible for deleting and re-creating the
            # index.
            es.delete_index_if_exists(index)
            es.create_index(index)

        start_time = time.time()

        log.info('reindex %s into %s index', doc_type, index)

        log.info('setting up mapping....')
        mapping = cls.get_mapping()
        es.put_mapping(doc_type, mapping, index)

        log.info('iterating through %s....', doc_type)
        total = cls.objects.count()
        to_index = int(total * (percent / 100.0))
        log.info('total %s: %s (to be indexed: %s)', doc_type, total, to_index)
        total = to_index

        # Some models have a gazillion instances. So we want to go
        # through them one at a time in a way that doesn't pull all
        # the data into memory all at once. So we iterate through ids
        # and pull the objects one at a time.
        qs = cls.objects.order_by('id').values_list('id', flat=True)

        for t, obj_id in enumerate(qs.iterator()):
            if t >= total:
                break

            obj = cls.objects.get(pk=obj_id)

            if t % 1000 == 0 and t > 0:
                time_to_go = (total - t) * ((time.time() - start_time) / t)
                log.info('%s/%s... (%s to go)', t, total,
                         es_utils.format_time(time_to_go))

                # We call this every 1000 or so because we're
                # essentially loading the whole db and if DEBUG=True,
                # then Django saves every sql statement which causes
                # our memory to go up up up. So we reset it and that
                # makes things happier even in DEBUG environments.
                reset_queries()

            if t % settings.ES_FLUSH_BULK_EVERY == 0:
                # We built the ES with this setting, but it doesn't
                # actually do anything with it unless we call
                # flush_bulk which causes it to check its bulk_size
                # and flush it if it's too big.
                es.flush_bulk()

            try:
                cls.index(obj.extract_document(), bulk=True, es=es)
            except Exception:
                log.exception('Unable to extract/index document (id: %d)',
                              obj.id)

            yield t

        es.flush_bulk(forced=True)
        end_time = time.time()
        log.info('done! (%s)', es_utils.format_time(end_time - start_time))
        es.refresh()
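
``index_all()`` is a generator that yields the running document count,
so the caller has to drain it. A minimal driver sketch, with the model
class assumed (``Question`` appears in Example #3 above):

    # Hypothetical driver: reindex 10% of the objects and report
    # progress every 1000 documents.
    for count in Question.index_all(percent=10):
        if count % 1000 == 0 and count > 0:
            print('indexed %d documents so far' % count)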