Example #1
File: cron.py Project: ibai/kitsune
def auto_lock_old_questions():
    """Locks all questions that were created over 180 days ago"""
    # Set up logging so it doesn't send Ricky email.
    logging.basicConfig(level=logging.ERROR)

    # Get a list of ids of questions we're going to change. We need
    # a list of ids so that we can feed it to the update, but then
    # also know what we need to update in the index.
    days_180 = datetime.now() - timedelta(days=180)
    q_ids = list(
        Question.objects.filter(is_locked=False).filter(
            created__lte=days_180).values_list('id', flat=True))

    if q_ids:
        log.info('Updating %d questions', len(q_ids))

        sql = """
            UPDATE questions_question
            SET is_locked = 1
            WHERE id IN (%s)
            """ % ','.join(map(str, q_ids))

        cursor = connection.cursor()
        cursor.execute(sql)
        transaction.commit_unless_managed()

        if settings.ES_LIVE_INDEXING:
            try:
                es = get_indexing_es()

                # So... the first time this runs, it'll handle 160K
                # questions or so which stresses everything. Thus we
                # do it in chunks because otherwise this won't work.
                #
                # After we've done this for the first time, we can nix
                # the chunking code.

                from search.utils import chunked
                for chunk in chunked(q_ids, 1000):

                    # Fetch all the documents we need to update.
                    es_docs = get_documents(Question, chunk)

                    log.info('Updating %d index documents', len(es_docs))

                    # For each document, update the data and stick it
                    # back in the index.
                    for doc in es_docs:
                        doc[u'question_is_locked'] = True
                        Question.index(doc, bulk=True, es=es)

                    es.flush_bulk(forced=True)
                    es.refresh(WRITE_INDEX, timesleep=0)

            except (ESTimeoutError, ESMaxRetryError, ESException):
                # Something happened with ES, so let's push index updating
                # into an index_task which retries when it fails because
                # of ES issues.
                index_task.delay(Question, q_ids)
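The loop above relies on a chunked helper imported from search.utils,
whose implementation isn't shown in the excerpt. Based on the call site
(slicing a list of ids into batches of 1000), a minimal sketch could
look like this; the real helper in kitsune may differ, e.g. by
accepting arbitrary iterables:

def chunked(seq, chunk_size):
    """Yields successive chunk_size-sized slices of seq.

    Sketch only: assumes seq supports len() and slicing, which holds
    for the list of ids passed in above.
    """
    for i in range(0, len(seq), chunk_size):
        yield seq[i:i + chunk_size]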
Example #2
    @classmethod
    def index(cls, document, bulk=False, force_insert=False, es=None):
        """Indexes a single document"""
        if not settings.ES_LIVE_INDEXING:
            return

        if es is None:
            # Use es_utils.get_indexing_es() because it uses
            # ES_INDEXING_TIMEOUT.
            es = es_utils.get_indexing_es()

        es.index(document,
                 index=es_utils.WRITE_INDEX,
                 doc_type=es_utils.SUMO_DOCTYPE,
                 id=cls.get_document_id(document['id']),
                 bulk=bulk,
                 force_insert=force_insert)
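This index method writes everything into the single SUMO_DOCTYPE, so
get_document_id presumably namespaces the database id to keep documents
from different models from colliding. Its body isn't shown in these
excerpts; a hedged guess, with the prefix choice purely illustrative:

    @classmethod
    def get_document_id(cls, id_):
        # Hypothetical implementation: prefix the database id with a
        # per-model token so ids from different models can't collide
        # in the shared doctype. The real helper may differ.
        return '%s_%s' % (cls._meta.db_table, id_)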
Example #3
    @classmethod
    def unindex(cls, id, es=None):
        """Removes a document from the index"""
        if not settings.ES_LIVE_INDEXING:
            return

        if es is None:
            # Use es_utils.get_indexing_es() because it uses
            # ES_INDEXING_TIMEOUT.
            es = es_utils.get_indexing_es()

        try:
            # TODO: There is a race condition here if this gets called
            # during reindexing.
            es.delete(es_utils.WRITE_INDEX, es_utils.SUMO_DOCTYPE, id)
        except pyes.exceptions.NotFoundException:
            # Ignore the case where we try to delete something that's
            # not there.
            pass
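None of the excerpts show what calls unindex. One plausible wiring is a
Django post_delete receiver, so rows removed from the database also
leave the index; this is a hedged sketch, not the project's actual
hookup:

from django.db.models.signals import post_delete

def remove_from_index(sender, instance, **kwargs):
    # Hypothetical receiver; kitsune's real wiring may differ.
    sender.unindex(instance.id)

post_delete.connect(remove_from_index, sender=Question)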
Example #4
    @classmethod
    def unindex(cls, id_, es=None):
        """Removes a document from the index"""
        if not settings.ES_LIVE_INDEXING:
            return

        if es is None:
            # Use es_utils.get_indexing_es() because it uses
            # ES_INDEXING_TIMEOUT.
            es = es_utils.get_indexing_es()

        try:
            es.delete(es_utils.WRITE_INDEX, es_utils.SUMO_DOCTYPE,
                      cls.get_document_id(id_))

        except pyes.exceptions.NotFoundException:
            # Ignore the case where we try to delete something that's
            # not there.
            pass
Example #5
    @classmethod
    def unindex(cls, id_, es=None):
        """Removes a document from the index"""
        if not settings.ES_LIVE_INDEXING:
            return

        if es is None:
            # Use es_utils.get_indexing_es() because it uses
            # ES_INDEXING_TIMEOUT.
            es = es_utils.get_indexing_es()

        try:
            es.delete(es_utils.WRITE_INDEX, es_utils.SUMO_DOCTYPE,
                      cls.get_document_id(id_))

            # Refresh after the delete, but only if the delete was
            # successful.
            es.refresh(es_utils.WRITE_INDEX, timesleep=0)
        except pyes.exceptions.NotFoundException:
            # Ignore the case where we try to delete something that's
            # not there.
            pass
Example #6
    @classmethod
    def index(cls, document, bulk=False, force_insert=False, refresh=False,
              es=None):
        """Indexes a single document"""
        if not settings.ES_LIVE_INDEXING:
            return

        if es is None:
            # Use es_utils.get_indexing_es() because it uses
            # ES_INDEXING_TIMEOUT.
            es = es_utils.get_indexing_es()

        index = settings.ES_WRITE_INDEXES['default']
        doc_type = cls._meta.db_table

        es.index(document,
                 index=index,
                 doc_type=doc_type,
                 id=document['id'],
                 bulk=bulk,
                 force_insert=force_insert)

        if refresh:
            es.refresh(timesleep=0)
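This variant targets settings.ES_WRITE_INDEXES instead of the shared
WRITE_INDEX constant. For reference, these are the Django settings the
excerpts touch, with purely illustrative values:

# Illustrative values only; the setting names are the ones the
# excerpts reference, but the real values live in project settings.
ES_LIVE_INDEXING = True                       # master switch for live index writes
ES_WRITE_INDEXES = {'default': 'sumo_write'}  # write-side index names
ES_INDEXES = {'default': 'sumo'}              # index names used by the final index_all example
ES_INDEXING_TIMEOUT = 30                      # seconds; honored by get_indexing_es()
ES_FLUSH_BULK_EVERY = 100                     # bulk flush cadence in index_all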
Example #7
    @classmethod
    def index(cls,
              document,
              bulk=False,
              force_insert=False,
              refresh=False,
              es=None):
        """Indexes a single document"""
        if not settings.ES_LIVE_INDEXING:
            return

        if es is None:
            # Use es_utils.get_indexing_es() because it uses
            # ES_INDEXING_TIMEOUT.
            es = es_utils.get_indexing_es()

        es.index(document,
                 index=es_utils.WRITE_INDEX,
                 doc_type=es_utils.SUMO_DOCTYPE,
                 id=cls.get_document_id(document['id']),
                 bulk=bulk,
                 force_insert=force_insert)

        if refresh:
            es.refresh(es_utils.WRITE_INDEX, timesleep=0)
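Callers that need the write to be searchable immediately can pass
refresh=True. A hedged usage sketch; q stands in for a Question
instance, and extract_document is assumed from the index_all examples
below:

# Hedged usage sketch; q is a hypothetical Question instance.
Question.index(Question.extract_document(q.id), refresh=True)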
Example #8
    @classmethod
    def index_all(cls, percent=100):
        """Reindexes all the objects for this model.

        Yields the number of documents processed so far.

        Note: This can get run from the command line, so we log stuff
        to let the user know what's going on.

        :arg percent: The percentage of objects to index. Defaults to
            100, i.e. all of them.

        """
        es = es_utils.get_indexing_es()

        doc_type = cls._meta.db_table
        index = settings.ES_WRITE_INDEXES['default']

        start_time = time.time()

        indexable_qs = cls.get_indexable()

        log.info('reindex %s into %s index', doc_type, index)

        log.info('iterating through %s....', doc_type)
        total = indexable_qs.count()
        to_index = int(total * (percent / 100.0))
        log.info('total %s: %s (to be indexed: %s)', doc_type, total, to_index)
        if to_index == 0:
            log.info('done!')
            return

        total = to_index

        for t, obj_id in enumerate(indexable_qs):
            # >= (not >) so a percent-limited run stops after exactly
            # to_index documents.
            if t >= total:
                break

            if t % 1000 == 0 and t > 0:
                time_to_go = (total - t) * ((time.time() - start_time) / t)
                per_1000 = (time.time() - start_time) / (t / 1000.0)
                log.info('%s/%s... (%s to go, %s per 1000 docs)', t, total,
                         es_utils.format_time(time_to_go),
                         es_utils.format_time(per_1000))

                # We call this every 1000 or so because we're
                # essentially loading the whole db and if DEBUG=True,
                # then Django saves every sql statement which causes
                # our memory to go up up up. So we reset it and that
                # makes things happier even in DEBUG environments.
                reset_queries()

            if t % settings.ES_FLUSH_BULK_EVERY == 0:
                # We built the ES with this setting, but it doesn't
                # actually do anything with it unless we call
                # flush_bulk which causes it to check its bulk_size
                # and flush it if it's too big.
                es.flush_bulk()

            try:
                cls.index(cls.extract_document(obj_id), bulk=True, es=es)
            except Exception:
                log.exception('Unable to extract/index document (id: %d)',
                              obj_id)

            yield t

        es.flush_bulk(forced=True)
        delta_time = time.time() - start_time
        log.info('done! (%s, %s per 1000 docs)',
                 es_utils.format_time(delta_time),
                 es_utils.format_time(delta_time / (total / 1000.0)))
        es.refresh()
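Because index_all is a generator, nothing is indexed until a caller
drives it. A minimal consumer, e.g. from a management command, simply
exhausts it; the percent value here is illustrative:

# Drive the generator to completion; progress is logged inside
# index_all every 1000 documents.
for _ in Question.index_all(percent=100):
    pass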
Example #9
    @classmethod
    def index_all(cls, percent=100):
        """Reindexes all the objects for this model.

        Yields the number of documents processed so far.

        Note: This can get run from the command line, so we log stuff
        to let the user know what's going on.

        :arg percent: The percentage of objects to index. Defaults to
            100, i.e. all of them.

        """
        es = es_utils.get_indexing_es()

        doc_type = cls._meta.db_table
        index = settings.ES_INDEXES['default']

        start_time = time.time()

        log.info('reindex %s into %s index', doc_type, index)

        log.info('iterating through %s....', doc_type)
        total = cls.objects.count()
        to_index = int(total * (percent / 100.0))
        log.info('total %s: %s (to be indexed: %s)', doc_type, total, to_index)
        total = to_index

        # Some models have a gazillion instances. So we want to go
        # through them one at a time in a way that doesn't pull all
        # the data into memory all at once. So we iterate through ids
        # and pull the objects one at a time.
        qs = cls.objects.order_by('id').values_list('id', flat=True)

        for t, obj_id in enumerate(qs.iterator()):
            # >= (not >) so the run stops after exactly to_index
            # documents (and skips indexing entirely when it's 0).
            if t >= total:
                break

            obj = cls.objects.get(pk=obj_id)

            if t % 1000 == 0 and t > 0:
                time_to_go = (total - t) * ((time.time() - start_time) / t)
                log.info('%s/%s... (%s to go)', t, total,
                         es_utils.format_time(time_to_go))

                # We call this every 1000 or so because we're
                # essentially loading the whole db and if DEBUG=True,
                # then Django saves every sql statement which causes
                # our memory to go up up up. So we reset it and that
                # makes things happier even in DEBUG environments.
                reset_queries()

            if t % settings.ES_FLUSH_BULK_EVERY == 0:
                # We built the ES with this setting, but it doesn't
                # actually do anything with it unless we call
                # flush_bulk which causes it to check its bulk_size
                # and flush it if it's too big.
                es.flush_bulk()

            try:
                cls.index(obj.extract_document(), bulk=True, es=es)
            except Exception:
                log.exception('Unable to extract/index document (id: %d)',
                              obj.id)

            yield t

        es.flush_bulk(forced=True)
        end_time = time.time()
        log.info('done! (%s)', es_utils.format_time(end_time - start_time))
        es.refresh()
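This second index_all variant keeps memory flat by fetching one object
per id, at the cost of one query per row. A hedged alternative batches
the fetches with Django's in_bulk() over id chunks, reusing a chunked
helper like the one sketched after the first example; whether the extra
complexity pays off depends on the database round-trip cost:

def iter_indexable(model, chunk_size=1000):
    # Stream the ids first (cheap), then fetch rows chunk_size at a
    # time with in_bulk(): one query per chunk instead of one per row.
    ids = model.objects.order_by('id').values_list('id', flat=True)
    for chunk in chunked(list(ids), chunk_size):
        for obj in model.objects.in_bulk(chunk).values():
            yield obj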