Example #1
    def test_chunked(self):
        # chunking nothing yields nothing.
        eq_(list(chunked([], 1)), [])

        # chunking list where len(list) < n
        eq_(list(chunked([1], 10)), [(1, )])

        # chunking a list where len(list) == n
        eq_(list(chunked([1, 2], 2)), [(1, 2)])

        # chunking list where len(list) > n
        eq_(list(chunked([1, 2, 3, 4, 5], 2)), [(1, 2), (3, 4), (5, )])
Example #2
    def test_chunked(self):
        # chunking nothing yields nothing.
        eq_(list(chunked([], 1)), [])

        # chunking list where len(list) < n
        eq_(list(chunked([1], 10)), [(1,)])

        # chunking a list where len(list) == n
        eq_(list(chunked([1, 2], 2)), [(1, 2)])

        # chunking list where len(list) > n
        eq_(list(chunked([1, 2, 3, 4, 5], 2)),
            [(1, 2), (3, 4), (5,)])
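
The tests above pin down chunked()'s contract: it walks the input in order and yields tuples of at most n items, with a shorter final tuple when the length is not a multiple of n. For reference, a minimal sketch consistent with that behavior could look like the following; this is an illustrative assumption, not necessarily the actual kitsune.search.utils implementation.

# Minimal sketch of a chunked() helper matching the tests above.
# Assumption for illustration only; the real kitsune.search.utils
# version may be implemented differently.
from itertools import islice

def chunked(iterable, n):
    """Yield successive tuples of at most n items from iterable."""
    iterator = iter(iterable)
    while True:
        chunk = tuple(islice(iterator, n))
        if not chunk:
            return
        yield chunk

With that sketch, list(chunked([1, 2, 3, 4, 5], 2)) evaluates to [(1, 2), (3, 4), (5,)], matching the last assertion in the tests.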
Example #3
def reindex(mapping_type_names):
    """Reindex all instances of a given mapping type with celery tasks

    :arg mapping_type_names: list of mapping types to reindex

    """
    outstanding = Record.objects.outstanding().count()
    if outstanding > 0:
        raise ReindexError('There are %s outstanding chunks.' % outstanding)

    batch_id = create_batch_id()

    # Break up all the things we want to index into chunks. This
    # chunkifies by class then by chunk size.
    chunks = []
    for cls, indexable in get_indexable(mapping_types=mapping_type_names):
        chunks.extend((cls, chunk) for chunk in chunked(indexable, CHUNK_SIZE))

    for cls, id_list in chunks:
        index = cls.get_index()
        chunk_name = 'Indexing: %s %d -> %d' % (cls.get_mapping_type_name(),
                                                id_list[0], id_list[-1])
        rec = Record.objects.create(batch_id=batch_id, name=chunk_name)
        index_chunk_task.delay(index, batch_id, rec.id,
                               (to_class_path(cls), id_list))
Example #4
    def handle(self, **options):
        # Set up logging so it doesn't send Ricky email.
        logging.basicConfig(level=logging.ERROR)

        # Get a list of ids of questions we're going to go change. We need
        # a list of ids so that we can feed it to the update, but then
        # also know what we need to update in the index.
        days_180 = datetime.now() - timedelta(days=180)
        q_ids = list(
            Question.objects.filter(is_archived=False).filter(
                created__lte=days_180).values_list("id", flat=True))

        if q_ids:
            log.info("Updating %d questions", len(q_ids))

            sql = """
                UPDATE questions_question
                SET is_archived = 1
                WHERE id IN (%s)
                """ % ",".join(map(str, q_ids))

            cursor = connection.cursor()
            cursor.execute(sql)
            if not transaction.get_connection().in_atomic_block:
                transaction.commit()

            if settings.ES_LIVE_INDEXING:
                try:
                    # So... the first time this runs, it'll handle 160K
                    # questions or so which stresses everything. Thus we
                    # do it in chunks because otherwise this won't work.
                    #
                    # After we've done this for the first time, we can nix
                    # the chunking code.

                    from kitsune.search.utils import chunked

                    for chunk in chunked(q_ids, 100):

                        # Fetch all the documents we need to update.
                        es_docs = get_documents(QuestionMappingType, chunk)

                        log.info("Updating %d index documents", len(es_docs))

                        documents = []

                        # For each document, update the data and stick it
                        # back in the index.
                        for doc in es_docs:
                            doc["question_is_archived"] = True
                            doc["indexed_on"] = int(time.time())
                            documents.append(doc)

                        QuestionMappingType.bulk_index(documents)

                except ES_EXCEPTIONS:
                    # Something happened with ES, so let's push index
                    # updating into an index_task which retries when it
                    # fails because of ES issues.
                    index_task.delay(to_class_path(QuestionMappingType), q_ids)
Example #5
    def handle(self, **options):
        # Set up logging so it doesn't send Ricky email.
        logging.basicConfig(level=logging.ERROR)

        # Get a list of ids of questions we're going to go change. We need
        # a list of ids so that we can feed it to the update, but then
        # also know what we need to update in the index.
        days_180 = datetime.now() - timedelta(days=180)
        q_ids = list(
            Question.objects.filter(is_archived=False)
            .filter(created__lte=days_180)
            .values_list('id', flat=True))

        if q_ids:
            log.info('Updating %d questions', len(q_ids))

            sql = """
                UPDATE questions_question
                SET is_archived = 1
                WHERE id IN (%s)
                """ % ','.join(map(str, q_ids))

            cursor = connection.cursor()
            cursor.execute(sql)
            if not transaction.get_connection().in_atomic_block:
                transaction.commit()

            if settings.ES_LIVE_INDEXING:
                try:
                    # So... the first time this runs, it'll handle 160K
                    # questions or so which stresses everything. Thus we
                    # do it in chunks because otherwise this won't work.
                    #
                    # After we've done this for the first time, we can nix
                    # the chunking code.

                    from kitsune.search.utils import chunked
                    for chunk in chunked(q_ids, 100):

                        # Fetch all the documents we need to update.
                        es_docs = get_documents(QuestionMappingType, chunk)

                        log.info('Updating %d index documents', len(es_docs))

                        documents = []

                        # For each document, update the data and stick it
                        # back in the index.
                        for doc in es_docs:
                            doc[u'question_is_archived'] = True
                            doc[u'indexed_on'] = int(time.time())
                            documents.append(doc)

                        QuestionMappingType.bulk_index(documents)

                except ES_EXCEPTIONS:
                    # Something happened with ES, so let's push index
                    # updating into an index_task which retries when it
                    # fails because of ES issues.
                    index_task.delay(QuestionMappingType, q_ids)
Example #6
def reindex(mapping_type_names):
    """Reindex all instances of a given mapping type with celery tasks

    :arg mapping_type_names: list of mapping types to reindex

    """
    outstanding = Record.objects.outstanding().count()
    if outstanding > 0:
        raise ReindexError('There are %s outstanding chunks.' % outstanding)

    batch_id = create_batch_id()

    # Break up all the things we want to index into chunks. This
    # chunkifies by class then by chunk size.
    chunks = []
    for cls, indexable in get_indexable(mapping_types=mapping_type_names):
        chunks.extend(
            (cls, chunk) for chunk in chunked(indexable, CHUNK_SIZE))

    for cls, id_list in chunks:
        index = cls.get_index()
        chunk_name = 'Indexing: %s %d -> %d' % (
            cls.get_mapping_type_name(), id_list[0], id_list[-1])
        rec = Record.objects.create(batch_id=batch_id, name=chunk_name)
        index_chunk_task.delay(index, batch_id, rec.id, (to_class_path(cls), id_list))
Example #7
def reindex_with_scoreboard(mapping_type_names):
    """Reindex all instances of a given mapping type with celery tasks.

    This will use Redis to keep track of outstanding tasks so nothing
    gets screwed up by two jobs running at once.
    """
    # TODO: If this gets fux0rd, then it's possible this could be
    # non-zero and we really want to just ignore it. Need the ability
    # to ignore it.
    try:
        client = redis_client('default')
        val = client.get(OUTSTANDING_INDEX_CHUNKS)
        if val is not None and int(val) > 0:
            raise ReindexError('There are %s outstanding chunks.' % val)

        # We don't know how many chunks we're building, but we do want
        # to make sure another reindex request doesn't slide in here
        # and kick off a bunch of chunks.
        #
        # There is a race condition here.
        client.set(OUTSTANDING_INDEX_CHUNKS, 1)
    except RedisError:
        log.warning('Redis not running. Can not check if there are '
                    'outstanding tasks.')

    batch_id = create_batch_id()

    # Break up all the things we want to index into chunks. This
    # chunkifies by class then by chunk size. Also generate
    # reconcile_tasks.
    chunks = []
    for cls, indexable in get_indexable(mapping_types=mapping_type_names):
        chunks.extend(
            (cls, chunk) for chunk in chunked(indexable, CHUNK_SIZE))

        reconcile_task.delay(cls.get_index(), batch_id,
                             cls.get_mapping_type_name())

    chunks_count = len(chunks)

    try:
        client = redis_client('default')
        client.set(OUTSTANDING_INDEX_CHUNKS, chunks_count)
    except RedisError:
        log.warning('Redis not running. Can\'t denote outstanding tasks.')

    for chunk in chunks:
        index = chunk[0].get_index()
        index_chunk_task.delay(index, batch_id, chunk)
Example #8
def reindex_with_scoreboard(mapping_type_names):
    """Reindex all instances of a given mapping type with celery tasks.

    This will use Redis to keep track of outstanding tasks so nothing
    gets screwed up by two jobs running at once.
    """
    # TODO: If this gets fux0rd, then it's possible this could be
    # non-zero and we really want to just ignore it. Need the ability
    # to ignore it.
    try:
        client = redis_client('default')
        val = client.get(OUTSTANDING_INDEX_CHUNKS)
        if val is not None and int(val) > 0:
            raise ReindexError('There are %s outstanding chunks.' % val)

        # We don't know how many chunks we're building, but we do want
        # to make sure another reindex request doesn't slide in here
        # and kick off a bunch of chunks.
        #
        # There is a race condition here.
        client.set(OUTSTANDING_INDEX_CHUNKS, 1)
    except RedisError:
        log.warning('Redis not running. Can not check if there are '
                    'outstanding tasks.')

    batch_id = create_batch_id()

    # Break up all the things we want to index into chunks. This
    # chunkifies by class then by chunk size. Also generate
    # reconcile_tasks.
    chunks = []
    for cls, indexable in get_indexable(mapping_types=mapping_type_names):
        chunks.extend((cls, chunk) for chunk in chunked(indexable, CHUNK_SIZE))

        reconcile_task.delay(cls.get_index(), batch_id,
                             cls.get_mapping_type_name())

    chunks_count = len(chunks)

    try:
        client = redis_client('default')
        client.set(OUTSTANDING_INDEX_CHUNKS, chunks_count)
    except RedisError:
        log.warning('Redis not running. Can\'t denote outstanding tasks.')

    for chunk in chunks:
        index = chunk[0].get_index()
        index_chunk_task.delay(index, batch_id, chunk)
Example #9
def index_chunk(cls, id_list, reraise=False):
    """Index a chunk of documents.

    :arg cls: The MappingType class.
    :arg id_list: Iterable of ids of that MappingType to index.
    :arg reraise: False if you want errors to be swallowed and True
        if you want errors to be thrown.

    """
    # Note: This bulk indexes in batches of 80. I didn't arrive at
    # this number through a proper scientific method. It's possible
    # there's a better number. It takes a while to fiddle with,
    # though. Probably best to expose the number as an environment
    # variable, then run a script that takes timings for
    # --criticalmass, runs overnight and returns a more "optimal"
    # number.
    for ids in chunked(id_list, 80):
        documents = []
        for id_ in ids:
            try:
                documents.append(cls.extract_document(id_))

            except UnindexMeBro:
                # extract_document throws this in cases where we need
                # to remove the item from the index.
                cls.unindex(id_)

            except Exception:
                log.exception('Unable to extract/index document (id: %d)',
                              id_)
                if reraise:
                    raise

        if documents:
            cls.bulk_index(documents, id_field='id')

        if settings.DEBUG:
            # Nix queries so that this doesn't become a complete
            # memory hog and make Will's computer sad when DEBUG=True.
            reset_queries()
Example #10
def index_chunk(cls, id_list, reraise=False):
    """Index a chunk of documents.

    :arg cls: The MappingType class.
    :arg id_list: Iterable of ids of that MappingType to index.
    :arg reraise: False if you want errors to be swallowed and True
        if you want errors to be thrown.

    """
    # Note: This bulk indexes in batches of 80. I didn't arrive at
    # this number through a proper scientific method. It's possible
    # there's a better number. It takes a while to fiddle with,
    # though. Probably best to expose the number as an environment
    # variable, then run a script that takes timings for
    # --criticalmass, runs overnight and returns a more "optimal"
    # number.
    for ids in chunked(id_list, 80):
        documents = []
        for id_ in ids:
            try:
                documents.append(cls.extract_document(id_))

            except UnindexMeBro:
                # extract_document throws this in cases where we need
                # to remove the item from the index.
                cls.unindex(id_)

            except Exception:
                log.exception('Unable to extract/index document (id: %d)',
                              id_)
                if reraise:
                    raise

        if documents:
            cls.bulk_index(documents, id_field='id')

        if settings.DEBUG:
            # Nix queries so that this doesn't become a complete
            # memory hog and make Will's computer sad when DEBUG=True.
            reset_queries()
Example #11
def handle_reindex(request):
    """Caculates and kicks off indexing tasks"""
    # This is truthy if the user wants us to delete and recreate
    # the index first.
    delete_index_first = bool(request.POST.get('delete_index'))

    if delete_index_first:
        # Coming from the delete form, so we reindex all models.
        mapping_types_to_index = None
    else:
        # Coming from the reindex form, so we reindex whatever we're
        # told.
        mapping_types_to_index = [name.replace('check_', '')
                                  for name in request.POST.keys()
                                  if name.startswith('check_')]

    # TODO: If this gets fux0rd, then it's possible this could be
    # non-zero and we really want to just ignore it. Need the ability
    # to ignore it.
    try:
        client = redis_client('default')
        val = client.get(OUTSTANDING_INDEX_CHUNKS)
        if val is not None and int(val) > 0:
            raise ReindexError('There are %s outstanding chunks.' % val)

        # We don't know how many chunks we're building, but we do want
        # to make sure another reindex request doesn't slide in here
        # and kick off a bunch of chunks.
        #
        # There is a race condition here.
        client.set(OUTSTANDING_INDEX_CHUNKS, 1)
    except RedisError:
        log.warning('Redis not running. Can not check if there are '
                    'outstanding tasks.')

    batch_id = create_batch_id()

    # Break up all the things we want to index into chunks. This
    # chunkifies by class then by chunk size.
    chunks = []
    for cls, indexable in get_indexable(mapping_types=mapping_types_to_index):
        chunks.extend(
            (cls, chunk) for chunk in chunked(indexable, CHUNK_SIZE))

    if delete_index_first:
        # The previous lines do a lot of work and take some time to
        # execute.  So we wait until here to wipe and rebuild the
        # index. That reduces the time that there is no index by a little.
        recreate_index()

    chunks_count = len(chunks)

    try:
        client = redis_client('default')
        client.set(OUTSTANDING_INDEX_CHUNKS, chunks_count)
    except RedisError:
        log.warning('Redis not running. Can\'t denote outstanding tasks.')

    for chunk in chunks:
        index_chunk_task.delay(write_index(), batch_id, chunk)

    return HttpResponseRedirect(request.path)
Example #12
def es_status_cmd(checkindex=False, log=log):
    """Shows elastic search index status"""
    try:
        # TODO: SUMO has a single ES_URL and that's the ZLB and does
        # the balancing. If that ever changes and we have multiple
        # ES_URLs, then this should get fixed.
        es_deets = requests.get(settings.ES_URLS[0]).json()
    except requests.exceptions.RequestException:
        pass

    read_doctype_stats = {}
    for index in all_read_indexes():
        try:
            read_doctype_stats[index] = get_doctype_stats(index)
        except ES_EXCEPTIONS:
            read_doctype_stats[index] = None

    if set(all_read_indexes()) == set(all_write_indexes()):
        write_doctype_stats = read_doctype_stats
    else:
        write_doctype_stats = {}
        for index in all_write_indexes():
            try:
                write_doctype_stats[index] = get_doctype_stats(index)
            except ES_EXCEPTIONS:
                write_doctype_stats[index] = None

    try:
        indexes = get_indexes(all_indexes=True)
    except ES_EXCEPTIONS:
        log.error('Your elasticsearch process is not running or ES_URLS '
                  'is set wrong in your settings_local.py file.')
        return

    log.info('Elasticsearch:')
    log.info('  Version                 : %s', es_deets['version']['number'])

    log.info('Settings:')
    log.info('  ES_URLS                 : %s', settings.ES_URLS)
    log.info('  ES_INDEX_PREFIX         : %s', settings.ES_INDEX_PREFIX)
    log.info('  ES_LIVE_INDEXING        : %s', settings.ES_LIVE_INDEXING)
    log.info('  ES_INDEXES              : %s', settings.ES_INDEXES)
    log.info('  ES_WRITE_INDEXES        : %s', settings.ES_WRITE_INDEXES)

    log.info('Index stats:')

    if indexes:
        log.info('  List of indexes:')
        for name, count in sorted(indexes):
            read_write = []
            if name in all_read_indexes():
                read_write.append('READ')
            if name in all_write_indexes():
                read_write.append('WRITE')
            log.info('    %-22s: %s %s', name, count, '/'.join(read_write))
    else:
        log.info('  There are no %s indexes.', settings.ES_INDEX_PREFIX)

    if not read_doctype_stats:
        read_index_names = ', '.join(all_read_indexes())
        log.info('  No read indexes exist. (%s)', read_index_names)
    else:
        log.info('  Read indexes:')
        for index, stats in read_doctype_stats.items():
            if stats is None:
                log.info('    %s does not exist', index)
            else:
                log.info('    %s:', index)
                for name, count in sorted(stats.items()):
                    log.info('      %-22s: %d', name, count)

    if set(all_read_indexes()) == set(all_write_indexes()):
        log.info('  Write indexes are the same as the read indexes.')
    else:
        if not write_doctype_stats:
            write_index_names = ', '.join(all_write_indexes())
            log.info('  No write indexes exist. (%s)', write_index_names)
        else:
            log.info('  Write indexes:')
            for index, stats in write_doctype_stats.items():
                if stats is None:
                    log.info('    %s does not exist', index)
                else:
                    log.info('    %s:', index)
                    for name, count in sorted(stats.items()):
                        log.info('      %-22s: %d', name, count)

    if checkindex:
        # Go through the index and verify everything
        log.info('Checking index contents....')

        missing_docs = 0

        for cls, id_list in get_indexable():
            for id_group in chunked(id_list, 100):
                doc_list = get_documents(cls, id_group)
                if len(id_group) != len(doc_list):
                    doc_list_ids = [doc['id'] for doc in doc_list]
                    for id_ in id_group:
                        if id_ not in doc_list_ids:
                            log.info('   Missing %s %s', cls.get_model_name(),
                                     id_)
                            missing_docs += 1

        if missing_docs:
            print('There were %d missing_docs' % missing_docs)
Example #13
def es_reindex_cmd(percent=100,
                   delete=False,
                   mapping_types=None,
                   criticalmass=False,
                   log=log):
    """Rebuild ElasticSearch indexes

    :arg percent: 1 to 100--the percentage of the db to index
    :arg delete: whether or not to wipe the index before reindexing
    :arg mapping_types: list of mapping types to index
    :arg criticalmass: whether or not to index just a critical mass of
        things
    :arg log: the logger to use

    """
    es = get_es()

    if mapping_types is None:
        indexes = all_write_indexes()
    else:
        indexes = indexes_for_doctypes(mapping_types)

    need_delete = False
    for index in indexes:
        try:
            # This is used to see if the index exists.
            get_doctype_stats(index)
        except ES_EXCEPTIONS:
            if not delete:
                log.error('The index "%s" does not exist. '
                          'You must specify --delete.' % index)
                need_delete = True
    if need_delete:
        return

    if delete:
        log.info('wiping and recreating %s...', ', '.join(indexes))
        recreate_indexes(es, indexes)

    if criticalmass:
        # The critical mass is defined as the entire KB plus the most
        # recent 15k questions (which is about how many questions
        # there were created in the last 180 days). We build that
        # indexable here.

        # Get only questions and wiki document stuff.
        all_indexable = get_indexable(
            mapping_types=['questions_question', 'wiki_document'])

        # The first item is questions because we specified that
        # order. Old questions don't show up in searches, so we nix
        # them by reversing the list (ordered by id ascending) and
        # slicing it.
        all_indexable[0] = (all_indexable[0][0],
                            list(reversed(all_indexable[0][1]))[:15000])

    elif mapping_types:
        all_indexable = get_indexable(percent, mapping_types)

    else:
        all_indexable = get_indexable(percent)

    try:
        old_refreshes = {}
        # We're doing a lot of indexing, so we get the refresh_interval of
        # the index currently, then nix refreshing. Later we'll restore it.
        for index in indexes:
            old_refreshes[index] = (get_index_settings(index).get(
                'index.refresh_interval', '1s'))
            # Disable automatic refreshing
            es.indices.put_settings(index=index,
                                    body={'index': {
                                        'refresh_interval': '-1'
                                    }})

        start_time = time.time()
        for cls, indexable in all_indexable:
            cls_start_time = time.time()
            total = len(indexable)

            if total == 0:
                continue

            chunk_start_time = time.time()
            log.info('reindexing %s. %s to index....',
                     cls.get_mapping_type_name(), total)

            i = 0
            for chunk in chunked(indexable, 1000):
                chunk_start_time = time.time()
                index_chunk(cls, chunk)

                i += len(chunk)
                time_to_go = (total - i) * ((time.time() - cls_start_time) / i)
                per_1000 = (time.time() - cls_start_time) / (i / 1000.0)
                this_1000 = time.time() - chunk_start_time

                log.info('   %s/%s %s... (%s/1000 avg, %s ETA)', i, total,
                         format_time(this_1000), format_time(per_1000),
                         format_time(time_to_go))

            delta_time = time.time() - cls_start_time
            log.info('   done! (%s total, %s/1000 avg)',
                     format_time(delta_time),
                     format_time(delta_time / (total / 1000.0)))

        delta_time = time.time() - start_time
        log.info('done! (%s total)', format_time(delta_time))

    finally:
        # Re-enable automatic refreshing
        for index, old_refresh in old_refreshes.items():
            es.indices.put_settings(
                index=index, body={'index': {
                    'refresh_interval': old_refresh
                }})
Example #14
def es_status_cmd(checkindex=False, log=log):
    """Shows elastic search index status"""
    try:
        # TODO: SUMO has a single ES_URL and that's the ZLB and does
        # the balancing. If that ever changes and we have multiple
        # ES_URLs, then this should get fixed.
        es_deets = requests.get(settings.ES_URLS[0]).json()
    except requests.exceptions.RequestException:
        pass

    read_doctype_stats = {}
    for index in all_read_indexes():
        try:
            read_doctype_stats[index] = get_doctype_stats(index)
        except ES_EXCEPTIONS:
            read_doctype_stats[index] = None

    if set(all_read_indexes()) == set(all_write_indexes()):
        write_doctype_stats = read_doctype_stats
    else:
        write_doctype_stats = {}
        for index in all_write_indexes():
            try:
                write_doctype_stats[index] = get_doctype_stats(index)
            except ES_EXCEPTIONS:
                write_doctype_stats[index] = None

    try:
        indexes = get_indexes(all_indexes=True)
    except ES_EXCEPTIONS:
        log.error("Your elasticsearch process is not running or ES_URLS "
                  "is set wrong in your settings_local.py file.")
        return

    log.info("Elasticsearch:")
    log.info("  Version                 : %s", es_deets["version"]["number"])

    log.info("Settings:")
    log.info("  ES_URLS                 : %s", settings.ES_URLS)
    log.info("  ES_INDEX_PREFIX         : %s", settings.ES_INDEX_PREFIX)
    log.info("  ES_LIVE_INDEXING        : %s", settings.ES_LIVE_INDEXING)
    log.info("  ES_INDEXES              : %s", settings.ES_INDEXES)
    log.info("  ES_WRITE_INDEXES        : %s", settings.ES_WRITE_INDEXES)

    log.info("Index stats:")

    if indexes:
        log.info("  List of indexes:")
        for name, count in sorted(indexes):
            read_write = []
            if name in all_read_indexes():
                read_write.append("READ")
            if name in all_write_indexes():
                read_write.append("WRITE")
            log.info("    %-22s: %s %s", name, count, "/".join(read_write))
    else:
        log.info("  There are no %s indexes.", settings.ES_INDEX_PREFIX)

    if not read_doctype_stats:
        read_index_names = ", ".join(all_read_indexes())
        log.info("  No read indexes exist. (%s)", read_index_names)
    else:
        log.info("  Read indexes:")
        for index, stats in list(read_doctype_stats.items()):
            if stats is None:
                log.info("    %s does not exist", index)
            else:
                log.info("    %s:", index)
                for name, count in sorted(stats.items()):
                    log.info("      %-22s: %d", name, count)

    if set(all_read_indexes()) == set(all_write_indexes()):
        log.info("  Write indexes are the same as the read indexes.")
    else:
        if not write_doctype_stats:
            write_index_names = ", ".join(all_write_indexes())
            log.info("  No write indexes exist. (%s)", write_index_names)
        else:
            log.info("  Write indexes:")
            for index, stats in list(write_doctype_stats.items()):
                if stats is None:
                    log.info("    %s does not exist", index)
                else:
                    log.info("    %s:", index)
                    for name, count in sorted(stats.items()):
                        log.info("      %-22s: %d", name, count)

    if checkindex:
        # Go through the index and verify everything
        log.info("Checking index contents....")

        missing_docs = 0

        for cls, id_list in get_indexable():
            for id_group in chunked(id_list, 100):
                doc_list = get_documents(cls, id_group)
                if len(id_group) != len(doc_list):
                    doc_list_ids = [doc["id"] for doc in doc_list]
                    for id_ in id_group:
                        if id_ not in doc_list_ids:
                            log.info("   Missing %s %s", cls.get_model_name(),
                                     id_)
                            missing_docs += 1

        if missing_docs:
            print("There were %d missing_docs" % missing_docs)
Example #15
def es_status_cmd(checkindex=False, log=log):
    """Shows elastic search index status"""
    try:
        # TODO: SUMO has a single ES_URL and that's the ZLB and does
        # the balancing. If that ever changes and we have multiple
        # ES_URLs, then this should get fixed.
        es_deets = requests.get(settings.ES_URLS[0]).json()
    except requests.exceptions.RequestException:
        pass

    read_doctype_stats = {}
    for index in all_read_indexes():
        try:
            read_doctype_stats[index] = get_doctype_stats(index)
        except ES_EXCEPTIONS:
            read_doctype_stats[index] = None

    if set(all_read_indexes()) == set(all_write_indexes()):
        write_doctype_stats = read_doctype_stats
    else:
        write_doctype_stats = {}
        for index in all_write_indexes():
            try:
                write_doctype_stats[index] = get_doctype_stats(index)
            except ES_EXCEPTIONS:
                write_doctype_stats[index] = None

    try:
        indexes = get_indexes(all_indexes=True)
    except ES_EXCEPTIONS:
        log.error('Your elasticsearch process is not running or ES_URLS '
                  'is set wrong in your settings_local.py file.')
        return

    log.info('Elasticsearch:')
    log.info('  Version                 : %s', es_deets['version']['number'])

    log.info('Settings:')
    log.info('  ES_URLS                 : %s', settings.ES_URLS)
    log.info('  ES_INDEX_PREFIX         : %s', settings.ES_INDEX_PREFIX)
    log.info('  ES_LIVE_INDEXING        : %s', settings.ES_LIVE_INDEXING)
    log.info('  ES_INDEXES              : %s', settings.ES_INDEXES)
    log.info('  ES_WRITE_INDEXES        : %s', settings.ES_WRITE_INDEXES)

    log.info('Index stats:')

    if indexes:
        log.info('  List of indexes:')
        for name, count in sorted(indexes):
            read_write = []
            if name in all_read_indexes():
                read_write.append('READ')
            if name in all_write_indexes():
                read_write.append('WRITE')
            log.info('    %-22s: %s %s', name, count,
                     '/'.join(read_write))
    else:
        log.info('  There are no %s indexes.', settings.ES_INDEX_PREFIX)

    if not read_doctype_stats:
        read_index_names = ', '.join(all_read_indexes())
        log.info('  No read indexes exist. (%s)', read_index_names)
    else:
        log.info('  Read indexes:')
        for index, stats in read_doctype_stats.items():
            if stats is None:
                log.info('    %s does not exist', index)
            else:
                log.info('    %s:', index)
                for name, count in sorted(stats.items()):
                    log.info('      %-22s: %d', name, count)

    if set(all_read_indexes()) == set(all_write_indexes()):
        log.info('  Write indexes are the same as the read indexes.')
    else:
        if not write_doctype_stats:
            write_index_names = ', '.join(all_write_indexes())
            log.info('  No write indexes exist. (%s)', write_index_names)
        else:
            log.info('  Write indexes:')
            for index, stats in write_doctype_stats.items():
                if stats is None:
                    log.info('    %s does not exist', index)
                else:
                    log.info('    %s:', index)
                    for name, count in sorted(stats.items()):
                        log.info('      %-22s: %d', name, count)

    if checkindex:
        # Go through the index and verify everything
        log.info('Checking index contents....')

        missing_docs = 0

        for cls, id_list in get_indexable():
            for id_group in chunked(id_list, 100):
                doc_list = get_documents(cls, id_group)
                if len(id_group) != len(doc_list):
                    doc_list_ids = [doc['id'] for doc in doc_list]
                    for id_ in id_group:
                        if id_ not in doc_list_ids:
                            log.info('   Missing %s %s',
                                     cls.get_model_name(), id_)
                            missing_docs += 1

        if missing_docs:
            print('There were %d missing_docs' % missing_docs)
Example #16
def es_reindex_cmd(percent=100, delete=False, mapping_types=None,
                   criticalmass=False, log=log):
    """Rebuild ElasticSearch indexes

    :arg percent: 1 to 100--the percentage of the db to index
    :arg delete: whether or not to wipe the index before reindexing
    :arg mapping_types: list of mapping types to index
    :arg criticalmass: whether or not to index just a critical mass of
        things
    :arg log: the logger to use

    """
    es = get_es()

    if mapping_types is None:
        indexes = all_write_indexes()
    else:
        indexes = indexes_for_doctypes(mapping_types)

    need_delete = False
    for index in indexes:
        try:
            # This is used to see if the index exists.
            get_doctype_stats(index)
        except ES_EXCEPTIONS:
            if not delete:
                log.error('The index "%s" does not exist. '
                          'You must specify --delete.' % index)
                need_delete = True
    if need_delete:
        return

    if delete:
        log.info('wiping and recreating %s...', ', '.join(indexes))
        recreate_indexes(es, indexes)

    if criticalmass:
        # The critical mass is defined as the entire KB plus the most
        # recent 15k questions (which is about how many questions
        # there were created in the last 180 days). We build that
        # indexable here.

        # Get only questions and wiki document stuff.
        all_indexable = get_indexable(
            mapping_types=['questions_question', 'wiki_document'])

        # The first item is questions because we specified that
        # order. Old questions don't show up in searches, so we nix
        # them by reversing the list (ordered by id ascending) and
        # slicing it.
        all_indexable[0] = (all_indexable[0][0],
                            list(reversed(all_indexable[0][1]))[:15000])

    elif mapping_types:
        all_indexable = get_indexable(percent, mapping_types)

    else:
        all_indexable = get_indexable(percent)

    try:
        old_refreshes = {}
        # We're doing a lot of indexing, so we get the refresh_interval of
        # the index currently, then nix refreshing. Later we'll restore it.
        for index in indexes:
            old_refreshes[index] = (get_index_settings(index)
                                    .get('index.refresh_interval', '1s'))
            # Disable automatic refreshing
            es.indices.put_settings(index=index,
                                    body={'index': {'refresh_interval': '-1'}})

        start_time = time.time()
        for cls, indexable in all_indexable:
            cls_start_time = time.time()
            total = len(indexable)

            if total == 0:
                continue

            chunk_start_time = time.time()
            log.info('reconciling %s: %s in db....',
                     cls.get_mapping_type_name(), total)
            ret = reconcile_chunk(cls, cls.get_indexable())
            log.info('   done! reconciled %s index documents (%s total)',
                     ret, format_time(time.time() - chunk_start_time))

            log.info('reindexing %s. %s to index....',
                     cls.get_mapping_type_name(), total)

            i = 0
            for chunk in chunked(indexable, 1000):
                chunk_start_time = time.time()
                index_chunk(cls, chunk)

                i += len(chunk)
                time_to_go = (total - i) * ((time.time() - cls_start_time) / i)
                per_1000 = (time.time() - cls_start_time) / (i / 1000.0)
                this_1000 = time.time() - chunk_start_time

                log.info('   %s/%s %s... (%s/1000 avg, %s ETA)',
                         i,
                         total,
                         format_time(this_1000),
                         format_time(per_1000),
                         format_time(time_to_go))

            delta_time = time.time() - cls_start_time
            log.info('   done! (%s total, %s/1000 avg)',
                     format_time(delta_time),
                     format_time(delta_time / (total / 1000.0)))

        delta_time = time.time() - start_time
        log.info('done! (%s total)', format_time(delta_time))

    finally:
        # Re-enable automatic refreshing
        for index, old_refresh in old_refreshes.items():
            es.indices.put_settings(
                index=index,
                body={'index': {'refresh_interval': old_refresh}})
Example #17
def handle_reindex(request):
    """Caculates and kicks off indexing tasks"""
    # This is truthy if the user wants us to delete and recreate
    # the index first.
    delete_index_first = bool(request.POST.get('delete_index'))

    if delete_index_first:
        # Coming from the delete form, so we reindex all models.
        mapping_types_to_index = None
    else:
        # Coming from the reindex form, so we reindex whatever we're
        # told.
        mapping_types_to_index = [
            name.replace('check_', '') for name in request.POST.keys()
            if name.startswith('check_')
        ]

    # TODO: If this gets fux0rd, then it's possible this could be
    # non-zero and we really want to just ignore it. Need the ability
    # to ignore it.
    try:
        client = redis_client('default')
        val = client.get(OUTSTANDING_INDEX_CHUNKS)
        if val is not None and int(val) > 0:
            raise ReindexError('There are %s outstanding chunks.' % val)

        # We don't know how many chunks we're building, but we do want
        # to make sure another reindex request doesn't slide in here
        # and kick off a bunch of chunks.
        #
        # There is a race condition here.
        client.set(OUTSTANDING_INDEX_CHUNKS, 1)
    except RedisError:
        log.warning('Redis not running. Can not check if there are '
                    'outstanding tasks.')

    batch_id = create_batch_id()

    # Break up all the things we want to index into chunks. This
    # chunkifies by class then by chunk size.
    chunks = []
    for cls, indexable in get_indexable(mapping_types=mapping_types_to_index):
        chunks.extend((cls, chunk) for chunk in chunked(indexable, CHUNK_SIZE))

    if delete_index_first:
        # The previous lines do a lot of work and take some time to
        # execute.  So we wait until here to wipe and rebuild the
        # index. That reduces the time that there is no index by a little.
        recreate_index()

    chunks_count = len(chunks)

    try:
        client = redis_client('default')
        client.set(OUTSTANDING_INDEX_CHUNKS, chunks_count)
    except RedisError:
        log.warning('Redis not running. Can\'t denote outstanding tasks.')

    for chunk in chunks:
        index_chunk_task.delay(write_index(), batch_id, chunk)

    return HttpResponseRedirect(request.path)