Example #1
    def test_chunked(self):
        # chunking nothing yields nothing.
        eq_(list(chunked([], 1)), [])

        # chunking list where len(list) < n
        eq_(list(chunked([1], 10)), [(1, )])

        # chunking a list where len(list) == n
        eq_(list(chunked([1, 2], 2)), [(1, 2)])

        # chunking list where len(list) > n
        eq_(list(chunked([1, 2, 3, 4, 5], 2)), [(1, 2), (3, 4), (5, )])
Example #2
    def test_chunked(self):
        # chunking nothing yields nothing.
        eq_(list(chunked([], 1)), [])

        # chunking list where len(list) < n
        eq_(list(chunked([1], 10)), [(1,)])

        # chunking a list where len(list) == n
        eq_(list(chunked([1, 2], 2)), [(1, 2)])

        # chunking list where len(list) > n
        eq_(list(chunked([1, 2, 3, 4, 5], 2)),
            [(1, 2), (3, 4), (5,)])
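
The tests above pin down the contract of chunked(): it lazily yields tuples of at most n items and lets the final tuple run short. The helper itself is not shown in these examples; a minimal sketch consistent with the tests (a reconstruction for illustration, not the project's implementation) could look like this:

def chunked(iterable, n):
    # Reconstruction for illustration only: batch any iterable into
    # tuples of length n, with a shorter final tuple for any remainder.
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == n:
            yield tuple(chunk)
            chunk = []
    if chunk:
        yield tuple(chunk)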
Example #3
def index_chunk(cls, id_list, reraise=False):
    """Index a chunk of documents.

    :arg cls: The MappingType class.
    :arg id_list: Iterable of ids of that MappingType to index.
    :arg reraise: False if you want errors to be swallowed and True
        if you want errors to be thrown.

    """
    # Note: This bulk indexes in batches of 80. I didn't arrive at
    # this number through a proper scientific method. It's possible
    # there's a better number. It takes a while to fiddle with,
    # though. Probably best to expose the number as an environment
    # variable, then run a script that takes timings for
    # --criticalmass, runs overnight and returns a more "optimal"
    # number.
    for ids in chunked(id_list, 80):
        documents = []
        for id_ in ids:
            try:
                documents.append(cls.extract_document(id_))

            except UnindexMeBro:
                # extract_document throws this in cases where we need
                # to remove the item from the index.
                cls.unindex(id_)

            except Exception:
                log.exception('Unable to extract/index document (id: %d)', id_)
                if reraise:
                    raise

        if documents:
            cls.bulk_index(documents, id_field='document_id')
Example #4
def index_chunk(cls, id_list, reraise=False):
    """Index a chunk of documents.

    :arg cls: The MappingType class.
    :arg id_list: Iterable of ids of that MappingType to index.
    :arg reraise: False if you want errors to be swallowed and True
        if you want errors to be thrown.

    """
    # Note: This bulk indexes in batches of 80. I didn't arrive at
    # this number through a proper scientific method. It's possible
    # there's a better number. It takes a while to fiddle with,
    # though. Probably best to expose the number as an environment
    # variable, then run a script that takes timings for
    # --criticalmass, runs overnight and returns a more "optimal"
    # number.
    for ids in chunked(id_list, 80):
        documents = []
        for id_ in ids:
            try:
                documents.append(cls.extract_document(id_))

            except UnindexMeBro:
                # extract_document throws this in cases where we need
                # to remove the item from the index.
                cls.unindex(id_)

            except Exception:
                log.exception('Unable to extract/index document (id: %d)',
                              id_)
                if reraise:
                    raise

        if documents:
            cls.bulk_index(documents, id_field='document_id')
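
index_chunk() only relies on three classmethods of the MappingType it is handed: extract_document(), unindex() and bulk_index(), plus the UnindexMeBro exception for documents that should be dropped from the index. A hedged sketch of a stand-in that satisfies that contract (the class below is hypothetical, not part of the codebase):

class FakeMappingType(object):
    # Hypothetical stand-in that satisfies the interface index_chunk()
    # expects; the real MappingType classes live in the application.
    docs = {1: {'document_id': 1, 'title': 'hello'},
            2: {'document_id': 2, 'title': 'world'}}

    @classmethod
    def extract_document(cls, id_):
        if id_ not in cls.docs:
            raise UnindexMeBro()
        return cls.docs[id_]

    @classmethod
    def unindex(cls, id_):
        pass  # would delete the document from the index

    @classmethod
    def bulk_index(cls, documents, id_field='document_id'):
        pass  # would send the documents to Elasticsearch in one request

index_chunk(FakeMappingType, [1, 2, 3])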
Example #5
def handle_reindex(request):
    """Caculates and kicks off indexing tasks"""
    write_index = es_utils.WRITE_INDEX

    # This is truthy if the user wants us to delete and recreate
    # the index first.
    delete_index_first = bool(request.POST.get("delete_index"))

    if delete_index_first:
        # Coming from the delete form, so we reindex all models.
        models_to_index = None
    else:
        # Coming from the reindex form, so we reindex whatever we're
        # told.
        models_to_index = [
            name.replace("check_", "")
            for name in request.POST.keys()
            if name.startswith("check_")
        ]

    # TODO: If this gets fux0rd, then it's possible this could be
    # non-zero and we really want to just ignore it. Need the ability
    # to ignore it.
    try:
        client = redis_client("default")
        val = client.get(OUTSTANDING_INDEX_CHUNKS)
        if val is not None and int(val) > 0:
            raise ReindexError("There are %s outstanding chunks." % val)

        # We don't know how many chunks we're building, but we do want
        # to make sure another reindex request doesn't slide in here
        # and kick off a bunch of chunks.
        #
        # There is a race condition here.
        client.set(OUTSTANDING_INDEX_CHUNKS, 1)
    except RedisError:
        log.warning("Redis not running. Can not check if there are " "outstanding tasks.")

    batch_id = create_batch_id()

    # Break up all the things we want to index into chunks. This
    # chunkifies by class then by chunk size.
    chunks = []
    for cls, indexable in get_indexable(search_models=models_to_index):
        chunks.extend((cls, chunk) for chunk in chunked(indexable, CHUNK_SIZE))

    if delete_index_first:
        # The previous lines do a lot of work and take some time to
        # execute.  So we wait until here to wipe and rebuild the
        # index. That reduces the time that there is no index by a little.
        recreate_index()

    chunks_count = len(chunks)

    try:
        client = redis_client("default")
        client.set(OUTSTANDING_INDEX_CHUNKS, chunks_count)
    except RedisError:
        log.warning("Redis not running. Can't denote outstanding tasks.")

    for chunk in chunks:
        index_chunk_task.delay(write_index, batch_id, chunk)

    return HttpResponseRedirect(request.path)
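
handle_reindex() only dispatches work; each chunk is a (MappingType class, id tuple) pair handed to index_chunk_task.delay(). The task itself is defined elsewhere; a rough sketch of what the receiving end might look like (the decorator and bookkeeping below are assumptions, not the project's code):

from celery import shared_task  # assumed decorator; the real task may differ


@shared_task
def index_chunk_task(write_index, batch_id, chunk):
    # Hypothetical receiver for the chunks dispatched above.
    cls, id_list = chunk
    try:
        index_chunk(cls, id_list)
    finally:
        # The real task presumably also decrements
        # OUTSTANDING_INDEX_CHUNKS in Redis; that bookkeeping is omitted.
        pass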
Example #6
def es_reindex_cmd(percent=100, delete=False, models=None):
    """Rebuild ElasticSearch indexes

    :arg percent: 1 to 100--the percentage of the db to index
    :arg delete: whether or not to wipe the index before reindexing
    :arg models: list of search model names to index

    """
    es = get_indexing_es()

    try:
        get_doctype_stats(WRITE_INDEX)
    except ESIndexMissingException:
        if not delete:
            log.error("The index does not exist. You must specify --delete.")
            return

    if delete:
        log.info("wiping and recreating %s....", WRITE_INDEX)
        recreate_index(es=es)

    if models:
        indexable = get_indexable(percent, models)
    else:
        indexable = get_indexable(percent)

    start_time = time.time()

    for cls, indexable in indexable:
        cls_start_time = time.time()
        total = len(indexable)

        if total == 0:
            continue

        log.info("reindex %s into %s index. %s to index....", cls.get_model_name(), WRITE_INDEX, total)

        i = 0
        for chunk in chunked(indexable, 1000):
            index_chunk(cls, chunk, es=es)

            i += len(chunk)
            time_to_go = (total - i) * ((time.time() - start_time) / i)
            per_1000 = (time.time() - start_time) / (i / 1000.0)
            log.info("%s/%s... (%s to go, %s per 1000 docs)", i, total, format_time(time_to_go), format_time(per_1000))

            # We call this every 1000 or so because we're
            # essentially loading the whole db and if DEBUG=True,
            # then Django saves every sql statement which causes
            # our memory to go up up up. So we reset it and that
            # makes things happier even in DEBUG environments.
            reset_queries()

        delta_time = time.time() - cls_start_time
        log.info("done! (%s, %s per 1000 docs)", format_time(delta_time), format_time(delta_time / (total / 1000.0)))

    delta_time = time.time() - start_time
    log.info("done! (total time: %s)", format_time(delta_time))
Example #7
def es_status_cmd(checkindex=False):
    """Shows elastic search index status"""
    try:
        try:
            read_doctype_stats = get_doctype_stats(READ_INDEX)
        except ESIndexMissingException:
            read_doctype_stats = None

        if READ_INDEX == WRITE_INDEX:
            write_doctype_stats = read_doctype_stats
        else:
            try:
                write_doctype_stats = get_doctype_stats(WRITE_INDEX)
            except ESIndexMissingException:
                write_doctype_stats = None

        indexes = get_indexes(all_indexes=True)
    except ESMaxRetryError:
        log.error(
            "Your elasticsearch process is not running or ES_HOSTS " "is set wrong in your settings_local.py file."
        )
        return

    log.info("Settings:")
    log.info("  ES_HOSTS              : %s", settings.ES_HOSTS)
    log.info("  ES_INDEX_PREFIX       : %s", settings.ES_INDEX_PREFIX)
    log.info("  ES_LIVE_INDEXING      : %s", settings.ES_LIVE_INDEXING)
    log.info("  ES_INDEXES            : %s", settings.ES_INDEXES)
    log.info("  ES_WRITE_INDEXES      : %s", settings.ES_WRITE_INDEXES)

    log.info("Index stats:")

    if indexes:
        log.info("  List of indexes:")
        for name, count in indexes:
            read_write = []
            if name == READ_INDEX:
                read_write.append("READ")
            if name == WRITE_INDEX:
                read_write.append("WRITE")
            log.info("    %-20s: %s %s", name, count, "/".join(read_write))
    else:
        log.info("  There are no %s indexes.", settings.ES_INDEX_PREFIX)

    if read_doctype_stats is None:
        log.info("  Read index does not exist. (%s)", READ_INDEX)
    else:
        log.info("  Read index (%s):", READ_INDEX)
        for name, count in read_doctype_stats.items():
            log.info("    %-20s: %d", name, count)

    if READ_INDEX != WRITE_INDEX:
        if write_doctype_stats is None:
            log.info("  Write index does not exist. (%s)", WRITE_INDEX)
        else:
            log.info("  Write index (%s):", WRITE_INDEX)
            for name, count in write_doctype_stats.items():
                log.info("    %-20s: %d", name, count)
    else:
        log.info("  Write index is same as read index.")

    if checkindex:
        # Go through the index and verify everything
        log.info("Checking index contents....")

        missing_docs = 0

        for cls, id_list in get_indexable():
            for id_group in chunked(id_list, 100):
                doc_list = get_documents(cls, id_group)
                if len(id_group) != len(doc_list):
                    doc_list_ids = [doc["id"] for doc in doc_list]
                    for id_ in id_group:
                        if id_ not in doc_list_ids:
                            log.info("   Missing %s %s", cls.get_model_name(), id_)
                            missing_docs += 1

        if missing_docs:
            print "There were %d missing_docs" % missing_docs
Example #8
def es_status_cmd(checkindex=False, log=log):
    """Shows elastic search index status"""
    try:
        read_doctype_stats = get_doctype_stats(READ_INDEX)
    except ES_EXCEPTIONS:
        read_doctype_stats = None

    if READ_INDEX == WRITE_INDEX:
        write_doctype_stats = read_doctype_stats
    else:
        try:
            write_doctype_stats = get_doctype_stats(WRITE_INDEX)
        except ES_EXCEPTIONS:
            write_doctype_stats = None

    try:
        indexes = get_indexes(all_indexes=True)
    except ES_EXCEPTIONS:
        log.error('Your elasticsearch process is not running or ES_URLS '
                  'is set wrong in your settings_local.py file.')
        return

    log.info('Settings:')
    log.info('  ES_URLS                 : %s', settings.ES_URLS)
    log.info('  ES_INDEX_PREFIX         : %s', settings.ES_INDEX_PREFIX)
    log.info('  ES_LIVE_INDEXING        : %s', settings.ES_LIVE_INDEXING)
    log.info('  ES_INDEXES              : %s', settings.ES_INDEXES)
    log.info('  ES_WRITE_INDEXES        : %s', settings.ES_WRITE_INDEXES)

    log.info('Index stats:')

    if indexes:
        log.info('  List of indexes:')
        for name, count in sorted(indexes):
            read_write = []
            if name == READ_INDEX:
                read_write.append('READ')
            if name == WRITE_INDEX:
                read_write.append('WRITE')
            log.info('    %-22s: %s %s', name, count,
                     '/'.join(read_write))
    else:
        log.info('  There are no %s indexes.', settings.ES_INDEX_PREFIX)

    if read_doctype_stats is None:
        log.info('  Read index does not exist. (%s)', READ_INDEX)
    else:
        log.info('  Read index (%s):', READ_INDEX)
        for name, count in sorted(read_doctype_stats.items()):
            log.info('    %-22s: %d', name, count)

    if READ_INDEX != WRITE_INDEX:
        if write_doctype_stats is None:
            log.info('  Write index does not exist. (%s)', WRITE_INDEX)
        else:
            log.info('  Write index (%s):', WRITE_INDEX)
            for name, count in sorted(write_doctype_stats.items()):
                log.info('    %-22s: %d', name, count)
    else:
        log.info('  Write index is same as read index.')

    if checkindex:
        # Go through the index and verify everything
        log.info('Checking index contents....')

        missing_docs = 0

        for cls, id_list in get_indexable():
            for id_group in chunked(id_list, 100):
                doc_list = get_documents(cls, id_group)
                if len(id_group) != len(doc_list):
                    doc_list_ids = [doc['id'] for doc in doc_list]
                    for id_ in id_group:
                        if id_ not in doc_list_ids:
                            log.info('   Missing %s %s',
                                     cls.get_model_name(), id_)
                            missing_docs += 1

        if missing_docs:
            print('There were %d missing_docs' % missing_docs)
Example #9
def es_reindex_cmd(percent=100, delete=False, mapping_types=None,
                   criticalmass=False, log=log):
    """Rebuild ElasticSearch indexes

    :arg percent: 1 to 100--the percentage of the db to index
    :arg delete: whether or not to wipe the index before reindexing
    :arg mapping_types: list of mapping types to index
    :arg criticalmass: whether or not to index just a critical mass of
        things
    :arg log: the logger to use

    """
    es = get_es()

    try:
        get_doctype_stats(WRITE_INDEX)
    except ES_EXCEPTIONS:
        if not delete:
            log.error('The index does not exist. You must specify --delete.')
            return

    if delete:
        log.info('wiping and recreating %s....', WRITE_INDEX)
        recreate_index(es=es)

    if criticalmass:
        # The critical mass is defined as the entire KB plus the most
        # recent 15k questions (which is about how many questions
        # there were created in the last 180 days). We build that
        # indexable here.

        # Get only questions and wiki document stuff.
        all_indexable = get_indexable(
            mapping_types=['questions_question', 'wiki_document'])

        # The first item is questions because we specified that
        # order. Old questions don't show up in searches, so we nix
        # them by reversing the list (ordered by id ascending) and
        # slicing it.
        all_indexable[0] = (all_indexable[0][0],
                            list(reversed(all_indexable[0][1]))[:15000])

    elif mapping_types:
        all_indexable = get_indexable(percent, mapping_types)

    else:
        all_indexable = get_indexable(percent)

    # We're doing a lot of indexing, so we get the refresh_interval
    # currently in the index, then nix refreshing. Later we'll restore
    # it.
    old_refresh = get_index_settings(WRITE_INDEX).get(
        'index.refresh_interval', '1s')

    # Disable automatic refreshing
    es.update_settings(
        WRITE_INDEX, {'index': {'refresh_interval': '-1'}})

    log.info('using index: %s', WRITE_INDEX)

    start_time = time.time()
    for cls, indexable in all_indexable:
        cls_start_time = time.time()
        total = len(indexable)

        if total == 0:
            continue

        log.info('reindexing %s. %s to index....',
                 cls.get_mapping_type_name(), total)

        i = 0
        for chunk in chunked(indexable, 1000):
            chunk_start_time = time.time()
            index_chunk(cls, chunk)

            i += len(chunk)
            time_to_go = (total - i) * ((time.time() - cls_start_time) / i)
            per_1000 = (time.time() - cls_start_time) / (i / 1000.0)
            this_1000 = time.time() - chunk_start_time

            log.info('   %s/%s %s... (%s/1000 avg, %s ETA)',
                     i,
                     total,
                     format_time(this_1000),
                     format_time(per_1000),
                     format_time(time_to_go)
            )

        delta_time = time.time() - cls_start_time
        log.info('   done! (%s total, %s/1000 avg)',
                 format_time(delta_time),
                 format_time(delta_time / (total / 1000.0)))

    # Re-enable automatic refreshing
    es.update_settings(
        WRITE_INDEX, {'index': {'refresh_interval': old_refresh}})
    delta_time = time.time() - start_time
    log.info('done! (%s total)', format_time(delta_time))
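
This version disables index refreshing up front and restores the old refresh_interval only at the very end, so an exception mid-run would leave refreshing switched off. A minimal sketch (not the original code) of wrapping the same calls in try/finally to guarantee the restore:

old_refresh = get_index_settings(WRITE_INDEX).get(
    'index.refresh_interval', '1s')
es.update_settings(WRITE_INDEX, {'index': {'refresh_interval': '-1'}})
try:
    pass  # ... the per-class indexing loop from above ...
finally:
    # Restore the previous refresh interval even if indexing fails.
    es.update_settings(
        WRITE_INDEX, {'index': {'refresh_interval': old_refresh}})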
Example #10
def es_status_cmd(checkindex=False, log=log):
    """Shows elastic search index status"""
    try:
        read_doctype_stats = get_doctype_stats(READ_INDEX)
    except ES_EXCEPTIONS:
        read_doctype_stats = None

    if READ_INDEX == WRITE_INDEX:
        write_doctype_stats = read_doctype_stats
    else:
        try:
            write_doctype_stats = get_doctype_stats(WRITE_INDEX)
        except ES_EXCEPTIONS:
            write_doctype_stats = None

    try:
        indexes = get_indexes(all_indexes=True)
    except ES_EXCEPTIONS:
        log.error('Your elasticsearch process is not running or ES_URLS '
                  'is set wrong in your settings_local.py file.')
        return

    log.info('Settings:')
    log.info('  ES_URLS               : %s', settings.ES_URLS)
    log.info('  ES_INDEX_PREFIX       : %s', settings.ES_INDEX_PREFIX)
    log.info('  ES_LIVE_INDEXING      : %s', settings.ES_LIVE_INDEXING)
    log.info('  ES_INDEXES            : %s', settings.ES_INDEXES)
    log.info('  ES_WRITE_INDEXES      : %s', settings.ES_WRITE_INDEXES)

    log.info('Index stats:')

    if indexes:
        log.info('  List of indexes:')
        for name, count in indexes:
            read_write = []
            if name == READ_INDEX:
                read_write.append('READ')
            if name == WRITE_INDEX:
                read_write.append('WRITE')
            log.info('    %-20s: %s %s', name, count, '/'.join(read_write))
    else:
        log.info('  There are no %s indexes.', settings.ES_INDEX_PREFIX)

    if read_doctype_stats is None:
        log.info('  Read index does not exist. (%s)', READ_INDEX)
    else:
        log.info('  Read index (%s):', READ_INDEX)
        for name, count in read_doctype_stats.items():
            log.info('    %-20s: %d', name, count)

    if READ_INDEX != WRITE_INDEX:
        if write_doctype_stats is None:
            log.info('  Write index does not exist. (%s)', WRITE_INDEX)
        else:
            log.info('  Write index (%s):', WRITE_INDEX)
            for name, count in write_doctype_stats.items():
                log.info('    %-20s: %d', name, count)
    else:
        log.info('  Write index is same as read index.')

    if checkindex:
        # Go through the index and verify everything
        log.info('Checking index contents....')

        missing_docs = 0

        for cls, id_list in get_indexable():
            for id_group in chunked(id_list, 100):
                doc_list = get_documents(cls, id_group)
                if len(id_group) != len(doc_list):
                    doc_list_ids = [doc['id'] for doc in doc_list]
                    for id_ in id_group:
                        if id_ not in doc_list_ids:
                            log.info('   Missing %s %s', cls.get_model_name(),
                                     id_)
                            missing_docs += 1

        if missing_docs:
            print('There were %d missing_docs' % missing_docs)
Example #11
def es_reindex_cmd(percent=100,
                   delete=False,
                   models=None,
                   criticalmass=False,
                   log=log):
    """Rebuild ElasticSearch indexes

    :arg percent: 1 to 100--the percentage of the db to index
    :arg delete: whether or not to wipe the index before reindexing
    :arg models: list of search model names to index
    :arg criticalmass: whether or not to index just a critical mass of
        things
    :arg log: the logger to use
    """
    es = get_es()

    try:
        get_doctype_stats(WRITE_INDEX)
    except ES_EXCEPTIONS:
        if not delete:
            log.error('The index does not exist. You must specify --delete.')
            return

    if delete:
        log.info('wiping and recreating %s....', WRITE_INDEX)
        recreate_index(es=es)

    if criticalmass:
        # The critical mass is defined as the entire KB plus the most
        # recent 15k questions (which is about how many questions
        # there were created in the last 180 days). We build that
        # indexable here.

        # Get only questions and wiki document stuff.
        all_indexable = get_indexable(
            search_models=['questions_question', 'wiki_document'])

        # The first item is questions because we specified that
        # order. Old questions don't show up in searches, so we nix
        # them by reversing the list (ordered by id ascending) and
        # slicing it.
        all_indexable[0] = (all_indexable[0][0],
                            list(reversed(all_indexable[0][1]))[:15000])

    elif models:
        all_indexable = get_indexable(percent, models)

    else:
        all_indexable = get_indexable(percent)

    start_time = time.time()
    for cls, indexable in all_indexable:
        cls_start_time = time.time()
        total = len(indexable)

        if total == 0:
            continue

        log.info('reindex %s into %s index. %s to index....',
                 cls.get_model_name(), WRITE_INDEX, total)

        i = 0
        for chunk in chunked(indexable, 1000):
            index_chunk(cls, chunk)

            i += len(chunk)
            time_to_go = (total - i) * ((time.time() - start_time) / i)
            per_1000 = (time.time() - start_time) / (i / 1000.0)
            log.info('%s/%s... (%s to go, %s per 1000 docs)', i, total,
                     format_time(time_to_go), format_time(per_1000))

            # We call this every 1000 or so because we're
            # essentially loading the whole db and if DEBUG=True,
            # then Django saves every sql statement which causes
            # our memory to go up up up. So we reset it and that
            # makes things happier even in DEBUG environments.
            reset_queries()

        delta_time = time.time() - cls_start_time
        log.info('done! (%s, %s per 1000 docs)', format_time(delta_time),
                 format_time(delta_time / (total / 1000.0)))

    delta_time = time.time() - start_time
    log.info('done! (total time: %s)', format_time(delta_time))
Example #12
def es_reindex_cmd(percent=100, delete=False, models=None,
                   criticalmass=False, log=log):
    """Rebuild ElasticSearch indexes

    :arg percent: 1 to 100--the percentage of the db to index
    :arg delete: whether or not to wipe the index before reindexing
    :arg models: list of search model names to index
    :arg criticalmass: whether or not to index just a critical mass of
        things
    :arg log: the logger to use
    """
    es = get_es()

    try:
        get_doctype_stats(WRITE_INDEX)
    except ES_EXCEPTIONS:
        if not delete:
            log.error('The index does not exist. You must specify --delete.')
            return

    if delete:
        log.info('wiping and recreating %s....', WRITE_INDEX)
        recreate_index(es=es)

    if criticalmass:
        # The critical mass is defined as the entire KB plus the most
        # recent 15k questions (which is about how many questions
        # there were created in the last 180 days). We build that
        # indexable here.

        # Get only questions and wiki document stuff.
        all_indexable = get_indexable(
            search_models=['questions_question', 'wiki_document'])

        # The first item is questions because we specified that
        # order. Old questions don't show up in searches, so we nix
        # them by reversing the list (ordered by id ascending) and
        # slicing it.
        all_indexable[0] = (all_indexable[0][0],
                            list(reversed(all_indexable[0][1]))[:15000])

    elif models:
        all_indexable = get_indexable(percent, models)

    else:
        all_indexable = get_indexable(percent)

    start_time = time.time()
    for cls, indexable in all_indexable:
        cls_start_time = time.time()
        total = len(indexable)

        if total == 0:
            continue

        log.info('reindex %s into %s index. %s to index....',
                 cls.get_model_name(),
                 WRITE_INDEX, total)

        i = 0
        for chunk in chunked(indexable, 1000):
            index_chunk(cls, chunk)

            i += len(chunk)
            time_to_go = (total - i) * ((time.time() - start_time) / i)
            per_1000 = (time.time() - start_time) / (i / 1000.0)
            log.info('%s/%s... (%s to go, %s per 1000 docs)', i, total,
                     format_time(time_to_go),
                     format_time(per_1000))

            # We call this every 1000 or so because we're
            # essentially loading the whole db and if DEBUG=True,
            # then Django saves every sql statement which causes
            # our memory to go up up up. So we reset it and that
            # makes things happier even in DEBUG environments.
            reset_queries()

        delta_time = time.time() - cls_start_time
        log.info('done! (%s, %s per 1000 docs)',
                 format_time(delta_time),
                 format_time(delta_time / (total / 1000.0)))

    delta_time = time.time() - start_time
    log.info('done! (total time: %s)', format_time(delta_time))
Example #13
def es_reindex_cmd(percent=100, delete=False, models=None):
    """Rebuild ElasticSearch indexes

    :arg percent: 1 to 100--the percentage of the db to index
    :arg delete: whether or not to wipe the index before reindexing
    :arg models: list of search model names to index

    """
    es = get_indexing_es()

    try:
        get_doctype_stats(WRITE_INDEX)
    except ESIndexMissingException:
        if not delete:
            log.error('The index does not exist. You must specify --delete.')
            return

    if delete:
        log.info('wiping and recreating %s....', WRITE_INDEX)
        recreate_index(es=es)

    if models:
        indexable = get_indexable(percent, models)
    else:
        indexable = get_indexable(percent)

    start_time = time.time()

    for cls, indexable in indexable:
        cls_start_time = time.time()
        total = len(indexable)

        if total == 0:
            continue

        log.info('reindex %s into %s index. %s to index....',
                 cls.get_model_name(), WRITE_INDEX, total)

        i = 0
        for chunk in chunked(indexable, 1000):
            index_chunk(cls, chunk, es=es)

            i += len(chunk)
            time_to_go = (total - i) * ((time.time() - start_time) / i)
            per_1000 = (time.time() - start_time) / (i / 1000.0)
            log.info('%s/%s... (%s to go, %s per 1000 docs)', i, total,
                     format_time(time_to_go), format_time(per_1000))

            # We call this every 1000 or so because we're
            # essentially loading the whole db and if DEBUG=True,
            # then Django saves every sql statement which causes
            # our memory to go up up up. So we reset it and that
            # makes things happier even in DEBUG environments.
            reset_queries()

        delta_time = time.time() - cls_start_time
        log.info('done! (%s, %s per 1000 docs)', format_time(delta_time),
                 format_time(delta_time / (total / 1000.0)))

    delta_time = time.time() - start_time
    log.info('done! (total time: %s)', format_time(delta_time))
Example #14
def handle_reindex(request):
    """Caculates and kicks off indexing tasks"""
    write_index = es_utils.WRITE_INDEX

    # This is truthy if the user wants us to delete and recreate
    # the index first.
    delete_index_first = bool(request.POST.get('delete_index'))

    if delete_index_first:
        # Coming from the delete form, so we reindex all models.
        models_to_index = None
    else:
        # Coming from the reindex form, so we reindex whatever we're
        # told.
        models_to_index = [
            name.replace('check_', '') for name in request.POST.keys()
            if name.startswith('check_')
        ]

    # TODO: If this gets fux0rd, then it's possible this could be
    # non-zero and we really want to just ignore it. Need the ability
    # to ignore it.
    try:
        client = redis_client('default')
        val = client.get(OUTSTANDING_INDEX_CHUNKS)
        if val is not None and int(val) > 0:
            raise ReindexError('There are %s outstanding chunks.' % val)

        # We don't know how many chunks we're building, but we do want
        # to make sure another reindex request doesn't slide in here
        # and kick off a bunch of chunks.
        #
        # There is a race condition here.
        client.set(OUTSTANDING_INDEX_CHUNKS, 1)
    except RedisError:
        log.warning('Redis not running. Can not check if there are '
                    'outstanding tasks.')

    batch_id = create_batch_id()

    # Break up all the things we want to index into chunks. This
    # chunkifies by class then by chunk size.
    chunks = []
    for cls, indexable in get_indexable(search_models=models_to_index):
        chunks.extend((cls, chunk) for chunk in chunked(indexable, CHUNK_SIZE))

    if delete_index_first:
        # The previous lines do a lot of work and take some time to
        # execute.  So we wait until here to wipe and rebuild the
        # index. That reduces the time that there is no index by a little.
        recreate_index()

    chunks_count = len(chunks)

    try:
        client = redis_client('default')
        client.set(OUTSTANDING_INDEX_CHUNKS, chunks_count)
    except RedisError:
        log.warning('Redis not running. Can\'t denote outstanding tasks.')

    for chunk in chunks:
        index_chunk_task.delay(write_index, batch_id, chunk)

    return HttpResponseRedirect(request.path)
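
The comments in handle_reindex() explicitly note a race between reading OUTSTANDING_INDEX_CHUNKS and setting it. One way to narrow that window is redis-py's atomic SETNX; the sketch below is an illustrative alternative, not what the original code does:

client = redis_client('default')
# setnx() sets the key only if it does not already exist and reports
# whether it did, so the check and the claim happen in one atomic step.
if not client.setnx(OUTSTANDING_INDEX_CHUNKS, 1):
    val = client.get(OUTSTANDING_INDEX_CHUNKS)
    if val is not None and int(val) > 0:
        raise ReindexError('There are %s outstanding chunks.' % val)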