Example #1
    def handle(self, *args, **kwargs):
        """Set up reindexing tasks.

        Creates a Tasktree that creates a new index and indexes all objects,
        then points the alias to this new index when finished.
        """
        index_choice = kwargs.get('index', None)
        prefix = kwargs.get('prefix', '')
        force = kwargs.get('force', False)

        if index_choice:
            # If we only want to reindex a subset of indexes.
            INDEXES = INDEX_CHOICES.get(index_choice, None)
            if INDEXES is None:
                raise CommandError(
                    'Incorrect index name specified. '
                    'Choose one of: %s' % ', '.join(INDEX_CHOICES.keys()))
        else:
            INDEXES = INDEXERS

        if Reindexing.is_reindexing() and not force:
            raise CommandError('Indexation already occurring - use --force '
                               'to bypass')
        elif force:
            Reindexing.unflag_reindexing()

        for INDEXER in INDEXES:
            index_name = INDEXER.get_mapping_type_name()
            chunk_size = INDEXER.chunk_size
            alias = ES_INDEXES[index_name]

            chunks, total = chunk_indexing(INDEXER, chunk_size)
            if not total:
                _print('No items to queue.', alias)
            else:
                total_chunks = int(ceil(total / float(chunk_size)))
                _print('Indexing {total} items into {n} chunks of size {size}'
                       .format(total=total, n=total_chunks, size=chunk_size),
                       alias)

            # Get the old index if it exists.
            try:
                aliases = ES.indices.get_alias(name=alias).keys()
            except elasticsearch.NotFoundError:
                aliases = []
            old_index = aliases[0] if aliases else None

            # Create a new index, using the index name with a timestamp.
            new_index = timestamp_index(prefix + alias)

            # See how the index is currently configured.
            if old_index:
                try:
                    s = (ES.indices.get_settings(index=old_index).get(
                        old_index, {}).get('settings', {}))
                except elasticsearch.NotFoundError:
                    s = {}
            else:
                s = {}
            num_replicas = s.get('number_of_replicas',
                                 settings.ES_DEFAULT_NUM_REPLICAS)
            num_shards = s.get('number_of_shards',
                               settings.ES_DEFAULT_NUM_SHARDS)

            pre_task = pre_index.si(new_index, old_index, alias, index_name, {
                'analysis': INDEXER.get_analysis(),
                'number_of_replicas': 0,
                'number_of_shards': num_shards,
                'store.compress.tv': True,
                'store.compress.stored': True,
                'refresh_interval': '-1'})
            post_task = post_index.si(new_index, old_index, alias, index_name,
                                      {'number_of_replicas': num_replicas,
                                       'refresh_interval': '5s'})

            # Ship it.
            if not total:
                # If there's no data we still create the index and alias.
                chain(pre_task, post_task).apply_async()
            else:
                index_tasks = [run_indexing.si(new_index, index_name, chunk)
                               for chunk in chunks]

                if settings.CELERY_ALWAYS_EAGER:
                    # Eager mode and chords don't get along. So we serialize
                    # the tasks as a workaround.
                    index_tasks.insert(0, pre_task)
                    index_tasks.append(post_task)
                    chain(*index_tasks).apply_async()
                else:
                    chain(pre_task, chord(header=index_tasks,
                                          body=post_task)).apply_async()

        _print('New index and indexing tasks all queued up.')
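
Note: the helpers used throughout these examples (chunk_indexing, timestamp_index, _print, and the task functions) are defined elsewhere in the project and are not shown on this page. A minimal sketch of what the first two might look like, inferred purely from their call sites above; the get_indexable() accessor is an assumption:

from datetime import datetime


def chunk_indexing(indexer, chunk_size):
    # Hypothetical: return (chunks, total), where chunks is a list of
    # fixed-size ID slices covering everything the indexer can index.
    ids = list(indexer.get_indexable())  # assumed to yield object IDs
    chunks = [ids[i:i + chunk_size] for i in range(0, len(ids), chunk_size)]
    return chunks, len(ids)


def timestamp_index(index):
    # Hypothetical: suffix the alias with a timestamp so each run gets a
    # fresh, uniquely named physical index behind the stable alias.
    return '{0}-{1}'.format(index, datetime.now().strftime('%Y%m%d%H%M%S'))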
Example #2
    def test_is_reindexing(self):
        assert not Reindexing.is_reindexing()

        Reindexing.objects.create(alias='foo', new_index='bar',
                                  old_index='baz')
        assert Reindexing.is_reindexing()
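
This test pins down the Reindexing contract used by every handle() variant above: the flag is simply "does any Reindexing row exist". A minimal Django model that would satisfy it; the field lengths and the unflag_reindexing() body are assumptions:

from django.db import models


class Reindexing(models.Model):
    # Hypothetical sketch matching the test above.
    alias = models.CharField(max_length=255)
    old_index = models.CharField(max_length=255, blank=True)
    new_index = models.CharField(max_length=255)

    @classmethod
    def is_reindexing(cls):
        # True as soon as any reindexing row has been created.
        return cls.objects.exists()

    @classmethod
    def unflag_reindexing(cls):
        # Clear the flag, e.g. when --force is passed.
        cls.objects.all().delete()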
Example #3
    def handle(self, *args, **kwargs):
        """Set up reindexing tasks.

        Creates a Tasktree that creates a new index and indexes all objects,
        then points the alias to this new index when finished.
        """
        index_choice = kwargs.get('index', None)
        prefix = kwargs.get('prefix', '')
        force = kwargs.get('force', False)

        if index_choice:
            # If we only want to reindex a subset of indexes.
            INDEXES = INDEX_CHOICES.get(index_choice, None)
            if INDEXES is None:
                raise CommandError('Incorrect index name specified. '
                                   'Choose one of: %s' %
                                   ', '.join(INDEX_CHOICES.keys()))
        else:
            INDEXES = INDEXERS

        if Reindexing.is_reindexing() and not force:
            raise CommandError('Indexation already occurring - use --force '
                               'to bypass')
        elif force:
            Reindexing.unflag_reindexing()

        for INDEXER in INDEXES:
            index_name = INDEXER.get_mapping_type_name()
            chunk_size = INDEXER.chunk_size
            alias = ES_INDEXES[index_name]

            chunks, total = chunk_indexing(INDEXER, chunk_size)
            if not total:
                _print('No items to queue.', alias)
            else:
                total_chunks = int(ceil(total / float(chunk_size)))
                _print('Indexing {total} items into {n} chunks of size {size}'
                       .format(total=total, n=total_chunks, size=chunk_size),
                       alias)

            # Get the old index if it exists.
            try:
                aliases = ES.indices.get_alias(name=alias).keys()
            except elasticsearch.NotFoundError:
                aliases = []
            old_index = aliases[0] if aliases else None

            # Create a new index, using the index name with a timestamp.
            new_index = timestamp_index(prefix + alias)

            # See how the index is currently configured.
            if old_index:
                try:
                    s = (ES.indices.get_settings(index=old_index).get(
                        old_index, {}).get('settings', {}))
                except elasticsearch.NotFoundError:
                    s = {}
            else:
                s = {}
            num_replicas = s.get('number_of_replicas',
                                 settings.ES_DEFAULT_NUM_REPLICAS)
            num_shards = s.get('number_of_shards',
                               settings.ES_DEFAULT_NUM_SHARDS)

            pre_task = pre_index.si(
                new_index, old_index, alias, index_name, {
                    'analysis': INDEXER.get_analysis(),
                    'number_of_replicas': 0,
                    'number_of_shards': num_shards,
                    'store.compress.tv': True,
                    'store.compress.stored': True,
                    'refresh_interval': '-1'
                })
            post_task = post_index.si(new_index, old_index, alias, index_name,
                                      {
                                          'number_of_replicas': num_replicas,
                                          'refresh_interval': '5s'
                                      })

            # Ship it.
            if not total:
                # If there's no data we still create the index and alias.
                chain(pre_task, post_task).apply_async()
            else:
                index_tasks = [
                    run_indexing.si(new_index, index_name, chunk)
                    for chunk in chunks
                ]

                if settings.CELERY_ALWAYS_EAGER:
                    # Eager mode and chords don't get along. So we serialize
                    # the tasks as a workaround.
                    index_tasks.insert(0, pre_task)
                    index_tasks.append(post_task)
                    chain(*index_tasks).apply_async()
                else:
                    chain(pre_task, chord(header=index_tasks,
                                          body=post_task)).apply_async()

        _print('New index and indexing tasks all queued up.')
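
Examples 1 and 3 wire the tasks together with Celery canvas primitives: a chain runs pre_index first, then a chord fans the per-chunk indexing tasks out in parallel and fires post_index once they have all completed. A self-contained toy with made-up task names showing the same shape (a broker and worker are needed to actually execute it):

from celery import Celery, chain, chord

app = Celery('reindex_demo')  # assumes a configured broker in real use


@app.task
def pre(index):
    print('create %s with refresh_interval=-1' % index)


@app.task
def index_chunk(index, chunk):
    print('index chunk %s into %s' % (chunk, index))


@app.task
def post(index):
    print('restore replicas, point alias at %s' % index)


header = [index_chunk.si('new-index', c) for c in ([1, 2], [3, 4])]
chain(pre.si('new-index'),
      chord(header=header, body=post.si('new-index'))).apply_async()

The CELERY_ALWAYS_EAGER branch exists because chords did not execute reliably in eager mode in the Celery versions of this era, so the same tasks are flattened into one serial chain instead.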
Example #4
    def handle(self, *args, **kwargs):
        """Set up reindexing tasks.

        Creates a Tasktree that creates a new index and indexes all objects,
        then points the alias to this new index when finished.
        """
        global INDEXES

        index_choice = kwargs.get('index', None)
        prefix = kwargs.get('prefix', '')
        force = kwargs.get('force', False)

        if index_choice:
            # If we only want to reindex a subset of indexes.
            INDEXES = INDEX_DICT.get(index_choice, INDEXES)

        if Reindexing.is_reindexing() and not force:
            raise CommandError('Indexation already occurring - use --force '
                               'to bypass')
        elif force:
            Reindexing.unflag_reindexing()

        for ALIAS, INDEXER, CHUNK_SIZE in INDEXES:

            chunks, total = chunk_indexing(INDEXER, CHUNK_SIZE)
            if not total:
                _print('No items to queue.', ALIAS)
            else:
                total_chunks = int(ceil(total / float(CHUNK_SIZE)))
                _print('Indexing {total} items into {n} chunks of size {size}'
                       .format(total=total, n=total_chunks, size=CHUNK_SIZE),
                       ALIAS)

            # Get the old index if it exists.
            try:
                aliases = ES.indices.get_alias(name=ALIAS).keys()
            except elasticsearch.NotFoundError:
                aliases = []
            old_index = aliases[0] if aliases else None

            # Create a new index, using the index name with a timestamp.
            new_index = timestamp_index(prefix + ALIAS)

            # See how the index is currently configured.
            if old_index:
                try:
                    s = (ES.indices.get_settings(index=old_index).get(
                        old_index, {}).get('settings', {}))
                except elasticsearch.NotFoundError:
                    s = {}
            else:
                s = {}
            num_replicas = s.get('number_of_replicas',
                                 settings.ES_DEFAULT_NUM_REPLICAS)
            num_shards = s.get('number_of_shards',
                               settings.ES_DEFAULT_NUM_SHARDS)

            pre_task = pre_index.si(new_index, old_index, ALIAS, INDEXER, {
                'analysis': INDEXER.get_analysis(),
                'number_of_replicas': 0,
                'number_of_shards': num_shards,
                'store.compress.tv': True,
                'store.compress.stored': True,
                'refresh_interval': '-1'})
            post_task = post_index.si(new_index, old_index, ALIAS, INDEXER, {
                'number_of_replicas': num_replicas,
                'refresh_interval': '5s'})

            # Ship it.
            if not total:
                # If there's no data we still create the index and alias.
                chain(pre_task, post_task).apply_async()
            else:
                index_tasks = [run_indexing.si(new_index, INDEXER, chunk)
                               for chunk in chunks]
                chain(pre_task,
                      chord(header=index_tasks, body=post_task)).apply_async()

        _print('New index and indexing tasks all queued up.')
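
The settings dicts handed to pre_index and post_index implement a standard Elasticsearch bulk-loading pattern: index with zero replicas and refresh disabled, then restore both once the data is in, since replica copies and periodic refreshes are cheaper done once at the end than continuously during the load (the store.compress.* flags only matter for ES < 0.90, per the comment in example 5). Roughly, with the elasticsearch-py client and an illustrative index name:

from elasticsearch import Elasticsearch

es = Elasticsearch()

# Before bulk indexing: no replicas, no periodic refresh.
es.indices.put_settings(index='webapp-20140101000000', body={
    'index': {'number_of_replicas': 0, 'refresh_interval': '-1'}})

# ... bulk indexing happens here ...

# Afterwards: restore replicas and a normal refresh cadence.
es.indices.put_settings(index='webapp-20140101000000', body={
    'index': {'number_of_replicas': 1, 'refresh_interval': '5s'}})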
Example #5
    def handle(self, *args, **kwargs):
        """Set up reindexing tasks.

        Creates a Tasktree that creates a new index and indexes all objects,
        then points the alias to this new index when finished.
        """
        global INDEXES

        index_choice = kwargs.get('index', None)
        prefix = kwargs.get('prefix', '')
        force = kwargs.get('force', False)

        if index_choice:
            # If we only want to reindex a subset of indexes.
            INDEXES = INDEX_DICT.get(index_choice, INDEXES)

        if Reindexing.is_reindexing() and not force:
            raise CommandError('Indexation already occurring - use --force '
                               'to bypass')
        elif force:
            unflag_database()

        chain = None
        old_indexes = []
        for ALIAS, INDEXER, CHUNK_SIZE in INDEXES:
            # Get the old index if it exists.
            try:
                aliases = ES.indices.get_alias(name=ALIAS).keys()
            except elasticsearch.NotFoundError:
                aliases = []
            old_index = aliases[0] if aliases else None
            old_indexes.append(old_index)

            # Create a new index, using the index name with a timestamp.
            new_index = timestamp_index(prefix + ALIAS)

            # See how the index is currently configured.
            if old_index:
                try:
                    s = (ES.indices.get_settings(index=old_index).get(
                        old_index, {}).get('settings', {}))
                except elasticsearch.NotFoundError:
                    s = {}
            else:
                s = {}
            num_replicas = s.get('number_of_replicas',
                                 settings.ES_DEFAULT_NUM_REPLICAS)
            num_shards = s.get('number_of_shards',
                               settings.ES_DEFAULT_NUM_SHARDS)

            # Flag the database to mark as currently indexing.
            if not chain:
                chain = flag_database.si(new_index, old_index, ALIAS)
            else:
                chain |= flag_database.si(new_index, old_index, ALIAS)

            # Create the indexes and mappings.
            # Note: we set num_replicas=0 here to lower load while
            # re-indexing. In a later step we increase it, which results in a
            # more efficient bulk copy in ES. For ES < 0.90 we manually
            # enable compression.
            chain |= create_index.si(new_index, ALIAS, INDEXER, {
                'analysis': INDEXER.get_analysis(),
                'number_of_replicas': 0, 'number_of_shards': num_shards,
                'store.compress.tv': True, 'store.compress.stored': True,
                'refresh_interval': '-1'})

            # Index all the things!
            chain |= run_indexing.si(new_index, INDEXER, CHUNK_SIZE)

            # After indexing we optimize the index, adjust settings, and point
            # alias to the new index.
            chain |= update_alias.si(new_index, old_index, ALIAS, {
                'number_of_replicas': num_replicas, 'refresh_interval': '5s'})

        # Unflag the database to mark as done indexing.
        chain |= unflag_database.si()

        # Delete the old index, if any.
        for old_index in old_indexes:
            if old_index:
                chain |= delete_index.si(old_index)

        # All done!
        chain |= output_summary.si()

        # Ship it.
        self.stdout.write('\nNew index and indexing tasks all queued up.\n')
        os.environ['FORCE_INDEXING'] = '1'
        try:
            chain.apply_async()
        finally:
            del os.environ['FORCE_INDEXING']
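
Example 5's update_alias task presumably finishes by pointing the alias at the new index. The Elasticsearch aliases API lets the remove and the add happen in one atomic request, so searches never hit a moment with no index behind the alias. A sketch of that core call; the wrapper name is hypothetical:

def swap_alias(es, new_index, old_index, alias):
    # Atomically move `alias` from the old physical index to the new one.
    actions = [{'add': {'index': new_index, 'alias': alias}}]
    if old_index:
        actions.insert(0, {'remove': {'index': old_index, 'alias': alias}})
    es.indices.update_aliases(body={'actions': actions})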
Example #6
    def handle(self, *args, **kwargs):
        """Set up reindexing tasks.

        Creates a Tasktree that creates a new index and indexes all objects,
        then points the alias to this new index when finished.
        """
        global INDEXES

        index_choice = kwargs.get('index', None)
        prefix = kwargs.get('prefix', '')
        force = kwargs.get('force', False)

        if index_choice:
            # If we only want to reindex a subset of indexes.
            INDEXES = INDEX_DICT.get(index_choice, INDEXES)

        if Reindexing.is_reindexing() and not force:
            raise CommandError('Indexation already occurring - use --force '
                               'to bypass')
        elif force:
            Reindexing.unflag_reindexing()

        for ALIAS, INDEXER, CHUNK_SIZE in INDEXES:

            chunks, total = chunk_indexing(INDEXER, CHUNK_SIZE)
            if not total:
                _print('No items to queue.', ALIAS)
            else:
                total_chunks = int(ceil(total / float(CHUNK_SIZE)))
                _print('Indexing {total} items into {n} chunks of size {size}'
                       .format(total=total, n=total_chunks, size=CHUNK_SIZE),
                       ALIAS)

            # Get the old index if it exists.
            try:
                aliases = ES.indices.get_alias(name=ALIAS).keys()
            except elasticsearch.NotFoundError:
                aliases = []
            old_index = aliases[0] if aliases else None

            # Create a new index, using the index name with a timestamp.
            new_index = timestamp_index(prefix + ALIAS)

            # See how the index is currently configured.
            if old_index:
                try:
                    s = (ES.indices.get_settings(index=old_index).get(
                        old_index, {}).get('settings', {}))
                except elasticsearch.NotFoundError:
                    s = {}
            else:
                s = {}
            num_replicas = s.get('number_of_replicas',
                                 settings.ES_DEFAULT_NUM_REPLICAS)
            num_shards = s.get('number_of_shards',
                               settings.ES_DEFAULT_NUM_SHARDS)

            pre_task = pre_index.si(
                new_index, old_index, ALIAS, INDEXER, {
                    'analysis': INDEXER.get_analysis(),
                    'number_of_replicas': 0,
                    'number_of_shards': num_shards,
                    'store.compress.tv': True,
                    'store.compress.stored': True,
                    'refresh_interval': '-1'
                })
            post_task = post_index.si(new_index, old_index, ALIAS, INDEXER, {
                'number_of_replicas': num_replicas,
                'refresh_interval': '5s'
            })

            # Ship it.
            if not total:
                # If there's no data we still create the index and alias.
                chain(pre_task, post_task).apply_async()
            else:
                index_tasks = [
                    run_indexing.si(new_index, INDEXER, chunk)
                    for chunk in chunks
                ]
                chain(pre_task, chord(header=index_tasks,
                                      body=post_task)).apply_async()

        _print('New index and indexing tasks all queued up.')
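
Lastly, run_indexing takes the new index plus either the mapping type name and an ID chunk (examples 1 and 3), the indexer and an ID chunk (examples 4 and 6), or the indexer and a chunk size (example 5). A plausible body for the chunked indexer variant; get_model() and index() are assumed indexer methods, not taken from the real source:

from celery import Celery

app = Celery('reindex_demo')  # assumes a configured broker in real use


@app.task
def run_indexing_sketch(index, indexer, chunk):
    # Hypothetical: load the objects in this ID chunk and hand each one
    # to the indexer, targeting the freshly created physical index.
    for obj in indexer.get_model().objects.filter(id__in=chunk):
        indexer.index(obj, index=index)  # assumed indexer API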