Example #1
    def _chunk_queryset_into_tasks(self,
                                   items,
                                   count,
                                   chunksize=5000,
                                   bundle_size=250):
        """Chunks the queryset passed in, and dispatches it to Celery for
        adding to the index.

        Potential performance improvements:
         - Postgres is quiescent while Solr is popping tasks from Celery;
           instead, it should be fetching the next 1,000 items.
         - The wait loop (while not result.ready()) polls for the results at
           a 1s interval. Could this be reduced or somehow eliminated while
           keeping Celery's task list from running away?
        """
        processed_count = 0
        subtasks = []
        item_bundle = []
        for item in items:
            last_item = (count == processed_count + 1)
            if self.verbosity >= 2:
                self.stdout.write('Indexing item %s' % item.pk)

            item_bundle.append(item)
            if (processed_count % bundle_size == 0) or last_item:
                # Every bundle_size documents, we create a subtask. Note this
                # also fires on the first item, since 0 % bundle_size == 0.
                subtasks.append(
                    add_or_update_items.subtask((item_bundle, self.solr_url)))
                item_bundle = []
            processed_count += 1

            if (processed_count % chunksize == 0) or last_item:
                # Every chunksize items, we send the subtasks for processing
                job = TaskSet(tasks=subtasks)
                result = job.apply_async()
                while not result.ready():
                    time.sleep(1)
                subtasks = []

            if (processed_count % 50000 == 0) or last_item:
                # Do a commit every 50000 items, for good measure.
                self.stdout.write("...running commit command...")
                self.si.commit()

            sys.stdout.write("\rProcessed {}/{} ({:.0%})".format(
                processed_count,
                count,
                processed_count * 1.0 / count,
            ))
            sys.stdout.flush()
        self.stdout.write('\n')
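The listings never show the add_or_update_items task that each bundle is handed
to. A minimal sketch of what such a task might look like is below, assuming a
pre-4.0 Celery (the listings use TaskSet, which was removed in Celery 4.0); the
solr_connection_for() and make_solr_document() helpers are hypothetical
placeholders for whatever Solr client and document-building code the real
module uses, not its actual API.

    from celery import shared_task

    @shared_task
    def add_or_update_items(items, solr_url):
        """Add or update one bundle of items in the Solr index at solr_url."""
        si = solr_connection_for(solr_url)   # hypothetical: open a Solr client
        docs = [make_solr_document(item)     # hypothetical: model -> Solr doc
                for item in items]
        si.add(docs)                         # hand the whole bundle to Solr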
Example #2
    def _chunk_queryset_into_tasks(self, items, count, chunksize=5000,
                                   bundle_size=250):
        """Chunks the queryset passed in, and dispatches it to Celery for
        adding to the index.

        Potential performance improvements:
         - Postgres is quiescent while Solr is popping tasks from Celery;
           instead, it should be fetching the next 1,000 items.
         - The wait loop (while not result.ready()) polls for the results at
           a 1s interval. Could this be reduced or somehow eliminated while
           keeping Celery's task list from running away?
        """
        processed_count = 0
        subtasks = []
        item_bundle = []
        for item in items:
            last_item = (count == processed_count + 1)
            if self.verbosity >= 2:
                self.stdout.write('Indexing item %s' % item.pk)

            item_bundle.append(item)
            if (processed_count % bundle_size == 0) or last_item:
                # Every bundle_size documents, we create a subtask. Note this
                # also fires on the first item, since 0 % bundle_size == 0.
                subtasks.append(add_or_update_items.subtask(
                    (item_bundle, self.solr_url)))
                item_bundle = []
            processed_count += 1

            if (processed_count % chunksize == 0) or last_item:
                # Every chunksize items, we send the subtasks for processing
                job = TaskSet(tasks=subtasks)
                result = job.apply_async()
                while not result.ready():
                    time.sleep(1)
                subtasks = []

            if (processed_count % 50000 == 0) or last_item:
                # Do a commit every 50000 items, for good measure.
                self.stdout.write("...running commit command...")
                self.si.commit()

            sys.stdout.write("\rProcessed {}/{} ({:.0%})".format(
                processed_count,
                count,
                processed_count * 1.0 / count,
            ))
            sys.stdout.flush()
        self.stdout.write('\n')
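The second docstring bullet asks whether the one-second polling loop can be
removed. One way to do that, and the route the two listings below take, is to
block on the TaskSetResult returned by apply_async() instead of hand-polling
it; a minimal sketch of just that step:

    job = TaskSet(tasks=subtasks)
    result = job.apply_async()
    # join() blocks until every subtask in the set has finished, so the
    # explicit "while not result.ready(): time.sleep(1)" loop goes away. It
    # also re-raises any exception a subtask hit, which the polling version
    # silently swallows.
    result.join()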
    def _chunk_queryset_into_tasks(self,
                                   items,
                                   count,
                                   chunksize=5000,
                                   bundle_size=250):
        """Chunks the queryset passed in, and dispatches it to Celery for
        adding to the index.

        Potential performance improvements:
         - Postgres is quiescent while Solr is popping tasks from Celery;
           instead, it should be fetching the next 1,000 items.
        """
        processed_count = 0
        subtasks = []
        item_bundle = []
        for item in items:
            last_item = (count == processed_count + 1)
            if self.verbosity >= 2:
                self.stdout.write('Indexing item %s' % item.pk)

            item_bundle.append(item)
            if (len(item_bundle) >= bundle_size) or last_item:
                # Once bundle_size documents have accumulated, create a subtask
                subtasks.append(
                    add_or_update_items.subtask((item_bundle, self.solr_url)))
                item_bundle = []
            processed_count += 1

            if (len(subtasks) * bundle_size >= chunksize) or last_item:
                # Once the accumulated subtasks cover chunksize items, send
                # them off for processing
                job = TaskSet(tasks=subtasks)
                job.apply_async().join()
                subtasks = []

            if (processed_count % 50000 == 0) or last_item:
                # Do a commit every 50000 items, for good measure.
                self.stdout.write("...running commit command...")
                self.si.commit()

            sys.stdout.write("\rProcessed {}/{} ({:.0%})".format(
                processed_count,
                count,
                processed_count * 1.0 / count,
            ))
            sys.stdout.flush()
        self.stdout.write('\n')
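The remaining docstring bullet, Postgres sitting idle while a chunk is being
indexed, is not addressed by any of the listings. One possible change, sketched
here rather than taken from the actual code, is to hold on to the previous
chunk's result and only join it after the next chunk has been read from the
database, so the Postgres fetch for chunk N+1 overlaps with the Solr work for
chunk N while still keeping only one chunk in flight. The dispatch_chunks
helper and the lazy chunk generator it expects are hypothetical:

    def dispatch_chunks(chunks):
        """Dispatch each chunk of subtasks while the next chunk is being built.

        `chunks` must be a lazy iterable that yields lists of Celery
        signatures; pulling the next list from it is what performs the next
        database read.
        """
        previous_result = None
        for subtasks in chunks:              # reading chunk N+1 happens here,
            if previous_result is not None:  # while chunk N is still indexing
                previous_result.join()
            # TaskSet: the same class the listings use (import not shown there)
            previous_result = TaskSet(tasks=subtasks).apply_async()
        if previous_result is not None:
            previous_result.join()           # drain the final chunk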
    def _chunk_queryset_into_tasks(self, items, count, chunksize=5000,
                                   bundle_size=250):
        """Chunks the queryset passed in, and dispatches it to Celery for
        adding to the index.

        Potential performance improvements:
         - Postgres is quiescent while Solr is popping tasks from Celery;
           instead, it should be fetching the next 1,000 items.
        """
        processed_count = 0
        subtasks = []
        item_bundle = []
        for item in items:
            last_item = (count == processed_count + 1)
            if self.verbosity >= 2:
                self.stdout.write('Indexing item %s' % item.pk)

            item_bundle.append(item)
            if (len(item_bundle) >= bundle_size) or last_item:
                # Once bundle_size documents have accumulated, create a subtask
                subtasks.append(
                    add_or_update_items.subtask((item_bundle, self.solr_url))
                )
                item_bundle = []
            processed_count += 1

            if (len(subtasks) * bundle_size >= chunksize) or last_item:
                # Once the accumulated subtasks cover chunksize items, send
                # them off for processing
                job = TaskSet(tasks=subtasks)
                job.apply_async().join()
                subtasks = []

            if (processed_count % 50000 == 0) or last_item:
                # Do a commit every 50000 items, for good measure.
                self.stdout.write("...running commit command...")
                self.si.commit()

            sys.stdout.write("\rProcessed {}/{} ({:.0%})".format(
                processed_count,
                count,
                processed_count * 1.0 / count,
            ))
            sys.stdout.flush()
        self.stdout.write('\n')
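One last note on the Celery API used throughout: TaskSet was deprecated during
the Celery 3.x series and removed in 4.0, so on a current Celery the
dispatch-and-wait step would be written with group instead. A minimal sketch of
the equivalent call, reusing the subtasks list the listings build:

    from celery import group

    # group() accepts the same list of signatures the listings create with
    # add_or_update_items.subtask(...); today .s(...) is the usual shorthand.
    result = group(subtasks).apply_async()
    result.join()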