def _chunk_queryset_into_tasks(self, items, count, chunksize=5000,
                               bundle_size=250):
    """Chunks the queryset passed in, and dispatches it to Celery for
    adding to the index.

    Potential performance improvements:
     - Postgres is quiescent while Solr is popping tasks from Celery;
       instead, it should be fetching the next 1,000.
     - The wait loop (while not result.ready()) polls for the results at
       a 1s interval. Could this be reduced or somehow eliminated while
       keeping Celery's task list from running away?
    """
    processed_count = 0
    subtasks = []
    item_bundle = []
    for item in items:
        last_item = (count == processed_count + 1)
        if self.verbosity >= 2:
            self.stdout.write('Indexing item %s' % item.pk)

        item_bundle.append(item)
        if (processed_count % bundle_size == 0) or last_item:
            # Every bundle_size documents we create a subtask
            subtasks.append(
                add_or_update_items.subtask((item_bundle, self.solr_url)))
            item_bundle = []
        processed_count += 1

        if (processed_count % chunksize == 0) or last_item:
            # Every chunksize items, we send the subtasks for processing
            job = TaskSet(tasks=subtasks)
            result = job.apply_async()
            while not result.ready():
                time.sleep(1)
            subtasks = []

        if (processed_count % 50000 == 0) or last_item:
            # Do a commit every 50000 items, for good measure.
            self.stdout.write("...running commit command...")
            self.si.commit()

        sys.stdout.write("\rProcessed {}/{} ({:.0%})".format(
            processed_count,
            count,
            processed_count * 1.0 / count,
        ))
        self.stdout.flush()
    self.stdout.write('\n')
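# For context, a minimal sketch of the Celery task the bundles above are
# dispatched to. The real add_or_update_items is defined elsewhere in the
# project; the pysolr client and the item.as_search_dict() conversion used
# here are assumptions for illustration only.
import pysolr
from celery.task import task


@task
def add_or_update_items(items, solr_url):
    """Add or update a bundle of items in the Solr index at solr_url."""
    conn = pysolr.Solr(solr_url)
    # Convert each model instance to a flat document dict and post the whole
    # bundle in one request; committing is left to the calling command.
    conn.add([item.as_search_dict() for item in items], commit=False)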
def _chunk_queryset_into_tasks(self, items, count, chunksize=5000,
                               bundle_size=250):
    """Chunks the queryset passed in, and dispatches it to Celery for
    adding to the index.

    Potential performance improvements:
     - Postgres is quiescent while Solr is popping tasks from Celery;
       instead, it should be fetching the next 1,000.
    """
    processed_count = 0
    subtasks = []
    item_bundle = []
    for item in items:
        last_item = (count == processed_count + 1)
        if self.verbosity >= 2:
            self.stdout.write('Indexing item %s' % item.pk)

        item_bundle.append(item)
        if (len(item_bundle) >= bundle_size) or last_item:
            # Every bundle_size documents we create a subtask
            subtasks.append(
                add_or_update_items.subtask((item_bundle, self.solr_url)))
            item_bundle = []
        processed_count += 1

        if (len(subtasks) >= chunksize) or last_item:
            # Once chunksize subtasks have accumulated, send them for
            # processing
            job = TaskSet(tasks=subtasks)
            job.apply_async().join()
            subtasks = []

        if (processed_count % 50000 == 0) or last_item:
            # Do a commit every 50000 items, for good measure.
            self.stdout.write("...running commit command...")
            self.si.commit()

        sys.stdout.write("\rProcessed {}/{} ({:.0%})".format(
            processed_count,
            count,
            processed_count * 1.0 / count,
        ))
        self.stdout.flush()
    self.stdout.write('\n')
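# A rough sketch of how a management command's handle() might drive the
# helper above; the Document model and the verbosity handling are assumptions
# shown only to illustrate the expected arguments. Iterating with .iterator()
# keeps memory flat by fetching rows in database-side batches rather than
# materializing the whole queryset up front (it does not, by itself, keep
# Postgres busy while the TaskSet runs, so the docstring's note still stands).
def handle(self, *args, **options):
    self.verbosity = int(options.get('verbosity', 1))
    qs = Document.objects.all()  # hypothetical model
    self._chunk_queryset_into_tasks(qs.iterator(), qs.count())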