def _index_img(self, img): """Index a single img and ensure that it's been propagated to the search engine""" image = search.db_image_to_index(img) image.save() index = Index(name=settings.ELASTICSEARCH_INDEX) index.flush(force=True) index.refresh()
def test_retrieve(self):
    """It should be possible to retrieve a database item by auto-id."""
    image = search.db_image_to_index(self.img1)
    image.save()
    id_ = image.meta.id
    image2 = search.Image.get(id=id_)
    assert image2.meta.id == id_
def insert_image(chunk_size, max_results=5000, from_file=None):
    count = 0
    success_count = 0
    es = search.init()
    search.Image.init()
    mapping = search.Image._doc_type.mapping
    mapping.save(settings.ELASTICSEARCH_INDEX)
    for chunk in grouper_it(chunk_size, import_from_file(from_file)):
        # When loading from a file, ignore max_results and load everything
        if not from_file and count >= max_results:
            break
        else:
            images = []
            for result in chunk:
                images.append(result)
            if len(images) > 0:
                try:
                    # Bulk update the search engine too
                    search_objs = [
                        search.db_image_to_index(img).to_dict(include_meta=True)
                        for img in images
                    ]
                    models.Image.objects.bulk_create(images)
                    helpers.bulk(es, search_objs)
                    log.debug("*** Committed set of %d images", len(images))
                    success_count += len(images)
                except IntegrityError as e:
                    log.warning("Got one or more integrity errors on batch: %s", e)
                finally:
                    count += len(images)
    return success_count
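# grouper_it is used above but not defined in this module. A minimal sketch,
# assuming it lazily yields iterators over successive chunks of at most n
# items (the standard itertools chunking recipe); the real helper may differ.
import itertools


def grouper_it(n, iterable):
    """Yield iterators over successive chunks of up to n items."""
    it = iter(iterable)
    while True:
        chunk_it = itertools.islice(it, n)
        try:
            first = next(chunk_it)
        except StopIteration:
            return
        yield itertools.chain((first,), chunk_it)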
def _update_search_index(img):
    # FIXME This may result in a lot of concurrent requests during batch
    # updates; in those cases consider unregistering this signal and manually
    # batching requests (note that Django's bulk_create will not fire this
    # signal, which is good)
    search_obj = search.db_image_to_index(img)
    if search_obj.removed_from_source:
        log.debug("Removing image %s from search index", img.identifier)
        search_obj.delete(ignore=404)
    else:
        log.debug("Indexing image %s", img.identifier)
        search_obj.save()
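# _update_search_index above reads like a Django post_save receiver. A minimal
# sketch of how such a handler might be registered, assuming the standard
# Django signals API; the real wiring may differ.
from django.db.models.signals import post_save
from django.dispatch import receiver


@receiver(post_save, sender=models.Image)
def _on_image_saved(sender, instance, **kwargs):
    _update_search_index(instance)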
def do_index(start, chunk_size):
    end = start + chunk_size + 1
    batches = []
    retries = 0
    try:
        es = search.init(timeout=2000)
        if not settings.DEBUG:
            es.cluster.health(wait_for_status='green', request_timeout=2000)
    except (requests.exceptions.ReadTimeout,
            elasticsearch.exceptions.TransportError) as e:
        log.warning(e)
        log.warning("Skipping batch and retrying after wait")
        time.sleep(RETRY_WAIT)
        return
    log.debug("Starting index in range from %d to %d...", start, end)
    qs = models.Image.objects.filter(
        removed_from_source=False, id__gt=start).order_by('id')[0:chunk_size]
    for db_image in server_cursor_query(qs, chunk_size=chunk_size):
        log.debug("Indexing database record %s", db_image.identifier)
        image = search.db_image_to_index(db_image)
        try:
            # Append first so the current record is never dropped when a
            # full batch is flushed
            batches.append(image.to_dict(include_meta=True))
            if len(batches) >= chunk_size:
                if not settings.DEBUG:
                    log.debug("Waiting for green status...")
                    es.cluster.health(wait_for_status='green',
                                      request_timeout=2000)
                helpers.bulk(es, batches)
                log.debug("Pushed batch of %d records to ES", len(batches))
                batches = []  # Clear the batch
        except (requests.exceptions.ReadTimeout,
                elasticsearch.exceptions.TransportError,
                elasticsearch.helpers.BulkIndexError) as e:
            if retries < MAX_CONNECTION_RETRIES:
                log.warning("Got timeout: retrying with %d retries remaining",
                            MAX_CONNECTION_RETRIES - retries)
                retries += 1
                time.sleep(RETRY_WAIT)
            else:
                raise
    # Flush any remaining records that didn't fill a complete batch
    helpers.bulk(es, batches)
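# server_cursor_query is referenced above but not defined in this module. A
# minimal sketch, assuming Django >= 2.0 on PostgreSQL, where
# QuerySet.iterator(chunk_size=...) streams rows through a server-side cursor
# instead of materializing the whole result set in memory; the real helper may
# use a named psycopg2 cursor directly.
def server_cursor_query(queryset, chunk_size=1000):
    return queryset.iterator(chunk_size=chunk_size)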
def insert_image(walk_func, serialize_func, chunk_size, max_results=5000,
                 **kwargs):
    count = 0
    success_count = 0
    es = search.init()
    search.Image.init()
    mapping = search.Image._doc_type.mapping
    mapping.save(settings.ELASTICSEARCH_INDEX)
    for chunk in grouper_it(chunk_size, walk_func(**kwargs)):
        if max_results is not None and count >= max_results:
            break
        else:
            images = []
            for result in chunk:
                image = serialize_func(result)
                if image:
                    images.append(image)
            if len(images) > 0:
                try:
                    # Bulk update the search engine too
                    if not settings.DEBUG:
                        es.cluster.health(wait_for_status='green',
                                          request_timeout=2000)
                    search_objs = [
                        search.db_image_to_index(img).to_dict(include_meta=True)
                        for img in images
                    ]
                    elasticsearch.helpers.bulk(es, search_objs)
                    models.Image.objects.bulk_create(images)
                    log.debug("*** Committed set of %d images", len(images))
                    success_count += len(images)
                except (requests.exceptions.ReadTimeout,
                        elasticsearch.exceptions.TransportError,
                        elasticsearch.helpers.BulkIndexError,
                        IntegrityError) as e:
                    log.warning("Got one or more errors on batch: %s", e)
                finally:
                    count += len(images)
    return success_count
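# A hypothetical usage sketch for the generalized insert_image above, assuming
# walk_func yields raw result dicts and serialize_func turns each one into a
# models.Image (or None to skip it). walk_records, serialize_record, and the
# Image field names are illustrative assumptions, not part of the module.
def walk_records(records=()):
    # Stand-in for a provider crawl; extra kwargs to insert_image land here
    yield from records


def serialize_record(record):
    if not record.get('url'):
        return None
    return models.Image(identifier=record['id'], url=record['url'])


inserted = insert_image(
    walk_records, serialize_record, chunk_size=500,
    records=[{'id': 'abc123', 'url': 'https://example.com/1.jpg'}])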
def test_store(self):
    """It should be possible to index a database item."""
    image = search.db_image_to_index(self.img1)
    image.save()
def _index_img(self, img): """Index a single img and ensure that it's been propagated to the search engine""" image = search.db_image_to_index(img) image.save() self.es.indices.refresh(force=True)