def tearDown(self):
    # Cleanup to remove these from the index.
    self.app1.delete()
    self.app2.delete()
    unindex_webapps([self.app1.id, self.app2.id])
    # Required to purge the suggestions data structure. In Lucene, a
    # document is not deleted from a segment, just marked as deleted.
    WebappIndexer.get_es().optimize(WebappIndexer.get_index(),
                                    only_expunge_deletes=True)
def index_webapp(ids, **kw):
    index = kw.pop('index', None) or ALIAS
    sys.stdout.write('Indexing %s apps' % len(ids))

    qs = Webapp.indexing_transformer(Webapp.uncached.filter(id__in=ids))
    docs = [WebappIndexer.extract_document(obj.id, obj=obj) for obj in qs]
    WebappIndexer.bulk_index(docs, es=ES, index=index)
def index_webapp(ids, **kw): index = kw.pop("index", None) or ALIAS sys.stdout.write("Indexing %s apps" % len(ids)) qs = Webapp.indexing_transformer(Webapp.with_deleted.no_cache().filter(id__in=ids)) docs = [] for obj in qs: try: docs.append(WebappIndexer.extract_document(obj.id, obj=obj)) except: sys.stdout.write("Failed to index obj: {0}".format(obj.id)) WebappIndexer.bulk_index(docs, es=ES, index=index)
def index_webapps(ids, **kw):
    task_log.info('Indexing apps %s-%s. [%s]' % (ids[0], ids[-1], len(ids)))

    index = kw.pop('index', WebappIndexer.get_index())
    # Note: If reindexing is currently occurring, `get_indices` will return
    # more than one index.
    indices = get_indices(index)

    es = WebappIndexer.get_es(urls=settings.ES_URLS)
    qs = Webapp.indexing_transformer(Webapp.uncached.filter(id__in=ids))
    for obj in qs:
        doc = WebappIndexer.extract_document(obj.id, obj)
        for idx in indices:
            WebappIndexer.index(doc, id_=obj.id, es=es, index=idx)
def index_webapp(ids, **kw):
    index = kw.pop('index', None) or ALIAS
    sys.stdout.write('Indexing %s apps' % len(ids))

    qs = Webapp.indexing_transformer(Webapp.uncached.filter(id__in=ids))
    docs = []
    for obj in qs:
        try:
            docs.append(WebappIndexer.extract_document(obj.id, obj=obj))
        except Exception:
            # Log and skip apps that fail extraction rather than aborting
            # the whole bulk index.
            sys.stdout.write('Failed to index obj: {0}'.format(obj.id))
    WebappIndexer.bulk_index(docs, es=ES, index=index)
def index_webapp(ids, **kw):
    index = kw.pop('index', None) or ALIAS
    sys.stdout.write('Indexing %s apps' % len(ids))

    qs = Webapp.indexing_transformer(
        Webapp.with_deleted.no_cache().filter(id__in=ids))
    docs = []
    for obj in qs:
        try:
            docs.append(WebappIndexer.extract_document(obj.id, obj=obj))
        except Exception:
            # Log and skip apps that fail extraction rather than aborting
            # the whole bulk index.
            sys.stdout.write('Failed to index obj: {0}'.format(obj.id))
    WebappIndexer.bulk_index(docs, es=ES, index=index)
def create_index(new_index, alias, settings):
    """Creates a mapping for the new index.

    - new_index: new index name
    - alias: alias name
    - settings: a dictionary of settings

    """
    sys.stdout.write('Create the mapping for index %r, alias: %r' %
                     (new_index, alias))

    # Update settings with mapping.
    settings = {
        'settings': settings,
        'mappings': WebappIndexer.get_mapping(),
    }

    # Create index and mapping.
    try:
        ES.create_index(new_index, settings)
    except pyelasticsearch.exceptions.IndexAlreadyExistsError:
        raise CommandError('New index [%s] already exists' % new_index)

    # Don't return until the health is green. By default waits for 30s.
    ES.health(new_index, wait_for_status='green',
              wait_for_relocating_shards=0)
def test_mapping_properties(self):
    # Spot check a few of the key properties.
    mapping = WebappIndexer.get_mapping()
    keys = mapping['webapp']['properties'].keys()
    for k in ('id', 'app_slug', 'category', 'default_locale', 'description',
              'device', 'features', 'name', 'status'):
        ok_(k in keys, 'Key %s not found in mapping properties' % k)
class ESTestCase(TestCase):
    """Base class for tests that require elasticsearch."""
    # ES is slow to set up so this uses class setup/teardown. That happens
    # outside Django transactions so be careful to clean up afterwards.
    test_es = True
    mock_es = False
    exempt_from_fixture_bundling = True  # ES doesn't support bundling (yet?)

    @classmethod
    def setUpClass(cls):
        if not settings.RUN_ES_TESTS:
            raise SkipTest('ES disabled')

        cls.es = amo.search.get_es(timeout=settings.ES_TIMEOUT)

        # The ES settings are patched before we call super() because
        # indexing may already happen in parent classes.
        for key, index in settings.ES_INDEXES.items():
            if not index.startswith('test_'):
                settings.ES_INDEXES[key] = 'test_%s_%s' % (
                    'mkt' if settings.MARKETPLACE else 'amo', index)

        super(ESTestCase, cls).setUpClass()

        try:
            cls.es.cluster_health()
        except Exception, e:
            e.args = tuple(
                [u'%s (it looks like ES is not running, try starting it '
                 'or set RUN_ES_TESTS=False)' % e.args[0]] +
                list(e.args[1:]))
            raise

        cls._SEARCH_ANALYZER_MAP = amo.SEARCH_ANALYZER_MAP
        amo.SEARCH_ANALYZER_MAP = {
            'english': ['en-us'],
            'spanish': ['es'],
        }

        for index in set(settings.ES_INDEXES.values()):
            # Get the index that's pointed to by the alias.
            try:
                indices = cls.es.get_alias(index)
                index = indices[0]
            except IndexError:
                # There's no alias, just use the index.
                print 'Found no alias for %s.' % index
            except (pyes.IndexMissingException,
                    pyelasticsearch.ElasticHttpNotFoundError):
                pass

            # Remove the index (and any alias pointing at it) as well.
            try:
                cls.es.delete_index(index)
            except (pyes.IndexMissingException,
                    pyelasticsearch.ElasticHttpNotFoundError) as exc:
                print 'Could not delete index %r: %s' % (index, exc)

        addons.search.setup_mapping()
        stats.search.setup_indexes()
        if settings.MARKETPLACE:
            WebappIndexer.setup_mapping()
def get(self, request, *args, **kwargs):
    limit = request.GET.get('limit', 5)
    es_query = {
        'apps': {
            'completion': {'field': 'name_suggest', 'size': limit},
            'text': request.GET.get('q', '').strip()
        }
    }

    results = S(WebappIndexer).get_es().send_request(
        'GET', [WebappIndexer.get_index(), '_suggest'], body=es_query)

    if 'apps' in results:
        data = results['apps'][0]['options']
    else:
        data = []
    serializer = self.get_serializer(data)
    # This returns a JSON list. Usually this is a bad idea for security
    # reasons, but we don't include any user-specific data, it's fully
    # anonymous, so we're fine.
    return HttpResponse(json.dumps(serializer.data),
                        content_type='application/x-rocketbar+json')
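For context, the `_suggest` call above returns one entry per named suggester, and the view only reads `results['apps'][0]['options']`. Below is a minimal sketch of the request/response shape, assuming the standard Elasticsearch completion suggester; only 'apps', 'text', and 'options' come from the view itself, the other fields and the payload contents are illustrative assumptions.

# Hedged sketch of the completion-suggest round trip assumed by the view
# above; response fields other than 'apps' and 'options' follow the generic
# Elasticsearch completion suggester format and are not taken from this code.
es_query = {
    'apps': {
        'completion': {'field': 'name_suggest', 'size': 5},
        'text': 'calc',
    }
}

results = {
    '_shards': {'total': 5, 'successful': 5, 'failed': 0},
    'apps': [{
        'text': 'calc',
        'offset': 0,
        'length': 4,
        'options': [
            {'text': 'Calculator', 'score': 1.0,
             'payload': {'id': 42, 'slug': 'calculator'}},  # illustrative
        ],
    }],
}

data = results['apps'][0]['options'] if 'apps' in results else []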
def unindex_webapps(ids, **kw):
    task_log.info('Un-indexing apps %s-%s. [%s]' % (ids[0], ids[-1],
                                                    len(ids)))

    index = kw.pop('index', WebappIndexer.get_index())
    # Note: If reindexing is currently occurring, `get_indices` will return
    # more than one index.
    indices = get_indices(index)

    es = WebappIndexer.get_es(urls=settings.ES_URLS)
    for id_ in ids:
        for idx in indices:
            try:
                WebappIndexer.unindex(id_=id_, es=es, index=idx)
            except ElasticHttpNotFoundError:
                # Ignore if it's not there.
                task_log.info(u'[Webapp:%s] Unindexing app but not found '
                              u'in index' % id_)
def handle(self, *args, **kwargs):
    index = WebappIndexer.get_index()
    doctype = WebappIndexer.get_mapping_type_name()
    es = WebappIndexer.get_es()

    apps = Webapp.objects.values_list('id', flat=True)

    missing_ids = []
    for app in apps:
        try:
            res = es.get(index, doctype, app, fields='id')
        except ElasticHttpNotFoundError:
            # App doesn't exist in our index, add it to `missing_ids`.
            missing_ids.append(app)

    if missing_ids:
        sys.stdout.write('Adding %s doc(s) to the index.'
                         % len(missing_ids))
        index_webapps.delay(missing_ids)
    else:
        sys.stdout.write('No docs missing from index.')
def run_indexing(index):
    """Index the objects.

    - index: name of the index

    Note: Our ES doc sizes are about 5k in size. Chunking by 100 sends
    ~500kb of data to ES at a time.

    TODO: Use celery chords here to parallelize these indexing chunks. This
    requires celery 3 (bug 825938).

    """
    sys.stdout.write('Indexing apps into index: %s' % index)

    qs = WebappIndexer.get_indexable()
    for chunk in chunked(list(qs), 100):
        index_webapp(chunk, index=index)
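The `chunked()` helper used above isn't defined in this section; the sketch below shows the behavior it is assumed to have (yielding fixed-size slices of a sequence). The real helper in the codebase may differ, for example by accepting arbitrary iterables.

# Minimal sketch of the chunked() helper assumed by run_indexing(): yield
# successive fixed-size slices of a list.
def chunked(seq, n):
    for i in range(0, len(seq), n):
        yield seq[i:i + n]

# Example: list(chunked([0, 1, 2, 3, 4, 5, 6], 3)) -> [[0, 1, 2], [3, 4, 5], [6]]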
def test_q_num_requests(self):
    es = WebappIndexer.get_es()
    orig_search = es.search
    es.counter = 0

    def monkey_search(*args, **kwargs):
        es.counter += 1
        return orig_search(*args, **kwargs)

    es.search = monkey_search

    res = self.client.get(self.url, data={'q': 'something'})
    eq_(res.status_code, 200)
    obj = res.json['objects'][0]
    eq_(obj['slug'], self.webapp.app_slug)

    # Verify only one search call was made.
    eq_(es.counter, 1)

    es.search = orig_search
def test_q_num_requests_no_results(self):
    es = WebappIndexer.get_es()
    orig_search = es.search
    es.counter = 0

    def monkey_search(*args, **kwargs):
        es.counter += 1
        return orig_search(*args, **kwargs)

    es.search = monkey_search

    res = self.client.get(self.url, data={'q': 'noresults'})
    eq_(res.status_code, 200)
    eq_(res.json['meta']['total_count'], 0)
    eq_(len(res.json['objects']), 0)

    # Verify only one search call was made.
    eq_(es.counter, 1)

    es.search = orig_search
def test_q_num_requests(self):
    es = WebappIndexer.get_es()
    orig_search = es.search
    es.counter = 0

    def monkey_search(*args, **kwargs):
        es.counter += 1
        return orig_search(*args, **kwargs)

    es.search = monkey_search

    res = self.client.get(self.url, data={"q": "something"})
    eq_(res.status_code, 200)
    eq_(res.json["meta"]["total_count"], 1)
    eq_(len(res.json["objects"]), 1)
    obj = res.json["objects"][0]
    eq_(obj["slug"], self.webapp.app_slug)

    # Verify only one search call was made.
    eq_(es.counter, 1)

    es.search = orig_search
def test_mapping(self):
    mapping = WebappIndexer.get_mapping()
    eq_(mapping.keys(), ['webapp'])
    eq_(mapping['webapp']['_all'], {'enabled': False})
    eq_(mapping['webapp']['_boost'],
        {'name': '_boost', 'null_value': 1.0})
def _get_doc(self):
    qs = Webapp.indexing_transformer(
        Webapp.uncached.filter(id__in=[self.app.pk]))
    obj = qs[0]
    return obj, WebappIndexer.extract_document(obj.pk, obj)
def handle(self, *args, **kwargs):
    """Set up reindexing tasks.

    Creates a Tasktree that creates a new index and indexes all objects,
    then points the alias to this new index when finished.
    """
    if not settings.MARKETPLACE:
        raise CommandError('This command affects only marketplace and '
                           'should be run under Marketplace settings.')

    force = kwargs.get('force', False)
    prefix = kwargs.get('prefix', '')

    if database_flagged() and not force:
        raise CommandError('Indexation already occurring - use --force to '
                           'bypass')
    elif force:
        unflag_database()

    # The list of indexes that is currently aliased by `ALIAS`.
    try:
        aliases = ES.aliases(ALIAS).keys()
    except pyelasticsearch.exceptions.ElasticHttpNotFoundError:
        aliases = []
    old_index = aliases[0] if aliases else None

    # Create a new index, using the index name with a timestamp.
    new_index = timestamp_index(prefix + ALIAS)

    # See how the index is currently configured.
    if old_index:
        try:
            s = (ES.get_settings(old_index).get(old_index, {})
                 .get('settings', {}))
        except pyelasticsearch.exceptions.ElasticHttpNotFoundError:
            s = {}
    else:
        s = {}

    num_replicas = s.get('number_of_replicas',
                         settings.ES_DEFAULT_NUM_REPLICAS)
    num_shards = s.get('number_of_shards', settings.ES_DEFAULT_NUM_SHARDS)

    # Flag the database.
    chain = flag_database.si(new_index, old_index, ALIAS)

    # Create the index and mapping.
    #
    # Note: We set num_replicas=0 here to decrease load while re-indexing.
    # In a later step we increase it, which results in a more efficient bulk
    # copy in Elasticsearch. For ES < 0.90 we manually enable compression.
    chain |= create_index.si(new_index, ALIAS, {
        'analysis': WebappIndexer.get_analysis(),
        'number_of_replicas': 0,
        'number_of_shards': num_shards,
        'store.compress.tv': True,
        'store.compress.stored': True,
        'refresh_interval': '-1'})

    # Index all the things!
    chain |= run_indexing.si(new_index)

    # After indexing we optimize the index, adjust settings, and point the
    # alias to the new index.
    chain |= update_alias.si(new_index, old_index, ALIAS, {
        'number_of_replicas': num_replicas,
        'refresh_interval': '5s'})

    # Unflag the database.
    chain |= unflag_database.si()

    # Delete the old index, if any.
    if old_index:
        chain |= delete_index.si(old_index)

    chain |= output_summary.si()

    self.stdout.write('\nNew index and indexing tasks all queued up.\n')
    os.environ['FORCE_INDEXING'] = '1'
    try:
        chain.apply_async()
    finally:
        del os.environ['FORCE_INDEXING']
def test_mapping_type_name(self):
    eq_(WebappIndexer.get_mapping_type_name(), 'webapp')
def test_model(self):
    eq_(WebappIndexer.get_model(), Webapp)
def handle(self, *args, **kwargs):
    """Set up reindexing tasks.

    Creates a Tasktree that creates a new index and indexes all objects,
    then points the alias to this new index when finished.
    """
    force = kwargs.get('force', False)
    prefix = kwargs.get('prefix', '')

    if is_reindexing_mkt() and not force:
        raise CommandError('Indexation already occurring - use --force to '
                           'bypass')
    elif force:
        unflag_database()

    # The list of indexes that is currently aliased by `ALIAS`.
    try:
        aliases = ES.aliases(ALIAS).keys()
    except pyelasticsearch.exceptions.ElasticHttpNotFoundError:
        aliases = []
    old_index = aliases[0] if aliases else None

    # Create a new index, using the index name with a timestamp.
    new_index = timestamp_index(prefix + ALIAS)

    # See how the index is currently configured.
    if old_index:
        try:
            s = (ES.get_settings(old_index).get(old_index, {})
                 .get('settings', {}))
        except pyelasticsearch.exceptions.ElasticHttpNotFoundError:
            s = {}
    else:
        s = {}

    num_replicas = s.get('number_of_replicas',
                         settings.ES_DEFAULT_NUM_REPLICAS)
    num_shards = s.get('number_of_shards', settings.ES_DEFAULT_NUM_SHARDS)

    # Flag the database.
    chain = flag_database.si(new_index, old_index, ALIAS)

    # Create the index and mapping.
    #
    # Note: We set num_replicas=0 here to decrease load while re-indexing.
    # In a later step we increase it, which results in a more efficient bulk
    # copy in Elasticsearch. For ES < 0.90 we manually enable compression.
    chain |= create_index.si(new_index, ALIAS, {
        'analysis': WebappIndexer.get_analysis(),
        'number_of_replicas': 0,
        'number_of_shards': num_shards,
        'store.compress.tv': True,
        'store.compress.stored': True,
        'refresh_interval': '-1'})

    # Index all the things!
    chain |= run_indexing.si(new_index)

    # After indexing we optimize the index, adjust settings, and point the
    # alias to the new index.
    chain |= update_alias.si(new_index, old_index, ALIAS, {
        'number_of_replicas': num_replicas,
        'refresh_interval': '5s'})

    # Unflag the database.
    chain |= unflag_database.si()

    # Delete the old index, if any.
    if old_index:
        chain |= delete_index.si(old_index)

    chain |= output_summary.si()

    self.stdout.write('\nNew index and indexing tasks all queued up.\n')
    os.environ['FORCE_INDEXING'] = '1'
    try:
        chain.apply_async()
    finally:
        del os.environ['FORCE_INDEXING']
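The reindex command above builds its workflow out of immutable celery signatures. The sketch below shows that pattern in isolation: `.si()` creates a signature that ignores the previous task's result, and `|` links signatures into a chain executed in order. The app setup and task names here are placeholders, not the real reindex tasks.

# Minimal, self-contained sketch of the celery chaining pattern used by the
# reindex command; the broker URL and task bodies are illustrative only.
from celery import Celery

app = Celery('reindex_sketch', broker='memory://')

@app.task
def flag_db(new_index, old_index, alias):
    return 'flagged'

@app.task
def run_index(new_index):
    return 'indexed'

chain = flag_db.si('new_idx', 'old_idx', 'apps') | run_index.si('new_idx')
chain.apply_async()  # queues flag_db, then run_index, each ignoring results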
def setUp(self):
    super(TestFixupCommand, self).setUp()
    self.index = WebappIndexer.get_index()
    self.doctype = WebappIndexer.get_mapping_type_name()
    self.es = WebappIndexer.get_es()
    self.app = Webapp.objects.get(pk=337141)
def test_index(self):
    with self.settings(ES_INDEXES={'webapp': 'apps'}):
        eq_(WebappIndexer.get_index(), 'apps')