def test_profilepic_search(self):
    """Make sure searching for only users with profile pics works."""
    # BUG FIX: the photo is binary data, so open it in binary mode --
    # text mode can corrupt the JPEG bytes on some platforms.
    photo_path = os.path.join(os.path.dirname(__file__), 'profile-photo.jpg')
    with open(photo_path, 'rb') as f:
        r = self.mozillian_client.post(reverse('profile.edit'),
                                       dict(first_name='Aman',
                                            last_name='Withapic',
                                            photo=f))

    if not settings.ES_DISABLED:
        get_es().refresh(settings.ES_INDEXES['default'], timesleep=0)

    amanhasapic = 'Aman Withapic'
    amanda = 'Amanda Younger'
    url = reverse('search')
    r = self.mozillian_client.get(url, dict(q='Am'))
    rpp = self.mozillian_client.get(url, dict(q='Am', picture_only=1))
    eq_(r.status_code, 200)
    peeps = r.context['people']
    peeps_pp = rpp.context['people']
    saw_amanda = False

    # Amanda is the fixture user WITHOUT a photo; if she turns up with one
    # the picture_only filter can't be tested meaningfully.
    for person in peeps:
        if person.display_name == amanda:
            if bool(person.photo):
                # BUG FIX: the original message stated the opposite of the
                # condition being checked here.
                self.fail('Amanda should not have a profile pic')
            saw_amanda = True

    # Amanda shows up in the unfiltered results...
    assert amanda in [p.display_name for p in peeps]
    # ...but not in the picture-only results.
    assert amanda not in [p.display_name for p in peeps_pp]
    self.assertEqual(peeps_pp[0].display_name, amanhasapic)
    # BUG FIX: clarified message -- this flag tracks whether Amanda was seen.
    self.assertTrue(saw_amanda, 'We never saw Amanda in the search results')
def setUpClass(cls):
    """Prepare an ES-backed test class.

    Skips the class's tests when ES is unconfigured or unreachable;
    otherwise saves the real settings, enables ES with test-suffixed
    index names, and wipes any leftover test indexes.
    """
    super(ElasticSearchTestCase, cls).setUpClass()

    # No ES configured at all -> skip every test in this class.
    if not getattr(settings, 'ES_URLS', None):
        cls._skip_tests = True
        return

    # ES configured but not answering -> skip as well.
    try:
        get_es().health()
    except (Timeout, ConnectionError):
        cls._skip_tests = True
        return

    # Remember the real settings so teardown can restore them, then
    # enable ES and swap in the test-suffixed index names.
    cls._old_es_disabled = settings.ES_DISABLED
    settings.ES_DISABLED = False
    cls._old_es_indexes = settings.ES_INDEXES
    settings.ES_INDEXES = testify(settings.ES_INDEXES)

    cls.es = get_es()

    # Start from a clean slate; a missing index is not an error.
    for index in settings.ES_INDEXES.values():
        try:
            cls.es.delete_index(index)
        except ElasticHttpNotFoundError:
            pass
def setUpClass(cls):
    """Run the :class:`TestCase` setup to add some data, then flush and
    refresh ES so that data is immediately searchable."""
    elasticutils.tests.ESTestCase.setUpClass()
    TestCase.setUpClass()
    # flush + refresh makes everything indexed so far visible to queries.
    get_es().flush(refresh=True)
def refresh(self):
    """Run all pending index tasks, then refresh the default index.

    A refresh almost always precedes querying, so flush the queue of
    generated indexing tasks before asking ES to refresh.
    """
    from search.models import generate_tasks

    generate_tasks()
    get_es().refresh(settings.ES_INDEXES['default'], timesleep=0)
def refresh(self, timesleep=0):
    """Refresh ``self.index_name`` after indexing.

    :arg timesleep: int; seconds to sleep after telling ES to refresh
    """
    es = get_es()
    es.refresh(self.index_name, timesleep=timesleep)
def refresh(self, timesleep=0):
    """Run pending index tasks, then refresh the write index.

    Refreshing almost always precedes querying, so make sure every
    generated indexing task has actually run first.

    :arg timesleep: int; seconds to sleep after telling ES to refresh
    """
    generate_tasks()
    get_es().refresh(es_utils.WRITE_INDEX, timesleep=timesleep)
def setup_class(cls):
    """Class setup: probe ES and mark the class to skip when it's down.

    Note: TestCase has no setup_class of its own to chain to.
    """
    try:
        get_es().collect_info()
    except pyes.urllib3.MaxRetryError:
        # ES isn't running; flag the tests to be skipped.
        cls.skip_tests = True
def setup_class(cls):
    """Class setup: probe ES health and mark the class to skip when down.

    Note: TestCase has no setup_class of its own to chain to.
    """
    try:
        get_es().health()
    except pyelasticsearch.exceptions.ConnectionError:
        # ES isn't reachable; flag the tests to be skipped.
        cls.skip_tests = True
def index(cls, document, id=None, bulk=False, force_insert=False):
    """Associate ``document`` with ``id`` in ES (wraps pyes.ES.index).

    Example::

        MyModel.index(instance.fields, id=instance.id)
    """
    es = elasticutils.get_es()
    es.index(document,
             index=cls._get_index(),
             doc_type=cls._meta.db_table,
             id=id,
             bulk=bulk,
             force_insert=force_insert)
def test_get_es_force_new(self):
    """force_new should hand back a fresh, uncached ElasticSearch object."""
    cached = get_es()
    fresh = get_es(force_new=True)
    # The forced instance bypasses the cache, so only one entry exists.
    eq_(len(_cached_elasticsearch), 1)
    # But the two objects must still be distinct instances.
    assert cached is not fresh
def unindex(cls, id):
    """Remove a document from the index (no-op unless live indexing)."""
    if not settings.ES_LIVE_INDEXING:
        return
    try:
        elasticutils.get_es().delete(cls.get_es_index(),
                                     cls._meta.db_table, id)
    except pyes.exceptions.NotFoundException:
        # Deleting something that's already gone is fine.
        pass
def index_all(pks, **kw):
    """Reindex the given packages in bulk, flushing once at the end."""
    ids_str = ','.join(map(str, pks))
    log.debug('ES starting bulk action for packages: [%s]' % ids_str)
    for package in Package.objects.filter(pk__in=pks):
        package.refresh_index(bulk=True)
    try:
        get_es().flush_bulk(forced=True)
    except KeyboardInterrupt:
        # Let a manual interrupt propagate instead of logging it away.
        raise
    except Exception as e:
        log.error('ES failed bulk action (%s), package ids: [%s]'
                  % (e, ids_str))
def test_get_es_settings_cache(self):
    """**settings participate in the cache key, order-insensitively."""
    first = get_es(max_retries=5, revival_delay=10)
    eq_(len(_cached_elasticsearch), 1)

    # Same settings in a different order hit the same cache entry.
    second = get_es(revival_delay=10, max_retries=5)
    eq_(len(_cached_elasticsearch), 1)
    assert first is second

    # A different value produces a new cache entry and a new instance.
    third = get_es(max_retries=4, revival_delay=10)
    eq_(len(_cached_elasticsearch), 2)
    assert first is not third
def elastic(request):
    """Admin view for ES: recreate the index or trigger a reindex via
    POST; otherwise render cluster status and the current mappings."""
    INDEX = site_settings.ES_INDEXES['default']
    es = elasticutils.get_es()
    # Reindex task per doc-type name.
    mappings = {'addons': addons.cron.reindex_addons,
                'apps': addons.cron.reindex_apps,
                'collections': bandwagon.cron.reindex_collections,
                'compat': compatibility_report,
                'users': users.cron.reindex_users,
                }
    if request.method == 'POST':
        if request.POST.get('recreate'):
            es.delete_index_if_exists(INDEX)
            # We must set up the mappings before we create the index again.
            addons.search.setup_mapping()
            stats.search.setup_indexes()
            es.create_index_if_missing(INDEX)
            # NOTE(review): message says "Deleting" but the index has just
            # been recreated -- confirm the wording is intentional.
            messages.info(request, 'Deleting %s index.' % INDEX)
        if request.POST.get('reindex') in mappings:
            name = request.POST['reindex']
            # Reindex via the task registered for this doc type.
            if mappings.get(name):
                mappings[name]()
            messages.info(request, 'Reindexing %s.' % name)
        return redirect('zadmin.elastic')
    indexes = set(site_settings.ES_INDEXES.values())
    # Reuse the name for the live mappings fetched back from ES.
    mappings = es.get_mapping(None, indexes)
    ctx = {
        'index': INDEX,
        'nodes': es.cluster_nodes(),
        'health': es.cluster_health(),
        'state': es.cluster_state(),
        'mappings': [(index, mappings.get(index, {})) for index in indexes],
    }
    return jingo.render(request, 'zadmin/elastic.html', ctx)
def handle(self, *args, **options):
    """(Re)create the 'monolith' index and install its metrics mapping."""
    conn = get_es()
    if options.get('delete'):
        conn.delete_index('monolith')
    conn.create_index('monolith')

    # Every field is stored so documents can be read back from ES alone.
    stored_string = {'store': 'yes', 'type': 'string'}
    mapping = {
        'name': dict(stored_string),
        'date': {'store': 'yes', 'type': 'date', 'format': 'yyyy-MM-dd'},
        'key': dict(stored_string),
        'value': {'store': 'yes', 'type': 'integer'},
    }
    conn.put_mapping('metrics', {'properties': mapping}, ['monolith'])
def setup_mkt_indexes():
    """Define explicit ES mappings for payment models.

    If a field is not explicitly defined and a field is inserted, ES will
    dynamically guess the type and insert it, in a schemaless manner --
    so we pin the types here up front.
    """
    es = elasticutils.get_es()

    # Strings queried with hyphens and lowercase letters must not be
    # analyzed, or the tokenizer mangles them.
    raw_string = {'type': 'string', 'index': 'not_analyzed'}
    mapping = {
        'properties': {
            'id': {'type': 'long'},
            'date': {'format': 'dateOptionalTime', 'type': 'date'},
            'count': {'type': 'long'},
            'revenue': {'type': 'double'},
            'currency': dict(raw_string),
            'source': dict(raw_string),
            'inapp': dict(raw_string),
        }
    }

    for model in (Contribution, InappPayment):
        index = model._get_index()
        try:
            es.create_index_if_missing(index)
        except pyes.ElasticSearchException:
            # The index already exists; that's fine.
            pass
        es.put_mapping(model._meta.db_table, mapping, index)
def index_collections(ids, **kw):
    """Bulk-index the given collections (with translations attached)."""
    es = elasticutils.get_es()
    log.debug('Indexing collections %s-%s [%s].'
              % (ids[0], ids[-1], len(ids)))
    qs = Collection.uncached.filter(id__in=ids).transform(attach_translations)
    for collection in qs:
        Collection.index(search.extract(collection), bulk=True,
                         id=collection.id)
    es.flush_bulk(forced=True)
def test_get_es_defaults(self):
    """get_es() with no arguments should honor the ES_* settings."""
    es = get_es()
    eq_(es.timeout, settings.ES_TIMEOUT)
    # dump_curl defaults to False, but pyes.es.ES normalizes any falsey
    # dump_curl to None.
    eq_(es.dump_curl, None)
    expected_indexes = [settings.ES_INDEXES["default"]]
    eq_(es.default_indexes, expected_indexes)
def setup_mapping(index):
    """Install the forum Thread mapping on ``index``."""
    from forums.models import Thread

    # Analyzed, snowball-stemmed text field shared by title (and, with
    # extra storage options, content).
    analyzed_text = {TYPE: STRING, INDEX: ANALYZED, ANALYZER: SNOWBALL}
    mapping = {
        'properties': {
            'id': {TYPE: INTEGER},
            'thread_id': {TYPE: INTEGER},
            'forum_id': {TYPE: INTEGER},
            'title': dict(analyzed_text),
            'is_sticky': {TYPE: BOOLEAN},
            'is_locked': {TYPE: BOOLEAN},
            'author_id': {TYPE: INTEGER},
            'author_ord': {TYPE: STRING},
            'content': {TYPE: STRING, INDEX: ANALYZED, ANALYZER: SNOWBALL,
                        STORE: YES, TERM_VECTOR: WITH_POS_OFFSETS},
            'created': {TYPE: DATE},
            'updated': {TYPE: DATE},
            'replies': {TYPE: INTEGER},
        }
    }
    try:
        elasticutils.get_es().put_mapping(Thread._meta.db_table, mapping,
                                          index)
    except pyes.exceptions.ElasticSearchException as e:
        # Log and carry on; a bad mapping shouldn't kill the caller.
        log.error(e)
def es_reindex_with_progress(doctypes=None, percent=100):
    """Rebuild Elastic indexes as you iterate over yielded progress ratios.

    :arg doctypes: Defaults to None which will index all doctypes.
        Otherwise indexes the doctypes specified. See
        :py:func:`.get_doctype_stats()` for what doctypes look like.

    :arg percent: Defaults to 100. Allows you to specify how much of
        each doctype you want to index. This is useful for development
        where doing a full reindex takes an hour.
    """
    from search.models import get_search_models

    es = elasticutils.get_es()

    search_models = get_search_models()
    if doctypes:
        # Restrict to just the requested doctypes.
        search_models = [cls for cls in search_models
                         if cls._meta.db_table in doctypes]

    if len(search_models) == len(get_search_models()):
        index = settings.ES_INDEXES.get('default')
        if index is not None:
            # If we're indexing everything and there's a default index
            # specified in settings, then we delete and recreate it.
            es.delete_index_if_exists(index)
            es.create_index(index)

    total = sum([cls.objects.count() for cls in search_models])
    to_index = [cls.index_all(percent) for cls in search_models]
    # ``done`` counts documents indexed so far (1-based); dividing by the
    # grand total yields a monotonically increasing progress ratio.
    return (float(done) / total for done, _ in
            izip(count(1), chain(*to_index)))
def unindex(cls, id):
    """Delete the document for ``id``; a missing document is not an error."""
    try:
        elasticutils.get_es().delete(cls._get_index(),
                                     cls._meta.db_table, id)
    except pyes.exceptions.NotFoundException:
        # Item wasn't there to begin with -- nothing to do.
        pass
def index_finance_total_by_currency(addons, **kw):
    """
    Bug 757581
    Total finance stats, currency breakdown.
    """
    es = elasticutils.get_es()
    log.info('Indexing total financial stats by currency for %s apps.' %
             len(addons))
    for addon in addons:
        # Get all contributions for given add-on.
        qs = Contribution.objects.filter(addon=addon, uuid=None)
        if not qs.exists():
            continue
        # Get list of distinct currencies.
        currencies = set(qs.values_list('currency', flat=True))
        for currency in currencies:
            try:
                # Deterministic doc id from the addon + currency pair so
                # re-runs overwrite rather than duplicate.
                key = ord_word('cur' + str(addon) + currency.lower())
                data = search.get_finance_total_by_currency(
                    qs, addon, currency)
                if not already_indexed(Contribution, data):
                    Contribution.index(data, bulk=True, id=key)
                es.flush_bulk(forced=True)
            except Exception, exc:
                # Requeue the whole batch via celery, then re-raise so this
                # attempt is recorded as failed.
                index_finance_total_by_currency.retry(args=[addons], exc=exc)
                raise
def on_current_word_signal(self, word_signal):
    """React to the current word changing: ask ES for a completion
    suggestion and emit it (empty string when there is nothing to suggest).
    """
    if not word_signal.strip():
        # Nothing typed: clear any previous completion.
        self._possible_completion = ''
        self.possible_completion_signal.emit(self._possible_completion)
        return

    suggestion_name = 'completion_suggestion'
    request_body = {
        suggestion_name: {
            'text': word_signal,
            'completion': {'field': 'text'},
        },
    }
    compl_resp = get_es().suggest(index=WordMappingType.get_index(),
                                  body=request_body)
    suggestions = compl_resp[suggestion_name][0]['options']

    if not suggestions:
        self._possible_completion = ''
        self.possible_completion_signal.emit(self._possible_completion)
        return

    # Take the first suggestion that isn't just the word itself.
    for suggestion in suggestions:
        if suggestion['text'] != word_signal:
            self._possible_completion = suggestion['text']
            break
    self.possible_completion_signal.emit(self._possible_completion)
def index_installed_daily(ids, **kw):
    """
    Takes a list of Installed ids and uses its addon and date fields to
    index stats for that day.

    ids -- ids of mkt.webapps.Installed objects
    """
    from mkt.webapps.models import Installed
    es = elasticutils.get_es()
    # Get Installed's (addon, created) pairs, newest first.
    qs = (Installed.objects.filter(id__in=set(ids)).
          order_by('-created').values('addon', 'created'))
    log.info('[%s] Indexing %s installed counts for daily stats.' %
             (qs[0]['created'], len(qs)))
    # Nested dict used purely as a seen-set so each addon/date pair is
    # indexed at most once per task run.
    addons_dates = defaultdict(lambda: defaultdict(dict))
    for installed in qs:
        addon = installed['addon']
        date = installed['created'].strftime('%Y%m%d')
        try:
            if not date in addons_dates[addon]:
                # Deterministic doc id from the addon + date pair.
                key = ord_word('ins' + str(addon) + str(date))
                data = search.get_installed_daily(installed)
                if not already_indexed(Installed, data):
                    Installed.index(data, bulk=True, id=key)
                addons_dates[addon][date] = 0
            es.flush_bulk(forced=True)
        except Exception, exc:
            # Requeue the whole batch, then re-raise so this attempt is
            # recorded as failed.
            index_installed_daily.retry(args=[ids], exc=exc)
            raise
def es_reindex_with_progress(percent=100):
    """Rebuild Elastic indexes as you iterate over yielded progress ratios.

    :arg percent: Defaults to 100. Allows you to specify how much of each
        doctype you want to index. This is useful for development where
        doing a full reindex takes an hour.
    """
    from search.models import get_search_models

    search_models = get_search_models()

    es = elasticutils.get_es()
    index = settings.ES_INDEXES['default']
    es.delete_index_if_exists(index)
    # There should be no mapping-conflict race here since the index doesn't
    # exist. Live indexing should just fail.

    # Simultaneously create the index and the mappings, so live indexing
    # doesn't get a chance to index anything between the two and infer a
    # bogus mapping (which ES then freaks out over when we try to lay in an
    # incompatible explicit mapping).
    mappings = dict((cls._meta.db_table, {'properties': cls.get_mapping()})
                    for cls in search_models)
    es.create_index(index, settings={'mappings': mappings})

    total = sum([cls.objects.count() for cls in search_models])
    to_index = [cls.index_all(percent) for cls in search_models]
    # ``done`` counts documents indexed so far (1-based); dividing by the
    # grand total yields a monotonically increasing progress ratio.
    return (float(done) / total for done, _ in
            izip(count(1), chain(*to_index)))
def index_contribution_counts(ids, **kw):
    """
    Contribution stats by addon-date unique pair. Uses a nested dictionary
    to not index duplicate contribution with same addon/date pairs.

    For each addon-date, it stores the addon in the dict as a top level key
    with a dict as its value. And it stores the date in the addon's dict as
    a second level key. To check if an addon-date pair has been already
    indexed, it looks up dict[addon][date] to see if the key exists.
    """
    es = elasticutils.get_es()
    qs = (Contribution.objects.filter(id__in=ids)
          .order_by('created').values('addon', 'created'))

    try:
        # Nested dict used purely as a seen-set for addon/date pairs.
        addons_dates = defaultdict(lambda: defaultdict(dict))
        for contribution in qs:
            addon = contribution['addon']
            date = contribution['created'].strftime('%Y%m%d')

            # date for addon not processed, index it and give it key
            if not date in addons_dates[addon]:
                key = '%s-%s' % (addon, date)
                data = search.extract_contribution_counts(contribution)
                Contribution.index(data, bulk=True, id=key)
                addons_dates[addon][date] = 0

        if qs:
            log.info('Indexed %s addons/apps for contribution stats: %s' %
                     (len(addons_dates), qs[0]['created']))
        es.flush_bulk(forced=True)
    except Exception, exc:
        # Requeue the whole batch, then re-raise so this attempt is
        # recorded as failed.
        index_contribution_counts.retry(args=[ids], exc=exc)
        raise
def setup_class(cls): """Class setup for tests. Checks to see if ES is running and if not, sets ``skip_test`` to True on the class. """ # Note: TestCase has no setup_class try: get_es().health() except pyelasticsearch.exceptions.ConnectionError: cls.skip_tests = True if cls.data: cls.create_index(settings={'mappings': cls.mapping}) cls.index_data(cls.data) cls.refresh()
def setup_indexes():
    """Create the stats indexes (if missing) and install the counts mapping."""
    es = elasticutils.get_es()
    for model in (CollectionCount, DownloadCount, UpdateCount):
        index = model._get_index()
        try:
            es.create_index_if_missing(index)
        except pyes.ElasticSearchException:
            # The index is already there; carry on.
            pass

    mapping = {
        'properties': {
            'id': {'type': 'long'},
            'count': {'type': 'long'},
            # 'data' holds dynamic key/value pairs: k is the label,
            # v the numeric value.
            'data': {'dynamic': 'true',
                     'properties': {'v': {'type': 'long'},
                                    'k': {'type': 'string'}}},
            'date': {'format': 'dateOptionalTime', 'type': 'date'},
        }
    }
    es.put_mapping(CollectionCount._meta.db_table, mapping,
                   CollectionCount._get_index())
def elastic(request):
    """Admin status page for ES: reset or reindex individual doc types via
    POST; otherwise render cluster and mapping info."""
    INDEX = site_settings.ES_INDEXES["default"]
    es = elasticutils.get_es()
    # Per-type (setup_mapping, reindex_task) pairs.
    mappings = {
        "addons": (addons.search.setup_mapping, addons.cron.reindex_addons),
        "collections": (addons.search.setup_mapping,
                        bandwagon.cron.reindex_collections),
        "compat": (addons.search.setup_mapping, None),
        "users": (addons.search.setup_mapping, users.cron.reindex_users),
    }
    if request.method == "POST":
        if request.POST.get("reset") in mappings:
            name = request.POST["reset"]
            # Drop the type's mapping, then reinstall it from scratch.
            es.delete_mapping(INDEX, name)
            if mappings[name][0]:
                mappings[name][0]()
            messages.info(request, "Resetting %s." % name)
        if request.POST.get("reindex") in mappings:
            name = request.POST["reindex"]
            # NOTE(review): 'compat' has a None task here, so requesting a
            # reindex of it would raise TypeError -- confirm intended.
            mappings[name][1]()
            messages.info(request, "Reindexing %s." % name)
        return redirect("zadmin.elastic")
    indexes = set(site_settings.ES_INDEXES.values())
    # Reuse the name for the live mappings fetched back from ES.
    mappings = es.get_mapping(None, indexes)
    ctx = {
        "nodes": es.cluster_nodes(),
        "health": es.cluster_health(),
        "state": es.cluster_state(),
        "mappings": [(index, mappings.get(index, {})) for index in indexes],
    }
    return jingo.render(request, "zadmin/elastic.html", ctx)
def elastic(request):
    """Admin status page for ES: reset or reindex individual doc types via
    POST; otherwise render cluster and mapping info."""
    INDEX = site_settings.ES_INDEXES['default']
    es = elasticutils.get_es()
    # Per-type (setup_mapping, reindex_task) pairs.
    mappings = {'addons': (addons.search.setup_mapping,
                           addons.cron.reindex_addons),
                'collections': (addons.search.setup_mapping,
                                bandwagon.cron.reindex_collections),
                'compat': (addons.search.setup_mapping, None),
                'users': (addons.search.setup_mapping,
                          users.cron.reindex_users),
                }
    if request.method == 'POST':
        if request.POST.get('reset') in mappings:
            name = request.POST['reset']
            # Drop the type's mapping, then reinstall it from scratch.
            es.delete_mapping(INDEX, name)
            if mappings[name][0]:
                mappings[name][0]()
            messages.info(request, 'Resetting %s.' % name)
        if request.POST.get('reindex') in mappings:
            name = request.POST['reindex']
            # NOTE(review): 'compat' has a None task here, so requesting a
            # reindex of it would raise TypeError -- confirm intended.
            mappings[name][1]()
            messages.info(request, 'Reindexing %s.' % name)
        return redirect('zadmin.elastic')
    indexes = set(site_settings.ES_INDEXES.values())
    # Reuse the name for the live mappings fetched back from ES.
    mappings = es.get_mapping(None, indexes)
    ctx = {
        'nodes': es.cluster_nodes(),
        'health': es.cluster_health(),
        'state': es.cluster_state(),
        'mappings': [(index, mappings.get(index, {})) for index in indexes],
    }
    return jingo.render(request, 'zadmin/elastic.html', ctx)
def unindex(cls, id):
    """Removes a particular item from the search index."""
    es = elasticutils.get_es()
    es.delete(cls._get_index(), cls._meta.db_table, id)
def open_spider(self, spider):
    """Scrapy hook: (re)create the 'doc-index' index with an English
    analyzer and the doc-section mapping, then load the tag list."""
    self.es = elasticutils.get_es()
    if self.es.indices.exists(index='doc-index'):
        self.es.indices.delete(index='doc-index')
    self.es.indices.create(index='doc-index', body={
        'settings': {
            'analysis': {
                'filter': {
                    # Drop English stopwords and stem lightly so queries
                    # match across word forms.
                    'en_stop_filter': {
                        'type': 'stop',
                        'stopwords': ['_english_']
                    },
                    'en_stem_filter': {
                        'type': 'stemmer',
                        'name': 'minimal_english'
                    }
                },
                'analyzer': {
                    'en_analyzer': {
                        'type': 'custom',
                        'tokenizer': 'lowercase',
                        'filter': [
                            'asciifolding',
                            'word_delimiter',
                            'en_stop_filter',
                            'en_stem_filter'
                        ]
                    }
                }
            },
        },
        'mappings': {
            'doc-section-type': {
                'analyzer': 'en_analyzer',
                'url': {'type': 'string'},
                'category': {'type': 'string'},
                # Tags are weighted above titles/content in scoring.
                'tags': {'type': 'string', 'boost': 1.8},
                'title': {'type': 'string', 'boost': 1},
                'content': {'type': 'string'},
                '_boost': {'name': 'boost', 'null_value': 1.0}
            }
        }
    })
    # BUG FIX: the file handle was never closed; use a context manager.
    with open('tags.json', 'r') as f:
        self.tags = json.loads(f.read())
def get_es(cls):
    """Return an ES client built from ``ESTestCase.es_settings``.

    NOTE(review): this reads ``es_settings`` off ESTestCase rather than
    ``cls``, so subclass overrides are ignored -- confirm that's intended.
    """
    kwargs = ESTestCase.es_settings
    return get_es(**kwargs)
def teardown_class(cls):
    """Drop the 'test' index once the class's tests have run."""
    get_es().delete_index('test')
def create_impl(self, config, config_no_sensitive):
    """Build an ES client from the config (one host per line, a timeout,
    and the GET-body transport mode)."""
    hosts = config.hosts.splitlines()
    timeout = float(config.timeout)
    return get_es(hosts, timeout, send_get_body_as=config.body_as)
def get_es(cls):
    """Returns the Elasticsearch object specified by ``cls.es_settings``."""
    kwargs = cls.es_settings
    return get_es(**kwargs)
def test_get_es_mocked(self):
    """With mocking installed, get_es() should hand back a Mock."""
    es = elasticutils.get_es()
    # IDIOM: isinstance(x, T) is the direct form of
    # issubclass(x.__class__, T) and also handles virtual subclasses.
    assert isinstance(es, mock.Mock)
def setup_mapping(): """Set up the addons index mapping.""" # Mapping describes how elasticsearch handles a document during indexing. # Most fields are detected and mapped automatically. appver = { 'dynamic': False, 'properties': { 'max': { 'type': 'long' }, 'min': { 'type': 'long' } } } mapping = { # Optional boosting during indexing. '_boost': { 'name': '_boost', 'null_value': 1.0 }, 'properties': { # Turn off analysis on name so we can sort by it. 'name_sort': { 'type': 'string', 'index': 'not_analyzed' }, # Adding word-delimiter to split on camelcase and punctuation. 'name': { 'type': 'string', 'analyzer': 'standardPlusWordDelimiter' }, 'summary': { 'type': 'string', 'analyzer': 'snowball' }, 'description': { 'type': 'string', 'analyzer': 'snowball' }, 'tags': { 'type': 'string', 'index': 'not_analyzed', 'index_name': 'tag' }, 'platforms': { 'type': 'integer', 'index_name': 'platform' }, 'appversion': { 'properties': dict((app.id, appver) for app in amo.APP_USAGE) }, }, } # Add room for language-specific indexes. for analyzer in amo.SEARCH_ANALYZER_MAP: mapping['properties']['name_' + analyzer] = { 'type': 'string', 'analyzer': analyzer, } mapping['properties']['summary_' + analyzer] = { 'type': 'string', 'analyzer': analyzer, } mapping['properties']['description_' + analyzer] = { 'type': 'string', 'analyzer': analyzer, } es = elasticutils.get_es() # Adjust the mapping for all models at once because fields are shared # across all doc types in an index. If we forget to adjust one of them # we'll get burned later on. for model in Addon, AppCompat, Collection, UserProfile: index = model._get_index() try: es.create_index_if_missing(index) except pyes.ElasticSearchException: pass try: es.put_mapping(model._meta.db_table, mapping, index) except pyes.ElasticSearchException, e: log.error(e)
def get_es(cls):
    """Return an ES client whose default index is ``cls.index_name``."""
    indexes = [cls.index_name]
    return get_es(default_indexes=indexes)
def handle(self, *args, **options):
    """Bulk-index every PolymorphicIndexable model, optionally restricted
    to the app labels given in ``args``.

    options -- "chunk" (documents per bulk request) and "index_suffix"
    (appended to each index name as "_<suffix>").
    """
    self.es = get_es(urls=settings.ES_URLS)
    chunk_size = options.get("chunk")
    index_suffix = options.get("index_suffix")
    if index_suffix:
        index_suffix = "_" + index_suffix

    # Collect every indexable model, either from the named apps or all.
    all_models_to_index = set()
    if len(args):
        apps = [models.get_app(app_name) for app_name in args]
    else:
        apps = models.get_apps()
    for app in apps:
        for model in models.get_models(app):
            if issubclass(model, PolymorphicIndexable):
                all_models_to_index.add(model)

    # Remove redundant subclasses: the instance_of query on a base class
    # already selects subclass rows, so indexing both would duplicate work.
    models_to_index = set()
    for model_i in all_models_to_index:
        if not any(model_i is not model_j and issubclass(model_i, model_j)
                   for model_j in all_models_to_index):
            models_to_index.add(model_i)

    self.stdout.write(u"Indexing models: %s"
                      % ', '.join([m.__name__ for m in models_to_index]))

    num_processed = 0
    payload = []  # alternating action-meta / document entries for _bulk
    for model in models_to_index:
        for instance in (model.objects.instance_of(model)
                         .order_by("id").iterator()):
            payload.append({
                "index": {
                    "_index": instance.get_index_name() + index_suffix,
                    "_type": instance.get_mapping_type_name(),
                    "_id": instance.pk
                }
            })
            payload.append(instance.extract_document())
            # Two payload entries per document, hence the // 2.
            if len(payload) // 2 == chunk_size:
                response = self.es.bulk(body=payload)
                good_items = [item for item in response["items"]
                              if item["index"]["status"] <= 299]
                if len(good_items) != len(payload) // 2:
                    self.stdout.write(
                        "Bulk indexing error! Item count mismatch.")
                    # BUG FIX: use the same 2xx success threshold as
                    # good_items above; the old "> 201" wrongly listed
                    # 202-299 responses as rejected.
                    bad_items = [item for item in response["items"]
                                 if item["index"]["status"] > 299]
                    self.stdout.write(
                        "These were rejected: %s" % str(bad_items))
                    return "Bulk indexing failed."
                num_processed += len(payload) // 2
                self.stdout.write("Indexed %d items" % num_processed)
                payload = []

    # Flush whatever is left over from the last partial chunk.
    if payload:
        response = self.es.bulk(body=payload)
        num_processed += len(payload) // 2
        self.stdout.write("Indexed %d items" % num_processed)
""" This is a sample program that uses PyES ES to create an index, create a mapping, and index some data. Then it uses ElasticUtils S to show some behavior with facets. """ from elasticutils import get_es, S HOST = 'localhost:9200' INDEX = 'fooindex' DOCTYPE = 'testdoc' es = get_es(hosts=HOST, default_indexes=[INDEX]) # This uses pyes ES.delete_index_if_exists. es.delete_index_if_exists(INDEX) # Define the mapping for the doctype 'testdoc'. It's got an id field, # a title which is analyzed, and two fields that are lists of tags, so # we don't want to analyze them. # # Note: The alternative for the tags is to analyze them and use the # 'keyword' analyzer. Both not analyzing and using the keyword # analyzer treats the values as a single term rather than tokenizing # them and treating as multiple terms. mapping = { DOCTYPE: { 'properties': { 'id': { 'type': 'integer' },
def index_users(ids, **kw):
    """Bulk-index the given user profiles, flushing once at the end."""
    es = elasticutils.get_es()
    task_log.debug('Indexing users %s-%s [%s].'
                   % (ids[0], ids[-1], len(ids)))
    for profile in UserProfile.objects.filter(id__in=ids):
        UserProfile.index(search.extract(profile), bulk=True, id=profile.id)
    es.flush_bulk(forced=True)
def unindex(cls, id):
    """Delete this model's document ``id`` from the configured index."""
    es = elasticutils.get_es()
    es.delete(settings.ES_INDEX, cls._meta.app_label, id)
def test_bulk_index(self):
    """Exercise the bulk_index command end to end: initial indexing,
    incremental additions, repairing corrupted docs, and stale deletes."""
    ParentIndexable(foo="Fighters").save(index=False)
    ChildIndexable(foo="Fighters", bar=69).save(index=False)
    GrandchildIndexable(
        foo="Fighters", bar=69,
        baz=datetime.datetime.now() - datetime.timedelta(hours=1)
    ).save(index=False)
    SeparateIndexable(junk="Testing").save(index=False)

    # Let's make sure that nothing is indexed yet.
    self.assertEqual(ParentIndexable.search_objects.s().count(), 0)
    self.assertEqual(SeparateIndexable.search_objects.s().count(), 0)

    # Now that everything has been made, let's try a bulk_index.
    call_command("bulk_index")
    ParentIndexable.search_objects.refresh()
    SeparateIndexable.search_objects.refresh()

    # Let's make sure that everything has the right counts.
    self.assertEqual(ParentIndexable.search_objects.s().count(), 3)
    self.assertEqual(SeparateIndexable.search_objects.s().count(), 1)

    # Let's add another one, make sure the counts are right.
    ParentIndexable(foo="Mr. T").save(index=False)
    self.assertEqual(ParentIndexable.search_objects.s().count(), 3)
    call_command("bulk_index")
    ParentIndexable.search_objects.refresh()
    self.assertEqual(ParentIndexable.search_objects.s().count(), 4)

    # Corrupt a document directly in ES so bulk_index has to repair it.
    obj = ParentIndexable.objects.all()[0]
    es = get_es(urls=settings.ES_URLS)
    doc = obj.extract_document()
    doc["foo"] = "DATA LOVERS"
    es.update(index=obj.get_index_name(),
              doc_type=obj.get_mapping_type_name(),
              id=obj.id,
              body=dict(doc=doc, doc_as_upsert=True),
              refresh=True)

    # Make sure the bad data is actually visible...
    self.assertEqual(
        ParentIndexable.search_objects.query(
            foo__match="DATA LOVERS").count(), 1)
    # ...and that bulk_index overwrites it with the real document.
    call_command("bulk_index")
    ParentIndexable.search_objects.refresh()
    self.assertEqual(
        ParentIndexable.search_objects.query(
            foo__match="DATA LOVERS").count(), 0)

    # Let's delete an item from the db.
    obj = ParentIndexable.objects.all()[0]
    obj.delete()

    # Make sure the count is the same.
    self.assertEqual(ParentIndexable.search_objects.s().count(), 4)

    # This shouldn't remove the stale item...
    call_command("bulk_index")
    ParentIndexable.search_objects.refresh()
    self.assertEqual(ParentIndexable.search_objects.s().count(), 4)

    # ...but a fresh index + alias swap + reindex should.
    call_command("synces", self.index_suffix, drop_existing_indexes=True)
    call_command("es_swap_aliases", self.index_suffix)
    call_command("bulk_index")
    ParentIndexable.search_objects.refresh()
    self.assertEqual(ParentIndexable.search_objects.s().count(), 3)
libraries_results.append(('Spidermonkey is ready!', True, None)) # TODO: see if it works? else: status_summary['libraries'] = False msg = "You said it was at (%s)" % settings.SPIDERMONKEY libraries_results.append(('Spidermonkey not found!', False, msg)) else: status_summary['libraries'] = False msg = "Please set SPIDERMONKEY in your settings file." libraries_results.append(("Spidermonkey isn't set up.", False, msg)) elastic_results = None if settings.USE_ELASTIC: status_summary['elastic'] = False try: health = elasticutils.get_es().cluster_health() status_summary['elastic'] = health['status'] != 'red' elastic_results = health except Exception: elastic_results = traceback.format_exc() # Check file paths / permissions rw = (settings.TMP_PATH, settings.NETAPP_STORAGE, settings.UPLOADS_PATH, settings.ADDONS_PATH, settings.MIRROR_STAGE_PATH, settings.GUARDED_ADDONS_PATH, settings.ADDON_ICONS_PATH, settings.COLLECTIONS_ICON_PATH, settings.PREVIEWS_PATH, settings.USERPICS_PATH, settings.SPHINX_CATALOG_PATH, settings.SPHINX_LOG_PATH, dump_apps.Command.JSON_PATH) r = [os.path.join(settings.ROOT, 'locale')] filepaths = [(path, os.R_OK | os.W_OK, "We want read + write") for path in rw]
def get_elasticsearch_handle(timeout=ELASTIC_SEARCH_TIMEOUT):
    """Return an ES client for ELASTIC_SEARCH_URL with the given timeout."""
    es = elasticutils.get_es(urls=ELASTIC_SEARCH_URL, timeout=timeout)
    return es
def _delete_unique_id_from_elastic_search(dbm, entity_type, document_id):
    """Delete the unique-id document for ``entity_type``/``document_id``
    from the database manager's ES index."""
    es = elasticutils.get_es(urls=ELASTIC_SEARCH_URL,
                             timeout=ELASTIC_SEARCH_TIMEOUT)
    es.delete(dbm.database_name, entity_type, document_id)
import yaml from elasticutils import get_es, S # def create_mapping(): # a = open('schema/deals.yaml') # b = yaml.load(a) # a.close() # return b #mapping = create_mapping() #fileformat = [{'company_name': 'homedepot', 'active': [{'20% off': 'ZYZZ', '50% off': 'REDDIT'}], 'inactive': [{'-10% off': 'DIVIDEBYZERO'}]}, # {'company_name': 'lowes', 'active': [{'15% off': 'XCX', '100% off': 'HACKERNEWS'}], 'inactive': [{'Buy one get one': 'BOGO'}]},] mapping = {'companies': {'properties': {'company_name': {'type': 'string'}, 'active': {'type': 'string'}, 'inactive': {'type': 'string'},}}} es = get_es(hosts='localhost:9200', default_indexes=['dealsindex']) def get_data_from_yaml(): data = {} dataList = [] a = glob.iglob("data/*.yaml") for file in a: b = open(file) c = yaml.load(b) dataList.append(c) b.close() # Elasticsearch wants a list of dictionaries, hence the conversion return dataList def create_and_insert(): es.delete_index_if_exists('dealsindex')
This is a sample program that uses Elasticsearch (from elasticsearch-py) object to create an index, create a mapping, and index some data. Then it uses ElasticUtils S to show some behavior. """ from elasticutils import get_es, S from elasticsearch.helpers import bulk_index URL = 'localhost' INDEX = 'fooindex' DOCTYPE = 'testdoc' # This creates an elasticsearch.Elasticsearch object which we can use # to do all our indexing. es = get_es(urls=[URL]) # First, delete the index if it exists. es.indices.delete(index=INDEX, ignore=404) # Define the mapping for the doctype 'testdoc'. It's got an id field, # a title which is analyzed, and two fields that are lists of tags, so # we don't want to analyze them. mapping = { DOCTYPE: { 'properties': { 'id': { 'type': 'integer' }, 'title': { 'type': 'string',
def _refresh_elastic_search_index(dbm):
    """Refresh the database manager's ES index so new docs are queryable."""
    es = elasticutils.get_es(urls=ELASTIC_SEARCH_URL,
                             timeout=ELASTIC_SEARCH_TIMEOUT)
    es.refresh(index=dbm.database_name)
def index_collections(ids, **kw):
    """Bulk-index the given collections, flushing once at the end."""
    es = elasticutils.get_es()
    log.debug('Indexing collections %s-%s [%s].'
              % (ids[0], ids[-1], len(ids)))
    for collection in Collection.objects.filter(id__in=ids):
        Collection.index(search.extract(collection), bulk=True,
                         id=collection.id)
    es.flush_bulk(forced=True)
def teardown_indexes(self):
    """Delete every configured test index and turn live indexing back off."""
    es = get_es()
    for index in settings.ES_INDEXES.values():
        # Missing indexes are silently skipped.
        es.delete_index_if_exists(index)
    settings.ES_LIVE_INDEXING = False
def index(cls, document, id=None, bulk=False, force_insert=False):
    """Index ``document`` under this model's doc type (wraps pyes.ES.index)."""
    es = elasticutils.get_es()
    es.index(document,
             index=cls._get_index(),
             doc_type=cls._meta.db_table,
             id=id,
             bulk=bulk,
             force_insert=force_insert)
def get_es(cls):
    """Build an ES client from this class's ``es_settings``."""
    settings_kwargs = cls.es_settings
    return get_es(**settings_kwargs)
def setUp(self):
    """Create versioned test indexes and alias them in before each test."""
    self.index_suffix = "vtest"
    self.es = get_es(urls=settings.ES_URLS)
    # Build fresh suffixed indexes, then point the aliases at them.
    call_command("synces", self.index_suffix, drop_existing_indexes=True)
    call_command("es_swap_aliases", self.index_suffix)
def compatibility_report():
    """Build per-addon compatibility documents for every app and index
    them as AppCompat docs, stashing per-app usage totals in redis."""
    redis = redisutils.connections['master']
    docs = defaultdict(dict)

    # Gather all the data for the index.
    for app in amo.APP_USAGE:
        versions = [c for c in settings.COMPAT if c['app'] == app.id]
        log.info(u'Making compat report for %s.' % app.pretty)
        latest = UpdateCount.objects.aggregate(d=Max('date'))['d']
        # Usage counts for valid, non-disabled add-ons on the latest day.
        qs = UpdateCount.objects.filter(addon__appsupport__app=app.id,
                                        addon__disabled_by_user=False,
                                        addon__status__in=amo.VALID_STATUSES,
                                        addon___current_version__isnull=False,
                                        date=latest)
        updates = dict(qs.values_list('addon', 'count'))
        for chunk in amo.utils.chunked(updates.items(), 50):
            chunk = dict(chunk)
            for addon in Addon.objects.filter(id__in=chunk):
                doc = docs[addon.id]
                doc.update(id=addon.id, slug=addon.slug, guid=addon.guid,
                           self_hosted=addon.is_selfhosted(),
                           binary=addon.binary_components,
                           name=unicode(addon.name), created=addon.created,
                           current_version=addon.current_version.version,
                           current_version_id=addon.current_version.pk)
                doc['count'] = chunk[addon.id]
                doc.setdefault('top_95',
                               defaultdict(lambda: defaultdict(dict)))
                doc.setdefault('top_95_all', {})
                doc.setdefault('usage', {})[app.id] = updates[addon.id]
                doc.setdefault('works', {}).setdefault(app.id, {})
                # Populate with default counts for all app versions.
                for ver in versions:
                    doc['works'][app.id][vint(ver['main'])] = {
                        'success': 0,
                        'failure': 0,
                        'total': 0,
                        'failure_ratio': 0.0,
                    }
                # Group reports by `major`.`minor` app version.
                reports = (CompatReport.objects.filter(
                    guid=addon.guid, app_guid=app.guid).values_list(
                        'app_version', 'works_properly').annotate(
                            Count('id')))
                for ver, works_properly, cnt in reports:
                    ver = vint(floor_version(ver))
                    # Map the report's version into the configured
                    # (previous, main] bucket.
                    major = [v['main'] for v in versions
                             if vint(v['previous']) < ver <= vint(v['main'])]
                    if major:
                        w = doc['works'][app.id][vint(major[0])]
                        # Tally number of success and failure reports.
                        w['success' if works_properly else 'failure'] += cnt
                        w['total'] += cnt
                        # Calculate % of incompatibility reports.
                        w['failure_ratio'] = w['failure'] / float(w['total'])
                if app not in addon.compatible_apps:
                    continue
                compat = addon.compatible_apps[app]
                d = {'min': compat.min.version_int,
                     'max': compat.max.version_int}
                doc.setdefault('support', {})[app.id] = d
                doc.setdefault('max_version', {})[app.id] = compat.max.version
        total = sum(updates.values())
        # Remember the total so we can show % of usage later.
        redis.hset('compat:%s' % app.id, 'total', total)

        # Figure out which add-ons are in the top 95% for this app.
        running_total = 0
        for addon, count in sorted(updates.items(), key=lambda x: x[1],
                                   reverse=True):
            running_total += count
            docs[addon]['top_95_all'][app.id] = running_total < (.95 * total)

    # Mark the top 95% of add-ons compatible with the previous version for
    # each app + version combo.
    for compat in settings.COMPAT:
        app, ver = compat['app'], vint(compat['previous'])
        # Find all the docs that have a max_version compatible with ver.
        supported = [doc for doc in docs.values()
                     if app in doc.get('support', {})
                     and doc['support'][app]['max'] >= ver]
        # Sort by count so we can get the top 95% most-used add-ons.
        supported = sorted(supported, key=lambda d: d['count'], reverse=True)
        total = sum(doc['count'] for doc in supported)
        # Figure out which add-ons are in the top 95% for this app + version.
        running_total = 0
        for doc in supported:
            running_total += doc['count']
            doc['top_95'][app][ver] = running_total < (.95 * total)

    # Send it all to the index.
    for chunk in amo.utils.chunked(docs.values(), 150):
        for doc in chunk:
            AppCompat.index(doc, id=doc['id'], bulk=True)
    elasticutils.get_es().flush_bulk(forced=True)
def es(self):
    """An Elasticsearch client built from the ES URLs in Django settings."""
    urls = settings.ES_URLS
    return get_es(urls=urls)
def columns():
    """Return the field names of the 'addons' mapping in the default index."""
    es = elasticutils.get_es()
    index = settings.ES_INDEXES['default']
    mapping = es.get_mapping('addons', index)
    return mapping['addons']['properties'].keys()
def get_es(cls):
    """Return an ES client pointed at the configured ES_URLS."""
    urls = settings.ES_URLS
    return get_es(urls=urls)
def unindex(cls, id):
    """Remove document ``id`` for this model's doc type from its index."""
    es = elasticutils.get_es()
    es.delete(cls._get_index(), cls._meta.db_table, id)