def mlt(self):
    """Returns a search with a morelikethis query for docs like this"""
    # Short responses tend to not repeat any words, so MLT returns
    # nothing. This fixes that by setting min_term_freq to 1. Longer
    # responses tend to repeat important words, so we can set
    # min_term_freq to 2.
    num_words = len(self.description.split(' '))
    if num_words > 40:
        min_term_freq = 2
    else:
        min_term_freq = 1

    s = self.search()

    if self.product:
        s = s.filter('term', product=self.product)
    if self.platform:
        s = s.filter('term', platform=self.platform)

    s = s.query(
        'more_like_this',
        fields=['description'],
        docs=[
            {
                '_index': get_index_name(),
                '_type': self._doc_type.name,
                '_id': self.id
            }
        ],
        min_term_freq=min_term_freq,
        stop_words=list(ANALYSIS_STOPWORDS)
    )

    return s

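# Hedged usage sketch (not part of the original module): this helper shows how
# the mlt() search above might be executed with elasticsearch-dsl. The name
# ``show_similar_responses`` and the result limit are illustrative assumptions.
def show_similar_responses(resp, limit=5):
    """Return (id, description) pairs for responses similar to ``resp``."""
    # Slicing an elasticsearch-dsl Search sets the result window; execute()
    # runs the query and returns the hits.
    similar = resp.mlt()[:limit].execute()
    return [(hit.meta.id, hit.description) for hit in similar]
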
def teardown_indexes(self):
    try:
        get_es().indices.delete(get_index_name())
    except NotFoundError:
        # If we get this error, it means the index didn't exist
        # so there's nothing to delete.
        pass

def test_index_chunk_task(self):
    responses = ResponseFactory.create_batch(10)

    # With live indexing, that'll create items in the index. Since
    # we want to test index_chunk_task, we need a clean index to
    # start with so we delete and recreate it.
    self.setup_indexes(empty=True)

    # Verify there's nothing in the index.
    assert ResponseDocType.docs.search().count() == 0

    # Create the record and the chunk and then run it through
    # celery.
    batch_id = 'ou812'
    rec = RecordFactory(batch_id=batch_id)

    chunk = (
        to_class_path(ResponseDocType),
        [item.id for item in responses]
    )
    index_chunk_task.delay(get_index_name(), batch_id, rec.id, chunk)

    self.refresh()

    # Verify everything is in the index now.
    assert ResponseDocType.docs.search().count() == 10

    # Verify the record was marked succeeded.
    rec = Record.objects.get(pk=rec.id)
    assert rec.status == Record.STATUS_SUCCESS

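# Hedged sketch (not part of the original source): to_class_path() used in the
# test above presumably serializes a class into an importable string so the
# celery task can rebuild it on the worker. The exact separator and format in
# fjord may differ; this is a minimal illustration only.
def to_class_path(cls):
    """Return a string identifying ``cls`` by module and class name."""
    return '%s:%s' % (cls.__module__, cls.__name__)
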
def search_admin_view(request):
    """Render the admin view containing search tools"""
    error_messages = []
    stats = None
    es_deets = None
    indexes = []

    try:
        if 'reset' in request.POST:
            return handle_reset(request)
        if 'reindex' in request.POST:
            return handle_reindex(request)
        if 'recreate_reindex' in request.POST:
            return handle_recreate_reindex(request)
    except Exception as exc:
        error_messages.append(u'Error: %s' % exc.message)

    try:
        # This gets index stats, but also tells us whether ES is in
        # a bad state.
        try:
            stats = get_index_stats()
        except NotFoundError:
            stats = None

        indexes = get_indexes()
        indexes.sort(key=lambda m: m[0])

        # TODO: Input has a single ES_URL and that's the ZLB and does
        # the balancing. If that ever changes and we have multiple
        # ES_URLs, then this should get fixed.
        es_deets = requests.get(settings.ES_URLS[0]).json()
    except ConnectionError:
        error_messages.append('Error: Elastic Search is not set up on this '
                              'machine or timed out trying to respond. '
                              '(ConnectionError/Timeout)')
    except NotFoundError:
        error_messages.append('Error: Index is missing. Press the reindex '
                              'button below. (ElasticHttpNotFoundError)')

    outstanding_records = Record.outstanding()
    recent_records = Record.objects.order_by('-creation_time')[:100]

    return render(
        request,
        'admin/search_admin_view.html',
        {
            'title': 'Search',
            'es_deets': es_deets,
            'doc_type_stats': stats,
            'indexes': indexes,
            'index': get_index_name(),
            'error_messages': error_messages,
            'recent_records': recent_records,
            'outstanding_records': outstanding_records,
            'now': datetime.now(),
        })

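# Hedged sketch (not part of the original source): get_indexes() used above
# appears to return (index_name, doc_count) pairs, since the view sorts on
# m[0]. One plausible implementation against the Elasticsearch index-stats API
# could look like this; the real fjord helper may differ.
def get_indexes():
    """Return a list of (index_name, doc_count) tuples for all indexes."""
    stats = get_es().indices.stats(metric='docs')
    return [(name, data['primaries']['docs']['count'])
            for name, data in stats['indices'].items()]
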
def timezone_view(request):
    """Admin view showing times and timezones in data."""
    # Note: This is an admin page that gets used once in a blue moon.
    # As such, I'm taking some liberties (hand-indexing the response,
    # time.sleep, etc) that I would never take if it was used more
    # often or was viewable by users. If these two assumptions ever
    # change, then this should be rewritten.
    from fjord.feedback.models import (
        Response, ResponseDocType, ResponseDocTypeManager
    )
    from fjord.feedback.tests import ResponseFactory
    from fjord.search.index import get_es, get_index_name

    server_time = datetime.now()

    # Create a new response.
    resp = ResponseFactory()
    resp_time = resp.created

    # Index the response by hand so we know it gets to
    # Elasticsearch. Otherwise it gets done by celery and we don't
    # know how long that'll take.
    doc = ResponseDocType.extract_doc(resp)
    ResponseDocTypeManager.bulk_index(docs=[doc])

    # Fetch the response from the db.
    resp = Response.objects.get(id=resp.id)
    resp2_time = resp.created

    # Refresh and sleep 5 seconds as a hand-wavey way to make sure
    # that Elasticsearch has had time to refresh the index.
    get_es().indices.refresh(get_index_name())
    time.sleep(5)

    s = ResponseDocTypeManager.search().filter('term', id=resp.id).execute()
    es_time = s[0].created

    # Delete the test response which also deletes it in the index.
    resp.delete()

    return render(request, 'admin/timezone_view.html', {
        'server_time': server_time,
        'resp_time': resp_time,
        'resp2_time': resp2_time,
        'es_time': es_time
    })

def reindex():
    """Calculates and creates indexing chunks"""
    index = get_index_name()

    batch_id = create_batch_id()

    # Break up all the things we want to index into chunks. This
    # chunkifies by class then by chunk size.
    chunks = []
    for cls, indexable in get_indexable():
        chunks.extend(
            (cls, chunk) for chunk in chunked(indexable, CHUNK_SIZE))

    for cls, id_list in chunks:
        chunk_name = '%s %d -> %d' % (
            cls._doc_type.name, id_list[0], id_list[-1])
        rec = Record(batch_id=batch_id, name=chunk_name)
        rec.save()
        index_chunk_task.delay(index, batch_id, rec.id,
                               (to_class_path(cls), id_list))

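# Hedged sketch (not part of the original source): reindex() above relies on a
# chunked() helper to split the indexable ids into CHUNK_SIZE batches. A
# minimal, generic implementation could look like this; the real fjord helper
# may differ in details.
def chunked(iterable, chunk_size):
    """Yield lists of at most ``chunk_size`` items from ``iterable``."""
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == chunk_size:
            yield chunk
            chunk = []
    # Yield whatever is left over as the final, possibly short, chunk.
    if chunk:
        yield chunk
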
def monitor_view(request):
    """View for services monitor."""
    # Dict of infrastructure name -> list of output tuples of (INFO,
    # msg) or (ERROR, msg)
    status = {}

    # Note: To add a new component, do your testing and then add a
    # name -> list of output tuples map to status.

    # Check memcached.
    memcache_results = []
    try:
        for cache_name, cache_props in settings.CACHES.items():
            result = True
            backend = cache_props['BACKEND']
            location = cache_props['LOCATION']

            # LOCATION can be a string or a list of strings
            if isinstance(location, basestring):
                location = location.split(';')

            if 'memcache' in backend:
                for loc in location:
                    # TODO: this doesn't handle unix: variant
                    ip, port = loc.split(':')
                    result = test_memcached(ip, int(port))
                    memcache_results.append(
                        (INFO, '%s:%s %s' % (ip, port, result)))

        if not memcache_results:
            memcache_results.append((ERROR, 'memcache is not configured.'))
        elif len(memcache_results) < 2:
            memcache_results.append(
                (ERROR, ('You should have at least 2 memcache servers. '
                         'You have %s.' % len(memcache_results))))
        else:
            memcache_results.append((INFO, 'memcached servers look good.'))
    except Exception as exc:
        memcache_results.append(
            (ERROR, 'Exception while looking at memcached: %s' % str(exc)))

    status['memcached'] = memcache_results

    # Check ES.
    es_results = []
    try:
        get_index_stats()
        es_results.append(
            (INFO, ('Successfully connected to ElasticSearch and index '
                    'exists.')))
    except ConnectionError as exc:
        es_results.append(
            (ERROR, 'Cannot connect to ElasticSearch: %s' % str(exc)))
    except NotFoundError:
        es_results.append(
            (ERROR, 'Index "%s" missing.' % get_index_name()))
    except Exception as exc:
        es_results.append(
            (ERROR, 'Exception while looking at ElasticSearch: %s' % str(exc)))

    status['ElasticSearch'] = es_results

    # Check RabbitMQ.
    rabbitmq_results = []
    try:
        rabbit_conn = establish_connection(connect_timeout=2)
        rabbit_conn.connect()
        rabbitmq_results.append(
            (INFO, 'Successfully connected to RabbitMQ.'))
    except (socket.error, IOError) as exc:
        rabbitmq_results.append(
            (ERROR, 'Error connecting to RabbitMQ: %s' % str(exc)))
    except Exception as exc:
        rabbitmq_results.append(
            (ERROR, 'Exception while looking at RabbitMQ: %s' % str(exc)))

    status['RabbitMQ'] = rabbitmq_results

    status_code = 200

    status_summary = {}
    for component, output in status.items():
        if ERROR in [item[0] for item in output]:
            status_code = 500
            status_summary[component] = False
        else:
            status_summary[component] = True

    return render(request, 'services/monitor.html',
                  {'component_status': status,
                   'status_summary': status_summary},
                  status=status_code)

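# Hedged sketch (not part of the original source): monitor_view() above calls
# a test_memcached(ip, port) helper. One plausible implementation is a plain
# TCP check that asks the server for its version; the real helper may use a
# memcache client library instead.
import socket


def test_memcached(ip, port, timeout=2):
    """Return True if a memcached server answers the ``version`` command."""
    try:
        sock = socket.create_connection((ip, port), timeout=timeout)
        try:
            sock.sendall(b'version\r\n')
            # A healthy memcached replies with a line like "VERSION 1.4.x".
            return sock.recv(128).startswith(b'VERSION')
        finally:
            sock.close()
    except (socket.error, socket.timeout):
        return False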