Example #1
    def mlt(self):
        """Returns a search with a morelikethis query for docs like this"""
        # Short responses tend to not repeat any words, so then MLT
        # returns nothing. This fixes that by setting min_term_freq to
        # 1. Longer responses tend to repeat important words, so we can
        # set min_term_freq to 2.
        num_words = len(self.description.split(' '))
        if num_words > 40:
            min_term_freq = 2
        else:
            min_term_freq = 1

        s = self.search()
        if self.product:
            s = s.filter('term', product=self.product)
        if self.platform:
            s = s.filter('term', platform=self.platform)

        s = s.query(
            'more_like_this',
            fields=['description'],
            docs=[
                {
                    '_index': get_index_name(),
                    '_type': self._doc_type.name,
                    '_id': self.id
                }
            ],
            min_term_freq=min_term_freq,
            stop_words=list(ANALYSIS_STOPWORDS)
        )
        return s
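A minimal usage sketch for the method above, assuming response is an
indexed instance exposing mlt(); the slicing and execute() calls follow
elasticsearch-dsl conventions, and the five-result cap is an
illustrative choice, not part of the example:

    # Hypothetical caller: fetch up to five documents similar to this
    # response and walk the hits.
    similar = response.mlt()[:5].execute()
    for hit in similar:
        print hit.meta.id, hit.description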
Example #2
    def teardown_indexes(self):
        try:
            get_es().indices.delete(get_index_name())
        except NotFoundError:
            # The index didn't exist, so there's nothing to delete.
            pass
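The matching setup step isn't shown on this page; a plausible sketch,
assuming a recreate_index() helper that drops and rebuilds the index
with its mappings and a reindex() like the one in Example #6 (the
helper name and the empty-flag semantics are assumptions inferred from
the setup_indexes(empty=True) call in Example #3):

    def setup_indexes(self, empty=False):
        # Hypothetical counterpart to teardown_indexes: start each test
        # from a fresh index, optionally leaving it unpopulated.
        recreate_index()
        if not empty:
            reindex()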
Example #3
    def test_index_chunk_task(self):
        responses = ResponseFactory.create_batch(10)

        # With live indexing, creating those responses also puts them in
        # the index. Since we want to test index_chunk_task, we need a
        # clean index to start with, so we delete and recreate it.
        self.setup_indexes(empty=True)

        # Verify there's nothing in the index.
        assert ResponseDocType.docs.search().count() == 0

        # Create the record and the chunk and then run it through
        # celery.
        batch_id = 'ou812'
        rec = RecordFactory(batch_id=batch_id)

        chunk = (
            to_class_path(ResponseDocType),
            [item.id for item in responses]
        )
        index_chunk_task.delay(get_index_name(), batch_id, rec.id, chunk)

        self.refresh()

        # Verify everything is in the index now.
        assert ResponseDocType.docs.search().count() == 10

        # Verify the record was marked succeeded.
        rec = Record.objects.get(pk=rec.id)
        assert rec.status == Record.STATUS_SUCCESS
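The chunk carries the doc type as a class path string so it survives
celery's message serialization; a plausible round-trip sketch of the
helpers (the module:ClassName separator is an assumption, not confirmed
by this page):

    import importlib

    def to_class_path(cls):
        # Serialize a class as 'module:ClassName'.
        return ':'.join([cls.__module__, cls.__name__])

    def from_class_path(cls_path):
        # Reverse of to_class_path: import the module, then look the
        # class back up by name.
        module_path, cls_name = cls_path.split(':')
        return getattr(importlib.import_module(module_path), cls_name)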
Example #4
File: admin.py Project: xrile/fjord
def search_admin_view(request):
    """Render the admin view containing search tools"""
    error_messages = []
    stats = None
    es_deets = None
    indexes = []

    try:
        if 'reset' in request.POST:
            return handle_reset(request)

        if 'reindex' in request.POST:
            return handle_reindex(request)

        if 'recreate_reindex' in request.POST:
            return handle_recreate_reindex(request)

    except Exception as exc:
        error_messages.append(u'Error: %s' % exc)

    try:
        # This gets index stats, but also tells us whether ES is in
        # a bad state.
        try:
            stats = get_index_stats()
        except NotFoundError:
            stats = None

        indexes = get_indexes()
        indexes.sort(key=lambda m: m[0])

        # TODO: Input has a single ES_URL which points at the ZLB, and
        # the ZLB does the balancing. If that ever changes and we have
        # multiple ES_URLs, then this should get fixed.
        es_deets = requests.get(settings.ES_URLS[0]).json()
    except ConnectionError:
        error_messages.append('Error: Elasticsearch is not set up on this '
                              'machine or timed out trying to respond. '
                              '(ConnectionError/Timeout)')
    except NotFoundError:
        error_messages.append('Error: Index is missing. Press the reindex '
                              'button below. (NotFoundError)')

    outstanding_records = Record.outstanding()
    recent_records = Record.objects.order_by('-creation_time')[:100]

    return render(
        request, 'admin/search_admin_view.html', {
            'title': 'Search',
            'es_deets': es_deets,
            'doc_type_stats': stats,
            'indexes': indexes,
            'index': get_index_name(),
            'error_messages': error_messages,
            'recent_records': recent_records,
            'outstanding_records': outstanding_records,
            'now': datetime.now(),
        })
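get_index_stats() isn't defined on this page; a minimal sketch of what
it might look like, assuming it proxies the Elasticsearch index stats
API so a missing index surfaces as the NotFoundError handled above (the
real helper may compute per-doc-type counts instead):

    def get_index_stats():
        # elasticsearch-py raises NotFoundError when the index is gone,
        # which is exactly what the callers above rely on.
        return get_es().indices.stats(index=get_index_name())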
Example #5
def timezone_view(request):
    """Admin view showing times and timezones in data."""
    # Note: This is an admin page that gets used once in a blue moon.
    # As such, I'm taking some liberties (hand-indexing the response,
    # time.sleep, etc) that I would never take if it was used more
    # often or was viewable by users. If these two assumptions ever
    # change, then this should be rewritten.

    from fjord.feedback.models import (
        Response,
        ResponseDocType,
        ResponseDocTypeManager
    )
    from fjord.feedback.tests import ResponseFactory
    from fjord.search.index import get_es, get_index_name

    server_time = datetime.now()

    # Create a new response.
    resp = ResponseFactory()
    resp_time = resp.created

    # Index the response by hand so we know it gets to
    # Elasticsearch. Otherwise it gets done by celery and we don't
    # know how long that'll take.
    doc = ResponseDocType.extract_doc(resp)
    ResponseDocTypeManager.bulk_index(docs=[doc])

    # Fetch the response from the db.
    resp = Response.objects.get(id=resp.id)
    resp2_time = resp.created

    # Refresh and sleep 5 seconds as a hand-wavey way to make sure
    # that Elasticsearch has had time to refresh the index.
    get_es().indices.refresh(get_index_name())
    time.sleep(5)

    s = ResponseDocTypeManager.search().filter('term', id=resp.id).execute()
    es_time = s[0].created

    # Delete the test response which also deletes it in the index.
    resp.delete()

    return render(request, 'admin/timezone_view.html', {
        'server_time': server_time,
        'resp_time': resp_time,
        'resp2_time': resp2_time,
        'es_time': es_time
    })
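ResponseDocTypeManager.bulk_index isn't shown on this page; a plausible
shape built on elasticsearch.helpers.bulk, where the action layout, the
cls.doc_type attribute, and reusing doc['id'] as the Elasticsearch _id
are all assumptions rather than fjord's confirmed implementation:

    from elasticsearch.helpers import bulk

    @classmethod
    def bulk_index(cls, docs):
        # Each doc is assumed to be the dict extract_doc() returns,
        # carrying an 'id' field we can reuse as the document _id.
        actions = [{'_index': get_index_name(),
                    '_type': cls.doc_type._doc_type.name,
                    '_id': doc['id'],
                    '_source': doc}
                   for doc in docs]
        bulk(get_es(), actions)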
Example #6
File: admin.py Project: xrile/fjord
def reindex():
    """Calculates and creates indexing chunks"""
    index = get_index_name()

    batch_id = create_batch_id()

    # Break up all the things we want to index into chunks. This
    # chunkifies by class then by chunk size.
    chunks = []
    for cls, indexable in get_indexable():
        chunks.extend((cls, chunk) for chunk in chunked(indexable, CHUNK_SIZE))

    for cls, id_list in chunks:
        chunk_name = '%s %d -> %d' % (cls._doc_type.name, id_list[0],
                                      id_list[-1])
        rec = Record(batch_id=batch_id, name=chunk_name)
        rec.save()
        index_chunk_task.delay(index, batch_id, rec.id,
                               (to_class_path(cls), id_list))
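chunked() isn't defined on this page; a minimal sketch of the likely
behavior, yielding successive lists of at most chunk_size ids (the real
helper may differ in details):

    def chunked(iterable, chunk_size):
        # Yield lists of up to chunk_size items until exhausted.
        chunk = []
        for item in iterable:
            chunk.append(item)
            if len(chunk) == chunk_size:
                yield chunk
                chunk = []
        if chunk:
            yield chunk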
Example #7
def monitor_view(request):
    """View for services monitor."""
    # Dict of infrastructure name -> list of output tuples of (INFO,
    # msg) or (ERROR, msg)
    status = {}

    # Note: To add a new component, do your testing and then add a
    # name -> list of output tuples map to status.

    # Check memcached.
    memcache_results = []
    try:
        for cache_name, cache_props in settings.CACHES.items():
            result = True
            backend = cache_props['BACKEND']
            location = cache_props['LOCATION']

            # LOCATION can be a string or a list of strings
            if isinstance(location, basestring):
                location = location.split(';')

            if 'memcache' in backend:
                for loc in location:
                    # TODO: this doesn't handle the unix: socket variant.
                    ip, port = loc.split(':')
                    result = test_memcached(ip, int(port))
                    memcache_results.append(
                        (INFO, '%s:%s %s' % (ip, port, result)))

        if not memcache_results:
            memcache_results.append((ERROR, 'memcache is not configured.'))

        elif len(memcache_results) < 2:
            memcache_results.append(
                (ERROR, ('You should have at least 2 memcache servers. '
                         'You have %s.' % len(memcache_results))))

        else:
            memcache_results.append((INFO, 'memcached servers look good.'))

    except Exception as exc:
        memcache_results.append(
            (ERROR, 'Exception while looking at memcached: %s' % str(exc)))

    status['memcached'] = memcache_results

    # Check ES.
    es_results = []
    try:
        get_index_stats()
        es_results.append(
            (INFO, ('Successfully connected to Elasticsearch and the index '
                    'exists.')))

    except ConnectionError as exc:
        es_results.append(
            (ERROR, 'Cannot connect to Elasticsearch: %s' % str(exc)))

    except NotFoundError:
        es_results.append(
            (ERROR, 'Index "%s" missing.' % get_index_name()))

    except Exception as exc:
        es_results.append(
            (ERROR, 'Exception while looking at Elasticsearch: %s' % str(exc)))

    status['Elasticsearch'] = es_results

    # Check RabbitMQ.
    rabbitmq_results = []
    try:
        rabbit_conn = establish_connection(connect_timeout=2)
        rabbit_conn.connect()
        rabbitmq_results.append(
            (INFO, 'Successfully connected to RabbitMQ.'))

    except (socket.error, IOError) as exc:
        rabbitmq_results.append(
            (ERROR, 'Error connecting to RabbitMQ: %s' % str(exc)))

    except Exception as exc:
        rabbitmq_results.append(
            (ERROR, 'Exception while looking at RabbitMQ: %s' % str(exc)))

    status['RabbitMQ'] = rabbitmq_results

    status_code = 200

    status_summary = {}
    for component, output in status.items():
        if ERROR in [item[0] for item in output]:
            status_code = 500
            status_summary[component] = False
        else:
            status_summary[component] = True

    return render(request, 'services/monitor.html',
                  {'component_status': status,
                   'status_summary': status_summary},
                  status=status_code)
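test_memcached() isn't shown here; a plausible sketch that probes each
server over TCP with the ascii protocol's version command (this is an
assumption about how the check works, not fjord's actual
implementation):

    import socket

    def test_memcached(host, port, timeout=2):
        try:
            sock = socket.create_connection((host, port), timeout=timeout)
        except (socket.error, IOError):
            return False
        try:
            # 'version' is the cheapest liveness check memcached offers.
            sock.sendall('version\r\n')
            return sock.recv(1024).startswith('VERSION')
        finally:
            sock.close()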