Example #1
0
def metadata(request):
    """Return the metadata aggregations for the current search parameters.

    The aggregations produced by ``metadata_aggregation`` are extended with a
    ``pillar`` entry that holds the summed document count per pillar name,
    derived from the ``newspaper_ids`` buckets. Buckets whose newspaper is
    unknown, or whose newspaper has no pillar, are counted under ``'None'``.
    """
    search = get_search_parameters(request.REQUEST)
    outcome = metadata_aggregation(settings.ES_INDEX,
                                   settings.ES_DOCTYPE,
                                   search['query'],
                                   search['dates'],
                                   search['distributions'],
                                   search['article_types'],
                                   search['pillars'])
    aggregations = outcome['aggregations']

    # Sum the document counts of the newspaper buckets per pillar name
    per_pillar = Counter()
    for bucket in aggregations['newspaper_ids']['buckets']:
        try:
            paper = Newspaper.objects.get(pk=bucket['key'])
        except Newspaper.DoesNotExist:
            # TODO: this means there's a paper_dc_identifier in ElasticSearch
            # without a corresponding Newspaper.
            paper = None
        name = paper.pillar.name if paper and paper.pillar else 'None'
        per_pillar[name] += bucket['doc_count']

    # Present the pillar counts as key/doc_count dicts, mimicking the result
    # of the other aggregations
    aggregations['pillar'] = [{'key': name, 'doc_count': total}
                              for name, total in per_pillar.items()]

    return json_response_message('success', 'Complete', aggregations)
def doc_count(request):
    """Return the number of documents matched by the current query.

    The search parameters are read from ``request.GET`` and handed to
    ``count_search_results``. If the result contains a ``count`` (a count of
    zero included), an ``'ok'`` JSON message carrying ``doc_count`` is
    returned; otherwise an ``'error'`` JSON message is returned.
    """
    # Lazy logger arguments avoid encoding errors on non-ASCII usernames
    logger.info('services/doc_count/ - user: %s', request.user.username)

    if settings.DEBUG:
        stderr.write("doc_count()\n")

    params = get_search_parameters(request.GET)

    result = count_search_results(settings.ES_INDEX,
                                  settings.ES_DOCTYPE,
                                  params['query'],
                                  params['dates'],
                                  params['distributions'],
                                  params['article_types'],
                                  params['pillars'])

    count = result.get('count')

    # Compare against None: a query without any hits has a valid count of 0,
    # which must not be reported as a failure.
    if count is not None:
        logger.info('services/doc_count/ - returned calculated count.')
        return json_response_message('ok', 'Retrieved document count.',
                                     {'doc_count': count})

    logger.info('services/doc_count/ - returned "unable to retrieve".')
    return json_response_message('error', 'Unable to retrieve document count')
Example #3
0
def search(request):
    """Perform a search request and return an html string with the results.

    The search parameters are read from ``request.REQUEST``. For a valid
    query, the rendered result of ``elasticsearch_htmlresp`` is returned under
    the ``html`` key of an ``'ok'`` JSON message. Otherwise an ``'error'``
    JSON message is returned, containing the query and the text returned by
    ``do_search``.
    """
    # Lazy logger arguments avoid encoding errors on non-ASCII usernames
    logger.info('services/search/ - user: %s', request.user.username)

    params = get_search_parameters(request.REQUEST)

    valid_q, result = do_search(settings.ES_INDEX,
                                settings.ES_DOCTYPE,
                                params['query'],
                                params['start']-1,  # Zero based counting
                                params['result_size'],
                                params['dates'],
                                params['distributions'],
                                params['article_types'],
                                params['pillars'],
                                sort_order=params['sort_order'])
    if valid_q:
        html_str = elasticsearch_htmlresp(settings.ES_INDEX,
                                          params['start'],
                                          params['result_size'],
                                          result)
        return json_response_message('ok', 'Search completed', {'html': html_str})

    # The message contains markup (<br />), so the user supplied query is
    # escaped as well as the error text. The unicode template keeps
    # non-ASCII queries from raising an encoding error while formatting.
    details = escape(result).replace('\n', '<br />')
    msg = u'Unable to parse query "{q}"<br /><br />{details}'.format(
        q=escape(params['query']), details=details)
    return json_response_message('error', msg)
def search(request):
    """Perform a search request and return the hits.

    The search parameters are read from ``request.GET``. An invalid date
    range, as judged by ``validate_dates``, yields an ``'error'`` JSON message
    before any search is done. For a valid query, the ``hits`` of the search
    result are returned in an ``'ok'`` JSON message. Otherwise an ``'error'``
    JSON message is returned, containing the query and the text returned by
    ``do_search``.
    """
    # Lazy logger arguments avoid encoding errors on non-ASCII usernames
    logger.info('services/search/ - user: %s', request.user.username)

    params = get_search_parameters(request.GET)

    if not validate_dates(params['dates']):
        msg = 'You entered an invalid date range. Please check your date filters.'
        return json_response_message('error', msg)

    valid_q, result = do_search(settings.ES_INDEX,
                                settings.ES_DOCTYPE,
                                params['query'],
                                params['start']-1,  # Zero based counting
                                params['result_size'],
                                params['dates'],
                                params['distributions'],
                                params['article_types'],
                                params['pillars'],
                                sort_order=params['sort_order'])
    if valid_q:
        return json_response_message('ok', 'Search completed', {'hits': result['hits']})

    # The message contains markup (<br />), so the user supplied query is
    # escaped as well as the error text. The unicode template keeps
    # non-ASCII queries from raising an encoding error while formatting.
    details = escape(result).replace('\n', '<br />')
    msg = u'Unable to parse query "{q}"<br /><br />{details}'.format(
        q=escape(params['query']), details=details)
    return json_response_message('error', msg)
Example #5
0
def doc_count(request):
    """Return the number of documents matched by the current query.

    The search parameters are read from ``request.GET`` and handed to
    ``count_search_results``. If the result contains a ``count`` (a count of
    zero included), an ``'ok'`` JSON message carrying ``doc_count`` is
    returned; otherwise an ``'error'`` JSON message is returned.
    """
    # Lazy logger arguments avoid encoding errors on non-ASCII usernames
    logger.info('services/doc_count/ - user: %s', request.user.username)

    if settings.DEBUG:
        stderr.write("doc_count()\n")

    params = get_search_parameters(request.GET)

    result = count_search_results(settings.ES_INDEX, settings.ES_DOCTYPE,
                                  params['query'], params['dates'],
                                  params['distributions'],
                                  params['article_types'], params['pillars'])

    count = result.get('count')

    # Compare against None: a query without any hits has a valid count of 0,
    # which must not be reported as a failure.
    if count is not None:
        logger.info('services/doc_count/ - returned calculated count.')
        return json_response_message('ok', 'Retrieved document count.',
                                     {'doc_count': count})

    logger.info('services/doc_count/ - returned "unable to retrieve".')
    return json_response_message('error', 'Unable to retrieve document count')
Example #6
0
def search(request):
    """Perform a search request and return the hits.

    The search parameters are read from ``request.GET``. An invalid date
    range, as judged by ``validate_dates``, yields an ``'error'`` JSON message
    before any search is done. For a valid query, the ``hits`` of the search
    result are returned in an ``'ok'`` JSON message. Otherwise an ``'error'``
    JSON message is returned, containing the query and the text returned by
    ``do_search``.
    """
    # Lazy logger arguments avoid encoding errors on non-ASCII usernames
    logger.info('services/search/ - user: %s', request.user.username)

    params = get_search_parameters(request.GET)

    if not validate_dates(params['dates']):
        msg = 'You entered an invalid date range. Please check your date filters.'
        return json_response_message('error', msg)

    valid_q, result = do_search(
        settings.ES_INDEX,
        settings.ES_DOCTYPE,
        params['query'],
        params['start'] - 1,  # Zero based counting
        params['result_size'],
        params['dates'],
        params['distributions'],
        params['article_types'],
        params['pillars'],
        sort_order=params['sort_order'])
    if valid_q:
        return json_response_message('ok', 'Search completed',
                                     {'hits': result['hits']})

    # The message contains markup (<br />), so the user supplied query is
    # escaped as well as the error text. The unicode template keeps
    # non-ASCII queries from raising an encoding error while formatting.
    details = escape(result).replace('\n', '<br />')
    msg = u'Unable to parse query "{q}"<br /><br />{details}'.format(
        q=escape(params['query']), details=details)
    return json_response_message('error', msg)
Example #7
0
def tv_cloud(request):
    """Generate termvector word cloud using the termvector approach.

    Returns word cloud data for a single document word cloud (based on a single
    document id) and multiple document word clouds (either based on a list of
    document ids (i.e., timeline burst cloud) or a query with metadata).

    For multiple document word clouds, a celery task generates the cloud data
    and the id of that task is returned instead of the cloud itself.

    Request parameters read by this view: ``ids`` (comma separated document
    ids), ``queryID``, ``min_length`` (defaults to 2, also when the supplied
    value is not an integer), ``stopwords``, ``stopwords_default`` and
    ``stems`` (the latter three are enabled by the value ``"1"``).
    """
    if settings.DEBUG:
        stderr.write("termvector cloud()\n")
    logger.info('services/cloud/ - termvector word cloud')
    # Lazy logger arguments avoid encoding errors on non-ASCII usernames
    logger.info('services/cloud/ - user: %s', request.user.username)

    params = get_search_parameters(request.REQUEST)

    ids = request.REQUEST.get('ids')
    query_id = request.GET.get('queryID')
    # An empty or non-numeric min_length should not result in a server error;
    # fall back to the default instead.
    try:
        min_length = int(request.GET.get('min_length', 2))
    except (TypeError, ValueError):
        min_length = 2
    use_stopwords = request.GET.get('stopwords') == "1"
    use_default_stopwords = request.GET.get('stopwords_default') == "1"
    stems = request.GET.get('stems') == "1"

    # Retrieve the stopwords
    stopwords = []
    if use_stopwords:
        # Stopwords of the user that are not tied to a query
        stopwords_user = list(StopWord.objects
                              .filter(user=request.user)
                              .filter(query=None)
                              .values_list('word', flat=True))

        # Stopwords of the user for the given query
        stopwords_query = []
        if query_id:
            stopwords_query = list(StopWord.objects
                                   .filter(user=request.user)
                                   .filter(query__id=query_id)
                                   .values_list('word', flat=True))

        # Default stopwords: tied to neither a user nor a query
        stopwords_default = []
        if use_default_stopwords:
            stopwords_default = list(StopWord.objects
                                     .filter(user=None)
                                     .filter(query=None)
                                     .values_list('word', flat=True))

        stopwords = stopwords_user + stopwords_query + stopwords_default

    # Cloud by ids
    if ids:
        ids = ids.split(',')

        if len(ids) == 1:
            # Word cloud for single document
            logger.info('services/cloud/ - single document word cloud')
            t_vector = single_document_word_cloud(settings.ES_INDEX,
                                                  settings.ES_DOCTYPE,
                                                  ids[0],
                                                  min_length,
                                                  stopwords,
                                                  stems)
            return json_response_message('ok', 'Word cloud generated', t_vector)

    # Cloud by queryID or multiple ids
    logger.info('services/cloud/ - multiple document word cloud')

    task = generate_tv_cloud.delay(params, min_length, stopwords, ids, stems)
    logger.info('services/cloud/ - Celery task id: %s', task.id)

    return json_response_message('ok', '', {'task': task.id})
Example #8
0
def cloud(request):
    """Return word cloud data using the terms aggregation approach

    This view is currently not used, because it uses the terms aggregation
    approach to generate word clouds, and this is not feasible in ES.

    A single document id in ``ids`` yields the word cloud of that document.
    Multiple ids and/or a ``queryID`` yield a multiple document word cloud;
    when both are given, the cloud for the query is the one returned.
    """
    if settings.DEBUG:
        print >> stderr, "cloud()"

    word_cloud = None

    search = get_search_parameters(request.REQUEST)

    # Cloud by ids
    raw_ids = request.REQUEST.get('ids')
    if raw_ids:
        doc_ids = raw_ids.split(',')

        if len(doc_ids) > 1:
            # Word cloud for multiple ids
            word_cloud = multiple_document_word_cloud(
                search.get('collection'),
                settings.ES_DOCTYPE,
                search.get('query'),
                search.get('dates'),
                search.get('distributions'),
                search.get('article_types'),
                search.get('pillars'),
                doc_ids)
        else:
            # Word cloud for single document
            single = single_document_word_cloud(settings.ES_INDEX,
                                                settings.ES_DOCTYPE,
                                                doc_ids[0])
            return json_response_message('ok', 'Word cloud generated', single)

    # Cloud by queryID
    query_id = request.REQUEST.get('queryID')
    if query_id:
        query, response = get_query_object(query_id)
        if not query:
            return response

        # for some reason, the collection to be searched is stored in parameter
        # 'collections' (with s added) instead of 'collection' as expected by
        # get_search_parameters.
        index = request.REQUEST.get('collections', settings.ES_INDEX)

        word_cloud = multiple_document_word_cloud(
            index,
            settings.ES_DOCTYPE,
            query.query,
            search.get('dates'),
            search.get('distributions'),
            search.get('article_types'),
            search.get('pillars'))

    if word_cloud:
        return json_response_message('success', 'Word cloud generated',
                                     word_cloud)

    return json_response_message('error', 'No word cloud generated.')