Example #1
def count_results(query):
    """Returns the number of results for a Query"""
    params = query.get_query_dict()
    result = count_search_results(settings.ES_INDEX, settings.ES_DOCTYPE,
                                  params['query'], params['dates'],
                                  params['exclude_distributions'],
                                  params['exclude_article_types'],
                                  params['selected_pillars'])
    return result.get('count')
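
A minimal usage sketch for the helper above, assuming a Django Query model whose get_query_dict() returns the expected keys (the call site below is illustrative, not from the source):

# Hypothetical caller: look up a saved query and report how many documents match.
query = Query.objects.get(title='my saved query', user=request.user)
print('Query matches {} documents'.format(count_results(query)))
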
Example #2
    def handle(self, *args, **options):
        print 'Emptying table...'
        Term.objects.all().delete()

        for timeframe, dates in TIMEFRAMES.items():
            print 'Retrieving documents for timeframe {}...'.format(timeframe)
            exclude_dist = Distribution.objects.exclude(
                name='Landelijk').values_list('id', flat=True)
            date_range = daterange2dates(dates)

            total_documents = count_search_results(settings.ES_INDEX,
                                                   settings.ES_DOCTYPE, None,
                                                   date_range, exclude_dist,
                                                   [], []).get('count')
            print 'Total documents: {}'.format(total_documents)

            sets = document_id_chunks(10000,
                                      settings.ES_INDEX,
                                      settings.ES_DOCTYPE,
                                      None,
                                      date_range,
                                      dist=exclude_dist)

            print 'Counting terms...'
            counter = Counter()
            for n, s in enumerate(sets):
                start_time = time.time()
                counter += termvector_wordcloud(settings.ES_INDEX,
                                                settings.ES_DOCTYPE,
                                                s,
                                                min_length=2,
                                                add_freqs=False)
                print 'Completed set {} in {} seconds...'.format(
                    n + 1,
                    time.time() - start_time)

            print 'Calculating IDFs...'
            terms = []
            for term, count in counter.items():
                if count > 1:  # don't add single occurrences
                    idf = math.log10(total_documents / float(count))
                    terms.append(
                        Term(timeframe=timeframe,
                             word=term,
                             count=count,
                             idf=idf))

            print 'Transferring to database...'
            Term.objects.bulk_create(terms, batch_size=10000)

            print 'Creating RecordDAWG'
            d = dawg.RecordDAWG(
                '<d', zip([t.word for t in terms], [(t.idf, ) for t in terms]))
            d.save(os.path.join(settings.PROJECT_PARENT, timeframe + '.dawg'))
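
The saved file can later be read back with the same record format; a short sketch, assuming the DAWG package's load()/lookup behavior (the path mirrors the save call above):

import os
import dawg

# Load the per-timeframe IDF values; '<d' matches the format used when saving.
d = dawg.RecordDAWG('<d')
d.load(os.path.join(settings.PROJECT_PARENT, timeframe + '.dawg'))
# A lookup returns a list of value tuples for the key.
if u'voorbeeld' in d:
    idf = d[u'voorbeeld'][0][0]
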
        """ Test code below.
def generate_tv_cloud(search_params, min_length, stopwords, date_range=None, stems=False, idf_timeframe=''):
    """
    Generates multiple document word clouds using the termvector approach.
    """
    # Date range is either provided (in case of burst clouds from the timelines) or from the Query
    dates = date_range or search_params['dates']

    # First, count the search results
    result = count_search_results(settings.ES_INDEX,
                                  settings.ES_DOCTYPE,
                                  search_params['query'],
                                  dates,
                                  search_params['exclude_distributions'],
                                  search_params['exclude_article_types'],
                                  search_params['selected_pillars'])
    doc_count = result.get('count')
    update_task_status(0, doc_count)

    # Then, create the word clouds per chunk
    progress = 0
    wordcloud_counter = Counter()
    for subset in document_id_chunks(settings.QUERY_DATA_CHUNK_SIZE,
                                     settings.ES_INDEX,
                                     settings.ES_DOCTYPE,
                                     search_params['query'],
                                     dates,
                                     search_params['exclude_distributions'],
                                     search_params['exclude_article_types'],
                                     search_params['selected_pillars']):

        wordcloud_counter += termvector_wordcloud(settings.ES_INDEX,
                                                  settings.ES_DOCTYPE,
                                                  subset,
                                                  min_length,
                                                  stems)

        # Update the task status
        progress += len(subset)
        update_task_status(progress, doc_count)

    # Remove non-frequent words from the counter
    for key, count in dropwhile(lambda c: c[1] > math.log10(doc_count), wordcloud_counter.most_common()):
        del wordcloud_counter[key]

    # Remove the stopwords from the counter
    for sw in stopwords:
        del wordcloud_counter[sw]

    # Return a dictionary with the results
    return {
        'result': normalize_cloud(wordcloud_counter, idf_timeframe),
        'status': 'ok',
        'burstcloud': date_range is not None
    }
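
The pruning step above leans on Counter.most_common() returning items in descending count order: dropwhile skips the frequent head of the list and yields only the infrequent tail, which is then deleted. A self-contained illustration of the same pattern:

import math
from collections import Counter
from itertools import dropwhile

doc_count = 1000  # threshold is log10(1000) == 3.0
counter = Counter({'the': 50, 'word': 10, 'rare': 2, 'once': 1})
# most_common() is sorted by count, descending; skip items while count > 3.0,
# then delete everything that remains.
for key, count in dropwhile(lambda item: item[1] > math.log10(doc_count),
                            counter.most_common()):
    del counter[key]
# counter now keeps only the frequent terms: Counter({'the': 50, 'word': 10})
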
Example #4
def count_results(query):
    """Returns the number of results for a Query"""
    params = query.get_query_dict()
    result = count_search_results(settings.ES_INDEX,
                                  settings.ES_DOCTYPE,
                                  params['query'],
                                  params['dates'],
                                  params['exclude_distributions'],
                                  params['exclude_article_types'],
                                  params['selected_pillars'])
    return result.get('count')
Example #5
def generate_tv_cloud(search_params,
                      min_length,
                      stopwords,
                      date_range=None,
                      stems=False,
                      idf_timeframe=''):
    """
    Generates multiple document word clouds using the termvector approach.
    """
    # Date range is either provided (in case of burst clouds from the timelines) or from the Query
    dates = date_range or search_params['dates']

    # First, count the search results
    result = count_search_results(settings.ES_INDEX, settings.ES_DOCTYPE,
                                  search_params['query'], dates,
                                  search_params['exclude_distributions'],
                                  search_params['exclude_article_types'],
                                  search_params['selected_pillars'])
    doc_count = result.get('count')
    update_task_status(0, doc_count)

    # Then, create the word clouds per chunk
    progress = 0
    wordcloud_counter = Counter()
    for subset in document_id_chunks(settings.QUERY_DATA_CHUNK_SIZE,
                                     settings.ES_INDEX, settings.ES_DOCTYPE,
                                     search_params['query'], dates,
                                     search_params['exclude_distributions'],
                                     search_params['exclude_article_types'],
                                     search_params['selected_pillars']):

        wordcloud_counter += termvector_wordcloud(settings.ES_INDEX,
                                                  settings.ES_DOCTYPE, subset,
                                                  min_length, stems)

        # Update the task status
        progress += len(subset)
        update_task_status(progress, doc_count)

    # Remove non-frequent words from the counter
    for key, count in dropwhile(lambda c: c[1] > math.log10(doc_count),
                                wordcloud_counter.most_common()):
        del wordcloud_counter[key]

    # Remove the stopwords from the counter
    for sw in stopwords:
        del wordcloud_counter[sw]

    # Return a dictionary with the results
    return {
        'result': normalize_cloud(wordcloud_counter, idf_timeframe),
        'status': 'ok',
        'burstcloud': date_range is not None
    }
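
This variant reports progress through an update_task_status helper instead of calling Celery directly as Examples #6 and #7 do. The helper itself is not shown in the source; a plausible sketch, assuming it merely wraps current_task.update_state:

from celery import current_task

def update_task_status(current, total):
    """Hypothetical wrapper: publish chunk progress on the running Celery task."""
    current_task.update_state(state='PROGRESS',
                              meta={'current': current, 'total': total})
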
Example #6
def generate_tv_cloud(search_params,
                      min_length,
                      stopwords,
                      ids=None,
                      stems=False):
    """Generates multiple document word clouds using the termvector approach"""
    burst = True
    chunk_size = 1000
    progress = 0
    wordcloud_counter = Counter()

    if not ids:
        # Normal (non-timeline) wordcloud (based on query)
        burst = False

        result = count_search_results(settings.ES_INDEX, settings.ES_DOCTYPE,
                                      search_params['query'],
                                      search_params['dates'],
                                      search_params['distributions'],
                                      search_params['article_types'],
                                      search_params['pillars'])
        doc_count = result.get('count')

        info = {'current': 0, 'total': doc_count}
        current_task.update_state(state='PROGRESS', meta=info)

        for subset in document_id_chunks(
                chunk_size, settings.ES_INDEX, settings.ES_DOCTYPE,
                search_params['query'], search_params['dates'],
                search_params['distributions'], search_params['article_types'],
                search_params['pillars']):

            result = termvector_wordcloud(settings.ES_INDEX,
                                          settings.ES_DOCTYPE, subset,
                                          min_length, stems)
            wordcloud_counter = wordcloud_counter + result

            progress += len(subset)
            info = {'current': progress, 'total': doc_count}
            current_task.update_state(state='PROGRESS', meta=info)
    else:
        # Timeline word cloud (based on a list of document ids)
        for subset in utils.chunks(ids, chunk_size):
            result = termvector_wordcloud(settings.ES_INDEX,
                                          settings.ES_DOCTYPE, subset,
                                          min_length, stems)
            wordcloud_counter = wordcloud_counter + result

            progress += len(subset)
            info = {'current': progress, 'total': len(ids)}
            current_task.update_state(state='PROGRESS', meta=info)

    return counter2wordclouddata(wordcloud_counter, burst, stopwords)
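
The PROGRESS states published above can be read back on the client side; a minimal polling sketch using Celery's standard AsyncResult (how the task id reaches the client is left out):

from celery.result import AsyncResult

def task_progress(task_id):
    """Return (current, total) for a running word cloud task, or None."""
    result = AsyncResult(task_id)
    if result.state == 'PROGRESS' and isinstance(result.info, dict):
        return result.info.get('current'), result.info.get('total')
    return None
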
Example #7
def generate_tv_cloud(search_params, min_length, stopwords, date_range=None, stems=False):
    """Generates multiple document word clouds using the termvector approach"""
    chunk_size = settings.QUERY_DATA_CHUNK_SIZE
    progress = 0
    wordcloud_counter = Counter()

    # Date range is either provided or from the Query
    dates = date_range or search_params['dates']

    # First, count the search results
    result = count_search_results(settings.ES_INDEX,
                                  settings.ES_DOCTYPE,
                                  search_params['query'],
                                  dates,
                                  search_params['exclude_distributions'],
                                  search_params['exclude_article_types'],
                                  search_params['selected_pillars'])
    doc_count = result.get('count')

    info = {
        'current': 0,
        'total': doc_count
    }
    current_task.update_state(state='PROGRESS', meta=info)

    # Then, create the word clouds per chunk
    for subset in document_id_chunks(chunk_size,
                                     settings.ES_INDEX,
                                     settings.ES_DOCTYPE,
                                     search_params['query'],
                                     dates,
                                     search_params['exclude_distributions'],
                                     search_params['exclude_article_types'],
                                     search_params['selected_pillars']):

        result = termvector_wordcloud(settings.ES_INDEX,
                                      settings.ES_DOCTYPE,
                                      subset,
                                      min_length,
                                      stems)
        wordcloud_counter = wordcloud_counter + result

        progress += len(subset)
        info = {
            'current': progress,
            'total': doc_count
        }
        current_task.update_state(state='PROGRESS', meta=info)

    burst = date_range is not None
    return counter2wordclouddata(wordcloud_counter, burst, stopwords)
Example #8
    def handle(self, *args, **options):
        print 'Emptying table...'
        Term.objects.all().delete()

        for timeframe, dates in TIMEFRAMES.items():
            print 'Retrieving documents for timeframe {}...'.format(timeframe)
            exclude_dist = Distribution.objects.exclude(name='Landelijk').values_list('id', flat=True)
            date_range = daterange2dates(dates)

            total_documents = count_search_results(settings.ES_INDEX,
                                                   settings.ES_DOCTYPE,
                                                   None,
                                                   date_range,
                                                   exclude_dist, [], []).get('count')
            print 'Total documents: {}'.format(total_documents)

            sets = document_id_chunks(10000,
                                      settings.ES_INDEX,
                                      settings.ES_DOCTYPE,
                                      None,
                                      date_range,
                                      dist=exclude_dist)

            print 'Counting terms...'
            counter = Counter()
            for n, s in enumerate(sets):
                start_time = time.time()
                counter += termvector_wordcloud(settings.ES_INDEX,
                                                settings.ES_DOCTYPE,
                                                s,
                                                min_length=2,
                                                add_freqs=False)
                print 'Completed set {} in {} seconds...'.format(n + 1, time.time() - start_time)

            print 'Calculating IDFs...'
            terms = []
            for term, count in counter.items():
                if count > 1:  # don't add single occurrences
                    idf = math.log10(total_documents / float(count))
                    terms.append(Term(timeframe=timeframe, word=term, count=count, idf=idf))

            print 'Transferring to database...'
            Term.objects.bulk_create(terms, batch_size=10000)

            print 'Creating RecordDAWG'
            d = dawg.RecordDAWG('<d', zip([t.word for t in terms], [(t.idf,) for t in terms]))
            d.save(os.path.join(settings.PROJECT_PARENT, timeframe + '.dawg'))

        """ Test code below.
Example #9
def download_prepare(request):
    """Prepares the ocr+meta-data zipfile for download
    """
    if settings.DEBUG:
        print >> stderr, "download_prepare()"
        print >> stderr, request.REQUEST
    logger.info('query/download/prepare - user: {}'.
                format(request.user.username))

    user = request.user
    query = Query.objects.get(title=request.GET.get('query_title'), user=user)

    params = query.get_query_dict()
    result = count_search_results(settings.ES_INDEX,
                                  settings.ES_DOCTYPE,
                                  params['query'],
                                  params['dates'],
                                  params['exclude_distributions'],
                                  params['exclude_article_types'],
                                  params['selected_pillars'])
    count = result.get('count')

    if count > settings.QUERY_DATA_MAX_RESULTS:
        msg = "Your query has too much results to export: " + str(count)
        msg += " where " + str(settings.QUERY_DATA_MAX_RESULTS) + " are allowed. "
        msg += "Please consider filtering your results before exporting."
        return json_response_message('error', msg)

    if user.email == "":
        msg = "Preparing your download for query <br/><b>" + query.title + \
              "</b> failed.<br/>A valid email address is needed for user " \
              "<br/><b>" + user.username + "</b>"
        if settings.DEBUG:
            print >> stderr, msg
        return json_response_message('error', msg)

    try:
        validate_email(user.email)
    except ValidationError:  # raised by Django's validate_email
        msg = "Preparing your download for query <br/><b>" + query.title + \
              "</b> failed.<br/>The email address of user <b>" + \
              user.username + "</b> could not be validated: <b>" + \
              user.email + "</b>"
        if settings.DEBUG:
            print >> stderr, msg
        return json_response_message('error', msg)

    zip_basename = create_zipname(user, query)
    url = urljoin('http://{}'.format(request.get_host()),
                  "/query/download/" + quote_plus(zip_basename))
    email_message = "Texcavator query: " + query.title + "\n" + zip_basename + \
        "\nURL: " + url
    if settings.DEBUG:
        print >> stderr, email_message
        print >> stderr, 'http://{}'.format(request.get_host())

    # zip documents by celery background task
    execute(query, dict(request.REQUEST), zip_basename, user.email, email_message)

    msg = "Your export for query <b>" + query.title + \
          "</b> is completed.<br/>An e-mail with a download link has been sent " + \
          "to <b>" + user.email + "</b>."
    return json_response_message('SUCCESS', msg)
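
json_response_message is a project helper that is not shown here; a minimal sketch of what it plausibly does, assuming Django's JsonResponse (the payload shape is an assumption):

from django.http import JsonResponse

def json_response_message(status, msg):
    """Hypothetical helper: wrap a status string and message in a JSON response."""
    return JsonResponse({'status': status, 'msg': msg})
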
Example #10
def generate_tv_cloud(search_params, min_length, stopwords, ids=None, stems=False):
    """Generates multiple document word clouds using the termvector approach"""
    burst = True
    chunk_size = 1000
    progress = 0
    wordcloud_counter = Counter()

    if not ids:
        # Normal (non-timeline) wordcloud (based on query)
        burst = False

        result = count_search_results(settings.ES_INDEX,
                                      settings.ES_DOCTYPE,
                                      search_params['query'],
                                      search_params['dates'],
                                      search_params['distributions'],
                                      search_params['article_types'],
                                      search_params['pillars'])
        doc_count = result.get('count')

        info = {
            'current': 0,
            'total': doc_count
        }
        current_task.update_state(state='PROGRESS', meta=info)

        for subset in document_id_chunks(chunk_size,
                                         settings.ES_INDEX,
                                         settings.ES_DOCTYPE,
                                         search_params['query'],
                                         search_params['dates'],
                                         search_params['distributions'],
                                         search_params['article_types'],
                                         search_params['pillars']):

            result = termvector_wordcloud(settings.ES_INDEX,
                                          settings.ES_DOCTYPE,
                                          subset,
                                          min_length,
                                          stems)
            wordcloud_counter = wordcloud_counter + result

            progress += len(subset)
            info = {
                'current': progress,
                'total': doc_count
            }
            current_task.update_state(state='PROGRESS', meta=info)
    else:
        # Timeline word cloud (based on a list of document ids)
        for subset in utils.chunks(ids, chunk_size):
            result = termvector_wordcloud(settings.ES_INDEX,
                                          settings.ES_DOCTYPE,
                                          subset,
                                          min_length,
                                          stems)
            wordcloud_counter = wordcloud_counter + result

            progress += len(subset)
            info = {
                'current': progress,
                'total': len(ids)
            }
            current_task.update_state(state='PROGRESS', meta=info)

    return counter2wordclouddata(wordcloud_counter, burst, stopwords)
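
utils.chunks is assumed to be the usual slicing generator over the id list; a sketch of that behavior (an assumption, since the helper itself is not shown):

def chunks(items, size):
    """Yield successive slices of at most `size` elements from `items`."""
    for i in range(0, len(items), size):
        yield items[i:i + size]
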
Example #11
def download_prepare(request):
    """Prepares the ocr+meta-data zipfile for download
    """
    if settings.DEBUG:
        print >> stderr, "download_prepare()"
        print >> stderr, request.REQUEST
    logger.info('query/download/prepare - user: {}'.format(
        request.user.username))

    user = request.user
    query = Query.objects.get(title=request.GET.get('query_title'), user=user)

    params = query.get_query_dict()
    result = count_search_results(settings.ES_INDEX, settings.ES_DOCTYPE,
                                  params['query'], params['dates'],
                                  params['exclude_distributions'],
                                  params['exclude_article_types'],
                                  params['selected_pillars'])
    count = result.get('count')

    if count > settings.QUERY_DATA_MAX_RESULTS:
        msg = "Your query has too much results to export: " + str(count)
        msg += " where " + str(
            settings.QUERY_DATA_MAX_RESULTS) + " are allowed. "
        msg += "Please consider filtering your results before exporting."
        return json_response_message('error', msg)

    if user.email == "":
        msg = "Preparing your download for query <br/><b>" + query.title + \
              "</b> failed.<br/>A valid email address is needed for user " \
              "<br/><b>" + user.username + "</b>"
        if settings.DEBUG:
            print >> stderr, msg
        return json_response_message('error', msg)

    try:
        validate_email(user.email)
    except ValidationError:  # raised by Django's validate_email
        msg = "Preparing your download for query <br/><b>" + query.title + \
              "</b> failed.<br/>The email address of user <b>" + \
              user.username + "</b> could not be validated: <b>" + \
              user.email + "</b>"
        if settings.DEBUG:
            print >> stderr, msg
        return json_response_message('error', msg)

    zip_basename = create_zipname(user, query)
    url = urljoin('http://{}'.format(request.get_host()),
                  "/query/download/" + quote_plus(zip_basename))
    email_message = "Texcavator query: " + query.title + "\n" + zip_basename + \
        "\nURL: " + url
    if settings.DEBUG:
        print >> stderr, email_message
        print >> stderr, 'http://{}'.format(request.get_host())

    # zip documents by celery background task
    execute(query, dict(request.REQUEST), zip_basename, user.email,
            email_message)

    msg = "Your export for query <b>" + query.title + \
          "</b> is completed.<br/>An e-mail with a download link has been sent " + \
          "to <b>" + user.email + "</b>."
    return json_response_message('SUCCESS', msg)