Example #1
def test_chunks():
    results = [chunk for chunk in utils.chunks([], 100)]
    assert_equals(results, [])

    results = [chunk for chunk in utils.chunks(range(5), 1)]
    assert_equals(results, [[0], [1], [2], [3], [4]])

    results = [chunk for chunk in utils.chunks(range(5), 2)]
    assert_equals(results, [[0, 1], [2, 3], [4]])
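
The assertions above pin down the expected behaviour of utils.chunks: it splits a sequence into consecutive slices of at most the requested size, and yields nothing for an empty input. The source of utils.chunks is not shown on this page, so the following is only a minimal sketch that would satisfy these tests:

def chunks(sequence, chunk_size):
    # Yield consecutive slices of at most chunk_size items.
    # An empty sequence yields nothing, matching the first assertion.
    for start in range(0, len(sequence), chunk_size):
        yield sequence[start:start + chunk_size]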
Example #3
def generate_tv_cloud(search_params,
                      min_length,
                      stopwords,
                      ids=None,
                      stems=False):
    """Generates multiple document word clouds using the termvector approach"""
    burst = True
    chunk_size = 1000
    progress = 0
    wordcloud_counter = Counter()

    if not ids:
        # Normal (non-timeline) word cloud (based on the query)
        burst = False

        result = count_search_results(settings.ES_INDEX, settings.ES_DOCTYPE,
                                      search_params['query'],
                                      search_params['dates'],
                                      search_params['distributions'],
                                      search_params['article_types'],
                                      search_params['pillars'])
        doc_count = result.get('count')

        info = {'current': 0, 'total': doc_count}
        current_task.update_state(state='PROGRESS', meta=info)

        for subset in document_id_chunks(
                chunk_size, settings.ES_INDEX, settings.ES_DOCTYPE,
                search_params['query'], search_params['dates'],
                search_params['distributions'], search_params['article_types'],
                search_params['pillars']):

            result = termvector_wordcloud(settings.ES_INDEX,
                                          settings.ES_DOCTYPE, subset,
                                          min_length, stems)
            wordcloud_counter = wordcloud_counter + result

            progress += len(subset)
            info = {'current': progress, 'total': doc_count}
            current_task.update_state(state='PROGRESS', meta=info)
    else:
        # Timeline word cloud (based on a list of document ids)
        for subset in utils.chunks(ids, chunk_size):
            result = termvector_wordcloud(settings.ES_INDEX,
                                          settings.ES_DOCTYPE, subset,
                                          min_length, stems)
            wordcloud_counter = wordcloud_counter + result

            progress += len(subset)
            info = {'current': progress, 'total': len(ids)}
            current_task.update_state(state='PROGRESS', meta=info)

    return counter2wordclouddata(wordcloud_counter, burst, stopwords)
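
Since generate_tv_cloud reports progress via current_task.update_state, it is presumably executed inside a Celery task. The wiring below is an illustrative sketch (the task name and the call site are assumptions, not taken from the original project) of how those progress updates would reach a caller:

from celery import shared_task

@shared_task
def tv_cloud_task(search_params, min_length, stopwords, ids=None, stems=False):
    # Hypothetical wrapper: running generate_tv_cloud inside a task makes
    # its 'PROGRESS' state updates visible through the result backend.
    return generate_tv_cloud(search_params, min_length, stopwords,
                             ids=ids, stems=stems)

# A caller could then poll the task, e.g.:
#   result = tv_cloud_task.delay(search_params, 2, stopwords)
#   result.state, result.info   # -> 'PROGRESS', {'current': ..., 'total': ...}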
Example #4
    def handle(self, *args, **options):
        query_size = 10000
        es_retrieve = 2500

        if len(args) > 0:
            query_size = int(args[0])

        if DocID.objects.all().count() == 0:
            print 'Document ids must be gathered before query terms can be ' \
                  'extracted. \n Please execute python manage.py gatherdocids'
            sys.exit(1)

        # Empty database
        QueryTerm.objects.all().delete()

        self.stdout.write('Retrieving {} documents...'.format(query_size))

        terms = set()

        # select random documents
        document_set = DocID.objects.order_by('?')[0:query_size]
        doc_ids = [doc.doc_id for doc in document_set]

        for ids in utils.chunks(doc_ids, es_retrieve):
            bdy = {
                'ids': ids,
                'parameters': {
                    'fields': ['article_dc_title'],
                    'term_statistics': False,
                    'field_statistics': False,
                    'offsets': False,
                    'payloads': False,
                    'positions': False
                }
            }

            t_vectors = _es().mtermvectors(index=settings.ES_INDEX,
                                           doc_type=settings.ES_DOCTYPE,
                                           body=bdy)

            for doc in t_vectors.get('docs'):
                for field, data in doc.get('term_vectors').iteritems():
                    for term, details in data.get('terms').iteritems():
                        t = term.encode('ascii', 'replace')
                        if len(t) <= 26:
                            terms.add(QueryTerm(t))

        # save to database
        print 'Saving {} terms to the database.'.format(len(terms))

        QueryTerm.objects.bulk_create(terms)
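
Both this command and the benchmark in Example #6 below walk the multi term vector response in the same way: docs, then term_vectors per field, then terms. For reference, the Elasticsearch mtermvectors response is shaped roughly as follows (the field and term values here are purely illustrative):

t_vectors = {
    'docs': [
        {
            '_id': 'some-document-id',
            'term_vectors': {
                'article_dc_title': {
                    'terms': {
                        'some-term': {'term_freq': 2},
                        'another-term': {'term_freq': 1},
                    },
                },
            },
        },
    ],
}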
Example #5
def generate_tv_cloud(search_params, min_length, stopwords, ids=None, stems=False):
    """Generates multiple document word clouds using the termvector approach"""
    burst = True
    chunk_size = 1000
    progress = 0
    wordcloud_counter = Counter()

    if not ids:
        # Normal (non-timeline) word cloud (based on the query)
        burst = False

        result = count_search_results(settings.ES_INDEX,
                                      settings.ES_DOCTYPE,
                                      search_params['query'],
                                      search_params['dates'],
                                      search_params['distributions'],
                                      search_params['article_types'],
                                      search_params['pillars'])
        doc_count = result.get('count')

        info = {
            'current': 0,
            'total': doc_count
        }
        current_task.update_state(state='PROGRESS', meta=info)

        for subset in document_id_chunks(chunk_size,
                                         settings.ES_INDEX,
                                         settings.ES_DOCTYPE,
                                         search_params['query'],
                                         search_params['dates'],
                                         search_params['distributions'],
                                         search_params['article_types'],
                                         search_params['pillars']):

            result = termvector_wordcloud(settings.ES_INDEX,
                                          settings.ES_DOCTYPE,
                                          subset,
                                          min_length,
                                          stems)
            wordcloud_counter = wordcloud_counter + result

            progress += len(subset)
            info = {
                'current': progress,
                'total': doc_count
            }
            current_task.update_state(state='PROGRESS', meta=info)
    else:
        # Timeline word cloud (based on a list of document ids)
        for subset in utils.chunks(ids, chunk_size):
            result = termvector_wordcloud(settings.ES_INDEX,
                                          settings.ES_DOCTYPE,
                                          subset,
                                          min_length,
                                          stems)
            wordcloud_counter = wordcloud_counter + result

            progress += len(subset)
            info = {
                'current': progress,
                'total': len(ids)
            }
            current_task.update_state(state='PROGRESS', meta=info)

    return counter2wordclouddata(wordcloud_counter, burst, stopwords)
Example #6
    def handle(self, *args, **options):
        query_size = 2500
        n_repetitions = 10
        es_retrieve = 2500

        if len(args) > 0:
            query_size = int(args[0])
        if len(args) > 1:
            n_repetitions = int(args[1])
        if len(args) > 2:
            es_retrieve = int(args[2])

        response_times = []

        for repetition in range(n_repetitions):
            c1 = time.time()
            es_time = []

            wordcloud = Counter()

            # select random documents
            document_set = DocID.objects.order_by('?')[0:query_size]
            doc_ids = [doc.doc_id for doc in document_set]

            for ids in utils.chunks(doc_ids, es_retrieve):

                bdy = {
                    'ids': ids,
                    'parameters': {
                        'fields': ['article_dc_title', 'text_content'],
                        'term_statistics': False,
                        'field_statistics': False,
                        'offsets': False,
                        'payloads': False,
                        'positions': False
                    }
                }

                c3 = time.time()
                t_vectors = _es().mtermvectors(index='kb', doc_type='doc',
                                               body=bdy)
                c4 = time.time()

                es_time.append((c4-c3)*1000)

                for doc in t_vectors.get('docs'):
                    for field, data in doc.get('term_vectors').iteritems():
                        temp = {}
                        for term, details in data.get('terms').iteritems():
                            temp[term] = int(details['term_freq'])
                        wordcloud.update(temp)

            c2 = time.time()

            elapsed_c = (c2-c1)*1000
            response_times.append(elapsed_c)
            self.stdout.write(str(elapsed_c)+' ES: '+str(sum(es_time)))
            self.stdout.flush()

        avg = float(sum(response_times)/len(response_times))
        print 'Average response time for generating word clouds from {num} ' \
              'documents: {avg} milliseconds'.format(num=query_size, avg=avg)