def handle(self, *args, **options):
    print 'Emptying table...'
    Term.objects.all().delete()

    for timeframe, dates in TIMEFRAMES.items():
        print 'Retrieving documents for timeframe {}...'.format(timeframe)
        exclude_dist = Distribution.objects.exclude(
            name='Landelijk').values_list('id', flat=True)
        date_range = daterange2dates(dates)

        total_documents = count_search_results(settings.ES_INDEX,
                                               settings.ES_DOCTYPE,
                                               None,
                                               date_range,
                                               exclude_dist,
                                               [], []).get('count')
        print 'Total documents: {}'.format(total_documents)

        sets = document_id_chunks(10000,
                                  settings.ES_INDEX,
                                  settings.ES_DOCTYPE,
                                  None,
                                  date_range,
                                  dist=exclude_dist)

        print 'Counting terms...'
        counter = Counter()
        for n, s in enumerate(sets):
            start_time = time.time()
            counter += termvector_wordcloud(settings.ES_INDEX,
                                            settings.ES_DOCTYPE,
                                            s,
                                            min_length=2,
                                            add_freqs=False)
            print 'Completed set {} in {} seconds...'.format(
                n + 1, time.time() - start_time)

        print 'Calculating IDFs...'
        terms = []
        for term, count in counter.items():
            if count > 1:  # don't add single occurrences
                idf = math.log10(total_documents / float(count))
                terms.append(
                    Term(timeframe=timeframe, word=term, count=count, idf=idf))

        print 'Transferring to database...'
        Term.objects.bulk_create(terms, batch_size=10000)

        print 'Creating RecordDAWG'
        d = dawg.RecordDAWG(
            '<d', zip([t.word for t in terms], [(t.idf,) for t in terms]))
        d.save(os.path.join(settings.PROJECT_PARENT, timeframe + '.dawg'))

# Test code below.
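A minimal sketch of how the per-timeframe .dawg files written above could be read back, assuming only the dawg package's RecordDAWG API and its '<d' (little-endian double) value format; the helper names and the tf-idf weighting are illustrative and not part of the command itself.

import os

import dawg


def load_idf_dawg(timeframe, parent_dir):
    """Load the IDF lookup the command above saved for one timeframe."""
    d = dawg.RecordDAWG('<d')  # same value format that was used when saving
    d.load(os.path.join(parent_dir, timeframe + '.dawg'))
    return d


def tfidf_weight(idf_dawg, word, term_frequency):
    """tf-idf style weight for a word; 0.0 for words without a stored IDF."""
    if word not in idf_dawg:
        return 0.0
    (idf,) = idf_dawg[word][0]  # RecordDAWG values are lists of value tuples
    return term_frequency * idf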
def generate_tv_cloud(search_params, min_length, stopwords, date_range=None,
                      stems=False, idf_timeframe=''):
    """Generates multiple document word clouds using the termvector approach."""
    # Date range is either provided (in case of burst clouds from the
    # timelines) or taken from the query
    dates = date_range or search_params['dates']

    # First, count the search results
    result = count_search_results(settings.ES_INDEX,
                                  settings.ES_DOCTYPE,
                                  search_params['query'],
                                  dates,
                                  search_params['exclude_distributions'],
                                  search_params['exclude_article_types'],
                                  search_params['selected_pillars'])
    doc_count = result.get('count')
    update_task_status(0, doc_count)

    # Then, create the word clouds per chunk
    progress = 0
    wordcloud_counter = Counter()
    for subset in document_id_chunks(settings.QUERY_DATA_CHUNK_SIZE,
                                     settings.ES_INDEX,
                                     settings.ES_DOCTYPE,
                                     search_params['query'],
                                     dates,
                                     search_params['exclude_distributions'],
                                     search_params['exclude_article_types'],
                                     search_params['selected_pillars']):
        wordcloud_counter += termvector_wordcloud(settings.ES_INDEX,
                                                  settings.ES_DOCTYPE,
                                                  subset,
                                                  min_length,
                                                  stems)

        # Update the task status
        progress += len(subset)
        update_task_status(progress, doc_count)

    # Remove non-frequent words from the counter
    for key, count in dropwhile(lambda c: c[1] > math.log10(doc_count),
                                wordcloud_counter.most_common()):
        del wordcloud_counter[key]

    # Remove the stopwords from the counter
    for sw in stopwords:
        del wordcloud_counter[sw]

    # Return a dictionary with the results
    return {
        'result': normalize_cloud(wordcloud_counter, idf_timeframe),
        'status': 'ok',
        'burstcloud': date_range is not None
    }
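The pruning step above relies on most_common() returning (term, count) pairs in descending order of count, so dropwhile() skips the frequent head and yields only the infrequent tail, which is then deleted. A self-contained sketch of the idiom, with made-up numbers:

import math
from collections import Counter
from itertools import dropwhile

doc_count = 1000                      # threshold becomes log10(1000) == 3.0
counter = Counter({'trein': 40, 'krant': 7, 'spoor': 3, 'zeldzaam': 1})

for key, count in dropwhile(lambda c: c[1] > math.log10(doc_count),
                            counter.most_common()):
    del counter[key]                  # Counter.__delitem__ ignores missing keys

print(counter)                        # Counter({'trein': 40, 'krant': 7})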
def generate_tv_cloud(search_params, min_length, stopwords, ids=None,
                      stems=False):
    """Generates multiple document word clouds using the termvector approach."""
    burst = True
    chunk_size = 1000
    progress = 0
    wordcloud_counter = Counter()

    if not ids:
        # Normal (non-timeline) word cloud (based on query)
        burst = False
        result = count_search_results(settings.ES_INDEX,
                                      settings.ES_DOCTYPE,
                                      search_params['query'],
                                      search_params['dates'],
                                      search_params['distributions'],
                                      search_params['article_types'],
                                      search_params['pillars'])
        doc_count = result.get('count')

        info = {'current': 0, 'total': doc_count}
        current_task.update_state(state='PROGRESS', meta=info)

        for subset in document_id_chunks(chunk_size,
                                         settings.ES_INDEX,
                                         settings.ES_DOCTYPE,
                                         search_params['query'],
                                         search_params['dates'],
                                         search_params['distributions'],
                                         search_params['article_types'],
                                         search_params['pillars']):
            result = termvector_wordcloud(settings.ES_INDEX,
                                          settings.ES_DOCTYPE,
                                          subset,
                                          min_length,
                                          stems)
            wordcloud_counter = wordcloud_counter + result

            progress += len(subset)
            info = {'current': progress, 'total': doc_count}
            current_task.update_state(state='PROGRESS', meta=info)
    else:
        # Timeline word cloud (based on a list of document ids)
        for subset in utils.chunks(ids, chunk_size):
            result = termvector_wordcloud(settings.ES_INDEX,
                                          settings.ES_DOCTYPE,
                                          subset,
                                          min_length,
                                          stems)
            wordcloud_counter = wordcloud_counter + result

            progress += len(subset)
            info = {'current': progress, 'total': len(ids)}
            current_task.update_state(state='PROGRESS', meta=info)

    return counter2wordclouddata(wordcloud_counter, burst, stopwords)
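The utils.chunks helper used in the timeline branch is not shown here; a minimal sketch of a compatible chunker, assuming ids is a sliceable sequence such as a list:

def chunks(sequence, chunk_size):
    """Yield successive chunk_size-sized slices from sequence."""
    for start in range(0, len(sequence), chunk_size):
        yield sequence[start:start + chunk_size]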
def generate_tv_cloud(search_params, min_length, stopwords, date_range=None,
                      stems=False):
    """Generates multiple document word clouds using the termvector approach."""
    chunk_size = settings.QUERY_DATA_CHUNK_SIZE
    progress = 0
    wordcloud_counter = Counter()

    # Date range is either provided or taken from the query
    dates = date_range or search_params['dates']

    # First, count the search results
    result = count_search_results(settings.ES_INDEX,
                                  settings.ES_DOCTYPE,
                                  search_params['query'],
                                  dates,
                                  search_params['exclude_distributions'],
                                  search_params['exclude_article_types'],
                                  search_params['selected_pillars'])
    doc_count = result.get('count')

    info = {'current': 0, 'total': doc_count}
    current_task.update_state(state='PROGRESS', meta=info)

    # Then, create the word clouds per chunk
    for subset in document_id_chunks(chunk_size,
                                     settings.ES_INDEX,
                                     settings.ES_DOCTYPE,
                                     search_params['query'],
                                     dates,
                                     search_params['exclude_distributions'],
                                     search_params['exclude_article_types'],
                                     search_params['selected_pillars']):
        result = termvector_wordcloud(settings.ES_INDEX,
                                      settings.ES_DOCTYPE,
                                      subset,
                                      min_length,
                                      stems)
        wordcloud_counter = wordcloud_counter + result

        progress += len(subset)
        info = {'current': progress, 'total': doc_count}
        current_task.update_state(state='PROGRESS', meta=info)

    burst = date_range is not None
    return counter2wordclouddata(wordcloud_counter, burst, stopwords)
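The 'PROGRESS' states reported via current_task.update_state can be polled from the caller's side; a sketch assuming generate_tv_cloud runs inside a Celery task whose id is known (the helper name is an assumption, only AsyncResult, .state and .info are standard Celery API):

from celery.result import AsyncResult


def cloud_progress(task_id):
    """Return (current, total) for a running word-cloud task, or None."""
    result = AsyncResult(task_id)
    if result.state == 'PROGRESS' and isinstance(result.info, dict):
        return result.info.get('current'), result.info.get('total')
    return None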