def count_results(query):
    """Return the number of search results matching a stored Query."""
    # All filter parameters live in the query's own dict representation.
    p = query.get_query_dict()
    return count_search_results(settings.ES_INDEX,
                                settings.ES_DOCTYPE,
                                p['query'],
                                p['dates'],
                                p['exclude_distributions'],
                                p['exclude_article_types'],
                                p['selected_pillars']).get('count')
def handle(self, *args, **options):
    """Rebuild the Term table and the per-timeframe IDF DAWG files.

    For every configured timeframe: count the matching documents, gather
    term counts chunk by chunk from Elasticsearch term vectors, compute an
    IDF per term, bulk-insert the terms into the database and serialize
    (word, idf) pairs to a RecordDAWG file on disk.
    """
    print 'Emptying table...'
    # Full rebuild: all previously stored terms are dropped first.
    Term.objects.all().delete()
    for timeframe, dates in TIMEFRAMES.items():
        print 'Retrieving documents for timeframe {}...'.format(timeframe)
        # Exclude all distributions except 'Landelijk' (presumably the
        # Dutch national newspapers — confirm against the Distribution data).
        exclude_dist = Distribution.objects.exclude(
            name='Landelijk').values_list('id', flat=True)
        date_range = daterange2dates(dates)
        # Total document count is the numerator of the IDF computed below.
        total_documents = count_search_results(settings.ES_INDEX,
                                               settings.ES_DOCTYPE,
                                               None,
                                               date_range,
                                               exclude_dist,
                                               [],
                                               []).get('count')
        print 'Total documents: {}'.format(total_documents)
        # Document ids are processed in chunks of 10000 to bound memory use.
        sets = document_id_chunks(10000,
                                  settings.ES_INDEX,
                                  settings.ES_DOCTYPE,
                                  None,
                                  date_range,
                                  dist=exclude_dist)
        print 'Counting terms...'
        counter = Counter()
        for n, s in enumerate(sets):
            start_time = time.time()
            # add_freqs=False: presumably each term is counted at most once
            # per document, so `counter` holds document frequencies —
            # confirm against termvector_wordcloud.
            counter += termvector_wordcloud(settings.ES_INDEX,
                                            settings.ES_DOCTYPE,
                                            s,
                                            min_length=2,
                                            add_freqs=False)
            print 'Completed set {} in {} seconds...'.format(
                n + 1, time.time() - start_time)
        print 'Calculating IDFs...'
        terms = []
        for term, count in counter.items():
            if count > 1:  # don't add single occurrences
                # Standard inverse document frequency: log10(N / df).
                idf = math.log10(total_documents / float(count))
                terms.append(
                    Term(timeframe=timeframe, word=term, count=count,
                         idf=idf))
        print 'Transferring to database...'
        Term.objects.bulk_create(terms, batch_size=10000)
        print 'Creating RecordDAWG'
        # '<d': each DAWG record is one little-endian double (the term IDF).
        d = dawg.RecordDAWG(
            '<d', zip([t.word for t in terms], [(t.idf, ) for t in terms]))
        d.save(os.path.join(settings.PROJECT_PARENT, timeframe + '.dawg'))
# NOTE(review): dangling docstring opener below — the commented-out test
# code it introduces continues beyond this chunk of the file.
""" Test code below.
def generate_tv_cloud(search_params, min_length, stopwords, date_range=None,
                      stems=False, idf_timeframe=''):
    """Generate a multi-document word cloud using the termvector approach.

    An explicit ``date_range`` (burst clouds from the timelines) overrides
    the date range stored with the query. Progress is reported through
    ``update_task_status`` as each chunk of documents is processed.
    """
    dates = date_range if date_range else search_params['dates']

    # Count the matching documents first so progress can be reported.
    doc_count = count_search_results(
        settings.ES_INDEX,
        settings.ES_DOCTYPE,
        search_params['query'],
        dates,
        search_params['exclude_distributions'],
        search_params['exclude_article_types'],
        search_params['selected_pillars']).get('count')
    update_task_status(0, doc_count)

    # Accumulate term counts chunk by chunk.
    seen = 0
    cloud = Counter()
    chunks = document_id_chunks(settings.QUERY_DATA_CHUNK_SIZE,
                                settings.ES_INDEX,
                                settings.ES_DOCTYPE,
                                search_params['query'],
                                dates,
                                search_params['exclude_distributions'],
                                search_params['exclude_article_types'],
                                search_params['selected_pillars'])
    for chunk in chunks:
        cloud += termvector_wordcloud(settings.ES_INDEX,
                                      settings.ES_DOCTYPE,
                                      chunk, min_length, stems)
        seen += len(chunk)
        update_task_status(seen, doc_count)

    # Drop infrequent words: everything after the counts fall to
    # log10(doc_count) or below. The threshold stays inside the lambda so
    # it is never evaluated for an empty counter.
    for word, _ in dropwhile(lambda item: item[1] > math.log10(doc_count),
                             cloud.most_common()):
        del cloud[word]

    # Counter deletion ignores missing keys, so absent stopwords are fine.
    for stopword in stopwords:
        del cloud[stopword]

    return {
        'result': normalize_cloud(cloud, idf_timeframe),
        'status': 'ok',
        'burstcloud': date_range is not None
    }
def generate_tv_cloud(search_params, min_length, stopwords, ids=None,
                      stems=False):
    """Generates multiple document word clouds using the termvector approach"""
    chunk_size = 1000
    done = 0
    cloud = Counter()

    if ids:
        # Time line word cloud: the document ids are supplied directly.
        for piece in utils.chunks(ids, chunk_size):
            cloud = cloud + termvector_wordcloud(
                settings.ES_INDEX, settings.ES_DOCTYPE, piece, min_length,
                stems)
            done += len(piece)
            current_task.update_state(
                state='PROGRESS', meta={'current': done, 'total': len(ids)})
        return counter2wordclouddata(cloud, True, stopwords)

    # Normal word cloud: documents are selected by the stored query.
    doc_count = count_search_results(
        settings.ES_INDEX, settings.ES_DOCTYPE, search_params['query'],
        search_params['dates'], search_params['distributions'],
        search_params['article_types'],
        search_params['pillars']).get('count')
    current_task.update_state(state='PROGRESS',
                              meta={'current': 0, 'total': doc_count})
    for piece in document_id_chunks(
            chunk_size, settings.ES_INDEX, settings.ES_DOCTYPE,
            search_params['query'], search_params['dates'],
            search_params['distributions'], search_params['article_types'],
            search_params['pillars']):
        cloud = cloud + termvector_wordcloud(
            settings.ES_INDEX, settings.ES_DOCTYPE, piece, min_length, stems)
        done += len(piece)
        current_task.update_state(
            state='PROGRESS', meta={'current': done, 'total': doc_count})
    return counter2wordclouddata(cloud, False, stopwords)
def generate_tv_cloud(search_params, min_length, stopwords, date_range=None,
                      stems=False):
    """Generates multiple document word clouds using the termvector approach"""
    # An explicit date range (burst cloud) overrides the query's own dates.
    dates = date_range or search_params['dates']

    # Count the matching documents up front for progress reporting.
    doc_count = count_search_results(
        settings.ES_INDEX, settings.ES_DOCTYPE, search_params['query'],
        dates, search_params['exclude_distributions'],
        search_params['exclude_article_types'],
        search_params['selected_pillars']).get('count')
    current_task.update_state(state='PROGRESS',
                              meta={'current': 0, 'total': doc_count})

    # Accumulate term counts one id-chunk at a time.
    cloud = Counter()
    handled = 0
    for ids in document_id_chunks(settings.QUERY_DATA_CHUNK_SIZE,
                                  settings.ES_INDEX, settings.ES_DOCTYPE,
                                  search_params['query'], dates,
                                  search_params['exclude_distributions'],
                                  search_params['exclude_article_types'],
                                  search_params['selected_pillars']):
        cloud = cloud + termvector_wordcloud(settings.ES_INDEX,
                                             settings.ES_DOCTYPE, ids,
                                             min_length, stems)
        handled += len(ids)
        current_task.update_state(state='PROGRESS',
                                  meta={'current': handled,
                                        'total': doc_count})

    # A provided date_range marks this as a burst cloud.
    return counter2wordclouddata(cloud, date_range is not None, stopwords)
def handle(self, *args, **options):
    """Rebuild the Term table and the per-timeframe IDF DAWG files.

    For every configured timeframe this counts the matching documents,
    gathers term counts chunk by chunk from Elasticsearch term vectors,
    computes an IDF per term, bulk-inserts the terms and serializes
    (word, idf) pairs to a RecordDAWG file on disk.
    """
    print 'Emptying table...'
    # Full rebuild: all previously stored terms are dropped first.
    Term.objects.all().delete()
    for timeframe, dates in TIMEFRAMES.items():
        print 'Retrieving documents for timeframe {}...'.format(timeframe)
        # Exclude every distribution except 'Landelijk' (presumably the
        # national distribution — confirm against the Distribution data).
        exclude_dist = Distribution.objects.exclude(
            name='Landelijk').values_list('id', flat=True)
        date_range = daterange2dates(dates)
        # Total document count is the numerator of the IDF computed below.
        total_documents = count_search_results(
            settings.ES_INDEX, settings.ES_DOCTYPE, None, date_range,
            exclude_dist, [], []).get('count')
        print 'Total documents: {}'.format(total_documents)
        # Process document ids in chunks of 10000 to bound memory use.
        sets = document_id_chunks(10000, settings.ES_INDEX,
                                  settings.ES_DOCTYPE, None, date_range,
                                  dist=exclude_dist)
        print 'Counting terms...'
        counter = Counter()
        for n, s in enumerate(sets):
            start_time = time.time()
            # add_freqs=False: presumably each term counts at most once per
            # document (document frequency) — confirm against
            # termvector_wordcloud.
            counter += termvector_wordcloud(settings.ES_INDEX,
                                            settings.ES_DOCTYPE, s,
                                            min_length=2, add_freqs=False)
            print 'Completed set {} in {} seconds...'.format(
                n + 1, time.time() - start_time)
        print 'Calculating IDFs...'
        terms = []
        for term, count in counter.items():
            if count > 1:  # don't add single occurrences
                # Standard inverse document frequency: log10(N / df).
                idf = math.log10(total_documents / float(count))
                terms.append(Term(timeframe=timeframe, word=term,
                                  count=count, idf=idf))
        print 'Transferring to database...'
        Term.objects.bulk_create(terms, batch_size=10000)
        print 'Creating RecordDAWG'
        # '<d': each DAWG record is one little-endian double (the term IDF).
        d = dawg.RecordDAWG('<d', zip([t.word for t in terms],
                                      [(t.idf,) for t in terms]))
        d.save(os.path.join(settings.PROJECT_PARENT, timeframe + '.dawg'))
# NOTE(review): dangling docstring opener below — the commented-out test
# code it introduces continues beyond this chunk of the file.
""" Test code below.
def download_prepare(request):
    """Prepares the ocr+meta-data zipfile for download.

    Checks that the query's result count is within the export limit and
    that the requesting user has a valid email address, then schedules a
    background task that zips the documents and mails a download link.

    Returns a JSON response message: 'error' on a failed check, 'SUCCESS'
    once the export task has been scheduled.
    """
    if settings.DEBUG:
        print >> stderr, "download_prepare()"
        print >> stderr, request.REQUEST

    logger.info('query/download/prepare - user: {}'.
                format(request.user.username))

    user = request.user
    query = Query.objects.get(title=request.GET.get('query_title'), user=user)
    params = query.get_query_dict()

    result = count_search_results(settings.ES_INDEX,
                                  settings.ES_DOCTYPE,
                                  params['query'],
                                  params['dates'],
                                  params['exclude_distributions'],
                                  params['exclude_article_types'],
                                  params['selected_pillars'])
    count = result.get('count')

    if count > settings.QUERY_DATA_MAX_RESULTS:
        # Grammar fix in the user-facing message: "too much" -> "too many".
        msg = "Your query has too many results to export: " + str(count)
        msg += " where " + str(settings.QUERY_DATA_MAX_RESULTS) + \
            " are allowed. "
        msg += "Please consider filtering your results before exporting."
        return json_response_message('error', msg)

    if user.email == "":
        msg = "Preparing your download for query <br/><b>" + query.title + \
            "</b> failed.<br/>A valid email address is needed for user " \
            "<br/><b>" + user.username + "</b>"
        if settings.DEBUG:
            print >> stderr, msg
        return json_response_message('error', msg)

    try:
        validate_email(user.email)
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt. validate_email presumably raises a
        # ValidationError here, which Exception still covers.
        msg = "Preparing your download for query <br/><b>" + query.title + \
            "</b> failed.<br/>The email address of user <b>" + \
            user.username + "</b> could not be validated: <b>" + \
            user.email + "</b>"
        if settings.DEBUG:
            print >> stderr, msg
        return json_response_message('error', msg)

    zip_basename = create_zipname(user, query)

    url = urljoin('http://{}'.format(request.get_host()),
                  "/query/download/" + quote_plus(zip_basename))

    email_message = "Texcavator query: " + query.title + "\n" + \
        zip_basename + "\nURL: " + url

    if settings.DEBUG:
        print >> stderr, email_message
        print >> stderr, 'http://{}'.format(request.get_host())

    # zip documents by celery background task
    execute(query, dict(request.REQUEST), zip_basename, user.email,
            email_message)

    # NOTE(review): the task has only been scheduled at this point, yet the
    # message says "is completed" — consider rewording; kept as-is here.
    msg = "Your export for query <b>" + query.title + \
        "</b> is completed.<br/>An e-mail with a download link has been sent " + \
        "to <b>" + user.email + "</b>."
    return json_response_message('SUCCESS', msg)
def generate_tv_cloud(search_params, min_length, stopwords, ids=None,
                      stems=False):
    """Generates multiple document word clouds using the termvector approach"""
    chunk_size = 1000
    cloud = Counter()
    progress = 0

    def report(current, total):
        # Push task progress to the celery result backend.
        current_task.update_state(state='PROGRESS',
                                  meta={'current': current, 'total': total})

    if not ids:
        # Normal (non-time line) word cloud, selected by the stored query.
        total = count_search_results(
            settings.ES_INDEX, settings.ES_DOCTYPE, search_params['query'],
            search_params['dates'], search_params['distributions'],
            search_params['article_types'],
            search_params['pillars']).get('count')
        report(0, total)
        for batch in document_id_chunks(
                chunk_size, settings.ES_INDEX, settings.ES_DOCTYPE,
                search_params['query'], search_params['dates'],
                search_params['distributions'],
                search_params['article_types'], search_params['pillars']):
            cloud = cloud + termvector_wordcloud(settings.ES_INDEX,
                                                 settings.ES_DOCTYPE, batch,
                                                 min_length, stems)
            progress += len(batch)
            report(progress, total)
        return counter2wordclouddata(cloud, False, stopwords)

    # Time line word cloud, based on an explicit list of document ids.
    for batch in utils.chunks(ids, chunk_size):
        cloud = cloud + termvector_wordcloud(settings.ES_INDEX,
                                             settings.ES_DOCTYPE, batch,
                                             min_length, stems)
        progress += len(batch)
        report(progress, len(ids))
    return counter2wordclouddata(cloud, True, stopwords)
def download_prepare(request):
    """Prepares the ocr+meta-data zipfile for download.

    Checks that the query's result count is within the export limit and
    that the requesting user has a valid email address, then schedules a
    background task that zips the documents and mails a download link.

    Returns a JSON response message: 'error' on a failed check, 'SUCCESS'
    once the export task has been scheduled.
    """
    if settings.DEBUG:
        print >> stderr, "download_prepare()"
        print >> stderr, request.REQUEST

    logger.info('query/download/prepare - user: {}'.format(
        request.user.username))

    user = request.user
    query = Query.objects.get(title=request.GET.get('query_title'), user=user)
    params = query.get_query_dict()

    result = count_search_results(settings.ES_INDEX,
                                  settings.ES_DOCTYPE,
                                  params['query'],
                                  params['dates'],
                                  params['exclude_distributions'],
                                  params['exclude_article_types'],
                                  params['selected_pillars'])
    count = result.get('count')

    if count > settings.QUERY_DATA_MAX_RESULTS:
        # Grammar fix in the user-facing message: "too much" -> "too many".
        msg = "Your query has too many results to export: " + str(count)
        msg += " where " + str(
            settings.QUERY_DATA_MAX_RESULTS) + " are allowed. "
        msg += "Please consider filtering your results before exporting."
        return json_response_message('error', msg)

    if user.email == "":
        msg = "Preparing your download for query <br/><b>" + query.title + \
            "</b> failed.<br/>A valid email address is needed for user " \
            "<br/><b>" + user.username + "</b>"
        if settings.DEBUG:
            print >> stderr, msg
        return json_response_message('error', msg)

    try:
        validate_email(user.email)
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt. validate_email presumably raises a
        # ValidationError here, which Exception still covers.
        msg = "Preparing your download for query <br/><b>" + query.title + \
            "</b> failed.<br/>The email address of user <b>" + \
            user.username + "</b> could not be validated: <b>" + \
            user.email + "</b>"
        if settings.DEBUG:
            print >> stderr, msg
        return json_response_message('error', msg)

    zip_basename = create_zipname(user, query)

    url = urljoin('http://{}'.format(request.get_host()),
                  "/query/download/" + quote_plus(zip_basename))

    email_message = "Texcavator query: " + query.title + "\n" + \
        zip_basename + "\nURL: " + url

    if settings.DEBUG:
        print >> stderr, email_message
        print >> stderr, 'http://{}'.format(request.get_host())

    # zip documents by celery background task
    execute(query, dict(request.REQUEST), zip_basename, user.email,
            email_message)

    # NOTE(review): the task has only been scheduled at this point, yet the
    # message says "is completed" — consider rewording; kept as-is here.
    msg = "Your export for query <b>" + query.title + \
        "</b> is completed.<br/>An e-mail with a download link has been sent " + \
        "to <b>" + user.email + "</b>."
    return json_response_message('SUCCESS', msg)