def test_daterange2dates():
    """Partial or empty date-range input falls back to the configured default range."""
    expected = [{'lower': '1850-01-01', 'upper': '1990-12-31'}]
    # The full settings range, a single date, and empty input all normalize
    # to the same default range.
    for raw_range in (settings.TEXCAVATOR_DATE_RANGE, "19901231", ""):
        assert_equals(utils.daterange2dates(raw_range), expected)
def handle(self, *args, **options): query_size = 100000 n_repetitions = 10 if len(args) > 0: query_size = int(args[0]) if len(args) > 1: n_repetitions = int(args[1]) response_times = [] for repetition in range(n_repetitions): # select random documents document_set = DocID.objects.order_by('?')[0:query_size] doc_ids = [doc.doc_id for doc in document_set] dates = daterange2dates(settings.TEXCAVATOR_DATE_RANGE) aggr_resp = multiple_document_word_cloud(settings.ES_INDEX, settings.ES_DOCTYPE, None, dates[0], [], [], doc_ids) response_times.append(int(aggr_resp.get('took'))) self.stdout.write(str(aggr_resp.get('took'))) self.stdout.flush() avg = float(sum(response_times) / len(response_times)) print 'Average response time for aggregating over {num} documents: ' \ '{avg} miliseconds'.format(num=query_size, avg=avg)
def handle(self, *args, **options): query_size = 100000 n_repetitions = 10 if len(args) > 0: query_size = int(args[0]) if len(args) > 1: n_repetitions = int(args[1]) response_times = [] for repetition in range(n_repetitions): # select random documents document_set = DocID.objects.order_by('?')[0:query_size] doc_ids = [doc.doc_id for doc in document_set] dates = daterange2dates(settings.TEXCAVATOR_DATE_RANGE) aggr_resp = multiple_document_word_cloud(settings.ES_INDEX, settings.ES_DOCTYPE, None, dates[0], [], [], doc_ids) response_times.append(int(aggr_resp.get('took'))) self.stdout.write(str(aggr_resp.get('took'))) self.stdout.flush() avg = float(sum(response_times)/len(response_times)) print 'Average response time for aggregating over {num} documents: ' \ '{avg} miliseconds'.format(num=query_size, avg=avg)
def heatmap(request, query_id, year):
    """
    Retrieves heatmap data for the given Query and year.

    Aggregates the number of matching articles per day over a window of
    five years before and after ``year`` and returns {timestamp: count}.
    """
    query = get_object_or_404(Query, pk=query_id)
    params = query.get_query_dict()

    year = int(year)
    # FIX: local was named 'range', shadowing the builtin; renamed.
    date_range = daterange2dates(str(year - 5) + '0101,' +
                                 str(year + 5) + '1231')

    result = metadata_aggregation(settings.ES_INDEX, settings.ES_DOCTYPE,
                                  params['query'], date_range,
                                  params['exclude_distributions'],
                                  params['exclude_article_types'],
                                  params['selected_pillars'],
                                  articles_over_time())

    articles_per_day = {}
    for bucket in result['aggregations']['articles_over_time']['buckets']:
        # Cal-HeatMap requires the date in seconds instead of milliseconds
        date = bucket['key'] / 1000
        articles_per_day[date] = bucket['doc_count']

    return JsonResponse(articles_per_day)
def test_daterange2dates():
    """daterange2dates returns the default project range for any of these inputs."""
    default_range = [{'lower': '1850-01-01', 'upper': '1990-12-31'}]

    # Range taken straight from the settings
    assert_equals(utils.daterange2dates(settings.TEXCAVATOR_DATE_RANGE),
                  default_range)
    # Input single date
    assert_equals(utils.daterange2dates("19901231"), default_range)
    # Empty input
    assert_equals(utils.daterange2dates(""), default_range)
def handle(self, *args, **options):
    """Recount corpus terms per timeframe and store counts, IDFs, and a DAWG.

    For every configured timeframe: counts term occurrences over all matching
    documents, computes an IDF per term, bulk-saves Term rows, and writes a
    RecordDAWG file mapping word -> idf for fast lookup.
    """
    # Wipe previously computed terms before recounting from scratch.
    print 'Emptying table...'
    Term.objects.all().delete()

    for timeframe, dates in TIMEFRAMES.items():
        print 'Retrieving documents for timeframe {}...'.format(timeframe)

        # Exclude all non-national ('Landelijk') distributions from the count.
        exclude_dist = Distribution.objects.exclude(
            name='Landelijk').values_list('id', flat=True)

        date_range = daterange2dates(dates)
        total_documents = count_search_results(settings.ES_INDEX,
                                               settings.ES_DOCTYPE, None,
                                               date_range, exclude_dist, [],
                                               []).get('count')
        print 'Total documents: {}'.format(total_documents)

        # Fetch document ids in chunks of 10000 to bound memory use.
        sets = document_id_chunks(10000, settings.ES_INDEX,
                                  settings.ES_DOCTYPE, None, date_range,
                                  dist=exclude_dist)

        print 'Counting terms...'
        counter = Counter()
        for n, s in enumerate(sets):
            start_time = time.time()
            counter += termvector_wordcloud(settings.ES_INDEX,
                                            settings.ES_DOCTYPE, s,
                                            min_length=2, add_freqs=False)
            print 'Completed set {} in {} seconds...'.format(
                n + 1, time.time() - start_time)

        print 'Calculating IDFs...'
        terms = []
        for term, count in counter.items():
            if count > 1:  # don't add single occurrences
                # Inverse document frequency: log10(N / df).
                idf = math.log10(total_documents / float(count))
                terms.append(
                    Term(timeframe=timeframe, word=term, count=count, idf=idf))

        print 'Transferring to database...'
        Term.objects.bulk_create(terms, batch_size=10000)

        print 'Creating RecordDAWG'
        # Persist (word -> idf) pairs; '<d' packs idf as a big-endian double.
        d = dawg.RecordDAWG(
            '<d', zip([t.word for t in terms], [(t.idf, ) for t in terms]))
        d.save(os.path.join(settings.PROJECT_PARENT, timeframe + '.dawg'))
""" Test code below.
def handle(self, *args, **options):
    """Rebuild per-timeframe term statistics.

    Clears the Term table, then for each timeframe counts terms over every
    matching document, stores Term rows with their IDF, and saves a
    word->idf RecordDAWG next to the project directory.
    """
    # Start from an empty table so old counts cannot linger.
    print 'Emptying table...'
    Term.objects.all().delete()

    for timeframe, dates in TIMEFRAMES.items():
        print 'Retrieving documents for timeframe {}...'.format(timeframe)

        # Only national papers count: exclude everything but 'Landelijk'.
        exclude_dist = Distribution.objects.exclude(name='Landelijk').values_list('id', flat=True)

        date_range = daterange2dates(dates)
        total_documents = count_search_results(settings.ES_INDEX,
                                               settings.ES_DOCTYPE,
                                               None,
                                               date_range,
                                               exclude_dist,
                                               [],
                                               []).get('count')
        print 'Total documents: {}'.format(total_documents)

        # Chunked id retrieval keeps memory bounded for large result sets.
        sets = document_id_chunks(10000,
                                  settings.ES_INDEX,
                                  settings.ES_DOCTYPE,
                                  None,
                                  date_range,
                                  dist=exclude_dist)

        print 'Counting terms...'
        counter = Counter()
        for n, s in enumerate(sets):
            start_time = time.time()
            counter += termvector_wordcloud(settings.ES_INDEX,
                                            settings.ES_DOCTYPE,
                                            s,
                                            min_length=2,
                                            add_freqs=False)
            print 'Completed set {} in {} seconds...'.format(n + 1, time.time() - start_time)

        print 'Calculating IDFs...'
        terms = []
        for term, count in counter.items():
            if count > 1:  # don't add single occurrences
                # idf = log10(total documents / document frequency)
                idf = math.log10(total_documents / float(count))
                terms.append(Term(timeframe=timeframe, word=term, count=count, idf=idf))

        print 'Transferring to database...'
        Term.objects.bulk_create(terms, batch_size=10000)

        print 'Creating RecordDAWG'
        # '<d' = little/big-endian packed double per word; enables fast lookup.
        d = dawg.RecordDAWG('<d', zip([t.word for t in terms], [(t.idf,) for t in terms]))
        d.save(os.path.join(settings.PROJECT_PARENT, timeframe + '.dawg'))
""" Test code below.
def get_search_parameters(req_dict):
    """Extract search parameters from a Django request dictionary.

    Parameters:
        req_dict : dict
            A Django request dictionary

    Returns:
        dict : dict
            A dictionary that contains query metadata
    """
    query_str = req_dict.get('query', None)
    start = int(req_dict.get('startRecord', 1))
    result_size = int(req_dict.get('maximumRecords', 20))

    date_range_str = req_dict.get('dateRange', settings.TEXCAVATOR_DATE_RANGE)
    dates = daterange2dates(date_range_str)

    # A distribution/article type is excluded when its checkbox is unticked
    # (request value "false"); absent keys default to "true" (included).
    distributions = [ds for ds in _KB_DISTRIBUTION_VALUES.keys()
                     if not json.loads(req_dict.get(ds, "true"))]

    article_types = [typ for typ in _KB_ARTICLE_TYPE_VALUES
                     if not json.loads(req_dict.get(typ, "true"))]

    pillars = [int(x) for x in req_dict.getlist('pillars')]

    collection = req_dict.get('collection', settings.ES_INDEX)
    sort_order = req_dict.get('sort_order', '_score')

    return {
        'query': query_str,
        'start': start,
        'result_size': result_size,
        'dates': dates,
        'distributions': distributions,
        'article_types': article_types,
        'pillars': pillars,
        'collection': collection,
        'sort_order': sort_order
    }
def handle(self, *args, **options): if QueryTerm.objects.all().count() == 0: print 'No query terms stored in the database. Please run ' \ 'python manage.py gatherqueryterms\' first.' sys.exit(1) query_size = 10 n_repetitions = 10 if len(args) > 0: query_size = int(args[0]) if len(args) > 1: n_repetitions = int(args[1]) response_times = [] es_wall_clock = [] for repetition in range(n_repetitions): # generate random weigthed query query_terms = QueryTerm.objects.order_by('?')[0:query_size] query_list = [ '{}^{}'.format(t.term, randint(1, 40)) for t in query_terms ] q = ' OR '.join(query_list) t1 = time.time() dates = daterange2dates(settings.TEXCAVATOR_DATE_RANGE) valid_q, result = do_search(settings.ES_INDEX, settings.ES_DOCTYPE, q, 0, 20, dates[0], [], []) t2 = time.time() if not valid_q: print 'Invalid query: {}'.format(q) else: es_wall_clock.append((t2 - t1) * 1000) response_times.append(int(result.get('took'))) self.stdout.write(str(result.get('took'))) self.stdout.flush() avg = float(sum(response_times) / len(response_times)) avg_wall_clock = float(sum(es_wall_clock) / len(es_wall_clock)) print 'Average response time for queries of size {}: {} miliseconds'. \ format(query_size, avg) print 'Average wall clock time for queries of size {}: {} ' \ 'miliseconds'.format(query_size, avg_wall_clock)
def handle(self, *args, **options): if QueryTerm.objects.all().count() == 0: print 'No query terms stored in the database. Please run ' \ 'python manage.py gatherqueryterms\' first.' sys.exit(1) query_size = 10 n_repetitions = 10 if len(args) > 0: query_size = int(args[0]) if len(args) > 1: n_repetitions = int(args[1]) response_times = [] es_wall_clock = [] for repetition in range(n_repetitions): # generate random weigthed query query_terms = QueryTerm.objects.order_by('?')[0:query_size] query_list = ['{}^{}'.format(t.term, randint(1, 40)) for t in query_terms] q = ' OR '.join(query_list) t1 = time.time() dates = daterange2dates(settings.TEXCAVATOR_DATE_RANGE) valid_q, result = do_search(settings.ES_INDEX, settings.ES_DOCTYPE, q, 0, 20, dates[0], [], []) t2 = time.time() if not valid_q: print 'Invalid query: {}'.format(q) else: es_wall_clock.append((t2-t1)*1000) response_times.append(int(result.get('took'))) self.stdout.write(str(result.get('took'))) self.stdout.flush() avg = float(sum(response_times)/len(response_times)) avg_wall_clock = float(sum(es_wall_clock)/len(es_wall_clock)) print 'Average response time for queries of size {}: {} miliseconds'. \ format(query_size, avg) print 'Average wall clock time for queries of size {}: {} ' \ 'miliseconds'.format(query_size, avg_wall_clock)
def index(request):
    """Render main page."""
    from services.es import _KB_DISTRIBUTION_VALUES, _KB_ARTICLE_TYPE_VALUES

    # Reverse mappings let the client translate ES config codes back to
    # human-readable labels.
    config_reverse_mapping = {
        "sd": flip_dict(_KB_DISTRIBUTION_VALUES),
        "st": flip_dict(_KB_ARTICLE_TYPE_VALUES),
    }

    date_limits = daterange2dates(settings.TEXCAVATOR_DATE_RANGE)
    min_date = date_limits[0]["lower"]
    max_date = date_limits[0]["upper"]

    context = {
        "PROJECT_NAME": settings.PROJECT_NAME,
        "PROJECT_MIN_DATE": min_date,
        "PROJECT_MAX_DATE": max_date,
        "QUERY_DATA_DOWNLOAD_ALLOW": settings.QUERY_DATA_DOWNLOAD_ALLOW,
        "ES_INDEX": settings.ES_INDEX,
        "ES_REVERSE_MAPPING": json.dumps(config_reverse_mapping),
        "ILPS_LOGGING": settings.ILPS_LOGGING,
    }

    return render_to_response("index.html", context, RequestContext(request))
def handle(self, *args, **options): print "Emptying table..." DayStatistic.objects.all().delete() dates = daterange2dates(settings.TEXCAVATOR_DATE_RANGE) year_lower = datetime.strptime(dates[0]["lower"], "%Y-%m-%d").date().year year_upper = datetime.strptime(dates[0]["upper"], "%Y-%m-%d").date().year if len(args) > 0: year_lower = int(args[0]) if len(args) > 1: year_upper = int(args[1]) print "Gathering statistics from %s until %s." % (year_lower, year_upper) agg_name = "daystatistic" for year in range(year_lower, year_upper + 1): date_range = {"lower": "{y}-01-01".format(y=year), "upper": "{y}-12-31".format(y=year)} print year results = day_statistics(settings.ES_INDEX, settings.ES_DOCTYPE, date_range, agg_name) if results: # save results to database agg_data = results["aggregations"][agg_name]["buckets"] for date in agg_data: try: d = datetime.strptime(date["key_as_string"], "%Y-%m-%dT00:00:00.000Z").date() DayStatistic.objects.create(date=str(d), count=date["doc_count"]) except DatabaseError, exc: msg = "Database Error: %s" % exc if settings.DEBUG: print msg
def handle(self, *args, **options): print 'Emptying table...' DayStatistic.objects.all().delete() dates = daterange2dates(settings.TEXCAVATOR_DATE_RANGE) year_lower = datetime.strptime(dates[0]['lower'], '%Y-%m-%d').date().year year_upper = datetime.strptime(dates[0]['upper'], '%Y-%m-%d').date().year if len(args) > 0: year_lower = int(args[0]) if len(args) > 1: year_upper = int(args[1]) print 'Gathering statistics from %s until %s.' \ % (year_lower, year_upper) agg_name = 'daystatistic' for year in range(year_lower, year_upper + 1): date_range = { 'lower': '{y}-01-01'.format(y=year), 'upper': '{y}-12-31'.format(y=year) } print year for article_type in ArticleType.objects.all(): for distribution in Distribution.objects.all(): results = day_statistics(settings.ES_INDEX, settings.ES_DOCTYPE, date_range, agg_name, distribution.id, article_type.id) self.save_to_database(agg_name, results, distribution, article_type)
def heatmap(request, query_id, year):
    """
    Retrieves heatmap data for the given Query and year.

    Returns a JSON object mapping day timestamps (seconds) to the number
    of matching articles, over a ten-year window centered on ``year``.
    """
    query = get_object_or_404(Query, pk=query_id)
    params = query.get_query_dict()

    year = int(year)
    # FIX: renamed the local from 'range', which shadowed the builtin.
    date_range = daterange2dates(str(year - 5) + '0101,' +
                                 str(year + 5) + '1231')

    result = metadata_aggregation(settings.ES_INDEX, settings.ES_DOCTYPE,
                                  params['query'], date_range,
                                  params['exclude_distributions'],
                                  params['exclude_article_types'],
                                  params['selected_pillars'],
                                  articles_over_time())

    articles_per_day = {}
    for bucket in result['aggregations']['articles_over_time']['buckets']:
        # Cal-HeatMap requires the date in seconds instead of milliseconds
        date = bucket['key'] / 1000
        articles_per_day[date] = bucket['doc_count']

    return JsonResponse(articles_per_day)
def handle(self, *args, **options): print 'Emptying table...' DayStatistic.objects.all().delete() dates = daterange2dates(settings.TEXCAVATOR_DATE_RANGE) year_lower = datetime.strptime(dates[0]['lower'], '%Y-%m-%d').date().year year_upper = datetime.strptime(dates[0]['upper'], '%Y-%m-%d').date().year if len(args) > 0: year_lower = int(args[0]) if len(args) > 1: year_upper = int(args[1]) print 'Gathering statistics from %s until %s.' \ % (year_lower, year_upper) agg_name = 'daystatistic' for year in range(year_lower, year_upper+1): date_range = { 'lower': '{y}-01-01'.format(y=year), 'upper': '{y}-12-31'.format(y=year) } print year for article_type in ArticleType.objects.all(): for distribution in Distribution.objects.all(): results = day_statistics(settings.ES_INDEX, settings.ES_DOCTYPE, date_range, agg_name, distribution.id, article_type.id) self.save_to_database(agg_name, results, distribution, article_type)
def tv_cloud(request):
    """Generate termvector word cloud using the termvector approach.

    Returns word cloud data for a single document word cloud (based on a
    single document id) and multiple document word clouds (either based on
    a list of document ids (i.e., timeline burst cloud) or a query with
    metadata). For multiple document word clouds, a celery task generates
    the cloud data.
    """
    if settings.DEBUG:
        print >> stderr, "termvector cloud()"

    logger.info('services/cloud/ - termvector word cloud')
    logger.info('services/cloud/ - user: {}'.format(request.user.username))

    # Retrieve the cloud settings
    query_id = request.GET.get('queryID')
    min_length = int(request.GET.get('min_length', 2))
    use_stopwords = request.GET.get('stopwords') == "1"
    use_default_stopwords = request.GET.get('stopwords_default') == "1"
    stems = request.GET.get('stems') == "1"

    # Retrieve the stopwords: user-level, query-level, and global defaults
    # are combined into one exclusion list.
    stopwords = []
    if use_stopwords:
        stopwords_user = list(
            StopWord.objects.filter(user=request.user).filter(
                query=None).values_list('word', flat=True))
        stopwords_query = []
        if query_id:
            stopwords_query = list(
                StopWord.objects.filter(user=request.user).filter(
                    query__id=query_id).values_list('word', flat=True))
        stopwords_default = []
        if use_default_stopwords:
            # user=None marks the system-wide default stopword set.
            stopwords_default = list(
                StopWord.objects.filter(user=None).filter(
                    query=None).values_list('word', flat=True))
        stopwords = stopwords_user + stopwords_query + stopwords_default

    record_id = request.GET.get('record_id')
    logger.info('services/cloud/ - record_id: {}'.format(record_id))

    idf_timeframe = request.GET.get('idf_timeframe')

    if record_id:
        # Cloud for a single document: computed synchronously.
        t_vector = single_document_word_cloud(settings.ES_INDEX,
                                              settings.ES_DOCTYPE,
                                              record_id, min_length,
                                              stopwords, stems)
        normalized = normalize_cloud(t_vector['result'], idf_timeframe)

        return json_response_message('ok', 'Word cloud generated',
                                     {'result': normalized})
    else:
        # Cloud for a query: handed off to a Celery task, only the task id
        # is returned to the client.
        logger.info('services/cloud/ - multiple document word cloud')

        query = get_object_or_404(Query, pk=query_id)
        params = query.get_query_dict()

        # If we're creating a timeline cloud, set the min/max dates
        date_range = None
        if request.GET.get('is_timeline'):
            date_range = daterange2dates(request.GET.get('date_range'))

        task = generate_tv_cloud.delay(params, min_length, stopwords,
                                       date_range, stems, idf_timeframe)

        logger.info('services/cloud/ - Celery task id: {}'.format(task.id))

        return json_response_message('ok', '', {'task': task.id})
def tv_cloud(request):
    """Generate termvector word cloud using the termvector approach.

    Returns word cloud data for a single document word cloud (based on a
    single document id) and multiple document word clouds (either based on
    a list of document ids (i.e., timeline burst cloud) or a query with
    metadata). For multiple document word clouds, a celery task generates
    the cloud data.
    """
    if settings.DEBUG:
        print >> stderr, "termvector cloud()"

    logger.info('services/cloud/ - termvector word cloud')
    logger.info('services/cloud/ - user: {}'.format(request.user.username))

    # Retrieve the cloud settings
    query_id = request.GET.get('queryID')
    min_length = int(request.GET.get('min_length', 2))
    use_stopwords = request.GET.get('stopwords') == "1"
    use_default_stopwords = request.GET.get('stopwords_default') == "1"
    stems = request.GET.get('stems') == "1"

    # Retrieve the stopwords
    stopwords = []
    if use_stopwords:
        # Stopwords the user defined for all of his/her queries.
        stopwords_user = list(StopWord.objects
                              .filter(user=request.user)
                              .filter(query=None)
                              .values_list('word', flat=True))
        # Stopwords bound to this specific query, if any.
        stopwords_query = []
        if query_id:
            stopwords_query = list(StopWord.objects
                                   .filter(user=request.user)
                                   .filter(query__id=query_id)
                                   .values_list('word', flat=True))
        # System-wide defaults (rows with user=None, query=None).
        stopwords_default = []
        if use_default_stopwords:
            stopwords_default = list(StopWord.objects
                                     .filter(user=None)
                                     .filter(query=None)
                                     .values_list('word', flat=True))
        stopwords = stopwords_user + stopwords_query + stopwords_default

    record_id = request.GET.get('record_id')
    logger.info('services/cloud/ - record_id: {}'.format(record_id))

    idf_timeframe = request.GET.get('idf_timeframe')

    if record_id:
        # Cloud for a single document — synchronous path.
        t_vector = single_document_word_cloud(settings.ES_INDEX,
                                              settings.ES_DOCTYPE,
                                              record_id,
                                              min_length,
                                              stopwords,
                                              stems)
        normalized = normalize_cloud(t_vector['result'], idf_timeframe)
        return json_response_message('ok', 'Word cloud generated',
                                     {'result': normalized})
    else:
        # Cloud for a query — delegated to a Celery worker; the client
        # polls using the returned task id.
        logger.info('services/cloud/ - multiple document word cloud')

        query = get_object_or_404(Query, pk=query_id)
        params = query.get_query_dict()

        # If we're creating a timeline cloud, set the min/max dates
        date_range = None
        if request.GET.get('is_timeline'):
            date_range = daterange2dates(request.GET.get('date_range'))

        task = generate_tv_cloud.delay(params, min_length, stopwords,
                                       date_range, stems, idf_timeframe)

        logger.info('services/cloud/ - Celery task id: {}'.format(task.id))

        return json_response_message('ok', '', {'task': task.id})