def get_funders_for_datasets(datasets):
    """Annotate each dataset dict in *datasets* with its list of funders.

    For every dataset, runs a zero-hit search filtered to that dataset's
    source file and aggregates on ``fundingOrganization.id_and_name``;
    the JSON-encoded bucket keys are decoded and stored in-place on
    ``dataset['funders']``. Returns None (mutates *datasets*).
    """
    es = get_es()
    for dataset in datasets:
        # Each dataset's grants are indexed with filename "<identifier>.json".
        source_filename = dataset['identifier'] + '.json'
        body = {
            "query": {
                "bool": {
                    "filter": [{"term": {"filename": source_filename}}],
                },
            },
            "aggs": {
                "funders": {
                    "terms": {
                        "field": "fundingOrganization.id_and_name",
                        "size": 10,
                    },
                },
            },
        }
        # size=0: we only need the aggregation, not the hits themselves.
        response = es.search(body=body, index=settings.ES_INDEX, size=0)
        buckets = response['aggregations']['funders']['buckets']
        # Bucket keys are JSON strings (id-and-name pairs); decode them.
        dataset['funders'] = [json.loads(bucket['key']) for bucket in buckets]
def get_results(json_query, size=10, from_=0):
    """Run *json_query* against the grants index and return the raw ES response.

    Args:
        json_query: Elasticsearch query body (dict). May carry an
            application-only ``extra_context`` key, which is not valid
            Elasticsearch DSL and must not be sent to the server.
        size: maximum number of hits to return.
        from_: pagination offset.

    Returns:
        The raw ``es.search`` response dict.

    The caller's *json_query* dict is left unchanged: ``extra_context``
    is popped before the search and restored afterwards. Fix: restore it
    in a ``finally`` block so the caller's dict is not silently mutated
    when ``es.search`` raises (the original only restored it on success).
    """
    es = get_es()
    extra_context = json_query.pop('extra_context', None)
    try:
        return es.search(body=json_query, size=size, from_=from_,
                         index=settings.ES_INDEX)
    finally:
        if extra_context is not None:
            json_query['extra_context'] = extra_context
def grants_json_generator(query):
    """Yield chunks of one JSON document containing every grant matching *query*.

    Streams hits via ``scan`` so the full result set is never held in
    memory. Each grant's ``_source`` is augmented with a ``dataset`` key
    holding its provenance record (empty dict when unknown). The yielded
    chunks concatenate to a valid JSON object: a license notice followed
    by a ``grants`` array.
    """
    # Opening chunk: license text plus the start of the grants array.
    yield '''{ "license": "See dataset/license within each grant. This file also contains OS data © Crown copyright and database right 2016, Royal Mail data © Royal Mail copyright and Database right 2016, National Statistics data © Crown copyright and database right 2016, see http://grantnav.org/datasets/ for more information.", "grants": [\n'''
    es = get_es()
    separator = ""
    for hit in scan(es, query, index=settings.ES_INDEX):
        source = hit["_source"]
        identifier = provenance.identifier_from_filename(source['filename'])
        source["dataset"] = provenance.by_identifier.get(identifier, {})
        # No separator before the first element; ", " before every later one.
        yield separator + json.dumps(source) + "\n"
        separator = ", "
    yield ']}'
def grants_csv_generator(query):
    """Yield CSV rows (header row first) for every grant matching *query*.

    Each subsequent row is a list of values extracted from the grant's
    ``_source`` (plus its dataset provenance) by the column paths defined
    in ``csv_layout.grant_csv_paths``. Streams via ``scan`` to avoid
    materialising the full result set.
    """
    yield csv_layout.grant_csv_titles
    es = get_es()
    for hit in scan(es, query, index=settings.ES_INDEX):
        identifier = provenance.identifier_from_filename(hit['_source']['filename'])
        # Wrap the hit with its provenance so column paths can reach both.
        augmented = {
            "result": hit["_source"],
            "dataset": provenance.by_identifier.get(identifier, {}),
        }
        yield [get_data_from_path(path, augmented)
               for path in csv_layout.grant_csv_paths]
def get_funders_for_datasets(datasets):
    """Populate ``dataset['funders']`` for each dataset dict in *datasets*.

    Runs a filtered terms aggregation (top 10) on
    ``fundingOrganization.id_and_name`` per dataset and stores the
    JSON-decoded bucket keys in place. Returns None.

    NOTE(review): this appears to be an exact duplicate of an earlier
    ``get_funders_for_datasets`` definition in this file; the later
    definition wins at import time. Consider removing one copy.
    """
    es = get_es()
    for dataset in datasets:
        filename_filter = {"term": {"filename": dataset['identifier'] + '.json'}}
        funders_agg = {
            "terms": {"field": "fundingOrganization.id_and_name", "size": 10},
        }
        body = {
            "query": {"bool": {"filter": [filename_filter]}},
            "aggs": {"funders": funders_agg},
        }
        # Aggregation-only search; hits are suppressed with size=0.
        response = es.search(body=body, index=settings.ES_INDEX, size=0)
        dataset['funders'] = [
            json.loads(bucket['key'])
            for bucket in response['aggregations']['funders']['buckets']
        ]
def stats(request):
    """Render per-field coverage statistics for grants matching a text query.

    For every field found in the index mapping, requests three
    aggregations — top 5 terms, missing-count, and cardinality — then
    derives, per field: its top terms, how many matching documents carry
    it ("found"), its distinct-value count, and whether it appears in the
    grant schema. Fields are shown most-populated first.
    """
    # Empty/missing text_query means "match everything".
    text_query = request.GET.get('text_query') or '*'
    context = {'text_query': text_query}

    es = get_es()
    mapping = es.indices.get_mapping(index=settings.ES_INDEX)
    all_fields = list(flatten_mapping(
        mapping[settings.ES_INDEX]['mappings']['grant']['properties']))

    query = {
        "query": {"bool": {"must": {"query_string": {"query": text_query}},
                           "filter": {}}},
        "aggs": {},
    }

    schema_fields = set(flatten_schema(jsonref.load_uri(settings.GRANT_SCHEMA)))

    # Three aggregations per field, keyed "<field>:<agg_type>" so the
    # response can be split back apart below.
    for field in all_fields:
        query["aggs"][field + ":terms"] = {"terms": {"field": field, "size": 5}}
        query["aggs"][field + ":missing"] = {"missing": {"field": field}}
        query["aggs"][field + ":cardinality"] = {"cardinality": {"field": field}}

    # Don't show the synthetic "*" back to the user in the search box.
    if context['text_query'] == '*':
        context['text_query'] = ''

    field_info = collections.defaultdict(dict)
    results = es.search(body=query, index=settings.ES_INDEX, size=0)
    total_hits = results['hits']['total']
    for agg_name, aggregation in results['aggregations'].items():
        field_name, agg_type = agg_name.split(':')
        info = field_info[field_name]
        info["in_schema"] = field_name in schema_fields
        if agg_type == 'terms':
            info["terms"] = aggregation["buckets"]
        elif agg_type == 'missing':
            # "found" = documents where the field is present.
            info["found"] = total_hits - aggregation["doc_count"]
        elif agg_type == 'cardinality':
            info["distinct"] = aggregation["value"]

    # Most-populated fields first.
    context['field_info'] = sorted(field_info.items(),
                                   key=lambda item: -item[1]["found"])
    context['results'] = results
    return render(request, "stats.html", context=context)