def last_name_count(vk_elastic_db: es_client.VkDataDatabaseClient,
                    size=10,
                    is_need_other=False,
                    is_need_print=False,
                    is_need_plot=True,
                    is_need_active=False,
                    days_delta=20):
    """Rank the most frequent last names separately for every ``sex`` bucket.

    A ``terms`` aggregation on ``sex`` (documents lacking the field are
    counted under ``"-1"``) wraps a ``terms`` aggregation on
    ``last_name.keyword``. One ranking is then printed and/or plotted per
    sex bucket.

    Args:
        vk_elastic_db: client wrapper handed to ``get_elastic_object``.
        size: number of last-name buckets requested inside each sex bucket.
        is_need_other: append an ``"other"`` entry carrying the nested
            aggregation's ``sum_other_doc_count``.
        is_need_print: print every ranking to stdout.
        is_need_plot: save every ranking as a horizontal bar chart (PNG)
            under ``save_path``.
        is_need_active: narrow the search with ``get_active_users_filter``.
        days_delta: forwarded to ``get_active_users_filter``.
    """
    sex_aggs_name = "sex_aggs"
    name_aggs_name = "last_name_count"
    sex_labels = {"0": "unknown", "1": "woman", "2": "man", "-1": "missing"}

    title = "last name count"
    if is_need_active:
        title += " active"

    es = get_elastic_object(vk_elastic_db)
    search = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        search = get_active_users_filter(es, index, search,
                                         days_delta=days_delta)

    by_sex = elasticsearch_dsl.A('terms', field="sex", missing="-1", size=2)
    by_last_name = elasticsearch_dsl.A('terms',
                                       field="last_name.keyword",
                                       size=size)
    search.aggs.bucket(sex_aggs_name, by_sex).bucket(name_aggs_name,
                                                     by_last_name)
    response = search.execute()

    # Phase 1: gather (names, counts) per human-readable sex label.
    rankings = {}
    for sex_bucket in response.aggregations[sex_aggs_name].buckets:
        name_agg = sex_bucket[name_aggs_name]
        names = [bucket.key for bucket in name_agg.buckets]
        counts = [bucket.doc_count for bucket in name_agg.buckets]
        if is_need_other:
            names.append("other")
            counts.append(name_agg.sum_other_doc_count)
        rankings[sex_labels[str(sex_bucket.key)]] = (names, counts)

    # Phase 2: emit every ranking.
    for sex, (names, counts) in rankings.items():
        cur_title = f"{title}\n{sex}"
        figname = f"{title.replace(' ', '_')}_{sex}"
        if is_need_print:
            print(cur_title)
            for rank, (name, count) in enumerate(zip(names, counts), start=1):
                print(f"{rank}\t{name} {count}")
        if is_need_plot:
            fig, ax = plt.subplots(1, 1)
            ax.set_title(cur_title)
            ax.barh(names, counts)
            fig.savefig(f"{save_path}/{figname}.png",
                        dpi=300,
                        format='png',
                        bbox_inches='tight')
            plt.close(fig)
def count_by_university_order_by_country(
        vk_elastic_db: es_client.VkDataDatabaseClient,
        size=10,
        is_need_other=True,
        is_need_print=False,
        is_need_plot=True,
        is_need_active=False,
        days_delta=20):
    """Count users per university, grouped by country.

    The parent ``terms`` aggregation buckets documents by
    ``country.title.keyword``; inside every country bucket a nested ``terms``
    aggregation buckets by ``university_name.keyword``. Documents without a
    university are collected under an empty-string key and left out of the
    per-country ranking.

    Args:
        vk_elastic_db: client wrapper handed to ``get_elastic_object``.
        size: bucket count requested for both aggregation levels.
        is_need_other: append an ``"other"`` entry carrying the university
            aggregation's ``sum_other_doc_count`` for that country.
        is_need_print: print every per-country ranking to stdout.
        is_need_plot: save every per-country ranking as a horizontal bar
            chart (PNG) under ``save_path``.
        is_need_active: narrow the search with ``get_active_users_filter``.
        days_delta: forwarded to ``get_active_users_filter``.
    """
    country_aggs_name = "country_count"
    university_aggs_name = "university_count"
    missing_str = ""

    title = "count university by country"
    if is_need_active:
        title += " active"

    es = get_elastic_object(vk_elastic_db)
    s = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        s = get_active_users_filter(es, index, s, days_delta=days_delta)

    country_agg = elasticsearch_dsl.A('terms',
                                      field="country.title.keyword",
                                      size=size,
                                      collect_mode="breadth_first")
    university_agg = elasticsearch_dsl.A('terms',
                                         field="university_name.keyword",
                                         missing=missing_str,
                                         size=size)
    # Country must be the parent and university the child: the loops below
    # read the outer buckets as countries and skip the "missing" key on the
    # inner (university) buckets.
    s.aggs.bucket(country_aggs_name, country_agg).bucket(university_aggs_name,
                                                         university_agg)
    response = s.execute()

    data_dict = {}
    for country_hit in response.aggregations[country_aggs_name].buckets:
        university_result = country_hit[university_aggs_name]
        x_axis = []
        y_axis = []
        for hit in university_result.buckets:
            if hit.key == missing_str:
                # Users without a university are not part of the ranking.
                continue
            x_axis.append(hit.key)
            y_axis.append(hit.doc_count)
        if is_need_other:
            x_axis.append("other")
            y_axis.append(university_result.sum_other_doc_count)
        data_dict[country_hit.key] = {"x_axis": x_axis, "y_axis": y_axis}

    for country, axes in data_dict.items():
        x_axis = axes["x_axis"]
        y_axis = axes["y_axis"]
        cur_title = f"{title}\n{country}"
        figname = f"{title.replace(' ', '_')}_{country}"
        if is_need_print:
            print(cur_title)
            for rank, (name, count) in enumerate(zip(x_axis, y_axis), start=1):
                print(f"{rank}\t{name} {count}")
        if is_need_plot:
            fig, ax = plt.subplots(1, 1)
            ax.set_title(cur_title)
            ax.barh(x_axis, y_axis)
            fig.savefig(f"{save_path}/{figname}.png",
                        dpi=300,
                        format='png',
                        bbox_inches='tight')
            plt.close(fig)
def resource_summaries(host, index, after, before, interval):
    """Yield resource summaries per site partition and time bucket.

    Documents whose ``@timestamp`` lies in ``[after, before)`` are grouped by
    site, country, institution, resource and a scripted GPU/CPU slot type via
    ``scan_aggs``. Inside every group a ``date_histogram`` on ``@timestamp``
    carries the ``summarize_resources`` sub-aggregation.

    Args:
        host: passed to ``elasticsearch.Elasticsearch``.
        index: index (pattern) to search.
        after: lower bound (inclusive), compared against naive UTC datetimes.
        before: upper bound (exclusive), compared against naive UTC datetimes.
        interval: ``timedelta`` giving the histogram bucket width.

    Yields:
        dict: the ``resources`` value of a histogram bucket, extended with
        ``count`` (the bucket's ``doc_count``) and ``_keys`` (the group key
        plus a ``timestamp`` string formatted ``%Y-%m-%dT%H:%M:%S``).
    """
    grouping_fields = ("site", "country", "institution", "resource")
    by_site = [{
        name: edsl.A("terms", field=name + ".keyword")
    } for name in grouping_fields]
    # split sites into GPU/CPU partitions
    slot_type_agg = edsl.A(
        "terms", script='doc.TotalGPUs.value > 0 ? "GPU" : "CPU"')
    by_site.append({"slot_type": slot_type_agg})

    # NB: @timestamp is deliberately kept out of the composite aggregation.
    # A composite buckets documents for _every_ combination of source values,
    # so a document would be added N times if N of its @timestamp values fall
    # into the time range. To emulate ES 7.x range semantics (one doc may
    # fall in many buckets, but each bucket sees only one copy of it) the
    # date_histogram lives in a sub-aggregation instead.
    interval_ms = int(interval.total_seconds() * 1000)
    by_timestamp = edsl.A("date_histogram",
                          field="@timestamp",
                          interval=interval_ms)
    by_timestamp.bucket("resources", summarize_resources)

    client = elasticsearch.Elasticsearch(host)
    time_range = {"@timestamp": {"gte": after, "lt": before}}
    search = edsl.Search().using(client).index(index).filter(
        "range", **time_range)

    for site in scan_aggs(search, by_site, {"timestamp": by_timestamp},
                          size=1):
        for bucket in site.timestamp.buckets:
            # Histogram buckets have to be filtered to the query range here:
            # bucket_selector does not support date_histogram buckets, and
            # the corresponding ticket has been open for years:
            # https://github.com/elastic/elasticsearch/issues/23874
            timestamp = datetime.datetime.utcfromtimestamp(bucket.key / 1000)
            if not (after <= timestamp < before):
                continue
            if not bucket.doc_count > 0:
                continue
            data = bucket.resources.value.to_dict()
            data["count"] = bucket.doc_count
            data["_keys"] = site.key.to_dict()
            data["_keys"]["timestamp"] = timestamp.strftime(
                "%Y-%m-%dT%H:%M:%S")
            yield data
def count_by_country(vk_elastic_db: es_client.VkDataDatabaseClient,
                     size=10,
                     is_need_active=False,
                     days_delta=20):
    """Return document counts for the most frequent countries.

    Only documents that have ``country.title.keyword`` and whose value is not
    the empty string are counted.

    Args:
        vk_elastic_db: client wrapper handed to ``get_elastic_object``.
        size: number of country buckets requested.
        is_need_active: narrow the search with ``get_active_users_filter``.
        days_delta: forwarded to ``get_active_users_filter``.

    Returns:
        list[dict]: one ``{'country': ..., 'count': ...}`` entry per bucket,
        after the list has been passed through ``add_geoposition``.
    """
    country_aggs_name = "country_count"

    es = get_elastic_object(vk_elastic_db)
    s = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        s = get_active_users_filter(es, index, s, days_delta=days_delta)

    s = s.filter(
        "bool",
        must=[elasticsearch_dsl.Q("exists", field="country.title.keyword")])
    # The keyword argument expands to the field "country.title.keyword"; a
    # misspelt name would target a non-existent field and exclude nothing.
    s = s.filter(
        "bool",
        must_not=[elasticsearch_dsl.Q("match", country__title__keyword="")])

    a = elasticsearch_dsl.A('terms', field="country.title.keyword", size=size)
    s.aggs.bucket(country_aggs_name, a)
    response = s.execute()

    data = [{
        'country': country_hit.key,
        'count': country_hit.doc_count
    } for country_hit in response.aggregations[country_aggs_name].buckets]
    add_geoposition(data)
    return data
def aggregates(self):
    """Return the single ``(name, aggregation)`` pair for this field.

    The aggregation has type ``self.agg_type`` and is built over the field
    ``self.name`` with ``self.interval`` as its ``interval``.
    """
    aggregation = esd.A(self.agg_type,
                        field=self.name,
                        interval=self.interval)
    return [(self.name, aggregation)]
def get_elasticsearch_index_samples(
        elasticsearch_index,
        dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS):
    """Return the sample ids found in an index together with its metadata.

    Sample ids are the keys of a ``terms`` aggregation (at most 10000
    buckets) on the field ``SAMPLE_FIELDS_MAP[dataset_type]``.

    Returns:
        tuple: ``(sample_ids, index_metadata)`` where ``index_metadata`` is
        the entry for ``elasticsearch_index`` from ``get_index_metadata``.
    """
    es_client = get_es_client()
    all_metadata = get_index_metadata(elasticsearch_index, es_client)
    index_metadata = all_metadata.get(elasticsearch_index)

    # size=0 is passed as a request parameter, so only aggregations matter.
    search = elasticsearch_dsl.Search(
        using=es_client, index=elasticsearch_index).params(size=0)
    sample_agg = elasticsearch_dsl.A('terms',
                                     field=SAMPLE_FIELDS_MAP[dataset_type],
                                     size=10000)
    search.aggs.bucket('sample_ids', sample_agg)

    buckets = search.execute().aggregations.sample_ids.buckets
    sample_ids = [bucket['key'] for bucket in buckets]
    return sample_ids, index_metadata
def get_elasticsearch_index_samples(elasticsearch_index):
    """Return the sample ids found in an index together with its metadata.

    Sample ids are the keys of a ``terms`` aggregation (at most 10000
    buckets) on the ``samples_num_alt_1`` field.

    Returns:
        tuple: ``(sample_ids, index_metadata)`` where ``index_metadata`` is
        the entry for ``elasticsearch_index`` from ``get_index_metadata``.
    """
    es_client = get_es_client()
    all_metadata = get_index_metadata(elasticsearch_index, es_client)
    index_metadata = all_metadata.get(elasticsearch_index)

    # size=0 is passed as a request parameter, so only aggregations matter.
    search = elasticsearch_dsl.Search(
        using=es_client, index=elasticsearch_index).params(size=0)
    sample_agg = elasticsearch_dsl.A('terms',
                                     field='samples_num_alt_1',
                                     size=10000)
    search.aggs.bucket('sample_ids', sample_agg)

    buckets = search.execute().aggregations.sample_ids.buckets
    sample_ids = [bucket['key'] for bucket in buckets]
    return sample_ids, index_metadata
def aggregates(self):
    """Return one ``(name, aggregation)`` pair per configured interval.

    Every entry of ``self.aggregations`` is a 4-tuple; only its name and
    interval are used here. Each aggregation has type ``self.agg_type``,
    targets the field ``self.name`` and drops empty buckets
    (``min_doc_count=1``).
    """
    return [
        (name,
         esd.A(self.agg_type,
               field=self.name,
               interval=interval,
               min_doc_count=1))
        for name, interval, _format, _f in self.aggregations
    ]
def get_active_users_filter(es, es_index, s, days_delta=20):
    """Restrict a search to users seen within the latest ``days_delta`` days.

    The reference point is not "now" but the maximum ``last_seen.time`` found
    in the index, obtained with a separate ``max`` aggregation query.

    Args:
        es: Elasticsearch connection used for the helper query.
        es_index: index the helper query runs against.
        s: the search to be narrowed.
        days_delta: width of the activity window in days.

    Returns:
        A search derived from ``s`` with a ``range`` filter requiring
        ``last_seen.time`` to be greater than the computed cut-off.
    """
    agg_name = "last_time"

    latest_search = elasticsearch_dsl.Search(using=es, index=es_index)
    latest_search.aggs.bucket(
        agg_name, elasticsearch_dsl.A('max', field="last_seen.time"))
    latest_timestamp = latest_search.execute().aggregations[agg_name].value

    # The value is interpreted as a Unix timestamp in seconds; the cut-off is
    # computed in local time and converted back to a timestamp.
    latest_seen = datetime.datetime.fromtimestamp(latest_timestamp)
    cutoff = latest_seen - datetime.timedelta(days=days_delta)
    cutoff_timestamp = time.mktime(cutoff.timetuple())

    return s.filter("range", last_seen__time={'gt': cutoff_timestamp})
def count_by_university(vk_elastic_db: es_client.VkDataDatabaseClient,
                        size=10,
                        is_need_other=True,
                        is_need_print=False,
                        is_need_plot=True,
                        is_need_active=False,
                        days_delta=20):
    """Rank universities by the number of matching documents.

    Runs a ``terms`` aggregation on ``university_name.keyword``. Documents
    without a university are collected under an empty-string key and left
    out of the ranking.

    Args:
        vk_elastic_db: client wrapper handed to ``get_elastic_object``.
        size: number of university buckets requested.
        is_need_other: append an ``"other"`` entry carrying the
            aggregation's ``sum_other_doc_count``.
        is_need_print: print the ranking to stdout.
        is_need_plot: save the ranking as a horizontal bar chart (PNG) under
            ``save_path``.
        is_need_active: narrow the search with ``get_active_users_filter``.
        days_delta: forwarded to ``get_active_users_filter``.
    """
    aggs_name = "university_count"
    missing_str = ""

    title = "university count"
    if is_need_active:
        title += " active"

    es = get_elastic_object(vk_elastic_db)
    s = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        s = get_active_users_filter(es, index, s, days_delta=days_delta)

    a = elasticsearch_dsl.A('terms',
                            field="university_name.keyword",
                            missing=missing_str,
                            size=size)
    s.aggs.bucket(aggs_name, a)
    response = s.execute()
    result = response.aggregations[aggs_name]

    x_axis = []
    y_axis = []
    for hit in result.buckets:
        if hit.key == missing_str:
            # Users without a university are not part of the ranking.
            continue
        x_axis.append(hit.key)
        y_axis.append(hit.doc_count)
    if is_need_other:
        x_axis.append("other")
        y_axis.append(result.sum_other_doc_count)

    if is_need_print:
        print(title)
        for rank, (name, count) in enumerate(zip(x_axis, y_axis), start=1):
            print(f"{rank}\t{name} {count}")

    if is_need_plot:
        fig, ax = plt.subplots(1, 1)
        ax.set_title(title)
        ax.barh(x_axis, y_axis)
        fig.savefig(f"{save_path}/{title.replace(' ', '_')}.png",
                    dpi=300,
                    format='png',
                    bbox_inches='tight')
        plt.close(fig)
def has_mobile(vk_elastic_db: es_client.VkDataDatabaseClient,
               size=1,
               is_need_other=True,
               is_need_print=False,
               is_need_plot=True,
               is_need_active=False,
               days_delta=20):
    """Report the distribution of the ``has_mobile`` field.

    Runs a ``terms`` aggregation on ``has_mobile``; documents without the
    field are counted under ``"0"``. Bucket keys are translated to
    ``"has"`` / ``"has not"`` / ``"missing"``.

    Args:
        vk_elastic_db: client wrapper handed to ``get_elastic_object``.
        size: number of buckets requested.
        is_need_other: append an ``"other"`` entry carrying the
            aggregation's ``sum_other_doc_count``.
        is_need_print: print the counts to stdout.
        is_need_plot: save the shares as a pie chart (PNG) under
            ``save_path``.
        is_need_active: narrow the search with ``get_active_users_filter``.
        days_delta: forwarded to ``get_active_users_filter``.

    Raises:
        KeyError: if a bucket key is not one of ``1``, ``0`` or ``-1``.
    """
    aggs_name = "has_mobile"
    label_dict = {"1": "has", "0": "has not", "-1": "missing"}

    title = "has_mobile"
    if is_need_active:
        title += " active"

    es = get_elastic_object(vk_elastic_db)
    s = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        s = get_active_users_filter(es, index, s, days_delta=days_delta)

    a = elasticsearch_dsl.A('terms',
                            field="has_mobile",
                            size=size,
                            missing="0")
    s.aggs.bucket(aggs_name, a)
    response = s.execute()
    result = response.aggregations[aggs_name]

    x_axis = [label_dict[str(hit.key)] for hit in result.buckets]
    y_axis = [hit.doc_count for hit in result.buckets]
    if is_need_other:
        x_axis.append("other")
        y_axis.append(result.sum_other_doc_count)

    if is_need_print:
        print(title)
        for rank, (label, count) in enumerate(zip(x_axis, y_axis), start=1):
            print(f"{rank}\t{label} {count}")

    if is_need_plot:
        # Sum once rather than once per element. When nothing was counted,
        # draw an empty chart instead of dividing by zero.
        total = sum(y_axis)
        if total:
            labels = x_axis
            sizes = [count / total for count in y_axis]
        else:
            labels, sizes = [], []
        fig, ax = plt.subplots(1, 1)
        ax.set_title(title)
        ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
        fig.savefig(f"{save_path}/{title.replace(' ', '_')}.png",
                    dpi=300,
                    format='png',
                    bbox_inches='tight')
        plt.close(fig)
def count_by_country(vk_elastic_db: es_client.VkDataDatabaseClient,
                     size=10,
                     is_need_other=True,
                     is_need_print=False,
                     is_need_plot=True,
                     is_need_active=False,
                     days_delta=20):
    """Aggregate documents by country and hand the result on for output.

    Runs a ``terms`` aggregation on ``country.title.keyword`` and passes the
    response, together with the output flags, to ``response_process``.

    Args:
        vk_elastic_db: client wrapper handed to ``get_elastic_object``.
        size: number of country buckets requested.
        is_need_other: forwarded to ``response_process``.
        is_need_print: forwarded to ``response_process``.
        is_need_plot: forwarded to ``response_process``.
        is_need_active: narrow the search with ``get_active_users_filter``.
        days_delta: forwarded to ``get_active_users_filter``.
    """
    aggs_name = "countries_count"

    title = "count by country"
    if is_need_active:
        title += " active"

    es = get_elastic_object(vk_elastic_db)
    search = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        search = get_active_users_filter(es, index, search,
                                         days_delta=days_delta)

    by_country = elasticsearch_dsl.A('terms',
                                     field="country.title.keyword",
                                     size=size)
    search.aggs.bucket(aggs_name, by_country)

    response_process(search.execute(), aggs_name, title, is_need_other,
                     is_need_print, is_need_plot)
def has_city(vk_elastic_db: es_client.VkDataDatabaseClient,
             is_need_print=False,
             is_need_plot=True,
             is_need_active=False,
             days_delta=20):
    """Report how many documents have a city versus how many do not.

    Runs a ``terms`` aggregation on ``city.title.keyword`` in which documents
    without the field are collected under the key ``"missing"``. The counts
    of all other returned buckets are summed into ``"has city"``.

    Args:
        vk_elastic_db: client wrapper handed to ``get_elastic_object``.
        is_need_print: print both counts to stdout.
        is_need_plot: save the shares as a pie chart (PNG) under
            ``save_path``.
        is_need_active: narrow the search with ``get_active_users_filter``.
        days_delta: forwarded to ``get_active_users_filter``.
    """
    aggs_name = "has_city"
    missing_str = "missing"
    # NOTE(review): only the returned buckets are summed and
    # ``sum_other_doc_count`` is ignored, so with more than 10000 distinct
    # cities "has city" would presumably be under-counted — confirm.
    size = 10000

    title = "has city"
    if is_need_active:
        title += " active"

    es = get_elastic_object(vk_elastic_db)
    s = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        s = get_active_users_filter(es, index, s, days_delta=days_delta)

    a = elasticsearch_dsl.A('terms',
                            field="city.title.keyword",
                            missing=missing_str,
                            size=size)
    s.aggs.bucket(aggs_name, a)
    response = s.execute()

    data = {"has city": 0, "missing city": 0}
    for hit in response.aggregations[aggs_name].buckets:
        target = "missing city" if hit.key == missing_str else "has city"
        data[target] += hit.doc_count

    x_axis = list(data)
    y_axis = list(data.values())

    if is_need_print:
        print(title)
        for rank, (label, count) in enumerate(zip(x_axis, y_axis), start=1):
            print(f"{rank}\t{label} {count}")

    if is_need_plot:
        # Sum once rather than once per element. When nothing was counted,
        # draw an empty chart instead of dividing by zero.
        total = sum(y_axis)
        if total:
            labels = x_axis
            sizes = [count / total for count in y_axis]
        else:
            labels, sizes = [], []
        fig, ax = plt.subplots(1, 1)
        ax.set_title(title)
        ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
        fig.savefig(f"{save_path}/{title.replace(' ', '_')}.png",
                    dpi=300,
                    format='png',
                    bbox_inches='tight')
        plt.close(fig)
def name_count(vk_elastic_db: es_client.VkDataDatabaseClient,
               aggs_name,
               sex=None,
               size=10,
               is_need_active=False,
               days_delta=20):
    """Run a first-name frequency aggregation and return the raw response.

    Args:
        vk_elastic_db: client wrapper handed to ``get_elastic_object``.
        aggs_name: name under which the ``terms`` aggregation on
            ``first_name.keyword`` is registered in the response.
        sex: when not ``None``, only documents whose ``sex`` term equals this
            value are considered.
        size: number of first-name buckets requested.
        is_need_active: narrow the search with ``get_active_users_filter``.
        days_delta: forwarded to ``get_active_users_filter``.

    Returns:
        The response of the executed search.
    """
    es = get_elastic_object(vk_elastic_db)
    search = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        search = get_active_users_filter(es, index, search,
                                         days_delta=days_delta)
    if sex is not None:
        search = search.filter('term', sex=sex)

    by_first_name = elasticsearch_dsl.A('terms',
                                        field="first_name.keyword",
                                        size=size)
    search.aggs.bucket(aggs_name, by_first_name)
    return search.execute()
def get_elasticsearch_index_samples(elasticsearch_index):
    """Return the sample ids found in an index together with its metadata.

    The field holding sample ids is the first entry of ``SAMPLE_FIELDS_LIST``
    that appears among the index's fields. Sample ids are the keys of a
    ``terms`` aggregation (at most 10000 buckets) on that field.

    Returns:
        tuple: ``(sample_ids, index_metadata)`` where ``index_metadata`` is
        the entry for ``elasticsearch_index`` from ``get_index_metadata``.

    Raises:
        StopIteration: if none of ``SAMPLE_FIELDS_LIST`` is an index field.
    """
    es_client = get_es_client()
    all_metadata = get_index_metadata(elasticsearch_index,
                                      es_client,
                                      include_fields=True)
    index_metadata = all_metadata.get(elasticsearch_index)

    index_fields = index_metadata['fields']
    # No default is given on purpose: a missing sample field propagates as
    # StopIteration.
    sample_field = next(candidate for candidate in SAMPLE_FIELDS_LIST
                        if candidate in index_fields)

    # size=0 is passed as a request parameter, so only aggregations matter.
    search = elasticsearch_dsl.Search(
        using=es_client, index=elasticsearch_index).params(size=0)
    search.aggs.bucket(
        'sample_ids',
        elasticsearch_dsl.A('terms', field=sample_field, size=10000))

    buckets = search.execute().aggregations.sample_ids.buckets
    sample_ids = [bucket['key'] for bucket in buckets]
    return sample_ids, index_metadata
def _get_elasticsearch_index_samples(elasticsearch_index, project):
    """Return the sample ids stored in an Elasticsearch index.

    Two layouts are handled:

    * Nested genotypes (the mapping of ``elasticsearch_index`` has a
      ``join_field``): sample ids are the keys of a ``terms`` aggregation on
      ``sample_id``, sized by the number of individuals in ``project``.
      A list is returned.
    * Otherwise: sample ids are derived from the mapped field names matching
      ``*_num_alt`` by cutting each name at that suffix. A set is returned.

    Raises:
        Exception: if the index does not exist, the mapping request fails,
            or no sample fields are found.
    """
    sample_field_suffix = '_num_alt'
    es_client = get_es_client(timeout=30)

    index_pattern = '{}*'.format(elasticsearch_index)
    index = elasticsearch_dsl.Index(index_pattern, using=es_client)
    requested_fields = ['*{}'.format(sample_field_suffix), 'join_field']
    try:
        field_mapping = index.get_field_mapping(fields=requested_fields,
                                                doc_type=[VARIANT_DOC_TYPE])
    except NotFoundError:
        raise Exception('Index "{}" not found'.format(elasticsearch_index))
    except TransportError as e:
        raise Exception(e.error)

    # Nested genotypes
    exact_index_mappings = field_mapping.get(elasticsearch_index,
                                             {}).get('mappings', {})
    if exact_index_mappings.get(VARIANT_DOC_TYPE, {}).get('join_field'):
        max_samples = Individual.objects.filter(
            family__project=project).count()
        search = elasticsearch_dsl.Search(
            using=es_client, index=elasticsearch_index).params(size=0)
        search.aggs.bucket(
            'sample_ids',
            elasticsearch_dsl.A('terms', field='sample_id', size=max_samples))
        buckets = search.execute().aggregations.sample_ids.buckets
        return [bucket['key'] for bucket in buckets]

    # Flat layout: one "<sample>_num_alt" field per sample.
    samples = set()
    for index_mapping in field_mapping.values():
        doc_type_fields = index_mapping.get('mappings',
                                            {}).get(VARIANT_DOC_TYPE, {})
        for field_name in doc_type_fields.keys():
            samples.add(field_name.split(sample_field_suffix)[0])

    if not samples:
        raise Exception('No sample fields found for index "{}"'.format(
            elasticsearch_index))
    return samples
def test_you_can_make_aggregations_on_tags_raw(self, factories, index,
                                               search):
    """Tags differing only by case get separate ``tags_raw`` buckets."""
    index(
        factories.Annotation.build(id="test_annotation_id_1",
                                   tags=["Hello"]),
        factories.Annotation.build(id="test_annotation_id_2",
                                   tags=["hello"]))

    search.aggs.bucket('tags_raw_terms',
                       elasticsearch_dsl.A('terms', field='tags_raw'))
    buckets = search.execute().aggregations.tags_raw_terms.buckets

    def bucket_for(tag):
        # Fails with StopIteration when no bucket carries this exact key.
        return next(bucket for bucket in buckets if bucket["key"] == tag)

    capitalised_bucket = bucket_for("Hello")
    lowercase_bucket = bucket_for("hello")

    assert capitalised_bucket["doc_count"] == 1
    assert lowercase_bucket["doc_count"] == 1
def test_you_can_make_aggregations_on_user_raw(self, factories, index,
                                               search):
    """Userids differing only by case get separate ``user_raw`` buckets.

    The two annotations must carry distinct userids: with identical values
    both documents share one bucket and the ``doc_count == 1`` assertions
    below cannot hold.
    """
    lowercase_userid = "acct:someone@example.com"
    capitalised_userid = "acct:Someone@example.com"

    annotation_1 = factories.Annotation.build(userid=lowercase_userid)
    annotation_2 = factories.Annotation.build(userid=capitalised_userid)
    index(annotation_1, annotation_2)

    user_aggregation = elasticsearch_dsl.A('terms', field='user_raw')
    search.aggs.bucket('user_raw_terms', user_aggregation)
    response = search.execute()
    buckets = response.aggregations.user_raw_terms.buckets

    # next() without a default makes the test fail if a bucket is absent.
    user_bucket_1 = next(bucket for bucket in buckets
                         if bucket["key"] == lowercase_userid)
    user_bucket_2 = next(bucket for bucket in buckets
                         if bucket["key"] == capitalised_userid)

    assert user_bucket_1["doc_count"] == 1
    assert user_bucket_2["doc_count"] == 1
def _get_elasticsearch_index_samples(elasticsearch_index):
    """Return the sample ids stored in an Elasticsearch index.

    Two layouts are handled:

    * Nested genotypes (``is_nested_genotype_index`` is true): sample ids
      are the keys of a ``terms`` aggregation (at most 10000 buckets) on
      ``samples_num_alt_1``. A list is returned.
    * Otherwise: sample ids are derived from the mapped field names matching
      ``*_num_alt`` by cutting each name at that suffix. A set is returned.

    Raises:
        Exception: if the index does not exist, the mapping request fails,
            or no sample fields are found.
    """
    es_client = get_es_client()

    # Nested genotypes
    if is_nested_genotype_index(elasticsearch_index):
        search = elasticsearch_dsl.Search(
            using=es_client, index=elasticsearch_index).params(size=0)
        search.aggs.bucket(
            'sample_ids',
            elasticsearch_dsl.A('terms',
                                field='samples_num_alt_1',
                                size=10000))
        buckets = search.execute().aggregations.sample_ids.buckets
        return [bucket['key'] for bucket in buckets]

    # Flat layout: one "<sample>_num_alt" field per sample.
    sample_field_suffix = '_num_alt'
    index_pattern = '{}*'.format(elasticsearch_index)
    index = elasticsearch_dsl.Index(index_pattern, using=es_client)
    try:
        field_mapping = index.get_field_mapping(
            fields=['*{}'.format(sample_field_suffix)],
            doc_type=[VARIANT_DOC_TYPE])
    except NotFoundError:
        raise Exception('Index "{}" not found'.format(elasticsearch_index))
    except TransportError as e:
        raise Exception(e.error)

    samples = set()
    for index_mapping in field_mapping.values():
        doc_type_fields = index_mapping.get('mappings',
                                            {}).get(VARIANT_DOC_TYPE, {})
        for field_name in doc_type_fields.keys():
            samples.add(field_name.split(sample_field_suffix)[0])

    if not samples:
        raise Exception('No sample fields found for index "{}"'.format(
            elasticsearch_index))
    return samples
# Scripted-metric aggregation that sums claimed and offered resources over
# the documents of a bucket.
#
# The Painless scripts must keep their line structure: ``map_script`` starts
# with ``//`` line comments, which would swallow the rest of the script if it
# were written on a single line.
summarize_resources = edsl.A(
    "scripted_metric",
    # Per-shard setup: remember the bucket width and create a zeroed metrics
    # map with one "claimed.<status>.<resource>" entry per status/resource
    # pair and one "offered.<resource>" entry per resource.
    init_script="""
        state.interval = (Long)(params.interval);
        HashMap metrics = new HashMap();
        for (resource in params.RESOURCES) {
            for (status in params.STATUSES) {
                String key = "claimed."+status+"."+resource;
                metrics.put(key, 0.0);
            }
            metrics.put("offered."+resource, 0.0);
        }
        state.metrics = metrics;
        """,
    # Per-document step: weight the document's capacity (and occupancy times
    # capacity) by the average fraction of the spanned buckets it covers.
    map_script="""
        // The time range of each item intersects one or more buckets, but does not
        // necessarily overlap each completely. Ideally we would use the exact overlap
        // fraction to weight contributions to each bucket, but since Elastic does not
        // give us access to the bucket key, we have to settle for the average overlap
        // fraction.
        long left = doc[params.left].value.toInstant().toEpochMilli();
        long right = doc[params.right].value.toInstant().toEpochMilli();
        long total_interval = (state.interval*((right+params.interval)/state.interval-left/state.interval));
        double active_fraction = (right-left).doubleValue()/total_interval.doubleValue();
        HashMap metrics = state.metrics;
        for (resource in params.RESOURCES) {
            if (!doc.containsKey("Total"+resource)) {
                continue;
            }
            double capacity = doc["Total"+resource].value.doubleValue();
            for (status in params.STATUSES) {
                String source = "occupancy."+status+"."+resource;
                String dest = "claimed."+status+"."+resource;
                if (doc.containsKey(source)) {
                    metrics[dest] += active_fraction*doc[source].value*capacity;
                }
            }
            metrics["offered."+resource] += active_fraction*capacity;
        }
        """,
    # Per-shard result: hand the accumulated map to the reduce phase.
    combine_script="""
        return state.metrics;
        """,
    # Final step: add up the per-shard maps key by key, skipping null states.
    reduce_script="""
        Map aggregate = new HashMap();
        for (state in states) {
            if (state == null) {
                continue;
            }
            for (entry in state.entrySet()) {
                if (aggregate.containsKey(entry.getKey())) {
                    aggregate[entry.getKey()] += entry.getValue();
                } else {
                    aggregate[entry.getKey()] = entry.getValue();
                }
            }
        }
        return aggregate;
        """,
    params={
        # Document fields holding the start and end of each item's time range.
        "left": "DaemonStartTime",
        "right": "LastHeardFrom",
        # Bucket width in milliseconds.
        "interval": int(options.interval.total_seconds() * 1000),
        "RESOURCES": RESOURCES,
        # "total" is tracked in addition to the configured statuses.
        "STATUSES": STATUSES + ("total", ),
    },
)
def aggregates(self):
    """Return the value aggregation plus a companion ``missing`` aggregation.

    The first pair is named ``self.name`` and aggregates the field
    ``self.name`` with type ``self.agg_type``, limited to 5 buckets. The
    second pair is named ``"<name>_missing"`` and is a ``missing``
    aggregation on the same field.
    """
    value_agg = esd.A(self.agg_type, field=self.name, size=5)
    missing_agg = esd.A("missing", field=self.name)
    return [
        (self.name, value_agg),
        ("%s_missing" % self.name, missing_agg),
    ]