def last_name_count(vk_elastic_db: es_client.VkDataDatabaseClient,
                    size=10,
                    is_need_other=False,
                    is_need_print=False,
                    is_need_plot=True,
                    is_need_active=False,
                    days_delta=20):
    """Report the most frequent last names, grouped by the ``sex`` field.

    A ``terms`` aggregation on ``sex`` (documents without the field are
    bucketed as ``"-1"``; only the two largest buckets are requested) wraps a
    ``terms`` aggregation on ``last_name.keyword``.

    Args:
        vk_elastic_db: Client handed to ``get_elastic_object``.
        size: Number of last-name buckets requested per sex bucket.
        is_need_other: Append an ``"other"`` row holding the aggregation's
            ``sum_other_doc_count``.
        is_need_print: Print a ranked table for every sex bucket.
        is_need_plot: Save one horizontal bar chart per sex bucket as a PNG
            under ``save_path``.
        is_need_active: Restrict the search with ``get_active_users_filter``
            and append " active" to the title.
        days_delta: Forwarded to ``get_active_users_filter``.
    """
    surname_agg_name = "last_name_count"
    sex_agg_name = "sex_aggs"
    sex_bucket_limit = 2
    title = "last name count" + (" active" if is_need_active else "")

    es = get_elastic_object(vk_elastic_db)
    search = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        search = get_active_users_filter(es, index, search,
                                         days_delta=days_delta)

    by_sex = elasticsearch_dsl.A('terms',
                                 field="sex",
                                 missing="-1",
                                 size=sex_bucket_limit)
    by_surname = elasticsearch_dsl.A('terms',
                                     field="last_name.keyword",
                                     size=size)
    search.aggs.bucket(sex_agg_name, by_sex).bucket(surname_agg_name,
                                                    by_surname)
    response = search.execute()

    # First pass: collect the series for every sex bucket.
    sex_labels = {"0": "unknown", "1": "woman", "2": "man", "-1": "missing"}
    series_by_sex = {}
    for sex_bucket in response.aggregations[sex_agg_name].buckets:
        surnames = sex_bucket[surname_agg_name]
        names = [entry.key for entry in surnames.buckets]
        counts = [entry.doc_count for entry in surnames.buckets]
        if is_need_other:
            names.append("other")
            counts.append(surnames.sum_other_doc_count)
        series_by_sex[sex_labels[str(sex_bucket.key)]] = {
            "x_axis": names,
            "y_axis": counts,
        }

    # Second pass: print and/or plot each collected series.
    file_stem = title.replace(' ', '_')
    for sex, series in series_by_sex.items():
        names = series["x_axis"]
        counts = series["y_axis"]
        cur_title = f"{title}\n{sex}"

        if is_need_print:
            print(cur_title)
            for rank, (name, count) in enumerate(zip(names, counts), start=1):
                print(f"{rank}\t{name} {count}")

        if is_need_plot:
            fig, ax = plt.subplots(1, 1)
            ax.set_title(cur_title)
            ax.barh(names, counts)
            fig.savefig(f"{save_path}/{file_stem}_{sex}.png",
                        dpi=300,
                        format='png',
                        bbox_inches='tight')
            plt.close(fig)
def count_by_university_order_by_country(vk_elastic_db: es_client.VkDataDatabaseClient, size=10, is_need_other=True,
                                         is_need_print=False, is_need_plot=True, is_need_active=False, days_delta=20):
    """Report the most common universities inside each of the most common countries.

    The search buckets documents by ``country.title.keyword`` and, inside
    every country bucket, by ``university_name.keyword``. Documents without a
    university are bucketed under an empty-string key and skipped in the
    report.

    Args:
        vk_elastic_db: Client handed to ``get_elastic_object``.
        size: Number of buckets requested at both aggregation levels.
        is_need_other: Append an ``"other"`` row holding the university
            aggregation's ``sum_other_doc_count`` for the country.
        is_need_print: Print a ranked table for every country.
        is_need_plot: Save one horizontal bar chart per country as a PNG
            under ``save_path``.
        is_need_active: Restrict the search with ``get_active_users_filter``
            and append " active" to the title.
        days_delta: Forwarded to ``get_active_users_filter``.
    """
    country_aggs_name = "country_count"
    university_aggs_name = "university_count"
    title = "count university by country"
    if is_need_active:
        title += " active"
    missing_str = ""
    es = get_elastic_object(vk_elastic_db)
    s = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        s = get_active_users_filter(es, index, s, days_delta=days_delta)
    country_agg = elasticsearch_dsl.A('terms', field="country.title.keyword", size=size,
                                      collect_mode="breadth_first")
    university_agg = elasticsearch_dsl.A('terms', field="university_name.keyword", missing=missing_str, size=size)
    # Countries must be the outer buckets and universities the nested ones:
    # the processing below reads the outer level as countries and filters the
    # missing-university key on the inner level.
    s.aggs.bucket(country_aggs_name, country_agg).bucket(university_aggs_name, university_agg)
    response = s.execute()

    data_dict = {}
    for country_hit in response.aggregations[country_aggs_name].buckets:
        universities = country_hit[university_aggs_name]
        x_axis = []
        y_axis = []
        for hit in universities.buckets:
            # Skip the placeholder bucket of documents without a university.
            if hit.key == missing_str:
                continue
            x_axis.append(hit.key)
            y_axis.append(hit.doc_count)
        if is_need_other:
            x_axis.append("other")
            y_axis.append(universities.sum_other_doc_count)
        data_dict[country_hit.key] = {"x_axis": x_axis, "y_axis": y_axis}

    for country, series in data_dict.items():
        x_axis = series["x_axis"]
        y_axis = series["y_axis"]
        cur_title = f"{title}\n{country}"
        figname = f"{title.replace(' ', '_')}_{country}"
        if is_need_print:
            print(cur_title)
            for rank, (name, count) in enumerate(zip(x_axis, y_axis), start=1):
                print(f"{rank}\t{name} {count}")

        if is_need_plot:
            fig, ax = plt.subplots(1, 1)
            ax.set_title(cur_title)
            ax.barh(x_axis, y_axis)
            fig.savefig(f"{save_path}/{figname}.png", dpi=300, format='png', bbox_inches='tight')
            plt.close(fig)
コード例 #3
0
def resource_summaries(host, index, after, before, interval):
    """Yield one resource-summary dict per grouping key and time bucket.

    Documents of ``index`` whose ``@timestamp`` lies in ``[after, before)``
    are paged through with ``scan_aggs``, grouped by site, country,
    institution, resource and a GPU/CPU slot type. Each group carries a
    ``date_histogram`` sub-aggregation whose buckets are summarized by
    ``summarize_resources``.

    Args:
        host: Passed to ``elasticsearch.Elasticsearch``.
        index: Index name (or pattern) to search.
        after: Inclusive lower bound, compared against naive UTC datetimes.
        before: Exclusive upper bound, compared against naive UTC datetimes.
        interval: ``timedelta`` giving the histogram bucket width.

    Yields:
        The scripted-metric result as a dict, extended with ``"count"`` (the
        bucket's document count) and ``"_keys"`` (the grouping key plus a
        formatted ``"timestamp"``).
    """
    group_sources = [
        {field: edsl.A("terms", field=field + ".keyword")}
        for field in ("site", "country", "institution", "resource")
    ]
    # split sites into GPU/CPU partitions
    slot_type_agg = edsl.A(
        "terms", script='doc.TotalGPUs.value > 0 ? "GPU" : "CPU"')
    group_sources.append({"slot_type": slot_type_agg})

    # NB: @timestamp is not included in the composite aggregation, as this
    # buckets documents for _every_ combination of the source values, meaning
    # that a document will be added to the bucket N times if N of its
    # @timestamp values fall into the time range. To emulate ES 7.x range
    # semantics (one doc falls in many buckets, each bucket sees only one copy
    # of each doc), we split date_histogram off into a sub-aggregation.
    interval_ms = int(interval.total_seconds() * 1000)
    histogram = edsl.A("date_histogram",
                       field="@timestamp",
                       interval=interval_ms)
    histogram.bucket("resources", summarize_resources)

    time_window = {"@timestamp": {"gte": after, "lt": before}}
    search = edsl.Search().using(
        elasticsearch.Elasticsearch(host)).index(index).filter(
            "range", **time_window)

    for site in scan_aggs(search,
                          group_sources, {"timestamp": histogram},
                          size=1):
        for bucket in site.timestamp.buckets:
            # Filter buckets to query time range. This should be possible to do
            # in the query DSL, but bucket_selector does not support
            # date_histogram buckets, and the corresponding ticket has been
            # open for years:
            # https://github.com/elastic/elasticsearch/issues/23874
            timestamp = datetime.datetime.utcfromtimestamp(bucket.key / 1000)
            if not (after <= timestamp < before and bucket.doc_count > 0):
                continue
            data = bucket.resources.value.to_dict()
            data["count"] = bucket.doc_count
            data["_keys"] = site.key.to_dict()
            data["_keys"]["timestamp"] = timestamp.strftime(
                "%Y-%m-%dT%H:%M:%S")
            yield data
def count_by_country(vk_elastic_db: es_client.VkDataDatabaseClient,
                     size=10,
                     is_need_active=False,
                     days_delta=20):
    """Return per-country document counts enriched by ``add_geoposition``.

    Only documents that have a ``country.title.keyword`` value, and whose
    value is not the empty string, are counted.

    Args:
        vk_elastic_db: Client handed to ``get_elastic_object``.
        size: Number of country buckets requested.
        is_need_active: Restrict the search with ``get_active_users_filter``.
        days_delta: Forwarded to ``get_active_users_filter``.

    Returns:
        A list of ``{'country': ..., 'count': ...}`` dicts, one per bucket,
        after the list has been passed through ``add_geoposition``.
    """
    country_aggs_name = "country_count"
    es = get_elastic_object(vk_elastic_db)
    s = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        s = get_active_users_filter(es, index, s, days_delta=days_delta)
    s = s.filter(
        "bool",
        must=[elasticsearch_dsl.Q("exists", field="country.title.keyword")])
    # The keyword argument expands to the field "country.title.keyword"; it
    # has to name the real field for the empty-title exclusion to apply.
    s = s.filter(
        "bool",
        must_not=[elasticsearch_dsl.Q("match", country__title__keyword="")])
    a = elasticsearch_dsl.A('terms', field="country.title.keyword", size=size)
    s.aggs.bucket(country_aggs_name, a)
    response = s.execute()

    data = [
        {
            'country': country_hit.key,
            'count': country_hit.doc_count,
        }
        for country_hit in response.aggregations[country_aggs_name].buckets
    ]
    # add_geoposition receives the list itself; its return value is not used.
    add_geoposition(data)
    return data
コード例 #5
0
ファイル: facets.py プロジェクト: piledirect/photo-manager
 def aggregates(self):
     """Return a one-element list holding this facet's ``(name, aggregation)`` pair."""
     options = {'field': self.name, 'interval': self.interval}
     aggregation = esd.A(self.agg_type, **options)
     return [(self.name, aggregation)]
コード例 #6
0
def get_elasticsearch_index_samples(elasticsearch_index, dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS):
    """Return the sample ids found in an index together with the index metadata.

    Args:
        elasticsearch_index: Name of the index to inspect.
        dataset_type: Key into ``SAMPLE_FIELDS_MAP`` selecting the field that
            is aggregated.

    Returns:
        A ``(sample_ids, index_metadata)`` tuple: the keys of up to 10000
        term buckets, and the metadata entry for ``elasticsearch_index``.
    """
    client = get_es_client()
    index_metadata = get_index_metadata(elasticsearch_index, client).get(elasticsearch_index)

    # size=0: only the aggregation is needed, not the matching documents.
    search = elasticsearch_dsl.Search(using=client, index=elasticsearch_index).params(size=0)
    sample_terms = elasticsearch_dsl.A('terms', field=SAMPLE_FIELDS_MAP[dataset_type], size=10000)
    search.aggs.bucket('sample_ids', sample_terms)
    buckets = search.execute().aggregations.sample_ids.buckets

    sample_ids = [bucket['key'] for bucket in buckets]
    return sample_ids, index_metadata
コード例 #7
0
ファイル: dataset_utils.py プロジェクト: SarahBeecroft/seqr
def get_elasticsearch_index_samples(elasticsearch_index):
    """Return the sample ids found in an index together with the index metadata.

    Args:
        elasticsearch_index: Name of the index to inspect.

    Returns:
        A ``(sample_ids, index_metadata)`` tuple: the keys of up to 10000
        term buckets on ``samples_num_alt_1``, and the metadata entry for
        ``elasticsearch_index``.
    """
    client = get_es_client()
    index_metadata = get_index_metadata(elasticsearch_index, client).get(elasticsearch_index)

    # size=0: only the aggregation is needed, not the matching documents.
    search = elasticsearch_dsl.Search(using=client, index=elasticsearch_index).params(size=0)
    sample_terms = elasticsearch_dsl.A('terms', field='samples_num_alt_1', size=10000)
    search.aggs.bucket('sample_ids', sample_terms)
    buckets = search.execute().aggregations.sample_ids.buckets

    sample_ids = [bucket['key'] for bucket in buckets]
    return sample_ids, index_metadata
コード例 #8
0
ファイル: facets.py プロジェクト: piledirect/photo-manager
 def aggregates(self):
     """Return one ``(name, aggregation)`` pair per entry of ``self.aggregations``.

     Every aggregation targets ``self.name`` with the entry's interval and
     ``min_doc_count=1``. The last two items of each entry are not used here.
     """
     return [
         (name,
          esd.A(self.agg_type,
                field=self.name,
                interval=interval,
                min_doc_count=1))
         for name, interval, _format, _func in self.aggregations
     ]
def get_active_users_filter(es, es_index, s, days_delta=20):
    """Restrict a search to users seen within ``days_delta`` days of the newest user.

    A separate search fetches the maximum of ``last_seen.time`` in
    ``es_index``. The cut-off is that moment minus ``days_delta`` days.

    Args:
        es: Elasticsearch connection used for the helper search.
        es_index: Index the helper search runs against.
        s: Search object to which the range filter is added.
        days_delta: Width of the activity window in days.

    Returns:
        The search produced by ``s.filter`` with ``last_seen.time`` greater
        than the cut-off timestamp.
    """
    max_agg_name = "last_time"
    newest_search = elasticsearch_dsl.Search(using=es, index=es_index)
    newest_search.aggs.bucket(
        max_agg_name, elasticsearch_dsl.A('max', field="last_seen.time"))
    newest_timestamp = newest_search.execute().aggregations[max_agg_name].value

    # The subtraction happens on a naive local datetime, which is then
    # converted back to a POSIX timestamp.
    newest_moment = datetime.datetime.fromtimestamp(newest_timestamp)
    cutoff_moment = newest_moment - datetime.timedelta(days=days_delta)
    cutoff_timestamp = time.mktime(cutoff_moment.timetuple())
    return s.filter("range", last_seen__time={'gt': cutoff_timestamp})
def count_by_university(vk_elastic_db: es_client.VkDataDatabaseClient,
                        size=10,
                        is_need_other=True,
                        is_need_print=False,
                        is_need_plot=True,
                        is_need_active=False,
                        days_delta=20):
    """Report the most common values of ``university_name.keyword``.

    Documents without a university are bucketed under an empty-string key,
    which is left out of the report.

    Args:
        vk_elastic_db: Client handed to ``get_elastic_object``.
        size: Number of university buckets requested.
        is_need_other: Append an ``"other"`` row holding the aggregation's
            ``sum_other_doc_count``.
        is_need_print: Print a ranked table.
        is_need_plot: Save a horizontal bar chart as a PNG under
            ``save_path``.
        is_need_active: Restrict the search with ``get_active_users_filter``
            and append " active" to the title.
        days_delta: Forwarded to ``get_active_users_filter``.
    """
    agg_name = "university_count"
    no_university = ""
    title = "university count" + (" active" if is_need_active else "")

    es = get_elastic_object(vk_elastic_db)
    search = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        search = get_active_users_filter(es, index, search,
                                         days_delta=days_delta)
    by_university = elasticsearch_dsl.A('terms',
                                        field="university_name.keyword",
                                        missing=no_university,
                                        size=size)
    search.aggs.bucket(agg_name, by_university)
    universities = search.execute().aggregations[agg_name]

    # Keep every bucket except the placeholder for a missing university.
    kept = [
        bucket for bucket in universities.buckets
        if not bucket.key == no_university
    ]
    names = [bucket.key for bucket in kept]
    counts = [bucket.doc_count for bucket in kept]
    if is_need_other:
        names.append("other")
        counts.append(universities.sum_other_doc_count)

    if is_need_print:
        print(title)
        for rank, (name, count) in enumerate(zip(names, counts), start=1):
            print(f"{rank}\t{name} {count}")

    if is_need_plot:
        fig, ax = plt.subplots(1, 1)
        ax.set_title(title)
        ax.barh(names, counts)
        fig.savefig(f"{save_path}/{title.replace(' ', '_')}.png",
                    dpi=300,
                    format='png',
                    bbox_inches='tight')
        plt.close(fig)
        '''
コード例 #11
0
def has_mobile(vk_elastic_db: es_client.VkDataDatabaseClient,
               size=1,
               is_need_other=True,
               is_need_print=False,
               is_need_plot=True,
               is_need_active=False,
               days_delta=20):
    """Report the distribution of the ``has_mobile`` field as a pie chart.

    Documents without the field are counted under the key ``"0"``.

    Args:
        vk_elastic_db: Client handed to ``get_elastic_object``.
        size: Number of ``has_mobile`` buckets requested.
        is_need_other: Append an ``"other"`` slice holding the aggregation's
            ``sum_other_doc_count``.
        is_need_print: Print a ranked table.
        is_need_plot: Save a pie chart as a PNG under ``save_path``.
        is_need_active: Restrict the search with ``get_active_users_filter``
            and append " active" to the title.
        days_delta: Forwarded to ``get_active_users_filter``.
    """
    agg_name = "has_mobile"
    title = "has_mobile" + (" active" if is_need_active else "")

    es = get_elastic_object(vk_elastic_db)
    search = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        search = get_active_users_filter(es, index, search,
                                         days_delta=days_delta)
    by_flag = elasticsearch_dsl.A('terms',
                                  field="has_mobile",
                                  size=size,
                                  missing="0")
    search.aggs.bucket(agg_name, by_flag)
    flags = search.execute().aggregations[agg_name]

    # Bucket keys are translated to readable slice labels.
    labels_by_key = {"1": "has", "0": "has not", "-1": "missing"}
    labels = [labels_by_key[str(bucket.key)] for bucket in flags.buckets]
    counts = [bucket.doc_count for bucket in flags.buckets]
    if is_need_other:
        labels.append("other")
        counts.append(flags.sum_other_doc_count)

    if is_need_print:
        print(title)
        for rank, (label, count) in enumerate(zip(labels, counts), start=1):
            print(f"{rank}\t{label} {count}")

    if is_need_plot:
        total = sum(counts)
        shares = [count / total for count in counts]
        fig, ax = plt.subplots(1, 1)
        ax.set_title(title)
        ax.pie(shares, labels=labels, autopct='%1.1f%%', startangle=90)
        fig.savefig(f"{save_path}/{title.replace(' ', '_')}.png",
                    dpi=300,
                    format='png',
                    bbox_inches='tight')
        plt.close(fig)
def count_by_country(vk_elastic_db: es_client.VkDataDatabaseClient, size=10, is_need_other=True, is_need_print=False,
                     is_need_plot=True, is_need_active=False, days_delta=20):
    """Aggregate documents by ``country.title.keyword`` and hand the result to ``response_process``.

    Args:
        vk_elastic_db: Client handed to ``get_elastic_object``.
        size: Number of country buckets requested.
        is_need_other: Forwarded to ``response_process``.
        is_need_print: Forwarded to ``response_process``.
        is_need_plot: Forwarded to ``response_process``.
        is_need_active: Restrict the search with ``get_active_users_filter``
            and append " active" to the title.
        days_delta: Forwarded to ``get_active_users_filter``.
    """
    agg_name = "countries_count"
    title = "count by country" + (" active" if is_need_active else "")

    es = get_elastic_object(vk_elastic_db)
    search = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        search = get_active_users_filter(es, index, search, days_delta=days_delta)

    search.aggs.bucket(agg_name, elasticsearch_dsl.A('terms', field="country.title.keyword", size=size))
    response_process(search.execute(), agg_name, title, is_need_other, is_need_print, is_need_plot)
コード例 #13
0
def has_city(vk_elastic_db: es_client.VkDataDatabaseClient,
             is_need_print=False,
             is_need_plot=True,
             is_need_active=False,
             days_delta=20):
    """Report how many documents have a ``city.title.keyword`` value and how many do not.

    Documents without a city are bucketed under the placeholder key
    ``"missing"``. Every other bucket, plus the documents the aggregation
    did not return as individual buckets, counts as "has city".

    Args:
        vk_elastic_db: Client handed to ``get_elastic_object``.
        is_need_print: Print the two totals.
        is_need_plot: Save a pie chart as a PNG under ``save_path``.
        is_need_active: Restrict the search with ``get_active_users_filter``
            and append " active" to the title.
        days_delta: Forwarded to ``get_active_users_filter``.
    """
    aggs_name = "has_city"
    title = "has city"
    if is_need_active:
        title += " active"
    es = get_elastic_object(vk_elastic_db)
    s = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        s = get_active_users_filter(es, index, s, days_delta=days_delta)
    size = 10000
    missing_str = "missing"
    a = elasticsearch_dsl.A('terms',
                            field="city.title.keyword",
                            missing=missing_str,
                            size=size)
    s.aggs.bucket(aggs_name, a)
    cities = s.execute().aggregations[aggs_name]

    data = {"has city": 0, "missing city": 0}
    for hit in cities.buckets:
        if hit.key == missing_str:
            data["missing city"] += hit.doc_count
        else:
            data["has city"] += hit.doc_count
    # Only the `size` largest buckets are returned individually; the rest are
    # reported in sum_other_doc_count and would otherwise be lost.
    # NOTE(review): this assumes the "missing" bucket is among the returned
    # buckets, which holds unless it is smaller than `size` city buckets.
    data["has city"] += cities.sum_other_doc_count

    x_axis = list(data)
    y_axis = list(data.values())

    if is_need_print:
        print(title)
        for rank, (label, count) in enumerate(zip(x_axis, y_axis), start=1):
            print(f"{rank}\t{label} {count}")

    if is_need_plot:
        total = sum(y_axis)
        sizes = [elem / total for elem in y_axis]
        fig, ax = plt.subplots(1, 1)
        ax.set_title(title)
        ax.pie(sizes, labels=x_axis, autopct='%1.1f%%', startangle=90)
        fig.savefig(f"{save_path}/{title.replace(' ', '_')}.png",
                    dpi=300,
                    format='png',
                    bbox_inches='tight')
        plt.close(fig)
def name_count(vk_elastic_db: es_client.VkDataDatabaseClient,
               aggs_name,
               sex=None,
               size=10,
               is_need_active=False,
               days_delta=20):
    """Run a ``terms`` aggregation on ``first_name.keyword`` and return the raw response.

    Args:
        vk_elastic_db: Client handed to ``get_elastic_object``.
        aggs_name: Name under which the aggregation is registered.
        sex: When not ``None``, only documents whose ``sex`` term equals this
            value are searched.
        size: Number of first-name buckets requested.
        is_need_active: Restrict the search with ``get_active_users_filter``.
        days_delta: Forwarded to ``get_active_users_filter``.

    Returns:
        The response of executing the search.
    """
    es = get_elastic_object(vk_elastic_db)
    search = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        search = get_active_users_filter(es, index, search,
                                         days_delta=days_delta)
    if sex is not None:
        search = search.filter('term', sex=sex)

    first_names = elasticsearch_dsl.A('terms',
                                      field="first_name.keyword",
                                      size=size)
    search.aggs.bucket(aggs_name, first_names)
    return search.execute()
コード例 #15
0
def get_elasticsearch_index_samples(elasticsearch_index):
    """Return the sample ids found in an index together with the index metadata.

    The aggregated field is the first entry of ``SAMPLE_FIELDS_LIST`` that is
    present in the index metadata's ``'fields'``. ``StopIteration``
    propagates when none is present.

    Args:
        elasticsearch_index: Name of the index to inspect.

    Returns:
        A ``(sample_ids, index_metadata)`` tuple: the keys of up to 10000
        term buckets, and the metadata entry for ``elasticsearch_index``.
    """
    client = get_es_client()
    all_metadata = get_index_metadata(elasticsearch_index,
                                      client,
                                      include_fields=True)
    index_metadata = all_metadata.get(elasticsearch_index)

    sample_field = next(candidate for candidate in SAMPLE_FIELDS_LIST
                        if candidate in index_metadata['fields'].keys())

    # size=0: only the aggregation is needed, not the matching documents.
    search = elasticsearch_dsl.Search(using=client,
                                      index=elasticsearch_index)
    search = search.params(size=0)
    sample_terms = elasticsearch_dsl.A('terms', field=sample_field, size=10000)
    search.aggs.bucket('sample_ids', sample_terms)
    buckets = search.execute().aggregations.sample_ids.buckets

    sample_ids = [bucket['key'] for bucket in buckets]
    return sample_ids, index_metadata
コード例 #16
0
ファイル: dataset_utils.py プロジェクト: evrimulgen/seqr
def _get_elasticsearch_index_samples(elasticsearch_index, project):
    """Return the sample ids stored in an index.

    When the mapping of ``elasticsearch_index`` has a ``join_field``, the ids
    are read from a terms aggregation on ``sample_id``. Otherwise they are
    derived from the mapped field names by taking the part before
    ``_num_alt``.

    Args:
        elasticsearch_index: Index name; mappings are looked up with a
            trailing ``*`` wildcard.
        project: Used to count the project's individuals, which caps the
            number of aggregation buckets.

    Returns:
        A list of bucket keys for an index with a ``join_field``, otherwise a
        set of sample ids.

    Raises:
        Exception: The index was not found, the transport failed, or no
            sample fields were found.
    """
    sample_field_suffix = '_num_alt'

    client = get_es_client(timeout=30)
    wildcard_index = elasticsearch_dsl.Index(
        '{}*'.format(elasticsearch_index), using=client)
    requested_fields = ['*{}'.format(sample_field_suffix), 'join_field']
    try:
        field_mapping = wildcard_index.get_field_mapping(
            fields=requested_fields, doc_type=[VARIANT_DOC_TYPE])
    except NotFoundError:
        raise Exception('Index "{}" not found'.format(elasticsearch_index))
    except TransportError as e:
        raise Exception(e.error)

    #  Nested genotypes
    exact_index_mapping = field_mapping.get(elasticsearch_index, {})
    exact_doc_fields = exact_index_mapping.get('mappings', {}).get(
        VARIANT_DOC_TYPE, {})
    if exact_doc_fields.get('join_field'):
        max_samples = Individual.objects.filter(
            family__project=project).count()
        search = elasticsearch_dsl.Search(using=client,
                                          index=elasticsearch_index)
        search = search.params(size=0)
        sample_terms = elasticsearch_dsl.A('terms',
                                           field='sample_id',
                                           size=max_samples)
        search.aggs.bucket('sample_ids', sample_terms)
        buckets = search.execute().aggregations.sample_ids.buckets
        return [bucket['key'] for bucket in buckets]

    # Flat layout: one mapped field per sample, named <sample><suffix>.
    samples = set()
    for index_mapping in field_mapping.values():
        doc_fields = index_mapping.get('mappings',
                                       {}).get(VARIANT_DOC_TYPE, {})
        samples.update(
            field_name.split(sample_field_suffix)[0]
            for field_name in doc_fields.keys())
    if not samples:
        raise Exception('No sample fields found for index "{}"'.format(
            elasticsearch_index))
    return samples
コード例 #17
0
    def test_you_can_make_aggregations_on_tags_raw(self, factories, index,
                                                   search):
        """Tags differing only in case get separate ``tags_raw`` term buckets."""
        index(
            factories.Annotation.build(id="test_annotation_id_1",
                                       tags=["Hello"]),
            factories.Annotation.build(id="test_annotation_id_2",
                                       tags=["hello"]),
        )

        search.aggs.bucket('tags_raw_terms',
                           elasticsearch_dsl.A('terms', field='tags_raw'))
        response = search.execute()

        def bucket_for(tag):
            # First bucket whose key equals the tag exactly.
            return next(
                candidate
                for candidate in response.aggregations.tags_raw_terms.buckets
                if candidate["key"] == tag)

        capitalized_bucket = bucket_for("Hello")
        lowercase_bucket = bucket_for("hello")

        assert capitalized_bucket["doc_count"] == 1
        assert lowercase_bucket["doc_count"] == 1
コード例 #18
0
    def test_you_can_make_aggregations_on_user_raw(self, factories, index,
                                                   search):
        """Each distinct userid gets its own ``user_raw`` term bucket."""
        # The two userids must differ, otherwise both annotations land in one
        # bucket and the doc_count == 1 assertions below cannot hold.
        # NOTE(review): these addresses differ only in case, mirroring the
        # tags_raw test; confirm them against the upstream test.
        userid_1 = "acct:someone@example.com"
        userid_2 = "acct:Someone@example.com"

        annotation_1 = factories.Annotation.build(userid=userid_1)
        annotation_2 = factories.Annotation.build(userid=userid_2)

        index(annotation_1, annotation_2)

        user_aggregation = elasticsearch_dsl.A('terms', field='user_raw')
        search.aggs.bucket('user_raw_terms', user_aggregation)

        response = search.execute()

        user_bucket_1 = next(
            bucket for bucket in response.aggregations.user_raw_terms.buckets
            if bucket["key"] == userid_1)
        user_bucket_2 = next(
            bucket for bucket in response.aggregations.user_raw_terms.buckets
            if bucket["key"] == userid_2)

        assert user_bucket_1["doc_count"] == 1
        assert user_bucket_2["doc_count"] == 1
コード例 #19
0
def _get_elasticsearch_index_samples(elasticsearch_index):
    """Return the sample ids stored in an index.

    For an index that ``is_nested_genotype_index`` accepts, the ids are read
    from a terms aggregation on ``samples_num_alt_1``. Otherwise they are
    derived from the mapped field names by taking the part before
    ``_num_alt``.

    Args:
        elasticsearch_index: Index name; mappings are looked up with a
            trailing ``*`` wildcard.

    Returns:
        A list of bucket keys for a nested-genotype index, otherwise a set of
        sample ids.

    Raises:
        Exception: The index was not found, the transport failed, or no
            sample fields were found.
    """
    client = get_es_client()

    #  Nested genotypes
    if is_nested_genotype_index(elasticsearch_index):
        search = elasticsearch_dsl.Search(using=client,
                                          index=elasticsearch_index)
        search = search.params(size=0)
        sample_terms = elasticsearch_dsl.A('terms',
                                           field='samples_num_alt_1',
                                           size=10000)
        search.aggs.bucket('sample_ids', sample_terms)
        buckets = search.execute().aggregations.sample_ids.buckets
        return [bucket['key'] for bucket in buckets]

    sample_field_suffix = '_num_alt'
    wildcard_index = elasticsearch_dsl.Index(
        '{}*'.format(elasticsearch_index), using=client)
    requested_fields = ['*{}'.format(sample_field_suffix)]
    try:
        field_mapping = wildcard_index.get_field_mapping(
            fields=requested_fields, doc_type=[VARIANT_DOC_TYPE])
    except NotFoundError:
        raise Exception('Index "{}" not found'.format(elasticsearch_index))
    except TransportError as e:
        raise Exception(e.error)

    # Flat layout: one mapped field per sample, named <sample><suffix>.
    samples = set()
    for index_mapping in field_mapping.values():
        doc_fields = index_mapping.get('mappings',
                                       {}).get(VARIANT_DOC_TYPE, {})
        samples.update(
            field_name.split(sample_field_suffix)[0]
            for field_name in doc_fields.keys())
    if not samples:
        raise Exception('No sample fields found for index "{}"'.format(
            elasticsearch_index))
    return samples
コード例 #20
0
# Scripted-metric aggregation that accumulates, per bucket, the offered and
# claimed amount of every resource. The scripts are Painless source passed to
# Elasticsearch as-is. `options`, RESOURCES and STATUSES come from elsewhere
# in the module and are read once, when this statement runs.
summarize_resources = edsl.A(
    "scripted_metric",
    # init: remember the interval and zero one "claimed.<status>.<resource>"
    # counter per status/resource pair plus one "offered.<resource>" counter
    # per resource.
    init_script="""
        state.interval = (Long)(params.interval);
        HashMap metrics = new HashMap();
        for (resource in params.RESOURCES) {
            for (status in params.STATUSES) {
                String key = "claimed."+status+"."+resource;
                metrics.put(key, 0.0);
            }
            metrics.put("offered."+resource, 0.0);
        }
        state.metrics = metrics;
        """,
    # map: weight each document by (right - left) divided by the total width
    # of the interval-sized buckets its time range spans, then add the
    # weighted capacity to "offered" and the weighted occupancy * capacity to
    # "claimed". Resources without a "Total<resource>" field are skipped.
    map_script="""
        // The time range of each item intersects one or more buckets, but does not
        // necessarily overlap each completely. Ideally we would use the exact overlap
        // fraction to weight contributions to each bucket, but since Elastic does not
        // give us access to the bucket key, we have to settle for the average overlap
        // fraction.
        long left = doc[params.left].value.toInstant().toEpochMilli();
        long right = doc[params.right].value.toInstant().toEpochMilli();
        long total_interval = (state.interval*((right+params.interval)/state.interval-left/state.interval));
        double active_fraction = (right-left).doubleValue()/total_interval.doubleValue();
        HashMap metrics = state.metrics;
        for (resource in params.RESOURCES) {
            if (!doc.containsKey("Total"+resource)) {
                continue;
            }
            double capacity = doc["Total"+resource].value.doubleValue();
            for (status in params.STATUSES) {
                String source = "occupancy."+status+"."+resource;
                String dest = "claimed."+status+"."+resource;
                if (doc.containsKey(source)) {
                    metrics[dest] += active_fraction*doc[source].value*capacity;
                }
            }
            metrics["offered."+resource] += active_fraction*capacity;
        }
        """,
    # combine: hand the accumulated metrics map to the reduce phase.
    combine_script="""
        return state.metrics;
        """,
    # reduce: sum the metrics maps key by key, ignoring null states.
    reduce_script="""
        Map aggregate = new HashMap();
        for (state in states) {
            if (state == null) {
                continue;
            }
            for (entry in state.entrySet()) {
                if (aggregate.containsKey(entry.getKey())) {
                    aggregate[entry.getKey()] += entry.getValue();
                } else {
                    aggregate[entry.getKey()] = entry.getValue();
                }
            }
        }
        return aggregate;
        """,
    params={
        # Names of the document fields holding the start and end of each
        # document's time range.
        "left": "DaemonStartTime",
        "right": "LastHeardFrom",
        # Bucket width in milliseconds.
        "interval": int(options.interval.total_seconds() * 1000),
        "RESOURCES": RESOURCES,
        # "total" is added to the statuses handled by the scripts
        # (STATUSES is presumably a tuple; verify where it is defined).
        "STATUSES": STATUSES + ("total", ),
    },
)
コード例 #21
0
ファイル: facets.py プロジェクト: piledirect/photo-manager
 def aggregates(self):
     """Return two ``(name, aggregation)`` pairs: the facet's own aggregation and a missing-value count."""
     value_agg = esd.A(self.agg_type, field=self.name, size=5)
     missing_agg = esd.A("missing", field=self.name)
     return [
         (self.name, value_agg),
         ("%s_missing" % self.name, missing_agg),
     ]