def active_users_pie(vk_elastic_db: es_client.VkDataDatabaseClient,
                     days_delta=20,
                     is_need_print=False,
                     is_need_plot=True):
    """Compare active users with all users and draw the split as a pie chart.

    Counts the documents in ``index`` that pass ``get_active_users_filter``
    and the documents in ``index`` overall; the difference between the two
    is reported as "inactive".

    Args:
        vk_elastic_db: Client handed to ``get_elastic_object``.
        days_delta: Forwarded to ``get_active_users_filter``.
        is_need_print: Print the title and both counts when true.
        is_need_plot: Save the chart to ``{save_path}/active_users.png``
            when true. Nothing is drawn when both counts are zero.
    """
    title = "active users"
    es = get_elastic_object(vk_elastic_db)
    active_search = get_active_users_filter(
        es, index, elasticsearch_dsl.Search(using=es, index=index),
        days_delta=days_delta)

    # Only the hit counts are needed, so the searches are counted rather
    # than executed.
    total_num = elasticsearch_dsl.Search(using=es, index=index).count()
    active_num = active_search.count()

    x_axis = ["active", "inactive"]
    y_axis = [active_num, total_num - active_num]
    if is_need_print:
        print(title)
        for position, (label, value) in enumerate(zip(x_axis, y_axis),
                                                  start=1):
            print(f"{position}\t{label} {value}")

    if not is_need_plot:
        return

    total = sum(y_axis)
    if not total:
        # With no documents there are no shares to draw, and dividing by
        # the zero total would raise ZeroDivisionError.
        return

    sizes = [value / total for value in y_axis]
    fig, ax = plt.subplots(1, 1)
    ax.set_title(title)
    ax.pie(sizes, labels=x_axis, autopct='%1.1f%%', startangle=90)
    fig.savefig(f"{save_path}/{title.replace(' ', '_')}.png",
                dpi=300,
                format='png',
                bbox_inches='tight')
    plt.close(fig)
def count_by_country(vk_elastic_db: es_client.VkDataDatabaseClient,
                     size=10,
                     is_need_active=False,
                     days_delta=20):
    """Collect per-country user counts from a terms aggregation.

    Only documents whose ``country.title.keyword`` field exists and is not
    the empty string are aggregated.

    Args:
        vk_elastic_db: Client handed to ``get_elastic_object``.
        size: Number of country buckets requested from the aggregation.
        is_need_active: Restrict the search with
            ``get_active_users_filter`` when true.
        days_delta: Forwarded to ``get_active_users_filter``.

    Returns:
        A list with one ``{'country': ..., 'count': ...}`` dict per
        returned bucket, after the list has been passed to
        ``add_geoposition``.
    """
    country_aggs_name = "country_count"
    es = get_elastic_object(vk_elastic_db)
    s = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        s = get_active_users_filter(es, index, s, days_delta=days_delta)
    s = s.filter(
        "bool",
        must=[elasticsearch_dsl.Q("exists", field="country.title.keyword")])
    # ``__`` in the keyword name stands for ``.`` in the field path, so the
    # name has to spell ``country.title.keyword`` exactly; a misspelt field
    # matches nothing and the exclusion silently does nothing.
    s = s.filter(
        "bool",
        must_not=[elasticsearch_dsl.Q("match", country__title__keyword="")])
    a = elasticsearch_dsl.A('terms', field="country.title.keyword", size=size)
    s.aggs.bucket(country_aggs_name, a)
    response = s.execute()

    data = [
        {'country': country_hit.key, 'count': country_hit.doc_count}
        for country_hit in response.aggregations[country_aggs_name].buckets
    ]
    add_geoposition(data)
    return data
def last_name_count(vk_elastic_db: es_client.VkDataDatabaseClient,
                    size=10,
                    is_need_other=False,
                    is_need_print=False,
                    is_need_plot=True,
                    is_need_active=False,
                    days_delta=20):
    """Report the most frequent last names, grouped by the ``sex`` field.

    A terms aggregation on ``sex`` (two buckets, missing values keyed as
    ``"-1"``) wraps a terms aggregation on ``last_name.keyword``. Each sex
    bucket is printed and/or saved as its own horizontal bar chart.

    Args:
        vk_elastic_db: Client handed to ``get_elastic_object``.
        size: Number of last-name buckets requested per sex bucket.
        is_need_other: Append an "other" entry holding the inner
            aggregation's ``sum_other_doc_count``.
        is_need_print: Print each title followed by the numbered entries.
        is_need_plot: Save one PNG per sex bucket under ``save_path``.
        is_need_active: Restrict the search with
            ``get_active_users_filter`` and add " active" to the title.
        days_delta: Forwarded to ``get_active_users_filter``.
    """
    sex_aggs_name = "sex_aggs"
    aggs_name = "last_name_count"
    title = "last name count" + (" active" if is_need_active else "")

    es = get_elastic_object(vk_elastic_db)
    search = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        search = get_active_users_filter(es, index, search,
                                         days_delta=days_delta)
    by_sex = elasticsearch_dsl.A('terms', field="sex", missing="-1", size=2)
    by_last_name = elasticsearch_dsl.A('terms',
                                       field="last_name.keyword",
                                       size=size)
    search.aggs.bucket(sex_aggs_name, by_sex).bucket(aggs_name, by_last_name)
    response = search.execute()

    # Human-readable names for the stringified ``sex`` bucket keys.
    sex_labels = {"0": "unknown", "1": "woman", "2": "man", "-1": "missing"}
    per_sex = {}
    for sex_bucket in response.aggregations[sex_aggs_name].buckets:
        names_agg = sex_bucket[aggs_name]
        names = [bucket.key for bucket in names_agg.buckets]
        counts = [bucket.doc_count for bucket in names_agg.buckets]
        if is_need_other:
            names.append("other")
            counts.append(names_agg.sum_other_doc_count)
        per_sex[sex_labels[str(sex_bucket.key)]] = {
            "x_axis": names,
            "y_axis": counts,
        }

    file_stem = title.replace(' ', '_')
    for sex, axes in per_sex.items():
        names, counts = axes["x_axis"], axes["y_axis"]
        cur_title = f"{title}\n{sex}"

        if is_need_print:
            print(cur_title)
            for position, (name, count) in enumerate(zip(names, counts),
                                                     start=1):
                print(f"{position}\t{name} {count}")

        if not is_need_plot:
            continue
        fig, ax = plt.subplots(1, 1)
        ax.set_title(cur_title)
        ax.barh(names, counts)
        fig.savefig(f"{save_path}/{file_stem}_{sex}.png",
                    dpi=300,
                    format='png',
                    bbox_inches='tight')
        plt.close(fig)
def count_by_university_order_by_country(vk_elastic_db: es_client.VkDataDatabaseClient, size=10, is_need_other=True,
                                         is_need_print=False, is_need_plot=True, is_need_active=False, days_delta=20):
    """Report the most frequent universities inside each top country.

    The outer terms aggregation groups by ``country.title.keyword``; the
    inner one groups each country's documents by
    ``university_name.keyword``. Documents without a university are keyed
    with the empty string and left out of the per-country lists.

    Args:
        vk_elastic_db: Client handed to ``get_elastic_object``.
        size: Number of buckets requested at both aggregation levels.
        is_need_other: Append an "other" entry holding the university
            aggregation's ``sum_other_doc_count`` for that country.
        is_need_print: Print each title followed by the numbered entries.
        is_need_plot: Save one horizontal bar chart per country under
            ``save_path``.
        is_need_active: Restrict the search with
            ``get_active_users_filter`` and add " active" to the title.
        days_delta: Forwarded to ``get_active_users_filter``.
    """
    country_aggs_name = "country_count"
    university_aggs_name = "university_count"
    title = "count university by country"
    if is_need_active:
        title += " active"
    missing_str = ""
    es = get_elastic_object(vk_elastic_db)
    s = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        s = get_active_users_filter(es, index, s, days_delta=days_delta)
    country_agg = elasticsearch_dsl.A('terms', field="country.title.keyword", size=size,
                                      collect_mode="breadth_first")
    university_agg = elasticsearch_dsl.A('terms', field="university_name.keyword", missing=missing_str, size=size)
    # Country is the outer level and university the nested one, so each
    # aggregation is registered under the name that describes it.
    s.aggs.bucket(country_aggs_name, country_agg).bucket(university_aggs_name, university_agg)
    response = s.execute()

    data_dict = {}
    for country_hit in response.aggregations[country_aggs_name].buckets:
        universities = country_hit[university_aggs_name]
        # Skip the bucket that stands for "no university given".
        named = [hit for hit in universities.buckets if hit.key != missing_str]
        x_axis = [hit.key for hit in named]
        y_axis = [hit.doc_count for hit in named]
        if is_need_other:
            x_axis.append("other")
            y_axis.append(universities.sum_other_doc_count)
        data_dict[country_hit.key] = {"x_axis": x_axis, "y_axis": y_axis}

    for country, axes in data_dict.items():
        x_axis = axes["x_axis"]
        y_axis = axes["y_axis"]
        cur_title = f"{title}\n{country}"
        figname = f"{title.replace(' ', '_')}_{country}"
        if is_need_print:
            print(cur_title)
            for position, (label, value) in enumerate(zip(x_axis, y_axis), start=1):
                print(f"{position}\t{label} {value}")

        if is_need_plot:
            fig, ax = plt.subplots(1, 1)
            ax.set_title(cur_title)
            ax.barh(x_axis, y_axis)
            fig.savefig(f"{save_path}/{figname}.png", dpi=300, format='png', bbox_inches='tight')
            plt.close(fig)
def count_by_university(vk_elastic_db: es_client.VkDataDatabaseClient,
                        size=10,
                        is_need_other=True,
                        is_need_print=False,
                        is_need_plot=True,
                        is_need_active=False,
                        days_delta=20):
    """Report the most frequent values of ``university_name.keyword``.

    Documents without a university are keyed with the empty string by the
    aggregation and dropped from the report.

    Args:
        vk_elastic_db: Client handed to ``get_elastic_object``.
        size: Number of buckets requested from the terms aggregation.
        is_need_other: Append an "other" entry holding the aggregation's
            ``sum_other_doc_count``.
        is_need_print: Print the title followed by the numbered entries.
        is_need_plot: Save a horizontal bar chart as a PNG under
            ``save_path``.
        is_need_active: Restrict the search with
            ``get_active_users_filter`` and add " active" to the title.
        days_delta: Forwarded to ``get_active_users_filter``.
    """
    aggs_name = "university_count"
    missing_str = ""
    title = "university count" + (" active" if is_need_active else "")

    es = get_elastic_object(vk_elastic_db)
    search = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        search = get_active_users_filter(es, index, search,
                                         days_delta=days_delta)
    search.aggs.bucket(
        aggs_name,
        elasticsearch_dsl.A('terms',
                            field="university_name.keyword",
                            missing=missing_str,
                            size=size))
    result = search.execute().aggregations[aggs_name]

    # Leave out the bucket that collects documents with no university.
    named = [bucket for bucket in result.buckets if bucket.key != missing_str]
    labels = [bucket.key for bucket in named]
    counts = [bucket.doc_count for bucket in named]
    if is_need_other:
        labels.append("other")
        counts.append(result.sum_other_doc_count)

    if is_need_print:
        print(title)
        for position, (label, count) in enumerate(zip(labels, counts),
                                                  start=1):
            print(f"{position}\t{label} {count}")

    if not is_need_plot:
        return
    fig, ax = plt.subplots(1, 1)
    ax.set_title(title)
    ax.barh(labels, counts)
    fig.savefig(f"{save_path}/{title.replace(' ', '_')}.png",
                dpi=300,
                format='png',
                bbox_inches='tight')
    plt.close(fig)
# Example #6
# 0
def has_mobile(vk_elastic_db: es_client.VkDataDatabaseClient,
               size=1,
               is_need_other=True,
               is_need_print=False,
               is_need_plot=True,
               is_need_active=False,
               days_delta=20):
    """Report the distribution of the ``has_mobile`` field as a pie chart.

    Runs a terms aggregation on ``has_mobile`` in which documents without
    the field are counted under ``"0"``.

    Args:
        vk_elastic_db: Client handed to ``get_elastic_object``.
        size: Number of buckets requested from the terms aggregation.
        is_need_other: Append an "other" entry holding the aggregation's
            ``sum_other_doc_count``.
        is_need_print: Print the title followed by the numbered entries.
        is_need_plot: Save the pie chart as a PNG under ``save_path``.
            Nothing is drawn when the counts add up to zero.
        is_need_active: Restrict the search with
            ``get_active_users_filter`` and add " active" to the title.
        days_delta: Forwarded to ``get_active_users_filter``.

    Raises:
        KeyError: If a bucket key is not one of the labelled values.
    """
    aggs_name = "has_mobile"
    title = "has_mobile"
    if is_need_active:
        title += " active"
    es = get_elastic_object(vk_elastic_db)
    s = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        s = get_active_users_filter(es, index, s, days_delta=days_delta)
    a = elasticsearch_dsl.A('terms',
                            field="has_mobile",
                            size=size,
                            missing="0")
    s.aggs.bucket(aggs_name, a)
    result = s.execute().aggregations[aggs_name]

    # Display names for the stringified bucket keys.
    label_dict = {"1": "has", "0": "has not", "-1": "missing"}
    x_axis = [label_dict[str(hit.key)] for hit in result.buckets]
    y_axis = [hit.doc_count for hit in result.buckets]
    if is_need_other:
        x_axis.append("other")
        y_axis.append(result.sum_other_doc_count)

    if is_need_print:
        print(title)
        for position, (label, value) in enumerate(zip(x_axis, y_axis),
                                                  start=1):
            print(f"{position}\t{label} {value}")

    if not is_need_plot:
        return

    total = sum(y_axis)
    if not total:
        # No matching documents: there are no shares to draw, and dividing
        # by the zero total would raise ZeroDivisionError.
        return

    sizes = [value / total for value in y_axis]
    fig, ax = plt.subplots(1, 1)
    ax.set_title(title)
    ax.pie(sizes, labels=x_axis, autopct='%1.1f%%', startangle=90)
    fig.savefig(f"{save_path}/{title.replace(' ', '_')}.png",
                dpi=300,
                format='png',
                bbox_inches='tight')
    plt.close(fig)
def count_by_country(vk_elastic_db: es_client.VkDataDatabaseClient, size=10, is_need_other=True, is_need_print=False,
                     is_need_plot=True, is_need_active=False, days_delta=20):
    """Aggregate users by ``country.title.keyword`` and hand the result to ``response_process``.

    Args:
        vk_elastic_db: Client handed to ``get_elastic_object``.
        size: Number of buckets requested from the terms aggregation.
        is_need_other: Forwarded to ``response_process``.
        is_need_print: Forwarded to ``response_process``.
        is_need_plot: Forwarded to ``response_process``.
        is_need_active: Restrict the search with ``get_active_users_filter`` and add " active" to the title.
        days_delta: Forwarded to ``get_active_users_filter``.
    """
    aggs_name = "countries_count"
    title = "count by country" + (" active" if is_need_active else "")

    es = get_elastic_object(vk_elastic_db)
    search = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        search = get_active_users_filter(es, index, search, days_delta=days_delta)
    search.aggs.bucket(aggs_name, elasticsearch_dsl.A('terms', field="country.title.keyword", size=size))

    response = search.execute()
    response_process(response, aggs_name, title, is_need_other, is_need_print, is_need_plot)
def other_social_network(vk_elastic_db: es_client.VkDataDatabaseClient, is_need_print=False, is_need_plot=True,
                         is_need_active=False, days_delta=20):
    """Report how many users filled in at least one external-site field.

    A user counts as "has" when any of the ``twitter``, ``site``, ``skype``,
    ``livejournal``, ``instagram`` or ``facebook`` fields exists. The rest of
    the (optionally active-only) users are reported as "has not".

    Args:
        vk_elastic_db: Client handed to ``get_elastic_object``.
        is_need_print: Print the title and both counts when true.
        is_need_plot: Save the pie chart as a PNG under ``save_path``.
            Nothing is drawn when both counts are zero.
        is_need_active: Restrict both searches with ``get_active_users_filter`` and add " active" to the title.
        days_delta: Forwarded to ``get_active_users_filter``.
    """
    title = "has other site"
    if is_need_active:
        title += " active"
    es = get_elastic_object(vk_elastic_db)
    s = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        s = get_active_users_filter(es, index, s, days_delta=days_delta)

    # OR together one "field exists" clause per external site, in the listed order.
    site_fields = ("twitter", "site", "skype", "livejournal", "instagram", "facebook")
    q = None
    for field_name in site_fields:
        clause = elasticsearch_dsl.Q("bool", must=[elasticsearch_dsl.Q("exists", field=field_name)])
        q = clause if q is None else q | clause
    s = s.query(q)

    # Disabled alternative: select everyone who has nothing specified for
    # third-party sites by chaining one ``must_not exists`` filter per field:
    #     s = elasticsearch_dsl.Search(using=es, index=index)
    #     s = s.filter("bool", must_not=[elasticsearch_dsl.Q("exists", field="twitter")])
    #     ... (same for site, skype, livejournal, instagram, facebook)
    #     response = s.execute()

    total_search = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        total_search = get_active_users_filter(es, index, total_search, days_delta=days_delta)
    # Only the hit counts are needed, so the searches are counted rather than executed.
    total_num = total_search.count()
    other_sn_num = s.count()

    x_axis = ["has", "has not"]
    y_axis = [other_sn_num, total_num - other_sn_num]
    if is_need_print:
        print(title)
        for position, (label, value) in enumerate(zip(x_axis, y_axis), start=1):
            print(f"{position}\t{label} {value}")

    if not is_need_plot:
        return

    total = sum(y_axis)
    if not total:
        # No users to split: dividing by the zero total would raise ZeroDivisionError.
        return

    sizes = [value / total for value in y_axis]
    fig, ax = plt.subplots(1, 1)
    ax.set_title(title)
    ax.pie(sizes, labels=x_axis, autopct='%1.1f%%', startangle=90)
    fig.savefig(f"{save_path}/{title.replace(' ', '_')}.png", dpi=300, format='png', bbox_inches='tight')
    plt.close(fig)
def has_city(vk_elastic_db: es_client.VkDataDatabaseClient,
             is_need_print=False,
             is_need_plot=True,
             is_need_active=False,
             days_delta=20):
    """Report how many users have a city set versus none, as a pie chart.

    Runs a terms aggregation on ``city.title.keyword`` in which documents
    without the field are keyed as ``"missing"``, then sums the returned
    buckets into "has city" and "missing city".

    Args:
        vk_elastic_db: Client handed to ``get_elastic_object``.
        is_need_print: Print the title and both counts when true.
        is_need_plot: Save the pie chart as a PNG under ``save_path``.
            Nothing is drawn when both counts are zero.
        is_need_active: Restrict the search with
            ``get_active_users_filter`` and add " active" to the title.
        days_delta: Forwarded to ``get_active_users_filter``.
    """
    aggs_name = "has_city"
    title = "has city"
    if is_need_active:
        title += " active"
    es = get_elastic_object(vk_elastic_db)
    s = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        s = get_active_users_filter(es, index, s, days_delta=days_delta)
    size = 10000
    missing_str = "missing"
    a = elasticsearch_dsl.A('terms',
                            field="city.title.keyword",
                            missing=missing_str,
                            size=size)
    s.aggs.bucket(aggs_name, a)
    response = s.execute()

    # NOTE(review): only the returned buckets are summed; documents that
    # fall outside the top ``size`` buckets (``sum_other_doc_count``) are
    # not included in either total.
    data = {"has city": 0, "missing city": 0}
    for hit in response.aggregations[aggs_name].buckets:
        key = "missing city" if hit.key == missing_str else "has city"
        data[key] += hit.doc_count
    x_axis = list(data)
    y_axis = list(data.values())

    if is_need_print:
        print(title)
        for position, (label, value) in enumerate(zip(x_axis, y_axis),
                                                  start=1):
            print(f"{position}\t{label} {value}")

    if not is_need_plot:
        return

    total = sum(y_axis)
    if not total:
        # No matching documents: dividing by the zero total would raise
        # ZeroDivisionError.
        return

    sizes = [value / total for value in y_axis]
    fig, ax = plt.subplots(1, 1)
    ax.set_title(title)
    ax.pie(sizes, labels=x_axis, autopct='%1.1f%%', startangle=90)
    fig.savefig(f"{save_path}/{title.replace(' ', '_')}.png",
                dpi=300,
                format='png',
                bbox_inches='tight')
    plt.close(fig)