def active_users_pie(vk_elastic_db: es_client.VkDataDatabaseClient, days_delta=20,
                     is_need_print=False, is_need_plot=True):
    """Report how many indexed users are active versus inactive.

    Counts every document in the module-level ``index`` and the subset kept
    by ``get_active_users_filter``; the difference is reported as "inactive".

    Args:
        vk_elastic_db: client wrapper passed to ``get_elastic_object`` to
            obtain the Elasticsearch connection.
        days_delta: forwarded to ``get_active_users_filter`` (presumably the
            activity window in days -- confirm against that helper).
        is_need_print: when true, print the title followed by one numbered
            row per category.
        is_need_plot: when true, save a pie chart to
            ``{save_path}/active_users.png``.

    Raises:
        ZeroDivisionError: if a plot is requested while the total document
            count is zero (the shares are computed by dividing by the total).
    """
    title = "active users"
    es = get_elastic_object(vk_elastic_db)

    # Only counts are needed, so the search is never executed to fetch hits.
    active_search = get_active_users_filter(
        es, index, elasticsearch_dsl.Search(using=es, index=index), days_delta=days_delta)

    total_num = elasticsearch_dsl.Search(using=es, index=index).count()
    active_num = active_search.count()

    x_axis = ["active", "inactive"]
    y_axis = [active_num, total_num - active_num]

    if is_need_print:
        print(title)
        for row_number, (label, value) in enumerate(zip(x_axis, y_axis), start=1):
            print(f"{row_number}\t{label} {value}")

    if is_need_plot:
        total = sum(y_axis)
        sizes = [elem / total for elem in y_axis]
        fig, ax = plt.subplots(1, 1)
        ax.set_title(title)
        ax.pie(sizes, labels=x_axis, autopct='%1.1f%%', startangle=90)
        fig.savefig(f"{save_path}/{title.replace(' ', '_')}.png", dpi=300, format='png', bbox_inches='tight')
        plt.close(fig)
def count_by_country(vk_elastic_db: es_client.VkDataDatabaseClient, size=10, is_need_active=False,
                     days_delta=20):
    """Return the most frequent country titles with their document counts.

    Documents without a ``country.title.keyword`` value, or with an empty
    one, are excluded before a terms aggregation is run on that field.

    Args:
        vk_elastic_db: client wrapper passed to ``get_elastic_object`` to
            obtain the Elasticsearch connection.
        size: maximum number of country buckets requested.
        is_need_active: when true, first narrow the search with
            ``get_active_users_filter``.
        days_delta: forwarded to ``get_active_users_filter`` (presumably the
            activity window in days -- confirm against that helper).

    Returns:
        A list with one ``{'country': <title>, 'count': <doc_count>}`` dict
        per bucket, after the list has been handed to ``add_geoposition``
        (presumably enriched in place -- confirm against that helper).
    """
    country_aggs_name = "country_count"
    country_field = "country.title.keyword"

    es = get_elastic_object(vk_elastic_db)
    s = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        s = get_active_users_filter(es, index, s, days_delta=days_delta)

    s = s.filter("bool", must=[elasticsearch_dsl.Q("exists", field=country_field)])
    # The keyword argument is translated to the dotted field name
    # "country.title.keyword"; it must match the aggregated field exactly,
    # otherwise the exclusion of empty titles silently matches nothing.
    s = s.filter("bool", must_not=[elasticsearch_dsl.Q("match", country__title__keyword="")])

    s.aggs.bucket(country_aggs_name, elasticsearch_dsl.A('terms', field=country_field, size=size))
    response = s.execute()

    data = [
        {'country': country_hit.key, 'count': country_hit.doc_count}
        for country_hit in response.aggregations[country_aggs_name].buckets
    ]
    add_geoposition(data)
    return data
def last_name_count(vk_elastic_db: es_client.VkDataDatabaseClient, size=10, is_need_other=False,
                    is_need_print=False, is_need_plot=True, is_need_active=False, days_delta=20):
    """Report the most frequent last names, split by the ``sex`` field.

    Runs a terms aggregation on ``sex`` (two buckets, missing values mapped
    to "-1") with a nested terms aggregation on ``last_name.keyword``, then
    prints and/or plots one chart per sex bucket.

    Args:
        vk_elastic_db: client wrapper passed to ``get_elastic_object``.
        size: number of last-name buckets requested per sex bucket.
        is_need_other: when true, append an "other" entry holding the nested
            aggregation's ``sum_other_doc_count``.
        is_need_print: when true, print each chart's title and numbered rows.
        is_need_plot: when true, save one horizontal bar chart per sex bucket
            under ``save_path``.
        is_need_active: when true, narrow the search with
            ``get_active_users_filter`` and add " active" to the title.
        days_delta: forwarded to ``get_active_users_filter``.

    Raises:
        KeyError: if a sex bucket key is not one of "0", "1", "2", "-1".
    """
    aggs_name = "last_name_count"
    sex_aggs_name = "sex_aggs"
    sex_size = 2
    sex_dict = {"0": "unknown", "1": "woman", "2": "man", "-1": "missing"}

    title = "last name count"
    if is_need_active:
        title += " active"

    es = get_elastic_object(vk_elastic_db)
    s = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        s = get_active_users_filter(es, index, s, days_delta=days_delta)

    by_sex = elasticsearch_dsl.A('terms', field="sex", missing="-1", size=sex_size)
    by_last_name = elasticsearch_dsl.A('terms', field="last_name.keyword", size=size)
    s.aggs.bucket(sex_aggs_name, by_sex).bucket(aggs_name, by_last_name)
    response = s.execute()

    # Collect the chart data for each sex bucket, keyed by its readable label.
    data_dict = {}
    for sex_bucket in response.aggregations[sex_aggs_name].buckets:
        names_agg = sex_bucket[aggs_name]
        labels = [name_bucket.key for name_bucket in names_agg.buckets]
        counts = [name_bucket.doc_count for name_bucket in names_agg.buckets]
        if is_need_other:
            labels.append("other")
            counts.append(names_agg.sum_other_doc_count)
        data_dict[sex_dict[str(sex_bucket.key)]] = {"x_axis": labels, "y_axis": counts}

    for sex, series in data_dict.items():
        labels = series["x_axis"]
        counts = series["y_axis"]
        cur_title = f"{title}\n{sex}"
        figname = f"{title.replace(' ', '_')}_{sex}"

        if is_need_print:
            print(cur_title)
            for row_number, (label, value) in enumerate(zip(labels, counts), start=1):
                print(f"{row_number}\t{label} {value}")

        if is_need_plot:
            fig, ax = plt.subplots(1, 1)
            ax.set_title(cur_title)
            ax.barh(labels, counts)
            fig.savefig(f"{save_path}/{figname}.png", dpi=300, format='png', bbox_inches='tight')
            plt.close(fig)
def count_by_university_order_by_country(vk_elastic_db: es_client.VkDataDatabaseClient, size=10,
                                         is_need_other=True, is_need_print=False, is_need_plot=True,
                                         is_need_active=False, days_delta=20):
    """Report the most frequent universities within each of the top countries.

    Runs a terms aggregation on ``country.title.keyword`` with a nested terms
    aggregation on ``university_name.keyword``, then prints and/or plots one
    chart per country bucket.

    Args:
        vk_elastic_db: client wrapper passed to ``get_elastic_object``.
        size: number of buckets requested at both aggregation levels.
        is_need_other: when true, append an "other" entry holding the nested
            aggregation's ``sum_other_doc_count``.
        is_need_print: when true, print each chart's title and numbered rows.
        is_need_plot: when true, save one horizontal bar chart per country
            under ``save_path``.
        is_need_active: when true, narrow the search with
            ``get_active_users_filter`` and add " active" to the title.
        days_delta: forwarded to ``get_active_users_filter``.
    """
    country_aggs_name = "country_count"
    university_aggs_name = "university_count"
    title = "count university by country"
    if is_need_active:
        title += " active"
    # Documents without a university are bucketed under this key and then
    # dropped from the per-country series below.
    missing_str = ""

    es = get_elastic_object(vk_elastic_db)
    s = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        s = get_active_users_filter(es, index, s, days_delta=days_delta)

    country_agg = elasticsearch_dsl.A('terms', field="country.title.keyword", size=size,
                                      collect_mode="breadth_first")
    university_agg = elasticsearch_dsl.A('terms', field="university_name.keyword", missing=missing_str,
                                         size=size)
    # Countries are the outer buckets and universities the nested ones, so
    # each aggregation is registered under the name that describes it.
    s.aggs.bucket(country_aggs_name, country_agg).bucket(university_aggs_name, university_agg)
    response = s.execute()

    data_dict = {}
    for country_hit in response.aggregations[country_aggs_name].buckets:
        universities = country_hit[university_aggs_name]
        named = [hit for hit in universities.buckets if hit.key != missing_str]
        x_axis = [hit.key for hit in named]
        y_axis = [hit.doc_count for hit in named]
        if is_need_other:
            x_axis.append("other")
            y_axis.append(universities.sum_other_doc_count)
        data_dict[country_hit.key] = {"x_axis": x_axis, "y_axis": y_axis}

    for country, series in data_dict.items():
        x_axis = series["x_axis"]
        y_axis = series["y_axis"]
        cur_title = f"{title}\n{country}"
        figname = f"{title.replace(' ', '_')}_{country}"

        if is_need_print:
            print(cur_title)
            for row_number, (label, value) in enumerate(zip(x_axis, y_axis), start=1):
                print(f"{row_number}\t{label} {value}")

        if is_need_plot:
            fig, ax = plt.subplots(1, 1)
            ax.set_title(cur_title)
            ax.barh(x_axis, y_axis)
            fig.savefig(f"{save_path}/{figname}.png", dpi=300, format='png', bbox_inches='tight')
            plt.close(fig)
def count_by_university(vk_elastic_db: es_client.VkDataDatabaseClient, size=10, is_need_other=True, is_need_print=False, is_need_plot=True, is_need_active=False, days_delta=20): aggs_name = "university_count" title = "university count" if is_need_active: title += " active" missing_str = "" es = get_elastic_object(vk_elastic_db) s = elasticsearch_dsl.Search(using=es, index=index) if is_need_active: s = get_active_users_filter(es, index, s, days_delta=days_delta) a = elasticsearch_dsl.A('terms', field="university_name.keyword", missing=missing_str, size=size) s.aggs.bucket(aggs_name, a) response = s.execute() x_axis = [] y_axis = [] for hit in response.aggregations[aggs_name].buckets: if hit.key == missing_str: continue x_axis.append(hit.key) y_axis.append(hit.doc_count) if is_need_other: x_axis.append("other") y_axis.append(response.aggregations[aggs_name].sum_other_doc_count) if is_need_print: print(title) for i in range(len(x_axis)): print(f"{i + 1}\t{x_axis[i]} {y_axis[i]}") if is_need_plot: fig, ax = plt.subplots(1, 1) ax.set_title(title) ax.barh(x_axis, y_axis) # plt.show() fig.savefig(f"{save_path}/{title.replace(' ', '_')}.png", dpi=300, format='png', bbox_inches='tight') plt.close(fig) '''
def has_mobile(vk_elastic_db: es_client.VkDataDatabaseClient, size=1, is_need_other=True,
               is_need_print=False, is_need_plot=True, is_need_active=False, days_delta=20):
    """Report the distribution of the ``has_mobile`` field as a pie chart.

    Args:
        vk_elastic_db: client wrapper passed to ``get_elastic_object``.
        size: number of ``has_mobile`` buckets requested.
        is_need_other: when true, append an "other" entry holding the
            aggregation's ``sum_other_doc_count``.
        is_need_print: when true, print the title and numbered rows.
        is_need_plot: when true, save a pie chart under ``save_path``.
        is_need_active: when true, narrow the search with
            ``get_active_users_filter`` and add " active" to the title.
        days_delta: forwarded to ``get_active_users_filter``.

    Raises:
        KeyError: if a bucket key is not one of "1", "0", "-1".
    """
    aggs_name = "has_mobile"
    label_dict = {"1": "has", "0": "has not", "-1": "missing"}

    title = "has_mobile"
    if is_need_active:
        title += " active"

    es = get_elastic_object(vk_elastic_db)
    s = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        s = get_active_users_filter(es, index, s, days_delta=days_delta)

    # Documents lacking the field are counted under the key "0".
    mobile_agg = elasticsearch_dsl.A('terms', field="has_mobile", size=size, missing="0")
    s.aggs.bucket(aggs_name, mobile_agg)
    response = s.execute()

    labels = []
    for hit in response.aggregations[aggs_name].buckets:
        labels.append(label_dict[str(hit.key)])
    counts = []
    for hit in response.aggregations[aggs_name].buckets:
        counts.append(hit.doc_count)
    if is_need_other:
        labels.append("other")
        counts.append(response.aggregations[aggs_name].sum_other_doc_count)

    if is_need_print:
        print(title)
        for row_number, (label, value) in enumerate(zip(labels, counts), start=1):
            print(f"{row_number}\t{label} {value}")

    if is_need_plot:
        sizes = [elem / sum(counts) for elem in counts]
        fig, ax = plt.subplots(1, 1)
        ax.set_title(title)
        ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
        fig.savefig(f"{save_path}/{title.replace(' ', '_')}.png", dpi=300, format='png', bbox_inches='tight')
        plt.close(fig)
def count_by_country(vk_elastic_db: es_client.VkDataDatabaseClient, size=10, is_need_other=True,
                     is_need_print=False, is_need_plot=True, is_need_active=False, days_delta=20):
    """Aggregate documents by ``country.title.keyword`` and report the result.

    The aggregation response is handed to ``response_process`` together with
    the title and the print/plot/other flags; that helper produces the output.

    NOTE(review): another function with this same name appears earlier in the
    file with a different signature and a return value; if both definitions
    are live, this later one replaces it -- confirm which is intended.

    Args:
        vk_elastic_db: client wrapper passed to ``get_elastic_object``.
        size: number of country buckets requested.
        is_need_other: forwarded to ``response_process``.
        is_need_print: forwarded to ``response_process``.
        is_need_plot: forwarded to ``response_process``.
        is_need_active: when true, narrow the search with
            ``get_active_users_filter`` and add " active" to the title.
        days_delta: forwarded to ``get_active_users_filter``.
    """
    aggs_name = "countries_count"
    title = "count by country" + (" active" if is_need_active else "")

    es = get_elastic_object(vk_elastic_db)
    search = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        search = get_active_users_filter(es, index, search, days_delta=days_delta)

    search.aggs.bucket(
        aggs_name,
        elasticsearch_dsl.A('terms', field="country.title.keyword", size=size),
    )
    response = search.execute()
    response_process(response, aggs_name, title, is_need_other, is_need_print, is_need_plot)
def other_social_network(vk_elastic_db: es_client.VkDataDatabaseClient, is_need_print=False,
                         is_need_plot=True, is_need_active=False, days_delta=20):
    """Report how many users filled in at least one external-site field.

    A user counts as "has" when any of the fields twitter, site, skype,
    livejournal, instagram or facebook exists on the document; "has not" is
    the remainder of the (optionally active-only) total.

    Args:
        vk_elastic_db: client wrapper passed to ``get_elastic_object``.
        is_need_print: when true, print the title and numbered rows.
        is_need_plot: when true, save a pie chart under ``save_path``.
        is_need_active: when true, narrow both the matching search and the
            total with ``get_active_users_filter`` and add " active" to the
            title.
        days_delta: forwarded to ``get_active_users_filter``.

    Raises:
        ZeroDivisionError: if a plot is requested while the total document
            count is zero (the shares are computed by dividing by the total).
    """
    title = "has other site"
    if is_need_active:
        title += " active"

    es = get_elastic_object(vk_elastic_db)
    s = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        s = get_active_users_filter(es, index, s, days_delta=days_delta)

    # OR together one "field exists" clause per external-site field.
    external_fields = ("twitter", "site", "skype", "livejournal", "instagram", "facebook")
    exists_clauses = [
        elasticsearch_dsl.Q("bool", must=[elasticsearch_dsl.Q("exists", field=field_name)])
        for field_name in external_fields
    ]
    q = exists_clauses[0]
    for clause in exists_clauses[1:]:
        q = q | clause
    # Only the count is needed, so the search is never executed to fetch hits.
    s = s.query(q)

    # The string literal below holds a disabled alternative query, kept for
    # reference. Its Russian comment reads: "query: find everyone who has
    # nothing specified in external sites".
    '''
    # запрос: найти всех тех, у кого не указано ничего в сторонних сайтах
    s = elasticsearch_dsl.Search(using=es, index=index)
    s = s.filter("bool", must_not=[elasticsearch_dsl.Q("exists", field="twitter")])
    s = s.filter("bool", must_not=[elasticsearch_dsl.Q("exists", field="site")])
    s = s.filter("bool", must_not=[elasticsearch_dsl.Q("exists", field="skype")])
    s = s.filter("bool", must_not=[elasticsearch_dsl.Q("exists", field="livejournal")])
    s = s.filter("bool", must_not=[elasticsearch_dsl.Q("exists", field="instagram")])
    s = s.filter("bool", must_not=[elasticsearch_dsl.Q("exists", field="facebook")])
    response = s.execute()
    '''

    total_search = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        total_search = get_active_users_filter(es, index, total_search, days_delta=days_delta)
    total_num = total_search.count()
    other_sn_num = s.count()

    x_axis = ["has", "has not"]
    y_axis = [other_sn_num, total_num - other_sn_num]

    if is_need_print:
        print(title)
        for row_number, (label, value) in enumerate(zip(x_axis, y_axis), start=1):
            print(f"{row_number}\t{label} {value}")

    if is_need_plot:
        total = sum(y_axis)
        sizes = [elem / total for elem in y_axis]
        fig, ax = plt.subplots(1, 1)
        ax.set_title(title)
        ax.pie(sizes, labels=x_axis, autopct='%1.1f%%', startangle=90)
        fig.savefig(f"{save_path}/{title.replace(' ', '_')}.png", dpi=300, format='png', bbox_inches='tight')
        plt.close(fig)
def has_city(vk_elastic_db: es_client.VkDataDatabaseClient, is_need_print=False, is_need_plot=True,
             is_need_active=False, days_delta=20):
    """Report how many users have a city title versus none.

    Runs a terms aggregation on ``city.title.keyword`` in which documents
    lacking the field are bucketed under the key "missing"; every other
    bucket is summed into "has city".

    Args:
        vk_elastic_db: client wrapper passed to ``get_elastic_object``.
        is_need_print: when true, print the title and numbered rows.
        is_need_plot: when true, save a pie chart to
            ``{save_path}/has_city.png`` (or ``has_city_active.png``).
        is_need_active: when true, narrow the search with
            ``get_active_users_filter`` and add " active" to the title.
        days_delta: forwarded to ``get_active_users_filter``.

    Raises:
        ZeroDivisionError: if a plot is requested while both counts are zero.
    """
    aggs_name = "has_city"
    title = "has city"
    if is_need_active:
        title += " active"

    es = get_elastic_object(vk_elastic_db)
    s = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        s = get_active_users_filter(es, index, s, days_delta=days_delta)

    size = 10000
    missing_str = "missing"
    city_agg = elasticsearch_dsl.A('terms', field="city.title.keyword", missing=missing_str, size=size)
    s.aggs.bucket(aggs_name, city_agg)
    response = s.execute()

    cities = response.aggregations[aggs_name]
    data = {"has city": 0, "missing city": 0}
    for hit in cities.buckets:
        if hit.key == missing_str:
            data["missing city"] += hit.doc_count
        else:
            data["has city"] += hit.doc_count
    # Documents whose city fell outside the first `size` buckets still have a
    # city, so their combined count belongs to "has city".
    # NOTE(review): assumes the "missing" bucket is among the returned ones.
    data["has city"] += cities.sum_other_doc_count

    x_axis = list(data)
    y_axis = list(data.values())

    if is_need_print:
        print(title)
        for row_number, (label, value) in enumerate(zip(x_axis, y_axis), start=1):
            print(f"{row_number}\t{label} {value}")

    if is_need_plot:
        total = sum(y_axis)
        sizes = [elem / total for elem in y_axis]
        fig, ax = plt.subplots(1, 1)
        ax.set_title(title)
        ax.pie(sizes, labels=x_axis, autopct='%1.1f%%', startangle=90)
        fig.savefig(f"{save_path}/{title.replace(' ', '_')}.png", dpi=300, format='png', bbox_inches='tight')
        plt.close(fig)