def handle(self, *args, **options): logger = setup_logging(log_file_name="build_rubric.log") office_hierarchy = TOfficeTableInMemory(use_office_types=False) office_hierarchy.read_from_table(models.Office.objects.all()) for office in models.Office.objects.all(): rubric_id = office_hierarchy.build_office_rubric(logger, office.id) if rubric_id is not None and rubric_id != office.rubric_id: logger.debug( "set office rubric_id from {} to {} for {}".format( get_russian_rubric_str(office.rubric_id), get_russian_rubric_str(rubric_id), office.name)) office.rubric_id = rubric_id office.save()
def get_context_data(self, **kwargs): context = super().get_context_data(**kwargs) office_id = self.object.id self.office = RUSSIA.get_office(office_id) self.office_stats = RUSSIA.calc_data_current.office_stats.get_group_data(office_id) region_name = "" if self.office.region_id is not None: region_name = RUSSIA.regions.get_region_by_id(self.office.region_id).name child_examples = list((id, RUSSIA.get_office(id).name) for id in self.office_stats.child_office_examples) extra = { 'source_document_count': self.office_stats.source_document_count, 'region_name': region_name, 'source_document_count_html': self.get_source_doc_html(), 'child_offices_count': self.office_stats.child_offices_count, 'section_count_html': self.section_count_html(), 'section_count_by_years_html': self.section_count_by_years_html(), 'median_income_by_years_html': self.median_income_by_years_html(), 'child_office_examples': child_examples, 'office_in_memory': self.office, 'parent_office_name': "" if self.office.parent_id is None else RUSSIA.get_office(self.office.parent_id).name, "rubric_str": "unknown" if self.office.rubric_id is None else get_russian_rubric_str(self.office.rubric_id), "income_comparison": self.comparison_to_population() } context.update(extra) return context
def get_office_report_table_row(self, russia, group_id, max_cell_width=None): if self.group_type == TGroupStatDataList.office_group: name = russia.get_office(group_id).name if max_cell_width is not None: if len(name) > max_cell_width - 3: name = name[:max_cell_width - 3] + "..." output_row = [group_id, name] else: if group_id is None: rubric_name = "остальное" else: rubric_name = get_russian_rubric_str(group_id) output_row = [group_id, rubric_name] office_info: TGroupStatData office_info = self.declarant_groups.get(group_id) if office_info is None: return None declarant_count = 0 year_count = 0 valid_incomes = list() for year in range(self.start_year, self.last_year + 1): d = office_info.get_year_snapshot(year) if d is not None and d.incomes_count is not None and d.incomes_count > 5: declarant_count += d.incomes_count year_count += 1 output_row.append(d.median_year_income) output_row.append(d.incomes_count) valid_incomes.append(TYearIncome(year, d.median_year_income)) else: output_row.append(-1) output_row.append(0) if declarant_count <= 10 or year_count < 2: # office is too small return None cmp_result = russia.get_average_nominal_incomes(valid_incomes) if cmp_result is None: params = [-1] * 4 else: Q1 = russia.compare_to_all_russia_average_month_income( valid_incomes[-1].year, valid_incomes[-1].income / 12.0) Q1_str = str(Q1).replace(".", ",") PI = cmp_result.population_income_growth D1 = cmp_result.declarant_income_growth V2_str = str(office_info.v2).replace(".", ",") params = [Q1_str, PI, D1, V2_str, office_info.v2_size] output_row.extend(params) return output_row
def build_income_first_word_position(self, max_count, filename, start_year=2010, last_year=2019): query = """ select distinct p.id, s.gender, i.size, s.income_year, s.rubric_id, s.position from declarations_section s join declarations_income i on i.section_id=s.id left join declarations_person p on s.person_id=p.id where i.size > 10000 and i.relative = 'D' and length(position) > 0 and s.gender is not null limit {} """.format(max_count) rubric_genders = defaultdict(list) rubric_first_word = defaultdict(int) for person_id, gender, income_size, income_year, rubric_id, position_str in fetch_cursor_by_chunks( query): position_words = re.split("[\s,;.]", position_str) if len(position_words) == 0: continue first_position_word = position_words[0].lower() rubric_genders[(rubric_id, first_position_word, gender)].append(income_size) rubric_first_word[(rubric_id, first_position_word)] += 1 with open(filename, "w") as outp: outp.write( "\n\nРубрика\tПол\t{}\tМедианный доход\tДолжность\tГендерный перекос\n" ) for (rubric_id, first_position_word), all_cnt in rubric_first_word.items(): if all_cnt > 100: masc_incomes = rubric_genders[(rubric_id, first_position_word, TGender.masculine)] if len(masc_incomes) == 0: continue masc_med = median(masc_incomes) for gender in TGender.gender_list(): incomes = rubric_genders[(rubric_id, first_position_word, gender)] if len(incomes) == 0: continue med = median(incomes) outp.write("{}\t{}\t{}\t{}\t{}\t{}\n".format( get_russian_rubric_str(rubric_id), first_position_word, TGender.gender_to_Russian_str(gender), all_cnt, int(med), int(100.0 * (masc_med - med) / med)))
def build_vehicles(self, max_count, filename): with connection.cursor() as cursor: cursor.execute( 'select section_id from declarations_vehicle where relative="D"' ) section_with_vehicles = set(section_id for section_id, in cursor) query = """ select distinct id, gender, office_id, income_year, rubric_id from declarations_section where s.gender is not null limit {} """.format(max_count) rubric_vehicle_positive = defaultdict(int) rubric_vehicle_negative = defaultdict(int) rubric_vehicle_all = defaultdict(int) for section_id, gender, office_id, income_year, rubric_id in fetch_cursor_by_chunks( query): key = (gender, rubric_id) if section_id in section_with_vehicles: rubric_vehicle_positive[key] += 1 else: rubric_vehicle_negative[key] += 1 rubric_vehicle_all[rubric_id] += 1 with open(filename, "w") as outp: outp.write( "\n\nРубрика\tПол\tКол-во автомобилей в рубрике\tАвтомобилизация\tГендерный перекос\n" ) for rubric_id in get_all_rubric_ids(): if rubric_vehicle_all[rubric_id] > 0: k = (TGender.masculine, rubric_id) vehicle_index_masc = 100.0 * rubric_vehicle_positive[ k] / float(rubric_vehicle_positive[k] + rubric_vehicle_negative[k] + 0.0000001) for gender in TGender.gender_list(): k = (gender, rubric_id) vehicle_index = 100.0 * rubric_vehicle_positive[ k] / float(rubric_vehicle_positive[k] + rubric_vehicle_negative[k] + 0.0000001) outp.write("{}\t{}\t{}\t{}\t{}\n".format( get_russian_rubric_str(rubric_id), TGender.gender_to_Russian_str(gender), rubric_vehicle_all[rubric_id], int(vehicle_index), int(100.0 * (vehicle_index_masc - vehicle_index) / (vehicle_index + 0.0000001))))
def build_section_json_for_toloka(self, section_sql_records): sections = list() for s in section_sql_records: section_json = get_section_json(s) office_id = section_json['office_id'] section_json['office_section_count'] = self.db_squeeze.office_info[ office_id]['section_count'] office = models.Office.objects.get(id=office_id) section_json['office_rubric'] = get_russian_rubric_str( office.rubric_id) section_json['office_region'] = self.region_id_to_name.get( office.region_id, "") section_json['surname_rank'] = s.surname_rank section_json['name_rank'] = s.name_rank sections.append(section_json) return {'sections': sections}
def build_genders_rubrics(self, max_count, filename): query = """ select distinct p.id, s.gender, o.region_id, s.rubric_id from declarations_person p join declarations_section s on s.person_id=p.id join declarations_office o on s.office_id=o.id where s.gender is not null limit {} """.format(max_count) rubric_genders = defaultdict(int) genders_in_db = defaultdict(int) section_count = 0 with connection.cursor() as cursor: cursor.execute(query) for person_id, gender, region_id, rubric_id in cursor: section_count += 1 genders_in_db[gender] += 1 rubric_genders[(gender, rubric_id)] += 1 with open(filename, "w") as outp: outp.write("Genders in DB: ") outp.write("Gender\tPerson Count\n") for k, v in genders_in_db.items(): outp.write("{}\t{}\n".format(TGender.gender_to_str(k), v)) outp.write("\nGenders in DB Rubrics: ") outp.write( "\nRubric\tGender\tPersons in the Rubric\tGender Share\n") for rubric in get_all_rubric_ids(): all_cnt = sum(rubric_genders[(gender, rubric)] for gender in TGender.gender_list()) if all_cnt > 0: for gender in TGender.gender_list(): outp.write("{}\t{}\t{}\t{}\n".format( get_russian_rubric_str(rubric), TGender.gender_to_str(gender), all_cnt, round( 100.0 * rubric_genders[(gender, rubric)] / all_cnt, 2), ))
def rubric_str(self): if self.rubric_id is None: return "unknown" else: return get_russian_rubric_str(self.rubric_id)