def handle(self, *args, **options):
     logger = setup_logging(log_file_name="build_rubric.log")
     office_hierarchy = TOfficeTableInMemory(use_office_types=False)
     office_hierarchy.read_from_table(models.Office.objects.all())
     for office in models.Office.objects.all():
         rubric_id = office_hierarchy.build_office_rubric(logger, office.id)
         if rubric_id is not None and rubric_id != office.rubric_id:
             logger.debug(
                 "set office rubric_id from {} to {} for {}".format(
                     get_russian_rubric_str(office.rubric_id),
                     get_russian_rubric_str(rubric_id), office.name))
             office.rubric_id = rubric_id
             office.save()
Beispiel #2
0
 def get_context_data(self, **kwargs):
     context = super().get_context_data(**kwargs)
     office_id = self.object.id
     self.office = RUSSIA.get_office(office_id)
     self.office_stats = RUSSIA.calc_data_current.office_stats.get_group_data(office_id)
     region_name = ""
     if self.office.region_id is not None:
         region_name = RUSSIA.regions.get_region_by_id(self.office.region_id).name
     child_examples = list((id, RUSSIA.get_office(id).name) for id in self.office_stats.child_office_examples)
     extra = {
         'source_document_count':  self.office_stats.source_document_count,
         'region_name': region_name,
         'source_document_count_html': self.get_source_doc_html(),
         'child_offices_count': self.office_stats.child_offices_count,
         'section_count_html': self.section_count_html(),
         'section_count_by_years_html': self.section_count_by_years_html(),
         'median_income_by_years_html': self.median_income_by_years_html(),
         'child_office_examples': child_examples,
         'office_in_memory': self.office,
         'parent_office_name': "" if self.office.parent_id is None else RUSSIA.get_office(self.office.parent_id).name,
         "rubric_str": "unknown" if self.office.rubric_id is None else get_russian_rubric_str(self.office.rubric_id),
         "income_comparison": self.comparison_to_population()
     }
     context.update(extra)
     return context
Beispiel #3
0
    def get_office_report_table_row(self,
                                    russia,
                                    group_id,
                                    max_cell_width=None):
        if self.group_type == TGroupStatDataList.office_group:
            name = russia.get_office(group_id).name
            if max_cell_width is not None:
                if len(name) > max_cell_width - 3:
                    name = name[:max_cell_width - 3] + "..."
            output_row = [group_id, name]
        else:
            if group_id is None:
                rubric_name = "остальное"
            else:
                rubric_name = get_russian_rubric_str(group_id)
            output_row = [group_id, rubric_name]
        office_info: TGroupStatData
        office_info = self.declarant_groups.get(group_id)
        if office_info is None:
            return None
        declarant_count = 0
        year_count = 0
        valid_incomes = list()
        for year in range(self.start_year, self.last_year + 1):
            d = office_info.get_year_snapshot(year)
            if d is not None and d.incomes_count is not None and d.incomes_count > 5:
                declarant_count += d.incomes_count
                year_count += 1
                output_row.append(d.median_year_income)
                output_row.append(d.incomes_count)
                valid_incomes.append(TYearIncome(year, d.median_year_income))
            else:
                output_row.append(-1)
                output_row.append(0)

        if declarant_count <= 10 or year_count < 2:
            # office is too small
            return None

        cmp_result = russia.get_average_nominal_incomes(valid_incomes)
        if cmp_result is None:
            params = [-1] * 4
        else:
            Q1 = russia.compare_to_all_russia_average_month_income(
                valid_incomes[-1].year, valid_incomes[-1].income / 12.0)
            Q1_str = str(Q1).replace(".", ",")
            PI = cmp_result.population_income_growth
            D1 = cmp_result.declarant_income_growth
            V2_str = str(office_info.v2).replace(".", ",")
            params = [Q1_str, PI, D1, V2_str, office_info.v2_size]
        output_row.extend(params)
        return output_row
    def build_income_first_word_position(self,
                                         max_count,
                                         filename,
                                         start_year=2010,
                                         last_year=2019):
        query = """
            select distinct p.id, s.gender, i.size, s.income_year, s.rubric_id, s.position  
            from declarations_section s
            join declarations_income i on i.section_id=s.id
            left join declarations_person p on s.person_id=p.id
            where i.size > 10000 and i.relative = 'D' and length(position) > 0 and s.gender is not null
            limit {}  
        """.format(max_count)

        rubric_genders = defaultdict(list)
        rubric_first_word = defaultdict(int)
        for person_id, gender, income_size, income_year, rubric_id, position_str in fetch_cursor_by_chunks(
                query):
            position_words = re.split("[\s,;.]", position_str)
            if len(position_words) == 0:
                continue
            first_position_word = position_words[0].lower()
            rubric_genders[(rubric_id, first_position_word,
                            gender)].append(income_size)
            rubric_first_word[(rubric_id, first_position_word)] += 1

        with open(filename, "w") as outp:
            outp.write(
                "\n\nРубрика\tПол\t{}\tМедианный доход\tДолжность\tГендерный перекос\n"
            )
            for (rubric_id,
                 first_position_word), all_cnt in rubric_first_word.items():
                if all_cnt > 100:
                    masc_incomes = rubric_genders[(rubric_id,
                                                   first_position_word,
                                                   TGender.masculine)]
                    if len(masc_incomes) == 0:
                        continue
                    masc_med = median(masc_incomes)
                    for gender in TGender.gender_list():
                        incomes = rubric_genders[(rubric_id,
                                                  first_position_word, gender)]
                        if len(incomes) == 0:
                            continue
                        med = median(incomes)
                        outp.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                            get_russian_rubric_str(rubric_id),
                            first_position_word,
                            TGender.gender_to_Russian_str(gender), all_cnt,
                            int(med), int(100.0 * (masc_med - med) / med)))
    def build_vehicles(self, max_count, filename):
        with connection.cursor() as cursor:
            cursor.execute(
                'select section_id from declarations_vehicle where relative="D"'
            )
            section_with_vehicles = set(section_id for section_id, in cursor)

        query = """
            select distinct id, gender, office_id, income_year, rubric_id  
            from declarations_section
            where s.gender is not null
            limit {}  
        """.format(max_count)

        rubric_vehicle_positive = defaultdict(int)
        rubric_vehicle_negative = defaultdict(int)
        rubric_vehicle_all = defaultdict(int)
        for section_id, gender, office_id, income_year, rubric_id in fetch_cursor_by_chunks(
                query):
            key = (gender, rubric_id)
            if section_id in section_with_vehicles:
                rubric_vehicle_positive[key] += 1
            else:
                rubric_vehicle_negative[key] += 1
            rubric_vehicle_all[rubric_id] += 1

        with open(filename, "w") as outp:
            outp.write(
                "\n\nРубрика\tПол\tКол-во автомобилей в рубрике\tАвтомобилизация\tГендерный перекос\n"
            )
            for rubric_id in get_all_rubric_ids():
                if rubric_vehicle_all[rubric_id] > 0:
                    k = (TGender.masculine, rubric_id)
                    vehicle_index_masc = 100.0 * rubric_vehicle_positive[
                        k] / float(rubric_vehicle_positive[k] +
                                   rubric_vehicle_negative[k] + 0.0000001)

                    for gender in TGender.gender_list():
                        k = (gender, rubric_id)
                        vehicle_index = 100.0 * rubric_vehicle_positive[
                            k] / float(rubric_vehicle_positive[k] +
                                       rubric_vehicle_negative[k] + 0.0000001)
                        outp.write("{}\t{}\t{}\t{}\t{}\n".format(
                            get_russian_rubric_str(rubric_id),
                            TGender.gender_to_Russian_str(gender),
                            rubric_vehicle_all[rubric_id], int(vehicle_index),
                            int(100.0 * (vehicle_index_masc - vehicle_index) /
                                (vehicle_index + 0.0000001))))
Beispiel #6
0
 def build_section_json_for_toloka(self, section_sql_records):
     sections = list()
     for s in section_sql_records:
         section_json = get_section_json(s)
         office_id = section_json['office_id']
         section_json['office_section_count'] = self.db_squeeze.office_info[
             office_id]['section_count']
         office = models.Office.objects.get(id=office_id)
         section_json['office_rubric'] = get_russian_rubric_str(
             office.rubric_id)
         section_json['office_region'] = self.region_id_to_name.get(
             office.region_id, "")
         section_json['surname_rank'] = s.surname_rank
         section_json['name_rank'] = s.name_rank
         sections.append(section_json)
     return {'sections': sections}
    def build_genders_rubrics(self, max_count, filename):
        query = """
            select distinct p.id, s.gender, o.region_id, s.rubric_id 
            from declarations_person p
            join declarations_section s on s.person_id=p.id
            join declarations_office o on s.office_id=o.id
            where s.gender is not null
            limit {}  
        """.format(max_count)
        rubric_genders = defaultdict(int)
        genders_in_db = defaultdict(int)
        section_count = 0
        with connection.cursor() as cursor:
            cursor.execute(query)
            for person_id, gender, region_id, rubric_id in cursor:
                section_count += 1
                genders_in_db[gender] += 1
                rubric_genders[(gender, rubric_id)] += 1

        with open(filename, "w") as outp:
            outp.write("Genders in DB: ")
            outp.write("Gender\tPerson Count\n")
            for k, v in genders_in_db.items():
                outp.write("{}\t{}\n".format(TGender.gender_to_str(k), v))

            outp.write("\nGenders in DB Rubrics: ")
            outp.write(
                "\nRubric\tGender\tPersons in the Rubric\tGender Share\n")
            for rubric in get_all_rubric_ids():
                all_cnt = sum(rubric_genders[(gender, rubric)]
                              for gender in TGender.gender_list())
                if all_cnt > 0:
                    for gender in TGender.gender_list():
                        outp.write("{}\t{}\t{}\t{}\n".format(
                            get_russian_rubric_str(rubric),
                            TGender.gender_to_str(gender),
                            all_cnt,
                            round(
                                100.0 * rubric_genders[(gender, rubric)] /
                                all_cnt, 2),
                        ))
Beispiel #8
0
 def rubric_str(self):
     if self.rubric_id is None:
         return "unknown"
     else:
         return get_russian_rubric_str(self.rubric_id)