def get_context_data(self, **kwargs):
    """Build the template context for an office page.

    Loads the in-memory office record and its precomputed statistics,
    then exposes formatted counters/HTML snippets to the template.
    """
    context = super().get_context_data(**kwargs)
    office_id = self.object.id
    self.office = RUSSIA.get_office(office_id)
    self.office_stats = RUSSIA.calc_data_current.office_stats.get_group_data(office_id)

    region_name = ""
    if self.office.region_id is not None:
        region_name = RUSSIA.regions.get_region_by_id(self.office.region_id).name

    # (child office id, child office name) pairs for the template.
    # Fixed: use a list comprehension and do not shadow the builtin `id`.
    child_examples = [
        (child_id, RUSSIA.get_office(child_id).name)
        for child_id in self.office_stats.child_office_examples
    ]

    extra = {
        'source_document_count': self.office_stats.source_document_count,
        'region_name': region_name,
        'source_document_count_html': self.get_source_doc_html(),
        'child_offices_count': self.office_stats.child_offices_count,
        'section_count_html': self.section_count_html(),
        'section_count_by_years_html': self.section_count_by_years_html(),
        'median_income_by_years_html': self.median_income_by_years_html(),
        'child_office_examples': child_examples,
        'office_in_memory': self.office,
        'parent_office_name': "" if self.office.parent_id is None
                              else RUSSIA.get_office(self.office.parent_id).name,
        "rubric_str": "unknown" if self.office.rubric_id is None
                      else get_russian_rubric_str(self.office.rubric_id),
        "income_comparison": self.comparison_to_population()
    }
    context.update(extra)
    return context
def build_section_incomes(self):
    """Aggregate per-office and per-rubric income statistics.

    Queries all main-declarant incomes within the configured year range,
    filters out implausible values (income below the yearly MROT-based
    threshold), and folds them into two TAllGroupIncomeStats accumulators.

    Returns:
        (office_stats, rubric_stats): incomes grouped by office id and by
        office rubric id respectively.
    """
    # Fixed: values are passed as query parameters instead of being
    # interpolated with str.format, so the DB driver handles quoting.
    query = """
        select o.id, s.income_year, i.size, s.person_id
        from declarations_section s
        join declarations_office o on s.office_id=o.id
        join declarations_income i on i.section_id=s.id
        join declarations_source_document d on s.source_document_id=d.id
        where s.income_year >= %s
          and s.income_year <= %s
          and i.size < %s
          and i.size > 50000
          and s.person_id is not null
          and d.median_income > 10000
          and i.relative = %s
        order by o.id
    """
    params = [
        self.income_stat_start_year,
        self.last_year,
        self.max_income,
        models.Relative.main_declarant_code,
    ]
    office_stats = TAllGroupIncomeStats()
    rubric_stats = TAllGroupIncomeStats()
    with connection.cursor() as cursor:
        cursor.execute(query, params)
        # Rows are ordered by office id, so itertools.groupby yields one
        # group per office.
        for office_id, office_items in groupby(cursor, itemgetter(0)):
            rubric_id = RUSSIA.get_office(office_id).rubric_id
            for _, year, income, person_id in office_items:
                # Skip incomes below 12 months of the minimum wage (MROT)
                # for that year — almost certainly parsing noise.
                if income / 12 < RUSSIA.get_mrot(year):
                    continue
                office_stats.add_income(office_id, person_id, year, income)
                rubric_stats.add_income(rubric_id, person_id, year, income)
    return office_stats, rubric_stats
def init_rubric(self):
    """Set the section's rubric from its office's in-memory record,
    reclassifying municipal positions that are really education jobs."""
    # NOTE: json_reader.section.rubric_id = source_document_in_db.office.rubric_id
    # does not work; maybe source_document_in_db.refresh_from_db should be called.
    section = self.section
    rubric = RUSSIA.get_office(section.office.id).rubric_id
    if rubric == TOfficeRubrics.Municipality:
        if TOfficeTableInMemory.convert_municipality_to_education(section.position):
            rubric = TOfficeRubrics.Education
    section.rubric_id = rubric
def distribute_offices_to_processes(self, process_count):
    """Spread offices across `process_count` buckets for parallel import.

    All Gulag-rubric (FSIN) offices go to bucket 0; the remaining offices
    are round-robined, skipping bucket 0 while it holds more than its
    fair share.
    """
    assert process_count > 1
    for cnt, office_id in enumerate(self.office_to_source_documents.keys(), start=1):
        if RUSSIA.get_office(office_id).rubric_id == TOfficeRubrics.Gulag:
            # All FSIN offices are pinned to the first process.
            bucket_id = 0
        elif len(self.office_buckets[0]) > cnt / process_count:
            # Bucket 0 is overfull; distribute among the other buckets.
            bucket_id = cnt % (process_count - 1) + 1
        else:
            bucket_id = cnt % process_count
        self.office_buckets[bucket_id].append(office_id)
    for bucket_id in self.office_buckets.keys():
        self.logger.debug("bucket[{}] size = {}".format(
            bucket_id, len(self.office_buckets[bucket_id])))
def import_office(self, office_id):
    """Import every source document attached to one office.

    Documents are processed in the order of their previously assigned
    (permalink) ids so that re-imports are deterministic; documents with
    no prior id sort last. Each document is registered in the database,
    and its smart-parser JSON (if valid) is imported as sections.
    """
    # Optional rubric filter: when the run is restricted to one rubric,
    # skip offices outside it.
    if self.args.get('rubric_id') is not None and RUSSIA.get_office(
            office_id).rubric_id != self.args.get('rubric_id'):
        return
    all_imported_human_jsons = set()
    # Sentinel sort key for documents that have no old permalink id.
    max_doc_id = 2**32
    ordered_documents = list()
    for sha256 in self.office_to_source_documents[office_id]:
        doc_id = self.permalinks_db_source_document.get_old_source_doc_id_by_sha256(
            sha256)
        if doc_id is None:
            doc_id = max_doc_id
        ordered_documents.append((doc_id, sha256))
    # Sort by (old doc id, sha256) so import order is stable across runs.
    ordered_documents.sort()
    TImporter.logger.debug("import office {} document count = {}".format(
        office_id, len(ordered_documents)))
    for _, sha256 in ordered_documents:
        src_doc = self.dlrobot_human.get_document(sha256)
        assert src_doc.calculated_office_id == office_id
        smart_parser_json = self.get_smart_parser_json(
            all_imported_human_jsons, sha256, src_doc)
        # The document record is created even when its JSON is unusable.
        doc_file_in_db = self.register_document_in_database(
            sha256, src_doc)
        if smart_parser_json is None:
            self.logger.debug(
                "file {} has no valid smart parser json, skip it".format(
                    sha256))
        else:
            try:
                sections_count = self.import_one_smart_parser_json(
                    doc_file_in_db, smart_parser_json, src_doc)
                TImporter.logger.debug("import {} sections from {}".format(
                    sections_count, sha256))
            except TSmartParserSectionJson.SerializerException as exp:
                # Log and continue with the next document; one broken
                # JSON must not abort the whole office import.
                TImporter.logger.error(
                    "Error! cannot import smart parser json for file {}: {} "
                    .format(sha256, exp))
def import_one_smart_parser_json(self, source_document_in_db, input_json,
                                 src_doc: TSourceDocument):
    """Import all declarant sections of one smart-parser JSON document.

    Each raw section is converted, deduplicated by its "passport", and
    saved under a permalink section id inside its own transaction.
    Afterwards, aggregate fields (min/max income year, section count,
    median income) are written back onto the source-document record.

    Returns the number of sections actually imported.
    """
    imported_section_years = list()
    section_index = 0
    TImporter.logger.debug("try to import {} declarants".format(
        len(input_json['persons'])))
    incomes = list()
    # FSIN (Gulag-rubric) documents get a per-section office resolved
    # from the section itself instead of the document-level office.
    is_fsin = RUSSIA.get_office(
        src_doc.calculated_office_id).rubric_id == TOfficeRubrics.Gulag
    for raw_section in input_json['persons']:
        section_index += 1
        section_income_year = self.calc_income_year(
            input_json, src_doc, raw_section, section_index)
        if is_fsin:
            office_id = self.get_fsin_office_id(raw_section, src_doc)
        else:
            office_id = src_doc.calculated_office_id
        # Each section is saved atomically: a failure rolls back only
        # this section, not the whole document.
        with transaction.atomic():
            try:
                prepared_section = TSmartParserSectionJson(
                    section_income_year, office_id, source_document_in_db)
                prepared_section.read_raw_json(raw_section)
                # A huge vehicle list is a strong sign of a parsing error.
                if len(prepared_section.vehicles
                       ) > TImporter.max_vehicle_count:
                    TImporter.logger.debug(
                        "ignore section {} because it has too many vehicles ( > {})"
                        .format(prepared_section.section.person_name,
                                TImporter.max_vehicle_count))
                    continue
                passport1 = prepared_section.get_passport_components1(
                ).get_main_section_passport()
                # register_section_passport returns False for duplicates;
                # such sections are silently skipped.
                if self.register_section_passport(passport1):
                    prepared_section.section.tmp_income_set = prepared_section.incomes
                    passport2 = prepared_section.get_passport_components2(
                    ).get_main_section_passport()
                    section_id, is_new = self.permalinks_db_section.get_section_id(
                        passport1, passport2)
                    if is_new:
                        TImporter.logger.debug(
                            "found a new section {}, set section.id to {}".
                            format(
                                prepared_section.section.
                                get_permalink_passport(), section_id))
                    main_income = prepared_section.get_main_declarant_income_size(
                    )
                    if main_income is not None and main_income > 0:
                        incomes.append(main_income)
                    prepared_section.save_to_database(section_id)
                    imported_section_years.append(section_income_year)
            except (DatabaseError,
                    TSmartParserSectionJson.SerializerException) as exp:
                TImporter.logger.error(
                    "Error! cannot import section N {}: {} ".format(
                        section_index, exp))
    if len(imported_section_years) > 0:
        source_document_in_db.min_income_year = min(imported_section_years)
        source_document_in_db.max_income_year = max(imported_section_years)
        source_document_in_db.section_count = len(imported_section_years)
        median_income = 0
        if len(incomes) > 0:
            median_income = median(incomes)
        # Guard against values that overflow a signed 32-bit DB column.
        if median_income >= 2**31:
            median_income = 0
        source_document_in_db.median_income = median_income
        source_document_in_db.save()
    return len(imported_section_years)