Example #1
 def __init__(self, regions=None):
     self.region_stat = dict()
     self.regions = regions
     if self.regions is None:
         self.regions = TRussianRegions()
     self.file_path = os.path.join(os.path.dirname(__file__),
                                   "data/ross_stat.json")
Example #2
    def build_declarant_incomes(self,
                                year,
                                max_income=5000000
                                ) -> TAllRegionStatsForOneYear:
        region_data = TAllRegionStatsForOneYear(
            year, file_name=self.options.get('output_json'))
        minOboronyId = 450
        query = """
            select o.region_id, i.size
            from declarations_section s
            join declarations_office o on s.office_id = o.id
            join declarations_income i on i.section_id = s.id
            where s.income_year = {} and
                 i.size < {} and
                 i.size > 0 and
                 i.relative = '{}' and
                 o.id != {} and
                 o.region_id is not null and
                 o.region_id != {}
            order by o.region_id, i.size
        """.format(year, max_income, models.Relative.main_declarant_code,
                   minOboronyId, TRussianRegions.Russia_as_s_whole_region_id)
        regions = TRussianRegions()
        mrot = RUSSIA.get_mrot(year)  # MROT: the Russian legal minimum monthly wage
        assert mrot is not None
        with connection.cursor() as cursor:
            cursor.execute(query)
            for region_id, items in groupby(cursor, itemgetter(0)):
                incomes = list(income for _, income in items
                               if income / 12 > mrot)
                if region_id == TRussianRegions.Baikonur:
                    continue
                region = regions.get_region_by_id(region_id)
                if region.joined_to is not None:
                    region = regions.get_region_by_id(region.joined_to)
                stat_info = region_data.ross_stat.get_data(region.id, year)
                if stat_info is None:
                    raise Exception(
                        "cannot find stat_info for region.id={}, region.name={}"
                        .format(region.id, region.name))
                population = stat_info.population
                population_median_income = region_data.ross_stat.get_or_predict_median_salary(
                    region.id, year)
                if population_median_income is None:
                    raise Exception(
                        "cannot estimate population median_income for region.id={}, region.name={}"
                        .format(region.id, region.name))
                s = TRegionYearStats(
                    region.id, region.name, incomes, population_median_income,
                    population,
                    region_data.ross_stat.get_data(region.id,
                                                   2021).er_election_2021)
                region_data.add_snapshot(s)

        region_data.calc_aux_params()
        return region_data
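The query orders rows by region_id so that itertools.groupby can stream one region at a time; the same pattern in isolation, on hypothetical rows:

from itertools import groupby
from operator import itemgetter

# (region_id, income) rows, already sorted by region_id,
# mirroring the "order by o.region_id, i.size" clause above
rows = [(1, 100), (1, 200), (2, 150)]
for region_id, items in groupby(rows, itemgetter(0)):
    incomes = [income for _, income in items]
    print(region_id, incomes)  # 1 [100, 200], then 2 [150]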
Example #3
 def __init__(self, args):
     self.args = args
     self.logger = setup_logging("wd_by_url")
     self.regions = TRussianRegions()
     self.offices = TOfficeTableInMemory(use_office_types=False)
     self.offices.read_from_local_file()
     self.disclosures_hostnames = defaultdict(set)
     self.disclosures_office_names = defaultdict(set)
     self.build_office_indices()
     self.wd_urls = TWikidataUrlRecords()
     self.wd_urls.read_from_file(self.args.wikidata_info)
     self.wd_region_heads = TWikidataRegionHeads()
     self.wd_region_heads.read_from_file(self.args.wd_region_head_info)
Example #4
 def __init__(self, args):
     self.args = args
     self.dlrobot_human = TDlrobotHumanFileDBM(args['dlrobot_human'])
     self.dlrobot_human.open_db_read_only()
     self.all_section_passports = set()
     if models.Section.objects.count() > 0:
         raise Exception(
             "implement all section passports reading from db if you want to import to non-empty db! "
         )
     self.office_to_source_documents = self.build_office_to_file_mapping()
     self.permalinks_db_section = None
     self.permalinks_db_source_document = None
     self.smart_parser_cache_client = None
     self.regions = TRussianRegions()
     self.office_buckets = defaultdict(list)
Example #5
 def test_regions_nominative(self):
     regions = TRussianRegions()
     self.assertEqual(63, regions.get_region_in_nominative("Москва").id)
     self.assertEqual(9, regions.get_region_in_nominative("Кабардино-Балкария").id)
     self.assertEqual(17, regions.get_region_in_nominative("Северная Осетия").id)
     self.assertEqual(109, regions.get_region_in_nominative("Крым").id)
     self.assertEqual(109, regions.get_region_in_nominative("Республика Крым").id)
     self.assertEqual(1, regions.get_region_in_nominative("санкт-петербург").id)
Example #6
def include_fns_json_to_html(json_path, html_path):
    regions = TRussianRegions()
    assert json_path.endswith('json')
    assert html_path.endswith('html')
    with open(json_path) as inp:
        filters = json.load(inp)['filters']

    upr_name = filters.get('upr_name', '')
    if upr_name is None:
        upr_name = ''
    if 'insp_name' in filters:
        department = filters['insp_name']
    else:
        department = upr_name
    if department is None:
        department = ''

    url = "service.nalog.ru"
    if len(upr_name) > 1 and upr_name[0:4].endswith("00"):
        region = regions.get_region_in_nominative_and_dative(upr_name)
        assert region is not None
        url = "{}.{}".format(region.id, url)

    if filters.get('otdel_name') is not None:
        if len(department) > 0:
            department += '; '
        department += filters.get('otdel_name')

    with open(html_path, "rb") as inp:
        file_data = inp.read().strip()
        if file_data.endswith(b'<html>'):
            file_data = file_data[:-len('<html>')] + b'</html>'
        soup = BeautifulSoup(file_data, "html.parser")

    metatag = soup.new_tag('meta')
    metatag.attrs['name'] = 'smartparser_department'
    metatag.attrs['content'] = department
    soup.html.insert(2, metatag)

    metatag = soup.new_tag('meta')
    metatag.attrs['name'] = 'smartparser_url'
    metatag.attrs['content'] = url
    soup.html.insert(2, metatag)

    with open(html_path, "w") as outp:
        outp.write(str(soup))
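Reduced to its core, the meta-tag insertion above is plain BeautifulSoup; a sketch on a hypothetical document:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<html><head></head><body></body></html>", "html.parser")
metatag = soup.new_tag('meta')
metatag.attrs['name'] = 'smartparser_url'
metatag.attrs['content'] = 'service.nalog.ru'
soup.html.insert(2, metatag)  # same insert position as in include_fns_json_to_html
print(str(soup))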
Example #7
 def __init__(self):
     self.regions = TRussianRegions()
     self.year_stat = dict()
     for year in [LAST_DECLARATION_YEAR]:
         self.init_one_year_stats(year)
     self.sorted_region_list_for_web_interface = self._build_region_list_for_combo_box()
     self.offices_in_memory = TOfficeTableInMemory()
     self.offices_in_memory.read_from_local_file()
     self.federal_fsin = self.offices_in_memory.fsin_by_region[TRussianRegions.Russia_as_s_whole_region_id]
     assert self.federal_fsin is not None
     self.calc_data_2020 = TOfficeRubricCalculatedData(os.path.join(os.path.dirname(__file__), "data", "office2020"))
     self.calc_data_current = TOfficeRubricCalculatedData(os.path.join(os.path.dirname(__file__), "data", "office_current"))
Example #8
 def test_region_capitals(self):
     def r(s):
         d = regions.search_capital_in_address(s)
         return d
     regions = TRussianRegions()
     self.assertEqual(63, r("Россия, Москва, Красная площадь,1"))
     self.assertEqual(66, r("Россия, Нижний Новгород, Красная площадь,1"))
     self.assertEqual(85, r("улица Уфаимская, Кызыл, Тува"))
     self.assertEqual(70, r("улица Воронежская, Орёл"))
     self.assertEqual(70, r("улица Воронежская, Орел"))
     self.assertEqual(80, r("Россия, Свердловская область, Екатеринбург, проспект Ленина, 32"))
     self.assertEqual(5, r("Россия, Республика Бурятия, Улан-Удэ, улица Смолина, 18"))
     self.assertEqual(None, r("уфаимская"))
Example #9
 def test_regions_all_forms(self):
     def r(s):
         d = regions.get_region_all_forms(s)
         return d
     regions = TRussianRegions()
     self.assertEqual(63, r("мэр Москвы Собянин"))
     self.assertEqual(1, r("Московский район Санкт-Петербурга"))
     self.assertEqual(28, r("Никита Рязанский из Красноярского края"))
     self.assertEqual(28, r("представитель Москвы в Красноярском крае")) #longest match
     self.assertEqual(28, r("Межрегиональное управление № 042 ФМБА (Красноярский край, г. Зеленогорск"))
     # what should we do here?
     self.assertEqual(69, r("омская область"))
     self.assertEqual(53, r("костромская область"))
Example #10
def main():
    regions = TRussianRegions()
    with open("/home/sokirko/pop2021.csv") as inp:
        data = dict()
        name = True
        region_id = None
        for l in inp:
            w = l.strip().split("\t")
            if name:
                r = regions.get_region_in_nominative(w[0])
                if r is None:
                    raise Exception("cannot find {}".format(w[0]))
                region_id = r.id
                name = False
            else:
                assert l.startswith('Раздел')
                data[region_id] = list(map(int, w[-3:]))
                name = True

    for i in regions.iterate_inner_regions_without_joined():
        if i.id not in data:
            raise Exception("cannot find data for region_id={}".format(i.id))

    data1 = TRossStatData()
    data1.load_from_disk()
    for region_id, populations in data.items():
        info2019 = data1.get_data(region_id, 2019)
        if info2019.population != populations[0]:
            raise Exception("region_id={} different data {} != {}".format(
                region_id, info2019.population, populations[0]))
        s = data1.get_data(region_id, 2021)
        assert s.median_salary2 > 0
        s.population = populations[2]
        assert data1.get_data(region_id, 2020) is None
        data1.set_data(region_id, 2020,
                       TRegionYearInfo(population=populations[1]))
    data1.save_to_disk(".new")
Example #11
def main():
    args = parse_args()
    logger = setup_logging("calc_region_from_wd")
    regions = TRussianRegions()
    offices = TOfficeTableInMemory(use_office_types=False)
    offices.read_from_local_file()
    wd = TWikidataRecords(regions)
    wd.read_from_file(args.wikidata_info)

    web_sites_db = TDeclarationWebSiteList(logger,
                                           TDeclarationWebSiteList.default_input_task_list_path).load_from_disk()
    office_to_urls = web_sites_db.build_office_to_main_website(take_abandoned=True)
    with open(args.input_file) as inp:
        for l in inp:
            office_id, name = l.strip().split("\t")
            office = offices.offices.get(int(office_id))
            if office is None:
                logger.debug("cannot find office_id={}, name={} no valid urls, deleted office?")
                continue

            wikidata_id, region = wd.get_region_by_name(name)
            if wikidata_id is not None:
                cause = "name"
            else:
                urls = office_to_urls.get(int(office_id), [])
                if len(urls) == 0:
                    logger.debug("office_id={}, name={} no valid urls, delete office?")
                    continue
                for url in urls:
                    wikidata_id, region = wd.get_region_by_url(name, url)
                    if wikidata_id is not None:
                        cause = "url"
                        break

            if region is None:
                logger.error(
                    "office_id={}, name={} cannot recognize region".format(office_id, name))
            else:
                logger.debug("set region {} to {} {} by {} ".format(region.name, office_id, name, cause))
                office.region_id = region.id
                office.wikidata_id = wikidata_id
    logger.info("write to {}".format(args.output_file))
    offices.write_to_local_file(args.output_file)
Example #12
class TImporter:
    logger = None
    max_vehicle_count = 60

    def build_office_to_file_mapping(self):
        db_offices = set(o.id for o in models.Office.objects.all())
        TImporter.logger.debug("there are {} records in table {} ".format(
            len(db_offices), models.Office.objects.model._meta.db_table))
        office_to_source_documents = defaultdict(list)
        for sha256, src_doc in self.dlrobot_human.get_all_documents():
            office_id = src_doc.calculated_office_id
            if office_id is None:
                continue
            if int(office_id) not in db_offices:
                TImporter.logger.error(
                    "cannot find office id={} from {} in sql table ".format(
                        office_id, self.args['dlrobot_human']))
                raise Exception("integrity failed")
            office_to_source_documents[office_id].append(sha256)
        return office_to_source_documents

    def __init__(self, args):
        self.args = args
        self.dlrobot_human = TDlrobotHumanFileDBM(args['dlrobot_human'])
        self.dlrobot_human.open_db_read_only()
        self.all_section_passports = set()
        if models.Section.objects.count() > 0:
            raise Exception(
                "implement all section passports reading from db if you want to import to non-empty db! "
            )
        self.office_to_source_documents = self.build_office_to_file_mapping()
        self.permalinks_db_section = None
        self.permalinks_db_source_document = None
        self.smart_parser_cache_client = None
        self.regions = TRussianRegions()
        self.office_buckets = defaultdict(list)

    def delete_before_fork(self):
        from django import db
        db.connections.close_all()
        self.dlrobot_human.close_db()

    def init_non_pickable(self):
        self.smart_parser_cache_client = TSmartParserCacheClient(
            TSmartParserCacheClient.parse_args([]), TImporter.logger)

        self.permalinks_db_section = TPermaLinksSection(
            self.args['permalinks_folder'])
        self.permalinks_db_section.open_db_read_only()
        self.permalinks_db_source_document = TPermaLinksSourceDocument(
            self.args['permalinks_folder'])
        self.permalinks_db_source_document.open_db_read_only()

        self.dlrobot_human.open_db_read_only()

    def init_after_fork(self):
        from django.db import connection
        connection.connect()
        self.init_non_pickable()

    def get_human_smart_parser_json(self, src_doc, already_imported):
        for ref in src_doc.decl_references:
            filename = os.path.join(self.args['smart_parser_human_json'],
                                    str(ref.document_id) + ".json")
            if os.path.exists(filename) and filename not in already_imported:
                TImporter.logger.debug("import human json {}".format(filename))
                already_imported.add(filename)
                with open(filename, "r") as inp:
                    return json.load(inp)
        return None

    def register_document_in_database(self, sha256, src_doc: TSourceDocument):
        source_document_in_db = models.Source_Document(
            sha256=sha256,
            intersection_status=src_doc.build_intersection_status(),
        )
        source_document_in_db.id, new_file = self.permalinks_db_source_document.get_source_doc_id_by_sha256(
            sha256)
        assert not models.Source_Document.objects.filter(
            id=source_document_in_db.id).exists()
        self.logger.debug("register doc sha256={} id={}, new_file={}".format(
            sha256, source_document_in_db.id, new_file))
        source_document_in_db.file_extension = src_doc.file_extension
        source_document_in_db.save()
        ref: TDeclaratorReference
        for ref in src_doc.decl_references:
            models.Declarator_File_Reference(
                source_document=source_document_in_db,
                declarator_documentfile_id=ref.document_file_id,
                declarator_document_id=ref.document_id,
                web_domain=ref._site_url,
                declarator_document_file_url=ref.document_file_url).save()
        ref: TWebReference
        for ref in src_doc.web_references:
            models.Web_Reference(source_document=source_document_in_db,
                                 dlrobot_url=ref.url,
                                 web_domain=ref._site_url,
                                 crawl_epoch=ref.crawl_epoch).save()

        return source_document_in_db

    def register_section_passport(self, passport):
        if passport in self.all_section_passports:
            TImporter.logger.debug(
                "skip section because a section with the same passport already exists: {}"
                .format(passport))
            return False
        # each office is processed in one thread, so there is no need for locks: office_id is part of the passport tuple
        self.all_section_passports.add(passport)
        return True

    def calc_income_year(self, input_json, src_doc: TSourceDocument,
                         section_json, section_index):
        # take the year from a particular declarant (one file can contain declarants with different years)
        # do not use a default value for get here, since smart_parser explicitly writes "year": null
        year = section_json.get('year')
        if year is not None:
            return int(year)

        year = src_doc.calc_document_income_year(input_json)

        # if year is absent, then the file is useless
        if year is None:
            raise TSmartParserSectionJson.SerializerException(
                "year is not defined: section No {}".format(section_index))

        return int(year)

    def get_fsin_office_id(self, section_json, src_doc: TSourceDocument):
        department = section_json.get('person', dict()).get('department')
        if department is None or len(department) < 5:
            return src_doc.calculated_office_id
        region_id = self.regions.get_region_all_forms(
            department, TRussianRegions.Russia_as_s_whole_region_id)
        return RUSSIA.get_fsin_by_region(region_id)

    def import_one_smart_parser_json(self, source_document_in_db, input_json,
                                     src_doc: TSourceDocument):
        imported_section_years = list()
        section_index = 0
        TImporter.logger.debug("try to import {} declarants".format(
            len(input_json['persons'])))
        incomes = list()
        is_fsin = RUSSIA.get_office(
            src_doc.calculated_office_id).rubric_id == TOfficeRubrics.Gulag

        for raw_section in input_json['persons']:
            section_index += 1
            section_income_year = self.calc_income_year(
                input_json, src_doc, raw_section, section_index)
            if is_fsin:
                office_id = self.get_fsin_office_id(raw_section, src_doc)
            else:
                office_id = src_doc.calculated_office_id
            with transaction.atomic():
                try:
                    prepared_section = TSmartParserSectionJson(
                        section_income_year, office_id, source_document_in_db)
                    prepared_section.read_raw_json(raw_section)

                    if len(prepared_section.vehicles
                           ) > TImporter.max_vehicle_count:
                        TImporter.logger.debug(
                            "ignore section {} because it has too many vehicles ( > {})"
                            .format(prepared_section.section.person_name,
                                    TImporter.max_vehicle_count))
                        continue
                    passport1 = prepared_section.get_passport_components1(
                    ).get_main_section_passport()
                    if self.register_section_passport(passport1):
                        prepared_section.section.tmp_income_set = prepared_section.incomes
                        passport2 = prepared_section.get_passport_components2(
                        ).get_main_section_passport()
                        section_id, is_new = self.permalinks_db_section.get_section_id(
                            passport1, passport2)
                        if is_new:
                            TImporter.logger.debug(
                                "found a new section {}, set section.id to {}".
                                format(
                                    prepared_section.section.
                                    get_permalink_passport(), section_id))

                        main_income = prepared_section.get_main_declarant_income_size(
                        )
                        if main_income is not None and main_income > 0:
                            incomes.append(main_income)
                        prepared_section.save_to_database(section_id)
                        imported_section_years.append(section_income_year)

                except (DatabaseError,
                        TSmartParserSectionJson.SerializerException) as exp:
                    TImporter.logger.error(
                        "Error! cannot import section N {}: {} ".format(
                            section_index, exp))

        if len(imported_section_years) > 0:
            source_document_in_db.min_income_year = min(imported_section_years)
            source_document_in_db.max_income_year = max(imported_section_years)
            source_document_in_db.section_count = len(imported_section_years)
            median_income = 0
            if len(incomes) > 0:
                median_income = median(incomes)
            if median_income >= 2**31:
                median_income = 0
            source_document_in_db.median_income = median_income
            source_document_in_db.save()

        return len(imported_section_years)

    def get_smart_parser_json(self, all_imported_human_jsons, sha256, src_doc):
        response = self.smart_parser_cache_client.retrieve_json_by_sha256(
            sha256)
        if response is None or response == {}:
            return self.get_human_smart_parser_json(src_doc,
                                                    all_imported_human_jsons)
        else:
            return response

    def import_office(self, office_id):
        if self.args.get('rubric_id') is not None and RUSSIA.get_office(
                office_id).rubric_id != self.args.get('rubric_id'):
            return

        all_imported_human_jsons = set()
        max_doc_id = 2**32
        ordered_documents = list()
        for sha256 in self.office_to_source_documents[office_id]:
            doc_id = self.permalinks_db_source_document.get_old_source_doc_id_by_sha256(
                sha256)
            if doc_id is None:
                doc_id = max_doc_id
            ordered_documents.append((doc_id, sha256))
        ordered_documents.sort()
        TImporter.logger.debug("import office {} document count = {}".format(
            office_id, len(ordered_documents)))

        for _, sha256 in ordered_documents:
            src_doc = self.dlrobot_human.get_document(sha256)
            assert src_doc.calculated_office_id == office_id
            smart_parser_json = self.get_smart_parser_json(
                all_imported_human_jsons, sha256, src_doc)
            doc_file_in_db = self.register_document_in_database(
                sha256, src_doc)
            if smart_parser_json is None:
                self.logger.debug(
                    "file {} has no valid smart parser json, skip it".format(
                        sha256))
            else:
                try:
                    sections_count = self.import_one_smart_parser_json(
                        doc_file_in_db, smart_parser_json, src_doc)
                    TImporter.logger.debug("import {} sections from {}".format(
                        sections_count, sha256))
                except TSmartParserSectionJson.SerializerException as exp:
                    TImporter.logger.error(
                        "Error! cannot import smart parser json for file {}: {} "
                        .format(sha256, exp))

    def distribute_offices_to_processes(self, process_count):
        assert process_count > 1
        cnt = 0
        for office_id in self.office_to_source_documents.keys():
            cnt += 1
            if RUSSIA.get_office(office_id).rubric_id == TOfficeRubrics.Gulag:
                # put all FSIN offices into the first process
                bucket_id = 0
            else:
                if len(self.office_buckets[0]) > cnt / process_count:
                    # if bucket 0 already holds more than its share, send the current office to another bucket
                    bucket_id = cnt % (process_count - 1) + 1
                else:
                    bucket_id = cnt % process_count
            self.office_buckets[bucket_id].append(office_id)
        for i in self.office_buckets.keys():
            self.logger.debug("bucket[{}] size = {}".format(
                i, len(self.office_buckets[i])))

    def process_one_office_bucket_in_subprocess(self, bucket_id):
        self.init_after_fork()
        for office_id in self.office_buckets[bucket_id]:
            try:
                self.import_office(office_id)
                gc.collect()
            except TSmartParserSectionJson.SerializerException as exp:
                TImporter.logger.error(
                    "cannot import bucket id {}, exception: {}".format(
                        bucket_id, exp))
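A toy re-run of the arithmetic in distribute_offices_to_processes, with plain ints instead of offices (the FSIN ids here are hypothetical): bucket 0 takes every FSIN office and only receives regular offices while it holds less than its share.

from collections import defaultdict

process_count = 3
fsin_ids = {0, 1}  # hypothetical FSIN office ids
buckets = defaultdict(list)
cnt = 0
for office_id in range(10):
    cnt += 1
    if office_id in fsin_ids:
        bucket_id = 0  # all FSIN offices go to the first process
    elif len(buckets[0]) > cnt / process_count:
        bucket_id = cnt % (process_count - 1) + 1
    else:
        bucket_id = cnt % process_count
    buckets[bucket_id].append(office_id)
print(dict(buckets))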
Example #13
 def test_regions_nominative_and_dative(self):
     regions = TRussianRegions()
     self.assertEqual(63, regions.get_region_in_nominative_and_dative("по г.Москве").id)
     self.assertEqual(63, regions.get_region_in_nominative_and_dative("по  г.Москве").id)
     self.assertEqual(9, regions.get_region_in_nominative_and_dative("по кабардино-балкарской республике").id)
     self.assertEqual(17, regions.get_region_in_nominative_and_dative("по северной осетии").id)
     self.assertEqual(109, regions.get_region_in_nominative_and_dative("по республике Крым").id)
     self.assertEqual(1, regions.get_region_in_nominative_and_dative("по санкт-петербургу").id)
     self.assertEqual(53, regions.get_region_in_nominative_and_dative("костромская область").id)
     self.assertEqual(69, regions.get_region_in_nominative_and_dative("омская область").id)
     self.assertEqual(None, regions.get_region_in_nominative_and_dative("по московской области   мусор"))
Example #14
class TRossStatData:
    def __init__(self, regions=None):
        self.region_stat = dict()
        self.regions = regions
        if self.regions is None:
            self.regions = TRussianRegions()
        self.file_path = os.path.join(os.path.dirname(__file__),
                                      "data/ross_stat.json")

    def load_from_disk(self):
        with open(self.file_path) as inp:
            for key, years in json.load(inp).items():
                region = self.regions.get_region_by_id(int(key))
                assert region is not None
                region_id = region.id
                if region_id not in self.region_stat:
                    self.region_stat[region_id] = dict()
                for year, stat in years.items():
                    self.region_stat[int(region_id)][int(
                        year)] = TRegionYearInfo.from_json(stat)

    def save_to_disk(self, postfix=""):
        d = dict()
        with open(self.file_path + postfix, "w") as outp:
            for region_id, years in self.region_stat.items():
                d[region_id] = dict()
                for year, info in years.items():
                    d[region_id][year] = info.to_json()
            json.dump(d, outp, indent=3, ensure_ascii=False)

    def check(self, year: int):
        for r in self.regions.iterate_regions():
            if r.id not in self.region_stat:
                raise Exception("region {}, region_id={} is missing".format(
                    r.name, r.id))
            if year not in self.region_stat[r.id]:
                raise Exception(
                    "year {} region {}, region_id={} is missing".format(
                        year, r.name, r.id))

    def get_data(self, region_id, year: int) -> TRegionYearInfo:
        return self.region_stat.get(region_id, dict()).get(year)

    def get_or_create_data(self, region_id, year: int) -> TRegionYearInfo:
        return self.region_stat.get(region_id,
                                    dict()).get(year, TRegionYearInfo())

    def set_data(self, region_id, year: int, info: TRegionYearInfo):
        info.check()
        r = self.region_stat.get(region_id)
        assert r is not None
        r[year] = info

    def get_or_predict_median_salary(self, region_id: int, year: int) -> int:
        d = self.get_data(region_id, year)
        if d is not None and d.median_salary2 is not None:
            return d.median_salary2
        d1 = self.get_data(region_id, year - 1)
        d2 = self.get_data(region_id, year + 1)
        if d1 is not None and d1.median_salary2 is not None and d2 is not None and d2.median_salary2 is not None:
            return int((d1.median_salary2 + d2.median_salary2) / 2)
        return None
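When the requested year has no median_salary2, get_or_predict_median_salary averages the two neighbouring years; the fallback arithmetic in isolation, with hypothetical values:

d1_salary, d2_salary = 30000, 34000  # medians for year - 1 and year + 1
predicted = int((d1_salary + d2_salary) / 2)
assert predicted == 32000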
Example #15
 def __init__(self, *args, **kwargs):
     super(Command, self).__init__(*args, **kwargs)
     self.regions = TRussianRegions()
     self.options = None
Example #16
class TWikiDataMatcher:
    def __init__(self, args):
        self.args = args
        self.logger = setup_logging("wd_by_url")
        self.regions = TRussianRegions()
        self.offices = TOfficeTableInMemory(use_office_types=False)
        self.offices.read_from_local_file()
        self.disclosures_hostnames = defaultdict(set)
        self.disclosures_office_names = defaultdict(set)
        self.build_office_indices()
        self.wd_urls = TWikidataUrlRecords()
        self.wd_urls.read_from_file(self.args.wikidata_info)
        self.wd_region_heads = TWikidataRegionHeads()
        self.wd_region_heads.read_from_file(self.args.wd_region_head_info)

    def build_office_indices(self):
        office: TOfficeInMemory
        self.disclosures_hostnames = defaultdict(set)
        self.disclosures_office_names.clear()
        for office in self.offices.offices.values():
            self.disclosures_office_names[office.name.lower()].add(office)
            site_info: TDeclarationWebSite
            for site_info in office.office_web_sites:
                if site_info.can_communicate():
                    self.disclosures_hostnames[get_web_domain(
                        site_info.url)].add(office)

    def find_wikidata_entry(self, hostname, wd_infos) -> TOfficeInMemory:
        if len(wd_infos) == 1:
            found = self.disclosures_hostnames.get(hostname, list())
            if len(found) == 0:
                self.logger.debug(
                    "cannot find {} in disclosures".format(hostname))
            elif len(found) > 1:
                self.logger.debug("hostname  {} is ambiguous".format(hostname))
            else:
                return list(found)[0], wd_infos[0]
        else:
            found = self.disclosures_hostnames.get(hostname, list())
            if len(found) == 0:
                self.logger.debug(
                    "{} is ambiguous in wikidata, but it also useless since it cannot be found in disclosures"
                    .format(hostname))
                return None
            elif len(found) > 1:
                self.logger.debug(
                    "hostname  {} is ambiguous in wikidata and in disclosures".
                    format(hostname))
            else:
                office: TOfficeInMemory
                office = list(found)[0]
                for w in wd_infos:
                    if w['itemLabel'].lower() == office.name.lower():
                        return office, w
                for w in wd_infos:
                    if w['itemLabel'].lower().startswith(office.name.lower()):
                        return office, w
                for w in wd_infos:
                    if office.name.lower().startswith(w['itemLabel'].lower()):
                        return office, w

                return None

    def set_wikidata_id(self, cause, office, wikidata_id, wikidata_label):
        if wikidata_id.startswith('http://www.wikidata.org/entity/'):
            wikidata_id = wikidata_id[len('http://www.wikidata.org/entity/'):]

        if self.regions.get_region_by_wikidata_id(wikidata_id) is not None:
            self.logger.debug(
                "skip region wikidata set cause={} office.name = {} to wikidata = https://www.wikidata.org/wiki/{} , wikidata.title={}"
                .format(cause, office.name, wikidata_id, wikidata_label))
            return

        if office.wikidata_id is None:
            office.wikidata_id = wikidata_id
            self.logger.debug(
                "set cause={} office.name = {} to wikidata = https://www.wikidata.org/wiki/{} , wikidata.title={}"
                .format(cause, office.name, wikidata_id, wikidata_label))
        elif office.wikidata_id != wikidata_id:
            self.logger.error(
                "office https://disclosures.ru/office/{} {} has  wikidata_id=https://www.wikidata.org/wiki/{}, "
                "but the input file has https://www.wikidata.org/wiki/{}, skip it"
                .format(office.office_id, office.name, office.wikidata_id,
                        wikidata_id))

    def process_offices_urls(self):
        for hostname, wd_infos in self.wd_urls.hostnames.items():
            r = self.find_wikidata_entry(hostname, wd_infos)
            if r is not None:
                office, wd_info = r
                self.set_wikidata_id(hostname, office, wd_info["item"],
                                     wd_info["itemLabel"])

    def process_offices_region_heads(self):
        for name, wd_infos in self.wd_region_heads.titles.items():
            found = self.disclosures_office_names.get(name)
            if found is None:
                self.logger.error(
                    "region head name {} cannot be found in disclosures".
                    format(name))
            elif len(found) > 1:
                self.logger.error(
                    "region head name {} is ambiguous in disclosures".format(
                        name))
            else:
                office = list(found)[0]
                wd_info = wd_infos[0]
                self.set_wikidata_id(name, office, wd_info["item"],
                                     wd_info["itemLabel"])