Exemple #1
0
 def select(self):
     """Rebuild self.web_sites.offices so it holds only the offices whose
     web site appears in self.get_url_list()."""
     selected = TOfficeTableInMemory()
     for domain in self.get_url_list():
         site: TDeclarationWebSite = self.web_sites.get_web_site(domain)
         selected.add_office(site.parent_office)
     self.web_sites.offices = selected
Exemple #2
0
 def __init__(self):
     """Load regions, per-year region statistics and the office table into memory.

     NOTE(review): this fragment duplicates TRussia.__init__ elsewhere in
     this file — keep the two in sync.
     """
     self.regions = TRussianRegions()
     self.year_stat = dict()
     # only the latest declaration year is preloaded
     for year in [LAST_DECLARATION_YEAR]:
         self.init_one_year_stats(year)
     self.sorted_region_list_for_web_interface = self._build_region_list_for_combo_box()
     self.offices_in_memory = TOfficeTableInMemory()
     self.offices_in_memory.read_from_local_file()
     # the all-Russia FSIN office serves as the fallback for regions without their own
     self.federal_fsin = self.offices_in_memory.fsin_by_region[TRussianRegions.Russia_as_s_whole_region_id]
     assert self.federal_fsin is not None
     # precomputed rubric data: the 2020 snapshot and the current one
     self.calc_data_2020 = TOfficeRubricCalculatedData(os.path.join(os.path.dirname(__file__), "data", "office2020"))
     self.calc_data_current = TOfficeRubricCalculatedData(os.path.join(os.path.dirname(__file__), "data", "office_current"))
 def handle(self, *args, **options):
     """Recalculate rubric_id for every Office row and persist the changes."""
     logger = setup_logging(log_file_name="build_rubric.log")
     hierarchy = TOfficeTableInMemory(use_office_types=False)
     hierarchy.read_from_table(models.Office.objects.all())
     for office in models.Office.objects.all():
         new_rubric = hierarchy.build_office_rubric(logger, office.id)
         # save only when the rubric was computed and actually changed
         if new_rubric is None or new_rubric == office.rubric_id:
             continue
         logger.debug(
             "set office rubric_id from {} to {} for {}".format(
                 get_russian_rubric_str(office.rubric_id),
                 get_russian_rubric_str(new_rubric), office.name))
         office.rubric_id = new_rubric
         office.save()
Exemple #4
0
 def __init__(self, args):
     """Load disclosures offices and wikidata dumps needed for url/name matching."""
     self.args = args
     self.logger = setup_logging("wd_by_url")
     self.regions = TRussianRegions()
     self.offices = TOfficeTableInMemory(use_office_types=False)
     self.offices.read_from_local_file()
     # hostname -> offices and lowercased office name -> offices indices,
     # filled by build_office_indices()
     self.disclosures_hostnames = defaultdict(set)
     self.disclosures_office_names = defaultdict(set)
     self.build_office_indices()
     # wikidata input dumps: entity urls and region head records
     self.wd_urls = TWikidataUrlRecords()
     self.wd_urls.read_from_file(self.args.wikidata_info)
     self.wd_region_heads = TWikidataRegionHeads()
     self.wd_region_heads.read_from_file(self.args.wd_region_head_info)
Exemple #5
0
    def __init__(self):
        """Parse CLI args, set up logging, build the web-site list and init HTTP."""
        self.args = parse_args()
        self.logger = setup_logging(log_file_name=self.args.logfile)
        offices = None
        if self.args.input_offices is not None:
            # an explicit offices file overrides the default office table
            offices = TOfficeTableInMemory()
            offices.read_from_local_file(self.args.input_offices)
        if offices is None:
            self.web_sites = TDeclarationWebSiteList(self.logger)
        else:
            self.web_sites = TDeclarationWebSiteList(self.logger, offices=offices)

        self.temp_dlrobot_project: TRobotProject = None
        THttpRequester.initialize(self.logger)
def main():
    """Load the office table, assign rubrics and write it to args.output_file."""
    args = parse_args()
    logger = setup_logging("set_rubrics")
    table = TOfficeTableInMemory(use_office_types=False)
    table.read_from_local_file()
    table.set_rubrics(logger)
    logger.info("write to {}".format(args.output_file))
    table.write_to_local_file(args.output_file)
Exemple #7
0
def add_offices(apps, schema_editor):
    """Migration helper: wipe Office rows and refill them from the local office file."""
    clear_offices(apps, schema_editor)
    table = TOfficeTableInMemory(use_office_types=False)
    table.read_from_local_file()
    Office = apps.get_model('declarations', 'Office')
    src: TOfficeInMemory
    for src in table.offices.values():
        Office(id=src.office_id,
               name=src.name,
               type_id=src.type_id,
               parent_id=src.parent_id,
               region_id=src.region_id,
               rubric_id=src.rubric_id).save()
Exemple #8
0
    def init_rubric(self):
        """Copy the owning office's rubric onto self.section, re-rubricating
        Municipality sections with an education-looking position to Education."""
        # json_reader.section.rubric_id = source_document_in_db.office.rubric_id does not work
        # maybe we should call source_document_in_db.refresh_from_db
        self.section.rubric_id = RUSSIA.get_office(
            self.section.office.id).rubric_id

        if self.section.rubric_id == TOfficeRubrics.Municipality and \
                TOfficeTableInMemory.convert_municipality_to_education(self.section.position):
            self.section.rubric_id = TOfficeRubrics.Education
Exemple #9
0
 def __init__(self, args):
     """Central dlrobot server init: set up clients, load the task sources,
     bind the server socket and announce the start.

     Order matters: logging first, then the conversion client, the remote-call
     history (limited to the current round), the web-site task list, and only
     then the server socket itself.
     """
     self.register_task_result_error_count = 0
     self.logger = setup_logging(log_file_name=args.log_file_name,
                                 append_mode=True)
     self.conversion_client = TDocConversionClient(
         TDocConversionClient.parse_args([]), self.logger)
     self.args = args
     # only remote calls made after the current round started are loaded
     rounds = TDeclarationRounds(args.round_file)
     self.dlrobot_remote_calls = TRemoteDlrobotCallList(
         logger=self.logger,
         file_name=args.remote_calls_file,
         min_start_time_stamp=rounds.start_time_stamp)
     self.worker_2_running_tasks = defaultdict(list)
     self.worker_2_continuous_failures_count = defaultdict(int)
     offices = TOfficeTableInMemory()
     offices.read_from_local_file(self.args.offices_file)
     self.web_sites_db = TDeclarationWebSiteList(self.logger,
                                                 offices=offices)
     if not os.path.exists(self.args.result_folder):
         os.makedirs(self.args.result_folder)
     self.web_sites_to_process = self.find_projects_to_process()
     self.cloud_id_to_worker_ip = dict()
     self.config = TRobotConfig.read_by_config_type(
         self.args.dlrobot_config_type)
     self.last_remote_call = None  # for testing
     host, port = self.args.server_address.split(":")
     self.logger.debug("start server on {}:{}".format(host, port))
     # presumably a socketserver-style base class: bind (host, port) with the
     # request handler — TODO confirm the base class
     super().__init__((host, int(port)), TDlrobotRequestHandler)
     self.last_service_action_time_stamp = time.time()
     self.service_action_count = 0
     self.decl_sender = TDeclarationSender(
         self.logger, self.args.enable_smart_parser,
         self.args.enable_source_doc_server)
     self.stop_process = False
     if self.args.enable_ip_checking:
         # permit only the local subnet, localhost and disclosures.ru
         self.permitted_hosts = set(
             str(x)
             for x in ipaddress.ip_network('192.168.100.0/24').hosts())
         self.permitted_hosts.add('127.0.0.1')
         self.permitted_hosts.add('95.165.96.61')  # disclosures.ru
     self.logger.debug("init complete")
     self.send_to_telegram("start dlrobot central with {} tasks".format(
         len(self.web_sites_to_process)))
Exemple #10
0
def create_train_pool(logger, file_name):
    """Build a "train" TRawPool from offices that have at least one web site.

    Each accepted record is also dumped to *file_name* as JSON, one object per
    line (JSON Lines). Records without a rubric_id are skipped.
    """
    offices = TOfficeTableInMemory()
    offices.read_from_local_file()
    office: TOfficeInMemory
    pool = TRawPool(logger, "train")
    with open(file_name, "w") as outp:
        for office in offices.offices.values():
            if len(office.office_web_sites) == 0:
                continue
            r = TRawRecord(
                rubric_id=office.rubric_id,
                name_words=split_to_words(office.name),
                # the first site is taken as the office's main site
                web_name_words=split_to_words(office.office_web_sites[0].url),
            )
            if r.rubric_id is None:
                continue
            # bug fix: records were written back-to-back with no separator,
            # which makes the output file unparseable as JSON Lines
            outp.write(json.dumps(r.__dict__, ensure_ascii=False) + "\n")
            pool.add_record(r)
    return pool
Exemple #11
0
def main():
    """Join obsolete per-url web-site records into the office table.

    Every TDeclarationWebSiteObsolete record is attached to its calculated
    office as a TDeclarationWebSite; each office's site list is then sorted by
    reachability and the result is written to args.output_file.
    """
    args = parse_args()
    logger = setup_logging("join_office_and_websites")
    offices = TOfficeTableInMemory(use_office_types=False)
    offices.read_from_local_file()

    web_sites_db = TDeclarationWebSiteList(
        logger,
        TDeclarationWebSiteList.default_input_task_list_path).load_from_disk()
    url_info: TDeclarationWebSiteObsolete
    for url, url_info in web_sites_db.web_sites.items():
        office_id = url_info.calculated_office_id
        office: TOfficeInMemory
        office = offices.offices.get(int(office_id))
        if office is None:
            logger.debug(
                "cannot find office_id={}, url={} no valid urls, deleted office?"
                .format(office_id, url))
            continue
        # default to http when the old record does not store a protocol
        p = url_info.http_protocol if url_info.http_protocol is not None else "http"
        i = TDeclarationWebSite()
        i.url = p + "://" + url
        i.reach_status = url_info.reach_status
        i.comments = url_info.comments
        i.redirect_to = url_info.redirect_to
        i.title = url_info.title
        office.office_web_sites.append(i)
    # NOTE(review): reachable (normal) sites sort AFTER unreachable ones
    # (key 1 vs 0) — confirm downstream code expects the normal site last
    for o in offices.offices.values():
        o.office_web_sites.sort(key=lambda x: 1 if x.reach_status ==
                                TWebSiteReachStatus.normal else 0)
    logger.info("write to {}".format(args.output_file))
    offices.write_to_local_file(args.output_file)
    def __init__(self, logger, offices=None):
        """Index every office web site by site url and by web domain.

        Args:
            logger: logger used for warnings about bad site records.
            offices: an already-loaded TOfficeTableInMemory; when None the
                default office table is read from the local file.

        Raises:
            Exception: if the same site url occurs more than once in the
                office db.
        """
        self.web_sites = dict()
        self.web_sites_to_office = dict()
        self.web_domains_redirects = None
        self.web_domain_to_web_site = defaultdict(list)
        self.logger = logger
        if offices is None:
            self.offices = TOfficeTableInMemory()
            self.offices.read_from_local_file()
        else:
            self.offices = offices
        o: TOfficeInMemory
        error_cnt = 0
        for o in self.offices.offices.values():
            u: TDeclarationWebSite
            for u in o.office_web_sites:
                site_url = get_site_url(u.url)
                # bug fix: the duplicate check was nested inside an identical,
                # redundant condition; one membership test is enough
                if site_url in self.web_sites:
                    exception_msg = "url {} occurs in office db more than one time".format(
                        site_url)
                    error_cnt += 1
                self.web_sites[site_url] = u
                self.web_sites_to_office[site_url] = o
                if u.can_communicate() and u.title is None:
                    self.logger.error(
                        "url={} has no title, ML model predict office needs titles to work properly"
                        .format(u.url))
        if error_cnt > 0:
            raise Exception(exception_msg +
                            " and {} other equal urls".format(error_cnt))

        self.build_web_domains_redirects()
        # rebuild the domain index from the final, deduplicated url set
        self.web_domain_to_web_site.clear()
        for k, v in self.web_sites.items():
            self.web_domain_to_web_site[
                TDeclarationWebSiteList.site_url_to_web_domain(k)].append(
                    get_site_url(k))
Exemple #13
0
 def split(self):
     """Split the office table into self.args.split_parts chunk files
     ("chunk_offices_<i>.txt"), each holding roughly an equal share of offices."""
     parts_count = self.args.split_parts
     offices = list(self.web_sites.offices.offices.values())
     # bug fix: guard against chunk_size == 0 (parts_count > len(offices)),
     # which made range() raise "range() arg 3 must not be zero"
     chunk_size = max(1, int(len(offices) / parts_count))
     chunk_id = 0
     cnt = 0
     for start in range(0, len(offices), chunk_size):
         chunk_id += 1
         chunk = TOfficeTableInMemory()
         for office in offices[start:start + chunk_size]:
             chunk.add_office(office)
         file_path = "chunk_offices_{}.txt".format(chunk_id)
         chunk.write_to_local_file(file_path)
         cnt += len(chunk.offices)
     # every office must land in exactly one chunk
     assert cnt == len(offices)
Exemple #14
0
def set_rubric(document_id):
    """Copy the owning office's rubric_id onto every section of one source document.

    Municipality sections whose position text looks educational are
    re-rubricated to Education. Runs in a worker subprocess: the first call
    re-opens the django db connection, since connections cannot be shared
    across forks.
    """
    document_id = document_id[0]
    global FIRST_CALL_SET_RUBRIC_IN_SUBPROCESS
    if FIRST_CALL_SET_RUBRIC_IN_SUBPROCESS:
        from django.db import connection
        connection.connect()
        FIRST_CALL_SET_RUBRIC_IN_SUBPROCESS = False
    src_doc = models.Source_Document.objects.get(id=document_id)
    with transaction.atomic():
        for section in src_doc.section_set.all():
            if section.rubric_id is not None and section.rubric_id != src_doc.office.rubric_id:
                sys.stdout.write('set rubric {} to section {}\n'.format(
                    src_doc.office.rubric_id, section.id))

            section.rubric_id = src_doc.office.rubric_id
            if section.position is not None and section.rubric_id == TOfficeRubrics.Municipality:
                res = TOfficeTableInMemory.convert_municipality_to_education(
                    section.position)
                if res:
                    # bug fix: the format string had two placeholders for three
                    # arguments, so section.position was silently dropped
                    sys.stdout.write('{} {} {}\n'.format(res, section.id,
                                                         section.position))
                    section.rubric_id = TOfficeRubrics.Education
            section.save()
Exemple #15
0
def main():
    """Assign region_id / wikidata_id to offices listed in args.input_file.

    Input lines are "office_id<TAB>name". The region is resolved first by
    office name, then by the office's known web site urls; the updated office
    table is written to args.output_file.
    """
    args = parse_args()
    logger = setup_logging("calc_region_from_wd")
    regions = TRussianRegions()
    offices = TOfficeTableInMemory(use_office_types=False)
    offices.read_from_local_file()
    wd = TWikidataRecords(regions)
    wd.read_from_file(args.wikidata_info)

    web_sites_db = TDeclarationWebSiteList(logger,
                                           TDeclarationWebSiteList.default_input_task_list_path).load_from_disk()
    office_to_urls = web_sites_db.build_office_to_main_website(take_abandoned=True)
    with open(args.input_file) as inp:
        for l in inp:
            office_id, name = l.strip().split("\t")
            office = offices.offices.get(int(office_id))
            if office is None:
                # bug fix: the message placeholders were never filled in
                logger.debug("cannot find office_id={}, name={} no valid urls, deleted office?".format(
                    office_id, name))
                continue

            wikidata_id, region = wd.get_region_by_name(name)
            if wikidata_id is not None:
                cause = "name"
            else:
                urls = office_to_urls.get(int(office_id), [])
                if len(urls) == 0:
                    # bug fix: the message placeholders were never filled in
                    logger.debug("office_id={}, name={} no valid urls, delete office?".format(
                        office_id, name))
                    continue
                for url in urls:
                    wikidata_id, region = wd.get_region_by_url(name, url)
                    if wikidata_id is not None:
                        cause = "url"
                        break

            if region is None:
                logger.error(
                    "office_id={}, name={} cannot recognize region".format(office_id, name))
            else:
                logger.debug("set region {} to {} {} by {} ".format(region.name, office_id, name, cause))
                office.region_id = region.id
                office.wikidata_id = wikidata_id
    logger.info("write to {}".format(args.output_file))
    offices.write_to_local_file(args.output_file)
Exemple #16
0
def main():
    """Set office region_id from a Yandex-geocoder address dump.

    args.input_file lines are "office_id<TAB>name<TAB>yandex_json"; the
    updated office table is written to args.output_file.
    """
    args = parse_args()
    logger = setup_logging("calc_region")
    regions = TRussianRegions()
    offices = TOfficeTableInMemory(use_office_types=False)
    offices.read_from_local_file()

    with open(args.input_file) as inp:
        for l in inp:
            office_id, name, yandex_info = l.strip().split("\t")
            address = json.loads(yandex_info).get('address', '')
            region_id = regions.calc_region_by_address(address)
            if region_id is None:
                logger.error("cannot recognize region for {}".format(address))
                continue
            office = offices.offices.get(int(office_id))
            if office is None:
                # bug fix: an unknown/deleted office_id used to crash with
                # AttributeError on office.region_id
                logger.error("cannot find office_id={}, skip it".format(office_id))
                continue
            logger.debug(
                "office_id={}, change region_id={} to region_id={}".format(
                    office_id, office.region_id, region_id))
            office.region_id = region_id
            office.address = address
    logger.info("write to {}".format(args.output_file))
    offices.write_to_local_file(args.output_file)
Exemple #17
0
class TRussia:
    """In-memory aggregate of Russia-wide reference data: regions, per-year
    region statistics, the office table and rubric calculation snapshots."""

    def __init__(self):
        self.regions = TRussianRegions()
        self.year_stat = dict()
        # only the latest declaration year is preloaded
        for year in [LAST_DECLARATION_YEAR]:
            self.init_one_year_stats(year)
        self.sorted_region_list_for_web_interface = self._build_region_list_for_combo_box()
        self.offices_in_memory = TOfficeTableInMemory()
        self.offices_in_memory.read_from_local_file()
        # the all-Russia FSIN office serves as the fallback for regions without their own
        self.federal_fsin = self.offices_in_memory.fsin_by_region[TRussianRegions.Russia_as_s_whole_region_id]
        assert self.federal_fsin is not None
        # precomputed rubric data: the 2020 snapshot and the current one
        self.calc_data_2020 = TOfficeRubricCalculatedData(os.path.join(os.path.dirname(__file__), "data", "office2020"))
        self.calc_data_current = TOfficeRubricCalculatedData(os.path.join(os.path.dirname(__file__), "data", "office_current"))


    def get_office(self, office_id) -> TOfficeInMemory:
        """Return the office record for *office_id*."""
        return self.offices_in_memory.get_office_by_id(office_id)

    def get_fsin_by_region(self, region_id) -> TOfficeInMemory:
        """Return the regional FSIN office, falling back to the federal one."""
        return self.offices_in_memory.fsin_by_region.get(region_id, self.federal_fsin)

    def iterate_offices(self) -> TOfficeInMemory:
        """Yield every office record."""
        for office in self.offices_in_memory.offices.values():
            yield office

    def iterate_offices_ids(self):
        """Yield every office id."""
        for office_id in self.offices_in_memory.offices.keys():
            yield office_id

    def init_one_year_stats(self, year):
        """Load region statistics for *year* and, for the last declaration
        year, attach them to the region objects."""
        s = TAllRegionStatsForOneYear(year, regions=self.regions)
        s.load_from_disk()
        s.build_correlation_matrix()
        self.year_stat[year] = s
        if LAST_DECLARATION_YEAR == year:
            for r in self.regions.regions:
                if r.id == TRussianRegions.Russia_as_s_whole_region_id:
                    # whole-country pseudo-region: take the most recent known
                    # median salary and population values
                    last_sala = max(RUSSIA_MEDIAN_SALARY.items(), key=operator.itemgetter(0))[1]
                    last_popul = max(RUSSIA_POPULATION.items(), key=operator.itemgetter(0))[1]
                    r.set_stat_data(TRegionYearStats(r.id, r.name, citizen_month_median_salary=last_sala,
                                     population=last_popul))
                else:
                    r.set_stat_data(s.get_region_info(r.id))

    # years are not contiguous but are ordered ascending by year
    def get_average_nominal_incomes(self, year_incomes) -> TIncomeCompare:
        """Compare a declarant's income growth with the all-Russia average.

        Incomes that are zero/None, below 12 monthly MROTs or outside the
        Rosstat reference years are ignored; returns None when fewer than two
        usable incomes from distinct years remain.
        """
        if len(year_incomes) <= 1:
            return None
        first_income = None
        last_income = None
        for year_income in year_incomes:
            if year_income.income == 0 or year_income.income is None:
                continue
            if year_income.year not in MROT:
                continue
            # ignore incomes below the annual minimum wage
            if year_income.income < 12*MROT[year_income.year]:
                continue
            if year_income.year in ROSSTAT_ALL_RUSSIA_AVERAGE_MONTH_INCOME:
                if first_income is None:
                    first_income = year_income
                last_income = year_income
        if first_income is None or first_income == last_income:
            return None
        if first_income.year == last_income.year:
            return None
        declarant_growth = TYearIncome.get_growth_rate(first_income.income, last_income.income)
        population_growth = TYearIncome.get_growth_rate(ROSSTAT_ALL_RUSSIA_AVERAGE_MONTH_INCOME[first_income.year],
                                                        ROSSTAT_ALL_RUSSIA_AVERAGE_MONTH_INCOME[last_income.year])
        return TIncomeCompare(population_growth, declarant_growth, first_income.year, last_income.year)

    def compare_to_all_russia_average_month_income(self, year: int, month_income):
        """Return month_income / all-Russia average for *year*, rounded to 2
        decimals, or None when no average is known for that year."""
        i = ROSSTAT_ALL_RUSSIA_AVERAGE_MONTH_INCOME.get(year)
        if i is None:
            return None
        return round(float(month_income) / float(i), 2)

    def get_mrot(self, year: int):
        """Return the minimum wage (MROT) for *year*, or None if unknown."""
        return MROT.get(year)

    def _build_region_list_for_combo_box(self):
        """Return (id, name) pairs sorted by name for the web combo box; names
        are truncated to 33 characters and a blank entry comes first."""
        lst = list()
        lst.append(('', ''))
        for r in self.regions.regions:
            name = r.name
            if len(name) > 33:
                name = name[:33]
            lst.append((r.id, name))
        lst.sort(key=operator.itemgetter(1))
        return lst
Exemple #18
0
class TWikiDataMatcher:
    """Assign wikidata ids to disclosures offices.

    Matching goes two ways: by web site hostname (process_offices_urls) and by
    region head name (process_offices_region_heads).
    """

    def __init__(self, args):
        self.args = args
        self.logger = setup_logging("wd_by_url")
        self.regions = TRussianRegions()
        self.offices = TOfficeTableInMemory(use_office_types=False)
        self.offices.read_from_local_file()
        # indices over the office table, filled by build_office_indices()
        self.disclosures_hostnames = defaultdict(set)
        self.disclosures_office_names = defaultdict(set)
        self.build_office_indices()
        # wikidata input dumps: entity urls and region head records
        self.wd_urls = TWikidataUrlRecords()
        self.wd_urls.read_from_file(self.args.wikidata_info)
        self.wd_region_heads = TWikidataRegionHeads()
        self.wd_region_heads.read_from_file(self.args.wd_region_head_info)

    def build_office_indices(self):
        """(Re)build hostname -> offices and lowercased name -> offices maps."""
        office: TOfficeInMemory
        self.disclosures_hostnames = defaultdict(set)
        self.disclosures_office_names.clear()
        for office in self.offices.offices.values():
            self.disclosures_office_names[office.name.lower()].add(office)
            site_info: TDeclarationWebSite
            for site_info in office.office_web_sites:
                # only sites that can still communicate take part in matching
                if site_info.can_communicate():
                    self.disclosures_hostnames[get_web_domain(
                        site_info.url)].add(office)

    def find_wikidata_entry(self, hostname, wd_infos) -> TOfficeInMemory:
        """Return an (office, wd_info) pair for *hostname*, or None.

        A unique wikidata candidate matches a unique disclosures office
        directly; with several candidates the office name is compared with
        the wikidata labels: exact match first, then either-way prefix match.
        """
        if len(wd_infos) == 1:
            found = self.disclosures_hostnames.get(hostname, list())
            if len(found) == 0:
                self.logger.debug(
                    "cannot find {} in disclosures".format(hostname))
            elif len(found) > 1:
                self.logger.debug("hostname  {} is ambiguous".format(hostname))
            else:
                return list(found)[0], wd_infos[0]
        else:
            found = self.disclosures_hostnames.get(hostname, list())
            if len(found) == 0:
                self.logger.debug(
                    "{} is ambiguous in wikidata, but it also useless since it cannot be found in disclosures"
                    .format(hostname))
                return None
            elif len(found) > 1:
                self.logger.debug(
                    "hostname  {} is ambiguous in wikidata and in disclosures".
                    format(hostname))
            else:
                office: TOfficeInMemory
                office = list(found)[0]
                # exact label match has priority over prefix matches
                for w in wd_infos:
                    if w['itemLabel'].lower() == office.name.lower():
                        return office, w
                for w in wd_infos:
                    if w['itemLabel'].lower().startswith(office.name.lower()):
                        return office, w
                for w in wd_infos:
                    if office.name.lower().startswith(w['itemLabel'].lower()):
                        return office, w

                return None

    def set_wikidata_id(self, cause, office, wikidata_id, wikidata_label):
        """Store *wikidata_id* on *office* unless it denotes a region or
        contradicts an id that is already set."""
        # accept full entity urls as well as bare Q-ids
        if wikidata_id.startswith('http://www.wikidata.org/entity/'):
            wikidata_id = wikidata_id[len('http://www.wikidata.org/entity/'):]

        # region entities must not be attached to offices
        if self.regions.get_region_by_wikidata_id(wikidata_id) is not None:
            self.logger.debug(
                "skip region wikidata set cause={} office.name = {} to wikidata = https://www.wikidata.org/wiki/{} , wikidata.title={}"
                .format(cause, office.name, wikidata_id, wikidata_label))
            return

        if office.wikidata_id is None:
            office.wikidata_id = wikidata_id
            self.logger.debug(
                "set cause={} office.name = {} to wikidata = https://www.wikidata.org/wiki/{} , wikidata.title={}"
                .format(cause, office.name, wikidata_id, wikidata_label))
        elif office.wikidata_id != wikidata_id:
            self.logger.error(
                "office https://disclosures.ru/office/{} {} has  wikidata_id=https://www.wikidata.org/wiki/{}, "
                "but the input file has https://www.wikidata.org/wiki/{}, skip it"
                .format(office.office_id, office.name, office.wikidata_id,
                        wikidata_id))

    def process_offices_urls(self):
        """Match offices with wikidata entities by web site hostname."""
        for hostname, wd_infos in self.wd_urls.hostnames.items():
            r = self.find_wikidata_entry(hostname, wd_infos)
            if r is not None:
                office, wd_info = r
                self.set_wikidata_id(hostname, office, wd_info["item"],
                                     wd_info["itemLabel"])

    def process_offices_region_heads(self):
        """Match offices with wikidata entities by region head name."""
        for name, wd_infos in self.wd_region_heads.titles.items():
            found = self.disclosures_office_names.get(name)
            if found is None:
                self.logger.error(
                    "region head name {} cannot be found in disclosures".
                    format(name))
            elif len(found) > 1:
                self.logger.error(
                    "region head name {} is ambiguous in disclosures".format(
                        name))
            else:
                office = list(found)[0]
                wd_info = wd_infos[0]
                self.set_wikidata_id(name, office, wd_info["item"],
                                     wd_info["itemLabel"])
class TDeclarationWebSiteList:
    disclosures_office_start_id = 20000

    def __init__(self, logger, offices=None):
        """Index every office web site by site url and by web domain.

        Args:
            logger: logger used for warnings about bad site records.
            offices: an already-loaded TOfficeTableInMemory; when None the
                default office table is read from the local file.

        Raises:
            Exception: if the same site url occurs more than once in the
                office db.
        """
        self.web_sites = dict()
        self.web_sites_to_office = dict()
        self.web_domains_redirects = None
        self.web_domain_to_web_site = defaultdict(list)
        self.logger = logger
        if offices is None:
            self.offices = TOfficeTableInMemory()
            self.offices.read_from_local_file()
        else:
            self.offices = offices
        o: TOfficeInMemory
        error_cnt = 0
        for o in self.offices.offices.values():
            u: TDeclarationWebSite
            for u in o.office_web_sites:
                site_url = get_site_url(u.url)
                # bug fix: the duplicate check was nested inside an identical,
                # redundant condition; one membership test is enough
                if site_url in self.web_sites:
                    exception_msg = "url {} occurs in office db more than one time".format(
                        site_url)
                    error_cnt += 1
                self.web_sites[site_url] = u
                self.web_sites_to_office[site_url] = o
                if u.can_communicate() and u.title is None:
                    self.logger.error(
                        "url={} has no title, ML model predict office needs titles to work properly"
                        .format(u.url))
        if error_cnt > 0:
            raise Exception(exception_msg +
                            " and {} other equal urls".format(error_cnt))

        self.build_web_domains_redirects()
        # rebuild the domain index from the final, deduplicated url set
        self.web_domain_to_web_site.clear()
        for k, v in self.web_sites.items():
            self.web_domain_to_web_site[
                TDeclarationWebSiteList.site_url_to_web_domain(k)].append(
                    get_site_url(k))

    @staticmethod
    def site_url_to_web_domain(site_url):
        """Return the hostname part of *site_url*."""
        parts = urlsplit_pro(site_url)
        return parts.hostname

    def build_web_domains_redirects(self):
        """Fill self.web_domains_redirects with a symmetric hostname mirror
        map derived from the sites' redirect_to records."""
        self.web_domains_redirects = defaultdict(set)
        for site_url, site_info in self.web_sites.items():
            if site_info.redirect_to is None:
                continue
            src_host = urlsplit_pro(site_url).hostname
            dst_host = urlsplit_pro(site_info.redirect_to).hostname
            if src_host != dst_host:
                # record the link in both directions
                self.web_domains_redirects[src_host].add(dst_host)
                self.web_domains_redirects[dst_host].add(src_host)

    def get_mirrors(self, d: str):
        """Return the set of hostnames linked to *d* by redirects (empty if none)."""
        mirrors = self.web_domains_redirects.get(d)
        return mirrors if mirrors is not None else set()

    def get_sites_by_web_domain(self, web_domain: str):
        """Return the site urls registered for *web_domain*; a 'www.'-prefixed
        domain falls back to the bare domain, otherwise an empty list."""
        sites = self.web_domain_to_web_site.get(web_domain)
        if sites is not None:
            return sites
        if web_domain.startswith('www.'):
            return self.web_domain_to_web_site[web_domain[4:]]
        return list()

    def get_first_site_by_web_domain(self,
                                     web_domain: str) -> TDeclarationWebSite:
        """Return the site record of the first url registered for
        *web_domain*, or None; *web_domain* must be a bare hostname."""
        if web_domain is None:
            raise Exception("web_domain cannot be None ")
        if '/' in web_domain:
            raise Exception(
                "web_domain ({}) cannot contain '/' ".format(web_domain))

        site_urls = self.get_sites_by_web_domain(web_domain)
        if not site_urls:
            return None
        return self.web_sites.get(site_urls[0])

    def search_url(self, url: str) -> TDeclarationWebSite:
        """Find the site record for *url*; retries once with a 'www.' prefix."""
        host = self.site_url_to_web_domain(url)
        site = self.get_first_site_by_web_domain(host)
        if site is None and not host.startswith('www.'):
            site = self.get_first_site_by_web_domain("www." + host)
        return site

    def is_a_special_domain(self, web_domain):
        """True for the hard-coded special domains declarator.org and rg.ru."""
        return web_domain in ("declarator.org", "rg.ru")

    def get_web_domains(self):
        """Yield every known web domain."""
        yield from self.web_domain_to_web_site

    def get_other_sites_regexp_on_the_same_web_domain(self, morda_url):
        """Build a compiled regexp matching the sibling sites hosted on
        morda_url's domain, or None when there are no siblings; used to keep
        a crawl from wandering into other projects on the same host."""
        domain = urlsplit_pro(morda_url).hostname
        patterns = [
            "((www.)?{}(/|$))".format(site)
            for site in self.get_sites_by_web_domain(domain)
            if morda_url.find(site) == -1
        ]
        if not patterns:
            return None
        joined = "|".join(patterns)
        self.logger.debug(
            "use regexp {} to prohibit crawling other projects".format(joined))
        return re.compile(joined)

    def get_title_by_web_domain(self, web_domain: str) -> str:
        """Return the stored page title for *web_domain*, or "" if unknown."""
        site = self.get_first_site_by_web_domain(web_domain)
        return "" if site is None or site.title is None else site.title

    def get_office_id_by_web_domain(self,
                                    web_domain: str,
                                    unknown_office_id=-1) -> str:
        """Return the owning office id for *web_domain*, or
        *unknown_office_id* when the domain is unknown or has no title."""
        site = self.get_first_site_by_web_domain(web_domain)
        if site is None or site.title is None:
            return unknown_office_id
        return site.parent_office.office_id

    def has_web_site(self, site_url):
        """True when *site_url* is a known site."""
        known = self.web_sites
        return site_url in known

    def get_web_site(self, site_url) -> TDeclarationWebSite:
        """Return the site record for *site_url*, or None if unknown."""
        return self.web_sites.get(site_url, None)

    def get_office(self, site_url) -> TOfficeInMemory:
        """Return the office that owns *site_url*, or None if unknown."""
        return self.web_sites_to_office.get(site_url, None)

    def check_valid(self, logger, fail_fast=True):
        """Validate the site db: communicating sites must have an http(s) url
        and every redirect target must itself be a known site.

        Returns True when no errors were found; with fail_fast the first
        error aborts the scan and returns False.

        NOTE(review): errors go to the *logger* argument while the final
        count goes to self.logger — confirm this mix is intentional.
        """
        cnt = 0
        errors = 0
        for site_url, site_info in self.web_sites.items():
            cnt += 1
            if TWebSiteReachStatus.can_communicate(site_info.reach_status):
                if not site_info.url.startswith('http'):
                    errors += 1
                    logger.error("{} has no protocol".format(site_url))
                    if fail_fast:
                        return False
            if site_info.redirect_to is not None:
                if not self.has_web_site(get_site_url(site_info.redirect_to)):
                    errors += 1
                    logger.error("{} has missing redirect {}".format(
                        site_url, site_info.redirect_to))
                    if fail_fast:
                        return False
        self.logger.info("checked {} sites".format(cnt))
        return errors == 0