Example #1
 def __init__(self, logger, file_path):
     self.index_file_path = file_path
     self.logger = logger
     self.office_name_bigrams = None
     self.office_name_unigrams = None
     self.office_squeezes = None
     self.web_domains = None
     self.office_id_2_ml_office_id = None
     self.ml_office_id_2_office_id = None
     self.web_sites = TDeclarationWebSiteList(self.logger)
     self.regions = TRussianRegions()
Example #2
    def __init__(self):
        self.args = parse_args()
        self.logger = setup_logging(log_file_name=self.args.logfile)
        if self.args.input_offices is not None:
            offices = TOfficeTableInMemory()
            offices.read_from_local_file(self.args.input_offices)
            self.web_sites = TDeclarationWebSiteList(self.logger,
                                                     offices=offices)
        else:
            self.web_sites = TDeclarationWebSiteList(self.logger)

        self.temp_dlrobot_project: TRobotProject
        self.temp_dlrobot_project = None
        THttpRequester.initialize(self.logger)
Example #3
def main():
    args = parse_args()
    logger = setup_logging("join_office_and_websites")
    offices = TOfficeTableInMemory(use_office_types=False)
    offices.read_from_local_file()

    web_sites_db = TDeclarationWebSiteList(
        logger,
        TDeclarationWebSiteList.default_input_task_list_path).load_from_disk()
    url_info: TDeclarationWebSiteObsolete
    for url, url_info in web_sites_db.web_sites.items():
        office_id = url_info.calculated_office_id
        office: TOfficeInMemory
        office = offices.offices.get(int(office_id))
        if office is None:
            logger.debug(
                "cannot find office_id={}, url={} no valid urls, deleted office?"
                .format(office_id, url))
            continue
        p = url_info.http_protocol if url_info.http_protocol is not None else "http"
        i = TDeclarationWebSite()
        i.url = p + "://" + url
        i.reach_status = url_info.reach_status
        i.comments = url_info.comments
        i.redirect_to = url_info.redirect_to
        i.title = url_info.title
        office.office_web_sites.append(i)
    for o in offices.offices.values():
        o.office_web_sites.sort(key=lambda x: 1 if x.reach_status ==
                                TWebSiteReachStatus.normal else 0)
    logger.info("write to {}".format(args.output_file))
    offices.write_to_local_file(args.output_file)
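
Across these examples the recurring pattern is the same: build a TDeclarationWebSiteList from a logger (optionally with a preloaded TOfficeTableInMemory), load it from disk, then look sites up. A minimal sketch of that pattern, assuming only the constructor and lookup methods that appear on this page (the site URL is hypothetical):

logger = setup_logging("web_site_list_demo")
web_sites = TDeclarationWebSiteList(
    logger, TDeclarationWebSiteList.default_input_task_list_path).load_from_disk()
site_info = web_sites.get_web_site("example.gov.ru")  # hypothetical site URL
if site_info is not None:
    print(site_info.parent_office.name)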
Example #4
 def __init__(self, logger, web_sites=None):
     self.logger = logger
     if web_sites is not None:
         self.web_sites = web_sites
     else:
         self.web_sites = TDeclarationWebSiteList(logger,
                                                  RUSSIA.offices_in_memory)
Example #5
 def print_predicted_as_external(self):
     web_sites = TDeclarationWebSiteList(logger=self.logger,
                                         offices=RUSSIA.offices_in_memory)
     for key, src_doc in self.dlrobot_human.get_all_documents():
         if src_doc.calculated_office_id is None:
             continue
         urls = set(r.get_site_url() for r in src_doc.web_references)
         if len(urls) != 1:
             continue
         src_doc_url = list(urls)[0]
         if src_doc_url == "service.nalog.ru":
             continue
         office = RUSSIA.offices_in_memory.get_office_by_id(
             src_doc.calculated_office_id)
         u: TDeclarationWebSite
         found = False
         origin_hostname = urlsplit_pro(src_doc_url).hostname
         if web_sites.is_a_special_domain(origin_hostname):
             continue
         for u in office.office_web_sites:
             if urlsplit_pro(u.url).hostname == origin_hostname:
                 found = True
                 break
         if found:
             continue
         ww = web_sites.search_url(src_doc_url)
         if ww is None:
             self.logger.error(
                 "cannot find url {} by web domain in offices.txt".format(
                     src_doc_url))
             continue
         r = {
             "sha256": key,
             "predicted_office": {
                 "id": office.office_id,
                 "name": office.name
             },
             "url_host_office": {
                 "id": ww.parent_office.office_id,
                 "name": ww.parent_office.name
             },
             "url": src_doc_url,
             "title": src_doc.get_doc_title()
         }
         print(json.dumps(r, indent=4, ensure_ascii=False))
Example #6
 def __init__(self, args):
     self.logger = setup_logging(log_file_name="predict_office.log")
     self.dlrobot_human_path = args.dlrobot_human_path
     self.dlrobot_human = TDlrobotHumanFileDBM(self.dlrobot_human_path)
     self.dlrobot_human.open_write_mode()
     self.enable_ml = args.enable_ml
     sp_args = TSmartParserCacheClient.parse_args([])
     self.smart_parser_server_client = TSmartParserCacheClient(sp_args, self.logger)
     model_path = args.office_model_path
     self.max_failures_count = args.max_failures_count
     assert (os.path.exists(model_path))
     bigrams_path = os.path.join(model_path, "office_ngrams.txt")
     ml_model_path = os.path.join(model_path, "model")
     self.office_ml_model = TTensorFlowOfficeModel(self.logger, bigrams_path, ml_model_path, create_model=False)
     self.regional_tax_offices = self.build_regional_tax_offices()
     self.web_sites = TDeclarationWebSiteList(self.logger, RUSSIA.offices_in_memory)
     self.title_parser = TOfficeFromTitle(self.logger, web_sites=self.web_sites)
     self.src_doc_to_rule_results = dict()
Example #7
 def __init__(self, args):
     self.args = args
     self.logger = setup_logging(log_file_name="join_human_and_dlrobot.log",
                                 append_mode=True)
     self.output_dlrobot_human = TDlrobotHumanFileDBM(args.output_json)
     self.output_dlrobot_human.create_db()
     self.old_files_with_office_count = 0
     self.web_sites_db = TDeclarationWebSiteList(self.logger)
     self.offices = self.web_sites_db.offices
     self.dlrobot_config = TRobotConfig.read_by_config_type("prod")
Example #8
def main():
    args = parse_args()
    logger = setup_logging("calc_region_from_wd")
    regions = TRussianRegions()
    offices = TOfficeTableInMemory(use_office_types=False)
    offices.read_from_local_file()
    wd = TWikidataRecords(regions)
    wd.read_from_file(args.wikidata_info)

    web_sites_db = TDeclarationWebSiteList(logger,
                                           TDeclarationWebSiteList.default_input_task_list_path).load_from_disk()
    office_to_urls = web_sites_db.build_office_to_main_website(take_abandoned=True)
    with open(args.input_file) as inp:
        for l in inp:
            office_id, name = l.strip().split("\t")
            office = offices.offices.get(int(office_id))
            if office is None:
                logger.debug("cannot find office_id={}, name={} no valid urls, deleted office?")
                continue

            cause = None  # guard: get_region_by_name may return a region without a wikidata id
            wikidata_id, region = wd.get_region_by_name(name)
            if wikidata_id is not None:
                cause = "name"
            else:
                urls = office_to_urls.get(int(office_id), [])
                if len(urls) == 0:
                    logger.debug("office_id={}, name={} no valid urls, delete office?")
                    continue
                for url in urls:
                    wikidata_id, region = wd.get_region_by_url(name, url)
                    if wikidata_id is not None:
                        cause = "url"
                        break

            if region is None:
                logger.error(
                    "office_id={}, name={} cannot recognize region".format(office_id, name))
            else:
                logger.debug("set region {} to {} {} by {} ".format(region.name, office_id, name, cause))
                office.region_id = region.id
                office.wikidata_id = wikidata_id
    logger.info("write to {}".format(args.output_file))
    offices.write_to_local_file(args.output_file)
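
parse_args for Example #8 is not shown above; a hypothetical invocation, assuming the flag names mirror the attributes used in main (args.wikidata_info, args.input_file, args.output_file) and that the script file is named after its log:

python calc_region_from_wd.py --wikidata-info wikidata.json --input-file office_names.tsv --output-file offices.txt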
Example #9
 def __init__(self, args):
     self.register_task_result_error_count = 0
     self.logger = setup_logging(log_file_name=args.log_file_name,
                                 append_mode=True)
     self.conversion_client = TDocConversionClient(
         TDocConversionClient.parse_args([]), self.logger)
     self.args = args
     rounds = TDeclarationRounds(args.round_file)
     self.dlrobot_remote_calls = TRemoteDlrobotCallList(
         logger=self.logger,
         file_name=args.remote_calls_file,
         min_start_time_stamp=rounds.start_time_stamp)
     self.worker_2_running_tasks = defaultdict(list)
     self.worker_2_continuous_failures_count = defaultdict(int)
     offices = TOfficeTableInMemory()
     offices.read_from_local_file(self.args.offices_file)
     self.web_sites_db = TDeclarationWebSiteList(self.logger,
                                                 offices=offices)
     if not os.path.exists(self.args.result_folder):
         os.makedirs(self.args.result_folder)
     self.web_sites_to_process = self.find_projects_to_process()
     self.cloud_id_to_worker_ip = dict()
     self.config = TRobotConfig.read_by_config_type(
         self.args.dlrobot_config_type)
     self.last_remote_call = None  # for testing
     host, port = self.args.server_address.split(":")
     self.logger.debug("start server on {}:{}".format(host, port))
     super().__init__((host, int(port)), TDlrobotRequestHandler)
     self.last_service_action_time_stamp = time.time()
     self.service_action_count = 0
     self.decl_sender = TDeclarationSender(
         self.logger, self.args.enable_smart_parser,
         self.args.enable_source_doc_server)
     self.stop_process = False
     if self.args.enable_ip_checking:
         self.permitted_hosts = set(
             str(x)
             for x in ipaddress.ip_network('192.168.100.0/24').hosts())
         self.permitted_hosts.add('127.0.0.1')
         self.permitted_hosts.add('95.165.96.61')  # disclosures.ru
     self.logger.debug("init complete")
     self.send_to_telegram("start dlrobot central with {} tasks".format(
         len(self.web_sites_to_process)))
Example #10
    def get_weak_office_uniq_website(self):
        strong_offices = set()
        for _, _, office_id in self.get_predict_train_entries():
            strong_offices.add(office_id)

        web_sites = TDeclarationWebSiteList(logger=self.logger,
                                            offices=RUSSIA.offices_in_memory)
        processed_websites = set()
        for sha256, src_doc in self.dlrobot_human.get_all_documents():
            web_site = src_doc.get_web_site()
            if web_site in processed_websites or web_site is None or web_site == "":
                continue
            processed_websites.add(web_site)
            site_info = web_sites.search_url(web_site)
            if site_info is None:
                self.logger.error(
                    "cannot find {} in offices.txt".format(web_site))
                continue
            office_id = site_info.parent_office.office_id
            if office_id not in strong_offices:
                yield sha256, src_doc, office_id
Example #11
class TOfficePredictIndex:
    def __init__(self, logger, file_path):
        self.index_file_path = file_path
        self.logger = logger
        self.office_name_bigrams = None
        self.office_name_unigrams = None
        self.office_squeezes = None
        self.web_domains = None
        self.office_id_2_ml_office_id = None
        self.ml_office_id_2_office_id = None
        self.web_sites = TDeclarationWebSiteList(self.logger)
        self.regions = TRussianRegions()

    def get_bigrams_count(self):
        return len(self.office_name_bigrams)

    def get_unigrams_count(self):
        return len(self.office_name_unigrams)

    def get_max_region_id(self):
        return self.regions.max_region_id

    def get_web_domain_index(self, web_domain):
        s = self.web_domains.get(web_domain)
        if s is None:
            return 0
        return s.web_domain_id

    def is_office_child(self, child_id, parent_id):
        return child_id is not None and self.office_squeezes[child_id]['parent_id'] == parent_id

    def is_office_child_or_grandchild(self, child_id, parent_id):
        if self.is_office_child(child_id, parent_id):
            return True
        p = self.office_squeezes[child_id]['parent_id']
        return self.is_office_child(p, parent_id)

    def get_web_domains_count(self):
        return len(self.web_domains)

    def get_web_domain_by_url(self, document_url, site_url):
        # first take the web domain from which the document was downloaded
        web_domain = urlsplit_pro(document_url).hostname
        if self.web_sites.get_first_site_by_web_domain(web_domain) is not None:
            return web_domain
        # if this web domain is unknown, take web domain from site_url
        web_domain = urlsplit_pro(site_url).hostname
        if self.web_sites.get_first_site_by_web_domain(web_domain) is None:
            if not self.web_sites.is_a_special_domain(web_domain):
                self.logger.error(
                    "web domain {} is missing in office.txt".format(site_url))
        return web_domain

    def get_ml_office_id(self, office_id: int):
        return self.office_id_2_ml_office_id.get(office_id)

    def get_office_id_by_ml_office_id(self, ml_office_id: int):
        return self.ml_office_id_2_office_id.get(ml_office_id)

    def get_bigram_id(self, bigram):
        b = self.office_name_bigrams.get(bigram)
        if b is None:
            return None
        return b.ngram_id

    def get_unigram_id(self, gram):
        b = self.office_name_unigrams.get(gram)
        if b is None:
            return None
        return b.ngram_id

    def get_offices_by_bigram(self, bigram):
        b = self.office_name_bigrams.get(bigram)
        if b is None:
            return list()
        return b.office_squeezes

    @staticmethod
    def get_word_stems(text, stem_size=4, add_starter_and_enders=True):
        if add_starter_and_enders:
            yield "^"
        text = text.lower().replace('ё', 'е')
        for word in re.split("[\s,\.;:_\"* ()«»]", text):
            if len(word) == 0:
                continue
            #ignore year
            if word.startswith("20") and len(word) == 4:
                continue
            hyphen_index = word.find('-')
            if hyphen_index > 0:
                if word[hyphen_index -
                        1] == 'о':  #"ямало-ненецкий" не надо разбивать
                    yield word[:stem_size * 2]
                else:
                    w1, w2 = word.split('-', 1)
                    yield w1[:stem_size]  #  split каменск-уральский
                    yield w2[:stem_size]
            else:
                yield word[:stem_size]
        if add_starter_and_enders:
            yield "$"

    @staticmethod
    def get_bigrams(text):
        words = list(TOfficePredictIndex.get_word_stems(text))
        for w1, w2 in zip(words[:-1], words[1:]):
            yield "_".join((w1, w2))

    @staticmethod
    def get_trigrams(text):
        words = list(TOfficePredictIndex.get_word_stems(text))

        for w1, w2, w3 in zip(words[:-2], words[1:-1], words[2:]):
            yield "_".join((w1, w2, w3))

    @staticmethod
    def split_web_domain(web_domain):
        for x in web_domain.split('.'):
            yield x

    def read(self):
        with open(self.index_file_path) as inp:
            js = json.load(inp)
            self.office_name_bigrams = dict((k, TOfficeNgram.from_json(v))
                                            for k, v in js['bigrams'].items())
            self.office_name_unigrams = dict(
                (k, TOfficeNgram.from_json(v))
                for k, v in js['unigrams'].items())
            self.office_squeezes = dict(
                (int(k), v) for k, v in js['offices'].items())
            self.web_domains = dict((k, TOfficeWebDomain.from_json(v))
                                    for k, v in js['web_domains'].items())
            self.office_id_2_ml_office_id = dict(
                (int(k), v) for k, v in js['office_id_2_ml_office_id'].items())
            self.ml_office_id_2_office_id = dict(
                (int(k), v) for k, v in js['ml_office_id_2_office_id'].items())
        self.logger.info("bigrams count = {}".format(self.get_bigrams_count()))

    def write(self):
        self.logger.info("write to {}".format(self.index_file_path))
        with open(self.index_file_path, "w") as outp:
            assert self.office_squeezes is not None
            assert len(self.office_squeezes) > 0
            rec = {
                'bigrams': dict((k, v.to_json()) for k, v in self.office_name_bigrams.items()),
                'unigrams': dict((k, v.to_json()) for k, v in self.office_name_unigrams.items()),
                'offices': self.office_squeezes,
                'web_domains': dict((k, v.to_json()) for k, v in self.web_domains.items()),
                'office_id_2_ml_office_id': self.office_id_2_ml_office_id,
                'ml_office_id_2_office_id': self.ml_office_id_2_office_id,
            }
            json.dump(rec, outp, ensure_ascii=False, indent=4)

    def get_office_name(self, office_id: int):
        return self.office_squeezes[office_id]['name']

    def has_office_squeeze(self, office_id: int):
        return office_id in self.office_squeezes

    def get_office_region(self, office_id: int):
        return self.office_squeezes[office_id]['region']

    def get_region_from_web_site_title(self, site_url: str):
        site_info = self.web_sites.get_web_site(site_url)
        if site_info is not None and site_info.title is not None:
            return self.regions.get_region_all_forms(site_info.title, 0)
        else:
            return 0

    def get_parent_office_from_web_site(self, site_url: str):
        site_info = self.web_sites.get_web_site(site_url)
        if site_info is None:
            self.logger.error(
                " site_url = {} cannot be found in offices.txt".format(
                    site_url))
            return None
        return self.get_ml_office_id(site_info.parent_office.office_id)
Example #12
        if r.result_files_count > 0:
            good.add(url)
        else:
            bad.add(url)

    cnt = 0
    for url in bad:
        if url in good:
            continue
        cnt += 1
        if web_sites.has_web_site(url) and TWebSiteReachStatus.can_communicate(
                web_sites.get_web_site(url).reach_status):
            logger.info("browse {} ...".format(url))
            title = get_html_title_from_url(url)
            output_file.write("{}\t{}\t{}\n".format(
                url, ",".join(statuses.get(url, ["unk"])), title))
        #if cnt > 10:
        #    break


if __name__ == "__main__":
    args = parse_args()
    logger = setup_logging("analyze_remote_calls")
    web_sites = TDeclarationWebSiteList(logger)
    remote_calls = TRemoteDlrobotCall.read_remote_calls_from_file(
        args.input_file)
    with open(args.output_file, "w") as outp:
        if args.action == "print_sites_wo_results":
            print_sites_wo_results(logger, remote_calls, web_sites, outp)
        else:
            raise Exception('unknown action')
Example #13
class TWebSitesManager:
    def __init__(self):
        self.args = parse_args()
        self.logger = setup_logging(log_file_name=self.args.logfile)
        if self.args.input_offices is not None:
            offices = TOfficeTableInMemory()
            offices.read_from_local_file(self.args.input_offices)
            self.web_sites = TDeclarationWebSiteList(self.logger,
                                                     offices=offices)
        else:
            self.web_sites = TDeclarationWebSiteList(self.logger)

        self.temp_dlrobot_project: TRobotProject
        self.temp_dlrobot_project = None
        THttpRequester.initialize(self.logger)

    def check_web_site_filters(self, site_url):
        if site_url.strip() == "":
            return False

        if self.args.filter_regex is not None:
            if re.search(self.args.filter_regex, site_url) is None:
                return False

        site_info = self.web_sites.get_web_site(site_url)
        if site_info is None:
            self.logger.error(
                "skip {}, cannot find this site".format(site_url))
            return False
        else:
            if self.args.take_without_titles:
                return TWebSiteReachStatus.can_communicate(
                    site_info.reach_status) and site_info.title is None
            elif self.args.take_all_web_sites or TWebSiteReachStatus.can_communicate(
                    site_info.reach_status):
                return True
            else:
                self.logger.debug("skip abandoned {}".format(site_url))
                return False

    def read_web_domains_from_file(self):
        self.logger.info("read url list from {}".format(self.args.url_list))
        web_domains = list()
        with open(self.args.url_list) as inp:
            for url in inp:
                url = url.strip(" \r\n")
                if url.startswith('http'):
                    web_domains.append(strip_scheme_and_query(url))
                else:
                    web_domains.append(url)
        return web_domains

    def get_url_list(self, start_selenium=False):
        web_domains = list()
        if self.args.filter_by_source is not None:
            web_domains = list()
            for k in self.web_sites.web_sites.values():
                if k.parent_office.source_id == self.args.filter_by_source:
                    web_domains.append(get_site_url(k.url))
        elif self.args.url_list is not None:
            web_domains = self.read_web_domains_from_file()
        else:
            #take all web domains
            web_domains = list(self.web_sites.web_sites.keys())

        domains_filtered = list(w for w in web_domains
                                if self.check_web_site_filters(w))

        self.logger.info("we are going to process {} web sites".format(
            len(domains_filtered)))

        if start_selenium:
            TDownloadEnv.FILE_CACHE_FOLDER = TDownloadEnv.FILE_CACHE_FOLDER + "_{}_{}".format(
                time.time(), os.getpid())
            self.logger.info("rm {}".format(TDownloadEnv.FILE_CACHE_FOLDER))
            TDownloadEnv.clear_cache_folder()
            project_path = "project.txt"
            TRobotProject.create_project("dummy.ru", project_path)
            with TRobotProject(
                    self.logger, project_path,
                    export_folder="result") as self.temp_dlrobot_project:
                for w in domains_filtered:
                    yield w
                os.unlink(project_path)
        else:
            for w in domains_filtered:
                yield w

    def ban_sites(self):
        cnt = 0
        for url in self.get_url_list(start_selenium=True):
            self.logger.debug("ban {}".format(url))
            self.web_sites.get_web_site(url).ban()
            cnt += 1
        self.logger.info("ban {} web sites".format(cnt))

    def to_utf8(self):
        cnt = 0
        for site_url in self.get_url_list():
            site_info = self.web_sites.get_web_site(site_url)
            if site_info.redirect_to is not None and TUrlUtf8Encode.is_idna_string(
                    site_info.redirect_to):
                site_info.redirect_to = TUrlUtf8Encode.convert_url_from_idna(
                    site_info.redirect_to)
                if site_info.redirect_to == site_url and site_info.reach_status == TWebSiteReachStatus.abandoned:
                    site_info.redirect_to = None
                    site_info.reach_status = TWebSiteReachStatus.normal
                cnt += 1
            if TUrlUtf8Encode.is_idna_string(site_url):
                site_info.url = TUrlUtf8Encode.convert_url_from_idna(site_url)
                cnt += 1
        self.logger.info("{} conversions made".format(cnt))

    def browse_one_url(self, url):
        self.logger.info("check {}".format(url))
        web_site = TWebSiteCrawlSnapshot(self.temp_dlrobot_project,
                                         morda_url=url,
                                         enable_step_init=False)
        web_site.fetch_the_main_page(enable_search_engine=False)
        if TWebSiteReachStatus.can_communicate(web_site.reach_status):
            return web_site
        else:
            self.logger.info("restart selenium, and try again")
            self.temp_dlrobot_project.selenium_driver.restart()
            web_site = TWebSiteCrawlSnapshot(self.temp_dlrobot_project,
                                             morda_url=url,
                                             enable_step_init=False)
            web_site.fetch_the_main_page(enable_search_engine=False)
            if TWebSiteReachStatus.can_communicate(web_site.reach_status):
                return web_site
            else:
                return None

    def get_external_file_name_by_site_url(self, site_url):
        return site_url.strip('/').replace('/', '_') + ".page_source.html"

    def check_alive_one_url(self, site_url, complete_bans, site_info=None):
        site_info: TDeclarationWebSite
        if site_info is None:
            site_info = self.web_sites.get_web_site(site_url)
        web_site = self.browse_one_url(site_url)
        #office = self.web_sites.get_office(site_url)
        office = site_info.parent_office
        if web_site is None:
            self.logger.info("     {} is dead".format(site_url))
            site_info.ban()
            complete_bans.append(site_url)
        else:
            new_site_url = web_site.get_main_url_protocol() + "://" + \
                strip_scheme_and_query(web_site.main_page_url)
            title = web_site.get_title(web_site.main_page_url)
            if strip_scheme_and_query(
                    web_site.main_page_url).strip('/') != site_url.strip('/'):
                self.logger.info(
                    '   {} is alive, but is redirected to {}'.format(
                        site_url, new_site_url))
                new_site_info = None
                for u in office.office_web_sites:
                    if u.url == site_url:
                        u.set_redirect(new_site_url)
                    if u.url == new_site_url:
                        new_site_info = u
                if new_site_info is None:
                    new_site_info = TDeclarationWebSite(url=new_site_url)
                    office.office_web_sites.append(new_site_info)
                new_site_info.set_title(title)
            else:
                self.logger.info("     {} is alive, main_page_url = {}".format(
                    site_url, web_site.main_page_url))
                site_info.set_title(title)

            if web_site.main_page_source.lower().find('коррупц') != -1:
                self.logger.info(
                    "site contains corruption keyword {}".format(site_url))
                site_info.corruption_keyword_in_html = True

            if self.args.main_page_path:
                try:
                    with open(
                            self.get_external_file_name_by_site_url(site_url),
                            "w") as outp:
                        outp.write(web_site.main_page_source)
                except Exception as exp:
                    self.logger.error(
                        "cannot save page html to file for {}: {}".format(site_url, exp))

    def check_alive(self):
        complete_bans = list()
        checked_count = 0
        for site_url in self.get_url_list(start_selenium=True):
            self.check_alive_one_url(site_url, complete_bans)
            checked_count += 1

        self.logger.info("ban {} web sites out of {} sites".format(
            len(complete_bans), checked_count))

    def print_keys(self):
        for web_domain in self.get_url_list():
            print(web_domain)

    def split(self):
        parts_count = self.args.split_parts
        chunk_size = int(len(self.web_sites.offices.offices) / parts_count)
        offices = list(self.web_sites.offices.offices.values())
        chunk_id = 0
        cnt = 0
        for l in range(0, len(offices), chunk_size):
            chunk_id += 1
            o = TOfficeTableInMemory()
            for i in offices[l:l + chunk_size]:
                o.add_office(i)
            file_path = "chunk_offices_{}.txt".format(chunk_id)
            o.write_to_local_file(file_path)
            cnt += len(o.offices)
        assert cnt == len(offices)

    def check(self):
        self.web_sites.check_valid(self.logger, fail_fast=False)

    def redirect_subdomain(self):
        for web_domain in self.get_url_list(start_selenium=True):
            site_info = self.web_sites.get_web_site(web_domain)
            if site_info.redirect_to is None or not web_domain.endswith(
                    site_info.redirect_to):
                continue
            self.browse_one_url(web_domain)

    def create_departments(self):
        o: TOfficeInMemory
        TDownloadEnv.clear_cache_folder()
        project_path = "project.txt"
        TRobotProject.create_project("dummy.ru",
                                     project_path,
                                     web_sites_db=self.web_sites)
        with TRobotProject(
                self.logger, project_path,
                export_folder="result") as self.temp_dlrobot_project:
            for o in self.web_sites.offices.values():
                if o.parent_id == self.args.parent_office_id:
                    self.logger.info("ofiice id = {}, {}".format(
                        o.office_id, o.name))
                    query = self.args.query_template.format(o.name)
                    engine = random.choice(
                        [SearchEngineEnum.GOOGLE, SearchEngineEnum.YANDEX])
                    results = SearchEngine.send_request(
                        engine, query,
                        self.temp_dlrobot_project.selenium_driver)
                    if len(results) == 0:
                        msg = "cannot find results fo query {}".format(query)
                        self.logger.error(msg)
                    else:
                        new_web_site = TDeclarationWebSite(url=results[0])
                        found = False
                        for u in o.office_web_sites:
                            if u.url == new_web_site.url:
                                found = True
                                self.logger.error(
                                    "{} already exists".format(new_web_site.url))
                        if not found:
                            o.office_web_sites.append(new_web_site)
                            # pass a throw-away complete_bans list
                            self.check_alive_one_url(new_web_site.url, list())
                    time.sleep(20)

    def select(self):
        out = TOfficeTableInMemory()
        for web_domain in self.get_url_list():
            site_info: TDeclarationWebSite
            site_info = self.web_sites.get_web_site(web_domain)
            out.add_office(site_info.parent_office)
        self.web_sites.offices = out

    def select_adhoc(self):
        good_web_domains = set(self.read_web_domains_from_file())
        office: TOfficeInMemory
        ban_cnt = 0
        sp_left = 0
        for office in self.web_sites.offices.offices.values():
            if office.is_from_spravochnik():
                w: TDeclarationWebSite

                for w in office.office_web_sites:
                    if not w.can_communicate():
                        continue
                    u = strip_scheme_and_query(w.url)
                    if u in good_web_domains or "{}/".format(
                            u) in good_web_domains:
                        sp_left += 1
                        continue
                    ban_cnt += 1
                    self.logger.debug("ban office_id={}".format(
                        office.office_id))
                    w.ban(TWebSiteReachStatus.unpromising)
        self.logger.info("ban {} sites, left in spravochnik {}".format(
            ban_cnt, sp_left))

    def make_redirects(self):
        with open(self.args.redirect_mapping_path) as inp:
            for l in inp:
                old, new_site_url = l.strip().split()
                if not new_site_url.startswith('http'):
                    raise Exception(
                        "unknown http prefix in  {}".format(new_site_url))
                web_site = self.web_sites.search_url(old)
                if web_site is None:
                    raise Exception("cannot find website {}".format(old))
                web_site.set_redirect(new_site_url)
                new_site_info = TDeclarationWebSite(url=new_site_url)
                web_site.parent_office.office_web_sites.append(new_site_info)

    def get_title_from_local_files(self):
        for site_url in self.get_url_list(start_selenium=False):
            site_info = self.web_sites.get_web_site(site_url)
            file_path = os.path.join(
                "page_source",
                self.get_external_file_name_by_site_url(site_url))
            if os.path.exists(file_path):
                self.logger.info("read {}".format(file_path))
                with open(file_path, "rb") as inp:
                    title = get_html_title(inp.read())
                    site_info.set_title(title)

    def print_web_sites(self):
        site_infos = list()
        for site_url in self.get_url_list(start_selenium=False):
            site_info = self.web_sites.get_web_site(site_url)
            site_info.title = TDeclarationWebSite.clean_title(site_info.title)
            d = site_info.write_to_json()
            d['office_id'] = site_info.parent_office.office_id
            site_infos.append(d)

        print(json.dumps(site_infos, ensure_ascii=False, indent=4))

    def check_mirrors(self):
        offices = set()
        complete_bans = list()
        for site_url in self.get_url_list(start_selenium=True):
            office_info: TOfficeInMemory
            office_info = self.web_sites.get_web_site(site_url).parent_office
            not_abandoned_cnt = 0
            for u in office_info.office_web_sites:
                if u.can_communicate():
                    not_abandoned_cnt += 1
            if not_abandoned_cnt > 1 and office_info.office_web_sites[
                    -1].can_communicate() and office_info not in offices:
                offices.add(office_info)
                for i in range(len(office_info.office_web_sites) - 1):
                    site_info = office_info.office_web_sites[i]
                    if site_info.can_communicate():
                        self.check_alive_one_url(site_info.url,
                                                 complete_bans,
                                                 site_info=site_info)

    def main(self):
        if self.args.action == "ban":
            self.ban_sites()
        elif self.args.action == "to_utf8":
            self.to_utf8()
        elif self.args.action == "check_alive":
            self.check_alive()
        elif self.args.action == "print_keys":
            self.print_keys()
        elif self.args.action == "check":
            self.check()
        elif self.args.action == "redirect_subdomain":
            self.redirect_subdomain()
        elif self.args.action == "create_departments":
            self.create_departments()
        elif self.args.action == "select":
            self.select()
        elif self.args.action == "split":
            self.split()
            return
        elif self.args.action == "make_redirects":
            self.make_redirects()
        elif self.args.action == "get_title_from_local_files":
            self.get_title_from_local_files()
        elif self.args.action == "check_mirrors":
            self.check_mirrors()
        elif self.args.action == "select_adhoc":
            self.select_adhoc()
        elif self.args.action == "print_web_sites":
            self.print_web_sites()
            return
        else:
            raise Exception("unknown action")

        self.logger.info("write to {}".format(self.args.output_file))
        self.web_sites.offices.write_to_local_file(self.args.output_file)
Example #14
 def test_office_website_valid(self):
     logger = setup_logging("test_office_website_valid")
     web_sites = TDeclarationWebSiteList(logger)
     self.assertEqual(True, web_sites.check_valid(logger, fail_fast=True))
Example #15
class TOfficePredictor:
    default_ml_model_path = os.path.join(os.path.dirname(__file__), "../model")

    @staticmethod
    def parse_args(args):
        parser = argparse.ArgumentParser()
        parser.add_argument("--dlrobot-human-path", dest='dlrobot_human_path', required=True)
        parser.add_argument("--office-model-path", dest='office_model_path', required=False,
                            default=TOfficePredictor.default_ml_model_path)
        parser.add_argument("--disable-ml", dest='enable_ml', required=False, default=True,
                            action="store_false")
        parser.add_argument("--max-failures-count", dest='max_failures_count', required=False, default=100,
                            type=int)
        return parser.parse_args(args=args)

    def __init__(self, args):
        self.logger = setup_logging(log_file_name="predict_office.log")
        self.dlrobot_human_path = args.dlrobot_human_path
        self.dlrobot_human = TDlrobotHumanFileDBM(self.dlrobot_human_path)
        self.dlrobot_human.open_write_mode()
        self.enable_ml = args.enable_ml
        sp_args = TSmartParserCacheClient.parse_args([])
        self.smart_parser_server_client = TSmartParserCacheClient(sp_args, self.logger)
        model_path = args.office_model_path
        self.max_failures_count = args.max_failures_count
        assert (os.path.exists(model_path))
        bigrams_path = os.path.join(model_path, "office_ngrams.txt")
        ml_model_path = os.path.join(model_path, "model")
        self.office_ml_model = TTensorFlowOfficeModel(self.logger, bigrams_path, ml_model_path, create_model=False)
        self.regional_tax_offices = self.build_regional_tax_offices()
        self.web_sites = TDeclarationWebSiteList(self.logger, RUSSIA.offices_in_memory)
        self.title_parser = TOfficeFromTitle(self.logger, web_sites=self.web_sites)
        self.src_doc_to_rule_results = dict()

    def build_regional_tax_offices(self):
        o: TOfficeInMemory
        tax_offices = dict()
        for o in RUSSIA.iterate_offices():
            if o.rubric_id == TOfficeRubrics.Tax:
                tax_offices[o.region_id] = o.office_id
        assert len(tax_offices) > 0
        return tax_offices

    def set_office_id(self, sha256, src_doc: TSourceDocument, office_id, method_name: str):
        old_office_id = src_doc.calculated_office_id
        if old_office_id is None or office_id == old_office_id:
            self.logger.debug("set file {} office_id={} ({} )".format(
                sha256, office_id, method_name))
        else:
            self.logger.info("change office_id from {} to {} for file {} , ({})".format( \
                old_office_id, office_id, sha256, method_name))
        src_doc.calculated_office_id = office_id
        self.dlrobot_human.update_source_document(sha256, src_doc)

    def predict_tax_office(self, sha256, src_doc: TSourceDocument):
        web_ref: TWebReference
        for web_ref in src_doc.web_references:
            if web_ref._site_url.endswith("service.nalog.ru"):
                if src_doc.region_id is None:
                    smart_parser_json = self.smart_parser_server_client.retrieve_json_by_sha256(sha256)
                    if smart_parser_json is None:
                        return False
                    props = smart_parser_json.get('document_sheet_props')
                    if props is None or len(props) == 0 or 'url' not in props[0]:
                        return False
                    url = props[0]['url']
                    region_str = url[:url.find('.')]
                    if not region_str.isdigit():
                        return False
                    src_doc.region_id = int(region_str)

                office_id = self.regional_tax_offices.get(src_doc.region_id)
                if office_id is not None:
                    self.set_office_id(sha256, src_doc, office_id, "regional tax office")
                    return True
        return False

    # returns the office id if all web references of the document are ascribed to the same office
    def single_web_site(self, src_doc):
        r: TWebReference
        offices = set()
        for r in src_doc.web_references:
            if r.get_site_url():
                site_info = self.web_sites.search_url(r.get_site_url())
                if site_info is not None:
                    offices.add(site_info.parent_office.office_id)
        if len(offices) == 1:
            return list(offices)[0]
        return None
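    # A sketch of the rule above: if every web reference of the document resolves to
    # web sites of one and the same office, that office id is returned; references
    # spread over different offices (or over unknown sites only) yield None.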

    # Take the first office; that is a very bad solution, done only to make the whole thing work.
    # In the future we hope to get rid of it by adding anchor text analysis or more sophisticated title parsing.
    def predict_by_first_web_site(self, case: TPredictionCase, src_doc):
        r: TWebReference
        min_crawl_epoch = time.time()
        office_id = None
        for r in src_doc.web_references:
            if 0 < r.crawl_epoch < min_crawl_epoch:
                site_info = self.web_sites.search_url(r.get_site_url())
                if site_info is not None:
                    min_crawl_epoch = r.crawl_epoch
                    office_id = site_info.parent_office.office_id
        return office_id
Example #16
class TDlrobotHTTPServer(http.server.HTTPServer):
    max_continuous_failures_count = 7
    PITSTOP_FILE = ".dlrobot_pit_stop"

    @staticmethod
    def parse_args(arg_list):
        parser = argparse.ArgumentParser()
        parser.add_argument(
            "--server-address",
            dest='server_address',
            default=None,
            help=
            "by default read it from environment variable DLROBOT_CENTRAL_SERVER_ADDRESS"
        )
        parser.add_argument("--dlrobot-config-type",
                            dest='dlrobot_config_type',
                            required=False,
                            default="prod",
                            help="can be prod, preliminary or test")
        parser.add_argument("--custom-offices-file",
                            dest='offices_file',
                            required=False)
        parser.add_argument("--log-file-name",
                            dest='log_file_name',
                            required=False,
                            default="dlrobot_central.log")
        parser.add_argument("--remote-calls-file",
                            dest='remote_calls_file',
                            default=None)
        parser.add_argument("--result-folder",
                            dest='result_folder',
                            required=True)
        parser.add_argument("--tries-count",
                            dest='tries_count',
                            required=False,
                            default=2,
                            type=int)
        parser.add_argument("--central-heart-rate",
                            dest='central_heart_rate',
                            required=False,
                            default='60s')
        parser.add_argument(
            "--check-yandex-cloud",
            dest='check_yandex_cloud',
            default=False,
            action='store_true',
            required=False,
            help="check yandex cloud health and restart workstations")
        parser.add_argument(
            "--skip-worker-check",
            dest='skip_worker_check',
            default=False,
            action='store_true',
            required=False,
            help="skip checking that this task was given to this worker")
        parser.add_argument("--enable-ip-checking",
                            dest='enable_ip_checking',
                            default=False,
                            action='store_true',
                            required=False)
        parser.add_argument("--disable-smart-parser-server",
                            dest="enable_smart_parser",
                            default=True,
                            action="store_false",
                            required=False)
        parser.add_argument("--disable-source-doc-server",
                            dest="enable_source_doc_server",
                            default=True,
                            action="store_false",
                            required=False)
        parser.add_argument("--disable-search-engines",
                            dest="enable_search_engines",
                            default=True,
                            action="store_false",
                            required=False)
        parser.add_argument("--disable-telegram",
                            dest="enable_telegram",
                            default=True,
                            required=False,
                            action="store_false")
        parser.add_argument("--disable-pdf-conversion-server-checking",
                            dest="pdf_conversion_server_checking",
                            default=True,
                            required=False,
                            action="store_false")
        parser.add_argument("--web-site-regexp",
                            dest="web_site_regexp",
                            required=False)
        parser.add_argument("--office-source-id",
                            dest="office_source_id",
                            required=False)
        parser.add_argument(
            "--round-file",
            dest="round_file",
            default=TDeclarationRounds.default_dlrobot_round_path)

        args = parser.parse_args(arg_list)
        args.central_heart_rate = convert_timeout_to_seconds(
            args.central_heart_rate)
        if args.server_address is None:
            args.server_address = os.environ['DLROBOT_CENTRAL_SERVER_ADDRESS']
        if args.check_yandex_cloud:
            assert TYandexCloud.get_yc() is not None

        return args

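    # A hypothetical invocation of this server, assuming the module is run as a script;
    # only flags defined in parse_args above are used:
    #   python dlrobot_central.py --server-address 127.0.0.1:8089 \
    #       --result-folder results --dlrobot-config-type test
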
    def __init__(self, args):
        self.register_task_result_error_count = 0
        self.logger = setup_logging(log_file_name=args.log_file_name,
                                    append_mode=True)
        self.conversion_client = TDocConversionClient(
            TDocConversionClient.parse_args([]), self.logger)
        self.args = args
        rounds = TDeclarationRounds(args.round_file)
        self.dlrobot_remote_calls = TRemoteDlrobotCallList(
            logger=self.logger,
            file_name=args.remote_calls_file,
            min_start_time_stamp=rounds.start_time_stamp)
        self.worker_2_running_tasks = defaultdict(list)
        self.worker_2_continuous_failures_count = defaultdict(int)
        offices = TOfficeTableInMemory()
        offices.read_from_local_file(self.args.offices_file)
        self.web_sites_db = TDeclarationWebSiteList(self.logger,
                                                    offices=offices)
        if not os.path.exists(self.args.result_folder):
            os.makedirs(self.args.result_folder)
        self.web_sites_to_process = self.find_projects_to_process()
        self.cloud_id_to_worker_ip = dict()
        self.config = TRobotConfig.read_by_config_type(
            self.args.dlrobot_config_type)
        self.last_remote_call = None  # for testing
        host, port = self.args.server_address.split(":")
        self.logger.debug("start server on {}:{}".format(host, port))
        super().__init__((host, int(port)), TDlrobotRequestHandler)
        self.last_service_action_time_stamp = time.time()
        self.service_action_count = 0
        self.decl_sender = TDeclarationSender(
            self.logger, self.args.enable_smart_parser,
            self.args.enable_source_doc_server)
        self.stop_process = False
        if self.args.enable_ip_checking:
            self.permitted_hosts = set(
                str(x)
                for x in ipaddress.ip_network('192.168.100.0/24').hosts())
            self.permitted_hosts.add('127.0.0.1')
            self.permitted_hosts.add('95.165.96.61')  # disclosures.ru
        self.logger.debug("init complete")
        self.send_to_telegram("start dlrobot central with {} tasks".format(
            len(self.web_sites_to_process)))

    def send_to_telegram(self, message):
        if self.args.enable_telegram:
            self.logger.debug("send to telegram: {}".format(message))
            telegram_send.send(messages=[message])

    def stop_server(self):
        self.server_close()
        self.shutdown()

    def verify_request(self, request, client_address):
        if self.args.enable_ip_checking:
            (ip, dummy) = client_address
            if ip not in self.permitted_hosts:
                return False
        return True

    def log_process_result(self, process_result):
        s = process_result.stdout.strip("\n\r ")
        if len(s) > 0:
            for line in s.split("\n"):
                self.logger.error("task stderr: {}".format(line))
        s = process_result.stderr.strip("\n\r ")
        if len(s) > 0:
            for line in s.split("\n"):
                self.logger.error("task stderr: {}".format(line))

    def have_tasks(self):
        return len(self.web_sites_to_process) > 0 and not self.stop_process

    def project_is_to_process(self, project_file):
        interactions = self.dlrobot_remote_calls.get_interactions(project_file)
        if sum(1 for i in interactions if i.task_was_successful()) > 0:
            return False
        tries_count = self.args.tries_count
        if sum(1 for i in interactions if not i.task_ended()) > 0:
            # if the last result was not obtained, maybe the worker is down,
            # so the problem is in the worker rather than in the task;
            # give this task one more chance
            tries_count += 1
            self.logger.debug("increase max_tries_count for {} to {}".format(
                project_file, tries_count))
        return len(interactions) < tries_count

    def save_dlrobot_remote_call(self, remote_call: TRemoteDlrobotCall):
        self.dlrobot_remote_calls.add_dlrobot_remote_call(remote_call)
        if not remote_call.task_was_successful():
            if self.project_is_to_process(remote_call.project_file):
                self.web_sites_to_process.append(remote_call.web_site)
                self.logger.debug("register retry for {}".format(
                    remote_call.web_site))

    def find_projects_to_process(self):
        web_sites_to_process = list()
        self.logger.info("filter web sites")
        web_site_info: TDeclarationWebSite
        for web_site, web_site_info in self.web_sites_db.web_sites.items():
            if self.args.web_site_regexp is not None:
                if re.match(self.args.web_site_regexp, web_site) is None:
                    continue
            if self.args.office_source_id is not None:
                if web_site_info.get_parent_source_id(
                ) != self.args.office_source_id:
                    continue
            if TWebSiteReachStatus.can_communicate(web_site_info.reach_status):
                project_file = TRemoteDlrobotCall.web_site_to_project_file(
                    web_site)
                if self.project_is_to_process(project_file):
                    web_sites_to_process.append(web_site)

        self.logger.info("there are {} sites in the input queue".format(
            len(web_sites_to_process)))
        web_sites_to_process.sort(
            key=(lambda x: self.dlrobot_remote_calls.last_interaction[x]))

        with open("web_sites_to_process_debug.txt", "w") as out:
            for w in web_sites_to_process:
                out.write(w + "\n")
        return web_sites_to_process

    def get_running_jobs_count(self):
        return sum(len(w) for w in self.worker_2_running_tasks.values())

    def get_processed_jobs_count(self):
        return len(list(self.dlrobot_remote_calls.get_all_calls()))

    def get_new_project_to_process(self, worker_host_name, worker_ip):
        site_url = self.web_sites_to_process.pop(0)
        project_file = TRemoteDlrobotCall.web_site_to_project_file(site_url)
        self.logger.info(
            "start job: {} on {} (host name={}), left jobs: {}, running jobs: {}"
            .format(project_file, worker_ip, worker_host_name,
                    len(self.web_sites_to_process),
                    self.get_running_jobs_count()))
        remote_call = TRemoteDlrobotCall(worker_ip=worker_ip,
                                         project_file=project_file,
                                         web_site=site_url)
        remote_call.worker_host_name = worker_host_name
        web_site_passport = self.web_sites_db.get_web_site(site_url)
        regional_main_pages = list()
        if web_site_passport is None:
            self.logger.error(
                "{} is not registered in the web site db, no office information is available for the site"
                .format(site_url))
        project_content_str = TRobotProject.create_project_str(
            site_url,
            regional_main_pages,
            disable_search_engine=not self.args.enable_search_engines)
        self.worker_2_running_tasks[worker_ip].append(remote_call)
        return remote_call, project_content_str.encode("utf8")

    def untar_file(self, project_file, result_archive):
        base_folder, _ = os.path.splitext(project_file)
        output_folder = os.path.join(self.args.result_folder, base_folder) + \
            ".{}".format(int(time.time()))
        compressed_file = io.BytesIO(result_archive)
        decompressed_file = gzip.GzipFile(fileobj=compressed_file)
        tar = tarfile.open(fileobj=decompressed_file)
        tar.extractall(output_folder)
        return output_folder

    def pop_project_from_running_tasks(self, worker_ip, project_file):
        if worker_ip not in self.worker_2_running_tasks:
            raise Exception(
                "{} is missing in the worker table".format(worker_ip))
        worker_running_tasks = self.worker_2_running_tasks[worker_ip]
        for i in range(len(worker_running_tasks)):
            if worker_running_tasks[i].project_file == project_file:
                return worker_running_tasks.pop(i)
        raise Exception("{} is missing in the worker {} task table".format(
            project_file, worker_ip))

    def worker_is_banned(self, worker_ip, host_name):
        return self.worker_2_continuous_failures_count[(worker_ip, host_name)] > \
                        TDlrobotHTTPServer.max_continuous_failures_count

    def update_worker_info(self, worker_host_name, worker_ip, exit_code):
        key = (worker_ip, worker_host_name)
        if exit_code == 0:
            self.worker_2_continuous_failures_count[key] = 0
        else:
            self.worker_2_continuous_failures_count[key] += 1
            if self.worker_is_banned(worker_ip, worker_host_name):
                self.send_to_telegram(
                    "too many dlrobot errors from ip {}, hostname={}, the host is banned, "
                    "you have to restart dlrobot_central to unban it".format(
                        worker_ip, worker_host_name))

    def register_task_result(self, worker_host_name, worker_ip, project_file,
                             exit_code, result_archive):
        if self.args.skip_worker_check:
            remote_call = TRemoteDlrobotCall(worker_ip, project_file)
        else:
            try:
                remote_call = self.pop_project_from_running_tasks(
                    worker_ip, project_file)
            except:
                if ipaddress.ip_address(worker_ip).is_private:
                    self.logger.debug(
                        "try to get a result {} from a local ip {}, though this task was not dispatched"
                        .format(project_file, worker_ip))
                    remote_call = TRemoteDlrobotCall(worker_ip, project_file)
                else:
                    raise

        self.update_worker_info(worker_host_name, worker_ip, exit_code)

        remote_call.worker_host_name = worker_host_name
        remote_call.exit_code = exit_code
        remote_call.end_time = int(time.time())
        project_folder = self.untar_file(project_file, result_archive)
        remote_call.calc_project_stats(self.logger, self.web_sites_db,
                                       project_folder, self.config)
        if not TWebSiteReachStatus.can_communicate(remote_call.reach_status):
            remote_call.exit_code = -1
        self.decl_sender.send_declaraion_files_to_other_servers(project_folder)
        self.save_dlrobot_remote_call(remote_call)
        self.last_remote_call = remote_call
        self.logger.debug(
            "got exitcode {} for task result {} from worker {} (host_name = {})"
            .format(exit_code, project_file, worker_ip, worker_host_name))

    def forget_old_remote_processes(self, current_time):
        for running_procs in self.worker_2_running_tasks.values():
            for i in range(len(running_procs) - 1, -1, -1):
                remote_call = running_procs[i]
                elapsed_seconds = current_time - remote_call.start_time
                if elapsed_seconds > self.config.get_kill_timeout_in_central():
                    self.logger.debug(
                        "task {} on worker {}(host={}) takes {} seconds, probably it failed, stop waiting for a result"
                        .format(remote_call.web_site, remote_call.worker_ip,
                                remote_call.worker_host_name, elapsed_seconds))
                    running_procs.pop(i)
                    remote_call.exit_code = 126
                    self.save_dlrobot_remote_call(remote_call)

    def forget_remote_processes_for_yandex_worker(self, cloud_id):
        worker_ip = self.cloud_id_to_worker_ip.get(cloud_id)
        if worker_ip is None and len(self.cloud_id_to_worker_ip) > 0:
            self.logger.info(
                "I do not remember ip for cloud_id {}, cannot delete processes"
                .format(cloud_id))
            return

        running_procs = self.worker_2_running_tasks.get(worker_ip, list())
        for i in range(len(running_procs) - 1, -1, -1):
            rc = running_procs[i]
            self.logger.debug(
                "forget task {} on worker {} since the workstation was stopped"
                .format(rc.project_file, rc.worker_ip))
            running_procs.pop(i)
            rc.exit_code = 125
            self.save_dlrobot_remote_call(rc)
        if cloud_id in self.cloud_id_to_worker_ip:
            del self.cloud_id_to_worker_ip[cloud_id]

    def check_yandex_cloud(self):
        if not self.args.check_yandex_cloud:
            return None
        try:
            if not check_internet():
                self.logger.error(
                    "cannot connect to google dns, probably internet is down")
                return None
            for m in TYandexCloud.list_instances():
                cloud_id = m['id']
                if m['status'] == 'STOPPED':
                    self.forget_remote_processes_for_yandex_worker(cloud_id)
                    self.logger.info(
                        "start yandex cloud worker {}".format(cloud_id))
                    TYandexCloud.start_yandex_cloud_worker(cloud_id)
                elif m['status'] == "RUNNING":
                    worker_ip = TYandexCloud.get_worker_ip(m)
                    if self.args.enable_ip_checking:
                        self.permitted_hosts.add(worker_ip)
                    self.cloud_id_to_worker_ip[cloud_id] = worker_ip
        except Exception as exp:
            self.logger.error(exp)

    def check_pdf_conversion_server(self):
        if not self.args.pdf_conversion_server_checking:
            return True
        return not self.conversion_client.server_is_too_busy()

    def service_actions(self):
        current_time = time.time()
        if current_time - self.last_service_action_time_stamp >= self.args.central_heart_rate:
            self.service_action_count += 1
            if self.service_action_count % 10 == 0:
                self.logger.debug('alive')
            self.last_service_action_time_stamp = current_time
            if os.path.exists(self.PITSTOP_FILE):
                self.stop_process = True
                self.logger.debug(
                    "stop sending tasks, exit for a pit stop after all tasks complete"
                )
                os.unlink(self.PITSTOP_FILE)
            if self.stop_process and self.get_running_jobs_count() == 0:
                self.logger.debug("exit via exception")
                raise Exception("exit for pit stop")
            try:
                self.forget_old_remote_processes(current_time)
            except Exception as exp:
                self.logger.error(exp)
            self.check_yandex_cloud()
            if not self.check_pdf_conversion_server():
                self.logger.debug(
                    "stop sending tasks, because conversion pdf queue length is {}"
                    .format(self.conversion_client.
                            last_pdf_conversion_queue_length))

    def get_stats(self):
        workers = dict((k, list(r.write_to_json() for r in v))
                       for (k, v) in self.worker_2_running_tasks.items())
        stats = {
            'running_count': self.get_running_jobs_count(),
            'input_tasks': len(self.web_sites_to_process),
            'processed_tasks': self.get_processed_jobs_count(),
            'worker_2_running_tasks': workers,
            'last_service_action_time_stamp': self.last_service_action_time_stamp,
            'central_heart_rate': self.args.central_heart_rate,
            'register_task_result_error_count': self.register_task_result_error_count
        }
        if self.stop_process:
            stats['stop_process'] = True
        return stats