Esempio n. 1
0
 def select(self):
     """Replace the office table of ``self.web_sites`` with a filtered one.

     A fresh ``TOfficeTableInMemory`` is filled with the ``parent_office``
     of every web site looked up for the entries of ``self.get_url_list()``
     and then stored in ``self.web_sites.offices``.
     """
     selected_offices = TOfficeTableInMemory()
     for domain in self.get_url_list():
         web_site: TDeclarationWebSite = self.web_sites.get_web_site(domain)
         selected_offices.add_office(web_site.parent_office)
     self.web_sites.offices = selected_offices
Esempio n. 2
0
def main():
    """Copy web sites from the task list into the office table and save it.

    Every record of the loaded web site list is converted to a
    ``TDeclarationWebSite`` and appended to the office referenced by its
    ``calculated_office_id``; records whose office is missing are logged
    and skipped. Each office's site list is then sorted and the table is
    written to ``args.output_file``.
    """
    args = parse_args()
    logger = setup_logging("join_office_and_websites")

    offices = TOfficeTableInMemory(use_office_types=False)
    offices.read_from_local_file()

    web_sites_db = TDeclarationWebSiteList(
        logger,
        TDeclarationWebSiteList.default_input_task_list_path).load_from_disk()

    site_record: TDeclarationWebSiteObsolete
    for url, site_record in web_sites_db.web_sites.items():
        office_id = site_record.calculated_office_id
        office: TOfficeInMemory = offices.offices.get(int(office_id))
        if office is None:
            logger.debug(
                "cannot find office_id={}, url={} no valid urls, deleted office?"
                .format(office_id, url))
            continue

        # Records without a stored protocol fall back to plain http.
        protocol = site_record.http_protocol
        if protocol is None:
            protocol = "http"

        web_site = TDeclarationWebSite()
        web_site.url = protocol + "://" + url
        web_site.reach_status = site_record.reach_status
        web_site.comments = site_record.comments
        web_site.redirect_to = site_record.redirect_to
        web_site.title = site_record.title
        office.office_web_sites.append(web_site)

    def reach_rank(web_site):
        # Sites with the "normal" reach status get the larger key,
        # so the ascending sort places them after all the others.
        if web_site.reach_status == TWebSiteReachStatus.normal:
            return 1
        return 0

    for office in offices.offices.values():
        office.office_web_sites.sort(key=reach_rank)

    logger.info("write to {}".format(args.output_file))
    offices.write_to_local_file(args.output_file)
Esempio n. 3
0
def main():
    """Load the office table, recompute rubrics and write the result.

    The table is read from its default local file, ``set_rubrics`` is run
    on it with the script logger, and the table is saved to
    ``args.output_file``.
    """
    args = parse_args()
    logger = setup_logging("set_rubrics")

    office_table = TOfficeTableInMemory(use_office_types=False)
    office_table.read_from_local_file()
    office_table.set_rubrics(logger)

    output_path = args.output_file
    logger.info("write to {}".format(output_path))
    office_table.write_to_local_file(output_path)
Esempio n. 4
0
 def __init__(self):
     """Load regions, per-year statistics, offices and rubric data.

     Statistics are initialised for ``LAST_DECLARATION_YEAR`` only. The
     office table is read from its default local file, and the calculated
     rubric data is loaded from the ``data/office2020`` and
     ``data/office_current`` folders located next to this module.
     """
     self.regions = TRussianRegions()
     self.year_stat = {}
     self.init_one_year_stats(LAST_DECLARATION_YEAR)
     self.sorted_region_list_for_web_interface = self._build_region_list_for_combo_box()

     self.offices_in_memory = TOfficeTableInMemory()
     self.offices_in_memory.read_from_local_file()

     # The FSIN office registered for the "Russia as a whole" region id.
     whole_russia_id = TRussianRegions.Russia_as_s_whole_region_id
     self.federal_fsin = self.offices_in_memory.fsin_by_region[whole_russia_id]
     assert self.federal_fsin is not None

     data_folder = os.path.join(os.path.dirname(__file__), "data")
     self.calc_data_2020 = TOfficeRubricCalculatedData(
         os.path.join(data_folder, "office2020"))
     self.calc_data_current = TOfficeRubricCalculatedData(
         os.path.join(data_folder, "office_current"))
 def handle(self, *args, **options):
     """Recalculate the rubric of every ``Office`` row and save changes.

     An in-memory office table is built from all ``models.Office`` rows.
     For each row the rubric is recomputed with ``build_office_rubric``;
     the row is updated and saved only when a rubric was obtained and it
     differs from the stored one.
     """
     logger = setup_logging(log_file_name="build_rubric.log")
     office_hierarchy = TOfficeTableInMemory(use_office_types=False)
     office_hierarchy.read_from_table(models.Office.objects.all())

     for office in models.Office.objects.all():
         new_rubric_id = office_hierarchy.build_office_rubric(logger, office.id)
         # Nothing to do when no rubric was computed or it is unchanged.
         if new_rubric_id is None or new_rubric_id == office.rubric_id:
             continue
         old_rubric_name = get_russian_rubric_str(office.rubric_id)
         new_rubric_name = get_russian_rubric_str(new_rubric_id)
         logger.debug(
             "set office rubric_id from {} to {} for {}".format(
                 old_rubric_name, new_rubric_name, office.name))
         office.rubric_id = new_rubric_id
         office.save()
Esempio n. 6
0
 def __init__(self, args):
     """Load offices and wikidata records and build the office indices.

     :param args: parsed options; ``wikidata_info`` and
         ``wd_region_head_info`` are read here as input file paths.
     """
     self.args = args
     self.logger = setup_logging("wd_by_url")
     self.regions = TRussianRegions()

     self.offices = TOfficeTableInMemory(use_office_types=False)
     self.offices.read_from_local_file()

     # Both index containers are created before build_office_indices() runs.
     self.disclosures_hostnames = defaultdict(set)
     self.disclosures_office_names = defaultdict(set)
     self.build_office_indices()

     url_records = TWikidataUrlRecords()
     self.wd_urls = url_records
     url_records.read_from_file(args.wikidata_info)

     region_heads = TWikidataRegionHeads()
     self.wd_region_heads = region_heads
     region_heads.read_from_file(args.wd_region_head_info)
Esempio n. 7
0
    def __init__(self):
        """Parse the command line, set up logging and load the web site list.

        When ``args.input_offices`` is given, the office table is read from
        that file and passed to the web site list; otherwise the list is
        created with its own default offices. The HTTP requester is
        initialised with the logger at the end.
        """
        self.args = parse_args()
        self.logger = setup_logging(log_file_name=self.args.logfile)

        offices_path = self.args.input_offices
        if offices_path is None:
            self.web_sites = TDeclarationWebSiteList(self.logger)
        else:
            office_table = TOfficeTableInMemory()
            office_table.read_from_local_file(offices_path)
            self.web_sites = TDeclarationWebSiteList(self.logger,
                                                     offices=office_table)

        self.temp_dlrobot_project: TRobotProject = None
        THttpRequester.initialize(self.logger)
Esempio n. 8
0
 def split(self):
     """Write the offices into at most ``args.split_parts`` chunk files.

     The offices of ``self.web_sites.offices`` are cut into consecutive
     slices; each slice is stored in its own ``TOfficeTableInMemory`` and
     written to ``chunk_offices_<k>.txt`` (``k`` starts at 1).

     The chunk size is the ceiling of ``len(offices) / split_parts`` and
     never less than 1. This keeps the number of files within
     ``split_parts`` and avoids a zero ``range`` step when there are fewer
     offices than parts (or no offices at all).

     :raises ValueError: if ``args.split_parts`` is less than 1.
     """
     parts_count = self.args.split_parts
     if parts_count < 1:
         raise ValueError(
             "split_parts must be positive, got {}".format(parts_count))

     offices = list(self.web_sites.offices.offices.values())
     # Ceiling division without floats: -(-a // b).
     chunk_size = max(1, -(-len(offices) // parts_count))

     written_count = 0
     chunk_starts = range(0, len(offices), chunk_size)
     for chunk_id, start in enumerate(chunk_starts, start=1):
         chunk = TOfficeTableInMemory()
         for office in offices[start:start + chunk_size]:
             chunk.add_office(office)
         chunk.write_to_local_file("chunk_offices_{}.txt".format(chunk_id))
         written_count += len(chunk.offices)
     # Sanity check: every office must end up in exactly one chunk.
     assert written_count == len(offices)
Esempio n. 9
0
def add_offices(apps, schema_editor):
    """Refill the ``declarations.Office`` model from the local office file.

    Existing offices are removed with ``clear_offices`` first; then one
    model row is saved for every office of the in-memory table.

    :param apps: app registry used to resolve the ``Office`` model.
    :param schema_editor: passed through to ``clear_offices``.
    """
    clear_offices(apps, schema_editor)

    office_table = TOfficeTableInMemory(use_office_types=False)
    office_table.read_from_local_file()

    Office = apps.get_model('declarations', 'Office')
    src: TOfficeInMemory
    for src in office_table.offices.values():
        field_values = dict(
            id=src.office_id,
            name=src.name,
            type_id=src.type_id,
            parent_id=src.parent_id,
            region_id=src.region_id,
            rubric_id=src.rubric_id,
        )
        Office(**field_values).save()
Esempio n. 10
0
def create_train_pool(logger, file_name):
    """Build the "train" pool from offices that have at least one web site.

    For every such office a ``TRawRecord`` is created from the office
    rubric, the words of its name and the words of its first web site url.
    Records without a rubric are skipped; the others are dumped as JSON to
    ``file_name`` and added to the returned pool.

    :param logger: logger handed to ``TRawPool``.
    :param file_name: path of the output file (overwritten).
    :return: the filled ``TRawPool``.
    """
    offices = TOfficeTableInMemory()
    offices.read_from_local_file()
    office: TOfficeInMemory
    pool = TRawPool(logger, "train")
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding is pinned instead of depending on the platform locale.
    with open(file_name, "w", encoding="utf8") as outp:
        for office in offices.offices.values():
            if not office.office_web_sites:
                continue
            r = TRawRecord(
                rubric_id=office.rubric_id,
                name_words=split_to_words(office.name),
                web_name_words=split_to_words(
                    office.office_web_sites[0].url),
            )
            if r.rubric_id is None:
                continue
            # NOTE(review): records are written back to back with no
            # separator between them; confirm that the reader of this file
            # expects that before adding a newline here.
            outp.write(json.dumps(r.__dict__, ensure_ascii=False))
            pool.add_record(r)
    return pool
Esempio n. 11
0
def main():
    """Assign regions and wikidata ids to offices using wikidata records.

    ``args.input_file`` holds tab-separated ``office_id``/``name`` lines.
    For each line the region is looked up by the office name first and,
    if that gives no wikidata id, by the office's main web site urls.
    Recognized regions are stored in the office; the office table is
    finally written to ``args.output_file``.
    """
    args = parse_args()
    logger = setup_logging("calc_region_from_wd")
    regions = TRussianRegions()
    offices = TOfficeTableInMemory(use_office_types=False)
    offices.read_from_local_file()
    wd = TWikidataRecords(regions)
    wd.read_from_file(args.wikidata_info)

    web_sites_db = TDeclarationWebSiteList(logger,
                                           TDeclarationWebSiteList.default_input_task_list_path).load_from_disk()
    office_to_urls = web_sites_db.build_office_to_main_website(take_abandoned=True)
    with open(args.input_file) as inp:
        for line in inp:
            office_id, name = line.strip().split("\t")
            office = offices.offices.get(int(office_id))
            if office is None:
                logger.debug(
                    "cannot find office_id={}, name={} no valid urls, deleted office?".format(
                        office_id, name))
                continue

            # `cause` names the lookup that produced `region`; it is set on
            # every iteration so it can never be unbound or left over from
            # a previous line.
            wikidata_id, region = wd.get_region_by_name(name)
            cause = "name"
            if wikidata_id is None:
                urls = office_to_urls.get(int(office_id), [])
                if len(urls) == 0:
                    logger.debug(
                        "office_id={}, name={} no valid urls, delete office?".format(
                            office_id, name))
                    continue
                cause = "url"
                for url in urls:
                    wikidata_id, region = wd.get_region_by_url(name, url)
                    if wikidata_id is not None:
                        break

            if region is None:
                logger.error(
                    "office_id={}, name={} cannot recognize region".format(office_id, name))
            else:
                logger.debug("set region {} to {} {} by {} ".format(region.name, office_id, name, cause))
                office.region_id = region.id
                office.wikidata_id = wikidata_id
    logger.info("write to {}".format(args.output_file))
    offices.write_to_local_file(args.output_file)
Esempio n. 12
0
 def __init__(self, args):
     """Initialise the central server state and the base server class.

     Reads from ``args``: ``log_file_name``, ``round_file``,
     ``remote_calls_file``, ``offices_file``, ``result_folder``,
     ``dlrobot_config_type``, ``server_address`` ("host:port"),
     ``enable_smart_parser``, ``enable_source_doc_server`` and
     ``enable_ip_checking``.

     Side effects visible here: creates ``args.result_folder`` if it does
     not exist, initialises the base class with the parsed address and
     ``TDlrobotRequestHandler``, and sends a start-up message through
     ``send_to_telegram``.
     """
     self.register_task_result_error_count = 0
     self.logger = setup_logging(log_file_name=args.log_file_name,
                                 append_mode=True)
     # The conversion client is configured from an empty argument list.
     self.conversion_client = TDocConversionClient(
         TDocConversionClient.parse_args([]), self.logger)
     self.args = args
     rounds = TDeclarationRounds(args.round_file)
     # The remote call list is given the rounds' start time stamp as its
     # minimal start time stamp.
     self.dlrobot_remote_calls = TRemoteDlrobotCallList(
         logger=self.logger,
         file_name=args.remote_calls_file,
         min_start_time_stamp=rounds.start_time_stamp)
     self.worker_2_running_tasks = defaultdict(list)
     self.worker_2_continuous_failures_count = defaultdict(int)
     offices = TOfficeTableInMemory()
     offices.read_from_local_file(self.args.offices_file)
     self.web_sites_db = TDeclarationWebSiteList(self.logger,
                                                 offices=offices)
     if not os.path.exists(self.args.result_folder):
         os.makedirs(self.args.result_folder)
     # Called after the attributes above are set.
     # NOTE(review): presumably find_projects_to_process() reads some of
     # them (web sites, remote calls, result folder) - confirm before
     # reordering.
     self.web_sites_to_process = self.find_projects_to_process()
     self.cloud_id_to_worker_ip = dict()
     self.config = TRobotConfig.read_by_config_type(
         self.args.dlrobot_config_type)
     self.last_remote_call = None  # for testing
     # server_address must have the form "host:port".
     host, port = self.args.server_address.split(":")
     self.logger.debug("start server on {}:{}".format(host, port))
     super().__init__((host, int(port)), TDlrobotRequestHandler)
     self.last_service_action_time_stamp = time.time()
     self.service_action_count = 0
     self.decl_sender = TDeclarationSender(
         self.logger, self.args.enable_smart_parser,
         self.args.enable_source_doc_server)
     self.stop_process = False
     # permitted_hosts exists only when ip checking is enabled: all host
     # addresses of 192.168.100.0/24 plus the two literal addresses below.
     if self.args.enable_ip_checking:
         self.permitted_hosts = set(
             str(x)
             for x in ipaddress.ip_network('192.168.100.0/24').hosts())
         self.permitted_hosts.add('127.0.0.1')
         self.permitted_hosts.add('95.165.96.61')  # disclosures.ru
     self.logger.debug("init complete")
     self.send_to_telegram("start dlrobot central with {} tasks".format(
         len(self.web_sites_to_process)))
Esempio n. 13
0
    def __init__(self, logger, offices=None):
        """Index the web sites of all offices by their site url.

        :param logger: logger used for error reports.
        :param offices: office table to index; when ``None`` a new
            ``TOfficeTableInMemory`` is read from its default local file.
        :raises Exception: if a site url occurs more than once in the
            office table. The message names the last duplicate found and
            the number of further duplicates.
        """
        self.web_sites = dict()
        self.web_sites_to_office = dict()
        self.web_domains_redirects = None
        self.web_domain_to_web_site = defaultdict(list)
        self.logger = logger
        if offices is None:
            self.offices = TOfficeTableInMemory()
            self.offices.read_from_local_file()
        else:
            self.offices = offices
        o: TOfficeInMemory
        duplicate_cnt = 0
        exception_msg = None
        for o in self.offices.offices.values():
            u: TDeclarationWebSite
            for u in o.office_web_sites:
                site_url = get_site_url(u.url)
                if site_url in self.web_sites:
                    # Remember the duplicate but keep scanning, so that
                    # all duplicates are counted before raising.
                    exception_msg = "url {} occurs in office db more than one time".format(
                        site_url)
                    duplicate_cnt += 1
                self.web_sites[site_url] = u
                self.web_sites_to_office[site_url] = o
                if u.can_communicate() and u.title is None:
                    self.logger.error(
                        "url={} has no title, ML model predict office needs titles to work properly"
                        .format(u.url))
        if duplicate_cnt > 0:
            # One duplicate is already named in exception_msg, so only
            # the remaining ones are reported as "other".
            raise Exception(exception_msg +
                            " and {} other equal urls".format(duplicate_cnt - 1))

        self.build_web_domains_redirects()
        self.web_domain_to_web_site.clear()
        for site_url in self.web_sites:
            web_domain = TDeclarationWebSiteList.site_url_to_web_domain(site_url)
            self.web_domain_to_web_site[web_domain].append(
                get_site_url(site_url))
Esempio n. 14
0
def main():
    """Set office regions and addresses from geocoder output.

    ``args.input_file`` holds tab-separated lines of ``office_id``,
    ``name`` and a JSON object whose ``address`` key is used. The region
    is computed from the address; when it is recognized and the office
    exists, the office's ``region_id`` and ``address`` are updated. The
    office table is finally written to ``args.output_file``.
    """
    args = parse_args()
    logger = setup_logging("calc_region")
    regions = TRussianRegions()
    offices = TOfficeTableInMemory(use_office_types=False)
    offices.read_from_local_file()

    with open(args.input_file) as inp:
        for line in inp:
            office_id, name, yandex_info = line.strip().split("\t")
            address = json.loads(yandex_info).get('address', '')
            region_id = regions.calc_region_by_address(address)
            if region_id is None:
                logger.error("cannot recognize region for {}".format(address))
                continue

            office = offices.offices.get(int(office_id))
            if office is None:
                # The input may mention offices that are no longer in the
                # table; skip them instead of failing on None.
                logger.debug(
                    "cannot find office_id={}, name={}, deleted office?".format(
                        office_id, name))
                continue

            logger.debug(
                "office_id={}, change region_id={} to region_id={}".format(
                    office_id, office.region_id, region_id))
            office.region_id = region_id
            office.address = address
    logger.info("write to {}".format(args.output_file))
    offices.write_to_local_file(args.output_file)