def test_import_second_passport(self):
    self.assertGreater(models.Office.objects.count(), 0)
    models.Section.objects.all().delete()
    models.Source_Document.objects.all().delete()

    permalinks_folder = os.path.dirname(__file__)
    TPermalinksManager(setup_logging(), {'directory': permalinks_folder}).create_empty_dbs()

    domains_folder = os.path.join(os.path.dirname(__file__), "domains_1")
    sp_workdir = os.path.join(os.path.dirname(__file__), "smart_parser_server")
    importer = ImportJsonCommand(None, None)
    os.chdir(os.path.dirname(__file__))
    with SmartParserServerForTesting(sp_workdir, domains_folder):
        importer.handle(None,
                        dlrobot_human="dlrobot_human_1.json",
                        smart_parser_human_json="human_jsons",
                        permalinks_folder=permalinks_folder)
    self.assertEqual(models.Section.objects.count(), 1)
    self.assertEqual(models.RealEstate.objects.count(), 1)
    section_id1 = list(models.Section.objects.all())[0].id

    # one more time, but now we have two vehicles for the same person (same document),
    # as though smart_parser is more intelligent
    TPermalinksManager(setup_logging(), {'directory': permalinks_folder}).create_permalinks()

    # clear the db
    models.Vehicle.objects.all().delete()
    models.RealEstate.objects.all().delete()
    models.Income.objects.all().delete()
    models.Section.objects.all().delete()
    models.Source_Document.objects.all().delete()

    domains_folder = os.path.join(os.path.dirname(__file__), "domains_2")
    sp_workdir = os.path.join(os.path.dirname(__file__), "smart_parser_server")
    importer = ImportJsonCommand(None, None)
    os.chdir(os.path.dirname(__file__))
    with SmartParserServerForTesting(sp_workdir, domains_folder):
        importer.handle(None,
                        dlrobot_human="dlrobot_human_2.json",
                        smart_parser_human_json="human_jsons",
                        permalinks_folder=permalinks_folder)
    self.assertEqual(models.Section.objects.count(), 1)
    self.assertEqual(models.RealEstate.objects.count(), 1)
    section_id2 = list(models.Section.objects.all())[0].id

    # "Иванов И.В." == "Иванов И. В."
    self.assertEqual(section_id1, section_id2)
def test(self):
    logger = setup_logging(logger_name="test_real_dedupe")
    models.Section.objects.all().delete()
    permalinks_folder = os.path.dirname(__file__)

    db = TPermaLinksPerson(permalinks_folder)
    db.open_db_read_only()
    db.recreate_auto_increment_table()
    db.close_db()

    model_path = os.path.join(os.path.dirname(__file__),
                              "../../../deduplicate/model/random_forest.pickle")
    dedupe_objects = os.path.join(os.path.dirname(__file__), "dedupe_objects.dump")
    run_dedupe = RunDedupe(None, None)
    run_dedupe.handle(None,
                      permalinks_folder=permalinks_folder,
                      input_dedupe_objects=dedupe_objects,
                      model_file=model_path,
                      threshold=0.6,
                      recreate_db=True,
                      surname_bounds=',',
                      write_to_db=True)

    sec = models.Section.objects.get(id=757036)
    self.assertEqual(1406125, sec.person_id)
def main():
    args = parse_args()
    logger = setup_logging("manage_pool")
    pool = TOfficePool(logger)
    pool.read_cases(args.input_pool)

    case: TPredictionCase
    cnt = 0
    toloka_pool = list()
    automatic_pool = list()
    parser = TOfficeFromTitle(logger)
    for case in pool.pool:
        cnt += 1
        w: TTitleParseResult
        w = parser.parse_title(case)
        if w is None:
            logger.debug("cannot parse {}".format(case.sha256))
        else:
            #print("{}".format(json.dumps(parser.to_json(), indent=4, ensure_ascii=False)))
            #print(parser.org_name)
            if w.weight > 0.5:
                automatic_pool.append(case)
                case.true_office_id = w.office.office_id
            else:
                toloka_pool.append(case)
                logger.debug("{}\t{}\t{}\t=>{}:{}".format(
                    w.office.office_id, w.office.name, w.org_name, w.weight,
                    ",".join(w.common_words)))

    TOfficePool.write_pool(toloka_pool, args.output_toloka_file)
    TOfficePool.write_pool(automatic_pool, args.output_automatic_file)
def __init__(self, *args, **kwargs):
    super(Command, self).__init__(*args, **kwargs)
    self.logger = setup_logging(log_file_name="create_misspell_db.log")
    self.rml_path = None
    self.converter1 = None
    self.converter2 = None
    self.output_folder = None
def setUp(self):
    self.env = TestDlrobotEnv("data.ssl")
    TDownloadEnv.clear_cache_folder()
    THttpRequester.ENABLE = False
    logger = setup_logging(log_file_name="dlrobot.log")
    THttpRequester.initialize(logger)
def main():
    args = parse_args()
    logger = setup_logging("join_office_and_websites")
    offices = TOfficeTableInMemory(use_office_types=False)
    offices.read_from_local_file()
    web_sites_db = TDeclarationWebSiteList(
        logger,
        TDeclarationWebSiteList.default_input_task_list_path).load_from_disk()

    url_info: TDeclarationWebSiteObsolete
    for url, url_info in web_sites_db.web_sites.items():
        office_id = url_info.calculated_office_id
        office: TOfficeInMemory
        office = offices.offices.get(int(office_id))
        if office is None:
            logger.debug("cannot find office_id={} for url={}; no valid urls (deleted office?)".format(
                office_id, url))
            continue
        p = url_info.http_protocol if url_info.http_protocol is not None else "http"
        i = TDeclarationWebSite()
        i.url = p + "://" + url
        i.reach_status = url_info.reach_status
        i.comments = url_info.comments
        i.redirect_to = url_info.redirect_to
        i.title = url_info.title
        office.office_web_sites.append(i)

    # put reachable web sites last so they win when the list is consumed in order
    for o in offices.offices.values():
        o.office_web_sites.sort(key=lambda x: 1 if x.reach_status == TWebSiteReachStatus.normal else 0)

    logger.info("write to {}".format(args.output_file))
    offices.write_to_local_file(args.output_file)
def test_fsin_2_import(self):
    self.assertGreater(models.Office.objects.count(), 0)
    models.Income.objects.all().delete()
    models.RealEstate.objects.all().delete()
    models.Vehicle.objects.all().delete()
    models.Section.objects.all().delete()
    models.Source_Document.objects.all().delete()

    permalinks_folder = os.path.dirname(__file__)
    logger = setup_logging(log_file_name="test_fsin_import.log")
    TPermalinksManager(logger, {'directory': permalinks_folder}).create_empty_dbs()

    doc_folder = os.path.join(os.path.dirname(__file__), "domains")
    sp_workdir = os.path.join(os.path.dirname(__file__), "smart_parser_server")
    importer = ImportJsonCommand(None, None)
    os.chdir(os.path.dirname(__file__))
    with SmartParserServerForTesting(sp_workdir, doc_folder):
        importer.handle(None,
                        process_count=2,
                        dlrobot_human="dlrobot_human.json",
                        permalinks_folder=permalinks_folder)
    self.assertEqual(1, models.Section.objects.count())
def __init__(self, args):
    self.args = args
    self.working = True
    self.thread_pool = ThreadPoolExecutor(max_workers=self.args.worker_count)
    self.setup_working_folder()
    self.logger = setup_logging(log_file_name=self.args.log_file_name, append_mode=True)
    self.setup_environment()
def handle(self, *args, **options):
    TImporter.logger = setup_logging(log_file_name="import_json.log")
    importer = TImporter(options)
    self.stdout.write("start importing")
    if options.get('office_id') is not None:
        # import a single office in the current process
        importer.init_non_pickable()
        importer.import_office(options.get('office_id'))
    elif options.get('process_count', 0) > 1:
        # distribute offices into buckets and import them in a process pool
        importer.delete_before_fork()
        process_count = options.get('process_count')
        importer.distribute_offices_to_processes(process_count)
        pool = Pool(processes=process_count)
        pool.map(importer.process_one_office_bucket_in_subprocess, importer.office_buckets.keys())
        importer.init_after_fork()
    else:
        # sequential import of all offices (optionally only the first N)
        importer.init_non_pickable()
        cnt = 0
        for office_id in importer.office_to_source_documents.keys():
            if options.get('take_first_n_offices') is not None and cnt >= options.get('take_first_n_offices'):
                break
            importer.import_office(office_id)
            cnt += 1

    TImporter.logger.info("Section count={}".format(models.Section.objects.all().count()))
    TImporter.logger.info("all done")
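# Hedged usage sketch: the management command above can also be driven through Django's
# call_command. The command name "import_json" is inferred from its log file name, and the
# option names are taken from the handle() body and the tests; both are assumptions.
from django.core.management import call_command

# import a single office in the current process
call_command("import_json", office_id=123, dlrobot_human="dlrobot_human.json")

# fan the import out across several worker processes
call_command("import_json", process_count=4, dlrobot_human="dlrobot_human.json")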
def handle(self, *args, **options):
    logger = setup_logging("fix_fsin")
    for s in models.Section.objects.filter(rubric_id=10):
        self.filter_set(logger, s.id, s.income_set.all().order_by('id'), models.Income)
        self.filter_set(logger, s.id, s.vehicle_set.all().order_by('id'), models.Vehicle)
        self.filter_set(logger, s.id, s.realestate_set.all().order_by('id'), models.RealEstate)
def __init__(self, args):
    self.logger = setup_logging(log_file_name="access_log_reader.log")
    self.args = args
    self.start_access_log_date = self.args.start_access_log_date
    self.last_access_log_date = self.args.last_access_log_date
    self.access_log_folder = self.args.access_log_folder
    self.min_request_freq = self.args.min_request_freq
def test_complex_import(self):
    self.assertGreater(models.Office.objects.count(), 0)
    models.Income.objects.all().delete()
    models.RealEstate.objects.all().delete()
    models.Vehicle.objects.all().delete()
    models.Section.objects.all().delete()
    models.Source_Document.objects.all().delete()

    permalinks_folder = os.path.dirname(__file__)
    logger = setup_logging(log_file_name="test_complex_import.log")
    TPermalinksManager(logger, {'directory': permalinks_folder}).create_empty_dbs()

    doc_folder = os.path.join(os.path.dirname(__file__), "domains")
    sp_workdir = os.path.join(os.path.dirname(__file__), "smart_parser_server")
    importer = ImportJsonCommand(None, None)
    os.chdir(os.path.dirname(__file__))
    with SmartParserServerForTesting(sp_workdir, doc_folder):
        importer.handle(None,
                        dlrobot_human="dlrobot_human.json",
                        smart_parser_human_json="human_jsons",
                        permalinks_folder=permalinks_folder)
    self.assertEqual(models.Section.objects.count(), 3)
    old_sections = [(s.id, s.person_name) for s in models.Section.objects.all()]
    self.assertEqual(models.RealEstate.objects.count(), 3)
    self.assertEqual(models.Income.objects.count(), 3)
    self.assertGreater(models.Office.objects.count(), 0)
    old_docs = [(d.id, d.sha256) for d in models.Source_Document.objects.all()]

    # import the same sections and check that we reuse old section ids and source doc ids
    CreatePermalinksStorageCommand(None, None).handle(None, directory=permalinks_folder)
    permalinks_db = TPermalinksManager(logger, {'directory': permalinks_folder})
    permalinks_db.create_sql_sequences()
    models.Section.objects.all().delete()
    models.Source_Document.objects.all().delete()
    with SmartParserServerForTesting(sp_workdir, doc_folder):
        importer.handle(None,
                        dlrobot_human="dlrobot_human.json",
                        smart_parser_human_json="human_jsons",
                        permalinks_folder=permalinks_folder)
    new_docs = [(d.id, d.sha256) for d in models.Source_Document.objects.all()]
    self.assertListEqual(old_docs, new_docs)
    new_sections = [(s.id, s.person_name) for s in models.Section.objects.all()]
    self.assertListEqual(old_sections, new_sections)
def __init__(self):
    self.args = parse_args()
    self.logger = setup_logging("check_disclosures_heath")
    self.last_messages = dict()
    self.checks = list()
    with open(self.args.config) as inp:
        for c in json.load(inp):
            self.checks.append(TCheckState.read_from_json(self, c))
def main():
    args = parse_args()
    logger = setup_logging(log_file_name="predict_office_baseline.log")
    model = TPredictionModelBase(logger,
                                 args.bigrams_path,
                                 args.model_folder,
                                 test_pool=args.test_pool)
    test_baseline(model)
def test_unicode(self):
    try:
        THttpRequester.initialize(setup_logging())
        s = THttpRequester.make_http_request(
            "http://5%20июня%20запретят%20розничную%20продажу%20алкоголя", "GET")
    except THttpRequester.RobotHttpException as exp:
        # no UnicodeException for this url
        pass
def __init__(self, *args, **kwargs):
    super(Command, self).__init__(*args, **kwargs)
    self.logger = setup_logging(log_file_name="update_person_redirects.log")
    self.options = None
    self.old_person_to_sections = defaultdict(list)
    self.redirects = dict()
    self.new_section_to_person = dict()
def main():
    args = parse_args()
    logger = setup_logging("set_rubrics")
    offices = TOfficeTableInMemory(use_office_types=False)
    offices.read_from_local_file()
    offices.set_rubrics(logger)
    logger.info("write to {}".format(args.output_file))
    offices.write_to_local_file(args.output_file)
def handle(self, *args, **options):
    logger = setup_logging(log_file_name="normalize_fio.log")
    for section in models.Section.objects.all():
        person_name = normalize_fio_before_db_insert(section.person_name)
        if person_name != section.person_name:
            logger.debug("normalize {} -> {}".format(section.person_name, person_name))
            section.person_name = person_name
            section.save()
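# Illustrative sketch only: this is NOT the project's normalize_fio_before_db_insert, just an
# assumption-level example of the kind of normalization that lets "Иванов И. В." and
# "Иванов И.В." compare equal (see the second-passport test above).
import re

def normalize_initials_example(person_name: str) -> str:
    # drop the space that sometimes follows a dotted initial: "И. В." -> "И.В."
    return re.sub(r'\.\s+(?=[А-ЯЁA-Z]\.)', '.', person_name).strip()

assert normalize_initials_example("Иванов И. В.") == "Иванов И.В."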
def __init__(self, *args, **kwargs):
    super(Command, self).__init__(*args, **kwargs)
    self.test_objects = None
    self.test_data = None
    self.options = None
    self.logger = setup_logging(log_file_name="test_ml_pool.log")
    self.ml_model = None
    self.X = None
    self.y_true = None
def setUp(self):
    self.server_address = '127.0.0.1:{}'.format(self.web_site_port)
    self.web_server = TestHTTPServer(self.web_site_port)
    threading.Thread(target=start_server, args=(self.web_server,)).start()
    time.sleep(1)
    self.env = TestDlrobotEnv("data.timeout")
    TDownloadEnv.clear_cache_folder()
    self.logger = setup_logging(log_file_name="dlrobot.log")
    THttpRequester.initialize(self.logger)
def main():
    logger = setup_logging(log_file_name="predict_office_test.log")
    args = parse_args()
    model = TTensorFlowOfficeModel(logger,
                                   args.bigrams_path,
                                   args.model_folder,
                                   create_model=False,
                                   work_pool_path=args.test_pool)
    model.test_model(thresholds=args.threshold)
def main():
    args = parse_args()
    logger = setup_logging(log_file_name="predict_office.log")
    model = TPredictionModel(logger,
                             args.bigrams_path,
                             model_path=args.model_path,
                             row_count=args.row_count,
                             train_pool=args.train_pool,
                             test_pool=args.test_pool)
    if args.action == "train":
        model.train_catboost()
def setUp(self, website_folder):
    self.env = TestDlrobotEnv("data.{}".format(os.path.basename(website_folder)))
    shutil.copy2(
        os.path.join(os.path.dirname(__file__), website_folder, "project.txt"),
        self.env.data_folder)
    THttpRequester.ENABLE = False
    self.logger = setup_logging(log_file_name="dlrobot.log")
    THttpRequester.initialize(self.logger)
def main():
    logger = setup_logging(log_file_name="predict_office_toloka.log")
    args = parse_args()
    model = TTensorFlowOfficeModel(logger,
                                   args.bigrams_path,
                                   args.model_folder,
                                   create_model=False,
                                   work_pool_path=args.test_pool)
    model.toloka(args.toloka_pool, format=args.format)
def test(self):
    logger = setup_logging(logger_name="test_real_dedupe")
    sql_script = os.path.join(os.path.dirname(__file__), "disclosures.sql.person_id_5295.n")
    run_sql_script(logger, sql_script)

    permalinks_folder = os.path.dirname(__file__)
    db = TPermaLinksPerson(permalinks_folder)
    db.create_db()
    db.save_dataset(setup_logging())
    db.recreate_auto_increment_table()
    db.close_db()

    model_path = os.path.join(os.path.dirname(__file__),
                              "../../../deduplicate/model/random_forest.pickle")
    run_dedupe = RunDedupe(None, None)
    run_dedupe.handle(None,
                      permalinks_folder=permalinks_folder,
                      write_to_db=True,
                      surname_bounds=',',
                      model_file=model_path,
                      threshold=0.6)

    person_id = 5295
    self.assertEqual(models.Person.objects.count(), 3)
    person = models.Person.objects.get(id=person_id)
    self.assertIsNotNone(person)
    self.assertEqual(5295, person.declarator_person_id)

    canon_sections = [
        (451721, 5295, True),
        (452066, 5295, True),
        (452420, 5295, True),
        (453686, 5295, False),
        (455039, 5295, False),
        (1801614, 5296, True),
        (5105303, 5295, True),
        (6437989, 5297, True),
        (6672563, 5297, True),
        (6674154, 5297, True),
        (6773981, 5297, True),
    ]
    sections = []
    for s in models.Section.objects.all():
        sections.append((s.id, s.person_id, s.dedupe_score is not None))
    self.assertListEqual(canon_sections, sections)
def __init__(self, args):
    self.args = args
    self.logger = setup_logging(log_file_name="join_human_and_dlrobot.log", append_mode=True)
    self.output_dlrobot_human = TDlrobotHumanFileDBM(args.output_json)
    self.output_dlrobot_human.create_db()
    self.old_files_with_office_count = 0
    self.web_sites_db = TDeclarationWebSiteList(self.logger)
    self.offices = self.web_sites_db.offices
    self.dlrobot_config = TRobotConfig.read_by_config_type("prod")
def main():
    args = parse_args()
    logger = setup_logging("create_sample")
    dlrobot_human = TDlrobotHumanFileDBM(args.input_file)
    dlrobot_human.open_db_read_only()
    source_doc_client = TSourceDocClient(TSourceDocClient.parse_args([]))
    smart_parser_client = TSmartParserCacheClient(TSmartParserCacheClient.parse_args([]))

    logger.info("create population")
    tmp_folder = '/tmp/create_sample_sp'
    if os.path.exists(tmp_folder):
        shutil.rmtree(tmp_folder)
    logger.info("create directory {}".format(tmp_folder))
    os.mkdir(tmp_folder)
    population = list(dlrobot_human.get_all_keys())
    random.shuffle(population)

    logger.info("fetch files")
    found = set()
    for sha256 in population:
        logger.debug("get doc {}".format(sha256))
        file_data, file_extension = source_doc_client.retrieve_file_data_by_sha256(sha256)
        if file_data is None:
            logger.error("cannot get data for {}".format(sha256))
            continue
        if args.income_year is not None:
            # keep only documents whose declared income year matches the requested one
            smart_parser_json = smart_parser_client.retrieve_json_by_sha256(sha256)
            if smart_parser_json is None or len(smart_parser_json) == 0:
                logger.error("empty or invalid smart parser json for {}".format(sha256))
                continue
            src_doc = dlrobot_human.get_document(sha256)
            year = src_doc.calc_document_income_year(smart_parser_json)
            if year != args.income_year:
                logger.error("different year ({} != {})".format(year, args.income_year))
                continue
        found.add(sha256)
        file_path = os.path.join(tmp_folder, "{}{}".format(len(found) + 1, file_extension))
        with open(file_path, "wb") as outp:
            outp.write(file_data)
        if len(found) >= args.sample_size:
            break

    logger.info("found {} files".format(len(found)))

    # pack the sampled files into a tar archive
    output_file = os.path.abspath(args.output_file)
    cmd = "tar -C {} --create --file {} {}".format(
        os.path.dirname(tmp_folder), output_file, os.path.basename(tmp_folder))
    logger.info(cmd)
    os.system(cmd)
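# Alternative sketch (an assumption, not the script's actual code): the final
# `tar -C <parent> --create --file <output> <basename>` call could also be done with
# Python's standard tarfile module, avoiding the shell entirely.
import os
import tarfile

def make_sample_archive(tmp_folder: str, output_file: str) -> None:
    # store the sample folder under its basename, mirroring the `tar -C` behaviour above
    with tarfile.open(output_file, "w") as tar:
        tar.add(tmp_folder, arcname=os.path.basename(tmp_folder))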
def __init__(self, *args, **kwargs):
    super(Command, self).__init__(*args, **kwargs)
    self.logger = setup_logging(log_file_name="name_report.log")
    self.regions = dict()
    for r in models.Region.objects.all():
        self.regions[r.id] = r.name
    self.names_masc = set()
    self.names_fem = set()
    self.surnames_masc = set()
    self.surnames_fem = set()
    self.gender_recognizer = TGenderRecognizer()
def setup_project(self, morda_url):
    logger = setup_logging('prohibited')
    self.project = TRobotProject(logger, '',
                                 config=TRobotConfig(),
                                 export_folder="result",
                                 enable_search_engine=False)
    web_site = self.project.add_web_site(morda_url)
    self.robot_step = TRobotStep(web_site)
    self.env = TestDlrobotEnv("data.prohibited")
    TDownloadEnv.FILE_CACHE_FOLDER = self.env.data_folder
def main():
    args = parse_args()
    logger = setup_logging("send_docs")
    decl_sender = TDeclarationSender(logger, True, True)
    for d in args.folders:
        logger.info("folder = {}".format(d))
        result_folder = os.path.join(d, "result")
        if not os.path.exists(result_folder):
            logger.error("no directory {} found".format(result_folder))
        else:
            decl_sender.send_declaraion_files_to_other_servers(d)