コード例 #1
0
    def test_import_second_passport(self):
        """Import two dlrobot_human files in turn and check the section id is reused.

        The first import uses ``domains_1`` / ``dlrobot_human_1.json``; permalinks
        are then created, the tables are cleared and ``domains_2`` /
        ``dlrobot_human_2.json`` is imported. Both runs must yield exactly one
        section and one real estate record, and both sections must share one id.
        """
        self.assertGreater(models.Office.objects.count(), 0)
        models.Section.objects.all().delete()
        models.Source_Document.objects.all().delete()

        test_dir = os.path.dirname(__file__)
        permalinks_folder = test_dir
        TPermalinksManager(setup_logging(), {'directory': permalinks_folder}).create_empty_dbs()

        def run_import(domains_subdir, dlrobot_human_file):
            # One full import pass against a test smart parser server.
            domains_folder = os.path.join(test_dir, domains_subdir)
            sp_workdir = os.path.join(test_dir, "smart_parser_server")
            importer = ImportJsonCommand(None, None)
            os.chdir(test_dir)
            with SmartParserServerForTesting(sp_workdir, domains_folder):
                importer.handle(None,
                                dlrobot_human=dlrobot_human_file,
                                smart_parser_human_json="human_jsons",
                                permalinks_folder=permalinks_folder)

        def check_single_section():
            # Exactly one section and one real estate record; return the section id.
            self.assertEqual(models.Section.objects.count(), 1)
            self.assertEqual(models.RealEstate.objects.count(), 1)
            return list(models.Section.objects.all())[0].id

        run_import("domains_1", "dlrobot_human_1.json")
        section_id1 = check_single_section()

        # one more time, but now we have two vehicles for the same person (same document), as though smart_parser
        # is more intelligent
        TPermalinksManager(setup_logging(), {'directory': permalinks_folder}).create_permalinks()

        # clear the db
        for model in (models.Vehicle, models.RealEstate, models.Income,
                      models.Section, models.Source_Document):
            model.objects.all().delete()

        run_import("domains_2", "dlrobot_human_2.json")
        section_id2 = check_single_section()

        # "Иванов И.В." and "Иванов И. В." must end up with the same section id
        self.assertEqual(section_id1, section_id2)
コード例 #2
0
    def test(self):
        """Run dedupe on a dumped object set and check one section's person id.

        Sections are deleted, the auto-increment table of the person permalinks
        db is recreated, then RunDedupe is run with ``recreate_db`` and
        ``write_to_db`` enabled. Section 757036 must end up with person 1406125.
        """
        # The returned logger is not used here; the call itself is kept.
        setup_logging(logger_name="test_real_dedupe")
        models.Section.objects.all().delete()

        test_dir = os.path.dirname(__file__)
        permalinks_folder = test_dir

        permalinks_db = TPermaLinksPerson(permalinks_folder)
        permalinks_db.open_db_read_only()
        permalinks_db.recreate_auto_increment_table()
        permalinks_db.close_db()

        dedupe_options = {
            "permalinks_folder": permalinks_folder,
            "input_dedupe_objects": os.path.join(test_dir, "dedupe_objects.dump"),
            "model_file": os.path.join(
                test_dir, "../../../deduplicate/model/random_forest.pickle"),
            "threshold": 0.6,
            "recreate_db": True,
            "surname_bounds": ',',
            "write_to_db": True,
        }
        RunDedupe(None, None).handle(None, **dedupe_options)

        section = models.Section.objects.get(id=757036)
        self.assertEqual(1406125, section.person_id)
コード例 #3
0
def main():
    """Split the input pool into an automatic pool and a Toloka pool.

    Every case is passed to TOfficeFromTitle.parse_title. Cases that cannot be
    parsed are logged and dropped. Parsed cases with weight above 0.5 receive
    the parsed office id as ``true_office_id`` and go to the automatic pool;
    the remaining parsed cases go to the Toloka pool. Both pools are written
    to the files given in the command-line arguments.
    """
    args = parse_args()
    logger = setup_logging("manage_pool")
    pool = TOfficePool(logger)
    pool.read_cases(args.input_pool)
    toloka_pool = []
    automatic_pool = []
    parser = TOfficeFromTitle(logger)

    case: TPredictionCase
    for case in pool.pool:
        w: TTitleParseResult
        w = parser.parse_title(case)
        if w is None:
            logger.debug("cannot parse {}".format(case.sha256))
            continue

        if w.weight > 0.5:
            automatic_pool.append(case)
            case.true_office_id = w.office.office_id
        else:
            toloka_pool.append(case)
        logger.debug("{}\t{}\t{}\t=>{}:{}".format(
            w.office.office_id, w.office.name, w.org_name, w.weight,
            ",".join(w.common_words)))

    TOfficePool.write_pool(toloka_pool, args.output_toloka_file)
    TOfficePool.write_pool(automatic_pool, args.output_automatic_file)
コード例 #4
0
 def __init__(self, *args, **kwargs):
     """Initialise the base command, create the file logger and unset the settings."""
     super(Command, self).__init__(*args, **kwargs)
     self.logger = setup_logging(log_file_name="create_misspell_db.log")
     # All remaining attributes start as None; this constructor assigns no real values.
     self.rml_path = self.converter1 = self.converter2 = self.output_folder = None
コード例 #5
0
ファイル: test_ssl.py プロジェクト: TI-Russia/smart_parser
    def setUp(self):
        """Create the "data.ssl" test environment and initialise THttpRequester."""
        self.env = TestDlrobotEnv("data.ssl")
        TDownloadEnv.clear_cache_folder()

        # ENABLE is set to False before initialize() is called.
        THttpRequester.ENABLE = False
        THttpRequester.initialize(setup_logging(log_file_name="dlrobot.log"))
コード例 #6
0
def main():
    """Attach the web sites from the declaration web site list to their offices.

    For every url in the web site list, the office with the url's
    ``calculated_office_id`` is looked up. Urls whose office is missing are
    logged and skipped. For the others a TDeclarationWebSite is filled from
    the stored info and appended to the office. Finally each office's sites
    are sorted and the office table is written to ``args.output_file``.
    """
    args = parse_args()
    logger = setup_logging("join_office_and_websites")
    offices = TOfficeTableInMemory(use_office_types=False)
    offices.read_from_local_file()

    site_list = TDeclarationWebSiteList(
        logger, TDeclarationWebSiteList.default_input_task_list_path)
    web_sites_db = site_list.load_from_disk()

    def reach_rank(site):
        # 1 for sites with the "normal" reach status, 0 for all others.
        if site.reach_status == TWebSiteReachStatus.normal:
            return 1
        return 0

    site_info: TDeclarationWebSiteObsolete
    for url, site_info in web_sites_db.web_sites.items():
        office_id = site_info.calculated_office_id
        office: TOfficeInMemory
        office = offices.offices.get(int(office_id))
        if office is None:
            logger.debug(
                "cannot find office_id={}, url={} no valid urls, deleted office?"
                .format(office_id, url))
            continue

        # Fall back to plain http when no protocol was stored for the site.
        if site_info.http_protocol is not None:
            protocol = site_info.http_protocol
        else:
            protocol = "http"

        web_site = TDeclarationWebSite()
        web_site.url = protocol + "://" + url
        web_site.reach_status = site_info.reach_status
        web_site.comments = site_info.comments
        web_site.redirect_to = site_info.redirect_to
        web_site.title = site_info.title
        office.office_web_sites.append(web_site)

    # Ascending sort: sites with the "normal" status are placed after the others.
    # NOTE(review): confirm this is the intended order.
    for office in offices.offices.values():
        office.office_web_sites.sort(key=reach_rank)

    logger.info("write to {}".format(args.output_file))
    offices.write_to_local_file(args.output_file)
コード例 #7
0
    def test_fsin_2_import(self):
        """Import dlrobot_human.json with process_count=2 and expect one section."""
        self.assertGreater(models.Office.objects.count(), 0)
        for model in (models.Income, models.RealEstate, models.Vehicle,
                      models.Section, models.Source_Document):
            model.objects.all().delete()

        test_dir = os.path.dirname(__file__)
        permalinks_folder = test_dir
        logger = setup_logging(log_file_name="test_fsin_import.log")
        permalinks = TPermalinksManager(logger, {'directory': permalinks_folder})
        permalinks.create_empty_dbs()

        doc_folder = os.path.join(test_dir, "domains")
        sp_workdir = os.path.join(test_dir, "smart_parser_server")

        importer = ImportJsonCommand(None, None)
        os.chdir(test_dir)

        import_options = {
            "process_count": 2,
            "dlrobot_human": "dlrobot_human.json",
            "permalinks_folder": permalinks_folder,
        }
        with SmartParserServerForTesting(sp_workdir, doc_folder):
            importer.handle(None, **import_options)

        self.assertEqual(1, models.Section.objects.count())
コード例 #8
0
 def __init__(self, args):
     """Store the arguments, create the thread pool and logger, and run the setup steps.

     ``args`` must provide ``worker_count`` and ``log_file_name``; both are
     read below.
     """
     self.args = args
     # Set to True here; nothing in this constructor reads it.
     self.working = True
     self.thread_pool = ThreadPoolExecutor(max_workers=self.args.worker_count)
     # The working folder is set up before the logger is created.
     # NOTE(review): presumably so the log file can be created inside it - confirm.
     self.setup_working_folder()
     # The log file is opened in append mode.
     self.logger = setup_logging(log_file_name=self.args.log_file_name, append_mode=True)
     self.setup_environment()
コード例 #9
0
    def handle(self, *args, **options):
        """Import source documents into the database.

        Three modes, chosen from ``options``:

        * ``office_id`` is set: import that single office;
        * ``process_count`` is greater than 1: distribute the offices into
          buckets and process them in a multiprocessing pool;
        * otherwise: import the offices one by one in this process, stopping
          after ``take_first_n_offices`` offices when that option is set.

        The final section count is logged at the end.
        """
        TImporter.logger = setup_logging(log_file_name="import_json.log")
        importer = TImporter(options)

        self.stdout.write("start importing")
        single_office_id = options.get('office_id')
        # "or 0" also covers the key being present with value None;
        # comparing None with an int would raise TypeError.
        process_count = options.get('process_count') or 0

        if single_office_id is not None:
            importer.init_non_pickable()
            importer.import_office(single_office_id)
        elif process_count > 1:
            importer.delete_before_fork()
            importer.distribute_offices_to_processes(process_count)
            pool = Pool(processes=process_count)
            try:
                pool.map(importer.process_one_office_bucket_in_subprocess,
                         importer.office_buckets.keys())
            finally:
                # Release the worker processes whether or not map() succeeded.
                pool.close()
                pool.join()
            importer.init_after_fork()
        else:
            importer.init_non_pickable()
            max_offices = options.get('take_first_n_offices')
            office_ids = importer.office_to_source_documents.keys()
            for imported_count, office_id in enumerate(office_ids):
                if max_offices is not None and imported_count >= max_offices:
                    break
                importer.import_office(office_id)

        TImporter.logger.info("Section count={}".format(
            models.Section.objects.all().count()))
        TImporter.logger.info("all done")
コード例 #10
0
    def handle(self, *args, **options):
        """Run filter_set over the incomes, vehicles and real estates of rubric 10 sections.

        Each related set is passed to ``self.filter_set`` ordered by id,
        together with the section id and the model class.
        """
        logger = setup_logging("fix_fsin")
        related_sets = (
            ("income_set", models.Income),
            ("vehicle_set", models.Vehicle),
            ("realestate_set", models.RealEstate),
        )

        for section in models.Section.objects.filter(rubric_id=10):
            for related_name, model in related_sets:
                records = getattr(section, related_name).all().order_by('id')
                self.filter_set(logger, section.id, records, model)
コード例 #11
0
 def __init__(self, args):
     """Create the logger and copy the access-log settings from ``args``."""
     self.logger = setup_logging(log_file_name="access_log_reader.log")
     self.args = args
     # Individual settings are mirrored as attributes of the reader itself.
     self.start_access_log_date = args.start_access_log_date
     self.last_access_log_date = args.last_access_log_date
     self.access_log_folder = args.access_log_folder
     self.min_request_freq = args.min_request_freq
コード例 #12
0
    def test_complex_import(self):
        """Import the same documents twice and check that ids are reused.

        After the first import the section and source document ids are
        recorded. The permalinks storage is then created, the SQL sequences are
        created, sections and source documents are deleted and the import is
        repeated. The second import must produce the same (id, sha256) pairs
        for documents and the same (id, person_name) pairs for sections.
        """
        self.assertGreater(models.Office.objects.count(), 0)
        for model in (models.Income, models.RealEstate, models.Vehicle,
                      models.Section, models.Source_Document):
            model.objects.all().delete()

        test_dir = os.path.dirname(__file__)
        permalinks_folder = test_dir
        logger = setup_logging(log_file_name="test_complex_import.log")
        TPermalinksManager(logger, {
            'directory': permalinks_folder
        }).create_empty_dbs()

        doc_folder = os.path.join(test_dir, "domains")
        sp_workdir = os.path.join(test_dir, "smart_parser_server")

        importer = ImportJsonCommand(None, None)
        os.chdir(test_dir)

        def run_import():
            # One full import pass against a test smart parser server.
            with SmartParserServerForTesting(sp_workdir, doc_folder):
                importer.handle(None,
                                dlrobot_human="dlrobot_human.json",
                                smart_parser_human_json="human_jsons",
                                permalinks_folder=permalinks_folder)

        # Snapshots are sorted so that the comparison does not depend on the
        # order in which the database happens to return the rows.
        def section_snapshot():
            return sorted((s.id, s.person_name)
                          for s in models.Section.objects.all())

        def document_snapshot():
            return sorted((d.id, d.sha256)
                          for d in models.Source_Document.objects.all())

        run_import()

        self.assertEqual(models.Section.objects.count(), 3)
        old_sections = section_snapshot()

        self.assertEqual(models.RealEstate.objects.count(), 3)
        # NOTE(review): this assertion used to appear twice in a row; the second
        # copy may have been meant for another model (e.g. Vehicle) - confirm.
        self.assertEqual(models.Income.objects.count(), 3)
        self.assertGreater(models.Office.objects.count(), 0)
        old_docs = document_snapshot()

        # import the same sections and check that we reuse old section ids and source doc ids
        CreatePermalinksStorageCommand(None, None).handle(
            None, directory=permalinks_folder)
        permalinks_db = TPermalinksManager(logger,
                                           {'directory': permalinks_folder})
        permalinks_db.create_sql_sequences()
        models.Section.objects.all().delete()
        models.Source_Document.objects.all().delete()

        run_import()

        self.assertListEqual(old_docs, document_snapshot())
        self.assertListEqual(old_sections, section_snapshot())
コード例 #13
0
 def __init__(self):
     """Parse the arguments, create the logger and load the checks from the config file.

     The file at ``args.config`` is read as JSON; every item of the loaded
     value is turned into a check via TCheckState.read_from_json.
     """
     self.args = parse_args()
     self.logger = setup_logging("check_disclosures_heath")
     self.last_messages = {}
     self.checks = []
     with open(self.args.config) as config_file:
         for check_json in json.load(config_file):
             check = TCheckState.read_from_json(self, check_json)
             self.checks.append(check)
コード例 #14
0
def main():
    """Build a TPredictionModelBase from the command-line arguments and run test_baseline on it."""
    args = parse_args()
    logger = setup_logging(log_file_name="predict_office_baseline.log")
    test_baseline(
        TPredictionModelBase(
            logger,
            args.bigrams_path,
            args.model_folder,
            test_pool=args.test_pool,
        )
    )
コード例 #15
0
    def test_unicode(self):
        """Request a url with non-ASCII characters; only RobotHttpException is tolerated."""
        url = "http://5%20июня%20запретят%20розничную%20продажу%20алкоголя"
        try:
            THttpRequester.initialize(setup_logging())
            THttpRequester.make_http_request(url, "GET")
        except THttpRequester.RobotHttpException:
            # no UnicodeException for this url
            pass
コード例 #16
0
 def __init__(self, *args, **kwargs):
     """Initialise the base command, create the file logger and the empty working maps."""
     super(Command, self).__init__(*args, **kwargs)
     self.logger = setup_logging(log_file_name="update_person_redirects.log")
     self.options = None
     # Missing keys of this map start out as empty lists.
     self.old_person_to_sections = defaultdict(list)
     self.redirects = {}
     self.new_section_to_person = {}
コード例 #17
0
ファイル: set_rubrics.py プロジェクト: TI-Russia/smart_parser
def main():
    """Read the office table, set rubrics on it and write it to ``args.output_file``."""
    args = parse_args()
    logger = setup_logging("set_rubrics")

    office_table = TOfficeTableInMemory(use_office_types=False)
    office_table.read_from_local_file()
    office_table.set_rubrics(logger)

    output_path = args.output_file
    logger.info("write to {}".format(output_path))
    office_table.write_to_local_file(output_path)
コード例 #18
0
 def handle(self, *args, **options):
     """Normalise ``person_name`` of every section and save the sections that changed."""
     logger = setup_logging(log_file_name="normalize_fio.log")
     for section in models.Section.objects.all():
         normalized_name = normalize_fio_before_db_insert(section.person_name)
         if normalized_name == section.person_name:
             # Already normalised: nothing to log or save.
             continue
         logger.debug("normalize {} -> {}".format(
             section.person_name, normalized_name))
         section.person_name = normalized_name
         section.save()
コード例 #19
0
 def __init__(self, *args, **kwargs):
     """Initialise the base command, create the file logger and unset the working state."""
     super(Command, self).__init__(*args, **kwargs)
     # Every attribute except the logger starts as None.
     self.test_objects = self.test_data = self.options = None
     self.logger = setup_logging(log_file_name="test_ml_pool.log")
     self.ml_model = self.X = self.y_true = None
コード例 #20
0
 def setUp(self):
     """Start the test HTTP server in a thread and prepare the "data.timeout" environment."""
     self.server_address = '127.0.0.1:{}'.format(self.web_site_port)
     self.web_server = TestHTTPServer(self.web_site_port)

     server_thread = threading.Thread(target=start_server,
                                      args=(self.web_server,))
     server_thread.start()
     # Fixed one-second pause after the thread is started.
     # NOTE(review): presumably gives the server time to come up - confirm.
     time.sleep(1)

     self.env = TestDlrobotEnv("data.timeout")
     TDownloadEnv.clear_cache_folder()
     self.logger = setup_logging(log_file_name="dlrobot.log")
     THttpRequester.initialize(self.logger)
コード例 #21
0
def main():
    """Load a TTensorFlowOfficeModel (create_model=False) and test it with the given thresholds."""
    # The logger is created before the arguments are parsed.
    logger = setup_logging(log_file_name="predict_office_test.log")
    args = parse_args()
    office_model = TTensorFlowOfficeModel(
        logger,
        args.bigrams_path,
        args.model_folder,
        create_model=False,
        work_pool_path=args.test_pool,
    )
    office_model.test_model(thresholds=args.threshold)
コード例 #22
0
def main():
    """Build a TPredictionModel from the command-line arguments and run the requested action.

    Only the "train" action is handled here: it calls ``train_catboost`` on
    the model.
    """
    args = parse_args()
    logger = setup_logging(log_file_name="predict_office.log")
    model = TPredictionModel(logger, args.bigrams_path, model_path=args.model_path,
                             row_count=args.row_count,
                             train_pool=args.train_pool,
                             test_pool=args.test_pool)
    # This branch was written as an `elif` with no preceding `if`, which is a
    # SyntaxError; it is now a plain `if`.
    # NOTE(review): branches for other actions may be missing here - confirm.
    if args.action == "train":
        model.train_catboost()
コード例 #23
0
 def setUp(self, website_folder):
     """Create a test environment for ``website_folder`` and copy its project.txt into it.

     The environment is named "data.<basename of website_folder>".
     THttpRequester.ENABLE is set to False and the requester is initialised
     with a file logger.
     """
     env_name = "data.{}".format(os.path.basename(website_folder))
     self.env = TestDlrobotEnv(env_name)

     project_file = os.path.join(os.path.dirname(__file__), website_folder,
                                 "project.txt")
     shutil.copy2(project_file, self.env.data_folder)

     THttpRequester.ENABLE = False
     self.logger = setup_logging(log_file_name="dlrobot.log")
     THttpRequester.initialize(self.logger)
コード例 #24
0
def main():
    """Load a TTensorFlowOfficeModel (create_model=False) and run its toloka step."""
    # The logger is created before the arguments are parsed.
    logger = setup_logging(log_file_name="predict_office_toloka.log")
    args = parse_args()
    office_model = TTensorFlowOfficeModel(
        logger,
        args.bigrams_path,
        args.model_folder,
        create_model=False,
        work_pool_path=args.test_pool,
    )
    office_model.toloka(args.toloka_pool, format=args.format)
コード例 #25
0
    def test(self):
        """Load the person 5295 SQL fixture, run dedupe and compare all sections with the canon.

        The canon lists, for every section, its id, its person id and whether
        a dedupe score was written.
        """
        logger = setup_logging(logger_name="test_real_dedupe")
        test_dir = os.path.dirname(__file__)
        run_sql_script(logger, os.path.join(test_dir, "disclosures.sql.person_id_5295.n"))

        permalinks_folder = test_dir
        permalinks_db = TPermaLinksPerson(permalinks_folder)
        permalinks_db.create_db()
        permalinks_db.save_dataset(setup_logging())
        permalinks_db.recreate_auto_increment_table()
        permalinks_db.close_db()

        dedupe_options = {
            "permalinks_folder": permalinks_folder,
            "write_to_db": True,
            "surname_bounds": ',',
            "model_file": os.path.join(
                test_dir, "../../../deduplicate/model/random_forest.pickle"),
            "threshold": 0.6,
        }
        RunDedupe(None, None).handle(None, **dedupe_options)

        person_id = 5295
        self.assertEqual(models.Person.objects.count(), 3)
        person = models.Person.objects.get(id=person_id)
        self.assertIsNotNone(person)
        self.assertEqual(5295, person.declarator_person_id)

        # (section id, person id, dedupe score is set)
        canon_sections = [
            (451721, 5295, True),
            (452066, 5295, True),
            (452420, 5295, True),
            (453686, 5295, False),
            (455039, 5295, False),
            (1801614, 5296, True),
            (5105303, 5295, True),
            (6437989, 5297, True),
            (6672563, 5297, True),
            (6674154, 5297, True),
            (6773981, 5297, True),
        ]
        sections = [(s.id, s.person_id, s.dedupe_score is not None)
                    for s in models.Section.objects.all()]
        self.assertListEqual(canon_sections, sections)
コード例 #26
0
 def __init__(self, args):
     """Create the logger, the output database, the web site list and the robot config.

     ``args`` must provide ``output_json``, which is passed to
     TDlrobotHumanFileDBM.
     """
     self.args = args
     # The log file is opened in append mode.
     self.logger = setup_logging(log_file_name="join_human_and_dlrobot.log",
                                 append_mode=True)
     # Output database: constructed from args.output_json, then create_db() is called on it.
     self.output_dlrobot_human = TDlrobotHumanFileDBM(args.output_json)
     self.output_dlrobot_human.create_db()
     # Counter initialised to zero; this constructor does not update it.
     self.old_files_with_office_count = 0
     self.web_sites_db = TDeclarationWebSiteList(self.logger)
     # Shortcut to the offices attribute of the web site list.
     self.offices = self.web_sites_db.offices
     # The robot config is read for the "prod" config type.
     self.dlrobot_config = TRobotConfig.read_by_config_type("prod")
コード例 #27
0
def main():
    """Write a random sample of source documents into a tar archive.

    All document keys of the input database are shuffled; documents are then
    fetched by sha256 until ``args.sample_size`` files have been collected.
    When ``args.income_year`` is set, only documents whose calculated income
    year equals it are kept. The files are written to a temporary folder,
    which is finally packed with ``tar`` into ``args.output_file``.
    """
    import subprocess  # function-scope import: used only for the final tar call

    args = parse_args()
    logger = setup_logging("create_sample")
    dlrobot_human = TDlrobotHumanFileDBM(args.input_file)
    dlrobot_human.open_db_read_only()
    source_doc_client = TSourceDocClient(TSourceDocClient.parse_args([]))
    smart_parser_client = TSmartParserCacheClient(
        TSmartParserCacheClient.parse_args([]))
    logger.info("create population")

    # The folder is recreated from scratch on every run.
    tmp_folder = '/tmp/create_sample_sp'
    if os.path.exists(tmp_folder):
        shutil.rmtree(tmp_folder)
    logger.info("create directory {}".format(tmp_folder))
    os.mkdir(tmp_folder)
    population = list(dlrobot_human.get_all_keys())
    random.shuffle(population)

    logger.info("fetch files")
    found = set()
    for sha256 in population:
        logger.debug("get doc {}".format(sha256))
        file_data, file_extension = source_doc_client.retrieve_file_data_by_sha256(
            sha256)
        if file_data is None:
            logger.error("cannot get data for {}".format(sha256))
            continue

        if args.income_year is not None:
            smart_parser_json = smart_parser_client.retrieve_json_by_sha256(
                sha256)
            if smart_parser_json is None or len(smart_parser_json) == 0:
                logger.error(
                    "empty or invalid smart parser json for {}".format(sha256))
                continue
            src_doc = dlrobot_human.get_document(sha256)
            year = src_doc.calc_document_income_year(smart_parser_json)
            if year != args.income_year:
                logger.error("different year ({} != {})".format(
                    year, args.income_year))
                continue
        found.add(sha256)
        # NOTE(review): sha256 is already in `found` here, so file names start
        # at 2, not 1 - left unchanged; confirm whether that is intended.
        file_path = os.path.join(tmp_folder,
                                 "{}{}".format(len(found) + 1, file_extension))
        with open(file_path, "wb") as outp:
            outp.write(file_data)
        if len(found) >= args.sample_size:
            break

    logger.info("found {} files".format(len(found)))
    output_file = os.path.abspath(args.output_file)
    # Argument list instead of a shell string: paths with spaces or shell
    # metacharacters are passed to tar unchanged.
    tar_cmd = [
        "tar", "-C", os.path.dirname(tmp_folder),
        "--create", "--file", output_file,
        os.path.basename(tmp_folder),
    ]
    logger.info(" ".join(tar_cmd))
    try:
        completed = subprocess.run(tar_cmd)
    except OSError as exp:
        # e.g. tar is not installed; report it instead of failing silently
        logger.error("cannot run tar: {}".format(exp))
        return
    if completed.returncode != 0:
        logger.error("tar failed with exit code {}".format(
            completed.returncode))
コード例 #28
0
ファイル: name_report.py プロジェクト: TI-Russia/smart_parser
 def __init__(self, *args, **kwargs):
     """Initialise the base command, load region names and create the empty name sets."""
     super(Command, self).__init__(*args, **kwargs)
     self.logger = setup_logging(log_file_name="name_report.log")
     # Region id -> region name, read from the Region model.
     self.regions = {region.id: region.name
                     for region in models.Region.objects.all()}
     self.names_masc, self.names_fem = set(), set()
     self.surnames_masc, self.surnames_fem = set(), set()
     self.gender_recognizer = TGenderRecognizer()
コード例 #29
0
 def setup_project(self, morda_url):
     """Create a robot project with one web site and a robot step for that site.

     The "data.prohibited" test environment is created as well, and its data
     folder becomes TDownloadEnv.FILE_CACHE_FOLDER.
     """
     self.project = TRobotProject(
         setup_logging('prohibited'),
         '',
         config=TRobotConfig(),
         export_folder="result",
         enable_search_engine=False)
     self.robot_step = TRobotStep(self.project.add_web_site(morda_url))

     self.env = TestDlrobotEnv("data.prohibited")
     TDownloadEnv.FILE_CACHE_FOLDER = self.env.data_folder
コード例 #30
0
def main():
    """Send the declaration files of every folder that has a "result" subfolder.

    Folders without a "result" subfolder are reported with an error message
    and skipped.
    """
    args = parse_args()
    logger = setup_logging("send_docs")
    decl_sender = TDeclarationSender(logger, True, True)

    for folder in args.folders:
        logger.info("folder = {}".format(folder))
        result_folder = os.path.join(folder, "result")
        if os.path.exists(result_folder):
            decl_sender.send_declaraion_files_to_other_servers(folder)
        else:
            logger.error("no directory {} found".format(result_folder))