def __init__(self, args):
     self.args = args
     self.logger = setup_logging(log_file_name="join_human_and_dlrobot.log",
                                 append_mode=True)
     self.output_dlrobot_human = TDlrobotHumanFileDBM(args.output_json)
     self.output_dlrobot_human.create_db()
     self.old_files_with_office_count = 0
     self.web_sites_db = TDeclarationWebSiteList(self.logger)
     self.offices = self.web_sites_db.offices
     self.dlrobot_config = TRobotConfig.read_by_config_type("prod")
 def __init__(self):
     self.args = parse_args()
     self.logger = setup_logging(logger_name="dlrobot_human")
     self.dlrobot_human = TDlrobotHumanFileDBM(self.args.input_file)
     self.dlrobot_human.open_db_read_only()
     if self.args.action in {
             "check_office", "build_office_train_set", "weak_offices"
     } or self.args.action.endswith('_pool'):
         self.office_index = TOfficePredictIndex(self.logger,
                                                 self.args.office_index)
         self.office_index.read()
     else:
         self.office_index = None
 def add_human_files(self):
     self.logger.info("read {}".format(self.args.human_json))
     human_files = TDlrobotHumanFileDBM(self.args.human_json)
     human_files.open_db_read_only()
     self.logger.info("add human files ...")
     for sha256, src_doc in human_files.get_all_documents():
         if not self.is_new_fns_document_from_declarator(src_doc):
             self.check_declaration_office(sha256, src_doc)
             self.add_dlrobot_file(sha256,
                                   src_doc.file_extension,
                                   decl_refs=src_doc.decl_references)
     self.logger.info("Database Document Count: {}".format(
         self.output_dlrobot_human.get_documents_count()))
 def to_utf8(self):
     new_dlrobot_human = TDlrobotHumanFileDBM(self.args.output_file)
     new_dlrobot_human.create_db()
     src_doc: TSourceDocument
     for key, src_doc in self.dlrobot_human.get_all_documents():
         src_doc.convert_refs_to_utf8()
         new_dlrobot_human.update_source_document(key, src_doc)
     new_dlrobot_human.close_db()
 def add_old_dlrobot_files(self):
     self.logger.info("read {}".format(self.args.old_dlrobot_human_json))
     old_json = TDlrobotHumanFileDBM(self.args.old_dlrobot_human_json)
     old_json.open_db_read_only()
     self.logger.info("copy old files ...")
     self.old_files_with_office_count = 0
     for sha256, src_doc in old_json.get_all_documents():
         if src_doc.calculated_office_id is not None:
             self.old_files_with_office_count += 1
         self.add_dlrobot_file(sha256,
                               src_doc.file_extension,
                               web_refs=src_doc.web_references,
                               decl_refs=src_doc.decl_references)
     self.logger.info("Database Document Count: {}".format(
         self.output_dlrobot_human.get_documents_count()))
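
These fragments exercise the access modes of TDlrobotHumanFileDBM that recur throughout this listing. A minimal lifecycle sketch, restricted to calls that appear throughout this listing (the file name is a placeholder):

    db = TDlrobotHumanFileDBM("dlrobot_human.dbm")
    db.create_db()   # or open_db_read_only() / open_write_mode() on an existing file
    # ... db.update_source_document(sha256, src_doc), db.get_document(sha256),
    # ... db.get_all_documents(), db.get_documents_count() ...
    db.close_db()
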
Example No. 6
 def __init__(self, args):
     self.args = args
     self.dlrobot_human = TDlrobotHumanFileDBM(args['dlrobot_human'])
     self.dlrobot_human.open_db_read_only()
     self.all_section_passports = set()
     if models.Section.objects.count() > 0:
         raise Exception(
             "implement all section passports reading from db if you want to import to non-empty db! "
         )
     self.office_to_source_documents = self.build_office_to_file_mapping()
     self.permalinks_db_section = None
     self.permalinks_db_source_document = None
     self.smart_parser_cache_client = None
     self.regions = TRussianRegions()
     self.office_buckets = defaultdict(list)
 def __init__(self, args):
     self.logger = setup_logging(log_file_name="predict_office.log")
     self.dlrobot_human_path = args.dlrobot_human_path
     self.dlrobot_human = TDlrobotHumanFileDBM(self.dlrobot_human_path)
     self.dlrobot_human.open_write_mode()
     self.enable_ml = args.enable_ml
     sp_args = TSmartParserCacheClient.parse_args([])
     self.smart_parser_server_client = TSmartParserCacheClient(sp_args, self.logger)
     model_path = args.office_model_path
     self.max_failures_count = args.max_failures_count
     assert (os.path.exists(model_path))
     bigrams_path = os.path.join(model_path, "office_ngrams.txt")
     ml_model_path = os.path.join(model_path, "model")
     self.office_ml_model = TTensorFlowOfficeModel(self.logger, bigrams_path, ml_model_path, create_model=False)
     self.regional_tax_offices = self.build_regional_tax_offices()
     self.web_sites = TDeclarationWebSiteList(self.logger, RUSSIA.offices_in_memory)
     self.title_parser = TOfficeFromTitle(self.logger, web_sites=self.web_sites)
     self.src_doc_to_rule_results = dict()
    def delete_by_sha256(self):
        sha256_list = self.build_sha256_list()
        assert self.args.output_file is not None

        new_dlrobot_human = TDlrobotHumanFileDBM(self.args.output_file)
        new_dlrobot_human.create_db()

        for sha256, src_doc in self.dlrobot_human.get_all_documents():
            if sha256 not in sha256_list:
                new_dlrobot_human.update_source_document(sha256, src_doc)

        new_dlrobot_human.close_db()
    def select_by_sha256(self):
        sha256_list = self.build_sha256_list()
        assert self.args.output_file is not None

        new_dlrobot_human = TDlrobotHumanFileDBM(self.args.output_file)
        new_dlrobot_human.create_db()

        for sha256 in sha256_list:
            src_doc = self.dlrobot_human.get_document(sha256)
            new_dlrobot_human.update_source_document(sha256, src_doc)

        new_dlrobot_human.close_db()
    def test_join_dlrobot_and_human(self):
        self.assertGreater(models.Office.objects.count(), 0)
        args = [
            '--max-ctime',
            '5602811863',  #the far future
            '--input-dlrobot-folder',
            'processed_projects',
            '--human-json',
            "human_files.json",
            '--old-dlrobot-human-json',
            'old/dlrobot_human.json',
            '--output-json',
            self.dlrobot_human_path
        ]
        joiner = TJoiner(TJoiner.parse_args(args))
        joiner.main()

        dlrobot_human = TDlrobotHumanFileDBM(self.dlrobot_human_path)
        dlrobot_human.open_db_read_only()
        stats = dlrobot_human.get_stats()
        self.assertDictEqual(CANON_STATS, stats)

        result_json = dlrobot_human.to_json()
        self.maxDiff = None
        self.assertDictEqual(CANON_HUMAN_DLROBOT, result_json)
    def to_json(self):
        if self.has_sha256_filters():
            self.args.output_file = "tmp.dbm"
            self.select_by_sha256()
            tmp_db = TDlrobotHumanFileDBM(self.args.output_file)
            tmp_db.open_db_read_only()
            js = tmp_db.to_json()
            tmp_db.close_db()
            os.unlink(self.args.output_file)
        else:
            js = self.dlrobot_human.to_json()

        print(json.dumps(js, indent=4, ensure_ascii=False))
Example No. 12
def main():
    args = parse_args()
    logger = setup_logging("create_sample")
    dlrobot_human = TDlrobotHumanFileDBM(args.input_file)
    dlrobot_human.open_db_read_only()
    source_doc_client = TSourceDocClient(TSourceDocClient.parse_args([]))
    smart_parser_client = TSmartParserCacheClient(
        TSmartParserCacheClient.parse_args([]))
    logger.info("create population")

    tmp_folder = '/tmp/create_sample_sp'
    if os.path.exists(tmp_folder):
        shutil.rmtree(tmp_folder)
    logger.info("create directory {}".format(tmp_folder))
    os.mkdir(tmp_folder)
    population = list(dlrobot_human.get_all_keys())
    random.shuffle(population)

    logger.info("fetch files")
    found = set()
    for sha256 in population:
        logger.debug("get doc {}".format(sha256))
        file_data, file_extension = source_doc_client.retrieve_file_data_by_sha256(
            sha256)
        if file_data is None:
            logger.error("cannot get data for {}".format(sha256))
            continue

        if args.income_year is not None:
            smart_parser_json = smart_parser_client.retrieve_json_by_sha256(
                sha256)
            if smart_parser_json is None or len(smart_parser_json) == 0:
                logger.error(
                    "empty or invalid smart parser json for {}".format(sha256))
                continue
            src_doc = dlrobot_human.get_document(sha256)
            year = src_doc.calc_document_income_year(smart_parser_json)
            if year != args.income_year:
                logger.error("different year ({} != {})".format(
                    year, args.income_year))
                continue
        found.add(sha256)
        file_path = os.path.join(tmp_folder,
                                 "{}{}".format(len(found) + 1, file_extension))
        with open(file_path, "wb") as outp:
            outp.write(file_data)
        if len(found) >= args.sample_size:
            break

    logger.info("found {} files".format(len(found)))
    output_file = os.path.abspath(args.output_file)
    cmd = "tar -C {} --create --file {} {}".format(
        os.path.dirname(tmp_folder), output_file, os.path.basename(tmp_folder))
    logger.info(cmd)
    os.system(cmd)
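
The archive step above shells out to tar. A hedged alternative sketch using the standard-library tarfile module, assuming the same tmp_folder and output_file values (this helper is not part of the original script):

    import os
    import tarfile

    def pack_sample(tmp_folder, output_file):
        # equivalent to "tar -C <parent> --create --file <output_file> <basename>":
        # store the folder under its base name in an uncompressed tar archive
        with tarfile.open(output_file, "w") as tar:
            tar.add(tmp_folder, arcname=os.path.basename(tmp_folder))
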
 def common_test(self, document_file_id, canon_json):
     output_db = "human_files.dbm"
     arg_list = ['--document-file-id', str(document_file_id), '--table', 'declarations_documentfile',
                 '--dlrobot-human-json', output_db, '--start-from-an-empty-file']
     source_doc_workdir = os.path.join(os.path.dirname(__file__), "source_doc")
     smart_parser_workdir = os.path.join(os.path.dirname(__file__), "smart_parser_workdir")
     with SourceDocServerForTesting(source_doc_workdir) as source_doc_wrapper:
         with SmartParserServerForTesting(smart_parser_workdir) as smart_parser_server:
             args = TExportHumanFiles.parse_args(arg_list)
             with TExportHumanFiles(args) as exporter:
                 exporter.export_files()
                 smart_parser_server.server.task_queue.join()
             self.assertEqual(source_doc_wrapper.server.get_stats()['source_doc_count'], 1)
             json.dumps(canon_json, indent=4, ensure_ascii=False)
             f = TDlrobotHumanFileDBM(output_db)
             f.open_db_read_only()
             self.assertDictEqual(canon_json, f.to_json())
             f.close_db()
             time.sleep(2)
             self.assertEqual(smart_parser_server.server.get_stats()['session_write_count'], 1)
class TJoiner:
    @staticmethod
    def parse_args(arg_list):
        default_ml_model_path = os.path.join(os.path.dirname(__file__),
                                             "../predict_office/model")
        parser = argparse.ArgumentParser()
        # input args
        parser.add_argument("--max-ctime",
                            dest='max_ctime',
                            required=True,
                            type=int,
                            help="max ctime of an input folder")
        parser.add_argument("--input-dlrobot-folder",
                            dest='input_dlrobot_folder',
                            required=True)
        parser.add_argument("--human-json", dest='human_json', required=True)
        parser.add_argument("--old-dlrobot-human-json",
                            dest='old_dlrobot_human_json',
                            required=False)
        parser.add_argument("--office-model-path",
                            dest='office_model_path',
                            required=False,
                            default=default_ml_model_path)

        # output args
        parser.add_argument("--output-json",
                            dest='output_json',
                            default="dlrobot_human.json")

        return parser.parse_args(arg_list)

    def __init__(self, args):
        self.args = args
        self.logger = setup_logging(log_file_name="join_human_and_dlrobot.log",
                                    append_mode=True)
        self.output_dlrobot_human = TDlrobotHumanFileDBM(args.output_json)
        self.output_dlrobot_human.create_db()
        self.old_files_with_office_count = 0
        self.web_sites_db = TDeclarationWebSiteList(self.logger)
        self.offices = self.web_sites_db.offices
        self.dlrobot_config = TRobotConfig.read_by_config_type("prod")

    # merge references into the document stored under this sha256, creating it on first sight;
    # immutable default arguments avoid the shared-mutable-default pitfall
    def add_dlrobot_file(self,
                         sha256,
                         file_extension,
                         web_refs=(),
                         decl_refs=()):
        src_doc = self.output_dlrobot_human.get_document_maybe(sha256)
        if src_doc is None:
            src_doc = TSourceDocument(file_extension)
            self.output_dlrobot_human.update_source_document(sha256, src_doc)
        for web_ref in web_refs:
            src_doc.add_web_reference(web_ref)
        for decl_ref in decl_refs:
            src_doc.add_decl_reference(decl_ref)
        self.output_dlrobot_human.update_source_document(sha256, src_doc)

    def add_files_of_one_project(self, dlrobot_project):
        self.logger.debug("process {}".format(dlrobot_project))
        project_folder = os.path.join(self.args.input_dlrobot_folder,
                                      dlrobot_project)
        dlrobot_project_without_timestamp = re.sub(r'\.[0-9]+$', '',
                                                   dlrobot_project)
        project_path = os.path.join(project_folder,
                                    dlrobot_project_without_timestamp + ".txt")
        if not os.path.exists(project_path):
            self.logger.error(
                "no dlrobot project file found in {}".format(project_folder))
            return
        try:
            project = TRobotProject(self.logger,
                                    project_path,
                                    config=self.dlrobot_config,
                                    web_sites_db=self.web_sites_db)
            project.read_project(check_step_names=False)
            office_info: TWebSiteCrawlSnapshot
            office_info = project.web_site_snapshots[0]
            site_url = office_info.get_site_url()
            exported_files = dict()
            for export_record in office_info.export_env.exported_files:
                exported_files[export_record.sha256] = export_record
        except Exception as exp:
            self.logger.error("cannot read project {}, exp={}".format(
                project_path, exp))
            return

        file_info: TExportFile
        for sha256, file_info in exported_files.items():
            web_ref = TWebReference(
                url=file_info.url,
                crawl_epoch=self.args.max_ctime,
                site_url=site_url,
                declaration_year=file_info.declaration_year)
            self.add_dlrobot_file(sha256, file_info.file_extension, [web_ref])

    def add_new_dlrobot_files(self):
        self.logger.info("copy dlrobot files from {} ...".format(
            self.args.input_dlrobot_folder))
        with os.scandir(self.args.input_dlrobot_folder) as it:
            for entry in it:
                if entry.is_dir():
                    if entry.stat().st_ctime < self.args.max_ctime:
                        self.add_files_of_one_project(entry.name)
                    else:
                        self.logger.debug("skip too young folder {}".format(
                            entry.name))

        self.logger.info("Database Document Count: {}".format(
            self.output_dlrobot_human.get_documents_count()))

    def add_old_dlrobot_files(self):
        self.logger.info("read {}".format(self.args.old_dlrobot_human_json))
        old_json = TDlrobotHumanFileDBM(self.args.old_dlrobot_human_json)
        old_json.open_db_read_only()
        self.logger.info("copy old files ...")
        self.old_files_with_office_count = 0
        for sha256, src_doc in old_json.get_all_documents():
            if src_doc.calculated_office_id is not None:
                self.old_files_with_office_count += 1
            self.add_dlrobot_file(sha256,
                                  src_doc.file_extension,
                                  web_refs=src_doc.web_references,
                                  decl_refs=src_doc.decl_references)
        self.logger.info("Database Document Count: {}".format(
            self.output_dlrobot_human.get_documents_count()))

    def is_new_fns_document_from_declarator(self, src_doc: TSourceDocument):
        for ref in src_doc.decl_references:
            if 9542 <= ref.office_id <= 10611:
                # this document is already imported from fns but with a different sha256
                return True
        return False

    def check_declaration_office(self, sha256, src_doc: TSourceDocument):
        for ref in src_doc.decl_references:
            if ref.office_id is not None and ref.office_id not in self.offices.offices:
                raise Exception(
                    "document sha256={} office {} is not registered in disclosures"
                    .format(sha256, ref.office_id))

    def add_human_files(self):
        self.logger.info("read {}".format(self.args.human_json))
        human_files = TDlrobotHumanFileDBM(self.args.human_json)
        human_files.open_db_read_only()
        self.logger.info("add human files ...")
        for sha256, src_doc in human_files.get_all_documents():
            if not self.is_new_fns_document_from_declarator(src_doc):
                self.check_declaration_office(sha256, src_doc)
                self.add_dlrobot_file(sha256,
                                      src_doc.file_extension,
                                      decl_refs=src_doc.decl_references)
        self.logger.info("Database Document Count: {}".format(
            self.output_dlrobot_human.get_documents_count()))

    def main(self):
        self.add_new_dlrobot_files()
        if self.args.old_dlrobot_human_json is not None:
            self.add_old_dlrobot_files()
        self.add_human_files()
        self.output_dlrobot_human.close_db()
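
For reference, a minimal driver sketch for the joiner above, mirroring how the unit test invokes it; the __main__ guard is an assumption and not taken from the project:

    if __name__ == "__main__":
        import sys
        joiner = TJoiner(TJoiner.parse_args(sys.argv[1:]))
        joiner.main()
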
Example No. 15
    def export_files(self):
        human_files_db = TDlrobotHumanFileDBM(self.args.dlrobot_human_json)
        if self.args.start_from_empty:
            human_files_db.create_db()
        else:
            human_files_db.open_write_mode()
        document_file_ids = set()
        for sha256, doc in human_files_db.get_all_documents():
            for ref in doc.decl_references:
                if ref.document_file_id is not None:
                    document_file_ids.add(ref.document_file_id)

        files_count = 0
        for document_file_id, document_id, file_path, link, office_id, income_year in self.get_all_file_sql_records():
            if document_file_id in document_file_ids:
                continue

            while self.pdf_conversion_client.server_is_too_busy():
                self.logger.error("waiting 5 minutes for the pdf conversion server, last_pdf_conversion_queue_length={}".format(
                    self.pdf_conversion_client.last_pdf_conversion_queue_length
                ))
                time.sleep(5*60)

            web_site = urlsplit_pro(link).netloc
            if web_site.startswith('www.'):
                web_site = web_site[len('www.'):]

            if self.args.max_files_count is not None and files_count >= self.args.max_files_count:
                break
            self.logger.debug("export document_file_id={}".format(document_file_id))
            for local_file_path, declarator_url in self.download_unzip_and_send_file_source_doc_server(file_path,
                                                                                                    document_file_id):
                sha256 = build_dislosures_sha256(local_file_path)
                self.logger.debug("add {}, sha256={}".format(local_file_path, sha256))
                source_document = TSourceDocument(os.path.splitext(local_file_path)[1])
                ref = TDeclaratorReference()
                ref.document_id = document_id
                ref.document_file_id = document_file_id
                ref._site_url = web_site
                ref.office_id = self.fix_list(sha256, office_id)
                ref.income_year = income_year
                ref.document_file_url = declarator_url
                source_document.add_decl_reference(ref)
                human_files_db.update_source_document(sha256, source_document)
                files_count += 1
        self.logger.debug('added files count: {}'.format(files_count))
        human_files_db.close_db()
        self.send_new_pdfs_to_smart_parser()
class TDlrobotHumanManager:
    def __init__(self):
        self.args = parse_args()
        self.logger = setup_logging(logger_name="dlrobot_human")
        self.dlrobot_human = TDlrobotHumanFileDBM(self.args.input_file)
        self.dlrobot_human.open_db_read_only()
        if self.args.action in {
                "check_office", "build_office_train_set", "weak_offices"
        } or self.args.action.endswith('_pool'):
            self.office_index = TOfficePredictIndex(self.logger,
                                                    self.args.office_index)
            self.office_index.read()
        else:
            self.office_index = None

    def print_web_sites(self):
        value: TSourceDocument
        for key, value in self.dlrobot_human.get_all_documents():
            print("{}\t{}".format(key, value.get_web_site()))

    def has_sha256_filters(self):
        return self.args.sha256_list_file is not None or self.args.sha256 is not None

    def build_sha256_list(self):
        assert self.has_sha256_filters()
        if self.args.sha256_list_file is not None:
            sha_set = set()
            with open(self.args.sha256_list_file) as inp:
                for x in inp:
                    sha_set.add(x.strip())
            return sha_set
        else:
            return {self.args.sha256}

    def select_by_sha256(self):
        sha256_list = self.build_sha256_list()
        assert self.args.output_file is not None

        new_dlrobot_human = TDlrobotHumanFileDBM(self.args.output_file)
        new_dlrobot_human.create_db()

        for sha256 in sha256_list:
            src_doc = self.dlrobot_human.get_document(sha256)
            new_dlrobot_human.update_source_document(sha256, src_doc)

        new_dlrobot_human.close_db()

    def delete_by_sha256(self):
        sha256_list = self.build_sha256_list()
        assert self.args.output_file is not None

        new_dlrobot_human = TDlrobotHumanFileDBM(self.args.output_file)
        new_dlrobot_human.create_db()

        for sha256, src_doc in self.dlrobot_human.get_all_documents():
            if sha256 not in sha256_list:
                new_dlrobot_human.update_source_document(sha256, src_doc)

        new_dlrobot_human.close_db()

    def to_utf8(self):
        new_dlrobot_human = TDlrobotHumanFileDBM(self.args.output_file)
        new_dlrobot_human.create_db()
        src_doc: TSourceDocument
        for key, src_doc in self.dlrobot_human.get_all_documents():
            src_doc.convert_refs_to_utf8()
            new_dlrobot_human.update_source_document(key, src_doc)
        new_dlrobot_human.close_db()

    def to_json(self):
        if self.has_sha256_filters():
            self.args.output_file = "tmp.dbm"
            self.select_by_sha256()
            tmp_db = TDlrobotHumanFileDBM(self.args.output_file)
            tmp_db.open_db_read_only()
            js = tmp_db.to_json()
            tmp_db.close_db()
            os.unlink(self.args.output_file)
        else:
            js = self.dlrobot_human.to_json()

        print(json.dumps(js, indent=4, ensure_ascii=False))

    def check_office(self):
        pool = TOfficePool(self.logger)
        pool.read_cases(self.args.input_predict_office_pool_path)
        positive = 0
        negative = 0
        case: TPredictionCase
        for case in pool.pool:
            src_doc: TSourceDocument
            src_doc = self.dlrobot_human.get_document(case.sha256)
            if case.true_office_id == src_doc.calculated_office_id:
                self.logger.debug("positive case {} office_id={}".format(
                    case.sha256, case.true_office_id))
                positive += 1
            else:
                self.logger.debug(
                    "negative case {} , office_id must be {} but predicted {}".
                    format(case.sha256, case.true_office_id,
                           src_doc.calculated_office_id))
                negative += 1
        rec = {
            "positive_count": positive,
            "negative_count": negative,
            "precision": float(positive) / (negative + positive + 1e-12)
        }
        self.logger.info(json.dumps(rec))
Example No. 17
class TImporter:
    logger = None
    max_vehicle_count = 60

    def build_office_to_file_mapping(self):
        db_offices = set(o.id for o in models.Office.objects.all())
        TImporter.logger.debug("there are {} records in table {} ".format(
            len(db_offices), models.Office.objects.model._meta.db_table))
        office_to_source_documents = defaultdict(list)
        for sha256, src_doc in self.dlrobot_human.get_all_documents():
            office_id = src_doc.calculated_office_id
            if office_id is None:
                continue
            if int(office_id) not in db_offices:
                TImporter.logger.error(
                    "cannot find office id={} from {} in sql table ".format(
                        office_id, self.args['dlrobot_human']))
                raise Exception("integrity failed")
            office_to_source_documents[office_id].append(sha256)
        return office_to_source_documents

    def __init__(self, args):
        self.args = args
        self.dlrobot_human = TDlrobotHumanFileDBM(args['dlrobot_human'])
        self.dlrobot_human.open_db_read_only()
        self.all_section_passports = set()
        if models.Section.objects.count() > 0:
            raise Exception(
                "implement all section passports reading from db if you want to import to non-empty db! "
            )
        self.office_to_source_documents = self.build_office_to_file_mapping()
        self.permalinks_db_section = None
        self.permalinks_db_source_document = None
        self.smart_parser_cache_client = None
        self.regions = TRussianRegions()
        self.office_buckets = defaultdict(list)

    def delete_before_fork(self):
        # close Django connections and the dbm handle so that open handles are not
        # shared with the forked worker processes
        from django import db
        db.connections.close_all()
        self.dlrobot_human.close_db()

    def init_non_pickable(self):
        self.smart_parser_cache_client = TSmartParserCacheClient(
            TSmartParserCacheClient.parse_args([]), TImporter.logger)

        self.permalinks_db_section = TPermaLinksSection(
            self.args['permalinks_folder'])
        self.permalinks_db_section.open_db_read_only()
        self.permalinks_db_source_document = TPermaLinksSourceDocument(
            self.args['permalinks_folder'])
        self.permalinks_db_source_document.open_db_read_only()

        self.dlrobot_human.open_db_read_only()

    def init_after_fork(self):
        from django.db import connection
        connection.connect()
        self.init_non_pickable()

    def get_human_smart_parser_json(self, src_doc, already_imported):
        for ref in src_doc.decl_references:
            filename = os.path.join(self.args['smart_parser_human_json'],
                                    str(ref.document_id) + ".json")
            if os.path.exists(filename) and filename not in already_imported:
                TImporter.logger.debug("import human json {}".format(filename))
                already_imported.add(filename)
                with open(filename, "r") as inp:
                    return json.load(inp)
        return None

    def register_document_in_database(self, sha256, src_doc: TSourceDocument):
        source_document_in_db = models.Source_Document(
            sha256=sha256,
            intersection_status=src_doc.build_intersection_status(),
        )
        source_document_in_db.id, new_file = self.permalinks_db_source_document.get_source_doc_id_by_sha256(
            sha256)
        assert not models.Source_Document.objects.filter(
            id=source_document_in_db.id).exists()
        self.logger.debug("register doc sha256={} id={}, new_file={}".format(
            sha256, source_document_in_db.id, new_file))
        source_document_in_db.file_extension = src_doc.file_extension
        source_document_in_db.save()
        ref: TDeclaratorReference
        for ref in src_doc.decl_references:
            models.Declarator_File_Reference(
                source_document=source_document_in_db,
                declarator_documentfile_id=ref.document_file_id,
                declarator_document_id=ref.document_id,
                web_domain=ref._site_url,
                declarator_document_file_url=ref.document_file_url).save()
        ref: TWebReference
        for ref in src_doc.web_references:
            models.Web_Reference(source_document=source_document_in_db,
                                 dlrobot_url=ref.url,
                                 web_domain=ref._site_url,
                                 crawl_epoch=ref.crawl_epoch).save()

        return source_document_in_db

    def register_section_passport(self, passport):
        if passport in self.all_section_passports:
            TImporter.logger.debug(
                "skip section because a section with the same passport already exists: {}"
                .format(passport))
            return False
        # we process each office in one thread, so there is no need to use thread locks,
        # since office_id is a part of the passport tuple
        self.all_section_passports.add(passport)
        return True

    def calc_income_year(self, input_json, src_doc: TSourceDocument,
                         section_json, section_index):
        # take the year from this particular declarant (one file may contain many declarants with different years)
        # do not pass a default value to get() here, since smart_parser explicitly writes "year": null
        year = section_json.get('year')
        if year is not None:
            return int(year)

        year = src_doc.calc_document_income_year(input_json)

        # if year is absent, then the file is useless
        if year is None:
            raise TSmartParserSectionJson.SerializerException(
                "year is not defined: section No {}".format(section_index))

        return int(year)

    def get_fsin_office_id(self, section_json, src_doc: TSourceDocument):
        department = section_json.get('person', dict()).get('department')
        if department is None or len(department) < 5:
            return src_doc.calculated_office_id
        region_id = self.regions.get_region_all_forms(
            department, TRussianRegions.Russia_as_s_whole_region_id)
        return RUSSIA.get_fsin_by_region(region_id)

    def import_one_smart_parser_json(self, source_document_in_db, input_json,
                                     src_doc: TSourceDocument):
        imported_section_years = list()
        section_index = 0
        TImporter.logger.debug("try to import {} declarants".format(
            len(input_json['persons'])))
        incomes = list()
        is_fsin = RUSSIA.get_office(
            src_doc.calculated_office_id).rubric_id == TOfficeRubrics.Gulag

        for raw_section in input_json['persons']:
            section_index += 1
            section_income_year = self.calc_income_year(
                input_json, src_doc, raw_section, section_index)
            if is_fsin:
                office_id = self.get_fsin_office_id(raw_section, src_doc)
            else:
                office_id = src_doc.calculated_office_id
            with transaction.atomic():
                try:
                    prepared_section = TSmartParserSectionJson(
                        section_income_year, office_id, source_document_in_db)
                    prepared_section.read_raw_json(raw_section)

                    if len(prepared_section.vehicles
                           ) > TImporter.max_vehicle_count:
                        TImporter.logger.debug(
                            "ignore section {} because it has too many vehicles ( > {})"
                            .format(prepared_section.section.person_name,
                                    TImporter.max_vehicle_count))
                        continue
                    passport1 = prepared_section.get_passport_components1(
                    ).get_main_section_passport()
                    if self.register_section_passport(passport1):
                        prepared_section.section.tmp_income_set = prepared_section.incomes
                        passport2 = prepared_section.get_passport_components2(
                        ).get_main_section_passport()
                        section_id, is_new = self.permalinks_db_section.get_section_id(
                            passport1, passport2)
                        if is_new:
                            TImporter.logger.debug(
                                "found a new section {}, set section.id to {}".
                                format(
                                    prepared_section.section.
                                    get_permalink_passport(), section_id))

                        main_income = prepared_section.get_main_declarant_income_size(
                        )
                        if main_income is not None and main_income > 0:
                            incomes.append(main_income)
                        prepared_section.save_to_database(section_id)
                        imported_section_years.append(section_income_year)

                except (DatabaseError,
                        TSmartParserSectionJson.SerializerException) as exp:
                    TImporter.logger.error(
                        "Error! cannot import section N {}: {} ".format(
                            section_index, exp))

        if len(imported_section_years) > 0:
            source_document_in_db.min_income_year = min(imported_section_years)
            source_document_in_db.max_income_year = max(imported_section_years)
            source_document_in_db.section_count = len(imported_section_years)
            median_income = 0
            if len(incomes) > 0:
                median_income = median(incomes)
            # reset medians that do not fit into a signed 32-bit integer
            if median_income >= 2**31:
                median_income = 0
            source_document_in_db.median_income = median_income
            source_document_in_db.save()

        return len(imported_section_years)

    def get_smart_parser_json(self, all_imported_human_jsons, sha256, src_doc):
        response = self.smart_parser_cache_client.retrieve_json_by_sha256(
            sha256)
        if response is None or response == {}:
            return self.get_human_smart_parser_json(src_doc,
                                                    all_imported_human_jsons)
        else:
            return response

    def import_office(self, office_id):
        if self.args.get('rubric_id') is not None and RUSSIA.get_office(
                office_id).rubric_id != self.args.get('rubric_id'):
            return

        all_imported_human_jsons = set()
        max_doc_id = 2**32
        ordered_documents = list()
        for sha256 in self.office_to_source_documents[office_id]:
            doc_id = self.permalinks_db_source_document.get_old_source_doc_id_by_sha256(
                sha256)
            if doc_id is None:
                doc_id = max_doc_id
            ordered_documents.append((doc_id, sha256))
        ordered_documents.sort()
        TImporter.logger.debug("import office {} document count = {}".format(
            office_id, len(ordered_documents)))

        for _, sha256 in ordered_documents:
            src_doc = self.dlrobot_human.get_document(sha256)
            assert src_doc.calculated_office_id == office_id
            smart_parser_json = self.get_smart_parser_json(
                all_imported_human_jsons, sha256, src_doc)
            doc_file_in_db = self.register_document_in_database(
                sha256, src_doc)
            if smart_parser_json is None:
                self.logger.debug(
                    "file {} has no valid smart parser json, skip it".format(
                        sha256))
            else:
                try:
                    sections_count = self.import_one_smart_parser_json(
                        doc_file_in_db, smart_parser_json, src_doc)
                    TImporter.logger.debug("import {} sections from {}".format(
                        sections_count, sha256))
                except TSmartParserSectionJson.SerializerException as exp:
                    TImporter.logger.error(
                        "Error! cannot import smart parser json for file {}: {} "
                        .format(sha256, exp))

    def distribute_offices_to_processes(self, process_count):
        assert process_count > 1
        cnt = 0
        for office_id in self.office_to_source_documents.keys():
            cnt += 1
            if RUSSIA.get_office(office_id).rubric_id == TOfficeRubrics.Gulag:
                # put all FSIN offices into the first process
                bucket_id = 0
            else:
                if len(self.office_buckets[0]) > cnt / process_count:
                    # if bucket 0 already holds more offices than an average bucket, put the current office into one of the other buckets
                    bucket_id = cnt % (process_count - 1) + 1
                else:
                    bucket_id = cnt % process_count
            self.office_buckets[bucket_id].append(office_id)
        for i in self.office_buckets.keys():
            self.logger.debug("bucket[{}] size = {}".format(
                i, len(self.office_buckets[i])))

    def process_one_office_bucket_in_subprocess(self, bucket_id):
        self.init_after_fork()
        for office_id in self.office_buckets[bucket_id]:
            try:
                self.import_office(office_id)
                gc.collect()
            except TSmartParserSectionJson.SerializerException as exp:
                TImporter.logger.error(
                    "cannot import bucket id {}, exception: {}".format(
                        bucket_id, exp))
Example No. 18
class TOfficePredictor:
    default_ml_model_path = os.path.join(os.path.dirname(__file__), "../model")

    @staticmethod
    def parse_args(args):
        parser = argparse.ArgumentParser()
        parser.add_argument("--dlrobot-human-path", dest='dlrobot_human_path', required=True)
        parser.add_argument("--office-model-path", dest='office_model_path', required=False,
                            default=TOfficePredictor.default_ml_model_path)
        parser.add_argument("--disable-ml", dest='enable_ml', required=False, default=True,
                            action="store_false")
        parser.add_argument("--max-failures-count", dest='max_failures_count', required=False, default=100,
                            type=int)
        return parser.parse_args(args=args)

    def __init__(self, args):
        self.logger = setup_logging(log_file_name="predict_office.log")
        self.dlrobot_human_path = args.dlrobot_human_path
        self.dlrobot_human = TDlrobotHumanFileDBM(self.dlrobot_human_path)
        self.dlrobot_human.open_write_mode()
        self.enable_ml = args.enable_ml
        sp_args = TSmartParserCacheClient.parse_args([])
        self.smart_parser_server_client = TSmartParserCacheClient(sp_args, self.logger)
        model_path = args.office_model_path
        self.max_failures_count = args.max_failures_count
        assert (os.path.exists(model_path))
        bigrams_path = os.path.join(model_path, "office_ngrams.txt")
        ml_model_path = os.path.join(model_path, "model")
        self.office_ml_model = TTensorFlowOfficeModel(self.logger, bigrams_path, ml_model_path, create_model=False)
        self.regional_tax_offices = self.build_regional_tax_offices()
        self.web_sites = TDeclarationWebSiteList(self.logger, RUSSIA.offices_in_memory)
        self.title_parser = TOfficeFromTitle(self.logger, web_sites=self.web_sites)
        self.src_doc_to_rule_results = dict()

    def build_regional_tax_offices(self):
        o: TOfficeInMemory
        tax_offices = dict()
        for o in RUSSIA.iterate_offices():
            if o.rubric_id == TOfficeRubrics.Tax:
                tax_offices[o.region_id] = o.office_id
        assert len(tax_offices) > 0
        return tax_offices

    def set_office_id(self, sha256, src_doc: TSourceDocument, office_id, method_name: str):
        old_office_id = src_doc.calculated_office_id
        if old_office_id is None or office_id == old_office_id:
            self.logger.debug("set file {} office_id={} ({} )".format(
                sha256, office_id, method_name))
        else:
            self.logger.info("change office_id from {} to {} for file {} , ({})".format( \
                old_office_id, office_id, sha256, method_name))
        src_doc.calculated_office_id = office_id
        self.dlrobot_human.update_source_document(sha256, src_doc)

    def predict_tax_office(self, sha256, src_doc: TSourceDocument):
        web_ref: TWebReference
        for web_ref in src_doc.web_references:
            if web_ref._site_url.endswith("service.nalog.ru"):
                if src_doc.region_id is None:
                    smart_parser_json = self.smart_parser_server_client.retrieve_json_by_sha256(sha256)
                    if smart_parser_json is None:
                        return False
                    props = smart_parser_json.get('document_sheet_props')
                    if props is None or len(props) == 0 or 'url' not in props[0]:
                        return False
                    url = props[0]['url']
                    region_str = url[:url.find('.')]
                    if not region_str.isdigit():
                        return False
                    src_doc.region_id = int(region_str)

                office_id = self.regional_tax_offices.get(src_doc.region_id)
                if office_id is not None:
                    self.set_office_id(sha256, src_doc, office_id, "regional tax office")
                    return True
        return False
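
    # Worked illustration of the region extraction above (the sample value is hypothetical):
    # with props[0]['url'] == "77.example.nalog.ru", url[:url.find('.')] yields "77", which
    # is all digits, so src_doc.region_id becomes 77 and the office for region 77 is then
    # looked up in self.regional_tax_offices.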

    # if all web references of a document resolve to the same office, ascribe the document to that office
    def single_web_site(self, src_doc):
        r: TWebReference
        offices = set()
        for r in src_doc.web_references:
            if r.get_site_url():
                site_info = self.web_sites.search_url(r.get_site_url())
                if site_info is not None:
                    offices.add(site_info.parent_office.office_id)
        if len(offices) == 1:
            return list(offices)[0]
        return None

    # Take the first office; that is a very bad solution, done only to make the whole thing work.
    # In the future we hope to get rid of it by adding anchor text analysis or more sophisticated title parsing.
    def predict_by_first_web_site(self, case: TPredictionCase, src_doc):
        r: TWebReference
        min_crawl_epoch = time.time()
        office_id = None
        for r in src_doc.web_references:
            if 0 < r.crawl_epoch < min_crawl_epoch:
                site_info = self.web_sites.search_url(r.get_site_url())
                if site_info is not None:
                    min_crawl_epoch = r.crawl_epoch
                    office_id = site_info.parent_office.office_id
        return office_id
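
A self-contained sketch of the selection rule in predict_by_first_web_site, using simplified stand-in records rather than the project's TWebReference objects: the office of the earliest positively-timestamped web reference wins.

    refs = [{"office_id": 7, "crawl_epoch": 1600000000},
            {"office_id": 3, "crawl_epoch": 1500000000},
            {"office_id": 9, "crawl_epoch": 0}]   # epoch 0 is skipped, as in the loop above
    valid = [r for r in refs if r["crawl_epoch"] > 0]
    office_id = min(valid, key=lambda r: r["crawl_epoch"])["office_id"] if valid else None
    # office_id == 3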