Esempio n. 1
0
    def test(self):
        """Run dedupe on a dumped object set and check one known section.

        All sections are deleted first, the auto-increment table of the
        permalinks database located next to this file is recreated, and
        ``RunDedupe`` is executed with ``recreate_db=True`` and
        ``write_to_db=True``.  Section 757036 is then expected to be linked
        to person 1406125.
        """
        # The returned logger is not needed here; the call itself is kept.
        setup_logging(logger_name="test_real_dedupe")
        models.Section.objects.all().delete()

        test_dir = os.path.dirname(__file__)
        permalinks_folder = test_dir

        permalinks = TPermaLinksPerson(permalinks_folder)
        permalinks.open_db_read_only()
        permalinks.recreate_auto_increment_table()
        permalinks.close_db()

        dedupe_options = {
            "permalinks_folder": permalinks_folder,
            "input_dedupe_objects": os.path.join(test_dir,
                                                 "dedupe_objects.dump"),
            "model_file": os.path.join(
                test_dir,
                "../../../deduplicate/model/random_forest.pickle"),
            "threshold": 0.6,
            "recreate_db": True,
            "surname_bounds": ',',
            "write_to_db": True,
        }
        RunDedupe(None, None).handle(None, **dedupe_options)

        checked_section = models.Section.objects.get(id=757036)
        self.assertEqual(1406125, checked_section.person_id)
Esempio n. 2
0
    def test(self):
        """Rebuild persons with fake dedupe and check the old person id is reused.

        Two sections are attached to person 99 and a third one is created
        without a person.  After the dataset is saved to the permalinks
        database, the links and the person itself are removed.  A rebuild
        with ``fake_dedupe=True`` must then leave exactly one person and link
        all three sections to id 99.
        """
        self.initialize()

        person_id = 99
        person = models.Person(id=person_id)
        person.save()
        linked_sections = (
            self.create_section(1, "Иванов Иван Иванович", person=person),
            self.create_section(2, "Иванов И. И.", person=person),
        )
        self.create_section(3, "Петров И. И.")

        permalinks_folder = os.path.dirname(__file__)
        permalinks = TPermaLinksPerson(permalinks_folder)
        permalinks.create_db()
        permalinks.save_dataset(setup_logging())
        permalinks.recreate_auto_increment_table()
        permalinks.close_db()

        # Detach the sections and drop the person so that dedupe has to
        # restore the link on its own.
        for linked in linked_sections:
            linked.person = None
            linked.save()
        person.delete()

        dedupe_options = dict(permalinks_folder=permalinks_folder,
                              write_to_db=True,
                              fake_dedupe=True,
                              surname_bounds=',',
                              take_sections_with_empty_income=True,
                              rebuild=True)
        RunDedupe(None, None).handle(None, **dedupe_options)

        self.assertEqual(models.Person.objects.count(), 1)

        for section_id in (1, 2, 3):
            stored = models.Section.objects.get(id=section_id)
            self.assertEqual(stored.person_id, person_id)
Esempio n. 3
0
    def test(self):
        """Run dedupe with the model file on an SQL fixture around person 5295.

        The fixture script is loaded, the dataset is saved to the permalinks
        database, and ``RunDedupe`` is run with ``write_to_db=True``.  The
        test expects three persons and the exact (section id, person id,
        has dedupe score) triples listed in ``expected_sections``.
        """
        test_dir = os.path.dirname(__file__)

        logger = setup_logging(logger_name="test_real_dedupe")
        run_sql_script(
            logger,
            os.path.join(test_dir, "disclosures.sql.person_id_5295.n"))

        permalinks_folder = os.path.dirname(__file__)
        permalinks = TPermaLinksPerson(permalinks_folder)
        permalinks.create_db()
        permalinks.save_dataset(setup_logging())
        permalinks.recreate_auto_increment_table()
        permalinks.close_db()

        dedupe_options = {
            "permalinks_folder": permalinks_folder,
            "write_to_db": True,
            "surname_bounds": ',',
            "model_file": os.path.join(
                test_dir,
                "../../../deduplicate/model/random_forest.pickle"),
            "threshold": 0.6,
        }
        RunDedupe(None, None).handle(None, **dedupe_options)

        person_id = 5295
        self.assertEqual(models.Person.objects.count(), 3)
        person = models.Person.objects.get(id=person_id)
        self.assertIsNotNone(person)
        self.assertEqual(5295, person.declarator_person_id)

        # (section id, person id, dedupe_score is set)
        expected_sections = [
            (451721, 5295, True),
            (452066, 5295, True),
            (452420, 5295, True),
            (453686, 5295, False),
            (455039, 5295, False),
            (1801614, 5296, True),
            (5105303, 5295, True),
            (6437989, 5297, True),
            (6672563, 5297, True),
            (6674154, 5297, True),
            (6773981, 5297, True),
        ]
        actual_sections = [
            (row.id, row.person_id, row.dedupe_score is not None)
            for row in models.Section.objects.all()
        ]
        self.assertListEqual(expected_sections, actual_sections)
Esempio n. 4
0
    def test(self):
        """Rebuild with fake dedupe and check a declarator person is kept.

        Person 2 (declarator id 1111) owns section 1; section 2 has no
        person.  After the rebuild there must still be exactly one person
        with the same declarator id, and both sections must point to it.
        """
        self.initialize()

        person_id = 2
        declarator_person_id = 1111
        original_person = models.Person(
            id=person_id,
            declarator_person_id=declarator_person_id,
            person_name="Иванов Иван Иванович")
        original_person.save()

        self.create_section(1, "Иванов Иван Иванович", original_person)
        self.create_section(2, "Иванов И. И.")

        permalinks_folder = os.path.dirname(__file__)
        permalinks = TPermaLinksPerson(permalinks_folder)
        permalinks.create_db()
        permalinks.save_dataset(setup_logging())
        #permalinks.save_max_plus_one_primary_key(3)
        permalinks.recreate_auto_increment_table()
        permalinks.close_db()

        dedupe_options = dict(permalinks_folder=permalinks_folder,
                              write_to_db=True,
                              fake_dedupe=True,
                              surname_bounds=',',
                              take_sections_with_empty_income=True,
                              rebuild=True)
        RunDedupe(None, None).handle(None, **dedupe_options)

        self.assertEqual(models.Person.objects.count(), 1)
        person = models.Person.objects.get(id=person_id)
        self.assertIsNotNone(person)
        self.assertEqual(declarator_person_id, person.declarator_person_id)

        for section_id in (1, 2):
            stored = models.Section.objects.get(id=section_id)
            self.assertEqual(stored.person_id, person.id)
Esempio n. 5
0
class Command(BaseCommand):
    """Copy human-assigned person ids from declarator to disclosures sections.

    Sections are matched through "section passports": keys built by
    ``build_section_passport`` from a declarator document id, a person name
    (or only the surname) and the main declarant's income.  The passport
    mapping gives either a declarator person id or the marker string
    ``"AMBIGUOUS_KEY"``.  For each unambiguous match a ``models.Person`` is
    created or reused and the section is linked to it.
    """
    help = 'copy person id from declarator to disclosures'

    def __init__(self, *args, **kwargs):
        super(Command, self).__init__(*args, **kwargs)
        self.options = None
        self.permalinks_db = None
        self.logger = None
        # declarator person id -> models.Person created during this run
        self.declarator_person_id_to_disclosures_person = dict()
        # disclosures person id -> models.Person created during this run
        self.disclosures_person_id_to_disclosures_person = dict()

    def add_arguments(self, parser):
        """Register the command line options of the command."""
        parser.add_argument('--read-person-from-json',
                            dest='read_person_from_json',
                            default=None,
                            help='read person info  from json for testing')
        parser.add_argument('--permalinks-folder',
                            dest='permalinks_folder',
                            required=True)
        parser.add_argument('--declarator-host',
                            dest='declarator_host',
                            required=False)
        parser.add_argument('--person-name-prefix',
                            dest='person_name_prefix',
                            required=False)

    def open_permalinks_db(self):
        """Open the permalinks database from ``permalinks_folder`` read-only."""
        self.permalinks_db = TPermaLinksPerson(
            self.options['permalinks_folder'])
        self.permalinks_db.open_db_read_only()

    def build_passport_to_person_id_mapping_from_declarator(self):
        """Return the section passport -> declarator person id mapping.

        The mapping is loaded from the json file given by
        ``read_person_from_json`` when that option is set; otherwise it is
        requested from ``declarator_host``.
        """
        json_path = self.options.get('read_person_from_json')
        if json_path is not None:
            with open(json_path, "r", encoding="utf8") as inpf:
                return json.load(inpf)
        return get_all_section_from_declarator_with_person_id(
            self.options['declarator_host'])

    # we think that person ids in declarator db are stable
    def copy_human_merge(self, section, declarator_person_id):
        """Link ``section`` to the person for ``declarator_person_id``.

        A person already created for this declarator id during the run is
        reused (and gets the section's name if that one is longer).
        Otherwise the disclosures person id is read from the permalinks
        database; if a person with that id was already created, it is reused
        and a conflicting declarator id is logged as an error, else a new
        ``models.Person`` is saved and cached.  The section's
        ``dedupe_score`` is reset to ``None`` and the section is saved.
        """
        person = self.declarator_person_id_to_disclosures_person.get(
            declarator_person_id)
        if person is None:
            person_id = self.permalinks_db.get_person_id_by_declarator_id(
                declarator_person_id, section.id)
            if person_id in self.disclosures_person_id_to_disclosures_person:
                person = self.disclosures_person_id_to_disclosures_person.get(
                    person_id)
                if declarator_person_id != person.declarator_person_id:
                    self.logger.error(
                        "Person id={} has conflict declarator_person_id ({} != {}), use the first person id {}"
                        .format(person_id, declarator_person_id,
                                person.declarator_person_id,
                                person.declarator_person_id))

            else:
                person = models.Person(
                    id=person_id,
                    declarator_person_id=declarator_person_id,
                    person_name=section.person_name)
                person.save()
                self.declarator_person_id_to_disclosures_person[
                    declarator_person_id] = person
                # This cache is looked up by the disclosures person id above,
                # so it must be filled with the same kind of key.
                self.disclosures_person_id_to_disclosures_person[
                    person_id] = person
        elif person.person_name is None or len(person.person_name) < len(
                section.person_name):
            # Prefer the longest person name seen so far.
            person.person_name = section.person_name
            person.save()

        assert person.declarator_person_id is not None
        self.logger.debug(
            "connect section {} to person {}, declarator_person_id={}".format(
                section.id, person.id, person.declarator_person_id))

        section.person = person
        section.dedupe_score = None
        section.save()

    def _lookup_passports(self, section_passports, declarator_document_id,
                          person_name, main_income):
        """Look a section up by full-name and surname passports.

        Returns a pair ``(found_results, fio_is_resolved)``: the non-``None``
        values found in ``section_passports`` (full-name key first, then the
        surname key if ``TRussianFio`` resolved the name) and the resolution
        flag itself.
        """
        found_results = list()
        full_name_key = build_section_passport(declarator_document_id,
                                               person_name, main_income)
        full_name_result = section_passports.get(full_name_key)
        if full_name_result is not None:
            found_results.append(full_name_result)
        fio = TRussianFio(person_name)
        if fio.is_resolved:
            surname_key = build_section_passport(declarator_document_id,
                                                 fio.family_name, main_income)
            surname_result = section_passports.get(surname_key)
            if surname_result is not None:
                found_results.append(surname_result)
        return found_results, fio.is_resolved

    @staticmethod
    def _first_unambiguous_person_id(found_results):
        """Return the first result that is not "AMBIGUOUS_KEY", else None."""
        return next((person_id for person_id in found_results
                     if person_id != "AMBIGUOUS_KEY"), None)

    def process_section(self, section, section_passports):
        """Try to link one section to a declarator person.

        The main declarant's income (0 if there is none) and every declarator
        file reference of the section's source document are used to build
        passports.  Returns ``True`` if the section was linked, ``False`` if
        nothing was found or all matches were ambiguous.
        """
        main_income = 0
        for i in section.income_set.all():
            if i.relative == models.Relative.main_declarant_code:
                main_income = i.size
        found_results = list()
        for declaration_info in section.source_document.declarator_file_reference_set.all(
        ):
            results, fio_is_resolved = self._lookup_passports(
                section_passports, declaration_info.declarator_document_id,
                section.person_name, main_income)
            found_results.extend(results)
            if not fio_is_resolved:
                self.logger.error(
                    "section {} fio={} cannot find surname".format(
                        section.id, section.person_name))

        if len(found_results) == 0:
            self.logger.debug(
                "section {} fio={} cannot be found in declarator".format(
                    section.id, section.person_name))
            return False

        person_id = self._first_unambiguous_person_id(found_results)
        if person_id is None:
            self.logger.debug("section {} fio={} is ambiguous".format(
                section.id, section.person_name))
            return False

        self.copy_human_merge(section, person_id)
        return True

    def copy_declarator_person_ids(self, section_passports):
        """Link every matching section to its declarator person.

        Iterates over sections joined with their main declarant income and
        declarator file references, looks each row up in
        ``section_passports`` and calls ``copy_human_merge`` for the first
        unambiguous match.  The number of merges is logged at the end.
        """
        # The formatted value comes from models.Relative, not from user input.
        query = """
            select s.id, r.declarator_document_id, s.person_name, i.size
            from declarations_section s
            join declarations_income i on i.section_id = s.id
            join declarations_source_document d on s.source_document_id = d.id
            join declarations_declarator_file_reference r on r.source_document_id = d.id
            where i.relative = '{}'
        """.format(models.Relative.main_declarant_code)
        merge_count = 0
        with connection.cursor() as cursor:
            cursor.execute(query)
            for section_id, declarator_document_id, person_name, main_income in cursor:
                found_results, _ = self._lookup_passports(
                    section_passports, declarator_document_id, person_name,
                    main_income)
                if len(found_results) == 0:
                    continue
                person_id = self._first_unambiguous_person_id(found_results)
                if person_id is None:
                    self.logger.debug(
                        "section {} fio={} is ambiguous".format(
                            section_id, person_name))
                    continue
                self.copy_human_merge(
                    models.Section.objects.get(id=section_id), person_id)
                merge_count += 1
        self.logger.info(
            "set human person id to {} records".format(merge_count))

    def handle(self, *args, **options):
        """Entry point: copy declarator person ids into an empty person table.

        Asserts that no ``models.Person`` exists yet, opens the permalinks
        database, loads the passport mapping and links the sections.  The
        permalinks database is closed even if the copy step fails.
        """
        self.logger = setup_logging(logger_name="copy_person")
        self.options = options
        self.logger.debug("models.Person.objects.count()={}".format(
            models.Person.objects.count()))
        assert models.Person.objects.count() == 0
        self.open_permalinks_db()
        try:
            section_passports = self.build_passport_to_person_id_mapping_from_declarator(
            )
            self.logger.info("merge by {} passports from declarator".format(
                len(section_passports)))
            self.copy_declarator_person_ids(section_passports)
        finally:
            self.permalinks_db.close_db()
        self.logger.info("all done")