Exemple #1
0
 def init_options(self, options):
     """Store the command options and prepare logger and permalinks db.

     Creates the logger (passing ``options.get('logfile')`` as the log file
     name), remembers the ``rebuild`` flag, creates a ``TPermaLinksPerson``
     for ``options['permalinks_folder']`` and calls ``open_db_read_only()``
     on it, then picks up the clustering threshold.

     :param options: mapping of option names to values
     :raises KeyError: if ``permalinks_folder`` is missing
     """
     self.logger = setup_logging(log_file_name=options.get('logfile'))
     self.options = options
     self.rebuild = options.get('rebuild', False)
     # .get(): the command may be called programmatically without this key
     if self.rebuild and not options.get('write_to_db', False):
         self.logger.info(
             "please add --write-to-db  option if you use --rebuild")
     self.permalinks_db = TPermaLinksPerson(options['permalinks_folder'])
     self.permalinks_db.open_db_read_only()
     # A key that is present but None (a parsed command line without
     # --threshold) must count as "not set"; comparing it with 0 would
     # store None as the threshold and suppress the warning.
     threshold = options.get('threshold')
     if threshold:
         self.threshold = threshold
     else:
         self.logger.info(
             'Warning! Threshold is not set. Is it just a test?')
Exemple #2
0
    def test(self):
        """Run dedupe with the pickled model on dumped objects, check one link.

        The section table is emptied first and the command is called with
        ``recreate_db=True`` and ``input_dedupe_objects`` pointing at
        ``dedupe_objects.dump`` next to this test file.
        """
        # the returned logger is not needed here; the call itself is kept
        setup_logging(logger_name="test_real_dedupe")
        models.Section.objects.all().delete()

        test_dir = os.path.dirname(__file__)

        permalinks = TPermaLinksPerson(test_dir)
        permalinks.open_db_read_only()
        permalinks.recreate_auto_increment_table()
        permalinks.close_db()

        dedupe_options = {
            "permalinks_folder": test_dir,
            "input_dedupe_objects": os.path.join(test_dir,
                                                 "dedupe_objects.dump"),
            "model_file": os.path.join(
                test_dir, "../../../deduplicate/model/random_forest.pickle"),
            "threshold": 0.6,
            "recreate_db": True,
            "surname_bounds": ',',
            "write_to_db": True,
        }
        RunDedupe(None, None).handle(None, **dedupe_options)

        linked_section = models.Section.objects.get(id=757036)
        self.assertEqual(1406125, linked_section.person_id)
    def test(self):
        """Fake-dedupe two sections with an empty permalinks db.

        Expects exactly one person (id 1, named after section 1) and both
        sections linked to it.
        """
        self.initialize()
        self.create_section(1, "Иванов Иван Иванович")
        self.create_section(2, "Иванов И. И.")

        folder = os.path.dirname(__file__)
        TPermaLinksPerson(folder).create_and_save_empty_db()
        dedupe_options = dict(
            permalinks_folder=folder,
            write_to_db=True,
            fake_dedupe=True,
            surname_bounds=',',
            take_sections_with_empty_income=True,
            rebuild=True,
        )
        RunDedupe(None, None).handle(None, **dedupe_options)

        self.assertEqual(models.Person.objects.count(), 1)
        only_person = models.Person.objects.get(id=1)
        self.assertEqual(only_person.person_name, "Иванов Иван Иванович")

        for section_id in (1, 2):
            section = models.Section.objects.get(id=section_id)
            self.assertEqual(section.person_id, 1)
Exemple #4
0
    def test(self):
        """After ``run_copy_person_id(False, True)`` section 1 has no person."""
        folder = CopyPersonIdTestCaseBase.permalinks_folder
        TPermaLinksPerson(folder).create_and_save_empty_db()

        self.run_copy_person_id(False, True)

        section = models.Section.objects.get(id=self.section_id1)
        self.assertEqual(section.person, None)
Exemple #5
0
    def test(self):
        """Copy person ids when section 1 is already linked to person 1.

        The person is saved without a ``declarator_person_id``; after the
        copy there must still be one person, with the same id and with
        ``declarator_person_id`` equal to ``self.declarator_person_id``.
        """
        self.create_test_db()

        expected_person_id = 1
        existing_person = models.Person(id=expected_person_id,
                                        person_name=self.fio)
        self.assertIsNone(existing_person.declarator_person_id)
        existing_person.save()

        section = models.Section.objects.get(id=self.section_id1)
        section.person = existing_person
        section.save()

        folder = CopyPersonIdTestCaseBase.permalinks_folder
        TPermaLinksPerson(folder).create_and_save_empty_db()

        storage_command = CreatePermalinksStorageCommand(None, None)
        storage_command.handle(None, directory=folder)
        permalinks_db = TPermaLinksPerson(folder)
        permalinks_db.open_db_read_only()
        permalinks_db.recreate_auto_increment_table()

        self.run_copy_person_id(False, False)
        self.assertEqual(models.Person.objects.count(), 1)
        section = models.Section.objects.get(id=self.section_id1)
        linked_person = section.person
        self.assertEqual(linked_person.declarator_person_id,
                         self.declarator_person_id)
        self.assertEqual(section.person.id, expected_person_id)
Exemple #6
0
 def test_(self):
     """``run_copy_person_id(True, False)`` creates person 1 for section 1."""
     folder = CopyPersonIdTestCaseBase.permalinks_folder
     TPermaLinksPerson(folder).create_and_save_empty_db()
     self.run_copy_person_id(True, False)
     self.assertEqual(models.Person.objects.count(), 1)
     section = models.Section.objects.get(id=self.section_id1)
     linked_person = section.person
     self.assertEqual(linked_person.declarator_person_id,
                      self.declarator_person_id)
     self.assertEqual(section.person.id, 1)
    def test(self):
        """Rebuild after the old person was deleted; its id must come back.

        Sections 1 and 2 are linked to person 99 while the permalinks db is
        saved, then unlinked and the person is removed. Fake dedupe puts all
        three sections into one cluster, so all of them must end up with
        person id 99.
        """
        self.initialize()

        old_person_id = 99
        old_person = models.Person(id=old_person_id)
        old_person.save()
        first = self.create_section(1,
                                    "Иванов Иван Иванович",
                                    person=old_person)
        second = self.create_section(2, "Иванов И. И.", person=old_person)
        self.create_section(3, "Петров И. И.")

        folder = os.path.dirname(__file__)
        permalinks = TPermaLinksPerson(folder)
        permalinks.create_db()
        permalinks.save_dataset(setup_logging())
        permalinks.recreate_auto_increment_table()
        permalinks.close_db()

        # forget the old links; only the permalinks db remembers them now
        for section in (first, second):
            section.person = None
            section.save()
        old_person.delete()

        dedupe_options = dict(
            permalinks_folder=folder,
            write_to_db=True,
            fake_dedupe=True,
            surname_bounds=',',
            take_sections_with_empty_income=True,
            rebuild=True,
        )
        RunDedupe(None, None).handle(None, **dedupe_options)

        self.assertEqual(models.Person.objects.count(), 1)

        for section_id in (1, 2, 3):
            section = models.Section.objects.get(id=section_id)
            self.assertEqual(section.person_id, old_person_id)
Exemple #8
0
    def test(self):
        """Run the real model on the person 5295 fixture and check every link.

        Loads ``disclosures.sql.person_id_5295.n``, saves the permalinks db,
        runs dedupe with threshold 0.6 and compares, for every section,
        ``(id, person_id, dedupe_score is not None)`` with the expected
        table.
        """
        logger = setup_logging(logger_name="test_real_dedupe")
        test_dir = os.path.dirname(__file__)
        sql_script = os.path.join(test_dir,
                                  "disclosures.sql.person_id_5295.n")
        run_sql_script(logger, sql_script)

        permalinks_folder = test_dir
        db = TPermaLinksPerson(permalinks_folder)
        db.create_db()
        db.save_dataset(setup_logging())
        db.recreate_auto_increment_table()
        db.close_db()

        model_path = os.path.join(
            test_dir, "../../../deduplicate/model/random_forest.pickle")
        run_dedupe = RunDedupe(None, None)
        run_dedupe.handle(None,
                          permalinks_folder=permalinks_folder,
                          write_to_db=True,
                          surname_bounds=',',
                          model_file=model_path,
                          threshold=0.6)

        person_id = 5295
        self.assertEqual(models.Person.objects.count(), 3)
        person = models.Person.objects.get(id=person_id)
        self.assertIsNotNone(person)
        self.assertEqual(5295, person.declarator_person_id)
        # (section id, person id, has dedupe score), sorted by section id
        canon_sections = [
            (451721, 5295, True),
            (452066, 5295, True),
            (452420, 5295, True),
            (453686, 5295, False),
            (455039, 5295, False),
            (1801614, 5296, True),
            (5105303, 5295, True),
            (6437989, 5297, True),
            (6672563, 5297, True),
            (6674154, 5297, True),
            (6773981, 5297, True),
        ]
        # Without an explicit ordering the database may return rows in any
        # order, which would make the list comparison flaky.
        sections = [(s.id, s.person_id, s.dedupe_score is not None)
                    for s in models.Section.objects.order_by("id")]
        self.assertListEqual(canon_sections, sections)
    def test(self):
        """Rebuild keeps a declarator person and attaches both sections to it.

        Person 2 (``declarator_person_id`` 1111) owns section 1; section 2
        is unlinked. After fake dedupe there must still be one person with
        the same ids, referenced by both sections.
        """
        self.initialize()

        kept_person_id = 2
        declarator_id = 1111
        models.Person(id=kept_person_id,
                      declarator_person_id=declarator_id,
                      person_name="Иванов Иван Иванович").save()
        kept_person = models.Person.objects.get(id=kept_person_id)

        self.create_section(1, "Иванов Иван Иванович", kept_person)
        self.create_section(2, "Иванов И. И.")

        folder = os.path.dirname(__file__)
        permalinks = TPermaLinksPerson(folder)
        permalinks.create_db()
        permalinks.save_dataset(setup_logging())
        permalinks.recreate_auto_increment_table()
        permalinks.close_db()

        dedupe_options = dict(
            permalinks_folder=folder,
            write_to_db=True,
            fake_dedupe=True,
            surname_bounds=',',
            take_sections_with_empty_income=True,
            rebuild=True,
        )
        RunDedupe(None, None).handle(None, **dedupe_options)

        self.assertEqual(models.Person.objects.count(), 1)
        reloaded = models.Person.objects.get(id=kept_person_id)
        self.assertIsNotNone(reloaded)
        self.assertEqual(declarator_id, reloaded.declarator_person_id)

        for section_id in (1, 2):
            section = models.Section.objects.get(id=section_id)
            self.assertEqual(section.person_id, reloaded.id)
Exemple #10
0
    def test(self):
        """A second copy run must reuse old person ids, not allocate new ones."""
        folder = CopyPersonIdTestCaseBase.permalinks_folder
        TPermaLinksPerson(folder).create_and_save_empty_db()
        self.run_copy_person_id(False, False)

        # store the ids produced by the first run as permalinks
        storage_command = CreatePermalinksStorageCommand(None, None)
        storage_command.handle(None, directory=folder)
        permalinks_db = TPermaLinksPerson(folder)
        permalinks_db.open_db_read_only()
        permalinks_db.recreate_auto_increment_table()

        self.run_copy_person_id(False, False)
        last_inserted = permalinks_db.get_last_inserted_id_for_testing()
        self.assertEqual(last_inserted, None)
Exemple #11
0
    def test(self):
        """Split one old person into two clusters with ``separate_sections``.

        Both sections belonged to person 99. After the rebuild section 1
        must keep id 99 and section 2 must get the next id.
        """
        self.initialize()
        folder = os.path.dirname(__file__)

        old_person_id = 99
        old_person = models.Person(id=old_person_id)
        old_person.save()
        first = self.create_section(1,
                                    "Иванов Иван Иванович",
                                    person=old_person)
        second = self.create_section(2,
                                     "Иванов Иван Иванович",
                                     person=old_person)

        CreatePermalinksStorageCommand(None, None).handle(None,
                                                          directory=folder)
        opened_db = TPermaLinksPerson(folder).open_db_read_only()
        opened_db.recreate_auto_increment_table()

        for section in (first, second):
            section.person = None
            section.save()

        old_person.delete()

        dedupe_options = dict(
            permalinks_folder=folder,
            write_to_db=True,
            fake_dedupe=True,
            separate_sections=True,
            surname_bounds=',',
            take_sections_with_empty_income=True,
            rebuild=True,
        )
        RunDedupe(None, None).handle(None, **dedupe_options)

        self.assertEqual(2, models.Person.objects.count())

        # with no other grounds, the old person id goes to the cluster
        # that holds the minimal section id
        kept = models.Section.objects.get(id=1)
        self.assertEqual(kept.person_id, old_person_id)

        fresh = models.Section.objects.get(id=2)
        self.assertEqual(fresh.person_id, old_person_id + 1)  # a new id
Exemple #12
0
    def test(self):
        """A changed declarator person id must not change the local person id."""
        folder = CopyPersonIdTestCaseBase.permalinks_folder
        TPermaLinksPerson(folder).create_and_save_empty_db()
        self.run_copy_person_id(False, False)

        # store the ids produced by the first run as permalinks
        storage_command = CreatePermalinksStorageCommand(None, None)
        storage_command.handle(None, directory=folder)
        permalinks_db = TPermaLinksPerson(folder)
        permalinks_db.open_db_read_only()
        permalinks_db.recreate_auto_increment_table()

        changed_declarator_id = self.declarator_person_id + 1
        self.run_copy_person_id(False,
                                False,
                                declarator_person_id=changed_declarator_id)
        self.assertEqual(models.Person.objects.count(), 1)
        section = models.Section.objects.get(id=self.section_id1)
        linked_person = section.person
        self.assertEqual(linked_person.declarator_person_id,
                         changed_declarator_id)
        self.assertEqual(section.person.id, 1)
Exemple #13
0
class Command(BaseCommand):
    help = ''

    def add_arguments(self, parser):
        """Register the command-line options of the dedupe command.

        :param parser: argparse-style parser; every entry of the table below
            becomes one ``parser.add_argument(flag, **spec)`` call, in order
        """
        option_table = (
            ('--surname-bounds', dict(
                dest='surname_bounds',
                default=None,
                help='[l,b], take records where person_name >=l and person_name < b',
            )),
            ('--print-family-prefixes', dict(
                dest='print_family_prefixes',
                default=False,
                action="store_true",
                help='print family prefixes and exit',
            )),
            ('--ml-model-file', dict(dest='model_file')),
            ('--threshold', dict(dest='threshold', type=float)),
            ('--result-pairs-file', dict(dest='result_pairs_file', help='')),
            ('--rebuild', dict(
                dest='rebuild',
                action="store_true",
                default=False,
                help='rebuild old persom, declaration pairs',
            )),
            ('--dump-dedupe-objects-file', dict(
                dest='dump_dedupe_objects_file',
                help='',
            )),
            ('--input-dedupe-objects', dict(
                dest='input_dedupe_objects',
                help='read objects that were written by option --dump-dedupe-objects-file',
            )),
            # NOTE(review): this option and the two "test purpose" ones
            # below take a value (no store_true action) - confirm intended
            ('--recreate-sections-and-persons', dict(
                dest='recreate_db',
                help='create fake sections and persons that are written in --input-dedupe-objects',
            )),
            ('--write-to-db', dict(
                dest='write_to_db',
                action='store_true',
                default=False,
                help='write back to DB',
            )),
            ('--permalinks-folder', dict(
                dest='permalinks_folder',
                required=True,
            )),
            ('--fake-dedupe', dict(
                dest='fake_dedupe',
                required=False,
                help='create one person for all sections without dedupe (test purpose)',
            )),
            ('--separate-sections', dict(
                dest='separate_sections',
                required=False,
                help='put all sections in a separate cluster (test purpose)',
            )),
            ('--logfile', dict(
                dest='logfile',
                required=False,
                help='set logfile name',
            )),
        )
        for flag, spec in option_table:
            parser.add_argument(flag, **spec)

    def __init__(self, *args, **kwargs):
        """Create the command with empty state.

        Positional and keyword arguments are forwarded to the base class
        unchanged.
        """
        super().__init__(*args, **kwargs)
        # assigned by init_options()
        self.options = None
        self.logger = None
        self.permalinks_db = None
        self.rebuild = False
        self.threshold = 0
        # assigned by load_dedupe_model()
        self.ml_model = None
        # dedupe objects grouped by the key build_fio_with_initials() returns
        self.cluster_by_minimal_fio = defaultdict(list)
        # section id -> section model instance
        self.section_cache = {}

    def init_options(self, options):
        """Store the command options and prepare logger and permalinks db.

        Creates the logger (passing ``options.get('logfile')`` as the log
        file name), remembers the ``rebuild`` flag, creates a
        ``TPermaLinksPerson`` for ``options['permalinks_folder']`` and calls
        ``open_db_read_only()`` on it, then picks up the clustering
        threshold. Without a usable threshold ``self.threshold`` keeps the
        value assigned in ``__init__``.

        :param options: mapping of option names to values
        :raises KeyError: if ``permalinks_folder`` is missing
        """
        self.logger = setup_logging(log_file_name=options.get('logfile'))
        self.options = options
        self.rebuild = options.get('rebuild', False)
        # .get(): the command may be called programmatically without the key
        if self.rebuild and not options.get('write_to_db', False):
            self.logger.info(
                "please add --write-to-db  option if you use --rebuild")
        self.permalinks_db = TPermaLinksPerson(options['permalinks_folder'])
        self.permalinks_db.open_db_read_only()
        # A key that is present but None (a parsed command line without
        # --threshold) must count as "not set"; comparing it with 0 would
        # store None as the threshold and suppress the warning.
        threshold = options.get('threshold')
        if threshold:
            self.threshold = threshold
        else:
            self.logger.info(
                'Warning! Threshold is not set. Is it just a test?')

    def filter_table(self, model_type, lower_bound, upper_bound):
        """Return the records of ``model_type`` inside a person-name range.

        :param model_type: model class exposing ``objects`` and ``_meta``
        :param lower_bound: inclusive lower bound, ``''`` for no bound
        :param upper_bound: exclusive upper bound, ``''`` for no bound
        :return: the (possibly filtered) manager/queryset; its size is
            logged
        """
        # By default, string comparisons in sql are case insensitive
        # because strings are non-binary.
        queryset = model_type.objects
        name_lookups = (
            ("person_name__gte", lower_bound),
            ("person_name__lt", upper_bound),
        )
        for lookup, bound in name_lookups:
            if bound != '':
                queryset = queryset.filter(**{lookup: bound})
        self.logger.info("Start reading {} records from {}... ".format(
            queryset.count(), model_type._meta.db_table))
        return queryset

    def read_sections(self, lower_bound, upper_bound):
        """Load sections of one name range into the clustering buckets.

        Every accepted section is stored in ``self.section_cache`` and its
        dedupe object is appended to ``self.cluster_by_minimal_fio`` under
        the key returned by ``fio.build_fio_with_initials()``. Sections
        with an unresolved fio are skipped, and so are sections with zero
        average income unless ``take_sections_with_empty_income`` is set.

        :param lower_bound: inclusive lower person-name bound ('' = none)
        :param upper_bound: exclusive upper person-name bound ('' = none)
        """
        # Sections that already have a person_id (set by
        # copy_person_id.py) are read too: they are needed to build valid
        # clusters.
        section_query = self.filter_table(models.Section, lower_bound,
                                          upper_bound)
        accept_empty_income = self.options.get(
            'take_sections_with_empty_income', False)
        trace = self.options.get('verbosity', 0) == 3
        accepted = 0
        for section in section_query.all():
            dedupe_object = TDeduplicationObject().initialize_from_section(
                section)
            if not dedupe_object.fio.is_resolved:
                self.logger.debug(
                    "ignore section id={} person_name={}, cannot find family name"
                    .format(section.id, section.person_name))
                continue
            if not accept_empty_income and dedupe_object.average_income == 0:
                self.logger.debug(
                    "ignore section id={} person_name={}, no income or zero-income"
                    .format(section.id, section.person_name))
                continue
            self.section_cache[section.id] = section
            if trace:
                self.logger.debug("read section id = {}".format(section.id))
            bucket_key = dedupe_object.fio.build_fio_with_initials()
            self.cluster_by_minimal_fio[bucket_key].append(dedupe_object)
            accepted += 1
            if accepted % 10000 == 0:
                # progress report
                self.logger.info(
                    "Read {} records from section table".format(accepted))
        self.logger.info(
            "Read {0} records from section table".format(accepted))

    def read_people(self, lower_bound, upper_bound):
        """Load persons of one name range into the clustering buckets.

        A person is added to ``self.cluster_by_minimal_fio`` only when its
        dedupe object has at least one year; every person read is counted,
        whether or not it was added.

        :param lower_bound: inclusive lower person-name bound ('' = none)
        :param upper_bound: exclusive upper person-name bound ('' = none)
        """
        person_query = self.filter_table(models.Person, lower_bound,
                                         upper_bound)
        trace = self.options.get('verbosity', 0) == 3
        seen = 0
        for person in person_query.all():
            dedupe_object = TDeduplicationObject().initialize_from_person(
                person)
            if len(dedupe_object.years) > 0:
                bucket_key = dedupe_object.fio.build_fio_with_initials()
                self.cluster_by_minimal_fio[bucket_key].append(dedupe_object)
            else:
                self.logger.debug(
                    "skip person id={}, because this record has no related sections with"
                    " defined income years".format(person.id))
            if trace:
                self.logger.debug("read person id = {}".format(person.id))
            seen += 1
            if seen % 1000 == 0:
                # progress report
                self.logger.info(
                    "Read {} records from person table".format(seen))
        self.logger.info("Read {} records from person table".format(seen))

    def get_all_leaf_objects(self):
        """Yield every object stored in ``self.cluster_by_minimal_fio``.

        Buckets are visited in dictionary order, objects in list order.
        """
        for bucket in self.cluster_by_minimal_fio.values():
            yield from bucket

    def filter_sql_by_person_name(self, sql, lower_bound, upper_bound):
        """Append person-name range conditions to an SQL statement.

        :param sql: statement text that already ends inside a WHERE clause
        :param lower_bound: inclusive lower bound, ``''`` for no condition
        :param upper_bound: exclusive upper bound, ``''`` for no condition
        :return: ``sql`` followed by the `` and person_name ...`` conditions
        """
        # By default, string comparisons in sql are case insensitive
        # because strings are non-binary.
        # NOTE(review): the bounds are inserted into the text verbatim,
        # without escaping; callers must pass trusted values only.
        pieces = [sql]
        if lower_bound != '':
            pieces.append(" and person_name >= '{}' ".format(lower_bound))
        if upper_bound != '':
            pieces.append(" and person_name <  '{}' ".format(upper_bound))
        return "".join(pieces)

    def delete_person_ids_from_previous_deduplication(self, lower_bound,
                                                      upper_bound):
        """Undo the links made by an earlier dedupe run for one name range.

        Two statements are executed: the first resets ``person_id`` and
        ``dedupe_score`` of sections whose ``dedupe_score`` is not null,
        the second deletes persons whose ``declarator_person_id`` is null.
        Both are restricted to the given person-name range.

        :param lower_bound: inclusive lower person-name bound ('' = none)
        :param upper_bound: exclusive upper person-name bound ('' = none);
            an open upper bound with a set lower bound is legal, it is the
            last basket produced by ``get_person_name_baskets``
        """
        # By default, string comparisons in sql are case insensitive
        # because strings are non-binary.
        # The bounds are passed as query parameters, so a quote character
        # inside a name cannot break or alter the statement.
        name_filter = ""
        params = []
        if lower_bound != '':
            name_filter += " and person_name >= %s "
            params.append(lower_bound)
        if upper_bound != '':
            name_filter += " and person_name <  %s "
            params.append(upper_bound)

        reset_sections_sql = ("update declarations_section set "
                              "person_id=null, dedupe_score=null "
                              "where dedupe_score is not null ") + name_filter
        delete_persons_sql = ("delete from declarations_person "
                              "where declarator_person_id is null"
                              ) + name_filter

        for sql in (reset_sections_sql, delete_persons_sql):
            self.logger.debug("{} params={}".format(sql, params))
            with connection.cursor() as cursor:
                cursor.execute(sql, params)

    def read_dumped_objects(self, file_name):
        """Load dedupe objects from a file with one JSON object per line.

        Every object is appended to ``self.cluster_by_minimal_fio``. When
        the ``recreate_db`` option is set, a ``Section`` (cached in
        ``self.section_cache``) or a ``Person`` row is also saved for each
        object; the section table must be empty in that case.

        :param file_name: path of the dump file
        """
        recreate_db = self.options.get('recreate_db')
        if recreate_db:
            assert models.Section.objects.count() == 0
        # The dump is written as UTF-8 (see dump_dedupe_objects), so it
        # must be read as UTF-8 and not in the locale's default encoding.
        with open(file_name, encoding="utf-8") as inp:
            for line in inp:
                if not line.strip():
                    # tolerate blank lines, e.g. a trailing newline
                    continue
                o = TDeduplicationObject().from_json(json.loads(line))
                if recreate_db:
                    if o.record_id.source_table == TDeduplicationObject.SECTION:
                        # a section row is created with its single office
                        assert len(o.offices) == 1
                        s = models.Section(id=o.record_id.id,
                                           office_id=list(o.offices)[0])
                        self.section_cache[o.record_id.id] = s
                        s.save()
                    else:
                        models.Person(id=o.record_id.id).save()
                self.cluster_by_minimal_fio[
                    o.fio.build_fio_with_initials()].append(o)

    def dump_dedupe_objects(self, dump_file_name):
        """Write all leaf objects to a UTF-8 file, one JSON object per line.

        :param dump_file_name: path of the file to create or overwrite
        """
        with open(dump_file_name, "w", encoding="utf-8") as out:
            for leaf in self.get_all_leaf_objects():
                serialized = json.dumps(leaf.to_json(), ensure_ascii=False)
                out.write(serialized + "\n")

    def fill_dedupe_data(self, lower_bound, upper_bound):
        """Collect the dedupe objects of one name range.

        Resets ``self.cluster_by_minimal_fio``, removes old dedupe results
        first when rebuilding, then reads objects either from the
        ``input_dedupe_objects`` file or from the section and person
        tables, and finally dumps them if ``dump_dedupe_objects_file`` is
        set.

        :param lower_bound: inclusive lower person-name bound ('' = none)
        :param upper_bound: exclusive upper person-name bound ('' = none)
        """
        self.cluster_by_minimal_fio = defaultdict(list)
        if self.rebuild:
            self.delete_person_ids_from_previous_deduplication(
                lower_bound, upper_bound)

        dumped_objects_file = self.options.get('input_dedupe_objects')
        if dumped_objects_file is None:
            self.read_sections(lower_bound, upper_bound)
            self.read_people(lower_bound, upper_bound)
        else:
            self.read_dumped_objects(dumped_objects_file)

        dump_target = self.options.get("dump_dedupe_objects_file")
        if dump_target:
            self.dump_dedupe_objects(dump_target)

    def write_results_to_file(self, clusters, dump_stream):
        """Write clusters to a text stream in a human-readable form.

        Each cluster produces a ``cluster <id>`` line followed by one
        tab-indented line per member with record id, ``1.0 - distance``,
        person name and the smallest of the member's years.

        :param clusters: mapping cluster id -> list of (object, distance)
        :param dump_stream: writable text stream
        """
        self.logger.info('{} clusters generated'.format(len(clusters)))
        for cluster_id, members in clusters.items():
            dump_stream.write("cluster {}\n".format(cluster_id))
            for member, distance in members:
                score = 1.0 - distance
                first_year = min(member.years)
                dump_stream.write("\t{} {} {} {}\n".format(
                    member.record_id, score, member.person_name,
                    first_year))

    def link_section_to_person(self, section, person, distance):
        """Attach ``section`` to ``person`` and store the dedupe score.

        A section that already has a ``person_id`` but no ``dedupe_score``
        is left untouched. Otherwise the section is saved with the
        person's id and ``1.0 - distance`` as score; the person takes over
        the section's name when that name is longer than its own.

        :param section: section record (needs ``save()``)
        :param person: person record (needs ``save()``)
        :param distance: clustering distance of the section
        """
        linked_by_declarator = (section.person_id is not None
                                and section.dedupe_score is None)
        if linked_by_declarator:
            # these person_id's came from declarator, do not touch them
            self.logger.debug(
                "skip setting person_id={} to section (id={}, person_id={}), because it is from declarator"
                .format(person.id, section.id, section.person_id))
            return

        self.logger.debug("link section {} to person {}".format(
            section.id, person.id))
        section.person_id = person.id
        section.dedupe_score = 1.0 - distance
        section.save()

        section_name = section.person_name
        if len(person.person_name) < len(section_name):
            person.person_name = section_name
            person.save()

    def link_sections_to_a_new_person(self, sections, section_distances,
                                      person_id):
        """Link a cluster of sections to one person, creating it if needed.

        :param sections: section records of the cluster
        :param section_distances: distances, parallel to ``sections``
        :param person_id: person id to reuse, or ``None`` to request a new
            id from the permalinks db
        """
        assert len(section_distances) == len(sections)
        if person_id is None:
            person_id = self.permalinks_db._get_new_id()
            self.logger.debug("create new person.id: {}".format(person_id))
        else:
            self.logger.debug("use old person.id: {}".format(person_id))

        try:
            person = models.Person.objects.get(id=person_id)
        except models.Person.DoesNotExist:
            # create new person record
            person = models.Person(id=person_id)
            person.save()
        else:
            # a record with this id already exists and is reused
            if person.declarator_person_id is None:
                self.logger.error(
                    "Warning! Reuse existing person_id = {} for different sections (cluster), it could happen "
                    "if this person_id was used for different person created by copy_person_id.py "
                    "but should not happen if declarator_person_id is None. ".
                    format(person_id))

        for section, distance in zip(sections, section_distances):
            self.link_section_to_person(section, person, distance)

    def _get_old_clustering(self, clusters):
        """Map old person ids to the new clusters that hold their sections.

        Clusters that contain a non-section record are ignored. For the
        others, every section with a person id known to the permalinks db
        contributes one ``(cluster_id, section_id)`` pair.

        :param clusters: mapping cluster id -> list of (object, distance)
        :return: defaultdict old person id -> list of
            ``(cluster_id, section_id)``
        """
        old_person_to_new_sections = defaultdict(list)
        for cluster_id, members in clusters.items():
            known_links = []
            for member, _distance in members:
                record = member.record_id
                if record.source_table != TDeduplicationObject.SECTION:
                    # a person is already in this cluster, use it
                    break
                old_person_id = self.permalinks_db.get_person_id_by_section_id(
                    record.id)
                if old_person_id is not None:
                    known_links.append((record.id, old_person_id))
            else:
                # reached only when no person record was found above
                for section_id, old_person_id in known_links:
                    old_person_to_new_sections[old_person_id].append(
                        (cluster_id, section_id))
        return old_person_to_new_sections

    def build_cluster_to_old_person_id(self, clusters):
        """Decide which new cluster inherits which old person id.

        Candidate (person, cluster) pairs are ranked by the number of the
        person's old sections found in the cluster (more is better); ties
        go to the cluster holding the minimal section id. Pairs are then
        accepted greedily, each person id and each cluster at most once.

        :param clusters: mapping cluster id -> list of (object, distance)
        :return: dict new cluster id -> old person id
        """
        old_person_to_sections = self._get_old_clustering(clusters)

        candidates = []
        for person_id, pairs in old_person_to_sections.items():
            # pairs are (cluster_id, section_id); group them explicitly so
            # the result does not depend on the order they were collected
            section_ids_by_cluster = defaultdict(list)
            for cluster_id, section_id in pairs:
                section_ids_by_cluster[cluster_id].append(section_id)
            for cluster_id, section_ids in section_ids_by_cluster.items():
                # ascending sort: biggest intersection first, then the
                # cluster with the minimal section_id
                candidates.append((-len(section_ids), min(section_ids),
                                   person_id, cluster_id))
        candidates.sort()

        used_person_ids = set()
        used_cluster_ids = set()
        new_to_old_clusters = dict()
        for _, _, person_id, cluster_id in candidates:
            already = (person_id in used_person_ids
                       or cluster_id in used_cluster_ids)
            # NOTE(review): a rejected pair still marks both its person
            # and its cluster as used - confirm this is intended
            used_person_ids.add(person_id)
            used_cluster_ids.add(cluster_id)
            if not already:
                new_to_old_clusters[cluster_id] = person_id
        return new_to_old_clusters

    def write_results_to_db(self, clusters):
        """Store the clustering result in the database.

        A cluster without person records gets a person of its own (an old
        id is reused when ``build_cluster_to_old_person_id`` assigns one),
        a cluster with exactly one person record links its sections to
        that person, and a cluster with several person records is only
        logged.

        :param clusters: mapping cluster id -> list of (object, distance);
            section objects must be present in ``self.section_cache``
        """
        clusters_to_old_person_ids = self.build_cluster_to_old_person_id(
            clusters)

        for cluster_id, items in clusters.items():
            person_ids = []
            sections = []
            section_distances = []
            for obj, distance in items:
                if obj.record_id.source_table == TDeduplicationObject.PERSON:
                    person_ids.append(obj.record_id.id)
                else:
                    sections.append(self.section_cache[obj.record_id.id])
                    section_distances.append(distance)

            if not person_ids:
                self.link_sections_to_a_new_person(
                    sections, section_distances,
                    clusters_to_old_person_ids.get(cluster_id))
            elif len(person_ids) == 1:
                person = models.Person.objects.get(id=person_ids[0])
                for section, distance in zip(sections, section_distances):
                    self.link_section_to_person(section, person, distance)
            else:
                # ambiguous cluster: nothing is written, the ids are
                # logged so that the case can be inspected
                left_sections = ",".join(
                    str(section.id) for section in sections)
                persons = ",".join(
                    str(person_id) for person_id in person_ids)
                self.logger.debug(
                    "a cluster with two people found, I do not know what to do"
                )
                self.logger.debug(
                    "  cluster sections: {}".format(left_sections))
                self.logger.debug("  cluster persons: {}".format(persons))

    def get_person_name_baskets(self):
        """Yield person-name ranges ("baskets") to be processed one by one.

        With the ``surname_bounds`` option set, a single item is yielded:
        the list obtained by splitting the option value on commas.
        Otherwise sections are counted per upper-cased three-character
        name prefix and ``(lower, upper)`` pairs of neighbouring borders
        are yielded; the first lower border and the last upper border are
        empty strings.
        """
        if self.options.get('surname_bounds') is not None:
            yield self.options.get('surname_bounds').split(',')
        else:
            # By default, string comparisons in sql are case insensitive because strings are non-binary.
            sql = """
                    select substring(upper(person_name), 1, 3) as a, count(id) 
                    from declarations_section
                    group by a
                    order by a 
                   """
            borders = list([''])
            with connection.cursor() as cursor:
                cursor.execute(sql)
                cnt = 0  # total number of sections seen
                accum = 0  # sections collected since the last border
                last_trigram = None  # last prefix usable as a border
                for trigram, trigram_count in cursor:
                    cnt += trigram_count
                    # close the basket before it would exceed 50000
                    # sections, but only if a usable border was seen
                    if accum + trigram_count > 50000 and last_trigram is not None:
                        borders.append(last_trigram)
                        accum = 0
                    accum += trigram_count
                    # prefixes containing whitespace or the punctuation
                    # listed in the pattern are never used as borders
                    if re.search(r'[\s/.,"\\:-]', trigram) is None:
                        last_trigram = trigram
                # every section must have been counted in some prefix
                assert cnt == models.Section.objects.count()
            borders.append('')
            for x in range(1, len(borders)):
                yield borders[x - 1], borders[x]

    def load_dedupe_model(self):
        """Load the ML model named by ``model_file`` into ``self.ml_model``.

        Nothing is loaded when the ``fake_dedupe`` option is set.
        """
        if self.options.get("fake_dedupe", False):
            return
        model_file = self.options["model_file"]
        self.logger.info('read ml model from {}'.format(model_file))
        self.ml_model = TMLModel(model_file)

    def cluster_sections(self):
        """Cluster each fio bucket separately and yield its clusters.

        For every key of ``self.cluster_by_minimal_fio`` a
        ``TFioClustering`` is built from the bucket, the model and the
        threshold, ``cluster()`` is called on it and its ``clusters``
        attribute is yielded.
        """
        for fio, bucket in self.cluster_by_minimal_fio.items():
            self.logger.debug("cluster inside for {}: {} sections".format(
                fio, len(bucket)))
            fio_clustering = TFioClustering(self.logger, bucket,
                                            self.ml_model, self.threshold)
            fio_clustering.cluster()
            yield fio_clustering.clusters

    def cluster_sections_by_minimal_fio(self):
        """Yield cluster mappings, real or fake depending on the options.

        With ``fake_dedupe`` exactly one mapping is yielded: either every
        leaf object in its own cluster with distance 0.1
        (``separate_sections``) or all leaf objects in cluster 0 with
        distance 0.5. Without it, the mappings produced by
        ``cluster_sections()`` are passed through.
        """
        if not self.options.get("fake_dedupe", False):
            object_count = sum(
                len(bucket)
                for bucket in self.cluster_by_minimal_fio.values())
            self.logger.info(
                'Clustering {} objects with threshold={}, len(self.cluster_by_minimal_fio) = {}'
                .format(object_count, self.threshold,
                        len(self.cluster_by_minimal_fio)))
            yield from self.cluster_sections()
            return

        separate = self.options.get("separate_sections", False)
        fake_clusters = defaultdict(list)
        if separate:
            # each record to a separate cluster
            for index, leaf in enumerate(self.get_all_leaf_objects()):
                fake_clusters[index] = [(leaf, 0.1)]
        else:
            # all records in one cluster
            fake_clusters[0] = [(leaf, 0.5)
                                for leaf in self.get_all_leaf_objects()]
        yield fake_clusters

    def handle(self, *args, **options):
        """Run the deduplication command.

        With ``print_family_prefixes`` only the name baskets are printed
        to stdout as ``lower,upper`` lines. Otherwise the model is loaded
        and, for each basket, the data is read and clustered; clusters are
        written to the ``result_pairs_file`` (if given) and to the
        database (if ``write_to_db`` is set).

        :param args: unused positional arguments
        :param options: command options, see ``add_arguments``
        """
        self.init_options(options)
        if options.get('print_family_prefixes'):
            for lower_bound, upper_bound in self.get_person_name_baskets():
                sys.stdout.write("{},{}\n".format(lower_bound, upper_bound))
            return
        self.logger.info('surname bounds are {}'.format(
            options.get('surname_bounds', "")))
        self.load_dedupe_model()
        # .get(): the command may be called programmatically without the key
        write_to_db = options.get('write_to_db', False)

        dump_stream = None
        dump_file_name = self.options.get("result_pairs_file")
        if dump_file_name:
            dump_stream = open(dump_file_name, "w", encoding="utf8")
            self.logger.debug(
                'write result pairs to {}\n'.format(dump_file_name))

        try:
            for lower_bound, upper_bound in self.get_person_name_baskets():
                self.logger.info("lower_bound={}, upper_bound={}".format(
                    lower_bound, upper_bound))
                self.fill_dedupe_data(lower_bound, upper_bound)
                for clusters_for_one_fio in self.cluster_sections_by_minimal_fio(
                ):
                    if dump_stream is not None:
                        self.write_results_to_file(clusters_for_one_fio,
                                                   dump_stream)
                    if write_to_db:
                        self.write_results_to_db(clusters_for_one_fio)
        finally:
            # close the result file even if clustering or a db write fails
            if dump_stream is not None:
                dump_stream.close()
        self.logger.debug("all done")
Exemple #14
0
class Command(BaseCommand):
    """Management command that copies person ids from declarator to disclosures.

    Sections are matched against "section passports": keys produced by
    ``build_section_passport`` from a declarator document id, a person name
    (or just the surname) and the income of the main declarant. The passport
    mapping gives a declarator person id, or the marker "AMBIGUOUS_KEY".
    """
    help = 'copy person id from declarator to disclosures'

    def __init__(self, *args, **kwargs):
        super(Command, self).__init__(*args, **kwargs)
        self.options = None
        self.permalinks_db = None
        self.logger = None
        # Persons created during this run, indexed by declarator person id ...
        self.declarator_person_id_to_disclosures_person = dict()
        # ... and by the disclosures person id (models.Person.id).
        self.disclosures_person_id_to_disclosures_person = dict()

    def add_arguments(self, parser):
        """Register the command line options of the command."""
        parser.add_argument('--read-person-from-json',
                            dest='read_person_from_json',
                            default=None,
                            help='read person info  from json for testing')
        parser.add_argument('--permalinks-folder',
                            dest='permalinks_folder',
                            required=True)
        parser.add_argument('--declarator-host',
                            dest='declarator_host',
                            required=False)
        parser.add_argument('--person-name-prefix',
                            dest='person_name_prefix',
                            required=False)

    def open_permalinks_db(self):
        """Create the permalinks db object and open it with open_db_read_only()."""
        self.permalinks_db = TPermaLinksPerson(
            self.options['permalinks_folder'])
        self.permalinks_db.open_db_read_only()

    def build_passport_to_person_id_mapping_from_declarator(self):
        """Return the mapping "section passport -> declarator person id".

        The mapping is loaded from the json file given by the
        ``read_person_from_json`` option when it is set, otherwise it is
        requested from the host given by the ``declarator_host`` option.
        """
        json_path = self.options.get('read_person_from_json')
        if json_path is not None:
            with open(json_path, "r", encoding="utf8") as inpf:
                return json.load(inpf)
        return get_all_section_from_declarator_with_person_id(
            self.options['declarator_host'])

    # we think that person ids in declarator db are stable
    def copy_human_merge(self, section, declarator_person_id):
        """Attach ``section`` to the person that corresponds to ``declarator_person_id``.

        A person already created for this declarator id during the run is
        reused (its name is replaced when the section has a longer one).
        Otherwise the person id is requested from the permalinks db; if a
        person with that id was already created for another declarator id,
        the conflict is logged and the first person is kept, else a new
        ``models.Person`` is saved. Finally the section is linked to the
        person, its ``dedupe_score`` is reset to None and it is saved.
        """
        person = self.declarator_person_id_to_disclosures_person.get(
            declarator_person_id)
        if person is None:
            person_id = self.permalinks_db.get_person_id_by_declarator_id(
                declarator_person_id, section.id)
            if person_id in self.disclosures_person_id_to_disclosures_person:
                person = self.disclosures_person_id_to_disclosures_person.get(
                    person_id)
                if declarator_person_id != person.declarator_person_id:
                    self.logger.error(
                        "Person id={} has conflict declarator_person_id ({} != {}), use the first person id {}"
                        .format(person_id, declarator_person_id,
                                person.declarator_person_id,
                                person.declarator_person_id))

            else:
                person = models.Person(
                    id=person_id,
                    declarator_person_id=declarator_person_id,
                    person_name=section.person_name)
                person.save()
                self.declarator_person_id_to_disclosures_person[
                    declarator_person_id] = person
                # This index is looked up by the disclosures person id above,
                # so it must be filled with the same key; otherwise the
                # conflict check never fires and a second Person with the
                # same primary key would be saved over the first one.
                self.disclosures_person_id_to_disclosures_person[
                    person_id] = person
        elif person.person_name is None or len(person.person_name) < len(
                section.person_name):
            person.person_name = section.person_name
            person.save()

        assert person.declarator_person_id is not None
        self.logger.debug(
            "connect section {} to person {}, declarator_person_id={}".format(
                section.id, person.id, person.declarator_person_id))

        section.person = person
        section.dedupe_score = None
        section.save()

    def _find_person_ids(self, section_passports, declarator_document_id,
                         person_name, main_income):
        """Look up the full-name passport and the surname passport of a section.

        Returns ``(found_results, fio_is_resolved)``: the values found in
        ``section_passports`` (full-name match first) and whether
        ``TRussianFio`` could resolve ``person_name``; the surname passport
        is tried only when it could.
        """
        found_results = list()
        full_name_key = build_section_passport(declarator_document_id,
                                               person_name, main_income)
        found = section_passports.get(full_name_key)
        if found is not None:
            found_results.append(found)
        fio = TRussianFio(person_name)
        fio_is_resolved = fio.is_resolved
        if fio_is_resolved:
            surname_key = build_section_passport(declarator_document_id,
                                                 fio.family_name, main_income)
            found = section_passports.get(surname_key)
            if found is not None:
                found_results.append(found)
        return found_results, fio_is_resolved

    @staticmethod
    def _first_unambiguous(found_results):
        """Return the first item that is not "AMBIGUOUS_KEY", or None."""
        for person_id in found_results:
            if person_id != "AMBIGUOUS_KEY":
                return person_id
        return None

    def process_section(self, section, section_passports):
        """Try to link one section object to a declarator person.

        Returns True when the section was connected to a person and False
        when no passport matched or all matches were ambiguous.
        """
        main_income = 0
        for i in section.income_set.all():
            if i.relative == models.Relative.main_declarant_code:
                main_income = i.size
        found_results = list()
        for declaration_info in section.source_document.declarator_file_reference_set.all(
        ):
            found, fio_is_resolved = self._find_person_ids(
                section_passports, declaration_info.declarator_document_id,
                section.person_name, main_income)
            found_results.extend(found)
            if not fio_is_resolved:
                self.logger.error(
                    "section {} fio={} cannot find surname".format(
                        section.id, section.person_name))

        if not found_results:
            self.logger.debug(
                "section {} fio={} cannot be found in declarator".format(
                    section.id, section.person_name))
            return False

        person_id = self._first_unambiguous(found_results)
        if person_id is None:
            self.logger.debug("section {} fio={} is ambiguous".format(
                section.id, section.person_name))
            return False
        self.copy_human_merge(section, person_id)
        return True

    def copy_declarator_person_ids(self, section_passports):
        """Link every section that has an unambiguous passport match.

        Iterates over the rows (section id, declarator document id, person
        name, main declarant income) returned by a raw SQL query and calls
        ``copy_human_merge`` for each row with a usable match. The number of
        merges is logged at the end.
        """
        query = """
            select s.id, r.declarator_document_id, s.person_name, i.size
            from declarations_section s
            join declarations_income i on i.section_id = s.id
            join declarations_source_document d on s.source_document_id = d.id
            join declarations_declarator_file_reference r on r.source_document_id = d.id
            where i.relative = '{}'
        """.format(models.Relative.main_declarant_code)
        merge_count = 0
        with connection.cursor() as cursor:
            cursor.execute(query)
            for section_id, declarator_document_id, person_name, main_income in cursor:
                found_results, _ = self._find_person_ids(
                    section_passports, declarator_document_id, person_name,
                    main_income)
                if not found_results:
                    continue
                person_id = self._first_unambiguous(found_results)
                if person_id is None:
                    self.logger.debug(
                        "section {} fio={} is ambiguous".format(
                            section_id, person_name))
                    continue
                self.copy_human_merge(
                    models.Section.objects.get(id=section_id), person_id)
                merge_count += 1
        self.logger.info(
            "set human person id to {} records".format(merge_count))

    def handle(self, *args, **options):
        """Entry point: copy person ids into an empty Person table.

        Asserts that no ``models.Person`` rows exist yet, opens the
        permalinks db, builds the passport mapping and links the sections.
        The permalinks db is closed even if the copy fails.
        """
        self.logger = setup_logging(logger_name="copy_person")
        self.options = options
        person_count = models.Person.objects.count()
        self.logger.debug(
            "models.Person.objects.count()={}".format(person_count))
        assert person_count == 0
        self.open_permalinks_db()
        try:
            section_passports = self.build_passport_to_person_id_mapping_from_declarator(
            )
            self.logger.info("merge by {} passports from declarator".format(
                len(section_passports)))
            self.copy_declarator_person_ids(section_passports)
        finally:
            self.permalinks_db.close_db()
        self.logger.info("all done")
Exemple #15
0
 def open_permalinks_db(self):
     """Build the permalinks db from the ``permalinks_folder`` option and
     open it with ``open_db_read_only()``."""
     permalinks_folder = self.options['permalinks_folder']
     self.permalinks_db = TPermaLinksPerson(permalinks_folder)
     self.permalinks_db.open_db_read_only()