def test_elastic(self):
        """Check that a saved section can be found through the Elasticsearch index.

        The index is emptied, then one office, one source document and one
        section named "Иванов Иван" are saved.  After a short pause a ``match``
        query for "Иванов" must return exactly one hit.  How the section reaches
        the index is not visible here (presumably a save hook — TODO confirm).
        """
        ElasticSectionDocument.init()
        # Fail fast unless this is a test index: the delete below wipes every
        # document stored in it.
        self.assertTrue(
            ElasticSectionDocument._index._name.endswith("_test"),
            "refusing to wipe an index whose name does not end with '_test'")
        ElasticSectionDocument.search().query().delete()
        time.sleep(2)  # let Elasticsearch process the deletion

        people = list(ElasticSectionDocument.search().query(
            'match', person_name='Иванов'))
        self.assertEqual(len(people), 0)

        models.Section.objects.all().delete()
        models.Source_Document.objects.all().delete()

        ofc = models.Office()
        ofc.name = "some name"
        ofc.save()

        src_doc = models.Source_Document()
        src_doc.id = 1
        src_doc.office = ofc
        src_doc.save()

        section = models.Section()
        section.id = 1
        section.person_name = "Иванов Иван"
        section.source_document = src_doc
        section.save()
        print("sleep 2 seconds till elastic processes records")
        time.sleep(2)

        people = list(ElasticSectionDocument.search().query(
            'match', person_name='Иванов'))
        # assertEqual reports the actual count on failure, no extra print needed
        self.assertEqual(len(people), 1)
Exemple #2
0
    def register_document_in_database(self, sha256, src_doc: TSourceDocument):
        """Save ``src_doc`` as a ``Source_Document`` row together with its references.

        The primary key comes from
        ``self.permalinks_db_source_document.get_source_doc_id_by_sha256``.
        One ``Declarator_File_Reference`` row is saved per item of
        ``src_doc.decl_references`` and one ``Web_Reference`` row per item of
        ``src_doc.web_references``.

        :param sha256: checksum stored on the row and used to look up its id
        :param src_doc: source document description to register
        :return: the saved ``models.Source_Document`` instance
        :raises AssertionError: if a row with the obtained id already exists
        """
        db_doc = models.Source_Document(
            sha256=sha256,
            intersection_status=src_doc.build_intersection_status(),
        )
        db_doc.id, new_file = \
            self.permalinks_db_source_document.get_source_doc_id_by_sha256(sha256)

        # The id must not be taken by another row yet.
        already_registered = models.Source_Document.objects.filter(
            id=db_doc.id).exists()
        assert not already_registered

        message = "register doc sha256={} id={}, new_file={}".format(
            sha256, db_doc.id, new_file)
        self.logger.debug(message)

        db_doc.file_extension = src_doc.file_extension
        db_doc.save()

        decl_ref: TDeclaratorReference
        for decl_ref in src_doc.decl_references:
            file_reference = models.Declarator_File_Reference(
                source_document=db_doc,
                declarator_documentfile_id=decl_ref.document_file_id,
                declarator_document_id=decl_ref.document_id,
                web_domain=decl_ref._site_url,
                declarator_document_file_url=decl_ref.document_file_url)
            file_reference.save()

        web_ref: TWebReference
        for web_ref in src_doc.web_references:
            web_reference = models.Web_Reference(
                source_document=db_doc,
                dlrobot_url=web_ref.url,
                web_domain=web_ref._site_url,
                crawl_epoch=web_ref.crawl_epoch)
            web_reference.save()

        return db_doc
Exemple #3
0
    def create_test_db(self):
        """Reset the tables and insert one document, two sections and their incomes.

        Reads ``self.declarator_document_id``, ``self.section_id1``,
        ``self.section_id2``, ``self.fio``, ``self.income_main1`` and
        ``self.income_main2``.  At least one ``Office`` row must already exist.
        """
        for model in (models.Section,
                      models.Declarator_File_Reference,
                      models.Source_Document,
                      models.Person):
            model.objects.all().delete()
        self.assertGreater(models.Office.objects.count(), 0)

        document = models.Source_Document(id=1)
        document.save()

        file_reference = models.Declarator_File_Reference(
            source_document=document,
            declarator_document_id=self.declarator_document_id)
        file_reference.save()

        # Two sections of the same document that share the same person name.
        first_section = models.Section(
            id=self.section_id1,
            source_document=document,
            person_name=self.fio,
            office_id=1)
        first_section.save()

        second_section = models.Section(
            id=self.section_id2,
            source_document=document,
            person_name=self.fio,
            office_id=1)
        second_section.save()

        # Each section gets its own main-declarant income.
        incomes = (
            (first_section, self.income_main1),
            (second_section, self.income_main2),
        )
        for section, amount in incomes:
            models.Income(
                section=section,
                size=amount,
                relative=models.Relative.main_declarant_code).save()
    def test_search_section_by_partial_person_name(self):
        """Search three similarly named sections by full and partial names.

        Sections 1..3 are saved, the section index is rebuilt and
        ``self.search_sections_by_fio`` is checked for the number of hits and,
        where a single hit is expected by id, for the id of the first hit.
        """
        models.Section.objects.all().delete()
        models.Source_Document.objects.all().delete()
        document = models.Source_Document(id=1)
        document.save()

        names = ("Один Иван Ильич", "Два Иван Ильич", "Иван Ильич")
        for section_id, name in enumerate(names, start=1):
            models.Section(
                id=section_id,
                person_name=name,
                source_document=document,
                office_id=1).save()
        BuildElasticIndex(None, None).handle(None, model="section")

        # (query, expected number of hits, expected id of the first hit or
        # None when the id is not checked)
        cases = (
            ("Один Иван", 1, 1),
            ("Два Иван", 1, 2),
            ("Один Иван Ильич", 1, 1),
            ("Иван Ильич", 3, None),
            ("Ильич", 3, None),
            ("Один", 1, None),
        )
        for query, hit_count, first_id in cases:
            found = self.search_sections_by_fio(query)
            self.assertEqual(len(found), hit_count)
            if first_id is not None:
                self.assertEqual(found[0].id, first_id)
def create_default_source_document():
    """Delete all sections, source documents and persons, then save document 1.

    :return: the saved ``models.Source_Document`` with ``id=1`` and
        ``office_id=1``
    """
    for model in (models.Section, models.Source_Document, models.Person):
        model.objects.all().delete()
    document = models.Source_Document(id=1, office_id=1)
    document.save()
    return document
    def test_elastic(self):
        """Rebuild the section index from the database and query it by surname.

        The index is dropped and recreated, three sections are saved and
        indexed with a chunk size of 2, and every stored surname must then
        give exactly one hit while an unknown surname gives none.
        """
        self.assertGreater(models.Office.objects.count(), 0)

        # Start from an empty index by dropping and recreating it.
        section_index = Index(
            settings.ELASTICSEARCH_INDEX_NAMES['section_index_name'],
            Elasticsearch())
        section_index.delete()
        section_index.create()
        time.sleep(2)

        # Nothing is indexed yet, so the search must come back empty.
        hits = list(ElasticSectionDocument.search().query(
            'match', person_name='Иванов'))
        self.assertEqual(len(hits), 0)

        models.Section.objects.all().delete()
        models.Source_Document.objects.all().delete()

        office = models.Office.objects.get(id=1)

        document = models.Source_Document()
        document.id = 1
        document.save()

        stored_names = ("Иванов Иван", "Петров Петр", "Сидоров Федор")
        for section_id, name in enumerate(stored_names, start=1):
            models.Section(id=section_id,
                           person_name=name,
                           source_document=document,
                           office=office).save()

        # Reindex the section index; three sections with chunk size 2 need
        # more than one chunk.
        TSectionElasticIndexator.chunk_size = 2
        BuildElasticIndex(None, None).handle(None, model="section")
        time.sleep(2)

        expected_hits = (
            ('Иванов', 1),
            ('Петров', 1),
            ('Сидоров', 1),
            ('Сокирко', 0),
        )
        for surname, hit_count in expected_hits:
            hits = list(ElasticSectionDocument.search().query(
                'match', person_name=surname))
            self.assertEqual(len(hits), hit_count)
Exemple #7
0
    def check_case(self, use_only_surname, check_ambiguity):
        """Run ``CopyPersonIdCommand`` on one section and check the linked person.

        A one-entry ``person_ids.json`` and an empty permalinks database are
        written next to this file, one document / section / income is saved
        and the command is run against them.

        :param use_only_surname: build the passport key from the first word of
            the name instead of the full name
        :param check_ambiguity: store ``"AMBIGUOUS_KEY"`` instead of the person
            id and expect the section to stay without a person
        """
        models.Section.objects.all().delete()
        self.assertGreater(models.Office.objects.count(), 0)

        fio = "Иванов Иван Иванович"
        document_id = 1784
        income_main = 12534
        declarator_person_id = 178
        test_dir = os.path.dirname(__file__)

        passport_name = fio.split()[0] if use_only_surname else fio
        passport_value = "AMBIGUOUS_KEY" if check_ambiguity else declarator_person_id

        person_ids_path = os.path.join(test_dir, "person_ids.json")
        # NOTE(review): the file is opened with the platform default encoding
        # although non-ASCII keys are written (ensure_ascii=False) — confirm
        # that the reader uses the same encoding.
        with open(person_ids_path, "w") as outp:
            passport = build_section_passport(
                document_id, passport_name, income_main)
            json.dump({passport: passport_value}, outp,
                      ensure_ascii=False, indent=4)

        src_doc = models.Source_Document(office_id=1, id=1)
        src_doc.save()

        models.Declarator_File_Reference(
            source_document=src_doc,
            declarator_document_id=document_id).save()

        section = models.Section(id=1,
                                 source_document=src_doc,
                                 person_name=fio)
        section.save()

        models.Income(section=section,
                      size=income_main,
                      relative=models.Relative.main_declarant_code).save()

        # The command receives a freshly created, empty permalinks database.
        permalinks_path = os.path.join(test_dir, "permalinks.dbm")
        permalinks = TPermaLinksDB(permalinks_path)
        permalinks.create_db()
        permalinks.close_db()

        CopyPersonIdCommand(None, None).handle(
            None,
            read_person_from_json=person_ids_path,
            permanent_links_db=permalinks_path)

        section.refresh_from_db()
        if not check_ambiguity:
            self.assertEqual(section.person.declarator_person_id,
                             declarator_person_id)
        else:
            self.assertEqual(section.person, None)
    def test_search_section_by_person_name(self):
        """Find one indexed section by initials, by full name and by a partial name."""
        self.assertGreater(models.Office.objects.count(), 0)
        models.Section.objects.all().delete()
        models.Source_Document.objects.all().delete()

        document = models.Source_Document(id=1)
        document.save()
        section = models.Section(id=1,
                                 person_name="Иванов Иван Иванович",
                                 source_document=document,
                                 office_id=1)
        section.save()
        BuildElasticIndex(None, None).handle(None, model="section")

        # Every spelling must put section 1 first in the results.
        for query in ("Иванов И.И.", "Иванов Иван Иванович", "Иванов Иван"):
            found = self.search_sections_by_fio(query)
            self.assertEqual(found[0].id, 1)
Exemple #9
0
    def convert_one_id(self, header, section_or_person_id_key, json_key, row):
        """Replace a declarator id stored in ``row`` with the matching local id, in place.

        ``row[header.index(section_or_person_id_key)]`` holds either
        ``person-<id>`` or ``section-<id>``; ``row[header.index(json_key)]``
        holds the JSON of that person.  A person id is translated through
        ``self.squeeze.declarator_person_id_to_person_id``, a section id through
        ``self.search_by_passports``.

        :param header: list of column names of ``row``
        :param section_or_person_id_key: name of the id column
        :param json_key: name of the JSON column
        :param row: list of cell values; the id cell is overwritten on success
        :raises ValueError: if one of the two columns is missing from ``header``
        :raises ConvertException: if the JSON has no sections, the person or the
            section cannot be found, or the family names differ
        """
        # list.index raises ValueError for a missing column and never returns
        # -1, so no extra check of the index is needed.
        id_index = header.index(section_or_person_id_key)
        section_or_person_id = row[id_index]
        person_json = json.loads(row[header.index(json_key)])

        sections = person_json.get('sections', [])
        if not sections:
            raise ConvertException(
                "cannot find sections or bad format for id: {}".format(
                    section_or_person_id))
        first_section = sections[0]
        if 'person' in first_section:
            input_person_name = first_section['person']['name_raw']
        else:
            input_person_name = person_json['fio']

        if section_or_person_id.startswith('person-'):
            declarator_person_id = int(section_or_person_id[len('person-'):])
            known_person = self.squeeze.declarator_person_id_to_person_id.get(
                declarator_person_id)
            if known_person is None:
                raise ConvertException(
                    "declarator_person_id {} cannot be found in disclosures, skip this record"
                    .format(declarator_person_id))
            person_id, person_name = known_person
            if not check_family_name(person_name, input_person_name):
                raise ConvertException(
                    "person id: {} has a different family name, skip this record"
                    .format(person_id))
            row[id_index] = "person-" + str(person_id)
            return

        # Anything that is not a person id must be a section id.
        assert section_or_person_id.startswith('section-')
        year = first_section.get('year', 0)
        office_id = first_section.get('office_id', -1)
        # An unsaved document is passed only as the container the reader expects.
        json_file = models.Source_Document()
        passport_items = TSmartParserSectionJson(
            year, office_id, json_file).read_raw_json(
                first_section).get_passport_components1()
        section_id, search_results = self.search_by_passports(passport_items)
        if section_id is None:
            raise ConvertException(
                "cannot find in disclosures declarator section_id={}, passport={}, "
                "search_results={} ".format(
                    section_or_person_id,
                    passport_items.get_main_section_passport(),
                    search_results))
        row[id_index] = "section-" + str(section_id)
Exemple #10
0
    def test_rating(self):
        """Build the rating for three sections and check values and places.

        Section N (N = 1..3) of the same person gets a main-declarant income of
        size N.  After ``BuildRatingCommand`` runs with ``min_members_count=3``
        three rating items are expected: place 1 for value 3, place 2 for
        value 2 and place 3 for value 1.
        """
        models.Person.objects.all().delete()
        models.Section.objects.all().delete()
        models.Source_Document.objects.all().delete()

        src_doc = models.Source_Document(id=1)
        src_doc.save()

        person = models.Person(id=99)
        person.save()

        section_ids = (1, 2, 3)
        for section_id in section_ids:
            models.Section(id=section_id,
                           source_document=src_doc,
                           person_name="i{}".format(section_id),
                           income_year=2019,
                           office_id=1,
                           person=person).save()
        for section_id in section_ids:
            models.Income(section_id=section_id,
                          size=section_id,
                          relative=models.Relative.main_declarant_code).save()

        builder = BuildRatingCommand(None, None)
        builder.handle(None, min_members_count=3)

        self.assertEqual(models.Person_Rating_Items.objects.count(), 3)
        # Order explicitly: an unordered queryset gives no guarantee about the
        # order of its rows, which would make the indexed checks below flaky.
        rating = list(
            models.Person_Rating_Items.objects.order_by("person_place"))

        expected = ((3, 1), (2, 2), (1, 3))  # (rating_value, person_place)
        for item, (rating_value, person_place) in zip(rating, expected):
            self.assertEqual(item.rating_value, rating_value)
            self.assertEqual(item.person_place, person_place)
Exemple #11
0
    def test_simple_import(self):
        """Import ``dlrobot_human.json`` and check ids and row counts.

        The permalinks database is prepared so that the next ``Section`` key is
        ``section_count`` and the document with the given sha256 keeps the id
        ``doc_old_id``.  After ``ImportJsonCommand`` runs, exactly one source
        document, section, real estate and income must exist with those ids.
        """
        models.Section.objects.all().delete()
        models.Source_Document.objects.all().delete()
        self.assertGreater(models.Office.objects.count(), 0)
        # NOTE(review): the variable stays set after the test finishes.
        os.environ['SMART_PARSER_SERVER_ADDRESS'] = "localhost:8178"

        test_dir = os.path.dirname(__file__)
        domains_folder = os.path.join(test_dir, "domains")
        sp_workdir = os.path.join(test_dir, "smart_parser_server")
        permalinks_path = os.path.join(test_dir, "permalinks.dbm")

        section_count = 111999
        doc_old_id = 111110
        p = TPermaLinksDB(permalinks_path)
        p.create_db()
        p.save_next_primary_key_value(models.Section, section_count)
        src_doc = models.Source_Document(
            id=doc_old_id,
            sha256="f974dc82aa52acea2f9c49467e7395924605de474e76bafa85572351194b153a")
        p.put_record_id(src_doc)
        p.close()

        with SmartParserServerForTesting(sp_workdir, domains_folder):
            importer = ImportJsonCommand(None, None)
            input_path = os.path.join(test_dir, "dlrobot_human.json")
            importer.handle(None,
                            dlrobot_human=input_path,
                            permanent_links_db=permalinks_path)

        self.assertEqual(models.Source_Document.objects.count(), 1)
        self.assertEqual(
            list(models.Source_Document.objects.all())[0].id, doc_old_id)

        self.assertEqual(models.Section.objects.count(), 1)
        self.assertEqual(
            list(models.Section.objects.all())[0].id, section_count)

        self.assertEqual(models.RealEstate.objects.count(), 1)
        # The income count is checked once; a repeated identical assertion
        # adds nothing.
        self.assertEqual(models.Income.objects.count(), 1)
        self.assertEqual(models.Income.objects.all()[:1].get().size, 1462642)
        self.assertGreater(models.Office.objects.count(), 0)
 def convert_line(self, header, section_or_person_id_key, json_key, row):
     """Validate a person id or rewrite a section id stored in ``row``.

     ``row[header.index(section_or_person_id_key)]`` holds either
     ``person-<id>`` or ``section-<id>``; ``row[header.index(json_key)]``
     holds the JSON of that person.  For a person id only the family name is
     checked and ``row`` is left unchanged.  For a section id the cell is
     overwritten with the id found through the passport search.

     :param header: list of column names of ``row``
     :param section_or_person_id_key: name of the id column
     :param json_key: name of the JSON column
     :param row: list of cell values
     :raises ValueError: if one of the two columns is missing from ``header``
     :raises ConvertException: if the JSON has no sections, the family names
         differ or the section cannot be found
     """
     # list.index raises ValueError for a missing column and never returns
     # -1, so no extra check of the index is needed.
     id_index = header.index(section_or_person_id_key)
     section_or_person_id = row[id_index]
     person_json = json.loads(row[header.index(json_key)])

     sections = person_json.get('sections', [])
     if not sections:
         raise ConvertException(
             "cannot find sections or bad format for id: {}".format(
                 section_or_person_id))
     first_section = sections[0]
     input_person_name = first_section['person']['name_raw']
     assert first_section['source'] == "declarator"

     if section_or_person_id.startswith('person-'):
         person_id = int(section_or_person_id[len('person-'):])
         person = models.Person.objects.get(id=person_id)
         person_sections = list(person.section_set.all())
         if not check_family_name(person_sections[0].person_name,
                                  input_person_name):
             raise ConvertException(
                 "person id: {} has a different family name, skip this record"
                 .format(person_id))
         return  # nothing to set just check that it is the right person

     # Anything that is not a person id must be a section id.
     assert section_or_person_id.startswith('section-')
     year = first_section.get('year', 0)
     # An unsaved document only carries the office id for the reader.
     json_file = models.Source_Document(
         office_id=first_section.get('office_id', -1))
     passport_factory = TSmartParserJsonReader(
         year, json_file,
         first_section).get_passport_factory(self.office_hierarchy)
     section_id, search_results = passport_factory.search_by_passports(
         self.stable_key_to_sections)
     if section_id is None:
         raise ConvertException(
             "cannot find in disclosures declarator section_id={}, passport={}, "
             "search_results={} ".format(
                 section_or_person_id,
                 passport_factory.get_passport_collection()[0],
                 search_results))
     row[id_index] = "section-" + str(section_id)
    def register_document_in_database(self, sha256, src_doc):
        """Save ``src_doc`` as a ``Source_Document`` row together with its references.

        The primary key comes from ``self.primary_keys_builder.get_record_id``.
        One ``Declarator_File_Reference`` row is saved per item of
        ``src_doc.decl_references`` and one ``Web_Reference`` row per item of
        ``src_doc.web_references``.

        :param sha256: checksum stored on the row
        :param src_doc: source document description to register
        :return: the saved ``models.Source_Document`` instance
        """
        db_doc = models.Source_Document(
            office=models.Office(id=src_doc.calculated_office_id),
            sha256=sha256,
            file_path=src_doc.document_path,
            intersection_status=src_doc.intersection_status,
        )
        db_doc.id = self.primary_keys_builder.get_record_id(db_doc)
        db_doc.save()

        for decl_ref in src_doc.decl_references:
            file_reference = models.Declarator_File_Reference(
                source_document=db_doc,
                declarator_documentfile_id=decl_ref.document_file_id,
                declarator_document_id=decl_ref.document_id,
                declarator_document_file_url=decl_ref.document_file_url)
            file_reference.save()

        for web_ref in src_doc.web_references:
            web_reference = models.Web_Reference(
                source_document=db_doc,
                dlrobot_url=web_ref.url,
                crawl_epoch=web_ref.crawl_epoch)
            web_reference.save()

        return db_doc
Exemple #14
0
    def create_records(self, records):
        """Reset the tables and insert the rows described by ``records``.

        :param records: dict with the optional keys ``source_documents``,
            ``persons``, ``sections`` and ``redirects``; each maps to a list of
            keyword dicts passed to the matching model constructor.  Every
            section dict must contain ``office_id``; an office with that id and
            the name "aaa" is created when it does not exist yet.
        :raises AssertionError: if the ``Office`` table is empty
        """
        models.Section.objects.all().delete()
        models.Source_Document.objects.all().delete()
        models.Person.objects.all().delete()
        models.PersonRedirect.objects.all().delete()
        assert models.Office.objects.count() > 0

        for fields in records.get('source_documents', []):
            models.Source_Document(**fields).save()

        for fields in records.get('persons', []):
            models.Person(**fields).save()

        for fields in records.get('sections', []):
            office_id = fields['office_id']
            # exists() only asks the database whether a row is there instead
            # of loading the matching offices just to count them.
            if not models.Office.objects.filter(id=office_id).exists():
                models.Office(id=office_id, name="aaa").save()
            models.Section(**fields).save()

        for fields in records.get('redirects', []):
            models.PersonRedirect(**fields).save()
Exemple #15
0
    def test_corrected_person(self):
        """A section and its corrected counterpart must show up once for the person.

        ``SECTION_CORRECTIONS`` maps section 8048661 to 9798543.  Both sections
        are saved for person 1, and ``sections_ordered_by_year`` is expected to
        return a single section.
        """
        models.Section.objects.all().delete()
        models.Source_Document.objects.all().delete()
        models.Person.objects.all().delete()

        document = models.Source_Document(id=1)
        document.save()

        original_id, corrected_id = 8048661, 9798543
        assert SECTION_CORRECTIONS.get_corrected_section_id(original_id) == corrected_id

        models.Person(id=1, person_name="Иванов Иван Ильич").save()
        for section_id in (original_id, corrected_id):
            models.Section(id=section_id,
                           income_year=2016,
                           person_name="Иванов Иван Ильич",
                           source_document=document,
                           office_id=1,
                           person_id=1).save()

        person = models.Person.objects.get(id=1)
        self.assertEqual(1, len(person.sections_ordered_by_year))
Exemple #16
0
 def initialize(self):
     """Delete all sections, source documents and persons; save document 1 as ``self.src_doc``."""
     for model in (models.Section, models.Source_Document, models.Person):
         model.objects.all().delete()
     self.src_doc = document = models.Source_Document(id=1)
     document.save()