def test_elastic(self):
    """Check that a section saved to the database becomes searchable.

    The index is emptied first, a search for 'Иванов' must return nothing,
    then one office, one source document and one section are saved and the
    same search must return exactly one hit.

    No indexer is invoked explicitly here, so the test relies on saved
    sections reaching the index on their own (presumably through a save
    signal -- TODO confirm).
    """
    ElasticSectionDocument.init()
    # Safety guard: every document in the index is deleted below, so refuse
    # to continue unless the index name ends with "_test".  The original
    # evaluated this expression but discarded the result.
    self.assertTrue(ElasticSectionDocument._index._name.endswith("_test"))
    ElasticSectionDocument.search().query().delete()
    time.sleep(2)  # give Elasticsearch time to apply the deletion

    people = list(ElasticSectionDocument.search().query(
        'match', person_name='Иванов'))
    self.assertEqual(len(people), 0)

    models.Section.objects.all().delete()
    models.Source_Document.objects.all().delete()

    ofc = models.Office()
    ofc.name = "some name"
    ofc.save()

    src_doc = models.Source_Document()
    src_doc.id = 1
    src_doc.office = ofc
    src_doc.save()

    section = models.Section()
    section.id = 1
    section.person_name = "Иванов Иван"
    section.source_document = src_doc
    section.save()

    print("sleep 2 seconds till elastic processes records")
    time.sleep(2)

    people = list(ElasticSectionDocument.search().query(
        'match', person_name='Иванов'))
    print(len(people))
    self.assertEqual(len(people), 1)
def register_document_in_database(self, sha256, src_doc: TSourceDocument):
    """Store a source document and all of its references in the database.

    :param sha256: checksum of the document; also used to look up its
        permanent id in ``self.permalinks_db_source_document``.
    :param src_doc: document description providing the intersection status,
        the file extension and the declarator/web references.
    :return: the saved ``models.Source_Document`` instance.
    """
    db_doc = models.Source_Document(
        sha256=sha256,
        intersection_status=src_doc.build_intersection_status(),
    )
    db_doc.id, new_file = \
        self.permalinks_db_source_document.get_source_doc_id_by_sha256(sha256)

    # The id handed out for this checksum must not be occupied yet.
    assert not models.Source_Document.objects.filter(id=db_doc.id).exists()

    self.logger.debug("register doc sha256={} id={}, new_file={}".format(
        sha256, db_doc.id, new_file))
    db_doc.file_extension = src_doc.file_extension
    db_doc.save()

    decl_ref: TDeclaratorReference
    for decl_ref in src_doc.decl_references:
        file_reference = models.Declarator_File_Reference(
            source_document=db_doc,
            declarator_documentfile_id=decl_ref.document_file_id,
            declarator_document_id=decl_ref.document_id,
            web_domain=decl_ref._site_url,
            declarator_document_file_url=decl_ref.document_file_url,
        )
        file_reference.save()

    web_ref: TWebReference
    for web_ref in src_doc.web_references:
        web_reference = models.Web_Reference(
            source_document=db_doc,
            dlrobot_url=web_ref.url,
            web_domain=web_ref._site_url,
            crawl_epoch=web_ref.crawl_epoch,
        )
        web_reference.save()

    return db_doc
def create_test_db(self):
    """Reset the test tables and create one document with two sections.

    Both sections share the same source document, person name (``self.fio``)
    and office (id 1); each one gets a main-declarant income record
    (``self.income_main1`` / ``self.income_main2``).
    """
    for model in (models.Section, models.Declarator_File_Reference,
                  models.Source_Document, models.Person):
        model.objects.all().delete()
    # Offices are not created here; at least one must already exist.
    self.assertGreater(models.Office.objects.count(), 0)

    source_document = models.Source_Document(id=1)
    source_document.save()
    file_reference = models.Declarator_File_Reference(
        source_document=source_document,
        declarator_document_id=self.declarator_document_id)
    file_reference.save()

    sections = []
    for section_id in (self.section_id1, self.section_id2):
        section = models.Section(id=section_id,
                                 source_document=source_document,
                                 person_name=self.fio,
                                 office_id=1)
        section.save()
        sections.append(section)

    income_sizes = (self.income_main1, self.income_main2)
    for section, income_size in zip(sections, income_sizes):
        income = models.Income(section=section,
                               size=income_size,
                               relative=models.Relative.main_declarant_code)
        income.save()
def test_search_section_by_partial_person_name(self):
    """Search sections by full and partial person names.

    Three sections are indexed ("Один Иван Ильич", "Два Иван Ильich"[:0] or ""
    is not used; see the table below for the exact names) and a series of
    queries is checked for the number of hits and, where it is unambiguous,
    the id of the first hit.
    """
    models.Section.objects.all().delete()
    models.Source_Document.objects.all().delete()
    source_document = models.Source_Document(id=1)
    source_document.save()

    indexed_names = ("Один Иван Ильич", "Два Иван Ильич", "Иван Ильич")
    for section_id, person_name in enumerate(indexed_names, start=1):
        models.Section(id=section_id,
                       person_name=person_name,
                       source_document=source_document,
                       office_id=1).save()
    BuildElasticIndex(None, None).handle(None, model="section")

    # (query, expected number of hits, expected id of the first hit or None)
    expectations = (
        ("Один Иван", 1, 1),
        ("Два Иван", 1, 2),
        ("Один Иван Ильич", 1, 1),
        ("Иван Ильич", 3, None),
        ("Ильич", 3, None),
        ("Один", 1, None),
    )
    for query, expected_count, expected_first_id in expectations:
        found = self.search_sections_by_fio(query)
        self.assertEqual(len(found), expected_count)
        if expected_first_id is not None:
            self.assertEqual(found[0].id, expected_first_id)
def create_default_source_document():
    """Wipe sections, source documents and persons; return a fresh document.

    :return: a saved ``models.Source_Document`` with id 1 and ``office_id`` 1.
    """
    for model in (models.Section, models.Source_Document, models.Person):
        model.objects.all().delete()

    default_document = models.Source_Document(id=1)
    default_document.office_id = 1
    default_document.save()
    return default_document
def test_elastic(self):
    """Rebuild the section index and check that every section can be found.

    The index is dropped and recreated, three sections are saved, the index
    is rebuilt with ``BuildElasticIndex`` and each surname must then match
    exactly one document; an unknown surname must match none.
    """
    self.assertGreater(models.Office.objects.count(), 0)

    def count_matches(surname):
        # Number of indexed sections whose person_name matches the surname.
        hits = list(ElasticSectionDocument.search().query(
            'match', person_name=surname))
        return len(hits)

    # Delete all documents by recreating the index.
    section_index = Index(
        settings.ELASTICSEARCH_INDEX_NAMES['section_index_name'],
        Elasticsearch())
    section_index.delete()
    section_index.create()
    time.sleep(2)

    # The empty index must return no results.
    self.assertEqual(count_matches('Иванов'), 0)

    models.Section.objects.all().delete()
    models.Source_Document.objects.all().delete()
    office = models.Office.objects.get(id=1)
    source_document = models.Source_Document(id=1)
    source_document.save()
    person_names = ("Иванов Иван", "Петров Петр", "Сидоров Федор")
    for section_id, person_name in enumerate(person_names, start=1):
        models.Section(id=section_id,
                       person_name=person_name,
                       source_document=source_document,
                       office=office).save()

    # Reindex the section index; the chunk size is smaller than the number
    # of sections (presumably to exercise multi-chunk indexing -- confirm).
    TSectionElasticIndexator.chunk_size = 2
    BuildElasticIndex(None, None).handle(None, model="section")
    time.sleep(2)

    expected_hits = (
        ('Иванов', 1),
        ('Петров', 1),
        ('Сидоров', 1),
        ('Сокирко', 0),
    )
    for surname, expected_count in expected_hits:
        self.assertEqual(count_matches(surname), expected_count)
def check_case(self, use_only_surname, check_ambiguity):
    """Run ``CopyPersonIdCommand`` on one section and verify the result.

    :param use_only_surname: when true, the passport written to
        ``person_ids.json`` is built from the first word of the name only.
    :param check_ambiguity: when true, the passport is mapped to
        "AMBIGUOUS_KEY" instead of a person id, and the section is expected
        to stay without a person.
    """
    models.Section.objects.all().delete()
    self.assertGreater(models.Office.objects.count(), 0)

    fio = "Иванов Иван Иванович"
    document_id = 1784
    income_main = 12534
    declarator_person_id = 178
    test_dir = os.path.dirname(__file__)

    # Write the passport -> person id mapping the command will read.
    person_ids_path = os.path.join(test_dir, "person_ids.json")
    with open(person_ids_path, "w") as outp:
        fio_key = fio.split()[0] if use_only_surname else fio
        value = "AMBIGUOUS_KEY" if check_ambiguity else declarator_person_id
        passport = build_section_passport(document_id, fio_key, income_main)
        json.dump({passport: value}, outp, ensure_ascii=False, indent=4)

    # Database fixture: one document, its declarator reference, one section
    # and the main declarant's income.
    source_document = models.Source_Document()
    source_document.office_id = 1
    source_document.id = 1
    source_document.save()
    models.Declarator_File_Reference(
        source_document=source_document,
        declarator_document_id=document_id).save()
    section = models.Section(source_document=source_document, person_name=fio)
    section.id = 1
    section.save()
    models.Income(section=section,
                  size=income_main,
                  relative=models.Relative.main_declarant_code).save()

    # A freshly created permalinks database for the command.
    permalinks_path = os.path.join(test_dir, "permalinks.dbm")
    permalinks_db = TPermaLinksDB(permalinks_path)
    permalinks_db.create_db()
    permalinks_db.close_db()

    CopyPersonIdCommand(None, None).handle(
        None,
        read_person_from_json=person_ids_path,
        permanent_links_db=permalinks_path)

    section.refresh_from_db()
    if check_ambiguity:
        self.assertEqual(section.person, None)
        return
    self.assertEqual(section.person.declarator_person_id,
                     declarator_person_id)
def test_search_section_by_person_name(self):
    """Find a single indexed section by several spellings of the name.

    The section for "Иванов Иван Иванович" must be the first hit for the
    name with initials, the full name and the name without patronymic.
    """
    self.assertGreater(models.Office.objects.count(), 0)
    models.Section.objects.all().delete()
    models.Source_Document.objects.all().delete()

    source_document = models.Source_Document(id=1)
    source_document.save()
    models.Section(id=1,
                   person_name="Иванов Иван Иванович",
                   source_document=source_document,
                   office_id=1).save()
    BuildElasticIndex(None, None).handle(None, model="section")

    for query in ("Иванов И.И.", "Иванов Иван Иванович", "Иванов Иван"):
        first_hit = self.search_sections_by_fio(query)[0]
        self.assertEqual(first_hit.id, 1)
def convert_one_id(self, header, section_or_person_id_key, json_key, row):
    """Replace the declarator id stored in ``row`` with the disclosures id.

    The id cell has the form ``person-<n>`` or ``section-<n>``.  A person id
    is translated through ``self.squeeze.declarator_person_id_to_person_id``
    and accepted only if the family names agree; a section id is resolved by
    searching for the passports built from the first section of the json
    cell.  ``row`` is modified in place.

    :param header: column names; ``header.index`` raises ``ValueError`` when
        a requested column is absent (it never returns -1, so the former
        ``assert index != -1`` checks were dead code and have been removed).
    :param section_or_person_id_key: name of the column holding the id.
    :param json_key: name of the column holding the json description.
    :param row: the data row to update.
    :raises ConvertException: if the json has no sections, the person is
        unknown or has a different family name, or no section is found.
    """
    id_index = header.index(section_or_person_id_key)
    section_or_person_id = row[id_index]
    person_json = json.loads(row[header.index(json_key)])

    sections = person_json.get('sections', [])
    # "not sections" also covers an explicit null, which is reported as a
    # bad format instead of failing with TypeError on len(None).
    if not sections:
        raise ConvertException(
            "cannot find sections or bad format for id: {}".format(
                section_or_person_id))
    first_section = sections[0]

    if 'person' in first_section:
        input_person_name = first_section['person']['name_raw']
    else:
        input_person_name = person_json['fio']

    if section_or_person_id.startswith('person-'):
        declarator_person_id = int(section_or_person_id[len('person-'):])
        if declarator_person_id not in self.squeeze.declarator_person_id_to_person_id:
            raise ConvertException(
                "declarator_person_id {} cannot be found in disclosures, skip this record"
                .format(declarator_person_id))
        person_id, person_name = self.squeeze.declarator_person_id_to_person_id.get(
            declarator_person_id)
        if not check_family_name(person_name, input_person_name):
            raise ConvertException(
                "person id: {} has a different family name, skip this record"
                .format(person_id))
        row[id_index] = "person-" + str(person_id)
        return

    assert section_or_person_id.startswith('section-')
    year = first_section.get('year', 0)
    office_id = first_section.get('office_id', -1)
    # An unsaved document object, passed to the section reader only.
    json_file = models.Source_Document()
    passport_items = TSmartParserSectionJson(
        year, office_id, json_file).read_raw_json(
            first_section).get_passport_components1()
    section_id, search_results = self.search_by_passports(passport_items)
    if section_id is None:
        raise ConvertException(
            "cannot find in disclosures declarator section_id={}, passport={}, "
            "search_results={} ".format(
                section_or_person_id,
                passport_items.get_main_section_passport(),
                search_results))
    row[id_index] = "section-" + str(section_id)
def test_rating(self):
    """Build the rating for three sections and check values and places.

    Three sections of one person get main-declarant incomes of 1, 2 and 3;
    after ``BuildRatingCommand`` runs there must be three rating items, with
    the largest value on place 1 and the smallest on place 3.
    """
    for model in (models.Person, models.Section, models.Source_Document):
        model.objects.all().delete()

    src_doc = models.Source_Document(id=1)
    src_doc.save()
    person_id = 99
    person = models.Person(id=person_id)
    person.save()

    for section_id in (1, 2, 3):
        models.Section(id=section_id,
                       source_document=src_doc,
                       person_name="i{}".format(section_id),
                       income_year=2019,
                       office_id=1,
                       person=person).save()
    # The income size equals the section id: 1, 2, 3.
    for section_id in (1, 2, 3):
        models.Income(section_id=section_id,
                      size=section_id,
                      relative=models.Relative.main_declarant_code).save()

    builder = BuildRatingCommand(None, None)
    builder.handle(None, min_members_count=3)

    self.assertEqual(models.Person_Rating_Items.objects.count(), 3)
    # A queryset without explicit ordering gives no guarantee about row
    # order, so sort by place before checking the items positionally.
    rating = sorted(models.Person_Rating_Items.objects.all(),
                    key=lambda item: item.person_place)
    expected = ((3, 1), (2, 2), (1, 3))  # (rating_value, person_place)
    for item, (rating_value, person_place) in zip(rating, expected):
        self.assertEqual(item.rating_value, rating_value)
        self.assertEqual(item.person_place, person_place)
def test_simple_import(self):
    """Import one document through ``ImportJsonCommand`` and check the rows.

    A permalinks database is prepared so that the imported document and
    section must receive predefined ids (``doc_old_id`` and
    ``section_count``); the test then checks those ids and the number of
    imported real estate and income records.
    """
    models.Section.objects.all().delete()
    models.Source_Document.objects.all().delete()
    self.assertGreater(models.Office.objects.count(), 0)

    # NOTE: the variable stays set in os.environ after the test finishes.
    os.environ['SMART_PARSER_SERVER_ADDRESS'] = "localhost:8178"
    test_dir = os.path.dirname(__file__)
    domains_folder = os.path.join(test_dir, "domains")
    sp_workdir = os.path.join(test_dir, "smart_parser_server")
    permalinks_path = os.path.join(test_dir, "permalinks.dbm")

    section_count = 111999
    doc_old_id = 111110

    # Register the next section id and the id of the known document.
    p = TPermaLinksDB(permalinks_path)
    p.create_db()
    p.save_next_primary_key_value(models.Section, section_count)
    src_doc = models.Source_Document(
        id=doc_old_id,
        sha256="f974dc82aa52acea2f9c49467e7395924605de474e76bafa85572351194b153a")
    p.put_record_id(src_doc)
    p.close()

    with SmartParserServerForTesting(sp_workdir, domains_folder):
        importer = ImportJsonCommand(None, None)
        input_path = os.path.join(test_dir, "dlrobot_human.json")
        importer.handle(None,
                        dlrobot_human=input_path,
                        permanent_links_db=permalinks_path)

    self.assertEqual(models.Source_Document.objects.count(), 1)
    self.assertEqual(
        list(models.Source_Document.objects.all())[0].id, doc_old_id)

    self.assertEqual(models.Section.objects.count(), 1)
    self.assertEqual(
        list(models.Section.objects.all())[0].id, section_count)

    self.assertEqual(models.RealEstate.objects.count(), 1)
    # The income count used to be asserted twice in a row; once is enough.
    self.assertEqual(models.Income.objects.count(), 1)
    self.assertEqual(models.Income.objects.all()[:1].get().size, 1462642)
    self.assertGreater(models.Office.objects.count(), 0)
def convert_line(self, header, section_or_person_id_key, json_key, row):
    """Validate a person id or rewrite a section id stored in ``row``.

    The id cell has the form ``person-<n>`` or ``section-<n>``.  For a
    person, nothing is written: the function only checks that the first
    section of that person has the same family name as the json input.  For
    a section, the id in ``row`` is replaced in place with the id found by
    passport search.

    :param header: column names; ``header.index`` raises ``ValueError`` when
        a requested column is absent (it never returns -1, so the former
        ``assert index != -1`` checks were dead code and have been removed).
    :param section_or_person_id_key: name of the column holding the id.
    :param json_key: name of the column holding the json description.
    :param row: the data row to check or update.
    :raises ConvertException: if the json has no sections, the person has a
        different family name, or no section is found.
    """
    id_index = header.index(section_or_person_id_key)
    section_or_person_id = row[id_index]
    person_json = json.loads(row[header.index(json_key)])

    sections = person_json.get('sections', [])
    # "not sections" also covers an explicit null, which is reported as a
    # bad format instead of failing with TypeError on len(None).
    if not sections:
        raise ConvertException(
            "cannot find sections or bad format for id: {}".format(
                section_or_person_id))
    first_section = sections[0]
    input_person_name = first_section['person']['name_raw']
    assert first_section['source'] == "declarator"

    if section_or_person_id.startswith('person-'):
        person_id = int(section_or_person_id[len('person-'):])
        person = models.Person.objects.get(id=person_id)
        person_sections = list(person.section_set.all())
        if not check_family_name(person_sections[0].person_name,
                                 input_person_name):
            raise ConvertException(
                "person id: {} has a different family name, skip this record"
                .format(person_id))
        # Nothing to set: this branch only checks it is the right person.
        return

    assert section_or_person_id.startswith('section-')
    year = first_section.get('year', 0)
    # An unsaved document object that carries only the office id.
    json_file = models.Source_Document(
        office_id=first_section.get('office_id', -1))
    passport_factory = TSmartParserJsonReader(
        year, json_file, first_section).get_passport_factory(
            self.office_hierarchy)
    section_id, search_results = passport_factory.search_by_passports(
        self.stable_key_to_sections)
    if section_id is None:
        raise ConvertException(
            "cannot find in disclosures declarator section_id={}, passport={}, "
            "search_results={} ".format(
                section_or_person_id,
                passport_factory.get_passport_collection()[0],
                search_results))
    row[id_index] = "section-" + str(section_id)
def register_document_in_database(self, sha256, src_doc):
    """Store a source document and all of its references in the database.

    :param sha256: checksum stored with the document.
    :param src_doc: document description providing the office id, path,
        intersection status and the declarator/web references.
    :return: the saved ``models.Source_Document`` instance.
    """
    db_doc = models.Source_Document(
        office=models.Office(id=src_doc.calculated_office_id),
        sha256=sha256,
        file_path=src_doc.document_path,
        intersection_status=src_doc.intersection_status,
    )
    # The primary key is assigned by the key builder before saving.
    db_doc.id = self.primary_keys_builder.get_record_id(db_doc)
    db_doc.save()

    for decl_ref in src_doc.decl_references:
        file_reference = models.Declarator_File_Reference(
            source_document=db_doc,
            declarator_documentfile_id=decl_ref.document_file_id,
            declarator_document_id=decl_ref.document_id,
            declarator_document_file_url=decl_ref.document_file_url,
        )
        file_reference.save()

    for web_ref in src_doc.web_references:
        web_reference = models.Web_Reference(
            source_document=db_doc,
            dlrobot_url=web_ref.url,
            crawl_epoch=web_ref.crawl_epoch,
        )
        web_reference.save()

    return db_doc
def create_records(self, records):
    """Reset the tables and load fixture records from a dictionary.

    :param records: mapping with optional keys 'source_documents',
        'persons', 'sections' and 'redirects'; each value is a list of
        keyword-argument dicts for the corresponding model.  A section
        referring to an office id that does not exist yet gets a
        placeholder office created for it.
    """
    for model in (models.Section, models.Source_Document,
                  models.Person, models.PersonRedirect):
        model.objects.all().delete()
    assert models.Office.objects.all().count() > 0

    for fields in records.get('source_documents', []):
        models.Source_Document(**fields).save()

    for fields in records.get('persons', []):
        models.Person(**fields).save()

    for fields in records.get('sections', []):
        # exists() asks the database only whether a row is present instead
        # of loading every matching office just to count it.
        if not models.Office.objects.filter(id=fields['office_id']).exists():
            models.Office(id=fields['office_id'], name="aaa").save()
        models.Section(**fields).save()

    for fields in records.get('redirects', []):
        models.PersonRedirect(**fields).save()
def test_corrected_person(self):
    """Check that a corrected section is not listed twice for a person.

    ``SECTION_CORRECTIONS`` maps section 8048661 to 9798543.  Both sections
    are saved for the same person, and ``sections_ordered_by_year`` must
    then contain a single section.
    """
    for model in (models.Section, models.Source_Document, models.Person):
        model.objects.all().delete()
    source_document = models.Source_Document(id=1)
    source_document.save()

    # Precondition: the correction used by this test is configured.
    assert SECTION_CORRECTIONS.get_corrected_section_id(8048661) == 9798543

    models.Person(id=1, person_name="Иванов Иван Ильич").save()
    for section_id in (8048661, 9798543):
        models.Section(id=section_id,
                       income_year=2016,
                       person_name="Иванов Иван Ильич",
                       source_document=source_document,
                       office_id=1,
                       person_id=1).save()

    person = models.Person.objects.get(id=1)
    self.assertEqual(1, len(person.sections_ordered_by_year))
def initialize(self):
    """Wipe sections, documents and persons; create ``self.src_doc``.

    After the call ``self.src_doc`` is a saved ``models.Source_Document``
    with id 1.
    """
    for model in (models.Section, models.Source_Document, models.Person):
        model.objects.all().delete()

    document = models.Source_Document(id=1)
    self.src_doc = document
    document.save()