def handle(self, *args, **options):
    for person in Person.objects.all():
        person.last_name_uk = person.last_name_uk or ""
        person.first_name_uk = person.first_name_uk or ""
        person.patronymic_uk = person.patronymic_uk or ""

        names = self.transliterate(
            person.last_name_uk, person.first_name_uk, person.patronymic_uk
        )

        if person.also_known_as_uk:
            for aka_name in filter(None, person.also_known_as_uk.split("\n")):
                last_name, first_name, patronymic, _ = parse_fullname(aka_name)
                names |= self.transliterate(last_name, first_name, patronymic)

        person.names = "\n".join(names)

        person.first_name_uk = person.first_name_uk.strip()
        person.last_name_uk = person.last_name_uk.strip()
        person.patronymic_uk = person.patronymic_uk.strip()

        if len(person.first_name) == 1:
            person.first_name += "."
        if len(person.patronymic) == 1:
            person.patronymic += "."

        person.save()
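# A minimal sketch (not part of this codebase) of what the `transliterate`
# helper above is assumed to do: return a set of latin spellings of the
# full name, one per transliteration table of the translit-ua package.
# The helper name and the use of ALL_UKRAINIAN here are assumptions made
# for illustration only.
from translitua import translit, ALL_UKRAINIAN


def transliterate_sketch(last_name, first_name, patronymic):
    full_name = " ".join(filter(None, [last_name, first_name, patronymic]))

    variants = set([full_name])
    for table in ALL_UKRAINIAN:
        variants.add(translit(full_name, table))

    return variants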
def family(self):
    if not self.source:
        return []

    res = []
    general = self.source["general"]

    if "family" in general and general["family"]:
        res = [
            {
                "relation": member.get(
                    "relations", member.get("relations_other", "")),
                "name": member.get("family_name", ""),
            }
            for member in general["family"]
            if member.get("family_name", "") and (
                member.get("relations", "") +
                member.get("relations_other", ""))
        ]
    elif "family_raw" in general and general["family_raw"]:
        res = map(
            parse_family_member,
            filter(None, general["family_raw"].split(";")),
        )
        # Materialize for the item assignment below
        res = list(filter(None, res))

    for i, r in enumerate(res):
        res[i]["mapped"] = RELATIONS_MAPPING.get(
            r["relation"].lower(), "особи, які спільно проживають")
        (
            res[i]["last_name"],
            res[i]["first_name"],
            res[i]["patronymic"],
            res[i]["dob"],
        ) = parse_fullname(r["name"])

    return res
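# A rough sketch (an assumption, not the project's actual helper) of the
# contract parse_family_member is expected to honour for one ";"-separated
# chunk of family_raw, e.g. "дружина Іванова Марія Петрівна" ->
# {"relation": "дружина", "name": "Іванова Марія Петрівна"}. Returning
# None for unparseable chunks is what the filter(None, ...) above relies on.
def parse_family_member_sketch(chunk):
    relation, _, name = chunk.strip().partition(" ")
    if not relation or not name:
        return None

    return {"relation": relation, "name": name.strip()}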
def to_dict(self):
    """
    Convert Person model to an indexable presentation for ES.
    """
    d = model_to_dict(
        self,
        fields=[
            "id", "last_name", "first_name", "patronymic", "dob",
            "last_name_en", "first_name_en", "patronymic_en",
            "dob_details", "is_pep", "names", "wiki_uk", "wiki_en",
            "city_of_birth_uk", "city_of_birth_en",
            "reputation_sanctions_uk", "reputation_sanctions_en",
            "reputation_convictions_uk", "reputation_convictions_en",
            "reputation_assets_uk", "reputation_assets_en",
            "reputation_crimes_uk", "reputation_crimes_en",
            "reputation_manhunt_uk", "reputation_manhunt_en",
            "also_known_as_uk", "also_known_as_en", "last_change",
            "inn", "inn_source", "passport", "passport_source",
        ],
    )

    d["related_persons"] = [
        i.to_dict() for i in self.to_persons.prefetch_related("to_person")
    ] + [
        i.to_dict_reverse()
        for i in self.from_persons.prefetch_related("from_person")
    ]
    d["related_countries"] = [
        i.to_dict()
        for i in self.person2country_set.prefetch_related("to_country")
    ]
    d["related_companies"] = [
        i.to_company_dict()
        for i in self.person2company_set.prefetch_related("to_company")
    ]
    d["declarations"] = [
        i.to_dict()
        for i in Declaration.objects.filter(person=self, confirmed="a")
    ]

    manhunt_records = self.manhunt_records
    if manhunt_records:
        # Render manhunt records in both languages, then restore the
        # language that was active before
        curr_lang = get_language()

        activate("uk")
        d["reputation_manhunt_uk"] = render_to_string(
            "_manhunt_records_uk.jinja",
            {"manhunt_records": manhunt_records},
        ) + (d["reputation_manhunt_uk"] or "")

        activate("en")
        d["reputation_manhunt_en"] = render_to_string(
            "_manhunt_records_en.jinja",
            {"manhunt_records": manhunt_records},
        ) + (d["reputation_manhunt_en"] or "")

        activate(curr_lang)

    d["inn_source"] = (
        settings.SITE_URL + self.inn_source.doc.url
        if self.inn_source else "")
    d["passport_source"] = (
        settings.SITE_URL + self.passport_source.doc.url
        if self.passport_source else "")
    d["photo"] = settings.SITE_URL + self.photo.url if self.photo else ""
    d["photo_path"] = self.photo.name if self.photo else ""
    d["date_of_birth"] = self.date_of_birth
    d["terminated"] = self.terminated
    d["last_modified"] = self.last_modified
    d["died"] = self.died

    if d["terminated"]:
        d["reason_of_termination"] = self.get_reason_of_termination_display()
        d["reason_of_termination_en"] = translate_into(
            self.get_reason_of_termination_display(), "en")
        d["termination_date_human"] = self.termination_date_human

    last_workplace = self.last_workplace
    if last_workplace:
        d["last_workplace"] = last_workplace["company"]
        d["last_job_title"] = last_workplace["position"]
        d["last_job_id"] = last_workplace["company_id"]

        last_workplace_en = self.last_workplace_en
        d["last_workplace_en"] = last_workplace_en["company"]
        d["last_job_title_en"] = last_workplace_en["position"]

    d["type_of_official"] = self.get_type_of_official_display()
    d["type_of_official_en"] = translate_into(
        self.get_type_of_official_display(), "en")
    d["full_name"] = self.full_name
    d["full_name_en"] = self.full_name_en

    def generate_suggestions(last_name, first_name, patronymic, *args):
        if not last_name:
            return []

        return [
            {
                "input": " ".join([last_name, first_name, patronymic]),
                "weight": 5,
            },
            {
                "input": " ".join([first_name, patronymic, last_name]),
                "weight": 2,
            },
            {
                "input": " ".join([first_name, last_name]),
                "weight": 2,
            },
        ]

    input_variants = [
        generate_suggestions(d["last_name"], d["first_name"], d["patronymic"])
    ]
    input_variants += [
        generate_suggestions(*parse_fullname(name))
        for name in self.parsed_names
    ]

    d["full_name_suggest"] = list(chain.from_iterable(input_variants))
    d["_id"] = d["id"]

    return d
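# An illustration of the completion payload produced by
# generate_suggestions() above for a hypothetical name: the canonical
# "last first patronymic" order is boosted over the reordered forms.
#
#   generate_suggestions("Шевченко", "Тарас", "Григорович") == [
#       {"input": "Шевченко Тарас Григорович", "weight": 5},
#       {"input": "Тарас Григорович Шевченко", "weight": 2},
#       {"input": "Тарас Шевченко", "weight": 2},
#   ]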
def handle(self, *args, **options):
    peklun = User.objects.get(username="******")

    wks = get_spreadsheet().sheet1
    for i, l in enumerate(wks.get_all_records()):
        # Reopen the spreadsheet from time to time to avoid being
        # disconnected by timeout
        if i % 2000 == 0 and i:
            wks = get_spreadsheet().sheet1

        self.stdout.write('Processing line #{}'.format(i))

        company_ipn = l.get("ІПН", "")
        company_name = l.get("Назва", "")
        person_id = l.get("id персони", "")
        company_id = l.get("id компанії", "")
        photo_url = l.get("Фото", "")

        person = None

        # First let's search for an appropriate company
        company = self.process_company(company_id, company_ipn, company_name)

        # No company — no go
        if company is None:
            continue

        # Write the company id back to the spreadsheet for further use
        if company.pk != company_id:
            company_id = company.pk
            wks.update_cell(i + 2, len(l.keys()), company.pk)

        person_name = l.get("ПІБ", "").strip()
        position = l.get("Посада", "").strip()
        person_dob = unicode(l.get("Дата народження", "")).strip()
        person_from = parse_date(l.get("Дата призначення", ""))
        person_to = parse_date(l.get("Дата звільнення", ""))
        doc_received = parse_date(l.get("Дата відповіді", ""))
        docs = l.get("Лінк на відповідь", "").strip()
        website = l.get("лінк на сайт", "").strip()

        # Now let's search for the person
        if person_name:
            last_name, first_name, patronymic, _ = parse_fullname(person_name)
            if not last_name:
                continue

            # First we search by person_id (if it's present)
            if person_id:
                try:
                    person = Person.objects.get(pk=person_id)
                except Person.DoesNotExist:
                    pass

            # If nothing is found we search by name (for now)
            if not person:
                try:
                    person = Person.objects.get(
                        first_name_uk__iexact=first_name,
                        last_name_uk__iexact=last_name,
                        patronymic_uk__iexact=patronymic)
                except Person.MultipleObjectsReturned:
                    self.stderr.write(
                        "Double person {}!".format(person_name))
                except Person.DoesNotExist:
                    pass

            # If nothing is found, let's create a record for that person
            if not person:
                person = Person()
                self.stderr.write(
                    "Created new person {}".format(person_name))

                person.first_name_uk = first_name
                person.last_name_uk = last_name
                person.patronymic_uk = patronymic
                Ua2RuDictionary.objects.get_or_create(term=first_name)
                Ua2RuDictionary.objects.get_or_create(term=last_name)
                Ua2RuDictionary.objects.get_or_create(term=patronymic)

                person.first_name_en = translitua(first_name)
                person.last_name_en = translitua(last_name)
                person.patronymic_en = translitua(patronymic)

                person.is_pep = True
                person.imported = True
                person.type_of_official = 1

            # Parsing the date (can be a full date, a year alone or
            # a year/month pair)
            if person_dob:
                person.dob = parse_date(person_dob)
                if len(person_dob) == 4:
                    person.dob_details = 2  # Only year
                if len(person_dob) > 4 and len(person_dob) < 7:
                    person.dob_details = 1  # Month and year

            # Let's download the photo (if any)
            if not person.photo and photo_url:
                photo_name, photo_san_name, photo_content = download(
                    photo_url, translitua(person_name))

                if photo_name:
                    person.photo.save(photo_san_name,
                                      ContentFile(photo_content))
                else:
                    self.stdout.write("Cannot download image %s for %s" %
                                      (photo_url, person_name))

            person.save()

            # Let's write the person id back to the table
            if person.pk != person_id:
                person_id = person.pk
                wks.update_cell(i + 2, len(l.keys()) - 1, person.pk)

            # Now let's download all supporting docs
            docs_downloaded = []
            first_doc_name = False

            # There might be many of them
            for doc in docs.split(", "):
                doc_instance = None

                # We cannot download folders from google docs,
                # so let's skip them
                if doc and "folderview" not in doc \
                        and "drive/#folders" not in doc:
                    doc = expand_gdrive_download_url(doc)
                    doc_hash = sha1(doc).hexdigest()

                    # Check if this document was already downloaded
                    try:
                        doc_instance = Document.objects.get(hash=doc_hash)
                    except Document.DoesNotExist:
                        self.stdout.write('Downloading file {}'.format(doc))

                        doc_name, doc_san_name, doc_content = download(doc)
                        doc_san_name = translitua(doc_san_name)

                        if doc_name:
                            doc_instance = Document(name_uk=doc_name,
                                                    uploader=peklun,
                                                    hash=doc_hash)
                            doc_instance.doc.save(doc_san_name,
                                                  ContentFile(doc_content))
                            doc_instance.save()
                        else:
                            self.stdout.write(
                                'Cannot download file {}'.format(doc))

                if doc_instance:
                    first_doc_name = doc_instance.name_uk
                    docs_downloaded.append(doc_instance.doc.url)

            # Now let's set up links between the person and the company
            links = Person2Company.objects.filter(
                (Q(date_established=person_from) |
                 Q(date_established=mangle_date(person_from)) |
                 Q(date_established__isnull=True)),
                (Q(date_finished=person_to) |
                 Q(date_finished=mangle_date(person_to)) |
                 Q(date_finished__isnull=True)),
                from_person=person,
                to_company=company)

            # Delete doubled links, including those cases when dates
            # were imported incorrectly because of parse_date
            if len(links) > 1:
                links.delete()

            link, _ = Person2Company.objects.update_or_create(
                from_person=person,
                to_company=company,
                date_established=person_from,
                date_established_details=0,
                date_finished=person_to,
                date_finished_details=0)

            if not link.relationship_type:
                link.relationship_type = position

            # And translate the position
            Ua2EnDictionary.objects.get_or_create(term=lookup_term(position))

            # Oh, and add links to supporting docs
            all_docs = docs_downloaded + website.split(", ")
            if all_docs:
                link.proof = ", ".join(filter(None, all_docs))

            if first_doc_name:
                link.proof_title = first_doc_name

            link.date_confirmed = doc_received
            link.is_employee = True

            link.save()
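# Note on the dedup key above: documents are deduplicated by the sha1 hex
# digest of their expanded download url, not of the file contents. A small
# self-contained illustration (hypothetical url; under Python 3 the url
# has to be encoded first):
from hashlib import sha1

doc_url = "https://example.com/response.pdf"
dedup_key = sha1(doc_url.encode("utf-8")).hexdigest()  # 40 hex characters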
def resolve_person(self, family_id):
    """
    Finds the relative mentioned in the declaration in the PEP db.
    Returns the person and a flag set to True if it was a fuzzy match.
    """
    if str(family_id) == "1":
        return self.person, False

    def _is_fuzzy_match(lastname, firstname, middlename, person_rec):
        if lastname.strip().lower() != person_rec.last_name.strip().lower():
            return True
        if firstname.strip().lower() != person_rec.first_name.strip().lower():
            return True
        if middlename.strip().lower() != person_rec.patronymic.strip().lower():
            return True
        return False

    data = self.source["nacp_orig"]
    family = data.get("step_2")

    if isinstance(family, dict):
        if not family_id or family_id not in family:
            raise CannotResolveRelativeException(
                "Cannot find person %s in the declaration %s"
                % (family_id, self.declaration_id))

        member = family[family_id]
    else:
        raise CannotResolveRelativeException(
            "Cannot find family section in the declaration %s"
            % (self.declaration_id))

    try:
        lastname = member["lastname"].strip()
        firstname = member["firstname"].strip()
        middlename = member["middlename"].strip()
    except KeyError:
        if "ukr_full_name" in member:
            lastname, firstname, middlename, _ = parse_fullname(
                member["ukr_full_name"])
        else:
            raise CannotResolveRelativeException(
                "Cannot find name of a person %s in the declaration %s"
                % (family_id, self.declaration_id))

    # Connections where the declarant is on the "from" side: exact
    # lookups first, then trigram-based fuzzy ones
    chunk1 = list(
        Person2Person.objects.filter(
            from_person_id=self.person_id,
            to_person__last_name_uk__iexact=lastname.strip(),
            to_person__first_name_uk__iexact=firstname.strip(),
            to_person__patronymic_uk__iexact=middlename.strip(),
        ).select_related("to_person")
    ) + list(
        Person2Person.objects.filter(
            from_person_id=self.person_id,
            to_person__last_name_uk__trigram_similar=lastname.strip(),
            to_person__first_name_uk__trigram_similar=firstname.strip(),
            to_person__patronymic_uk__trigram_similar=middlename.strip(),
        ).select_related("to_person"))

    # Same, with the declarant on the "to" side
    chunk2 = list(
        Person2Person.objects.filter(
            to_person_id=self.person_id,
            from_person__last_name_uk__iexact=lastname.strip(),
            from_person__first_name_uk__iexact=firstname.strip(),
            from_person__patronymic_uk__iexact=middlename.strip(),
        ).select_related("from_person")
    ) + list(
        Person2Person.objects.filter(
            to_person_id=self.person_id,
            from_person__last_name_uk__trigram_similar=lastname.strip(),
            from_person__first_name_uk__trigram_similar=firstname.strip(),
            from_person__patronymic_uk__trigram_similar=middlename.strip(),
        ).select_related("from_person"))

    if len(set(chunk1)) + len(set(chunk2)) > 1:
        raise CannotResolveRelativeException(
            "Uh, oh, more than one connection between %s and %s %s %s"
            % (self.person, lastname, firstname, middlename))

    for conn in chunk1:
        fuzzy_match = _is_fuzzy_match(lastname, firstname, middlename,
                                      conn.to_person)
        if fuzzy_match:
            logger.warning(
                "It was a fuzzy match between %s %s %s and the declarant %s"
                % (lastname, firstname, middlename, conn.to_person))

        return conn.to_person, fuzzy_match

    for conn in chunk2:
        fuzzy_match = _is_fuzzy_match(lastname, firstname, middlename,
                                      conn.from_person)
        if fuzzy_match:
            logger.warning(
                "It was a fuzzy match between %s %s %s and the declarant %s"
                % (lastname, firstname, middlename, conn.from_person))

        return conn.from_person, fuzzy_match

    raise CannotResolveRelativeException(
        "Cannot find person %s %s %s for the declarant %s"
        % (lastname, firstname, middlename, self.person))
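# The `__trigram_similar` lookups above require the pg_trgm extension on
# the PostgreSQL side and "django.contrib.postgres" in INSTALLED_APPS.
# A minimal sketch of a migration enabling the extension; the app and
# dependency names are placeholders:
from django.contrib.postgres.operations import TrigramExtension
from django.db import migrations


class Migration(migrations.Migration):
    dependencies = [("core", "0001_initial")]  # hypothetical

    operations = [TrigramExtension()]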
def create_person(self, person_name, is_pep, yob, real_run=False):
    def create_new_person():
        person = Person(
            last_name=title(last_name),
            first_name=title(first_name),
            patronymic=title(patronymic),
            is_pep=is_pep,
            type_of_official=1 if is_pep else 4,
        )

        if yob and yob > 1850:
            person.dob = dt_parse("{}-01-01".format(yob))
            person.dob_details = 2  # Only year is known

        if real_run:
            person.save()
            self.new_persons_pk.append(person.pk)

        self.persons_dict[person_name] = person
        self.persons_stats["created_total"] += 1
        return person

    qs = Person.objects.all()
    last_name, first_name, patronymic, _ = parse_fullname(person_name)

    if not last_name or not first_name:
        tqdm.write("Cannot split name: {}".format(person_name))
        return

    qs = qs.filter(last_name_uk__icontains=last_name,
                   first_name_uk__icontains=first_name)
    if patronymic:
        qs = qs.filter(patronymic_uk__icontains=patronymic)

    name_matches = qs.count()
    if name_matches == 0:
        tqdm.write("No matches for: {}. Person will be created"
                   .format(person_name))
        return create_new_person()

    for person in qs.iterator():
        # Zero-pad EDRPOU codes to 8 digits before matching
        edrpou_list = [edrpou.rjust(8, "0")
                       for edrpou in self.smida_p2c[person_name]]

        p2c_qs = Person2Company.objects.filter(
            from_person=person, to_company__edrpou__in=edrpou_list)

        if p2c_qs.count():
            tqdm.write("Matched {}. Found common P2C relation: {}, [{}]"
                       .format(person.full_name, person.url_uk,
                               " ".join([p2c.to_company.url_uk
                                         for p2c in p2c_qs.iterator()])))

            self.persons_dict[person_name] = person
            self.persons_stats["matched_resolved"] += 1

            # Update DoB
            self.update_person_dob(person, yob, real_run)
            return person

    tqdm.write("Found matches for name: {}. Person with the same name "
               "will be created.".format(person_name))
    self.persons_stats["matched_not_resolved"] += 1
    return create_new_person()
def handle(self, *args, **options):
    activate(settings.LANGUAGE_CODE)

    successful = 0
    failed = 0
    exact_matches = 0
    fuzzy_matches = 0
    connections_created = 0
    persons_created = 0

    for company in Company.objects.filter(state_company=True).exclude(
            edrpou=""):
        k = company.edrpou.lstrip("0")

        # Because the open copy of the registry has no dates and some
        # companies have more than one record, we use a heuristic here
        # to determine the latest record via the registration status
        # (statuses have "priorities")
        for order in self.status_order:
            res = EDRPOU.search().query(
                "bool",
                must=[Q("term", edrpou=k), Q("term", status=order)])
            ans = res.execute()
            if ans:
                break

        # Last attempt
        if not ans:
            res = EDRPOU.search().query("term", edrpou=k)
            ans = res.execute()

        if len(ans) > 1:
            self.stderr.write(
                "Too many companies found by code %s for the name %s, skipping"
                % (k, company))
            failed += 1
            continue

        if len(ans) == 0:
            self.stderr.write("Cannot find the company by code %s" % (k, ))
            failed += 1
            continue

        edr_company = ans[0]
        if not edr_company.head:
            self.stderr.write("Cannot find head for the company %s, (%s)"
                              % (ans[0].name, k))
            failed += 1
            continue

        successful += 1

        lastname, firstname, patronymic, _ = parse_fullname(edr_company.head)

        exact_links = Person2Company.objects.select_related(
            "from_person").filter(
                to_company_id=company.pk,
                from_person__first_name__iexact=firstname,
                from_person__last_name__iexact=lastname)
        if patronymic:
            exact_links = exact_links.filter(
                from_person__patronymic__iexact=patronymic)

        if exact_links.count():
            exact_matches += 1
            for l in exact_links:
                l.created_from_edr = True
                l.date_confirmed = edr_company.last_update
                l.date_confirmed_details = 0
                l.save()

                if l.relationship_type != "Керівник":
                    self.stdout.write(
                        "Relation %s exists but has a different type: %s"
                        % (l, l.relationship_type))
            continue
        else:
            # Fall back to matching by last name plus the initials of
            # the first name and patronymic
            fuzzy_links = Person2Company.objects.select_related(
                "from_person").filter(
                    to_company_id=company.pk,
                    from_person__last_name__iexact=lastname,
                    from_person__first_name__istartswith=firstname[0],
                )
            if patronymic:
                fuzzy_links = fuzzy_links.filter(
                    from_person__patronymic__istartswith=patronymic[0])

            if fuzzy_links:
                fuzzy_matches += 1
                for l in fuzzy_links:
                    l.created_from_edr = True
                    l.date_confirmed = edr_company.last_update
                    l.date_confirmed_details = 0
                    l.save()

                    self.stdout.write(
                        "Fuzzy match: %s vs %s"
                        % (edr_company.head, l.from_person.full_name))

                    if l.relationship_type != "Керівник":
                        self.stdout.write(
                            "Relation %s exists but has a different type: %s"
                            % (l, l.relationship_type))
                continue

        try:
            if options["real_run"]:
                person = Person.objects.create(
                    first_name=firstname,
                    last_name=lastname,
                    patronymic=patronymic,
                    is_pep=True,
                    type_of_official=1)
            persons_created += 1

            if options["real_run"]:
                Person2Company.objects.create(
                    from_person=person,
                    to_company=company,
                    relationship_type="Керівник",
                    is_employee=True,
                    created_from_edr=True,
                    date_confirmed=edr_company.last_update,
                    # TODO: decide what to do with connection proofs
                    proof_title="Інформація, отримана з ЄДР",
                )
            connections_created += 1
        except DataError:
            self.stdout.write("Cannot create a person or connection for %s"
                              % edr_company.head)

    self.stdout.write("Creation failed: %s, creation successful: %s"
                      % (failed, successful))
    self.stdout.write("Exact matches: %s, fuzzy matches: %s"
                      % (exact_matches, fuzzy_matches))
    self.stdout.write("Persons created: %s, connections created: %s"
                      % (persons_created, connections_created))
def handle(self, *args, **options):
    activate(settings.LANGUAGE_CODE)

    all_persons = []
    keys = ["pk", "key", "fullname", "has_initials",
            "last_name", "first_name", "patronymic"]

    for p in Person.objects.all().nocache().iterator():
        all_persons.append(dict(zip(keys, [
            p.pk,
            ("%s %s %s" % (
                p.last_name, p.first_name[:1], p.patronymic[:1])).lower(),
            ("%s %s %s" % (
                p.last_name, p.first_name, p.patronymic)).lower(),
            is_initial(p.first_name) or is_initial(p.patronymic),
            p.last_name,
            p.first_name,
            p.patronymic])))

        # unicode.strip here is a Python 2 idiom
        for aka in map(unicode.strip,
                       (p.also_known_as_uk or "")
                       .replace(",", "\n").split("\n")):
            if not aka:
                continue

            last_name, first_name, patronymic, _ = parse_fullname(aka)
            if not all([last_name, first_name, patronymic]):
                continue

            all_persons.append(dict(zip(keys, [
                p.pk,
                ("%s %s %s" % (
                    last_name, first_name[:1], patronymic[:1])).lower(),
                ("%s %s %s" % (
                    last_name, first_name, patronymic)).lower(),
                is_initial(first_name) or is_initial(patronymic),
                last_name,
                first_name,
                patronymic])))

    grouped_by_fullname = defaultdict(list)
    grouped_by_shortenedname = defaultdict(list)

    # First pass: exact matches by full name (even if those are given
    # with initials)
    for l in tqdm(all_persons):
        if l["has_initials"]:
            grouped_by_shortenedname[l["key"]].append(l["pk"])
        else:
            grouped_by_fullname[l["fullname"]].append(l["pk"])

    spoiled_ids = set()
    chunks_to_review = list()

    for k, v in grouped_by_fullname.items():
        if len(v) > 1:
            spoiled_ids |= set(v)
            chunks_to_review.append(v)

    for k, v in grouped_by_shortenedname.items():
        if len(v) > 1:
            spoiled_ids |= set(v)
            chunks_to_review.append(v)

    mixed_grouping = defaultdict(list)

    # Second pass: initials vs full names
    for l in tqdm(all_persons):
        if l["pk"] not in spoiled_ids and l["has_initials"]:
            mixed_grouping[l["key"]].append(l["pk"])

    for l in tqdm(all_persons):
        if (l["pk"] not in spoiled_ids and not l["has_initials"]
                and l["key"] in mixed_grouping):
            mixed_grouping[l["key"]].append(l["pk"])

    for k, v in tqdm(mixed_grouping.items()):
        if len(v) > 1:
            spoiled_ids |= set(v)
            chunks_to_review.append(v)

    # Only the first two ids of each chunk are queued for review
    for chunk in chunks_to_review:
        try:
            PersonDeduplication(
                person1_id=chunk[0],
                person2_id=chunk[1],
                person1_json=Person.objects.get(pk=chunk[0]).to_dict(),
                person2_json=Person.objects.get(pk=chunk[1]).to_dict(),
            ).save()
        except IntegrityError:
            pass

    # Third pass: fuzzy matching of the remaining full names
    candidates_for_fuzzy = [
        l for l in all_persons
        if l["pk"] not in spoiled_ids and not l["has_initials"]
    ]

    for a, b in tqdm(combinations(candidates_for_fuzzy, 2)):
        score = jaro(a["fullname"], b["fullname"])

        if score > 0.93:
            try:
                PersonDeduplication(
                    person1_id=a["pk"],
                    person2_id=b["pk"],
                    fuzzy=True,
                    person1_json=Person.objects.get(pk=a["pk"]).to_dict(),
                    person2_json=Person.objects.get(pk=b["pk"]).to_dict(),
                ).save()
            except IntegrityError:
                pass
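# A quick illustration of the 0.93 Jaro cut-off used above, assuming `jaro`
# is jellyfish.jaro_similarity (the actual import is outside this excerpt);
# the sample names are hypothetical.
from jellyfish import jaro_similarity

print(jaro_similarity(u"петренко іван", u"петренко иван"))   # ~0.95, queued for review
print(jaro_similarity(u"петренко іван", u"коваленко олег"))  # well below the cut-off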