def get_existing_before(self, profile_to_check, link_string, prof_id):
    '''
    Tell whether profile_to_check already owns a web reference whose "name"
    equals link_string.

    profile_to_check is the profile whose web references are inspected
    link_string is the name of the web reference that is looked for
    prof_id is the id, in self.db_input, of the profile receiving the task

    When the reference exists, a task is registered through
    include_task_no_duplicate on the profile prof_id, a notice is printed and
    True is returned. In any other case it returns False without recording
    anything.
    '''
    already_linked = any(web["name"] == link_string
                         for web in profile_to_check.get_all_webs())
    if not already_linked:
        return False
    #The link was existing before: the task is stored in the profile prof_id
    task_owner = self.db_input.get_profile_by_ID(prof_id)
    task_details = (MATCH_POTENTIAL_INFO_EXISTING
                    + profile_to_check.nameLifespan()
                    + " "
                    + str(profile_to_check.get_id()))
    include_task_no_duplicate(task_owner, MATCH_POTENTIAL_EXISTING, 1, task_details)
    print_out("- AVOIDING INTRODUCTION OF EXISTING LINK IN "
              + str(profile_to_check.nameLifespan()))
    return True
def _conflict_storing(self, profile_rm, conflicted_profiles_ids, db_conflict):
    '''
    Internal function to avoid duplicates. It stores a conflict of matches
    deviation.

    profile_rm is a profile kind
    conflicted_profiles_ids is a list of ids with conflict
    db_conflict is the database used to obtain the names of the conflicted
    profiles for the printed message

    The ids are stored in self.conflict_profiles under the id of profile_rm
    and a task listing the urls of the conflicted profiles (read from
    self.database_geni) is registered with include_task_no_duplicate.
    '''
    #Names of all the conflicted profiles, each one followed by a blank
    conflicted_names = "".join(
        str(db_conflict.get_profile_by_ID(conflicted_id).nameLifespan()) + " "
        for conflicted_id in conflicted_profiles_ids)
    print_out("- CONFLICT of profile " + str(profile_rm.nameLifespan())
              + " WITH PROFILE(S) " + conflicted_names)
    #This is a conflict, we should have a single match!!!
    self.conflict_profiles[profile_rm.get_id()] = conflicted_profiles_ids
    task_details = MATCH_CONFLICT_INFO + "".join(
        self.database_geni.get_profile_by_ID(conflicted_id).get_this_profile_url() + " "
        for conflicted_id in conflicted_profiles_ids)
    include_task_no_duplicate(profile_rm, MATCH_CONFLICT_TASK, 1, task_details)
def process(self, profiles_2_analyze="all", storage=False, threshold=360, avoid_import_living_from=["input", "check"]):
    '''
    Determines the full database and detects potential matches

    profiles_2_analyze = an array of profiles for the first database which
    shall be analyzed, in case of no input it will analyze all profiles.
    storage is forwarded to get_research_log_id.
    threshold is forwarded to continue_match.
    avoid_import_living_from will be a list with either input or check, it
    will not import the living from that database. The default list is only
    read, it is never modified here.

    For every profile of the input database holding a link to the check
    database, the relatives are matched with match_single_profile and the
    non matched parents, partners and children of each side are added to the
    other database. The result is written with record_research_log.
    Nothing is returned.
    '''
    kind_match = self.db_check.get_db_kind()
    match_str = str(kind_match) + MATCH
    matcher_profiles = match_single_profile(
        self.db_input,
        self.db_check,
        data_language=self.data_language,
        name_convention=self.name_convention)
    linked_profiles = {}
    for prof in self.db_input.get_all_profiles():
        if prof.get_specific_web(kind_match):
            linked_profiles[prof.get_id()] = prof
    print_out(PROCESS_MATCH_NUMBER_OF_IMPACTS_BEGIN + str(kind_match)
              + PROCESS_MATCH_NUMBER_OF_IMPACTS_END + str(len(linked_profiles)))
    for prof_id in linked_profiles:
        if (profiles_2_analyze != "all") and (prof_id not in profiles_2_analyze):
            continue
        prof_in_study = linked_profiles[prof_id]
        if not continue_match(prof_in_study, match_str, threshold=threshold):
            print_out("SKIPPING " + prof_in_study.nameLifespan(), log_level=15)
            continue
        prof_linked_id = prof_in_study.get_specific_web(kind_match)["url"]
        prof_linked = self.db_check.get_profile_by_ID(prof_linked_id)
        #In order to get the marriage, first we get the family from the other database
        _, family_profile = self.db_check.get_family_from_child(prof_linked.get_id())
        _, family_input = self.db_input.get_family_from_child(prof_id)
        #We will only analyze the profiles which have been introduced
        loc_research = get_research_log_id(prof_in_study, storage=storage)
        (non_matched_profiles_input, non_matched_profiles_check,
         conflict_profiles, matched_profiles) = matcher_profiles.match(prof_id)
        #We create the following slots that will be used for the introduction of confirmed candidates inside the database
        check_non_matches_existing_in_input = {}
        input_non_matches_existing_in_check = {}
        #One entry per side: the pending profiles of that database, the database
        #itself, the database receiving the new profiles and the ids involved
        temp_data = {
            "input": {
                "non_matches": non_matched_profiles_input,
                "database": self.db_input,
                "other_db": self.db_check,
                "family_is_child": family_input,
                "current_id_to_add": prof_linked_id,
                "current_id_matched": prof_id,
                "existing_prof": input_non_matches_existing_in_check
            },
            "check": {
                "non_matches": non_matched_profiles_check,
                "database": self.db_check,
                "other_db": self.db_input,
                "family_is_child": family_profile,
                "current_id_to_add": prof_id,
                "current_id_matched": prof_linked.get_id(),
                "existing_prof": check_non_matches_existing_in_input
            }
        }
        #REMOVAL OF LIVING
        #If the option is selected to not publish data that is restricted like living we remove the matched characters
        for kind_db in temp_data:
            side = temp_data[kind_db]
            #We iterate a temporal copy of the keys as we remove data from the dictionary
            for prof in list(side["non_matches"]):
                potential_profile = side["database"].get_profile_by_ID(prof)
                if (kind_db in avoid_import_living_from) and potential_profile.getLiving():
                    del side["non_matches"][prof]
                    print_out("- AVOIDING INTRODUCTION OF LIVING "
                              + str(potential_profile.nameLifespan())
                              + " FROM THE DATABASE "
                              + side["database"].get_db_kind())
        #Prior to starting the overall copy/match, we should check first if potential candidates exists in the input database
        temp_checking = dict(non_matched_profiles_check)
        for prof in temp_checking:
            potential_profile = self.db_check.get_profile_by_ID(prof)
            #MATCHING: we look if the profile also exists before
            matches = self.db_input.get_potential_profile_match(
                potential_profile,
                data_language=self.data_language,
                name_convention=self.name_convention)
            #Profiles might have been matched before
            Existing_found = False
            duplicate_names = ""
            for candidate_prof in matches.keys():
                existing_prof = self.db_input.get_profile_by_ID(candidate_prof)
                if (existing_prof.getName() != NOT_KNOWN_VALUE) and (existing_prof.getSurname() != NOT_KNOWN_VALUE):
                    #We only continue looking for alternatives if there is no uncertain information in the profile
                    duplicate_names += existing_prof.nameLifespan() + " "
                    for web_data in existing_prof.get_all_webs():
                        #We now know that the profiles are the same in this database, we can proceed to add this existing one
                        if web_data["url"] == potential_profile.get_this_profile_url():
                            #We store the link of a profile id in check to a profile existing in input (profile class)
                            check_non_matches_existing_in_input[prof] = existing_prof
                            Existing_found = True
                    if matches[candidate_prof]["score*factor"] >= FACTOR_DUPLICATE:
                        check_non_matches_existing_in_input[prof] = existing_prof
                        Existing_found = True
            if (len(matches.keys()) > 0) and not Existing_found:
                #In this case we have a certain potential match. We should not add and rather, leave it for checking by a human being
                del non_matched_profiles_check[prof]
                print_out("- POTENTIAL DUPLICATE of profile "
                          + str(potential_profile.nameLifespan())
                          + " WITH PROFILE(S) ID(S) " + str(duplicate_names))
                #This is a conflict, we should avoid duplicating job of checking and solving
                conflict_profiles[prof] = matches
                details_info = ("Potential existing duplicates for profile "
                                + potential_profile.nameLifespan()
                                + " with web "
                                + potential_profile.get_this_profile_url()
                                + " in the profiles: ")
                for ids_check in matches:
                    temp_prof = self.db_input.get_profile_by_ID(ids_check)
                    details_info += str(ids_check) + " : " + temp_prof.nameLifespan() + " "
                include_task_no_duplicate(prof_in_study, MATCH_POTENTIAL_DUPLICATE, 1, details_info)
        ###################
        #MATCH INTRODUCTION
        ###################
        for db_kind in temp_data:
            #CASE MATCH: Missing a parent or several parents.
            non_match_now = temp_data[db_kind]["non_matches"]
            db_now = temp_data[db_kind]["database"]
            db_other = temp_data[db_kind]["other_db"]
            family_is_child = temp_data[db_kind]["family_is_child"]
            current_id_to_add = temp_data[db_kind]["current_id_to_add"]
            #The already existing profiles are taken from the side being iterated (db_kind),
            #not from the variable left over by a previous loop
            existing_now = temp_data[db_kind]["existing_prof"]
            if ("father" not in non_match_now.values()) and ("mother" not in non_match_now.values()):
                continue
            father_id = None
            mother_id = None
            father_profile = None
            mother_profile = None
            #We shall first obtain the id of the profiles from the database of this side
            if "father" in non_match_now.values():
                father_id = list(non_match_now.keys())[list(non_match_now.values()).index("father")]
                father_profile = db_now.get_profile_by_ID(father_id)
            if "mother" in non_match_now.values():
                mother_id = list(non_match_now.keys())[list(non_match_now.values()).index("mother")]
                mother_profile = db_now.get_profile_by_ID(mother_id)
            Intro_sentence = MATCH_ADDING_PROFILES
            if (father_id in existing_now) or (mother_id in existing_now):
                Intro_sentence = MATCH_EXISTING_PROFILES
            #We also add the link!
            if father_profile and mother_profile:
                Intro_sentence += (father_profile.nameLifespan() + AND_STRING
                                   + mother_profile.nameLifespan() + TO_STRING
                                   + db_other.get_db_kind())
            elif father_profile:
                Intro_sentence += father_profile.nameLifespan() + TO_STRING + db_other.get_db_kind()
            elif mother_profile:
                Intro_sentence += mother_profile.nameLifespan() + TO_STRING + db_other.get_db_kind()
            #We inform of the inclusion of the new profiles
            print_out(Intro_sentence)
            marriage_event = family_is_child.getMarriage()
            #If the parent was existing before, we avoid double introduction
            father_to_add = existing_now.get(father_id, father_profile)
            mother_to_add = existing_now.get(mother_id, mother_profile)
            if father_to_add and mother_to_add:
                #So.. we add the new profiles
                new_father_id, new_mother_id, _ = db_other.add_parents(
                    child_profile_id=current_id_to_add,
                    father_profile=father_to_add,
                    mother_profile=mother_to_add,
                    marriage_event=marriage_event)
            elif father_to_add:
                #So.. we just add the father
                new_father_id, new_mother_id, _ = db_other.add_parents(
                    child_profile_id=current_id_to_add,
                    father_profile=father_to_add,
                    marriage_event=marriage_event)
            elif mother_to_add:
                #So.. we just add the mother
                new_father_id, new_mother_id, _ = db_other.add_parents(
                    child_profile_id=current_id_to_add,
                    mother_profile=mother_to_add,
                    marriage_event=marriage_event)
            if db_kind == "check":
                if father_profile:
                    self.add_match_to_prof(new_father_id, father_profile)
                if mother_profile:
                    self.add_match_to_prof(new_mother_id, mother_profile)
            #We remove the profiles, as will be added
            if father_id:
                del non_match_now[father_id]
            if mother_id:
                del non_match_now[mother_id]
        #PARTNERS: Review of partners for inclusion
        partners_input = self.db_input.get_partners_from_profile(prof_id)
        #Input partner id -> check partner id
        matched_partners = {}
        for partner_input in partners_input:
            if partner_input in matched_profiles:
                matched_partners[partner_input] = matched_profiles[partner_input]
        for kind_db in temp_data:
            side = temp_data[kind_db]
            #We iterate a temporal copy of the keys as we remove data from the dictionary
            for prof in list(side["non_matches"]):
                #We might delete some profiles (children) in the middle, that's why we check first if the profile is in the list
                if (prof not in side["non_matches"]) or side["non_matches"][prof] != "partner":
                    continue
                partner_profile = side["database"].get_profile_by_ID(prof)
                #Checking if it is already having a link to the database
                existing_link = self.get_existing_before(
                    partner_profile, self.db_check.get_db_kind(), prof_id)
                #Now, if the partner is not accessible due to data restriction, we will skip this step
                if partner_profile.get_accessible() and not existing_link:
                    #Good, we will need now to add the new partner to the other database
                    Intro_sentence = PROCESS_ADD_PROFILE_BEGIN
                    if prof in side["existing_prof"]:
                        Intro_sentence = PROCESS_LINK_PROFILE_BEGIN
                    print_out(Intro_sentence + partner_profile.nameLifespan()
                              + TO_STRING + side["other_db"].get_db_kind()
                              + PROCESS_ADD_PROFILE_END + side["non_matches"][prof])
                    family_check = side["database"].get_family_from_partners(
                        side["current_id_matched"], partner_profile.get_id())
                    marriage_event = side["database"].get_family_by_ID(family_check).getMarriage()
                    #The profile might be existing before and we do not need to add it again, so we have a potential deviation
                    #If we stored it before we use the existing profile, if not, we continue with the one in the other database
                    partner_to_introduce = side["existing_prof"].get(prof, partner_profile)
                    id_partner, _ = side["other_db"].add_partner(
                        side["current_id_to_add"],
                        partner_to_introduce,
                        marriage=marriage_event)
                    if kind_db == "input":
                        self.add_match_to_prof(
                            prof,
                            side["other_db"].get_profile_by_ID(id_partner),
                            adding=False)
                        matched_partners[prof] = id_partner
                    elif kind_db == "check":
                        self.add_match_to_prof(id_partner, partner_profile)
                        matched_partners[id_partner] = prof
                elif not partner_profile.get_accessible():
                    print_out("- AVOIDING INTRODUCTION OF LIVING "
                              + str(partner_profile.nameLifespan())
                              + " FROM THE DATABASE "
                              + side["database"].get_db_kind())
                    family_eliminate = side["database"].get_family_from_partners(
                        side["current_id_matched"], partner_profile.get_id())
                    #As the partner is not known and we might have access issues,
                    #we stop the review of those children in the non accessible partner.
                    #The children are asked to the database, in the same way it is done
                    #below for the introduction of children
                    for child in side["database"].get_children_from_family(family_eliminate):
                        if child in side["non_matches"]:
                            del side["non_matches"][child]
                #We remove also from the non-matching pending those profiles that have been skipped due to privacy
                if not existing_link:
                    del side["non_matches"][partner_profile.get_id()]
        #We go ahead looking first for each matched partner
        for partner_input in matched_partners:
            partner_check = matched_partners[partner_input]
            #We need the family which will be the input for the children
            family_part_input = self.db_input.get_family_from_partners(
                prof_id, partner_input)
            family_part_check = self.db_check.get_family_from_partners(
                prof_linked.get_id(), partner_check)
            #Family of the other database, where the children of each side are added
            family_part = {
                "input": family_part_check,
                "check": family_part_input
            }
            #Family of the own database, where the children of each side are read
            family_current = {
                "input": family_part_input,
                "check": family_part_check
            }
            #INTRODUCTION: CHILDREN in the family
            for kind_db in temp_data:
                side = temp_data[kind_db]
                #We iterate a temporal copy as we remove data from the dictionary
                for prof in dict(side["non_matches"]):
                    #We will only select those profiles which are children of this family
                    if side["non_matches"][prof] != "child":
                        continue
                    if prof not in side["database"].get_children_from_family(family_current[kind_db]):
                        continue
                    child_profile = side["database"].get_profile_by_ID(prof)
                    #Ok, if the profile is accessible, we go ahead for creation
                    if child_profile.get_accessible():
                        Intro_sentence = PROCESS_ADD_PROFILE_BEGIN
                        if prof in side["existing_prof"]:
                            Intro_sentence = PROCESS_LINK_PROFILE_BEGIN
                        print_out(Intro_sentence + child_profile.nameLifespan()
                                  + TO_STRING + side["other_db"].get_db_kind()
                                  + PROCESS_ADD_PROFILE_END + side["non_matches"][prof])
                        #If the child was existing before, we add it directly with the right profile, avoiding double introduction
                        child_to_add = side["existing_prof"].get(prof, child_profile)
                        child_new_ids = side["other_db"].add_child(
                            family_part[kind_db], [child_to_add])
                        if kind_db == "input":
                            child_new_prof = self.db_check.get_profile_by_ID(child_new_ids[0])
                            self.add_match_to_prof(prof, child_new_prof, adding=False)
                        elif kind_db == "check":
                            self.add_match_to_prof(child_new_ids[0], child_profile)
                    else:
                        print_out(child_profile.nameLifespan() + PROCESS_NO_ACCESS)
                    del side["non_matches"][prof]
        ################################################################
        if not (non_matched_profiles_input or non_matched_profiles_check or conflict_profiles):
            #In this case, we have achieved the full matching, we store the information
            today = datetime.date.today().toordinal()
            notes_toadd = STATUS_MATCHED + str(today)
        else:
            notes_toadd = (STATUS_TO_CHECK + " " * 10 +
                           "--Missing match input " +
                           str(non_matched_profiles_input) + " " * 10 +
                           "--Missing match check " +
                           str(non_matched_profiles_check) + " " * 10 +
                           "--Pending conflicts " + str(conflict_profiles))
        record_research_log(prof_in_study, match_str, loc_research, prof_linked_id, notes_toadd)
def match(self, profile_ID):
    '''
    It executes the match of the relatives of profile_ID, which is assumed to
    contain a web link of the kind given by self.database_geni.get_db_kind()

    It will:
    - Update the stored web link when the linked profile reports another url
    - Compare father, mother, partners and children of both profiles
    - Return, when the link exists, the following:
        - The non-matched profiles of self.database with their relationship
        - The non-matched profiles of self.database_geni with their relationship
        - A dictionary of conflicts requesting review
        - A dictionary of matched profiles
    - Return False (after logging MATCH_PROFILE_ERROR) when there is no link
    '''
    #Initialization of the different matcher functions
    self._init_tracking_logs()
    profile_rm = self.database.get_profile_by_ID(profile_ID)
    print_out(str(profile_ID) + " = " + profile_rm.nameLifespan())
    #We confirm it is a valid profile, it should contain a match.
    #All references are visited, so the last one of the right kind is kept
    url = None
    confirmed = False
    for web_ref in profile_rm.get_all_webs():
        if web_ref["name"] != self.database_geni.get_db_kind():
            continue
        confirmed = True
        url = web_ref["url"]
    if not confirmed:
        logging.error(MATCH_PROFILE_ERROR)
        return False
    #This is the profile for analysis
    profile_geni = self.database_geni.get_profile_by_ID(url)
    #We might have an address that has been updated, we double check for updating it in DB
    if url != profile_geni.get_this_profile_url():
        profile_rm.update_web_ref(url = profile_geni.get_this_profile_url(),
                                  name = self.database_geni.get_db_kind())
        print_out("UPDATING " + self.database_geni.get_db_kind() + " LINK to "
                  + profile_geni.get_this_profile_url())

    def review_parent(parent_rm, parent_geni, relation):
        #Both present: potential match. Only one present: stored as non matched
        #in its own side. None present: no match is needed.
        if parent_rm and parent_geni:
            self._match_single_pair(parent_rm, parent_geni)
        elif parent_rm:
            self.non_matched_profiles_rm[parent_rm.get_id()] = relation
            print_out("- NO MATCH of profile in " + self.database_geni.get_db_kind()
                      + " " + str(parent_rm.nameLifespan()) + " Relation = " + relation)
        elif parent_geni:
            self.non_matched_profiles_geni[parent_geni.get_id()] = relation
            print_out("- NO MATCH of profile in " + self.database.get_db_kind()
                      + " " + str(parent_geni.nameLifespan()) + " Relation = " + relation)

    #FATHER
    _, father_rm = self.database.get_father_from_child(profile_rm.get_id())
    _, father_geni = self.database_geni.get_father_from_child(profile_geni.get_id())
    review_parent(father_rm, father_geni, FATHER)
    #MOTHER
    _, mother_rm = self.database.get_mother_from_child(profile_rm.get_id())
    _, mother_geni = self.database_geni.get_mother_from_child(profile_geni.get_id())
    review_parent(mother_rm, mother_geni, MOTHER)
    #PARTNERS
    self._track_2_lists(
        self.database.get_partners_from_profile(profile_rm.get_id()),
        self.database_geni.get_partners_from_profile(profile_geni.get_id()),
        PARTNER)
    #CHILDREN
    self._track_2_lists(
        self.database.get_all_children(profile_ID),
        self.database_geni.get_all_children(url),
        CHILD)
    return (self.non_matched_profiles_rm, self.non_matched_profiles_geni,
            self.conflict_profiles, self.matched_profiles)
def _track_2_lists(self, profiles_rm, profiles_geni, kind_of_match):
    '''
    Function used for both partners and children as is a common function

    profiles_rm is an iterable of ids of profiles in self.database
    profiles_geni is an iterable of ids of profiles in self.database_geni
    kind_of_match is the relationship stored for the non matched profiles

    Each profile of profiles_rm is compared with comparison_score against the
    profiles of profiles_geni still pending. The outcome is written in
    self.non_matched_profiles_rm, self.non_matched_profiles_geni and
    self.conflict_profiles; single matches are handed to _match_single_pair.
    Nothing is returned.
    '''
    #We store here the profiles of profiles_geni that have not been identified yet
    profiles_not_identified = list(profiles_geni)
    conflict_potential_dictionary = {}
    for rm_id in profiles_rm:
        #We might be in a situation where very small similarities might create confusion of profiles, we store the previous score
        previous_score = 0
        conflict_match = False
        profile_rm = self.database.get_profile_by_ID(rm_id)
        geni_matches = []
        #We iterate a copy, the pending list is modified inside the loop
        for geni_id in list(profiles_not_identified):
            profile_geni = self.database_geni.get_profile_by_ID(geni_id)
            score, factor = profile_rm.comparison_score(profile_geni, self.data_language, self.name_convention)
            if score*factor > self.threshold:
                #OPTIONS:
                # 1.New profile is the right one
                # 2.New profile is the first one
                # 3.New profile is not the right one, but is the previous
                # 4.New profile is as bad as the others.
                #Option 1
                if score*factor > 3*previous_score:
                    #The previous candidates go back to the pending list
                    recover_profs = list(geni_matches)
                    geni_matches = [geni_id]
                    profiles_not_identified += recover_profs
                    previous_score = score*factor
                    if geni_id in profiles_not_identified:
                        profiles_not_identified.remove(geni_id)
                #Option 2
                elif len(geni_matches) == 0:
                    geni_matches.append(geni_id)
                    previous_score = score*factor
                    if geni_id in profiles_not_identified:
                        profiles_not_identified.remove(geni_id)
                #Option 3
                elif previous_score >= 3*score*factor:
                    #In this case we ignore... we keep the previous one
                    pass
                #Option 4
                else:
                    #NOTE(review): the candidate is taken out of the pending list but it is
                    #not added to geni_matches, so the "more than one match" branch below
                    #cannot be reached from here - confirm whether this is intended
                    if score*factor > previous_score:
                        previous_score = score*factor
                    if geni_id in profiles_not_identified:
                        profiles_not_identified.remove(geni_id)
            elif score >= 3*self.threshold:
                conflict_match = True
                #This is a common case, where profiles have a minimum difference but still relevant, user to check
                conflict_potential_dictionary.setdefault(rm_id, []).append(geni_id)
        #If there is a single match, whatever other conditions, we introduce as a match
        if len(geni_matches) == 1:
            self._match_single_pair(profile_rm, self.database_geni.get_profile_by_ID(geni_matches[0]))
            #In case there has been found also a conflict, the conflict is no longer needed, as we do have a match.
            if rm_id in conflict_potential_dictionary:
                del conflict_potential_dictionary[rm_id]
        #If there is no single match, we can have several options...
        elif (len(geni_matches) == 0) and (not conflict_match):
            self.non_matched_profiles_rm[rm_id] = kind_of_match
            print_out("- NO MATCH of profile in " + self.database_geni.get_db_kind() + " " + str(profile_rm.nameLifespan()) + " Relation = " + kind_of_match)
        #Or we have more than one match... that is a conflict
        elif len(geni_matches) > 1:
            self._conflict_storing(profile_rm, geni_matches, self.database_geni)
    #We might have some profiles that have been already identified on the other side but stored
    #as potential conflicts. We shall remove them. Each list is rebuilt in one go: removing
    #elements from a list while it is being iterated would skip the element that follows
    for pending_conflicts in conflict_potential_dictionary.values():
        pending_conflicts[:] = [geni_conflict_key for geni_conflict_key in pending_conflicts
                                if geni_conflict_key not in self.matched_profiles.values()]
    #The profiles left, are either a match, or we have found again gaps of profiles not found
    for rm_id_final_confliced in conflict_potential_dictionary.keys():
        profile_rm_new = self.database.get_profile_by_ID(rm_id_final_confliced)
        #If the list in the dictionary is empty, we do have a missing profile
        if conflict_potential_dictionary[rm_id_final_confliced] == []:
            #This profile was having a potential conflict that ended being actually an empty profile, it is a missing match
            self.non_matched_profiles_rm[rm_id_final_confliced] = kind_of_match
            print_out("- NO MATCH of profile in " + self.database_geni.get_db_kind() + " " + str(profile_rm_new.nameLifespan()) + " Relation = " + kind_of_match)
        else:
            #Ok, in this case we have a potential match with an actual conflict
            details_info = "- CONFLICT POTENTIAL MATCH " + str(profile_rm_new.nameLifespan()) + " with the following: "
            address_list = []
            for profile_id in conflict_potential_dictionary[rm_id_final_confliced]:
                #We remove conflicted profiles from the matching step
                if profile_id in profiles_not_identified:
                    profiles_not_identified.remove(profile_id)
                profile = self.database_geni.get_profile_by_ID(profile_id)
                address_list.append(profile.get_this_profile_url())
                details_info += str(profile.nameLifespan()) + " Relation = " + kind_of_match
            include_task_no_duplicate(profile_rm_new, MATCH_CONFLICT_TASK, 1, details_info)
            print_out(details_info)
            self.conflict_profiles[rm_id_final_confliced] = address_list
    #Now, we are able to detect those profiles on the "RIGHT" side not linked to any other
    for missing_prof in profiles_not_identified:
        self.non_matched_profiles_geni[missing_prof] = kind_of_match
        prof = self.database_geni.get_profile_by_ID(missing_prof)
        print_out("- NO MATCH of profile in " + self.database.get_db_kind() + " " + str(prof.nameLifespan()) + " Relation = " + kind_of_match)
def _track_2_lists(self, profiles_rm, profiles_geni, kind_of_match):
    '''
    Function used for both partners and children as is a common function

    profiles_rm is an iterable of ids of profiles in self.database
    profiles_geni is an iterable of ids of profiles in self.database_geni
    kind_of_match is the relationship stored for the non matched profiles

    The profiles of profiles_geni are obtained in a single call. For each
    profile of profiles_rm the profile pointed by its stored url is tried
    first and, only if that check is not successful, the rest of pending
    profiles are compared with comparison_score. The outcome is written in
    self.non_matched_profiles_rm, self.non_matched_profiles_geni and
    self.conflict_profiles; single matches are handed to _match_single_pair.
    Nothing is returned.
    '''
    #We store here the profiles of profiles_geni that have not been identified yet
    profiles_not_identified = list(profiles_geni)
    #We will create here the dictionary of the addresses: url -> id of the profile
    dict_address = {}
    total_prof = self.database_geni.get_several_profile_by_ID(profiles_not_identified)
    for prof in total_prof:
        dict_address[total_prof[prof].get_this_profile_url()] = total_prof[prof].get_id()
    conflict_potential_dictionary = {}
    for rm_id in profiles_rm:
        #We might be in a situation where very small similarities might create confusion of profiles, we store the previous score
        previous_score = 0
        conflict_match = False
        geni_matches = []
        profile_rm = self.database.get_profile_by_ID(rm_id)
        url_rm_now = profile_rm.get_specific_web(self.database_geni.get_db_kind()).get("url", None)
        if url_rm_now in dict_address:
            #In this case we have a match, and potentially will be the same profile, we avoid several checks by using one
            linked_id = dict_address[url_rm_now]
            profile_geni = total_prof[linked_id]
            score, factor = profile_rm.comparison_score(profile_geni, self.data_language, self.name_convention)
            if score*factor > self.threshold:
                #We confirm that this link is correct, so we avoid doing the complete loop...
                geni_matches = [linked_id]
                if linked_id in profiles_not_identified:
                    profiles_not_identified.remove(linked_id)
        #We only continue if the check of the url was not successful
        if len(geni_matches) == 0:
            #We iterate a copy, the pending list is modified inside the loop
            for geni_id in list(profiles_not_identified):
                #We have already obtained all profiles above
                profile_geni = total_prof[geni_id]
                score, factor = profile_rm.comparison_score(profile_geni, self.data_language, self.name_convention)
                if score*factor > self.threshold:
                    #OPTIONS:
                    # 1.New profile is the right one
                    # 2.New profile is the first one
                    # 3.New profile is not the right one, but is the previous
                    # 4.New profile is as bad as the others.
                    #Option 1
                    if score*factor > 2.5*previous_score:
                        #The previous candidates go back to the pending list
                        recover_profs = list(geni_matches)
                        geni_matches = [geni_id]
                        profiles_not_identified += recover_profs
                        previous_score = score*factor
                        if geni_id in profiles_not_identified:
                            profiles_not_identified.remove(geni_id)
                    #Option 2
                    elif len(geni_matches) == 0:
                        geni_matches.append(geni_id)
                        previous_score = score*factor
                        if geni_id in profiles_not_identified:
                            profiles_not_identified.remove(geni_id)
                    #Option 3
                    elif previous_score >= 2.5*score*factor:
                        #In this case we ignore... we keep the previous one
                        pass
                    #Option 4
                    else:
                        #NOTE(review): the candidate is taken out of the pending list but it is
                        #not added to geni_matches, so the "more than one match" branch below
                        #cannot be reached from here - confirm whether this is intended
                        if score*factor > previous_score:
                            previous_score = score*factor
                        if geni_id in profiles_not_identified:
                            profiles_not_identified.remove(geni_id)
                elif score >= 3*self.threshold:
                    conflict_match = True
                    #This is a common case, where profiles have a minimum difference but still relevant, user to check
                    conflict_potential_dictionary.setdefault(rm_id, []).append(geni_id)
        #Options in place with the current match of url
        # No existing url => we ignore
        # Existing so...
        #   - The match is the same. => NO ACTION. Covered by code
        #   - The match is different => NO ACTION. Covered by code double match will be created with warning
        #   - There is no match => ACTION. Include a conflict warning
        #
        #If there is a single match, whatever other conditions, we introduce as a match
        if len(geni_matches) == 1:
            self._match_single_pair(profile_rm, self.database_geni.get_profile_by_ID(geni_matches[0]))
            #In case there has been found also a conflict, the conflict is no longer needed, as we do have a match.
            if rm_id in conflict_potential_dictionary:
                del conflict_potential_dictionary[rm_id]
        #If there is no single match, we can have several options...
        elif (len(geni_matches) == 0) and (not conflict_match):
            if url_rm_now:
                #This is the only option where we are going to generate a conflict, as existing before!
                print_out("- CONFLICT of profile " + str(profile_rm.nameLifespan()) + MATCH_PREVIOUS_MATCH)
                details = MATCH_CONFLICT_URL_MESSAGE + self.current_match + " as " + kind_of_match
                include_task_no_duplicate(profile_rm, MATCH_CONFLICT_URL_EXISTING, 1, details)
                #We move the profile to conflicted ones
                self.conflict_profiles[profile_rm.get_id()] = []
            else:
                self.non_matched_profiles_rm[rm_id] = kind_of_match
                print_out("- NO MATCH of profile in " + self.database_geni.get_db_kind() + " " + str(profile_rm.nameLifespan()) + " Relation = " + kind_of_match)
        #Or we have more than one match... that is a conflict
        elif len(geni_matches) > 1:
            self._conflict_storing(profile_rm, geni_matches, self.database_geni)
    #We might have some profiles that have been already identified on the other side but stored
    #as potential conflicts. We shall remove them. Each list is rebuilt in one go: removing
    #elements from a list while it is being iterated would skip the element that follows
    for pending_conflicts in conflict_potential_dictionary.values():
        pending_conflicts[:] = [geni_conflict_key for geni_conflict_key in pending_conflicts
                                if geni_conflict_key not in self.matched_profiles.values()]
    #The profiles left, are either a match, or we have found again gaps of profiles not found
    for rm_id_final_confliced in conflict_potential_dictionary.keys():
        profile_rm_new = self.database.get_profile_by_ID(rm_id_final_confliced)
        #If the list in the dictionary is empty, we do have a missing profile
        if conflict_potential_dictionary[rm_id_final_confliced] == []:
            #This profile was having a potential conflict that ended being actually an empty profile, it is a missing match
            self.non_matched_profiles_rm[rm_id_final_confliced] = kind_of_match
            print_out("- NO MATCH of profile in " + self.database_geni.get_db_kind() + " " + str(profile_rm_new.nameLifespan()) + " Relation = " + kind_of_match)
        else:
            #Ok, in this case we have a potential match with an actual conflict
            details_info = "- CONFLICT POTENTIAL MATCH " + str(profile_rm_new.nameLifespan()) + " with the following: "
            address_list = []
            for profile_id in conflict_potential_dictionary[rm_id_final_confliced]:
                #We remove conflicted profiles from the matching step
                if profile_id in profiles_not_identified:
                    profiles_not_identified.remove(profile_id)
                profile = self.database_geni.get_profile_by_ID(profile_id)
                address_list.append(profile.get_this_profile_url())
                details_info += str(profile.nameLifespan()) + " Relation = " + kind_of_match
            include_task_no_duplicate(profile_rm_new, MATCH_CONFLICT_TASK, 1, details_info)
            print_out(details_info)
            self.conflict_profiles[rm_id_final_confliced] = address_list
    #Now, we are able to detect those profiles on the "RIGHT" side not linked to any other
    for missing_prof in profiles_not_identified:
        self.non_matched_profiles_geni[missing_prof] = kind_of_match
        prof = self.database_geni.get_profile_by_ID(missing_prof)
        print_out("- NO MATCH of profile in " + self.database.get_db_kind() + " " + str(prof.nameLifespan()) + " Relation = " + kind_of_match)
def execute_sync(self, profiles2analyze="all", threshold=360, storage=False):
    '''
    This is the core function, it will execute the global sync of profiles
    between primary (self.dbp) and secondary (self.dbs) database

    profiles2analyze is either "all" or the ids of the primary profiles to
    be reviewed
    threshold is forwarded to continue_execution_step and is also the number
    of days under which a modification is considered recent
    storage is forwarded to get_research_log_id

    For each primary profile linked to the secondary database, the events
    existing only in one of the two profiles are introduced in the other one
    with setNewEvent (events in ARRAY_EVENTS are only reported) and the
    review is written with record_research_log. Nothing is returned.
    '''
    #We only ask for the complete list of profiles when it is really needed
    if profiles2analyze == "all":
        list_prof = self.dbp.get_all_profiles()
    else:
        list_prof = self.dbp.get_several_profile_by_ID(profiles2analyze).values()
    kind_match = self.dbs.get_db_kind()
    match_str = str(kind_match) + MATCH
    for prof in list_prof:
        web_link = prof.get_specific_web(kind_match)
        if not (web_link and continue_execution_step(prof, match_str, STATUS_SYNC, threshold=threshold)):
            continue
        #Obtain the secondary database profile needed
        prof_second = self.dbs.get_profile_by_ID(web_link["url"])
        #We obtain the latest update time
        now = datetime.now()
        update_primary = now - prof.get_update_datetime()
        update_secondary = now - prof_second.get_update_datetime()
        minimum_diff = min(update_primary.days, update_secondary.days)
        #Now, we only review in case the modification date is recent or there has not been any review before
        if (prof.get_research_item_by_name(match_str) is not None) and minimum_diff >= threshold:
            continue
        #We inform by the command line that we are analyzing one profile
        print_out(str(prof.get_id()) + " : " + prof.nameLifespan())
        #We check all the events of the profile in the dictionary
        events_primary = prof.getEventsDict()
        events_secondary = prof_second.getEventsDict()
        #The missing events keep the order of the dictionary they come from,
        #so they are introduced in the same order in every execution
        events_only_in_primary = [event_id for event_id in events_primary
                                  if event_id not in events_secondary]
        events_only_in_secondary = [event_id for event_id in events_secondary
                                    if event_id not in events_primary]
        #Destination profile, events to introduce, source of the events, destination database
        sync_plan = (
            (prof, events_only_in_secondary, events_secondary, self.dbp),
            (prof_second, events_only_in_primary, events_primary, self.dbs),
        )
        for prof_destination, events2introduce, events_dict, db_destiny in sync_plan:
            #As we know, we iterate on those events we know we need to introduce
            for event_id in events2introduce:
                if event_id in ARRAY_EVENTS:
                    print_out("NOT IMPLEMENTED FOR " + event_id)
                else:
                    #We obtain the new event class
                    print_out(SYNC_NEW_EVENT + event_id + SYNC_IN_DB + db_destiny.get_db_kind())
                    prof_destination.setNewEvent(events_dict[event_id])
        #We store the exercise performed
        loc_research = get_research_log_id(prof, storage=storage)
        today = datetime.now().toordinal()
        notes_toadd = STATUS_SYNC + str(today)
        record_research_log(prof, match_str, loc_research, "", notes_toadd)