def analyze_author(self, doc_unified: str, author: (str, str)):
    """
    Record that an author listed on a document participated in it.

    The author name is unified with ``unify_profile_name``. When the unified
    name belongs neither to a known profile nor to an already registered
    unknown profile, a ``CacheUnknownProfile`` is registered and a new
    participated-documents set containing only this document is stored for
    it. In every other case the document is added to the set that is already
    stored for the unified name.

    :param doc_unified: unified title of the document
    :param author: author tuple (first_name, last_name)
    :return: None
    """
    author_unified, author_real = unify_profile_name(author[0], author[1])

    has_profile = author_unified in self._unified_name_to_profiles
    if not has_profile and author_unified not in self._unified_name_to_unknown_profile:
        # First sighting of this name: register an unknown profile and start
        # its participated-documents set with the current document
        self._unified_name_to_unknown_profile[author_unified] = CacheUnknownProfile(
            name=author_real, unified_name=author_unified)
        self._unified_name_to_participated_documents[author_unified] = {doc_unified}
        return

    # Known profile or previously registered unknown profile:
    # the participated-documents set is expected to exist already
    self._unified_name_to_participated_documents[author_unified].add(doc_unified)
def analyze_author(self, doc_unified: str, author: (str, str)):
    """
    Record that an author listed on a document participated in it.

    The author name is unified with ``unify_profile_name``. When the unified
    name belongs neither to a known profile nor to an already registered
    unknown profile, a ``CacheUnknownProfile`` is registered and a new
    participated-documents set containing only this document is stored for
    it. In every other case the document is added to the set that is already
    stored for the unified name.

    :param doc_unified: unified title of the document
    :param author: author tuple (first_name, last_name)
    :return: None
    """
    author_unified, author_real = unify_profile_name(author[0], author[1])

    has_profile = author_unified in self._unified_name_to_profiles
    if not has_profile and author_unified not in self._unified_name_to_unknown_profile:
        # First sighting of this name: register an unknown profile and start
        # its participated-documents set with the current document
        self._unified_name_to_unknown_profile[author_unified] = CacheUnknownProfile(
            name=author_real, unified_name=author_unified)
        self._unified_name_to_participated_documents[author_unified] = {doc_unified}
        return

    # Known profile or previously registered unknown profile:
    # the participated-documents set is expected to exist already
    self._unified_name_to_participated_documents[author_unified].add(doc_unified)
def process_profile_documents(self):
    """
    Index the documents that belong to the known profiles.

    For every unified profile name the documents of all profiles sharing that
    name are collected from ``self._profile_docs``. Each collected document is
    appended to ``self._documents``, grouped under its unified title in
    ``self._unified_document_title_to_documents`` (documents with the same
    unified title share one list), recorded in the authored documents of the
    unified profile name, and its ``core_authors`` and ``tags`` are handed to
    ``analyze_author`` and ``analyze_field_tag``.

    :return: None
    """
    for profile_unified, profiles in self._unified_name_to_profiles.items():
        if not profiles:
            log.warning("There were no profiles for the unified name %s" % profile_unified)
            continue

        # Collect the documents of every profile linked to that unified name
        found_docs = []
        for profile in profiles:
            profile_docs = self._profile_docs[profile.identifier]
            # Log the unified name itself (the loop key); unify_profile_name
            # returns a (unified, real) tuple, which is not what the message
            # is meant to show
            log.debug(
                "Used {len_x} documents from id {mendeley_id} for unified name {name}".format(
                    len_x=len(profile_docs),
                    mendeley_id=profile.identifier,
                    name=profile_unified))
            found_docs.extend(profile_docs)

        for doc in found_docs:
            # Add doc to all docs
            self._documents.append(doc)

            # Group the document under its unified title
            doc_unified, _ = unify_document_title(doc.core_title)
            self._unified_document_title_to_documents.setdefault(doc_unified, []).append(doc)

            # Remember the title as authored by that unified profile name
            self._unified_name_to_authored_documents[profile_unified].add(doc_unified)

            # Process core_authors field of the doc to find participants
            for author in doc.core_authors:
                self.analyze_author(doc_unified, author)

            # Analyze the tags field of the doc to find research fields
            for tag in doc.tags:
                self.analyze_field_tag(doc_unified, tag)

    log.info("Profile documents have been analyzed")
def test_unify_profile_name(self):
    """
    Check the (unified, real) pair returned by ``unify_profile_name``.

    Covers a hyphenated last name, an empty first name with the full name in
    the last-name field, and a regular first/last name split.

    :return: None
    """
    # (first_name, last_name, expected unified name, expected real name)
    cases = [
        ("Claudia", "Linnhoff-Popien", "claudialinnhoffpopien", "Claudia Linnhoff-Popien"),
        ("", "Juan Haladjian", "juanhaladjian", "Juan Haladjian"),
        ("Juan", "Haladjian", "juanhaladjian", "Juan Haladjian"),
    ]
    for first_name, last_name, expected_unified, expected_real in cases:
        # Each case calls the function exactly once
        unified, real = unify_profile_name(first_name, last_name)
        self.assertEqual(unified, expected_unified)
        self.assertEqual(real, expected_real)
def process_profile_documents(self):
    """
    Index the documents that belong to the known profiles.

    For every unified profile name the documents of all profiles sharing that
    name are collected from ``self._profile_docs``. Each collected document is
    appended to ``self._documents``, grouped under its unified title in
    ``self._unified_document_title_to_documents`` (documents with the same
    unified title share one list), recorded in the authored documents of the
    unified profile name, and its ``core_authors`` and ``tags`` are handed to
    ``analyze_author`` and ``analyze_field_tag``.

    :return: None
    """
    for profile_unified, profiles in self._unified_name_to_profiles.items():
        if not profiles:
            log.warning("There were no profiles for the unified name %s" % profile_unified)
            continue

        # Collect the documents of every profile linked to that unified name
        found_docs = []
        for profile in profiles:
            profile_docs = self._profile_docs[profile.identifier]
            # Log the unified name itself (the loop key); unify_profile_name
            # returns a (unified, real) tuple, which is not what the message
            # is meant to show
            log.debug(
                "Used {len_x} documents from id {mendeley_id} for unified name {name}".format(
                    len_x=len(profile_docs),
                    mendeley_id=profile.identifier,
                    name=profile_unified))
            found_docs.extend(profile_docs)

        for doc in found_docs:
            # Add doc to all docs
            self._documents.append(doc)

            # Group the document under its unified title
            doc_unified, _ = unify_document_title(doc.core_title)
            self._unified_document_title_to_documents.setdefault(doc_unified, []).append(doc)

            # Remember the title as authored by that unified profile name
            self._unified_name_to_authored_documents[profile_unified].add(doc_unified)

            # Process core_authors field of the doc to find participants
            for author in doc.core_authors:
                self.analyze_author(doc_unified, author)

            # Analyze the tags field of the doc to find research fields
            for tag in doc.tags:
                self.analyze_field_tag(doc_unified, tag)

    log.info("Profile documents have been analyzed")
def process_group_documents(self):
    """
    Index the documents that were fetched for the group.

    Each document in ``self._group_docs`` is appended to ``self._documents``
    and grouped under its unified title in
    ``self._unified_document_title_to_documents``. If the document's
    ``core_profile_id`` is found in ``self._profiles`` and the unified name of
    that profile has an authored-documents entry, the unified title is added
    to it. Finally the document's ``core_authors`` and ``tags`` are handed to
    ``analyze_author`` and ``analyze_field_tag``.

    :return: None
    """
    for doc in self._group_docs:
        # Add doc to all docs
        self._documents.append(doc)

        # Group the document under its unified title
        doc_unified, _ = unify_document_title(doc.core_title)
        self._unified_document_title_to_documents.setdefault(doc_unified, []).append(doc)

        # Try to attribute the document to its owner via the profile id.
        # Documents whose owner is not known are left unattributed (only the
        # profile id is available; post-fetching unknown profiles would be
        # more involved).
        # NOTE(review): this lookup uses self._profiles as a mapping keyed by
        # profile id, while process_profiles iterates it expecting profile
        # objects - confirm the container type.
        owner_id = doc.core_profile_id
        if owner_id in self._profiles:
            owner = self._profiles[owner_id]
            owner_unified, _ = unify_profile_name(owner.first_name, owner.last_name)
            if owner_unified in self._unified_name_to_authored_documents:
                self._unified_name_to_authored_documents[owner_unified].add(doc_unified)

        # Process core_authors field of the doc to find participants
        for author in doc.core_authors:
            self.analyze_author(doc_unified, author)

        # Analyze the tags field of the doc to find research fields
        for tag in doc.tags:
            self.analyze_field_tag(doc_unified, tag)

    log.info("Group documents have been analyzed")
def process_profiles(self):
    """
    Group the profiles by unified name and prepare their document sets.

    Profiles that share a unified name (duplicates) end up in the same list
    in ``self._unified_name_to_profiles``. For every processed profile the
    authored and participated document sets of its unified name are set to
    new empty sets, so later steps can access them without checking the key.

    :return: None
    """
    for profile in self._profiles:
        unified, _ = unify_profile_name(profile.first_name, profile.last_name)

        # Append to the list stored for this name, creating it on first use
        self._unified_name_to_profiles.setdefault(unified, []).append(profile)

        # Store empty entries in the documents maps for that name
        # (assigned for every profile, duplicates included)
        self._unified_name_to_authored_documents[unified] = set()
        self._unified_name_to_participated_documents[unified] = set()

    log.info("Profiles have been analyzed")
def process_group_documents(self):
    """
    Index the documents that were fetched for the group.

    Each document in ``self._group_docs`` is appended to ``self._documents``
    and grouped under its unified title in
    ``self._unified_document_title_to_documents``. If the document's
    ``core_profile_id`` is found in ``self._profiles`` and the unified name of
    that profile has an authored-documents entry, the unified title is added
    to it. Finally the document's ``core_authors`` and ``tags`` are handed to
    ``analyze_author`` and ``analyze_field_tag``.

    :return: None
    """
    for doc in self._group_docs:
        # Add doc to all docs
        self._documents.append(doc)

        # Group the document under its unified title
        doc_unified, _ = unify_document_title(doc.core_title)
        self._unified_document_title_to_documents.setdefault(doc_unified, []).append(doc)

        # Try to attribute the document to its owner via the profile id.
        # Documents whose owner is not known are left unattributed (only the
        # profile id is available; post-fetching unknown profiles would be
        # more involved).
        # NOTE(review): this lookup uses self._profiles as a mapping keyed by
        # profile id, while process_profiles iterates it expecting profile
        # objects - confirm the container type.
        owner_id = doc.core_profile_id
        if owner_id in self._profiles:
            owner = self._profiles[owner_id]
            owner_unified, _ = unify_profile_name(owner.first_name, owner.last_name)
            if owner_unified in self._unified_name_to_authored_documents:
                self._unified_name_to_authored_documents[owner_unified].add(doc_unified)

        # Process core_authors field of the doc to find participants
        for author in doc.core_authors:
            self.analyze_author(doc_unified, author)

        # Analyze the tags field of the doc to find research fields
        for tag in doc.tags:
            self.analyze_field_tag(doc_unified, tag)

    log.info("Group documents have been analyzed")