Beispiel #1
0
    def analyze_author(self, doc_unified: str, author: (str, str)):
        """
        Registers a document as participated-in by the given author.
        The author name is unified first. A name that is linked to a known profile or to an
        already created unknown profile gets the document added to its participated documents.
        A name seen for the first time gets a new CacheUnknownProfile and a fresh document set.
        :param doc_unified: unified title of the document
        :param author: tuple (first_name, last_name)
        :return: None
        """
        unified_name, real_name = unify_profile_name(author[0], author[1])

        # Known profile -> just remember the document
        if unified_name in self._unified_name_to_profiles:
            self._unified_name_to_participated_documents[unified_name].add(
                doc_unified)
            return

        # Unknown profile that was seen before -> its document set already exists
        if unified_name in self._unified_name_to_unknown_profile:
            self._unified_name_to_participated_documents[unified_name].add(
                doc_unified)
            return

        # First sighting of this name -> create the unknown profile
        # and start its participated_documents with the current document
        self._unified_name_to_unknown_profile[
            unified_name] = CacheUnknownProfile(name=real_name,
                                                unified_name=unified_name)
        self._unified_name_to_participated_documents[unified_name] = {
            doc_unified
        }
Beispiel #2
0
    def analyze_author(self, doc_unified: str, author: (str, str)):
        """
        Links a document to the author that participated in it.
        The author tuple is converted to a unified name. If that name belongs to a known profile
        or to an unknown profile created earlier, the document is added to the existing
        participated documents. Otherwise a CacheUnknownProfile is created for the name and a new
        participated documents set containing the document is stored.
        :param doc_unified: unified title of the document
        :param author: tuple (first_name, last_name)
        :return: None
        """
        unified_name, real_name = unify_profile_name(author[0], author[1])

        # Known profiles are checked first, unknown profiles only if that lookup fails
        already_tracked = (unified_name in self._unified_name_to_profiles
                           or unified_name in self._unified_name_to_unknown_profile)

        if already_tracked:
            # The document set exists already -> only add the document
            self._unified_name_to_participated_documents[unified_name].add(doc_unified)
        else:
            # Never seen before -> create the unknown profile together with its document set
            self._unified_name_to_unknown_profile[unified_name] = CacheUnknownProfile(
                name=real_name, unified_name=unified_name)
            self._unified_name_to_participated_documents[unified_name] = {doc_unified}
Beispiel #3
0
    def process_profile_documents(self):
        """
        Walks through the documents of every unified profile name.
        Each document is appended to the list of all documents and grouped by its unified title.
        The unified title is added to the authored documents of the owning unified name,
        and the document's authors and tags are passed to analyze_author and analyze_field_tag.
        :return: None
        """
        for profile_unified, profiles in self._unified_name_to_profiles.items():
            if len(profiles) == 0:
                log.warning("There were no profiles for the unified name %s" %
                            profile_unified)
                continue

            # Gather the documents of all profiles that share this unified name
            collected = []
            for profile in profiles:
                docs_of_profile = self._profile_docs[profile.identifier]
                log.debug(
                    "Used {len_x} documents from id {mendeley_id} for unified name {name}"
                    .format(len_x=len(docs_of_profile),
                            mendeley_id=profile.identifier,
                            name=unify_profile_name(profile.first_name,
                                                    profile.last_name)))
                collected.extend(docs_of_profile)

            for doc in collected:
                # Every document ends up in the list of all documents
                self._documents.append(doc)

                # Documents with the same unified title are grouped together
                doc_unified, _ = unify_document_title(doc.core_title)
                self._unified_document_title_to_documents.setdefault(
                    doc_unified, []).append(doc)

                # The unified profile name is the author of this document
                self._unified_name_to_authored_documents[profile_unified].add(
                    doc_unified)

                # The core_authors field yields the participants
                for author in doc.core_authors:
                    self.analyze_author(doc_unified, author)

                # The tags field yields the research fields
                for tag in doc.tags:
                    self.analyze_field_tag(doc_unified, tag)
        log.info("Profile documents have been analyzed")
Beispiel #4
0
    def test_unify_profile_name(self):
        """
        Checks the (unified, real) pair returned by unify_profile_name for a hyphenated
        last name, an empty first name and a plain first name / last name combination
        """
        # Hyphenated last name: the expected unified form is lower case without the hyphen
        first_name = "Claudia"
        last_name = "Linnhoff-Popien"
        unified, real = unify_profile_name(first_name, last_name)
        self.assertEqual(unified, "claudialinnhoffpopien")
        self.assertEqual(real, "Claudia Linnhoff-Popien")

        # Empty first name with the full name in last_name:
        # the expected real name carries no leading space
        first_name = ""
        last_name = "Juan Haladjian"
        unified, real = unify_profile_name(first_name, last_name)
        self.assertEqual(unified, "juanhaladjian")
        self.assertEqual(real, "Juan Haladjian")

        # Regular split into first and last name gives the same result as above
        first_name = "Juan"
        last_name = "Haladjian"
        unified, real = unify_profile_name(first_name, last_name)
        self.assertEqual(unified, "juanhaladjian")
        self.assertEqual(real, "Juan Haladjian")
Beispiel #5
0
    def test_unify_profile_name(self):
        """
        Verifies the (unified, real) pair returned by unify_profile_name for three inputs:
        a hyphenated last name, an empty first name, and a plain first name / last name pair
        """
        # Case 1: hyphenated last name, expected unified form is lower case without the hyphen
        first_name = "Claudia"
        last_name = "Linnhoff-Popien"
        unified, real = unify_profile_name(first_name, last_name)
        self.assertEqual(unified, "claudialinnhoffpopien")
        self.assertEqual(real, "Claudia Linnhoff-Popien")

        # Case 2: the whole name is passed as last_name and first_name is empty,
        # the expected real name has no leading space
        first_name = ""
        last_name = "Juan Haladjian"
        unified, real = unify_profile_name(first_name, last_name)
        self.assertEqual(unified, "juanhaladjian")
        self.assertEqual(real, "Juan Haladjian")

        # Case 3: the same person with the name split properly yields the same result
        first_name = "Juan"
        last_name = "Haladjian"
        unified, real = unify_profile_name(first_name, last_name)
        self.assertEqual(unified, "juanhaladjian")
        self.assertEqual(real, "Juan Haladjian")
Beispiel #6
0
    def process_profile_documents(self):
        """
        Processes the documents that belong to each unified profile name.
        For every unified name the documents of all linked profiles are gathered. Each document is
        then added to the list of all documents, grouped under its unified title, recorded as
        authored by the unified name and passed on to analyze_author and analyze_field_tag.
        :return: None
        """
        for profile_unified, linked_profiles in self._unified_name_to_profiles.items():
            if len(linked_profiles) == 0:
                log.warning("There were no profiles for the unified name %s" % profile_unified)
                continue

            # Merge the documents of all profiles behind this unified name into one list
            merged_docs = []
            for profile in linked_profiles:
                profile_docs = self._profile_docs[profile.identifier]
                log.debug("Used {len_x} documents from id {mendeley_id} for unified name {name}".format(
                    len_x=len(profile_docs),
                    mendeley_id=profile.identifier,
                    name=unify_profile_name(profile.first_name, profile.last_name)
                ))
                merged_docs.extend(profile_docs)

            for doc in merged_docs:
                # Keep track of the document in the list of all documents
                self._documents.append(doc)

                # Group the document under its unified title
                doc_unified, _ = unify_document_title(doc.core_title)
                self._unified_document_title_to_documents.setdefault(doc_unified, []).append(doc)

                # Record the title in the authored_docs of the unified profile name
                self._unified_name_to_authored_documents[profile_unified].add(doc_unified)

                # Participants are taken from the core_authors field
                for author in doc.core_authors:
                    self.analyze_author(doc_unified, author)

                # Research fields are taken from the tags field
                for tag in doc.tags:
                    self.analyze_field_tag(doc_unified, tag)
        log.info("Profile documents have been analyzed")
Beispiel #7
0
    def process_group_documents(self):
        """
        Walks through the group documents.
        Each document is appended to the list of all documents and grouped by its unified title.
        If the document's profile id belongs to a stored profile whose unified name has an
        authored documents entry, the unified title is added to that entry. Afterwards the
        document's authors and tags are passed to analyze_author and analyze_field_tag.
        :return: None
        """
        for doc in self._group_docs:
            # Every document ends up in the list of all documents
            self._documents.append(doc)

            # Documents with the same unified title are grouped together
            doc_unified, _ = unify_document_title(doc.core_title)
            self._unified_document_title_to_documents.setdefault(
                doc_unified, []).append(doc)

            # The owner of the document can only be resolved through its profile_id.
            # Unknown profile ids are skipped: the id alone is not enough to do anything
            # (post-fetching the unknown profiles would be possible but is more involved)
            owner_id = doc.core_profile_id
            if owner_id in self._profiles:
                owner = self._profiles[owner_id]
                owner_unified, _ = unify_profile_name(owner.first_name,
                                                      owner.last_name)
                if owner_unified in self._unified_name_to_authored_documents:
                    self._unified_name_to_authored_documents[
                        owner_unified].add(doc_unified)

            # The core_authors field yields the participants
            for author in doc.core_authors:
                self.analyze_author(doc_unified, author)

            # The tags field yields the research fields
            for tag in doc.tags:
                self.analyze_field_tag(doc_unified, tag)
        log.info("Group documents have been analyzed")
Beispiel #8
0
    def process_profiles(self):
        """
        Groups the profiles by their unified name, so duplicates end up in the same list.
        For every processed profile the authored and participated document sets
        of its unified name are set to empty sets.
        :return: None
        """
        for profile in self._profiles:
            unified, _ = unify_profile_name(profile.first_name, profile.last_name)

            # Profiles sharing a unified name are collected in one list,
            # the list is created when the name shows up for the first time
            self._unified_name_to_profiles.setdefault(unified, []).append(profile)

            # Pre-create the entries in the document maps
            # (this saves the key check on later lookups)
            self._unified_name_to_authored_documents[unified] = set()
            self._unified_name_to_participated_documents[unified] = set()
        log.info("Profiles have been analyzed")
Beispiel #9
0
    def process_profiles(self):
        """
        Builds the mapping from unified name to profiles.
        Profiles with the same unified name (duplicates) are appended to the same list.
        The authored and participated document sets of the unified name are assigned
        empty sets for every processed profile.
        :return: None
        """
        for profile in self._profiles:
            unified, _ = unify_profile_name(profile.first_name,
                                            profile.last_name)

            # Append to the list of that unified name, creating it if necessary
            same_name_profiles = self._unified_name_to_profiles.setdefault(
                unified, [])
            same_name_profiles.append(profile)

            # Empty entries in the document maps
            # make key checks on later lookups unnecessary
            self._unified_name_to_authored_documents[unified] = set()
            self._unified_name_to_participated_documents[unified] = set()
        log.info("Profiles have been analyzed")
Beispiel #10
0
    def process_group_documents(self):
        """
        Processes the documents that were fetched for the group.
        Each document is added to the list of all documents and grouped under its unified title.
        When the document's profile id is found in the stored profiles and the unified name of that
        profile has an authored documents entry, the unified title is added to it. The document's
        authors and tags are then passed on to analyze_author and analyze_field_tag.
        :return: None
        """
        for doc in self._group_docs:
            # Keep track of the document in the list of all documents
            self._documents.append(doc)

            # Group the document under its unified title
            doc_unified, _ = unify_document_title(doc.core_title)
            self._unified_document_title_to_documents.setdefault(doc_unified, []).append(doc)

            # The main owner is resolved through the profile_id of the document.
            # Nothing happens for ids that are not stored: the profile_id alone does not help
            # (the unknown profiles could be post-fetched but that is more involved)
            owner_id = doc.core_profile_id
            if owner_id in self._profiles:
                owner = self._profiles[owner_id]
                owner_unified, _ = unify_profile_name(owner.first_name, owner.last_name)
                if owner_unified in self._unified_name_to_authored_documents:
                    self._unified_name_to_authored_documents[owner_unified].add(doc_unified)

            # Participants are taken from the core_authors field
            for author in doc.core_authors:
                self.analyze_author(doc_unified, author)

            # Research fields are taken from the tags field
            for tag in doc.tags:
                self.analyze_field_tag(doc_unified, tag)
        log.info("Group documents have been analyzed")