def test_name_tidy(self):
        """Check how display_name_to_sort_name treats trailing punctuation.

        A trailing comma is dropped, a trailing period after a full word is
        dropped, and a trailing period after a single initial is kept.
        """
        cases = (
            # (input name, expected sort name)
            ("Bitshifter, Bob,", "Bitshifter, Bob"),      # improper comma removed
            ("Bitshifter, Bober.", "Bitshifter, Bober"),  # improper period removed
            ("Bitshifter, B.", "Bitshifter, B."),         # proper period retained
        )
        for raw_name, expected in cases:
            assert expected == display_name_to_sort_name(raw_name)
 def default_name(self, display_name):
     """Derive a sort name from ``display_name``.

     The display name is first shortened with ``self.primary_author_name``
     and the shortened name is then handed to ``display_name_to_sort_name``.

     :param display_name: the name as displayed.  Presumably this may list
         more than one author, hence the primary-author step -- confirm
         against ``primary_author_name``.
     :return: whatever ``display_name_to_sort_name`` produces for the
         shortened name.
     """
     return display_name_to_sort_name(self.primary_author_name(display_name))
    def cluster_has_record_for_named_author(
            self, cluster, working_sort_name, working_display_name, contributor_data=None):
        """Look through the xml cluster for fields that match the author's name.

        The checks run in a fixed order: the cluster's sort names, its
        Wikipedia name, its UNIMARC records, a sort name guessed from the
        display name, and finally the alternate name forms.  The search
        stops at the first sure match -- a confidence above 90, or a UNIMARC
        record whose name parts all occur in one of the working names --
        and the remaining checks are not run in that case.

        As a side effect the matching value is stored on ``contributor_data``
        (``sort_name``, ``display_name`` or ``family_name``, depending on the
        check that matched).  ``wikipedia_name`` is stored whenever a display
        name was supplied and the cluster yields a Wikipedia name, whether or
        not it matches.

        :param cluster: the xml cluster to inspect (handed to ``self._xpath``
            and the other extraction helpers).
        :param working_sort_name: sort name to look for; may be falsy.
        :param working_display_name: display name to look for; may be falsy.
        :param contributor_data: object to annotate with what was found.  When
            falsy, a fresh ``ContributorData`` is created and discarded on
            return.
        :return: a dictionary keyed by the check that made a comparison
            ("sort_name", "display_name", "unimarc", "guessed_sort_name",
            "alternate_name").  Each value is the confidence of the *last*
            comparison made by that check, not necessarily the best one.
        """
        match_confidences = {}
        if not contributor_data:
            contributor_data = ContributorData()

        # If we have a sort name to look for, and it's in this cluster's
        # sort names, great.
        if working_sort_name:
            for potential_match in self.sort_names_for_cluster(cluster):
                match_confidence = contributor_name_match_ratio(potential_match, working_sort_name)
                match_confidences["sort_name"] = match_confidence
                # The fuzzy match may not always give 100%, so anything
                # above an arbitrary 90% counts as a "sure match".
                if match_confidence > 90:
                    contributor_data.sort_name = potential_match
                    return match_confidences

        # If we have a display name to look for, and this cluster's
        # Wikipedia name converts to the display name, great.
        if working_display_name:
            wikipedia_name = self.extract_wikipedia_name(cluster)
            if wikipedia_name:
                # Recorded even when the name turns out not to match.
                contributor_data.wikipedia_name = wikipedia_name
                display_name = self.wikipedia_name_to_display_name(wikipedia_name)
                match_confidence = contributor_name_match_ratio(display_name, working_display_name)
                match_confidences["display_name"] = match_confidence
                if match_confidence > 90:
                    contributor_data.display_name = display_name
                    return match_confidences

        # If there are UNIMARC records, and every part of the UNIMARC
        # record matches the sort name or the display name, great.
        unimarcs = self._xpath(cluster, './/*[local-name()="datafield"][@dtype="UNIMARC"]')
        for unimarc in unimarcs:
            (possible_given, possible_family,
             possible_extra, possible_sort_name) = self.extract_name_from_unimarc(unimarc)
            if working_sort_name:
                match_confidence = contributor_name_match_ratio(possible_sort_name, working_sort_name)
                match_confidences["unimarc"] = match_confidence
                if match_confidence > 90:
                    contributor_data.family_name = possible_sort_name
                    return match_confidences

            # Fall back to plain substring checks: given and family name must
            # both occur in the working name, and so must the extra part if
            # the record has one.
            for name in (working_sort_name, working_display_name):
                if not name:
                    continue
                if (possible_given and possible_given in name
                        and possible_family and possible_family in name
                        and (not possible_extra or possible_extra in name)):
                    match_confidences["unimarc"] = 90
                    contributor_data.family_name = possible_family
                    return match_confidences

        # Last-ditch effort. Guess at the sort name and see if *that's* one
        # of the cluster sort names.
        if working_display_name and not working_sort_name:
            test_sort_name = display_name_to_sort_name(working_display_name)
            for potential_match in self.sort_names_for_cluster(cluster):
                match_confidence = contributor_name_match_ratio(potential_match, test_sort_name)
                match_confidences["guessed_sort_name"] = match_confidence
                if match_confidence > 90:
                    contributor_data.sort_name = potential_match
                    return match_confidences

        # OK, last last-ditch effort.  See if the alternate name forms (pseudonyms) are it.
        if working_sort_name:
            for potential_match in self.alternate_name_forms_for_cluster(cluster):
                match_confidence = contributor_name_match_ratio(potential_match, working_sort_name)
                match_confidences["alternate_name"] = match_confidence
                if match_confidence > 90:
                    contributor_data.family_name = potential_match
                    return match_confidences

        return match_confidences
Beispiel #4
0
    def cluster_has_record_for_named_author(self,
                                            cluster,
                                            working_sort_name,
                                            working_display_name,
                                            contributor_data=None):
        """Search the xml cluster for any field that names the given author.

        Checks are tried in this order, and the method returns as soon as one
        of them yields a sure match (later checks are then skipped):

        1. the cluster's sort names against ``working_sort_name``;
        2. the cluster's Wikipedia name, converted to a display name, against
           ``working_display_name``;
        3. each UNIMARC record, first by fuzzy ratio against the sort name and
           then by checking that its name parts occur in either working name;
        4. the cluster's sort names against a sort name guessed from the
           display name (only when no sort name was supplied);
        5. the cluster's alternate name forms against ``working_sort_name``.

        A fuzzy ratio above 90 counts as a sure match.  The matched value is
        written to ``contributor_data`` as ``sort_name``, ``display_name`` or
        ``family_name`` depending on the check.  ``wikipedia_name`` is written
        whenever check 2 finds a Wikipedia name, matched or not.

        :param cluster: the xml cluster to inspect.
        :param working_sort_name: sort name to search for; may be falsy.
        :param working_display_name: display name to search for; may be falsy.
        :param contributor_data: object that receives the matched values.  A
            throwaway ``ContributorData`` is used when this is falsy.
        :return: a dictionary mapping each check that compared something
            ("sort_name", "display_name", "unimarc", "guessed_sort_name",
            "alternate_name") to the confidence of its most recent comparison.
        """
        match_confidences = {}
        if not contributor_data:
            contributor_data = ContributorData()

        # Check 1: cluster sort names vs. the sort name we were given.
        if working_sort_name:
            for potential_match in self.sort_names_for_cluster(cluster):
                match_confidence = contributor_name_match_ratio(
                    potential_match, working_sort_name)
                match_confidences["sort_name"] = match_confidence
                # The fuzzy matcher rarely reports exactly 100, so anything
                # above an arbitrary 90 is treated as a sure match.
                if match_confidence > 90:
                    contributor_data.sort_name = potential_match
                    return match_confidences

        # Check 2: the Wikipedia name, converted to a display name.
        if working_display_name:
            wikipedia_name = self.extract_wikipedia_name(cluster)
            if wikipedia_name:
                contributor_data.wikipedia_name = wikipedia_name
                display_name = self.wikipedia_name_to_display_name(
                    wikipedia_name)
                match_confidence = contributor_name_match_ratio(
                    display_name, working_display_name)
                match_confidences["display_name"] = match_confidence
                if match_confidence > 90:
                    contributor_data.display_name = display_name
                    return match_confidences

        # Check 3: UNIMARC records.
        unimarcs = self._xpath(
            cluster, './/*[local-name()="datafield"][@dtype="UNIMARC"]')
        for unimarc in unimarcs:
            (possible_given, possible_family, possible_extra,
             possible_sort_name) = self.extract_name_from_unimarc(unimarc)
            if working_sort_name:
                match_confidence = contributor_name_match_ratio(
                    possible_sort_name, working_sort_name)
                match_confidences["unimarc"] = match_confidence
                if match_confidence > 90:
                    contributor_data.family_name = possible_sort_name
                    return match_confidences

            # No fuzzy hit: accept the record if its given name, family name
            # and (when present) extra part are all substrings of one of the
            # working names.
            for name in (working_sort_name, working_display_name):
                if not name:
                    continue
                if (possible_given and possible_given in name
                        and possible_family and possible_family in name
                        and (not possible_extra or possible_extra in name)):
                    match_confidences["unimarc"] = 90
                    contributor_data.family_name = possible_family
                    return match_confidences

        # Check 4: guess a sort name from the display name and retry the
        # cluster's sort names.
        if working_display_name and not working_sort_name:
            test_sort_name = display_name_to_sort_name(working_display_name)
            for potential_match in self.sort_names_for_cluster(cluster):
                match_confidence = contributor_name_match_ratio(
                    potential_match, test_sort_name)
                match_confidences["guessed_sort_name"] = match_confidence
                if match_confidence > 90:
                    contributor_data.sort_name = potential_match
                    return match_confidences

        # Check 5: alternate name forms (pseudonyms).
        if working_sort_name:
            for potential_match in self.alternate_name_forms_for_cluster(
                    cluster):
                match_confidence = contributor_name_match_ratio(
                    potential_match, working_sort_name)
                match_confidences["alternate_name"] = match_confidence
                if match_confidence > 90:
                    contributor_data.family_name = potential_match
                    return match_confidences

        return match_confidences