Esempio n. 1
0
    def extract_viaf_info(self, cluster, working_sort_name=None,
                          working_display_name=False):
        """Build contributor name information from one VIAF cluster.

        :param cluster: the cluster to inspect (handed to the XPath helpers).
        :param working_sort_name: sort name being searched for, if any.
        :param working_display_name: display name being searched for, if any.
        :return: a tuple containing:
        - the ContributorData object populated from the cluster (VIAF id,
          Wikipedia name, display name, sort name and family name, where
          they could be determined).
        - the dictionary returned by cluster_has_record_for_named_author,
          describing how the cluster matched the names searched for.
        - the list of title texts found in the cluster.
        """
        contributor = ContributorData()

        # Check the working names against the cluster's name records.
        # This call may fill in fields of `contributor` (e.g. sort_name).
        confidences = self.cluster_has_record_for_named_author(
            cluster, working_sort_name, working_display_name, contributor
        )

        # Record the cluster's VIAF ID, or None when there is no viafID tag.
        viaf_element = self._xpath1(cluster, './/*[local-name()="viafID"]')
        contributor.viaf = (
            viaf_element.text if viaf_element is not None else None
        )

        # Tally of the sort names used in this cluster. Matching UNIMARC
        # records add to it below, and the most common entry becomes the
        # sort name when none has been set by then.
        sort_name_tally = self.sort_names_by_popularity(cluster)

        # A Wikipedia name, when present, supplies the display name and
        # replaces the working display name for the rest of this method.
        # TODO: There's a problem here when someone's record has a
        # Wikipedia page other than their personal page (e.g. for
        # a band they're in.)
        contributor.wikipedia_name = self.extract_wikipedia_name(cluster)
        if contributor.wikipedia_name:
            contributor.display_name = self.wikipedia_name_to_display_name(
                contributor.wikipedia_name
            )
            working_display_name = contributor.display_name

        known_name = working_sort_name or working_display_name
        name_candidates = []
        unimarc_fields = self._xpath(cluster, './/*[local-name()="datafield"][@dtype="UNIMARC"]')
        for field in unimarc_fields:
            given, family, extra, sort_name = self.extract_name_from_unimarc(field)

            # Some part of this name must also show up in the original
            # name for it to even be considered (any non-empty part will
            # do when there is no original name). Otherwise it's a better
            # bet to try to munge the original name.
            matched_part = next(
                (
                    part for part in (given, family, extra)
                    if part and (not known_name or part in known_name)
                ),
                None,
            )
            if matched_part is None:
                self.log.debug(
                    "EXCLUDED %s/%s/%s for lack of resemblance to %s",
                    given, family, extra, known_name
                )
                continue

            self.log.debug("FOUND %s in %s", matched_part, known_name)
            name_candidates.append((given, family, extra))
            if sort_name:
                # Drop a single trailing comma before counting the name.
                if sort_name.endswith(","):
                    sort_name = sort_name[:-1]
                sort_name_tally[sort_name] += 1

        if sort_name_tally and not contributor.sort_name:
            contributor.sort_name = sort_name_tally.most_common(1)[0][0]

        if contributor.display_name:
            words = contributor.display_name.split(" ")
            if len(words) == 2:
                # Pretty clearly given name + family name. Anything more
                # complicated than that is not trusted as a candidate.
                name_candidates.append(words + [None])

        chosen_parts = self.best_choice(name_candidates)
        family_name = chosen_parts[1]
        if family_name:
            contributor.family_name = family_name

        # Keep an existing display name; otherwise fall back to the
        # combined name parts, then to the working display name.
        resolved_display_name = contributor.display_name
        if not resolved_display_name:
            resolved_display_name = (
                self.combine_nameparts(*chosen_parts) or working_display_name
            )
        contributor.display_name = resolved_display_name

        # Collect the text of every title element in the cluster.
        title_elements = self._xpath(cluster, './/*[local-name()="titles"]/*[local-name()="work"]/*[local-name()="title"]')
        contributor_titles = [element.text for element in title_elements]

        return contributor, confidences, contributor_titles
Esempio n. 2
0
    def extract_viaf_info(self,
                          cluster,
                          working_sort_name=None,
                          working_display_name=False):
        """Build contributor name information from one VIAF cluster.

        :param cluster: the cluster to inspect (handed to the XPath helpers).
        :param working_sort_name: sort name being searched for, if any.
        :param working_display_name: display name being searched for, if any.
        :return: a tuple containing:
        - the ContributorData object populated from the cluster (VIAF id,
          Wikipedia name, display name, sort name and family name, where
          they could be determined).
        - the dictionary returned by cluster_has_record_for_named_author,
          describing how the cluster matched the names searched for.
        - the list of title texts found in the cluster.
        """
        contributor = ContributorData()

        # Check the working names against the cluster's name records.
        # This call may fill in fields of `contributor` (e.g. sort_name).
        confidences = self.cluster_has_record_for_named_author(
            cluster, working_sort_name, working_display_name, contributor)

        # Record the cluster's VIAF ID, or None when there is no viafID tag.
        viaf_element = self._xpath1(cluster, './/*[local-name()="viafID"]')
        contributor.viaf = (
            viaf_element.text if viaf_element is not None else None)

        # Tally of the sort names used in this cluster. Matching UNIMARC
        # records add to it below, and the most common entry becomes the
        # sort name when none has been set by then.
        sort_name_tally = self.sort_names_by_popularity(cluster)

        # A Wikipedia name, when present, supplies the display name and
        # replaces the working display name for the rest of this method.
        # TODO: There's a problem here when someone's record has a
        # Wikipedia page other than their personal page (e.g. for
        # a band they're in.)
        contributor.wikipedia_name = self.extract_wikipedia_name(cluster)
        if contributor.wikipedia_name:
            contributor.display_name = self.wikipedia_name_to_display_name(
                contributor.wikipedia_name)
            working_display_name = contributor.display_name

        known_name = working_sort_name or working_display_name
        name_candidates = []
        unimarc_fields = self._xpath(
            cluster, './/*[local-name()="datafield"][@dtype="UNIMARC"]')
        for field in unimarc_fields:
            (given, family, extra,
             sort_name) = self.extract_name_from_unimarc(field)

            # Some part of this name must also show up in the original
            # name for it to even be considered (any non-empty part will
            # do when there is no original name). Otherwise it's a better
            # bet to try to munge the original name.
            matched_part = next(
                (part for part in (given, family, extra)
                 if part and (not known_name or part in known_name)),
                None)
            if matched_part is None:
                self.log.debug(
                    "EXCLUDED %s/%s/%s for lack of resemblance to %s",
                    given, family, extra, known_name)
                continue

            self.log.debug("FOUND %s in %s", matched_part, known_name)
            name_candidates.append((given, family, extra))
            if sort_name:
                # Drop a single trailing comma before counting the name.
                if sort_name.endswith(","):
                    sort_name = sort_name[:-1]
                sort_name_tally[sort_name] += 1

        if sort_name_tally and not contributor.sort_name:
            contributor.sort_name = sort_name_tally.most_common(1)[0][0]

        if contributor.display_name:
            words = contributor.display_name.split(" ")
            if len(words) == 2:
                # Pretty clearly given name + family name. Anything more
                # complicated than that is not trusted as a candidate.
                name_candidates.append(words + [None])

        chosen_parts = self.best_choice(name_candidates)
        family_name = chosen_parts[1]
        if family_name:
            contributor.family_name = family_name

        # Keep an existing display name; otherwise fall back to the
        # combined name parts, then to the working display name.
        resolved_display_name = contributor.display_name
        if not resolved_display_name:
            resolved_display_name = (
                self.combine_nameparts(*chosen_parts) or working_display_name)
        contributor.display_name = resolved_display_name

        # Collect the text of every title element in the cluster.
        title_elements = self._xpath(
            cluster,
            './/*[local-name()="titles"]/*[local-name()="work"]/*[local-name()="title"]'
        )
        contributor_titles = [element.text for element in title_elements]

        return contributor, confidences, contributor_titles
Esempio n. 3
0
    def cluster_has_record_for_named_author(
            self, cluster, working_sort_name, working_display_name, contributor_data=None):
        """Look through the XML cluster for fields that match the author's name.

        The checks run in a fixed order -- cluster sort names, Wikipedia
        name, UNIMARC records, a sort name guessed from the display name,
        and finally alternate name forms. The method returns as soon as one
        check scores above 90, so later checks are skipped once a "sure
        match" is found.

        :param cluster: the cluster to search (handed to the XPath helpers).
        :param working_sort_name: sort name to look for; may be falsy.
        :param working_display_name: display name to look for; may be falsy.
        :param contributor_data: optional object to fill in. A new
            ContributorData is created only when None is passed, so a
            caller-supplied object always receives the updates. On a match
            the corresponding attribute (sort_name, display_name or
            family_name) is set; wikipedia_name is set whenever a display
            name was given and the cluster has a Wikipedia name.
        :return: a dictionary keyed by the kind of check that was performed
            ("sort_name", "display_name", "unimarc", "guessed_sort_name",
            "alternate_name"), holding the match confidence recorded for it.
        """
        match_confidences = {}
        # Compare against None rather than truthiness: an object passed in
        # by the caller must be mutated in place even if it evaluates falsy.
        if contributor_data is None:
            contributor_data = ContributorData()

        # If we have a sort name to look for, and it's in this cluster's
        # sort names, great.
        # NOTE(review): each iteration overwrites the "sort_name" entry, so
        # without a match the dictionary keeps the ratio of the last name
        # checked, not the best one. The same holds for the loops below.
        if working_sort_name:
            for potential_match in self.sort_names_for_cluster(cluster):
                match_confidence = contributor_name_match_ratio(potential_match, working_sort_name)
                match_confidences["sort_name"] = match_confidence
                # fuzzy match filter may not always give a 100% match, so cap arbitrarily at 90% as a "sure match"
                if match_confidence > 90:
                    contributor_data.sort_name = potential_match
                    return match_confidences

        # If we have a display name to look for, and this cluster's
        # Wikipedia name converts to the display name, great.
        if working_display_name:
            wikipedia_name = self.extract_wikipedia_name(cluster)
            if wikipedia_name:
                contributor_data.wikipedia_name = wikipedia_name
                display_name = self.wikipedia_name_to_display_name(wikipedia_name)
                match_confidence = contributor_name_match_ratio(display_name, working_display_name)
                match_confidences["display_name"] = match_confidence
                if match_confidence > 90:
                    contributor_data.display_name = display_name
                    return match_confidences

        # If there are UNIMARC records, and every part of the UNIMARC
        # record matches the sort name or the display name, great.
        unimarcs = self._xpath(cluster, './/*[local-name()="datafield"][@dtype="UNIMARC"]')
        for unimarc in unimarcs:
            (possible_given, possible_family,
             possible_extra, possible_sort_name) = self.extract_name_from_unimarc(unimarc)
            if working_sort_name:
                match_confidence = contributor_name_match_ratio(possible_sort_name, working_sort_name)
                match_confidences["unimarc"] = match_confidence
                if match_confidence > 90:
                    # NOTE(review): this stores the full UNIMARC sort name
                    # in family_name -- confirm that is intended.
                    contributor_data.family_name = possible_sort_name
                    return match_confidences

            for name in (working_sort_name, working_display_name):
                if not name:
                    continue
                # Given and family names must both be present and contained
                # in the name; the extra part only has to match if it exists.
                if (possible_given and possible_given in name
                    and possible_family and possible_family in name and (
                        not possible_extra or possible_extra in name)):
                    match_confidences["unimarc"] = 90
                    contributor_data.family_name = possible_family
                    return match_confidences

        # Last-ditch effort. Guess at the sort name and see if *that's* one
        # of the cluster sort names.
        if working_display_name and not working_sort_name:
            test_sort_name = display_name_to_sort_name(working_display_name)
            for potential_match in self.sort_names_for_cluster(cluster):
                match_confidence = contributor_name_match_ratio(potential_match, test_sort_name)
                match_confidences["guessed_sort_name"] = match_confidence
                if match_confidence > 90:
                    contributor_data.sort_name = potential_match
                    return match_confidences

        # OK, last last-ditch effort.  See if the alternate name forms (pseudonyms) are it.
        if working_sort_name:
            for potential_match in self.alternate_name_forms_for_cluster(cluster):
                match_confidence = contributor_name_match_ratio(potential_match, working_sort_name)
                match_confidences["alternate_name"] = match_confidence
                if match_confidence > 90:
                    # NOTE(review): the alternate name form is stored in
                    # family_name -- confirm that is intended.
                    contributor_data.family_name = potential_match
                    return match_confidences

        return match_confidences
Esempio n. 4
0
    def cluster_has_record_for_named_author(self,
                                            cluster,
                                            working_sort_name,
                                            working_display_name,
                                            contributor_data=None):
        """Look through the XML cluster for fields that match the author's name.

        The checks run in a fixed order -- cluster sort names, Wikipedia
        name, UNIMARC records, a sort name guessed from the display name,
        and finally alternate name forms. The method returns as soon as one
        check scores above 90, so later checks are skipped once a "sure
        match" is found.

        :param cluster: the cluster to search (handed to the XPath helpers).
        :param working_sort_name: sort name to look for; may be falsy.
        :param working_display_name: display name to look for; may be falsy.
        :param contributor_data: optional object to fill in. A new
            ContributorData is created only when None is passed, so a
            caller-supplied object always receives the updates. On a match
            the corresponding attribute (sort_name, display_name or
            family_name) is set; wikipedia_name is set whenever a display
            name was given and the cluster has a Wikipedia name.
        :return: a dictionary keyed by the kind of check that was performed
            ("sort_name", "display_name", "unimarc", "guessed_sort_name",
            "alternate_name"), holding the match confidence recorded for it.
        """
        match_confidences = {}
        # Compare against None rather than truthiness: an object passed in
        # by the caller must be mutated in place even if it evaluates falsy.
        if contributor_data is None:
            contributor_data = ContributorData()

        # If we have a sort name to look for, and it's in this cluster's
        # sort names, great.
        # NOTE(review): each iteration overwrites the "sort_name" entry, so
        # without a match the dictionary keeps the ratio of the last name
        # checked, not the best one. The same holds for the loops below.
        if working_sort_name:
            for potential_match in self.sort_names_for_cluster(cluster):
                match_confidence = contributor_name_match_ratio(
                    potential_match, working_sort_name)
                match_confidences["sort_name"] = match_confidence
                # fuzzy match filter may not always give a 100% match, so cap arbitrarily at 90% as a "sure match"
                if match_confidence > 90:
                    contributor_data.sort_name = potential_match
                    return match_confidences

        # If we have a display name to look for, and this cluster's
        # Wikipedia name converts to the display name, great.
        if working_display_name:
            wikipedia_name = self.extract_wikipedia_name(cluster)
            if wikipedia_name:
                contributor_data.wikipedia_name = wikipedia_name
                display_name = self.wikipedia_name_to_display_name(
                    wikipedia_name)
                match_confidence = contributor_name_match_ratio(
                    display_name, working_display_name)
                match_confidences["display_name"] = match_confidence
                if match_confidence > 90:
                    contributor_data.display_name = display_name
                    return match_confidences

        # If there are UNIMARC records, and every part of the UNIMARC
        # record matches the sort name or the display name, great.
        unimarcs = self._xpath(
            cluster, './/*[local-name()="datafield"][@dtype="UNIMARC"]')
        for unimarc in unimarcs:
            (possible_given, possible_family, possible_extra,
             possible_sort_name) = self.extract_name_from_unimarc(unimarc)
            if working_sort_name:
                match_confidence = contributor_name_match_ratio(
                    possible_sort_name, working_sort_name)
                match_confidences["unimarc"] = match_confidence
                if match_confidence > 90:
                    # NOTE(review): this stores the full UNIMARC sort name
                    # in family_name -- confirm that is intended.
                    contributor_data.family_name = possible_sort_name
                    return match_confidences

            for name in (working_sort_name, working_display_name):
                if not name:
                    continue
                # Given and family names must both be present and contained
                # in the name; the extra part only has to match if it exists.
                if (possible_given and possible_given in name
                        and possible_family and possible_family in name
                        and (not possible_extra or possible_extra in name)):
                    match_confidences["unimarc"] = 90
                    contributor_data.family_name = possible_family
                    return match_confidences

        # Last-ditch effort. Guess at the sort name and see if *that's* one
        # of the cluster sort names.
        if working_display_name and not working_sort_name:
            test_sort_name = display_name_to_sort_name(working_display_name)
            for potential_match in self.sort_names_for_cluster(cluster):
                match_confidence = contributor_name_match_ratio(
                    potential_match, test_sort_name)
                match_confidences["guessed_sort_name"] = match_confidence
                if match_confidence > 90:
                    contributor_data.sort_name = potential_match
                    return match_confidences

        # OK, last last-ditch effort.  See if the alternate name forms (pseudonyms) are it.
        if working_sort_name:
            for potential_match in self.alternate_name_forms_for_cluster(
                    cluster):
                match_confidence = contributor_name_match_ratio(
                    potential_match, working_sort_name)
                match_confidences["alternate_name"] = match_confidence
                if match_confidence > 90:
                    # NOTE(review): the alternate name form is stored in
                    # family_name -- confirm that is intended.
                    contributor_data.family_name = potential_match
                    return match_confidences

        return match_confidences