コード例 #1
0
    def lookup_by_name(self, sort_name, display_name=None, do_get=None,
                       known_titles=None):
        """
        Asks VIAF for a list of author clusters, matching the passed-in
        author name.  Selects the cluster we deem the best match for
        the author we mean.

        Result pages are requested one at a time until a page yields no
        candidates or the 50-page cap is reached; every candidate gathered
        is then handed to ``select_best_match``.

        :param sort_name: Author name in Last, First format.
        :param display_name: Author name in First Last format.  Used as the
            search name when sort_name is empty.
        :param do_get: Ask Representation to use Http GET?
        :param known_titles: A list of titles we know this author wrote.
        :return: (selected_candidate, match_confidences, contributor_titles) for selected ContributorData.
        """
        author_name = sort_name or display_name
        # from OCLC tech support:
        # VIAF's SRU endpoint can only return a maximum number of 10 records
        # when the recordSchema is http://viaf.org/VIAFCluster
        maximum_records = 10  # viaf maximum that's not ignored
        contributor_candidates = []

        # The search scope and the encoded name are identical for every
        # page, so work them out once instead of on each iteration.
        if is_corporate_name(author_name):
            scope = 'local.corporateNames'
        else:
            scope = 'local.personalNames'
        encoded_name = author_name.encode("utf8")

        # limit ourselves to reading the first 500 viaf clusters, on the
        # assumption that search match quality is unlikely to be usable after that.
        # The for statement drives `page` by itself; no manual increment
        # is needed (or has any effect).
        for page in range(1, 51):
            start_record = 1 + maximum_records * (page - 1)

            url = self.SEARCH_URL.format(
                scope=scope, author_name=encoded_name,
                maximum_records=maximum_records, start_record=start_record
            )
            # The second element of the returned pair is not needed here.
            representation, _cached = Representation.get(
                self._db, url, do_get=do_get, max_age=self.REPRESENTATION_MAX_AGE
            )
            xml = representation.content

            candidates = self.parser.parse_multiple(xml, sort_name, display_name, page)
            if not any(candidates):
                # Delete the representation so it's not cached.
                self._db.query(Representation).filter(
                    Representation.id==representation.id
                ).delete()
                # We ran out of clusters, so we can relax and move on to
                # ordering the returned results
                break

            contributor_candidates.extend(candidates)

        return self.select_best_match(
            candidates=contributor_candidates,
            working_sort_name=author_name, known_titles=known_titles)
コード例 #2
0
    def primary_author_name(self, author_name):
        """Reduce an 'author' string that may name several people to
        just the first person named.

        For example, "Bill O'Reilly with Martin Dugard" becomes
        "Bill O'Reilly".

        An empty or missing name yields None, and a corporate name is
        returned untouched.

        TODO: There is no good answer for something like
        "Ryan and Josh Shook".
        """
        if not author_name:
            return None
        if is_corporate_name(author_name):
            return author_name

        # Cut at each separator in turn, always keeping what comes before
        # its first occurrence. A separator that is absent leaves the
        # string unchanged.
        primary = author_name
        for separator in (" with ", " and ", ", "):
            primary = primary.partition(separator)[0]
        return primary
コード例 #3
0
    def primary_author_name(self, author_name):
        """Pick the first individual out of an 'author' string that may
        list more than one person.

        The motivating case is turning "Bill O'Reilly with Martin Dugard"
        into "Bill O'Reilly".

        Returns None for an empty name. Corporate names are passed
        through unchanged.

        TODO: A name like "Ryan and Josh Shook" is not handled sensibly.
        """
        if not author_name:
            return None

        if not is_corporate_name(author_name):
            # Keep only the text ahead of the first ' with ', then ahead
            # of the first ' and ' in what remains.
            head = author_name
            for joiner in (' with ', ' and '):
                head = head.split(joiner, 1)[0]
            # Finally drop anything following the first comma-space.
            return head.split(", ", 1)[0]

        return author_name
コード例 #4
0
    def primary_author_name(self, author_name):
        """Pick the first individual out of an 'author' string that may
        list more than one person.

        The motivating case is turning "Bill O'Reilly with Martin Dugard"
        into "Bill O'Reilly".

        Returns None for an empty name. Corporate names are passed
        through unchanged.

        TODO: Cases we can't handle:
         van Damme, Jean Claude
         Madonna, Cher
         Ryan and Josh Shook
        """
        if not author_name:
            return None
        if is_corporate_name(author_name):
            return author_name

        # Keep only what precedes the first ' with ', then what precedes
        # the first ' and ' in the remainder.
        for joiner in (' with ', ' and '):
            author_name = author_name.partition(joiner)[0]

        pieces = author_name.split(", ")
        # Exactly two comma-separated pieces, at least one of which has no
        # space in it, most likely form one person's sort name
        # (e.g. "Tolkien, J. R. R.") rather than two display names.
        looks_like_sort_name = (
            len(pieces) == 2
            and not all(' ' in piece for piece in pieces)
        )
        if not looks_like_sort_name:
            # No comma at all, or the commas really do seem to separate
            # several people: take the first one. (A probable sort name
            # is otherwise left as-is.)
            author_name = pieces[0]

        # A trailing period is plausible on a sort name; a trailing comma
        # is not, so strip it.
        if author_name.endswith(','):
            author_name = author_name[:-1]
        return author_name
コード例 #5
0
    def lookup_by_name(self, sort_name, display_name=None, do_get=None,
                       best_match=False):
        """Search VIAF for clusters matching an author name and return
        the result of ``select_best_match`` over everything found.

        Result pages are requested one at a time until a page yields no
        candidates or the 50-page cap is reached.

        :param sort_name: Name to search for.  When empty, display_name
            is used in its place.
        :param display_name: Alternate form of the name; also passed to
            the parser alongside the search name.
        :param do_get: Forwarded to ``Representation.get`` as ``do_get``.
        :param best_match: Accepted for interface compatibility; this
            method body does not read it.
        :return: Whatever ``select_best_match`` returns for the gathered
            candidates and the search name.
        """
        sort_name = sort_name or display_name
        # from OCLC tech support:
        # VIAF's SRU endpoint can only return a maximum number of 10 records
        # when the recordSchema is http://viaf.org/VIAFCluster
        maximum_records = 10  # viaf maximum that's not ignored
        contributor_candidates = []

        # The search scope and the encoded name are identical for every
        # page, so work them out once instead of on each iteration.
        if is_corporate_name(sort_name):
            scope = 'local.corporateNames'
        else:
            scope = 'local.personalNames'
        encoded_name = sort_name.encode("utf8")

        # limit ourselves to reading the first 500 viaf clusters, on the
        # assumption that search match quality is unlikely to be usable after that.
        # The for statement drives `page` by itself; no manual increment
        # is needed (or has any effect).
        for page in range(1, 51):
            start_record = 1 + maximum_records * (page - 1)

            url = self.SEARCH_URL.format(
                scope=scope, sort_name=encoded_name,
                maximum_records=maximum_records, start_record=start_record
            )
            # The second element of the returned pair is not needed here.
            representation, _cached = Representation.get(
                self._db, url, do_get=do_get, max_age=self.REPRESENTATION_MAX_AGE
            )
            xml = representation.content

            candidates = self.parser.parse_multiple(xml, sort_name, display_name, page)
            if not any(candidates):
                # Delete the representation so it's not cached.
                self._db.query(Representation).filter(
                    Representation.id==representation.id
                ).delete()
                # We ran out of clusters, so we can relax and move on to
                # ordering the returned results
                break

            contributor_candidates.extend(candidates)

        return self.select_best_match(contributor_candidates, sort_name)
コード例 #6
0
ファイル: viaf.py プロジェクト: rskm1/metadata_wrangler
    def lookup_by_name(self,
                       sort_name,
                       display_name=None,
                       do_get=None,
                       known_titles=None):
        """
        Asks VIAF for a list of author clusters, matching the passed-in
        author name.  Selects the cluster we deem the best match for
        the author we mean.

        Pages of results are fetched in order; fetching stops at the first
        page that produces no candidates, or after 50 pages.  The combined
        candidates are then passed to ``select_best_match``.

        :param sort_name: Author name in Last, First format.
        :param display_name: Author name in First Last format.  Serves as
            the search name when sort_name is empty.
        :param do_get: Ask Representation to use Http GET?
        :param known_titles: A list of titles we know this author wrote.
        :return: (selected_candidate, match_confidences, contributor_titles) for selected ContributorData.
        """
        author_name = sort_name or display_name
        # from OCLC tech support:
        # VIAF's SRU endpoint can only return a maximum number of 10 records
        # when the recordSchema is http://viaf.org/VIAFCluster
        maximum_records = 10  # viaf maximum that's not ignored
        contributor_candidates = []

        # Neither the scope nor the encoded name changes between pages, so
        # both are computed a single time up front.
        scope = 'local.personalNames'
        if is_corporate_name(author_name):
            scope = 'local.corporateNames'
        encoded_name = author_name.encode("utf8")

        # limit ourselves to reading the first 500 viaf clusters, on the
        # assumption that search match quality is unlikely to be usable after that.
        # `page` is advanced by the for statement alone; assigning to it
        # inside the body would not influence the next iteration.
        for page in range(1, 51):
            start_record = 1 + maximum_records * (page - 1)

            url = self.SEARCH_URL.format(
                scope=scope,
                author_name=encoded_name,
                maximum_records=maximum_records,
                start_record=start_record)
            # Only the representation itself is used; the second element
            # of the returned pair is ignored.
            representation, _cached = Representation.get(
                self._db,
                url,
                do_get=do_get,
                max_age=self.REPRESENTATION_MAX_AGE)
            xml = representation.content

            candidates = self.parser.parse_multiple(xml, sort_name,
                                                    display_name, page)
            if not any(candidates):
                # Delete the representation so it's not cached.
                self._db.query(Representation).filter(
                    Representation.id == representation.id).delete()
                # We ran out of clusters, so we can relax and move on to
                # ordering the returned results
                break

            contributor_candidates.extend(candidates)

        return self.select_best_match(candidates=contributor_candidates,
                                      working_sort_name=author_name,
                                      known_titles=known_titles)