Python AuthorNameCanonicalizerの例、canonicalize.AuthorNameCanonicalizer Pythonの例

コード例 #1

0

ファイルを表示

ファイル: scripts.py プロジェクト: NYPL-Simplified/metadata_wrangler

    def __init__(self, _db=None, cmd_args=None):
        """
        Initialize the script and choose which author-name canonicalizer to use.

        :param _db: Database session, forwarded to the parent initializer.
        :param cmd_args: Command-line arguments handed to parse_command_line.
        """
        super(CheckContributorNamesOnWeb, self).__init__(_db=_db)

        options = self.parse_command_line(_db=self._db, cmd_args=cmd_args)
        self.mock_mode = options.mock

        if not self.mock_mode:
            # Normal run: use the real canonicalizer.
            self.canonicalizer = AuthorNameCanonicalizer(self._db)
            return

        # Mock run: point base_path at the "tests" directory that sits next to
        # this file and use the mock canonicalizer instead.
        self.log.debug(
            "This is mocked run, with metadata coming from test files, rather than live OneClick connection."
        )
        script_dir = os.path.split(__file__)[0]
        self.base_path = os.path.join(script_dir, "tests")
        self.canonicalizer = MockAuthorNameCanonicalizer(self._db)

コード例 #2

0

ファイルを表示

class CanonicalizationController(object):
    """Request handler that turns an author's display name into a canonical name."""

    log = logging.getLogger("Canonicalization Controller")

    def __init__(self, _db):
        """
        :param _db: Database session; also handed to the AuthorNameCanonicalizer.
        """
        self._db = _db
        self.canonicalizer = AuthorNameCanonicalizer(self._db)

    def canonicalize_author_name(self):
        """
        Canonicalize the author named by the current request.

        Reads the optional 'urn' and 'display_name' request arguments.

        :return: INVALID_URN if a urn was given but did not parse to an
            Identifier; an empty 404 response if the canonicalizer produced
            nothing; otherwise a 200 text/plain response holding the name.
        """
        urn = request.args.get('urn')
        display_name = request.args.get('display_name')

        # The identifier is optional; it is only resolved when a urn was sent.
        identifier = None
        if urn:
            identifier = URNLookupController.parse_urn(self._db, urn, False)
            if not isinstance(identifier, Identifier):
                return INVALID_URN

        author_name = self.canonicalizer.canonicalize_author_name(
            identifier, display_name
        )
        self.log.info(
            "Incoming display name/identifier: %r/%s. Canonicalizer said: %s",
            display_name, identifier, author_name
        )

        if author_name:
            return make_response(
                author_name, 200, {"Content-Type": "text/plain"}
            )
        return make_response("", 404)

コード例 #3

0

ファイルを表示

ファイル: scripts.py プロジェクト: NYPL-Simplified/metadata_wrangler

class CheckContributorNamesOnWeb(CheckContributorNamesInDB):
    """
    Inherits process_contribution_local from parent.
    Adds process_contribution_viaf functionality, which
    sends a request to viaf to try and determine correct sort_name
    for a given author.
    """

    COMPLAINT_SOURCE = "CheckContributorNamesOnWeb"


    def __init__(self, _db=None, cmd_args=None):
        super(CheckContributorNamesOnWeb, self).__init__(_db=_db)

        parsed_args = self.parse_command_line(_db=self._db, cmd_args=cmd_args)
        self.mock_mode = parsed_args.mock

        if self.mock_mode:
            self.log.debug(
                "This is mocked run, with metadata coming from test files, rather than live OneClick connection."
            )
            self.base_path = os.path.split(__file__)[0]
            self.base_path = os.path.join(self.base_path, "tests")
            self.canonicalizer = MockAuthorNameCanonicalizer(self._db)
        else:
            self.canonicalizer = AuthorNameCanonicalizer(self._db)


    def run(self, batch_size=10):
        """
        TODO:  run the local db one, make a fix_mismatch, and
        override it here.  in db local make it just register the complaint,
        but here make it first check the web, then register the complaint.

        start by running the db local to make sure generated complaints where should
        then run the web search only on the ones that have complaints about.  either run only
        on the non-
        """
        param_args = self.parse_command_line(self._db)

        self.query = self.make_query(
            self._db, param_args.identifier_type, param_args.identifiers, self.log
        )

        editions = True
        offset = 0
        output = "ContributorID|\tSortName|\tDisplayName|\tComputedSortName|\tResolution|\tComplaintSource"
        print output.encode("utf8")

        while editions:
            my_query = self.query.offset(offset).limit(batch_size)
            editions = my_query.all()

            for edition in editions:
                if edition.contributions:
                    for contribution in edition.contributions:
                        self.process_contribution_local(self._db, contribution, self.log)
            offset += batch_size

            self._db.commit()
        self._db.commit()


    @classmethod
    def arg_parser(cls):
        parser = super(CheckContributorNamesOnWeb, cls).arg_parser()

        parser.add_argument(
            '--mock',
            help='If turned on, will use the MockCheckContributorNamesOnWeb client.',
            action='store_true'
        )
        return parser


    def process_local_mismatch(self, _db, contribution, computed_sort_name, error_message_detail, log=None):
        """
        Overrides parent method to allow further resolution of sort_name problems by
        calling process_contribution_web, which asks OCLC and VIAF for info.
        Determines if a problem is to be investigated further or recorded as a Complaint,
        to be solved by a human.
        """
        self.process_contribution_web(_db=_db, contribution=contribution,
            redo_complaints=False, log=log)


    def process_contribution_web(self, _db, contribution, redo_complaints=False, log=None):
        """
        If sort_name that got from VIAF is not too far off from sort_name we already have,
        then use it (auto-fix).  If it is far off, then it's possible we did not match
        the author very well.  Make a wrong-author complaint, and ask a human to fix it.

        Searches VIAF by contributor's display_name and contribution title.  If the
        contributor already has a viaf_id store in our database, ignore it.  It's possible
        that id was produced by an older, less precise matching algorithm and might want replacing.

        :param redo_complaints: Should try OCLC/VIAF on the names that already have Complaint objects lodged against them?
        Alternative is to require human review of all Complaints.
        """
        if not contribution or not contribution.edition:
            return

        contributor = contribution.contributor
        if not contributor.display_name:
            return

        identifier = contribution.edition.primary_identifier
        if not identifier:
            return

        known_titles = []
        if contribution.edition.title:
            known_titles.append(contribution.edition.title)

        # Searching viaf can be resource-expensive, so only do it if specifically asked
        # See if there are any complaints already lodged by a previous run of this script.
        pool = contribution.edition.is_presentation_for
        parent_source = super(CheckContributorNamesOnWeb, self).COMPLAINT_SOURCE
        complaint = get_one(
            _db, Complaint, on_multiple='interchangeable',
            license_pool=pool,
            source=self.COMPLAINT_SOURCE,
            type=self.COMPLAINT_TYPE,
        )

        if not redo_complaints and complaint:
            # We already did some work on this contributor, and determined to
            # ask a human for help.  This method was called with the time-saving
            # redo_complaints=False flag.  Skip calling OCLC and VIAF.
            return

        # can we find an ISBN-type Identifier for this Contribution to send
        # a request to OCLC with?
        isbn_identifier = None
        if identifier.type == Identifier.ISBN:
            isbn_identifier = identifier
        else:
            equivalencies = identifier.equivalencies
            for equivalency in equivalencies:
                if equivalency.output.type == Identifier.ISBN:
                    isbn_identifier = equivalency.output
                    break

        if isbn_identifier:
            # we can ask OCLC Linked Data about this ISBN
            uris = None
            sort_name, uris = self.canonicalizer.sort_name_from_oclc_linked_data(
                isbn_identifier, contributor.display_name)
            if sort_name:
                # see it's in correct format and not too far off from display_name
                name_ok = self.verify_sort_name(sort_name, contributor)
                if name_ok:
                    self.resolve_local_complaints(contribution)
                    self.set_contributor_sort_name(sort_name, contribution)
                    return
            else:
                # Nope. If OCLC Linked Data gave us any VIAF IDs, look them up
                # and see if we can get a sort name out of them.
                if uris:
                    for uri in uris:
                        match_found = self.canonicalizer.VIAF_ID.search(uri)
                        if match_found:
                            viaf_id = match_found.groups()[0]
                            contributor_data = self.canonicalizer.viaf.lookup_by_viaf(
                                viaf_id, working_display_name=contributor.display_name
                            )[0]
                            if contributor_data.sort_name:
                                # see it's in correct format and not too far off from display_name
                                name_ok = self.verify_sort_name(sort_name, contributor)
                                if name_ok:
                                    self.resolve_local_complaints(contribution)
                                    self.set_contributor_sort_name(sort_name, contribution)
                                    return

        # Nope. If we were given a display name, let's ask VIAF about it
        # and see what it says.
        sort_name = self.canonicalizer.sort_name_from_viaf(contributor.display_name, known_titles)
        if sort_name:
            # see it's in correct format and not too far off from display_name
            name_ok = self.verify_sort_name(sort_name, contributor)
            if name_ok:
                self.resolve_local_complaints(contribution)
                self.set_contributor_sort_name(sort_name, contribution)
                return

        # If we got to this point, we have not gotten a satisfying enough answer from
        # either OCLC or VIAF.  Now is the time to generate a Complaint, ask a human to
        # come fix this.
        error_message_detail = "Contributor[id=%s].sort_name cannot be resolved from outside web services, human intervention required." % contributor.id
        self.register_problem(source=self.COMPLAINT_SOURCE, contribution=contribution,
            computed_sort_name=sort_name, error_message_detail=error_message_detail, log=log)


    @classmethod
    def verify_sort_name(cls, sort_name, contributor):
        """
        See how well the new sort_name matches the display_name and the expected 'Last, First' format.
        Too far off is an unexpected result and is a problem.
        Does not check for proper formatting, like "Last, First".
        :return name_ok: Boolean answer to "is this computed name good enough?"
        """
        if not contributor.sort_name:
            # any port in a storm is an acceptable sort name
            return True

        computed_sort_name = unicodedata.normalize("NFKD", unicode(sort_name))

        if (contributor.sort_name.strip().lower() == computed_sort_name.strip().lower()):
            # no change is good change
            return True

        # computed names don't match.  by how much?  if it's a matter of a comma or a misplaced
        # suffix, we can fix without asking for human intervention.  if the names are very different,
        # there's a chance the sort and display names are different on purpose, s.a. when foreign names
        # are passed as translated into only one of the fields, or when the author has a popular pseudonym.
        # best ask a human.

        # if the relative lengths are off than by a stray space or comma, ask a human
        # it probably means that a human metadata professional had added an explanation/expansion to the
        # sort_name, s.a. "Bob A. Jones" --> "Bob A. (Allan) Jones", and we'd rather not replace this data
        # with the "Jones, Bob A." that the auto-algorigthm would generate.
        length_difference = len(contributor.sort_name.strip()) - len(computed_sort_name.strip())
        if abs(length_difference) > 3:
            return False

        match_ratio = contributor_name_match_ratio(contributor.sort_name, computed_sort_name, normalize_names=False)

        if (match_ratio < 40):
            # ask a human.  this kind of score can happen when the sort_name is a transliteration of the display_name,
            # and is non-trivial to fix.
            return False
        else:
            # we can fix it!
            return True


    def resolve_local_complaints(self, contribution):
        """
        Resolves any complaints that the parent script may have made about this
        contributor's sort_name, because we've now asked the Web, and it gave us the answer.
        """
        pool = contribution.edition.is_presentation_for
        parent_source = super(CheckContributorNamesOnWeb, self).COMPLAINT_SOURCE
        parent_type = super(CheckContributorNamesOnWeb, self).COMPLAINT_TYPE

        query = self._db.query(Complaint)
        query = query.filter(Complaint.license_pool_id == pool.id)
        query = query.filter(Complaint.source == parent_source)
        query = query.filter(Complaint.type == parent_type)
        query = query.filter(Complaint.resolved == None)

        complaints = query.all()
        for complaint in complaints:
            # say that we fixed it
            complaint.resolved = datetime.datetime.utcnow()

コード例 #4

0

ファイルを表示

 def __init__(self, _db):
     """
     :param _db: Database session; also handed to the AuthorNameCanonicalizer.
     """
     self._db = _db
     name_canonicalizer = AuthorNameCanonicalizer(self._db)
     self.canonicalizer = name_canonicalizer

コード例 #5

0

ファイルを表示

ファイル: test_canonicalize.py プロジェクト: NYPL-Simplified/metadata_wrangler

 def setup(self):
     """Build a canonicalizer whose VIAF client is replaced by a mock lookup."""
     super(TestAuthorNameCanonicalizer, self).setup()
     self.log = logging.getLogger("Author Name Canonicalizer Test")

     name_canonicalizer = AuthorNameCanonicalizer(self._db)
     mock_viaf = MockVIAFClientLookup(self._db, self.log)
     name_canonicalizer.viaf = mock_viaf

     self.canonicalizer = name_canonicalizer
     self.viaf_client = mock_viaf

コード例 #6

0

ファイルを表示

ファイル: test_canonicalize.py プロジェクト: NYPL-Simplified/metadata_wrangler

class TestAuthorNameCanonicalizer(DatabaseTest):
    """Tests for AuthorNameCanonicalizer, run against a mock VIAF client."""

    def setup(self):
        """Build a canonicalizer whose VIAF client is replaced by a mock lookup."""
        super(TestAuthorNameCanonicalizer, self).setup()
        self.log = logging.getLogger("Author Name Canonicalizer Test")

        name_canonicalizer = AuthorNameCanonicalizer(self._db)
        mock_viaf = MockVIAFClientLookup(self._db, self.log)
        name_canonicalizer.viaf = mock_viaf

        self.canonicalizer = name_canonicalizer
        self.viaf_client = mock_viaf
        # NOTE: the OCLC Linked Data client is not mocked yet; see the TODO
        # tests at the bottom of this class.

    def sample_data(self, filename):
        """Load a sample file from the "viaf" sample data directory."""
        return sample_data(sample_data_dir="viaf", filename=filename)

    def queue_file_in_mock_http(self, filename):
        """Return a DummyHTTPClient with the named sample file queued as a 200 text/xml response."""
        http = DummyHTTPClient()
        http.queue_response(
            200, media_type='text/xml', content=self.sample_data(filename)
        )
        return http

    def test_primary_author_name(self):
        # Test our ability to turn a freeform string that identifies
        # one or more people into the likely name of one person.
        m = self.canonicalizer.primary_author_name

        # The simplest case comes first; the remaining rows make sure only
        # the first human's name is used.
        first_person_cases = (
            ("Mindy Kaling", "Mindy Kaling"),
            ("Mindy Kaling", "Mindy Kaling, Bob Saget and Co"),
            ("Bill O'Reilly", "Bill O'Reilly with Martin Dugard"),
            ("Clare Verbeek",
             "Clare Verbeek, Thembani Dladla, Zanele Buthelezi"),
        )
        for expected, freeform in first_person_cases:
            eq_(expected, m(freeform))

        # In most cases, when a sort name is passed in as a display
        # name, the situation is correctly diagnosed and the name is
        # returned as-is.
        already_sort_names = (
            'Kaling, Mindy',
            'Tolkien, J. R. R.',
            'van Damme, Jean-Claude',
        )
        for unchanged in already_sort_names:
            eq_(unchanged, m(unchanged))

        # Similarly when there is no distinction between display
        # and sort name.
        single_names = ('Cher', 'Various', 'Anonymous')
        for unchanged in single_names:
            eq_(unchanged, m(unchanged))

        # These are not likely to show up in real usage, but we can
        # handle them.
        for odd_input in ('Rand, Ayn, and Cher', 'Rand, Ayn, and Kaling, Mindy'):
            eq_("Rand, Ayn", m(odd_input))

    def test__canonicalize_single_name(self):
        # For single-named entities, the sort name and display name
        # are identical. We don't need to ask VIAF.
        self.canonicalizer.viaf.queue_lookup("bad data")

        for single_name in ('Various', 'Anonymous', 'Cher'):
            canonical = self.canonicalizer._canonicalize(
                identifier=None, display_name=single_name
            )
            eq_(single_name, canonical)

        # The queued result is still there: we didn't ask the mock VIAF
        # about anything.
        eq_(["bad data"], self.canonicalizer.viaf.results)

    def test_found_contributor(self):
        # If we find a matching contributor already in our database,
        # then don't bother looking at OCLC or VIAF.
        ant, ignore = self._contributor(sort_name="Zebra, Ant")
        ant.display_name = "Ant Zebra"
        bloom, ignore = self._contributor(sort_name="Yarrow, Bloom")
        bloom.display_name = "Bloom Yarrow"

        # _canonicalize shouldn't try to contact viaf or oclc, but in case
        # it does, make sure the contact brings wrong results.
        self.canonicalizer.viaf.queue_lookup([])
        result = self.canonicalizer._canonicalize(
            identifier=None, display_name="Ant Zebra"
        )
        eq_(result, ant.sort_name)

    def test_oclc_contributor(self):
        # TODO: make sure isbn ids get directed to OCLC
        pass

    def test_non_isbn_identifier(self):
        # TODO: make sure non-isbn ids get directed to VIAF
        pass

コード例 #7

0

ファイルを表示

ファイル: test_canonicalize.py プロジェクト: wjzhu-class/metadata_wrangler

 def setup(self):
     """Prepare the canonicalizer under test and give it a mock VIAF lookup client."""
     super(TestAuthorNameCanonicalizer, self).setup()
     test_log = logging.getLogger("Author Name Canonicalizer Test")
     self.log = test_log
     self.canonicalizer = AuthorNameCanonicalizer(self._db)
     # The same mock object is reachable as self.viaf_client and as
     # self.canonicalizer.viaf.
     self.viaf_client = MockVIAFClientLookup(self._db, test_log)
     self.canonicalizer.viaf = self.viaf_client

コード例 #8

0

ファイルを表示

ファイル: test_canonicalize.py プロジェクト: wjzhu-class/metadata_wrangler

class TestAuthorNameCanonicalizer(DatabaseTest):
    """Checks AuthorNameCanonicalizer behavior using a mock VIAF lookup client."""

    def setup(self):
        """Prepare the canonicalizer under test and give it a mock VIAF lookup client."""
        super(TestAuthorNameCanonicalizer, self).setup()
        test_log = logging.getLogger("Author Name Canonicalizer Test")
        self.log = test_log
        self.canonicalizer = AuthorNameCanonicalizer(self._db)
        # The same mock object is reachable as self.viaf_client and as
        # self.canonicalizer.viaf.
        self.viaf_client = MockVIAFClientLookup(self._db, test_log)
        self.canonicalizer.viaf = self.viaf_client
        # NOTE: no mock OCLC Linked Data client is installed here yet.

    def sample_data(self, filename):
        """Fetch the named file from the "viaf" sample data directory."""
        viaf_sample = sample_data(filename=filename, sample_data_dir="viaf")
        return viaf_sample

    def queue_file_in_mock_http(self, filename):
        """Queue the named sample file as a 200 text/xml response on a new DummyHTTPClient and return the client."""
        client = DummyHTTPClient()
        body = self.sample_data(filename)
        client.queue_response(200, content=body, media_type='text/xml')
        return client

    def test_primary_author_name(self):
        # Test our ability to turn a freeform string that identifies
        # one or more people into the likely name of one person.
        primary = self.canonicalizer.primary_author_name

        def check(expected, freeform):
            eq_(expected, primary(freeform))

        # Test the simplest case.
        check("Mindy Kaling", "Mindy Kaling")

        # Make sure only the first human's name is used.
        check("Mindy Kaling", "Mindy Kaling, Bob Saget and Co")
        check("Bill O'Reilly", "Bill O'Reilly with Martin Dugard")
        check("Clare Verbeek",
              "Clare Verbeek, Thembani Dladla, Zanele Buthelezi")

        # In most cases, when a sort name is passed in as a display
        # name, the situation is correctly diagnosed and the name is
        # returned as-is.
        for name in ('Kaling, Mindy', 'Tolkien, J. R. R.',
                     'van Damme, Jean-Claude'):
            check(name, name)

        # Similarly when there is no distinction between display
        # and sort name.
        for name in ('Cher', 'Various', 'Anonymous'):
            check(name, name)

        # These are not likely to show up in real usage, but we can
        # handle them.
        check("Rand, Ayn", 'Rand, Ayn, and Cher')
        check("Rand, Ayn", 'Rand, Ayn, and Kaling, Mindy')

    def test__canonicalize_single_name(self):
        # For single-named entities, the sort name and display name
        # are identical. We don't need to ask VIAF.
        self.canonicalizer.viaf.queue_lookup("bad data")

        one_word_names = ['Various', 'Anonymous', 'Cher']
        for one_name in one_word_names:
            answer = self.canonicalizer._canonicalize(
                display_name=one_name, identifier=None)
            eq_(one_name, answer)

        # We didn't ask the mock VIAF about anything, so the queued
        # result has not been consumed.
        eq_(["bad data"], self.canonicalizer.viaf.results)

    def test_found_contributor(self):
        # If we find a matching contributor already in our database,
        # then don't bother looking at OCLC or VIAF.
        known = []
        for sort_name, display_name in (
                ("Zebra, Ant", "Ant Zebra"),
                ("Yarrow, Bloom", "Bloom Yarrow"),
        ):
            contributor, made_new = self._contributor(sort_name=sort_name)
            contributor.display_name = display_name
            known.append(contributor)

        # _canonicalize shouldn't try to contact viaf or oclc, but in case it
        # does, make sure the contact brings wrong results.
        self.canonicalizer.viaf.queue_lookup([])
        canonicalized_author = self.canonicalizer._canonicalize(
            display_name="Ant Zebra", identifier=None)
        eq_(canonicalized_author, known[0].sort_name)

    def test_oclc_contributor(self):
        # TODO: make sure isbn ids get directed to OCLC
        pass

    def test_non_isbn_identifier(self):
        # TODO: make sure non-isbn ids get directed to VIAF
        pass