# canonicalize.py
# Forked from NYPL-Simplified/metadata_wrangler.
"""Use external services to canonicalize names."""
import json
import logging
import os
import re

from nose.tools import set_trace

from oclc.linked_data import OCLCLinkedData
from viaf import VIAFClient, MockVIAFClient
from core.model import (
    Contributor,
    Identifier,
)
from core.util.personal_names import (
    contributor_name_match_ratio,
    display_name_to_sort_name,
    is_corporate_name,
    name_tidy,
)
from core.util.titles import (
    title_match_ratio,
)
class CanonicalizationError(Exception):
    """Raised when an author name cannot be canonicalized because the
    caller supplied neither an identifier nor a display name.
    """
class AuthorNameCanonicalizer(object):
    """Does whatever it takes to find the name of a book's primary author
    in canonicalized ("Picoult, Jody") form.

    Lookups are attempted in this order: Contributors already in the
    database, OCLC Linked Data (by ISBN), VIAF (by the VIAF IDs that OCLC
    returned), VIAF (by name), and finally a purely textual conversion of
    the display name.
    """

    VIAF_ID = re.compile("^http://viaf.org/viaf/([0-9]+)$")

    # Minimum title_match_ratio() score for an existing Contributor's
    # title to count as "the same book" as the one being canonicalized.
    TITLE_MATCH_THRESHOLD = 80

    # Minimum contributor_name_match_ratio() score for an OCLC name to
    # count as the same person as the display name.
    NAME_MATCH_THRESHOLD = 60

    def __init__(self, _db, oclcld=None, viaf=None):
        """
        :param _db: Database session used to look up Contributors and
            handed to the default service clients.
        :param oclcld: OCLC Linked Data client; a new OCLCLinkedData is
            built when omitted.
        :param viaf: VIAF client; a new VIAFClient is built when omitted.
        """
        self._db = _db
        self.oclcld = oclcld or OCLCLinkedData(_db)
        self.viaf = viaf or VIAFClient(_db)
        self.log = logging.getLogger("Author name canonicalizer")

    @classmethod
    def primary_author_name(cls, author_name):
        """From an 'author' name that may contain multiple people, extract
        just the first name.

        This is intended to extract e.g. "Bill O'Reilly" from
        "Bill O'Reilly with Martin Dugard".

        TODO: Cases we can't handle:
         van Damme, Jean Claude
         Madonna, Cher
         Ryan and Josh Shook

        :param author_name: A display name, possibly naming several people.
        :return: The first person's name, the unchanged name for corporate
            names, or None if `author_name` is empty.
        """
        if not author_name:
            return None

        if is_corporate_name(author_name):
            return author_name

        for splitter in (' with ', ' and '):
            if splitter in author_name:
                author_name = author_name.split(splitter)[0]

        author_names = author_name.split(", ")
        looks_like_sort_name = (
            len(author_names) == 2
            and any(' ' not in name for name in author_names)
        )
        if not looks_like_sort_name:
            # Either there is no comma here, or the comma really does seem
            # to separate multiple peoples' names. Pick the first one.
            author_name = author_names[0]
        # Otherwise there are two putative author names and one of them
        # has no space in it. The most likely scenario is that this is the
        # sort name of a single person (e.g. "Tolkien, J. R. R."), not two
        # different display names, so the best we can do is keep it as-is.

        if author_name.endswith(','):
            # Sometimes peoples' sort names end with a period, but
            # commas, not so much.
            author_name = author_name[:-1]

        return author_name

    def canonicalize_author_name(self, identifier, display_name):
        """Canonicalize a book's primary author given an identifier and a
        display name.

        We are not interested in finding out which individual the
        author is, we just want to know what their name looks like in
        "Picoult, Jody" format.

        TODO: VIAF recognizes "D.H. Lawrence" and "D H Lawrence" but
        not "DH Lawrence". NYT commonly formats names like "DH
        Lawrence".

        :param identifier: Identifier of the book, or None.
        :param display_name: Author name as displayed, or None.
        :return: The best sort name that could be found.
        :raise CanonicalizationError: If both arguments are missing.
        """
        if not identifier and not display_name:
            raise CanonicalizationError(
                "Neither useful identifier nor display name was provided."
            )

        # From an author name that potentially names multiple people,
        # extract only the first name.
        shortened_name = self.primary_author_name(display_name)

        # If we can canonicalize that shortened name, great. If not,
        # try again with the full name.
        candidates = [shortened_name]
        if display_name != shortened_name:
            candidates.append(display_name)

        for candidate in candidates:
            sort_name = self._canonicalize(identifier, candidate)
            if sort_name:
                return sort_name

        # All our techniques have failed. Woe! Let's just try to finagle
        # this provided display name into a sort name.
        return self.default_name(display_name)

    def default_name(self, display_name):
        """Derive a sort name from the display name alone, without
        consulting any database or external service.

        :return: The converted name, or None when there is no usable
            display name to convert.
        """
        shortened_name = self.primary_author_name(display_name)
        if not shortened_name:
            # Identifier-only calls reach this point with no name at all.
            return None
        return display_name_to_sort_name(shortened_name)

    def _canonicalize(self, identifier, display_name):
        """Run the lookup chain for a single candidate display name.

        :param identifier: Identifier of the book, or None.
        :param display_name: Candidate display name; may be None when the
            caller only has an identifier.
        :return: A sort name, or a false value if every lookup failed.
        """
        if display_name and ' ' not in display_name:
            # This is a one-named entity, like 'Cher' or 'Various'.
            # The display name and sort name are identical.
            return display_name

        self.log.debug("Attempting to canonicalize %s", display_name)

        # Can we infer any titles we know this person wrote?
        known_titles = []
        if identifier:
            editions = identifier.primarily_identifies
            # Only choose one version of the title.
            if editions and editions[0].title:
                known_titles.append(editions[0].title)

        # The best outcome would be that we already have a Contributor
        # with this exact display name and a known sort name. Without a
        # display name there is nothing meaningful to match on.
        contributors = []
        if display_name:
            # `!= None` is intentional: it is a SQLAlchemy column
            # expression (IS NOT NULL), not a Python identity test.
            contributors = self._db.query(Contributor).filter(
                Contributor.display_name == display_name).filter(
                Contributor.sort_name != None).all()  # noqa: E711

        if contributors:
            sort_name = self._sort_name_from_contributors(
                contributors, known_titles
            )
            self.log.debug(
                "Found existing contributor for %s: %s",
                display_name, sort_name
            )
            return sort_name

        # Looking in the database didn't work. Let's ask OCLC
        # Linked Data about this ISBN and see if it gives us an
        # author.
        sort_name = None
        uris = None
        if identifier:
            sort_name, uris = self.sort_name_from_oclc_linked_data(
                identifier, display_name)
            if sort_name:
                return sort_name

        # Nope. If OCLC Linked Data gave us any VIAF IDs, look them up
        # and see if we can get a sort name out of them.
        if uris:
            viaf_sort_name = self._sort_name_from_viaf_uris(
                uris, display_name
            )
            if viaf_sort_name:
                return viaf_sort_name

        # Nope. If we were given a display name, let's ask VIAF about it
        # and see what it says.
        if display_name:
            sort_name = self.sort_name_from_viaf(display_name, known_titles)
        return sort_name

    def _sort_name_from_contributors(self, contributors, known_titles):
        """Choose a sort name from Contributors sharing one display name.

        Prefers the first contributor with an edition whose title is
        similar to the known title; otherwise any of them is probably
        good, so the first contributor's sort name is used.

        :param contributors: Non-empty list of Contributors.
        :param known_titles: Titles known for the book; only the first is
            compared.
        """
        if known_titles:
            known_title = known_titles[0]
            for contributor in contributors:
                for contribution in contributor.contributions:
                    edition = contribution.edition
                    if (edition and edition.title
                            and title_match_ratio(known_title, edition.title)
                            > self.TITLE_MATCH_THRESHOLD):
                        return contributor.sort_name

        # We have contributors, but none of their titles matched what
        # we know.
        return contributors[0].sort_name

    def _sort_name_from_viaf_uris(self, uris, display_name):
        """Look up each VIAF URI and return the first sort name found.

        URIs that do not match VIAF_ID are ignored.

        :return: A sort name, or None if no URI produced one.
        """
        for uri in uris:
            match = self.VIAF_ID.search(uri)
            if not match:
                continue
            viaf_id = match.group(1)
            # NOTE(review): assumes lookup_by_viaf returns an indexable
            # whose first element carries `sort_name` -- confirm against
            # the VIAF client.
            result = self.viaf.lookup_by_viaf(
                viaf_id, working_display_name=display_name
            )
            contributor_data = result[0] if result else None
            if contributor_data is not None and contributor_data.sort_name:
                return contributor_data.sort_name
        return None

    def sort_name_from_oclc_linked_data(self, identifier, display_name):
        """Try to find an author sort name for this book from
        OCLC Linked Data.

        :param identifier: Must be of Identifier.ISBN type.
        :param display_name: Author display name to compare against the
            names OCLC reports; may be None.
        :return: A 2-tuple (sort_name, uris). `sort_name` is the shortest
            sufficiently similar name found, or None. `uris` are the
            creator URIs OCLC reported. Both are None when the identifier
            is missing or is not an ISBN.
        """
        if (not identifier) or (identifier.type != Identifier.ISBN):
            # We have no way of telling OCLC Linked Data which book
            # we're talking about. Don't bother.
            return None, None

        if display_name:
            test_working_display_name = name_tidy(display_name)
        else:
            test_working_display_name = None

        self.log.debug("Asking OCLC about works for ISBN %s", identifier)
        try:
            works = list(self.oclcld.oclc_works_for_isbn(identifier))
        except IOError as e:
            # Best effort: treat a failed request as "no works found".
            self.log.error("OCLC errored out: %s", e, exc_info=e)
            works = []

        shortest_candidate = None
        uris = []
        for work in works:
            graph = self.oclcld.graph(work)
            # TODO: Unroll this. We should try the creator names, then
            # the creator URIs, then the contributor names, then the
            # contributor URIs.
            for field_name in ('creator', 'contributor'):
                names, new_uris = self.oclcld.creator_names(graph, field_name)
                if field_name == 'creator':
                    # Contributor URIs have too much junk in them to
                    # be trustworthy.
                    uris.extend(new_uris)

                for name in names:
                    test_name = name_tidy(name)
                    match_ratio = contributor_name_match_ratio(
                        test_name, test_working_display_name,
                        normalize_names=False
                    )
                    if match_ratio <= self.NAME_MATCH_THRESHOLD:
                        continue
                    if (not shortest_candidate
                            or len(test_name) < len(shortest_candidate)):
                        shortest_candidate = test_name

        return shortest_candidate, uris

    def sort_name_from_viaf(self, display_name, known_titles=None):
        """Ask VIAF about the contributor, looking them up by name,
        rather than any numeric id.

        :param display_name: Author name in First Last format.
        :param known_titles: A list of titles we know this author wrote
            (helps better match the VIAF results if there's more than one
            matching VIAF author record).
        :return: Author name in Last, First format, or None if VIAF had
            no answer.
        """
        sort_name = None
        viaf_contributor = self.viaf.lookup_by_name(
            sort_name=None, display_name=display_name,
            known_titles=known_titles
        )
        if viaf_contributor:
            contributor_data = viaf_contributor[0]
            sort_name = contributor_data.sort_name
        self.log.debug(
            "Asked VIAF for sort name for %s. Response: %s",
            display_name, sort_name
        )
        return sort_name
class MockAuthorNameCanonicalizer(AuthorNameCanonicalizer):
    """Mocks the services used by the author name canonicalizer, but
    leaves the logic alone.

    Queued results are consumed first-in, first-out: the queue_* methods
    insert at the front of a list and consumers pop from the end.
    """

    def __init__(self, _db, oclcld=None, viaf=None):
        """
        :param _db: Database session.
        :param oclcld: OCLC client to use instead of the default mock.
        :param viaf: VIAF client to use instead of the default mock.
        """
        # Hand the mock clients to the parent so it never builds real
        # OCLC/VIAF clients only to throw them away.
        # NOTE(review): MockOCLCLinkedData is not imported anywhere in the
        # visible source, so omitting `oclcld` raises NameError -- confirm
        # where that class lives and import it.
        super(MockAuthorNameCanonicalizer, self).__init__(
            _db,
            oclcld=oclcld or MockOCLCLinkedData(_db),
            viaf=viaf or MockVIAFClient(_db),
        )
        self.log = logging.getLogger("Mocked Author Name Canonicalizer")

        self.responses = []
        self.requests = []
        # All mocked results that are not http response objects.
        self.non_response_results = []

        base_path = os.path.split(__file__)[0]
        self.resource_path = os.path.join(base_path, "files", "canonicalizer")

    def queue_response(self, status_code, headers=None, content=None):
        """Queue a mock HTTP response for a later _make_request() call.

        :param status_code: HTTP status code of the response.
        :param headers: Response headers; defaults to a fresh empty dict.
        :param content: Response body.
        """
        from testing import MockRequestsResponse
        if headers is None:
            headers = {}
        self.responses.insert(
            0, MockRequestsResponse(status_code, headers, content)
        )

    def _make_request(self, url, *args, **kwargs):
        """Record the request and answer it with the oldest queued
        response.
        """
        # NOTE(review): HTTP is not imported at module level in the
        # visible source; presumably it is core.util.http.HTTP -- confirm.
        from core.util.http import HTTP
        self.requests.append([url, args, kwargs])
        response = self.responses.pop()
        return HTTP._process_response(
            url, response, kwargs.get('allowed_response_codes'),
            kwargs.get('disallowed_response_codes')
        )

    def get_data(self, filename):
        """Read a sample file from the resource directory.

        :return: A 2-tuple of the raw file contents and the same contents
            parsed as JSON.
        """
        path = os.path.join(self.resource_path, filename)
        with open(path) as sample_file:
            data = sample_file.read()
        return data, json.loads(data)

    def queue_non_response_result(self, dummy_result):
        """Queue a canned return value for the mocked lookup methods.

        A result consumed by sort_name_from_oclc_linked_data() should be a
        (sort_name, uris) pair, since the caller unpacks it.
        """
        self.non_response_results.insert(0, dummy_result)

    def sort_name_from_oclc_linked_data(self, identifier, display_name):
        """Skip calling parent sort_name_from_oclc_linked_data for now. It
        contains http calls it'd be hard to mock. Return a dummy response.

        :return: The oldest queued non-response result if there is one,
            otherwise no sort name and a single fixed VIAF URI.
        """
        if self.non_response_results:
            return self.non_response_results.pop()
        return None, ["http://viaf.org/viaf/9581122"]

    def sort_name_from_viaf(self, display_name, known_titles=None):
        """Skip calling parent sort_name_from_viaf for now. It contains
        http calls it'd be hard to mock. Return a dummy response.

        :return: The oldest queued non-response result, or None.
        """
        if self.non_response_results:
            return self.non_response_results.pop()
        return None
class SimpleMockAuthorNameCanonicalizer(AuthorNameCanonicalizer):
    """Mocks the entire logic of AuthorNameCanonicalizer, allowing
    the tester to simply define the 'correct' answers ahead of time.
    """

    def __init__(self):
        # The parent constructor is deliberately not called: this mock
        # needs no database session and no service clients.
        # Maps (identifier, display_name) -> canned canonical name.
        self.mapping = {}
        # Every (identifier, display_name) pair passed to
        # canonicalize_author_name(), in call order.
        self.canonicalize_author_name_calls = []

    def register(self, identifier, display_name, value):
        """Register the canonical author name for an
        (identifier, display_name) pair.
        """
        self.mapping[(identifier, display_name)] = value

    def canonicalize_author_name(self, identifier, display_name):
        """Record the fact that the method was called, and return
        the predefined 'correct' answer.

        :return: The registered value, or None if the pair was never
            registered.
        """
        key = (identifier, display_name)
        self.canonicalize_author_name_calls.append(key)
        return self.mapping.get(key)