#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2014, YongSeok Choi <sseeookk@gmail.com> based on the Goodreads work by Grant Drake <grant.drake@gmail.com>'
__docformat__ = 'restructuredtext en'
import socket, re, datetime, lxml
from collections import OrderedDict
from threading import Thread
from lxml.html import fromstring, tostring
from calibre.ebooks.metadata.book.base import Metadata
from calibre.library.comments import sanitize_comments_html
from calibre.utils.cleantext import clean_ascii_chars
from calibre.utils.localization import canonicalize_lang
import calibre_plugins.kyobobook.config as cfg
class Worker(Thread): # Get details
'''
Get book details from Kyobobook book page in a separate thread
'''
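    # A minimal usage sketch (assumed caller; names are illustrative, mirroring how
    # calibre metadata source plugins typically drive detail workers):
    #
    #   workers = [Worker(url, result_queue, br, log, i, plugin)
    #              for i, url in enumerate(detail_urls)]
    #   for w in workers:
    #       w.start()
    #   for w in workers:
    #       w.join()
    #   # parsed Metadata objects arrive on result_queue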
def __init__(self, url, result_queue, browser, log, relevance, plugin, timeout=20):
Thread.__init__(self)
self.daemon = True
self.url, self.result_queue = url, result_queue
self.log, self.timeout = log, timeout
self.relevance, self.plugin = relevance, plugin
self.browser = browser.clone_browser()
self.cover_url = self.kyobobook_id = self.isbn = None
lm = {
'eng': ('English', 'Englisch','ENG'),
'zho': ('Chinese', 'chinois','chi'),
'fra': ('French', 'Francais','FRA'),
'ita': ('Italian', 'Italiano','ITA'),
'dut': ('Dutch','DUT',),
'deu': ('German', 'Deutsch','GER'),
'spa': ('Spanish', 'Espa\xf1ol', 'Espaniol','SPA'),
'jpn': ('Japanese', '日本語','JAP'),
'por': ('Portuguese', 'Portugues','POR'),
'kor': ('Korean', '한국어','KOR'),
}
self.lang_map = {}
for code, names in lm.items():
for name in names:
self.lang_map[name] = code
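        # e.g. self.lang_map['Deutsch'] == 'deu' after the inversion above.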
def run(self):
try:
self.get_details()
except:
self.log.exception('get_details failed for url: %r'%self.url)
def get_details(self):
try:
raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
except Exception as e:
if callable(getattr(e, 'getcode', None)) and \
e.getcode() == 404:
self.log.error('URL malformed: %r'%self.url)
return
attr = getattr(e, 'args', [None])
attr = attr if attr else [None]
if isinstance(attr[0], socket.timeout):
msg = 'Kyobobook timed out. Try again later.'
self.log.error(msg)
else:
msg = 'Failed to make details query: %r'%self.url
self.log.exception(msg)
return
# open('c:\\Kyobobook1.html', 'wb').write(raw)
# raw = raw.decode('utf-8', errors='replace') #00
# open('c:\\Kyobobook2.html', 'wb').write(raw)
raw = raw.decode('euc-kr', 'ignore')
# if '<title>404 - ' in raw:
# self.log.error('URL malformed: %r'%self.url)
# return
try:
root = fromstring(clean_ascii_chars(raw))
except:
msg = 'Failed to parse Kyobobook details page: %r'%self.url
self.log.exception(msg)
return
try:
            # Look at the page's <title> element to make sure that we were actually
            # returned a details page for a book. If the user specified an invalid ISBN,
            # the results page will just do a textual search.
            title_node = root.xpath('//title')
            if title_node:
                page_title = title_node[0].text_content().strip()
                # search success : "나의 문화유산답사기 1 - 인터넷교보문고"
                # search fail : " - 인터넷교보문고"
                if not page_title or page_title == "- 인터넷교보문고":
                    self.log.error('Failed to see search results in page title: %r'%self.url)
                    return
except:
msg = 'Failed to read Kyobobook page title: %r'%self.url
self.log.exception(msg)
return
errmsg = root.xpath('//*[@id="errorMessage"]')
if errmsg:
msg = 'Failed to parse Kyobobook details page: %r'%self.url
            msg += tostring(errmsg[0], method='text', encoding=unicode).strip()
self.log.error(msg)
return
self.parse_details(root)
def parse_details(self, root):
try:
kyobobook_id = self.parse_kyobobook_id(self.url)
except:
self.log.exception('Error parsing Kyobobook id for url: %r'%self.url)
kyobobook_id = None
try:
(title, series, series_index) = self.parse_title_series(root)
except:
self.log.exception('Error parsing title and series for url: %r'%self.url)
title = series = series_index = None
try:
authors = self.parse_authors(root)
except:
self.log.exception('Error parsing authors for url: %r'%self.url)
authors = []
if not title or not authors or not kyobobook_id:
self.log.error('Could not find title/authors/kyobobook id for %r'%self.url)
self.log.error('Kyobobook: %r Title: %r Authors: %r'%(kyobobook_id, title,
authors))
return
mi = Metadata(title, authors)
if series:
mi.series = series
mi.series_index = series_index
mi.set_identifier('kyobobook', kyobobook_id)
self.kyobobook_id = kyobobook_id
try:
isbn = self.parse_isbn(root)
if isbn:
self.isbn = mi.isbn = isbn
except:
self.log.exception('Error parsing ISBN for url: %r'%self.url)
try:
mi.rating = self.parse_rating(root)
except:
self.log.exception('Error parsing ratings for url: %r'%self.url)
try:
mi.comments = self.parse_comments(root)
except:
self.log.exception('Error parsing comments for url: %r'%self.url)
try:
self.cover_url = self.parse_cover(root)
except:
self.log.exception('Error parsing cover for url: %r'%self.url)
mi.has_cover = bool(self.cover_url)
try:
tags = self.parse_tags(root)
if tags:
mi.tags = tags
except:
self.log.exception('Error parsing tags for url: %r'%self.url)
try:
mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
except:
self.log.exception('Error parsing publisher and date for url: %r'%self.url)
try:
lang = self._parse_language(root)
if lang:
mi.language = lang
except:
self.log.exception('Error parsing language for url: %r'%self.url)
mi.source_relevance = self.relevance
if self.kyobobook_id:
if self.isbn:
self.plugin.cache_isbn_to_identifier(self.isbn, self.kyobobook_id)
if self.cover_url:
self.plugin.cache_identifier_to_cover_url(self.kyobobook_id,
self.cover_url)
self.plugin.clean_downloaded_metadata(mi)
self.result_queue.put(mi)
    def parse_kyobobook_id(self, url):
        # Extract the barcode (Kyobobook's book id) from the details page URL.
        return re.search(r'[?&]barcode=([^&]+)', url).group(1)
def parse_title_series(self, root):
title_node = root.xpath('//div[@class="box_detail_point"]/h1[@class="title"]/strong')
if not title_node:
return (None, None, None)
self._removeTags(title_node[0],["script","style"])
title_text = title_node[0].text_content().strip()
title_text = title_text.replace("\t","")
title_text = title_text.replace("\r\n","")
series_node = root.xpath('//div[@class="box_detail_point"]/div[@class="info"]')
if not series_node:
return (title_text, None, None)
series_info = series_node[0].text_content()
series_name = None
series_index = None
if series_info:
#try:
# series = series_info.split("|")
# if len(series) > 1:
# series_name = series[0].strip()
# series_index = float(series[1].strip())
#except:
# series_name = None
# series_index = None
            # The trailing number, if any, is the series index, e.g. "시리즈명 3".
            match = re.search(r"\s+(\d+)\s*$", series_info)
            if match:
                series_index = float(match.group(1))
                series_name = series_info[:-1 * len(match.group(0))]
            else:
                series_index = 0
                series_name = series_info
return (title_text, series_name, series_index)
def parse_authors(self, root):
#self.log.warning(root)
# Build a dict of authors with their contribution if any in values
authors_elements = root.xpath("//span[@title='%s']/preceding-sibling::node()" % '출판사')
if not authors_elements:
return
authors_type_map = OrderedDict()
#authors_elements_len = len(authors_elements)
authors_elements.reverse()
        # Walk the nodes in reverse, assigning each author name its "role" (contribution).
contrib = ''
for el in authors_elements:
# print div_authors[n-1]
#el = authors_elements[n-1]
self._removeTags(el,["div","script","style"])
if isinstance(el, lxml.html.HtmlElement) and contrib:
if el.get("class") != "name": continue
spliter = ","
if re.search("detailViewEng",self.url): spliter = "/"
authors_splits = re.sub("\s{2,}"," ",el.text_content().strip()).replace("/","/").split(spliter)
authors_splits.reverse()
for authors_split in authors_splits:
if '(' in authors_split:
#log.info('Stripping off series(')
authors_split = authors_split.rpartition('(')[0]
authors_split = re.sub("(\s외|\s편|著 |\[著\]|編 )","",authors_split).strip()
                    if authors_split in authors_type_map:
                        del authors_type_map[authors_split]
authors_type_map[authors_split] = contrib
elif isinstance(el, lxml.etree._ElementUnicodeResult):
if el.strip():
contrib = el.strip()
        # The nodes were walked in reverse, so restore the original page order.
        authors_type_map = OrderedDict(reversed(list(authors_type_map.items())))
        # The user either requests all authors, or only the primary authors (the latter is the default).
# If only primary authors, only bring them in if:
# 1. They have no author type specified
# 2. They have an author type of 'Kyobobook Author'
# 3. There are no authors from 1&2 and they have an author type of 'Editor'
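        # For example (hypothetical data): with roles {'홍길동': '지음', '김철수': '옮김'} and
        # get_all_authors False, only '홍길동' is kept; if no '지음'/'저자' entry exists, the
        # first contributor and any others sharing that same role are kept instead.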
get_all_authors = cfg.plugin_prefs[cfg.STORE_NAME][cfg.KEY_GET_ALL_AUTHORS]
authors = []
valid_contrib = None
for a, contrib in authors_type_map.items():
if get_all_authors:
authors.append(a)
else:
if not contrib or contrib == '지음' or contrib == '저자':
authors.append(a)
elif len(authors) == 0:
authors.append(a)
valid_contrib = contrib
elif contrib == valid_contrib:
authors.append(a)
else:
break
return authors
    def parse_rating(self, root):
        rating_node = root.xpath('//a[@href="#review"]/img')
        if rating_node:
            rating_text = rating_node[0].get("alt", "")
            match = re.search(u"5점 만점에 (\d)점", rating_text)
            if match:
                return int(match.group(1))
def parse_comments(self, root):
description_nodes = root.xpath("//*[preceding-sibling::comment()[. = ' *** s:%s *** '] and following-sibling::comment()[. = ' *** //e:%s *** ']]" % ('책소개','책소개'))
default_append_toc = cfg.DEFAULT_STORE_VALUES[cfg.KEY_APPEND_TOC]
append_toc = cfg.plugin_prefs[cfg.STORE_NAME].get(cfg.KEY_APPEND_TOC, default_append_toc)
comments = ''
if description_nodes:
for description_node in description_nodes:
comments += tostring(description_node, method='html', encoding=str).strip()
            # Collapse runs of multiple spaces down to one.
            while comments.find('  ') >= 0:
                comments = comments.replace('  ', ' ')
comments = sanitize_comments_html(comments)
if append_toc:
toc_node = root.xpath('//div[@class="box_detail_content"]/h2[@class="title_detail_basic" and contains(text(),"%s")]/following-sibling::div' % "목차")
if toc_node:
toc = tostring(toc_node[0], method='html')
toc = sanitize_comments_html(toc)
comments += '<h3>[목차]</h3><div id="toc">' + toc + "</div>"
if comments:
comments += "<hr />" + '<div><div style="float:right">[kyobobook]</div></div>'
return comments
def parse_cover(self, root):
# <meta property="og:image" content="http://image.kyobobook.co.kr/images/book/xlarge/547/x9780132990547.jpg"/>
# 2016-02-04
# <meta property="og:image" content="http://image.kyobobook.co.kr/images/book/medium/196/m9788994909196.jpg"/>
# http://image.kyobobook.co.kr/images/book/large/196/l9788994909196.jpg
# http://image.kyobobook.co.kr/images/book/xlarge/196/x9788994909196.jpg
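        # The size folder and filename prefix appear to change together (an observed
        # pattern in the URLs above, not a documented API); e.g. one could derive:
        #   re.sub(r"/medium/(\d+)/m", r"/xlarge/\1/x", img_url)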
imgcol_node = root.xpath('//meta[@property="og:image"]/@content')
img_url_checked = None
if imgcol_node:
img_url = imgcol_node[0]
# http://image.kyobobook.co.kr/newimages/apps/b2b_academy/common/noimage_150_215.gif
if not "noimage" in img_url :
try:
# Unfortunately Kyobobook sometimes have broken links so we need to do
# an additional request to see if the URL actually exists
info = self.browser.open_novisit(img_url, timeout=self.timeout).info()
if int(info.getheader('Content-Length')) > 1000:
img_url_checked = img_url
else:
self.log.warning('Broken image(Large) for url: %s'%img_url)
except:
pass
if not img_url_checked:
imgcol_node = root.xpath('//p[@class="book_img_box"]/img/@src')
if imgcol_node:
img_url = imgcol_node[0]
# http://image.kyobobook.co.kr/newimages/apps/b2b_academy/common/noimage_150_215.gif
if not "noimage" in img_url :
try:
# Unfortunately Kyobobook sometimes have broken links so we need to do
# an additional request to see if the URL actually exists
info = self.browser.open_novisit(img_url, timeout=self.timeout).info()
if int(info.getheader('Content-Length')) > 1000:
img_url_checked = img_url
else:
self.log.warning('Broken image(small) for url: %s'%img_url)
except:
pass
if img_url_checked:
return img_url_checked
def parse_isbn(self, root):
isbn_node = root.xpath('//span[@title="ISBN-13"]')
if isbn_node:
            return isbn_node[0].text_content().strip()
#match = re.search("isbn(?:\-13)?\s?:\s?([^\s]*)",isbn_node[0].text_content(),re.I)
#if match:
# return match.group(1)
def parse_publisher_and_date(self, root):
# Publisher is specified within the a :
# <a class="np_af" href="/search/wsearchresult.aspx?PublisherSearch=%b4%d9%b9%ae@876&BranchType=1">다문</a> | 2009-09-20
publisher = None
pub_date = None
publisher_node = root.xpath("//span[@title='%s']" % '출판사')
if publisher_node:
# /search/SearchCommonMain.jsp?vPstrCategory=KOR&vPoutSearch=1&vPpubCD=04129&vPsKeywordInfo=실천문학사
# /search/SearchEngbookMain.jsp?vPstrCategory=ENG&vPoutSearch=1&vPejkGB=BNT&vPpubNM=Prentice Hall&vPsKeywordInfo=Prentice Hall
publisher_link = publisher_node[0].xpath(".//a")
if publisher_link:
publisher = publisher_link[0].text_content()
# Now look for the pubdate. There should always be one at start of the string
pubdate_node = publisher_node[0].getparent().xpath(".//span[@class='date']")
            if pubdate_node:
pubdate_text_str = pubdate_node[0].text_content().strip()
pubdate_text_match = re.search('(\d{4}년\s*\d{1,2}월\s*\d{1,2}일)', pubdate_text_str)
if pubdate_text_match is not None:
pubdate_text = pubdate_text_match.group(1)
if pubdate_text:
pub_date = self._convert_date_text_name(pubdate_text)
return (publisher, pub_date)
def parse_tags(self, root):
        # Kyobobook has both "tags" and genres (categories).
        # We will use those as tags (with a bit of massaging).
calibre_tags = list()
category_lookup = cfg.plugin_prefs[cfg.STORE_NAME][cfg.KEY_GET_CATEGORY]
if category_lookup:
genres_node = root.xpath('//div[@class="location_zone pathGroup"]/p[@class="location"]')
#self.log.info("Parsing categories")
if genres_node:
#self.log.info("Found genres_node")
for genre in genres_node:
genre = re.sub("\s{2,}"," ",genre.text_content().strip())
genre = re.sub("^\s*(국내도서|외국도서)\s*>\s*","",genre)
calibre_tags.append("▣" + ".".join(re.split("\s*\>\s*",genre)))
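                    # e.g. (illustrative) "국내도서 > 소설 > 한국소설" becomes "▣소설.한국소설"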
# tags_list = root.xpath('//div[@id="div_itemtaglist"]//a[contains(@href,"tagname=")]/text()')
# #self.log.info("Parsing tags")
# if tags_list:
# #self.log.info("Found tags")
# convert_tag_lookup = cfg.plugin_prefs[cfg.STORE_NAME][cfg.KEY_CONVERT_TAG]
# if convert_tag_lookup:
# tags = self._convert_genres_to_calibre_tags(tags_list)
# else:
# tags = tags_list
# if len(tags) > 0:
# # return calibre_tags
# calibre_tags.extend(tags)
return calibre_tags
def _convert_genres_to_calibre_tags(self, genre_tags):
# for each tag, add if we have a dictionary lookup
calibre_tag_lookup = cfg.plugin_prefs[cfg.STORE_NAME][cfg.KEY_GENRE_MAPPINGS]
calibre_tag_map = dict((k.lower(),v) for (k,v) in calibre_tag_lookup.iteritems())
tags_to_add = list()
for genre_tag in genre_tags:
tags = calibre_tag_map.get(genre_tag.lower(), None)
if tags:
for tag in tags:
if tag not in tags_to_add:
tags_to_add.append(tag)
# return list(tags_to_add)
return tags_to_add
def _convert_date_text(self, date_text):
# Note that the date text could be "2003", "December 2003" or "December 10th 2003"
year = int(date_text[-4:])
month = 1
day = 1
if len(date_text) > 4:
text_parts = date_text[:len(date_text)-5].partition(' ')
month_name = text_parts[0]
# Need to convert the month name into a numeric value
# For now I am "assuming" the Kyobobook website only displays in English
# If it doesn't will just fallback to assuming January
month_dict = {"January":1, "February":2, "March":3, "April":4, "May":5, "June":6,
"July":7, "August":8, "September":9, "October":10, "November":11, "December":12}
month = month_dict.get(month_name, 1)
if len(text_parts[2]) > 0:
                day = int(re.match('([0-9]+)', text_parts[2]).group(1))
from calibre.utils.date import utc_tz
return datetime.datetime(year, month, day, tzinfo=utc_tz)
def _convert_date_text_name(self, date_text):
# 2014년 03월 20일 to datetime
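        # e.g. (illustrative) '2014년 03월 20일' -> datetime.datetime(2014, 3, 20, tzinfo=utc_tz);
        # when no '일' (day) component is found, None is returned instead.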
year = 2014
month = 1
day = 1
#dates = re.search("(?P<year>\d{4})년\s*(?P<month>\d{1,2})월\s*(?P<day>\d{1,2})일",date_text)
dates = re.search("(\d{4})년",date_text)
if dates:
year = int(dates.group(1))
dates = re.search("(\d{1,2})월",date_text)
if dates:
month = int(dates.group(1))
dates = re.search("(\d{1,2})일",date_text)
if dates:
day = int(dates.group(1))
else:
return None
from calibre.utils.date import utc_tz
return datetime.datetime(year, month, day, tzinfo=utc_tz)
    # The default language on Kyobobook is Korean:
    # when no language can be found on the page, fall back to Korean.
def _parse_language(self, root):
raw = "Korean"
lang_node = root.xpath('//div[@class="book_info_basic2"]')
if lang_node:
match = re.search("%s\s?:\s?([^\s]*)" % '언어',lang_node[0].text_content(),re.I)
if match:
raw = match.group(1)
ans = self.lang_map.get(raw, None)
if ans:
return ans
        ans = canonicalize_lang(raw)
if ans:
return ans
def _removeTags(self, element, tags):
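        # Recursively remove child elements whose tag name is in the given list.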
try:
for node in element.getchildren():
if node.tag in tags:
element.remove(node)
else:
self._removeTags(node,tags)
except:
return