def build_meta(log, issue_id): """Build metadata record based on comicvine issue_id""" issue = pycomicvine.Issue( issue_id, field_list=[ "id", "name", "volume", "issue_number", "person_credits", "description", "store_date", "cover_date", ], ) if not issue or not issue.volume: log.warn("Unable to load Issue(%d)" % issue_id) return None title = "%s #%s" % (issue.volume.name, issue.issue_number) if issue.name: title = title + ": %s" % (issue.name) authors = [p.name for p in issue.person_credits] meta = Metadata(title, authors) meta.series = issue.volume.name meta.series_index = str(issue.issue_number) meta.set_identifier("comicvine", str(issue.id)) meta.comments = issue.description meta.has_cover = False if issue.volume.publisher: meta.publisher = issue.volume.publisher.name meta.pubdate = issue.store_date or issue.cover_date return meta
def run(self):
    try:
        self.log.info('Worker parsing url: %r' % self.url)
        book = Book.from_url(self.browser, self.url, self.timeout, self.log)
        if not book.get("title") or not book.get("authors"):
            self.log.error('Insufficient metadata found for %r' % self.url)
            return
        title = book["title"].encode('utf-8')
        authors = [a.encode('utf-8') for a in book["authors"]]
        mi = Metadata(title, authors)
        isbn = book.get("ean") or book.get("isbn")
        if isbn:
            mi.set_identifier("isbn", isbn)
        for attr in ("pubdate", "rating", "languages"):
            if attr in book:
                setattr(mi, attr, book[attr])
        if book.get("publisher"):
            mi.publisher = book["publisher"].encode('utf-8')
        # Only cache the cover if there is an ISBN to key it on; caching
        # under a None identifier would make the cover unretrievable.
        if book.get("cover_url") and isbn:
            self.plugin.cache_identifier_to_cover_url(isbn, book["cover_url"])
            mi.has_cover = True
        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
    except Exception as e:
        self.log.exception('Worker failed to fetch and parse url %r with error %r' % (self.url, e))
def _parse_work(self, work):
    """Convert a single work returned in CrossRef JSON to a calibre
    Metadata object
    """
    title = work.get('title')[0]
    authors = self._parse_authors(work)
    # Now we have a title - init calibre Metadata
    mi = Metadata(title, authors)
    doi = work.get('DOI')
    if doi:
        mi.set_identifier('doi', doi)
    pubdate = self._parse_pubdate(work)
    if pubdate:
        mi.pubdate = pubdate
    publisher = self._parse_publisher(work)
    if publisher:
        mi.publisher = publisher
    series = self._parse_series(work)
    if series:
        mi.series = series[0]
        mi.series_index = series[1]
    return mi
def read_metadata_kfx(stream, read_cover=True):
    ' Read the metadata.kfx file that is found in the sdr book folder for KFX files '
    c = Container(stream.read())
    m = extract_metadata(c.decode())
    # dump_metadata(m)

    def has(x):
        return m[x] and m[x][0]

    def get(x, single=True):
        ans = m[x]
        if single:
            ans = clean_xml_chars(ans[0]) if ans else ''
        else:
            ans = [clean_xml_chars(y) for y in ans]
        return ans

    title = get('title') or _('Unknown')
    authors = get('authors', False) or [_('Unknown')]
    auth_pat = re.compile(r'([^,]+?)\s*,\s+([^,]+)$')

    def fix_author(x):
        if tweaks['author_sort_copy_method'] != 'copy':
            m = auth_pat.match(x.strip())
            if m is not None:
                return m.group(2) + ' ' + m.group(1)
        return x

    mi = Metadata(title, [fix_author(x) for x in authors])
    if has('author'):
        mi.author_sort = get('author')
    if has('ASIN'):
        mi.set_identifier('mobi-asin', get('ASIN'))
    elif has('content_id'):
        mi.set_identifier('mobi-asin', get('content_id'))
    if has('languages'):
        langs = list(filter(None, (canonicalize_lang(x) for x in get('languages', False))))
        if langs:
            mi.languages = langs
    if has('issue_date'):
        try:
            mi.pubdate = parse_only_date(get('issue_date'))
        except Exception:
            pass
    if has('publisher') and get('publisher') != 'Unknown':
        mi.publisher = get('publisher')
    if read_cover and m[COVER_KEY]:
        try:
            data = base64.standard_b64decode(m[COVER_KEY])
            fmt, w, h = identify(bytes(data))
        except Exception:
            w, h, fmt = 0, 0, None
        if fmt and w > -1 and h > -1:
            mi.cover_data = (fmt, data)
    return mi
def read_metadata_kfx(stream, read_cover=True):
    ' Read the metadata.kfx file that is found in the sdr book folder for KFX files '
    c = Container(stream.read())
    m = extract_metadata(c.decode())
    # dump_metadata(m)

    def has(x):
        return m[x] and m[x][0]

    def get(x, single=True):
        ans = m[x]
        if single:
            ans = clean_xml_chars(ans[0]) if ans else ''
        else:
            ans = [clean_xml_chars(y) for y in ans]
        return ans

    title = get('title') or _('Unknown')
    authors = get('authors', False) or [_('Unknown')]
    auth_pat = re.compile(r'([^,]+?)\s*,\s+([^,]+)$')

    def fix_author(x):
        if tweaks['author_sort_copy_method'] != 'copy':
            m = auth_pat.match(x.strip())
            if m is not None:
                return m.group(2) + ' ' + m.group(1)
        return x

    mi = Metadata(title, [fix_author(x) for x in authors])
    if has('author'):
        mi.author_sort = get('author')
    if has('ASIN'):
        mi.set_identifier('mobi-asin', get('ASIN'))
    elif has('content_id'):
        mi.set_identifier('mobi-asin', get('content_id'))
    if has('languages'):
        langs = list(filter(None, (canonicalize_lang(x) for x in get('languages', False))))
        if langs:
            mi.languages = langs
    if has('issue_date'):
        try:
            mi.pubdate = parse_only_date(get('issue_date'))
        except Exception:
            pass
    if has('publisher') and get('publisher') != 'Unknown':
        mi.publisher = get('publisher')
    if read_cover and m[COVER_KEY]:
        try:
            data = base64.standard_b64decode(m[COVER_KEY])
            w, h, fmt = identify_data(data)
        except Exception:
            w, h, fmt = 0, 0, None
        if fmt and w and h:
            mi.cover_data = (fmt, data)
    return mi
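# Minimal stdlib check of the "Last, First" author fixup used in fix_author
# above; the sample name is hypothetical:
import re

auth_pat = re.compile(r'([^,]+?)\s*,\s+([^,]+)$')
m = auth_pat.match('Doe, John')
assert m is not None
assert m.group(2) + ' ' + m.group(1) == 'John Doe'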
def parse_details(self, root):
    try:
        self.log.info("  Parse details: %r" % self.url)
        self.databazeknih_id = self.parse_databazeknih_id(self.url)
        self.log.info("  Parsed DK identifier: %s" % self.databazeknih_id)
    except:
        self.log.exception("Error parsing DK identifier for url: %r" % self.url)
        self.databazeknih_id = None

    # Parse title
    self.parse_title(root)
    # Parse authors
    self.parse_authors(root)

    if not self.title or not self.authors or not self.databazeknih_id:
        self.log.error("Could not find title/authors/DK id for %r" % self.url)
        self.log.error("DK id: %r Title: %r Authors: %r" %
                       (self.databazeknih_id, self.title, self.authors))
        return

    mi = Metadata(self.title, self.authors)
    mi.set_identifier("databazeknih", self.databazeknih_id)

    # Parse series
    self.parse_series(root, mi)
    # Parse comments
    self.parse_comments(root, mi)
    # Parse publisher
    self.parse_publisher(root, mi)
    # Parse pubdate
    self.parse_pubdate(root, mi)
    # Parse tags
    self.parse_tags(root, mi)
    # Parse rating
    self.parse_rating(root, mi)
    # Parse book ISBN
    self.parse_isbn(self.more_info, mi)
    # Parse language
    self.parse_language(self.more_info, mi)
    # Parse book cover
    self.parse_cover(root, mi)

    mi.source_relevance = self.relevance
    self.log.info(mi)
    self.result_queue.put(mi)
def _get_results(self):
    """ Download Information from Google Scholar """
    querier = ScholarQuerier(author=self.query_authors[0], count=self.count)
    querier.query(self.query_title, bibtex=True)
    articles = querier.articles
    if self.count > 0:
        articles = articles[:self.count]
    for num, art in enumerate(articles):
        bibtex_string = art.as_bib()
        bib = Bibparser(bibtex_string)
        bib.parse()
        slug = bib.records.keys()[0]
        bib_dict = bib.records[slug]
        title = bib_dict.get('title')
        authors = []
        for author in bib_dict.get('author', []):
            # Ignore non-existent given names
            given_name = '%s ' % author.get('given') if 'given' in author else ''
            # Add full stops after abbreviated name parts
            given_name = re.sub(r'(^| +)([A-Z])( +|$)', r'\1\2.\3', given_name)
            authors.append('%s%s' % (given_name, author['family']))
        mi = Metadata(title, authors)
        mi.set_identifier('googlescholar', slug)
        mi.source_relevance = 100 - num
        if 'publisher' in bib_dict:
            mi.publisher = bib_dict['publisher']
        if 'issued' in bib_dict:
            if 'literal' in bib_dict['issued']:
                year = int(bib_dict['issued']['literal'])
                from calibre.utils.date import utc_tz
                # We only have the year, so let's use Jan 1st
                mi.pubdate = datetime.datetime(year, 1, 1, tzinfo=utc_tz)
        self.plugin.clean_downloaded_metadata(mi)
        self._log_metadata(mi)
        self.result_queue.put(mi, True)
    self.log.info(self.result_queue.qsize())
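# Stdlib check of the initial-dotting substitution used above; 'J Smith' is a
# hypothetical sample in which the abbreviated given name gains a full stop:
import re

given_name = 'J '
given_name = re.sub(r'(^| +)([A-Z])( +|$)', r'\1\2.\3', given_name)
assert given_name + 'Smith' == 'J. Smith'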
def _get_bookdetails(self, url):
    u = self.BASE_URL + url["url"]
    print("_get_bookdetails:: fetching the book from %s" % u)
    resp = urllib2.urlopen(u)
    contents = resp.read()
    # print(contents)
    tree = etree.HTML(contents)
    authors = self._get_authors(tree)
    publisher = self._get_details(tree, self.details_publisher)
    year = self._get_year(tree)
    pages = self._get_details(tree, self.details_pages)
    isbn = self._get_details(tree, self.details_isbn)
    description = self._get_description(tree)
    cover = self._get_cover_url(tree)
    tags = self._get_tags(tree)

    mi = Metadata(url["title"], authors)
    mi.set_identifier("isbn", isbn)
    mi.comments = description
    mi.language = "LT"
    mi.tags = tags
    try:
        mi.set("publisher", publisher)
    except:
        print(u"_get_bookdetails:: failed to set the publisher")
    try:
        mi.set("pubdate", datetime.datetime(year, 1, 2))
    except:
        print(u"_get_bookdetails:: failed to set the publication date")
    try:
        if self.gui:
            print("_get_bookdetails:: GUI is available")
            # Page count goes into the custom column #count
            col = {}
            col["#value#"] = pages
            mi.set_user_metadata("#count", col)
    except:
        print(u"_get_bookdetails:: failed to set the page count")
    if cover and isbn:
        print(u"_get_bookdetails:: caching the cover:", cover)
        self.cache_isbn_to_identifier(isbn, isbn)
        self.cache_identifier_to_cover_url(isbn, cover)
        mi.has_cover = True
        print(self.cached_identifier_to_cover_url(isbn))
    return mi
def build_meta(log, issue_id): """Build metadata record based on comicvine issue_id.""" issue = PyComicvineWrapper(log).lookup_issue(issue_id) if issue: meta = Metadata(issue.get_full_title(), issue.get_authors()) meta.series = issue.volume_name meta.series_index = issue.issue_number meta.set_identifier('comicvine', str(issue.id)) meta.set_identifier('comicvine-volume', str(issue.volume_id)) meta.comments = issue.description meta.has_cover = False meta.publisher = issue.publisher_name meta.pubdate = issue.date return meta else: return None
def parse(self, raw):
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import UNDEFINED_DATE
    root = parse_html(raw)
    mi = Metadata(self.basic_data['title'], self.basic_data['authors'])

    # Identifiers
    if self.basic_data['isbns']:
        mi.isbn = self.basic_data['isbns'][0]
    mi.set_identifier('edelweiss', self.sku)

    # Tags
    if self.basic_data['tags']:
        mi.tags = self.basic_data['tags']
        mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]

    # Publisher
    mi.publisher = self.basic_data['publisher']

    # Pubdate
    if self.basic_data['pubdate'] and self.basic_data['pubdate'].year != UNDEFINED_DATE:
        mi.pubdate = self.basic_data['pubdate']

    # Rating
    if self.basic_data['rating']:
        mi.rating = self.basic_data['rating']

    # Comments
    comments = ''
    for cid in ('summary', 'contributorbio', 'quotes_reviews'):
        cid = 'desc_{}{}-content'.format(cid, self.sku)
        div = root.xpath('//*[@id="{}"]'.format(cid))
        if div:
            comments += self.render_comments(div[0])
    if comments:
        mi.comments = comments

    mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None
    return mi
def build_meta(log, issue_id):
    '''Build metadata record based on comicvine issue_id'''
    issue = pycomicvine.Issue(issue_id, field_list=[
        'id', 'name', 'volume', 'issue_number', 'person_credits',
        'description', 'store_date', 'cover_date'])
    if not issue or not issue.volume:
        log.warn('Unable to load Issue(%d)' % issue_id)
        return None
    title = '%s #%s' % (issue.volume.name, issue.issue_number)
    if issue.name:
        title = title + ': %s' % (issue.name)
    authors = [p.name for p in issue.person_credits]
    meta = Metadata(title, authors)
    meta.series = issue.volume.name
    meta.series_index = str(issue.issue_number)
    meta.set_identifier('comicvine', str(issue.id))
    meta.set_identifier('comicvine-volume', str(issue.volume.id))
    meta.comments = issue.description
    meta.has_cover = False
    if issue.volume.publisher:
        meta.publisher = issue.volume.publisher.name
    meta.pubdate = issue.store_date or issue.cover_date
    return meta
def data2mi(self, item):
    """Converts a single metadata answer in the form of a dict to a
    MetadataInformation object"""

    mi = Metadata(_('Unknown'))

    # Regular metadata
    mi.title = item.get('title', None)
    mi.authors = item.get('authors', [])
    mi.publisher = item.get('publisher', None)

    if 'id' in item.keys():
        mi.set_identifier(self.idkey, item['id'])
    if 'doi' in item.keys():
        mi.set_identifier('doi', item['doi'])
    if 'isbn' in item.keys():
        mi.set_identifier('isbn', item['isbn'])

    if 'updated' in item.keys():
        mi.pubdate = parse_date(item['updated'], assume_utc=True)

    if 'series' in item.keys():
        mi.series = item['series']
        mi.series_index = self.format_series_index(item.get('series_index'), None)

    if 'year' in item.keys():
        mi.pubdate = parse_date(item['year'], assume_utc=True)

    if 'abstract' in item.keys():
        mi.comments = self.format_abstract(item['abstract'])

    if 'language' in item.keys():
        mi.language = item['language']

    if 'journal' in item.keys():
        mi.series = item['journal']
        mi.series_index = self.format_series_index(item.get('volume'), item.get('number'))

    if 'subject' in item.keys():
        tags = set([])
        for s in item['subject']:
            tags.update(msc_tags(s))
            tags.update(arxiv_tags(s))
        mi.tags = list(sorted(tags))

    return mi
def parse_details(self, root):
    try:
        moly_id = self.parse_moly_id(self.url)
        self.log.info('Parsed moly.hu identifier: %s' % moly_id)
    except:
        self.log.exception('Error parsing moly.hu id for url: %r' % self.url)
        moly_id = None

    try:
        title = self.parse_title(root)
        self.log.info('Parsed title: %s' % title)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None

    try:
        authors = self.parse_authors(root)
        self.log.info('Parsed authors: %s' % authors)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not moly_id:
        self.log.error('Could not find title/authors/moly.hu id for %r' % self.url)
        self.log.error('Moly.hu id: %r Title: %r Authors: %r' % (moly_id, title, authors))
        return

    mi = Metadata(title, authors)
    mi.set_identifier('moly_hu', moly_id)
    self.moly_id = moly_id

    try:
        isbn = self.parse_isbn(root)
        self.log.info('Parsed ISBN: %s' % isbn)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r' % self.url)

    try:
        series_info = self.parse_series(root)
        if series_info is not None:
            mi.series = series_info[0]
            mi.series_index = int(series_info[1])
            self.log.info('Parsed series: %s, series index: %f' % (mi.series, mi.series_index))
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)

    try:
        mi.comments = self.parse_comments(root)
        self.log.info('Parsed comments: %s' % mi.comments)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    try:
        self.cover_url = self.parse_covers(root)
        self.log.info('Parsed URL for cover: %r' % self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.moly_id, self.cover_url)
        mi.has_cover = bool(self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)

    try:
        mi.tags = self.parse_tags(root)
        self.log.info('Parsed tags: %s' % mi.tags)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    try:
        mi.languages = self.parse_languages(mi.tags)
        self.log.info('Parsed languages: %r' % mi.languages)
    except:
        self.log.exception('Error parsing language for url: %r' % self.url)

    try:
        mi.publisher = self.parse_publisher(root)
        self.log.info('Parsed publisher: %s' % mi.publisher)
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)

    try:
        mi.pubdate = self.parse_published_date(root)
        self.log.info('Parsed publication date: %s' % mi.pubdate)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)

    try:
        mi.rating = self.parse_rating(root)
        self.log.info('Parsed rating: %s\n\n' % mi.rating)
    except:
        self.log.exception('Error parsing rating for url: %r\n\n' % self.url)

    mi.source_relevance = self.relevance

    if self.moly_id and self.isbn:
        self.plugin.cache_isbn_to_identifier(self.isbn, self.moly_id)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse(self, raw):
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import parse_only_date, UNDEFINED_DATE
    from css_selectors import Select
    root = parse_html(raw)
    selector = Select(root)
    sku = next(selector('div.sku.attGroup'))
    info = sku.getparent()
    top = info.getparent().getparent()
    banner = top.find('div')
    spans = banner.findall('span')
    title = ''
    for i, span in enumerate(spans):
        if i == 0 or '12pt' in span.get('style', ''):
            title += astext(span)
        else:
            break
    authors = [re.sub(r'\(.*\)', '', x).strip() for x in astext(spans[-1]).split(',')]
    mi = Metadata(title.strip(), authors)

    # Identifiers
    isbns = [check_isbn(x.strip()) for x in astext(sku).split(',')]
    for isbn in isbns:
        if isbn:
            self.plugin.cache_isbn_to_identifier(isbn, self.sku)
    isbns = sorted(isbns, key=lambda x: len(x) if x else 0, reverse=True)
    if isbns and isbns[0]:
        mi.isbn = isbns[0]
    mi.set_identifier('edelweiss', self.sku)

    # Tags
    bisac = tuple(selector('div.bisac.attGroup'))
    if bisac:
        bisac = astext(bisac[0])
        mi.tags = [x.strip() for x in bisac.split(',')]
        mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]

    # Publisher
    pub = tuple(selector('div.supplier.attGroup'))
    if pub:
        pub = astext(pub[0])
        mi.publisher = pub

    # Pubdate
    pub = tuple(selector('div.shipDate.attGroupItem'))
    if pub:
        pub = astext(pub[0])
        parts = pub.partition(':')[0::2]
        pub = parts[1] or parts[0]
        try:
            if ', Ship Date:' in pub:
                pub = pub.partition(', Ship Date:')[0]
            q = parse_only_date(pub, assume_utc=True)
            if q.year != UNDEFINED_DATE:
                mi.pubdate = q
        except:
            self.log.exception('Error parsing published date: %r' % pub)

    # Comments
    comm = ''
    general = tuple(selector('div#pd-general-overview-content'))
    if general:
        q = self.render_comments(general[0])
        if q != '<p>No title summary available. </p>':
            comm += q
    general = tuple(selector('div#pd-general-contributor-content'))
    if general:
        comm += self.render_comments(general[0])
    general = tuple(selector('div#pd-general-quotes-content'))
    if general:
        comm += self.render_comments(general[0])
    if comm:
        mi.comments = comm

    # Cover
    img = tuple(selector('img.title-image[src]'))
    if img:
        href = img[0].get('src').replace('jacket_covers/medium/', 'jacket_covers/flyout/')
        self.plugin.cache_identifier_to_cover_url(self.sku, href)
    mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None

    return mi
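# How the label/value split in the pubdate block above behaves, shown on a
# hypothetical sample string (pure stdlib):
parts = 'Ship Date: 05/01/2015'.partition(':')[0::2]
pub = parts[1] or parts[0]
assert pub.strip() == '05/01/2015'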
def parse_details(self, raw, root):
    # Parse the individual metadata fields
    try:
        asin = self.parse_asin(root)
    except:
        self.log.exception('Error parsing asin for url: %r' % self.url)
        asin = None
    if self.testing:
        import tempfile, uuid
        with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4())) + '_',
                                         suffix='.html', delete=False) as f:
            f.write(raw)
        print('Downloaded html for', asin, 'saved in', f.name)

    # Parse the title
    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None

    # Parse the authors
    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not asin:
        self.log.error('Could not find title/authors/asin for %r' % self.url)
        self.log.error('ASIN: %r Title: %r Authors: %r' % (asin, title, authors))
        return

    # Build the Metadata object mi from title and authors; the remaining
    # fields are set on it below
    mi = Metadata(title, authors)

    # Set the book id
    idtype = '17k'
    mi.set_identifier(idtype, asin)
    self.k17k_id = asin

    # Set the comments (description)
    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    # Set the series
    try:
        series, series_index = self.parse_series(root)
        if series:
            mi.series, mi.series_index = series, series_index
        elif self.testing:
            mi.series, mi.series_index = 'Dummy series for testing', 1
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)

    # Set the tags
    try:
        mi.tags = self.parse_tags(root)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    # Set the last-modified date
    # try:
    #     mi.last_modified = self.parse_last_modified(root)
    # except:
    #     self.log.exception('Error parsing last_modified for url: %r' % self.url)

    # Set the cover
    try:
        self.cover_url = self.parse_cover(root, raw)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)
    mi.source_relevance = self.relevance
    mi.languages = [u'中文', ]

    if self.k17k_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.k17k_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.k17k_id, self.cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, root):
    try:
        kyobobook_id = self.parse_kyobobook_id(self.url)
    except:
        self.log.exception('Error parsing Kyobobook id for url: %r' % self.url)
        kyobobook_id = None

    try:
        (title, series, series_index) = self.parse_title_series(root)
    except:
        self.log.exception('Error parsing title and series for url: %r' % self.url)
        title = series = series_index = None

    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not kyobobook_id:
        self.log.error('Could not find title/authors/kyobobook id for %r' % self.url)
        self.log.error('Kyobobook: %r Title: %r Authors: %r' % (kyobobook_id, title, authors))
        return

    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    mi.set_identifier('kyobobook', kyobobook_id)
    self.kyobobook_id = kyobobook_id

    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r' % self.url)

    try:
        mi.rating = self.parse_rating(root)
    except:
        self.log.exception('Error parsing ratings for url: %r' % self.url)

    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    try:
        self.cover_url = self.parse_cover(root)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)

    try:
        tags = self.parse_tags(root)
        if tags:
            mi.tags = tags
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    try:
        mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
    except:
        self.log.exception('Error parsing publisher and date for url: %r' % self.url)

    try:
        lang = self._parse_language(root)
        if lang:
            mi.language = lang
    except:
        self.log.exception('Error parsing language for url: %r' % self.url)

    mi.source_relevance = self.relevance

    if self.kyobobook_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.kyobobook_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.kyobobook_id, self.cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def get_details(self):
    '''
    The get_details() function for stripping the website for all information
    '''
    self.log.info("    Worker.get_details:")
    self.log.info("        self: ", self)
    self.log.info("        self.url: ", self.url)

    # Parse the html code from the website
    try:
        raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
    # Do some error handling if it fails to read data
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Bookmeta for saxo timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    # Do some error handling if the html code returned 404
    # (substring test: the raw page is never exactly equal to the marker)
    if "<title>404 - " in raw:
        self.log.error('URL malformed: %r' % self.url)
        return

    # Clean the html data a little
    try:
        root = parse(raw)
    except:
        self.log.error("Error cleaning HTML")
        return

    # Get the title of the book
    try:
        title_node = root.xpath('//span[@itemprop="name"]')
        self.title = title_node[0].text
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)

    # Get the author of the book
    try:
        author_node = root.xpath('//span[@class="expandAuthorName"]')
        author_strings = author_node[0].text.split(",")
        # print(author_strings)
        for name in author_strings:
            self.authors.append(name)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        self.authors = None

    # Get the series of the book
    try:
        series_node = root.xpath('//b[contains(text(), "Serie")]/a')
        if len(series_node) > 0:
            self.series = series_node[0].text.split(": ")[0].strip()
            self.series_index = series_node[0].text.split(": ")[-1].strip()
            # print("'%s'" % self.series)
            # print("'%s'" % self.series_index)
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)

    # The site exposes no usable ratings; default to 0.0
    self.rating = 0.0

    # Get the ISBN number from the site
    try:
        isbn_node = root.xpath('//div[@class="eBookContainer"]/b/span[@itemprop="identifier"]')
        if len(isbn_node) > 0:
            self.isbn = isbn_node[0].text.replace("ISBN: ", "").strip()
    except:
        self.log.exception('Error parsing isbn for url: %r' % self.url)
        self.isbn = None

    # Get the comments/blurb for the book
    try:
        comment_node = root.xpath('//meta[@name="description"]/@content')
        self.comments = comment_node[0]
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
        self.comments = None

    # Parse the cover url for downloading the cover.
    try:
        cover_node = root.xpath('//div[@class="bookDetailCoverCover"]/img/@src')
        self.cover_url = "https://mofibo.com" + cover_node[0]
        self.log.info('    Parsed URL for cover: %r' % self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.isbn, self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    self.has_cover = bool(self.cover_url)

    # Get the publisher name
    try:
        publisher_node = root.xpath('//div[@class="eBookContainer"]/b/span/a[@itemprop="brand"]')
        if len(publisher_node) > 0:
            self.publisher = publisher_node[0].text
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)

    # Get the language of the book. Only English and Danish are supported, though
    try:
        language_node = root.xpath('//b[@class="expanderLanguage"]')
        language = language_node[0].text.strip().replace("Sprog:", "").replace(" ", "")
        language = self.lang_map.get(language, None)
        self.language = language
    except:
        self.log.exception('Error parsing language for url: %r' % self.url)

    # Get the published date
    try:
        pubdate_node = root.xpath('//div[@class="eBookContainer"]/b[contains(text(),"Udgivet:")]')
        if len(pubdate_node) > 0:
            date_str = pubdate_node[0].text.replace("Udgivet:", "").strip()
            format_str = '%Y-%m-%d'  # The format
            self.pubdate = datetime.datetime.strptime(date_str, format_str)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)

    # Get the tags
    try:
        tags = []
        tags_node = root.xpath('//span[@itemprop="category"]')
        tags.append(tags_node[0].text.strip())
        self.tags = tags
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    # Set up the metadata
    meta_data = Metadata(self.title, self.authors)
    meta_data.set_identifier('isbn', self.isbn)
    meta_data.set_identifier('mofibo', self.url)

    # Set series
    if self.series:
        try:
            meta_data.series = self.series
            meta_data.series_index = self.series_index
        except:
            self.log.exception('Error loading series')
    # Set ISBN
    if self.isbn:
        try:
            meta_data.isbn = self.isbn
        except:
            self.log.exception('Error loading ISBN')
    # Set relevance
    if self.relevance:
        try:
            meta_data.source_relevance = self.relevance
        except:
            self.log.exception('Error loading relevance')
    # Set cover url
    if self.cover_url:
        try:
            meta_data.cover_url = self.cover_url
        except:
            self.log.exception('Error loading cover_url')
    # Set publisher
    if self.publisher:
        try:
            meta_data.publisher = self.publisher
        except:
            self.log.exception('Error loading publisher')
    # Set language
    if self.language:
        try:
            meta_data.language = self.language
        except:
            self.log.exception('Error loading language')
    # Set comments/blurb
    if self.comments:
        try:
            meta_data.comments = self.comments
        except:
            self.log.exception("Error loading comments")
    # Set pubdate
    if self.pubdate:
        try:
            meta_data.pubdate = self.pubdate
        except:
            self.log.exception('Error loading pubdate')
    # Set tags
    if self.tags:
        try:
            meta_data.tags = self.tags
        except:
            self.log.exception('Error loading tags')

    # Put the metadata on the result queue
    self.plugin.clean_downloaded_metadata(meta_data)
    self.result_queue.put(meta_data)
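# The series link text above is assumed to look like 'Millennium: 3'
# (hypothetical sample); the split yields name and index:
text = 'Millennium: 3'
assert text.split(': ')[0].strip() == 'Millennium'
assert text.split(': ')[-1].strip() == '3'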
def load_details(self, url, timeout):
    def _format_item(str):
        return re.sub('^"(.*)"$', '\\1', unescape(str))

    def _format_list(str):
        return [_.strip() for _ in _format_item(str).split(',')]

    def _find_meta(node, property):
        return [_.get('content') for _ in node if _.get('property') == property][0]

    def _format_date(date_text):
        year = int(date_text[0:4])
        month = int(date_text[4:6])
        day = int(date_text[6:])
        return datetime.datetime(year, month, day, tzinfo=utc_tz)

    try:
        response = self.browser.open(url, timeout=timeout)
        root = lxml.html.fromstring(response.read())

        # Fields read from the <meta> tags:
        # book id, title, ISBN, cover image URL, rating
        meta = root.xpath('//meta[starts-with(@property, "og") or starts-with(@property, "books")]')

        # Fields read from the schema.org JSON:
        # title, authors, description, publisher
        ld_json = root.xpath('//script[@type="application/ld+json"]/text()')
        ld = [json.loads(_) for _ in ld_json]
        book_info = [_ for _ in ld if _['@type'] == 'Book'][0]
    except Exception as e:
        self.log.exception(e)
        return  # nothing was parsed; the names below would be undefined

    ridibooks_id = re.search('id=([0-9]+)', url).group(1)
    isbn = _find_meta(meta, 'books:isbn')
    cover_url = _find_meta(meta, 'og:image')

    title = _find_meta(meta, 'og:title')
    authors = _format_list(book_info['author']['name'])
    if 'translator' in book_info:
        # u'(역자)' marks the appended names as translators
        authors.extend([_ + u'(역자)' for _ in _format_list(book_info['translator']['name'])])

    mi = Metadata(title, authors)
    mi.set_identifier('ridibooks', ridibooks_id)
    mi.cover_url = cover_url
    mi.has_cover = bool(cover_url)
    mi.publisher = _format_item(book_info['publisher']['name'])
    mi.pubdate = _format_date(book_info['datePublished'])
    mi.comments = _format_item(book_info['description'])
    mi.rating = float(_find_meta(meta, 'books:rating:normalized_value'))

    # Series information is embedded in the title, e.g. u'... 3권' (volume 3)
    series = re.search(u'(.*)\s*(\d+)권', title)
    if series:
        mi.series = series.group(1)
        mi.series_index = float(series.group(2))

    mi.language = 'Korean'
    mi.source_relevance = self.relevance

    if ridibooks_id:
        if isbn:
            self.plugin.cache_isbn_to_identifier(isbn, ridibooks_id)
        if cover_url:
            self.plugin.cache_identifier_to_cover_url(ridibooks_id, cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
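# Stdlib sketch of the compact-date parsing in _format_date above; the sample
# value '20150203' is hypothetical and timezone.utc stands in for calibre's
# utc_tz:
import datetime

date_text = '20150203'
d = datetime.datetime(int(date_text[0:4]), int(date_text[4:6]),
                      int(date_text[6:]), tzinfo=datetime.timezone.utc)
assert (d.year, d.month, d.day) == (2015, 2, 3)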
def parse(self, raw):
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import parse_only_date, UNDEFINED_DATE
    root = parse_html(raw)
    sku = CSSSelect("div.sku.attGroup")(root)[0]
    info = sku.getparent()
    top = info.getparent().getparent()
    banner = top.find("div")
    spans = banner.findall("span")
    title = ""
    for i, span in enumerate(spans):
        if i == 0 or "12pt" in span.get("style", ""):
            title += astext(span)
        else:
            break
    authors = [re.sub(r"\(.*\)", "", x).strip() for x in astext(spans[-1]).split(",")]
    mi = Metadata(title.strip(), authors)

    # Identifiers
    isbns = [check_isbn(x.strip()) for x in astext(sku).split(",")]
    for isbn in isbns:
        if isbn:
            self.plugin.cache_isbn_to_identifier(isbn, self.sku)
    isbns = sorted(isbns, key=lambda x: len(x) if x else 0, reverse=True)
    if isbns and isbns[0]:
        mi.isbn = isbns[0]
    mi.set_identifier("edelweiss", self.sku)

    # Tags
    bisac = CSSSelect("div.bisac.attGroup")(root)
    if bisac:
        bisac = astext(bisac[0])
        mi.tags = [x.strip() for x in bisac.split(",")]
        mi.tags = [t[1:].strip() if t.startswith("&") else t for t in mi.tags]

    # Publisher
    pub = CSSSelect("div.supplier.attGroup")(root)
    if pub:
        pub = astext(pub[0])
        mi.publisher = pub

    # Pubdate
    pub = CSSSelect("div.shipDate.attGroupItem")(root)
    if pub:
        pub = astext(pub[0])
        parts = pub.partition(":")[0::2]
        pub = parts[1] or parts[0]
        try:
            if ", Ship Date:" in pub:
                pub = pub.partition(", Ship Date:")[0]
            q = parse_only_date(pub, assume_utc=True)
            if q.year != UNDEFINED_DATE:
                mi.pubdate = q
        except:
            self.log.exception("Error parsing published date: %r" % pub)

    # Comments
    comm = ""
    general = CSSSelect("div#pd-general-overview-content")(root)
    if general:
        q = self.render_comments(general[0])
        if q != "<p>No title summary available. </p>":
            comm += q
    general = CSSSelect("div#pd-general-contributor-content")(root)
    if general:
        comm += self.render_comments(general[0])
    general = CSSSelect("div#pd-general-quotes-content")(root)
    if general:
        comm += self.render_comments(general[0])
    if comm:
        mi.comments = comm

    # Cover
    img = CSSSelect("img.title-image[src]")(root)
    if img:
        href = img[0].get("src").replace("jacket_covers/medium/", "jacket_covers/flyout/")
        self.plugin.cache_identifier_to_cover_url(self.sku, href)
    mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None

    return mi
def parse_details(self, raw, root):
    dang_id = parse_dang_id(root, self.log, self.url)
    if not dang_id and root.xpath('//form[@action="/errors/validateCaptcha"]'):
        raise CaptchaError('Amazon returned a CAPTCHA page, probably because you '
                           'downloaded too many books. Wait for some time and try again.')
    if self.testing:
        import tempfile, uuid
        with tempfile.NamedTemporaryFile(prefix=(dang_id or str(uuid.uuid4())) + '_',
                                         suffix='.html', delete=False) as f:
            f.write(raw)
        print('Downloaded html for', dang_id, 'saved in', f.name)

    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None

    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not dang_id:
        self.log.error('Could not find title/authors/dang_id for %r' % self.url)
        self.log.error('ASIN: %r Title: %r Authors: %r' % (dang_id, title, authors))
        return

    mi = Metadata(title, authors)
    idtype = 'dang'
    mi.set_identifier(idtype, dang_id)
    self.dang_id = dang_id

    try:
        mi.comments = self.parse_comments(root, raw)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    try:
        series, series_index = self.parse_series(root)
        if series:
            mi.series, mi.series_index = series, series_index
        elif self.testing:
            mi.series, mi.series_index = 'Dummy series for testing', 1
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)

    try:
        mi.tags = self.parse_tags(root)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    try:
        self.cover_url = self.parse_cover(root, raw)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)

    pd_info = root.xpath(self.pd_info_xpath)
    pd_info_store = root.xpath(self.pd_info_store_xpath)
    pd_desc = root.xpath(self.pd_desc_xpath)

    if pd_info or pd_info_store:
        try:
            isbn = self.parse_isbn(pd_info, pd_info_store, pd_desc)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r' % self.url)

        if pd_info:
            pd_info = pd_info[0]
        else:
            pd_info = pd_info_store[0]

        try:
            mi.publisher = self.parse_publisher(pd_info)
        except:
            self.log.exception('Error parsing publisher for url: %r' % self.url)

        try:
            mi.pubdate = self.parse_pubdate(pd_info)
        except:
            self.log.exception('Error parsing publish date for url: %r' % self.url)
    else:
        self.log.warning('Failed to find product description for url: %r' % self.url)

    mi.source_relevance = self.relevance

    if self.dang_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.dang_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.dang_id, self.cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, raw, root):
    try:
        asin = self.parse_asin(root)
    except:
        self.log.exception('Error parsing asin for url: %r' % self.url)
        asin = None
    if self.testing:
        import tempfile, uuid
        with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4())) + '_',
                                         suffix='.html', delete=False) as f:
            f.write(raw)
        print('Downloaded html for', asin, 'saved in', f.name)

    try:
        title = self.parse_title(root)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None

    try:
        authors = self.parse_authors(root)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not asin:
        self.log.error('Could not find title/authors/asin for %r' % self.url)
        self.log.error('ASIN: %r Title: %r Authors: %r' % (asin, title, authors))
        return

    mi = Metadata(title, authors)
    idtype = 'amazon' if self.domain == 'com' else 'amazon_' + self.domain
    mi.set_identifier(idtype, asin)
    self.amazon_id = asin

    try:
        mi.rating = self.parse_rating(root)
    except:
        self.log.exception('Error parsing ratings for url: %r' % self.url)

    try:
        mi.comments = self.parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    try:
        series, series_index = self.parse_series(root)
        if series:
            mi.series, mi.series_index = series, series_index
        elif self.testing:
            mi.series, mi.series_index = 'Dummy series for testing', 1
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)

    try:
        mi.tags = self.parse_tags(root)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    try:
        self.cover_url = self.parse_cover(root, raw)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)

    non_hero = CSSSelect('div#bookDetails_container_div div#nonHeroSection')(root)
    if non_hero:
        # New style markup
        try:
            self.parse_new_details(root, mi, non_hero[0])
        except:
            self.log.exception('Failed to parse new-style book details section')
    else:
        pd = root.xpath(self.pd_xpath)
        if pd:
            pd = pd[0]

            try:
                isbn = self.parse_isbn(pd)
                if isbn:
                    self.isbn = mi.isbn = isbn
            except:
                self.log.exception('Error parsing ISBN for url: %r' % self.url)

            try:
                mi.publisher = self.parse_publisher(pd)
            except:
                self.log.exception('Error parsing publisher for url: %r' % self.url)

            try:
                mi.pubdate = self.parse_pubdate(pd)
            except:
                self.log.exception('Error parsing publish date for url: %r' % self.url)

            try:
                lang = self.parse_language(pd)
                if lang:
                    mi.language = lang
            except:
                self.log.exception('Error parsing language for url: %r' % self.url)
        else:
            self.log.warning('Failed to find product description for url: %r' % self.url)

    mi.source_relevance = self.relevance

    if self.amazon_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)
        if self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.amazon_id, self.cover_url)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def _GoodreadsBook_to_Metadata(self, book):
    # type: (_GoodreadsBook) -> Metadata
    """
    :param book: _GoodreadsBook: book
    :return: Metadata: Metadata
    """
    mi = Metadata(book.title, book.authors)
    mi.source_relevance = 0
    mi.set_identifier('goodreads', book.id)

    if self.prefs['NEVER_REPLACE_ISBN'] and mi.get_identifiers().get('isbn'):
        mi.set_identifier('isbn', '')

    if book.asin and not self.prefs['NEVER_REPLACE_AMAZONID']:
        mi.set_identifier('amazon', book.asin)

    if book.isbn and not self.prefs['NEVER_REPLACE_ISBN']:
        try:
            if len(book.isbn) == 10:
                mi.isbn = check_isbn13(_ISBNConvert.convert(book.isbn))
            else:
                mi.isbn = check_isbn13(book.isbn)
        except:
            self.log.error("ISBN CONVERSION ERROR:", book.isbn)
            self.log.exception()

    if book.image_url:
        self.log.info('cache_identifier_to_cover_url:', book.asin, ':', book.image_url)
        self.cache_identifier_to_cover_url(book.id, book.image_url)

    if book.publisher:
        self.log.info('book.publisher is:', book.publisher)
        mi.publisher = book.publisher

    if book.pubdate:
        self.log.info('book.pubdate is:', book.pubdate.strftime('%Y-%m-%d'))
        mi.pubdate = book.pubdate

    if book.comments:
        self.log.info('book.editorial_review is:', book.comments)
        mi.comments = book.comments

    tags = self.prefs['ADD_THESE_TAGS'].split(',')
    tags.extend(book.tags)
    # tag_mappings = JSONConfig('plugins/GenreMappings')['genreMappings']
    # mi.tags = list(set(sorted(filter(lambda x: tag_mappings.get(x, x), tags))))

    if book.series:
        mi.series = book.series
        self.log.info(u'series:', book.series)
        if book.series_index:
            mi.series_index = book.series_index
            self.log.info(u'series_index:', "{0:.2f}".format(book.series_index))
        else:
            mi.series_index = 0

    if book.average_rating:
        mi.rating = book.average_rating

    self.clean_downloaded_metadata(mi)
    return mi
def get_details(self):
    '''
    The get_details() function for stripping the website for all information
    '''
    self.log.info("    Worker.get_details:")
    self.log.info("        self: ", self)
    self.log.info("        self.url: ", self.url)

    # Parse the html code from the website
    try:
        raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
    # Do some error handling if it fails to read data
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Bookmeta for saxo timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    # Do some error handling if the html code returned 404
    # (substring test: the raw page is never exactly equal to the marker)
    if "<title>404 - " in raw:
        self.log.error('URL malformed: %r' % self.url)
        return

    # Clean the html data a little
    try:
        root = parse(raw)
    except:
        self.log.error("Error cleaning HTML")
        return

    # Get the json data within the HTML code (some stuff is easier to get with json)
    try:
        json_raw = root.xpath('(//script[@type="application/ld+json"])[2]')
        json_root = json.loads(json_raw[0].text.strip())
        # print(json.dumps(json_root, indent=4, sort_keys=True))
    except:
        self.log.error("Error loading JSON data")
        return

    # Get the title of the book
    try:
        self.title = json_root['name']
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)

    # Get the author of the book
    try:
        author_node = root.xpath('//h2[@class="product-page-heading__autor"]//a')
        for name in author_node:
            self.authors.append(name.text.strip())
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        self.authors = None

    # Some books have ratings, let's use them.
    try:
        self.rating = float(json_root['aggregateRating']['ratingValue'])
    except:
        self.log.exception('Error parsing rating for url: %r' % self.url)
        self.rating = 0.0

    # Get the ISBN number from the site
    try:
        self.isbn = json_root['isbn']
    except:
        self.log.exception('Error parsing isbn for url: %r' % self.url)
        self.isbn = None

    # Get the comments/blurb for the book
    try:
        self.comments = parse_comments(root)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)
        self.comments = None

    # Parse the cover url for downloading the cover.
    try:
        self.cover_url = json_root['image']
        self.log.info('    Parsed URL for cover: %r' % self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.isbn, self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    self.has_cover = bool(self.cover_url)

    # Get the publisher name
    try:
        self.publisher = json_root['publisher']['name']
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)

    # Get the language of the book. Only English and Danish are supported, though
    try:
        language = json_root['inLanguage']['name']
        language = self.lang_map.get(language, None)
        self.language = language
    except:
        self.log.exception('Error parsing language for url: %r' % self.url)

    # Get the published date
    try:
        # pubdate_node = root.xpath('(//dl[@class="product-info-list"]//dd)[2]')  # Format dd-mm-yyyy
        pubdate_node = root.xpath('//div[@class="product-page-block__container"]//dd')  # Format dd-mm-yyyy
        date_str = pubdate_node[0].text.strip()
        format_str = '%d-%m-%Y'  # The format
        self.pubdate = datetime.datetime.strptime(date_str, format_str)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)

    # Set up the metadata
    meta_data = Metadata(self.title, self.authors)
    meta_data.set_identifier('isbn', self.isbn)
    meta_data.set_identifier('saxo', self.url)

    # Set rating
    if self.rating:
        try:
            meta_data.rating = self.rating
        except:
            self.log.exception('Error loading rating')
    # Set ISBN
    if self.isbn:
        try:
            meta_data.isbn = self.isbn
        except:
            self.log.exception('Error loading ISBN')
    # Set relevance
    if self.relevance:
        try:
            meta_data.source_relevance = self.relevance
        except:
            self.log.exception('Error loading relevance')
    # Set cover url
    if self.cover_url:
        try:
            meta_data.cover_url = self.cover_url
        except:
            self.log.exception('Error loading cover_url')
    # Set publisher
    if self.publisher:
        try:
            meta_data.publisher = self.publisher
        except:
            self.log.exception('Error loading publisher')
    # Set language
    if self.language:
        try:
            meta_data.language = self.language
        except:
            self.log.exception('Error loading language')
    # Set comments/blurb
    if self.comments:
        try:
            meta_data.comments = self.comments
        except:
            self.log.exception("Error loading comments")
    # Set pubdate
    if self.pubdate:
        try:
            meta_data.pubdate = self.pubdate
        except:
            self.log.exception('Error loading pubdate')

    # Put the metadata on the result queue
    self.plugin.clean_downloaded_metadata(meta_data)
    self.result_queue.put(meta_data)
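# Date format assumed by the published-date parser above; the sample value is
# hypothetical:
import datetime

assert datetime.datetime.strptime('02-03-2015', '%d-%m-%Y').month == 3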
def parse_details(self, root):
    try:
        antik_id = self.parse_antik_id(root)
        self.log.info('Parsed Antikvarium identifier: %s' % antik_id)
    except:
        self.log.exception('Error parsing Antikvarium id for url: %r' % self.url)
        antik_id = None

    try:
        title = self.parse_title(root)
        self.log.info('Parsed title: %s' % title)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None

    try:
        authors = self.parse_authors(root)
        self.log.info('Parsed authors: %s' % authors)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not antik_id:
        self.log.error('Could not find title/authors/Antikvarium.hu id for %r' % self.url)
        self.log.error('Antikvarium.hu id: %r Title: %r Authors: %r' % (antik_id, title, authors))
        return

    mi = Metadata(title, authors)
    mi.set_identifier('antik_hu', antik_id)
    self.antik_id = antik_id

    try:
        isbn = self.parse_isbn(root)
        self.log.info('Parsed ISBN: %s' % isbn)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r' % self.url)

    try:
        series = self.parse_series(root)
        self.log.info('Parsed series: %s' % series)
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)
        series = None

    try:
        mi.series_index = self.parse_series_index(root)
        self.log.info('Parsed series index: %s' % mi.series_index)
    except:
        self.log.exception('Error parsing series index for url: %r' % self.url)
        mi.series_index = None

    try:
        mi.comments = self.parse_comments(root)
        self.log.info('Parsed comments: %s' % mi.comments)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    try:
        self.cover_url = self.parse_cover(root)
        self.log.info('Parsed URL for cover: %r' % self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.antik_id, self.cover_url)
        mi.has_cover = bool(self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)

    try:
        mi.publisher = self.parse_publisher(root)
        self.log.info('Parsed publisher: %s' % mi.publisher)
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)

    try:
        mi.tags = self.parse_tags(root)
        self.log.info('Parsed tags: %s' % mi.tags)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    try:
        mi.pubdate = self.parse_published_date(root)
        self.log.info('Parsed publication date: %s' % mi.pubdate)
    except:
        self.log.exception('Error parsing published date for url: %r' % self.url)

    try:
        mi.languages = self.parse_languages(root)
        self.log.info('Parsed languages: %r' % mi.languages)
    except:
        self.log.exception('Error parsing languages for url: %r' % self.url)

    mi.source_relevance = self.relevance

    if series:
        mi.series = series

    if self.antik_id and self.isbn:
        self.plugin.cache_isbn_to_identifier(self.isbn, self.antik_id)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
def parse_details(self, root):
    search_data = ''
    isbn = None

    try:
        self.log.info('Parse details: %s' % self.url)
        databazeknih_id = self.parse_databazeknih_id(self.url)
        self.log.info('Parsed DK identifier: %s' % databazeknih_id)
    except:
        self.log.exception('Error parsing databazeknih id for url: %r' % self.url)
        databazeknih_id = None

    try:
        title = self.parse_title(root)
        self.log.info('Parsed title: %s' % title)
    except:
        self.log.exception('Error parsing title for url: %r' % self.url)
        title = None

    try:
        authors = self.parse_authors(root)
        self.log.info('Parsed authors: %s' % authors)
    except:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not databazeknih_id:
        self.log.error('Could not find title/authors/databazeknih id for %r' % self.url)
        self.log.error('DK id: %r Title: %r Authors: %r' % (databazeknih_id, title, authors))
        return

    mi = Metadata(title, authors)
    self.log.info('dbki: %s' % databazeknih_id)
    mi.set_identifier('databazeknih', databazeknih_id)
    self.databazeknih_id = databazeknih_id

    try:
        (mi.series, mi.series_index) = self.parse_series(root)
        self.log.info('Parsed series: %s' % mi.series)
        self.log.info('Parsed series index: %s' % mi.series_index)
    except:
        self.log.exception('Error parsing series for url: %r' % self.url)

    try:
        mi.comments = self.parse_comments(root)
        self.log.info('Parsed comments: %s' % mi.comments)
    except:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    try:
        self.cover_url = self.parse_cover(root)
        self.log.info('Parsed URL for cover: %r' % self.cover_url)
        self.plugin.cache_identifier_to_cover_url(self.databazeknih_id, self.cover_url)
    except:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_url)

    try:
        mi.tags = self.parse_tags(root)
        self.log.info('Parsed tags: %s' % mi.tags)
    except:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    try:
        mi.publisher = self.parse_publisher(root)
        self.log.info('Parsed publisher: %s' % mi.publisher)
    except:
        self.log.exception('Error parsing publisher for url: %r' % self.url)

    try:
        mi.pubdate = self.parse_pubdate(root)
        self.log.info('Parsed pubdate: %s' % mi.pubdate)
    except:
        self.log.exception('Error parsing pubdate for url: %r' % self.url)

    try:
        mi.rating = self.parse_rating(root)
        self.log.info('Parsed rating: %s' % mi.rating)
    except:
        self.log.exception('Error parsing rating for url: %r' % self.url)

    mi.source_relevance = self.relevance

    try:
        isbn = self.parse_isbn(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except:
        self.log.exception('Error parsing ISBN for url: %r' % self.url)

    # Only cache when both the identifier and the ISBN were found;
    # caching a None ISBN would poison the lookup cache
    if self.databazeknih_id and self.isbn:
        self.plugin.cache_isbn_to_identifier(self.isbn, self.databazeknih_id)

    # self.plugin.clean_downloaded_metadata(mi)
    # mi.isbn = check_isbn(mi.isbn)
    self.log.info(mi)
    self.result_queue.put(mi)
def parse_details(self, root): try: legie_id = self.parse_legie_id(self.url) except: self.log.exception('Error parsing Legie id for url: %r' % self.url) legie_id = None try: title = self.parse_title(root) except: self.log.exception('Error parsing title for url: %r' % self.url) title = None try: authors = self.parse_authors(root) except: self.log.exception('Error parsing authors for url: %r' % self.url) authors = [] if not title or not authors or not legie_id: self.log.error('Could not find title/authors/Legie id for %r' % self.url) self.log.error('Legie: %r Title: %r Authors: %r' % (legie_id, title, authors)) return self.legie_id = legie_id rating = comments = series = series_index = None try: rating = self.parse_rating(root) except: self.log.exception('Error parsing ratings for url: %r' % self.url) try: comments = self.parse_comments(root) except: self.log.exception('Error parsing comments for url: %r' % self.url) try: (series, series_index) = self.parse_series(root) except: self.log.info('Series not found.') try: tags = self.parse_tags(root) except: self.log.exception('Error parsing tags for url: %r' % self.url) tags = None if legie_id: editions = self.get_editions() if editions: num_editions = len(editions) self.log.info('Found %d editions' % num_editions) for edition in editions: (year, cover_url, publisher, isbn) = edition mi = Metadata(title, authors) self.legie_id = "%s#%s" % (legie_id, year) mi.set_identifier('legie', self.legie_id) mi.source_relevance = self.relevance mi.rating = rating mi.comments = comments mi.series = series mi.series_index = series_index if cover_url: mi.cover_url = self.cover_url = cover_url self.plugin.cache_identifier_to_cover_url( self.legie_id, self.cover_url) if tags: mi.tags = tags mi.has_cover = bool(self.cover_url) mi.publisher = publisher mi.isbn = isbn mi.pubdate = self.prepare_date(int(year)) mi.language = "ces" self.result_queue.put(mi) else: mi = Metadata(title, authors) mi.set_identifier('legie', self.legie_id) mi.source_relevance = self.relevance mi.rating = rating mi.comments = comments mi.series = series mi.series_index = series_index try: self.cover_url = self.parse_cover(root) except: self.log.exception('Error parsing cover for url: %r' % self.url) if tags: mi.tags = tags mi.has_cover = bool(self.cover_url) # publisher, isbn and year are only known from the editions list, so they cannot be set on this path mi.language = "ces" self.result_queue.put(mi) if self.legie_id: if self.cover_url: self.plugin.cache_identifier_to_cover_url( self.legie_id, self.cover_url)
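# prepare_date() is not shown in this excerpt. A plausible minimal sketch,
# assuming it only has to turn a bare year from the editions list into a
# datetime acceptable as mi.pubdate (mid-day is used so a timezone
# conversion cannot roll the date into the previous year; the DNB snippets
# below avoid day-one midnight for the same reason):
import datetime

def prepare_date(year):
    return datetime.datetime(year, 1, 2, 12, 0, 0)

# prepare_date(1987) -> datetime.datetime(1987, 1, 2, 12, 0)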
def parse_details(self, root): try: goodreads_id = self.parse_goodreads_id(self.url) except: self.log.exception("Error parsing goodreads id for url: %r" % self.url) goodreads_id = None try: (title, series, series_index) = self.parse_title_series(root) except: self.log.exception("Error parsing title and series for url: %r" % self.url) title = series = series_index = None try: authors = self.parse_authors(root) except: self.log.exception("Error parsing authors for url: %r" % self.url) authors = [] if not title or not authors or not goodreads_id: self.log.error("Could not find title/authors/goodreads id for %r" % self.url) self.log.error("Goodreads: %r Title: %r Authors: %r" % (goodreads_id, title, authors)) return mi = Metadata(title, authors) if series: mi.series = series mi.series_index = series_index mi.set_identifier("goodreads", goodreads_id) self.goodreads_id = goodreads_id try: isbn = self.parse_isbn(root) if isbn: self.isbn = mi.isbn = isbn except: self.log.exception("Error parsing ISBN for url: %r" % self.url) try: mi.rating = self.parse_rating(root) except: self.log.exception("Error parsing ratings for url: %r" % self.url) try: mi.comments = self.parse_comments(root) except: self.log.exception("Error parsing comments for url: %r" % self.url) try: self.cover_url = self.parse_cover(root) except: self.log.exception("Error parsing cover for url: %r" % self.url) mi.has_cover = bool(self.cover_url) try: tags = self.parse_tags(root) if tags: mi.tags = tags except: self.log.exception("Error parsing tags for url: %r" % self.url) try: mi.publisher, mi.pubdate = self.parse_publisher_and_date(root) except: self.log.exception("Error parsing publisher and date for url: %r" % self.url) mi.source_relevance = self.relevance if self.goodreads_id: if self.isbn: self.plugin.cache_isbn_to_identifier(self.isbn, self.goodreads_id) if self.cover_url: self.plugin.cache_identifier_to_cover_url(self.goodreads_id, self.cover_url) self.plugin.clean_downloaded_metadata(mi) self.result_queue.put(mi)
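# On success the workers prime two caches on the plugin: ISBN -> source id
# and source id -> cover URL, so the later cover-download phase can resolve
# a cover without re-fetching the details page. A dict-backed sketch of that
# contract (the real methods live on calibre's Source base class; this
# standalone class is illustration only):
class CoverCache:
    def __init__(self):
        self.isbn_to_id = {}
        self.id_to_cover = {}

    def cache_isbn_to_identifier(self, isbn, book_id):
        self.isbn_to_id[isbn] = book_id

    def cache_identifier_to_cover_url(self, book_id, url):
        self.id_to_cover[book_id] = url

    def cached_identifier_to_cover_url(self, book_id):
        return self.id_to_cover.get(book_id)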
def get_details(self): self.log.info(" Worker.get_details:") self.log.info(" self: ", self) self.log.info(" self.url: ", self.url) # We should not even be here if we are not processing an ebook hit if self.url.find("/ebook/") == -1: return try: raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip() except Exception as e: if callable(getattr(e, 'getcode', None)) and e.getcode() == 404: self.log.error('URL malformed: %r' % self.url) return attr = getattr(e, 'args', [None]) attr = attr if attr else [None] if isinstance(attr[0], socket.timeout): msg = 'Beam Ebooks timed out. Try again later.' self.log.error(msg) else: msg = 'Failed to make details query: %r' % self.url self.log.exception(msg) return # raw = raw.decode('utf-8', errors='replace') raw = raw.decode('iso-8859-1', errors='replace') # open('D:\\work\\calibre-dump-book-details.html', 'wb').write(raw) if '<title>404 - ' in raw: self.log.error('URL malformed: %r' % self.url) return try: # root = fromstring(clean_ascii_chars(raw)) root = fromstring(raw) except: msg = 'Failed to parse beam ebooks details page: %r' % self.url self.log.exception(msg) return try: self.beam_ebooks_id = self.parse_beam_ebooks_id(self.url) except: self.log.exception('Error parsing beam ebooks id for url: %r' % self.url) self.beam_ebooks_id = None try: (self.title, self.series_index) = self.parse_title(root) except: self.log.exception('Error parsing title for url: %r' % self.url) self.title = None self.series_index = None try: self.authors = self.parse_authors(root) except: self.log.exception('Error parsing authors for url: %r' % self.url) self.authors = None mi = Metadata(self.title, self.authors) mi.set_identifier('beam-ebooks', self.beam_ebooks_id) if self.series_index: mi.series_index = float(self.series_index) self._determine_perry_rhodan_cycle_name(mi) mi.source_relevance = self.relevance self.plugin.clean_downloaded_metadata(mi) print(mi) self.result_queue.put(mi)
def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30): self.load_config() if authors is None: authors=[] # get identifying tags from book idn = identifiers.get('dnb-idn', None) isbn = check_isbn(identifiers.get('isbn', None)) # ignore unknown authors ignored_authors = [ "V. A.", "V.A.", "Unknown", "Unbekannt" ] for i in ignored_authors: authors = [ x for x in authors if x != i ] if (isbn is None) and (idn is None) and (title is None) and (not authors): log.info("This plugin requires at least either ISBN, IDN, Title or Author(s).") return None queries=[] # DNB does not do an exact search when searching for an idn or isbn, so we have to filter the results exact_search = {} if idn is not None: exact_search['idn'] = idn # if an IDN is given, search for it alone and skip everything else queries.append('num='+idn) else: authors_v = [] title_v = [] # create some variants of given authors if authors != []: authors_v.append(' '.join(self.get_author_tokens(authors,only_first_author=False))) # concat all author names ("Peter Meier Luise Stark") authors_v.append(' '.join(self.get_author_tokens(authors,only_first_author=True))) # use only first author for a in authors: authors_v.append(a) # use all authors, one by one # remove duplicates unique_authors_v = [] for i in authors_v: if i not in unique_authors_v: unique_authors_v.append(i) # create some variants of given title if title is not None: title_v.append(title) # simply use given title title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=False,strip_subtitle=False))) # remove some punctuation characters title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=False,strip_subtitle=True))) # remove subtitle (everything after " : ") title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=False))) # remove some punctuation characters and joiners ("and", "&", ...) title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True))) # remove subtitle (everything after " : ") and joiners ("and", "&", ...) 
# TODO: remove subtitle after " - " # remove duplicates unique_title_v = [] for i in title_v: if i not in unique_title_v: unique_title_v.append(i) # title and author if authors_v != [] and title_v != []: for a in authors_v: for t in title_v: if isbn is not None: queries.append('tit="' + t + '" AND per="' + a + '" AND num="' + isbn + '"') else: queries.append('tit="' + t + '" AND per="' + a + '"') # try with first author as title and title (without subtitle) as author if isbn is not None: queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '" AND num="'+isbn+'"') else: queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '"') # try with author and title (without subtitle) in any index if isbn is not None: queries.append('"' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND "' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '" AND num="'+isbn+'"') else: queries.append('"' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND "' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '"') # author but no title elif authors_v != [] and title_v == []: for i in authors_v: if isbn is not None: queries.append('per="'+ i +'" AND num="' + isbn + '"') else: queries.append('per="'+ i +'"') # try with author as title if isbn is not None: queries.append('tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '" AND num="' + isbn + '"') else: queries.append('tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '"') # title but no author elif authors_v == [] and title_v != []: for i in title_v: if isbn is not None: queries.append('tit="' + i + '" AND num="' + isbn + '"') else: queries.append('tit="' + i + '"') # try with title as author if isbn is not None: queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND num="' + isbn + '"') else: queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '"') # as last resort only use isbn if isbn is not None: queries.append('num=' + isbn) # remove duplicate queries uniqueQueries=[] for i in queries: if i not in uniqueQueries: uniqueQueries.append(i) # Process queries results = None for query in uniqueQueries: # SRU does not work with "+" or "?" 
characters in query, so we simply remove them query = re.sub('[\+\?]','',query) query = query + ' NOT (mat=film OR mat=music OR mat=microfiches OR cod=tt)' log.info(query) if self.cfg_dnb_token is None: results = self.getSearchResultsByScraping(log, query, timeout) else: results = self.getSearchResults(log, query, timeout) if results is None: continue log.info("Parsing records") ns = { 'marc21' : 'http://www.loc.gov/MARC21/slim' } for record in results: series = None series_index = None publisher = None pubdate = None languages = [] title = None title_sort = None authors = [] author_sort = None edition = None comments = None idn = None urn = None isbn = None ddc = [] subjects_gnd = [] subjects_non_gnd = [] publisher_name = None publisher_location = None ##### Field 264 ##### # Publisher Name and Location fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns) if len(fields)>0: publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]",namespaces=ns)[0].text.strip(); publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns)[0].text.strip(); else: fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../..",namespaces=ns) if len(fields)>0: publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]",namespaces=ns)[0].text.strip(); else: fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='a' and string-length(text())>0]/../..",namespaces=ns) if len(fields)>0: publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns)[0].text.strip(); # Publishing Date for i in record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='c' and string-length(text())>=4]",namespaces=ns): match = re.search("(\d{4})", i.text.strip()) if match is not None: year = match.group(1) pubdate = datetime.datetime(int(year), 1, 1, 12 , 30, 0) break # Log if publisher_name is not None: log.info("Extracted Publisher: %s" % publisher_name) if publisher_location is not None: log.info("Extracted Publisher Location: %s" % publisher_location) if pubdate is not None: log.info("Extracted Publication Year: %s" % pubdate) ##### Field 245 #### # Title/Series/Series_Index title_parts = [] for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns): # if a,n,p,n,p,n,p exist: series = a + n0 + " - " + p0 + n1 + " - " + p1, series_index = n2, title = p2 # if a,n,p,n,p exist: series = a + n0 + " - " + p0, series_index = n1, title = p1 (Example: dnb-id 1008774839) # if a,n,p exist: series = a, series_index = n, title = p # if a exist: title = a # TODO: a,n,p,n (i.e. 
956375146) code_p = [] code_n = [] code_a = [] for j in i.xpath(".//marc21:subfield[@code='p']",namespaces=ns): code_p.append(j.text.strip()) for j in i.xpath(".//marc21:subfield[@code='n']",namespaces=ns): match = re.search("(\d+[,\.\d+]?)", j.text.strip()) if match: code_n.append(match.group(1)) else: code_n.append("0") # looks like sometimes DNB does not know the series index and uses something like "[...]" for j in i.xpath(".//marc21:subfield[@code='a']",namespaces=ns): code_a.append(j.text.strip()) if len(code_p) == 0: title_parts = title_parts + code_a elif len(code_p)>0 and len(code_p) == len(code_n): series = " : ".join(code_a) # I've never seen more than one code_a, but who knows... for x in range(0, len(code_p)-1): series = series + " " + code_n[x] + " " + code_p[x] series_index = code_n[-1] title_parts.append(code_p[-1]) # subtitle 1: Field 245 for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='b' and string-length(text())>0]",namespaces=ns): title_parts.append(i.text.strip()) break # subtitle 2 #for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='c' and string-length(text())>0]",namespaces=ns): # title = title + " / " + i.text.strip() # break title = " : ".join(title_parts) # Log if series_index is not None: log.info("Extracted Series_Index from Field 245: %s" % series_index) if series is not None: log.info("Extracted Series from Field 245: %s" % series) series = self.cleanUpSeries(log, series, publisher_name) if title is not None: log.info("Extracted Title: %s" % title) title = self.cleanUpTitle(log, title) # Title_Sort if len(title_parts)>0: title_sort_parts = list(title_parts) title_sort_regex = re.match('^(.*?)('+chr(152)+'.*'+chr(156)+')?(.*?)$',title_parts[0]) sortword = title_sort_regex.group(2) if sortword: title_sort_parts[0] = ''.join(filter(None,[title_sort_regex.group(1).strip(),title_sort_regex.group(3).strip(),", "+sortword])) title_sort = " : ".join(title_sort_parts) # Log if title_sort is not None: log.info("Extracted Title_Sort: %s" % title_sort) ##### Field 100 and Field 700 ##### # Authors for i in record.xpath(".//marc21:datafield[@tag='100']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): # primary authors name = re.sub(" \[.*\]$","",i.text.strip()) authors.append(name) for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): # secondary authors name = re.sub(" \[.*\]$","",i.text.strip()) authors.append(name) if len(authors)==0: # if no "real" author was found take all persons involved for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): # secondary authors name = re.sub(" \[.*\]$","",i.text.strip()) authors.append(name) if len(authors)>0: author_sort = authors[0] # Log if len(authors)>0: log.info("Extracted Authors: %s" % " & ".join(authors)) if author_sort is not None: log.info("Extracted Author_Sort: %s" % author_sort) ##### Field 856 ##### # Comments for i in record.xpath(".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]",namespaces=ns): if i.text.startswith("http://deposit.dnb.de/"): br = self.browser log.info('Downloading Comments from: %s' % i.text) try: comments = br.open_novisit(i.text, timeout=30).read() comments = re.sub('(\s|<br>|<p>|\n)*Angaben aus der 
Verlagsmeldung(\s|<br>|<p>|\n)*(<h3>.*?</h3>)?(\s|<br>|<p>|\n)*','',comments,flags=re.IGNORECASE) comments = sanitize_comments_html(comments) break except: log.info("Could not download Comments from %s" % i.text) # Log if comments is not None: log.info('Comments: %s' % comments) # If no comments are found for this edition, look at other editions of this book (Fields 776) # TODO: Make this configurable (default: yes) if comments is None: # get all other issues for i in record.xpath(".//marc21:datafield[@tag='776']/marc21:subfield[@code='w' and string-length(text())>0]",namespaces=ns): other_idn = re.sub("^\(.*\)","",i.text.strip()) subquery = 'num='+other_idn+' NOT (mat=film OR mat=music OR mat=microfiches OR cod=tt)' log.info(subquery) if self.cfg_dnb_token is None: subresults = self.getSearchResultsByScraping(log, subquery, timeout) else: subresults = self.getSearchResults(log, subquery, timeout) if subresults is None: continue for subrecord in subresults: for i in subrecord.xpath(".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]",namespaces=ns): if i.text.startswith("http://deposit.dnb.de/"): br = self.browser log.info('Downloading Comments from: %s' % i.text) try: comments = br.open_novisit(i.text, timeout=30).read() comments = re.sub('(\s|<br>|<p>|\n)*Angaben aus der Verlagsmeldung(\s|<br>|<p>|\n)*(<h3>.*?</h3>)?(\s|<br>|<p>|\n)*','',comments,flags=re.IGNORECASE) comments = sanitize_comments_html(comments) break except: log.info("Could not download Comments from %s" % i.text) if comments is not None: log.info('Comments from other issue: %s' % comments) break ##### Field 016 ##### # ID: IDN for i in record.xpath(".//marc21:datafield[@tag='016']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): idn = i.text.strip() break # Log if idn is not None: log.info("Extracted ID IDN: %s" % idn) ##### Field 024 ##### # ID: URN for i in record.xpath(".//marc21:datafield[@tag='024']/marc21:subfield[@code='2' and text()='urn']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): urn = i.text.strip() break # Log if urn is not None: log.info("Extracted ID URN: %s" % urn) ##### Field 020 ##### # ID: ISBN for i in record.xpath(".//marc21:datafield[@tag='020']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): isbn_regex = "(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]" match = re.search(isbn_regex, i.text.strip()) if match is not None: isbn = match.group().replace('-','') break # Log if isbn is not None: log.info("Extracted ID ISBN: %s" % isbn) # When doing an exact search for a given ISBN skip books with wrong ISBNs if isbn is not None and "isbn" in exact_search: if isbn != exact_search["isbn"]: log.info("Extracted ISBN does not match book's ISBN, skipping record") continue ##### Field 082 ##### # ID: Sachgruppe (DDC) for i in record.xpath(".//marc21:datafield[@tag='082']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): ddc.append(i.text.strip()) # Log if len(ddc)>0: log.info("Extracted ID DDC: %s" % ",".join(ddc)) ##### Field 490 ##### # In theory this field is not used for "real" book series, use field 830 instead. But it is used. # Series and Series_Index if series is None or (series is not None and series_index == "0"): for i in record.xpath(".//marc21:datafield[@tag='490']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns): # "v" either "Nr. 
220" or "This great Seriestitle : Nr. 220" - if available use this instead of attribute a attr_v = i.xpath(".//marc21:subfield[@code='v']",namespaces=ns)[0].text.strip() parts = re.split(" : ",attr_v) if len(parts)==2: if bool(re.search("\d",parts[0])) != bool(re.search("\d",parts[1])): # figure out which part contains the index if bool(re.search("\d",parts[0])): indexpart = parts[0] textpart = parts[1] else: indexpart = parts[1] textpart = parts[0] match = re.search("(\d+[,\.\d+]?)", indexpart) if match is not None: series_index = match.group(1) series = textpart.strip() else: match = re.search("(\d+[,\.\d+]?)", attr_v) if match is not None: series_index = match.group(1) else: series_index = "0" series_index = series_index.replace(',','.') # Use Series Name from attribute "a" if not already found in attribute "v" if series is None: series = i.xpath(".//marc21:subfield[@code='a']",namespaces=ns)[0].text.strip() # Log if series_index is not None: log.info("Extracted Series Index from Field 490: %s" % series_index) if series is not None: log.info("Extracted Series from Field 490: %s" % series) series = self.cleanUpSeries(log, series, publisher_name) if series is not None: break ##### Field 246 ##### # Series and Series_Index if series is None or (series is not None and series_index == "0"): for i in record.xpath(".//marc21:datafield[@tag='246']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): match = re.search("^(.+?) ; (\d+[,\.\d+]?)$", i.text.strip()) if match is not None: series = match.group(1) series_index = match.group(2) # Log if series_index is not None: log.info("Extracted Series Index from Field 246: %s" % series_index) if series is not None: log.info("Extracted Series from Field 246: %s" % series) series = self.cleanUpSeries(log, series, publisher_name) if series is not None: break ##### Field 800 ##### # Series and Series_Index if series is None or (series is not None and series_index == "0"): for i in record.xpath(".//marc21:datafield[@tag='800']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='t' and string-length(text())>0]/..",namespaces=ns): # Series Index series_index = i.xpath(".//marc21:subfield[@code='v']",namespaces=ns)[0].text.strip() match = re.search("(\d+[,\.\d+]?)", series_index) if match is not None: series_index = match.group(1) else: series_index = "0" series_index = series_index.replace(',','.') # Series series = i.xpath(".//marc21:subfield[@code='t']",namespaces=ns)[0].text.strip() # Log if series_index is not None: log.info("Extracted Series Index from Field 800: %s" % series_index) if series is not None: log.info("Extracted Series from Field 800: %s" % series) series = self.cleanUpSeries(log, series, publisher_name) if series is not None: break ##### Field 830 ##### # Series and Series_Index if series is None or (series is not None and series_index == "0"): for i in record.xpath(".//marc21:datafield[@tag='830']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns): # Series Index series_index = i.xpath(".//marc21:subfield[@code='v']",namespaces=ns)[0].text.strip() match = re.search("(\d+[,\.\d+]?)", series_index) if match is not None: series_index = match.group(1) else: series_index = "0" series_index = series_index.replace(',','.') # Series series = i.xpath(".//marc21:subfield[@code='a']",namespaces=ns)[0].text.strip() # Log if series_index is not None: log.info("Extracted Series Index from Field 830: %s" % series_index) if 
series is not None: log.info("Extracted Series from Field 830: %s" % series) series = self.cleanUpSeries(log, series, publisher_name) if series is not None: break ##### Field 689 ##### # GND Subjects for i in record.xpath(".//marc21:datafield[@tag='689']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): subjects_gnd.append(i.text.strip()) for f in range(600,656): for i in record.xpath(".//marc21:datafield[@tag='"+str(f)+"']/marc21:subfield[@code='2' and text()='gnd']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): if i.text.startswith("("): continue subjects_gnd.append(i.text) # Log if len(subjects_gnd)>0: log.info("Extracted GND Subjects: %s" % " ".join(subjects_gnd)) ##### Fields 600-655 ##### # Non-GND subjects for f in range(600,656): for i in record.xpath(".//marc21:datafield[@tag='"+str(f)+"']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): # ignore entries starting with "(": if i.text.startswith("("): continue subjects_non_gnd.extend(re.split(',|;',i.text)) # remove one-character subjects: for i in subjects_non_gnd: if len(i)<2: subjects_non_gnd.remove(i) # Log if len(subjects_non_gnd)>0: log.info("Extracted non-GND Subjects: %s" % " ".join(subjects_non_gnd)) ##### Field 250 ##### # Edition for i in record.xpath(".//marc21:datafield[@tag='250']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): edition = i.text.strip() break # Log if edition is not None: log.info("Extracted Edition: %s" % edition) ##### Field 41 ##### # Languages for i in record.xpath(".//marc21:datafield[@tag='041']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): languages.append(i.text.strip()) # Log if languages is not None: log.info("Extracted Languages: %s" % ",".join(languages)) ##### If configured: Try to separate Series, Series Index and Title from the fetched title ##### #if self.cfg_guess_series is True: if (series is None or (series is not None and series_index == "0")) and self.cfg_guess_series is True: guessed_series = None guessed_series_index = None guessed_title = None log.info("Starting Series Guesser") parts = re.split("[:]",self.removeSortingCharacters(title)) if len(parts)==2: log.info("Title has two parts") # make sure only one part of the two parts contains digits if bool(re.search("\d",parts[0])) != bool(re.search("\d",parts[1])): log.info("only one title part contains digits") # figure out which part contains the index if bool(re.search("\d",parts[0])): indexpart = parts[0] textpart = parts[1] else: indexpart = parts[1] textpart = parts[0] # Look at the part without digits: match = re.match("^[\s\-–:]*(.+?)[\s\-–:]*$",textpart) # remove odd characters from start and end of the text part if match: textpart = match.group(1) # Look at the part with digits: # for Titleparts like: "Name of the series - Episode 2" match = re.match("^\s*(\S\D*?[a-zA-Z]\D*?)\W[\(\/\.,\s\-–:]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",indexpart) if match: guessed_series_index = match.group(2) guessed_series = match.group(1) if guessed_series is None: guessed_series = textpart guessed_title = textpart + " : Band " + guessed_series_index else: guessed_title = textpart #log.info("ALGO1: guessed_title: " + guessed_title) #log.info("ALGO1: guessed_series: " + guessed_series) #log.info("ALGO1: guessed_series_index: " + guessed_series_index) else: # for Titleparts like: "Episode 2 Name of the series" match = 
re.match("^\s*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*(\S\D*?[a-zA-Z]\D*?)[\/\.,\-–\s]*$",indexpart) if match: guessed_series_index = match.group(1) guessed_series = match.group(2) if guessed_series is None: # sometimes books with multiple volumes are detected as series without name -> Add the volume to the title guessed_series = textpart guessed_title = textpart + " : Band " + guessed_series_index else: guessed_title = textpart #log.info("ALGO2: guessed_title: " + guessed_title) #log.info("ALGO2: guessed_series: " + guessed_series) #log.info("ALGO2: guessed_series_index: " + guessed_series_index) else: # for titleparts like: "Band 2" match = re.match("^[\s\(]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*[\/\.,\-–\s]*$",indexpart) if match: guessed_series_index = match.group(1) # ...with textpart like NAME OF SERIES\s[\-\.;:]\sNAME OF TITLE # some false positives match = re.match("^\s*(\w+.+?)\s?[\.;\-–:]+\s(\w+.+)\s*$",textpart) if match: guessed_series = match.group(1) guessed_title = match.group(2) log.info("ALGO3: guessed_title: " + guessed_title) log.info("ALGO3: guessed_series: " + guessed_series) log.info("ALGO3: guessed_series_index: " + guessed_series_index) elif len(parts)==1: log.info("Title has one part") # for Titles like: "Name of the series - Title (Episode 2)" match = re.match("^\s*(\S.+?) \- (\S.+?) [\(\/\.,\s\-–:](?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",parts[0]) if match: guessed_series_index = match.group(3) guessed_series = match.group(1) guessed_title = match.group(2) #log.info("ALGO4: guessed_title: " + guessed_title) #log.info("ALGO4: guessed_series: " + guessed_series) #log.info("ALGO4: guessed_series_index: " + guessed_series_index) else: # for Titles like: "Name of the series - Episode 2" match = re.match("^\s*(\S.+?)[\(\/\.,\s\-–:]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",parts[0]) if match: guessed_series_index = match.group(2) guessed_series = match.group(1) guessed_title = guessed_series + " : Band " + guessed_series_index #log.info("ALGO5: guessed_title: " + guessed_title) #log.info("ALGO5: guessed_series: " + guessed_series) #log.info("ALGO5: guessed_series_index: " + guessed_series_index) # Log if guessed_series is not None: log.info("Guessed Series: %s" % guessed_series) #guessed_series = self.cleanUpSeries(log, guessed_series, publisher_name) if guessed_series_index is not None: log.info("Guessed Series Index: %s" % guessed_series_index) if guessed_title is not None: log.info("Guessed Title: %s" % guessed_title) guessed_title = self.cleanUpTitle(log, guessed_title) if guessed_series is not None and guessed_series_index is not None and guessed_title is not None: title = guessed_title series = guessed_series series_index = guessed_series_index ##### Filter exact searches ##### # When doing an exact search for a given IDN skip books with wrong IDNs # TODO: Currently exact_search for ISBN is not implemented. 
Would require ISBN-10 and ISBN-13 conversions if idn is not None and "idn" in exact_search: if idn != exact_search["idn"]: log.info("Extracted IDN does not match book's IDN, skipping record") continue ##### Put it all together ##### if self.cfg_append_edition_to_title and edition is not None: title = title + " : " + edition mi = Metadata(self.removeSortingCharacters(title), [self.removeSortingCharacters(a) for a in authors]) mi.title_sort = self.removeSortingCharacters(title_sort) mi.author_sort = self.removeSortingCharacters(author_sort) mi.languages = languages mi.pubdate = pubdate mi.publisher = " ; ".join(filter(None,[publisher_location, self.removeSortingCharacters(publisher_name)])) mi.series = self.removeSortingCharacters(series) mi.series_index = series_index mi.comments = comments mi.isbn = isbn # also required for cover download mi.set_identifier('urn',urn) mi.set_identifier('dnb-idn',idn) mi.set_identifier('ddc', ",".join(ddc)) # cfg_subjects: # 0: use only subjects_gnd if self.cfg_fetch_subjects == 0: mi.tags = self.uniq(subjects_gnd) # 1: use only subjects_gnd if found, else subjects_non_gnd elif self.cfg_fetch_subjects == 1: if len(subjects_gnd)>0: mi.tags = self.uniq(subjects_gnd) else: mi.tags = self.uniq(subjects_non_gnd) # 2: subjects_gnd and subjects_non_gnd elif self.cfg_fetch_subjects == 2: mi.tags = self.uniq(subjects_gnd + subjects_non_gnd) # 3: use only subjects_non_gnd if found, else subjects_gnd elif self.cfg_fetch_subjects == 3: if len(subjects_non_gnd)>0: mi.tags = self.uniq(subjects_non_gnd) else: mi.tags = self.uniq(subjects_gnd) # 4: use only subjects_non_gnd elif self.cfg_fetch_subjects == 4: mi.tags = self.uniq(subjects_non_gnd) # 5: use no subjects at all elif self.cfg_fetch_subjects == 5: mi.tags = [] # put current result's metadata into result queue log.info("Final formatted result: \n%s" % mi) result_queue.put(mi)
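# Neither uniq() nor removeSortingCharacters() appears in this excerpt.
# Plausible minimal sketches, assuming uniq() is an order-preserving
# de-duplicator (it is applied to the tag lists above) and that the sorting
# characters are the C1 controls 0x98/0x9c that the title_sort regex above
# also keys on (DNB wraps non-sorting article prefixes in them):
def uniq(seq):
    seen = set()
    return [x for x in seq if not (x in seen or seen.add(x))]

def removeSortingCharacters(text):
    if text is None:
        return None
    return text.replace(chr(152), '').replace(chr(156), '')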
def get_metadata_(src, encoding=None): # Meta data definitions as in # https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9 if isbytestring(src): if not encoding: src = xml_to_unicode(src)[0] else: src = src.decode(encoding, 'replace') src = src[:150000] # Searching shouldn't take too long comment_tags, meta_tags, meta_tag_ids, title_tag = parse_metadata(src) def get_all(field): ans = comment_tags.get(field, meta_tags.get(field, None)) if ans: ans = [x.strip() for x in ans if x.strip()] if not ans: ans = None return ans def get(field): ans = get_all(field) if ans: ans = ans[0] return ans # Title title = get('title') or title_tag.strip() or _('Unknown') # Author authors = authors_to_string(get_all('authors')) or _('Unknown') # Create MetaInformation with Title and Author mi = Metadata(title, string_to_authors(authors)) # Single-value text fields for field in ('publisher', 'isbn'): val = get(field) if val: setattr(mi, field, val) # Multi-value text fields for field in ('languages', ): val = get_all(field) if val: setattr(mi, field, val) # HTML fields for field in ('comments', ): val = get(field) if val: setattr( mi, field, val.replace('&amp;', '&').replace('&lt;', '<').replace( '&gt;', '>').replace('&quot;', '"').replace('&apos;', "'")) # Date fields for field in ('pubdate', 'timestamp'): try: val = parse_date(get(field)) except: pass else: if not is_date_undefined(val): setattr(mi, field, val) # SERIES series = get('series') if series: pat = re.compile(r'\[([.0-9]+)\]$') match = pat.search(series) series_index = None if match is not None: try: series_index = float(match.group(1)) except: pass series = series.replace(match.group(), '').strip() mi.series = series if series_index is None: series_index = get('series_index') try: series_index = float(series_index) except: pass if series_index is not None: mi.series_index = series_index # RATING rating = get('rating') if rating: try: mi.rating = float(rating) if mi.rating < 0: mi.rating = 0 if mi.rating > 10: mi.rating = 0 except: pass # TAGS tags = get_all('tags') if tags: tags = [x.strip() for s in tags for x in s.split(',') if x.strip()] if tags: mi.tags = tags # IDENTIFIERS for (k, v) in iteritems(meta_tag_ids): v = [x.strip() for x in v if x.strip()] if v: mi.set_identifier(k, v[0]) return mi
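# Worked example of the series convention parsed above (input invented):
# a 'series' meta value ending in '[index]' is split into name and index.
import re

series = 'Foundation [3.0]'
match = re.compile(r'\[([.0-9]+)\]$').search(series)
series_index = float(match.group(1)) if match else None
if match:
    series = series.replace(match.group(), '').strip()
assert (series, series_index) == ('Foundation', 3.0)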
def parse_details(self, root): try: kyobobook_id = self.parse_kyobobook_id(self.url) except: self.log.exception('Error parsing Kyobobook id for url: %r'%self.url) kyobobook_id = None try: (title, series, series_index) = self.parse_title_series(root) except: self.log.exception('Error parsing title and series for url: %r'%self.url) title = series = series_index = None try: authors = self.parse_authors(root) except: self.log.exception('Error parsing authors for url: %r'%self.url) authors = [] if not title or not authors or not kyobobook_id: self.log.error('Could not find title/authors/kyobobook id for %r'%self.url) self.log.error('Kyobobook: %r Title: %r Authors: %r'%(kyobobook_id, title, authors)) return mi = Metadata(title, authors) if series: mi.series = series mi.series_index = series_index mi.set_identifier('kyobobook', kyobobook_id) self.kyobobook_id = kyobobook_id try: isbn = self.parse_isbn(root) if isbn: self.isbn = mi.isbn = isbn except: self.log.exception('Error parsing ISBN for url: %r'%self.url) try: mi.rating = self.parse_rating(root) except: self.log.exception('Error parsing ratings for url: %r'%self.url) try: mi.comments = self.parse_comments(root) except: self.log.exception('Error parsing comments for url: %r'%self.url) try: self.cover_url = self.parse_cover(root) except: self.log.exception('Error parsing cover for url: %r'%self.url) mi.has_cover = bool(self.cover_url) try: tags = self.parse_tags(root) if tags: mi.tags = tags except: self.log.exception('Error parsing tags for url: %r'%self.url) try: mi.publisher, mi.pubdate = self.parse_publisher_and_date(root) except: self.log.exception('Error parsing publisher and date for url: %r'%self.url) try: lang = self._parse_language(root) if lang: mi.language = lang except: self.log.exception('Error parsing language for url: %r'%self.url) mi.source_relevance = self.relevance if self.kyobobook_id: if self.isbn: self.plugin.cache_isbn_to_identifier(self.isbn, self.kyobobook_id) if self.cover_url: self.plugin.cache_identifier_to_cover_url(self.kyobobook_id, self.cover_url) self.plugin.clean_downloaded_metadata(mi) self.result_queue.put(mi)
def parse_details(self, raw, root): try: asin = self.parse_asin(root) except: self.log.exception('Error parsing asin for url: %r'%self.url) asin = None if self.testing: import tempfile, uuid with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_', suffix='.html', delete=False) as f: f.write(raw) print ('Downloaded html for', asin, 'saved in', f.name) try: title = self.parse_title(root) except: self.log.exception('Error parsing title for url: %r'%self.url) title = None try: authors = self.parse_authors(root) except: self.log.exception('Error parsing authors for url: %r'%self.url) authors = [] if not title or not authors or not asin: self.log.error('Could not find title/authors/asin for %r'%self.url) self.log.error('ASIN: %r Title: %r Authors: %r'%(asin, title, authors)) return mi = Metadata(title, authors) idtype = 'amazon' if self.domain == 'com' else 'amazon_'+self.domain mi.set_identifier(idtype, asin) self.amazon_id = asin try: mi.rating = self.parse_rating(root) except: self.log.exception('Error parsing ratings for url: %r'%self.url) try: mi.comments = self.parse_comments(root) except: self.log.exception('Error parsing comments for url: %r'%self.url) try: series, series_index = self.parse_series(root) if series: mi.series, mi.series_index = series, series_index elif self.testing: mi.series, mi.series_index = 'Dummy series for testing', 1 except: self.log.exception('Error parsing series for url: %r'%self.url) try: mi.tags = self.parse_tags(root) except: self.log.exception('Error parsing tags for url: %r'%self.url) try: self.cover_url = self.parse_cover(root, raw) except: self.log.exception('Error parsing cover for url: %r'%self.url) mi.has_cover = bool(self.cover_url) non_hero = CSSSelect('div#bookDetails_container_div div#nonHeroSection')(root) if non_hero: # New style markup try: self.parse_new_details(root, mi, non_hero[0]) except: self.log.exception('Failed to parse new-style book details section') else: pd = root.xpath(self.pd_xpath) if pd: pd = pd[0] try: isbn = self.parse_isbn(pd) if isbn: self.isbn = mi.isbn = isbn except: self.log.exception('Error parsing ISBN for url: %r'%self.url) try: mi.publisher = self.parse_publisher(pd) except: self.log.exception('Error parsing publisher for url: %r'%self.url) try: mi.pubdate = self.parse_pubdate(pd) except: self.log.exception('Error parsing publish date for url: %r'%self.url) try: lang = self.parse_language(pd) if lang: mi.language = lang except: self.log.exception('Error parsing language for url: %r'%self.url) else: self.log.warning('Failed to find product description for url: %r'%self.url) mi.source_relevance = self.relevance if self.amazon_id: if self.isbn: self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id) if self.cover_url: self.plugin.cache_identifier_to_cover_url(self.amazon_id, self.cover_url) self.plugin.clean_downloaded_metadata(mi) self.result_queue.put(mi)
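# The Amazon worker above keys its identifier by storefront: plain 'amazon'
# for the .com domain, 'amazon_<domain>' otherwise. The same mapping as a
# tiny standalone function (the function name is invented for illustration):
def amazon_id_type(domain):
    return 'amazon' if domain == 'com' else 'amazon_' + domain

assert amazon_id_type('com') == 'amazon'
assert amazon_id_type('de') == 'amazon_de'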
def parse_details(self, root): try: title = self.parse_title(root) except: self.log.exception('Error parsing title for query: %r' % self.query) title = None if not title: self.log.error('Could not find title for %r' % self.query) try: authors = self.parse_authors(root) except: self.log.exception('Error parsing authors for query: %r' % self.query) authors = [] if not authors: self.log.error('Could not find authors for %r' % self.query) return mi = Metadata(title, authors) try: isbn = self.parse_isbn(root) if isbn: # match 10 or 13 digits at the start, followed by a space or nothing p = re.compile(r'^([0-9]{13}|[0-9]{10})(?= |\Z)') if isinstance(isbn, str): m = p.match(isbn) if m: mi.isbn = m.group() else: m = p.match(isbn[0]) if m: mi.isbn = m.group() except: self.log.exception('Error parsing ISBN for url: %r' % self.url) try: lang = self.parse_language(root) if lang: mi.languages = lang except: self.log.exception('Error parsing language for url: %r' % self.url) try: lccn = self.parse_lccn(root) if lccn: if isinstance(lccn, str): mi.set_identifier('lccn', lccn) else: for identifier in lccn: mi.set_identifier('lccn', identifier) except: self.log.exception('Error parsing LCCN for url: %r' % self.url) try: ddc = self.parse_ddc(root) if ddc: if isinstance(ddc, str): mi.set_identifier('ddc', ddc) else: for identifier in ddc: mi.set_identifier('ddc', identifier) except: self.log.exception('Error parsing DDC for url: %r' % self.url) try: lcc = self.parse_lcc(root) if lcc: if isinstance(lcc, str): mi.set_identifier('lcc', lcc) else: for identifier in lcc: mi.set_identifier('lcc', identifier) except: self.log.exception('Error parsing LCC for url: %r' % self.url) mi.source_relevance = self.relevance self.result_queue.put(mi)
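# Demo of the ISBN prefix match above (inputs invented): only a bare 10- or
# 13-digit run at the start qualifies, followed by a space or end of string.
import re

p = re.compile(r'^([0-9]{13}|[0-9]{10})(?= |\Z)')
assert p.match('9789025363186').group() == '9789025363186'
assert p.match('9025363180 (pbk)').group() == '9025363180'
assert p.match('90-253-6318-0') is None  # hyphenated forms are rejected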
def parse_details(self, root): isfdb_id = None title = None authors = [] isbn = None publisher = None pubdate = None try: isfdb_id = re.search('(\d+)$', self.url).groups(0)[0] except: self.log.exception('Error parsing ISFDB ID for url: %r' % self.url) detail_nodes = root.xpath('//div[@id="content"]//td[@class="pubheader"]/ul/li') if not detail_nodes: detail_nodes = root.xpath('//div[@id="content"]/div/ul/li') # no table (on records with no image) for detail_node in detail_nodes: section = detail_node[0].text_content().strip().rstrip(':') #self.log.info(section) try: if section == 'Publication': title = detail_node[0].tail.strip() if not title: # assume an extra span with a transliterated title tooltip title = detail_node[1].text_content().strip() #self.log.info(title) elif section == 'Authors' or section == 'Editors': for a in detail_node.xpath('.//a'): author = a.text_content().strip() if section.startswith('Editors'): authors.append(author + ' (Editor)') else: authors.append(author) #self.log.info(authors) elif section == 'ISBN': isbn = detail_node[0].tail.strip('[] \n') #self.log.info(isbn) elif section == 'Publisher': publisher = detail_node.xpath('a')[0].text_content().strip() #self.log.info(publisher) elif section == 'Date': pubdate = self._convert_date_text(detail_node[0].tail.strip()) #self.log.info(pubdate) except: self.log.exception('Error parsing section %r for url: %r' % (section, self.url) ) if not title or not authors or not isfdb_id: self.log.error('Could not find title/authors/ISFDB ID for %r' % self.url) self.log.error('ISFDB: %r Title: %r Authors: %r' % (isfdb_id, title, authors)) return mi = Metadata(title, authors) mi.set_identifier('isfdb', isfdb_id) self.isfdb_id = isfdb_id if isbn: self.isbn = mi.isbn = isbn if publisher: mi.publisher = publisher if pubdate: mi.pubdate = pubdate try: mi.comments = self.parse_comments(root) except: self.log.exception('Error parsing comments for url: %r'%self.url) try: self.cover_url = self.parse_cover(root) except: self.log.exception('Error parsing cover for url: %r'%self.url) mi.has_cover = bool(self.cover_url) mi.cover_url = self.cover_url # This is purely so we can run a test for it!!! mi.source_relevance = self.relevance if self.isfdb_id: if self.isbn: self.plugin.cache_isbn_to_identifier(self.isbn, self.isfdb_id) self.plugin.clean_downloaded_metadata(mi) self.result_queue.put(mi)
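# Demo of the ISFDB id extraction above: the record id is the trailing run
# of digits in the publication URL (URL invented for illustration):
import re

url = 'http://www.isfdb.org/cgi-bin/pl.cgi?503102'
isfdb_id = re.search(r'(\d+)$', url).group(1)
assert isfdb_id == '503102'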
def get_details(self): self.log.info(" Worker.get_details:") self.log.info(" self: ", self) self.log.info(" self.url: ", self.url) try: raw = self.browser.open_novisit( self.url, timeout=self.timeout).read().strip() self.log.info(raw) except Exception as e: if callable(getattr(e, 'getcode', None)) and e.getcode() == 404: self.log.error('URL malformed: %r' % self.url) return attr = getattr(e, 'args', [None]) attr = attr if attr else [None] if isinstance(attr[0], socket.timeout): msg = 'Bookmeta for biblionet timed out. Try again later.' self.log.error(msg) else: msg = 'Failed to make details query: %r' % self.url self.log.exception(msg) return if '<title>404 - ' in raw: self.log.error('URL malformed: %r' % self.url) return try: # root = fromstring(clean_ascii_chars(raw)) root = json.loads(raw) self.log.info(root) except: msg = 'Failed to parse book detail page: %r' % self.url self.log.exception(msg) return try: self.biblionetid = root['biblionetid'] except: self.log.exception('Error parsing book id for url: %r' % self.url) self.biblionetid = None try: self.title = root['title'].strip() except: self.log.exception('Error parsing title for url: %r' % self.url) self.title = None self.series_index = None try: self.authors = [root['authors'].strip()] self.log.info(self.authors) except: self.log.exception('Error parsing authors for url: %r' % self.url) self.authors = None try: self.cover_url = root['cover_url'] self.log.info('Parsed URL for cover:%r' % self.cover_url) self.plugin.cache_identifier_to_cover_url(self.biblionetid, self.cover_url) except: self.log.exception('Error parsing cover for url: %r' % self.url) self.has_cover = bool(self.cover_url) try: self.publisher = root['publisher'] self.log.info('Parsed publisher:%s' % self.publisher) except: self.log.exception('Error parsing publisher for url: %r' % self.url) try: self.tags = root['categories'].replace('DDC: ', 'DDC:').replace( '-', '').split()[:-1] self.log.info('Parsed tags:%s' % self.tags) except: self.log.exception('Error parsing tags for url: %r' % self.url) try: self.pubdate = root['yr_published'] self.log.info('Parsed publication date:%s' % self.pubdate) except: self.log.exception('Error parsing published date for url: %r' % self.url) mi = Metadata(self.title, self.authors) mi.set_identifier('biblionet', self.biblionetid) if self.series_index: try: mi.series_index = float(self.series_index) except: self.log.exception('Error loading series') if self.relevance: try: mi.source_relevance = self.relevance except: self.log.exception('Error loading relevance') if self.cover_url: try: mi.cover_url = self.cover_url except: self.log.exception('Error loading cover_url') if self.publisher: try: mi.publisher = self.publisher except: self.log.exception('Error loading publisher') if self.tags: try: mi.tags = self.tags except: self.log.exception('Error loading tags') if self.pubdate: try: if self.pubdate not in (self.yr_msg1, self.yr_msg2): d = datetime.date(int(self.pubdate), 1, 1) mi.pubdate = d except: self.log.exception('Error loading pubdate') self.plugin.clean_downloaded_metadata(mi) self.result_queue.put(mi)
def parse_details(self, root): try: yes24_id = self.parse_yes24_id(self.url) except: self.log.exception('Error parsing YES24 id for url: %r'%self.url) yes24_id = None try: (title, series, series_index) = self.parse_title_series(root) except: self.log.exception('Error parsing title and series for url: %r'%self.url) title = series = series_index = None try: authors = self.parse_authors(root) except: self.log.exception('Error parsing authors for url: %r'%self.url) authors = [] if not title or not authors or not yes24_id: self.log.error('Could not find title/authors/YES24 id for %r'%self.url) self.log.error('YES24: %r Title: %r Authors: %r'%(yes24_id, title, authors)) return mi = Metadata(title, authors) if series: mi.series = series mi.series_index = series_index mi.set_identifier('yes24', yes24_id) self.yes24_id = yes24_id try: isbn = self.parse_isbn(root) if isbn: self.isbn = mi.isbn = isbn except: self.log.exception('Error parsing ISBN for url: %r'%self.url) try: mi.comments = self.parse_comments(root) except: self.log.exception('Error parsing comments for url: %r'%self.url) try: self.cover_url = self.parse_cover(root) except: self.log.exception('Error parsing cover for url: %r'%self.url) mi.has_cover = bool(self.cover_url) mi.cover_url = self.cover_url # This is purely so we can run a test for it!!! try: mi.publisher = self.parse_publisher(root) except: self.log.exception('Error parsing publisher for url: %r'%self.url) try: mi.pubdate = self.parse_published_date(root) except: self.log.exception('Error parsing published date for url: %r'%self.url) mi.language = 'ko' mi.source_relevance = self.relevance if self.yes24_id: if self.isbn: self.plugin.cache_isbn_to_identifier(self.isbn, self.yes24_id) self.plugin.clean_downloaded_metadata(mi) self.result_queue.put(mi)
def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30): self.load_config() # get identifying tags from book idn = identifiers.get('dnb-idn', None) isbn = check_isbn(identifiers.get('isbn', None)) # ignore unknown author placeholders if authors is not None: authors = [a for a in authors if a not in ("V. A.", "V.A.", "Unknown", "Unbekannt")] if not authors: authors = None if (isbn is None) and (idn is None) and (title is None) and (authors is None): log.info( "This plugin requires at least either ISBN, IDN, Title or Author(s)." ) return None queries = [] # DNB does not do an exact search when searching for an idn or isbn, so we have to filter the results exact_search = {} if idn is not None: queries.append('num=' + idn) exact_search['idn'] = idn else: authors_v = [] title_v = [] if authors is not None: authors_v.append(' '.join(authors)) authors_v.append(' '.join( self.get_author_tokens(authors, only_first_author=False))) authors_v.append(' '.join( self.get_author_tokens(authors, only_first_author=True))) if title is not None: title_v.append(title) title_v.append(' '.join( self.get_title_tokens(title, strip_joiners=False, strip_subtitle=False))) title_v.append(' '.join( self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True))) if isbn is not None: exact_search['isbn'] = isbn # title and author if authors is not None and title is not None: for a in authors_v: for t in title_v: if isbn is not None: queries.append('tit="' + t + '" AND per="' + a + '" AND num="' + isbn + '"') else: queries.append('tit="' + t + '" AND per="' + a + '"') # try with author and title swapped if isbn is not None: queries.append('per="' + title + '" AND tit="' + authors[0] + '" AND num="' + isbn + '"') else: queries.append('per="' + title + '" AND tit="' + authors[0] + '"') # author but no title elif authors is not None and title is None: for i in authors_v: if isbn is not None: queries.append('per="' + i + '" AND num="' + isbn + '"') else: queries.append('per="' + i + '"') # try with author as title if isbn is not None: queries.append('tit="' + authors[0] + '" AND num="' + isbn + '"') else: queries.append('tit="' + authors[0] + '"') # title but no author elif authors is None and title is not None: for i in title_v: if isbn is not None: queries.append('tit="' + i + '" AND num="' + isbn + '"') else: queries.append('tit="' + i + '"') # try with title as author if isbn is not None: queries.append('per="' + title + '" AND num="' + isbn + '"') else: queries.append('per="' + title + '"') # as last resort only use isbn if isbn is not None: queries.append('num=' + isbn) # Sort queries descending by length (assumption: longer query -> less but better results) #queries.sort(key=len) #queries.reverse() # remove duplicate queries uniqueQueries = [] for i in queries: if i not in uniqueQueries: uniqueQueries.append(i) # Process queries results = None for query in uniqueQueries: query = query + ' NOT (mat=film OR mat=music OR mat=microfiches)' log.info(query) if self.cfg_dnb_token is None: results = self.getSearchResultsByScraping(log, query, timeout) else: results = self.getSearchResults(log, query, timeout) if results is None: continue log.info("Parsing records") ns = {'marc21': 'http://www.loc.gov/MARC21/slim'} for record in results: series = None series_index = None publisher = None pubdate = None languages = [] title = None title_sort = None edition = None comments = None idn = None urn = None isbn = None ddc = [] subjects_gnd = [] subjects_non_gnd = [] # Title: Field 245 title_parts = [] # if 
# (Continuation of the DNB plugin's per-record parsing loop.)
# If subfields a, n and p exist: series = a, series_index = n, title = p
for i in record.xpath(
        ".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]"
        "/../marc21:subfield[@code='n' and string-length(text())>0]"
        "/../marc21:subfield[@code='p' and string-length(text())>0]/..",
        namespaces=ns):
    series_index = i.xpath(".//marc21:subfield[@code='n']", namespaces=ns)[0].text.strip()
    match = re.search(r"(\d+[,\.\d+]?)", series_index)
    if match:
        series_index = match.group(1)
    else:
        # Sometimes DNB does not know the series index and uses something like "[...]"
        series_index = "0"
    series_index = series_index.replace(',', '.')
    series = i.xpath(".//marc21:subfield[@code='a']", namespaces=ns)[0].text.strip()
    title_parts.append(i.xpath(".//marc21:subfield[@code='p']", namespaces=ns)[0].text.strip())
    log.info("Extracted Series: %s" % series)
    log.info("Extracted Series Index: %s" % series_index)
    break

# Otherwise: title = a
if len(title_parts) == 0:
    for i in record.xpath(
            ".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]",
            namespaces=ns):
        title_parts.append(i.text.strip())
        break

# Subtitle 1
for i in record.xpath(
        ".//marc21:datafield[@tag='245']/marc21:subfield[@code='b' and string-length(text())>0]",
        namespaces=ns):
    title_parts.append(i.text.strip())
    break

# Subtitle 2 (disabled)
# for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='c' and string-length(text())>0]", namespaces=ns):
#     title = title + " / " + i.text.strip()
#     break

title = " : ".join(title_parts)
log.info("Extracted Title: %s" % title)

# Title_Sort: DNB wraps non-sorting prefixes in chr(152)/chr(156) markers;
# move such a prefix to the end of the first title part.
title_sort_parts = list(title_parts)
title_sort_regex = re.match(
    '^(.*?)(' + chr(152) + '.*' + chr(156) + ')?(.*?)$', title_parts[0])
sortword = title_sort_regex.group(2)
if sortword:
    title_sort_parts[0] = ''.join(filter(None, [
        title_sort_regex.group(1).strip(),
        title_sort_regex.group(3).strip(),
        ", " + sortword
    ]))
title_sort = " : ".join(title_sort_parts)
log.info("Extracted Title_Sort: %s" % title_sort)

# Authors
authors = []
author_sort = None
for i in record.xpath(
        ".//marc21:datafield[@tag='100']/marc21:subfield[@code='4' and text()='aut']"
        "/../marc21:subfield[@code='a' and string-length(text())>0]",
        namespaces=ns):  # primary authors
    name = re.sub(r" \[.*\]$", "", i.text.strip())
    authors.append(name)
for i in record.xpath(
        ".//marc21:datafield[@tag='700']/marc21:subfield[@code='4' and text()='aut']"
        "/../marc21:subfield[@code='a' and string-length(text())>0]",
        namespaces=ns):  # secondary authors
    name = re.sub(r" \[.*\]$", "", i.text.strip())
    authors.append(name)
if len(authors) == 0:
    # If no "real" author was found, take all persons involved
    for i in record.xpath(
            ".//marc21:datafield[@tag='700']/marc21:subfield[@code='a' and string-length(text())>0]",
            namespaces=ns):
        name = re.sub(r" \[.*\]$", "", i.text.strip())
        authors.append(name)
if len(authors) > 0:
    author_sort = authors[0]
log.info("Extracted Authors: %s" % " & ".join(authors))

# Comments
for i in record.xpath(
        ".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]",
        namespaces=ns):
    if i.text.startswith("http://deposit.dnb.de/"):
        br = self.browser
        log.info('Downloading Comments from: %s' % i.text)
        try:
            comments = br.open_novisit(i.text, timeout=30).read()
            comments = sanitize_comments_html(comments)
            log.info('Comments: %s' % comments)
            break
        except Exception:
            log.info("Could not download Comments from %s" % i.text)

# Publisher Name and Location
publisher_name = None
publisher_location = None
fields = record.xpath(
    ".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]"
    "/../marc21:subfield[@code='a' and string-length(text())>0]/..",
    namespaces=ns)
if len(fields) > 0:
    publisher_name = fields[0].xpath(
        ".//marc21:subfield[@code='b' and string-length(text())>0]",
        namespaces=ns)[0].text.strip()
    publisher_location = fields[0].xpath(
        ".//marc21:subfield[@code='a' and string-length(text())>0]",
        namespaces=ns)[0].text.strip()
else:
    fields = record.xpath(
        ".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../..",
        namespaces=ns)
    if len(fields) > 0:
        publisher_name = fields[0].xpath(
            ".//marc21:subfield[@code='b' and string-length(text())>0]",
            namespaces=ns)[0].text.strip()
    else:
        fields = record.xpath(
            ".//marc21:datafield[@tag='264']/marc21:subfield[@code='a' and string-length(text())>0]/../..",
            namespaces=ns)
        if len(fields) > 0:
            publisher_location = fields[0].xpath(
                ".//marc21:subfield[@code='a' and string-length(text())>0]",
                namespaces=ns)[0].text.strip()
log.info("Extracted Publisher: %s" % publisher_name)
log.info("Extracted Publisher Location: %s" % publisher_location)

# Publishing Date
for i in record.xpath(
        ".//marc21:datafield[@tag='264']/marc21:subfield[@code='c' and string-length(text())>=4]",
        namespaces=ns):
    match = re.search(r"(\d{4})", i.text.strip())
    if match is not None:
        year = match.group(1)
        pubdate = datetime.datetime(int(year), 1, 2)
        break
log.info("Extracted Publication Year: %s" % pubdate)

# ID: IDN
for i in record.xpath(
        ".//marc21:datafield[@tag='016']/marc21:subfield[@code='a' and string-length(text())>0]",
        namespaces=ns):
    idn = i.text.strip()
    break
log.info("Extracted ID IDN: %s" % idn)
if "idn" in exact_search:
    if idn != exact_search["idn"]:
        log.info("Extracted IDN does not match book's IDN, skipping record")
        continue

# ID: URN
for i in record.xpath(
        ".//marc21:datafield[@tag='024']/marc21:subfield[@code='2' and text()='urn']"
        "/../marc21:subfield[@code='a' and string-length(text())>0]",
        namespaces=ns):
    urn = i.text.strip()
    break
log.info("Extracted ID URN: %s" % urn)

# ID: ISBN
for i in record.xpath(
        ".//marc21:datafield[@tag='020']/marc21:subfield[@code='a' and string-length(text())>0]",
        namespaces=ns):
    isbn_regex = (r"(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})"
                  r"(?:97[89][- ]?)?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]")
    match = re.search(isbn_regex, i.text.strip())
    if match is not None:  # field 020$a may carry other data (e.g. price) without an ISBN
        isbn = match.group()
        isbn = isbn.replace('-', '')
    break
log.info("Extracted ID ISBN: %s" % isbn)
if "isbn" in exact_search:
    if isbn != exact_search["isbn"]:
        log.info("Extracted ISBN does not match book's ISBN, skipping record")
        continue

# ID: Sachgruppe (DDC)
for i in record.xpath(
        ".//marc21:datafield[@tag='082']/marc21:subfield[@code='a' and string-length(text())>0]",
        namespaces=ns):
    ddc.append(i.text.strip())
log.info("Extracted ID DDC: %s" % ",".join(ddc))

# Series and Series_Index from field 830, if field 245 did not provide them
if series is None and series_index is None:
    for i in record.xpath(
            ".//marc21:datafield[@tag='830']/marc21:subfield[@code='v' and string-length(text())>0]"
            "/../marc21:subfield[@code='a' and string-length(text())>0]/..",
            namespaces=ns):
        # Series Index
        series_index = i.xpath(".//marc21:subfield[@code='v']", namespaces=ns)[0].text.strip()
        match = re.search(r"(\d+[,\.\d+]?)", series_index)
        if match is not None:
            series_index = match.group(1)
        else:
            series_index = "0"
        series_index = series_index.replace(',', '.')
        log.info("Extracted Series Index: %s" % series_index)
        # Series
        series = i.xpath(".//marc21:subfield[@code='a']", namespaces=ns)[0].text.strip()
        log.info("Extracted Series: %s" % series)
        break

# Try to extract Series, Series Index and Title from the fetched title.
# Caution: This overwrites DNB's series/series_index and modifies the title!
if self.cfg_guess_series is True:
    guessed_series = None
    guessed_series_index = None
    parts = re.split("[:]", self.removeSortingCharacters(title))
    if len(parts) == 2:
        # Exactly one of the two parts must contain a digit
        if bool(re.search(r"\d", parts[0])) != bool(re.search(r"\d", parts[1])):
            # Figure out which part contains the index
            if bool(re.search(r"\d", parts[0])):
                indexpart = parts[0]
                textpart = parts[1]
            else:
                indexpart = parts[1]
                textpart = parts[0]
            # Remove odd characters from start and end of the text part
            match = re.match(r"^[\s\-–:]*(.+?)[\s\-–:]*$", textpart)
            if match:
                textpart = match.group(1)
            # From title parts like: "Name of the series - Episode 2"
            match = re.match(
                r"^\s*(\S.*?)[\(\/\.,\s\-–:]*"
                r"(?:Nr\.|Episode|Bd\.|Sammelband|[Bb]and|Part|Teil|Folge)"
                r"[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", indexpart)
            if match:
                guessed_series_index = match.group(2)
                guessed_series = match.group(1)
                if guessed_series is None:
                    guessed_series = textpart
                    title = textpart + " : Band " + guessed_series_index
                else:
                    title = textpart
            else:
                # From title parts like: "Episode 2 Name of the series"
                match = re.match(
                    r"^\s*(?:Nr\.|Episode|Bd\.|Sammelband|[Bb]and|Part|Teil|Folge)"
                    r"[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*(\S.*?)[\/\.,\-–\s]*$", indexpart)
                if match:
                    guessed_series_index = match.group(1)
                    guessed_series = match.group(2)
                    if guessed_series is None:
                        guessed_series = textpart
                        title = textpart + " : Band " + guessed_series_index
                    else:
                        title = textpart
    elif len(parts) == 1:
        # From titles like: "Name of the series - Title (Episode 2)"
        match = re.match(
            r"^\s*(\S.+?) \- (\S.+?)[\(\/\.,\s\-–:]"
            r"(?:Nr\.|Episode|Bd\.|Sammelband|[Bb]and|Part|Teil|Folge)"
            r"[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", parts[0])
        if match:
            guessed_series_index = match.group(3)
            guessed_series = match.group(1)
            title = match.group(2)
        else:
            # From titles like: "Name of the series - Episode 2"
            match = re.match(
                r"^\s*(\S.+?)[\(\/\.,\s\-–:]*"
                r"(?:Nr\.|Episode|Bd\.|Sammelband|[Bb]and|Part|Teil|Folge)"
                r"[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", parts[0])
            if match:
                guessed_series_index = match.group(2)
                guessed_series = match.group(1)
                title = guessed_series + " : Band " + guessed_series_index
    if guessed_series is not None and guessed_series_index is not None:
        series = guessed_series
        series_index = guessed_series_index
        log.info("Guessed Series: %s" % series)
        log.info("Guessed Series Index: %s" % series_index)

# GND Subjects from 689
for i in record.xpath(
        ".//marc21:datafield[@tag='689']/marc21:subfield[@code='a' and string-length(text())>0]",
        namespaces=ns):
    subjects_gnd.append(i.text.strip())
# GND Subjects from 600-655
for f in range(600, 656):
    for i in record.xpath(
            ".//marc21:datafield[@tag='" + str(f) + "']/marc21:subfield[@code='2' and text()='gnd']"
            "/../marc21:subfield[@code='a' and string-length(text())>0]",
            namespaces=ns):
        if i.text.startswith("("):
            continue
        subjects_gnd.append(i.text)
log.info("Extracted GND Subjects: %s" % " ".join(subjects_gnd))

# Non-GND subjects from 600-655
for f in range(600, 656):
    for i in record.xpath(
            ".//marc21:datafield[@tag='" + str(f) + "']/marc21:subfield[@code='a' and string-length(text())>0]",
            namespaces=ns):
        # Ignore entries starting with "("
        if i.text.startswith("("):
            continue
        subjects_non_gnd.extend(re.split(',|;', i.text))
# Remove one-character subjects (a comprehension, since removing items from a
# list while iterating over it skips elements)
subjects_non_gnd = [x for x in subjects_non_gnd if len(x) >= 2]
log.info("Extracted non-GND Subjects: %s" % " ".join(subjects_non_gnd))

# Edition
for i in record.xpath(
        ".//marc21:datafield[@tag='250']/marc21:subfield[@code='a' and string-length(text())>0]",
        namespaces=ns):
    edition = i.text.strip()
    break
log.info("Extracted Edition: %s" % edition)

# Languages
for i in record.xpath(
        ".//marc21:datafield[@tag='041']/marc21:subfield[@code='a' and string-length(text())>0]",
        namespaces=ns):
    languages.append(i.text.strip())
if languages:
    log.info("Extracted Languages: %s" % ",".join(languages))

# Put it all together
if self.cfg_append_edition_to_title is True and edition is not None:
    title = title + " : " + edition

mi = Metadata(
    self.removeSortingCharacters(title),
    [self.removeSortingCharacters(x) for x in authors])
mi.title_sort = self.removeSortingCharacters(title_sort)
mi.author_sort = self.removeSortingCharacters(author_sort)
mi.languages = languages
mi.pubdate = pubdate
mi.publisher = " : ".join(filter(None, [
    publisher_location,
    self.removeSortingCharacters(publisher_name)
]))
mi.series = self.removeSortingCharacters(series)
mi.series_index = series_index
mi.comments = comments
mi.isbn = isbn  # also required for cover download
mi.set_identifier('urn', urn)
mi.set_identifier('dnb-idn', idn)
mi.set_identifier('ddc', ",".join(ddc))

# Which subject set ends up in the tags is configurable
if self.cfg_fetch_subjects == 0:
    mi.tags = self.uniq(subjects_gnd)
elif self.cfg_fetch_subjects == 1:
    if len(subjects_gnd) > 0:
        mi.tags = self.uniq(subjects_gnd)
    else:
        mi.tags = self.uniq(subjects_non_gnd)
elif self.cfg_fetch_subjects == 2:
    mi.tags = self.uniq(subjects_gnd + subjects_non_gnd)
elif self.cfg_fetch_subjects == 3:
    if len(subjects_non_gnd) > 0:
        mi.tags = self.uniq(subjects_non_gnd)
    else:
        mi.tags = self.uniq(subjects_gnd)
elif self.cfg_fetch_subjects == 4:
    mi.tags = self.uniq(subjects_non_gnd)
elif self.cfg_fetch_subjects == 5:
    mi.tags = []

# Put the current result's metadata into the result queue
log.info("Final formatted result: %s" % mi)
result_queue.put(mi)
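# The series-guessing heuristic above hinges on one pattern family: a keyword
# ("Nr.", "Episode", "Bd.", "Band", "Teil", "Folge", ...) next to a number marks
# the index part, and the remaining text is taken as the series name. Below is a
# minimal, self-contained sketch of that idea; guess_series and its reduced
# pattern are illustrative only, not part of the plugin.
import re

SERIES_PAT = re.compile(
    r"^\s*(\S.*?)[\(\/\.,\s\-–:]*"
    r"(?:Nr\.|Episode|Bd\.|Sammelband|[Bb]and|Part|Teil|Folge)"
    r"[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$")

def guess_series(title):
    """Split 'Series - Band 2 : Title'-style strings into
    (series, series_index, title); returns (None, None, title) on no match."""
    parts = title.split(":")
    if len(parts) == 2:
        # Exactly one of the two parts should contain a digit; that part
        # carries the series index, the other one the actual title.
        digits = [bool(re.search(r"\d", p)) for p in parts]
        if digits[0] != digits[1]:
            indexpart, textpart = (parts[0], parts[1]) if digits[0] else (parts[1], parts[0])
            m = SERIES_PAT.match(indexpart)
            if m:
                return m.group(1), m.group(2), textpart.strip()
    return None, None, title.strip()

print(guess_series("Die Zwerge - Band 2 : Der Krieg der Zwerge"))
# -> ('Die Zwerge', '2', 'Der Krieg der Zwerge')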
def parse(self, raw):
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import parse_only_date, UNDEFINED_DATE
    from css_selectors import Select
    root = parse_html(raw)
    selector = Select(root)
    sku = next(selector('div.sku.attGroup'))
    info = sku.getparent()
    top = info.getparent().getparent()
    banner = top.find('div')
    spans = banner.findall('span')
    title = ''
    for i, span in enumerate(spans):
        if i == 0 or '12pt' in span.get('style', ''):
            title += astext(span)
        else:
            break
    authors = [re.sub(r'\(.*\)', '', x).strip() for x in astext(spans[-1]).split(',')]
    mi = Metadata(title.strip(), authors)

    # Identifiers
    isbns = [check_isbn(x.strip()) for x in astext(sku).split(',')]
    for isbn in isbns:
        if isbn:
            self.plugin.cache_isbn_to_identifier(isbn, self.sku)
    isbns = sorted(isbns, key=lambda x: len(x) if x else 0, reverse=True)
    if isbns and isbns[0]:
        mi.isbn = isbns[0]
    mi.set_identifier('edelweiss', self.sku)

    # Tags
    bisac = tuple(selector('div.bisac.attGroup'))
    if bisac:
        bisac = astext(bisac[0])
        mi.tags = [x.strip() for x in bisac.split(',')]
        mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]

    # Publisher
    pub = tuple(selector('div.supplier.attGroup'))
    if pub:
        pub = astext(pub[0])
        mi.publisher = pub

    # Pubdate
    pub = tuple(selector('div.shipDate.attGroupItem'))
    if pub:
        pub = astext(pub[0])
        parts = pub.partition(':')[0::2]
        pub = parts[1] or parts[0]
        try:
            if ', Ship Date:' in pub:
                pub = pub.partition(', Ship Date:')[0]
            q = parse_only_date(pub, assume_utc=True)
            if q.year != UNDEFINED_DATE.year:
                mi.pubdate = q
        except Exception:
            self.log.exception('Error parsing published date: %r' % pub)

    # Comments
    comm = ''
    general = tuple(selector('div#pd-general-overview-content'))
    if general:
        q = self.render_comments(general[0])
        if q != '<p>No title summary available. </p>':
            comm += q
    general = tuple(selector('div#pd-general-contributor-content'))
    if general:
        comm += self.render_comments(general[0])
    general = tuple(selector('div#pd-general-quotes-content'))
    if general:
        comm += self.render_comments(general[0])
    if comm:
        mi.comments = comm

    # Cover
    img = tuple(selector('img.title-image[src]'))
    if img:
        href = img[0].get('src').replace('jacket_covers/medium/', 'jacket_covers/flyout/')
        self.plugin.cache_identifier_to_cover_url(self.sku, href)
    mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None

    return mi
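# One detail worth noting in parse() above: Edelweiss may list both the ISBN-10
# and the ISBN-13 of a title in the sku element, and the code sorts the
# validated ISBNs longest-first so that the 13-digit form wins. A sketch of just
# that selection step; the sample SKU text is invented, and check_isbn is
# assumed to be calibre's validator returning the normalized ISBN or None.
from calibre.ebooks.metadata import check_isbn

raw = '0306406152, 9780306406157'  # made-up example content of div.sku.attGroup
isbns = [check_isbn(x.strip()) for x in raw.split(',')]
# Invalid entries become None and sort last; the ISBN-13 sorts before the ISBN-10.
isbns = sorted(isbns, key=lambda x: len(x) if x else 0, reverse=True)
print(isbns[0])  # '9780306406157'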
def parse_details(self, root):
    try:
        CBDB_id = self.parse_CBDB_id(self.url)
    except Exception:
        self.log.exception('Error parsing CBDB id for url: %r' % self.url)
        CBDB_id = None

    try:
        (title, series, series_index) = self.parse_title_series(root)
    except Exception:
        self.log.exception('Error parsing title and series for url: %r' % self.url)
        title = series = series_index = None

    try:
        authors = self.parse_authors(root)
    except Exception:
        self.log.exception('Error parsing authors for url: %r' % self.url)
        authors = []

    if not title or not authors or not CBDB_id:
        self.log.error('Could not find title/authors/CBDB id for %r' % self.url)
        self.log.error('CBDB: %r Title: %r Authors: %r' % (CBDB_id, title, authors))
        return

    mi = Metadata(title, authors)
    if series:
        mi.series = series
        mi.series_index = series_index
    # mi.identifiers['cbdb'] = CBDB_id
    mi.set_identifier('cbdb', CBDB_id)
    self.CBDB_id = CBDB_id

    try:
        mi.rating = self.parse_rating(root)
    except Exception:
        self.log.exception('Error parsing ratings for url: %r' % self.url)

    # Summary
    try:
        mi.comments = self.parse_comments(root)
    except Exception:
        self.log.exception('Error parsing comments for url: %r' % self.url)

    try:
        self.cover_urls = self.parse_covers(root)
    except Exception:
        self.log.exception('Error parsing cover for url: %r' % self.url)
    mi.has_cover = bool(self.cover_urls)

    try:
        tags = self.parse_tags(root)
        if tags:
            mi.tags = tags
    except Exception:
        self.log.exception('Error parsing tags for url: %r' % self.url)

    try:
        mi.publisher, mi.pubdate, isbn = self.parse_editions(root)
        if isbn:
            self.isbn = mi.isbn = isbn
    except Exception:
        self.log.exception('Error parsing publisher and date for url: %r' % self.url)

    mi.source_relevance = self.relevance
    mi.language = 'Czech'

    if self.CBDB_id:
        if self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.CBDB_id)
        if self.cover_urls:
            self.plugin.cache_identifier_to_cover_url(self.CBDB_id, self.cover_urls)

    self.plugin.clean_downloaded_metadata(mi)
    self.result_queue.put(mi)
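# parse_details() registers the CBDB id via Metadata.set_identifier rather than
# assigning into mi.identifiers directly (the commented-out variant it kept
# above). set_identifier cleans and normalizes the type/value pair, which a
# direct dict assignment would bypass. A minimal sketch, assuming calibre's
# Metadata API; the id value is invented.
from calibre.ebooks.metadata.book.base import Metadata

mi = Metadata('Title', ['Author'])
mi.set_identifier('cbdb', '12345')  # hypothetical id
print(mi.get_identifiers())  # -> {'cbdb': '12345'}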