def test_markdown(self):
    from calibre.ebooks.txt.processor import create_markdown_object
    from calibre.ebooks.conversion.plugins.txt_input import MD_EXTENSIONS
    create_markdown_object(sorted(MD_EXTENSIONS))
    from calibre.library.comments import sanitize_comments_html
    sanitize_comments_html(
        b'''<script>moo</script>xxx<img src="http://moo.com/x.jpg">''')
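
# A minimal, hypothetical sketch of what the smoke test above exercises.
# The helper name below is invented for illustration; the assumed (not
# verified) behavior is that sanitize_comments_html drops unsafe markup
# such as <script> while keeping ordinary text, so the result is printed
# rather than asserted.
def _sanitize_smoke_check():
    from calibre.library.comments import sanitize_comments_html
    cleaned = sanitize_comments_html(
        b'<script>moo</script>xxx<img src="http://moo.com/x.jpg">')
    print(cleaned)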
def parse_comments(self, root):
    description_nodes = root.xpath(
        "//*[preceding-sibling::comment()[. = ' *** s:%s *** '] and "
        "following-sibling::comment()[. = ' *** //e:%s *** ']]"
        % ('책소개', '책소개'))
    default_append_toc = cfg.DEFAULT_STORE_VALUES[cfg.KEY_APPEND_TOC]
    append_toc = cfg.plugin_prefs[cfg.STORE_NAME].get(
        cfg.KEY_APPEND_TOC, default_append_toc)
    comments = ''
    if description_nodes:
        for description_node in description_nodes:
            comments += tostring(description_node, method='html',
                                 encoding=str).strip()
        # collapse runs of double spaces left over from the markup
        while comments.find('  ') >= 0:
            comments = comments.replace('  ', ' ')
        comments = sanitize_comments_html(comments)
    if append_toc:
        toc_node = root.xpath(
            '//div[@class="box_detail_content"]'
            '/h2[@class="title_detail_basic" and contains(text(),"%s")]'
            '/following-sibling::div' % "목차")
        if toc_node:
            toc = tostring(toc_node[0], method='html')
            toc = sanitize_comments_html(toc)
            comments += '<h3>[목차]</h3><div id="toc">' + toc + "</div>"
    if comments:
        comments += "<hr />" + '<div><div style="float:right">[kyobobook]</div></div>'
    return comments
def _render_comments(self, desc):
    from calibre.library.comments import sanitize_comments_html
    for c in desc.xpath('descendant::noscript'):
        c.getparent().remove(c)
    for c in desc.xpath('descendant::*[@class="seeAll" or'
                        ' @class="emptyClear" or @id="collapsePS" or'
                        ' @id="expandPS"]'):
        c.getparent().remove(c)
    for a in desc.xpath('descendant::a[@href]'):
        del a.attrib['href']
        a.tag = 'span'
    desc = self.tostring(desc, method='html', encoding=unicode).strip()
    # Encoding bug in Amazon data U+fffd (replacement char)
    # in some examples it is present in place of '
    desc = desc.replace('\ufffd', "'")
    # remove all attributes from tags
    desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
    # Collapse whitespace
    # desc = re.sub('\n+', '\n', desc)
    # desc = re.sub(' +', ' ', desc)
    # Remove the notice about text referring to out of print editions
    desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
    # Remove comments
    desc = re.sub(r'(?s)<!--.*?-->', '', desc)
    return sanitize_comments_html(desc)
def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
    from html5_parser import parse
    from lxml import html
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.library.comments import sanitize_comments_html

    try:
        raw = br.open_novisit(metadata_url).read()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and \
                e.getcode() == 404:
            return False
        raise
    raw = xml_to_unicode(raw, strip_encoding_pats=True,
                         resolve_entities=True)[0]

    try:
        root = parse(raw, maybe_xhtml=False, sanitize_names=True)
    except Exception:
        return False

    pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
    lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
    subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
    ebook_isbn = root.xpath("//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
    desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]")

    if pub_date:
        from calibre.utils.date import parse_date
        try:
            mi.pubdate = parse_date(pub_date[0].strip())
        except:
            pass
    if lang:
        lang = lang[0].strip().lower()
        lang = {'english': 'eng', 'french': 'fra', 'german': 'deu',
                'spanish': 'spa'}.get(lang, None)
        if lang:
            mi.language = lang

    if ebook_isbn:
        # print "ebook isbn is "+str(ebook_isbn[0])
        isbn = check_isbn(ebook_isbn[0].strip())
        if isbn:
            self.cache_isbn_to_identifier(isbn, ovrdrv_id)
            mi.isbn = isbn
    if subjects:
        mi.tags = [tag.strip() for tag in subjects[0].split(',')]

    if desc:
        desc = desc[0]
        desc = html.tostring(desc, method='html', encoding='unicode').strip()
        # remove all attributes from tags
        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
        # Remove comments
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        mi.comments = sanitize_comments_html(desc)

    return None
def parse_comments(self, root):
    description_node = root.xpath('//div[@class="ugc nonTruncatedSum"]/p')
    if description_node:
        desc = description_node[0]
        comments = tostring(desc, method='html', encoding=unicode).strip()
        while comments.find('  ') >= 0:
            comments = comments.replace('  ', ' ')
        comments = sanitize_comments_html(comments)
        return comments
def parse_comments(self, root):
    # Look for description in a second span that gets expanded when
    # interactively displayed [@id="display:none"]
    description_node = root.xpath('//div[@id="annotation"]')
    if description_node:
        desc = description_node[0].text_content().strip()
        comments = sanitize_comments_html(desc)
        while comments.find('  ') >= 0:
            comments = comments.replace('  ', ' ')
        return comments
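
# The double-space collapse loop recurs in several parse_comments variants
# above and below. A single regex pass is an equivalent, loop-free sketch
# (hypothetical helper, not part of any plugin):
import re

def collapse_spaces(text):
    # replace any run of two or more spaces with a single space
    return re.sub(r' {2,}', ' ', text)

# e.g. collapse_spaces('a  b   c') == 'a b c'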
def parse_comments(self, root):
    # Look for description in a second span that gets expanded when
    # interactively displayed [@id="display:none"]
    description_node = root.xpath('//div[@id="metacol"]/div[@id="description"]/span')
    if description_node:
        desc = description_node[0] if len(description_node) == 1 else description_node[1]
        less_link = desc.xpath('a[@class="actionLinkLite"]')
        if less_link is not None and len(less_link):
            desc.remove(less_link[0])
        comments = tostring(desc, method="html", encoding=unicode).strip()
        while comments.find("  ") >= 0:
            comments = comments.replace("  ", " ")
        comments = sanitize_comments_html(comments)
        return comments
def parse_comments(self, root):
    # Look for description in a second span that gets expanded when
    # interactively displayed [@id="display:none"]
    description_node = root.xpath('//div[@id="bookIntroContent"]')
    if description_node:
        desc = description_node[0] if len(description_node) == 1 else description_node[1]
        less_link = desc.xpath('div[@class="section_open more_btn_t2"]')
        if less_link is not None and len(less_link):
            desc.remove(less_link[0])
        comments = tostring(desc, method='html', encoding=unicode).strip()
        while comments.find('  ') >= 0:
            comments = comments.replace('  ', ' ')
        comments = sanitize_comments_html(comments)
        return comments
def parse_comments(self, root):
    description_nodes = root.xpath(
        "//*[preceding-sibling::comment()[. = ' *** s:%s *** '] and "
        "following-sibling::comment()[. = ' *** //e:%s *** ']]"
        % (u'책소개', u'책소개'))
    default_append_toc = cfg.DEFAULT_STORE_VALUES[cfg.KEY_APPEND_TOC]
    append_toc = cfg.plugin_prefs[cfg.STORE_NAME].get(cfg.KEY_APPEND_TOC,
                                                      default_append_toc)
    comments = ''
    if description_nodes:
        for description_node in description_nodes:
            comments += tostring(description_node, method='html',
                                 encoding=unicode).strip()
        while comments.find('  ') >= 0:
            comments = comments.replace('  ', ' ')
        comments = sanitize_comments_html(comments)
    if append_toc:
        toc_node = root.xpath(
            '//div[@class="box_detail_content"]'
            '/h2[@class="title_detail_basic" and contains(text(),"%s")]'
            '/following-sibling::div' % u"목차")
        if toc_node:
            toc = tostring(toc_node[0], method='html')
            toc = sanitize_comments_html(toc)
            comments += '<h3>[목차]</h3><div id="toc">' + toc + "</div>"
    if comments:
        comments += "<hr />" + '<div><div style="float:right">[kyobobook]</div></div>'
    return comments
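
# The kyobobook XPath above selects every element fenced between two HTML
# comment markers. A self-contained illustration with lxml (the markup
# below is hypothetical, the comment-fencing technique is the same):
from lxml import html as lhtml

doc = lhtml.fromstring(
    '<div><!-- *** s:책소개 *** --><p>intro</p><p>more</p>'
    '<!-- *** //e:책소개 *** --><p>after</p></div>')
between = doc.xpath(
    "//*[preceding-sibling::comment()[. = ' *** s:%s *** '] and "
    "following-sibling::comment()[. = ' *** //e:%s *** ']]" % ('책소개', '책소개'))
print([p.text for p in between])  # ['intro', 'more'] - 'after' is excluded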
def _render_comments(self, desc):
    from calibre.library.comments import sanitize_comments_html
    import html5lib
    # html5lib parsed noscript as CDATA
    desc = html5lib.parseFragment(
        '<div>%s</div>' % (self.totext(desc).replace('textarea', 'div')),
        treebuilder='lxml', namespaceHTMLElements=False)[0]
    matches = desc.xpath(
        'descendant::*[contains(text(), "内容提要") '
        'or contains(text(), "内容推荐") or contains(text(), "编辑推荐") '
        'or contains(text(), "内容简介") or contains(text(), "基本信息")]'
        '/../*[self::p or self::div or self::span]')
    if matches:
        if len(matches) > 1:
            desc = matches[-1]
        for item in matches:
            content_len = len(self.totext(item))
            if content_len > 50 and content_len < 200:
                desc = item
                break
    for c in desc.xpath('descendant::noscript'):
        c.getparent().remove(c)
    for c in desc.xpath('descendant::*[@class="seeAll" or'
                        ' @class="emptyClear" or @id="collapsePS" or'
                        ' @id="expandPS"]'):
        c.getparent().remove(c)
    # for a in desc.xpath('descendant::a[@href]'):
    #     del a.attrib['href']
    #     a.tag = 'span'
    desc = self.tostring(desc, method='text', encoding=unicode).strip()
    # return desc
    # Encoding bug in Amazon data U+fffd (replacement char)
    # in some examples it is present in place of '
    desc = desc.replace('\ufffd', "'")
    # remove all attributes from tags
    desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
    # Collapse whitespace
    desc = re.sub('\n+', '\n', desc)
    desc = re.sub(' +', ' ', desc)
    # Remove the notice about text referring to out of print editions
    desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
    # Remove comments
    desc = re.sub(r'(?s)<!--.*?-->', '', desc)
    return sanitize_comments_html(desc)
def parse_comments(self, root, mi):
    """Parse the description text into mi.comments."""
    try:
        comments_node = root.xpath("//p[@itemprop='description']/span/text()")
        if not comments_node:
            comments_node = root.xpath("//p[@itemprop='description']/text()")
        self.log.info(" Comments node: %s" % comments_node)
        if comments_node:
            mi.comments = self.comments = sanitize_comments_html(
                "".join(comments_node))
            self.log.info(" Parsed comments: %s" % mi.comments)
    except:
        self.log.exception("Error parsing comments for url: %r" % self.url)
def render_comments(self, desc):
    from lxml import etree
    from calibre.library.comments import sanitize_comments_html
    for c in desc.xpath('descendant::noscript'):
        c.getparent().remove(c)
    for a in desc.xpath('descendant::a[@href]'):
        del a.attrib['href']
        a.tag = 'span'
    desc = etree.tostring(desc, method='html', encoding=unicode).strip()
    # remove all attributes from tags
    desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
    # Collapse whitespace
    # desc = re.sub('\n+', '\n', desc)
    # desc = re.sub(' +', ' ', desc)
    # Remove comments
    desc = re.sub(r'(?s)<!--.*?-->', '', desc)
    return sanitize_comments_html(desc)
def render_comments(self, desc):
    from lxml import etree
    from calibre.library.comments import sanitize_comments_html
    for c in desc.xpath('descendant::noscript'):
        c.getparent().remove(c)
    for a in desc.xpath('descendant::a[@href]'):
        del a.attrib['href']
        a.tag = 'span'
    desc = etree.tostring(desc, method='html', encoding='unicode').strip()
    # remove all attributes from tags
    desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
    # Collapse whitespace
    # desc = re.sub('\n+', '\n', desc)
    # desc = re.sub(' +', ' ', desc)
    # Remove comments
    desc = re.sub(r'(?s)<!--.*?-->', '', desc)
    return sanitize_comments_html(desc)
def _render_comments(self, desc):
    # Generate the comments?
    from calibre.library.comments import sanitize_comments_html
    desc = self.tostring(desc, method='html', encoding=unicode).strip()
    # Encoding bug in 17k.com data U+fffd (replacement char)
    # in some examples it is present in place of '
    desc = desc.replace('\ufffd', "'")
    # remove all attributes from tags
    desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
    # Collapse whitespace
    # desc = re.sub('\n+', '\n', desc)
    # desc = re.sub(' +', ' ', desc)
    # Remove the notice about text referring to out of print editions
    desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
    # Remove comments
    desc = re.sub(r'(?s)<!--.*?-->', '', desc)
    return sanitize_comments_html(desc)
def parse_comments(self, root):
    description_nodes = root.xpath(
        '//div[@id="anotace"]/strong/following-sibling::p')
    if not description_nodes:
        description_nodes = root.xpath(
            '//div[@id="nic"]/strong/following-sibling::p')
    if description_nodes:
        comments = []
        for node in description_nodes:
            node_text = node.text_content()
            if node_text is not None:
                comments.append("<p>" + node_text + "</p>")
        # comments = tostring(description_node, method='html')
        comments = sanitize_comments_html("".join(comments))
        return comments
    else:
        self.log.info('No comment node was found.')
def parse_comments(root):
    '''
    Parse the comments and clean them up a little.
    Rewritten from the Goodreads script.
    '''
    # Look for description
    description_node = root.xpath('(//div[@class="product-page-block"]//p)[1]')
    if description_node:
        desc = description_node[0] if len(
            description_node) == 1 else description_node[1]
        less_link = desc.xpath('a[@class="actionLinkLite"]')
        if less_link is not None and len(less_link):
            desc.remove(less_link[0])
        comments = tostring(desc, method='html', encoding=unicode).strip()
        while comments.find('  ') >= 0:
            comments = comments.replace('  ', ' ')
        if "Fil størrelse:" in comments:
            comments = comments.replace(comments.split(".")[-1], "</p>")
        comments = sanitize_comments_html(comments)
        return comments
def get_sanitized_description(self):
    '''
    For calibre version so this code can be consolidated between
    fff_plugin.py and jobs.py
    '''
    orig = description = self.getMetadata("description")
    # logger.debug("description:%s"%description)
    if not description:
        description = ''
    else:
        if self.getConfig('keep_summary_html'):
            ## Handles desc with (supposed) html without html->MD
            ## text->html dance that sanitize_comments_html does.
            description = sanitize_html(description)
            # logger.debug("desc using sanitize_html")
        else:
            ## because of the html->MD text->html dance, text only
            ## (or MD/MD-like) descs come out better.
            description = sanitize_comments_html(description)
            # logger.debug("desc using sanitize_comments_html")
    # if orig != description:
    #     logger.debug("\nchanged description\n%s\n%s"%(orig,description))
    return description
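
# A hypothetical usage sketch of the choice encoded above, assuming the two
# calibre helpers behave as the docstring comments describe:
# sanitize_html() cleans markup in place, while sanitize_comments_html()
# round-trips HTML -> Markdown -> HTML (which tidies plain text but can
# flatten deliberate markup).
def choose_sanitizer(description, keep_summary_html):
    if keep_summary_html:
        # source already carries intentional HTML: clean it without the
        # round-trip so its structure survives
        return sanitize_html(description)
    # plain-text or Markdown-ish source: the round-trip normalizes it
    # into tidy paragraphs
    return sanitize_comments_html(description)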
def identify(self, log, result_queue, abort, title=None, authors=[], identifiers={}, timeout=30):
    self.load_config()

    if authors is None:
        authors = []

    # get identifying tags from book
    idn = identifiers.get('dnb-idn', None)
    isbn = check_isbn(identifiers.get('isbn', None))

    # ignore unknown authors
    ignored_authors = ["V. A.", "V.A.", "Unknown", "Unbekannt"]
    for i in ignored_authors:
        authors = [x for x in authors if x != i]

    if (isbn is None) and (idn is None) and (title is None) and (not authors):
        log.info("This plugin requires at least either ISBN, IDN, Title or Author(s).")
        return None

    queries = []
    # DNB does not do an exact search when searching for a idn or isbn, so we have to filter the results
    exact_search = {}
    if idn is not None:
        exact_search['idn'] = idn
        # in case of an IDN only search for the IDN and skip all the other stuff
        queries.append('num=' + idn)
    else:
        authors_v = []
        title_v = []

        # create some variants of given authors
        if authors != []:
            authors_v.append(' '.join(self.get_author_tokens(authors, only_first_author=False)))  # concat all author names ("Peter Meier Luise Stark")
            authors_v.append(' '.join(self.get_author_tokens(authors, only_first_author=True)))  # use only first author
            for a in authors:
                authors_v.append(a)  # use all authors, one by one

            # remove duplicates
            unique_authors_v = []
            for i in authors_v:
                if i not in unique_authors_v:
                    unique_authors_v.append(i)

        # create some variants of given title
        if title is not None:
            title_v.append(title)  # simply use given title
            title_v.append(' '.join(self.get_title_tokens(title, strip_joiners=False, strip_subtitle=False)))  # remove some punctuation characters
            title_v.append(' '.join(self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True)))  # remove subtitle (everything after " : ")
            title_v.append(' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=False)))  # remove some punctuation characters and joiners ("and", "&", ...)
            title_v.append(' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)))  # remove subtitle (everything after " : ") and joiners ("and", "&", ...)
            # TODO: remove subtitle after " - "

            # remove duplicates
            unique_title_v = []
            for i in title_v:
                if i not in unique_title_v:
                    unique_title_v.append(i)

        # title and author
        if authors_v != [] and title_v != []:
            for a in authors_v:
                for t in title_v:
                    if isbn is not None:
                        queries.append('tit="' + t + '" AND per="' + a + '" AND num="' + isbn + '"')
                    else:
                        queries.append('tit="' + t + '" AND per="' + a + '"')

            # try with first author as title and title (without subtitle) as author
            if isbn is not None:
                queries.append('per="' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '" AND tit="' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '" AND num="' + isbn + '"')
            else:
                queries.append('per="' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '" AND tit="' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '"')

            # try with author and title (without subtitle) in any index
            if isbn is not None:
                queries.append('"' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '" AND "' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '" AND num="' + isbn + '"')
            else:
                queries.append('"' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '" AND "' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '"')

        # author but no title
        elif authors_v != [] and title_v == []:
            for i in authors_v:
                if isbn is not None:
                    queries.append('per="' + i + '" AND num="' + isbn + '"')
                else:
                    queries.append('per="' + i + '"')

            # try with author as title
            if isbn is not None:
                queries.append('tit="' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '" AND num="' + isbn + '"')
            else:
                queries.append('tit="' + ' '.join(self.get_author_tokens(authors, only_first_author=True)) + '"')

        # title but no author
        elif authors_v == [] and title_v != []:
            for i in title_v:
                if isbn is not None:
                    queries.append('tit="' + i + '" AND num="' + isbn + '"')
                else:
                    queries.append('tit="' + i + '"')

            # try with title as author
            if isbn is not None:
                queries.append('per="' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '" AND num="' + isbn + '"')
            else:
                queries.append('per="' + ' '.join(self.get_title_tokens(title, strip_joiners=True, strip_subtitle=True)) + '"')

        # as last resort only use isbn
        if isbn is not None:
            queries.append('num=' + isbn)

    # remove duplicate queries
    uniqueQueries = []
    for i in queries:
        if i not in uniqueQueries:
            uniqueQueries.append(i)

    # Process queries
    results = None

    for query in uniqueQueries:
        # SRU does not work with "+" or "?" characters in query, so we simply remove them
        query = re.sub('[\+\?]', '', query)
        query = query + ' NOT (mat=film OR mat=music OR mat=microfiches OR cod=tt)'
        log.info(query)

        if self.cfg_dnb_token is None:
            results = self.getSearchResultsByScraping(log, query, timeout)
        else:
            results = self.getSearchResults(log, query, timeout)

        if results is None:
            continue

        log.info("Parsing records")

        ns = {'marc21': 'http://www.loc.gov/MARC21/slim'}

        for record in results:
            series = None
            series_index = None
            publisher = None
            pubdate = None
            languages = []
            title = None
            title_sort = None
            authors = []
            author_sort = None
            edition = None
            comments = None
            idn = None
            urn = None
            isbn = None
            ddc = []
            subjects_gnd = []
            subjects_non_gnd = []
            publisher_name = None
            publisher_location = None

            ##### Field 264 #####
            # Publisher Name and Location
            fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns)
            if len(fields) > 0:
                publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns)[0].text.strip()
                publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns)[0].text.strip()
            else:
                fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../..", namespaces=ns)
                if len(fields) > 0:
                    publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns)[0].text.strip()
                else:
                    fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='a' and string-length(text())>0]/../..", namespaces=ns)
                    if len(fields) > 0:
                        publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns)[0].text.strip()

            # Publishing Date
            for i in record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='c' and string-length(text())>=4]", namespaces=ns):
                match = re.search("(\d{4})", i.text.strip())
                if match is not None:
                    year = match.group(1)
                    pubdate = datetime.datetime(int(year), 1, 1, 12, 30, 0)
                    break

            # Log
            if publisher_name is not None:
                log.info("Extracted Publisher: %s" % publisher_name)
            if publisher_location is not None:
                log.info("Extracted Publisher Location: %s" % publisher_location)
            if pubdate is not None:
                log.info("Extracted Publication Year: %s" % pubdate)

            ##### Field 245 #####
            # Title/Series/Series_Index
            title_parts = []
            for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns):
                # if a,n,p,n,p,n,p exist: series = a + n0 + " - " + p0 + n1 + " - " + p1, series_index = n2, title = p2
                # if a,n,p,n,p exist: series = a + n0 + " - " + p0, series_index = n1, title = p1 (Example: dnb-id 1008774839)
                # if a,n,p exist: series = a, series_index = n, title = p
                # if a exist: title = a
                # TODO: a,n,p,n (i.e. 956375146)
                code_p = []
                code_n = []
                code_a = []

                for j in i.xpath(".//marc21:subfield[@code='p']", namespaces=ns):
                    code_p.append(j.text.strip())

                for j in i.xpath(".//marc21:subfield[@code='n']", namespaces=ns):
                    match = re.search("(\d+[,\.\d+]?)", j.text.strip())
                    if match:
                        code_n.append(match.group(1))
                    else:
                        code_n.append("0")  # looks like sometimes DNB does not know the series index and uses something like "[...]"

                for j in i.xpath(".//marc21:subfield[@code='a']", namespaces=ns):
                    code_a.append(j.text.strip())

                if len(code_p) == 0:
                    title_parts = title_parts + code_a
                elif len(code_p) > 0 and len(code_p) == len(code_n):
                    series = " : ".join(code_a)  # I've never seen more than one code_a, but who knows...
                    for i in range(0, len(code_p) - 1):
                        series = series + " " + code_n[i] + " " + code_p[i]
                    series_index = code_n[-1]
                    title_parts.append(code_p[-1])

            # subtitle 1: Field 245
            for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns):
                title_parts.append(i.text.strip())
                break

            # subtitle 2
            #for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='c' and string-length(text())>0]", namespaces=ns):
            #    title = title + " / " + i.text.strip()
            #    break

            title = " : ".join(title_parts)

            # Log
            if series_index is not None:
                log.info("Extracted Series_Index from Field 245: %s" % series_index)
            if series is not None:
                log.info("Extracted Series from Field 245: %s" % series)
                series = self.cleanUpSeries(log, series, publisher_name)
            if title is not None:
                log.info("Extracted Title: %s" % title)
                title = self.cleanUpTitle(log, title)

            # Title_Sort
            if len(title_parts) > 0:
                title_sort_parts = list(title_parts)
                title_sort_regex = re.match('^(.*?)(' + chr(152) + '.*' + chr(156) + ')?(.*?)$', title_parts[0])
                sortword = title_sort_regex.group(2)
                if sortword:
                    title_sort_parts[0] = ''.join(filter(None, [title_sort_regex.group(1).strip(), title_sort_regex.group(3).strip(), ", " + sortword]))
                title_sort = " : ".join(title_sort_parts)

            # Log
            if title_sort is not None:
                log.info("Extracted Title_Sort: %s" % title_sort)

            ##### Field 100 and Field 700 #####
            # Authors
            for i in record.xpath(".//marc21:datafield[@tag='100']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):  # primary authors
                name = re.sub(" \[.*\]$", "", i.text.strip())
                authors.append(name)
            for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):  # secondary authors
                name = re.sub(" \[.*\]$", "", i.text.strip())
                authors.append(name)
            if len(authors) == 0:  # if no "real" author was found take all persons involved
                for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):  # secondary authors
                    name = re.sub(" \[.*\]$", "", i.text.strip())
                    authors.append(name)
            if len(authors) > 0:
                author_sort = authors[0]

            # Log
            if len(authors) > 0:
                log.info("Extracted Authors: %s" % " & ".join(authors))
            if author_sort is not None:
                log.info("Extracted Author_Sort: %s" % " & ".join(authors))

            ##### Field 856 #####
            # Comments
            for i in record.xpath(".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]", namespaces=ns):
                if i.text.startswith("http://deposit.dnb.de/"):
                    br = self.browser
                    log.info('Downloading Comments from: %s' % i.text)
                    try:
                        comments = br.open_novisit(i.text, timeout=30).read()
                        comments = re.sub('(\s|<br>|<p>|\n)*Angaben aus der Verlagsmeldung(\s|<br>|<p>|\n)*(<h3>.*?</h3>)?(\s|<br>|<p>|\n)*', '', comments, flags=re.IGNORECASE)
                        comments = sanitize_comments_html(comments)
                        break
                    except:
                        log.info("Could not download Comments from %s" % i)

            # Log
            if comments is not None:
                log.info('Comments: %s' % comments)

            # If no comments are found for this edition, look at other editions of this book (Fields 776)
            # TODO: Make this configurable (default: yes)
            if comments is None:
                # get all other issues
                for i in record.xpath(".//marc21:datafield[@tag='776']/marc21:subfield[@code='w' and string-length(text())>0]", namespaces=ns):
                    other_idn = re.sub("^\(.*\)", "", i.text.strip())
                    subquery = 'num=' + other_idn + ' NOT (mat=film OR mat=music OR mat=microfiches OR cod=tt)'
                    log.info(subquery)

                    if self.cfg_dnb_token is None:
                        subresults = self.getSearchResultsByScraping(log, subquery, timeout)
                    else:
                        subresults = self.getSearchResults(log, subquery, timeout)

                    if subresults is None:
                        continue

                    for subrecord in subresults:
                        for i in subrecord.xpath(".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]", namespaces=ns):
                            if i.text.startswith("http://deposit.dnb.de/"):
                                br = self.browser
                                log.info('Downloading Comments from: %s' % i.text)
                                try:
                                    comments = br.open_novisit(i.text, timeout=30).read()
                                    comments = re.sub('(\s|<br>|<p>|\n)*Angaben aus der Verlagsmeldung(\s|<br>|<p>|\n)*(<h3>.*?</h3>)?(\s|<br>|<p>|\n)*', '', comments, flags=re.IGNORECASE)
                                    comments = sanitize_comments_html(comments)
                                    break
                                except:
                                    log.info("Could not download Comments from %s" % i)
                        if comments is not None:
                            log.info('Comments from other issue: %s' % comments)
                            break

            ##### Field 016 #####
            # ID: IDN
            for i in record.xpath(".//marc21:datafield[@tag='016']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                idn = i.text.strip()
                break
            # Log
            if idn is not None:
                log.info("Extracted ID IDN: %s" % idn)

            ##### Field 024 #####
            # ID: URN
            for i in record.xpath(".//marc21:datafield[@tag='024']/marc21:subfield[@code='2' and text()='urn']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                urn = i.text.strip()
                break
            # Log
            if urn is not None:
                log.info("Extracted ID URN: %s" % urn)

            ##### Field 020 #####
            # ID: ISBN
            for i in record.xpath(".//marc21:datafield[@tag='020']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                isbn_regex = "(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]"
                match = re.search(isbn_regex, i.text.strip())
                if match:  # guard against identifiers that are not ISBNs
                    isbn = match.group()
                    isbn = isbn.replace('-', '')
                break
            # Log
            if isbn is not None:
                log.info("Extracted ID ISBN: %s" % isbn)

            # When doing an exact search for a given ISBN skip books with wrong ISBNs
            if isbn is not None and "isbn" in exact_search:
                if isbn != exact_search["isbn"]:
                    log.info("Extracted ISBN does not match book's ISBN, skipping record")
                    continue

            ##### Field 082 #####
            # ID: Sachgruppe (DDC)
            for i in record.xpath(".//marc21:datafield[@tag='082']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                ddc.append(i.text.strip())
            # Log
            if len(ddc) > 0:
                log.info("Extracted ID DDC: %s" % ",".join(ddc))

            ##### Field 490 #####
            # In theory this field is not used for "real" book series, use field 830 instead. But it is used.
            # Series and Series_Index
            if series is None or (series is not None and series_index == "0"):
                for i in record.xpath(".//marc21:datafield[@tag='490']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns):
                    # "v" is either "Nr. 220" or "This great Seriestitle : Nr. 220" - if available use this instead of attribute a
                    attr_v = i.xpath(".//marc21:subfield[@code='v']", namespaces=ns)[0].text.strip()
                    parts = re.split(" : ", attr_v)
                    if len(parts) == 2:
                        if bool(re.search("\d", parts[0])) != bool(re.search("\d", parts[1])):
                            # figure out which part contains the index
                            if bool(re.search("\d", parts[0])):
                                indexpart = parts[0]
                                textpart = parts[1]
                            else:
                                indexpart = parts[1]
                                textpart = parts[0]
                            match = re.search("(\d+[,\.\d+]?)", indexpart)
                            if match is not None:
                                series_index = match.group(1)
                                series = textpart.strip()
                    else:
                        match = re.search("(\d+[,\.\d+]?)", attr_v)
                        if match is not None:
                            series_index = match.group(1)
                        else:
                            series_index = "0"

                    series_index = series_index.replace(',', '.')

                    # Use Series Name from attribute "a" if not already found in attribute "v"
                    if series is None:
                        series = i.xpath(".//marc21:subfield[@code='a']", namespaces=ns)[0].text.strip()

                    # Log
                    if series_index is not None:
                        log.info("Extracted Series Index from Field 490: %s" % series_index)
                    if series is not None:
                        log.info("Extracted Series from Field 490: %s" % series)
                        series = self.cleanUpSeries(log, series, publisher_name)
                    if series is not None:
                        break

            ##### Field 246 #####
            # Series and Series_Index
            if series is None or (series is not None and series_index == "0"):
                for i in record.xpath(".//marc21:datafield[@tag='246']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    match = re.search("^(.+?) ; (\d+[,\.\d+]?)$", i.text.strip())
                    if match is not None:
                        series = match.group(1)
                        series_index = match.group(2)
                        # Log
                        if series_index is not None:
                            log.info("Extracted Series Index from Field 246: %s" % series_index)
                        if series is not None:
                            log.info("Extracted Series from Field 246: %s" % series)
                            series = self.cleanUpSeries(log, series, publisher_name)
                        if series is not None:
                            break

            ##### Field 800 #####
            # Series and Series_Index
            if series is None or (series is not None and series_index == "0"):
                for i in record.xpath(".//marc21:datafield[@tag='800']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='t' and string-length(text())>0]/..", namespaces=ns):
                    # Series Index
                    series_index = i.xpath(".//marc21:subfield[@code='v']", namespaces=ns)[0].text.strip()
                    match = re.search("(\d+[,\.\d+]?)", series_index)
                    if match is not None:
                        series_index = match.group(1)
                    else:
                        series_index = "0"
                    series_index = series_index.replace(',', '.')
                    # Series
                    series = i.xpath(".//marc21:subfield[@code='t']", namespaces=ns)[0].text.strip()
                    # Log
                    if series_index is not None:
                        log.info("Extracted Series Index from Field 800: %s" % series_index)
                    if series is not None:
                        log.info("Extracted Series from Field 800: %s" % series)
                        series = self.cleanUpSeries(log, series, publisher_name)
                    if series is not None:
                        break

            ##### Field 830 #####
            # Series and Series_Index
            if series is None or (series is not None and series_index == "0"):
                for i in record.xpath(".//marc21:datafield[@tag='830']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns):
                    # Series Index
                    series_index = i.xpath(".//marc21:subfield[@code='v']", namespaces=ns)[0].text.strip()
                    match = re.search("(\d+[,\.\d+]?)", series_index)
                    if match is not None:
                        series_index = match.group(1)
                    else:
                        series_index = "0"
                    series_index = series_index.replace(',', '.')
                    # Series
                    series = i.xpath(".//marc21:subfield[@code='a']", namespaces=ns)[0].text.strip()
                    # Log
                    if series_index is not None:
                        log.info("Extracted Series Index from Field 830: %s" % series_index)
                    if series is not None:
                        log.info("Extracted Series from Field 830: %s" % series)
                        series = self.cleanUpSeries(log, series, publisher_name)
                    if series is not None:
                        break

            ##### Field 689 #####
            # GND Subjects
            for i in record.xpath(".//marc21:datafield[@tag='689']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                subjects_gnd.append(i.text.strip())
            for f in range(600, 656):
                for i in record.xpath(".//marc21:datafield[@tag='" + str(f) + "']/marc21:subfield[@code='2' and text()='gnd']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    if i.text.startswith("("):
                        continue
                    subjects_gnd.append(i.text)
            # Log
            if len(subjects_gnd) > 0:
                log.info("Extracted GND Subjects: %s" % " ".join(subjects_gnd))

            ##### Fields 600-655 #####
            # Non-GND subjects
            for f in range(600, 656):
                for i in record.xpath(".//marc21:datafield[@tag='" + str(f) + "']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    # ignore entries starting with "(":
                    if i.text.startswith("("):
                        continue
                    subjects_non_gnd.extend(re.split(',|;', i.text))
            # remove one-character subjects:
            for i in subjects_non_gnd:
                if len(i) < 2:
                    subjects_non_gnd.remove(i)
            # Log
            if len(subjects_non_gnd) > 0:
                log.info("Extracted non-GND Subjects: %s" % " ".join(subjects_non_gnd))

            ##### Field 250 #####
            # Edition
            for i in record.xpath(".//marc21:datafield[@tag='250']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                edition = i.text.strip()
                break
            # Log
            if edition is not None:
                log.info("Extracted Edition: %s" % edition)

            ##### Field 041 #####
            # Languages
            for i in record.xpath(".//marc21:datafield[@tag='041']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                languages.append(i.text.strip())
            # Log
            if languages:
                log.info("Extracted Languages: %s" % ",".join(languages))

            ##### If configured: Try to separate Series, Series Index and Title from the fetched title #####
            #if self.cfg_guess_series is True:
            if (series is None or (series is not None and series_index == "0")) and self.cfg_guess_series is True:
                guessed_series = None
                guessed_series_index = None
                guessed_title = None

                log.info("Starting Series Guesser")

                parts = re.split("[:]", self.removeSortingCharacters(title))

                if len(parts) == 2:
                    log.info("Title has two parts")
                    # make sure only one part of the two parts contains digits
                    if bool(re.search("\d", parts[0])) != bool(re.search("\d", parts[1])):
                        log.info("only one title part contains digits")
                        # figure out which part contains the index
                        if bool(re.search("\d", parts[0])):
                            indexpart = parts[0]
                            textpart = parts[1]
                        else:
                            indexpart = parts[1]
                            textpart = parts[0]

                        # Look at the part without digits:
                        match = re.match("^[\s\-–:]*(.+?)[\s\-–:]*$", textpart)  # remove odd characters from start and end of the text part
                        if match:
                            textpart = match.group(1)

                        # Look at the part with digits:
                        # for Titleparts like: "Name of the series - Episode 2"
                        match = re.match("^\s*(\S\D*?[a-zA-Z]\D*?)\W[\(\/\.,\s\-–:]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", indexpart)
                        if match:
                            guessed_series_index = match.group(2)
                            guessed_series = match.group(1)
                            if guessed_series is None:
                                guessed_series = textpart
                                guessed_title = textpart + " : Band " + guessed_series_index
                            else:
                                guessed_title = textpart
                            #log.info("ALGO1: guessed_title: " + guessed_title)
                            #log.info("ALGO1: guessed_series: " + guessed_series)
                            #log.info("ALGO1: guessed_series_index: " + guessed_series_index)
                        else:
                            # for Titleparts like: "Episode 2 Name of the series"
                            match = re.match("^\s*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*(\S\D*?[a-zA-Z]\D*?)[\/\.,\-–\s]*$", indexpart)
                            if match:
                                guessed_series_index = match.group(1)
                                guessed_series = match.group(2)
                                if guessed_series is None:
                                    # sometimes books with multiple volumes are detected as series without name -> Add the volume to the title
                                    guessed_series = textpart
                                    guessed_title = textpart + " : Band " + guessed_series_index
                                else:
                                    guessed_title = textpart
                                #log.info("ALGO2: guessed_title: " + guessed_title)
                                #log.info("ALGO2: guessed_series: " + guessed_series)
                                #log.info("ALGO2: guessed_series_index: " + guessed_series_index)
                            else:
                                # for titleparts like: "Band 2"
                                match = re.match("^[\s\(]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*[\/\.,\-–\s]*$", indexpart)
                                if match:
                                    guessed_series_index = match.group(1)
                                    # ...with textpart like NAME OF SERIES\s[\-\.;:]\sNAME OF TITLE
                                    # some false positives
                                    match = re.match("^\s*(\w+.+?)\s?[\.;\-–:]+\s(\w+.+)\s*$", textpart)
                                    if match:
                                        guessed_series = match.group(1)
                                        guessed_title = match.group(2)
                                        log.info("ALGO3: guessed_title: " + guessed_title)
                                        log.info("ALGO3: guessed_series: " + guessed_series)
                                        log.info("ALGO3: guessed_series_index: " + guessed_series_index)

                elif len(parts) == 1:
                    log.info("Title has one part")
                    # for Titles like: "Name of the series - Title (Episode 2)"
                    match = re.match("^\s*(\S.+?) \- (\S.+?) [\(\/\.,\s\-–:](?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", parts[0])
                    if match:
                        guessed_series_index = match.group(3)
                        guessed_series = match.group(1)
                        guessed_title = match.group(2)
                        #log.info("ALGO4: guessed_title: " + guessed_title)
                        #log.info("ALGO4: guessed_series: " + guessed_series)
                        #log.info("ALGO4: guessed_series_index: " + guessed_series_index)
                    else:
                        # for Titles like: "Name of the series - Episode 2"
                        match = re.match("^\s*(\S.+?)[\(\/\.,\s\-–:]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", parts[0])
                        if match:
                            guessed_series_index = match.group(2)
                            guessed_series = match.group(1)
                            guessed_title = guessed_series + " : Band " + guessed_series_index
                            #log.info("ALGO5: guessed_title: " + guessed_title)
                            #log.info("ALGO5: guessed_series: " + guessed_series)
                            #log.info("ALGO5: guessed_series_index: " + guessed_series_index)

                # Log
                if guessed_series is not None:
                    log.info("Guessed Series: %s" % guessed_series)
                    #guessed_series = self.cleanUpSeries(log, guessed_series, publisher_name)
                if guessed_series_index is not None:
                    log.info("Guessed Series Index: %s" % guessed_series_index)
                if guessed_title is not None:
                    log.info("Guessed Title: %s" % guessed_title)
                    guessed_title = self.cleanUpTitle(log, guessed_title)

                if guessed_series is not None and guessed_series_index is not None and guessed_title is not None:
                    title = guessed_title
                    series = guessed_series
                    series_index = guessed_series_index

            ##### Filter exact searches #####
            # When doing an exact search for a given IDN skip books with wrong IDNs
            # TODO: Currently exact_search for ISBN is not implemented. Would require ISBN-10 and ISBN-13 conversions
            if idn is not None and "idn" in exact_search:
                if idn != exact_search["idn"]:
                    log.info("Extracted IDN does not match book's IDN, skipping record")
                    continue

            ##### Put it all together #####
            if self.cfg_append_edition_to_title == True and edition is not None:
                title = title + " : " + edition

            mi = Metadata(self.removeSortingCharacters(title),
                          map(lambda i: self.removeSortingCharacters(i), authors))
            mi.title_sort = self.removeSortingCharacters(title_sort)
            mi.author_sort = self.removeSortingCharacters(author_sort)
            mi.languages = languages
            mi.pubdate = pubdate
            mi.publisher = " ; ".join(filter(None, [publisher_location, self.removeSortingCharacters(publisher_name)]))
            mi.series = self.removeSortingCharacters(series)
            mi.series_index = series_index
            mi.comments = comments
            mi.isbn = isbn  # also required for cover download
            mi.set_identifier('urn', urn)
            mi.set_identifier('dnb-idn', idn)
            mi.set_identifier('ddc', ",".join(ddc))

            # cfg_subjects:
            # 0: use only subjects_gnd
            if self.cfg_fetch_subjects == 0:
                mi.tags = self.uniq(subjects_gnd)
            # 1: use only subjects_gnd if found, else subjects_non_gnd
            elif self.cfg_fetch_subjects == 1:
                if len(subjects_gnd) > 0:
                    mi.tags = self.uniq(subjects_gnd)
                else:
                    mi.tags = self.uniq(subjects_non_gnd)
            # 2: subjects_gnd and subjects_non_gnd
            elif self.cfg_fetch_subjects == 2:
                mi.tags = self.uniq(subjects_gnd + subjects_non_gnd)
            # 3: use only subjects_non_gnd if found, else subjects_gnd
            elif self.cfg_fetch_subjects == 3:
                if len(subjects_non_gnd) > 0:
                    mi.tags = self.uniq(subjects_non_gnd)
                else:
                    mi.tags = self.uniq(subjects_gnd)
            # 4: use only subjects_non_gnd
            elif self.cfg_fetch_subjects == 4:
                mi.tags = self.uniq(subjects_non_gnd)
            # 5: use no subjects at all
            elif self.cfg_fetch_subjects == 5:
                mi.tags = []

            # put current result's metadata into result queue
            log.info("Final formatted result: \n%s" % mi)
            result_queue.put(mi)
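
# The identify() method above dedups queries, author variants and title
# variants with repeated "if i not in ...: append(i)" loops, and routes
# subjects through a self.uniq helper. A hypothetical order-preserving
# helper equivalent to those loops (the plugin's own uniq is assumed to
# behave like this, but is not shown in the source above):
def uniq(seq):
    seen = set()
    out = []
    for item in seq:
        if item not in seen:  # keep only the first occurrence
            seen.add(item)
            out.append(item)
    return out

# e.g. uniq(['a', 'b', 'a']) == ['a', 'b'], unlike set(), which loses order.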
#!/usr/bin/env python
def do_download_for_worker(book, options, merge, notification=lambda x, y: x):
    '''
    Child job, to download story when run as a worker job
    '''
    from calibre_plugins.fanficfare_plugin import FanFicFareBase
    fffbase = FanFicFareBase(options['plugin_path'])
    with fffbase:
        from calibre_plugins.fanficfare_plugin.dialogs import (NotGoingToDownload,
            OVERWRITE, OVERWRITEALWAYS, UPDATE, UPDATEALWAYS, ADDNEW, SKIP,
            CALIBREONLY, CALIBREONLYSAVECOL)
        from calibre_plugins.fanficfare_plugin.fanficfare import adapters, writers, exceptions
        from calibre_plugins.fanficfare_plugin.fanficfare.epubutils import get_update_data
        from calibre_plugins.fanficfare_plugin.fff_util import (get_fff_adapter, get_fff_config)

        try:
            book['comment'] = _('Download started...')

            configuration = get_fff_config(book['url'],
                                           options['fileform'],
                                           options['personal.ini'])

            if configuration.getConfig('use_ssl_unverified_context'):
                ## monkey patch to avoid SSL bug. duplicated from
                ## fff_plugin.py because bg jobs run in own process
                ## space.
                import ssl
                if hasattr(ssl, '_create_unverified_context'):
                    ssl._create_default_https_context = ssl._create_unverified_context

            if not options['updateepubcover'] and 'epub_for_update' in book and options['collision'] in (UPDATE, UPDATEALWAYS):
                configuration.set("overrides", "never_make_cover", "true")

            # images only for epub, html, even if the user mistakenly
            # turned it on elsewhere.
            if options['fileform'] not in ("epub", "html"):
                configuration.set("overrides", "include_images", "false")

            adapter = adapters.getAdapter(configuration, book['url'])
            adapter.is_adult = book['is_adult']
            adapter.username = book['username']
            adapter.password = book['password']
            adapter.setChaptersRange(book['begin'], book['end'])

            adapter.load_cookiejar(options['cookiejarfile'])
            #logger.debug("cookiejar:%s"%adapter.cookiejar)
            adapter.set_pagecache(options['pagecache'])

            story = adapter.getStoryMetadataOnly()
            if 'calibre_series' in book:
                adapter.setSeries(book['calibre_series'][0], book['calibre_series'][1])

            # set PI version instead of default.
            if 'version' in options:
                story.setMetadata('version', options['version'])

            book['title'] = story.getMetadata("title", removeallentities=True)
            book['author_sort'] = book['author'] = story.getList("author", removeallentities=True)
            book['publisher'] = story.getMetadata("site")
            book['url'] = story.getMetadata("storyUrl")
            book['tags'] = story.getSubjectTags(removeallentities=True)
            if story.getMetadata("description"):
                book['comments'] = sanitize_comments_html(story.getMetadata("description"))
            else:
                book['comments'] = ''
            book['series'] = story.getMetadata("series", removeallentities=True)

            if story.getMetadataRaw('datePublished'):
                book['pubdate'] = story.getMetadataRaw('datePublished').replace(tzinfo=local_tz)
            if story.getMetadataRaw('dateUpdated'):
                book['updatedate'] = story.getMetadataRaw('dateUpdated').replace(tzinfo=local_tz)
            if story.getMetadataRaw('dateCreated'):
                book['timestamp'] = story.getMetadataRaw('dateCreated').replace(tzinfo=local_tz)
            else:
                book['timestamp'] = None  # need *something* there for calibre.

            writer = writers.getWriter(options['fileform'], configuration, adapter)
            outfile = book['outfile']

            ## No need to download at all. Shouldn't ever get down here.
            if options['collision'] in (CALIBREONLY, CALIBREONLYSAVECOL):
                logger.info("Skipping CALIBREONLY 'update' down inside worker--this shouldn't be happening...")
                book['comment'] = 'Metadata collected.'
                book['all_metadata'] = story.getAllMetadata(removeallentities=True)
                if options['savemetacol'] != '':
                    book['savemetacol'] = story.dump_html_metadata()

            ## checks were done earlier, it's new or not dup or newer--just write it.
            elif options['collision'] in (ADDNEW, SKIP, OVERWRITE, OVERWRITEALWAYS) or \
                    ('epub_for_update' not in book and options['collision'] in (UPDATE, UPDATEALWAYS)):

                # preserve logfile even on overwrite.
                if 'epub_for_update' in book:
                    adapter.logfile = get_update_data(book['epub_for_update'])[6]
                    # change the existing entries id to notid so
                    # write_epub writes a whole new set to indicate overwrite.
                    if adapter.logfile:
                        adapter.logfile = adapter.logfile.replace("span id", "span notid")

                if options['collision'] == OVERWRITE and 'fileupdated' in book:
                    lastupdated = story.getMetadataRaw('dateUpdated')
                    fileupdated = book['fileupdated']

                    # updated doesn't have time (or is midnight), use dates only.
                    # updated does have time, use full timestamps.
                    if (lastupdated.time() == time.min and fileupdated.date() > lastupdated.date()) or \
                            (lastupdated.time() != time.min and fileupdated > lastupdated):
                        raise NotGoingToDownload(_("Not Overwriting, web site is not newer."), 'edit-undo.png')

                logger.info("write to %s" % outfile)
                inject_cal_cols(book, story, configuration)
                writer.writeStory(outfilename=outfile, forceOverwrite=True)

                book['comment'] = 'Download %s completed, %s chapters.' % (options['fileform'], story.getMetadata("numChapters"))
                book['all_metadata'] = story.getAllMetadata(removeallentities=True)
                if options['savemetacol'] != '':
                    book['savemetacol'] = story.dump_html_metadata()

            ## checks were done earlier, just update it.
            elif 'epub_for_update' in book and options['collision'] in (UPDATE, UPDATEALWAYS):

                # update now handled by pre-populating the old images and
                # chapters in the adapter rather than merging epubs.
                urlchaptercount = int(story.getMetadata('numChapters').replace(',', ''))
                (url,
                 chaptercount,
                 adapter.oldchapters,
                 adapter.oldimgs,
                 adapter.oldcover,
                 adapter.calibrebookmark,
                 adapter.logfile,
                 adapter.oldchaptersmap,
                 adapter.oldchaptersdata) = get_update_data(book['epub_for_update'])[0:9]

                # dup handling from fff_plugin needed for anthology updates.
                if options['collision'] == UPDATE:
                    if chaptercount == urlchaptercount:
                        if merge:
                            book['comment'] = _("Already contains %d chapters.  Reuse as is.") % chaptercount
                            book['all_metadata'] = story.getAllMetadata(removeallentities=True)
                            if options['savemetacol'] != '':
                                book['savemetacol'] = story.dump_html_metadata()
                            book['outfile'] = book['epub_for_update']  # for anthology merge ops.
                            return book
                        else:  # not merge,
                            raise NotGoingToDownload(_("Already contains %d chapters.") % chaptercount, 'edit-undo.png')
                    elif chaptercount > urlchaptercount:
                        raise NotGoingToDownload(_("Existing epub contains %d chapters, web site only has %d. Use Overwrite to force update.") % (chaptercount, urlchaptercount), 'dialog_error.png')
                    elif chaptercount == 0:
                        raise NotGoingToDownload(_("FanFicFare doesn't recognize chapters in existing epub, epub is probably from a different source. Use Overwrite to force update."), 'dialog_error.png')

                if not (options['collision'] == UPDATEALWAYS and chaptercount == urlchaptercount) \
                        and adapter.getConfig("do_update_hook"):
                    chaptercount = adapter.hookForUpdates(chaptercount)

                logger.info("Do update - epub(%d) vs url(%d)" % (chaptercount, urlchaptercount))
                logger.info("write to %s" % outfile)

                inject_cal_cols(book, story, configuration)
                writer.writeStory(outfilename=outfile, forceOverwrite=True)

                book['comment'] = _('Update %s completed, added %s chapters for %s total.') % \
                    (options['fileform'], (urlchaptercount - chaptercount), urlchaptercount)
                book['all_metadata'] = story.getAllMetadata(removeallentities=True)
                if options['savemetacol'] != '':
                    book['savemetacol'] = story.dump_html_metadata()

            if options['do_wordcount'] == SAVE_YES or (
                    options['do_wordcount'] == SAVE_YES_UNLESS_SITE and not story.getMetadataRaw('numWords')):
                wordcount = get_word_count(outfile)
                logger.info("get_word_count:%s" % wordcount)
                story.setMetadata('numWords', wordcount)
                writer.writeStory(outfilename=outfile, forceOverwrite=True)
                book['all_metadata'] = story.getAllMetadata(removeallentities=True)
                if options['savemetacol'] != '':
                    book['savemetacol'] = story.dump_html_metadata()

            if options['smarten_punctuation'] and options['fileform'] == "epub" \
                    and calibre_version >= (0, 9, 39):
                # for smarten punc
                from calibre.ebooks.oeb.polish.main import polish, ALL_OPTS
                from calibre.utils.logging import Log
                from collections import namedtuple

                # do smarten_punctuation from calibre's polish feature
                data = {'smarten_punctuation': True}
                opts = ALL_OPTS.copy()
                opts.update(data)
                O = namedtuple('Options', ' '.join(ALL_OPTS.iterkeys()))
                opts = O(**opts)

                log = Log(level=Log.DEBUG)
                polish({outfile: outfile}, opts, log, logger.info)

        except NotGoingToDownload as d:
            book['good'] = False
            book['comment'] = unicode(d)
            book['icon'] = d.icon

        except Exception as e:
            book['good'] = False
            book['comment'] = unicode(e)
            book['icon'] = 'dialog_error.png'
            book['status'] = 'Error'
            logger.info("Exception: %s:%s" % (book, unicode(e)))
            traceback.print_exc()
            #time.sleep(10)

    return book
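
# A condensed decision table for the collision modes handled above,
# summarized from do_download_for_worker for reference (the constant
# names come from the plugin's dialogs module; the descriptions are a
# paraphrase, not an authoritative spec):
COLLISION_BEHAVIOR = {
    'CALIBREONLY / CALIBREONLYSAVECOL': 'collect metadata only; never download',
    'ADDNEW / SKIP / OVERWRITEALWAYS': 'write a fresh file unconditionally',
    'OVERWRITE': 'write a fresh file, but refuse when the web site is not newer',
    'UPDATE': 'refuse if the existing epub already has as many (or more) chapters',
    'UPDATEALWAYS': 'update regardless of chapter counts',
}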
def test_markdown(self):
    from calibre.ebooks.markdown import Markdown
    Markdown(extensions=['extra'])
    from calibre.library.comments import sanitize_comments_html
    sanitize_comments_html(
        b'''<script>moo</script>xxx<img src="http://moo.com/x.jpg">''')
def parse_comments(self, root):
    # <!-- 책소개-->
    # aladin uses another request for description and toc:
    # http://www.aladin.co.kr/shop/product/getContents.aspx?ISBN=8970122648&name=Introduce&type=0&date=16
    urlDesc = "http://www.aladin.co.kr/shop/product/getContents.aspx?ISBN=%s&name=Introduce&type=0&date=%s" % (
        self.isbn,
        datetime.datetime.now().hour,
    )
    # TODO: foreign book description
    # publisher-provided description (출판사 제공 책소개) - foreign books:
    # http://www.aladin.co.kr/shop/product/getContents.aspx?ISBN=0385340583&name=PublisherDesc&type=0&date=15

    comments = ""
    toc = ""
    rawDesc = None   # stays None when the description request fails
    rootDesc = None

    try:
        self.browser.addheaders = [("Referer", self.url)]
        rawDesc = self.browser.open_novisit(urlDesc, timeout=self.timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, "getcode", None)) and e.getcode() == 404:
            self.log.error("URL malformed: %r" % urlDesc)
        else:
            attr = getattr(e, "args", [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = "Aladin timed out. Try again later."
                self.log.error(msg)
            else:
                msg = "Failed to make Descriptions query: %r" % urlDesc
                self.log.exception(msg)

    if rawDesc:
        try:
            # rawDesc = rawDesc.decode('euc-kr', errors='replace')
            # 2015-03-19 22:26:51
            rawDesc = rawDesc.decode("utf-8", errors="replace")
            rootDesc = fromstring(clean_ascii_chars(rawDesc))
            nodeDesc = rootDesc.xpath('//div[@class="p_textbox"]')
            if nodeDesc:
                self._removeTags(nodeDesc[0], ["object", "script", "style"])
                comments = tostring(nodeDesc[0], method="html")
        except:
            msg = "Failed to parse aladin details page: %r" % urlDesc
            self.log.exception(msg)

    default_append_toc = cfg.DEFAULT_STORE_VALUES[cfg.KEY_APPEND_TOC]
    append_toc = cfg.plugin_prefs[cfg.STORE_NAME].get(cfg.KEY_APPEND_TOC, default_append_toc)
    if rootDesc is not None and append_toc:
        toc_node = rootDesc.xpath('//div[@id="div_TOC_All"]//p')
        if not toc_node:
            toc_node = rootDesc.xpath('//div[@id="div_TOC_Short"]//p')
        if toc_node:
            toc = tostring(toc_node[0], method="html")
            toc = sanitize_comments_html(toc)

    if not comments:
        # Look for description in a meta
        description_node = root.xpath('//meta[@name="Description"]/@content')
        if description_node:
            # return description_node[0]
            comments = description_node[0]

    if comments:
        comments = '<div id="comments">' + comments + "</div>"
    if toc:
        comments += '<h3>[목차]</h3><div id="toc">' + toc + "</div>"
    if comments:
        comments_suffix = cfg.DEFAULT_STORE_VALUES[cfg.KEY_COMMENTS_SUFFIX]
        comments_suffix = cfg.plugin_prefs[cfg.STORE_NAME].get(cfg.KEY_COMMENTS_SUFFIX, comments_suffix)
        # comments += '<hr /><div><div style="float:right">[aladin.co.kr]</div></div>'
        if comments_suffix:
            comments += comments_suffix
    return comments
if __name__ == '__main__':
    # To run these tests use:
    # calibre-debug -e src/calibre/ebooks/metadata/sources/overdrive.py
    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
            title_test, authors_test)
    test_identify_plugin(OverDrive.name, [
        ({'title': 'The Sea Kings Daughter', 'authors': ['Elizabeth Peters']},
         [title_test('The Sea Kings Daughter', exact=False),
          authors_test(['Elizabeth Peters'])]),
    ])

def do_download_for_worker(book, options, merge, notification=lambda x, y: x):
    '''
    Child job, to download story when run as a worker job
    '''
    from calibre_plugins.fanficfare_plugin import FanFicFareBase
    fffbase = FanFicFareBase(options['plugin_path'])
    with fffbase:
        from calibre_plugins.fanficfare_plugin.dialogs import (
            NotGoingToDownload, OVERWRITE, OVERWRITEALWAYS, UPDATE,
            UPDATEALWAYS, ADDNEW, SKIP, CALIBREONLY, CALIBREONLYSAVECOL)
        from calibre_plugins.fanficfare_plugin.fanficfare import (
            adapters, writers, exceptions)
        from calibre_plugins.fanficfare_plugin.fanficfare.epubutils import (
            get_update_data)
        from calibre_plugins.fanficfare_plugin.fff_util import (
            get_fff_adapter, get_fff_config)

        try:
            book['comment'] = _('Download started...')

            configuration = get_fff_config(book['url'], options['fileform'],
                                           options['personal.ini'])

            if configuration.getConfig('use_ssl_unverified_context'):
                ## monkey patch to avoid SSL bug. duplicated from
                ## fff_plugin.py because bg jobs run in their own
                ## process space.
                import ssl
                if hasattr(ssl, '_create_unverified_context'):
                    ssl._create_default_https_context = ssl._create_unverified_context

            if not options['updateepubcover'] and 'epub_for_update' in book \
                    and options['collision'] in (UPDATE, UPDATEALWAYS):
                configuration.set("overrides", "never_make_cover", "true")

            # images only for epub, html, even if the user mistakenly
            # turned it on elsewhere.
            if options['fileform'] not in ("epub", "html"):
                configuration.set("overrides", "include_images", "false")

            adapter = adapters.getAdapter(configuration, book['url'])
            adapter.is_adult = book['is_adult']
            adapter.username = book['username']
            adapter.password = book['password']
            adapter.setChaptersRange(book['begin'], book['end'])

            adapter.load_cookiejar(options['cookiejarfile'])
            logger.debug("cookiejar:%s" % adapter.cookiejar)
            adapter.set_pagecache(options['pagecache'])

            story = adapter.getStoryMetadataOnly()
            if 'calibre_series' in book:
                adapter.setSeries(book['calibre_series'][0],
                                  book['calibre_series'][1])

            # set PI version instead of default.
            if 'version' in options:
                story.setMetadata('version', options['version'])

            book['title'] = story.getMetadata("title", removeallentities=True)
            book['author_sort'] = book['author'] = story.getList(
                "author", removeallentities=True)
            book['publisher'] = story.getMetadata("site")
            book['url'] = story.getMetadata("storyUrl")
            book['tags'] = story.getSubjectTags(removeallentities=True)
            if story.getMetadata("description"):
                book['comments'] = sanitize_comments_html(
                    story.getMetadata("description"))
            else:
                book['comments'] = ''
            book['series'] = story.getMetadata("series", removeallentities=True)

            if story.getMetadataRaw('datePublished'):
                book['pubdate'] = story.getMetadataRaw(
                    'datePublished').replace(tzinfo=local_tz)
            if story.getMetadataRaw('dateUpdated'):
                book['updatedate'] = story.getMetadataRaw(
                    'dateUpdated').replace(tzinfo=local_tz)
            if story.getMetadataRaw('dateCreated'):
                book['timestamp'] = story.getMetadataRaw(
                    'dateCreated').replace(tzinfo=local_tz)
            else:
                book['timestamp'] = None  # need *something* there for calibre.

            writer = writers.getWriter(options['fileform'], configuration, adapter)
            outfile = book['outfile']

            ## No need to download at all. Shouldn't ever get down here.
            if options['collision'] in (CALIBREONLY, CALIBREONLYSAVECOL):
                logger.info("Skipping CALIBREONLY 'update' down inside worker"
                            "--this shouldn't be happening...")
                book['comment'] = 'Metadata collected.'
                book['all_metadata'] = story.getAllMetadata(removeallentities=True)
                if options['savemetacol'] != '':
                    book['savemetacol'] = story.dump_html_metadata()

            ## checks were done earlier, it's new or not dup or newer--just write it.
            elif options['collision'] in (ADDNEW, SKIP, OVERWRITE, OVERWRITEALWAYS) or \
                    ('epub_for_update' not in book and
                     options['collision'] in (UPDATE, UPDATEALWAYS)):

                # preserve logfile even on overwrite.
                if 'epub_for_update' in book:
                    adapter.logfile = get_update_data(book['epub_for_update'])[6]
                    # change the existing entries' id to notid so
                    # write_epub writes a whole new set to indicate overwrite.
                    if adapter.logfile:
                        adapter.logfile = adapter.logfile.replace(
                            "span id", "span notid")

                if options['collision'] == OVERWRITE and 'fileupdated' in book:
                    lastupdated = story.getMetadataRaw('dateUpdated')
                    fileupdated = book['fileupdated']

                    # updated doesn't have time (or is midnight), use dates only.
                    # updated does have time, use full timestamps.
                    if (lastupdated.time() == time.min and
                            fileupdated.date() > lastupdated.date()) or \
                            (lastupdated.time() != time.min and
                             fileupdated > lastupdated):
                        raise NotGoingToDownload(
                            _("Not Overwriting, web site is not newer."),
                            'edit-undo.png')

                logger.info("write to %s" % outfile)
                inject_cal_cols(book, story, configuration)
                writer.writeStory(outfilename=outfile, forceOverwrite=True)

                book['comment'] = 'Download %s completed, %s chapters.' % (
                    options['fileform'], story.getMetadata("numChapters"))
                book['all_metadata'] = story.getAllMetadata(removeallentities=True)
                if options['savemetacol'] != '':
                    book['savemetacol'] = story.dump_html_metadata()

            ## checks were done earlier, just update it.
            elif 'epub_for_update' in book and \
                    options['collision'] in (UPDATE, UPDATEALWAYS):

                # update now handled by pre-populating the old images and
                # chapters in the adapter rather than merging epubs.
                urlchaptercount = int(
                    story.getMetadata('numChapters').replace(',', ''))
                (url, chaptercount, adapter.oldchapters, adapter.oldimgs,
                 adapter.oldcover, adapter.calibrebookmark, adapter.logfile,
                 adapter.oldchaptersmap, adapter.oldchaptersdata) = \
                    get_update_data(book['epub_for_update'])[0:9]

                # dup handling from fff_plugin needed for anthology updates.
                if options['collision'] == UPDATE:
                    if chaptercount == urlchaptercount:
                        if merge:
                            book['comment'] = _(
                                "Already contains %d chapters. Reuse as is.") % chaptercount
                            book['all_metadata'] = story.getAllMetadata(
                                removeallentities=True)
                            if options['savemetacol'] != '':
                                book['savemetacol'] = story.dump_html_metadata()
                            book['outfile'] = book['epub_for_update']  # for anthology merge ops.
                            return book
                        else:  # not merge,
                            raise NotGoingToDownload(
                                _("Already contains %d chapters.") % chaptercount,
                                'edit-undo.png')
                    elif chaptercount > urlchaptercount:
                        raise NotGoingToDownload(
                            _("Existing epub contains %d chapters, web site only has %d. Use Overwrite to force update.") % (
                                chaptercount, urlchaptercount),
                            'dialog_error.png')
                    elif chaptercount == 0:
                        raise NotGoingToDownload(
                            _("FanFicFare doesn't recognize chapters in existing epub, epub is probably from a different source. Use Overwrite to force update."),
                            'dialog_error.png')

                if not (options['collision'] == UPDATEALWAYS and
                        chaptercount == urlchaptercount) \
                        and adapter.getConfig("do_update_hook"):
                    chaptercount = adapter.hookForUpdates(chaptercount)

                logger.info("Do update - epub(%d) vs url(%d)" % (
                    chaptercount, urlchaptercount))
                logger.info("write to %s" % outfile)

                inject_cal_cols(book, story, configuration)
                writer.writeStory(outfilename=outfile, forceOverwrite=True)

                book['comment'] = _('Update %s completed, added %s chapters for %s total.') % (
                    options['fileform'],
                    (urlchaptercount - chaptercount),
                    urlchaptercount)
                book['all_metadata'] = story.getAllMetadata(removeallentities=True)
                if options['savemetacol'] != '':
                    book['savemetacol'] = story.dump_html_metadata()

            if options['smarten_punctuation'] and options['fileform'] == "epub" \
                    and calibre_version >= (0, 9, 39):
                # do smarten_punctuation from calibre's polish feature
                from calibre.ebooks.oeb.polish.main import polish, ALL_OPTS
                from calibre.utils.logging import Log
                from collections import namedtuple

                data = {'smarten_punctuation': True}
                opts = ALL_OPTS.copy()
                opts.update(data)
                O = namedtuple('Options', ' '.join(ALL_OPTS.keys()))
                opts = O(**opts)

                log = Log(level=Log.DEBUG)
                polish({outfile: outfile}, opts, log, logger.info)

        except NotGoingToDownload as d:
            book['good'] = False
            book['comment'] = unicode(d)
            book['icon'] = d.icon

        except Exception as e:
            book['good'] = False
            book['comment'] = unicode(e)
            book['icon'] = 'dialog_error.png'
            book['status'] = 'Error'
            logger.info("Exception: %s:%s" % (book, unicode(e)))
            traceback.print_exc()

    return book
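
# The worker above is driven entirely by plain dicts marshalled from the GUI
# process. A minimal sketch of the keys it actually reads -- the values here
# are hypothetical placeholders, and the real dicts built by the plugin's
# dialogs carry more entries (epub_for_update, calibre_series, fileupdated, ...):
def example_worker_payload():
    from calibre_plugins.fanficfare_plugin.dialogs import ADDNEW
    options = {
        'plugin_path': '/path/to/FanFicFare.zip',  # hypothetical
        'fileform': 'epub',
        'personal.ini': '',                        # user's personal.ini text
        'collision': ADDNEW,                       # one of the dialog constants
        'updateepubcover': False,
        'cookiejarfile': '/tmp/fff_cookies.lwp',   # hypothetical
        'pagecache': {},
        'savemetacol': '',
        'smarten_punctuation': False,
    }
    book = {
        'url': 'https://example.com/s/12345',      # hypothetical story URL
        'is_adult': False,
        'username': '', 'password': '',
        'begin': None, 'end': None,                # chapter range
        'outfile': '/tmp/story.epub',              # hypothetical
    }
    return book, options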

def parse_comments(self, root):
    # <!-- 책소개 (book description) -->
    # aladin uses a separate request for the description and TOC:
    # http://www.aladin.co.kr/shop/product/getContents.aspx?ISBN=8970122648&name=Introduce&type=0&date=16
    urlDesc = "http://www.aladin.co.kr/shop/product/getContents.aspx?ISBN=%s&name=Introduce&type=0&date=%s" % (
        self.isbn, datetime.datetime.now().hour)
    # TODO: foreign book description
    # 출판사 제공 책소개 (publisher-supplied description) - 외국 도서 (foreign books),
    # http://www.aladin.co.kr/shop/product/getContents.aspx?ISBN=0385340583&name=PublisherDesc&type=0&date=15

    comments = ''
    toc = ''
    rawDesc = None
    try:
        self.browser.addheaders = [('Referer', self.url)]
        rawDesc = self.browser.open_novisit(
            urlDesc, timeout=self.timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % urlDesc)
        else:
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = 'Aladin timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make Descriptions query: %r' % urlDesc
                self.log.exception(msg)

    rootDesc = None
    if rawDesc:
        try:
            # rawDesc = rawDesc.decode('euc-kr', errors='replace')  # 2015-03-19 22:26:51
            rawDesc = rawDesc.decode('utf-8', errors='replace')
            rootDesc = fromstring(clean_ascii_chars(rawDesc))
            nodeDesc = rootDesc.xpath('//div[@class="p_textbox"]')
            if nodeDesc:
                self._removeTags(nodeDesc[0], ["object", "script", "style"])
                comments = tostring(nodeDesc[0], method='html')
        except Exception:
            msg = 'Failed to parse aladin details page: %r' % urlDesc
            self.log.exception(msg)

    default_append_toc = cfg.DEFAULT_STORE_VALUES[cfg.KEY_APPEND_TOC]
    append_toc = cfg.plugin_prefs[cfg.STORE_NAME].get(
        cfg.KEY_APPEND_TOC, default_append_toc)
    if rootDesc is not None and append_toc:
        toc_node = rootDesc.xpath('//div[@id="div_TOC_All"]//p')
        if not toc_node:
            toc_node = rootDesc.xpath('//div[@id="div_TOC_Short"]//p')
        if toc_node:
            toc = tostring(toc_node[0], method='html')
            toc = sanitize_comments_html(toc)

    if not comments:
        # Fall back to the meta description tag.
        description_node = root.xpath('//meta[@name="Description"]/@content')
        if description_node:
            comments = description_node[0]

    if comments:
        comments = '<div id="comments">' + comments + '</div>'
    if toc:
        # [목차] = table of contents
        comments += '<h3>[목차]</h3><div id="toc">' + toc + "</div>"
    if comments:
        comments_suffix = cfg.DEFAULT_STORE_VALUES[cfg.KEY_COMMENTS_SUFFIX]
        comments_suffix = cfg.plugin_prefs[cfg.STORE_NAME].get(
            cfg.KEY_COMMENTS_SUFFIX, comments_suffix)
        if comments_suffix:
            comments += comments_suffix

    return comments
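
# _removeTags() is used above but defined elsewhere in the plugin. A minimal
# sketch of the assumed behaviour -- drop the named elements from the fetched
# description subtree before it is serialized (the real helper may differ):
def _removeTags(self, node, tag_names):
    for tag in tag_names:
        for el in node.xpath('.//' + tag):
            el.getparent().remove(el)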

def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=30):
    self.load_config()

    # get identifying tags from book
    idn = identifiers.get('dnb-idn', None)
    isbn = check_isbn(identifiers.get('isbn', None))

    # ignore unknown-author placeholders (the original used "is" string
    # comparisons, which never match; calibre passes authors as a list)
    if authors and authors[0] in ("V. A.", "V.A.", "Unknown", "Unbekannt"):
        authors = None

    if (isbn is None) and (idn is None) and (title is None) and (authors is None):
        log.info("This plugin requires at least either ISBN, IDN, Title or Author(s).")
        return None

    queries = []
    # DNB does not do an exact search when searching for an idn or isbn,
    # so we have to filter the results
    exact_search = {}

    if idn is not None:
        queries.append('num=' + idn)
        exact_search['idn'] = idn
    else:
        authors_v = []
        title_v = []

        if authors is not None:
            authors_v.append(' '.join(authors))
            authors_v.append(' '.join(
                self.get_author_tokens(authors, only_first_author=False)))
            authors_v.append(' '.join(
                self.get_author_tokens(authors, only_first_author=True)))

        if title is not None:
            title_v.append(title)
            title_v.append(' '.join(self.get_title_tokens(
                title, strip_joiners=False, strip_subtitle=False)))
            title_v.append(' '.join(self.get_title_tokens(
                title, strip_joiners=False, strip_subtitle=True)))

        if isbn is not None:
            exact_search['isbn'] = isbn

        # title and author
        if authors is not None and title is not None:
            for a in authors_v:
                for t in title_v:
                    if isbn is not None:
                        queries.append('tit="' + t + '" AND per="' + a + '" AND num="' + isbn + '"')
                    else:
                        queries.append('tit="' + t + '" AND per="' + a + '"')

            # try with author and title swapped
            if isbn is not None:
                queries.append('per="' + title + '" AND tit="' + authors[0] + '" AND num="' + isbn + '"')
            else:
                queries.append('per="' + title + '" AND tit="' + authors[0] + '"')

        # author but no title
        elif authors is not None and title is None:
            for i in authors_v:
                if isbn is not None:
                    queries.append('per="' + i + '" AND num="' + isbn + '"')
                else:
                    queries.append('per="' + i + '"')

            # try the author as title
            if isbn is not None:
                queries.append('tit="' + authors[0] + '" AND num="' + isbn + '"')
            else:
                queries.append('tit="' + authors[0] + '"')

        # title but no author
        elif authors is None and title is not None:
            for i in title_v:
                if isbn is not None:
                    queries.append('tit="' + i + '" AND num="' + isbn + '"')
                else:
                    queries.append('tit="' + i + '"')

            # try the title as author
            if isbn is not None:
                queries.append('per="' + title + '" AND num="' + isbn + '"')
            else:
                queries.append('per="' + title + '"')

        # as a last resort use only the isbn
        if isbn is not None:
            queries.append('num=' + isbn)

    # Sort queries descending by length (assumption: longer query -> fewer but better results)
    # queries.sort(key=len)
    # queries.reverse()

    # remove duplicate queries
    uniqueQueries = []
    for i in queries:
        if i not in uniqueQueries:
            uniqueQueries.append(i)

    # Process queries
    results = None

    for query in uniqueQueries:
        query = query + ' NOT (mat=film OR mat=music OR mat=microfiches)'
        log.info(query)

        if self.cfg_dnb_token is None:
            results = self.getSearchResultsByScraping(log, query, timeout)
        else:
            results = self.getSearchResults(log, query, timeout)

        if results is None:
            continue

        log.info("Parsing records")

        ns = {'marc21': 'http://www.loc.gov/MARC21/slim'}
        for record in results:
            series = None
            series_index = None
            publisher = None
            pubdate = None
            languages = []
            title = None
            title_sort = None
            edition = None
            comments = None
            idn = None
            urn = None
            isbn = None
            ddc = []
            subjects_gnd = []
            subjects_non_gnd = []

            # Title: Field 245
            # if subfields a, n and p exist: series = a, series_index = n, title = p
            title_parts = []
            for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]/../marc21:subfield[@code='n' and string-length(text())>0]/../marc21:subfield[@code='p' and string-length(text())>0]/..", namespaces=ns):
                series_index = i.xpath(".//marc21:subfield[@code='n']", namespaces=ns)[0].text.strip()
                match = re.search(r"(\d+[,\.\d+]?)", series_index)
                if match:
                    series_index = match.group(1)
                else:
                    # looks like sometimes DNB does not know the series index
                    # and uses something like "[...]"
                    series_index = "0"
                series_index = series_index.replace(',', '.')
                series = i.xpath(".//marc21:subfield[@code='a']", namespaces=ns)[0].text.strip()
                title_parts.append(i.xpath(".//marc21:subfield[@code='p']", namespaces=ns)[0].text.strip())
                log.info("Extracted Series: %s" % series)
                log.info("Extracted Series Index: %s" % series_index)
                break

            # otherwise: title = a
            if len(title_parts) == 0:
                for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    title_parts.append(i.text.strip())
                    break

            # subtitle 1
            for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns):
                title_parts.append(i.text.strip())
                break

            # subtitle 2 (disabled)
            # for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='c' and string-length(text())>0]", namespaces=ns):
            #     title = title + " / " + i.text.strip()
            #     break

            title = " : ".join(title_parts)
            log.info("Extracted Title: %s" % title)

            # Title_Sort: move the DNB non-sorting part (wrapped in
            # chr(152)/chr(156) markers) to the end
            if title_parts:
                title_sort_parts = list(title_parts)
                title_sort_regex = re.match(
                    '^(.*?)(' + chr(152) + '.*' + chr(156) + ')?(.*?)$',
                    title_parts[0])
                sortword = title_sort_regex.group(2)
                if sortword:
                    title_sort_parts[0] = ''.join(filter(None, [
                        title_sort_regex.group(1).strip(),
                        title_sort_regex.group(3).strip(),
                        ", " + sortword]))
                title_sort = " : ".join(title_sort_parts)
            log.info("Extracted Title_Sort: %s" % title_sort)

            # Authors
            authors = []
            author_sort = None
            # primary authors
            for i in record.xpath(".//marc21:datafield[@tag='100']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                name = re.sub(r" \[.*\]$", "", i.text.strip())
                authors.append(name)
            # secondary authors
            for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                name = re.sub(r" \[.*\]$", "", i.text.strip())
                authors.append(name)
            if len(authors) == 0:
                # if no "real" author was found, take all persons involved
                for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    name = re.sub(r" \[.*\]$", "", i.text.strip())
                    authors.append(name)
            if len(authors) > 0:
                author_sort = authors[0]
            log.info("Extracted Authors: %s" % " & ".join(authors))

            # Comments
            for i in record.xpath(".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]", namespaces=ns):
                if i.text.startswith("http://deposit.dnb.de/"):
                    br = self.browser
                    log.info('Downloading Comments from: %s' % i.text)
                    try:
                        comments = br.open_novisit(i.text, timeout=30).read()
                        comments = sanitize_comments_html(comments)
                        log.info('Comments: %s' % comments)
                        break
                    except Exception:
                        log.info("Could not download Comments from %s" % i)

            # Publisher Name and Location
            publisher_name = None
            publisher_location = None
            fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns)
            if len(fields) > 0:
                publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns)[0].text.strip()
                publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns)[0].text.strip()
            else:
                fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../..", namespaces=ns)
                if len(fields) > 0:
                    publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns)[0].text.strip()
                else:
                    fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='a' and string-length(text())>0]/../..", namespaces=ns)
                    if len(fields) > 0:
                        publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns)[0].text.strip()
            log.info("Extracted Publisher: %s" % publisher_name)
            log.info("Extracted Publisher Location: %s" % publisher_location)

            # Publishing Date
            for i in record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='c' and string-length(text())>=4]", namespaces=ns):
                match = re.search(r"(\d{4})", i.text.strip())
                if match is not None:
                    year = match.group(1)
                    pubdate = datetime.datetime(int(year), 1, 2)
                    break
            log.info("Extracted Publication Year: %s" % pubdate)

            # ID: IDN
            for i in record.xpath(".//marc21:datafield[@tag='016']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                idn = i.text.strip()
                break
            log.info("Extracted ID IDN: %s" % idn)
            if "idn" in exact_search:
                if idn != exact_search["idn"]:
                    log.info("Extracted IDN does not match book's IDN, skipping record")
                    continue

            # ID: URN
            for i in record.xpath(".//marc21:datafield[@tag='024']/marc21:subfield[@code='2' and text()='urn']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                urn = i.text.strip()
                break
            log.info("Extracted ID URN: %s" % urn)

            # ID: ISBN
            for i in record.xpath(".//marc21:datafield[@tag='020']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                isbn_regex = r"(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]"
                match = re.search(isbn_regex, i.text.strip())
                if match is None:
                    continue  # guard against 020 fields that don't contain an ISBN
                isbn = match.group()
                isbn = isbn.replace('-', '')
                break
            log.info("Extracted ID ISBN: %s" % isbn)
            if "isbn" in exact_search:
                if isbn != exact_search["isbn"]:
                    log.info("Extracted ISBN does not match book's ISBN, skipping record")
                    continue

            # ID: Sachgruppe (DDC)
            for i in record.xpath(".//marc21:datafield[@tag='082']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                ddc.append(i.text.strip())
            log.info("Extracted ID DDC: %s" % ",".join(ddc))

            # Series and Series_Index from field 830
            if series is None and series_index is None:
                for i in record.xpath(".//marc21:datafield[@tag='830']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns):
                    # Series Index
                    series_index = i.xpath(".//marc21:subfield[@code='v']", namespaces=ns)[0].text.strip()
                    match = re.search(r"(\d+[,\.\d+]?)", series_index)
                    if match is not None:
                        series_index = match.group(1)
                    else:
                        series_index = "0"
                    series_index = series_index.replace(',', '.')
                    log.info("Extracted Series Index: %s" % series_index)
                    # Series
                    series = i.xpath(".//marc21:subfield[@code='a']", namespaces=ns)[0].text.strip()
                    log.info("Extracted Series: %s" % series)
                    break

            # Try to extract Series, Series Index and Title from the fetched title.
            # Caution: This overwrites DNB's series/series_index and modifies the title!
            if self.cfg_guess_series is True:
                guessed_series = None
                guessed_series_index = None
                parts = re.split("[:]", self.removeSortingCharacters(title))
                if len(parts) == 2:
                    if bool(re.search(r"\d", parts[0])) != bool(re.search(r"\d", parts[1])):
                        # figure out which part contains the index
                        if bool(re.search(r"\d", parts[0])):
                            indexpart = parts[0]
                            textpart = parts[1]
                        else:
                            indexpart = parts[1]
                            textpart = parts[0]

                        # remove odd characters from start and end of the text part
                        match = re.match(r"^[\s\-–:]*(.+?)[\s\-–:]*$", textpart)
                        if match:
                            textpart = match.group(1)

                        # from title parts like: "Name of the series - Episode 2"
                        match = re.match(r"^\s*(\S.*?)[\(\/\.,\s\-–:]*(?:Nr\.|Episode|Bd\.|Sammelband|[Bb]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", indexpart)
                        if match:
                            guessed_series_index = match.group(2)
                            guessed_series = match.group(1)
                            if guessed_series is None:
                                guessed_series = textpart
                                title = textpart + " : Band " + guessed_series_index
                            else:
                                title = textpart
                        else:
                            # from title parts like: "Episode 2 Name of the series"
                            match = re.match(r"^\s*(?:Nr\.|Episode|Bd\.|Sammelband|[Bb]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*(\S.*?)[\/\.,\-–\s]*$", indexpart)
                            if match:
                                guessed_series_index = match.group(1)
                                guessed_series = match.group(2)
                                if guessed_series is None:
                                    guessed_series = textpart
                                    title = textpart + " : Band " + guessed_series_index
                                else:
                                    title = textpart
                elif len(parts) == 1:
                    # from titles like: "Name of the series - Title (Episode 2)"
                    match = re.match(r"^\s*(\S.+?) \- (\S.+?) [\(\/\.,\s\-–:](?:Nr\.|Episode|Bd\.|Sammelband|[Bb]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", parts[0])
                    if match:
                        guessed_series_index = match.group(3)
                        guessed_series = match.group(1)
                        title = match.group(2)
                    else:
                        # from titles like: "Name of the series - Episode 2"
                        match = re.match(r"^\s*(\S.+?)[\(\/\.,\s\-–:]*(?:Nr\.|Episode|Bd\.|Sammelband|[Bb]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", parts[0])
                        if match:
                            guessed_series_index = match.group(2)
                            guessed_series = match.group(1)
                            title = guessed_series + " : Band " + guessed_series_index

                if guessed_series is not None and guessed_series_index is not None:
                    series = guessed_series
                    series_index = guessed_series_index
                    log.info("Guessed Series: %s" % series)
                    log.info("Guessed Series Index: %s" % series_index)

            # GND Subjects from field 689
            for i in record.xpath(".//marc21:datafield[@tag='689']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                subjects_gnd.append(i.text.strip())
            # GND Subjects from fields 600-655
            for f in range(600, 656):
                for i in record.xpath(".//marc21:datafield[@tag='" + str(f) + "']/marc21:subfield[@code='2' and text()='gnd']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    if i.text.startswith("("):
                        continue
                    subjects_gnd.append(i.text)
            log.info("Extracted GND Subjects: %s" % " ".join(subjects_gnd))

            # Non-GND subjects from fields 600-655
            for f in range(600, 656):
                for i in record.xpath(".//marc21:datafield[@tag='" + str(f) + "']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    # ignore entries starting with "(":
                    if i.text.startswith("("):
                        continue
                    subjects_non_gnd.extend(re.split(',|;', i.text))
            # remove one-character subjects (filter into a new list; removing
            # from a list while iterating over it skips elements)
            subjects_non_gnd = [s for s in subjects_non_gnd if len(s) >= 2]
            log.info("Extracted non-GND Subjects: %s" % " ".join(subjects_non_gnd))

            # Edition
            for i in record.xpath(".//marc21:datafield[@tag='250']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                edition = i.text.strip()
                break
            log.info("Extracted Edition: %s" % edition)

            # Languages
            for i in record.xpath(".//marc21:datafield[@tag='041']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                languages.append(i.text.strip())
            if languages:
                log.info("Extracted Languages: %s" % ",".join(languages))

            # Put it all together
            if self.cfg_append_edition_to_title is True and edition is not None:
                title = title + " : " + edition

            mi = Metadata(self.removeSortingCharacters(title),
                          [self.removeSortingCharacters(i) for i in authors])
            mi.title_sort = self.removeSortingCharacters(title_sort)
            mi.author_sort = self.removeSortingCharacters(author_sort)
            mi.languages = languages
            mi.pubdate = pubdate
            mi.publisher = " : ".join(filter(None, [
                publisher_location,
                self.removeSortingCharacters(publisher_name)]))
            mi.series = self.removeSortingCharacters(series)
            mi.series_index = series_index
            mi.comments = comments
            mi.isbn = isbn  # also required for cover download
            mi.set_identifier('urn', urn)
            mi.set_identifier('dnb-idn', idn)
            mi.set_identifier('ddc', ",".join(ddc))

            # cfg_fetch_subjects: 0 = GND only, 1 = GND preferred, 2 = both,
            # 3 = non-GND preferred, 4 = non-GND only, 5 = none
            if self.cfg_fetch_subjects == 0:
                mi.tags = self.uniq(subjects_gnd)
            elif self.cfg_fetch_subjects == 1:
                if len(subjects_gnd) > 0:
                    mi.tags = self.uniq(subjects_gnd)
                else:
                    mi.tags = self.uniq(subjects_non_gnd)
            elif self.cfg_fetch_subjects == 2:
                mi.tags = self.uniq(subjects_gnd + subjects_non_gnd)
            elif self.cfg_fetch_subjects == 3:
                if len(subjects_non_gnd) > 0:
                    mi.tags = self.uniq(subjects_non_gnd)
                else:
                    mi.tags = self.uniq(subjects_gnd)
            elif self.cfg_fetch_subjects == 4:
                mi.tags = self.uniq(subjects_non_gnd)
            elif self.cfg_fetch_subjects == 5:
                mi.tags = []

            # put the current result's metadata into the result queue
            log.info("Final formatted result: %s" % mi)
            result_queue.put(mi)
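
# removeSortingCharacters() and uniq() are referenced throughout identify()
# but are not part of this excerpt. Minimal sketches, assuming that DNB wraps
# non-sorting title prefixes (articles like "Der", "Die", "Das") in the C1
# control characters U+0098/U+009C -- the same chr(152)/chr(156) markers the
# Title_Sort regex above keys on; the real helpers may do more:
def removeSortingCharacters(self, text):
    # strip the DNB non-sorting markers, e.g.
    # chr(152) + 'Der' + chr(156) + ' Steppenwolf' -> 'Der Steppenwolf'
    if text is None:
        return None
    return text.replace(chr(152), '').replace(chr(156), '')

def uniq(self, items):
    # order-preserving de-duplication (assumed behaviour)
    seen = []
    for i in items:
        if i not in seen:
            seen.append(i)
    return seen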