def read_metadata(root):
    ans = Metadata(_('Unknown'), [_('Unknown')])
    prefixes, refines = read_prefixes(root), read_refines(root)
    identifiers = read_identifiers(root, prefixes, refines)
    ids = {}
    for key, vals in identifiers.iteritems():
        if key == 'calibre':
            ans.application_id = vals[0]
        elif key != 'uuid':
            ids[key] = vals[0]
    ans.set_identifiers(ids)
    ans.title = read_title(root, prefixes, refines) or ans.title
    ans.title_sort = read_title_sort(root, prefixes, refines) or ans.title_sort
    ans.languages = read_languages(root, prefixes, refines) or ans.languages
    auts, aus = [], []
    for a in read_authors(root, prefixes, refines):
        auts.append(a.name), aus.append(a.sort)
    ans.authors = auts or ans.authors
    ans.author_sort = authors_to_string(aus) or ans.author_sort
    bkp = read_book_producers(root, prefixes, refines)
    if bkp:
        ans.book_producer = bkp[0]
    pd = read_pubdate(root, prefixes, refines)
    if not is_date_undefined(pd):
        ans.pubdate = pd
    ts = read_timestamp(root, prefixes, refines)
    if not is_date_undefined(ts):
        ans.timestamp = ts
    lm = read_last_modified(root, prefixes, refines)
    if not is_date_undefined(lm):
        ans.last_modified = lm
    return ans
def read_metadata(root, ver=None, return_extra_data=False):
    ans = Metadata(_('Unknown'), [_('Unknown')])
    prefixes, refines = read_prefixes(root), read_refines(root)
    identifiers = read_identifiers(root, prefixes, refines)
    ids = {}
    for key, vals in iteritems(identifiers):
        if key == 'calibre':
            ans.application_id = vals[0]
        elif key == 'uuid':
            ans.uuid = vals[0]
        else:
            ids[key] = vals[0]
    ans.set_identifiers(ids)
    ans.title = read_title(root, prefixes, refines) or ans.title
    ans.title_sort = read_title_sort(root, prefixes, refines) or ans.title_sort
    ans.languages = read_languages(root, prefixes, refines) or ans.languages
    auts, aus = [], []
    for a in read_authors(root, prefixes, refines):
        auts.append(a.name), aus.append(a.sort)
    ans.authors = auts or ans.authors
    ans.author_sort = authors_to_string(aus) or ans.author_sort
    bkp = read_book_producers(root, prefixes, refines)
    if bkp and bkp[0]:
        ans.book_producer = bkp[0]
    pd = read_pubdate(root, prefixes, refines)
    if not is_date_undefined(pd):
        ans.pubdate = pd
    ts = read_timestamp(root, prefixes, refines)
    if not is_date_undefined(ts):
        ans.timestamp = ts
    lm = read_last_modified(root, prefixes, refines)
    if not is_date_undefined(lm):
        ans.last_modified = lm
    ans.comments = read_comments(root, prefixes, refines) or ans.comments
    ans.publisher = read_publisher(root, prefixes, refines) or ans.publisher
    ans.tags = read_tags(root, prefixes, refines) or ans.tags
    ans.rating = read_rating(root, prefixes, refines) or ans.rating
    s, si = read_series(root, prefixes, refines)
    if s:
        ans.series, ans.series_index = s, si
    ans.author_link_map = read_author_link_map(root, prefixes, refines) or ans.author_link_map
    ans.user_categories = read_user_categories(root, prefixes, refines) or ans.user_categories
    for name, fm in iteritems(read_user_metadata(root, prefixes, refines) or {}):
        ans.set_user_metadata(name, fm)
    if return_extra_data:
        ans = ans, ver, read_raster_cover(root, prefixes, refines), first_spine_item(root, prefixes, refines)
    return ans
def read_metadata(root):
    ans = Metadata(_('Unknown'), [_('Unknown')])
    prefixes, refines = read_prefixes(root), read_refines(root)
    identifiers = read_identifiers(root, prefixes, refines)
    ids = {}
    for key, vals in identifiers.iteritems():
        if key == 'calibre':
            ans.application_id = vals[0]
        elif key != 'uuid':
            ids[key] = vals[0]
    ans.set_identifiers(ids)
    ans.title = read_title(root, prefixes, refines) or ans.title
    ans.title_sort = read_title_sort(root, prefixes, refines) or ans.title_sort
    ans.languages = read_languages(root, prefixes, refines) or ans.languages
    return ans
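# --- Illustrative usage sketch (not part of the original sources) -------------
# The read_metadata() variants above operate on the parsed root of an OPF
# <package> element. The import path and file name below are assumptions made
# for illustration; adjust them to wherever these helpers actually live.
from lxml import etree

from calibre.ebooks.metadata.opf3 import read_metadata  # assumed import path

with open('metadata.opf', 'rb') as f:        # hypothetical OPF file
    root = etree.fromstring(f.read())        # the <package> element

mi = read_metadata(root)
print(mi.title, mi.authors)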
def _get_metadata(self, book_id, get_user_categories=True):  # {{{
    mi = Metadata(None, template_cache=self.formatter_template_cache)
    author_ids = self._field_ids_for('authors', book_id)
    aut_list = [self._author_data(i) for i in author_ids]
    aum = []
    aus = {}
    aul = {}
    for rec in aut_list:
        aut = rec['name']
        aum.append(aut)
        aus[aut] = rec['sort']
        aul[aut] = rec['link']
    mi.title = self._field_for('title', book_id, default_value=_('Unknown'))
    mi.authors = aum
    mi.author_sort = self._field_for('author_sort', book_id, default_value=_('Unknown'))
    mi.author_sort_map = aus
    mi.author_link_map = aul
    mi.comments = self._field_for('comments', book_id)
    mi.publisher = self._field_for('publisher', book_id)
    n = nowf()
    mi.timestamp = self._field_for('timestamp', book_id, default_value=n)
    mi.pubdate = self._field_for('pubdate', book_id, default_value=n)
    mi.uuid = self._field_for('uuid', book_id, default_value='dummy')
    mi.title_sort = self._field_for('sort', book_id, default_value=_('Unknown'))
    mi.book_size = self._field_for('size', book_id, default_value=0)
    mi.ondevice_col = self._field_for('ondevice', book_id, default_value='')
    mi.last_modified = self._field_for('last_modified', book_id, default_value=n)
    formats = self._field_for('formats', book_id)
    mi.format_metadata = {}
    mi.languages = list(self._field_for('languages', book_id))
    if not formats:
        good_formats = None
    else:
        mi.format_metadata = FormatMetadata(self, book_id, formats)
        good_formats = FormatsList(formats, mi.format_metadata)
    mi.formats = good_formats
    mi.has_cover = _('Yes') if self._field_for('cover', book_id, default_value=False) else ''
    mi.tags = list(self._field_for('tags', book_id, default_value=()))
    mi.series = self._field_for('series', book_id)
    if mi.series:
        mi.series_index = self._field_for('series_index', book_id, default_value=1.0)
    mi.rating = self._field_for('rating', book_id)
    mi.set_identifiers(self._field_for('identifiers', book_id, default_value={}))
    mi.application_id = book_id
    mi.id = book_id

    composites = []
    for key, meta in self.field_metadata.custom_iteritems():
        mi.set_user_metadata(key, meta)
        if meta['datatype'] == 'composite':
            composites.append(key)
        else:
            val = self._field_for(key, book_id)
            if isinstance(val, tuple):
                val = list(val)
            extra = self._field_for(key + '_index', book_id)
            mi.set(key, val=val, extra=extra)
    for key in composites:
        mi.set(key, val=self._composite_for(key, book_id, mi))

    user_cat_vals = {}
    if get_user_categories:
        user_cats = self.backend.prefs['user_categories']
        for ucat in user_cats:
            res = []
            for name, cat, ign in user_cats[ucat]:
                v = mi.get(cat, None)
                if isinstance(v, list):
                    if name in v:
                        res.append([name, cat])
                elif name == v:
                    res.append([name, cat])
            user_cat_vals[ucat] = res
    mi.user_categories = user_cat_vals
    return mi
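# --- Illustrative sketch (not part of the original sources) ------------------
# The tail of _get_metadata() maps each (item name, source column) pair stored
# in the 'user_categories' preference onto the values already present on the
# Metadata object. The same logic, reduced to a standalone function with
# made-up sample data:
def match_user_categories(field_values, user_cats):
    user_cat_vals = {}
    for ucat, entries in user_cats.items():
        res = []
        for name, cat, *_ignored in entries:
            v = field_values.get(cat)
            if isinstance(v, list):
                if name in v:
                    res.append([name, cat])
            elif name == v:
                res.append([name, cat])
        user_cat_vals[ucat] = res
    return user_cat_vals

print(match_user_categories(
    {'tags': ['Fantasy', 'Historical'], 'publisher': 'Acme'},
    {'My shelf': [('Fantasy', 'tags', 0), ('Acme', 'publisher', 0)]},
))
# -> {'My shelf': [['Fantasy', 'tags'], ['Acme', 'publisher']]}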
def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30):
    self.load_config()

    # get identifying tags from book
    idn = identifiers.get('dnb-idn', None)
    isbn = check_isbn(identifiers.get('isbn', None))

    # ignore unknown authors
    if authors in (["V. A."], ["V.A."], ["Unknown"], ["Unbekannt"]):
        authors = None

    if (isbn is None) and (idn is None) and (title is None) and (authors is None):
        log.info("This plugin requires at least either ISBN, IDN, Title or Author(s).")
        return None

    queries = []
    # DNB does not do an exact search when searching for an idn or isbn, so we have to filter the results
    exact_search = {}

    if idn is not None:
        queries.append('num=' + idn)
        exact_search['idn'] = idn
    else:
        authors_v = []
        title_v = []
        if authors is not None:
            authors_v.append(' '.join(authors))
            authors_v.append(' '.join(self.get_author_tokens(authors, only_first_author=False)))
            authors_v.append(' '.join(self.get_author_tokens(authors, only_first_author=True)))
        if title is not None:
            title_v.append(title)
            title_v.append(' '.join(self.get_title_tokens(title, strip_joiners=False, strip_subtitle=False)))
            title_v.append(' '.join(self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True)))
        if isbn is not None:
            exact_search['isbn'] = isbn

        # title and author
        if authors is not None and title is not None:
            for a in authors_v:
                for t in title_v:
                    if isbn is not None:
                        queries.append('tit="' + t + '" AND per="' + a + '" AND num="' + isbn + '"')
                    else:
                        queries.append('tit="' + t + '" AND per="' + a + '"')
            # try with author and title swapped
            if isbn is not None:
                queries.append('per="' + title + '" AND tit="' + authors[0] + '" AND num="' + isbn + '"')
            else:
                queries.append('per="' + title + '" AND tit="' + authors[0] + '"')

        # author but no title
        elif authors is not None and title is None:
            for i in authors_v:
                if isbn is not None:
                    queries.append('per="' + i + '" AND num="' + isbn + '"')
                else:
                    queries.append('per="' + i + '"')
            # try with author as title
            if isbn is not None:
                queries.append('tit="' + authors[0] + '" AND num="' + isbn + '"')
            else:
                queries.append('tit="' + authors[0] + '"')

        # title but no author
        elif authors is None and title is not None:
            for i in title_v:
                if isbn is not None:
                    queries.append('tit="' + i + '" AND num="' + isbn + '"')
                else:
                    queries.append('tit="' + i + '"')
            # try with title as author
            if isbn is not None:
                queries.append('per="' + title + '" AND num="' + isbn + '"')
            else:
                queries.append('per="' + title + '"')

        # as last resort only use isbn
        if isbn is not None:
            queries.append('num=' + isbn)

    # Sort queries descending by length (assumption: longer query -> less but better results)
    #queries.sort(key=len)
    #queries.reverse()

    # remove duplicate queries
    uniqueQueries = []
    for i in queries:
        if i not in uniqueQueries:
            uniqueQueries.append(i)

    # Process queries
    results = None
    for query in uniqueQueries:
        query = query + ' NOT (mat=film OR mat=music OR mat=microfiches)'
        log.info(query)

        if self.cfg_dnb_token is None:
            results = self.getSearchResultsByScraping(log, query, timeout)
        else:
            results = self.getSearchResults(log, query, timeout)

        if results is None:
            continue

        log.info("Parsing records")

        ns = {'marc21': 'http://www.loc.gov/MARC21/slim'}
        for record in results:
            series = None
            series_index = None
            publisher = None
            pubdate = None
            languages = []
            title = None
            title_sort = None
            edition = None
            comments = None
            idn = None
            urn = None
            isbn = None
            ddc = []
            subjects_gnd = []
            subjects_non_gnd = []

            # Title: Field 245
            title_parts = []
            # if a,n,p exist: series = a, series_index = n, title = p
            for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]/../marc21:subfield[@code='n' and string-length(text())>0]/../marc21:subfield[@code='p' and string-length(text())>0]/..", namespaces=ns):
                series_index = i.xpath(".//marc21:subfield[@code='n']", namespaces=ns)[0].text.strip()
                match = re.search("(\d+[,\.\d+]?)", series_index)
                if match:
                    series_index = match.group(1)
                else:
                    series_index = "0"  # looks like sometimes DNB does not know the series index and uses something like "[...]"
                series_index = series_index.replace(',', '.')
                series = i.xpath(".//marc21:subfield[@code='a']", namespaces=ns)[0].text.strip()
                title_parts.append(i.xpath(".//marc21:subfield[@code='p']", namespaces=ns)[0].text.strip())
                log.info("Extracted Series: %s" % series)
                log.info("Extracted Series Index: %s" % series_index)
                break
            # otherwise: title = a
            if len(title_parts) == 0:
                for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    title_parts.append(i.text.strip())
                    break

            # subtitle 1
            for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns):
                title_parts.append(i.text.strip())
                break

            # subtitle 2
            #for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='c' and string-length(text())>0]", namespaces=ns):
            #    title = title + " / " + i.text.strip()
            #    break

            title = " : ".join(title_parts)
            log.info("Extracted Title: %s" % title)

            # Title_Sort
            title_sort_parts = list(title_parts)
            title_sort_regex = re.match('^(.*?)(' + chr(152) + '.*' + chr(156) + ')?(.*?)$', title_parts[0])
            sortword = title_sort_regex.group(2)
            if sortword:
                title_sort_parts[0] = ''.join(filter(None, [title_sort_regex.group(1).strip(), title_sort_regex.group(3).strip(), ", " + sortword]))
            title_sort = " : ".join(title_sort_parts)
            log.info("Extracted Title_Sort: %s" % title_sort)

            # Authors
            authors = []
            author_sort = None
            for i in record.xpath(".//marc21:datafield[@tag='100']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):  # primary authors
                name = re.sub(" \[.*\]$", "", i.text.strip())
                authors.append(name)
            for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):  # secondary authors
                name = re.sub(" \[.*\]$", "", i.text.strip())
                authors.append(name)
            if len(authors) == 0:  # if no "real" author was found take all persons involved
                for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    name = re.sub(" \[.*\]$", "", i.text.strip())
                    authors.append(name)
            if len(authors) > 0:
                author_sort = authors[0]
            log.info("Extracted Authors: %s" % " & ".join(authors))

            # Comments
            for i in record.xpath(".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]", namespaces=ns):
                if i.text.startswith("http://deposit.dnb.de/"):
                    br = self.browser
                    log.info('Downloading Comments from: %s' % i.text)
                    try:
                        comments = br.open_novisit(i.text, timeout=30).read()
                        comments = sanitize_comments_html(comments)
                        log.info('Comments: %s' % comments)
                        break
                    except:
                        log.info("Could not download Comments from %s" % i)

            # Publisher Name and Location
            publisher_name = None
            publisher_location = None
            fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns)
            if len(fields) > 0:
                publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns)[0].text.strip()
                publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns)[0].text.strip()
            else:
                fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../..", namespaces=ns)
                if len(fields) > 0:
                    publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]", namespaces=ns)[0].text.strip()
                else:
                    fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='a' and string-length(text())>0]/../..", namespaces=ns)
                    if len(fields) > 0:
                        publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns)[0].text.strip()
            log.info("Extracted Publisher: %s" % publisher_name)
            log.info("Extracted Publisher Location: %s" % publisher_location)

            # Publishing Date
            for i in record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='c' and string-length(text())>=4]", namespaces=ns):
                match = re.search("(\d{4})", i.text.strip())
                if match is not None:
                    year = match.group(1)
                    pubdate = datetime.datetime(int(year), 1, 2)
                    break
            log.info("Extracted Publication Year: %s" % pubdate)

            # ID: IDN
            for i in record.xpath(".//marc21:datafield[@tag='016']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                idn = i.text.strip()
                break
            log.info("Extracted ID IDN: %s" % idn)
            if "idn" in exact_search:
                if idn != exact_search["idn"]:
                    log.info("Extracted IDN does not match book's IDN, skipping record")
                    continue

            # ID: URN
            for i in record.xpath(".//marc21:datafield[@tag='024']/marc21:subfield[@code='2' and text()='urn']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                urn = i.text.strip()
                break
            log.info("Extracted ID URN: %s" % urn)

            # ID: ISBN
            for i in record.xpath(".//marc21:datafield[@tag='020']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                isbn_regex = "(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]"
                match = re.search(isbn_regex, i.text.strip())
                isbn = match.group()
                isbn = isbn.replace('-', '')
                break
            log.info("Extracted ID ISBN: %s" % isbn)
            if "isbn" in exact_search:
                if isbn != exact_search["isbn"]:
                    log.info("Extracted ISBN does not match book's ISBN, skipping record")
                    continue

            # ID: Sachgruppe (DDC)
            for i in record.xpath(".//marc21:datafield[@tag='082']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                ddc.append(i.text.strip())
            log.info("Extracted ID DDC: %s" % ",".join(ddc))

            # Series and Series_Index
            if series is None and series_index is None:
                for i in record.xpath(".//marc21:datafield[@tag='830']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..", namespaces=ns):
                    # Series Index
                    series_index = i.xpath(".//marc21:subfield[@code='v']", namespaces=ns)[0].text.strip()
                    match = re.search("(\d+[,\.\d+]?)", series_index)
                    if match is not None:
                        series_index = match.group(1)
                    else:
                        series_index = "0"
                    series_index = series_index.replace(',', '.')
                    log.info("Extracted Series Index: %s" % series_index)
                    # Series
                    series = i.xpath(".//marc21:subfield[@code='a']", namespaces=ns)[0].text.strip()
                    log.info("Extracted Series: %s" % series)
                    break

            # Try to extract Series, Series Index and Title from the fetched title.
            # Caution: This overwrites DNB's series/series_index and modifies the title!
            if self.cfg_guess_series is True:
                guessed_series = None
                guessed_series_index = None
                parts = re.split("[:]", self.removeSortingCharacters(title))
                if len(parts) == 2:
                    # only one of the two parts may contain digits
                    if bool(re.search("\d", parts[0])) != bool(re.search("\d", parts[1])):
                        # figure out which part contains the index
                        if bool(re.search("\d", parts[0])):
                            indexpart = parts[0]
                            textpart = parts[1]
                        else:
                            indexpart = parts[1]
                            textpart = parts[0]
                        # remove odd characters from start and end of the text part
                        match = re.match("^[\s\-–:]*(.+?)[\s\-–:]*$", textpart)
                        if match:
                            textpart = match.group(1)
                        # from Titleparts like: "Name of the series - Episode 2"
                        match = re.match("^\s*(\S.*?)[\(\/\.,\s\-–:]*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", indexpart)
                        if match:
                            guessed_series_index = match.group(2)
                            guessed_series = match.group(1)
                            if guessed_series is None:
                                guessed_series = textpart
                                title = textpart + " : Band " + guessed_series_index
                            else:
                                title = textpart
                        else:
                            # from Titleparts like: "Episode 2 Name of the series"
                            match = re.match("^\s*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*(\S.*?)[\/\.,\-–\s]*$", indexpart)
                            if match:
                                guessed_series_index = match.group(1)
                                guessed_series = match.group(2)
                                if guessed_series is None:
                                    guessed_series = textpart
                                    title = textpart + " : Band " + guessed_series_index
                                else:
                                    title = textpart
                elif len(parts) == 1:
                    # from Titles like: "Name of the series - Title (Episode 2)"
                    match = re.match("^\s*(\S.+?) \- (\S.+?) [\(\/\.,\s\-–:](?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", parts[0])
                    if match:
                        guessed_series_index = match.group(3)
                        guessed_series = match.group(1)
                        title = match.group(2)
                    else:
                        # from Titles like: "Name of the series - Episode 2"
                        match = re.match("^\s*(\S.+?)[\(\/\.,\s\-–:]*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$", parts[0])
                        if match:
                            guessed_series_index = match.group(2)
                            guessed_series = match.group(1)
                            title = guessed_series + " : Band " + guessed_series_index
                if guessed_series is not None and guessed_series_index is not None:
                    series = guessed_series
                    series_index = guessed_series_index
                    log.info("Guessed Series: %s" % series)
                    log.info("Guessed Series Index: %s" % series_index)

            # GND Subjects from 689
            for i in record.xpath(".//marc21:datafield[@tag='689']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                subjects_gnd.append(i.text.strip())
            # GND Subjects from 600-655
            for f in range(600, 656):
                for i in record.xpath(".//marc21:datafield[@tag='" + str(f) + "']/marc21:subfield[@code='2' and text()='gnd']/../marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    if i.text.startswith("("):
                        continue
                    subjects_gnd.append(i.text)
            log.info("Extracted GND Subjects: %s" % " ".join(subjects_gnd))

            # Non-GND subjects from 600-655
            for f in range(600, 656):
                for i in record.xpath(".//marc21:datafield[@tag='" + str(f) + "']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                    # ignore entries starting with "(":
                    if i.text.startswith("("):
                        continue
                    subjects_non_gnd.extend(re.split(',|;', i.text))
            # remove one-character subjects:
            for i in subjects_non_gnd:
                if len(i) < 2:
                    subjects_non_gnd.remove(i)
            log.info("Extracted non-GND Subjects: %s" % " ".join(subjects_non_gnd))

            # Edition
            for i in record.xpath(".//marc21:datafield[@tag='250']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                edition = i.text.strip()
                break
            log.info("Extracted Edition: %s" % edition)

            # Languages
            for i in record.xpath(".//marc21:datafield[@tag='041']/marc21:subfield[@code='a' and string-length(text())>0]", namespaces=ns):
                languages.append(i.text.strip())
            if languages is not None:
                log.info("Extracted Languages: %s" % ",".join(languages))

            # Put it all together
            if self.cfg_append_edition_to_title == True and edition is not None:
                title = title + " : " + edition

            mi = Metadata(self.removeSortingCharacters(title), map(lambda i: self.removeSortingCharacters(i), authors))
            mi.title_sort = self.removeSortingCharacters(title_sort)
            mi.author_sort = self.removeSortingCharacters(author_sort)
            mi.languages = languages
            mi.pubdate = pubdate
            mi.publisher = " : ".join(filter(None, [publisher_location, self.removeSortingCharacters(publisher_name)]))
            mi.series = self.removeSortingCharacters(series)
            mi.series_index = series_index
            mi.comments = comments
            mi.isbn = isbn  # also required for cover download
            mi.set_identifier('urn', urn)
            mi.set_identifier('dnb-idn', idn)
            mi.set_identifier('ddc', ",".join(ddc))

            if self.cfg_fetch_subjects == 0:
                mi.tags = self.uniq(subjects_gnd)
            elif self.cfg_fetch_subjects == 1:
                if len(subjects_gnd) > 0:
                    mi.tags = self.uniq(subjects_gnd)
                else:
                    mi.tags = self.uniq(subjects_non_gnd)
            elif self.cfg_fetch_subjects == 2:
                mi.tags = self.uniq(subjects_gnd + subjects_non_gnd)
            elif self.cfg_fetch_subjects == 3:
                if len(subjects_non_gnd) > 0:
                    mi.tags = self.uniq(subjects_non_gnd)
                else:
                    mi.tags = self.uniq(subjects_gnd)
            elif self.cfg_fetch_subjects == 4:
                mi.tags = self.uniq(subjects_non_gnd)
            elif self.cfg_fetch_subjects == 5:
                mi.tags = []

            # put current result's metadata into result queue
            log.info("Final formatted result: %s" % mi)
            result_queue.put(mi)
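# --- Illustrative sketch (not part of the original sources) ------------------
# Every extraction step in identify() follows the same pattern: an XPath query
# against a MARC21/slim record, restricted to a datafield tag and subfield
# code, with the marc21 namespace mapped explicitly. A self-contained example
# with a made-up record:
from lxml import etree

ns = {'marc21': 'http://www.loc.gov/MARC21/slim'}
sample_record = etree.fromstring(
    b'<record xmlns="http://www.loc.gov/MARC21/slim">'
    b'<datafield tag="245"><subfield code="a">Die unendliche Geschichte</subfield>'
    b'<subfield code="b">Roman</subfield></datafield>'
    b'<datafield tag="016"><subfield code="a">123456789</subfield></datafield>'
    b'</record>')

sample_title = sample_record.xpath(
    ".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]",
    namespaces=ns)[0].text.strip()
sample_idn = sample_record.xpath(
    ".//marc21:datafield[@tag='016']/marc21:subfield[@code='a']",
    namespaces=ns)[0].text.strip()
print(sample_title, sample_idn)   # Die unendliche Geschichte 123456789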
def identify(self, log, result_queue, abort, title=None, authors=[], identifiers={}, timeout=30): self.load_config() if authors is None: authors=[] # get identifying tags from book idn = identifiers.get('dnb-idn', None) isbn = check_isbn(identifiers.get('isbn', None)) # ignore unknown authors ignored_authors = [ "V. A.", "V.A.", "Unknown", "Unbekannt" ] for i in ignored_authors: authors = [ x for x in authors if x != i ] if (isbn is None) and (idn is None) and (title is None) and (authors is None): log.info("This plugin requires at least either ISBN, IDN, Title or Author(s).") return None queries=[] # DNB does not do an exact search when searching for a idn or isbn, so we have to filter the results exact_search = {} if idn is not None: exact_search['idn'] = idn # in case look for a IDN only search for the IDN and skip all the other stuff queries.append('num='+idn) else: authors_v = [] title_v = [] # create some variants of given authors if authors != []: authors_v.append(' '.join(self.get_author_tokens(authors,only_first_author=False))) # concat all author names ("Peter Meier Luise Stark") authors_v.append(' '.join(self.get_author_tokens(authors,only_first_author=True))) # use only first author for a in authors: authors_v.append(a) # use all authors, one by one # remove duplicates unique_authors_v = [] for i in authors_v: if i not in unique_authors_v: unique_authors_v.append(i) # create some variants of given title if title is not None: title_v.append(title) # simply use given title title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=False,strip_subtitle=False))) # remove some punctation characters title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=False,strip_subtitle=True))) # remove subtitle (everything after " : ") title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=False))) # remove some punctation characters and joiners ("and", "&", ...) title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True))) # remove subtitle (everything after " : ") and joiners ("and", "&", ...) 
# TODO: remove subtitle after " - " # remove duplicates unique_title_v = [] for i in title_v: if i not in unique_title_v: unique_title_v.append(i) # title and author if authors_v != [] and title_v != []: for a in authors_v: for t in title_v: if isbn is not None: queries.append('tit="' + t + '" AND per="' + a + '" AND num="' + isbn + '"') else: queries.append('tit="' + t + '" AND per="' + a + '"') # try with first author as title and title (without subtitle) as author if isbn is not None: queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '" AND num="'+isbn+'"') else: queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '"') # try with author and title (without subtitle) in any index if isbn is not None: queries.append('"' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND "' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '" AND num="'+isbn+'"') else: queries.append('"' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND "' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '"') # author but no title elif authors_v != [] and title_v == []: for i in authors_v: if isbn is not None: queries.append('per="'+ i +'" AND num="' + isbn + '"') else: queries.append('per="'+ i +'"') # try with author as title if isbn is not None: queries.append('tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '" AND num="' + isbn + '"') else: queries.append('tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '"') # title but no author elif authors_v == [] and title_v != []: for i in title_v: if isbn is not None: queries.append('tit="' + i + '" AND num="' + isbn + '"') else: queries.append('tit="' + i + '"') # try with title as author if isbn is not None: queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND num="' + isbn + '"') else: queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '"') # as last resort only use isbn if isbn is not None: queries.append('num=' + isbn) # remove duplicate queries uniqueQueries=[] for i in queries: if i not in uniqueQueries: uniqueQueries.append(i) # Process queries results = None for query in uniqueQueries: # SRU does not work with "+" or "?" 
characters in query, so we simply remove them query = re.sub('[\+\?]','',query) query = query + ' NOT (mat=film OR mat=music OR mat=microfiches OR cod=tt)' log.info(query) if self.cfg_dnb_token is None: results = self.getSearchResultsByScraping(log, query, timeout) else: results = self.getSearchResults(log, query, timeout) if results is None: continue log.info("Parsing records") ns = { 'marc21' : 'http://www.loc.gov/MARC21/slim' } for record in results: series = None series_index = None publisher = None pubdate = None languages = [] title = None title_sort = None authors = [] author_sort = None edition = None comments = None idn = None urn = None isbn = None ddc = [] subjects_gnd = [] subjects_non_gnd = [] publisher_name = None publisher_location = None ##### Field 264 ##### # Publisher Name and Location fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns) if len(fields)>0: publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]",namespaces=ns)[0].text.strip(); publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns)[0].text.strip(); else: fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../..",namespaces=ns) if len(fields)>0: publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]",namespaces=ns)[0].text.strip(); else: fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='a' and string-length(text())>0]/../..",namespaces=ns) if len(fields)>0: publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns)[0].text.strip(); # Publishing Date for i in record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='c' and string-length(text())>=4]",namespaces=ns): match = re.search("(\d{4})", i.text.strip()) if match is not None: year = match.group(1) pubdate = datetime.datetime(int(year), 1, 1, 12 , 30, 0) break # Log if publisher_name is not None: log.info("Extracted Publisher: %s" % publisher_name) if publisher_location is not None: log.info("Extracted Publisher Location: %s" % publisher_location) if pubdate is not None: log.info("Extracted Publication Year: %s" % pubdate) ##### Field 245 #### # Title/Series/Series_Index title_parts = [] for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns): # if a,n,p,n,p,n,p exist: series = a + n0 + " - " + p0 + n1 + " - " + p1, series_index = n2, title = p2 # if a,n,p,n,p exist: series = a + n0 + " - " + p0, series_index = n1, title = p1 (Example: dnb-id 1008774839) # if a,n,p exist: series = a, series_index = n, title = p # if a exist: title = a # TODO: a,n,p,n (i.e. 
956375146) code_p = [] code_n = [] code_a = [] for j in i.xpath(".//marc21:subfield[@code='p']",namespaces=ns): code_p.append(j.text.strip()) for j in i.xpath(".//marc21:subfield[@code='n']",namespaces=ns): match = re.search("(\d+[,\.\d+]?)", j.text.strip()) if match: code_n.append(match.group(1)) else: code_n.append("0") # looks like sometimes DNB does not know the series index and uses something like "[...]" for j in i.xpath(".//marc21:subfield[@code='a']",namespaces=ns): code_a.append(j.text.strip()) if len(code_p) == 0: title_parts = title_parts + code_a elif len(code_p)>0 and len(code_p) == len(code_n): series = " : ".join(code_a) # I've never seen more than one code_a, but who knows... for i in range (0,len(code_p)-1): series = series + " " + code_n[i] + " " + code_p[i] series_index = code_n[-1] title_parts.append(code_p[-1]) # subtitle 1: Field 245 for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='b' and string-length(text())>0]",namespaces=ns): title_parts.append(i.text.strip()) break # subtitle 2 #for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='c' and string-length(text())>0]",namespaces=ns): # title = title + " / " + i.text.strip() # break title = " : ".join(title_parts) # Log if series_index is not None: log.info("Extracted Series_Index from Field 245: %s" % series_index) if series is not None: log.info("Extracted Series from Field 245: %s" % series) series = self.cleanUpSeries(log, series, publisher_name) if title is not None: log.info("Extracted Title: %s" % title) title = self.cleanUpTitle(log, title) # Title_Sort if len(title_parts)>0: title_sort_parts = list(title_parts) title_sort_regex = re.match('^(.*?)('+chr(152)+'.*'+chr(156)+')?(.*?)$',title_parts[0]) sortword = title_sort_regex.group(2) if sortword: title_sort_parts[0] = ''.join(filter(None,[title_sort_regex.group(1).strip(),title_sort_regex.group(3).strip(),", "+sortword])) title_sort = " : ".join(title_sort_parts) # Log if title_sort is not None: log.info("Extracted Title_Sort: %s" % title_sort) ##### Field 100 and Field 700 ##### # Authors for i in record.xpath(".//marc21:datafield[@tag='100']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): # primary authors name = re.sub(" \[.*\]$","",i.text.strip()); authors.append(name) for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): # secondary authors name = re.sub(" \[.*\]$","",i.text.strip()); authors.append(name) if len(authors)==0: # if no "real" autor was found take all persons involved for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): # secondary authors name = re.sub(" \[.*\]$","",i.text.strip()); authors.append(name) if len(authors)>0: author_sort = authors[0] # Log if len(authors)>0: log.info("Extracted Authors: %s" % " & ".join(authors)) if author_sort is not None: log.info("Extracted Author_Sort: %s" % " & ".join(authors)) ##### Field 856 ##### # Comments for i in record.xpath(".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]",namespaces=ns): if i.text.startswith("http://deposit.dnb.de/"): br = self.browser log.info('Downloading Comments from: %s' % i.text) try: comments = br.open_novisit(i.text, timeout=30).read() comments = re.sub('(\s|<br>|<p>|\n)*Angaben aus der 
Verlagsmeldung(\s|<br>|<p>|\n)*(<h3>.*?</h3>)?(\s|<br>|<p>|\n)*','',comments,flags=re.IGNORECASE) comments = sanitize_comments_html(comments) break except: log.info("Could not download Comments from %s" % i) # Log if comments is not None: log.info('Comments: %s' % comments) # If no comments are found for this edition, look at other editions of this book (Fields 776) # TODO: Make this configurable (default: yes) if comments is None: # get all other issues for i in record.xpath(".//marc21:datafield[@tag='776']/marc21:subfield[@code='w' and string-length(text())>0]",namespaces=ns): other_idn = re.sub("^\(.*\)","",i.text.strip()); subquery = 'num='+other_idn+' NOT (mat=film OR mat=music OR mat=microfiches OR cod=tt)' log.info(subquery) if self.cfg_dnb_token is None: subresults = self.getSearchResultsByScraping(log, subquery, timeout) else: subresults = self.getSearchResults(log, subquery, timeout) if subresults is None: continue for subrecord in subresults: for i in subrecord.xpath(".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]",namespaces=ns): if i.text.startswith("http://deposit.dnb.de/"): br = self.browser log.info('Downloading Comments from: %s' % i.text) try: comments = br.open_novisit(i.text, timeout=30).read() comments = re.sub('(\s|<br>|<p>|\n)*Angaben aus der Verlagsmeldung(\s|<br>|<p>|\n)*(<h3>.*?</h3>)?(\s|<br>|<p>|\n)*','',comments,flags=re.IGNORECASE) comments = sanitize_comments_html(comments) break except: log.info("Could not download Comments from %s" % i) if comments is not None: log.info('Comments from other issue: %s' % comments) break ##### Field 16 ##### # ID: IDN for i in record.xpath(".//marc21:datafield[@tag='016']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): idn = i.text.strip() break # Log if idn is not None: log.info("Extracted ID IDN: %s" % idn) ##### Field 24 ##### # ID: URN for i in record.xpath(".//marc21:datafield[@tag='024']/marc21:subfield[@code='2' and text()='urn']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): urn = i.text.strip() break # Log if urn is not None: log.info("Extracted ID URN: %s" % urn) ##### Field 20 ##### # ID: ISBN for i in record.xpath(".//marc21:datafield[@tag='020']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): isbn_regex = "(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]" match = re.search(isbn_regex, i.text.strip()) isbn = match.group() isbn = isbn.replace('-','') break # Log if isbn is not None: log.info("Extracted ID ISBN: %s" % isbn) # When doing an exact search for a given ISBN skip books with wrong ISBNs if isbn is not None and "isbn" in exact_search: if isbn != exact_search["isbn"]: log.info("Extracted ISBN does not match book's ISBN, skipping record") continue ##### Field 82 ##### # ID: Sachgruppe (DDC) for i in record.xpath(".//marc21:datafield[@tag='082']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): ddc.append(i.text.strip()) # Log if len(ddc)>0: log.info("Extracted ID DDC: %s" % ",".join(ddc)) ##### Field 490 ##### # In theory this field is not used for "real" book series, use field 830 instead. But it is used. # Series and Series_Index if series is None or (series is not None and series_index == "0"): for i in record.xpath(".//marc21:datafield[@tag='490']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns): # "v" either "Nr. 
220" or "This great Seriestitle : Nr. 220" - if available use this instead of attribute a attr_v = i.xpath(".//marc21:subfield[@code='v']",namespaces=ns)[0].text.strip() parts = re.split(" : ",attr_v) if len(parts)==2: if bool(re.search("\d",parts[0])) != bool(re.search("\d",parts[1])): # figure out which part contains the index if bool(re.search("\d",parts[0])): indexpart = parts[0] textpart = parts[1] else: indexpart = parts[1] textpart = parts[0] match = re.search("(\d+[,\.\d+]?)", indexpart) if match is not None: series_index = match.group(1) series = textpart.strip() else: match = re.search("(\d+[,\.\d+]?)", attr_v) if match is not None: series_index = match.group(1) else: series_index = "0" series_index = series_index.replace(',','.') # Use Series Name from attribute "a" if not already found in attribute "v" if series is None: series = i.xpath(".//marc21:subfield[@code='a']",namespaces=ns)[0].text.strip() # Log if series_index is not None: log.info("Extracted Series Index from Field 490: %s" % series_index) if series is not None: log.info("Extracted Series from Field 490: %s" % series) series = self.cleanUpSeries(log, series, publisher_name) if series is not None: break ##### Field 246 ##### # Series and Series_Index if series is None or (series is not None and series_index == "0"): for i in record.xpath(".//marc21:datafield[@tag='246']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): match = re.search("^(.+?) ; (\d+[,\.\d+]?)$", i.text.strip()) if match is not None: series = match.group(1) series_index = match.group(2) # Log if series_index is not None: log.info("Extracted Series Index from Field 246: %s" % series_index) if series is not None: log.info("Extracted Series from Field 246: %s" % series) series = self.cleanUpSeries(log, series, publisher_name) if series is not None: break ##### Field 800 ##### # Series and Series_Index if series is None or (series is not None and series_index == "0"): for i in record.xpath(".//marc21:datafield[@tag='800']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='t' and string-length(text())>0]/..",namespaces=ns): # Series Index series_index = i.xpath(".//marc21:subfield[@code='v']",namespaces=ns)[0].text.strip() match = re.search("(\d+[,\.\d+]?)", series_index) if match is not None: series_index = match.group(1) else: series_index = "0" series_index = series_index.replace(',','.') # Series series = i.xpath(".//marc21:subfield[@code='t']",namespaces=ns)[0].text.strip() # Log if series_index is not None: log.info("Extracted Series Index from Field 800: %s" % series_index) if series is not None: log.info("Extracted Series from Field 800: %s" % series) series = self.cleanUpSeries(log, series, publisher_name) if series is not None: break ##### Field 830 ##### # Series and Series_Index if series is None or (series is not None and series_index == "0"): for i in record.xpath(".//marc21:datafield[@tag='830']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns): # Series Index series_index = i.xpath(".//marc21:subfield[@code='v']",namespaces=ns)[0].text.strip() match = re.search("(\d+[,\.\d+]?)", series_index) if match is not None: series_index = match.group(1) else: series_index = "0" series_index = series_index.replace(',','.') # Series series = i.xpath(".//marc21:subfield[@code='a']",namespaces=ns)[0].text.strip() # Log if series_index is not None: log.info("Extracted Series Index from Field 830: %s" % series_index) if 
series is not None: log.info("Extracted Series from Field 830: %s" % series) series = self.cleanUpSeries(log, series, publisher_name) if series is not None: break ##### Field 689 ##### # GND Subjects for i in record.xpath(".//marc21:datafield[@tag='689']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): subjects_gnd.append(i.text.strip()) for f in range(600,656): for i in record.xpath(".//marc21:datafield[@tag='"+str(f)+"']/marc21:subfield[@code='2' and text()='gnd']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): if i.text.startswith("("): continue subjects_gnd.append(i.text) # Log if len(subjects_gnd)>0: log.info("Extracted GND Subjects: %s" % " ".join(subjects_gnd)) ##### Fields 600-655 ##### # Non-GND subjects for f in range(600,656): for i in record.xpath(".//marc21:datafield[@tag='"+str(f)+"']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): # ignore entries starting with "(": if i.text.startswith("("): continue subjects_non_gnd.extend(re.split(',|;',i.text)) # remove one-character subjects: for i in subjects_non_gnd: if len(i)<2: subjects_non_gnd.remove(i) # Log if len(subjects_non_gnd)>0: log.info("Extracted non-GND Subjects: %s" % " ".join(subjects_non_gnd)) ##### Field 250 ##### # Edition for i in record.xpath(".//marc21:datafield[@tag='250']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): edition = i.text.strip() break # Log if edition is not None: log.info("Extracted Edition: %s" % edition) ##### Field 41 ##### # Languages for i in record.xpath(".//marc21:datafield[@tag='041']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns): languages.append(i.text.strip()) # Log if languages is not None: log.info("Extracted Languages: %s" % ",".join(languages)) ##### If configured: Try to separate Series, Series Index and Title from the fetched title ##### #if self.cfg_guess_series is True: if (series is None or (series is not None and series_index == "0")) and self.cfg_guess_series is True: guessed_series = None guessed_series_index = None guessed_title = None log.info("Starting Series Guesser") parts = re.split("[:]",self.removeSortingCharacters(title)) if len(parts)==2: log.info("Title has two parts") # make sure only one part of the two parts contains digits if bool(re.search("\d",parts[0])) != bool(re.search("\d",parts[1])): log.info("only one title part contains digits") # figure out which part contains the index if bool(re.search("\d",parts[0])): indexpart = parts[0] textpart = parts[1] else: indexpart = parts[1] textpart = parts[0] # Look at the part without digits: match = re.match("^[\s\-–:]*(.+?)[\s\-–:]*$",textpart) # remove odd characters from start and end of the text part if match: textpart = match.group(1) # Look at the part with digits: # for Titleparts like: "Name of the series - Episode 2" match = re.match("^\s*(\S\D*?[a-zA-Z]\D*?)\W[\(\/\.,\s\-–:]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",indexpart) if match: guessed_series_index = match.group(2) guessed_series = match.group(1) if guessed_series is None: guessed_series = textpart guessed_title = textpart + " : Band " + guessed_series_index else: guessed_title = textpart #log.info("ALGO1: guessed_title: " + guessed_title) #log.info("ALGO1: guessed_series: " + guessed_series) #log.info("ALGO1: guessed_series_index: " + guessed_series_index) else: # for Titleparts like: "Episode 2 Name of the series" match = 
re.match("^\s*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*(\S\D*?[a-zA-Z]\D*?)[\/\.,\-–\s]*$",indexpart) if match: guessed_series_index = match.group(1) guessed_series = match.group(2) if guessed_series is None: # sometimes books with multiple volumes are detected as series without name -> Add the volume to the title guessed_series = textpart guessed_title = textpart + " : Band " + guessed_series_index else: guessed_title = textpart #log.info("ALGO2: guessed_title: " + guessed_title) #log.info("ALGO2: guessed_series: " + guessed_series) #log.info("ALGO2: guessed_series_index: " + guessed_series_index) else: # for titleparts like: "Band 2" match = re.match("^[\s\(]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*[\/\.,\-–\s]*$",indexpart) if match: guessed_series_index = match.group(1) # ...with textpart like NAME OF SERIES\s[\-\.;:]\sNAME OF TITLE # some false positives match = re.match("^\s*(\w+.+?)\s?[\.;\-–:]+\s(\w+.+)\s*$",textpart) if match: guessed_series = match.group(1) guessed_title = match.group(2) log.info("ALGO3: guessed_title: " + guessed_title) log.info("ALGO3: guessed_series: " + guessed_series) log.info("ALGO3: guessed_series_index: " + guessed_series_index) elif len(parts)==1: log.info("Title has one part") # for Titles like: "Name of the series - Title (Episode 2)" match = re.match("^\s*(\S.+?) \- (\S.+?) [\(\/\.,\s\-–:](?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",parts[0]) if match: guessed_series_index = match.group(3) guessed_series = match.group(1) guessed_title = match.group(2) #log.info("ALGO4: guessed_title: " + guessed_title) #log.info("ALGO4: guessed_series: " + guessed_series) #log.info("ALGO4: guessed_series_index: " + guessed_series_index) else: # for Titles like: "Name of the series - Episode 2" match = re.match("^\s*(\S.+?)[\(\/\.,\s\-–:]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",parts[0]) if match: guessed_series_index = match.group(2) guessed_series = match.group(1) guessed_title = guessed_series + " : Band " + guessed_series_index #log.info("ALGO5: guessed_title: " + guessed_title) #log.info("ALGO5: guessed_series: " + guessed_series) #log.info("ALGO5: guessed_series_index: " + guessed_series_index) # Log if guessed_series is not None: log.info("Guessed Series: %s" % guessed_series) #guessed_series = self.cleanUpSeries(log, guessed_series, publisher_name) if guessed_series_index is not None: log.info("Guessed Series Index: %s" % guessed_series_index) if guessed_title is not None: log.info("Guessed Title: %s" % guessed_title) guessed_title = self.cleanUpTitle(log, guessed_title) if guessed_series is not None and guessed_series_index is not None and guessed_title is not None: title = guessed_title series = guessed_series series_index = guessed_series_index ##### Filter exact searches ##### # When doing an exact search for a given IDN skip books with wrong IDNs # TODO: Currently exact_search for ISBN is not implemented. 
Would require ISBN-10 and ISBN-13 conversions if idn is not None and "idn" in exact_search: if idn != exact_search["idn"]: log.info("Extracted IDN does not match book's IDN, skipping record") continue ##### Put it all together ##### if self.cfg_append_edition_to_title == True and edition is not None: title = title + " : " + edition mi = Metadata(self.removeSortingCharacters(title), map(lambda i: self.removeSortingCharacters(i), authors)) mi.title_sort = self.removeSortingCharacters(title_sort) mi.author_sort = self.removeSortingCharacters(author_sort) mi.languages = languages mi.pubdate = pubdate mi.publisher = " ; ".join(filter(None,[publisher_location, self.removeSortingCharacters(publisher_name)])) mi.series = self.removeSortingCharacters(series) mi.series_index = series_index mi.comments = comments mi.isbn = isbn # also required for cover download mi.set_identifier('urn',urn) mi.set_identifier('dnb-idn',idn) mi.set_identifier('ddc', ",".join(ddc)) # cfg_subjects: # 0: use only subjects_gnd if self.cfg_fetch_subjects == 0: mi.tags = self.uniq(subjects_gnd) # 1: use only subjects_gnd if found, else subjects_non_gnd elif self.cfg_fetch_subjects == 1: if len(subjects_gnd)>0: mi.tags = self.uniq(subjects_gnd) else: mi.tags = self.uniq(subjects_non_gnd) # 2: subjects_gnd and subjects_non_gnd elif self.cfg_fetch_subjects == 2: mi.tags = self.uniq(subjects_gnd + subjects_non_gnd) # 3: use only subjects_non_gnd if found, else subjects_gnd elif self.cfg_fetch_subjects == 3: if len(subjects_non_gnd)>0: mi.tags = self.uniq(subjects_non_gnd) else: mi.tags = self.uniq(subjects_gnd) # 4: use only subjects_non_gnd elif self.cfg_fetch_subjects == 4: mi.tags = self.uniq(subjects_non_gnd) # 5: use no subjects at all elif self.cfg_fetch_subjects == 5: mi.tags = [] # put current result's metdata into result queue log.info("Final formatted result: \n%s" % mi) result_queue.put(mi)
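# --- Illustrative sketch (not part of the original sources) ------------------
# The series-guessing step above splits the fetched title on ":" and pulls a
# volume number out of whichever part contains digits. A deliberately reduced
# standalone version of that idea (keyword list and regex simplified, sample
# title made up):
import re

SERIES_KEYWORD = r'(?:Nr\.|Episode|Bd\.|Band|Part|Teil|Folge|#)'

def guess_series(title):
    parts = [p.strip() for p in title.split(':')]
    if len(parts) != 2:
        return None
    for indexpart, textpart in (parts, parts[::-1]):
        m = re.match(r'^(.*?)\s*' + SERIES_KEYWORD + r'\s*(\d+(?:[.,]\d+)?)\s*$', indexpart)
        if m:
            series = m.group(1).strip(' -–') or textpart
            series_index = float(m.group(2).replace(',', '.'))
            return series, series_index, textpart   # textpart becomes the cleaned title
    return None

print(guess_series('Die Zwerge-Saga : Band 3'))
# -> ('Die Zwerge-Saga', 3.0, 'Die Zwerge-Saga')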