Beispiel #1
0
def read_metadata(root):
    """Read basic metadata from a parsed OPF *root* element.

    Populates identifiers, title, title_sort, languages, authors,
    book producer and the pubdate/timestamp/last_modified dates.
    Fields absent from the OPF keep their 'Unknown' defaults.

    :param root: parsed OPF document root.
    :return: a populated Metadata instance.
    """
    ans = Metadata(_('Unknown'), [_('Unknown')])
    prefixes, refines = read_prefixes(root), read_refines(root)
    identifiers = read_identifiers(root, prefixes, refines)
    ids = {}
    # dict.iteritems() is Python 2 only; items() works on both 2 and 3.
    for key, vals in identifiers.items():
        if key == 'calibre':
            # The calibre scheme is stored separately as the application id.
            ans.application_id = vals[0]
        elif key != 'uuid':
            # uuid is deliberately dropped here; everything else becomes
            # a regular identifier (only the first value per scheme is kept).
            ids[key] = vals[0]
    ans.set_identifiers(ids)
    ans.title = read_title(root, prefixes, refines) or ans.title
    ans.title_sort = read_title_sort(root, prefixes, refines) or ans.title_sort
    ans.languages = read_languages(root, prefixes, refines) or ans.languages
    auts, aus = [], []
    for a in read_authors(root, prefixes, refines):
        auts.append(a.name)
        aus.append(a.sort)
    ans.authors = auts or ans.authors
    ans.author_sort = authors_to_string(aus) or ans.author_sort
    bkp = read_book_producers(root, prefixes, refines)
    if bkp:
        ans.book_producer = bkp[0]
    # Date fields are only assigned when actually defined in the OPF,
    # so the Metadata defaults survive otherwise.
    pd = read_pubdate(root, prefixes, refines)
    if not is_date_undefined(pd):
        ans.pubdate = pd
    ts = read_timestamp(root, prefixes, refines)
    if not is_date_undefined(ts):
        ans.timestamp = ts
    lm = read_last_modified(root, prefixes, refines)
    if not is_date_undefined(lm):
        ans.last_modified = lm
    return ans
Beispiel #2
0
def read_metadata(root, ver=None, return_extra_data=False):
    """Read the full set of OPF metadata from a parsed OPF *root*.

    :param root: parsed OPF document root.
    :param ver: OPF version; passed through unchanged into the extra-data
        tuple when return_extra_data is True.
    :param return_extra_data: when True, return the 4-tuple
        (metadata, ver, raster cover, first spine item) instead of just
        the Metadata object.
    """
    meta = Metadata(_('Unknown'), [_('Unknown')])
    prefixes = read_prefixes(root)
    refines = read_refines(root)
    # Split identifiers: calibre -> application_id, uuid -> uuid,
    # everything else into the regular identifier map.
    other_ids = {}
    for scheme, values in iteritems(read_identifiers(root, prefixes, refines)):
        first = values[0]
        if scheme == 'calibre':
            meta.application_id = first
        elif scheme == 'uuid':
            meta.uuid = first
        else:
            other_ids[scheme] = first
    meta.set_identifiers(other_ids)
    meta.title = read_title(root, prefixes, refines) or meta.title
    meta.title_sort = read_title_sort(root, prefixes, refines) or meta.title_sort
    meta.languages = read_languages(root, prefixes, refines) or meta.languages
    names, sorts = [], []
    for author in read_authors(root, prefixes, refines):
        names.append(author.name)
        sorts.append(author.sort)
    meta.authors = names or meta.authors
    meta.author_sort = authors_to_string(sorts) or meta.author_sort
    producers = read_book_producers(root, prefixes, refines)
    if producers and producers[0]:
        meta.book_producer = producers[0]
    # Dates are assigned only when the OPF actually defines them.
    for attr, reader in (
            ('pubdate', read_pubdate),
            ('timestamp', read_timestamp),
            ('last_modified', read_last_modified)):
        when = reader(root, prefixes, refines)
        if not is_date_undefined(when):
            setattr(meta, attr, when)
    meta.comments = read_comments(root, prefixes, refines) or meta.comments
    meta.publisher = read_publisher(root, prefixes, refines) or meta.publisher
    meta.tags = read_tags(root, prefixes, refines) or meta.tags
    meta.rating = read_rating(root, prefixes, refines) or meta.rating
    series, series_index = read_series(root, prefixes, refines)
    if series:
        meta.series, meta.series_index = series, series_index
    meta.author_link_map = (read_author_link_map(root, prefixes, refines)
                            or meta.author_link_map)
    meta.user_categories = (read_user_categories(root, prefixes, refines)
                            or meta.user_categories)
    custom = read_user_metadata(root, prefixes, refines) or {}
    for field_name, fm in iteritems(custom):
        meta.set_user_metadata(field_name, fm)
    if return_extra_data:
        return (meta, ver, read_raster_cover(root, prefixes, refines),
                first_spine_item(root, prefixes, refines))
    return meta
Beispiel #3
0
def read_metadata(root, ver=None, return_extra_data=False):
    """Read the full set of OPF metadata from a parsed OPF *root*.

    :param root: parsed OPF document root.
    :param ver: OPF version; passed through unchanged into the extra-data
        tuple when return_extra_data is True.
    :param return_extra_data: when True, return the 4-tuple
        (metadata, ver, raster cover, first spine item) instead of just
        the Metadata object.
    """
    ans = Metadata(_('Unknown'), [_('Unknown')])
    prefixes, refines = read_prefixes(root), read_refines(root)
    identifiers = read_identifiers(root, prefixes, refines)
    ids = {}
    # dict.iteritems() is Python 2 only; items() works on both 2 and 3.
    for key, vals in identifiers.items():
        if key == 'calibre':
            # The calibre scheme is stored separately as the application id.
            ans.application_id = vals[0]
        elif key == 'uuid':
            ans.uuid = vals[0]
        else:
            # Only the first value per scheme is kept.
            ids[key] = vals[0]
    ans.set_identifiers(ids)
    ans.title = read_title(root, prefixes, refines) or ans.title
    ans.title_sort = read_title_sort(root, prefixes, refines) or ans.title_sort
    ans.languages = read_languages(root, prefixes, refines) or ans.languages
    auts, aus = [], []
    for a in read_authors(root, prefixes, refines):
        auts.append(a.name)
        aus.append(a.sort)
    ans.authors = auts or ans.authors
    ans.author_sort = authors_to_string(aus) or ans.author_sort
    bkp = read_book_producers(root, prefixes, refines)
    # Only the first producer is used, and only when it is non-empty.
    if bkp and bkp[0]:
        ans.book_producer = bkp[0]
    # Dates are assigned only when the OPF actually defines them, so the
    # Metadata defaults survive otherwise.
    pd = read_pubdate(root, prefixes, refines)
    if not is_date_undefined(pd):
        ans.pubdate = pd
    ts = read_timestamp(root, prefixes, refines)
    if not is_date_undefined(ts):
        ans.timestamp = ts
    lm = read_last_modified(root, prefixes, refines)
    if not is_date_undefined(lm):
        ans.last_modified = lm
    ans.comments = read_comments(root, prefixes, refines) or ans.comments
    ans.publisher = read_publisher(root, prefixes, refines) or ans.publisher
    ans.tags = read_tags(root, prefixes, refines) or ans.tags
    ans.rating = read_rating(root, prefixes, refines) or ans.rating
    s, si = read_series(root, prefixes, refines)
    if s:
        ans.series, ans.series_index = s, si
    ans.author_link_map = read_author_link_map(root, prefixes, refines) or ans.author_link_map
    ans.user_categories = read_user_categories(root, prefixes, refines) or ans.user_categories
    for name, fm in (read_user_metadata(root, prefixes, refines) or {}).items():
        ans.set_user_metadata(name, fm)
    if return_extra_data:
        ans = ans, ver, read_raster_cover(root, prefixes, refines), first_spine_item(root, prefixes, refines)
    return ans
Beispiel #4
0
def read_metadata(root):
    """Read a minimal subset of OPF metadata from a parsed OPF *root*:
    identifiers, title, title_sort and languages.

    :param root: parsed OPF document root.
    :return: a Metadata instance; fields absent from the OPF keep their
        'Unknown' defaults.
    """
    ans = Metadata(_('Unknown'), [_('Unknown')])
    prefixes, refines = read_prefixes(root), read_refines(root)
    identifiers = read_identifiers(root, prefixes, refines)
    ids = {}
    # dict.iteritems() is Python 2 only; items() works on both 2 and 3.
    for key, vals in identifiers.items():
        if key == 'calibre':
            # The calibre scheme is stored separately as the application id.
            ans.application_id = vals[0]
        elif key != 'uuid':
            # uuid is deliberately dropped; only the first value per
            # remaining scheme is kept.
            ids[key] = vals[0]
    ans.set_identifiers(ids)
    ans.title = read_title(root, prefixes, refines) or ans.title
    ans.title_sort = read_title_sort(root, prefixes, refines) or ans.title_sort
    ans.languages = read_languages(root, prefixes, refines) or ans.languages

    return ans
Beispiel #5
0
    def _get_metadata(self, book_id, get_user_categories=True):  # {{{
        """Assemble a Metadata object for *book_id* from backend fields.

        :param book_id: id of the book whose metadata is wanted.
        :param get_user_categories: when True, also compute the
            user-category membership map stored in ``mi.user_categories``.
        :return: a populated Metadata instance.
        """
        mi = Metadata(None, template_cache=self.formatter_template_cache)
        # Gather author names together with their sort strings and links.
        author_ids = self._field_ids_for('authors', book_id)
        aut_list = [self._author_data(i) for i in author_ids]
        aum = []  # ordered author names
        aus = {}  # author name -> sort string
        aul = {}  # author name -> link
        for rec in aut_list:
            aut = rec['name']
            aum.append(aut)
            aus[aut] = rec['sort']
            aul[aut] = rec['link']
        mi.title = self._field_for('title',
                                   book_id,
                                   default_value=_('Unknown'))
        mi.authors = aum
        mi.author_sort = self._field_for('author_sort',
                                         book_id,
                                         default_value=_('Unknown'))
        mi.author_sort_map = aus
        mi.author_link_map = aul
        mi.comments = self._field_for('comments', book_id)
        mi.publisher = self._field_for('publisher', book_id)
        # Use a single "now" for both date defaults so they agree.
        n = nowf()
        mi.timestamp = self._field_for('timestamp', book_id, default_value=n)
        mi.pubdate = self._field_for('pubdate', book_id, default_value=n)
        mi.uuid = self._field_for('uuid', book_id, default_value='dummy')
        mi.title_sort = self._field_for('sort',
                                        book_id,
                                        default_value=_('Unknown'))
        mi.book_size = self._field_for('size', book_id, default_value=0)
        mi.ondevice_col = self._field_for('ondevice',
                                          book_id,
                                          default_value='')
        mi.last_modified = self._field_for('last_modified',
                                           book_id,
                                           default_value=n)
        # Formats: None when the book has none, otherwise a FormatsList
        # backed by per-format metadata.
        formats = self._field_for('formats', book_id)
        mi.format_metadata = {}
        mi.languages = list(self._field_for('languages', book_id))
        if not formats:
            good_formats = None
        else:
            mi.format_metadata = FormatMetadata(self, book_id, formats)
            good_formats = FormatsList(formats, mi.format_metadata)
        mi.formats = good_formats
        # has_cover is a display string: localized 'Yes' or empty.
        mi.has_cover = _('Yes') if self._field_for(
            'cover', book_id, default_value=False) else ''
        mi.tags = list(self._field_for('tags', book_id, default_value=()))
        mi.series = self._field_for('series', book_id)
        if mi.series:
            mi.series_index = self._field_for('series_index',
                                              book_id,
                                              default_value=1.0)
        mi.rating = self._field_for('rating', book_id)
        mi.set_identifiers(
            self._field_for('identifiers', book_id, default_value={}))
        mi.application_id = book_id
        mi.id = book_id
        # Custom columns: composite columns are deferred until after all
        # plain values are set, since their templates may reference them.
        composites = []
        for key, meta in self.field_metadata.custom_iteritems():
            mi.set_user_metadata(key, meta)
            if meta['datatype'] == 'composite':
                composites.append(key)
            else:
                val = self._field_for(key, book_id)
                if isinstance(val, tuple):
                    val = list(val)
                extra = self._field_for(key + '_index', book_id)
                mi.set(key, val=val, extra=extra)
        for key in composites:
            mi.set(key, val=self._composite_for(key, book_id, mi))

        # Determine which user categories this book belongs to by
        # matching each (name, category) pair against the book's values.
        user_cat_vals = {}
        if get_user_categories:
            user_cats = self.backend.prefs['user_categories']
            for ucat in user_cats:
                res = []
                for name, cat, ign in user_cats[ucat]:
                    v = mi.get(cat, None)
                    if isinstance(v, list):
                        if name in v:
                            res.append([name, cat])
                    elif name == v:
                        res.append([name, cat])
                user_cat_vals[ucat] = res
        mi.user_categories = user_cat_vals

        return mi
Beispiel #6
0
    def _get_metadata(self, book_id, get_user_categories=True):  # {{{
        """Assemble a Metadata object for *book_id* from backend fields.

        :param book_id: id of the book whose metadata is wanted.
        :param get_user_categories: when True, also compute the
            user-category membership map stored in ``mi.user_categories``.
        :return: a populated Metadata instance.
        """
        mi = Metadata(None, template_cache=self.formatter_template_cache)
        # Gather author names together with their sort strings and links.
        author_ids = self._field_ids_for('authors', book_id)
        aut_list = [self._author_data(i) for i in author_ids]
        aum = []  # ordered author names
        aus = {}  # author name -> sort string
        aul = {}  # author name -> link
        for rec in aut_list:
            aut = rec['name']
            aum.append(aut)
            aus[aut] = rec['sort']
            aul[aut] = rec['link']
        mi.title       = self._field_for('title', book_id,
                default_value=_('Unknown'))
        mi.authors     = aum
        mi.author_sort = self._field_for('author_sort', book_id,
                default_value=_('Unknown'))
        mi.author_sort_map = aus
        mi.author_link_map = aul
        mi.comments    = self._field_for('comments', book_id)
        mi.publisher   = self._field_for('publisher', book_id)
        # Use a single "now" for both date defaults so they agree.
        n = nowf()
        mi.timestamp   = self._field_for('timestamp', book_id, default_value=n)
        mi.pubdate     = self._field_for('pubdate', book_id, default_value=n)
        mi.uuid        = self._field_for('uuid', book_id,
                default_value='dummy')
        mi.title_sort  = self._field_for('sort', book_id,
                default_value=_('Unknown'))
        mi.book_size   = self._field_for('size', book_id, default_value=0)
        mi.ondevice_col = self._field_for('ondevice', book_id, default_value='')
        mi.last_modified = self._field_for('last_modified', book_id,
                default_value=n)
        # Formats: None when the book has none, otherwise a FormatsList
        # backed by per-format metadata.
        formats = self._field_for('formats', book_id)
        mi.format_metadata = {}
        mi.languages = list(self._field_for('languages', book_id))
        if not formats:
            good_formats = None
        else:
            mi.format_metadata = FormatMetadata(self, book_id, formats)
            good_formats = FormatsList(formats, mi.format_metadata)
        mi.formats = good_formats
        # has_cover is a display string: localized 'Yes' or empty.
        mi.has_cover = _('Yes') if self._field_for('cover', book_id,
                default_value=False) else ''
        mi.tags = list(self._field_for('tags', book_id, default_value=()))
        mi.series = self._field_for('series', book_id)
        if mi.series:
            mi.series_index = self._field_for('series_index', book_id,
                    default_value=1.0)
        mi.rating = self._field_for('rating', book_id)
        mi.set_identifiers(self._field_for('identifiers', book_id,
            default_value={}))
        mi.application_id = book_id
        mi.id = book_id
        # Custom columns: composite columns are deferred until after all
        # plain values are set, since their templates may reference them.
        composites = []
        for key, meta in self.field_metadata.custom_iteritems():
            mi.set_user_metadata(key, meta)
            if meta['datatype'] == 'composite':
                composites.append(key)
            else:
                val = self._field_for(key, book_id)
                if isinstance(val, tuple):
                    val = list(val)
                extra = self._field_for(key+'_index', book_id)
                mi.set(key, val=val, extra=extra)
        for key in composites:
            mi.set(key, val=self._composite_for(key, book_id, mi))

        # Determine which user categories this book belongs to by
        # matching each (name, category) pair against the book's values.
        user_cat_vals = {}
        if get_user_categories:
            user_cats = self.backend.prefs['user_categories']
            for ucat in user_cats:
                res = []
                for name,cat,ign in user_cats[ucat]:
                    v = mi.get(cat, None)
                    if isinstance(v, list):
                        if name in v:
                            res.append([name,cat])
                    elif name == v:
                        res.append([name,cat])
                user_cat_vals[ucat] = res
        mi.user_categories = user_cat_vals

        return mi
Beispiel #7
0
    def identify(self,
                 log,
                 result_queue,
                 abort,
                 title=None,
                 authors=None,
                 identifiers={},
                 timeout=30):
        self.load_config()

        # get identifying tags from book
        idn = identifiers.get('dnb-idn', None)
        isbn = check_isbn(identifiers.get('isbn', None))

        # ignore unknown authors
        if authors is "V. A." or authors is "V.A." or authors is "Unknown" or authors is "Unbekannt":
            authors = None

        if (isbn is None) and (idn is None) and (title is None) and (authors is
                                                                     None):
            log.info(
                "This plugin requires at least either ISBN, IDN, Title or Author(s)."
            )
            return None

        queries = []
        # DNB does not do an exact search when searching for a idn or isbn, so we have to filter the results
        exact_search = {}

        if idn is not None:
            queries.append('num=' + idn)
            exact_search['idn'] = idn

        else:
            authors_v = []
            title_v = []

            if authors is not None:
                authors_v.append(' '.join(authors))
                authors_v.append(' '.join(
                    self.get_author_tokens(authors, only_first_author=False)))
                authors_v.append(' '.join(
                    self.get_author_tokens(authors, only_first_author=True)))

            if title is not None:
                title_v.append(title)
                title_v.append(' '.join(
                    self.get_title_tokens(title,
                                          strip_joiners=False,
                                          strip_subtitle=False)))
                title_v.append(' '.join(
                    self.get_title_tokens(title,
                                          strip_joiners=False,
                                          strip_subtitle=True)))

            if isbn is not None:
                exact_search['isbn'] = isbn

            # title and author
            if authors is not None and title is not None:
                for a in authors_v:
                    for t in title_v:
                        if isbn is not None:
                            queries.append('tit="' + t + '" AND per="' + a +
                                           '" AND num="' + isbn + '"')
                        else:
                            queries.append('tit="' + t + '" AND per="' + a +
                                           '"')

                # try with author and title swapped
                if isbn is not None:
                    queries.append('per="' + title + '" AND tit="' +
                                   authors[0] + '" AND num="' + isbn + '"')
                else:
                    queries.append('per="' + title + '" AND tit="' +
                                   authors[0] + '"')

            # title but no author
            elif authors is not None and title is None:
                for i in authors_v:
                    if isbn is not None:
                        queries.append('per="' + i + '" AND num="' + isbn +
                                       '"')
                    else:
                        queries.append('per="' + i + '"')

                # try with author and title swapped
                if isbn is not None:
                    queries.append('tit="' + authors[0] + '" AND num="' +
                                   isbn + '"')
                else:
                    queries.append('tit="' + authors[0] + '"')

            # author but no title
            elif authors is None and title is not None:
                for i in title_v:
                    if isbn is not None:
                        queries.append('tit="' + i + '" AND num="' + isbn +
                                       '"')
                    else:
                        queries.append('tit="' + i + '"')

                # try with author and title swapped
                if isbn is not None:
                    queries.append('per="' + title + '" AND num="' + isbn +
                                   '"')
                else:
                    queries.append('per="' + title + '"')

            # as last resort only use isbn
            if isbn is not None:
                queries.append('num=' + isbn)

            # Sort queries descending by length (assumption: longer query -> less but better results)
            #queries.sort(key=len)
            #queries.reverse()

        # remove duplicate queries
        uniqueQueries = []
        for i in queries:
            if i not in uniqueQueries:
                uniqueQueries.append(i)

        # Process queries
        results = None

        for query in uniqueQueries:
            query = query + ' NOT (mat=film OR mat=music OR mat=microfiches)'
            log.info(query)

            if self.cfg_dnb_token is None:
                results = self.getSearchResultsByScraping(log, query, timeout)
            else:
                results = self.getSearchResults(log, query, timeout)

            if results is None:
                continue

            log.info("Parsing records")

            ns = {'marc21': 'http://www.loc.gov/MARC21/slim'}
            for record in results:
                series = None
                series_index = None
                publisher = None
                pubdate = None
                languages = []
                title = None
                title_sort = None
                edition = None
                comments = None
                idn = None
                urn = None
                isbn = None
                ddc = []
                subjects_gnd = []
                subjects_non_gnd = []

                # Title: Field 245
                title_parts = []
                # if a,n,p exist: series = a, series_index = n, title = p
                for i in record.xpath(
                        ".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]/../marc21:subfield[@code='n' and string-length(text())>0]/../marc21:subfield[@code='p' and string-length(text())>0]/..",
                        namespaces=ns):
                    series_index = i.xpath(".//marc21:subfield[@code='n']",
                                           namespaces=ns)[0].text.strip()
                    match = re.search("(\d+[,\.\d+]?)", series_index)
                    if match:
                        series_index = match.group(1)
                    else:
                        series_index = "0"  # looks like sometimes DNB does not know the series index and uses something like "[...]"
                    series_index = series_index.replace(',', '.')
                    series = i.xpath(".//marc21:subfield[@code='a']",
                                     namespaces=ns)[0].text.strip()
                    title_parts.append(
                        i.xpath(".//marc21:subfield[@code='p']",
                                namespaces=ns)[0].text.strip())
                    log.info("Extracted Series: %s" % series)
                    log.info("Extracted Series Index: %s" % series_index)
                    break
                # otherwise: title = a
                if len(title_parts) == 0:
                    for i in record.xpath(
                            ".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]",
                            namespaces=ns):
                        title_parts.append(i.text.strip())
                        break

                # subtitle 1
                for i in record.xpath(
                        ".//marc21:datafield[@tag='245']/marc21:subfield[@code='b' and string-length(text())>0]",
                        namespaces=ns):
                    title_parts.append(i.text.strip())
                    break

                # subtitle 2
                #for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='c' and string-length(text())>0]",namespaces=ns):
                #    title = title + " / " + i.text.strip()
                #    break

                title = " : ".join(title_parts)
                log.info("Extracted Title: %s" % title)

                # Title_Sort
                title_sort_parts = list(title_parts)
                title_sort_regex = re.match(
                    '^(.*?)(' + chr(152) + '.*' + chr(156) + ')?(.*?)$',
                    title_parts[0])
                sortword = title_sort_regex.group(2)
                if sortword:
                    title_sort_parts[0] = ''.join(
                        filter(None, [
                            title_sort_regex.group(1).strip(),
                            title_sort_regex.group(3).strip(), ", " + sortword
                        ]))
                title_sort = " : ".join(title_sort_parts)
                log.info("Extracted Title_Sort: %s" % title_sort)

                # Authors
                authors = []
                author_sort = None
                for i in record.xpath(
                        ".//marc21:datafield[@tag='100']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):  # primary authors
                    name = re.sub(" \[.*\]$", "", i.text.strip())
                    authors.append(name)
                for i in record.xpath(
                        ".//marc21:datafield[@tag='700']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):  # secondary authors
                    name = re.sub(" \[.*\]$", "", i.text.strip())
                    authors.append(name)
                if len(
                        authors
                ) == 0:  # if no "real" autor was found take all persons involved
                    for i in record.xpath(
                            ".//marc21:datafield[@tag='700']/marc21:subfield[@code='a' and string-length(text())>0]",
                            namespaces=ns):  # secondary authors
                        name = re.sub(" \[.*\]$", "", i.text.strip())
                        authors.append(name)
                if len(authors) > 0:
                    author_sort = authors[0]
                log.info("Extracted Authors: %s" % " & ".join(authors))

                # Comments
                for i in record.xpath(
                        ".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]",
                        namespaces=ns):
                    if i.text.startswith("http://deposit.dnb.de/"):
                        br = self.browser
                        log.info('Downloading Comments from: %s' % i.text)
                        try:
                            comments = br.open_novisit(i.text,
                                                       timeout=30).read()
                            comments = sanitize_comments_html(comments)
                            log.info('Comments: %s' % comments)
                            break
                        except:
                            log.info("Could not download Comments from %s" % i)

                # Publisher Name and Location
                publisher_name = None
                publisher_location = None
                fields = record.xpath(
                    ".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",
                    namespaces=ns)
                if len(fields) > 0:
                    publisher_name = fields[0].xpath(
                        ".//marc21:subfield[@code='b' and string-length(text())>0]",
                        namespaces=ns)[0].text.strip()
                    publisher_location = fields[0].xpath(
                        ".//marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns)[0].text.strip()
                else:
                    fields = record.xpath(
                        ".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../..",
                        namespaces=ns)
                    if len(fields) > 0:
                        publisher_name = fields[0].xpath(
                            ".//marc21:subfield[@code='b' and string-length(text())>0]",
                            namespaces=ns)[0].text.strip()
                    else:
                        fields = record.xpath(
                            ".//marc21:datafield[@tag='264']/marc21:subfield[@code='a' and string-length(text())>0]/../..",
                            namespaces=ns)
                        if len(fields) > 0:
                            publisher_location = fields[0].xpath(
                                ".//marc21:subfield[@code='a' and string-length(text())>0]",
                                namespaces=ns)[0].text.strip()

                log.info("Extracted Publisher: %s" % publisher_name)
                log.info("Extracted Publisher Location: %s" %
                         publisher_location)

                # Publishing Date
                for i in record.xpath(
                        ".//marc21:datafield[@tag='264']/marc21:subfield[@code='c' and string-length(text())>=4]",
                        namespaces=ns):
                    match = re.search("(\d{4})", i.text.strip())
                    if match is not None:
                        year = match.group(1)
                        pubdate = datetime.datetime(int(year), 1, 2)
                        break
                log.info("Extracted Publication Year: %s" % pubdate)

                # ID: IDN
                for i in record.xpath(
                        ".//marc21:datafield[@tag='016']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    idn = i.text.strip()
                    break
                log.info("Extracted ID IDN: %s" % idn)
                if "idn" in exact_search:
                    if idn != exact_search["idn"]:
                        log.info(
                            "Extracted IDN does not match book's IDN, skipping record"
                        )
                        continue

                # ID: URN
                for i in record.xpath(
                        ".//marc21:datafield[@tag='024']/marc21:subfield[@code='2' and text()='urn']/../marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    urn = i.text.strip()
                    break
                log.info("Extracted ID URN: %s" % urn)

                # ID: ISBN
                for i in record.xpath(
                        ".//marc21:datafield[@tag='020']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    isbn_regex = "(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]"
                    match = re.search(isbn_regex, i.text.strip())
                    isbn = match.group()
                    isbn = isbn.replace('-', '')
                    break
                log.info("Extracted ID ISBN: %s" % isbn)
                if "isbn" in exact_search:
                    if isbn != exact_search["isbn"]:
                        log.info(
                            "Extracted ISBN does not match book's ISBN, skipping record"
                        )
                        continue

                # ID: Sachgruppe (DDC)
                for i in record.xpath(
                        ".//marc21:datafield[@tag='082']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    ddc.append(i.text.strip())
                log.info("Extracted ID DDC: %s" % ",".join(ddc))

                # Series and Series_Index
                if series is None and series_index is None:
                    for i in record.xpath(
                            ".//marc21:datafield[@tag='830']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",
                            namespaces=ns):
                        # Series Index
                        series_index = i.xpath(".//marc21:subfield[@code='v']",
                                               namespaces=ns)[0].text.strip()
                        match = re.search("(\d+[,\.\d+]?)", series_index)
                        if match is not None:
                            series_index = match.group(1)
                        else:
                            series_index = "0"
                        series_index = series_index.replace(',', '.')
                        log.info("Extracted Series Index: %s" % series_index)
                        # Series
                        series = i.xpath(".//marc21:subfield[@code='a']",
                                         namespaces=ns)[0].text.strip()
                        log.info("Extracted Series: %s" % series)
                        break

                # Try to extract Series, Series Index and Title from the fetched title.
                # Caution: This overwrites DNB's series/series_index and modifies the title!
                if self.cfg_guess_series is True:
                    guessed_series = None
                    guessed_series_index = None
                    parts = re.split("[:]",
                                     self.removeSortingCharacters(title))
                    if len(parts) == 2:
                        if bool(re.search("\d", parts[0])) != bool(
                                re.search("\d", parts[1])):
                            # figure out which part contains the index
                            if bool(re.search("\d", parts[0])):
                                indexpart = parts[0]
                                textpart = parts[1]
                            else:
                                indexpart = parts[1]
                                textpart = parts[0]

                            match = re.match(
                                "^[\s\-–:]*(.+?)[\s\-–:]*$", textpart
                            )  # remove odd characters from start and end of the text part
                            if match:
                                textpart = match.group(1)

                            # from Titleparts like: "Name of the series - Episode 2"	OK
                            match = re.match(
                                "^\s*(\S.*?)[\(\/\.,\s\-–:]*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",
                                indexpart)
                            if match:
                                guessed_series_index = match.group(2)
                                guessed_series = match.group(1)
                                if guessed_series is None:
                                    guessed_series = textpart
                                    title = textpart + " : Band " + guessed_series_index
                                else:
                                    title = textpart
                            else:
                                # from Titleparts like: "Episode 2 Name of the series"
                                match = re.match(
                                    "^\s*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*(\S.*?)[\/\.,\-–\s]*$",
                                    indexpart)
                                if match:
                                    guessed_series_index = match.group(1)
                                    guessed_series = match.group(2)
                                    if guessed_series is None:
                                        guessed_series = textpart
                                        title = textpart + " : Band " + guessed_series_index
                                    else:
                                        title = textpart
                    elif len(parts) == 1:
                        # from Titles like: "Name of the series - Title (Episode 2)"
                        match = re.match(
                            "^\s*(\S.+?) \- (\S.+?) [\(\/\.,\s\-–:](?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",
                            parts[0])
                        if match:
                            guessed_series_index = match.group(3)
                            guessed_series = match.group(1)
                            title = match.group(2)

                        else:
                            # from Titles like: "Name of the series - Episode 2"
                            match = re.match(
                                "^\s*(\S.+?)[\(\/\.,\s\-–:]*(?:Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",
                                parts[0])
                            if match:
                                guessed_series_index = match.group(2)
                                guessed_series = match.group(1)
                                title = guessed_series + " : Band " + guessed_series_index

                    if guessed_series is not None and guessed_series_index is not None:
                        series = guessed_series
                        series_index = guessed_series_index
                        log.info("Guessed Series: %s" % series)
                        log.info("Guessed Series Index: %s" % series_index)

                # GND Subjects from 689
                for i in record.xpath(
                        ".//marc21:datafield[@tag='689']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    subjects_gnd.append(i.text.strip())
                # GND Subjects from 600-655
                for f in range(600, 656):
                    for i in record.xpath(".//marc21:datafield[@tag='" + str(
                            f
                    ) + "']/marc21:subfield[@code='2' and text()='gnd']/../marc21:subfield[@code='a' and string-length(text())>0]",
                                          namespaces=ns):
                        if i.text.startswith("("):
                            continue
                        subjects_gnd.append(i.text)
                log.info("Extracted GND Subjects: %s" % " ".join(subjects_gnd))

                # Non-GND subjects from 600-655
                for f in range(600, 656):
                    for i in record.xpath(".//marc21:datafield[@tag='" + str(
                            f
                    ) + "']/marc21:subfield[@code='a' and string-length(text())>0]",
                                          namespaces=ns):
                        # ignore entries starting with "(":
                        if i.text.startswith("("):
                            continue
                        subjects_non_gnd.extend(re.split(',|;', i.text))
                # remove one-character subjects:
                for i in subjects_non_gnd:
                    if len(i) < 2:
                        subjects_non_gnd.remove(i)
                log.info("Extracted non-GND Subjects: %s" %
                         " ".join(subjects_non_gnd))

                # Edition
                for i in record.xpath(
                        ".//marc21:datafield[@tag='250']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    edition = i.text.strip()
                    break
                log.info("Extracted Edition: %s" % edition)

                # Languages
                for i in record.xpath(
                        ".//marc21:datafield[@tag='041']/marc21:subfield[@code='a' and string-length(text())>0]",
                        namespaces=ns):
                    languages.append(i.text.strip())
                if languages is not None:
                    log.info("Extracted Languages: %s" % ",".join(languages))

                # Put it all together
                if self.cfg_append_edition_to_title == True and edition is not None:
                    title = title + " : " + edition

                mi = Metadata(
                    self.removeSortingCharacters(title),
                    map(lambda i: self.removeSortingCharacters(i), authors))
                mi.title_sort = self.removeSortingCharacters(title_sort)
                mi.author_sort = self.removeSortingCharacters(author_sort)
                mi.languages = languages
                mi.pubdate = pubdate
                mi.publisher = " : ".join(
                    filter(None, [
                        publisher_location,
                        self.removeSortingCharacters(publisher_name)
                    ]))
                mi.series = self.removeSortingCharacters(series)
                mi.series_index = series_index
                mi.comments = comments
                mi.isbn = isbn  # also required for cover download
                mi.set_identifier('urn', urn)
                mi.set_identifier('dnb-idn', idn)
                mi.set_identifier('ddc', ",".join(ddc))

                if self.cfg_fetch_subjects == 0:
                    mi.tags = self.uniq(subjects_gnd)
                elif self.cfg_fetch_subjects == 1:
                    if len(subjects_gnd) > 0:
                        mi.tags = self.uniq(subjects_gnd)
                    else:
                        mi.tags = self.uniq(subjects_non_gnd)
                elif self.cfg_fetch_subjects == 2:
                    mi.tags = self.uniq(subjects_gnd + subjects_non_gnd)
                elif self.cfg_fetch_subjects == 3:
                    if len(subjects_non_gnd) > 0:
                        mi.tags = self.uniq(subjects_non_gnd)
                    else:
                        mi.tags = self.uniq(subjects_gnd)
                elif self.cfg_fetch_subjects == 4:
                    mi.tags = self.uniq(subjects_non_gnd)
                elif self.cfg_fetch_subjects == 5:
                    mi.tags = []

                # put current result's metadata into result queue
                log.info("Final formatted result: %s" % mi)
                result_queue.put(mi)
# Beispiel #8
# 0
    def identify(self, log, result_queue, abort, title=None, authors=[], identifiers={}, timeout=30):
	self.load_config()

	if authors is None:
	    authors=[]

	# get identifying tags from book
	idn = identifiers.get('dnb-idn', None)
	isbn = check_isbn(identifiers.get('isbn', None))

	# ignore unknown authors
	ignored_authors = [ "V. A.", "V.A.", "Unknown", "Unbekannt" ]
	for i in ignored_authors:
	    authors = [ x for x in authors if x != i ]

	if (isbn is None) and (idn is None) and (title is None) and (authors is None):
	    log.info("This plugin requires at least either ISBN, IDN, Title or Author(s).")
	    return None


	queries=[]
	# DNB does not do an exact search when searching for a idn or isbn, so we have to filter the results
	exact_search = {}

	if idn is not None:
	    exact_search['idn'] = idn
	    # in case look for a IDN only search for the IDN and skip all the other stuff
	    queries.append('num='+idn)

	else:
	    authors_v = []
	    title_v = []

	    # create some variants of given authors
	    if authors != []:
		authors_v.append(' '.join(self.get_author_tokens(authors,only_first_author=False)))	# concat all author names ("Peter Meier Luise Stark")
		authors_v.append(' '.join(self.get_author_tokens(authors,only_first_author=True)))	# use only first author
		for a in authors:
		    authors_v.append(a)	# use all authors, one by one

		# remove duplicates
		unique_authors_v = []
		for i in authors_v:
		    if i not in unique_authors_v:
			unique_authors_v.append(i)


	    # create some variants of given title
	    if title is not None:
		title_v.append(title)	# simply use given title
		title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=False,strip_subtitle=False)))	# remove some punctation characters
		title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=False,strip_subtitle=True)))	# remove subtitle (everything after " : ")
		title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=False)))	# remove some punctation characters and joiners ("and", "&", ...)
		title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)))	# remove subtitle (everything after " : ") and joiners ("and", "&", ...)
		# TODO: remove subtitle after " - "

		# remove duplicates
		unique_title_v = []
		for i in title_v:
		    if i not in unique_title_v:
			unique_title_v.append(i)


	    # title and author
	    if authors_v != [] and title_v != []:
		for a in authors_v:
		    for t in title_v:
			if isbn is not None:
			    queries.append('tit="' + t + '" AND per="' + a + '" AND num="' + isbn + '"')
			else:
			    queries.append('tit="' + t + '" AND per="' + a + '"')

		# try with first author as title and title (without subtitle) as author
		if isbn is not None:
		    queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '" AND num="'+isbn+'"')
		else:
		    queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '"')

		# try with author and title (without subtitle) in any index
		if isbn is not None:
		    queries.append('"' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND "' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '" AND num="'+isbn+'"')
		else:
		    queries.append('"' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND "' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '"')


	    # author but no title
	    elif authors_v != [] and title_v == []:
		for i in authors_v:
		    if isbn is not None:
			queries.append('per="'+ i +'" AND num="' + isbn + '"')
		    else:
			queries.append('per="'+ i +'"')

		# try with author as title
		if isbn is not None:
		    queries.append('tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '" AND num="' + isbn + '"')
		else:
		    queries.append('tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '"')


	    # title but no author
	    elif authors_v == [] and title_v != []:
		for i in title_v:
		    if isbn is not None:
			queries.append('tit="' + i + '" AND num="' + isbn + '"')
		    else:
			queries.append('tit="' + i + '"')

		# try with title as author
		if isbn is not None:
		    queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND num="' + isbn + '"')
		else:
		    queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '"')


	    # as last resort only use isbn
	    if isbn is not None:
		queries.append('num=' + isbn)


	# remove duplicate queries
	uniqueQueries=[]
	for i in queries:
	    if i not in uniqueQueries:
		uniqueQueries.append(i)


	# Process queries
	results = None

	for query in uniqueQueries:
	    # SRU does not work with "+" or "?" characters in query, so we simply remove them
	    query = re.sub('[\+\?]','',query)

	    query = query + ' NOT (mat=film OR mat=music OR mat=microfiches OR cod=tt)'
	    log.info(query)

	    if self.cfg_dnb_token is None:
		results = self.getSearchResultsByScraping(log, query, timeout)
	    else:
		results = self.getSearchResults(log, query, timeout)

	    if results is None:
		continue

	    log.info("Parsing records")

	    ns = { 'marc21' : 'http://www.loc.gov/MARC21/slim' }
	    for record in results:
		series = None
		series_index = None
		publisher = None
		pubdate = None
		languages = []
		title = None
		title_sort = None
		authors = []
		author_sort = None
		edition = None
		comments = None
		idn = None
		urn = None
		isbn = None
		ddc = []
		subjects_gnd = []
		subjects_non_gnd = []
		publisher_name = None
		publisher_location = None


		##### Field 264 #####
		# Publisher Name and Location
		fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns)
		if len(fields)>0:
		    publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]",namespaces=ns)[0].text.strip();
		    publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns)[0].text.strip();
		else:
		    fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../..",namespaces=ns)
		    if len(fields)>0:
			publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]",namespaces=ns)[0].text.strip();
		    else:
			fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='a' and string-length(text())>0]/../..",namespaces=ns)
			if len(fields)>0:
			    publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns)[0].text.strip();

		# Publishing Date
		for i in record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='c' and string-length(text())>=4]",namespaces=ns):
		    match = re.search("(\d{4})", i.text.strip())
		    if match is not None:
			year = match.group(1)
			pubdate = datetime.datetime(int(year), 1, 1, 12 , 30, 0)
			break

		# Log
		if publisher_name is not None:
		    log.info("Extracted Publisher: %s" % publisher_name)
		if publisher_location is not None:
		    log.info("Extracted Publisher Location: %s" % publisher_location)
		if pubdate is not None:
		    log.info("Extracted Publication Year: %s" % pubdate)


		##### Field 245 ####
		# Title/Series/Series_Index
		title_parts = []
		for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns):
		    # if a,n,p,n,p,n,p exist:		series = a + n0 + " - " + p0 + n1 + " - " + p1,	series_index = n2,	title = p2
		    # if a,n,p,n,p exist:		series = a + n0 + " - " + p0, 			series_index = n1,	title = p1	(Example: dnb-id 1008774839)
		    # if a,n,p exist:			series = a,					series_index = n,	title = p
		    # if a exist:												title = a
		    # TODO: a,n,p,n (i.e. 956375146)

		    code_p = []
		    code_n = []
		    code_a = []

		    for j in i.xpath(".//marc21:subfield[@code='p']",namespaces=ns):
			code_p.append(j.text.strip())

		    for j in i.xpath(".//marc21:subfield[@code='n']",namespaces=ns):
			match = re.search("(\d+[,\.\d+]?)", j.text.strip())
			if match:
			    code_n.append(match.group(1))
			else:
			    code_n.append("0")	# looks like sometimes DNB does not know the series index and uses something like "[...]"

		    for j in i.xpath(".//marc21:subfield[@code='a']",namespaces=ns):
			code_a.append(j.text.strip())

		    if len(code_p) == 0:
			title_parts = title_parts + code_a

		    elif len(code_p)>0 and len(code_p) == len(code_n):
			series = " : ".join(code_a)	# I've never seen more than one code_a, but who knows...
			for i in range (0,len(code_p)-1):
			    series = series + " " + code_n[i] + " " + code_p[i]
			series_index = code_n[-1]
			title_parts.append(code_p[-1])


		# subtitle 1: Field 245
		for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='b' and string-length(text())>0]",namespaces=ns):
		    title_parts.append(i.text.strip())
		    break
		
		# subtitle 2
		#for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='c' and string-length(text())>0]",namespaces=ns):
		#    title = title + " / " + i.text.strip()
		#    break

		title = " : ".join(title_parts)

		# Log
		if series_index is not None:
		    log.info("Extracted Series_Index from Field 245: %s" % series_index)
		if series is not None:
		    log.info("Extracted Series from Field 245: %s" % series)
		    series = self.cleanUpSeries(log, series, publisher_name)
		if title is not None:
		    log.info("Extracted Title: %s" % title)
		    title = self.cleanUpTitle(log, title)

		# Title_Sort
		if len(title_parts)>0:
		    title_sort_parts = list(title_parts)
		    title_sort_regex = re.match('^(.*?)('+chr(152)+'.*'+chr(156)+')?(.*?)$',title_parts[0])
		    sortword = title_sort_regex.group(2)
		    if sortword:
			title_sort_parts[0] = ''.join(filter(None,[title_sort_regex.group(1).strip(),title_sort_regex.group(3).strip(),", "+sortword]))
		    title_sort = " : ".join(title_sort_parts)

		# Log
		if title_sort is not None:
		    log.info("Extracted Title_Sort: %s" % title_sort)


		##### Field 100 and Field 700 #####
		# Authors
		for i in record.xpath(".//marc21:datafield[@tag='100']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):	# primary authors
		    name = re.sub(" \[.*\]$","",i.text.strip());
		    authors.append(name)
		for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):	# secondary authors
		    name = re.sub(" \[.*\]$","",i.text.strip());
		    authors.append(name)
		if len(authors)==0:	# if no "real" autor was found take all persons involved
		    for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):	# secondary authors
			name = re.sub(" \[.*\]$","",i.text.strip());
			authors.append(name)
		if len(authors)>0:
		    author_sort = authors[0]

		# Log
		if len(authors)>0:
		    log.info("Extracted Authors: %s" % " & ".join(authors))
		if author_sort is not None:
		    log.info("Extracted Author_Sort: %s" % " & ".join(authors))


		##### Field 856 #####
		# Comments
		for i in record.xpath(".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]",namespaces=ns):
		    if i.text.startswith("http://deposit.dnb.de/"):
			br = self.browser
			log.info('Downloading Comments from: %s' % i.text)
			try:
			    comments = br.open_novisit(i.text, timeout=30).read()
			    comments = re.sub('(\s|<br>|<p>|\n)*Angaben aus der Verlagsmeldung(\s|<br>|<p>|\n)*(<h3>.*?</h3>)?(\s|<br>|<p>|\n)*','',comments,flags=re.IGNORECASE)
			    comments = sanitize_comments_html(comments)
			    break
			except:
			    log.info("Could not download Comments from %s" % i)

		# Log
		if comments is not None:
		    log.info('Comments: %s' % comments)

		# If no comments are found for this edition, look at other editions of this book (Fields 776)
		# TODO: Make this configurable (default: yes)
		if comments is None:
		    # get all other issues
		    for i in record.xpath(".//marc21:datafield[@tag='776']/marc21:subfield[@code='w' and string-length(text())>0]",namespaces=ns):
			other_idn = re.sub("^\(.*\)","",i.text.strip());
			subquery = 'num='+other_idn+' NOT (mat=film OR mat=music OR mat=microfiches OR cod=tt)'
			log.info(subquery)

			if self.cfg_dnb_token is None:
			    subresults = self.getSearchResultsByScraping(log, subquery, timeout)
			else:
			    subresults = self.getSearchResults(log, subquery, timeout)

			if subresults is None:
			    continue

			for subrecord in subresults:
			    for i in subrecord.xpath(".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]",namespaces=ns):
				if i.text.startswith("http://deposit.dnb.de/"):
				    br = self.browser
				    log.info('Downloading Comments from: %s' % i.text)
				    try:
					comments = br.open_novisit(i.text, timeout=30).read()
					comments = re.sub('(\s|<br>|<p>|\n)*Angaben aus der Verlagsmeldung(\s|<br>|<p>|\n)*(<h3>.*?</h3>)?(\s|<br>|<p>|\n)*','',comments,flags=re.IGNORECASE)
					comments = sanitize_comments_html(comments)
					break
				    except:
					log.info("Could not download Comments from %s" % i)
			    if comments is not None:
				log.info('Comments from other issue: %s' % comments)
				break


		##### Field 16 #####
		# ID: IDN
		for i in record.xpath(".//marc21:datafield[@tag='016']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    idn = i.text.strip()
		    break
		# Log
		if idn is not None:
		    log.info("Extracted ID IDN: %s" % idn)


		##### Field 24 #####
		# ID: URN
		for i in record.xpath(".//marc21:datafield[@tag='024']/marc21:subfield[@code='2' and text()='urn']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    urn = i.text.strip()
		    break

		# Log
		if urn is not None:
		    log.info("Extracted ID URN: %s" % urn)


		##### Field 20 #####
		# ID: ISBN
		for i in record.xpath(".//marc21:datafield[@tag='020']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    isbn_regex = "(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]"
		    match = re.search(isbn_regex, i.text.strip())
		    isbn = match.group()
		    isbn = isbn.replace('-','')
		    break

		# Log
		if isbn is not None:
		    log.info("Extracted ID ISBN: %s" % isbn)

		# When doing an exact search for a given ISBN skip books with wrong ISBNs
		if isbn is not None and "isbn" in exact_search:
		    if isbn != exact_search["isbn"]:
			log.info("Extracted ISBN does not match book's ISBN, skipping record")
			continue


		##### Field 82 #####
		# ID: Sachgruppe (DDC)
		for i in record.xpath(".//marc21:datafield[@tag='082']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    ddc.append(i.text.strip())

		# Log
		if len(ddc)>0:
		    log.info("Extracted ID DDC: %s" % ",".join(ddc))


		##### Field 490 #####
		# In theory this field is not used for "real" book series, use field 830 instead. But it is used.
		# Series and Series_Index
		if series is None or (series is not None and series_index == "0"):
		    for i in record.xpath(".//marc21:datafield[@tag='490']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns):
			# "v" either "Nr. 220" or "This great Seriestitle : Nr. 220" - if available use this instead of attribute a
			attr_v = i.xpath(".//marc21:subfield[@code='v']",namespaces=ns)[0].text.strip()
			parts = re.split(" : ",attr_v)
			if len(parts)==2:
			    if bool(re.search("\d",parts[0])) != bool(re.search("\d",parts[1])):
				# figure out which part contains the index
				if bool(re.search("\d",parts[0])):
				    indexpart = parts[0]
				    textpart = parts[1]
				else:
				    indexpart = parts[1]
				    textpart = parts[0]

				match = re.search("(\d+[,\.\d+]?)", indexpart)
				if match is not None:
				    series_index = match.group(1)
				    series = textpart.strip()

			else:
			    match = re.search("(\d+[,\.\d+]?)", attr_v)
			    if match is not None:
				series_index = match.group(1)
			    else:
				series_index = "0"

			series_index = series_index.replace(',','.')

			# Use Series Name from attribute "a" if not already found in attribute "v"
			if series is None:
			    series = i.xpath(".//marc21:subfield[@code='a']",namespaces=ns)[0].text.strip()

			# Log
			if series_index is not None:
			    log.info("Extracted Series Index from Field 490: %s" % series_index)
			if series is not None:
			    log.info("Extracted Series from Field 490: %s" % series)
			    series = self.cleanUpSeries(log, series, publisher_name)
			if series is not None:
			    break


		##### Field 246 #####
		# Series and Series_Index
		if series is None or (series is not None and series_index == "0"):
		    for i in record.xpath(".//marc21:datafield[@tag='246']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
			match = re.search("^(.+?) ; (\d+[,\.\d+]?)$", i.text.strip())
			if match is not None:
			    series = match.group(1)
			    series_index = match.group(2)

			    # Log
			    if series_index is not None:
				log.info("Extracted Series Index from Field 246: %s" % series_index)
			    if series is not None:
				log.info("Extracted Series from Field 246: %s" % series)
				series = self.cleanUpSeries(log, series, publisher_name)
			    if series is not None:
				break

		##### Field 800 #####
		# Series and Series_Index
		if series is None or (series is not None and series_index == "0"):
		    for i in record.xpath(".//marc21:datafield[@tag='800']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='t' and string-length(text())>0]/..",namespaces=ns):
			# Series Index
			series_index = i.xpath(".//marc21:subfield[@code='v']",namespaces=ns)[0].text.strip()
			match = re.search("(\d+[,\.\d+]?)", series_index)
			if match is not None:
			    series_index = match.group(1)
			else:
			    series_index = "0"
			series_index = series_index.replace(',','.')
			# Series
			series = i.xpath(".//marc21:subfield[@code='t']",namespaces=ns)[0].text.strip()

			# Log
			if series_index is not None:
			    log.info("Extracted Series Index from Field 800: %s" % series_index)
			if series is not None:
			    log.info("Extracted Series from Field 800: %s" % series)
			    series = self.cleanUpSeries(log, series, publisher_name)
			if series is not None:
			    break


		##### Field 830 #####
		# Series and Series_Index
		if series is None or (series is not None and series_index == "0"):
		    for i in record.xpath(".//marc21:datafield[@tag='830']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns):
			# Series Index
			series_index = i.xpath(".//marc21:subfield[@code='v']",namespaces=ns)[0].text.strip()
			match = re.search("(\d+[,\.\d+]?)", series_index)
			if match is not None:
			    series_index = match.group(1)
			else:
			    series_index = "0"
			series_index = series_index.replace(',','.')
			# Series
			series = i.xpath(".//marc21:subfield[@code='a']",namespaces=ns)[0].text.strip()

			# Log
			if series_index is not None:
			    log.info("Extracted Series Index from Field 830: %s" % series_index)
			if series is not None:
			    log.info("Extracted Series from Field 830: %s" % series)
			    series = self.cleanUpSeries(log, series, publisher_name)
			if series is not None:
			    break


		##### Field 689 #####
		# GND Subjects
		for i in record.xpath(".//marc21:datafield[@tag='689']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    subjects_gnd.append(i.text.strip())
		for f in range(600,656):
		    for i in record.xpath(".//marc21:datafield[@tag='"+str(f)+"']/marc21:subfield[@code='2' and text()='gnd']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
			if i.text.startswith("("):
			    continue
			subjects_gnd.append(i.text)

		# Log
		if len(subjects_gnd)>0:
		    log.info("Extracted GND Subjects: %s" % " ".join(subjects_gnd))


		##### Fields 600-655 #####
		# Non-GND subjects
		# NOTE(review): this span is the tail of a per-record loop in a method whose
		# `def` lies before this chunk; names like record, ns, title, series, idn,
		# exact_search, log and result_queue are presumably bound earlier -- confirm
		# against the full file.
		for f in range(600,656):
		    for i in record.xpath(".//marc21:datafield[@tag='"+str(f)+"']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
			# ignore entries starting with "(":
			if i.text.startswith("("):
			    continue
			# one subfield may pack several subjects separated by ',' or ';'
			subjects_non_gnd.extend(re.split(',|;',i.text))
		# remove one-character subjects:
		# NOTE(review): calling list.remove() while iterating the same list skips
		# the element after each removal, so consecutive one-character subjects can
		# survive -- iterating over a copy (subjects_non_gnd[:]) would be the usual fix.
		for i in subjects_non_gnd:
		    if len(i)<2:
			subjects_non_gnd.remove(i)

		# Log
		if len(subjects_non_gnd)>0:
		    log.info("Extracted non-GND Subjects: %s" % " ".join(subjects_non_gnd))


		##### Field 250 #####
		# Edition
		# only the first non-empty 250$a is used
		for i in record.xpath(".//marc21:datafield[@tag='250']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    edition = i.text.strip()
		    break

		# Log
		if edition is not None:
		    log.info("Extracted Edition: %s" % edition)


		##### Field 41 #####
		# Languages
		for i in record.xpath(".//marc21:datafield[@tag='041']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    languages.append(i.text.strip())

		# Log
		# NOTE(review): `languages` is a list, so `is not None` is always true;
		# `if languages:` was probably intended to skip logging the empty case.
		if languages is not None:
		    log.info("Extracted Languages: %s" % ",".join(languages))


		##### If configured: Try to separate Series, Series Index and Title from the fetched title #####
		#if self.cfg_guess_series is True:
		# only guess when no series was extracted, or the extracted index is the "0" placeholder
		if (series is None or (series is not None and series_index == "0")) and self.cfg_guess_series is True:
		    guessed_series = None
		    guessed_series_index = None
		    guessed_title = None

		    log.info("Starting Series Guesser")

		    # split "Series : Title"-style titles at the colon
		    # NOTE(review): the regex patterns below are non-raw strings containing
		    # backslash escapes (\d, \s, ...); valid here, but raw strings would be
		    # required to stay warning-free under Python 3.
		    parts = re.split("[:]",self.removeSortingCharacters(title))

		    if len(parts)==2:
			log.info("Title has two parts")
			# make sure only one part of the two parts contains digits
			if bool(re.search("\d",parts[0])) != bool(re.search("\d",parts[1])):
			    log.info("only one title part contains digits")
			    # figure out which part contains the index
			    if bool(re.search("\d",parts[0])):
				indexpart = parts[0]
				textpart = parts[1]
			    else:
				indexpart = parts[1]
				textpart = parts[0]

			    # Look at the part without digits:
			    match = re.match("^[\s\-–:]*(.+?)[\s\-–:]*$",textpart)	# remove odd characters from start and end of the text part
			    if match:
				textpart = match.group(1)

			    # Look at the part with digits:
			    # for Titleparts like: "Name of the series - Episode 2"
			    match = re.match("^\s*(\S\D*?[a-zA-Z]\D*?)\W[\(\/\.,\s\-–:]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",indexpart)
			    if match:
				guessed_series_index = match.group(2)
				guessed_series = match.group(1)
				# NOTE(review): group(1) matched just above, so this
				# None-branch looks unreachable -- confirm intent.
				if guessed_series is None:
				    guessed_series = textpart
				    guessed_title = textpart + " : Band " + guessed_series_index
				else:
				    guessed_title = textpart

				#log.info("ALGO1: guessed_title: " + guessed_title)
				#log.info("ALGO1: guessed_series: " + guessed_series)
				#log.info("ALGO1: guessed_series_index: " + guessed_series_index)

			    else:
				# for Titleparts like: "Episode 2 Name of the series"
				match = re.match("^\s*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*(\S\D*?[a-zA-Z]\D*?)[\/\.,\-–\s]*$",indexpart)
				if match:
				    guessed_series_index = match.group(1)
				    guessed_series = match.group(2)

				    if guessed_series is None:
					# sometimes books with multiple volumes are detected as series without name -> Add the volume to the title 
					guessed_series = textpart
					guessed_title = textpart + " : Band " + guessed_series_index
				    else:
					guessed_title = textpart

				    #log.info("ALGO2: guessed_title: " + guessed_title)
				    #log.info("ALGO2: guessed_series: " + guessed_series)
				    #log.info("ALGO2: guessed_series_index: " + guessed_series_index)

				else:
				    # for titleparts like: "Band 2"
				    match = re.match("^[\s\(]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*[\/\.,\-–\s]*$",indexpart)
				    if match:
					guessed_series_index = match.group(1)
					# ...with textpart like NAME OF SERIES\s[\-\.;:]\sNAME OF TITLE
					# some false positives
					match = re.match("^\s*(\w+.+?)\s?[\.;\-–:]+\s(\w+.+)\s*$",textpart)
					if match:
					    guessed_series = match.group(1)
					    guessed_title = match.group(2)

					    log.info("ALGO3: guessed_title: " + guessed_title)
					    log.info("ALGO3: guessed_series: " + guessed_series)
					    log.info("ALGO3: guessed_series_index: " + guessed_series_index)


		    elif len(parts)==1:
			log.info("Title has one part")
			# for Titles like: "Name of the series - Title (Episode 2)"
			match = re.match("^\s*(\S.+?) \- (\S.+?) [\(\/\.,\s\-–:](?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",parts[0])
			if match:
			    guessed_series_index = match.group(3)
			    guessed_series = match.group(1)
			    guessed_title = match.group(2)

			    #log.info("ALGO4: guessed_title: " + guessed_title)
			    #log.info("ALGO4: guessed_series: " + guessed_series)
			    #log.info("ALGO4: guessed_series_index: " + guessed_series_index)

			else:
			    # for Titles like: "Name of the series - Episode 2"
			    match = re.match("^\s*(\S.+?)[\(\/\.,\s\-–:]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",parts[0])
			    if match:
				guessed_series_index = match.group(2)
				guessed_series = match.group(1)
				guessed_title = guessed_series + " : Band " + guessed_series_index

				#log.info("ALGO5: guessed_title: " + guessed_title)
				#log.info("ALGO5: guessed_series: " + guessed_series)
				#log.info("ALGO5: guessed_series_index: " + guessed_series_index)

		    # Log
		    if guessed_series is not None:
			log.info("Guessed Series: %s" % guessed_series)
			#guessed_series = self.cleanUpSeries(log, guessed_series, publisher_name)
		    if guessed_series_index is not None:
			log.info("Guessed Series Index: %s" % guessed_series_index)
		    if guessed_title is not None:
			log.info("Guessed Title: %s" % guessed_title)
			guessed_title = self.cleanUpTitle(log, guessed_title)

		    # adopt the guess only when all three pieces were found
		    if guessed_series is not None and guessed_series_index is not None and guessed_title is not None:
			title = guessed_title
			series = guessed_series
			series_index = guessed_series_index


		##### Filter exact searches #####
		# When doing an exact search for a given IDN skip books with wrong IDNs
		# TODO: Currently exact_search for ISBN is not implemented. Would require ISBN-10 and ISBN-13 conversions
		if idn is not None and "idn" in exact_search:
		    if idn != exact_search["idn"]:
			log.info("Extracted IDN does not match book's IDN, skipping record")
			continue

		##### Put it all together #####
		if self.cfg_append_edition_to_title == True and edition is not None:
		    title = title + " : " + edition

		# NOTE(review): under Python 3 map() returns a lazy iterator, not a list;
		# this file's idioms look Python 2 -- confirm Metadata accepts an iterator
		# before porting.
		mi = Metadata(self.removeSortingCharacters(title), map(lambda i: self.removeSortingCharacters(i), authors))
		mi.title_sort = self.removeSortingCharacters(title_sort)
		mi.author_sort = self.removeSortingCharacters(author_sort)
		mi.languages = languages
		mi.pubdate = pubdate
		# filter(None, ...) drops a missing location or publisher name
		mi.publisher = " ; ".join(filter(None,[publisher_location, self.removeSortingCharacters(publisher_name)]))
		mi.series = self.removeSortingCharacters(series)
		mi.series_index = series_index
		mi.comments = comments
		mi.isbn = isbn # also required for cover download
		mi.set_identifier('urn',urn)
		mi.set_identifier('dnb-idn',idn)
		mi.set_identifier('ddc', ",".join(ddc))

		# cfg_subjects:
		# 0: use only subjects_gnd
		if self.cfg_fetch_subjects == 0:
		    mi.tags = self.uniq(subjects_gnd)
		# 1: use only subjects_gnd if found, else subjects_non_gnd
		elif self.cfg_fetch_subjects == 1:
		    if len(subjects_gnd)>0:
			mi.tags = self.uniq(subjects_gnd)
		    else:
			mi.tags = self.uniq(subjects_non_gnd)
		# 2: subjects_gnd and subjects_non_gnd
		elif self.cfg_fetch_subjects == 2:
		    mi.tags = self.uniq(subjects_gnd + subjects_non_gnd)
		# 3: use only subjects_non_gnd if found, else subjects_gnd
		elif self.cfg_fetch_subjects == 3:
		    if len(subjects_non_gnd)>0:
			mi.tags = self.uniq(subjects_non_gnd)
		    else:
			mi.tags = self.uniq(subjects_gnd)
		# 4: use only subjects_non_gnd
		elif self.cfg_fetch_subjects == 4:
		    mi.tags = self.uniq(subjects_non_gnd)
		# 5: use no subjects at all
		elif self.cfg_fetch_subjects == 5:
		    mi.tags = []

		# put current result's metdata into result queue
		log.info("Final formatted result: \n%s" % mi)
		result_queue.put(mi)