Example #1
    def parse(self, xml_detail, xml_more_info):
        title = self.parse_title(xml_detail)
        authors = self.parse_authors(xml_detail)
        comments = self.parse_comments(xml_detail)
        rating = self.parse_rating(xml_detail)
        isbn = self.parse_isbn(xml_more_info)
        publisher = self.parse_publisher(xml_detail)
        tags = self.parse_tags(xml_detail, xml_more_info)
        serie, serie_index = self.parse_serie(xml_detail)
        pub_year = self.parse_pub_year(xml_detail, xml_more_info)
        cover = self.parse_cover(xml_detail)

        if title is not None and authors is not None:
            mi = Metadata(as_unicode(title), authors)
            mi.languages = {'ces'}
            mi.comments = as_unicode(comments)
            mi.identifiers = {self.plugin.name:self.ident}
            mi.rating = rating
            mi.tags = tags
            mi.publisher = publisher
            mi.pubdate = pub_year
            mi.isbn = isbn
            mi.series = serie
            mi.series_index = serie_index
            mi.cover_url = cover

            if cover:
                self.plugin.cache_identifier_to_cover_url(self.ident, cover)

            return mi
        else:
            self.log('Result skipped because title or authors were not found')
            return None
Example #2
def build_meta(log, issue_id):
    """Build metadata record based on comicvine issue_id"""
    issue = pycomicvine.Issue(
        issue_id,
        field_list=[
            "id",
            "name",
            "volume",
            "issue_number",
            "person_credits",
            "description",
            "store_date",
            "cover_date",
        ],
    )
    if not issue or not issue.volume:
        log.warn("Unable to load Issue(%d)" % issue_id)
        return None
    title = "%s #%s" % (issue.volume.name, issue.issue_number)
    if issue.name:
        title = title + ": %s" % (issue.name)
    authors = [p.name for p in issue.person_credits]
    meta = Metadata(title, authors)
    meta.series = issue.volume.name
    meta.series_index = str(issue.issue_number)
    meta.set_identifier("comicvine", str(issue.id))
    meta.comments = issue.description
    meta.has_cover = False
    if issue.volume.publisher:
        meta.publisher = issue.volume.publisher.name
    meta.pubdate = issue.store_date or issue.cover_date
    return meta
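For orientation, a small usage sketch follows; it is not part of the scraped sources. It shows how a build_meta-style helper such as the one above might be called. SimpleLog and the default issue id are made-up placeholders, and the pycomicvine API-key setup the real plugin performs is omitted.

# Hypothetical caller for the build_meta() helper above (illustrative only).
# SimpleLog stands in for calibre's log object; the issue id is a placeholder.
class SimpleLog:
    def warn(self, msg):
        print('WARN:', msg)

def print_issue_metadata(issue_id=123456):
    meta = build_meta(SimpleLog(), issue_id)
    if meta is not None:
        print('%s by %s' % (meta.title, ' & '.join(meta.authors)))
        print('Series: %s #%s' % (meta.series, meta.series_index))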
Example #3
    def parse(self, xml_detail):
        title = self.parse_title(xml_detail)
        authors = self.parse_authors(xml_detail)
        comments = self.parse_comments(xml_detail)
        rating = self.parse_rating(xml_detail)
        isbn = self.parse_isbn(xml_detail)
        publisher = self.parse_publisher(xml_detail)
        pub_year = self.parse_pubdate(xml_detail)
        tags = self.parse_tags(xml_detail)
        serie, serie_index = self.parse_serie(xml_detail)
        cover = self.parse_cover(xml_detail)

        if title is not None and authors is not None:
            mi = Metadata(title, authors)
            mi.languages = {'ces'}
            mi.comments = as_unicode(comments)
            mi.identifiers = {self.plugin.name:str(self.number)}
            mi.rating = rating
            mi.tags = tags
            mi.publisher = publisher
            mi.pubdate = pub_year
            mi.isbn = isbn
            mi.series = serie
            mi.series_index = serie_index
            mi.cover_url = cover

            if cover:
                self.plugin.cache_identifier_to_cover_url(str(self.number), cover)

            return mi
        else:
            return None
Example #4
def convert_markdown_with_metadata(txt, title='', extensions=DEFAULT_MD_EXTENSIONS):
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import parse_only_date
    from calibre.db.write import get_series_values
    if 'meta' not in extensions:
        extensions.append('meta')
    md = create_markdown_object(extensions)
    html = md.convert(txt)
    mi = Metadata(title or _('Unknown'))
    m = md.Meta
    for k, v in iteritems({'date':'pubdate', 'summary':'comments'}):
        if v not in m and k in m:
            m[v] = m.pop(k)
    for k in 'title authors series tags pubdate comments publisher rating'.split():
        val = m.get(k)
        if val:
            mf = mi.metadata_for_field(k)
            if not mf.get('is_multiple'):
                val = val[0]
            if k == 'series':
                val, si = get_series_values(val)
                mi.series_index = 1 if si is None else si
            if k == 'rating':
                try:
                    val = max(0, min(int(float(val)), 10))
                except Exception:
                    continue
            if mf.get('datatype') == 'datetime':
                try:
                    val = parse_only_date(val, assume_utc=False)
                except Exception:
                    continue
            setattr(mi, k, val)
    return mi, HTML_TEMPLATE % (mi.title, html)
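To make the expected input concrete, here is a hedged sketch (not from the original source) of the Markdown front matter the 'meta' extension parses: the 'Date' and 'Summary' keys are remapped to pubdate and comments by the loop above, and the bracketed number after the series name is assumed to be picked up by get_series_values as the series index.

# Hypothetical input for convert_markdown_with_metadata(); the 'meta' extension
# reads "Key: value" lines at the very top of the document.
sample = '''Title: A Sample Book
Authors: Jane Doe
Series: Sample Series [2]
Tags: fiction
Date: 2020-01-01
Summary: A short example blurb.

# Chapter One

Body text.
'''
# mi, html = convert_markdown_with_metadata(sample, title='Fallback Title')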
Example #5
def test(scale=0.5):
    from PyQt5.Qt import QLabel, QApplication, QPixmap, QMainWindow, QWidget, QScrollArea, QGridLayout
    app = QApplication([])
    mi = Metadata('xxx', ['Kovid Goyal', 'John Q. Doe', 'Author'])
    mi.series = 'A series of styles'
    m = QMainWindow()
    sa = QScrollArea(m)
    w = QWidget(m)
    sa.setWidget(w)
    l = QGridLayout(w)
    w.setLayout(l), l.setSpacing(30)
    labels = []
    for r, color in enumerate(sorted(default_color_themes)):
        for c, style in enumerate(sorted(all_styles())):
            mi.series_index = c + 1
            mi.title = 'An algorithmic cover [%s]' % color
            prefs = override_prefs(cprefs, override_color_theme=color, override_style=style)
            for x in ('cover_width', 'cover_height', 'title_font_size', 'subtitle_font_size', 'footer_font_size'):
                prefs[x] = int(scale * prefs[x])
            img = generate_cover(mi, prefs=prefs, as_qimage=True)
            la = QLabel()
            la.setPixmap(QPixmap.fromImage(img))
            l.addWidget(la, r, c)
            labels.append(la)
    m.setCentralWidget(sa)
    w.resize(w.sizeHint())
    m.show()
    app.exec_()
Example #6
def test(scale=0.25):
    from PyQt5.Qt import QLabel, QPixmap, QMainWindow, QWidget, QScrollArea, QGridLayout
    from calibre.gui2 import Application
    app = Application([])
    mi = Metadata('Unknown', ['Kovid Goyal', 'John & Doe', 'Author'])
    mi.series = 'A series & styles'
    m = QMainWindow()
    sa = QScrollArea(m)
    w = QWidget(m)
    sa.setWidget(w)
    l = QGridLayout(w)
    w.setLayout(l), l.setSpacing(30)
    scale *= w.devicePixelRatioF()
    labels = []
    for r, color in enumerate(sorted(default_color_themes)):
        for c, style in enumerate(sorted(all_styles())):
            mi.series_index = c + 1
            mi.title = 'An algorithmic cover [%s]' % color
            prefs = override_prefs(cprefs, override_color_theme=color, override_style=style)
            scale_cover(prefs, scale)
            img = generate_cover(mi, prefs=prefs, as_qimage=True)
            img.setDevicePixelRatio(w.devicePixelRatioF())
            la = QLabel()
            la.setPixmap(QPixmap.fromImage(img))
            l.addWidget(la, r, c)
            labels.append(la)
    m.setCentralWidget(sa)
    w.resize(w.sizeHint())
    m.show()
    app.exec_()
Example #7
def get_series(title, authors, timeout=60):
    mi = Metadata(title, authors)
    if title and title[0] in _ignore_starts:
        title = title[1:]
    title = re.sub(r'^(A|The|An)\s+', '', title).strip()
    if not title:
        return mi
    if isinstance(title, unicode):
        title = title.encode('utf-8')

    title = urllib.quote_plus(title)

    author = authors[0].strip()
    if not author:
        return mi
    if ',' in author:
        author = author.split(',')[0]
    else:
        author = author.split()[-1]

    url = URL.format(author, title)
    br = browser()
    try:
        raw = br.open_novisit(url, timeout=timeout).read()
    except URLError as e:
        if isinstance(e.reason, socket.timeout):
            raise Exception('KDL Server busy, try again later')
        raise
    if 'see the full results' not in raw:
        return mi
    raw = xml_to_unicode(raw)[0]
    soup = BeautifulSoup(raw)
    searcharea = soup.find('div', attrs={'class':'searcharea'})
    if searcharea is None:
        return mi
    ss = searcharea.find('div', attrs={'class':'seriessearch'})
    if ss is None:
        return mi
    a = ss.find('a', href=True)
    if a is None:
        return mi
    href = a['href'].partition('?')[-1]
    data = urlparse.parse_qs(href)
    series = data.get('SeriesName', [])
    if not series:
        return mi
    series = series[0]
    series = re.sub(r' series$', '', series).strip()
    if series:
        mi.series = series
    ns = ss.nextSibling
    if ns.contents:
        raw = unicode(ns.contents[0])
        raw = raw.partition('.')[0].strip()
        try:
            mi.series_index = int(raw)
        except:
            pass
    return mi
Example #8
 def default_mi(self):
     from calibre.ebooks.metadata.book.base import Metadata
     mi = Metadata(_('A sample book'), [_('Author One'), _('Author Two')])
     mi.series = _('A series of samples')
     mi.series_index = 4
     mi.tags = [_('Tag One'), _('Tag Two')]
     mi.publisher = _('Some publisher')
     mi.rating = 4
     mi.identifiers = {'isbn':'123456789', 'url': 'http://calibre-ebook.com'}
     mi.languages = ['eng', 'fra']
     mi.pubdate = mi.timestamp = now()
     return mi
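As a quick illustration (again, not from the original sources), a tiny helper that dumps the standard fields such a sample Metadata object carries; it only reads attributes that default_mi() sets.

# Hypothetical helper: print a handful of standard fields from a Metadata object.
def describe_mi(mi):
    for field in ('title', 'authors', 'series', 'series_index',
                  'tags', 'publisher', 'rating', 'identifiers', 'languages'):
        print('%s: %s' % (field, getattr(mi, field, None)))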
Example #9
    def data2mi(self, item):
        """Converts a single metadata answer in the form of a dict to a MetadataInformation object"""

        mi = Metadata(_('Unknown'))

        # Regular metadata
        mi.title = item.get('title', None)
        mi.authors = item.get('authors', [])
        mi.publisher = item.get('publisher', None)

        if 'id' in item.keys(): mi.set_identifier(self.idkey, item['id'])
        if 'doi' in item.keys(): mi.set_identifier('doi', item['doi'])
        if 'isbn' in item.keys(): mi.set_identifier('isbn', item['isbn'])

        if 'updated' in item.keys(): mi.pubdate = parse_date(item['updated'], assume_utc=True)

        if 'series' in item.keys():
            mi.series = item['series']
            mi.series_index = self.format_series_index(item.get('series_index'), None)

        if 'year' in item.keys(): mi.pubdate = parse_date(item['year'], assume_utc=True)

        if 'abstract' in item.keys(): mi.comments = self.format_abstract(item['abstract'])

        if 'language' in item.keys(): mi.language = item['language']

        if 'journal' in item.keys():
            mi.series = item['journal']
            mi.series_index = self.format_series_index(item.get('volume'), item.get('number'))

        if 'subject' in item.keys():
            tags = set([])
            for s in item['subject']:
                tags.update(msc_tags(s))
                tags.update(arxiv_tags(s))

            mi.tags = list(sorted(tags))

        return mi
Example #10
    def parse(self, xml_detail):
        sys_ident = title = isbn = publisher = pub_year = serie = serie_index = cover = None
        authors = []
        tags = []
        xpath = self.XPath('//table[@id="record"]//tr')
        for row in xpath(xml_detail):
            ch = row.getchildren()
            txt = ch[0].text.strip()
            data = self.normalize(ch[1].text)
            if txt.startswith('245') and title is None:
                title = self.parse_title(data)
            if txt.startswith('246'):
                title = self.parse_title(data)
            elif txt.startswith('100') or txt.startswith('700'):
                res = self.parse_author(data)
                if res is not None:
                    authors.append(res)
            elif txt == 'SYS':
                sys_ident = data.strip()
            elif txt == '020':
                isbn = self.parse_isbn(data)
            elif txt == '260':
                publisher, pub_year = self.parse_publisher(data)
            elif txt.startswith('490') and serie is None:
                serie, serie_index = self.parse_serie(data)
            elif txt == '655 7':
                tags.append(self.parse_tags(data))

        if isbn is not None and isbn != '':
            cover = self.parse_cover(isbn)

        if title is not None and len(authors) > 0 and sys_ident is not None:
            mi = Metadata(title, authors)
            mi.languages = {'ces'}
            mi.identifiers = {self.plugin.name:sys_ident}
            mi.tags = tags
            mi.publisher = publisher
            mi.pubdate = pub_year
            mi.isbn = isbn
            mi.series = serie
            mi.series_index = serie_index
            mi.cover_url = cover

            if cover:
                self.plugin.cache_identifier_to_cover_url(sys_ident, cover)

            return mi
        else:
            self.log('Data not found')
            return None
Example #11
def build_meta(log, issue_id):
    """Build metadata record based on comicvine issue_id."""
    issue = PyComicvineWrapper(log).lookup_issue(issue_id)
    if issue:
        meta = Metadata(issue.get_full_title(), issue.get_authors())
        meta.series = issue.volume_name
        meta.series_index = issue.issue_number
        meta.set_identifier('comicvine', str(issue.id))
        meta.set_identifier('comicvine-volume', str(issue.volume_id))
        meta.comments = issue.description
        meta.has_cover = False
        meta.publisher = issue.publisher_name
        meta.pubdate = issue.date
        return meta
    else:
        return None
Example #12
    def parse(self, xml_detail):
        title = self.parse_title(xml_detail)
        authors = self.parse_authors(xml_detail)
        comments = self.parse_comments(xml_detail)
        rating = self.parse_rating(xml_detail)
        tags = self.parse_tags(xml_detail)
        serie, serie_index = self.parse_serie(xml_detail)

        if title is not None and authors is not None:
            mi = Metadata(title, authors)
            mi.languages = {'ces'}
            mi.comments = as_unicode(comments)
            mi.identifiers = {self.plugin.name:self.ident}
            mi.rating = rating
            mi.tags = tags
            mi.series = serie
            mi.series_index = serie_index
            return mi
        else:
            return None
Example #13
def build_meta(log, issue_id):
  '''Build metadata record based on comicvine issue_id'''
  issue = pycomicvine.Issue(issue_id, field_list=[
      'id', 'name', 'volume', 'issue_number', 'person_credits', 'description', 
      'store_date', 'cover_date'])
  if not issue or not issue.volume:
    log.warn('Unable to load Issue(%d)' % issue_id)
    return None
  title = '%s #%s' %  (issue.volume.name, issue.issue_number)
  if issue.name:
    title = title + ': %s' % (issue.name)
  authors = [p.name for p in issue.person_credits]
  meta = Metadata(title, authors)
  meta.series = issue.volume.name
  meta.series_index = str(issue.issue_number)
  meta.set_identifier('comicvine', str(issue.id))
  meta.set_identifier('comicvine-volume', str(issue.volume.id))
  meta.comments = issue.description
  meta.has_cover = False
  if issue.volume.publisher:
    meta.publisher = issue.volume.publisher.name
  meta.pubdate = issue.store_date or issue.cover_date
  return meta
Example #14
    def get_details(self):
        self.log.info("    Worker.get_details:")
        self.log.info("        self:     ", self)
        self.log.info("        self.url: ", self.url)

        try:
            raw = self.browser.open_novisit(
                self.url, timeout=self.timeout).read().strip()
            self.log.info(raw)
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
                self.log.error('URL malformed: %r' % self.url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = 'Bookmeta for biblionet timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r' % self.url
                self.log.exception(msg)
            return

        if '<title>404 - ' in raw:
            self.log.error('URL malformed: %r' % self.url)
            return

        try:
            # root = fromstring(clean_ascii_chars(raw))
            root = json.loads(raw)
            self.log.info(root)
        except:
            msg = 'Failed to parse book detail page: %r' % self.url
            self.log.exception(msg)
            return

        try:
            self.biblionetid = root['biblionetid']
        except:
            self.log.exception('Error parsing book id for url: %r' % self.url)
            self.biblionetid = None

        try:
            self.title = root['title'].strip()
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)
            self.title = None
            self.series_index = None

        try:
            self.authors = [root['authors'].strip()]
            self.log.info(self.authors)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            self.authors = None

        try:
            self.cover_url = root['cover_url']
            self.log.info('Parsed URL for cover:%r' % self.cover_url)
            self.plugin.cache_identifier_to_cover_url(self.biblionetid,
                                                      self.cover_url)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)
            self.has_cover = bool(self.cover_url)

        try:
            self.publisher = root['publisher']
            self.log.info('Parsed publisher:%s' % self.publisher)
        except:
            self.log.exception('Error parsing publisher for url: %r' %
                               self.url)

        try:
            self.tags = root['categories'].replace('DDC: ', 'DDC:').replace(
                '-', '').split()[:-1]
            self.log.info('Parsed tags:%s' % self.tags)
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        try:
            self.pubdate = root['yr_published']
            self.log.info('Parsed publication date:%s' % self.pubdate)
        except:
            self.log.exception('Error parsing published date for url: %r' %
                               self.url)

        mi = Metadata(self.title, self.authors)
        mi.set_identifier('biblionet', self.biblionetid)

        if self.series_index:
            try:
                mi.series_index = float(self.series_index)
            except:
                self.log.exception('Error loading series')
        if self.relevance:
            try:
                mi.source_relevance = self.relevance
            except:
                self.log.exception('Error loading relevance')
        if self.cover_url:
            try:
                mi.cover_url = self.cover_url
            except:
                self.log.exception('Error loading cover_url')
        if self.publisher:
            try:
                mi.publisher = self.publisher
            except:
                self.log.exception('Error loading publisher')
        if self.tags:
            try:
                mi.tags = self.tags
            except:
                self.log.exception('Error loading tags')
        if self.pubdate:
            try:
                if self.pubdate not in (self.yr_msg1, self.yr_msg2):
                    d = datetime.date(int(self.pubdate), 1, 1)
                    mi.pubdate = d
            except:
                self.log.exception('Error loading pubdate')

        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
Example #15
    def _get_metadata(self, book_id, get_user_categories=True):  # {{{
        mi = Metadata(None, template_cache=self.formatter_template_cache)
        author_ids = self._field_ids_for('authors', book_id)
        aut_list = [self._author_data(i) for i in author_ids]
        aum = []
        aus = {}
        aul = {}
        for rec in aut_list:
            aut = rec['name']
            aum.append(aut)
            aus[aut] = rec['sort']
            aul[aut] = rec['link']
        mi.title = self._field_for('title',
                                   book_id,
                                   default_value=_('Unknown'))
        mi.authors = aum
        mi.author_sort = self._field_for('author_sort',
                                         book_id,
                                         default_value=_('Unknown'))
        mi.author_sort_map = aus
        mi.author_link_map = aul
        mi.comments = self._field_for('comments', book_id)
        mi.publisher = self._field_for('publisher', book_id)
        n = nowf()
        mi.timestamp = self._field_for('timestamp', book_id, default_value=n)
        mi.pubdate = self._field_for('pubdate', book_id, default_value=n)
        mi.uuid = self._field_for('uuid', book_id, default_value='dummy')
        mi.title_sort = self._field_for('sort',
                                        book_id,
                                        default_value=_('Unknown'))
        mi.book_size = self._field_for('size', book_id, default_value=0)
        mi.ondevice_col = self._field_for('ondevice',
                                          book_id,
                                          default_value='')
        mi.last_modified = self._field_for('last_modified',
                                           book_id,
                                           default_value=n)
        formats = self._field_for('formats', book_id)
        mi.format_metadata = {}
        mi.languages = list(self._field_for('languages', book_id))
        if not formats:
            good_formats = None
        else:
            mi.format_metadata = FormatMetadata(self, book_id, formats)
            good_formats = FormatsList(formats, mi.format_metadata)
        mi.formats = good_formats
        mi.has_cover = _('Yes') if self._field_for(
            'cover', book_id, default_value=False) else ''
        mi.tags = list(self._field_for('tags', book_id, default_value=()))
        mi.series = self._field_for('series', book_id)
        if mi.series:
            mi.series_index = self._field_for('series_index',
                                              book_id,
                                              default_value=1.0)
        mi.rating = self._field_for('rating', book_id)
        mi.set_identifiers(
            self._field_for('identifiers', book_id, default_value={}))
        mi.application_id = book_id
        mi.id = book_id
        composites = []
        for key, meta in self.field_metadata.custom_iteritems():
            mi.set_user_metadata(key, meta)
            if meta['datatype'] == 'composite':
                composites.append(key)
            else:
                val = self._field_for(key, book_id)
                if isinstance(val, tuple):
                    val = list(val)
                extra = self._field_for(key + '_index', book_id)
                mi.set(key, val=val, extra=extra)
        for key in composites:
            mi.set(key, val=self._composite_for(key, book_id, mi))

        user_cat_vals = {}
        if get_user_categories:
            user_cats = self.backend.prefs['user_categories']
            for ucat in user_cats:
                res = []
                for name, cat, ign in user_cats[ucat]:
                    v = mi.get(cat, None)
                    if isinstance(v, list):
                        if name in v:
                            res.append([name, cat])
                    elif name == v:
                        res.append([name, cat])
                user_cat_vals[ucat] = res
        mi.user_categories = user_cat_vals

        return mi
Example #16
def get_metadata_(src, encoding=None):
    # Meta data definitions as in
    # http://www.mobileread.com/forums/showpost.php?p=712544&postcount=9

    if isbytestring(src):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long
    comment_tags = parse_comment_tags(src)
    meta_tags = parse_meta_tags(src)

    def get(field):
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = ans.strip()
        if not ans:
            ans = None
        return ans

    # Title
    title = get('title')
    if not title:
        pat = re.compile('<title>([^<>]+?)</title>', re.IGNORECASE)
        match = pat.search(src)
        if match:
            title = replace_entities(match.group(1))

    # Author
    authors = get('authors') or _('Unknown')

    # Create MetaInformation with Title and Author
    mi = Metadata(title or _('Unknown'), string_to_authors(authors))

    for field in ('publisher', 'isbn', 'language', 'comments'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except:
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)

    # SERIES
    series = get('series')
    if series:
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except:
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            series_index = get('series_index')
            try:
                series_index = float(series_index)
            except:
                pass
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 5:
                mi.rating /= 2.
            if mi.rating > 5:
                mi.rating = 0
        except:
            pass

    # TAGS
    tags = get('tags')
    if tags:
        tags = [x.strip() for x in tags.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    return mi
Example #17
    def parse_details(self, root):
        try:
            goodreads_id = self.parse_goodreads_id(self.url)
        except:
            self.log.exception("Error parsing goodreads id for url: %r" % self.url)
            goodreads_id = None

        try:
            (title, series, series_index) = self.parse_title_series(root)
        except:
            self.log.exception("Error parsing title and series for url: %r" % self.url)
            title = series = series_index = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception("Error parsing authors for url: %r" % self.url)
            authors = []

        if not title or not authors or not goodreads_id:
            self.log.error("Could not find title/authors/goodreads id for %r" % self.url)
            self.log.error("Goodreads: %r Title: %r Authors: %r" % (goodreads_id, title, authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        mi.set_identifier("goodreads", goodreads_id)
        self.goodreads_id = goodreads_id

        try:
            isbn = self.parse_isbn(root)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception("Error parsing ISBN for url: %r" % self.url)

        try:
            mi.rating = self.parse_rating(root)
        except:
            self.log.exception("Error parsing ratings for url: %r" % self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception("Error parsing comments for url: %r" % self.url)

        try:
            self.cover_url = self.parse_cover(root)
        except:
            self.log.exception("Error parsing cover for url: %r" % self.url)
        mi.has_cover = bool(self.cover_url)

        try:
            tags = self.parse_tags(root)
            if tags:
                mi.tags = tags
        except:
            self.log.exception("Error parsing tags for url: %r" % self.url)

        try:
            mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
        except:
            self.log.exception("Error parsing publisher and date for url: %r" % self.url)

        mi.source_relevance = self.relevance

        if self.goodreads_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.goodreads_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(self.goodreads_id, self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
Example #18
#!/usr/bin/env python
Example #19
def get_metadata_(src, encoding=None):
    # Meta data definitions as in
    # https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9

    if isbytestring(src):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long
    comment_tags, meta_tags, meta_tag_ids, title_tag = parse_metadata(src)

    def get_all(field):
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = [x.strip() for x in ans if x.strip()]
        if not ans:
            ans = None
        return ans

    def get(field):
        ans = get_all(field)
        if ans:
            ans = ans[0]
        return ans

    # Title
    title = get('title') or title_tag.strip() or _('Unknown')

    # Author
    authors = authors_to_string(get_all('authors')) or _('Unknown')

    # Create MetaInformation with Title and Author
    mi = Metadata(title, string_to_authors(authors))

    # Single-value text fields
    for field in ('publisher', 'isbn'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    # Multi-value text fields
    for field in ('languages', ):
        val = get_all(field)
        if val:
            setattr(mi, field, val)

    # HTML fields
    for field in ('comments', ):
        val = get(field)
        if val:
            setattr(
                mi, field,
                val.replace('&', '&amp;').replace('<', '&lt;').replace(
                    '>', '&gt;').replace('"', '&quot;').replace("'", '&apos;'))

    # Date fields
    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except:
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)

    # SERIES
    series = get('series')
    if series:
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except:
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            series_index = get('series_index')
            try:
                series_index = float(series_index)
            except:
                pass
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 10:
                mi.rating = 0
        except:
            pass

    # TAGS
    tags = get_all('tags')
    if tags:
        tags = [x.strip() for s in tags for x in s.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    # IDENTIFIERS
    for (k, v) in iteritems(meta_tag_ids):
        v = [x.strip() for x in v if x.strip()]
        if v:
            mi.set_identifier(k, v[0])

    return mi
Example #20
    def _get_metadata(self, book_id, get_user_categories=True):  # {{{
        mi = Metadata(None, template_cache=self.formatter_template_cache)
        author_ids = self._field_ids_for('authors', book_id)
        aut_list = [self._author_data(i) for i in author_ids]
        aum = []
        aus = {}
        aul = {}
        for rec in aut_list:
            aut = rec['name']
            aum.append(aut)
            aus[aut] = rec['sort']
            aul[aut] = rec['link']
        mi.title       = self._field_for('title', book_id,
                default_value=_('Unknown'))
        mi.authors     = aum
        mi.author_sort = self._field_for('author_sort', book_id,
                default_value=_('Unknown'))
        mi.author_sort_map = aus
        mi.author_link_map = aul
        mi.comments    = self._field_for('comments', book_id)
        mi.publisher   = self._field_for('publisher', book_id)
        n = nowf()
        mi.timestamp   = self._field_for('timestamp', book_id, default_value=n)
        mi.pubdate     = self._field_for('pubdate', book_id, default_value=n)
        mi.uuid        = self._field_for('uuid', book_id,
                default_value='dummy')
        mi.title_sort  = self._field_for('sort', book_id,
                default_value=_('Unknown'))
        mi.book_size   = self._field_for('size', book_id, default_value=0)
        mi.ondevice_col = self._field_for('ondevice', book_id, default_value='')
        mi.last_modified = self._field_for('last_modified', book_id,
                default_value=n)
        formats = self._field_for('formats', book_id)
        mi.format_metadata = {}
        mi.languages = list(self._field_for('languages', book_id))
        if not formats:
            good_formats = None
        else:
            mi.format_metadata = FormatMetadata(self, book_id, formats)
            good_formats = FormatsList(formats, mi.format_metadata)
        mi.formats = good_formats
        mi.has_cover = _('Yes') if self._field_for('cover', book_id,
                default_value=False) else ''
        mi.tags = list(self._field_for('tags', book_id, default_value=()))
        mi.series = self._field_for('series', book_id)
        if mi.series:
            mi.series_index = self._field_for('series_index', book_id,
                    default_value=1.0)
        mi.rating = self._field_for('rating', book_id)
        mi.set_identifiers(self._field_for('identifiers', book_id,
            default_value={}))
        mi.application_id = book_id
        mi.id = book_id
        composites = []
        for key, meta in self.field_metadata.custom_iteritems():
            mi.set_user_metadata(key, meta)
            if meta['datatype'] == 'composite':
                composites.append(key)
            else:
                val = self._field_for(key, book_id)
                if isinstance(val, tuple):
                    val = list(val)
                extra = self._field_for(key+'_index', book_id)
                mi.set(key, val=val, extra=extra)
        for key in composites:
            mi.set(key, val=self._composite_for(key, book_id, mi))

        user_cat_vals = {}
        if get_user_categories:
            user_cats = self.backend.prefs['user_categories']
            for ucat in user_cats:
                res = []
                for name,cat,ign in user_cats[ucat]:
                    v = mi.get(cat, None)
                    if isinstance(v, list):
                        if name in v:
                            res.append([name,cat])
                    elif name == v:
                        res.append([name,cat])
                user_cat_vals[ucat] = res
        mi.user_categories = user_cat_vals

        return mi
Example #21
    def parse_response(cls, response, log):
        metadata_items = []
        tags = []
        series = u''
        title = u''
        translators = []
        series_index = 0
        authors = []

        resp = urllib2.urlopen(response)
        page = html.parse(resp)
        e = page.getroot().find_class('_ga1_on_').pop()
        e.find("noindex").drop_tree()

        for i in e.xpath('//ol/li/a/text()'):
            tags.append(unicode(i))

        for i in e.xpath(u"//div[@class='_ga1_on_']/br[position()=1]/preceding-sibling::a[contains(@href,'/a/')]/text()"):
            authors.append(unicode(i))

        for i in e.xpath(u"//div[@class='_ga1_on_']/br[position()=1]/preceding-sibling::a[preceding::text()[contains(.,'перевод:')]]/text()"):
            translators.append(unicode(i))

        for i in e.xpath("(//div[@class='_ga1_on_']/div[@id='z0']/following-sibling::text())[1]"):
            title += i

        for i in e.xpath("//div[@class='_ga1_on_']/h8"):
            series = i.text_content()
            series_index = re.findall(r'\d+', i.tail)[0]

        for i in e.xpath("./a[contains(@href,'/s/')]/text()"):
            tags.append(unicode(i))

        for i in e.xpath("//div[@class='genre']/a/@href"):
            tags.append(unicode(i.split('/')[-1]))

        log.info(u'Found %s/%s: %s' % (series, series_index, title))

        if tags and series in tags:
            tags.remove(series)

        if translators:
            for t in translators:
                if t in authors:
                    authors.remove(t)

        metadata_item = Metadata(title, authors)
        if tags:
            metadata_item.tags = tags
        if series != '':
            metadata_item.series = series
        if series_index != 0:
            metadata_item.series_index = series_index
        metadata_items.append(metadata_item)

        log.info(series, metadata_item.series)
        if u'Игрушечный дом' == series:
            log.info('1')
        if u'Игрушечный дом' == metadata_item.series:
            log.info('2')
        if series == metadata_item.series:
            log.info('3')

        return metadata_items
Example #22
    def parse_details(self, root):
        try:
            antik_id = self.parse_antik_id(root)
            self.log.info('Parsed Antikvarium identifier: %s' % antik_id)
        except:
            self.log.exception('Error parsing Antikvarium id for url: %r' %
                               self.url)
            antik_id = None

        try:
            title = self.parse_title(root)
            self.log.info('Parsed title: %s' % title)
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)
            title = None

        try:
            authors = self.parse_authors(root)
            self.log.info('Parsed authors: %s' % authors)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            authors = []

        if not title or not authors or not antik_id:
            self.log.error(
                'Could not find title/authors/Antikvarium.hu id for %r' %
                self.url)
            self.log.error('Antikvarium.hu id: %r Title: %r Authors: %r' %
                           (antik_id, title, authors))
            return

        mi = Metadata(title, authors)
        mi.set_identifier('antik_hu', antik_id)
        self.antik_id = antik_id

        try:
            isbn = self.parse_isbn(root)
            self.log.info('Parsed ISBN: %s' % isbn)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r' % self.url)

        try:
            series = self.parse_series(root)
            self.log.info('Parsed series: %s' % series)
        except:
            self.log.exception('Error parsing series for url: %r' % self.url)
            series = None

        try:
            mi.series_index = self.parse_series_index(root)
            self.log.info('Parsed series index: %s' % mi.series_index)
        except:
            self.log.exception('Error parsing series for url: %r' % self.url)
            mi.series_index = None

        try:
            mi.comments = self.parse_comments(root)
            self.log.info('Parsed comments: %s' % mi.comments)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)

        try:
            self.cover_url = self.parse_cover(root)
            self.log.info('Parsed URL for cover: %r' % self.cover_url)
            self.plugin.cache_identifier_to_cover_url(self.antik_id,
                                                      self.cover_url)
            mi.has_cover = bool(self.cover_url)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)

        try:
            mi.publisher = self.parse_publisher(root)
            self.log.info('Parsed publisher: %s' % mi.publisher)
        except:
            self.log.exception('Error parsing publisher for url: %r' %
                               self.url)

        try:
            mi.tags = self.parse_tags(root)
            self.log.info('Parsed tags: %s' % mi.tags)
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        try:
            mi.pubdate = self.parse_published_date(root)
            self.log.info('Parsed publication date: %s' % mi.pubdate)
        except:
            self.log.exception('Error parsing published date for url: %r' %
                               self.url)

        try:
            mi.languages = self.parse_languages(root)
            self.log.info('Parsed languages: %r' % mi.languages)
        except:
            self.log.exception('Error parsing languages for url: %r' %
                               self.url)

        mi.source_relevance = self.relevance

        if series:
            mi.series = series

        if self.antik_id and self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.antik_id)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
Example #23
    def parse_details(self, root):
        try:
            legie_id = self.parse_legie_id(self.url)
        except:
            self.log.exception('Error parsing Legie id for url: %r' % self.url)
            legie_id = None

        try:
            title = self.parse_title(root)
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)
            title = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            authors = []

        if not title or not authors or not legie_id:
            self.log.error('Could not find title/authors/Legie id for %r' %
                           self.url)
            self.log.error('Legie: %r Title: %r Authors: %r' %
                           (legie_id, title, authors))
            return

        self.legie_id = legie_id

        rating = comments = series = series_index = None
        try:
            rating = self.parse_rating(root)
        except:
            self.log.exception('Error parsing ratings for url: %r' % self.url)

        try:
            comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)

        try:
            (series, series_index) = self.parse_series(root)
        except:
            self.log.info('Series not found.')

        try:
            tags = self.parse_tags(root)
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)
            tags = None

        if legie_id:
            editions = self.get_editions()

            if editions:
                num_editions = len(editions)
                self.log.info('Found %d editions' % num_editions)
                for edition in editions:
                    (year, cover_url, publisher, isbn) = edition
                    mi = Metadata(title, authors)
                    self.legie_id = "%s#%s" % (legie_id, year)
                    mi.set_identifier('legie', self.legie_id)
                    mi.source_relevance = self.relevance
                    mi.rating = rating
                    mi.comments = comments
                    mi.series = series
                    mi.series_index = series_index
                    if cover_url:
                        mi.cover_url = self.cover_url = cover_url
                        self.plugin.cache_identifier_to_cover_url(
                            self.legie_id, self.cover_url)
                    if tags:
                        mi.tags = tags
                    mi.has_cover = bool(self.cover_url)
                    mi.publisher = publisher
                    mi.isbn = isbn
                    mi.pubdate = self.prepare_date(int(year))
                    mi.language = "ces"
                    self.result_queue.put(mi)
            else:
                mi = Metadata(title, authors)
                mi.set_identifier('legie', self.legie_id)
                mi.source_relevance = self.relevance
                mi.rating = rating
                mi.comments = comments
                mi.series = series
                mi.series_index = series_index
                try:
                    self.cover_url = self.parse_cover(root)
                except:
                    self.log.exception('Error parsing cover for url: %r' %
                                       self.url)
                if tags:
                    mi.tags = tags
                mi.has_cover = bool(self.cover_url)
                mi.publisher = publisher
                mi.isbn = isbn
                mi.pubdate = self.prepare_date(int(year))
                mi.language = "ces"
                self.result_queue.put(mi)
                if self.legie_id:
                    if self.cover_url:
                        self.plugin.cache_identifier_to_cover_url(
                            self.legie_id, self.cover_url)
Example #24
    def parse_details(self, root):
        try:
            yes24_id = self.parse_yes24_id(self.url)
        except:
            self.log.exception('Error parsing YES24 id for url: %r'%self.url)
            yes24_id = None

        try:
            (title, series, series_index) = self.parse_title_series(root)
        except:
            self.log.exception('Error parsing title and series for url: %r'%self.url)
            title = series = series_index = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r'%self.url)
            authors = []

        if not title or not authors or not yes24_id:
            self.log.error('Could not find title/authors/YES24 id for %r'%self.url)
            self.log.error('YES24: %r Title: %r Authors: %r'%(yes24_id, title,
                authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        mi.set_identifier('yes24', yes24_id)
        self.yes24_id = yes24_id

        try:
            isbn = self.parse_isbn(root)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r'%self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r'%self.url)

        try:
            self.cover_url = self.parse_cover(root)
        except:
            self.log.exception('Error parsing cover for url: %r'%self.url)
        mi.has_cover = bool(self.cover_url)
        mi.cover_url = self.cover_url # This is purely so we can run a test for it!!!

        try:
            mi.publisher = self.parse_publisher(root)
        except:
            self.log.exception('Error parsing publisher for url: %r'%self.url)

        try:
            mi.pubdate = self.parse_published_date(root)
        except:
            self.log.exception('Error parsing published date for url: %r'%self.url)

        mi.language = 'ko'

        mi.source_relevance = self.relevance

        if self.yes24_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.yes24_id)

        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
Example #25
    def _GoodreadsBook_to_Metadata(self, book):
        # type: (_GoodreadsBook) -> Metadata
        """
        :param book: _GoodreadsBook: book
        :return: Metadata: Metadata
        """
        mi = Metadata(book.title, book.authors)
        mi.source_relevance = 0
        mi.set_identifier('goodreads', book.id)

        if self.prefs['NEVER_REPLACE_ISBN'] and mi.get_identifiers().get(
                'isbn'):
            mi.set_identifier('isbn', '')

        if book.asin and not self.prefs['NEVER_REPLACE_AMAZONID']:
            mi.set_identifier('amazon', book.asin)

        if book.isbn and not self.prefs['NEVER_REPLACE_ISBN']:
            try:
                if len(book.isbn) == 10:
                    mi.isbn = check_isbn13(_ISBNConvert.convert(book.isbn))
                else:
                    mi.isbn = check_isbn13(book.isbn)
            except:
                self.log.error("ISBN CONVERSION ERROR:", book.isbn)
                self.log.exception()

        if book.image_url:
            self.log.info('cache_identifier_to_cover_url:', book.asin, ':',
                          book.image_url)
            self.cache_identifier_to_cover_url(book.id, book.image_url)

        if book.publisher:
            self.log.info('book.publisher is:', book.publisher)
            mi.publisher = book.publisher

        if book.pubdate:
            self.log.info('book.pubdate is:',
                          book.pubdate.strftime('%Y-%m-%d'))
            mi.pubdate = book.pubdate

        if book.comments:
            self.log.info('book.editorial_review is:', book.comments)
            mi.comments = book.comments

        tags = self.prefs['ADD_THESE_TAGS'].split(',')
        tags.extend(book.tags)
        # tag_mappings = JSONConfig('plugins/GenreMappings')['genreMappings']
        # mi.tags = list(set(sorted(filter(lambda x: tag_mappings.get(x, x), tags))))

        if book.series:
            mi.series = book.series
            self.log.info(u'series:', book.series)
            if book.series_index:
                mi.series_index = book.series_index
                self.log.info(u'series_index:',
                              "{0:.2f}".format(book.series_index))
            else:
                mi.series_index = 0

        if book.average_rating:
            mi.rating = book.average_rating

        self.clean_downloaded_metadata(mi)

        return mi
Example #26
    def get_details(self):
        '''
        The get_details() function scrapes the website for all available information
        '''
        self.log.info("    Worker.get_details:")
        self.log.info("        self:     ", self)
        self.log.info("        self.url: ", self.url)

        # Parse the html code from the website
        try:
            raw = self.browser.open_novisit(
                self.url, timeout=self.timeout).read().strip()
        # Do some error handling if it fails to read data
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
                self.log.error('URL malformed: %r' % self.url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = 'Bookmeta for saxo timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r' % self.url
                self.log.exception(msg)
            return

        # Do some error handling if the html code returned 404
        if "<title>404 - " == raw:
            self.log.error('URL malformed: %r' % self.url)
            return

        # Clean the html data a little
        try:
            root = parse(raw)
        except:
            self.log.error("Error cleaning HTML")
            return

        # Get the title of the book
        try:
            title_node = root.xpath('//span[@itemprop="name"]')
            self.title = title_node[0].text
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)

        # Get the author of the book
        try:
            author_node = root.xpath('//span[@class="expandAuthorName"]')
            author_strings = author_node[0].text.split(",")
            #print(author_strings)
            for name in author_strings:
                self.authors.append(name)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            self.authors = None

        # Get the series of the book
        try:
            series_node = root.xpath('//b[contains(text(), "Serie")]/a')
            if len(series_node) > 0:
                self.series = series_node[0].text.split(": ")[0].strip()
                self.series_index = series_node[0].text.split(": ")[-1].strip()
            #    print("'%s'" % self.series)
            #    print("'%s'" % self.series_index)
        except:
            self.log.exception('Error parsing series for url: %r' % self.url)

        # Some books have ratings, let's use them.
        try:
            self.rating = 0.0
        except:
            self.log.exception('Error parsing rating for url: %r' % self.url)
            self.rating = 0.0

        # Get the ISBN number from the site
        try:
            isbn_node = root.xpath(
                '//div[@class="eBookContainer"]/b/span[@itemprop="identifier"]'
            )
            if len(isbn_node) > 0:
                self.isbn = isbn_node[0].text.replace("ISBN: ", "").strip()
        except:
            self.log.exception('Error parsing isbn for url: %r' % self.url)
            self.isbn = None

        # Get the comments/blurb for the book
        try:
            comment_node = root.xpath('//meta[@name="description"]/@content')
            self.comments = comment_node[0]
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)
            self.comments = None

        # Parse the cover url for downloading the cover.
        try:
            cover_node = root.xpath(
                '//div[@class="bookDetailCoverCover"]/img/@src')
            self.cover_url = "https://mofibo.com" + cover_node[0]
            self.log.info('    Parsed URL for cover: %r' % self.cover_url)
            self.plugin.cache_identifier_to_cover_url(self.isbn,
                                                      self.cover_url)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)
            self.has_cover = bool(self.cover_url)

        # Get the publisher name
        try:
            publisher_node = root.xpath(
                '//div[@class="eBookContainer"]/b/span/a[@itemprop="brand"]')
            if len(publisher_node) > 0:
                self.publisher = publisher_node[0].text
        except:
            self.log.exception('Error parsing publisher for url: %r' %
                               self.url)

        # Get the language of the book. Only English and Danish are supported, though
        try:
            language_node = root.xpath('//b[@class="expanderLanguage"]')
            language = language_node[0].text.strip().replace("Sprog:",
                                                             "").replace(
                                                                 " ", "")
            language = self.lang_map.get(language, None)
            self.language = language
        except:
            self.log.exception('Error parsing language for url: %r' % self.url)

        # Get the published date
        try:
            pubdate_node = root.xpath(
                '//div[@class="eBookContainer"]/b[contains(text(),"Udgivet:")]'
            )
            if len(pubdate_node) > 0:
                date_str = pubdate_node[0].text.replace("Udgivet:", "").strip()
                format_str = '%Y-%m-%d'  # The format
                self.pubdate = datetime.datetime.strptime(date_str, format_str)
        except:
            self.log.exception('Error parsing published date for url: %r' %
                               self.url)

        # Get the tags
        try:
            tags = []
            tags_node = root.xpath('//span[@itemprop="category"]')
            tags.append(tags_node[0].text.strip())
            self.tags = tags
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        # Setup the metadata
        meta_data = Metadata(self.title, self.authors)
        meta_data.set_identifier('isbn', self.isbn)
        meta_data.set_identifier('mofibo', self.url)

        # Set series
        if self.series:
            try:
                meta_data.series = self.series
                meta_data.series_index = self.series_index
            except:
                self.log.exception('Error loading series')
        # Set ISBN
        if self.isbn:
            try:
                meta_data.isbn = self.isbn
            except:
                self.log.exception('Error loading ISBN')
        # Set relevance
        if self.relevance:
            try:
                meta_data.source_relevance = self.relevance
            except:
                self.log.exception('Error loading relevance')
        # Set cover url
        if self.cover_url:
            try:
                meta_data.cover_url = self.cover_url
            except:
                self.log.exception('Error loading cover_url')
        # Set publisher
        if self.publisher:
            try:
                meta_data.publisher = self.publisher
            except:
                self.log.exception('Error loading publisher')
        # Set language
        if self.language:
            try:
                meta_data.language = self.language
            except:
                self.log.exception('Error loading language')
        # Set comments/blurb
        if self.comments:
            try:
                meta_data.comments = self.comments
            except:
                self.log.exception("Error loading comments")
        # Set publication date
        if self.pubdate:
            try:
                meta_data.pubdate = self.pubdate
            except:
                self.log.exception('Error loading pubdate')
        # Set tags data
        if self.tags:
            try:
                meta_data.tags = self.tags
            except:
                self.log.exception('Error loading tags')

        # Clean the downloaded metadata and put it on the result queue
        self.plugin.clean_downloaded_metadata(meta_data)
        self.result_queue.put(meta_data)
Beispiel #27
0
    def identify(self, log, result_queue, abort, title=None, authors=[], identifiers={}, timeout=30):
	self.load_config()

	if authors is None:
	    authors=[]

	# get identifying tags from book
	idn = identifiers.get('dnb-idn', None)
	isbn = check_isbn(identifiers.get('isbn', None))

	# ignore unknown authors
	ignored_authors = [ "V. A.", "V.A.", "Unknown", "Unbekannt" ]
	for i in ignored_authors:
	    authors = [ x for x in authors if x != i ]

	if (isbn is None) and (idn is None) and (title is None) and (not authors):
	    log.info("This plugin requires at least either ISBN, IDN, Title or Author(s).")
	    return None


	queries=[]
	# DNB does not do an exact search when searching for an IDN or ISBN, so we have to filter the results
	exact_search = {}

	if idn is not None:
	    exact_search['idn'] = idn
	    # when an IDN is given, search only for the IDN and skip all other query variants
	    queries.append('num='+idn)

	else:
	    authors_v = []
	    title_v = []

	    # create some variants of given authors
	    if authors != []:
		authors_v.append(' '.join(self.get_author_tokens(authors,only_first_author=False)))	# concat all author names ("Peter Meier Luise Stark")
		authors_v.append(' '.join(self.get_author_tokens(authors,only_first_author=True)))	# use only first author
		for a in authors:
		    authors_v.append(a)	# use all authors, one by one

		# remove duplicates
		unique_authors_v = []
		for i in authors_v:
		    if i not in unique_authors_v:
			unique_authors_v.append(i)


	    # create some variants of given title
	    if title is not None:
		title_v.append(title)	# simply use given title
		title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=False,strip_subtitle=False)))	# remove some punctuation characters
		title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=False,strip_subtitle=True)))	# remove subtitle (everything after " : ")
		title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=False)))	# remove some punctuation characters and joiners ("and", "&", ...)
		title_v.append(' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)))	# remove subtitle (everything after " : ") and joiners ("and", "&", ...)
		# TODO: remove subtitle after " - "

		# remove duplicates
		unique_title_v = []
		for i in title_v:
		    if i not in unique_title_v:
			unique_title_v.append(i)


	    # title and author
	    if authors_v != [] and title_v != []:
		for a in authors_v:
		    for t in title_v:
			if isbn is not None:
			    queries.append('tit="' + t + '" AND per="' + a + '" AND num="' + isbn + '"')
			else:
			    queries.append('tit="' + t + '" AND per="' + a + '"')

		# try with first author as title and title (without subtitle) as author
		if isbn is not None:
		    queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '" AND num="'+isbn+'"')
		else:
		    queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '"')

		# try with author and title (without subtitle) in any index
		if isbn is not None:
		    queries.append('"' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND "' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '" AND num="'+isbn+'"')
		else:
		    queries.append('"' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND "' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '"')


	    # author but no title
	    elif authors_v != [] and title_v == []:
		for i in authors_v:
		    if isbn is not None:
			queries.append('per="'+ i +'" AND num="' + isbn + '"')
		    else:
			queries.append('per="'+ i +'"')

		# try with author as title
		if isbn is not None:
		    queries.append('tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '" AND num="' + isbn + '"')
		else:
		    queries.append('tit="' + ' '.join(self.get_author_tokens(authors,only_first_author=True)) + '"')


	    # title but no author
	    elif authors_v == [] and title_v != []:
		for i in title_v:
		    if isbn is not None:
			queries.append('tit="' + i + '" AND num="' + isbn + '"')
		    else:
			queries.append('tit="' + i + '"')

		# try with title as author
		if isbn is not None:
		    queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '" AND num="' + isbn + '"')
		else:
		    queries.append('per="' + ' '.join(self.get_title_tokens(title,strip_joiners=True,strip_subtitle=True)) + '"')


	    # as last resort only use isbn
	    if isbn is not None:
		queries.append('num=' + isbn)


	# remove duplicate queries
	uniqueQueries=[]
	for i in queries:
	    if i not in uniqueQueries:
		uniqueQueries.append(i)


	# Process queries
	results = None

	for query in uniqueQueries:
	    # SRU does not work with "+" or "?" characters in query, so we simply remove them
	    query = re.sub('[\+\?]','',query)

	    query = query + ' NOT (mat=film OR mat=music OR mat=microfiches OR cod=tt)'
	    log.info(query)

	    if self.cfg_dnb_token is None:
		results = self.getSearchResultsByScraping(log, query, timeout)
	    else:
		results = self.getSearchResults(log, query, timeout)

	    if results is None:
		continue

	    log.info("Parsing records")

	    ns = { 'marc21' : 'http://www.loc.gov/MARC21/slim' }
	    for record in results:
		series = None
		series_index = None
		publisher = None
		pubdate = None
		languages = []
		title = None
		title_sort = None
		authors = []
		author_sort = None
		edition = None
		comments = None
		idn = None
		urn = None
		isbn = None
		ddc = []
		subjects_gnd = []
		subjects_non_gnd = []
		publisher_name = None
		publisher_location = None


		##### Field 264 #####
		# Publisher Name and Location
		fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns)
		if len(fields)>0:
		    publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]",namespaces=ns)[0].text.strip();
		    publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns)[0].text.strip();
		else:
		    fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='b' and string-length(text())>0]/../..",namespaces=ns)
		    if len(fields)>0:
			publisher_name = fields[0].xpath(".//marc21:subfield[@code='b' and string-length(text())>0]",namespaces=ns)[0].text.strip();
		    else:
			fields = record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='a' and string-length(text())>0]/../..",namespaces=ns)
			if len(fields)>0:
			    publisher_location = fields[0].xpath(".//marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns)[0].text.strip();

		# Publishing Date
		for i in record.xpath(".//marc21:datafield[@tag='264']/marc21:subfield[@code='c' and string-length(text())>=4]",namespaces=ns):
		    match = re.search("(\d{4})", i.text.strip())
		    if match is not None:
			year = match.group(1)
			pubdate = datetime.datetime(int(year), 1, 1, 12 , 30, 0)
			break

		# Log
		if publisher_name is not None:
		    log.info("Extracted Publisher: %s" % publisher_name)
		if publisher_location is not None:
		    log.info("Extracted Publisher Location: %s" % publisher_location)
		if pubdate is not None:
		    log.info("Extracted Publication Year: %s" % pubdate)


		##### Field 245 ####
		# Title/Series/Series_Index
		title_parts = []
		for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns):
		    # if a,n,p,n,p,n,p exist:		series = a + n0 + " - " + p0 + n1 + " - " + p1,	series_index = n2,	title = p2
		    # if a,n,p,n,p exist:		series = a + n0 + " - " + p0, 			series_index = n1,	title = p1	(Example: dnb-id 1008774839)
		    # if a,n,p exist:			series = a,					series_index = n,	title = p
		    # if a exist:												title = a
		    # TODO: a,n,p,n (i.e. 956375146)

		    code_p = []
		    code_n = []
		    code_a = []

		    for j in i.xpath(".//marc21:subfield[@code='p']",namespaces=ns):
			code_p.append(j.text.strip())

		    for j in i.xpath(".//marc21:subfield[@code='n']",namespaces=ns):
			match = re.search("(\d+[,\.\d+]?)", j.text.strip())
			if match:
			    code_n.append(match.group(1))
			else:
			    code_n.append("0")	# looks like sometimes DNB does not know the series index and uses something like "[...]"

		    for j in i.xpath(".//marc21:subfield[@code='a']",namespaces=ns):
			code_a.append(j.text.strip())

		    if len(code_p) == 0:
			title_parts = title_parts + code_a

		    elif len(code_p)>0 and len(code_p) == len(code_n):
			series = " : ".join(code_a)	# I've never seen more than one code_a, but who knows...
			for i in range (0,len(code_p)-1):
			    series = series + " " + code_n[i] + " " + code_p[i]
			series_index = code_n[-1]
			title_parts.append(code_p[-1])


		# subtitle 1: Field 245
		for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='b' and string-length(text())>0]",namespaces=ns):
		    title_parts.append(i.text.strip())
		    break
		
		# subtitle 2
		#for i in record.xpath(".//marc21:datafield[@tag='245']/marc21:subfield[@code='c' and string-length(text())>0]",namespaces=ns):
		#    title = title + " / " + i.text.strip()
		#    break

		title = " : ".join(title_parts)

		# Log
		if series_index is not None:
		    log.info("Extracted Series_Index from Field 245: %s" % series_index)
		if series is not None:
		    log.info("Extracted Series from Field 245: %s" % series)
		    series = self.cleanUpSeries(log, series, publisher_name)
		if title is not None:
		    log.info("Extracted Title: %s" % title)
		    title = self.cleanUpTitle(log, title)

		# Title_Sort
		if len(title_parts)>0:
		    title_sort_parts = list(title_parts)
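		    # chr(152)/chr(156) are the control characters DNB uses to delimit a non-sorting prefix (e.g. a leading article); it is moved to the end of the sort string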
		    title_sort_regex = re.match('^(.*?)('+chr(152)+'.*'+chr(156)+')?(.*?)$',title_parts[0])
		    sortword = title_sort_regex.group(2)
		    if sortword:
			title_sort_parts[0] = ''.join(filter(None,[title_sort_regex.group(1).strip(),title_sort_regex.group(3).strip(),", "+sortword]))
		    title_sort = " : ".join(title_sort_parts)

		# Log
		if title_sort is not None:
		    log.info("Extracted Title_Sort: %s" % title_sort)


		##### Field 100 and Field 700 #####
		# Authors
		for i in record.xpath(".//marc21:datafield[@tag='100']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):	# primary authors
		    name = re.sub(" \[.*\]$","",i.text.strip());
		    authors.append(name)
		for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='4' and text()='aut']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):	# secondary authors
		    name = re.sub(" \[.*\]$","",i.text.strip());
		    authors.append(name)
		if len(authors)==0:	# if no "real" author was found, take all persons involved
		    for i in record.xpath(".//marc21:datafield[@tag='700']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):	# secondary authors
			name = re.sub(" \[.*\]$","",i.text.strip());
			authors.append(name)
		if len(authors)>0:
		    author_sort = authors[0]

		# Log
		if len(authors)>0:
		    log.info("Extracted Authors: %s" % " & ".join(authors))
		if author_sort is not None:
		    log.info("Extracted Author_Sort: %s" % " & ".join(authors))


		##### Field 856 #####
		# Comments
		for i in record.xpath(".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]",namespaces=ns):
		    if i.text.startswith("http://deposit.dnb.de/"):
			br = self.browser
			log.info('Downloading Comments from: %s' % i.text)
			try:
			    comments = br.open_novisit(i.text, timeout=30).read()
			    comments = re.sub('(\s|<br>|<p>|\n)*Angaben aus der Verlagsmeldung(\s|<br>|<p>|\n)*(<h3>.*?</h3>)?(\s|<br>|<p>|\n)*','',comments,flags=re.IGNORECASE)
			    comments = sanitize_comments_html(comments)
			    break
			except:
			    log.info("Could not download Comments from %s" % i)

		# Log
		if comments is not None:
		    log.info('Comments: %s' % comments)

		# If no comments are found for this edition, look at other editions of this book (Fields 776)
		# TODO: Make this configurable (default: yes)
		if comments is None:
		    # get all other issues
		    for i in record.xpath(".//marc21:datafield[@tag='776']/marc21:subfield[@code='w' and string-length(text())>0]",namespaces=ns):
			other_idn = re.sub("^\(.*\)","",i.text.strip());
			subquery = 'num='+other_idn+' NOT (mat=film OR mat=music OR mat=microfiches OR cod=tt)'
			log.info(subquery)

			if self.cfg_dnb_token is None:
			    subresults = self.getSearchResultsByScraping(log, subquery, timeout)
			else:
			    subresults = self.getSearchResults(log, subquery, timeout)

			if subresults is None:
			    continue

			for subrecord in subresults:
			    for i in subrecord.xpath(".//marc21:datafield[@tag='856']/marc21:subfield[@code='u' and string-length(text())>0]",namespaces=ns):
				if i.text.startswith("http://deposit.dnb.de/"):
				    br = self.browser
				    log.info('Downloading Comments from: %s' % i.text)
				    try:
					comments = br.open_novisit(i.text, timeout=30).read()
					comments = re.sub('(\s|<br>|<p>|\n)*Angaben aus der Verlagsmeldung(\s|<br>|<p>|\n)*(<h3>.*?</h3>)?(\s|<br>|<p>|\n)*','',comments,flags=re.IGNORECASE)
					comments = sanitize_comments_html(comments)
					break
				    except:
					log.info("Could not download Comments from %s" % i)
			    if comments is not None:
				log.info('Comments from other issue: %s' % comments)
				break


		##### Field 16 #####
		# ID: IDN
		for i in record.xpath(".//marc21:datafield[@tag='016']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    idn = i.text.strip()
		    break
		# Log
		if idn is not None:
		    log.info("Extracted ID IDN: %s" % idn)


		##### Field 24 #####
		# ID: URN
		for i in record.xpath(".//marc21:datafield[@tag='024']/marc21:subfield[@code='2' and text()='urn']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    urn = i.text.strip()
		    break

		# Log
		if urn is not None:
		    log.info("Extracted ID URN: %s" % urn)


		##### Field 20 #####
		# ID: ISBN
		for i in record.xpath(".//marc21:datafield[@tag='020']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    isbn_regex = "(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?(?:[0-9]+[- ]?){2}[0-9X]"
		    match = re.search(isbn_regex, i.text.strip())
		    isbn = match.group()
		    isbn = isbn.replace('-','')
		    break

		# Log
		if isbn is not None:
		    log.info("Extracted ID ISBN: %s" % isbn)

		# When doing an exact search for a given ISBN skip books with wrong ISBNs
		if isbn is not None and "isbn" in exact_search:
		    if isbn != exact_search["isbn"]:
			log.info("Extracted ISBN does not match book's ISBN, skipping record")
			continue


		##### Field 82 #####
		# ID: Sachgruppe (DDC)
		for i in record.xpath(".//marc21:datafield[@tag='082']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    ddc.append(i.text.strip())

		# Log
		if len(ddc)>0:
		    log.info("Extracted ID DDC: %s" % ",".join(ddc))


		##### Field 490 #####
		# In theory this field is not used for "real" book series (field 830 should be used instead), but in practice it is.
		# Series and Series_Index
		if series is None or (series is not None and series_index == "0"):
		    for i in record.xpath(".//marc21:datafield[@tag='490']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns):
			# "v" either "Nr. 220" or "This great Seriestitle : Nr. 220" - if available use this instead of attribute a
			attr_v = i.xpath(".//marc21:subfield[@code='v']",namespaces=ns)[0].text.strip()
			parts = re.split(" : ",attr_v)
			if len(parts)==2:
			    if bool(re.search("\d",parts[0])) != bool(re.search("\d",parts[1])):
				# figure out which part contains the index
				if bool(re.search("\d",parts[0])):
				    indexpart = parts[0]
				    textpart = parts[1]
				else:
				    indexpart = parts[1]
				    textpart = parts[0]

				match = re.search("(\d+[,\.\d+]?)", indexpart)
				if match is not None:
				    series_index = match.group(1)
				    series = textpart.strip()

			else:
			    match = re.search("(\d+[,\.\d+]?)", attr_v)
			    if match is not None:
				series_index = match.group(1)
			    else:
				series_index = "0"

			series_index = series_index.replace(',','.')

			# Use Series Name from attribute "a" if not already found in attribute "v"
			if series is None:
			    series = i.xpath(".//marc21:subfield[@code='a']",namespaces=ns)[0].text.strip()

			# Log
			if series_index is not None:
			    log.info("Extracted Series Index from Field 490: %s" % series_index)
			if series is not None:
			    log.info("Extracted Series from Field 490: %s" % series)
			    series = self.cleanUpSeries(log, series, publisher_name)
			if series is not None:
			    break


		##### Field 246 #####
		# Series and Series_Index
		if series is None or (series is not None and series_index == "0"):
		    for i in record.xpath(".//marc21:datafield[@tag='246']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
			match = re.search("^(.+?) ; (\d+[,\.\d+]?)$", i.text.strip())
			if match is not None:
			    series = match.group(1)
			    series_index = match.group(2)

			    # Log
			    if series_index is not None:
				log.info("Extracted Series Index from Field 246: %s" % series_index)
			    if series is not None:
				log.info("Extracted Series from Field 246: %s" % series)
				series = self.cleanUpSeries(log, series, publisher_name)
			    if series is not None:
				break

		##### Field 800 #####
		# Series and Series_Index
		if series is None or (series is not None and series_index == "0"):
		    for i in record.xpath(".//marc21:datafield[@tag='800']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='t' and string-length(text())>0]/..",namespaces=ns):
			# Series Index
			series_index = i.xpath(".//marc21:subfield[@code='v']",namespaces=ns)[0].text.strip()
			match = re.search("(\d+[,\.\d+]?)", series_index)
			if match is not None:
			    series_index = match.group(1)
			else:
			    series_index = "0"
			series_index = series_index.replace(',','.')
			# Series
			series = i.xpath(".//marc21:subfield[@code='t']",namespaces=ns)[0].text.strip()

			# Log
			if series_index is not None:
			    log.info("Extracted Series Index from Field 800: %s" % series_index)
			if series is not None:
			    log.info("Extracted Series from Field 800: %s" % series)
			    series = self.cleanUpSeries(log, series, publisher_name)
			if series is not None:
			    break


		##### Field 830 #####
		# Series and Series_Index
		if series is None or (series is not None and series_index == "0"):
		    for i in record.xpath(".//marc21:datafield[@tag='830']/marc21:subfield[@code='v' and string-length(text())>0]/../marc21:subfield[@code='a' and string-length(text())>0]/..",namespaces=ns):
			# Series Index
			series_index = i.xpath(".//marc21:subfield[@code='v']",namespaces=ns)[0].text.strip()
			match = re.search("(\d+[,\.\d+]?)", series_index)
			if match is not None:
			    series_index = match.group(1)
			else:
			    series_index = "0"
			series_index = series_index.replace(',','.')
			# Series
			series = i.xpath(".//marc21:subfield[@code='a']",namespaces=ns)[0].text.strip()

			# Log
			if series_index is not None:
			    log.info("Extracted Series Index from Field 830: %s" % series_index)
			if series is not None:
			    log.info("Extracted Series from Field 830: %s" % series)
			    series = self.cleanUpSeries(log, series, publisher_name)
			if series is not None:
			    break


		##### Field 689 #####
		# GND Subjects
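		# GND = Gemeinsame Normdatei, the German integrated authority file; subfield 2='gnd' marks authority-controlled subject headings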
		for i in record.xpath(".//marc21:datafield[@tag='689']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    subjects_gnd.append(i.text.strip())
		for f in range(600,656):
		    for i in record.xpath(".//marc21:datafield[@tag='"+str(f)+"']/marc21:subfield[@code='2' and text()='gnd']/../marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
			if i.text.startswith("("):
			    continue
			subjects_gnd.append(i.text)

		# Log
		if len(subjects_gnd)>0:
		    log.info("Extracted GND Subjects: %s" % " ".join(subjects_gnd))


		##### Fields 600-655 #####
		# Non-GND subjects
		for f in range(600,656):
		    for i in record.xpath(".//marc21:datafield[@tag='"+str(f)+"']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
			# ignore entries starting with "(":
			if i.text.startswith("("):
			    continue
			subjects_non_gnd.extend(re.split(',|;',i.text))
		# remove one-character subjects:
		subjects_non_gnd = [i for i in subjects_non_gnd if len(i) >= 2]	# avoids removing items while iterating

		# Log
		if len(subjects_non_gnd)>0:
		    log.info("Extracted non-GND Subjects: %s" % " ".join(subjects_non_gnd))


		##### Field 250 #####
		# Edition
		for i in record.xpath(".//marc21:datafield[@tag='250']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    edition = i.text.strip()
		    break

		# Log
		if edition is not None:
		    log.info("Extracted Edition: %s" % edition)


		##### Field 41 #####
		# Languages
		for i in record.xpath(".//marc21:datafield[@tag='041']/marc21:subfield[@code='a' and string-length(text())>0]",namespaces=ns):
		    languages.append(i.text.strip())

		# Log
		if len(languages) > 0:
		    log.info("Extracted Languages: %s" % ",".join(languages))


		##### If configured: Try to separate Series, Series Index and Title from the fetched title #####
		#if self.cfg_guess_series is True:
		if (series is None or (series is not None and series_index == "0")) and self.cfg_guess_series is True:
		    guessed_series = None
		    guessed_series_index = None
		    guessed_title = None

		    log.info("Starting Series Guesser")

		    parts = re.split("[:]",self.removeSortingCharacters(title))

		    if len(parts)==2:
			log.info("Title has two parts")
			# make sure only one part of the two parts contains digits
			if bool(re.search("\d",parts[0])) != bool(re.search("\d",parts[1])):
			    log.info("only one title part contains digits")
			    # figure out which part contains the index
			    if bool(re.search("\d",parts[0])):
				indexpart = parts[0]
				textpart = parts[1]
			    else:
				indexpart = parts[1]
				textpart = parts[0]

			    # Look at the part without digits:
			    match = re.match("^[\s\-–:]*(.+?)[\s\-–:]*$",textpart)	# remove odd characters from start and end of the text part
			    if match:
				textpart = match.group(1)

			    # Look at the part with digits:
			    # for Titleparts like: "Name of the series - Episode 2"
			    match = re.match("^\s*(\S\D*?[a-zA-Z]\D*?)\W[\(\/\.,\s\-–:]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",indexpart)
			    if match:
				guessed_series_index = match.group(2)
				guessed_series = match.group(1)
				if guessed_series is None:
				    guessed_series = textpart
				    guessed_title = textpart + " : Band " + guessed_series_index
				else:
				    guessed_title = textpart

				#log.info("ALGO1: guessed_title: " + guessed_title)
				#log.info("ALGO1: guessed_series: " + guessed_series)
				#log.info("ALGO1: guessed_series_index: " + guessed_series_index)

			    else:
				# for Titleparts like: "Episode 2 Name of the series"
				match = re.match("^\s*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*(\S\D*?[a-zA-Z]\D*?)[\/\.,\-–\s]*$",indexpart)
				if match:
				    guessed_series_index = match.group(1)
				    guessed_series = match.group(2)

				    if guessed_series is None:
					# sometimes books with multiple volumes are detected as series without name -> Add the volume to the title 
					guessed_series = textpart
					guessed_title = textpart + " : Band " + guessed_series_index
				    else:
					guessed_title = textpart

				    #log.info("ALGO2: guessed_title: " + guessed_title)
				    #log.info("ALGO2: guessed_series: " + guessed_series)
				    #log.info("ALGO2: guessed_series_index: " + guessed_series_index)

				else:
				    # for titleparts like: "Band 2"
				    match = re.match("^[\s\(]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*[\/\.,\-–\s]*$",indexpart)
				    if match:
					guessed_series_index = match.group(1)
					# ...with textpart like NAME OF SERIES\s[\-\.;:]\sNAME OF TITLE
					# some false positives
					match = re.match("^\s*(\w+.+?)\s?[\.;\-–:]+\s(\w+.+)\s*$",textpart)
					if match:
					    guessed_series = match.group(1)
					    guessed_title = match.group(2)

					    log.info("ALGO3: guessed_title: " + guessed_title)
					    log.info("ALGO3: guessed_series: " + guessed_series)
					    log.info("ALGO3: guessed_series_index: " + guessed_series_index)


		    elif len(parts)==1:
			log.info("Title has one part")
			# for Titles like: "Name of the series - Title (Episode 2)"
			match = re.match("^\s*(\S.+?) \- (\S.+?) [\(\/\.,\s\-–:](?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",parts[0])
			if match:
			    guessed_series_index = match.group(3)
			    guessed_series = match.group(1)
			    guessed_title = match.group(2)

			    #log.info("ALGO4: guessed_title: " + guessed_title)
			    #log.info("ALGO4: guessed_series: " + guessed_series)
			    #log.info("ALGO4: guessed_series_index: " + guessed_series_index)

			else:
			    # for Titles like: "Name of the series - Episode 2"
			    match = re.match("^\s*(\S.+?)[\(\/\.,\s\-–:]*(?:#|Nr\.|Episode|Bd\.|Sammelband|[B|b]and|Part|Teil|Folge)[,\-–:\s#\(]*(\d+\.?\d*)[\)\s\-–:]*$",parts[0])
			    if match:
				guessed_series_index = match.group(2)
				guessed_series = match.group(1)
				guessed_title = guessed_series + " : Band " + guessed_series_index

				#log.info("ALGO5: guessed_title: " + guessed_title)
				#log.info("ALGO5: guessed_series: " + guessed_series)
				#log.info("ALGO5: guessed_series_index: " + guessed_series_index)

		    # Log
		    if guessed_series is not None:
			log.info("Guessed Series: %s" % guessed_series)
			#guessed_series = self.cleanUpSeries(log, guessed_series, publisher_name)
		    if guessed_series_index is not None:
			log.info("Guessed Series Index: %s" % guessed_series_index)
		    if guessed_title is not None:
			log.info("Guessed Title: %s" % guessed_title)
			guessed_title = self.cleanUpTitle(log, guessed_title)

		    if guessed_series is not None and guessed_series_index is not None and guessed_title is not None:
			title = guessed_title
			series = guessed_series
			series_index = guessed_series_index


		##### Filter exact searches #####
		# When doing an exact search for a given IDN skip books with wrong IDNs
		# TODO: Currently exact_search for ISBN is not implemented. Would require ISBN-10 and ISBN-13 conversions
		if idn is not None and "idn" in exact_search:
		    if idn != exact_search["idn"]:
			log.info("Extracted IDN does not match book's IDN, skipping record")
			continue

		##### Put it all together #####
		if self.cfg_append_edition_to_title and edition is not None:
		    title = title + " : " + edition

		mi = Metadata(self.removeSortingCharacters(title), [self.removeSortingCharacters(i) for i in authors])
		mi.title_sort = self.removeSortingCharacters(title_sort)
		mi.author_sort = self.removeSortingCharacters(author_sort)
		mi.languages = languages
		mi.pubdate = pubdate
		mi.publisher = " ; ".join(filter(None,[publisher_location, self.removeSortingCharacters(publisher_name)]))
		mi.series = self.removeSortingCharacters(series)
		mi.series_index = series_index
		mi.comments = comments
		mi.isbn = isbn # also required for cover download
		mi.set_identifier('urn',urn)
		mi.set_identifier('dnb-idn',idn)
		mi.set_identifier('ddc', ",".join(ddc))

		# cfg_subjects:
		# 0: use only subjects_gnd
		if self.cfg_fetch_subjects == 0:
		    mi.tags = self.uniq(subjects_gnd)
		# 1: use only subjects_gnd if found, else subjects_non_gnd
		elif self.cfg_fetch_subjects == 1:
		    if len(subjects_gnd)>0:
			mi.tags = self.uniq(subjects_gnd)
		    else:
			mi.tags = self.uniq(subjects_non_gnd)
		# 2: subjects_gnd and subjects_non_gnd
		elif self.cfg_fetch_subjects == 2:
		    mi.tags = self.uniq(subjects_gnd + subjects_non_gnd)
		# 3: use only subjects_non_gnd if found, else subjects_gnd
		elif self.cfg_fetch_subjects == 3:
		    if len(subjects_non_gnd)>0:
			mi.tags = self.uniq(subjects_non_gnd)
		    else:
			mi.tags = self.uniq(subjects_gnd)
		# 4: use only subjects_non_gnd
		elif self.cfg_fetch_subjects == 4:
		    mi.tags = self.uniq(subjects_non_gnd)
		# 5: use no subjects at all
		elif self.cfg_fetch_subjects == 5:
		    mi.tags = []

		# put the current result's metadata into the result queue
		log.info("Final formatted result: \n%s" % mi)
		result_queue.put(mi)
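The identify() method above also relies on two helpers that are not part of this excerpt: self.uniq(...) and self.removeSortingCharacters(...). Their real implementations are not shown here, but judging from how they are called, an order-preserving de-duplication and a strip of the DNB non-sort control characters along these lines would fit (a sketch under those assumptions, not the plugin's verified code):

    def uniq(self, seq):
        # Order-preserving de-duplication, matching how the tag lists are built above.
        seen = set()
        return [x for x in seq if not (x in seen or seen.add(x))]

    def removeSortingCharacters(self, text):
        # Strip the control characters (0x98/0x9C) that delimit non-sorting prefixes
        # in DNB records, so they do not leak into the displayed metadata.
        if text is None:
            return None
        return ''.join(c for c in text if ord(c) not in (0x98, 0x9c))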
Beispiel #28
0
    def parse_details(self, root):
        try:
            moly_id = self.parse_moly_id(self.url)
            self.log.info('Parsed moly.hu identifier: %s' % moly_id)
        except:
            self.log.exception(
                'Error parsing moly.hu id for url: %r' % self.url)
            moly_id = None

        try:
            title = self.parse_title(root)
            self.log.info('Parsed title: %s' % title)
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)
            title = None

        try:
            authors = self.parse_authors(root)
            self.log.info('Parsed authors: %s' % authors)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            authors = []

        if not title or not authors or not moly_id:
            self.log.error(
                'Could not find title/authors/moly.hu id for %r' % self.url)
            self.log.error('Moly.hu id: %r Title: %r Authors: %r' %
                           (moly_id, title, authors))
            return

        mi = Metadata(title, authors)
        mi.set_identifier('moly_hu', moly_id)
        self.moly_id = moly_id

        try:
            isbn = self.parse_isbn(root)
            self.log.info('Parsed ISBN: %s' % isbn)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r' % self.url)

        try:
            series_info = self.parse_series(root)
            if series_info is not None:
                mi.series = series_info[0]
                mi.series_index = int(series_info[1])
                self.log.info('Parsed series: %s, series index: %f' %
                              (mi.series, mi.series_index))
        except:
            self.log.exception('Error parsing series for url: %r' % self.url)

        try:
            mi.comments = self.parse_comments(root)
            self.log.info('Parsed comments: %s' % mi.comments)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)

        try:
            self.cover_url = self.parse_covers(root)
            self.log.info('Parsed URL for cover: %r' % self.cover_url)
            self.plugin.cache_identifier_to_cover_url(
                self.moly_id, self.cover_url)
            mi.has_cover = bool(self.cover_url)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)

        try:
            mi.tags = self.parse_tags(root)
            self.log.info('Parsed tags: %s' % mi.tags)
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        try:
            mi.languages = self.parse_languages(mi.tags)
            self.log.info('Parsed languages: %r' % mi.languages)
        except:
            self.log.exception('Error parsing language for url: %r' % self.url)

        try:
            mi.publisher = self.parse_publisher(root)
            self.log.info('Parsed publisher: %s' % mi.publisher)
        except:
            self.log.exception(
                'Error parsing publisher for url: %r' % self.url)

        try:
            mi.pubdate = self.parse_published_date(root)
            self.log.info('Parsed publication date: %s' % mi.pubdate)
        except:
            self.log.exception(
                'Error parsing published date for url: %r' % self.url)

        try:
            mi.rating = self.parse_rating(root)
            self.log.info('Parsed rating: %s\n\n' % mi.rating)
        except:
            self.log.exception('Error parsing rating for url: %r\n\n' % self.url)

        mi.source_relevance = self.relevance

        if self.moly_id and self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.moly_id)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
    def parse_details(self, root):
        try:
            isbn = self.extract_isbn(self.url)
        except:
            self.log.exception('No ISBN in URL: %r'%self.url)
            isbn = None

        try:
            (title, series, series_index) = self.parse_title_series(root)
        except:
            self.log.exception('Error parsing title and series for url: %r'%self.url)
            title = series = series_index = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r'%self.url)
            authors = []

        if not title or not authors or not isbn:
            self.log.error('Could not find title/authors/Aladin id for %r'%self.url)
            self.log.error('Aladin: %r Title: %r Authors: %r'%(isbn, title, authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        #mi.set_identifier('isbn', isbn)
        mi.isbn = isbn
        self.isbn = isbn

        # ISBN-13
        try:
            isbn = self.parse_isbn(root)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r'%self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r'%self.url)

        try:
            self.cover_url = self.parse_cover(root)
        except:
            self.log.exception('Error parsing cover for url: %r'%self.url)
        mi.has_cover = bool(self.cover_url)
        mi.cover_url = self.cover_url # This is purely so we can run a test for it!!!
        if mi.has_cover:
            self.log.info('Cover URL: '+mi.cover_url)

        try:
            mi.publisher = self.parse_publisher(root)
        except:
            self.log.exception('Error parsing publisher for url: %r'%self.url)

        try:
            mi.pubdate = self.parse_published_date(root)
        except:
            self.log.exception('Error parsing published date for url: %r'%self.url)

        mi.language = 'ko'

        mi.source_relevance = self.relevance

        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
Beispiel #30
0
    def parse_details(self, root):
        try:
            kyobobook_id = self.parse_kyobobook_id(self.url)
        except:
            self.log.exception('Error parsing Kyobobook id for url: %r'%self.url)
            kyobobook_id = None
        
        try:
            (title, series, series_index) = self.parse_title_series(root)
        except:
            self.log.exception('Error parsing title and series for url: %r'%self.url)
            title = series = series_index = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r'%self.url)
            authors = []

        if not title or not authors or not kyobobook_id:
            self.log.error('Could not find title/authors/kyobobook id for %r'%self.url)
            self.log.error('Kyobobook: %r Title: %r Authors: %r'%(kyobobook_id, title,
                authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        mi.set_identifier('kyobobook', kyobobook_id)
        self.kyobobook_id = kyobobook_id

        try:
            isbn = self.parse_isbn(root)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r'%self.url)

        try:
            mi.rating = self.parse_rating(root)
        except:
            self.log.exception('Error parsing ratings for url: %r'%self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r'%self.url)

        try:
            self.cover_url = self.parse_cover(root)
        except:
            self.log.exception('Error parsing cover for url: %r'%self.url)
        mi.has_cover = bool(self.cover_url)

        try:
            tags = self.parse_tags(root)
            if tags:
                mi.tags = tags
        except:
            self.log.exception('Error parsing tags for url: %r'%self.url)

        try:
            mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
        except:
            self.log.exception('Error parsing publisher and date for url: %r'%self.url)

        try:
            lang = self._parse_language(root)
            if lang:
                mi.language = lang
        except:
            self.log.exception('Error parsing language for url: %r'%self.url)

        mi.source_relevance = self.relevance

        if self.kyobobook_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.kyobobook_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(self.kyobobook_id,
                        self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
    def get_details(self):
        self.log.info("    Worker.get_details:")
        self.log.info("        self:     ", self)
        self.log.info("        self.url: ", self.url)
        
        # We should not even be here if we are not processing an ebook hit
        if self.url.find("/ebook/") == -1:
            return

        try:
            raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
                self.log.error('URL malformed: %r' % self.url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = 'Beam Ebooks timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r' % self.url
                self.log.exception(msg)
            return

        # raw = raw.decode('utf-8', errors='replace')
        raw = raw.decode('iso-8859-1', errors='replace')
        # open('D:\\work\\calibre-dump-book-details.html', 'wb').write(raw)

        if '<title>404 - ' in raw:
            self.log.error('URL malformed: %r' % self.url)
            return

        try:
            # root = fromstring(clean_ascii_chars(raw))
            root = fromstring(raw)
        except:
            msg = 'Failed to parse beam ebooks details page: %r' % self.url
            self.log.exception(msg)
            return

        try:
            self.beam_ebooks_id = self.parse_beam_ebooks_id(self.url)
        except:
            self.log.exception('Error parsing beam ebooks id for url: %r' % self.url)
            self.beam_ebooks_id = None

        try:
            (self.title, self.series_index) = self.parse_title(root)
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)
            self.title = None
            self.series_index = None

        try:
            self.authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            self.authors = None

        mi = Metadata(self.title, self.authors)
        mi.set_identifier('beam-ebooks', self.beam_ebooks_id)

        if self.series_index:
            mi.series_index = float(self.series_index)
        
        self._determine_perry_rhodan_cycle_name(mi)

        mi.source_relevance = self.relevance

        self.plugin.clean_downloaded_metadata(mi)

        print(mi)
        self.result_queue.put(mi)        
    def load_details(self, url, timeout):
        def _format_item(text):
            return re.sub('^"(.*)"$', '\\1', unescape(text))

        def _format_list(text):
            return [_.strip() for _ in _format_item(text).split(',')]

        def _find_meta(node, property):
            return [_.get('content') for _ in node if _.get('property') == property][0]

        def _format_date(date_text):
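            # date_text is an 8-digit YYYYMMDD string taken from the schema.org 'datePublished' field.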
            year = int(date_text[0:4])
            month = int(date_text[4:6]) 
            day = int(date_text[6:])
            return datetime.datetime(year, month, day, tzinfo=utc_tz)

        try:
            response = self.browser.open(url, timeout=timeout)
            root = lxml.html.fromstring(response.read())

            # <meta> tag에서 불러오는 항목
            # 책ID, 제목, ISBN, 이미지URL, 평점
            meta = root.xpath('//meta[starts-with(@property, "og") or starts-with(@property, "books")]')

            # schema.org JSON에서 불러오는 항목
            # 제목, 저자, 책소개, 출판사
            ld_json = root.xpath('//script[@type="application/ld+json"]/text()')
            ld = [json.loads(_) for _ in ld_json]
            book_info = [_ for _ in ld if _['@type'] == 'Book'][0]
        except Exception as e:
            self.log.exception(e)

        ridibooks_id = re.search('id=([0-9]+)', url).group(1)
        isbn = _find_meta(meta, 'books:isbn')
        cover_url = _find_meta(meta, 'og:image')

        title = _find_meta(meta, 'og:title')
        authors = _format_list(book_info['author']['name'])
        if 'translator' in book_info:
            authors.extend([_ + u'(역자)' for _ in _format_list(book_info['translator']['name'])])

        mi = Metadata(title, authors)
        mi.set_identifier('ridibooks', ridibooks_id)

        mi.cover_url = cover_url
        mi.has_cover = bool(cover_url)

        mi.publisher = _format_item(book_info['publisher']['name'])
        mi.pubdate = _format_date(book_info['datePublished'])

        mi.comments = _format_item(book_info['description'])
        mi.rating = float(_find_meta(meta, 'books:rating:normalized_value'))

        series = re.search(u'(.*)\s*(\d+)권', title)
        if series:
            mi.series = series.group(1)
            mi.series_index = float(series.group(2))

        mi.language = 'Korean'
        mi.source_relevance = self.relevance

        if ridibooks_id:
            if isbn:
                self.plugin.cache_isbn_to_identifier(isbn, ridibooks_id)
            if cover_url:
                self.plugin.cache_identifier_to_cover_url(ridibooks_id, cover_url)

        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
Beispiel #33
0
    def load_details(self, url, timeout):
        def _format_item(text):
            return re.sub('^"(.*)"$', '\\1', unescape(text))

        def _format_list(text):
            return [_.strip() for _ in _format_item(text).split(',')]

        def _find_meta(node, property):
            return [
                _.get('content') for _ in node if _.get('property') == property
            ][0]

        def _format_date(date_text):
            year = int(date_text[0:4])
            month = int(date_text[4:6])
            day = int(date_text[6:])
            return datetime.datetime(year, month, day, tzinfo=utc_tz)

        try:
            response = self.browser.open(url, timeout=timeout)
            root = lxml.html.fromstring(response.read())

            # <meta> tag에서 불러오는 항목
            # 책ID, 제목, ISBN, 이미지URL, 평점
            meta = root.xpath(
                '//meta[starts-with(@property, "og") or starts-with(@property, "books")]'
            )

            # schema.org JSON에서 불러오는 항목
            # 제목, 저자, 책소개, 출판사
            ld_json = root.xpath(
                '//script[@type="application/ld+json"]/text()')
            ld = [json.loads(_) for _ in ld_json]
            book_info = [_ for _ in ld if _['@type'] == 'Book'][0]
        except Exception as e:
            self.log.exception(e)

        ridibooks_id = re.search('id=([0-9]+)', url).group(1)
        isbn = _find_meta(meta, 'books:isbn')
        cover_url = _find_meta(meta, 'og:image')

        title = _find_meta(meta, 'og:title')
        authors = _format_list(book_info['author']['name'])
        if 'translator' in book_info:
            authors.extend([
                _ + u'(역자)'
                for _ in _format_list(book_info['translator']['name'])
            ])

        mi = Metadata(title, authors)
        mi.set_identifier('ridibooks', ridibooks_id)

        mi.cover_url = cover_url
        mi.has_cover = bool(cover_url)

        mi.publisher = _format_item(book_info['publisher']['name'])
        mi.pubdate = _format_date(book_info['datePublished'])

        mi.comments = _format_item(book_info['description'])
        mi.rating = float(_find_meta(meta, 'books:rating:normalized_value'))

        series = re.search(u'(.*)\s*(\d+)권', title)
        if series:
            mi.series = series.group(1)
            mi.series_index = float(series.group(2))

        mi.language = 'Korean'
        mi.source_relevance = self.relevance

        if ridibooks_id:
            if isbn:
                self.plugin.cache_isbn_to_identifier(isbn, ridibooks_id)
            if cover_url:
                self.plugin.cache_identifier_to_cover_url(
                    ridibooks_id, cover_url)

        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
Beispiel #34
0
    def merge(self, results, min_year, do_asr=True):
        ans = Metadata(_('Unknown'))

        # We assume the shortest title has the least cruft in it
        ans.title = self.length_merge('title', results, null_value=ans.title)

        # No harm in having extra authors, maybe something useful like an
        # editor or translator
        ans.authors = self.length_merge('authors', results,
                null_value=ans.authors, shortest=False)

        # We assume the shortest publisher has the least cruft in it
        ans.publisher = self.length_merge('publisher', results,
                null_value=ans.publisher)

        # We assume the smallest set of tags has the least cruft in it
        ans.tags = self.length_merge('tags', results,
                null_value=ans.tags, shortest=msprefs['fewer_tags'])

        # We assume the longest series has the most info in it
        ans.series = self.length_merge('series', results,
                null_value=ans.series, shortest=False)
        for r in results:
            if r.series and r.series == ans.series:
                ans.series_index = r.series_index
                break

        # Average the rating over all sources
        ratings = []
        for r in results:
            rating = r.rating
            if rating and rating > 0 and rating <= 5:
                ratings.append(rating)
        if ratings:
            ans.rating = int(round(sum(ratings)/len(ratings)))

        # Smallest language is likely to be valid
        ans.language = self.length_merge('language', results,
                null_value=ans.language)

        # Choose longest comments
        ans.comments = self.length_merge('comments', results,
                null_value=ans.comments, shortest=False)

        # Published date
        if min_year:
            for r in results:
                year = getattr(r.pubdate, 'year', None)
                if year == min_year:
                    ans.pubdate = r.pubdate
                    break
            if getattr(ans.pubdate, 'year', None) == min_year:
                min_date = datetime(min_year, ans.pubdate.month, ans.pubdate.day,
                                    tzinfo=utc_tz)
            else:
                min_date = datetime(min_year, 1, 2, tzinfo=utc_tz)
            ans.pubdate = min_date
        else:
            min_date = datetime(3001, 1, 1, tzinfo=utc_tz)
            for r in results:
                if r.pubdate is not None:
                    candidate = as_utc(r.pubdate)
                    if candidate < min_date:
                        min_date = candidate
            if min_date.year < 3000:
                ans.pubdate = min_date

        # Identifiers
        for r in results:
            ans.identifiers.update(r.identifiers)

        # Cover URL
        ans.has_cached_cover_url = bool([r for r in results if
            getattr(r, 'has_cached_cover_url', False)])

        # Merge any other fields with no special handling (random merge)
        touched_fields = set()
        for r in results:
            if hasattr(r, 'identify_plugin'):
                touched_fields |= r.identify_plugin.touched_fields

        for f in touched_fields:
            if f.startswith('identifier:') or not ans.is_null(f):
                continue
            setattr(ans, f, self.random_merge(f, results,
                null_value=getattr(ans, f)))

        if do_asr:
            avg = [x.relevance_in_source for x in results]
            avg = sum(avg)/len(avg)
            ans.average_source_relevance = avg

        return ans
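merge() in turn calls length_merge() and random_merge(), which are not included in this excerpt. Conceptually, length_merge picks the candidate value with the shortest (or, with shortest=False, the longest) textual representation among all non-null results; a minimal sketch under that assumption (not the verified implementation) could look like this:

    def length_merge(self, attr, results, null_value=None, shortest=True):
        # Collect non-null candidate values across all source results.
        values = [getattr(r, attr) for r in results if not r.is_null(attr)]
        if not values:
            return null_value
        # Pick the shortest (default) or longest value by string length.
        return sorted(values, key=lambda x: len(u'%s' % x), reverse=not shortest)[0]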
Beispiel #35
0
 def set_mi(self, mi, fm):
     '''
     This sets the metadata for the test result books table. It doesn't reset
     the contents of the field selectors for editing rules.
     '''
     self.fm = fm
     if mi:
         if not isinstance(mi, list):
             mi = (mi, )
     else:
         mi = Metadata(_('Title'), [_('Author')])
         mi.author_sort = _('Author Sort')
         mi.series = ngettext('Series', 'Series', 1)
         mi.series_index = 3
         mi.rating = 4.0
         mi.tags = [_('Tag 1'), _('Tag 2')]
         mi.languages = ['eng']
         mi.id = 1
         if self.fm is not None:
             mi.set_all_user_metadata(self.fm.custom_field_metadata())
         else:
             # No field metadata. Grab a copy from the current library so
             # that we can validate any custom column names. The values for
             # the columns will all be empty, which in some very unusual
             # cases might cause formatter errors. We can live with that.
             from calibre.gui2.ui import get_gui
             fm = get_gui().current_db.new_api.field_metadata
             mi.set_all_user_metadata(fm.custom_field_metadata())
         for col in mi.get_all_user_metadata(False):
             if fm[col]['datatype'] == 'datetime':
                 mi.set(col, DEFAULT_DATE)
             elif fm[col]['datatype'] in ('int', 'float', 'rating'):
                 mi.set(col, 2)
             elif fm[col]['datatype'] == 'bool':
                 mi.set(col, False)
             elif fm[col]['is_multiple']:
                 mi.set(col, (col, ))
             else:
                 mi.set(col, col, 1)
         mi = (mi, )
     self.mi = mi
     tv = self.template_value
     tv.setColumnCount(2)
     tv.setHorizontalHeaderLabels((_('Book title'), _('Template value')))
     tv.horizontalHeader().setStretchLastSection(True)
     tv.horizontalHeader().sectionResized.connect(self.table_column_resized)
     tv.setRowCount(len(mi))
     # Set the height of the table
     h = tv.rowHeight(0) * min(len(mi), 5)
     h += 2 * tv.frameWidth() + tv.horizontalHeader().height()
     tv.setMinimumHeight(h)
     tv.setMaximumHeight(h)
     # Set the size of the title column
     if self.table_column_widths:
         tv.setColumnWidth(0, self.table_column_widths[0])
     else:
         tv.setColumnWidth(0, tv.fontMetrics().averageCharWidth() * 10)
     tv.setSelectionBehavior(QAbstractItemView.SelectionBehavior.SelectRows)
     tv.setRowCount(len(mi))
     # Use our own widget to get rid of elision. setTextElideMode() doesn't work
     for r in range(0, len(mi)):
         w = QLineEdit(tv)
         w.setReadOnly(True)
         tv.setCellWidget(r, 0, w)
         w = QLineEdit(tv)
         w.setReadOnly(True)
         tv.setCellWidget(r, 1, w)
     self.display_values('')
Beispiel #36
0
    def parse_book_page(self, url):
        # TODO: Support for login-based rating fetching
        # TODO: Move all parsing logic to methods in order to avoid dangling variables
        # TODO: Saving metadata in custom columns
        # TODO: Configurable embedding metadata in comment
        # TODO: missing items
        # original language, first polish publish date, publisher serie, form

        self.log.info('INFO: Downloading book page: {}'.format(url))
        root_tag = self.get_lxml_root(url)

        if not root_tag:
            return None

        book_tag = self.get_book_tag(root_tag)

        if self.prefs['title']:
            book_title = self.parse_title(root_tag, book_tag, url)
        else:
            book_title = self.title

        if self.prefs['authors']:
            book_authors = self.parse_authors(root_tag, book_tag, url)
        else:
            book_authors = self.authors

        mi = Metadata(book_title, book_authors)
        additional_meta = {}

        if self.enabled('languages'):
            languages = self.parse_languages(root_tag, book_tag, url)
            if languages:
                mi.languages = languages

        if self.enabled('rating'):
            rating = self.parse_rating(root_tag, book_tag, url)
            if rating is not None:
                mi.rating = rating

        if self.enabled('tags'):
            tags = self.parse_tags(root_tag, book_tag, url)
            if tags:
                mi.tags = tags

        if self.enabled('identifier'):
            identifier = self.parse_identifier(root_tag, book_tag, url)
            if identifier:
                mi.set_identifier(IDENTIFIER, identifier)

        if self.enabled('pubdate'):
            pubdate = self.parse_pubdate(root_tag, book_tag, url)
            if pubdate:
                mi.pubdate = pubdate

        if self.enabled('covers'):
            covers = self.parse_covers(root_tag, book_tag, url)
            if covers:
                mi.has_cover = True
                self.plugin.cached_identifier_to_cover_url('urls').extend(
                    covers)
            else:
                self.plugin.cache_identifier_to_cover_url('nocover', True)
                # TODO: is this necessary?

        if self.enabled('series'):
            series = self.parse_series(root_tag, book_tag, url)
            if series:
                additional_meta['series'] = [
                    self.get_series_string(name, index)
                    for name, index in series
                ]
                name, index = series[0]
                mi.series = name
                if index is not None:
                    mi.series_index = index

        if self.enabled('translators'):
            translators = self.parse_translators(root_tag, book_tag, url)
            if translators:
                additional_meta['translators'] = translators

        if self.enabled('original_title'):
            original_title = self.parse_original_title(root_tag, book_tag, url)
            if original_title:
                additional_meta['original_title'] = original_title

        if self.enabled('categories'):
            categories = self.parse_categories(root_tag, book_tag, url)
            if categories:
                additional_meta['categories'] = categories

        if self.enabled('genres'):
            genres = self.parse_genres(root_tag, book_tag, url)
            if genres:
                additional_meta['genres'] = genres

        if self.enabled('comments'):
            comments = self.parse_comments(root_tag, book_tag, url) or ''
            additional_comments = self.format_additional_comment(
                additional_meta)

            if comments or additional_comments:
                mi.comments = comments + additional_comments

        self.log.info('INFO: Parsing book page completed')

        return mi
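    # Note (not part of the original snippet): the enabled() helper used throughout
    # parse_book_page is not shown above. A minimal sketch, assuming the preference keys
    # simply mirror the option names, could look like this:
    def enabled(self, key):
        # Hypothetical helper: a missing or falsy preference means the field is disabled.
        return bool(self.prefs.get(key))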
Beispiel #37
0
    def merge(self, results, min_year, do_asr=True):
        ans = Metadata(_('Unknown'))

        # We assume the shortest title has the least cruft in it
        ans.title = self.length_merge('title', results, null_value=ans.title)

        # No harm in having extra authors, maybe something useful like an
        # editor or translator
        ans.authors = self.length_merge('authors',
                                        results,
                                        null_value=ans.authors,
                                        shortest=False)

        # We assume the shortest publisher has the least cruft in it
        ans.publisher = self.length_merge('publisher',
                                          results,
                                          null_value=ans.publisher)

        # We assume the smallest set of tags has the least cruft in it
        ans.tags = self.length_merge('tags',
                                     results,
                                     null_value=ans.tags,
                                     shortest=msprefs['fewer_tags'])

        # We assume the longest series has the most info in it
        ans.series = self.length_merge('series',
                                       results,
                                       null_value=ans.series,
                                       shortest=False)
        for r in results:
            if r.series and r.series == ans.series:
                ans.series_index = r.series_index
                break

        # Average the rating over all sources
        ratings = []
        for r in results:
            rating = r.rating
            if rating and rating > 0 and rating <= 5:
                ratings.append(rating)
        if ratings:
            ans.rating = int(round(sum(ratings) / len(ratings)))

        # Smallest language is likely to be valid
        ans.language = self.length_merge('language',
                                         results,
                                         null_value=ans.language)

        # Choose longest comments
        ans.comments = self.length_merge('comments',
                                         results,
                                         null_value=ans.comments,
                                         shortest=False)

        # Published date
        if min_year:
            for r in results:
                year = getattr(r.pubdate, 'year', None)
                if year == min_year:
                    ans.pubdate = r.pubdate
                    break
            if getattr(ans.pubdate, 'year', None) == min_year:
                min_date = datetime(min_year,
                                    ans.pubdate.month,
                                    ans.pubdate.day,
                                    tzinfo=utc_tz)
            else:
                min_date = datetime(min_year, 1, 2, tzinfo=utc_tz)
            ans.pubdate = min_date
        else:
            min_date = datetime(3001, 1, 1, tzinfo=utc_tz)
            for r in results:
                if r.pubdate is not None:
                    candidate = as_utc(r.pubdate)
                    if candidate < min_date:
                        min_date = candidate
            if min_date.year < 3000:
                ans.pubdate = min_date

        # Identifiers
        for r in results:
            ans.identifiers.update(r.identifiers)

        # Cover URL
        ans.has_cached_cover_url = bool(
            [r for r in results if getattr(r, 'has_cached_cover_url', False)])

        # Merge any other fields with no special handling (random merge)
        touched_fields = set()
        for r in results:
            if hasattr(r, 'identify_plugin'):
                touched_fields |= r.identify_plugin.touched_fields

        for f in touched_fields:
            if f.startswith('identifier:') or not ans.is_null(f):
                continue
            setattr(ans, f,
                    self.random_merge(f, results, null_value=getattr(ans, f)))

        if do_asr:
            avg = [x.relevance_in_source for x in results]
            avg = sum(avg) / len(avg)
            ans.average_source_relevance = avg

        return ans
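    # Note (illustration only, not the original code): length_merge() is not shown above.
    # A minimal sketch, assuming it simply picks the shortest (or longest) non-null value
    # of the given field across all results:
    def length_merge(self, field, results, null_value=None, shortest=True):
        values = [getattr(r, field) for r in results if not r.is_null(field)]
        if not values:
            return null_value
        # Shortest value first by default; pass shortest=False to prefer the longest.
        values.sort(key=len, reverse=not shortest)
        return values[0]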
Beispiel #38
0
    def parse_details(self, root):
        try:
            kyobobook_id = self.parse_kyobobook_id(self.url)
        except:
            self.log.exception('Error parsing Kyobobook id for url: %r' %
                               self.url)
            kyobobook_id = None

        try:
            (title, series, series_index) = self.parse_title_series(root)
        except:
            self.log.exception('Error parsing title and series for url: %r' %
                               self.url)
            title = series = series_index = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            authors = []

        if not title or not authors or not kyobobook_id:
            self.log.error('Could not find title/authors/kyobobook id for %r' %
                           self.url)
            self.log.error('Kyobobook: %r Title: %r Authors: %r' %
                           (kyobobook_id, title, authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        mi.set_identifier('kyobobook', kyobobook_id)
        self.kyobobook_id = kyobobook_id

        try:
            isbn = self.parse_isbn(root)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r' % self.url)

        try:
            mi.rating = self.parse_rating(root)
        except:
            self.log.exception('Error parsing ratings for url: %r' % self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)

        try:
            self.cover_url = self.parse_cover(root)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)
        mi.has_cover = bool(self.cover_url)

        try:
            tags = self.parse_tags(root)
            if tags:
                mi.tags = tags
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        try:
            mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
        except:
            self.log.exception('Error parsing publisher and date for url: %r' %
                               self.url)

        try:
            lang = self._parse_language(root)
            if lang:
                mi.language = lang
        except:
            self.log.exception('Error parsing language for url: %r' % self.url)

        mi.source_relevance = self.relevance

        if self.kyobobook_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn,
                                                     self.kyobobook_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(
                    self.kyobobook_id, self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
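    # Note (sketch only, not part of the original code): the repeated try/except blocks
    # above could be collapsed with a small helper that logs the failure and returns a
    # default value. Something along these lines, assuming a zero-argument callable:
    def safe_parse(self, fn, default=None, what='field'):
        # Hypothetical helper, e.g. mi.rating = self.safe_parse(lambda: self.parse_rating(root), what='ratings')
        try:
            return fn()
        except Exception:
            self.log.exception('Error parsing %s for url: %r' % (what, self.url))
            return default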
Beispiel #39
0
    def __init__(self,
                 parent,
                 text,
                 mi=None,
                 fm=None,
                 color_field=None,
                 icon_field_key=None,
                 icon_rule_kind=None,
                 doing_emblem=False,
                 text_is_placeholder=False,
                 dialog_is_st_editor=False,
                 global_vars=None,
                 all_functions=None,
                 builtin_functions=None):
        QDialog.__init__(self, parent)
        Ui_TemplateDialog.__init__(self)
        self.setupUi(self)

        self.coloring = color_field is not None
        self.iconing = icon_field_key is not None
        self.embleming = doing_emblem
        self.dialog_is_st_editor = dialog_is_st_editor
        if global_vars is None:
            self.global_vars = {}
        else:
            self.global_vars = global_vars

        cols = []
        if fm is not None:
            for key in sorted(
                    displayable_columns(fm),
                    key=lambda k: sort_key(fm[k]['name']
                                           if k != color_row_key else 0)):
                if key == color_row_key and not self.coloring:
                    continue
                from calibre.gui2.preferences.coloring import all_columns_string
                name = all_columns_string if key == color_row_key else fm[key][
                    'name']
                if name:
                    cols.append((name, key))

        self.color_layout.setVisible(False)
        self.icon_layout.setVisible(False)

        if self.coloring:
            self.color_layout.setVisible(True)
            for n1, k1 in cols:
                self.colored_field.addItem(
                    n1 + (' (' + k1 + ')' if k1 != color_row_key else ''), k1)
            self.colored_field.setCurrentIndex(
                self.colored_field.findData(color_field))
        elif self.iconing or self.embleming:
            self.icon_layout.setVisible(True)
            if self.embleming:
                self.icon_kind_label.setVisible(False)
                self.icon_kind.setVisible(False)
                self.icon_chooser_label.setVisible(False)
                self.icon_field.setVisible(False)

            for n1, k1 in cols:
                self.icon_field.addItem('{} ({})'.format(n1, k1), k1)
            self.icon_file_names = []
            d = os.path.join(config_dir, 'cc_icons')
            if os.path.exists(d):
                for icon_file in os.listdir(d):
                    icon_file = icu_lower(icon_file)
                    if os.path.exists(os.path.join(d, icon_file)):
                        if icon_file.endswith('.png'):
                            self.icon_file_names.append(icon_file)
            self.icon_file_names.sort(key=sort_key)
            self.update_filename_box()

            if self.iconing:
                dex = 0
                from calibre.gui2.preferences.coloring import icon_rule_kinds
                for i, tup in enumerate(icon_rule_kinds):
                    txt, val = tup
                    self.icon_kind.addItem(txt, userData=(val))
                    if val == icon_rule_kind:
                        dex = i
                self.icon_kind.setCurrentIndex(dex)
                self.icon_field.setCurrentIndex(
                    self.icon_field.findData(icon_field_key))

        if dialog_is_st_editor:
            self.buttonBox.setVisible(False)
        else:
            self.new_doc_label.setVisible(False)
            self.new_doc.setVisible(False)
            self.template_name_label.setVisible(False)
            self.template_name.setVisible(False)

        if mi:
            if not isinstance(mi, list):
                mi = (mi, )
        else:
            mi = Metadata(_('Title'), [_('Author')])
            mi.author_sort = _('Author Sort')
            mi.series = ngettext('Series', 'Series', 1)
            mi.series_index = 3
            mi.rating = 4.0
            mi.tags = [_('Tag 1'), _('Tag 2')]
            mi.languages = ['eng']
            mi.id = 1
            if fm is not None:
                mi.set_all_user_metadata(fm.custom_field_metadata())
            else:
                # No field metadata. Grab a copy from the current library so
                # that we can validate any custom column names. The values for
                # the columns will all be empty, which in some very unusual
                # cases might cause formatter errors. We can live with that.
                from calibre.gui2.ui import get_gui
                mi.set_all_user_metadata(get_gui(
                ).current_db.new_api.field_metadata.custom_field_metadata())
            for col in mi.get_all_user_metadata(False):
                mi.set(col, (col, ), 0)
            mi = (mi, )
        self.mi = mi

        # Set up the display table
        self.table_column_widths = None
        try:
            self.table_column_widths = \
                        gprefs.get('template_editor_table_widths', None)
        except:
            pass
        tv = self.template_value
        tv.setRowCount(len(mi))
        tv.setColumnCount(2)
        tv.setHorizontalHeaderLabels((_('Book title'), _('Template value')))
        tv.horizontalHeader().setStretchLastSection(True)
        tv.horizontalHeader().sectionResized.connect(self.table_column_resized)
        # Set the height of the table
        h = tv.rowHeight(0) * min(len(mi), 5)
        h += 2 * tv.frameWidth() + tv.horizontalHeader().height()
        tv.setMinimumHeight(h)
        tv.setMaximumHeight(h)
        # Set the size of the title column
        if self.table_column_widths:
            tv.setColumnWidth(0, self.table_column_widths[0])
        else:
            tv.setColumnWidth(0, tv.fontMetrics().averageCharWidth() * 10)
        # Use our own widget to get rid of elision. setTextElideMode() doesn't work
        for r in range(0, len(mi)):
            w = QLineEdit(tv)
            w.setReadOnly(True)
            tv.setCellWidget(r, 0, w)
            w = QLineEdit(tv)
            w.setReadOnly(True)
            tv.setCellWidget(r, 1, w)
        tv.setSelectionBehavior(QAbstractItemView.SelectionBehavior.SelectRows)

        # Remove help icon on title bar
        icon = self.windowIcon()
        self.setWindowFlags(self.windowFlags()
                            & (~Qt.WindowType.WindowContextHelpButtonHint))
        self.setWindowIcon(icon)

        self.all_functions = all_functions if all_functions else formatter_functions(
        ).get_functions()
        self.builtins = (builtin_functions if builtin_functions else
                         formatter_functions().get_builtins_and_aliases())

        self.last_text = ''
        self.highlighter = TemplateHighlighter(self.textbox.document(),
                                               builtin_functions=self.builtins)
        self.textbox.cursorPositionChanged.connect(self.text_cursor_changed)
        self.textbox.textChanged.connect(self.textbox_changed)
        self.textbox.setFont(self.get_current_font())

        self.textbox.setTabStopWidth(10)
        self.source_code.setTabStopWidth(10)
        self.documentation.setReadOnly(True)
        self.source_code.setReadOnly(True)

        if text is not None:
            if text_is_placeholder:
                self.textbox.setPlaceholderText(text)
                self.textbox.clear()
                text = ''
            else:
                self.textbox.setPlainText(text)
        else:
            text = ''
        self.buttonBox.button(QDialogButtonBox.StandardButton.Ok).setText(
            _('&OK'))
        self.buttonBox.button(QDialogButtonBox.StandardButton.Cancel).setText(
            _('&Cancel'))

        self.color_copy_button.clicked.connect(self.color_to_clipboard)
        self.filename_button.clicked.connect(self.filename_button_clicked)
        self.icon_copy_button.clicked.connect(self.icon_to_clipboard)

        try:
            with open(P('template-functions.json'), 'rb') as f:
                self.builtin_source_dict = json.load(f)
        except:
            self.builtin_source_dict = {}

        func_names = sorted(self.all_functions)
        self.function.clear()
        self.function.addItem('')
        for f in func_names:
            self.function.addItem(
                '{}  --  {}'.format(
                    f, self.function_type_string(f, longform=False)), f)
        self.function.setCurrentIndex(0)
        self.function.currentIndexChanged.connect(self.function_changed)
        self.display_values(text)
        self.rule = (None, '')

        tt = _('Template language tutorial')
        self.template_tutorial.setText(
            '<a href="%s">%s</a>' % (localize_user_manual_link(
                'https://manual.calibre-ebook.com/template_lang.html'), tt))
        tt = _('Template function reference')
        self.template_func_reference.setText(
            '<a href="%s">%s</a>' % (localize_user_manual_link(
                'https://manual.calibre-ebook.com/generated/en/template_ref.html'
            ), tt))

        s = gprefs.get('template_editor_break_on_print', False)
        self.go_button.setEnabled(s)
        self.remove_all_button.setEnabled(s)
        self.set_all_button.setEnabled(s)
        self.toggle_button.setEnabled(s)
        self.breakpoint_line_box.setEnabled(s)
        self.breakpoint_line_box_label.setEnabled(s)
        self.break_box.setChecked(s)
        self.break_box.stateChanged.connect(self.break_box_changed)
        self.go_button.clicked.connect(self.go_button_pressed)
        self.textbox.setFocus()
        self.set_up_font_boxes()
        self.toggle_button.clicked.connect(self.toggle_button_pressed)
        self.remove_all_button.clicked.connect(self.remove_all_button_pressed)
        self.set_all_button.clicked.connect(self.set_all_button_pressed)

        self.load_button.clicked.connect(self.load_template)
        self.save_button.clicked.connect(self.save_template)
        # Now geometry
        try:
            geom = gprefs.get('template_editor_dialog_geometry', None)
            if geom is not None:
                QApplication.instance().safe_restore_geometry(
                    self, QByteArray(geom))
        except Exception:
            pass
Beispiel #40
0
def get_metadata_(src, encoding=None):
    # Meta data definitions as in
    # https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9

    if isbytestring(src):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long
    comment_tags = parse_comment_tags(src)
    meta_tags = parse_meta_tags(src)

    def get(field):
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = ans.strip()
        if not ans:
            ans = None
        return ans

    # Title
    title = get('title')
    if not title:
        pat = re.compile('<title>([^<>]+?)</title>', re.IGNORECASE)
        match = pat.search(src)
        if match:
            title = replace_entities(match.group(1))

    # Author
    authors = get('authors') or _('Unknown')

    # Create MetaInformation with Title and Author
    mi = Metadata(title or _('Unknown'), string_to_authors(authors))

    for field in ('publisher', 'isbn', 'language', 'comments'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except:
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)

    # SERIES
    series = get('series')
    if series:
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except:
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            series_index = get('series_index')
            try:
                series_index = float(series_index)
            except:
                pass
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 5:
                mi.rating /= 2.
            if mi.rating > 5:
                mi.rating = 0
        except:
            pass

    # TAGS
    tags = get('tags')
    if tags:
        tags = [x.strip() for x in tags.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    return mi
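# Note (illustration only): parse_comment_tags() and parse_meta_tags() are not shown above.
# Assuming the mobileread-style convention referenced in the comment, where metadata is
# embedded in HTML comments such as <!-- TITLE="A Book" -->, a rough sketch of the
# comment-tag parser might look like this:
import re

def parse_comment_tags_sketch(src):
    tags = {}
    for key, val in re.findall(r'<!--\s*([A-Z_]+)\s*=\s*"([^"]*)"\s*-->', src):
        # Lowercase the keys so lookups like get('title') or get('authors') find them.
        tags[key.lower()] = val
    return tags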
Beispiel #41
0
    def extract_vol_details(self, vol_url):
        # Here we extract and format the information from the chosen volume.
        # - The first name and last name to populate author and author sort : vol_auteur_prenom and vol_auteur_nom
        # - The title of the volume                                         : vol_title
        # - The series name the volume is part of                           : vol_serie
        # - The sequence number in the series                               : vol_serie_seq                         # missing
        # - The editor of this volume                                       : vol_editor
        # - The editor's collection of this volume                          : vol_coll
        # - The collection serial code of this volume                       : vol_coll_srl
        # - The "dépôt légal" date (the publication date is largely unknown): vol_dp_lgl                            # date format to be computed
        # - The ISBN number associated with the volume                      : vol_isbn
        # - The volume tags                                                 : vol_genre
        # - The url pointer to the volume cover image                       : vol_cover_index
        # - The comments include various info about the book                : vol_comment_soup
        #   . reference, a url pointer to noosfere
        #   . couverture, a url pointer to noosfere; the cover may be really small, but it matches the volume exactly
        #   . first edition information
        #   . series (cycle) name and number
        #   . this volume's editor info
        #   . Résumé (the back-cover text, "quatrième de couverture")
        #   . Critiques
        #   . Sommaire detailing which novels are in the volume when it is an anthology
        #   . Critiques about the series and/or about another volume of the book
        #

        debug = self.dbg_lvl & 2
        self.log.info(self.who, "\nIn extract_vol_details(soup)")
        if debug:
            self.log.info(self.who, "vol_url       : ", vol_url)

        if debug:
            self.log.info(
                self.who,
                "calling ret_soup(log, dbg_lvl, br, url, rkt=None, who='[__init__]')"
            )
            self.log.info(self.who, "vol_url : ", vol_url, "who : ", self.who)
        rsp = ret_soup(self.log, self.dbg_lvl, self.br, vol_url, who=self.who)
        soup = rsp[0]
        url_vrai = rsp[1].replace("&Tri=3", "")
        #        if debug: self.log.info(self.who,soup.prettify())              # useful but too big...

        self.nsfr_id = self.nsfr_id + "$vl$" + url_vrai.replace(
            '?', '&').replace('=', '&').split('&')[2]
        # self.nsfr_id = (self.nfsr_id).strip("$")                        # This one-liner failed with "'Worker' object has no attribute 'nfsr_id'" because the attribute name is misspelled ('nfsr_id' instead of 'nsfr_id'); the two-step strip below avoids it.
        tmp = self.nsfr_id
        self.nsfr_id = tmp.strip('$')

        if debug:
            self.log.info(self.who, "self.nsfr_id, type() : ", self.nsfr_id,
                          type(self.nsfr_id))

        tmp_lst = []
        vol_info = {}
        vol_title = ""
        vol_auteur = ""
        vol_auteur_prenom = ""
        vol_auteur_nom = ""
        vol_serie = ""
        vol_serie_seq = ""
        vol_editor = ""
        vol_coll = ""
        vol_coll_srl = ""
        vol_dp_lgl = ""
        vol_isbn = ""
        vol_genre = ""
        vol_cover_index = ""
        comment_generic = None
        comment_resume = None
        comment_Critiques = None
        comment_Sommaire = None
        comment_AutresCritique = None
        comment_cover = None
        comment_decoupage_annexe = None

        # add volume address as a reference in the comment
        vol_comment_soup = BS(
            '<div><p>Référence: <a href="' + url_vrai + '">' + url_vrai +
            '</a></p></div>', "lxml")
        if debug: self.log.info(self.who, "vol reference processed")

        if soup.select("span[class='TitreNiourf']"):
            vol_title = soup.select(
                "span[class='TitreNiourf']")[0].text.strip()
        if debug: self.log.info(self.who, "vol_title processed : ", vol_title)

        if soup.select("span[class='AuteurNiourf']"):
            vol_auteur = soup.select(
                "span[class='AuteurNiourf']")[0].text.replace("\n",
                                                              "").strip()
        if debug:
            self.log.info(self.who, "vol_auteur processed : ", vol_auteur)
        for i in range(len(vol_auteur.split())):
            if not vol_auteur.split()[i].isupper():
                vol_auteur_prenom += " " + vol_auteur.split()[i]
            else:
                vol_auteur_nom += " " + vol_auteur.split()[i].title()
        vol_auteur = vol_auteur.title()
        vol_auteur_prenom = vol_auteur_prenom.strip()
        if debug:
            self.log.info(self.who, "vol_auteur_prenom processed : ",
                          vol_auteur_prenom)
        vol_auteur_nom = vol_auteur_nom.strip()
        if debug:
            self.log.info(self.who, "vol_auteur_nom processed : ",
                          vol_auteur_nom)

        if soup.select("a[href*='serie.asp']"):
            if soup.select("a[href*='serie.asp']")[0].find_parent(
                    "span", {"class": "ficheNiourf"}):
                vol_serie = soup.select("a[href*='serie.asp']")[0].text
                tmp_vss = [
                    x for x in soup.select("a[href*='serie.asp']")
                    [0].parent.stripped_strings
                ]
                for i in range(len(tmp_vss)):
                    if "vol." in tmp_vss[i]:
                        if not vol_serie_seq:
                            vol_serie_seq = tmp_vss[i].replace("vol.",
                                                               "").strip()
                    if "découpage" in tmp_vss[i]:
                        dec_anx_url = "https://www.noosfere.org/livres/" + soup.select(
                            "a[href*='serie.asp']")[0]['href']
                        comment_pre_decoupage_annexe = BS(
                            '<div><p> </p><p style="font-weight: 600; font-size: 18px"> Découpage annexe</p><hr style="color:CCC;"/></div>',
                            "lxml")
                        comment_decoupage_annexe = self.get_decoupage_annexe(
                            dec_anx_url)
                if debug:
                    self.log.info(self.who,
                                  "vol_serie, vol_serie_seq processed : ",
                                  vol_serie, ",", vol_serie_seq)

        comment_generic = soup.select("span[class='ficheNiourf']")[0]
        new_div = soup.new_tag('div')
        comment_generic = comment_generic.wrap(new_div)
        if debug: self.log.info(self.who, "comment_generic processed")

        if soup.select("a[href*='editeur.asp']"):
            vol_editor = soup.select("a[href*='editeur.asp']")[0].text
        if debug:
            self.log.info(self.who, "vol_editor processed : ", vol_editor)

        if soup.select("a[href*='collection.asp']"):
            vol_coll = soup.select("a[href*='collection.asp']")[0].text
        if debug: self.log.info(self.who, "vol_coll : ", vol_coll)

        for i in comment_generic.stripped_strings:
            tmp_lst.append(str(i))
        vol_coll_srl = tmp_lst[len(tmp_lst) - 1]
        if "n°" in vol_coll_srl:
            for k in ["n°", "(", ")"]:
                if k in vol_coll_srl:
                    vol_coll_srl = vol_coll_srl.replace(k, "")
            vol_coll_srl = vol_coll_srl.strip()
            vol_coll_srl = vol_coll_srl.split("/")[0]
            if vol_coll_srl[0].isnumeric():
                vol_coll_srl = ("0" * 5 + vol_coll_srl)[-6:]
        else:
            vol_coll_srl = ""
        if debug:
            self.log.info(self.who, "vol_coll_srl processed : ", vol_coll_srl)

        # The publication date is largely ignored by noosfere, but we have the "dépôt légal" date and I use it instead.
        # Note that I 'calculate' the missing day of the month, and sometimes even the missing month.
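        # Mapping used below (a rough mid-period heuristic): a bare year maps to day-of-year 175
        # (late June); "Ne semestre YYYY" maps to day (N-1)*175 + 97, "Ne trimestre YYYY" to day
        # (N-1)*91 + 47, and a month name to day 10 + 31*i for month index i.
        # Worked example: "2e trimestre 1998" gives day (2-1)*91 + 47 = 138, i.e. 18 May 1998.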
        ms = ("janvier", "février", "mars", "avril", "mai", "juin", "juillet",
              "août", "septembre", "octobre", "novembre", "décembre")
        for elemnt in soup.select_one(
                "span[class='sousFicheNiourf']").stripped_strings:
            if debug: self.log.info(self.who, "elemnt : ", elemnt)
            if not vol_dp_lgl:
                elemn = (elemnt.replace("Dépôt légal :",
                                        "").split(','))[0].strip()
                if elemn:
                    if elemn.isnumeric() and len(elemn) == 4:
                        vol_dp_lgl = datetime.datetime.strptime(
                            "175 " + elemn, "%j %Y")
                    elif "semestre" in elemn:
                        ele = elemn.split()
                        vol_dp_lgl = datetime.datetime.strptime(
                            ("000" + str((int(ele[0][0]) - 1) * 175 + 97))[-3:]
                            + " " + ele[2], "%j %Y")
                    elif "trimestre" in elemn:
                        ele = elemn.split()
                        vol_dp_lgl = datetime.datetime.strptime(
                            ("000" + str((int(ele[0][0]) - 1) * 91 + 47))[-3:]
                            + " " + ele[2], "%j %Y")
                    else:
                        for i in range(len(ms)):
                            if ms[i] in elemn:
                                ele = elemn.split()
                                vol_dp_lgl = datetime.datetime.strptime(
                                    ("000" + str(10 + 31 * i))[-3:] + " " +
                                    ele[1], "%j %Y")
                                break
                    if debug:
                        self.log.info(self.who, "vol_dp_lgl : ", vol_dp_lgl)

            if "ISBN" in elemnt:
                vol_isbn = elemnt.lower().replace(" ", "").replace('isbn:', '')
                if "néant" in vol_isbn: vol_isbn = ""
                if debug:
                    self.log.info(self.who, "vol_isbn processed : ", vol_isbn)

            if "Genre" in elemnt:
                # lstrip() strips a set of characters rather than a prefix, which can eat the
                # start of the genre itself; remove the label explicitly instead.
                vol_genre = elemnt.replace("Genre :", "").strip()
                if debug:
                    self.log.info(self.who, "vol_genre processed : ",
                                  vol_genre)

        if soup.select("img[name='couverture']"):
            for elemnt in repr(
                    soup.select("img[name='couverture']")[0]).split('"'):
                if "http" in elemnt:
                    if not vol_cover_index:
                        vol_cover_index = elemnt
                        if debug:
                            self.log.info(self.who,
                                          "vol_cover_index processed : ",
                                          vol_cover_index)

        # add cover image address as a reference in the comment
        if vol_cover_index:
            comment_cover = BS(
                '<div><p>Couverture: <a href="' + vol_cover_index + '">' +
                vol_cover_index + '</a></p></div>', "lxml")

    # Select the fields I want... More exist, such as film adaptations or reading recommendations,
    # but they are not quite consistent across all the books (noosfere is a database maintained by many people)
    # and besides I have enough info as it is AND I do NOT want to take away noosfere's business.

        tmp_comm_lst = soup.select("span[class='AuteurNiourf']")
        if debug: self.log.info(self.who, tmp_comm_lst)  # useful but too long
        for i in range(len(tmp_comm_lst)):
            if "Quatrième de couverture" in str(tmp_comm_lst[i]):
                comment_resume = tmp_comm_lst[i].find_parents(
                    "div", {'class': 'sousbloc'})[0]
                if debug: self.log.info(self.who, "comment_resume processed")

            if "Critiques" in str(tmp_comm_lst[i]):
                if not "autres" in str(tmp_comm_lst[i]):
                    comment_Critiques = tmp_comm_lst[i].find_parents(
                        "div", {'class': 'sousbloc'})[0]
                    if debug:
                        self.log.info(self.who, "comment_Critiques processed")

            if "Sommaire" in str(tmp_comm_lst[i]):
                comment_Sommaire = tmp_comm_lst[i].find_parents(
                    "div", {'class': 'sousbloc'})[0]
                if debug: self.log.info(self.who, "comment_Sommaire processed")

            if "Critiques des autres" in str(tmp_comm_lst[i]):
                comment_AutresCritique = tmp_comm_lst[i].find_parents(
                    "div", {'class': 'sousbloc'})[0]

                if comment_AutresCritique.select('a[href*="serie.asp"]') and (
                        "Critique de la série" in comment_AutresCritique.
                        select('a[href*="serie.asp"]')[0].text):
                    critic_url = "https://www.noosfere.org/livres/" + comment_AutresCritique.select(
                        'a[href*="serie.asp"]')[0]['href']
                    try:
                        more_comment_AutresCritique = self.get_Critique_de_la_serie(
                            critic_url)
                        comment_AutresCritique.append(
                            more_comment_AutresCritique)
                    except:
                        self.log.exception(
                            "get_Critique_de_la_serie failed for url: ",
                            critic_url)

                if debug:
                    self.log.info(self.who, "comment_AutresCritique processed")

    # group in a big bundle all the fields I think I want... (It is difficult not to include more... :-))

        if comment_cover:
            vol_comment_soup.append(comment_cover)
        if comment_generic:
            vol_comment_soup.append(comment_generic)
        if comment_resume:
            vol_comment_soup.append(comment_resume)
        if comment_Critiques:
            vol_comment_soup.append(comment_Critiques)
        if comment_Sommaire:
            vol_comment_soup.append(comment_Sommaire)
        if comment_AutresCritique:
            vol_comment_soup.append(comment_AutresCritique)
        if comment_decoupage_annexe:
            vol_comment_soup.append(
                comment_pre_decoupage_annexe)  # this is the title
            vol_comment_soup.append(comment_decoupage_annexe)

    #
    # Make a minimum of "repair" over vol_comment_soup so that it displays correctly (the way I like it) in the comments and in my catalogs
    # - I hate justified text when it makes the margins "float" around the correct position (in fact, when spaces are used instead of absolute positioning)
    # - I like to have working urls when they exist
    # - I like to find out the next and/or previous books in a series (the simulated arrows are links :-) )

        for elemnt in vol_comment_soup.select('[align="justify"]'):
            del elemnt['align']

    # remove all double or triple 'br' tags to improve presentation.
    # Note: tmp1 and tmp2 must start out different from any possible first element (yes, I am lrp and I am unique :-) )
    #
    # Yeah, so what: if I modify comment_generic AFTER it has been integrated into vol_comment_soup,
    # there is only one version in memory... so vol_comment_soup gets modified as well...
    #

        tmp1 = tmp2 = "lrp_the_unique"
        for elemnt in vol_comment_soup.findAll():
            tmp1, tmp2 = tmp2, elemnt
            if tmp1 == tmp2:
                elemnt.extract()

        br = soup.new_tag('br')
        for elemnt in vol_comment_soup.select('.AuteurNiourf'):
            elemnt.insert(0, br)
            elemnt["style"] = "font-weight: 600; font-size: 18px"

        if debug:
            for elemnt in vol_comment_soup.select("a[href*='.asp']"):
                if 'http' not in elemnt.get('href'):
                    self.log.info(self.who, "url incomplet avant correction: ",
                                  elemnt)

        for elemnt in vol_comment_soup.select("a[href*='/livres/auteur.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "/livres/auteur.asp",
                    "https://www.noosfere.org/livres/auteur.asp")
        for elemnt in vol_comment_soup.select("a[href*='/livres/niourf.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "/livres/niourf.asp",
                    "https://www.noosfere.org/livres/niourf.asp")
        for elemnt in vol_comment_soup.select("a[href*='/heberg/']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "/heberg/", "https://www.noosfere.org/heberg/")

        for elemnt in vol_comment_soup.select(
                "a[href*='./EditionsLivre.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "./EditionsLivre.asp",
                    "https://www.noosfere.org/livres/EditionsLivre.asp")
        for elemnt in vol_comment_soup.select("a[href*='./niourf.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "./niourf.asp",
                    "https://www.noosfere.org/livres/niourf.asp")
        for elemnt in vol_comment_soup.select("a[href*='heberg']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "../../heberg", "https://www.noosfere.org/heberg")
        for elemnt in vol_comment_soup.select("a[href*='../bd']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "../bd", "https://www.noosfere.org/bd")

        for elemnt in vol_comment_soup.select("a[href*='auteur.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "auteur.asp", "https://www.noosfere.org/livres/auteur.asp")
        for elemnt in vol_comment_soup.select("a[href*='collection.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "collection.asp",
                    "https://www.noosfere.org/livres/collection.asp")
        for elemnt in vol_comment_soup.select("a[href*='critsign.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "critsign.asp",
                    "https://www.noosfere.org/livres/critsign.asp")
        for elemnt in vol_comment_soup.select("a[href*='EditionsLivre.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "EditionsLivre.asp",
                    "https://www.noosfere.org/livres/EditionsLivre.asp")
        for elemnt in vol_comment_soup.select("a[href*='editeur.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "editeur.asp",
                    "https://www.noosfere.org/livres/editeur.asp")
        for elemnt in vol_comment_soup.select("a[href*='editionslivre.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "editionslivre.asp",
                    "https://www.noosfere.org/livres/editionslivre.asp")
        for elemnt in vol_comment_soup.select("a[href*='niourf.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "niourf.asp", "https://www.noosfere.org/livres/niourf.asp")
        for elemnt in vol_comment_soup.select("a[href*='serie.asp']"):
            if 'http' not in elemnt.get('href'):
                elemnt["href"] = elemnt["href"].replace(
                    "serie.asp", "https://www.noosfere.org/livres/serie.asp")

        if debug:
            for elemnt in vol_comment_soup.select("a[href*='.asp']"):
                if 'http' not in elemnt.get('href'):
                    self.log.info(self.who, "url incomplet apres correction: ",
                                  elemnt)

        fg, fd = "<<==", "==>>"  #chr(0x21D0),chr(0x21D2)   #chr(0x27f8),chr(0x27f9)
        for elemnt in vol_comment_soup.select("img[src*='arrow_left']"):
            elemnt.replace_with(fg)
        for elemnt in vol_comment_soup.select("img[src*='arrow_right']"):
            elemnt.replace_with(fd)

        # Depending on the tick box, build a "fat" publisher using separators that have a very low probability of popping up (§ and €).
        # vol_coll_srl is only appended if vol_coll exists.
        # The idea is to split them back out later with search and replace in the Edit metadata in bulk window.
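        # Example with hypothetical values: publisher "Denoël", collection "Lunes d'encre" and
        # collection number 42 would yield vol_editor = "Denoël§Lunes d'encre€000042".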

        if self.extended_publisher:
            if debug:
                self.log.info(
                    self.who,
                    """flag : "Ajoute collection et son numéro d'ordre au champ èditeur" set"""
                )
            if vol_coll:
                if debug: self.log.info(self.who, 'add collection')
                vol_editor = vol_editor + ('§') + vol_coll
                if vol_coll_srl:
                    if debug: self.log.info(self.who, 'add collection number')
                    vol_editor = vol_editor + ('€') + vol_coll_srl

        if vol_serie:
            if vol_serie_seq.isnumeric(): vol_serie_seq = float(vol_serie_seq)
            else: vol_serie_seq = 1.0

        # UTF-8 characters may be serialized in different ways; only xmlcharrefreplace produces XML-compatible strings.
        # Any other non-ASCII character with a different UTF-8 byte representation makes calibre fail with the message:
        # ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters
        # Side note:
        # I do not get a really clean html structure (I once got html 3 times, with div a sibling of html...), but calibre does not seem to care (nice :-) )
        #
        # It took me ages to find out, almost by chance, that encode('ascii', 'xmlcharrefreplace') helped...
        # (well, I tried everything that could improve XML compatibility... but I was misreading the error and
        # thought it was an incompatibility with the XML structure.)
        #
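        # For example, 'é' (U+00E9) is serialized as the character reference '&#233;'.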
        vol_comment_soup = vol_comment_soup.encode('ascii',
                                                   'xmlcharrefreplace')

        self.log.info(self.who, "+++" * 25)
        self.log.info(self.who,
                      "nsfr_id, type()                : ", self.nsfr_id,
                      type(self.nsfr_id))  # must be <class 'str'>
        self.log.info(self.who,
                      "relevance, type()              : ", self.relevance,
                      type(self.relevance))  # must be <class 'float'>
        self.log.info(self.who, "vol_title, type()              : ", vol_title,
                      type(vol_title))  # must be <class 'str'>
        self.log.info(
            self.who, "vol_auteur, type()             : ", vol_auteur,
            type(vol_auteur))  # must be <class 'list'> of <class 'str'>
        self.log.info(self.who,
                      "vol_auteur_prenom, type()      : ", vol_auteur_prenom,
                      type(vol_auteur_prenom))  # must be <class 'str'>
        self.log.info(self.who,
                      "vol_auteur_nom, type()         : ", vol_auteur_nom,
                      type(vol_auteur_nom))  # must be <class 'str'>
        if vol_serie:
            self.log.info(self.who, "vol_serie, type()              : ",
                          vol_serie, type(vol_serie))  # must be <class 'str'>
            self.log.info(self.who,
                          "vol_serie_seq, type()          : ", vol_serie_seq,
                          type(vol_serie_seq))  # must be <class 'float'>
        self.log.info(self.who, "vol_editor, type()             : ",
                      vol_editor, type(vol_editor))  # must be <class 'str'>
        self.log.info(self.who, "vol_coll, type()               : ", vol_coll,
                      type(vol_coll))  # must be <class 'str'>
        self.log.info(self.who,
                      "vol_coll_srl, type()           : ", vol_coll_srl,
                      type(vol_coll_srl))  # must be <class 'str'>
        self.log.info(
            self.who, "vol_dp_lgl, type()             : ", vol_dp_lgl,
            type(vol_dp_lgl)
        )  # must be <class 'datetime.datetime'> ('renderer=isoformat')
        self.log.info(self.who, "vol_isbn, type()               : ", vol_isbn,
                      type(vol_isbn))  # must be <class 'str'>
        self.log.info(
            self.who, "vol_genre, type()              : ", vol_genre,
            type(vol_genre))  # must be <class 'list'> of <class 'str'>
        self.log.info(self.who, "vol_cover_index, type()        : ",
                      vol_cover_index, type(vol_cover_index))  # must be
        self.log.info(self.who, "type(vol_comment_soup)         : ",
                      type(vol_comment_soup)
                      )  # must be byte encoded (start with b'blablabla...
        #        self.log.info(self.who,"vol_comment_soup               :\n",vol_comment_soup)                                # Maybe a bit long sometimes
        # language must be <class 'str'>

        if vol_cover_index:
            self.plugin.cache_identifier_to_cover_url(self.nsfr_id,
                                                      vol_cover_index)

        if vol_isbn:
            self.plugin.cache_isbn_to_identifier(vol_isbn, self.nsfr_id)

        mi = Metadata(vol_title, [vol_auteur])
        mi.set_identifier('nsfr_id', self.nsfr_id)
        mi.publisher = vol_editor
        mi.isbn = vol_isbn
        mi.tags = [vol_genre]
        mi.source_relevance = self.relevance
        mi.has_cover = bool(vol_cover_index)
        if vol_dp_lgl:
            mi.pubdate = vol_dp_lgl
        if vol_serie:
            mi.series = vol_serie
            mi.series_index = vol_serie_seq
        mi.language = "fra"

        mi.comments = vol_comment_soup

        if debug: self.log.info(self.who, "mi\n", mi, "\n")
        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)