Beispiel #1
0
    def parse(self, xml_detail):
        """Build a Metadata record from a parsed detail page.

        Each field is extracted by the matching ``parse_*`` helper of this
        object. When a cover URL is found it is also registered with the
        plugin's identifier-to-cover cache.

        Returns the populated ``Metadata`` instance, or ``None`` when either
        the title or the authors could not be extracted.
        """
        book_title = self.parse_title(xml_detail)
        book_authors = self.parse_authors(xml_detail)
        description = self.parse_comments(xml_detail)
        stars = self.parse_rating(xml_detail)
        isbn_code = self.parse_isbn(xml_detail)
        publishing_house = self.parse_publisher(xml_detail)
        published = self.parse_pubdate(xml_detail)
        labels = self.parse_tags(xml_detail)
        series_name, series_position = self.parse_serie(xml_detail)
        cover_link = self.parse_cover(xml_detail)

        # Title and authors are mandatory; without them there is no record.
        if book_title is None or book_authors is None:
            return None

        record = Metadata(book_title, book_authors)
        record.languages = {'ces'}
        record.comments = as_unicode(description)
        record.identifiers = {self.plugin.name: str(self.number)}
        record.rating = stars
        record.tags = labels
        record.publisher = publishing_house
        record.pubdate = published
        record.isbn = isbn_code
        record.series = series_name
        record.series_index = series_position
        record.cover_url = cover_link

        if cover_link:
            self.plugin.cache_identifier_to_cover_url(str(self.number), cover_link)

        return record
Beispiel #2
0
    def to_metadata(self, log, entry):  # {{{
        """Convert one OZON result element into a Metadata object.

        :param log: logger, handed on to ``toPubdate``.
        :param entry: element whose direct children ``Name``, ``Author``,
            ``ID``, ``Annotation``, ``Picture``, ``Year`` and
            ``ClientRatingValue`` are read via XPath (namespaces ignored).
        :return: ``Metadata`` with title, authors, the ``ozon`` identifier
            and comments always assigned; cover URL, publication date and
            rating only when the entry carries a usable value.
        """
        xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'

        def field(name):
            # normalize-space() makes XPath return one whitespace-trimmed string.
            return entry.xpath(xp_template.format(name))

        title = field('Name')
        author = field('Author')
        # Several authors arrive as a single comma-separated string.
        norm_authors = [_normalizeAuthorNameWithInitials(name.strip())
                        for name in unicode(author).split(u',')]
        mi = Metadata(title, norm_authors)

        ozon_id = field('ID')
        mi.identifiers = {'ozon': ozon_id}

        mi.comments = field('Annotation')

        mi.ozon_cover_url = None
        cover = field('Picture')
        if cover:
            mi.ozon_cover_url = _translateToBigCoverUrl(cover)

        pub_year = field('Year')
        if pub_year:
            mi.pubdate = toPubdate(log, pub_year)

        rating = field('ClientRatingValue')
        if rating:
            try:
                # calibre documents 'rating' as a float between 0 and 10,
                # while OZON rates N of 5; the value is passed through
                # unscaled, as the original author suspected a bug in identify.
                mi.rating = float(rating)
            except (TypeError, ValueError):
                # Best effort: a non-numeric rating is simply not recorded.
                pass
        return mi
Beispiel #3
0
    def to_metadata_for_single_entry(self, log, ozon_id, title, authors):  # {{{
        """Wrap an already-known title/authors pair in a Metadata object.

        The only extra information attached is the ``ozon`` identifier;
        ``log`` is accepted but not used here.
        """
        metadata = Metadata(title, authors)
        metadata.identifiers = {'ozon': ozon_id}
        return metadata
Beispiel #4
0
    def to_metadata_for_single_entry(self, log, ozon_id, title, authors):  # {{{
        """Create a Metadata object for a single, already-identified book.

        ``title`` and ``authors`` are passed straight to ``Metadata`` and the
        given ``ozon_id`` is stored under the ``ozon`` identifier key. The
        ``log`` argument is not used by this method.
        """
        single = Metadata(title, authors)
        single.identifiers = {'ozon': ozon_id}
        return single
Beispiel #5
0
    def to_metadata(self, log, entry):  # {{{
        """Convert one OZON result element into a Metadata object.

        :param log: logger, handed on to ``toPubdate``.
        :param entry: element whose direct children ``Name``, ``Author``,
            ``ID``, ``Annotation``, ``Picture``, ``Year`` and
            ``ClientRatingValue`` are read via XPath (namespaces ignored).
        :return: ``Metadata`` with title, authors, the ``ozon`` identifier
            and comments always assigned; cover URL, publication date and
            rating only when the entry carries a usable value.
        """
        xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'

        def child_text(tag):
            # normalize-space() makes XPath return one whitespace-trimmed string.
            return entry.xpath(xp_template.format(tag))

        title = child_text('Name')
        author = child_text('Author')
        # Several authors arrive as a single comma-separated string.
        norm_authors = [_normalizeAuthorNameWithInitials(name.strip())
                        for name in unicode(author).split(u',')]
        mi = Metadata(title, norm_authors)

        ozon_id = child_text('ID')
        mi.identifiers = {'ozon': ozon_id}

        mi.comments = child_text('Annotation')

        mi.ozon_cover_url = None
        cover = child_text('Picture')
        if cover:
            mi.ozon_cover_url = _translateToBigCoverUrl(cover)

        pub_year = child_text('Year')
        if pub_year:
            mi.pubdate = toPubdate(log, pub_year)

        rating = child_text('ClientRatingValue')
        if rating:
            try:
                # calibre documents 'rating' as a float between 0 and 10,
                # while OZON rates N of 5; the value is passed through
                # unscaled, as the original author suspected a bug in identify.
                mi.rating = float(rating)
            except (TypeError, ValueError):
                # Best effort: a non-numeric rating is simply not recorded.
                pass
        return mi
Beispiel #6
0
    def to_metadata(self, log, entry):  # {{{
        """Convert one OZON search-result element into a Metadata object.

        :param log: logger; not used by this method itself.
        :param entry: element (supporting ``.xpath()``) for a single hit.
        :return: ``Metadata`` with title and authors; the ``ozon`` identifier
            and ``ozon_cover_url`` are filled in when found in the markup,
            and ``rating`` is whatever ``self.get_rating(entry)`` returns.
        """
        title = unicode(entry.xpath(u'normalize-space(.//span[@itemprop="name"][1]/text())'))
        author = unicode(entry.xpath(u'normalize-space(.//a[contains(@href, "person")][1]/text())'))

        # Several authors arrive as a single comma-separated string.
        norm_authors = [_normalizeAuthorNameWithInitials(name.strip())
                        for name in author.split(u',')]
        mi = Metadata(title, norm_authors)

        # The id is the path segment following "id/" in the first detail link.
        ozon_id = entry.xpath(u'substring-before(substring-after(normalize-space(.//a[starts-with(@href, "/context/detail/id/")][1]/@href), "id/"), "/")')
        if ozon_id:
            mi.identifiers = {'ozon': ozon_id}

        mi.ozon_cover_url = None
        cover = entry.xpath(u'normalize-space(.//img[1]/@src)')
        if cover:
            mi.ozon_cover_url = _translateToBigCoverUrl(cover)

        # No publication year is extracted from this markup, so pubdate is
        # left unset (the former "pub_year = None" branch could never run).

        mi.rating = self.get_rating(entry)
        return mi
Beispiel #7
0
    def parse(self, xml_detail, xml_more_info):
        """Build a Metadata record from the detail and "more info" pages.

        :param xml_detail: parsed detail page, source of most fields.
        :param xml_more_info: parsed supplementary page, used for the ISBN
            and, together with ``xml_detail``, for tags and publication year.
        :return: populated ``Metadata``, or ``None`` (after logging) when the
            title or the authors could not be extracted.
        """
        title = self.parse_title(xml_detail)
        authors = self.parse_authors(xml_detail)
        comments = self.parse_comments(xml_detail)
        rating = self.parse_rating(xml_detail)
        isbn = self.parse_isbn(xml_more_info)
        publisher = self.parse_publisher(xml_detail)
        tags = self.parse_tags(xml_detail, xml_more_info)
        serie, serie_index = self.parse_serie(xml_detail)
        pub_year = self.parse_pub_year(xml_detail, xml_more_info)
        cover = self.parse_cover(xml_detail)

        if title is None or authors is None:
            # Name the skipped record; the old message read
            # "skipped for because" with the identifier missing.
            self.log('Result skipped for %s because title or authors not found' % self.ident)
            return None

        mi = Metadata(as_unicode(title), authors)
        mi.languages = {'ces'}
        mi.comments = as_unicode(comments)
        mi.identifiers = {self.plugin.name: self.ident}
        mi.rating = rating
        mi.tags = tags
        mi.publisher = publisher
        mi.pubdate = pub_year
        mi.isbn = isbn
        mi.series = serie
        mi.series_index = serie_index
        mi.cover_url = cover

        if cover:
            # Let the plugin find the cover again by identifier later on.
            self.plugin.cache_identifier_to_cover_url(self.ident, cover)

        return mi
Beispiel #8
0
    def get_metadata_from_detail(self, log, entry, title, authors, identifiers):  # {{{
        """Build a Metadata object from a book detail page.

        The ``title``, ``authors`` and ``identifiers`` arguments are not
        consulted: title and author are re-read from ``entry``. The ``ozon``
        identifier is taken from the page's canonical link, the cover from
        the first image, and the rating from ``self.get_rating``. A debug
        message is logged when no rating is found.
        """
        page_title = unicode(entry.xpath(u'normalize-space(.//h1[@itemprop="name"][1]/text())'))
        author_text = unicode(entry.xpath(u'normalize-space(.//a[contains(@href, "person")][1]/text())'))

        # One comma-separated string may hold several author names.
        author_names = [
            _normalizeAuthorNameWithInitials(piece.strip())
            for piece in unicode(author_text).split(u',')
        ]
        book = Metadata(page_title, author_names)

        book_id = entry.xpath(u'substring-before(substring-after(normalize-space(//link[@rel="canonical"][contains(@href, "/context/detail/id/")][1]/@href), "id/"), "/")')
        if book_id:
            book.identifiers = {'ozon':book_id}

        book.ozon_cover_url = None
        image_src = entry.xpath(u'normalize-space(.//img[1]/@src)')
        if image_src:
            book.ozon_cover_url = _translateToBigCoverUrl(image_src)

        book.rating = self.get_rating(entry)
        if not book.rating:
            log.debug('No rating (from_detail) found. ozon_id:%s'%book_id)

        return book
Beispiel #9
0
    def get_metadata_from_detail(self, log, entry, title, authors, identifiers):  # {{{
        """Build a Metadata object from a book detail page.

        The ``title``, ``authors`` and ``identifiers`` arguments are not
        consulted: title and author are re-read from ``entry``. The ``ozon``
        identifier is taken from the first detail link below ``entry``, the
        cover from the first image, and the rating from ``self.get_rating``.
        A debug message is logged when no rating is found.
        """
        page_title = unicode(entry.xpath(u'normalize-space(.//h1[@itemprop="name"][1]/text())'))
        author_text = unicode(entry.xpath(u'normalize-space(.//a[contains(@href, "person")][1]/text())'))

        # One comma-separated string may hold several author names.
        author_names = [
            _normalizeAuthorNameWithInitials(piece.strip())
            for piece in unicode(author_text).split(u',')
        ]
        book = Metadata(page_title, author_names)

        book_id = entry.xpath(u'substring-before(substring-after(normalize-space(.//a[starts-with(@href, "/context/detail/id/")][1]/@href), "id/"), "/")')
        if book_id:
            book.identifiers = {'ozon':book_id}

        book.ozon_cover_url = None
        image_src = entry.xpath(u'normalize-space(.//img[1]/@src)')
        if image_src:
            book.ozon_cover_url = _translateToBigCoverUrl(image_src)

        book.rating = self.get_rating(entry)
        if not book.rating:
            log.debug('No rating (from_detail) found. ozon_id:%s'%book_id)

        return book
Beispiel #10
0
    def to_metadata(self, log, entry):  # {{{
        """Convert one OZON search-result element into a Metadata object.

        :param log: logger; not used by this method itself.
        :param entry: element (supporting ``.xpath()``) for a single hit.
        :return: ``Metadata`` with title and authors; the ``ozon`` identifier
            and ``ozon_cover_url`` are filled in when found in the markup,
            and ``rating`` is whatever ``self.get_rating(entry)`` returns.
        """
        title = unicode(entry.xpath(u'normalize-space(.//span[@itemprop="name"][1]/text())'))
        author = unicode(entry.xpath(u'normalize-space(.//a[contains(@href, "person")][1]/text())'))

        # Several authors arrive as a single comma-separated string.
        norm_authors = [_normalizeAuthorNameWithInitials(name.strip())
                        for name in author.split(u',')]
        mi = Metadata(title, norm_authors)

        # The id is the path segment following "id/" in the first detail link.
        ozon_id = entry.xpath(u'substring-before(substring-after(normalize-space(.//a[starts-with(@href, "/context/detail/id/")][1]/@href), "id/"), "/")')
        if ozon_id:
            mi.identifiers = {'ozon': ozon_id}

        mi.ozon_cover_url = None
        cover = entry.xpath(u'normalize-space(.//img[1]/@src)')
        if cover:
            mi.ozon_cover_url = _translateToBigCoverUrl(cover)

        # No publication year is extracted from this markup, so pubdate is
        # left unset (the former "pub_year = None" branch could never run).

        mi.rating = self.get_rating(entry)
        return mi
Beispiel #11
0
    def parse(self, xml_detail):
        """Build a Metadata record from a pipe-delimited detail response.

        The second line of ``xml_detail`` is split on ``|`` and the fields
        are read by position: 0 author, 1 title, 3 ISBN, 6 publisher,
        13 comments, 34 publication date in ``YYYY-MM-DD`` form.

        :return: populated ``Metadata``, or ``None`` when the title is empty.
        :raises IndexError: when the response has fewer lines or fields than
            the positions listed above.
        """
        data = xml_detail.split('\n')[1].split("|")
        self.log(data)

        title = data[1]
        authors = [data[0]]
        comments = data[13]
        isbn = data[3]
        publisher = data[6]

        # A blank or malformed date must not abort the whole record.
        try:
            year, month, day = [int(part) for part in data[34].split('-')[:3]]
            pub_date = datetime.datetime(year, month, day, tzinfo=utc_tz)
        except ValueError:
            pub_date = None

        # Fields coming out of split() are never None, so test for emptiness;
        # otherwise an absent ISBN produces the bogus URL ".../covers/.jpg".
        if isbn:
            cover = "%s/images/covers/%s.jpg" % (self.plugin.BASE_URL, isbn.replace("-", ""))
        else:
            cover = None

        if not title:
            return None

        mi = Metadata(title, authors)
        mi.languages = {'ces'}
        mi.comments = as_unicode(comments)
        mi.identifiers = {self.plugin.name: self.ident}
        mi.publisher = publisher
        if pub_date is not None:
            mi.pubdate = pub_date
        mi.isbn = isbn
        mi.cover_url = cover

        if cover:
            self.plugin.cache_identifier_to_cover_url(self.ident, cover)

        return mi
    def result2meta(self, result, prev_identifiers=None):
        '''
        Converts the result dict into Calibre metadata.
        Note: Source download plugins do  not have access to custom columns.

        :param result: dict describing one search result.
        :param prev_identifiers: identifiers already known for the book; they
            are combined with those from ``result`` by ``update_identifiers``.
            Defaults to a fresh empty dict on every call.
        :return: ``Metadata`` whose ``source_relevance`` is ``100 - score``
            when the result has a ``score`` and ``100`` otherwise.
        '''
        # A literal {} default would be shared (and possibly mutated) across calls.
        if prev_identifiers is None:
            prev_identifiers = {}

        title = get_title(result)
        authors = get_author_list(result)
        mi = Metadata(title=title, authors=authors)

        mi.identifiers = update_identifiers(prev_identifiers, result)

        put_publisher(mi, result)
        put_language(mi, result)
        self.put_pubdate(mi, result)
        put_tags(mi, result)
        put_journal(mi, result)
        self.put_series_index(mi, result)

        # Collect the enabled sections first so the joined text does not
        # start with a stray blank-line separator.
        comment_parts = []
        if prefs['abstract_to_comment'] and 'abstract' in result:
            comment_parts.append(result['abstract'])

        if prefs['query_to_comment']:
            extra_meta = self.mkComments(result)
            comment_parts.append(
                "\n".join("crossref:%s" % x for x in extra_meta))
        mi.comments = "\n\n".join(comment_parts)

        if 'score' in result:
            mi.source_relevance = 100 - result['score']
        else:
            mi.source_relevance = 100
        return mi
Beispiel #13
0
 def default_mi(self):
     """Return a ``Metadata`` object filled with translated sample values."""
     from calibre.ebooks.metadata.book.base import Metadata
     sample_title = _('A sample book')
     sample_authors = [_('Author One'), _('Author Two')]
     sample = Metadata(sample_title, sample_authors)
     sample.series = _('A series of samples')
     sample.series_index = 4
     sample.tags = [_('Tag One'), _('Tag Two')]
     sample.publisher = _('Some publisher')
     sample.rating = 4
     sample.identifiers = {'isbn':'123456789', 'url': 'http://calibre-ebook.com'}
     sample.languages = ['eng', 'fra']
     # Publication date and timestamp share one "now" value.
     current = now()
     sample.pubdate = current
     sample.timestamp = current
     return sample
Beispiel #14
0
 def default_mi(self):
     """Return a ``Metadata`` object filled with translated sample values."""
     from calibre.ebooks.metadata.book.base import Metadata
     sample_title = _('A sample book')
     sample_authors = [_('Author One'), _('Author Two')]
     sample = Metadata(sample_title, sample_authors)
     sample.series = _('A series of samples')
     sample.series_index = 4
     sample.tags = [_('Tag One'), _('Tag Two')]
     sample.publisher = _('Some publisher')
     sample.rating = 4
     sample.identifiers = {'isbn':'123456789', 'url': 'https://calibre-ebook.com'}
     sample.languages = ['eng', 'fra']
     # Publication date and timestamp share one "now" value.
     current = now()
     sample.pubdate = current
     sample.timestamp = current
     return sample
Beispiel #15
0
    def parse(self, xml_detail):
        """Build a Metadata record from the rows of the ``record`` table.

        Each ``<tr>`` is expected to hold a field label in its first cell and
        the field value in its second. Labels are dispatched as follows:
        ``245``/``246`` title (a ``246`` row overrides an earlier title),
        ``100``/``700`` authors, ``SYS`` system identifier, ``020`` ISBN,
        ``260`` publisher and year, ``490`` series, ``655 7`` tags.

        :return: populated ``Metadata``, or ``None`` (after logging) unless a
            title, at least one author and the system identifier were found.
        """
        sys_ident = title = isbn = publisher = pub_year = serie = serie_index = cover = None
        authors = []
        tags = []
        xpath = self.XPath('//table[@id="record"]//tr')
        for row in xpath(xml_detail):
            ch = row.getchildren()
            # Rows without a label/value pair (or with an empty label cell)
            # carry no field and would otherwise raise on indexing/strip().
            if len(ch) < 2 or ch[0].text is None:
                continue
            txt = ch[0].text.strip()
            data = self.normalize(ch[1].text)
            if txt.startswith('245') and title is None:
                title = self.parse_title(data)
            elif txt.startswith('246'):
                title = self.parse_title(data)
            elif txt.startswith(('100', '700')):
                res = self.parse_author(data)
                if res is not None:
                    authors.append(res)
            elif txt == 'SYS':
                sys_ident = data.strip()
            elif txt == '020':
                isbn = self.parse_isbn(data)
            elif txt == '260':
                publisher, pub_year = self.parse_publisher(data)
            elif txt.startswith('490') and serie is None:
                serie, serie_index = self.parse_serie(data)
            elif txt == '655 7':
                tags.append(self.parse_tags(data))

        if isbn:
            cover = self.parse_cover(isbn)

        if title is None or not authors or sys_ident is None:
            self.log('Data not found')
            return None

        mi = Metadata(title, authors)
        mi.languages = {'ces'}
        mi.identifiers = {self.plugin.name: sys_ident}
        mi.tags = tags
        mi.publisher = publisher
        mi.pubdate = pub_year
        mi.isbn = isbn
        mi.series = serie
        mi.series_index = serie_index
        mi.cover_url = cover

        if cover:
            self.plugin.cache_identifier_to_cover_url(sys_ident, cover)

        return mi
Beispiel #16
0
    def to_metadata(self, log, entry):  # {{{
        """Convert one OZON result tile into a Metadata object.

        :param log: logger; used for a debug line and passed on to
            ``toPubdate`` and ``self.get_rating``.
        :param entry: element (supporting ``.xpath()`` and ``.get()``) for a
            single tile; its ``data-href`` attribute carries the OZON id as
            the second-to-last path segment.
        :return: ``Metadata`` with title and authors; identifier, cover URL
            and publication date are set only when found in the markup.
        """
        title = unicode(
            entry.xpath(
                u'normalize-space(.//div[@itemprop="name"][1]/text())'))

        author = unicode(
            entry.xpath(
                u'normalize-space(.//div[contains(@class, "mPerson")])'))

        # Several authors arrive as a single comma-separated string.
        norm_authors = [_normalizeAuthorNameWithInitials(name.strip())
                        for name in author.split(u',')]
        mi = Metadata(title, norm_authors)

        # A missing or too-short data-href means "no id", not a crash.
        href_parts = (entry.get('data-href') or '').split('/')
        ozon_id = href_parts[-2] if len(href_parts) >= 2 else None

        if ozon_id:
            mi.identifiers = {'ozon': ozon_id}

        mi.ozon_cover_url = None
        cover = entry.xpath(u'normalize-space(.//img[1]/@src)')
        log.debug(u'cover: -----> %s' % cover)
        if cover:
            mi.ozon_cover_url = _translateToBigCoverUrl(cover)

        # The first four-digit run in the first property block is the year.
        pub_year_block = entry.xpath(
            u'.//div[@class="bOneTileProperty"]/text()')
        if pub_year_block:
            # Raw string: '\d' in a plain literal is an invalid escape.
            pub_year = re.search(r'\d{4}', pub_year_block[0])
            if pub_year:
                mi.pubdate = toPubdate(log, pub_year.group())

        mi.rating = self.get_rating(log, entry)

        return mi
    def to_metadata(self, browser, log, entry_, timeout):  # {{{
        """Convert one Douban result dict into a Metadata object.

        :param browser: unused; kept for interface compatibility.
        :param log: logger used to report unparsable dates and ratings.
        :param entry_: dict for one book. ``id`` and ``title`` are required;
            ``author``, ``summary``, ``publisher``, ``isbn10``, ``isbn13``,
            ``tags``, ``pubdate``, ``rating`` and ``image`` are optional.
        :param timeout: unused; kept for interface compatibility.
        :return: the populated ``Metadata``.
        :raises KeyError: when ``id`` or ``title`` is missing.
        """
        from calibre.utils.date import parse_date, utcnow

        douban_id = entry_['id']
        title_ = entry_['title']
        authors = [x.strip() for x in (entry_.get('author') or []) if x]
        if not authors:
            authors = [_('Unknown')]

        mi = Metadata(title_, authors)
        mi.identifiers = {'douban': douban_id}
        mi.comments = entry_.get('summary')
        mi.publisher = entry_.get('publisher')

        # ISBN: keep only the values actually present; prefer ISBN-10 as the
        # primary one, as before, and fall back to ISBN-13.
        isbn10 = entry_.get('isbn10')
        isbn13 = entry_.get('isbn13')
        mi.all_isbns = [x for x in (isbn10, isbn13) if x]
        if mi.all_isbns:
            mi.isbn = isbn10 or isbn13

        # Tags
        mi.tags = [x['name'].strip() for x in (entry_.get('tags') or [])]

        # pubdate
        pubdate = entry_.get('pubdate')
        if pubdate:
            try:
                # Mid-month default for dates that lack a day component.
                default = utcnow().replace(day=15)
                mi.pubdate = parse_date(pubdate,
                                        assume_utc=True,
                                        default=default)
            except Exception:
                log.error('Failed to parse pubdate %r' % pubdate)

        # Ratings: the average is halved before being stored.
        rating = entry_.get('rating')
        if rating:
            try:
                mi.rating = float(rating['average']) / 2.0
            except (KeyError, TypeError, ValueError):
                log.exception('Failed to parse rating')

        # Cover
        mi.has_douban_cover = entry_.get('image')
        return mi
Beispiel #18
0
    def parse(self, xml_detail):
        """Build a Metadata record from a parsed detail page.

        Title, authors, comments, rating, tags and series information are
        extracted by the matching ``parse_*`` helpers of this object.

        Returns the populated ``Metadata`` instance, or ``None`` when either
        the title or the authors could not be extracted.
        """
        book_title = self.parse_title(xml_detail)
        book_authors = self.parse_authors(xml_detail)
        description = self.parse_comments(xml_detail)
        stars = self.parse_rating(xml_detail)
        labels = self.parse_tags(xml_detail)
        series_name, series_position = self.parse_serie(xml_detail)

        # Title and authors are mandatory; without them there is no record.
        if book_title is None or book_authors is None:
            return None

        record = Metadata(book_title, book_authors)
        record.languages = {'ces'}
        record.comments = as_unicode(description)
        record.identifiers = {self.plugin.name: self.ident}
        record.rating = stars
        record.tags = labels
        record.series = series_name
        record.series_index = series_position
        return record
Beispiel #19
0
    def to_metadata(self, log, entry):  # {{{
        """Convert one OZON result tile into a Metadata object.

        :param log: logger; used for a debug line and passed on to
            ``toPubdate`` and ``self.get_rating``.
        :param entry: element (supporting ``.xpath()`` and ``.get()``) for a
            single tile; its ``data-href`` attribute carries the OZON id as
            the second-to-last path segment.
        :return: ``Metadata`` with title and authors; identifier, cover URL
            and publication date are set only when found in the markup.
        """
        title = unicode(entry.xpath(u'normalize-space(.//div[@itemprop="name"][1]/text())'))
        author = unicode(entry.xpath(u'normalize-space(.//div[contains(@class, "mPerson")])'))

        # Several authors arrive as a single comma-separated string.
        norm_authors = [_normalizeAuthorNameWithInitials(name.strip())
                        for name in author.split(u',')]
        mi = Metadata(title, norm_authors)

        # A missing or too-short data-href means "no id", not a crash.
        href_parts = (entry.get('data-href') or '').split('/')
        ozon_id = href_parts[-2] if len(href_parts) >= 2 else None

        if ozon_id:
            mi.identifiers = {'ozon': ozon_id}

        mi.ozon_cover_url = None
        cover = entry.xpath(u'normalize-space(.//img[1]/@src)')
        log.debug(u'cover: -----> %s' % cover)
        if cover:
            mi.ozon_cover_url = _translateToBigCoverUrl(cover)

        # The first four-digit run in the first property block is the year.
        pub_year_block = entry.xpath(u'.//div[@class="bOneTileProperty"]/text()')
        if pub_year_block:
            # Raw string: '\d' in a plain literal is an invalid escape.
            pub_year = re.search(r'\d{4}', pub_year_block[0])
            if pub_year:
                mi.pubdate = toPubdate(log, pub_year.group())

        mi.rating = self.get_rating(log, entry)

        return mi
Beispiel #20
0
 def start(self, title, authors, identifiers):
     """Start the covers widget for the given book and run ``exec_()``.

     A ``Metadata`` object carrying ``identifiers`` is handed to
     ``self.covers_widget.start`` together with the current cover; the
     value returned by ``self.exec_()`` is passed back to the caller.
     """
     metadata = Metadata(title, authors)
     metadata.identifiers = identifiers
     empty = {}
     self.covers_widget.start(
         metadata, self.current_cover, title, authors, empty)
     outcome = self.exec_()
     return outcome
Beispiel #21
0
    def to_metadata(self, browser, log, entry_, timeout):  # {{{
        """Convert one Douban result dict into a Metadata object.

        :param browser: unused; kept for interface compatibility.
        :param log: logger used to report unparsable dates and ratings.
        :param entry_: dict for one book; every key is optional except that
            an entry without ``id`` or ``title`` is discarded.
        :param timeout: unused; kept for interface compatibility.
        :return: the populated ``Metadata``, or ``None`` for a discarded entry.
        """
        from calibre.utils.date import parse_date, utcnow

        douban_id = entry_.get('id')
        title = entry_.get('title')
        description = entry_.get('summary')
        # subtitle = entry_.get('subtitle')  # TODO: std metadata doesn't have this field
        publisher = entry_.get('publisher')
        isbn = entry_.get('isbn13')  # ISBN-10 is obsolete, use ISBN-13
        pubdate = entry_.get('pubdate')
        authors = entry_.get('author')
        book_tags = entry_.get('tags')
        rating = entry_.get('rating')
        # "images" may be present but null, hence "or {}" instead of a default.
        cover_url = (entry_.get('images') or {}).get('large')
        series = entry_.get('series')

        if not authors:
            authors = [_('Unknown')]
        if not douban_id or not title:
            # Silently discard this entry
            return None

        mi = Metadata(title, authors)
        mi.identifiers = {'douban': douban_id}
        mi.publisher = publisher
        mi.comments = description
        # mi.subtitle = subtitle

        # ISBN: accept a single value or a collection; a missing value
        # simply yields no ISBNs instead of raising on iteration.
        if isbn is None:
            candidates = []
        elif isinstance(isbn, (type(''), bytes)):
            candidates = [isbn]
        else:
            candidates = isbn
        isbns = [x for x in candidates if check_isbn(x)]
        if isbns:
            # Prefer the longest valid ISBN.
            mi.isbn = sorted(isbns, key=len)[-1]
        mi.all_isbns = isbns

        # Tags
        mi.tags = [tag['name'] for tag in (book_tags or [])]

        # pubdate
        if pubdate:
            try:
                # Mid-month default for dates that lack a day component.
                default = utcnow().replace(day=15)
                mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
            except Exception:
                log.error('Failed to parse pubdate %r' % pubdate)

        # Ratings: the average is halved before being stored.
        if rating:
            try:
                mi.rating = float(rating['average']) / 2.0
            except Exception:
                log.exception('Failed to parse rating')
                mi.rating = 0

        # Cover
        mi.has_douban_cover = None
        # If URL contains "book-default", the book doesn't have a cover
        if cover_url and 'book-default' not in cover_url:
            mi.has_douban_cover = cover_url

        # Series
        if series:
            mi.series = series['title']

        return mi
Beispiel #22
0
def to_metadata(browser, log, entry_, timeout):  # {{{
    """Convert one Google Books Atom entry into a Metadata object.

    The search-feed entry supplies id, title and authors; the remaining
    fields come from a second request for the volume's detail feed.

    :param browser: browser object passed to ``get_details``.
    :param log: logger for failures; every parsing step is best effort.
    :param entry_: the ``atom:entry`` element from the search feed.
    :param timeout: timeout passed to ``get_details``.
    :return: ``Metadata`` (only title, authors and the ``google`` identifier
        when the detail request fails), or ``None`` when the entry has no id
        URL or no title.
    """
    from lxml import etree
    XPath = partial(etree.XPath, namespaces=NAMESPACES)

    entry          = XPath('//atom:entry')
    entry_id       = XPath('descendant::atom:id')
    creator        = XPath('descendant::dc:creator')
    identifier     = XPath('descendant::dc:identifier')
    title          = XPath('descendant::dc:title')
    date           = XPath('descendant::dc:date')
    publisher      = XPath('descendant::dc:publisher')
    subject        = XPath('descendant::dc:subject')
    description    = XPath('descendant::dc:description')
    language       = XPath('descendant::dc:language')
    rating         = XPath('descendant::gd:rating[@average]')

    def get_text(extra, x):
        # Stripped text of the first node matched by x, or None.
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except Exception:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    # Title parts with no text are skipped so the join cannot fail on None.
    title_ = ': '.join([x.text for x in title(entry_) if x.text]).strip()
    # Check the extracted values: "title" is the XPath object (always
    # truthy) and splitting a missing id URL would raise.
    if not id_url or not title_:
        # Silently discard this entry
        return None
    google_id = id_url.split('/')[-1]
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]

    mi = Metadata(title_, authors)
    mi.identifiers = {'google': google_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
            strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except Exception:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    lang = canonicalize_lang(get_text(extra, language))
    if lang:
        mi.language = lang
    mi.publisher = get_text(extra, publisher)

    # ISBN: only identifiers prefixed "ISBN:" are used.
    isbns = []
    for x in identifier(extra):
        t = str(x.text).strip()
        if t[:5].upper() == 'ISBN:':
            t = check_isbn(t[5:])
            if t:
                isbns.append(t)
    if isbns:
        # Prefer the longest valid ISBN.
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags: subjects are "/"-separated paths; flatten them without duplicates.
    try:
        btags = [x.text for x in subject(extra) if x.text]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except Exception:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        from calibre.utils.date import parse_date, utcnow
        try:
            # Mid-month default for dates that lack a day component.
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except Exception:
            log.error('Failed to parse pubdate %r'%pubdate)

    # Ratings: values above 5 are halved.
    for x in rating(extra):
        try:
            mi.rating = float(x.get('average'))
            if mi.rating > 5:
                mi.rating /= 2
        except Exception:
            log.exception('Failed to parse rating')

    # Cover: the first thumbnail link, if any.
    mi.has_google_cover = None
    for x in extra.xpath(
            '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'):
        mi.has_google_cover = x.get('href')
        break

    return mi
Beispiel #23
0
 def start(self, title, authors, identifiers):
     """Start the covers widget for the given book and run ``exec_()``.

     A ``Metadata`` object carrying ``identifiers`` is handed to
     ``self.covers_widget.start`` together with the current cover; the
     value returned by ``self.exec_()`` is passed back to the caller.
     """
     metadata = Metadata(title, authors)
     metadata.identifiers = identifiers
     empty = {}
     self.covers_widget.start(
         metadata, self.current_cover, title, authors, empty)
     outcome = self.exec_()
     return outcome
Beispiel #24
0
    def parse(self, raw, desc_raw):
        """Build a Metadata object from a product page and its description.

        :param raw: product page bytes, decoded here as gb18030.
        :param desc_raw: description response bytes of the form
            ``showdesc({...})``; the wrapper is sliced off and the JSON
            payload's ``content`` HTML is searched for the comments text.
        :return: the populated ``Metadata``. A description that cannot be
            parsed is logged and leaves ``comments`` unset.
        :raises IndexError: when the page has no title node.
        """
        from calibre.ebooks.metadata.book.base import Metadata
        from calibre.utils.date import parse_date, utcnow
        import json

        root = parse_html(raw.decode('gb18030'))
        title = root.xpath('//*[@id="name"]/div[1]/text()')
        title = title[0].strip()
        authors = [a.text.strip()
                   for a in root.xpath('//*[@id="p-author"]/a') if a.text]
        mi = Metadata(title, authors)

        # Parameter list items look like "<label>:<value>".
        info = dict()
        for item in root.xpath('//*[@id="parameter2"]/li'):
            # tostring() returns UTF-8 bytes; decode before splitting with a
            # unicode separator and before comparing against unicode keys.
            text = etree.tostring(item, method='text',
                                  encoding='utf-8').decode('utf-8')
            label, sep, value = text.partition(u':')
            if sep:
                info[label.strip()] = value.strip()

        # Identifiers: work on a copy so the plugin's own dict is not changed.
        identifiers = dict(self.plugin.identifiers)
        identifiers['jd'] = self.sku
        isbn = info.get('ISBN')
        self.log.error(isbn)
        if isbn:
            mi.isbn = isbn
            self.plugin.cache_isbn_to_identifier(isbn, self.sku)
            identifiers['isbn'] = isbn
        mi.identifiers = identifiers

        # Publisher
        mi.publisher = info.get(u'出版社')

        # Pubdate
        pubdate = info.get(u'出版时间')
        if pubdate:
            try:
                # Mid-month default for dates that lack a day component.
                default = utcnow().replace(day=15)
                mi.pubdate = parse_date(pubdate,
                                        assume_utc=True,
                                        default=default)
            except Exception:
                self.log.error('Failed to parse pubdate %r' % pubdate)

        # Series
        mi.series = info.get(u'丛书名')

        img = root.xpath('//*[@id="spec-n1"]/img')
        cover = img[0].get('src') if img else None
        if cover:
            # Protocol-relative URLs ("//host/...") need a scheme.
            if not cover.startswith('http'):
                cover = 'https:' + cover
            self.plugin.cache_identifier_to_cover_url(self.sku, cover)
        self.log.error(cover)

        mi.has_cover = self.plugin.cached_identifier_to_cover_url(
            self.sku) is not None

        # Comments
        # showdesc({"date":1583588455348,"content":" ... "})
        try:
            desc = json.loads(desc_raw[9:-1].decode('gb18030'))
            desc_root = parse_html(desc['content'])
            div = desc_root.xpath(
                '//*[@id="detail-tag-id-3"]/div[2]/div/text()')
            if div:
                mi.comments = div[0]
        except Exception:
            # Best effort: the record is still useful without comments.
            # (The former "finally: return" hid every error, even
            # KeyboardInterrupt.)
            self.log.error('Failed to parse description')
        return mi
    def retrieve_bokelai_detail(self, bokelai_id, log, result_queue, timeout):
        """Fetch one detail page and put the resulting Metadata on a queue.

        Title, authors, publisher, ISBN, publication date and cover image
        come from the page's ``application/ld+json`` script; comments and
        tags are scraped from the HTML.

        :param bokelai_id: product id, substituted into
            ``self.BOKELAI_DETAIL_URL``.
        :param log: logger.
        :param result_queue: queue receiving the ``Metadata`` object.
        :param timeout: timeout for the page request.
        :return: ``None``; nothing is queued when the page cannot be loaded.
        """
        detail_url = self.BOKELAI_DETAIL_URL % bokelai_id
        log.info(detail_url)

        try:
            br = self.browser
            _raw = br.open_novisit(detail_url, timeout=timeout)
            raw = _raw.read()
        except Exception:
            log.exception('Failed to load detail page: %s' % detail_url)
            return

        root = etree.HTML(raw)
        info_json_text = root.xpath(
            "//script[@type='application/ld+json']")[0].text
        log.info(info_json_text)
        info_json = json.loads(info_json_text)

        title = info_json['name']
        # Drop empty names so the "Unknown" fallback below can take effect.
        authors = [name.strip()
                   for name in info_json['author'][0]['name'].split(",")
                   if name.strip()]
        publisher = info_json['publisher'][0]['name']
        isbn = info_json['workExample']['workExample']['isbn']
        pubdate = info_json['datePublished']

        comments_ele = root.xpath("(//div[@class='content'])[1]//text()")
        comments = "\n".join(comments_ele)

        tags = list()
        for ele in root.xpath("//li[contains(text(),'本書分類:')]/a"):
            log.info(ele.text)
            if not ele.text:
                continue
            # split() also covers the no-"/" case (one-element list), so each
            # category is added exactly once.
            tags.extend(ele.text.split("/"))

        # Image URL up to the query string, provided it contains the id.
        cover_match = re.search(
            r'https[^\?\=\&]*' + re.escape(bokelai_id) + r'[^\?\=\&]*',
            info_json['image'])
        cover_url = cover_match.group(0) if cover_match else None

        if not authors:
            authors = [_('Unknown')]

        log.info(title, authors, publisher, isbn, pubdate, comments, tags,
                 cover_url)

        mi = Metadata(title, authors)
        mi.identifiers = {'bokelai': bokelai_id, 'isbn': isbn}
        mi.publisher = publisher
        mi.comments = comments
        mi.isbn = isbn
        mi.tags = tags
        if pubdate:
            try:
                from calibre.utils.date import parse_date, utcnow
                # Mid-month default for dates that lack a day component.
                default = utcnow().replace(day=15)
                mi.pubdate = parse_date(pubdate,
                                        assume_utc=True,
                                        default=default)
            except Exception:
                log.error('Failed to parse pubdate %r' % pubdate)

        if cover_url is not None:
            mi.has_bokelai_cover = cover_url
            self.cache_identifier_to_cover_url(mi.identifiers['bokelai'],
                                               mi.has_bokelai_cover)
        else:
            mi.has_bokelai_cover = None

        result_queue.put(mi)
Beispiel #26
0
def to_metadata(log, gmetadata, ExHentai_Status):  # {{{
    """Build a ``Metadata`` object from one gallery metadata record.

    The raw title (``title_jpn`` when it is non-empty, otherwise ``title``)
    is parsed with regular expressions to recover the book title, the
    author and, when the raw title starts with a parenthesised group, the
    publisher.

    Args:
        log: Logger object; it is called directly (``log(msg)``) and
            through ``log.error``.
        gmetadata: Mapping that must provide the keys ``title``,
            ``title_jpn``, ``tags``, ``rating``, ``category``, ``gid``,
            ``token`` and ``thumb``.
        ExHentai_Status: Value accepted by ``int()``; it becomes the last
            component of the ``ehentai`` identifier.

    Returns:
        The populated ``Metadata`` instance.

    Raises:
        KeyError: If ``gmetadata`` lacks one of the keys listed above.
    """
    # Read every key up front so a malformed record fails before any work.
    title = gmetadata['title']
    title_jpn = gmetadata['title_jpn']
    tags = gmetadata['tags']
    rating = gmetadata['rating']
    category = gmetadata['category']
    gid = gmetadata['gid']
    token = gmetadata['token']
    thumb = gmetadata['thumb']

    # title
    raw_title = title_jpn if title_jpn else title
    # "author" is a bracketed group that does not contain 汉化/漢化;
    # "title" is the first run of text after it that is free of brackets
    # and parentheses.
    pat1 = re.compile(
        r'(?P<comments>.*?\[(?P<author>(?:(?!汉化|漢化)[^\[\]])*)\](?:\s*(?:\[[^\(\)]+\]|\([^\[\]\(\)]+\))\s*)*(?P<title>[^\[\]\(\)]+).*)'
    )
    title_match = pat1.search(raw_title)
    if title_match is not None:
        title_ = title_match.group('title').strip()
        author = title_match.group('author').strip()
    else:
        title_ = raw_title.strip()
        author = 'Unknown'
        # No exception is active here, so report with error() rather than
        # exception(), which would append a meaningless traceback.
        log.error('Title match failed. Title is %s' % raw_title)

    authors = [author]

    mi = Metadata(title_, authors)
    mi.identifiers = {
        'ehentai': '%s_%s_%d' % (str(gid), str(token), int(ExHentai_Status))
    }

    # publisher: text of a parenthesised group at the very start of the title
    pat2 = re.compile(r'^\(([^\[\]\(\)]*)\)')
    publisher_match = pat2.search(raw_title)
    if publisher_match is not None:
        mi.publisher = publisher_match.group(1).strip()
    else:
        mi.publisher = 'Unknown'
        log.error('Not Found publisher.')

    # Tags
    tags_ = []
    for tag in tags:
        if tag.startswith('language'):
            tag_ = tag.replace('language:', '')
            if tag_ != 'translated':
                mi.language = tag_
            else:
                # "language:translated" is kept as the plain tag "translated".
                tags_.append(tag_)
        elif ':' not in tag:
            # Tags without a "namespace:" prefix are dropped.
            log('drop tag %s' % tag)
        else:
            tags_.append(tag)
    tags_.append(category)
    mi.tags = tags_

    # rating
    mi.rating = float(rating)

    # cover
    mi.has_ehentai_cover = thumb if thumb else None
    return mi
Beispiel #27
0
def to_metadata(browser, log, entry_, timeout):  # {{{
    """Convert one Atom search-result entry into a ``Metadata`` object.

    The id, title and authors are read from ``entry_``. The remaining
    fields (comments, publisher, ISBN, tags, publication date, rating and
    cover URL) are read from the detail feed fetched with
    ``get_details(browser, id_url, timeout)``.

    Args:
        browser: Passed through to ``get_details``.
        log: Logger providing ``exception`` and ``error``.
        entry_: Element that the ``atom:``/``db:`` XPath expressions are
            evaluated against.
        timeout: Passed through to ``get_details``.

    Returns:
        ``None`` when the entry has no id or no title; otherwise a
        ``Metadata`` object. If the detail feed cannot be fetched or
        parsed, the object carries only title, authors and the ``douban``
        identifier.
    """
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.date import parse_date, utcnow
    from calibre.utils.cleantext import clean_ascii_chars

    XPath = partial(etree.XPath, namespaces=NAMESPACES)
    entry = XPath('//atom:entry')
    entry_id = XPath('descendant::atom:id')
    title = XPath('descendant::atom:title')
    description = XPath('descendant::atom:summary')
    publisher = XPath("descendant::db:attribute[@name='publisher']")
    isbn = XPath("descendant::db:attribute[@name='isbn13']")
    date = XPath("descendant::db:attribute[@name='pubdate']")
    creator = XPath("descendant::db:attribute[@name='author']")
    booktag = XPath("descendant::db:tag/attribute::name")
    rating = XPath("descendant::gd:rating/attribute::average")
    cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")

    def get_text(extra, x):
        """Return the stripped text of the first node matched by ``x``, or None."""
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except Exception:
            log.exception('Programming error:')
        return None

    id_nodes = entry_id(entry_)
    id_url = id_nodes[0].text if id_nodes else None
    # Skip title nodes without text so the join cannot fail on None.
    title_ = ': '.join([x.text for x in title(entry_) if x.text]).strip()
    # Test the extracted title string: ``title`` itself is the compiled
    # XPath object and is always truthy. The guard also has to run before
    # ``id_url`` is dereferenced below.
    if not id_url or not title_:
        # Silently discard this entry
        return None
    id_url = id_url.replace('http://', 'https://')
    douban_id = id_url.split('/')[-1]

    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]

    mi = Metadata(title_, authors)
    mi.identifiers = {'douban': douban_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
            strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except Exception:
        log.exception('Failed to get additional details for', mi.title)
        return mi
    mi.comments = get_text(extra, description)
    mi.publisher = get_text(extra, publisher)

    # ISBN: keep every value accepted by check_isbn; the longest one
    # becomes the primary ISBN.
    isbns = [t.text for t in isbn(extra) if check_isbn(t.text)]
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags: one attribute may hold several '/'-separated tags; keep first
    # occurrences only, in order.
    try:
        tags = []
        for t in booktag(extra):
            if not t:
                continue
            for tag in (y.strip() for y in t.split('/')):
                if tag not in tags:
                    tags.append(tag)
    except Exception:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except Exception:
            log.error('Failed to parse pubdate %r'%pubdate)

    # Ratings: the feed's average is halved before it is stored.
    averages = rating(extra)
    if averages:
        try:
            mi.rating = float(averages[0]) / 2.0
        except Exception:
            log.exception('Failed to parse rating')
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url(extra)
    if u:
        u = u[0].replace('/spic/', '/lpic/')
        # If URL contains "book-default", the book doesn't have a cover
        if 'book-default' not in u:
            mi.has_douban_cover = u
    return mi
Beispiel #28
0
    def to_metadata(self, log, entry_, timeout):  # {{{
        """Convert one result mapping into a ``Metadata`` object.

        Args:
            log: Logger providing ``info``, ``error`` and ``exception``.
            entry_: Mapping read with ``.get()`` for the keys ``id``,
                ``title``, ``summary``, ``publisher``, ``isbn13``,
                ``pubdate``, ``author``, ``tags``, ``rating``, ``cover``
                and ``series``.
            timeout: Unused here; kept so the signature stays unchanged.

        Returns:
            ``None`` when the entry has no id or no title, otherwise the
            populated ``Metadata`` object.
        """
        from calibre.utils.date import parse_date, utcnow

        log.info("to_metadata")
        douban_id = entry_.get("id")
        title = entry_.get("title")
        description = entry_.get("summary")
        # subtitle = entry_.get('subtitle')  # TODO: standard metadata has no such field
        publisher = entry_.get("publisher")
        isbn = entry_.get("isbn13")  # only the ISBN-13 field is consulted
        pubdate = entry_.get("pubdate")
        authors = entry_.get("author")
        book_tags = entry_.get("tags")
        rating = entry_.get("rating")
        cover_url = entry_.get("cover")
        series = entry_.get("series")

        if not authors:
            authors = ["Unknown"]
        if not douban_id or not title:
            # Silently discard this entry
            return None

        mi = Metadata(title, authors)
        mi.identifiers = {"douban": douban_id}
        mi.publisher = publisher
        mi.comments = description
        # mi.subtitle = subtitle

        # ISBN: the field may be absent, a single string, or an iterable
        # of strings. An absent/empty value must not be iterated (iterating
        # None raises TypeError).
        if not isbn:
            candidates = []
        elif isinstance(isbn, (type(""), bytes)):
            candidates = [isbn]
        else:
            candidates = isbn
        isbns = [x for x in candidates if check_isbn(x)]
        if isbns:
            # The longest accepted value becomes the primary ISBN.
            mi.isbn = sorted(isbns, key=len)[-1]
        mi.all_isbns = isbns

        # Tags
        mi.tags = book_tags

        # pubdate
        if pubdate:
            try:
                default = utcnow().replace(day=15)
                mi.pubdate = parse_date(pubdate,
                                        assume_utc=True,
                                        default=default)
            except Exception:
                log.error("Failed to parse pubdate %r" % pubdate)

        # Rating: the entry's value is halved before it is stored.
        if rating:
            try:
                mi.rating = rating / 2.0
            except Exception:
                log.exception("Failed to parse rating")
                mi.rating = 0

        # Cover
        mi.has_douban_cover = None
        # If URL contains "book-default", the book doesn't have a cover
        if cover_url and cover_url.find("book-default") == -1:
            mi.has_douban_cover = cover_url

        # Series
        if series:
            mi.series = series

        return mi
Beispiel #29
0
def to_metadata(browser, log, entry_, timeout):  # {{{
    """Turn one Atom feed entry into a ``Metadata`` object.

    Id, title and authors come from ``entry_``; comments, publisher, ISBN,
    tags, publication date, rating and cover URL come from the detail feed
    returned by ``get_details(browser, id_url, timeout)``.

    Args:
        browser: Forwarded to ``get_details``.
        log: Logger providing ``exception`` and ``error``.
        entry_: Element the ``atom:``/``db:`` XPath expressions are run on.
        timeout: Forwarded to ``get_details``.

    Returns:
        ``None`` if the entry lacks an id or a title. Otherwise a
        ``Metadata`` object, which holds only title, authors and the
        ``douban`` identifier when the detail feed cannot be fetched or
        parsed.
    """
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.date import parse_date, utcnow
    from calibre.utils.cleantext import clean_ascii_chars

    XPath = partial(etree.XPath, namespaces=NAMESPACES)
    entry = XPath('//atom:entry')
    entry_id = XPath('descendant::atom:id')
    title = XPath('descendant::atom:title')
    description = XPath('descendant::atom:summary')
    publisher = XPath("descendant::db:attribute[@name='publisher']")
    isbn = XPath("descendant::db:attribute[@name='isbn13']")
    date = XPath("descendant::db:attribute[@name='pubdate']")
    creator = XPath("descendant::db:attribute[@name='author']")
    booktag = XPath("descendant::db:tag/attribute::name")
    rating = XPath("descendant::gd:rating/attribute::average")
    cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")

    def get_text(extra, x):
        """Return stripped text of the first node ``x`` matches, else None."""
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except Exception:
            log.exception('Programming error:')
        return None

    matched_ids = entry_id(entry_)
    id_url = matched_ids[0].text if matched_ids else None
    # Title nodes without text are skipped; joining None would raise.
    title_parts = [x.text for x in title(entry_) if x.text]
    title_ = ': '.join(title_parts).strip()
    # The guard must look at the extracted string ``title_``; the name
    # ``title`` is the compiled XPath object and can never be falsy. It
    # also has to come before ``id_url`` is split.
    if not id_url or not title_:
        # Silently discard this entry
        return None
    douban_id = id_url.split('/')[-1]

    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]

    mi = Metadata(title_, authors)
    mi.identifiers = {'douban': douban_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw),
                           strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except Exception:
        log.exception('Failed to get additional details for', mi.title)
        return mi
    mi.comments = get_text(extra, description)
    mi.publisher = get_text(extra, publisher)

    # ISBN: collect the values check_isbn accepts; the longest is primary.
    isbns = [node.text for node in isbn(extra) if check_isbn(node.text)]
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags: split '/'-separated names and drop repeats, preserving order.
    try:
        tags = []
        for t in booktag(extra):
            if not t:
                continue
            for tag in (y.strip() for y in t.split('/')):
                if tag not in tags:
                    tags.append(tag)
    except Exception:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except Exception:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Ratings: evaluate the XPath once; the average is stored halved.
    averages = rating(extra)
    if averages:
        try:
            mi.rating = float(averages[0]) / 2.0
        except Exception:
            log.exception('Failed to parse rating')
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url(extra)
    if u:
        u = u[0].replace('/spic/', '/lpic/')
        # If URL contains "book-default", the book doesn't have a cover
        if 'book-default' not in u:
            mi.has_douban_cover = u
    return mi