Beispiel #1
0
    def get_metadata(self, md, select):
        book = None
        if md.isbn:
            book = self.get_book_by_isbn(md.isbn)
        if not book:
            book = self.get_book_by_title(md.title, md.author_sort, select)
        if not book:
            return None
        mi = Metadata(book['title'])
        mi.authors = book['author']
        mi.author_sort = mi.authors[0] if mi.authors else None
        if mi.author_sort:
            for r in REMOVES:
                mi.author_sort = r.sub("", mi.author_sort)
            mi.authors[0] = mi.author_sort
        mi.publisher = book['publisher']
        mi.comments = book['summary']
        mi.isbn = book.get('isbn13', None)
        mi.tags = [t['name'] for t in book['tags']][:8]
        mi.rating = int(float(book['rating']['average']))
        mi.pubdate = self.str2date(book['pubdate'])
        mi.timestamp = datetime.datetime.now()
        mi.douban_id = book['id']
        mi.douban_author_intro = book['author_intro']
        mi.douban_subtitle = book.get('subtitle', None)

        img_url = book['images']['large']
        img_fmt = img_url.split(".")[-1]
        img = StringIO(urlopen(img_url).read())
        mi.cover_data = (img_fmt, img)
        #logging.error("=================\ndouban metadata:\n%s" % mi)
        return mi
Beispiel #2
0
    def _metadata(self, book):
        authors = []
        if book['author']:
            for author in book['author']:
                for r in REMOVES:
                    author = r.sub("", author)
                authors.append(author)
        if not authors: authors = [u'佚名']

        from calibre.ebooks.metadata.book.base import Metadata
        from cStringIO import StringIO
        mi = Metadata(book['title'])
        mi.authors = authors
        mi.author_sort = mi.authors[0]
        mi.publisher = book['publisher']
        mi.comments = book['summary']
        mi.isbn = book.get('isbn13', None)
        mi.tags = [t['name'] for t in book['tags']][:8]
        mi.rating = int(float(book['rating']['average']))
        mi.pubdate = self.str2date(book['pubdate'])
        mi.timestamp = datetime.datetime.now()
        mi.douban_id = book['id']
        mi.douban_author_intro = book['author_intro']
        mi.douban_subtitle = book.get('subtitle', None)
        mi.website = "https://book.douban.com/isbn/%s" % mi.isbn
        mi.source = u'豆瓣'

        mi.cover_url = book['images']['large']
        if self.copy_image:
            img = StringIO(urlopen(mi.cover_url).read())
            img_fmt = mi.cover_url.split(".")[-1]
            mi.cover_data = (img_fmt, img)

        logging.debug("=================\ndouban metadata:\n%s" % mi)
        return mi
Beispiel #3
0
    def to_metadata(self, log, entry):  # {{{
        xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'

        title = entry.xpath(xp_template.format('Name'))
        author = entry.xpath(xp_template.format('Author'))
        norm_authors = map(_normalizeAuthorNameWithInitials, map(unicode.strip, unicode(author).split(u',')))
        mi = Metadata(title, norm_authors)

        ozon_id = entry.xpath(xp_template.format('ID'))
        mi.identifiers = {'ozon':ozon_id}

        mi.comments = entry.xpath(xp_template.format('Annotation'))

        mi.ozon_cover_url = None
        cover = entry.xpath(xp_template.format('Picture'))
        if cover:
            mi.ozon_cover_url = _translateToBigCoverUrl(cover)

        pub_year = entry.xpath(xp_template.format('Year'))
        if pub_year:
            mi.pubdate = toPubdate(log, pub_year)
            #log.debug('pubdate %s'%mi.pubdate)

        rating = entry.xpath(xp_template.format('ClientRatingValue'))
        if rating:
            try:
                #'rating',     A floating point number between 0 and 10
                # OZON raion N of 5, calibre of 10, but there is a bug? in identify
                mi.rating = float(rating)
            except:
                pass
            rating
        return mi
def build_meta(log, issue_id):
    """Build metadata record based on comicvine issue_id"""
    issue = pycomicvine.Issue(
        issue_id,
        field_list=[
            "id",
            "name",
            "volume",
            "issue_number",
            "person_credits",
            "description",
            "store_date",
            "cover_date",
        ],
    )
    if not issue or not issue.volume:
        log.warn("Unable to load Issue(%d)" % issue_id)
        return None
    title = "%s #%s" % (issue.volume.name, issue.issue_number)
    if issue.name:
        title = title + ": %s" % (issue.name)
    authors = [p.name for p in issue.person_credits]
    meta = Metadata(title, authors)
    meta.series = issue.volume.name
    meta.series_index = str(issue.issue_number)
    meta.set_identifier("comicvine", str(issue.id))
    meta.comments = issue.description
    meta.has_cover = False
    if issue.volume.publisher:
        meta.publisher = issue.volume.publisher.name
    meta.pubdate = issue.store_date or issue.cover_date
    return meta
Beispiel #5
0
    def parse(self, xml_detail, xml_more_info):
        title = self.parse_title(xml_detail)
        authors = self.parse_authors(xml_detail)
        comments = self.parse_comments(xml_detail)
        rating = self.parse_rating(xml_detail)
        isbn = self.parse_isbn(xml_more_info)
        publisher = self.parse_publisher(xml_detail)
        tags = self.parse_tags(xml_detail, xml_more_info)
        serie, serie_index = self.parse_serie(xml_detail)
        pub_year = self.parse_pub_year(xml_detail, xml_more_info)
        cover = self.parse_cover(xml_detail)

        if title is not None and authors is not None:
            mi = Metadata(as_unicode(title), authors)
            mi.languages = {'ces'}
            mi.comments = as_unicode(comments)
            mi.identifiers = {self.plugin.name:self.ident}
            mi.rating = rating
            mi.tags = tags
            mi.publisher = publisher
            mi.pubdate = pub_year
            mi.isbn = isbn
            mi.series = serie
            mi.series_index = serie_index
            mi.cover_url = cover

            if cover:
                self.plugin.cache_identifier_to_cover_url(self.ident, cover)

            return mi
        else:
            self.log('Result skipped for because title or authors not found')
            return None
Beispiel #6
0
    def parse(self, xml_detail):
        data = xml_detail.split('\n')[1].split("|")
        self.log(data)

        title = data[1]
        authors = [data[0]]
        comments = data[13]
        isbn = data[3]
        publisher = data[6]
        pub_date_tmp = data[34].split('-')
        pub_date = datetime.datetime(int(pub_date_tmp[0]), int(pub_date_tmp[1]), int(pub_date_tmp[2]), tzinfo=utc_tz)
        if isbn is not None:
            isbn_tmp = re.sub("-", "", isbn)
            cover = "%s/images/covers/%s.jpg"%(self.plugin.BASE_URL, isbn_tmp)
        else:
            cover = None

        if title is not None and authors is not None:
            mi = Metadata(title, authors)
            mi.languages = {'ces'}
            mi.comments = as_unicode(comments)
            mi.identifiers = {self.plugin.name:self.ident}
            mi.publisher = publisher
            mi.pubdate = pub_date
            mi.isbn = isbn
            mi.cover_url = cover

            if cover:
                self.plugin.cache_identifier_to_cover_url(self.ident, cover)

            return mi
        else:
            return None
Beispiel #7
0
    def parse(self, xml_detail):
        title = self.parse_title(xml_detail)
        authors = self.parse_authors(xml_detail)
        comments = self.parse_comments(xml_detail)
        rating = self.parse_rating(xml_detail)
        isbn = self.parse_isbn(xml_detail)
        publisher = self.parse_publisher(xml_detail)
        pub_year = self.parse_pubdate(xml_detail)
        tags = self.parse_tags(xml_detail)
        serie, serie_index = self.parse_serie(xml_detail)
        cover = self.parse_cover(xml_detail)

        if title is not None and authors is not None:
            mi = Metadata(title, authors)
            mi.languages = {'ces'}
            mi.comments = as_unicode(comments)
            mi.identifiers = {self.plugin.name:str(self.number)}
            mi.rating = rating
            mi.tags = tags
            mi.publisher = publisher
            mi.pubdate = pub_year
            mi.isbn = isbn
            mi.series = serie
            mi.series_index = serie_index
            mi.cover_url = cover

            if cover:
                self.plugin.cache_identifier_to_cover_url(str(self.number), cover)

            return mi
        else:
            return None
Beispiel #8
0
    def get_metadata(self, md):
        book = None
        if md.isbn:
            book = self.get_book_by_isbn(md.isbn)
        if not book:
            book = self.get_book_by_title(md.title)
        mi = Metadata(book['title'])
        mi.authors     = book['author']
        mi.author_sort = mi.authors[0] if mi.authors else None
        if mi.author_sort:
            for r in REMOVES:
                mi.author_sort = r.sub("", mi.author_sort)
            mi.authors[0] = mi.author_sort
        mi.publisher   = book['publisher']
        mi.comments    = book['summary']
        mi.isbn        = book.get('isbn13', None)
        mi.tags        = [ t['name'] for t in book['tags'] ][:8]
        mi.rating      = int(float(book['rating']['average']))
        mi.pubdate     = self.str2date(book['pubdate'])
        mi.timestamp   = datetime.datetime.now()
        mi.douban_id   = book['id']
        mi.douban_author_intro = book['author_intro']
        mi.douban_subtitle = book.get('subtitle', None)

        img_url = book['images']['large']
        img_fmt = img_url.split(".")[-1]
        img = StringIO(urlopen(img_url).read())
        mi.cover_data = (img_fmt, img)
        logging.error("=================\ndouban metadata:\n%s" % mi)
        return mi
Beispiel #9
0
    def to_metadata(self, log, entry):  # {{{
        xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'

        title = entry.xpath(xp_template.format('Name'))
        author = entry.xpath(xp_template.format('Author'))
        norm_authors = map(_normalizeAuthorNameWithInitials,
                           map(unicode.strip,
                               unicode(author).split(u',')))
        mi = Metadata(title, norm_authors)

        ozon_id = entry.xpath(xp_template.format('ID'))
        mi.identifiers = {'ozon': ozon_id}

        mi.comments = entry.xpath(xp_template.format('Annotation'))

        mi.ozon_cover_url = None
        cover = entry.xpath(xp_template.format('Picture'))
        if cover:
            mi.ozon_cover_url = _translateToBigCoverUrl(cover)

        pub_year = entry.xpath(xp_template.format('Year'))
        if pub_year:
            mi.pubdate = toPubdate(log, pub_year)
            #log.debug('pubdate %s'%mi.pubdate)

        rating = entry.xpath(xp_template.format('ClientRatingValue'))
        if rating:
            try:
                #'rating',     A floating point number between 0 and 10
                # OZON raion N of 5, calibre of 10, but there is a bug? in identify
                mi.rating = float(rating)
            except:
                pass
            rating
        return mi
Beispiel #10
0
    def get_baike_metadata(self, title):
        from baidubaike import Page

        try: baike = Page(title)
        except: return None

        info = baike.get_info()
        print "\n".join( "%s:\t%s" % v for v in info.items())
        mi = Metadata(info['title'])
        plat = info.get(u'首发网站', None)
        if not plat:
            plat = info.get(u'首发状态', "网络小说平台")
        plat = plat.replace(u'首发', '')
        mi.publisher = info.get(u'连载平台', plat)
        mi.authors   = [info[u'作者']]
        mi.isbn      = '0000000000001'
        mi.tags      = baike.get_tags()
        mi.pubdate   = datetime.datetime.now()
        mi.timestamp = datetime.datetime.now()
        mi.comments  = baike.get_summary()
        if u'完结' in info.get(u'连载状态', ""):
            day = re.findall('\d*-\d*-\d*', info[u'连载状态'])
            try: mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
            except: pass
        return mi
Beispiel #11
0
    def _metadata(self, baike):
        from calibre.ebooks.metadata.book.base import Metadata
        from cStringIO import StringIO

        info = baike.get_info()
        print "\n".join( "%s:\t%s" % v for v in info.items())

        mi = Metadata(info['title'])
        plat = "网络小说平台"
        plat = info.get(u'首发状态', plat)
        plat = info.get(u'首发网站', plat)
        plat = plat.replace(u'首发', '')
        mi.publisher = info.get(u'连载平台', plat)
        mi.authors   = [ info.get(u'作者', u'佚名') ]
        mi.author_sort = mi.authors[0]
        mi.isbn      = BAIKE_ISBN
        mi.tags      = baike.get_tags()
        mi.pubdate   = datetime.datetime.now()
        mi.timestamp = datetime.datetime.now()
        mi.cover_url = baike.get_image()
        mi.comments  = re.sub(r'\[\d+\]$', "", baike.get_summary() )
        mi.website   = baike.http.url
        mi.source    = u'百度百科'

        if self.copy_image:
            img = StringIO(urlopen(mi.cover_url).read())
            img_fmt = mi.cover_url.split(".")[-1]
            mi.cover_data = (img_fmt, img)

        if u'完结' in info.get(u'连载状态', ""):
            day = re.findall('\d*-\d*-\d*', info[u'连载状态'])
            try: mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
            except: pass
        return mi
    def result2meta(self, result, prev_identifiers={}):
        '''
        Converts the result dict into Calibre metadata.
        Note: Source download plugins do  not have access to custom columns.
        '''
        title = get_title(result)
        authors = get_author_list(result)
        mi = Metadata(title=title, authors=authors)

        mi.identifiers = update_identifiers(prev_identifiers, result)

        put_publisher(mi, result)
        put_language(mi, result)
        self.put_pubdate(mi, result)
        put_tags(mi, result)
        put_journal(mi, result)
        self.put_series_index(mi, result)

        comments = ""
        if prefs['abstract_to_comment'] and 'abstract' in result:
            comments = "\n\n".join([comments, result['abstract']])

        if prefs['query_to_comment']:
            extra_meta = self.mkComments(result)
            extra_plus = map(lambda x: "crossref:%s" % x, extra_meta)
            extra = "\n".join(extra_plus)
            comments = "\n\n".join([comments, extra])
        mi.comments = comments

        if 'score' in result:
            mi.source_relevance = 100 - result['score']
        else:
            mi.source_relevance = 100
        # self.log.info("set comment to %s"%mi.comments)
        return mi
Beispiel #13
0
 def test_rtf_metadata(self):
     stream = BytesIO(br'{\rtf1\ansi\ansicpg1252}')
     m = Metadata('Test ø̄title', ['Author One', 'Author БTwo'])
     m.tags = 'tag1 見tag2'.split()
     m.comments = '<p>some ⊹comments</p>'
     m.publisher = 'publiSher'
     set_metadata(stream, m)
     stream.seek(0)
     o = get_metadata(stream)
     for attr in 'title authors publisher comments tags'.split():
         self.assertEqual(getattr(m, attr), getattr(o, attr))
Beispiel #14
0
 def test_rtf_metadata(self):
     stream = BytesIO(br'{\rtf1\ansi\ansicpg1252}')
     m = Metadata('Test ø̄title', ['Author One', 'Author БTwo'])
     m.tags = 'tag1 見tag2'.split()
     m.comments = '<p>some ⊹comments</p>'
     m.publisher = 'publiSher'
     set_metadata(stream, m)
     stream.seek(0)
     o = get_metadata(stream)
     for attr in 'title authors publisher comments tags'.split():
         self.assertEqual(getattr(m, attr), getattr(o, attr))
Beispiel #15
0
def read_metadata(root, ver=None, return_extra_data=False):
    ans = Metadata(_('Unknown'), [_('Unknown')])
    prefixes, refines = read_prefixes(root), read_refines(root)
    identifiers = read_identifiers(root, prefixes, refines)
    ids = {}
    for key, vals in iteritems(identifiers):
        if key == 'calibre':
            ans.application_id = vals[0]
        elif key == 'uuid':
            ans.uuid = vals[0]
        else:
            ids[key] = vals[0]
    ans.set_identifiers(ids)
    ans.title = read_title(root, prefixes, refines) or ans.title
    ans.title_sort = read_title_sort(root, prefixes, refines) or ans.title_sort
    ans.languages = read_languages(root, prefixes, refines) or ans.languages
    auts, aus = [], []
    for a in read_authors(root, prefixes, refines):
        auts.append(a.name), aus.append(a.sort)
    ans.authors = auts or ans.authors
    ans.author_sort = authors_to_string(aus) or ans.author_sort
    bkp = read_book_producers(root, prefixes, refines)
    if bkp:
        if bkp[0]:
            ans.book_producer = bkp[0]
    pd = read_pubdate(root, prefixes, refines)
    if not is_date_undefined(pd):
        ans.pubdate = pd
    ts = read_timestamp(root, prefixes, refines)
    if not is_date_undefined(ts):
        ans.timestamp = ts
    lm = read_last_modified(root, prefixes, refines)
    if not is_date_undefined(lm):
        ans.last_modified = lm
    ans.comments = read_comments(root, prefixes, refines) or ans.comments
    ans.publisher = read_publisher(root, prefixes, refines) or ans.publisher
    ans.tags = read_tags(root, prefixes, refines) or ans.tags
    ans.rating = read_rating(root, prefixes, refines) or ans.rating
    s, si = read_series(root, prefixes, refines)
    if s:
        ans.series, ans.series_index = s, si
    ans.author_link_map = read_author_link_map(root, prefixes,
                                               refines) or ans.author_link_map
    ans.user_categories = read_user_categories(root, prefixes,
                                               refines) or ans.user_categories
    for name, fm in iteritems((read_user_metadata(root, prefixes, refines)
                               or {})):
        ans.set_user_metadata(name, fm)
    if return_extra_data:
        ans = ans, ver, read_raster_cover(root, prefixes,
                                          refines), first_spine_item(
                                              root, prefixes, refines)
    return ans
    def _get_bookdetails(self, url):
        u = self.BASE_URL + url["url"]
        print("_get_bookdetails:: traukiam knygą iš %s" % u)

        resp = urllib2.urlopen(u)
        contents = resp.read()
        #print(contents)
        tree = etree.HTML(contents)

        authors = self._get_authors(tree)
        publisher = self._get_details(tree, self.details_publisher)
        year = self._get_year(tree)
        pages = self._get_details(tree, self.details_pages)
        isbn = self._get_details(tree, self.details_isbn)
        description = self._get_description(tree)
        cover = self._get_cover_url(tree)
        tags = self._get_tags(tree)

        mi = Metadata(url["title"], authors)
        mi.set_identifier("isbn", isbn)
        mi.comments = description
        mi.language = "LT"
        mi.tags = tags
        try:
            mi.set("publisher", publisher)
        except:
            print(u"_get_bookdetails:: nepavyko užsetinti leidėjo")
        try:
            mi.set("pubdate", datetime.datetime(year, 1, 2))
        except:
            print(u"_get_bookdetails:: nepavyko užsetinti leidimo datos")
        try:
            if self.gui:
                print("YYYYRAAA GUI!!!")
            col = {}
            col["#value#"] = pages
            mi.set_user_metadata("#count", col)
        except:
            print(u"_get_bookdetails:: nepavyko užsetinti puslapių skaičiaus")

        if cover and isbn:
            print(u"_get_bookdetails:: kešuojam viršelį:", cover)
            self.cache_isbn_to_identifier(isbn, isbn)
            self.cache_identifier_to_cover_url(isbn, cover)
            mi.has_cover = True

            print(self.cached_identifier_to_cover_url(isbn))

        return mi
Beispiel #17
0
def read_metadata(root, ver=None, return_extra_data=False):
    ans = Metadata(_('Unknown'), [_('Unknown')])
    prefixes, refines = read_prefixes(root), read_refines(root)
    identifiers = read_identifiers(root, prefixes, refines)
    ids = {}
    for key, vals in identifiers.iteritems():
        if key == 'calibre':
            ans.application_id = vals[0]
        elif key == 'uuid':
            ans.uuid = vals[0]
        else:
            ids[key] = vals[0]
    ans.set_identifiers(ids)
    ans.title = read_title(root, prefixes, refines) or ans.title
    ans.title_sort = read_title_sort(root, prefixes, refines) or ans.title_sort
    ans.languages = read_languages(root, prefixes, refines) or ans.languages
    auts, aus = [], []
    for a in read_authors(root, prefixes, refines):
        auts.append(a.name), aus.append(a.sort)
    ans.authors = auts or ans.authors
    ans.author_sort = authors_to_string(aus) or ans.author_sort
    bkp = read_book_producers(root, prefixes, refines)
    if bkp:
        if bkp[0]:
            ans.book_producer = bkp[0]
    pd = read_pubdate(root, prefixes, refines)
    if not is_date_undefined(pd):
        ans.pubdate = pd
    ts = read_timestamp(root, prefixes, refines)
    if not is_date_undefined(ts):
        ans.timestamp = ts
    lm = read_last_modified(root, prefixes, refines)
    if not is_date_undefined(lm):
        ans.last_modified = lm
    ans.comments = read_comments(root, prefixes, refines) or ans.comments
    ans.publisher = read_publisher(root, prefixes, refines) or ans.publisher
    ans.tags = read_tags(root, prefixes, refines) or ans.tags
    ans.rating = read_rating(root, prefixes, refines) or ans.rating
    s, si = read_series(root, prefixes, refines)
    if s:
        ans.series, ans.series_index = s, si
    ans.author_link_map = read_author_link_map(root, prefixes, refines) or ans.author_link_map
    ans.user_categories = read_user_categories(root, prefixes, refines) or ans.user_categories
    for name, fm in (read_user_metadata(root, prefixes, refines) or {}).iteritems():
        ans.set_user_metadata(name, fm)
    if return_extra_data:
        ans = ans, ver, read_raster_cover(root, prefixes, refines), first_spine_item(root, prefixes, refines)
    return ans
Beispiel #18
0
    def sample_results(self):
        m1 = Metadata("The Great Gatsby", ["Francis Scott Fitzgerald"])
        m2 = Metadata("The Great Gatsby - An extra long title to test resizing", ["F. Scott Fitzgerald"])
        m1.has_cached_cover_url = True
        m2.has_cached_cover_url = False
        m1.comments = "Some comments " * 10
        m1.tags = ["tag%d" % i for i in range(20)]
        m1.rating = 4.4
        m1.language = "en"
        m2.language = "fr"
        m1.pubdate = utcnow()
        m2.pubdate = fromordinal(1000000)
        m1.publisher = "Publisher 1"
        m2.publisher = "Publisher 2"

        return [m1, m2]
Beispiel #19
0
    def sample_results(self):
        m1 = Metadata('The Great Gatsby', ['Francis Scott Fitzgerald'])
        m2 = Metadata('The Great Gatsby - An extra long title to test resizing', ['F. Scott Fitzgerald'])
        m1.has_cached_cover_url = True
        m2.has_cached_cover_url = False
        m1.comments  = 'Some comments '*10
        m1.tags = ['tag%d'%i for i in range(20)]
        m1.rating = 4.4
        m1.language = 'en'
        m2.language = 'fr'
        m1.pubdate = utcnow()
        m2.pubdate = fromordinal(1000000)
        m1.publisher = 'Publisher 1'
        m2.publisher = 'Publisher 2'

        return [m1, m2]
Beispiel #20
0
def build_meta(log, issue_id):
    """Build metadata record based on comicvine issue_id."""
    issue = PyComicvineWrapper(log).lookup_issue(issue_id)
    if issue:
        meta = Metadata(issue.get_full_title(), issue.get_authors())
        meta.series = issue.volume_name
        meta.series_index = issue.issue_number
        meta.set_identifier('comicvine', str(issue.id))
        meta.set_identifier('comicvine-volume', str(issue.volume_id))
        meta.comments = issue.description
        meta.has_cover = False
        meta.publisher = issue.publisher_name
        meta.pubdate = issue.date
        return meta
    else:
        return None
Beispiel #21
0
    def sample_results(self):
        m1 = Metadata('The Great Gatsby', ['Francis Scott Fitzgerald'])
        m2 = Metadata('The Great Gatsby', ['F. Scott Fitzgerald'])
        m1.has_cached_cover_url = True
        m2.has_cached_cover_url = False
        m1.comments  = 'Some comments '*10
        m1.tags = ['tag%d'%i for i in range(20)]
        m1.rating = 4.4
        m1.language = 'en'
        m2.language = 'fr'
        m1.pubdate = utcnow()
        m2.pubdate = fromordinal(1000000)
        m1.publisher = 'Publisher 1'
        m2.publisher = 'Publisher 2'

        return [m1, m2]
Beispiel #22
0
    def parse_response(cls, response, isbn_initial, log):
        metadata_items = []

        page_soup = BeautifulSoup(response.text)

        for idx, candidate in enumerate(cls.find(page_soup, 'b-result'), 1):

            title = cls.find(candidate, 'b-result__name-wrap', True)
            author = map(
                unicode.strip,
                cls.find(candidate, 'b-result__author', True).split(','))
            comments = cls.find(candidate, 'b-result__desc__full',
                                True).replace(u'Скрыть', '').strip()
            isbn = cls.find(candidate, 'b-result__isbn',
                            True).split(':')[-1].split(',')[0].strip()

            log.info(u'Found candidate %s: %s' % (idx, title))

            publisher = None
            pubdate = None

            other_info = cls.find(candidate, 'b-result__years', True).strip()
            if other_info:
                for entry in other_info.split(';'):
                    k, v = entry.split(':', 1)
                    k = k.strip()
                    if k == u'Год':
                        pubdate = parse_only_date('1.1.%s' %
                                                  v.split(',')[0].strip())
                    elif k == u'Издательство':
                        publisher = v.strip()

            metadata_item = Metadata(title, author)
            metadata_item.isbn = isbn or isbn_initial

            if comments:
                metadata_item.comments = comments

            if publisher is not None:
                metadata_item.publisher = publisher

            if pubdate is not None:
                metadata_item.pubdate = pubdate

            metadata_items.append(metadata_item)

        return metadata_items
Beispiel #23
0
 def test_input_comment_single(self):
     stream_meta = get_metadata(self.get_stream('comment_single'))
     canon_meta = Metadata('A Comment Tag &amp; Title Ⓒ',
                           ['James Madison', 'James Monroe'])
     canon_meta.publisher = 'Publisher C'
     canon_meta.languages = ['French']
     canon_meta.pubdate = parse_date('2015-01-01')
     canon_meta.timestamp = parse_date('2014-01-01')
     canon_meta.series = 'Comment Series'
     canon_meta.series_index = float(3)
     canon_meta.rating = float(0)
     canon_meta.comments = 'comment &quot;comments&quot; ♥ HTML -- too &amp;amp;'
     canon_meta.tags = ['tag d']
     canon_meta.set_identifiers({
         'isbn': '3456789012',
         'url': 'http://google.com/search?q=calibre'
     })
     self.compare_metadata(stream_meta, canon_meta)
Beispiel #24
0
    def data2mi(self, item):
        """Converts a single metadata answer in the form of a dict to a MetadataInformation object"""

        mi = Metadata(_('Unknown'))

        # Regular metadata
        mi.title = item.get('title', None)
        mi.authors = item.get('authors', [])
        mi.publisher = item.get('publisher', None)

        if 'id' in item.keys(): mi.set_identifier(self.idkey, item['id'])
        if 'doi' in item.keys(): mi.set_identifier('doi', item['doi'])
        if 'isbn' in item.keys(): mi.set_identifier('isbn', item['isbn'])

        if 'updated' in item.keys():
            mi.pubdate = parse_date(item['updated'], assume_utc=True)

        if 'series' in item.keys():
            mi.series = item['series']
            mi.series_index = self.format_series_index(
                item.get('series_index'), None)

        if 'year' in item.keys():
            mi.pubdate = parse_date(item['year'], assume_utc=True)

        if 'abstract' in item.keys():
            mi.comments = self.format_abstract(item['abstract'])

        if 'language' in item.keys(): mi.language = item['language']

        if 'journal' in item.keys():
            mi.series = item['journal']
            mi.series_index = self.format_series_index(item.get('volume'),
                                                       item.get('number'))

        if 'subject' in item.keys():
            tags = set([])
            for s in item['subject']:
                tags.update(msc_tags(s))
                tags.update(arxiv_tags(s))

            mi.tags = list(sorted(tags))

        return mi
    def to_metadata(self, browser, log, entry_, timeout):  # {{{
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.utils.date import parse_date, utcnow
        from calibre.utils.cleantext import clean_ascii_chars

        # log.info('entry_ is: ',entry_)
        id_url = entry_['url']
        douban_id = entry_['id']
        title_ = entry_['title']
        subtitle = entry_['subtitle']
        authors = [x.strip() for x in entry_['author'] if x]
        if not authors:
            authors = [_('Unknown')]

        mi = Metadata(title_, authors)
        mi.identifiers = {'douban': douban_id}
        mi.comments = entry_['summary']
        mi.publisher = entry_['publisher']

        # ISBN
        mi.isbn = entry_['isbn10']
        mi.all_isbns = [entry_['isbn10'], entry_['isbn13']]

        # Tags
        mi.tags = [x['name'].strip() for x in entry_['tags']]

        # pubdate
        pubdate = entry_['pubdate']
        if pubdate:
            try:
                default = utcnow().replace(day=15)
                mi.pubdate = parse_date(pubdate,
                                        assume_utc=True,
                                        default=default)
            except:
                log.error('Failed to parse pubdate %r' % pubdate)

        # Ratings
        mi.rating = float(entry_['rating']['average']) / 2.0

        # Cover
        mi.has_douban_cover = entry_['image']
        return mi
Beispiel #26
0
    def parse(self, raw):
        from calibre.ebooks.metadata.book.base import Metadata
        from calibre.utils.date import UNDEFINED_DATE
        root = parse_html(raw)
        mi = Metadata(self.basic_data['title'], self.basic_data['authors'])

        # Identifiers
        if self.basic_data['isbns']:
            mi.isbn = self.basic_data['isbns'][0]
        mi.set_identifier('edelweiss', self.sku)

        # Tags
        if self.basic_data['tags']:
            mi.tags = self.basic_data['tags']
            mi.tags = [
                t[1:].strip() if t.startswith('&') else t for t in mi.tags
            ]

        # Publisher
        mi.publisher = self.basic_data['publisher']

        # Pubdate
        if self.basic_data['pubdate'] and self.basic_data[
                'pubdate'].year != UNDEFINED_DATE:
            mi.pubdate = self.basic_data['pubdate']

        # Rating
        if self.basic_data['rating']:
            mi.rating = self.basic_data['rating']

        # Comments
        comments = ''
        for cid in ('summary', 'contributorbio', 'quotes_reviews'):
            cid = 'desc_{}{}-content'.format(cid, self.sku)
            div = root.xpath('//*[@id="{}"]'.format(cid))
            if div:
                comments += self.render_comments(div[0])
        if comments:
            mi.comments = comments

        mi.has_cover = self.plugin.cached_identifier_to_cover_url(
            self.sku) is not None
        return mi
Beispiel #27
0
 def test_input_meta_multi(self):
     stream_meta = get_metadata(self.get_stream('meta_multi'))
     canon_meta = Metadata(
         'A Meta Tag &amp; Title Ⓒ',
         ['George Washington', 'John Adams', 'Thomas Jefferson'])
     canon_meta.publisher = 'Publisher A'
     canon_meta.languages = ['English', 'Spanish']
     canon_meta.pubdate = parse_date('2019-01-01')
     canon_meta.timestamp = parse_date('2018-01-01')
     canon_meta.series = 'Meta Series'
     canon_meta.series_index = float(1)
     canon_meta.rating = float(8)
     canon_meta.comments = 'meta &quot;comments&quot; ♥ HTML &amp;amp;'
     canon_meta.tags = ['tag a', 'tag b', 'tag c']
     canon_meta.set_identifiers({
         'isbn': '1234567890',
         'url': 'http://google.com/search?q=calibre'
     })
     self.compare_metadata(stream_meta, canon_meta)
Beispiel #28
0
    def parse_response(cls, response, isbn_initial, log):
        metadata_items = []

        page_soup = BeautifulSoup(response.text)

        for idx, candidate in enumerate(cls.find(page_soup, 'b-result'), 1):

            title = cls.find(candidate, 'b-result__name-wrap', True)
            author = map(unicode.strip, cls.find(candidate, 'b-result__author', True).split(','))
            comments = cls.find(candidate, 'b-result__desc__full', True).replace(u'Скрыть', '').strip()
            isbn = cls.find(candidate, 'b-result__isbn', True).split(':')[-1].split(',')[0].strip()

            log.info(u'Found candidate %s: %s' % (idx, title))

            publisher = None
            pubdate = None

            other_info = cls.find(candidate, 'b-result__years', True).strip()
            if other_info:
                for entry in other_info.split(';'):
                    k, v = entry.split(':', 1)
                    k = k.strip()
                    if k == u'Год':
                        pubdate = parse_only_date('1.1.%s' % v.split(',')[0].strip())
                    elif k == u'Издательство':
                        publisher = v.strip()

            metadata_item = Metadata(title, author)
            metadata_item.isbn = isbn or isbn_initial

            if comments:
                metadata_item.comments = comments

            if publisher is not None:
                metadata_item.publisher = publisher

            if pubdate is not None:
                metadata_item.pubdate = pubdate

            metadata_items.append(metadata_item)

        return metadata_items
Beispiel #29
0
    def parse(self, xml_detail):
        title = self.parse_title(xml_detail)
        authors = self.parse_authors(xml_detail)
        comments = self.parse_comments(xml_detail)
        rating = self.parse_rating(xml_detail)
        tags = self.parse_tags(xml_detail)
        serie, serie_index = self.parse_serie(xml_detail)

        if title is not None and authors is not None:
            mi = Metadata(title, authors)
            mi.languages = {'ces'}
            mi.comments = as_unicode(comments)
            mi.identifiers = {self.plugin.name:self.ident}
            mi.rating = rating
            mi.tags = tags
            mi.series = serie
            mi.series_index = serie_index
            return mi
        else:
            return None
Beispiel #30
0
    def data2mi(self, item):
        """Converts a single metadata answer in the form of a dict to a MetadataInformation object"""

        mi = Metadata(_('Unknown'))

        # Regular metadata
        mi.title = item.get('title', None)
        mi.authors = item.get('authors', [])
        mi.publisher = item.get('publisher', None)

        if 'id' in item.keys(): mi.set_identifier(self.idkey, item['id'])
        if 'doi' in item.keys(): mi.set_identifier('doi', item['doi'])
        if 'isbn' in item.keys(): mi.set_identifier('isbn', item['isbn'])

        if 'updated' in item.keys(): mi.pubdate = parse_date(item['updated'], assume_utc=True)

        if 'series' in item.keys():
            mi.series = item['series']
            mi.series_index = self.format_series_index(item.get('series_index'), None)

        if 'year' in item.keys(): mi.pubdate = parse_date(item['year'], assume_utc=True)

        if 'abstract' in item.keys(): mi.comments = self.format_abstract(item['abstract'])

        if 'language' in item.keys(): mi.language = item['language']

        if 'journal' in item.keys():
            mi.series = item['journal']
            mi.series_index = self.format_series_index(item.get('volume'), item.get('number'))

        if 'subject' in item.keys():
            tags = set([])
            for s in item['subject']:
                tags.update(msc_tags(s))
                tags.update(arxiv_tags(s))

            mi.tags = list(sorted(tags))

        return mi
Beispiel #31
0
    def parse(self, raw):
        from calibre.ebooks.metadata.book.base import Metadata
        from calibre.utils.date import UNDEFINED_DATE
        root = parse_html(raw)
        mi = Metadata(self.basic_data['title'], self.basic_data['authors'])

        # Identifiers
        if self.basic_data['isbns']:
            mi.isbn = self.basic_data['isbns'][0]
        mi.set_identifier('edelweiss', self.sku)

        # Tags
        if self.basic_data['tags']:
            mi.tags = self.basic_data['tags']
            mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]

        # Publisher
        mi.publisher = self.basic_data['publisher']

        # Pubdate
        if self.basic_data['pubdate'] and self.basic_data['pubdate'].year != UNDEFINED_DATE:
            mi.pubdate = self.basic_data['pubdate']

        # Rating
        if self.basic_data['rating']:
            mi.rating = self.basic_data['rating']

        # Comments
        comments = ''
        for cid in ('summary', 'contributorbio', 'quotes_reviews'):
            cid = 'desc_{}{}-content'.format(cid, self.sku)
            div = root.xpath('//*[@id="{}"]'.format(cid))
            if div:
                comments += self.render_comments(div[0])
        if comments:
            mi.comments = comments

        mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None
        return mi
Beispiel #32
0
    def _metadata(self, baike):
        from calibre.ebooks.metadata.book.base import Metadata

        info = baike.get_info()
        logging.debug("\n".join("%s:\t%s" % v for v in info.items()))

        mi = Metadata(info['title'])
        plat = "网络小说平台"
        plat = info.get(u'首发状态', plat)
        plat = info.get(u'首发网站', plat)
        plat = plat.replace(u'首发', '')
        mi.publisher = info.get(u'连载平台', plat)
        mi.authors = [info.get(u'作者', u'佚名')]
        mi.author_sort = mi.authors[0]
        mi.isbn = BAIKE_ISBN
        mi.tags = baike.get_tags()
        mi.pubdate = datetime.datetime.now()
        mi.timestamp = datetime.datetime.now()
        mi.cover_url = baike.get_image()
        mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
        mi.website = baike.http.url
        mi.source = u'百度百科'
        mi.provider_key = KEY
        mi.provider_value = baike.get_id()

        if self.copy_image and mi.cover_url:
            logging.debug("fetching cover: %s", mi.cover_url)
            img = io.BytesIO(urlopen(mi.cover_url).read())
            img_fmt = mi.cover_url.split(".")[-1]
            mi.cover_data = (img_fmt, img)

        if u'完结' in info.get(u'连载状态', ""):
            day = re.findall('\d*-\d*-\d*', info[u'连载状态'])
            try:
                mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
            except:
                pass
        return mi
Beispiel #33
0
def build_meta(log, issue_id):
  '''Build metadata record based on comicvine issue_id'''
  issue = pycomicvine.Issue(issue_id, field_list=[
      'id', 'name', 'volume', 'issue_number', 'person_credits', 'description', 
      'store_date', 'cover_date'])
  if not issue or not issue.volume:
    log.warn('Unable to load Issue(%d)' % issue_id)
    return None
  title = '%s #%s' %  (issue.volume.name, issue.issue_number)
  if issue.name:
    title = title + ': %s' % (issue.name)
  authors = [p.name for p in issue.person_credits]
  meta = Metadata(title, authors)
  meta.series = issue.volume.name
  meta.series_index = str(issue.issue_number)
  meta.set_identifier('comicvine', str(issue.id))
  meta.set_identifier('comicvine-volume', str(issue.volume.id))
  meta.comments = issue.description
  meta.has_cover = False
  if issue.volume.publisher:
    meta.publisher = issue.volume.publisher.name
  meta.pubdate = issue.store_date or issue.cover_date
  return meta
Beispiel #34
0
    def _metadata(self, baike):
        from calibre.ebooks.metadata.book.base import Metadata
        from cStringIO import StringIO

        info = baike.get_info()
        print "\n".join("%s:\t%s" % v for v in info.items())

        mi = Metadata(info['title'])
        plat = "網絡小說平台"
        plat = info.get(u'首發狀態', plat)
        plat = info.get(u'首發網站', plat)
        plat = plat.replace(u'首發', '')
        mi.publisher = info.get(u'連載平台', plat)
        mi.authors = [info.get(u'作者', u'佚名')]
        mi.author_sort = mi.authors[0]
        mi.isbn = BAIKE_ISBN
        mi.tags = baike.get_tags()
        mi.pubdate = datetime.datetime.now()
        mi.timestamp = datetime.datetime.now()
        mi.cover_url = baike.get_image()
        mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
        mi.website = baike.http.url
        mi.source = u'百度百科'

        if self.copy_image:
            img = StringIO(urlopen(mi.cover_url).read())
            img_fmt = mi.cover_url.split(".")[-1]
            mi.cover_data = (img_fmt, img)

        if u'完結' in info.get(u'連載狀態', ""):
            day = re.findall('\d*-\d*-\d*', info[u'連載狀態'])
            try:
                mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
            except:
                pass
        return mi
Beispiel #35
0
	def parse_details(self, root):
		isfdb_id = None
		title = None
		authors = []
		isbn = None
		publisher = None
		pubdate = None
		
		try:
			isfdb_id = re.search('(\d+)$', self.url).groups(0)[0]
		except:
			self.log.exception('Error parsing ISFDB ID for url: %r' % self.url)
        
		detail_nodes = root.xpath('//div[@id="content"]//td[@class="pubheader"]/ul/li')
		if not detail_nodes:
			detail_nodes = root.xpath('//div[@id="content"]/div/ul/li') # no table (on records with no image)

		for detail_node in detail_nodes:
			section = detail_node[0].text_content().strip().rstrip(':')
			#self.log.info(section)
			try:
				if section == 'Publication':
					title = detail_node[0].tail.strip()
					if not title:
						# assume an extra span with a transliterated title tooltip
						title = detail_node[1].text_content().strip()
                    #self.log.info(title)
				elif section == 'Authors' or section == 'Editors':
					for a in detail_node.xpath('.//a'):
						author = a.text_content().strip()
						if section.startswith('Editors'):
							authors.append(author + ' (Editor)')
						else:
							authors.append(author)
                    #self.log.info(authors)
				elif section == 'ISBN':
					isbn = detail_node[0].tail.strip('[] \n')
                    #self.log.info(isbn)
				elif section == 'Publisher':
					publisher = detail_node.xpath('a')[0].text_content().strip()
                    #self.log.info(publisher)
				elif section == 'Date':
					pubdate = self._convert_date_text(detail_node[0].tail.strip())
                    #self.log.info(pubdate)
			except:
				self.log.exception('Error parsing section %r for url: %r' % (section, self.url) )

		if not title or not authors or not isfdb_id:
			self.log.error('Could not find title/authors/ISFDB ID for %r' % self.url)
			self.log.error('ISFDB: %r Title: %r Authors: %r' % (isfdb_id, title,
				authors))
			return

		mi = Metadata(title, authors)
		mi.set_identifier('isfdb', isfdb_id)
		self.isfdb_id = isfdb_id

		if isbn:
			self.isbn = mi.isbn = isbn
		if publisher:
			mi.publisher = publisher
		if pubdate:
			mi.pubdate = pubdate
			
		try:
			mi.comments = self.parse_comments(root)
		except:
			self.log.exception('Error parsing comments for url: %r'%self.url)

		try:
			self.cover_url = self.parse_cover(root)
		except:
			self.log.exception('Error parsing cover for url: %r'%self.url)
		
		mi.has_cover = bool(self.cover_url)
		mi.cover_url = self.cover_url # This is purely so we can run a test for it!!!

		mi.source_relevance = self.relevance

		if self.isfdb_id:
			if self.isbn:
				self.plugin.cache_isbn_to_identifier(self.isbn, self.isfdb_id)

		self.plugin.clean_downloaded_metadata(mi)
		self.result_queue.put(mi)
Beispiel #36
0
    def parse_details(self, raw, root):
        try:
            asin = self.parse_asin(root)
        except:
            self.log.exception('Error parsing asin for url: %r'%self.url)
            asin = None
        if self.testing:
            import tempfile, uuid
            with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_',
                    suffix='.html', delete=False) as f:
                f.write(raw)
            print ('Downloaded html for', asin, 'saved in', f.name)

        try:
            title = self.parse_title(root)
        except:
            self.log.exception('Error parsing title for url: %r'%self.url)
            title = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r'%self.url)
            authors = []

        if not title or not authors or not asin:
            self.log.error('Could not find title/authors/asin for %r'%self.url)
            self.log.error('ASIN: %r Title: %r Authors: %r'%(asin, title,
                authors))
            return

        mi = Metadata(title, authors)
        idtype = 'amazon' if self.domain == 'com' else 'amazon_'+self.domain
        mi.set_identifier(idtype, asin)
        self.amazon_id = asin

        try:
            mi.rating = self.parse_rating(root)
        except:
            self.log.exception('Error parsing ratings for url: %r'%self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r'%self.url)

        try:
            series, series_index = self.parse_series(root)
            if series:
                mi.series, mi.series_index = series, series_index
            elif self.testing:
                mi.series, mi.series_index = 'Dummy series for testing', 1
        except:
            self.log.exception('Error parsing series for url: %r'%self.url)

        try:
            mi.tags = self.parse_tags(root)
        except:
            self.log.exception('Error parsing tags for url: %r'%self.url)

        try:
            self.cover_url = self.parse_cover(root, raw)
        except:
            self.log.exception('Error parsing cover for url: %r'%self.url)
        mi.has_cover = bool(self.cover_url)

        non_hero = CSSSelect('div#bookDetails_container_div div#nonHeroSection')(root)
        if non_hero:
            # New style markup
            try:
                self.parse_new_details(root, mi, non_hero[0])
            except:
                self.log.exception('Failed to parse new-style book details section')
        else:
            pd = root.xpath(self.pd_xpath)
            if pd:
                pd = pd[0]

                try:
                    isbn = self.parse_isbn(pd)
                    if isbn:
                        self.isbn = mi.isbn = isbn
                except:
                    self.log.exception('Error parsing ISBN for url: %r'%self.url)

                try:
                    mi.publisher = self.parse_publisher(pd)
                except:
                    self.log.exception('Error parsing publisher for url: %r'%self.url)

                try:
                    mi.pubdate = self.parse_pubdate(pd)
                except:
                    self.log.exception('Error parsing publish date for url: %r'%self.url)

                try:
                    lang = self.parse_language(pd)
                    if lang:
                        mi.language = lang
                except:
                    self.log.exception('Error parsing language for url: %r'%self.url)

            else:
                self.log.warning('Failed to find product description for url: %r'%self.url)

        mi.source_relevance = self.relevance

        if self.amazon_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(self.amazon_id,
                        self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
Beispiel #37
0
    def parse_details(self, raw, root):
        #解析元数据各字段数据
        #self.log.info("=====")
        try:
            asin = self.parse_asin(root)
        except:
            self.log.exception('Error parsing asin for url: %r' % self.url)
            asin = None
        if self.testing:
            import tempfile, uuid
            with tempfile.NamedTemporaryFile(
                    prefix=(asin or str(uuid.uuid4())) + '_',
                    suffix='.html',
                    delete=False) as f:
                f.write(raw)
            print('Downloaded html for', asin, 'saved in', f.name)
        # 分析取得书名
        try:
            title = self.parse_title(root)
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)
            title = None
        #分析取得作者
        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            authors = []

        if not title or not authors or not asin:
            self.log.error('Could not find title/authors/asin for %r' %
                           self.url)
            self.log.error('ASIN: %r Title: %r Authors: %r' %
                           (asin, title, authors))
            return
        #以书名,作者为元数据对象mi,用于设置元数据
        mi = Metadata(title, authors)
        #设置Bookid
        idtype = '17k'
        mi.set_identifier(idtype, asin)
        self.k17k_id = asin

        #设备注释(简介)
        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)
        #设置丛书系列
        try:
            series, series_index = self.parse_series(root)
            if series:
                mi.series, mi.series_index = series, series_index
            elif self.testing:
                mi.series, mi.series_index = 'Dummy series for testing', 1
        except:
            self.log.exception('Error parsing series for url: %r' % self.url)
        #设置标签
        try:
            mi.tags = self.parse_tags(root)
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        #设置最后更新日期
#        try:
#            mi.last_modified = self.parse_last_modified(root)
#        except:
#            self.log.exception('Error parsing last_modified for url: %r'%self.url)
#设置封面
        try:
            self.cover_url = self.parse_cover(root, raw)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)

        mi.has_cover = bool(self.cover_url)
        mi.source_relevance = self.relevance
        mi.languages = [
            u'中文',
        ]

        if self.k17k_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.k17k_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(
                    self.k17k_id, self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
Beispiel #38
0
#!/usr/bin/env python
Beispiel #39
0
    def parse_feed(self, feed, seen, orig_title, orig_authors, identifiers):
        from lxml import etree

        def tostring(x):
            if x is None:
                return ''
            return etree.tostring(x, method='text', encoding=unicode).strip()

        orig_isbn = identifiers.get('isbn', None)
        title_tokens = list(self.get_title_tokens(orig_title))
        author_tokens = list(self.get_author_tokens(orig_authors))
        results = []

        def ismatch(title, authors):
            authors = lower(' '.join(authors))
            title = lower(title)
            match = not title_tokens
            for t in title_tokens:
                if lower(t) in title:
                    match = True
                    break
            amatch = not author_tokens
            for a in author_tokens:
                if lower(a) in authors:
                    amatch = True
                    break
            if not author_tokens:
                amatch = True
            return match and amatch

        bl = feed.find('BookList')
        if bl is None:
            err = tostring(feed.find('errormessage'))
            raise ValueError('ISBNDb query failed:' + err)
        total_results = int(bl.get('total_results'))
        shown_results = int(bl.get('shown_results'))
        for bd in bl.xpath('.//BookData'):
            isbn = check_isbn(bd.get('isbn', None))
            isbn13 = check_isbn(bd.get('isbn13', None))
            if not isbn and not isbn13:
                continue
            if orig_isbn and orig_isbn not in {isbn, isbn13}:
                continue
            title = tostring(bd.find('Title'))
            if not title:
                continue
            authors = []
            for au in bd.xpath('.//Authors/Person'):
                au = tostring(au)
                if au:
                    if ',' in au:
                        ln, _, fn = au.partition(',')
                        au = fn.strip() + ' ' + ln.strip()
                authors.append(au)
            if not authors:
                continue
            comments = tostring(bd.find('Summary'))
            id_ = (title, tuple(authors))
            if id_ in seen:
                continue
            seen.add(id_)
            if not ismatch(title, authors):
                continue
            publisher = tostring(bd.find('PublisherText'))
            if not publisher:
                publisher = None
            if publisher and 'audio' in publisher.lower():
                continue
            mi = Metadata(title, authors)
            mi.isbn = isbn
            mi.publisher = publisher
            mi.comments = comments
            results.append(mi)
        return total_results, shown_results, results
Beispiel #40
0
    def parse_details(self, root):
        try:
            kyobobook_id = self.parse_kyobobook_id(self.url)
        except:
            self.log.exception('Error parsing Kyobobook id for url: %r'%self.url)
            kyobobook_id = None
        
        try:
            (title, series, series_index) = self.parse_title_series(root)
        except:
            self.log.exception('Error parsing title and series for url: %r'%self.url)
            title = series = series_index = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r'%self.url)
            authors = []

        if not title or not authors or not kyobobook_id:
            self.log.error('Could not find title/authors/kyobobook id for %r'%self.url)
            self.log.error('Kyobobook: %r Title: %r Authors: %r'%(kyobobook_id, title,
                authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        mi.set_identifier('kyobobook', kyobobook_id)
        self.kyobobook_id = kyobobook_id

        try:
            isbn = self.parse_isbn(root)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r'%self.url)

        try:
            mi.rating = self.parse_rating(root)
        except:
            self.log.exception('Error parsing ratings for url: %r'%self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r'%self.url)

        try:
            self.cover_url = self.parse_cover(root)
        except:
            self.log.exception('Error parsing cover for url: %r'%self.url)
        mi.has_cover = bool(self.cover_url)

        try:
            tags = self.parse_tags(root)
            if tags:
                mi.tags = tags
        except:
            self.log.exception('Error parsing tags for url: %r'%self.url)

        try:
            mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
        except:
            self.log.exception('Error parsing publisher and date for url: %r'%self.url)

        try:
            lang = self._parse_language(root)
            if lang:
                mi.language = lang
        except:
            self.log.exception('Error parsing language for url: %r'%self.url)

        mi.source_relevance = self.relevance

        if self.kyobobook_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.kyobobook_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(self.kyobobook_id,
                        self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
Beispiel #41
0
    def _get_metadata(self, book_id, get_user_categories=True):  # {{{
        mi = Metadata(None, template_cache=self.formatter_template_cache)
        author_ids = self._field_ids_for('authors', book_id)
        aut_list = [self._author_data(i) for i in author_ids]
        aum = []
        aus = {}
        aul = {}
        for rec in aut_list:
            aut = rec['name']
            aum.append(aut)
            aus[aut] = rec['sort']
            aul[aut] = rec['link']
        mi.title       = self._field_for('title', book_id,
                default_value=_('Unknown'))
        mi.authors     = aum
        mi.author_sort = self._field_for('author_sort', book_id,
                default_value=_('Unknown'))
        mi.author_sort_map = aus
        mi.author_link_map = aul
        mi.comments    = self._field_for('comments', book_id)
        mi.publisher   = self._field_for('publisher', book_id)
        n = nowf()
        mi.timestamp   = self._field_for('timestamp', book_id, default_value=n)
        mi.pubdate     = self._field_for('pubdate', book_id, default_value=n)
        mi.uuid        = self._field_for('uuid', book_id,
                default_value='dummy')
        mi.title_sort  = self._field_for('sort', book_id,
                default_value=_('Unknown'))
        mi.book_size   = self._field_for('size', book_id, default_value=0)
        mi.ondevice_col = self._field_for('ondevice', book_id, default_value='')
        mi.last_modified = self._field_for('last_modified', book_id,
                default_value=n)
        formats = self._field_for('formats', book_id)
        mi.format_metadata = {}
        mi.languages = list(self._field_for('languages', book_id))
        if not formats:
            good_formats = None
        else:
            mi.format_metadata = FormatMetadata(self, book_id, formats)
            good_formats = FormatsList(formats, mi.format_metadata)
        mi.formats = good_formats
        mi.has_cover = _('Yes') if self._field_for('cover', book_id,
                default_value=False) else ''
        mi.tags = list(self._field_for('tags', book_id, default_value=()))
        mi.series = self._field_for('series', book_id)
        if mi.series:
            mi.series_index = self._field_for('series_index', book_id,
                    default_value=1.0)
        mi.rating = self._field_for('rating', book_id)
        mi.set_identifiers(self._field_for('identifiers', book_id,
            default_value={}))
        mi.application_id = book_id
        mi.id = book_id
        composites = []
        for key, meta in self.field_metadata.custom_iteritems():
            mi.set_user_metadata(key, meta)
            if meta['datatype'] == 'composite':
                composites.append(key)
            else:
                val = self._field_for(key, book_id)
                if isinstance(val, tuple):
                    val = list(val)
                extra = self._field_for(key+'_index', book_id)
                mi.set(key, val=val, extra=extra)
        for key in composites:
            mi.set(key, val=self._composite_for(key, book_id, mi))

        user_cat_vals = {}
        if get_user_categories:
            user_cats = self.backend.prefs['user_categories']
            for ucat in user_cats:
                res = []
                for name,cat,ign in user_cats[ucat]:
                    v = mi.get(cat, None)
                    if isinstance(v, list):
                        if name in v:
                            res.append([name,cat])
                    elif name == v:
                        res.append([name,cat])
                user_cat_vals[ucat] = res
        mi.user_categories = user_cat_vals

        return mi
Beispiel #42
0
def to_metadata(browser, log, entry_, timeout):  # {{{
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.date import parse_date, utcnow
    from calibre.utils.cleantext import clean_ascii_chars

    XPath = partial(etree.XPath, namespaces=NAMESPACES)
    entry          = XPath('//atom:entry')
    entry_id       = XPath('descendant::atom:id')
    title          = XPath('descendant::atom:title')
    description    = XPath('descendant::atom:summary')
    publisher      = XPath("descendant::db:attribute[@name='publisher']")
    isbn           = XPath("descendant::db:attribute[@name='isbn13']")
    date           = XPath("descendant::db:attribute[@name='pubdate']")
    creator        = XPath("descendant::db:attribute[@name='author']")
    booktag        = XPath("descendant::db:tag/attribute::name")
    rating         = XPath("descendant::gd:rating/attribute::average")
    cover_url      = XPath("descendant::atom:link[@rel='image']/attribute::href")

    def get_text(extra, x):
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text.replace('http://', 'https://')
    douban_id = id_url.split('/')[-1]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    if not id_url or not title:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'douban':douban_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
            strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except:
        log.exception('Failed to get additional details for', mi.title)
        return mi
    mi.comments = get_text(extra, description)
    mi.publisher = get_text(extra, publisher)

    # ISBN
    isbns = []
    for x in [t.text for t in isbn(extra)]:
        if check_isbn(x):
            isbns.append(x)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    try:
        btags = [x for x in booktag(extra) if x]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r'%pubdate)

    # Ratings
    if rating(extra):
        try:
            mi.rating = float(rating(extra)[0]) / 2.0
        except:
            log.exception('Failed to parse rating')
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url(extra)
    if u:
        u = u[0].replace('/spic/', '/lpic/')
        # If URL contains "book-default", the book doesn't have a cover
        if u.find('book-default') == -1:
            mi.has_douban_cover = u
    return mi
Beispiel #43
0
    def parse(self, raw):
        from calibre.ebooks.metadata.book.base import Metadata
        from calibre.utils.date import parse_only_date, UNDEFINED_DATE
        from css_selectors import Select
        root = parse_html(raw)
        selector = Select(root)
        sku = next(selector('div.sku.attGroup'))
        info = sku.getparent()
        top = info.getparent().getparent()
        banner = top.find('div')
        spans = banner.findall('span')
        title = ''
        for i, span in enumerate(spans):
            if i == 0 or '12pt' in span.get('style', ''):
                title += astext(span)
            else:
                break
        authors = [re.sub(r'\(.*\)', '', x).strip() for x in astext(spans[-1]).split(',')]
        mi = Metadata(title.strip(), authors)

        # Identifiers
        isbns = [check_isbn(x.strip()) for x in astext(sku).split(',')]
        for isbn in isbns:
            if isbn:
                self.plugin.cache_isbn_to_identifier(isbn, self.sku)
        isbns = sorted(isbns, key=lambda x:len(x) if x else 0, reverse=True)
        if isbns and isbns[0]:
            mi.isbn = isbns[0]
        mi.set_identifier('edelweiss', self.sku)

        # Tags
        bisac = tuple(selector('div.bisac.attGroup'))
        if bisac:
            bisac = astext(bisac[0])
            mi.tags = [x.strip() for x in bisac.split(',')]
            mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]

        # Publisher
        pub = tuple(selector('div.supplier.attGroup'))
        if pub:
            pub = astext(pub[0])
            mi.publisher = pub

        # Pubdate
        pub = tuple(selector('div.shipDate.attGroupItem'))
        if pub:
            pub = astext(pub[0])
            parts = pub.partition(':')[0::2]
            pub = parts[1] or parts[0]
            try:
                if ', Ship Date:' in pub:
                    pub = pub.partition(', Ship Date:')[0]
                q = parse_only_date(pub, assume_utc=True)
                if q.year != UNDEFINED_DATE:
                    mi.pubdate = q
            except:
                self.log.exception('Error parsing published date: %r'%pub)

        # Comments
        comm = ''
        general = tuple(selector('div#pd-general-overview-content'))
        if general:
            q = self.render_comments(general[0])
            if q != '<p>No title summary available. </p>':
                comm += q
        general = tuple(selector('div#pd-general-contributor-content'))
        if general:
            comm += self.render_comments(general[0])
        general = tuple(selector('div#pd-general-quotes-content'))
        if general:
            comm += self.render_comments(general[0])
        if comm:
            mi.comments = comm

        # Cover
        img = tuple(selector('img.title-image[src]'))
        if img:
            href = img[0].get('src').replace('jacket_covers/medium/',
                                             'jacket_covers/flyout/')
            self.plugin.cache_identifier_to_cover_url(self.sku, href)

        mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None

        return mi
Beispiel #44
0
    def to_metadata(self, browser, log, entry_, timeout):  # {{{
        from calibre.utils.date import parse_date, utcnow

        douban_id = entry_.get('id')
        title = entry_.get('title')
        description = entry_.get('summary')
        # subtitle = entry_.get('subtitle')  # TODO: std metada doesn't have this field
        publisher = entry_.get('publisher')
        isbn = entry_.get('isbn13')  # ISBN11 is obsolute, use ISBN13
        pubdate = entry_.get('pubdate')
        authors = entry_.get('author')
        book_tags = entry_.get('tags')
        rating = entry_.get('rating')
        cover_url = entry_.get('images', {}).get('large')
        series = entry_.get('series')

        if not authors:
            authors = [_('Unknown')]
        if not douban_id or not title:
            # Silently discard this entry
            return None

        mi = Metadata(title, authors)
        mi.identifiers = {'douban': douban_id}
        mi.publisher = publisher
        mi.comments = description
        # mi.subtitle = subtitle

        # ISBN
        isbns = []
        if isinstance(isbn, (type(''), bytes)):
            if check_isbn(isbn):
                isbns.append(isbn)
        else:
            for x in isbn:
                if check_isbn(x):
                    isbns.append(x)
        if isbns:
            mi.isbn = sorted(isbns, key=len)[-1]
        mi.all_isbns = isbns

        # Tags
        mi.tags = [tag['name'] for tag in book_tags]

        # pubdate
        if pubdate:
            try:
                default = utcnow().replace(day=15)
                mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
            except:
                log.error('Failed to parse pubdate %r' % pubdate)

        # Ratings
        if rating:
            try:
                mi.rating = float(rating['average']) / 2.0
            except:
                log.exception('Failed to parse rating')
                mi.rating = 0

        # Cover
        mi.has_douban_cover = None
        u = cover_url
        if u:
            # If URL contains "book-default", the book doesn't have a cover
            if u.find('book-default') == -1:
                mi.has_douban_cover = u

        # Series
        if series:
            mi.series = series['title']

        return mi
Beispiel #45
0
    def parse_details(self, raw, root):
        try:
            asin = self.parse_asin(root)
        except:
            self.log.exception('Error parsing asin for url: %r'%self.url)
            asin = None
        if self.testing:
            import tempfile, uuid
            with tempfile.NamedTemporaryFile(prefix=(asin or str(uuid.uuid4()))+ '_',
                    suffix='.html', delete=False) as f:
                f.write(raw)
            print ('Downloaded html for', asin, 'saved in', f.name)

        try:
            title = self.parse_title(root)
        except:
            self.log.exception('Error parsing title for url: %r'%self.url)
            title = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r'%self.url)
            authors = []

        if not title or not authors or not asin:
            self.log.error('Could not find title/authors/asin for %r'%self.url)
            self.log.error('ASIN: %r Title: %r Authors: %r'%(asin, title,
                authors))
            return

        mi = Metadata(title, authors)
        idtype = 'amazon' if self.domain == 'com' else 'amazon_'+self.domain
        mi.set_identifier(idtype, asin)
        self.amazon_id = asin

        try:
            mi.rating = self.parse_rating(root)
        except:
            self.log.exception('Error parsing ratings for url: %r'%self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r'%self.url)

        try:
            series, series_index = self.parse_series(root)
            if series:
                mi.series, mi.series_index = series, series_index
            elif self.testing:
                mi.series, mi.series_index = 'Dummy series for testing', 1
        except:
            self.log.exception('Error parsing series for url: %r'%self.url)

        try:
            mi.tags = self.parse_tags(root)
        except:
            self.log.exception('Error parsing tags for url: %r'%self.url)

        try:
            self.cover_url = self.parse_cover(root, raw)
        except:
            self.log.exception('Error parsing cover for url: %r'%self.url)
        mi.has_cover = bool(self.cover_url)

        non_hero = CSSSelect('div#bookDetails_container_div div#nonHeroSection')(root)
        if non_hero:
            # New style markup
            try:
                self.parse_new_details(root, mi, non_hero[0])
            except:
                self.log.exception('Failed to parse new-style book details section')
        else:
            pd = root.xpath(self.pd_xpath)
            if pd:
                pd = pd[0]

                try:
                    isbn = self.parse_isbn(pd)
                    if isbn:
                        self.isbn = mi.isbn = isbn
                except:
                    self.log.exception('Error parsing ISBN for url: %r'%self.url)

                try:
                    mi.publisher = self.parse_publisher(pd)
                except:
                    self.log.exception('Error parsing publisher for url: %r'%self.url)

                try:
                    mi.pubdate = self.parse_pubdate(pd)
                except:
                    self.log.exception('Error parsing publish date for url: %r'%self.url)

                try:
                    lang = self.parse_language(pd)
                    if lang:
                        mi.language = lang
                except:
                    self.log.exception('Error parsing language for url: %r'%self.url)

            else:
                self.log.warning('Failed to find product description for url: %r'%self.url)

        mi.source_relevance = self.relevance

        if self.amazon_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(self.amazon_id,
                        self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
    def parse_details(self, root):
        try:
            yes24_id = self.parse_yes24_id(self.url)
        except:
            self.log.exception('Error parsing YES24 id for url: %r'%self.url)
            yes24_id = None

        try:
            (title, series, series_index) = self.parse_title_series(root)
        except:
            self.log.exception('Error parsing title and series for url: %r'%self.url)
            title = series = series_index = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r'%self.url)
            authors = []

        if not title or not authors or not yes24_id:
            self.log.error('Could not find title/authors/YES24 id for %r'%self.url)
            self.log.error('YES24: %r Title: %r Authors: %r'%(yes24_id, title,
                authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        mi.set_identifier('yes24', yes24_id)
        self.yes24_id = yes24_id

        try:
            isbn = self.parse_isbn(root)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r'%self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r'%self.url)

        try:
            self.cover_url = self.parse_cover(root)
        except:
            self.log.exception('Error parsing cover for url: %r'%self.url)
        mi.has_cover = bool(self.cover_url)
        mi.cover_url = self.cover_url # This is purely so we can run a test for it!!!

        try:
            mi.publisher = self.parse_publisher(root)
        except:
            self.log.exception('Error parsing publisher for url: %r'%self.url)

        try:
            mi.pubdate = self.parse_published_date(root)
        except:
            self.log.exception('Error parsing published date for url: %r'%self.url)

        mi.language = 'ko'

        mi.source_relevance = self.relevance

        if self.yes24_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.yes24_id)

        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
Beispiel #47
0
    def merge(self, results, min_year, do_asr=True):
        ans = Metadata(_('Unknown'))

        # We assume the shortest title has the least cruft in it
        ans.title = self.length_merge('title', results, null_value=ans.title)

        # No harm in having extra authors, maybe something useful like an
        # editor or translator
        ans.authors = self.length_merge('authors', results,
                null_value=ans.authors, shortest=False)

        # We assume the shortest publisher has the least cruft in it
        ans.publisher = self.length_merge('publisher', results,
                null_value=ans.publisher)

        # We assume the smallest set of tags has the least cruft in it
        ans.tags = self.length_merge('tags', results,
                null_value=ans.tags, shortest=msprefs['fewer_tags'])

        # We assume the longest series has the most info in it
        ans.series = self.length_merge('series', results,
                null_value=ans.series, shortest=False)
        for r in results:
            if r.series and r.series == ans.series:
                ans.series_index = r.series_index
                break

        # Average the rating over all sources
        ratings = []
        for r in results:
            rating = r.rating
            if rating and rating > 0 and rating <= 5:
                ratings.append(rating)
        if ratings:
            ans.rating = int(round(sum(ratings)/len(ratings)))

        # Smallest language is likely to be valid
        ans.language = self.length_merge('language', results,
                null_value=ans.language)

        # Choose longest comments
        ans.comments = self.length_merge('comments', results,
                null_value=ans.comments, shortest=False)

        # Published date
        if min_year:
            for r in results:
                year = getattr(r.pubdate, 'year', None)
                if year == min_year:
                    ans.pubdate = r.pubdate
                    break
            if getattr(ans.pubdate, 'year', None) == min_year:
                min_date = datetime(min_year, ans.pubdate.month, ans.pubdate.day,
                                    tzinfo=utc_tz)
            else:
                min_date = datetime(min_year, 1, 2, tzinfo=utc_tz)
            ans.pubdate = min_date
        else:
            min_date = datetime(3001, 1, 1, tzinfo=utc_tz)
            for r in results:
                if r.pubdate is not None:
                    candidate = as_utc(r.pubdate)
                    if candidate < min_date:
                        min_date = candidate
            if min_date.year < 3000:
                ans.pubdate = min_date

        # Identifiers
        for r in results:
            ans.identifiers.update(r.identifiers)

        # Cover URL
        ans.has_cached_cover_url = bool([r for r in results if
            getattr(r, 'has_cached_cover_url', False)])

        # Merge any other fields with no special handling (random merge)
        touched_fields = set()
        for r in results:
            if hasattr(r, 'identify_plugin'):
                touched_fields |= r.identify_plugin.touched_fields

        for f in touched_fields:
            if f.startswith('identifier:') or not ans.is_null(f):
                continue
            setattr(ans, f, self.random_merge(f, results,
                null_value=getattr(ans, f)))

        if do_asr:
            avg = [x.relevance_in_source for x in results]
            avg = sum(avg)/len(avg)
            ans.average_source_relevance = avg

        return ans
Beispiel #48
0
def metadata_from_xmp_packet(raw_bytes):
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_('Unknown'))
    title = first_alt('//dc:title', root)
    if title.startswith(r'\376\377'):
        # corrupted XMP packet generated by Nitro PDF. See
        # https://bugs.launchpad.net/calibre/+bug/1541981
        raise ValueError('Corrupted XMP metadata packet detected, probably generated by Nitro PDF')
    if title:
        mi.title = title
    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = authors
    tags = multiple_sequences('//dc:subject', root) or multiple_sequences('//pdf:Keywords', root)
    if tags:
        mi.tags = tags
    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments
    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]
    try:
        pubdate = parse_date(first_sequence('//dc:date', root) or first_simple('//xmp:CreateDate', root), assume_utc=False)
    except:
        pass
    else:
        mi.pubdate = pubdate
    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp
    md = safe_parse_date(first_simple('//xmp:MetadataDate', root))
    mod = safe_parse_date(first_simple('//xmp:ModifyDate', root))
    fd = more_recent(md, mod)
    if fd is not None:
        mi.metadata_date = fd
    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass
    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break
    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:'+x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except:
                pass

    languages = multiple_sequences('//dc:language', root)
    if languages:
        languages = filter(None, map(canonicalize_lang, languages))
        if languages:
            mi.languages = languages

    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value

    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple('//%s:%s' % (namespace, scheme), root)
                scheme = scheme.lower()
                if scheme == 'isbn':
                    val = check_isbn(val)
                elif scheme == 'doi':
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in {'doi':check_doi, 'isbn':check_isbn}.iteritems():
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                identifiers['doi'] = val

    if identifiers:
        mi.set_identifiers(identifiers)

    read_user_metadata(mi, root)

    return mi
Beispiel #49
0
    def parse_details(self, raw, root):
        dang_id = parse_dang_id(root, self.log, self.url)
        if not dang_id and root.xpath(
                '//form[@action="/errors/validateCaptcha"]'):
            raise CaptchaError(
                'Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.'
            )
        if self.testing:
            import tempfile, uuid
            with tempfile.NamedTemporaryFile(
                    prefix=(dang_id or str(uuid.uuid4())) + '_',
                    suffix='.html',
                    delete=False) as f:
                f.write(raw)
            print('Downloaded html for', dang_id, 'saved in', f.name)

        try:
            title = self.parse_title(root)
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)
            title = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            authors = []

        if not title or not authors or not dang_id:
            self.log.error('Could not find title/authors/dang_id for %r' %
                           self.url)
            self.log.error('ASIN: %r Title: %r Authors: %r' %
                           (dang_id, title, authors))
            return

        mi = Metadata(title, authors)
        idtype = 'dang'
        mi.set_identifier(idtype, dang_id)
        self.dang_id = dang_id

        try:
            mi.comments = self.parse_comments(root, raw)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)

        try:
            series, series_index = self.parse_series(root)
            if series:
                mi.series, mi.series_index = series, series_index
            elif self.testing:
                mi.series, mi.series_index = 'Dummy series for testing', 1
        except:
            self.log.exception('Error parsing series for url: %r' % self.url)

        try:
            mi.tags = self.parse_tags(root)
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        try:
            self.cover_url = self.parse_cover(root, raw)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)
        mi.has_cover = bool(self.cover_url)

        pd = root.xpath(self.pd_desc_xpath)
        pd_info = root.xpath(self.pd_info_xpath)
        pd_info_store = root.xpath(self.pd_info_store_xpath)
        pd_desc = root.xpath(self.pd_desc_xpath)

        if pd_info or pd_info_store:
            try:
                isbn = self.parse_isbn(pd_info, pd_info_store, pd_desc)
                if isbn:
                    self.isbn = mi.isbn = isbn
            except:
                self.log.exception('Error parsing ISBN for url: %r' % self.url)

            if pd_info:
                pd_info = pd_info[0]
            else:
                pd_info = pd_info_store[0]

            try:
                mi.publisher = self.parse_publisher(pd_info)
            except:
                self.log.exception('Error parsing publisher for url: %r' %
                                   self.url)

            try:
                mi.pubdate = self.parse_pubdate(pd_info)
            except:
                self.log.exception('Error parsing publish date for url: %r' %
                                   self.url)

        else:
            self.log.warning('Failed to find product description for url: %r' %
                             self.url)

        mi.source_relevance = self.relevance

        if self.dang_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.dang_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(
                    self.dang_id, self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)
Beispiel #50
0
    def parse_feed(self, feed, seen, orig_title, orig_authors, identifiers):
        from lxml import etree

        def tostring(x):
            if x is None:
                return ''
            return etree.tostring(x, method='text', encoding=unicode).strip()

        orig_isbn = identifiers.get('isbn', None)
        title_tokens = list(self.get_title_tokens(orig_title))
        author_tokens = list(self.get_author_tokens(orig_authors))
        results = []

        def ismatch(title, authors):
            authors = lower(' '.join(authors))
            title = lower(title)
            match = not title_tokens
            for t in title_tokens:
                if lower(t) in title:
                    match = True
                    break
            amatch = not author_tokens
            for a in author_tokens:
                if lower(a) in authors:
                    amatch = True
                    break
            if not author_tokens:
                amatch = True
            return match and amatch

        bl = feed.find('BookList')
        if bl is None:
            err = tostring(feed.find('errormessage'))
            raise ValueError('ISBNDb query failed:' + err)
        total_results = int(bl.get('total_results'))
        shown_results = int(bl.get('shown_results'))
        for bd in bl.xpath('.//BookData'):
            isbn = check_isbn(bd.get('isbn', None))
            isbn13 = check_isbn(bd.get('isbn13', None))
            if not isbn and not isbn13:
                continue
            if orig_isbn and orig_isbn not in {isbn, isbn13}:
                continue
            title = tostring(bd.find('Title'))
            if not title:
                continue
            authors = []
            for au in bd.xpath('.//Authors/Person'):
                au = tostring(au)
                if au:
                    if ',' in au:
                        ln, _, fn = au.partition(',')
                        au = fn.strip() + ' ' + ln.strip()
                authors.append(au)
            if not authors:
                continue
            comments = tostring(bd.find('Summary'))
            id_ = (title, tuple(authors))
            if id_ in seen:
                continue
            seen.add(id_)
            if not ismatch(title, authors):
                continue
            publisher = tostring(bd.find('PublisherText'))
            if not publisher:
                publisher = None
            if publisher and 'audio' in publisher.lower():
                continue
            mi = Metadata(title, authors)
            mi.isbn = isbn
            mi.publisher = publisher
            mi.comments = comments
            results.append(mi)
        return total_results, shown_results, results
Beispiel #51
0
def to_metadata(browser, log, entry_, timeout):  # {{{
    from lxml import etree
    XPath = partial(etree.XPath, namespaces=NAMESPACES)

    # total_results  = XPath('//openSearch:totalResults')
    # start_index    = XPath('//openSearch:startIndex')
    # items_per_page = XPath('//openSearch:itemsPerPage')
    entry          = XPath('//atom:entry')
    entry_id       = XPath('descendant::atom:id')
    creator        = XPath('descendant::dc:creator')
    identifier     = XPath('descendant::dc:identifier')
    title          = XPath('descendant::dc:title')
    date           = XPath('descendant::dc:date')
    publisher      = XPath('descendant::dc:publisher')
    subject        = XPath('descendant::dc:subject')
    description    = XPath('descendant::dc:description')
    language       = XPath('descendant::dc:language')
    rating         = XPath('descendant::gd:rating[@average]')

    def get_text(extra, x):
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    google_id = id_url.split('/')[-1]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    if not id_url or not title:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'google':google_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
            strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    lang = canonicalize_lang(get_text(extra, language))
    if lang:
        mi.language = lang
    mi.publisher = get_text(extra, publisher)

    # ISBN
    isbns = []
    for x in identifier(extra):
        t = str(x.text).strip()
        if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):
            if t[:5].upper() == 'ISBN:':
                t = check_isbn(t[5:])
                if t:
                    isbns.append(t)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    try:
        btags = [x.text for x in subject(extra) if x.text]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        from calibre.utils.date import parse_date, utcnow
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r'%pubdate)

    # Ratings
    for x in rating(extra):
        try:
            mi.rating = float(x.get('average'))
            if mi.rating > 5:
                mi.rating /= 2
        except:
            log.exception('Failed to parse rating')

    # Cover
    mi.has_google_cover = None
    for x in extra.xpath(
            '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'):
        mi.has_google_cover = x.get('href')
        break

    return mi
Beispiel #52
0
	def parse_details(self, root):
		search_data = ''
		isbn = None
		
		try:
			self.log.info('Parse details:%s'%self.url)
			databazeknih_id = self.parse_databazeknih_id(self.url)
			self.log.info('Parsed DK identifier:%s'%databazeknih_id)
		except:
			self.log.exception('Error parsing databazeknih id for url: %r'%self.url)
			databazeknih_id = None

#		self.log.info('11')
		try:
			title = self.parse_title(root)
			self.log.info('Parsed title:%s'%title)
		except:
			self.log.exception('Error parsing title for url: %r'%self.url)
			title = None
		
		try:
			authors = self.parse_authors(root)
			self.log.info('Parsed authors:%s'%authors)
		except:
			self.log.exception('Error parsing authors for url: %r'%self.url)
			authors = []

		if not title or not authors or not databazeknih_id:
			self.log.error('Could not find title/authors/databazeknih id for %r'%self.url)
			self.log.error('DK id: %r Title: %r Authors: %r'%(databazeknih_id, title, authors))
			return

		mi = Metadata(title, authors)
		self.log.info('dbki:%s'%databazeknih_id)
		mi.set_identifier('databazeknih', databazeknih_id)
		self.databazeknih_id = databazeknih_id

		try:
			(mi.series, mi.series_index) = self.parse_series(root)
			self.log.info('Parsed series:%s'%mi.series)
			self.log.info('Parsed series index:%s'%mi.series_index)
		except :
			self.log.exception('Error parsing series for url: %r'%self.url)
			series = None
			
		try:
			mi.comments = self.parse_comments(root)
			self.log.info('Parsed comments:%s'%mi.comments)
		except:
			self.log.exception('Error parsing comments for url: %r'%self.url)

		try:
			self.cover_url = self.parse_cover(root)
			self.log.info('Parsed URL for cover:%r'%self.cover_url)
			self.plugin.cache_identifier_to_cover_url(self.databazeknih_id, self.cover_url)
		except:
			self.log.exception('Error parsing cover for url: %r'%self.url)
		mi.has_cover = bool(self.cover_url)

		try:
			mi.tags = self.parse_tags(root)
			self.log.info('Parsed tags:%s'%mi.tags)
		except:
			self.log.exception('Error parsing tags for url: %r'%self.url)
			
		try:
			mi.publisher = self.parse_publisher(root)
			self.log.info('Parsed publisher:%s'%mi.publisher)
		except:
			self.log.exception('Error parsing publisher for url: %r'%self.url)
			
		try:
			mi.pubdate = self.parse_pubdate(root)
			self.log.info('Parsed pubdate:%s'%mi.pubdate)
		except:
			self.log.exception('Error parsing pubdate for url: %r'%self.url)

			
		try:
			mi.rating = self.parse_rating(root)
			self.log.info('Parsed rating:%s'%mi.rating)
		except:
			self.log.exception('Error parsing rating for url: %r'%self.url)

		mi.source_relevance = self.relevance

#		if series:
#			mi.series = series
		
		try:
			isbn = self.parse_isbn(root)
			if isbn:
				self.isbn = mi.isbn = isbn
		except:
			self.log.exception('Error parsing ISBN for url: %r'%self.url)

		if self.databazeknih_id:
			self.plugin.cache_isbn_to_identifier(self.isbn, self.databazeknih_id)
			
#		self.plugin.clean_downloaded_metadata(mi)
#		mi.isbn = check_isbn(mi.isbn)
		self.log.info(mi)
		self.result_queue.put(mi)
Beispiel #53
0
    def parse(self, raw):
        from calibre.ebooks.metadata.book.base import Metadata
        from calibre.utils.date import parse_only_date, UNDEFINED_DATE
        from css_selectors import Select
        root = parse_html(raw)
        selector = Select(root)
        sku = next(selector('div.sku.attGroup'))
        info = sku.getparent()
        top = info.getparent().getparent()
        banner = top.find('div')
        spans = banner.findall('span')
        title = ''
        for i, span in enumerate(spans):
            if i == 0 or '12pt' in span.get('style', ''):
                title += astext(span)
            else:
                break
        authors = [
            re.sub(r'\(.*\)', '', x).strip()
            for x in astext(spans[-1]).split(',')
        ]
        mi = Metadata(title.strip(), authors)

        # Identifiers
        isbns = [check_isbn(x.strip()) for x in astext(sku).split(',')]
        for isbn in isbns:
            if isbn:
                self.plugin.cache_isbn_to_identifier(isbn, self.sku)
        isbns = sorted(isbns, key=lambda x: len(x) if x else 0, reverse=True)
        if isbns and isbns[0]:
            mi.isbn = isbns[0]
        mi.set_identifier('edelweiss', self.sku)

        # Tags
        bisac = tuple(selector('div.bisac.attGroup'))
        if bisac:
            bisac = astext(bisac[0])
            mi.tags = [x.strip() for x in bisac.split(',')]
            mi.tags = [
                t[1:].strip() if t.startswith('&') else t for t in mi.tags
            ]

        # Publisher
        pub = tuple(selector('div.supplier.attGroup'))
        if pub:
            pub = astext(pub[0])
            mi.publisher = pub

        # Pubdate
        pub = tuple(selector('div.shipDate.attGroupItem'))
        if pub:
            pub = astext(pub[0])
            parts = pub.partition(':')[0::2]
            pub = parts[1] or parts[0]
            try:
                if ', Ship Date:' in pub:
                    pub = pub.partition(', Ship Date:')[0]
                q = parse_only_date(pub, assume_utc=True)
                if q.year != UNDEFINED_DATE:
                    mi.pubdate = q
            except:
                self.log.exception('Error parsing published date: %r' % pub)

        # Comments
        comm = ''
        general = tuple(selector('div#pd-general-overview-content'))
        if general:
            q = self.render_comments(general[0])
            if q != '<p>No title summary available. </p>':
                comm += q
        general = tuple(selector('div#pd-general-contributor-content'))
        if general:
            comm += self.render_comments(general[0])
        general = tuple(selector('div#pd-general-quotes-content'))
        if general:
            comm += self.render_comments(general[0])
        if comm:
            mi.comments = comm

        # Cover
        img = tuple(selector('img.title-image[src]'))
        if img:
            href = img[0].get('src').replace('jacket_covers/medium/',
                                             'jacket_covers/flyout/')
            self.plugin.cache_identifier_to_cover_url(self.sku, href)

        mi.has_cover = self.plugin.cached_identifier_to_cover_url(
            self.sku) is not None

        return mi
Beispiel #54
0
def metadata_from_xmp_packet(raw_bytes):
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_('Unknown'))
    title = first_alt('//dc:title', root)
    if title:
        if title.startswith(r'\376\377'):
            # corrupted XMP packet generated by Nitro PDF. See
            # https://bugs.launchpad.net/calibre/+bug/1541981
            raise ValueError(
                'Corrupted XMP metadata packet detected, probably generated by Nitro PDF'
            )
        mi.title = title
    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = [au for aus in authors for au in string_to_authors(aus)]
    tags = multiple_sequences('//dc:subject', root) or multiple_sequences(
        '//pdf:Keywords', root)
    if tags:
        mi.tags = tags
    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments
    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]
    try:
        pubdate = parse_date(first_sequence('//dc:date', root)
                             or first_simple('//xmp:CreateDate', root),
                             assume_utc=False)
    except:
        pass
    else:
        mi.pubdate = pubdate
    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp
    md = safe_parse_date(first_simple('//xmp:MetadataDate', root))
    mod = safe_parse_date(first_simple('//xmp:ModifyDate', root))
    fd = more_recent(md, mod)
    if fd is not None:
        mi.metadata_date = fd
    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass
    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break
    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:' + x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except:
                pass

    languages = multiple_sequences('//dc:language', root)
    if languages:
        languages = list(filter(None, map(canonicalize_lang, languages)))
        if languages:
            mi.languages = languages

    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value

    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple('//%s:%s' % (namespace, scheme), root)
                scheme = scheme.lower()
                if scheme == 'isbn':
                    val = check_isbn(val)
                elif scheme == 'doi':
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in iteritems({
            'doi': check_doi,
            'isbn': check_isbn
    }):
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                identifiers['doi'] = val

    if identifiers:
        mi.set_identifiers(identifiers)

    read_user_metadata(mi, root)

    return mi
Beispiel #55
0
    def _get_metadata(self, book_id, get_user_categories=True):  # {{{
        mi = Metadata(None, template_cache=self.formatter_template_cache)
        author_ids = self._field_ids_for('authors', book_id)
        aut_list = [self._author_data(i) for i in author_ids]
        aum = []
        aus = {}
        aul = {}
        for rec in aut_list:
            aut = rec['name']
            aum.append(aut)
            aus[aut] = rec['sort']
            aul[aut] = rec['link']
        mi.title = self._field_for('title',
                                   book_id,
                                   default_value=_('Unknown'))
        mi.authors = aum
        mi.author_sort = self._field_for('author_sort',
                                         book_id,
                                         default_value=_('Unknown'))
        mi.author_sort_map = aus
        mi.author_link_map = aul
        mi.comments = self._field_for('comments', book_id)
        mi.publisher = self._field_for('publisher', book_id)
        n = nowf()
        mi.timestamp = self._field_for('timestamp', book_id, default_value=n)
        mi.pubdate = self._field_for('pubdate', book_id, default_value=n)
        mi.uuid = self._field_for('uuid', book_id, default_value='dummy')
        mi.title_sort = self._field_for('sort',
                                        book_id,
                                        default_value=_('Unknown'))
        mi.book_size = self._field_for('size', book_id, default_value=0)
        mi.ondevice_col = self._field_for('ondevice',
                                          book_id,
                                          default_value='')
        mi.last_modified = self._field_for('last_modified',
                                           book_id,
                                           default_value=n)
        formats = self._field_for('formats', book_id)
        mi.format_metadata = {}
        mi.languages = list(self._field_for('languages', book_id))
        if not formats:
            good_formats = None
        else:
            mi.format_metadata = FormatMetadata(self, book_id, formats)
            good_formats = FormatsList(formats, mi.format_metadata)
        mi.formats = good_formats
        mi.has_cover = _('Yes') if self._field_for(
            'cover', book_id, default_value=False) else ''
        mi.tags = list(self._field_for('tags', book_id, default_value=()))
        mi.series = self._field_for('series', book_id)
        if mi.series:
            mi.series_index = self._field_for('series_index',
                                              book_id,
                                              default_value=1.0)
        mi.rating = self._field_for('rating', book_id)
        mi.set_identifiers(
            self._field_for('identifiers', book_id, default_value={}))
        mi.application_id = book_id
        mi.id = book_id
        composites = []
        for key, meta in self.field_metadata.custom_iteritems():
            mi.set_user_metadata(key, meta)
            if meta['datatype'] == 'composite':
                composites.append(key)
            else:
                val = self._field_for(key, book_id)
                if isinstance(val, tuple):
                    val = list(val)
                extra = self._field_for(key + '_index', book_id)
                mi.set(key, val=val, extra=extra)
        for key in composites:
            mi.set(key, val=self._composite_for(key, book_id, mi))

        user_cat_vals = {}
        if get_user_categories:
            user_cats = self.backend.prefs['user_categories']
            for ucat in user_cats:
                res = []
                for name, cat, ign in user_cats[ucat]:
                    v = mi.get(cat, None)
                    if isinstance(v, list):
                        if name in v:
                            res.append([name, cat])
                    elif name == v:
                        res.append([name, cat])
                user_cat_vals[ucat] = res
        mi.user_categories = user_cat_vals

        return mi
Beispiel #56
0
    def parse_details(self, root):
        try:
            goodreads_id = self.parse_goodreads_id(self.url)
        except:
            self.log.exception("Error parsing goodreads id for url: %r" % self.url)
            goodreads_id = None

        try:
            (title, series, series_index) = self.parse_title_series(root)
        except:
            self.log.exception("Error parsing title and series for url: %r" % self.url)
            title = series = series_index = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception("Error parsing authors for url: %r" % self.url)
            authors = []

        if not title or not authors or not goodreads_id:
            self.log.error("Could not find title/authors/goodreads id for %r" % self.url)
            self.log.error("Goodreads: %r Title: %r Authors: %r" % (goodreads_id, title, authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        mi.set_identifier("goodreads", goodreads_id)
        self.goodreads_id = goodreads_id

        try:
            isbn = self.parse_isbn(root)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception("Error parsing ISBN for url: %r" % self.url)

        try:
            mi.rating = self.parse_rating(root)
        except:
            self.log.exception("Error parsing ratings for url: %r" % self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception("Error parsing comments for url: %r" % self.url)

        try:
            self.cover_url = self.parse_cover(root)
        except:
            self.log.exception("Error parsing cover for url: %r" % self.url)
        mi.has_cover = bool(self.cover_url)

        try:
            tags = self.parse_tags(root)
            if tags:
                mi.tags = tags
        except:
            self.log.exception("Error parsing tags for url: %r" % self.url)

        try:
            mi.publisher, mi.pubdate = self.parse_publisher_and_date(root)
        except:
            self.log.exception("Error parsing publisher and date for url: %r" % self.url)

        mi.source_relevance = self.relevance

        if self.goodreads_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.goodreads_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(self.goodreads_id, self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        self.result_queue.put(mi)